summaryrefslogtreecommitdiff
path: root/usr/src/uts/common/io
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/uts/common/io')
-rw-r--r--usr/src/uts/common/io/bpf/bpf_filter.c576
-rw-r--r--usr/src/uts/common/io/bpf/bpf_wrap.c35
-rw-r--r--usr/src/uts/common/io/cpqary3/cpqary3.c2
-rw-r--r--usr/src/uts/common/io/dld/dld_drv.c50
-rw-r--r--usr/src/uts/common/io/dld/dld_proto.c120
-rw-r--r--usr/src/uts/common/io/dld/dld_str.c108
-rw-r--r--usr/src/uts/common/io/dls/dls.c126
-rw-r--r--usr/src/uts/common/io/dls/dls_link.c1
-rw-r--r--usr/src/uts/common/io/dls/dls_mgmt.c423
-rw-r--r--usr/src/uts/common/io/dls/dls_stat.c172
-rw-r--r--usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE32
-rw-r--r--usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip1
-rw-r--r--usr/src/uts/common/io/dr_sas/dr_sas.c5509
-rw-r--r--usr/src/uts/common/io/dr_sas/dr_sas.conf15
-rw-r--r--usr/src/uts/common/io/dr_sas/dr_sas.h1769
-rw-r--r--usr/src/uts/common/io/dr_sas/dr_sas_list.h212
-rw-r--r--usr/src/uts/common/io/dump.c30
-rw-r--r--usr/src/uts/common/io/eventfd.c88
-rw-r--r--usr/src/uts/common/io/gsqueue/gsqueue.c608
-rw-r--r--usr/src/uts/common/io/inotify.c1559
-rw-r--r--usr/src/uts/common/io/inotify.conf16
-rw-r--r--usr/src/uts/common/io/ixgbe/ixgbe_main.c8
-rw-r--r--usr/src/uts/common/io/ksocket/ksocket.c14
-rw-r--r--usr/src/uts/common/io/ksocket/ksocket_impl.h6
-rw-r--r--usr/src/uts/common/io/ksyms.c9
-rw-r--r--usr/src/uts/common/io/mac/mac.c6
-rw-r--r--usr/src/uts/common/io/mac/mac_client.c43
-rw-r--r--usr/src/uts/common/io/mac/mac_datapath_setup.c20
-rw-r--r--usr/src/uts/common/io/mac/mac_protect.c118
-rw-r--r--usr/src/uts/common/io/mac/mac_provider.c3
-rw-r--r--usr/src/uts/common/io/mac/mac_sched.c8
-rw-r--r--usr/src/uts/common/io/mac/mac_stat.c8
-rw-r--r--usr/src/uts/common/io/mem.c11
-rw-r--r--usr/src/uts/common/io/mr_sas/mr_sas.conf8
-rw-r--r--usr/src/uts/common/io/nfp/THIRDPARTYLICENSE19
-rw-r--r--usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip1
-rw-r--r--usr/src/uts/common/io/nfp/autoversion.h21
-rw-r--r--usr/src/uts/common/io/nfp/drvlist.c19
-rw-r--r--usr/src/uts/common/io/nfp/hostif.c1192
-rw-r--r--usr/src/uts/common/io/nfp/i21285.c310
-rw-r--r--usr/src/uts/common/io/nfp/i21285.h43
-rw-r--r--usr/src/uts/common/io/nfp/i21555.c423
-rw-r--r--usr/src/uts/common/io/nfp/i21555.h51
-rw-r--r--usr/src/uts/common/io/nfp/i21555d.c28
-rw-r--r--usr/src/uts/common/io/nfp/nfdev-common.h141
-rw-r--r--usr/src/uts/common/io/nfp/nfdev-solaris.h37
-rw-r--r--usr/src/uts/common/io/nfp/nfp.h113
-rw-r--r--usr/src/uts/common/io/nfp/nfp_cmd.h68
-rw-r--r--usr/src/uts/common/io/nfp/nfp_common.h68
-rw-r--r--usr/src/uts/common/io/nfp/nfp_error.h48
-rw-r--r--usr/src/uts/common/io/nfp/nfp_hostif.h54
-rw-r--r--usr/src/uts/common/io/nfp/nfp_ifvers.c51
-rw-r--r--usr/src/uts/common/io/nfp/nfp_osif.h105
-rw-r--r--usr/src/uts/common/io/nfp/nfpci.h171
-rw-r--r--usr/src/uts/common/io/nfp/osif.c184
-rw-r--r--usr/src/uts/common/io/nvme/nvme.c105
-rw-r--r--usr/src/uts/common/io/nvme/nvme_var.h22
-rw-r--r--usr/src/uts/common/io/overlay/overlay.c2185
-rw-r--r--usr/src/uts/common/io/overlay/overlay.conf16
-rw-r--r--usr/src/uts/common/io/overlay/overlay.mapfile46
-rw-r--r--usr/src/uts/common/io/overlay/overlay_fm.c82
-rw-r--r--usr/src/uts/common/io/overlay/overlay_mux.c371
-rw-r--r--usr/src/uts/common/io/overlay/overlay_plugin.c281
-rw-r--r--usr/src/uts/common/io/overlay/overlay_prop.c122
-rw-r--r--usr/src/uts/common/io/overlay/overlay_target.c1651
-rw-r--r--usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c394
-rw-r--r--usr/src/uts/common/io/pciex/hotplug/pciehpc.c15
-rw-r--r--usr/src/uts/common/io/pciex/pcie.c7
-rw-r--r--usr/src/uts/common/io/pciex/pcie_fault.c76
-rw-r--r--usr/src/uts/common/io/physmem.c8
-rw-r--r--usr/src/uts/common/io/pseudo.conf9
-rw-r--r--usr/src/uts/common/io/pseudonex.c26
-rw-r--r--usr/src/uts/common/io/ptm.c47
-rw-r--r--usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/iwarp_sm.jpgbin0 -> 86314 bytes
-rw-r--r--usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full-36.jpgbin0 -> 37055 bytes
-rw-r--r--usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full.pngbin0 -> 9054 bytes
-rw-r--r--usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-logo.pngbin0 -> 9907 bytes
-rw-r--r--usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/reg_access.jpgbin0 -> 46722 bytes
-rw-r--r--usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values.binbin0 -> 1177408 bytes
-rw-r--r--usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values_zipped.binbin0 -> 528720 bytes
-rw-r--r--usr/src/uts/common/io/qede/qede_list.h1
-rw-r--r--usr/src/uts/common/io/qede/qede_version.h1
-rw-r--r--usr/src/uts/common/io/random.c3
-rw-r--r--usr/src/uts/common/io/rsm/rsm.c2
-rw-r--r--usr/src/uts/common/io/sata/adapters/ahci/ahci.c1
-rw-r--r--usr/src/uts/common/io/scsi/adapters/mpt_sas/mpt_sas.conf7
-rw-r--r--usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c2
-rw-r--r--usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_hash.c215
-rw-r--r--usr/src/uts/common/io/scsi/adapters/smrt/smrt.c565
-rw-r--r--usr/src/uts/common/io/scsi/adapters/smrt/smrt.conf16
-rw-r--r--usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss.c2023
-rw-r--r--usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss_simple.c282
-rw-r--r--usr/src/uts/common/io/scsi/adapters/smrt/smrt_commands.c362
-rw-r--r--usr/src/uts/common/io/scsi/adapters/smrt/smrt_device.c238
-rw-r--r--usr/src/uts/common/io/scsi/adapters/smrt/smrt_hba.c1457
-rw-r--r--usr/src/uts/common/io/scsi/adapters/smrt/smrt_interrupts.c286
-rw-r--r--usr/src/uts/common/io/scsi/adapters/smrt/smrt_logvol.c367
-rw-r--r--usr/src/uts/common/io/scsi/adapters/smrt/smrt_physical.c613
-rw-r--r--usr/src/uts/common/io/scsi/adapters/smrt/smrt_sata.c160
-rw-r--r--usr/src/uts/common/io/scsi/targets/sd.c107
-rw-r--r--usr/src/uts/common/io/signalfd.c4
-rw-r--r--usr/src/uts/common/io/usb/clients/hid/hid.c212
-rw-r--r--usr/src/uts/common/io/vnd/frameio.c465
-rw-r--r--usr/src/uts/common/io/vnd/vnd.c5857
-rw-r--r--usr/src/uts/common/io/vnd/vnd.conf16
-rw-r--r--usr/src/uts/common/io/zfd.c1154
106 files changed, 33629 insertions, 1111 deletions
diff --git a/usr/src/uts/common/io/bpf/bpf_filter.c b/usr/src/uts/common/io/bpf/bpf_filter.c
deleted file mode 100644
index db5b224a5e..0000000000
--- a/usr/src/uts/common/io/bpf/bpf_filter.c
+++ /dev/null
@@ -1,576 +0,0 @@
-/* $NetBSD: bpf_filter.c,v 1.35 2008/08/20 13:01:54 joerg Exp $ */
-
-/*
- * Copyright (c) 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997
- * The Regents of the University of California. All rights reserved.
- *
- * This code is derived from the Stanford/CMU enet packet filter,
- * (net/enet.c) distributed as part of 4.3BSD, and code contributed
- * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
- * Berkeley Laboratory.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * @(#)bpf_filter.c 8.1 (Berkeley) 6/10/93
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <sys/param.h>
-#include <sys/time.h>
-#include <sys/stream.h>
-#include <sys/byteorder.h>
-#include <sys/sdt.h>
-
-#define EXTRACT_SHORT(p) BE_IN16(p)
-#define EXTRACT_LONG(p) BE_IN32(p)
-
-#ifdef _KERNEL
-#define M_LEN(_m) ((_m)->b_wptr - (_m)->b_rptr)
-#define mtod(_a, _t) ((_t)((_a)->b_rptr))
-#define MINDEX(len, m, k) \
-{ \
- len = M_LEN(m); \
- while (k >= len) { \
- k -= len; \
- m = m->b_cont; \
- if (m == 0) \
- return (0); \
- len = M_LEN(m); \
- } \
-}
-
-static int m_xword(mblk_t *, uint32_t, int *);
-static int m_xhalf(mblk_t *, uint32_t, int *);
-
-static int
-m_xword(mblk_t *m, uint32_t k, int *err)
-{
- int len;
- uchar_t *cp, *np;
- mblk_t *m0;
-
- *err = 1;
- MINDEX(len, m, k);
- cp = mtod(m, uchar_t *) + k;
- if (len >= k + 4) {
- *err = 0;
- return (EXTRACT_LONG(cp));
- }
- m0 = m->b_cont;
- if (m0 == 0 || M_LEN(m0) + len - k < 4) {
- DTRACE_PROBE3(mblk_xword_fail, mblk_t *, m0, int, len, int, k);
- return (0);
- }
- *err = 0;
- np = mtod(m0, uchar_t *);
- switch (len - k) {
-
- case 1:
- return ((cp[0] << 24) | (np[0] << 16) | (np[1] << 8) | np[2]);
-
- case 2:
- return ((cp[0] << 24) | (cp[1] << 16) | (np[0] << 8) | np[1]);
-
- default:
- return ((cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) | np[0]);
- }
-}
-
-static int
-m_xhalf(mblk_t *m, uint32_t k, int *err)
-{
- int len;
- uchar_t *cp;
- mblk_t *m0;
-
- *err = 1;
- MINDEX(len, m, k);
- cp = mtod(m, uchar_t *) + k;
- if (len >= k + 2) {
- *err = 0;
- return (EXTRACT_SHORT(cp));
- }
- m0 = m->b_cont;
- if (m0 == 0) {
- DTRACE_PROBE3(mblk_xhalf_fail, mblk_t *, m0, int, len, int, k);
- return (0);
- }
- *err = 0;
- return ((cp[0] << 8) | mtod(m0, uchar_t *)[0]);
-}
-#else /* _KERNEL */
-#include <stdlib.h>
-#endif /* !_KERNEL */
-
-#include <net/bpf.h>
-
-/*
- * Execute the filter program starting at pc on the packet p
- * wirelen is the length of the original packet
- * buflen is the amount of data present
- * When buflen is non-0, p is a pointer to a the start of the packet and the
- * packet is only in one mblk_t.
- * When buflen is 0, p is an mblk_t pointer.
- */
-uint_t
-bpf_filter(struct bpf_insn *pc, uchar_t *p, uint_t wirelen, uint_t buflen)
-{
- uint32_t A, X, k;
- uint32_t mem[BPF_MEMWORDS];
-
- if (pc == 0)
- /*
- * No filter means accept all.
- */
- return ((uint_t)-1);
- A = 0;
- X = 0;
- --pc;
- /* CONSTCOND */
- while (1) {
- ++pc;
- switch (pc->code) {
-
- default:
-#ifdef _KERNEL
- DTRACE_PROBE1(bpf_insn_unknown,
- struct bpf_insn *, pc);
- return (0);
-#else
- abort();
-#endif
- case BPF_RET|BPF_K:
- return ((uint_t)pc->k);
-
- case BPF_RET|BPF_A:
- return ((uint_t)A);
-
- case BPF_LD|BPF_W|BPF_ABS:
- k = pc->k;
- if (k + sizeof (int32_t) > buflen) {
-#ifdef _KERNEL
- int merr = 0;
-
- if (buflen != 0)
- return (0);
- A = m_xword((mblk_t *)p, k, &merr);
- if (merr != 0)
- return (0);
- continue;
-#else
- return (0);
-#endif
- }
- A = EXTRACT_LONG(&p[k]);
- continue;
-
- case BPF_LD|BPF_H|BPF_ABS:
- k = pc->k;
- if (k + sizeof (int16_t) > buflen) {
-#ifdef _KERNEL
- int merr;
-
- if (buflen != 0)
- return (0);
- A = m_xhalf((mblk_t *)p, k, &merr);
- if (merr != 0)
- return (0);
- continue;
-#else
- return (0);
-#endif
- }
- A = EXTRACT_SHORT(&p[k]);
- continue;
-
- case BPF_LD|BPF_B|BPF_ABS:
- k = pc->k;
- if (k >= buflen) {
-#ifdef _KERNEL
- mblk_t *m;
- int len;
-
- if (buflen != 0)
- return (0);
- m = (mblk_t *)p;
- MINDEX(len, m, k);
- A = mtod(m, uchar_t *)[k];
- continue;
-#else
- return (0);
-#endif
- }
- A = p[k];
- continue;
-
- case BPF_LD|BPF_W|BPF_LEN:
- A = wirelen;
- continue;
-
- case BPF_LDX|BPF_W|BPF_LEN:
- X = wirelen;
- continue;
-
- case BPF_LD|BPF_W|BPF_IND:
- k = X + pc->k;
- if (k + sizeof (int32_t) > buflen) {
-#ifdef _KERNEL
- int merr = 0;
-
- if (buflen != 0)
- return (0);
- A = m_xword((mblk_t *)p, k, &merr);
- if (merr != 0)
- return (0);
- continue;
-#else
- return (0);
-#endif
- }
- A = EXTRACT_LONG(&p[k]);
- continue;
-
- case BPF_LD|BPF_H|BPF_IND:
- k = X + pc->k;
- if (k + sizeof (int16_t) > buflen) {
-#ifdef _KERNEL
- int merr = 0;
-
- if (buflen != 0)
- return (0);
- A = m_xhalf((mblk_t *)p, k, &merr);
- if (merr != 0)
- return (0);
- continue;
-#else
- return (0);
-#endif
- }
- A = EXTRACT_SHORT(&p[k]);
- continue;
-
- case BPF_LD|BPF_B|BPF_IND:
- k = X + pc->k;
- if (k >= buflen) {
-#ifdef _KERNEL
- mblk_t *m;
- int len;
-
- if (buflen != 0)
- return (0);
- m = (mblk_t *)p;
- MINDEX(len, m, k);
- A = mtod(m, uchar_t *)[k];
- continue;
-#else
- return (0);
-#endif
- }
- A = p[k];
- continue;
-
- case BPF_LDX|BPF_MSH|BPF_B:
- k = pc->k;
- if (k >= buflen) {
-#ifdef _KERNEL
- mblk_t *m;
- int len;
-
- if (buflen != 0)
- return (0);
- m = (mblk_t *)p;
- MINDEX(len, m, k);
- X = (mtod(m, char *)[k] & 0xf) << 2;
- continue;
-#else
- return (0);
-#endif
- }
- X = (p[pc->k] & 0xf) << 2;
- continue;
-
- case BPF_LD|BPF_IMM:
- A = pc->k;
- continue;
-
- case BPF_LDX|BPF_IMM:
- X = pc->k;
- continue;
-
- case BPF_LD|BPF_MEM:
- A = mem[pc->k];
- continue;
-
- case BPF_LDX|BPF_MEM:
- X = mem[pc->k];
- continue;
-
- case BPF_ST:
- mem[pc->k] = A;
- continue;
-
- case BPF_STX:
- mem[pc->k] = X;
- continue;
-
- case BPF_JMP|BPF_JA:
- pc += pc->k;
- continue;
-
- case BPF_JMP|BPF_JGT|BPF_K:
- pc += (A > pc->k) ? pc->jt : pc->jf;
- continue;
-
- case BPF_JMP|BPF_JGE|BPF_K:
- pc += (A >= pc->k) ? pc->jt : pc->jf;
- continue;
-
- case BPF_JMP|BPF_JEQ|BPF_K:
- pc += (A == pc->k) ? pc->jt : pc->jf;
- continue;
-
- case BPF_JMP|BPF_JSET|BPF_K:
- pc += (A & pc->k) ? pc->jt : pc->jf;
- continue;
-
- case BPF_JMP|BPF_JGT|BPF_X:
- pc += (A > X) ? pc->jt : pc->jf;
- continue;
-
- case BPF_JMP|BPF_JGE|BPF_X:
- pc += (A >= X) ? pc->jt : pc->jf;
- continue;
-
- case BPF_JMP|BPF_JEQ|BPF_X:
- pc += (A == X) ? pc->jt : pc->jf;
- continue;
-
- case BPF_JMP|BPF_JSET|BPF_X:
- pc += (A & X) ? pc->jt : pc->jf;
- continue;
-
- case BPF_ALU|BPF_ADD|BPF_X:
- A += X;
- continue;
-
- case BPF_ALU|BPF_SUB|BPF_X:
- A -= X;
- continue;
-
- case BPF_ALU|BPF_MUL|BPF_X:
- A *= X;
- continue;
-
- case BPF_ALU|BPF_DIV|BPF_X:
- if (X == 0)
- return (0);
- A /= X;
- continue;
-
- case BPF_ALU|BPF_AND|BPF_X:
- A &= X;
- continue;
-
- case BPF_ALU|BPF_OR|BPF_X:
- A |= X;
- continue;
-
- case BPF_ALU|BPF_LSH|BPF_X:
- A <<= X;
- continue;
-
- case BPF_ALU|BPF_RSH|BPF_X:
- A >>= X;
- continue;
-
- case BPF_ALU|BPF_ADD|BPF_K:
- A += pc->k;
- continue;
-
- case BPF_ALU|BPF_SUB|BPF_K:
- A -= pc->k;
- continue;
-
- case BPF_ALU|BPF_MUL|BPF_K:
- A *= pc->k;
- continue;
-
- case BPF_ALU|BPF_DIV|BPF_K:
- A /= pc->k;
- continue;
-
- case BPF_ALU|BPF_AND|BPF_K:
- A &= pc->k;
- continue;
-
- case BPF_ALU|BPF_OR|BPF_K:
- A |= pc->k;
- continue;
-
- case BPF_ALU|BPF_LSH|BPF_K:
- A <<= pc->k;
- continue;
-
- case BPF_ALU|BPF_RSH|BPF_K:
- A >>= pc->k;
- continue;
-
- case BPF_ALU|BPF_NEG:
- A = -A;
- continue;
-
- case BPF_MISC|BPF_TAX:
- X = A;
- continue;
-
- case BPF_MISC|BPF_TXA:
- A = X;
- continue;
- }
- }
- /* NOTREACHED */
-}
-
-#ifdef _KERNEL
-/*
- * Return true if the 'fcode' is a valid filter program.
- * The constraints are that each jump be forward and to a valid
- * code, that memory accesses are within valid ranges (to the
- * extent that this can be checked statically; loads of packet
- * data have to be, and are, also checked at run time), and that
- * the code terminates with either an accept or reject.
- *
- * The kernel needs to be able to verify an application's filter code.
- * Otherwise, a bogus program could easily crash the system.
- */
-int
-bpf_validate(struct bpf_insn *f, int len)
-{
- uint_t i, from;
- struct bpf_insn *p;
-
- if (len < 1 || len > BPF_MAXINSNS)
- return (0);
-
- for (i = 0; i < len; ++i) {
- p = &f[i];
- DTRACE_PROBE1(bpf_valid_insn, struct bpf_insn *, p);
- switch (BPF_CLASS(p->code)) {
- /*
- * Check that memory operations use valid addresses.
- */
- case BPF_LD:
- case BPF_LDX:
- switch (BPF_MODE(p->code)) {
- case BPF_MEM:
- if (p->k >= BPF_MEMWORDS)
- return (0);
- break;
- case BPF_ABS:
- case BPF_IND:
- case BPF_MSH:
- case BPF_IMM:
- case BPF_LEN:
- break;
- default:
- return (0);
- }
- break;
- case BPF_ST:
- case BPF_STX:
- if (p->k >= BPF_MEMWORDS)
- return (0);
- break;
- case BPF_ALU:
- switch (BPF_OP(p->code)) {
- case BPF_ADD:
- case BPF_SUB:
- case BPF_MUL:
- case BPF_OR:
- case BPF_AND:
- case BPF_LSH:
- case BPF_RSH:
- case BPF_NEG:
- break;
- case BPF_DIV:
- /*
- * Check for constant division by 0.
- */
- if (BPF_RVAL(p->code) == BPF_K && p->k == 0)
- return (0);
- break;
- default:
- return (0);
- }
- break;
- case BPF_JMP:
- /*
- * Check that jumps are within the code block,
- * and that unconditional branches don't go
- * backwards as a result of an overflow.
- * Unconditional branches have a 32-bit offset,
- * so they could overflow; we check to make
- * sure they don't. Conditional branches have
- * an 8-bit offset, and the from address is <=
- * BPF_MAXINSNS, and we assume that BPF_MAXINSNS
- * is sufficiently small that adding 255 to it
- * won't overflow.
- *
- * We know that len is <= BPF_MAXINSNS, and we
- * assume that BPF_MAXINSNS is < the maximum size
- * of a uint_t, so that i + 1 doesn't overflow.
- */
- from = i + 1;
- switch (BPF_OP(p->code)) {
- case BPF_JA:
- if (from + p->k < from || from + p->k >= len)
- return (0);
- break;
- case BPF_JEQ:
- case BPF_JGT:
- case BPF_JGE:
- case BPF_JSET:
- if (from + p->jt >= len || from + p->jf >= len)
- return (0);
- break;
- default:
- return (0);
- }
- break;
- case BPF_RET:
- break;
- case BPF_MISC:
- break;
- default:
- return (0);
- }
- }
-
- return (BPF_CLASS(f[len - 1].code) == BPF_RET);
-}
-#endif
diff --git a/usr/src/uts/common/io/bpf/bpf_wrap.c b/usr/src/uts/common/io/bpf/bpf_wrap.c
new file mode 100644
index 0000000000..6cbde58a20
--- /dev/null
+++ b/usr/src/uts/common/io/bpf/bpf_wrap.c
@@ -0,0 +1,35 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <net/bpf.h>
+#include <inet/bpf.h>
+
+/*
+ * With BPF filter validation and evaluation moved into the 'ip' module, these
+ * wrapper functions are provided to expose the original interface.
+ */
+
+uint_t
+bpf_filter(struct bpf_insn *pc, uchar_t *p, uint_t wirelen, uint_t buflen)
+{
+ return ((uint_t)ip_bpf_filter((ip_bpf_insn_t *)pc, p, wirelen, buflen));
+}
+
+int
+bpf_validate(struct bpf_insn *f, int len)
+{
+ return ((int)ip_bpf_validate((ip_bpf_insn_t *)f, (uint_t)len));
+}
diff --git a/usr/src/uts/common/io/cpqary3/cpqary3.c b/usr/src/uts/common/io/cpqary3/cpqary3.c
index 622f0dcf68..f67d77b3d2 100644
--- a/usr/src/uts/common/io/cpqary3/cpqary3.c
+++ b/usr/src/uts/common/io/cpqary3/cpqary3.c
@@ -41,7 +41,7 @@ extern cpqary3_driver_info_t gdriver_info;
* Global Variables Definitions
*/
-static char cpqary3_brief[] = "HP Smart Array Driver";
+static char cpqary3_brief[] = "HP Smart Array (Legacy)";
void *cpqary3_state;
/* HPQaculi Changes */
diff --git a/usr/src/uts/common/io/dld/dld_drv.c b/usr/src/uts/common/io/dld/dld_drv.c
index eca17349c3..00b5f0e3de 100644
--- a/usr/src/uts/common/io/dld/dld_drv.c
+++ b/usr/src/uts/common/io/dld/dld_drv.c
@@ -347,8 +347,8 @@ drv_ioc_attr(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
if ((err = dls_devnet_hold_tmp(diap->dia_linkid, &dlh)) != 0)
return (err);
- if ((err = mac_perim_enter_by_macname(
- dls_devnet_mac(dlh), &mph)) != 0) {
+ if ((err = mac_perim_enter_by_macname(dls_devnet_mac(dlh),
+ &mph)) != 0) {
dls_devnet_rele_tmp(dlh);
return (err);
}
@@ -360,7 +360,6 @@ drv_ioc_attr(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
}
mac_sdu_get(dlp->dl_mh, NULL, &diap->dia_max_sdu);
-
dls_link_rele(dlp);
mac_perim_exit(mph);
dls_devnet_rele_tmp(dlh);
@@ -702,7 +701,8 @@ drv_ioc_prop_common(dld_ioc_macprop_t *prop, intptr_t arg, boolean_t set,
err = EACCES;
goto done;
}
- err = dls_devnet_setzid(dlh, dzp->diz_zid);
+ err = dls_devnet_setzid(dlh, dzp->diz_zid,
+ dzp->diz_transient);
} else {
kprop->pr_perm_flags = MAC_PROP_PERM_RW;
(*(zoneid_t *)kprop->pr_val) = dls_devnet_getzid(dlh);
@@ -876,7 +876,7 @@ drv_ioc_rename(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
return (err);
if ((err = dls_devnet_rename(dir->dir_linkid1, dir->dir_linkid2,
- dir->dir_link)) != 0)
+ dir->dir_link, dir->dir_zoneinit)) != 0)
return (err);
if (dir->dir_linkid2 == DATALINK_INVALID_LINKID)
@@ -1331,10 +1331,13 @@ drv_ioc_gettran(void *karg, intptr_t arg, int mode, cred_t *cred,
dls_link_t *dlp = NULL;
dld_ioc_gettran_t *dgt = karg;
- if ((ret = mac_perim_enter_by_linkid(dgt->dgt_linkid, &mph)) != 0)
+ if ((ret = dls_devnet_hold_tmp(dgt->dgt_linkid, &dlh)) != 0)
goto done;
- if ((ret = dls_devnet_hold_link(dgt->dgt_linkid, &dlh, &dlp)) != 0)
+ if ((ret = mac_perim_enter_by_macname(dls_devnet_mac(dlh), &mph)) != 0)
+ goto done;
+
+ if ((ret = dls_link_hold(dls_devnet_mac(dlh), &dlp)) != 0)
goto done;
/*
@@ -1353,13 +1356,14 @@ drv_ioc_gettran(void *karg, intptr_t arg, int mode, cred_t *cred,
}
done:
- if (dlh != NULL && dlp != NULL) {
- dls_devnet_rele_link(dlh, dlp);
- }
+ if (dlp != NULL)
+ dls_link_rele(dlp);
- if (mph != NULL) {
+ if (mph != NULL)
mac_perim_exit(mph);
- }
+
+ if (dlh != NULL)
+ dls_devnet_rele_tmp(dlh);
return (ret);
}
@@ -1383,10 +1387,13 @@ drv_ioc_readtran(void *karg, intptr_t arg, int mode, cred_t *cred,
if (dti->dti_nbytes != 256 || dti->dti_off != 0)
return (EINVAL);
- if ((ret = mac_perim_enter_by_linkid(dti->dti_linkid, &mph)) != 0)
+ if ((ret = dls_devnet_hold_tmp(dti->dti_linkid, &dlh)) != 0)
goto done;
- if ((ret = dls_devnet_hold_link(dti->dti_linkid, &dlh, &dlp)) != 0)
+ if ((ret = mac_perim_enter_by_macname(dls_devnet_mac(dlh), &mph)) != 0)
+ goto done;
+
+ if ((ret = dls_link_hold(dls_devnet_mac(dlh), &dlp)) != 0)
goto done;
/*
@@ -1406,13 +1413,14 @@ drv_ioc_readtran(void *karg, intptr_t arg, int mode, cred_t *cred,
}
done:
- if (dlh != NULL && dlp != NULL) {
- dls_devnet_rele_link(dlh, dlp);
- }
+ if (dlp != NULL)
+ dls_link_rele(dlp);
- if (mph != NULL) {
+ if (mph != NULL)
mac_perim_exit(mph);
- }
+
+ if (dlh != NULL)
+ dls_devnet_rele_tmp(dlh);
return (ret);
}
@@ -1509,7 +1517,6 @@ done:
return (ret);
}
-
/*
* Note that ioctls that modify links have a NULL di_priv_func(), as
* privileges can only be checked after we know the class of the link being
@@ -1585,7 +1592,8 @@ static dld_ioc_modentry_t dld_ioc_modtable[] = {
{SIMNET_IOC, "simnet", 0, NULL, 0},
{BRIDGE_IOC, "bridge", 0, NULL, 0},
{IPTUN_IOC, "iptun", 0, NULL, 0},
- {IBPART_IOC, "ibp", -1, NULL, 0}
+ {IBPART_IOC, "ibp", -1, NULL, 0},
+ {OVERLAY_IOC, "overlay", 0, NULL, 0}
};
#define DLDIOC_CNT \
(sizeof (dld_ioc_modtable) / sizeof (dld_ioc_modentry_t))
diff --git a/usr/src/uts/common/io/dld/dld_proto.c b/usr/src/uts/common/io/dld/dld_proto.c
index 1171804c28..596147f4e9 100644
--- a/usr/src/uts/common/io/dld/dld_proto.c
+++ b/usr/src/uts/common/io/dld/dld_proto.c
@@ -42,7 +42,7 @@ static proto_reqfunc_t proto_info_req, proto_attach_req, proto_detach_req,
proto_bind_req, proto_unbind_req, proto_promiscon_req, proto_promiscoff_req,
proto_enabmulti_req, proto_disabmulti_req, proto_physaddr_req,
proto_setphysaddr_req, proto_udqos_req, proto_req, proto_capability_req,
- proto_notify_req, proto_passive_req;
+ proto_notify_req, proto_passive_req, proto_exclusive_req;
static void proto_capability_advertise(dld_str_t *, mblk_t *);
static int dld_capab_poll_disable(dld_str_t *, dld_capab_poll_t *);
@@ -122,6 +122,9 @@ dld_proto(dld_str_t *dsp, mblk_t *mp)
case DL_PASSIVE_REQ:
proto_passive_req(dsp, mp);
break;
+ case DL_EXCLUSIVE_REQ:
+ proto_exclusive_req(dsp, mp);
+ break;
default:
proto_req(dsp, mp);
break;
@@ -606,6 +609,14 @@ proto_promiscon_req(dld_str_t *dsp, mblk_t *mp)
new_flags |= DLS_PROMISC_PHYS;
break;
+ case DL_PROMISC_RX_ONLY:
+ new_flags |= DLS_PROMISC_RX_ONLY;
+ break;
+
+ case DL_PROMISC_FIXUPS:
+ new_flags |= DLS_PROMISC_FIXUPS;
+ break;
+
default:
dl_err = DL_NOTSUPPORTED;
goto failed2;
@@ -693,6 +704,22 @@ proto_promiscoff_req(dld_str_t *dsp, mblk_t *mp)
new_flags &= ~DLS_PROMISC_PHYS;
break;
+ case DL_PROMISC_RX_ONLY:
+ if (!(dsp->ds_promisc & DLS_PROMISC_RX_ONLY)) {
+ dl_err = DL_NOTENAB;
+ goto failed2;
+ }
+ new_flags &= ~DLS_PROMISC_RX_ONLY;
+ break;
+
+ case DL_PROMISC_FIXUPS:
+ if (!(dsp->ds_promisc & DLS_PROMISC_FIXUPS)) {
+ dl_err = DL_NOTENAB;
+ goto failed2;
+ }
+ new_flags &= ~DLS_PROMISC_FIXUPS;
+ break;
+
default:
dl_err = DL_NOTSUPPORTED;
goto failed2;
@@ -1293,7 +1320,8 @@ proto_passive_req(dld_str_t *dsp, mblk_t *mp)
* If we've already become active by issuing an active primitive,
* then it's too late to try to become passive.
*/
- if (dsp->ds_passivestate == DLD_ACTIVE) {
+ if (dsp->ds_passivestate == DLD_ACTIVE ||
+ dsp->ds_passivestate == DLD_EXCLUSIVE) {
dl_err = DL_OUTSTATE;
goto failed;
}
@@ -1347,12 +1375,20 @@ dld_capab_direct(dld_str_t *dsp, void *data, uint_t flags)
ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
+ if (dsp->ds_sap == ETHERTYPE_IPV6)
+ return (ENOTSUP);
+
switch (flags) {
case DLD_ENABLE:
dls_rx_set(dsp, (dls_rx_t)direct->di_rx_cf,
direct->di_rx_ch);
- direct->di_tx_df = (uintptr_t)str_mdata_fastpath_put;
+ if (direct->di_flags & DI_DIRECT_RAW) {
+ direct->di_tx_df =
+ (uintptr_t)str_mdata_raw_fastpath_put;
+ } else {
+ direct->di_tx_df = (uintptr_t)str_mdata_fastpath_put;
+ }
direct->di_tx_dh = dsp;
direct->di_tx_cb_df = (uintptr_t)mac_client_tx_notify;
direct->di_tx_cb_dh = dsp->ds_mch;
@@ -1451,6 +1487,9 @@ dld_capab_poll(dld_str_t *dsp, void *data, uint_t flags)
ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
+ if (dsp->ds_sap == ETHERTYPE_IPV6)
+ return (ENOTSUP);
+
switch (flags) {
case DLD_ENABLE:
return (dld_capab_poll_enable(dsp, poll));
@@ -1461,12 +1500,34 @@ dld_capab_poll(dld_str_t *dsp, void *data, uint_t flags)
}
static int
+dld_capab_ipcheck(dld_str_t *dsp, void *data, uint_t flags)
+{
+ dld_capab_ipcheck_t *ipc = data;
+
+ ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
+
+ switch (flags) {
+ case DLD_ENABLE:
+ ipc->ipc_allowed_df = (uintptr_t)mac_protect_check_addr;
+ ipc->ipc_allowed_dh = dsp->ds_mch;
+ return (0);
+ case DLD_DISABLE:
+ return (0);
+ }
+
+ return (ENOTSUP);
+}
+
+static int
dld_capab_lso(dld_str_t *dsp, void *data, uint_t flags)
{
dld_capab_lso_t *lso = data;
ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
+ if (dsp->ds_sap == ETHERTYPE_IPV6)
+ return (ENOTSUP);
+
switch (flags) {
case DLD_ENABLE: {
mac_capab_lso_t mac_lso;
@@ -1522,8 +1583,9 @@ dld_capab(dld_str_t *dsp, uint_t type, void *data, uint_t flags)
* completes. So we limit the check to DLD_ENABLE case.
*/
if ((flags == DLD_ENABLE && type != DLD_CAPAB_PERIM) &&
- (!(dsp->ds_sap == ETHERTYPE_IP || dsp->ds_sap == ETHERTYPE_IPV6) ||
- !check_mod_above(dsp->ds_rq, "ip"))) {
+ (((dsp->ds_sap != ETHERTYPE_IP && dsp->ds_sap != ETHERTYPE_IPV6) ||
+ !check_mod_above(dsp->ds_rq, "ip")) &&
+ !check_mod_above(dsp->ds_rq, "vnd"))) {
return (ENOTSUP);
}
@@ -1552,6 +1614,10 @@ dld_capab(dld_str_t *dsp, uint_t type, void *data, uint_t flags)
err = dld_capab_lso(dsp, data, flags);
break;
+ case DLD_CAPAB_IPCHECK:
+ err = dld_capab_ipcheck(dsp, data, flags);
+ break;
+
default:
err = ENOTSUP;
break;
@@ -1613,10 +1679,15 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp)
}
/*
- * Direct capability negotiation interface between IP and DLD
+ * Direct capability negotiation interface between IP/VND and DLD. Note
+ * that for vnd we only allow the case where the media type is the
+ * native media type so we know that there are no transformations that
+ * would have to happen to the mac header that it receives.
*/
- if ((dsp->ds_sap == ETHERTYPE_IP || dsp->ds_sap == ETHERTYPE_IPV6) &&
- check_mod_above(dsp->ds_rq, "ip")) {
+ if (((dsp->ds_sap == ETHERTYPE_IP || dsp->ds_sap == ETHERTYPE_IPV6) &&
+ check_mod_above(dsp->ds_rq, "ip")) ||
+ (check_mod_above(dsp->ds_rq, "vnd") &&
+ dsp->ds_mip->mi_media == dsp->ds_mip->mi_nativemedia)) {
dld_capable = B_TRUE;
subsize += sizeof (dl_capability_sub_t) +
sizeof (dl_capab_dld_t);
@@ -1735,3 +1806,36 @@ dld_capabilities_disable(dld_str_t *dsp)
if (dsp->ds_polling)
(void) dld_capab_poll_disable(dsp, NULL);
}
+
+static void
+proto_exclusive_req(dld_str_t *dsp, mblk_t *mp)
+{
+ int ret = 0;
+ t_uscalar_t dl_err;
+ mac_perim_handle_t mph;
+
+ if (dsp->ds_passivestate != DLD_UNINITIALIZED) {
+ dl_err = DL_OUTSTATE;
+ goto failed;
+ }
+
+ if (MBLKL(mp) < DL_EXCLUSIVE_REQ_SIZE) {
+ dl_err = DL_BADPRIM;
+ goto failed;
+ }
+
+ mac_perim_enter_by_mh(dsp->ds_mh, &mph);
+ ret = dls_exclusive_set(dsp, B_TRUE);
+ mac_perim_exit(mph);
+
+ if (ret != 0) {
+ dl_err = DL_SYSERR;
+ goto failed;
+ }
+
+ dsp->ds_passivestate = DLD_EXCLUSIVE;
+ dlokack(dsp->ds_wq, mp, DL_EXCLUSIVE_REQ);
+ return;
+failed:
+ dlerrorack(dsp->ds_wq, mp, DL_EXCLUSIVE_REQ, dl_err, (t_uscalar_t)ret);
+}
diff --git a/usr/src/uts/common/io/dld/dld_str.c b/usr/src/uts/common/io/dld/dld_str.c
index 9f89165455..e9e98441b5 100644
--- a/usr/src/uts/common/io/dld/dld_str.c
+++ b/usr/src/uts/common/io/dld/dld_str.c
@@ -23,6 +23,10 @@
*/
/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+/*
* Data-Link Driver
*/
@@ -857,6 +861,77 @@ i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid,
return (mp);
}
+static boolean_t
+i_dld_raw_ether_check(dld_str_t *dsp, mac_header_info_t *mhip, mblk_t **mpp)
+{
+ mblk_t *mp = *mpp;
+ mblk_t *newmp;
+ uint_t pri, vid, dvid;
+
+ dvid = mac_client_vid(dsp->ds_mch);
+
+ /*
+ * Discard the packet if this is a VLAN stream but the VID in
+ * the packet is not correct.
+ */
+ vid = VLAN_ID(mhip->mhi_tci);
+ if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
+ return (B_FALSE);
+
+ /*
+ * Discard the packet if this packet is a tagged packet
+ * but both pri and VID are 0.
+ */
+ pri = VLAN_PRI(mhip->mhi_tci);
+ if (mhip->mhi_istagged && !mhip->mhi_ispvid && pri == 0 &&
+ vid == VLAN_ID_NONE)
+ return (B_FALSE);
+
+ /*
+ * Update the priority bits to the per-stream priority if
+ * priority is not set in the packet. Update the VID for
+ * packets on a VLAN stream.
+ */
+ pri = (pri == 0) ? dsp->ds_pri : 0;
+ if ((pri != 0) || (dvid != VLAN_ID_NONE)) {
+ if ((newmp = i_dld_ether_header_update_tag(mp, pri,
+ dvid, dsp->ds_dlp->dl_tagmode)) == NULL) {
+ return (B_FALSE);
+ }
+ *mpp = newmp;
+ }
+
+ return (B_TRUE);
+}
+
+mac_tx_cookie_t
+str_mdata_raw_fastpath_put(dld_str_t *dsp, mblk_t *mp, uintptr_t f_hint,
+ uint16_t flag)
+{
+ boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
+ mac_header_info_t mhi;
+ mac_tx_cookie_t cookie;
+
+ if (mac_vlan_header_info(dsp->ds_mh, mp, &mhi) != 0)
+ goto discard;
+
+ if (is_ethernet) {
+ if (i_dld_raw_ether_check(dsp, &mhi, &mp) == B_FALSE)
+ goto discard;
+ }
+
+ if ((cookie = DLD_TX(dsp, mp, f_hint, flag)) != (mac_tx_cookie_t)NULL) {
+ DLD_SETQFULL(dsp);
+ }
+ return (cookie);
+discard:
+ /* TODO: bump kstat? */
+ freemsg(mp);
+ return ((mac_tx_cookie_t)NULL);
+}
+
+
+
/*
* M_DATA put (IP fast-path mode)
*/
@@ -905,7 +980,6 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
mblk_t *bp, *newmp;
size_t size;
mac_header_info_t mhi;
- uint_t pri, vid, dvid;
uint_t max_sdu;
/*
@@ -951,38 +1025,8 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
goto discard;
if (is_ethernet) {
- dvid = mac_client_vid(dsp->ds_mch);
-
- /*
- * Discard the packet if this is a VLAN stream but the VID in
- * the packet is not correct.
- */
- vid = VLAN_ID(mhi.mhi_tci);
- if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
- goto discard;
-
- /*
- * Discard the packet if this packet is a tagged packet
- * but both pri and VID are 0.
- */
- pri = VLAN_PRI(mhi.mhi_tci);
- if (mhi.mhi_istagged && !mhi.mhi_ispvid && pri == 0 &&
- vid == VLAN_ID_NONE)
+ if (i_dld_raw_ether_check(dsp, &mhi, &mp) == B_FALSE)
goto discard;
-
- /*
- * Update the priority bits to the per-stream priority if
- * priority is not set in the packet. Update the VID for
- * packets on a VLAN stream.
- */
- pri = (pri == 0) ? dsp->ds_pri : 0;
- if ((pri != 0) || (dvid != VLAN_ID_NONE)) {
- if ((newmp = i_dld_ether_header_update_tag(mp, pri,
- dvid, dsp->ds_dlp->dl_tagmode)) == NULL) {
- goto discard;
- }
- mp = newmp;
- }
}
if (DLD_TX(dsp, mp, 0, 0) != 0) {
diff --git a/usr/src/uts/common/io/dls/dls.c b/usr/src/uts/common/io/dls/dls.c
index 2dc16c4586..b71d95bd44 100644
--- a/usr/src/uts/common/io/dls/dls.c
+++ b/usr/src/uts/common/io/dls/dls.c
@@ -250,19 +250,69 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags)
{
int err = 0;
uint32_t old_flags = dsp->ds_promisc;
+ uint32_t new_type = new_flags &
+ ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS);
mac_client_promisc_type_t mptype = MAC_CLIENT_PROMISC_ALL;
+ uint16_t mac_flags = 0;
+ boolean_t doremove = B_FALSE;
ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
ASSERT(!(new_flags & ~(DLS_PROMISC_SAP | DLS_PROMISC_MULTI |
- DLS_PROMISC_PHYS)));
+ DLS_PROMISC_PHYS | DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)));
+
+ /*
+ * If we only have the non-data receive flags set or are only changing
+ * them, then there's nothing to do other than update the flags here.
+ * Basically when we only have something in the set of
+ * DLS_PROMISC_RX_ONLY and DLS_PROMISC_FIXUPS around, then there's
+ * nothing else for us to do other than toggle it, as there's no need to
+ * talk to MAC and we don't have to do anything else.
+ */
+ if ((old_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) == 0 &&
+ (new_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) == 0) {
+ dsp->ds_promisc = new_flags;
+ return (0);
+ }
/*
* If the user has only requested DLS_PROMISC_MULTI then we need to make
* sure that they don't see all packets.
*/
- if (new_flags == DLS_PROMISC_MULTI)
+ if (new_type == DLS_PROMISC_MULTI)
mptype = MAC_CLIENT_PROMISC_MULTI;
+ /*
+ * Look at new flags and figure out the correct mac promisc flags.
+ * If we've only requested DLS_PROMISC_SAP and not _MULTI or _PHYS,
+ * don't turn on physical promisc mode.
+ */
+ if (new_flags & DLS_PROMISC_RX_ONLY)
+ mac_flags |= MAC_PROMISC_FLAGS_NO_TX_LOOP;
+ if (new_flags & DLS_PROMISC_FIXUPS)
+ mac_flags |= MAC_PROMISC_FLAGS_DO_FIXUPS;
+ if (new_type == DLS_PROMISC_SAP)
+ mac_flags |= MAC_PROMISC_FLAGS_NO_PHYS;
+
+ /*
+ * If we're coming in and we're being asked to transition to a state
+ * where the only DLS flags would be enabled are flags that change what
+ * we do with promiscuous packets (DLS_PROMISC_RX_ONLY and
+ * DLS_PROMISC_FIXUPS) and not which packets we should receive, then we
+ * need to remove the MAC layer promiscuous handler.
+ */
+ if ((new_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) == 0 &&
+ (old_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) != 0 &&
+ new_flags != 0) {
+ doremove = B_TRUE;
+ }
+
+ /*
+ * There are three cases we care about here with respect to MAC. Going
+ * from nothing to something, something to nothing, something to
+ * something where we need to change how we're getting stuff from mac.
+ * In the last case, as long as they're not equal, we need to assume
+ * something has changed and do something about it.
+ */
if (dsp->ds_promisc == 0 && new_flags != 0) {
/*
* If only DLS_PROMISC_SAP, we don't turn on the
@@ -270,9 +320,7 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags)
*/
dsp->ds_promisc = new_flags;
err = mac_promisc_add(dsp->ds_mch, mptype,
- dls_rx_promisc, dsp, &dsp->ds_mph,
- (new_flags != DLS_PROMISC_SAP) ? 0 :
- MAC_PROMISC_FLAGS_NO_PHYS);
+ dls_rx_promisc, dsp, &dsp->ds_mph, mac_flags);
if (err != 0) {
dsp->ds_promisc = old_flags;
return (err);
@@ -283,7 +331,8 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags)
mac_promisc_remove(dsp->ds_vlan_mph);
dsp->ds_vlan_mph = NULL;
}
- } else if (dsp->ds_promisc != 0 && new_flags == 0) {
+ } else if (dsp->ds_promisc != 0 &&
+ (new_flags == 0 || doremove == B_TRUE)) {
ASSERT(dsp->ds_mph != NULL);
mac_promisc_remove(dsp->ds_mph);
@@ -298,19 +347,13 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags)
MAC_CLIENT_PROMISC_ALL, dls_rx_vlan_promisc, dsp,
&dsp->ds_vlan_mph, MAC_PROMISC_FLAGS_NO_PHYS);
}
- } else if (dsp->ds_promisc == DLS_PROMISC_SAP && new_flags != 0 &&
- new_flags != dsp->ds_promisc) {
- /*
- * If the old flag is PROMISC_SAP, but the current flag has
- * changed to some new non-zero value, we need to turn the
- * physical promiscuous mode.
- */
+ } else if (new_flags != 0 && new_flags != old_flags) {
ASSERT(dsp->ds_mph != NULL);
mac_promisc_remove(dsp->ds_mph);
/* Honors both after-remove and before-add semantics! */
dsp->ds_promisc = new_flags;
err = mac_promisc_add(dsp->ds_mch, mptype,
- dls_rx_promisc, dsp, &dsp->ds_mph, 0);
+ dls_rx_promisc, dsp, &dsp->ds_mph, mac_flags);
if (err != 0)
dsp->ds_promisc = old_flags;
} else {
@@ -631,6 +674,22 @@ boolean_t
dls_accept_promisc(dld_str_t *dsp, mac_header_info_t *mhip, dls_rx_t *ds_rx,
void **ds_rx_arg, boolean_t loopback)
{
+ if (dsp->ds_promisc == 0) {
+ /*
+ * If there are active walkers of the mi_promisc_list when
+ * promiscuousness is disabled, ds_promisc will be cleared,
+ * but the DLS will remain on the mi_promisc_list until the
+ * walk is completed. If we do not recognize this case here,
+ * we won't properly execute the ds_promisc case in the common
+ * accept routine -- and we will potentially accept a packet
+ * that has originated with this DLS (which in turn can
+ * induce recursion and death by stack overflow). If
+ * ds_promisc is zero, we know that we are in this window --
+ * and we refuse to accept the packet.
+ */
+ return (B_FALSE);
+ }
+
return (dls_accept_common(dsp, mhip, ds_rx, ds_rx_arg, B_TRUE,
loopback));
}
@@ -661,7 +720,10 @@ dls_mac_active_set(dls_link_t *dlp)
* Set the function to start receiving packets.
*/
mac_rx_set(dlp->dl_mch, i_dls_link_rx, dlp);
+ } else if (dlp->dl_exclusive == B_TRUE) {
+ return (EBUSY);
}
+
dlp->dl_nactive++;
return (0);
}
@@ -687,7 +749,11 @@ dls_active_set(dld_str_t *dsp)
if (dsp->ds_passivestate == DLD_PASSIVE)
return (0);
- /* If we're already active, then there's nothing more to do. */
+ if (dsp->ds_dlp->dl_exclusive == B_TRUE &&
+ dsp->ds_passivestate != DLD_EXCLUSIVE)
+ return (EBUSY);
+
+ /* If we're already active, we need to check the link's exclusivity */
if ((dsp->ds_nactive == 0) &&
((err = dls_mac_active_set(dsp->ds_dlp)) != 0)) {
/* except for ENXIO all other errors are mapped to EBUSY */
@@ -696,7 +762,8 @@ dls_active_set(dld_str_t *dsp)
return (err);
}
- dsp->ds_passivestate = DLD_ACTIVE;
+ dsp->ds_passivestate = dsp->ds_dlp->dl_exclusive == B_TRUE ?
+ DLD_EXCLUSIVE : DLD_ACTIVE;
dsp->ds_nactive++;
return (0);
}
@@ -727,7 +794,32 @@ dls_active_clear(dld_str_t *dsp, boolean_t all)
if (dsp->ds_nactive != 0)
return;
- ASSERT(dsp->ds_passivestate == DLD_ACTIVE);
+ ASSERT(dsp->ds_passivestate == DLD_ACTIVE ||
+ dsp->ds_passivestate == DLD_EXCLUSIVE);
dls_mac_active_clear(dsp->ds_dlp);
+ /*
+ * We verify below to ensure that no other part of DLS has mucked with
+ * our exclusive state.
+ */
+ if (dsp->ds_passivestate == DLD_EXCLUSIVE)
+ VERIFY(dls_exclusive_set(dsp, B_FALSE) == 0);
dsp->ds_passivestate = DLD_UNINITIALIZED;
}
+
+int
+dls_exclusive_set(dld_str_t *dsp, boolean_t enable)
+{
+ ASSERT(MAC_PERIM_HELD(dsp->ds_mh));
+
+ if (enable == B_FALSE) {
+ dsp->ds_dlp->dl_exclusive = B_FALSE;
+ return (0);
+ }
+
+ if (dsp->ds_dlp->dl_nactive != 0)
+ return (EBUSY);
+
+ dsp->ds_dlp->dl_exclusive = B_TRUE;
+
+ return (0);
+}
diff --git a/usr/src/uts/common/io/dls/dls_link.c b/usr/src/uts/common/io/dls/dls_link.c
index 4099d0b801..eee3569b10 100644
--- a/usr/src/uts/common/io/dls/dls_link.c
+++ b/usr/src/uts/common/io/dls/dls_link.c
@@ -686,6 +686,7 @@ i_dls_link_destroy(dls_link_t *dlp)
dlp->dl_mnh = NULL;
dlp->dl_unknowns = 0;
dlp->dl_nonip_cnt = 0;
+ dlp->dl_exclusive = B_FALSE;
kmem_cache_free(i_dls_link_cachep, dlp);
}
diff --git a/usr/src/uts/common/io/dls/dls_mgmt.c b/usr/src/uts/common/io/dls/dls_mgmt.c
index 49e867a19e..90b65ab36a 100644
--- a/usr/src/uts/common/io/dls/dls_mgmt.c
+++ b/usr/src/uts/common/io/dls/dls_mgmt.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2019 Joyent, Inc.
*/
/*
* Copyright (c) 2016 by Delphix. All rights reserved.
@@ -85,6 +86,14 @@ static door_handle_t dls_mgmt_dh = NULL;
/* dls_devnet_t dd_flags */
#define DD_CONDEMNED 0x1
#define DD_IMPLICIT_IPTUN 0x2 /* Implicitly-created ip*.*tun* tunnel */
+#define DD_INITIALIZING 0x4
+
+/*
+ * If the link is marked as initializing or condemned then it should
+ * not be visible outside of the DLS framework.
+ */
+#define DD_NOT_VISIBLE(flags) ( \
+ (flags & (DD_CONDEMNED | DD_INITIALIZING)) != 0)
/*
* This structure is used to keep the <linkid, macname> mapping.
@@ -108,13 +117,14 @@ typedef struct dls_devnet_s {
zoneid_t dd_zid; /* current zone */
boolean_t dd_prop_loaded;
taskqid_t dd_prop_taskid;
+ boolean_t dd_transient; /* link goes away when zone does */
} dls_devnet_t;
static int i_dls_devnet_create_iptun(const char *, const char *,
datalink_id_t *);
static int i_dls_devnet_destroy_iptun(datalink_id_t);
-static int i_dls_devnet_setzid(dls_devnet_t *, zoneid_t, boolean_t);
-static int dls_devnet_unset(const char *, datalink_id_t *, boolean_t);
+static int i_dls_devnet_setzid(dls_devnet_t *, zoneid_t, boolean_t, boolean_t);
+static int dls_devnet_unset(mac_handle_t, datalink_id_t *, boolean_t);
/*ARGSUSED*/
static int
@@ -134,9 +144,9 @@ i_dls_devnet_destructor(void *buf, void *arg)
{
dls_devnet_t *ddp = buf;
- ASSERT(ddp->dd_ksp == NULL);
- ASSERT(ddp->dd_ref == 0);
- ASSERT(ddp->dd_tref == 0);
+ VERIFY(ddp->dd_ksp == NULL);
+ VERIFY(ddp->dd_ref == 0);
+ VERIFY(ddp->dd_tref == 0);
mutex_destroy(&ddp->dd_mutex);
cv_destroy(&ddp->dd_cv);
}
@@ -148,7 +158,12 @@ dls_zone_remove(datalink_id_t linkid, void *arg)
dls_devnet_t *ddp;
if (dls_devnet_hold_tmp(linkid, &ddp) == 0) {
- (void) dls_devnet_setzid(ddp, GLOBAL_ZONEID);
+ /*
+ * Don't bother moving transient links back to the global zone
+ * since we will simply delete them in dls_devnet_unset.
+ */
+ if (!ddp->dd_transient)
+ (void) dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE);
dls_devnet_rele_tmp(ddp);
}
return (0);
@@ -529,6 +544,7 @@ dls_mgmt_get_linkid(const char *link, datalink_id_t *linkid)
getlinkid.ld_cmd = DLMGMT_CMD_GETLINKID;
(void) strlcpy(getlinkid.ld_link, link, MAXLINKNAMELEN);
+ getlinkid.ld_zoneid = getzoneid();
if ((err = i_dls_mgmt_upcall(&getlinkid, sizeof (getlinkid), &retval,
sizeof (retval))) == 0) {
@@ -537,6 +553,27 @@ dls_mgmt_get_linkid(const char *link, datalink_id_t *linkid)
return (err);
}
+int
+dls_mgmt_get_linkid_in_zone(const char *link, datalink_id_t *linkid,
+ zoneid_t zid)
+{
+ dlmgmt_door_getlinkid_t getlinkid;
+ dlmgmt_getlinkid_retval_t retval;
+ int err;
+
+ ASSERT(getzoneid() == GLOBAL_ZONEID || zid == getzoneid());
+ getlinkid.ld_cmd = DLMGMT_CMD_GETLINKID;
+ (void) strlcpy(getlinkid.ld_link, link, MAXLINKNAMELEN);
+ getlinkid.ld_zoneid = zid;
+
+ if ((err = i_dls_mgmt_upcall(&getlinkid, sizeof (getlinkid), &retval,
+ sizeof (retval))) == 0) {
+ *linkid = retval.lr_linkid;
+ }
+ return (err);
+}
+
+
datalink_id_t
dls_mgmt_get_next(datalink_id_t linkid, datalink_class_t class,
datalink_media_t dmedia, uint32_t flags)
@@ -736,13 +773,24 @@ dls_devnet_stat_update(kstat_t *ksp, int rw)
* Create the "link" kstats.
*/
static void
-dls_devnet_stat_create(dls_devnet_t *ddp, zoneid_t zoneid)
+dls_devnet_stat_create(dls_devnet_t *ddp, zoneid_t zoneid, zoneid_t newzoneid)
{
kstat_t *ksp;
+ char *nm;
+ char kname[MAXLINKNAMELEN];
+
+ if (zoneid != newzoneid) {
+ ASSERT(zoneid == GLOBAL_ZONEID);
+ (void) snprintf(kname, sizeof (kname), "z%d_%s", newzoneid,
+ ddp->dd_linkname);
+ nm = kname;
+ } else {
+ nm = ddp->dd_linkname;
+ }
- if (dls_stat_create("link", 0, ddp->dd_linkname, zoneid,
+ if (dls_stat_create("link", 0, nm, zoneid,
dls_devnet_stat_update, (void *)(uintptr_t)ddp->dd_linkid,
- &ksp) == 0) {
+ &ksp, newzoneid) == 0) {
ASSERT(ksp != NULL);
if (zoneid == ddp->dd_owner_zid) {
ASSERT(ddp->dd_ksp == NULL);
@@ -762,12 +810,12 @@ dls_devnet_stat_destroy(dls_devnet_t *ddp, zoneid_t zoneid)
{
if (zoneid == ddp->dd_owner_zid) {
if (ddp->dd_ksp != NULL) {
- kstat_delete(ddp->dd_ksp);
+ dls_stat_delete(ddp->dd_ksp);
ddp->dd_ksp = NULL;
}
} else {
if (ddp->dd_zone_ksp != NULL) {
- kstat_delete(ddp->dd_zone_ksp);
+ dls_stat_delete(ddp->dd_zone_ksp);
ddp->dd_zone_ksp = NULL;
}
}
@@ -778,24 +826,38 @@ dls_devnet_stat_destroy(dls_devnet_t *ddp, zoneid_t zoneid)
* and create the new set using the new name.
*/
static void
-dls_devnet_stat_rename(dls_devnet_t *ddp)
+dls_devnet_stat_rename(dls_devnet_t *ddp, boolean_t zoneinit)
{
if (ddp->dd_ksp != NULL) {
- kstat_delete(ddp->dd_ksp);
+ dls_stat_delete(ddp->dd_ksp);
ddp->dd_ksp = NULL;
}
- /* We can't rename a link while it's assigned to a non-global zone. */
+ if (zoneinit && ddp->dd_zone_ksp != NULL) {
+ dls_stat_delete(ddp->dd_zone_ksp);
+ ddp->dd_zone_ksp = NULL;
+ }
+ /*
+ * We can't rename a link while it's assigned to a non-global zone
+ * unless we're first initializing the zone while readying it.
+ */
ASSERT(ddp->dd_zone_ksp == NULL);
- dls_devnet_stat_create(ddp, ddp->dd_owner_zid);
+ dls_devnet_stat_create(ddp, ddp->dd_owner_zid,
+ (zoneinit ? ddp->dd_zid : ddp->dd_owner_zid));
+ if (zoneinit)
+ dls_devnet_stat_create(ddp, ddp->dd_zid, ddp->dd_zid);
}
/*
- * Associate a linkid with a given link (identified by macname)
+ * Associate the linkid with the link identified by macname. If this
+ * is called on behalf of a physical link then linkid may be
+ * DATALINK_INVALID_LINKID. Otherwise, if called on behalf of a
+ * virtual link, linkid must have a value.
*/
static int
-dls_devnet_set(const char *macname, datalink_id_t linkid, zoneid_t zoneid,
+dls_devnet_set(mac_handle_t mh, datalink_id_t linkid, zoneid_t zoneid,
dls_devnet_t **ddpp)
{
+ const char *macname = mac_name(mh);
dls_devnet_t *ddp = NULL;
datalink_class_t class;
int err;
@@ -828,17 +890,41 @@ dls_devnet_set(const char *macname, datalink_id_t linkid, zoneid_t zoneid,
}
/*
- * This might be a physical link that has already
- * been created, but which does not have a linkid
- * because dlmgmtd was not running when it was created.
+ * If we arrive here we know we are attempting to set
+ * the linkid on a physical link. A virtual link
+ * should never arrive here because it should never
+ * call this function without a linkid. Virtual links
+ * are created through dlgmtmd and thus we know
+ * dlmgmtd is alive to assign it a linkid (search for
+ * uses of dladm_create_datalink_id() to prove this to
+ * yourself); we don't have the same guarantee for a
+ * physical link which may perform an upcall for a
+ * linkid while dlmgmtd is down but will continue
+ * creating a devnet without the linkid (see
+ * softmac_create_datalink() to see how physical link
+ * creation works). That is why there is no entry in
+ * the id hash but there is one in the macname hash --
+ * softmac couldn't acquire a linkid the first time it
+ * called this function.
+ *
+ * Because of the check above, we also know that
+ * ddp->dd_linkid is not set. Following this, the link
+ * must still be in the DD_INITIALIZING state because
+ * that flag is removed IFF dd_linkid is set. This is
+ * why we can ASSERT the DD_INITIALIZING flag below if
+ * the call to i_dls_devnet_setzid() fails.
*/
if (linkid == DATALINK_INVALID_LINKID ||
class != DATALINK_CLASS_PHYS) {
err = EINVAL;
goto done;
}
+
+ ASSERT(ddp->dd_flags & DD_INITIALIZING);
+
} else {
ddp = kmem_cache_alloc(i_dls_devnet_cachep, KM_SLEEP);
+ ddp->dd_flags = DD_INITIALIZING;
ddp->dd_tref = 0;
ddp->dd_ref++;
ddp->dd_owner_zid = zoneid;
@@ -875,8 +961,19 @@ done:
rw_exit(&i_dls_devnet_lock);
if (err == 0) {
if (zoneid != GLOBAL_ZONEID &&
- (err = i_dls_devnet_setzid(ddp, zoneid, B_FALSE)) != 0)
- (void) dls_devnet_unset(macname, &linkid, B_TRUE);
+ (err = i_dls_devnet_setzid(ddp, zoneid, B_FALSE,
+ B_FALSE)) != 0) {
+ /*
+ * At this point the link is marked as
+ * DD_INITIALIZING -- there can be no
+ * outstanding temp refs and therefore no need
+ * to wait for them.
+ */
+ ASSERT(ddp->dd_flags & DD_INITIALIZING);
+ (void) dls_devnet_unset(mh, &linkid, B_FALSE);
+ return (err);
+ }
+
/*
* The kstat subsystem holds its own locks (rather perimeter)
* before calling the ks_update (dls_devnet_stat_update) entry
@@ -884,20 +981,35 @@ done:
* lock hierarchy is kstat locks -> i_dls_devnet_lock.
*/
if (stat_create)
- dls_devnet_stat_create(ddp, zoneid);
+ dls_devnet_stat_create(ddp, zoneid, zoneid);
if (ddpp != NULL)
*ddpp = ddp;
+
+ mutex_enter(&ddp->dd_mutex);
+ if (linkid != DATALINK_INVALID_LINKID && !ddp->dd_prop_loaded &&
+ ddp->dd_prop_taskid == TASKQID_INVALID) {
+ ddp->dd_prop_taskid = taskq_dispatch(system_taskq,
+ dls_devnet_prop_task, ddp, TQ_SLEEP);
+ }
+ mutex_exit(&ddp->dd_mutex);
+
}
return (err);
}
/*
- * Disassociate a linkid with a given link (identified by macname)
- * This waits until temporary references to the dls_devnet_t are gone.
+ * Disassociate the linkid from the link identified by macname. If
+ * wait is B_TRUE, wait until all temporary refs are released and the
+ * prop task is finished.
+ *
+ * If waiting then you SHOULD NOT call this from inside the MAC perim
+ * as deadlock will ensue. Otherwise, this function is safe to call
+ * from inside or outside the MAC perim.
*/
static int
-dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait)
+dls_devnet_unset(mac_handle_t mh, datalink_id_t *id, boolean_t wait)
{
+ const char *macname = mac_name(mh);
dls_devnet_t *ddp;
int err;
mod_hash_val_t val;
@@ -918,21 +1030,62 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait)
* deadlock. Return EBUSY if the asynchronous thread started for
* property loading as part of the post attach hasn't yet completed.
*/
- ASSERT(ddp->dd_ref != 0);
+ VERIFY(ddp->dd_ref != 0);
if ((ddp->dd_ref != 1) || (!wait &&
(ddp->dd_tref != 0 || ddp->dd_prop_taskid != 0))) {
- mutex_exit(&ddp->dd_mutex);
- rw_exit(&i_dls_devnet_lock);
- return (EBUSY);
+ int zstatus = 0;
+
+ /*
+ * There are a couple of alternatives that might be going on
+ * here; a) the zone is shutting down and it has a transient
+ * link assigned, in which case we want to clean it up instead
+ * of moving it back to the global zone, or b) its possible
+ * that we're trying to clean up an orphaned vnic that was
+ * delegated to a zone and which wasn't cleaned up properly
+ * when the zone went away. Check for either of these cases
+ * before we simply return EBUSY.
+ *
+ * zstatus indicates which situation we are dealing with:
+ * 0 - means return EBUSY
+ * 1 - means case (a), cleanup transient link
+ * -1 - means case (b), orphained VNIC
+ */
+ if (ddp->dd_ref > 1 && ddp->dd_zid != GLOBAL_ZONEID) {
+ zone_t *zp;
+
+ if ((zp = zone_find_by_id(ddp->dd_zid)) == NULL) {
+ zstatus = -1;
+ } else {
+ if (ddp->dd_transient) {
+ zone_status_t s = zone_status_get(zp);
+
+ if (s >= ZONE_IS_SHUTTING_DOWN)
+ zstatus = 1;
+ }
+ zone_rele(zp);
+ }
+ }
+
+ if (zstatus == 0) {
+ mutex_exit(&ddp->dd_mutex);
+ rw_exit(&i_dls_devnet_lock);
+ return (EBUSY);
+ }
+
+ /*
+ * We want to delete the link, reset ref to 1;
+ */
+ if (zstatus == -1)
+ /* Log a warning, but continue in this case */
+ cmn_err(CE_WARN, "clear orphaned datalink: %s\n",
+ ddp->dd_linkname);
+ ddp->dd_ref = 1;
}
ddp->dd_flags |= DD_CONDEMNED;
ddp->dd_ref--;
*id = ddp->dd_linkid;
- if (ddp->dd_zid != GLOBAL_ZONEID)
- (void) i_dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE);
-
/*
* Remove this dls_devnet_t from the hash table.
*/
@@ -947,19 +1100,40 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait)
}
rw_exit(&i_dls_devnet_lock);
+ /*
+ * It is important to call i_dls_devnet_setzid() WITHOUT the
+ * i_dls_devnet_lock held. The setzid call grabs the MAC
+ * perim; thus causing DLS -> MAC lock ordering if performed
+ * with the i_dls_devnet_lock held. This forces consumers to
+ * grab the MAC perim before calling dls_devnet_unset() (the
+ * locking rules state MAC -> DLS order). By performing the
+ * setzid outside of the i_dls_devnet_lock consumers can
+ * safely call dls_devnet_unset() outside the MAC perim.
+ */
+ if (ddp->dd_zid != GLOBAL_ZONEID) {
+ dls_devnet_stat_destroy(ddp, ddp->dd_zid);
+ (void) i_dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE,
+ B_FALSE);
+ }
+
if (wait) {
/*
* Wait until all temporary references are released.
+ * The holders of the tref need the MAC perim to
+ * perform their work and release the tref. To avoid
+ * deadlock, assert that the perim is never held here.
*/
+ ASSERT0(MAC_PERIM_HELD(mh));
while ((ddp->dd_tref != 0) || (ddp->dd_prop_taskid != 0))
cv_wait(&ddp->dd_cv, &ddp->dd_mutex);
} else {
- ASSERT(ddp->dd_tref == 0 &&
- ddp->dd_prop_taskid == (taskqid_t)NULL);
+ VERIFY(ddp->dd_tref == 0);
+ VERIFY(ddp->dd_prop_taskid == (taskqid_t)NULL);
}
- if (ddp->dd_linkid != DATALINK_INVALID_LINKID)
+ if (ddp->dd_linkid != DATALINK_INVALID_LINKID) {
dls_devnet_stat_destroy(ddp, ddp->dd_owner_zid);
+ }
ddp->dd_prop_loaded = B_FALSE;
ddp->dd_linkid = DATALINK_INVALID_LINKID;
@@ -1019,8 +1193,8 @@ dls_devnet_hold_common(datalink_id_t linkid, dls_devnet_t **ddpp,
}
mutex_enter(&ddp->dd_mutex);
- ASSERT(ddp->dd_ref > 0);
- if (ddp->dd_flags & DD_CONDEMNED) {
+ VERIFY(ddp->dd_ref > 0);
+ if (DD_NOT_VISIBLE(ddp->dd_flags)) {
mutex_exit(&ddp->dd_mutex);
rw_exit(&i_dls_devnet_lock);
return (ENOENT);
@@ -1087,8 +1261,8 @@ dls_devnet_hold_by_dev(dev_t dev, dls_dl_handle_t *ddhp)
return (ENOENT);
}
mutex_enter(&ddp->dd_mutex);
- ASSERT(ddp->dd_ref > 0);
- if (ddp->dd_flags & DD_CONDEMNED) {
+ VERIFY(ddp->dd_ref > 0);
+ if (DD_NOT_VISIBLE(ddp->dd_flags)) {
mutex_exit(&ddp->dd_mutex);
rw_exit(&i_dls_devnet_lock);
return (ENOENT);
@@ -1105,7 +1279,7 @@ void
dls_devnet_rele(dls_devnet_t *ddp)
{
mutex_enter(&ddp->dd_mutex);
- ASSERT(ddp->dd_ref > 1);
+ VERIFY(ddp->dd_ref > 1);
ddp->dd_ref--;
if ((ddp->dd_flags & DD_IMPLICIT_IPTUN) && ddp->dd_ref == 1) {
mutex_exit(&ddp->dd_mutex);
@@ -1117,7 +1291,7 @@ dls_devnet_rele(dls_devnet_t *ddp)
}
static int
-dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp)
+dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp, zoneid_t zid)
{
char drv[MAXLINKNAMELEN];
uint_t ppa;
@@ -1127,7 +1301,7 @@ dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp)
dls_dev_handle_t ddh;
int err;
- if ((err = dls_mgmt_get_linkid(link, &linkid)) == 0)
+ if ((err = dls_mgmt_get_linkid_in_zone(link, &linkid, zid)) == 0)
return (dls_devnet_hold(linkid, ddpp));
/*
@@ -1270,9 +1444,15 @@ dls_devnet_phydev(datalink_id_t vlanid, dev_t *devp)
*
* This case does not change the <link name, linkid> mapping, so the link's
* kstats need to be updated with using name associated the given id2.
+ *
+ * The zoneinit parameter is used to allow us to create a VNIC in the global
+ * zone which is assigned to a non-global zone. Since there is a race condition
+ * in the create process if two VNICs have the same name, we need to rename it
+ * after it has been assigned to the zone.
*/
int
-dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
+dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link,
+ boolean_t zoneinit)
{
dls_dev_handle_t ddh = NULL;
int err = 0;
@@ -1317,10 +1497,12 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
}
mutex_enter(&ddp->dd_mutex);
- if (ddp->dd_ref > 1) {
- mutex_exit(&ddp->dd_mutex);
- err = EBUSY;
- goto done;
+ if (!zoneinit) {
+ if (ddp->dd_ref > 1) {
+ mutex_exit(&ddp->dd_mutex);
+ err = EBUSY;
+ goto done;
+ }
}
mutex_exit(&ddp->dd_mutex);
@@ -1331,7 +1513,15 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
/* rename mac client name and its flow if exists */
if ((err = mac_open(ddp->dd_mac, &mh)) != 0)
goto done;
- (void) mac_rename_primary(mh, link);
+ if (zoneinit) {
+ char tname[MAXLINKNAMELEN];
+
+ (void) snprintf(tname, sizeof (tname), "z%d_%s",
+ ddp->dd_zid, link);
+ (void) mac_rename_primary(mh, tname);
+ } else {
+ (void) mac_rename_primary(mh, link);
+ }
mac_close(mh);
goto done;
}
@@ -1398,7 +1588,7 @@ done:
rw_exit(&i_dls_devnet_lock);
if (err == 0)
- dls_devnet_stat_rename(ddp);
+ dls_devnet_stat_rename(ddp, zoneinit);
if (mph != NULL)
mac_perim_exit(mph);
@@ -1407,7 +1597,8 @@ done:
}
static int
-i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop)
+i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop,
+ boolean_t transient)
{
int err;
mac_perim_handle_t mph;
@@ -1436,10 +1627,18 @@ i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop)
sizeof (retval));
if (err != 0)
goto done;
+
+ /*
+ * We set upcall_done only if the upcall is
+ * successful. This way, if dls_link_setzid() fails,
+ * we know another upcall must be done to reset the
+ * dlmgmtd state.
+ */
upcall_done = B_TRUE;
}
if ((err = dls_link_setzid(ddp->dd_mac, new_zoneid)) == 0) {
ddp->dd_zid = new_zoneid;
+ ddp->dd_transient = transient;
devnet_need_rebuild = B_TRUE;
}
@@ -1454,7 +1653,7 @@ done:
}
int
-dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid)
+dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid, boolean_t transient)
{
dls_devnet_t *ddp;
int err;
@@ -1476,7 +1675,7 @@ dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid)
refheld = B_TRUE;
}
- if ((err = i_dls_devnet_setzid(ddh, new_zid, B_TRUE)) != 0) {
+ if ((err = i_dls_devnet_setzid(ddh, new_zid, B_TRUE, transient)) != 0) {
if (refheld)
dls_devnet_rele(ddp);
return (err);
@@ -1493,7 +1692,7 @@ dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid)
if (old_zid != GLOBAL_ZONEID)
dls_devnet_stat_destroy(ddh, old_zid);
if (new_zid != GLOBAL_ZONEID)
- dls_devnet_stat_create(ddh, new_zid);
+ dls_devnet_stat_create(ddh, new_zid, new_zid);
return (0);
}
@@ -1531,15 +1730,19 @@ dls_devnet_islinkvisible(datalink_id_t linkid, zoneid_t zoneid)
* Access a vanity naming node.
*/
int
-dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp)
+dls_devnet_open_in_zone(const char *link, dls_dl_handle_t *dhp, dev_t *devp,
+ zoneid_t zid)
{
dls_devnet_t *ddp;
dls_link_t *dlp;
- zoneid_t zid = getzoneid();
+ zoneid_t czid = getzoneid();
int err;
mac_perim_handle_t mph;
- if ((err = dls_devnet_hold_by_name(link, &ddp)) != 0)
+ if (czid != GLOBAL_ZONEID && czid != zid)
+ return (ENOENT);
+
+ if ((err = dls_devnet_hold_by_name(link, &ddp, zid)) != 0)
return (err);
dls_devnet_prop_task_wait(ddp);
@@ -1572,6 +1775,12 @@ dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp)
return (0);
}
+int
+dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp)
+{
+ return (dls_devnet_open_in_zone(link, dhp, devp, getzoneid()));
+}
+
/*
* Close access to a vanity naming node.
*/
@@ -1628,13 +1837,32 @@ dls_devnet_create(mac_handle_t mh, datalink_id_t linkid, zoneid_t zoneid)
* we need to use the linkid to get the user name for the link
* when we create the MAC client.
*/
- if ((err = dls_devnet_set(mac_name(mh), linkid, zoneid, &ddp)) == 0) {
+ if ((err = dls_devnet_set(mh, linkid, zoneid, &ddp)) == 0) {
if ((err = dls_link_hold_create(mac_name(mh), &dlp)) != 0) {
mac_perim_exit(mph);
- (void) dls_devnet_unset(mac_name(mh), &linkid, B_TRUE);
+ (void) dls_devnet_unset(mh, &linkid, B_FALSE);
return (err);
}
+
+ /*
+ * If dd_linkid is set then the link was successfully
+ * initialized. In this case we can remove the
+ * initializing flag and make the link visible to the
+ * rest of the system.
+ *
+ * If not set then we were called by softmac and it
+ * was unable to obtain a linkid for the physical link
+ * because dlmgmtd is down. In that case softmac will
+ * eventually obtain a linkid and call
+ * dls_devnet_recreate() to complete initialization.
+ */
+ mutex_enter(&ddp->dd_mutex);
+ if (ddp->dd_linkid != DATALINK_INVALID_LINKID)
+ ddp->dd_flags &= ~DD_INITIALIZING;
+ mutex_exit(&ddp->dd_mutex);
+
}
+
mac_perim_exit(mph);
return (err);
}
@@ -1648,8 +1876,19 @@ dls_devnet_create(mac_handle_t mh, datalink_id_t linkid, zoneid_t zoneid)
int
dls_devnet_recreate(mac_handle_t mh, datalink_id_t linkid)
{
- ASSERT(linkid != DATALINK_INVALID_LINKID);
- return (dls_devnet_set(mac_name(mh), linkid, GLOBAL_ZONEID, NULL));
+ dls_devnet_t *ddp;
+ int err;
+
+ VERIFY(linkid != DATALINK_INVALID_LINKID);
+ if ((err = dls_devnet_set(mh, linkid, GLOBAL_ZONEID, &ddp)) == 0) {
+ mutex_enter(&ddp->dd_mutex);
+ if (ddp->dd_linkid != DATALINK_INVALID_LINKID)
+ ddp->dd_flags &= ~DD_INITIALIZING;
+ mutex_exit(&ddp->dd_mutex);
+ }
+
+ return (err);
+
}
int
@@ -1659,15 +1898,52 @@ dls_devnet_destroy(mac_handle_t mh, datalink_id_t *idp, boolean_t wait)
mac_perim_handle_t mph;
*idp = DATALINK_INVALID_LINKID;
- err = dls_devnet_unset(mac_name(mh), idp, wait);
- if (err != 0 && err != ENOENT)
+ err = dls_devnet_unset(mh, idp, wait);
+
+ /*
+ * We continue on in the face of ENOENT because the devnet
+ * unset and DLS link release are not atomic and we may have a
+ * scenario where there is no entry in i_dls_devnet_hash for
+ * the MAC name but there is an entry in i_dls_link_hash. For
+ * example, if the following occurred:
+ *
+ * 1. dls_devnet_unset() returns success, and
+ *
+ * 2. dls_link_rele_by_name() fails with ENOTEMPTY because
+ * flows still exist, and
+ *
+ * 3. dls_devnet_set() fails to set the zone id and calls
+ * dls_devnet_unset() -- leaving an entry in
+ * i_dls_link_hash but no corresponding entry in
+ * i_dls_devnet_hash.
+ *
+ * Even if #3 wasn't true the dls_devnet_set() may fail for
+ * different reasons in the future; the point is that it _can_
+ * fail as part of its contract. We can't rely on it working
+ * so we must assume that these two pieces of state (devnet
+ * and link hashes), which should always be in sync, can get
+ * out of sync and thus even if we get ENOENT from the devnet
+ * hash we should still try to delete from the link hash just
+ * in case.
+ *
+ * We could prevent the ENOTEMPTY from dls_link_rele_by_name()
+ * by calling mac_disable() before calling
+ * dls_devnet_destroy() but that's not currently possible due
+ * to a long-standing bug. OpenSolaris 6791335: The semantics
+ * of mac_disable() were modified by Crossbow such that
+ * dls_devnet_destroy() needs to be called before
+ * mac_disable() can succeed. This is because of the implicit
+ * reference that dls has on the mac_impl_t.
+ */
+ if (err != 0 && err != ENOENT) {
return (err);
+ }
mac_perim_enter_by_mh(mh, &mph);
err = dls_link_rele_by_name(mac_name(mh));
- mac_perim_exit(mph);
-
if (err != 0) {
+ dls_devnet_t *ddp;
+
/*
* XXX It is a general GLDv3 bug that dls_devnet_set() has to
* be called to re-set the link when destroy fails. The
@@ -1675,9 +1951,22 @@ dls_devnet_destroy(mac_handle_t mh, datalink_id_t *idp, boolean_t wait)
* called from kernel context or from a zone other than that
* which initially created the link.
*/
- (void) dls_devnet_set(mac_name(mh), *idp, crgetzoneid(CRED()),
- NULL);
+ (void) dls_devnet_set(mh, *idp, crgetzoneid(CRED()), &ddp);
+
+ /*
+ * You might think dd_linkid should always be set
+ * here, but in the case where dls_devnet_unset()
+ * returns ENOENT it will be DATALINK_INVALID_LINKID.
+ * Stay consistent with the rest of DLS and only
+ * remove the initializing flag if linkid is set.
+ */
+ mutex_enter(&ddp->dd_mutex);
+ if (ddp->dd_linkid != DATALINK_INVALID_LINKID)
+ ddp->dd_flags &= ~DD_INITIALIZING;
+ mutex_exit(&ddp->dd_mutex);
}
+
+ mac_perim_exit(mph);
return (err);
}
diff --git a/usr/src/uts/common/io/dls/dls_stat.c b/usr/src/uts/common/io/dls/dls_stat.c
index 51e4be7260..82dceff278 100644
--- a/usr/src/uts/common/io/dls/dls_stat.c
+++ b/usr/src/uts/common/io/dls/dls_stat.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2011 Joyent, Inc. All rights reserved.
*/
/*
@@ -30,30 +31,33 @@
#include <sys/dld_impl.h>
#include <sys/mac_ether.h>
-static mac_stat_info_t i_dls_si[] = {
- { MAC_STAT_IFSPEED, "ifspeed", KSTAT_DATA_UINT64, 0 },
- { MAC_STAT_MULTIRCV, "multircv", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_BRDCSTRCV, "brdcstrcv", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_MULTIXMT, "multixmt", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_BRDCSTXMT, "brdcstxmt", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_NORCVBUF, "norcvbuf", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_IERRORS, "ierrors", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_NOXMTBUF, "noxmtbuf", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_OERRORS, "oerrors", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_COLLISIONS, "collisions", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_RBYTES, "rbytes", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_IPACKETS, "ipackets", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_OBYTES, "obytes", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_OPACKETS, "opackets", KSTAT_DATA_UINT32, 0 },
- { MAC_STAT_RBYTES, "rbytes64", KSTAT_DATA_UINT64, 0 },
- { MAC_STAT_IPACKETS, "ipackets64", KSTAT_DATA_UINT64, 0 },
- { MAC_STAT_OBYTES, "obytes64", KSTAT_DATA_UINT64, 0 },
- { MAC_STAT_OPACKETS, "opackets64", KSTAT_DATA_UINT64, 0 },
- { MAC_STAT_LINK_STATE, "link_state", KSTAT_DATA_UINT32,
- (uint64_t)LINK_STATE_UNKNOWN}
-};
-
-#define STAT_INFO_COUNT (sizeof (i_dls_si) / sizeof (i_dls_si[0]))
+/*
+ * structure for link kstats
+ */
+typedef struct {
+ kstat_named_t dk_ifspeed;
+ kstat_named_t dk_multircv;
+ kstat_named_t dk_brdcstrcv;
+ kstat_named_t dk_multixmt;
+ kstat_named_t dk_brdcstxmt;
+ kstat_named_t dk_norcvbuf;
+ kstat_named_t dk_ierrors;
+ kstat_named_t dk_noxmtbuf;
+ kstat_named_t dk_oerrors;
+ kstat_named_t dk_collisions;
+ kstat_named_t dk_rbytes;
+ kstat_named_t dk_ipackets;
+ kstat_named_t dk_obytes;
+ kstat_named_t dk_opackets;
+ kstat_named_t dk_rbytes64;
+ kstat_named_t dk_ipackets64;
+ kstat_named_t dk_obytes64;
+ kstat_named_t dk_opackets64;
+ kstat_named_t dk_link_state;
+ kstat_named_t dk_link_duplex;
+ kstat_named_t dk_unknowns;
+ kstat_named_t dk_zonename;
+} dls_kstat_t;
/*
* Exported functions.
@@ -61,42 +65,54 @@ static mac_stat_info_t i_dls_si[] = {
int
dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw)
{
- kstat_named_t *knp;
- uint_t i;
- uint64_t val;
+ dls_kstat_t *dkp = ksp->ks_data;
if (rw != KSTAT_READ)
return (EACCES);
- knp = (kstat_named_t *)ksp->ks_data;
- for (i = 0; i < STAT_INFO_COUNT; i++) {
- val = mac_stat_get(dlp->dl_mh, i_dls_si[i].msi_stat);
-
- switch (i_dls_si[i].msi_type) {
- case KSTAT_DATA_UINT64:
- knp->value.ui64 = val;
- break;
- case KSTAT_DATA_UINT32:
- knp->value.ui32 = (uint32_t)val;
- break;
- default:
- ASSERT(B_FALSE);
- }
-
- knp++;
- }
+ dkp->dk_ifspeed.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_IFSPEED);
+ dkp->dk_multircv.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_MULTIRCV);
+ dkp->dk_brdcstrcv.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_BRDCSTRCV);
+ dkp->dk_multixmt.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_MULTIXMT);
+ dkp->dk_brdcstxmt.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_BRDCSTXMT);
+ dkp->dk_norcvbuf.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_NORCVBUF);
+ dkp->dk_ierrors.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_IERRORS);
+ dkp->dk_noxmtbuf.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_NOXMTBUF);
+ dkp->dk_oerrors.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_OERRORS);
+ dkp->dk_collisions.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_COLLISIONS);
+ dkp->dk_rbytes.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_RBYTES);
+ dkp->dk_ipackets.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_IPACKETS);
+ dkp->dk_obytes.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_OBYTES);
+ dkp->dk_opackets.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_OPACKETS);
+ dkp->dk_rbytes64.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_RBYTES);
+ dkp->dk_ipackets64.value.ui64 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_IPACKETS);
+ dkp->dk_obytes64.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_OBYTES);
+ dkp->dk_opackets64.value.ui64 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_OPACKETS);
+ dkp->dk_link_state.value.ui32 = mac_stat_get(dlp->dl_mh,
+ MAC_STAT_LINK_STATE);
/*
* Ethernet specific kstat "link_duplex"
*/
if (dlp->dl_mip->mi_nativemedia != DL_ETHER) {
- knp->value.ui32 = LINK_DUPLEX_UNKNOWN;
+ dkp->dk_link_duplex.value.ui32 = LINK_DUPLEX_UNKNOWN;
} else {
- val = mac_stat_get(dlp->dl_mh, ETHER_STAT_LINK_DUPLEX);
- knp->value.ui32 = (uint32_t)val;
+ dkp->dk_link_duplex.value.ui32 =
+ (uint32_t)mac_stat_get(dlp->dl_mh, ETHER_STAT_LINK_DUPLEX);
}
- knp++;
- knp->value.ui32 = dlp->dl_unknowns;
+
+ dkp->dk_unknowns.value.ui32 = dlp->dl_unknowns;
return (0);
}
@@ -104,30 +120,66 @@ dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw)
int
dls_stat_create(const char *module, int instance, const char *name,
zoneid_t zoneid, int (*update)(struct kstat *, int), void *private,
- kstat_t **kspp)
+ kstat_t **kspp, zoneid_t newzoneid)
{
kstat_t *ksp;
- kstat_named_t *knp;
- uint_t i;
+ zone_t *zone;
+ dls_kstat_t *dkp;
if ((ksp = kstat_create_zone(module, instance, name, "net",
- KSTAT_TYPE_NAMED, STAT_INFO_COUNT + 2, 0, zoneid)) == NULL) {
+ KSTAT_TYPE_NAMED, sizeof (dls_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL, zoneid)) == NULL) {
return (EINVAL);
}
ksp->ks_update = update;
ksp->ks_private = private;
+ dkp = ksp->ks_data = kmem_zalloc(sizeof (dls_kstat_t), KM_SLEEP);
+ if ((zone = zone_find_by_id(newzoneid)) != NULL) {
+ ksp->ks_data_size += strlen(zone->zone_name) + 1;
+ }
- knp = (kstat_named_t *)ksp->ks_data;
- for (i = 0; i < STAT_INFO_COUNT; i++) {
- kstat_named_init(knp, i_dls_si[i].msi_name,
- i_dls_si[i].msi_type);
- knp++;
+ kstat_named_init(&dkp->dk_ifspeed, "ifspeed", KSTAT_DATA_UINT64);
+ kstat_named_init(&dkp->dk_multircv, "multircv", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_brdcstrcv, "brdcstrcv", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_multixmt, "multixmt", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_brdcstxmt, "brdcstxmt", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_norcvbuf, "norcvbuf", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_ierrors, "ierrors", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_noxmtbuf, "noxmtbuf", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_oerrors, "oerrors", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_collisions, "collisions", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_rbytes, "rbytes", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_ipackets, "ipackets", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_obytes, "obytes", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_opackets, "opackets", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_rbytes64, "rbytes64", KSTAT_DATA_UINT64);
+ kstat_named_init(&dkp->dk_ipackets64, "ipackets64", KSTAT_DATA_UINT64);
+ kstat_named_init(&dkp->dk_obytes64, "obytes64", KSTAT_DATA_UINT64);
+ kstat_named_init(&dkp->dk_opackets64, "opackets64", KSTAT_DATA_UINT64);
+ kstat_named_init(&dkp->dk_link_state, "link_state", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_link_duplex, "link_duplex",
+ KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_unknowns, "unknowns", KSTAT_DATA_UINT32);
+ kstat_named_init(&dkp->dk_zonename, "zonename", KSTAT_DATA_STRING);
+
+ if (zone != NULL) {
+ kstat_named_setstr(&dkp->dk_zonename, zone->zone_name);
+ zone_rele(zone);
}
- kstat_named_init(knp++, "link_duplex", KSTAT_DATA_UINT32);
- kstat_named_init(knp, "unknowns", KSTAT_DATA_UINT32);
kstat_install(ksp);
*kspp = ksp;
return (0);
}
+
+void
+dls_stat_delete(kstat_t *ksp)
+{
+ void *data;
+ if (ksp != NULL) {
+ data = ksp->ks_data;
+ kstat_delete(ksp);
+ kmem_free(data, sizeof (dls_kstat_t));
+ }
+}
diff --git a/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE
new file mode 100644
index 0000000000..00aefb6f51
--- /dev/null
+++ b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE
@@ -0,0 +1,32 @@
+/*
+ * MegaRAID device driver for SAS2.0 controllers
+ * Copyright (c) 2009, LSI Logic Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the author nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
diff --git a/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip
new file mode 100644
index 0000000000..ac6d2d1b15
--- /dev/null
+++ b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip
@@ -0,0 +1 @@
+DR_SAS DRIVER
diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.c b/usr/src/uts/common/io/dr_sas/dr_sas.c
new file mode 100644
index 0000000000..0d277ada42
--- /dev/null
+++ b/usr/src/uts/common/io/dr_sas/dr_sas.c
@@ -0,0 +1,5509 @@
+/*
+ * dr_sas.c: source for dr_sas driver
+ *
+ * MegaRAID device driver for SAS2.0 controllers
+ * Copyright (c) 2008-2009, LSI Logic Corporation.
+ * All rights reserved.
+ *
+ * Version:
+ * Author:
+ * Arun Chandrashekhar
+ * Manju R
+ * Rajesh Prabhakaran
+ * Seokmann Ju
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the author nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/file.h>
+#include <sys/errno.h>
+#include <sys/open.h>
+#include <sys/cred.h>
+#include <sys/modctl.h>
+#include <sys/conf.h>
+#include <sys/devops.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/stat.h>
+#include <sys/mkdev.h>
+#include <sys/pci.h>
+#include <sys/scsi/scsi.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/atomic.h>
+#include <sys/signal.h>
+#include <sys/fs/dv_node.h> /* devfs_clean */
+
+#include "dr_sas.h"
+
+/*
+ * FMA header files
+ */
+#include <sys/ddifm.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/util.h>
+#include <sys/fm/io/ddi.h>
+
+/*
+ * Local static data
+ */
+static void *drsas_state = NULL;
+static int debug_level_g = CL_NONE;
+
+#pragma weak scsi_hba_open
+#pragma weak scsi_hba_close
+#pragma weak scsi_hba_ioctl
+
+static ddi_dma_attr_t drsas_generic_dma_attr = {
+ DMA_ATTR_V0, /* dma_attr_version */
+ 0, /* low DMA address range */
+ 0xFFFFFFFFU, /* high DMA address range */
+ 0xFFFFFFFFU, /* DMA counter register */
+ 8, /* DMA address alignment */
+ 0x07, /* DMA burstsizes */
+ 1, /* min DMA size */
+ 0xFFFFFFFFU, /* max DMA size */
+ 0xFFFFFFFFU, /* segment boundary */
+ DRSAS_MAX_SGE_CNT, /* dma_attr_sglen */
+ 512, /* granularity of device */
+ 0 /* bus specific DMA flags */
+};
+
+int32_t drsas_max_cap_maxxfer = 0x1000000;
+
+/*
+ * cb_ops contains base level routines
+ */
+static struct cb_ops drsas_cb_ops = {
+ drsas_open, /* open */
+ drsas_close, /* close */
+ nodev, /* strategy */
+ nodev, /* print */
+ nodev, /* dump */
+ nodev, /* read */
+ nodev, /* write */
+ drsas_ioctl, /* ioctl */
+ nodev, /* devmap */
+ nodev, /* mmap */
+ nodev, /* segmap */
+ nochpoll, /* poll */
+ nodev, /* cb_prop_op */
+ 0, /* streamtab */
+ D_NEW | D_HOTPLUG, /* cb_flag */
+ CB_REV, /* cb_rev */
+ nodev, /* cb_aread */
+ nodev /* cb_awrite */
+};
+
+/*
+ * dev_ops contains configuration routines
+ */
+static struct dev_ops drsas_ops = {
+ DEVO_REV, /* rev, */
+ 0, /* refcnt */
+ drsas_getinfo, /* getinfo */
+ nulldev, /* identify */
+ nulldev, /* probe */
+ drsas_attach, /* attach */
+ drsas_detach, /* detach */
+ drsas_reset, /* reset */
+ &drsas_cb_ops, /* char/block ops */
+ NULL, /* bus ops */
+ NULL, /* power */
+ ddi_quiesce_not_supported, /* quiesce */
+};
+
+char _depends_on[] = "misc/scsi";
+
+static struct modldrv modldrv = {
+ &mod_driverops, /* module type - driver */
+ DRSAS_VERSION,
+ &drsas_ops, /* driver ops */
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, /* ml_rev - must be MODREV_1 */
+ &modldrv, /* ml_linkage */
+ NULL /* end of driver linkage */
+};
+
+static struct ddi_device_acc_attr endian_attr = {
+ DDI_DEVICE_ATTR_V0,
+ DDI_STRUCTURE_LE_ACC,
+ DDI_STRICTORDER_ACC
+};
+
+
+/*
+ * ************************************************************************** *
+ * *
+ * common entry points - for loadable kernel modules *
+ * *
+ * ************************************************************************** *
+ */
+
+int
+_init(void)
+{
+ int ret;
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ ret = ddi_soft_state_init(&drsas_state,
+ sizeof (struct drsas_instance), 0);
+
+ if (ret != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN, "dr_sas: could not init state"));
+ return (ret);
+ }
+
+ if ((ret = scsi_hba_init(&modlinkage)) != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN, "dr_sas: could not init scsi hba"));
+ ddi_soft_state_fini(&drsas_state);
+ return (ret);
+ }
+
+ ret = mod_install(&modlinkage);
+
+ if (ret != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN, "dr_sas: mod_install failed"));
+ scsi_hba_fini(&modlinkage);
+ ddi_soft_state_fini(&drsas_state);
+ }
+
+ return (ret);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ return (mod_info(&modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+ int ret;
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ if ((ret = mod_remove(&modlinkage)) != DDI_SUCCESS)
+ return (ret);
+
+ scsi_hba_fini(&modlinkage);
+
+ ddi_soft_state_fini(&drsas_state);
+
+ return (ret);
+}
+
+
+/*
+ * ************************************************************************** *
+ * *
+ * common entry points - for autoconfiguration *
+ * *
+ * ************************************************************************** *
+ */
+
+static int
+drsas_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ int instance_no;
+ int nregs;
+ uint8_t added_isr_f = 0;
+ uint8_t added_soft_isr_f = 0;
+ uint8_t create_devctl_node_f = 0;
+ uint8_t create_scsi_node_f = 0;
+ uint8_t create_ioc_node_f = 0;
+ uint8_t tran_alloc_f = 0;
+ uint8_t irq;
+ uint16_t vendor_id;
+ uint16_t device_id;
+ uint16_t subsysvid;
+ uint16_t subsysid;
+ uint16_t command;
+ off_t reglength = 0;
+ int intr_types = 0;
+ char *data;
+ int msi_enable = 0;
+
+ scsi_hba_tran_t *tran;
+ ddi_dma_attr_t tran_dma_attr;
+ struct drsas_instance *instance;
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ /* CONSTCOND */
+ ASSERT(NO_COMPETING_THREADS);
+
+ instance_no = ddi_get_instance(dip);
+
+ /*
+ * check to see whether this device is in a DMA-capable slot.
+ */
+ if (ddi_slaveonly(dip) == DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas%d: Device in slave-only slot, unused",
+ instance_no));
+ return (DDI_FAILURE);
+ }
+
+ switch (cmd) {
+ case DDI_ATTACH:
+ con_log(CL_DLEVEL1, (CE_NOTE, "dr_sas: DDI_ATTACH"));
+ /* allocate the soft state for the instance */
+ if (ddi_soft_state_zalloc(drsas_state, instance_no)
+ != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas%d: Failed to allocate soft state",
+ instance_no));
+
+ return (DDI_FAILURE);
+ }
+
+ instance = (struct drsas_instance *)ddi_get_soft_state
+ (drsas_state, instance_no);
+
+ if (instance == NULL) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas%d: Bad soft state", instance_no));
+
+ ddi_soft_state_free(drsas_state, instance_no);
+
+ return (DDI_FAILURE);
+ }
+
+ bzero((caddr_t)instance,
+ sizeof (struct drsas_instance));
+
+ instance->func_ptr = kmem_zalloc(
+ sizeof (struct drsas_func_ptr), KM_SLEEP);
+ ASSERT(instance->func_ptr);
+
+ /* Setup the PCI configuration space handles */
+ if (pci_config_setup(dip, &instance->pci_handle) !=
+ DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas%d: pci config setup failed ",
+ instance_no));
+
+ kmem_free(instance->func_ptr,
+ sizeof (struct drsas_func_ptr));
+ ddi_soft_state_free(drsas_state, instance_no);
+
+ return (DDI_FAILURE);
+ }
+
+ if (ddi_dev_nregs(dip, &nregs) != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas: failed to get registers."));
+
+ pci_config_teardown(&instance->pci_handle);
+ kmem_free(instance->func_ptr,
+ sizeof (struct drsas_func_ptr));
+ ddi_soft_state_free(drsas_state, instance_no);
+
+ return (DDI_FAILURE);
+ }
+
+ vendor_id = pci_config_get16(instance->pci_handle,
+ PCI_CONF_VENID);
+ device_id = pci_config_get16(instance->pci_handle,
+ PCI_CONF_DEVID);
+
+ subsysvid = pci_config_get16(instance->pci_handle,
+ PCI_CONF_SUBVENID);
+ subsysid = pci_config_get16(instance->pci_handle,
+ PCI_CONF_SUBSYSID);
+
+ pci_config_put16(instance->pci_handle, PCI_CONF_COMM,
+ (pci_config_get16(instance->pci_handle,
+ PCI_CONF_COMM) | PCI_COMM_ME));
+ irq = pci_config_get8(instance->pci_handle,
+ PCI_CONF_ILINE);
+
+ con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: "
+ "0x%x:0x%x 0x%x:0x%x, irq:%d drv-ver:%s",
+ instance_no, vendor_id, device_id, subsysvid,
+ subsysid, irq, DRSAS_VERSION));
+
+ /* enable bus-mastering */
+ command = pci_config_get16(instance->pci_handle,
+ PCI_CONF_COMM);
+
+ if (!(command & PCI_COMM_ME)) {
+ command |= PCI_COMM_ME;
+
+ pci_config_put16(instance->pci_handle,
+ PCI_CONF_COMM, command);
+
+ con_log(CL_ANN, (CE_CONT, "dr_sas%d: "
+ "enable bus-mastering", instance_no));
+ } else {
+ con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: "
+ "bus-mastering already set", instance_no));
+ }
+
+ /* initialize function pointers */
+ if ((device_id == PCI_DEVICE_ID_LSI_2108VDE) ||
+ (device_id == PCI_DEVICE_ID_LSI_2108V)) {
+ con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: "
+ "2108V/DE detected", instance_no));
+ instance->func_ptr->read_fw_status_reg =
+ read_fw_status_reg_ppc;
+ instance->func_ptr->issue_cmd = issue_cmd_ppc;
+ instance->func_ptr->issue_cmd_in_sync_mode =
+ issue_cmd_in_sync_mode_ppc;
+ instance->func_ptr->issue_cmd_in_poll_mode =
+ issue_cmd_in_poll_mode_ppc;
+ instance->func_ptr->enable_intr =
+ enable_intr_ppc;
+ instance->func_ptr->disable_intr =
+ disable_intr_ppc;
+ instance->func_ptr->intr_ack = intr_ack_ppc;
+ } else {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas: Invalid device detected"));
+
+ pci_config_teardown(&instance->pci_handle);
+ kmem_free(instance->func_ptr,
+ sizeof (struct drsas_func_ptr));
+ ddi_soft_state_free(drsas_state, instance_no);
+
+ return (DDI_FAILURE);
+ }
+
+ instance->baseaddress = pci_config_get32(
+ instance->pci_handle, PCI_CONF_BASE0);
+ instance->baseaddress &= 0x0fffc;
+
+ instance->dip = dip;
+ instance->vendor_id = vendor_id;
+ instance->device_id = device_id;
+ instance->subsysvid = subsysvid;
+ instance->subsysid = subsysid;
+ instance->instance = instance_no;
+
+ /* Initialize FMA */
+ instance->fm_capabilities = ddi_prop_get_int(
+ DDI_DEV_T_ANY, instance->dip, DDI_PROP_DONTPASS,
+ "fm-capable", DDI_FM_EREPORT_CAPABLE |
+ DDI_FM_ACCCHK_CAPABLE | DDI_FM_DMACHK_CAPABLE
+ | DDI_FM_ERRCB_CAPABLE);
+
+ drsas_fm_init(instance);
+
+ /* Initialize Interrupts */
+ if ((ddi_dev_regsize(instance->dip,
+ REGISTER_SET_IO_2108, &reglength) != DDI_SUCCESS) ||
+ reglength < MINIMUM_MFI_MEM_SZ) {
+ return (DDI_FAILURE);
+ }
+ if (reglength > DEFAULT_MFI_MEM_SZ) {
+ reglength = DEFAULT_MFI_MEM_SZ;
+ con_log(CL_DLEVEL1, (CE_NOTE,
+ "dr_sas: register length to map is "
+ "0x%lx bytes", reglength));
+ }
+ if (ddi_regs_map_setup(instance->dip,
+ REGISTER_SET_IO_2108, &instance->regmap, 0,
+ reglength, &endian_attr, &instance->regmap_handle)
+ != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_NOTE,
+ "dr_sas: couldn't map control registers"));
+ goto fail_attach;
+ }
+
+ /*
+ * Disable Interrupt Now.
+ * Setup Software interrupt
+ */
+ instance->func_ptr->disable_intr(instance);
+
+ msi_enable = 0;
+ if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, 0,
+ "drsas-enable-msi", &data) == DDI_SUCCESS) {
+ if (strncmp(data, "yes", 3) == 0) {
+ msi_enable = 1;
+ con_log(CL_ANN, (CE_WARN,
+ "msi_enable = %d ENABLED",
+ msi_enable));
+ }
+ ddi_prop_free(data);
+ }
+
+ con_log(CL_DLEVEL1, (CE_WARN, "msi_enable = %d",
+ msi_enable));
+
+ /* Check for all supported interrupt types */
+ if (ddi_intr_get_supported_types(
+ dip, &intr_types) != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN,
+ "ddi_intr_get_supported_types() failed"));
+ goto fail_attach;
+ }
+
+ con_log(CL_DLEVEL1, (CE_NOTE,
+ "ddi_intr_get_supported_types() ret: 0x%x",
+ intr_types));
+
+ /* Initialize and Setup Interrupt handler */
+ if (msi_enable && (intr_types & DDI_INTR_TYPE_MSIX)) {
+ if (drsas_add_intrs(instance,
+ DDI_INTR_TYPE_MSIX) != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN,
+ "MSIX interrupt query failed"));
+ goto fail_attach;
+ }
+ instance->intr_type = DDI_INTR_TYPE_MSIX;
+ } else if (msi_enable && (intr_types &
+ DDI_INTR_TYPE_MSI)) {
+ if (drsas_add_intrs(instance,
+ DDI_INTR_TYPE_MSI) != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN,
+ "MSI interrupt query failed"));
+ goto fail_attach;
+ }
+ instance->intr_type = DDI_INTR_TYPE_MSI;
+ } else if (intr_types & DDI_INTR_TYPE_FIXED) {
+ msi_enable = 0;
+ if (drsas_add_intrs(instance,
+ DDI_INTR_TYPE_FIXED) != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN,
+ "FIXED interrupt query failed"));
+ goto fail_attach;
+ }
+ instance->intr_type = DDI_INTR_TYPE_FIXED;
+ } else {
+ con_log(CL_ANN, (CE_WARN, "Device cannot "
+ "suppport either FIXED or MSI/X "
+ "interrupts"));
+ goto fail_attach;
+ }
+
+ added_isr_f = 1;
+
+ /* setup the mfi based low level driver */
+ if (init_mfi(instance) != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN, "dr_sas: "
+ "could not initialize the low level driver"));
+
+ goto fail_attach;
+ }
+
+ /* Initialize all Mutex */
+ INIT_LIST_HEAD(&instance->completed_pool_list);
+ mutex_init(&instance->completed_pool_mtx,
+ "completed_pool_mtx", MUTEX_DRIVER,
+ DDI_INTR_PRI(instance->intr_pri));
+
+ mutex_init(&instance->int_cmd_mtx, "int_cmd_mtx",
+ MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri));
+ cv_init(&instance->int_cmd_cv, NULL, CV_DRIVER, NULL);
+
+ mutex_init(&instance->cmd_pool_mtx, "cmd_pool_mtx",
+ MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri));
+
+ /* Register our soft-isr for highlevel interrupts. */
+ instance->isr_level = instance->intr_pri;
+ if (instance->isr_level == HIGH_LEVEL_INTR) {
+ if (ddi_add_softintr(dip, DDI_SOFTINT_HIGH,
+ &instance->soft_intr_id, NULL, NULL,
+ drsas_softintr, (caddr_t)instance) !=
+ DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN,
+ " Software ISR did not register"));
+
+ goto fail_attach;
+ }
+
+ added_soft_isr_f = 1;
+ }
+
+ /* Allocate a transport structure */
+ tran = scsi_hba_tran_alloc(dip, SCSI_HBA_CANSLEEP);
+
+ if (tran == NULL) {
+ con_log(CL_ANN, (CE_WARN,
+ "scsi_hba_tran_alloc failed"));
+ goto fail_attach;
+ }
+
+ tran_alloc_f = 1;
+
+ instance->tran = tran;
+
+ tran->tran_hba_private = instance;
+ tran->tran_tgt_init = drsas_tran_tgt_init;
+ tran->tran_tgt_probe = scsi_hba_probe;
+ tran->tran_tgt_free = drsas_tran_tgt_free;
+ tran->tran_init_pkt = drsas_tran_init_pkt;
+ tran->tran_start = drsas_tran_start;
+ tran->tran_abort = drsas_tran_abort;
+ tran->tran_reset = drsas_tran_reset;
+ tran->tran_getcap = drsas_tran_getcap;
+ tran->tran_setcap = drsas_tran_setcap;
+ tran->tran_destroy_pkt = drsas_tran_destroy_pkt;
+ tran->tran_dmafree = drsas_tran_dmafree;
+ tran->tran_sync_pkt = drsas_tran_sync_pkt;
+ tran->tran_bus_config = drsas_tran_bus_config;
+
+ tran_dma_attr = drsas_generic_dma_attr;
+ tran_dma_attr.dma_attr_sgllen = instance->max_num_sge;
+
+ /* Attach this instance of the hba */
+ if (scsi_hba_attach_setup(dip, &tran_dma_attr, tran, 0)
+ != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN,
+ "scsi_hba_attach failed"));
+
+ goto fail_attach;
+ }
+
+ /* create devctl node for cfgadm command */
+ if (ddi_create_minor_node(dip, "devctl",
+ S_IFCHR, INST2DEVCTL(instance_no),
+ DDI_NT_SCSI_NEXUS, 0) == DDI_FAILURE) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas: failed to create devctl node."));
+
+ goto fail_attach;
+ }
+
+ create_devctl_node_f = 1;
+
+ /* create scsi node for cfgadm command */
+ if (ddi_create_minor_node(dip, "scsi", S_IFCHR,
+ INST2SCSI(instance_no),
+ DDI_NT_SCSI_ATTACHMENT_POINT, 0) ==
+ DDI_FAILURE) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas: failed to create scsi node."));
+
+ goto fail_attach;
+ }
+
+ create_scsi_node_f = 1;
+
+ (void) sprintf(instance->iocnode, "%d:lsirdctl",
+ instance_no);
+
+ /*
+ * Create a node for applications
+ * for issuing ioctl to the driver.
+ */
+ if (ddi_create_minor_node(dip, instance->iocnode,
+ S_IFCHR, INST2LSIRDCTL(instance_no),
+ DDI_PSEUDO, 0) == DDI_FAILURE) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas: failed to create ioctl node."));
+
+ goto fail_attach;
+ }
+
+ create_ioc_node_f = 1;
+
+ /* Create a taskq to handle dr events */
+ if ((instance->taskq = ddi_taskq_create(dip,
+ "drsas_dr_taskq", 1,
+ TASKQ_DEFAULTPRI, 0)) == NULL) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas: failed to create taskq "));
+ instance->taskq = NULL;
+ goto fail_attach;
+ }
+
+ /* enable interrupt */
+ instance->func_ptr->enable_intr(instance);
+
+ /* initiate AEN */
+ if (start_mfi_aen(instance)) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas: failed to initiate AEN."));
+ goto fail_initiate_aen;
+ }
+
+ con_log(CL_DLEVEL1, (CE_NOTE,
+ "AEN started for instance %d.", instance_no));
+
+ /* Finally! We are on the air. */
+ ddi_report_dev(dip);
+
+ if (drsas_check_acc_handle(instance->regmap_handle) !=
+ DDI_SUCCESS) {
+ goto fail_attach;
+ }
+ if (drsas_check_acc_handle(instance->pci_handle) !=
+ DDI_SUCCESS) {
+ goto fail_attach;
+ }
+ instance->dr_ld_list =
+ kmem_zalloc(MRDRV_MAX_LD * sizeof (struct drsas_ld),
+ KM_SLEEP);
+ break;
+ case DDI_PM_RESUME:
+ con_log(CL_ANN, (CE_NOTE,
+ "dr_sas: DDI_PM_RESUME"));
+ break;
+ case DDI_RESUME:
+ con_log(CL_ANN, (CE_NOTE,
+ "dr_sas: DDI_RESUME"));
+ break;
+ default:
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas: invalid attach cmd=%x", cmd));
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+
+fail_initiate_aen:
+fail_attach:
+ if (create_devctl_node_f) {
+ ddi_remove_minor_node(dip, "devctl");
+ }
+
+ if (create_scsi_node_f) {
+ ddi_remove_minor_node(dip, "scsi");
+ }
+
+ if (create_ioc_node_f) {
+ ddi_remove_minor_node(dip, instance->iocnode);
+ }
+
+ if (tran_alloc_f) {
+ scsi_hba_tran_free(tran);
+ }
+
+
+ if (added_soft_isr_f) {
+ ddi_remove_softintr(instance->soft_intr_id);
+ }
+
+ if (added_isr_f) {
+ drsas_rem_intrs(instance);
+ }
+
+ if (instance && instance->taskq) {
+ ddi_taskq_destroy(instance->taskq);
+ }
+
+ drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE);
+ ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST);
+
+ drsas_fm_fini(instance);
+
+ pci_config_teardown(&instance->pci_handle);
+
+ ddi_soft_state_free(drsas_state, instance_no);
+
+ con_log(CL_ANN, (CE_NOTE,
+ "dr_sas: return failure from drsas_attach"));
+
+ return (DDI_FAILURE);
+}
+
+/*ARGSUSED*/
+static int
+drsas_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
+{
+ int rval;
+ int drsas_minor = getminor((dev_t)arg);
+
+ struct drsas_instance *instance;
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ switch (cmd) {
+ case DDI_INFO_DEVT2DEVINFO:
+ instance = (struct drsas_instance *)
+ ddi_get_soft_state(drsas_state,
+ MINOR2INST(drsas_minor));
+
+ if (instance == NULL) {
+ *resultp = NULL;
+ rval = DDI_FAILURE;
+ } else {
+ *resultp = instance->dip;
+ rval = DDI_SUCCESS;
+ }
+ break;
+ case DDI_INFO_DEVT2INSTANCE:
+ *resultp = (void *)instance;
+ rval = DDI_SUCCESS;
+ break;
+ default:
+ *resultp = NULL;
+ rval = DDI_FAILURE;
+ }
+
+ return (rval);
+}
+
+static int
+drsas_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ int instance_no;
+
+ struct drsas_instance *instance;
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ /* CONSTCOND */
+ ASSERT(NO_COMPETING_THREADS);
+
+ instance_no = ddi_get_instance(dip);
+
+ instance = (struct drsas_instance *)ddi_get_soft_state(drsas_state,
+ instance_no);
+
+ if (!instance) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas:%d could not get instance in detach",
+ instance_no));
+
+ return (DDI_FAILURE);
+ }
+
+ con_log(CL_ANN, (CE_NOTE,
+ "dr_sas%d: detaching device 0x%4x:0x%4x:0x%4x:0x%4x",
+ instance_no, instance->vendor_id, instance->device_id,
+ instance->subsysvid, instance->subsysid));
+
+ switch (cmd) {
+ case DDI_DETACH:
+ con_log(CL_ANN, (CE_NOTE,
+ "drsas_detach: DDI_DETACH"));
+
+ if (scsi_hba_detach(dip) != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas:%d failed to detach",
+ instance_no));
+
+ return (DDI_FAILURE);
+ }
+
+ scsi_hba_tran_free(instance->tran);
+
+ flush_cache(instance);
+
+ if (abort_aen_cmd(instance, instance->aen_cmd)) {
+ con_log(CL_ANN, (CE_WARN, "drsas_detach: "
+ "failed to abort prevous AEN command"));
+
+ return (DDI_FAILURE);
+ }
+
+ instance->func_ptr->disable_intr(instance);
+
+ if (instance->isr_level == HIGH_LEVEL_INTR) {
+ ddi_remove_softintr(instance->soft_intr_id);
+ }
+
+ drsas_rem_intrs(instance);
+
+ if (instance->taskq) {
+ ddi_taskq_destroy(instance->taskq);
+ }
+ kmem_free(instance->dr_ld_list, MRDRV_MAX_LD
+ * sizeof (struct drsas_ld));
+ free_space_for_mfi(instance);
+
+ drsas_fm_fini(instance);
+
+ pci_config_teardown(&instance->pci_handle);
+
+ kmem_free(instance->func_ptr,
+ sizeof (struct drsas_func_ptr));
+
+ ddi_soft_state_free(drsas_state, instance_no);
+ break;
+ case DDI_PM_SUSPEND:
+ con_log(CL_ANN, (CE_NOTE,
+ "drsas_detach: DDI_PM_SUSPEND"));
+
+ break;
+ case DDI_SUSPEND:
+ con_log(CL_ANN, (CE_NOTE,
+ "drsas_detach: DDI_SUSPEND"));
+
+ break;
+ default:
+ con_log(CL_ANN, (CE_WARN,
+ "invalid detach command:0x%x", cmd));
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * ************************************************************************** *
+ * *
+ * common entry points - for character driver types *
+ * *
+ * ************************************************************************** *
+ */
+static int
+drsas_open(dev_t *dev, int openflags, int otyp, cred_t *credp)
+{
+ int rval = 0;
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ /* Check root permissions */
+ if (drv_priv(credp) != 0) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas: Non-root ioctl access denied!"));
+ return (EPERM);
+ }
+
+ /* Verify we are being opened as a character device */
+ if (otyp != OTYP_CHR) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas: ioctl node must be a char node"));
+ return (EINVAL);
+ }
+
+ if (ddi_get_soft_state(drsas_state, MINOR2INST(getminor(*dev)))
+ == NULL) {
+ return (ENXIO);
+ }
+
+ if (scsi_hba_open) {
+ rval = scsi_hba_open(dev, openflags, otyp, credp);
+ }
+
+ return (rval);
+}
+
+static int
+drsas_close(dev_t dev, int openflags, int otyp, cred_t *credp)
+{
+ int rval = 0;
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ /* no need for locks! */
+
+ if (scsi_hba_close) {
+ rval = scsi_hba_close(dev, openflags, otyp, credp);
+ }
+
+ return (rval);
+}
+
+static int
+drsas_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
+ int *rvalp)
+{
+ int rval = 0;
+
+ struct drsas_instance *instance;
+ struct drsas_ioctl *ioctl;
+ struct drsas_aen aen;
+ int i;
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ instance = ddi_get_soft_state(drsas_state, MINOR2INST(getminor(dev)));
+
+ if (instance == NULL) {
+ /* invalid minor number */
+ con_log(CL_ANN, (CE_WARN, "dr_sas: adapter not found."));
+ return (ENXIO);
+ }
+
+ ioctl = (struct drsas_ioctl *)kmem_zalloc(sizeof (struct drsas_ioctl),
+ KM_SLEEP);
+ ASSERT(ioctl);
+
+ switch ((uint_t)cmd) {
+ case DRSAS_IOCTL_FIRMWARE:
+ for (i = 0; i < sizeof (struct drsas_ioctl); i++) {
+ if (ddi_copyin((uint8_t *)arg+i,
+ (uint8_t *)ioctl+i, 1, mode)) {
+ con_log(CL_ANN, (CE_WARN, "drsas_ioctl "
+ "ERROR IOCTL copyin"));
+ kmem_free(ioctl,
+ sizeof (struct drsas_ioctl));
+ return (EFAULT);
+ }
+ }
+ if (ioctl->control_code == DRSAS_DRIVER_IOCTL_COMMON) {
+ rval = handle_drv_ioctl(instance, ioctl, mode);
+ } else {
+ rval = handle_mfi_ioctl(instance, ioctl, mode);
+ }
+ for (i = 0; i < sizeof (struct drsas_ioctl) - 1; i++) {
+ if (ddi_copyout((uint8_t *)ioctl+i,
+ (uint8_t *)arg+i, 1, mode)) {
+ con_log(CL_ANN, (CE_WARN,
+ "drsas_ioctl: ddi_copyout "
+ "failed"));
+ rval = 1;
+ break;
+ }
+ }
+
+ break;
+ case DRSAS_IOCTL_AEN:
+ for (i = 0; i < sizeof (struct drsas_aen); i++) {
+ if (ddi_copyin((uint8_t *)arg+i,
+ (uint8_t *)&aen+i, 1, mode)) {
+ con_log(CL_ANN, (CE_WARN,
+ "drsas_ioctl: "
+ "ERROR AEN copyin"));
+ kmem_free(ioctl,
+ sizeof (struct drsas_ioctl));
+ return (EFAULT);
+ }
+ }
+
+ rval = handle_mfi_aen(instance, &aen);
+ for (i = 0; i < sizeof (struct drsas_aen); i++) {
+ if (ddi_copyout((uint8_t *)&aen + i,
+ (uint8_t *)arg + i, 1, mode)) {
+ con_log(CL_ANN, (CE_WARN,
+ "drsas_ioctl: "
+ "ddi_copyout failed"));
+ rval = 1;
+ break;
+ }
+ }
+
+ break;
+ default:
+ rval = scsi_hba_ioctl(dev, cmd, arg,
+ mode, credp, rvalp);
+
+ con_log(CL_DLEVEL1, (CE_NOTE, "drsas_ioctl: "
+ "scsi_hba_ioctl called, ret = %x.", rval));
+ }
+
+ kmem_free(ioctl, sizeof (struct drsas_ioctl));
+ return (rval);
+}
+
+/*
+ * ************************************************************************** *
+ * *
+ * common entry points - for block driver types *
+ * *
+ * ************************************************************************** *
+ */
+/*ARGSUSED*/
+static int
+drsas_reset(dev_info_t *dip, ddi_reset_cmd_t cmd)
+{
+ int instance_no;
+
+ struct drsas_instance *instance;
+
+ instance_no = ddi_get_instance(dip);
+ instance = (struct drsas_instance *)ddi_get_soft_state
+ (drsas_state, instance_no);
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ if (!instance) {
+ con_log(CL_ANN, (CE_WARN, "dr_sas:%d could not get adapter "
+ "in reset", instance_no));
+ return (DDI_FAILURE);
+ }
+
+ instance->func_ptr->disable_intr(instance);
+
+ con_log(CL_ANN1, (CE_NOTE, "flushing cache for instance %d",
+ instance_no));
+
+ flush_cache(instance);
+
+ return (DDI_SUCCESS);
+}
+
+
+/*
+ * ************************************************************************** *
+ * *
+ * entry points (SCSI HBA) *
+ * *
+ * ************************************************************************** *
+ */
+/*ARGSUSED*/
+static int
+drsas_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip,
+ scsi_hba_tran_t *tran, struct scsi_device *sd)
+{
+ struct drsas_instance *instance;
+ uint16_t tgt = sd->sd_address.a_target;
+ uint8_t lun = sd->sd_address.a_lun;
+
+ con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init target %d lun %d",
+ tgt, lun));
+
+ instance = ADDR2MR(&sd->sd_address);
+
+ if (ndi_dev_is_persistent_node(tgt_dip) == 0) {
+ (void) ndi_merge_node(tgt_dip, drsas_name_node);
+ ddi_set_name_addr(tgt_dip, NULL);
+
+ con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init in "
+ "ndi_dev_is_persistent_node DDI_FAILURE t = %d l = %d",
+ tgt, lun));
+ return (DDI_FAILURE);
+ }
+
+ con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init dev_dip %p tgt_dip %p",
+ (void *)instance->dr_ld_list[tgt].dip, (void *)tgt_dip));
+
+ if (tgt < MRDRV_MAX_LD && lun == 0) {
+ if (instance->dr_ld_list[tgt].dip == NULL &&
+ strcmp(ddi_driver_name(sd->sd_dev), "sd") == 0) {
+ instance->dr_ld_list[tgt].dip = tgt_dip;
+ instance->dr_ld_list[tgt].lun_type = DRSAS_LD_LUN;
+ }
+ }
+ return (DDI_SUCCESS);
+}
+
+/*ARGSUSED*/
+static void
+drsas_tran_tgt_free(dev_info_t *hba_dip, dev_info_t *tgt_dip,
+ scsi_hba_tran_t *hba_tran, struct scsi_device *sd)
+{
+ struct drsas_instance *instance;
+ int tgt = sd->sd_address.a_target;
+ int lun = sd->sd_address.a_lun;
+
+ instance = ADDR2MR(&sd->sd_address);
+
+ con_log(CL_ANN1, (CE_NOTE, "tgt_free t = %d l = %d", tgt, lun));
+
+ if (tgt < MRDRV_MAX_LD && lun == 0) {
+ if (instance->dr_ld_list[tgt].dip == tgt_dip) {
+ instance->dr_ld_list[tgt].dip = NULL;
+ }
+ }
+}
+
+static dev_info_t *
+drsas_find_child(struct drsas_instance *instance, uint16_t tgt, uint8_t lun)
+{
+ dev_info_t *child = NULL;
+ char addr[SCSI_MAXNAMELEN];
+ char tmp[MAXNAMELEN];
+
+ (void) sprintf(addr, "%x,%x", tgt, lun);
+ for (child = ddi_get_child(instance->dip); child;
+ child = ddi_get_next_sibling(child)) {
+
+ if (drsas_name_node(child, tmp, MAXNAMELEN) !=
+ DDI_SUCCESS) {
+ continue;
+ }
+
+ if (strcmp(addr, tmp) == 0) {
+ break;
+ }
+ }
+ con_log(CL_ANN1, (CE_NOTE, "drsas_find_child: return child = %p",
+ (void *)child));
+ return (child);
+}
+
+static int
+drsas_name_node(dev_info_t *dip, char *name, int len)
+{
+ int tgt, lun;
+
+ tgt = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
+ DDI_PROP_DONTPASS, "target", -1);
+ con_log(CL_ANN1, (CE_NOTE,
+ "drsas_name_node: dip %p tgt %d", (void *)dip, tgt));
+ if (tgt == -1) {
+ return (DDI_FAILURE);
+ }
+ lun = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
+ "lun", -1);
+ con_log(CL_ANN1,
+ (CE_NOTE, "drsas_name_node: tgt %d lun %d", tgt, lun));
+ if (lun == -1) {
+ return (DDI_FAILURE);
+ }
+ (void) snprintf(name, len, "%x,%x", tgt, lun);
+ return (DDI_SUCCESS);
+}
+
+static struct scsi_pkt *
+drsas_tran_init_pkt(struct scsi_address *ap, register struct scsi_pkt *pkt,
+ struct buf *bp, int cmdlen, int statuslen, int tgtlen,
+ int flags, int (*callback)(), caddr_t arg)
+{
+ struct scsa_cmd *acmd;
+ struct drsas_instance *instance;
+ struct scsi_pkt *new_pkt;
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ instance = ADDR2MR(ap);
+
+ /* step #1 : pkt allocation */
+ if (pkt == NULL) {
+ pkt = scsi_hba_pkt_alloc(instance->dip, ap, cmdlen, statuslen,
+ tgtlen, sizeof (struct scsa_cmd), callback, arg);
+ if (pkt == NULL) {
+ return (NULL);
+ }
+
+ acmd = PKT2CMD(pkt);
+
+ /*
+ * Initialize the new pkt - we redundantly initialize
+ * all the fields for illustrative purposes.
+ */
+ acmd->cmd_pkt = pkt;
+ acmd->cmd_flags = 0;
+ acmd->cmd_scblen = statuslen;
+ acmd->cmd_cdblen = cmdlen;
+ acmd->cmd_dmahandle = NULL;
+ acmd->cmd_ncookies = 0;
+ acmd->cmd_cookie = 0;
+ acmd->cmd_cookiecnt = 0;
+ acmd->cmd_nwin = 0;
+
+ pkt->pkt_address = *ap;
+ pkt->pkt_comp = (void (*)())NULL;
+ pkt->pkt_flags = 0;
+ pkt->pkt_time = 0;
+ pkt->pkt_resid = 0;
+ pkt->pkt_state = 0;
+ pkt->pkt_statistics = 0;
+ pkt->pkt_reason = 0;
+ new_pkt = pkt;
+ } else {
+ acmd = PKT2CMD(pkt);
+ new_pkt = NULL;
+ }
+
+ /* step #2 : dma allocation/move */
+ if (bp && bp->b_bcount != 0) {
+ if (acmd->cmd_dmahandle == NULL) {
+ if (drsas_dma_alloc(instance, pkt, bp, flags,
+ callback) == DDI_FAILURE) {
+ if (new_pkt) {
+ scsi_hba_pkt_free(ap, new_pkt);
+ }
+ return ((struct scsi_pkt *)NULL);
+ }
+ } else {
+ if (drsas_dma_move(instance, pkt, bp) == DDI_FAILURE) {
+ return ((struct scsi_pkt *)NULL);
+ }
+ }
+ }
+
+ return (pkt);
+}
+
+static int
+drsas_tran_start(struct scsi_address *ap, register struct scsi_pkt *pkt)
+{
+ uchar_t cmd_done = 0;
+
+ struct drsas_instance *instance = ADDR2MR(ap);
+ struct drsas_cmd *cmd;
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d:SCSI CDB[0]=0x%x",
+ __func__, __LINE__, pkt->pkt_cdbp[0]));
+
+ pkt->pkt_reason = CMD_CMPLT;
+ *pkt->pkt_scbp = STATUS_GOOD; /* clear arq scsi_status */
+
+ cmd = build_cmd(instance, ap, pkt, &cmd_done);
+
+ /*
+ * Check if the command is already completed by the drsas_build_cmd()
+ * routine. In which case the busy_flag would be clear and scb will be
+ * NULL and appropriate reason provided in pkt_reason field
+ */
+ if (cmd_done) {
+ pkt->pkt_reason = CMD_CMPLT;
+ pkt->pkt_scbp[0] = STATUS_GOOD;
+ pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET
+ | STATE_SENT_CMD;
+ if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && pkt->pkt_comp) {
+ (*pkt->pkt_comp)(pkt);
+ }
+
+ return (TRAN_ACCEPT);
+ }
+
+ if (cmd == NULL) {
+ return (TRAN_BUSY);
+ }
+
+ if ((pkt->pkt_flags & FLAG_NOINTR) == 0) {
+ if (instance->fw_outstanding > instance->max_fw_cmds) {
+ con_log(CL_ANN, (CE_CONT, "dr_sas:Firmware busy"));
+ return_mfi_pkt(instance, cmd);
+ return (TRAN_BUSY);
+ }
+
+ /* Synchronize the Cmd frame for the controller */
+ (void) ddi_dma_sync(cmd->frame_dma_obj.dma_handle, 0, 0,
+ DDI_DMA_SYNC_FORDEV);
+
+ instance->func_ptr->issue_cmd(cmd, instance);
+
+ } else {
+ struct drsas_header *hdr = &cmd->frame->hdr;
+
+ cmd->sync_cmd = DRSAS_TRUE;
+
+ (void) instance->func_ptr->
+ issue_cmd_in_poll_mode(instance, cmd);
+
+ pkt->pkt_reason = CMD_CMPLT;
+ pkt->pkt_statistics = 0;
+ pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS;
+
+ switch (ddi_get8(cmd->frame_dma_obj.acc_handle,
+ &hdr->cmd_status)) {
+ case MFI_STAT_OK:
+ pkt->pkt_scbp[0] = STATUS_GOOD;
+ break;
+
+ case MFI_STAT_SCSI_DONE_WITH_ERROR:
+
+ pkt->pkt_reason = CMD_CMPLT;
+ pkt->pkt_statistics = 0;
+
+ ((struct scsi_status *)pkt->pkt_scbp)->sts_chk = 1;
+ break;
+
+ case MFI_STAT_DEVICE_NOT_FOUND:
+ pkt->pkt_reason = CMD_DEV_GONE;
+ pkt->pkt_statistics = STAT_DISCON;
+ break;
+
+ default:
+ ((struct scsi_status *)pkt->pkt_scbp)->sts_busy = 1;
+ }
+
+ return_mfi_pkt(instance, cmd);
+ (void) drsas_common_check(instance, cmd);
+
+ if (pkt->pkt_comp) {
+ (*pkt->pkt_comp)(pkt);
+ }
+
+ }
+
+ return (TRAN_ACCEPT);
+}
+
+/*ARGSUSED*/
+static int
+drsas_tran_abort(struct scsi_address *ap, struct scsi_pkt *pkt)
+{
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ /* abort command not supported by H/W */
+
+ return (DDI_FAILURE);
+}
+
+/*ARGSUSED*/
+static int
+drsas_tran_reset(struct scsi_address *ap, int level)
+{
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ /* reset command not supported by H/W */
+
+ return (DDI_FAILURE);
+
+}
+
+/*ARGSUSED*/
+static int
+drsas_tran_getcap(struct scsi_address *ap, char *cap, int whom)
+{
+ int rval = 0;
+
+ struct drsas_instance *instance = ADDR2MR(ap);
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ /* we do allow inquiring about capabilities for other targets */
+ if (cap == NULL) {
+ return (-1);
+ }
+
+ switch (scsi_hba_lookup_capstr(cap)) {
+ case SCSI_CAP_DMA_MAX:
+ /* Limit to 16MB max transfer */
+ rval = drsas_max_cap_maxxfer;
+ break;
+ case SCSI_CAP_MSG_OUT:
+ rval = 1;
+ break;
+ case SCSI_CAP_DISCONNECT:
+ rval = 0;
+ break;
+ case SCSI_CAP_SYNCHRONOUS:
+ rval = 0;
+ break;
+ case SCSI_CAP_WIDE_XFER:
+ rval = 1;
+ break;
+ case SCSI_CAP_TAGGED_QING:
+ rval = 1;
+ break;
+ case SCSI_CAP_UNTAGGED_QING:
+ rval = 1;
+ break;
+ case SCSI_CAP_PARITY:
+ rval = 1;
+ break;
+ case SCSI_CAP_INITIATOR_ID:
+ rval = instance->init_id;
+ break;
+ case SCSI_CAP_ARQ:
+ rval = 1;
+ break;
+ case SCSI_CAP_LINKED_CMDS:
+ rval = 0;
+ break;
+ case SCSI_CAP_RESET_NOTIFICATION:
+ rval = 1;
+ break;
+ case SCSI_CAP_GEOMETRY:
+ rval = -1;
+
+ break;
+ default:
+ con_log(CL_DLEVEL2, (CE_NOTE, "Default cap coming 0x%x",
+ scsi_hba_lookup_capstr(cap)));
+ rval = -1;
+ break;
+ }
+
+ return (rval);
+}
+
+/*ARGSUSED*/
+static int
+drsas_tran_setcap(struct scsi_address *ap, char *cap, int value, int whom)
+{
+ int rval = 1;
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ /* We don't allow setting capabilities for other targets */
+ if (cap == NULL || whom == 0) {
+ return (-1);
+ }
+
+ switch (scsi_hba_lookup_capstr(cap)) {
+ case SCSI_CAP_DMA_MAX:
+ case SCSI_CAP_MSG_OUT:
+ case SCSI_CAP_PARITY:
+ case SCSI_CAP_LINKED_CMDS:
+ case SCSI_CAP_RESET_NOTIFICATION:
+ case SCSI_CAP_DISCONNECT:
+ case SCSI_CAP_SYNCHRONOUS:
+ case SCSI_CAP_UNTAGGED_QING:
+ case SCSI_CAP_WIDE_XFER:
+ case SCSI_CAP_INITIATOR_ID:
+ case SCSI_CAP_ARQ:
+ /*
+ * None of these are settable via
+ * the capability interface.
+ */
+ break;
+ case SCSI_CAP_TAGGED_QING:
+ rval = 1;
+ break;
+ case SCSI_CAP_SECTOR_SIZE:
+ rval = 1;
+ break;
+
+ case SCSI_CAP_TOTAL_SECTORS:
+ rval = 1;
+ break;
+ default:
+ rval = -1;
+ break;
+ }
+
+ return (rval);
+}
+
+static void
+drsas_tran_destroy_pkt(struct scsi_address *ap, struct scsi_pkt *pkt)
+{
+ struct scsa_cmd *acmd = PKT2CMD(pkt);
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ if (acmd->cmd_flags & CFLAG_DMAVALID) {
+ acmd->cmd_flags &= ~CFLAG_DMAVALID;
+
+ (void) ddi_dma_unbind_handle(acmd->cmd_dmahandle);
+
+ ddi_dma_free_handle(&acmd->cmd_dmahandle);
+
+ acmd->cmd_dmahandle = NULL;
+ }
+
+ /* free the pkt */
+ scsi_hba_pkt_free(ap, pkt);
+}
+
+/*ARGSUSED*/
+static void
+drsas_tran_dmafree(struct scsi_address *ap, struct scsi_pkt *pkt)
+{
+ register struct scsa_cmd *acmd = PKT2CMD(pkt);
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ if (acmd->cmd_flags & CFLAG_DMAVALID) {
+ acmd->cmd_flags &= ~CFLAG_DMAVALID;
+
+ (void) ddi_dma_unbind_handle(acmd->cmd_dmahandle);
+
+ ddi_dma_free_handle(&acmd->cmd_dmahandle);
+
+ acmd->cmd_dmahandle = NULL;
+ }
+}
+
+/*ARGSUSED*/
+static void
+drsas_tran_sync_pkt(struct scsi_address *ap, struct scsi_pkt *pkt)
+{
+ register struct scsa_cmd *acmd = PKT2CMD(pkt);
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ if (acmd->cmd_flags & CFLAG_DMAVALID) {
+ (void) ddi_dma_sync(acmd->cmd_dmahandle, acmd->cmd_dma_offset,
+ acmd->cmd_dma_len, (acmd->cmd_flags & CFLAG_DMASEND) ?
+ DDI_DMA_SYNC_FORDEV : DDI_DMA_SYNC_FORCPU);
+ }
+}
+
+/*
+ * drsas_isr(caddr_t)
+ *
+ * The Interrupt Service Routine
+ *
+ * Collect status for all completed commands and do callback
+ *
+ */
+static uint_t
+drsas_isr(struct drsas_instance *instance)
+{
+ int need_softintr;
+ uint32_t producer;
+ uint32_t consumer;
+ uint32_t context;
+
+ struct drsas_cmd *cmd;
+
+ con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+ ASSERT(instance);
+ if ((instance->intr_type == DDI_INTR_TYPE_FIXED) &&
+ !instance->func_ptr->intr_ack(instance)) {
+ return (DDI_INTR_UNCLAIMED);
+ }
+
+ (void) ddi_dma_sync(instance->mfi_internal_dma_obj.dma_handle,
+ 0, 0, DDI_DMA_SYNC_FORCPU);
+
+ if (drsas_check_dma_handle(instance->mfi_internal_dma_obj.dma_handle)
+ != DDI_SUCCESS) {
+ drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE);
+ ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST);
+ return (DDI_INTR_UNCLAIMED);
+ }
+
+ producer = ddi_get32(instance->mfi_internal_dma_obj.acc_handle,
+ instance->producer);
+ consumer = ddi_get32(instance->mfi_internal_dma_obj.acc_handle,
+ instance->consumer);
+
+ con_log(CL_ANN1, (CE_CONT, " producer %x consumer %x ",
+ producer, consumer));
+ if (producer == consumer) {
+ con_log(CL_ANN1, (CE_WARN, "producer = consumer case"));
+ return (DDI_INTR_UNCLAIMED);
+ }
+ mutex_enter(&instance->completed_pool_mtx);
+
+ while (consumer != producer) {
+ context = ddi_get32(instance->mfi_internal_dma_obj.acc_handle,
+ &instance->reply_queue[consumer]);
+ cmd = instance->cmd_list[context];
+ mlist_add_tail(&cmd->list, &instance->completed_pool_list);
+
+ consumer++;
+ if (consumer == (instance->max_fw_cmds + 1)) {
+ consumer = 0;
+ }
+ }
+
+ mutex_exit(&instance->completed_pool_mtx);
+
+ ddi_put32(instance->mfi_internal_dma_obj.acc_handle,
+ instance->consumer, consumer);
+ (void) ddi_dma_sync(instance->mfi_internal_dma_obj.dma_handle,
+ 0, 0, DDI_DMA_SYNC_FORDEV);
+
+ if (instance->softint_running) {
+ need_softintr = 0;
+ } else {
+ need_softintr = 1;
+ }
+
+ if (instance->isr_level == HIGH_LEVEL_INTR) {
+ if (need_softintr) {
+ ddi_trigger_softintr(instance->soft_intr_id);
+ }
+ } else {
+ /*
+ * Not a high-level interrupt, therefore call the soft level
+ * interrupt explicitly
+ */
+ (void) drsas_softintr(instance);
+ }
+
+ return (DDI_INTR_CLAIMED);
+}
+
+
+/*
+ * ************************************************************************** *
+ * *
+ * libraries *
+ * *
+ * ************************************************************************** *
+ */
+/*
+ * get_mfi_pkt : Get a command from the free pool
+ * After successful allocation, the caller of this routine
+ * must clear the frame buffer (memset to zero) before
+ * using the packet further.
+ *
+ * ***** Note *****
+ * After clearing the frame buffer the context id of the
+ * frame buffer SHOULD be restored back.
+ */
+static struct drsas_cmd *
+get_mfi_pkt(struct drsas_instance *instance)
+{
+ mlist_t *head = &instance->cmd_pool_list;
+ struct drsas_cmd *cmd = NULL;
+
+ mutex_enter(&instance->cmd_pool_mtx);
+ ASSERT(mutex_owned(&instance->cmd_pool_mtx));
+
+ if (!mlist_empty(head)) {
+ cmd = mlist_entry(head->next, struct drsas_cmd, list);
+ mlist_del_init(head->next);
+ }
+ if (cmd != NULL)
+ cmd->pkt = NULL;
+ mutex_exit(&instance->cmd_pool_mtx);
+
+ return (cmd);
+}
+
+/*
+ * return_mfi_pkt : Return a cmd to free command pool
+ */
+static void
+return_mfi_pkt(struct drsas_instance *instance, struct drsas_cmd *cmd)
+{
+ mutex_enter(&instance->cmd_pool_mtx);
+ ASSERT(mutex_owned(&instance->cmd_pool_mtx));
+
+ mlist_add(&cmd->list, &instance->cmd_pool_list);
+
+ mutex_exit(&instance->cmd_pool_mtx);
+}
+
+/*
+ * destroy_mfi_frame_pool
+ */
+static void
+destroy_mfi_frame_pool(struct drsas_instance *instance)
+{
+ int i;
+ uint32_t max_cmd = instance->max_fw_cmds;
+
+ struct drsas_cmd *cmd;
+
+ /* return all frames to pool */
+ for (i = 0; i < max_cmd+1; i++) {
+
+ cmd = instance->cmd_list[i];
+
+ if (cmd->frame_dma_obj_status == DMA_OBJ_ALLOCATED)
+ (void) drsas_free_dma_obj(instance, cmd->frame_dma_obj);
+
+ cmd->frame_dma_obj_status = DMA_OBJ_FREED;
+ }
+
+}
+
+/*
+ * create_mfi_frame_pool
+ */
+static int
+create_mfi_frame_pool(struct drsas_instance *instance)
+{
+ int i = 0;
+ int cookie_cnt;
+ uint16_t max_cmd;
+ uint16_t sge_sz;
+ uint32_t sgl_sz;
+ uint32_t tot_frame_size;
+
+ struct drsas_cmd *cmd;
+
+ max_cmd = instance->max_fw_cmds;
+
+ sge_sz = sizeof (struct drsas_sge64);
+
+ /* calculated the number of 64byte frames required for SGL */
+ sgl_sz = sge_sz * instance->max_num_sge;
+ tot_frame_size = sgl_sz + MRMFI_FRAME_SIZE + SENSE_LENGTH;
+
+ con_log(CL_DLEVEL3, (CE_NOTE, "create_mfi_frame_pool: "
+ "sgl_sz %x tot_frame_size %x", sgl_sz, tot_frame_size));
+
+ while (i < max_cmd+1) {
+ cmd = instance->cmd_list[i];
+
+ cmd->frame_dma_obj.size = tot_frame_size;
+ cmd->frame_dma_obj.dma_attr = drsas_generic_dma_attr;
+ cmd->frame_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+ cmd->frame_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU;
+ cmd->frame_dma_obj.dma_attr.dma_attr_sgllen = 1;
+ cmd->frame_dma_obj.dma_attr.dma_attr_align = 64;
+
+
+ cookie_cnt = drsas_alloc_dma_obj(instance, &cmd->frame_dma_obj,
+ (uchar_t)DDI_STRUCTURE_LE_ACC);
+
+ if (cookie_cnt == -1 || cookie_cnt > 1) {
+ con_log(CL_ANN, (CE_WARN,
+ "create_mfi_frame_pool: could not alloc."));
+ return (DDI_FAILURE);
+ }
+
+ bzero(cmd->frame_dma_obj.buffer, tot_frame_size);
+
+ cmd->frame_dma_obj_status = DMA_OBJ_ALLOCATED;
+ cmd->frame = (union drsas_frame *)cmd->frame_dma_obj.buffer;
+ cmd->frame_phys_addr =
+ cmd->frame_dma_obj.dma_cookie[0].dmac_address;
+
+ cmd->sense = (uint8_t *)(((unsigned long)
+ cmd->frame_dma_obj.buffer) +
+ tot_frame_size - SENSE_LENGTH);
+ cmd->sense_phys_addr =
+ cmd->frame_dma_obj.dma_cookie[0].dmac_address +
+ tot_frame_size - SENSE_LENGTH;
+
+ if (!cmd->frame || !cmd->sense) {
+ con_log(CL_ANN, (CE_NOTE,
+ "dr_sas: pci_pool_alloc failed"));
+
+ return (ENOMEM);
+ }
+
+ ddi_put32(cmd->frame_dma_obj.acc_handle,
+ &cmd->frame->io.context, cmd->index);
+ i++;
+
+ con_log(CL_DLEVEL3, (CE_NOTE, "[%x]-%x",
+ cmd->index, cmd->frame_phys_addr));
+ }
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * free_additional_dma_buffer
+ */
+static void
+free_additional_dma_buffer(struct drsas_instance *instance)
+{
+ if (instance->mfi_internal_dma_obj.status == DMA_OBJ_ALLOCATED) {
+ (void) drsas_free_dma_obj(instance,
+ instance->mfi_internal_dma_obj);
+ instance->mfi_internal_dma_obj.status = DMA_OBJ_FREED;
+ }
+
+ if (instance->mfi_evt_detail_obj.status == DMA_OBJ_ALLOCATED) {
+ (void) drsas_free_dma_obj(instance,
+ instance->mfi_evt_detail_obj);
+ instance->mfi_evt_detail_obj.status = DMA_OBJ_FREED;
+ }
+}
+
+/*
+ * alloc_additional_dma_buffer
+ */
+static int
+alloc_additional_dma_buffer(struct drsas_instance *instance)
+{
+ uint32_t reply_q_sz;
+ uint32_t internal_buf_size = PAGESIZE*2;
+
+ /* max cmds plus 1 + producer & consumer */
+ reply_q_sz = sizeof (uint32_t) * (instance->max_fw_cmds + 1 + 2);
+
+ instance->mfi_internal_dma_obj.size = internal_buf_size;
+ instance->mfi_internal_dma_obj.dma_attr = drsas_generic_dma_attr;
+ instance->mfi_internal_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+ instance->mfi_internal_dma_obj.dma_attr.dma_attr_count_max =
+ 0xFFFFFFFFU;
+ instance->mfi_internal_dma_obj.dma_attr.dma_attr_sgllen = 1;
+
+ if (drsas_alloc_dma_obj(instance, &instance->mfi_internal_dma_obj,
+ (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas: could not alloc reply queue"));
+ return (DDI_FAILURE);
+ }
+
+ bzero(instance->mfi_internal_dma_obj.buffer, internal_buf_size);
+
+ instance->mfi_internal_dma_obj.status |= DMA_OBJ_ALLOCATED;
+
+ instance->producer = (uint32_t *)((unsigned long)
+ instance->mfi_internal_dma_obj.buffer);
+ instance->consumer = (uint32_t *)((unsigned long)
+ instance->mfi_internal_dma_obj.buffer + 4);
+ instance->reply_queue = (uint32_t *)((unsigned long)
+ instance->mfi_internal_dma_obj.buffer + 8);
+ instance->internal_buf = (caddr_t)(((unsigned long)
+ instance->mfi_internal_dma_obj.buffer) + reply_q_sz + 8);
+ instance->internal_buf_dmac_add =
+ instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address +
+ (reply_q_sz + 8);
+ instance->internal_buf_size = internal_buf_size -
+ (reply_q_sz + 8);
+
+ /* allocate evt_detail */
+ instance->mfi_evt_detail_obj.size = sizeof (struct drsas_evt_detail);
+ instance->mfi_evt_detail_obj.dma_attr = drsas_generic_dma_attr;
+ instance->mfi_evt_detail_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+ instance->mfi_evt_detail_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU;
+ instance->mfi_evt_detail_obj.dma_attr.dma_attr_sgllen = 1;
+ instance->mfi_evt_detail_obj.dma_attr.dma_attr_align = 1;
+
+ if (drsas_alloc_dma_obj(instance, &instance->mfi_evt_detail_obj,
+ (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+ con_log(CL_ANN, (CE_WARN, "alloc_additional_dma_buffer: "
+ "could not allocate data transfer buffer."));
+ return (DDI_FAILURE);
+ }
+
+ bzero(instance->mfi_evt_detail_obj.buffer,
+ sizeof (struct drsas_evt_detail));
+
+ instance->mfi_evt_detail_obj.status |= DMA_OBJ_ALLOCATED;
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * free_space_for_mfi
+ */
+static void
+free_space_for_mfi(struct drsas_instance *instance)
+{
+ int i;
+ uint32_t max_cmd = instance->max_fw_cmds;
+
+ /* already freed */
+ if (instance->cmd_list == NULL) {
+ return;
+ }
+
+ free_additional_dma_buffer(instance);
+
+ /* first free the MFI frame pool */
+ destroy_mfi_frame_pool(instance);
+
+ /* free all the commands in the cmd_list */
+ for (i = 0; i < instance->max_fw_cmds+1; i++) {
+ kmem_free(instance->cmd_list[i],
+ sizeof (struct drsas_cmd));
+
+ instance->cmd_list[i] = NULL;
+ }
+
+ /* free the cmd_list buffer itself */
+ kmem_free(instance->cmd_list,
+ sizeof (struct drsas_cmd *) * (max_cmd+1));
+
+ instance->cmd_list = NULL;
+
+ INIT_LIST_HEAD(&instance->cmd_pool_list);
+}
+
+/*
+ * alloc_space_for_mfi
+ */
+static int
+alloc_space_for_mfi(struct drsas_instance *instance)
+{
+ int i;
+ uint32_t max_cmd;
+ size_t sz;
+
+ struct drsas_cmd *cmd;
+
+ max_cmd = instance->max_fw_cmds;
+
+ /* reserve 1 more slot for flush_cache */
+ sz = sizeof (struct drsas_cmd *) * (max_cmd+1);
+
+ /*
+ * instance->cmd_list is an array of struct drsas_cmd pointers.
+ * Allocate the dynamic array first and then allocate individual
+ * commands.
+ */
+ instance->cmd_list = kmem_zalloc(sz, KM_SLEEP);
+ ASSERT(instance->cmd_list);
+
+ for (i = 0; i < max_cmd+1; i++) {
+ instance->cmd_list[i] = kmem_zalloc(sizeof (struct drsas_cmd),
+ KM_SLEEP);
+ ASSERT(instance->cmd_list[i]);
+ }
+
+ INIT_LIST_HEAD(&instance->cmd_pool_list);
+
+ /* add all the commands to command pool (instance->cmd_pool) */
+ for (i = 0; i < max_cmd; i++) {
+ cmd = instance->cmd_list[i];
+ cmd->index = i;
+
+ mlist_add_tail(&cmd->list, &instance->cmd_pool_list);
+ }
+
+ /* single slot for flush_cache won't be added in command pool */
+ cmd = instance->cmd_list[max_cmd];
+ cmd->index = i;
+
+ /* create a frame pool and assign one frame to each cmd */
+ if (create_mfi_frame_pool(instance)) {
+ con_log(CL_ANN, (CE_NOTE, "error creating frame DMA pool"));
+ return (DDI_FAILURE);
+ }
+
+ /* create a frame pool and assign one frame to each cmd */
+ if (alloc_additional_dma_buffer(instance)) {
+ con_log(CL_ANN, (CE_NOTE, "error creating frame DMA pool"));
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * get_ctrl_info
+ */
+static int
+get_ctrl_info(struct drsas_instance *instance,
+ struct drsas_ctrl_info *ctrl_info)
+{
+ int ret = 0;
+
+ struct drsas_cmd *cmd;
+ struct drsas_dcmd_frame *dcmd;
+ struct drsas_ctrl_info *ci;
+
+ cmd = get_mfi_pkt(instance);
+
+ if (!cmd) {
+ con_log(CL_ANN, (CE_WARN,
+ "Failed to get a cmd for ctrl info"));
+ return (DDI_FAILURE);
+ }
+ /* Clear the frame buffer and assign back the context id */
+ (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame));
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context,
+ cmd->index);
+
+ dcmd = &cmd->frame->dcmd;
+
+ ci = (struct drsas_ctrl_info *)instance->internal_buf;
+
+ if (!ci) {
+ con_log(CL_ANN, (CE_WARN,
+ "Failed to alloc mem for ctrl info"));
+ return_mfi_pkt(instance, cmd);
+ return (DDI_FAILURE);
+ }
+
+ (void) memset(ci, 0, sizeof (struct drsas_ctrl_info));
+
+ /* for( i = 0; i < DCMD_MBOX_SZ; i++ ) dcmd->mbox.b[i] = 0; */
+ (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ);
+
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD);
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status,
+ MFI_CMD_STATUS_POLL_MODE);
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1);
+ ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags,
+ MFI_FRAME_DIR_READ);
+ ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0);
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len,
+ sizeof (struct drsas_ctrl_info));
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode,
+ DR_DCMD_CTRL_GET_INFO);
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr,
+ instance->internal_buf_dmac_add);
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length,
+ sizeof (struct drsas_ctrl_info));
+
+ cmd->frame_count = 1;
+
+ if (!instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) {
+ ret = 0;
+ ddi_rep_get8(cmd->frame_dma_obj.acc_handle,
+ (uint8_t *)ctrl_info, (uint8_t *)ci,
+ sizeof (struct drsas_ctrl_info), DDI_DEV_AUTOINCR);
+ } else {
+ con_log(CL_ANN, (CE_WARN, "get_ctrl_info: Ctrl info failed"));
+ ret = -1;
+ }
+
+ return_mfi_pkt(instance, cmd);
+ if (drsas_common_check(instance, cmd) != DDI_SUCCESS) {
+ ret = -1;
+ }
+
+ return (ret);
+}
+
+/*
+ * abort_aen_cmd
+ */
+static int
+abort_aen_cmd(struct drsas_instance *instance,
+ struct drsas_cmd *cmd_to_abort)
+{
+ int ret = 0;
+
+ struct drsas_cmd *cmd;
+ struct drsas_abort_frame *abort_fr;
+
+ cmd = get_mfi_pkt(instance);
+
+ if (!cmd) {
+ con_log(CL_ANN, (CE_WARN,
+ "Failed to get a cmd for ctrl info"));
+ return (DDI_FAILURE);
+ }
+ /* Clear the frame buffer and assign back the context id */
+ (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame));
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context,
+ cmd->index);
+
+ abort_fr = &cmd->frame->abort;
+
+ /* prepare and issue the abort frame */
+ ddi_put8(cmd->frame_dma_obj.acc_handle,
+ &abort_fr->cmd, MFI_CMD_OP_ABORT);
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &abort_fr->cmd_status,
+ MFI_CMD_STATUS_SYNC_MODE);
+ ddi_put16(cmd->frame_dma_obj.acc_handle, &abort_fr->flags, 0);
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &abort_fr->abort_context,
+ cmd_to_abort->index);
+ ddi_put32(cmd->frame_dma_obj.acc_handle,
+ &abort_fr->abort_mfi_phys_addr_lo, cmd_to_abort->frame_phys_addr);
+ ddi_put32(cmd->frame_dma_obj.acc_handle,
+ &abort_fr->abort_mfi_phys_addr_hi, 0);
+
+ instance->aen_cmd->abort_aen = 1;
+
+ cmd->sync_cmd = DRSAS_TRUE;
+ cmd->frame_count = 1;
+
+ if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) {
+ con_log(CL_ANN, (CE_WARN,
+ "abort_aen_cmd: issue_cmd_in_sync_mode failed"));
+ ret = -1;
+ } else {
+ ret = 0;
+ }
+
+ instance->aen_cmd->abort_aen = 1;
+ instance->aen_cmd = 0;
+
+ return_mfi_pkt(instance, cmd);
+ (void) drsas_common_check(instance, cmd);
+
+ return (ret);
+}
+
+/*
+ * init_mfi
+ */
+static int
+init_mfi(struct drsas_instance *instance)
+{
+ struct drsas_cmd *cmd;
+ struct drsas_ctrl_info ctrl_info;
+ struct drsas_init_frame *init_frame;
+ struct drsas_init_queue_info *initq_info;
+
+ /* we expect the FW state to be READY */
+ if (mfi_state_transition_to_ready(instance)) {
+ con_log(CL_ANN, (CE_WARN, "dr_sas: F/W is not ready"));
+ goto fail_ready_state;
+ }
+
+ /* get various operational parameters from status register */
+ instance->max_num_sge =
+ (instance->func_ptr->read_fw_status_reg(instance) &
+ 0xFF0000) >> 0x10;
+ /*
+ * Reduce the max supported cmds by 1. This is to ensure that the
+ * reply_q_sz (1 more than the max cmd that driver may send)
+ * does not exceed max cmds that the FW can support
+ */
+ instance->max_fw_cmds =
+ instance->func_ptr->read_fw_status_reg(instance) & 0xFFFF;
+ instance->max_fw_cmds = instance->max_fw_cmds - 1;
+
+ instance->max_num_sge =
+ (instance->max_num_sge > DRSAS_MAX_SGE_CNT) ?
+ DRSAS_MAX_SGE_CNT : instance->max_num_sge;
+
+ /* create a pool of commands */
+ if (alloc_space_for_mfi(instance) != DDI_SUCCESS)
+ goto fail_alloc_fw_space;
+
+ /*
+ * Prepare a init frame. Note the init frame points to queue info
+ * structure. Each frame has SGL allocated after first 64 bytes. For
+ * this frame - since we don't need any SGL - we use SGL's space as
+ * queue info structure
+ */
+ cmd = get_mfi_pkt(instance);
+ /* Clear the frame buffer and assign back the context id */
+ (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame));
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context,
+ cmd->index);
+
+ init_frame = (struct drsas_init_frame *)cmd->frame;
+ initq_info = (struct drsas_init_queue_info *)
+ ((unsigned long)init_frame + 64);
+
+ (void) memset(init_frame, 0, MRMFI_FRAME_SIZE);
+ (void) memset(initq_info, 0, sizeof (struct drsas_init_queue_info));
+
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &initq_info->init_flags, 0);
+
+ ddi_put32(cmd->frame_dma_obj.acc_handle,
+ &initq_info->reply_queue_entries, instance->max_fw_cmds + 1);
+
+ ddi_put32(cmd->frame_dma_obj.acc_handle,
+ &initq_info->producer_index_phys_addr_hi, 0);
+ ddi_put32(cmd->frame_dma_obj.acc_handle,
+ &initq_info->producer_index_phys_addr_lo,
+ instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address);
+
+ ddi_put32(cmd->frame_dma_obj.acc_handle,
+ &initq_info->consumer_index_phys_addr_hi, 0);
+ ddi_put32(cmd->frame_dma_obj.acc_handle,
+ &initq_info->consumer_index_phys_addr_lo,
+ instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + 4);
+
+ ddi_put32(cmd->frame_dma_obj.acc_handle,
+ &initq_info->reply_queue_start_phys_addr_hi, 0);
+ ddi_put32(cmd->frame_dma_obj.acc_handle,
+ &initq_info->reply_queue_start_phys_addr_lo,
+ instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + 8);
+
+ ddi_put8(cmd->frame_dma_obj.acc_handle,
+ &init_frame->cmd, MFI_CMD_OP_INIT);
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &init_frame->cmd_status,
+ MFI_CMD_STATUS_POLL_MODE);
+ ddi_put16(cmd->frame_dma_obj.acc_handle, &init_frame->flags, 0);
+ ddi_put32(cmd->frame_dma_obj.acc_handle,
+ &init_frame->queue_info_new_phys_addr_lo,
+ cmd->frame_phys_addr + 64);
+ ddi_put32(cmd->frame_dma_obj.acc_handle,
+ &init_frame->queue_info_new_phys_addr_hi, 0);
+
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &init_frame->data_xfer_len,
+ sizeof (struct drsas_init_queue_info));
+
+ cmd->frame_count = 1;
+
+ /* issue the init frame in polled mode */
+ if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) {
+ con_log(CL_ANN, (CE_WARN, "failed to init firmware"));
+ goto fail_fw_init;
+ }
+
+ return_mfi_pkt(instance, cmd);
+ if (drsas_common_check(instance, cmd) != DDI_SUCCESS) {
+ goto fail_fw_init;
+ }
+
+ /* gather misc FW related information */
+ if (!get_ctrl_info(instance, &ctrl_info)) {
+ instance->max_sectors_per_req = ctrl_info.max_request_size;
+ con_log(CL_ANN1, (CE_NOTE, "product name %s ld present %d",
+ ctrl_info.product_name, ctrl_info.ld_present_count));
+ } else {
+ instance->max_sectors_per_req = instance->max_num_sge *
+ PAGESIZE / 512;
+ }
+
+ if (drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) {
+ goto fail_fw_init;
+ }
+
+ return (DDI_SUCCESS);
+
+fail_fw_init:
+fail_alloc_fw_space:
+
+ free_space_for_mfi(instance);
+
+fail_ready_state:
+ ddi_regs_map_free(&instance->regmap_handle);
+
+fail_mfi_reg_setup:
+ return (DDI_FAILURE);
+}
+
+/*
+ * mfi_state_transition_to_ready : Move the FW to READY state
+ *
+ * @reg_set : MFI register set
+ */
+static int
+mfi_state_transition_to_ready(struct drsas_instance *instance)
+{
+ int i;
+ uint8_t max_wait;
+ uint32_t fw_ctrl;
+ uint32_t fw_state;
+ uint32_t cur_state;
+
+ fw_state =
+ instance->func_ptr->read_fw_status_reg(instance) & MFI_STATE_MASK;
+ con_log(CL_ANN1, (CE_NOTE,
+ "mfi_state_transition_to_ready:FW state = 0x%x", fw_state));
+
+ while (fw_state != MFI_STATE_READY) {
+ con_log(CL_ANN, (CE_NOTE,
+ "mfi_state_transition_to_ready:FW state%x", fw_state));
+
+ switch (fw_state) {
+ case MFI_STATE_FAULT:
+ con_log(CL_ANN, (CE_NOTE,
+ "dr_sas: FW in FAULT state!!"));
+
+ return (ENODEV);
+ case MFI_STATE_WAIT_HANDSHAKE:
+ /* set the CLR bit in IMR0 */
+ con_log(CL_ANN, (CE_NOTE,
+ "dr_sas: FW waiting for HANDSHAKE"));
+ /*
+ * PCI_Hot Plug: MFI F/W requires
+ * (MFI_INIT_CLEAR_HANDSHAKE|MFI_INIT_HOTPLUG)
+ * to be set
+ */
+ /* WR_IB_MSG_0(MFI_INIT_CLEAR_HANDSHAKE, instance); */
+ WR_IB_DOORBELL(MFI_INIT_CLEAR_HANDSHAKE |
+ MFI_INIT_HOTPLUG, instance);
+
+ max_wait = 2;
+ cur_state = MFI_STATE_WAIT_HANDSHAKE;
+ break;
+ case MFI_STATE_BOOT_MESSAGE_PENDING:
+ /* set the CLR bit in IMR0 */
+ con_log(CL_ANN, (CE_NOTE,
+ "dr_sas: FW state boot message pending"));
+ /*
+ * PCI_Hot Plug: MFI F/W requires
+ * (MFI_INIT_CLEAR_HANDSHAKE|MFI_INIT_HOTPLUG)
+ * to be set
+ */
+ WR_IB_DOORBELL(MFI_INIT_HOTPLUG, instance);
+
+ max_wait = 10;
+ cur_state = MFI_STATE_BOOT_MESSAGE_PENDING;
+ break;
+ case MFI_STATE_OPERATIONAL:
+ /* bring it to READY state; assuming max wait 2 secs */
+ instance->func_ptr->disable_intr(instance);
+ con_log(CL_ANN1, (CE_NOTE,
+ "dr_sas: FW in OPERATIONAL state"));
+ /*
+ * PCI_Hot Plug: MFI F/W requires
+ * (MFI_INIT_READY | MFI_INIT_MFIMODE | MFI_INIT_ABORT)
+ * to be set
+ */
+ /* WR_IB_DOORBELL(MFI_INIT_READY, instance); */
+ WR_IB_DOORBELL(MFI_RESET_FLAGS, instance);
+
+ max_wait = 10;
+ cur_state = MFI_STATE_OPERATIONAL;
+ break;
+ case MFI_STATE_UNDEFINED:
+ /* this state should not last for more than 2 seconds */
+ con_log(CL_ANN, (CE_NOTE, "FW state undefined"));
+
+ max_wait = 2;
+ cur_state = MFI_STATE_UNDEFINED;
+ break;
+ case MFI_STATE_BB_INIT:
+ max_wait = 2;
+ cur_state = MFI_STATE_BB_INIT;
+ break;
+ case MFI_STATE_FW_INIT:
+ max_wait = 2;
+ cur_state = MFI_STATE_FW_INIT;
+ break;
+ case MFI_STATE_DEVICE_SCAN:
+ max_wait = 10;
+ cur_state = MFI_STATE_DEVICE_SCAN;
+ break;
+ default:
+ con_log(CL_ANN, (CE_NOTE,
+ "dr_sas: Unknown state 0x%x", fw_state));
+ return (ENODEV);
+ }
+
+ /* the cur_state should not last for more than max_wait secs */
+ for (i = 0; i < (max_wait * MILLISEC); i++) {
+ /* fw_state = RD_OB_MSG_0(instance) & MFI_STATE_MASK; */
+ fw_state =
+ instance->func_ptr->read_fw_status_reg(instance) &
+ MFI_STATE_MASK;
+
+ if (fw_state == cur_state) {
+ delay(1 * drv_usectohz(MILLISEC));
+ } else {
+ break;
+ }
+ }
+
+ /* return error if fw_state hasn't changed after max_wait */
+ if (fw_state == cur_state) {
+ con_log(CL_ANN, (CE_NOTE,
+ "FW state hasn't changed in %d secs", max_wait));
+ return (ENODEV);
+ }
+ };
+
+ fw_ctrl = RD_IB_DOORBELL(instance);
+
+ con_log(CL_ANN1, (CE_NOTE,
+ "mfi_state_transition_to_ready:FW ctrl = 0x%x", fw_ctrl));
+
+ /*
+ * Write 0xF to the doorbell register to do the following.
+ * - Abort all outstanding commands (bit 0).
+ * - Transition from OPERATIONAL to READY state (bit 1).
+ * - Discard (possible) low MFA posted in 64-bit mode (bit-2).
+ * - Set to release FW to continue running (i.e. BIOS handshake
+ * (bit 3).
+ */
+ WR_IB_DOORBELL(0xF, instance);
+
+ if (drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) {
+ return (ENODEV);
+ }
+ return (DDI_SUCCESS);
+}
+
+/*
+ * get_seq_num
+ */
+static int
+get_seq_num(struct drsas_instance *instance,
+ struct drsas_evt_log_info *eli)
+{
+ int ret = DDI_SUCCESS;
+
+ dma_obj_t dcmd_dma_obj;
+ struct drsas_cmd *cmd;
+ struct drsas_dcmd_frame *dcmd;
+
+ cmd = get_mfi_pkt(instance);
+
+ if (!cmd) {
+ cmn_err(CE_WARN, "dr_sas: failed to get a cmd");
+ return (ENOMEM);
+ }
+ /* Clear the frame buffer and assign back the context id */
+ (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame));
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context,
+ cmd->index);
+
+ dcmd = &cmd->frame->dcmd;
+
+ /* allocate the data transfer buffer */
+ dcmd_dma_obj.size = sizeof (struct drsas_evt_log_info);
+ dcmd_dma_obj.dma_attr = drsas_generic_dma_attr;
+ dcmd_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+ dcmd_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU;
+ dcmd_dma_obj.dma_attr.dma_attr_sgllen = 1;
+ dcmd_dma_obj.dma_attr.dma_attr_align = 1;
+
+ if (drsas_alloc_dma_obj(instance, &dcmd_dma_obj,
+ (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+ con_log(CL_ANN, (CE_WARN,
+ "get_seq_num: could not allocate data transfer buffer."));
+ return (DDI_FAILURE);
+ }
+
+ (void) memset(dcmd_dma_obj.buffer, 0,
+ sizeof (struct drsas_evt_log_info));
+
+ (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ);
+
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD);
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0);
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1);
+ ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags,
+ MFI_FRAME_DIR_READ);
+ ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0);
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len,
+ sizeof (struct drsas_evt_log_info));
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode,
+ DR_DCMD_CTRL_EVENT_GET_INFO);
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length,
+ sizeof (struct drsas_evt_log_info));
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr,
+ dcmd_dma_obj.dma_cookie[0].dmac_address);
+
+ cmd->sync_cmd = DRSAS_TRUE;
+ cmd->frame_count = 1;
+
+ if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) {
+ cmn_err(CE_WARN, "get_seq_num: "
+ "failed to issue DRSAS_DCMD_CTRL_EVENT_GET_INFO");
+ ret = DDI_FAILURE;
+ } else {
+ /* copy the data back into callers buffer */
+ ddi_rep_get8(cmd->frame_dma_obj.acc_handle, (uint8_t *)eli,
+ (uint8_t *)dcmd_dma_obj.buffer,
+ sizeof (struct drsas_evt_log_info), DDI_DEV_AUTOINCR);
+ ret = DDI_SUCCESS;
+ }
+
+ if (drsas_free_dma_obj(instance, dcmd_dma_obj) != DDI_SUCCESS)
+ ret = DDI_FAILURE;
+
+ return_mfi_pkt(instance, cmd);
+ if (drsas_common_check(instance, cmd) != DDI_SUCCESS) {
+ ret = DDI_FAILURE;
+ }
+ return (ret);
+}
+
+/*
+ * start_mfi_aen
+ */
+static int
+start_mfi_aen(struct drsas_instance *instance)
+{
+ int ret = 0;
+
+ struct drsas_evt_log_info eli;
+ union drsas_evt_class_locale class_locale;
+
+ /* get the latest sequence number from FW */
+ (void) memset(&eli, 0, sizeof (struct drsas_evt_log_info));
+
+ if (get_seq_num(instance, &eli)) {
+ cmn_err(CE_WARN, "start_mfi_aen: failed to get seq num");
+ return (-1);
+ }
+
+ /* register AEN with FW for latest sequence number plus 1 */
+ class_locale.members.reserved = 0;
+ class_locale.members.locale = DR_EVT_LOCALE_ALL;
+ class_locale.members.class = DR_EVT_CLASS_INFO;
+ ret = register_mfi_aen(instance, eli.newest_seq_num + 1,
+ class_locale.word);
+
+ if (ret) {
+ cmn_err(CE_WARN, "start_mfi_aen: aen registration failed");
+ return (-1);
+ }
+
+ return (ret);
+}
+
+/*
+ * flush_cache
+ */
+static void
+flush_cache(struct drsas_instance *instance)
+{
+ struct drsas_cmd *cmd = NULL;
+ struct drsas_dcmd_frame *dcmd;
+ uint32_t max_cmd = instance->max_fw_cmds;
+
+ cmd = instance->cmd_list[max_cmd];
+
+ if (cmd == NULL)
+ return;
+
+ dcmd = &cmd->frame->dcmd;
+
+ (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ);
+
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD);
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0x0);
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 0);
+ ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags,
+ MFI_FRAME_DIR_NONE);
+ ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0);
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, 0);
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode,
+ DR_DCMD_CTRL_CACHE_FLUSH);
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.b[0],
+ DR_FLUSH_CTRL_CACHE | DR_FLUSH_DISK_CACHE);
+
+ cmd->frame_count = 1;
+
+ if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) {
+ con_log(CL_ANN1, (CE_WARN,
+ "flush_cache: failed to issue MFI_DCMD_CTRL_CACHE_FLUSH"));
+ }
+ con_log(CL_DLEVEL1, (CE_NOTE, "done"));
+}
+
+/*
+ * service_mfi_aen- Completes an AEN command
+ * @instance: Adapter soft state
+ * @cmd: Command to be completed
+ *
+ */
+static void
+service_mfi_aen(struct drsas_instance *instance, struct drsas_cmd *cmd)
+{
+ uint32_t seq_num;
+ struct drsas_evt_detail *evt_detail =
+ (struct drsas_evt_detail *)instance->mfi_evt_detail_obj.buffer;
+ int rval = 0;
+ int tgt = 0;
+ ddi_acc_handle_t acc_handle;
+
+ acc_handle = cmd->frame_dma_obj.acc_handle;
+
+ cmd->cmd_status = ddi_get8(acc_handle, &cmd->frame->io.cmd_status);
+
+ if (cmd->cmd_status == ENODATA) {
+ cmd->cmd_status = 0;
+ }
+
+ /*
+ * log the MFI AEN event to the sysevent queue so that
+ * application will get noticed
+ */
+ if (ddi_log_sysevent(instance->dip, DDI_VENDOR_LSI, "LSIMEGA", "SAS",
+ NULL, NULL, DDI_NOSLEEP) != DDI_SUCCESS) {
+ int instance_no = ddi_get_instance(instance->dip);
+ con_log(CL_ANN, (CE_WARN,
+ "dr_sas%d: Failed to log AEN event", instance_no));
+ }
+ /*
+ * Check for any ld devices that has changed state. i.e. online
+ * or offline.
+ */
+ con_log(CL_ANN1, (CE_NOTE,
+ "AEN: code = %x class = %x locale = %x args = %x",
+ ddi_get32(acc_handle, &evt_detail->code),
+ evt_detail->cl.members.class,
+ ddi_get16(acc_handle, &evt_detail->cl.members.locale),
+ ddi_get8(acc_handle, &evt_detail->arg_type)));
+
+ switch (ddi_get32(acc_handle, &evt_detail->code)) {
+ case DR_EVT_CFG_CLEARED: {
+ for (tgt = 0; tgt < MRDRV_MAX_LD; tgt++) {
+ if (instance->dr_ld_list[tgt].dip != NULL) {
+ rval = drsas_service_evt(instance, tgt, 0,
+ DRSAS_EVT_UNCONFIG_TGT);
+ con_log(CL_ANN1, (CE_WARN,
+ "dr_sas: CFG CLEARED AEN rval = %d "
+ "tgt id = %d", rval, tgt));
+ }
+ }
+ break;
+ }
+
+ case DR_EVT_LD_DELETED: {
+ rval = drsas_service_evt(instance,
+ ddi_get16(acc_handle, &evt_detail->args.ld.target_id), 0,
+ DRSAS_EVT_UNCONFIG_TGT);
+ con_log(CL_ANN1, (CE_WARN, "dr_sas: LD DELETED AEN rval = %d "
+ "tgt id = %d index = %d", rval,
+ ddi_get16(acc_handle, &evt_detail->args.ld.target_id),
+ ddi_get8(acc_handle, &evt_detail->args.ld.ld_index)));
+ break;
+ } /* End of DR_EVT_LD_DELETED */
+
+ case DR_EVT_LD_CREATED: {
+ rval = drsas_service_evt(instance,
+ ddi_get16(acc_handle, &evt_detail->args.ld.target_id), 0,
+ DRSAS_EVT_CONFIG_TGT);
+ con_log(CL_ANN1, (CE_WARN, "dr_sas: LD CREATED AEN rval = %d "
+ "tgt id = %d index = %d", rval,
+ ddi_get16(acc_handle, &evt_detail->args.ld.target_id),
+ ddi_get8(acc_handle, &evt_detail->args.ld.ld_index)));
+ break;
+ } /* End of DR_EVT_LD_CREATED */
+ } /* End of Main Switch */
+
+ /* get copy of seq_num and class/locale for re-registration */
+ seq_num = ddi_get32(acc_handle, &evt_detail->seq_num);
+ seq_num++;
+ (void) memset(instance->mfi_evt_detail_obj.buffer, 0,
+ sizeof (struct drsas_evt_detail));
+
+ ddi_put8(acc_handle, &cmd->frame->dcmd.cmd_status, 0x0);
+ ddi_put32(acc_handle, &cmd->frame->dcmd.mbox.w[0], seq_num);
+
+ instance->aen_seq_num = seq_num;
+
+ cmd->frame_count = 1;
+
+ /* Issue the aen registration frame */
+ instance->func_ptr->issue_cmd(cmd, instance);
+}
+
+/*
+ * complete_cmd_in_sync_mode - Completes an internal command
+ * @instance: Adapter soft state
+ * @cmd: Command to be completed
+ *
+ * The issue_cmd_in_sync_mode() function waits for a command to complete
+ * after it issues a command. This function wakes up that waiting routine by
+ * calling wake_up() on the wait queue.
+ */
+static void
+complete_cmd_in_sync_mode(struct drsas_instance *instance,
+ struct drsas_cmd *cmd)
+{
+ cmd->cmd_status = ddi_get8(cmd->frame_dma_obj.acc_handle,
+ &cmd->frame->io.cmd_status);
+
+ cmd->sync_cmd = DRSAS_FALSE;
+
+ if (cmd->cmd_status == ENODATA) {
+ cmd->cmd_status = 0;
+ }
+
+ cv_broadcast(&instance->int_cmd_cv);
+}
+
+/*
+ * drsas_softintr - The Software ISR
+ * @param arg : HBA soft state
+ *
+ * called from high-level interrupt if hi-level interrupt are not there,
+ * otherwise triggered as a soft interrupt
+ */
+static uint_t
+drsas_softintr(struct drsas_instance *instance)
+{
+ struct scsi_pkt *pkt;
+ struct scsa_cmd *acmd;
+ struct drsas_cmd *cmd;
+ struct mlist_head *pos, *next;
+ mlist_t process_list;
+ struct drsas_header *hdr;
+ struct scsi_arq_status *arqstat;
+
+ con_log(CL_ANN1, (CE_CONT, "drsas_softintr called"));
+
+ ASSERT(instance);
+ mutex_enter(&instance->completed_pool_mtx);
+
+ if (mlist_empty(&instance->completed_pool_list)) {
+ mutex_exit(&instance->completed_pool_mtx);
+ return (DDI_INTR_UNCLAIMED);
+ }
+
+ instance->softint_running = 1;
+
+ INIT_LIST_HEAD(&process_list);
+ mlist_splice(&instance->completed_pool_list, &process_list);
+ INIT_LIST_HEAD(&instance->completed_pool_list);
+
+ mutex_exit(&instance->completed_pool_mtx);
+
+ /* perform all callbacks first, before releasing the SCBs */
+ mlist_for_each_safe(pos, next, &process_list) {
+ cmd = mlist_entry(pos, struct drsas_cmd, list);
+
+ /* syncronize the Cmd frame for the controller */
+ (void) ddi_dma_sync(cmd->frame_dma_obj.dma_handle,
+ 0, 0, DDI_DMA_SYNC_FORCPU);
+
+ if (drsas_check_dma_handle(cmd->frame_dma_obj.dma_handle) !=
+ DDI_SUCCESS) {
+ drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE);
+ ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST);
+ return (DDI_INTR_UNCLAIMED);
+ }
+
+ hdr = &cmd->frame->hdr;
+
+ /* remove the internal command from the process list */
+ mlist_del_init(&cmd->list);
+
+ switch (ddi_get8(cmd->frame_dma_obj.acc_handle, &hdr->cmd)) {
+ case MFI_CMD_OP_PD_SCSI:
+ case MFI_CMD_OP_LD_SCSI:
+ case MFI_CMD_OP_LD_READ:
+ case MFI_CMD_OP_LD_WRITE:
+ /*
+ * MFI_CMD_OP_PD_SCSI and MFI_CMD_OP_LD_SCSI
+ * could have been issued either through an
+ * IO path or an IOCTL path. If it was via IOCTL,
+ * we will send it to internal completion.
+ */
+ if (cmd->sync_cmd == DRSAS_TRUE) {
+ complete_cmd_in_sync_mode(instance, cmd);
+ break;
+ }
+
+ /* regular commands */
+ acmd = cmd->cmd;
+ pkt = CMD2PKT(acmd);
+
+ if (acmd->cmd_flags & CFLAG_DMAVALID) {
+ if (acmd->cmd_flags & CFLAG_CONSISTENT) {
+ (void) ddi_dma_sync(acmd->cmd_dmahandle,
+ acmd->cmd_dma_offset,
+ acmd->cmd_dma_len,
+ DDI_DMA_SYNC_FORCPU);
+ }
+ }
+
+ pkt->pkt_reason = CMD_CMPLT;
+ pkt->pkt_statistics = 0;
+ pkt->pkt_state = STATE_GOT_BUS
+ | STATE_GOT_TARGET | STATE_SENT_CMD
+ | STATE_XFERRED_DATA | STATE_GOT_STATUS;
+
+ con_log(CL_ANN1, (CE_CONT,
+ "CDB[0] = %x completed for %s: size %lx context %x",
+ pkt->pkt_cdbp[0], ((acmd->islogical) ? "LD" : "PD"),
+ acmd->cmd_dmacount, hdr->context));
+
+ if (pkt->pkt_cdbp[0] == SCMD_INQUIRY) {
+ struct scsi_inquiry *inq;
+
+ if (acmd->cmd_dmacount != 0) {
+ bp_mapin(acmd->cmd_buf);
+ inq = (struct scsi_inquiry *)
+ acmd->cmd_buf->b_un.b_addr;
+
+ /* don't expose physical drives to OS */
+ if (acmd->islogical &&
+ (hdr->cmd_status == MFI_STAT_OK)) {
+ display_scsi_inquiry(
+ (caddr_t)inq);
+ } else if ((hdr->cmd_status ==
+ MFI_STAT_OK) && inq->inq_dtype ==
+ DTYPE_DIRECT) {
+
+ display_scsi_inquiry(
+ (caddr_t)inq);
+
+ /* for physical disk */
+ hdr->cmd_status =
+ MFI_STAT_DEVICE_NOT_FOUND;
+ }
+ }
+ }
+
+ switch (hdr->cmd_status) {
+ case MFI_STAT_OK:
+ pkt->pkt_scbp[0] = STATUS_GOOD;
+ break;
+ case MFI_STAT_LD_CC_IN_PROGRESS:
+ case MFI_STAT_LD_RECON_IN_PROGRESS:
+ pkt->pkt_scbp[0] = STATUS_GOOD;
+ break;
+ case MFI_STAT_LD_INIT_IN_PROGRESS:
+ con_log(CL_ANN,
+ (CE_WARN, "Initialization in Progress"));
+ pkt->pkt_reason = CMD_TRAN_ERR;
+
+ break;
+ case MFI_STAT_SCSI_DONE_WITH_ERROR:
+ con_log(CL_ANN1, (CE_CONT, "scsi_done error"));
+
+ pkt->pkt_reason = CMD_CMPLT;
+ ((struct scsi_status *)
+ pkt->pkt_scbp)->sts_chk = 1;
+
+ if (pkt->pkt_cdbp[0] == SCMD_TEST_UNIT_READY) {
+
+ con_log(CL_ANN,
+ (CE_WARN, "TEST_UNIT_READY fail"));
+
+ } else {
+ pkt->pkt_state |= STATE_ARQ_DONE;
+ arqstat = (void *)(pkt->pkt_scbp);
+ arqstat->sts_rqpkt_reason = CMD_CMPLT;
+ arqstat->sts_rqpkt_resid = 0;
+ arqstat->sts_rqpkt_state |=
+ STATE_GOT_BUS | STATE_GOT_TARGET
+ | STATE_SENT_CMD
+ | STATE_XFERRED_DATA;
+ *(uint8_t *)&arqstat->sts_rqpkt_status =
+ STATUS_GOOD;
+ ddi_rep_get8(
+ cmd->frame_dma_obj.acc_handle,
+ (uint8_t *)
+ &(arqstat->sts_sensedata),
+ cmd->sense,
+ acmd->cmd_scblen -
+ offsetof(struct scsi_arq_status,
+ sts_sensedata), DDI_DEV_AUTOINCR);
+ }
+ break;
+ case MFI_STAT_LD_OFFLINE:
+ case MFI_STAT_DEVICE_NOT_FOUND:
+ con_log(CL_ANN1, (CE_CONT,
+ "device not found error"));
+ pkt->pkt_reason = CMD_DEV_GONE;
+ pkt->pkt_statistics = STAT_DISCON;
+ break;
+ case MFI_STAT_LD_LBA_OUT_OF_RANGE:
+ pkt->pkt_state |= STATE_ARQ_DONE;
+ pkt->pkt_reason = CMD_CMPLT;
+ ((struct scsi_status *)
+ pkt->pkt_scbp)->sts_chk = 1;
+
+ arqstat = (void *)(pkt->pkt_scbp);
+ arqstat->sts_rqpkt_reason = CMD_CMPLT;
+ arqstat->sts_rqpkt_resid = 0;
+ arqstat->sts_rqpkt_state |= STATE_GOT_BUS
+ | STATE_GOT_TARGET | STATE_SENT_CMD
+ | STATE_XFERRED_DATA;
+ *(uint8_t *)&arqstat->sts_rqpkt_status =
+ STATUS_GOOD;
+
+ arqstat->sts_sensedata.es_valid = 1;
+ arqstat->sts_sensedata.es_key =
+ KEY_ILLEGAL_REQUEST;
+ arqstat->sts_sensedata.es_class =
+ CLASS_EXTENDED_SENSE;
+
+ /*
+ * LOGICAL BLOCK ADDRESS OUT OF RANGE:
+ * ASC: 0x21h; ASCQ: 0x00h;
+ */
+ arqstat->sts_sensedata.es_add_code = 0x21;
+ arqstat->sts_sensedata.es_qual_code = 0x00;
+
+ break;
+
+ default:
+ con_log(CL_ANN, (CE_CONT, "Unknown status!"));
+ pkt->pkt_reason = CMD_TRAN_ERR;
+
+ break;
+ }
+
+ atomic_add_16(&instance->fw_outstanding, (-1));
+
+ return_mfi_pkt(instance, cmd);
+
+ (void) drsas_common_check(instance, cmd);
+
+ if (acmd->cmd_dmahandle) {
+ if (drsas_check_dma_handle(
+ acmd->cmd_dmahandle) != DDI_SUCCESS) {
+ ddi_fm_service_impact(instance->dip,
+ DDI_SERVICE_UNAFFECTED);
+ pkt->pkt_reason = CMD_TRAN_ERR;
+ pkt->pkt_statistics = 0;
+ }
+ }
+
+ /* Call the callback routine */
+ if (((pkt->pkt_flags & FLAG_NOINTR) == 0) &&
+ pkt->pkt_comp) {
+ (*pkt->pkt_comp)(pkt);
+ }
+
+ break;
+ case MFI_CMD_OP_SMP:
+ case MFI_CMD_OP_STP:
+ complete_cmd_in_sync_mode(instance, cmd);
+ break;
+ case MFI_CMD_OP_DCMD:
+ /* see if got an event notification */
+ if (ddi_get32(cmd->frame_dma_obj.acc_handle,
+ &cmd->frame->dcmd.opcode) ==
+ DR_DCMD_CTRL_EVENT_WAIT) {
+ if ((instance->aen_cmd == cmd) &&
+ (instance->aen_cmd->abort_aen)) {
+ con_log(CL_ANN, (CE_WARN,
+ "drsas_softintr: "
+ "aborted_aen returned"));
+ } else {
+ atomic_add_16(&instance->fw_outstanding,
+ (-1));
+ service_mfi_aen(instance, cmd);
+ }
+ } else {
+ complete_cmd_in_sync_mode(instance, cmd);
+ }
+
+ break;
+ case MFI_CMD_OP_ABORT:
+ con_log(CL_ANN, (CE_WARN, "MFI_CMD_OP_ABORT complete"));
+ /*
+ * MFI_CMD_OP_ABORT successfully completed
+ * in the synchronous mode
+ */
+ complete_cmd_in_sync_mode(instance, cmd);
+ break;
+ default:
+ drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE);
+ ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST);
+
+ if (cmd->pkt != NULL) {
+ pkt = cmd->pkt;
+ if (((pkt->pkt_flags & FLAG_NOINTR) == 0) &&
+ pkt->pkt_comp) {
+ (*pkt->pkt_comp)(pkt);
+ }
+ }
+ con_log(CL_ANN, (CE_WARN, "Cmd type unknown !"));
+ break;
+ }
+ }
+
+ instance->softint_running = 0;
+
+ return (DDI_INTR_CLAIMED);
+}
+
+/*
+ * drsas_alloc_dma_obj
+ *
+ * Allocate the memory and other resources for an dma object.
+ */
+static int
+drsas_alloc_dma_obj(struct drsas_instance *instance, dma_obj_t *obj,
+ uchar_t endian_flags)
+{
+ int i;
+ size_t alen = 0;
+ uint_t cookie_cnt;
+ struct ddi_device_acc_attr tmp_endian_attr;
+
+ tmp_endian_attr = endian_attr;
+ tmp_endian_attr.devacc_attr_endian_flags = endian_flags;
+
+ i = ddi_dma_alloc_handle(instance->dip, &obj->dma_attr,
+ DDI_DMA_SLEEP, NULL, &obj->dma_handle);
+ if (i != DDI_SUCCESS) {
+
+ switch (i) {
+ case DDI_DMA_BADATTR :
+ con_log(CL_ANN, (CE_WARN,
+ "Failed ddi_dma_alloc_handle- Bad attribute"));
+ break;
+ case DDI_DMA_NORESOURCES :
+ con_log(CL_ANN, (CE_WARN,
+ "Failed ddi_dma_alloc_handle- No Resources"));
+ break;
+ default :
+ con_log(CL_ANN, (CE_WARN,
+ "Failed ddi_dma_alloc_handle: "
+ "unknown status %d", i));
+ break;
+ }
+
+ return (-1);
+ }
+
+ if ((ddi_dma_mem_alloc(obj->dma_handle, obj->size, &tmp_endian_attr,
+ DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_SLEEP, NULL,
+ &obj->buffer, &alen, &obj->acc_handle) != DDI_SUCCESS) ||
+ alen < obj->size) {
+
+ ddi_dma_free_handle(&obj->dma_handle);
+
+ con_log(CL_ANN, (CE_WARN, "Failed : ddi_dma_mem_alloc"));
+
+ return (-1);
+ }
+
+ if (ddi_dma_addr_bind_handle(obj->dma_handle, NULL, obj->buffer,
+ obj->size, DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_SLEEP,
+ NULL, &obj->dma_cookie[0], &cookie_cnt) != DDI_SUCCESS) {
+
+ ddi_dma_mem_free(&obj->acc_handle);
+ ddi_dma_free_handle(&obj->dma_handle);
+
+ con_log(CL_ANN, (CE_WARN, "Failed : ddi_dma_addr_bind_handle"));
+
+ return (-1);
+ }
+
+ if (drsas_check_dma_handle(obj->dma_handle) != DDI_SUCCESS) {
+ ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST);
+ return (-1);
+ }
+
+ if (drsas_check_acc_handle(obj->acc_handle) != DDI_SUCCESS) {
+ ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST);
+ return (-1);
+ }
+
+ return (cookie_cnt);
+}
+
+/*
+ * drsas_free_dma_obj(struct drsas_instance *, dma_obj_t)
+ *
+ * De-allocate the memory and other resources for an dma object, which must
+ * have been alloated by a previous call to drsas_alloc_dma_obj()
+ */
+static int
+drsas_free_dma_obj(struct drsas_instance *instance, dma_obj_t obj)
+{
+
+ if (drsas_check_dma_handle(obj.dma_handle) != DDI_SUCCESS) {
+ ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED);
+ return (DDI_FAILURE);
+ }
+
+ if (drsas_check_acc_handle(obj.acc_handle) != DDI_SUCCESS) {
+ ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED);
+ return (DDI_FAILURE);
+ }
+
+ (void) ddi_dma_unbind_handle(obj.dma_handle);
+ ddi_dma_mem_free(&obj.acc_handle);
+ ddi_dma_free_handle(&obj.dma_handle);
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * drsas_dma_alloc(instance_t *, struct scsi_pkt *, struct buf *,
+ * int, int (*)())
+ *
+ * Allocate dma resources for a new scsi command
+ */
+static int
+drsas_dma_alloc(struct drsas_instance *instance, struct scsi_pkt *pkt,
+ struct buf *bp, int flags, int (*callback)())
+{
+ int dma_flags;
+ int (*cb)(caddr_t);
+ int i;
+
+ ddi_dma_attr_t tmp_dma_attr = drsas_generic_dma_attr;
+ struct scsa_cmd *acmd = PKT2CMD(pkt);
+
+ acmd->cmd_buf = bp;
+
+ if (bp->b_flags & B_READ) {
+ acmd->cmd_flags &= ~CFLAG_DMASEND;
+ dma_flags = DDI_DMA_READ;
+ } else {
+ acmd->cmd_flags |= CFLAG_DMASEND;
+ dma_flags = DDI_DMA_WRITE;
+ }
+
+ if (flags & PKT_CONSISTENT) {
+ acmd->cmd_flags |= CFLAG_CONSISTENT;
+ dma_flags |= DDI_DMA_CONSISTENT;
+ }
+
+ if (flags & PKT_DMA_PARTIAL) {
+ dma_flags |= DDI_DMA_PARTIAL;
+ }
+
+ dma_flags |= DDI_DMA_REDZONE;
+
+ cb = (callback == NULL_FUNC) ? DDI_DMA_DONTWAIT : DDI_DMA_SLEEP;
+
+ tmp_dma_attr.dma_attr_sgllen = instance->max_num_sge;
+ tmp_dma_attr.dma_attr_addr_hi = 0xffffffffffffffffull;
+
+ if ((i = ddi_dma_alloc_handle(instance->dip, &tmp_dma_attr,
+ cb, 0, &acmd->cmd_dmahandle)) != DDI_SUCCESS) {
+ switch (i) {
+ case DDI_DMA_BADATTR:
+ bioerror(bp, EFAULT);
+ return (DDI_FAILURE);
+
+ case DDI_DMA_NORESOURCES:
+ bioerror(bp, 0);
+ return (DDI_FAILURE);
+
+ default:
+ con_log(CL_ANN, (CE_PANIC, "ddi_dma_alloc_handle: "
+ "impossible result (0x%x)", i));
+ bioerror(bp, EFAULT);
+ return (DDI_FAILURE);
+ }
+ }
+
+ i = ddi_dma_buf_bind_handle(acmd->cmd_dmahandle, bp, dma_flags,
+ cb, 0, &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies);
+
+ switch (i) {
+ case DDI_DMA_PARTIAL_MAP:
+ if ((dma_flags & DDI_DMA_PARTIAL) == 0) {
+ con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle: "
+ "DDI_DMA_PARTIAL_MAP impossible"));
+ goto no_dma_cookies;
+ }
+
+ if (ddi_dma_numwin(acmd->cmd_dmahandle, &acmd->cmd_nwin) ==
+ DDI_FAILURE) {
+ con_log(CL_ANN, (CE_PANIC, "ddi_dma_numwin failed"));
+ goto no_dma_cookies;
+ }
+
+ if (ddi_dma_getwin(acmd->cmd_dmahandle, acmd->cmd_curwin,
+ &acmd->cmd_dma_offset, &acmd->cmd_dma_len,
+ &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies) ==
+ DDI_FAILURE) {
+
+ con_log(CL_ANN, (CE_PANIC, "ddi_dma_getwin failed"));
+ goto no_dma_cookies;
+ }
+
+ goto get_dma_cookies;
+ case DDI_DMA_MAPPED:
+ acmd->cmd_nwin = 1;
+ acmd->cmd_dma_len = 0;
+ acmd->cmd_dma_offset = 0;
+
+get_dma_cookies:
+ i = 0;
+ acmd->cmd_dmacount = 0;
+ for (;;) {
+ acmd->cmd_dmacount +=
+ acmd->cmd_dmacookies[i++].dmac_size;
+
+ if (i == instance->max_num_sge ||
+ i == acmd->cmd_ncookies)
+ break;
+
+ ddi_dma_nextcookie(acmd->cmd_dmahandle,
+ &acmd->cmd_dmacookies[i]);
+ }
+
+ acmd->cmd_cookie = i;
+ acmd->cmd_cookiecnt = i;
+
+ acmd->cmd_flags |= CFLAG_DMAVALID;
+
+ if (bp->b_bcount >= acmd->cmd_dmacount) {
+ pkt->pkt_resid = bp->b_bcount - acmd->cmd_dmacount;
+ } else {
+ pkt->pkt_resid = 0;
+ }
+
+ return (DDI_SUCCESS);
+ case DDI_DMA_NORESOURCES:
+ bioerror(bp, 0);
+ break;
+ case DDI_DMA_NOMAPPING:
+ bioerror(bp, EFAULT);
+ break;
+ case DDI_DMA_TOOBIG:
+ bioerror(bp, EINVAL);
+ break;
+ case DDI_DMA_INUSE:
+ con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle:"
+ " DDI_DMA_INUSE impossible"));
+ break;
+ default:
+ con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle: "
+ "impossible result (0x%x)", i));
+ break;
+ }
+
+no_dma_cookies:
+ ddi_dma_free_handle(&acmd->cmd_dmahandle);
+ acmd->cmd_dmahandle = NULL;
+ acmd->cmd_flags &= ~CFLAG_DMAVALID;
+ return (DDI_FAILURE);
+}
+
+/*
+ * drsas_dma_move(struct drsas_instance *, struct scsi_pkt *, struct buf *)
+ *
+ * move dma resources to next dma window
+ *
+ */
+static int
+drsas_dma_move(struct drsas_instance *instance, struct scsi_pkt *pkt,
+ struct buf *bp)
+{
+ int i = 0;
+
+ struct scsa_cmd *acmd = PKT2CMD(pkt);
+
+ /*
+ * If there are no more cookies remaining in this window,
+ * must move to the next window first.
+ */
+ if (acmd->cmd_cookie == acmd->cmd_ncookies) {
+ if (acmd->cmd_curwin == acmd->cmd_nwin && acmd->cmd_nwin == 1) {
+ return (DDI_SUCCESS);
+ }
+
+ /* at last window, cannot move */
+ if (++acmd->cmd_curwin >= acmd->cmd_nwin) {
+ return (DDI_FAILURE);
+ }
+
+ if (ddi_dma_getwin(acmd->cmd_dmahandle, acmd->cmd_curwin,
+ &acmd->cmd_dma_offset, &acmd->cmd_dma_len,
+ &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies) ==
+ DDI_FAILURE) {
+ return (DDI_FAILURE);
+ }
+
+ acmd->cmd_cookie = 0;
+ } else {
+ /* still more cookies in this window - get the next one */
+ ddi_dma_nextcookie(acmd->cmd_dmahandle,
+ &acmd->cmd_dmacookies[0]);
+ }
+
+ /* get remaining cookies in this window, up to our maximum */
+ for (;;) {
+ acmd->cmd_dmacount += acmd->cmd_dmacookies[i++].dmac_size;
+ acmd->cmd_cookie++;
+
+ if (i == instance->max_num_sge ||
+ acmd->cmd_cookie == acmd->cmd_ncookies) {
+ break;
+ }
+
+ ddi_dma_nextcookie(acmd->cmd_dmahandle,
+ &acmd->cmd_dmacookies[i]);
+ }
+
+ acmd->cmd_cookiecnt = i;
+
+ if (bp->b_bcount >= acmd->cmd_dmacount) {
+ pkt->pkt_resid = bp->b_bcount - acmd->cmd_dmacount;
+ } else {
+ pkt->pkt_resid = 0;
+ }
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * build_cmd
+ */
+static struct drsas_cmd *
+build_cmd(struct drsas_instance *instance, struct scsi_address *ap,
+ struct scsi_pkt *pkt, uchar_t *cmd_done)
+{
+ uint16_t flags = 0;
+ uint32_t i;
+ uint32_t context __unused;
+ uint32_t sge_bytes;
+ ddi_acc_handle_t acc_handle;
+ struct drsas_cmd *cmd;
+ struct drsas_sge64 *mfi_sgl;
+ struct scsa_cmd *acmd = PKT2CMD(pkt);
+ struct drsas_pthru_frame *pthru;
+ struct drsas_io_frame *ldio;
+
+ /* find out if this is logical or physical drive command. */
+ acmd->islogical = MRDRV_IS_LOGICAL(ap);
+ acmd->device_id = MAP_DEVICE_ID(instance, ap);
+ *cmd_done = 0;
+
+ /* get the command packet */
+ if (!(cmd = get_mfi_pkt(instance))) {
+ return (NULL);
+ }
+
+ acc_handle = cmd->frame_dma_obj.acc_handle;
+
+ /* Clear the frame buffer and assign back the context id */
+ (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame));
+ ddi_put32(acc_handle, &cmd->frame->hdr.context, cmd->index);
+
+ cmd->pkt = pkt;
+ cmd->cmd = acmd;
+
+ /* lets get the command directions */
+ if (acmd->cmd_flags & CFLAG_DMASEND) {
+ flags = MFI_FRAME_DIR_WRITE;
+
+ if (acmd->cmd_flags & CFLAG_CONSISTENT) {
+ (void) ddi_dma_sync(acmd->cmd_dmahandle,
+ acmd->cmd_dma_offset, acmd->cmd_dma_len,
+ DDI_DMA_SYNC_FORDEV);
+ }
+ } else if (acmd->cmd_flags & ~CFLAG_DMASEND) {
+ flags = MFI_FRAME_DIR_READ;
+
+ if (acmd->cmd_flags & CFLAG_CONSISTENT) {
+ (void) ddi_dma_sync(acmd->cmd_dmahandle,
+ acmd->cmd_dma_offset, acmd->cmd_dma_len,
+ DDI_DMA_SYNC_FORCPU);
+ }
+ } else {
+ flags = MFI_FRAME_DIR_NONE;
+ }
+
+ flags |= MFI_FRAME_SGL64;
+
+ switch (pkt->pkt_cdbp[0]) {
+
+ /*
+ * case SCMD_SYNCHRONIZE_CACHE:
+ * flush_cache(instance);
+ * return_mfi_pkt(instance, cmd);
+ * *cmd_done = 1;
+ *
+ * return (NULL);
+ */
+
+ case SCMD_READ:
+ case SCMD_WRITE:
+ case SCMD_READ_G1:
+ case SCMD_WRITE_G1:
+ if (acmd->islogical) {
+ ldio = (struct drsas_io_frame *)cmd->frame;
+
+ /*
+ * preare the Logical IO frame:
+ * 2nd bit is zero for all read cmds
+ */
+ ddi_put8(acc_handle, &ldio->cmd,
+ (pkt->pkt_cdbp[0] & 0x02) ? MFI_CMD_OP_LD_WRITE
+ : MFI_CMD_OP_LD_READ);
+ ddi_put8(acc_handle, &ldio->cmd_status, 0x0);
+ ddi_put8(acc_handle, &ldio->scsi_status, 0x0);
+ ddi_put8(acc_handle, &ldio->target_id, acmd->device_id);
+ ddi_put16(acc_handle, &ldio->timeout, 0);
+ ddi_put8(acc_handle, &ldio->reserved_0, 0);
+ ddi_put16(acc_handle, &ldio->pad_0, 0);
+ ddi_put16(acc_handle, &ldio->flags, flags);
+
+ /* Initialize sense Information */
+ bzero(cmd->sense, SENSE_LENGTH);
+ ddi_put8(acc_handle, &ldio->sense_len, SENSE_LENGTH);
+ ddi_put32(acc_handle, &ldio->sense_buf_phys_addr_hi, 0);
+ ddi_put32(acc_handle, &ldio->sense_buf_phys_addr_lo,
+ cmd->sense_phys_addr);
+ ddi_put32(acc_handle, &ldio->start_lba_hi, 0);
+ ddi_put8(acc_handle, &ldio->access_byte,
+ (acmd->cmd_cdblen != 6) ? pkt->pkt_cdbp[1] : 0);
+ ddi_put8(acc_handle, &ldio->sge_count,
+ acmd->cmd_cookiecnt);
+ mfi_sgl = (struct drsas_sge64 *)&ldio->sgl;
+
+ context = ddi_get32(acc_handle, &ldio->context);
+
+ if (acmd->cmd_cdblen == CDB_GROUP0) {
+ ddi_put32(acc_handle, &ldio->lba_count, (
+ (uint16_t)(pkt->pkt_cdbp[4])));
+
+ ddi_put32(acc_handle, &ldio->start_lba_lo, (
+ ((uint32_t)(pkt->pkt_cdbp[3])) |
+ ((uint32_t)(pkt->pkt_cdbp[2]) << 8) |
+ ((uint32_t)((pkt->pkt_cdbp[1]) & 0x1F)
+ << 16)));
+ } else if (acmd->cmd_cdblen == CDB_GROUP1) {
+ ddi_put32(acc_handle, &ldio->lba_count, (
+ ((uint16_t)(pkt->pkt_cdbp[8])) |
+ ((uint16_t)(pkt->pkt_cdbp[7]) << 8)));
+
+ ddi_put32(acc_handle, &ldio->start_lba_lo, (
+ ((uint32_t)(pkt->pkt_cdbp[5])) |
+ ((uint32_t)(pkt->pkt_cdbp[4]) << 8) |
+ ((uint32_t)(pkt->pkt_cdbp[3]) << 16) |
+ ((uint32_t)(pkt->pkt_cdbp[2]) << 24)));
+ } else if (acmd->cmd_cdblen == CDB_GROUP2) {
+ ddi_put32(acc_handle, &ldio->lba_count, (
+ ((uint16_t)(pkt->pkt_cdbp[9])) |
+ ((uint16_t)(pkt->pkt_cdbp[8]) << 8) |
+ ((uint16_t)(pkt->pkt_cdbp[7]) << 16) |
+ ((uint16_t)(pkt->pkt_cdbp[6]) << 24)));
+
+ ddi_put32(acc_handle, &ldio->start_lba_lo, (
+ ((uint32_t)(pkt->pkt_cdbp[5])) |
+ ((uint32_t)(pkt->pkt_cdbp[4]) << 8) |
+ ((uint32_t)(pkt->pkt_cdbp[3]) << 16) |
+ ((uint32_t)(pkt->pkt_cdbp[2]) << 24)));
+ } else if (acmd->cmd_cdblen == CDB_GROUP3) {
+ ddi_put32(acc_handle, &ldio->lba_count, (
+ ((uint16_t)(pkt->pkt_cdbp[13])) |
+ ((uint16_t)(pkt->pkt_cdbp[12]) << 8) |
+ ((uint16_t)(pkt->pkt_cdbp[11]) << 16) |
+ ((uint16_t)(pkt->pkt_cdbp[10]) << 24)));
+
+ ddi_put32(acc_handle, &ldio->start_lba_lo, (
+ ((uint32_t)(pkt->pkt_cdbp[9])) |
+ ((uint32_t)(pkt->pkt_cdbp[8]) << 8) |
+ ((uint32_t)(pkt->pkt_cdbp[7]) << 16) |
+ ((uint32_t)(pkt->pkt_cdbp[6]) << 24)));
+
+ ddi_put32(acc_handle, &ldio->start_lba_lo, (
+ ((uint32_t)(pkt->pkt_cdbp[5])) |
+ ((uint32_t)(pkt->pkt_cdbp[4]) << 8) |
+ ((uint32_t)(pkt->pkt_cdbp[3]) << 16) |
+ ((uint32_t)(pkt->pkt_cdbp[2]) << 24)));
+ }
+
+ break;
+ }
+ /* fall through */
+ default:
+
+ switch (pkt->pkt_cdbp[0]) {
+ case SCMD_MODE_SENSE:
+ case SCMD_MODE_SENSE_G1: {
+ union scsi_cdb *cdbp;
+ uint16_t page_code;
+
+ cdbp = (void *)pkt->pkt_cdbp;
+ page_code = (uint16_t)cdbp->cdb_un.sg.scsi[0];
+ switch (page_code) {
+ case 0x3:
+ case 0x4:
+ drsas_mode_sense_build(pkt);
+ return_mfi_pkt(instance, cmd);
+ *cmd_done = 1;
+ return (NULL);
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ pthru = (struct drsas_pthru_frame *)cmd->frame;
+
+ /* prepare the DCDB frame */
+ ddi_put8(acc_handle, &pthru->cmd, (acmd->islogical) ?
+ MFI_CMD_OP_LD_SCSI : MFI_CMD_OP_PD_SCSI);
+ ddi_put8(acc_handle, &pthru->cmd_status, 0x0);
+ ddi_put8(acc_handle, &pthru->scsi_status, 0x0);
+ ddi_put8(acc_handle, &pthru->target_id, acmd->device_id);
+ ddi_put8(acc_handle, &pthru->lun, 0);
+ ddi_put8(acc_handle, &pthru->cdb_len, acmd->cmd_cdblen);
+ ddi_put16(acc_handle, &pthru->timeout, 0);
+ ddi_put16(acc_handle, &pthru->flags, flags);
+ ddi_put32(acc_handle, &pthru->data_xfer_len,
+ acmd->cmd_dmacount);
+ ddi_put8(acc_handle, &pthru->sge_count, acmd->cmd_cookiecnt);
+ mfi_sgl = (struct drsas_sge64 *)&pthru->sgl;
+
+ bzero(cmd->sense, SENSE_LENGTH);
+ ddi_put8(acc_handle, &pthru->sense_len, SENSE_LENGTH);
+ ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_hi, 0);
+ ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_lo,
+ cmd->sense_phys_addr);
+
+ context = ddi_get32(acc_handle, &pthru->context);
+ ddi_rep_put8(acc_handle, (uint8_t *)pkt->pkt_cdbp,
+ (uint8_t *)pthru->cdb, acmd->cmd_cdblen, DDI_DEV_AUTOINCR);
+
+ break;
+ }
+#ifdef lint
+ context = context;
+#endif
+ /* prepare the scatter-gather list for the firmware */
+ for (i = 0; i < acmd->cmd_cookiecnt; i++, mfi_sgl++) {
+ ddi_put64(acc_handle, &mfi_sgl->phys_addr,
+ acmd->cmd_dmacookies[i].dmac_laddress);
+ ddi_put32(acc_handle, &mfi_sgl->length,
+ acmd->cmd_dmacookies[i].dmac_size);
+ }
+
+ sge_bytes = sizeof (struct drsas_sge64)*acmd->cmd_cookiecnt;
+
+ cmd->frame_count = (sge_bytes / MRMFI_FRAME_SIZE) +
+ ((sge_bytes % MRMFI_FRAME_SIZE) ? 1 : 0) + 1;
+
+ if (cmd->frame_count >= 8) {
+ cmd->frame_count = 8;
+ }
+
+ return (cmd);
+}
+
+/*
+ * issue_mfi_pthru
+ */
+static int
+issue_mfi_pthru(struct drsas_instance *instance, struct drsas_ioctl *ioctl,
+ struct drsas_cmd *cmd, int mode)
+{
+ void *ubuf;
+ uint32_t kphys_addr = 0;
+ uint32_t xferlen = 0;
+ uint_t model;
+ ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle;
+ dma_obj_t pthru_dma_obj;
+ struct drsas_pthru_frame *kpthru;
+ struct drsas_pthru_frame *pthru;
+ int i;
+ pthru = &cmd->frame->pthru;
+ kpthru = (struct drsas_pthru_frame *)&ioctl->frame[0];
+
+ model = ddi_model_convert_from(mode & FMODELS);
+ if (model == DDI_MODEL_ILP32) {
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP32"));
+
+ xferlen = kpthru->sgl.sge32[0].length;
+
+ ubuf = (void *)(ulong_t)kpthru->sgl.sge32[0].phys_addr;
+ } else {
+#ifdef _ILP32
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP32"));
+ xferlen = kpthru->sgl.sge32[0].length;
+ ubuf = (void *)(ulong_t)kpthru->sgl.sge32[0].phys_addr;
+#else
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP64"));
+ xferlen = kpthru->sgl.sge64[0].length;
+ ubuf = (void *)(ulong_t)kpthru->sgl.sge64[0].phys_addr;
+#endif
+ }
+
+ if (xferlen) {
+ /* means IOCTL requires DMA */
+ /* allocate the data transfer buffer */
+ pthru_dma_obj.size = xferlen;
+ pthru_dma_obj.dma_attr = drsas_generic_dma_attr;
+ pthru_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+ pthru_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU;
+ pthru_dma_obj.dma_attr.dma_attr_sgllen = 1;
+ pthru_dma_obj.dma_attr.dma_attr_align = 1;
+
+ /* allocate kernel buffer for DMA */
+ if (drsas_alloc_dma_obj(instance, &pthru_dma_obj,
+ (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+ con_log(CL_ANN, (CE_WARN, "issue_mfi_pthru: "
+ "could not allocate data transfer buffer."));
+ return (DDI_FAILURE);
+ }
+
+ /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */
+ if (kpthru->flags & MFI_FRAME_DIR_WRITE) {
+ for (i = 0; i < xferlen; i++) {
+ if (ddi_copyin((uint8_t *)ubuf+i,
+ (uint8_t *)pthru_dma_obj.buffer+i,
+ 1, mode)) {
+ con_log(CL_ANN, (CE_WARN,
+ "issue_mfi_pthru : "
+ "copy from user space failed"));
+ return (DDI_FAILURE);
+ }
+ }
+ }
+
+ kphys_addr = pthru_dma_obj.dma_cookie[0].dmac_address;
+ }
+
+ ddi_put8(acc_handle, &pthru->cmd, kpthru->cmd);
+ ddi_put8(acc_handle, &pthru->sense_len, kpthru->sense_len);
+ ddi_put8(acc_handle, &pthru->cmd_status, 0);
+ ddi_put8(acc_handle, &pthru->scsi_status, 0);
+ ddi_put8(acc_handle, &pthru->target_id, kpthru->target_id);
+ ddi_put8(acc_handle, &pthru->lun, kpthru->lun);
+ ddi_put8(acc_handle, &pthru->cdb_len, kpthru->cdb_len);
+ ddi_put8(acc_handle, &pthru->sge_count, kpthru->sge_count);
+ ddi_put16(acc_handle, &pthru->timeout, kpthru->timeout);
+ ddi_put32(acc_handle, &pthru->data_xfer_len, kpthru->data_xfer_len);
+
+ ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_hi, 0);
+ /* pthru->sense_buf_phys_addr_lo = cmd->sense_phys_addr; */
+ ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_lo, 0);
+
+ ddi_rep_put8(acc_handle, (uint8_t *)kpthru->cdb, (uint8_t *)pthru->cdb,
+ pthru->cdb_len, DDI_DEV_AUTOINCR);
+
+ ddi_put16(acc_handle, &pthru->flags, kpthru->flags & ~MFI_FRAME_SGL64);
+ ddi_put32(acc_handle, &pthru->sgl.sge32[0].length, xferlen);
+ ddi_put32(acc_handle, &pthru->sgl.sge32[0].phys_addr, kphys_addr);
+
+ cmd->sync_cmd = DRSAS_TRUE;
+ cmd->frame_count = 1;
+
+ if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) {
+ con_log(CL_ANN, (CE_WARN,
+ "issue_mfi_pthru: fw_ioctl failed"));
+ } else {
+ if (xferlen && kpthru->flags & MFI_FRAME_DIR_READ) {
+ for (i = 0; i < xferlen; i++) {
+ if (ddi_copyout(
+ (uint8_t *)pthru_dma_obj.buffer+i,
+ (uint8_t *)ubuf+i, 1, mode)) {
+ con_log(CL_ANN, (CE_WARN,
+ "issue_mfi_pthru : "
+ "copy to user space failed"));
+ return (DDI_FAILURE);
+ }
+ }
+ }
+ }
+
+ kpthru->cmd_status = ddi_get8(acc_handle, &pthru->cmd_status);
+ kpthru->scsi_status = ddi_get8(acc_handle, &pthru->scsi_status);
+
+ con_log(CL_ANN, (CE_NOTE, "issue_mfi_pthru: cmd_status %x, "
+ "scsi_status %x", kpthru->cmd_status, kpthru->scsi_status));
+
+ if (xferlen) {
+ /* free kernel buffer */
+ if (drsas_free_dma_obj(instance, pthru_dma_obj) != DDI_SUCCESS)
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * issue_mfi_dcmd
+ */
+static int
+issue_mfi_dcmd(struct drsas_instance *instance, struct drsas_ioctl *ioctl,
+ struct drsas_cmd *cmd, int mode)
+{
+ void *ubuf;
+ uint32_t kphys_addr = 0;
+ uint32_t xferlen = 0;
+ uint32_t model;
+ dma_obj_t dcmd_dma_obj;
+ struct drsas_dcmd_frame *kdcmd;
+ struct drsas_dcmd_frame *dcmd;
+ ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle;
+ int i;
+ dcmd = &cmd->frame->dcmd;
+ kdcmd = (struct drsas_dcmd_frame *)&ioctl->frame[0];
+
+ model = ddi_model_convert_from(mode & FMODELS);
+ if (model == DDI_MODEL_ILP32) {
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_ILP32"));
+
+ xferlen = kdcmd->sgl.sge32[0].length;
+
+ ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr;
+ } else {
+#ifdef _ILP32
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_ILP32"));
+ xferlen = kdcmd->sgl.sge32[0].length;
+ ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr;
+#else
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_LP64"));
+ xferlen = kdcmd->sgl.sge64[0].length;
+ ubuf = (void *)(ulong_t)kdcmd->sgl.sge64[0].phys_addr;
+#endif
+ }
+ if (xferlen) {
+ /* means IOCTL requires DMA */
+ /* allocate the data transfer buffer */
+ dcmd_dma_obj.size = xferlen;
+ dcmd_dma_obj.dma_attr = drsas_generic_dma_attr;
+ dcmd_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+ dcmd_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU;
+ dcmd_dma_obj.dma_attr.dma_attr_sgllen = 1;
+ dcmd_dma_obj.dma_attr.dma_attr_align = 1;
+
+ /* allocate kernel buffer for DMA */
+ if (drsas_alloc_dma_obj(instance, &dcmd_dma_obj,
+ (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+ con_log(CL_ANN, (CE_WARN, "issue_mfi_dcmd: "
+ "could not allocate data transfer buffer."));
+ return (DDI_FAILURE);
+ }
+
+ /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */
+ if (kdcmd->flags & MFI_FRAME_DIR_WRITE) {
+ for (i = 0; i < xferlen; i++) {
+ if (ddi_copyin((uint8_t *)ubuf + i,
+ (uint8_t *)dcmd_dma_obj.buffer + i,
+ 1, mode)) {
+ con_log(CL_ANN, (CE_WARN,
+ "issue_mfi_dcmd : "
+ "copy from user space failed"));
+ return (DDI_FAILURE);
+ }
+ }
+ }
+
+ kphys_addr = dcmd_dma_obj.dma_cookie[0].dmac_address;
+ }
+
+ ddi_put8(acc_handle, &dcmd->cmd, kdcmd->cmd);
+ ddi_put8(acc_handle, &dcmd->cmd_status, 0);
+ ddi_put8(acc_handle, &dcmd->sge_count, kdcmd->sge_count);
+ ddi_put16(acc_handle, &dcmd->timeout, kdcmd->timeout);
+ ddi_put32(acc_handle, &dcmd->data_xfer_len, kdcmd->data_xfer_len);
+ ddi_put32(acc_handle, &dcmd->opcode, kdcmd->opcode);
+
+ ddi_rep_put8(acc_handle, (uint8_t *)kdcmd->mbox.b,
+ (uint8_t *)dcmd->mbox.b, DCMD_MBOX_SZ, DDI_DEV_AUTOINCR);
+
+ ddi_put16(acc_handle, &dcmd->flags, kdcmd->flags & ~MFI_FRAME_SGL64);
+ ddi_put32(acc_handle, &dcmd->sgl.sge32[0].length, xferlen);
+ ddi_put32(acc_handle, &dcmd->sgl.sge32[0].phys_addr, kphys_addr);
+
+ cmd->sync_cmd = DRSAS_TRUE;
+ cmd->frame_count = 1;
+
+ if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) {
+ con_log(CL_ANN, (CE_WARN, "issue_mfi_dcmd: fw_ioctl failed"));
+ } else {
+ if (xferlen && (kdcmd->flags & MFI_FRAME_DIR_READ)) {
+ for (i = 0; i < xferlen; i++) {
+ if (ddi_copyout(
+ (uint8_t *)dcmd_dma_obj.buffer + i,
+ (uint8_t *)ubuf + i,
+ 1, mode)) {
+ con_log(CL_ANN, (CE_WARN,
+ "issue_mfi_dcmd : "
+ "copy to user space failed"));
+ return (DDI_FAILURE);
+ }
+ }
+ }
+ }
+
+ kdcmd->cmd_status = ddi_get8(acc_handle, &dcmd->cmd_status);
+
+ if (xferlen) {
+ /* free kernel buffer */
+ if (drsas_free_dma_obj(instance, dcmd_dma_obj) != DDI_SUCCESS)
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * issue_mfi_smp
+ */
+static int
+issue_mfi_smp(struct drsas_instance *instance, struct drsas_ioctl *ioctl,
+ struct drsas_cmd *cmd, int mode)
+{
+ void *request_ubuf;
+ void *response_ubuf;
+ uint32_t request_xferlen = 0;
+ uint32_t response_xferlen = 0;
+ uint_t model;
+ dma_obj_t request_dma_obj;
+ dma_obj_t response_dma_obj;
+ ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle;
+ struct drsas_smp_frame *ksmp;
+ struct drsas_smp_frame *smp;
+ struct drsas_sge32 *sge32;
+#ifndef _ILP32
+ struct drsas_sge64 *sge64;
+#endif
+ int i;
+ uint64_t tmp_sas_addr;
+
+ smp = &cmd->frame->smp;
+ ksmp = (struct drsas_smp_frame *)&ioctl->frame[0];
+
+ model = ddi_model_convert_from(mode & FMODELS);
+ if (model == DDI_MODEL_ILP32) {
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_ILP32"));
+
+ sge32 = &ksmp->sgl[0].sge32[0];
+ response_xferlen = sge32[0].length;
+ request_xferlen = sge32[1].length;
+ con_log(CL_ANN, (CE_NOTE, "issue_mfi_smp: "
+ "response_xferlen = %x, request_xferlen = %x",
+ response_xferlen, request_xferlen));
+
+ response_ubuf = (void *)(ulong_t)sge32[0].phys_addr;
+ request_ubuf = (void *)(ulong_t)sge32[1].phys_addr;
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: "
+ "response_ubuf = %p, request_ubuf = %p",
+ response_ubuf, request_ubuf));
+ } else {
+#ifdef _ILP32
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_ILP32"));
+
+ sge32 = &ksmp->sgl[0].sge32[0];
+ response_xferlen = sge32[0].length;
+ request_xferlen = sge32[1].length;
+ con_log(CL_ANN, (CE_NOTE, "issue_mfi_smp: "
+ "response_xferlen = %x, request_xferlen = %x",
+ response_xferlen, request_xferlen));
+
+ response_ubuf = (void *)(ulong_t)sge32[0].phys_addr;
+ request_ubuf = (void *)(ulong_t)sge32[1].phys_addr;
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: "
+ "response_ubuf = %p, request_ubuf = %p",
+ response_ubuf, request_ubuf));
+#else
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_LP64"));
+
+ sge64 = &ksmp->sgl[0].sge64[0];
+ response_xferlen = sge64[0].length;
+ request_xferlen = sge64[1].length;
+
+ response_ubuf = (void *)(ulong_t)sge64[0].phys_addr;
+ request_ubuf = (void *)(ulong_t)sge64[1].phys_addr;
+#endif
+ }
+ if (request_xferlen) {
+ /* means IOCTL requires DMA */
+ /* allocate the data transfer buffer */
+ request_dma_obj.size = request_xferlen;
+ request_dma_obj.dma_attr = drsas_generic_dma_attr;
+ request_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+ request_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU;
+ request_dma_obj.dma_attr.dma_attr_sgllen = 1;
+ request_dma_obj.dma_attr.dma_attr_align = 1;
+
+ /* allocate kernel buffer for DMA */
+ if (drsas_alloc_dma_obj(instance, &request_dma_obj,
+ (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+ con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: "
+ "could not allocate data transfer buffer."));
+ return (DDI_FAILURE);
+ }
+
+ /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */
+ for (i = 0; i < request_xferlen; i++) {
+ if (ddi_copyin((uint8_t *)request_ubuf + i,
+ (uint8_t *)request_dma_obj.buffer + i,
+ 1, mode)) {
+ con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: "
+ "copy from user space failed"));
+ return (DDI_FAILURE);
+ }
+ }
+ }
+
+ if (response_xferlen) {
+ /* means IOCTL requires DMA */
+ /* allocate the data transfer buffer */
+ response_dma_obj.size = response_xferlen;
+ response_dma_obj.dma_attr = drsas_generic_dma_attr;
+ response_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+ response_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU;
+ response_dma_obj.dma_attr.dma_attr_sgllen = 1;
+ response_dma_obj.dma_attr.dma_attr_align = 1;
+
+ /* allocate kernel buffer for DMA */
+ if (drsas_alloc_dma_obj(instance, &response_dma_obj,
+ (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+ con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: "
+ "could not allocate data transfer buffer."));
+ return (DDI_FAILURE);
+ }
+
+ /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */
+ for (i = 0; i < response_xferlen; i++) {
+ if (ddi_copyin((uint8_t *)response_ubuf + i,
+ (uint8_t *)response_dma_obj.buffer + i,
+ 1, mode)) {
+ con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: "
+ "copy from user space failed"));
+ return (DDI_FAILURE);
+ }
+ }
+ }
+
+ ddi_put8(acc_handle, &smp->cmd, ksmp->cmd);
+ ddi_put8(acc_handle, &smp->cmd_status, 0);
+ ddi_put8(acc_handle, &smp->connection_status, 0);
+ ddi_put8(acc_handle, &smp->sge_count, ksmp->sge_count);
+ /* smp->context = ksmp->context; */
+ ddi_put16(acc_handle, &smp->timeout, ksmp->timeout);
+ ddi_put32(acc_handle, &smp->data_xfer_len, ksmp->data_xfer_len);
+
+ bcopy((void *)&ksmp->sas_addr, (void *)&tmp_sas_addr,
+ sizeof (uint64_t));
+ ddi_put64(acc_handle, &smp->sas_addr, tmp_sas_addr);
+
+ ddi_put16(acc_handle, &smp->flags, ksmp->flags & ~MFI_FRAME_SGL64);
+
+ model = ddi_model_convert_from(mode & FMODELS);
+ if (model == DDI_MODEL_ILP32) {
+ con_log(CL_ANN1, (CE_NOTE,
+ "handle_drv_ioctl: DDI_MODEL_ILP32"));
+
+ sge32 = &smp->sgl[0].sge32[0];
+ ddi_put32(acc_handle, &sge32[0].length, response_xferlen);
+ ddi_put32(acc_handle, &sge32[0].phys_addr,
+ response_dma_obj.dma_cookie[0].dmac_address);
+ ddi_put32(acc_handle, &sge32[1].length, request_xferlen);
+ ddi_put32(acc_handle, &sge32[1].phys_addr,
+ request_dma_obj.dma_cookie[0].dmac_address);
+ } else {
+#ifdef _ILP32
+ con_log(CL_ANN1, (CE_NOTE,
+ "handle_drv_ioctl: DDI_MODEL_ILP32"));
+ sge32 = &smp->sgl[0].sge32[0];
+ ddi_put32(acc_handle, &sge32[0].length, response_xferlen);
+ ddi_put32(acc_handle, &sge32[0].phys_addr,
+ response_dma_obj.dma_cookie[0].dmac_address);
+ ddi_put32(acc_handle, &sge32[1].length, request_xferlen);
+ ddi_put32(acc_handle, &sge32[1].phys_addr,
+ request_dma_obj.dma_cookie[0].dmac_address);
+#else
+ con_log(CL_ANN1, (CE_NOTE,
+ "issue_mfi_smp: DDI_MODEL_LP64"));
+ sge64 = &smp->sgl[0].sge64[0];
+ ddi_put32(acc_handle, &sge64[0].length, response_xferlen);
+ ddi_put64(acc_handle, &sge64[0].phys_addr,
+ response_dma_obj.dma_cookie[0].dmac_address);
+ ddi_put32(acc_handle, &sge64[1].length, request_xferlen);
+ ddi_put64(acc_handle, &sge64[1].phys_addr,
+ request_dma_obj.dma_cookie[0].dmac_address);
+#endif
+ }
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp : "
+ "smp->response_xferlen = %d, smp->request_xferlen = %d "
+ "smp->data_xfer_len = %d", ddi_get32(acc_handle, &sge32[0].length),
+ ddi_get32(acc_handle, &sge32[1].length),
+ ddi_get32(acc_handle, &smp->data_xfer_len)));
+
+ cmd->sync_cmd = DRSAS_TRUE;
+ cmd->frame_count = 1;
+
+ if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) {
+ con_log(CL_ANN, (CE_WARN,
+ "issue_mfi_smp: fw_ioctl failed"));
+ } else {
+ con_log(CL_ANN1, (CE_NOTE,
+ "issue_mfi_smp: copy to user space"));
+
+ if (request_xferlen) {
+ for (i = 0; i < request_xferlen; i++) {
+ if (ddi_copyout(
+ (uint8_t *)request_dma_obj.buffer +
+ i, (uint8_t *)request_ubuf + i,
+ 1, mode)) {
+ con_log(CL_ANN, (CE_WARN,
+ "issue_mfi_smp : copy to user space"
+ " failed"));
+ return (DDI_FAILURE);
+ }
+ }
+ }
+
+ if (response_xferlen) {
+ for (i = 0; i < response_xferlen; i++) {
+ if (ddi_copyout(
+ (uint8_t *)response_dma_obj.buffer
+ + i, (uint8_t *)response_ubuf
+ + i, 1, mode)) {
+ con_log(CL_ANN, (CE_WARN,
+ "issue_mfi_smp : copy to "
+ "user space failed"));
+ return (DDI_FAILURE);
+ }
+ }
+ }
+ }
+
+ ksmp->cmd_status = ddi_get8(acc_handle, &smp->cmd_status);
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: smp->cmd_status = %d",
+ ddi_get8(acc_handle, &smp->cmd_status)));
+
+
+ if (request_xferlen) {
+ /* free kernel buffer */
+ if (drsas_free_dma_obj(instance, request_dma_obj) !=
+ DDI_SUCCESS)
+ return (DDI_FAILURE);
+ }
+
+ if (response_xferlen) {
+ /* free kernel buffer */
+ if (drsas_free_dma_obj(instance, response_dma_obj) !=
+ DDI_SUCCESS)
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * issue_mfi_stp
+ */
+static int
+issue_mfi_stp(struct drsas_instance *instance, struct drsas_ioctl *ioctl,
+ struct drsas_cmd *cmd, int mode)
+{
+ void *fis_ubuf;
+ void *data_ubuf;
+ uint32_t fis_xferlen = 0;
+ uint32_t data_xferlen = 0;
+ uint_t model;
+ dma_obj_t fis_dma_obj;
+ dma_obj_t data_dma_obj;
+ struct drsas_stp_frame *kstp;
+ struct drsas_stp_frame *stp;
+ ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle;
+ int i;
+
+ stp = &cmd->frame->stp;
+ kstp = (struct drsas_stp_frame *)&ioctl->frame[0];
+
+ model = ddi_model_convert_from(mode & FMODELS);
+ if (model == DDI_MODEL_ILP32) {
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_ILP32"));
+
+ fis_xferlen = kstp->sgl.sge32[0].length;
+ data_xferlen = kstp->sgl.sge32[1].length;
+
+ fis_ubuf = (void *)(ulong_t)kstp->sgl.sge32[0].phys_addr;
+ data_ubuf = (void *)(ulong_t)kstp->sgl.sge32[1].phys_addr;
+ }
+ else
+ {
+#ifdef _ILP32
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_ILP32"));
+
+ fis_xferlen = kstp->sgl.sge32[0].length;
+ data_xferlen = kstp->sgl.sge32[1].length;
+
+ fis_ubuf = (void *)(ulong_t)kstp->sgl.sge32[0].phys_addr;
+ data_ubuf = (void *)(ulong_t)kstp->sgl.sge32[1].phys_addr;
+#else
+ con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_LP64"));
+
+ fis_xferlen = kstp->sgl.sge64[0].length;
+ data_xferlen = kstp->sgl.sge64[1].length;
+
+ fis_ubuf = (void *)(ulong_t)kstp->sgl.sge64[0].phys_addr;
+ data_ubuf = (void *)(ulong_t)kstp->sgl.sge64[1].phys_addr;
+#endif
+ }
+
+
+ if (fis_xferlen) {
+ con_log(CL_ANN, (CE_NOTE, "issue_mfi_stp: "
+ "fis_ubuf = %p fis_xferlen = %x", fis_ubuf, fis_xferlen));
+
+ /* means IOCTL requires DMA */
+ /* allocate the data transfer buffer */
+ fis_dma_obj.size = fis_xferlen;
+ fis_dma_obj.dma_attr = drsas_generic_dma_attr;
+ fis_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+ fis_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU;
+ fis_dma_obj.dma_attr.dma_attr_sgllen = 1;
+ fis_dma_obj.dma_attr.dma_attr_align = 1;
+
+ /* allocate kernel buffer for DMA */
+ if (drsas_alloc_dma_obj(instance, &fis_dma_obj,
+ (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+ con_log(CL_ANN, (CE_WARN, "issue_mfi_stp : "
+ "could not allocate data transfer buffer."));
+ return (DDI_FAILURE);
+ }
+
+ /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */
+ for (i = 0; i < fis_xferlen; i++) {
+ if (ddi_copyin((uint8_t *)fis_ubuf + i,
+ (uint8_t *)fis_dma_obj.buffer + i, 1, mode)) {
+ con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: "
+ "copy from user space failed"));
+ return (DDI_FAILURE);
+ }
+ }
+ }
+
+ if (data_xferlen) {
+ con_log(CL_ANN, (CE_NOTE, "issue_mfi_stp: data_ubuf = %p "
+ "data_xferlen = %x", data_ubuf, data_xferlen));
+
+ /* means IOCTL requires DMA */
+ /* allocate the data transfer buffer */
+ data_dma_obj.size = data_xferlen;
+ data_dma_obj.dma_attr = drsas_generic_dma_attr;
+ data_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU;
+ data_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU;
+ data_dma_obj.dma_attr.dma_attr_sgllen = 1;
+ data_dma_obj.dma_attr.dma_attr_align = 1;
+
+/* allocate kernel buffer for DMA */
+ if (drsas_alloc_dma_obj(instance, &data_dma_obj,
+ (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) {
+ con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: "
+ "could not allocate data transfer buffer."));
+ return (DDI_FAILURE);
+ }
+
+ /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */
+ for (i = 0; i < data_xferlen; i++) {
+ if (ddi_copyin((uint8_t *)data_ubuf + i,
+ (uint8_t *)data_dma_obj.buffer + i, 1, mode)) {
+ con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: "
+ "copy from user space failed"));
+ return (DDI_FAILURE);
+ }
+ }
+ }
+
+ ddi_put8(acc_handle, &stp->cmd, kstp->cmd);
+ ddi_put8(acc_handle, &stp->cmd_status, 0);
+ ddi_put8(acc_handle, &stp->connection_status, 0);
+ ddi_put8(acc_handle, &stp->target_id, kstp->target_id);
+ ddi_put8(acc_handle, &stp->sge_count, kstp->sge_count);
+
+ ddi_put16(acc_handle, &stp->timeout, kstp->timeout);
+ ddi_put32(acc_handle, &stp->data_xfer_len, kstp->data_xfer_len);
+
+ ddi_rep_put8(acc_handle, (uint8_t *)kstp->fis, (uint8_t *)stp->fis, 10,
+ DDI_DEV_AUTOINCR);
+
+ ddi_put16(acc_handle, &stp->flags, kstp->flags & ~MFI_FRAME_SGL64);
+ ddi_put32(acc_handle, &stp->stp_flags, kstp->stp_flags);
+ ddi_put32(acc_handle, &stp->sgl.sge32[0].length, fis_xferlen);
+ ddi_put32(acc_handle, &stp->sgl.sge32[0].phys_addr,
+ fis_dma_obj.dma_cookie[0].dmac_address);
+ ddi_put32(acc_handle, &stp->sgl.sge32[1].length, data_xferlen);
+ ddi_put32(acc_handle, &stp->sgl.sge32[1].phys_addr,
+ data_dma_obj.dma_cookie[0].dmac_address);
+
+ cmd->sync_cmd = DRSAS_TRUE;
+ cmd->frame_count = 1;
+
+ if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) {
+ con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: fw_ioctl failed"));
+ } else {
+
+ if (fis_xferlen) {
+ for (i = 0; i < fis_xferlen; i++) {
+ if (ddi_copyout(
+ (uint8_t *)fis_dma_obj.buffer + i,
+ (uint8_t *)fis_ubuf + i, 1, mode)) {
+ con_log(CL_ANN, (CE_WARN,
+ "issue_mfi_stp : copy to "
+ "user space failed"));
+ return (DDI_FAILURE);
+ }
+ }
+ }
+ }
+ if (data_xferlen) {
+ for (i = 0; i < data_xferlen; i++) {
+ if (ddi_copyout(
+ (uint8_t *)data_dma_obj.buffer + i,
+ (uint8_t *)data_ubuf + i, 1, mode)) {
+ con_log(CL_ANN, (CE_WARN,
+ "issue_mfi_stp : copy to"
+ " user space failed"));
+ return (DDI_FAILURE);
+ }
+ }
+ }
+
+ kstp->cmd_status = ddi_get8(acc_handle, &stp->cmd_status);
+
+ if (fis_xferlen) {
+ /* free kernel buffer */
+ if (drsas_free_dma_obj(instance, fis_dma_obj) != DDI_SUCCESS)
+ return (DDI_FAILURE);
+ }
+
+ if (data_xferlen) {
+ /* free kernel buffer */
+ if (drsas_free_dma_obj(instance, data_dma_obj) != DDI_SUCCESS)
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * fill_up_drv_ver
+ */
+static void
+fill_up_drv_ver(struct drsas_drv_ver *dv)
+{
+ (void) memset(dv, 0, sizeof (struct drsas_drv_ver));
+
+ (void) memcpy(dv->signature, "$LSI LOGIC$", strlen("$LSI LOGIC$"));
+ (void) memcpy(dv->os_name, "Solaris", strlen("Solaris"));
+ (void) memcpy(dv->drv_name, "dr_sas", strlen("dr_sas"));
+ (void) memcpy(dv->drv_ver, DRSAS_VERSION, strlen(DRSAS_VERSION));
+ (void) memcpy(dv->drv_rel_date, DRSAS_RELDATE,
+ strlen(DRSAS_RELDATE));
+}
+
+/*
+ * handle_drv_ioctl
+ */
+static int
+handle_drv_ioctl(struct drsas_instance *instance, struct drsas_ioctl *ioctl,
+ int mode)
+{
+ int i;
+ int rval = DDI_SUCCESS;
+ int *props = NULL;
+ void *ubuf;
+
+ uint8_t *pci_conf_buf;
+ uint32_t xferlen;
+ uint32_t num_props;
+ uint_t model;
+ struct drsas_dcmd_frame *kdcmd;
+ struct drsas_drv_ver dv;
+ struct drsas_pci_information pi;
+
+ kdcmd = (struct drsas_dcmd_frame *)&ioctl->frame[0];
+
+ model = ddi_model_convert_from(mode & FMODELS);
+ if (model == DDI_MODEL_ILP32) {
+ con_log(CL_ANN1, (CE_NOTE,
+ "handle_drv_ioctl: DDI_MODEL_ILP32"));
+
+ xferlen = kdcmd->sgl.sge32[0].length;
+
+ ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr;
+ } else {
+#ifdef _ILP32
+ con_log(CL_ANN1, (CE_NOTE,
+ "handle_drv_ioctl: DDI_MODEL_ILP32"));
+ xferlen = kdcmd->sgl.sge32[0].length;
+ ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr;
+#else
+ con_log(CL_ANN1, (CE_NOTE,
+ "handle_drv_ioctl: DDI_MODEL_LP64"));
+ xferlen = kdcmd->sgl.sge64[0].length;
+ ubuf = (void *)(ulong_t)kdcmd->sgl.sge64[0].phys_addr;
+#endif
+ }
+ con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: "
+ "dataBuf=%p size=%d bytes", ubuf, xferlen));
+
+ switch (kdcmd->opcode) {
+ case DRSAS_DRIVER_IOCTL_DRIVER_VERSION:
+ con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: "
+ "DRSAS_DRIVER_IOCTL_DRIVER_VERSION"));
+
+ fill_up_drv_ver(&dv);
+ for (i = 0; i < xferlen; i++) {
+ if (ddi_copyout((uint8_t *)&dv + i, (uint8_t *)ubuf + i,
+ 1, mode)) {
+ con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: "
+ "DRSAS_DRIVER_IOCTL_DRIVER_VERSION"
+ " : copy to user space failed"));
+ kdcmd->cmd_status = 1;
+ rval = DDI_FAILURE;
+ break;
+ }
+ }
+ if (i == xferlen)
+ kdcmd->cmd_status = 0;
+ break;
+ case DRSAS_DRIVER_IOCTL_PCI_INFORMATION:
+ con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: "
+ "DRSAS_DRIVER_IOCTL_PCI_INFORMAITON"));
+
+ if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, instance->dip,
+ 0, "reg", &props, &num_props)) {
+ con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: "
+ "DRSAS_DRIVER_IOCTL_PCI_INFORMATION : "
+ "ddi_prop_look_int_array failed"));
+ rval = DDI_FAILURE;
+ } else {
+
+ pi.busNumber = (props[0] >> 16) & 0xFF;
+ pi.deviceNumber = (props[0] >> 11) & 0x1f;
+ pi.functionNumber = (props[0] >> 8) & 0x7;
+ ddi_prop_free((void *)props);
+ }
+
+ pci_conf_buf = (uint8_t *)&pi.pciHeaderInfo;
+
+ for (i = 0; i < (sizeof (struct drsas_pci_information) -
+ offsetof(struct drsas_pci_information, pciHeaderInfo));
+ i++) {
+ pci_conf_buf[i] =
+ pci_config_get8(instance->pci_handle, i);
+ }
+ for (i = 0; i < xferlen; i++) {
+ if (ddi_copyout((uint8_t *)&pi + i, (uint8_t *)ubuf + i,
+ 1, mode)) {
+ con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: "
+ "DRSAS_DRIVER_IOCTL_PCI_INFORMATION"
+ " : copy to user space failed"));
+ kdcmd->cmd_status = 1;
+ rval = DDI_FAILURE;
+ break;
+ }
+ }
+
+ if (i == xferlen)
+ kdcmd->cmd_status = 0;
+
+ break;
+ default:
+ con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: "
+ "invalid driver specific IOCTL opcode = 0x%x",
+ kdcmd->opcode));
+ kdcmd->cmd_status = 1;
+ rval = DDI_FAILURE;
+ break;
+ }
+
+ return (rval);
+}
+
+/*
+ * handle_mfi_ioctl
+ */
+static int
+handle_mfi_ioctl(struct drsas_instance *instance, struct drsas_ioctl *ioctl,
+ int mode)
+{
+ int rval = DDI_SUCCESS;
+
+ struct drsas_header *hdr;
+ struct drsas_cmd *cmd;
+
+ cmd = get_mfi_pkt(instance);
+
+ if (!cmd) {
+ con_log(CL_ANN, (CE_WARN, "dr_sas: "
+ "failed to get a cmd packet"));
+ return (DDI_FAILURE);
+ }
+
+ /* Clear the frame buffer and assign back the context id */
+ (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame));
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context,
+ cmd->index);
+
+ hdr = (struct drsas_header *)&ioctl->frame[0];
+
+ switch (hdr->cmd) {
+ case MFI_CMD_OP_DCMD:
+ rval = issue_mfi_dcmd(instance, ioctl, cmd, mode);
+ break;
+ case MFI_CMD_OP_SMP:
+ rval = issue_mfi_smp(instance, ioctl, cmd, mode);
+ break;
+ case MFI_CMD_OP_STP:
+ rval = issue_mfi_stp(instance, ioctl, cmd, mode);
+ break;
+ case MFI_CMD_OP_LD_SCSI:
+ case MFI_CMD_OP_PD_SCSI:
+ rval = issue_mfi_pthru(instance, ioctl, cmd, mode);
+ break;
+ default:
+ con_log(CL_ANN, (CE_WARN, "handle_mfi_ioctl: "
+ "invalid mfi ioctl hdr->cmd = %d", hdr->cmd));
+ rval = DDI_FAILURE;
+ break;
+ }
+
+
+ return_mfi_pkt(instance, cmd);
+ if (drsas_common_check(instance, cmd) != DDI_SUCCESS)
+ rval = DDI_FAILURE;
+ return (rval);
+}
+
+/*
+ * AEN
+ */
+static int
+handle_mfi_aen(struct drsas_instance *instance, struct drsas_aen *aen)
+{
+ int rval = 0;
+
+ rval = register_mfi_aen(instance, instance->aen_seq_num,
+ aen->class_locale_word);
+
+ aen->cmd_status = (uint8_t)rval;
+
+ return (rval);
+}
+
+static int
+register_mfi_aen(struct drsas_instance *instance, uint32_t seq_num,
+ uint32_t class_locale_word)
+{
+ int ret_val;
+
+ struct drsas_cmd *cmd, *aen_cmd;
+ struct drsas_dcmd_frame *dcmd;
+ union drsas_evt_class_locale curr_aen;
+ union drsas_evt_class_locale prev_aen;
+
+ /*
+ * If there an AEN pending already (aen_cmd), check if the
+ * class_locale of that pending AEN is inclusive of the new
+ * AEN request we currently have. If it is, then we don't have
+ * to do anything. In other words, whichever events the current
+ * AEN request is subscribing to, have already been subscribed
+ * to.
+ *
+ * If the old_cmd is _not_ inclusive, then we have to abort
+ * that command, form a class_locale that is superset of both
+ * old and current and re-issue to the FW
+ */
+
+ curr_aen.word = class_locale_word;
+ aen_cmd = instance->aen_cmd;
+ if (aen_cmd) {
+ prev_aen.word = ddi_get32(aen_cmd->frame_dma_obj.acc_handle,
+ &aen_cmd->frame->dcmd.mbox.w[1]);
+
+ /*
+ * A class whose enum value is smaller is inclusive of all
+ * higher values. If a PROGRESS (= -1) was previously
+ * registered, then a new registration requests for higher
+ * classes need not be sent to FW. They are automatically
+ * included.
+ *
+ * Locale numbers don't have such hierarchy. They are bitmap
+ * values
+ */
+ if ((prev_aen.members.class <= curr_aen.members.class) &&
+ !((prev_aen.members.locale & curr_aen.members.locale) ^
+ curr_aen.members.locale)) {
+ /*
+ * Previously issued event registration includes
+ * current request. Nothing to do.
+ */
+
+ return (0);
+ } else {
+ curr_aen.members.locale |= prev_aen.members.locale;
+
+ if (prev_aen.members.class < curr_aen.members.class)
+ curr_aen.members.class = prev_aen.members.class;
+
+ ret_val = abort_aen_cmd(instance, aen_cmd);
+
+ if (ret_val) {
+ con_log(CL_ANN, (CE_WARN, "register_mfi_aen: "
+ "failed to abort prevous AEN command"));
+
+ return (ret_val);
+ }
+ }
+ } else {
+ curr_aen.word = class_locale_word;
+ }
+
+ cmd = get_mfi_pkt(instance);
+
+ if (!cmd)
+ return (ENOMEM);
+ /* Clear the frame buffer and assign back the context id */
+ (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame));
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context,
+ cmd->index);
+
+ dcmd = &cmd->frame->dcmd;
+
+ /* for(i = 0; i < DCMD_MBOX_SZ; i++) dcmd->mbox.b[i] = 0; */
+ (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ);
+
+ (void) memset(instance->mfi_evt_detail_obj.buffer, 0,
+ sizeof (struct drsas_evt_detail));
+
+ /* Prepare DCMD for aen registration */
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD);
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0x0);
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1);
+ ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags,
+ MFI_FRAME_DIR_READ);
+ ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0);
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len,
+ sizeof (struct drsas_evt_detail));
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode,
+ DR_DCMD_CTRL_EVENT_WAIT);
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.w[0], seq_num);
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.w[1],
+ curr_aen.word);
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr,
+ instance->mfi_evt_detail_obj.dma_cookie[0].dmac_address);
+ ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length,
+ sizeof (struct drsas_evt_detail));
+
+ instance->aen_seq_num = seq_num;
+
+
+ /*
+ * Store reference to the cmd used to register for AEN. When an
+ * application wants us to register for AEN, we have to abort this
+ * cmd and re-register with a new EVENT LOCALE supplied by that app
+ */
+ instance->aen_cmd = cmd;
+
+ cmd->frame_count = 1;
+
+ /* Issue the aen registration frame */
+ /* atomic_add_16 (&instance->fw_outstanding, 1); */
+ instance->func_ptr->issue_cmd(cmd, instance);
+
+ return (0);
+}
+
+static void
+display_scsi_inquiry(caddr_t scsi_inq)
+{
+#define MAX_SCSI_DEVICE_CODE 14
+ int i;
+ char inquiry_buf[256] = {0};
+ int bufsize = sizeof (inquiry_buf);
+ int len;
+ const char *const scsi_device_types[] = {
+ "Direct-Access ",
+ "Sequential-Access",
+ "Printer ",
+ "Processor ",
+ "WORM ",
+ "CD-ROM ",
+ "Scanner ",
+ "Optical Device ",
+ "Medium Changer ",
+ "Communications ",
+ "Unknown ",
+ "Unknown ",
+ "Unknown ",
+ "Enclosure ",
+ };
+
+ len = 0;
+
+ len += snprintf(inquiry_buf + len, bufsize - len, " Vendor: ");
+ for (i = 8; i < 16; i++) {
+ len += snprintf(inquiry_buf + len, bufsize - len, "%c",
+ scsi_inq[i]);
+ }
+
+ len += snprintf(inquiry_buf + len, bufsize - len, " Model: ");
+
+ for (i = 16; i < 32; i++) {
+ len += snprintf(inquiry_buf + len, bufsize - len, "%c",
+ scsi_inq[i]);
+ }
+
+ len += snprintf(inquiry_buf + len, bufsize - len, " Rev: ");
+
+ for (i = 32; i < 36; i++) {
+ len += snprintf(inquiry_buf + len, bufsize - len, "%c",
+ scsi_inq[i]);
+ }
+
+ len += snprintf(inquiry_buf + len, bufsize - len, "\n");
+
+
+ i = scsi_inq[0] & 0x1f;
+
+
+ len += snprintf(inquiry_buf + len, bufsize - len, " Type: %s ",
+ i < MAX_SCSI_DEVICE_CODE ? scsi_device_types[i] :
+ "Unknown ");
+
+
+ len += snprintf(inquiry_buf + len, bufsize - len,
+ " ANSI SCSI revision: %02x", scsi_inq[2] & 0x07);
+
+ if ((scsi_inq[2] & 0x07) == 1 && (scsi_inq[3] & 0x0f) == 1) {
+ len += snprintf(inquiry_buf + len, bufsize - len, " CCS\n");
+ } else {
+ len += snprintf(inquiry_buf + len, bufsize - len, "\n");
+ }
+
+ con_log(CL_ANN1, (CE_CONT, inquiry_buf));
+}
+
+static int
+read_fw_status_reg_ppc(struct drsas_instance *instance)
+{
+ return ((int)RD_OB_SCRATCH_PAD_0(instance));
+}
+
+static void
+issue_cmd_ppc(struct drsas_cmd *cmd, struct drsas_instance *instance)
+{
+ atomic_add_16(&instance->fw_outstanding, 1);
+
+ /* Issue the command to the FW */
+ WR_IB_QPORT((cmd->frame_phys_addr) |
+ (((cmd->frame_count - 1) << 1) | 1), instance);
+}
+
+/*
+ * issue_cmd_in_sync_mode
+ */
+static int
+issue_cmd_in_sync_mode_ppc(struct drsas_instance *instance,
+ struct drsas_cmd *cmd)
+{
+ int i;
+ uint32_t msecs = MFI_POLL_TIMEOUT_SECS * (10 * MILLISEC);
+
+ con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_sync_mode_ppc: called"));
+
+ cmd->cmd_status = ENODATA;
+
+ WR_IB_QPORT((cmd->frame_phys_addr) |
+ (((cmd->frame_count - 1) << 1) | 1), instance);
+
+ mutex_enter(&instance->int_cmd_mtx);
+
+ for (i = 0; i < msecs && (cmd->cmd_status == ENODATA); i++) {
+ cv_wait(&instance->int_cmd_cv, &instance->int_cmd_mtx);
+ }
+
+ mutex_exit(&instance->int_cmd_mtx);
+
+ con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_sync_mode_ppc: done"));
+
+ if (i < (msecs -1)) {
+ return (DDI_SUCCESS);
+ } else {
+ return (DDI_FAILURE);
+ }
+}
+
+/*
+ * issue_cmd_in_poll_mode
+ */
+static int
+issue_cmd_in_poll_mode_ppc(struct drsas_instance *instance,
+ struct drsas_cmd *cmd)
+{
+ int i;
+ uint16_t flags;
+ uint32_t msecs = MFI_POLL_TIMEOUT_SECS * MILLISEC;
+ struct drsas_header *frame_hdr;
+
+ con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_poll_mode_ppc: called"));
+
+ frame_hdr = (struct drsas_header *)cmd->frame;
+ ddi_put8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status,
+ MFI_CMD_STATUS_POLL_MODE);
+ flags = ddi_get16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags);
+ flags |= MFI_FRAME_DONT_POST_IN_REPLY_QUEUE;
+
+ ddi_put16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags, flags);
+
+ /* issue the frame using inbound queue port */
+ WR_IB_QPORT((cmd->frame_phys_addr) |
+ (((cmd->frame_count - 1) << 1) | 1), instance);
+
+ /* wait for cmd_status to change from 0xFF */
+ for (i = 0; i < msecs && (
+ ddi_get8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status)
+ == MFI_CMD_STATUS_POLL_MODE); i++) {
+ drv_usecwait(MILLISEC); /* wait for 1000 usecs */
+ }
+
+ if (ddi_get8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status)
+ == MFI_CMD_STATUS_POLL_MODE) {
+ con_log(CL_ANN, (CE_NOTE, "issue_cmd_in_poll_mode: "
+ "cmd polling timed out"));
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+static void
+enable_intr_ppc(struct drsas_instance *instance)
+{
+ uint32_t mask;
+
+ con_log(CL_ANN1, (CE_NOTE, "enable_intr_ppc: called"));
+
+ /* WR_OB_DOORBELL_CLEAR(0xFFFFFFFF, instance); */
+ WR_OB_DOORBELL_CLEAR(OB_DOORBELL_CLEAR_MASK, instance);
+
+ /* WR_OB_INTR_MASK(~0x80000000, instance); */
+ WR_OB_INTR_MASK(~(MFI_REPLY_2108_MESSAGE_INTR_MASK), instance);
+
+ /* dummy read to force PCI flush */
+ mask = RD_OB_INTR_MASK(instance);
+
+ con_log(CL_ANN1, (CE_NOTE, "enable_intr_ppc: "
+ "outbound_intr_mask = 0x%x", mask));
+}
+
+static void
+disable_intr_ppc(struct drsas_instance *instance)
+{
+ uint32_t mask __unused;
+
+ con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: called"));
+
+ con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: before : "
+ "outbound_intr_mask = 0x%x", RD_OB_INTR_MASK(instance)));
+
+ /* WR_OB_INTR_MASK(0xFFFFFFFF, instance); */
+ WR_OB_INTR_MASK(OB_INTR_MASK, instance);
+
+ con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: after : "
+ "outbound_intr_mask = 0x%x", RD_OB_INTR_MASK(instance)));
+
+ /* dummy read to force PCI flush */
+ mask = RD_OB_INTR_MASK(instance);
+#ifdef lint
+ mask = mask;
+#endif
+}
+
+static int
+intr_ack_ppc(struct drsas_instance *instance)
+{
+ uint32_t status;
+
+ con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: called"));
+
+ /* check if it is our interrupt */
+ status = RD_OB_INTR_STATUS(instance);
+
+ con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: status = 0x%x", status));
+
+ if (!(status & MFI_REPLY_2108_MESSAGE_INTR)) {
+ return (DDI_INTR_UNCLAIMED);
+ }
+
+ /* clear the interrupt by writing back the same value */
+ WR_OB_DOORBELL_CLEAR(status, instance);
+
+ /* dummy READ */
+ status = RD_OB_INTR_STATUS(instance);
+
+ con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: interrupt cleared"));
+
+ return (DDI_INTR_CLAIMED);
+}
+
+static int
+drsas_common_check(struct drsas_instance *instance,
+ struct drsas_cmd *cmd)
+{
+ int ret = DDI_SUCCESS;
+
+ if (drsas_check_dma_handle(cmd->frame_dma_obj.dma_handle) !=
+ DDI_SUCCESS) {
+ ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED);
+ if (cmd->pkt != NULL) {
+ cmd->pkt->pkt_reason = CMD_TRAN_ERR;
+ cmd->pkt->pkt_statistics = 0;
+ }
+ ret = DDI_FAILURE;
+ }
+ if (drsas_check_dma_handle(instance->mfi_internal_dma_obj.dma_handle)
+ != DDI_SUCCESS) {
+ ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED);
+ if (cmd->pkt != NULL) {
+ cmd->pkt->pkt_reason = CMD_TRAN_ERR;
+ cmd->pkt->pkt_statistics = 0;
+ }
+ ret = DDI_FAILURE;
+ }
+ if (drsas_check_dma_handle(instance->mfi_evt_detail_obj.dma_handle) !=
+ DDI_SUCCESS) {
+ ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED);
+ if (cmd->pkt != NULL) {
+ cmd->pkt->pkt_reason = CMD_TRAN_ERR;
+ cmd->pkt->pkt_statistics = 0;
+ }
+ ret = DDI_FAILURE;
+ }
+ if (drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) {
+ ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED);
+
+ ddi_fm_acc_err_clear(instance->regmap_handle, DDI_FME_VER0);
+
+ if (cmd->pkt != NULL) {
+ cmd->pkt->pkt_reason = CMD_TRAN_ERR;
+ cmd->pkt->pkt_statistics = 0;
+ }
+ ret = DDI_FAILURE;
+ }
+
+ return (ret);
+}
+
+/*ARGSUSED*/
+static int
+drsas_fm_error_cb(dev_info_t *dip, ddi_fm_error_t *err, const void *impl_data)
+{
+ /*
+ * as the driver can always deal with an error in any dma or
+ * access handle, we can just return the fme_status value.
+ */
+ pci_ereport_post(dip, err, NULL);
+ return (err->fme_status);
+}
+
+static void
+drsas_fm_init(struct drsas_instance *instance)
+{
+ /* Need to change iblock to priority for new MSI intr */
+ ddi_iblock_cookie_t fm_ibc;
+
+ /* Only register with IO Fault Services if we have some capability */
+ if (instance->fm_capabilities) {
+ /* Adjust access and dma attributes for FMA */
+ endian_attr.devacc_attr_access = DDI_FLAGERR_ACC;
+ drsas_generic_dma_attr.dma_attr_flags = DDI_DMA_FLAGERR;
+
+ /*
+ * Register capabilities with IO Fault Services.
+ * fm_capabilities will be updated to indicate
+ * capabilities actually supported (not requested.)
+ */
+
+ ddi_fm_init(instance->dip, &instance->fm_capabilities, &fm_ibc);
+
+ /*
+ * Initialize pci ereport capabilities if ereport
+ * capable (should always be.)
+ */
+
+ if (DDI_FM_EREPORT_CAP(instance->fm_capabilities) ||
+ DDI_FM_ERRCB_CAP(instance->fm_capabilities)) {
+ pci_ereport_setup(instance->dip);
+ }
+
+ /*
+ * Register error callback if error callback capable.
+ */
+ if (DDI_FM_ERRCB_CAP(instance->fm_capabilities)) {
+ ddi_fm_handler_register(instance->dip,
+ drsas_fm_error_cb, (void*) instance);
+ }
+ } else {
+ endian_attr.devacc_attr_access = DDI_DEFAULT_ACC;
+ drsas_generic_dma_attr.dma_attr_flags = 0;
+ }
+}
+
+static void
+drsas_fm_fini(struct drsas_instance *instance)
+{
+ /* Only unregister FMA capabilities if registered */
+ if (instance->fm_capabilities) {
+ /*
+ * Un-register error callback if error callback capable.
+ */
+ if (DDI_FM_ERRCB_CAP(instance->fm_capabilities)) {
+ ddi_fm_handler_unregister(instance->dip);
+ }
+
+ /*
+ * Release any resources allocated by pci_ereport_setup()
+ */
+ if (DDI_FM_EREPORT_CAP(instance->fm_capabilities) ||
+ DDI_FM_ERRCB_CAP(instance->fm_capabilities)) {
+ pci_ereport_teardown(instance->dip);
+ }
+
+ /* Unregister from IO Fault Services */
+ ddi_fm_fini(instance->dip);
+
+ /* Adjust access and dma attributes for FMA */
+ endian_attr.devacc_attr_access = DDI_DEFAULT_ACC;
+ drsas_generic_dma_attr.dma_attr_flags = 0;
+ }
+}
+
+int
+drsas_check_acc_handle(ddi_acc_handle_t handle)
+{
+ ddi_fm_error_t de;
+
+ if (handle == NULL) {
+ return (DDI_FAILURE);
+ }
+
+ ddi_fm_acc_err_get(handle, &de, DDI_FME_VERSION);
+
+ return (de.fme_status);
+}
+
+int
+drsas_check_dma_handle(ddi_dma_handle_t handle)
+{
+ ddi_fm_error_t de;
+
+ if (handle == NULL) {
+ return (DDI_FAILURE);
+ }
+
+ ddi_fm_dma_err_get(handle, &de, DDI_FME_VERSION);
+
+ return (de.fme_status);
+}
+
+void
+drsas_fm_ereport(struct drsas_instance *instance, char *detail)
+{
+ uint64_t ena;
+ char buf[FM_MAX_CLASS];
+
+ (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail);
+ ena = fm_ena_generate(0, FM_ENA_FMT1);
+ if (DDI_FM_EREPORT_CAP(instance->fm_capabilities)) {
+ ddi_fm_ereport_post(instance->dip, buf, ena, DDI_NOSLEEP,
+ FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERSION, NULL);
+ }
+}
+
+static int
+drsas_add_intrs(struct drsas_instance *instance, int intr_type)
+{
+
+ dev_info_t *dip = instance->dip;
+ int avail, actual, count;
+ int i, flag, ret;
+
+ con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: intr_type = %x",
+ intr_type));
+
+ /* Get number of interrupts */
+ ret = ddi_intr_get_nintrs(dip, intr_type, &count);
+ if ((ret != DDI_SUCCESS) || (count == 0)) {
+ con_log(CL_ANN, (CE_WARN, "ddi_intr_get_nintrs() failed:"
+ "ret %d count %d", ret, count));
+
+ return (DDI_FAILURE);
+ }
+
+ con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: count = %d ", count));
+
+ /* Get number of available interrupts */
+ ret = ddi_intr_get_navail(dip, intr_type, &avail);
+ if ((ret != DDI_SUCCESS) || (avail == 0)) {
+ con_log(CL_ANN, (CE_WARN, "ddi_intr_get_navail() failed:"
+ "ret %d avail %d", ret, avail));
+
+ return (DDI_FAILURE);
+ }
+ con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: avail = %d ", avail));
+
+ /* Only one interrupt routine. So limit the count to 1 */
+ if (count > 1) {
+ count = 1;
+ }
+
+ /*
+ * Allocate an array of interrupt handlers. Currently we support
+ * only one interrupt. The framework can be extended later.
+ */
+ instance->intr_size = count * sizeof (ddi_intr_handle_t);
+ instance->intr_htable = kmem_zalloc(instance->intr_size, KM_SLEEP);
+ ASSERT(instance->intr_htable);
+
+ flag = ((intr_type == DDI_INTR_TYPE_MSI) || (intr_type ==
+ DDI_INTR_TYPE_MSIX)) ? DDI_INTR_ALLOC_STRICT:DDI_INTR_ALLOC_NORMAL;
+
+ /* Allocate interrupt */
+ ret = ddi_intr_alloc(dip, instance->intr_htable, intr_type, 0,
+ count, &actual, flag);
+
+ if ((ret != DDI_SUCCESS) || (actual == 0)) {
+ con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: "
+ "avail = %d", avail));
+ kmem_free(instance->intr_htable, instance->intr_size);
+ return (DDI_FAILURE);
+ }
+ if (actual < count) {
+ con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: "
+ "Requested = %d Received = %d", count, actual));
+ }
+ instance->intr_cnt = actual;
+
+ /*
+ * Get the priority of the interrupt allocated.
+ */
+ if ((ret = ddi_intr_get_pri(instance->intr_htable[0],
+ &instance->intr_pri)) != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: "
+ "get priority call failed"));
+
+ for (i = 0; i < actual; i++) {
+ (void) ddi_intr_free(instance->intr_htable[i]);
+ }
+ kmem_free(instance->intr_htable, instance->intr_size);
+ return (DDI_FAILURE);
+ }
+
+ /*
+ * Test for high level mutex. we don't support them.
+ */
+ if (instance->intr_pri >= ddi_intr_get_hilevel_pri()) {
+ con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: "
+ "High level interrupts not supported."));
+
+ for (i = 0; i < actual; i++) {
+ (void) ddi_intr_free(instance->intr_htable[i]);
+ }
+ kmem_free(instance->intr_htable, instance->intr_size);
+ return (DDI_FAILURE);
+ }
+
+ con_log(CL_DLEVEL1, (CE_NOTE, "drsas_add_intrs: intr_pri = 0x%x ",
+ instance->intr_pri));
+
+ /* Call ddi_intr_add_handler() */
+ for (i = 0; i < actual; i++) {
+ ret = ddi_intr_add_handler(instance->intr_htable[i],
+ (ddi_intr_handler_t *)drsas_isr, (caddr_t)instance,
+ (caddr_t)(uintptr_t)i);
+
+ if (ret != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN, "drsas_add_intrs:"
+ "failed %d", ret));
+
+ for (i = 0; i < actual; i++) {
+ (void) ddi_intr_free(instance->intr_htable[i]);
+ }
+ kmem_free(instance->intr_htable, instance->intr_size);
+ return (DDI_FAILURE);
+ }
+
+ }
+
+ con_log(CL_DLEVEL1, (CE_WARN, " ddi_intr_add_handler done"));
+
+ if ((ret = ddi_intr_get_cap(instance->intr_htable[0],
+ &instance->intr_cap)) != DDI_SUCCESS) {
+ con_log(CL_ANN, (CE_WARN, "ddi_intr_get_cap() failed %d",
+ ret));
+
+ /* Free already allocated intr */
+ for (i = 0; i < actual; i++) {
+ (void) ddi_intr_remove_handler(
+ instance->intr_htable[i]);
+ (void) ddi_intr_free(instance->intr_htable[i]);
+ }
+ kmem_free(instance->intr_htable, instance->intr_size);
+ return (DDI_FAILURE);
+ }
+
+ if (instance->intr_cap & DDI_INTR_FLAG_BLOCK) {
+ con_log(CL_ANN, (CE_WARN, "Calling ddi_intr_block _enable"));
+
+ (void) ddi_intr_block_enable(instance->intr_htable,
+ instance->intr_cnt);
+ } else {
+ con_log(CL_ANN, (CE_NOTE, " calling ddi_intr_enable"));
+
+ for (i = 0; i < instance->intr_cnt; i++) {
+ (void) ddi_intr_enable(instance->intr_htable[i]);
+ con_log(CL_ANN, (CE_NOTE, "ddi intr enable returns "
+ "%d", i));
+ }
+ }
+
+ return (DDI_SUCCESS);
+
+}
+
+
+static void
+drsas_rem_intrs(struct drsas_instance *instance)
+{
+ int i;
+
+ con_log(CL_ANN, (CE_NOTE, "drsas_rem_intrs called"));
+
+ /* Disable all interrupts first */
+ if (instance->intr_cap & DDI_INTR_FLAG_BLOCK) {
+ (void) ddi_intr_block_disable(instance->intr_htable,
+ instance->intr_cnt);
+ } else {
+ for (i = 0; i < instance->intr_cnt; i++) {
+ (void) ddi_intr_disable(instance->intr_htable[i]);
+ }
+ }
+
+ /* Remove all the handlers */
+
+ for (i = 0; i < instance->intr_cnt; i++) {
+ (void) ddi_intr_remove_handler(instance->intr_htable[i]);
+ (void) ddi_intr_free(instance->intr_htable[i]);
+ }
+
+ kmem_free(instance->intr_htable, instance->intr_size);
+}
+
+static int
+drsas_tran_bus_config(dev_info_t *parent, uint_t flags,
+ ddi_bus_config_op_t op, void *arg, dev_info_t **childp)
+{
+ struct drsas_instance *instance;
+ int config;
+ int rval;
+
+ char *ptr = NULL;
+ int tgt, lun;
+
+ con_log(CL_ANN1, (CE_NOTE, "Bus config called for op = %x", op));
+
+ if ((instance = ddi_get_soft_state(drsas_state,
+ ddi_get_instance(parent))) == NULL) {
+ return (NDI_FAILURE);
+ }
+
+ /* Hold nexus during bus_config */
+ ndi_devi_enter(parent, &config);
+ switch (op) {
+ case BUS_CONFIG_ONE: {
+
+ /* parse wwid/target name out of name given */
+ if ((ptr = strchr((char *)arg, '@')) == NULL) {
+ rval = NDI_FAILURE;
+ break;
+ }
+ ptr++;
+
+ if (drsas_parse_devname(arg, &tgt, &lun) != 0) {
+ rval = NDI_FAILURE;
+ break;
+ }
+
+ if (lun == 0) {
+ rval = drsas_config_ld(instance, tgt, lun, childp);
+ } else {
+ rval = NDI_FAILURE;
+ }
+
+ break;
+ }
+ case BUS_CONFIG_DRIVER:
+ case BUS_CONFIG_ALL: {
+
+ rval = drsas_config_all_devices(instance);
+
+ rval = NDI_SUCCESS;
+ break;
+ }
+ }
+
+ if (rval == NDI_SUCCESS) {
+ rval = ndi_busop_bus_config(parent, flags, op, arg, childp, 0);
+
+ }
+ ndi_devi_exit(parent, config);
+
+ con_log(CL_ANN1, (CE_NOTE, "drsas_tran_bus_config: rval = %x",
+ rval));
+ return (rval);
+}
+
+static int
+drsas_config_all_devices(struct drsas_instance *instance)
+{
+ int rval, tgt;
+
+ for (tgt = 0; tgt < MRDRV_MAX_LD; tgt++) {
+ (void) drsas_config_ld(instance, tgt, 0, NULL);
+
+ }
+
+ rval = NDI_SUCCESS;
+ return (rval);
+}
+
+static int
+drsas_parse_devname(char *devnm, int *tgt, int *lun)
+{
+ char devbuf[SCSI_MAXNAMELEN];
+ char *addr;
+ char *p, *tp, *lp;
+ long num;
+
+ /* Parse dev name and address */
+ (void) strcpy(devbuf, devnm);
+ addr = "";
+ for (p = devbuf; *p != '\0'; p++) {
+ if (*p == '@') {
+ addr = p + 1;
+ *p = '\0';
+ } else if (*p == ':') {
+ *p = '\0';
+ break;
+ }
+ }
+
+ /* Parse target and lun */
+ for (p = tp = addr, lp = NULL; *p != '\0'; p++) {
+ if (*p == ',') {
+ lp = p + 1;
+ *p = '\0';
+ break;
+ }
+ }
+ if (tgt && tp) {
+ if (ddi_strtol(tp, NULL, 0x10, &num)) {
+ return (DDI_FAILURE); /* Can declare this as constant */
+ }
+ *tgt = (int)num;
+ }
+ if (lun && lp) {
+ if (ddi_strtol(lp, NULL, 0x10, &num)) {
+ return (DDI_FAILURE);
+ }
+ *lun = (int)num;
+ }
+ return (DDI_SUCCESS); /* Success case */
+}
+
+static int
+drsas_config_ld(struct drsas_instance *instance, uint16_t tgt,
+ uint8_t lun, dev_info_t **ldip)
+{
+ struct scsi_device *sd;
+ dev_info_t *child;
+ int rval;
+
+ con_log(CL_ANN1, (CE_NOTE, "drsas_config_ld: t = %d l = %d",
+ tgt, lun));
+
+ if ((child = drsas_find_child(instance, tgt, lun)) != NULL) {
+ if (ldip) {
+ *ldip = child;
+ }
+ con_log(CL_ANN1, (CE_NOTE,
+ "drsas_config_ld: Child = %p found t = %d l = %d",
+ (void *)child, tgt, lun));
+ return (NDI_SUCCESS);
+ }
+
+ sd = kmem_zalloc(sizeof (struct scsi_device), KM_SLEEP);
+ sd->sd_address.a_hba_tran = instance->tran;
+ sd->sd_address.a_target = (uint16_t)tgt;
+ sd->sd_address.a_lun = (uint8_t)lun;
+
+ if (scsi_hba_probe(sd, NULL) == SCSIPROBE_EXISTS)
+ rval = drsas_config_scsi_device(instance, sd, ldip);
+ else
+ rval = NDI_FAILURE;
+
+ /* sd_unprobe is blank now. Free buffer manually */
+ if (sd->sd_inq) {
+ kmem_free(sd->sd_inq, SUN_INQSIZE);
+ sd->sd_inq = (struct scsi_inquiry *)NULL;
+ }
+
+ kmem_free(sd, sizeof (struct scsi_device));
+ con_log(CL_ANN1, (CE_NOTE, "drsas_config_ld: return rval = %d",
+ rval));
+ return (rval);
+}
+
+static int
+drsas_config_scsi_device(struct drsas_instance *instance,
+ struct scsi_device *sd, dev_info_t **dipp)
+{
+ char *nodename = NULL;
+ char **compatible = NULL;
+ int ncompatible = 0;
+ char *childname;
+ dev_info_t *ldip = NULL;
+ int tgt = sd->sd_address.a_target;
+ int lun = sd->sd_address.a_lun;
+ int dtype = sd->sd_inq->inq_dtype & DTYPE_MASK;
+ int rval;
+
+ con_log(CL_ANN1, (CE_WARN, "dr_sas: scsi_device t%dL%d", tgt, lun));
+ scsi_hba_nodename_compatible_get(sd->sd_inq, NULL, dtype,
+ NULL, &nodename, &compatible, &ncompatible);
+
+ if (nodename == NULL) {
+ con_log(CL_ANN1, (CE_WARN, "dr_sas: Found no compatible driver "
+ "for t%dL%d", tgt, lun));
+ rval = NDI_FAILURE;
+ goto finish;
+ }
+
+ childname = (dtype == DTYPE_DIRECT) ? "sd" : nodename;
+ con_log(CL_ANN1, (CE_WARN,
+ "dr_sas: Childname = %2s nodename = %s", childname, nodename));
+
+ /* Create a dev node */
+ rval = ndi_devi_alloc(instance->dip, childname, DEVI_SID_NODEID, &ldip);
+ con_log(CL_ANN1, (CE_WARN,
+ "dr_sas_config_scsi_device: ndi_devi_alloc rval = %x", rval));
+ if (rval == NDI_SUCCESS) {
+ if (ndi_prop_update_int(DDI_DEV_T_NONE, ldip, "target", tgt) !=
+ DDI_PROP_SUCCESS) {
+ con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create "
+ "property for t%dl%d target", tgt, lun));
+ rval = NDI_FAILURE;
+ goto finish;
+ }
+ if (ndi_prop_update_int(DDI_DEV_T_NONE, ldip, "lun", lun) !=
+ DDI_PROP_SUCCESS) {
+ con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create "
+ "property for t%dl%d lun", tgt, lun));
+ rval = NDI_FAILURE;
+ goto finish;
+ }
+
+ if (ndi_prop_update_string_array(DDI_DEV_T_NONE, ldip,
+ "compatible", compatible, ncompatible) !=
+ DDI_PROP_SUCCESS) {
+ con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create "
+ "property for t%dl%d compatible", tgt, lun));
+ rval = NDI_FAILURE;
+ goto finish;
+ }
+
+ rval = ndi_devi_online(ldip, NDI_ONLINE_ATTACH);
+ if (rval != NDI_SUCCESS) {
+ con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to online "
+ "t%dl%d", tgt, lun));
+ ndi_prop_remove_all(ldip);
+ (void) ndi_devi_free(ldip);
+ } else {
+ con_log(CL_ANN1, (CE_WARN, "dr_sas: online Done :"
+ "0 t%dl%d", tgt, lun));
+ }
+
+ }
+finish:
+ if (dipp) {
+ *dipp = ldip;
+ }
+
+ con_log(CL_DLEVEL1, (CE_WARN,
+ "dr_sas: config_scsi_device rval = %d t%dL%d",
+ rval, tgt, lun));
+ scsi_hba_nodename_compatible_free(nodename, compatible);
+ return (rval);
+}
+
+static int
+drsas_service_evt(struct drsas_instance *instance, int tgt, int lun, int event)
+{
+ struct drsas_eventinfo *mrevt = NULL;
+
+ con_log(CL_ANN1, (CE_NOTE,
+ "drsas_service_evt called for t%dl%d event = %d",
+ tgt, lun, event));
+
+ if ((instance->taskq == NULL) || (mrevt =
+ kmem_zalloc(sizeof (struct drsas_eventinfo), KM_NOSLEEP)) == NULL) {
+ return (ENOMEM);
+ }
+
+ mrevt->instance = instance;
+ mrevt->tgt = tgt;
+ mrevt->lun = lun;
+ mrevt->event = event;
+
+ if ((ddi_taskq_dispatch(instance->taskq,
+ (void (*)(void *))drsas_issue_evt_taskq, mrevt, DDI_NOSLEEP)) !=
+ DDI_SUCCESS) {
+ con_log(CL_ANN1, (CE_NOTE,
+ "dr_sas: Event task failed for t%dl%d event = %d",
+ tgt, lun, event));
+ kmem_free(mrevt, sizeof (struct drsas_eventinfo));
+ return (DDI_FAILURE);
+ }
+ return (DDI_SUCCESS);
+}
+
+static void
+drsas_issue_evt_taskq(struct drsas_eventinfo *mrevt)
+{
+ struct drsas_instance *instance = mrevt->instance;
+ dev_info_t *dip, *pdip;
+ int circ1 = 0;
+ char *devname;
+
+ con_log(CL_ANN1, (CE_NOTE, "drsas_issue_evt_taskq: called for"
+ " tgt %d lun %d event %d",
+ mrevt->tgt, mrevt->lun, mrevt->event));
+
+ if (mrevt->tgt < MRDRV_MAX_LD && mrevt->lun == 0) {
+ dip = instance->dr_ld_list[mrevt->tgt].dip;
+ } else {
+ return;
+ }
+
+ ndi_devi_enter(instance->dip, &circ1);
+ switch (mrevt->event) {
+ case DRSAS_EVT_CONFIG_TGT:
+ if (dip == NULL) {
+
+ if (mrevt->lun == 0) {
+ (void) drsas_config_ld(instance, mrevt->tgt,
+ 0, NULL);
+ }
+ con_log(CL_ANN1, (CE_NOTE,
+ "dr_sas: EVT_CONFIG_TGT called:"
+ " for tgt %d lun %d event %d",
+ mrevt->tgt, mrevt->lun, mrevt->event));
+
+ } else {
+ con_log(CL_ANN1, (CE_NOTE,
+ "dr_sas: EVT_CONFIG_TGT dip != NULL:"
+ " for tgt %d lun %d event %d",
+ mrevt->tgt, mrevt->lun, mrevt->event));
+ }
+ break;
+ case DRSAS_EVT_UNCONFIG_TGT:
+ if (dip) {
+ if (i_ddi_devi_attached(dip)) {
+
+ pdip = ddi_get_parent(dip);
+
+ devname = kmem_zalloc(MAXNAMELEN + 1, KM_SLEEP);
+ (void) ddi_deviname(dip, devname);
+
+ (void) devfs_clean(pdip, devname + 1,
+ DV_CLEAN_FORCE);
+ kmem_free(devname, MAXNAMELEN + 1);
+ }
+ (void) ndi_devi_offline(dip, NDI_DEVI_REMOVE);
+ con_log(CL_ANN1, (CE_NOTE,
+ "dr_sas: EVT_UNCONFIG_TGT called:"
+ " for tgt %d lun %d event %d",
+ mrevt->tgt, mrevt->lun, mrevt->event));
+ } else {
+ con_log(CL_ANN1, (CE_NOTE,
+ "dr_sas: EVT_UNCONFIG_TGT dip == NULL:"
+ " for tgt %d lun %d event %d",
+ mrevt->tgt, mrevt->lun, mrevt->event));
+ }
+ break;
+ }
+ kmem_free(mrevt, sizeof (struct drsas_eventinfo));
+ ndi_devi_exit(instance->dip, circ1);
+}
+
+static void
+drsas_mode_sense_build(struct scsi_pkt *pkt)
+{
+ union scsi_cdb *cdbp;
+ uint16_t page_code;
+ struct scsa_cmd *acmd;
+ struct buf *bp;
+ struct mode_header *modehdrp;
+
+ cdbp = (void *)pkt->pkt_cdbp;
+ page_code = cdbp->cdb_un.sg.scsi[0];
+ acmd = PKT2CMD(pkt);
+ bp = acmd->cmd_buf;
+ if ((!bp) && bp->b_un.b_addr && bp->b_bcount && acmd->cmd_dmacount) {
+ con_log(CL_ANN1, (CE_WARN, "Failing MODESENSE Command"));
+ /* ADD pkt statistics as Command failed. */
+ return;
+ }
+
+ bp_mapin(bp);
+ bzero(bp->b_un.b_addr, bp->b_bcount);
+
+ switch (page_code) {
+ case 0x3: {
+ struct mode_format *page3p = NULL;
+ modehdrp = (struct mode_header *)(bp->b_un.b_addr);
+ modehdrp->bdesc_length = MODE_BLK_DESC_LENGTH;
+
+ page3p = (void *)((caddr_t)modehdrp +
+ MODE_HEADER_LENGTH + MODE_BLK_DESC_LENGTH);
+ page3p->mode_page.code = 0x3;
+ page3p->mode_page.length =
+ (uchar_t)(sizeof (struct mode_format));
+ page3p->data_bytes_sect = 512;
+ page3p->sect_track = 63;
+ break;
+ }
+ case 0x4: {
+ struct mode_geometry *page4p = NULL;
+ modehdrp = (struct mode_header *)(bp->b_un.b_addr);
+ modehdrp->bdesc_length = MODE_BLK_DESC_LENGTH;
+
+ page4p = (void *)((caddr_t)modehdrp +
+ MODE_HEADER_LENGTH + MODE_BLK_DESC_LENGTH);
+ page4p->mode_page.code = 0x4;
+ page4p->mode_page.length =
+ (uchar_t)(sizeof (struct mode_geometry));
+ page4p->heads = 255;
+ page4p->rpm = 10000;
+ break;
+ }
+ default:
+ break;
+ }
+}
diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.conf b/usr/src/uts/common/io/dr_sas/dr_sas.conf
new file mode 100644
index 0000000000..3792f43ca4
--- /dev/null
+++ b/usr/src/uts/common/io/dr_sas/dr_sas.conf
@@ -0,0 +1,15 @@
+#
+# Copyright (c) 2008-2009, LSI Logic Corporation.
+# All rights reserved.
+#
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# dr_sas.conf for sol 10 (and later) for all supported architectures
+#
+# global definitions
+
+# MSI specific flag. user can uncomment this line and set flag "yes" to enable MSI
+#drsas-enable-msi="yes";
diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.h b/usr/src/uts/common/io/dr_sas/dr_sas.h
new file mode 100644
index 0000000000..aa95aa3d08
--- /dev/null
+++ b/usr/src/uts/common/io/dr_sas/dr_sas.h
@@ -0,0 +1,1769 @@
+/*
+ * dr_sas.h: header for dr_sas
+ *
+ * Solaris MegaRAID driver for SAS2.0 controllers
+ * Copyright (c) 2008-2009, LSI Logic Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the author nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#ifndef _DR_SAS_H_
+#define _DR_SAS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/scsi/scsi.h>
+#include "dr_sas_list.h"
+
+/*
+ * MegaRAID SAS2.0 Driver meta data
+ */
+#define DRSAS_VERSION "LSIv2.0"
+#define DRSAS_RELDATE "Jan 9, 2009"
+
+#define DRSAS_TRUE 1
+#define DRSAS_FALSE 0
+
+/*
+ * MegaRAID SAS2.0 device id conversion definitions.
+ */
+#define INST2LSIRDCTL(x) ((x) << INST_MINOR_SHIFT)
+
+/*
+ * MegaRAID SAS2.0 supported controllers
+ */
+#define PCI_DEVICE_ID_LSI_2108VDE 0x0078
+#define PCI_DEVICE_ID_LSI_2108V 0x0079
+
+/*
+ * Register Index for 2108 Controllers.
+ */
+#define REGISTER_SET_IO_2108 (2)
+
+#define DRSAS_MAX_SGE_CNT 0x50
+
+#define DRSAS_IOCTL_DRIVER 0x12341234
+#define DRSAS_IOCTL_FIRMWARE 0x12345678
+#define DRSAS_IOCTL_AEN 0x87654321
+
+#define DRSAS_1_SECOND 1000000
+
+/* Dynamic Enumeration Flags */
+#define DRSAS_PD_LUN 1
+#define DRSAS_LD_LUN 0
+#define DRSAS_PD_TGT_MAX 255
+#define DRSAS_GET_PD_MAX(s) ((s)->dr_pd_max)
+#define WWN_STRLEN 17
+
+/*
+ * =====================================
+ * MegaRAID SAS2.0 MFI firmware definitions
+ * =====================================
+ */
+/*
+ * MFI stands for MegaRAID SAS2.0 FW Interface. This is just a moniker for
+ * protocol between the software and firmware. Commands are issued using
+ * "message frames"
+ */
+
+/*
+ * FW posts its state in upper 4 bits of outbound_msg_0 register
+ */
+#define MFI_STATE_SHIFT 28
+#define MFI_STATE_MASK ((uint32_t)0xF<<MFI_STATE_SHIFT)
+#define MFI_STATE_UNDEFINED ((uint32_t)0x0<<MFI_STATE_SHIFT)
+#define MFI_STATE_BB_INIT ((uint32_t)0x1<<MFI_STATE_SHIFT)
+#define MFI_STATE_FW_INIT ((uint32_t)0x4<<MFI_STATE_SHIFT)
+#define MFI_STATE_WAIT_HANDSHAKE ((uint32_t)0x6<<MFI_STATE_SHIFT)
+#define MFI_STATE_FW_INIT_2 ((uint32_t)0x7<<MFI_STATE_SHIFT)
+#define MFI_STATE_DEVICE_SCAN ((uint32_t)0x8<<MFI_STATE_SHIFT)
+#define MFI_STATE_BOOT_MESSAGE_PENDING ((uint32_t)0x9<<MFI_STATE_SHIFT)
+#define MFI_STATE_FLUSH_CACHE ((uint32_t)0xA<<MFI_STATE_SHIFT)
+#define MFI_STATE_READY ((uint32_t)0xB<<MFI_STATE_SHIFT)
+#define MFI_STATE_OPERATIONAL ((uint32_t)0xC<<MFI_STATE_SHIFT)
+#define MFI_STATE_FAULT ((uint32_t)0xF<<MFI_STATE_SHIFT)
+
+#define MRMFI_FRAME_SIZE 64
+
+/*
+ * During FW init, clear pending cmds & reset state using inbound_msg_0
+ *
+ * ABORT : Abort all pending cmds
+ * READY : Move from OPERATIONAL to READY state; discard queue info
+ * MFIMODE : Discard (possible) low MFA posted in 64-bit mode (??)
+ * CLR_HANDSHAKE: FW is waiting for HANDSHAKE from BIOS or Driver
+ */
+#define MFI_INIT_ABORT 0x00000001
+#define MFI_INIT_READY 0x00000002
+#define MFI_INIT_MFIMODE 0x00000004
+#define MFI_INIT_CLEAR_HANDSHAKE 0x00000008
+#define MFI_INIT_HOTPLUG 0x00000010
+#define MFI_STOP_ADP 0x00000020
+#define MFI_RESET_FLAGS MFI_INIT_READY|MFI_INIT_MFIMODE|MFI_INIT_ABORT
+
+/*
+ * MFI frame flags
+ */
+#define MFI_FRAME_POST_IN_REPLY_QUEUE 0x0000
+#define MFI_FRAME_DONT_POST_IN_REPLY_QUEUE 0x0001
+#define MFI_FRAME_SGL32 0x0000
+#define MFI_FRAME_SGL64 0x0002
+#define MFI_FRAME_SENSE32 0x0000
+#define MFI_FRAME_SENSE64 0x0004
+#define MFI_FRAME_DIR_NONE 0x0000
+#define MFI_FRAME_DIR_WRITE 0x0008
+#define MFI_FRAME_DIR_READ 0x0010
+#define MFI_FRAME_DIR_BOTH 0x0018
+
+/*
+ * Definition for cmd_status
+ */
+#define MFI_CMD_STATUS_POLL_MODE 0xFF
+#define MFI_CMD_STATUS_SYNC_MODE 0xFF
+
+/*
+ * MFI command opcodes
+ */
+#define MFI_CMD_OP_INIT 0x00
+#define MFI_CMD_OP_LD_READ 0x01
+#define MFI_CMD_OP_LD_WRITE 0x02
+#define MFI_CMD_OP_LD_SCSI 0x03
+#define MFI_CMD_OP_PD_SCSI 0x04
+#define MFI_CMD_OP_DCMD 0x05
+#define MFI_CMD_OP_ABORT 0x06
+#define MFI_CMD_OP_SMP 0x07
+#define MFI_CMD_OP_STP 0x08
+
+#define DR_DCMD_CTRL_GET_INFO 0x01010000
+
+#define DR_DCMD_CTRL_CACHE_FLUSH 0x01101000
+#define DR_FLUSH_CTRL_CACHE 0x01
+#define DR_FLUSH_DISK_CACHE 0x02
+
+#define DR_DCMD_CTRL_SHUTDOWN 0x01050000
+#define DRSAS_ENABLE_DRIVE_SPINDOWN 0x01
+
+#define DR_DCMD_CTRL_EVENT_GET_INFO 0x01040100
+#define DR_DCMD_CTRL_EVENT_GET 0x01040300
+#define DR_DCMD_CTRL_EVENT_WAIT 0x01040500
+#define DR_DCMD_LD_GET_PROPERTIES 0x03030000
+#define DR_DCMD_PD_GET_INFO 0x02020000
+
+/*
+ * Solaris Specific MAX values
+ */
+#define MAX_SGL 24
+/*
+ * MFI command completion codes
+ */
+enum MFI_STAT {
+ MFI_STAT_OK = 0x00,
+ MFI_STAT_INVALID_CMD = 0x01,
+ MFI_STAT_INVALID_DCMD = 0x02,
+ MFI_STAT_INVALID_PARAMETER = 0x03,
+ MFI_STAT_INVALID_SEQUENCE_NUMBER = 0x04,
+ MFI_STAT_ABORT_NOT_POSSIBLE = 0x05,
+ MFI_STAT_APP_HOST_CODE_NOT_FOUND = 0x06,
+ MFI_STAT_APP_IN_USE = 0x07,
+ MFI_STAT_APP_NOT_INITIALIZED = 0x08,
+ MFI_STAT_ARRAY_INDEX_INVALID = 0x09,
+ MFI_STAT_ARRAY_ROW_NOT_EMPTY = 0x0a,
+ MFI_STAT_CONFIG_RESOURCE_CONFLICT = 0x0b,
+ MFI_STAT_DEVICE_NOT_FOUND = 0x0c,
+ MFI_STAT_DRIVE_TOO_SMALL = 0x0d,
+ MFI_STAT_FLASH_ALLOC_FAIL = 0x0e,
+ MFI_STAT_FLASH_BUSY = 0x0f,
+ MFI_STAT_FLASH_ERROR = 0x10,
+ MFI_STAT_FLASH_IMAGE_BAD = 0x11,
+ MFI_STAT_FLASH_IMAGE_INCOMPLETE = 0x12,
+ MFI_STAT_FLASH_NOT_OPEN = 0x13,
+ MFI_STAT_FLASH_NOT_STARTED = 0x14,
+ MFI_STAT_FLUSH_FAILED = 0x15,
+ MFI_STAT_HOST_CODE_NOT_FOUNT = 0x16,
+ MFI_STAT_LD_CC_IN_PROGRESS = 0x17,
+ MFI_STAT_LD_INIT_IN_PROGRESS = 0x18,
+ MFI_STAT_LD_LBA_OUT_OF_RANGE = 0x19,
+ MFI_STAT_LD_MAX_CONFIGURED = 0x1a,
+ MFI_STAT_LD_NOT_OPTIMAL = 0x1b,
+ MFI_STAT_LD_RBLD_IN_PROGRESS = 0x1c,
+ MFI_STAT_LD_RECON_IN_PROGRESS = 0x1d,
+ MFI_STAT_LD_WRONG_RAID_LEVEL = 0x1e,
+ MFI_STAT_MAX_SPARES_EXCEEDED = 0x1f,
+ MFI_STAT_MEMORY_NOT_AVAILABLE = 0x20,
+ MFI_STAT_MFC_HW_ERROR = 0x21,
+ MFI_STAT_NO_HW_PRESENT = 0x22,
+ MFI_STAT_NOT_FOUND = 0x23,
+ MFI_STAT_NOT_IN_ENCL = 0x24,
+ MFI_STAT_PD_CLEAR_IN_PROGRESS = 0x25,
+ MFI_STAT_PD_TYPE_WRONG = 0x26,
+ MFI_STAT_PR_DISABLED = 0x27,
+ MFI_STAT_ROW_INDEX_INVALID = 0x28,
+ MFI_STAT_SAS_CONFIG_INVALID_ACTION = 0x29,
+ MFI_STAT_SAS_CONFIG_INVALID_DATA = 0x2a,
+ MFI_STAT_SAS_CONFIG_INVALID_PAGE = 0x2b,
+ MFI_STAT_SAS_CONFIG_INVALID_TYPE = 0x2c,
+ MFI_STAT_SCSI_DONE_WITH_ERROR = 0x2d,
+ MFI_STAT_SCSI_IO_FAILED = 0x2e,
+ MFI_STAT_SCSI_RESERVATION_CONFLICT = 0x2f,
+ MFI_STAT_SHUTDOWN_FAILED = 0x30,
+ MFI_STAT_TIME_NOT_SET = 0x31,
+ MFI_STAT_WRONG_STATE = 0x32,
+ MFI_STAT_LD_OFFLINE = 0x33,
+ /* UNUSED: 0x34 to 0xfe */
+ MFI_STAT_INVALID_STATUS = 0xFF
+};
+
+enum DR_EVT_CLASS {
+ DR_EVT_CLASS_DEBUG = -2,
+ DR_EVT_CLASS_PROGRESS = -1,
+ DR_EVT_CLASS_INFO = 0,
+ DR_EVT_CLASS_WARNING = 1,
+ DR_EVT_CLASS_CRITICAL = 2,
+ DR_EVT_CLASS_FATAL = 3,
+ DR_EVT_CLASS_DEAD = 4
+};
+
+enum DR_EVT_LOCALE {
+ DR_EVT_LOCALE_LD = 0x0001,
+ DR_EVT_LOCALE_PD = 0x0002,
+ DR_EVT_LOCALE_ENCL = 0x0004,
+ DR_EVT_LOCALE_BBU = 0x0008,
+ DR_EVT_LOCALE_SAS = 0x0010,
+ DR_EVT_LOCALE_CTRL = 0x0020,
+ DR_EVT_LOCALE_CONFIG = 0x0040,
+ DR_EVT_LOCALE_CLUSTER = 0x0080,
+ DR_EVT_LOCALE_ALL = 0xffff
+};
+
+#define DR_EVT_CFG_CLEARED 0x0004
+#define DR_EVT_LD_CREATED 0x008a
+#define DR_EVT_LD_DELETED 0x008b
+#define DR_EVT_PD_REMOVED_EXT 0x00f8
+#define DR_EVT_PD_INSERTED_EXT 0x00f7
+
+enum LD_STATE {
+ LD_OFFLINE = 0,
+ LD_PARTIALLY_DEGRADED = 1,
+ LD_DEGRADED = 2,
+ LD_OPTIMAL = 3,
+ LD_INVALID = 0xFF
+};
+
+enum DRSAS_EVT {
+ DRSAS_EVT_CONFIG_TGT = 0,
+ DRSAS_EVT_UNCONFIG_TGT = 1,
+ DRSAS_EVT_UNCONFIG_SMP = 2
+};
+
+#define DMA_OBJ_ALLOCATED 1
+#define DMA_OBJ_REALLOCATED 2
+#define DMA_OBJ_FREED 3
+
+/*
+ * dma_obj_t - Our DMA object
+ * @param buffer : kernel virtual address
+ * @param size : size of the data to be allocated
+ * @param acc_handle : access handle
+ * @param dma_handle : dma handle
+ * @param dma_cookie : scatter-gather list
+ * @param dma_attr : dma attributes for this buffer
+ * Our DMA object. The caller must initialize the size and dma attributes
+ * (dma_attr) fields before allocating the resources.
+ */
+typedef struct {
+ caddr_t buffer;
+ uint32_t size;
+ ddi_acc_handle_t acc_handle;
+ ddi_dma_handle_t dma_handle;
+ ddi_dma_cookie_t dma_cookie[DRSAS_MAX_SGE_CNT];
+ ddi_dma_attr_t dma_attr;
+ uint8_t status;
+ uint8_t reserved[3];
+} dma_obj_t;
+
+struct drsas_eventinfo {
+ struct drsas_instance *instance;
+ int tgt;
+ int lun;
+ int event;
+};
+
+struct drsas_ld {
+ dev_info_t *dip;
+ uint8_t lun_type;
+ uint8_t reserved[3];
+};
+
+struct drsas_pd {
+ dev_info_t *dip;
+ uint8_t lun_type;
+ uint8_t dev_id;
+ uint8_t flags;
+ uint8_t reserved;
+};
+
+struct drsas_pd_info {
+ uint16_t deviceId;
+ uint16_t seqNum;
+ uint8_t inquiryData[96];
+ uint8_t vpdPage83[64];
+ uint8_t notSupported;
+ uint8_t scsiDevType;
+ uint8_t a;
+ uint8_t device_speed;
+ uint32_t mediaerrcnt;
+ uint32_t other;
+ uint32_t pred;
+ uint32_t lastpred;
+ uint16_t fwState;
+ uint8_t disabled;
+ uint8_t linkspwwd;
+ uint32_t ddfType;
+ struct {
+ uint8_t count;
+ uint8_t isPathBroken;
+ uint8_t connectorIndex[2];
+ uint8_t reserved[4];
+ uint64_t sasAddr[2];
+ uint8_t reserved2[16];
+ } pathInfo;
+};
+
+typedef struct drsas_instance {
+ uint32_t *producer;
+ uint32_t *consumer;
+
+ uint32_t *reply_queue;
+ dma_obj_t mfi_internal_dma_obj;
+
+ uint8_t init_id;
+ uint8_t reserved[3];
+
+ uint16_t max_num_sge;
+ uint16_t max_fw_cmds;
+ uint32_t max_sectors_per_req;
+
+ struct drsas_cmd **cmd_list;
+
+ mlist_t cmd_pool_list;
+ kmutex_t cmd_pool_mtx;
+
+ mlist_t cmd_pend_list;
+ kmutex_t cmd_pend_mtx;
+
+ dma_obj_t mfi_evt_detail_obj;
+ struct drsas_cmd *aen_cmd;
+
+ uint32_t aen_seq_num;
+ uint32_t aen_class_locale_word;
+
+ scsi_hba_tran_t *tran;
+
+ kcondvar_t int_cmd_cv;
+ kmutex_t int_cmd_mtx;
+
+ kcondvar_t aen_cmd_cv;
+ kmutex_t aen_cmd_mtx;
+
+ kcondvar_t abort_cmd_cv;
+ kmutex_t abort_cmd_mtx;
+
+ dev_info_t *dip;
+ ddi_acc_handle_t pci_handle;
+
+ timeout_id_t timeout_id;
+ uint32_t unique_id;
+ uint16_t fw_outstanding;
+ caddr_t regmap;
+ ddi_acc_handle_t regmap_handle;
+ uint8_t isr_level;
+ ddi_iblock_cookie_t iblock_cookie;
+ ddi_iblock_cookie_t soft_iblock_cookie;
+ ddi_softintr_t soft_intr_id;
+ uint8_t softint_running;
+ kmutex_t completed_pool_mtx;
+ mlist_t completed_pool_list;
+
+ caddr_t internal_buf;
+ uint32_t internal_buf_dmac_add;
+ uint32_t internal_buf_size;
+
+ uint16_t vendor_id;
+ uint16_t device_id;
+ uint16_t subsysvid;
+ uint16_t subsysid;
+ int instance;
+ int baseaddress;
+ char iocnode[16];
+
+ int fm_capabilities;
+
+ struct drsas_func_ptr *func_ptr;
+ /* MSI interrupts specific */
+ ddi_intr_handle_t *intr_htable;
+ int intr_type;
+ int intr_cnt;
+ size_t intr_size;
+ uint_t intr_pri;
+ int intr_cap;
+
+ ddi_taskq_t *taskq;
+ struct drsas_ld *dr_ld_list;
+} drsas_t;
+
+struct drsas_func_ptr {
+ int (*read_fw_status_reg)(struct drsas_instance *);
+ void (*issue_cmd)(struct drsas_cmd *, struct drsas_instance *);
+ int (*issue_cmd_in_sync_mode)(struct drsas_instance *,
+ struct drsas_cmd *);
+ int (*issue_cmd_in_poll_mode)(struct drsas_instance *,
+ struct drsas_cmd *);
+ void (*enable_intr)(struct drsas_instance *);
+ void (*disable_intr)(struct drsas_instance *);
+ int (*intr_ack)(struct drsas_instance *);
+};
+
+/*
+ * ### Helper routines ###
+ */
+
+/*
+ * con_log() - console log routine
+ * @param level : indicates the severity of the message.
+ * @fparam mt : format string
+ *
+ * con_log displays the error messages on the console based on the current
+ * debug level. Also it attaches the appropriate kernel severity level with
+ * the message.
+ *
+ *
+ * console messages debug levels
+ */
+#define CL_NONE 0 /* No debug information */
+#define CL_ANN 1 /* print unconditionally, announcements */
+#define CL_ANN1 2 /* No o/p */
+#define CL_DLEVEL1 3 /* debug level 1, informative */
+#define CL_DLEVEL2 4 /* debug level 2, verbose */
+#define CL_DLEVEL3 5 /* debug level 3, very verbose */
+
+#ifdef __SUNPRO_C
+#define __func__ ""
+#endif
+
+#define con_log(level, fmt) { if (debug_level_g >= level) cmn_err fmt; }
+
+/*
+ * ### SCSA definitions ###
+ */
+#define PKT2TGT(pkt) ((pkt)->pkt_address.a_target)
+#define PKT2LUN(pkt) ((pkt)->pkt_address.a_lun)
+#define PKT2TRAN(pkt) ((pkt)->pkt_adress.a_hba_tran)
+#define ADDR2TRAN(ap) ((ap)->a_hba_tran)
+
+#define TRAN2MR(tran) (struct drsas_instance *)(tran)->tran_hba_private)
+#define ADDR2MR(ap) (TRAN2MR(ADDR2TRAN(ap))
+
+#define PKT2CMD(pkt) ((struct scsa_cmd *)(pkt)->pkt_ha_private)
+#define CMD2PKT(sp) ((sp)->cmd_pkt)
+#define PKT2REQ(pkt) (&(PKT2CMD(pkt)->request))
+
+#define CMD2ADDR(cmd) (&CMD2PKT(cmd)->pkt_address)
+#define CMD2TRAN(cmd) (CMD2PKT(cmd)->pkt_address.a_hba_tran)
+#define CMD2MR(cmd) (TRAN2MR(CMD2TRAN(cmd)))
+
+#define CFLAG_DMAVALID 0x0001 /* requires a dma operation */
+#define CFLAG_DMASEND 0x0002 /* Transfer from the device */
+#define CFLAG_CONSISTENT 0x0040 /* consistent data transfer */
+
+/*
+ * ### Data structures for ioctl inteface and internal commands ###
+ */
+
+/*
+ * Data direction flags
+ */
+#define UIOC_RD 0x00001
+#define UIOC_WR 0x00002
+
+#define SCP2HOST(scp) (scp)->device->host /* to host */
+#define SCP2HOSTDATA(scp) SCP2HOST(scp)->hostdata /* to soft state */
+#define SCP2CHANNEL(scp) (scp)->device->channel /* to channel */
+#define SCP2TARGET(scp) (scp)->device->id /* to target */
+#define SCP2LUN(scp) (scp)->device->lun /* to LUN */
+
+#define SCSIHOST2ADAP(host) (((caddr_t *)(host->hostdata))[0])
+#define SCP2ADAPTER(scp) \
+ (struct drsas_instance *)SCSIHOST2ADAP(SCP2HOST(scp))
+
+#define MRDRV_IS_LOGICAL_SCSA(instance, acmd) \
+ (acmd->device_id < MRDRV_MAX_LD) ? 1 : 0
+#define MRDRV_IS_LOGICAL(ap) \
+ ((ap->a_target < MRDRV_MAX_LD) && (ap->a_lun == 0)) ? 1 : 0
+#define MAP_DEVICE_ID(instance, ap) \
+ (ap->a_target)
+
+#define HIGH_LEVEL_INTR 1
+#define NORMAL_LEVEL_INTR 0
+
+/*
+ * scsa_cmd - Per-command mr private data
+ * @param cmd_dmahandle : dma handle
+ * @param cmd_dmacookies : current dma cookies
+ * @param cmd_pkt : scsi_pkt reference
+ * @param cmd_dmacount : dma count
+ * @param cmd_cookie : next cookie
+ * @param cmd_ncookies : cookies per window
+ * @param cmd_cookiecnt : cookies per sub-win
+ * @param cmd_nwin : number of dma windows
+ * @param cmd_curwin : current dma window
+ * @param cmd_dma_offset : current window offset
+ * @param cmd_dma_len : current window length
+ * @param cmd_flags : private flags
+ * @param cmd_cdblen : length of cdb
+ * @param cmd_scblen : length of scb
+ * @param cmd_buf : command buffer
+ * @param channel : channel for scsi sub-system
+ * @param target : target for scsi sub-system
+ * @param lun : LUN for scsi sub-system
+ *
+ * - Allocated at same time as scsi_pkt by scsi_hba_pkt_alloc(9E)
+ * - Pointed to by pkt_ha_private field in scsi_pkt
+ */
+struct scsa_cmd {
+ ddi_dma_handle_t cmd_dmahandle;
+ ddi_dma_cookie_t cmd_dmacookies[DRSAS_MAX_SGE_CNT];
+ struct scsi_pkt *cmd_pkt;
+ ulong_t cmd_dmacount;
+ uint_t cmd_cookie;
+ uint_t cmd_ncookies;
+ uint_t cmd_cookiecnt;
+ uint_t cmd_nwin;
+ uint_t cmd_curwin;
+ off_t cmd_dma_offset;
+ ulong_t cmd_dma_len;
+ ulong_t cmd_flags;
+ uint_t cmd_cdblen;
+ uint_t cmd_scblen;
+ struct buf *cmd_buf;
+ ushort_t device_id;
+ uchar_t islogical;
+ uchar_t lun;
+ struct drsas_device *drsas_dev;
+};
+
+
+struct drsas_cmd {
+ union drsas_frame *frame;
+ uint32_t frame_phys_addr;
+ uint8_t *sense;
+ uint32_t sense_phys_addr;
+ dma_obj_t frame_dma_obj;
+ uint8_t frame_dma_obj_status;
+
+ uint32_t index;
+ uint8_t sync_cmd;
+ uint8_t cmd_status;
+ uint16_t abort_aen;
+ mlist_t list;
+ uint32_t frame_count;
+ struct scsa_cmd *cmd;
+ struct scsi_pkt *pkt;
+};
+
+#define MAX_MGMT_ADAPTERS 1024
+#define IOC_SIGNATURE "MR-SAS"
+
+#define IOC_CMD_FIRMWARE 0x0
+#define DRSAS_DRIVER_IOCTL_COMMON 0xF0010000
+#define DRSAS_DRIVER_IOCTL_DRIVER_VERSION 0xF0010100
+#define DRSAS_DRIVER_IOCTL_PCI_INFORMATION 0xF0010200
+#define DRSAS_DRIVER_IOCTL_MRRAID_STATISTICS 0xF0010300
+
+
+#define DRSAS_MAX_SENSE_LENGTH 32
+
+struct drsas_mgmt_info {
+
+ uint16_t count;
+ struct drsas_instance *instance[MAX_MGMT_ADAPTERS];
+ uint16_t map[MAX_MGMT_ADAPTERS];
+ int max_index;
+};
+
+#pragma pack(1)
+
+/*
+ * SAS controller properties
+ */
+struct drsas_ctrl_prop {
+ uint16_t seq_num;
+ uint16_t pred_fail_poll_interval;
+ uint16_t intr_throttle_count;
+ uint16_t intr_throttle_timeouts;
+
+ uint8_t rebuild_rate;
+ uint8_t patrol_read_rate;
+ uint8_t bgi_rate;
+ uint8_t cc_rate;
+ uint8_t recon_rate;
+
+ uint8_t cache_flush_interval;
+
+ uint8_t spinup_drv_count;
+ uint8_t spinup_delay;
+
+ uint8_t cluster_enable;
+ uint8_t coercion_mode;
+ uint8_t disk_write_cache_disable;
+ uint8_t alarm_enable;
+
+ uint8_t reserved[44];
+};
+
+/*
+ * SAS controller information
+ */
+struct drsas_ctrl_info {
+ /* PCI device information */
+ struct {
+ uint16_t vendor_id;
+ uint16_t device_id;
+ uint16_t sub_vendor_id;
+ uint16_t sub_device_id;
+ uint8_t reserved[24];
+ } pci;
+
+ /* Host interface information */
+ struct {
+ uint8_t PCIX : 1;
+ uint8_t PCIE : 1;
+ uint8_t iSCSI : 1;
+ uint8_t SAS_3G : 1;
+ uint8_t reserved_0 : 4;
+ uint8_t reserved_1[6];
+ uint8_t port_count;
+ uint64_t port_addr[8];
+ } host_interface;
+
+ /* Device (backend) interface information */
+ struct {
+ uint8_t SPI : 1;
+ uint8_t SAS_3G : 1;
+ uint8_t SATA_1_5G : 1;
+ uint8_t SATA_3G : 1;
+ uint8_t reserved_0 : 4;
+ uint8_t reserved_1[6];
+ uint8_t port_count;
+ uint64_t port_addr[8];
+ } device_interface;
+
+ /* List of components residing in flash. All str are null terminated */
+ uint32_t image_check_word;
+ uint32_t image_component_count;
+
+ struct {
+ char name[8];
+ char version[32];
+ char build_date[16];
+ char built_time[16];
+ } image_component[8];
+
+ /*
+ * List of flash components that have been flashed on the card, but
+ * are not in use, pending reset of the adapter. This list will be
+ * empty if a flash operation has not occurred. All stings are null
+ * terminated
+ */
+ uint32_t pending_image_component_count;
+
+ struct {
+ char name[8];
+ char version[32];
+ char build_date[16];
+ char build_time[16];
+ } pending_image_component[8];
+
+ uint8_t max_arms;
+ uint8_t max_spans;
+ uint8_t max_arrays;
+ uint8_t max_lds;
+
+ char product_name[80];
+ char serial_no[32];
+
+ /*
+ * Other physical/controller/operation information. Indicates the
+ * presence of the hardware
+ */
+ struct {
+ uint32_t bbu : 1;
+ uint32_t alarm : 1;
+ uint32_t nvram : 1;
+ uint32_t uart : 1;
+ uint32_t reserved : 28;
+ } hw_present;
+
+ uint32_t current_fw_time;
+
+ /* Maximum data transfer sizes */
+ uint16_t max_concurrent_cmds;
+ uint16_t max_sge_count;
+ uint32_t max_request_size;
+
+ /* Logical and physical device counts */
+ uint16_t ld_present_count;
+ uint16_t ld_degraded_count;
+ uint16_t ld_offline_count;
+
+ uint16_t pd_present_count;
+ uint16_t pd_disk_present_count;
+ uint16_t pd_disk_pred_failure_count;
+ uint16_t pd_disk_failed_count;
+
+ /* Memory size information */
+ uint16_t nvram_size;
+ uint16_t memory_size;
+ uint16_t flash_size;
+
+ /* Error counters */
+ uint16_t mem_correctable_error_count;
+ uint16_t mem_uncorrectable_error_count;
+
+ /* Cluster information */
+ uint8_t cluster_permitted;
+ uint8_t cluster_active;
+ uint8_t reserved_1[2];
+
+ /* Controller capabilities structures */
+ struct {
+ uint32_t raid_level_0 : 1;
+ uint32_t raid_level_1 : 1;
+ uint32_t raid_level_5 : 1;
+ uint32_t raid_level_1E : 1;
+ uint32_t reserved : 28;
+ } raid_levels;
+
+ struct {
+ uint32_t rbld_rate : 1;
+ uint32_t cc_rate : 1;
+ uint32_t bgi_rate : 1;
+ uint32_t recon_rate : 1;
+ uint32_t patrol_rate : 1;
+ uint32_t alarm_control : 1;
+ uint32_t cluster_supported : 1;
+ uint32_t bbu : 1;
+ uint32_t spanning_allowed : 1;
+ uint32_t dedicated_hotspares : 1;
+ uint32_t revertible_hotspares : 1;
+ uint32_t foreign_config_import : 1;
+ uint32_t self_diagnostic : 1;
+ uint32_t reserved : 19;
+ } adapter_operations;
+
+ struct {
+ uint32_t read_policy : 1;
+ uint32_t write_policy : 1;
+ uint32_t io_policy : 1;
+ uint32_t access_policy : 1;
+ uint32_t reserved : 28;
+ } ld_operations;
+
+ struct {
+ uint8_t min;
+ uint8_t max;
+ uint8_t reserved[2];
+ } stripe_size_operations;
+
+ struct {
+ uint32_t force_online : 1;
+ uint32_t force_offline : 1;
+ uint32_t force_rebuild : 1;
+ uint32_t reserved : 29;
+ } pd_operations;
+
+ struct {
+ uint32_t ctrl_supports_sas : 1;
+ uint32_t ctrl_supports_sata : 1;
+ uint32_t allow_mix_in_encl : 1;
+ uint32_t allow_mix_in_ld : 1;
+ uint32_t allow_sata_in_cluster : 1;
+ uint32_t reserved : 27;
+ } pd_mix_support;
+
+ /* Include the controller properties (changeable items) */
+ uint8_t reserved_2[12];
+ struct drsas_ctrl_prop properties;
+
+ uint8_t pad[0x800 - 0x640];
+};
+
+/*
+ * ==================================
+ * MegaRAID SAS2.0 driver definitions
+ * ==================================
+ */
+#define MRDRV_MAX_NUM_CMD 1024
+
+#define MRDRV_MAX_PD_CHANNELS 2
+#define MRDRV_MAX_LD_CHANNELS 2
+#define MRDRV_MAX_CHANNELS (MRDRV_MAX_PD_CHANNELS + \
+ MRDRV_MAX_LD_CHANNELS)
+#define MRDRV_MAX_DEV_PER_CHANNEL 128
+#define MRDRV_DEFAULT_INIT_ID -1
+#define MRDRV_MAX_CMD_PER_LUN 1000
+#define MRDRV_MAX_LUN 1
+#define MRDRV_MAX_LD 64
+
+#define MRDRV_RESET_WAIT_TIME 300
+#define MRDRV_RESET_NOTICE_INTERVAL 5
+
+#define DRSAS_IOCTL_CMD 0
+
+/*
+ * FW can accept both 32 and 64 bit SGLs. We want to allocate 32/64 bit
+ * SGLs based on the size of dma_addr_t
+ */
+#define IS_DMA64 (sizeof (dma_addr_t) == 8)
+
+#define IB_MSG_0_OFF 0x10 /* XScale */
+#define OB_MSG_0_OFF 0x18 /* XScale */
+#define IB_DOORBELL_OFF 0x20 /* XScale & ROC */
+#define OB_INTR_STATUS_OFF 0x30 /* XScale & ROC */
+#define OB_INTR_MASK_OFF 0x34 /* XScale & ROC */
+#define IB_QPORT_OFF 0x40 /* XScale & ROC */
+#define OB_DOORBELL_CLEAR_OFF 0xA0 /* ROC */
+#define OB_SCRATCH_PAD_0_OFF 0xB0 /* ROC */
+#define OB_INTR_MASK 0xFFFFFFFF
+#define OB_DOORBELL_CLEAR_MASK 0xFFFFFFFF
+
+/*
+ * All MFI register set macros accept drsas_register_set*
+ */
+#define WR_IB_MSG_0(v, instance) ddi_put32((instance)->regmap_handle, \
+ (uint32_t *)((uintptr_t)(instance)->regmap + IB_MSG_0_OFF), (v))
+
+#define RD_OB_MSG_0(instance) ddi_get32((instance)->regmap_handle, \
+ (uint32_t *)((uintptr_t)(instance)->regmap + OB_MSG_0_OFF))
+
+#define WR_IB_DOORBELL(v, instance) ddi_put32((instance)->regmap_handle, \
+ (uint32_t *)((uintptr_t)(instance)->regmap + IB_DOORBELL_OFF), (v))
+
+#define RD_IB_DOORBELL(instance) ddi_get32((instance)->regmap_handle, \
+ (uint32_t *)((uintptr_t)(instance)->regmap + IB_DOORBELL_OFF))
+
+#define WR_OB_INTR_STATUS(v, instance) ddi_put32((instance)->regmap_handle, \
+ (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_STATUS_OFF), (v))
+
+#define RD_OB_INTR_STATUS(instance) ddi_get32((instance)->regmap_handle, \
+ (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_STATUS_OFF))
+
+#define WR_OB_INTR_MASK(v, instance) ddi_put32((instance)->regmap_handle, \
+ (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF), (v))
+
+#define RD_OB_INTR_MASK(instance) ddi_get32((instance)->regmap_handle, \
+ (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF))
+
+#define WR_IB_QPORT(v, instance) ddi_put32((instance)->regmap_handle, \
+ (uint32_t *)((uintptr_t)(instance)->regmap + IB_QPORT_OFF), (v))
+
+#define WR_OB_DOORBELL_CLEAR(v, instance) ddi_put32((instance)->regmap_handle, \
+ (uint32_t *)((uintptr_t)(instance)->regmap + OB_DOORBELL_CLEAR_OFF), \
+ (v))
+
+#define RD_OB_SCRATCH_PAD_0(instance) ddi_get32((instance)->regmap_handle, \
+ (uint32_t *)((uintptr_t)(instance)->regmap + OB_SCRATCH_PAD_0_OFF))
+
+/*
+ * When FW is in MFI_STATE_READY or MFI_STATE_OPERATIONAL, the state data
+ * of Outbound Msg Reg 0 indicates max concurrent cmds supported, max SGEs
+ * supported per cmd and if 64-bit MFAs (M64) is enabled or disabled.
+ */
+#define MFI_OB_INTR_STATUS_MASK 0x00000002
+
+/*
+ * This MFI_REPLY_2108_MESSAGE_INTR flag is used also
+ * in enable_intr_ppc also. Hence bit 2, i.e. 0x4 has
+ * been set in this flag along with bit 1.
+ */
+#define MFI_REPLY_2108_MESSAGE_INTR 0x00000001
+#define MFI_REPLY_2108_MESSAGE_INTR_MASK 0x00000005
+
+#define MFI_POLL_TIMEOUT_SECS 60
+
+#define MFI_ENABLE_INTR(instance) ddi_put32((instance)->regmap_handle, \
+ (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF), 1)
+#define MFI_DISABLE_INTR(instance) \
+{ \
+ uint32_t disable = 1; \
+ uint32_t mask = ddi_get32((instance)->regmap_handle, \
+ (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF));\
+ mask &= ~disable; \
+ ddi_put32((instance)->regmap_handle, (uint32_t *) \
+ (uintptr_t)((instance)->regmap + OB_INTR_MASK_OFF), mask); \
+}
+
+/* By default, the firmware programs for 8 Kbytes of memory */
+#define DEFAULT_MFI_MEM_SZ 8192
+#define MINIMUM_MFI_MEM_SZ 4096
+
+/* DCMD Message Frame MAILBOX0-11 */
+#define DCMD_MBOX_SZ 12
+
+
+struct drsas_register_set {
+ uint32_t reserved_0[4];
+
+ uint32_t inbound_msg_0;
+ uint32_t inbound_msg_1;
+ uint32_t outbound_msg_0;
+ uint32_t outbound_msg_1;
+
+ uint32_t inbound_doorbell;
+ uint32_t inbound_intr_status;
+ uint32_t inbound_intr_mask;
+
+ uint32_t outbound_doorbell;
+ uint32_t outbound_intr_status;
+ uint32_t outbound_intr_mask;
+
+ uint32_t reserved_1[2];
+
+ uint32_t inbound_queue_port;
+ uint32_t outbound_queue_port;
+
+ uint32_t reserved_2[22];
+
+ uint32_t outbound_doorbell_clear;
+
+ uint32_t reserved_3[3];
+
+ uint32_t outbound_scratch_pad;
+
+ uint32_t reserved_4[3];
+
+ uint32_t inbound_low_queue_port;
+
+ uint32_t inbound_high_queue_port;
+
+ uint32_t reserved_5;
+ uint32_t index_registers[820];
+};
+
+struct drsas_sge32 {
+ uint32_t phys_addr;
+ uint32_t length;
+};
+
+struct drsas_sge64 {
+ uint64_t phys_addr;
+ uint32_t length;
+};
+
+union drsas_sgl {
+ struct drsas_sge32 sge32[1];
+ struct drsas_sge64 sge64[1];
+};
+
+struct drsas_header {
+ uint8_t cmd;
+ uint8_t sense_len;
+ uint8_t cmd_status;
+ uint8_t scsi_status;
+
+ uint8_t target_id;
+ uint8_t lun;
+ uint8_t cdb_len;
+ uint8_t sge_count;
+
+ uint32_t context;
+ uint8_t req_id;
+ uint8_t msgvector;
+ uint16_t pad_0;
+
+ uint16_t flags;
+ uint16_t timeout;
+ uint32_t data_xferlen;
+};
+
+union drsas_sgl_frame {
+ struct drsas_sge32 sge32[8];
+ struct drsas_sge64 sge64[5];
+};
+
+struct drsas_init_frame {
+ uint8_t cmd;
+ uint8_t reserved_0;
+ uint8_t cmd_status;
+
+ uint8_t reserved_1;
+ uint32_t reserved_2;
+
+ uint32_t context;
+ uint8_t req_id;
+ uint8_t msgvector;
+ uint16_t pad_0;
+
+ uint16_t flags;
+ uint16_t reserved_3;
+ uint32_t data_xfer_len;
+
+ uint32_t queue_info_new_phys_addr_lo;
+ uint32_t queue_info_new_phys_addr_hi;
+ uint32_t queue_info_old_phys_addr_lo;
+ uint32_t queue_info_old_phys_addr_hi;
+
+ uint32_t reserved_4[6];
+};
+
+struct drsas_init_queue_info {
+ uint32_t init_flags;
+ uint32_t reply_queue_entries;
+
+ uint32_t reply_queue_start_phys_addr_lo;
+ uint32_t reply_queue_start_phys_addr_hi;
+ uint32_t producer_index_phys_addr_lo;
+ uint32_t producer_index_phys_addr_hi;
+ uint32_t consumer_index_phys_addr_lo;
+ uint32_t consumer_index_phys_addr_hi;
+};
+
+struct drsas_io_frame {
+ uint8_t cmd;
+ uint8_t sense_len;
+ uint8_t cmd_status;
+ uint8_t scsi_status;
+
+ uint8_t target_id;
+ uint8_t access_byte;
+ uint8_t reserved_0;
+ uint8_t sge_count;
+
+ uint32_t context;
+ uint8_t req_id;
+ uint8_t msgvector;
+ uint16_t pad_0;
+
+ uint16_t flags;
+ uint16_t timeout;
+ uint32_t lba_count;
+
+ uint32_t sense_buf_phys_addr_lo;
+ uint32_t sense_buf_phys_addr_hi;
+
+ uint32_t start_lba_lo;
+ uint32_t start_lba_hi;
+
+ union drsas_sgl sgl;
+};
+
+struct drsas_pthru_frame {
+ uint8_t cmd;
+ uint8_t sense_len;
+ uint8_t cmd_status;
+ uint8_t scsi_status;
+
+ uint8_t target_id;
+ uint8_t lun;
+ uint8_t cdb_len;
+ uint8_t sge_count;
+
+ uint32_t context;
+ uint8_t req_id;
+ uint8_t msgvector;
+ uint16_t pad_0;
+
+ uint16_t flags;
+ uint16_t timeout;
+ uint32_t data_xfer_len;
+
+ uint32_t sense_buf_phys_addr_lo;
+ uint32_t sense_buf_phys_addr_hi;
+
+ uint8_t cdb[16];
+ union drsas_sgl sgl;
+};
+
+struct drsas_dcmd_frame {
+ uint8_t cmd;
+ uint8_t reserved_0;
+ uint8_t cmd_status;
+ uint8_t reserved_1[4];
+ uint8_t sge_count;
+
+ uint32_t context;
+ uint8_t req_id;
+ uint8_t msgvector;
+ uint16_t pad_0;
+
+ uint16_t flags;
+ uint16_t timeout;
+
+ uint32_t data_xfer_len;
+ uint32_t opcode;
+
+ union {
+ uint8_t b[DCMD_MBOX_SZ];
+ uint16_t s[6];
+ uint32_t w[3];
+ } mbox;
+
+ union drsas_sgl sgl;
+};
+
+struct drsas_abort_frame {
+ uint8_t cmd;
+ uint8_t reserved_0;
+ uint8_t cmd_status;
+
+ uint8_t reserved_1;
+ uint32_t reserved_2;
+
+ uint32_t context;
+ uint8_t req_id;
+ uint8_t msgvector;
+ uint16_t pad_0;
+
+ uint16_t flags;
+ uint16_t reserved_3;
+ uint32_t reserved_4;
+
+ uint32_t abort_context;
+ uint32_t pad_1;
+
+ uint32_t abort_mfi_phys_addr_lo;
+ uint32_t abort_mfi_phys_addr_hi;
+
+ uint32_t reserved_5[6];
+};
+
+struct drsas_smp_frame {
+ uint8_t cmd;
+ uint8_t reserved_1;
+ uint8_t cmd_status;
+ uint8_t connection_status;
+
+ uint8_t reserved_2[3];
+ uint8_t sge_count;
+
+ uint32_t context;
+ uint8_t req_id;
+ uint8_t msgvector;
+ uint16_t pad_0;
+
+ uint16_t flags;
+ uint16_t timeout;
+
+ uint32_t data_xfer_len;
+
+ uint64_t sas_addr;
+
+ union drsas_sgl sgl[2];
+};
+
+struct drsas_stp_frame {
+ uint8_t cmd;
+ uint8_t reserved_1;
+ uint8_t cmd_status;
+ uint8_t connection_status;
+
+ uint8_t target_id;
+ uint8_t reserved_2[2];
+ uint8_t sge_count;
+
+ uint32_t context;
+ uint8_t req_id;
+ uint8_t msgvector;
+ uint16_t pad_0;
+
+ uint16_t flags;
+ uint16_t timeout;
+
+ uint32_t data_xfer_len;
+
+ uint16_t fis[10];
+ uint32_t stp_flags;
+ union drsas_sgl sgl;
+};
+
+union drsas_frame {
+ struct drsas_header hdr;
+ struct drsas_init_frame init;
+ struct drsas_io_frame io;
+ struct drsas_pthru_frame pthru;
+ struct drsas_dcmd_frame dcmd;
+ struct drsas_abort_frame abort;
+ struct drsas_smp_frame smp;
+ struct drsas_stp_frame stp;
+
+ uint8_t raw_bytes[64];
+};
+
+typedef struct drsas_pd_address {
+ uint16_t device_id;
+ uint16_t encl_id;
+
+ union {
+ struct {
+ uint8_t encl_index;
+ uint8_t slot_number;
+ } pd_address;
+ struct {
+ uint8_t encl_position;
+ uint8_t encl_connector_index;
+ } encl_address;
+ }address;
+
+ uint8_t scsi_dev_type;
+
+ union {
+ uint8_t port_bitmap;
+ uint8_t port_numbers;
+ } connected;
+
+ uint64_t sas_addr[2];
+} drsas_pd_address_t;
+
+union drsas_evt_class_locale {
+ struct {
+ uint16_t locale;
+ uint8_t reserved;
+ int8_t class;
+ } members;
+
+ uint32_t word;
+};
+
+struct drsas_evt_log_info {
+ uint32_t newest_seq_num;
+ uint32_t oldest_seq_num;
+ uint32_t clear_seq_num;
+ uint32_t shutdown_seq_num;
+ uint32_t boot_seq_num;
+};
+
+struct drsas_progress {
+ uint16_t progress;
+ uint16_t elapsed_seconds;
+};
+
+struct drsas_evtarg_ld {
+ uint16_t target_id;
+ uint8_t ld_index;
+ uint8_t reserved;
+};
+
+struct drsas_evtarg_pd {
+ uint16_t device_id;
+ uint8_t encl_index;
+ uint8_t slot_number;
+};
+
+struct drsas_evt_detail {
+ uint32_t seq_num;
+ uint32_t time_stamp;
+ uint32_t code;
+ union drsas_evt_class_locale cl;
+ uint8_t arg_type;
+ uint8_t reserved1[15];
+
+ union {
+ struct {
+ struct drsas_evtarg_pd pd;
+ uint8_t cdb_length;
+ uint8_t sense_length;
+ uint8_t reserved[2];
+ uint8_t cdb[16];
+ uint8_t sense[64];
+ } cdbSense;
+
+ struct drsas_evtarg_ld ld;
+
+ struct {
+ struct drsas_evtarg_ld ld;
+ uint64_t count;
+ } ld_count;
+
+ struct {
+ uint64_t lba;
+ struct drsas_evtarg_ld ld;
+ } ld_lba;
+
+ struct {
+ struct drsas_evtarg_ld ld;
+ uint32_t prevOwner;
+ uint32_t newOwner;
+ } ld_owner;
+
+ struct {
+ uint64_t ld_lba;
+ uint64_t pd_lba;
+ struct drsas_evtarg_ld ld;
+ struct drsas_evtarg_pd pd;
+ } ld_lba_pd_lba;
+
+ struct {
+ struct drsas_evtarg_ld ld;
+ struct drsas_progress prog;
+ } ld_prog;
+
+ struct {
+ struct drsas_evtarg_ld ld;
+ uint32_t prev_state;
+ uint32_t new_state;
+ } ld_state;
+
+ struct {
+ uint64_t strip;
+ struct drsas_evtarg_ld ld;
+ } ld_strip;
+
+ struct drsas_evtarg_pd pd;
+
+ struct {
+ struct drsas_evtarg_pd pd;
+ uint32_t err;
+ } pd_err;
+
+ struct {
+ uint64_t lba;
+ struct drsas_evtarg_pd pd;
+ } pd_lba;
+
+ struct {
+ uint64_t lba;
+ struct drsas_evtarg_pd pd;
+ struct drsas_evtarg_ld ld;
+ } pd_lba_ld;
+
+ struct {
+ struct drsas_evtarg_pd pd;
+ struct drsas_progress prog;
+ } pd_prog;
+
+ struct {
+ struct drsas_evtarg_pd pd;
+ uint32_t prevState;
+ uint32_t newState;
+ } pd_state;
+
+ struct {
+ uint16_t vendorId;
+ uint16_t deviceId;
+ uint16_t subVendorId;
+ uint16_t subDeviceId;
+ } pci;
+
+ uint32_t rate;
+ char str[96];
+
+ struct {
+ uint32_t rtc;
+ uint32_t elapsedSeconds;
+ } time;
+
+ struct {
+ uint32_t ecar;
+ uint32_t elog;
+ char str[64];
+ } ecc;
+
+ drsas_pd_address_t pd_addr;
+
+ uint8_t b[96];
+ uint16_t s[48];
+ uint32_t w[24];
+ uint64_t d[12];
+ } args;
+
+ char description[128];
+
+};
+
+/* only 63 are usable by the application */
+#define MAX_LOGICAL_DRIVES 64
+/* only 255 physical devices may be used */
+#define MAX_PHYSICAL_DEVICES 256
+#define MAX_PD_PER_ENCLOSURE 64
+/* maximum disks per array */
+#define MAX_ROW_SIZE 32
+/* maximum spans per logical drive */
+#define MAX_SPAN_DEPTH 8
+/* maximum number of arrays a hot spare may be dedicated to */
+#define MAX_ARRAYS_DEDICATED 16
+/* maximum number of arrays which may exist */
+#define MAX_ARRAYS 128
+/* maximum number of foreign configs that may ha managed at once */
+#define MAX_FOREIGN_CONFIGS 8
+/* maximum spares (global and dedicated combined) */
+#define MAX_SPARES_FOR_THE_CONTROLLER MAX_PHYSICAL_DEVICES
+/* maximum possible Target IDs (i.e. 0 to 63) */
+#define MAX_TARGET_ID 63
+/* maximum number of supported enclosures */
+#define MAX_ENCLOSURES 32
+/* maximum number of PHYs per controller */
+#define MAX_PHYS_PER_CONTROLLER 16
+/* maximum number of LDs per array (due to DDF limitations) */
+#define MAX_LDS_PER_ARRAY 16
+
+/*
+ * -----------------------------------------------------------------------------
+ * -----------------------------------------------------------------------------
+ *
+ * Logical Drive commands
+ *
+ * -----------------------------------------------------------------------------
+ * -----------------------------------------------------------------------------
+ */
+#define DR_DCMD_LD 0x03000000, /* Logical Device (LD) opcodes */
+
+/*
+ * Input: dcmd.opcode - DR_DCMD_LD_GET_LIST
+ * dcmd.mbox - reserved
+ * dcmd.sge IN - ptr to returned DR_LD_LIST structure
+ * Desc: Return the logical drive list structure
+ * Status: No error
+ */
+
+/*
+ * defines the logical drive reference structure
+ */
+typedef union _DR_LD_REF { /* LD reference structure */
+ struct {
+ uint8_t targetId; /* LD target id (0 to MAX_TARGET_ID) */
+ uint8_t reserved; /* reserved for in line with DR_PD_REF */
+ uint16_t seqNum; /* Sequence Number */
+ } ld_ref;
+ uint32_t ref; /* shorthand reference to full 32-bits */
+} DR_LD_REF; /* 4 bytes */
+
+/*
+ * defines the logical drive list structure
+ */
+typedef struct _DR_LD_LIST {
+ uint32_t ldCount; /* number of LDs */
+ uint32_t reserved; /* pad to 8-byte boundary */
+ struct {
+ DR_LD_REF ref; /* LD reference */
+ uint8_t state; /* current LD state (DR_LD_STATE) */
+ uint8_t reserved[3]; /* pad to 8-byte boundary */
+ uint64_t size; /* LD size */
+ } ldList[MAX_LOGICAL_DRIVES];
+} DR_LD_LIST;
+
+struct drsas_drv_ver {
+ uint8_t signature[12];
+ uint8_t os_name[16];
+ uint8_t os_ver[12];
+ uint8_t drv_name[20];
+ uint8_t drv_ver[32];
+ uint8_t drv_rel_date[20];
+};
+
+#define PCI_TYPE0_ADDRESSES 6
+#define PCI_TYPE1_ADDRESSES 2
+#define PCI_TYPE2_ADDRESSES 5
+
+struct drsas_pci_common_header {
+ uint16_t vendorID; /* (ro) */
+ uint16_t deviceID; /* (ro) */
+ uint16_t command; /* Device control */
+ uint16_t status;
+ uint8_t revisionID; /* (ro) */
+ uint8_t progIf; /* (ro) */
+ uint8_t subClass; /* (ro) */
+ uint8_t baseClass; /* (ro) */
+ uint8_t cacheLineSize; /* (ro+) */
+ uint8_t latencyTimer; /* (ro+) */
+ uint8_t headerType; /* (ro) */
+ uint8_t bist; /* Built in self test */
+
+ union {
+ struct {
+ uint32_t baseAddresses[PCI_TYPE0_ADDRESSES];
+ uint32_t cis;
+ uint16_t subVendorID;
+ uint16_t subSystemID;
+ uint32_t romBaseAddress;
+ uint8_t capabilitiesPtr;
+ uint8_t reserved1[3];
+ uint32_t reserved2;
+ uint8_t interruptLine;
+ uint8_t interruptPin; /* (ro) */
+ uint8_t minimumGrant; /* (ro) */
+ uint8_t maximumLatency; /* (ro) */
+ } type_0;
+
+ struct {
+ uint32_t baseAddresses[PCI_TYPE1_ADDRESSES];
+ uint8_t primaryBus;
+ uint8_t secondaryBus;
+ uint8_t subordinateBus;
+ uint8_t secondaryLatency;
+ uint8_t ioBase;
+ uint8_t ioLimit;
+ uint16_t secondaryStatus;
+ uint16_t memoryBase;
+ uint16_t memoryLimit;
+ uint16_t prefetchBase;
+ uint16_t prefetchLimit;
+ uint32_t prefetchBaseUpper32;
+ uint32_t prefetchLimitUpper32;
+ uint16_t ioBaseUpper16;
+ uint16_t ioLimitUpper16;
+ uint8_t capabilitiesPtr;
+ uint8_t reserved1[3];
+ uint32_t romBaseAddress;
+ uint8_t interruptLine;
+ uint8_t interruptPin;
+ uint16_t bridgeControl;
+ } type_1;
+
+ struct {
+ uint32_t socketRegistersBaseAddress;
+ uint8_t capabilitiesPtr;
+ uint8_t reserved;
+ uint16_t secondaryStatus;
+ uint8_t primaryBus;
+ uint8_t secondaryBus;
+ uint8_t subordinateBus;
+ uint8_t secondaryLatency;
+ struct {
+ uint32_t base;
+ uint32_t limit;
+ } range[PCI_TYPE2_ADDRESSES-1];
+ uint8_t interruptLine;
+ uint8_t interruptPin;
+ uint16_t bridgeControl;
+ } type_2;
+ } header;
+};
+
+struct drsas_pci_link_capability {
+ union {
+ struct {
+ uint32_t linkSpeed :4;
+ uint32_t linkWidth :6;
+ uint32_t aspmSupport :2;
+ uint32_t losExitLatency :3;
+ uint32_t l1ExitLatency :3;
+ uint32_t rsvdp :6;
+ uint32_t portNumber :8;
+ } bits;
+
+ uint32_t asUlong;
+ } cap;
+
+};
+
+struct drsas_pci_link_status_capability {
+ union {
+ struct {
+ uint16_t linkSpeed :4;
+ uint16_t negotiatedLinkWidth :6;
+ uint16_t linkTrainingError :1;
+ uint16_t linkTraning :1;
+ uint16_t slotClockConfig :1;
+ uint16_t rsvdZ :3;
+ } bits;
+
+ uint16_t asUshort;
+ } stat_cap;
+
+ uint16_t reserved;
+
+};
+
+struct drsas_pci_capabilities {
+ struct drsas_pci_link_capability linkCapability;
+ struct drsas_pci_link_status_capability linkStatusCapability;
+};
+
+struct drsas_pci_information
+{
+ uint32_t busNumber;
+ uint8_t deviceNumber;
+ uint8_t functionNumber;
+ uint8_t interruptVector;
+ uint8_t reserved;
+ struct drsas_pci_common_header pciHeaderInfo;
+ struct drsas_pci_capabilities capability;
+ uint8_t reserved2[32];
+};
+
+struct drsas_ioctl {
+ uint16_t version;
+ uint16_t controller_id;
+ uint8_t signature[8];
+ uint32_t reserved_1;
+ uint32_t control_code;
+ uint32_t reserved_2[2];
+ uint8_t frame[64];
+ union drsas_sgl_frame sgl_frame;
+ uint8_t sense_buff[DRSAS_MAX_SENSE_LENGTH];
+ uint8_t data[1];
+};
+
+struct drsas_aen {
+ uint16_t host_no;
+ uint16_t cmd_status;
+ uint32_t seq_num;
+ uint32_t class_locale_word;
+};
+#pragma pack()
+
+#ifndef DDI_VENDOR_LSI
+#define DDI_VENDOR_LSI "LSI"
+#endif /* DDI_VENDOR_LSI */
+
+static int drsas_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
+static int drsas_attach(dev_info_t *, ddi_attach_cmd_t);
+static int drsas_reset(dev_info_t *, ddi_reset_cmd_t);
+static int drsas_detach(dev_info_t *, ddi_detach_cmd_t);
+static int drsas_open(dev_t *, int, int, cred_t *);
+static int drsas_close(dev_t, int, int, cred_t *);
+static int drsas_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
+
+static int drsas_tran_tgt_init(dev_info_t *, dev_info_t *,
+ scsi_hba_tran_t *, struct scsi_device *);
+static struct scsi_pkt *drsas_tran_init_pkt(struct scsi_address *, register
+ struct scsi_pkt *, struct buf *, int, int, int, int,
+ int (*)(), caddr_t);
+static int drsas_tran_start(struct scsi_address *,
+ register struct scsi_pkt *);
+static int drsas_tran_abort(struct scsi_address *, struct scsi_pkt *);
+static int drsas_tran_reset(struct scsi_address *, int);
+static int drsas_tran_getcap(struct scsi_address *, char *, int);
+static int drsas_tran_setcap(struct scsi_address *, char *, int, int);
+static void drsas_tran_destroy_pkt(struct scsi_address *,
+ struct scsi_pkt *);
+static void drsas_tran_dmafree(struct scsi_address *, struct scsi_pkt *);
+static void drsas_tran_sync_pkt(struct scsi_address *, struct scsi_pkt *);
+static uint_t drsas_isr();
+static uint_t drsas_softintr();
+
+static int init_mfi(struct drsas_instance *);
+static int drsas_free_dma_obj(struct drsas_instance *, dma_obj_t);
+static int drsas_alloc_dma_obj(struct drsas_instance *, dma_obj_t *,
+ uchar_t);
+static struct drsas_cmd *get_mfi_pkt(struct drsas_instance *);
+static void return_mfi_pkt(struct drsas_instance *,
+ struct drsas_cmd *);
+
+static void free_space_for_mfi(struct drsas_instance *);
+static void free_additional_dma_buffer(struct drsas_instance *);
+static int alloc_additional_dma_buffer(struct drsas_instance *);
+static int read_fw_status_reg_ppc(struct drsas_instance *);
+static void issue_cmd_ppc(struct drsas_cmd *, struct drsas_instance *);
+static int issue_cmd_in_poll_mode_ppc(struct drsas_instance *,
+ struct drsas_cmd *);
+static int issue_cmd_in_sync_mode_ppc(struct drsas_instance *,
+ struct drsas_cmd *);
+static void enable_intr_ppc(struct drsas_instance *);
+static void disable_intr_ppc(struct drsas_instance *);
+static int intr_ack_ppc(struct drsas_instance *);
+static int mfi_state_transition_to_ready(struct drsas_instance *);
+static void destroy_mfi_frame_pool(struct drsas_instance *);
+static int create_mfi_frame_pool(struct drsas_instance *);
+static int drsas_dma_alloc(struct drsas_instance *, struct scsi_pkt *,
+ struct buf *, int, int (*)());
+static int drsas_dma_move(struct drsas_instance *,
+ struct scsi_pkt *, struct buf *);
+static void flush_cache(struct drsas_instance *instance);
+static void display_scsi_inquiry(caddr_t);
+static int start_mfi_aen(struct drsas_instance *instance);
+static int handle_drv_ioctl(struct drsas_instance *instance,
+ struct drsas_ioctl *ioctl, int mode);
+static int handle_mfi_ioctl(struct drsas_instance *instance,
+ struct drsas_ioctl *ioctl, int mode);
+static int handle_mfi_aen(struct drsas_instance *instance,
+ struct drsas_aen *aen);
+static void fill_up_drv_ver(struct drsas_drv_ver *dv);
+static struct drsas_cmd *build_cmd(struct drsas_instance *instance,
+ struct scsi_address *ap, struct scsi_pkt *pkt,
+ uchar_t *cmd_done);
+static int register_mfi_aen(struct drsas_instance *instance,
+ uint32_t seq_num, uint32_t class_locale_word);
+static int issue_mfi_pthru(struct drsas_instance *instance, struct
+ drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode);
+static int issue_mfi_dcmd(struct drsas_instance *instance, struct
+ drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode);
+static int issue_mfi_smp(struct drsas_instance *instance, struct
+ drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode);
+static int issue_mfi_stp(struct drsas_instance *instance, struct
+ drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode);
+static int abort_aen_cmd(struct drsas_instance *instance,
+ struct drsas_cmd *cmd_to_abort);
+
+static int drsas_common_check(struct drsas_instance *instance,
+ struct drsas_cmd *cmd);
+static void drsas_fm_init(struct drsas_instance *instance);
+static void drsas_fm_fini(struct drsas_instance *instance);
+static int drsas_fm_error_cb(dev_info_t *, ddi_fm_error_t *,
+ const void *);
+static void drsas_fm_ereport(struct drsas_instance *instance,
+ char *detail);
+static int drsas_check_dma_handle(ddi_dma_handle_t handle);
+static int drsas_check_acc_handle(ddi_acc_handle_t handle);
+
+static void drsas_rem_intrs(struct drsas_instance *instance);
+static int drsas_add_intrs(struct drsas_instance *instance, int intr_type);
+
+static void drsas_tran_tgt_free(dev_info_t *, dev_info_t *,
+ scsi_hba_tran_t *, struct scsi_device *);
+static int drsas_tran_bus_config(dev_info_t *, uint_t,
+ ddi_bus_config_op_t, void *, dev_info_t **);
+static int drsas_parse_devname(char *, int *, int *);
+static int drsas_config_all_devices(struct drsas_instance *);
+static int drsas_config_scsi_device(struct drsas_instance *,
+ struct scsi_device *, dev_info_t **);
+static int drsas_config_ld(struct drsas_instance *, uint16_t,
+ uint8_t, dev_info_t **);
+static dev_info_t *drsas_find_child(struct drsas_instance *, uint16_t,
+ uint8_t);
+static int drsas_name_node(dev_info_t *, char *, int);
+static void drsas_issue_evt_taskq(struct drsas_eventinfo *);
+static int drsas_service_evt(struct drsas_instance *, int, int, int);
+static void drsas_mode_sense_build(struct scsi_pkt *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _DR_SAS_H_ */
diff --git a/usr/src/uts/common/io/dr_sas/dr_sas_list.h b/usr/src/uts/common/io/dr_sas/dr_sas_list.h
new file mode 100644
index 0000000000..4154a77796
--- /dev/null
+++ b/usr/src/uts/common/io/dr_sas/dr_sas_list.h
@@ -0,0 +1,212 @@
+/*
+ * dr_sas_list.h: header for dr_sas
+ *
+ * Solaris MegaRAID driver for SAS2.0 controllers
+ * Copyright (c) 2008-2009, LSI Logic Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the author nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _DR_SAS_LIST_H_
+#define _DR_SAS_LIST_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Simple doubly linked list implementation.
+ *
+ * Some of the internal functions ("__xxx") are useful when
+ * manipulating whole lists rather than single entries, as
+ * sometimes we already know the next/prev entries and we can
+ * generate better code by using them directly rather than
+ * using the generic single-entry routines.
+ */
+
+struct mlist_head {
+ struct mlist_head *next, *prev;
+};
+
+typedef struct mlist_head mlist_t;
+
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+#define LIST_HEAD(name) \
+ struct mlist_head name = LIST_HEAD_INIT(name)
+
+#define INIT_LIST_HEAD(ptr) { \
+ (ptr)->next = (ptr); (ptr)->prev = (ptr); \
+}
+
+
+/*
+ * Insert a new entry between two known consecutive entries.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static void __list_add(struct mlist_head *new,
+ struct mlist_head *prev,
+ struct mlist_head *next)
+{
+ next->prev = new;
+ new->next = next;
+ new->prev = prev;
+ prev->next = new;
+}
+
+
+/*
+ * mlist_add - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it after
+ *
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ */
+static void mlist_add(struct mlist_head *new, struct mlist_head *head)
+{
+ __list_add(new, head, head->next);
+}
+
+
+/*
+ * mlist_add_tail - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it before
+ *
+ * Insert a new entry before the specified head.
+ * This is useful for implementing queues.
+ */
+static void mlist_add_tail(struct mlist_head *new, struct mlist_head *head)
+{
+ __list_add(new, head->prev, head);
+}
+
+
+
+/*
+ * Delete a list entry by making the prev/next entries
+ * point to each other.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static void __list_del(struct mlist_head *prev,
+ struct mlist_head *next)
+{
+ next->prev = prev;
+ prev->next = next;
+}
+
+
+/*
+ * mlist_del_init - deletes entry from list and reinitialize it.
+ * @entry: the element to delete from the list.
+ */
+static void mlist_del_init(struct mlist_head *entry)
+{
+ __list_del(entry->prev, entry->next);
+ INIT_LIST_HEAD(entry);
+}
+
+
+/*
+ * mlist_empty - tests whether a list is empty
+ * @head: the list to test.
+ */
+static int mlist_empty(struct mlist_head *head)
+{
+ return (head->next == head);
+}
+
+
+/*
+ * mlist_splice - join two lists
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ */
+static void mlist_splice(struct mlist_head *list, struct mlist_head *head)
+{
+ struct mlist_head *first = list->next;
+
+ if (first != list) {
+ struct mlist_head *last = list->prev;
+ struct mlist_head *at = head->next;
+
+ first->prev = head;
+ head->next = first;
+
+ last->next = at;
+ at->prev = last;
+ }
+}
+
+
+/*
+ * mlist_entry - get the struct for this entry
+ * @ptr: the &struct mlist_head pointer.
+ * @type: the type of the struct this is embedded in.
+ * @member: the name of the list_struct within the struct.
+ */
+#define mlist_entry(ptr, type, member) \
+ ((type *)((size_t)(ptr) - offsetof(type, member)))
+
+
+/*
+ * mlist_for_each - iterate over a list
+ * @pos: the &struct mlist_head to use as a loop counter.
+ * @head: the head for your list.
+ */
+#define mlist_for_each(pos, head) \
+ for (pos = (head)->next, prefetch(pos->next); pos != (head); \
+ pos = pos->next, prefetch(pos->next))
+
+
+/*
+ * mlist_for_each_safe - iterate over a list safe against removal of list entry
+ * @pos: the &struct mlist_head to use as a loop counter.
+ * @n: another &struct mlist_head to use as temporary storage
+ * @head: the head for your list.
+ */
+#define mlist_for_each_safe(pos, n, head) \
+ for (pos = (head)->next, n = pos->next; pos != (head); \
+ pos = n, n = pos->next)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _DR_SAS_LIST_H_ */
diff --git a/usr/src/uts/common/io/dump.c b/usr/src/uts/common/io/dump.c
index 4fd52e6448..f4d8c1cf2c 100644
--- a/usr/src/uts/common/io/dump.c
+++ b/usr/src/uts/common/io/dump.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
* Delphix (c) 2012 by Delphix. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
@@ -46,6 +47,7 @@
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
+#include <sys/random.h>
static dev_info_t *dump_devi;
@@ -141,16 +143,20 @@ dump_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred, int *rvalp)
*rvalp = dump_conflags;
if (dumpvp && !(dumpvp->v_flag & VISSWAP))
*rvalp |= DUMP_EXCL;
+
mutex_exit(&dump_lock);
break;
case DIOCSETCONF:
mutex_enter(&dump_lock);
if (arg == DUMP_KERNEL || arg == DUMP_ALL ||
- arg == DUMP_CURPROC)
- dump_conflags = arg;
- else
+ arg == DUMP_CURPROC) {
+ dump_conflags = (dump_conflags & DUMP_STATE) |
+ (arg & DUMP_CONTENT);
+ } else {
error = EINVAL;
+ }
+
mutex_exit(&dump_lock);
break;
@@ -181,6 +187,24 @@ dump_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred, int *rvalp)
VN_RELE(vp);
break;
+ case DIOCSCRYPTKEY: {
+ uint8_t key[DUMP_CRYPT_KEYLEN];
+ uint8_t nonce[DUMP_CRYPT_NONCELEN];
+
+ if ((error = copyin((uint8_t *)arg, key, sizeof (key))) != 0)
+ break;
+
+ (void) random_get_pseudo_bytes(nonce, sizeof (nonce));
+
+ mutex_enter(&dump_lock);
+ bcopy(key, dump_crypt_key, DUMP_CRYPT_KEYLEN);
+ bcopy(nonce, dump_crypt_nonce, DUMP_CRYPT_NONCELEN);
+ dump_conflags |= DUMP_ENCRYPT; /* a one-way trip */
+ mutex_exit(&dump_lock);
+
+ break;
+ }
+
case DIOCDUMP:
mutex_enter(&dump_lock);
if (dumpvp == NULL)
diff --git a/usr/src/uts/common/io/eventfd.c b/usr/src/uts/common/io/eventfd.c
index 32f875917f..efc1f9233f 100644
--- a/usr/src/uts/common/io/eventfd.c
+++ b/usr/src/uts/common/io/eventfd.c
@@ -141,37 +141,39 @@ eventfd_read(dev_t dev, uio_t *uio, cred_t *cr)
* transitions from EVENTFD_VALMAX to a lower value. At all other
* times, it is already considered writable by poll.
*/
- if (oval == EVENTFD_VALMAX) {
+ if (oval >= EVENTFD_VALMAX) {
pollwakeup(&state->efd_pollhd, POLLWRNORM | POLLOUT);
}
return (err);
}
-/*ARGSUSED*/
static int
-eventfd_write(dev_t dev, struct uio *uio, cred_t *credp)
+eventfd_post(eventfd_state_t *state, uint64_t val, boolean_t is_async,
+ boolean_t file_nonblock)
{
- eventfd_state_t *state;
- minor_t minor = getminor(dev);
- uint64_t val, oval;
- int err;
-
- if (uio->uio_resid < sizeof (val))
- return (EINVAL);
-
- if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0)
- return (err);
-
- if (val > EVENTFD_VALMAX)
- return (EINVAL);
-
- state = ddi_get_soft_state(eventfd_softstate, minor);
+ uint64_t oval;
+ boolean_t overflow = B_FALSE;
mutex_enter(&state->efd_lock);
while (val > EVENTFD_VALMAX - state->efd_value) {
- if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
+
+ /*
+ * When called from (LX) AIO, expectations about overflow and
+ * blocking are different than normal operation. If the
+ * incoming value would cause overflow, it is clamped to reach
+ * the overflow value exactly. This is added to the existing
+ * value without blocking. Any pollers of the eventfd will see
+ * POLLERR asserted when this occurs.
+ */
+ if (is_async) {
+ val = EVENTFD_VALOVERFLOW - state->efd_value;
+ overflow = B_TRUE;
+ break;
+ }
+
+ if (file_nonblock) {
mutex_exit(&state->efd_lock);
return (EAGAIN);
}
@@ -186,7 +188,7 @@ eventfd_write(dev_t dev, struct uio *uio, cred_t *credp)
}
/*
- * We now know that we can add the value without overflowing.
+ * We now know that we can safely add the value.
*/
state->efd_value = (oval = state->efd_value) + val;
@@ -200,10 +202,13 @@ eventfd_write(dev_t dev, struct uio *uio, cred_t *credp)
mutex_exit(&state->efd_lock);
/*
- * Notify pollers as well if the eventfd is now readable.
+ * Notify pollers as well if the eventfd has become readable or has
+ * transitioned into overflow.
*/
if (oval == 0) {
pollwakeup(&state->efd_pollhd, POLLRDNORM | POLLIN);
+ } else if (overflow && val != 0) {
+ pollwakeup(&state->efd_pollhd, POLLERR);
}
return (0);
@@ -211,6 +216,29 @@ eventfd_write(dev_t dev, struct uio *uio, cred_t *credp)
/*ARGSUSED*/
static int
+eventfd_write(dev_t dev, struct uio *uio, cred_t *credp)
+{
+ eventfd_state_t *state;
+ boolean_t file_nonblock;
+ uint64_t val;
+ int err;
+
+ if (uio->uio_resid < sizeof (val))
+ return (EINVAL);
+
+ if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0)
+ return (err);
+
+ if (val > EVENTFD_VALMAX)
+ return (EINVAL);
+
+ file_nonblock = (uio->uio_fmode & (FNDELAY|FNONBLOCK)) != 0;
+ state = ddi_get_soft_state(eventfd_softstate, getminor(dev));
+ return (eventfd_post(state, val, B_FALSE, file_nonblock));
+}
+
+/*ARGSUSED*/
+static int
eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp,
struct pollhead **phpp)
{
@@ -228,6 +256,9 @@ eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp,
if (state->efd_value < EVENTFD_VALMAX)
revents |= POLLWRNORM | POLLOUT;
+ if (state->efd_value == EVENTFD_VALOVERFLOW)
+ revents |= POLLERR;
+
*reventsp = revents & events;
if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
*phpp = &state->efd_pollhd;
@@ -244,17 +275,28 @@ eventfd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
{
eventfd_state_t *state;
minor_t minor = getminor(dev);
+ uint64_t *valp;
state = ddi_get_soft_state(eventfd_softstate, minor);
switch (cmd) {
- case EVENTFDIOC_SEMAPHORE: {
+ case EVENTFDIOC_SEMAPHORE:
mutex_enter(&state->efd_lock);
state->efd_semaphore ^= 1;
mutex_exit(&state->efd_lock);
+ return (0);
+ case EVENTFDIOC_POST:
+ /*
+ * This ioctl is expected to be kernel-internal, used only by
+ * the AIO emulation in LX.
+ */
+ if ((md & FKIOCTL) == 0) {
+ break;
+ }
+ valp = (uint64_t *)arg;
+ VERIFY(eventfd_post(state, *valp, B_TRUE, B_FALSE) == 0);
return (0);
- }
default:
break;
diff --git a/usr/src/uts/common/io/gsqueue/gsqueue.c b/usr/src/uts/common/io/gsqueue/gsqueue.c
new file mode 100644
index 0000000000..03bb799499
--- /dev/null
+++ b/usr/src/uts/common/io/gsqueue/gsqueue.c
@@ -0,0 +1,608 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+/*
+ * Serialization queues are a technique used in illumos to provide what's
+ * commonly known as a 'vertical' perimeter. The idea (described a bit in
+ * uts/common/inet/squeue.c) is to provide a means to make sure that message
+ * blocks (mblk_t) are processed in a specific order. Subsystems like ip and vnd
+ * consume these on different policies, ip on a conn_t basis, vnd on a per
+ * device basis, and use this to ensure that only one packet is being processed
+ * at a given time.
+ *
+ * Serialization queues were originally used by ip. As part of that
+ * implementation, many of the details of ip were baked into it. That includes
+ * things like conn_t, ip receive attributes, and the notion of sets. While an
+ * individual serialization queue, or gsqueue_t, is a useful level of
+ * abstraction, it isn't the basis on which monst consumers want to manage them.
+ * Instead, we have the notion of a set of serialization queues. These sets are
+ * DR (CPU Dynamic reconfiguration) aware, and allow consumers to have a
+ * gsqueue_t per CPU to fanout on without managing them all itself. In the
+ * original implementation, this existed, but they were heavily tied into the
+ * infrastructure of IP, and its notion of polling on the underlying MAC
+ * devices.
+ *
+ * The result of that past is a new interface to serialization queues and a
+ * similar, but slightly different, abstraction to sets of these
+ * (gsqueue_set_t). When designing this there are two different approaches that
+ * one could consider. The first is that the system has one gsqueue_set_t that
+ * the entire world shares, whether IP or some other consumer. The other is that
+ * every consumer has their own set.
+ *
+ * The trade offs between these two failure modes are the pathological failure
+ * modes. There is no guarantee that any two consumers here are equivalent. In
+ * fact, they very likely have very different latency profiles. If they are
+ * being processed in the same queue, that can lead to very odd behaviors. More
+ * generally, if we have a series of processing functions from one consumer
+ * which are generally short, and another which are generally long, that'll
+ * cause undue latency that's harder to observe. If we instead take the approach
+ * that each consumer should have its own set that it fans out over then we
+ * won't end up with the problem that a given serialization queue will have
+ * multiple latency profiles, but instead we'll see cpu contention for the bound
+ * gsqueue_t worker thread. Keep in mind though, that only the gsqueue_t worker
+ * thread is bound and it is in fact possible for it to be processed by other
+ * threads on other CPUs.
+ *
+ * We've opted to go down the second path, so each consumer has its own
+ * independent set of serialization queues that it is bound over.
+ *
+ * Structure Hierarchies
+ * ---------------------
+ *
+ * At the top level, we have a single list of gsqueue_set_t. The gsqueue_set_t
+ * encapsulates all the per-CPU gsqueue_t that exist in the form of
+ * gsqueue_cpu_t. The gsqueue_cpu_t has been designed such that it could
+ * accommodate more than one gsqueue_t, but today there is a one to one mapping.
+ *
+ * We maintain two different lists of gsqueue_cpu_t, the active and defunct
+ * sets. The active set is maintained in the array `gs_cpus`. There are NCPU
+ * entries available in `gs_cpus` with the total number of currently active cpus
+ * described in `gs_ncpus`. The ordering of `gs_cpus` is unimportant. When
+ * there is no longer a need for a given binding (see the following section for
+ * more explanation on when this is the case) then we move the entry to the
+ * `gs_defunct` list which is just a list_t of gsqueue_cpu_t.
+ *
+ * In addition, each gsqueue_set_t can have a series of callbacks registered
+ * with it. These are described in the following section. Graphically, a given
+ * gsqueue_set_t looks roughly like the following:
+ *
+ * +---------------+
+ * | gsqueue_set_t |
+ * +---------------+
+ * | | |
+ * | | * . . . gs_cpus
+ * | | |
+ * | | | +-------------------------------------------------+
+ * | | +--->| gsqueue_cpu_t || gsqueue_cpu_t || gsqueue_cpu_t |...
+ * | | +-------------------------------------------------+
+ * | |
+ * | * . . . gs_defunct
+ * | |
+ * | | +---------------+ +---------------+ +---------------+
+ * | +--->| gsqueue_cpu_t |-->| gsqueue_cpu_t |-->| gsqueue_cpu_t |...
+ * | +---------------+ +---------------+ +---------------+
+ * * . . . gs_cbs
+ * |
+ * | +--------------+ +--------------+ +--------------+
+ * +--->| gsqueue_cb_t |-->| gsqueue_cb_t |->| gsqueue_cb_t |...
+ * +--------------+ +--------------+ +--------------+
+ *
+ * CPU DR, gsqueue_t, and gsqueue_t
+ * --------------------------------
+ *
+ * Recall, that every serialization queue (gsqueue_t or squeue_t) has a worker
+ * thread that may end up doing work. As part of supporting fanout, we have one
+ * gsqueue_t per CPU, and its worker thread is bound to that CPU. Because of
+ * this binding, we need to deal with CPU DR changes.
+ *
+ * The gsqueue driver maintains a single CPU DR callback that is used for the
+ * entire sub-system. We break down CPU DR events into three groups. Offline
+ * events, online events, and events we can ignore. When the first group occurs,
+ * we need to go through every gsqueue_t, find the gsqueue_cpu_t that
+ * corresponds to that processor id, and unbind all of its gsqueue_t's. It's
+ * rather important that we only unbind the gsqueue_t's and not actually destroy
+ * them. When this happens, they could very easily have data queued inside of
+ * them and it's unreasonable to just throw out everything in them at this
+ * point. The data remains intact and service continues uinterrupted.
+ *
+ * When we receive an online event, we do the opposite. We try to find a
+ * gsqueue_cpu_t that previously was bound to this CPU (by leaving its gqc_cpuid
+ * field intact) in the defunct list. If we find one, we remove it from the
+ * defunct list and add it to the active list as well as binding the gsqueue_t
+ * to the CPU in question. If we don't find one, then we create a new one.
+ *
+ * To deal with these kinds of situations, we allow a consumer to register
+ * callbacks for the gsqueue_t that they are interested in. These callbacks will
+ * fire whenever we are handling a topology change. The design of the callbacks
+ * is not that the user can take any administrative action during them, but
+ * rather set something for them to do asynchronously. It is illegal to make any
+ * calls into the gsqueue system while you are in a callback.
+ *
+ * Locking
+ * -------
+ *
+ * The lock ordering here is fairly straightforward. Due to our use of CPU
+ * binding and the CPU DR callbacks, we have an additional lock to consider
+ * cpu_lock. Because of that, the following are the rules for locking:
+ *
+ *
+ * o If performing binding operations, you must grab cpu_lock. cpu_lock is
+ * also at the top of the order.
+ *
+ * o cpu_lock > gsqueue_lock > gsqueue_t`gs_lock > squeue_t`sq_lock
+ * If you need to take multiple locks, you must take the greatest
+ * (left-most) one first.
+ */
+
+#include <sys/types.h>
+#include <sys/conf.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/stream.h>
+#include <sys/modctl.h>
+#include <sys/cpuvar.h>
+#include <sys/list.h>
+#include <sys/sysmacros.h>
+
+#include <sys/gsqueue.h>
+#include <sys/squeue_impl.h>
+
+typedef struct gsqueue_cb {
+ struct gsqueue_cb *gcb_next;
+ gsqueue_cb_f gcb_func;
+ void *gcb_arg;
+} gsqueue_cb_t;
+
+typedef struct gsqueue_cpu {
+ list_node_t gqc_lnode;
+ squeue_t *gqc_head;
+ processorid_t gqc_cpuid;
+} gsqueue_cpu_t;
+
+struct gsqueue_set {
+ list_node_t gs_next;
+ pri_t gs_wpri;
+ kmutex_t gs_lock;
+ int gs_ncpus;
+ gsqueue_cpu_t **gs_cpus;
+ list_t gs_defunct;
+ gsqueue_cb_t *gs_cbs;
+};
+
+static kmutex_t gsqueue_lock;
+static list_t gsqueue_list;
+static kmem_cache_t *gsqueue_cb_cache;
+static kmem_cache_t *gsqueue_cpu_cache;
+static kmem_cache_t *gsqueue_set_cache;
+
+static gsqueue_cpu_t *
+gsqueue_cpu_create(pri_t wpri, processorid_t cpuid)
+{
+ gsqueue_cpu_t *scp;
+
+ scp = kmem_cache_alloc(gsqueue_cpu_cache, KM_SLEEP);
+
+ list_link_init(&scp->gqc_lnode);
+ scp->gqc_cpuid = cpuid;
+ scp->gqc_head = squeue_create(wpri, B_FALSE);
+ scp->gqc_head->sq_state = SQS_DEFAULT;
+ squeue_bind(scp->gqc_head, cpuid);
+
+ return (scp);
+}
+
+static void
+gsqueue_cpu_destroy(gsqueue_cpu_t *scp)
+{
+ squeue_destroy(scp->gqc_head);
+ kmem_cache_free(gsqueue_cpu_cache, scp);
+}
+
+gsqueue_set_t *
+gsqueue_set_create(pri_t wpri)
+{
+ int i;
+ gsqueue_set_t *gssp;
+
+ gssp = kmem_cache_alloc(gsqueue_set_cache, KM_SLEEP);
+ gssp->gs_wpri = wpri;
+ gssp->gs_ncpus = 0;
+
+ /*
+ * We're grabbing CPU lock. Once we let go of it we have to ensure all
+ * set up of the gsqueue_set_t is complete, as it'll be in there for the
+ * various CPU DR bits.
+ */
+ mutex_enter(&cpu_lock);
+
+ for (i = 0; i < NCPU; i++) {
+ gsqueue_cpu_t *scp;
+ cpu_t *cp = cpu_get(i);
+ if (cp != NULL && CPU_ACTIVE(cp) &&
+ cp->cpu_flags & CPU_EXISTS) {
+ scp = gsqueue_cpu_create(wpri, cp->cpu_id);
+ gssp->gs_cpus[gssp->gs_ncpus] = scp;
+ gssp->gs_ncpus++;
+ }
+ }
+
+ /* Finally we can add it to our global list and be done */
+ mutex_enter(&gsqueue_lock);
+ list_insert_tail(&gsqueue_list, gssp);
+ mutex_exit(&gsqueue_lock);
+ mutex_exit(&cpu_lock);
+
+ return (gssp);
+}
+
+void
+gsqueue_set_destroy(gsqueue_set_t *gssp)
+{
+ int i;
+ gsqueue_cpu_t *scp;
+
+ /*
+ * Go through and unbind all of the squeues while cpu_lock is held and
+ * move them to the defunct list. Once that's done, we don't need to do
+ * anything else with cpu_lock.
+ */
+ mutex_enter(&cpu_lock);
+ mutex_enter(&gsqueue_lock);
+ list_remove(&gsqueue_list, gssp);
+ mutex_exit(&gsqueue_lock);
+
+ mutex_enter(&gssp->gs_lock);
+
+ for (i = 0; i < gssp->gs_ncpus; i++) {
+ scp = gssp->gs_cpus[i];
+ squeue_unbind(scp->gqc_head);
+ list_insert_tail(&gssp->gs_defunct, scp);
+ gssp->gs_cpus[i] = NULL;
+ }
+ gssp->gs_ncpus = 0;
+
+ mutex_exit(&gssp->gs_lock);
+ mutex_exit(&cpu_lock);
+
+ while ((scp = list_remove_head(&gssp->gs_defunct)) != NULL) {
+ gsqueue_cpu_destroy(scp);
+ }
+
+ while (gssp->gs_cbs != NULL) {
+ gsqueue_cb_t *cbp;
+
+ cbp = gssp->gs_cbs;
+ gssp->gs_cbs = cbp->gcb_next;
+ kmem_cache_free(gsqueue_cb_cache, cbp);
+ }
+
+ ASSERT3U(gssp->gs_ncpus, ==, 0);
+ ASSERT3P(list_head(&gssp->gs_defunct), ==, NULL);
+ ASSERT3P(gssp->gs_cbs, ==, NULL);
+ kmem_cache_free(gsqueue_set_cache, gssp);
+}
+
+gsqueue_t *
+gsqueue_set_get(gsqueue_set_t *gssp, uint_t index)
+{
+ squeue_t *sqp;
+ gsqueue_cpu_t *scp;
+
+ mutex_enter(&gssp->gs_lock);
+ scp = gssp->gs_cpus[index % gssp->gs_ncpus];
+ sqp = scp->gqc_head;
+ mutex_exit(&gssp->gs_lock);
+ return ((gsqueue_t *)sqp);
+}
+
+uintptr_t
+gsqueue_set_cb_add(gsqueue_set_t *gssp, gsqueue_cb_f cb, void *arg)
+{
+ gsqueue_cb_t *cbp;
+
+ cbp = kmem_cache_alloc(gsqueue_cb_cache, KM_SLEEP);
+ cbp->gcb_func = cb;
+ cbp->gcb_arg = arg;
+
+ mutex_enter(&gssp->gs_lock);
+ cbp->gcb_next = gssp->gs_cbs;
+ gssp->gs_cbs = cbp;
+ mutex_exit(&gssp->gs_lock);
+ return ((uintptr_t)cbp);
+}
+
+int
+gsqueue_set_cb_remove(gsqueue_set_t *gssp, uintptr_t id)
+{
+ gsqueue_cb_t *cbp, *prev;
+ mutex_enter(&gssp->gs_lock);
+ cbp = gssp->gs_cbs;
+ prev = NULL;
+ while (cbp != NULL) {
+ if ((uintptr_t)cbp != id) {
+ prev = cbp;
+ cbp = cbp->gcb_next;
+ continue;
+ }
+
+ if (prev == NULL) {
+ gssp->gs_cbs = cbp->gcb_next;
+ } else {
+ prev->gcb_next = cbp->gcb_next;
+ }
+
+ mutex_exit(&gssp->gs_lock);
+ kmem_cache_free(gsqueue_cb_cache, cbp);
+ return (0);
+ }
+ mutex_exit(&gssp->gs_lock);
+ return (-1);
+}
+
+void
+gsqueue_enter_one(gsqueue_t *gsp, mblk_t *mp, gsqueue_proc_f func, void *arg,
+ int flags, uint8_t tag)
+{
+ squeue_t *sqp = (squeue_t *)gsp;
+
+ ASSERT(mp->b_next == NULL);
+ ASSERT(mp->b_prev == NULL);
+ mp->b_queue = (queue_t *)func;
+ mp->b_prev = arg;
+ sqp->sq_enter(sqp, mp, mp, 1, NULL, flags, tag);
+}
+
+static void
+gsqueue_notify(gsqueue_set_t *gssp, squeue_t *sqp, boolean_t online)
+{
+ gsqueue_cb_t *cbp;
+
+ ASSERT(MUTEX_HELD(&gssp->gs_lock));
+ cbp = gssp->gs_cbs;
+ while (cbp != NULL) {
+ cbp->gcb_func(gssp, (gsqueue_t *)sqp, cbp->gcb_arg, online);
+ cbp = cbp->gcb_next;
+ }
+
+}
+
+/*
+ * When we online a processor we need to go through and either bind a defunct
+ * squeue or create a new one. We'll try to reuse a gsqueue_cpu_t from the
+ * defunct list that used to be on that processor. If no such gsqueue_cpu_t
+ * exists, then we'll create a new one. We'd rather avoid taking over an
+ * existing defunct one that used to be on another CPU, as its not unreasonable
+ * to believe that its CPU will come back. More CPUs are offlined and onlined by
+ * the administrator or by creating cpu sets than actually get offlined by FMA.
+ */
+static void
+gsqueue_handle_online(processorid_t id)
+{
+ gsqueue_set_t *gssp;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+ mutex_enter(&gsqueue_lock);
+ for (gssp = list_head(&gsqueue_list); gssp != NULL;
+ gssp = list_next(&gsqueue_list, gssp)) {
+ gsqueue_cpu_t *scp;
+
+ mutex_enter(&gssp->gs_lock);
+ for (scp = list_head(&gssp->gs_defunct); scp != NULL;
+ scp = list_next(&gssp->gs_defunct, scp)) {
+ if (scp->gqc_cpuid == id) {
+ list_remove(&gssp->gs_defunct, scp);
+ break;
+ }
+ }
+
+ if (scp == NULL) {
+ scp = gsqueue_cpu_create(gssp->gs_wpri, id);
+ } else {
+ squeue_bind(scp->gqc_head, id);
+ }
+
+ ASSERT(gssp->gs_ncpus < NCPU);
+ gssp->gs_cpus[gssp->gs_ncpus] = scp;
+ gssp->gs_ncpus++;
+ gsqueue_notify(gssp, scp->gqc_head, B_TRUE);
+ mutex_exit(&gssp->gs_lock);
+ }
+ mutex_exit(&gsqueue_lock);
+}
+
+static void
+gsqueue_handle_offline(processorid_t id)
+{
+ gsqueue_set_t *gssp;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+ mutex_enter(&gsqueue_lock);
+ for (gssp = list_head(&gsqueue_list); gssp != NULL;
+ gssp = list_next(&gsqueue_list, gssp)) {
+ int i;
+ gsqueue_cpu_t *scp = NULL;
+
+ mutex_enter(&gssp->gs_lock);
+ for (i = 0; i < gssp->gs_ncpus; i++) {
+ if (gssp->gs_cpus[i]->gqc_cpuid == id) {
+ scp = gssp->gs_cpus[i];
+ break;
+ }
+ }
+
+ if (scp != NULL) {
+ squeue_unbind(scp->gqc_head);
+ list_insert_tail(&gssp->gs_defunct, scp);
+ gssp->gs_cpus[i] = gssp->gs_cpus[gssp->gs_ncpus-1];
+ gssp->gs_ncpus--;
+ gsqueue_notify(gssp, scp->gqc_head, B_FALSE);
+ }
+ mutex_exit(&gssp->gs_lock);
+ }
+ mutex_exit(&gsqueue_lock);
+}
+
+/* ARGSUSED */
+static int
+gsqueue_cpu_setup(cpu_setup_t what, int id, void *unused)
+{
+ cpu_t *cp;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+ cp = cpu_get(id);
+ switch (what) {
+ case CPU_CONFIG:
+ case CPU_ON:
+ case CPU_INIT:
+ case CPU_CPUPART_IN:
+ if (cp != NULL && CPU_ACTIVE(cp) && cp->cpu_flags & CPU_EXISTS)
+ gsqueue_handle_online(cp->cpu_id);
+ break;
+ case CPU_UNCONFIG:
+ case CPU_OFF:
+ case CPU_CPUPART_OUT:
+ gsqueue_handle_offline(cp->cpu_id);
+ break;
+ default:
+ break;
+ }
+
+ return (0);
+}
+
+
+/* ARGSUSED */
+static int
+gsqueue_set_cache_construct(void *buf, void *arg, int kmflags)
+{
+ gsqueue_set_t *gssp = buf;
+
+ gssp->gs_cpus = kmem_alloc(sizeof (gsqueue_cpu_t *) * NCPU, kmflags);
+ if (gssp->gs_cpus == NULL)
+ return (-1);
+
+ mutex_init(&gssp->gs_lock, NULL, MUTEX_DRIVER, NULL);
+ list_create(&gssp->gs_defunct, sizeof (gsqueue_cpu_t),
+ offsetof(gsqueue_cpu_t, gqc_lnode));
+ gssp->gs_ncpus = 0;
+ gssp->gs_cbs = NULL;
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+gsqueue_set_cache_destruct(void *buf, void *arg)
+{
+ gsqueue_set_t *gssp = buf;
+
+ kmem_free(gssp->gs_cpus, sizeof (gsqueue_cpu_t *) * NCPU);
+ gssp->gs_cpus = NULL;
+ list_destroy(&gssp->gs_defunct);
+ mutex_destroy(&gssp->gs_lock);
+}
+
+static void
+gsqueue_ddiinit(void)
+{
+ list_create(&gsqueue_list, sizeof (gsqueue_set_t),
+ offsetof(gsqueue_set_t, gs_next));
+ mutex_init(&gsqueue_lock, NULL, MUTEX_DRIVER, NULL);
+
+ gsqueue_cb_cache = kmem_cache_create("gsqueue_cb_cache",
+ sizeof (gsqueue_cb_t),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+ gsqueue_cpu_cache = kmem_cache_create("gsqueue_cpu_cache",
+ sizeof (gsqueue_cpu_t),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+ gsqueue_set_cache = kmem_cache_create("squeue_set_cache",
+ sizeof (gsqueue_set_t),
+ 0, gsqueue_set_cache_construct, gsqueue_set_cache_destruct,
+ NULL, NULL, NULL, 0);
+
+
+ mutex_enter(&cpu_lock);
+ register_cpu_setup_func(gsqueue_cpu_setup, NULL);
+ mutex_exit(&cpu_lock);
+}
+
+static int
+gsqueue_ddifini(void)
+{
+ mutex_enter(&gsqueue_lock);
+ if (list_is_empty(&gsqueue_list) == 0) {
+ mutex_exit(&gsqueue_lock);
+ return (EBUSY);
+ }
+ list_destroy(&gsqueue_list);
+ mutex_exit(&gsqueue_lock);
+
+ mutex_enter(&cpu_lock);
+ register_cpu_setup_func(gsqueue_cpu_setup, NULL);
+ mutex_exit(&cpu_lock);
+
+ kmem_cache_destroy(gsqueue_set_cache);
+ kmem_cache_destroy(gsqueue_cpu_cache);
+ kmem_cache_destroy(gsqueue_cb_cache);
+
+ mutex_destroy(&gsqueue_lock);
+
+ return (0);
+}
+
+static struct modlmisc gsqueue_modmisc = {
+ &mod_miscops,
+ "gsqueue"
+};
+
+static struct modlinkage gsqueue_modlinkage = {
+ MODREV_1,
+ &gsqueue_modmisc,
+ NULL
+};
+
+int
+_init(void)
+{
+ int ret;
+
+ gsqueue_ddiinit();
+ if ((ret = mod_install(&gsqueue_modlinkage)) != 0) {
+ VERIFY(gsqueue_ddifini() == 0);
+ return (ret);
+ }
+
+ return (ret);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&gsqueue_modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+ int ret;
+
+ if ((ret = gsqueue_ddifini()) != 0)
+ return (ret);
+
+ if ((ret = mod_remove(&gsqueue_modlinkage)) != 0)
+ return (ret);
+
+ return (0);
+}
diff --git a/usr/src/uts/common/io/inotify.c b/usr/src/uts/common/io/inotify.c
new file mode 100644
index 0000000000..67bf55f213
--- /dev/null
+++ b/usr/src/uts/common/io/inotify.c
@@ -0,0 +1,1559 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2020 Joyent, Inc.
+ * Copyright (c) 2015 The MathWorks, Inc. All rights reserved.
+ */
+
+/*
+ * Support for the inotify facility, a Linux-borne facility for asynchronous
+ * notification of certain events on specified files or directories. Our
+ * implementation broadly leverages the file event monitoring facility, and
+ * would actually be quite straightforward were it not for a very serious
+ * blunder in the inotify interface: in addition to allowing for one to be
+ * notified on events on a particular file or directory, inotify also allows
+ * for one to be notified on certain events on files _within_ a watched
+ * directory -- even though those events have absolutely nothing to do with
+ * the directory itself. This leads to all sorts of madness because file
+ * operations are (of course) not undertaken on paths but rather on open
+ * files -- and the relationships between open files and the paths that resolve
+ * to those files are neither static nor isomorphic. We implement this
+ * concept by having _child watches_ when directories are watched with events
+ * in IN_CHILD_EVENTS. We add child watches when a watch on a directory is
+ * first added, and we modify those child watches dynamically as files are
+ * created, deleted, moved into or moved out of the specified directory. This
+ * mechanism works well, absent hard links. Hard links, unfortunately, break
+ * this rather badly, and the user is warned that watches on directories that
+ * have multiple directory entries referring to the same file may behave
+ * unexpectedly.
+ */
+
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/inotify.h>
+#include <sys/fem.h>
+#include <sys/conf.h>
+#include <sys/stat.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vmem.h>
+#include <sys/avl.h>
+#include <sys/sysmacros.h>
+#include <sys/cyclic.h>
+#include <sys/filio.h>
+
+struct inotify_state;
+struct inotify_kevent;
+
+typedef struct inotify_watch inotify_watch_t;
+typedef struct inotify_state inotify_state_t;
+typedef struct inotify_kevent inotify_kevent_t;
+
+struct inotify_watch {
+ kmutex_t inw_lock; /* lock protecting ref count */
+ int inw_refcnt; /* reference count */
+ uint8_t inw_zombie:1; /* boolean: is zombie */
+ uint8_t inw_fired:1; /* boolean: fired one-shot */
+ uint8_t inw_active:1; /* boolean: watch is active */
+ uint8_t inw_orphaned:1; /* boolean: orphaned */
+ kcondvar_t inw_cv; /* condvar for zombifier */
+ uint32_t inw_mask; /* mask of watch */
+ int32_t inw_wd; /* watch descriptor */
+ vnode_t *inw_vp; /* underlying vnode */
+ inotify_watch_t *inw_parent; /* parent, if a child */
+ avl_node_t inw_byvp; /* watches by vnode */
+ avl_node_t inw_bywd; /* watches by descriptor */
+ avl_tree_t inw_children; /* children, if a parent */
+ char *inw_name; /* name, if a child */
+ list_node_t inw_orphan; /* orphan list */
+ cred_t *inw_cred; /* cred, if orphaned */
+ inotify_state_t *inw_state; /* corresponding state */
+};
+
+struct inotify_kevent {
+ inotify_kevent_t *ine_next; /* next event in queue */
+ struct inotify_event ine_event; /* event (variable size) */
+};
+
+#define INOTIFY_EVENT_LENGTH(ev) \
+ (sizeof (inotify_kevent_t) + (ev)->ine_event.len)
+
+struct inotify_state {
+ kmutex_t ins_lock; /* lock protecting state */
+ avl_tree_t ins_byvp; /* watches by vnode */
+ avl_tree_t ins_bywd; /* watches by descriptor */
+ vmem_t *ins_wds; /* watch identifier arena */
+ int ins_maxwatches; /* maximum number of watches */
+ int ins_maxevents; /* maximum number of events */
+ int ins_nevents; /* current # of events */
+ int32_t ins_size; /* total size of events */
+ inotify_kevent_t *ins_head; /* head of event queue */
+ inotify_kevent_t *ins_tail; /* tail of event queue */
+ pollhead_t ins_pollhd; /* poll head */
+ kcondvar_t ins_cv; /* condvar for reading */
+ list_t ins_orphans; /* orphan list */
+ ddi_periodic_t ins_cleaner; /* cyclic for cleaning */
+ inotify_watch_t *ins_zombies; /* zombie watch list */
+ cred_t *ins_cred; /* creator's credentials */
+ inotify_state_t *ins_next; /* next state on global list */
+};
+
+/*
+ * Tunables (exported read-only in lx-branded zones via /proc).
+ */
+int inotify_maxwatches = 8192; /* max watches per instance */
+int inotify_maxevents = 16384; /* max events */
+int inotify_maxinstances = 128; /* max instances per user */
+
+/*
+ * Internal global variables.
+ */
+static kmutex_t inotify_lock; /* lock protecting state */
+static dev_info_t *inotify_devi; /* device info */
+static fem_t *inotify_femp; /* FEM pointer */
+static vmem_t *inotify_minor; /* minor number arena */
+static void *inotify_softstate; /* softstate pointer */
+static inotify_state_t *inotify_state; /* global list if state */
+
+static void inotify_watch_event(inotify_watch_t *, uint64_t, char *);
+static void inotify_watch_insert(inotify_watch_t *, vnode_t *, char *);
+static void inotify_watch_delete(inotify_watch_t *, uint32_t);
+static void inotify_watch_remove(inotify_state_t *state,
+ inotify_watch_t *watch);
+
+static int
+inotify_fop_close(femarg_t *vf, int flag, int count, offset_t offset,
+ cred_t *cr, caller_context_t *ct)
+{
+ inotify_watch_t *watch = vf->fa_fnode->fn_available;
+ int rval;
+
+ if ((rval = vnext_close(vf, flag, count, offset, cr, ct)) == 0) {
+ inotify_watch_event(watch, flag & FWRITE ?
+ IN_CLOSE_WRITE : IN_CLOSE_NOWRITE, NULL);
+ }
+
+ return (rval);
+}
+
+static int
+inotify_fop_create(femarg_t *vf, char *name, vattr_t *vap, vcexcl_t excl,
+ int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
+ vsecattr_t *vsecp)
+{
+ inotify_watch_t *watch = vf->fa_fnode->fn_available;
+ int rval;
+
+ if ((rval = vnext_create(vf, name, vap, excl, mode,
+ vpp, cr, flag, ct, vsecp)) == 0) {
+ inotify_watch_insert(watch, *vpp, name);
+ inotify_watch_event(watch, IN_CREATE, name);
+ }
+
+ return (rval);
+}
+
+static int
+inotify_fop_link(femarg_t *vf, vnode_t *svp, char *tnm, cred_t *cr,
+ caller_context_t *ct, int flags)
+{
+ inotify_watch_t *watch = vf->fa_fnode->fn_available;
+ int rval;
+
+ if ((rval = vnext_link(vf, svp, tnm, cr, ct, flags)) == 0) {
+ inotify_watch_insert(watch, svp, tnm);
+ inotify_watch_event(watch, IN_CREATE, tnm);
+ }
+
+ return (rval);
+}
+
+static int
+inotify_fop_mkdir(femarg_t *vf, char *name, vattr_t *vap, vnode_t **vpp,
+ cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
+{
+ inotify_watch_t *watch = vf->fa_fnode->fn_available;
+ int rval;
+
+ if ((rval = vnext_mkdir(vf, name, vap, vpp, cr,
+ ct, flags, vsecp)) == 0) {
+ inotify_watch_insert(watch, *vpp, name);
+ inotify_watch_event(watch, IN_CREATE | IN_ISDIR, name);
+ }
+
+ return (rval);
+}
+
+static int
+inotify_fop_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct)
+{
+ inotify_watch_t *watch = vf->fa_fnode->fn_available;
+ int rval;
+
+ if ((rval = vnext_open(vf, mode, cr, ct)) == 0)
+ inotify_watch_event(watch, IN_OPEN, NULL);
+
+ return (rval);
+}
+
+static int
+inotify_fop_read(femarg_t *vf, struct uio *uiop, int ioflag, struct cred *cr,
+ caller_context_t *ct)
+{
+ inotify_watch_t *watch = vf->fa_fnode->fn_available;
+ int rval = vnext_read(vf, uiop, ioflag, cr, ct);
+ inotify_watch_event(watch, IN_ACCESS, NULL);
+
+ return (rval);
+}
+
+static int
+inotify_fop_readdir(femarg_t *vf, uio_t *uiop, cred_t *cr, int *eofp,
+ caller_context_t *ct, int flags)
+{
+ inotify_watch_t *watch = vf->fa_fnode->fn_available;
+ int rval = vnext_readdir(vf, uiop, cr, eofp, ct, flags);
+ inotify_watch_event(watch, IN_ACCESS | IN_ISDIR, NULL);
+
+ return (rval);
+}
+
+int
+inotify_fop_remove(femarg_t *vf, char *nm, cred_t *cr, caller_context_t *ct,
+ int flags)
+{
+ inotify_watch_t *watch = vf->fa_fnode->fn_available;
+ int rval;
+
+ if ((rval = vnext_remove(vf, nm, cr, ct, flags)) == 0)
+ inotify_watch_event(watch, IN_DELETE, nm);
+
+ return (rval);
+}
+
+int
+inotify_fop_rmdir(femarg_t *vf, char *nm, vnode_t *cdir, cred_t *cr,
+ caller_context_t *ct, int flags)
+{
+ inotify_watch_t *watch = vf->fa_fnode->fn_available;
+ int rval;
+
+ if ((rval = vnext_rmdir(vf, nm, cdir, cr, ct, flags)) == 0)
+ inotify_watch_event(watch, IN_DELETE | IN_ISDIR, nm);
+
+ return (rval);
+}
+
+static int
+inotify_fop_setattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ inotify_watch_t *watch = vf->fa_fnode->fn_available;
+ int rval;
+
+ if ((rval = vnext_setattr(vf, vap, flags, cr, ct)) == 0)
+ inotify_watch_event(watch, IN_ATTRIB, NULL);
+
+ return (rval);
+}
+
+static int
+inotify_fop_write(femarg_t *vf, struct uio *uiop, int ioflag, struct cred *cr,
+ caller_context_t *ct)
+{
+ inotify_watch_t *watch = vf->fa_fnode->fn_available;
+ int rval = vnext_write(vf, uiop, ioflag, cr, ct);
+ inotify_watch_event(watch, IN_MODIFY, NULL);
+
+ return (rval);
+}
+
+static int
+inotify_fop_vnevent(femarg_t *vf, vnevent_t vnevent, vnode_t *dvp, char *name,
+ caller_context_t *ct)
+{
+ inotify_watch_t *watch = vf->fa_fnode->fn_available;
+
+ switch (vnevent) {
+ case VE_RENAME_SRC:
+ inotify_watch_event(watch, IN_MOVE_SELF, NULL);
+ inotify_watch_delete(watch, IN_MOVE_SELF);
+ break;
+ case VE_REMOVE:
+ /*
+ * Linux will apparently fire an IN_ATTRIB event when the link
+ * count changes (including when it drops to 0 on a remove).
+ * This is merely somewhat odd; what is amazing is that this
+ * IN_ATTRIB event is not visible on an inotify watch on the
+ * parent directory. (IN_ATTRIB events are normally sent to
+ * watches on the parent directory). While it's hard to
+ * believe that this constitutes desired semantics, ltp
+ * unfortunately tests this case (if implicitly); in the name
+ * of bug-for-bug compatibility, we fire IN_ATTRIB iff we are
+ * explicitly watching the file that has been removed.
+ */
+ if (watch->inw_parent == NULL)
+ inotify_watch_event(watch, IN_ATTRIB, NULL);
+
+ /*FALLTHROUGH*/
+ case VE_RENAME_DEST:
+ inotify_watch_event(watch, IN_DELETE_SELF, NULL);
+ inotify_watch_delete(watch, IN_DELETE_SELF);
+ break;
+ case VE_RMDIR:
+ /*
+ * It seems that IN_ISDIR should really be OR'd in here, but
+ * Linux doesn't seem to do that in this case; for the sake of
+ * bug-for-bug compatibility, we don't do it either.
+ */
+ inotify_watch_event(watch, IN_DELETE_SELF, NULL);
+ inotify_watch_delete(watch, IN_DELETE_SELF);
+ break;
+ case VE_CREATE:
+ case VE_TRUNCATE:
+ case VE_RESIZE:
+ inotify_watch_event(watch, IN_MODIFY | IN_ATTRIB, NULL);
+ break;
+ case VE_LINK:
+ inotify_watch_event(watch, IN_ATTRIB, NULL);
+ break;
+ case VE_RENAME_SRC_DIR:
+ inotify_watch_event(watch, IN_MOVED_FROM, name);
+ break;
+ case VE_RENAME_DEST_DIR:
+ if (name == NULL)
+ name = dvp->v_path;
+
+ inotify_watch_insert(watch, dvp, name);
+ inotify_watch_event(watch, IN_MOVED_TO, name);
+ break;
+ case VE_SUPPORT:
+ case VE_MOUNTEDOVER:
+ case VE_PRE_RENAME_SRC:
+ case VE_PRE_RENAME_DEST:
+ case VE_PRE_RENAME_DEST_DIR:
+ break;
+ }
+
+ return (vnext_vnevent(vf, vnevent, dvp, name, ct));
+}
+
+const fs_operation_def_t inotify_vnodesrc_template[] = {
+ VOPNAME_CLOSE, { .femop_close = inotify_fop_close },
+ VOPNAME_CREATE, { .femop_create = inotify_fop_create },
+ VOPNAME_LINK, { .femop_link = inotify_fop_link },
+ VOPNAME_MKDIR, { .femop_mkdir = inotify_fop_mkdir },
+ VOPNAME_OPEN, { .femop_open = inotify_fop_open },
+ VOPNAME_READ, { .femop_read = inotify_fop_read },
+ VOPNAME_READDIR, { .femop_readdir = inotify_fop_readdir },
+ VOPNAME_REMOVE, { .femop_remove = inotify_fop_remove },
+ VOPNAME_RMDIR, { .femop_rmdir = inotify_fop_rmdir },
+ VOPNAME_SETATTR, { .femop_setattr = inotify_fop_setattr },
+ VOPNAME_WRITE, { .femop_write = inotify_fop_write },
+ VOPNAME_VNEVENT, { .femop_vnevent = inotify_fop_vnevent },
+ NULL, NULL
+};
+
+static int
+inotify_watch_cmpwd(inotify_watch_t *lhs, inotify_watch_t *rhs)
+{
+ if (lhs->inw_wd < rhs->inw_wd)
+ return (-1);
+
+ if (lhs->inw_wd > rhs->inw_wd)
+ return (1);
+
+ return (0);
+}
+
+static int
+inotify_watch_cmpvp(inotify_watch_t *lhs, inotify_watch_t *rhs)
+{
+ uintptr_t lvp = (uintptr_t)lhs->inw_vp, rvp = (uintptr_t)rhs->inw_vp;
+
+ if (lvp < rvp)
+ return (-1);
+
+ if (lvp > rvp)
+ return (1);
+
+ return (0);
+}
+
+static void
+inotify_watch_hold(inotify_watch_t *watch)
+{
+ mutex_enter(&watch->inw_lock);
+ VERIFY(watch->inw_refcnt > 0);
+ watch->inw_refcnt++;
+ mutex_exit(&watch->inw_lock);
+}
+
+static void
+inotify_watch_release(inotify_watch_t *watch)
+{
+ mutex_enter(&watch->inw_lock);
+ VERIFY(watch->inw_refcnt > 1);
+
+ if (--watch->inw_refcnt == 1 && watch->inw_zombie) {
+ /*
+ * We're down to our last reference; kick anyone that might be
+ * waiting.
+ */
+ cv_signal(&watch->inw_cv);
+ }
+
+ mutex_exit(&watch->inw_lock);
+}
+
+static void
+inotify_watch_event(inotify_watch_t *watch, uint64_t mask, char *name)
+{
+ inotify_kevent_t *event, *tail;
+ inotify_state_t *state = watch->inw_state;
+ uint32_t wd = watch->inw_wd, cookie = 0, len;
+ boolean_t removal = mask & IN_REMOVAL ? B_TRUE : B_FALSE;
+ inotify_watch_t *source = watch;
+
+ if (!(mask &= watch->inw_mask) || mask == IN_ISDIR)
+ return;
+
+ if (watch->inw_parent != NULL) {
+ /*
+ * This is an event on the child; if this isn't a valid child
+ * event, return. Otherwise, we move our watch to be our
+ * parent (which we know is around because we have a hold on
+ * it) and continue.
+ */
+ if (!(mask & IN_CHILD_EVENTS))
+ return;
+
+ name = watch->inw_name;
+ watch = watch->inw_parent;
+ wd = watch->inw_wd;
+ }
+
+ if (!removal) {
+ mutex_enter(&state->ins_lock);
+
+ if (watch->inw_zombie ||
+ watch->inw_fired || !watch->inw_active) {
+ mutex_exit(&state->ins_lock);
+ return;
+ }
+ } else {
+ if (!watch->inw_active)
+ return;
+
+ VERIFY(MUTEX_HELD(&state->ins_lock));
+ }
+
+ /*
+ * If this is an operation on a directory and it's a child event
+ * (event if it's not on a child), we specify IN_ISDIR.
+ */
+ if (source->inw_vp->v_type == VDIR && (mask & IN_CHILD_EVENTS))
+ mask |= IN_ISDIR;
+
+ if (mask & (IN_MOVED_FROM | IN_MOVED_TO))
+ cookie = (uint32_t)curthread->t_did;
+
+ if (state->ins_nevents >= state->ins_maxevents) {
+ /*
+ * We're at our maximum number of events -- turn our event
+ * into an IN_Q_OVERFLOW event, which will be coalesced if
+ * it's already the tail event.
+ */
+ mask = IN_Q_OVERFLOW;
+ wd = (uint32_t)-1;
+ cookie = 0;
+ len = 0;
+ }
+
+ if ((tail = state->ins_tail) != NULL && tail->ine_event.wd == wd &&
+ tail->ine_event.mask == mask && tail->ine_event.cookie == cookie &&
+ ((tail->ine_event.len == 0 && len == 0) ||
+ (name != NULL && tail->ine_event.len != 0 &&
+ strcmp(tail->ine_event.name, name) == 0))) {
+ /*
+ * This is an implicitly coalesced event; we're done.
+ */
+ if (!removal)
+ mutex_exit(&state->ins_lock);
+ return;
+ }
+
+ if (name != NULL) {
+ /*
+ * We are in the context of a file event monitoring operation,
+ * so the name length is bounded by the kernel.
+ */
+ len = strlen(name) + 1;
+ len = roundup(len, sizeof (struct inotify_event));
+ } else {
+ len = 0;
+ }
+
+ event = kmem_zalloc(sizeof (inotify_kevent_t) + len, KM_SLEEP);
+ event->ine_event.wd = wd;
+ event->ine_event.mask = (uint32_t)mask;
+ event->ine_event.cookie = cookie;
+ event->ine_event.len = len;
+
+ if (name != NULL)
+ (void) strcpy(event->ine_event.name, name);
+
+ if (tail != NULL) {
+ tail->ine_next = event;
+ } else {
+ VERIFY(state->ins_head == NULL);
+ state->ins_head = event;
+ cv_broadcast(&state->ins_cv);
+ }
+
+ state->ins_tail = event;
+ state->ins_nevents++;
+ state->ins_size += sizeof (event->ine_event) + len;
+
+ if (removal)
+ return;
+
+ if ((watch->inw_mask & IN_ONESHOT) && !watch->inw_fired) {
+ /*
+ * If this is a one-shot, we need to remove the watch. (Note
+ * that this will recurse back into inotify_watch_event() to
+ * fire the IN_IGNORED event -- but with "removal" set.)
+ */
+ watch->inw_fired = 1;
+ inotify_watch_remove(state, watch);
+ }
+
+ mutex_exit(&state->ins_lock);
+ pollwakeup(&state->ins_pollhd, POLLRDNORM | POLLIN);
+}
+
+/*
+ * Destroy a watch. By the time we're in here, the watch must have exactly
+ * one reference.
+ */
+static void
+inotify_watch_destroy(inotify_watch_t *watch)
+{
+ VERIFY(MUTEX_HELD(&watch->inw_lock));
+
+ if (watch->inw_name != NULL)
+ kmem_free(watch->inw_name, strlen(watch->inw_name) + 1);
+
+ kmem_free(watch, sizeof (inotify_watch_t));
+}
+
+static int
+inotify_fem_install(vnode_t *vp, inotify_watch_t *watch)
+{
+ /*
+ * For vnodes that are devices (of type VCHR or VBLK), we silently
+ * refuse to actually install any event monitor. This is to avoid
+ * single-thread deadlock when both a special device vnode and its
+ * underlying real vnode are being watched: releasing the device
+ * vnode upon watch removal can induce an attribute update on the
+ * underlying vnode, which will bring us into inotify_watch_event()
+ * with our lock already held. While we could fail earlier and more
+ * explicitly in this case, we choose to keep with the Linux behavior
+ * on unwatchable entities and allow the watch but not generate any
+ * events for it.
+ */
+ if (vp->v_type == VCHR || vp->v_type == VBLK)
+ return (0);
+
+ return (fem_install(vp, inotify_femp, watch, OPARGUNIQ,
+ (void (*)(void *))inotify_watch_hold,
+ (void (*)(void *))inotify_watch_release));
+}
+
+static int
+inotify_fem_uninstall(vnode_t *vp, inotify_watch_t *watch)
+{
+ /*
+ * See inotify_fem_install(), above, for our rationale here.
+ */
+ if (vp->v_type == VCHR || vp->v_type == VBLK)
+ return (0);
+
+ return (fem_uninstall(vp, inotify_femp, watch));
+}
+
+/*
+ * Zombify a watch. By the time we come in here, it must be true that the
+ * watch has already been fem_uninstall()'d -- the only reference should be
+ * in the state's data structure. If we can get away with freeing it, we'll
+ * do that -- but if the reference count is greater than one due to an active
+ * vnode operation, we'll put this watch on the zombie list on the state
+ * structure.
+ */
+static void
+inotify_watch_zombify(inotify_watch_t *watch)
+{
+ inotify_state_t *state = watch->inw_state;
+
+ VERIFY(MUTEX_HELD(&state->ins_lock));
+ VERIFY(!watch->inw_zombie);
+
+ watch->inw_zombie = 1;
+
+ if (watch->inw_parent != NULL) {
+ inotify_watch_release(watch->inw_parent);
+ } else {
+ avl_remove(&state->ins_byvp, watch);
+ avl_remove(&state->ins_bywd, watch);
+ vmem_free(state->ins_wds, (void *)(uintptr_t)watch->inw_wd, 1);
+ watch->inw_wd = -1;
+ }
+
+ mutex_enter(&watch->inw_lock);
+
+ if (watch->inw_refcnt == 1) {
+ /*
+ * There are no operations in flight and there is no way
+ * for anyone to discover this watch -- we can destroy it.
+ */
+ inotify_watch_destroy(watch);
+ } else {
+ /*
+ * There are operations in flight; we will need to enqueue
+ * this for later destruction.
+ */
+ watch->inw_parent = state->ins_zombies;
+ state->ins_zombies = watch;
+ mutex_exit(&watch->inw_lock);
+ }
+}
+
+static inotify_watch_t *
+inotify_watch_add(inotify_state_t *state, inotify_watch_t *parent,
+ const char *name, vnode_t *vp, uint32_t mask)
+{
+ inotify_watch_t *watch;
+ int err;
+
+ VERIFY(MUTEX_HELD(&state->ins_lock));
+
+ watch = kmem_zalloc(sizeof (inotify_watch_t), KM_SLEEP);
+
+ watch->inw_vp = vp;
+ watch->inw_mask = mask;
+ watch->inw_state = state;
+ watch->inw_refcnt = 1;
+
+ if (parent == NULL) {
+ watch->inw_wd = (int)(uintptr_t)vmem_alloc(state->ins_wds,
+ 1, VM_BESTFIT | VM_SLEEP);
+ avl_add(&state->ins_byvp, watch);
+ avl_add(&state->ins_bywd, watch);
+
+ avl_create(&watch->inw_children,
+ (int(*)(const void *, const void *))inotify_watch_cmpvp,
+ sizeof (inotify_watch_t),
+ offsetof(inotify_watch_t, inw_byvp));
+ } else {
+ VERIFY(name != NULL);
+ inotify_watch_hold(parent);
+ watch->inw_mask &= IN_CHILD_EVENTS;
+ watch->inw_parent = parent;
+
+ /*
+ * Copy the name. Note that when the name is user-specified,
+ * its length is bounded by the copyinstr() to be MAXPATHLEN
+ * (and regardless, we know by this point that it exists in
+ * our parent).
+ */
+ watch->inw_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
+ (void) strcpy(watch->inw_name, name);
+
+ avl_add(&parent->inw_children, watch);
+ }
+
+ /*
+ * Add our monitor to the vnode. We must not have the watch lock held
+ * when we do this, as it will immediately hold our watch.
+ */
+ err = inotify_fem_install(vp, watch);
+
+ VERIFY(err == 0);
+
+ return (watch);
+}
+
+/*
+ * Remove a (non-child) watch. This is called from either synchronous context
+ * via inotify_rm_watch() or monitor context via either a vnevent or a
+ * one-shot.
+ */
+static void
+inotify_watch_remove(inotify_state_t *state, inotify_watch_t *watch)
+{
+ inotify_watch_t *child;
+ int err;
+
+ VERIFY(MUTEX_HELD(&state->ins_lock));
+ VERIFY(watch->inw_parent == NULL);
+
+ err = inotify_fem_uninstall(watch->inw_vp, watch);
+ VERIFY(err == 0);
+
+ /*
+ * If we have children, we're going to remove them all and set them
+ * all to be zombies.
+ */
+ while ((child = avl_first(&watch->inw_children)) != NULL) {
+ VERIFY(child->inw_parent == watch);
+ avl_remove(&watch->inw_children, child);
+
+ err = inotify_fem_uninstall(child->inw_vp, child);
+ VERIFY(err == 0);
+
+ /*
+ * If this child watch has been orphaned, remove it from the
+ * state's list of orphans.
+ */
+ if (child->inw_orphaned) {
+ list_remove(&state->ins_orphans, child);
+ crfree(child->inw_cred);
+ }
+
+ VN_PHANTOM_RELE(child->inw_vp);
+
+ /*
+ * We're down (or should be down) to a single reference to
+ * this child watch; it's safe to zombify it.
+ */
+ inotify_watch_zombify(child);
+ }
+
+ inotify_watch_event(watch, IN_IGNORED | IN_REMOVAL, NULL);
+ VN_PHANTOM_RELE(watch->inw_vp);
+
+ /*
+ * It's now safe to zombify the watch -- we know that the only reference
+ * can come from operations in flight.
+ */
+ inotify_watch_zombify(watch);
+}
+
+/*
+ * Delete a watch. Should only be called from VOP context.
+ */
+static void
+inotify_watch_delete(inotify_watch_t *watch, uint32_t event)
+{
+ inotify_state_t *state = watch->inw_state;
+ inotify_watch_t cmp = { .inw_vp = watch->inw_vp }, *parent;
+ int err;
+
+ if (event != IN_DELETE_SELF && !(watch->inw_mask & IN_CHILD_EVENTS))
+ return;
+
+ mutex_enter(&state->ins_lock);
+
+ if (watch->inw_zombie) {
+ mutex_exit(&state->ins_lock);
+ return;
+ }
+
+ if ((parent = watch->inw_parent) == NULL) {
+ if (event == IN_DELETE_SELF) {
+ /*
+ * If we're here because we're being deleted and we
+ * are not a child watch, we need to delete the entire
+ * watch, children and all.
+ */
+ inotify_watch_remove(state, watch);
+ }
+
+ mutex_exit(&state->ins_lock);
+ return;
+ } else {
+ if (event == IN_DELETE_SELF &&
+ !(parent->inw_mask & IN_EXCL_UNLINK)) {
+ /*
+ * This is a child watch for a file that is being
+ * removed and IN_EXCL_UNLINK has not been specified;
+ * indicate that it is orphaned and add it to the list
+ * of orphans. (This list will be checked by the
+ * cleaning cyclic to determine when the watch has
+ * become the only hold on the vnode, at which point
+ * the watch can be zombified.) Note that we check
+ * if the watch is orphaned before we orphan it: hard
+ * links make it possible for VE_REMOVE to be called
+ * multiple times on the same vnode. (!)
+ */
+ if (!watch->inw_orphaned) {
+ watch->inw_orphaned = 1;
+ watch->inw_cred = CRED();
+ crhold(watch->inw_cred);
+ list_insert_head(&state->ins_orphans, watch);
+ }
+
+ mutex_exit(&state->ins_lock);
+ return;
+ }
+
+ if (watch->inw_orphaned) {
+ /*
+ * If we're here, a file was orphaned and then later
+ * moved -- which almost certainly means that hard
+ * links are on the scene. We choose the orphan over
+ * the move because we don't want to spuriously
+ * drop events if we can avoid it.
+ */
+ crfree(watch->inw_cred);
+ list_remove(&state->ins_orphans, watch);
+ }
+ }
+
+ if (avl_find(&parent->inw_children, &cmp, NULL) == NULL) {
+ /*
+ * This watch has already been deleted from the parent.
+ */
+ mutex_exit(&state->ins_lock);
+ return;
+ }
+
+ avl_remove(&parent->inw_children, watch);
+ err = inotify_fem_uninstall(watch->inw_vp, watch);
+ VERIFY(err == 0);
+
+ VN_PHANTOM_RELE(watch->inw_vp);
+
+ /*
+ * It's now safe to zombify the watch -- which won't actually delete
+ * it as we know that the reference count is greater than 1.
+ */
+ inotify_watch_zombify(watch);
+ mutex_exit(&state->ins_lock);
+}
+
+/*
+ * Insert a new child watch. Should only be called from VOP context when
+ * a child is created in a watched directory.
+ */
+static void
+inotify_watch_insert(inotify_watch_t *watch, vnode_t *vp, char *name)
+{
+ inotify_state_t *state = watch->inw_state;
+ inotify_watch_t cmp = { .inw_vp = vp };
+
+ if (!(watch->inw_mask & IN_CHILD_EVENTS))
+ return;
+
+ mutex_enter(&state->ins_lock);
+
+ if (watch->inw_zombie || watch->inw_parent != NULL || vp == NULL) {
+ mutex_exit(&state->ins_lock);
+ return;
+ }
+
+ if (avl_find(&watch->inw_children, &cmp, NULL) != NULL) {
+ mutex_exit(&state->ins_lock);
+ return;
+ }
+
+ VN_PHANTOM_HOLD(vp);
+ watch = inotify_watch_add(state, watch, name, vp, watch->inw_mask);
+ VERIFY(watch != NULL);
+
+ mutex_exit(&state->ins_lock);
+}
+
+
+static int
+inotify_add_watch(inotify_state_t *state, vnode_t *vp, uint32_t mask,
+ int32_t *wdp)
+{
+ inotify_watch_t *watch, cmp = { .inw_vp = vp };
+ uint32_t set;
+
+ set = (mask & (IN_ALL_EVENTS | IN_MODIFIERS)) | IN_UNMASKABLE;
+
+ /*
+ * Lookup our vnode to determine if we already have a watch on it.
+ */
+ mutex_enter(&state->ins_lock);
+
+ if ((watch = avl_find(&state->ins_byvp, &cmp, NULL)) == NULL) {
+ /*
+ * We don't have this watch; allocate a new one, provided that
+ * we have fewer than our limit.
+ */
+ if (avl_numnodes(&state->ins_bywd) >= state->ins_maxwatches) {
+ mutex_exit(&state->ins_lock);
+ return (ENOSPC);
+ }
+
+ VN_PHANTOM_HOLD(vp);
+ watch = inotify_watch_add(state, NULL, NULL, vp, set);
+ *wdp = watch->inw_wd;
+ mutex_exit(&state->ins_lock);
+
+ return (0);
+ }
+
+ VERIFY(!watch->inw_zombie);
+
+ if (!(mask & IN_MASK_ADD)) {
+ /*
+ * Note that if we're resetting our event mask and we're
+ * transitioning from an event mask that includes child events
+ * to one that doesn't, there will be potentially some stale
+ * child watches. This is basically fine: they won't fire,
+ * and they will correctly be removed when the watch is
+ * removed.
+ */
+ watch->inw_mask = 0;
+ }
+
+ watch->inw_mask |= set;
+
+ *wdp = watch->inw_wd;
+
+ mutex_exit(&state->ins_lock);
+
+ return (0);
+}
+
+static int
+inotify_add_child(inotify_state_t *state, vnode_t *vp, char *name)
+{
+ inotify_watch_t *watch, cmp = { .inw_vp = vp };
+ vnode_t *cvp;
+ int err;
+
+ /*
+ * Verify that the specified child doesn't have a directory component
+ * within it.
+ */
+ if (strchr(name, '/') != NULL)
+ return (EINVAL);
+
+ /*
+ * Lookup the underlying file. Note that this will succeed even if
+ * we don't have permissions to actually read the file.
+ */
+ if ((err = lookupnameat(name,
+ UIO_SYSSPACE, NO_FOLLOW, NULL, &cvp, vp)) != 0) {
+ return (err);
+ }
+
+ /*
+ * Use our vnode to find our watch, and then add our child watch to it.
+ */
+ mutex_enter(&state->ins_lock);
+
+ if ((watch = avl_find(&state->ins_byvp, &cmp, NULL)) == NULL) {
+ /*
+ * This is unexpected -- it means that we don't have the
+ * watch that we thought we had.
+ */
+ mutex_exit(&state->ins_lock);
+ VN_RELE(cvp);
+ return (ENXIO);
+ }
+
+ /*
+ * Now lookup the child vnode in the watch; we'll only add it if it
+ * isn't already there.
+ */
+ cmp.inw_vp = cvp;
+
+ if (avl_find(&watch->inw_children, &cmp, NULL) != NULL) {
+ mutex_exit(&state->ins_lock);
+ VN_RELE(cvp);
+ return (0);
+ }
+
+ /* Trade the plain hold from lookupnameat() for a phantom hold */
+ VN_PHANTOM_HOLD(cvp);
+ VN_RELE(cvp);
+
+ watch = inotify_watch_add(state, watch, name, cvp, watch->inw_mask);
+ VERIFY(watch != NULL);
+ mutex_exit(&state->ins_lock);
+
+ return (0);
+}
+
+static int
+inotify_rm_watch(inotify_state_t *state, int32_t wd)
+{
+ inotify_watch_t *watch, cmp = { .inw_wd = wd };
+
+ mutex_enter(&state->ins_lock);
+
+ if ((watch = avl_find(&state->ins_bywd, &cmp, NULL)) == NULL) {
+ mutex_exit(&state->ins_lock);
+ return (EINVAL);
+ }
+
+ inotify_watch_remove(state, watch);
+ mutex_exit(&state->ins_lock);
+
+ /*
+ * Because removing a watch will generate an IN_IGNORED event (and
+ * because inotify_watch_remove() won't alone induce a pollwakeup()),
+ * we need to explicitly issue a pollwakeup().
+ */
+ pollwakeup(&state->ins_pollhd, POLLRDNORM | POLLIN);
+
+ return (0);
+}
+
+static int
+inotify_activate(inotify_state_t *state, int32_t wd)
+{
+ inotify_watch_t *watch, cmp = { .inw_wd = wd };
+
+ mutex_enter(&state->ins_lock);
+
+ if ((watch = avl_find(&state->ins_bywd, &cmp, NULL)) == NULL) {
+ mutex_exit(&state->ins_lock);
+ return (EINVAL);
+ }
+
+ watch->inw_active = 1;
+
+ mutex_exit(&state->ins_lock);
+
+ return (0);
+}
+
+/*
+ * Called periodically as a cyclic to process the orphans and zombies.
+ */
+static void
+inotify_clean(void *arg)
+{
+ inotify_state_t *state = arg;
+ inotify_watch_t *watch, *parent, *next, **prev;
+ cred_t *savecred;
+ int err;
+
+ mutex_enter(&state->ins_lock);
+
+ for (watch = list_head(&state->ins_orphans);
+ watch != NULL; watch = next) {
+ next = list_next(&state->ins_orphans, watch);
+
+ VERIFY(!watch->inw_zombie);
+ VERIFY((parent = watch->inw_parent) != NULL);
+
+ if (watch->inw_vp->v_count > 1)
+ continue;
+
+ avl_remove(&parent->inw_children, watch);
+ err = inotify_fem_uninstall(watch->inw_vp, watch);
+ VERIFY(err == 0);
+
+ list_remove(&state->ins_orphans, watch);
+
+ /*
+ * For purposes of releasing the vnode, we need to switch our
+ * cred to be the cred of the orphaning thread (which we held
+ * at the time this watch was orphaned).
+ */
+ savecred = curthread->t_cred;
+ curthread->t_cred = watch->inw_cred;
+ VN_PHANTOM_RELE(watch->inw_vp);
+ crfree(watch->inw_cred);
+ curthread->t_cred = savecred;
+
+ inotify_watch_zombify(watch);
+ }
+
+ prev = &state->ins_zombies;
+
+ while ((watch = *prev) != NULL) {
+ mutex_enter(&watch->inw_lock);
+
+ if (watch->inw_refcnt == 1) {
+ *prev = watch->inw_parent;
+ inotify_watch_destroy(watch);
+ continue;
+ }
+
+ prev = &watch->inw_parent;
+ mutex_exit(&watch->inw_lock);
+ }
+
+ mutex_exit(&state->ins_lock);
+}
+
+/*ARGSUSED*/
+static int
+inotify_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
+{
+ inotify_state_t *state;
+ major_t major = getemajor(*devp);
+ minor_t minor = getminor(*devp);
+ int instances = 0;
+ char c[64];
+
+ if (minor != INOTIFYMNRN_INOTIFY)
+ return (ENXIO);
+
+ mutex_enter(&inotify_lock);
+
+ for (state = inotify_state; state != NULL; state = state->ins_next) {
+ if (state->ins_cred == cred_p)
+ instances++;
+ }
+
+ if (instances >= inotify_maxinstances) {
+ mutex_exit(&inotify_lock);
+ return (EMFILE);
+ }
+
+ minor = (minor_t)(uintptr_t)vmem_alloc(inotify_minor, 1,
+ VM_BESTFIT | VM_SLEEP);
+
+ if (ddi_soft_state_zalloc(inotify_softstate, minor) != DDI_SUCCESS) {
+ vmem_free(inotify_minor, (void *)(uintptr_t)minor, 1);
+ mutex_exit(&inotify_lock);
+ return (EINVAL);
+ }
+
+ state = ddi_get_soft_state(inotify_softstate, minor);
+ *devp = makedevice(major, minor);
+
+ crhold(cred_p);
+ state->ins_cred = cred_p;
+ state->ins_next = inotify_state;
+ inotify_state = state;
+
+ (void) snprintf(c, sizeof (c), "inotify_watchid_%d", minor);
+ state->ins_wds = vmem_create(c, (void *)1, UINT32_MAX, 1,
+ NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
+
+ avl_create(&state->ins_bywd,
+ (int(*)(const void *, const void *))inotify_watch_cmpwd,
+ sizeof (inotify_watch_t),
+ offsetof(inotify_watch_t, inw_bywd));
+
+ avl_create(&state->ins_byvp,
+ (int(*)(const void *, const void *))inotify_watch_cmpvp,
+ sizeof (inotify_watch_t),
+ offsetof(inotify_watch_t, inw_byvp));
+
+ list_create(&state->ins_orphans, sizeof (inotify_watch_t),
+ offsetof(inotify_watch_t, inw_orphan));
+
+ state->ins_maxwatches = inotify_maxwatches;
+ state->ins_maxevents = inotify_maxevents;
+
+ mutex_exit(&inotify_lock);
+
+ state->ins_cleaner = ddi_periodic_add(inotify_clean,
+ state, NANOSEC, DDI_IPL_0);
+
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+inotify_read(dev_t dev, uio_t *uio, cred_t *cr)
+{
+ inotify_state_t *state;
+ inotify_kevent_t *event;
+ minor_t minor = getminor(dev);
+ int err = 0, nevents = 0;
+ size_t len;
+
+ state = ddi_get_soft_state(inotify_softstate, minor);
+
+ mutex_enter(&state->ins_lock);
+
+ while (state->ins_head == NULL) {
+ if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
+ mutex_exit(&state->ins_lock);
+ return (EAGAIN);
+ }
+
+ if (!cv_wait_sig_swap(&state->ins_cv, &state->ins_lock)) {
+ mutex_exit(&state->ins_lock);
+ return (EINTR);
+ }
+ }
+
+ /*
+ * We have events and we have our lock; return as many as we can.
+ */
+ while ((event = state->ins_head) != NULL) {
+ len = sizeof (event->ine_event) + event->ine_event.len;
+
+ if (uio->uio_resid < len) {
+ if (nevents == 0)
+ err = EINVAL;
+ break;
+ }
+
+ nevents++;
+
+ if ((err = uiomove(&event->ine_event, len, UIO_READ, uio)) != 0)
+ break;
+
+ VERIFY(state->ins_nevents > 0);
+ state->ins_nevents--;
+
+ VERIFY(state->ins_size > 0);
+ state->ins_size -= len;
+
+ if ((state->ins_head = event->ine_next) == NULL) {
+ VERIFY(event == state->ins_tail);
+ VERIFY(state->ins_nevents == 0);
+ state->ins_tail = NULL;
+ }
+
+ kmem_free(event, INOTIFY_EVENT_LENGTH(event));
+ }
+
+ mutex_exit(&state->ins_lock);
+
+ return (err);
+}
+
+static int
+inotify_poll(dev_t dev, short events, int anyyet, short *reventsp,
+ struct pollhead **phpp)
+{
+ inotify_state_t *state;
+ minor_t minor = getminor(dev);
+
+ state = ddi_get_soft_state(inotify_softstate, minor);
+
+ mutex_enter(&state->ins_lock);
+
+ if (state->ins_head != NULL) {
+ *reventsp = events & (POLLRDNORM | POLLIN);
+ } else {
+ *reventsp = 0;
+ }
+
+ if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
+ *phpp = &state->ins_pollhd;
+ }
+
+ mutex_exit(&state->ins_lock);
+
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+inotify_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
+{
+ inotify_state_t *state;
+ minor_t minor = getminor(dev);
+ file_t *fp;
+ int rval;
+
+ state = ddi_get_soft_state(inotify_softstate, minor);
+
+ switch (cmd) {
+ case INOTIFYIOC_ADD_WATCH: {
+ inotify_addwatch_t addwatch;
+ file_t *fp;
+
+ if (copyin((void *)arg, &addwatch, sizeof (addwatch)) != 0)
+ return (EFAULT);
+
+ if ((fp = getf(addwatch.inaw_fd)) == NULL)
+ return (EBADF);
+
+ rval = inotify_add_watch(state, fp->f_vnode,
+ addwatch.inaw_mask, rv);
+
+ releasef(addwatch.inaw_fd);
+ return (rval);
+ }
+
+ case INOTIFYIOC_ADD_CHILD: {
+ inotify_addchild_t addchild;
+ char name[MAXPATHLEN];
+
+ if (copyin((void *)arg, &addchild, sizeof (addchild)) != 0)
+ return (EFAULT);
+
+ if (copyinstr(addchild.inac_name, name, MAXPATHLEN, NULL) != 0)
+ return (EFAULT);
+
+ if ((fp = getf(addchild.inac_fd)) == NULL)
+ return (EBADF);
+
+ rval = inotify_add_child(state, fp->f_vnode, name);
+
+ releasef(addchild.inac_fd);
+ return (rval);
+ }
+
+ case INOTIFYIOC_RM_WATCH:
+ return (inotify_rm_watch(state, arg));
+
+ case INOTIFYIOC_ACTIVATE:
+ return (inotify_activate(state, arg));
+
+ case FIONREAD: {
+ int32_t size;
+
+ mutex_enter(&state->ins_lock);
+ size = state->ins_size;
+ mutex_exit(&state->ins_lock);
+
+ if (copyout(&size, (void *)arg, sizeof (size)) != 0)
+ return (EFAULT);
+
+ return (0);
+ }
+
+ default:
+ break;
+ }
+
+ return (ENOTTY);
+}
+
+/*ARGSUSED*/
+static int
+inotify_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
+{
+ inotify_state_t *state, **sp;
+ inotify_watch_t *watch, *zombies;
+ inotify_kevent_t *event;
+ minor_t minor = getminor(dev);
+
+ state = ddi_get_soft_state(inotify_softstate, minor);
+
+ if (state->ins_pollhd.ph_list != NULL) {
+ pollwakeup(&state->ins_pollhd, POLLERR);
+ pollhead_clean(&state->ins_pollhd);
+ }
+
+ mutex_enter(&state->ins_lock);
+
+ /*
+ * First, destroy all of our watches.
+ */
+ while ((watch = avl_first(&state->ins_bywd)) != NULL)
+ inotify_watch_remove(state, watch);
+
+ /*
+ * And now destroy our event queue.
+ */
+ while ((event = state->ins_head) != NULL) {
+ state->ins_head = event->ine_next;
+ kmem_free(event, INOTIFY_EVENT_LENGTH(event));
+ }
+
+ zombies = state->ins_zombies;
+ state->ins_zombies = NULL;
+ mutex_exit(&state->ins_lock);
+
+ /*
+ * Now that our state lock is dropped, we can synchronously wait on
+ * any zombies.
+ */
+ while ((watch = zombies) != NULL) {
+ zombies = zombies->inw_parent;
+
+ mutex_enter(&watch->inw_lock);
+
+ while (watch->inw_refcnt > 1)
+ cv_wait(&watch->inw_cv, &watch->inw_lock);
+
+ inotify_watch_destroy(watch);
+ }
+
+ if (state->ins_cleaner != NULL) {
+ ddi_periodic_delete(state->ins_cleaner);
+ state->ins_cleaner = NULL;
+ }
+
+ mutex_enter(&inotify_lock);
+
+ /*
+ * Remove our state from our global list, and release our hold on
+ * the cred.
+ */
+ for (sp = &inotify_state; *sp != state; sp = &((*sp)->ins_next))
+ VERIFY(*sp != NULL);
+
+ *sp = (*sp)->ins_next;
+ crfree(state->ins_cred);
+ vmem_destroy(state->ins_wds);
+
+ ddi_soft_state_free(inotify_softstate, minor);
+ vmem_free(inotify_minor, (void *)(uintptr_t)minor, 1);
+
+ mutex_exit(&inotify_lock);
+
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+inotify_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
+{
+ mutex_enter(&inotify_lock);
+
+ if (ddi_soft_state_init(&inotify_softstate,
+ sizeof (inotify_state_t), 0) != 0) {
+ cmn_err(CE_NOTE, "/dev/inotify failed to create soft state");
+ mutex_exit(&inotify_lock);
+ return (DDI_FAILURE);
+ }
+
+ if (ddi_create_minor_node(devi, "inotify", S_IFCHR,
+ INOTIFYMNRN_INOTIFY, DDI_PSEUDO, 0) == DDI_FAILURE) {
+ cmn_err(CE_NOTE, "/dev/inotify couldn't create minor node");
+ ddi_soft_state_fini(&inotify_softstate);
+ mutex_exit(&inotify_lock);
+ return (DDI_FAILURE);
+ }
+
+ if (fem_create("inotify_fem",
+ inotify_vnodesrc_template, &inotify_femp) != 0) {
+ cmn_err(CE_NOTE, "/dev/inotify couldn't create FEM state");
+ ddi_remove_minor_node(devi, NULL);
+ ddi_soft_state_fini(&inotify_softstate);
+ mutex_exit(&inotify_lock);
+ return (DDI_FAILURE);
+ }
+
+ ddi_report_dev(devi);
+ inotify_devi = devi;
+
+ inotify_minor = vmem_create("inotify_minor", (void *)INOTIFYMNRN_CLONE,
+ UINT32_MAX - INOTIFYMNRN_CLONE, 1, NULL, NULL, NULL, 0,
+ VM_SLEEP | VMC_IDENTIFIER);
+
+ mutex_exit(&inotify_lock);
+
+ return (DDI_SUCCESS);
+}
+
+/*ARGSUSED*/
+static int
+inotify_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ switch (cmd) {
+ case DDI_DETACH:
+ break;
+
+ case DDI_SUSPEND:
+ return (DDI_SUCCESS);
+
+ default:
+ return (DDI_FAILURE);
+ }
+
+ mutex_enter(&inotify_lock);
+ fem_free(inotify_femp);
+ vmem_destroy(inotify_minor);
+
+ ddi_remove_minor_node(inotify_devi, NULL);
+ inotify_devi = NULL;
+
+ ddi_soft_state_fini(&inotify_softstate);
+ mutex_exit(&inotify_lock);
+
+ return (DDI_SUCCESS);
+}
+
+/*ARGSUSED*/
+static int
+inotify_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
+{
+ int error;
+
+ switch (infocmd) {
+ case DDI_INFO_DEVT2DEVINFO:
+ *result = (void *)inotify_devi;
+ error = DDI_SUCCESS;
+ break;
+ case DDI_INFO_DEVT2INSTANCE:
+ *result = (void *)0;
+ error = DDI_SUCCESS;
+ break;
+ default:
+ error = DDI_FAILURE;
+ }
+ return (error);
+}
+
+static struct cb_ops inotify_cb_ops = {
+ inotify_open, /* open */
+ inotify_close, /* close */
+ nulldev, /* strategy */
+ nulldev, /* print */
+ nodev, /* dump */
+ inotify_read, /* read */
+ nodev, /* write */
+ inotify_ioctl, /* ioctl */
+ nodev, /* devmap */
+ nodev, /* mmap */
+ nodev, /* segmap */
+ inotify_poll, /* poll */
+ ddi_prop_op, /* cb_prop_op */
+ 0, /* streamtab */
+ D_NEW | D_MP /* Driver compatibility flag */
+};
+
+static struct dev_ops inotify_ops = {
+ DEVO_REV, /* devo_rev */
+ 0, /* refcnt */
+ inotify_info, /* get_dev_info */
+ nulldev, /* identify */
+ nulldev, /* probe */
+ inotify_attach, /* attach */
+ inotify_detach, /* detach */
+ nodev, /* reset */
+ &inotify_cb_ops, /* driver operations */
+ NULL, /* bus operations */
+ nodev, /* dev power */
+ ddi_quiesce_not_needed, /* quiesce */
+};
+
+static struct modldrv modldrv = {
+ &mod_driverops, /* module type (this is a pseudo driver) */
+ "inotify support", /* name of module */
+ &inotify_ops, /* driver ops */
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1,
+ (void *)&modldrv,
+ NULL
+};
+
+int
+_init(void)
+{
+ return (mod_install(&modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+ return (mod_remove(&modlinkage));
+}
diff --git a/usr/src/uts/common/io/inotify.conf b/usr/src/uts/common/io/inotify.conf
new file mode 100644
index 0000000000..ce9da6180f
--- /dev/null
+++ b/usr/src/uts/common/io/inotify.conf
@@ -0,0 +1,16 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+name="inotify" parent="pseudo" instance=0;
diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_main.c b/usr/src/uts/common/io/ixgbe/ixgbe_main.c
index 86c46bee44..9966641df4 100644
--- a/usr/src/uts/common/io/ixgbe/ixgbe_main.c
+++ b/usr/src/uts/common/io/ixgbe/ixgbe_main.c
@@ -284,7 +284,7 @@ static adapter_info_t ixgbe_82599eb_cap = {
128, /* default number of rx queues */
64, /* maximum number of rx groups */
1, /* minimum number of rx groups */
- 1, /* default number of rx groups */
+ 32, /* default number of rx groups */
128, /* maximum number of tx queues */
1, /* minimum number of tx queues */
8, /* default number of tx queues */
@@ -315,7 +315,7 @@ static adapter_info_t ixgbe_X540_cap = {
128, /* default number of rx queues */
64, /* maximum number of rx groups */
1, /* minimum number of rx groups */
- 1, /* default number of rx groups */
+ 32, /* default number of rx groups */
128, /* maximum number of tx queues */
1, /* minimum number of tx queues */
8, /* default number of tx queues */
@@ -2049,6 +2049,7 @@ ixgbe_cbfunc(dev_info_t *dip, ddi_cb_action_t cbaction, void *cbarg,
void *arg1, void *arg2)
{
ixgbe_t *ixgbe = (ixgbe_t *)arg1;
+ int prev = ixgbe->intr_cnt;
switch (cbaction) {
/* IRM callback */
@@ -2062,7 +2063,8 @@ ixgbe_cbfunc(dev_info_t *dip, ddi_cb_action_t cbaction, void *cbarg,
if (ixgbe_intr_adjust(ixgbe, cbaction, count) !=
DDI_SUCCESS) {
ixgbe_error(ixgbe,
- "IRM CB: Failed to adjust interrupts");
+ "IRM CB: Failed to adjust interrupts [%d %d %d]",
+ cbaction, count, prev);
goto cb_fail;
}
break;
diff --git a/usr/src/uts/common/io/ksocket/ksocket.c b/usr/src/uts/common/io/ksocket/ksocket.c
index a3cd9dfbb1..0a5eec209f 100644
--- a/usr/src/uts/common/io/ksocket/ksocket.c
+++ b/usr/src/uts/common/io/ksocket/ksocket.c
@@ -22,7 +22,7 @@
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
*/
#include <sys/file.h>
@@ -932,3 +932,15 @@ ksocket_rele(ksocket_t ks)
cv_signal(&so->so_closing_cv);
}
}
+
+int
+ksocket_krecv_set(ksocket_t ks, ksocket_krecv_f cb, void *arg)
+{
+ return (so_krecv_set(KSTOSO(ks), (so_krecv_f)cb, arg));
+}
+
+void
+ksocket_krecv_unblock(ksocket_t ks)
+{
+ so_krecv_unblock(KSTOSO(ks));
+}
diff --git a/usr/src/uts/common/io/ksocket/ksocket_impl.h b/usr/src/uts/common/io/ksocket/ksocket_impl.h
index ac5251540f..516a68d358 100644
--- a/usr/src/uts/common/io/ksocket/ksocket_impl.h
+++ b/usr/src/uts/common/io/ksocket/ksocket_impl.h
@@ -22,11 +22,17 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2015, Joyent, Inc.
*/
#ifndef _INET_KSOCKET_KSOCKET_IMPL_H
#define _INET_KSOCKET_KSOCKET_IMPL_H
+/*
+ * Note that if this relationship ever changes, the logic in ksocket_krecv_set
+ * must be updated and we must maintain local state about this on whatever the
+ * new ksocket object is.
+ */
#define KSTOSO(ks) ((struct sonode *)(ks))
#define SOTOKS(so) ((ksocket_t)(uintptr_t)(so))
diff --git a/usr/src/uts/common/io/ksyms.c b/usr/src/uts/common/io/ksyms.c
index 74e71ed7e8..759b524186 100644
--- a/usr/src/uts/common/io/ksyms.c
+++ b/usr/src/uts/common/io/ksyms.c
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
*/
@@ -219,6 +220,14 @@ ksyms_open(dev_t *devp, int flag, int otyp, struct cred *cred)
char *addr;
void *hptr = NULL;
ksyms_buflist_hdr_t hdr;
+
+ /*
+ * This device should never be visible in a zone, but if it somehow
+ * does get created we refuse to allow the zone to use it.
+ */
+ if (crgetzoneid(cred) != GLOBAL_ZONEID)
+ return (EACCES);
+
bzero(&hdr, sizeof (struct ksyms_buflist_hdr));
list_create(&hdr.blist, PAGESIZE,
offsetof(ksyms_buflist_t, buflist_node));
diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c
index 41d3ee5fe1..4ce359f87b 100644
--- a/usr/src/uts/common/io/mac/mac.c
+++ b/usr/src/uts/common/io/mac/mac.c
@@ -159,7 +159,7 @@
* perimeter) across a call to any other layer from the mac layer. The call to
* any other layer could be via mi_* entry points, classifier entry points into
* the driver or via upcall pointers into layers above. The mac perimeter may
- * be acquired or held only in the down direction, for e.g. when calling into
+ * be acquired or held only in the down direction, e.g. when calling into
* a mi_* driver enty point to provide atomicity of the operation.
*
* R8. Since it is not guaranteed (see R14) that drivers won't hold locks across
@@ -208,7 +208,7 @@
* number whenever the ring's stop routine is invoked.
* See comments in mac_rx_ring();
*
- * R17 Similarly mi_stop is another synchronization point and the driver must
+ * R17. Similarly mi_stop is another synchronization point and the driver must
* ensure that all upcalls are done and there won't be any future upcall
* before returning from mi_stop.
*
@@ -228,7 +228,7 @@
*
* cpu_lock -> mac_srs_g_lock -> srs_lock -> s_ring_lock [mac_walk_srs_and_bind]
*
- * i_dls_devnet_lock -> mac layer locks [dls_devnet_rename]
+ * mac perim -> i_dls_devnet_lock [dls_devnet_rename]
*
* Perimeters are ordered P1 -> P2 -> P3 from top to bottom in order of mac
* client to driver. In the case of clients that explictly use the mac provided
diff --git a/usr/src/uts/common/io/mac/mac_client.c b/usr/src/uts/common/io/mac/mac_client.c
index 605cb51bf7..b166e7987a 100644
--- a/usr/src/uts/common/io/mac/mac_client.c
+++ b/usr/src/uts/common/io/mac/mac_client.c
@@ -3348,6 +3348,11 @@ mac_promisc_add(mac_client_handle_t mch, mac_client_promisc_type_t type,
mac_cb_info_t *mcbi;
int rc;
+ if ((flags & MAC_PROMISC_FLAGS_NO_COPY) &&
+ (flags & MAC_PROMISC_FLAGS_DO_FIXUPS)) {
+ return (EINVAL);
+ }
+
i_mac_perim_enter(mip);
if ((rc = mac_start((mac_handle_t)mip)) != 0) {
@@ -3394,6 +3399,7 @@ mac_promisc_add(mac_client_handle_t mch, mac_client_promisc_type_t type,
mpip->mpi_strip_vlan_tag =
((flags & MAC_PROMISC_FLAGS_VLAN_TAG_STRIP) != 0);
mpip->mpi_no_copy = ((flags & MAC_PROMISC_FLAGS_NO_COPY) != 0);
+ mpip->mpi_do_fixups = ((flags & MAC_PROMISC_FLAGS_DO_FIXUPS) != 0);
mcbi = &mip->mi_promisc_cb_info;
mutex_enter(mcbi->mcbi_lockp);
@@ -4104,13 +4110,33 @@ mac_promisc_dispatch_one(mac_promisc_impl_t *mpip, mblk_t *mp,
{
mblk_t *mp_next;
- if (!mpip->mpi_no_copy || mpip->mpi_strip_vlan_tag) {
+ if (!mpip->mpi_no_copy || mpip->mpi_strip_vlan_tag ||
+ (mpip->mpi_do_fixups && local)) {
mblk_t *mp_copy;
mp_copy = copymsg(mp);
if (mp_copy == NULL)
return;
+ /*
+ * The consumer has requested we emulate HW offloads
+ * for host-local packets.
+ */
+ if (mpip->mpi_do_fixups && local) {
+ /*
+ * Remember that copymsg() doesn't copy
+ * b_next, so we are only passing a single
+ * packet to mac_hw_emul(). Also keep in mind
+ * that mp_copy will become an mblk chain if
+ * the argument is an LSO message.
+ */
+ mac_hw_emul(&mp_copy, NULL, NULL,
+ MAC_HWCKSUM_EMUL | MAC_LSO_EMUL);
+
+ if (mp_copy == NULL)
+ return;
+ }
+
if (mpip->mpi_strip_vlan_tag) {
mp_copy = mac_strip_vlan_tag_chain(mp_copy);
if (mp_copy == NULL)
@@ -4319,16 +4345,15 @@ mac_info_get(const char *name, mac_info_t *minfop)
/*
* To get the capabilities that MAC layer cares about, such as rings, factory
* mac address, vnic or not, it should directly invoke this function. If the
- * link is part of a bridge, then the only "capability" it has is the inability
- * to do zero copy.
+ * link is part of a bridge, then the link is unable to do zero copy.
*/
boolean_t
i_mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data)
{
mac_impl_t *mip = (mac_impl_t *)mh;
- if (mip->mi_bridge_link != NULL) {
- return (cap == MAC_CAPAB_NO_ZCOPY);
+ if (mip->mi_bridge_link != NULL && cap == MAC_CAPAB_NO_ZCOPY) {
+ return (B_TRUE);
} else if (mip->mi_callbacks->mc_callbacks & MC_GETCAPAB) {
boolean_t res;
@@ -4524,7 +4549,13 @@ mac_addr_len(mac_handle_t mh)
boolean_t
mac_is_vnic(mac_handle_t mh)
{
- return (((mac_impl_t *)mh)->mi_state_flags & MIS_IS_VNIC);
+ return ((((mac_impl_t *)mh)->mi_state_flags & MIS_IS_VNIC) != 0);
+}
+
+boolean_t
+mac_is_overlay(mac_handle_t mh)
+{
+ return ((((mac_impl_t *)mh)->mi_state_flags & MIS_IS_OVERLAY) != 0);
}
mac_handle_t
diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c
index 038b8460aa..4819cd86df 100644
--- a/usr/src/uts/common/io/mac/mac_datapath_setup.c
+++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c
@@ -605,6 +605,7 @@ mac_srs_cpu_setup(cpu_setup_t what, int id, void *arg)
*
* TODO: Cleanup and tighten some of the assumptions.
*/
+boolean_t mac_check_overlay = B_TRUE;
boolean_t mac_use_bw_heuristic = B_TRUE;
static int
mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus)
@@ -612,6 +613,7 @@ mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus)
uint64_t cpu_speed, bw = 0;
int srings = 0;
boolean_t bw_enabled = B_FALSE;
+ mac_client_impl_t *mcip = flent->fe_mcip;
ASSERT(!(flent->fe_type & FLOW_USER));
if (flent->fe_resource_props.mrp_mask & MRP_MAXBW &&
@@ -639,7 +641,16 @@ mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus)
*/
if (mac_soft_ring_enable)
srings = srings * 2;
+ } else if (mac_check_overlay == B_TRUE &&
+ (mcip->mci_state_flags & MCIS_IS_VNIC) != 0) {
+ /* Is this a VNIC on an overlay? */
+ mac_handle_t mh = (mac_handle_t)mcip->mci_mip;
+ if (mac_is_overlay(mh) == B_TRUE) {
+ srings = mac_rx_soft_ring_10gig_count;
+ }
}
+
+
} else {
/*
* Soft ring computation using CPU speed and specified
@@ -2203,7 +2214,14 @@ mac_srs_create(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t srs_type,
mac_srs->srs_state |= SRS_SOFTRING_QUEUE;
}
- mac_srs->srs_worker = thread_create(NULL, 0,
+ /*
+ * Create the srs_worker with twice the stack of a normal kernel thread
+ * to reduce the likelihood of stack overflows in receive-side
+ * processing. (The larger stacks are not the only precaution taken
+ * against stack overflows; see the use of the MAC_RX_SRS_TOODEEP
+ * macro for details.)
+ */
+ mac_srs->srs_worker = thread_create(NULL, default_stksize << 1,
mac_srs_worker, mac_srs, 0, &p0, TS_RUN, mac_srs->srs_pri);
if (is_tx_srs) {
diff --git a/usr/src/uts/common/io/mac/mac_protect.c b/usr/src/uts/common/io/mac/mac_protect.c
index da83dc643e..ee493bbca1 100644
--- a/usr/src/uts/common/io/mac/mac_protect.c
+++ b/usr/src/uts/common/io/mac/mac_protect.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2015, Joyent, Inc. All rights reserved.
+ * Copyright 2017, Joyent, Inc. All rights reserved.
*/
/*
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
@@ -209,7 +209,7 @@ typedef struct slaac_addr {
} slaac_addr_t;
static void start_txn_cleanup_timer(mac_client_impl_t *);
-static boolean_t allowed_ips_set(mac_resource_props_t *, uint32_t);
+static boolean_t dynamic_method_set(mac_protect_t *, uint32_t);
#define BUMP_STAT(m, s) (m)->mci_misc_stat.mms_##s++
@@ -580,8 +580,7 @@ intercept_dhcpv4_outbound(mac_client_impl_t *mcip, ipha_t *ipha, uchar_t *end)
if (get_dhcpv4_info(ipha, end, &dh4) != 0)
return (B_TRUE);
- /* ip_nospoof/allowed-ips and DHCP are mutually exclusive by default */
- if (allowed_ips_set(mrp, IPV4_VERSION))
+ if (!dynamic_method_set(&mrp->mrp_protect, MPT_DYN_DHCPV4))
return (B_FALSE);
if (get_dhcpv4_option(dh4, end, CD_DHCP_TYPE, &opt, &opt_len) != 0 ||
@@ -1310,8 +1309,7 @@ intercept_dhcpv6_outbound(mac_client_impl_t *mcip, ip6_t *ip6h, uchar_t *end)
if (get_dhcpv6_info(ip6h, end, &dh6) != 0)
return (B_TRUE);
- /* ip_nospoof/allowed-ips and DHCP are mutually exclusive by default */
- if (allowed_ips_set(mrp, IPV6_VERSION))
+ if (!dynamic_method_set(&mrp->mrp_protect, MPT_DYN_DHCPV6))
return (B_FALSE);
/*
@@ -1517,6 +1515,10 @@ intercept_ra_inbound(mac_client_impl_t *mcip, ip6_t *ip6h, uchar_t *end,
{
struct nd_opt_hdr *opt;
int len, optlen;
+ mac_protect_t *protect = &MCIP_RESOURCE_PROPS(mcip)->mrp_protect;
+
+ if (!dynamic_method_set(protect, MPT_DYN_SLAAC))
+ return;
if (ip6h->ip6_hlim != 255) {
DTRACE_PROBE1(invalid__hoplimit, uint8_t, ip6h->ip6_hlim);
@@ -1755,6 +1757,7 @@ ipnospoof_check_v4(mac_client_impl_t *mcip, mac_protect_t *protect,
if (*addr == INADDR_ANY)
return (B_TRUE);
+ /* If any specific addresses or subnets are allowed, check them */
for (i = 0; i < protect->mp_ipaddrcnt; i++) {
mac_ipaddr_t *v4addr = &protect->mp_ipaddrs[i];
@@ -1775,14 +1778,19 @@ ipnospoof_check_v4(mac_client_impl_t *mcip, mac_protect_t *protect,
return (B_TRUE);
}
}
- return (protect->mp_ipaddrcnt == 0 ?
- check_dhcpv4_dyn_ip(mcip, *addr) : B_FALSE);
+
+ if (dynamic_method_set(protect, MPT_DYN_DHCPV4)) {
+ return (check_dhcpv4_dyn_ip(mcip, *addr));
+ }
+
+ return (B_FALSE);
}
static boolean_t
ipnospoof_check_v6(mac_client_impl_t *mcip, mac_protect_t *protect,
in6_addr_t *addr)
{
+ boolean_t slaac_enabled, dhcpv6_enabled;
uint_t i;
/*
@@ -1793,7 +1801,7 @@ ipnospoof_check_v6(mac_client_impl_t *mcip, mac_protect_t *protect,
IN6_ARE_ADDR_EQUAL(&mcip->mci_v6_local_addr, addr)))
return (B_TRUE);
-
+ /* If any specific addresses or subnets are allowed, check them */
for (i = 0; i < protect->mp_ipaddrcnt; i++) {
mac_ipaddr_t *v6addr = &protect->mp_ipaddrs[i];
@@ -1804,12 +1812,15 @@ ipnospoof_check_v6(mac_client_impl_t *mcip, mac_protect_t *protect,
return (B_TRUE);
}
- if (protect->mp_ipaddrcnt == 0) {
- return (check_slaac_ip(mcip, addr) ||
- check_dhcpv6_dyn_ip(mcip, addr));
- } else {
- return (B_FALSE);
- }
+ slaac_enabled = dynamic_method_set(protect, MPT_DYN_SLAAC);
+ if (slaac_enabled && check_slaac_ip(mcip, addr))
+ return (B_TRUE);
+
+ dhcpv6_enabled = dynamic_method_set(protect, MPT_DYN_DHCPV6);
+ if (dhcpv6_enabled && check_dhcpv6_dyn_ip(mcip, addr))
+ return (B_TRUE);
+
+ return (B_FALSE);
}
/*
@@ -2025,6 +2036,9 @@ dhcpnospoof_check_cid(mac_protect_t *p, uchar_t *cid, uint_t cidlen)
bcmp(dcid->dc_id, cid, cidlen) == 0)
return (B_TRUE);
}
+
+ DTRACE_PROBE3(missing__cid, mac_protect_t *, p,
+ uchar_t *, cid, uint_t, cidlen);
return (B_FALSE);
}
@@ -2046,6 +2060,12 @@ dhcpnospoof_check_v4(mac_client_impl_t *mcip, mac_protect_t *p,
bcmp(mcip->mci_unicast->ma_addr, dh4->chaddr, maclen) != 0) {
return (B_FALSE);
}
+
+ /* Everything after here is checking the Client Identifier */
+ if (p->mp_allcids == MPT_TRUE) {
+ return (B_TRUE);
+ }
+
if (get_dhcpv4_option(dh4, end, CD_CLIENT_ID, &cid, &optlen) == 0)
cidlen = optlen;
@@ -2082,6 +2102,11 @@ dhcpnospoof_check_v6(mac_client_impl_t *mcip, mac_protect_t *p,
mtype == DHCPV6_MSG_RECONFIGURE)
return (B_TRUE);
+ /* Everything after here is checking the Client Identifier */
+ if (p->mp_allcids == MPT_TRUE) {
+ return (B_TRUE);
+ }
+
d6o = get_dhcpv6_option(&dh6[1], end - (uchar_t *)&dh6[1], NULL,
DHCPV6_OPT_CLIENTID, &cidlen);
if (d6o == NULL || (uchar_t *)d6o + cidlen > end)
@@ -2159,7 +2184,6 @@ dhcpnospoof_check(mac_client_impl_t *mcip, mac_protect_t *protect,
return (0);
fail:
- /* increment dhcpnospoof stat here */
freemsg(nmp);
return (err);
}
@@ -2487,6 +2511,11 @@ mac_protect_validate(mac_resource_props_t *mrp)
if ((err = validate_cids(p)) != 0)
return (err);
+ if (p->mp_allcids != MPT_FALSE && p->mp_allcids != MPT_TRUE &&
+ p->mp_allcids != MPT_RESET) {
+ return (EINVAL);
+ }
+
return (0);
}
@@ -2554,6 +2583,16 @@ mac_protect_update(mac_resource_props_t *new, mac_resource_props_t *curr)
cp->mp_cidcnt = 0;
}
}
+ if (np->mp_allcids == MPT_RESET) {
+ cp->mp_allcids = MPT_FALSE;
+ } else if (np->mp_allcids != 0) {
+ cp->mp_allcids = MPT_TRUE;
+ }
+ if (np->mp_dynamic == MPT_RESET) {
+ cp->mp_dynamic = 0;
+ } else if (np->mp_dynamic != 0) {
+ cp->mp_dynamic = np->mp_dynamic;
+ }
}
void
@@ -2597,15 +2636,50 @@ mac_protect_fini(mac_client_impl_t *mcip)
}
static boolean_t
-allowed_ips_set(mac_resource_props_t *mrp, uint32_t af)
+dynamic_method_set(mac_protect_t *mpt, uint32_t method)
+{
+ if (mpt->mp_dynamic != 0) {
+ return ((mpt->mp_dynamic & method) != 0);
+ } else {
+ return (mpt->mp_ipaddrcnt == 0);
+ }
+}
+
+boolean_t
+mac_protect_check_addr(mac_client_handle_t mch, boolean_t isv6,
+ in6_addr_t *v6addr)
{
- int i;
+ mac_perim_handle_t perim;
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_handle_t mh = (mac_handle_t)mcip->mci_mip;
- for (i = 0; i < mrp->mrp_protect.mp_ipaddrcnt; i++) {
- if (mrp->mrp_protect.mp_ipaddrs[i].ip_version == af)
- return (B_TRUE);
+ mac_perim_enter_by_mh(mh, &perim);
+
+ mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip);
+ mac_protect_t *p;
+ boolean_t allowed;
+
+ ASSERT(mrp != NULL);
+
+ p = &mrp->mrp_protect;
+
+ /* If mac protection/ipnospoof isn't enabled, return true */
+ if ((mrp->mrp_mask & MRP_PROTECT) == 0 ||
+ (p->mp_types & MPT_IPNOSPOOF) == 0) {
+ allowed = B_TRUE;
+ goto done;
}
- return (B_FALSE);
+
+ if (isv6) {
+ allowed = ipnospoof_check_v6(mcip, p, v6addr);
+ } else {
+ in_addr_t *v4addr = &V4_PART_OF_V6((*v6addr));
+ allowed = ipnospoof_check_v4(mcip, p, v4addr);
+ }
+
+done:
+ mac_perim_exit(perim);
+ return (allowed);
}
mac_protect_t *
diff --git a/usr/src/uts/common/io/mac/mac_provider.c b/usr/src/uts/common/io/mac/mac_provider.c
index bfaf232d25..bcca602589 100644
--- a/usr/src/uts/common/io/mac/mac_provider.c
+++ b/usr/src/uts/common/io/mac/mac_provider.c
@@ -393,6 +393,9 @@ mac_register(mac_register_t *mregp, mac_handle_t *mhp)
if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_AGGR, NULL))
mip->mi_state_flags |= MIS_IS_AGGR;
+ if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_OVERLAY, NULL))
+ mip->mi_state_flags |= MIS_IS_OVERLAY;
+
mac_addr_factory_init(mip);
mac_transceiver_init(mip);
diff --git a/usr/src/uts/common/io/mac/mac_sched.c b/usr/src/uts/common/io/mac/mac_sched.c
index 5b3e87dfd1..8f983e50e4 100644
--- a/usr/src/uts/common/io/mac/mac_sched.c
+++ b/usr/src/uts/common/io/mac/mac_sched.c
@@ -1367,11 +1367,11 @@ int mac_srs_worker_wakeup_ticks = 0;
* can occur in situ (in the interrupt thread) or if it should be left to a
* worker thread. Note that the constant used to make this determination is
* not entirely made-up, and is a result of some emprical validation. That
- * said, the constant is left as a static variable to allow it to be
+ * said, the constant is left as a global variable to allow it to be
* dynamically tuned in the field if and as needed.
*/
-static uintptr_t mac_rx_srs_stack_needed = 10240;
-static uint_t mac_rx_srs_stack_toodeep;
+uintptr_t mac_rx_srs_stack_needed = 14336;
+uint_t mac_rx_srs_stack_toodeep;
#ifndef STACK_GROWTH_DOWN
#error Downward stack growth assumed.
@@ -1379,7 +1379,7 @@ static uint_t mac_rx_srs_stack_toodeep;
#define MAC_RX_SRS_TOODEEP() (STACK_BIAS + (uintptr_t)getfp() - \
(uintptr_t)curthread->t_stkbase < mac_rx_srs_stack_needed && \
- ++mac_rx_srs_stack_toodeep)
+ (++mac_rx_srs_stack_toodeep || (mac_rx_srs_stack_toodeep = 1)))
/*
diff --git a/usr/src/uts/common/io/mac/mac_stat.c b/usr/src/uts/common/io/mac/mac_stat.c
index dbb5c0a914..e1151565a6 100644
--- a/usr/src/uts/common/io/mac/mac_stat.c
+++ b/usr/src/uts/common/io/mac/mac_stat.c
@@ -391,8 +391,8 @@ i_mac_stat_create(void *handle, const char *modname, const char *statname,
kstat_t *ksp;
kstat_named_t *knp;
- ksp = kstat_create(modname, 0, statname, "net",
- KSTAT_TYPE_NAMED, count, 0);
+ ksp = kstat_create_zone(modname, 0, statname, "net",
+ KSTAT_TYPE_NAMED, count, 0, getzoneid());
if (ksp == NULL)
return (NULL);
@@ -949,9 +949,9 @@ mac_driver_stat_create(mac_impl_t *mip)
major_t major = getmajor(mip->mi_phy_dev);
count = MAC_MOD_NKSTAT + MAC_NKSTAT + mip->mi_type->mt_statcount;
- ksp = kstat_create((const char *)ddi_major_to_name(major),
+ ksp = kstat_create_zone((const char *)ddi_major_to_name(major),
getminor(mip->mi_phy_dev) - 1, MAC_KSTAT_NAME,
- MAC_KSTAT_CLASS, KSTAT_TYPE_NAMED, count, 0);
+ MAC_KSTAT_CLASS, KSTAT_TYPE_NAMED, count, 0, getzoneid());
if (ksp == NULL)
return;
diff --git a/usr/src/uts/common/io/mem.c b/usr/src/uts/common/io/mem.c
index 950fab1272..fcea4a8f03 100644
--- a/usr/src/uts/common/io/mem.c
+++ b/usr/src/uts/common/io/mem.c
@@ -225,10 +225,19 @@ mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
case M_NULL:
case M_ZERO:
case M_FULL:
+ /* standard devices */
+ break;
+
case M_MEM:
case M_KMEM:
case M_ALLKMEM:
- /* standard devices */
+ /*
+ * These devices should never be visible in a zone, but if they
+ * somehow do get created we refuse to allow the zone to use
+ * them.
+ */
+ if (crgetzoneid(cred) != GLOBAL_ZONEID)
+ return (EACCES);
break;
default:
diff --git a/usr/src/uts/common/io/mr_sas/mr_sas.conf b/usr/src/uts/common/io/mr_sas/mr_sas.conf
index cfda434e23..6c585c6a42 100644
--- a/usr/src/uts/common/io/mr_sas/mr_sas.conf
+++ b/usr/src/uts/common/io/mr_sas/mr_sas.conf
@@ -13,3 +13,11 @@
# Fast-Path specific flag. Default is "yes".
# mrsas-enable-fp="yes";
+flow_control="dmult" queue="qsort" tape="sctp";
+
+# MSI specific flag. To enable MSI modify the flag value to "yes"
+mrsas-enable-msi="yes";
+
+# Fast-Path specific flag. To enable Fast-Path modify the flag value to "yes"
+mrsas-enable-fp="yes";
+
diff --git a/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE
new file mode 100644
index 0000000000..187088ff34
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2014, Thales UK Limited
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip
new file mode 100644
index 0000000000..cde8b65b37
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip
@@ -0,0 +1 @@
+NFAST CRYPTO ACCELERATOR DRIVER
diff --git a/usr/src/uts/common/io/nfp/autoversion.h b/usr/src/uts/common/io/nfp/autoversion.h
new file mode 100644
index 0000000000..b9021942b2
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/autoversion.h
@@ -0,0 +1,21 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+
+/* AUTOGENERATED - DO NOT EDIT */
+#ifndef AUTOVERSION_H
+#define AUTOVERSION_H
+
+#define VERSION_RELEASEMAJOR 2
+#define VERSION_RELEASEMINOR 26
+#define VERSION_RELEASEPATCH 40
+#define VERSION_NO "2.26.40cam999"
+#define VERSION_COMPNAME "nfdrv"
+
+#endif
diff --git a/usr/src/uts/common/io/nfp/drvlist.c b/usr/src/uts/common/io/nfp/drvlist.c
new file mode 100644
index 0000000000..a04b1fd5b0
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/drvlist.c
@@ -0,0 +1,19 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+
+#include "nfp_common.h"
+#include "nfp_cmd.h"
+
+const nfpcmd_dev *nfp_drvlist[] = {
+ &i21285_cmddev,
+ &i21555_cmddev,
+ NULL
+};
+
diff --git a/usr/src/uts/common/io/nfp/hostif.c b/usr/src/uts/common/io/nfp/hostif.c
new file mode 100644
index 0000000000..684be703ea
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/hostif.c
@@ -0,0 +1,1192 @@
+/*
+
+hostif.c: nFast PCI driver for Solaris 2.5, 2.6, 2.7 and 2.8
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+history
+
+06/05/1998 jsh Original solaris 2.6
+21/05/1999 jsh added support for solaris 2.5
+10/06/1999 jsh added support for solaris 2.7 (32 and 64 bit)
+??/??/2001 jsh added support for solaris 2.8 (32 and 64 bit)
+16/10/2001 jsh moved from nfast to new structure in nfdrv
+12/02/2002 jsh added high level interrupt support
+
+*/
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/conf.h>
+#include <sys/uio.h>
+#include <sys/map.h>
+#include <sys/debug.h>
+#include <sys/modctl.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/open.h>
+#include <sys/stat.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/pci.h>
+
+#include "nfp_common.h"
+#include "nfp_hostif.h"
+#include "nfp_osif.h"
+#include "nfp_cmd.h"
+
+#include "nfp.h"
+
+/* mapped memory attributes, no-swap endianess (done in higher level) */
+static struct ddi_device_acc_attr nosw_attr = {
+ DDI_DEVICE_ATTR_V0,
+ DDI_NEVERSWAP_ACC,
+ DDI_STRICTORDER_ACC
+};
+
+/* dma attributes */
+static ddi_dma_attr_t dma_attrs = {
+ DMA_ATTR_V0, /* version number */
+ (uint64_t)0x0, /* low address */
+ (uint64_t)0xffffffff, /* high address */
+ (uint64_t)0xffffff, /* DMA counter max */
+ (uint64_t)0x1, /* alignment */
+ 0x0c, /* burst sizes */
+ 0x1, /* minimum transfer size */
+ (uint64_t)0x3ffffff, /* maximum transfer size */
+ (uint64_t)0x7fff, /* maximum segment size */
+ 1, /* no scatter/gather lists */
+ 1, /* granularity */
+ 0 /* DMA flags */
+};
+
+/*
+ * Debug message control
+ * Debug Levels:
+ * 0 = no messages
+ * 1 = Errors
+ * 2 = Subroutine calls & control flow
+ * 3 = I/O Data (verbose!)
+ * Can be set with adb or in the /etc/system file with
+ * "set nfp:nfp_debug=<value>"
+ */
+
+int nfp_debug= 1;
+
+static void *state_head; /* opaque handle top of state structs */
+
+static int nfp_open(dev_t *dev, int openflags, int otyp, cred_t *credp);
+static int nfp_close(dev_t dev, int openflags, int otyp, cred_t *credp);
+static int nfp_release_dev( dev_info_t *dip );
+
+static int nfp_read(dev_t dev, struct uio *uiop, cred_t *credp);
+static int nfp_write(dev_t dev, struct uio *uiop, cred_t *credp);
+static int nfp_strategy(struct buf *bp);
+
+static int nfp_ioctl(dev_t dev, int cmd, ioctlptr_t arg, int mode, cred_t *credp, int *rvalp);
+static int nfp_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
+ struct pollhead **phpp);
+
+static void nfp_wrtimeout (void *pdev);
+static void nfp_rdtimeout (void *pdev);
+
+static int nfp_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result);
+static int nfp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
+static int nfp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
+
+static void nfp_read_complete_final(nfp_dev *pdev, int ok);
+static void nfp_write_complete_final(nfp_dev *pdev, int ok);
+
+/* nfp file ops --------------------------------------------------- */
+
+static struct cb_ops nfp_cb_ops = {
+ nfp_open,
+ nfp_close,
+ nodev, /* no nfp_strategy */
+ nodev, /* no print routine */
+ nodev, /* no dump routine */
+ nfp_read,
+ nfp_write,
+ nfp_ioctl,
+ nodev, /* no devmap routine */
+ nodev, /* no mmap routine */
+ nodev, /* no segmap routine */
+ nfp_chpoll,
+ ddi_prop_op,
+ 0, /* not a STREAMS driver, no cb_str routine */
+ D_NEW | D_MP | EXTRA_CB_FLAGS, /* must be safe for multi-thread/multi-processor */
+ CB_REV,
+ nodev, /* aread */
+ nodev /* awrite */
+};
+
+static struct dev_ops nfp_ops = {
+ DEVO_REV, /* DEVO_REV indicated by manual */
+ 0, /* device reference count */
+ nfp_getinfo,
+ nulldev, /* identify */
+ nulldev, /* probe */
+ nfp_attach,
+ nfp_detach,
+ nodev, /* device reset routine */
+ &nfp_cb_ops,
+ (struct bus_ops *)0, /* bus operations */
+};
+
+extern struct mod_ops mod_driverops;
+static struct modldrv modldrv = {
+ &mod_driverops,
+ NFP_DRVNAME,
+ &nfp_ops,
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, /* MODREV_1 indicated by manual */
+ (void *)&modldrv,
+ NULL, /* termination of list of linkage structures */
+};
+
+/* interface resource allocation */
+
+int nfp_alloc_pci_push( nfp_dev *pdev ) {
+ /* allocate resources needed for PCI Push,
+ * if not already allocated.
+ * return True if successful
+ */
+ nfp_err ret;
+ uint_t cookie_count;
+ size_t real_length;
+
+ if(!pdev->read_buf) {
+ /* allocate read buffer */
+ pdev->read_buf = kmem_zalloc( NFP_READBUF_SIZE, KM_NOSLEEP );
+ }
+ if(!pdev->read_buf) {
+ nfp_log( NFP_DBG1, "nfp_attach: kmem_zalloc read buffer failed");
+ pdev->read_buf = NULL;
+ return 0;
+ }
+
+ if(!pdev->rd_dma_ok) {
+ /* allocate dma handle for read buffer */
+ ret = ddi_dma_alloc_handle( pdev->dip,
+ &dma_attrs,
+ DDI_DMA_DONTWAIT,
+ NULL,
+ &pdev->read_dma_handle );
+ if( ret != DDI_SUCCESS ) {
+ nfp_log( NFP_DBG1,
+ "nfp_alloc_pci_push: ddi_dma_alloc_handle failed (%d)",
+ ret );
+ return 0;
+ }
+
+ /* Allocate the memory for dma transfers */
+ ret = ddi_dma_mem_alloc(pdev->read_dma_handle, NFP_READBUF_SIZE, &nosw_attr,
+ DDI_DMA_CONSISTENT, DDI_DMA_DONTWAIT, NULL,
+ (caddr_t*)&pdev->read_buf, &real_length, &pdev->acchandle);
+ if (ret != DDI_SUCCESS) {
+ nfp_log( NFP_DBG1, "nfp_alloc_pci_push: ddi_dma_mem_alloc failed (%d)", ret);
+ ddi_dma_free_handle( &pdev->read_dma_handle );
+ return 0;
+ }
+
+ ret = ddi_dma_addr_bind_handle( pdev->read_dma_handle,
+ NULL, /* kernel address space */
+ (caddr_t)pdev->read_buf, real_length,
+ DDI_DMA_READ | DDI_DMA_CONSISTENT, /* dma flags */
+ DDI_DMA_DONTWAIT, NULL,
+ &pdev->read_dma_cookie, &cookie_count );
+ if( ret != DDI_DMA_MAPPED ) {
+ nfp_log( NFP_DBG1,
+ "nfp_alloc_pci_push: ddi_dma_addr_bind_handle failed (%d)",
+ ret);
+ ddi_dma_mem_free(&pdev->acchandle);
+ ddi_dma_free_handle( &pdev->read_dma_handle );
+ return 0;
+ }
+ if( cookie_count > 1 ) {
+ nfp_log( NFP_DBG1,
+ "nfp_alloc_pci_push: error:"
+ " ddi_dma_addr_bind_handle wants %d transfers",
+ cookie_count);
+ ddi_dma_mem_free(&pdev->acchandle);
+ (void) ddi_dma_unbind_handle( pdev->read_dma_handle );
+ ddi_dma_free_handle( &pdev->read_dma_handle );
+ return 0;
+ }
+ pdev->rd_dma_ok = 1;
+ }
+ return pdev->rd_dma_ok;
+}
+
+void nfp_free_pci_push( nfp_dev *pdev ) {
+ /* free resources allocated to PCI Push */
+ if( pdev->rd_dma_ok ) {
+ (void) ddi_dma_sync(pdev->read_dma_handle,0,0,DDI_DMA_SYNC_FORKERNEL);
+ ddi_dma_mem_free(&pdev->acchandle);
+ (void) ddi_dma_unbind_handle( pdev->read_dma_handle );
+ ddi_dma_free_handle( &pdev->read_dma_handle );
+ pdev->rd_dma_ok = 0;
+ }
+ if( pdev->read_buf ) {
+ kmem_free( pdev->read_buf, NFP_READBUF_SIZE );
+ pdev->read_buf = NULL;
+ }
+}
+
+/* include definition of nfp_set_ifvers() */
+#define nfp_ifvers NFDEV_IF_PCI_PUSH
+#include "nfp_ifvers.c"
+#undef nfp_ifvers
+
+/*--------------------*/
+/* nfp_isr */
+/*--------------------*/
+
+static u_int nfp_isr( char *pdev_in ) {
+ /* LINTED: alignment */
+ nfp_dev *pdev= (nfp_dev *)pdev_in;
+ nfp_err ne;
+ int handled;
+
+ nfp_log( NFP_DBG3, "nfp_isr: entered");
+
+ if( !pdev ) {
+ nfp_log( NFP_DBG1, "nfp_isr: cannot find dev");
+ return DDI_INTR_UNCLAIMED;
+ }
+
+ /* The isr needs to be mutex'ed - an SMP can call us while we're still
+ * running!
+ */
+ mutex_enter(&pdev->low_mutex);
+ ne= pdev->cmddev->isr( pdev->common.cmdctx, &handled );
+ mutex_exit(&pdev->low_mutex);
+
+ if( !ne && handled )
+ return DDI_INTR_CLAIMED;
+ if (ne)
+ nfp_log( NFP_DBG1, "nfp_isr: failed");
+ else
+ nfp_log( NFP_DBG3, "nfp_isr: unclaimed");
+ return DDI_INTR_UNCLAIMED;
+}
+
+static u_int nfp_soft_isr( char *pdev_in ) {
+ /* LINTED: alignment */
+ nfp_dev *pdev= (nfp_dev *)pdev_in;
+ int rd, wr;
+
+ nfp_log( NFP_DBG3, "nfp_soft_isr: entered");
+
+ if( !pdev ) {
+ nfp_log( NFP_DBG1, "nfp_soft_isr: cannot find dev");
+ return DDI_INTR_UNCLAIMED;
+ }
+ rd= wr= 0;
+
+ mutex_enter(&pdev->high_mutex);
+ if(pdev->high_read) {
+ pdev->high_read= 0;
+ mutex_exit(&pdev->high_mutex);
+ rd= 1;
+ }
+ if(pdev->high_write) {
+ pdev->high_write= 0;
+ wr= 1;
+ }
+ mutex_exit(&pdev->high_mutex);
+
+ if(rd) {
+ nfp_log( NFP_DBG3, "nfp_soft_isr: read done");
+ nfp_read_complete_final(pdev, pdev->rd_ok);
+ }
+ if(wr) {
+ nfp_log( NFP_DBG3, "nfp_soft_isr: write done");
+ nfp_write_complete_final(pdev, pdev->wr_ok);
+ }
+ if( rd || wr )
+ return DDI_INTR_CLAIMED;
+
+ nfp_log( NFP_DBG2, "nfp_isr: unclaimed");
+ return DDI_INTR_UNCLAIMED;
+}
+
+
+/*-------------------------*/
+/* nfp_read */
+/*-------------------------*/
+
+void nfp_read_complete(nfp_dev *pdev, int ok) {
+ nfp_log( NFP_DBG2,"nfp_read_complete: entering");
+
+ if(pdev->high_intr) {
+ nfp_log(NFP_DBG2, "nfp_read_complete: high_intr");
+ mutex_enter(&pdev->high_mutex);
+ nfp_log(NFP_DBG3, "nfp_read_complete: high_mutex entered");
+ if(pdev->high_read)
+ nfp_log(NFP_DBG1, "nfp_read_complete: high_read allread set!");
+ pdev->high_read= 1;
+ pdev->rd_ok= ok;
+ nfp_log(NFP_DBG3, "nfp_read_complete: exiting high_mutex");
+ mutex_exit(&pdev->high_mutex);
+ ddi_trigger_softintr(pdev->soft_int_id);
+ } else
+ nfp_read_complete_final( pdev, ok );
+ nfp_log( NFP_DBG2,"nfp_read_complete: exiting");
+}
+
+static void nfp_read_complete_final(nfp_dev *pdev, int ok) {
+ nfp_log( NFP_DBG2,"nfp_read_complete_final: entering");
+ if(pdev->rdtimeout)
+ (void) untimeout(pdev->rdtimeout);
+ if(!pdev->rd_outstanding) {
+ nfp_log( NFP_DBG1,"nfp_read_complete_final: !pdev->rd_outstanding");
+ }
+ nfp_log( NFP_DBG2,"nfp_read_complete_final: pdev->rd_outstanding=0, ok %d", ok);
+ mutex_enter(&pdev->isr_mutex);
+ pdev->rd_outstanding= 0;
+ pdev->rd_ready= 1;
+ pdev->rd_ok= ok;
+ cv_broadcast(&pdev->rd_cv);
+ mutex_exit(&pdev->isr_mutex);
+ pollwakeup (&pdev->pollhead, POLLRDNORM);
+ nfp_log( NFP_DBG2,"nfp_read_complete_final: exiting");
+}
+
+static void nfp_rdtimeout( void *pdev_in )
+{
+ nfp_dev *pdev= (nfp_dev *)pdev_in;
+
+ nfp_log( NFP_DBG1, "nfp_rdtimeout: read timed out");
+
+ if (!pdev) {
+ nfp_log( NFP_DBG1, "nfp_rdtimeout: NULL pdev." );
+ return;
+ }
+ pdev->rdtimeout= 0;
+ nfp_read_complete_final(pdev, 0);
+}
+
+/* ARGSUSED */
+static int nfp_read(dev_t dev, struct uio *uiop, cred_t *credp) {
+ int ret;
+ nfp_log( NFP_DBG2, "nfp_read: entered" );
+ if (ddi_get_soft_state(state_head, getminor(dev)) != NULL) {
+ nfp_log( NFP_DBG1, "nfp_read: unable to get nfp_dev");
+ return (ENODEV);
+ }
+ nfp_log( NFP_DBG2, "nfp_read: about to physio." );
+ ret = physio(nfp_strategy, (struct buf *)0, dev, B_READ, minphys, uiop );
+ if(ret)
+ nfp_log( NFP_DBG1, "nfp_read: physio returned %x.", ret );
+ return ret;
+}
+
+/*-------------------------*/
+/* nfp_write */
+/*-------------------------*/
+
+void nfp_write_complete( nfp_dev *pdev, int ok) {
+ nfp_log( NFP_DBG2,"nfp_write_complete: entering");
+
+ if(pdev->high_intr) {
+ mutex_enter(&pdev->high_mutex);
+ if(pdev->high_write)
+ nfp_log(NFP_DBG1, "nfp_write_complete: high_write allread set!");
+ pdev->high_write= 1;
+ pdev->wr_ok= ok;
+ mutex_exit(&pdev->high_mutex);
+ ddi_trigger_softintr(pdev->soft_int_id);
+ } else
+ nfp_write_complete_final( pdev, ok );
+ nfp_log( NFP_DBG2,"nfp_write_complete: exiting");
+}
+
+static void nfp_write_complete_final( nfp_dev *pdev, int ok) {
+ struct buf *local_wr_bp;
+ nfp_log( NFP_DBG2,"nfp_write_complete_final: entering");
+ if(pdev->wrtimeout)
+ (void) untimeout(pdev->wrtimeout);
+
+ if (!pdev->wr_bp) {
+ nfp_log( NFP_DBG2, "nfp_write_complete_final: write: wr_bp == NULL." );
+ return;
+ }
+
+ bp_mapout(pdev->wr_bp);
+ pdev->wr_bp->b_resid = ok ? 0 : pdev->wr_bp->b_bcount;
+ /* Make sure we set wr_ready before calling biodone to avoid a race */
+ pdev->wr_ready = 1;
+ bioerror(pdev->wr_bp, ok ? 0 : ENXIO);
+ local_wr_bp = pdev->wr_bp;
+ pdev->wr_bp = 0;
+ biodone(local_wr_bp);
+ nfp_log( NFP_DBG2, "nfp_write_complete_final: isr_mutex extited");
+ pollwakeup (&pdev->pollhead, POLLWRNORM);
+
+ nfp_log( NFP_DBG2, "nfp_write_complete_final: leaving");
+}
+
+static void nfp_wrtimeout( void *pdev_in )
+{
+ nfp_dev *pdev= (nfp_dev *)pdev_in;
+
+ nfp_log( NFP_DBG1, "nfp_wrtimeout: write timed out");
+
+ if (!pdev) {
+ nfp_log( NFP_DBG1, "nfp_wrtimeout: NULL pdev." );
+ return;
+ }
+ pdev->wrtimeout= 0;
+ nfp_write_complete_final(pdev, 0);
+}
+
+/* ARGSUSED */
+static int nfp_write(dev_t dev, struct uio *uiop, cred_t *credp) {
+ int ret;
+ nfp_log( NFP_DBG2, "nfp_write: entered." );
+ if (ddi_get_soft_state(state_head, getminor(dev)) == NULL) {
+ nfp_log( NFP_DBG1, "nfp_chread: unable to get nfp_dev.");
+ return (ENODEV);
+ }
+ nfp_log( NFP_DBG2, "nfp_write: about to physio." );
+ ret = physio(nfp_strategy, (struct buf *)0, dev, B_WRITE, minphys, uiop );
+ if(ret)
+ nfp_log( NFP_DBG1, "nfp_write: physio returned %x.", ret );
+ return ret;
+}
+
+/*-------------------------*/
+/* nfp_strategy */
+/*-------------------------*/
+
+#define NFP_STRAT_ERR(thebp,err,txt) \
+ nfp_log( NFP_DBG1, "nfp_strategy: " txt ".\n"); \
+ (thebp)->b_resid = (thebp)->b_bcount; \
+ bioerror ((thebp), err); \
+ biodone ((thebp));
+
+static int nfp_strategy(struct buf *bp) {
+ register struct nfp_dev *pdev;
+ nfp_err ne;
+
+ nfp_log( NFP_DBG2, "nfp_strategy: entered." );
+ if (!(pdev = ddi_get_soft_state(state_head, getminor(bp->b_edev)))) {
+ NFP_STRAT_ERR (bp, ENXIO, "unable to get nfp_dev");
+ return (0);
+ }
+
+ if (bp->b_flags & B_READ) {
+ int count;
+ /* read */
+ if (!pdev->rd_ready) {
+ NFP_STRAT_ERR (bp,ENXIO,"read called when not ready");
+ return (0);
+ }
+ pdev->rd_ready=0;
+ pdev->rd_pending = 0;
+ if( !pdev->rd_ok) {
+ NFP_STRAT_ERR (bp,ENXIO,"read failed");
+ return (0);
+ }
+ /* copy data from module */
+ if(pdev->ifvers >= NFDEV_IF_PCI_PUSH) {
+ nfp_log( NFP_DBG3, "nfp_strategy: copying kernel read buffer");
+ if( ddi_dma_sync(pdev->read_dma_handle,0,0,DDI_DMA_SYNC_FORKERNEL) != DDI_SUCCESS )
+ {
+ NFP_STRAT_ERR(bp,ENXIO,"ddi_dma_sync(read_dma_handle) failed");
+ return (0);
+ }
+ /* LINTED: alignment */
+ count= *(unsigned int *)(pdev->read_buf+4);
+ count= FROM_LE32_MEM(&count);
+ nfp_log( NFP_DBG3, "nfp_strategy: read count %d", count);
+ if(count<0 || count>bp->b_bcount) {
+ NFP_STRAT_ERR(bp,ENXIO,"bad read byte count from device");
+ nfp_log( NFP_DBG1, "nfp_strategy: bad read byte count (%d) from device", count);
+ return (0);
+ }
+ bp_mapin (bp);
+ bcopy( pdev->read_buf + 8, bp->b_un.b_addr, count );
+ bp_mapout (bp);
+ } else {
+ bp_mapin (bp);
+ ne= pdev->cmddev->read_block( bp->b_un.b_addr, bp->b_bcount, pdev->common.cmdctx, &count );
+ bp_mapout (bp);
+ if( ne != NFP_SUCCESS) {
+ NFP_STRAT_ERR (bp,nfp_oserr(ne),"read_block failed");
+ return (0);
+ }
+ }
+ bioerror(bp, 0);
+ bp->b_resid = 0;
+ biodone (bp);
+ } else {
+ /* write */
+ if (!pdev->wr_ready) {
+ NFP_STRAT_ERR (bp,ENXIO,"write called when not ready");
+ return (0);
+ }
+ if (pdev->wr_bp) {
+ NFP_STRAT_ERR (bp,ENXIO,"wr_bp != NULL");
+ return (0);
+ }
+ pdev->wrtimeout= timeout(nfp_wrtimeout, (caddr_t)pdev, NFP_TIMEOUT_SEC * drv_usectohz(1000000));
+ pdev->wr_bp = bp;
+ pdev->wr_ready = 0;
+ bp_mapin (bp);
+ ne= pdev->cmddev->write_block( bp->b_un.b_addr, bp->b_bcount, pdev->common.cmdctx);
+ if( ne != NFP_SUCCESS ) {
+ bp_mapout (bp);
+ (void) untimeout(pdev->wrtimeout);
+ pdev->wr_bp = 0;
+ pdev->wr_ready = 1;
+ NFP_STRAT_ERR (bp,nfp_oserr(ne),"write failed");
+ return (0);
+ }
+ }
+ nfp_log( NFP_DBG2, "nfp_strategy: leaving");
+
+ return (0);
+}
+
+
+/*--------------------*/
+/* poll / select */
+/*--------------------*/
+
+static int nfp_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
+ struct pollhead **phpp) {
+ nfp_dev *pdev;
+ short revents;
+
+ if (!(pdev = ddi_get_soft_state(state_head, getminor(dev)))) {
+ nfp_log( NFP_DBG1, "nfp_chpoll: unable to get nfp_dev");
+ *reventsp=0;
+ return (0);
+ }
+ nfp_log( NFP_DBG2, "nfp_chpoll: entered %x", events);
+
+ revents=0;
+ if (events&POLLWRNORM) {
+ if (pdev->wr_ready) {
+ nfp_log( NFP_DBG2, "nfp_chpoll: write ready");
+ revents|=POLLWRNORM;
+ }
+ }
+
+ if (events&POLLRDNORM) {
+ if (pdev->rd_ready) {
+ nfp_log( NFP_DBG2, "nfp_chpoll: read ready");
+ revents|=POLLRDNORM;
+ }
+ }
+
+ if (!revents && !anyyet) {
+ *phpp=&pdev->pollhead;
+ }
+ *reventsp=revents;
+
+ nfp_log( NFP_DBG2, "nfp_chpoll: leaving");
+ return (0);
+}
+
+
+/*--------------------*/
+/* ioctl */
+/*--------------------*/
+
+/* ARGSUSED */
+static int nfp_ioctl(dev_t dev, int cmd, ioctlptr_t arg, int mode, cred_t *credp, int *rvalp) {
+ register struct nfp_dev *pdev;
+
+ nfp_log( NFP_DBG2, "nfp_ioctl: entered." );
+
+ if (!(pdev = ddi_get_soft_state(state_head, getminor(dev)))) {
+ nfp_log( NFP_DBG1, "nfp_ioctl: unable to get nfp dev.");
+ return (ENXIO);
+ }
+
+ switch (cmd) {
+ case NFDEV_IOCTL_ENQUIRY:
+ {
+ long *outp;
+ int outlen;
+ nfdev_enquiry_str enq_data;
+
+ enq_data.busno = (unsigned int)-1;
+ enq_data.slotno = (unsigned char)-1;
+
+ /* get our bus and slot num */
+ if (ddi_getlongprop (DDI_DEV_T_NONE,
+ pdev->dip, 0, "reg",
+ (caddr_t)&outp, &outlen) != DDI_PROP_NOT_FOUND) {
+ nfp_log( NFP_DBG2, "ddi_getlongprop('reg') ok." );
+ if( outlen > 0 ) {
+ enq_data.busno = ((*outp)>>16) & 0xff;
+ enq_data.slotno = ((*outp)>>11) & 0x1f;
+ nfp_log( NFP_DBG2, "busno %d, slotno %d.",
+ enq_data.busno, enq_data.slotno );
+ }
+ } else
+ nfp_log( NFP_DBG1, "ddi_getlongprop('reg') failed." );
+
+ if( ddi_copyout( (char *)&enq_data, (void *)arg, sizeof(enq_data), mode ) != 0 ) {
+ nfp_log( NFP_DBG1, "ddi_copyout() failed." );
+ return EFAULT;
+ }
+ }
+ break;
+
+ case NFDEV_IOCTL_ENSUREREADING:
+ {
+ unsigned int addr, len;
+ nfp_err ret;
+ if( ddi_copyin( (void *)arg, (char *)&len, sizeof(unsigned int), mode ) != 0 ) {
+ nfp_log( NFP_DBG1, "ddi_copyin() failed." );
+ return (EFAULT);
+ }
+ /* signal a read to the module */
+ nfp_log( NFP_DBG2, "nfp_ioctl: signalling read request to module, len = %x.", len );
+ if (len>8192) {
+ nfp_log( NFP_DBG1, "nfp_ioctl: len >8192 = %x.", len );
+ return EINVAL;
+ }
+ if (pdev->rd_outstanding==1) {
+ nfp_log( NFP_DBG1, "nfp_ioctl: not about to call read with read outstanding.");
+ return EIO;
+ }
+
+ addr= 0;
+ if(pdev->ifvers >= NFDEV_IF_PCI_PUSH) {
+ if( len > NFP_READBUF_SIZE ) {
+ nfp_log( NFP_DBG1, "nfp_ioctl: len > NFP_READBUF_SIZE = %x.", len );
+ return EINVAL;
+ }
+ addr= pdev->read_dma_cookie.dmac_address;
+ }
+
+ pdev->rd_outstanding = 1;
+ nfp_log( NFP_DBG2,"nfp_ioctl: pdev->rd_outstanding=1");
+
+ /* setup timeout timer */
+ pdev->rdtimeout= timeout(nfp_rdtimeout, (caddr_t)pdev, NFP_TIMEOUT_SEC * drv_usectohz(1000000));
+
+ nfp_log( NFP_DBG2, "nfp_ioctl: read request");
+ ret = pdev->cmddev->ensure_reading(addr, len, pdev->common.cmdctx);
+ if ( ret != NFP_SUCCESS ) {
+ (void) untimeout(pdev->rdtimeout);
+ pdev->rdtimeout = 0;
+ pdev->rd_outstanding = 0;
+ nfp_log( NFP_DBG1, "nfp_ioctl : cmddev->ensure_reading failed ");
+ return nfp_oserr( ret );
+ }
+ }
+ break;
+
+ case NFDEV_IOCTL_PCI_IFVERS:
+ {
+ int vers;
+
+ nfp_log( NFP_DBG2, "nfp_ioctl: NFDEV_IOCTL_PCI_IFVERS");
+
+ if( ddi_copyin( (void *)arg, (char *)&vers, sizeof(vers), mode ) != 0 ) {
+ nfp_log( NFP_DBG1, "ddi_copyin() failed." );
+ return (EFAULT);
+ }
+
+ if( pdev->rd_outstanding ) {
+ nfp_log( NFP_DBG1, "nfp_ioctl: can't set ifvers %d as read outstanding", vers);
+ return EIO;
+ }
+
+ nfp_set_ifvers(pdev, vers);
+ if( pdev->ifvers != vers ) {
+ nfp_log( NFP_DBG1, "nfp_ioctl: can't set ifvers %d", vers);
+ return EIO;
+ }
+ }
+ break;
+
+ case NFDEV_IOCTL_STATS:
+ {
+ if( ddi_copyout( (char *)&(pdev->common.stats),
+ (void *)arg,
+ sizeof(nfdev_stats_str),
+ mode ) != 0 ) {
+ nfp_log( NFP_DBG1, "ddi_copyout() failed." );
+ return EFAULT;
+ }
+ }
+ break;
+
+ default:
+ nfp_log( NFP_DBG1, "nfp_ioctl: unknown ioctl." );
+ return EINVAL;
+ }
+
+ return 0;
+}
+
+/*-------------------------*/
+/* nfp_open */
+/*-------------------------*/
+
+/* ARGSUSED */
+int nfp_open(dev_t *dev, int openflags, int otyp, cred_t *credp)
+{
+ nfp_err ret;
+ register struct nfp_dev *pdev;
+
+ nfp_log( NFP_DBG2, "entered nfp_open." );
+
+ pdev = (nfp_dev *)ddi_get_soft_state(state_head, getminor(*dev));
+
+ if( !pdev ) {
+ nfp_log( NFP_DBG1, "nfp_open: unable to get nfp dev.");
+ return (ENODEV);
+ }
+
+ if( otyp != OTYP_CHR ) {
+ nfp_log( NFP_DBG1, "nfp_open: not opened as character device");
+ return (EINVAL);
+ }
+
+ mutex_enter(&pdev->busy_mutex);
+
+ if (pdev->busy) {
+ mutex_exit(&pdev->busy_mutex);
+ nfp_log( NFP_DBG1, "nfp_open: device busy");
+ return EBUSY;
+ }
+ pdev->busy= 1;
+ mutex_exit(&pdev->busy_mutex);
+
+ /* use oldest possible interface until told otherwise */
+ pdev->ifvers= NFDEV_IF_STANDARD;
+ nfp_log( NFP_DBG3, "nfp_open: setting ifvers %d", pdev->ifvers);
+ pdev->rd_ready= 0; /* drop any old data */
+
+ ret = pdev->cmddev->open(pdev->common.cmdctx);
+ if( ret != NFP_SUCCESS ) {
+ nfp_log( NFP_DBG1, "nfp_open : cmddev->open failed ");
+ return nfp_oserr( ret );
+ }
+
+ nfp_log( NFP_DBG2, "nfp_open: done");
+
+ return 0;
+}
+
+/*--------------------*/
+/* nfp_close */
+/*--------------------*/
+
+/* ARGSUSED */
+static int nfp_close(dev_t dev, int openflags, int otyp, cred_t *credp) {
+ nfp_dev *pdev;
+ nfp_err ret;
+
+ nfp_log( NFP_DBG2, "nfp_close: entered");
+
+ pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, getminor(dev));
+ if( !pdev ) {
+ nfp_log( NFP_DBG1, "nfp_close: cannot find dev.");
+ return ENODEV;
+ }
+
+ mutex_enter(&pdev->isr_mutex);
+ if(pdev->rd_outstanding) {
+ int lbolt, err;
+ nfp_get_lbolt(&lbolt, err);
+ if(!err)
+ (void) cv_timedwait(&pdev->rd_cv, &pdev->isr_mutex, lbolt + (NFP_TIMEOUT_SEC * drv_usectohz(1000000)) );
+ }
+ mutex_exit(&pdev->isr_mutex);
+ ret = pdev->cmddev->close(pdev->common.cmdctx);
+ if (ret != NFP_SUCCESS ) {
+ nfp_log( NFP_DBG1, " nfp_close : cmddev->close failed");
+ return nfp_oserr( ret );
+ }
+
+ mutex_enter(&pdev->busy_mutex);
+ pdev->busy= 0;
+ mutex_exit(&pdev->busy_mutex);
+
+ return 0;
+}
+
+/****************************************************************************
+
+ nfp driver config
+
+ ****************************************************************************/
+
+/*-------------------------*/
+/* nfp_getinfo */
+/*-------------------------*/
+
+/* ARGSUSED */
+static int nfp_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) {
+ int error;
+ nfp_dev *pdev;
+
+ nfp_log( NFP_DBG2, "nfp_getinfo: entered" );
+
+ pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, getminor((dev_t)arg));
+ if( !pdev ) {
+ nfp_log( NFP_DBG1, "nfp_close: cannot find dev.");
+ return ENODEV;
+ }
+
+ switch (infocmd) {
+ case DDI_INFO_DEVT2DEVINFO:
+ if (pdev == NULL) {
+ *result = NULL;
+ error = DDI_FAILURE;
+ } else {
+ /*
+ * don't need to use a MUTEX even though we are
+ * accessing our instance structure; dev->dip
+ * never changes.
+ */
+ *result = pdev->dip;
+ error = DDI_SUCCESS;
+ }
+ break;
+ case DDI_INFO_DEVT2INSTANCE:
+ *result = (void *)(uintptr_t)getminor((dev_t)arg);
+ error = DDI_SUCCESS;
+ break;
+ default:
+ *result = NULL;
+ error = DDI_FAILURE;
+ }
+
+ nfp_log( NFP_DBG2, "nfp_getinfo: leaving." );
+ return (error);
+}
+
+/*-------------------------*/
+/* nfp_release */
+/*-------------------------*/
+
+static int nfp_release_dev( dev_info_t *dip ) {
+ nfp_dev *pdev;
+ int instance, i;
+ nfp_err ret;
+
+ nfp_log( NFP_DBG2, "nfp_release_dev: entering" );
+
+ instance = ddi_get_instance(dip);
+ pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, instance);
+ if (pdev) {
+ nfp_log( NFP_DBG3, "nfp_release_dev: removing device" );
+
+ nfp_free_pci_push(pdev);
+
+ if( pdev->cmddev ) {
+ nfp_log( NFP_DBG3, "nfp_release_dev: destroying cmd dev" );
+ ret = pdev->cmddev->destroy(pdev->common.cmdctx);
+ if (ret != NFP_SUCCESS) {
+ nfp_log( NFP_DBG1, " nfp_release_dev : cmddev->destroy failed ");
+ return nfp_oserr( ret );
+ }
+ }
+
+ if(pdev->high_iblock_cookie) {
+ nfp_log( NFP_DBG3, "nfp_release_dev: removing high and soft irq" );
+ ddi_remove_softintr(pdev->soft_int_id);
+ ddi_remove_intr(pdev->dip, 0, pdev->high_iblock_cookie);
+ mutex_destroy( &pdev->busy_mutex );
+ cv_destroy( &pdev->rd_cv );
+ mutex_destroy( &pdev->isr_mutex );
+ mutex_destroy( &pdev->high_mutex );
+ } else if(pdev->iblock_cookie) {
+ nfp_log( NFP_DBG3, "nfp_release_dev: removing irq" );
+ ddi_remove_intr(pdev->dip, 0, pdev->iblock_cookie);
+ mutex_destroy( &pdev->busy_mutex );
+ cv_destroy( &pdev->rd_cv );
+ mutex_destroy( &pdev->isr_mutex );
+ }
+ if(pdev->low_iblock_cookie) {
+ ddi_remove_intr(pdev->dip, 0, pdev->low_iblock_cookie);
+ mutex_destroy( &pdev->low_mutex);
+ }
+
+ for(i=0;i<6;i++) {
+ if( pdev->common.extra[i] ) {
+ nfp_log( NFP_DBG3, "nfp_release_dev: unmapping BAR %d", i );
+ ddi_regs_map_free ((ddi_acc_handle_t *)&pdev->common.extra[i]);
+ }
+ }
+
+ ddi_remove_minor_node(dip, NULL);
+
+ if (pdev->conf_handle)
+ pci_config_teardown( &pdev->conf_handle );
+
+ ddi_soft_state_free(state_head, instance);
+ }
+ nfp_log( NFP_DBG2, "nfp_release: finished" );
+
+ return DDI_SUCCESS;
+}
+
+
+/*-------------------------*/
+/* nfp_attach */
+/*-------------------------*/
+
+static int nfp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) {
+ int instance;
+ nfp_dev *pdev = NULL;
+ int intres;
+ uint16_t device, vendor, sub_device, sub_vendor;
+ long *outp;
+ nfpcmd_dev const *cmddev;
+ int index, i;
+ nfp_err ret;
+
+ nfp_log( NFP_DBG2, "nfp_attach: entered." );
+
+ if (cmd != DDI_ATTACH) {
+ nfp_log( NFP_DBG1, "nfp_attach: bad command." );
+ goto bailout;
+ }
+
+ instance = ddi_get_instance(dip);
+
+ if (ddi_soft_state_zalloc(state_head, instance) != 0) {
+ nfp_log( NFP_DBG1, "nfp_attach: ddi_soft_state_zalloc() failed." );
+ goto bailout;
+ }
+
+ pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, instance);
+ if( !pdev ) {
+ nfp_log( NFP_DBG1, "nfp_attach: cannot find dev.");
+ return ENODEV;
+ }
+ pdev->dip = dip;
+
+ /* map in pci config registers */
+ if (pci_config_setup(dip, &pdev->conf_handle)) {
+ nfp_log( NFP_DBG1, "nfp_attach: pci_config_setup() failed." );
+ goto bailout;
+ }
+
+ /* find out what we have got */
+ vendor= PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_VENID );
+ device = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_DEVID );
+ sub_vendor = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_SUBVENID );
+ sub_device = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_SUBSYSID );
+
+ index= 0;
+ while( (cmddev = nfp_drvlist[index++]) != NULL ) {
+ if( cmddev->vendorid == vendor &&
+ cmddev->deviceid == device &&
+ cmddev->sub_vendorid == sub_vendor &&
+ cmddev->sub_deviceid == sub_device )
+ break;
+ }
+ if( !cmddev ) {
+ nfp_log( NFP_DBG1, "nfp_attach: unknonw device." );
+ goto bailout;
+ }
+
+ /* map BARs */
+ for( i=0; i<6; i++ ) {
+ if( cmddev->bar_sizes[i] ) {
+ off_t size;
+ if( ddi_dev_regsize(dip, i+1, &size) != DDI_SUCCESS) {
+ nfp_log( NFP_DBG1, "nfp_attach: ddi_dev_regsize() failed for BAR %d", i );
+ goto bailout;
+ }
+ if( size < (cmddev->bar_sizes[i] & ~NFP_MEMBAR_MASK) ) {
+ nfp_log( NFP_DBG1, "nfp_attach: BAR %d too small %x (%x)", i, size, (cmddev->bar_sizes[i] & ~0xF) );
+ goto bailout;
+ }
+ if (ddi_regs_map_setup(dip, i+1, (caddr_t *)&pdev->common.bar[i],
+ 0, cmddev->bar_sizes[i] & ~NFP_MEMBAR_MASK, &nosw_attr, (ddi_acc_handle_t *)&pdev->common.extra[i] )) {
+ nfp_log( NFP_DBG1, "nfp_attach: ddi_regs_map_setup() failed for BAR %d", i );
+ goto bailout;
+ }
+ nfp_log( NFP_DBG3, "nfp_attach: BAR[%d] mapped to %x (%x)", i, pdev->common.bar[i], size );
+ }
+ }
+
+ pdev->read_buf = NULL;
+ pdev->rd_dma_ok = 0;
+
+ /* attach to minor node */
+ if (ddi_create_minor_node(dip, "nfp", S_IFCHR, instance, (char *)cmddev->name, 0) == DDI_FAILURE) {
+ ddi_remove_minor_node(dip, NULL);
+ nfp_log( NFP_DBG1, "nfp_attach: ddi_create_minor_node() failed." );
+ goto bailout;
+ }
+
+ pdev->wr_ready = 1;
+ pdev->rd_ready = 0;
+ pdev->rd_pending = 0;
+ pdev->rd_outstanding = 0;
+ pdev->busy=0;
+ pdev->cmddev= cmddev;
+
+ ret = pdev->cmddev->create(&pdev->common);
+ if( ret != NFP_SUCCESS) {
+ nfp_log( NFP_DBG1, "nfp_attach: failed to create command device");
+ goto bailout;
+ }
+ pdev->common.dev= pdev;
+
+ if (ddi_intr_hilevel(dip, 0) != 0){
+ nfp_log( NFP_DBG2, "nfp_attach: high-level interrupt");
+ if( ddi_get_iblock_cookie(dip, 0, &pdev->high_iblock_cookie) ) {
+ nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(high) failed." );
+ goto bailout;
+ }
+ if( ddi_get_iblock_cookie(dip, 0, &pdev->low_iblock_cookie) ) {
+ nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(low) failed." );
+ goto bailout;
+ }
+ mutex_init(&pdev->high_mutex, NULL, MUTEX_DRIVER,
+ (void *)pdev->high_iblock_cookie);
+ mutex_init(&pdev->low_mutex, NULL, MUTEX_DRIVER,
+ (void *)pdev->low_iblock_cookie);
+ if (ddi_add_intr(dip, 0, NULL,
+ NULL, nfp_isr,
+ (caddr_t)pdev) != DDI_SUCCESS) {
+ nfp_log( NFP_DBG1, "nfp_attach: ddi_add_intr(high) failed." );
+ goto bailout;
+ }
+ if( ddi_get_soft_iblock_cookie(dip, DDI_SOFTINT_HIGH,
+ &pdev->iblock_cookie) ) {
+ nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(soft) failed." );
+ goto bailout;
+ }
+ mutex_init(&pdev->isr_mutex, NULL, MUTEX_DRIVER,
+ (void *)pdev->iblock_cookie);
+ if (ddi_add_softintr(dip, DDI_SOFTINT_HIGH, &pdev->soft_int_id,
+ &pdev->iblock_cookie, NULL,
+ nfp_soft_isr, (caddr_t)pdev) != DDI_SUCCESS)
+ goto bailout;
+ pdev->high_intr= 1;
+ } else {
+ nfp_log( NFP_DBG2, "nfp_attach: low-level interrupt");
+
+ if (ddi_get_iblock_cookie (dip, 0, &pdev->iblock_cookie)) {
+ nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie() failed." );
+ goto bailout;
+ }
+
+ mutex_init(&pdev->isr_mutex, "nfp isr mutex", MUTEX_DRIVER, (void *)pdev->iblock_cookie);
+
+ if (ddi_add_intr(dip, 0, NULL,
+ (ddi_idevice_cookie_t *)NULL, nfp_isr,
+ (caddr_t)pdev) != DDI_SUCCESS) {
+ nfp_log( NFP_DBG1, "nfp_attach: ddi_add_intr() failed." );
+ goto bailout;
+ }
+ }
+ mutex_init(&pdev->busy_mutex, "nfp busy mutex", MUTEX_DRIVER, NULL );
+ cv_init(&pdev->rd_cv, "nfp read condvar", CV_DRIVER, NULL );
+
+ /* get our bus and slot num */
+ if (ddi_getlongprop (DDI_DEV_T_NONE,
+ pdev->dip, 0, "reg",
+ (caddr_t)&outp, &intres) != DDI_PROP_NOT_FOUND) {
+ nfp_log( NFP_DBG2, "nfp_attach: ddi_getlongprop('reg') ok." );
+ if( intres > 0 ) {
+ nfp_log( NFP_DBG1, "nfp_attach: found PCI nfast bus %x slot %x.",
+ ((*outp)>>16) & 0xff, ((*outp)>>11) & 0x1f );
+ }
+ }
+
+ nfp_log( NFP_DBG2, "nfp_attach: attach succeeded." );
+ return DDI_SUCCESS;
+
+bailout:
+ (void) nfp_release_dev( dip );
+
+ return DDI_FAILURE;
+}
+
+/*-------------------------*/
+/* nfp_detach */
+/*-------------------------*/
+
+/*
+ * When our driver is unloaded, nfp_detach cleans up and frees the resources
+ * we allocated in nfp_attach.
+ */
+static int nfp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) {
+ if (cmd != DDI_DETACH)
+ return (DDI_FAILURE);
+
+ (void) nfp_release_dev(dip);
+
+ return (DDI_SUCCESS);
+}
+
+/*-------------------------*/
+/* _init */
+/*-------------------------*/
+
+int _init(void) {
+ register int error;
+
+ nfp_log( NFP_DBG2, "_init: entered" );
+
+ if ((error = ddi_soft_state_init(&state_head, sizeof (struct nfp_dev), 1)) != 0) {
+ nfp_log( NFP_DBG1, "_init: soft_state_init() failed" );
+ return (error);
+ }
+
+ if ((error = mod_install(&modlinkage)) != 0) {
+ nfp_log( NFP_DBG1, "_init: mod_install() failed" );
+ ddi_soft_state_fini(&state_head);
+ }
+
+ nfp_log( NFP_DBG2, "_init: leaving" );
+ return (error);
+}
+
+/*-------------------------*/
+/* _info */
+/*-------------------------*/
+
+int _info(struct modinfo *modinfop) {
+ nfp_log( NFP_DBG2, "_info: entered" );
+
+ return (mod_info(&modlinkage, modinfop));
+}
+
+/*-------------------------*/
+/* _fini */
+/*-------------------------*/
+
+int _fini(void) {
+ int status;
+
+ nfp_log( NFP_DBG2, "_fini: entered" );
+
+ if ((status = mod_remove(&modlinkage)) != 0) {
+ nfp_log( NFP_DBG2, "_fini: mod_remove() failed." );
+ return (status);
+ }
+
+ ddi_soft_state_fini(&state_head);
+
+ nfp_log( NFP_DBG2, "_fini: leaving" );
+
+ return (status);
+}
+
diff --git a/usr/src/uts/common/io/nfp/i21285.c b/usr/src/uts/common/io/nfp/i21285.c
new file mode 100644
index 0000000000..f51a09188d
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/i21285.c
@@ -0,0 +1,310 @@
+/*
+
+i21285.c: nCipher PCI HSM intel/digital 21285 command driver
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+
+history
+
+09/10/2001 jsh Original
+
+*/
+
+#include "nfp_common.h"
+#include "nfp_error.h"
+#include "nfp_hostif.h"
+#include "nfp_osif.h"
+#include "i21285.h"
+#include "nfp_cmd.h"
+#include "nfpci.h"
+
+/* create ------------------------------------------------------- */
+
+static nfp_err i21285_create( nfp_cdev *pdev ) {
+ unsigned int tmp32;
+
+ nfp_log( NFP_DBG2, "i21285_create: entered");
+ pdev->cmdctx= pdev; /* set our context to just be a pointer to our nfp_cdev */
+
+ nfp_log( NFP_DBG2, "i21285_create: enable doorbell");
+ if(!pdev->bar[ IOBAR ]) {
+ nfp_log( NFP_DBG1, "i21285_create: null BAR[%d]", IOBAR );
+ return NFP_ENOMEM;
+ }
+ TO_LE32_IO( &tmp32, DOORBELL_ENABLE | POSTLIST_ENABLE);
+ nfp_outl( pdev, IOBAR, I21285_OFFSET_INTERRUPT_MASK, tmp32 );
+
+ return NFP_SUCCESS;
+}
+
+/* stop ------------------------------------------------------- */
+
+static nfp_err i21285_destroy( void * ctx ) {
+ nfp_cdev *pdev;
+ unsigned int tmp32;
+
+ nfp_log( NFP_DBG2, "i21285_destroy: entered");
+
+ pdev= (nfp_cdev *)ctx;
+ if(!pdev) {
+ nfp_log( NFP_DBG1, "i21285_destroy: NULL pdev");
+ return NFP_ENODEV;
+ }
+ if(!pdev->bar[ IOBAR ]) {
+ nfp_log( NFP_DBG1, "i21285_destroy: null BAR[%d]", IOBAR );
+ return NFP_ENOMEM;
+ }
+ TO_LE32_IO( &tmp32, DOORBELL_DISABLE | POSTLIST_DISABLE );
+ nfp_outl( pdev, IOBAR, I21285_OFFSET_INTERRUPT_MASK, tmp32 );
+
+ return NFP_SUCCESS;
+}
+
+/* open ------------------------------------------------------- */
+
+/* ARGSUSED */
+static nfp_err i21285_open( void * ctx ) {
+ nfp_log( NFP_DBG2, "i21285_open: entered");
+
+ return NFP_SUCCESS;
+}
+
+/* close ------------------------------------------------------- */
+
+/* ARGSUSED */
+static nfp_err i21285_close( void * ctx ) {
+ nfp_log( NFP_DBG2, "i21285_close: entered");
+
+ return NFP_SUCCESS;
+}
+
+/* isr ------------------------------------------------------- */
+
+static nfp_err i21285_isr( void *ctx, int *handled ) {
+ nfp_cdev *pdev;
+ unsigned int doorbell;
+ unsigned int tmp32;
+
+ nfp_log( NFP_DBG3, "i21285_isr: entered");
+
+ *handled= 0;
+ pdev= (nfp_cdev *)ctx;
+ if(!pdev) {
+ nfp_log( NFP_DBG1, "i21285_isr: NULL pdev");
+ return NFP_ENODEV;
+ }
+
+ doorbell= nfp_inl( pdev, IOBAR, I21285_OFFSET_DOORBELL);
+ doorbell= FROM_LE32_IO(&doorbell) & 0xffff;
+ while( doorbell && doorbell != 0xffff) {
+ *handled= 1;
+ /* service interrupts */
+ if( doorbell & (NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) {
+ TO_LE32_IO( &tmp32, NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED);
+ nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 );
+
+ nfp_log(NFP_DBG2, "i21285_isr: write done interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 );
+
+ nfp_write_complete(pdev->dev, doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 );
+ }
+
+ if( doorbell & (NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED)) {
+ TO_LE32_IO( &tmp32, NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED );
+ nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 );
+
+ nfp_log(NFP_DBG2, "i21285_isr: read ack interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0 );
+ nfp_read_complete( pdev->dev, doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0);
+ }
+
+ if( doorbell & ~(NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED |
+ NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) {
+ nfp_log( NFP_DBG1, "i21285_isr: unexpected interrupt %x", doorbell );
+ TO_LE32_IO( &tmp32, 0xffff & doorbell );
+ nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 );
+ }
+ doorbell= nfp_inl( pdev, IOBAR, I21285_OFFSET_DOORBELL);
+ doorbell= FROM_LE32_IO(&doorbell) & 0xffff;
+ }
+ return 0;
+}
+
+/* write ------------------------------------------------------- */
+
+static nfp_err i21285_write( const char *block, int len, void *ctx ) {
+ nfp_cdev *cdev;
+ unsigned int hdr[2];
+ nfp_err ne;
+ unsigned int tmp32;
+
+ nfp_log( NFP_DBG2, "i21285_write: entered");
+
+ cdev= (nfp_cdev *)ctx;
+ if(!cdev) {
+ nfp_log( NFP_DBG1, "i21285_write: NULL pdev");
+ return NFP_ENODEV;
+ }
+
+ nfp_log(NFP_DBG2, "i21285_write: pdev->bar[ MEMBAR ]= %x\n", cdev->bar[ MEMBAR ]);
+ nfp_log(NFP_DBG2, "i21285_write: pdev->bar[ IOBAR ]= %x\n", cdev->bar[ IOBAR ]);
+ if(!cdev->bar[ MEMBAR ]) {
+ nfp_log( NFP_DBG1, "i21285_write: null BAR[%d]", MEMBAR );
+ return NFP_ENOMEM;
+ }
+ ne= nfp_copy_from_user_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_DATA, block, len);
+ if (ne) {
+ nfp_log( NFP_DBG1, "i21285_write: nfp_copy_from_user_to_dev failed");
+ return ne;
+ }
+ TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL);
+ TO_LE32_MEM(&hdr[1], len);
+
+ ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_CONTROL, (const char *)hdr, 8);
+ if (ne) {
+ nfp_log( NFP_DBG1, "i21285_write: nfp_copy_to_dev failed");
+ return ne;
+ }
+
+ ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_WR_LENGTH, (char *)hdr, 4);
+ if (ne) {
+ nfp_log( NFP_DBG1, "i21285_write: nfp_copy_from_dev failed");
+ return ne;
+ }
+
+ TO_LE32_MEM( &tmp32, len );
+ if ( hdr[0] != tmp32 ) {
+ nfp_log( NFP_DBG1, "i21285_write: length not written");
+ return NFP_EIO;
+ }
+
+ TO_LE32_IO( &tmp32, NFAST_INT_HOST_WRITE_REQUEST);
+
+ nfp_outl( cdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 );
+
+ nfp_log( NFP_DBG2, "i21285_write: done");
+ return NFP_SUCCESS;
+}
+
+/* read ------------------------------------------------------- */
+
+static nfp_err i21285_read( char *block, int len, void *ctx, int *rcount) {
+ nfp_cdev *cdev;
+ nfp_err ne;
+ int count;
+
+ nfp_log( NFP_DBG2, "i21285_read: entered, len %d", len);
+ *rcount= 0;
+
+ cdev= (nfp_cdev *)ctx;
+ if(!cdev) {
+ nfp_log( NFP_DBG1, "i21285_read: NULL pdev");
+ return NFP_ENODEV;
+ }
+
+ if(!cdev->bar[ MEMBAR ]) {
+ nfp_log( NFP_DBG1, "i21285_read: null BAR[%d]", MEMBAR );
+ return NFP_ENOMEM;
+ }
+ ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)&count, 4);
+ if(ne) {
+ nfp_log( NFP_DBG1, "i21285_read: nfp_copy_from_dev failed.");
+ return ne;
+ }
+ count= FROM_LE32_MEM(&count);
+ if(count<0 || count>len) {
+ nfp_log( NFP_DBG1, "i21285_read: bad byte count (%d) from device", count);
+ return NFP_EIO;
+ }
+ ne= nfp_copy_to_user_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_DATA, block, count);
+ if( ne ) {
+ nfp_log( NFP_DBG1, "i21285_read: nfp_copy_to_user_from_dev failed.");
+ return ne;
+ }
+ nfp_log( NFP_DBG2, "i21285_read: done");
+ *rcount= count;
+ return NFP_SUCCESS;
+}
+
+/* chupdate ------------------------------------------------------- */
+
+/* ARGSUSED */
+static nfp_err i21285_chupdate( char *data, int len, void *ctx ) {
+ nfp_log( NFP_DBG1, "i21285_chupdate: NYI");
+ return NFP_SUCCESS;
+}
+
+/* ensure reading -------------------------------------------------- */
+
+static nfp_err i21285_ensure_reading( unsigned int addr, int len, void *ctx ) {
+ nfp_cdev *cdev;
+ unsigned int hdr[2];
+ unsigned int tmp32;
+ nfp_err ne;
+
+ nfp_log( NFP_DBG2, "i21285_ensure_reading: entered");
+
+ if(addr) {
+ nfp_log( NFP_DBG2, "i21285_ensure_reading: bad addr");
+ return -NFP_EINVAL;
+ }
+
+ cdev= (nfp_cdev *)ctx;
+ if(!cdev) {
+ nfp_log( NFP_DBG1, "i21285_ensure_reading: NULL pdev");
+ return NFP_ENODEV;
+ }
+
+ if(!cdev->bar[ MEMBAR ]) {
+ nfp_log( NFP_DBG1, "i21285_ensure_reading: null BAR[%d]", MEMBAR );
+ return NFP_ENXIO;
+ }
+ nfp_log( NFP_DBG3, "i21285_ensure_reading: pdev->bar[ MEMBAR ]= %x", cdev->bar[ MEMBAR ]);
+ nfp_log( NFP_DBG3, "i21285_ensure_reading: pdev->bar[ IOBAR ]= %x", cdev->bar[ IOBAR ]);
+ TO_LE32_MEM( &hdr[0], NFPCI_JOB_CONTROL);
+ TO_LE32_MEM( &hdr[1], len);
+ ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_RD_CONTROL, (const char *)hdr, 8);
+ if (ne) {
+ nfp_log( NFP_DBG1, "i21285_ensure_reading: nfp_copy_to_dev failed");
+ return ne;
+ }
+ ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)hdr, 4);
+ if (ne) {
+ nfp_log( NFP_DBG1, "i21285_ensure_reading: nfp_copy_from_dev failed");
+ return ne;
+ }
+ TO_LE32_MEM( &tmp32, len );
+ if ( hdr[0] != tmp32 ) {
+ nfp_log( NFP_DBG1, "i21285_ensure_reading: len not written");
+ return NFP_EIO;
+ };
+ TO_LE32_IO( &tmp32, NFAST_INT_HOST_READ_REQUEST );
+ nfp_outl( cdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 );
+
+ return NFP_SUCCESS;
+}
+
+/* command device structure ------------------------------------- */
+
+
+const nfpcmd_dev i21285_cmddev = {
+ "nCipher Gen 1 PCI",
+ PCI_VENDOR_ID_DEC, PCI_DEVICE_ID_DEC_21285,
+ PCI_VENDOR_ID_NCIPHER, PCI_DEVICE_ID_NFAST_GEN1,
+ { 0, IOSIZE | PCI_BASE_ADDRESS_SPACE_IO, NFPCI_RAM_MINSIZE, 0, 0, 0 },
+ NFP_CMD_FLG_NEED_IOBUF,
+ i21285_create,
+ i21285_destroy,
+ i21285_open,
+ i21285_close,
+ i21285_isr,
+ i21285_write,
+ i21285_read,
+ i21285_chupdate,
+ i21285_ensure_reading,
+ 0, /* no debug */
+};
+
diff --git a/usr/src/uts/common/io/nfp/i21285.h b/usr/src/uts/common/io/nfp/i21285.h
new file mode 100644
index 0000000000..4ea1d853ec
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/i21285.h
@@ -0,0 +1,43 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+
+#ifndef NFP_I21285_H
+#define NFP_I21285_H
+
+#ifndef PCI_VENDOR_ID_DEC
+#define PCI_VENDOR_ID_DEC 0x1011
+#endif
+#ifndef PCI_DEVICE_ID_DEC_21285
+#define PCI_DEVICE_ID_DEC_21285 0x1065
+#endif
+#ifndef PCI_VENDOR_ID_NCIPHER
+#define PCI_VENDOR_ID_NCIPHER 0x0100
+#endif
+
+#ifndef PCI_DEVICE_ID_NFAST_GEN1
+#define PCI_DEVICE_ID_NFAST_GEN1 0x0100
+#endif
+
+#define I21285_OFFSET_DOORBELL 0x60
+#define I21285_OFFSET_INTERRUPT_MASK 0x34
+
+#define DOORBELL_ENABLE 0x0
+#define DOORBELL_DISABLE 0x4
+
+#define POSTLIST_ENABLE 0x0
+#define POSTLIST_DISABLE 0x8
+
+#define IOBAR 1
+#define MEMBAR 2
+
+#define IOSIZE 0x80
+#define MEMSIZE 0x100000
+
+#endif
diff --git a/usr/src/uts/common/io/nfp/i21555.c b/usr/src/uts/common/io/nfp/i21555.c
new file mode 100644
index 0000000000..82024dc800
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/i21555.c
@@ -0,0 +1,423 @@
+/*
+
+i21555.c: nCipher PCI HSM intel 21555 command driver
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+history
+
+09/10/2001 jsh Original
+
+*/
+
+#include "nfp_common.h"
+#include "nfp_error.h"
+#include "nfp_hostif.h"
+#include "nfp_osif.h"
+#include "i21555.h"
+#include "nfp_cmd.h"
+#include "nfpci.h"
+
+/* started ------------------------------------------------------
+ *
+ * Check that device is ready to talk, by checking that
+ * the i21555 has master enabled on its secondary interface
+ */
+
+static nfp_err i21555_started( nfp_cdev *pdev ) {
+ unsigned int tmp32;
+#ifdef CONFIGSPACE_DEBUG
+ unsigned int reg32[64];
+ int i;
+#endif
+ nfp_err ne;
+
+ nfp_log( NFP_DBG2, "i21555_started: entered");
+
+#ifdef CONFIGSPACE_DEBUG
+ /* Suck up all the registers */
+ for (i=0; i < 64; i++) {
+ ne = nfp_config_inl( pdev, i*4, &reg32[i] );
+ }
+
+ for (i=0; i < 16; i++) {
+ int j = i * 4;
+ nfp_log( NFP_DBG3, "i21555 config reg %2x: %08x %08x %08x %08x", j*4,
+ reg32[j], reg32[j+1], reg32[j+2], reg32[j+3]);
+ }
+#endif
+
+ ne = nfp_config_inl( pdev, I21555_CFG_SEC_CMD_STATUS, &tmp32 );
+ if (ne) {
+ /* succeed if PCI config reads are not implemented */
+ if (ne == NFP_EUNKNOWN)
+ return NFP_SUCCESS;
+ nfp_log( NFP_DBG1, "i21555_started: nfp_config_inl failed");
+ return ne;
+ }
+
+ tmp32= FROM_LE32_IO(&tmp32) & 0xffff;
+
+ if ( tmp32 & CFG_CMD_MASTER ) {
+ nfp_log( NFP_DBG3, "i21555_started: Yes %x", tmp32);
+ return NFP_SUCCESS;
+ } else {
+ nfp_log( NFP_DBG1, "i21555_started: device not started yet %x", tmp32);
+ return NFP_ESTARTING;
+ }
+}
+
+/* create ------------------------------------------------------- */
+
+static nfp_err i21555_create( nfp_cdev *pdev ) {
+ unsigned int tmp32;
+
+ nfp_log( NFP_DBG2, "i21555_create: entered");
+ pdev->cmdctx= pdev; /* set our context to just be a pointer to our nfp_cdev */
+
+ if(!pdev->bar[ IOBAR ]) {
+ nfp_log( NFP_DBG1, "i21555_create: null BAR[%d]", IOBAR );
+ return NFP_ENOMEM;
+ }
+ nfp_log( NFP_DBG2, "i21555_create: enable doorbell");
+ TO_LE32_IO( &tmp32, I21555_DOORBELL_PRI_ENABLE );
+ nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET_MASK, tmp32 );
+ nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK, tmp32 );
+ return NFP_SUCCESS;
+}
+
+/* stop ------------------------------------------------------- */
+
+static nfp_err i21555_destroy( void * ctx ) {
+ nfp_cdev *pdev;
+ unsigned int tmp32;
+
+ nfp_log( NFP_DBG2, "i21555_destroy: entered");
+
+ pdev= (nfp_cdev *)ctx;
+ if(!pdev) {
+ nfp_log( NFP_DBG1, "i21555_destroy: NULL pdev");
+ return NFP_ENODEV;
+ }
+ if(!pdev->bar[ IOBAR ]) {
+ nfp_log( NFP_DBG1, "i21555_destroy: null BAR[%d]", IOBAR );
+ return NFP_ENOMEM;
+ }
+ TO_LE32_IO( &tmp32, I21555_DOORBELL_PRI_DISABLE );
+ nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET_MASK, tmp32 );
+ nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK, tmp32 );
+
+ return NFP_SUCCESS;
+}
+
+/* open ------------------------------------------------------- */
+
+/* ARGSUSED */
+static nfp_err i21555_open( void * ctx ) {
+
+ nfp_log( NFP_DBG2, "i21555_open: entered");
+
+ return NFP_SUCCESS;
+}
+
+/* close ------------------------------------------------------- */
+
+/* ARGSUSED */
+static nfp_err i21555_close( void * ctx ) {
+ nfp_log( NFP_DBG2, "i21555_close: entered");
+
+ return NFP_SUCCESS;
+}
+
+/* isr ------------------------------------------------------- */
+
+static nfp_err i21555_isr( void *ctx, int *handled ) {
+ nfp_cdev *pdev;
+ nfp_err ne;
+ unsigned short doorbell;
+ unsigned short tmp16;
+
+ nfp_log( NFP_DBG3, "i21555_isr: entered");
+
+ *handled= 0;
+ pdev= (nfp_cdev *)ctx;
+ if(!pdev) {
+ nfp_log( NFP_DBG1, "i21555_isr: NULL pdev");
+ return NFP_ENODEV;
+ }
+
+ pdev->stats.isr++;
+
+ if(!pdev->bar[ IOBAR ]) {
+ nfp_log( NFP_DBG1, "i21555_isr: null BAR[%d]", IOBAR );
+ return NFP_ENOMEM;
+ }
+
+ /* This interrupt may not be from our module, so check that it actually is
+ * us before handling it.
+ */
+ ne = i21555_started( pdev );
+ if (ne) {
+ if (ne != NFP_ESTARTING) {
+ nfp_log( NFP_DBG1, "i21555_isr: i21555_started failed");
+ }
+ return ne;
+ }
+
+ doorbell= nfp_inw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET);
+ doorbell= FROM_LE16_IO(&doorbell);
+ while( doorbell && doorbell != 0xffff) {
+ *handled= 1;
+ /* service interrupts */
+ if( doorbell & (NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) {
+ pdev->stats.isr_write++;
+ TO_LE16_IO(&tmp16,NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED);
+ nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 );
+
+ nfp_log( NFP_DBG2, "i21555_isr: write done interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 );
+
+ nfp_write_complete(pdev->dev, doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 );
+ }
+
+ if( doorbell & (NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED)) {
+ pdev->stats.isr_read++;
+ TO_LE16_IO(&tmp16,NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED);
+ nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 );
+
+ nfp_log( NFP_DBG2, "i21555_isr: read ack interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0 );
+ nfp_read_complete( pdev->dev, doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0);
+ }
+
+ if( doorbell & ~(NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED |
+ NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) {
+ TO_LE16_IO(&tmp16,doorbell);
+ nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 );
+ nfp_log( NFP_DBG1, "i21555_isr: unexpected interrupt %x", doorbell );
+ }
+ doorbell= nfp_inw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET);
+ doorbell= FROM_LE16_IO(&doorbell);
+ }
+ nfp_log( NFP_DBG3, "i21555_isr: exiting");
+ return 0;
+}
+
+/* write ------------------------------------------------------- */
+
+static nfp_err i21555_write( const char *block, int len, void *ctx) {
+ nfp_cdev *cdev;
+ unsigned int hdr[2];
+ nfp_err ne;
+ unsigned short tmp16;
+ unsigned int tmp32;
+
+ nfp_log( NFP_DBG2, "i21555_write: entered");
+
+ cdev= (nfp_cdev *)ctx;
+ if(!cdev) {
+ nfp_log( NFP_DBG1, "i21555_write: NULL cdev");
+ return NFP_ENODEV;
+ }
+
+ cdev->stats.write_fail++;
+
+ if(!cdev->bar[ IOBAR ]) {
+ nfp_log( NFP_DBG1, "i21555_write: null BAR[%d]", IOBAR );
+ return NFP_ENOMEM;
+ }
+
+ ne = i21555_started( cdev );
+ if (ne) {
+ if (ne != NFP_ESTARTING) {
+ nfp_log( NFP_DBG1, "i21555_write: i21555_started failed");
+ }
+ return ne;
+ }
+
+ nfp_log( NFP_DBG3, "i21555_write: cdev->bar[ MEMBAR ]= %x", cdev->bar[ MEMBAR ]);
+ nfp_log( NFP_DBG3, "i21555_write: cdev->bar[ IOBAR ]= %x", cdev->bar[ IOBAR ]);
+ nfp_log( NFP_DBG3, "i21555_write: block len %d", len );
+ ne= nfp_copy_from_user_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_DATA, block, len);
+ if (ne) {
+ nfp_log( NFP_DBG1, "i21555_write: nfp_copy_from_user_to_dev failed");
+ return ne;
+ }
+ TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL);
+ TO_LE32_MEM(&hdr[1], len);
+ ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_CONTROL, (const char *)hdr, 8);
+ if (ne) {
+ nfp_log( NFP_DBG1, "i21555_write: nfp_copy_to_dev failed");
+ return ne;
+ }
+
+ ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_WR_LENGTH, (char *)hdr, 4);
+ if (ne) {
+ nfp_log( NFP_DBG1, "i21555_write: nfp_copy_from_dev failed");
+ return ne;
+ }
+
+ TO_LE32_MEM(&tmp32, len);
+ if ( hdr[0] != tmp32 ) {
+ nfp_log( NFP_DBG1, "i21555_write: length not written");
+ return NFP_EIO;
+ }
+ TO_LE16_IO(&tmp16, NFAST_INT_HOST_WRITE_REQUEST >> 16);
+ nfp_outw( cdev, IOBAR, I21555_OFFSET_DOORBELL_SEC_SET, tmp16);
+
+ cdev->stats.write_fail--;
+ cdev->stats.write_block++;
+ cdev->stats.write_byte += len;
+
+ nfp_log( NFP_DBG2, "i21555_write: done");
+ return NFP_SUCCESS;
+}
+
+/* read ------------------------------------------------------- */
+
+static nfp_err i21555_read( char *block, int len, void *ctx, int *rcount) {
+ nfp_cdev *cdev;
+ nfp_err ne;
+ int count;
+
+ nfp_log( NFP_DBG2, "i21555_read: entered");
+ *rcount= 0;
+
+ cdev= (nfp_cdev *)ctx;
+ if(!cdev) {
+ nfp_log( NFP_DBG1, "i21555_read: NULL pdev");
+ return NFP_ENODEV;
+ }
+
+ cdev->stats.read_fail++;
+
+ if(!cdev->bar[ IOBAR ]) {
+ nfp_log( NFP_DBG1, "i21555_read: null BAR[%d]", IOBAR );
+ return NFP_ENOMEM;
+ }
+
+ ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)&count, 4);
+ if (ne) {
+ nfp_log( NFP_DBG1, "i21555_read: nfp_copy_from_dev failed.");
+ return ne;
+ }
+ count= FROM_LE32_MEM(&count);
+ if(count<0 || count>len) {
+ nfp_log( NFP_DBG1, "i21555_read: bad byte count (%d) from device", count);
+ return NFP_EIO;
+ }
+ ne= nfp_copy_to_user_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_DATA, block, count);
+ if (ne) {
+ nfp_log( NFP_DBG1, "i21555_read: nfp_copy_to_user failed.");
+ return ne;
+ }
+ nfp_log( NFP_DBG2, "i21555_read: done");
+ *rcount= count;
+ cdev->stats.read_fail--;
+ cdev->stats.read_block++;
+ cdev->stats.read_byte += len;
+ return NFP_SUCCESS;
+}
+
+/* chupdate ------------------------------------------------------- */
+
+/* ARGSUSED */
+static nfp_err i21555_chupdate( char *data, int len, void *ctx ) {
+ nfp_log( NFP_DBG1, "i21555_chupdate: NYI");
+ return NFP_SUCCESS;
+}
+
+/* ensure reading -------------------------------------------------- */
+
+static nfp_err i21555_ensure_reading( unsigned int addr, int len, void *ctx ) {
+ nfp_cdev *cdev;
+ unsigned int hdr[3];
+ unsigned short tmp16;
+ unsigned int tmp32;
+ nfp_err ne;
+ int hdr_len;
+
+ nfp_log( NFP_DBG2, "i21555_ensure_reading: entered");
+
+ cdev= (nfp_cdev *)ctx;
+ if(!cdev) {
+ nfp_log( NFP_DBG1, "i21555_ensure_reading: NULL pdev");
+ return NFP_ENODEV;
+ }
+
+ cdev->stats.ensure_fail++;
+
+ if(!cdev->bar[ IOBAR ]) {
+ nfp_log( NFP_DBG1, "i21555_ensure_reading: null BAR[%d]", IOBAR );
+ return NFP_ENOMEM;
+ }
+
+ ne = i21555_started( cdev );
+ if (ne) {
+ if (ne != NFP_ESTARTING) {
+ nfp_log( NFP_DBG1, "i21555_ensure_reading: i21555_started failed");
+ }
+ return ne;
+ }
+
+ nfp_log( NFP_DBG3, "i21555_ensure_reading: pdev->bar[ MEMBAR ]= %x", cdev->bar[ MEMBAR ]);
+ nfp_log( NFP_DBG3, "i21555_ensure_reading: pdev->bar[ IOBAR ]= %x", cdev->bar[ IOBAR ]);
+ if(addr) {
+ nfp_log( NFP_DBG3, "i21555_ensure_reading: new format, addr %x", addr);
+ TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL_PCI_PUSH);
+ TO_LE32_MEM(&hdr[1], len);
+ TO_LE32_MEM(&hdr[2], addr);
+ hdr_len= 12;
+ } else {
+ TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL);
+ TO_LE32_MEM(&hdr[1], len);
+ hdr_len= 8;
+ }
+ ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_RD_CONTROL, (const char *)hdr, hdr_len);
+ if (ne) {
+ nfp_log( NFP_DBG1, "i21555_ensure_reading: nfp_copy_to_dev failed");
+ return ne;
+ }
+
+ ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)hdr, 4);
+ if (ne) {
+ nfp_log( NFP_DBG1, "i21555_ensure_reading: nfp_copy_from_dev failed");
+ return ne;
+ }
+
+ TO_LE32_MEM(&tmp32, len);
+
+ if ( hdr[0] != tmp32 ) {
+ nfp_log( NFP_DBG1, "i21555_ensure_reading: len not written");
+ return NFP_EIO;
+ }
+ TO_LE16_IO( &tmp16, NFAST_INT_HOST_READ_REQUEST >> 16);
+ nfp_outw( cdev, IOBAR, I21555_OFFSET_DOORBELL_SEC_SET, tmp16);
+
+ cdev->stats.ensure_fail--;
+ cdev->stats.ensure++;
+
+ return NFP_SUCCESS;
+}
+
+/* command device structure ------------------------------------- */
+
+const nfpcmd_dev i21555_cmddev = {
+ "nCipher Gen 2 PCI",
+ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_21555,
+ PCI_VENDOR_ID_NCIPHER, PCI_SUBSYSTEM_ID_NFAST_REV1,
+ { 0, IOSIZE | PCI_BASE_ADDRESS_SPACE_IO, NFPCI_RAM_MINSIZE_JOBS, 0, 0, 0 },
+ NFP_CMD_FLG_NEED_IOBUF,
+ i21555_create,
+ i21555_destroy,
+ i21555_open,
+ i21555_close,
+ i21555_isr,
+ i21555_write,
+ i21555_read,
+ i21555_chupdate,
+ i21555_ensure_reading,
+ i21555_debug,
+};
diff --git a/usr/src/uts/common/io/nfp/i21555.h b/usr/src/uts/common/io/nfp/i21555.h
new file mode 100644
index 0000000000..d8f3965938
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/i21555.h
@@ -0,0 +1,51 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+
+#ifndef I21555_H
+#define I21555_H
+
+#ifndef PCI_VENDOR_ID_INTEL
+#define PCI_VENDOR_ID_INTEL 0x8086
+#endif
+
+#ifndef PCI_DEVICE_ID_INTEL_21555
+#define PCI_DEVICE_ID_INTEL_21555 0xb555
+#endif
+
+#ifndef PCI_VENDOR_ID_NCIPHER
+#define PCI_VENDOR_ID_NCIPHER 0x0100
+#endif
+
+#ifndef PCI_SUBSYSTEM_ID_NFAST_REV1
+#define PCI_SUBSYSTEM_ID_NFAST_REV1 0x0100
+#endif
+
+#define I21555_OFFSET_DOORBELL_PRI_SET 0x9C
+#define I21555_OFFSET_DOORBELL_SEC_SET 0x9E
+#define I21555_OFFSET_DOORBELL_PRI_CLEAR 0x98
+
+#define I21555_OFFSET_DOORBELL_PRI_SET_MASK 0xA4
+#define I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK 0xA0
+
+#define I21555_DOORBELL_PRI_ENABLE 0x0000
+#define I21555_DOORBELL_PRI_DISABLE 0xFFFF
+
+#define I21555_CFG_SEC_CMD_STATUS 0x44
+
+#define CFG_CMD_MASTER 0x0004
+
+#define IOBAR 1
+#define MEMBAR 2
+
+#define IOSIZE 0x100
+
+extern nfp_err i21555_debug( int cmd, void *ctx );
+
+#endif
diff --git a/usr/src/uts/common/io/nfp/i21555d.c b/usr/src/uts/common/io/nfp/i21555d.c
new file mode 100644
index 0000000000..183ace8275
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/i21555d.c
@@ -0,0 +1,28 @@
+/*
+
+i21555d.c: nCipher PCI HSM intel 21555 debug ioctl
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+
+history
+
+15/05/2002 jsh Original, does nothing
+
+*/
+
+#include "nfp_common.h"
+#include "nfp_error.h"
+#include "nfp_osif.h"
+#include "i21555.h"
+
+/* ARGSUSED */
+nfp_err i21555_debug( int cmd, void *ctx) {
+ nfp_log( NFP_DBG1, "i21555_debug: entered");
+
+ return NFP_EUNKNOWN;
+}
diff --git a/usr/src/uts/common/io/nfp/nfdev-common.h b/usr/src/uts/common/io/nfp/nfdev-common.h
new file mode 100644
index 0000000000..8a97bf2c63
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfdev-common.h
@@ -0,0 +1,141 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+/** \file nfdev-common.h
+ *
+ * \brief nFast device driver (not generic SCSI) ioctl struct definition file
+ * include NFDEV-$(system) for ioctl number definitions
+ *
+ * 1998.07.13 jsh Started
+ *
+ *
+ */
+
+#ifndef NFDEV_COMMON_H
+#define NFDEV_COMMON_H
+
+/**
+ * Result of the ENQUIRY ioctl.
+ */
+typedef struct nfdev_enquiry_str {
+ unsigned int busno; /**< Which bus is the PCI device on. */
+ unsigned char slotno; /**< Which slot is the PCI device in. */
+ unsigned char reserved[3]; /**< for consistant struct alignment */
+} nfdev_enquiry_str;
+
+/**
+ * Result of the STATS ioctl.
+ */
+typedef struct nfdev_stats_str {
+ unsigned long isr; /**< Count interrupts. */
+ unsigned long isr_read; /**< Count read interrupts. */
+ unsigned long isr_write; /**< Count write interrupts. */
+ unsigned long write_fail; /**< Count write failures. */
+ unsigned long write_block; /**< Count blocks written. */
+ unsigned long write_byte; /**< Count bytes written. */
+ unsigned long read_fail; /**< Count read failures. */
+ unsigned long read_block; /**< Count blocks read. */
+ unsigned long read_byte; /**< Count bytes read. */
+ unsigned long ensure_fail; /**< Count read request failures. */
+ unsigned long ensure; /**< Count read requests. */
+} nfdev_stats_str;
+
+/**
+ * Input to the CONTROL ioctl.
+ */
+typedef struct nfdev_control_str {
+ unsigned control; /**< Control flags. */
+} nfdev_control_str;
+
+/** Control bit indicating host supports MOI control */
+#define NFDEV_CONTROL_HOST_MOI 0x0001
+
+/** Index of control bits indicating desired mode
+ *
+ * Desired mode follows the M_ModuleMode enumeration.
+ */
+#define NFDEV_CONTROL_MODE_SHIFT 1
+
+/** Detect a backwards-compatible control value
+ *
+ * Returns true if the request control value "makes no difference", i.e.
+ * and the failure of an attempt to set it is therefore uninteresting.
+ */
+#define NFDEV_CONTROL_HARMLESS(c) ((c) <= 1)
+
+/**
+ * Result of the STATUS ioctl.
+ */
+typedef struct nfdev_status_str {
+ unsigned status; /**< Status flags. */
+ char error[8]; /**< Error string. */
+} nfdev_status_str;
+
+/** Monitor firmware supports MOI control and error reporting */
+#define NFDEV_STATUS_MONITOR_MOI 0x0001
+
+/** Application firmware supports MOI control and error reporting */
+#define NFDEV_STATUS_APPLICATION_MOI 0x0002
+
+/** Application firmware running and supports error reporting */
+#define NFDEV_STATUS_APPLICATION_RUNNING 0x0004
+
+/** HSM failed
+ *
+ * Consult error[] for additional information.
+ */
+#define NFDEV_STATUS_FAILED 0x0008
+
+/** Standard PCI interface. */
+#define NFDEV_IF_STANDARD 0x01
+
+/** PCI interface with results pushed from device
+ * via DMA.
+ */
+#define NFDEV_IF_PCI_PUSH 0x02
+
+/* platform independant base ioctl numbers */
+
+/** Enquiry ioctl.
+ * \return nfdev_enquiry_str describing the attached device. */
+#define NFDEV_IOCTL_NUM_ENQUIRY 0x01
+/** Channel Update ioctl.
+ * \deprecated */
+#define NFDEV_IOCTL_NUM_CHUPDATE 0x02
+/** Ensure Reading ioctl.
+ * Signal a read request to the device.
+ * \param (unsigned int) Length of data to be read.
+ */
+#define NFDEV_IOCTL_NUM_ENSUREREADING 0x03
+/** Device Count ioctl.
+ * Not implemented for on all platforms.
+ * \return (int) the number of attached devices. */
+#define NFDEV_IOCTL_NUM_DEVCOUNT 0x04
+/** Internal Debug ioctl.
+ * Not implemented in release drivers. */
+#define NFDEV_IOCTL_NUM_DEBUG 0x05
+/** PCI Interface Version ioctl.
+ * \param (int) Maximum PCI interface version
+ * supported by the user of the device. */
+#define NFDEV_IOCTL_NUM_PCI_IFVERS 0x06
+/** Statistics ioctl.
+ * \return nfdev_enquiry_str describing the attached device. */
+#define NFDEV_IOCTL_NUM_STATS 0x07
+
+/** Module control ioctl
+ * \param (nfdev_control_str) Value to write to HSM control register
+ */
+#define NFDEV_IOCTL_NUM_CONTROL 0x08
+
+/** Module state ioctl
+ * \return (nfdev_status_str) Values read from HSM status/error registers
+ */
+#define NFDEV_IOCTL_NUM_STATUS 0x09
+
+#endif
diff --git a/usr/src/uts/common/io/nfp/nfdev-solaris.h b/usr/src/uts/common/io/nfp/nfdev-solaris.h
new file mode 100644
index 0000000000..923b902e46
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfdev-solaris.h
@@ -0,0 +1,37 @@
+/*
+
+nfdev-solaris.h: nFast solaris specific device ioctl interface.
+
+(C) Copyright nCipher Corporation Ltd 1998-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+history
+
+14/07/1998 jsh Original
+
+*/
+
+#ifndef NFDEV_SOLARIS_H
+#define NFDEV_SOLARIS_H
+
+#include "nfdev-common.h"
+
+#define NFDEV_IOCTL_TYPE ('n'<<8)
+
+#define NFDEV_IOCTL_ENQUIRY ( NFDEV_IOCTL_TYPE | \
+ NFDEV_IOCTL_NUM_ENQUIRY )
+#define NFDEV_IOCTL_ENSUREREADING ( NFDEV_IOCTL_TYPE | \
+ NFDEV_IOCTL_NUM_ENSUREREADING )
+#define NFDEV_IOCTL_DEVCOUNT ( NFDEV_IOCTL_TYPE | \
+ NFDEV_IOCTL_NUM_DEVCOUNT )
+#define NFDEV_IOCTL_DEBUG ( NFDEV_IOCTL_TYPE | \
+ NFDEV_IOCTL_NUM_DEBUG )
+#define NFDEV_IOCTL_PCI_IFVERS ( NFDEV_IOCTL_TYPE | \
+ NFDEV_IOCTL_NUM_PCI_IFVERS )
+#define NFDEV_IOCTL_STATS ( NFDEV_IOCTL_TYPE | \
+ NFDEV_IOCTL_NUM_STATS )
+
+#endif /* NFDEV_SOLARIS_H */
diff --git a/usr/src/uts/common/io/nfp/nfp.h b/usr/src/uts/common/io/nfp/nfp.h
new file mode 100644
index 0000000000..9704f04fbc
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfp.h
@@ -0,0 +1,113 @@
+/*
+
+nfp.h: nFast PCI driver for Solaris 2.5, 2.6 and 2.7
+
+(C) Copyright nCipher Corporation Ltd 2001-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+history
+
+06/05/1998 jsh Original solaris 2.6
+21/05/1999 jsh added support for solaris 2.5
+10/06/1999 jsh added support for solaris 2.7 (32 and 64 bit)
+16/10/2001 jsh moved from nfast to new structure in nfdrv
+
+*/
+
+#ifndef NFP_H
+#define NFP_H
+
+#ifndef _KERNEL
+#error Hello? this is a driver, please compile with -D_KERNEL
+#endif
+
+#if ( CH_KERNELVER < 260 )
+typedef int ioctlptr_t;
+typedef unsigned short uint16_t;
+#define DDI_GET32 ddi_getl
+#define DDI_PUT32 ddi_putl
+#define DDI_GET16 ddi_getw
+#define DDI_PUT16 ddi_putw
+#define DDI_REP_GET8 ddi_rep_getb
+#define DDI_REP_PUT8 ddi_rep_putb
+#define DDI_REP_GET32 ddi_rep_getl
+#define DDI_REP_PUT32 ddi_rep_putl
+#define PCI_CONFIG_GET16 pci_config_getw
+#else /* ( CH_KERNELVER >= 260 ) */
+typedef intptr_t ioctlptr_t;
+#define DDI_GET32 ddi_get32
+#define DDI_PUT32 ddi_put32
+#define DDI_GET16 ddi_get16
+#define DDI_PUT16 ddi_put16
+#define DDI_REP_GET8 ddi_rep_get8
+#define DDI_REP_PUT8 ddi_rep_put8
+#define DDI_REP_GET32 ddi_rep_get32
+#define DDI_REP_PUT32 ddi_rep_put32
+#define PCI_CONFIG_GET16 pci_config_get16
+#endif
+
+#if ( CH_KERNELVER < 270 )
+typedef int nfp_timeout_t;
+#define EXTRA_CB_FLAGS 0
+#define VSXPRINTF(s, n, format, ap) vsprintf (s, format, ap)
+#else /* ( CH_KERNELVER >= 270 ) */
+typedef timeout_id_t nfp_timeout_t;
+#define EXTRA_CB_FLAGS D_64BIT
+#define VSXPRINTF(s, n, format, ap) vsnprintf(s, n, format, ap)
+#endif
+
+typedef struct nfp_dev {
+ int rd_ok;
+ int wr_ok;
+
+ int ifvers;
+
+ /* for PCI push read interface */
+ unsigned char *read_buf;
+ ddi_dma_handle_t read_dma_handle;
+ ddi_dma_cookie_t read_dma_cookie;
+
+ ddi_acc_handle_t acchandle;
+
+ int rd_dma_ok;
+
+ nfp_timeout_t wrtimeout;
+ nfp_timeout_t rdtimeout;
+
+ struct buf *wr_bp;
+ int wr_ready;
+ int rd_ready;
+ int rd_pending;
+ int rd_outstanding;
+ kcondvar_t rd_cv;
+
+ struct pollhead pollhead;
+ dev_info_t *dip;
+
+ ddi_iblock_cookie_t high_iblock_cookie; /* for mutex */
+ ddi_iblock_cookie_t low_iblock_cookie; /* for mutex */
+ kmutex_t high_mutex;
+ kmutex_t low_mutex;
+ int high_intr;
+ ddi_softintr_t soft_int_id;
+ int high_read;
+ int high_write;
+
+ ddi_iblock_cookie_t iblock_cookie; /* for mutex */
+ kmutex_t isr_mutex;
+
+ kmutex_t busy_mutex;
+ int busy;
+
+ ddi_acc_handle_t conf_handle;
+
+ nfp_cdev common;
+ const nfpcmd_dev *cmddev;
+} nfp_dev;
+
+extern struct nfp_dev *nfp_dev_list[];
+
+#endif /* NFP_H */
diff --git a/usr/src/uts/common/io/nfp/nfp_cmd.h b/usr/src/uts/common/io/nfp/nfp_cmd.h
new file mode 100644
index 0000000000..db8af0b2f9
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfp_cmd.h
@@ -0,0 +1,68 @@
+/*
+
+nfp_cmd.h: nCipher PCI HSM command driver decalrations
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+history
+
+10/10/2001 jsh Original
+
+*/
+
+#ifndef NFPCMD_H
+#define NFPCMD_H
+
+#include "nfp_hostif.h"
+#include "nfp_error.h"
+
+/* read and write called with userspace buffer */
+
+typedef struct nfpcmd_dev {
+ const char *name;
+ unsigned short vendorid, deviceid,
+ sub_vendorid, sub_deviceid;
+ unsigned int bar_sizes[6]; /* includes IO bit */
+ unsigned int flags;
+ nfp_err (*create)(struct nfp_cdev *pdev);
+ nfp_err (*destroy)(void * ctx);
+ nfp_err (*open)(void * ctx);
+ nfp_err (*close)(void * ctx);
+ nfp_err (*isr)(void *ctx, int *handled);
+ nfp_err (*write_block)( const char *ublock, int len, void *ctx );
+ nfp_err (*read_block)( char *ublock, int len, void *ctx, int *rcount);
+ nfp_err (*channel_update)( char *data, int len, void *ctx);
+ nfp_err (*ensure_reading)( unsigned int addr, int len, void *ctx );
+ nfp_err (*debug)( int cmd, void *ctx);
+} nfpcmd_dev;
+
+#define NFP_CMD_FLG_NEED_IOBUF 0x1
+
+/* list of all supported drivers ---------------------------------------- */
+
+extern const nfpcmd_dev *nfp_drvlist[];
+
+extern const nfpcmd_dev i21285_cmddev;
+extern const nfpcmd_dev i21555_cmddev;
+extern const nfpcmd_dev bcm5820_cmddev;
+
+#ifndef PCI_BASE_ADDRESS_SPACE_IO
+#define PCI_BASE_ADDRESS_SPACE_IO 0x1
+#endif
+
+#define NFP_MAXDEV 16
+
+
+#define NFP_MEMBAR_MASK ~0xf
+#define NFP_IOBAR_MASK ~0x3
+/*
+ This masks off the bottom bits of the PCI_CSR_BAR which signify that the
+ BAR is an IO BAR rather than a MEM BAR
+*/
+
+#endif
+
diff --git a/usr/src/uts/common/io/nfp/nfp_common.h b/usr/src/uts/common/io/nfp/nfp_common.h
new file mode 100644
index 0000000000..d1d2100fea
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfp_common.h
@@ -0,0 +1,68 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+
+#ifndef NFP_COMMON_H
+#define NFP_COMMON_H
+
+#include <sys/types.h>
+#include <sys/conf.h>
+
+typedef uint32_t UINT32;
+typedef uint8_t BYTE;
+
+#define DEFINE_NFPCI_PACKED_STRUCTS
+#include "nfpci.h"
+#include "nfdev-solaris.h"
+
+typedef int oserr_t;
+
+#if CH_BIGENDIAN
+
+/* Big Endian Sparc */
+
+#define SWP32(x) \
+( (((unsigned int)(x)>>24)&0xff) | (((unsigned int)(x)>>8)&0xff00) | (((unsigned int)(x)<<8)&0xff0000) | (((unsigned int)(x)<<24)&0xff000000) )
+
+#define SWP16(x) ( (((x)>>8)&0xff) | (((x)<<8)&0xff00) )
+
+#define FROM_LE32_IO(x) SWP32(*x)
+#define TO_LE32_IO(x,y) *x=SWP32(y)
+
+#define FROM_LE32_MEM(x) SWP32(*x)
+#define TO_LE32_MEM(x,y) *x=SWP32(y)
+
+#define FROM_LE16_IO(x) SWP16(*x)
+#define TO_LE16_IO(x,y) *x=SWP16(y)
+
+#else
+
+/* Little Endian x86 */
+
+#define FROM_LE32_IO(x) (*x)
+#define TO_LE32_IO(x,y) (*x=y)
+
+#define FROM_LE32_MEM(x) (*x)
+#define TO_LE32_MEM(x,y) (*x=y)
+
+#define FROM_LE16_IO(x) (*x)
+#define TO_LE16_IO(x,y) (*x=y)
+
+#endif /* !CH_BIGENDIAN */
+
+#include <sys/types.h>
+
+#if CH_KERNELVER == 260
+#define nfp_get_lbolt( lbolt, err ) err= drv_getparm( LBOLT, lbolt )
+#else
+#define nfp_get_lbolt( lbolt, err ) { *lbolt= ddi_get_lbolt(); err= 0; }
+#endif
+
+#endif
+
diff --git a/usr/src/uts/common/io/nfp/nfp_error.h b/usr/src/uts/common/io/nfp/nfp_error.h
new file mode 100644
index 0000000000..d64cb78fd4
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfp_error.h
@@ -0,0 +1,48 @@
+/*
+
+nfp_error.h: nCipher PCI HSM error handling
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+history
+
+05/12/2001 jsh Original
+
+*/
+
+#ifndef NFP_ERROR_H
+#define NFP_ERROR_H
+
+#include "nfp_common.h"
+
+#define NFP_SUCCESS 0x0
+#define NFP_EFAULT 0x1
+#define NFP_ENOMEM 0x2
+#define NFP_EINVAL 0x3
+#define NFP_EIO 0x4
+#define NFP_ENXIO 0x5
+#define NFP_ENODEV 0x6
+#define NFP_EINTR 0x7
+#define NFP_ESTARTING 0x8
+#define NFP_EAGAIN 0x9
+#define NFP_EUNKNOWN 0x100
+
+typedef int nfp_err;
+
+extern oserr_t nfp_oserr( nfp_err nerr );
+extern nfp_err nfp_error( oserr_t oerr );
+
+#define nfr( x) \
+ return nfp_error((x))
+
+#define nfer(x, fn, msg) \
+ { oserr_t err=(x); if(err) { nfp_log( NFP_DBG1, #fn ": " msg); return nfp_error(err); } }
+
+#define er(x, fn, msg ) \
+{ nfp_err err=(x); if(err) { nfp_log( NFP_DBG1, #fn ": " msg); return err; } }
+
+#endif
diff --git a/usr/src/uts/common/io/nfp/nfp_hostif.h b/usr/src/uts/common/io/nfp/nfp_hostif.h
new file mode 100644
index 0000000000..3e7d8187e5
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfp_hostif.h
@@ -0,0 +1,54 @@
+/*
+
+nfp_hostif.h: nCipher PCI HSM host interface declarations
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+history
+
+10/10/2001 jsh Original
+
+*/
+
+#ifndef NFP_HOSTIF_H
+#define NFP_HOSTIF_H
+
+#include "nfdev-common.h"
+
+struct nfp_dev;
+
+/* common device structure */
+
+typedef struct nfp_cdev {
+ unsigned char *bar[6];
+ void *extra[6];
+
+ int busno;
+ int slotno;
+
+ void *cmdctx;
+
+ char *iobuf;
+
+ struct nfp_dev* dev;
+
+ struct nfdev_stats_str stats;
+
+} nfp_cdev;
+
+/* callbacks from command drivers -------------------------------------- */
+
+void nfp_read_complete( struct nfp_dev *pdev, int ok);
+void nfp_write_complete( struct nfp_dev *pdev, int ok);
+
+#define NFP_READ_MAX (8 * 1024)
+#define NFP_READBUF_SIZE (NFP_READ_MAX + 8)
+#define NFP_TIMEOUT_SEC 10
+
+#define NFP_DRVNAME "nCipher nFast PCI driver"
+
+#endif
diff --git a/usr/src/uts/common/io/nfp/nfp_ifvers.c b/usr/src/uts/common/io/nfp/nfp_ifvers.c
new file mode 100644
index 0000000000..807b4f24c5
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfp_ifvers.c
@@ -0,0 +1,51 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+
+/*
+ * nfp_ifervs.c - common pci interface versioning
+ *
+ * uses:
+ *
+ * int pdev->ifvers
+ * device interface version
+ *
+ * int nfp_ifvers
+ * interface version limit
+ *
+ * int nfp_alloc_pci_push( nfp_dev *pdev )
+ * allocates resources needed for PCI Push,
+ * if not already allocated, and return True if successful
+ *
+ * void nfp_free_pci_push( nfp_dev *pdev ) {
+ * frees any resources allocated to PCI Push
+ */
+
+void nfp_set_ifvers( nfp_dev *pdev, int vers ) {
+ if( nfp_ifvers != 0 && vers > nfp_ifvers ) {
+ nfp_log( NFP_DBG2,
+ "nfp_set_ifvers: can't set ifvers %d"
+ " as nfp_ifvers wants max ifvers %d",
+ vers, nfp_ifvers);
+ return;
+ }
+ if( vers >= NFDEV_IF_PCI_PUSH ) {
+ if(!nfp_alloc_pci_push(pdev)) {
+ nfp_log( NFP_DBG1,
+ "nfp_set_ifvers: can't set ifvers %d"
+ " as resources not available",
+ vers);
+ return;
+ }
+ } else {
+ nfp_free_pci_push(pdev);
+ }
+ pdev->ifvers= vers;
+ nfp_log( NFP_DBG3, "nfp_set_ifvers: setting ifvers %d", vers);
+}
diff --git a/usr/src/uts/common/io/nfp/nfp_osif.h b/usr/src/uts/common/io/nfp/nfp_osif.h
new file mode 100644
index 0000000000..17ffe469ce
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfp_osif.h
@@ -0,0 +1,105 @@
+/*
+
+nfp_osif.h: nCipher PCI HSM OS interface declarations
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+history
+
+10/10/2001 jsh Original
+
+*/
+
+#ifndef NFP_OSIF_H
+#define NFP_OSIF_H
+
+#include "nfp_hostif.h"
+#include "nfp_error.h"
+
+/* general typedefs ----------------------------------------------- */
+
+typedef volatile unsigned int reg32;
+typedef volatile unsigned short reg16;
+typedef volatile unsigned char reg8;
+
+/* sempaphores, mutexs and events --------------------------------- */
+
+#if 0
+extern nfp_err nfp_sema_init( nfp_sema *sema, int initial);
+extern void nfp_sema_destroy( nfp_sema *sema );
+extern void nfp_sema_post( nfp_sema *sema );
+extern void nfp_sema_wait( nfp_sema *sema );
+extern int nfp_sema_wait_sig( nfp_sema *sema );
+
+extern nfp_err nfp_mutex_init( nfp_mutex *mutex );
+extern void nfp_mutex_destroy( nfp_mutex *mutex );
+extern void nfp_mutex_enter( nfp_mutex *mutex );
+extern void nfp_mutex_exit( nfp_mutex *mutex );
+
+extern nfp_err nfp_event_init( nfp_event *event );
+extern void nfp_event_destroy( nfp_event *event );
+extern void nfp_event_set( nfp_event *event );
+extern void nfp_event_clear( nfp_event *event );
+extern void nfp_event_wait( nfp_event *event );
+extern void nfp_event_wait_sig( nfp_event *event );
+
+#endif
+
+/* timeouts ------------------------------------------------------ */
+
+extern void nfp_sleep( int ms );
+
+/* memory handling ----------------------------------------------- */
+
+#define KMALLOC_DMA 0
+#define KMALLOC_CACHED 1
+
+extern void *nfp_kmalloc( int size, int flags );
+extern void *nfp_krealloc( void *ptr, int size, int flags );
+extern void nfp_kfree( void * );
+
+/* config space access ------------------------------------------------ */
+
+/* return Little Endian 32 bit config register */
+extern nfp_err nfp_config_inl( nfp_cdev *pdev, int offset, unsigned int *res );
+
+/* io space access ------------------------------------------------ */
+
+extern unsigned int nfp_inl( nfp_cdev *pdev, int bar, int offset );
+extern unsigned short nfp_inw( nfp_cdev *pdev, int bar, int offset );
+extern void nfp_outl( nfp_cdev *pdev, int bar, int offset, unsigned int data );
+extern void nfp_outw( nfp_cdev *pdev, int bar, int offset, unsigned short data );
+
+/* user and device memory space access ---------------------------- */
+
+/* NB these 2 functions are not guarenteed to be re-entrant for a given device */
+extern nfp_err nfp_copy_from_user_to_dev( nfp_cdev *cdev, int bar, int offset, const char *ubuf, int len);
+extern nfp_err nfp_copy_to_user_from_dev( nfp_cdev *cdev, int bar, int offset, char *ubuf, int len);
+
+extern nfp_err nfp_copy_from_user( char *kbuf, const char *ubuf, int len );
+extern nfp_err nfp_copy_to_user( char *ubuf, const char *kbuf, int len );
+
+extern nfp_err nfp_copy_from_dev( nfp_cdev *cdev, int bar, int offset, char *kbuf, int len );
+extern nfp_err nfp_copy_to_dev( nfp_cdev *cdev, int bar, int offset, const char *kbuf, int len);
+
+/* debug ------------------------------------------------------------ */
+
+#define NFP_DBG1 1
+#define NFP_DBGE NFP_DBG1
+#define NFP_DBG2 2
+#define NFP_DBG3 3
+#define NFP_DBG4 4
+
+#ifdef STRANGE_VARARGS
+extern void nfp_log();
+#else
+extern void nfp_log( int severity, const char *format, ...);
+#endif
+
+extern int nfp_debug;
+
+#endif
diff --git a/usr/src/uts/common/io/nfp/nfpci.h b/usr/src/uts/common/io/nfp/nfpci.h
new file mode 100644
index 0000000000..793f5995e6
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/nfpci.h
@@ -0,0 +1,171 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+
+/*
+*
+* NFPCI.H - nFast PCI interface definition file
+*
+*
+*
+* 1998.06.09 IH Started
+*
+* The interface presented by nFast PCI devices consists of:
+*
+* A region of shared RAM used for data transfer & control information
+* A doorbell interrupt register, so both sides can give each other interrupts
+* A number of DMA channels for transferring data
+*/
+
+#ifndef NFPCI_H
+#define NFPCI_H
+
+/* Sizes of some regions */
+#define NFPCI_RAM_MINSIZE 0x00100000
+/* This is the minimum size of shared RAM. In future it may be possible to
+ negotiate larger sizes of shared RAM or auto-detect how big it is */
+#define NFPCI_RAM_MINSIZE_JOBS 0x00020000 /* standard jobs only */
+#define NFPCI_RAM_MINSIZE_KERN 0x00040000 /* standard and kernel jobs */
+
+/* Offsets within shared memory space.
+ The following main regions are:
+ jobs input area
+ jobs output area
+ kernel jobs input area
+ kernel output area
+*/
+
+#define NFPCI_OFFSET_JOBS 0x00000000
+#define NFPCI_OFFSET_JOBS_WR 0x00000000
+#define NFPCI_OFFSET_JOBS_RD 0x00010000
+#define NFPCI_OFFSET_KERN 0x00020000
+#define NFPCI_OFFSET_KERN_WR 0x00020000
+#define NFPCI_OFFSET_KERN_RD 0x00030000
+
+/* Interrupts, defined by bit position in doorbell register */
+
+/* Interrupts from device to host */
+#define NFAST_INT_DEVICE_WRITE_OK 0x00000001
+#define NFAST_INT_DEVICE_WRITE_FAILED 0x00000002
+#define NFAST_INT_DEVICE_READ_OK 0x00000004
+#define NFAST_INT_DEVICE_READ_FAILED 0x00000008
+#define NFAST_INT_DEVICE_KERN_WRITE_OK 0x00000010
+#define NFAST_INT_DEVICE_KERN_WRITE_FAILED 0x00000020
+#define NFAST_INT_DEVICE_KERN_READ_OK 0x00000040
+#define NFAST_INT_DEVICE_KERN_READ_FAILED 0x00000080
+
+/* Interrupts from host to device */
+#define NFAST_INT_HOST_WRITE_REQUEST 0x00010000
+#define NFAST_INT_HOST_READ_REQUEST 0x00020000
+#define NFAST_INT_HOST_DEBUG 0x00040000
+#define NFAST_INT_HOST_KERN_WRITE_REQUEST 0x00080000
+#define NFAST_INT_HOST_KERN_READ_REQUEST 0x00100000
+
+/* Ordinary job submission ------------------------ */
+
+/* The NFPCI_OFFSET_JOBS_WR and NFPCI_OFFSET_JOBS_RD regions are defined
+ by the following (byte) address offsets... */
+
+#define NFPCI_OFFSET_CONTROL 0x0
+#define NFPCI_OFFSET_LENGTH 0x4
+#define NFPCI_OFFSET_DATA 0x8
+#define NFPCI_OFFSET_PUSH_ADDR 0x8
+
+#define NFPCI_JOBS_WR_CONTROL (NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_CONTROL)
+#define NFPCI_JOBS_WR_LENGTH (NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_LENGTH)
+#define NFPCI_JOBS_WR_DATA (NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_DATA)
+#define NFPCI_MAX_JOBS_WR_LEN (0x0000FFF8)
+
+#define NFPCI_JOBS_RD_CONTROL (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_CONTROL)
+#define NFPCI_JOBS_RD_LENGTH (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_LENGTH)
+#define NFPCI_JOBS_RD_DATA (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_DATA)
+/* address in PCI space of host buffer for NFPCI_JOB_CONTROL_PCI_PUSH */
+#define NFPCI_JOBS_RD_PUSH_ADDR (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_PUSH_ADDR)
+#define NFPCI_MAX_JOBS_RD_LEN (0x000FFF8)
+
+/* Kernel inferface job submission ---------------- */
+
+#define NFPCI_KERN_WR_CONTROL (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_CONTROL)
+#define NFPCI_KERN_WR_LENGTH (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_LENGTH)
+#define NFPCI_KERN_WR_DATA (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_DATA)
+#define NFPCI_MAX_KERN_WR_LEN (0x0000FFF8)
+
+#define NFPCI_KERN_RD_CONTROL (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_CONTROL)
+#define NFPCI_KERN_RD_LENGTH (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_LENGTH)
+#define NFPCI_KERN_RD_DATA (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_DATA)
+/* address in PCI space of host buffer for NFPCI_JOB_CONTROL_PCI_PUSH */
+#define NFPCI_KERN_RD_ADDR (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_PUSH_ADDR)
+#define NFPCI_MAX_KERN_RD_LEN (0x000FFF8)
+
+#ifdef DEFINE_NFPCI_PACKED_STRUCTS
+typedef struct
+{
+ UINT32 controlword;
+ UINT32 length; /* length of data to follow */
+ union {
+ BYTE data[1];
+ UINT32 addr;
+ } uu;
+}
+ NFPCI_JOBS_BLOCK;
+#endif
+
+
+#define NFPCI_JOB_CONTROL 0x00000001
+#define NFPCI_JOB_CONTROL_PCI_PUSH 0x00000002
+/*
+ The 'Control' word is analogous to the SCSI read/write address;
+ 1 = standard push/pull IO
+ 2 = push/push IO
+
+ To submit a block of job data, the host:
+ - sets the (32-bit, little-endian) word at NFPCI_JOBS_WR_CONTROL to NFPCI_JOB_CONTROL
+ - sets the word at NFPCI_JOBS_WR_LENGTH to the length of the data
+ - copies the data to NFPCI_JOBS_WR_DATA
+ - sets interrupt NFAST_INT_HOST_WRITE_REQUEST in the doorbell register
+ - awaits the NFAST_INT_DEVICE_WRITE_OK (or _FAILED) interrupts back
+
+ To read a block of jobs back, the host:
+ - sets the word at NFPCI_JOBS_RD_CONTROL to NFPCI_JOB_CONTROL
+ - sets the word at NFPCI_JOBS_RD_LENGTH to the max length for returned data
+ - sets interrupt NFAST_INT_HOST_READ_REQUEST
+ - awaits the NFAST_INT_DEVICE_READ_OK (or _FAILED) interrupt
+ - reads the data from NFPCI_JOBS_RD_DATA; the module will set the word at
+ NFPCI_JOBS_RD_LENGTH to its actual length.
+
+ Optionally the host can request the PCI read data to be pushed to host PCI mapped ram:
+ - allocates a contiguous PCI addressable buffer for a NFPCI_JOBS_BLOCK of max
+ size NFPCI_MAX_JOBS_RD_LEN (or NFPCI_MAX_KERN_RD_LEN) + 8
+ - sets the word at NFPCI_JOBS_RD_CONTROL to NFPCI_JOB_CONTROL_PCI_PUSH
+ - sets the word at NFPCI_JOBS_RD_LENGTH to the max length for returned data
+ - sets the word at NFPCI_JOBS_RD_PUSH_ADDR to be the host PCI address of
+ the buffer
+ - sets interrupt NFAST_INT_HOST_READ_REQUEST
+ - awaits the NFAST_INT_DEVICE_READ_OK (or _FAILED) interrupt
+ - reads the data from the buffer at NFPCI_OFFSET_DATA in the buffer. The
+ module will set NFPCI_OFFSET_LENGTH to the actual length.
+*/
+
+#define NFPCI_SCRATCH_CONTROL 0
+
+#define NFPCI_SCRATCH_CONTROL_HOST_MOI (1<<0)
+#define NFPCI_SCRATCH_CONTROL_MODE_SHIFT 1
+#define NFPCI_SCRATCH_CONTROL_MODE_MASK (3<<NFPCI_SCRATCH_CONTROL_MODE_SHIFT)
+
+#define NFPCI_SCRATCH_STATUS 1
+
+#define NFPCI_SCRATCH_STATUS_MONITOR_MOI (1<<0)
+#define NFPCI_SCRATCH_STATUS_APPLICATION_MOI (1<<1)
+#define NFPCI_SCRATCH_STATUS_APPLICATION_RUNNING (1<<2)
+#define NFPCI_SCRATCH_STATUS_ERROR (1<<3)
+
+#define NFPCI_SCRATCH_ERROR_LO 2
+#define NFPCI_SCRATCH_ERROR_HI 3
+
+#endif
diff --git a/usr/src/uts/common/io/nfp/osif.c b/usr/src/uts/common/io/nfp/osif.c
new file mode 100644
index 0000000000..fba62f9a37
--- /dev/null
+++ b/usr/src/uts/common/io/nfp/osif.c
@@ -0,0 +1,184 @@
+/*
+
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved
+
+Copyright (c) 2008-2013 Thales e-Security All rights reserved
+
+Copyright (c) 2014 Thales UK All rights reserved
+
+*/
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/conf.h>
+#include <sys/uio.h>
+#include <sys/map.h>
+#include <sys/debug.h>
+#include <sys/modctl.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/open.h>
+#include <sys/stat.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/pci.h>
+
+#include "nfp_common.h"
+#include "nfp_hostif.h"
+#include "nfp_error.h"
+#include "nfp_osif.h"
+#include "nfp_cmd.h"
+#include "nfp.h"
+#include "autoversion.h"
+
+/* config space access ---------------------------------- */
+
+nfp_err nfp_config_inl( nfp_cdev *pdev, int offset, unsigned int *res ) {
+ unsigned int tmp32;
+ if ( !pdev || !pdev->dev || !pdev->dev->conf_handle )
+ return NFP_ENODEV;
+
+/* pci_config_get32() does byte swapping, so put back to LE */
+ tmp32 = pci_config_get32( pdev->dev->conf_handle, offset );
+ TO_LE32_IO(res, tmp32);
+
+ return NFP_SUCCESS;
+}
+
+/* user space memory access ---------------------------------- */
+
+nfp_err nfp_copy_from_user( char *kbuf, const char *ubuf, int len) {
+ bcopy(ubuf, kbuf, len);
+ return 0;
+}
+
+nfp_err nfp_copy_to_user( char *ubuf, const char *kbuf, int len) {
+ bcopy(kbuf, ubuf, len);
+ return 0;
+}
+
+nfp_err nfp_copy_from_user_to_dev( nfp_cdev *cdev, int bar, int offset, const char *ubuf, int len) {
+ /* dirty hack on Solaris, as we are called from strategy we are, in fact, copying from kernel mem */
+ return nfp_copy_to_dev( cdev, bar, offset, ubuf, len );
+}
+
+nfp_err nfp_copy_to_user_from_dev( nfp_cdev *cdev, int bar, int offset, char *ubuf, int len) {
+ /* dirty hack on Solaris, as we are called from strategy we are, in fact, copying to kernel mem */
+ return nfp_copy_from_dev( cdev, bar, offset, ubuf, len );
+}
+
+nfp_err nfp_copy_from_dev( nfp_cdev *cdev, int bar, int offset, char *kbuf, int len) {
+ if( len & 0x3 || offset & 0x3 )
+ DDI_REP_GET8( cdev->extra[bar], (unsigned char *)kbuf, cdev->bar[bar] + offset, len, DDI_DEV_AUTOINCR);
+ else
+ /* LINTED: alignment */
+ DDI_REP_GET32( cdev->extra[bar], (unsigned int *)kbuf, (unsigned int *)(cdev->bar[bar] + offset), len / 4, DDI_DEV_AUTOINCR);
+ return NFP_SUCCESS;
+}
+
+nfp_err nfp_copy_to_dev( nfp_cdev *cdev, int bar, int offset, const char *kbuf, int len) {
+ if( len & 0x3 || offset & 0x3 )
+ DDI_REP_PUT8( cdev->extra[bar], (unsigned char *)kbuf, cdev->bar[bar] + offset, len, DDI_DEV_AUTOINCR );
+ else
+ /* LINTED: alignment */
+ DDI_REP_PUT32( cdev->extra[bar], (unsigned int *)kbuf, (unsigned int *)(cdev->bar[bar] + offset), len / 4, DDI_DEV_AUTOINCR );
+ return NFP_SUCCESS;
+}
+
+/* pci io space access --------------------------------------- */
+
+unsigned int nfp_inl( nfp_cdev *pdev, int bar, int offset ) {
+ nfp_log( NFP_DBG3, "nfp_inl: addr %x", (uintptr_t) pdev->bar[bar] + offset);
+ /* LINTED: alignment */
+ return DDI_GET32( pdev->extra[bar], (uint32_t *)(pdev->bar[bar] + offset) );
+}
+
+unsigned short nfp_inw( nfp_cdev *pdev, int bar, int offset ) {
+ nfp_log( NFP_DBG3, "nfp_inw: addr %x", (uintptr_t) pdev->bar[bar] + offset);
+ /* LINTED: alignment */
+ return DDI_GET16( pdev->extra[bar], (unsigned short *)(pdev->bar[ bar ] + offset) );
+}
+
+void nfp_outl( nfp_cdev *pdev, int bar, int offset, unsigned int data ) {
+ nfp_log( NFP_DBG3, "nfp_outl: addr %x, data %x", (uintptr_t) pdev->bar[bar] + offset, data);
+ /* LINTED: alignment */
+ DDI_PUT32( pdev->extra[bar], (uint32_t *)(pdev->bar[ bar ] + offset), data );
+}
+
+void nfp_outw( nfp_cdev *pdev, int bar, int offset, unsigned short data ) {
+ nfp_log( NFP_DBG3, "nfp_outl: addr %x, data %x", (uintptr_t) pdev->bar[bar] + offset, data);
+ /* LINTED: alignment */
+ DDI_PUT16( pdev->extra[bar], (unsigned short *)(pdev->bar[ bar ] + offset), data );
+}
+
+/* logging ---------------------------------------------------- */
+
+void nfp_log( int level, const char *fmt, ...)
+{
+ auto char buf[256];
+ va_list ap;
+
+ switch (level) {
+ case NFP_DBG4: if (nfp_debug < 4) break;
+ /*FALLTHROUGH*/
+ case NFP_DBG3: if (nfp_debug < 3) break;
+ /*FALLTHROUGH*/
+ case NFP_DBG2: if (nfp_debug < 2) break;
+ /*FALLTHROUGH*/
+ case NFP_DBG1: if (nfp_debug < 1) break;
+ /*FALLTHROUGH*/
+ default:
+ va_start(ap, fmt);
+ (void) vsnprintf(buf, 256, fmt, ap);
+ va_end(ap);
+ cmn_err(CE_CONT, "!" VERSION_COMPNAME " " VERSION_NO ": %s\n", buf);
+ break;
+ }
+}
+
+struct errstr {
+ int oserr;
+ nfp_err nferr;
+};
+
+
+static struct errstr errtab[] = {
+ { EFAULT, NFP_EFAULT },
+ { ENOMEM, NFP_ENOMEM },
+ { EINVAL, NFP_EINVAL },
+ { EIO, NFP_EIO },
+ { ENXIO, NFP_ENXIO },
+ { ENODEV, NFP_ENODEV },
+ { EINVAL, NFP_EUNKNOWN },
+ { 0, 0 }
+};
+
+nfp_err nfp_error( int oserr )
+{
+ struct errstr *perr;
+ if(!oserr)
+ return 0;
+ perr= errtab;
+ while(perr->nferr) {
+ if(perr->oserr == oserr)
+ return perr->nferr;
+ perr++;
+ }
+ return NFP_EUNKNOWN;
+}
+
+int nfp_oserr( nfp_err nferr )
+{
+ struct errstr *perr;
+ if(nferr == NFP_SUCCESS)
+ return 0;
+ perr= errtab;
+ while(perr->nferr) {
+ if(perr->nferr == nferr)
+ return perr->oserr;
+ perr++;
+ }
+ return EIO;
+}
diff --git a/usr/src/uts/common/io/nvme/nvme.c b/usr/src/uts/common/io/nvme/nvme.c
index b3e8172d4f..57a453f5a5 100644
--- a/usr/src/uts/common/io/nvme/nvme.c
+++ b/usr/src/uts/common/io/nvme/nvme.c
@@ -59,7 +59,7 @@
* but they share some driver state: the command array (holding pointers to
* commands currently being processed by the hardware) and the active command
* counter. Access to a submission queue and the shared state is protected by
- * nq_mutex, completion queue is protected by ncq_mutex.
+ * nq_mutex; completion queue is protected by ncq_mutex.
*
* When a command is submitted to a queue pair the active command counter is
* incremented and a pointer to the command is stored in the command array. The
@@ -202,6 +202,23 @@
* device.
*
*
+ * NVMe Hotplug:
+ *
+ * The driver supports hot removal. The driver uses the NDI event framework
+ * to register a callback, nvme_remove_callback, to clean up when a disk is
+ * removed. In particular, the driver will unqueue outstanding I/O commands and
+ * set n_dead on the softstate to true so that other operations, such as ioctls
+ * and command submissions, fail as well.
+ *
+ * While the callback registration relies on the NDI event framework, the
+ * removal event itself is kicked off in the PCIe hotplug framework, when the
+ * PCIe bridge driver ("pcieb") gets a hotplug interrupt indicatating that a
+ * device was removed from the slot.
+ *
+ * The NVMe driver instance itself will remain until the final close of the
+ * device.
+ *
+ *
* DDI UFM Support
*
* The driver supports the DDI UFM framework for reporting information about
@@ -1065,6 +1082,10 @@ nvme_submit_admin_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
static int
nvme_submit_io_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
{
+ if (cmd->nc_nvme->n_dead) {
+ return (EIO);
+ }
+
if (sema_tryp(&qp->nq_sema) == 0)
return (EAGAIN);
@@ -3266,6 +3287,47 @@ nvme_fm_errcb(dev_info_t *dip, ddi_fm_error_t *fm_error, const void *arg)
return (fm_error->fme_status);
}
+static void
+nvme_remove_callback(dev_info_t *dip, ddi_eventcookie_t cookie, void *a,
+ void *b)
+{
+ nvme_t *nvme = a;
+
+ nvme->n_dead = B_TRUE;
+
+ /*
+ * Fail all outstanding commands, including those in the admin queue
+ * (queue 0).
+ */
+ for (uint_t i = 0; i < nvme->n_ioq_count + 1; i++) {
+ nvme_qpair_t *qp = nvme->n_ioq[i];
+
+ mutex_enter(&qp->nq_mutex);
+ for (size_t j = 0; j < qp->nq_nentry; j++) {
+ nvme_cmd_t *cmd = qp->nq_cmd[j];
+ nvme_cmd_t *u_cmd;
+
+ if (cmd == NULL) {
+ continue;
+ }
+
+ /*
+ * Since we have the queue lock held the entire time we
+ * iterate over it, it's not possible for the queue to
+ * change underneath us. Thus, we don't need to check
+ * that the return value of nvme_unqueue_cmd matches the
+ * requested cmd to unqueue.
+ */
+ u_cmd = nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid);
+ taskq_dispatch_ent(qp->nq_cq->ncq_cmd_taskq,
+ cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
+
+ ASSERT3P(u_cmd, ==, cmd);
+ }
+ mutex_exit(&qp->nq_mutex);
+ }
+}
+
static int
nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
@@ -3289,6 +3351,17 @@ nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
ddi_set_driver_private(dip, nvme);
nvme->n_dip = dip;
+ /* Set up event handlers for hot removal. */
+ if (ddi_get_eventcookie(nvme->n_dip, DDI_DEVI_REMOVE_EVENT,
+ &nvme->n_rm_cookie) != DDI_SUCCESS) {
+ goto fail;
+ }
+ if (ddi_add_event_handler(nvme->n_dip, nvme->n_rm_cookie,
+ nvme_remove_callback, nvme, &nvme->n_ev_rm_cb_id) !=
+ DDI_SUCCESS) {
+ goto fail;
+ }
+
mutex_init(&nvme->n_minor.nm_mutex, NULL, MUTEX_DRIVER, NULL);
nvme->n_strict_version = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
@@ -3602,6 +3675,12 @@ nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
if (nvme->n_product != NULL)
strfree(nvme->n_product);
+ /* Clean up hot removal event handler. */
+ if (nvme->n_ev_rm_cb_id != NULL) {
+ (void) ddi_remove_event_handler(nvme->n_ev_rm_cb_id);
+ }
+ nvme->n_ev_rm_cb_id = NULL;
+
ddi_soft_state_free(nvme_state, instance);
return (DDI_SUCCESS);
@@ -3890,6 +3969,11 @@ static int
nvme_bd_mediainfo(void *arg, bd_media_t *media)
{
nvme_namespace_t *ns = arg;
+ nvme_t *nvme = ns->ns_nvme;
+
+ if (nvme->n_dead) {
+ return (EIO);
+ }
media->m_nblks = ns->ns_block_count;
media->m_blksize = ns->ns_block_size;
@@ -3910,8 +3994,9 @@ nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc)
boolean_t poll;
int ret;
- if (nvme->n_dead)
+ if (nvme->n_dead) {
return (EIO);
+ }
cmd = nvme_create_nvm_cmd(ns, opc, xfer);
if (cmd == NULL)
@@ -3992,6 +4077,11 @@ static int
nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid)
{
nvme_namespace_t *ns = arg;
+ nvme_t *nvme = ns->ns_nvme;
+
+ if (nvme->n_dead) {
+ return (EIO);
+ }
/*LINTED: E_BAD_PTR_CAST_ALIGN*/
if (*(uint64_t *)ns->ns_eui64 != 0) {
@@ -4549,6 +4639,9 @@ nvme_ioctl_detach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
if (nsid == 0)
return (EINVAL);
+ if (nvme->n_ns[nsid - 1].ns_ignore)
+ return (0);
+
rv = bd_detach_handle(nvme->n_ns[nsid - 1].ns_bd_hdl);
if (rv != DDI_SUCCESS)
rv = EBUSY;
@@ -4579,6 +4672,14 @@ nvme_ioctl_attach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
kmem_free(idns, sizeof (nvme_identify_nsid_t));
+ if (nvme->n_ns[nsid - 1].ns_ignore)
+ return (ENOTSUP);
+
+ if (nvme->n_ns[nsid - 1].ns_bd_hdl == NULL)
+ nvme->n_ns[nsid - 1].ns_bd_hdl = bd_alloc_handle(
+ &nvme->n_ns[nsid - 1], &nvme_bd_ops, &nvme->n_prp_dma_attr,
+ KM_SLEEP);
+
rv = bd_attach_handle(nvme->n_dip, nvme->n_ns[nsid - 1].ns_bd_hdl);
if (rv != DDI_SUCCESS)
rv = EBUSY;
diff --git a/usr/src/uts/common/io/nvme/nvme_var.h b/usr/src/uts/common/io/nvme/nvme_var.h
index fb8f4ba771..ea378b8be4 100644
--- a/usr/src/uts/common/io/nvme/nvme_var.h
+++ b/usr/src/uts/common/io/nvme/nvme_var.h
@@ -114,20 +114,23 @@ struct nvme_cq {
struct nvme_qpair {
size_t nq_nentry;
+ /* submission fields */
nvme_dma_t *nq_sqdma;
nvme_sqe_t *nq_sq;
uint_t nq_sqhead;
uint_t nq_sqtail;
uintptr_t nq_sqtdbl;
+ /* completion */
nvme_cq_t *nq_cq;
- nvme_cmd_t **nq_cmd;
- uint16_t nq_next_cmd;
- uint_t nq_active_cmds;
+ /* shared structures for completion and submission */
+ nvme_cmd_t **nq_cmd; /* active command array */
+ uint16_t nq_next_cmd; /* next potential empty queue slot */
+ uint_t nq_active_cmds; /* number of active cmds */
- kmutex_t nq_mutex;
- ksema_t nq_sema;
+ kmutex_t nq_mutex; /* protects shared state */
+ ksema_t nq_sema; /* semaphore to ensure q always has >= 1 empty slot */
};
struct nvme {
@@ -188,7 +191,12 @@ struct nvme {
nvme_identify_ctrl_t *n_idctl;
+ /* Pointer to the admin queue, which is always queue 0 in n_ioq. */
nvme_qpair_t *n_adminq;
+ /*
+ * All command queues, including the admin queue.
+ * Its length is: n_ioq_count + 1.
+ */
nvme_qpair_t **n_ioq;
nvme_cq_t **n_cq;
@@ -244,6 +252,10 @@ struct nvme {
uint32_t n_vendor_event;
uint32_t n_unknown_event;
+ /* hot removal NDI event handling */
+ ddi_eventcookie_t n_rm_cookie;
+ ddi_callback_id_t n_ev_rm_cb_id;
+
/* DDI UFM handle */
ddi_ufm_handle_t *n_ufmh;
/* Cached Firmware Slot Information log page */
diff --git a/usr/src/uts/common/io/overlay/overlay.c b/usr/src/uts/common/io/overlay/overlay.c
new file mode 100644
index 0000000000..e837b15c93
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay.c
@@ -0,0 +1,2185 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * Overlay Devices
+ *
+ * Overlay devices provide a means for creating overlay networks, a means of
+ * multiplexing multiple logical, isolated, and discrete layer two and layer
+ * three networks on top of one physical network.
+ *
+ * In general, these overlay devices encapsulate the logic to answer two
+ * different questions:
+ *
+ * 1) How should I transform a packet to put it on the wire?
+ * 2) Where should I send a transformed packet?
+ *
+ * Each overlay device is presented to the user as a GLDv3 device. While the
+ * link itself cannot have an IP interface created on top of it, it allows for
+ * additional GLDv3 devices, such as a VNIC, to be created on top of it which
+ * can be plumbed up with IP interfaces.
+ *
+ *
+ * --------------------
+ * General Architecture
+ * --------------------
+ *
+ * The logical overlay device that a user sees in dladm(1M) is a combination of
+ * two different components that work together. The first component is this
+ * kernel module, which is responsible for answering question one -- how should
+ * I transform a packet to put it on the wire.
+ *
+ * The second component is what we call the virtual ARP daemon, or varpd. It is
+ * a userland component that is responsible for answering the second question --
+ * Where should I send a transformed packet. Instances of the kernel overlay
+ * GLDv3 device ask varpd the question of where should a packet go.
+ *
+ * The split was done for a few reasons. Importantly, we wanted to keep the act
+ * of generating encapsulated packets in the kernel so as to ensure that the
+ * general data path was fast and also kept simple. On the flip side, while the
+ * question of where should something go may be simple, it may often be
+ * complicated and need to interface with several different external or
+ * distributed systems. In those cases, it's simpler to allow for the full
+ * flexibility of userland to be brought to bear to solve that problem and in
+ * general, the path isn't very common.
+ *
+ * The following is what makes up the logical overlay device that a user would
+ * create with dladm(1M).
+ *
+ * Kernel Userland
+ * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
+ * . +--------+ +--------+ +--------+ . . .
+ * . | VNIC 0 | | VNIC 1 | | VNIC 2 | . . .
+ * . +--------+ +--------+ +--------+ . . .
+ * . | | | . . .
+ * . | | | . . .
+ * . +------------+-----------+ . . .
+ * . | . . /dev/overlay .
+ * . +--------------+ . . . +------------+ .
+ * . | | . . . | | .
+ * . | Overlay |======*=================| Virtual | .
+ * . | GLDv3 Device |========================| ARP Daemon | .
+ * . | | . . | | .
+ * . +--------------+ . . +------------+ .
+ * . | . . | .
+ * . | . . | .
+ * . +----------------+ . . +--------+ .
+ * . | Overlay | . . | varpd | .
+ * . | Encapsulation | . . | Lookup | .
+ * . | Plugin | . . | Plugin | .
+ * . +----------------+ . . +--------+ .
+ * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
+ *
+ *
+ * This image shows the two different components and where they live.
+ * Importantly, it also shows that both the kernel overlay device and the
+ * userland varpd both support plugins. The plugins actually implement the
+ * things that users care about and the APIs have been designed to try to
+ * minimize the amount of things that a module writer needs to worry about it.
+ *
+ * IDENTIFIERS
+ *
+ * Every overlay device is defined by a unique identifier which is the overlay
+ * identifier. Its purpose is similar to that of a VLAN identifier, it's a
+ * unique number that is used to differentiate between different entries on the
+ * wire.
+ *
+ * ENCAPSULATION
+ *
+ * An overlay encapsulation plugin is a kernel miscellaneous module whose
+ * purpose is to contain knowledge about how to transform packets to put them
+ * onto the wire and to take them off. An example of an encapsulation plugin is
+ * vxlan. It's also how support for things like nvgre or geneve would be brought
+ * into the system.
+ *
+ * Each encapsulation plugins defines a series of operation vectors and
+ * properties. For the full details on everything they should provide, please
+ * read uts/common/sys/overlay_plugin.h. The encapsulation plugin is responsible
+ * for telling the system what information is required to send a packet. For
+ * example, vxlan is defined to send everything over a UDP packet and therefore
+ * requires a port and an IP address, while nvgre on the other hand is its own
+ * IP type and therefore just requires an IP address. In addition, it also
+ * provides information about the kind of socket that should be created. This is
+ * used by the kernel multiplexor, more of that in the Kernel Components
+ * section.
+ *
+ * LOOKUPS
+ *
+ * The kernel communicates requests for lookups over the character device
+ * /dev/overlay. varpd is responsible for listening for requests on that device
+ * and answering them. The character device is specific to the target path and
+ * varpd.
+ *
+ * Much as the kernel overlay module handles the bulk of the scaffolding but
+ * leaves the important work to the encapsulation plugin, varpd provides a
+ * similar role and leaves the full brunt of lookups to a userland dynamic
+ * shared object which implements the logic of lookups.
+ *
+ * Each lookup plugin defines a series of operation vectors and properties. For
+ * the full details on everything that they should provide, please read
+ * lib/varpd/libvarpd/libvarpd_provider.h. Essentially, they are given a MAC
+ * address and asked to give an address on the physical network that it should
+ * be sent to. In addition, they handle questions related to how to handle
+ * things like broadcast and multicast traffic, etc.
+ *
+ * ----------
+ * Properties
+ * ----------
+ *
+ * A device from a dladm perspective has a unique set of properties that are
+ * combined from three different sources:
+ *
+ * 1) Generic properties that every overlay device has
+ * 2) Properties that are specific to the encapsulation plugin
+ * 3) Properties that are specific to the lookup plugin
+ *
+ * All of these are exposed in a single set of properties in dladm. Note that
+ * these are not necessarily traditional link properties. However, if something
+ * is both a traditional GLDv3 link property, say the MTU of a device, and a
+ * specific property here, than the driver ensures that all existing GLDv3
+ * specific means of manipulating it are used and wraps up its private property
+ * interfaces to ensure that works.
+ *
+ * Properties in the second and third category are prefixed with the name of
+ * their module. For example, the vxlan encapsulation module has a property
+ * called the 'listen_ip'. This property would show up in dladm as
+ * 'vxlan/listen_ip'. This allows different plugins to both use similar names
+ * for similar properties and to also have independent name spaces so that
+ * overlapping names do not conflict with anything else.
+ *
+ * While the kernel combines both sets one and two into a single coherent view,
+ * it does not do anything with respect to the properties that are owned by the
+ * lookup plugin -- those are owned wholly by varpd. Instead, libdladm is in
+ * charge of bridging these two worlds into one magical experience for the user.
+ * It carries the burden of knowing about both overlay specific and varpd
+ * specific properties. Importantly, we want to maintain this distinction. We
+ * don't want to treat the kernel as an arbitrary key/value store for varpd and
+ * we want the kernel to own its own data and not have to ask userland for
+ * information that it owns.
+ *
+ * Every property in the system has the following attributes:
+ *
+ * o A name
+ * o A type
+ * o A size
+ * o Permissions
+ * o Default value
+ * o Valid value ranges
+ * o A value
+ *
+ * Everything except for the value is obtained by callers through the propinfo
+ * callbacks and a property has a maximum size of OVERLAY_PROP_SIZEMAX,
+ * currently 256 bytes.
+ *
+ * The following are the supported types of properties:
+ *
+ * OVERLAY_PROP_T_INT
+ *
+ * A signed integer, its length is 8 bytes, corresponding to a
+ * int64_t.
+ *
+ * OVERLAY_PROP_T_UINT
+ *
+ * An unsigned integer, its length is 8 bytes, corresponding to a
+ * uint64_t.
+ *
+ * OVERLAY_PROP_T_IP
+ *
+ * A struct in6_addr, it has a fixed size.
+ *
+ * OVERLAY_PROP_T_STRING
+ *
+ * A null-terminated character string encoded in either ASCII or
+ * UTF-8. Note that the size of the string includes the null
+ * terminator.
+ *
+ * The next thing that we apply to a property is its permission. The permissions
+ * are put together by the bitwise or of the following flags and values.
+ *
+ * OVERLAY_PROP_PERM_REQ
+ *
+ * This indicates a required property. A property that is required
+ * must be set by a consumer before the device can be created. If a
+ * required property has a default property, this constraint is
+ * loosened because the default property defines the value.
+ *
+ * OVERLAY_PORP_PERM_READ
+ *
+ * This indicates that a property can be read. All properties will
+ * have this value set.
+ *
+ * OVERLAY_PROP_PERM_WRITE
+ *
+ * This indicates that a property can be written to and thus
+ * updated by userland. Properties that are only intended to
+ * display information, will not have OVERLAY_PROP_PERM_WRITE set.
+ *
+ * In addition, a few additional values are defined as a convenience to
+ * consumers. The first, OVERLAY_PROP_PERM_RW, is a combination of
+ * OVERLAY_PROP_PERM_READ and OVERLAY_PERM_PROP_WRITE. The second,
+ * OVERLAY_PROP_PERM_RRW, is a combination of OVERLAY_PROP_PERM_REQ,
+ * OVERLAY_PROP_PERM_READ, and OVERLAY_PROP_PERM_WRITE. The protection mode of a
+ * property should generally be a constant across its lifetime.
+ *
+ * A property may optionally have a default value. If it does have a default
+ * value, and that property is not set to be a different value, then the default
+ * value is inherited automatically. It also means that if the default value is
+ * acceptable, there is no need to set the value for a required property. For
+ * example, the vxlan module has the vxlan/listen_port property which is
+ * required, but has a default value of 4789 (the IANA assigned port). Because
+ * of that default value, there is no need for it to be set.
+ *
+ * Finally, a property may declare a list of valid values. These valid values
+ * are used for display purposes, they are not enforced by the broader system,
+ * but merely allow a means for the information to be communicated to the user
+ * through dladm(1M). Like a default value, this is optional.
+ *
+ * The general scaffolding does not do very much with respect to the getting and
+ * setting of properties. That is really owned by the individual plugins
+ * themselves.
+ *
+ * -----------------------------
+ * Destinations and Plugin Types
+ * -----------------------------
+ *
+ * Both encapsulation and lookup plugins define the kinds of destinations that
+ * they know how to support. There are three different pieces of information
+ * that can be used to address to a destination currently, all of which is
+ * summarized in the type overlay_point_t. Any combination of these is
+ * supported.
+ *
+ * OVERLAY_PLUGIN_D_ETHERNET
+ *
+ * An Ethernet MAC address is required.
+ *
+ * OVERLAY_PLUGIN_D_IP
+ *
+ * An IP address is required. All IP addresses used by the overlay
+ * system are transmitted as IPv6 addresses. IPv4 addresses can be
+ * represented by using IPv4-mapped IPv6 addresses.
+ *
+ * OVERLAY_PLUGIN_D_PORT
+ *
+ * A TCP/UDP port is required.
+ *
+ * A kernel encapsulation plugin declares which of these that it requires, it's
+ * a static set. On the other hand, a userland lookup plugin can be built to
+ * support all of these or any combination thereof. It gets passed the required
+ * destination type, based on the kernel encapsulation method, and then it makes
+ * the determination as to whether or not it supports it. For example, the
+ * direct plugin can support either an IP or both an IP and a port, it simply
+ * doesn't display the direct/dest_port property in the cases where a port is
+ * not required to support this.
+ *
+ * The user lookup plugins have two different modes of operation which
+ * determines how they interact with the broader system and how look ups are
+ * performed. These types are:
+ *
+ * OVERLAY_TARGET_POINT
+ *
+ * A point to point plugin has a single static definition for where
+ * to send all traffic. Every packet in the system always gets sent
+ * to the exact same destination which is programmed into the
+ * kernel when the general device is activated.
+ *
+ * OVERLAY_TARGET_DYNAMIC
+ *
+ * A dynamic plugin does not have a single static definition.
+ * Instead, for each destination, the kernel makes an asynchronous
+ * request to varpd to determine where the packet should be routed,
+ * and if a specific destination is found, then that destination is
+ * cached in the overlay device's target cache.
+ *
+ * This distinction, while important for the general overlay device's operation,
+ * is not important to the encapsulation plugins. They don't need to know about
+ * any of these pieces. It's just a concern for varpd, the userland plugin, and
+ * the general overlay scaffolding.
+ *
+ * When an overlay device is set to OVERLAY_TARGET_POINT, then it does not
+ * maintain a target cache, and instead just keeps track of the destination and
+ * always sends encapsulated packets to that address. When the target type is of
+ * OVERLAY_TARGET_DYNAMIC, then the kernel maintains a cache of all such
+ * destinations. These destinations are kept around in an instance of a
+ * reference hash that is specific to the given overlay device. Entries in the
+ * cache can be invalidated and replaced by varpd and its lookup plugins.
+ *
+ * ----------------------------------
+ * Kernel Components and Architecture
+ * ----------------------------------
+ *
+ * There are multiple pieces inside the kernel that work together, there is the
+ * general overlay_dev_t structure, which is the logical GLDv3 device, but it
+ * itself has references to things like an instance of an encapsulation plugin,
+ * a pointer to a mux and a target cache. It can roughly be summarized in the
+ * following image:
+ *
+ * +------------------+
+ * | global |
+ * | overlay list |
+ * | overlay_dev_list |
+ * +------------------+
+ * |
+ * | +-----------------------+ +---------------+
+ * +->| GLDv3 Device |----------->| GLDv3 Device | -> ...
+ * | overlay_dev_t | | overlay_dev_t |
+ * | | +---------------+
+ * | |
+ * | mac_handle_t -----+---> GLDv3 handle to MAC
+ * | datalink_id_t -----+---> Datalink ID used by DLS
+ * | overlay_dev_flag_t ---+---> Device state
+ * | uint_t -----+---> Curent device MTU
+ * | uint_t -----+---> In-progress RX operations
+ * | uint_t -----+---> In-progress TX operations
+ * | char[] -----+---> FMA degraded message
+ * | void * -----+---> plugin private data
+ * | overlay_target_t * ---+---------------------+
+ * | overlay_plugin_t * ---+---------+ |
+ * +-----------------------+ | |
+ * ^ | |
+ * +--------------------+ | | |
+ * | Kernel Socket | | | |
+ * | Multiplexor | | | |
+ * | overlay_mux_t | | | |
+ * | | | | |
+ * | avl_tree_t -+--+ | |
+ * | uint_t -+--> socket family | |
+ * | uint_t -+--> socket type | |
+ * | uint_t -+--> socket protocol | |
+ * | ksocket_t -+--> I/O socket | |
+ * | struct sockaddr * -+--> ksocket address | |
+ * | overlay_plugin_t --+--------+ | |
+ * +--------------------+ | | |
+ * | | |
+ * +-------------------------+ | | |
+ * | Encap Plugin |<--+-----------+ |
+ * | overlay_plugin_t | |
+ * | | |
+ * | char * ---+--> plugin name |
+ * | overlay_plugin_ops_t * -+--> plugin downcalls |
+ * | char ** (props) ---+--> property list |
+ * | uint_t ---+--> id length |
+ * | overlay_plugin_flags_t -+--> plugin flags |
+ * | overlay_plugin_dest_t --+--> destination type v
+ * +-------------------------+ +-------------------------+
+ * | Target Cache |
+ * | overlay_target_t |
+ * | |
+ * cache mode <--+- overlay_target_mode_t |
+ * dest type <--+- overlay_plugin_dest_t |
+ * cache flags <--+- overlay_target_flag_t |
+ * varpd id <--+- uint64_t |
+ * outstanding varpd reqs. <--+- uint_t |
+ * OVERLAY_TARGET_POINT state <--+- overlay_target_point_t |
+ * OVERLAY_TARGET_DYNAMIC state <-+---+- overlay_target_dyn_t |
+ * | +-------------------------+
+ * +-----------------------+
+ * |
+ * v
+ * +-------------------------------+ +------------------------+
+ * | Target Entry |-->| Target Entry |--> ...
+ * | overlay_target_entry_t | | overlay_target_entry_t |
+ * | | +------------------------+
+ * | |
+ * | overlay_target_entry_flags_t -+--> Entry flags
+ * | uint8_t[ETHERADDRL] ---+--> Target MAC address
+ * | overlay_target_point_t ---+--> Target underlay address
+ * | mblk_t * ---+--> outstanding mblk head
+ * | mblk_t * ---+--> outstanding mblk tail
+ * | size_t ---+--> outstanding mblk size
+ * +-------------------------------+
+ *
+ * The primary entries that we care about are the overlay_dev_t, which
+ * correspond to each overlay device that is created with dladm(1M). Globally,
+ * these devices are maintained in a simple list_t which is protected with a
+ * lock. Hence, these include important information such as the mac_handle_t
+ * and a datalink_id_t which is used to interact with the broader MAC and DLS
+ * ecosystem. We also maintain additional information such as the current state,
+ * outstanding operations, the mtu, and importantly, the plugin's private data.
+ * This is the instance of an encapsulation plugin that gets created as part of
+ * creating an overlay device. Another aspect of this is that the overlay_dev_t
+ * also includes information with respect to FMA. For more information, see the
+ * FMA section.
+ *
+ * Each overlay_dev_t has a pointer to a plugin, a mux, and a target. The plugin
+ * is the encapsulation plugin. This allows the device to make downcalls into it
+ * based on doing things like getting and setting properties. Otherwise, the
+ * plugin itself is a fairly straightforward entity. They are maintained in an
+ * (not pictured above) list. The plugins themselves mostly maintain things like
+ * the static list of properties, what kind of destination they require, and the
+ * operations vector. A given module may contain more if necessary.
+ *
+ * The next piece of the puzzle is the mux, or a multiplexor. The mux itself
+ * maintains a ksocket and it is through the mux that we send and receive
+ * message blocks. The mux represents a socket type and address, as well as a
+ * plugin. Multiple overlay_dev_t devices may then share the same mux. For
+ * example, consider the case where you have different instances of vxlan all on
+ * the same underlay network. These would all logically share the same IP
+ * address and port that packets are sent and received on; however, what differs
+ * is the decapuslation ID.
+ *
+ * Each mux maintains a ksocket_t which is similar to a socket(3SOCKET). Unlike
+ * a socket, we enable a direct callback on the ksocket. This means that
+ * whenever a message block chain is received, rather than sitting there and
+ * getting a callback in a context and kicking that back out to a taskq. Instead
+ * data comes into the callback function overlay_mux_recv().
+ *
+ * The mux is given encapsulated packets (via overlay_m_tx, the GLDv3 tx
+ * function) to transmit. It receives encapsulated packets, decapsulates them to
+ * determine the overlay identifier, looks up the given device that matches that
+ * identifier, and then causes the broader MAC world to receive the packet with
+ * a call to mac_rx().
+ *
+ * Today, we don't do too much that's special with the ksocket; however, as
+ * hardware is gaining understanding for these encapuslation protocols, we'll
+ * probably want to think of better ways to get those capabilities passed down
+ * and potentially better ways to program receive filters so they get directly
+ * to us. Though, that's all fantasy future land.
+ *
+ * The next part of the puzzle is the target cache. The purpose of the target
+ * cache is to cache where we should send a packet on the underlay network,
+ * given its mac address. The target cache operates in two modes depending on
+ * whether the lookup module was declared to OVERLAY_TARGET_POINT or
+ * OVERLAY_TARGET_DYANMIC.
+ *
+ * In the case where the target cache has been programmed to be
+ * OVERLAY_TARGET_POINT, then we only maintain a single overlay_target_point_t
+ * which has the destination that we send everything, no matter the destination
+ * mac address.
+ *
+ * On the other hand, when we have an instance of OVERLAY_TARGET_DYNAMIC, things
+ * are much more interesting and as a result, more complicated. We primarily
+ * store lists of overlay_target_entry_t's which are stored in both an avl tree
+ * and a refhash_t. The primary look up path uses the refhash_t and the avl tree
+ * is only used for a few of the target ioctls used to dump data such that we
+ * can get a consistent iteration order for things like dladm show-overlay -t.
+ * The key that we use for the reference hashtable is based on the mac address
+ * in the cache and currently we just do a simple CRC32 to transform it into a
+ * hash.
+ *
+ * Each entry maintains a set of flags to indicate the current status of the
+ * request. The flags may indicate one of three states: that current cache entry
+ * is valid, that the current cache entry has been directed to drop all output,
+ * and that the current cache entry is invalid and may be being looked up. In
+ * the case where it's valid, we just take the destination address and run with
+ * it.
+ *
+ * If it's invalid and a lookup has not been made, then we start the process
+ * that prepares a query that will make its way up to varpd. The cache entry
+ * entry maintains a message block chain of outstanding message blocks and a
+ * size. These lists are populated only when we don't know the answer as to
+ * where should these be sent. The size entry is used to cap the amount of
+ * outstanding data that we don't know the answer to. If we exceed a cap on the
+ * amount of outstanding data (currently 1 Mb), then we'll drop any additional
+ * packets. Once we get an answer indicating a valid destination, we transmit
+ * any outstanding data to that place. For the full story on how we look that up
+ * will be discussed in the section on the Target Cache Lifecycle.
+ *
+ * ------------------------
+ * FMA and Degraded Devices
+ * ------------------------
+ *
+ * Every kernel overlay device keeps track of its FMA state. Today in FMA we
+ * cannot represent partitions between resources nor can we represent that a
+ * given minor node of a psuedo device has failed -- if we degrade the overlay
+ * device, then the entire dev_info_t is degraded. However, we still want to be
+ * able to indicate to administrators that things may go wrong.
+ *
+ * To this end, we've added a notion of a degraded state to every overlay
+ * device. This state is primarily dictated by userland and it can happen for
+ * various reasons. Generally, because a userland lookup plugin has been
+ * partitioned, or something has gone wrong such that there is no longer any
+ * userland lookup module for a device, then we'll mark it degraded.
+ *
+ * As long as any of our minor instances is degraded, then we'll fire off the
+ * FMA event to note that. Once the last degraded instance is no longer
+ * degraded, then we'll end up telling FMA that we're all clean.
+ *
+ * To help administrators get a better sense of which of the various minor
+ * devices is wrong, we store the odd_fmamsg[] character array. This character
+ * array can be fetched with doing a dladm show-overlay -f.
+ *
+ * Note, that it's important that we do not update the link status of the
+ * devices. We want to remain up as much as possible. By changing the link in a
+ * degraded state, this may end up making things worse. We may still actually
+ * have information in the target cache and if we mark the link down, that'll
+ * result in not being able to use it. The reason being that this'll mark all
+ * the downstream VNICs down which will go to IP and from there we end up
+ * dealing with sadness.
+ *
+ * -----------------------
+ * Target Cache Life Cycle
+ * -----------------------
+ *
+ * This section only applies when we have a lookup plugin of
+ * OVERLAY_TARGET_DYNAMIC. None of this applies to those of type
+ * OVERLAY_TARGET_POINT.
+ *
+ * While we got into the target cache in the general architecture section, it's
+ * worth going into more details as to how this actually works and showing some
+ * examples and state machines. Recall that a target cache entry basically has
+ * the following state transition diagram:
+ *
+ * Initial state
+ * . . . . . . first access . . . varpd lookup enqueued
+ * . . .
+ * . . .
+ * +-------+ . +----------+ .
+ * | No |------*---->| Invalid |-------*----+
+ * | Entry | | Entry | |
+ * +-------+ +----------+ |
+ * varpd ^ ^ varpd |
+ * invalidate | | drop |
+ * . . . * * . . v
+ * +-------+ | | +---------+
+ * | Entry |--->-----+ +----<----| Entry |
+ * | Valid |<----------*---------<----| Pending |->-+ varpd
+ * +-------+ . +---------+ * . . drop, but
+ * . varpd ^ | other queued
+ * . success | | entries
+ * +-----+
+ *
+ * When the table is first created, it is empty. As we attempt to lookup entries
+ * and we find there is no entry at all, we'll create a new table entry for it.
+ * At that point the entry is technically in an invalid state, that means that
+ * we have no valid data from varpd. In that case, we'll go ahead and queue the
+ * packet into the entry's pending chain, and queue a varpd lookup, setting the
+ * OVERLAY_ENTRY_F_PENDING flag in the progress.
+ *
+ * If additional mblk_t's come in for this entry, we end up appending them to
+ * the tail of the chain, if and only if, we don't exceed the threshold for the
+ * amount of space they can take up. An entry remains pending until we get a
+ * varpd reply. If varpd replies with a valid results, we move to the valid
+ * entry state, and remove the OVERLAY_ENTRY_F_PENDING flag and set it with one
+ * of OVERLAY_ENTRY_F_VALID or OVERLAY_ENTRY_F_DROP as appropriate.
+ *
+ * Once an entry is valid, it stays valid until user land tells us to invalidate
+ * it with an ioctl or replace it, OVERLAY_TARG_CACHE_REMOE and
+ * OVERLAY_TARG_CACHE_SET respectively.
+ *
+ * If the lookup fails with a call to drop the packet, then the next state is
+ * determined by the state of the queue. If the set of outstanding entries is
+ * empty, then we just transition back to the invalid state. If instead, the
+ * set of outstanding entries is not empty, then we'll queue another entry and
+ * stay in the same state, repeating this until the number of requests is
+ * drained.
+ *
+ * The following images describes the flow of a given lookup and where the
+ * overlay_target_entry_t is at any given time.
+ *
+ * +-------------------+
+ * | Invalid Entry | An entry starts off as an invalid entry
+ * | de:ad:be:ef:00:00 | and only exists in the target cache.
+ * +-------------------+
+ *
+ * ~~~~
+ *
+ * +---------------------+
+ * | Global list_t | A mblk_t comes in for an entry. We
+ * | overlay_target_list | append it to the overlay_target_list.
+ * +---------------------+
+ * |
+ * v
+ * +-------------------+ +-------------------+
+ * | Pending Entry |----->| Pending Entry |--->...
+ * | 42:5e:1a:10:d6:2d | | de:ad:be:ef:00:00 |
+ * +-------------------+ +-------------------+
+ *
+ * ~~~~
+ *
+ * +--------------------------+
+ * | /dev/overlay minor state | User land said that it would look up an
+ * | overlay_target_hdl_t | entry for us. We remove it from the
+ * +--------------------------+ global list and add it to the handle's
+ * | outstanding list.
+ * |
+ * v
+ * +-------------------+ +-------------------+
+ * | Pending Entry |----->| Pending Entry |
+ * | 90:b8:d0:79:02:dd | | de:ad:be:ef:00:00 |
+ * +-------------------+ +-------------------+
+ *
+ * ~~~~
+ *
+ * +-------------------+
+ * | Valid Entry | varpd returned an answer with
+ * | de:ad:be:ef:00:00 | OVERLAY_IOC_RESPOND and the target cache
+ * | 10.169.23.42:4789 | entry is now populated with a
+ * +-------------------+ destination and marked as valid
+ *
+ *
+ * The lookup mechanism is performed via a series of operations on the character
+ * psuedo-device /dev/overlay. The only thing that uses this device is the
+ * userland daemon varpd. /dev/overlay is a cloneable device, each open of it
+ * granting a new minor number which maintains its own state. We maintain this
+ * state so that way if an outstanding lookup was queued to something that
+ * crashed or closed its handle without responding, we can know about this and
+ * thus handle it appropriately.
+ *
+ * When a lookup is first created it's added to our global list of outstanding
+ * lookups. To service requests, userland is required to perform an ioctl to ask
+ * for a request. We will block it in the kernel a set amount of time waiting
+ * for a request. When we give a request to a given minor instance of the
+ * device, we remove it from the global list and append the request to the
+ * device's list of outstanding entries, for the reasons we discussed above.
+ * When a lookup comes in, we give user land a smaller amount of information
+ * specific to that packet, the overlay_targ_lookup_t. It includes a request id
+ * to identify this, and then the overlay id, the varpd id, the header and
+ * packet size, the source and destination mac address, the SAP, and any
+ * potential VLAN header.
+ *
+ * At that point, it stays in that outstanding list until one of two ioctls are
+ * returned: OVERLAY_TARG_RESPOND or OVERLAY_TARG_DROP. During this time,
+ * userland may also perform other operations. For example, it may use
+ * OVERLAY_TARG_PKT to get a copy of this packet so it can perform more in-depth
+ * analysis of what to do beyond what we gave it initially. This is useful for
+ * providing proxy arp and the like. Finally, there are two other ioctls that
+ * varpd can then do. The first is OVERLAY_TARG_INJECT which injects the
+ * non-jumbo frame packet up into that mac device and OVERLAY_TARG_RESEND which
+ * causes us to encapsulate and send out the packet they've given us.
+ *
+ *
+ * Finally, through the target cache, several ioctls are provided to allow for
+ * interrogation and management of the cache. They allow for individual entries
+ * to be retrieved, set, or have the entire table flushed. For the full set of
+ * ioctls here and what they do, take a look at uts/common/sys/overlay_target.h.
+ *
+ * ------------------
+ * Sample Packet Flow
+ * ------------------
+ *
+ * There's a lot of pieces here, hopefully an example of how this all fits
+ * together will help clarify and elucidate what's going on. We're going to
+ * first track an outgoing packet, eg. one that is sent from an IP interface on
+ * a VNIC on top of an overlay device, and then we'll look at what it means to
+ * respond to that.
+ *
+ *
+ * +----------------+ +--------------+ +------------------+
+ * | IP/DLS send |------->| MAC sends it |----------->| mblk_t reaches |
+ * | packet to MAC | | to the GLDv3 | | overlay GLDv3 tx |
+ * +----------------+ | VNIC device | | overlay_m_tx() |
+ * +--------------+ +------------------+
+ * |
+ * . lookup . cache |
+ * . drop . miss v
+ * +---------+ . +--------+ . +------------------+
+ * | freemsg |<-----*-------| varpd |<---*------| Lookup each mblk |
+ * | mblk_t | | lookup | | in the target |
+ * +---------+ | queued | | cache |
+ * ^ +--------+ +------------------+
+ * on send | | | cache
+ * error . . * *. . lookup * . . hit
+ * | | success v
+ * | | +------------------+
+ * +-----------------+ +--------------->| call plugin |
+ * | Send out | | ovpo_encap() to |
+ * | overlay_mux_t's |<----------------------------------| get encap mblk_t |
+ * | ksocket | +------------------+
+ * +-----------------+
+ *
+ * The receive end point looks a little different and looks more like:
+ *
+ * +------------------+ +----------------+ +-----------+
+ * | mblk_t comes off |---->| enter netstack |--->| delivered |---+
+ * | the physical | | IP stack | | to | * . . direct
+ * | device | +----------------+ | ksocket | | callback
+ * +------------------+ +-----------+ |
+ * . overlay id |
+ * . not found v
+ * +-----------+ . +-----------------+ +--------------------+
+ * | freemsg |<--*------| call plugin |<------| overlay_mux_recv() |
+ * | mblk_t | | ovpo_decap() to | +--------------------+
+ * +-----------+ | decap mblk_t |
+ * +-----------------+
+ * |
+ * * . . overlay id
+ * v found
+ * +--------+ +----------------+
+ * | adjust |----->| call mac_rx |
+ * | mblk_t | | on original |
+ * +--------+ | decaped packet |
+ * +----------------+
+ *
+ * ------------------
+ * Netstack Awareness
+ * ------------------
+ *
+ * In the above image we note that this enters a netstack. Today the only
+ * netstack that can be is the global zone as the overlay driver itself is not
+ * exactly netstack aware. What this really means is that varpd cannot run in a
+ * non-global zone and an overlay device cannot belong to a non-global zone.
+ * Non-global zones can still have a VNIC assigned to them that's been created
+ * over the overlay device the same way they would if it had been created over
+ * an etherstub or a physical device.
+ *
+ * The majority of the work to make it netstack aware is straightforward and the
+ * biggest thing is to create a netstack module that allows us to hook into
+ * netstack (and thus zone) creation and destruction. From there, we need to
+ * amend the target cache lookup routines that we discussed earlier to not have
+ * a global outstanding list and a global list of handles, but rather, one per
+ * netstack.
+ *
+ * For the mux, we'll need to open the ksocket in the context of the zone, we
+ * can likely do this with a properly composed credential, but we'll need to do
+ * some more work on that path. Finally, we'll want to make sure the dld ioctls
+ * are aware of the zoneid of the caller and we use that appropriately and store
+ * it in the overlay_dev_t.
+ *
+ * -----------
+ * GLDv3 Notes
+ * -----------
+ *
+ * The overlay driver implements a GLDv3 device. Parts of GLDv3 are more
+ * relevant and other parts are much less relevant for us. For example, the
+ * GLDv3 is used to toggle the device being put into and out of promiscuous
+ * mode, to program MAC addresses for unicast and multicast hardware filters.
+ * Today, an overlay device doesn't have a notion of promiscuous mode nor does
+ * it have a notion of unicast and multicast addresses programmed into the
+ * device. Instead, for the purposes of the hardware filter, we don't do
+ * anything and just always accept new addresses being added and removed.
+ *
+ * If the GLDv3 start function has not been called, then we will not use this
+ * device for I/O purposes. Any calls to transmit or receive should be dropped,
+ * though the GLDv3 guarantees us that transmit will not be called without
+ * calling start. Similarly, once stop is called, then no packets can be dealt
+ * with.
+ *
+ * Today we don't support the stat interfaces, though there's no good reason
+ * that we shouldn't assemble some of the stats based on what we have in the
+ * future.
+ *
+ * When it comes to link properties, many of the traditional link properties do
+ * not apply and many others MAC handles for us. For example, we don't need to
+ * implement anything for overlay_m_getprop() to deal with returning the MTU, as
+ * MAC never calls into us for that. As such, there isn't much of anything to
+ * support in terms of properties.
+ *
+ * Today, we don't support any notion of hardware capabilities. However, if
+ * future NIC hardware or other changes to the system cause it to make sense for
+ * us to emulate logical groups, then we should do that. However, we still do
+ * implement a capab function so that we can identify ourselves as an overlay
+ * device to the broader MAC framework. This is done mostly so that a device
+ * created on top of us can have fanout rings as we don't try to lie about a
+ * speed for our device.
+ *
+ * The other question is what should be done for a device's MTU and margin. We
+ * set our minimum supported MTU to be the minimum value that an IP network may
+ * be set to 576 -- which mimics what an etherstub does. On the flip side, we
+ * have our upper bound set to 8900. This value comes from the fact that a lot
+ * of jumbo networks use their maximum as 9000. As such, we want to reserve 100
+ * bytes, which isn't exactly the most accurate number, but it'll be good enough
+ * for now. Because of that, our default MTU off of these devices is 1400, as
+ * the default MTU for everything is usually 1500 or whatever the underlying
+ * device is at; however, this is a bit simpler than asking the netstack what
+ * are all the IP interfaces at. It also calls into question how PMTU and PMTU
+ * discovery should work here. The challenge, especially for
+ * OVERLAY_TARG_DYNAMIC is that the MTU to any of the places will vary and it's
+ * not clear that if you have a single bad entry that the overall MTU should be
+ * lowered. Instead, we should figure out a better way of determining these
+ * kinds of PMTU errors and appropriately alerting the administrator via FMA.
+ *
+ * Regarding margin, we allow a margin of up to VLAN_TAGSZ depending on whether
+ * or not the underlying encapsulation device supports VLAN tags. If it does,
+ * then we'll set the margin to allow for it, otherwise, we will not.
+ */
+
+#include <sys/conf.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/modctl.h>
+#include <sys/policy.h>
+#include <sys/stream.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/ddifm.h>
+
+#include <sys/dls.h>
+#include <sys/dld_ioc.h>
+#include <sys/mac_provider.h>
+#include <sys/mac_client_priv.h>
+#include <sys/mac_ether.h>
+#include <sys/vlan.h>
+
+#include <sys/overlay_impl.h>
+
+dev_info_t *overlay_dip;
+static kmutex_t overlay_dev_lock;
+static list_t overlay_dev_list;
+static uint8_t overlay_macaddr[ETHERADDRL] =
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
+
+typedef enum overlay_dev_prop {
+ OVERLAY_DEV_P_MTU = 0,
+ OVERLAY_DEV_P_VNETID,
+ OVERLAY_DEV_P_ENCAP,
+ OVERLAY_DEV_P_VARPDID
+} overlay_dev_prop_t;
+
+#define OVERLAY_DEV_NPROPS 4
+static const char *overlay_dev_props[] = {
+ "mtu",
+ "vnetid",
+ "encap",
+ "varpd/id"
+};
+
+#define OVERLAY_MTU_MIN 576
+#define OVERLAY_MTU_DEF 1400
+#define OVERLAY_MTU_MAX 8900
+
+overlay_dev_t *
+overlay_hold_by_dlid(datalink_id_t id)
+{
+ overlay_dev_t *o;
+
+ mutex_enter(&overlay_dev_lock);
+ for (o = list_head(&overlay_dev_list); o != NULL;
+ o = list_next(&overlay_dev_list, o)) {
+ if (id == o->odd_linkid) {
+ mutex_enter(&o->odd_lock);
+ o->odd_ref++;
+ mutex_exit(&o->odd_lock);
+ mutex_exit(&overlay_dev_lock);
+ return (o);
+ }
+ }
+
+ mutex_exit(&overlay_dev_lock);
+ return (NULL);
+}
+
+void
+overlay_hold_rele(overlay_dev_t *odd)
+{
+ mutex_enter(&odd->odd_lock);
+ ASSERT(odd->odd_ref > 0);
+ odd->odd_ref--;
+ mutex_exit(&odd->odd_lock);
+}
+
+void
+overlay_io_start(overlay_dev_t *odd, overlay_dev_flag_t flag)
+{
+ ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX);
+ ASSERT(MUTEX_HELD(&odd->odd_lock));
+
+ if (flag & OVERLAY_F_IN_RX)
+ odd->odd_rxcount++;
+ if (flag & OVERLAY_F_IN_TX)
+ odd->odd_txcount++;
+ odd->odd_flags |= flag;
+}
+
+void
+overlay_io_done(overlay_dev_t *odd, overlay_dev_flag_t flag)
+{
+ boolean_t signal = B_FALSE;
+
+ ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX);
+ ASSERT(MUTEX_HELD(&odd->odd_lock));
+
+ if (flag & OVERLAY_F_IN_RX) {
+ ASSERT(odd->odd_rxcount > 0);
+ odd->odd_rxcount--;
+ if (odd->odd_rxcount == 0) {
+ signal = B_TRUE;
+ odd->odd_flags &= ~OVERLAY_F_IN_RX;
+ }
+ }
+ if (flag & OVERLAY_F_IN_TX) {
+ ASSERT(odd->odd_txcount > 0);
+ odd->odd_txcount--;
+ if (odd->odd_txcount == 0) {
+ signal = B_TRUE;
+ odd->odd_flags &= ~OVERLAY_F_IN_TX;
+ }
+ }
+
+ if (signal == B_TRUE)
+ cv_broadcast(&odd->odd_iowait);
+}
+
+static void
+overlay_io_wait(overlay_dev_t *odd, overlay_dev_flag_t flag)
+{
+ ASSERT((flag & ~OVERLAY_F_IOMASK) == 0);
+ ASSERT(MUTEX_HELD(&odd->odd_lock));
+
+ while (odd->odd_flags & flag) {
+ cv_wait(&odd->odd_iowait, &odd->odd_lock);
+ }
+}
+
+void
+overlay_dev_iter(overlay_dev_iter_f func, void *arg)
+{
+ overlay_dev_t *odd;
+
+ mutex_enter(&overlay_dev_lock);
+ for (odd = list_head(&overlay_dev_list); odd != NULL;
+ odd = list_next(&overlay_dev_list, odd)) {
+ if (func(odd, arg) != 0) {
+ mutex_exit(&overlay_dev_lock);
+ return;
+ }
+ }
+ mutex_exit(&overlay_dev_lock);
+}
+
+/* ARGSUSED */
+static int
+overlay_m_stat(void *arg, uint_t stat, uint64_t *val)
+{
+ return (ENOTSUP);
+}
+
+static int
+overlay_m_start(void *arg)
+{
+ overlay_dev_t *odd = arg;
+ overlay_mux_t *mux;
+ int ret, domain, family, prot;
+ struct sockaddr_storage storage;
+ socklen_t slen;
+
+ mutex_enter(&odd->odd_lock);
+ if ((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0) {
+ mutex_exit(&odd->odd_lock);
+ return (EAGAIN);
+ }
+ mutex_exit(&odd->odd_lock);
+
+ ret = odd->odd_plugin->ovp_ops->ovpo_socket(odd->odd_pvoid, &domain,
+ &family, &prot, (struct sockaddr *)&storage, &slen);
+ if (ret != 0)
+ return (ret);
+
+ mux = overlay_mux_open(odd->odd_plugin, domain, family, prot,
+ (struct sockaddr *)&storage, slen, &ret);
+ if (mux == NULL)
+ return (ret);
+
+ overlay_mux_add_dev(mux, odd);
+ odd->odd_mux = mux;
+ mutex_enter(&odd->odd_lock);
+ ASSERT(!(odd->odd_flags & OVERLAY_F_IN_MUX));
+ odd->odd_flags |= OVERLAY_F_IN_MUX;
+ mutex_exit(&odd->odd_lock);
+
+ return (0);
+}
+
+static void
+overlay_m_stop(void *arg)
+{
+ overlay_dev_t *odd = arg;
+
+ /*
+ * The MAC Perimeter is held here, so we don't have to worry about
+ * synchornizing this with respect to metadata operations.
+ */
+ mutex_enter(&odd->odd_lock);
+ VERIFY(odd->odd_flags & OVERLAY_F_IN_MUX);
+ VERIFY(!(odd->odd_flags & OVERLAY_F_MDDROP));
+ odd->odd_flags |= OVERLAY_F_MDDROP;
+ overlay_io_wait(odd, OVERLAY_F_IOMASK);
+ mutex_exit(&odd->odd_lock);
+
+ overlay_mux_remove_dev(odd->odd_mux, odd);
+ overlay_mux_close(odd->odd_mux);
+ odd->odd_mux = NULL;
+
+ mutex_enter(&odd->odd_lock);
+ odd->odd_flags &= ~OVERLAY_F_IN_MUX;
+ odd->odd_flags &= ~OVERLAY_F_MDDROP;
+ VERIFY((odd->odd_flags & OVERLAY_F_STOPMASK) == 0);
+ mutex_exit(&odd->odd_lock);
+}
+
+/*
+ * For more info on this, see the big theory statement.
+ */
+/* ARGSUSED */
+static int
+overlay_m_promisc(void *arg, boolean_t on)
+{
+ return (0);
+}
+
+/*
+ * For more info on this, see the big theory statement.
+ */
+/* ARGSUSED */
+static int
+overlay_m_multicast(void *arg, boolean_t add, const uint8_t *addrp)
+{
+ return (0);
+}
+
+/*
+ * For more info on this, see the big theory statement.
+ */
+/* ARGSUSED */
+static int
+overlay_m_unicast(void *arg, const uint8_t *macaddr)
+{
+ return (0);
+}
+
+mblk_t *
+overlay_m_tx(void *arg, mblk_t *mp_chain)
+{
+ overlay_dev_t *odd = arg;
+ mblk_t *mp, *ep;
+ int ret;
+ ovep_encap_info_t einfo;
+ struct msghdr hdr;
+
+ mutex_enter(&odd->odd_lock);
+ if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
+ !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
+ mutex_exit(&odd->odd_lock);
+ freemsgchain(mp_chain);
+ return (NULL);
+ }
+ overlay_io_start(odd, OVERLAY_F_IN_TX);
+ mutex_exit(&odd->odd_lock);
+
+ bzero(&hdr, sizeof (struct msghdr));
+
+ bzero(&einfo, sizeof (ovep_encap_info_t));
+ einfo.ovdi_id = odd->odd_vid;
+ mp = mp_chain;
+ while (mp != NULL) {
+ socklen_t slen;
+ struct sockaddr_storage storage;
+
+ mp_chain = mp->b_next;
+ mp->b_next = NULL;
+ ep = NULL;
+
+ ret = overlay_target_lookup(odd, mp,
+ (struct sockaddr *)&storage, &slen);
+ if (ret != OVERLAY_TARGET_OK) {
+ if (ret == OVERLAY_TARGET_DROP)
+ freemsg(mp);
+ mp = mp_chain;
+ continue;
+ }
+
+ hdr.msg_name = &storage;
+ hdr.msg_namelen = slen;
+
+ ret = odd->odd_plugin->ovp_ops->ovpo_encap(odd->odd_mh, mp,
+ &einfo, &ep);
+ if (ret != 0 || ep == NULL) {
+ freemsg(mp);
+ goto out;
+ }
+
+ ASSERT(ep->b_cont == mp || ep == mp);
+ ret = overlay_mux_tx(odd->odd_mux, &hdr, ep);
+ if (ret != 0)
+ goto out;
+
+ mp = mp_chain;
+ }
+
+out:
+ mutex_enter(&odd->odd_lock);
+ overlay_io_done(odd, OVERLAY_F_IN_TX);
+ mutex_exit(&odd->odd_lock);
+ freemsgchain(mp_chain);
+ return (NULL);
+}
+
+/* ARGSUSED */
+static void
+overlay_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
+{
+ miocnak(q, mp, 0, ENOTSUP);
+}
+
+/* ARGSUSED */
+static boolean_t
+overlay_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
+{
+ /*
+ * Tell MAC we're an overlay.
+ */
+ if (cap == MAC_CAPAB_OVERLAY)
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
+/* ARGSUSED */
+static int
+overlay_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
+ uint_t pr_valsize, const void *pr_val)
+{
+ uint32_t mtu, old;
+ int err;
+ overlay_dev_t *odd = arg;
+
+ if (pr_num != MAC_PROP_MTU)
+ return (ENOTSUP);
+
+ bcopy(pr_val, &mtu, sizeof (mtu));
+ if (mtu < OVERLAY_MTU_MIN || mtu > OVERLAY_MTU_MAX)
+ return (EINVAL);
+
+ mutex_enter(&odd->odd_lock);
+ old = odd->odd_mtu;
+ odd->odd_mtu = mtu;
+ err = mac_maxsdu_update(odd->odd_mh, mtu);
+ if (err != 0)
+ odd->odd_mtu = old;
+ mutex_exit(&odd->odd_lock);
+
+ return (err);
+}
+
+/* ARGSUSED */
+static int
+overlay_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
+ uint_t pr_valsize, void *pr_val)
+{
+ return (ENOTSUP);
+}
+
+/* ARGSUSED */
+static void
+overlay_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
+ mac_prop_info_handle_t prh)
+{
+ if (pr_num != MAC_PROP_MTU)
+ return;
+
+ mac_prop_info_set_default_uint32(prh, OVERLAY_MTU_DEF);
+ mac_prop_info_set_range_uint32(prh, OVERLAY_MTU_MIN, OVERLAY_MTU_MAX);
+}
+
+static mac_callbacks_t overlay_m_callbacks = {
+ .mc_callbacks = (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP |
+ MC_PROPINFO),
+ .mc_getstat = overlay_m_stat,
+ .mc_start = overlay_m_start,
+ .mc_stop = overlay_m_stop,
+ .mc_setpromisc = overlay_m_promisc,
+ .mc_multicst = overlay_m_multicast,
+ .mc_unicst = overlay_m_unicast,
+ .mc_tx = overlay_m_tx,
+ .mc_ioctl = overlay_m_ioctl,
+ .mc_getcapab = overlay_m_getcapab,
+ .mc_getprop = overlay_m_getprop,
+ .mc_setprop = overlay_m_setprop,
+ .mc_propinfo = overlay_m_propinfo
+};
+
+static boolean_t
+overlay_valid_name(const char *name, size_t buflen)
+{
+ size_t actlen;
+ int err, i;
+
+ for (i = 0; i < buflen; i++) {
+ if (name[i] == '\0')
+ break;
+ }
+
+ if (i == 0 || i == buflen)
+ return (B_FALSE);
+ actlen = i;
+ if (strchr(name, '/') != NULL)
+ return (B_FALSE);
+ if (u8_validate((char *)name, actlen, NULL,
+ U8_VALIDATE_ENTIRE, &err) < 0)
+ return (B_FALSE);
+ return (B_TRUE);
+}
+
+/* ARGSUSED */
+static int
+overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
+{
+ int err;
+ uint64_t maxid;
+ overlay_dev_t *odd, *o;
+ mac_register_t *mac;
+ overlay_ioc_create_t *oicp = karg;
+
+ if (overlay_valid_name(oicp->oic_encap, MAXLINKNAMELEN) == B_FALSE)
+ return (EINVAL);
+
+ odd = kmem_zalloc(sizeof (overlay_dev_t), KM_SLEEP);
+ odd->odd_linkid = oicp->oic_linkid;
+ odd->odd_plugin = overlay_plugin_lookup(oicp->oic_encap);
+ if (odd->odd_plugin == NULL) {
+ kmem_free(odd, sizeof (overlay_dev_t));
+ return (ENOENT);
+ }
+ err = odd->odd_plugin->ovp_ops->ovpo_init((overlay_handle_t)odd,
+ &odd->odd_pvoid);
+ if (err != 0) {
+ odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+ overlay_plugin_rele(odd->odd_plugin);
+ kmem_free(odd, sizeof (overlay_dev_t));
+ return (EINVAL);
+ }
+
+ /*
+ * Make sure that our virtual network id is valid for the given plugin
+ * that we're working with.
+ */
+ ASSERT(odd->odd_plugin->ovp_id_size <= 8);
+ maxid = UINT64_MAX;
+ if (odd->odd_plugin->ovp_id_size != 8)
+ maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - 1ULL;
+ if (oicp->oic_vnetid > maxid) {
+ odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+ overlay_plugin_rele(odd->odd_plugin);
+ kmem_free(odd, sizeof (overlay_dev_t));
+ return (EINVAL);
+ }
+ odd->odd_vid = oicp->oic_vnetid;
+
+ mac = mac_alloc(MAC_VERSION);
+ if (mac == NULL) {
+ mutex_exit(&overlay_dev_lock);
+ odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+ overlay_plugin_rele(odd->odd_plugin);
+ kmem_free(odd, sizeof (overlay_dev_t));
+ return (EINVAL);
+ }
+
+ mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
+ mac->m_driver = odd;
+ mac->m_dip = overlay_dip;
+ mac->m_dst_addr = NULL;
+ mac->m_callbacks = &overlay_m_callbacks;
+ mac->m_pdata = NULL;
+ mac->m_pdata_size = 0;
+
+ mac->m_priv_props = NULL;
+
+ /* Let mac handle this itself. */
+ mac->m_instance = (uint_t)-1;
+
+ /*
+ * There is no real source address that should be used here, but saying
+ * that we're not ethernet is going to cause its own problems. At the
+ * end of the say, this is fine.
+ */
+ mac->m_src_addr = overlay_macaddr;
+
+ /*
+ * Start with the default MTU as the max SDU. If the MTU is changed, the
+ * SDU will be changed to reflect that.
+ */
+ mac->m_min_sdu = 1;
+ mac->m_max_sdu = OVERLAY_MTU_DEF;
+ mac->m_multicast_sdu = 0;
+
+ /*
+ * The underlying device doesn't matter, instead this comes from the
+ * encapsulation protocol and whether or not they allow VLAN tags.
+ */
+ if (odd->odd_plugin->ovp_flags & OVEP_F_VLAN_TAG) {
+ mac->m_margin = VLAN_TAGSZ;
+ } else {
+ mac->m_margin = 0;
+ }
+
+ /*
+ * Today, we have no MAC virtualization, it may make sense in the future
+ * to go ahead and emulate some subset of this, but it doesn't today.
+ */
+ mac->m_v12n = MAC_VIRT_NONE;
+
+ mutex_enter(&overlay_dev_lock);
+ for (o = list_head(&overlay_dev_list); o != NULL;
+ o = list_next(&overlay_dev_list, o)) {
+ if (o->odd_linkid == oicp->oic_linkid) {
+ mutex_exit(&overlay_dev_lock);
+ odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+ overlay_plugin_rele(odd->odd_plugin);
+ kmem_free(odd, sizeof (overlay_dev_t));
+ return (EEXIST);
+ }
+
+ if (o->odd_vid == oicp->oic_vnetid &&
+ o->odd_plugin == odd->odd_plugin) {
+ mutex_exit(&overlay_dev_lock);
+ odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+ overlay_plugin_rele(odd->odd_plugin);
+ kmem_free(odd, sizeof (overlay_dev_t));
+ return (EEXIST);
+ }
+ }
+
+ err = mac_register(mac, &odd->odd_mh);
+ mac_free(mac);
+ if (err != 0) {
+ mutex_exit(&overlay_dev_lock);
+ odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+ overlay_plugin_rele(odd->odd_plugin);
+ kmem_free(odd, sizeof (overlay_dev_t));
+ return (err);
+ }
+
+ err = dls_devnet_create(odd->odd_mh, odd->odd_linkid,
+ crgetzoneid(cred));
+ if (err != 0) {
+ mutex_exit(&overlay_dev_lock);
+ (void) mac_unregister(odd->odd_mh);
+ odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+ overlay_plugin_rele(odd->odd_plugin);
+ kmem_free(odd, sizeof (overlay_dev_t));
+ return (err);
+ }
+
+ mutex_init(&odd->odd_lock, NULL, MUTEX_DRIVER, NULL);
+ cv_init(&odd->odd_iowait, NULL, CV_DRIVER, NULL);
+ odd->odd_ref = 0;
+ odd->odd_flags = 0;
+ list_insert_tail(&overlay_dev_list, odd);
+ mutex_exit(&overlay_dev_lock);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_i_activate(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
+{
+ int i, ret;
+ overlay_dev_t *odd;
+ mac_perim_handle_t mph;
+ overlay_ioc_activate_t *oiap = karg;
+ overlay_ioc_propinfo_t *infop;
+ overlay_ioc_prop_t *oip;
+ overlay_prop_handle_t phdl;
+
+ odd = overlay_hold_by_dlid(oiap->oia_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ infop = kmem_alloc(sizeof (overlay_ioc_propinfo_t), KM_SLEEP);
+ oip = kmem_alloc(sizeof (overlay_ioc_prop_t), KM_SLEEP);
+ phdl = (overlay_prop_handle_t)infop;
+
+ mac_perim_enter_by_mh(odd->odd_mh, &mph);
+ mutex_enter(&odd->odd_lock);
+ if (odd->odd_flags & OVERLAY_F_ACTIVATED) {
+ mutex_exit(&odd->odd_lock);
+ mac_perim_exit(mph);
+ overlay_hold_rele(odd);
+ kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
+ kmem_free(oip, sizeof (overlay_ioc_prop_t));
+ return (EEXIST);
+ }
+ mutex_exit(&odd->odd_lock);
+
+ for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) {
+ const char *pname = odd->odd_plugin->ovp_props[i];
+ bzero(infop, sizeof (overlay_ioc_propinfo_t));
+ overlay_prop_init(phdl);
+ ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(pname, phdl);
+ if (ret != 0) {
+ mac_perim_exit(mph);
+ overlay_hold_rele(odd);
+ kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
+ kmem_free(oip, sizeof (overlay_ioc_prop_t));
+ return (ret);
+ }
+
+ if ((infop->oipi_prot & OVERLAY_PROP_PERM_REQ) == 0)
+ continue;
+ bzero(oip, sizeof (overlay_ioc_prop_t));
+ oip->oip_size = sizeof (oip->oip_value);
+ ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid,
+ pname, oip->oip_value, &oip->oip_size);
+ if (ret != 0) {
+ mac_perim_exit(mph);
+ overlay_hold_rele(odd);
+ kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
+ kmem_free(oip, sizeof (overlay_ioc_prop_t));
+ return (ret);
+ }
+ if (oip->oip_size == 0) {
+ mac_perim_exit(mph);
+ overlay_hold_rele(odd);
+ kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
+ kmem_free(oip, sizeof (overlay_ioc_prop_t));
+ return (EINVAL);
+ }
+ }
+
+ mutex_enter(&odd->odd_lock);
+ if ((odd->odd_flags & OVERLAY_F_VARPD) == 0) {
+ mutex_exit(&odd->odd_lock);
+ mac_perim_exit(mph);
+ overlay_hold_rele(odd);
+ kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
+ kmem_free(oip, sizeof (overlay_ioc_prop_t));
+ return (ENXIO);
+ }
+
+ ASSERT((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0);
+ odd->odd_flags |= OVERLAY_F_ACTIVATED;
+
+ /*
+ * Now that we've activated ourselves, we should indicate to the world
+ * that we're up. Note that we may not be able to perform lookups at
+ * this time, but our notion of being 'up' isn't dependent on that
+ * ability.
+ */
+ mac_link_update(odd->odd_mh, LINK_STATE_UP);
+ mutex_exit(&odd->odd_lock);
+
+ mac_perim_exit(mph);
+ overlay_hold_rele(odd);
+ kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
+ kmem_free(oip, sizeof (overlay_ioc_prop_t));
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_i_delete(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
+{
+ overlay_ioc_delete_t *oidp = karg;
+ overlay_dev_t *odd;
+ datalink_id_t tid;
+ int ret;
+
+ odd = overlay_hold_by_dlid(oidp->oid_linkid);
+ if (odd == NULL) {
+ return (ENOENT);
+ }
+
+ mutex_enter(&odd->odd_lock);
+ /* If we're not the only hold, we're busy */
+ if (odd->odd_ref != 1) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (EBUSY);
+ }
+
+ if (odd->odd_flags & OVERLAY_F_IN_MUX) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (EBUSY);
+ }
+
+ /*
+ * To remove this, we need to first remove it from dls and then remove
+ * it from mac. The act of removing it from mac will check if there are
+ * devices on top of this, eg. vnics. If there are, then that will fail
+ * and we'll have to go through and recreate the dls entry. Only after
+ * mac_unregister has succeeded, then we'll go through and actually free
+ * everything and drop the dev lock.
+ */
+ ret = dls_devnet_destroy(odd->odd_mh, &tid, B_TRUE);
+ if (ret != 0) {
+ overlay_hold_rele(odd);
+ return (ret);
+ }
+
+ ASSERT(oidp->oid_linkid == tid);
+ ret = mac_disable(odd->odd_mh);
+ if (ret != 0) {
+ (void) dls_devnet_create(odd->odd_mh, odd->odd_linkid,
+ crgetzoneid(cred));
+ overlay_hold_rele(odd);
+ return (ret);
+ }
+
+ overlay_target_quiesce(odd->odd_target);
+
+ mutex_enter(&overlay_dev_lock);
+ list_remove(&overlay_dev_list, odd);
+ mutex_exit(&overlay_dev_lock);
+
+ cv_destroy(&odd->odd_iowait);
+ mutex_destroy(&odd->odd_lock);
+ overlay_target_free(odd);
+ odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+ overlay_plugin_rele(odd->odd_plugin);
+ kmem_free(odd, sizeof (overlay_dev_t));
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_i_nprops(void *karg, intptr_t arg, int mode, cred_t *cred,
+ int *rvalp)
+{
+ overlay_dev_t *odd;
+ overlay_ioc_nprops_t *on = karg;
+
+ odd = overlay_hold_by_dlid(on->oipn_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+ on->oipn_nprops = odd->odd_plugin->ovp_nprops + OVERLAY_DEV_NPROPS;
+ overlay_hold_rele(odd);
+
+ return (0);
+}
+
+static int
+overlay_propinfo_plugin_cb(overlay_plugin_t *opp, void *arg)
+{
+ overlay_prop_handle_t phdl = arg;
+ overlay_prop_set_range_str(phdl, opp->ovp_name);
+ return (0);
+}
+
+static int
+overlay_i_name_to_propid(overlay_dev_t *odd, const char *name, uint_t *id)
+{
+ int i;
+
+ for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
+ if (strcmp(overlay_dev_props[i], name) == 0) {
+ *id = i;
+ return (0);
+ }
+ }
+
+ for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) {
+ if (strcmp(odd->odd_plugin->ovp_props[i], name) == 0) {
+ *id = i + OVERLAY_DEV_NPROPS;
+ return (0);
+ }
+ }
+
+ return (ENOENT);
+}
+
+static void
+overlay_i_propinfo_mtu(overlay_dev_t *odd, overlay_prop_handle_t phdl)
+{
+ uint32_t def;
+ mac_propval_range_t range;
+ uint_t perm;
+
+ ASSERT(MAC_PERIM_HELD(odd->odd_mh));
+
+ bzero(&range, sizeof (mac_propval_range_t));
+ range.mpr_count = 1;
+ if (mac_prop_info(odd->odd_mh, MAC_PROP_MTU, "mtu", &def,
+ sizeof (def), &range, &perm) != 0)
+ return;
+
+ if (perm == MAC_PROP_PERM_READ)
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
+ else if (perm == MAC_PROP_PERM_WRITE)
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_WRITE);
+ else if (perm == MAC_PROP_PERM_RW)
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
+
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+ overlay_prop_set_default(phdl, &def, sizeof (def));
+ overlay_prop_set_range_uint32(phdl, range.mpr_range_uint32[0].mpur_min,
+ range.mpr_range_uint32[0].mpur_max);
+}
+
+/* ARGSUSED */
+static int
+overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred,
+ int *rvalp)
+{
+ overlay_dev_t *odd;
+ int ret;
+ mac_perim_handle_t mph;
+ uint_t propid = UINT_MAX;
+ overlay_ioc_propinfo_t *oip = karg;
+ overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip;
+
+ odd = overlay_hold_by_dlid(oip->oipi_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ overlay_prop_init(phdl);
+ mac_perim_enter_by_mh(odd->odd_mh, &mph);
+
+ /*
+ * If the id is -1, then the property that we're looking for is named in
+ * oipi_name and we should fill in its id. Otherwise, we've been given
+ * an id and we need to turn that into a name for our plugin's sake. The
+ * id is our own fabrication for property discovery.
+ */
+ if (oip->oipi_id == -1) {
+ /*
+ * Determine if it's a known generic property or it belongs to a
+ * module by checking against the list of known names.
+ */
+ oip->oipi_name[OVERLAY_PROP_NAMELEN-1] = '\0';
+ if ((ret = overlay_i_name_to_propid(odd, oip->oipi_name,
+ &propid)) != 0) {
+ overlay_hold_rele(odd);
+ mac_perim_exit(mph);
+ return (ret);
+ }
+ oip->oipi_id = propid;
+ if (propid >= OVERLAY_DEV_NPROPS) {
+ ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(
+ oip->oipi_name, phdl);
+ overlay_hold_rele(odd);
+ mac_perim_exit(mph);
+ return (ret);
+
+ }
+ } else if (oip->oipi_id >= OVERLAY_DEV_NPROPS) {
+ uint_t id = oip->oipi_id - OVERLAY_DEV_NPROPS;
+
+ if (id >= odd->odd_plugin->ovp_nprops) {
+ overlay_hold_rele(odd);
+ mac_perim_exit(mph);
+ return (EINVAL);
+ }
+ ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(
+ odd->odd_plugin->ovp_props[id], phdl);
+ overlay_hold_rele(odd);
+ mac_perim_exit(mph);
+ return (ret);
+ } else if (oip->oipi_id < -1) {
+ overlay_hold_rele(odd);
+ mac_perim_exit(mph);
+ return (EINVAL);
+ } else {
+ ASSERT(oip->oipi_id < OVERLAY_DEV_NPROPS);
+ ASSERT(oip->oipi_id >= 0);
+ propid = oip->oipi_id;
+ (void) strlcpy(oip->oipi_name, overlay_dev_props[propid],
+ sizeof (oip->oipi_name));
+ }
+
+ switch (propid) {
+ case OVERLAY_DEV_P_MTU:
+ overlay_i_propinfo_mtu(odd, phdl);
+ break;
+ case OVERLAY_DEV_P_VNETID:
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+ overlay_prop_set_nodefault(phdl);
+ break;
+ case OVERLAY_DEV_P_ENCAP:
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_STRING);
+ overlay_prop_set_nodefault(phdl);
+ overlay_plugin_walk(overlay_propinfo_plugin_cb, phdl);
+ break;
+ case OVERLAY_DEV_P_VARPDID:
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+ overlay_prop_set_nodefault(phdl);
+ break;
+ default:
+ overlay_hold_rele(odd);
+ mac_perim_exit(mph);
+ return (ENOENT);
+ }
+
+ overlay_hold_rele(odd);
+ mac_perim_exit(mph);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_i_getprop(void *karg, intptr_t arg, int mode, cred_t *cred,
+ int *rvalp)
+{
+ int ret;
+ overlay_dev_t *odd;
+ mac_perim_handle_t mph;
+ overlay_ioc_prop_t *oip = karg;
+ uint_t propid, mtu;
+
+ odd = overlay_hold_by_dlid(oip->oip_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ mac_perim_enter_by_mh(odd->odd_mh, &mph);
+ oip->oip_size = OVERLAY_PROP_SIZEMAX;
+ oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
+ if (oip->oip_id == -1) {
+ int i;
+
+ for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
+ if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
+ break;
+ if (i == OVERLAY_DEV_NPROPS) {
+ ret = odd->odd_plugin->ovp_ops->ovpo_getprop(
+ odd->odd_pvoid, oip->oip_name,
+ oip->oip_value, &oip->oip_size);
+ overlay_hold_rele(odd);
+ mac_perim_exit(mph);
+ return (ret);
+ }
+ }
+
+ propid = i;
+ } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
+ uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
+
+ if (id > odd->odd_plugin->ovp_nprops) {
+ overlay_hold_rele(odd);
+ mac_perim_exit(mph);
+ return (EINVAL);
+ }
+ ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid,
+ odd->odd_plugin->ovp_props[id], oip->oip_value,
+ &oip->oip_size);
+ overlay_hold_rele(odd);
+ mac_perim_exit(mph);
+ return (ret);
+ } else if (oip->oip_id < -1) {
+ overlay_hold_rele(odd);
+ mac_perim_exit(mph);
+ return (EINVAL);
+ } else {
+ ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS);
+ ASSERT(oip->oip_id >= 0);
+ propid = oip->oip_id;
+ }
+
+ ret = 0;
+ switch (propid) {
+ case OVERLAY_DEV_P_MTU:
+ /*
+ * The MTU is always set and retrieved through MAC, to allow for
+ * MAC to do whatever it wants, as really that property belongs
+ * to MAC. This is important for things where vnics have hold on
+ * the MTU.
+ */
+ mac_sdu_get(odd->odd_mh, NULL, &mtu);
+ bcopy(&mtu, oip->oip_value, sizeof (uint_t));
+ oip->oip_size = sizeof (uint_t);
+ break;
+ case OVERLAY_DEV_P_VNETID:
+ /*
+ * While it's read-only while inside of a mux, we're not in a
+ * context that can guarantee that. Therefore we always grab the
+ * overlay_dev_t's odd_lock.
+ */
+ mutex_enter(&odd->odd_lock);
+ bcopy(&odd->odd_vid, oip->oip_value, sizeof (uint64_t));
+ mutex_exit(&odd->odd_lock);
+ oip->oip_size = sizeof (uint64_t);
+ break;
+ case OVERLAY_DEV_P_ENCAP:
+ oip->oip_size = strlcpy((char *)oip->oip_value,
+ odd->odd_plugin->ovp_name, oip->oip_size);
+ break;
+ case OVERLAY_DEV_P_VARPDID:
+ mutex_enter(&odd->odd_lock);
+ if (odd->odd_flags & OVERLAY_F_VARPD) {
+ const uint64_t val = odd->odd_target->ott_id;
+ bcopy(&val, oip->oip_value, sizeof (uint64_t));
+ oip->oip_size = sizeof (uint64_t);
+ } else {
+ oip->oip_size = 0;
+ }
+ mutex_exit(&odd->odd_lock);
+ break;
+ default:
+ ret = ENOENT;
+ }
+
+ overlay_hold_rele(odd);
+ mac_perim_exit(mph);
+ return (ret);
+}
+
+static void
+overlay_setprop_vnetid(overlay_dev_t *odd, uint64_t vnetid)
+{
+ mutex_enter(&odd->odd_lock);
+
+ /* Simple case, not active */
+ if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) {
+ odd->odd_vid = vnetid;
+ mutex_exit(&odd->odd_lock);
+ return;
+ }
+
+ /*
+ * In the hard case, we need to set the drop flag, quiesce I/O and then
+ * we can go ahead and do everything.
+ */
+ odd->odd_flags |= OVERLAY_F_MDDROP;
+ overlay_io_wait(odd, OVERLAY_F_IOMASK);
+ mutex_exit(&odd->odd_lock);
+
+ overlay_mux_remove_dev(odd->odd_mux, odd);
+ mutex_enter(&odd->odd_lock);
+ odd->odd_vid = vnetid;
+ mutex_exit(&odd->odd_lock);
+ overlay_mux_add_dev(odd->odd_mux, odd);
+
+ mutex_enter(&odd->odd_lock);
+ ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX);
+ odd->odd_flags &= ~OVERLAY_F_IN_MUX;
+ mutex_exit(&odd->odd_lock);
+}
+
+/* ARGSUSED */
+static int
+overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
+ int *rvalp)
+{
+ int ret;
+ overlay_dev_t *odd;
+ overlay_ioc_prop_t *oip = karg;
+ uint_t propid = UINT_MAX;
+ mac_perim_handle_t mph;
+ uint64_t maxid, *vidp;
+
+ if (oip->oip_size > OVERLAY_PROP_SIZEMAX)
+ return (EINVAL);
+
+ odd = overlay_hold_by_dlid(oip->oip_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
+ mac_perim_enter_by_mh(odd->odd_mh, &mph);
+ mutex_enter(&odd->odd_lock);
+ if (odd->odd_flags & OVERLAY_F_ACTIVATED) {
+ mac_perim_exit(mph);
+ mutex_exit(&odd->odd_lock);
+ return (ENOTSUP);
+ }
+ mutex_exit(&odd->odd_lock);
+ if (oip->oip_id == -1) {
+ int i;
+
+ for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
+ if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
+ break;
+ if (i == OVERLAY_DEV_NPROPS) {
+ ret = odd->odd_plugin->ovp_ops->ovpo_setprop(
+ odd->odd_pvoid, oip->oip_name,
+ oip->oip_value, oip->oip_size);
+ overlay_hold_rele(odd);
+ mac_perim_exit(mph);
+ return (ret);
+ }
+ }
+
+ propid = i;
+ } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
+ uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
+
+ if (id > odd->odd_plugin->ovp_nprops) {
+ mac_perim_exit(mph);
+ overlay_hold_rele(odd);
+ return (EINVAL);
+ }
+ ret = odd->odd_plugin->ovp_ops->ovpo_setprop(odd->odd_pvoid,
+ odd->odd_plugin->ovp_props[id], oip->oip_value,
+ oip->oip_size);
+ mac_perim_exit(mph);
+ overlay_hold_rele(odd);
+ return (ret);
+ } else if (oip->oip_id < -1) {
+ mac_perim_exit(mph);
+ overlay_hold_rele(odd);
+ return (EINVAL);
+ } else {
+ ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS);
+ ASSERT(oip->oip_id >= 0);
+ propid = oip->oip_id;
+ }
+
+ ret = 0;
+ switch (propid) {
+ case OVERLAY_DEV_P_MTU:
+ ret = mac_set_prop(odd->odd_mh, MAC_PROP_MTU, "mtu",
+ oip->oip_value, oip->oip_size);
+ break;
+ case OVERLAY_DEV_P_VNETID:
+ if (oip->oip_size != sizeof (uint64_t)) {
+ ret = EINVAL;
+ break;
+ }
+ vidp = (uint64_t *)oip->oip_value;
+ ASSERT(odd->odd_plugin->ovp_id_size <= 8);
+ maxid = UINT64_MAX;
+ if (odd->odd_plugin->ovp_id_size != 8)
+ maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) -
+ 1ULL;
+ if (*vidp >= maxid) {
+ ret = EINVAL;
+ break;
+ }
+ overlay_setprop_vnetid(odd, *vidp);
+ break;
+ case OVERLAY_DEV_P_ENCAP:
+ case OVERLAY_DEV_P_VARPDID:
+ ret = EPERM;
+ break;
+ default:
+ ret = ENOENT;
+ }
+
+ mac_perim_exit(mph);
+ overlay_hold_rele(odd);
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+overlay_i_status(void *karg, intptr_t arg, int mode, cred_t *cred,
+ int *rvalp)
+{
+ overlay_dev_t *odd;
+ overlay_ioc_status_t *os = karg;
+
+ odd = overlay_hold_by_dlid(os->ois_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ mutex_enter(&odd->odd_lock);
+ if ((odd->odd_flags & OVERLAY_F_DEGRADED) != 0) {
+ os->ois_status = OVERLAY_I_DEGRADED;
+ if (odd->odd_fmamsg != NULL) {
+ (void) strlcpy(os->ois_message, odd->odd_fmamsg,
+ OVERLAY_STATUS_BUFLEN);
+ } else {
+ os->ois_message[0] = '\0';
+ }
+
+ } else {
+ os->ois_status = OVERLAY_I_OK;
+ os->ois_message[0] = '\0';
+ }
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+
+ return (0);
+}
+
+static dld_ioc_info_t overlay_ioc_list[] = {
+ { OVERLAY_IOC_CREATE, DLDCOPYIN, sizeof (overlay_ioc_create_t),
+ overlay_i_create, secpolicy_dl_config },
+ { OVERLAY_IOC_ACTIVATE, DLDCOPYIN, sizeof (overlay_ioc_activate_t),
+ overlay_i_activate, secpolicy_dl_config },
+ { OVERLAY_IOC_DELETE, DLDCOPYIN, sizeof (overlay_ioc_delete_t),
+ overlay_i_delete, secpolicy_dl_config },
+ { OVERLAY_IOC_PROPINFO, DLDCOPYIN | DLDCOPYOUT,
+ sizeof (overlay_ioc_propinfo_t), overlay_i_propinfo,
+ secpolicy_dl_config },
+ { OVERLAY_IOC_GETPROP, DLDCOPYIN | DLDCOPYOUT,
+ sizeof (overlay_ioc_prop_t), overlay_i_getprop,
+ secpolicy_dl_config },
+ { OVERLAY_IOC_SETPROP, DLDCOPYIN,
+ sizeof (overlay_ioc_prop_t), overlay_i_setprop,
+ secpolicy_dl_config },
+ { OVERLAY_IOC_NPROPS, DLDCOPYIN | DLDCOPYOUT,
+ sizeof (overlay_ioc_nprops_t), overlay_i_nprops,
+ secpolicy_dl_config },
+ { OVERLAY_IOC_STATUS, DLDCOPYIN | DLDCOPYOUT,
+ sizeof (overlay_ioc_status_t), overlay_i_status,
+ NULL }
+};
+
+static int
+overlay_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ int fmcap = DDI_FM_EREPORT_CAPABLE;
+ if (cmd != DDI_ATTACH)
+ return (DDI_FAILURE);
+
+ if (overlay_dip != NULL || ddi_get_instance(dip) != 0)
+ return (DDI_FAILURE);
+
+ ddi_fm_init(dip, &fmcap, NULL);
+
+ if (ddi_create_minor_node(dip, OVERLAY_CTL, S_IFCHR,
+ ddi_get_instance(dip), DDI_PSEUDO, 0) == DDI_FAILURE)
+ return (DDI_FAILURE);
+
+ if (dld_ioc_register(OVERLAY_IOC, overlay_ioc_list,
+ DLDIOCCNT(overlay_ioc_list)) != 0) {
+ ddi_remove_minor_node(dip, OVERLAY_CTL);
+ return (DDI_FAILURE);
+ }
+
+ overlay_dip = dip;
+ return (DDI_SUCCESS);
+}
+
+/* ARGSUSED */
+static int
+overlay_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
+{
+ int error;
+
+ switch (cmd) {
+ case DDI_INFO_DEVT2DEVINFO:
+ *resp = (void *)overlay_dip;
+ error = DDI_SUCCESS;
+ break;
+ case DDI_INFO_DEVT2INSTANCE:
+ *resp = (void *)0;
+ error = DDI_SUCCESS;
+ break;
+ default:
+ error = DDI_FAILURE;
+ break;
+ }
+
+ return (error);
+}
+
+static int
+overlay_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ if (cmd != DDI_DETACH)
+ return (DDI_FAILURE);
+
+ mutex_enter(&overlay_dev_lock);
+ if (!list_is_empty(&overlay_dev_list) || overlay_target_busy()) {
+ mutex_exit(&overlay_dev_lock);
+ return (EBUSY);
+ }
+ mutex_exit(&overlay_dev_lock);
+
+
+ dld_ioc_unregister(OVERLAY_IOC);
+ ddi_remove_minor_node(dip, OVERLAY_CTL);
+ ddi_fm_fini(dip);
+ overlay_dip = NULL;
+ return (DDI_SUCCESS);
+}
+
+static struct cb_ops overlay_cbops = {
+ overlay_target_open, /* cb_open */
+ overlay_target_close, /* cb_close */
+ nodev, /* cb_strategy */
+ nodev, /* cb_print */
+ nodev, /* cb_dump */
+ nodev, /* cb_read */
+ nodev, /* cb_write */
+ overlay_target_ioctl, /* cb_ioctl */
+ nodev, /* cb_devmap */
+ nodev, /* cb_mmap */
+ nodev, /* cb_segmap */
+ nochpoll, /* cb_chpoll */
+ ddi_prop_op, /* cb_prop_op */
+ NULL, /* cb_stream */
+ D_MP, /* cb_flag */
+ CB_REV, /* cb_rev */
+ nodev, /* cb_aread */
+ nodev, /* cb_awrite */
+};
+
+static struct dev_ops overlay_dev_ops = {
+ DEVO_REV, /* devo_rev */
+ 0, /* devo_refcnt */
+ overlay_getinfo, /* devo_getinfo */
+ nulldev, /* devo_identify */
+ nulldev, /* devo_probe */
+ overlay_attach, /* devo_attach */
+ overlay_detach, /* devo_detach */
+ nulldev, /* devo_reset */
+ &overlay_cbops, /* devo_cb_ops */
+ NULL, /* devo_bus_ops */
+ NULL, /* devo_power */
+ ddi_quiesce_not_supported /* devo_quiesce */
+};
+
+static struct modldrv overlay_modldrv = {
+ &mod_driverops,
+ "Overlay Network Driver",
+ &overlay_dev_ops
+};
+
+static struct modlinkage overlay_linkage = {
+ MODREV_1,
+ &overlay_modldrv
+};
+
+static int
+overlay_init(void)
+{
+ mutex_init(&overlay_dev_lock, NULL, MUTEX_DRIVER, NULL);
+ list_create(&overlay_dev_list, sizeof (overlay_dev_t),
+ offsetof(overlay_dev_t, odd_link));
+ overlay_mux_init();
+ overlay_plugin_init();
+ overlay_target_init();
+
+ return (DDI_SUCCESS);
+}
+
+static void
+overlay_fini(void)
+{
+ overlay_target_fini();
+ overlay_plugin_fini();
+ overlay_mux_fini();
+ mutex_destroy(&overlay_dev_lock);
+ list_destroy(&overlay_dev_list);
+}
+
+int
+_init(void)
+{
+ int err;
+
+ if ((err = overlay_init()) != DDI_SUCCESS)
+ return (err);
+
+ mac_init_ops(NULL, "overlay");
+ err = mod_install(&overlay_linkage);
+ if (err != DDI_SUCCESS) {
+ overlay_fini();
+ return (err);
+ }
+
+ return (0);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&overlay_linkage, modinfop));
+}
+
+int
+_fini(void)
+{
+ int err;
+
+ err = mod_remove(&overlay_linkage);
+ if (err != 0)
+ return (err);
+
+ overlay_fini();
+ return (0);
+}
diff --git a/usr/src/uts/common/io/overlay/overlay.conf b/usr/src/uts/common/io/overlay/overlay.conf
new file mode 100644
index 0000000000..4b62fafd94
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay.conf
@@ -0,0 +1,16 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015, Joyent, Inc.
+#
+
+name="overlay" parent="pseudo" instance=0;
diff --git a/usr/src/uts/common/io/overlay/overlay.mapfile b/usr/src/uts/common/io/overlay/overlay.mapfile
new file mode 100644
index 0000000000..800d72dc2b
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay.mapfile
@@ -0,0 +1,46 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015 Joyent, Inc.
+#
+
+#
+# MAPFILE HEADER START
+#
+# WARNING: STOP NOW. DO NOT MODIFY THIS FILE.
+# Object versioning must comply with the rules detailed in
+#
+# usr/src/lib/README.mapfiles
+#
+# You should not be making modifications here until you've read the most current
+# copy of that file. If you need help, contact a gatekeeper for guidance.
+#
+# MAPFILE HEADER END
+#
+
+$mapfile_version 2
+
+SYMBOL_VERSION ILLUMOSprivate {
+ global:
+ # DDI Interfaces
+ _fini;
+ _init;
+ _info;
+
+ # Encapsualation Plugin interfaces
+ overlay_plugin_alloc;
+ overlay_plugin_free;
+ overlay_plugin_register;
+ overlay_plugin_unregister;
+ local:
+ *;
+};
diff --git a/usr/src/uts/common/io/overlay/overlay_fm.c b/usr/src/uts/common/io/overlay/overlay_fm.c
new file mode 100644
index 0000000000..0701d08e8b
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay_fm.c
@@ -0,0 +1,82 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * Overlay device FMA operations.
+ *
+ * For more information, see the big theory statement in
+ * uts/common/io/overlay/overlay.c
+ */
+
+#include <sys/ddifm.h>
+#include <sys/overlay_impl.h>
+
+kmutex_t overlay_fm_lock;
+uint_t overlay_fm_count;
+
+void
+overlay_fm_init(void)
+{
+ overlay_fm_count = 0;
+ mutex_init(&overlay_fm_lock, NULL, MUTEX_DRIVER, NULL);
+}
+
+void
+overlay_fm_fini(void)
+{
+ VERIFY(overlay_fm_count == 0);
+ mutex_destroy(&overlay_fm_lock);
+}
+
+void
+overlay_fm_degrade(overlay_dev_t *odd, const char *msg)
+{
+ mutex_enter(&overlay_fm_lock);
+ mutex_enter(&odd->odd_lock);
+
+ if (msg != NULL)
+ (void) strlcpy(odd->odd_fmamsg, msg, OVERLAY_STATUS_BUFLEN);
+
+ if (odd->odd_flags & OVERLAY_F_DEGRADED)
+ goto out;
+
+ odd->odd_flags |= OVERLAY_F_DEGRADED;
+ overlay_fm_count++;
+ if (overlay_fm_count == 1) {
+ ddi_fm_service_impact(overlay_dip, DDI_SERVICE_DEGRADED);
+ }
+out:
+ mutex_exit(&odd->odd_lock);
+ mutex_exit(&overlay_fm_lock);
+}
+
+void
+overlay_fm_restore(overlay_dev_t *odd)
+{
+ mutex_enter(&overlay_fm_lock);
+ mutex_enter(&odd->odd_lock);
+ if (!(odd->odd_flags & OVERLAY_F_DEGRADED))
+ goto out;
+
+ odd->odd_fmamsg[0] = '\0';
+ odd->odd_flags &= ~OVERLAY_F_DEGRADED;
+ overlay_fm_count--;
+ if (overlay_fm_count == 0) {
+ ddi_fm_service_impact(overlay_dip, DDI_SERVICE_RESTORED);
+ }
+out:
+ mutex_exit(&odd->odd_lock);
+ mutex_exit(&overlay_fm_lock);
+}
diff --git a/usr/src/uts/common/io/overlay/overlay_mux.c b/usr/src/uts/common/io/overlay/overlay_mux.c
new file mode 100644
index 0000000000..882f350060
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay_mux.c
@@ -0,0 +1,371 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+/*
+ * Overlay device ksocket multiplexer.
+ *
+ * For more information, see the big theory statement in
+ * uts/common/io/overlay/overlay.c
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/ksynch.h>
+#include <sys/ksocket.h>
+#include <sys/avl.h>
+#include <sys/list.h>
+#include <sys/pattr.h>
+#include <sys/sysmacros.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/tihdr.h>
+
+#include <sys/overlay_impl.h>
+
+#include <sys/sdt.h>
+
+#define OVERLAY_FREEMSG(mp, reason) \
+ DTRACE_PROBE2(overlay__fremsg, mblk_t *, mp, char *, reason)
+
+static list_t overlay_mux_list;
+static kmutex_t overlay_mux_lock;
+
+void
+overlay_mux_init(void)
+{
+ list_create(&overlay_mux_list, sizeof (overlay_mux_t),
+ offsetof(overlay_mux_t, omux_lnode));
+ mutex_init(&overlay_mux_lock, NULL, MUTEX_DRIVER, NULL);
+}
+
+void
+overlay_mux_fini(void)
+{
+ mutex_destroy(&overlay_mux_lock);
+ list_destroy(&overlay_mux_list);
+}
+
+static int
+overlay_mux_comparator(const void *a, const void *b)
+{
+ const overlay_dev_t *odl, *odr;
+ odl = a;
+ odr = b;
+ if (odl->odd_vid > odr->odd_vid)
+ return (1);
+ else if (odl->odd_vid < odr->odd_vid)
+ return (-1);
+ else
+ return (0);
+}
+
+/*
+ * This is the central receive data path. We need to decode the packet, if we
+ * can, and then deliver it to the appropriate overlay.
+ */
+/* ARGSUSED */
+static boolean_t
+overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob,
+ void *arg)
+{
+ mblk_t *mp, *nmp, *fmp;
+ overlay_mux_t *mux = arg;
+
+ /*
+ * We may have a received a chain of messages. Each messsage in the
+ * chain will likely have a T_unitdata_ind attached to it as an M_PROTO.
+ * If we aren't getting that, we should probably drop that for the
+ * moment.
+ */
+ for (mp = mpchain; mp != NULL; mp = nmp) {
+ struct T_unitdata_ind *tudi;
+ ovep_encap_info_t infop;
+ overlay_dev_t od, *odd;
+ int ret;
+
+ nmp = mp->b_next;
+ mp->b_next = NULL;
+
+ if (DB_TYPE(mp) != M_PROTO) {
+ OVERLAY_FREEMSG(mp, "first one isn't M_PROTO");
+ freemsg(mp);
+ continue;
+ }
+
+ if (mp->b_cont == NULL) {
+ OVERLAY_FREEMSG(mp, "missing a b_cont");
+ freemsg(mp);
+ continue;
+ }
+
+ tudi = (struct T_unitdata_ind *)mp->b_rptr;
+ if (tudi->PRIM_type != T_UNITDATA_IND) {
+ OVERLAY_FREEMSG(mp, "Not a T_unitdata_ind *");
+ freemsg(mp);
+ continue;
+ }
+
+ /*
+ * In the future, we'll care about the source information
+ * for purposes of telling varpd for oob invalidation. But for
+ * now, just drop that block.
+ */
+ fmp = mp;
+ mp = fmp->b_cont;
+ freeb(fmp);
+
+ /*
+ * Until we have VXLAN-or-other-decap HW acceleration support
+ * (e.g. we support NICs that reach into VXLAN-encapsulated
+ * packets and check the inside-VXLAN IP packets' checksums,
+ * or do LSO with VXLAN), we should clear any HW-accelerated-
+ * performed bits.
+ */
+ DB_CKSUMFLAGS(mp) = 0;
+
+ /*
+ * Decap and deliver.
+ */
+ bzero(&infop, sizeof (ovep_encap_info_t));
+ ret = mux->omux_plugin->ovp_ops->ovpo_decap(NULL, mp, &infop);
+ if (ret != 0) {
+ OVERLAY_FREEMSG(mp, "decap failed");
+ freemsg(mp);
+ continue;
+ }
+ if (MBLKL(mp) > infop.ovdi_hdr_size) {
+ mp->b_rptr += infop.ovdi_hdr_size;
+ } else {
+ while (infop.ovdi_hdr_size != 0) {
+ size_t rem, blkl;
+
+ if (mp == NULL)
+ break;
+
+ blkl = MBLKL(mp);
+ rem = MIN(infop.ovdi_hdr_size, blkl);
+ infop.ovdi_hdr_size -= rem;
+ mp->b_rptr += rem;
+ if (rem == blkl) {
+ fmp = mp;
+ mp = fmp->b_cont;
+ fmp->b_cont = NULL;
+ OVERLAY_FREEMSG(mp,
+ "freed a fmp block");
+ freemsg(fmp);
+ }
+ }
+ if (mp == NULL) {
+ OVERLAY_FREEMSG(mp, "freed it all...");
+ continue;
+ }
+ }
+
+
+ od.odd_vid = infop.ovdi_id;
+ mutex_enter(&mux->omux_lock);
+ odd = avl_find(&mux->omux_devices, &od, NULL);
+ if (odd == NULL) {
+ mutex_exit(&mux->omux_lock);
+ OVERLAY_FREEMSG(mp, "no matching vid");
+ freemsg(mp);
+ continue;
+ }
+ mutex_enter(&odd->odd_lock);
+ if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
+ !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
+ mutex_exit(&odd->odd_lock);
+ mutex_exit(&mux->omux_lock);
+ OVERLAY_FREEMSG(mp, "dev dropped");
+ freemsg(mp);
+ continue;
+ }
+ overlay_io_start(odd, OVERLAY_F_IN_RX);
+ mutex_exit(&odd->odd_lock);
+ mutex_exit(&mux->omux_lock);
+
+ mac_rx(odd->odd_mh, NULL, mp);
+
+ mutex_enter(&odd->odd_lock);
+ overlay_io_done(odd, OVERLAY_F_IN_RX);
+ mutex_exit(&odd->odd_lock);
+ }
+
+ return (B_TRUE);
+}
+
+/*
+ * Register a given device with a socket backend. If no such device socket
+ * exists, create a new one.
+ */
+overlay_mux_t *
+overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol,
+ struct sockaddr *addr, socklen_t len, int *errp)
+{
+ int err;
+ overlay_mux_t *mux;
+ ksocket_t ksock;
+
+ if (errp == NULL)
+ errp = &err;
+
+ mutex_enter(&overlay_mux_lock);
+ for (mux = list_head(&overlay_mux_list); mux != NULL;
+ mux = list_next(&overlay_mux_list, mux)) {
+ if (domain == mux->omux_domain &&
+ family == mux->omux_family &&
+ protocol == mux->omux_protocol &&
+ len == mux->omux_alen &&
+ bcmp(addr, mux->omux_addr, len) == 0) {
+
+ if (opp != mux->omux_plugin) {
+ *errp = EEXIST;
+ return (NULL);
+ }
+
+ mutex_enter(&mux->omux_lock);
+ mux->omux_count++;
+ mutex_exit(&mux->omux_lock);
+ mutex_exit(&overlay_mux_lock);
+ *errp = 0;
+ return (mux);
+ }
+ }
+
+ /*
+ * Today we aren't zone-aware and only exist in the global zone. When we
+ * allow for things to exist in the non-global zone, we'll want to use a
+ * credential that's actually specific to the zone.
+ */
+ *errp = ksocket_socket(&ksock, domain, family, protocol, KSOCKET_SLEEP,
+ kcred);
+ if (*errp != 0) {
+ mutex_exit(&overlay_mux_lock);
+ return (NULL);
+ }
+
+ *errp = ksocket_bind(ksock, addr, len, kcred);
+ if (*errp != 0) {
+ mutex_exit(&overlay_mux_lock);
+ ksocket_close(ksock, kcred);
+ return (NULL);
+ }
+
+ /*
+ * Ask our lower layer to optionally toggle anything they need on this
+ * socket. Because a socket is owned by a single type of plugin, we can
+ * then ask it to perform any additional socket set up it'd like to do.
+ */
+ if (opp->ovp_ops->ovpo_sockopt != NULL &&
+ (*errp = opp->ovp_ops->ovpo_sockopt(ksock)) != 0) {
+ mutex_exit(&overlay_mux_lock);
+ ksocket_close(ksock, kcred);
+ return (NULL);
+ }
+
+ mux = kmem_alloc(sizeof (overlay_mux_t), KM_SLEEP);
+ list_link_init(&mux->omux_lnode);
+ mux->omux_ksock = ksock;
+ mux->omux_plugin = opp;
+ mux->omux_domain = domain;
+ mux->omux_family = family;
+ mux->omux_protocol = protocol;
+ mux->omux_addr = kmem_alloc(len, KM_SLEEP);
+ bcopy(addr, mux->omux_addr, len);
+ mux->omux_alen = len;
+ mux->omux_count = 1;
+ avl_create(&mux->omux_devices, overlay_mux_comparator,
+ sizeof (overlay_dev_t), offsetof(overlay_dev_t, odd_muxnode));
+ mutex_init(&mux->omux_lock, NULL, MUTEX_DRIVER, NULL);
+
+
+ /* Once this is called, we need to expect to rx data */
+ *errp = ksocket_krecv_set(ksock, overlay_mux_recv, mux);
+ if (*errp != 0) {
+ ksocket_close(ksock, kcred);
+ mutex_destroy(&mux->omux_lock);
+ avl_destroy(&mux->omux_devices);
+ kmem_free(mux->omux_addr, len);
+ kmem_free(mux, sizeof (overlay_mux_t));
+ return (NULL);
+ }
+
+ list_insert_tail(&overlay_mux_list, mux);
+ mutex_exit(&overlay_mux_lock);
+
+ *errp = 0;
+ return (mux);
+}
+
+void
+overlay_mux_close(overlay_mux_t *mux)
+{
+ mutex_enter(&overlay_mux_lock);
+ mutex_enter(&mux->omux_lock);
+ mux->omux_count--;
+ if (mux->omux_count != 0) {
+ mutex_exit(&mux->omux_lock);
+ mutex_exit(&overlay_mux_lock);
+ return;
+ }
+ list_remove(&overlay_mux_list, mux);
+ mutex_exit(&mux->omux_lock);
+ mutex_exit(&overlay_mux_lock);
+
+ ksocket_close(mux->omux_ksock, kcred);
+ avl_destroy(&mux->omux_devices);
+ kmem_free(mux->omux_addr, mux->omux_alen);
+ kmem_free(mux, sizeof (overlay_mux_t));
+}
+
+void
+overlay_mux_add_dev(overlay_mux_t *mux, overlay_dev_t *odd)
+{
+ mutex_enter(&mux->omux_lock);
+ avl_add(&mux->omux_devices, odd);
+ mutex_exit(&mux->omux_lock);
+}
+
+void
+overlay_mux_remove_dev(overlay_mux_t *mux, overlay_dev_t *odd)
+{
+ mutex_enter(&mux->omux_lock);
+ avl_remove(&mux->omux_devices, odd);
+ mutex_exit(&mux->omux_lock);
+}
+
+int
+overlay_mux_tx(overlay_mux_t *mux, struct msghdr *hdr, mblk_t *mp)
+{
+ int ret;
+
+ /*
+ * It'd be nice to be able to use MSG_MBLK_QUICKRELE, unfortunately,
+ * that isn't actually supported by UDP at this time.
+ *
+ * Send with MSG_DONTWAIT to indicate clogged UDP sockets upstack.
+ */
+ ret = ksocket_sendmblk(mux->omux_ksock, hdr, MSG_DONTWAIT, &mp, kcred);
+ /*
+ * NOTE: ksocket_sendmblk() may send partial packets downstack,
+ * returning what's not sent in &mp (i.e. mp pre-call might be a
+ * b_cont of mp post-call). We can't hold up this message (it's a
+ * datagram), so we drop, and let the caller cope.
+ */
+ if (ret != 0)
+ freemsg(mp);
+
+ return (ret);
+}
diff --git a/usr/src/uts/common/io/overlay/overlay_plugin.c b/usr/src/uts/common/io/overlay/overlay_plugin.c
new file mode 100644
index 0000000000..348ddb92a2
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay_plugin.c
@@ -0,0 +1,281 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+/*
+ * Overlay device encapsulation plugin management
+ *
+ * For more information, see the big theory statement in
+ * uts/common/io/overlay/overlay.c
+ */
+
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/errno.h>
+#include <sys/sysmacros.h>
+#include <sys/modctl.h>
+
+#include <sys/overlay_impl.h>
+
+static kmem_cache_t *overlay_plugin_cache;
+static kmutex_t overlay_plugin_lock;
+static list_t overlay_plugin_list;
+
+#define OVERLAY_MODDIR "overlay"
+
+/* ARGSUSED */
+static int
+overlay_plugin_cache_constructor(void *buf, void *arg, int kmflags)
+{
+ overlay_plugin_t *opp = buf;
+
+ mutex_init(&opp->ovp_mutex, NULL, MUTEX_DRIVER, NULL);
+ list_link_init(&opp->ovp_link);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+overlay_plugin_cache_destructor(void *buf, void *arg)
+{
+ overlay_plugin_t *opp = buf;
+ ASSERT(list_link_active(&opp->ovp_link) == 0);
+ mutex_destroy(&opp->ovp_mutex);
+}
+
+void
+overlay_plugin_init(void)
+{
+ mutex_init(&overlay_plugin_lock, NULL, MUTEX_DRIVER, 0);
+
+ /*
+ * In the future we may want to have a reaper to unload unused modules
+ * to help the kernel be able to reclaim memory.
+ */
+ overlay_plugin_cache = kmem_cache_create("overlay_plugin_cache",
+ sizeof (overlay_plugin_t), 0, overlay_plugin_cache_constructor,
+ overlay_plugin_cache_destructor, NULL, NULL, NULL, 0);
+ list_create(&overlay_plugin_list, sizeof (overlay_plugin_t),
+ offsetof(overlay_plugin_t, ovp_link));
+}
+
+void
+overlay_plugin_fini(void)
+{
+ mutex_enter(&overlay_plugin_lock);
+ VERIFY(list_is_empty(&overlay_plugin_list));
+ mutex_exit(&overlay_plugin_lock);
+
+ list_destroy(&overlay_plugin_list);
+ kmem_cache_destroy(overlay_plugin_cache);
+ mutex_destroy(&overlay_plugin_lock);
+}
+
+overlay_plugin_register_t *
+overlay_plugin_alloc(uint_t version)
+{
+ overlay_plugin_register_t *ovrp;
+ /* Version 1 is the only one that exists */
+ if (version != OVEP_VERSION_ONE)
+ return (NULL);
+
+ ovrp = kmem_zalloc(sizeof (overlay_plugin_register_t), KM_SLEEP);
+ ovrp->ovep_version = version;
+ return (ovrp);
+}
+
+void
+overlay_plugin_free(overlay_plugin_register_t *ovrp)
+{
+ kmem_free(ovrp, sizeof (overlay_plugin_register_t));
+}
+
+int
+overlay_plugin_register(overlay_plugin_register_t *ovrp)
+{
+ overlay_plugin_t *opp, *ipp;
+
+ /* Sanity check parameters of the registration */
+ if (ovrp->ovep_version != OVEP_VERSION_ONE)
+ return (EINVAL);
+
+ if (ovrp->ovep_name == NULL || ovrp->ovep_ops == NULL)
+ return (EINVAL);
+
+ if ((ovrp->ovep_flags & ~(OVEP_F_VLAN_TAG)) != 0)
+ return (EINVAL);
+
+ if (ovrp->ovep_id_size < 1)
+ return (EINVAL);
+
+ /* Don't support anything that has an id size larger than 8 bytes */
+ if (ovrp->ovep_id_size > 8)
+ return (ENOTSUP);
+
+ if (ovrp->ovep_dest == OVERLAY_PLUGIN_D_INVALID)
+ return (EINVAL);
+
+ if ((ovrp->ovep_dest & ~OVERLAY_PLUGIN_D_MASK) != 0)
+ return (EINVAL);
+
+ if (ovrp->ovep_ops->ovpo_callbacks != 0)
+ return (EINVAL);
+ if (ovrp->ovep_ops->ovpo_init == NULL)
+ return (EINVAL);
+ if (ovrp->ovep_ops->ovpo_fini == NULL)
+ return (EINVAL);
+ if (ovrp->ovep_ops->ovpo_encap == NULL)
+ return (EINVAL);
+ if (ovrp->ovep_ops->ovpo_decap == NULL)
+ return (EINVAL);
+ if (ovrp->ovep_ops->ovpo_socket == NULL)
+ return (EINVAL);
+ if (ovrp->ovep_ops->ovpo_getprop == NULL)
+ return (EINVAL);
+ if (ovrp->ovep_ops->ovpo_setprop == NULL)
+ return (EINVAL);
+ if (ovrp->ovep_ops->ovpo_propinfo == NULL)
+ return (EINVAL);
+
+
+ opp = kmem_cache_alloc(overlay_plugin_cache, KM_SLEEP);
+ opp->ovp_active = 0;
+ opp->ovp_name = ovrp->ovep_name;
+ opp->ovp_ops = ovrp->ovep_ops;
+ opp->ovp_props = ovrp->ovep_props;
+ opp->ovp_id_size = ovrp->ovep_id_size;
+ opp->ovp_flags = ovrp->ovep_flags;
+ opp->ovp_dest = ovrp->ovep_dest;
+
+ opp->ovp_nprops = 0;
+ if (ovrp->ovep_props != NULL) {
+ while (ovrp->ovep_props[opp->ovp_nprops] != NULL) {
+ if (strlen(ovrp->ovep_props[opp->ovp_nprops]) >=
+ OVERLAY_PROP_NAMELEN) {
+ mutex_exit(&overlay_plugin_lock);
+ kmem_cache_free(overlay_plugin_cache, opp);
+ return (EINVAL);
+ }
+ opp->ovp_nprops++;
+ }
+ }
+
+ mutex_enter(&overlay_plugin_lock);
+ for (ipp = list_head(&overlay_plugin_list); ipp != NULL;
+ ipp = list_next(&overlay_plugin_list, ipp)) {
+ if (strcmp(ipp->ovp_name, opp->ovp_name) == 0) {
+ mutex_exit(&overlay_plugin_lock);
+ kmem_cache_free(overlay_plugin_cache, opp);
+ return (EEXIST);
+ }
+ }
+ list_insert_tail(&overlay_plugin_list, opp);
+ mutex_exit(&overlay_plugin_lock);
+
+ return (0);
+}
+
+int
+overlay_plugin_unregister(const char *name)
+{
+ overlay_plugin_t *opp;
+
+ mutex_enter(&overlay_plugin_lock);
+ for (opp = list_head(&overlay_plugin_list); opp != NULL;
+ opp = list_next(&overlay_plugin_list, opp)) {
+ if (strcmp(opp->ovp_name, name) == 0)
+ break;
+ }
+
+ if (opp == NULL) {
+ mutex_exit(&overlay_plugin_lock);
+ return (ENOENT);
+ }
+
+ mutex_enter(&opp->ovp_mutex);
+ if (opp->ovp_active > 0) {
+ mutex_exit(&opp->ovp_mutex);
+ mutex_exit(&overlay_plugin_lock);
+ return (EBUSY);
+ }
+ mutex_exit(&opp->ovp_mutex);
+
+ list_remove(&overlay_plugin_list, opp);
+ mutex_exit(&overlay_plugin_lock);
+
+ kmem_cache_free(overlay_plugin_cache, opp);
+ return (0);
+}
+
+overlay_plugin_t *
+overlay_plugin_lookup(const char *name)
+{
+ overlay_plugin_t *opp;
+ boolean_t trymodload = B_FALSE;
+
+ for (;;) {
+ mutex_enter(&overlay_plugin_lock);
+ for (opp = list_head(&overlay_plugin_list); opp != NULL;
+ opp = list_next(&overlay_plugin_list, opp)) {
+ if (strcmp(name, opp->ovp_name) == 0) {
+ mutex_enter(&opp->ovp_mutex);
+ opp->ovp_active++;
+ mutex_exit(&opp->ovp_mutex);
+ mutex_exit(&overlay_plugin_lock);
+ return (opp);
+ }
+ }
+ mutex_exit(&overlay_plugin_lock);
+
+ if (trymodload == B_TRUE)
+ return (NULL);
+
+ /*
+ * If we didn't find it, it may still exist, but just not have
+ * been a loaded module. In that case, we'll do one attempt to
+ * load it.
+ */
+ if (modload(OVERLAY_MODDIR, (char *)name) == -1)
+ return (NULL);
+ trymodload = B_TRUE;
+ }
+
+}
+
+void
+overlay_plugin_rele(overlay_plugin_t *opp)
+{
+ mutex_enter(&opp->ovp_mutex);
+ ASSERT(opp->ovp_active > 0);
+ opp->ovp_active--;
+ mutex_exit(&opp->ovp_mutex);
+}
+
+void
+overlay_plugin_walk(overlay_plugin_walk_f func, void *arg)
+{
+ overlay_plugin_t *opp;
+ mutex_enter(&overlay_plugin_lock);
+ for (opp = list_head(&overlay_plugin_list); opp != NULL;
+ opp = list_next(&overlay_plugin_list, opp)) {
+ if (func(opp, arg) != 0) {
+ mutex_exit(&overlay_plugin_lock);
+ return;
+ }
+ }
+ mutex_exit(&overlay_plugin_lock);
+}
diff --git a/usr/src/uts/common/io/overlay/overlay_prop.c b/usr/src/uts/common/io/overlay/overlay_prop.c
new file mode 100644
index 0000000000..ba1ea2a629
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay_prop.c
@@ -0,0 +1,122 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015, Joyent, Inc.
+ */
+
+/*
+ * Routines for manipulating property information structures.
+ *
+ * For more information, see the big theory statement in
+ * uts/common/io/overlay/overlay.c
+ */
+
+#include <sys/overlay_impl.h>
+
+void
+overlay_prop_init(overlay_prop_handle_t phdl)
+{
+ overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl;
+ mac_propval_range_t *rangep = (mac_propval_range_t *)infop->oipi_poss;
+
+ infop->oipi_posssize = sizeof (mac_propval_range_t);
+ bzero(rangep, sizeof (mac_propval_range_t));
+}
+
+void
+overlay_prop_set_name(overlay_prop_handle_t phdl, const char *name)
+{
+ overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl;
+ (void) strlcpy(infop->oipi_name, name, OVERLAY_PROP_NAMELEN);
+}
+
+void
+overlay_prop_set_prot(overlay_prop_handle_t phdl, overlay_prop_prot_t prot)
+{
+ overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl;
+ infop->oipi_prot = prot;
+}
+
+void
+overlay_prop_set_type(overlay_prop_handle_t phdl, overlay_prop_type_t type)
+{
+ overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl;
+ infop->oipi_type = type;
+}
+
+int
+overlay_prop_set_default(overlay_prop_handle_t phdl, void *def, ssize_t len)
+{
+ overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl;
+
+ if (len > OVERLAY_PROP_SIZEMAX)
+ return (E2BIG);
+
+ if (len < 0)
+ return (EOVERFLOW);
+
+ bcopy(def, infop->oipi_default, len);
+ infop->oipi_defsize = (uint32_t)len;
+
+ return (0);
+}
+
+void
+overlay_prop_set_nodefault(overlay_prop_handle_t phdl)
+{
+ overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl;
+ infop->oipi_default[0] = '\0';
+ infop->oipi_defsize = 0;
+}
+
+void
+overlay_prop_set_range_uint32(overlay_prop_handle_t phdl, uint32_t min,
+ uint32_t max)
+{
+ overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl;
+ mac_propval_range_t *rangep = (mac_propval_range_t *)infop->oipi_poss;
+
+ if (rangep->mpr_count != 0 && rangep->mpr_type != MAC_PROPVAL_UINT32)
+ return;
+
+ if (infop->oipi_posssize + sizeof (mac_propval_uint32_range_t) >
+ sizeof (infop->oipi_poss))
+ return;
+
+ infop->oipi_posssize += sizeof (mac_propval_uint32_range_t);
+ rangep->mpr_count++;
+ rangep->mpr_type = MAC_PROPVAL_UINT32;
+ rangep->u.mpr_uint32[rangep->mpr_count-1].mpur_min = min;
+ rangep->u.mpr_uint32[rangep->mpr_count-1].mpur_max = max;
+}
+
+void
+overlay_prop_set_range_str(overlay_prop_handle_t phdl, const char *str)
+{
+ size_t len = strlen(str) + 1; /* Account for a null terminator */
+ overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl;
+ mac_propval_range_t *rangep = (mac_propval_range_t *)infop->oipi_poss;
+ mac_propval_str_range_t *pstr = &rangep->u.mpr_str;
+
+ if (rangep->mpr_count != 0 && rangep->mpr_type != MAC_PROPVAL_STR)
+ return;
+
+ if (infop->oipi_posssize + len > sizeof (infop->oipi_poss))
+ return;
+
+ rangep->mpr_count++;
+ rangep->mpr_type = MAC_PROPVAL_STR;
+ strlcpy((char *)&pstr->mpur_data[pstr->mpur_nextbyte], str,
+ sizeof (infop->oipi_poss) - infop->oipi_posssize);
+ pstr->mpur_nextbyte += len;
+ infop->oipi_posssize += len;
+}
diff --git a/usr/src/uts/common/io/overlay/overlay_target.c b/usr/src/uts/common/io/overlay/overlay_target.c
new file mode 100644
index 0000000000..f4147b56d1
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay_target.c
@@ -0,0 +1,1651 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * Overlay device target cache management
+ *
+ * For more information, see the big theory statement in
+ * uts/common/io/overlay/overlay.c
+ */
+
+#include <sys/types.h>
+#include <sys/ethernet.h>
+#include <sys/kmem.h>
+#include <sys/policy.h>
+#include <sys/sysmacros.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#include <sys/strsubr.h>
+#include <sys/mac_provider.h>
+#include <sys/mac_client.h>
+#include <sys/mac_client_priv.h>
+#include <sys/vlan.h>
+#include <sys/crc32.h>
+#include <sys/cred.h>
+#include <sys/file.h>
+#include <sys/errno.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+
+#include <sys/overlay_impl.h>
+#include <sys/sdt.h>
+
+/*
+ * This is total straw man, but at least it's a prime number. Here we're
+ * going to have to go through and do a lot of evaluation and understanding as
+ * to how these target caches should grow and shrink, as well as, memory
+ * pressure and evictions. This just gives us a starting point that'll be 'good
+ * enough', until it's not.
+ */
+#define OVERLAY_HSIZE 823
+
+/*
+ * We use this data structure to keep track of what requests have been actively
+ * allocated to a given instance so we know what to put back on the pending
+ * list.
+ */
+typedef struct overlay_target_hdl {
+ minor_t oth_minor; /* RO */
+ zoneid_t oth_zoneid; /* RO */
+ int oth_oflags; /* RO */
+ list_node_t oth_link; /* overlay_target_lock */
+ kmutex_t oth_lock;
+ list_t oth_outstanding; /* oth_lock */
+} overlay_target_hdl_t;
+
+typedef int (*overlay_target_copyin_f)(const void *, void **, size_t *, int);
+typedef int (*overlay_target_ioctl_f)(overlay_target_hdl_t *, void *);
+typedef int (*overlay_target_copyout_f)(void *, void *, size_t, int);
+
+typedef struct overaly_target_ioctl {
+ int oti_cmd; /* ioctl id */
+ boolean_t oti_write; /* ioctl requires FWRITE */
+ boolean_t oti_ncopyout; /* copyout data? */
+ overlay_target_copyin_f oti_copyin; /* copyin func */
+ overlay_target_ioctl_f oti_func; /* function to call */
+ overlay_target_copyout_f oti_copyout; /* copyin func */
+ size_t oti_size; /* size of user level structure */
+} overlay_target_ioctl_t;
+
+static kmem_cache_t *overlay_target_cache;
+static kmem_cache_t *overlay_entry_cache;
+static id_space_t *overlay_thdl_idspace;
+static void *overlay_thdl_state;
+
+/*
+ * When we support overlay devices in the NGZ, then all of these need to become
+ * zone aware, by plugging into the netstack engine and becoming per-netstack
+ * data.
+ */
+static list_t overlay_thdl_list;
+static kmutex_t overlay_target_lock;
+static kcondvar_t overlay_target_condvar;
+static list_t overlay_target_list;
+static boolean_t overlay_target_excl;
+
+/*
+ * Outstanding data per hash table entry.
+ */
+static int overlay_ent_size = 128 * 1024;
+
+/* ARGSUSED */
+static int
+overlay_target_cache_constructor(void *buf, void *arg, int kmflgs)
+{
+ overlay_target_t *ott = buf;
+
+ mutex_init(&ott->ott_lock, NULL, MUTEX_DRIVER, NULL);
+ cv_init(&ott->ott_cond, NULL, CV_DRIVER, NULL);
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+overlay_target_cache_destructor(void *buf, void *arg)
+{
+ overlay_target_t *ott = buf;
+
+ cv_destroy(&ott->ott_cond);
+ mutex_destroy(&ott->ott_lock);
+}
+
+/* ARGSUSED */
+static int
+overlay_entry_cache_constructor(void *buf, void *arg, int kmflgs)
+{
+ overlay_target_entry_t *ote = buf;
+
+ bzero(ote, sizeof (overlay_target_entry_t));
+ mutex_init(&ote->ote_lock, NULL, MUTEX_DRIVER, NULL);
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+overlay_entry_cache_destructor(void *buf, void *arg)
+{
+ overlay_target_entry_t *ote = buf;
+
+ mutex_destroy(&ote->ote_lock);
+}
+
+static uint64_t
+overlay_mac_hash(const void *v)
+{
+ uint32_t crc;
+ CRC32(crc, v, ETHERADDRL, -1U, crc32_table);
+ return (crc);
+}
+
+static int
+overlay_mac_cmp(const void *a, const void *b)
+{
+ return (bcmp(a, b, ETHERADDRL));
+}
+
+/* ARGSUSED */
+static void
+overlay_target_entry_dtor(void *arg)
+{
+ overlay_target_entry_t *ote = arg;
+
+ ote->ote_flags = 0;
+ bzero(ote->ote_addr, ETHERADDRL);
+ ote->ote_ott = NULL;
+ ote->ote_odd = NULL;
+ freemsgchain(ote->ote_chead);
+ ote->ote_chead = ote->ote_ctail = NULL;
+ ote->ote_mbsize = 0;
+ ote->ote_vtime = 0;
+ kmem_cache_free(overlay_entry_cache, ote);
+}
+
+static int
+overlay_mac_avl(const void *a, const void *b)
+{
+ int i;
+ const overlay_target_entry_t *l, *r;
+ l = a;
+ r = b;
+
+ for (i = 0; i < ETHERADDRL; i++) {
+ if (l->ote_addr[i] > r->ote_addr[i])
+ return (1);
+ else if (l->ote_addr[i] < r->ote_addr[i])
+ return (-1);
+ }
+
+ return (0);
+}
+
+void
+overlay_target_init(void)
+{
+ int ret;
+ ret = ddi_soft_state_init(&overlay_thdl_state,
+ sizeof (overlay_target_hdl_t), 1);
+ VERIFY(ret == 0);
+ overlay_target_cache = kmem_cache_create("overlay_target",
+ sizeof (overlay_target_t), 0, overlay_target_cache_constructor,
+ overlay_target_cache_destructor, NULL, NULL, NULL, 0);
+ overlay_entry_cache = kmem_cache_create("overlay_entry",
+ sizeof (overlay_target_entry_t), 0, overlay_entry_cache_constructor,
+ overlay_entry_cache_destructor, NULL, NULL, NULL, 0);
+ mutex_init(&overlay_target_lock, NULL, MUTEX_DRIVER, NULL);
+ cv_init(&overlay_target_condvar, NULL, CV_DRIVER, NULL);
+ list_create(&overlay_target_list, sizeof (overlay_target_entry_t),
+ offsetof(overlay_target_entry_t, ote_qlink));
+ list_create(&overlay_thdl_list, sizeof (overlay_target_hdl_t),
+ offsetof(overlay_target_hdl_t, oth_link));
+ overlay_thdl_idspace = id_space_create("overlay_target_minors",
+ 1, INT32_MAX);
+}
+
+void
+overlay_target_fini(void)
+{
+ id_space_destroy(overlay_thdl_idspace);
+ list_destroy(&overlay_thdl_list);
+ list_destroy(&overlay_target_list);
+ cv_destroy(&overlay_target_condvar);
+ mutex_destroy(&overlay_target_lock);
+ kmem_cache_destroy(overlay_entry_cache);
+ kmem_cache_destroy(overlay_target_cache);
+ ddi_soft_state_fini(&overlay_thdl_state);
+}
+
+void
+overlay_target_free(overlay_dev_t *odd)
+{
+ if (odd->odd_target == NULL)
+ return;
+
+ if (odd->odd_target->ott_mode == OVERLAY_TARGET_DYNAMIC) {
+ refhash_t *rp = odd->odd_target->ott_u.ott_dyn.ott_dhash;
+ avl_tree_t *ap = &odd->odd_target->ott_u.ott_dyn.ott_tree;
+ overlay_target_entry_t *ote;
+
+ /*
+ * Our AVL tree and hashtable contain the same elements,
+ * therefore we should just remove it from the tree, but then
+ * delete the entries when we remove them from the hash table
+ * (which happens through the refhash dtor).
+ */
+ while ((ote = avl_first(ap)) != NULL)
+ avl_remove(ap, ote);
+
+ avl_destroy(ap);
+ for (ote = refhash_first(rp); ote != NULL;
+ ote = refhash_next(rp, ote)) {
+ refhash_remove(rp, ote);
+ }
+ refhash_destroy(rp);
+ }
+
+ ASSERT(odd->odd_target->ott_ocount == 0);
+ kmem_cache_free(overlay_target_cache, odd->odd_target);
+}
+
+int
+overlay_target_busy()
+{
+ int ret;
+
+ mutex_enter(&overlay_target_lock);
+ ret = !list_is_empty(&overlay_thdl_list);
+ mutex_exit(&overlay_target_lock);
+
+ return (ret);
+}
+
+static void
+overlay_target_queue(overlay_target_entry_t *entry)
+{
+ mutex_enter(&overlay_target_lock);
+ mutex_enter(&entry->ote_ott->ott_lock);
+ if (entry->ote_ott->ott_flags & OVERLAY_T_TEARDOWN) {
+ mutex_exit(&entry->ote_ott->ott_lock);
+ mutex_exit(&overlay_target_lock);
+ return;
+ }
+ entry->ote_ott->ott_ocount++;
+ mutex_exit(&entry->ote_ott->ott_lock);
+ list_insert_tail(&overlay_target_list, entry);
+ cv_signal(&overlay_target_condvar);
+ mutex_exit(&overlay_target_lock);
+}
+
+void
+overlay_target_quiesce(overlay_target_t *ott)
+{
+ if (ott == NULL)
+ return;
+ mutex_enter(&ott->ott_lock);
+ ott->ott_flags |= OVERLAY_T_TEARDOWN;
+ while (ott->ott_ocount != 0)
+ cv_wait(&ott->ott_cond, &ott->ott_lock);
+ mutex_exit(&ott->ott_lock);
+}
+
+/*
+ * This functions assumes that the destination mode is OVERLAY_PLUGIN_D_IP |
+ * OVERLAY_PLUGIN_D_PORT. As we don't have an implementation of anything else at
+ * this time, say for NVGRE, we drop all packets that mcuh this.
+ */
+int
+overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock,
+ socklen_t *slenp)
+{
+ int ret;
+ struct sockaddr_in6 *v6;
+ overlay_target_t *ott;
+ mac_header_info_t mhi;
+ overlay_target_entry_t *entry;
+
+ ASSERT(odd->odd_target != NULL);
+
+ /*
+ * At this point, the overlay device is in a mux which means that it's
+ * been activated. At this point, parts of the target, such as the mode
+ * and the destination are now read-only and we don't have to worry
+ * about synchronization for them.
+ */
+ ott = odd->odd_target;
+ if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT))
+ return (OVERLAY_TARGET_DROP);
+
+ v6 = (struct sockaddr_in6 *)sock;
+ bzero(v6, sizeof (struct sockaddr_in6));
+ v6->sin6_family = AF_INET6;
+
+ if (ott->ott_mode == OVERLAY_TARGET_POINT) {
+ mutex_enter(&ott->ott_lock);
+ bcopy(&ott->ott_u.ott_point.otp_ip, &v6->sin6_addr,
+ sizeof (struct in6_addr));
+ v6->sin6_port = htons(ott->ott_u.ott_point.otp_port);
+ mutex_exit(&ott->ott_lock);
+ *slenp = sizeof (struct sockaddr_in6);
+
+ return (OVERLAY_TARGET_OK);
+ }
+
+ ASSERT(ott->ott_mode == OVERLAY_TARGET_DYNAMIC);
+
+ /*
+ * Note we only want the MAC address here, therefore we won't bother
+ * using mac_vlan_header_info(). If any caller needs the vlan info at
+ * this point, this should change to a call to mac_vlan_header_info().
+ */
+ if (mac_header_info(odd->odd_mh, mp, &mhi) != 0)
+ return (OVERLAY_TARGET_DROP);
+ mutex_enter(&ott->ott_lock);
+ entry = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
+ mhi.mhi_daddr);
+ if (entry == NULL) {
+ entry = kmem_cache_alloc(overlay_entry_cache,
+ KM_NOSLEEP | KM_NORMALPRI);
+ if (entry == NULL) {
+ mutex_exit(&ott->ott_lock);
+ return (OVERLAY_TARGET_DROP);
+ }
+ bcopy(mhi.mhi_daddr, entry->ote_addr, ETHERADDRL);
+ entry->ote_chead = entry->ote_ctail = mp;
+ entry->ote_mbsize = msgsize(mp);
+ entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
+ entry->ote_ott = ott;
+ entry->ote_odd = odd;
+ refhash_insert(ott->ott_u.ott_dyn.ott_dhash, entry);
+ avl_add(&ott->ott_u.ott_dyn.ott_tree, entry);
+ mutex_exit(&ott->ott_lock);
+ overlay_target_queue(entry);
+ return (OVERLAY_TARGET_ASYNC);
+ }
+ refhash_hold(ott->ott_u.ott_dyn.ott_dhash, entry);
+ mutex_exit(&ott->ott_lock);
+
+ mutex_enter(&entry->ote_lock);
+ if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) {
+ ret = OVERLAY_TARGET_DROP;
+ } else if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
+ bcopy(&entry->ote_dest.otp_ip, &v6->sin6_addr,
+ sizeof (struct in6_addr));
+ v6->sin6_port = htons(entry->ote_dest.otp_port);
+ *slenp = sizeof (struct sockaddr_in6);
+ ret = OVERLAY_TARGET_OK;
+ } else {
+ size_t mlen = msgsize(mp);
+
+ if (mlen + entry->ote_mbsize > overlay_ent_size) {
+ ret = OVERLAY_TARGET_DROP;
+ } else {
+ if (entry->ote_ctail != NULL) {
+ ASSERT(entry->ote_ctail->b_next ==
+ NULL);
+ entry->ote_ctail->b_next = mp;
+ entry->ote_ctail = mp;
+ } else {
+ entry->ote_chead = mp;
+ entry->ote_ctail = mp;
+ }
+ entry->ote_mbsize += mlen;
+ if ((entry->ote_flags &
+ OVERLAY_ENTRY_F_PENDING) == 0) {
+ entry->ote_flags |=
+ OVERLAY_ENTRY_F_PENDING;
+ overlay_target_queue(entry);
+ }
+ ret = OVERLAY_TARGET_ASYNC;
+ }
+ }
+ mutex_exit(&entry->ote_lock);
+
+ mutex_enter(&ott->ott_lock);
+ refhash_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
+ mutex_exit(&ott->ott_lock);
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_info(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_dev_t *odd;
+ overlay_targ_info_t *oti = arg;
+
+ odd = overlay_hold_by_dlid(oti->oti_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ mutex_enter(&odd->odd_lock);
+ oti->oti_flags = 0;
+ oti->oti_needs = odd->odd_plugin->ovp_dest;
+ if (odd->odd_flags & OVERLAY_F_DEGRADED)
+ oti->oti_flags |= OVERLAY_TARG_INFO_F_DEGRADED;
+ if (odd->odd_flags & OVERLAY_F_ACTIVATED)
+ oti->oti_flags |= OVERLAY_TARG_INFO_F_ACTIVE;
+ oti->oti_vnetid = odd->odd_vid;
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_associate(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_dev_t *odd;
+ overlay_target_t *ott;
+ overlay_targ_associate_t *ota = arg;
+
+ odd = overlay_hold_by_dlid(ota->ota_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ if (ota->ota_id == 0) {
+ overlay_hold_rele(odd);
+ return (EINVAL);
+ }
+
+ if (ota->ota_mode != OVERLAY_TARGET_POINT &&
+ ota->ota_mode != OVERLAY_TARGET_DYNAMIC) {
+ overlay_hold_rele(odd);
+ return (EINVAL);
+ }
+
+ if (ota->ota_provides != odd->odd_plugin->ovp_dest) {
+ overlay_hold_rele(odd);
+ return (EINVAL);
+ }
+
+ if (ota->ota_mode == OVERLAY_TARGET_POINT) {
+ if (ota->ota_provides & OVERLAY_PLUGIN_D_IP) {
+ if (IN6_IS_ADDR_UNSPECIFIED(&ota->ota_point.otp_ip) ||
+ IN6_IS_ADDR_V4COMPAT(&ota->ota_point.otp_ip) ||
+ IN6_IS_ADDR_V4MAPPED_ANY(&ota->ota_point.otp_ip)) {
+ overlay_hold_rele(odd);
+ return (EINVAL);
+ }
+ }
+
+ if (ota->ota_provides & OVERLAY_PLUGIN_D_PORT) {
+ if (ota->ota_point.otp_port == 0) {
+ overlay_hold_rele(odd);
+ return (EINVAL);
+ }
+ }
+ }
+
+ ott = kmem_cache_alloc(overlay_target_cache, KM_SLEEP);
+ ott->ott_flags = 0;
+ ott->ott_ocount = 0;
+ ott->ott_mode = ota->ota_mode;
+ ott->ott_dest = ota->ota_provides;
+ ott->ott_id = ota->ota_id;
+
+ if (ott->ott_mode == OVERLAY_TARGET_POINT) {
+ bcopy(&ota->ota_point, &ott->ott_u.ott_point,
+ sizeof (overlay_target_point_t));
+ } else {
+ ott->ott_u.ott_dyn.ott_dhash = refhash_create(OVERLAY_HSIZE,
+ overlay_mac_hash, overlay_mac_cmp,
+ overlay_target_entry_dtor, sizeof (overlay_target_entry_t),
+ offsetof(overlay_target_entry_t, ote_reflink),
+ offsetof(overlay_target_entry_t, ote_addr), KM_SLEEP);
+ avl_create(&ott->ott_u.ott_dyn.ott_tree, overlay_mac_avl,
+ sizeof (overlay_target_entry_t),
+ offsetof(overlay_target_entry_t, ote_avllink));
+ }
+ mutex_enter(&odd->odd_lock);
+ if (odd->odd_flags & OVERLAY_F_VARPD) {
+ mutex_exit(&odd->odd_lock);
+ kmem_cache_free(overlay_target_cache, ott);
+ overlay_hold_rele(odd);
+ return (EEXIST);
+ }
+
+ odd->odd_flags |= OVERLAY_F_VARPD;
+ odd->odd_target = ott;
+ mutex_exit(&odd->odd_lock);
+
+ overlay_hold_rele(odd);
+
+
+ return (0);
+}
+
+
+/* ARGSUSED */
+static int
+overlay_target_degrade(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_dev_t *odd;
+ overlay_targ_degrade_t *otd = arg;
+
+ odd = overlay_hold_by_dlid(otd->otd_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ overlay_fm_degrade(odd, otd->otd_buf);
+ overlay_hold_rele(odd);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_restore(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_dev_t *odd;
+ overlay_targ_id_t *otid = arg;
+
+ odd = overlay_hold_by_dlid(otid->otid_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ overlay_fm_restore(odd);
+ overlay_hold_rele(odd);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_disassociate(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_dev_t *odd;
+ overlay_targ_id_t *otid = arg;
+
+ odd = overlay_hold_by_dlid(otid->otid_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ mutex_enter(&odd->odd_lock);
+ odd->odd_flags &= ~OVERLAY_F_VARPD;
+ mutex_exit(&odd->odd_lock);
+
+ overlay_hold_rele(odd);
+ return (0);
+
+}
+
+static int
+overlay_target_lookup_request(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_targ_lookup_t *otl = arg;
+ overlay_target_entry_t *entry;
+ clock_t ret, timeout;
+ mac_header_info_t mhi;
+
+ timeout = ddi_get_lbolt() + drv_usectohz(MICROSEC);
+again:
+ mutex_enter(&overlay_target_lock);
+ while (list_is_empty(&overlay_target_list)) {
+ ret = cv_timedwait(&overlay_target_condvar,
+ &overlay_target_lock, timeout);
+ if (ret == -1) {
+ mutex_exit(&overlay_target_lock);
+ return (ETIME);
+ }
+ }
+ entry = list_remove_head(&overlay_target_list);
+ mutex_exit(&overlay_target_lock);
+ mutex_enter(&entry->ote_lock);
+ if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
+ ASSERT(entry->ote_chead == NULL);
+ mutex_exit(&entry->ote_lock);
+ goto again;
+ }
+ ASSERT(entry->ote_chead != NULL);
+
+ /*
+ * If we have a bogon that doesn't have a valid mac header, drop it and
+ * try again.
+ */
+ if (mac_vlan_header_info(entry->ote_odd->odd_mh, entry->ote_chead,
+ &mhi) != 0) {
+ boolean_t queue = B_FALSE;
+ mblk_t *mp = entry->ote_chead;
+ entry->ote_chead = mp->b_next;
+ mp->b_next = NULL;
+ if (entry->ote_ctail == mp)
+ entry->ote_ctail = entry->ote_chead;
+ entry->ote_mbsize -= msgsize(mp);
+ if (entry->ote_chead != NULL)
+ queue = B_TRUE;
+ mutex_exit(&entry->ote_lock);
+ if (queue == B_TRUE)
+ overlay_target_queue(entry);
+ freemsg(mp);
+ goto again;
+ }
+
+ otl->otl_dlid = entry->ote_odd->odd_linkid;
+ otl->otl_reqid = (uintptr_t)entry;
+ otl->otl_varpdid = entry->ote_ott->ott_id;
+ otl->otl_vnetid = entry->ote_odd->odd_vid;
+
+ otl->otl_hdrsize = mhi.mhi_hdrsize;
+ otl->otl_pktsize = msgsize(entry->ote_chead) - otl->otl_hdrsize;
+ bcopy(mhi.mhi_daddr, otl->otl_dstaddr, ETHERADDRL);
+ bcopy(mhi.mhi_saddr, otl->otl_srcaddr, ETHERADDRL);
+ otl->otl_dsttype = mhi.mhi_dsttype;
+ otl->otl_sap = mhi.mhi_bindsap;
+ otl->otl_vlan = VLAN_ID(mhi.mhi_tci);
+ mutex_exit(&entry->ote_lock);
+
+ mutex_enter(&thdl->oth_lock);
+ list_insert_tail(&thdl->oth_outstanding, entry);
+ mutex_exit(&thdl->oth_lock);
+
+ return (0);
+}
+
+static int
+overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg)
+{
+ const overlay_targ_resp_t *otr = arg;
+ overlay_target_entry_t *entry;
+ mblk_t *mp;
+
+ mutex_enter(&thdl->oth_lock);
+ for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
+ entry = list_next(&thdl->oth_outstanding, entry)) {
+ if ((uintptr_t)entry == otr->otr_reqid)
+ break;
+ }
+
+ if (entry == NULL) {
+ mutex_exit(&thdl->oth_lock);
+ return (EINVAL);
+ }
+ list_remove(&thdl->oth_outstanding, entry);
+ mutex_exit(&thdl->oth_lock);
+
+ mutex_enter(&entry->ote_lock);
+ bcopy(&otr->otr_answer, &entry->ote_dest,
+ sizeof (overlay_target_point_t));
+ entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
+ entry->ote_flags |= OVERLAY_ENTRY_F_VALID;
+ mp = entry->ote_chead;
+ entry->ote_chead = NULL;
+ entry->ote_ctail = NULL;
+ entry->ote_mbsize = 0;
+ entry->ote_vtime = gethrtime();
+ mutex_exit(&entry->ote_lock);
+
+ /*
+ * For now do an in-situ drain.
+ */
+ mp = overlay_m_tx(entry->ote_odd, mp);
+ freemsgchain(mp);
+
+ mutex_enter(&entry->ote_ott->ott_lock);
+ entry->ote_ott->ott_ocount--;
+ cv_signal(&entry->ote_ott->ott_cond);
+ mutex_exit(&entry->ote_ott->ott_lock);
+
+ return (0);
+}
+
+static int
+overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg)
+{
+ const overlay_targ_resp_t *otr = arg;
+ overlay_target_entry_t *entry;
+ mblk_t *mp;
+ boolean_t queue = B_FALSE;
+
+ mutex_enter(&thdl->oth_lock);
+ for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
+ entry = list_next(&thdl->oth_outstanding, entry)) {
+ if ((uintptr_t)entry == otr->otr_reqid)
+ break;
+ }
+
+ if (entry == NULL) {
+ mutex_exit(&thdl->oth_lock);
+ return (EINVAL);
+ }
+ list_remove(&thdl->oth_outstanding, entry);
+ mutex_exit(&thdl->oth_lock);
+
+ mutex_enter(&entry->ote_lock);
+
+ /* Safeguard against a confused varpd */
+ if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
+ entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
+ DTRACE_PROBE1(overlay__target__valid__drop,
+ overlay_target_entry_t *, entry);
+ mutex_exit(&entry->ote_lock);
+ goto done;
+ }
+
+ mp = entry->ote_chead;
+ if (mp != NULL) {
+ entry->ote_chead = mp->b_next;
+ mp->b_next = NULL;
+ if (entry->ote_ctail == mp)
+ entry->ote_ctail = entry->ote_chead;
+ entry->ote_mbsize -= msgsize(mp);
+ }
+ if (entry->ote_chead != NULL) {
+ queue = B_TRUE;
+ entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
+ } else {
+ entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
+ }
+ mutex_exit(&entry->ote_lock);
+
+ if (queue == B_TRUE)
+ overlay_target_queue(entry);
+ freemsg(mp);
+
+done:
+ mutex_enter(&entry->ote_ott->ott_lock);
+ entry->ote_ott->ott_ocount--;
+ cv_signal(&entry->ote_ott->ott_cond);
+ mutex_exit(&entry->ote_ott->ott_lock);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_pkt_copyin(const void *ubuf, void **outp, size_t *bsize,
+ int flags)
+{
+ overlay_targ_pkt_t *pkt;
+ overlay_targ_pkt32_t *pkt32;
+
+ pkt = kmem_alloc(sizeof (overlay_targ_pkt_t), KM_SLEEP);
+ *outp = pkt;
+ *bsize = sizeof (overlay_targ_pkt_t);
+ if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) {
+ uintptr_t addr;
+
+ if (ddi_copyin(ubuf, pkt, sizeof (overlay_targ_pkt32_t),
+ flags & FKIOCTL) != 0) {
+ kmem_free(pkt, *bsize);
+ return (EFAULT);
+ }
+ pkt32 = (overlay_targ_pkt32_t *)pkt;
+ addr = pkt32->otp_buf;
+ pkt->otp_buf = (void *)addr;
+ } else {
+ if (ddi_copyin(ubuf, pkt, *bsize, flags & FKIOCTL) != 0) {
+ kmem_free(pkt, *bsize);
+ return (EFAULT);
+ }
+ }
+ return (0);
+}
+
+static int
+overlay_target_pkt_copyout(void *ubuf, void *buf, size_t bufsize,
+ int flags)
+{
+ if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) {
+ overlay_targ_pkt_t *pkt = buf;
+ overlay_targ_pkt32_t *pkt32 = buf;
+ uintptr_t addr = (uintptr_t)pkt->otp_buf;
+ pkt32->otp_buf = (caddr32_t)addr;
+ if (ddi_copyout(buf, ubuf, sizeof (overlay_targ_pkt32_t),
+ flags & FKIOCTL) != 0)
+ return (EFAULT);
+ } else {
+ if (ddi_copyout(buf, ubuf, bufsize, flags & FKIOCTL) != 0)
+ return (EFAULT);
+ }
+ return (0);
+}
+
+static int
+overlay_target_packet(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_targ_pkt_t *pkt = arg;
+ overlay_target_entry_t *entry;
+ mblk_t *mp;
+ size_t mlen;
+ size_t boff;
+
+ mutex_enter(&thdl->oth_lock);
+ for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
+ entry = list_next(&thdl->oth_outstanding, entry)) {
+ if ((uintptr_t)entry == pkt->otp_reqid)
+ break;
+ }
+
+ if (entry == NULL) {
+ mutex_exit(&thdl->oth_lock);
+ return (EINVAL);
+ }
+ mutex_enter(&entry->ote_lock);
+ mutex_exit(&thdl->oth_lock);
+ mp = entry->ote_chead;
+ /* Protect against a rogue varpd */
+ if (mp == NULL) {
+ mutex_exit(&entry->ote_lock);
+ return (EINVAL);
+ }
+ mlen = MIN(msgsize(mp), pkt->otp_size);
+ pkt->otp_size = mlen;
+ boff = 0;
+ while (mlen > 0) {
+ size_t wlen = MIN(MBLKL(mp), mlen);
+ if (ddi_copyout(mp->b_rptr,
+ (void *)((uintptr_t)pkt->otp_buf + boff),
+ wlen, 0) != 0) {
+ mutex_exit(&entry->ote_lock);
+ return (EFAULT);
+ }
+ mlen -= wlen;
+ boff += wlen;
+ mp = mp->b_cont;
+ }
+ mutex_exit(&entry->ote_lock);
+ return (0);
+}
+
+static int
+overlay_target_inject(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_targ_pkt_t *pkt = arg;
+ overlay_target_entry_t *entry;
+ overlay_dev_t *odd;
+ mblk_t *mp;
+
+ if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ)
+ return (EINVAL);
+
+ mp = allocb(pkt->otp_size, 0);
+ if (mp == NULL)
+ return (ENOMEM);
+
+ if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) {
+ freeb(mp);
+ return (EFAULT);
+ }
+ mp->b_wptr += pkt->otp_size;
+
+ if (pkt->otp_linkid != UINT64_MAX) {
+ odd = overlay_hold_by_dlid(pkt->otp_linkid);
+ if (odd == NULL) {
+ freeb(mp);
+ return (ENOENT);
+ }
+ } else {
+ mutex_enter(&thdl->oth_lock);
+ for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
+ entry = list_next(&thdl->oth_outstanding, entry)) {
+ if ((uintptr_t)entry == pkt->otp_reqid)
+ break;
+ }
+
+ if (entry == NULL) {
+ mutex_exit(&thdl->oth_lock);
+ freeb(mp);
+ return (ENOENT);
+ }
+ odd = entry->ote_odd;
+ mutex_exit(&thdl->oth_lock);
+ }
+
+ mutex_enter(&odd->odd_lock);
+ overlay_io_start(odd, OVERLAY_F_IN_RX);
+ mutex_exit(&odd->odd_lock);
+
+ mac_rx(odd->odd_mh, NULL, mp);
+
+ mutex_enter(&odd->odd_lock);
+ overlay_io_done(odd, OVERLAY_F_IN_RX);
+ mutex_exit(&odd->odd_lock);
+
+ return (0);
+}
+
+static int
+overlay_target_resend(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_targ_pkt_t *pkt = arg;
+ overlay_target_entry_t *entry;
+ overlay_dev_t *odd;
+ mblk_t *mp;
+
+ if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ)
+ return (EINVAL);
+
+ mp = allocb(pkt->otp_size, 0);
+ if (mp == NULL)
+ return (ENOMEM);
+
+ if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) {
+ freeb(mp);
+ return (EFAULT);
+ }
+ mp->b_wptr += pkt->otp_size;
+
+ if (pkt->otp_linkid != UINT64_MAX) {
+ odd = overlay_hold_by_dlid(pkt->otp_linkid);
+ if (odd == NULL) {
+ freeb(mp);
+ return (ENOENT);
+ }
+ } else {
+ mutex_enter(&thdl->oth_lock);
+ for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
+ entry = list_next(&thdl->oth_outstanding, entry)) {
+ if ((uintptr_t)entry == pkt->otp_reqid)
+ break;
+ }
+
+ if (entry == NULL) {
+ mutex_exit(&thdl->oth_lock);
+ freeb(mp);
+ return (ENOENT);
+ }
+ odd = entry->ote_odd;
+ mutex_exit(&thdl->oth_lock);
+ }
+
+ mp = overlay_m_tx(odd, mp);
+ freemsgchain(mp);
+
+ return (0);
+}
+
+typedef struct overlay_targ_list_int {
+ boolean_t otli_count;
+ uint32_t otli_cur;
+ uint32_t otli_nents;
+ uint32_t otli_ents[];
+} overlay_targ_list_int_t;
+
+static int
+overlay_target_list_copyin(const void *ubuf, void **outp, size_t *bsize,
+ int flags)
+{
+ overlay_targ_list_t n;
+ overlay_targ_list_int_t *otl;
+
+ if (ddi_copyin(ubuf, &n, sizeof (overlay_targ_list_t),
+ flags & FKIOCTL) != 0)
+ return (EFAULT);
+
+ /*
+ */
+ if (n.otl_nents >= INT32_MAX / sizeof (uint32_t))
+ return (EINVAL);
+ *bsize = sizeof (overlay_targ_list_int_t) +
+ sizeof (uint32_t) * n.otl_nents;
+ otl = kmem_zalloc(*bsize, KM_SLEEP);
+ otl->otli_cur = 0;
+ otl->otli_nents = n.otl_nents;
+ if (otl->otli_nents != 0) {
+ otl->otli_count = B_FALSE;
+ if (ddi_copyin((void *)((uintptr_t)ubuf +
+ offsetof(overlay_targ_list_t, otl_ents)),
+ otl->otli_ents, n.otl_nents * sizeof (uint32_t),
+ flags & FKIOCTL) != 0) {
+ kmem_free(otl, *bsize);
+ return (EFAULT);
+ }
+ } else {
+ otl->otli_count = B_TRUE;
+ }
+
+ *outp = otl;
+ return (0);
+}
+
+static int
+overlay_target_ioctl_list_cb(overlay_dev_t *odd, void *arg)
+{
+ overlay_targ_list_int_t *otl = arg;
+
+ if (otl->otli_cur < otl->otli_nents)
+ otl->otli_ents[otl->otli_cur] = odd->odd_linkid;
+ otl->otli_cur++;
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_ioctl_list(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_dev_iter(overlay_target_ioctl_list_cb, arg);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_list_copyout(void *ubuf, void *buf, size_t bufsize, int flags)
+{
+ overlay_targ_list_int_t *otl = buf;
+
+ if (ddi_copyout(&otl->otli_cur, ubuf, sizeof (uint32_t),
+ flags & FKIOCTL) != 0)
+ return (EFAULT);
+
+ if (otl->otli_count == B_FALSE) {
+ if (ddi_copyout(otl->otli_ents,
+ (void *)((uintptr_t)ubuf +
+ offsetof(overlay_targ_list_t, otl_ents)),
+ sizeof (uint32_t) * otl->otli_nents,
+ flags & FKIOCTL) != 0)
+ return (EFAULT);
+ }
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_cache_get(overlay_target_hdl_t *thdl, void *arg)
+{
+ int ret = 0;
+ overlay_dev_t *odd;
+ overlay_target_t *ott;
+ overlay_targ_cache_t *otc = arg;
+
+ odd = overlay_hold_by_dlid(otc->otc_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ mutex_enter(&odd->odd_lock);
+ if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENXIO);
+ }
+ ott = odd->odd_target;
+ if (ott->ott_mode != OVERLAY_TARGET_POINT &&
+ ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENOTSUP);
+ }
+ mutex_enter(&ott->ott_lock);
+ mutex_exit(&odd->odd_lock);
+
+ if (ott->ott_mode == OVERLAY_TARGET_POINT) {
+ otc->otc_entry.otce_flags = 0;
+ bcopy(&ott->ott_u.ott_point, &otc->otc_entry.otce_dest,
+ sizeof (overlay_target_point_t));
+ } else {
+ overlay_target_entry_t *ote;
+ ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
+ otc->otc_entry.otce_mac);
+ if (ote != NULL) {
+ mutex_enter(&ote->ote_lock);
+ if ((ote->ote_flags &
+ OVERLAY_ENTRY_F_VALID_MASK) != 0) {
+ if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) {
+ otc->otc_entry.otce_flags =
+ OVERLAY_TARGET_CACHE_DROP;
+ } else {
+ otc->otc_entry.otce_flags = 0;
+ bcopy(&ote->ote_dest,
+ &otc->otc_entry.otce_dest,
+ sizeof (overlay_target_point_t));
+ }
+ ret = 0;
+ } else {
+ ret = ENOENT;
+ }
+ mutex_exit(&ote->ote_lock);
+ } else {
+ ret = ENOENT;
+ }
+ }
+
+ mutex_exit(&ott->ott_lock);
+ overlay_hold_rele(odd);
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_dev_t *odd;
+ overlay_target_t *ott;
+ overlay_target_entry_t *ote;
+ overlay_targ_cache_t *otc = arg;
+ mblk_t *mp = NULL;
+
+ if (otc->otc_entry.otce_flags & ~OVERLAY_TARGET_CACHE_DROP)
+ return (EINVAL);
+
+ odd = overlay_hold_by_dlid(otc->otc_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ mutex_enter(&odd->odd_lock);
+ if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENXIO);
+ }
+ ott = odd->odd_target;
+ if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENOTSUP);
+ }
+ mutex_enter(&ott->ott_lock);
+ mutex_exit(&odd->odd_lock);
+
+ ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
+ otc->otc_entry.otce_mac);
+ if (ote == NULL) {
+ ote = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP);
+ bcopy(otc->otc_entry.otce_mac, ote->ote_addr, ETHERADDRL);
+ ote->ote_chead = ote->ote_ctail = NULL;
+ ote->ote_mbsize = 0;
+ ote->ote_ott = ott;
+ ote->ote_odd = odd;
+ mutex_enter(&ote->ote_lock);
+ refhash_insert(ott->ott_u.ott_dyn.ott_dhash, ote);
+ avl_add(&ott->ott_u.ott_dyn.ott_tree, ote);
+ } else {
+ mutex_enter(&ote->ote_lock);
+ }
+
+ if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_DROP) {
+ ote->ote_flags |= OVERLAY_ENTRY_F_DROP;
+ } else {
+ ote->ote_flags |= OVERLAY_ENTRY_F_VALID;
+ bcopy(&otc->otc_entry.otce_dest, &ote->ote_dest,
+ sizeof (overlay_target_point_t));
+ mp = ote->ote_chead;
+ ote->ote_chead = NULL;
+ ote->ote_ctail = NULL;
+ ote->ote_mbsize = 0;
+ ote->ote_vtime = gethrtime();
+ }
+
+ mutex_exit(&ote->ote_lock);
+ mutex_exit(&ott->ott_lock);
+
+ if (mp != NULL) {
+ mp = overlay_m_tx(ote->ote_odd, mp);
+ freemsgchain(mp);
+ }
+
+ overlay_hold_rele(odd);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_cache_remove(overlay_target_hdl_t *thdl, void *arg)
+{
+ int ret = 0;
+ overlay_dev_t *odd;
+ overlay_target_t *ott;
+ overlay_target_entry_t *ote;
+ overlay_targ_cache_t *otc = arg;
+
+ odd = overlay_hold_by_dlid(otc->otc_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ mutex_enter(&odd->odd_lock);
+ if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENXIO);
+ }
+ ott = odd->odd_target;
+ if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENOTSUP);
+ }
+ mutex_enter(&ott->ott_lock);
+ mutex_exit(&odd->odd_lock);
+
+ ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
+ otc->otc_entry.otce_mac);
+ if (ote != NULL) {
+ mutex_enter(&ote->ote_lock);
+ ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
+ mutex_exit(&ote->ote_lock);
+ ret = 0;
+ } else {
+ ret = ENOENT;
+ }
+
+ mutex_exit(&ott->ott_lock);
+ overlay_hold_rele(odd);
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_cache_flush(overlay_target_hdl_t *thdl, void *arg)
+{
+ avl_tree_t *avl;
+ overlay_dev_t *odd;
+ overlay_target_t *ott;
+ overlay_target_entry_t *ote;
+ overlay_targ_cache_t *otc = arg;
+
+ odd = overlay_hold_by_dlid(otc->otc_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ mutex_enter(&odd->odd_lock);
+ if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENXIO);
+ }
+ ott = odd->odd_target;
+ if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENOTSUP);
+ }
+ mutex_enter(&ott->ott_lock);
+ mutex_exit(&odd->odd_lock);
+ avl = &ott->ott_u.ott_dyn.ott_tree;
+
+ for (ote = avl_first(avl); ote != NULL; ote = AVL_NEXT(avl, ote)) {
+ mutex_enter(&ote->ote_lock);
+ ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
+ mutex_exit(&ote->ote_lock);
+ }
+ ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
+ otc->otc_entry.otce_mac);
+
+ mutex_exit(&ott->ott_lock);
+ overlay_hold_rele(odd);
+
+ return (0);
+}
+
+static int
+overlay_target_cache_iter_copyin(const void *ubuf, void **outp, size_t *bsize,
+ int flags)
+{
+ overlay_targ_cache_iter_t base, *iter;
+
+ if (ddi_copyin(ubuf, &base, sizeof (overlay_targ_cache_iter_t),
+ flags & FKIOCTL) != 0)
+ return (EFAULT);
+
+ if (base.otci_count > OVERLAY_TARGET_ITER_MAX)
+ return (E2BIG);
+
+ if (base.otci_count == 0)
+ return (EINVAL);
+
+ *bsize = sizeof (overlay_targ_cache_iter_t) +
+ base.otci_count * sizeof (overlay_targ_cache_entry_t);
+ iter = kmem_alloc(*bsize, KM_SLEEP);
+ bcopy(&base, iter, sizeof (overlay_targ_cache_iter_t));
+ *outp = iter;
+
+ return (0);
+}
+
+typedef struct overlay_targ_cache_marker {
+ uint8_t otcm_mac[ETHERADDRL];
+ uint16_t otcm_done;
+} overlay_targ_cache_marker_t;
+
+/* ARGSUSED */
+static int
+overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg)
+{
+ overlay_dev_t *odd;
+ overlay_target_t *ott;
+ overlay_target_entry_t lookup, *ent;
+ overlay_targ_cache_marker_t *mark;
+ avl_index_t where;
+ avl_tree_t *avl;
+ uint16_t written = 0;
+
+ overlay_targ_cache_iter_t *iter = arg;
+ mark = (void *)&iter->otci_marker;
+
+ if (mark->otcm_done != 0) {
+ iter->otci_count = 0;
+ return (0);
+ }
+
+ odd = overlay_hold_by_dlid(iter->otci_linkid);
+ if (odd == NULL)
+ return (ENOENT);
+
+ mutex_enter(&odd->odd_lock);
+ if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENXIO);
+ }
+ ott = odd->odd_target;
+ if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC &&
+ ott->ott_mode != OVERLAY_TARGET_POINT) {
+ mutex_exit(&odd->odd_lock);
+ overlay_hold_rele(odd);
+ return (ENOTSUP);
+ }
+
+ /*
+ * Holding this lock across the entire iteration probably isn't very
+ * good. We should perhaps add an r/w lock for the avl tree. But we'll
+ * wait until we now it's necessary before we do more.
+ */
+ mutex_enter(&ott->ott_lock);
+ mutex_exit(&odd->odd_lock);
+
+ if (ott->ott_mode == OVERLAY_TARGET_POINT) {
+ overlay_targ_cache_entry_t *out = &iter->otci_ents[0];
+ bzero(out->otce_mac, ETHERADDRL);
+ out->otce_flags = 0;
+ bcopy(&ott->ott_u.ott_point, &out->otce_dest,
+ sizeof (overlay_target_point_t));
+ written++;
+ mark->otcm_done = 1;
+ }
+
+ avl = &ott->ott_u.ott_dyn.ott_tree;
+ bcopy(mark->otcm_mac, lookup.ote_addr, ETHERADDRL);
+ ent = avl_find(avl, &lookup, &where);
+
+ /*
+ * NULL ent means that the entry does not exist, so we want to start
+ * with the closest node in the tree. This means that we implicitly rely
+ * on the tree's order and the first node will be the mac 00:00:00:00:00
+ * and the last will be ff:ff:ff:ff:ff:ff.
+ */
+ if (ent == NULL) {
+ ent = avl_nearest(avl, where, AVL_AFTER);
+ if (ent == NULL) {
+ mark->otcm_done = 1;
+ goto done;
+ }
+ }
+
+ for (; ent != NULL && written < iter->otci_count;
+ ent = AVL_NEXT(avl, ent)) {
+ overlay_targ_cache_entry_t *out = &iter->otci_ents[written];
+ mutex_enter(&ent->ote_lock);
+ if ((ent->ote_flags & OVERLAY_ENTRY_F_VALID_MASK) == 0) {
+ mutex_exit(&ent->ote_lock);
+ continue;
+ }
+ bcopy(ent->ote_addr, out->otce_mac, ETHERADDRL);
+ out->otce_flags = 0;
+ if (ent->ote_flags & OVERLAY_ENTRY_F_DROP)
+ out->otce_flags |= OVERLAY_TARGET_CACHE_DROP;
+ if (ent->ote_flags & OVERLAY_ENTRY_F_VALID)
+ bcopy(&ent->ote_dest, &out->otce_dest,
+ sizeof (overlay_target_point_t));
+ written++;
+ mutex_exit(&ent->ote_lock);
+ }
+
+ if (ent != NULL) {
+ bcopy(ent->ote_addr, mark->otcm_mac, ETHERADDRL);
+ } else {
+ mark->otcm_done = 1;
+ }
+
+done:
+ iter->otci_count = written;
+ mutex_exit(&ott->ott_lock);
+ overlay_hold_rele(odd);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_cache_iter_copyout(void *ubuf, void *buf, size_t bufsize,
+ int flags)
+{
+ size_t outsize;
+ const overlay_targ_cache_iter_t *iter = buf;
+
+ outsize = sizeof (overlay_targ_cache_iter_t) +
+ iter->otci_count * sizeof (overlay_targ_cache_entry_t);
+
+ if (ddi_copyout(buf, ubuf, outsize, flags & FKIOCTL) != 0)
+ return (EFAULT);
+
+ return (0);
+}
+
+static overlay_target_ioctl_t overlay_target_ioctab[] = {
+ { OVERLAY_TARG_INFO, B_TRUE, B_TRUE,
+ NULL, overlay_target_info,
+ NULL, sizeof (overlay_targ_info_t) },
+ { OVERLAY_TARG_ASSOCIATE, B_TRUE, B_FALSE,
+ NULL, overlay_target_associate,
+ NULL, sizeof (overlay_targ_associate_t) },
+ { OVERLAY_TARG_DISASSOCIATE, B_TRUE, B_FALSE,
+ NULL, overlay_target_disassociate,
+ NULL, sizeof (overlay_targ_id_t) },
+ { OVERLAY_TARG_DEGRADE, B_TRUE, B_FALSE,
+ NULL, overlay_target_degrade,
+ NULL, sizeof (overlay_targ_degrade_t) },
+ { OVERLAY_TARG_RESTORE, B_TRUE, B_FALSE,
+ NULL, overlay_target_restore,
+ NULL, sizeof (overlay_targ_id_t) },
+ { OVERLAY_TARG_LOOKUP, B_FALSE, B_TRUE,
+ NULL, overlay_target_lookup_request,
+ NULL, sizeof (overlay_targ_lookup_t) },
+ { OVERLAY_TARG_RESPOND, B_TRUE, B_FALSE,
+ NULL, overlay_target_lookup_respond,
+ NULL, sizeof (overlay_targ_resp_t) },
+ { OVERLAY_TARG_DROP, B_TRUE, B_FALSE,
+ NULL, overlay_target_lookup_drop,
+ NULL, sizeof (overlay_targ_resp_t) },
+ { OVERLAY_TARG_PKT, B_TRUE, B_TRUE,
+ overlay_target_pkt_copyin,
+ overlay_target_packet,
+ overlay_target_pkt_copyout,
+ sizeof (overlay_targ_pkt_t) },
+ { OVERLAY_TARG_INJECT, B_TRUE, B_FALSE,
+ overlay_target_pkt_copyin,
+ overlay_target_inject,
+ NULL, sizeof (overlay_targ_pkt_t) },
+ { OVERLAY_TARG_RESEND, B_TRUE, B_FALSE,
+ overlay_target_pkt_copyin,
+ overlay_target_resend,
+ NULL, sizeof (overlay_targ_pkt_t) },
+ { OVERLAY_TARG_LIST, B_FALSE, B_TRUE,
+ overlay_target_list_copyin,
+ overlay_target_ioctl_list,
+ overlay_target_list_copyout,
+ sizeof (overlay_targ_list_t) },
+ { OVERLAY_TARG_CACHE_GET, B_FALSE, B_TRUE,
+ NULL, overlay_target_cache_get,
+ NULL, sizeof (overlay_targ_cache_t) },
+ { OVERLAY_TARG_CACHE_SET, B_TRUE, B_TRUE,
+ NULL, overlay_target_cache_set,
+ NULL, sizeof (overlay_targ_cache_t) },
+ { OVERLAY_TARG_CACHE_REMOVE, B_TRUE, B_TRUE,
+ NULL, overlay_target_cache_remove,
+ NULL, sizeof (overlay_targ_cache_t) },
+ { OVERLAY_TARG_CACHE_FLUSH, B_TRUE, B_TRUE,
+ NULL, overlay_target_cache_flush,
+ NULL, sizeof (overlay_targ_cache_t) },
+ { OVERLAY_TARG_CACHE_ITER, B_FALSE, B_TRUE,
+ overlay_target_cache_iter_copyin,
+ overlay_target_cache_iter,
+ overlay_target_cache_iter_copyout,
+ sizeof (overlay_targ_cache_iter_t) },
+ { 0 }
+};
+
+int
+overlay_target_open(dev_t *devp, int flags, int otype, cred_t *credp)
+{
+ minor_t mid;
+ overlay_target_hdl_t *thdl;
+
+ if (secpolicy_dl_config(credp) != 0)
+ return (EPERM);
+
+ if (getminor(*devp) != 0)
+ return (ENXIO);
+
+ if (otype & OTYP_BLK)
+ return (EINVAL);
+
+ if (flags & ~(FREAD | FWRITE | FEXCL))
+ return (EINVAL);
+
+ if ((flags & FWRITE) &&
+ !(flags & FEXCL))
+ return (EINVAL);
+
+ if (!(flags & FREAD) && !(flags & FWRITE))
+ return (EINVAL);
+
+ if (crgetzoneid(credp) != GLOBAL_ZONEID)
+ return (EPERM);
+
+ mid = id_alloc(overlay_thdl_idspace);
+ if (ddi_soft_state_zalloc(overlay_thdl_state, mid) != 0) {
+ id_free(overlay_thdl_idspace, mid);
+ return (ENXIO);
+ }
+
+ thdl = ddi_get_soft_state(overlay_thdl_state, mid);
+ VERIFY(thdl != NULL);
+ thdl->oth_minor = mid;
+ thdl->oth_zoneid = crgetzoneid(credp);
+ thdl->oth_oflags = flags;
+ mutex_init(&thdl->oth_lock, NULL, MUTEX_DRIVER, NULL);
+ list_create(&thdl->oth_outstanding, sizeof (overlay_target_entry_t),
+ offsetof(overlay_target_entry_t, ote_qlink));
+ *devp = makedevice(getmajor(*devp), mid);
+
+ mutex_enter(&overlay_target_lock);
+ if ((flags & FEXCL) && overlay_target_excl == B_TRUE) {
+ mutex_exit(&overlay_target_lock);
+ list_destroy(&thdl->oth_outstanding);
+ mutex_destroy(&thdl->oth_lock);
+ ddi_soft_state_free(overlay_thdl_state, mid);
+ id_free(overlay_thdl_idspace, mid);
+ return (EEXIST);
+ } else if ((flags & FEXCL) != 0) {
+ VERIFY(overlay_target_excl == B_FALSE);
+ overlay_target_excl = B_TRUE;
+ }
+ list_insert_tail(&overlay_thdl_list, thdl);
+ mutex_exit(&overlay_target_lock);
+
+ return (0);
+}
+
+/* ARGSUSED */
+int
+overlay_target_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
+ int *rvalp)
+{
+ overlay_target_ioctl_t *ioc;
+ overlay_target_hdl_t *thdl;
+
+ if (secpolicy_dl_config(credp) != 0)
+ return (EPERM);
+
+ if ((thdl = ddi_get_soft_state(overlay_thdl_state,
+ getminor(dev))) == NULL)
+ return (ENXIO);
+
+ for (ioc = &overlay_target_ioctab[0]; ioc->oti_cmd != 0; ioc++) {
+ int ret;
+ caddr_t buf;
+ size_t bufsize;
+
+ if (ioc->oti_cmd != cmd)
+ continue;
+
+ if (ioc->oti_write == B_TRUE && !(mode & FWRITE))
+ return (EBADF);
+
+ if (ioc->oti_copyin == NULL) {
+ bufsize = ioc->oti_size;
+ buf = kmem_alloc(bufsize, KM_SLEEP);
+ if (ddi_copyin((void *)(uintptr_t)arg, buf, bufsize,
+ mode & FKIOCTL) != 0) {
+ kmem_free(buf, bufsize);
+ return (EFAULT);
+ }
+ } else {
+ if ((ret = ioc->oti_copyin((void *)(uintptr_t)arg,
+ (void **)&buf, &bufsize, mode)) != 0)
+ return (ret);
+ }
+
+ ret = ioc->oti_func(thdl, buf);
+ if (ret == 0 && ioc->oti_size != 0 &&
+ ioc->oti_ncopyout == B_TRUE) {
+ if (ioc->oti_copyout == NULL) {
+ if (ddi_copyout(buf, (void *)(uintptr_t)arg,
+ bufsize, mode & FKIOCTL) != 0)
+ ret = EFAULT;
+ } else {
+ ret = ioc->oti_copyout((void *)(uintptr_t)arg,
+ buf, bufsize, mode);
+ }
+ }
+
+ kmem_free(buf, bufsize);
+ return (ret);
+ }
+
+ return (ENOTTY);
+}
+
+/* ARGSUSED */
+int
+overlay_target_close(dev_t dev, int flags, int otype, cred_t *credp)
+{
+ overlay_target_hdl_t *thdl;
+ overlay_target_entry_t *entry;
+ minor_t mid = getminor(dev);
+
+ if ((thdl = ddi_get_soft_state(overlay_thdl_state, mid)) == NULL)
+ return (ENXIO);
+
+ mutex_enter(&overlay_target_lock);
+ list_remove(&overlay_thdl_list, thdl);
+ mutex_enter(&thdl->oth_lock);
+ while ((entry = list_remove_head(&thdl->oth_outstanding)) != NULL)
+ list_insert_tail(&overlay_target_list, entry);
+ cv_signal(&overlay_target_condvar);
+ mutex_exit(&thdl->oth_lock);
+ if ((thdl->oth_oflags & FEXCL) != 0) {
+ VERIFY(overlay_target_excl == B_TRUE);
+ overlay_target_excl = B_FALSE;
+ }
+ mutex_exit(&overlay_target_lock);
+
+ list_destroy(&thdl->oth_outstanding);
+ mutex_destroy(&thdl->oth_lock);
+ mid = thdl->oth_minor;
+ ddi_soft_state_free(overlay_thdl_state, mid);
+ id_free(overlay_thdl_idspace, mid);
+
+ return (0);
+}
diff --git a/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c b/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c
new file mode 100644
index 0000000000..92144b3985
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c
@@ -0,0 +1,394 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+/*
+ * VXLAN encapsulation module
+ *
+ *
+ * The VXLAN header looks as follows in network byte order:
+ *
+ * |0 3| 4 |5 31|
+ * +----------+---+------------------------+
+ * | Reserved | I | Reserved |
+ * +---------------------------------------+
+ * | Virtual Network ID | Reserved |
+ * +----------------------------+----------+
+ * |0 23|24 31|
+ *
+ * All reserved values must be 0. The I bit must be 1. We call the top
+ * word the VXLAN magic field for the time being. The second word is
+ * definitely not the most friendly way to operate. Specifically, the ID
+ * is a 24-bit big endian value, but we have to make sure not to use the
+ * reserved byte.
+ *
+ * For us, VXLAN encapsulation is a fairly straightforward implementation. It
+ * only has two properties, a listen_ip and a listen_port. These determine on
+ * what address we should be listening on. While we do not have a default
+ * address to listen upon, we do have a default port, which is the IANA assigned
+ * port for VXLAN -- 4789.
+ */
+
+#include <sys/overlay_plugin.h>
+#include <sys/modctl.h>
+#include <sys/errno.h>
+#include <sys/byteorder.h>
+#include <sys/vxlan.h>
+#include <inet/ip.h>
+#include <netinet/in.h>
+#include <sys/strsun.h>
+#include <netinet/udp.h>
+
+static const char *vxlan_ident = "vxlan";
+static uint16_t vxlan_defport = IPPORT_VXLAN;
+
+/*
+ * Should we enable UDP source port hashing for fanout.
+ */
+boolean_t vxlan_fanout = B_TRUE;
+
+/*
+ * This represents the size in bytes that we want to allocate when allocating a
+ * vxlan header block. This is intended such that lower levels can try and use
+ * the message block that we allocate for the IP and UPD header. The hope is
+ * that even if this is tunneled, that this is enough space.
+ *
+ * The vxlan_noalloc_min value represents the minimum amount of space we need to
+ * consider not allocating a message block and just passing it down the stack in
+ * this form. This number assumes that we have a VLAN tag, so 18 byte Ethernet
+ * header, 20 byte IP header, 8 byte UDP header, and 8 byte VXLAN header.
+ */
+uint_t vxlan_alloc_size = 128;
+uint_t vxlan_noalloc_min = 54;
+
+static const char *vxlan_props[] = {
+ "vxlan/listen_ip",
+ "vxlan/listen_port",
+ NULL
+};
+
+typedef struct vxlan {
+ kmutex_t vxl_lock;
+ overlay_handle_t vxl_oh;
+ uint16_t vxl_lport;
+ boolean_t vxl_hladdr;
+ struct in6_addr vxl_laddr;
+} vxlan_t;
+
+static int
+vxlan_o_init(overlay_handle_t oh, void **outp)
+{
+ vxlan_t *vxl;
+
+ vxl = kmem_alloc(sizeof (vxlan_t), KM_SLEEP);
+ *outp = vxl;
+ mutex_init(&vxl->vxl_lock, NULL, MUTEX_DRIVER, NULL);
+ vxl->vxl_oh = oh;
+ vxl->vxl_lport = vxlan_defport;
+ vxl->vxl_hladdr = B_FALSE;
+
+ return (0);
+}
+
+static void
+vxlan_o_fini(void *arg)
+{
+ vxlan_t *vxl = arg;
+
+ mutex_destroy(&vxl->vxl_lock);
+ kmem_free(arg, sizeof (vxlan_t));
+}
+
+static int
+vxlan_o_socket(void *arg, int *dp, int *fp, int *pp, struct sockaddr *addr,
+ socklen_t *slenp)
+{
+ vxlan_t *vxl = arg;
+ struct sockaddr_in6 *in;
+
+ in = (struct sockaddr_in6 *)addr;
+ *dp = AF_INET6;
+ *fp = SOCK_DGRAM;
+ *pp = 0;
+ bzero(in, sizeof (struct sockaddr_in6));
+ in->sin6_family = AF_INET6;
+
+ /*
+ * We should consider a more expressive private errno set that
+ * provider's can use.
+ */
+ mutex_enter(&vxl->vxl_lock);
+ if (vxl->vxl_hladdr == B_FALSE) {
+ mutex_exit(&vxl->vxl_lock);
+ return (EINVAL);
+ }
+ in->sin6_port = htons(vxl->vxl_lport);
+ in->sin6_addr = vxl->vxl_laddr;
+ mutex_exit(&vxl->vxl_lock);
+ *slenp = sizeof (struct sockaddr_in6);
+
+ return (0);
+}
+
+static int
+vxlan_o_sockopt(ksocket_t ksock)
+{
+ int val, err;
+ if (vxlan_fanout == B_FALSE)
+ return (0);
+
+ val = UDP_HASH_VXLAN;
+ err = ksocket_setsockopt(ksock, IPPROTO_UDP, UDP_SRCPORT_HASH, &val,
+ sizeof (val), kcred);
+ return (err);
+}
+
+/* ARGSUSED */
+static int
+vxlan_o_encap(void *arg, mblk_t *mp, ovep_encap_info_t *einfop,
+ mblk_t **outp)
+{
+ mblk_t *ob;
+ vxlan_hdr_t *vxh;
+
+ ASSERT(einfop->ovdi_id < (1 << 24));
+
+ if (DB_REF(mp) != 1 || mp->b_rptr - vxlan_noalloc_min < DB_BASE(mp)) {
+ /*
+ * This allocation could get hot. We may want to have a good
+ * way to cache and handle this allocation the same way that IP
+ * does with keeping around a message block per entry, or
+ * basically treating this as an immutable message block in the
+ * system. Basically freemsg() will be a nop, but we'll do the
+ * right thing with respect to the rest of the chain.
+ */
+ ob = allocb(vxlan_alloc_size, 0);
+ if (ob == NULL)
+ return (ENOMEM);
+
+ ob->b_wptr = DB_LIM(ob);
+ ob->b_rptr = ob->b_wptr;
+ ob->b_cont = mp;
+ } else {
+ ob = mp;
+ }
+ ob->b_rptr -= VXLAN_HDR_LEN;
+
+ vxh = (vxlan_hdr_t *)ob->b_rptr;
+ vxh->vxlan_flags = ntohl(VXLAN_F_VDI);
+ vxh->vxlan_id = htonl((uint32_t)einfop->ovdi_id << VXLAN_ID_SHIFT);
+ *outp = ob;
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+vxlan_o_decap(void *arg, mblk_t *mp, ovep_encap_info_t *dinfop)
+{
+ vxlan_hdr_t *vxh;
+
+ if (MBLKL(mp) < sizeof (vxlan_hdr_t))
+ return (EINVAL);
+ vxh = (vxlan_hdr_t *)mp->b_rptr;
+ if ((ntohl(vxh->vxlan_flags) & VXLAN_F_VDI) == 0)
+ return (EINVAL);
+
+ dinfop->ovdi_id = ntohl(vxh->vxlan_id) >> VXLAN_ID_SHIFT;
+ dinfop->ovdi_hdr_size = VXLAN_HDR_LEN;
+
+ return (0);
+}
+
+static int
+vxlan_o_getprop(void *arg, const char *pr_name, void *buf, uint32_t *bufsize)
+{
+ vxlan_t *vxl = arg;
+
+ /* vxlan/listen_ip */
+ if (strcmp(pr_name, vxlan_props[0]) == 0) {
+ if (*bufsize < sizeof (struct in6_addr))
+ return (EOVERFLOW);
+
+ mutex_enter(&vxl->vxl_lock);
+ if (vxl->vxl_hladdr == B_FALSE) {
+ *bufsize = 0;
+ } else {
+ bcopy(&vxl->vxl_laddr, buf, sizeof (struct in6_addr));
+ *bufsize = sizeof (struct in6_addr);
+ }
+ mutex_exit(&vxl->vxl_lock);
+ return (0);
+ }
+
+ /* vxlan/listen_port */
+ if (strcmp(pr_name, vxlan_props[1]) == 0) {
+ uint64_t val;
+ if (*bufsize < sizeof (uint64_t))
+ return (EOVERFLOW);
+
+ mutex_enter(&vxl->vxl_lock);
+ val = vxl->vxl_lport;
+ bcopy(&val, buf, sizeof (uint64_t));
+ *bufsize = sizeof (uint64_t);
+ mutex_exit(&vxl->vxl_lock);
+ return (0);
+ }
+
+ return (EINVAL);
+}
+
+static int
+vxlan_o_setprop(void *arg, const char *pr_name, const void *buf,
+ uint32_t bufsize)
+{
+ vxlan_t *vxl = arg;
+
+ /* vxlan/listen_ip */
+ if (strcmp(pr_name, vxlan_props[0]) == 0) {
+ const struct in6_addr *ipv6 = buf;
+ if (bufsize != sizeof (struct in6_addr))
+ return (EINVAL);
+
+ if (IN6_IS_ADDR_V4COMPAT(ipv6))
+ return (EINVAL);
+
+ if (IN6_IS_ADDR_MULTICAST(ipv6))
+ return (EINVAL);
+
+ if (IN6_IS_ADDR_6TO4(ipv6))
+ return (EINVAL);
+
+ if (IN6_IS_ADDR_V4MAPPED(ipv6)) {
+ ipaddr_t v4;
+ IN6_V4MAPPED_TO_IPADDR(ipv6, v4);
+ if (IN_MULTICAST(v4))
+ return (EINVAL);
+ }
+
+ mutex_enter(&vxl->vxl_lock);
+ vxl->vxl_hladdr = B_TRUE;
+ bcopy(ipv6, &vxl->vxl_laddr, sizeof (struct in6_addr));
+ mutex_exit(&vxl->vxl_lock);
+
+ return (0);
+ }
+
+ /* vxlan/listen_port */
+ if (strcmp(pr_name, vxlan_props[1]) == 0) {
+ const uint64_t *valp = buf;
+ if (bufsize != 8)
+ return (EINVAL);
+
+ if (*valp == 0 || *valp > UINT16_MAX)
+ return (EINVAL);
+
+ mutex_enter(&vxl->vxl_lock);
+ vxl->vxl_lport = *valp;
+ mutex_exit(&vxl->vxl_lock);
+ return (0);
+ }
+ return (EINVAL);
+}
+
+static int
+vxlan_o_propinfo(const char *pr_name, overlay_prop_handle_t phdl)
+{
+ /* vxlan/listen_ip */
+ if (strcmp(pr_name, vxlan_props[0]) == 0) {
+ overlay_prop_set_name(phdl, vxlan_props[0]);
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_IP);
+ overlay_prop_set_nodefault(phdl);
+ return (0);
+ }
+
+ if (strcmp(pr_name, vxlan_props[1]) == 0) {
+ overlay_prop_set_name(phdl, vxlan_props[1]);
+ overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW);
+ overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
+ (void) overlay_prop_set_default(phdl, &vxlan_defport,
+ sizeof (vxlan_defport));
+ overlay_prop_set_range_uint32(phdl, 1, UINT16_MAX);
+ return (0);
+ }
+
+ return (EINVAL);
+}
+
+static struct overlay_plugin_ops vxlan_o_ops = {
+ 0,
+ vxlan_o_init,
+ vxlan_o_fini,
+ vxlan_o_encap,
+ vxlan_o_decap,
+ vxlan_o_socket,
+ vxlan_o_sockopt,
+ vxlan_o_getprop,
+ vxlan_o_setprop,
+ vxlan_o_propinfo
+};
+
+static struct modlmisc vxlan_modlmisc = {
+ &mod_miscops,
+ "VXLAN encap plugin"
+};
+
+static struct modlinkage vxlan_modlinkage = {
+ MODREV_1,
+ &vxlan_modlmisc
+};
+
+int
+_init(void)
+{
+ int err;
+ overlay_plugin_register_t *ovrp;
+
+ ovrp = overlay_plugin_alloc(OVEP_VERSION);
+ if (ovrp == NULL)
+ return (ENOTSUP);
+ ovrp->ovep_name = vxlan_ident;
+ ovrp->ovep_ops = &vxlan_o_ops;
+ ovrp->ovep_id_size = VXLAN_ID_LEN;
+ ovrp->ovep_flags = OVEP_F_VLAN_TAG;
+ ovrp->ovep_dest = OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT;
+ ovrp->ovep_props = vxlan_props;
+
+ if ((err = overlay_plugin_register(ovrp)) == 0) {
+ if ((err = mod_install(&vxlan_modlinkage)) != 0) {
+ (void) overlay_plugin_unregister(vxlan_ident);
+ }
+ }
+
+ overlay_plugin_free(ovrp);
+ return (err);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&vxlan_modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+ int err;
+
+ if ((err = overlay_plugin_unregister(vxlan_ident)) != 0)
+ return (err);
+
+ return (mod_remove(&vxlan_modlinkage));
+}
diff --git a/usr/src/uts/common/io/pciex/hotplug/pciehpc.c b/usr/src/uts/common/io/pciex/hotplug/pciehpc.c
index 5ce219bd2f..3e4beda495 100644
--- a/usr/src/uts/common/io/pciex/hotplug/pciehpc.c
+++ b/usr/src/uts/common/io/pciex/hotplug/pciehpc.c
@@ -396,6 +396,21 @@ pciehpc_intr(dev_info_t *dip)
control & ~PCIE_SLOTCTL_PWR_FAULT_EN);
/*
+ * If supported, notify the child device driver that the
+ * device is being removed.
+ */
+ dev_info_t *cdip = ddi_get_child(dip);
+ if (cdip != NULL) {
+ ddi_eventcookie_t rm_cookie;
+ if (ddi_get_eventcookie(cdip,
+ DDI_DEVI_REMOVE_EVENT,
+ &rm_cookie) == DDI_SUCCESS) {
+ ndi_post_event(dip, cdip, rm_cookie,
+ NULL);
+ }
+ }
+
+ /*
* Ask DDI Hotplug framework to change state to Empty
*/
(void) ndi_hp_state_change_req(dip,
diff --git a/usr/src/uts/common/io/pciex/pcie.c b/usr/src/uts/common/io/pciex/pcie.c
index 22f191943c..35a0190be7 100644
--- a/usr/src/uts/common/io/pciex/pcie.c
+++ b/usr/src/uts/common/io/pciex/pcie.c
@@ -845,6 +845,13 @@ pcie_init_pfd(dev_info_t *dip)
PCIE_ZALLOC(pf_pcix_ecc_regs_t);
}
}
+
+ PCIE_SLOT_REG(pfd_p) = PCIE_ZALLOC(pf_pcie_slot_regs_t);
+ PCIE_SLOT_REG(pfd_p)->pcie_slot_regs_valid = B_FALSE;
+ PCIE_SLOT_REG(pfd_p)->pcie_slot_cap = 0;
+ PCIE_SLOT_REG(pfd_p)->pcie_slot_control = 0;
+ PCIE_SLOT_REG(pfd_p)->pcie_slot_status = 0;
+
} else if (PCIE_IS_PCIX(bus_p)) {
if (PCIE_IS_BDG(bus_p)) {
PCIX_BDG_ERR_REG(pfd_p) =
diff --git a/usr/src/uts/common/io/pciex/pcie_fault.c b/usr/src/uts/common/io/pciex/pcie_fault.c
index 90563c1d1a..de558a9fc6 100644
--- a/usr/src/uts/common/io/pciex/pcie_fault.c
+++ b/usr/src/uts/common/io/pciex/pcie_fault.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2017, Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
#include <sys/sysmacros.h>
@@ -200,7 +200,7 @@ pf_eh_exit(pcie_bus_t *bus_p)
* for the root_pfd_p.
*
* "Root Complexes" such as NPE and PX should call scan_fabric using itself as
- * the rdip. PCIe Root ports should call pf_scan_fabric using it's parent as
+ * the rdip. PCIe Root ports should call pf_scan_fabric using its parent as
* the rdip.
*
* Scan fabric initiated from RCs are likely due to a fabric message, traps or
@@ -587,6 +587,35 @@ pf_pcie_regs_gather(pf_data_t *pfd_p, pcie_bus_t *bus_p)
PCIE_ROOTCTL);
}
+ /*
+ * For eligible components, we gather Slot Register state.
+ *
+ * Eligible components are:
+ * - a Downstream Port or a Root Port with the Slot Implemented
+ * capability bit set
+ * - hotplug capable
+ *
+ * Slot register state is useful, for instance, to determine whether the
+ * Slot's child device is physically present (via the Slot Status
+ * register).
+ */
+ if ((PCIE_IS_SWD(bus_p) || PCIE_IS_ROOT(bus_p)) &&
+ PCIE_IS_HOTPLUG_ENABLED(PCIE_BUS2DIP(bus_p))) {
+ pf_pcie_slot_regs_t *pcie_slot_regs = PCIE_SLOT_REG(pfd_p);
+ pcie_slot_regs->pcie_slot_cap = PCIE_CAP_GET(32, bus_p,
+ PCIE_SLOTCAP);
+ pcie_slot_regs->pcie_slot_control = PCIE_CAP_GET(16, bus_p,
+ PCIE_SLOTCTL);
+ pcie_slot_regs->pcie_slot_status = PCIE_CAP_GET(16, bus_p,
+ PCIE_SLOTSTS);
+
+ if (pcie_slot_regs->pcie_slot_cap != PCI_EINVAL32 &&
+ pcie_slot_regs->pcie_slot_control != PCI_EINVAL16 &&
+ pcie_slot_regs->pcie_slot_status != PCI_EINVAL16) {
+ pcie_slot_regs->pcie_slot_regs_valid = B_TRUE;
+ }
+ }
+
if (!PCIE_HAS_AER(bus_p))
return;
@@ -838,7 +867,7 @@ pf_pci_find_rp_fault(pf_data_t *pfd_p, pcie_bus_t *bus_p)
* Check to see if an error has been received that
* requires a scan of the fabric. Count the number of
* faults seen. If MUL CE/FE_NFE that counts for
- * atleast 2 faults, so just return with full_scan.
+ * at least 2 faults, so just return with full_scan.
*/
if ((root_err & PCIE_AER_RE_STS_MUL_CE_RCVD) ||
(root_err & PCIE_AER_RE_STS_MUL_FE_NFE_RCVD)) {
@@ -1232,7 +1261,7 @@ const pf_fab_err_tbl_t pcie_rp_tbl[] = {
{PCIE_AER_UCE_FCP, pf_panic,
PF_AFFECTED_SELF | PF_AFFECTED_CHILDREN, 0},
- {PCIE_AER_UCE_TO, pf_panic,
+ {PCIE_AER_UCE_TO, pf_analyse_to,
PF_AFFECTED_ADDR, PF_AFFECTED_CHILDREN},
{PCIE_AER_UCE_CA, pf_no_panic,
@@ -1916,16 +1945,35 @@ pf_analyse_sc(ddi_fm_error_t *derr, uint32_t bit, pf_data_t *dq_head_p,
/*
* PCIe Timeout error analyser. This error can be forgiven if it is marked as
* CE Advisory. If it is marked as advisory, this means the HW can recover
- * and/or retry the transaction automatically.
+ * and/or retry the transaction automatically. Additionally, if a device's
+ * parent slot reports that it is no longer physically present, we do not panic,
+ * as one would not expect a missing device to respond to a command.
*/
/* ARGSUSED */
static int
pf_analyse_to(ddi_fm_error_t *derr, uint32_t bit, pf_data_t *dq_head_p,
pf_data_t *pfd_p)
{
+ dev_info_t *rpdip = PCIE_PFD2BUS(pfd_p)->bus_rp_dip;
+ pf_data_t *rppfd = PCIE_DIP2PFD(rpdip);
+ pf_pcie_slot_regs_t *p_pcie_slot_regs;
+
if (HAS_AER_LOGS(pfd_p, bit) && CE_ADVISORY(pfd_p))
return (PF_ERR_NO_PANIC);
+ p_pcie_slot_regs = PCIE_SLOT_REG(rppfd);
+ if (p_pcie_slot_regs->pcie_slot_regs_valid) {
+ /*
+ * If the device is reported gone from its parent slot, then it
+ * is expected that any outstanding commands would time out. In
+ * this case, do not panic.
+ */
+ if ((p_pcie_slot_regs->pcie_slot_status &
+ PCIE_SLOTSTS_PRESENCE_DETECTED) == 0x0) {
+ return (PF_ERR_NO_PANIC);
+ }
+ }
+
return (PF_ERR_PANIC);
}
@@ -2970,6 +3018,24 @@ pf_send_ereport(ddi_fm_error_t *derr, pf_impl_t *impl)
NULL);
}
+ /*
+ * Slot Status registers
+ *
+ * Since we only gather these for certain types of components,
+ * only put these registers into the ereport if we have valid
+ * data.
+ */
+ if (PCIE_SLOT_REG(pfd_p)->pcie_slot_regs_valid) {
+ fm_payload_set(ereport,
+ "pcie_slot_cap", DATA_TYPE_UINT32,
+ PCIE_SLOT_REG(pfd_p)->pcie_slot_cap,
+ "pcie_slot_control", DATA_TYPE_UINT16,
+ PCIE_SLOT_REG(pfd_p)->pcie_slot_control,
+ "pcie_slot_status", DATA_TYPE_UINT16,
+ PCIE_SLOT_REG(pfd_p)->pcie_slot_status,
+ NULL);
+ }
+
generic:
/* IOV related information */
if (!PCIE_BDG_IS_UNASSIGNED(PCIE_PFD2BUS(impl->pf_dq_head_p))) {
diff --git a/usr/src/uts/common/io/physmem.c b/usr/src/uts/common/io/physmem.c
index 665c9eff6c..9aaf58fb7b 100644
--- a/usr/src/uts/common/io/physmem.c
+++ b/usr/src/uts/common/io/physmem.c
@@ -21,6 +21,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
*/
@@ -807,6 +808,13 @@ physmem_open(dev_t *devp, int flag, int otyp, cred_t *credp)
int ret;
static int msg_printed = 0;
+ /*
+ * This device should never be visible in a zone, but if it somehow
+ * does get created we refuse to allow the zone to use it.
+ */
+ if (crgetzoneid(credp) != GLOBAL_ZONEID)
+ return (EACCES);
+
if ((flag & (FWRITE | FREAD)) != (FWRITE | FREAD)) {
return (EINVAL);
}
diff --git a/usr/src/uts/common/io/pseudo.conf b/usr/src/uts/common/io/pseudo.conf
index 42248e93d6..08affec609 100644
--- a/usr/src/uts/common/io/pseudo.conf
+++ b/usr/src/uts/common/io/pseudo.conf
@@ -22,8 +22,7 @@
#
# Copyright 2003 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
-#
-# ident "%Z%%M% %I% %E% SMI"
+# Copyright 2014 Joyent, Inc. All rights reserved.
#
# This file is private to the pseudonex driver. It should not be edited.
#
@@ -38,3 +37,9 @@ name="pseudo" class="root" instance=0;
# /pseudo; it has as its children the zone console pseudo nodes.
#
name="zconsnex" parent="/pseudo" instance=1 valid-children="zcons";
+
+#
+# zfdnex is an alias for pseudo; this node is instantiated as a child of
+# /pseudo; it has as its children the zone fd pseudo nodes.
+#
+name="zfdnex" parent="/pseudo" instance=2 valid-children="zfd";
diff --git a/usr/src/uts/common/io/pseudonex.c b/usr/src/uts/common/io/pseudonex.c
index f83b0abf39..0ae06f88cc 100644
--- a/usr/src/uts/common/io/pseudonex.c
+++ b/usr/src/uts/common/io/pseudonex.c
@@ -83,6 +83,8 @@ static int pseudonex_detach(dev_info_t *, ddi_detach_cmd_t);
static int pseudonex_open(dev_t *, int, int, cred_t *);
static int pseudonex_close(dev_t, int, int, cred_t *);
static int pseudonex_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
+static int pseudonex_fm_init(dev_info_t *, dev_info_t *, int,
+ ddi_iblock_cookie_t *);
static int pseudonex_ctl(dev_info_t *, dev_info_t *, ddi_ctl_enum_t, void *,
void *);
@@ -90,6 +92,8 @@ static void *pseudonex_state;
typedef struct pseudonex_state {
dev_info_t *pnx_devi;
+ int pnx_fmcap;
+ ddi_iblock_cookie_t pnx_fm_ibc;
} pseudonex_state_t;
static struct bus_ops pseudonex_bus_ops = {
@@ -116,7 +120,7 @@ static struct bus_ops pseudonex_bus_ops = {
NULL, /* bus_intr_ctl */
NULL, /* bus_config */
NULL, /* bus_unconfig */
- NULL, /* bus_fm_init */
+ pseudonex_fm_init, /* bus_fm_init */
NULL, /* bus_fm_fini */
NULL, /* bus_fm_access_enter */
NULL, /* bus_fm_access_exit */
@@ -228,6 +232,9 @@ pseudonex_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
pnx_state = ddi_get_soft_state(pseudonex_state, instance);
pnx_state->pnx_devi = devi;
+ pnx_state->pnx_fmcap = DDI_FM_EREPORT_CAPABLE;
+ ddi_fm_init(devi, &pnx_state->pnx_fmcap, &pnx_state->pnx_fm_ibc);
+
if (ddi_create_minor_node(devi, "devctl", S_IFCHR, instance,
DDI_NT_NEXUS, 0) != DDI_SUCCESS) {
ddi_remove_minor_node(devi, NULL);
@@ -247,6 +254,10 @@ pseudonex_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
if (cmd == DDI_SUSPEND)
return (DDI_SUCCESS);
+ if (cmd != DDI_DETACH)
+ return (DDI_FAILURE);
+
+ ddi_fm_fini(devi);
ddi_remove_minor_node(devi, NULL);
ddi_soft_state_free(pseudonex_state, instance);
return (DDI_SUCCESS);
@@ -375,6 +386,19 @@ pseudonex_auto_assign(dev_info_t *child)
}
static int
+pseudonex_fm_init(dev_info_t *dip, dev_info_t *tdip, int cap,
+ ddi_iblock_cookie_t *ibc)
+{
+ pseudonex_state_t *pnx_state;
+
+ pnx_state = ddi_get_soft_state(pseudonex_state, ddi_get_instance(dip));
+ ASSERT(pnx_state != NULL);
+ ASSERT(ibc != NULL);
+ *ibc = pnx_state->pnx_fm_ibc;
+ return (pnx_state->pnx_fmcap & cap);
+}
+
+static int
pseudonex_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop,
void *arg, void *result)
{
diff --git a/usr/src/uts/common/io/ptm.c b/usr/src/uts/common/io/ptm.c
index 472cbf6936..d4dfe83766 100644
--- a/usr/src/uts/common/io/ptm.c
+++ b/usr/src/uts/common/io/ptm.c
@@ -449,6 +449,18 @@ ptmclose(queue_t *rqp, int flag, cred_t *credp)
return (0);
}
+static boolean_t
+ptmptsopencb(ptmptsopencb_arg_t arg)
+{
+ struct pt_ttys *ptmp = (struct pt_ttys *)arg;
+ boolean_t rval;
+
+ PT_ENTER_READ(ptmp);
+ rval = (ptmp->pt_nullmsg != NULL);
+ PT_EXIT_READ(ptmp);
+ return (rval);
+}
+
/*
* The wput procedure will only handle ioctl and flush messages.
*/
@@ -583,6 +595,41 @@ ptmwput(queue_t *qp, mblk_t *mp)
miocack(qp, mp, 0, 0);
break;
}
+ case PTMPTSOPENCB:
+ {
+ mblk_t *dp; /* ioctl reply data */
+ ptmptsopencb_t *ppocb;
+
+ /* only allow the kernel to invoke this ioctl */
+ if (iocp->ioc_cr != kcred) {
+ miocnak(qp, mp, 0, EINVAL);
+ break;
+ }
+
+ /* we don't support transparent ioctls */
+ ASSERT(iocp->ioc_count != TRANSPARENT);
+ if (iocp->ioc_count == TRANSPARENT) {
+ miocnak(qp, mp, 0, EINVAL);
+ break;
+ }
+
+ /* allocate a response message */
+ dp = allocb(sizeof (ptmptsopencb_t), BPRI_MED);
+ if (dp == NULL) {
+ miocnak(qp, mp, 0, EAGAIN);
+ break;
+ }
+
+ /* initialize the ioctl results */
+ ppocb = (ptmptsopencb_t *)dp->b_rptr;
+ ppocb->ppocb_func = ptmptsopencb;
+ ppocb->ppocb_arg = (ptmptsopencb_arg_t)ptmp;
+
+ /* send the reply data */
+ mioc2ack(mp, dp, sizeof (ptmptsopencb_t), 0);
+ qreply(qp, mp);
+ break;
+ }
}
break;
diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/iwarp_sm.jpg b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/iwarp_sm.jpg
new file mode 100644
index 0000000000..b932ffaa7c
--- /dev/null
+++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/iwarp_sm.jpg
Binary files differ
diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full-36.jpg b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full-36.jpg
new file mode 100644
index 0000000000..9421ecc0db
--- /dev/null
+++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full-36.jpg
Binary files differ
diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full.png b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full.png
new file mode 100644
index 0000000000..4b8a66761a
--- /dev/null
+++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full.png
Binary files differ
diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-logo.png b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-logo.png
new file mode 100644
index 0000000000..3254fbdc3b
--- /dev/null
+++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-logo.png
Binary files differ
diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/reg_access.jpg b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/reg_access.jpg
new file mode 100644
index 0000000000..7bb0dbf21b
--- /dev/null
+++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/reg_access.jpg
Binary files differ
diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values.bin b/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values.bin
new file mode 100644
index 0000000000..43014fd8ea
--- /dev/null
+++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values.bin
Binary files differ
diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values_zipped.bin b/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values_zipped.bin
new file mode 100644
index 0000000000..9524eb4a63
--- /dev/null
+++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values_zipped.bin
Binary files differ
diff --git a/usr/src/uts/common/io/qede/qede_list.h b/usr/src/uts/common/io/qede/qede_list.h
index 2350cb4117..656d2a915f 100644
--- a/usr/src/uts/common/io/qede/qede_list.h
+++ b/usr/src/uts/common/io/qede/qede_list.h
@@ -176,4 +176,3 @@ qede_list_splice_tail(qede_list_t *list,
#define QEDE_LIST_FOR_EACH_ENTRY_SAFE OSAL_LIST_FOR_EACH_ENTRY_SAFE
#endif /* !_QEDE_LIST_H */
-
diff --git a/usr/src/uts/common/io/qede/qede_version.h b/usr/src/uts/common/io/qede/qede_version.h
index 43584f95f0..0ee38b4338 100644
--- a/usr/src/uts/common/io/qede/qede_version.h
+++ b/usr/src/uts/common/io/qede/qede_version.h
@@ -42,4 +42,3 @@
#define REVVERSION 25
#endif /* !_QEDE_VERSION_H */
-
diff --git a/usr/src/uts/common/io/random.c b/usr/src/uts/common/io/random.c
index d79b86362c..a50bbcceec 100644
--- a/usr/src/uts/common/io/random.c
+++ b/usr/src/uts/common/io/random.c
@@ -291,6 +291,9 @@ rnd_write(dev_t dev, struct uio *uiop, cred_t *credp)
if ((error = uiomove(buf, bytes, UIO_WRITE, uiop)) != 0)
return (error);
+ if (crgetzone(credp) != global_zone)
+ continue;
+
switch (devno) {
case DEVRANDOM:
if ((error = random_add_entropy(buf, bytes, 0)) != 0)
diff --git a/usr/src/uts/common/io/rsm/rsm.c b/usr/src/uts/common/io/rsm/rsm.c
index b49d5b735a..d9d40c83fd 100644
--- a/usr/src/uts/common/io/rsm/rsm.c
+++ b/usr/src/uts/common/io/rsm/rsm.c
@@ -22,8 +22,8 @@
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright 2012 Milan Jurik. All rights reserved.
- * Copyright (c) 2016 by Delphix. All rights reserved.
* Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
*/
diff --git a/usr/src/uts/common/io/sata/adapters/ahci/ahci.c b/usr/src/uts/common/io/sata/adapters/ahci/ahci.c
index 691c03bff4..721aead599 100644
--- a/usr/src/uts/common/io/sata/adapters/ahci/ahci.c
+++ b/usr/src/uts/common/io/sata/adapters/ahci/ahci.c
@@ -10772,6 +10772,7 @@ ahci_em_ioctl_set(ahci_ctl_t *ahci_ctlp, intptr_t arg)
}
task->aelta_ctl = ahci_ctlp;
+ task->aelta_port = set.aiems_port;
task->aelta_port = (uint8_t)set.aiems_port;
task->aelta_op = set.aiems_op;
task->aelta_state = set.aiems_leds;
diff --git a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mpt_sas.conf b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mpt_sas.conf
index c6e017655e..721e66c276 100644
--- a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mpt_sas.conf
+++ b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mpt_sas.conf
@@ -21,7 +21,7 @@
#
# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
-#
+# Copyright 2019 Joyent, Inc.
#
#
@@ -49,3 +49,8 @@ ddi-vhci-class="scsi_vhci";
# name="mpt_sas" parent="/pci@7c0/pci@0/pci@9" unit-address="0" mpxio-disable="yes";
#
mpxio-disable="no";
+
+#
+# Command/target timeout checking should be done at a 1-second granularity.
+#
+scsi-watchdog-tick=1;
diff --git a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c
index 66256a2aa2..e458c61168 100644
--- a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c
+++ b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c
@@ -72,6 +72,7 @@
#include <sys/file.h>
#include <sys/policy.h>
#include <sys/model.h>
+#include <sys/refhash.h>
#include <sys/sysevent.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/dr.h>
@@ -99,7 +100,6 @@
#include <sys/scsi/adapters/mpt_sas/mptsas_var.h>
#include <sys/scsi/adapters/mpt_sas/mptsas_ioctl.h>
#include <sys/scsi/adapters/mpt_sas/mptsas_smhba.h>
-#include <sys/scsi/adapters/mpt_sas/mptsas_hash.h>
#include <sys/raidioctl.h>
#include <sys/fs/dv_node.h> /* devfs_clean */
diff --git a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_hash.c b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_hash.c
deleted file mode 100644
index 8f96c2d9f1..0000000000
--- a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_hash.c
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- */
-
-/*
- * Copyright 2014 Joyent, Inc. All rights reserved.
- */
-
-#include <sys/scsi/adapters/mpt_sas/mptsas_hash.h>
-#include <sys/sysmacros.h>
-#include <sys/types.h>
-#include <sys/kmem.h>
-#include <sys/list.h>
-#include <sys/ddi.h>
-
-#ifdef lint
-extern refhash_link_t *obj_to_link(refhash_t *, void *);
-extern void *link_to_obj(refhash_t *, refhash_link_t *);
-extern void *obj_to_tag(refhash_t *, void *);
-#else
-#define obj_to_link(_h, _o) \
- ((refhash_link_t *)(((char *)(_o)) + (_h)->rh_link_off))
-#define link_to_obj(_h, _l) \
- ((void *)(((char *)(_l)) - (_h)->rh_link_off))
-#define obj_to_tag(_h, _o) \
- ((void *)(((char *)(_o)) + (_h)->rh_tag_off))
-#endif
-
-refhash_t *
-refhash_create(uint_t bucket_count, refhash_hash_f hash,
- refhash_cmp_f cmp, refhash_dtor_f dtor, size_t obj_size, size_t link_off,
- size_t tag_off, int km_flags)
-{
- refhash_t *hp;
- uint_t i;
-
- hp = kmem_alloc(sizeof (refhash_t), km_flags);
- if (hp == NULL)
- return (NULL);
- hp->rh_buckets = kmem_zalloc(bucket_count * sizeof (list_t), km_flags);
- if (hp->rh_buckets == NULL) {
- kmem_free(hp, sizeof (refhash_t));
- return (NULL);
- }
- hp->rh_bucket_count = bucket_count;
-
- for (i = 0; i < bucket_count; i++) {
- list_create(&hp->rh_buckets[i], sizeof (refhash_link_t),
- offsetof(refhash_link_t, rhl_chain_link));
- }
- list_create(&hp->rh_objs, sizeof (refhash_link_t),
- offsetof(refhash_link_t, rhl_global_link));
-
- hp->rh_obj_size = obj_size;
- hp->rh_link_off = link_off;
- hp->rh_tag_off = tag_off;
- hp->rh_hash = hash;
- hp->rh_cmp = cmp;
- hp->rh_dtor = dtor;
-
- return (hp);
-}
-
-void
-refhash_destroy(refhash_t *hp)
-{
- ASSERT(list_is_empty(&hp->rh_objs));
-
- kmem_free(hp->rh_buckets, hp->rh_bucket_count * sizeof (list_t));
- kmem_free(hp, sizeof (refhash_t));
-}
-
-void
-refhash_insert(refhash_t *hp, void *op)
-{
- uint_t bucket;
- refhash_link_t *lp = obj_to_link(hp, op);
-
- bucket = hp->rh_hash(obj_to_tag(hp, op)) % hp->rh_bucket_count;
- list_link_init(&lp->rhl_chain_link);
- list_link_init(&lp->rhl_global_link);
- lp->rhl_flags = 0;
- lp->rhl_refcnt = 0;
- list_insert_tail(&hp->rh_buckets[bucket], lp);
- list_insert_tail(&hp->rh_objs, lp);
-}
-
-static void
-refhash_delete(refhash_t *hp, void *op)
-{
- refhash_link_t *lp = obj_to_link(hp, op);
- uint_t bucket;
-
- bucket = hp->rh_hash(obj_to_tag(hp, op)) % hp->rh_bucket_count;
- list_remove(&hp->rh_buckets[bucket], lp);
- list_remove(&hp->rh_objs, lp);
- hp->rh_dtor(op);
-}
-
-void
-refhash_remove(refhash_t *hp, void *op)
-{
- refhash_link_t *lp = obj_to_link(hp, op);
-
- if (lp->rhl_refcnt > 0) {
- lp->rhl_flags |= RHL_F_DEAD;
- } else {
- refhash_delete(hp, op);
- }
-}
-
-void *
-refhash_lookup(refhash_t *hp, const void *tp)
-{
- uint_t bucket;
- refhash_link_t *lp;
- void *op;
-
- bucket = hp->rh_hash(tp) % hp->rh_bucket_count;
- for (lp = list_head(&hp->rh_buckets[bucket]); lp != NULL;
- lp = list_next(&hp->rh_buckets[bucket], lp)) {
- op = link_to_obj(hp, lp);
- if (hp->rh_cmp(obj_to_tag(hp, op), tp) == 0 &&
- !(lp->rhl_flags & RHL_F_DEAD)) {
- return (op);
- }
- }
-
- return (NULL);
-}
-
-void *
-refhash_linear_search(refhash_t *hp, refhash_eval_f eval, void *arg)
-{
- void *op;
- refhash_link_t *lp;
-
- for (lp = list_head(&hp->rh_objs); lp != NULL;
- lp = list_next(&hp->rh_objs, lp)) {
- op = link_to_obj(hp, lp);
- if (eval(op, arg) == 0)
- return (op);
- }
-
- return (NULL);
-}
-
-void
-refhash_hold(refhash_t *hp, void *op)
-{
- refhash_link_t *lp = obj_to_link(hp, op);
-
- ++lp->rhl_refcnt;
-}
-
-void
-refhash_rele(refhash_t *hp, void *op)
-{
- refhash_link_t *lp = obj_to_link(hp, op);
-
- ASSERT(lp->rhl_refcnt > 0);
-
- if (--lp->rhl_refcnt == 0 && (lp->rhl_flags & RHL_F_DEAD))
- refhash_remove(hp, op);
-}
-
-void *
-refhash_first(refhash_t *hp)
-{
- refhash_link_t *lp;
-
- lp = list_head(&hp->rh_objs);
- if (lp == NULL)
- return (NULL);
-
- ++lp->rhl_refcnt;
-
- return (link_to_obj(hp, lp));
-}
-
-void *
-refhash_next(refhash_t *hp, void *op)
-{
- refhash_link_t *lp;
-
- lp = obj_to_link(hp, op);
- while ((lp = list_next(&hp->rh_objs, lp)) != NULL) {
- if (!(lp->rhl_flags & RHL_F_DEAD))
- break;
- }
-
- refhash_rele(hp, op);
- if (lp == NULL)
- return (NULL);
-
- ++lp->rhl_refcnt;
-
- return (link_to_obj(hp, lp));
-}
-
-boolean_t
-refhash_obj_valid(refhash_t *hp, const void *op)
-{
- /* LINTED - E_ARG_INCOMPATIBLE_WITH_ARG_L */
- const refhash_link_t *lp = obj_to_link(hp, op);
-
- return ((lp->rhl_flags & RHL_F_DEAD) != 0);
-}
diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt.c
new file mode 100644
index 0000000000..a7ef2b69d7
--- /dev/null
+++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt.c
@@ -0,0 +1,565 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2017, Joyent, Inc.
+ */
+
+#include <sys/scsi/adapters/smrt/smrt.h>
+
+static int smrt_attach(dev_info_t *, ddi_attach_cmd_t);
+static int smrt_detach(dev_info_t *, ddi_detach_cmd_t);
+static int smrt_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
+static void smrt_cleanup(smrt_t *);
+static int smrt_command_comparator(const void *, const void *);
+
+/*
+ * Controller soft state. Each entry is an object of type "smrt_t".
+ */
+void *smrt_state;
+
+/*
+ * DMA attributes template. Each controller will make a copy of this template
+ * with appropriate customisations; e.g., the Scatter/Gather List Length.
+ */
+static ddi_dma_attr_t smrt_dma_attr_template = {
+ .dma_attr_version = DMA_ATTR_V0,
+ .dma_attr_addr_lo = 0x0000000000000000,
+ .dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF,
+ .dma_attr_count_max = 0x00FFFFFF,
+ .dma_attr_align = 0x20,
+ .dma_attr_burstsizes = 0x20,
+ .dma_attr_minxfer = DMA_UNIT_8,
+ .dma_attr_maxxfer = 0xFFFFFFFF,
+ /*
+ * There is some suggestion that at least some, possibly older, Smart
+ * Array controllers cannot tolerate a DMA segment that straddles a 4GB
+ * boundary.
+ */
+ .dma_attr_seg = 0xFFFFFFFF,
+ .dma_attr_sgllen = 1,
+ .dma_attr_granular = 512,
+ .dma_attr_flags = 0
+};
+
+/*
+ * Device memory access attributes for device control registers.
+ */
+ddi_device_acc_attr_t smrt_dev_attributes = {
+ .devacc_attr_version = DDI_DEVICE_ATTR_V0,
+ .devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC,
+ .devacc_attr_dataorder = DDI_STRICTORDER_ACC,
+ .devacc_attr_access = 0
+};
+
+/*
+ * Character/Block Operations Structure
+ */
+static struct cb_ops smrt_cb_ops = {
+ .cb_rev = CB_REV,
+ .cb_flag = D_NEW | D_MP,
+
+ .cb_open = scsi_hba_open,
+ .cb_close = scsi_hba_close,
+
+ .cb_ioctl = smrt_ioctl,
+
+ .cb_strategy = nodev,
+ .cb_print = nodev,
+ .cb_dump = nodev,
+ .cb_read = nodev,
+ .cb_write = nodev,
+ .cb_devmap = nodev,
+ .cb_mmap = nodev,
+ .cb_segmap = nodev,
+ .cb_chpoll = nochpoll,
+ .cb_prop_op = ddi_prop_op,
+ .cb_str = NULL,
+ .cb_aread = nodev,
+ .cb_awrite = nodev
+};
+
+/*
+ * Device Operations Structure
+ */
+static struct dev_ops smrt_dev_ops = {
+ .devo_rev = DEVO_REV,
+ .devo_refcnt = 0,
+
+ .devo_attach = smrt_attach,
+ .devo_detach = smrt_detach,
+
+ .devo_cb_ops = &smrt_cb_ops,
+
+ .devo_getinfo = nodev,
+ .devo_identify = nulldev,
+ .devo_probe = nulldev,
+ .devo_reset = nodev,
+ .devo_bus_ops = NULL,
+ .devo_power = nodev,
+ .devo_quiesce = nodev
+};
+
+/*
+ * Linkage structures
+ */
+static struct modldrv smrt_modldrv = {
+ .drv_modops = &mod_driverops,
+ .drv_linkinfo = "HP Smart Array",
+ .drv_dev_ops = &smrt_dev_ops
+};
+
+static struct modlinkage smrt_modlinkage = {
+ .ml_rev = MODREV_1,
+ .ml_linkage = { &smrt_modldrv, NULL }
+};
+
+
+int
+_init()
+{
+ int r;
+
+ VERIFY0(ddi_soft_state_init(&smrt_state, sizeof (smrt_t), 0));
+
+ if ((r = scsi_hba_init(&smrt_modlinkage)) != 0) {
+ goto fail;
+ }
+
+ if ((r = mod_install(&smrt_modlinkage)) != 0) {
+ scsi_hba_fini(&smrt_modlinkage);
+ goto fail;
+ }
+
+ return (r);
+
+fail:
+ ddi_soft_state_fini(&smrt_state);
+ return (r);
+}
+
+int
+_fini()
+{
+ int r;
+
+ if ((r = mod_remove(&smrt_modlinkage)) == 0) {
+ scsi_hba_fini(&smrt_modlinkage);
+ ddi_soft_state_fini(&smrt_state);
+ }
+
+ return (r);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&smrt_modlinkage, modinfop));
+}
+
+static int
+smrt_iport_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ const char *addr;
+ dev_info_t *pdip;
+ int instance;
+ smrt_t *smrt;
+
+ if (cmd != DDI_ATTACH)
+ return (DDI_FAILURE);
+
+ /*
+ * Note, we cannot get to our parent via the tran's tran_hba_private
+ * member. This pointer is reset to NULL when the scsi_hba_tran_t
+ * structure is duplicated.
+ */
+ addr = scsi_hba_iport_unit_address(dip);
+ VERIFY(addr != NULL);
+ pdip = ddi_get_parent(dip);
+ instance = ddi_get_instance(pdip);
+ smrt = ddi_get_soft_state(smrt_state, instance);
+ VERIFY(smrt != NULL);
+
+ if (strcmp(addr, SMRT_IPORT_VIRT) == 0) {
+ if (smrt_logvol_hba_setup(smrt, dip) != DDI_SUCCESS)
+ return (DDI_FAILURE);
+ smrt->smrt_virt_iport = dip;
+ } else if (strcmp(addr, SMRT_IPORT_PHYS) == 0) {
+ if (smrt_phys_hba_setup(smrt, dip) != DDI_SUCCESS)
+ return (DDI_FAILURE);
+ smrt->smrt_phys_iport = dip;
+ } else {
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+static int
+smrt_iport_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ const char *addr;
+ scsi_hba_tran_t *tran;
+ smrt_t *smrt;
+
+ if (cmd != DDI_DETACH)
+ return (DDI_FAILURE);
+
+ tran = ddi_get_driver_private(dip);
+ VERIFY(tran != NULL);
+ smrt = tran->tran_hba_private;
+ VERIFY(smrt != NULL);
+
+ addr = scsi_hba_iport_unit_address(dip);
+ VERIFY(addr != NULL);
+
+ if (strcmp(addr, SMRT_IPORT_VIRT) == 0) {
+ smrt_logvol_hba_teardown(smrt, dip);
+ smrt->smrt_virt_iport = NULL;
+ } else if (strcmp(addr, SMRT_IPORT_PHYS) == 0) {
+ smrt_phys_hba_teardown(smrt, dip);
+ smrt->smrt_phys_iport = NULL;
+ } else {
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+static int
+smrt_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ uint32_t instance;
+ smrt_t *smrt;
+ boolean_t check_for_interrupts = B_FALSE;
+ int r;
+ char taskq_name[64];
+
+ if (scsi_hba_iport_unit_address(dip) != NULL)
+ return (smrt_iport_attach(dip, cmd));
+
+ if (cmd != DDI_ATTACH) {
+ return (DDI_FAILURE);
+ }
+
+ /*
+ * Allocate the per-controller soft state object and get
+ * a pointer to it.
+ */
+ instance = ddi_get_instance(dip);
+ if (ddi_soft_state_zalloc(smrt_state, instance) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "could not allocate soft state");
+ return (DDI_FAILURE);
+ }
+ if ((smrt = ddi_get_soft_state(smrt_state, instance)) == NULL) {
+ dev_err(dip, CE_WARN, "could not get soft state");
+ ddi_soft_state_free(smrt_state, instance);
+ return (DDI_FAILURE);
+ }
+
+ /*
+ * Initialise per-controller state object.
+ */
+ smrt->smrt_dip = dip;
+ smrt->smrt_instance = instance;
+ smrt->smrt_next_tag = SMRT_MIN_TAG_NUMBER;
+ list_create(&smrt->smrt_commands, sizeof (smrt_command_t),
+ offsetof(smrt_command_t, smcm_link));
+ list_create(&smrt->smrt_finishq, sizeof (smrt_command_t),
+ offsetof(smrt_command_t, smcm_link_finish));
+ list_create(&smrt->smrt_abortq, sizeof (smrt_command_t),
+ offsetof(smrt_command_t, smcm_link_abort));
+ list_create(&smrt->smrt_volumes, sizeof (smrt_volume_t),
+ offsetof(smrt_volume_t, smlv_link));
+ list_create(&smrt->smrt_physicals, sizeof (smrt_physical_t),
+ offsetof(smrt_physical_t, smpt_link));
+ list_create(&smrt->smrt_targets, sizeof (smrt_target_t),
+ offsetof(smrt_target_t, smtg_link_ctlr));
+ avl_create(&smrt->smrt_inflight, smrt_command_comparator,
+ sizeof (smrt_command_t), offsetof(smrt_command_t,
+ smcm_node));
+ cv_init(&smrt->smrt_cv_finishq, NULL, CV_DRIVER, NULL);
+
+ smrt->smrt_init_level |= SMRT_INITLEVEL_BASIC;
+
+ /*
+ * Perform basic device setup, including identifying the board, mapping
+ * the I2O registers and the Configuration Table.
+ */
+ if (smrt_device_setup(smrt) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "device setup failed");
+ goto fail;
+ }
+
+ /*
+ * Select a Transport Method (e.g. Simple or Performant) and update
+ * the Configuration Table. This function also waits for the
+ * controller to become ready.
+ */
+ if (smrt_ctlr_init(smrt) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "controller initialisation failed");
+ goto fail;
+ }
+
+ /*
+ * Each controller may have a different Scatter/Gather Element count.
+ * Configure a per-controller set of DMA attributes with the
+ * appropriate S/G size.
+ */
+ VERIFY(smrt->smrt_sg_cnt > 0);
+ smrt->smrt_dma_attr = smrt_dma_attr_template;
+ smrt->smrt_dma_attr.dma_attr_sgllen = smrt->smrt_sg_cnt;
+
+ /*
+ * Now that we have selected a Transport Method, we can configure
+ * the appropriate interrupt handlers.
+ */
+ if (smrt_interrupts_setup(smrt) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "interrupt handler setup failed");
+ goto fail;
+ }
+
+ /*
+ * Now that we have the correct interrupt priority, we can initialise
+ * the mutex. This must be done before the interrupt handler is
+ * enabled.
+ */
+ mutex_init(&smrt->smrt_mutex, NULL, MUTEX_DRIVER,
+ DDI_INTR_PRI(smrt->smrt_interrupt_pri));
+ smrt->smrt_init_level |= SMRT_INITLEVEL_MUTEX;
+
+ /*
+ * From this point forward, the controller is able to accept commands
+ * and (at least by polling) return command submissions. Setting this
+ * flag allows the rest of the driver to interact with the device.
+ */
+ smrt->smrt_status |= SMRT_CTLR_STATUS_RUNNING;
+
+ if (smrt_interrupts_enable(smrt) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "interrupt handler could not be enabled");
+ goto fail;
+ }
+
+ if (smrt_ctrl_hba_setup(smrt) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "SCSI framework setup failed");
+ goto fail;
+ }
+
+ /*
+ * Set the appropriate Interrupt Mask Register bits to start
+ * command completion interrupts from the controller.
+ */
+ smrt_intr_set(smrt, B_TRUE);
+ check_for_interrupts = B_TRUE;
+
+ /*
+ * Register the maintenance routine for periodic execution:
+ */
+ smrt->smrt_periodic = ddi_periodic_add(smrt_periodic, smrt,
+ SMRT_PERIODIC_RATE * NANOSEC, DDI_IPL_0);
+ smrt->smrt_init_level |= SMRT_INITLEVEL_PERIODIC;
+
+ (void) snprintf(taskq_name, sizeof (taskq_name), "smrt_discover_%u",
+ instance);
+ smrt->smrt_discover_taskq = ddi_taskq_create(smrt->smrt_dip, taskq_name,
+ 1, TASKQ_DEFAULTPRI, 0);
+ if (smrt->smrt_discover_taskq == NULL) {
+ dev_err(dip, CE_WARN, "failed to create discovery task queue");
+ goto fail;
+ }
+ smrt->smrt_init_level |= SMRT_INITLEVEL_TASKQ;
+
+ if ((r = smrt_event_init(smrt)) != 0) {
+ dev_err(dip, CE_WARN, "could not initialize event subsystem "
+ "(%d)", r);
+ goto fail;
+ }
+ smrt->smrt_init_level |= SMRT_INITLEVEL_ASYNC_EVENT;
+
+ if (scsi_hba_iport_register(dip, SMRT_IPORT_VIRT) != DDI_SUCCESS)
+ goto fail;
+
+ if (scsi_hba_iport_register(dip, SMRT_IPORT_PHYS) != DDI_SUCCESS)
+ goto fail;
+
+ /*
+ * Announce the attachment of this controller.
+ */
+ ddi_report_dev(dip);
+
+ return (DDI_SUCCESS);
+
+fail:
+ if (check_for_interrupts) {
+ if (smrt->smrt_stats.smrts_claimed_interrupts == 0) {
+ dev_err(dip, CE_WARN, "controller did not interrupt "
+ "during attach");
+ }
+ }
+ smrt_cleanup(smrt);
+ return (DDI_FAILURE);
+}
+
+static int
+smrt_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ scsi_hba_tran_t *tran = (scsi_hba_tran_t *)ddi_get_driver_private(dip);
+ smrt_t *smrt = (smrt_t *)tran->tran_hba_private;
+
+ if (scsi_hba_iport_unit_address(dip) != NULL)
+ return (smrt_iport_detach(dip, cmd));
+
+ if (cmd != DDI_DETACH) {
+ return (DDI_FAILURE);
+ }
+
+ /*
+ * First, check to make sure that all SCSI framework targets have
+ * detached.
+ */
+ mutex_enter(&smrt->smrt_mutex);
+ if (!list_is_empty(&smrt->smrt_targets)) {
+ mutex_exit(&smrt->smrt_mutex);
+ dev_err(smrt->smrt_dip, CE_WARN, "cannot detach; targets still "
+ "using HBA");
+ return (DDI_FAILURE);
+ }
+
+ if (smrt->smrt_virt_iport != NULL || smrt->smrt_phys_iport != NULL) {
+ mutex_exit(&smrt->smrt_mutex);
+ dev_err(smrt->smrt_dip, CE_WARN, "cannot detach: iports still "
+ "attached");
+ return (DDI_FAILURE);
+ }
+
+ /*
+ * Prevent new targets from attaching now:
+ */
+ smrt->smrt_status |= SMRT_CTLR_STATUS_DETACHING;
+ mutex_exit(&smrt->smrt_mutex);
+
+ /*
+ * Clean up all remaining resources.
+ */
+ smrt_cleanup(smrt);
+
+ return (DDI_SUCCESS);
+}
+
+static int
+smrt_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
+ int *rval)
+{
+ int inst = MINOR2INST(getminor(dev));
+ int status;
+
+ if (secpolicy_sys_config(credp, B_FALSE) != 0) {
+ return (EPERM);
+ }
+
+ /*
+ * Ensure that we have a soft state object for this instance.
+ */
+ if (ddi_get_soft_state(smrt_state, inst) == NULL) {
+ return (ENXIO);
+ }
+
+ switch (cmd) {
+ default:
+ status = scsi_hba_ioctl(dev, cmd, arg, mode, credp, rval);
+ break;
+ }
+
+ return (status);
+}
+
+static void
+smrt_cleanup(smrt_t *smrt)
+{
+ if (smrt->smrt_init_level & SMRT_INITLEVEL_ASYNC_EVENT) {
+ smrt_event_fini(smrt);
+ smrt->smrt_init_level &= ~SMRT_INITLEVEL_ASYNC_EVENT;
+ }
+
+ smrt_interrupts_teardown(smrt);
+
+ if (smrt->smrt_init_level & SMRT_INITLEVEL_TASKQ) {
+ ddi_taskq_destroy(smrt->smrt_discover_taskq);
+ smrt->smrt_discover_taskq = NULL;
+ smrt->smrt_init_level &= ~SMRT_INITLEVEL_TASKQ;
+ }
+
+ if (smrt->smrt_init_level & SMRT_INITLEVEL_PERIODIC) {
+ ddi_periodic_delete(smrt->smrt_periodic);
+ smrt->smrt_init_level &= ~SMRT_INITLEVEL_PERIODIC;
+ }
+
+ smrt_ctrl_hba_teardown(smrt);
+
+ smrt_ctlr_teardown(smrt);
+
+ smrt_device_teardown(smrt);
+
+ if (smrt->smrt_init_level & SMRT_INITLEVEL_BASIC) {
+ smrt_logvol_teardown(smrt);
+ smrt_phys_teardown(smrt);
+
+ cv_destroy(&smrt->smrt_cv_finishq);
+
+ VERIFY(list_is_empty(&smrt->smrt_commands));
+ list_destroy(&smrt->smrt_commands);
+ list_destroy(&smrt->smrt_finishq);
+ list_destroy(&smrt->smrt_abortq);
+
+ VERIFY(list_is_empty(&smrt->smrt_volumes));
+ list_destroy(&smrt->smrt_volumes);
+
+ VERIFY(list_is_empty(&smrt->smrt_physicals));
+ list_destroy(&smrt->smrt_physicals);
+
+ VERIFY(list_is_empty(&smrt->smrt_targets));
+ list_destroy(&smrt->smrt_targets);
+
+ VERIFY(avl_is_empty(&smrt->smrt_inflight));
+ avl_destroy(&smrt->smrt_inflight);
+
+ smrt->smrt_init_level &= ~SMRT_INITLEVEL_BASIC;
+ }
+
+ if (smrt->smrt_init_level & SMRT_INITLEVEL_MUTEX) {
+ mutex_destroy(&smrt->smrt_mutex);
+
+ smrt->smrt_init_level &= ~SMRT_INITLEVEL_MUTEX;
+ }
+
+ VERIFY0(smrt->smrt_init_level);
+
+ ddi_soft_state_free(smrt_state, ddi_get_instance(smrt->smrt_dip));
+}
+
+/*
+ * Comparator for the "smrt_inflight" AVL tree in a "smrt_t". This AVL tree
+ * allows a tag ID to be mapped back to the relevant "smrt_command_t".
+ */
+static int
+smrt_command_comparator(const void *lp, const void *rp)
+{
+ const smrt_command_t *l = lp;
+ const smrt_command_t *r = rp;
+
+ if (l->smcm_tag > r->smcm_tag) {
+ return (1);
+ } else if (l->smcm_tag < r->smcm_tag) {
+ return (-1);
+ } else {
+ return (0);
+ }
+}
diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt.conf b/usr/src/uts/common/io/scsi/adapters/smrt/smrt.conf
new file mode 100644
index 0000000000..758ecd0779
--- /dev/null
+++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt.conf
@@ -0,0 +1,16 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2016 Joyent, Inc.
+#
+
+scsi-no-quiesce=1;
diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss.c
new file mode 100644
index 0000000000..b4cdd5607e
--- /dev/null
+++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss.c
@@ -0,0 +1,2023 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2017, Joyent, Inc.
+ */
+
+#include <sys/scsi/adapters/smrt/smrt.h>
+
+/*
+ * Discovery, Resets, Periodics, and Events
+ * ----------------------------------------
+ *
+ * Discovery is the act of figuring out what logical and physical volumes exist
+ * under the controller. Discovery happens in response to the following events:
+ *
+ * o iports for virtual and physical devices being attached
+ * o Controller event notifications indicating potential topology changes
+ * o After a reset of the controller, before we can perform I/O again
+ *
+ * Because we have to perform discovery after a reset, which can happen during
+ * panic(), that also means that discovery may be run in panic context. We
+ * also need to emphasize the need for discovery to happen after a controller
+ * reset. Once a reset is initiated, we cannot be certain about the addresses
+ * of any of the existing targets until the reset has completed. The driver
+ * performs I/Os to addresses that the controller provides. The controller
+ * specification says that these addresses may change after a controller reset.
+ *
+ * Unfortunately, all of this combined means that making sure we can correctly
+ * run discovery is somewhat complicated. In non-panic contexts, discovery is
+ * always run from a taskq. We'll kick off the discovery in the taskq if
+ * nothing is pending at that time. The state is managed by bits in the
+ * smrt_status member of the smrt_t. There are four bits at this time:
+ *
+ * SMRT_CTLR_DISCOVERY_REQUESTED This flag indicates that something has
+ * requested that a discovery be performed.
+ * If no flags are set when this is set,
+ * then we will kick off discovery. All
+ * discovery requests are initiated via the
+ * smrt_discover_request() function.
+ *
+ * SMRT_CTLR_DISCOVERY_RUNNING This flag is set at the start of us
+ * running a discovery. It is removed when
+ * discovery finishes.
+ *
+ * SMRT_CTLR_DISCOVERY_PERIODIC This flag is set in a number of
+ * circumstances, which will be described
+ * in a subsequent section. This indicates
+ * that the periodic must kick off the
+ * discovery process.
+ *
+ * SMRT_CTLR_DISCOVERY_REQUIRED This flag indicates that at some point a
+ * controller reset occurred and we need to
+ * have a successful discovery to finish
+ * the act of resetting and allowing I/O to
+ * continue.
+ *
+ * In general, a request to discover kicks off the taskq to discover entries, if
+ * it hasn't already been requested or started. This also allows us to coalesce
+ * multiple requests, if needed. Note that if a request comes in when a
+ * discovery is ongoing, we do not kick off discovery again. Instead, we set
+ * the SMRT_CTLR_DISCOVERY_REQUESTED flag which will rerun discovery after the
+ * initial pass has completed.
+ *
+ * When a discovery starts, the first thing it does is clear the
+ * SMRT_CTLR_DISCOVERY_REQUESTED flag. This is important, because any
+ * additional requests for discovery that come in after this has started likely
+ * indicate that we've missed something. As such, when the discovery process
+ * finishes, if it sees the REQUESTED flag, then it will need to set the
+ * PERIODIC flag. The PERIODIC flag is used to indicate that we should run
+ * discovery again, but not kick if off immediately. Instead, it should be
+ * driven by the normal periodic behavior.
+ *
+ * If for some reason the act of discovery fails, or we fail to dispatch
+ * discovery due to a transient error, then we will flag PERIODIC so that the
+ * periodic tick will try and run things again.
+ *
+ * Now, we need to talk about SMRT_CTLR_DISCOVERY_REQUIRED. This flag is set
+ * after a reset occurs. The reset thread will be blocked on this.
+ * Importantly, none of the code in the discovery path can ask for a controller
+ * reset at this time. If at the end of a discovery, this flag is set, then we
+ * will signal the reset thread that it should check on its status by
+ * broadcasting on the smrt_cv_finishq. At that point, the reset thread will
+ * continue.
+ *
+ * Panic Context
+ * -------------
+ *
+ * All of this talk of threads and taskqs is well and good, but as an HBA
+ * driver, we have a serious responsibility to try and deal with panic sanely.
+ * In panic context, we will directly call the discovery functions and not poll
+ * for them to occur.
+ *
+ * However, because our discovery relies on the target maps, which aren't safe
+ * for panic context at this time, we have to take a different approach. We
+ * leverage the fact that we have a generation number stored with every
+ * discovery. If we try to do an I/O to a device where the generation doesn't
+ * match, then we know that it disappeared and should not be used. We also
+ * sanity check the model, serial numbers, and WWNs to make sure that these are
+ * the same devices. If they are, then we'll end up updating the address
+ * structures.
+ *
+ * Now, it is possible that when we were panicking, we had a thread that was in
+ * the process of running a discovery or even resetting the system. Once we're
+ * in panic, those threads aren't running, so if they didn't end up producing a
+ * new view of the world that the SCSI framework is using, then it shouldn't
+ * really matter, as we won't have updated the list of devices. Importantly,
+ * once we're in that context, we're not going to be attaching or detaching
+ * targets. If we get a request for one of these targets which has disappeared,
+ * we're going to have to end up giving up.
+ *
+ * Request Attributes
+ * ------------------
+ *
+ * The CISS specification allows for three different kinds of attributes that
+ * describe how requests are queued to the controller. These are:
+ *
+ * HEAD OF QUEUE The request should go to the head of the
+ * controller queue. This is used for resets and
+ * aborts to ensure that they're not blocked behind
+ * additional I/O.
+ *
+ * SIMPLE This queues the request for normal processing.
+ * Commands queued this way are not special with
+ * respect to one another. We use this for all I/O
+ * and discovery commands.
+ *
+ * ORDERED This attribute is used to indicate that commands
+ * should be submitted and processed in some order.
+ * This is used primarily for the event
+ * notification bits so we can ensure that at the
+ * return of a cancellation of the event
+ * notification, that any outstanding request has
+ * been honored.
+ */
+
+static int smrt_ctlr_versions(smrt_t *, uint16_t, smrt_versions_t *);
+static void smrt_discover(void *);
+
+/*
+ * The maximum number of seconds to wait for the controller to come online.
+ */
+unsigned smrt_ciss_init_time = 90;
+
+/*
+ * A tunable that determines the number of events per tick that we'll process
+ * via asynchronous event notification. If this rate is very high, then we will
+ * not submit the event and it will be picked up at the next tick of the
+ * periodic.
+ */
+uint_t smrt_event_intervention_threshold = 1000;
+
+/*
+ * Converts a LUN Address to a BMIC Identifier. The BMIC Identifier is used
+ * when performing various physical commands and generally should stay the same
+ * for a given device across inserts and removals; however, not across
+ * controller resets. These are calculated based on what the CISS specification
+ * calls the 'Level 2' target and bus, which don't have a real meaning in the
+ * SAS world otherwise.
+ */
+uint16_t
+smrt_lun_addr_to_bmic(PhysDevAddr_t *paddr)
+{
+ uint16_t id;
+
+ id = (paddr->Target[1].PeripDev.Bus - 1) << 8;
+ id += paddr->Target[1].PeripDev.Dev;
+
+ return (id);
+}
+
+void
+smrt_write_lun_addr_phys(LUNAddr_t *lun, boolean_t masked, unsigned bus,
+ unsigned target)
+{
+ lun->PhysDev.Mode = masked ? MASK_PERIPHERIAL_DEV_ADDR :
+ PERIPHERIAL_DEV_ADDR;
+
+ lun->PhysDev.TargetId = target;
+ lun->PhysDev.Bus = bus;
+
+ bzero(&lun->PhysDev.Target, sizeof (lun->PhysDev.Target));
+}
+
+/*
+ * According to the CISS Specification, the controller is always addressed in
+ * Mask Perhiperhal mode with a bus and target ID of zero. This is used by
+ * commands that need to write to the controller itself, which is generally
+ * discovery and other commands.
+ */
+void
+smrt_write_controller_lun_addr(LUNAddr_t *lun)
+{
+ smrt_write_lun_addr_phys(lun, B_TRUE, 0, 0);
+}
+
+void
+smrt_write_message_common(smrt_command_t *smcm, uint8_t type, int timeout_secs)
+{
+ switch (type) {
+ case CISS_MSG_ABORT:
+ case CISS_MSG_RESET:
+ case CISS_MSG_NOP:
+ break;
+
+ default:
+ panic("unknown message type");
+ }
+
+ smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_MSG;
+ smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_HEADOFQUEUE;
+ smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_NONE;
+ smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout_secs);
+ smcm->smcm_va_cmd->Request.CDBLen = CISS_CDBLEN;
+ smcm->smcm_va_cmd->Request.CDB[0] = type;
+}
+
+void
+smrt_write_message_abort_one(smrt_command_t *smcm, uint32_t tag)
+{
+ smrt_tag_t cisstag;
+
+ /*
+ * When aborting a particular command, the request is addressed
+ * to the controller.
+ */
+ smrt_write_lun_addr_phys(&smcm->smcm_va_cmd->Header.LUN,
+ B_TRUE, 0, 0);
+
+ smrt_write_message_common(smcm, CISS_MSG_ABORT, 0);
+
+ /*
+ * Abort a single command.
+ */
+ smcm->smcm_va_cmd->Request.CDB[1] = CISS_ABORT_TASK;
+
+ /*
+ * The CISS Specification says that the tag value for a task-level
+ * abort should be in the CDB in bytes 4-11.
+ */
+ bzero(&cisstag, sizeof (cisstag));
+ cisstag.tag_value = tag;
+ bcopy(&cisstag, &smcm->smcm_va_cmd->Request.CDB[4],
+ sizeof (cisstag));
+}
+
+void
+smrt_write_message_abort_all(smrt_command_t *smcm, LUNAddr_t *addr)
+{
+ /*
+ * When aborting all tasks for a particular Logical Volume,
+ * the command is addressed not to the controller but to
+ * the Volume itself.
+ */
+ smcm->smcm_va_cmd->Header.LUN = *addr;
+
+ smrt_write_message_common(smcm, CISS_MSG_ABORT, 0);
+
+ /*
+ * Abort all commands for a particular Logical Volume.
+ */
+ smcm->smcm_va_cmd->Request.CDB[1] = CISS_ABORT_TASKSET;
+}
+
+void
+smrt_write_message_event_notify(smrt_command_t *smcm)
+{
+ smrt_event_notify_req_t senr;
+
+ smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN);
+
+ smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD;
+ smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_ORDERED;
+ smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ;
+ smcm->smcm_va_cmd->Request.Timeout = 0;
+ smcm->smcm_va_cmd->Request.CDBLen = sizeof (senr);
+
+ bzero(&senr, sizeof (senr));
+ senr.senr_opcode = CISS_SCMD_READ;
+ senr.senr_subcode = CISS_BMIC_NOTIFY_ON_EVENT;
+ senr.senr_flags = BE_32(0);
+ senr.senr_size = BE_32(SMRT_EVENT_NOTIFY_BUFLEN);
+
+ bcopy(&senr, &smcm->smcm_va_cmd->Request.CDB[0],
+ MIN(CISS_CDBLEN, sizeof (senr)));
+}
+
+void
+smrt_write_message_cancel_event_notify(smrt_command_t *smcm)
+{
+ smrt_event_notify_req_t senr;
+
+ smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN);
+
+ smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD;
+ smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_ORDERED;
+ smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_WRITE;
+ smcm->smcm_va_cmd->Request.Timeout = LE_16(SMRT_ASYNC_CANCEL_TIMEOUT);
+ smcm->smcm_va_cmd->Request.CDBLen = sizeof (senr);
+
+ bzero(&senr, sizeof (senr));
+ senr.senr_opcode = CISS_SCMD_WRITE;
+ senr.senr_subcode = CISS_BMIC_NOTIFY_ON_EVENT_CANCEL;
+ senr.senr_size = BE_32(SMRT_EVENT_NOTIFY_BUFLEN);
+
+ bcopy(&senr, &smcm->smcm_va_cmd->Request.CDB[0],
+ MIN(CISS_CDBLEN, sizeof (senr)));
+}
+
+void
+smrt_write_message_reset_ctlr(smrt_command_t *smcm)
+{
+ smrt_write_lun_addr_phys(&smcm->smcm_va_cmd->Header.LUN,
+ B_TRUE, 0, 0);
+
+ smrt_write_message_common(smcm, CISS_MSG_RESET, 0);
+
+ smcm->smcm_va_cmd->Request.CDB[1] = CISS_RESET_CTLR;
+}
+
+void
+smrt_write_message_nop(smrt_command_t *smcm, int timeout_secs)
+{
+ /*
+ * No-op messages are always sent to the controller.
+ */
+ smrt_write_lun_addr_phys(&smcm->smcm_va_cmd->Header.LUN,
+ B_TRUE, 0, 0);
+
+ smrt_write_message_common(smcm, CISS_MSG_NOP, timeout_secs);
+}
+
+/*
+ * This routine is executed regularly by ddi_periodic_add(9F). It checks the
+ * health of the controller and looks for submitted commands that have timed
+ * out.
+ */
+void
+smrt_periodic(void *arg)
+{
+ smrt_t *smrt = arg;
+
+ mutex_enter(&smrt->smrt_mutex);
+
+ /*
+ * Before we even check if the controller is running to process
+ * everything else, we must first check if we had a request to kick off
+ * discovery. We do this before the check if the controller is running,
+ * as this may be required to finish a discovery.
+ */
+ if ((smrt->smrt_status & SMRT_CTLR_DISCOVERY_PERIODIC) != 0 &&
+ (smrt->smrt_status & SMRT_CTLR_DISCOVERY_RUNNING) == 0 &&
+ (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) == 0) {
+ if (ddi_taskq_dispatch(smrt->smrt_discover_taskq,
+ smrt_discover, smrt, DDI_NOSLEEP) != DDI_SUCCESS) {
+ smrt->smrt_stats.smrts_discovery_tq_errors++;
+ } else {
+ smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_PERIODIC;
+ }
+ }
+
+ if (!(smrt->smrt_status & SMRT_CTLR_STATUS_RUNNING)) {
+ /*
+ * The device is currently not active, e.g. due to an
+ * in-progress controller reset.
+ */
+ mutex_exit(&smrt->smrt_mutex);
+ return;
+ }
+
+ /*
+ * Check on the health of the controller firmware. Note that if the
+ * controller has locked up, this routine will panic the system.
+ */
+ smrt_lockup_check(smrt);
+
+ /*
+ * Reset the event notification threshold counter.
+ */
+ smrt->smrt_event_count = 0;
+
+ /*
+ * Check inflight commands to see if they have timed out.
+ */
+ for (smrt_command_t *smcm = avl_first(&smrt->smrt_inflight);
+ smcm != NULL; smcm = AVL_NEXT(&smrt->smrt_inflight, smcm)) {
+ if (smcm->smcm_status & SMRT_CMD_STATUS_POLLED) {
+ /*
+ * Polled commands are timed out by the polling
+ * routine.
+ */
+ continue;
+ }
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_ABORT_SENT) {
+ /*
+ * This command has been aborted; either it will
+ * complete or the controller will be reset.
+ */
+ continue;
+ }
+
+ if (list_link_active(&smcm->smcm_link_abort)) {
+ /*
+ * Already on the abort queue.
+ */
+ continue;
+ }
+
+ if (smcm->smcm_expiry == 0) {
+ /*
+ * This command has no expiry time.
+ */
+ continue;
+ }
+
+ if (gethrtime() > smcm->smcm_expiry) {
+ list_insert_tail(&smrt->smrt_abortq, smcm);
+ smcm->smcm_status |= SMRT_CMD_STATUS_TIMEOUT;
+ }
+ }
+
+ /*
+ * Process the abort queue.
+ */
+ (void) smrt_process_abortq(smrt);
+
+ /*
+ * Check if we have an outstanding event intervention request. Note,
+ * the command in question should always be in a state such that it is
+ * usable by the system here. The command is always prepared again by
+ * the normal event notification path, even if a reset has occurred.
+ * The reset will be processed before we'd ever consider running an
+ * event again. Note, if we fail to submit this, then we leave this for
+ * the next occurrence of the periodic.
+ */
+ if (smrt->smrt_status & SMRT_CTLR_ASYNC_INTERVENTION) {
+ smrt->smrt_stats.smrts_events_intervened++;
+
+ if (smrt_submit(smrt, smrt->smrt_event_cmd) == 0) {
+ smrt->smrt_status &= ~SMRT_CTLR_ASYNC_INTERVENTION;
+ }
+ }
+
+ mutex_exit(&smrt->smrt_mutex);
+}
+
+int
+smrt_retrieve(smrt_t *smrt)
+{
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ switch (smrt->smrt_ctlr_mode) {
+ case SMRT_CTLR_MODE_SIMPLE:
+ smrt_retrieve_simple(smrt);
+ return (DDI_SUCCESS);
+
+ case SMRT_CTLR_MODE_UNKNOWN:
+ break;
+ }
+
+ panic("unknown controller mode");
+ /* LINTED: E_FUNC_NO_RET_VAL */
+}
+
+/*
+ * Grab a new tag number for this command. We aim to avoid reusing tag numbers
+ * as much as possible, so as to avoid spurious double completion from the
+ * controller.
+ */
+static void
+smrt_set_new_tag(smrt_t *smrt, smrt_command_t *smcm)
+{
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ /*
+ * Loop until we find a tag that is not in use. The tag space is
+ * very large (~30 bits) and the maximum number of inflight commands
+ * is comparatively small (~1024 in current controllers).
+ */
+ for (;;) {
+ uint32_t new_tag = smrt->smrt_next_tag;
+
+ if (++smrt->smrt_next_tag > SMRT_MAX_TAG_NUMBER) {
+ smrt->smrt_next_tag = SMRT_MIN_TAG_NUMBER;
+ }
+
+ if (smrt_lookup_inflight(smrt, new_tag) != NULL) {
+ /*
+ * This tag is already used on an inflight command.
+ * Choose another.
+ */
+ continue;
+ }
+
+ /*
+ * Set the tag for the command and also write it into the
+ * appropriate part of the request block.
+ */
+ smcm->smcm_tag = new_tag;
+ smcm->smcm_va_cmd->Header.Tag.tag_value = new_tag;
+ return;
+ }
+}
+
+/*
+ * Submit a command to the controller.
+ */
+int
+smrt_submit(smrt_t *smrt, smrt_command_t *smcm)
+{
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+ VERIFY(smcm->smcm_type != SMRT_CMDTYPE_PREINIT);
+
+ /*
+ * Anything that asks us to ignore the running state of the controller
+ * must be wired up to poll for completion.
+ */
+ if (smcm->smcm_status & SMRT_CMD_IGNORE_RUNNING) {
+ VERIFY(smcm->smcm_status & SMRT_CMD_STATUS_POLLED);
+ }
+
+ /*
+ * If the controller is currently being reset, do not allow command
+ * submission. However, if this is one of the commands needed to finish
+ * reset, as indicated on the command structure, allow it.
+ */
+ if (!(smrt->smrt_status & SMRT_CTLR_STATUS_RUNNING) &&
+ !(smcm->smcm_status & SMRT_CMD_IGNORE_RUNNING)) {
+ return (EIO);
+ }
+
+ /*
+ * Do not allow submission of more concurrent commands than the
+ * controller supports.
+ */
+ if (avl_numnodes(&smrt->smrt_inflight) >= smrt->smrt_maxcmds) {
+ return (EAGAIN);
+ }
+
+ /*
+ * Synchronise the Command Block DMA resources to ensure that the
+ * device has a consistent view before we pass it the command.
+ */
+ if (ddi_dma_sync(smcm->smcm_contig.smdma_dma_handle, 0, 0,
+ DDI_DMA_SYNC_FORDEV) != DDI_SUCCESS) {
+ dev_err(smrt->smrt_dip, CE_PANIC, "DMA sync failure");
+ return (EIO);
+ }
+
+ /*
+ * Ensure that this command is not re-used without issuing a new
+ * tag number and performing any appropriate cleanup.
+ */
+ VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_USED));
+ smcm->smcm_status |= SMRT_CMD_STATUS_USED;
+
+ /*
+ * Assign a tag that is not currently in use
+ */
+ smrt_set_new_tag(smrt, smcm);
+
+ /*
+ * Insert this command into the inflight AVL.
+ */
+ avl_index_t where;
+ if (avl_find(&smrt->smrt_inflight, smcm, &where) != NULL) {
+ dev_err(smrt->smrt_dip, CE_PANIC, "duplicate submit tag %x",
+ smcm->smcm_tag);
+ }
+ avl_insert(&smrt->smrt_inflight, smcm, where);
+ if (smrt->smrt_stats.smrts_max_inflight <
+ avl_numnodes(&smrt->smrt_inflight)) {
+ smrt->smrt_stats.smrts_max_inflight =
+ avl_numnodes(&smrt->smrt_inflight);
+ }
+
+ VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT));
+ smcm->smcm_status |= SMRT_CMD_STATUS_INFLIGHT;
+
+ smcm->smcm_time_submit = gethrtime();
+
+ switch (smrt->smrt_ctlr_mode) {
+ case SMRT_CTLR_MODE_SIMPLE:
+ smrt_submit_simple(smrt, smcm);
+ return (0);
+
+ case SMRT_CTLR_MODE_UNKNOWN:
+ break;
+ }
+ panic("unknown controller mode");
+ /* LINTED: E_FUNC_NO_RET_VAL */
+}
+
+static void
+smrt_process_finishq_sync(smrt_command_t *smcm)
+{
+ smrt_t *smrt = smcm->smcm_ctlr;
+
+ if (ddi_dma_sync(smcm->smcm_contig.smdma_dma_handle, 0, 0,
+ DDI_DMA_SYNC_FORCPU) != DDI_SUCCESS) {
+ dev_err(smrt->smrt_dip, CE_PANIC, "finishq DMA sync failure");
+ }
+}
+
+static void
+smrt_process_finishq_one(smrt_command_t *smcm)
+{
+ smrt_t *smrt = smcm->smcm_ctlr;
+
+ VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_COMPLETE));
+ smcm->smcm_status |= SMRT_CMD_STATUS_COMPLETE;
+
+ switch (smcm->smcm_type) {
+ case SMRT_CMDTYPE_INTERNAL:
+ cv_broadcast(&smcm->smcm_ctlr->smrt_cv_finishq);
+ return;
+
+ case SMRT_CMDTYPE_SCSA:
+ smrt_hba_complete(smcm);
+ return;
+
+ case SMRT_CMDTYPE_EVENT:
+ smrt_event_complete(smcm);
+ return;
+
+ case SMRT_CMDTYPE_ABORTQ:
+ /*
+ * Abort messages sent as part of abort queue processing
+ * do not require any completion activity.
+ */
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_free(smcm);
+ mutex_enter(&smrt->smrt_mutex);
+ return;
+
+ case SMRT_CMDTYPE_PREINIT:
+ dev_err(smrt->smrt_dip, CE_PANIC, "preinit command "
+ "completed after initialisation");
+ return;
+ }
+
+ panic("unknown command type");
+}
+
+/*
+ * Process commands in the completion queue.
+ */
+void
+smrt_process_finishq(smrt_t *smrt)
+{
+ smrt_command_t *smcm;
+
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ while ((smcm = list_remove_head(&smrt->smrt_finishq)) != NULL) {
+ /*
+ * Synchronise the Command Block before we read from it or
+ * free it, to ensure that any writes from the controller are
+ * visible.
+ */
+ smrt_process_finishq_sync(smcm);
+
+ /*
+ * Check if this command was in line to be aborted.
+ */
+ if (list_link_active(&smcm->smcm_link_abort)) {
+ /*
+ * This command was in line, but the controller
+ * subsequently completed the command before we
+ * were able to do so.
+ */
+ list_remove(&smrt->smrt_abortq, smcm);
+ smcm->smcm_status &= ~SMRT_CMD_STATUS_TIMEOUT;
+ }
+
+ /*
+ * Check if this command has been abandoned by the original
+ * submitter. If it has, free it now to avoid a leak.
+ */
+ if (smcm->smcm_status & SMRT_CMD_STATUS_ABANDONED) {
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_free(smcm);
+ mutex_enter(&smrt->smrt_mutex);
+ continue;
+ }
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_POLLED) {
+ /*
+ * This command will be picked up and processed
+ * by "smrt_poll_for()" once the CV is triggered
+ * at the end of processing.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_POLL_COMPLETE;
+ continue;
+ }
+
+ smrt_process_finishq_one(smcm);
+ }
+
+ cv_broadcast(&smrt->smrt_cv_finishq);
+}
+
+/*
+ * Process commands in the abort queue.
+ */
+void
+smrt_process_abortq(smrt_t *smrt)
+{
+ smrt_command_t *smcm;
+ smrt_command_t *abort_smcm = NULL;
+
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ if (list_is_empty(&smrt->smrt_abortq)) {
+ goto out;
+ }
+
+another:
+ mutex_exit(&smrt->smrt_mutex);
+ if ((abort_smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_ABORTQ,
+ KM_NOSLEEP)) == NULL) {
+ /*
+ * No resources available to send abort messages. We will
+ * try again the next time around.
+ */
+ mutex_enter(&smrt->smrt_mutex);
+ goto out;
+ }
+ mutex_enter(&smrt->smrt_mutex);
+
+ while ((smcm = list_remove_head(&smrt->smrt_abortq)) != NULL) {
+ if (!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT)) {
+ /*
+ * This message is not currently inflight, so
+ * no abort is needed.
+ */
+ continue;
+ }
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_ABORT_SENT) {
+ /*
+ * An abort message has already been sent for
+ * this command.
+ */
+ continue;
+ }
+
+ /*
+ * Send an abort message for the command.
+ */
+ smrt_write_message_abort_one(abort_smcm, smcm->smcm_tag);
+ if (smrt_submit(smrt, abort_smcm) != 0) {
+ /*
+ * The command could not be submitted to the
+ * controller. Put it back in the abort queue
+ * and give up for now.
+ */
+ list_insert_head(&smrt->smrt_abortq, smcm);
+ goto out;
+ }
+ smcm->smcm_status |= SMRT_CMD_STATUS_ABORT_SENT;
+
+ /*
+ * Record some debugging information about the abort we
+ * sent:
+ */
+ smcm->smcm_abort_time = gethrtime();
+ smcm->smcm_abort_tag = abort_smcm->smcm_tag;
+
+ /*
+ * The abort message was sent. Release it and
+ * allocate another command.
+ */
+ abort_smcm = NULL;
+ goto another;
+ }
+
+out:
+ cv_broadcast(&smrt->smrt_cv_finishq);
+ if (abort_smcm != NULL) {
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_free(abort_smcm);
+ mutex_enter(&smrt->smrt_mutex);
+ }
+}
+
+int
+smrt_poll_for(smrt_t *smrt, smrt_command_t *smcm)
+{
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+ VERIFY(smcm->smcm_status & SMRT_CMD_STATUS_POLLED);
+
+ while (!(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE)) {
+ if (smcm->smcm_expiry != 0) {
+ /*
+ * This command has an expiry time. Check to see
+ * if it has already passed:
+ */
+ if (smcm->smcm_expiry < gethrtime()) {
+ return (ETIMEDOUT);
+ }
+ }
+
+ if (ddi_in_panic()) {
+ /*
+ * When the system is panicking, there are no
+ * interrupts or other threads. Drive the polling loop
+ * on our own, but with a small delay to avoid
+ * aggrevating the controller while we're trying to
+ * dump.
+ */
+ (void) smrt_retrieve(smrt);
+ smrt_process_finishq(smrt);
+ drv_usecwait(100);
+ continue;
+ }
+
+ /*
+ * Wait for command completion to return through the regular
+ * interrupt handling path.
+ */
+ if (smcm->smcm_expiry == 0) {
+ cv_wait(&smrt->smrt_cv_finishq, &smrt->smrt_mutex);
+ } else {
+ /*
+ * Wait only until the expiry time for this command.
+ */
+ (void) cv_timedwait_sig_hrtime(&smrt->smrt_cv_finishq,
+ &smrt->smrt_mutex, smcm->smcm_expiry);
+ }
+ }
+
+ /*
+ * Fire the completion callback for this command. The callback
+ * is responsible for freeing the command, so it may not be
+ * referenced again once this call returns.
+ */
+ smrt_process_finishq_one(smcm);
+
+ return (0);
+}
+
+void
+smrt_intr_set(smrt_t *smrt, boolean_t enabled)
+{
+ /*
+ * Read the Interrupt Mask Register.
+ */
+ uint32_t imr = smrt_get32(smrt, CISS_I2O_INTERRUPT_MASK);
+
+ switch (smrt->smrt_ctlr_mode) {
+ case SMRT_CTLR_MODE_SIMPLE:
+ if (enabled) {
+ imr &= ~CISS_IMR_BIT_SIMPLE_INTR_DISABLE;
+ } else {
+ imr |= CISS_IMR_BIT_SIMPLE_INTR_DISABLE;
+ }
+ smrt_put32(smrt, CISS_I2O_INTERRUPT_MASK, imr);
+ return;
+
+ case SMRT_CTLR_MODE_UNKNOWN:
+ break;
+ }
+ panic("unknown controller mode");
+}
+
+/*
+ * Signal to the controller that we have updated the Configuration Table by
+ * writing to the Inbound Doorbell Register. The controller will, after some
+ * number of seconds, acknowledge this by clearing the bit.
+ *
+ * If successful, return DDI_SUCCESS. If the controller takes too long to
+ * acknowledge, return DDI_FAILURE.
+ */
+int
+smrt_cfgtbl_flush(smrt_t *smrt)
+{
+ /*
+ * Read the current value of the Inbound Doorbell Register.
+ */
+ uint32_t idr = smrt_get32(smrt, CISS_I2O_INBOUND_DOORBELL);
+
+ /*
+ * Signal the Configuration Table change to the controller.
+ */
+ idr |= CISS_IDR_BIT_CFGTBL_CHANGE;
+ smrt_put32(smrt, CISS_I2O_INBOUND_DOORBELL, idr);
+
+ /*
+ * Wait for the controller to acknowledge the change.
+ */
+ for (unsigned i = 0; i < smrt_ciss_init_time; i++) {
+ idr = smrt_get32(smrt, CISS_I2O_INBOUND_DOORBELL);
+
+ if ((idr & CISS_IDR_BIT_CFGTBL_CHANGE) == 0) {
+ return (DDI_SUCCESS);
+ }
+
+ /*
+ * Wait for one second before trying again.
+ */
+ delay(drv_usectohz(1000000));
+ }
+
+ dev_err(smrt->smrt_dip, CE_WARN, "time out expired before controller "
+ "configuration completed");
+ return (DDI_FAILURE);
+}
+
+int
+smrt_cfgtbl_transport_has_support(smrt_t *smrt, int xport)
+{
+ VERIFY(xport == CISS_CFGTBL_XPORT_SIMPLE);
+
+ /*
+ * Read the current value of the "Supported Transport Methods" field in
+ * the Configuration Table.
+ */
+ uint32_t xport_active = ddi_get32(smrt->smrt_ct_handle,
+ &smrt->smrt_ct->TransportSupport);
+
+ /*
+ * Check that the desired transport method is supported by the
+ * controller:
+ */
+ if ((xport_active & xport) == 0) {
+ dev_err(smrt->smrt_dip, CE_WARN, "controller does not support "
+ "method \"%s\"", xport == CISS_CFGTBL_XPORT_SIMPLE ?
+ "simple" : "performant");
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+void
+smrt_cfgtbl_transport_set(smrt_t *smrt, int xport)
+{
+ VERIFY(xport == CISS_CFGTBL_XPORT_SIMPLE);
+
+ ddi_put32(smrt->smrt_ct_handle, &smrt->smrt_ct->TransportRequest,
+ xport);
+}
+
+int
+smrt_cfgtbl_transport_confirm(smrt_t *smrt, int xport)
+{
+ VERIFY(xport == CISS_CFGTBL_XPORT_SIMPLE);
+
+ /*
+ * Read the current value of the TransportActive field in the
+ * Configuration Table.
+ */
+ uint32_t xport_active = ddi_get32(smrt->smrt_ct_handle,
+ &smrt->smrt_ct->TransportActive);
+
+ /*
+ * Check that the desired transport method is now active:
+ */
+ if ((xport_active & xport) == 0) {
+ dev_err(smrt->smrt_dip, CE_WARN, "failed to enable transport "
+ "method \"%s\"", xport == CISS_CFGTBL_XPORT_SIMPLE ?
+ "simple" : "performant");
+ return (DDI_FAILURE);
+ }
+
+ /*
+ * Ensure that the controller is now ready to accept commands.
+ */
+ if ((xport_active & CISS_CFGTBL_READY_FOR_COMMANDS) == 0) {
+ dev_err(smrt->smrt_dip, CE_WARN, "controller not ready to "
+ "accept commands");
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+uint32_t
+smrt_ctlr_get_maxsgelements(smrt_t *smrt)
+{
+ return (ddi_get32(smrt->smrt_ct_handle, &smrt->smrt_ct->MaxSGElements));
+}
+
+uint32_t
+smrt_ctlr_get_cmdsoutmax(smrt_t *smrt)
+{
+ return (ddi_get32(smrt->smrt_ct_handle, &smrt->smrt_ct->CmdsOutMax));
+}
+
+static uint32_t
+smrt_ctlr_get_hostdrvsup(smrt_t *smrt)
+{
+ return (ddi_get32(smrt->smrt_ct_handle,
+ &smrt->smrt_ct->HostDrvrSupport));
+}
+
+int
+smrt_ctlr_init(smrt_t *smrt)
+{
+ uint8_t signature[4] = { 'C', 'I', 'S', 'S' };
+ int e;
+
+ if ((e = smrt_ctlr_wait_for_state(smrt,
+ SMRT_WAIT_STATE_READY)) != DDI_SUCCESS) {
+ return (e);
+ }
+
+ /*
+ * The configuration table contains an ASCII signature ("CISS") which
+ * should be checked as we initialise the controller.
+ * See: "9.1 Configuration Table" in CISS Specification.
+ */
+ for (unsigned i = 0; i < 4; i++) {
+ if (ddi_get8(smrt->smrt_ct_handle,
+ &smrt->smrt_ct->Signature[i]) != signature[i]) {
+ dev_err(smrt->smrt_dip, CE_WARN, "invalid signature "
+ "detected");
+ return (DDI_FAILURE);
+ }
+ }
+
+ /*
+ * Initialise an appropriate Transport Method. For now, this driver
+ * only supports the "Simple" method.
+ */
+ if ((e = smrt_ctlr_init_simple(smrt)) != DDI_SUCCESS) {
+ return (e);
+ }
+
+ /*
+ * Save some common feature support bitfields.
+ */
+ smrt->smrt_host_support = smrt_ctlr_get_hostdrvsup(smrt);
+ smrt->smrt_bus_support = ddi_get32(smrt->smrt_ct_handle,
+ &smrt->smrt_ct->BusTypes);
+
+ /*
+ * Read initial controller heartbeat value and mark the current
+ * reading time.
+ */
+ smrt->smrt_last_heartbeat = ddi_get32(smrt->smrt_ct_handle,
+ &smrt->smrt_ct->HeartBeat);
+ smrt->smrt_last_heartbeat_time = gethrtime();
+
+ /*
+ * Determine the firmware version of the controller so that we can
+ * select which type of interrupts to use.
+ */
+ if ((e = smrt_ctlr_versions(smrt, SMRT_DISCOVER_TIMEOUT,
+ &smrt->smrt_versions)) != 0) {
+ dev_err(smrt->smrt_dip, CE_WARN, "could not identify "
+ "controller (%d)", e);
+ return (DDI_FAILURE);
+ }
+
+ dev_err(smrt->smrt_dip, CE_NOTE, "!firmware rev %s",
+ smrt->smrt_versions.smrtv_firmware_rev);
+
+ return (DDI_SUCCESS);
+}
+
+void
+smrt_ctlr_teardown(smrt_t *smrt)
+{
+ smrt->smrt_status &= ~SMRT_CTLR_STATUS_RUNNING;
+
+ switch (smrt->smrt_ctlr_mode) {
+ case SMRT_CTLR_MODE_SIMPLE:
+ smrt_ctlr_teardown_simple(smrt);
+ return;
+
+ case SMRT_CTLR_MODE_UNKNOWN:
+ return;
+ }
+
+ panic("unknown controller mode");
+}
+
+int
+smrt_ctlr_wait_for_state(smrt_t *smrt, smrt_wait_state_t state)
+{
+ unsigned wait_usec = 100 * 1000;
+ unsigned wait_count = SMRT_WAIT_DELAY_SECONDS * 1000000 / wait_usec;
+
+ VERIFY(state == SMRT_WAIT_STATE_READY ||
+ state == SMRT_WAIT_STATE_UNREADY);
+
+ /*
+ * Read from the Scratchpad Register until the expected ready signature
+ * is detected. This behaviour is not described in the CISS
+ * specification.
+ *
+ * If the device is not in the desired state immediately, sleep for a
+ * second and try again. If the device has not become ready in 300
+ * seconds, give up.
+ */
+ for (unsigned i = 0; i < wait_count; i++) {
+ uint32_t spr = smrt_get32(smrt, CISS_I2O_SCRATCHPAD);
+
+ switch (state) {
+ case SMRT_WAIT_STATE_READY:
+ if (spr == CISS_SCRATCHPAD_INITIALISED) {
+ return (DDI_SUCCESS);
+ }
+ break;
+
+ case SMRT_WAIT_STATE_UNREADY:
+ if (spr != CISS_SCRATCHPAD_INITIALISED) {
+ return (DDI_SUCCESS);
+ }
+ break;
+ }
+
+ if (ddi_in_panic()) {
+ /*
+ * There is no sleep for the panicking, so we
+ * must spin wait:
+ */
+ drv_usecwait(wait_usec);
+ } else {
+ /*
+ * Wait for a quarter second and try again.
+ */
+ delay(drv_usectohz(wait_usec));
+ }
+ }
+
+ dev_err(smrt->smrt_dip, CE_WARN, "time out waiting for controller "
+ "to enter state \"%s\"", state == SMRT_WAIT_STATE_READY ?
+ "ready": "unready");
+ return (DDI_FAILURE);
+}
+
+void
+smrt_lockup_check(smrt_t *smrt)
+{
+ /*
+ * Read the current controller heartbeat value.
+ */
+ uint32_t heartbeat = ddi_get32(smrt->smrt_ct_handle,
+ &smrt->smrt_ct->HeartBeat);
+
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ /*
+ * Check to see if the value is the same as last time we looked:
+ */
+ if (heartbeat != smrt->smrt_last_heartbeat) {
+ /*
+ * The heartbeat value has changed, which suggests that the
+ * firmware in the controller has not yet come to a complete
+ * stop. Record the new value, as well as the current time.
+ */
+ smrt->smrt_last_heartbeat = heartbeat;
+ smrt->smrt_last_heartbeat_time = gethrtime();
+ return;
+ }
+
+ /*
+ * The controller _might_ have been able to signal to us that is
+ * has locked up. This is a truly unfathomable state of affairs:
+ * If the firmware can tell it has flown off the rails, why not
+ * simply reset the controller?
+ */
+ uint32_t odr = smrt_get32(smrt, CISS_I2O_OUTBOUND_DOORBELL_STATUS);
+ uint32_t spr = smrt_get32(smrt, CISS_I2O_SCRATCHPAD);
+ if ((odr & CISS_ODR_BIT_LOCKUP) != 0) {
+ dev_err(smrt->smrt_dip, CE_PANIC, "HP SmartArray firmware has "
+ "reported a critical fault (odr %08x spr %08x)",
+ odr, spr);
+ }
+
+ if (gethrtime() > smrt->smrt_last_heartbeat_time + 60 * NANOSEC) {
+ dev_err(smrt->smrt_dip, CE_PANIC, "HP SmartArray firmware has "
+ "stopped responding (odr %08x spr %08x)",
+ odr, spr);
+ }
+}
+
+/*
+ * Probe the controller with the IDENTIFY CONTROLLER request. This is a BMIC
+ * command, so it must be submitted to the controller and we must poll for its
+ * completion. This functionality is only presently used during controller
+ * initialisation, so it uses the special pre-initialisation path for command
+ * allocation and submission.
+ */
+static int
+smrt_ctlr_identify(smrt_t *smrt, uint16_t timeout,
+ smrt_identify_controller_t *resp)
+{
+ smrt_command_t *smcm;
+ smrt_identify_controller_req_t smicr;
+ int r;
+ size_t sz;
+
+ /*
+ * Allocate a command with a data buffer; the controller will fill it
+ * with identification information. There is some suggestion in the
+ * firmware-level specification that the buffer length should be a
+ * multiple of 512 bytes for some controllers, so we round up.
+ */
+ sz = P2ROUNDUP_TYPED(sizeof (*resp), 512, size_t);
+ if ((smcm = smrt_command_alloc_preinit(smrt, sz, KM_SLEEP)) == NULL) {
+ return (ENOMEM);
+ }
+
+ smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN);
+
+ smcm->smcm_va_cmd->Request.CDBLen = sizeof (smicr);
+ smcm->smcm_va_cmd->Request.Timeout = timeout;
+ smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD;
+ smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE;
+ smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ;
+
+ /*
+ * Construct the IDENTIFY CONTROLLER request CDB. Note that any
+ * reserved fields in the request must be filled with zeroes.
+ */
+ bzero(&smicr, sizeof (smicr));
+ smicr.smicr_opcode = CISS_SCMD_BMIC_READ;
+ smicr.smicr_lun = 0;
+ smicr.smicr_command = CISS_BMIC_IDENTIFY_CONTROLLER;
+ bcopy(&smicr, &smcm->smcm_va_cmd->Request.CDB[0],
+ MIN(CISS_CDBLEN, sizeof (smicr)));
+
+ /*
+ * Send the command to the device and poll for its completion.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
+ smcm->smcm_expiry = gethrtime() + timeout * NANOSEC;
+ if ((r = smrt_preinit_command_simple(smrt, smcm)) != 0) {
+ VERIFY3S(r, ==, ETIMEDOUT);
+ VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE);
+
+ /*
+ * This command timed out, but the driver is not presently
+ * initialised to the point where we can try to abort it.
+ * The command was created with the PREINIT type, so it
+ * does not appear in the global command tracking list.
+ * In order to avoid problems with DMA from the controller,
+ * we have to leak the command allocation.
+ */
+ smcm = NULL;
+ goto out;
+ }
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) {
+ /*
+ * The controller was reset while we were trying to identify
+ * it. Report failure.
+ */
+ r = EIO;
+ goto out;
+ }
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) {
+ ErrorInfo_t *ei = smcm->smcm_va_err;
+
+ if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) {
+ dev_err(smrt->smrt_dip, CE_WARN, "identify "
+ "controller error: status 0x%x",
+ ei->CommandStatus);
+ r = EIO;
+ goto out;
+ }
+ }
+
+ if (resp != NULL) {
+ /*
+ * Copy the identify response out for the caller.
+ */
+ bcopy(smcm->smcm_internal->smcmi_va, resp, sizeof (*resp));
+ }
+
+ r = 0;
+
+out:
+ if (smcm != NULL) {
+ smrt_command_free(smcm);
+ }
+ return (r);
+}
+
+/*
+ * The firmware versions in an IDENTIFY CONTROLLER response generally take
+ * the form of a four byte ASCII string containing a dotted decimal version
+ * number; e.g., "8.00".
+ *
+ * This function sanitises the firmware version, replacing unexpected
+ * values with a question mark.
+ */
+static void
+smrt_copy_firmware_version(uint8_t *src, char *dst)
+{
+ for (unsigned i = 0; i < 4; i++) {
+ /*
+ * Make sure that this is a 7-bit clean ASCII value.
+ */
+ char c = src[i] <= 0x7f ? (char)(src[i] & 0x7f) : '?';
+
+ if (isalnum(c) || c == '.' || c == ' ') {
+ dst[i] = c;
+ } else {
+ dst[i] = '?';
+ }
+ }
+ dst[4] = '\0';
+}
+
+/*
+ * Using an IDENTIFY CONTROLLER request, determine firmware and controller
+ * version details. See the comments for "smrt_ctlr_identify()" for more
+ * details about calling context.
+ */
+static int
+smrt_ctlr_versions(smrt_t *smrt, uint16_t timeout, smrt_versions_t *smrtv)
+{
+ smrt_identify_controller_t smic;
+ int r;
+
+ if ((r = smrt_ctlr_identify(smrt, timeout, &smic)) != 0) {
+ return (r);
+ }
+
+ smrtv->smrtv_hardware_version = smic.smic_hardware_version;
+ smrt_copy_firmware_version(smic.smic_firmware_rev,
+ smrtv->smrtv_firmware_rev);
+ smrt_copy_firmware_version(smic.smic_recovery_rev,
+ smrtv->smrtv_recovery_rev);
+ smrt_copy_firmware_version(smic.smic_bootblock_rev,
+ smrtv->smrtv_bootblock_rev);
+
+ return (0);
+}
+
+int
+smrt_ctlr_reset(smrt_t *smrt)
+{
+ smrt_command_t *smcm, *smcm_nop;
+ int r;
+
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ if (ddi_in_panic()) {
+ goto skip_check;
+ }
+
+ if (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) {
+ /*
+ * Don't pile on. One reset is enough. Wait until
+ * it's complete, and then return success.
+ */
+ while (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) {
+ cv_wait(&smrt->smrt_cv_finishq, &smrt->smrt_mutex);
+ }
+ return (0);
+ }
+ smrt->smrt_status |= SMRT_CTLR_STATUS_RESETTING;
+ smrt->smrt_last_reset_start = gethrtime();
+ smrt->smrt_stats.smrts_ctlr_resets++;
+
+skip_check:
+ /*
+ * Allocate two commands: one for the soft reset message, which we
+ * cannot free until the controller has reset; and one for the ping we
+ * will use to determine when it is once again functional.
+ */
+ mutex_exit(&smrt->smrt_mutex);
+ if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL,
+ KM_NOSLEEP)) == NULL) {
+ mutex_enter(&smrt->smrt_mutex);
+ return (ENOMEM);
+ }
+ if ((smcm_nop = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL,
+ KM_NOSLEEP)) == NULL) {
+ smrt_command_free(smcm);
+ mutex_enter(&smrt->smrt_mutex);
+ return (ENOMEM);
+ }
+ mutex_enter(&smrt->smrt_mutex);
+
+ /*
+ * Send a soft reset command to the controller. If this command
+ * succeeds, there will likely be no completion notification. Instead,
+ * the device should become unavailable for some period of time and
+ * then become available again. Once available again, we know the soft
+ * reset has completed and should abort all in-flight commands.
+ */
+ smrt_write_message_reset_ctlr(smcm);
+
+ /*
+ * Disable interrupts now.
+ */
+ smrt_intr_set(smrt, B_FALSE);
+
+ dev_err(smrt->smrt_dip, CE_WARN, "attempting controller soft reset");
+ smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
+ if ((r = smrt_submit(smrt, smcm)) != 0) {
+ dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: "
+ "submit failed (%d)", r);
+ }
+
+ /*
+ * Mark every currently inflight command as being reset, including the
+ * soft reset command we just sent. Once we confirm the reset works,
+ * we can safely report that these commands have failed.
+ */
+ for (smrt_command_t *t = avl_first(&smrt->smrt_inflight);
+ t != NULL; t = AVL_NEXT(&smrt->smrt_inflight, t)) {
+ t->smcm_status |= SMRT_CMD_STATUS_RESET_SENT;
+ }
+
+ /*
+ * Now that we have submitted our soft reset command, prevent
+ * the rest of the driver from interacting with the controller.
+ */
+ smrt->smrt_status &= ~SMRT_CTLR_STATUS_RUNNING;
+
+ /*
+ * We do not expect a completion from the controller for our soft
+ * reset command, but we also cannot remove it from the inflight
+ * list until we know the controller has actually reset. To do
+ * otherwise would potentially allow the controller to scribble
+ * on the memory we were using.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED;
+
+ if (smrt_ctlr_wait_for_state(smrt, SMRT_WAIT_STATE_UNREADY) !=
+ DDI_SUCCESS) {
+ dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: "
+ "controller did not become unready");
+ }
+ dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: controller unready");
+
+ if (smrt_ctlr_wait_for_state(smrt, SMRT_WAIT_STATE_READY) !=
+ DDI_SUCCESS) {
+ dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: "
+ "controller did not come become ready");
+ }
+ dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: controller ready");
+
+ /*
+ * In at least the Smart Array P420i, the controller can take 30-45
+ * seconds after the scratchpad register shows it as being available
+ * before it is ready to receive commands. In order to avoid hitting
+ * it too early with our post-reset ping, we will sleep for 10 seconds
+ * here.
+ */
+ if (ddi_in_panic()) {
+ drv_usecwait(10 * MICROSEC);
+ } else {
+ delay(drv_usectohz(10 * MICROSEC));
+ }
+
+ smrt_ctlr_teardown(smrt);
+ if (smrt_ctlr_init(smrt) != DDI_SUCCESS) {
+ dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: "
+ "controller transport could not be configured");
+ }
+ dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: controller configured");
+
+ smrt_write_message_nop(smcm_nop, 0);
+ smcm_nop->smcm_status |= SMRT_CMD_STATUS_POLLED |
+ SMRT_CMD_IGNORE_RUNNING;
+ if ((r = smrt_submit(smrt, smcm_nop)) != 0) {
+ dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: "
+ "ping could not be submitted (%d)", r);
+ }
+
+ /*
+ * Interrupts are still masked at this stage. Poll manually in
+ * a way that will not trigger regular finish queue processing:
+ */
+ VERIFY(smcm_nop->smcm_status & SMRT_CMD_STATUS_INFLIGHT);
+ for (unsigned i = 0; i < 600; i++) {
+ smrt_retrieve_simple(smrt);
+
+ if (!(smcm_nop->smcm_status & SMRT_CMD_STATUS_INFLIGHT)) {
+ /*
+ * Remove the ping command from the finish queue and
+ * process it manually. This processing must mirror
+ * what would have been done in smrt_process_finishq().
+ */
+ VERIFY(list_link_active(&smcm_nop->smcm_link_finish));
+ list_remove(&smrt->smrt_finishq, smcm_nop);
+ smrt_process_finishq_sync(smcm_nop);
+ smcm_nop->smcm_status |= SMRT_CMD_STATUS_POLL_COMPLETE;
+ smrt_process_finishq_one(smcm_nop);
+ break;
+ }
+
+ if (ddi_in_panic()) {
+ drv_usecwait(100 * 1000);
+ } else {
+ delay(drv_usectohz(100 * 1000));
+ }
+ }
+
+ if (!(smcm_nop->smcm_status & SMRT_CMD_STATUS_COMPLETE)) {
+ dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: "
+ "ping did not complete");
+ } else if (smcm_nop->smcm_status & SMRT_CMD_STATUS_ERROR) {
+ dev_err(smrt->smrt_dip, CE_WARN, "soft reset: ping completed "
+ "in error (status %u)",
+ (unsigned)smcm_nop->smcm_va_err->CommandStatus);
+ } else {
+ dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: ping completed");
+ }
+
+ /*
+ * Now that the controller is working again, we can abort any
+ * commands that were inflight during the reset.
+ */
+ smrt_command_t *nt;
+ for (smrt_command_t *t = avl_first(&smrt->smrt_inflight);
+ t != NULL; t = nt) {
+ nt = AVL_NEXT(&smrt->smrt_inflight, t);
+
+ if (t->smcm_status & SMRT_CMD_STATUS_RESET_SENT) {
+ avl_remove(&smrt->smrt_inflight, t);
+ t->smcm_status &= ~SMRT_CMD_STATUS_INFLIGHT;
+
+ list_insert_tail(&smrt->smrt_finishq, t);
+ }
+ }
+
+ /*
+ * Quiesce our discovery thread. Note, because
+ * SMRT_CTLR_STATUS_RESTARTING is set, nothing can cause it to be
+ * enabled again.
+ */
+ if (!ddi_in_panic()) {
+ mutex_exit(&smrt->smrt_mutex);
+ ddi_taskq_wait(smrt->smrt_discover_taskq);
+ mutex_enter(&smrt->smrt_mutex);
+ }
+
+ /*
+ * Re-enable interrupts. Now, we must kick off a discovery to make sure
+ * that the system is in a sane state and that we can perform I/O.
+ */
+ smrt_intr_set(smrt, B_TRUE);
+ smrt->smrt_status &= ~SMRT_CTLR_STATUS_RESETTING;
+ smrt->smrt_status |= SMRT_CTLR_DISCOVERY_REQUIRED;
+
+ /*
+ * Attempt a discovery to make sure that the drivers sees a realistic
+ * view of the world. If we're not in panic context, spin for the
+ * asynchronous process to complete, otherwise we're in panic context
+ * and this is going to happen regardless if we want it to or not.
+ * Before we kick off the request to run discovery, we reset the
+ * discovery request flags as we know that nothing else can consider
+ * running discovery and we don't want to delay until the next smrt
+ * periodic tick if we can avoid it. In panic context, if this failed,
+ * then we won't make it back.
+ */
+ VERIFY0(smrt->smrt_status & SMRT_CTLR_DISCOVERY_RUNNING);
+ smrt->smrt_status &= ~(SMRT_CTLR_DISCOVERY_MASK);
+ smrt_discover(smrt);
+ if (!ddi_in_panic()) {
+ while (smrt->smrt_status & SMRT_CTLR_DISCOVERY_REQUIRED) {
+ cv_wait(&smrt->smrt_cv_finishq, &smrt->smrt_mutex);
+ }
+ }
+
+ smrt->smrt_status |= SMRT_CTLR_STATUS_RUNNING;
+ smrt->smrt_last_reset_finish = gethrtime();
+
+ /*
+ * Wake anybody that was waiting for the reset to complete.
+ */
+ cv_broadcast(&smrt->smrt_cv_finishq);
+
+ /*
+ * Process the completion queue one last time before we let go
+ * of the mutex.
+ */
+ smrt_process_finishq(smrt);
+
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_free(smcm_nop);
+ mutex_enter(&smrt->smrt_mutex);
+ return (0);
+}
+
+int
+smrt_event_init(smrt_t *smrt)
+{
+ int ret;
+ smrt_command_t *event, *cancel;
+
+ event = smrt_command_alloc(smrt, SMRT_CMDTYPE_EVENT, KM_NOSLEEP);
+ if (event == NULL)
+ return (ENOMEM);
+ if (smrt_command_attach_internal(smrt, event, SMRT_EVENT_NOTIFY_BUFLEN,
+ KM_NOSLEEP) != 0) {
+ smrt_command_free(event);
+ return (ENOMEM);
+ }
+ smrt_write_message_event_notify(event);
+
+ cancel = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, KM_NOSLEEP);
+ if (cancel == NULL) {
+ smrt_command_free(event);
+ return (ENOMEM);
+ }
+ if (smrt_command_attach_internal(smrt, cancel, SMRT_EVENT_NOTIFY_BUFLEN,
+ KM_NOSLEEP) != 0) {
+ smrt_command_free(event);
+ smrt_command_free(cancel);
+ return (ENOMEM);
+ }
+ smrt_write_message_cancel_event_notify(cancel);
+
+ cv_init(&smrt->smrt_event_queue, NULL, CV_DRIVER, NULL);
+
+ mutex_enter(&smrt->smrt_mutex);
+ if ((ret = smrt_submit(smrt, event)) != 0) {
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_free(event);
+ smrt_command_free(cancel);
+ return (ret);
+ }
+
+ smrt->smrt_event_cmd = event;
+ smrt->smrt_event_cancel_cmd = cancel;
+ mutex_exit(&smrt->smrt_mutex);
+
+ return (0);
+}
+
+void
+smrt_event_complete(smrt_command_t *smcm)
+{
+ smrt_event_notify_t *sen;
+ boolean_t log, rescan;
+
+ boolean_t intervene = B_FALSE;
+ smrt_t *smrt = smcm->smcm_ctlr;
+
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+ VERIFY3P(smcm, ==, smrt->smrt_event_cmd);
+ VERIFY0(smrt->smrt_status & SMRT_CTLR_ASYNC_INTERVENTION);
+
+ smrt->smrt_stats.smrts_events_received++;
+
+ if (smrt->smrt_status & SMRT_CTLR_STATUS_DETACHING) {
+ cv_signal(&smrt->smrt_event_queue);
+ return;
+ }
+
+ if (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) {
+ intervene = B_TRUE;
+ goto clean;
+ }
+
+ /*
+ * The event notification command failed for some reason. Attempt to
+ * drive on and try again at the next intervention period. Because this
+ * may represent a programmer error (though it's hard to know), we wait
+ * until the next intervention period and don't panic.
+ */
+ if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) {
+ ErrorInfo_t *ei = smcm->smcm_va_err;
+ intervene = B_TRUE;
+
+ smrt->smrt_stats.smrts_events_errors++;
+ dev_err(smrt->smrt_dip, CE_WARN, "!event notification request "
+ "error: status 0x%x", ei->CommandStatus);
+ goto clean;
+ }
+
+ sen = smcm->smcm_internal->smcmi_va;
+ log = rescan = B_FALSE;
+ switch (sen->sen_class) {
+ case SMRT_EVENT_CLASS_PROTOCOL:
+ /*
+ * Most of the event protocol class events aren't really
+ * actionable. However, subclass 1 indicates errors. Today,
+ * the only error is an event overflow. If there's an event
+ * overflow, then we must assume that we need to rescan.
+ */
+ if (sen->sen_subclass == SMRT_EVENT_PROTOCOL_SUBCLASS_ERROR) {
+ rescan = B_TRUE;
+ }
+ break;
+ case SMRT_EVENT_CLASS_HOTPLUG:
+ /*
+ * We want to log all hotplug events. However we only need to
+ * scan these if the subclass indicates the event is for a disk.
+ */
+ log = B_TRUE;
+ if (sen->sen_subclass == SMRT_EVENT_HOTPLUG_SUBCLASS_DRIVE) {
+ rescan = B_TRUE;
+ }
+ break;
+ case SMRT_EVENT_CLASS_HWERROR:
+ case SMRT_EVENT_CLASS_ENVIRONMENT:
+ log = B_TRUE;
+ break;
+ case SMRT_EVENT_CLASS_PHYS:
+ log = B_TRUE;
+ /*
+ * This subclass indicates some change for physical drives. As
+ * such, this should trigger a rescan.
+ */
+ if (sen->sen_subclass == SMRT_EVENT_PHYS_SUBCLASS_STATE) {
+ rescan = B_TRUE;
+ }
+ break;
+ case SMRT_EVENT_CLASS_LOGVOL:
+ rescan = B_TRUE;
+ log = B_TRUE;
+ break;
+ default:
+ /*
+ * While there are other classes of events, it's hard to say how
+ * actionable they are for the moment. If we revamp this such
+ * that it becomes an ireport based system, then we should just
+ * always log these. We opt not to at the moment to try and be
+ * kind to the system log.
+ */
+ break;
+ }
+
+ /*
+ * Ideally, this would be an ireport that we could pass onto
+ * administrators; however, since we don't have any way to generate
+ * that, we provide a subset of the event information.
+ */
+ if (log) {
+ const char *rmsg;
+ if (rescan == B_TRUE) {
+ rmsg = "rescanning";
+ } else {
+ rmsg = "not rescanning";
+ }
+ if (sen->sen_message[0] != '\0') {
+ sen->sen_message[sizeof (sen->sen_message) - 1] = '\0';
+ dev_err(smrt->smrt_dip, CE_NOTE, "!controller event "
+ "class/sub-class/detail %x, %x, %x: %s; %s devices",
+ sen->sen_class, sen->sen_subclass, sen->sen_detail,
+ sen->sen_message, rmsg);
+ } else {
+ dev_err(smrt->smrt_dip, CE_NOTE, "!controller event "
+ "class/sub-class/detail %x, %x, %x; %s devices",
+ sen->sen_class, sen->sen_subclass, sen->sen_detail,
+ rmsg);
+ }
+ }
+
+ if (rescan)
+ smrt_discover_request(smrt);
+
+clean:
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_reuse(smcm);
+ bzero(smcm->smcm_internal->smcmi_va, SMRT_EVENT_NOTIFY_BUFLEN);
+ mutex_enter(&smrt->smrt_mutex);
+
+ /*
+ * Make sure we're not _now_ detaching or resetting.
+ */
+ if (smrt->smrt_status & SMRT_CTLR_STATUS_DETACHING) {
+ cv_signal(&smrt->smrt_event_queue);
+ return;
+ }
+
+ if ((smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) != 0 ||
+ intervene == B_TRUE) {
+ smrt->smrt_status |= SMRT_CTLR_ASYNC_INTERVENTION;
+ return;
+ }
+
+ /*
+ * Check out command count per tick. If it's too high, leave it for
+ * intervention to solve. Likely there is some serious driver or
+ * firmware error going on.
+ */
+ smrt->smrt_event_count++;
+ if (smrt->smrt_event_count > smrt_event_intervention_threshold) {
+ smrt->smrt_status |= SMRT_CTLR_ASYNC_INTERVENTION;
+ return;
+ }
+
+ if (smrt_submit(smrt, smcm) != 0) {
+ smrt->smrt_status |= SMRT_CTLR_ASYNC_INTERVENTION;
+ }
+}
+
+void
+smrt_event_fini(smrt_t *smrt)
+{
+ int ret;
+ smrt_command_t *event, *cancel;
+ mutex_enter(&smrt->smrt_mutex);
+
+ /*
+ * If intervention has been requested, there is nothing for us to do. We
+ * clear the flag so nothing else accidentally sees this and takes
+ * action. We also don't need to bother sending a cancellation request,
+ * as there is no outstanding event.
+ */
+ if (smrt->smrt_status & SMRT_CTLR_ASYNC_INTERVENTION) {
+ smrt->smrt_status &= ~SMRT_CTLR_ASYNC_INTERVENTION;
+ goto free;
+ }
+
+ /*
+ * Submit a cancel request for the event notification queue. Because we
+ * submit both the cancel event and the regular notification event as an
+ * ordered command, we know that by the time this completes, that the
+ * existing one will have completed.
+ */
+ smrt->smrt_event_cancel_cmd->smcm_status |= SMRT_CMD_STATUS_POLLED;
+ if ((ret = smrt_submit(smrt, smrt->smrt_event_cancel_cmd)) != 0) {
+ /*
+ * This is unfortunate. We've failed to submit the command. At
+ * this point all we can do is reset the device. If the reset
+ * succeeds, we're done and we can clear all the memory. If it
+ * fails, then all we can do is just leak the command and scream
+ * to the system, sorry.
+ */
+ if (smrt_ctlr_reset(smrt) != 0) {
+ dev_err(smrt->smrt_dip, CE_WARN, "failed to reset "
+ "device after failure to submit cancellation "
+ "(%d), abandoning smrt_command_t at address %p",
+ ret, smrt->smrt_event_cmd);
+ smrt->smrt_event_cmd = NULL;
+ goto free;
+ }
+ }
+
+ smrt->smrt_event_cancel_cmd->smcm_expiry = gethrtime() +
+ SMRT_ASYNC_CANCEL_TIMEOUT * NANOSEC;
+ if ((ret = smrt_poll_for(smrt, smrt->smrt_event_cancel_cmd)) != 0) {
+ VERIFY3S(ret, ==, ETIMEDOUT);
+ VERIFY0(smrt->smrt_event_cancel_cmd->smcm_status &
+ SMRT_CMD_STATUS_POLL_COMPLETE);
+
+ /*
+ * The command timed out. All we can do is hope a reset will
+ * work.
+ */
+ if (smrt_ctlr_reset(smrt) != 0) {
+ dev_err(smrt->smrt_dip, CE_WARN, "failed to reset "
+ "device after failure to poll for async "
+ "cancellation command abandoning smrt_command_t "
+ "event command at address %p and cancellation "
+ "command at %p", smrt->smrt_event_cmd,
+ smrt->smrt_event_cancel_cmd);
+ smrt->smrt_event_cmd = NULL;
+ smrt->smrt_event_cancel_cmd = NULL;
+ goto free;
+ }
+
+ }
+
+ /*
+ * Well, in the end, it's results that count.
+ */
+ if (smrt->smrt_event_cancel_cmd->smcm_status &
+ SMRT_CMD_STATUS_RESET_SENT) {
+ goto free;
+ }
+
+ if (smrt->smrt_event_cancel_cmd->smcm_status & SMRT_CMD_STATUS_ERROR) {
+ ErrorInfo_t *ei = smrt->smrt_event_cancel_cmd->smcm_va_err;
+
+ /*
+ * This can return a CISS_CMD_TARGET_STATUS entry when the
+ * controller doesn't think a command is outstanding. It is
+ * possible we raced, so don't think too much about that case.
+ * Anything else leaves us between a rock and a hard place, the
+ * only way out is a reset.
+ */
+ if (ei->CommandStatus != CISS_CMD_TARGET_STATUS &&
+ smrt_ctlr_reset(smrt) != 0) {
+ dev_err(smrt->smrt_dip, CE_WARN, "failed to reset "
+ "device after receiving an error on the async "
+ "cancellation command (%d); abandoning "
+ "smrt_command_t event command at address %p and "
+ "cancellation command at %p", ei->CommandStatus,
+ smrt->smrt_event_cmd, smrt->smrt_event_cancel_cmd);
+ smrt->smrt_event_cmd = NULL;
+ smrt->smrt_event_cancel_cmd = NULL;
+ goto free;
+ }
+ }
+
+free:
+ event = smrt->smrt_event_cmd;
+ smrt->smrt_event_cmd = NULL;
+ cancel = smrt->smrt_event_cancel_cmd;
+ smrt->smrt_event_cancel_cmd = NULL;
+ mutex_exit(&smrt->smrt_mutex);
+ if (event != NULL)
+ smrt_command_free(event);
+ if (cancel != NULL)
+ smrt_command_free(cancel);
+ cv_destroy(&smrt->smrt_event_queue);
+}
+
+/*
+ * We've been asked to do a discovery in panic context. This would have
+ * occurred because there was a device reset. Because we can't rely on the
+ * target maps, all we can do at the moment is go over all the active targets
+ * and note which ones no longer exist. If this target was required to dump,
+ * then the dump code will encounter a fatal error. If not, then we should
+ * count ourselves surprisingly lucky.
+ */
+static void
+smrt_discover_panic_check(smrt_t *smrt)
+{
+ smrt_target_t *smtg;
+
+ ASSERT(MUTEX_HELD(&smrt->smrt_mutex));
+ for (smtg = list_head(&smrt->smrt_targets); smtg != NULL;
+ smtg = list_next(&smrt->smrt_targets, smtg)) {
+ uint64_t gen;
+
+ if (smtg->smtg_physical) {
+ smrt_physical_t *smpt = smtg->smtg_lun.smtg_phys;
+ /*
+ * Don't worry about drives that aren't visible.
+ */
+ if (!smpt->smpt_visible)
+ continue;
+ gen = smpt->smpt_gen;
+ } else {
+ smrt_volume_t *smlv = smtg->smtg_lun.smtg_vol;
+ gen = smlv->smlv_gen;
+ }
+
+ if (gen != smrt->smrt_discover_gen) {
+ dev_err(smrt->smrt_dip, CE_WARN, "target %s "
+ "disappeared during post-panic discovery",
+ scsi_device_unit_address(smtg->smtg_scsi_dev));
+ smtg->smtg_gone = B_TRUE;
+ }
+ }
+}
+
+static void
+smrt_discover(void *arg)
+{
+ int log = 0, phys = 0;
+ smrt_t *smrt = arg;
+ uint64_t gen;
+ boolean_t runphys, runvirt;
+
+ mutex_enter(&smrt->smrt_mutex);
+ smrt->smrt_status |= SMRT_CTLR_DISCOVERY_RUNNING;
+ smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_REQUESTED;
+
+ smrt->smrt_discover_gen++;
+ gen = smrt->smrt_discover_gen;
+ runphys = smrt->smrt_phys_tgtmap != NULL;
+ runvirt = smrt->smrt_virt_tgtmap != NULL;
+ mutex_exit(&smrt->smrt_mutex);
+ if (runphys)
+ phys = smrt_phys_discover(smrt, SMRT_DISCOVER_TIMEOUT, gen);
+ if (runvirt)
+ log = smrt_logvol_discover(smrt, SMRT_DISCOVER_TIMEOUT, gen);
+ mutex_enter(&smrt->smrt_mutex);
+
+ if (phys != 0 || log != 0) {
+ if (!ddi_in_panic()) {
+ smrt->smrt_status |= SMRT_CTLR_DISCOVERY_PERIODIC;
+ } else {
+ panic("smrt_t %p failed to perform discovery after "
+ "a reset in panic context, unable to continue. "
+ "logvol: %d, phys: %d", smrt, log, phys);
+ }
+ } else {
+ if (!ddi_in_panic() &&
+ smrt->smrt_status & SMRT_CTLR_DISCOVERY_REQUIRED) {
+ smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_REQUIRED;
+ cv_broadcast(&smrt->smrt_cv_finishq);
+ }
+
+ if (ddi_in_panic()) {
+ smrt_discover_panic_check(smrt);
+ }
+ }
+ smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_RUNNING;
+ if (smrt->smrt_status & SMRT_CTLR_DISCOVERY_REQUESTED)
+ smrt->smrt_status |= SMRT_CTLR_DISCOVERY_PERIODIC;
+ mutex_exit(&smrt->smrt_mutex);
+}
+
+/*
+ * Request discovery, which is always run via a taskq.
+ */
+void
+smrt_discover_request(smrt_t *smrt)
+{
+ boolean_t run;
+ ASSERT(MUTEX_HELD(&smrt->smrt_mutex));
+
+ if (ddi_in_panic()) {
+ smrt_discover(smrt);
+ return;
+ }
+
+ run = (smrt->smrt_status & SMRT_CTLR_DISCOVERY_MASK) == 0;
+ smrt->smrt_status |= SMRT_CTLR_DISCOVERY_REQUESTED;
+ if (run && ddi_taskq_dispatch(smrt->smrt_discover_taskq,
+ smrt_discover, smrt, DDI_NOSLEEP) != DDI_SUCCESS) {
+ smrt->smrt_status |= SMRT_CTLR_DISCOVERY_PERIODIC;
+ smrt->smrt_stats.smrts_discovery_tq_errors++;
+ }
+}
diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss_simple.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss_simple.c
new file mode 100644
index 0000000000..1b3d7b2602
--- /dev/null
+++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss_simple.c
@@ -0,0 +1,282 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/scsi/adapters/smrt/smrt.h>
+
+uint_t
+smrt_isr_hw_simple(caddr_t arg1, caddr_t arg2)
+{
+ _NOTE(ARGUNUSED(arg2))
+
+ /* LINTED: E_BAD_PTR_CAST_ALIGN */
+ smrt_t *smrt = (smrt_t *)arg1;
+ uint32_t isr = smrt_get32(smrt, CISS_I2O_INTERRUPT_STATUS);
+ hrtime_t now = gethrtime();
+
+ mutex_enter(&smrt->smrt_mutex);
+ if (!(smrt->smrt_status & SMRT_CTLR_STATUS_RUNNING)) {
+ smrt->smrt_stats.smrts_unclaimed_interrupts++;
+ smrt->smrt_last_interrupt_unclaimed = now;
+
+ /*
+ * We should not be receiving interrupts from the controller
+ * while the driver is not running.
+ */
+ mutex_exit(&smrt->smrt_mutex);
+ return (DDI_INTR_UNCLAIMED);
+ }
+
+ /*
+ * Check to see if this interrupt came from the device:
+ */
+ if ((isr & CISS_ISR_BIT_SIMPLE_INTR) == 0) {
+ smrt->smrt_stats.smrts_unclaimed_interrupts++;
+ smrt->smrt_last_interrupt_unclaimed = now;
+
+ /*
+ * Check to see if the firmware has come to rest. If it has,
+ * this routine will panic the system.
+ */
+ smrt_lockup_check(smrt);
+
+ mutex_exit(&smrt->smrt_mutex);
+ return (DDI_INTR_UNCLAIMED);
+ }
+
+ smrt->smrt_stats.smrts_claimed_interrupts++;
+ smrt->smrt_last_interrupt_claimed = now;
+
+ /*
+ * The interrupt was from our controller, so collect any pending
+ * command completions.
+ */
+ smrt_retrieve_simple(smrt);
+
+ /*
+ * Process any commands in the completion queue.
+ */
+ smrt_process_finishq(smrt);
+
+ mutex_exit(&smrt->smrt_mutex);
+ return (DDI_INTR_CLAIMED);
+}
+
+/*
+ * Read tags and process completion of the associated command until the supply
+ * of tags is exhausted.
+ */
+void
+smrt_retrieve_simple(smrt_t *smrt)
+{
+ uint32_t opq;
+ uint32_t none = 0xffffffff;
+
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ while ((opq = smrt_get32(smrt, CISS_I2O_OUTBOUND_POST_Q)) != none) {
+ uint32_t tag = CISS_OPQ_READ_TAG(opq);
+ smrt_command_t *smcm;
+
+ if ((smcm = smrt_lookup_inflight(smrt, tag)) == NULL) {
+ dev_err(smrt->smrt_dip, CE_WARN, "spurious tag %x",
+ tag);
+ continue;
+ }
+
+ avl_remove(&smrt->smrt_inflight, smcm);
+ smcm->smcm_status &= ~SMRT_CMD_STATUS_INFLIGHT;
+ if (CISS_OPQ_READ_ERROR(opq) != 0) {
+ smcm->smcm_status |= SMRT_CMD_STATUS_ERROR;
+ }
+ smcm->smcm_time_complete = gethrtime();
+
+ /*
+ * Push this command onto the completion queue.
+ */
+ list_insert_tail(&smrt->smrt_finishq, smcm);
+ }
+}
+
+/*
+ * Submit a command to the controller by posting it to the Inbound Post Queue
+ * Register.
+ */
+void
+smrt_submit_simple(smrt_t *smrt, smrt_command_t *smcm)
+{
+ smrt_put32(smrt, CISS_I2O_INBOUND_POST_Q, smcm->smcm_pa_cmd);
+}
+
+/*
+ * Submit a command to the controller by posting it to the Inbound Post Queue
+ * Register. Immediately begin polling on the completion of that command.
+ *
+ * NOTE: This function is for controller initialisation only. It discards
+ * completions of commands other than the expected command as spurious, and
+ * will not interact correctly with the rest of the driver once it is running.
+ */
+int
+smrt_preinit_command_simple(smrt_t *smrt, smrt_command_t *smcm)
+{
+ /*
+ * The controller must be initialised to use the Simple Transport
+ * Method, but not be marked RUNNING. The command to process must be a
+ * PREINIT command with the expected tag number, marked for polling.
+ */
+ VERIFY(smrt->smrt_ctlr_mode == SMRT_CTLR_MODE_SIMPLE);
+ VERIFY(!(smrt->smrt_status & SMRT_CTLR_STATUS_RUNNING));
+ VERIFY(smcm->smcm_type == SMRT_CMDTYPE_PREINIT);
+ VERIFY(smcm->smcm_status & SMRT_CMD_STATUS_POLLED);
+ VERIFY3U(smcm->smcm_tag, ==, SMRT_PRE_TAG_NUMBER);
+
+ /*
+ * Submit this command to the controller.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_INFLIGHT;
+ smrt_put32(smrt, CISS_I2O_INBOUND_POST_Q, smcm->smcm_pa_cmd);
+
+ /*
+ * Poll the controller for completions until we see the command we just
+ * sent, or the timeout expires.
+ */
+ for (;;) {
+ uint32_t none = 0xffffffff;
+ uint32_t opq = smrt_get32(smrt, CISS_I2O_OUTBOUND_POST_Q);
+ uint32_t tag;
+
+ if (smcm->smcm_expiry != 0) {
+ /*
+ * This command has an expiry time. Check to see
+ * if it has already passed:
+ */
+ if (smcm->smcm_expiry < gethrtime()) {
+ return (ETIMEDOUT);
+ }
+ }
+
+ if (opq == none) {
+ delay(drv_usectohz(10 * 1000));
+ continue;
+ }
+
+ if ((tag = CISS_OPQ_READ_TAG(opq)) != SMRT_PRE_TAG_NUMBER) {
+ dev_err(smrt->smrt_dip, CE_WARN, "unexpected tag 0x%x"
+ " completed during driver init", tag);
+ delay(drv_usectohz(10 * 1000));
+ continue;
+ }
+
+ smcm->smcm_status &= ~SMRT_CMD_STATUS_INFLIGHT;
+ if (CISS_OPQ_READ_ERROR(opq) != 0) {
+ smcm->smcm_status |= SMRT_CMD_STATUS_ERROR;
+ }
+ smcm->smcm_time_complete = gethrtime();
+ smcm->smcm_status |= SMRT_CMD_STATUS_POLL_COMPLETE;
+
+ return (0);
+ }
+}
+
+int
+smrt_ctlr_init_simple(smrt_t *smrt)
+{
+ VERIFY(smrt->smrt_ctlr_mode == SMRT_CTLR_MODE_UNKNOWN);
+
+ if (smrt_cfgtbl_transport_has_support(smrt,
+ CISS_CFGTBL_XPORT_SIMPLE) != DDI_SUCCESS) {
+ return (DDI_FAILURE);
+ }
+ smrt->smrt_ctlr_mode = SMRT_CTLR_MODE_SIMPLE;
+
+ /*
+ * Disable device interrupts while we are setting up.
+ */
+ smrt_intr_set(smrt, B_FALSE);
+
+ if ((smrt->smrt_maxcmds = smrt_ctlr_get_cmdsoutmax(smrt)) == 0) {
+ dev_err(smrt->smrt_dip, CE_WARN, "maximum outstanding "
+ "commands set to zero");
+ return (DDI_FAILURE);
+ }
+
+ /*
+ * Determine the number of Scatter/Gather List entries this controller
+ * supports. The maximum number we allow is CISS_MAXSGENTRIES: the
+ * number of elements in the static struct we use for command
+ * submission.
+ */
+ if ((smrt->smrt_sg_cnt = smrt_ctlr_get_maxsgelements(smrt)) == 0) {
+ /*
+ * The CISS specification states that if this value is
+ * zero, we should assume a value of 31 for compatibility
+ * with older firmware.
+ */
+ smrt->smrt_sg_cnt = CISS_SGCNT_FALLBACK;
+
+ } else if (smrt->smrt_sg_cnt > CISS_MAXSGENTRIES) {
+ /*
+ * If the controller supports more than we have allocated,
+ * just cap the count at the allocation size.
+ */
+ smrt->smrt_sg_cnt = CISS_MAXSGENTRIES;
+ }
+
+ /*
+ * Zero the upper 32 bits of the address in the Controller.
+ */
+ ddi_put32(smrt->smrt_ct_handle, &smrt->smrt_ct->Upper32Addr, 0);
+
+ /*
+ * Set the Transport Method and flush the changes to the
+ * Configuration Table.
+ */
+ smrt_cfgtbl_transport_set(smrt, CISS_CFGTBL_XPORT_SIMPLE);
+ if (smrt_cfgtbl_flush(smrt) != DDI_SUCCESS) {
+ return (DDI_FAILURE);
+ }
+
+ if (smrt_cfgtbl_transport_confirm(smrt,
+ CISS_CFGTBL_XPORT_SIMPLE) != DDI_SUCCESS) {
+ return (DDI_FAILURE);
+ }
+
+ /*
+ * Check the outstanding command cap a second time now that we have
+ * flushed out the new Transport Method. This is entirely defensive;
+ * we do not expect this value to change.
+ */
+ uint32_t check_again = smrt_ctlr_get_cmdsoutmax(smrt);
+ if (check_again != smrt->smrt_maxcmds) {
+ dev_err(smrt->smrt_dip, CE_WARN, "maximum outstanding commands "
+ "changed during initialisation (was %u, now %u)",
+ smrt->smrt_maxcmds, check_again);
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+void
+smrt_ctlr_teardown_simple(smrt_t *smrt)
+{
+ VERIFY(smrt->smrt_ctlr_mode == SMRT_CTLR_MODE_SIMPLE);
+
+ /*
+ * Due to the nominal simplicity of the simple mode, we have no
+ * particular teardown to perform as we do not allocate anything
+ * on the way up.
+ */
+ smrt->smrt_ctlr_mode = SMRT_CTLR_MODE_UNKNOWN;
+}
diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_commands.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_commands.c
new file mode 100644
index 0000000000..edcbfa65e2
--- /dev/null
+++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_commands.c
@@ -0,0 +1,362 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2017, Joyent, Inc.
+ */
+
+#include <sys/scsi/adapters/smrt/smrt.h>
+
+
+static ddi_dma_attr_t smrt_command_dma_attr = {
+ .dma_attr_version = DMA_ATTR_V0,
+ .dma_attr_addr_lo = 0x00000000,
+ .dma_attr_addr_hi = 0xFFFFFFFF,
+ .dma_attr_count_max = 0x00FFFFFF,
+ .dma_attr_align = 0x20,
+ .dma_attr_burstsizes = 0x20,
+ .dma_attr_minxfer = DMA_UNIT_8,
+ .dma_attr_maxxfer = 0xFFFFFFFF,
+ .dma_attr_seg = 0x0000FFFF,
+ .dma_attr_sgllen = 1,
+ .dma_attr_granular = 512,
+ .dma_attr_flags = 0
+};
+
+/*
+ * These device access attributes are for command block allocation, where we do
+ * not use any of the structured byte swapping facilities.
+ */
+static ddi_device_acc_attr_t smrt_command_dev_attr = {
+ .devacc_attr_version = DDI_DEVICE_ATTR_V0,
+ .devacc_attr_endian_flags = DDI_NEVERSWAP_ACC,
+ .devacc_attr_dataorder = DDI_STRICTORDER_ACC,
+ .devacc_attr_access = 0
+};
+
+
+static void smrt_contig_free(smrt_dma_t *);
+
+
+static int
+smrt_check_command_type(smrt_command_type_t type)
+{
+ /*
+ * Note that we leave out the default case in order to utilise
+ * compiler warnings about missed enum values.
+ */
+ switch (type) {
+ case SMRT_CMDTYPE_ABORTQ:
+ case SMRT_CMDTYPE_SCSA:
+ case SMRT_CMDTYPE_INTERNAL:
+ case SMRT_CMDTYPE_PREINIT:
+ case SMRT_CMDTYPE_EVENT:
+ return (type);
+ }
+
+ panic("unexpected command type");
+ /* LINTED: E_FUNC_NO_RET_VAL */
+}
+
+static int
+smrt_contig_alloc(smrt_t *smrt, smrt_dma_t *smdma, size_t sz, int kmflags,
+ void **vap, uint32_t *pap)
+{
+ caddr_t va;
+ int rv;
+ dev_info_t *dip = smrt->smrt_dip;
+ int (*dma_wait)(caddr_t) = (kmflags == KM_SLEEP) ? DDI_DMA_SLEEP :
+ DDI_DMA_DONTWAIT;
+
+ VERIFY(kmflags == KM_SLEEP || kmflags == KM_NOSLEEP);
+
+ /*
+ * Ensure we don't try to allocate a second time using the same
+ * tracking object.
+ */
+ VERIFY0(smdma->smdma_level);
+
+ if ((rv = ddi_dma_alloc_handle(dip, &smrt_command_dma_attr,
+ dma_wait, NULL, &smdma->smdma_dma_handle)) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "DMA handle allocation failed (%x)",
+ rv);
+ goto fail;
+ }
+ smdma->smdma_level |= SMRT_DMALEVEL_HANDLE_ALLOC;
+
+ if ((rv = ddi_dma_mem_alloc(smdma->smdma_dma_handle, sz,
+ &smrt_command_dev_attr, DDI_DMA_CONSISTENT, dma_wait, NULL,
+ &va, &smdma->smdma_real_size, &smdma->smdma_acc_handle)) !=
+ DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "DMA memory allocation failed (%x)", rv);
+ goto fail;
+ }
+ smdma->smdma_level |= SMRT_DMALEVEL_MEMORY_ALLOC;
+
+ if ((rv = ddi_dma_addr_bind_handle(smdma->smdma_dma_handle,
+ NULL, va, smdma->smdma_real_size,
+ DDI_DMA_CONSISTENT | DDI_DMA_RDWR, dma_wait, NULL,
+ smdma->smdma_dma_cookies, &smdma->smdma_dma_ncookies)) !=
+ DDI_DMA_MAPPED) {
+ dev_err(dip, CE_WARN, "DMA handle bind failed (%x)", rv);
+ goto fail;
+ }
+ smdma->smdma_level |= SMRT_DMALEVEL_HANDLE_BOUND;
+
+ VERIFY3U(smdma->smdma_dma_ncookies, ==, 1);
+ *pap = smdma->smdma_dma_cookies[0].dmac_address;
+ *vap = (void *)va;
+ return (DDI_SUCCESS);
+
+fail:
+ *vap = NULL;
+ *pap = 0;
+ smrt_contig_free(smdma);
+ return (DDI_FAILURE);
+}
+
+static void
+smrt_contig_free(smrt_dma_t *smdma)
+{
+ if (smdma->smdma_level & SMRT_DMALEVEL_HANDLE_BOUND) {
+ VERIFY3U(ddi_dma_unbind_handle(smdma->smdma_dma_handle), ==,
+ DDI_SUCCESS);
+
+ smdma->smdma_level &= ~SMRT_DMALEVEL_HANDLE_BOUND;
+ }
+
+ if (smdma->smdma_level & SMRT_DMALEVEL_MEMORY_ALLOC) {
+ ddi_dma_mem_free(&smdma->smdma_acc_handle);
+
+ smdma->smdma_level &= ~SMRT_DMALEVEL_MEMORY_ALLOC;
+ }
+
+ if (smdma->smdma_level & SMRT_DMALEVEL_HANDLE_ALLOC) {
+ ddi_dma_free_handle(&smdma->smdma_dma_handle);
+
+ smdma->smdma_level &= ~SMRT_DMALEVEL_HANDLE_ALLOC;
+ }
+
+ VERIFY(smdma->smdma_level == 0);
+ bzero(smdma, sizeof (*smdma));
+}
+
+static smrt_command_t *
+smrt_command_alloc_impl(smrt_t *smrt, smrt_command_type_t type, int kmflags)
+{
+ smrt_command_t *smcm;
+
+ VERIFY(kmflags == KM_SLEEP || kmflags == KM_NOSLEEP);
+
+ if ((smcm = kmem_zalloc(sizeof (*smcm), kmflags)) == NULL) {
+ return (NULL);
+ }
+
+ smcm->smcm_ctlr = smrt;
+ smcm->smcm_type = smrt_check_command_type(type);
+
+ /*
+ * Allocate a single contiguous chunk of memory for the command block
+ * (smcm_va_cmd) and the error information block (smcm_va_err). The
+ * physical address of each block should be 32-byte aligned.
+ */
+ size_t contig_size = 0;
+ contig_size += P2ROUNDUP_TYPED(sizeof (CommandList_t), 32, size_t);
+
+ size_t errorinfo_offset = contig_size;
+ contig_size += P2ROUNDUP_TYPED(sizeof (ErrorInfo_t), 32, size_t);
+
+ if (smrt_contig_alloc(smrt, &smcm->smcm_contig, contig_size,
+ kmflags, (void **)&smcm->smcm_va_cmd, &smcm->smcm_pa_cmd) !=
+ DDI_SUCCESS) {
+ kmem_free(smcm, sizeof (*smcm));
+ return (NULL);
+ }
+
+ smcm->smcm_va_err = (void *)((caddr_t)smcm->smcm_va_cmd +
+ errorinfo_offset);
+ smcm->smcm_pa_err = smcm->smcm_pa_cmd + errorinfo_offset;
+
+ /*
+ * Ensure we asked for, and received, the correct physical alignment:
+ */
+ VERIFY0(smcm->smcm_pa_cmd & 0x1f);
+ VERIFY0(smcm->smcm_pa_err & 0x1f);
+
+ /*
+ * Populate Fields.
+ */
+ bzero(smcm->smcm_va_cmd, contig_size);
+ smcm->smcm_va_cmd->ErrDesc.Addr = smcm->smcm_pa_err;
+ smcm->smcm_va_cmd->ErrDesc.Len = sizeof (ErrorInfo_t);
+
+ return (smcm);
+}
+
+smrt_command_t *
+smrt_command_alloc_preinit(smrt_t *smrt, size_t datasize, int kmflags)
+{
+ smrt_command_t *smcm;
+
+ if ((smcm = smrt_command_alloc_impl(smrt, SMRT_CMDTYPE_PREINIT,
+ kmflags)) == NULL) {
+ return (NULL);
+ }
+
+ /*
+ * Note that most driver infrastructure has not been initialised at
+ * this time. All commands are submitted to the controller serially,
+ * using a pre-specified tag, and are not attached to the command
+ * tracking list.
+ */
+ smcm->smcm_tag = SMRT_PRE_TAG_NUMBER;
+ smcm->smcm_va_cmd->Header.Tag.tag_value = SMRT_PRE_TAG_NUMBER;
+
+ if (smrt_command_attach_internal(smrt, smcm, datasize, kmflags) != 0) {
+ smrt_command_free(smcm);
+ return (NULL);
+ }
+
+ return (smcm);
+}
+
+smrt_command_t *
+smrt_command_alloc(smrt_t *smrt, smrt_command_type_t type, int kmflags)
+{
+ smrt_command_t *smcm;
+
+ VERIFY(type != SMRT_CMDTYPE_PREINIT);
+
+ if ((smcm = smrt_command_alloc_impl(smrt, type, kmflags)) == NULL) {
+ return (NULL);
+ }
+
+ /*
+ * Insert into the per-controller command list.
+ */
+ mutex_enter(&smrt->smrt_mutex);
+ list_insert_tail(&smrt->smrt_commands, smcm);
+ mutex_exit(&smrt->smrt_mutex);
+
+ return (smcm);
+}
+
+int
+smrt_command_attach_internal(smrt_t *smrt, smrt_command_t *smcm, size_t len,
+ int kmflags)
+{
+ smrt_command_internal_t *smcmi;
+
+ VERIFY(kmflags == KM_SLEEP || kmflags == KM_NOSLEEP);
+ VERIFY3U(len, <=, UINT32_MAX);
+
+ if ((smcmi = kmem_zalloc(sizeof (*smcmi), kmflags)) == NULL) {
+ return (ENOMEM);
+ }
+
+ if (smrt_contig_alloc(smrt, &smcmi->smcmi_contig, len, kmflags,
+ &smcmi->smcmi_va, &smcmi->smcmi_pa) != DDI_SUCCESS) {
+ kmem_free(smcmi, sizeof (*smcmi));
+ return (ENOMEM);
+ }
+
+ bzero(smcmi->smcmi_va, smcmi->smcmi_len);
+
+ smcm->smcm_internal = smcmi;
+
+ smcm->smcm_va_cmd->SG[0].Addr = smcmi->smcmi_pa;
+ smcm->smcm_va_cmd->SG[0].Len = (uint32_t)len;
+ smcm->smcm_va_cmd->Header.SGList = 1;
+ smcm->smcm_va_cmd->Header.SGTotal = 1;
+
+ return (0);
+}
+
+void
+smrt_command_reuse(smrt_command_t *smcm)
+{
+ smrt_t *smrt = smcm->smcm_ctlr;
+
+ mutex_enter(&smrt->smrt_mutex);
+
+ /*
+ * Make sure the command is not currently inflight, then
+ * reset the command status.
+ */
+ VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT));
+ smcm->smcm_status = SMRT_CMD_STATUS_REUSED;
+
+ /*
+ * Ensure we are not trying to reuse a command that is in the finish or
+ * abort queue.
+ */
+ VERIFY(!list_link_active(&smcm->smcm_link_abort));
+ VERIFY(!list_link_active(&smcm->smcm_link_finish));
+
+ /*
+ * Clear the previous tag value.
+ */
+ smcm->smcm_tag = 0;
+ smcm->smcm_va_cmd->Header.Tag.tag_value = 0;
+
+ mutex_exit(&smrt->smrt_mutex);
+}
+
+void
+smrt_command_free(smrt_command_t *smcm)
+{
+ smrt_t *smrt = smcm->smcm_ctlr;
+
+ /*
+ * Ensure the object we are about to free is not currently in the
+ * inflight AVL.
+ */
+ VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT));
+
+ if (smcm->smcm_internal != NULL) {
+ smrt_command_internal_t *smcmi = smcm->smcm_internal;
+
+ smrt_contig_free(&smcmi->smcmi_contig);
+ kmem_free(smcmi, sizeof (*smcmi));
+ }
+
+ smrt_contig_free(&smcm->smcm_contig);
+
+ if (smcm->smcm_type != SMRT_CMDTYPE_PREINIT) {
+ mutex_enter(&smrt->smrt_mutex);
+
+ /*
+ * Ensure we are not trying to free a command that is in the
+ * finish or abort queue.
+ */
+ VERIFY(!list_link_active(&smcm->smcm_link_abort));
+ VERIFY(!list_link_active(&smcm->smcm_link_finish));
+
+ list_remove(&smrt->smrt_commands, smcm);
+
+ mutex_exit(&smrt->smrt_mutex);
+ }
+
+ kmem_free(smcm, sizeof (*smcm));
+}
+
+smrt_command_t *
+smrt_lookup_inflight(smrt_t *smrt, uint32_t tag)
+{
+ smrt_command_t srch;
+
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ bzero(&srch, sizeof (srch));
+ srch.smcm_tag = tag;
+
+ return (avl_find(&smrt->smrt_inflight, &srch, NULL));
+}
diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_device.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_device.c
new file mode 100644
index 0000000000..9e27448b68
--- /dev/null
+++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_device.c
@@ -0,0 +1,238 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2017, Joyent, Inc.
+ */
+
+#include <sys/scsi/adapters/smrt/smrt.h>
+
+/*
+ * We must locate what the CISS specification describes as the "I2O
+ * registers". The Intelligent I/O (I2O) Architecture Specification describes
+ * this somewhat more coherently as "the memory region specified by the first
+ * base address configuration register indicating memory space (offset 10h,
+ * 14h, and so forth)".
+ */
+static int
+smrt_locate_bar(pci_regspec_t *regs, unsigned nregs,
+ unsigned *i2o_bar)
+{
+ /*
+ * Locate the first memory-mapped BAR:
+ */
+ for (unsigned i = 0; i < nregs; i++) {
+ unsigned type = regs[i].pci_phys_hi & PCI_ADDR_MASK;
+
+ if (type == PCI_ADDR_MEM32 || type == PCI_ADDR_MEM64) {
+ *i2o_bar = i;
+ return (DDI_SUCCESS);
+ }
+ }
+
+ return (DDI_FAILURE);
+}
+
+static int
+smrt_locate_cfgtbl(smrt_t *smrt, pci_regspec_t *regs, unsigned nregs,
+ unsigned *ct_bar, uint32_t *baseaddr)
+{
+ uint32_t cfg_offset, mem_offset;
+ unsigned want_type;
+ uint32_t want_bar;
+
+ cfg_offset = smrt_get32(smrt, CISS_I2O_CFGTBL_CFG_OFFSET);
+ mem_offset = smrt_get32(smrt, CISS_I2O_CFGTBL_MEM_OFFSET);
+
+ VERIFY3U(cfg_offset, !=, 0xffffffff);
+ VERIFY3U(mem_offset, !=, 0xffffffff);
+
+ /*
+ * Locate the Configuration Table. Three different values read
+ * from two I2O registers allow us to determine the location:
+ * - the correct PCI BAR offset is in the low 16 bits of
+ * CISS_I2O_CFGTBL_CFG_OFFSET
+ * - bit 16 is 0 for a 32-bit space, and 1 for 64-bit
+ * - the memory offset from the base of this BAR is
+ * in CISS_I2O_CFGTBL_MEM_OFFSET
+ */
+ want_bar = (cfg_offset & 0xffff);
+ want_type = (cfg_offset & (1UL << 16)) ? PCI_ADDR_MEM64 :
+ PCI_ADDR_MEM32;
+
+ DTRACE_PROBE4(locate_cfgtbl, uint32_t, want_bar, unsigned,
+ want_type, uint32_t, cfg_offset, uint32_t, mem_offset);
+
+ for (unsigned i = 0; i < nregs; i++) {
+ unsigned type = regs[i].pci_phys_hi & PCI_ADDR_MASK;
+ unsigned bar = PCI_REG_REG_G(regs[i].pci_phys_hi);
+
+ if (type != PCI_ADDR_MEM32 && type != PCI_ADDR_MEM64) {
+ continue;
+ }
+
+ if (bar == want_bar) {
+ *ct_bar = i;
+ *baseaddr = mem_offset;
+ return (DDI_SUCCESS);
+ }
+ }
+
+ return (DDI_FAILURE);
+}
+
+/*
+ * Determine the PCI vendor and device ID which is a proxy for which generation
+ * of controller we're working with.
+ */
+static int
+smrt_identify_device(smrt_t *smrt)
+{
+ ddi_acc_handle_t pci_hdl;
+
+ if (pci_config_setup(smrt->smrt_dip, &pci_hdl) != DDI_SUCCESS)
+ return (DDI_FAILURE);
+
+ smrt->smrt_pci_vendor = pci_config_get16(pci_hdl, PCI_CONF_VENID);
+ smrt->smrt_pci_device = pci_config_get16(pci_hdl, PCI_CONF_DEVID);
+
+ pci_config_teardown(&pci_hdl);
+
+ return (DDI_SUCCESS);
+}
+
+static int
+smrt_map_device(smrt_t *smrt)
+{
+ pci_regspec_t *regs;
+ uint_t regslen, nregs;
+ dev_info_t *dip = smrt->smrt_dip;
+ int r = DDI_FAILURE;
+
+ /*
+ * Get the list of PCI registers from the DDI property "regs":
+ */
+ if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
+ "reg", (int **)&regs, &regslen) != DDI_PROP_SUCCESS) {
+ dev_err(dip, CE_WARN, "could not load \"reg\" DDI prop");
+ return (DDI_FAILURE);
+ }
+ nregs = regslen * sizeof (int) / sizeof (pci_regspec_t);
+
+ if (smrt_locate_bar(regs, nregs, &smrt->smrt_i2o_bar) !=
+ DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "did not find any memory BARs");
+ goto out;
+ }
+
+ /*
+ * Map enough of the I2O memory space to enable us to talk to the
+ * device.
+ */
+ if (ddi_regs_map_setup(dip, smrt->smrt_i2o_bar, &smrt->smrt_i2o_space,
+ CISS_I2O_MAP_BASE, CISS_I2O_MAP_LIMIT - CISS_I2O_MAP_BASE,
+ &smrt_dev_attributes, &smrt->smrt_i2o_handle) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "failed to map I2O registers");
+ goto out;
+ }
+ smrt->smrt_init_level |= SMRT_INITLEVEL_I2O_MAPPED;
+
+ if (smrt_locate_cfgtbl(smrt, regs, nregs, &smrt->smrt_ct_bar,
+ &smrt->smrt_ct_baseaddr) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "could not find config table");
+ goto out;
+ }
+
+ /*
+ * Map the Configuration Table.
+ */
+ if (ddi_regs_map_setup(dip, smrt->smrt_ct_bar,
+ (caddr_t *)&smrt->smrt_ct, smrt->smrt_ct_baseaddr,
+ sizeof (CfgTable_t), &smrt_dev_attributes,
+ &smrt->smrt_ct_handle) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "could not map config table");
+ goto out;
+ }
+ smrt->smrt_init_level |= SMRT_INITLEVEL_CFGTBL_MAPPED;
+
+ r = DDI_SUCCESS;
+
+out:
+ ddi_prop_free(regs);
+ return (r);
+}
+
+int
+smrt_device_setup(smrt_t *smrt)
+{
+ /*
+ * Ensure that the controller is installed in such a fashion that it
+ * may become a DMA master.
+ */
+ if (ddi_slaveonly(smrt->smrt_dip) == DDI_SUCCESS) {
+ dev_err(smrt->smrt_dip, CE_WARN, "device cannot become DMA "
+ "master");
+ return (DDI_FAILURE);
+ }
+
+ if (smrt_identify_device(smrt) != DDI_SUCCESS)
+ goto fail;
+
+ if (smrt_map_device(smrt) != DDI_SUCCESS) {
+ goto fail;
+ }
+
+ return (DDI_SUCCESS);
+
+fail:
+ smrt_device_teardown(smrt);
+ return (DDI_FAILURE);
+}
+
+void
+smrt_device_teardown(smrt_t *smrt)
+{
+ if (smrt->smrt_init_level & SMRT_INITLEVEL_CFGTBL_MAPPED) {
+ ddi_regs_map_free(&smrt->smrt_ct_handle);
+ smrt->smrt_init_level &= ~SMRT_INITLEVEL_CFGTBL_MAPPED;
+ }
+
+ if (smrt->smrt_init_level & SMRT_INITLEVEL_I2O_MAPPED) {
+ ddi_regs_map_free(&smrt->smrt_i2o_handle);
+ smrt->smrt_init_level &= ~SMRT_INITLEVEL_I2O_MAPPED;
+ }
+}
+
+uint32_t
+smrt_get32(smrt_t *smrt, offset_t off)
+{
+ VERIFY3S(off, >=, CISS_I2O_MAP_BASE);
+ VERIFY3S(off, <, CISS_I2O_MAP_BASE + CISS_I2O_MAP_LIMIT);
+
+ /* LINTED: E_BAD_PTR_CAST_ALIGN */
+ uint32_t *addr = (uint32_t *)(smrt->smrt_i2o_space +
+ (off - CISS_I2O_MAP_BASE));
+
+ return (ddi_get32(smrt->smrt_i2o_handle, addr));
+}
+
+void
+smrt_put32(smrt_t *smrt, offset_t off, uint32_t val)
+{
+ VERIFY3S(off, >=, CISS_I2O_MAP_BASE);
+ VERIFY3S(off, <, CISS_I2O_MAP_BASE + CISS_I2O_MAP_LIMIT);
+
+ /* LINTED: E_BAD_PTR_CAST_ALIGN */
+ uint32_t *addr = (uint32_t *)(smrt->smrt_i2o_space +
+ (off - CISS_I2O_MAP_BASE));
+
+ ddi_put32(smrt->smrt_i2o_handle, addr, val);
+}
diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_hba.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_hba.c
new file mode 100644
index 0000000000..0eec7c2482
--- /dev/null
+++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_hba.c
@@ -0,0 +1,1457 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#include <sys/scsi/adapters/smrt/smrt.h>
+
+/*
+ * The controller is not allowed to attach.
+ */
+static int
+smrt_ctrl_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip,
+ scsi_hba_tran_t *hba_tran, struct scsi_device *sd)
+{
+ return (DDI_FAILURE);
+}
+
+/*
+ * The controller is not allowed to send packets.
+ */
+static int
+smrt_ctrl_tran_start(struct scsi_address *sa, struct scsi_pkt *pkt)
+{
+ return (TRAN_BADPKT);
+}
+
+static boolean_t
+smrt_logvol_parse(const char *ua, uint_t *targp)
+{
+ long targ, lun;
+ const char *comma;
+ char *eptr;
+
+ comma = strchr(ua, ',');
+ if (comma == NULL) {
+ return (B_FALSE);
+ }
+
+ /*
+ * We expect the target number for a logical unit number to be zero for
+ * a logical volume.
+ */
+ if (ddi_strtol(comma + 1, &eptr, 16, &lun) != 0 || *eptr != '\0' ||
+ lun != 0) {
+ return (B_FALSE);
+ }
+
+ if (ddi_strtol(ua, &eptr, 16, &targ) != 0 || eptr != comma ||
+ targ < 0 || targ >= SMRT_MAX_LOGDRV) {
+ return (B_FALSE);
+ }
+
+ *targp = (uint_t)targ;
+
+ return (B_TRUE);
+}
+
+static int
+smrt_logvol_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip,
+ scsi_hba_tran_t *hba_tran, struct scsi_device *sd)
+{
+ _NOTE(ARGUNUSED(hba_dip))
+
+ smrt_volume_t *smlv;
+ smrt_target_t *smtg;
+ const char *ua;
+ uint_t targ;
+
+ smrt_t *smrt = (smrt_t *)hba_tran->tran_hba_private;
+ dev_info_t *dip = smrt->smrt_dip;
+
+ /*
+ * The unit address comes in the form of 'target,lun'. We expect the
+ * lun to be zero. The target is what we set when we added it to the
+ * target map earlier.
+ */
+ ua = scsi_device_unit_address(sd);
+ if (ua == NULL) {
+ return (DDI_FAILURE);
+ }
+
+ if (!smrt_logvol_parse(ua, &targ)) {
+ return (DDI_FAILURE);
+ }
+
+ if ((smtg = kmem_zalloc(sizeof (*smtg), KM_NOSLEEP)) == NULL) {
+ dev_err(dip, CE_WARN, "could not allocate target object "
+ "due to memory exhaustion");
+ return (DDI_FAILURE);
+ }
+
+ mutex_enter(&smrt->smrt_mutex);
+
+ if (smrt->smrt_status & SMRT_CTLR_STATUS_DETACHING) {
+ /*
+ * We are detaching. Do not accept any more requests to
+ * attach targets from the framework.
+ */
+ mutex_exit(&smrt->smrt_mutex);
+ kmem_free(smtg, sizeof (*smtg));
+ return (DDI_FAILURE);
+ }
+
+ /*
+ * Look for a logical volume for the SCSI unit address of this target.
+ */
+ if ((smlv = smrt_logvol_lookup_by_id(smrt, targ)) == NULL) {
+ mutex_exit(&smrt->smrt_mutex);
+ kmem_free(smtg, sizeof (*smtg));
+ return (DDI_FAILURE);
+ }
+
+ smtg->smtg_lun.smtg_vol = smlv;
+ smtg->smtg_addr = &smlv->smlv_addr;
+ smtg->smtg_physical = B_FALSE;
+ list_insert_tail(&smlv->smlv_targets, smtg);
+
+ /*
+ * Link this target object to the controller:
+ */
+ smtg->smtg_ctlr = smrt;
+ list_insert_tail(&smrt->smrt_targets, smtg);
+
+ smtg->smtg_scsi_dev = sd;
+ VERIFY(sd->sd_dev == tgt_dip);
+
+ scsi_device_hba_private_set(sd, smtg);
+
+ mutex_exit(&smrt->smrt_mutex);
+ return (DDI_SUCCESS);
+}
+
+static void
+smrt_logvol_tran_tgt_free(dev_info_t *hba_dip, dev_info_t *tgt_dip,
+ scsi_hba_tran_t *hba_tran, struct scsi_device *sd)
+{
+ _NOTE(ARGUNUSED(hba_dip, tgt_dip))
+
+ smrt_t *smrt = (smrt_t *)hba_tran->tran_hba_private;
+ smrt_target_t *smtg = scsi_device_hba_private_get(sd);
+ smrt_volume_t *smlv = smtg->smtg_lun.smtg_vol;
+
+ VERIFY(smtg->smtg_scsi_dev == sd);
+ VERIFY(smtg->smtg_physical == B_FALSE);
+
+ mutex_enter(&smrt->smrt_mutex);
+ list_remove(&smlv->smlv_targets, smtg);
+ list_remove(&smrt->smrt_targets, smtg);
+
+ scsi_device_hba_private_set(sd, NULL);
+
+ mutex_exit(&smrt->smrt_mutex);
+
+ kmem_free(smtg, sizeof (*smtg));
+}
+
+static int
+smrt_phys_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip,
+ scsi_hba_tran_t *hba_tran, struct scsi_device *sd)
+{
+ _NOTE(ARGUNUSED(hba_dip))
+
+ smrt_target_t *smtg;
+ smrt_physical_t *smpt;
+ const char *ua, *comma;
+ char *eptr;
+ long lun;
+
+ smrt_t *smrt = (smrt_t *)hba_tran->tran_hba_private;
+ dev_info_t *dip = smrt->smrt_dip;
+
+ /*
+ * The unit address comes in the form of 'target,lun'. We expect the
+ * lun to be zero. The target is what we set when we added it to the
+ * target map earlier.
+ */
+ ua = scsi_device_unit_address(sd);
+ if (ua == NULL)
+ return (DDI_FAILURE);
+
+ comma = strchr(ua, ',');
+ if (comma == NULL) {
+ return (DDI_FAILURE);
+ }
+
+ /*
+ * Confirm the LUN is zero. We may want to instead check the scsi
+ * 'lun'/'lun64' property or do so in addition to this logic.
+ */
+ if (ddi_strtol(comma + 1, &eptr, 16, &lun) != 0 || *eptr != '\0' ||
+ lun != 0) {
+ return (DDI_FAILURE);
+ }
+
+ if ((smtg = kmem_zalloc(sizeof (*smtg), KM_NOSLEEP)) == NULL) {
+ dev_err(dip, CE_WARN, "could not allocate target object "
+ "due to memory exhaustion");
+ return (DDI_FAILURE);
+ }
+
+ mutex_enter(&smrt->smrt_mutex);
+
+ if (smrt->smrt_status & SMRT_CTLR_STATUS_DETACHING) {
+ /*
+ * We are detaching. Do not accept any more requests to
+ * attach targets from the framework.
+ */
+ mutex_exit(&smrt->smrt_mutex);
+ kmem_free(smtg, sizeof (*smtg));
+ return (DDI_FAILURE);
+ }
+
+
+ /*
+ * Look for a physical target based on the unit address of the target
+ * (which will encode its WWN and LUN).
+ */
+ smpt = smrt_phys_lookup_by_ua(smrt, ua);
+ if (smpt == NULL) {
+ mutex_exit(&smrt->smrt_mutex);
+ kmem_free(smtg, sizeof (*smtg));
+ return (DDI_FAILURE);
+ }
+
+ smtg->smtg_scsi_dev = sd;
+ smtg->smtg_physical = B_TRUE;
+ smtg->smtg_lun.smtg_phys = smpt;
+ list_insert_tail(&smpt->smpt_targets, smtg);
+ smtg->smtg_addr = &smpt->smpt_addr;
+
+ /*
+ * Link this target object to the controller:
+ */
+ smtg->smtg_ctlr = smrt;
+ list_insert_tail(&smrt->smrt_targets, smtg);
+
+ VERIFY(sd->sd_dev == tgt_dip);
+ smtg->smtg_scsi_dev = sd;
+
+ scsi_device_hba_private_set(sd, smtg);
+ mutex_exit(&smrt->smrt_mutex);
+
+ return (DDI_SUCCESS);
+}
+
+static void
+smrt_phys_tran_tgt_free(dev_info_t *hba_dip, dev_info_t *tgt_dip,
+ scsi_hba_tran_t *hba_tran, struct scsi_device *sd)
+{
+ _NOTE(ARGUNUSED(hba_dip, tgt_dip))
+
+ smrt_t *smrt = (smrt_t *)hba_tran->tran_hba_private;
+ smrt_target_t *smtg = scsi_device_hba_private_get(sd);
+ smrt_physical_t *smpt = smtg->smtg_lun.smtg_phys;
+
+ VERIFY(smtg->smtg_scsi_dev == sd);
+ VERIFY(smtg->smtg_physical == B_TRUE);
+
+ mutex_enter(&smrt->smrt_mutex);
+ list_remove(&smpt->smpt_targets, smtg);
+ list_remove(&smrt->smrt_targets, smtg);
+
+ scsi_device_hba_private_set(sd, NULL);
+ mutex_exit(&smrt->smrt_mutex);
+ kmem_free(smtg, sizeof (*smtg));
+}
+
+/*
+ * This function is called when the SCSI framework has allocated a packet and
+ * our private per-packet object.
+ *
+ * We choose not to have the framework pre-allocate memory for the CDB.
+ * Instead, we will make available the CDB area in the controller command block
+ * itself.
+ *
+ * Status block memory is allocated by the framework because we passed
+ * SCSI_HBA_TRAN_SCB to scsi_hba_attach_setup(9F).
+ */
+static int
+smrt_tran_setup_pkt(struct scsi_pkt *pkt, int (*callback)(caddr_t),
+ caddr_t arg)
+{
+ _NOTE(ARGUNUSED(arg))
+
+ struct scsi_device *sd;
+ smrt_target_t *smtg;
+ smrt_t *smrt;
+ smrt_command_t *smcm;
+ smrt_command_scsa_t *smcms;
+ int kmflags = callback == SLEEP_FUNC ? KM_SLEEP : KM_NOSLEEP;
+
+ sd = scsi_address_device(&pkt->pkt_address);
+ VERIFY(sd != NULL);
+ smtg = scsi_device_hba_private_get(sd);
+ VERIFY(smtg != NULL);
+ smrt = smtg->smtg_ctlr;
+ VERIFY(smrt != NULL);
+ smcms = (smrt_command_scsa_t *)pkt->pkt_ha_private;
+
+ /*
+ * Check that we have enough space in the command object for the
+ * request from the target driver:
+ */
+ if (pkt->pkt_cdblen > CISS_CDBLEN) {
+ /*
+ * The CDB member of the Request Block of a controller
+ * command is fixed at 16 bytes.
+ */
+ dev_err(smrt->smrt_dip, CE_WARN, "oversize CDB: had %u, "
+ "needed %u", CISS_CDBLEN, pkt->pkt_cdblen);
+ return (-1);
+ }
+
+ /*
+ * Allocate our command block:
+ */
+ if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_SCSA,
+ kmflags)) == NULL) {
+ return (-1);
+ }
+ smcm->smcm_scsa = smcms;
+ smcms->smcms_command = smcm;
+ smcms->smcms_pkt = pkt;
+
+ pkt->pkt_cdbp = &smcm->smcm_va_cmd->Request.CDB[0];
+ smcm->smcm_va_cmd->Request.CDBLen = pkt->pkt_cdblen;
+
+ smcm->smcm_target = smtg;
+
+ return (0);
+}
+
+static void
+smrt_tran_teardown_pkt(struct scsi_pkt *pkt)
+{
+ smrt_command_scsa_t *smcms = (smrt_command_scsa_t *)
+ pkt->pkt_ha_private;
+ smrt_command_t *smcm = smcms->smcms_command;
+
+ smrt_command_free(smcm);
+
+ pkt->pkt_cdbp = NULL;
+}
+
+static void
+smrt_set_arq_data(struct scsi_pkt *pkt, uchar_t key)
+{
+ struct scsi_arq_status *sts;
+
+ VERIFY3U(pkt->pkt_scblen, >=, sizeof (struct scsi_arq_status));
+
+ /* LINTED: E_BAD_PTR_CAST_ALIGN */
+ sts = (struct scsi_arq_status *)(pkt->pkt_scbp);
+ bzero(sts, sizeof (*sts));
+
+ /*
+ * Mock up a CHECK CONDITION SCSI status for the original command:
+ */
+ sts->sts_status.sts_chk = 1;
+
+ /*
+ * Pretend that we successfully performed REQUEST SENSE:
+ */
+ sts->sts_rqpkt_reason = CMD_CMPLT;
+ sts->sts_rqpkt_resid = 0;
+ sts->sts_rqpkt_state = STATE_GOT_BUS | STATE_GOT_TARGET |
+ STATE_SENT_CMD | STATE_XFERRED_DATA;
+ sts->sts_rqpkt_statistics = 0;
+
+ /*
+ * Return the key value we were provided in the fake sense data:
+ */
+ sts->sts_sensedata.es_valid = 1;
+ sts->sts_sensedata.es_class = CLASS_EXTENDED_SENSE;
+ sts->sts_sensedata.es_key = key;
+
+ pkt->pkt_state |= STATE_ARQ_DONE;
+}
+
+/*
+ * When faking up a REPORT LUNS data structure, we simply report one LUN, LUN 0.
+ * We need 16 bytes for this, 4 for the size, 4 reserved bytes, and the 8 for
+ * the actual LUN.
+ */
+static void
+smrt_fake_report_lun(smrt_command_t *smcm, struct scsi_pkt *pkt)
+{
+ size_t sz;
+ char resp[16];
+ struct buf *bp;
+
+ pkt->pkt_reason = CMD_CMPLT;
+ pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET | STATE_SENT_CMD |
+ STATE_GOT_STATUS;
+
+ /*
+ * Check to make sure this is valid. If reserved bits are set or if the
+ * mode is one other than 0x00, 0x01, 0x02, then it's an illegal
+ * request.
+ */
+ if (pkt->pkt_cdbp[1] != 0 || pkt->pkt_cdbp[3] != 0 ||
+ pkt->pkt_cdbp[4] != 0 || pkt->pkt_cdbp[5] != 0 ||
+ pkt->pkt_cdbp[10] != 0 || pkt->pkt_cdbp[11] != 0 ||
+ pkt->pkt_cdbp[2] > 0x2) {
+ smrt_set_arq_data(pkt, KEY_ILLEGAL_REQUEST);
+ return;
+ }
+
+ /*
+ * Construct the actual REPORT LUNS reply. We need to indicate a single
+ * LUN of all zeros. This means that the length needs to be 8 bytes,
+ * the size of the lun. Otherwise, the rest of this structure can be
+ * zeros.
+ */
+ bzero(resp, sizeof (resp));
+ resp[3] = sizeof (scsi_lun_t);
+
+ bp = scsi_pkt2bp(pkt);
+ sz = MIN(sizeof (resp), bp->b_bcount);
+
+ bp_mapin(bp);
+ bcopy(resp, bp->b_un.b_addr, sz);
+ bp_mapout(bp);
+ pkt->pkt_state |= STATE_XFERRED_DATA;
+ pkt->pkt_resid = bp->b_bcount - sz;
+ if (pkt->pkt_scblen >= 1) {
+ pkt->pkt_scbp[0] = STATUS_GOOD;
+ }
+}
+
+static int
+smrt_tran_start(struct scsi_address *sa, struct scsi_pkt *pkt)
+{
+ _NOTE(ARGUNUSED(sa))
+
+ struct scsi_device *sd;
+ smrt_target_t *smtg;
+ smrt_t *smrt;
+ smrt_command_scsa_t *smcms;
+ smrt_command_t *smcm;
+ int r;
+
+ sd = scsi_address_device(&pkt->pkt_address);
+ VERIFY(sd != NULL);
+ smtg = scsi_device_hba_private_get(sd);
+ VERIFY(smtg != NULL);
+ smrt = smtg->smtg_ctlr;
+ VERIFY(smrt != NULL);
+ smcms = (smrt_command_scsa_t *)pkt->pkt_ha_private;
+ VERIFY(smcms != NULL);
+ smcm = smcms->smcms_command;
+ VERIFY(smcm != NULL);
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_TRAN_START) {
+ /*
+ * This is a retry of a command that has already been
+ * used once. Assign it a new tag number.
+ */
+ smrt_command_reuse(smcm);
+ }
+ smcm->smcm_status |= SMRT_CMD_STATUS_TRAN_START;
+
+ /*
+ * The sophisticated firmware in this controller cannot possibly bear
+ * the following SCSI commands. It appears to return a response with
+ * the status STATUS_ACA_ACTIVE (0x30), which is not something we
+ * expect. Instead, fake up a failure response.
+ */
+ switch (pkt->pkt_cdbp[0]) {
+ case SCMD_FORMAT:
+ case SCMD_LOG_SENSE_G1:
+ case SCMD_MODE_SELECT:
+ case SCMD_PERSISTENT_RESERVE_IN:
+ if (smtg->smtg_physical) {
+ break;
+ }
+
+ smrt->smrt_stats.smrts_ignored_scsi_cmds++;
+ smcm->smcm_status |= SMRT_CMD_STATUS_TRAN_IGNORED;
+
+ /*
+ * Mark the command as completed to the point where we
+ * received a SCSI status code:
+ */
+ pkt->pkt_reason = CMD_CMPLT;
+ pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET |
+ STATE_SENT_CMD | STATE_GOT_STATUS;
+
+ /*
+ * Mock up sense data for an illegal request:
+ */
+ smrt_set_arq_data(pkt, KEY_ILLEGAL_REQUEST);
+
+ scsi_hba_pkt_comp(pkt);
+ return (TRAN_ACCEPT);
+ case SCMD_REPORT_LUNS:
+ /*
+ * The SMRT controller does not accept a REPORT LUNS command for
+ * logical volumes. As such, we need to fake up a REPORT LUNS
+ * response that has a single LUN, LUN 0.
+ */
+ if (smtg->smtg_physical) {
+ break;
+ }
+
+ smrt_fake_report_lun(smcm, pkt);
+
+ scsi_hba_pkt_comp(pkt);
+ return (TRAN_ACCEPT);
+ default:
+ break;
+ }
+
+ if (pkt->pkt_flags & FLAG_NOINTR) {
+ /*
+ * We must sleep and wait for the completion of this command.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
+ }
+
+ /*
+ * Because we provide a tran_setup_pkt(9E) entrypoint, we must now
+ * set up the Scatter/Gather List in the Command to reflect any
+ * DMA resources passed to us by the framework.
+ */
+ if (pkt->pkt_numcookies > smrt->smrt_sg_cnt) {
+ /*
+ * More DMA cookies than we are prepared to handle.
+ */
+ dev_err(smrt->smrt_dip, CE_WARN, "too many DMA cookies (got %u;"
+ " expected %u)", pkt->pkt_numcookies, smrt->smrt_sg_cnt);
+ return (TRAN_BADPKT);
+ }
+ smcm->smcm_va_cmd->Header.SGList = pkt->pkt_numcookies;
+ smcm->smcm_va_cmd->Header.SGTotal = pkt->pkt_numcookies;
+ for (unsigned i = 0; i < pkt->pkt_numcookies; i++) {
+ smcm->smcm_va_cmd->SG[i].Addr =
+ LE_64(pkt->pkt_cookies[i].dmac_laddress);
+ smcm->smcm_va_cmd->SG[i].Len =
+ LE_32(pkt->pkt_cookies[i].dmac_size);
+ }
+
+ /*
+ * Copy logical volume address from the target object:
+ */
+ smcm->smcm_va_cmd->Header.LUN = *smcm->smcm_target->smtg_addr;
+
+ /*
+ * Initialise the command block.
+ */
+ smcm->smcm_va_cmd->Request.CDBLen = pkt->pkt_cdblen;
+ smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD;
+ smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE;
+ smcm->smcm_va_cmd->Request.Timeout = LE_16(pkt->pkt_time);
+ if (pkt->pkt_numcookies > 0) {
+ /*
+ * There are DMA resources; set the transfer direction
+ * appropriately:
+ */
+ if (pkt->pkt_dma_flags & DDI_DMA_READ) {
+ smcm->smcm_va_cmd->Request.Type.Direction =
+ CISS_XFER_READ;
+ } else if (pkt->pkt_dma_flags & DDI_DMA_WRITE) {
+ smcm->smcm_va_cmd->Request.Type.Direction =
+ CISS_XFER_WRITE;
+ } else {
+ smcm->smcm_va_cmd->Request.Type.Direction =
+ CISS_XFER_NONE;
+ }
+ } else {
+ /*
+ * No DMA resources means no transfer.
+ */
+ smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_NONE;
+ }
+
+ /*
+ * Initialise the SCSI packet as described in tran_start(9E). We will
+ * progressively update these fields as the command moves through the
+ * submission and completion states.
+ */
+ pkt->pkt_resid = 0;
+ pkt->pkt_reason = CMD_CMPLT;
+ pkt->pkt_statistics = 0;
+ pkt->pkt_state = 0;
+
+ /*
+ * If this SCSI packet has a timeout, configure an appropriate
+ * expiry time:
+ */
+ if (pkt->pkt_time != 0) {
+ smcm->smcm_expiry = gethrtime() + pkt->pkt_time * NANOSEC;
+ }
+
+ /*
+ * Submit the command to the controller.
+ */
+ mutex_enter(&smrt->smrt_mutex);
+
+ /*
+ * If we're dumping, there's a chance that the target we're talking to
+ * could have ended up disappearing during the process of discovery. If
+ * this target is part of the dump device, we check here and return that
+ * we hit a fatal error.
+ */
+ if (ddi_in_panic() && smtg->smtg_gone) {
+ mutex_exit(&smrt->smrt_mutex);
+
+ dev_err(smrt->smrt_dip, CE_WARN, "smrt_submit failed: target "
+ "%s is gone, it did not come back after post-panic reset "
+ "device discovery", scsi_device_unit_address(sd));
+
+ return (TRAN_FATAL_ERROR);
+ }
+
+ smrt->smrt_stats.smrts_tran_starts++;
+ if ((r = smrt_submit(smrt, smcm)) != 0) {
+ mutex_exit(&smrt->smrt_mutex);
+
+ dev_err(smrt->smrt_dip, CE_WARN, "smrt_submit failed %d", r);
+
+ /*
+ * Inform the SCSI framework that we could not submit
+ * the command.
+ */
+ return (r == EAGAIN ? TRAN_BUSY : TRAN_FATAL_ERROR);
+ }
+
+ /*
+ * Update the SCSI packet to reflect submission of the command.
+ */
+ pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET | STATE_SENT_CMD;
+
+ if (pkt->pkt_flags & FLAG_NOINTR) {
+ /*
+ * Poll the controller for completion of the command we
+ * submitted. Once this routine has returned, the completion
+ * callback will have been fired with either an active response
+ * (success or error) or a timeout. The command is freed by
+ * the completion callback, so it may not be referenced again
+ * after this call returns.
+ */
+ (void) smrt_poll_for(smrt, smcm);
+ }
+
+ mutex_exit(&smrt->smrt_mutex);
+ return (TRAN_ACCEPT);
+}
+
+static int
+smrt_tran_reset(struct scsi_address *sa, int level)
+{
+ _NOTE(ARGUNUSED(level))
+
+ struct scsi_device *sd;
+ smrt_target_t *smtg;
+ smrt_t *smrt;
+ smrt_command_t *smcm;
+ int r;
+
+ sd = scsi_address_device(sa);
+ VERIFY(sd != NULL);
+ smtg = scsi_device_hba_private_get(sd);
+ VERIFY(smtg != NULL);
+ smrt = smtg->smtg_ctlr;
+
+ /*
+ * The framework has requested some kind of SCSI reset. A
+ * controller-level soft reset can take a very long time -- often on
+ * the order of 30-60 seconds -- but might well be our only option if
+ * the controller is non-responsive.
+ *
+ * First, check if the controller is responding to pings.
+ */
+again:
+ if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL,
+ KM_NOSLEEP)) == NULL) {
+ return (0);
+ }
+
+ smrt_write_message_nop(smcm, SMRT_PING_CHECK_TIMEOUT);
+
+ mutex_enter(&smrt->smrt_mutex);
+ smrt->smrt_stats.smrts_tran_resets++;
+ if (ddi_in_panic()) {
+ goto skip_check;
+ }
+
+ if (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) {
+ /*
+ * The controller is already resetting. Wait for that
+ * to finish.
+ */
+ while (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) {
+ cv_wait(&smrt->smrt_cv_finishq, &smrt->smrt_mutex);
+ }
+ }
+
+skip_check:
+ /*
+ * Submit our ping to the controller.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
+ smcm->smcm_expiry = gethrtime() + SMRT_PING_CHECK_TIMEOUT * NANOSEC;
+ if (smrt_submit(smrt, smcm) != 0) {
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_free(smcm);
+ return (0);
+ }
+
+ if ((r = smrt_poll_for(smrt, smcm)) != 0) {
+ VERIFY3S(r, ==, ETIMEDOUT);
+ VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE);
+
+ /*
+ * The ping command timed out. Abandon it now.
+ */
+ dev_err(smrt->smrt_dip, CE_WARN, "controller ping timed out");
+ smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED;
+ smcm->smcm_status &= ~SMRT_CMD_STATUS_POLLED;
+
+ } else if ((smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) ||
+ (smcm->smcm_status & SMRT_CMD_STATUS_ERROR)) {
+ /*
+ * The command completed in error, or a controller reset
+ * was sent while we were trying to ping.
+ */
+ dev_err(smrt->smrt_dip, CE_WARN, "controller ping error");
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_free(smcm);
+ mutex_enter(&smrt->smrt_mutex);
+
+ } else {
+ VERIFY(smcm->smcm_status & SMRT_CMD_STATUS_COMPLETE);
+
+ /*
+ * The controller is responsive, and a full soft reset would be
+ * extremely disruptive to the system. Given our spotty
+ * support for some SCSI commands (which can upset the target
+ * drivers) and the historically lax behaviour of the "smrt"
+ * driver, we grit our teeth and pretend we were able to
+ * perform a reset.
+ */
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_free(smcm);
+ return (1);
+ }
+
+ /*
+ * If a reset has been initiated in the last 90 seconds, try
+ * another ping.
+ */
+ if (gethrtime() < smrt->smrt_last_reset_start + 90 * NANOSEC) {
+ dev_err(smrt->smrt_dip, CE_WARN, "controller ping failed, but "
+ "was recently reset; retrying ping");
+ mutex_exit(&smrt->smrt_mutex);
+
+ /*
+ * Sleep for a second first.
+ */
+ if (ddi_in_panic()) {
+ drv_usecwait(1 * MICROSEC);
+ } else {
+ delay(drv_usectohz(1 * MICROSEC));
+ }
+ goto again;
+ }
+
+ dev_err(smrt->smrt_dip, CE_WARN, "controller ping failed; resetting "
+ "controller");
+ if (smrt_ctlr_reset(smrt) != 0) {
+ dev_err(smrt->smrt_dip, CE_WARN, "controller reset failure");
+ mutex_exit(&smrt->smrt_mutex);
+ return (0);
+ }
+
+ mutex_exit(&smrt->smrt_mutex);
+ return (1);
+}
+
+static int
+smrt_tran_abort(struct scsi_address *sa, struct scsi_pkt *pkt)
+{
+ struct scsi_device *sd;
+ smrt_target_t *smtg;
+ smrt_t *smrt;
+ smrt_command_t *smcm = NULL;
+ smrt_command_t *abort_smcm;
+
+ sd = scsi_address_device(sa);
+ VERIFY(sd != NULL);
+ smtg = scsi_device_hba_private_get(sd);
+ VERIFY(smtg != NULL);
+ smrt = smtg->smtg_ctlr;
+ VERIFY(smrt != NULL);
+
+
+ if ((abort_smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL,
+ KM_NOSLEEP)) == NULL) {
+ /*
+ * No resources available to send an abort message.
+ */
+ return (0);
+ }
+
+ mutex_enter(&smrt->smrt_mutex);
+ smrt->smrt_stats.smrts_tran_aborts++;
+ if (pkt != NULL) {
+ /*
+ * The framework wants us to abort a specific SCSI packet.
+ */
+ smrt_command_scsa_t *smcms = (smrt_command_scsa_t *)
+ pkt->pkt_ha_private;
+ smcm = smcms->smcms_command;
+
+ if (!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT)) {
+ /*
+ * This message is not currently in flight, so we
+ * cannot abort it.
+ */
+ goto fail;
+ }
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_ABORT_SENT) {
+ /*
+ * An abort message for this command has already been
+ * sent to the controller. Return failure.
+ */
+ goto fail;
+ }
+
+ smrt_write_message_abort_one(abort_smcm, smcm->smcm_tag);
+ } else {
+ /*
+ * The framework wants us to abort every in flight command
+ * for the target with this address.
+ */
+ smrt_write_message_abort_all(abort_smcm, smtg->smtg_addr);
+ }
+
+ /*
+ * Submit the abort message to the controller.
+ */
+ abort_smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
+ if (smrt_submit(smrt, abort_smcm) != 0) {
+ goto fail;
+ }
+
+ if (pkt != NULL) {
+ /*
+ * Record some debugging information about the abort we
+ * sent:
+ */
+ smcm->smcm_abort_time = gethrtime();
+ smcm->smcm_abort_tag = abort_smcm->smcm_tag;
+
+ /*
+ * Mark the command as aborted so that we do not send
+ * a second abort message:
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_ABORT_SENT;
+ }
+
+ /*
+ * Poll for completion of the abort message. Note that this function
+ * only fails if we set a timeout on the command, which we have not
+ * done.
+ */
+ VERIFY0(smrt_poll_for(smrt, abort_smcm));
+
+ if ((abort_smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) ||
+ (abort_smcm->smcm_status & SMRT_CMD_STATUS_ERROR)) {
+ /*
+ * Either the controller was reset or the abort command
+ * failed.
+ */
+ goto fail;
+ }
+
+ /*
+ * The command was successfully aborted.
+ */
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_free(abort_smcm);
+ return (1);
+
+fail:
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_free(abort_smcm);
+ return (0);
+}
+
+static void
+smrt_hba_complete_status(smrt_command_t *smcm)
+{
+ ErrorInfo_t *ei = smcm->smcm_va_err;
+ struct scsi_pkt *pkt = smcm->smcm_scsa->smcms_pkt;
+
+ bzero(pkt->pkt_scbp, pkt->pkt_scblen);
+
+ if (ei->ScsiStatus != STATUS_CHECK) {
+ /*
+ * If the SCSI status is not CHECK CONDITION, we don't want
+ * to try and read the sense data buffer.
+ */
+ goto simple_status;
+ }
+
+ if (pkt->pkt_scblen < sizeof (struct scsi_arq_status)) {
+ /*
+ * There is not enough room for a request sense structure.
+ * Fall back to reporting just the SCSI status code.
+ */
+ goto simple_status;
+ }
+
+ /* LINTED: E_BAD_PTR_CAST_ALIGN */
+ struct scsi_arq_status *sts = (struct scsi_arq_status *)pkt->pkt_scbp;
+
+ /*
+ * Copy in the SCSI status from the original command.
+ */
+ bcopy(&ei->ScsiStatus, &sts->sts_status, sizeof (sts->sts_status));
+
+ /*
+ * Mock up a successful REQUEST SENSE:
+ */
+ sts->sts_rqpkt_reason = CMD_CMPLT;
+ sts->sts_rqpkt_resid = 0;
+ sts->sts_rqpkt_state = STATE_GOT_BUS | STATE_GOT_TARGET |
+ STATE_SENT_CMD | STATE_XFERRED_DATA | STATE_GOT_STATUS;
+ sts->sts_rqpkt_statistics = 0;
+
+ /*
+ * The sense data from the controller should be copied into place
+ * starting at the "sts_sensedata" member of the auto request
+ * sense object.
+ */
+ size_t sense_len = pkt->pkt_scblen - offsetof(struct scsi_arq_status,
+ sts_sensedata);
+ if (ei->SenseLen < sense_len) {
+ /*
+ * Only copy sense data bytes that are within the region
+ * the controller marked as valid.
+ */
+ sense_len = ei->SenseLen;
+ }
+ bcopy(ei->SenseInfo, &sts->sts_sensedata, sense_len);
+
+ pkt->pkt_state |= STATE_ARQ_DONE;
+ return;
+
+simple_status:
+ if (pkt->pkt_scblen < sizeof (struct scsi_status)) {
+ /*
+ * There is not even enough room for the SCSI status byte.
+ */
+ return;
+ }
+
+ bcopy(&ei->ScsiStatus, pkt->pkt_scbp, sizeof (struct scsi_status));
+}
+
+static void
+smrt_hba_complete_log_error(smrt_command_t *smcm, const char *name)
+{
+ smrt_t *smrt = smcm->smcm_ctlr;
+ ErrorInfo_t *ei = smcm->smcm_va_err;
+
+ dev_err(smrt->smrt_dip, CE_WARN, "!SCSI command failed: %s: "
+ "SCSI op %x, CISS status %x, SCSI status %x", name,
+ (unsigned)smcm->smcm_va_cmd->Request.CDB[0],
+ (unsigned)ei->CommandStatus, (unsigned)ei->ScsiStatus);
+}
+
+/*
+ * Completion routine for commands submitted to the controller via the SCSI
+ * framework.
+ */
+void
+smrt_hba_complete(smrt_command_t *smcm)
+{
+ smrt_t *smrt = smcm->smcm_ctlr;
+ ErrorInfo_t *ei = smcm->smcm_va_err;
+ struct scsi_pkt *pkt = smcm->smcm_scsa->smcms_pkt;
+
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ pkt->pkt_resid = ei->ResidualCnt;
+
+ /*
+ * Check if the controller was reset while this packet was in flight.
+ */
+ if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) {
+ if (pkt->pkt_reason != CMD_CMPLT) {
+ /*
+ * If another error status has already been written,
+ * do not overwrite it.
+ */
+ pkt->pkt_reason = CMD_RESET;
+ }
+ pkt->pkt_statistics |= STAT_BUS_RESET | STAT_DEV_RESET;
+ goto finish;
+ }
+
+ if (!(smcm->smcm_status & SMRT_CMD_STATUS_ERROR)) {
+ /*
+ * The command was completed without error by the controller.
+ *
+ * As per the specification, if an error was not signalled
+ * by the controller through the CISS transport method,
+ * the error information (including CommandStatus) has not
+ * been written and should not be checked.
+ */
+ pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS;
+ goto finish;
+ }
+
+ /*
+ * Check the completion status to determine what befell this request.
+ */
+ switch (ei->CommandStatus) {
+ case CISS_CMD_SUCCESS:
+ /*
+ * In a certain sense, the specification contradicts itself.
+ * On the one hand, it suggests that a successful command
+ * will not result in a controller write to the error
+ * information block; on the other hand, it makes room
+ * for a status code (0) which denotes a successful
+ * execution.
+ *
+ * To be on the safe side, we check for that condition here.
+ */
+ pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS;
+ break;
+
+ case CISS_CMD_DATA_UNDERRUN:
+ /*
+ * A data underrun occurred. Ideally this will result in
+ * an appropriate SCSI status and sense data.
+ */
+ pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS;
+ break;
+
+ case CISS_CMD_TARGET_STATUS:
+ /*
+ * The command completed, but an error occurred. We need
+ * to provide the sense data to the SCSI framework.
+ */
+ pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS;
+ break;
+
+ case CISS_CMD_DATA_OVERRUN:
+ /*
+ * Data overrun has occurred.
+ */
+ smrt_hba_complete_log_error(smcm, "data overrun");
+ pkt->pkt_reason = CMD_DATA_OVR;
+ pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS;
+ break;
+
+ case CISS_CMD_INVALID:
+ /*
+ * One or more fields in the command has invalid data.
+ */
+ smrt_hba_complete_log_error(smcm, "invalid command");
+ pkt->pkt_reason = CMD_BADMSG;
+ pkt->pkt_state |= STATE_GOT_STATUS;
+ break;
+
+ case CISS_CMD_PROTOCOL_ERR:
+ /*
+ * An error occurred in communication with the end device.
+ */
+ smrt_hba_complete_log_error(smcm, "protocol error");
+ pkt->pkt_reason = CMD_BADMSG;
+ pkt->pkt_state |= STATE_GOT_STATUS;
+ break;
+
+ case CISS_CMD_HARDWARE_ERR:
+ /*
+ * A hardware error occurred.
+ */
+ smrt_hba_complete_log_error(smcm, "hardware error");
+ pkt->pkt_reason = CMD_INCOMPLETE;
+ break;
+
+ case CISS_CMD_CONNECTION_LOST:
+ /*
+ * The connection with the end device cannot be
+ * re-established.
+ */
+ smrt_hba_complete_log_error(smcm, "connection lost");
+ pkt->pkt_reason = CMD_INCOMPLETE;
+ break;
+
+ case CISS_CMD_ABORTED:
+ case CISS_CMD_UNSOLICITED_ABORT:
+ if (smcm->smcm_status & SMRT_CMD_STATUS_TIMEOUT) {
+ /*
+ * This abort was arranged by the periodic routine
+ * in response to an elapsed timeout.
+ */
+ pkt->pkt_reason = CMD_TIMEOUT;
+ pkt->pkt_statistics |= STAT_TIMEOUT;
+ } else {
+ pkt->pkt_reason = CMD_ABORTED;
+ }
+ pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS;
+ pkt->pkt_statistics |= STAT_ABORTED;
+ break;
+
+ case CISS_CMD_TIMEOUT:
+ smrt_hba_complete_log_error(smcm, "timeout");
+ pkt->pkt_reason = CMD_TIMEOUT;
+ pkt->pkt_statistics |= STAT_TIMEOUT;
+ break;
+
+ default:
+ /*
+ * This is an error that we were not prepared to handle.
+ * Signal a generic transport-level error to the framework.
+ */
+ smrt_hba_complete_log_error(smcm, "unexpected error");
+ pkt->pkt_reason = CMD_TRAN_ERR;
+ }
+
+ /*
+ * Attempt to read a SCSI status code and any automatic
+ * request sense data that may exist:
+ */
+ smrt_hba_complete_status(smcm);
+
+finish:
+ mutex_exit(&smrt->smrt_mutex);
+ scsi_hba_pkt_comp(pkt);
+ mutex_enter(&smrt->smrt_mutex);
+}
+
+static int
+smrt_getcap(struct scsi_address *sa, char *cap, int whom)
+{
+ _NOTE(ARGUNUSED(whom))
+
+ struct scsi_device *sd;
+ smrt_target_t *smtg;
+ smrt_t *smrt;
+ int index;
+
+ sd = scsi_address_device(sa);
+ VERIFY(sd != NULL);
+ smtg = scsi_device_hba_private_get(sd);
+ VERIFY(smtg != NULL);
+ smrt = smtg->smtg_ctlr;
+ VERIFY(smrt != NULL);
+
+ if ((index = scsi_hba_lookup_capstr(cap)) == DDI_FAILURE) {
+ /*
+ * This capability string could not be translated to an
+ * ID number, so it must not exist.
+ */
+ return (-1);
+ }
+
+ switch (index) {
+ case SCSI_CAP_CDB_LEN:
+ /*
+ * The CDB field in the CISS request block is fixed at 16
+ * bytes.
+ */
+ return (CISS_CDBLEN);
+
+ case SCSI_CAP_DMA_MAX:
+ if (smrt->smrt_dma_attr.dma_attr_maxxfer > INT_MAX) {
+ return (INT_MAX);
+ }
+ return ((int)smrt->smrt_dma_attr.dma_attr_maxxfer);
+
+ case SCSI_CAP_SECTOR_SIZE:
+ if (smrt->smrt_dma_attr.dma_attr_granular > INT_MAX) {
+ return (-1);
+ }
+ return ((int)smrt->smrt_dma_attr.dma_attr_granular);
+
+ /*
+ * If this target corresponds to a physical device, then we always
+ * indicate that we're on a SAS interconnect. Otherwise, we default to
+ * saying that we're on a parallel bus. We can't use SAS for
+ * everything, unfortunately. When you declare yourself to be a SAS
+ * interconnect, it's expected that you have a full 16-byte WWN as the
+ * target. If not, devfsadm will not be able to enumerate the device
+ * and create /dev/[r]dsk entries.
+ */
+ case SCSI_CAP_INTERCONNECT_TYPE:
+ if (smtg->smtg_physical) {
+ return (INTERCONNECT_SAS);
+ } else {
+ return (INTERCONNECT_PARALLEL);
+ }
+
+ case SCSI_CAP_DISCONNECT:
+ case SCSI_CAP_SYNCHRONOUS:
+ case SCSI_CAP_WIDE_XFER:
+ case SCSI_CAP_ARQ:
+ case SCSI_CAP_UNTAGGED_QING:
+ case SCSI_CAP_TAGGED_QING:
+ /*
+ * These capabilities are supported by the driver and the
+ * controller. See scsi_ifgetcap(9F) for more information.
+ */
+ return (1);
+
+ case SCSI_CAP_INITIATOR_ID:
+ case SCSI_CAP_RESET_NOTIFICATION:
+ /*
+ * These capabilities are not supported.
+ */
+ return (0);
+
+ default:
+ /*
+ * The property in question is not known to this driver.
+ */
+ return (-1);
+ }
+}
+
+/* ARGSUSED */
+static int
+smrt_setcap(struct scsi_address *sa, char *cap, int value, int whom)
+{
+ int index;
+
+ if ((index = scsi_hba_lookup_capstr(cap)) == DDI_FAILURE) {
+ /*
+ * This capability string could not be translated to an
+ * ID number, so it must not exist.
+ */
+ return (-1);
+ }
+
+ if (whom == 0) {
+ /*
+ * When whom is 0, this is a request to set a capability for
+ * all targets. As per the recommendation in tran_setcap(9E),
+ * we do not support this mode of operation.
+ */
+ return (-1);
+ }
+
+ switch (index) {
+ case SCSI_CAP_CDB_LEN:
+ case SCSI_CAP_DMA_MAX:
+ case SCSI_CAP_SECTOR_SIZE:
+ case SCSI_CAP_INITIATOR_ID:
+ case SCSI_CAP_DISCONNECT:
+ case SCSI_CAP_SYNCHRONOUS:
+ case SCSI_CAP_WIDE_XFER:
+ case SCSI_CAP_ARQ:
+ case SCSI_CAP_UNTAGGED_QING:
+ case SCSI_CAP_TAGGED_QING:
+ case SCSI_CAP_RESET_NOTIFICATION:
+ case SCSI_CAP_INTERCONNECT_TYPE:
+ /*
+ * We do not support changing any capabilities at this time.
+ */
+ return (0);
+
+ default:
+ /*
+ * The capability in question is not known to this driver.
+ */
+ return (-1);
+ }
+}
+
+int
+smrt_ctrl_hba_setup(smrt_t *smrt)
+{
+ int flags;
+ dev_info_t *dip = smrt->smrt_dip;
+ scsi_hba_tran_t *tran;
+
+ if ((tran = scsi_hba_tran_alloc(dip, SCSI_HBA_CANSLEEP)) == NULL) {
+ dev_err(dip, CE_WARN, "could not allocate SCSA resources");
+ return (DDI_FAILURE);
+ }
+
+ smrt->smrt_hba_tran = tran;
+ tran->tran_hba_private = smrt;
+
+ tran->tran_tgt_init = smrt_ctrl_tran_tgt_init;
+ tran->tran_tgt_probe = scsi_hba_probe;
+
+ tran->tran_start = smrt_ctrl_tran_start;
+
+ tran->tran_getcap = smrt_getcap;
+ tran->tran_setcap = smrt_setcap;
+
+ tran->tran_setup_pkt = smrt_tran_setup_pkt;
+ tran->tran_teardown_pkt = smrt_tran_teardown_pkt;
+ tran->tran_hba_len = sizeof (smrt_command_scsa_t);
+ tran->tran_interconnect_type = INTERCONNECT_SAS;
+
+ flags = SCSI_HBA_HBA | SCSI_HBA_TRAN_SCB | SCSI_HBA_ADDR_COMPLEX;
+ if (scsi_hba_attach_setup(dip, &smrt->smrt_dma_attr, tran, flags) !=
+ DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "could not attach to SCSA framework");
+ scsi_hba_tran_free(tran);
+ return (DDI_FAILURE);
+ }
+
+ smrt->smrt_init_level |= SMRT_INITLEVEL_SCSA;
+ return (DDI_SUCCESS);
+}
+
+void
+smrt_ctrl_hba_teardown(smrt_t *smrt)
+{
+ if (smrt->smrt_init_level & SMRT_INITLEVEL_SCSA) {
+ VERIFY(scsi_hba_detach(smrt->smrt_dip) != DDI_FAILURE);
+ scsi_hba_tran_free(smrt->smrt_hba_tran);
+ smrt->smrt_init_level &= ~SMRT_INITLEVEL_SCSA;
+ }
+}
+
+int
+smrt_logvol_hba_setup(smrt_t *smrt, dev_info_t *iport)
+{
+ scsi_hba_tran_t *tran;
+
+ tran = ddi_get_driver_private(iport);
+ if (tran == NULL)
+ return (DDI_FAILURE);
+
+ tran->tran_tgt_init = smrt_logvol_tran_tgt_init;
+ tran->tran_tgt_free = smrt_logvol_tran_tgt_free;
+
+ tran->tran_start = smrt_tran_start;
+ tran->tran_reset = smrt_tran_reset;
+ tran->tran_abort = smrt_tran_abort;
+
+ tran->tran_hba_private = smrt;
+
+ mutex_enter(&smrt->smrt_mutex);
+ if (scsi_hba_tgtmap_create(iport, SCSI_TM_FULLSET, MICROSEC,
+ 2 * MICROSEC, smrt, smrt_logvol_tgtmap_activate,
+ smrt_logvol_tgtmap_deactivate, &smrt->smrt_virt_tgtmap) !=
+ DDI_SUCCESS) {
+ return (DDI_FAILURE);
+ }
+
+ smrt_discover_request(smrt);
+ mutex_exit(&smrt->smrt_mutex);
+
+ return (DDI_SUCCESS);
+}
+
+void
+smrt_logvol_hba_teardown(smrt_t *smrt, dev_info_t *iport)
+{
+ ASSERT(smrt->smrt_virt_iport == iport);
+
+ mutex_enter(&smrt->smrt_mutex);
+
+ if (smrt->smrt_virt_tgtmap != NULL) {
+ scsi_hba_tgtmap_t *t;
+
+ /*
+ * Ensure that we can't be racing with discovery.
+ */
+ while (smrt->smrt_status & SMRT_CTLR_DISCOVERY_RUNNING) {
+ mutex_exit(&smrt->smrt_mutex);
+ ddi_taskq_wait(smrt->smrt_discover_taskq);
+ mutex_enter(&smrt->smrt_mutex);
+ }
+
+ t = smrt->smrt_virt_tgtmap;
+ smrt->smrt_virt_tgtmap = NULL;
+ mutex_exit(&smrt->smrt_mutex);
+ scsi_hba_tgtmap_destroy(t);
+ mutex_enter(&smrt->smrt_mutex);
+ }
+
+ mutex_exit(&smrt->smrt_mutex);
+}
+
+int
+smrt_phys_hba_setup(smrt_t *smrt, dev_info_t *iport)
+{
+ scsi_hba_tran_t *tran;
+
+ tran = ddi_get_driver_private(iport);
+ if (tran == NULL)
+ return (DDI_FAILURE);
+
+ tran->tran_tgt_init = smrt_phys_tran_tgt_init;
+ tran->tran_tgt_free = smrt_phys_tran_tgt_free;
+
+ tran->tran_start = smrt_tran_start;
+ tran->tran_reset = smrt_tran_reset;
+ tran->tran_abort = smrt_tran_abort;
+
+ tran->tran_hba_private = smrt;
+
+ mutex_enter(&smrt->smrt_mutex);
+ if (scsi_hba_tgtmap_create(iport, SCSI_TM_FULLSET, MICROSEC,
+ 2 * MICROSEC, smrt, smrt_phys_tgtmap_activate,
+ smrt_phys_tgtmap_deactivate, &smrt->smrt_phys_tgtmap) !=
+ DDI_SUCCESS) {
+ return (DDI_FAILURE);
+ }
+
+ smrt_discover_request(smrt);
+ mutex_exit(&smrt->smrt_mutex);
+
+ return (DDI_SUCCESS);
+}
+
+void
+smrt_phys_hba_teardown(smrt_t *smrt, dev_info_t *iport)
+{
+ ASSERT(smrt->smrt_phys_iport == iport);
+
+ mutex_enter(&smrt->smrt_mutex);
+
+ if (smrt->smrt_phys_tgtmap != NULL) {
+ scsi_hba_tgtmap_t *t;
+
+ /*
+ * Ensure that we can't be racing with discovery.
+ */
+ while (smrt->smrt_status & SMRT_CTLR_DISCOVERY_RUNNING) {
+ mutex_exit(&smrt->smrt_mutex);
+ ddi_taskq_wait(smrt->smrt_discover_taskq);
+ mutex_enter(&smrt->smrt_mutex);
+ }
+
+ t = smrt->smrt_phys_tgtmap;
+ smrt->smrt_phys_tgtmap = NULL;
+ mutex_exit(&smrt->smrt_mutex);
+ scsi_hba_tgtmap_destroy(t);
+ mutex_enter(&smrt->smrt_mutex);
+ }
+
+ mutex_exit(&smrt->smrt_mutex);
+}
diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_interrupts.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_interrupts.c
new file mode 100644
index 0000000000..18d5b8e936
--- /dev/null
+++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_interrupts.c
@@ -0,0 +1,286 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2017, Joyent, Inc.
+ */
+
+#include <sys/scsi/adapters/smrt/smrt.h>
+
+static char *
+smrt_interrupt_type_name(int type)
+{
+ switch (type) {
+ case DDI_INTR_TYPE_MSIX:
+ return ("MSI-X");
+ case DDI_INTR_TYPE_MSI:
+ return ("MSI");
+ case DDI_INTR_TYPE_FIXED:
+ return ("fixed");
+ default:
+ return ("?");
+ }
+}
+
+static boolean_t
+smrt_try_msix(smrt_t *smrt)
+{
+ char *fwver = smrt->smrt_versions.smrtv_firmware_rev;
+
+ /*
+ * Generation 9 controllers end up having a different firmware
+ * versioning scheme than others. If this is a generation 9 controller,
+ * which all share the same PCI device ID, then we default to MSI.
+ */
+ if (smrt->smrt_pci_vendor == SMRT_VENDOR_HP &&
+ smrt->smrt_pci_device == SMRT_DEVICE_GEN9) {
+ return (B_FALSE);
+ }
+
+ if (fwver[0] == '8' && fwver[1] == '.' && isdigit(fwver[2]) &&
+ isdigit(fwver[3])) {
+ /*
+ * Version 8.00 of the Smart Array firmware appears to have
+ * broken MSI support on at least one controller. We could
+ * blindly try MSI-X everywhere, except that on at least some
+ * 6.XX firmware versions, MSI-X interrupts do not appear
+ * to be triggered for Simple Transport Method command
+ * completions.
+ *
+ * For now, assume we should try for MSI-X with all 8.XX
+ * versions of the firmware.
+ */
+ dev_err(smrt->smrt_dip, CE_NOTE, "!trying MSI-X interrupts "
+ "to work around 8.XX firmware defect");
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+static int
+smrt_interrupts_disable(smrt_t *smrt)
+{
+ if (smrt->smrt_interrupt_cap & DDI_INTR_FLAG_BLOCK) {
+ return (ddi_intr_block_disable(smrt->smrt_interrupts,
+ smrt->smrt_ninterrupts));
+ } else {
+ VERIFY3S(smrt->smrt_ninterrupts, ==, 1);
+
+ return (ddi_intr_disable(smrt->smrt_interrupts[0]));
+ }
+}
+
+int
+smrt_interrupts_enable(smrt_t *smrt)
+{
+ int ret;
+
+ VERIFY(!(smrt->smrt_init_level & SMRT_INITLEVEL_INT_ENABLED));
+
+ if (smrt->smrt_interrupt_cap & DDI_INTR_FLAG_BLOCK) {
+ ret = ddi_intr_block_enable(smrt->smrt_interrupts,
+ smrt->smrt_ninterrupts);
+ } else {
+ VERIFY3S(smrt->smrt_ninterrupts, ==, 1);
+
+ ret = ddi_intr_enable(smrt->smrt_interrupts[0]);
+ }
+
+ if (ret == DDI_SUCCESS) {
+ smrt->smrt_init_level |= SMRT_INITLEVEL_INT_ENABLED;
+ }
+
+ return (ret);
+}
+
+static void
+smrt_interrupts_free(smrt_t *smrt)
+{
+ for (int i = 0; i < smrt->smrt_ninterrupts; i++) {
+ (void) ddi_intr_free(smrt->smrt_interrupts[i]);
+ }
+ smrt->smrt_ninterrupts = 0;
+ smrt->smrt_interrupt_type = 0;
+ smrt->smrt_interrupt_cap = 0;
+ smrt->smrt_interrupt_pri = 0;
+}
+
+static int
+smrt_interrupts_alloc(smrt_t *smrt, int type)
+{
+ dev_info_t *dip = smrt->smrt_dip;
+ int nintrs = 0;
+ int navail = 0;
+
+ if (ddi_intr_get_nintrs(dip, type, &nintrs) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "could not count %s interrupts",
+ smrt_interrupt_type_name(type));
+ return (DDI_FAILURE);
+ }
+ if (nintrs < 1) {
+ dev_err(dip, CE_WARN, "no %s interrupts supported",
+ smrt_interrupt_type_name(type));
+ return (DDI_FAILURE);
+ }
+
+ if (ddi_intr_get_navail(dip, type, &navail) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "could not count available %s "
+ "interrupts", smrt_interrupt_type_name(type));
+ return (DDI_FAILURE);
+ }
+ if (navail < 1) {
+ dev_err(dip, CE_WARN, "no %s interrupts available",
+ smrt_interrupt_type_name(type));
+ return (DDI_FAILURE);
+ }
+
+ if (ddi_intr_alloc(dip, smrt->smrt_interrupts, type, 0, 1,
+ &smrt->smrt_ninterrupts, DDI_INTR_ALLOC_STRICT) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "%s interrupt allocation failed",
+ smrt_interrupt_type_name(type));
+ smrt_interrupts_free(smrt);
+ return (DDI_FAILURE);
+ }
+
+ smrt->smrt_init_level |= SMRT_INITLEVEL_INT_ALLOC;
+ smrt->smrt_interrupt_type = type;
+ return (DDI_SUCCESS);
+}
+
+int
+smrt_interrupts_setup(smrt_t *smrt)
+{
+ int types;
+ unsigned ipri;
+ uint_t (*hw_isr)(caddr_t, caddr_t);
+ dev_info_t *dip = smrt->smrt_dip;
+
+ /*
+ * Select the correct hardware interrupt service routine for the
+ * Transport Method we have configured:
+ */
+ switch (smrt->smrt_ctlr_mode) {
+ case SMRT_CTLR_MODE_SIMPLE:
+ hw_isr = smrt_isr_hw_simple;
+ break;
+ default:
+ panic("unknown controller mode");
+ }
+
+ if (ddi_intr_get_supported_types(dip, &types) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "could not get support interrupts");
+ goto fail;
+ }
+
+ /*
+ * At least one firmware version has been released for the Smart Array
+ * line with entirely defective MSI support. The specification is
+ * somewhat unclear on the precise nature of MSI-X support with Smart
+ * Array controllers, particularly with respect to the Simple Transport
+ * Method, but for those broken firmware versions we need to try
+ * anyway.
+ */
+ if (smrt_try_msix(smrt) && (types & DDI_INTR_TYPE_MSIX)) {
+ if (smrt_interrupts_alloc(smrt, DDI_INTR_TYPE_MSIX) ==
+ DDI_SUCCESS) {
+ goto add_handler;
+ }
+ }
+
+ /*
+ * If MSI-X is not available, or not expected to work, fall back to
+ * MSI.
+ */
+ if (types & DDI_INTR_TYPE_MSI) {
+ if (smrt_interrupts_alloc(smrt, DDI_INTR_TYPE_MSI) ==
+ DDI_SUCCESS) {
+ goto add_handler;
+ }
+ }
+
+ /*
+ * If neither MSI-X nor MSI is available, fall back to fixed
+ * interrupts. Note that the use of fixed interrupts has been
+ * observed, with some combination of controllers and systems, to
+ * result in interrupts stopping completely at random times.
+ */
+ if (types & DDI_INTR_TYPE_FIXED) {
+ if (smrt_interrupts_alloc(smrt, DDI_INTR_TYPE_FIXED) ==
+ DDI_SUCCESS) {
+ goto add_handler;
+ }
+ }
+
+ /*
+ * We were unable to allocate any interrupts.
+ */
+ dev_err(dip, CE_WARN, "interrupt allocation failed");
+ goto fail;
+
+add_handler:
+ /*
+ * Ensure that we have not been given a high-level interrupt, as our
+ * interrupt handlers do not support them.
+ */
+ if (ddi_intr_get_pri(smrt->smrt_interrupts[0], &ipri) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "could not determine interrupt priority");
+ goto fail;
+ }
+ if (ipri >= ddi_intr_get_hilevel_pri()) {
+ dev_err(dip, CE_WARN, "high level interrupts not supported");
+ goto fail;
+ }
+ smrt->smrt_interrupt_pri = ipri;
+
+ if (ddi_intr_get_cap(smrt->smrt_interrupts[0],
+ &smrt->smrt_interrupt_cap) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "could not get %s interrupt cap",
+ smrt_interrupt_type_name(smrt->smrt_interrupt_type));
+ goto fail;
+ }
+
+ if (ddi_intr_add_handler(smrt->smrt_interrupts[0], hw_isr,
+ (caddr_t)smrt, NULL) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "adding %s interrupt failed",
+ smrt_interrupt_type_name(smrt->smrt_interrupt_type));
+ goto fail;
+ }
+ smrt->smrt_init_level |= SMRT_INITLEVEL_INT_ADDED;
+
+ return (DDI_SUCCESS);
+
+fail:
+ smrt_interrupts_teardown(smrt);
+ return (DDI_FAILURE);
+}
+
+void
+smrt_interrupts_teardown(smrt_t *smrt)
+{
+ if (smrt->smrt_init_level & SMRT_INITLEVEL_INT_ENABLED) {
+ (void) smrt_interrupts_disable(smrt);
+
+ smrt->smrt_init_level &= ~SMRT_INITLEVEL_INT_ENABLED;
+ }
+
+ if (smrt->smrt_init_level & SMRT_INITLEVEL_INT_ADDED) {
+ (void) ddi_intr_remove_handler(smrt->smrt_interrupts[0]);
+
+ smrt->smrt_init_level &= ~SMRT_INITLEVEL_INT_ADDED;
+ }
+
+ if (smrt->smrt_init_level & SMRT_INITLEVEL_INT_ALLOC) {
+ smrt_interrupts_free(smrt);
+
+ smrt->smrt_init_level &= ~SMRT_INITLEVEL_INT_ALLOC;
+ }
+}
diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_logvol.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_logvol.c
new file mode 100644
index 0000000000..05963ac2e2
--- /dev/null
+++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_logvol.c
@@ -0,0 +1,367 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2017, Joyent, Inc.
+ */
+
+#include <sys/scsi/adapters/smrt/smrt.h>
+
+static void
+smrt_logvol_free(smrt_volume_t *smlv)
+{
+ /*
+ * By this stage of teardown, all of the SCSI target drivers
+ * must have been detached from this logical volume.
+ */
+ VERIFY(list_is_empty(&smlv->smlv_targets));
+ list_destroy(&smlv->smlv_targets);
+
+ kmem_free(smlv, sizeof (*smlv));
+}
+
+smrt_volume_t *
+smrt_logvol_lookup_by_id(smrt_t *smrt, unsigned long id)
+{
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ for (smrt_volume_t *smlv = list_head(&smrt->smrt_volumes);
+ smlv != NULL; smlv = list_next(&smrt->smrt_volumes, smlv)) {
+ if (smlv->smlv_addr.LogDev.VolId == id) {
+ return (smlv);
+ }
+ }
+
+ return (NULL);
+}
+
+static int
+smrt_read_logvols(smrt_t *smrt, smrt_report_logical_lun_t *smrll, uint64_t gen)
+{
+ smrt_report_logical_lun_ent_t *ents = smrll->smrll_data.ents;
+ uint32_t count = BE_32(smrll->smrll_datasize) /
+ sizeof (smrt_report_logical_lun_ent_t);
+
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ if (count > SMRT_MAX_LOGDRV) {
+ count = SMRT_MAX_LOGDRV;
+ }
+
+ for (unsigned i = 0; i < count; i++) {
+ smrt_volume_t *smlv;
+ char id[SCSI_MAXNAMELEN];
+
+ DTRACE_PROBE2(read_logvol, unsigned, i,
+ smrt_report_logical_lun_ent_t *, &ents[i]);
+
+ if ((smlv = smrt_logvol_lookup_by_id(smrt,
+ ents[i].smrle_addr.VolId)) == NULL) {
+
+ /*
+ * This is a new Logical Volume, so add it the the list.
+ */
+ if ((smlv = kmem_zalloc(sizeof (*smlv), KM_NOSLEEP)) ==
+ NULL) {
+ return (ENOMEM);
+ }
+
+ list_create(&smlv->smlv_targets,
+ sizeof (smrt_target_t),
+ offsetof(smrt_target_t, smtg_link_lun));
+
+ smlv->smlv_ctlr = smrt;
+ list_insert_tail(&smrt->smrt_volumes, smlv);
+ }
+
+ /*
+ * Always make sure that the address and the generation are up
+ * to date, regardless of where this came from.
+ */
+ smlv->smlv_addr.LogDev = ents[i].smrle_addr;
+ smlv->smlv_gen = gen;
+ (void) snprintf(id, sizeof (id), "%x",
+ smlv->smlv_addr.LogDev.VolId);
+ if (!ddi_in_panic() &&
+ scsi_hba_tgtmap_set_add(smrt->smrt_virt_tgtmap,
+ SCSI_TGT_SCSI_DEVICE, id, NULL) != DDI_SUCCESS) {
+ return (EIO);
+ }
+ }
+
+ return (0);
+}
+
+static int
+smrt_read_logvols_ext(smrt_t *smrt, smrt_report_logical_lun_t *smrll,
+ uint64_t gen)
+{
+ smrt_report_logical_lun_extent_t *extents =
+ smrll->smrll_data.extents;
+ uint32_t count = BE_32(smrll->smrll_datasize) /
+ sizeof (smrt_report_logical_lun_extent_t);
+
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ if (count > SMRT_MAX_LOGDRV) {
+ count = SMRT_MAX_LOGDRV;
+ }
+
+ for (unsigned i = 0; i < count; i++) {
+ smrt_volume_t *smlv;
+ char id[SCSI_MAXNAMELEN];
+
+ DTRACE_PROBE2(read_logvol_ext, unsigned, i,
+ smrt_report_logical_lun_extent_t *, &extents[i]);
+
+ if ((smlv = smrt_logvol_lookup_by_id(smrt,
+ extents[i].smrle_addr.VolId)) != NULL) {
+ if ((smlv->smlv_flags & SMRT_VOL_FLAG_WWN) &&
+ bcmp(extents[i].smrle_wwn, smlv->smlv_wwn,
+ 16) != 0) {
+ dev_err(smrt->smrt_dip, CE_PANIC, "logical "
+ "volume %u WWN changed unexpectedly", i);
+ }
+ } else {
+ /*
+ * This is a new Logical Volume, so add it the the list.
+ */
+ if ((smlv = kmem_zalloc(sizeof (*smlv), KM_NOSLEEP)) ==
+ NULL) {
+ return (ENOMEM);
+ }
+
+ bcopy(extents[i].smrle_wwn, smlv->smlv_wwn, 16);
+ smlv->smlv_flags |= SMRT_VOL_FLAG_WWN;
+
+ list_create(&smlv->smlv_targets,
+ sizeof (smrt_target_t),
+ offsetof(smrt_target_t, smtg_link_lun));
+
+ smlv->smlv_ctlr = smrt;
+ list_insert_tail(&smrt->smrt_volumes, smlv);
+ }
+
+ /*
+ * Always make sure that the address and the generation are up
+ * to date. The address may have changed on a reset.
+ */
+ smlv->smlv_addr.LogDev = extents[i].smrle_addr;
+ smlv->smlv_gen = gen;
+ (void) snprintf(id, sizeof (id), "%x",
+ smlv->smlv_addr.LogDev.VolId);
+ if (!ddi_in_panic() &&
+ scsi_hba_tgtmap_set_add(smrt->smrt_virt_tgtmap,
+ SCSI_TGT_SCSI_DEVICE, id, NULL) != DDI_SUCCESS) {
+ return (EIO);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Discover the currently visible set of Logical Volumes exposed by the
+ * controller.
+ */
+int
+smrt_logvol_discover(smrt_t *smrt, uint16_t timeout, uint64_t gen)
+{
+ smrt_command_t *smcm;
+ smrt_report_logical_lun_t *smrll;
+ smrt_report_logical_lun_req_t smrllr = { 0 };
+ int r;
+
+ /*
+ * Allocate the command to send to the device, including buffer space
+ * for the returned list of Logical Volumes.
+ */
+ if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL,
+ KM_NOSLEEP)) == NULL || smrt_command_attach_internal(smrt, smcm,
+ sizeof (smrt_report_logical_lun_t), KM_NOSLEEP) != 0) {
+ r = ENOMEM;
+ mutex_enter(&smrt->smrt_mutex);
+ goto out;
+ }
+
+ smrll = smcm->smcm_internal->smcmi_va;
+
+ smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN);
+
+ smcm->smcm_va_cmd->Request.CDBLen = sizeof (smrllr);
+ smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout);
+ smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD;
+ smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE;
+ smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ;
+
+ /*
+ * The Report Logical LUNs command is essentially a vendor-specific
+ * SCSI command, which we assemble into the CDB region of the command
+ * block.
+ */
+ bzero(&smrllr, sizeof (smrllr));
+ smrllr.smrllr_opcode = CISS_SCMD_REPORT_LOGICAL_LUNS;
+ smrllr.smrllr_extflag = 1;
+ smrllr.smrllr_datasize = htonl(sizeof (smrt_report_logical_lun_t));
+ bcopy(&smrllr, &smcm->smcm_va_cmd->Request.CDB[0],
+ MIN(CISS_CDBLEN, sizeof (smrllr)));
+
+ mutex_enter(&smrt->smrt_mutex);
+
+ /*
+ * Send the command to the device.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
+ if ((r = smrt_submit(smrt, smcm)) != 0) {
+ goto out;
+ }
+
+ /*
+ * Poll for completion.
+ */
+ smcm->smcm_expiry = gethrtime() + timeout * NANOSEC;
+ if ((r = smrt_poll_for(smrt, smcm)) != 0) {
+ VERIFY3S(r, ==, ETIMEDOUT);
+ VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE);
+
+ /*
+ * The command timed out; abandon it now. Remove the POLLED
+ * flag so that the periodic routine will send an abort to
+ * clean it up next time around.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED;
+ smcm->smcm_status &= ~SMRT_CMD_STATUS_POLLED;
+ smcm = NULL;
+ goto out;
+ }
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) {
+ /*
+ * The controller was reset while we were trying to discover
+ * logical volumes. Report failure.
+ */
+ r = EIO;
+ goto out;
+ }
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) {
+ ErrorInfo_t *ei = smcm->smcm_va_err;
+
+ if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) {
+ dev_err(smrt->smrt_dip, CE_WARN, "logical volume "
+ "discovery error: status 0x%x", ei->CommandStatus);
+ r = EIO;
+ goto out;
+ }
+ }
+
+ if (!ddi_in_panic() &&
+ scsi_hba_tgtmap_set_begin(smrt->smrt_virt_tgtmap) != DDI_SUCCESS) {
+ dev_err(smrt->smrt_dip, CE_WARN, "failed to begin target map "
+ "observation on %s", SMRT_IPORT_VIRT);
+ r = EIO;
+ goto out;
+ }
+
+ if ((smrll->smrll_extflag & 0x1) != 0) {
+ r = smrt_read_logvols_ext(smrt, smrll, gen);
+ } else {
+ r = smrt_read_logvols(smrt, smrll, gen);
+ }
+
+ if (r == 0 && !ddi_in_panic()) {
+ if (scsi_hba_tgtmap_set_end(smrt->smrt_virt_tgtmap, 0) !=
+ DDI_SUCCESS) {
+ dev_err(smrt->smrt_dip, CE_WARN, "failed to end target "
+ "map observation on %s", SMRT_IPORT_VIRT);
+ r = EIO;
+ }
+ } else if (r != 0 && !ddi_in_panic()) {
+ if (scsi_hba_tgtmap_set_flush(smrt->smrt_virt_tgtmap) !=
+ DDI_SUCCESS) {
+ dev_err(smrt->smrt_dip, CE_WARN, "failed to end target "
+ "map observation on %s", SMRT_IPORT_VIRT);
+ r = EIO;
+ }
+ }
+
+ if (r == 0) {
+ /*
+ * Update the time of the last successful Logical Volume
+ * discovery:
+ */
+ smrt->smrt_last_log_discovery = gethrtime();
+ }
+
+out:
+ mutex_exit(&smrt->smrt_mutex);
+
+ if (smcm != NULL) {
+ smrt_command_free(smcm);
+ }
+ return (r);
+}
+
+void
+smrt_logvol_tgtmap_activate(void *arg, char *addr, scsi_tgtmap_tgt_type_t type,
+ void **privpp)
+{
+ smrt_t *smrt = arg;
+ unsigned long volume;
+ char *eptr;
+
+ VERIFY(type == SCSI_TGT_SCSI_DEVICE);
+ VERIFY0(ddi_strtoul(addr, &eptr, 16, &volume));
+ VERIFY3S(*eptr, ==, '\0');
+ VERIFY3S(volume, >=, 0);
+ VERIFY3S(volume, <, SMRT_MAX_LOGDRV);
+ mutex_enter(&smrt->smrt_mutex);
+ VERIFY(smrt_logvol_lookup_by_id(smrt, volume) != NULL);
+ mutex_exit(&smrt->smrt_mutex);
+ *privpp = NULL;
+}
+
+boolean_t
+smrt_logvol_tgtmap_deactivate(void *arg, char *addr,
+ scsi_tgtmap_tgt_type_t type, void *priv, scsi_tgtmap_deact_rsn_t reason)
+{
+ smrt_t *smrt = arg;
+ smrt_volume_t *smlv;
+ unsigned long volume;
+ char *eptr;
+
+ VERIFY(type == SCSI_TGT_SCSI_DEVICE);
+ VERIFY(priv == NULL);
+ VERIFY0(ddi_strtoul(addr, &eptr, 16, &volume));
+ VERIFY3S(*eptr, ==, '\0');
+ VERIFY3S(volume, >=, 0);
+ VERIFY3S(volume, <, SMRT_MAX_LOGDRV);
+
+ mutex_enter(&smrt->smrt_mutex);
+ smlv = smrt_logvol_lookup_by_id(smrt, volume);
+ VERIFY(smlv != NULL);
+
+ list_remove(&smrt->smrt_volumes, smlv);
+ smrt_logvol_free(smlv);
+ mutex_exit(&smrt->smrt_mutex);
+
+ return (B_FALSE);
+}
+
+void
+smrt_logvol_teardown(smrt_t *smrt)
+{
+ smrt_volume_t *smlv;
+
+ while ((smlv = list_remove_head(&smrt->smrt_volumes)) != NULL) {
+ smrt_logvol_free(smlv);
+ }
+}
diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_physical.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_physical.c
new file mode 100644
index 0000000000..8ab3927673
--- /dev/null
+++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_physical.c
@@ -0,0 +1,613 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2017 Joyent, Inc.
+ */
+
+#include <sys/scsi/adapters/smrt/smrt.h>
+
+static void
+smrt_physical_free(smrt_physical_t *smpt)
+{
+ VERIFY(list_is_empty(&smpt->smpt_targets));
+ VERIFY(smpt->smpt_info != NULL);
+
+ kmem_free(smpt->smpt_info, sizeof (*smpt->smpt_info));
+ list_destroy(&smpt->smpt_targets);
+ kmem_free(smpt, sizeof (*smpt));
+}
+
+/*
+ * Determine if a physical device enumerated should be shown to the world. There
+ * are three conditions to satisfy for this to be true.
+ *
+ * 1. The device (SAS, SATA, SES, etc.) must not have a masked CISS address. A
+ * masked CISS address indicates a device that we should not be performing I/O
+ * to.
+ * 2. The drive (SAS or SATA device) must not be marked as a member of a logical
+ * volume.
+ * 3. The drive (SAS or SATA device) must not be marked as a spare.
+ */
+static boolean_t
+smrt_physical_visible(PhysDevAddr_t *addr, smrt_identify_physical_drive_t *info)
+{
+ if (addr->Mode == SMRT_CISS_MODE_MASKED) {
+ return (B_FALSE);
+ }
+
+ if ((info->sipd_more_flags & (SMRT_MORE_FLAGS_LOGVOL |
+ SMRT_MORE_FLAGS_SPARE)) != 0) {
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+}
+
+/*
+ * Note, the caller is responsible for making sure that the unit-address form of
+ * the WWN is pased in. Any additional information to target a specific LUN
+ * will be ignored.
+ */
+smrt_physical_t *
+smrt_phys_lookup_by_ua(smrt_t *smrt, const char *ua)
+{
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ /*
+ * Sanity check that the caller has provided us enough bytes for a
+ * properly formed unit-address form of a WWN.
+ */
+ if (strlen(ua) < SCSI_WWN_UA_STRLEN)
+ return (NULL);
+
+ for (smrt_physical_t *smpt = list_head(&smrt->smrt_physicals);
+ smpt != NULL; smpt = list_next(&smrt->smrt_physicals, smpt)) {
+ char wwnstr[SCSI_WWN_BUFLEN];
+
+ (void) scsi_wwn_to_wwnstr(smpt->smpt_wwn, 1, wwnstr);
+ if (strncmp(wwnstr, ua, SCSI_WWN_UA_STRLEN) != 0)
+ continue;
+
+ /*
+ * Verify that the UA string is either a comma or null there.
+ * We accept the comma in case it's being used as part of a
+ * normal UA with a LUN.
+ */
+ if (ua[SCSI_WWN_UA_STRLEN] != '\0' &&
+ ua[SCSI_WWN_UA_STRLEN] != ',') {
+ continue;
+ }
+
+ return (smpt);
+ }
+
+ return (NULL);
+}
+
+static smrt_physical_t *
+smrt_phys_lookup_by_wwn(smrt_t *smrt, uint64_t wwn)
+{
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ for (smrt_physical_t *smpt = list_head(&smrt->smrt_physicals);
+ smpt != NULL; smpt = list_next(&smrt->smrt_physicals, smpt)) {
+ if (wwn == smpt->smpt_wwn)
+ return (smpt);
+ }
+
+ return (NULL);
+}
+
+static int
+smrt_phys_identify(smrt_t *smrt, smrt_identify_physical_drive_t *info,
+ uint16_t bmic, uint16_t timeout)
+{
+ smrt_command_t *smcm = NULL;
+ smrt_identify_physical_drive_t *sipd;
+ smrt_identify_physical_drive_req_t sipdr;
+ int ret;
+ size_t sz, copysz;
+
+ sz = sizeof (smrt_identify_physical_drive_t);
+ sz = P2ROUNDUP_TYPED(sz, 512, size_t);
+ if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL,
+ KM_NOSLEEP)) == NULL || smrt_command_attach_internal(smrt, smcm,
+ sizeof (*sipd), KM_NOSLEEP) != 0) {
+ ret = ENOMEM;
+ goto out;
+ }
+
+ sipd = smcm->smcm_internal->smcmi_va;
+
+ smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN);
+
+ smcm->smcm_va_cmd->Request.CDBLen = sizeof (sipdr);
+ smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout);
+ smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD;
+ smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE;
+ smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ;
+
+ /*
+ * Construct the IDENTIFY PHYSICAL DEVICE request CDB. Note that any
+ * reserved fields in the request must be filled with zeroes.
+ */
+ bzero(&sipdr, sizeof (sipdr));
+ sipdr.sipdr_opcode = CISS_SCMD_BMIC_READ;
+ sipdr.sipdr_lun = 0;
+ sipdr.sipdr_bmic_index1 = bmic & 0x00ff;
+ sipdr.sipdr_command = CISS_BMIC_IDENTIFY_PHYSICAL_DEVICE;
+ sipdr.sipdr_bmic_index2 = (bmic & 0xff00) >> 8;
+ bcopy(&sipdr, &smcm->smcm_va_cmd->Request.CDB[0],
+ MIN(CISS_CDBLEN, sizeof (sipdr)));
+
+ mutex_enter(&smrt->smrt_mutex);
+
+ /*
+ * Send the command to the device.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
+ if ((ret = smrt_submit(smrt, smcm)) != 0) {
+ mutex_exit(&smrt->smrt_mutex);
+ goto out;
+ }
+
+ /*
+ * Poll for completion.
+ */
+ smcm->smcm_expiry = gethrtime() + timeout * NANOSEC;
+ if ((ret = smrt_poll_for(smrt, smcm)) != 0) {
+ VERIFY3S(ret, ==, ETIMEDOUT);
+ VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE);
+
+ /*
+ * The command timed out; abandon it now. Remove the POLLED
+ * flag so that the periodic routine will send an abort to
+ * clean it up next time around.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED;
+ smcm->smcm_status &= ~SMRT_CMD_STATUS_POLLED;
+ smcm = NULL;
+ mutex_exit(&smrt->smrt_mutex);
+ goto out;
+ }
+ mutex_exit(&smrt->smrt_mutex);
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) {
+ /*
+ * The controller was reset while we were trying to discover
+ * physical volumes. Report failure.
+ */
+ ret = EIO;
+ goto out;
+ }
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) {
+ ErrorInfo_t *ei = smcm->smcm_va_err;
+
+ if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) {
+ dev_err(smrt->smrt_dip, CE_WARN, "identify physical "
+ "device error: status 0x%x", ei->CommandStatus);
+ ret = EIO;
+ goto out;
+ }
+
+ copysz = MIN(sizeof (*sipd), sz - ei->ResidualCnt);
+ } else {
+ copysz = sizeof (*sipd);
+ }
+
+
+ sz = MIN(sizeof (*sipd), copysz);
+ bcopy(sipd, info, sizeof (*sipd));
+
+ ret = 0;
+out:
+ if (smcm != NULL) {
+ smrt_command_free(smcm);
+ }
+
+ return (ret);
+}
+
+static int
+smrt_read_phys_ext(smrt_t *smrt, smrt_report_physical_lun_t *smrpl,
+ uint16_t timeout, uint64_t gen)
+{
+ smrt_report_physical_lun_extent_t *extents = smrpl->smrpl_data.extents;
+ uint32_t count = BE_32(smrpl->smrpl_datasize) /
+ sizeof (smrt_report_physical_lun_extent_t);
+ uint32_t i;
+
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+
+ if (count > SMRT_MAX_PHYSDEV) {
+ count = SMRT_MAX_PHYSDEV;
+ }
+
+ for (i = 0; i < count; i++) {
+ int ret;
+ smrt_physical_t *smpt;
+ smrt_identify_physical_drive_t *info;
+ smrt_report_physical_opdi_t *opdi;
+ uint16_t bmic;
+ uint64_t wwn, satawwn;
+ char name[SCSI_MAXNAMELEN];
+
+ opdi = &extents[i].srple_extdata.srple_opdi;
+
+ mutex_exit(&smrt->smrt_mutex);
+
+ /*
+ * Get the extended information about this device.
+ */
+ info = kmem_zalloc(sizeof (*info), KM_NOSLEEP);
+ if (info == NULL) {
+ mutex_enter(&smrt->smrt_mutex);
+ return (ENOMEM);
+ }
+
+ bmic = smrt_lun_addr_to_bmic(&extents[i].srple_addr);
+ ret = smrt_phys_identify(smrt, info, bmic, timeout);
+ if (ret != 0) {
+ mutex_enter(&smrt->smrt_mutex);
+ kmem_free(info, sizeof (*info));
+ return (ret);
+ }
+
+ wwn = *(uint64_t *)opdi->srpo_wwid;
+ wwn = BE_64(wwn);
+
+ /*
+ * SATA devices may not have a proper WWN returned from firmware
+ * based on the SATL specification. Try to fetch the proper id
+ * for SATA devices, if the drive has one. If the drive doesn't
+ * have one or the SATL refuses to give us one, we use whatever
+ * the controller told us.
+ */
+ if (opdi->srpo_dtype == SMRT_DTYPE_SATA &&
+ smrt_sata_determine_wwn(smrt, &extents[i].srple_addr,
+ &satawwn, timeout) == 0) {
+ wwn = satawwn;
+ }
+
+ mutex_enter(&smrt->smrt_mutex);
+ smpt = smrt_phys_lookup_by_wwn(smrt, wwn);
+ if (smpt != NULL) {
+ /*
+ * Sanity check that the model and serial number of this
+ * device is the same for this WWN. If it's not, the
+ * controller is probably lying about something.
+ */
+ if (bcmp(smpt->smpt_info->sipd_model, info->sipd_model,
+ sizeof (info->sipd_model)) != 0 ||
+ bcmp(smpt->smpt_info->sipd_serial,
+ info->sipd_serial, sizeof (info->sipd_serial)) !=
+ 0 || smpt->smpt_dtype != opdi->srpo_dtype) {
+ dev_err(smrt->smrt_dip, CE_PANIC, "physical "
+ "target with wwn 0x%" PRIx64 " changed "
+ "model, serial, or type unexpectedly: "
+ "smrt_physical_t %p, phys info: %p", wwn,
+ smpt, info);
+ }
+
+ /*
+ * When panicking, we don't allow a device's visibility
+ * to change to being invisible and be able to actually
+ * panic. We only worry about devices which are used
+ * for I/O. We purposefully ignore SES devices.
+ */
+ if (ddi_in_panic() &&
+ (opdi->srpo_dtype == SMRT_DTYPE_SATA ||
+ opdi->srpo_dtype == SMRT_DTYPE_SAS)) {
+ boolean_t visible;
+
+ visible = smrt_physical_visible(
+ &smpt->smpt_addr.PhysDev, smpt->smpt_info);
+
+ if (visible != smpt->smpt_visible) {
+ dev_err(smrt->smrt_dip, CE_PANIC,
+ "physical target with wwn 0x%"
+ PRIx64 " changed visibility status "
+ "unexpectedly", wwn);
+ }
+ }
+
+ kmem_free(smpt->smpt_info, sizeof (*smpt->smpt_info));
+ smpt->smpt_info = NULL;
+ } else {
+ smpt = kmem_zalloc(sizeof (smrt_physical_t),
+ KM_NOSLEEP);
+ if (smpt == NULL) {
+ kmem_free(info, sizeof (*info));
+ return (ENOMEM);
+ }
+
+ smpt->smpt_wwn = wwn;
+ smpt->smpt_dtype = opdi->srpo_dtype;
+ list_create(&smpt->smpt_targets, sizeof (smrt_target_t),
+ offsetof(smrt_target_t, smtg_link_lun));
+ smpt->smpt_ctlr = smrt;
+ list_insert_tail(&smrt->smrt_physicals, smpt);
+ }
+
+ VERIFY3P(smpt->smpt_info, ==, NULL);
+
+ /*
+ * Determine if this device is supported and if it's visible to
+ * the system. Some devices may not be visible to the system
+ * because they're used in logical volumes or spares.
+ * Unsupported devices are also not visible.
+ */
+ switch (smpt->smpt_dtype) {
+ case SMRT_DTYPE_SATA:
+ case SMRT_DTYPE_SAS:
+ smpt->smpt_supported = B_TRUE;
+ smpt->smpt_visible =
+ smrt_physical_visible(&extents[i].srple_addr, info);
+ break;
+ case SMRT_DTYPE_SES:
+ smpt->smpt_supported = B_TRUE;
+ smpt->smpt_visible =
+ smrt_physical_visible(&extents[i].srple_addr, info);
+ break;
+ default:
+ smpt->smpt_visible = B_FALSE;
+ smpt->smpt_supported = B_FALSE;
+ }
+
+ smpt->smpt_info = info;
+ smpt->smpt_addr.PhysDev = extents[i].srple_addr;
+ smpt->smpt_bmic = bmic;
+ smpt->smpt_gen = gen;
+ (void) scsi_wwn_to_wwnstr(smpt->smpt_wwn, 1, name);
+ if (!ddi_in_panic() && smpt->smpt_visible &&
+ scsi_hba_tgtmap_set_add(smrt->smrt_phys_tgtmap,
+ SCSI_TGT_SCSI_DEVICE, name, NULL) != DDI_SUCCESS) {
+ return (EIO);
+ }
+ }
+
+ return (0);
+}
+
+int
+smrt_phys_discover(smrt_t *smrt, uint16_t timeout, uint64_t gen)
+{
+ smrt_command_t *smcm;
+ smrt_report_physical_lun_t *smrpl;
+ smrt_report_physical_lun_req_t smrplr;
+ int r;
+
+ /*
+ * Allocate the command to send to the device, including buffer space
+ * for the returned list of Physical Volumes.
+ */
+ if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL,
+ KM_NOSLEEP)) == NULL || smrt_command_attach_internal(smrt, smcm,
+ sizeof (*smrpl), KM_NOSLEEP) != 0) {
+ r = ENOMEM;
+ mutex_enter(&smrt->smrt_mutex);
+ goto out;
+ }
+
+ smrpl = smcm->smcm_internal->smcmi_va;
+
+ smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN);
+
+ smcm->smcm_va_cmd->Request.CDBLen = sizeof (smrplr);
+ smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout);
+ smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD;
+ smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE;
+ smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ;
+
+ /*
+ * The Report Physical LUNs command is essentially a vendor-specific
+ * SCSI command, which we assemble into the CDB region of the command
+ * block.
+ */
+ bzero(&smrplr, sizeof (smrplr));
+ smrplr.smrplr_opcode = CISS_SCMD_REPORT_PHYSICAL_LUNS;
+ smrplr.smrplr_extflag = SMRT_REPORT_PHYSICAL_LUN_EXT_OPDI;
+ smrplr.smrplr_datasize = BE_32(sizeof (smrt_report_physical_lun_t));
+ bcopy(&smrplr, &smcm->smcm_va_cmd->Request.CDB[0],
+ MIN(CISS_CDBLEN, sizeof (smrplr)));
+
+ mutex_enter(&smrt->smrt_mutex);
+
+ /*
+ * Send the command to the device.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
+ if ((r = smrt_submit(smrt, smcm)) != 0) {
+ goto out;
+ }
+
+ /*
+ * Poll for completion.
+ */
+ smcm->smcm_expiry = gethrtime() + timeout * NANOSEC;
+ if ((r = smrt_poll_for(smrt, smcm)) != 0) {
+ VERIFY3S(r, ==, ETIMEDOUT);
+ VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE);
+
+ /*
+ * The command timed out; abandon it now. Remove the POLLED
+ * flag so that the periodic routine will send an abort to
+ * clean it up next time around.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED;
+ smcm->smcm_status &= ~SMRT_CMD_STATUS_POLLED;
+ smcm = NULL;
+ goto out;
+ }
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) {
+ /*
+ *
+ * The controller was reset while we were trying to discover
+ * logical volumes. Report failure.
+ */
+ r = EIO;
+ goto out;
+ }
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) {
+ ErrorInfo_t *ei = smcm->smcm_va_err;
+
+ if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) {
+ dev_err(smrt->smrt_dip, CE_WARN, "physical target "
+ "discovery error: status 0x%x", ei->CommandStatus);
+ r = EIO;
+ goto out;
+ }
+ }
+
+ /*
+ * If the controller doesn't support extended physical reporting, it
+ * likely doesn't even support physical devices that we'd care about
+ * exposing. As such, we treat this as an OK case.
+ */
+ if ((smrpl->smrpl_extflag & SMRT_REPORT_PHYSICAL_LUN_EXT_MASK) !=
+ SMRT_REPORT_PHYSICAL_LUN_EXT_OPDI) {
+ r = 0;
+ goto out;
+ }
+
+ if (!ddi_in_panic() &&
+ scsi_hba_tgtmap_set_begin(smrt->smrt_phys_tgtmap) != DDI_SUCCESS) {
+ dev_err(smrt->smrt_dip, CE_WARN, "failed to begin target map "
+ "observation on %s", SMRT_IPORT_PHYS);
+ r = EIO;
+ goto out;
+ }
+
+ r = smrt_read_phys_ext(smrt, smrpl, timeout, gen);
+
+ if (r == 0 && !ddi_in_panic()) {
+ if (scsi_hba_tgtmap_set_end(smrt->smrt_phys_tgtmap, 0) !=
+ DDI_SUCCESS) {
+ dev_err(smrt->smrt_dip, CE_WARN, "failed to end target "
+ "map observation on %s", SMRT_IPORT_PHYS);
+ r = EIO;
+ }
+ } else if (r != 0 && !ddi_in_panic()) {
+ if (scsi_hba_tgtmap_set_flush(smrt->smrt_phys_tgtmap) !=
+ DDI_SUCCESS) {
+ dev_err(smrt->smrt_dip, CE_WARN, "failed to end target "
+ "map observation on %s", SMRT_IPORT_PHYS);
+ r = EIO;
+ }
+ }
+
+ if (r == 0) {
+ smrt_physical_t *smpt, *next;
+
+ /*
+ * Prune physical devices that do not match the current
+ * generation and are not marked as visible devices. Visible
+ * devices will be dealt with as part of the target map work.
+ */
+ for (smpt = list_head(&smrt->smrt_physicals), next = NULL;
+ smpt != NULL; smpt = next) {
+ next = list_next(&smrt->smrt_physicals, smpt);
+ if (smpt->smpt_visible || smpt->smpt_gen == gen)
+ continue;
+ list_remove(&smrt->smrt_physicals, smpt);
+ smrt_physical_free(smpt);
+ }
+
+ /*
+ * Update the time of the last successful Physical Volume
+ * discovery:
+ */
+ smrt->smrt_last_phys_discovery = gethrtime();
+
+ /*
+ * Now, for each unsupported device that we haven't warned about
+ * encountering, try and give the administrator some hope of
+ * knowing about this.
+ */
+ for (smpt = list_head(&smrt->smrt_physicals), next = NULL;
+ smpt != NULL; smpt = next) {
+ if (smpt->smpt_supported || smpt->smpt_unsup_warn)
+ continue;
+ smpt->smpt_unsup_warn = B_TRUE;
+ dev_err(smrt->smrt_dip, CE_WARN, "encountered "
+ "unsupported device with device type %d",
+ smpt->smpt_dtype);
+ }
+ }
+
+out:
+ mutex_exit(&smrt->smrt_mutex);
+
+ if (smcm != NULL) {
+ smrt_command_free(smcm);
+ }
+ return (r);
+}
+
+void
+smrt_phys_tgtmap_activate(void *arg, char *addr, scsi_tgtmap_tgt_type_t type,
+ void **privpp)
+{
+ smrt_t *smrt = arg;
+ smrt_physical_t *smpt;
+
+ VERIFY3S(type, ==, SCSI_TGT_SCSI_DEVICE);
+ mutex_enter(&smrt->smrt_mutex);
+ smpt = smrt_phys_lookup_by_ua(smrt, addr);
+ VERIFY(smpt != NULL);
+ VERIFY(smpt->smpt_supported);
+ VERIFY(smpt->smpt_visible);
+ *privpp = NULL;
+ mutex_exit(&smrt->smrt_mutex);
+}
+
+boolean_t
+smrt_phys_tgtmap_deactivate(void *arg, char *addr, scsi_tgtmap_tgt_type_t type,
+ void *priv, scsi_tgtmap_deact_rsn_t reason)
+{
+ smrt_t *smrt = arg;
+ smrt_physical_t *smpt;
+
+ VERIFY3S(type, ==, SCSI_TGT_SCSI_DEVICE);
+ VERIFY3P(priv, ==, NULL);
+
+ mutex_enter(&smrt->smrt_mutex);
+ smpt = smrt_phys_lookup_by_ua(smrt, addr);
+
+ /*
+ * If the device disappeared or became invisible, then it may have
+ * already been removed.
+ */
+ if (smpt == NULL || !smpt->smpt_visible) {
+ mutex_exit(&smrt->smrt_mutex);
+ return (B_FALSE);
+ }
+
+ list_remove(&smrt->smrt_physicals, smpt);
+ smrt_physical_free(smpt);
+ mutex_exit(&smrt->smrt_mutex);
+ return (B_FALSE);
+}
+
+void
+smrt_phys_teardown(smrt_t *smrt)
+{
+ smrt_physical_t *smpt;
+
+ VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
+ while ((smpt = list_remove_head(&smrt->smrt_physicals)) != NULL) {
+ smrt_physical_free(smpt);
+ }
+}
diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_sata.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_sata.c
new file mode 100644
index 0000000000..6224b97732
--- /dev/null
+++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_sata.c
@@ -0,0 +1,160 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017 Joyent, Inc.
+ */
+
+/*
+ * Collection of routines specific to SATA devices and attempting to make them
+ * work.
+ */
+
+#include <sys/scsi/adapters/smrt/smrt.h>
+
+/*
+ * This is a buffer size that should easily cover all of the data that we need
+ * to properly determine the buffer allocation.
+ */
+#define SMRT_SATA_INQ83_LEN 256
+
+/*
+ * We need to try and determine if a SATA WWN exists on the device. SAT-2
+ * defines that the response to the inquiry page 0x83.
+ */
+int
+smrt_sata_determine_wwn(smrt_t *smrt, PhysDevAddr_t *addr, uint64_t *wwnp,
+ uint16_t timeout)
+{
+ smrt_command_t *smcm;
+ int r;
+ uint8_t *inq;
+ uint64_t wwn;
+ size_t resid;
+
+ VERIFY3P(wwnp, !=, NULL);
+
+ if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL,
+ KM_NOSLEEP)) == NULL || smrt_command_attach_internal(smrt, smcm,
+ SMRT_SATA_INQ83_LEN, KM_NOSLEEP) != 0) {
+ if (smcm != NULL) {
+ smrt_command_free(smcm);
+ }
+ return (ENOMEM);
+ }
+
+ smcm->smcm_va_cmd->Header.LUN.PhysDev = *addr;
+ smcm->smcm_va_cmd->Request.CDBLen = CDB_GROUP0;
+ smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD;
+ smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE;
+ smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ;
+ smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout);
+
+ smcm->smcm_va_cmd->Request.CDB[0] = SCMD_INQUIRY;
+ smcm->smcm_va_cmd->Request.CDB[1] = 1;
+ smcm->smcm_va_cmd->Request.CDB[2] = 0x83;
+ smcm->smcm_va_cmd->Request.CDB[3] = (SMRT_SATA_INQ83_LEN & 0xff00) >> 8;
+ smcm->smcm_va_cmd->Request.CDB[4] = SMRT_SATA_INQ83_LEN & 0x00ff;
+ smcm->smcm_va_cmd->Request.CDB[5] = 0;
+
+ mutex_enter(&smrt->smrt_mutex);
+
+ /*
+ * Send the command to the device.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
+ if ((r = smrt_submit(smrt, smcm)) != 0) {
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_free(smcm);
+ return (r);
+ }
+
+ if ((r = smrt_poll_for(smrt, smcm)) != 0) {
+ VERIFY3S(r, ==, ETIMEDOUT);
+ VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE);
+
+ /*
+ * The command timed out; abandon it now. Remove the POLLED
+ * flag so that the periodic routine will send an abort to
+ * clean it up next time around.
+ */
+ smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED;
+ smcm->smcm_status &= ~SMRT_CMD_STATUS_POLLED;
+ mutex_exit(&smrt->smrt_mutex);
+ return (r);
+ }
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) {
+ /*
+ * The controller was reset while we were trying to discover
+ * logical volumes. Report failure.
+ */
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_free(smcm);
+ return (EIO);
+ }
+
+ if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) {
+ ErrorInfo_t *ei = smcm->smcm_va_err;
+
+ if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) {
+ dev_err(smrt->smrt_dip, CE_WARN, "physical target "
+ "SATA WWN error: status 0x%x", ei->CommandStatus);
+ mutex_exit(&smrt->smrt_mutex);
+ smrt_command_free(smcm);
+ return (EIO);
+ }
+ resid = ei->ResidualCnt;
+ } else {
+ resid = 0;
+ }
+
+ mutex_exit(&smrt->smrt_mutex);
+
+ /*
+ * We must have at least 12 bytes. The first four bytes are the header,
+ * the next four are for the LUN header, and the last 8 are for the
+ * actual WWN, which according to SAT-2 will always be first.
+ */
+ if (SMRT_SATA_INQ83_LEN - resid < 16) {
+ smrt_command_free(smcm);
+ return (EINVAL);
+ }
+ inq = smcm->smcm_internal->smcmi_va;
+
+ /*
+ * Sanity check we have the right page.
+ */
+ if (inq[1] != 0x83) {
+ smrt_command_free(smcm);
+ return (EINVAL);
+ }
+
+ /*
+ * Check to see if we have a proper Network Address Authority (NAA)
+ * based world wide number for this LUN. It is possible that firmware
+ * interposes on this and constructs a fake world wide number (WWN). If
+ * this is the case, we don't want to actually use it. We need to
+ * verify that the WWN declares the correct naming authority and is of
+ * the proper length.
+ */
+ if ((inq[5] & 0x30) != 0 || (inq[5] & 0x0f) != 3 || inq[7] != 8) {
+ smrt_command_free(smcm);
+ return (ENOTSUP);
+ }
+
+ bcopy(&inq[8], &wwn, sizeof (uint64_t));
+ *wwnp = BE_64(wwn);
+
+ smrt_command_free(smcm);
+
+ return (0);
+}
diff --git a/usr/src/uts/common/io/scsi/targets/sd.c b/usr/src/uts/common/io/scsi/targets/sd.c
index 605ddae078..4784fdeec4 100644
--- a/usr/src/uts/common/io/scsi/targets/sd.c
+++ b/usr/src/uts/common/io/scsi/targets/sd.c
@@ -3476,9 +3476,13 @@ sd_set_mmc_caps(sd_ssc_t *ssc)
* according to the successful response to the page
* 0x2A mode sense request.
*/
- scsi_log(SD_DEVINFO(un), sd_label, CE_WARN,
- "sd_set_mmc_caps: Mode Sense returned "
- "invalid block descriptor length\n");
+ /*
+ * The following warning occurs due to the KVM CD-ROM
+ * mishandling the multi-media commands. Ignore it.
+ * scsi_log(SD_DEVINFO(un), sd_label, CE_WARN,
+ * "sd_set_mmc_caps: Mode Sense returned "
+ * "invalid block descriptor length\n");
+ */
kmem_free(buf, BUFLEN_MODE_CDROM_CAP);
return;
}
@@ -4422,19 +4426,78 @@ static int
sd_sdconf_id_match(struct sd_lun *un, char *id, int idlen)
{
struct scsi_inquiry *sd_inq;
- int rval = SD_SUCCESS;
+ int rval = SD_SUCCESS;
+ char *p;
+ int chk_vidlen = 0, chk_pidlen = 0;
+ int has_tail = 0;
+ static const int VSZ = sizeof (sd_inq->inq_vid);
+ static const int PSZ = sizeof (sd_inq->inq_pid);
ASSERT(un != NULL);
sd_inq = un->un_sd->sd_inq;
ASSERT(id != NULL);
/*
- * We use the inq_vid as a pointer to a buffer containing the
- * vid and pid and use the entire vid/pid length of the table
- * entry for the comparison. This works because the inq_pid
- * data member follows inq_vid in the scsi_inquiry structure.
+ * We would like to use the inq_vid as a pointer to a buffer
+ * containing the vid and pid and use the entire vid/pid length of
+ * the table entry for the comparison. However, this does not work
+ * because, while the inq_pid data member follows inq_vid in the
+ * scsi_inquiry structure, we do not control the contents of this
+ * buffer, and some broken devices violate SPC 4.3.1 and return
+ * fields with null bytes in them.
+ */
+ chk_vidlen = MIN(VSZ, idlen);
+ p = id + chk_vidlen - 1;
+ while (*p == ' ' && chk_vidlen > 0) {
+ --p;
+ --chk_vidlen;
+ }
+
+ /*
+ * If it's all spaces, check the whole thing.
+ */
+ if (chk_vidlen == 0)
+ chk_vidlen = MIN(VSZ, idlen);
+
+ if (idlen > VSZ) {
+ chk_pidlen = idlen - VSZ;
+ p = id + idlen - 1;
+ while (*p == ' ' && chk_pidlen > 0) {
+ --p;
+ --chk_pidlen;
+ }
+ if (chk_pidlen == 0)
+ chk_pidlen = MIN(PSZ, idlen - VSZ);
+ }
+
+ /*
+ * There's one more thing we need to do here. If the user specified
+ * an ID with trailing spaces, we need to make sure the inquiry
+ * vid/pid has only spaces or NULs after the check length; otherwise, it
+ * can't match.
*/
- if (strncasecmp(sd_inq->inq_vid, id, idlen) != 0) {
+ if (idlen > chk_vidlen && chk_vidlen < VSZ) {
+ for (p = sd_inq->inq_vid + chk_vidlen;
+ p < sd_inq->inq_vid + VSZ; ++p) {
+ if (*p != ' ' && *p != '\0') {
+ ++has_tail;
+ break;
+ }
+ }
+ }
+ if (idlen > chk_pidlen + VSZ && chk_pidlen < PSZ) {
+ for (p = sd_inq->inq_pid + chk_pidlen;
+ p < sd_inq->inq_pid + PSZ; ++p) {
+ if (*p != ' ' && *p != '\0') {
+ ++has_tail;
+ break;
+ }
+ }
+ }
+
+ if (has_tail || strncasecmp(sd_inq->inq_vid, id, chk_vidlen) != 0 ||
+ (idlen > VSZ &&
+ strncasecmp(sd_inq->inq_pid, id + VSZ, chk_pidlen) != 0)) {
/*
* The user id string is compared to the inquiry vid/pid
* using a case insensitive comparison and ignoring
@@ -6701,7 +6764,7 @@ sdpower(dev_info_t *devi, int component, int level)
time_t intvlp;
struct pm_trans_data sd_pm_tran_data;
uchar_t save_state = SD_STATE_NORMAL;
- int sval;
+ int sval, tursval = 0;
uchar_t state_before_pm;
int got_semaphore_here;
sd_ssc_t *ssc;
@@ -7018,13 +7081,26 @@ sdpower(dev_info_t *devi, int component, int level)
* a deadlock on un_pm_busy_cv will occur.
*/
if (SD_PM_IS_IO_CAPABLE(un, level)) {
- sval = sd_send_scsi_TEST_UNIT_READY(ssc,
+ tursval = sd_send_scsi_TEST_UNIT_READY(ssc,
SD_DONT_RETRY_TUR | SD_BYPASS_PM);
- if (sval != 0)
+ if (tursval != 0)
sd_ssc_assessment(ssc, SD_FMT_IGNORE);
}
- if (un->un_f_power_condition_supported) {
+ /*
+ * We've encountered certain classes of drives that pass a TUR, but fail
+ * the START STOP UNIT when using power conditions, or worse leave the
+ * drive in an unusable state despite passing SSU. Strictly speaking,
+ * for SPC-4 or greater, no additional actions are required to make the
+ * drive operational when a TUR passes. If we have something that
+ * matches this condition, we continue on and presume the drive is
+ * successfully powered on.
+ */
+ if (un->un_f_power_condition_supported &&
+ SD_SCSI_VERS_IS_GE_SPC_4(un) && SD_PM_IS_IO_CAPABLE(un, level) &&
+ level == SD_SPINDLE_ACTIVE && tursval == 0) {
+ sval = 0;
+ } else if (un->un_f_power_condition_supported) {
char *pm_condition_name[] = {"STOPPED", "STANDBY",
"IDLE", "ACTIVE"};
SD_TRACE(SD_LOG_IO_PM, un,
@@ -7044,6 +7120,7 @@ sdpower(dev_info_t *devi, int component, int level)
sd_ssc_assessment(ssc, SD_FMT_STATUS_CHECK);
else
sd_ssc_assessment(ssc, SD_FMT_IGNORE);
+
}
/* Command failed, check for media present. */
@@ -31308,7 +31385,7 @@ sd_set_unit_attributes(struct sd_lun *un, dev_info_t *devi)
if (SD_PM_CAPABLE_IS_UNDEFINED(pm_cap)) {
un->un_f_log_sense_supported = TRUE;
if (!un->un_f_power_condition_disabled &&
- SD_INQUIRY(un)->inq_ansi == 6) {
+ SD_SCSI_VERS_IS_GE_SPC_4(un)) {
un->un_f_power_condition_supported = TRUE;
}
} else {
@@ -31326,7 +31403,7 @@ sd_set_unit_attributes(struct sd_lun *un, dev_info_t *devi)
/* SD_PM_CAPABLE_IS_TRUE case */
un->un_f_pm_supported = TRUE;
if (!un->un_f_power_condition_disabled &&
- SD_PM_CAPABLE_IS_SPC_4(pm_cap)) {
+ (SD_PM_CAPABLE_IS_GE_SPC_4(pm_cap))) {
un->un_f_power_condition_supported =
TRUE;
}
diff --git a/usr/src/uts/common/io/signalfd.c b/usr/src/uts/common/io/signalfd.c
index 46d616dd79..4dce53e22c 100644
--- a/usr/src/uts/common/io/signalfd.c
+++ b/usr/src/uts/common/io/signalfd.c
@@ -107,6 +107,7 @@
#include <sys/schedctl.h>
#include <sys/id_space.h>
#include <sys/sdt.h>
+#include <sys/brand.h>
#include <sys/disp.h>
#include <sys/taskq_impl.h>
@@ -459,6 +460,9 @@ consume_signal(k_sigset_t set, uio_t *uio, boolean_t block)
lwp->lwp_extsig = 0;
mutex_exit(&p->p_lock);
+ if (PROC_IS_BRANDED(p) && BROP(p)->b_sigfd_translate)
+ BROP(p)->b_sigfd_translate(infop);
+
/* Convert k_siginfo into external, datamodel independent, struct. */
bzero(ssp, sizeof (*ssp));
ssp->ssi_signo = infop->si_signo;
diff --git a/usr/src/uts/common/io/usb/clients/hid/hid.c b/usr/src/uts/common/io/usb/clients/hid/hid.c
index 084fa7fedc..eccd48bf08 100644
--- a/usr/src/uts/common/io/usb/clients/hid/hid.c
+++ b/usr/src/uts/common/io/usb/clients/hid/hid.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2017 Joyent, Inc.
*/
@@ -139,6 +139,12 @@ static int hid_info(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int hid_attach(dev_info_t *, ddi_attach_cmd_t);
static int hid_detach(dev_info_t *, ddi_detach_cmd_t);
static int hid_power(dev_info_t *, int, int);
+/* These are to enable ugen support: */
+static int hid_chropen(dev_t *, int, int, cred_t *);
+static int hid_chrclose(dev_t, int, int, cred_t *);
+static int hid_read(dev_t, struct uio *, cred_t *);
+static int hid_write(dev_t, struct uio *, cred_t *);
+static int hid_poll(dev_t, short, int, short *, struct pollhead **);
/*
* Warlock is not aware of the automatic locking mechanisms for
@@ -198,18 +204,18 @@ struct streamtab hid_streamtab = {
};
struct cb_ops hid_cb_ops = {
- nulldev, /* open */
- nulldev, /* close */
+ hid_chropen, /* open */
+ hid_chrclose, /* close */
nulldev, /* strategy */
nulldev, /* print */
nulldev, /* dump */
- nulldev, /* read */
- nulldev, /* write */
+ hid_read, /* read */
+ hid_write, /* write */
nulldev, /* ioctl */
nulldev, /* devmap */
nulldev, /* mmap */
nulldev, /* segmap */
- nochpoll, /* poll */
+ hid_poll, /* poll */
ddi_prop_op, /* cb_prop_op */
&hid_streamtab, /* streamtab */
D_MP | D_MTPERQ
@@ -349,6 +355,7 @@ hid_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
usb_alt_if_data_t *altif_data;
char minor_name[HID_MINOR_NAME_LEN];
usb_ep_data_t *ep_data;
+ usb_ugen_info_t usb_ugen_info;
switch (cmd) {
case DDI_ATTACH:
@@ -491,6 +498,28 @@ hid_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
usb_free_dev_data(dip, dev_data);
hidp->hid_dev_data = NULL;
+ if (usb_owns_device(dip)) {
+ /* Get a ugen handle. */
+ bzero(&usb_ugen_info, sizeof (usb_ugen_info));
+
+ usb_ugen_info.usb_ugen_flags = 0;
+ usb_ugen_info.usb_ugen_minor_node_ugen_bits_mask =
+ (dev_t)HID_MINOR_UGEN_BITS_MASK;
+ usb_ugen_info.usb_ugen_minor_node_instance_mask =
+ (dev_t)HID_MINOR_INSTANCE_MASK;
+ hidp->hid_ugen_hdl = usb_ugen_get_hdl(dip, &usb_ugen_info);
+
+ if (usb_ugen_attach(hidp->hid_ugen_hdl, cmd) !=
+ USB_SUCCESS) {
+ USB_DPRINTF_L2(PRINT_MASK_ATTA,
+ hidp->hid_log_handle,
+ "usb_ugen_attach failed");
+
+ usb_ugen_release_hdl(hidp->hid_ugen_hdl);
+ hidp->hid_ugen_hdl = NULL;
+ }
+ }
+
/*
* Don't get the report descriptor if parsing hid descriptor earlier
* failed since device probably won't return valid report descriptor
@@ -769,6 +798,149 @@ hid_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
return (rval);
}
+static int
+hid_chropen(dev_t *devp, int flag, int sflag, cred_t *credp)
+{
+ int rval;
+ minor_t minor = getminor(*devp);
+ int instance;
+ hid_state_t *hidp;
+
+ instance = HID_MINOR_TO_INSTANCE(minor);
+
+ hidp = ddi_get_soft_state(hid_statep, instance);
+ if (hidp == NULL) {
+ return (ENXIO);
+ }
+
+ if (!HID_IS_UGEN_OPEN(minor)) {
+ return (ENXIO);
+ }
+
+ hid_pm_busy_component(hidp);
+ (void) pm_raise_power(hidp->hid_dip, 0, USB_DEV_OS_FULL_PWR);
+
+ mutex_enter(&hidp->hid_mutex);
+
+ rval = usb_ugen_open(hidp->hid_ugen_hdl, devp, flag,
+ sflag, credp);
+
+ mutex_exit(&hidp->hid_mutex);
+
+ if (rval != 0) {
+ hid_pm_idle_component(hidp);
+ }
+
+ return (rval);
+}
+
+static int
+hid_chrclose(dev_t dev, int flag, int otyp, cred_t *credp)
+{
+ int rval;
+ minor_t minor = getminor(dev);
+ int instance;
+ hid_state_t *hidp;
+
+ instance = HID_MINOR_TO_INSTANCE(minor);
+
+ hidp = ddi_get_soft_state(hid_statep, instance);
+ if (hidp == NULL) {
+ return (ENXIO);
+ }
+
+ if (!HID_IS_UGEN_OPEN(minor)) {
+ return (ENXIO);
+ }
+
+ mutex_enter(&hidp->hid_mutex);
+
+ rval = usb_ugen_close(hidp->hid_ugen_hdl, dev, flag,
+ otyp, credp);
+
+ mutex_exit(&hidp->hid_mutex);
+
+ if (rval == 0) {
+ hid_pm_idle_component(hidp);
+ }
+
+ return (rval);
+}
+
+static int
+hid_read(dev_t dev, struct uio *uiop, cred_t *credp)
+{
+ int rval;
+ minor_t minor = getminor(dev);
+ int instance;
+ hid_state_t *hidp;
+
+ instance = HID_MINOR_TO_INSTANCE(minor);
+
+ hidp = ddi_get_soft_state(hid_statep, instance);
+ if (hidp == NULL) {
+ return (ENXIO);
+ }
+
+ if (!HID_IS_UGEN_OPEN(minor)) {
+ return (ENXIO);
+ }
+
+ rval = usb_ugen_read(hidp->hid_ugen_hdl, dev, uiop, credp);
+
+ return (rval);
+}
+
+static int
+hid_write(dev_t dev, struct uio *uiop, cred_t *credp)
+{
+ int rval;
+ minor_t minor = getminor(dev);
+ int instance;
+ hid_state_t *hidp;
+
+ instance = HID_MINOR_TO_INSTANCE(minor);
+
+ hidp = ddi_get_soft_state(hid_statep, instance);
+ if (hidp == NULL) {
+ return (ENXIO);
+ }
+
+ if (!HID_IS_UGEN_OPEN(minor)) {
+ return (ENXIO);
+ }
+
+ rval = usb_ugen_write(hidp->hid_ugen_hdl, dev, uiop, credp);
+
+ return (rval);
+}
+
+static int
+hid_poll(dev_t dev, short events, int anyyet, short *reventsp,
+ struct pollhead **phpp)
+{
+ int rval;
+ minor_t minor = getminor(dev);
+ int instance;
+ hid_state_t *hidp;
+
+ instance = HID_MINOR_TO_INSTANCE(minor);
+
+ hidp = ddi_get_soft_state(hid_statep, instance);
+ if (hidp == NULL) {
+ return (ENXIO);
+ }
+
+ if (!HID_IS_UGEN_OPEN(minor)) {
+ return (ENXIO);
+ }
+
+ rval = usb_ugen_poll(hidp->hid_ugen_hdl, dev, events, anyyet,
+ reventsp, phpp);
+
+ return (rval);
+}
+
/*
* hid_open :
* Open entry point: Opens the interrupt pipe. Sets up queues.
@@ -787,13 +959,21 @@ hid_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
hidp = ddi_get_soft_state(hid_statep, instance);
if (hidp == NULL) {
-
return (ENXIO);
}
USB_DPRINTF_L4(PRINT_MASK_OPEN, hidp->hid_log_handle,
"hid_open: Begin");
+ /*
+ * If this is a ugen device, return ENOSTR (no streams). This will
+ * cause spec_open to try hid_chropen from our regular ops_cb instead
+ * (and thus treat us as a plain character device).
+ */
+ if (HID_IS_UGEN_OPEN(minor)) {
+ return (ENOSTR);
+ }
+
if (sflag) {
/* clone open NOT supported here */
return (ENXIO);
@@ -803,6 +983,8 @@ hid_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
return (EIO);
}
+ mutex_enter(&hidp->hid_mutex);
+
/*
* This is a workaround:
* Currently, if we open an already disconnected device, and send
@@ -812,7 +994,6 @@ hid_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
* The consconfig_dacf module need this interface to detect if the
* device is already disconnnected.
*/
- mutex_enter(&hidp->hid_mutex);
if (HID_IS_INTERNAL_OPEN(minor) &&
(hidp->hid_dev_state == USB_DEV_DISCONNECTED)) {
mutex_exit(&hidp->hid_mutex);
@@ -1688,6 +1869,11 @@ hid_cpr_suspend(hid_state_t *hidp)
}
mutex_exit(&hidp->hid_mutex);
+ if ((retval == USB_SUCCESS) && hidp->hid_ugen_hdl != NULL) {
+ retval = usb_ugen_detach(hidp->hid_ugen_hdl,
+ DDI_SUSPEND);
+ }
+
return (retval);
}
@@ -1699,6 +1885,10 @@ hid_cpr_resume(hid_state_t *hidp)
"hid_cpr_resume: dip=0x%p", (void *)hidp->hid_dip);
hid_restore_device_state(hidp->hid_dip, hidp);
+
+ if (hidp->hid_ugen_hdl != NULL) {
+ (void) usb_ugen_attach(hidp->hid_ugen_hdl, DDI_RESUME);
+ }
}
@@ -2136,6 +2326,12 @@ hid_detach_cleanup(dev_info_t *dip, hid_state_t *hidp)
hidp->hid_pm = NULL;
}
+ if (hidp->hid_ugen_hdl != NULL) {
+ rval = usb_ugen_detach(hidp->hid_ugen_hdl, DDI_DETACH);
+ VERIFY0(rval);
+ usb_ugen_release_hdl(hidp->hid_ugen_hdl);
+ }
+
mutex_exit(&hidp->hid_mutex);
if (hidp->hid_report_descr != NULL) {
diff --git a/usr/src/uts/common/io/vnd/frameio.c b/usr/src/uts/common/io/vnd/frameio.c
new file mode 100644
index 0000000000..d36608efcd
--- /dev/null
+++ b/usr/src/uts/common/io/vnd/frameio.c
@@ -0,0 +1,465 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+/*
+ * Frame I/O utility functions
+ */
+
+#include <sys/frameio.h>
+
+#include <sys/file.h>
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#include <sys/sysmacros.h>
+#include <sys/inttypes.h>
+
+static kmem_cache_t *frameio_cache;
+
+int
+frameio_init(void)
+{
+ frameio_cache = kmem_cache_create("frameio_cache",
+ sizeof (frameio_t) + sizeof (framevec_t) * FRAMEIO_NVECS_MAX,
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+ if (frameio_cache == NULL)
+ return (1);
+
+ return (0);
+}
+
+void
+frameio_fini(void)
+{
+ if (frameio_cache != NULL)
+ kmem_cache_destroy(frameio_cache);
+}
+
+frameio_t *
+frameio_alloc(int kmflags)
+{
+ return (kmem_cache_alloc(frameio_cache, kmflags));
+}
+
+void
+frameio_free(frameio_t *fio)
+{
+ kmem_cache_free(frameio_cache, fio);
+}
+
+/*
+ * Ensure that we don't see any garbage in the framevecs that we're nominally
+ * supposed to work with. Specifically we want to make sure that the buflen and
+ * the address are not zero.
+ */
+static int
+frameio_hdr_check_vecs(frameio_t *fio)
+{
+ int i;
+ for (i = 0; i < fio->fio_nvecs; i++)
+ if (fio->fio_vecs[i].fv_buf == NULL ||
+ fio->fio_vecs[i].fv_buflen == 0)
+ return (EINVAL);
+
+ return (0);
+}
+
+/*
+ * We have to copy in framevec32_t's. To work around the data model issues and
+ * trying not to copy memory we first copy in the framevec32_t data into the
+ * standard fio_vec space. Next we work backwards copying a given framevec32_t
+ * to a temporaory framevec_t and then overwrite the frameio_t's data. Note that
+ * it is important that we do this in reverse so as to ensure that we don't
+ * clobber data as the framevec_t is larger than the framevec32_t.
+ */
+static int
+frameio_hdr_copyin_ilp32(frameio_t *fio, const void *addr)
+{
+ framevec32_t *vec32p;
+ framevec_t fv;
+ int i;
+
+ vec32p = (framevec32_t *)&fio->fio_vecs[0];
+
+ if (ddi_copyin(addr, vec32p, sizeof (framevec32_t) * fio->fio_nvecs,
+ 0) != 0)
+ return (EFAULT);
+
+ for (i = fio->fio_nvecs - 1; i >= 0; i--) {
+ fv.fv_buf = (void *)(uintptr_t)vec32p[i].fv_buf;
+ fv.fv_buflen = vec32p[i].fv_buflen;
+ fv.fv_actlen = vec32p[i].fv_actlen;
+ fio->fio_vecs[i].fv_buf = fv.fv_buf;
+ fio->fio_vecs[i].fv_buflen = fv.fv_buflen;
+ fio->fio_vecs[i].fv_actlen = fv.fv_actlen;
+ }
+
+ return (frameio_hdr_check_vecs(fio));
+}
+
+/*
+ * Copy in a frame io header into fio with space for up to nvecs. If the frameio
+ * contains more vectors than specified it will be ignored. mode should contain
+ * information about the datamodel.
+ */
+int
+frameio_hdr_copyin(frameio_t *fio, int max_vecs, const void *addr, uint_t mode)
+{
+ int model = ddi_model_convert_from(mode & FMODELS);
+ int cpf = mode & FKIOCTL ? FKIOCTL : 0;
+ size_t fsize = model == DDI_MODEL_ILP32 ?
+ sizeof (frameio32_t) : sizeof (frameio_t);
+
+ /*
+ * The start of the header is the same in all data models for the
+ * current verison.
+ */
+ if (ddi_copyin(addr, fio, fsize, cpf) != 0)
+ return (EFAULT);
+
+ if (fio->fio_version != FRAMEIO_VERSION_ONE)
+ return (EINVAL);
+
+ if (fio->fio_nvecs > FRAMEIO_NVECS_MAX || fio->fio_nvecs == 0)
+ return (EINVAL);
+
+ if (fio->fio_nvpf == 0)
+ return (EINVAL);
+
+ if (fio->fio_nvecs % fio->fio_nvpf != 0)
+ return (EINVAL);
+
+ if (fio->fio_nvecs > max_vecs)
+ return (EOVERFLOW);
+
+ addr = (void *)((uintptr_t)addr + fsize);
+ if (model == DDI_MODEL_ILP32) {
+ if (cpf != 0)
+ return (EINVAL);
+ return (frameio_hdr_copyin_ilp32(fio, addr));
+ }
+
+ if (ddi_copyin(addr, &fio->fio_vecs[0],
+ sizeof (framevec_t) * fio->fio_nvecs, cpf) != 0)
+ return (EFAULT);
+
+ return (frameio_hdr_check_vecs(fio));
+}
+
+static mblk_t *
+frameio_allocb(size_t sz)
+{
+ mblk_t *mp;
+
+ mp = allocb(sz, 0);
+ if (mp == NULL)
+ return (NULL);
+
+ mp->b_datap->db_type = M_DATA;
+ return (mp);
+}
+
+static int
+framevec_mblk_read(framevec_t *fv, mblk_t **mpp, int cpf)
+{
+ mblk_t *mp;
+ cpf = cpf != 0 ? FKIOCTL : 0;
+
+ mp = frameio_allocb(fv->fv_buflen);
+
+ if (mp == NULL) {
+ freemsg(mp);
+ return (EAGAIN);
+ }
+
+ if (ddi_copyin(fv->fv_buf, mp->b_wptr, fv->fv_buflen,
+ cpf) != 0) {
+ freemsg(mp);
+ return (EFAULT);
+ }
+
+ mp->b_wptr += fv->fv_buflen;
+ *mpp = mp;
+ return (0);
+}
+
+/*
+ * Read a set of frame vectors that make up a single message boundary and return
+ * that as a single message in *mpp that consists of multiple data parts.
+ */
+static int
+frameio_mblk_read(frameio_t *fio, framevec_t *fv, mblk_t **mpp, int cpf)
+{
+ int nparts = fio->fio_nvpf;
+ int part, error;
+ mblk_t *mp;
+
+ *mpp = NULL;
+ cpf = cpf != 0 ? FKIOCTL : 0;
+
+ /*
+ * Construct the initial frame
+ */
+ for (part = 0; part < nparts; part++) {
+ error = framevec_mblk_read(fv, &mp, cpf);
+ if (error != 0) {
+ freemsg(*mpp);
+ return (error);
+ }
+
+ if (*mpp == NULL)
+ *mpp = mp;
+ else
+ linkb(*mpp, mp);
+ fv++;
+ }
+
+ return (0);
+}
+
+/*
+ * Read data from a series of frameio vectors into a message block chain. A
+ * given frameio request has a number of discrete messages divided into
+ * individual vectors based on fio->fio_nvcspframe. Each discrete message will
+ * be constructed into a message block chain pointed to by b_next.
+ *
+ * If we get an EAGAIN while trying to construct a given message block what we
+ * return depends on what else we've done so far. If we have succesfully
+ * completed at least one message then we free everything else we've done so
+ * far and return that. If no messages have been completed we return EAGAIN. If
+ * instead we encounter a different error, say EFAULT, then all of the fv_actlen
+ * entries values are undefined.
+ */
+int
+frameio_mblk_chain_read(frameio_t *fio, mblk_t **mpp, int *nvecs, int cpf)
+{
+ int error = ENOTSUP;
+ int nframes = fio->fio_nvecs / fio->fio_nvpf;
+ int frame;
+ framevec_t *fv;
+ mblk_t *mp, *bmp = NULL;
+
+ /*
+ * Protect against bogus kernel subsystems.
+ */
+ VERIFY(fio->fio_nvecs > 0);
+ VERIFY(fio->fio_nvecs % fio->fio_nvpf == 0);
+
+ *mpp = NULL;
+ cpf = cpf != 0 ? FKIOCTL : 0;
+
+ fv = &fio->fio_vecs[0];
+ for (frame = 0; frame < nframes; frame++) {
+ error = frameio_mblk_read(fio, fv, &mp, cpf);
+ if (error != 0)
+ goto failed;
+
+ if (bmp != NULL)
+ bmp->b_next = mp;
+ else
+ *mpp = mp;
+ bmp = mp;
+ }
+
+ *nvecs = nframes;
+ return (0);
+failed:
+ /*
+ * On EAGAIN we've already taken care of making sure that we have no
+ * leftover messages, eg. they were never linked in.
+ */
+ if (error == EAGAIN) {
+ if (frame != 0)
+ error = 0;
+ if (nvecs != NULL)
+ *nvecs = frame;
+ ASSERT(*mpp != NULL);
+ } else {
+ for (mp = *mpp; mp != NULL; mp = bmp) {
+ bmp = mp->b_next;
+ freemsg(mp);
+ }
+ if (nvecs != NULL)
+ *nvecs = 0;
+ *mpp = NULL;
+ }
+ return (error);
+}
+
+size_t
+frameio_frame_length(frameio_t *fio, framevec_t *fv)
+{
+ int i;
+ size_t len = 0;
+
+ for (i = 0; i < fio->fio_nvpf; i++, fv++)
+ len += fv->fv_buflen;
+
+ return (len);
+}
+
+/*
+ * Write a portion of an mblk to the current.
+ */
+static int
+framevec_write_mblk_part(framevec_t *fv, mblk_t *mp, size_t len, size_t moff,
+ size_t foff, int cpf)
+{
+ ASSERT(len <= MBLKL(mp) - moff);
+ ASSERT(len <= fv->fv_buflen - fv->fv_actlen);
+ cpf = cpf != 0 ? FKIOCTL : 0;
+
+ if (ddi_copyout(mp->b_rptr + moff, (caddr_t)fv->fv_buf + foff, len,
+ cpf) != 0)
+ return (EFAULT);
+ fv->fv_actlen += len;
+
+ return (0);
+}
+
+/*
+ * Because copying this out to the user might fail we don't want to update the
+ * b_rptr in case we need to copy it out again.
+ */
+static int
+framevec_map_blk(frameio_t *fio, framevec_t *fv, mblk_t *mp, int cpf)
+{
+ int err;
+ size_t msize, blksize, len, moff, foff;
+
+ msize = msgsize(mp);
+ if (msize > frameio_frame_length(fio, fv))
+ return (EOVERFLOW);
+
+ moff = 0;
+ foff = 0;
+ blksize = MBLKL(mp);
+ fv->fv_actlen = 0;
+ while (msize != 0) {
+ len = MIN(blksize, fv->fv_buflen - fv->fv_actlen);
+ err = framevec_write_mblk_part(fv, mp, len, moff, foff, cpf);
+ if (err != 0)
+ return (err);
+
+ msize -= len;
+ blksize -= len;
+ moff += len;
+ foff += len;
+
+ if (blksize == 0 && msize != 0) {
+ mp = mp->b_cont;
+ ASSERT(mp != NULL);
+ moff = 0;
+ blksize = MBLKL(mp);
+ }
+
+ if (fv->fv_buflen == fv->fv_actlen && msize != 0) {
+ fv++;
+ fv->fv_actlen = 0;
+ foff = 0;
+ }
+ }
+
+ return (0);
+}
+
+int
+frameio_mblk_chain_write(frameio_t *fio, frameio_write_mblk_map_t map,
+ mblk_t *mp, int *nwrite, int cpf)
+{
+ int mcount = 0;
+ int ret = 0;
+
+ if (map != MAP_BLK_FRAME)
+ return (EINVAL);
+
+ while (mp != NULL && mcount < fio->fio_nvecs) {
+ ret = framevec_map_blk(fio, &fio->fio_vecs[mcount], mp, cpf);
+ if (ret != 0)
+ break;
+ mcount += fio->fio_nvpf;
+ mp = mp->b_next;
+ }
+
+ if (ret != 0 && mcount == 0) {
+ if (nwrite != NULL)
+ *nwrite = 0;
+ return (ret);
+ }
+
+ if (nwrite != NULL)
+ *nwrite = mcount / fio->fio_nvpf;
+
+ return (0);
+}
+
+/*
+ * Copy out nframes worth of frameio header data back to userland.
+ */
+int
+frameio_hdr_copyout(frameio_t *fio, int nframes, void *addr, uint_t mode)
+{
+ int i;
+ int model = ddi_model_convert_from(mode & FMODELS);
+ framevec32_t *vec32p;
+ framevec32_t f;
+
+ if (fio->fio_nvecs / fio->fio_nvpf < nframes)
+ return (EINVAL);
+
+ fio->fio_nvecs = nframes * fio->fio_nvpf;
+
+ if (model == DDI_MODEL_NONE) {
+ if (ddi_copyout(fio, addr,
+ sizeof (frameio_t) + fio->fio_nvecs * sizeof (framevec_t),
+ mode & FKIOCTL) != 0)
+ return (EFAULT);
+ return (0);
+ }
+
+ ASSERT(model == DDI_MODEL_ILP32);
+
+ vec32p = (framevec32_t *)&fio->fio_vecs[0];
+ for (i = 0; i < fio->fio_nvecs; i++) {
+ f.fv_buf = (caddr32_t)(uintptr_t)fio->fio_vecs[i].fv_buf;
+ if (fio->fio_vecs[i].fv_buflen > UINT_MAX ||
+ fio->fio_vecs[i].fv_actlen > UINT_MAX)
+ return (EOVERFLOW);
+ f.fv_buflen = fio->fio_vecs[i].fv_buflen;
+ f.fv_actlen = fio->fio_vecs[i].fv_actlen;
+ vec32p[i].fv_buf = f.fv_buf;
+ vec32p[i].fv_buflen = f.fv_buflen;
+ vec32p[i].fv_actlen = f.fv_actlen;
+ }
+
+ if (ddi_copyout(fio, addr,
+ sizeof (frameio32_t) + fio->fio_nvecs * sizeof (framevec32_t),
+ mode & FKIOCTL) != 0)
+ return (EFAULT);
+ return (0);
+}
+
+void
+frameio_mark_consumed(frameio_t *fio, int nframes)
+{
+ int i;
+
+ ASSERT(fio->fio_nvecs / fio->fio_nvpf >= nframes);
+ for (i = 0; i < nframes * fio->fio_nvpf; i++)
+ fio->fio_vecs[i].fv_actlen = fio->fio_vecs[i].fv_buflen;
+}
diff --git a/usr/src/uts/common/io/vnd/vnd.c b/usr/src/uts/common/io/vnd/vnd.c
new file mode 100644
index 0000000000..9ed57f30c4
--- /dev/null
+++ b/usr/src/uts/common/io/vnd/vnd.c
@@ -0,0 +1,5857 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+/*
+ * vnd - virtual (machine) networking datapath
+ *
+ * vnd's purpose is to provide a highly performant data path for Layer 2 network
+ * traffic and exist side by side an active IP netstack, each servicing
+ * different datalinks. vnd provides many of the same capabilities as the
+ * current TCP/IP stack does and some specific to layer two. Specifically:
+ *
+ * o Use of the DLD fastpath
+ * o Packet capture hooks
+ * o Ability to use hardware capabilities
+ * o Useful interfaces for handling multiple frames
+ *
+ * The following image shows where vnd fits into today's networking stack:
+ *
+ * +---------+----------+----------+
+ * | libdlpi | libvnd | libsocket|
+ * +---------+----------+----------+
+ * | · · VFS |
+ * | VFS · VFS +----------+
+ * | · | sockfs |
+ * +---------+----------+----------+
+ * | | VND | IP |
+ * | +----------+----------+
+ * | DLD/DLS |
+ * +-------------------------------+
+ * | MAC |
+ * +-------------------------------+
+ * | GLDv3 |
+ * +-------------------------------+
+ *
+ * -----------------------------------------
+ * A Tale of Two Devices - DDI Device Basics
+ * -----------------------------------------
+ *
+ * vnd presents itself to userland as a character device; however, it also is a
+ * STREAMS device so that it can interface with dld and the rest of the
+ * networking stack. Users never interface with the STREAMs devices directly and
+ * they are purely an implementation detail of vnd. Opening the STREAMS device
+ * require kcred and as such userland cannot interact with it or push it onto
+ * the stream head.
+ *
+ * The main vnd character device, /dev/vnd/ctl, is a self-cloning device. Every
+ * clone gets its own minor number; however, minor nodes are not created in the
+ * devices tree for these instances. In this state a user may do two different
+ * things. They may issue ioctls that affect global state or they may issue
+ * ioctls that try to attach it to a given datalink. Once a minor device has
+ * been attached to a datalink, all operations on it are scoped to that context,
+ * therefore subsequent global operations are not permitted.
+ *
+ * A given device can be linked into the /devices and /dev name space via a link
+ * ioctl. That ioctl causes a minor node to be created in /devices and then it
+ * will also appear under /dev/vnd/ due to vnd's sdev plugin. This is similar
+ * to, but simpler than, IP's persistence mechanism.
+ *
+ * ---------------------
+ * Binding to a datalink
+ * ---------------------
+ *
+ * Datalinks are backed by the dld (datalink device) and dls (datalink services)
+ * drivers. These drivers provide a STREAMS device for datalinks on the system
+ * which are exposed through /dev/net. Userland generally manipulates datalinks
+ * through libdlpi. When an IP interface is being plumbed up what actually
+ * happens is that someone does a dlpi_open(3DLPI) of the underlying datalink
+ * and then pushes on the ip STREAMS module with an I_PUSH ioctl. Modules may
+ * then can negotiate with dld and dls to obtain access to various capabilities
+ * and fast paths via a series of STREAMS messages.
+ *
+ * In vnd, we do the same thing, but we leave our STREAMS module as an
+ * implementation detail of the system. We don't want users to be able to
+ * arbitrarily push vnd STREAMS module onto any stream, so we explicitly require
+ * kcred to manipulate it. Thus, when a user issues a request to attach a
+ * datalink to a minor instance of the character device, that vnd minor instance
+ * itself does a layered open (ldi_open_by_name(9F)) of the specified datalink.
+ * vnd does that open using the passed in credentials from the ioctl, not kcred.
+ * This ensures that users who doesn't have permissions to open the device
+ * cannot. Once that's been opened, we push on the vnd streams module.
+ *
+ * Once the vnd STREAMS instance has been created for this device, eg. the
+ * I_PUSH ioctl returns, we explicitly send a STREAMS ioctl
+ * (VND_STRIOC_ASSOCIATE) to associate the vnd STREAMS and character devices.
+ * This association begins the STREAM device's initialization. We start up an
+ * asynchronous state machine that takes care of all the different aspects of
+ * plumbing up the device with dld and dls and enabling the MAC fast path. We
+ * need to guarantee to consumers of the character device that by the time their
+ * ioctl returns, the data path has been fully initialized.
+ *
+ * The state progression is fairly linear. There are two general steady states.
+ * The first is VND_S_ONLINE, which means that everything is jacked up and good
+ * to go. The alternative is VND_S_ZOMBIE, which means that the streams device
+ * encountered an error or we have finished tearing it down and the character
+ * device can clean it up. The following is our state progression and the
+ * meaning of each state:
+ *
+ * |
+ * |
+ * V
+ * +---------------+
+ * | VNS_S_INITIAL | This is our initial state. Every
+ * +---------------+ vnd STREAMS device starts here.
+ * | While in this state, only dlpi
+ * | M_PROTO and M_IOCTL messages can be
+ * | sent or received. All STREAMS based
+ * | data messages are dropped.
+ * | We transition out of this state by
+ * | sending a DL_INFO_REQ to obtain
+ * | information about the underlying
+ * | link.
+ * v
+ * +-----------------+
+ * +--<-| VNS_S_INFO_SENT | In this state, we verify and
+ * | +-----------------+ record information about the
+ * | | underlying device. If the device is
+ * | | not suitable, eg. not of type
+ * v | DL_ETHER, then we immediately
+ * | | become a ZOMBIE. To leave this
+ * | | state we request exclusive active
+ * | | access to the device via
+ * v | DL_EXCLUSIVE_REQ.
+ * | v
+ * | +----------------------+
+ * +--<-| VNS_S_EXCLUSIVE_SENT | In this state, we verify whether
+ * | +----------------------+ or not we were able to obtain
+ * | | | exclusive access to the device. If
+ * | | | we were not able to, then we leave,
+ * v | | as that means that something like
+ * | | | IP is already plumbed up on top of
+ * | | | the datalink. We leave this state
+ * | | | by progressing through to the
+ * | | | appropriate DLPI primitive, either
+ * v | | DLPI_ATTACH_REQ or DLPI_BIND_REQ
+ * | | | depending on the style of the
+ * | | | datalink.
+ * | | v
+ * | | +-------------------+
+ * +------ |--<-| VNS_S_ATTACH_SENT | In this state, we verify we were
+ * | | +-------------------+ able to perform a standard DLPI
+ * | | | attach and if so, go ahead and
+ * v | | send a DLPI_BIND_REQ.
+ * | v v
+ * | +-------------------+
+ * +--<-| VNS_S_BIND_SENT | In this state we see the result of
+ * | +-------------------+ our attempt to bind to PPA 0 of the
+ * v | underlying device. Because we're
+ * | | trying to be a layer two datapath,
+ * | | the specific attachment point isn't
+ * | | too important as we're going to
+ * v | have to enable promiscuous mode. We
+ * | | transition out of this by sending
+ * | | our first of three promiscuous mode
+ * | | requests.
+ * v v
+ * | +------------------------+
+ * +--<-| VNS_S_SAP_PROMISC_SENT | In this state we verify that we
+ * | +------------------------+ were able to enable promiscuous
+ * | | mode at the physical level. We
+ * | | transition out of this by enabling
+ * | | multicast and broadcast promiscuous
+ * v | mode.
+ * | v
+ * | +--------------------------+
+ * +--<-| VNS_S_MULTI_PROMISC_SENT | In this state we verify that we
+ * | +--------------------------+ have enabled DL_PROMISC_MULTI and
+ * v | move onto the second promiscuous
+ * | | mode request.
+ * | v
+ * | +----------------------------+
+ * +--<-| VNS_S_RX_ONLY_PROMISC_SENT | In this state we verify that we
+ * | +----------------------------+ enabled RX_ONLY promiscuous mode.
+ * | | We specifically do this as we don't
+ * v | want to receive our own traffic
+ * | | that we'll send out. We leave this
+ * | | state by enabling the final flag
+ * | | DL_PROMISC_FIXUPS.
+ * | v
+ * | +--------------------------+
+ * +--<-| VNS_S_FIXUP_PROMISC_SENT | In this state we verify that we
+ * | +--------------------------+ enabled FIXUP promiscuous mode.
+ * | | We specifically do this as we need
+ * v | to ensure that traffic which is
+ * | | received by being looped back to us
+ * | | correctly has checksums fixed. We
+ * | | leave this state by requesting the
+ * | | dld/dls capabilities that we can
+ * v | process.
+ * | v
+ * | +--------------------+
+ * +--<-| VNS_S_CAPAB_Q_SENT | We loop over the set of
+ * | +--------------------+ capabilities that dld advertised
+ * | | and enable the ones that currently
+ * v | support for use. See the section
+ * | | later on regarding capabilities
+ * | | for more information. We leave this
+ * | | state by sending an enable request.
+ * v v
+ * | +--------------------+
+ * +--<-| VNS_S_CAPAB_E_SENT | Here we finish all capability
+ * | +--------------------+ initialization. Once finished, we
+ * | | transition to the next state. If
+ * v | the dld fast path is not available,
+ * | | we become a zombie.
+ * | v
+ * | +--------------+
+ * | | VNS_S_ONLINE | This is a vnd STREAMS device's
+ * | +--------------+ steady state. It will normally
+ * | | reside in this state while it is in
+ * | | active use. It will only transition
+ * v | to the next state when the STREAMS
+ * | | device is closed by the character
+ * | | device. In this state, all data
+ * | | flows over the dld fast path.
+ * | v
+ * | +---------------------+
+ * +--->| VNS_S_SHUTTING_DOWN | This vnd state takes care of
+ * | +---------------------+ disabling capabilities and
+ * | | flushing all data. At this point
+ * | | any additional data that we receive
+ * | | will be dropped. We leave this
+ * v | state by trying to remove multicast
+ * | | promiscuity.
+ * | |
+ * | v
+ * | +---------------------------------+
+ * +-->| VNS_S_MULTICAST_PROMISCOFF_SENT | In this state, we check if we have
+ * | +---------------------------------+ successfully removed multicast
+ * | | promiscuous mode. If we have
+ * | | failed, we still carry on but only
+ * | | warn. We leave this state by trying
+ * | | to disable SAP level promiscuous
+ * | | mode.
+ * | v
+ * | +---------------------------+
+ * +-->| VNS_S_SAP_PROMISCOFF_SENT | In this state, we check if we have
+ * | +---------------------------+ successfully removed SAP level
+ * | | promiscuous mode. If we have
+ * | | failed, we still carry on but only
+ * | | warn. Note that we don't worry
+ * | | about either of
+ * | | DL_PROMISC_FIXUPS or
+ * | | DL_PROMISC_RX_ONLY. If these are
+ * | | the only two entries left, then we
+ * | | should have anything that MAC is
+ * | | doing for us at this point,
+ * | | therefore it's safe for us to
+ * | | proceed to unbind, which is how we
+ * | | leave this state via a
+ * | v DL_UNBIND_REQ.
+ * | +-------------------+
+ * +--->| VNS_S_UNBIND_SENT | Here, we check how the unbind
+ * | +-------------------+ request went. Regardless of its
+ * | | success, we always transition to
+ * | | a zombie state.
+ * | v
+ * | +--------------+
+ * +--->| VNS_S_ZOMBIE | In this state, the vnd STREAMS
+ * +--------------+ device is waiting to finish being
+ * reaped. Because we have no more
+ * ways to receive data it should be
+ * safe to destroy all remaining data
+ * structures.
+ *
+ * If the stream association fails for any reason the state machine reaches
+ * VNS_S_ZOMBIE. A more detailed vnd_errno_t will propagate back through the
+ * STREAMS ioctl to the character device. That will fail the user ioctl and
+ * propagate the vnd_errno_t back to userland. If, on the other hand, the
+ * association succeeds, then the vnd STREAMS device will be fully plumbed up
+ * and ready to transmit and receive message blocks. Consumers will be able to
+ * start using the other cbops(9E) entry points once the attach has fully
+ * finished, which will occur after the original user attach ioctl to the
+ * character device returns.
+ *
+ * It's quite important that we end up sending the full series of STREAMS
+ * messages when tearing down. While it's tempting to say that we should just
+ * rely on the STREAMS device being closed to properly ensure that we have no
+ * more additional data, that's not sufficient due to our use of direct
+ * callbacks. DLS does not ensure that by the time we change the direct
+ * callback (vnd_mac_input) that all callers to it will have been quiesced.
+ * However, it does guarantee that if we disable promiscuous mode ourselves and
+ * we turn off the main data path via DL_UNBIND_REQ that it will work.
+ * Therefore, we make sure to do this ourselves rather than letting DLS/DLD do
+ * it as part of tearing down the STREAMS device. This ensures that we'll
+ * quiesce all data before we destroy our data structures and thus we should
+ * eliminate the race in changing the data function.
+ *
+ * --------------------
+ * General Architecture
+ * --------------------
+ *
+ * There are several different devices and structures in the vnd driver. There
+ * is a per-netstack component, pieces related to the character device that
+ * consumers see, the internal STREAMS device state, and the data queues
+ * themselves. The following ASCII art picture describes their relationships and
+ * some of the major pieces of data that contain them. These are not exhaustive,
+ * e.g. synchronization primitives are left out.
+ *
+ * +----------------+ +-----------------+
+ * | global | | global |
+ * | device list | | netstack list |
+ * | vnd_dev_list | | vnd_nsd_list |
+ * +----------------+ +-----------------+
+ * | |
+ * | v
+ * | +-------------------+ +-------------------+
+ * | | per-netstack data | ---> | per-netstack data | --> ...
+ * | | vnd_pnsd_t | | vnd_pnsd_t |
+ * | | | +-------------------+
+ * | | |
+ * | | nestackid_t ---+----> Netstack ID
+ * | | vnd_pnsd_flags_t -+----> Status flags
+ * | | zoneid_t ---+----> Zone ID for this netstack
+ * | | hook_family_t ---+----> VND IPv4 Hooks
+ * | | hook_family_t ---+----> VND IPv6 Hooks
+ * | | list_t ----+ |
+ * | +------------+------+
+ * | |
+ * | v
+ * | +------------------+ +------------------+
+ * | | character device | ---> | character device | -> ...
+ * +---------->| vnd_dev_t | | vnd_dev_t |
+ * | | +------------------+
+ * | |
+ * | minor_t ---+--> device minor number
+ * | ldi_handle_t ---+--> handle to /dev/net/%datalink
+ * | vnd_dev_flags_t -+--> device flags, non blocking, etc.
+ * | char[] ---+--> name if linked
+ * | vnd_str_t * -+ |
+ * +--------------+---+
+ * |
+ * v
+ * +-------------------------+
+ * | STREAMS device |
+ * | vnd_str_t |
+ * | |
+ * | vnd_str_state_t ---+---> State machine state
+ * | gsqueue_t * ---+---> mblk_t Serialization queue
+ * | vnd_str_stat_t ---+---> per-device kstats
+ * | vnd_str_capab_t ---+----------------------------+
+ * | vnd_data_queue_t ---+ | |
+ * | vnd_data_queue_t -+ | | v
+ * +-------------------+-+---+ +---------------------+
+ * | | | Stream capabilities |
+ * | | | vnd_str_capab_t |
+ * | | | |
+ * | | supported caps <--+-- vnd_capab_flags_t |
+ * | | dld cap handle <--+-- void * |
+ * | | direct tx func <--+-- vnd_dld_tx_t |
+ * | | +---------------------+
+ * | |
+ * +----------------+ +-------------+
+ * | |
+ * v v
+ * +-------------------+ +-------------------+
+ * | Read data queue | | Write data queue |
+ * | vnd_data_queue_t | | vnd_data_queue_t |
+ * | | | |
+ * | size_t ----+--> Current size | size_t ----+--> Current size
+ * | size_t ----+--> Max size | size_t ----+--> Max size
+ * | mblk_t * ----+--> Queue head | mblk_t * ----+--> Queue head
+ * | mblk_t * ----+--> Queue tail | mblk_t * ----+--> Queue tail
+ * +-------------------+ +-------------------+
+ *
+ *
+ * Globally, we maintain two lists. One list contains all of the character
+ * device soft states. The other maintains a list of all our netstack soft
+ * states. Each netstack maintains a list of active devices that have been
+ * associated with a datalink in its netstack.
+ *
+ * Recall that a given minor instance of the character device exists in one of
+ * two modes. It can either be a cloned open of /dev/vnd/ctl, the control node,
+ * or it can be associated with a given datalink. When minor instances are in
+ * the former state, they do not exist in a given vnd_pnsd_t's list of devices.
+ * As part of attaching to a datalink, the given vnd_dev_t will be inserted into
+ * the appropriate vnd_pnsd_t. In addition, this will cause a STREAMS device, a
+ * vnd_str_t, to be created and associated to a vnd_dev_t.
+ *
+ * The character device, and its vnd_dev_t, is the interface to the rest of the
+ * system. The vnd_dev_t keeps track of various aspects like whether various
+ * operations, such as read, write and the frameio ioctls, are considered
+ * blocking or non-blocking in the O_NONBLOCK sense. It also is responsible for
+ * keeping track of things like the name of the device, if any, in /dev. The
+ * vnd_str_t, on the other hand manages aspects like buffer sizes and the actual
+ * data queues. However, ioctls that manipulate these properties all go through
+ * the vnd_dev_t to its associated vnd_str_t.
+ *
+ * Each of the STREAMS devices, the vnd_str_t, maintains two data queues. One
+ * for frames to transmit (write queue) and one for frames received (read
+ * queue). These data queues have a maximum size and attempting to add data
+ * beyond that maximum size will result in data being dropped. The sizes are
+ * configurable via ioctls VND_IOC_SETTXBUF, VND_IOC_SETRXBUF. Data either sits
+ * in those buffers or has a reservation in those buffers while they are in vnd
+ * and waiting to be consumed by the user or by mac.
+ *
+ * Finally, the vnd_str_t also has a vnd_str_capab_t which we use to manage the
+ * available, negotiated, and currently active features.
+ *
+ * ----------------------
+ * Data Path and gsqueues
+ * ----------------------
+ *
+ * There's a lot of plumbing in vnd to get to the point where we can send data,
+ * but vnd's bread and butter is the data path, so it's worth diving into it in
+ * more detail. Data enters and exits the system from two ends.
+ *
+ * The first end is the vnd consumer. This comes in the form of read and write
+ * system calls as well as the frame I/O ioctls. The read and write system calls
+ * operate on a single frame at a time. Think of a frame as a single message
+ * that has come in off the wire, which may itself comprise multiple mblk_t's
+ * linked together in the kernel. readv(2) and writev(2) have the same
+ * limitations as read(2) and write(2). We enforce this as the system is
+ * required to fill up every uio(9S) buffer before moving onto the next one.
+ * This means that if you have a MTU sized buffer and two frames come in which
+ * are less than half of the MTU they must fill up the given iovec. Even if we
+ * didn't want to do this, we have no way of informing the supplier of the
+ * iovecs that they were only partially filled or where one frame ends and
+ * another begins. That's life, as such we have frame I/O which solves this
+ * problem. It allows for multiple frames to be consumed as well as for frames
+ * to be broken down into multiple vector components.
+ *
+ * The second end is the mac direct calls. As part of negotiating capabilities
+ * via dld, we give mac a function of ours to call when packets are received
+ * [vnd_mac_input()] and a callback to indicate that flow has been restored
+ * [vnd_mac_flow_control()]. In turn, we also get a function pointer that we can
+ * transmit data with. As part of the contract with mac, mac is allowed to flow
+ * control us by returning a cookie to the transmit function. When that happens,
+ * all outbound traffic is halted until our callback function is called and we
+ * can schedule drains.
+ *
+ * It's worth looking at these in further detail. We'll start with the rx path.
+ *
+ *
+ * |
+ * * . . . packets from gld
+ * |
+ * v
+ * +-------------+
+ * | mac |
+ * +-------------+
+ * |
+ * v
+ * +-------------+
+ * | dld |
+ * +-------------+
+ * |
+ * * . . . dld direct callback
+ * |
+ * v
+ * +---------------+
+ * | vnd_mac_input |
+ * +---------------+
+ * |
+ * v
+ * +---------+ +-------------+
+ * | dropped |<--*---------| vnd_hooks |
+ * | by | . +-------------+
+ * | hooks | . drop probe |
+ * +---------+ kstat bump * . . . Do we have free
+ * | buffer space?
+ * |
+ * no . | . yes
+ * . + .
+ * +---*--+------*-------+
+ * | |
+ * * . . drop probe * . . recv probe
+ * | kstat bump | kstat bump
+ * v |
+ * +---------+ * . . fire pollin
+ * | freemsg | v
+ * +---------+ +-----------------------+
+ * | vnd_str_t`vns_dq_read |
+ * +-----------------------+
+ * ^ ^
+ * +----------+ | | +---------+
+ * | read(9E) |-->-+ +--<--| frameio |
+ * +----------+ +---------+
+ *
+ * The rx path is rather linear. Packets come into us from mac. We always run
+ * them through the various hooks, and if they come out of that, we inspect the
+ * read data queue. If there is not enough space for a packet, we drop it.
+ * Otherwise, we append it to the data queue, and fire read notifications
+ * targetting anyone polling or doing blocking I/O on this device. Those
+ * consumers then drain the head of the data queue.
+ *
+ * The tx path is more complicated due to mac flow control. After any call into
+ * mac, we may have to potentially suspend writes and buffer data for an
+ * arbitrary amount of time. As such, we need to carefully track the total
+ * amount of outstanding data so that we don't waste kernel memory. This is
+ * further complicated by the fact that mac will asynchronously tell us when our
+ * flow has been resumed.
+ *
+ * For data to be able to enter the system, it needs to be able to take a
+ * reservation from the write data queue. Once the reservation has been
+ * obtained, we enter the gsqueue so that we can actually append it. We use
+ * gsqueues (serialization queues) to ensure that packets are manipulated in
+ * order as we deal with the draining and appending packets. We also leverage
+ * its worker thread to help us do draining after mac has restorted our flow.
+ *
+ * The following image describes the flow:
+ *
+ * +-----------+ +--------------+ +-------------------------+ +------+
+ * | write(9E) |-->| Space in the |--*--->| gsqueue_enter_one() |-->| Done |
+ * | frameio | | write queue? | . | +->vnd_squeue_tx_append | +------+
+ * +-----------+ +--------------+ . +-------------------------+
+ * | ^ .
+ * | | . reserve space from gsqueue
+ * | | |
+ * queue . . . * | space v
+ * full | * . . . avail +------------------------+
+ * v | | vnd_squeue_tx_append() |
+ * +--------+ +------------+ +------------------------+
+ * | EAGAIN |<--*------| Non-block? |<-+ |
+ * +--------+ . +------------+ | v
+ * . yes v | wait +--------------+
+ * no . .* * . . for | append chain |
+ * +----+ space | to outgoing |
+ * | mblk chain |
+ * from gsqueue +--------------+
+ * | |
+ * | +-------------------------------------------------+
+ * | |
+ * | | yes . . .
+ * v v .
+ * +-----------------------+ +--------------+ . +------+
+ * | vnd_squeue_tx_drain() |--->| mac blocked? |----*---->| Done |
+ * +-----------------------+ +--------------+ +------+
+ * | |
+ * +---------------------------------|---------------------+
+ * | | tx |
+ * | no . . * queue . . *
+ * | flow controlled . | empty * . fire pollout
+ * | . v | if mblk_t's
+ * +-------------+ . +---------------------+ | sent
+ * | set blocked |<----*------| vnd_squeue_tx_one() |--------^-------+
+ * | flags | +---------------------+ |
+ * +-------------+ More data | | | More data |
+ * and limit ^ v * . . and limit ^
+ * not reached . . * | | reached |
+ * +----+ | |
+ * v |
+ * +----------+ +-------------+ +---------------------------+
+ * | mac flow |--------->| remove mac |--->| gsqueue_enter_one() with |
+ * | control | | block flags | | vnd_squeue_tx_drain() and |
+ * | callback | +-------------+ | GSQUEUE_FILL flag, iff |
+ * +----------+ | not already scheduled |
+ * +---------------------------+
+ *
+ * The final path taken for a given write(9E)/frameio ioctl depends on whether
+ * or not the vnd_dev_t is non-blocking. That controls the initial path of
+ * trying to take a reservation in write data queue. If the device is in
+ * non-blocking mode, we'll return EAGAIN when there is not enough space
+ * available, otherwise, the calling thread blocks on the data queue.
+ *
+ * Today when we call into vnd_squeue_tx_drain() we will not try to drain the
+ * entire queue, as that could be quite large and we don't want to necessarily
+ * keep the thread that's doing the drain until it's been finished. Not only
+ * could more data be coming in, but the draining thread could be a userland
+ * thread that has more work to do. We have two limits today. There is an upper
+ * bound on the total amount of data and the total number of mblk_t chains. If
+ * we hit either limit, then we will schedule another drain in the gsqueue and
+ * go from there.
+ *
+ * It's worth taking some time to describe how we interact with gsqueues. vnd
+ * has a gsqueue_set_t for itself. It's important that it has its own set, as
+ * the profile of work that vnd does is different from other sub-systems in the
+ * kernel. When we open a STREAMS device in vnd_s_open, we get a random gsqueue.
+ * Unlike TCP/IP which uses an gsqueue for per TCP connection, we end up
+ * maintaining one for a given device. Because of that, we want to use a
+ * pseudo-random one to try and spread out the load, and picking one at random
+ * is likely to be just as good as any fancy algorithm we might come up with,
+ * especially as any two devices could have radically different transmit
+ * profiles.
+ *
+ * While some of the write path may seem complicated, it does allow us to
+ * maintain an important property. Once we have acknowledged a write(9E) or
+ * frameio ioctl, we will not drop the packet, excepting something like ipf via
+ * the firewall hooks.
+ *
+ * There is one other source of flow control that can exist in the system which
+ * is in the form of a barrier. The barrier is an internal mechanism used for
+ * ensuring that an gsqueue is drained for a given device. We use this as part
+ * of tearing down. Specifically we disable the write path so nothing new can be
+ * inserted into the gsqueue and then insert a barrier block. Once the barrier
+ * block comes out of the gsqueue, then we know nothing else in the gsqueue that
+ * could refer to the vnd_str_t, being destroyed, exists.
+ *
+ * ---------------------
+ * vnd, zones, netstacks
+ * ---------------------
+ *
+ * vnd devices are scoped to datalinks and datalinks are scoped to a netstack.
+ * Because of that, vnd is also a netstack module. It registers with the
+ * netstack sub-system and receives callbacks every time a netstack is created,
+ * being shutdown, and destroyed. The netstack callbacks drive the creation and
+ * destruction of the vnd_pnsd_t structures.
+ *
+ * Recall from the earlier architecture diagrams that every vnd device is scoped
+ * to a netstack and known about by a given vnd_pnsd_t. When that netstack is
+ * torn down, we also tear down any vnd devices that are hanging around. When
+ * the netstack is torn down, we know that any zones that are scoped to that
+ * netstack are being shut down and have no processes remaining. This is going
+ * to be the case whether they are shared or exclusive stack zones. We have to
+ * perform a careful dance.
+ *
+ * There are two different callbacks that happen on tear down, the first is a
+ * shutdown callback, the second is a destroy callback. When the shutdown
+ * callback is fired we need to prepare for the netstack to go away and ensure
+ * that nothing can continue to persist itself.
+ *
+ * More specifically, when we get notice of a stack being shutdown we first
+ * remove the netstack from the global netstack list to ensure that no one new
+ * can come in and find the netstack and get a reference to it. After that, we
+ * notify the neti hooks that they're going away. Once that's all done, we get
+ * to the heart of the matter.
+ *
+ * When shutting down there could be any number of outstanding contexts that
+ * have a reference on the vnd_pnsd_t and on the individual links. However, we
+ * know that no one new will be able to find the vnd_pnsd_t. To account for
+ * things that have existing references we mark the vnd_pnsd_t`vpnd_flags with
+ * VND_NS_CONDEMNED. This is checked by code paths that wish to append a device
+ * to the netstack's list. If this is set, then they must not append to it.
+ * Once this is set, we know that the netstack's list of devices can never grow,
+ * only shrink.
+ *
+ * Next, for each device we tag it with VND_D_ZONE_DYING. This indicates that
+ * the container for the device is being destroyed and that we should not allow
+ * additional references to the device to be created, whether via open, or
+ * linking. The presence of this bit also allows things like the list ioctl and
+ * sdev to know not to consider its existence. At the conclusion of this being
+ * set, we know that no one else should be able to obtain a new reference to the
+ * device.
+ *
+ * Once that has been set for all devices, we go through and remove any existing
+ * links that have been established in sdev. Because doing that may cause the
+ * final reference for the device to be dropped, which still has a reference to
+ * the netstack, we have to restart our walk due to dropped locks. We know that
+ * this walk will eventually complete because the device cannot be relinked and
+ * no new devices will be attached in this netstack due to VND_NS_CONDEMNED.
+ * Once that's finished, the shutdown callback returns.
+ *
+ * When we reach the destroy callback, we simply wait for references on the
+ * netstack to disappear. Because the zone has been shut down, all processes in
+ * it that have open references have been terminated and reaped. Any threads
+ * that are newly trying to reference it will fail. However, there is one thing
+ * that can halt this that we have no control over, which is the global zone
+ * holding open a reference to the device. In this case the zone halt will hang
+ * in vnd_stack_destroy. Once the last references is dropped we finish destroy
+ * the netinfo hooks and free the vnd_pnsd_t.
+ *
+ * ----
+ * sdev
+ * ----
+ *
+ * vnd registers a sdev plugin which allows it to dynamically fill out /dev/vnd
+ * for both the global and non-global zones. In any given zone we always supply
+ * a control node via /dev/vnd/ctl. This is the self-cloning node. Each zone
+ * will also have an entry per-link in that zone under /dev/vnd/%datalink, eg.
+ * if a link was named net0, there would be a /dev/vnd/net0. The global zone can
+ * also see every link for every zone, ala /dev/net, under
+ * /dev/vnd/%zonename/%datalink, eg. if a zone named 'turin' had a vnd device
+ * named net0, the global zone would have /dev/vnd/turin/net0.
+ *
+ * The sdev plugin has three interfaces that it supplies back to sdev. One is to
+ * validate that a given node is still valid. The next is a callback from sdev
+ * to say that it is no longer using the node. The third and final one is from
+ * sdev where it asks us to fill a directory. All of the heavy lifting is done
+ * in directory filling and in valiation. We opt not to maintain a reference on
+ * the device while there is an sdev node present. This makes the removal of
+ * nodes much simpler and most of the possible failure modes shouldn't cause any
+ * real problems. For example, the open path has to handle both dev_t's which no
+ * longer exist and which are no longer linked.
+ *
+ * -----
+ * hooks
+ * -----
+ *
+ * Like IP, vnd sends all L3 packets through its firewall hooks. Currently vnd
+ * provides these for L3 IP and IPv6 traffic. Each netstack provides these hooks
+ * in a minimal fashion. While we will allow traffic to be filtered through the
+ * hooks, we do not provide means for packet injection or additional inspection
+ * at this time. There are a total of four different events created:
+ *
+ * o IPv4 physical in
+ * o IPv4 physical out
+ * o IPv6 physical in
+ * o IPv6 physical out
+ *
+ * ---------------
+ * Synchronization
+ * ---------------
+ *
+ * To make our synchronization simpler, we've put more effort into making the
+ * metadata/setup paths do more work. That work allows the data paths to make
+ * assumptions around synchronization that simplify the general case. Each major
+ * structure, the vnd_pnsd_t, vnd_dev_t, vnd_str_t, and vnd_data_queue_t is
+ * annotated with the protection that its members receives. The following
+ * annotations are used:
+ *
+ * A Atomics; these values are only modified using atomics values.
+ * Currently this only applies to kstat values.
+ * E Existence; no lock is needed to access this member, it does not
+ * change while the structure is valid.
+ * GL Global Lock; these members are protected by the global
+ * vnd_dev_lock.
+ * L Locked; access to the member is controlled by a lock that is in
+ * the structure.
+ * NSL netstack lock; this member is protected by the containing
+ * netstack. This only applies to the vnd_dev_t`vdd_nslink.
+ * X This member is special, and is discussed in this section.
+ *
+ * In addition to locking, we also have reference counts on the vnd_dev_t and
+ * the vnd_pnsd_t. The reference counts describe the lifetimes of the structure.
+ * With rare exception, once a reference count is decremented, the consumer
+ * should not assume that the data is valid any more. The only exception to this
+ * is the case where we're removing an extant reference count from a link into
+ * /devices or /dev. Reference counts are obtained on these structures as a part
+ * of looking them up.
+ *
+ * # Global Lock Ordering
+ * ######################
+ *
+ * The following is the order that you must take locks in vnd:
+ *
+ * 1) vnd`vnd_dev_lock
+ * 2) vnd_pnsd_t`vpnd_lock
+ * 3) vnd_dev_t`vnd_lock
+ * 4) vnd_str_t`vns_lock
+ * 5) vnd_data_queue_t`vdq_lock
+ *
+ * One must adhere to the following rules:
+ *
+ * o You must acquire a lower numbered lock before a high numbered lock.
+ * o It is NOT legal to hold two locks of the same level concurrently, eg. you
+ * can not hold two different vnd_dev_t's vnd_lock at the same time.
+ * o You may release locks in any order.
+ * o If you release a lock, you must honor the locking rules before acquiring
+ * it again.
+ * o You should not hold any locks when calling any of the rele functions.
+ *
+ * # Special Considerations
+ * ########################
+ *
+ * While most of the locking is what's expected, it's worth going into the
+ * special nature that a few members hold. Today, only two structures have
+ * special considerations: the vnd_dev_t and the vnd_str_t. All members with
+ * special considerations have an additional annotation that describes how you
+ * should interact with it.
+ *
+ * vnd_dev_t: The vdd_nsd and vdd_cr are only valid when the minor node is
+ * attached or in the process of attaching. If the code path that goes through
+ * requires an attached vnd_dev_t, eg. the data path and tear down path, then it
+ * is always legal to dereference that member without a lock held. When they are
+ * added to the system, they should be done under the vdd_lock and done as part
+ * of setting the VND_D_ATTACH_INFLIGHT flag. These should not change during the
+ * lifetime of the vnd_dev_t.
+ *
+ * vnd_dev_t: The vdd_ldih is similar to the vdd_nsd and vdd_cr, except that it
+ * always exists as it is a part of the structure. The only time that it's valid
+ * to be using it is during the attach path with the VND_D_ATTACH_INFLIGHT flag
+ * set or during tear down. Outside of those paths which are naturally
+ * serialized, there is no explicit locking around the member.
+ *
+ * vnd_str_t: The vns_dev and vns_nsd work in similar ways. They are not
+ * initially set as part of creating the structure, but are set as part of
+ * responding to the association ioctl. Anything in the data path or metadata
+ * path that requires association may assume that they exist, as we do not kick
+ * off the state machine until they're set.
+ *
+ * vnd_str_t: The vns_drainblk and vns_barrierblk are similarly special. The
+ * members are designed to be used as part of various operations with the
+ * gsqueues. A lock isn't needed to use them, but to work with them, the
+ * appropriate flag in the vnd_str_t`vns_flags must have been set by the current
+ * thread. Otherwise, it is always fair game to refer to their addresses. Their
+ * contents are ignored by vnd, but some members are manipulated by the gsqueue
+ * subsystem.
+ */
+
+#include <sys/conf.h>
+#include <sys/devops.h>
+#include <sys/modctl.h>
+#include <sys/stat.h>
+#include <sys/file.h>
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/open.h>
+#include <sys/ddi.h>
+#include <sys/ethernet.h>
+#include <sys/stropts.h>
+#include <sys/sunddi.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#include <sys/ksynch.h>
+#include <sys/taskq_impl.h>
+#include <sys/sdt.h>
+#include <sys/debug.h>
+#include <sys/sysmacros.h>
+#include <sys/dlpi.h>
+#include <sys/cred.h>
+#include <sys/id_space.h>
+#include <sys/list.h>
+#include <sys/ctype.h>
+#include <sys/policy.h>
+#include <sys/sunldi.h>
+#include <sys/cred.h>
+#include <sys/strsubr.h>
+#include <sys/poll.h>
+#include <sys/neti.h>
+#include <sys/hook.h>
+#include <sys/hook_event.h>
+#include <sys/vlan.h>
+#include <sys/dld.h>
+#include <sys/mac_client.h>
+#include <sys/netstack.h>
+#include <sys/fs/sdev_plugin.h>
+#include <sys/kstat.h>
+#include <sys/atomic.h>
+#include <sys/disp.h>
+#include <sys/random.h>
+#include <sys/gsqueue.h>
+#include <sys/smt.h>
+
+#include <inet/ip.h>
+#include <inet/ip6.h>
+
+#include <sys/vnd.h>
+
+/*
+ * Globals
+ */
+static dev_info_t *vnd_dip;
+static taskq_t *vnd_taskq;
+static kmem_cache_t *vnd_str_cache;
+static kmem_cache_t *vnd_dev_cache;
+static kmem_cache_t *vnd_pnsd_cache;
+static id_space_t *vnd_minors;
+static int vnd_list_init = 0;
+static sdev_plugin_hdl_t vnd_sdev_hdl;
+static gsqueue_set_t *vnd_sqset;
+
+static kmutex_t vnd_dev_lock;
+static list_t vnd_dev_list; /* Protected by the vnd_dev_lock */
+static list_t vnd_nsd_list; /* Protected by the vnd_dev_lock */
+
+/*
+ * STREAMs ioctls
+ *
+ * The STREAMs ioctls are internal to vnd. No one should be seeing them, as such
+ * they aren't a part of the header file.
+ */
+#define VND_STRIOC (('v' << 24) | ('n' << 16) | ('d' << 8) | 0x80)
+
+/*
+ * Private ioctl to associate a given streams instance with a minor instance of
+ * the character device.
+ */
+#define VND_STRIOC_ASSOCIATE (VND_STRIOC | 0x1)
+
+typedef struct vnd_strioc_associate {
+ minor_t vsa_minor; /* minor device node */
+ netstackid_t vsa_nsid; /* netstack id */
+ vnd_errno_t vsa_errno; /* errno */
+} vnd_strioc_associate_t;
+
+typedef enum vnd_strioc_state {
+ VSS_UNKNOWN = 0,
+ VSS_COPYIN = 1,
+ VSS_COPYOUT = 2,
+} vnd_strioc_state_t;
+
+typedef struct vnd_strioc {
+ vnd_strioc_state_t vs_state;
+ caddr_t vs_addr;
+} vnd_strioc_t;
+
+/*
+ * VND SQUEUE TAGS, start at 0x42 so we don't overlap with extent tags. Though
+ * really, overlap is at the end of the day, inevitable.
+ */
+#define VND_SQUEUE_TAG_TX_DRAIN 0x42
+#define VND_SQUEUE_TAG_MAC_FLOW_CONTROL 0x43
+#define VND_SQUEUE_TAG_VND_WRITE 0x44
+#define VND_SQUEUE_TAG_ND_FRAMEIO_WRITE 0x45
+#define VND_SQUEUE_TAG_STRBARRIER 0x46
+
+/*
+ * vnd reserved names. These are names which are reserved by vnd and thus
+ * shouldn't be used by some external program.
+ */
+static char *vnd_reserved_names[] = {
+ "ctl",
+ "zone",
+ NULL
+};
+
+/*
+ * vnd's DTrace probe macros
+ *
+ * DTRACE_VND* are all for a stable provider. We also have an unstable internal
+ * set of probes for reference count manipulation.
+ */
+#define DTRACE_VND3(name, type1, arg1, type2, arg2, type3, arg3) \
+ DTRACE_PROBE3(__vnd_##name, type1, arg1, type2, arg2, type3, arg3);
+
+#define DTRACE_VND4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) \
+ DTRACE_PROBE4(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \
+ type4, arg4);
+
+#define DTRACE_VND5(name, type1, arg1, type2, arg2, type3, arg3, \
+ type4, arg4, type5, arg5) \
+ DTRACE_PROBE5(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \
+ type4, arg4, type5, arg5);
+
+#define DTRACE_VND_REFINC(vdp) \
+ DTRACE_PROBE2(vnd__ref__inc, vnd_dev_t *, vdp, int, vdp->vdd_ref);
+#define DTRACE_VND_REFDEC(vdp) \
+ DTRACE_PROBE2(vnd__ref__dec, vnd_dev_t *, vdp, int, vdp->vdd_ref);
+
+
+/*
+ * Tunables
+ */
+size_t vnd_vdq_default_size = 1024 * 64; /* 64 KB */
+size_t vnd_vdq_hard_max = 1024 * 1024 * 4; /* 4 MB */
+
+/*
+ * These numbers are designed as per-device tunables that are applied when a new
+ * vnd device is attached. They're a rough stab at what may be a reasonable
+ * amount of work to do in one burst in an squeue.
+ */
+size_t vnd_flush_burst_size = 1520 * 10; /* 10 1500 MTU packets */
+size_t vnd_flush_nburst = 10; /* 10 frames */
+
+/*
+ * Constants related to our sdev plugins
+ */
+#define VND_SDEV_NAME "vnd"
+#define VND_SDEV_ROOT "/dev/vnd"
+#define VND_SDEV_ZROOT "/dev/vnd/zone"
+
+/*
+ * vnd relies on privileges, not mode bits to limit access. As such, device
+ * files are read-write to everyone.
+ */
+#define VND_SDEV_MODE (S_IFCHR | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | \
+ S_IROTH | S_IWOTH)
+
+/*
+ * Statistic macros
+ */
+#define VND_STAT_INC(vsp, field, val) \
+ atomic_add_64(&(vsp)->vns_ksdata.field.value.ui64, val)
+#define VND_LATENCY_1MS 1000000
+#define VND_LATENCY_10MS 10000000
+#define VND_LATENCY_100MS 100000000
+#define VND_LATENCY_1S 1000000000
+#define VND_LATENCY_10S 10000000000
+
+/*
+ * Constants for vnd hooks
+ */
+static uint8_t vnd_bcast_addr[6] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
+#define IPV4_MCAST_LEN 3
+static uint8_t vnd_ipv4_mcast[3] = { 0x01, 0x00, 0x5E };
+#define IPV6_MCAST_LEN 2
+static uint8_t vnd_ipv6_mcast[2] = { 0x33, 0x33 };
+
+/*
+ * vnd internal data structures and types
+ */
+
+struct vnd_str;
+struct vnd_dev;
+struct vnd_pnsd;
+
+/*
+ * As part of opening the device stream we need to properly communicate with our
+ * underlying stream. This is a bit of an asynchronous dance and we need to
+ * properly work with dld to get everything set up. We have to initiate the
+ * conversation with dld and as such we keep track of our state here.
+ */
+typedef enum vnd_str_state {
+ VNS_S_INITIAL = 0,
+ VNS_S_INFO_SENT,
+ VNS_S_EXCLUSIVE_SENT,
+ VNS_S_ATTACH_SENT,
+ VNS_S_BIND_SENT,
+ VNS_S_SAP_PROMISC_SENT,
+ VNS_S_MULTI_PROMISC_SENT,
+ VNS_S_RX_ONLY_PROMISC_SENT,
+ VNS_S_FIXUP_PROMISC_SENT,
+ VNS_S_CAPAB_Q_SENT,
+ VNS_S_CAPAB_E_SENT,
+ VNS_S_ONLINE,
+ VNS_S_SHUTTING_DOWN,
+ VNS_S_MULTICAST_PROMISCOFF_SENT,
+ VNS_S_SAP_PROMISCOFF_SENT,
+ VNS_S_UNBIND_SENT,
+ VNS_S_ZOMBIE
+} vnd_str_state_t;
+
+typedef enum vnd_str_flags {
+ VNS_F_NEED_ZONE = 0x1,
+ VNS_F_TASKQ_DISPATCHED = 0x2,
+ VNS_F_CONDEMNED = 0x4,
+ VNS_F_FLOW_CONTROLLED = 0x8,
+ VNS_F_DRAIN_SCHEDULED = 0x10,
+ VNS_F_BARRIER = 0x20,
+ VNS_F_BARRIER_DONE = 0x40
+} vnd_str_flags_t;
+
+typedef enum vnd_capab_flags {
+ VNS_C_HCKSUM = 0x1,
+ VNS_C_DLD = 0x2,
+ VNS_C_DIRECT = 0x4,
+ VNS_C_HCKSUM_BADVERS = 0x8
+} vnd_capab_flags_t;
+
+/*
+ * Definitions to interact with direct callbacks
+ */
+typedef void (*vnd_rx_t)(struct vnd_str *, mac_resource_t *, mblk_t *,
+ mac_header_info_t *);
+typedef uintptr_t vnd_mac_cookie_t;
+/* DLD Direct capability function */
+typedef int (*vnd_dld_cap_t)(void *, uint_t, void *, uint_t);
+/* DLD Direct tx function */
+typedef vnd_mac_cookie_t (*vnd_dld_tx_t)(void *, mblk_t *, uint64_t, uint16_t);
+/* DLD Direct function to set flow control callback */
+typedef void *(*vnd_dld_set_fcb_t)(void *, void (*)(void *, vnd_mac_cookie_t),
+ void *);
+/* DLD Direct function to see if flow controlled still */
+typedef int (*vnd_dld_is_fc_t)(void *, vnd_mac_cookie_t);
+
+/*
+ * The vnd_str_capab_t is always protected by the vnd_str_t it's a member of.
+ */
+typedef struct vnd_str_capab {
+ vnd_capab_flags_t vsc_flags;
+ t_uscalar_t vsc_hcksum_opts;
+ vnd_dld_cap_t vsc_capab_f;
+ void *vsc_capab_hdl;
+ vnd_dld_tx_t vsc_tx_f;
+ void *vsc_tx_hdl;
+ vnd_dld_set_fcb_t vsc_set_fcb_f;
+ void *vsc_set_fcb_hdl;
+ vnd_dld_is_fc_t vsc_is_fc_f;
+ void *vsc_is_fc_hdl;
+ vnd_mac_cookie_t vsc_fc_cookie;
+ void *vsc_tx_fc_hdl;
+} vnd_str_capab_t;
+
+/*
+ * The vnd_data_queue is a simple construct for storing a series of messages in
+ * a queue.
+ *
+ * See synchronization section of the big theory statement for member
+ * annotations.
+ */
+typedef struct vnd_data_queue {
+ struct vnd_str *vdq_vns; /* E */
+ kmutex_t vdq_lock;
+ kcondvar_t vdq_ready; /* Uses vdq_lock */
+ ssize_t vdq_max; /* L */
+ ssize_t vdq_cur; /* L */
+ mblk_t *vdq_head; /* L */
+ mblk_t *vdq_tail; /* L */
+} vnd_data_queue_t;
+
+typedef struct vnd_str_stat {
+ kstat_named_t vks_rbytes;
+ kstat_named_t vks_rpackets;
+ kstat_named_t vks_obytes;
+ kstat_named_t vks_opackets;
+ kstat_named_t vks_nhookindrops;
+ kstat_named_t vks_nhookoutdrops;
+ kstat_named_t vks_ndlpidrops;
+ kstat_named_t vks_ndataindrops;
+ kstat_named_t vks_ndataoutdrops;
+ kstat_named_t vks_tdrops;
+ kstat_named_t vks_linkname;
+ kstat_named_t vks_zonename;
+ kstat_named_t vks_nmacflow;
+ kstat_named_t vks_tmacflow;
+ kstat_named_t vks_mac_flow_1ms;
+ kstat_named_t vks_mac_flow_10ms;
+ kstat_named_t vks_mac_flow_100ms;
+ kstat_named_t vks_mac_flow_1s;
+ kstat_named_t vks_mac_flow_10s;
+} vnd_str_stat_t;
+
+/*
+ * vnd stream structure
+ *
+ * See synchronization section of the big theory statement for member
+ * annotations.
+ */
+typedef struct vnd_str {
+ kmutex_t vns_lock;
+ kcondvar_t vns_cancelcv; /* Uses vns_lock */
+ kcondvar_t vns_barriercv; /* Uses vns_lock */
+ kcondvar_t vns_stcv; /* Uses vns_lock */
+ vnd_str_state_t vns_state; /* L */
+ vnd_str_state_t vns_laststate; /* L */
+ vnd_errno_t vns_errno; /* L */
+ vnd_str_flags_t vns_flags; /* L */
+ vnd_str_capab_t vns_caps; /* L */
+ taskq_ent_t vns_tqe; /* L */
+ vnd_data_queue_t vns_dq_read; /* E */
+ vnd_data_queue_t vns_dq_write; /* E */
+ mblk_t *vns_dlpi_inc; /* L */
+ queue_t *vns_rq; /* E */
+ queue_t *vns_wq; /* E */
+ queue_t *vns_lrq; /* E */
+ t_uscalar_t vns_dlpi_style; /* L */
+ t_uscalar_t vns_minwrite; /* L */
+ t_uscalar_t vns_maxwrite; /* L */
+ hrtime_t vns_fclatch; /* L */
+ hrtime_t vns_fcupdate; /* L */
+ kstat_t *vns_kstat; /* E */
+ gsqueue_t *vns_squeue; /* E */
+ mblk_t vns_drainblk; /* E + X */
+ mblk_t vns_barrierblk; /* E + X */
+ vnd_str_stat_t vns_ksdata; /* A */
+ size_t vns_nflush; /* L */
+ size_t vns_bsize; /* L */
+ struct vnd_dev *vns_dev; /* E + X */
+ struct vnd_pnsd *vns_nsd; /* E + X */
+} vnd_str_t;
+
+typedef enum vnd_dev_flags {
+ VND_D_ATTACH_INFLIGHT = 0x001,
+ VND_D_ATTACHED = 0x002,
+ VND_D_LINK_INFLIGHT = 0x004,
+ VND_D_LINKED = 0x008,
+ VND_D_CONDEMNED = 0x010,
+ VND_D_ZONE_DYING = 0x020,
+ VND_D_OPENED = 0x040
+} vnd_dev_flags_t;
+
+/*
+ * This represents the data associated with a minor device instance.
+ *
+ * See synchronization section of the big theory statement for member
+ * annotations.
+ */
+typedef struct vnd_dev {
+ kmutex_t vdd_lock;
+ list_node_t vdd_link; /* GL */
+ list_node_t vdd_nslink; /* NSL */
+ int vdd_ref; /* L */
+ vnd_dev_flags_t vdd_flags; /* L */
+ minor_t vdd_minor; /* E */
+ dev_t vdd_devid; /* E */
+ ldi_ident_t vdd_ldiid; /* E */
+ ldi_handle_t vdd_ldih; /* X */
+ cred_t *vdd_cr; /* X */
+ vnd_str_t *vdd_str; /* L */
+ struct pollhead vdd_ph; /* E */
+ struct vnd_pnsd *vdd_nsd; /* E + X */
+ char vdd_datalink[VND_NAMELEN]; /* L */
+ char vdd_lname[VND_NAMELEN]; /* L */
+} vnd_dev_t;
+
+typedef enum vnd_pnsd_flags {
+ VND_NS_CONDEMNED = 0x1
+} vnd_pnsd_flags_t;
+
+/*
+ * Per netstack data structure.
+ *
+ * See synchronization section of the big theory statement for member
+ * annotations.
+ */
+typedef struct vnd_pnsd {
+ list_node_t vpnd_link; /* protected by global dev lock */
+ zoneid_t vpnd_zid; /* E */
+ netstackid_t vpnd_nsid; /* E */
+ boolean_t vpnd_hooked; /* E */
+ net_handle_t vpnd_neti_v4; /* E */
+ hook_family_t vpnd_family_v4; /* E */
+ hook_event_t vpnd_event_in_v4; /* E */
+ hook_event_t vpnd_event_out_v4; /* E */
+ hook_event_token_t vpnd_token_in_v4; /* E */
+ hook_event_token_t vpnd_token_out_v4; /* E */
+ net_handle_t vpnd_neti_v6; /* E */
+ hook_family_t vpnd_family_v6; /* E */
+ hook_event_t vpnd_event_in_v6; /* E */
+ hook_event_t vpnd_event_out_v6; /* E */
+ hook_event_token_t vpnd_token_in_v6; /* E */
+ hook_event_token_t vpnd_token_out_v6; /* E */
+ kmutex_t vpnd_lock; /* Protects remaining members */
+ kcondvar_t vpnd_ref_change; /* Uses vpnd_lock */
+ int vpnd_ref; /* L */
+ vnd_pnsd_flags_t vpnd_flags; /* L */
+ list_t vpnd_dev_list; /* L */
+} vnd_pnsd_t;
+
+static void vnd_squeue_tx_drain(void *, mblk_t *, gsqueue_t *, void *);
+
+/*
+ * Drop function signature.
+ */
+typedef void (*vnd_dropper_f)(vnd_str_t *, mblk_t *, const char *);
+
+static void
+vnd_drop_ctl(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+ DTRACE_VND4(drop__ctl, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
+ mp, const char *, reason);
+ if (mp != NULL) {
+ freemsg(mp);
+ }
+ VND_STAT_INC(vsp, vks_ndlpidrops, 1);
+ VND_STAT_INC(vsp, vks_tdrops, 1);
+}
+
+static void
+vnd_drop_in(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+ DTRACE_VND4(drop__in, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
+ mp, const char *, reason);
+ if (mp != NULL) {
+ freemsg(mp);
+ }
+ VND_STAT_INC(vsp, vks_ndataindrops, 1);
+ VND_STAT_INC(vsp, vks_tdrops, 1);
+}
+
+static void
+vnd_drop_out(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+ DTRACE_VND4(drop__out, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
+ mp, const char *, reason);
+ if (mp != NULL) {
+ freemsg(mp);
+ }
+ VND_STAT_INC(vsp, vks_ndataoutdrops, 1);
+ VND_STAT_INC(vsp, vks_tdrops, 1);
+}
+
+static void
+vnd_drop_hook_in(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+ DTRACE_VND4(drop__in, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
+ mp, const char *, reason);
+ if (mp != NULL) {
+ freemsg(mp);
+ }
+ VND_STAT_INC(vsp, vks_nhookindrops, 1);
+ VND_STAT_INC(vsp, vks_tdrops, 1);
+}
+
+static void
+vnd_drop_hook_out(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+ DTRACE_VND4(drop__out, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
+ mp, const char *, reason);
+ if (mp != NULL) {
+ freemsg(mp);
+ }
+ VND_STAT_INC(vsp, vks_nhookoutdrops, 1);
+ VND_STAT_INC(vsp, vks_tdrops, 1);
+}
+
+/* ARGSUSED */
+static void
+vnd_drop_panic(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+ panic("illegal vnd drop");
+}
+
+/* ARGSUSED */
+static void
+vnd_mac_drop_input(vnd_str_t *vsp, mac_resource_t *unused, mblk_t *mp_chain,
+ mac_header_info_t *mhip)
+{
+ mblk_t *mp;
+
+ while (mp_chain != NULL) {
+ mp = mp_chain;
+ mp_chain = mp->b_next;
+ vnd_drop_hook_in(vsp, mp, "stream not associated");
+ }
+}
+
+static vnd_pnsd_t *
+vnd_nsd_lookup(netstackid_t nsid)
+{
+ vnd_pnsd_t *nsp;
+
+ mutex_enter(&vnd_dev_lock);
+ for (nsp = list_head(&vnd_nsd_list); nsp != NULL;
+ nsp = list_next(&vnd_nsd_list, nsp)) {
+ if (nsp->vpnd_nsid == nsid) {
+ mutex_enter(&nsp->vpnd_lock);
+ VERIFY(nsp->vpnd_ref >= 0);
+ nsp->vpnd_ref++;
+ mutex_exit(&nsp->vpnd_lock);
+ break;
+ }
+ }
+ mutex_exit(&vnd_dev_lock);
+ return (nsp);
+}
+
+static vnd_pnsd_t *
+vnd_nsd_lookup_by_zid(zoneid_t zid)
+{
+ netstack_t *ns;
+ vnd_pnsd_t *nsp;
+ ns = netstack_find_by_zoneid(zid);
+ if (ns == NULL)
+ return (NULL);
+ nsp = vnd_nsd_lookup(ns->netstack_stackid);
+ netstack_rele(ns);
+ return (nsp);
+}
+
+static vnd_pnsd_t *
+vnd_nsd_lookup_by_zonename(char *zname)
+{
+ zone_t *zonep;
+ vnd_pnsd_t *nsp;
+
+ zonep = zone_find_by_name(zname);
+ if (zonep == NULL)
+ return (NULL);
+
+ nsp = vnd_nsd_lookup_by_zid(zonep->zone_id);
+ zone_rele(zonep);
+ return (nsp);
+}
+
+static void
+vnd_nsd_ref(vnd_pnsd_t *nsp)
+{
+ mutex_enter(&nsp->vpnd_lock);
+ /*
+ * This can only be used on something that has been obtained through
+ * some other means. As such, the caller should already have a reference
+ * before adding another one. This function should not be used as a
+ * means of creating the initial reference.
+ */
+ VERIFY(nsp->vpnd_ref > 0);
+ nsp->vpnd_ref++;
+ mutex_exit(&nsp->vpnd_lock);
+ cv_broadcast(&nsp->vpnd_ref_change);
+}
+
+static void
+vnd_nsd_rele(vnd_pnsd_t *nsp)
+{
+ mutex_enter(&nsp->vpnd_lock);
+ VERIFY(nsp->vpnd_ref > 0);
+ nsp->vpnd_ref--;
+ mutex_exit(&nsp->vpnd_lock);
+ cv_broadcast(&nsp->vpnd_ref_change);
+}
+
+static vnd_dev_t *
+vnd_dev_lookup(minor_t m)
+{
+ vnd_dev_t *vdp;
+ mutex_enter(&vnd_dev_lock);
+ for (vdp = list_head(&vnd_dev_list); vdp != NULL;
+ vdp = list_next(&vnd_dev_list, vdp)) {
+ if (vdp->vdd_minor == m) {
+ mutex_enter(&vdp->vdd_lock);
+ VERIFY(vdp->vdd_ref > 0);
+ vdp->vdd_ref++;
+ DTRACE_VND_REFINC(vdp);
+ mutex_exit(&vdp->vdd_lock);
+ break;
+ }
+ }
+ mutex_exit(&vnd_dev_lock);
+ return (vdp);
+}
+
+static void
+vnd_dev_free(vnd_dev_t *vdp)
+{
+ /*
+ * When the STREAM exists we need to go through and make sure
+ * communication gets torn down. As part of closing the stream, we
+ * guarantee that nothing else should be able to enter the stream layer
+ * at this point. That means no one should be able to call
+ * read(),write() or one of the frameio ioctls.
+ */
+ if (vdp->vdd_flags & VND_D_ATTACHED) {
+ (void) ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr);
+ crfree(vdp->vdd_cr);
+ vdp->vdd_cr = NULL;
+
+ /*
+ * We have to remove ourselves from our parents list now. It is
+ * really quite important that we have already set the condemend
+ * flag here so that our containing netstack basically knows
+ * that we're on the way down and knows not to wait for us. It's
+ * also important that we do that before we put a rele on the
+ * the device as that is the point at which it will check again.
+ */
+ mutex_enter(&vdp->vdd_nsd->vpnd_lock);
+ list_remove(&vdp->vdd_nsd->vpnd_dev_list, vdp);
+ mutex_exit(&vdp->vdd_nsd->vpnd_lock);
+ vnd_nsd_rele(vdp->vdd_nsd);
+ vdp->vdd_nsd = NULL;
+ }
+ ASSERT(vdp->vdd_flags & VND_D_CONDEMNED);
+ id_free(vnd_minors, vdp->vdd_minor);
+ mutex_destroy(&vdp->vdd_lock);
+ kmem_cache_free(vnd_dev_cache, vdp);
+}
+
+static void
+vnd_dev_ref(vnd_dev_t *vdp)
+{
+ mutex_enter(&vdp->vdd_lock);
+ VERIFY(vdp->vdd_ref > 0);
+ vdp->vdd_ref++;
+ DTRACE_VND_REFINC(vdp);
+ mutex_exit(&vdp->vdd_lock);
+}
+
+/*
+ * As part of releasing the hold on this we may tear down a given vnd_dev_t As
+ * such we need to make sure that we grab the list lock first before grabbing
+ * the vnd_dev_t's lock to ensure proper lock ordering.
+ */
+static void
+vnd_dev_rele(vnd_dev_t *vdp)
+{
+ mutex_enter(&vnd_dev_lock);
+ mutex_enter(&vdp->vdd_lock);
+ VERIFY(vdp->vdd_ref > 0);
+ vdp->vdd_ref--;
+ DTRACE_VND_REFDEC(vdp);
+ if (vdp->vdd_ref > 0) {
+ mutex_exit(&vdp->vdd_lock);
+ mutex_exit(&vnd_dev_lock);
+ return;
+ }
+
+ /*
+ * Now that we've removed this from the list, we can go ahead and
+ * drop the list lock. No one else can find this device and reference
+ * it. As its reference count is zero, it by definition does not have
+ * any remaining entries in /devices that could lead someone back to
+ * this.
+ */
+ vdp->vdd_flags |= VND_D_CONDEMNED;
+ list_remove(&vnd_dev_list, vdp);
+ mutex_exit(&vdp->vdd_lock);
+ mutex_exit(&vnd_dev_lock);
+
+ vnd_dev_free(vdp);
+}
+
+/*
+ * Insert a mesage block chain if there's space, otherwise drop it. Return one
+ * so someone who was waiting for data would now end up having found it. eg.
+ * caller should consider a broadcast.
+ */
+static int
+vnd_dq_push(vnd_data_queue_t *vqp, mblk_t *mp, boolean_t reserved,
+ vnd_dropper_f dropf)
+{
+ size_t msize;
+
+ ASSERT(MUTEX_HELD(&vqp->vdq_lock));
+ if (reserved == B_FALSE) {
+ msize = msgsize(mp);
+ if (vqp->vdq_cur + msize > vqp->vdq_max) {
+ dropf(vqp->vdq_vns, mp, "buffer full");
+ return (0);
+ }
+ vqp->vdq_cur += msize;
+ }
+
+ if (vqp->vdq_head == NULL) {
+ ASSERT(vqp->vdq_tail == NULL);
+ vqp->vdq_head = mp;
+ vqp->vdq_tail = mp;
+ } else {
+ vqp->vdq_tail->b_next = mp;
+ vqp->vdq_tail = mp;
+ }
+
+ return (1);
+}
+
+/*
+ * Remove a message message block chain. If the amount of space in the buffer
+ * has changed we return 1. We have no way of knowing whether or not there is
+ * enough space overall for a given writer who is blocked, so we always end up
+ * having to return true and thus tell consumers that they should consider
+ * signalling.
+ */
+static int
+vnd_dq_pop(vnd_data_queue_t *vqp, mblk_t **mpp)
+{
+ size_t msize;
+ mblk_t *mp;
+
+ ASSERT(MUTEX_HELD(&vqp->vdq_lock));
+ ASSERT(mpp != NULL);
+ if (vqp->vdq_head == NULL) {
+ ASSERT(vqp->vdq_tail == NULL);
+ *mpp = NULL;
+ return (0);
+ }
+
+ mp = vqp->vdq_head;
+ msize = msgsize(mp);
+
+ vqp->vdq_cur -= msize;
+ if (mp->b_next == NULL) {
+ vqp->vdq_head = NULL;
+ vqp->vdq_tail = NULL;
+ /*
+ * We can't be certain that this is always going to be zero.
+ * Someone may have basically taken a reservation of space on
+ * the data queue, eg. claimed spae but not yet pushed it on
+ * yet.
+ */
+ ASSERT(vqp->vdq_cur >= 0);
+ } else {
+ vqp->vdq_head = mp->b_next;
+ ASSERT(vqp->vdq_cur > 0);
+ }
+ mp->b_next = NULL;
+ *mpp = mp;
+ return (1);
+}
+
+/*
+ * Reserve space in the queue. This will bump up the size of the queue and
+ * entitle the user to push something on later without bumping the space.
+ */
+static int
+vnd_dq_reserve(vnd_data_queue_t *vqp, ssize_t size)
+{
+ ASSERT(MUTEX_HELD(&vqp->vdq_lock));
+ ASSERT(size >= 0);
+
+ if (size == 0)
+ return (0);
+
+ if (size + vqp->vdq_cur > vqp->vdq_max)
+ return (0);
+
+ vqp->vdq_cur += size;
+ return (1);
+}
+
+static void
+vnd_dq_unreserve(vnd_data_queue_t *vqp, ssize_t size)
+{
+ ASSERT(MUTEX_HELD(&vqp->vdq_lock));
+ ASSERT(size > 0);
+ ASSERT(size <= vqp->vdq_cur);
+
+ vqp->vdq_cur -= size;
+}
+
+static void
+vnd_dq_flush(vnd_data_queue_t *vqp, vnd_dropper_f dropf)
+{
+ mblk_t *mp, *next;
+
+ mutex_enter(&vqp->vdq_lock);
+ for (mp = vqp->vdq_head; mp != NULL; mp = next) {
+ next = mp->b_next;
+ mp->b_next = NULL;
+ dropf(vqp->vdq_vns, mp, "vnd_dq_flush");
+ }
+ vqp->vdq_cur = 0;
+ vqp->vdq_head = NULL;
+ vqp->vdq_tail = NULL;
+ mutex_exit(&vqp->vdq_lock);
+}
+
+static boolean_t
+vnd_dq_is_empty(vnd_data_queue_t *vqp)
+{
+ boolean_t ret;
+
+ mutex_enter(&vqp->vdq_lock);
+ if (vqp->vdq_head == NULL)
+ ret = B_TRUE;
+ else
+ ret = B_FALSE;
+ mutex_exit(&vqp->vdq_lock);
+
+ return (ret);
+}
+
+/*
+ * Get a network uint16_t from the message and translate it into something the
+ * host understands.
+ */
+static int
+vnd_mbc_getu16(mblk_t *mp, off_t off, uint16_t *out)
+{
+ size_t mpsize;
+ uint8_t *bp;
+
+ mpsize = msgsize(mp);
+ /* Check for overflow */
+ if (off + sizeof (uint16_t) > mpsize)
+ return (1);
+
+ mpsize = MBLKL(mp);
+ while (off >= mpsize) {
+ mp = mp->b_cont;
+ off -= mpsize;
+ mpsize = MBLKL(mp);
+ }
+
+ /*
+ * Data is in network order. Note the second byte of data might be in
+ * the next mp.
+ */
+ bp = mp->b_rptr + off;
+ *out = *bp << 8;
+ if (off + 1 == mpsize) {
+ mp = mp->b_cont;
+ bp = mp->b_rptr;
+ } else {
+ bp++;
+ }
+
+ *out |= *bp;
+ return (0);
+}
+
+/*
+ * Given an mblk chain find the mblk and address of a particular offset.
+ */
+static int
+vnd_mbc_getoffset(mblk_t *mp, off_t off, mblk_t **mpp, uintptr_t *offp)
+{
+ size_t mpsize;
+
+ if (off >= msgsize(mp))
+ return (1);
+
+ mpsize = MBLKL(mp);
+ while (off >= mpsize) {
+ mp = mp->b_cont;
+ off -= mpsize;
+ mpsize = MBLKL(mp);
+ }
+ *mpp = mp;
+ *offp = (uintptr_t)mp->b_rptr + off;
+
+ return (0);
+}
+
+/*
+ * Fetch the destination mac address. Set *dstp to that mac address. If the data
+ * is not contiguous in the first mblk_t, fill in datap and set *dstp to it.
+ */
+static int
+vnd_mbc_getdstmac(mblk_t *mp, uint8_t **dstpp, uint8_t *datap)
+{
+ int i;
+
+ if (MBLKL(mp) >= ETHERADDRL) {
+ *dstpp = mp->b_rptr;
+ return (0);
+ }
+
+ *dstpp = datap;
+ for (i = 0; i < ETHERADDRL; i += 2, datap += 2) {
+ if (vnd_mbc_getu16(mp, i, (uint16_t *)datap) != 0)
+ return (1);
+ }
+
+ return (0);
+}
+
+static int
+vnd_hook(vnd_str_t *vsp, mblk_t **mpp, net_handle_t netiv4, hook_event_t hev4,
+ hook_event_token_t hetv4, net_handle_t netiv6, hook_event_t hev6,
+ hook_event_token_t hetv6, vnd_dropper_f hdrop, vnd_dropper_f ddrop)
+{
+ uint16_t etype;
+ hook_pkt_event_t info;
+ size_t offset, mblen;
+ uint8_t *dstp;
+ uint8_t dstaddr[6];
+ hook_event_t he;
+ hook_event_token_t het;
+ net_handle_t neti;
+
+ /*
+ * Before we can ask if we're interested we have to do enough work to
+ * determine the ethertype.
+ */
+
+ /* Byte 12 is either the VLAN tag or the ethertype */
+ if (vnd_mbc_getu16(*mpp, 12, &etype) != 0) {
+ ddrop(vsp, *mpp, "packet has incomplete ethernet header");
+ *mpp = NULL;
+ return (1);
+ }
+
+ if (etype == ETHERTYPE_VLAN) {
+ /* Actual ethertype is another four bytes in */
+ if (vnd_mbc_getu16(*mpp, 16, &etype) != 0) {
+ ddrop(vsp, *mpp,
+ "packet has incomplete ethernet vlan header");
+ *mpp = NULL;
+ return (1);
+ }
+ offset = sizeof (struct ether_vlan_header);
+ } else {
+ offset = sizeof (struct ether_header);
+ }
+
+ /*
+ * At the moment we only hook on the kinds of things that the IP module
+ * would normally.
+ */
+ if (etype != ETHERTYPE_IP && etype != ETHERTYPE_IPV6)
+ return (0);
+
+ if (etype == ETHERTYPE_IP) {
+ neti = netiv4;
+ he = hev4;
+ het = hetv4;
+ } else {
+ neti = netiv6;
+ he = hev6;
+ het = hetv6;
+ }
+
+ if (!he.he_interested)
+ return (0);
+
+
+ if (vnd_mbc_getdstmac(*mpp, &dstp, dstaddr) != 0) {
+ ddrop(vsp, *mpp, "packet has incomplete ethernet header");
+ *mpp = NULL;
+ return (1);
+ }
+
+ /*
+ * Now that we know we're interested, we have to do some additional
+ * sanity checking for IPF's sake, ala ip_check_length(). Specifically
+ * we need to check to make sure that the remaining packet size,
+ * excluding MAC, is at least the size of an IP header.
+ */
+ mblen = msgsize(*mpp);
+ if ((etype == ETHERTYPE_IP &&
+ mblen - offset < IP_SIMPLE_HDR_LENGTH) ||
+ (etype == ETHERTYPE_IPV6 && mblen - offset < IPV6_HDR_LEN)) {
+ ddrop(vsp, *mpp, "packet has invalid IP header");
+ *mpp = NULL;
+ return (1);
+ }
+
+ info.hpe_protocol = neti;
+ info.hpe_ifp = (phy_if_t)vsp;
+ info.hpe_ofp = (phy_if_t)vsp;
+ info.hpe_mp = mpp;
+ info.hpe_flags = 0;
+
+ if (bcmp(vnd_bcast_addr, dstp, ETHERADDRL) == 0)
+ info.hpe_flags |= HPE_BROADCAST;
+ else if (etype == ETHERTYPE_IP &&
+ bcmp(vnd_ipv4_mcast, vnd_bcast_addr, IPV4_MCAST_LEN) == 0)
+ info.hpe_flags |= HPE_MULTICAST;
+ else if (etype == ETHERTYPE_IPV6 &&
+ bcmp(vnd_ipv6_mcast, vnd_bcast_addr, IPV6_MCAST_LEN) == 0)
+ info.hpe_flags |= HPE_MULTICAST;
+
+ if (vnd_mbc_getoffset(*mpp, offset, &info.hpe_mb,
+ (uintptr_t *)&info.hpe_hdr) != 0) {
+ ddrop(vsp, *mpp, "packet too small -- "
+ "unable to find payload");
+ *mpp = NULL;
+ return (1);
+ }
+
+ if (hook_run(neti->netd_hooks, het, (hook_data_t)&info) != 0) {
+ hdrop(vsp, *mpp, "drooped by hooks");
+ return (1);
+ }
+
+ return (0);
+}
+
+/*
+ * This should not be used for DL_INFO_REQ.
+ */
+static mblk_t *
+vnd_dlpi_alloc(size_t len, t_uscalar_t prim)
+{
+ mblk_t *mp;
+ mp = allocb(len, BPRI_MED);
+ if (mp == NULL)
+ return (NULL);
+
+ mp->b_datap->db_type = M_PROTO;
+ mp->b_wptr = mp->b_rptr + len;
+ bzero(mp->b_rptr, len);
+ ((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim;
+
+ return (mp);
+}
+
+static void
+vnd_dlpi_inc_push(vnd_str_t *vsp, mblk_t *mp)
+{
+ mblk_t **mpp;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ ASSERT(mp->b_next == NULL);
+ mpp = &vsp->vns_dlpi_inc;
+ while (*mpp != NULL)
+ mpp = &((*mpp)->b_next);
+ *mpp = mp;
+}
+
+static mblk_t *
+vnd_dlpi_inc_pop(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = vsp->vns_dlpi_inc;
+ if (mp != NULL) {
+ VERIFY(mp->b_next == NULL || mp->b_next != mp);
+ vsp->vns_dlpi_inc = mp->b_next;
+ mp->b_next = NULL;
+ }
+ return (mp);
+}
+
+static int
+vnd_st_sinfo(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+ dl_info_req_t *dlir;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)),
+ BPRI_HI);
+ if (mp == NULL) {
+ vsp->vns_errno = VND_E_NOMEM;
+ return (1);
+ }
+ vsp->vns_state = VNS_S_INFO_SENT;
+ cv_broadcast(&vsp->vns_stcv);
+
+ mp->b_datap->db_type = M_PCPROTO;
+ dlir = (dl_info_req_t *)mp->b_rptr;
+ mp->b_wptr = (uchar_t *)&dlir[1];
+ dlir->dl_primitive = DL_INFO_REQ;
+ putnext(vsp->vns_wq, mp);
+
+ return (0);
+}
+
+static int
+vnd_st_info(vnd_str_t *vsp)
+{
+ dl_info_ack_t *dlia;
+ mblk_t *mp;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = vnd_dlpi_inc_pop(vsp);
+ dlia = (dl_info_ack_t *)mp->b_rptr;
+ vsp->vns_dlpi_style = dlia->dl_provider_style;
+ vsp->vns_minwrite = dlia->dl_min_sdu;
+ vsp->vns_maxwrite = dlia->dl_max_sdu;
+
+ /*
+ * At this time we only support DL_ETHER devices.
+ */
+ if (dlia->dl_mac_type != DL_ETHER) {
+ freemsg(mp);
+ vsp->vns_errno = VND_E_NOTETHER;
+ return (1);
+ }
+
+ /*
+ * Because vnd operates on entire packets, we need to manually account
+ * for the ethernet header information. We add the size of the
+ * ether_vlan_header to account for this, regardless if it is using
+ * vlans or not.
+ */
+ vsp->vns_maxwrite += sizeof (struct ether_vlan_header);
+
+ freemsg(mp);
+ return (0);
+}
+
+static int
+vnd_st_sexclusive(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = vnd_dlpi_alloc(sizeof (dl_attach_req_t), DL_EXCLUSIVE_REQ);
+ if (mp == NULL) {
+ vsp->vns_errno = VND_E_NOMEM;
+ return (1);
+ }
+
+ vsp->vns_state = VNS_S_EXCLUSIVE_SENT;
+ cv_broadcast(&vsp->vns_stcv);
+ putnext(vsp->vns_wq, mp);
+ return (0);
+}
+
+static int
+vnd_st_exclusive(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+ t_uscalar_t prim, cprim;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = vnd_dlpi_inc_pop(vsp);
+ prim = ((dl_error_ack_t *)mp->b_rptr)->dl_primitive;
+ cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive;
+
+ if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) {
+ vnd_drop_ctl(vsp, mp,
+ "wrong dlpi primitive for vnd_st_exclusive");
+ vsp->vns_errno = VND_E_DLPIINVAL;
+ return (1);
+ }
+
+ if (cprim != DL_EXCLUSIVE_REQ) {
+ vnd_drop_ctl(vsp, mp,
+ "vnd_st_exclusive: got ack/nack for wrong primitive");
+ vsp->vns_errno = VND_E_DLPIINVAL;
+ return (1);
+ }
+
+ if (prim == DL_ERROR_ACK)
+ vsp->vns_errno = VND_E_DLEXCL;
+
+ freemsg(mp);
+ return (prim == DL_ERROR_ACK);
+}
+
+/*
+ * Send down a DLPI_ATTACH_REQ.
+ */
+static int
+vnd_st_sattach(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = vnd_dlpi_alloc(sizeof (dl_attach_req_t), DL_ATTACH_REQ);
+ if (mp == NULL) {
+ vsp->vns_errno = VND_E_NOMEM;
+ return (1);
+ }
+
+ ((dl_attach_req_t *)mp->b_rptr)->dl_ppa = 0;
+ vsp->vns_state = VNS_S_ATTACH_SENT;
+ cv_broadcast(&vsp->vns_stcv);
+ putnext(vsp->vns_wq, mp);
+
+ return (0);
+}
+
+static int
+vnd_st_attach(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+ t_uscalar_t prim, cprim;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = vnd_dlpi_inc_pop(vsp);
+ prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive;
+ cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive;
+
+
+ if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) {
+ vnd_drop_ctl(vsp, mp, "vnd_st_attach: unknown primitive type");
+ vsp->vns_errno = VND_E_DLPIINVAL;
+ return (1);
+ }
+
+ if (cprim != DL_ATTACH_REQ) {
+ vnd_drop_ctl(vsp, mp,
+ "vnd_st_attach: Got ack/nack for wrong primitive");
+ vsp->vns_errno = VND_E_DLPIINVAL;
+ return (1);
+ }
+
+ if (prim == DL_ERROR_ACK)
+ vsp->vns_errno = VND_E_ATTACHFAIL;
+
+ freemsg(mp);
+ return (prim == DL_ERROR_ACK);
+}
+
+static int
+vnd_st_sbind(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+ dl_bind_req_t *dbrp;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = vnd_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long),
+ DL_BIND_REQ);
+ if (mp == NULL) {
+ vsp->vns_errno = VND_E_NOMEM;
+ return (1);
+ }
+ dbrp = (dl_bind_req_t *)(mp->b_rptr);
+ dbrp->dl_sap = 0;
+ dbrp->dl_service_mode = DL_CLDLS;
+
+ vsp->vns_state = VNS_S_BIND_SENT;
+ cv_broadcast(&vsp->vns_stcv);
+ putnext(vsp->vns_wq, mp);
+
+ return (0);
+}
+
+static int
+vnd_st_bind(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+ t_uscalar_t prim;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = vnd_dlpi_inc_pop(vsp);
+ prim = ((dl_error_ack_t *)mp->b_rptr)->dl_primitive;
+
+ if (prim != DL_BIND_ACK && prim != DL_ERROR_ACK) {
+ vnd_drop_ctl(vsp, mp, "wrong dlpi primitive for vnd_st_bind");
+ vsp->vns_errno = VND_E_DLPIINVAL;
+ return (1);
+ }
+
+ if (prim == DL_ERROR_ACK)
+ vsp->vns_errno = VND_E_BINDFAIL;
+
+ freemsg(mp);
+ return (prim == DL_ERROR_ACK);
+}
+
+static int
+vnd_st_spromisc(vnd_str_t *vsp, int type, vnd_str_state_t next)
+{
+ mblk_t *mp;
+ dl_promiscon_req_t *dprp;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = vnd_dlpi_alloc(sizeof (dl_promiscon_req_t), DL_PROMISCON_REQ);
+ if (mp == NULL) {
+ vsp->vns_errno = VND_E_NOMEM;
+ return (1);
+ }
+
+ dprp = (dl_promiscon_req_t *)mp->b_rptr;
+ dprp->dl_level = type;
+
+ vsp->vns_state = next;
+ cv_broadcast(&vsp->vns_stcv);
+ putnext(vsp->vns_wq, mp);
+
+ return (0);
+}
+
+static int
+vnd_st_promisc(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+ t_uscalar_t prim, cprim;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = vnd_dlpi_inc_pop(vsp);
+ prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive;
+ cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive;
+
+ if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) {
+ vnd_drop_ctl(vsp, mp,
+ "wrong dlpi primitive for vnd_st_promisc");
+ vsp->vns_errno = VND_E_DLPIINVAL;
+ return (1);
+ }
+
+ if (cprim != DL_PROMISCON_REQ) {
+ vnd_drop_ctl(vsp, mp,
+ "vnd_st_promisc: Got ack/nack for wrong primitive");
+ vsp->vns_errno = VND_E_DLPIINVAL;
+ return (1);
+ }
+
+ if (prim == DL_ERROR_ACK)
+ vsp->vns_errno = VND_E_PROMISCFAIL;
+
+ freemsg(mp);
+ return (prim == DL_ERROR_ACK);
+}
+
+static int
+vnd_st_scapabq(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+
+ mp = vnd_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ);
+ if (mp == NULL) {
+ vsp->vns_errno = VND_E_NOMEM;
+ return (1);
+ }
+
+ vsp->vns_state = VNS_S_CAPAB_Q_SENT;
+ cv_broadcast(&vsp->vns_stcv);
+ putnext(vsp->vns_wq, mp);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+vnd_mac_input(vnd_str_t *vsp, mac_resource_t *unused, mblk_t *mp_chain,
+ mac_header_info_t *mhip)
+{
+ int signal = 0;
+ mblk_t *mp;
+ vnd_pnsd_t *nsp = vsp->vns_nsd;
+
+ ASSERT(vsp != NULL);
+ ASSERT(mp_chain != NULL);
+
+ for (mp = mp_chain; mp != NULL; mp = mp_chain) {
+ uint16_t vid;
+ mp_chain = mp->b_next;
+ mp->b_next = NULL;
+
+ /*
+ * If we were operating in a traditional dlpi context then we
+ * would have enabled DLIOCRAW and rather than the fast path, we
+ * would come through dld_str_rx_raw. That function does two
+ * things that we have to consider doing ourselves. The first is
+ * that it adjusts the b_rptr back to account for dld bumping us
+ * past the mac header. It also tries to account for cases where
+ * mac provides an illusion of the mac header. Fortunately, dld
+ * only allows the fastpath when the media type is the same as
+ * the native type. Therefore all we have to do here is adjust
+ * the b_rptr.
+ */
+ ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
+ mp->b_rptr -= mhip->mhi_hdrsize;
+ vid = VLAN_ID(mhip->mhi_tci);
+ if (mhip->mhi_istagged && vid != VLAN_ID_NONE) {
+ /*
+ * This is an overlapping copy. Do not use bcopy(9F).
+ */
+ (void) memmove(mp->b_rptr + 4, mp->b_rptr, 12);
+ mp->b_rptr += 4;
+ }
+
+ if (nsp->vpnd_hooked && vnd_hook(vsp, &mp, nsp->vpnd_neti_v4,
+ nsp->vpnd_event_in_v4, nsp->vpnd_token_in_v4,
+ nsp->vpnd_neti_v6, nsp->vpnd_event_in_v6,
+ nsp->vpnd_token_in_v6, vnd_drop_hook_in, vnd_drop_in) != 0)
+ continue;
+
+ VND_STAT_INC(vsp, vks_rpackets, 1);
+ VND_STAT_INC(vsp, vks_rbytes, msgsize(mp));
+ DTRACE_VND5(recv, mblk_t *, mp, void *, NULL, void *, NULL,
+ vnd_str_t *, vsp, mblk_t *, mp);
+ mutex_enter(&vsp->vns_dq_read.vdq_lock);
+ signal |= vnd_dq_push(&vsp->vns_dq_read, mp, B_FALSE,
+ vnd_drop_in);
+ mutex_exit(&vsp->vns_dq_read.vdq_lock);
+ }
+
+ if (signal != 0) {
+ cv_broadcast(&vsp->vns_dq_read.vdq_ready);
+ pollwakeup(&vsp->vns_dev->vdd_ph, POLLIN | POLLRDNORM);
+ }
+
+}
+
+static void
+vnd_mac_flow_control_stat(vnd_str_t *vsp, hrtime_t diff)
+{
+ VND_STAT_INC(vsp, vks_nmacflow, 1);
+ VND_STAT_INC(vsp, vks_tmacflow, diff);
+ if (diff >= VND_LATENCY_1MS)
+ VND_STAT_INC(vsp, vks_mac_flow_1ms, 1);
+ if (diff >= VND_LATENCY_10MS)
+ VND_STAT_INC(vsp, vks_mac_flow_10ms, 1);
+ if (diff >= VND_LATENCY_100MS)
+ VND_STAT_INC(vsp, vks_mac_flow_100ms, 1);
+ if (diff >= VND_LATENCY_1S)
+ VND_STAT_INC(vsp, vks_mac_flow_1s, 1);
+ if (diff >= VND_LATENCY_10S)
+ VND_STAT_INC(vsp, vks_mac_flow_10s, 1);
+}
+
+/*
+ * This is a callback from MAC that indicates that we are allowed to send
+ * packets again.
+ */
+static void
+vnd_mac_flow_control(void *arg, vnd_mac_cookie_t cookie)
+{
+ vnd_str_t *vsp = arg;
+ hrtime_t now;
+
+ mutex_enter(&vsp->vns_lock);
+ now = gethrtime();
+
+ /*
+ * Check for the case that we beat vnd_squeue_tx_one to the punch.
+ * There's also an additional case here that we got notified because
+ * we're sharing a device that ran out of tx descriptors, even though it
+ * wasn't because of us.
+ */
+ if (!(vsp->vns_flags & VNS_F_FLOW_CONTROLLED)) {
+ vsp->vns_fcupdate = now;
+ mutex_exit(&vsp->vns_lock);
+ return;
+ }
+
+ ASSERT(vsp->vns_flags & VNS_F_FLOW_CONTROLLED);
+ ASSERT(vsp->vns_caps.vsc_fc_cookie == cookie);
+ vsp->vns_flags &= ~VNS_F_FLOW_CONTROLLED;
+ vsp->vns_caps.vsc_fc_cookie = (vnd_mac_cookie_t)NULL;
+ vsp->vns_fclatch = 0;
+ DTRACE_VND3(flow__resumed, vnd_str_t *, vsp, uint64_t,
+ vsp->vns_dq_write.vdq_cur, uintptr_t, cookie);
+ /*
+ * If someone has asked to flush the squeue and thus inserted a barrier,
+ * than we shouldn't schedule a drain.
+ */
+ if (!(vsp->vns_flags & (VNS_F_DRAIN_SCHEDULED | VNS_F_BARRIER))) {
+ vsp->vns_flags |= VNS_F_DRAIN_SCHEDULED;
+ gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_drainblk,
+ vnd_squeue_tx_drain, vsp, GSQUEUE_FILL,
+ VND_SQUEUE_TAG_MAC_FLOW_CONTROL);
+ }
+ mutex_exit(&vsp->vns_lock);
+}
+
+static void
+vnd_mac_enter(vnd_str_t *vsp, mac_perim_handle_t *mphp)
+{
+ ASSERT(MUTEX_HELD(&vsp->vns_lock));
+ VERIFY(vsp->vns_caps.vsc_capab_f(vsp->vns_caps.vsc_capab_hdl,
+ DLD_CAPAB_PERIM, mphp, DLD_ENABLE) == 0);
+}
+
+static void
+vnd_mac_exit(vnd_str_t *vsp, mac_perim_handle_t mph)
+{
+ ASSERT(MUTEX_HELD(&vsp->vns_lock));
+ VERIFY(vsp->vns_caps.vsc_capab_f(vsp->vns_caps.vsc_capab_hdl,
+ DLD_CAPAB_PERIM, mph, DLD_DISABLE) == 0);
+}
+
+static int
+vnd_dld_cap_enable(vnd_str_t *vsp, vnd_rx_t rxfunc)
+{
+ int ret;
+ dld_capab_direct_t d;
+ mac_perim_handle_t mph;
+ vnd_str_capab_t *c = &vsp->vns_caps;
+
+ bzero(&d, sizeof (d));
+ d.di_rx_cf = (uintptr_t)rxfunc;
+ d.di_rx_ch = vsp;
+ d.di_flags = DI_DIRECT_RAW;
+
+ vnd_mac_enter(vsp, &mph);
+
+ /*
+ * If we're coming in here for a second pass, we need to make sure that
+ * we remove an existing flow control notification callback, otherwise
+ * we'll create a duplicate that will remain with garbage data.
+ */
+ if (c->vsc_tx_fc_hdl != NULL) {
+ ASSERT(c->vsc_set_fcb_hdl != NULL);
+ (void) c->vsc_set_fcb_f(c->vsc_set_fcb_hdl, NULL,
+ c->vsc_tx_fc_hdl);
+ c->vsc_tx_fc_hdl = NULL;
+ }
+
+ if (vsp->vns_caps.vsc_capab_f(c->vsc_capab_hdl,
+ DLD_CAPAB_DIRECT, &d, DLD_ENABLE) == 0) {
+ c->vsc_tx_f = (vnd_dld_tx_t)d.di_tx_df;
+ c->vsc_tx_hdl = d.di_tx_dh;
+ c->vsc_set_fcb_f = (vnd_dld_set_fcb_t)d.di_tx_cb_df;
+ c->vsc_set_fcb_hdl = d.di_tx_cb_dh;
+ c->vsc_is_fc_f = (vnd_dld_is_fc_t)d.di_tx_fctl_df;
+ c->vsc_is_fc_hdl = d.di_tx_fctl_dh;
+ c->vsc_tx_fc_hdl = c->vsc_set_fcb_f(c->vsc_set_fcb_hdl,
+ vnd_mac_flow_control, vsp);
+ c->vsc_flags |= VNS_C_DIRECT;
+ ret = 0;
+ } else {
+ vsp->vns_errno = VND_E_DIRECTFAIL;
+ ret = 1;
+ }
+ vnd_mac_exit(vsp, mph);
+ return (ret);
+}
+
+static int
+vnd_st_capabq(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+ dl_capability_ack_t *cap;
+ dl_capability_sub_t *subp;
+ dl_capab_hcksum_t *hck;
+ dl_capab_dld_t *dld;
+ unsigned char *rp;
+ int ret = 0;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = vnd_dlpi_inc_pop(vsp);
+
+ rp = mp->b_rptr;
+ cap = (dl_capability_ack_t *)rp;
+ if (cap->dl_sub_length == 0)
+ goto done;
+
+ /* Don't try to process something too big */
+ if (sizeof (dl_capability_ack_t) + cap->dl_sub_length > MBLKL(mp)) {
+ VND_STAT_INC(vsp, vks_ndlpidrops, 1);
+ VND_STAT_INC(vsp, vks_tdrops, 1);
+ vsp->vns_errno = VND_E_CAPACKINVAL;
+ ret = 1;
+ goto done;
+ }
+
+ rp += cap->dl_sub_offset;
+
+ while (cap->dl_sub_length > 0) {
+ subp = (dl_capability_sub_t *)rp;
+ /* Sanity check something crazy from down below */
+ if (subp->dl_length + sizeof (dl_capability_sub_t) >
+ cap->dl_sub_length) {
+ VND_STAT_INC(vsp, vks_ndlpidrops, 1);
+ VND_STAT_INC(vsp, vks_tdrops, 1);
+ vsp->vns_errno = VND_E_SUBCAPINVAL;
+ ret = 1;
+ goto done;
+ }
+
+ switch (subp->dl_cap) {
+ case DL_CAPAB_HCKSUM:
+ hck = (dl_capab_hcksum_t *)(rp +
+ sizeof (dl_capability_sub_t));
+ if (hck->hcksum_version != HCKSUM_CURRENT_VERSION) {
+ vsp->vns_caps.vsc_flags |= VNS_C_HCKSUM_BADVERS;
+ break;
+ }
+ if (dlcapabcheckqid(&hck->hcksum_mid, vsp->vns_lrq) !=
+ B_TRUE) {
+ vsp->vns_errno = VND_E_CAPABPASS;
+ ret = 1;
+ goto done;
+ }
+ vsp->vns_caps.vsc_flags |= VNS_C_HCKSUM;
+ vsp->vns_caps.vsc_hcksum_opts = hck->hcksum_txflags;
+ break;
+ case DL_CAPAB_DLD:
+ dld = (dl_capab_dld_t *)(rp +
+ sizeof (dl_capability_sub_t));
+ if (dld->dld_version != DLD_CURRENT_VERSION) {
+ vsp->vns_errno = VND_E_DLDBADVERS;
+ ret = 1;
+ goto done;
+ }
+ if (dlcapabcheckqid(&dld->dld_mid, vsp->vns_lrq) !=
+ B_TRUE) {
+ vsp->vns_errno = VND_E_CAPABPASS;
+ ret = 1;
+ goto done;
+ }
+ vsp->vns_caps.vsc_flags |= VNS_C_DLD;
+ vsp->vns_caps.vsc_capab_f =
+ (vnd_dld_cap_t)dld->dld_capab;
+ vsp->vns_caps.vsc_capab_hdl =
+ (void *)dld->dld_capab_handle;
+ /*
+ * At this point in time, we have to set up a direct
+ * function that drops all input. This validates that
+ * we'll be able to set up direct input and that we can
+ * easily switch it earlier to the real data function
+ * when we've plumbed everything up.
+ */
+ if (vnd_dld_cap_enable(vsp, vnd_mac_drop_input) != 0) {
+ /* vns_errno set by vnd_dld_cap_enable */
+ ret = 1;
+ goto done;
+ }
+ break;
+ default:
+ /* Ignore unsupported cap */
+ break;
+ }
+
+ rp += sizeof (dl_capability_sub_t) + subp->dl_length;
+ cap->dl_sub_length -= sizeof (dl_capability_sub_t) +
+ subp->dl_length;
+ }
+
+done:
+ /* Make sure we enabled direct callbacks */
+ if (ret == 0 && !(vsp->vns_caps.vsc_flags & VNS_C_DIRECT)) {
+ vsp->vns_errno = VND_E_DIRECTNOTSUP;
+ ret = 1;
+ }
+
+ freemsg(mp);
+ return (ret);
+}
+
+static void
+vnd_st_sonline(vnd_str_t *vsp)
+{
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ vsp->vns_state = VNS_S_ONLINE;
+ cv_broadcast(&vsp->vns_stcv);
+}
+
+static void
+vnd_st_shutdown(vnd_str_t *vsp)
+{
+ mac_perim_handle_t mph;
+ vnd_str_capab_t *vsc = &vsp->vns_caps;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+
+ /*
+ * At this point in time we know that there is no one transmitting as
+ * our final reference has been torn down and that vnd_s_close inserted
+ * a barrier to validate that everything is flushed.
+ */
+ if (vsc->vsc_flags & VNS_C_DIRECT) {
+ vnd_mac_enter(vsp, &mph);
+ vsc->vsc_flags &= ~VNS_C_DIRECT;
+ (void) vsc->vsc_set_fcb_f(vsc->vsc_set_fcb_hdl, NULL,
+ vsc->vsc_tx_fc_hdl);
+ vsc->vsc_tx_fc_hdl = NULL;
+ (void) vsc->vsc_capab_f(vsc->vsc_capab_hdl, DLD_CAPAB_DIRECT,
+ NULL, DLD_DISABLE);
+ vnd_mac_exit(vsp, mph);
+ }
+}
+
+static boolean_t
+vnd_st_spromiscoff(vnd_str_t *vsp, int type, vnd_str_state_t next)
+{
+ boolean_t ret = B_TRUE;
+ mblk_t *mp;
+ dl_promiscoff_req_t *dprp;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+ mp = vnd_dlpi_alloc(sizeof (dl_promiscon_req_t), DL_PROMISCOFF_REQ);
+ if (mp == NULL) {
+ cmn_err(CE_NOTE, "!vnd failed to allocate mblk_t for "
+ "promiscoff request");
+ ret = B_FALSE;
+ goto next;
+ }
+
+ dprp = (dl_promiscoff_req_t *)mp->b_rptr;
+ dprp->dl_level = type;
+
+ putnext(vsp->vns_wq, mp);
+next:
+ vsp->vns_state = next;
+ cv_broadcast(&vsp->vns_stcv);
+ return (ret);
+}
+
+static void
+vnd_st_promiscoff(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+ t_uscalar_t prim, cprim;
+
+ VERIFY(MUTEX_HELD(&vsp->vns_lock));
+
+ /*
+ * Unlike other cases where we guard against the incoming packet being
+ * NULL, during tear down we try to keep driving and therefore we may
+ * have gotten here due to an earlier failure, so there's nothing to do.
+ */
+ mp = vnd_dlpi_inc_pop(vsp);
+ if (mp == NULL)
+ return;
+
+ prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive;
+ cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive;
+
+ if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) {
+ vnd_drop_ctl(vsp, mp,
+ "wrong dlpi primitive for vnd_st_promiscoff");
+ return;
+ }
+
+ if (cprim != DL_PROMISCOFF_REQ) {
+ vnd_drop_ctl(vsp, mp,
+ "vnd_st_promiscoff: Got ack/nack for wrong primitive");
+ return;
+ }
+
+ if (prim == DL_ERROR_ACK) {
+ cmn_err(CE_WARN, "!failed to disable promiscuos mode during "
+ "vnd teardown");
+ }
+}
+
+static boolean_t
+vnd_st_sunbind(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+ boolean_t ret = B_TRUE;
+
+ mp = vnd_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ);
+ if (mp == NULL) {
+ cmn_err(CE_NOTE, "!vnd failed to allocate mblk_t for "
+ "unbind request");
+ ret = B_FALSE;
+ goto next;
+ }
+
+ putnext(vsp->vns_wq, mp);
+next:
+ vsp->vns_state = VNS_S_UNBIND_SENT;
+ cv_broadcast(&vsp->vns_stcv);
+ return (ret);
+}
+
+static void
+vnd_st_unbind(vnd_str_t *vsp)
+{
+ mblk_t *mp;
+ t_uscalar_t prim, cprim;
+
+ /*
+ * Unlike other cases where we guard against the incoming packet being
+ * NULL, during tear down we try to keep driving and therefore we may
+ * have gotten here due to an earlier failure, so there's nothing to do.
+ */
+ mp = vnd_dlpi_inc_pop(vsp);
+ if (mp == NULL)
+ goto next;
+
+ prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive;
+ cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive;
+
+ if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) {
+ vnd_drop_ctl(vsp, mp,
+ "wrong dlpi primitive for vnd_st_unbind");
+ goto next;
+ }
+
+ if (cprim != DL_UNBIND_REQ) {
+ vnd_drop_ctl(vsp, mp,
+ "vnd_st_unbind: Got ack/nack for wrong primitive");
+ goto next;
+ }
+
+ if (prim == DL_ERROR_ACK) {
+ cmn_err(CE_WARN, "!failed to unbind stream during vnd "
+ "teardown");
+ }
+
+next:
+ vsp->vns_state = VNS_S_ZOMBIE;
+ cv_broadcast(&vsp->vns_stcv);
+}
+
+/*
+ * Perform state transitions. This is a one way shot down the flow chart
+ * described in the big theory statement.
+ */
+static void
+vnd_str_state_transition(void *arg)
+{
+ boolean_t died = B_FALSE;
+ vnd_str_t *vsp = arg;
+ mblk_t *mp;
+
+ mutex_enter(&vsp->vns_lock);
+ if (vsp->vns_dlpi_inc == NULL && (vsp->vns_state != VNS_S_INITIAL &&
+ vsp->vns_state != VNS_S_SHUTTING_DOWN)) {
+ mutex_exit(&vsp->vns_lock);
+ return;
+ }
+
+ /*
+ * When trying to shut down, or unwinding from a failed enabling, rather
+ * than immediately entering the ZOMBIE state, we may instead opt to try
+ * and enter the next state in the progression. This is especially
+ * important when trying to tear everything down.
+ */
+loop:
+ DTRACE_PROBE2(vnd__state__transition, uintptr_t, vsp,
+ vnd_str_state_t, vsp->vns_state);
+ switch (vsp->vns_state) {
+ case VNS_S_INITIAL:
+ VERIFY(vsp->vns_dlpi_inc == NULL);
+ if (vnd_st_sinfo(vsp) != 0)
+ died = B_TRUE;
+ break;
+ case VNS_S_INFO_SENT:
+ VERIFY(vsp->vns_dlpi_inc != NULL);
+ if (vnd_st_info(vsp) == 0) {
+ if (vnd_st_sexclusive(vsp) != 0)
+ died = B_TRUE;
+ } else {
+ died = B_TRUE;
+ }
+ break;
+ case VNS_S_EXCLUSIVE_SENT:
+ VERIFY(vsp->vns_dlpi_inc != NULL);
+ if (vnd_st_exclusive(vsp) == 0) {
+ if (vsp->vns_dlpi_style == DL_STYLE2) {
+ if (vnd_st_sattach(vsp) != 0)
+ died = B_TRUE;
+ } else {
+ if (vnd_st_sbind(vsp) != 0)
+ died = B_TRUE;
+ }
+ } else {
+ died = B_TRUE;
+ }
+ break;
+ case VNS_S_ATTACH_SENT:
+ VERIFY(vsp->vns_dlpi_inc != NULL);
+ if (vnd_st_attach(vsp) == 0) {
+ if (vnd_st_sbind(vsp) != 0)
+ died = B_TRUE;
+ } else {
+ died = B_TRUE;
+ }
+ break;
+ case VNS_S_BIND_SENT:
+ VERIFY(vsp->vns_dlpi_inc != NULL);
+ if (vnd_st_bind(vsp) == 0) {
+ if (vnd_st_spromisc(vsp, DL_PROMISC_SAP,
+ VNS_S_SAP_PROMISC_SENT) != 0)
+ died = B_TRUE;
+ } else {
+ died = B_TRUE;
+ }
+ break;
+ case VNS_S_SAP_PROMISC_SENT:
+ VERIFY(vsp->vns_dlpi_inc != NULL);
+ if (vnd_st_promisc(vsp) == 0) {
+ if (vnd_st_spromisc(vsp, DL_PROMISC_MULTI,
+ VNS_S_MULTI_PROMISC_SENT) != 0)
+ died = B_TRUE;
+ } else {
+ died = B_TRUE;
+ }
+ break;
+ case VNS_S_MULTI_PROMISC_SENT:
+ VERIFY(vsp->vns_dlpi_inc != NULL);
+ if (vnd_st_promisc(vsp) == 0) {
+ if (vnd_st_spromisc(vsp, DL_PROMISC_RX_ONLY,
+ VNS_S_RX_ONLY_PROMISC_SENT) != 0)
+ died = B_TRUE;
+ } else {
+ died = B_TRUE;
+ }
+ break;
+ case VNS_S_RX_ONLY_PROMISC_SENT:
+ VERIFY(vsp->vns_dlpi_inc != NULL);
+ if (vnd_st_promisc(vsp) == 0) {
+ if (vnd_st_spromisc(vsp, DL_PROMISC_FIXUPS,
+ VNS_S_FIXUP_PROMISC_SENT) != 0)
+ died = B_TRUE;
+ } else {
+ died = B_TRUE;
+ }
+ break;
+ case VNS_S_FIXUP_PROMISC_SENT:
+ VERIFY(vsp->vns_dlpi_inc != NULL);
+ if (vnd_st_promisc(vsp) == 0) {
+ if (vnd_st_scapabq(vsp) != 0)
+ died = B_TRUE;
+ } else {
+ died = B_TRUE;
+ }
+ break;
+ case VNS_S_CAPAB_Q_SENT:
+ if (vnd_st_capabq(vsp) != 0)
+ died = B_TRUE;
+ else
+ vnd_st_sonline(vsp);
+ break;
+ case VNS_S_SHUTTING_DOWN:
+ vnd_st_shutdown(vsp);
+ if (vnd_st_spromiscoff(vsp, DL_PROMISC_MULTI,
+ VNS_S_MULTICAST_PROMISCOFF_SENT) == B_FALSE)
+ goto loop;
+ break;
+ case VNS_S_MULTICAST_PROMISCOFF_SENT:
+ vnd_st_promiscoff(vsp);
+ if (vnd_st_spromiscoff(vsp, DL_PROMISC_SAP,
+ VNS_S_SAP_PROMISCOFF_SENT) == B_FALSE)
+ goto loop;
+ break;
+ case VNS_S_SAP_PROMISCOFF_SENT:
+ vnd_st_promiscoff(vsp);
+ if (vnd_st_sunbind(vsp) == B_FALSE)
+ goto loop;
+ break;
+ case VNS_S_UNBIND_SENT:
+ vnd_st_unbind(vsp);
+ break;
+ case VNS_S_ZOMBIE:
+ while ((mp = vnd_dlpi_inc_pop(vsp)) != NULL)
+ vnd_drop_ctl(vsp, mp, "vsp received data as a zombie");
+ break;
+ default:
+ panic("vnd_str_t entered an unknown state");
+ }
+
+ if (died == B_TRUE) {
+ ASSERT(vsp->vns_errno != VND_E_SUCCESS);
+ vsp->vns_laststate = vsp->vns_state;
+ vsp->vns_state = VNS_S_ZOMBIE;
+ cv_broadcast(&vsp->vns_stcv);
+ }
+
+ mutex_exit(&vsp->vns_lock);
+}
+
+static void
+vnd_dlpi_taskq_dispatch(void *arg)
+{
+ vnd_str_t *vsp = arg;
+ int run = 1;
+
+ while (run != 0) {
+ vnd_str_state_transition(vsp);
+ mutex_enter(&vsp->vns_lock);
+ if (vsp->vns_flags & VNS_F_CONDEMNED ||
+ vsp->vns_dlpi_inc == NULL) {
+ run = 0;
+ vsp->vns_flags &= ~VNS_F_TASKQ_DISPATCHED;
+ }
+ if (vsp->vns_flags & VNS_F_CONDEMNED)
+ cv_signal(&vsp->vns_cancelcv);
+ mutex_exit(&vsp->vns_lock);
+ }
+}
+
+/* ARGSUSED */
+static int
+vnd_neti_getifname(net_handle_t neti, phy_if_t phy, char *buf, const size_t len)
+{
+ return (-1);
+}
+
+/* ARGSUSED */
+static int
+vnd_neti_getmtu(net_handle_t neti, phy_if_t phy, lif_if_t ifdata)
+{
+ return (-1);
+}
+
+/* ARGSUSED */
+static int
+vnd_neti_getptmue(net_handle_t neti)
+{
+ return (-1);
+}
+
+/* ARGSUSED */
+static int
+vnd_neti_getlifaddr(net_handle_t neti, phy_if_t phy, lif_if_t ifdata,
+ size_t nelem, net_ifaddr_t type[], void *storage)
+{
+ return (-1);
+}
+
+/* ARGSUSED */
+static int
+vnd_neti_getlifzone(net_handle_t neti, phy_if_t phy, lif_if_t ifdata,
+ zoneid_t *zid)
+{
+ return (-1);
+}
+
+/* ARGSUSED */
+static int
+vnd_neti_getlifflags(net_handle_t neti, phy_if_t phy, lif_if_t ifdata,
+ uint64_t *flags)
+{
+ return (-1);
+}
+
+/* ARGSUSED */
+static phy_if_t
+vnd_neti_phygetnext(net_handle_t neti, phy_if_t phy)
+{
+ return ((phy_if_t)-1);
+}
+
+/* ARGSUSED */
+static phy_if_t
+vnd_neti_phylookup(net_handle_t neti, const char *name)
+{
+ return ((phy_if_t)-1);
+}
+
+/* ARGSUSED */
+static lif_if_t
+vnd_neti_lifgetnext(net_handle_t neti, phy_if_t phy, lif_if_t ifdata)
+{
+ return (-1);
+}
+
+/* ARGSUSED */
+static int
+vnd_neti_inject(net_handle_t neti, inject_t style, net_inject_t *packet)
+{
+ return (-1);
+}
+
+/* ARGSUSED */
+static phy_if_t
+vnd_neti_route(net_handle_t neti, struct sockaddr *address,
+ struct sockaddr *next)
+{
+ return ((phy_if_t)-1);
+}
+
+/* ARGSUSED */
+static int
+vnd_neti_ispchksum(net_handle_t neti, mblk_t *mp)
+{
+ return (-1);
+}
+
+/* ARGSUSED */
+static int
+vnd_neti_isvchksum(net_handle_t neti, mblk_t *mp)
+{
+ return (-1);
+}
+
+static net_protocol_t vnd_neti_info_v4 = {
+ NETINFO_VERSION,
+ NHF_VND_INET,
+ vnd_neti_getifname,
+ vnd_neti_getmtu,
+ vnd_neti_getptmue,
+ vnd_neti_getlifaddr,
+ vnd_neti_getlifzone,
+ vnd_neti_getlifflags,
+ vnd_neti_phygetnext,
+ vnd_neti_phylookup,
+ vnd_neti_lifgetnext,
+ vnd_neti_inject,
+ vnd_neti_route,
+ vnd_neti_ispchksum,
+ vnd_neti_isvchksum
+};
+
+static net_protocol_t vnd_neti_info_v6 = {
+ NETINFO_VERSION,
+ NHF_VND_INET6,
+ vnd_neti_getifname,
+ vnd_neti_getmtu,
+ vnd_neti_getptmue,
+ vnd_neti_getlifaddr,
+ vnd_neti_getlifzone,
+ vnd_neti_getlifflags,
+ vnd_neti_phygetnext,
+ vnd_neti_phylookup,
+ vnd_neti_lifgetnext,
+ vnd_neti_inject,
+ vnd_neti_route,
+ vnd_neti_ispchksum,
+ vnd_neti_isvchksum
+};
+
+
+static int
+vnd_netinfo_init(vnd_pnsd_t *nsp)
+{
+ nsp->vpnd_neti_v4 = net_protocol_register(nsp->vpnd_nsid,
+ &vnd_neti_info_v4);
+ ASSERT(nsp->vpnd_neti_v4 != NULL);
+
+ nsp->vpnd_neti_v6 = net_protocol_register(nsp->vpnd_nsid,
+ &vnd_neti_info_v6);
+ ASSERT(nsp->vpnd_neti_v6 != NULL);
+
+ nsp->vpnd_family_v4.hf_version = HOOK_VERSION;
+ nsp->vpnd_family_v4.hf_name = "vnd_inet";
+
+ if (net_family_register(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4) != 0) {
+ (void) net_protocol_unregister(nsp->vpnd_neti_v4);
+ (void) net_protocol_unregister(nsp->vpnd_neti_v6);
+ cmn_err(CE_NOTE, "vnd_netinfo_init: net_family_register "
+ "failed for stack %d", nsp->vpnd_nsid);
+ return (1);
+ }
+
+ nsp->vpnd_family_v6.hf_version = HOOK_VERSION;
+ nsp->vpnd_family_v6.hf_name = "vnd_inet6";
+
+ if (net_family_register(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6) != 0) {
+ (void) net_family_unregister(nsp->vpnd_neti_v4,
+ &nsp->vpnd_family_v4);
+ (void) net_protocol_unregister(nsp->vpnd_neti_v4);
+ (void) net_protocol_unregister(nsp->vpnd_neti_v6);
+ cmn_err(CE_NOTE, "vnd_netinfo_init: net_family_register "
+ "failed for stack %d", nsp->vpnd_nsid);
+ return (1);
+ }
+
+ nsp->vpnd_event_in_v4.he_version = HOOK_VERSION;
+ nsp->vpnd_event_in_v4.he_name = NH_PHYSICAL_IN;
+ nsp->vpnd_event_in_v4.he_flags = 0;
+ nsp->vpnd_event_in_v4.he_interested = B_FALSE;
+
+ nsp->vpnd_token_in_v4 = net_event_register(nsp->vpnd_neti_v4,
+ &nsp->vpnd_event_in_v4);
+ if (nsp->vpnd_token_in_v4 == NULL) {
+ (void) net_family_unregister(nsp->vpnd_neti_v4,
+ &nsp->vpnd_family_v4);
+ (void) net_family_unregister(nsp->vpnd_neti_v6,
+ &nsp->vpnd_family_v6);
+ (void) net_protocol_unregister(nsp->vpnd_neti_v4);
+ (void) net_protocol_unregister(nsp->vpnd_neti_v6);
+ cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register "
+ "failed for stack %d", nsp->vpnd_nsid);
+ return (1);
+ }
+
+ nsp->vpnd_event_in_v6.he_version = HOOK_VERSION;
+ nsp->vpnd_event_in_v6.he_name = NH_PHYSICAL_IN;
+ nsp->vpnd_event_in_v6.he_flags = 0;
+ nsp->vpnd_event_in_v6.he_interested = B_FALSE;
+
+ nsp->vpnd_token_in_v6 = net_event_register(nsp->vpnd_neti_v6,
+ &nsp->vpnd_event_in_v6);
+ if (nsp->vpnd_token_in_v6 == NULL) {
+ (void) net_event_shutdown(nsp->vpnd_neti_v4,
+ &nsp->vpnd_event_in_v4);
+ (void) net_event_unregister(nsp->vpnd_neti_v4,
+ &nsp->vpnd_event_in_v4);
+ (void) net_family_unregister(nsp->vpnd_neti_v4,
+ &nsp->vpnd_family_v4);
+ (void) net_family_unregister(nsp->vpnd_neti_v6,
+ &nsp->vpnd_family_v6);
+ (void) net_protocol_unregister(nsp->vpnd_neti_v4);
+ (void) net_protocol_unregister(nsp->vpnd_neti_v6);
+ cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register "
+ "failed for stack %d", nsp->vpnd_nsid);
+ return (1);
+ }
+
+ nsp->vpnd_event_out_v4.he_version = HOOK_VERSION;
+ nsp->vpnd_event_out_v4.he_name = NH_PHYSICAL_OUT;
+ nsp->vpnd_event_out_v4.he_flags = 0;
+ nsp->vpnd_event_out_v4.he_interested = B_FALSE;
+
+ nsp->vpnd_token_out_v4 = net_event_register(nsp->vpnd_neti_v4,
+ &nsp->vpnd_event_out_v4);
+ if (nsp->vpnd_token_out_v4 == NULL) {
+ (void) net_event_shutdown(nsp->vpnd_neti_v6,
+ &nsp->vpnd_event_in_v6);
+ (void) net_event_unregister(nsp->vpnd_neti_v6,
+ &nsp->vpnd_event_in_v6);
+ (void) net_event_shutdown(nsp->vpnd_neti_v4,
+ &nsp->vpnd_event_in_v4);
+ (void) net_event_unregister(nsp->vpnd_neti_v4,
+ &nsp->vpnd_event_in_v4);
+ (void) net_family_unregister(nsp->vpnd_neti_v4,
+ &nsp->vpnd_family_v4);
+ (void) net_family_unregister(nsp->vpnd_neti_v6,
+ &nsp->vpnd_family_v6);
+ (void) net_protocol_unregister(nsp->vpnd_neti_v4);
+ (void) net_protocol_unregister(nsp->vpnd_neti_v6);
+ cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register "
+ "failed for stack %d", nsp->vpnd_nsid);
+ return (1);
+ }
+
+ nsp->vpnd_event_out_v6.he_version = HOOK_VERSION;
+ nsp->vpnd_event_out_v6.he_name = NH_PHYSICAL_OUT;
+ nsp->vpnd_event_out_v6.he_flags = 0;
+ nsp->vpnd_event_out_v6.he_interested = B_FALSE;
+
+ nsp->vpnd_token_out_v6 = net_event_register(nsp->vpnd_neti_v6,
+ &nsp->vpnd_event_out_v6);
+ if (nsp->vpnd_token_out_v6 == NULL) {
+ (void) net_event_shutdown(nsp->vpnd_neti_v6,
+ &nsp->vpnd_event_in_v6);
+ (void) net_event_unregister(nsp->vpnd_neti_v6,
+ &nsp->vpnd_event_in_v6);
+ (void) net_event_shutdown(nsp->vpnd_neti_v6,
+ &nsp->vpnd_event_in_v6);
+ (void) net_event_unregister(nsp->vpnd_neti_v6,
+ &nsp->vpnd_event_in_v6);
+ (void) net_event_shutdown(nsp->vpnd_neti_v4,
+ &nsp->vpnd_event_in_v4);
+ (void) net_event_unregister(nsp->vpnd_neti_v4,
+ &nsp->vpnd_event_in_v4);
+ (void) net_family_unregister(nsp->vpnd_neti_v4,
+ &nsp->vpnd_family_v4);
+ (void) net_family_unregister(nsp->vpnd_neti_v6,
+ &nsp->vpnd_family_v6);
+ (void) net_protocol_unregister(nsp->vpnd_neti_v4);
+ (void) net_protocol_unregister(nsp->vpnd_neti_v6);
+ cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register "
+ "failed for stack %d", nsp->vpnd_nsid);
+ return (1);
+ }
+
+ return (0);
+}
+
+static void
+vnd_netinfo_shutdown(vnd_pnsd_t *nsp)
+{
+ int ret;
+
+ ret = net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4);
+ VERIFY(ret == 0);
+ ret = net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4);
+ VERIFY(ret == 0);
+ ret = net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6);
+ VERIFY(ret == 0);
+ ret = net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6);
+ VERIFY(ret == 0);
+}
+
+static void
+vnd_netinfo_fini(vnd_pnsd_t *nsp)
+{
+ int ret;
+
+ ret = net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4);
+ VERIFY(ret == 0);
+ ret = net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4);
+ VERIFY(ret == 0);
+ ret = net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6);
+ VERIFY(ret == 0);
+ ret = net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6);
+ VERIFY(ret == 0);
+ ret = net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4);
+ VERIFY(ret == 0);
+ ret = net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6);
+ VERIFY(ret == 0);
+ ret = net_protocol_unregister(nsp->vpnd_neti_v4);
+ VERIFY(ret == 0);
+ ret = net_protocol_unregister(nsp->vpnd_neti_v6);
+ VERIFY(ret == 0);
+}
+
+/* ARGSUSED */
+static void
+vnd_strbarrier_cb(void *arg, mblk_t *bmp, gsqueue_t *gsp, void *dummy)
+{
+ vnd_str_t *vsp = arg;
+
+ VERIFY(bmp == &vsp->vns_barrierblk);
+ mutex_enter(&vsp->vns_lock);
+ VERIFY(vsp->vns_flags & VNS_F_BARRIER);
+ VERIFY(!(vsp->vns_flags & VNS_F_BARRIER_DONE));
+ vsp->vns_flags |= VNS_F_BARRIER_DONE;
+ mutex_exit(&vsp->vns_lock);
+
+ /*
+ * For better or worse, we have to broadcast here as we could have a
+ * thread that's blocked for completion as well as one that's blocked
+ * waiting to do a barrier itself.
+ */
+ cv_broadcast(&vsp->vns_barriercv);
+}
+
+/*
+ * This is a data barrier for the stream while it is in fastpath mode. It blocks
+ * and ensures that there is nothing else in the squeue.
+ */
+static void
+vnd_strbarrier(vnd_str_t *vsp)
+{
+ mutex_enter(&vsp->vns_lock);
+ while (vsp->vns_flags & VNS_F_BARRIER)
+ cv_wait(&vsp->vns_barriercv, &vsp->vns_lock);
+ vsp->vns_flags |= VNS_F_BARRIER;
+ mutex_exit(&vsp->vns_lock);
+
+ gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_barrierblk,
+ vnd_strbarrier_cb, vsp, GSQUEUE_PROCESS, VND_SQUEUE_TAG_STRBARRIER);
+
+ mutex_enter(&vsp->vns_lock);
+ while (!(vsp->vns_flags & VNS_F_BARRIER_DONE))
+ cv_wait(&vsp->vns_barriercv, &vsp->vns_lock);
+ vsp->vns_flags &= ~VNS_F_BARRIER;
+ vsp->vns_flags &= ~VNS_F_BARRIER_DONE;
+ mutex_exit(&vsp->vns_lock);
+
+ /*
+ * We have to broadcast in case anyone is waiting for the barrier
+ * themselves.
+ */
+ cv_broadcast(&vsp->vns_barriercv);
+}
+
+/*
+ * Based on the type of message that we're dealing with we're going to want to
+ * do one of several things. Basically if it looks like it's something we know
+ * about, we should probably handle it in one of our transition threads.
+ * Otherwise, we should just simply putnext.
+ */
+static int
+vnd_s_rput(queue_t *q, mblk_t *mp)
+{
+ t_uscalar_t prim;
+ int dispatch = 0;
+ vnd_str_t *vsp = q->q_ptr;
+
+ switch (DB_TYPE(mp)) {
+ case M_PROTO:
+ case M_PCPROTO:
+ if (MBLKL(mp) < sizeof (t_uscalar_t)) {
+ vnd_drop_ctl(vsp, mp, "PROTO message too short");
+ break;
+ }
+
+ prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive;
+ if (prim == DL_UNITDATA_REQ || prim == DL_UNITDATA_IND) {
+ vnd_drop_ctl(vsp, mp,
+ "recieved an unsupported dlpi DATA req");
+ break;
+ }
+
+ /*
+ * Enqueue the entry and fire off a taskq dispatch.
+ */
+ mutex_enter(&vsp->vns_lock);
+ vnd_dlpi_inc_push(vsp, mp);
+ if (!(vsp->vns_flags & VNS_F_TASKQ_DISPATCHED)) {
+ dispatch = 1;
+ vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED;
+ }
+ mutex_exit(&vsp->vns_lock);
+ if (dispatch != 0)
+ taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch,
+ vsp, 0, &vsp->vns_tqe);
+ break;
+ case M_DATA:
+ vnd_drop_in(vsp, mp, "M_DATA via put(9E)");
+ break;
+ default:
+ putnext(vsp->vns_rq, mp);
+ }
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+vnd_strioctl(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct iocblk *iocp)
+{
+ int error;
+ vnd_strioc_t *visp;
+
+ if (iocp->ioc_cmd != VND_STRIOC_ASSOCIATE ||
+ iocp->ioc_count != TRANSPARENT) {
+ error = EINVAL;
+ goto nak;
+ }
+
+ /*
+ * All streams ioctls that we support must use kcred as a means to
+ * distinguish that this is a layered open by the kernel as opposed to
+ * one by a user who has done an I_PUSH of the module.
+ */
+ if (iocp->ioc_cr != kcred) {
+ error = EPERM;
+ goto nak;
+ }
+
+ if (mp->b_cont == NULL) {
+ error = EAGAIN;
+ goto nak;
+ }
+
+ visp = kmem_alloc(sizeof (vnd_strioc_t), KM_SLEEP);
+ ASSERT(MBLKL(mp->b_cont) == sizeof (caddr_t));
+ visp->vs_addr = *(caddr_t *)mp->b_cont->b_rptr;
+ visp->vs_state = VSS_COPYIN;
+
+ mcopyin(mp, (void *)visp, sizeof (vnd_strioc_associate_t), NULL);
+ qreply(q, mp);
+
+ return;
+
+nak:
+ if (mp->b_cont != NULL) {
+ freemsg(mp->b_cont);
+ mp->b_cont = NULL;
+ }
+
+ iocp->ioc_error = error;
+ mp->b_datap->db_type = M_IOCNAK;
+ iocp->ioc_count = 0;
+ qreply(q, mp);
+}
+
+static void
+vnd_striocdata(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct copyresp *csp)
+{
+ vnd_str_state_t state;
+ struct copyreq *crp;
+ vnd_strioc_associate_t *vss;
+ vnd_dev_t *vdp = NULL;
+ vnd_pnsd_t *nsp = NULL;
+ char iname[2*VND_NAMELEN];
+ zone_t *zone;
+ vnd_strioc_t *visp;
+
+ visp = (vnd_strioc_t *)csp->cp_private;
+
+ /* If it's not ours, it's not our problem */
+ if (csp->cp_cmd != VND_STRIOC_ASSOCIATE) {
+ if (q->q_next != NULL) {
+ putnext(q, mp);
+ } else {
+ VND_STAT_INC(vsp, vks_ndlpidrops, 1);
+ VND_STAT_INC(vsp, vks_tdrops, 1);
+ vnd_drop_ctl(vsp, mp, "uknown cmd for M_IOCDATA");
+ }
+ kmem_free(visp, sizeof (vnd_strioc_t));
+ return;
+ }
+
+ /* The nak is already sent for us */
+ if (csp->cp_rval != 0) {
+ vnd_drop_ctl(vsp, mp, "M_COPYIN failed");
+ kmem_free(visp, sizeof (vnd_strioc_t));
+ return;
+ }
+
+ /* Data is sitting for us in b_cont */
+ if (mp->b_cont == NULL ||
+ MBLKL(mp->b_cont) != sizeof (vnd_strioc_associate_t)) {
+ kmem_free(visp, sizeof (vnd_strioc_t));
+ miocnak(q, mp, 0, EINVAL);
+ return;
+ }
+
+ vss = (vnd_strioc_associate_t *)mp->b_cont->b_rptr;
+ vdp = vnd_dev_lookup(vss->vsa_minor);
+ if (vdp == NULL) {
+ vss->vsa_errno = VND_E_NODEV;
+ goto nak;
+ }
+
+ nsp = vnd_nsd_lookup(vss->vsa_nsid);
+ if (nsp == NULL) {
+ vss->vsa_errno = VND_E_NONETSTACK;
+ goto nak;
+ }
+
+ mutex_enter(&vsp->vns_lock);
+ if (!(vsp->vns_flags & VNS_F_NEED_ZONE)) {
+ mutex_exit(&vsp->vns_lock);
+ vss->vsa_errno = VND_E_ASSOCIATED;
+ goto nak;
+ }
+
+ vsp->vns_nsd = nsp;
+ vsp->vns_flags &= ~VNS_F_NEED_ZONE;
+ vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED;
+ mutex_exit(&vsp->vns_lock);
+
+ taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, vsp, 0,
+ &vsp->vns_tqe);
+
+
+ /* At this point we need to wait until we have transitioned to ONLINE */
+ mutex_enter(&vsp->vns_lock);
+ while (vsp->vns_state != VNS_S_ONLINE && vsp->vns_state != VNS_S_ZOMBIE)
+ cv_wait(&vsp->vns_stcv, &vsp->vns_lock);
+ state = vsp->vns_state;
+ mutex_exit(&vsp->vns_lock);
+
+ if (state == VNS_S_ZOMBIE) {
+ vss->vsa_errno = vsp->vns_errno;
+ goto nak;
+ }
+
+ mutex_enter(&vdp->vdd_lock);
+ mutex_enter(&vsp->vns_lock);
+ VERIFY(vdp->vdd_str == NULL);
+ /*
+ * Now initialize the remaining kstat properties and let's go ahead and
+ * create it.
+ */
+ (void) snprintf(iname, sizeof (iname), "z%d_%d",
+ vdp->vdd_nsd->vpnd_zid, vdp->vdd_minor);
+ vsp->vns_kstat = kstat_create_zone("vnd", vdp->vdd_minor, iname, "net",
+ KSTAT_TYPE_NAMED, sizeof (vnd_str_stat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID);
+ if (vsp->vns_kstat == NULL) {
+ vss->vsa_errno = VND_E_KSTATCREATE;
+ mutex_exit(&vsp->vns_lock);
+ mutex_exit(&vdp->vdd_lock);
+ goto nak;
+ }
+ vdp->vdd_str = vsp;
+ vsp->vns_dev = vdp;
+
+ /*
+ * Now, it's time to do the las thing that can fail, changing out the
+ * input function. After this we know that we can receive data, so we
+ * should make sure that we're ready.
+ */
+ if (vnd_dld_cap_enable(vsp, vnd_mac_input) != 0) {
+ vss->vsa_errno = VND_E_DIRECTFAIL;
+ vdp->vdd_str = NULL;
+ vsp->vns_dev = NULL;
+ mutex_exit(&vsp->vns_lock);
+ mutex_exit(&vdp->vdd_lock);
+ goto nak;
+ }
+
+ zone = zone_find_by_id(vdp->vdd_nsd->vpnd_zid);
+ ASSERT(zone != NULL);
+ vsp->vns_kstat->ks_data = &vsp->vns_ksdata;
+ /* Account for zone name */
+ vsp->vns_kstat->ks_data_size += strlen(zone->zone_name) + 1;
+ /* Account for eventual link name */
+ vsp->vns_kstat->ks_data_size += VND_NAMELEN;
+ kstat_named_setstr(&vsp->vns_ksdata.vks_zonename, zone->zone_name);
+ kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname,
+ vdp->vdd_lname);
+ zone_rele(zone);
+ kstat_install(vsp->vns_kstat);
+
+ mutex_exit(&vsp->vns_lock);
+ mutex_exit(&vdp->vdd_lock);
+
+ /*
+ * Note that the vnd_str_t does not keep a permanent hold on the
+ * vnd_pnsd_t. We leave that up to the vnd_dev_t as that's also what
+ * the nestack goes through to take care of everything.
+ */
+ vss->vsa_errno = VND_E_SUCCESS;
+nak:
+ if (vdp != NULL)
+ vnd_dev_rele(vdp);
+ if (nsp != NULL)
+ vnd_nsd_rele(nsp);
+ /*
+ * Change the copyin request to a copyout. Note that we can't use
+ * mcopyout here as it only works when the DB_TYPE is M_IOCTL. That's
+ * okay, as the copyin vs. copyout is basically the same.
+ */
+ DB_TYPE(mp) = M_COPYOUT;
+ visp->vs_state = VSS_COPYOUT;
+ crp = (struct copyreq *)mp->b_rptr;
+ crp->cq_private = (void *)visp;
+ crp->cq_addr = visp->vs_addr;
+ crp->cq_size = sizeof (vnd_strioc_associate_t);
+ qreply(q, mp);
+}
+
+static void
+vnd_stroutdata(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct copyresp *csp)
+{
+ ASSERT(csp->cp_private != NULL);
+ kmem_free(csp->cp_private, sizeof (vnd_strioc_t));
+ if (csp->cp_cmd != VND_STRIOC_ASSOCIATE) {
+ if (q->q_next != NULL) {
+ putnext(q, mp);
+ } else {
+ VND_STAT_INC(vsp, vks_ndlpidrops, 1);
+ VND_STAT_INC(vsp, vks_tdrops, 1);
+ vnd_drop_ctl(vsp, mp, "uknown cmd for M_IOCDATA");
+ }
+ return;
+ }
+
+ /* The nak is already sent for us */
+ if (csp->cp_rval != 0) {
+ vnd_drop_ctl(vsp, mp, "M_COPYOUT failed");
+ return;
+ }
+
+ /* Ack and let's be done with it all */
+ miocack(q, mp, 0, 0);
+}
+
+static int
+vnd_s_wput(queue_t *q, mblk_t *mp)
+{
+ vnd_str_t *vsp = q->q_ptr;
+ struct copyresp *crp;
+ vnd_strioc_state_t vstate;
+ vnd_strioc_t *visp;
+
+ switch (DB_TYPE(mp)) {
+ case M_IOCTL:
+ vnd_strioctl(q, vsp, mp, (struct iocblk *)mp->b_rptr);
+ return (0);
+ case M_IOCDATA:
+ crp = (struct copyresp *)mp->b_rptr;
+ ASSERT(crp->cp_private != NULL);
+ visp = (vnd_strioc_t *)crp->cp_private;
+ vstate = visp->vs_state;
+ ASSERT(vstate == VSS_COPYIN || vstate == VSS_COPYOUT);
+ if (vstate == VSS_COPYIN)
+ vnd_striocdata(q, vsp, mp,
+ (struct copyresp *)mp->b_rptr);
+ else
+ vnd_stroutdata(q, vsp, mp,
+ (struct copyresp *)mp->b_rptr);
+ return (0);
+ default:
+ break;
+ }
+ if (q->q_next != NULL)
+ putnext(q, mp);
+ else
+ vnd_drop_ctl(vsp, mp, "!M_IOCTL in wput");
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+vnd_s_open(queue_t *q, dev_t *devp, int oflag, int sflag, cred_t *credp)
+{
+ vnd_str_t *vsp;
+ uint_t rand;
+
+ if (q->q_ptr != NULL)
+ return (EINVAL);
+
+ if (!(sflag & MODOPEN))
+ return (ENXIO);
+
+ if (credp != kcred)
+ return (EPERM);
+
+ vsp = kmem_cache_alloc(vnd_str_cache, KM_SLEEP);
+ bzero(vsp, sizeof (*vsp));
+ mutex_init(&vsp->vns_lock, NULL, MUTEX_DRIVER, NULL);
+ cv_init(&vsp->vns_cancelcv, NULL, CV_DRIVER, NULL);
+ cv_init(&vsp->vns_barriercv, NULL, CV_DRIVER, NULL);
+ cv_init(&vsp->vns_stcv, NULL, CV_DRIVER, NULL);
+ vsp->vns_state = VNS_S_INITIAL;
+
+ mutex_init(&vsp->vns_dq_read.vdq_lock, NULL, MUTEX_DRIVER, NULL);
+ mutex_init(&vsp->vns_dq_write.vdq_lock, NULL, MUTEX_DRIVER, NULL);
+ mutex_enter(&vnd_dev_lock);
+ vsp->vns_dq_read.vdq_max = vnd_vdq_default_size;
+ vsp->vns_dq_read.vdq_vns = vsp;
+ vsp->vns_dq_write.vdq_max = vnd_vdq_default_size;
+ vsp->vns_dq_write.vdq_vns = vsp;
+ mutex_exit(&vnd_dev_lock);
+ vsp->vns_rq = q;
+ vsp->vns_wq = WR(q);
+ q->q_ptr = WR(q)->q_ptr = vsp;
+ vsp->vns_flags = VNS_F_NEED_ZONE;
+ vsp->vns_nflush = vnd_flush_nburst;
+ vsp->vns_bsize = vnd_flush_burst_size;
+
+ (void) random_get_pseudo_bytes((uint8_t *)&rand, sizeof (rand));
+ vsp->vns_squeue = gsqueue_set_get(vnd_sqset, rand);
+
+ /*
+ * We create our kstat and initialize all of its fields now, but we
+ * don't install it until we actually do the zone association so we can
+ * get everything.
+ */
+ kstat_named_init(&vsp->vns_ksdata.vks_rbytes, "rbytes",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_rpackets, "rpackets",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_obytes, "obytes",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_opackets, "opackets",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_nhookindrops, "nhookindrops",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_nhookoutdrops, "nhookoutdrops",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_ndlpidrops, "ndlpidrops",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_ndataindrops, "ndataindrops",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_ndataoutdrops, "ndataoutdrops",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_tdrops, "total_drops",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_linkname, "linkname",
+ KSTAT_DATA_STRING);
+ kstat_named_init(&vsp->vns_ksdata.vks_zonename, "zonename",
+ KSTAT_DATA_STRING);
+ kstat_named_init(&vsp->vns_ksdata.vks_nmacflow, "flowcontrol_events",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_tmacflow, "flowcontrol_time",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_1ms, "flowcontrol_1ms",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_10ms, "flowcontrol_10ms",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_100ms,
+ "flowcontrol_100ms", KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_1s, "flowcontrol_1s",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_10s, "flowcontrol_10s",
+ KSTAT_DATA_UINT64);
+ qprocson(q);
+ /*
+ * Now that we've called qprocson, grab the lower module for making sure
+ * that we don't have any pass through modules.
+ */
+ vsp->vns_lrq = RD(vsp->vns_wq->q_next);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+vnd_s_close(queue_t *q, int flag, cred_t *credp)
+{
+ vnd_str_t *vsp;
+ mblk_t *mp;
+
+ VERIFY(WR(q)->q_next != NULL);
+
+ vsp = q->q_ptr;
+ ASSERT(vsp != NULL);
+
+ /*
+ * We need to transition ourselves down. This means that we have a few
+ * important different things to do in the process of tearing down our
+ * input and output buffers, making sure we've drained the current
+ * squeue, and disabling the fast path. Before we disable the fast path,
+ * we should make sure the squeue is drained. Because we're in streams
+ * close, we know that no packets can come into us from userland, but we
+ * can receive more. As such, the following is the exact order of things
+ * that we do:
+ *
+ * 1) flush the vns_dq_read
+ * 2) Insert the drain mblk
+ * 3) When it's been received, tear down the fast path by kicking
+ * off the state machine.
+ * 4) One final flush of both the vns_dq_read,vns_dq_write
+ */
+
+ vnd_dq_flush(&vsp->vns_dq_read, vnd_drop_in);
+ vnd_strbarrier(vsp);
+ mutex_enter(&vsp->vns_lock);
+ vsp->vns_state = VNS_S_SHUTTING_DOWN;
+ if (!(vsp->vns_flags & VNS_F_TASKQ_DISPATCHED)) {
+ vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED;
+ taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, vsp,
+ 0, &vsp->vns_tqe);
+ }
+ while (vsp->vns_state != VNS_S_ZOMBIE)
+ cv_wait(&vsp->vns_stcv, &vsp->vns_lock);
+ mutex_exit(&vsp->vns_lock);
+
+ qprocsoff(q);
+ mutex_enter(&vsp->vns_lock);
+ vsp->vns_flags |= VNS_F_CONDEMNED;
+ while (vsp->vns_flags & VNS_F_TASKQ_DISPATCHED)
+ cv_wait(&vsp->vns_cancelcv, &vsp->vns_lock);
+
+ while ((mp = vnd_dlpi_inc_pop(vsp)) != NULL)
+ vnd_drop_ctl(vsp, mp, "vnd_s_close");
+ mutex_exit(&vsp->vns_lock);
+
+ q->q_ptr = NULL;
+ vnd_dq_flush(&vsp->vns_dq_read, vnd_drop_in);
+ vnd_dq_flush(&vsp->vns_dq_write, vnd_drop_out);
+ mutex_destroy(&vsp->vns_dq_read.vdq_lock);
+ mutex_destroy(&vsp->vns_dq_write.vdq_lock);
+
+ if (vsp->vns_kstat != NULL)
+ kstat_delete(vsp->vns_kstat);
+ mutex_destroy(&vsp->vns_lock);
+ cv_destroy(&vsp->vns_stcv);
+ cv_destroy(&vsp->vns_barriercv);
+ cv_destroy(&vsp->vns_cancelcv);
+ kmem_cache_free(vnd_str_cache, vsp);
+
+ return (0);
+}
+
+static vnd_mac_cookie_t
+vnd_squeue_tx_one(vnd_str_t *vsp, mblk_t *mp)
+{
+ hrtime_t txtime;
+ vnd_mac_cookie_t vc;
+
+ VND_STAT_INC(vsp, vks_opackets, 1);
+ VND_STAT_INC(vsp, vks_obytes, msgsize(mp));
+ DTRACE_VND5(send, mblk_t *, mp, void *, NULL, void *, NULL,
+ vnd_str_t *, vsp, mblk_t *, mp);
+ /* Actually tx now */
+ txtime = gethrtime();
+ vc = vsp->vns_caps.vsc_tx_f(vsp->vns_caps.vsc_tx_hdl,
+ mp, 0, MAC_DROP_ON_NO_DESC);
+
+ /*
+ * We need to check two different conditions before we immediately set
+ * the flow control lock. The first thing that we need to do is verify
+ * that this is an instance of hard flow control, so to say. The flow
+ * control callbacks won't always fire in cases where we still get a
+ * cookie returned. The explicit check for flow control will guarantee
+ * us that we'll get a subsequent notification callback.
+ *
+ * The second case comes about because we do not hold the
+ * vnd_str_t`vns_lock across calls to tx, we need to determine if a flow
+ * control notification already came across for us in a different thread
+ * calling vnd_mac_flow_control(). To deal with this, we record a
+ * timestamp every time that we change the flow control state. We grab
+ * txtime here before we transmit because that guarantees that the
+ * hrtime_t of the call to vnd_mac_flow_control() will be after txtime.
+ *
+ * If the flow control notification beat us to the punch, the value of
+ * vns_fcupdate will be larger than the value of txtime, and we should
+ * just record the statistics. However, if we didn't beat it to the
+ * punch (txtime > vns_fcupdate), then we know that it's safe to wait
+ * for a notification.
+ */
+ if (vc != (vnd_mac_cookie_t)NULL) {
+ hrtime_t diff;
+
+ if (vsp->vns_caps.vsc_is_fc_f(vsp->vns_caps.vsc_is_fc_hdl,
+ vc) == 0)
+ return ((vnd_mac_cookie_t)NULL);
+ mutex_enter(&vsp->vns_lock);
+ diff = vsp->vns_fcupdate - txtime;
+ if (diff > 0) {
+ mutex_exit(&vsp->vns_lock);
+ vnd_mac_flow_control_stat(vsp, diff);
+ return ((vnd_mac_cookie_t)NULL);
+ }
+ vsp->vns_flags |= VNS_F_FLOW_CONTROLLED;
+ vsp->vns_caps.vsc_fc_cookie = vc;
+ vsp->vns_fclatch = txtime;
+ vsp->vns_fcupdate = txtime;
+ DTRACE_VND3(flow__blocked, vnd_str_t *, vsp,
+ uint64_t, vsp->vns_dq_write.vdq_cur, uintptr_t, vc);
+ mutex_exit(&vsp->vns_lock);
+ }
+
+ return (vc);
+}
+
+/* ARGSUSED */
+static void
+vnd_squeue_tx_drain(void *arg, mblk_t *drain_mp, gsqueue_t *gsp, void *dummy)
+{
+ mblk_t *mp;
+ int nmps;
+ size_t mptot, nflush, bsize;
+ boolean_t blocked, empty;
+ vnd_data_queue_t *vqp;
+ vnd_str_t *vsp = arg;
+
+ mutex_enter(&vsp->vns_lock);
+ /*
+ * We either enter here via an squeue or via vnd_squeue_tx_append(). In
+ * the former case we need to mark that there is no longer an active
+ * user of the drain block.
+ */
+ if (drain_mp != NULL) {
+ VERIFY(drain_mp == &vsp->vns_drainblk);
+ VERIFY(vsp->vns_flags & VNS_F_DRAIN_SCHEDULED);
+ vsp->vns_flags &= ~VNS_F_DRAIN_SCHEDULED;
+ }
+
+ /*
+ * If we're still flow controlled or under a flush barrier, nothing to
+ * do.
+ */
+ if (vsp->vns_flags & (VNS_F_FLOW_CONTROLLED | VNS_F_BARRIER)) {
+ mutex_exit(&vsp->vns_lock);
+ return;
+ }
+
+ nflush = vsp->vns_nflush;
+ bsize = vsp->vns_bsize;
+ mutex_exit(&vsp->vns_lock);
+
+ /*
+ * We're potentially going deep into the networking layer; make sure the
+ * guest can't run concurrently.
+ */
+ smt_begin_unsafe();
+
+ nmps = 0;
+ mptot = 0;
+ blocked = B_FALSE;
+ vqp = &vsp->vns_dq_write;
+ while (nmps < nflush && mptot <= bsize) {
+ mutex_enter(&vqp->vdq_lock);
+ if (vnd_dq_pop(vqp, &mp) == 0) {
+ mutex_exit(&vqp->vdq_lock);
+ break;
+ }
+ mutex_exit(&vqp->vdq_lock);
+
+ nmps++;
+ mptot += msgsize(mp);
+ if (vnd_squeue_tx_one(vsp, mp) != (vnd_mac_cookie_t)NULL) {
+ blocked = B_TRUE;
+ break;
+ }
+ }
+
+ smt_end_unsafe();
+
+ empty = vnd_dq_is_empty(&vsp->vns_dq_write);
+
+ /*
+ * If the queue is not empty, we're not blocked, and there isn't a drain
+ * scheduled, put it into the squeue with the drain block and
+ * GSQUEUE_FILL.
+ */
+ if (blocked == B_FALSE && empty == B_FALSE) {
+ mutex_enter(&vsp->vns_lock);
+ if (!(vsp->vns_flags & VNS_F_DRAIN_SCHEDULED)) {
+ mblk_t *mp = &vsp->vns_drainblk;
+ vsp->vns_flags |= VNS_F_DRAIN_SCHEDULED;
+ gsqueue_enter_one(vsp->vns_squeue,
+ mp, vnd_squeue_tx_drain, vsp,
+ GSQUEUE_FILL, VND_SQUEUE_TAG_TX_DRAIN);
+ }
+ mutex_exit(&vsp->vns_lock);
+ }
+
+ /*
+ * If we drained some amount of data, we need to signal the data queue.
+ */
+ if (nmps > 0) {
+ cv_broadcast(&vsp->vns_dq_write.vdq_ready);
+ pollwakeup(&vsp->vns_dev->vdd_ph, POLLOUT);
+ }
+}
+
+/* ARGSUSED */
+static void
+vnd_squeue_tx_append(void *arg, mblk_t *mp, gsqueue_t *gsp, void *dummy)
+{
+ vnd_str_t *vsp = arg;
+ vnd_data_queue_t *vqp = &vsp->vns_dq_write;
+ vnd_pnsd_t *nsp = vsp->vns_nsd;
+ size_t len = msgsize(mp);
+
+ /*
+ * Before we append this packet, we should run it through the firewall
+ * rules.
+ */
+ if (nsp->vpnd_hooked && vnd_hook(vsp, &mp, nsp->vpnd_neti_v4,
+ nsp->vpnd_event_out_v4, nsp->vpnd_token_out_v4, nsp->vpnd_neti_v6,
+ nsp->vpnd_event_out_v6, nsp->vpnd_token_out_v6, vnd_drop_hook_out,
+ vnd_drop_out) != 0) {
+ /*
+ * Because we earlier reserved space for this packet and it's
+ * not making the cut, we need to go through and unreserve that
+ * space. Also note that the message block will likely be freed
+ * by the time we return from vnd_hook so we cannot rely on it.
+ */
+ mutex_enter(&vqp->vdq_lock);
+ vnd_dq_unreserve(vqp, len);
+ mutex_exit(&vqp->vdq_lock);
+ return;
+ }
+
+ /*
+ * We earlier reserved space for this packet. So for now simply append
+ * it and call drain. We know that no other drain can be going on right
+ * now thanks to the squeue.
+ */
+ mutex_enter(&vqp->vdq_lock);
+ (void) vnd_dq_push(&vsp->vns_dq_write, mp, B_TRUE, vnd_drop_panic);
+ mutex_exit(&vqp->vdq_lock);
+ vnd_squeue_tx_drain(vsp, NULL, NULL, NULL);
+}
+
+/*
+ * We need to see if this is a valid name of sorts for us. That means a few
+ * things. First off, we can't assume that what we've been given has actually
+ * been null terminated. More importantly, that it's a valid name as far as
+ * ddi_create_minor_node is concerned (that means no '@', '/', or ' '). We
+ * further constrain ourselves to simply alphanumeric characters and a few
+ * additional ones, ':', '-', and '_'.
+ */
+static int
+vnd_validate_name(const char *buf, size_t buflen)
+{
+ int i, len;
+
+ /* First make sure a null terminator exists */
+ for (i = 0; i < buflen; i++)
+ if (buf[i] == '\0')
+ break;
+ len = i;
+ if (i == 0 || i == buflen)
+ return (0);
+
+ for (i = 0; i < len; i++)
+ if (!isalnum(buf[i]) && buf[i] != ':' && buf[i] != '-' &&
+ buf[i] != '_')
+ return (0);
+
+ return (1);
+}
+
+static int
+vnd_ioctl_attach(vnd_dev_t *vdp, uintptr_t arg, cred_t *credp, int cpflag)
+{
+ vnd_ioc_attach_t via;
+ vnd_strioc_associate_t vss;
+ vnd_pnsd_t *nsp;
+ zone_t *zonep;
+ zoneid_t zid;
+ char buf[2*VND_NAMELEN];
+ int ret, rp;
+
+ if (secpolicy_net_config(credp, B_FALSE) != 0)
+ return (EPERM);
+
+ if (secpolicy_net_rawaccess(credp) != 0)
+ return (EPERM);
+
+ if (ddi_copyin((void *)arg, &via, sizeof (via), cpflag) != 0)
+ return (EFAULT);
+ via.via_errno = VND_E_SUCCESS;
+
+ if (vnd_validate_name(via.via_name, VND_NAMELEN) == 0) {
+ via.via_errno = VND_E_BADNAME;
+ ret = EIO;
+ goto errcopyout;
+ }
+
+ /*
+ * Only the global zone can request to create a device in a different
+ * zone.
+ */
+ zid = crgetzoneid(credp);
+ if (zid != GLOBAL_ZONEID && via.via_zoneid != -1 &&
+ zid != via.via_zoneid) {
+ via.via_errno = VND_E_PERM;
+ ret = EIO;
+ goto errcopyout;
+ }
+
+ if (via.via_zoneid == -1)
+ via.via_zoneid = zid;
+
+ /*
+ * Establish the name we'll use now. We want to be extra paranoid about
+ * the device we're opening so check that now.
+ */
+ if (zid == GLOBAL_ZONEID && via.via_zoneid != zid) {
+ zonep = zone_find_by_id(via.via_zoneid);
+ if (zonep == NULL) {
+ via.via_errno = VND_E_NOZONE;
+ ret = EIO;
+ goto errcopyout;
+ }
+ if (snprintf(NULL, 0, "/dev/net/zone/%s/%s", zonep->zone_name,
+ via.via_name) >= sizeof (buf)) {
+ zone_rele(zonep);
+ via.via_errno = VND_E_BADNAME;
+ ret = EIO;
+ goto errcopyout;
+ }
+ (void) snprintf(buf, sizeof (buf), "/dev/net/zone/%s/%s",
+ zonep->zone_name, via.via_name);
+ zone_rele(zonep);
+ zonep = NULL;
+ } else {
+ if (snprintf(NULL, 0, "/dev/net/%s", via.via_name) >=
+ sizeof (buf)) {
+ via.via_errno = VND_E_BADNAME;
+ ret = EIO;
+ goto errcopyout;
+ }
+ (void) snprintf(buf, sizeof (buf), "/dev/net/%s", via.via_name);
+ }
+
+ /*
+ * If our zone is dying then the netstack will have been removed from
+ * this list.
+ */
+ nsp = vnd_nsd_lookup_by_zid(via.via_zoneid);
+ if (nsp == NULL) {
+ via.via_errno = VND_E_NOZONE;
+ ret = EIO;
+ goto errcopyout;
+ }
+
+ /*
+ * Note we set the attached handle even though we haven't actually
+ * finished the process of attaching the ldi handle.
+ */
+ mutex_enter(&vdp->vdd_lock);
+ if (vdp->vdd_flags & (VND_D_ATTACHED | VND_D_ATTACH_INFLIGHT)) {
+ mutex_exit(&vdp->vdd_lock);
+ vnd_nsd_rele(nsp);
+ via.via_errno = VND_E_ATTACHED;
+ ret = EIO;
+ goto errcopyout;
+ }
+ vdp->vdd_flags |= VND_D_ATTACH_INFLIGHT;
+ ASSERT(vdp->vdd_cr == NULL);
+ crhold(credp);
+ vdp->vdd_cr = credp;
+ ASSERT(vdp->vdd_nsd == NULL);
+ vdp->vdd_nsd = nsp;
+ mutex_exit(&vdp->vdd_lock);
+
+ /*
+ * Place an additional hold on the vnd_pnsd_t as we go through and do
+ * all of the rest of our work. This will be the hold that we keep for
+ * as long as this thing is attached.
+ */
+ vnd_nsd_ref(nsp);
+
+ ret = ldi_open_by_name(buf, FREAD | FWRITE, vdp->vdd_cr,
+ &vdp->vdd_ldih, vdp->vdd_ldiid);
+ if (ret != 0) {
+ if (ret == ENODEV)
+ via.via_errno = VND_E_NODATALINK;
+ goto err;
+ }
+
+ /*
+ * Unfortunately the I_PUSH interface doesn't allow us a way to detect
+ * whether or not we're coming in from a layered device. We really want
+ * to make sure that a normal user can't push on our streams module.
+ * Currently the only idea I have for this is to make sure that the
+ * credp is kcred which is really terrible.
+ */
+ ret = ldi_ioctl(vdp->vdd_ldih, I_PUSH, (intptr_t)"vnd", FKIOCTL,
+ kcred, &rp);
+ if (ret != 0) {
+ rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr);
+ VERIFY(rp == 0);
+ via.via_errno = VND_E_STRINIT;
+ ret = EIO;
+ goto err;
+ }
+
+ vss.vsa_minor = vdp->vdd_minor;
+ vss.vsa_nsid = nsp->vpnd_nsid;
+
+ ret = ldi_ioctl(vdp->vdd_ldih, VND_STRIOC_ASSOCIATE, (intptr_t)&vss,
+ FKIOCTL, kcred, &rp);
+ if (ret != 0 || vss.vsa_errno != VND_E_SUCCESS) {
+ rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr);
+ VERIFY(rp == 0);
+ if (ret == 0) {
+ via.via_errno = vss.vsa_errno;
+ ret = EIO;
+ }
+ goto err;
+ }
+
+ mutex_enter(&vdp->vdd_nsd->vpnd_lock);
+
+ /*
+ * There's a chance that our netstack was condemned while we've had a
+ * hold on it. As such we need to check and if so, error out.
+ */
+ if (vdp->vdd_nsd->vpnd_flags & VND_NS_CONDEMNED) {
+ mutex_exit(&vdp->vdd_nsd->vpnd_lock);
+ rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr);
+ VERIFY(rp == 0);
+ ret = EIO;
+ via.via_errno = VND_E_NOZONE;
+ goto err;
+ }
+
+ mutex_enter(&vdp->vdd_lock);
+ VERIFY(vdp->vdd_str != NULL);
+ vdp->vdd_flags &= ~VND_D_ATTACH_INFLIGHT;
+ vdp->vdd_flags |= VND_D_ATTACHED;
+ (void) strlcpy(vdp->vdd_datalink, via.via_name,
+ sizeof (vdp->vdd_datalink));
+ list_insert_tail(&vdp->vdd_nsd->vpnd_dev_list, vdp);
+ mutex_exit(&vdp->vdd_lock);
+ mutex_exit(&vdp->vdd_nsd->vpnd_lock);
+ vnd_nsd_rele(nsp);
+
+ return (0);
+
+err:
+ mutex_enter(&vdp->vdd_lock);
+ vdp->vdd_flags &= ~VND_D_ATTACH_INFLIGHT;
+ crfree(vdp->vdd_cr);
+ vdp->vdd_cr = NULL;
+ vdp->vdd_nsd = NULL;
+ mutex_exit(&vdp->vdd_lock);
+
+ /*
+ * We have two holds to drop here. One for our original reference and
+ * one for the hold this operation would have represented.
+ */
+ vnd_nsd_rele(nsp);
+ vnd_nsd_rele(nsp);
+errcopyout:
+ if (ddi_copyout(&via, (void *)arg, sizeof (via), cpflag) != 0)
+ ret = EFAULT;
+
+ return (ret);
+}
+
+static int
+vnd_ioctl_link(vnd_dev_t *vdp, intptr_t arg, cred_t *credp, int cpflag)
+{
+ int ret = 0;
+ vnd_ioc_link_t vil;
+ char mname[2*VND_NAMELEN];
+ char **c;
+ vnd_dev_t *v;
+ zoneid_t zid;
+
+ /* Not anyone can link something */
+ if (secpolicy_net_config(credp, B_FALSE) != 0)
+ return (EPERM);
+
+ if (ddi_copyin((void *)arg, &vil, sizeof (vil), cpflag) != 0)
+ return (EFAULT);
+
+ if (vnd_validate_name(vil.vil_name, VND_NAMELEN) == 0) {
+ ret = EIO;
+ vil.vil_errno = VND_E_BADNAME;
+ goto errcopyout;
+ }
+
+ c = vnd_reserved_names;
+ while (*c != NULL) {
+ if (strcmp(vil.vil_name, *c) == 0) {
+ ret = EIO;
+ vil.vil_errno = VND_E_BADNAME;
+ goto errcopyout;
+ }
+ c++;
+ }
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ mutex_exit(&vdp->vdd_lock);
+ ret = EIO;
+ vil.vil_errno = VND_E_NOTATTACHED;
+ goto errcopyout;
+ }
+
+ if (vdp->vdd_flags & VND_D_ZONE_DYING) {
+ mutex_exit(&vdp->vdd_lock);
+ ret = EIO;
+ vil.vil_errno = VND_E_NOZONE;
+ goto errcopyout;
+ }
+
+ if (vdp->vdd_flags & (VND_D_LINK_INFLIGHT | VND_D_LINKED)) {
+ mutex_exit(&vdp->vdd_lock);
+ ret = EIO;
+ vil.vil_errno = VND_E_LINKED;
+ goto errcopyout;
+ }
+ vdp->vdd_flags |= VND_D_LINK_INFLIGHT;
+ zid = vdp->vdd_nsd->vpnd_zid;
+ mutex_exit(&vdp->vdd_lock);
+
+ if (snprintf(NULL, 0, "z%d:%s", zid, vil.vil_name) >=
+ sizeof (mname)) {
+ ret = EIO;
+ vil.vil_errno = VND_E_BADNAME;
+ goto errcopyout;
+ }
+
+ mutex_enter(&vnd_dev_lock);
+ for (v = list_head(&vnd_dev_list); v != NULL;
+ v = list_next(&vnd_dev_list, v)) {
+ if (!(v->vdd_flags & VND_D_LINKED))
+ continue;
+
+ if (v->vdd_nsd->vpnd_zid == zid &&
+ strcmp(v->vdd_lname, vil.vil_name) == 0) {
+ mutex_exit(&vnd_dev_lock);
+ ret = EIO;
+ vil.vil_errno = VND_E_LINKEXISTS;
+ goto error;
+ }
+ }
+
+ /*
+ * We set the name and mark ourselves attached while holding the list
+ * lock to ensure that no other user can mistakingly find our name.
+ */
+ (void) snprintf(mname, sizeof (mname), "z%d:%s", zid,
+ vil.vil_name);
+ mutex_enter(&vdp->vdd_lock);
+
+ /*
+ * Because we dropped our lock, we need to double check whether or not
+ * the zone was marked as dying while we were here. If it hasn't, then
+ * it's safe for us to link it in.
+ */
+ if (vdp->vdd_flags & VND_D_ZONE_DYING) {
+ mutex_exit(&vdp->vdd_lock);
+ mutex_exit(&vnd_dev_lock);
+ ret = EIO;
+ vil.vil_errno = VND_E_NOZONE;
+ goto error;
+ }
+
+ (void) strlcpy(vdp->vdd_lname, vil.vil_name, sizeof (vdp->vdd_lname));
+ if (ddi_create_minor_node(vnd_dip, mname, S_IFCHR, vdp->vdd_minor,
+ DDI_PSEUDO, 0) != DDI_SUCCESS) {
+ ret = EIO;
+ vil.vil_errno = VND_E_MINORNODE;
+ } else {
+ vdp->vdd_flags &= ~VND_D_LINK_INFLIGHT;
+ vdp->vdd_flags |= VND_D_LINKED;
+ kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname,
+ vdp->vdd_lname);
+ ret = 0;
+ }
+ mutex_exit(&vdp->vdd_lock);
+ mutex_exit(&vnd_dev_lock);
+
+ if (ret == 0) {
+ /*
+ * Add a reference to represent that this device is linked into
+ * the file system name space to ensure that it doesn't
+ * disappear.
+ */
+ vnd_dev_ref(vdp);
+ return (0);
+ }
+
+error:
+ mutex_enter(&vdp->vdd_lock);
+ vdp->vdd_flags &= ~VND_D_LINK_INFLIGHT;
+ vdp->vdd_lname[0] = '\0';
+ mutex_exit(&vdp->vdd_lock);
+
+errcopyout:
+ if (ddi_copyout(&vil, (void *)arg, sizeof (vil), cpflag) != 0)
+ ret = EFAULT;
+ return (ret);
+}
+
+/*
+ * Common unlink function. This is used both from the ioctl path and from the
+ * netstack shutdown path. The caller is required to hold the mutex on the
+ * vnd_dev_t, but they basically will have it relinquished for them. The only
+ * thing the caller is allowed to do afterward is to potentially rele the
+ * vnd_dev_t if they have their own hold. Note that only the ioctl path has its
+ * own hold.
+ */
+static void
+vnd_dev_unlink(vnd_dev_t *vdp)
+{
+ char mname[2*VND_NAMELEN];
+
+ ASSERT(MUTEX_HELD(&vdp->vdd_lock));
+
+ (void) snprintf(mname, sizeof (mname), "z%d:%s",
+ vdp->vdd_nsd->vpnd_zid, vdp->vdd_lname);
+ ddi_remove_minor_node(vnd_dip, mname);
+ vdp->vdd_lname[0] = '\0';
+ vdp->vdd_flags &= ~VND_D_LINKED;
+ kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname,
+ vdp->vdd_lname);
+ mutex_exit(&vdp->vdd_lock);
+
+ /*
+ * This rele corresponds to the reference that we took in
+ * vnd_ioctl_link.
+ */
+ vnd_dev_rele(vdp);
+}
+
+static int
+vnd_ioctl_unlink(vnd_dev_t *vdp, intptr_t arg, cred_t *credp, int cpflag)
+{
+ int ret;
+ zoneid_t zid;
+ vnd_ioc_unlink_t viu;
+
+ /* Not anyone can unlink something */
+ if (secpolicy_net_config(credp, B_FALSE) != 0)
+ return (EPERM);
+
+ zid = crgetzoneid(credp);
+
+ if (ddi_copyin((void *)arg, &viu, sizeof (viu), cpflag) != 0)
+ return (EFAULT);
+
+ viu.viu_errno = VND_E_SUCCESS;
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_LINKED)) {
+ mutex_exit(&vdp->vdd_lock);
+ ret = EIO;
+ viu.viu_errno = VND_E_NOTLINKED;
+ goto err;
+ }
+ VERIFY(vdp->vdd_flags & VND_D_ATTACHED);
+
+ if (zid != GLOBAL_ZONEID && zid != vdp->vdd_nsd->vpnd_zid) {
+ mutex_exit(&vdp->vdd_lock);
+ ret = EIO;
+ viu.viu_errno = VND_E_PERM;
+ goto err;
+ }
+
+ /* vnd_dev_unlink releases the vdp mutex for us */
+ vnd_dev_unlink(vdp);
+ ret = 0;
+err:
+ if (ddi_copyout(&viu, (void *)arg, sizeof (viu), cpflag) != 0)
+ return (EFAULT);
+
+ return (ret);
+}
+
+static int
+vnd_ioctl_setrxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag)
+{
+ int ret;
+ vnd_ioc_buf_t vib;
+
+ if (ddi_copyin((void *)arg, &vib, sizeof (vib), cpflag) != 0)
+ return (EFAULT);
+
+ mutex_enter(&vnd_dev_lock);
+ if (vib.vib_size > vnd_vdq_hard_max) {
+ mutex_exit(&vnd_dev_lock);
+ vib.vib_errno = VND_E_BUFTOOBIG;
+ ret = EIO;
+ goto err;
+ }
+ mutex_exit(&vnd_dev_lock);
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ mutex_exit(&vdp->vdd_lock);
+ vib.vib_errno = VND_E_NOTATTACHED;
+ ret = EIO;
+ goto err;
+ }
+
+ mutex_enter(&vdp->vdd_str->vns_lock);
+ if (vib.vib_size < vdp->vdd_str->vns_minwrite) {
+ mutex_exit(&vdp->vdd_str->vns_lock);
+ mutex_exit(&vdp->vdd_lock);
+ vib.vib_errno = VND_E_BUFTOOSMALL;
+ ret = EIO;
+ goto err;
+ }
+
+ mutex_exit(&vdp->vdd_str->vns_lock);
+ mutex_enter(&vdp->vdd_str->vns_dq_read.vdq_lock);
+ vdp->vdd_str->vns_dq_read.vdq_max = (size_t)vib.vib_size;
+ mutex_exit(&vdp->vdd_str->vns_dq_read.vdq_lock);
+ mutex_exit(&vdp->vdd_lock);
+ ret = 0;
+
+err:
+ if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0)
+ return (EFAULT);
+
+ return (ret);
+}
+
+static int
+vnd_ioctl_getrxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag)
+{
+ int ret;
+ vnd_ioc_buf_t vib;
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ mutex_exit(&vdp->vdd_lock);
+ vib.vib_errno = VND_E_NOTATTACHED;
+ ret = EIO;
+ goto err;
+ }
+
+ mutex_enter(&vdp->vdd_str->vns_dq_read.vdq_lock);
+ vib.vib_size = vdp->vdd_str->vns_dq_read.vdq_max;
+ mutex_exit(&vdp->vdd_str->vns_dq_read.vdq_lock);
+ mutex_exit(&vdp->vdd_lock);
+ ret = 0;
+
+err:
+ if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0)
+ return (EFAULT);
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+vnd_ioctl_getmaxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag)
+{
+ vnd_ioc_buf_t vib;
+
+ mutex_enter(&vnd_dev_lock);
+ vib.vib_size = vnd_vdq_hard_max;
+ mutex_exit(&vnd_dev_lock);
+
+ if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0)
+ return (EFAULT);
+
+ return (0);
+}
+
+static int
+vnd_ioctl_gettxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag)
+{
+ int ret;
+ vnd_ioc_buf_t vib;
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ mutex_exit(&vdp->vdd_lock);
+ vib.vib_errno = VND_E_NOTATTACHED;
+ ret = EIO;
+ goto err;
+ }
+
+ mutex_enter(&vdp->vdd_str->vns_dq_write.vdq_lock);
+ vib.vib_size = vdp->vdd_str->vns_dq_write.vdq_max;
+ mutex_exit(&vdp->vdd_str->vns_dq_write.vdq_lock);
+ mutex_exit(&vdp->vdd_lock);
+ ret = 0;
+
+err:
+ if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0)
+ return (EFAULT);
+
+ return (ret);
+}
+
+static int
+vnd_ioctl_settxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag)
+{
+ int ret;
+ vnd_ioc_buf_t vib;
+
+ if (ddi_copyin((void *)arg, &vib, sizeof (vib), cpflag) != 0)
+ return (EFAULT);
+
+ mutex_enter(&vnd_dev_lock);
+ if (vib.vib_size > vnd_vdq_hard_max) {
+ mutex_exit(&vnd_dev_lock);
+ vib.vib_errno = VND_E_BUFTOOBIG;
+ ret = EIO;
+ goto err;
+ }
+ mutex_exit(&vnd_dev_lock);
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ mutex_exit(&vdp->vdd_lock);
+ vib.vib_errno = VND_E_NOTATTACHED;
+ ret = EIO;
+ goto err;
+ }
+
+ mutex_enter(&vdp->vdd_str->vns_lock);
+ if (vib.vib_size < vdp->vdd_str->vns_minwrite) {
+ mutex_exit(&vdp->vdd_str->vns_lock);
+ mutex_exit(&vdp->vdd_lock);
+ vib.vib_errno = VND_E_BUFTOOSMALL;
+ ret = EIO;
+ goto err;
+ }
+ mutex_exit(&vdp->vdd_str->vns_lock);
+
+ mutex_enter(&vdp->vdd_str->vns_dq_write.vdq_lock);
+ vdp->vdd_str->vns_dq_write.vdq_max = (size_t)vib.vib_size;
+ mutex_exit(&vdp->vdd_str->vns_dq_write.vdq_lock);
+ mutex_exit(&vdp->vdd_lock);
+ ret = 0;
+
+err:
+ if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0)
+ return (EFAULT);
+
+ return (ret);
+}
+
+static int
+vnd_ioctl_gettu(vnd_dev_t *vdp, intptr_t arg, int mode, boolean_t min)
+{
+ vnd_ioc_buf_t vib;
+
+ vib.vib_errno = 0;
+ mutex_enter(&vdp->vdd_lock);
+ if (vdp->vdd_flags & VND_D_ATTACHED) {
+ mutex_enter(&vdp->vdd_str->vns_lock);
+ if (min == B_TRUE)
+ vib.vib_size = vdp->vdd_str->vns_minwrite;
+ else
+ vib.vib_size = vdp->vdd_str->vns_maxwrite;
+ mutex_exit(&vdp->vdd_str->vns_lock);
+ } else {
+ vib.vib_errno = VND_E_NOTATTACHED;
+ }
+ mutex_exit(&vdp->vdd_lock);
+
+ if (ddi_copyout(&vib, (void *)arg, sizeof (vib), mode & FKIOCTL) != 0)
+ return (EFAULT);
+
+ return (0);
+}
+
+static int
+vnd_frameio_read(vnd_dev_t *vdp, intptr_t addr, int mode)
+{
+ int ret, nonblock, nwrite;
+ frameio_t *fio;
+ vnd_data_queue_t *vqp;
+ mblk_t *mp;
+
+ fio = frameio_alloc(KM_NOSLEEP | KM_NORMALPRI);
+ if (fio == NULL)
+ return (EAGAIN);
+
+ ret = frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, (const void *)addr,
+ mode);
+ if (ret != 0) {
+ frameio_free(fio);
+ return (ret);
+ }
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ mutex_exit(&vdp->vdd_lock);
+ frameio_free(fio);
+ return (ENXIO);
+ }
+ mutex_exit(&vdp->vdd_lock);
+
+ nonblock = mode & (FNONBLOCK | FNDELAY);
+
+ vqp = &vdp->vdd_str->vns_dq_read;
+ mutex_enter(&vqp->vdq_lock);
+
+ /* Check empty case */
+ if (vqp->vdq_cur == 0) {
+ if (nonblock != 0) {
+ mutex_exit(&vqp->vdq_lock);
+ frameio_free(fio);
+ return (EWOULDBLOCK);
+ }
+ while (vqp->vdq_cur == 0) {
+ if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) {
+ mutex_exit(&vqp->vdq_lock);
+ frameio_free(fio);
+ return (EINTR);
+ }
+ }
+ }
+
+ ret = frameio_mblk_chain_write(fio, MAP_BLK_FRAME, vqp->vdq_head,
+ &nwrite, mode & FKIOCTL);
+ if (ret != 0) {
+ mutex_exit(&vqp->vdq_lock);
+ frameio_free(fio);
+ return (ret);
+ }
+
+ ret = frameio_hdr_copyout(fio, nwrite, (void *)addr, mode);
+ if (ret != 0) {
+ mutex_exit(&vqp->vdq_lock);
+ frameio_free(fio);
+ return (ret);
+ }
+
+ while (nwrite > 0) {
+ (void) vnd_dq_pop(vqp, &mp);
+ freemsg(mp);
+ nwrite--;
+ }
+ mutex_exit(&vqp->vdq_lock);
+ frameio_free(fio);
+
+ return (0);
+}
+
+static int
+vnd_frameio_write(vnd_dev_t *vdp, intptr_t addr, int mode)
+{
+ frameio_t *fio;
+ int ret, nonblock, nframes, i, nread;
+ size_t maxwrite, minwrite, total, flen;
+ mblk_t *mp_chain, *mp, *nmp;
+ vnd_data_queue_t *vqp;
+
+ fio = frameio_alloc(KM_NOSLEEP | KM_NORMALPRI);
+ if (fio == NULL)
+ return (EAGAIN);
+
+ ret = frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, (void *)addr, mode);
+ if (ret != 0) {
+ frameio_free(fio);
+ return (ret);
+ }
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ mutex_exit(&vdp->vdd_lock);
+ frameio_free(fio);
+ return (ENXIO);
+ }
+ mutex_exit(&vdp->vdd_lock);
+
+ nonblock = mode & (FNONBLOCK | FNDELAY);
+
+ /*
+ * Make sure no single frame is larger than we can accept.
+ */
+ mutex_enter(&vdp->vdd_str->vns_lock);
+ minwrite = vdp->vdd_str->vns_minwrite;
+ maxwrite = vdp->vdd_str->vns_maxwrite;
+ mutex_exit(&vdp->vdd_str->vns_lock);
+
+ nframes = fio->fio_nvpf / fio->fio_nvecs;
+ total = 0;
+ for (i = 0; i < nframes; i++) {
+ flen = frameio_frame_length(fio,
+ &fio->fio_vecs[i*fio->fio_nvpf]);
+ if (flen < minwrite || flen > maxwrite) {
+ frameio_free(fio);
+ return (ERANGE);
+ }
+ total += flen;
+ }
+
+ vqp = &vdp->vdd_str->vns_dq_write;
+ mutex_enter(&vqp->vdq_lock);
+ while (vnd_dq_reserve(vqp, total) == 0) {
+ if (nonblock != 0) {
+ frameio_free(fio);
+ mutex_exit(&vqp->vdq_lock);
+ return (EAGAIN);
+ }
+ if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) {
+ mutex_exit(&vqp->vdq_lock);
+ frameio_free(fio);
+ return (EINTR);
+ }
+ }
+ mutex_exit(&vqp->vdq_lock);
+
+ /*
+ * We've reserved our space, let's copyin and go from here.
+ */
+ ret = frameio_mblk_chain_read(fio, &mp_chain, &nread, mode & FKIOCTL);
+ if (ret != 0) {
+ frameio_free(fio);
+ vnd_dq_unreserve(vqp, total);
+ cv_broadcast(&vqp->vdq_ready);
+ pollwakeup(&vdp->vdd_ph, POLLOUT);
+ return (ret);
+ }
+
+ for (mp = mp_chain; mp != NULL; mp = nmp) {
+ nmp = mp->b_next;
+ mp->b_next = NULL;
+ gsqueue_enter_one(vdp->vdd_str->vns_squeue, mp,
+ vnd_squeue_tx_append, vdp->vdd_str, GSQUEUE_PROCESS,
+ VND_SQUEUE_TAG_VND_WRITE);
+ }
+
+ /*
+ * Update the frameio structure to indicate that we wrote those frames.
+ */
+ frameio_mark_consumed(fio, nread);
+ ret = frameio_hdr_copyout(fio, nread, (void *)addr, mode);
+ frameio_free(fio);
+
+ return (ret);
+}
+
+static int
+vnd_ioctl_list_copy_info(vnd_dev_t *vdp, vnd_ioc_info_t *arg, int mode)
+{
+ const char *link;
+ uint32_t vers = 1;
+ ASSERT(MUTEX_HELD(&vdp->vdd_lock));
+
+ /*
+ * Copy all of the members out to userland.
+ */
+ if (ddi_copyout(&vers, &arg->vii_version, sizeof (uint32_t),
+ mode & FKIOCTL) != 0)
+ return (EFAULT);
+
+ if (vdp->vdd_flags & VND_D_LINKED)
+ link = vdp->vdd_lname;
+ else
+ link = "<anonymous>";
+ if (ddi_copyout(link, arg->vii_name, sizeof (arg->vii_name),
+ mode & FKIOCTL) != 0)
+ return (EFAULT);
+
+ if (ddi_copyout(vdp->vdd_datalink, arg->vii_datalink,
+ sizeof (arg->vii_datalink), mode & FKIOCTL) != 0)
+ return (EFAULT);
+
+ if (ddi_copyout(&vdp->vdd_nsd->vpnd_zid, &arg->vii_zone,
+ sizeof (zoneid_t), mode & FKIOCTL) != 0)
+ return (EFAULT);
+ return (0);
+}
+
+static int
+vnd_ioctl_list(intptr_t arg, cred_t *credp, int mode)
+{
+ vnd_ioc_list_t vl;
+ vnd_ioc_list32_t vl32;
+ zoneid_t zid;
+ vnd_dev_t *vdp;
+ vnd_ioc_info_t *vip;
+ int found, cancopy, ret;
+
+ if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
+ if (ddi_copyin((void *)arg, &vl32, sizeof (vnd_ioc_list32_t),
+ mode & FKIOCTL) != 0)
+ return (EFAULT);
+ vl.vl_nents = vl32.vl_nents;
+ vl.vl_actents = vl32.vl_actents;
+ vl.vl_ents = (void *)(uintptr_t)vl32.vl_ents;
+ } else {
+ if (ddi_copyin((void *)arg, &vl, sizeof (vnd_ioc_list_t),
+ mode & FKIOCTL) != 0)
+ return (EFAULT);
+ }
+
+ cancopy = vl.vl_nents;
+ vip = vl.vl_ents;
+ found = 0;
+ zid = crgetzoneid(credp);
+ mutex_enter(&vnd_dev_lock);
+ for (vdp = list_head(&vnd_dev_list); vdp != NULL;
+ vdp = list_next(&vnd_dev_list, vdp)) {
+ mutex_enter(&vdp->vdd_lock);
+ if (vdp->vdd_flags & VND_D_ATTACHED &&
+ !(vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING)) &&
+ (zid == GLOBAL_ZONEID || zid == vdp->vdd_nsd->vpnd_zid)) {
+ found++;
+ if (cancopy > 0) {
+ ret = vnd_ioctl_list_copy_info(vdp, vip, mode);
+ if (ret != 0) {
+ mutex_exit(&vdp->vdd_lock);
+ mutex_exit(&vnd_dev_lock);
+ return (ret);
+ }
+ cancopy--;
+ vip++;
+ }
+ }
+ mutex_exit(&vdp->vdd_lock);
+ }
+ mutex_exit(&vnd_dev_lock);
+
+ if (ddi_copyout(&found, &((vnd_ioc_list_t *)arg)->vl_actents,
+ sizeof (uint_t), mode & FKIOCTL) != 0)
+ return (EFAULT);
+
+ return (0);
+}
+
+
+/* ARGSUSED */
+static int
+vnd_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
+ int *rvalp)
+{
+ int ret;
+ minor_t m;
+ vnd_dev_t *vdp;
+
+ m = getminor(dev);
+ ASSERT(m != 0);
+
+ /*
+ * Make sure no one has come in on an ioctl from the strioc case.
+ */
+ if ((cmd & VND_STRIOC) == VND_STRIOC)
+ return (ENOTTY);
+
+ /*
+ * Like close, seems like if this minor isn't found, it's a programmer
+ * error somehow.
+ */
+ vdp = vnd_dev_lookup(m);
+ if (vdp == NULL)
+ return (ENXIO);
+
+ switch (cmd) {
+ case VND_IOC_ATTACH:
+ if (!(mode & FWRITE)) {
+ ret = EBADF;
+ break;
+ }
+ ret = vnd_ioctl_attach(vdp, arg, credp, mode);
+ break;
+ case VND_IOC_LINK:
+ if (!(mode & FWRITE)) {
+ ret = EBADF;
+ break;
+ }
+ ret = vnd_ioctl_link(vdp, arg, credp, mode);
+ break;
+ case VND_IOC_UNLINK:
+ if (!(mode & FWRITE)) {
+ ret = EBADF;
+ break;
+ }
+ ret = vnd_ioctl_unlink(vdp, arg, credp, mode);
+ break;
+ case VND_IOC_GETRXBUF:
+ if (!(mode & FREAD)) {
+ ret = EBADF;
+ break;
+ }
+ ret = vnd_ioctl_getrxbuf(vdp, arg, mode);
+ break;
+ case VND_IOC_SETRXBUF:
+ if (!(mode & FWRITE)) {
+ ret = EBADF;
+ break;
+ }
+ ret = vnd_ioctl_setrxbuf(vdp, arg, mode);
+ break;
+ case VND_IOC_GETTXBUF:
+ if (!(mode & FREAD)) {
+ ret = EBADF;
+ break;
+ }
+ ret = vnd_ioctl_gettxbuf(vdp, arg, mode);
+ break;
+ case VND_IOC_SETTXBUF:
+ if (!(mode & FWRITE)) {
+ ret = EBADF;
+ break;
+ }
+ ret = vnd_ioctl_settxbuf(vdp, arg, mode);
+ break;
+ case VND_IOC_GETMAXBUF:
+ if (!(mode & FREAD)) {
+ ret = EBADF;
+ break;
+ }
+ if (crgetzoneid(credp) != GLOBAL_ZONEID) {
+ ret = EPERM;
+ break;
+ }
+ ret = vnd_ioctl_getmaxbuf(vdp, arg, mode);
+ break;
+ case VND_IOC_GETMINTU:
+ if (!(mode & FREAD)) {
+ ret = EBADF;
+ break;
+ }
+ ret = vnd_ioctl_gettu(vdp, arg, mode, B_TRUE);
+ break;
+ case VND_IOC_GETMAXTU:
+ if (!(mode & FREAD)) {
+ ret = EBADF;
+ break;
+ }
+ ret = vnd_ioctl_gettu(vdp, arg, mode, B_FALSE);
+ break;
+ case VND_IOC_FRAMEIO_READ:
+ if (!(mode & FREAD)) {
+ ret = EBADF;
+ break;
+ }
+ ret = vnd_frameio_read(vdp, arg, mode);
+ break;
+ case VND_IOC_FRAMEIO_WRITE:
+ if (!(mode & FWRITE)) {
+ ret = EBADF;
+ break;
+ }
+ ret = vnd_frameio_write(vdp, arg, mode);
+ break;
+ case VND_IOC_LIST:
+ if (!(mode & FREAD)) {
+ ret = EBADF;
+ break;
+ }
+ ret = vnd_ioctl_list(arg, credp, mode);
+ break;
+ default:
+ ret = ENOTTY;
+ break;
+ }
+
+ vnd_dev_rele(vdp);
+ return (ret);
+}
+
+static int
+vnd_open(dev_t *devp, int flag, int otyp, cred_t *credp)
+{
+ vnd_dev_t *vdp;
+ minor_t m;
+ zoneid_t zid;
+
+ if (flag & (FEXCL | FNDELAY))
+ return (ENOTSUP);
+
+ if (otyp & OTYP_BLK)
+ return (ENOTSUP);
+
+ zid = crgetzoneid(credp);
+ m = getminor(*devp);
+
+ /*
+ * If we have an open of a non-zero instance then we need to look that
+ * up in our list of entries.
+ */
+ if (m != 0) {
+
+ /*
+ * We don't check for rawaccess globally as a user could be
+ * doing a list ioctl on the control node which doesn't require
+ * this privilege.
+ */
+ if (secpolicy_net_rawaccess(credp) != 0)
+ return (EPERM);
+
+
+ vdp = vnd_dev_lookup(m);
+ if (vdp == NULL)
+ return (ENOENT);
+
+ /*
+ * We need to check to make sure that the user is allowed to
+ * open this node. At this point it should be an attached handle
+ * as that's all we're allowed to access.
+ */
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_LINKED)) {
+ mutex_exit(&vdp->vdd_lock);
+ vnd_dev_rele(vdp);
+ return (ENOENT);
+ }
+
+ if (vdp->vdd_flags & VND_D_ZONE_DYING) {
+ mutex_exit(&vdp->vdd_lock);
+ vnd_dev_rele(vdp);
+ return (ENOENT);
+ }
+
+ if (zid != GLOBAL_ZONEID && zid != vdp->vdd_nsd->vpnd_zid) {
+ mutex_exit(&vdp->vdd_lock);
+ vnd_dev_rele(vdp);
+ return (ENOENT);
+ }
+
+ if ((flag & FEXCL) && (vdp->vdd_flags & VND_D_OPENED)) {
+ mutex_exit(&vdp->vdd_lock);
+ vnd_dev_rele(vdp);
+ return (EBUSY);
+ }
+
+ if (!(vdp->vdd_flags & VND_D_OPENED)) {
+ vdp->vdd_flags |= VND_D_OPENED;
+ vdp->vdd_ref++;
+ DTRACE_VND_REFINC(vdp);
+ }
+ mutex_exit(&vdp->vdd_lock);
+ vnd_dev_rele(vdp);
+
+ return (0);
+ }
+
+ if (flag & FEXCL)
+ return (ENOTSUP);
+
+ /*
+ * We need to clone ourselves and set up new a state.
+ */
+ vdp = kmem_cache_alloc(vnd_dev_cache, KM_SLEEP);
+ bzero(vdp, sizeof (vnd_dev_t));
+
+ if (ldi_ident_from_dev(*devp, &vdp->vdd_ldiid) != 0) {
+ kmem_cache_free(vnd_dev_cache, vdp);
+ return (EINVAL);
+ }
+
+ vdp->vdd_minor = id_alloc(vnd_minors);
+ mutex_init(&vdp->vdd_lock, NULL, MUTEX_DRIVER, NULL);
+ list_link_init(&vdp->vdd_link);
+ vdp->vdd_ref = 1;
+ *devp = makedevice(getmajor(*devp), vdp->vdd_minor);
+ vdp->vdd_devid = *devp;
+ DTRACE_VND_REFINC(vdp);
+ vdp->vdd_flags |= VND_D_OPENED;
+
+ mutex_enter(&vnd_dev_lock);
+ list_insert_head(&vnd_dev_list, vdp);
+ mutex_exit(&vnd_dev_lock);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+vnd_close(dev_t dev, int flag, int otyp, cred_t *credp)
+{
+ minor_t m;
+ vnd_dev_t *vdp;
+
+ m = getminor(dev);
+ if (m == 0)
+ return (ENXIO);
+
+ vdp = vnd_dev_lookup(m);
+ if (vdp == NULL)
+ return (ENXIO);
+
+ mutex_enter(&vdp->vdd_lock);
+ VERIFY(vdp->vdd_flags & VND_D_OPENED);
+ vdp->vdd_flags &= ~VND_D_OPENED;
+ mutex_exit(&vdp->vdd_lock);
+
+ /* Remove the hold from the previous open. */
+ vnd_dev_rele(vdp);
+
+ /* And now from lookup */
+ vnd_dev_rele(vdp);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+vnd_read(dev_t dev, struct uio *uiop, cred_t *credp)
+{
+ int nonblock, error = 0;
+ size_t mpsize;
+ vnd_dev_t *vdp;
+ vnd_data_queue_t *vqp;
+ mblk_t *mp = NULL;
+ offset_t u_loffset;
+
+ /*
+ * If we have more than one uio we refuse to do anything. That's for
+ * frameio.
+ */
+ if (uiop->uio_iovcnt > 1)
+ return (EINVAL);
+
+ vdp = vnd_dev_lookup(getminor(dev));
+ if (vdp == NULL)
+ return (ENXIO);
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ mutex_exit(&vdp->vdd_lock);
+ vnd_dev_rele(vdp);
+ return (ENXIO);
+ }
+ mutex_exit(&vdp->vdd_lock);
+ nonblock = uiop->uio_fmode & (FNONBLOCK | FNDELAY);
+
+ vqp = &vdp->vdd_str->vns_dq_read;
+ mutex_enter(&vqp->vdq_lock);
+
+ /* Check empty case */
+ if (vqp->vdq_cur == 0) {
+ if (nonblock != 0) {
+ error = EWOULDBLOCK;
+ goto err;
+ }
+ while (vqp->vdq_cur == 0) {
+ if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) {
+ error = EINTR;
+ goto err;
+ }
+ }
+ }
+
+ /* Ensure our buffer is big enough */
+ mp = vqp->vdq_head;
+ ASSERT(mp != NULL);
+ mpsize = msgsize(mp);
+ if (mpsize > uiop->uio_resid) {
+ error = EOVERFLOW;
+ goto err;
+ }
+
+ u_loffset = uiop->uio_loffset;
+ while (mp != NULL) {
+ if (uiomove(mp->b_rptr, MBLKL(mp), UIO_READ, uiop) != 0) {
+ error = EFAULT;
+ uiop->uio_loffset = u_loffset;
+ mp = NULL;
+ goto err;
+ }
+ mpsize -= MBLKL(mp);
+ mp = mp->b_cont;
+ }
+ ASSERT(mpsize == 0);
+ (void) vnd_dq_pop(vqp, &mp);
+ freemsg(mp);
+err:
+ mutex_exit(&vqp->vdq_lock);
+ vnd_dev_rele(vdp);
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+vnd_write(dev_t dev, struct uio *uiop, cred_t *credp)
+{
+ int nonblock, error;
+ vnd_dev_t *vdp;
+ mblk_t *mp;
+ ssize_t iosize, origsize;
+ vnd_data_queue_t *vqp;
+
+ if (uiop->uio_iovcnt > 1)
+ return (EINVAL);
+
+ vdp = vnd_dev_lookup(getminor(dev));
+ if (vdp == NULL)
+ return (ENXIO);
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ mutex_exit(&vdp->vdd_lock);
+ vnd_dev_rele(vdp);
+ return (ENXIO);
+ }
+ mutex_exit(&vdp->vdd_lock);
+ nonblock = uiop->uio_fmode & (FNONBLOCK | FNDELAY);
+
+ mutex_enter(&vdp->vdd_str->vns_lock);
+ if (uiop->uio_resid > vdp->vdd_str->vns_maxwrite ||
+ uiop->uio_resid < vdp->vdd_str->vns_minwrite) {
+ mutex_exit(&vdp->vdd_str->vns_lock);
+ vnd_dev_rele(vdp);
+ return (ERANGE);
+ }
+ mutex_exit(&vdp->vdd_str->vns_lock);
+ VERIFY(vdp->vdd_str != NULL);
+
+ /*
+ * Reserve space in the data queue if we can. If we can't, block or
+ * return EAGAIN. If we can, go and squeue_enter.
+ */
+ vqp = &vdp->vdd_str->vns_dq_write;
+ mutex_enter(&vqp->vdq_lock);
+ while (vnd_dq_reserve(vqp, uiop->uio_resid) == 0) {
+ if (nonblock != 0) {
+ mutex_exit(&vqp->vdq_lock);
+ vnd_dev_rele(vdp);
+ return (EAGAIN);
+ }
+ if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) {
+ mutex_exit(&vqp->vdq_lock);
+ vnd_dev_rele(vdp);
+ return (EINTR);
+ }
+ }
+ mutex_exit(&vqp->vdq_lock);
+
+ /*
+ * Now that we've reserved the space, try to allocate kernel space for
+ * and copy in the block. To take care of all this we use the
+ * strmakedata subroutine for now.
+ */
+ origsize = iosize = uiop->uio_resid;
+ error = strmakedata(&iosize, uiop, vdp->vdd_str->vns_wq->q_stream, 0,
+ &mp);
+
+ /*
+ * strmakedata() will return an error or it may only consume a portion
+ * of the data.
+ */
+ if (error != 0 || uiop->uio_resid != 0) {
+ vnd_dq_unreserve(vqp, origsize);
+ cv_broadcast(&vqp->vdq_ready);
+ pollwakeup(&vdp->vdd_ph, POLLOUT);
+ vnd_dev_rele(vdp);
+ return (ENOSR);
+ }
+
+ gsqueue_enter_one(vdp->vdd_str->vns_squeue, mp,
+ vnd_squeue_tx_append, vdp->vdd_str, GSQUEUE_PROCESS,
+ VND_SQUEUE_TAG_VND_WRITE);
+
+ vnd_dev_rele(vdp);
+ return (0);
+}
+
+static int
+vnd_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
+ struct pollhead **phpp)
+{
+ short ready = 0;
+ vnd_dev_t *vdp;
+ vnd_data_queue_t *vqp;
+
+ vdp = vnd_dev_lookup(getminor(dev));
+ if (vdp == NULL)
+ return (ENXIO);
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ mutex_exit(&vdp->vdd_lock);
+ vnd_dev_rele(vdp);
+ return (ENXIO);
+ }
+ mutex_exit(&vdp->vdd_lock);
+
+ if ((events & POLLIN) || (events & POLLRDNORM)) {
+ vqp = &vdp->vdd_str->vns_dq_read;
+ mutex_enter(&vqp->vdq_lock);
+ if (vqp->vdq_head != NULL)
+ ready |= events & (POLLIN | POLLRDNORM);
+ mutex_exit(&vqp->vdq_lock);
+ }
+
+ if (events & POLLOUT) {
+ vqp = &vdp->vdd_str->vns_dq_write;
+ mutex_enter(&vqp->vdq_lock);
+ if (vqp->vdq_cur != vqp->vdq_max)
+ ready |= POLLOUT;
+ mutex_exit(&vqp->vdq_lock);
+ }
+
+ if ((ready == 0 && !anyyet) || (events & POLLET)) {
+ *phpp = &vdp->vdd_ph;
+ }
+ *reventsp = ready;
+ vnd_dev_rele(vdp);
+ return (0);
+}
+
+/* ARGSUSED */
+static void *
+vnd_stack_init(netstackid_t stackid, netstack_t *ns)
+{
+ vnd_pnsd_t *nsp;
+
+ nsp = kmem_cache_alloc(vnd_pnsd_cache, KM_SLEEP);
+ bzero(nsp, sizeof (*nsp));
+ nsp->vpnd_nsid = stackid;
+ nsp->vpnd_zid = netstackid_to_zoneid(stackid);
+ nsp->vpnd_flags = 0;
+ mutex_init(&nsp->vpnd_lock, NULL, MUTEX_DRIVER, NULL);
+ list_create(&nsp->vpnd_dev_list, sizeof (vnd_dev_t),
+ offsetof(vnd_dev_t, vdd_nslink));
+ if (vnd_netinfo_init(nsp) == 0)
+ nsp->vpnd_hooked = B_TRUE;
+
+ mutex_enter(&vnd_dev_lock);
+ list_insert_tail(&vnd_nsd_list, nsp);
+ mutex_exit(&vnd_dev_lock);
+
+ return (nsp);
+}
+
+/* ARGSUSED */
+static void
+vnd_stack_shutdown(netstackid_t stackid, void *arg)
+{
+ vnd_pnsd_t *nsp = arg;
+ vnd_dev_t *vdp;
+
+ ASSERT(nsp != NULL);
+ /*
+ * After shut down no one should be able to find their way to this
+ * netstack again.
+ */
+ mutex_enter(&vnd_dev_lock);
+ list_remove(&vnd_nsd_list, nsp);
+ mutex_exit(&vnd_dev_lock);
+
+ /*
+ * Make sure hooks know that they're going away.
+ */
+ if (nsp->vpnd_hooked == B_TRUE)
+ vnd_netinfo_shutdown(nsp);
+
+ /*
+ * Now we need to go through and notify each zone that they are in
+ * teardown phase. See the big theory statement section on vnd, zones,
+ * netstacks, and sdev for more information about this.
+ */
+ mutex_enter(&nsp->vpnd_lock);
+ nsp->vpnd_flags |= VND_NS_CONDEMNED;
+ for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL;
+ vdp = list_next(&nsp->vpnd_dev_list, vdp)) {
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_CONDEMNED))
+ vdp->vdd_flags |= VND_D_ZONE_DYING;
+ mutex_exit(&vdp->vdd_lock);
+ }
+ mutex_exit(&nsp->vpnd_lock);
+
+ /*
+ * Next we remove all the links as we know nothing new can be added to
+ * the list and that none of the extent devices can obtain additional
+ * links.
+ */
+restart:
+ mutex_enter(&nsp->vpnd_lock);
+ for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL;
+ vdp = list_next(&nsp->vpnd_dev_list, vdp)) {
+ mutex_enter(&vdp->vdd_lock);
+ if ((vdp->vdd_flags & VND_D_CONDEMNED) ||
+ !(vdp->vdd_flags & VND_D_LINKED)) {
+ mutex_exit(&vdp->vdd_lock);
+ continue;
+ }
+
+ /*
+ * We drop our lock here and restart afterwards. Note that as
+ * part of unlinking we end up doing a rele of the vnd_dev_t. If
+ * this is the final hold on the vnd_dev_t then it might try and
+ * remove itself. Our locking rules requires not to be holding
+ * any locks when we call any of the rele functions.
+ *
+ * Note that the unlink function requires holders to call into
+ * it with the vnd_dev_t->vdd_lock held and will take care of it
+ * for us. Because we don't have a hold on it, we're done at
+ * this point.
+ */
+ mutex_exit(&nsp->vpnd_lock);
+ /* Forcibly unlink */
+ vnd_dev_unlink(vdp);
+ goto restart;
+ }
+ mutex_exit(&nsp->vpnd_lock);
+}
+
+/* ARGSUSED */
+static void
+vnd_stack_destroy(netstackid_t stackid, void *arg)
+{
+ vnd_pnsd_t *nsp = arg;
+
+ ASSERT(nsp != NULL);
+
+ /*
+ * Now that we've unlinked everything we just have to hang out for
+ * it to finish exiting. Now that it's no longer the kernel itself
+ * that's doing this we just need to wait for our reference count to
+ * equal zero and then we're free. If the global zone is holding open a
+ * reference to a vnd device for another zone, that's bad, but there's
+ * nothing much we can do. See the section on 'vnd, zones, netstacks' in
+ * the big theory statement for more information.
+ */
+ mutex_enter(&nsp->vpnd_lock);
+ while (nsp->vpnd_ref != 0)
+ cv_wait(&nsp->vpnd_ref_change, &nsp->vpnd_lock);
+ mutex_exit(&nsp->vpnd_lock);
+
+ /*
+ * During shutdown we removed ourselves from the list and now we have no
+ * more references so we can safely say that there is nothing left and
+ * destroy everything that we had sitting around.
+ */
+ if (nsp->vpnd_hooked == B_TRUE)
+ vnd_netinfo_fini(nsp);
+
+ mutex_destroy(&nsp->vpnd_lock);
+ list_destroy(&nsp->vpnd_dev_list);
+ kmem_cache_free(vnd_pnsd_cache, nsp);
+}
+
+/*
+ * Convert a node with a name of the form /dev/vnd/zone/%zonename and
+ * /dev/vnd/zone/%zonename/%linkname to the corresponding vnd netstack.
+ */
+static vnd_pnsd_t *
+vnd_sdev_ctx_to_ns(sdev_ctx_t ctx)
+{
+ enum vtype vt;
+ const char *path = sdev_ctx_path(ctx);
+ char *zstart, *dup;
+ size_t duplen;
+ vnd_pnsd_t *nsp;
+
+ vt = sdev_ctx_vtype(ctx);
+ ASSERT(strncmp(path, VND_SDEV_ZROOT, strlen(VND_SDEV_ZROOT)) == 0);
+
+ if (vt == VDIR) {
+ zstart = strrchr(path, '/');
+ ASSERT(zstart != NULL);
+ zstart++;
+ return (vnd_nsd_lookup_by_zonename(zstart));
+ }
+
+ ASSERT(vt == VCHR);
+
+ dup = strdup(path);
+ duplen = strlen(dup) + 1;
+ zstart = strrchr(dup, '/');
+ *zstart = '\0';
+ zstart--;
+ zstart = strrchr(dup, '/');
+ zstart++;
+ nsp = vnd_nsd_lookup_by_zonename(zstart);
+ kmem_free(dup, duplen);
+
+ return (nsp);
+}
+
+static sdev_plugin_validate_t
+vnd_sdev_validate_dir(sdev_ctx_t ctx)
+{
+ vnd_pnsd_t *nsp;
+
+ if (strcmp(sdev_ctx_path(ctx), VND_SDEV_ROOT) == 0)
+ return (SDEV_VTOR_VALID);
+
+ if (strcmp(sdev_ctx_path(ctx), VND_SDEV_ZROOT) == 0) {
+ ASSERT(getzoneid() == GLOBAL_ZONEID);
+ ASSERT(sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL);
+ return (SDEV_VTOR_VALID);
+ }
+
+ nsp = vnd_sdev_ctx_to_ns(ctx);
+ if (nsp == NULL)
+ return (SDEV_VTOR_INVALID);
+ vnd_nsd_rele(nsp);
+
+ return (SDEV_VTOR_VALID);
+}
+
+static sdev_plugin_validate_t
+vnd_sdev_validate(sdev_ctx_t ctx)
+{
+ enum vtype vt;
+ vnd_dev_t *vdp;
+ minor_t minor;
+
+ vt = sdev_ctx_vtype(ctx);
+ if (vt == VDIR)
+ return (vnd_sdev_validate_dir(ctx));
+ ASSERT(vt == VCHR);
+
+ if (strcmp("ctl", sdev_ctx_name(ctx)) == 0)
+ return (SDEV_VTOR_VALID);
+
+ if (sdev_ctx_minor(ctx, &minor) != 0)
+ return (SDEV_VTOR_STALE);
+
+ vdp = vnd_dev_lookup(minor);
+ if (vdp == NULL)
+ return (SDEV_VTOR_STALE);
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_LINKED) ||
+ (vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING))) {
+ mutex_exit(&vdp->vdd_lock);
+ vnd_dev_rele(vdp);
+ return (SDEV_VTOR_STALE);
+ }
+
+ if (strcmp(sdev_ctx_name(ctx), vdp->vdd_lname) != 0) {
+ mutex_exit(&vdp->vdd_lock);
+ vnd_dev_rele(vdp);
+ return (SDEV_VTOR_STALE);
+ }
+
+ mutex_exit(&vdp->vdd_lock);
+ vnd_dev_rele(vdp);
+ return (SDEV_VTOR_VALID);
+}
+
+/*
+ * This function is a no-op. sdev never has holds on our devices as they can go
+ * away at any time and specfs has to deal with that fact.
+ */
+/* ARGSUSED */
+static void
+vnd_sdev_inactive(sdev_ctx_t ctx)
+{
+}
+
+static int
+vnd_sdev_fillzone(vnd_pnsd_t *nsp, sdev_ctx_t ctx)
+{
+ int ret;
+ vnd_dev_t *vdp;
+
+ mutex_enter(&nsp->vpnd_lock);
+ for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL;
+ vdp = list_next(&nsp->vpnd_dev_list, vdp)) {
+ mutex_enter(&vdp->vdd_lock);
+ if ((vdp->vdd_flags & VND_D_LINKED) &&
+ !(vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING))) {
+ ret = sdev_plugin_mknod(ctx, vdp->vdd_lname,
+ VND_SDEV_MODE, vdp->vdd_devid);
+ if (ret != 0 && ret != EEXIST) {
+ mutex_exit(&vdp->vdd_lock);
+ mutex_exit(&nsp->vpnd_lock);
+ vnd_nsd_rele(nsp);
+ return (ret);
+ }
+ }
+ mutex_exit(&vdp->vdd_lock);
+ }
+ mutex_exit(&nsp->vpnd_lock);
+
+ return (0);
+}
+
+static int
+vnd_sdev_filldir_root(sdev_ctx_t ctx)
+{
+ zoneid_t zid;
+ vnd_pnsd_t *nsp;
+ int ret;
+
+ zid = getzoneid();
+ nsp = vnd_nsd_lookup(zoneid_to_netstackid(zid));
+ ASSERT(nsp != NULL);
+ ret = vnd_sdev_fillzone(nsp, ctx);
+ vnd_nsd_rele(nsp);
+ if (ret != 0)
+ return (ret);
+
+ /*
+ * Checking the zone id is not sufficient as the global zone could be
+ * reaching down into a non-global zone's mounted /dev.
+ */
+ if (zid == GLOBAL_ZONEID && (sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL)) {
+ ret = sdev_plugin_mkdir(ctx, "zone");
+ if (ret != 0 && ret != EEXIST)
+ return (ret);
+ }
+
+ /*
+ * Always add a reference to the control node. There's no need to
+ * reference it since it always exists and is always what we clone from.
+ */
+ ret = sdev_plugin_mknod(ctx, "ctl", VND_SDEV_MODE,
+ makedevice(ddi_driver_major(vnd_dip), 0));
+ if (ret != 0 && ret != EEXIST)
+ return (ret);
+
+ return (0);
+}
+
+static int
+vnd_sdev_filldir_zroot(sdev_ctx_t ctx)
+{
+ int ret;
+ vnd_pnsd_t *nsp;
+ zone_t *zonep;
+
+ ASSERT(getzoneid() == GLOBAL_ZONEID);
+ ASSERT(sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL);
+
+ mutex_enter(&vnd_dev_lock);
+ for (nsp = list_head(&vnd_nsd_list); nsp != NULL;
+ nsp = list_next(&vnd_nsd_list, nsp)) {
+ mutex_enter(&nsp->vpnd_lock);
+ if (list_is_empty(&nsp->vpnd_dev_list)) {
+ mutex_exit(&nsp->vpnd_lock);
+ continue;
+ }
+ mutex_exit(&nsp->vpnd_lock);
+ zonep = zone_find_by_id(nsp->vpnd_zid);
+ /*
+ * This zone must be being torn down, so skip it.
+ */
+ if (zonep == NULL)
+ continue;
+ ret = sdev_plugin_mkdir(ctx, zonep->zone_name);
+ zone_rele(zonep);
+ if (ret != 0 && ret != EEXIST) {
+ mutex_exit(&vnd_dev_lock);
+ return (ret);
+ }
+ }
+ mutex_exit(&vnd_dev_lock);
+ return (0);
+}
+
+static int
+vnd_sdev_filldir(sdev_ctx_t ctx)
+{
+ int ret;
+ vnd_pnsd_t *nsp;
+
+ ASSERT(sdev_ctx_vtype(ctx) == VDIR);
+ if (strcmp(VND_SDEV_ROOT, sdev_ctx_path(ctx)) == 0)
+ return (vnd_sdev_filldir_root(ctx));
+
+ if (strcmp(VND_SDEV_ZROOT, sdev_ctx_path(ctx)) == 0)
+ return (vnd_sdev_filldir_zroot(ctx));
+
+ ASSERT(strncmp(VND_SDEV_ZROOT, sdev_ctx_path(ctx),
+ strlen(VND_SDEV_ZROOT)) == 0);
+ nsp = vnd_sdev_ctx_to_ns(ctx);
+ if (nsp == NULL)
+ return (0);
+
+ ret = vnd_sdev_fillzone(nsp, ctx);
+ vnd_nsd_rele(nsp);
+
+ return (ret);
+}
+
+static sdev_plugin_ops_t vnd_sdev_ops = {
+ SDEV_PLUGIN_VERSION,
+ SDEV_PLUGIN_SUBDIR,
+ vnd_sdev_validate,
+ vnd_sdev_filldir,
+ vnd_sdev_inactive
+};
+
+static int
+vnd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ int errp = 0;
+
+ if (cmd != DDI_ATTACH)
+ return (DDI_FAILURE);
+
+ /*
+ * Only allow one instance.
+ */
+ if (vnd_dip != NULL)
+ return (DDI_FAILURE);
+
+ vnd_dip = dip;
+ if (ddi_create_minor_node(vnd_dip, "vnd", S_IFCHR, 0, DDI_PSEUDO, 0) !=
+ DDI_SUCCESS) {
+ vnd_dip = NULL;
+ return (DDI_FAILURE);
+ }
+
+ if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
+ DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
+ ddi_remove_minor_node(vnd_dip, NULL);
+ vnd_dip = NULL;
+ return (DDI_FAILURE);
+ }
+
+ vnd_sdev_hdl = sdev_plugin_register(VND_SDEV_NAME, &vnd_sdev_ops,
+ &errp);
+ if (vnd_sdev_hdl == (sdev_plugin_hdl_t)NULL) {
+ ddi_remove_minor_node(vnd_dip, NULL);
+ ddi_prop_remove_all(vnd_dip);
+ vnd_dip = NULL;
+ return (DDI_FAILURE);
+ }
+
+ vnd_sqset = gsqueue_set_create(GSQUEUE_DEFAULT_PRIORITY);
+
+ return (DDI_SUCCESS);
+}
+
+/* ARGSUSED */
+static int
+vnd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ if (cmd != DDI_DETACH)
+ return (DDI_FAILURE);
+
+ mutex_enter(&vnd_dev_lock);
+ if (!list_is_empty(&vnd_dev_list)) {
+ mutex_exit(&vnd_dev_lock);
+ return (DDI_FAILURE);
+ }
+ mutex_exit(&vnd_dev_lock);
+
+ return (DDI_FAILURE);
+}
+
+/* ARGSUSED */
+static int
+vnd_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
+{
+ int error;
+
+ switch (cmd) {
+ case DDI_INFO_DEVT2DEVINFO:
+ *result = (void *)vnd_dip;
+ error = DDI_SUCCESS;
+ break;
+ case DDI_INFO_DEVT2INSTANCE:
+ *result = (void *)0;
+ error = DDI_SUCCESS;
+ break;
+ default:
+ error = DDI_FAILURE;
+ break;
+ }
+ return (error);
+}
+
+
+
+static void
+vnd_ddi_fini(void)
+{
+ netstack_unregister(NS_VND);
+ if (vnd_taskq != NULL)
+ taskq_destroy(vnd_taskq);
+ if (vnd_str_cache != NULL)
+ kmem_cache_destroy(vnd_str_cache);
+ if (vnd_dev_cache != NULL)
+ kmem_cache_destroy(vnd_dev_cache);
+ if (vnd_pnsd_cache != NULL)
+ kmem_cache_destroy(vnd_pnsd_cache);
+ if (vnd_minors != NULL)
+ id_space_destroy(vnd_minors);
+ if (vnd_list_init != 0) {
+ list_destroy(&vnd_nsd_list);
+ list_destroy(&vnd_dev_list);
+ mutex_destroy(&vnd_dev_lock);
+ vnd_list_init = 0;
+ }
+ frameio_fini();
+}
+
+static int
+vnd_ddi_init(void)
+{
+ if (frameio_init() != 0)
+ return (DDI_FAILURE);
+
+ vnd_str_cache = kmem_cache_create("vnd_str_cache", sizeof (vnd_str_t),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+ if (vnd_str_cache == NULL) {
+ frameio_fini();
+ return (DDI_FAILURE);
+ }
+ vnd_dev_cache = kmem_cache_create("vnd_dev_cache", sizeof (vnd_dev_t),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+ if (vnd_dev_cache == NULL) {
+ kmem_cache_destroy(vnd_str_cache);
+ frameio_fini();
+ return (DDI_FAILURE);
+ }
+ vnd_pnsd_cache = kmem_cache_create("vnd_pnsd_cache",
+ sizeof (vnd_pnsd_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ if (vnd_pnsd_cache == NULL) {
+ kmem_cache_destroy(vnd_dev_cache);
+ kmem_cache_destroy(vnd_str_cache);
+ frameio_fini();
+ return (DDI_FAILURE);
+ }
+
+ vnd_taskq = taskq_create_instance("vnd", -1, 1, minclsyspri, 0, 0, 0);
+ if (vnd_taskq == NULL) {
+ kmem_cache_destroy(vnd_pnsd_cache);
+ kmem_cache_destroy(vnd_dev_cache);
+ kmem_cache_destroy(vnd_str_cache);
+ frameio_fini();
+ return (DDI_FAILURE);
+ }
+
+ vnd_minors = id_space_create("vnd_minors", 1, INT32_MAX);
+ if (vnd_minors == NULL) {
+ taskq_destroy(vnd_taskq);
+ kmem_cache_destroy(vnd_pnsd_cache);
+ kmem_cache_destroy(vnd_dev_cache);
+ kmem_cache_destroy(vnd_str_cache);
+ frameio_fini();
+ return (DDI_FAILURE);
+ }
+
+ mutex_init(&vnd_dev_lock, NULL, MUTEX_DRIVER, NULL);
+ list_create(&vnd_dev_list, sizeof (vnd_dev_t),
+ offsetof(vnd_dev_t, vdd_link));
+ list_create(&vnd_nsd_list, sizeof (vnd_pnsd_t),
+ offsetof(vnd_pnsd_t, vpnd_link));
+ vnd_list_init = 1;
+
+ netstack_register(NS_VND, vnd_stack_init, vnd_stack_shutdown,
+ vnd_stack_destroy);
+
+ return (DDI_SUCCESS);
+}
+
+static struct module_info vnd_minfo = {
+ 0, /* module id */
+ "vnd", /* module name */
+ 1, /* smallest packet size */
+ INFPSZ, /* largest packet size (infinite) */
+ 1, /* high watermark */
+ 0 /* low watermark */
+};
+
+static struct qinit vnd_r_qinit = {
+ vnd_s_rput,
+ NULL,
+ vnd_s_open,
+ vnd_s_close,
+ NULL,
+ &vnd_minfo,
+ NULL
+};
+
+static struct qinit vnd_w_qinit = {
+ vnd_s_wput,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ &vnd_minfo,
+ NULL
+};
+
+static struct streamtab vnd_strtab = {
+ &vnd_r_qinit,
+ &vnd_w_qinit,
+ NULL,
+ NULL
+};
+
+
+static struct cb_ops vnd_cb_ops = {
+ vnd_open, /* open */
+ vnd_close, /* close */
+ nulldev, /* strategy */
+ nulldev, /* print */
+ nodev, /* dump */
+ vnd_read, /* read */
+ vnd_write, /* write */
+ vnd_ioctl, /* ioctl */
+ nodev, /* devmap */
+ nodev, /* mmap */
+ nodev, /* segmap */
+ vnd_chpoll, /* poll */
+ ddi_prop_op, /* cb_prop_op */
+ NULL, /* streamtab */
+ D_MP /* Driver compatibility flag */
+};
+
+static struct dev_ops vnd_dev_ops = {
+ DEVO_REV, /* devo_rev */
+ 0, /* refcnt */
+ vnd_info, /* get_dev_info */
+ nulldev, /* identify */
+ nulldev, /* probe */
+ vnd_attach, /* attach */
+ vnd_detach, /* detach */
+ nodev, /* reset */
+ &vnd_cb_ops, /* driver operations */
+ NULL, /* bus operations */
+ nodev, /* dev power */
+ ddi_quiesce_not_needed /* quiesce */
+};
+
+static struct modldrv vnd_modldrv = {
+ &mod_driverops,
+ "Virtual Networking Datapath Driver",
+ &vnd_dev_ops
+};
+
+static struct fmodsw vnd_fmodfsw = {
+ "vnd",
+ &vnd_strtab,
+ D_NEW | D_MP
+};
+
+static struct modlstrmod vnd_modlstrmod = {
+ &mod_strmodops,
+ "Virtual Networking Datapath Driver",
+ &vnd_fmodfsw
+};
+
+static struct modlinkage vnd_modlinkage = {
+ MODREV_1,
+ &vnd_modldrv,
+ &vnd_modlstrmod,
+ NULL
+};
+
+int
+_init(void)
+{
+ int error;
+
+ /*
+ * We need to do all of our global initialization in init as opposed to
+ * attach and detach. The problem here is that because vnd can be used
+ * from a stream context while being detached, we can not rely on having
+ * run attach to create everything, alas. so it goes in _init, just like
+ * our friend ip.
+ */
+ if ((error = vnd_ddi_init()) != DDI_SUCCESS)
+ return (error);
+ error = mod_install((&vnd_modlinkage));
+ if (error != 0)
+ vnd_ddi_fini();
+ return (error);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&vnd_modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+ int error;
+
+ error = mod_remove(&vnd_modlinkage);
+ if (error == 0)
+ vnd_ddi_fini();
+ return (error);
+}
diff --git a/usr/src/uts/common/io/vnd/vnd.conf b/usr/src/uts/common/io/vnd/vnd.conf
new file mode 100644
index 0000000000..65872e1ddf
--- /dev/null
+++ b/usr/src/uts/common/io/vnd/vnd.conf
@@ -0,0 +1,16 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014, Joyent, Inc. All rights reserved.
+#
+
+name="vnd" parent="pseudo" instance=0;
diff --git a/usr/src/uts/common/io/zfd.c b/usr/src/uts/common/io/zfd.c
new file mode 100644
index 0000000000..2da310ab8d
--- /dev/null
+++ b/usr/src/uts/common/io/zfd.c
@@ -0,0 +1,1154 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Zone File Descriptor Driver.
+ *
+ * This driver is derived from the zcons driver which is in turn derived from
+ * the pts/ptm drivers. The purpose is to expose file descriptors within the
+ * zone which are connected to zoneadmd and used for logging or an interactive
+ * connection to a process within the zone.
+ *
+ * Its implementation is straightforward. Each instance of the driver
+ * represents a global-zone/local-zone pair. Unlike the zcons device, zoneadmd
+ * uses these devices unidirectionally to provide stdin, stdout and stderr to
+ * the process within the zone.
+ *
+ * Instances of zfd are onlined as children of /pseudo/zfdnex@2/ by zoneadmd,
+ * using the devctl framework; thus the driver does not need to maintain any
+ * sort of "admin" node.
+ *
+ * The driver shuttles I/O from master side to slave side and back. In a break
+ * from the pts/ptm semantics, if one side is not open, I/O directed towards
+ * it will simply be discarded. This is so that if zoneadmd is not holding the
+ * master side fd open (i.e. it has died somehow), processes in the zone do not
+ * experience any errors and I/O to the fd does not cause the process to hang.
+ *
+ * The driver can also act as a multiplexer so that data written to the
+ * slave side within the zone is also redirected back to another zfd device
+ * inside the zone for consumption (i.e. it can be read). The intention is
+ * that a logging process within the zone can consume data that is being
+ * written by an application onto the primary stream. This is essentially
+ * a tee off of the primary stream into a log stream. This tee can also be
+ * configured to be flow controlled via an ioctl. Flow control happens on the
+ * primary stream and is used to ensure that the log stream receives all of
+ * the messages off the primary stream when consumption of the data off of
+ * the log stream gets behind. Configuring for flow control implies that the
+ * application writing to the primary stream will be blocked when the log
+ * consumer gets behind. Note that closing the log stream (e.g. when the zone
+ * halts) will cause the loss of all messages queued in the stream.
+ *
+ * The zone's zfd device configuration is driven by zoneadmd and a zone mode.
+ * The mode, which is controlled by the zone attribute "zlog-mode" is somewhat
+ * of a misnomer since its purpose has evolved. The attribute can have a
+ * variety of values, but the lowest two positions are used to control how many
+ * zfd devices are created inside the zone and if the primary stream is a tty.
+ *
+ * Here is a summary of how the 4 modes control what zfd devices are created
+ * and how they're used:
+ *
+ * t-: 1 stdio zdev (0) configured as a tty
+ * --: 3 stdio zdevs (0, 1, 2), not configured as a tty
+ * tn: 1 stdio zdev (0) configured as a tty, 1 additional zdev (1)
+ * -n: 3 stdio zdevs (0, 1, 2), not tty, 2 additional zdevs (3, 4)
+ *
+ * With the 't' flag set, stdin/out/err is multiplexed onto a single full-duplex
+ * stream which is configured as a tty. That is, ptem, ldterm and ttycompat are
+ * autopushed onto the stream when the slave side is opened. There is only a
+ * single zfd dev (0) needed for the primary stream.
+ *
+ * When the 'n' flag is set, it is assumed that output logging will be done
+ * within the zone itself. In this configuration 1 or 2 additional zfd devices,
+ * depending on tty mode ('t' flag) are created within the zone. An application
+ * can then configure the zfd streams driver into a multiplexer. Output from
+ * the stdout/stderr zfd(s) will be teed into the correspond logging zfd(s)
+ * within the zone.
+ *
+ * The following is a diagram of how this works for a '-n' configuration:
+ *
+ *
+ * zoneadmd (for zlogin -I stdout)
+ * GZ: ^
+ * |
+ * --------------------------
+ * ^
+ * NGZ: |
+ * app >1 -> zfd1 -> zfd3 -> logger (for logger to consume app's stdout)
+ *
+ * There would be a similar path for the app's stderr into zfd4 for the logger
+ * to consume stderr.
+ */
+
+#include <sys/types.h>
+#include <sys/cmn_err.h>
+#include <sys/conf.h>
+#include <sys/cred.h>
+#include <sys/ddi.h>
+#include <sys/debug.h>
+#include <sys/devops.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/kstr.h>
+#include <sys/modctl.h>
+#include <sys/param.h>
+#include <sys/stat.h>
+#include <sys/stream.h>
+#include <sys/stropts.h>
+#include <sys/strsun.h>
+#include <sys/sunddi.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/zfd.h>
+#include <sys/vnode.h>
+#include <sys/fs/snode.h>
+#include <sys/zone.h>
+#include <sys/sdt.h>
+
+static kmutex_t zfd_mux_lock;
+
+static int zfd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
+static int zfd_attach(dev_info_t *, ddi_attach_cmd_t);
+static int zfd_detach(dev_info_t *, ddi_detach_cmd_t);
+
+static int zfd_open(queue_t *, dev_t *, int, int, cred_t *);
+static int zfd_close(queue_t *, int, cred_t *);
+static void zfd_wput(queue_t *, mblk_t *);
+static void zfd_rsrv(queue_t *);
+static void zfd_wsrv(queue_t *);
+
+/*
+ * The instance number is encoded in the dev_t in the minor number; the lowest
+ * bit of the minor number is used to track the master vs. slave side of the
+ * fd. The rest of the bits in the minor number are the instance.
+ */
+#define ZFD_MASTER_MINOR 0
+#define ZFD_SLAVE_MINOR 1
+
+#define ZFD_INSTANCE(x) (getminor((x)) >> 1)
+#define ZFD_NODE(x) (getminor((x)) & 0x01)
+
+/*
+ * This macro converts a zfd_state_t pointer to the associated slave minor
+ * node's dev_t.
+ */
+#define ZFD_STATE_TO_SLAVEDEV(x) \
+ (makedevice(ddi_driver_major((x)->zfd_devinfo), \
+ (minor_t)(ddi_get_instance((x)->zfd_devinfo) << 1 | ZFD_SLAVE_MINOR)))
+
+int zfd_debug = 0;
+#define DBG(a) if (zfd_debug) cmn_err(CE_NOTE, a)
+#define DBG1(a, b) if (zfd_debug) cmn_err(CE_NOTE, a, b)
+
+/*
+ * ZFD Pseudo Terminal Module: stream data structure definitions,
+ * based on zcons.
+ */
+static struct module_info zfd_info = {
+ 0x20FD, /* ZOFD - 8445 */
+ "zfd",
+ 0, /* min packet size */
+ INFPSZ, /* max packet size - infinity */
+ 2048, /* high water */
+ 128 /* low water */
+};
+
+static struct qinit zfd_rinit = {
+ NULL,
+ (int (*)()) zfd_rsrv,
+ zfd_open,
+ zfd_close,
+ NULL,
+ &zfd_info,
+ NULL
+};
+
+static struct qinit zfd_winit = {
+ (int (*)()) zfd_wput,
+ (int (*)()) zfd_wsrv,
+ NULL,
+ NULL,
+ NULL,
+ &zfd_info,
+ NULL
+};
+
+static struct streamtab zfd_tab_info = {
+ &zfd_rinit,
+ &zfd_winit,
+ NULL,
+ NULL
+};
+
+#define ZFD_CONF_FLAG (D_MP | D_MTQPAIR | D_MTOUTPERIM | D_MTOCEXCL)
+
+/*
+ * this will define (struct cb_ops cb_zfd_ops) and (struct dev_ops zfd_ops)
+ */
+DDI_DEFINE_STREAM_OPS(zfd_ops, nulldev, nulldev, zfd_attach, zfd_detach, \
+ nodev, zfd_getinfo, ZFD_CONF_FLAG, &zfd_tab_info, \
+ ddi_quiesce_not_needed);
+
+/*
+ * Module linkage information for the kernel.
+ */
+
+static struct modldrv modldrv = {
+ &mod_driverops, /* Type of module (this is a pseudo driver) */
+ "Zone FD driver", /* description of module */
+ &zfd_ops /* driver ops */
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1,
+ &modldrv,
+ NULL
+};
+
+typedef enum {
+ ZFD_NO_MUX,
+ ZFD_PRIMARY_STREAM,
+ ZFD_LOG_STREAM
+} zfd_mux_type_t;
+
+typedef struct zfd_state {
+ dev_info_t *zfd_devinfo; /* instance info */
+ queue_t *zfd_master_rdq; /* GZ read queue */
+ queue_t *zfd_slave_rdq; /* in-zone read queue */
+ int zfd_state; /* ZFD_STATE_MOPEN, ZFD_STATE_SOPEN */
+ int zfd_tty; /* ZFD_MAKETTY - strm mods will push */
+ boolean_t zfd_is_flowcon; /* primary stream flow stopped */
+ boolean_t zfd_allow_flowcon; /* use flow control */
+ zfd_mux_type_t zfd_muxt; /* state type: none, primary, log */
+ struct zfd_state *zfd_inst_pri; /* log state's primary ptr */
+ struct zfd_state *zfd_inst_log; /* primary state's log ptr */
+} zfd_state_t;
+
+#define ZFD_STATE_MOPEN 0x01
+#define ZFD_STATE_SOPEN 0x02
+
+static void *zfd_soft_state;
+
+/*
+ * List of STREAMS modules that are autopushed onto a slave instance when its
+ * opened, but only if the ZFD_MAKETTY ioctl has first been received by the
+ * master.
+ */
+static char *zfd_mods[] = {
+ "ptem",
+ "ldterm",
+ "ttcompat",
+ NULL
+};
+
+int
+_init(void)
+{
+ int err;
+
+ if ((err = ddi_soft_state_init(&zfd_soft_state, sizeof (zfd_state_t),
+ 0)) != 0) {
+ return (err);
+ }
+
+ if ((err = mod_install(&modlinkage)) != 0)
+ ddi_soft_state_fini(zfd_soft_state);
+
+ mutex_init(&zfd_mux_lock, NULL, MUTEX_DEFAULT, NULL);
+ return (err);
+}
+
+
+int
+_fini(void)
+{
+ int err;
+
+ if ((err = mod_remove(&modlinkage)) != 0) {
+ return (err);
+ }
+
+ ddi_soft_state_fini(&zfd_soft_state);
+ mutex_destroy(&zfd_mux_lock);
+ return (0);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+static int
+zfd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ zfd_state_t *zfds;
+ int instance;
+ char masternm[ZFD_NAME_LEN], slavenm[ZFD_NAME_LEN];
+
+ if (cmd != DDI_ATTACH)
+ return (DDI_FAILURE);
+
+ instance = ddi_get_instance(dip);
+ if (ddi_soft_state_zalloc(zfd_soft_state, instance) != DDI_SUCCESS)
+ return (DDI_FAILURE);
+
+ (void) snprintf(masternm, sizeof (masternm), "%s%d", ZFD_MASTER_NAME,
+ instance);
+ (void) snprintf(slavenm, sizeof (slavenm), "%s%d", ZFD_SLAVE_NAME,
+ instance);
+
+ /*
+ * Create the master and slave minor nodes.
+ */
+ if ((ddi_create_minor_node(dip, slavenm, S_IFCHR,
+ instance << 1 | ZFD_SLAVE_MINOR, DDI_PSEUDO, 0) == DDI_FAILURE) ||
+ (ddi_create_minor_node(dip, masternm, S_IFCHR,
+ instance << 1 | ZFD_MASTER_MINOR, DDI_PSEUDO, 0) == DDI_FAILURE)) {
+ ddi_remove_minor_node(dip, NULL);
+ ddi_soft_state_free(zfd_soft_state, instance);
+ return (DDI_FAILURE);
+ }
+
+ VERIFY((zfds = ddi_get_soft_state(zfd_soft_state, instance)) != NULL);
+ zfds->zfd_devinfo = dip;
+ zfds->zfd_tty = 0;
+ zfds->zfd_muxt = ZFD_NO_MUX;
+ zfds->zfd_inst_log = NULL;
+ return (DDI_SUCCESS);
+}
+
+static int
+zfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ zfd_state_t *zfds;
+ int instance;
+
+ if (cmd != DDI_DETACH)
+ return (DDI_FAILURE);
+
+ instance = ddi_get_instance(dip);
+ if ((zfds = ddi_get_soft_state(zfd_soft_state, instance)) == NULL)
+ return (DDI_FAILURE);
+
+ if ((zfds->zfd_state & ZFD_STATE_MOPEN) ||
+ (zfds->zfd_state & ZFD_STATE_SOPEN)) {
+ DBG1("zfd_detach: device (dip=%p) still open\n", (void *)dip);
+ return (DDI_FAILURE);
+ }
+
+ ddi_remove_minor_node(dip, NULL);
+ ddi_soft_state_free(zfd_soft_state, instance);
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * zfd_getinfo()
+ * getinfo(9e) entrypoint.
+ */
+/*ARGSUSED*/
+static int
+zfd_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
+{
+ zfd_state_t *zfds;
+ int instance = ZFD_INSTANCE((dev_t)arg);
+
+ switch (infocmd) {
+ case DDI_INFO_DEVT2DEVINFO:
+ if ((zfds = ddi_get_soft_state(zfd_soft_state,
+ instance)) == NULL)
+ return (DDI_FAILURE);
+ *result = zfds->zfd_devinfo;
+ return (DDI_SUCCESS);
+ case DDI_INFO_DEVT2INSTANCE:
+ *result = (void *)(uintptr_t)instance;
+ return (DDI_SUCCESS);
+ }
+ return (DDI_FAILURE);
+}
+
+/*
+ * Return the equivalent queue from the other side of the relationship.
+ * e.g.: given the slave's write queue, return the master's write queue.
+ */
+static queue_t *
+zfd_switch(queue_t *qp)
+{
+ zfd_state_t *zfds = qp->q_ptr;
+ ASSERT(zfds != NULL);
+
+ if (qp == zfds->zfd_master_rdq)
+ return (zfds->zfd_slave_rdq);
+ else if (OTHERQ(qp) == zfds->zfd_master_rdq && zfds->zfd_slave_rdq
+ != NULL)
+ return (OTHERQ(zfds->zfd_slave_rdq));
+ else if (qp == zfds->zfd_slave_rdq)
+ return (zfds->zfd_master_rdq);
+ else if (OTHERQ(qp) == zfds->zfd_slave_rdq && zfds->zfd_master_rdq
+ != NULL)
+ return (OTHERQ(zfds->zfd_master_rdq));
+ else
+ return (NULL);
+}
+
+/*
+ * For debugging and outputting messages. Returns the name of the side of
+ * the relationship associated with this queue.
+ */
+static const char *
+zfd_side(queue_t *qp)
+{
+ zfd_state_t *zfds = qp->q_ptr;
+ ASSERT(zfds != NULL);
+
+ if (qp == zfds->zfd_master_rdq ||
+ OTHERQ(qp) == zfds->zfd_master_rdq) {
+ return ("master");
+ }
+ ASSERT(qp == zfds->zfd_slave_rdq || OTHERQ(qp) == zfds->zfd_slave_rdq);
+ return ("slave");
+}
+
+/*ARGSUSED*/
+static int
+zfd_master_open(zfd_state_t *zfds,
+ queue_t *rqp, /* pointer to the read side queue */
+ dev_t *devp, /* pointer to stream tail's dev */
+ int oflag, /* the user open(2) supplied flags */
+ int sflag, /* open state flag */
+ cred_t *credp) /* credentials */
+{
+ mblk_t *mop;
+ struct stroptions *sop;
+
+ /*
+ * Enforce exclusivity on the master side; the only consumer should
+ * be the zoneadmd for the zone.
+ */
+ if ((zfds->zfd_state & ZFD_STATE_MOPEN) != 0)
+ return (EBUSY);
+
+ if ((mop = allocb(sizeof (struct stroptions), BPRI_MED)) == NULL) {
+ DBG("zfd_master_open(): mop allocation failed\n");
+ return (ENOMEM);
+ }
+
+ zfds->zfd_state |= ZFD_STATE_MOPEN;
+
+ /*
+ * q_ptr stores driver private data; stash the soft state data on both
+ * read and write sides of the queue.
+ */
+ WR(rqp)->q_ptr = rqp->q_ptr = zfds;
+ qprocson(rqp);
+
+ /*
+ * Following qprocson(), the master side is fully plumbed into the
+ * STREAM and may send/receive messages. Setting zfds->zfd_master_rdq
+ * will allow the slave to send messages to us (the master).
+ * This cannot occur before qprocson() because the master is not
+ * ready to process them until that point.
+ */
+ zfds->zfd_master_rdq = rqp;
+
+ /*
+ * set up hi/lo water marks on stream head read queue and add
+ * controlling tty as needed.
+ */
+ mop->b_datap->db_type = M_SETOPTS;
+ mop->b_wptr += sizeof (struct stroptions);
+ sop = (struct stroptions *)(void *)mop->b_rptr;
+ if (oflag & FNOCTTY)
+ sop->so_flags = SO_HIWAT | SO_LOWAT;
+ else
+ sop->so_flags = SO_HIWAT | SO_LOWAT | SO_ISTTY;
+ sop->so_hiwat = 512;
+ sop->so_lowat = 256;
+ putnext(rqp, mop);
+
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+zfd_slave_open(zfd_state_t *zfds,
+ queue_t *rqp, /* pointer to the read side queue */
+ dev_t *devp, /* pointer to stream tail's dev */
+ int oflag, /* the user open(2) supplied flags */
+ int sflag, /* open state flag */
+ cred_t *credp) /* credentials */
+{
+ mblk_t *mop;
+ struct stroptions *sop;
+ /*
+ * The slave side can be opened as many times as needed.
+ */
+ if ((zfds->zfd_state & ZFD_STATE_SOPEN) != 0) {
+ ASSERT((rqp != NULL) && (WR(rqp)->q_ptr == zfds));
+ return (0);
+ }
+
+ /* A log stream is read-only */
+ if (zfds->zfd_muxt == ZFD_LOG_STREAM &&
+ (oflag & (FREAD | FWRITE)) != FREAD)
+ return (EINVAL);
+
+ if (zfds->zfd_tty == 1) {
+ major_t major;
+ minor_t minor;
+ minor_t lastminor;
+ uint_t anchorindex;
+
+ /*
+ * Set up sad(7D) so that the necessary STREAMS modules will
+ * be in place. A wrinkle is that 'ptem' must be anchored
+ * in place (see streamio(7i)) because we always want the
+ * fd to have terminal semantics.
+ */
+ minor =
+ ddi_get_instance(zfds->zfd_devinfo) << 1 | ZFD_SLAVE_MINOR;
+ major = ddi_driver_major(zfds->zfd_devinfo);
+ lastminor = 0;
+ anchorindex = 1;
+ if (kstr_autopush(SET_AUTOPUSH, &major, &minor, &lastminor,
+ &anchorindex, zfd_mods) != 0) {
+ DBG("zfd_slave_open(): kstr_autopush() failed\n");
+ return (EIO);
+ }
+ }
+
+ if ((mop = allocb(sizeof (struct stroptions), BPRI_MED)) == NULL) {
+ DBG("zfd_slave_open(): mop allocation failed\n");
+ return (ENOMEM);
+ }
+
+ zfds->zfd_state |= ZFD_STATE_SOPEN;
+
+ /*
+ * q_ptr stores driver private data; stash the soft state data on both
+ * read and write sides of the queue.
+ */
+ WR(rqp)->q_ptr = rqp->q_ptr = zfds;
+
+ qprocson(rqp);
+
+ /*
+ * Must follow qprocson(), since we aren't ready to process until then.
+ */
+ zfds->zfd_slave_rdq = rqp;
+
+ /*
+ * set up hi/lo water marks on stream head read queue and add
+ * controlling tty as needed.
+ */
+ mop->b_datap->db_type = M_SETOPTS;
+ mop->b_wptr += sizeof (struct stroptions);
+ sop = (struct stroptions *)(void *)mop->b_rptr;
+ sop->so_flags = SO_HIWAT | SO_LOWAT | SO_ISTTY;
+ sop->so_hiwat = 512;
+ sop->so_lowat = 256;
+ putnext(rqp, mop);
+
+ return (0);
+}
+
+/*
+ * open(9e) entrypoint; checks sflag, and rejects anything unordinary.
+ */
+static int
+zfd_open(queue_t *rqp, /* pointer to the read side queue */
+ dev_t *devp, /* pointer to stream tail's dev */
+ int oflag, /* the user open(2) supplied flags */
+ int sflag, /* open state flag */
+ cred_t *credp) /* credentials */
+{
+ int instance = ZFD_INSTANCE(*devp);
+ int ret;
+ zfd_state_t *zfds;
+
+ if (sflag != 0)
+ return (EINVAL);
+
+ if ((zfds = ddi_get_soft_state(zfd_soft_state, instance)) == NULL)
+ return (ENXIO);
+
+ switch (ZFD_NODE(*devp)) {
+ case ZFD_MASTER_MINOR:
+ ret = zfd_master_open(zfds, rqp, devp, oflag, sflag, credp);
+ break;
+ case ZFD_SLAVE_MINOR:
+ ret = zfd_slave_open(zfds, rqp, devp, oflag, sflag, credp);
+ /*
+ * If we just opened the log stream and flow control has
+ * been enabled, we want to make sure the primary stream can
+ * start flowing.
+ */
+ if (ret == 0 && zfds->zfd_muxt == ZFD_LOG_STREAM &&
+ zfds->zfd_inst_pri->zfd_allow_flowcon) {
+ zfds->zfd_inst_pri->zfd_is_flowcon = B_FALSE;
+ if (zfds->zfd_inst_pri->zfd_master_rdq != NULL)
+ qenable(RD(zfds->zfd_inst_pri->zfd_master_rdq));
+ }
+ break;
+ default:
+ ret = ENXIO;
+ break;
+ }
+
+ return (ret);
+}
+
+/*
+ * close(9e) entrypoint.
+ */
+/*ARGSUSED1*/
+static int
+zfd_close(queue_t *rqp, int flag, cred_t *credp)
+{
+ queue_t *wqp;
+ mblk_t *bp;
+ zfd_state_t *zfds;
+ major_t major;
+ minor_t minor;
+
+ zfds = (zfd_state_t *)rqp->q_ptr;
+
+ if (rqp == zfds->zfd_master_rdq) {
+ DBG("Closing master side");
+
+ zfds->zfd_master_rdq = NULL;
+ zfds->zfd_state &= ~ZFD_STATE_MOPEN;
+
+ /*
+ * qenable slave side write queue so that it can flush
+ * its messages as master's read queue is going away
+ */
+ if (zfds->zfd_slave_rdq != NULL) {
+ qenable(WR(zfds->zfd_slave_rdq));
+ }
+
+ qprocsoff(rqp);
+ WR(rqp)->q_ptr = rqp->q_ptr = NULL;
+
+ } else if (rqp == zfds->zfd_slave_rdq) {
+
+ DBG("Closing slave side");
+ zfds->zfd_state &= ~ZFD_STATE_SOPEN;
+ zfds->zfd_slave_rdq = NULL;
+
+ wqp = WR(rqp);
+ while ((bp = getq(wqp)) != NULL) {
+ if (zfds->zfd_master_rdq != NULL)
+ putnext(zfds->zfd_master_rdq, bp);
+ else if (bp->b_datap->db_type == M_IOCTL)
+ miocnak(wqp, bp, 0, 0);
+ else
+ freemsg(bp);
+ }
+
+ /*
+ * Qenable master side write queue so that it can flush its
+ * messages as slaves's read queue is going away.
+ */
+ if (zfds->zfd_master_rdq != NULL)
+ qenable(WR(zfds->zfd_master_rdq));
+
+ /*
+ * Qenable primary stream if necessary.
+ */
+ if (zfds->zfd_muxt == ZFD_LOG_STREAM &&
+ zfds->zfd_inst_pri->zfd_allow_flowcon) {
+ zfds->zfd_inst_pri->zfd_is_flowcon = B_FALSE;
+ if (zfds->zfd_inst_pri->zfd_master_rdq != NULL)
+ qenable(RD(zfds->zfd_inst_pri->zfd_master_rdq));
+ }
+
+ qprocsoff(rqp);
+ WR(rqp)->q_ptr = rqp->q_ptr = NULL;
+
+ if (zfds->zfd_tty == 1) {
+ /*
+ * Clear the sad configuration so that reopening
+ * doesn't fail to set up sad configuration.
+ */
+ major = ddi_driver_major(zfds->zfd_devinfo);
+ minor = ddi_get_instance(zfds->zfd_devinfo) << 1 |
+ ZFD_SLAVE_MINOR;
+ (void) kstr_autopush(CLR_AUTOPUSH, &major, &minor,
+ NULL, NULL, NULL);
+ }
+ }
+
+ return (0);
+}
+
+static void
+handle_mflush(queue_t *qp, mblk_t *mp)
+{
+ mblk_t *nmp;
+ DBG1("M_FLUSH on %s side", zfd_side(qp));
+
+ if (*mp->b_rptr & FLUSHW) {
+ DBG1("M_FLUSH, FLUSHW, %s side", zfd_side(qp));
+ flushq(qp, FLUSHDATA);
+ *mp->b_rptr &= ~FLUSHW;
+ if ((*mp->b_rptr & FLUSHR) == 0) {
+ /*
+ * FLUSHW only. Change to FLUSHR and putnext other side,
+ * then we are done.
+ */
+ *mp->b_rptr |= FLUSHR;
+ if (zfd_switch(RD(qp)) != NULL) {
+ putnext(zfd_switch(RD(qp)), mp);
+ return;
+ }
+ } else if ((zfd_switch(RD(qp)) != NULL) &&
+ (nmp = copyb(mp)) != NULL) {
+ /*
+ * It is a FLUSHRW; we copy the mblk and send
+ * it to the other side, since we still need to use
+ * the mblk in FLUSHR processing, below.
+ */
+ putnext(zfd_switch(RD(qp)), nmp);
+ }
+ }
+
+ if (*mp->b_rptr & FLUSHR) {
+ DBG("qreply(qp) turning FLUSHR around\n");
+ qreply(qp, mp);
+ return;
+ }
+ freemsg(mp);
+}
+
+/*
+ * Evaluate the various conditionals to determine if we're teeing into a log
+ * stream and if the primary stream should be flow controlled. This function
+ * can set the zfd_is_flowcon flag as a side effect.
+ *
+ * When teeing with flow control, we always queue the teed msg here and if
+ * the queue is getting full, we set zfd_is_flowcon. The primary stream will
+ * always queue when zfd_is_flowcon and will also not be served when
+ * zfd_is_flowcon is set. This causes backpressure on the primary stream
+ * until the teed queue can drain.
+ */
+static void
+zfd_tee_handler(zfd_state_t *zfds, unsigned char type, mblk_t *mp)
+{
+ queue_t *log_qp;
+ zfd_state_t *log_zfds;
+ mblk_t *lmp;
+
+ if (zfds->zfd_muxt != ZFD_PRIMARY_STREAM)
+ return;
+
+ if (type != M_DATA)
+ return;
+
+ log_zfds = zfds->zfd_inst_log;
+ if (log_zfds == NULL)
+ return;
+
+ ASSERT(log_zfds->zfd_muxt == ZFD_LOG_STREAM);
+
+ if ((log_zfds->zfd_state & ZFD_STATE_SOPEN) == 0) {
+ if (zfds->zfd_allow_flowcon)
+ zfds->zfd_is_flowcon = B_TRUE;
+ return;
+ }
+
+ /* The zfd_slave_rdq is null until the log dev is opened in the zone */
+ log_qp = RD(log_zfds->zfd_slave_rdq);
+ DTRACE_PROBE2(zfd__tee__check, void *, log_qp, void *, zfds);
+
+ if (!zfds->zfd_allow_flowcon) {
+ /*
+ * We're not supposed to tee with flow control and the tee is
+ * full so we skip teeing into the log stream.
+ */
+ if ((log_qp->q_flag & QFULL) != 0)
+ return;
+ }
+
+ /*
+ * Tee the message into the log stream.
+ */
+ lmp = dupmsg(mp);
+ if (lmp == NULL) {
+ if (zfds->zfd_allow_flowcon)
+ zfds->zfd_is_flowcon = B_TRUE;
+ return;
+ }
+
+ if (log_qp->q_first == NULL && bcanputnext(log_qp, lmp->b_band)) {
+ putnext(log_qp, lmp);
+ } else {
+ if (putq(log_qp, lmp) == 0) {
+ /* The logger queue is full, free the msg. */
+ freemsg(lmp);
+ }
+ /*
+ * If we're supposed to tee with flow control and the tee is
+ * over the high water mark then we want the primary stream to
+ * stop flowing. We'll stop queueing the primary stream after
+ * the log stream has drained.
+ */
+ if (zfds->zfd_allow_flowcon &&
+ log_qp->q_count > log_qp->q_hiwat) {
+ zfds->zfd_is_flowcon = B_TRUE;
+ }
+ }
+}
+
+/*
+ * wput(9E) is symmetric for master and slave sides, so this handles both
+ * without splitting the codepath. (The only exception to this is the
+ * processing of zfd ioctls, which is restricted to the master side.)
+ *
+ * zfd_wput() looks at the other side; if there is no process holding that
+ * side open, it frees the message. This prevents processes from hanging
+ * if no one is holding open the fd. Otherwise, it putnext's high
+ * priority messages, putnext's normal messages if possible, and otherwise
+ * enqueues the messages; in the case that something is enqueued, wsrv(9E)
+ * will take care of eventually shuttling I/O to the other side.
+ *
+ * When configured as a multiplexer, then anything written to the stream
+ * from inside the zone is also teed off to the corresponding log stream
+ * for consumption within the zone (i.e. the log stream can be read, but never
+ * written to, by an application inside the zone).
+ */
+static void
+zfd_wput(queue_t *qp, mblk_t *mp)
+{
+ unsigned char type = mp->b_datap->db_type;
+ zfd_state_t *zfds;
+ struct iocblk *iocbp;
+ boolean_t must_queue = B_FALSE;
+
+ ASSERT(qp->q_ptr);
+
+ DBG1("entering zfd_wput, %s side", zfd_side(qp));
+
+ /*
+ * Process zfd ioctl messages if qp is the master side's write queue.
+ */
+ zfds = (zfd_state_t *)qp->q_ptr;
+
+ if (type == M_IOCTL) {
+ iocbp = (struct iocblk *)(void *)mp->b_rptr;
+
+ switch (iocbp->ioc_cmd) {
+ case ZFD_MAKETTY:
+ zfds->zfd_tty = 1;
+ miocack(qp, mp, 0, 0);
+ return;
+ case ZFD_EOF:
+ if (zfds->zfd_slave_rdq != NULL)
+ (void) putnextctl(zfds->zfd_slave_rdq,
+ M_HANGUP);
+ miocack(qp, mp, 0, 0);
+ return;
+ case ZFD_HAS_SLAVE:
+ if ((zfds->zfd_state & ZFD_STATE_SOPEN) != 0) {
+ miocack(qp, mp, 0, 0);
+ } else {
+ miocack(qp, mp, 0, ENOTTY);
+ }
+ return;
+ case ZFD_MUX: {
+ /*
+ * Setup the multiplexer configuration for the two
+ * streams.
+ *
+ * We expect to be called on the stream that will
+ * become the log stream and be passed one data block
+ * with the minor number of the slave side of the
+ * primary stream.
+ */
+ int to;
+ int instance;
+ zfd_state_t *prim_zfds;
+
+ if (iocbp->ioc_count != TRANSPARENT ||
+ mp->b_cont == NULL) {
+ miocack(qp, mp, 0, EINVAL);
+ return;
+ }
+
+ /* Get the primary slave minor device number */
+ to = *(int *)mp->b_cont->b_rptr;
+ instance = ZFD_INSTANCE(to);
+
+ if ((prim_zfds = ddi_get_soft_state(zfd_soft_state,
+ instance)) == NULL) {
+ miocack(qp, mp, 0, EINVAL);
+ return;
+ }
+
+ /* Disallow changing primary/log once set. */
+ mutex_enter(&zfd_mux_lock);
+ if (zfds->zfd_muxt != ZFD_NO_MUX ||
+ prim_zfds->zfd_muxt != ZFD_NO_MUX) {
+ mutex_exit(&zfd_mux_lock);
+ miocack(qp, mp, 0, EINVAL);
+ return;
+ }
+
+ zfds->zfd_muxt = ZFD_LOG_STREAM;
+ zfds->zfd_inst_pri = prim_zfds;
+ prim_zfds->zfd_muxt = ZFD_PRIMARY_STREAM;
+ prim_zfds->zfd_inst_log = zfds;
+ mutex_exit(&zfd_mux_lock);
+ DTRACE_PROBE2(zfd__mux__link, void *, prim_zfds,
+ void *, zfds);
+
+ miocack(qp, mp, 0, 0);
+ return;
+ }
+ case ZFD_MUX_FLOWCON: {
+ /*
+ * We expect this ioctl to be issued against the
+ * log stream. We don't use the primary stream since
+ * there can be other streams modules pushed onto that
+ * stream which would interfere with the ioctl.
+ */
+ int val;
+ zfd_state_t *prim_zfds;
+
+ if (iocbp->ioc_count != TRANSPARENT ||
+ mp->b_cont == NULL) {
+ miocack(qp, mp, 0, EINVAL);
+ return;
+ }
+
+ if (zfds->zfd_muxt != ZFD_LOG_STREAM) {
+ miocack(qp, mp, 0, EINVAL);
+ return;
+ }
+ prim_zfds = zfds->zfd_inst_pri;
+
+ /* Get the flow control setting */
+ val = *(int *)mp->b_cont->b_rptr;
+ if (val != 0 && val != 1) {
+ miocack(qp, mp, 0, EINVAL);
+ return;
+ }
+
+ prim_zfds->zfd_allow_flowcon = (boolean_t)val;
+ if (!prim_zfds->zfd_allow_flowcon)
+ prim_zfds->zfd_is_flowcon = B_FALSE;
+
+ DTRACE_PROBE1(zfd__mux__flowcon, void *, prim_zfds);
+ miocack(qp, mp, 0, 0);
+ return;
+ }
+ default:
+ break;
+ }
+ }
+
+ /* if on the write side, may need to tee */
+ if (zfds->zfd_slave_rdq != NULL && qp == WR(zfds->zfd_slave_rdq)) {
+ /* tee output to any attached log stream */
+ zfd_tee_handler(zfds, type, mp);
+
+ /* high-priority msgs are not subject to flow control */
+ if (zfds->zfd_is_flowcon && type == M_DATA)
+ must_queue = B_TRUE;
+ }
+
+ if (zfd_switch(RD(qp)) == NULL) {
+ DBG1("wput to %s side (no one listening)", zfd_side(qp));
+ switch (type) {
+ case M_FLUSH:
+ handle_mflush(qp, mp);
+ break;
+ case M_IOCTL:
+ miocnak(qp, mp, 0, 0);
+ break;
+ default:
+ freemsg(mp);
+ break;
+ }
+ return;
+ }
+
+ if (type >= QPCTL) {
+ DBG1("(hipri) wput, %s side", zfd_side(qp));
+ switch (type) {
+ case M_READ: /* supposedly from ldterm? */
+ DBG("zfd_wput: tossing M_READ\n");
+ freemsg(mp);
+ break;
+ case M_FLUSH:
+ handle_mflush(qp, mp);
+ break;
+ default:
+ /*
+ * Put this to the other side.
+ */
+ ASSERT(zfd_switch(RD(qp)) != NULL);
+ putnext(zfd_switch(RD(qp)), mp);
+ break;
+ }
+ DBG1("done (hipri) wput, %s side", zfd_side(qp));
+ return;
+ }
+
+ /*
+ * If the primary stream has been stopped for flow control then
+ * enqueue the msg, otherwise only putnext if there isn't already
+ * something in the queue. If we don't do this then things would wind
+ * up out of order.
+ */
+ if (!must_queue && qp->q_first == NULL &&
+ bcanputnext(RD(zfd_switch(qp)), mp->b_band)) {
+ putnext(RD(zfd_switch(qp)), mp);
+ } else {
+ /*
+ * zfd_wsrv expects msgs queued on the primary queue. Those
+ * will be handled by zfd_wsrv after zfd_rsrv performs the
+ * qenable on the proper queue.
+ */
+ (void) putq(qp, mp);
+ }
+
+ DBG1("done wput, %s side", zfd_side(qp));
+}
+
+/*
+ * Read server
+ *
+ * For primary stream:
+ * Under normal execution rsrv(9E) is symmetric for master and slave, so
+ * zfd_rsrv() can handle both without splitting up the codepath. We do this by
+ * enabling the write side of the partner. This triggers the partner to send
+ * messages queued on its write side to this queue's read side.
+ *
+ * For log stream:
+ * Internally we've queued up the msgs that we've teed off to the log stream
+ * so when we're invoked we need to pass these along.
+ */
+static void
+zfd_rsrv(queue_t *qp)
+{
+ zfd_state_t *zfds;
+ zfds = (zfd_state_t *)qp->q_ptr;
+
+ /*
+ * log stream server
+ */
+ if (zfds->zfd_muxt == ZFD_LOG_STREAM && zfds->zfd_slave_rdq != NULL) {
+ queue_t *log_qp;
+ mblk_t *mp;
+
+ log_qp = RD(zfds->zfd_slave_rdq);
+
+ if ((zfds->zfd_state & ZFD_STATE_SOPEN) != 0) {
+ zfd_state_t *pzfds = zfds->zfd_inst_pri;
+
+ while ((mp = getq(qp)) != NULL) {
+ if (bcanputnext(log_qp, mp->b_band)) {
+ putnext(log_qp, mp);
+ } else {
+ (void) putbq(log_qp, mp);
+ break;
+ }
+ }
+
+ if (log_qp->q_count < log_qp->q_lowat) {
+ DTRACE_PROBE(zfd__flow__on);
+ pzfds->zfd_is_flowcon = B_FALSE;
+ if (pzfds->zfd_master_rdq != NULL)
+ qenable(RD(pzfds->zfd_master_rdq));
+ }
+ } else {
+ /* No longer open, drain the queue */
+ while ((mp = getq(qp)) != NULL) {
+ freemsg(mp);
+ }
+ flushq(qp, FLUSHALL);
+ }
+ return;
+ }
+
+ /*
+ * Care must be taken here, as either of the master or slave side
+ * qptr could be NULL.
+ */
+ ASSERT(qp == zfds->zfd_master_rdq || qp == zfds->zfd_slave_rdq);
+ if (zfd_switch(qp) == NULL) {
+ DBG("zfd_rsrv: other side isn't listening\n");
+ return;
+ }
+ qenable(WR(zfd_switch(qp)));
+}
+
+/*
+ * Write server
+ *
+ * This routine is symmetric for master and slave, so it handles both without
+ * splitting up the codepath.
+ *
+ * If there are messages on this queue that can be sent to the other, send
+ * them via putnext(). Else, if queued messages cannot be sent, leave them
+ * on this queue.
+ */
+static void
+zfd_wsrv(queue_t *qp)
+{
+ queue_t *swq;
+ mblk_t *mp;
+ zfd_state_t *zfds = (zfd_state_t *)qp->q_ptr;
+
+ ASSERT(zfds != NULL);
+
+ /*
+ * Partner has no read queue, so take the data, and throw it away.
+ */
+ if (zfd_switch(RD(qp)) == NULL) {
+ DBG("zfd_wsrv: other side isn't listening");
+ while ((mp = getq(qp)) != NULL) {
+ if (mp->b_datap->db_type == M_IOCTL)
+ miocnak(qp, mp, 0, 0);
+ else
+ freemsg(mp);
+ }
+ flushq(qp, FLUSHALL);
+ return;
+ }
+
+ swq = RD(zfd_switch(qp));
+
+ /*
+ * while there are messages on this write queue...
+ */
+ while (!zfds->zfd_is_flowcon && (mp = getq(qp)) != NULL) {
+ /*
+ * Due to the way zfd_wput is implemented, we should never
+ * see a high priority control message here.
+ */
+ ASSERT(mp->b_datap->db_type < QPCTL);
+
+ if (bcanputnext(swq, mp->b_band)) {
+ putnext(swq, mp);
+ } else {
+ (void) putbq(qp, mp);
+ break;
+ }
+ }
+}