summaryrefslogtreecommitdiff
path: root/usr/src
diff options
context:
space:
mode:
authorbrutus <none@none>2008-04-18 09:37:59 -0700
committerbrutus <none@none>2008-04-18 09:37:59 -0700
commit8e50dcc9f00b393d43e6aa42b820bcbf1d3e1ce4 (patch)
treebabf218af112e325384a001aeac7a408aa885dd9 /usr/src
parent8ca4fa23f8750b90c13a6933cc51ddb7d29abf22 (diff)
downloadillumos-gate-8e50dcc9f00b393d43e6aa42b820bcbf1d3e1ce4.tar.gz
6567008 driver for intel ioat v1 & v2 DMA engine needed
6582323 uioa - uio asynchronous, for support of Intel I/OAT hardware 6582330 sodirect - socket direct, for support of Intel I/OAT hardware 6582335 TCP/IP receive-side zero CPU copy for support of Intel I/OAT hardware
Diffstat (limited to 'usr/src')
-rw-r--r--usr/src/pkgdefs/Makefile1
-rw-r--r--usr/src/pkgdefs/SUNWdcopy/Makefile38
-rw-r--r--usr/src/pkgdefs/SUNWdcopy/pkginfo.tmpl50
-rw-r--r--usr/src/pkgdefs/SUNWdcopy/postinstall.tmpl33
-rw-r--r--usr/src/pkgdefs/SUNWdcopy/preremove.tmpl31
-rw-r--r--usr/src/pkgdefs/SUNWdcopy/prototype_com53
-rw-r--r--usr/src/pkgdefs/SUNWdcopy/prototype_i38662
-rw-r--r--usr/src/pkgdefs/SUNWhea/prototype_com1
-rw-r--r--usr/src/uts/common/fs/sockfs/sockstr.c180
-rw-r--r--usr/src/uts/common/fs/sockfs/socksubr.c25
-rw-r--r--usr/src/uts/common/fs/sockfs/socktpi.c127
-rw-r--r--usr/src/uts/common/fs/sockfs/sockvnops.c10
-rw-r--r--usr/src/uts/common/inet/tcp.h10
-rw-r--r--usr/src/uts/common/inet/tcp/tcp.c537
-rw-r--r--usr/src/uts/common/inet/tcp/tcp6ddi.c4
-rw-r--r--usr/src/uts/common/inet/tcp/tcp_fusion.c9
-rw-r--r--usr/src/uts/common/inet/tcp/tcpddi.c4
-rw-r--r--usr/src/uts/common/io/dcopy.c932
-rw-r--r--usr/src/uts/common/io/stream.c50
-rw-r--r--usr/src/uts/common/os/move.c384
-rw-r--r--usr/src/uts/common/os/streamio.c174
-rw-r--r--usr/src/uts/common/os/strsubr.c11
-rw-r--r--usr/src/uts/common/sys/Makefile1
-rw-r--r--usr/src/uts/common/sys/conf.h5
-rw-r--r--usr/src/uts/common/sys/dcopy.h235
-rw-r--r--usr/src/uts/common/sys/dcopy_device.h154
-rw-r--r--usr/src/uts/common/sys/socketvar.h9
-rw-r--r--usr/src/uts/common/sys/sodirect.h101
-rw-r--r--usr/src/uts/common/sys/stream.h5
-rw-r--r--usr/src/uts/common/sys/strsubr.h8
-rw-r--r--usr/src/uts/common/sys/uio.h69
-rw-r--r--usr/src/uts/i86pc/Makefile.files1
-rw-r--r--usr/src/uts/i86pc/Makefile.i86pc.shared1
-rw-r--r--usr/src/uts/i86pc/Makefile.rules7
-rw-r--r--usr/src/uts/i86pc/io/ioat/ioat.c665
-rw-r--r--usr/src/uts/i86pc/io/ioat/ioat.conf30
-rw-r--r--usr/src/uts/i86pc/io/ioat/ioat_chan.c1319
-rw-r--r--usr/src/uts/i86pc/io/ioat/ioat_ioctl.c343
-rw-r--r--usr/src/uts/i86pc/io/ioat/ioat_rs.c246
-rw-r--r--usr/src/uts/i86pc/ioat/Makefile97
-rw-r--r--usr/src/uts/i86pc/sys/ioat.h359
-rw-r--r--usr/src/uts/i86xpv/Makefile.files5
-rw-r--r--usr/src/uts/i86xpv/Makefile.i86xpv.shared1
-rw-r--r--usr/src/uts/i86xpv/Makefile.rules9
-rw-r--r--usr/src/uts/i86xpv/ioat/Makefile97
-rw-r--r--usr/src/uts/intel/Makefile.files1
-rw-r--r--usr/src/uts/intel/Makefile.intel.shared1
-rw-r--r--usr/src/uts/intel/dcopy/Makefile84
-rw-r--r--usr/src/uts/intel/ia32/ml/modstubs.s16
-rw-r--r--usr/src/uts/sparc/ml/modstubs.s16
50 files changed, 6487 insertions, 124 deletions
diff --git a/usr/src/pkgdefs/Makefile b/usr/src/pkgdefs/Makefile
index 39fbf0ca05..9e755c6748 100644
--- a/usr/src/pkgdefs/Makefile
+++ b/usr/src/pkgdefs/Makefile
@@ -125,6 +125,7 @@ i386_SUBDIRS= \
SUNWgrub \
SUNWgrubS \
SUNWhxge \
+ SUNWdcopy \
SUNWipw \
SUNWiwi \
SUNWiwk \
diff --git a/usr/src/pkgdefs/SUNWdcopy/Makefile b/usr/src/pkgdefs/SUNWdcopy/Makefile
new file mode 100644
index 0000000000..3431d26eb9
--- /dev/null
+++ b/usr/src/pkgdefs/SUNWdcopy/Makefile
@@ -0,0 +1,38 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#pragma ident "%Z%%M% %I% %E% SMI"
+#
+
+include ../Makefile.com
+
+TMPLFILES += postinstall preremove
+DATAFILES += depend
+
+.KEEP_STATE:
+
+all: $(FILES)
+install: all pkg
+
+include ../Makefile.targ
+include ../Makefile.prtarg
diff --git a/usr/src/pkgdefs/SUNWdcopy/pkginfo.tmpl b/usr/src/pkgdefs/SUNWdcopy/pkginfo.tmpl
new file mode 100644
index 0000000000..3b9f1d87d6
--- /dev/null
+++ b/usr/src/pkgdefs/SUNWdcopy/pkginfo.tmpl
@@ -0,0 +1,50 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+
+#
+# This required package information file describes characteristics of the
+# package, such as package abbreviation, full package name, package version,
+# and package architecture.
+#
+PKG="SUNWdcopy"
+NAME="Sun dcopy DMA drivers"
+ARCH="i386"
+CATEGORY="system"
+BASEDIR=/
+SUNW_PKGVERS="1.0"
+SUNW_PKGTYPE="root"
+CLASSES="none"
+DESC="Sun dcopy DMA drivers"
+SUNW_PRODNAME="SunOS"
+SUNW_PRODVERS="RELEASE/VERSION"
+VERSION="ONVERS,REV=0.0.0"
+VENDOR="Sun Microsystems, Inc."
+HOTLINE="Please contact your local service provider"
+EMAIL=""
+MAXINST="1000"
+SUNW_PKG_ALLZONES="true"
+SUNW_PKG_HOLLOW="true"
+SUNW_PKG_THISZONE="false"
diff --git a/usr/src/pkgdefs/SUNWdcopy/postinstall.tmpl b/usr/src/pkgdefs/SUNWdcopy/postinstall.tmpl
new file mode 100644
index 0000000000..cdb1f395bf
--- /dev/null
+++ b/usr/src/pkgdefs/SUNWdcopy/postinstall.tmpl
@@ -0,0 +1,33 @@
+#!/bin/sh
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#ident "%Z%%M% %I% %E% SMI"
+#
+
+include drv_utils
+
+CB1='"pciex8086,1a38" "pciex8086,360b"'
+CB2='"pciex8086,402f"'
+
+pkg_drvadd -i "'$CB1 $CB2'" ioat || exit 1
diff --git a/usr/src/pkgdefs/SUNWdcopy/preremove.tmpl b/usr/src/pkgdefs/SUNWdcopy/preremove.tmpl
new file mode 100644
index 0000000000..2526218df9
--- /dev/null
+++ b/usr/src/pkgdefs/SUNWdcopy/preremove.tmpl
@@ -0,0 +1,31 @@
+#!/sbin/sh
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#ident "%Z%%M% %I% %E% SMI"
+#
+
+include drv_utils
+
+pkg_drvrem ioat || exit 1
+
diff --git a/usr/src/pkgdefs/SUNWdcopy/prototype_com b/usr/src/pkgdefs/SUNWdcopy/prototype_com
new file mode 100644
index 0000000000..34626771bc
--- /dev/null
+++ b/usr/src/pkgdefs/SUNWdcopy/prototype_com
@@ -0,0 +1,53 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#ident "%Z%%M% %I% %E% SMI"
+#
+# This required package information file contains a list of package contents.
+# The 'pkgmk' command uses this file to identify the contents of a package
+# and their location on the development machine when building the package.
+# Can be created via a text editor or through use of the 'pkgproto' command.
+
+#!search <pathname pathname ...> # where to find pkg objects
+#!include <filename> # include another 'prototype' file
+#!default <mode> <owner> <group> # default used if not specified on entry
+#!<param>=<value> # puts parameter in pkg environment
+
+#
+# packaging files
+i copyright
+i depend
+i pkginfo
+i postinstall
+i preremove
+
+#
+# source locations relative to the prototype file
+#
+#
+# SUNWdcopy
+#
+d none kernel 0755 root sys
+d none kernel/misc 0755 root sys
+f none kernel/misc/dcopy 0755 root sys
diff --git a/usr/src/pkgdefs/SUNWdcopy/prototype_i386 b/usr/src/pkgdefs/SUNWdcopy/prototype_i386
new file mode 100644
index 0000000000..77bcc81a7e
--- /dev/null
+++ b/usr/src/pkgdefs/SUNWdcopy/prototype_i386
@@ -0,0 +1,62 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+# This required package information file contains a list of package contents.
+# The 'pkgmk' command uses this file to identify the contents of a package
+# and their location on the development machine when building the package.
+# Can be created via a text editor or through use of the 'pkgproto' command.
+
+#!search <pathname pathname ...> # where to find pkg objects
+#!include <filename> # include another 'prototype' file
+#!default <mode> <owner> <group> # default used if not specified on entry
+#!<param>=<value> # puts parameter in pkg environment
+#
+#
+# Include ISA independent files (prototype_com)
+#
+!include prototype_com
+#
+#
+# List files which are i386 specific here
+#
+# SUNWioat
+#
+d none kernel/misc/amd64 0755 root sys
+f none kernel/misc/amd64/dcopy 0755 root sys
+d none platform 0755 root sys
+d none platform/i86pc 0755 root sys
+d none platform/i86pc/kernel 0755 root sys
+d none platform/i86pc/kernel/drv 0755 root sys
+f none platform/i86pc/kernel/drv/ioat 755 root sys
+f none platform/i86pc/kernel/drv/ioat.conf 644 root sys
+d none platform/i86pc/kernel/drv/amd64 0755 root sys
+f none platform/i86pc/kernel/drv/amd64/ioat 755 root sys
+d none platform/i86xpv 0755 root sys
+d none platform/i86xpv/kernel 0755 root sys
+d none platform/i86xpv/kernel/drv 0755 root sys
+f none platform/i86xpv/kernel/drv/ioat 755 root sys
+f none platform/i86xpv/kernel/drv/ioat.conf 644 root sys
+d none platform/i86xpv/kernel/drv/amd64 0755 root sys
+f none platform/i86xpv/kernel/drv/amd64/ioat 755 root sys
diff --git a/usr/src/pkgdefs/SUNWhea/prototype_com b/usr/src/pkgdefs/SUNWhea/prototype_com
index cc72136ac2..90be67b9dd 100644
--- a/usr/src/pkgdefs/SUNWhea/prototype_com
+++ b/usr/src/pkgdefs/SUNWhea/prototype_com
@@ -1218,6 +1218,7 @@ f none usr/include/sys/socket.h 644 root bin
f none usr/include/sys/socket_impl.h 644 root bin
f none usr/include/sys/socketvar.h 644 root bin
f none usr/include/sys/sockio.h 644 root bin
+f none usr/include/sys/sodirect.h 644 root bin
f none usr/include/sys/sservice.h 644 root bin
f none usr/include/sys/squeue.h 644 root bin
f none usr/include/sys/squeue_impl.h 644 root bin
diff --git a/usr/src/uts/common/fs/sockfs/sockstr.c b/usr/src/uts/common/fs/sockfs/sockstr.c
index eb540644be..1a50324bc0 100644
--- a/usr/src/uts/common/fs/sockfs/sockstr.c
+++ b/usr/src/uts/common/fs/sockfs/sockstr.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -69,6 +69,8 @@
#include <c2/audit.h>
+#include <sys/dcopy.h>
+
int so_default_version = SOV_SOCKSTREAM;
#ifdef DEBUG
@@ -119,6 +121,36 @@ static mblk_t *strsock_misc(vnode_t *vp, mblk_t *mp,
static int tlitosyserr(int terr);
/*
+ * Sodirect kmem_cache and put/wakeup functions.
+ */
+struct kmem_cache *socktpi_sod_cache;
+static int sodput(sodirect_t *, mblk_t *);
+static void sodwakeup(sodirect_t *);
+
+/*
+ * Called by sockinit() when sockfs is loaded.
+ *
+ * Check for uioasync dcopy support and if supported
+ * allocate the sodirect_t kmem_cache socktpi_sod_cache.
+ */
+int
+sostr_init()
+{
+ if (uioasync.enabled == B_TRUE && modload("misc", "dcopy") == -1) {
+ /* No dcopy KAPI driver, disable uioa */
+ uioasync.enabled = B_FALSE;
+ }
+
+ if (uioasync.enabled == B_TRUE) {
+ /* Uioasync enabled so sodirect will be used */
+ socktpi_sod_cache = kmem_cache_create("socktpi_sod_cache",
+ sizeof (sodirect_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ }
+
+ return (0);
+}
+
+/*
* Convert a socket to a stream. Invoked when the illusory sockmod
* is popped from the stream.
* Change the stream head back to default operation without losing
@@ -468,6 +500,34 @@ so_strinit(struct sonode *so, struct sonode *tso)
stp->sd_qn_minpsz = 0;
mutex_exit(&stp->sd_lock);
+ /*
+ * If sodirect capable allocate and initialize sodirect_t.
+ * Note, SS_SODIRECT is set in socktpi_open().
+ */
+ if (so->so_state & SS_SODIRECT) {
+ sodirect_t *sodp;
+
+ ASSERT(so->so_direct == NULL);
+
+ sodp = kmem_cache_alloc(socktpi_sod_cache, KM_SLEEP);
+ sodp->sod_state = SOD_ENABLED | SOD_WAKE_NOT;
+ sodp->sod_want = 0;
+ sodp->sod_q = RD(stp->sd_wrq);
+ sodp->sod_enqueue = sodput;
+ sodp->sod_wakeup = sodwakeup;
+ sodp->sod_uioafh = NULL;
+ sodp->sod_uioaft = NULL;
+ sodp->sod_lock = &stp->sd_lock;
+ /*
+ * Remainder of the sod_uioa members are left uninitialized
+ * but will be initialized later by uioainit() before uioa
+ * is enabled.
+ */
+ sodp->sod_uioa.uioa_state = UIOA_ALLOC;
+ so->so_direct = sodp;
+ stp->sd_sodirect = sodp;
+ }
+
return (0);
}
@@ -2872,3 +2932,121 @@ tlitosyserr(int terr)
else
return (tli_errs[terr]);
}
+
+/*
+ * Sockfs sodirect STREAMS read put procedure. Called from sodirect enable
+ * transport driver/module with an mblk_t chain.
+ *
+ * Note, we in-line putq() for the fast-path cases of q is empty, q_last and
+ * bp are of type M_DATA. All other cases we call putq().
+ *
+ * On success a zero will be return, else an errno will be returned.
+ */
+int
+sodput(sodirect_t *sodp, mblk_t *bp)
+{
+ queue_t *q = sodp->sod_q;
+ struct stdata *stp = (struct stdata *)q->q_ptr;
+ mblk_t *nbp;
+ int ret;
+ mblk_t *last = q->q_last;
+ int bytecnt = 0;
+ int mblkcnt = 0;
+
+
+ ASSERT(MUTEX_HELD(sodp->sod_lock));
+
+ if (stp->sd_flag == STREOF) {
+ ret = 0;
+ goto error;
+ }
+
+ if (q->q_first == NULL) {
+ /* Q empty, really fast fast-path */
+ bp->b_prev = NULL;
+ bp->b_next = NULL;
+ q->q_first = bp;
+ q->q_last = bp;
+
+ } else if (last->b_datap->db_type == M_DATA &&
+ bp->b_datap->db_type == M_DATA) {
+ /*
+ * Last mblk_t chain and bp are both type M_DATA so
+ * in-line putq() here, if the DBLK_UIOA state match
+ * add bp to the end of the current last chain, else
+ * start a new last chain with bp.
+ */
+ if ((last->b_datap->db_flags & DBLK_UIOA) ==
+ (bp->b_datap->db_flags & DBLK_UIOA)) {
+ /* Added to end */
+ while ((nbp = last->b_cont) != NULL)
+ last = nbp;
+ last->b_cont = bp;
+ } else {
+ /* New last */
+ last->b_next = bp;
+ bp->b_next = NULL;
+ bp->b_prev = last;
+ q->q_last = bp;
+ }
+ } else {
+ /*
+ * Can't use q_last so just call putq().
+ */
+ (void) putq(q, bp);
+ return (0);
+ }
+
+ /* Count bytes and mblk_t's */
+ do {
+ bytecnt += MBLKL(bp);
+ mblkcnt++;
+ } while ((bp = bp->b_cont) != NULL);
+ q->q_count += bytecnt;
+ q->q_mblkcnt += mblkcnt;
+
+ /* Check for QFULL */
+ if (q->q_count >= q->q_hiwat + sodp->sod_want ||
+ q->q_mblkcnt >= q->q_hiwat) {
+ q->q_flag |= QFULL;
+ }
+
+ return (0);
+
+error:
+ do {
+ if ((nbp = bp->b_next) != NULL)
+ bp->b_next = NULL;
+ freemsg(bp);
+ } while ((bp = nbp) != NULL);
+
+ return (ret);
+}
+
+/*
+ * Sockfs sodirect read wakeup. Called from a sodirect enabled transport
+ * driver/module to indicate that read-side data is available.
+ *
+ * On return the sodirect_t.lock mutex will be exited so this must be the
+ * last sodirect_t call to guarantee atomic access of *sodp.
+ */
+void
+sodwakeup(sodirect_t *sodp)
+{
+ queue_t *q = sodp->sod_q;
+ struct stdata *stp = (struct stdata *)q->q_ptr;
+
+ ASSERT(MUTEX_HELD(sodp->sod_lock));
+
+ if (stp->sd_flag & RSLEEP) {
+ stp->sd_flag &= ~RSLEEP;
+ cv_broadcast(&q->q_wait);
+ }
+
+ if (stp->sd_rput_opt & SR_POLLIN) {
+ stp->sd_rput_opt &= ~SR_POLLIN;
+ mutex_exit(sodp->sod_lock);
+ pollwakeup(&stp->sd_pollist, POLLIN | POLLRDNORM);
+ } else
+ mutex_exit(sodp->sod_lock);
+}
diff --git a/usr/src/uts/common/fs/sockfs/socksubr.c b/usr/src/uts/common/fs/sockfs/socksubr.c
index 9a6e9147e3..c857c34225 100644
--- a/usr/src/uts/common/fs/sockfs/socksubr.c
+++ b/usr/src/uts/common/fs/sockfs/socksubr.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -44,6 +44,7 @@
#include <sys/file.h>
#include <sys/open.h>
#include <sys/user.h>
+#include <sys/uio.h>
#include <sys/termios.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
@@ -90,6 +91,7 @@
#define SO_LOCK_WAKEUP_TIME 3000 /* Wakeup time in milliseconds */
static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;
+struct kmem_cache *socktpi_sod_cache;
dev_t sockdev; /* For fsid in getattr */
@@ -105,6 +107,8 @@ extern void sendfile_init();
extern void nl7c_init(void);
+extern int sostr_init();
+
#define ADRSTRLEN (2 * sizeof (void *) + 1)
/*
* kernel structure for passing the sockinfo data back up to the user.
@@ -523,6 +527,15 @@ sockfree(struct sonode *so)
so->so_nl7c_flags = 0;
}
+ if (so->so_direct != NULL) {
+ sodirect_t *sodp = so->so_direct;
+
+ ASSERT(sodp->sod_uioafh == NULL);
+
+ so->so_direct = NULL;
+ kmem_cache_free(socktpi_sod_cache, sodp);
+ }
+
ASSERT(so->so_ux_bound_vp == NULL);
if ((mp = so->so_unbind_mp) != NULL) {
freemsg(mp);
@@ -567,6 +580,8 @@ socktpi_constructor(void *buf, void *cdrarg, int kmflags)
struct sonode *so = buf;
struct vnode *vp;
+ so->so_direct = NULL;
+
so->so_nl7c_flags = 0;
so->so_nl7c_uri = NULL;
so->so_nl7c_rcv_mp = NULL;
@@ -606,6 +621,8 @@ socktpi_destructor(void *buf, void *cdrarg)
struct sonode *so = buf;
struct vnode *vp = SOTOV(so);
+ ASSERT(so->so_direct == NULL);
+
ASSERT(so->so_nl7c_flags == 0);
ASSERT(so->so_nl7c_uri == NULL);
ASSERT(so->so_nl7c_rcv_mp == NULL);
@@ -713,6 +730,12 @@ sockinit(int fstype, char *name)
goto failure;
}
+ error = sostr_init();
+ if (error != 0) {
+ err_str = NULL;
+ goto failure;
+ }
+
/*
* Create sonode caches. We create a special one for AF_UNIX so
* that we can track them for netstat(1m).
diff --git a/usr/src/uts/common/fs/sockfs/socktpi.c b/usr/src/uts/common/fs/sockfs/socktpi.c
index d6f9ebb57f..e632e234e2 100644
--- a/usr/src/uts/common/fs/sockfs/socktpi.c
+++ b/usr/src/uts/common/fs/sockfs/socktpi.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -58,6 +58,7 @@
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
+#include <sys/sodirect.h>
#include <netinet/in.h>
#include <sys/un.h>
#include <sys/strsun.h>
@@ -186,6 +187,9 @@ extern mblk_t *strsock_kssl_output(vnode_t *, mblk_t *, strwakeup_t *,
static int sotpi_unbind(struct sonode *, int);
+extern int sodput(sodirect_t *, mblk_t *);
+extern void sodwakeup(sodirect_t *);
+
/* TPI sockfs sonode operations */
static int sotpi_accept(struct sonode *, int, struct sonode **);
static int sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
@@ -2910,11 +2914,13 @@ sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
t_uscalar_t namelen;
int so_state = so->so_state; /* Snapshot */
ssize_t saved_resid;
- int error;
rval_t rval;
int flags;
clock_t timout;
int first;
+ int error = 0;
+ struct uio *suiop = NULL;
+ sodirect_t *sodp = so->so_direct;
flags = msg->msg_flags;
msg->msg_flags = 0;
@@ -3062,6 +3068,53 @@ sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
opflag = pflag;
first = 1;
+ if (uiop->uio_resid >= uioasync.mincnt &&
+ sodp != NULL && (sodp->sod_state & SOD_ENABLED) &&
+ uioasync.enabled && !(flags & MSG_PEEK) &&
+ !(so_state & SS_CANTRCVMORE)) {
+ /*
+ * Big enough I/O for uioa min setup and an sodirect socket
+ * and sodirect enabled and uioa enabled and I/O will be done
+ * and not EOF so initialize the sodirect_t uioa_t with "uiop".
+ */
+ mutex_enter(sodp->sod_lock);
+ if (!uioainit(uiop, &sodp->sod_uioa)) {
+ /*
+ * Successful uioainit() so the uio_t part of the
+ * uioa_t will be used for all uio_t work to follow,
+ * we save the original "uiop" in "suiop".
+ */
+ suiop = uiop;
+ uiop = (uio_t *)&sodp->sod_uioa;
+ /*
+ * Before returning to the caller the passed in uio_t
+ * "uiop" will be updated via a call to uioafini()
+ * below.
+ *
+ * Note, the uioa.uioa_state isn't set to UIOA_ENABLED
+ * here as first we have to uioamove() any currently
+ * queued M_DATA mblk_t(s) so it will be done in
+ * kstrgetmsg().
+ */
+ }
+ /*
+ * In either uioainit() success or not case note the number
+ * of uio bytes the caller wants for sod framework and/or
+ * transport (e.g. TCP) strategy.
+ */
+ sodp->sod_want = uiop->uio_resid;
+ mutex_exit(sodp->sod_lock);
+ } else if (sodp != NULL && (sodp->sod_state & SOD_ENABLED)) {
+ /*
+ * No uioa but still using sodirect so note the number of
+ * uio bytes the caller wants for sodirect framework and/or
+ * transport (e.g. TCP) strategy.
+ *
+ * Note, sod_lock not held, only writer is in this function
+ * and only one thread at a time so not needed just to init.
+ */
+ sodp->sod_want = uiop->uio_resid;
+ }
retry:
saved_resid = uiop->uio_resid;
pri = 0;
@@ -3091,10 +3144,7 @@ retry:
eprintsoline(so, error);
break;
}
- mutex_enter(&so->so_lock);
- so_unlock_read(so); /* Clear SOREADLOCKED */
- mutex_exit(&so->so_lock);
- return (error);
+ goto out;
}
/*
* For datagrams the MOREDATA flag is used to set MSG_TRUNC.
@@ -3137,9 +3187,7 @@ retry:
pflag = opflag | MSG_NOMARK;
goto retry;
}
- so_unlock_read(so); /* Clear SOREADLOCKED */
- mutex_exit(&so->so_lock);
- return (0);
+ goto out_locked;
}
/* strsock_proto has already verified length and alignment */
@@ -3179,9 +3227,7 @@ retry:
pflag = opflag | MSG_NOMARK;
goto retry;
}
- so_unlock_read(so); /* Clear SOREADLOCKED */
- mutex_exit(&so->so_lock);
- return (0);
+ goto out_locked;
}
case T_UNITDATA_IND: {
void *addr;
@@ -3207,7 +3253,7 @@ retry:
freemsg(mp);
error = EPROTO;
eprintsoline(so, error);
- goto err;
+ goto out;
}
if (so->so_family == AF_UNIX) {
/*
@@ -3236,7 +3282,7 @@ retry:
freemsg(mp);
error = EPROTO;
eprintsoline(so, error);
- goto err;
+ goto out;
}
if (so->so_family == AF_UNIX)
so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
@@ -3283,17 +3329,14 @@ retry:
msg->msg_namelen);
kmem_free(control, controllen);
eprintsoline(so, error);
- goto err;
+ goto out;
}
msg->msg_control = control;
msg->msg_controllen = controllen;
}
freemsg(mp);
- mutex_enter(&so->so_lock);
- so_unlock_read(so); /* Clear SOREADLOCKED */
- mutex_exit(&so->so_lock);
- return (0);
+ goto out;
}
case T_OPTDATA_IND: {
struct T_optdata_req *tdr;
@@ -3322,7 +3365,7 @@ retry:
freemsg(mp);
error = EPROTO;
eprintsoline(so, error);
- goto err;
+ goto out;
}
ncontrollen = so_cmsglen(mp, opt, optlen,
@@ -3350,7 +3393,7 @@ retry:
freemsg(mp);
kmem_free(control, controllen);
eprintsoline(so, error);
- goto err;
+ goto out;
}
msg->msg_control = control;
msg->msg_controllen = controllen;
@@ -3382,9 +3425,7 @@ retry:
pflag = opflag | MSG_NOMARK;
goto retry;
}
- so_unlock_read(so); /* Clear SOREADLOCKED */
- mutex_exit(&so->so_lock);
- return (0);
+ goto out_locked;
}
case T_EXDATA_IND: {
dprintso(so, 1,
@@ -3441,10 +3482,7 @@ retry:
eprintsoline(so, error);
}
#endif /* SOCK_DEBUG */
- mutex_enter(&so->so_lock);
- so_unlock_read(so); /* Clear SOREADLOCKED */
- mutex_exit(&so->so_lock);
- return (error);
+ goto out;
}
ASSERT(mp);
tpr = (union T_primitives *)mp->b_rptr;
@@ -3490,11 +3528,40 @@ retry:
freemsg(mp);
error = EPROTO;
eprintsoline(so, error);
- goto err;
+ goto out;
}
/* NOTREACHED */
-err:
+out:
mutex_enter(&so->so_lock);
+out_locked:
+ if (sodp != NULL) {
+ /* Finish any sodirect and uioa processing */
+ mutex_enter(sodp->sod_lock);
+ if (suiop != NULL) {
+ /* Finish any uioa_t processing */
+ int ret;
+
+ ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
+ ret = uioafini(suiop, (uioa_t *)uiop);
+ if (error == 0 && ret != 0) {
+ /* If no error yet, set it */
+ error = ret;
+ }
+ if ((mp = sodp->sod_uioafh) != NULL) {
+ sodp->sod_uioafh = NULL;
+ sodp->sod_uioaft = NULL;
+ freemsg(mp);
+ }
+ }
+ if (!(sodp->sod_state & SOD_WAKE_NOT)) {
+ /* Awoke */
+ sodp->sod_state &= SOD_WAKE_CLR;
+ sodp->sod_state |= SOD_WAKE_NOT;
+ }
+ /* Last, clear sod_want value */
+ sodp->sod_want = 0;
+ mutex_exit(sodp->sod_lock);
+ }
so_unlock_read(so); /* Clear SOREADLOCKED */
mutex_exit(&so->so_lock);
return (error);
diff --git a/usr/src/uts/common/fs/sockfs/sockvnops.c b/usr/src/uts/common/fs/sockfs/sockvnops.c
index 6c122c679d..c85a76d6e6 100644
--- a/usr/src/uts/common/fs/sockfs/sockvnops.c
+++ b/usr/src/uts/common/fs/sockfs/sockvnops.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -239,6 +239,10 @@ socktpi_open(struct vnode **vpp, int flag, struct cred *cr,
* udp case, when some other module is autopushed
* above it, or for some reasons the expected module
* isn't purely D_MP (which is the main requirement).
+ *
+ * Else, SS_DIRECT is valid. If the read-side Q has
+ * _QSODIRECT set then and uioasync is enabled then
+ * set SS_SODIRECT to enable sodirect.
*/
if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
!(_OTHERQ(tq)->q_flag & _QDIRECT)) {
@@ -255,6 +259,10 @@ socktpi_open(struct vnode **vpp, int flag, struct cred *cr,
return (error);
}
}
+ } else if ((_OTHERQ(tq)->q_flag & _QSODIRECT) &&
+ uioasync.enabled) {
+ /* Enable sodirect */
+ so->so_state |= SS_SODIRECT;
}
}
} else {
diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h
index aa5ba3a075..26e1b12f4e 100644
--- a/usr/src/uts/common/inet/tcp.h
+++ b/usr/src/uts/common/inet/tcp.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -37,6 +37,7 @@ extern "C" {
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
+#include <sys/sodirect.h>
#include <sys/multidata.h>
#include <sys/md5.h>
#include <inet/common.h>
@@ -598,6 +599,13 @@ typedef struct tcp_s {
*/
boolean_t tcp_flow_stopped;
+ /*
+ * tcp_sodirect is used by tcp on the receive side to push mblk_t(s)
+ * directly to sockfs. Also, to schedule asynchronous copyout directly
+ * to a pending user-land uio buffer.
+ */
+ sodirect_t *tcp_sodirect;
+
#ifdef DEBUG
pc_t tcmp_stk[15];
#endif
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c
index 12b781c0bc..a729e2d066 100644
--- a/usr/src/uts/common/inet/tcp/tcp.c
+++ b/usr/src/uts/common/inet/tcp/tcp.c
@@ -66,6 +66,8 @@ const char tcp_version[] = "%Z%%M% %I% %E% SMI";
#include <sys/isa_defs.h>
#include <sys/md5.h>
#include <sys/random.h>
+#include <sys/sodirect.h>
+#include <sys/uio.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netinet/ip6.h>
@@ -216,6 +218,23 @@ const char tcp_version[] = "%Z%%M% %I% %E% SMI";
* behaviour. Once tcp_issocket is unset, its never set for the
* life of that connection.
*
+ * In support of on-board asynchronous DMA hardware (e.g. Intel I/OAT)
+ * two consoldiation private KAPIs are used to enqueue M_DATA mblk_t's
+ * directly to the socket (sodirect) and start an asynchronous copyout
+ * to a user-land receive-side buffer (uioa) when a blocking socket read
+ * (e.g. read, recv, ...) is pending.
+ *
+ * This is accomplished when tcp_issocket is set and tcp_sodirect is not
+ * NULL so points to an sodirect_t and if marked enabled then we enqueue
+ * all mblk_t's directly to the socket.
+ *
+ * Further, if the sodirect_t sod_uioa and if marked enabled (due to a
+ * blocking socket read, e.g. user-land read, recv, ...) then an asynchronous
+ * copyout will be started directly to the user-land uio buffer. Also, as we
+ * have a pending read, TCP's push logic can take into account the number of
+ * bytes to be received and only awake the blocked read()er when the uioa_t
+ * byte count has been satisfied.
+ *
* IPsec notes :
*
* Since a packet is always executed on the correct TCP perimeter
@@ -246,6 +265,37 @@ squeue_func_t tcp_squeue_close_proc;
squeue_func_t tcp_squeue_wput_proc;
/*
+ * Macros for sodirect:
+ *
+ * SOD_PTR_ENTER(tcp, sodp) - for the tcp_t pointer "tcp" set the
+ * sodirect_t pointer "sodp" to the socket/tcp shared sodirect_t
+ * if it exists and is enabled, else to NULL. Note, in the current
+ * sodirect implementation the sod_lock must not be held across any
+ * STREAMS call (e.g. putnext) else a "recursive mutex_enter" PANIC
+ * will result as sod_lock is the streamhead stdata.sd_lock.
+ *
+ * SOD_NOT_ENABLED(tcp) - return true if not a sodirect tcp_t or the
+ * sodirect_t isn't enabled, useful for ASSERT()ing that a receive
+ * side tcp code path dealing with a tcp_rcv_list or putnext() isn't
+ * being used when sodirect code paths should be.
+ */
+
+#define SOD_PTR_ENTER(tcp, sodp) \
+ (sodp) = (tcp)->tcp_sodirect; \
+ \
+ if ((sodp) != NULL) { \
+ mutex_enter((sodp)->sod_lock); \
+ if (!((sodp)->sod_state & SOD_ENABLED)) { \
+ mutex_exit((sodp)->sod_lock); \
+ (sodp) = NULL; \
+ } \
+ }
+
+#define SOD_NOT_ENABLED(tcp) \
+ ((tcp)->tcp_sodirect == NULL || \
+ !((tcp)->tcp_sodirect->sod_state & SOD_ENABLED))
+
+/*
* This controls how tiny a write must be before we try to copy it
* into the the mblk on the tail of the transmit queue. Not much
* speedup is observed for values larger than sixteen. Zero will
@@ -3808,6 +3858,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag)
mblk_t *mp;
queue_t *q;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ sodirect_t *sodp;
TCP_CLD_STAT(tag);
@@ -3872,6 +3923,13 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag)
return (-1);
}
+ /* If sodirect, not anymore */
+ SOD_PTR_ENTER(tcp, sodp);
+ if (sodp != NULL) {
+ tcp->tcp_sodirect = NULL;
+ mutex_exit(sodp->sod_lock);
+ }
+
q = tcp->tcp_rq;
/* Trash all inbound data */
@@ -4236,6 +4294,11 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2)
*/
/* FALLTHRU */
default:
+ if (tcp->tcp_sodirect != NULL) {
+ /* Ok, no more sodirect */
+ tcp->tcp_sodirect = NULL;
+ }
+
if (tcp->tcp_fused)
tcp_unfuse(tcp);
@@ -6381,6 +6444,15 @@ tcp_connect(tcp_t *tcp, mblk_t *mp)
*(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport;
}
+ if (tcp->tcp_issocket) {
+ /*
+ * TCP is _D_SODIRECT and sockfs is directly above so save
+ * the shared sonode sodirect_t pointer (if any) to enable
+ * TCP sodirect.
+ */
+ tcp->tcp_sodirect = SOD_QTOSODP(tcp->tcp_rq);
+ }
+
switch (tcp->tcp_state) {
case TCPS_IDLE:
/*
@@ -8190,6 +8262,9 @@ tcp_reinit_values(tcp)
ASSERT(!tcp->tcp_kssl_pending);
PRESERVE(tcp->tcp_kssl_ent);
+ /* Sodirect */
+ tcp->tcp_sodirect = NULL;
+
tcp->tcp_closemp_used = B_FALSE;
#ifdef DEBUG
@@ -8282,6 +8357,9 @@ tcp_init_values(tcp_t *tcp)
tcp->tcp_fuse_rcv_unread_hiwater = 0;
tcp->tcp_fuse_rcv_unread_cnt = 0;
+ /* Sodirect */
+ tcp->tcp_sodirect = NULL;
+
/* Initialize the header template */
if (tcp->tcp_ipversion == IPV4_VERSION) {
err = tcp_header_init_ipv4(tcp);
@@ -11691,6 +11769,9 @@ tcp_rcv_drain(queue_t *q, tcp_t *tcp)
if (tcp->tcp_listener != NULL)
return (ret);
+ /* Can't be sodirect enabled */
+ ASSERT(SOD_NOT_ENABLED(tcp));
+
/*
* Handle two cases here: we are currently fused or we were
* previously fused and have some urgent data to be delivered
@@ -11770,6 +11851,9 @@ tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len)
ASSERT(seg_len == msgdsize(mp));
ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_rcv_last_head != NULL);
+ /* Can't be sodirect enabled */
+ ASSERT(SOD_NOT_ENABLED(tcp));
+
if (tcp->tcp_rcv_list == NULL) {
ASSERT(tcp->tcp_rcv_last_head == NULL);
tcp->tcp_rcv_list = mp;
@@ -11790,6 +11874,216 @@ tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len)
}
/*
+ * The tcp_rcv_sod_XXX() functions enqueue data directly to the socket
+ * above, in addition when uioa is enabled schedule an asynchronous uio
+ * prior to enqueuing. They implement the combined semantics of the
+ * tcp_rcv_XXX() functions, tcp_rcv_list push logic, and STREAMS putnext()
+ * canputnext(), i.e. flow-control with backenable.
+ *
+ * tcp_sod_wakeup() is called where tcp_rcv_drain() would be called in the
+ * non sodirect connection but as there are no tcp_rcv_list mblk_t's we deal
+ * with the rcv_wnd and push timer and call the sodirect wakeup function.
+ *
+ * Must be called with sodp->sod_lock held and will return with the lock
+ * released.
+ */
+static uint_t
+tcp_rcv_sod_wakeup(tcp_t *tcp, sodirect_t *sodp)
+{
+ queue_t *q = tcp->tcp_rq;
+ uint_t thwin;
+ tcp_stack_t *tcps = tcp->tcp_tcps;
+ uint_t ret = 0;
+
+ /* Can't be an eager connection */
+ ASSERT(tcp->tcp_listener == NULL);
+
+ /* Caller must have lock held */
+ ASSERT(MUTEX_HELD(sodp->sod_lock));
+
+ /* Sodirect mode so must not be a tcp_rcv_list */
+ ASSERT(tcp->tcp_rcv_list == NULL);
+
+ if (SOD_QFULL(sodp)) {
+ /* Q is full, mark Q for need backenable */
+ SOD_QSETBE(sodp);
+ }
+ /* Last advertised rwnd, i.e. rwnd last sent in a packet */
+ thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win))
+ << tcp->tcp_rcv_ws;
+ /* This is peer's calculated send window (our available rwnd). */
+ thwin -= tcp->tcp_rnxt - tcp->tcp_rack;
+ /*
+ * Increase the receive window to max. But we need to do receiver
+ * SWS avoidance. This means that we need to check the increase of
+ * receive window is at least 1 MSS.
+ */
+ if (!SOD_QFULL(sodp) && (q->q_hiwat - thwin >= tcp->tcp_mss)) {
+ /*
+ * If the window that the other side knows is less than max
+ * deferred acks segments, send an update immediately.
+ */
+ if (thwin < tcp->tcp_rack_cur_max * tcp->tcp_mss) {
+ BUMP_MIB(&tcps->tcps_mib, tcpOutWinUpdate);
+ ret = TH_ACK_NEEDED;
+ }
+ tcp->tcp_rwnd = q->q_hiwat;
+ }
+
+ if (!SOD_QEMPTY(sodp)) {
+ /* Wakeup to socket */
+ sodp->sod_state &= SOD_WAKE_CLR;
+ sodp->sod_state |= SOD_WAKE_DONE;
+ (sodp->sod_wakeup)(sodp);
+ /* wakeup() does the mutex_exit() */
+ } else {
+ /* Q is empty, no need to wake */
+ sodp->sod_state &= SOD_WAKE_CLR;
+ sodp->sod_state |= SOD_WAKE_NOT;
+ mutex_exit(sodp->sod_lock);
+ }
+
+ /* No need for the push timer now. */
+ if (tcp->tcp_push_tid != 0) {
+ (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
+ tcp->tcp_push_tid = 0;
+ }
+
+ return (ret);
+}
+
+/*
+ * Called where tcp_rcv_enqueue()/putnext(RD(q)) would be. For M_DATA
+ * mblk_t's if uioa enabled then start a uioa asynchronous copy directly
+ * to the user-land buffer and flag the mblk_t as such.
+ *
+ * Also, handle tcp_rwnd.
+ */
+uint_t
+tcp_rcv_sod_enqueue(tcp_t *tcp, sodirect_t *sodp, mblk_t *mp, uint_t seg_len)
+{
+ uioa_t *uioap = &sodp->sod_uioa;
+ boolean_t qfull;
+ uint_t thwin;
+
+ /* Can't be an eager connection */
+ ASSERT(tcp->tcp_listener == NULL);
+
+ /* Caller must have lock held */
+ ASSERT(MUTEX_HELD(sodp->sod_lock));
+
+ /* Sodirect mode so must not be a tcp_rcv_list */
+ ASSERT(tcp->tcp_rcv_list == NULL);
+
+ /* Passed in segment length must be equal to mblk_t chain data size */
+ ASSERT(seg_len == msgdsize(mp));
+
+ if (DB_TYPE(mp) != M_DATA) {
+ /* Only process M_DATA mblk_t's */
+ goto enq;
+ }
+ if (uioap->uioa_state & UIOA_ENABLED) {
+ /* Uioa is enabled */
+ mblk_t *mp1 = mp;
+
+ if (seg_len > uioap->uio_resid) {
+ /*
+ * There isn't enough uio space for the mblk_t chain
+ * so disable uioa such that this and any additional
+ * mblk_t data is handled by the socket and schedule
+ * the socket for wakeup to finish this uioa.
+ */
+ uioap->uioa_state &= UIOA_CLR;
+ uioap->uioa_state |= UIOA_FINI;
+ if (sodp->sod_state & SOD_WAKE_NOT) {
+ sodp->sod_state &= SOD_WAKE_CLR;
+ sodp->sod_state |= SOD_WAKE_NEED;
+ }
+ goto enq;
+ }
+ do {
+ uint32_t len = MBLKL(mp1);
+
+ if (!uioamove(mp1->b_rptr, len, UIO_READ, uioap)) {
+ /* Scheduled, mark dblk_t as such */
+ DB_FLAGS(mp1) |= DBLK_UIOA;
+ } else {
+ /* Error, turn off async processing */
+ uioap->uioa_state &= UIOA_CLR;
+ uioap->uioa_state |= UIOA_FINI;
+ break;
+ }
+ } while ((mp1 = mp1->b_cont) != NULL);
+
+ if (mp1 != NULL || uioap->uio_resid == 0) {
+ /*
+ * Not all mblk_t(s) uioamoved (error) or all uio
+ * space has been consumed so schedule the socket
+ * for wakeup to finish this uio.
+ */
+ sodp->sod_state &= SOD_WAKE_CLR;
+ sodp->sod_state |= SOD_WAKE_NEED;
+ }
+ } else if (uioap->uioa_state & UIOA_FINI) {
+ /*
+ * Post UIO_ENABLED waiting for socket to finish processing
+ * so just enqueue and update tcp_rwnd.
+ */
+ if (SOD_QFULL(sodp))
+ tcp->tcp_rwnd -= seg_len;
+ } else if (sodp->sod_want > 0) {
+ /*
+ * Uioa isn't enabled but sodirect has a pending read().
+ */
+ if (SOD_QCNT(sodp) + seg_len >= sodp->sod_want) {
+ if (sodp->sod_state & SOD_WAKE_NOT) {
+ /* Schedule socket for wakeup */
+ sodp->sod_state &= SOD_WAKE_CLR;
+ sodp->sod_state |= SOD_WAKE_NEED;
+ }
+ tcp->tcp_rwnd -= seg_len;
+ }
+ } else if (SOD_QCNT(sodp) + seg_len >= tcp->tcp_rq->q_hiwat >> 3) {
+ /*
+ * No pending sodirect read() so used the default
+ * TCP push logic to guess that a push is needed.
+ */
+ if (sodp->sod_state & SOD_WAKE_NOT) {
+ /* Schedule socket for wakeup */
+ sodp->sod_state &= SOD_WAKE_CLR;
+ sodp->sod_state |= SOD_WAKE_NEED;
+ }
+ tcp->tcp_rwnd -= seg_len;
+ } else {
+ /* Just update tcp_rwnd */
+ tcp->tcp_rwnd -= seg_len;
+ }
+enq:
+ qfull = SOD_QFULL(sodp);
+
+ (sodp->sod_enqueue)(sodp, mp);
+
+ if (! qfull && SOD_QFULL(sodp)) {
+ /* Wasn't QFULL, now QFULL, need back-enable */
+ SOD_QSETBE(sodp);
+ }
+
+ /*
+ * Check to see if remote avail swnd < mss due to delayed ACK,
+ * first get advertised rwnd.
+ */
+ thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win));
+ /* Minus delayed ACK count */
+ thwin -= tcp->tcp_rnxt - tcp->tcp_rack;
+ if (thwin < tcp->tcp_mss) {
+ /* Remote avail swnd < mss, need ACK now */
+ return (TH_ACK_NEEDED);
+ }
+
+ return (0);
+}
+
+/*
* DEFAULT TCP ENTRY POINT via squeue on READ side.
*
* This is the default entry function into TCP on the read side. TCP is
@@ -14987,13 +15281,39 @@ est:
tcp_rcv_enqueue(tcp, mp, seg_len);
}
} else {
+ sodirect_t *sodp = tcp->tcp_sodirect;
+
+ /*
+ * If an sodirect connection and an enabled sodirect_t then
+ * sodp will be set to point to the tcp_t/sonode_t shared
+ * sodirect_t and the sodirect_t's lock will be held.
+ */
+ if (sodp != NULL) {
+ mutex_enter(sodp->sod_lock);
+ if (!(sodp->sod_state & SOD_ENABLED)) {
+ mutex_exit(sodp->sod_lock);
+ sodp = NULL;
+ } else if (tcp->tcp_kssl_ctx != NULL &&
+ DB_TYPE(mp) == M_DATA) {
+ mutex_exit(sodp->sod_lock);
+ sodp = NULL;
+ }
+ }
if (mp->b_datap->db_type != M_DATA ||
(flags & TH_MARKNEXT_NEEDED)) {
- if (tcp->tcp_rcv_list != NULL) {
+ if (sodp != NULL) {
+ if (!SOD_QEMPTY(sodp) &&
+ (sodp->sod_state & SOD_WAKE_NOT)) {
+ flags |= tcp_rcv_sod_wakeup(tcp, sodp);
+ /* sod_wakeup() did the mutex_exit() */
+ mutex_enter(sodp->sod_lock);
+ }
+ } else if (tcp->tcp_rcv_list != NULL) {
flags |= tcp_rcv_drain(tcp->tcp_rq, tcp);
}
ASSERT(tcp->tcp_rcv_list == NULL ||
tcp->tcp_fused_sigurg);
+
if (flags & TH_MARKNEXT_NEEDED) {
#ifdef DEBUG
(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
@@ -15011,11 +15331,40 @@ est:
DTRACE_PROBE1(kssl_mblk__ksslinput_data1,
mblk_t *, mp);
tcp_kssl_input(tcp, mp);
+ } else if (sodp) {
+ flags |= tcp_rcv_sod_enqueue(
+ tcp, sodp, mp, seg_len);
+ flags |= tcp_rcv_sod_wakeup(tcp, sodp);
+ /* sod_wakeup() did the mutex_exit() */
} else {
putnext(tcp->tcp_rq, mp);
if (!canputnext(tcp->tcp_rq))
tcp->tcp_rwnd -= seg_len;
}
+ } else if ((tcp->tcp_kssl_ctx != NULL) &&
+ (DB_TYPE(mp) == M_DATA)) {
+ /* Do SSL processing first */
+ DTRACE_PROBE1(kssl_mblk__ksslinput_data2,
+ mblk_t *, mp);
+ tcp_kssl_input(tcp, mp);
+ } else if (sodp != NULL) {
+ /*
+ * Sodirect so all mblk_t's are queued on the
+ * socket directly, check for wakeup of blocked
+ * reader (if any), and last if flow-controled.
+ */
+ flags |= tcp_rcv_sod_enqueue(tcp, sodp, mp, seg_len);
+ if ((sodp->sod_state & SOD_WAKE_NEED) ||
+ (flags & (TH_PUSH|TH_FIN))) {
+ flags |= tcp_rcv_sod_wakeup(tcp, sodp);
+ /* sod_wakeup() did the mutex_exit() */
+ } else {
+ if (SOD_QFULL(sodp)) {
+ /* Q is full, need backenable */
+ SOD_QSETBE(sodp);
+ }
+ mutex_exit(sodp->sod_lock);
+ }
} else if ((flags & (TH_PUSH|TH_FIN)) ||
tcp->tcp_rcv_cnt + seg_len >= tcp->tcp_rq->q_hiwat >> 3) {
if (tcp->tcp_rcv_list != NULL) {
@@ -15035,41 +15384,33 @@ est:
tcp_rcv_enqueue(tcp, mp, seg_len);
flags |= tcp_rcv_drain(tcp->tcp_rq, tcp);
} else {
- /* Does this need SSL processing first? */
- if ((tcp->tcp_kssl_ctx != NULL) &&
- (DB_TYPE(mp) == M_DATA)) {
- DTRACE_PROBE1(
- kssl_mblk__ksslinput_data2,
- mblk_t *, mp);
- tcp_kssl_input(tcp, mp);
- } else {
- putnext(tcp->tcp_rq, mp);
- if (!canputnext(tcp->tcp_rq))
- tcp->tcp_rwnd -= seg_len;
- }
+ putnext(tcp->tcp_rq, mp);
+ if (!canputnext(tcp->tcp_rq))
+ tcp->tcp_rwnd -= seg_len;
}
} else {
/*
* Enqueue all packets when processing an mblk
* from the co queue and also enqueue normal packets.
- * For packets which belong to SSL stream do SSL
- * processing first.
*/
- if ((tcp->tcp_kssl_ctx != NULL) &&
- (DB_TYPE(mp) == M_DATA)) {
- DTRACE_PROBE1(kssl_mblk__tcpksslin3,
- mblk_t *, mp);
- tcp_kssl_input(tcp, mp);
- } else {
- tcp_rcv_enqueue(tcp, mp, seg_len);
- }
+ tcp_rcv_enqueue(tcp, mp, seg_len);
}
/*
* Make sure the timer is running if we have data waiting
* for a push bit. This provides resiliency against
* implementations that do not correctly generate push bits.
+ *
+ * Note, for sodirect if Q isn't empty and there's not a
+ * pending wakeup then we need a timer. Also note that sodp
+ * is assumed to be still valid after exit()ing the sod_lock
+ * above and while the SOD state can change it can only change
+ * such that the Q is empty now even though data was added
+ * above.
*/
- if (tcp->tcp_rcv_list != NULL && tcp->tcp_push_tid == 0) {
+ if (((sodp != NULL && !SOD_QEMPTY(sodp) &&
+ (sodp->sod_state & SOD_WAKE_NOT)) ||
+ (sodp == NULL && tcp->tcp_rcv_list != NULL)) &&
+ tcp->tcp_push_tid == 0) {
/*
* The connection may be closed at this point, so don't
* do anything for a detached tcp.
@@ -15081,6 +15422,7 @@ est:
tcps->tcps_push_timer_interval));
}
}
+
xmit_check:
/* Is there anything left to do? */
ASSERT(!(flags & TH_MARKNEXT_NEEDED));
@@ -15156,13 +15498,27 @@ ack_check:
/*
* Send up any queued data and then send the mark message
*/
- if (tcp->tcp_rcv_list != NULL) {
- flags |= tcp_rcv_drain(tcp->tcp_rq, tcp);
- }
- ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
+ sodirect_t *sodp;
+
+ SOD_PTR_ENTER(tcp, sodp);
mp1 = tcp->tcp_urp_mark_mp;
tcp->tcp_urp_mark_mp = NULL;
+ if (sodp != NULL) {
+
+ ASSERT(tcp->tcp_rcv_list == NULL);
+
+ flags |= tcp_rcv_sod_enqueue(tcp, sodp, mp1, 0);
+ flags |= tcp_rcv_sod_wakeup(tcp, sodp);
+ /* sod_wakeup() does the mutex_exit() */
+ } else if (tcp->tcp_rcv_list != NULL) {
+ flags |= tcp_rcv_drain(tcp->tcp_rq, tcp);
+
+ ASSERT(tcp->tcp_rcv_list == NULL ||
+ tcp->tcp_fused_sigurg);
+
+ putnext(tcp->tcp_rq, mp1);
+ }
#ifdef DEBUG
(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
"tcp_rput: sending zero-length %s %s",
@@ -15170,7 +15526,6 @@ ack_check:
"MSGNOTMARKNEXT"),
tcp_display(tcp, NULL, DISP_PORT_ONLY));
#endif /* DEBUG */
- putnext(tcp->tcp_rq, mp1);
flags &= ~TH_SEND_URP_MARK;
}
if (flags & TH_ACK_NEEDED) {
@@ -15208,14 +15563,32 @@ ack_check:
* In the eager case tcp_rsrv will do this when run
* after tcp_accept is done.
*/
+ sodirect_t *sodp;
+
ASSERT(tcp->tcp_listener == NULL);
- if (tcp->tcp_rcv_list != NULL) {
+
+ SOD_PTR_ENTER(tcp, sodp);
+ if (sodp != NULL) {
+ /* No more sodirect */
+ tcp->tcp_sodirect = NULL;
+ if (!SOD_QEMPTY(sodp)) {
+ /* Mblk(s) to process, notify */
+ flags |= tcp_rcv_sod_wakeup(tcp, sodp);
+ /* sod_wakeup() does the mutex_exit() */
+ } else {
+ /* Nothing to process */
+ mutex_exit(sodp->sod_lock);
+ }
+ } else if (tcp->tcp_rcv_list != NULL) {
/*
* Push any mblk(s) enqueued from co processing.
*/
flags |= tcp_rcv_drain(tcp->tcp_rq, tcp);
+
+ ASSERT(tcp->tcp_rcv_list == NULL ||
+ tcp->tcp_fused_sigurg);
}
- ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
+
if ((mp1 = mi_tpi_ordrel_ind()) != NULL) {
tcp->tcp_ordrel_done = B_TRUE;
putnext(tcp->tcp_rq, mp1);
@@ -15985,6 +16358,8 @@ tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2)
queue_t *q = tcp->tcp_rq;
uint_t thwin;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ sodirect_t *sodp;
+ boolean_t fc;
freeb(mp);
@@ -16035,7 +16410,27 @@ tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2)
return;
}
- if (canputnext(q)) {
+ SOD_PTR_ENTER(tcp, sodp);
+ if (sodp != NULL) {
+ /* An sodirect connection */
+ if (SOD_QFULL(sodp)) {
+ /* Flow-controlled, need another back-enable */
+ fc = B_TRUE;
+ SOD_QSETBE(sodp);
+ } else {
+ /* Not flow-controlled */
+ fc = B_FALSE;
+ }
+ mutex_exit(sodp->sod_lock);
+ } else if (canputnext(q)) {
+ /* STREAMS, not flow-controlled */
+ fc = B_FALSE;
+ } else {
+ /* STREAMS, flow-controlled */
+ fc = B_TRUE;
+ }
+ if (!fc) {
+ /* Not flow-controlled, open rwnd */
tcp->tcp_rwnd = q->q_hiwat;
thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win))
<< tcp->tcp_rcv_ws;
@@ -16054,13 +16449,32 @@ tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2)
BUMP_MIB(&tcps->tcps_mib, tcpOutWinUpdate);
}
}
+
/* Handle a failure to allocate a T_ORDREL_IND here */
if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
ASSERT(tcp->tcp_listener == NULL);
- if (tcp->tcp_rcv_list != NULL) {
- (void) tcp_rcv_drain(q, tcp);
+
+ SOD_PTR_ENTER(tcp, sodp);
+ if (sodp != NULL) {
+ /* No more sodirect */
+ tcp->tcp_sodirect = NULL;
+ if (!SOD_QEMPTY(sodp)) {
+ /* Notify mblk(s) to process */
+ (void) tcp_rcv_sod_wakeup(tcp, sodp);
+ /* sod_wakeup() does the mutex_exit() */
+ } else {
+ /* Nothing to process */
+ mutex_exit(sodp->sod_lock);
+ }
+ } else if (tcp->tcp_rcv_list != NULL) {
+ /*
+ * Push any mblk(s) enqueued from co processing.
+ */
+ (void) tcp_rcv_drain(tcp->tcp_rq, tcp);
+ ASSERT(tcp->tcp_rcv_list == NULL ||
+ tcp->tcp_fused_sigurg);
}
- ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
+
mp = mi_tpi_ordrel_ind();
if (mp) {
tcp->tcp_ordrel_done = B_TRUE;
@@ -18108,6 +18522,8 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
*/
if (tcp->tcp_rcv_list != NULL) {
/* We drain directly in case of fused tcp loopback */
+ sodirect_t *sodp;
+
if (!tcp->tcp_fused && canputnext(q)) {
tcp->tcp_rwnd = q->q_hiwat;
thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win))
@@ -18123,7 +18539,26 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
}
}
- (void) tcp_rcv_drain(q, tcp);
+
+ SOD_PTR_ENTER(tcp, sodp);
+ if (sodp != NULL) {
+ /* Sodirect, move from rcv_list */
+ ASSERT(!tcp->tcp_fused);
+ while ((mp = tcp->tcp_rcv_list) != NULL) {
+ tcp->tcp_rcv_list = mp->b_next;
+ mp->b_next = NULL;
+ (void) tcp_rcv_sod_enqueue(tcp, sodp, mp,
+ msgdsize(mp));
+ }
+ tcp->tcp_rcv_last_head = NULL;
+ tcp->tcp_rcv_last_tail = NULL;
+ tcp->tcp_rcv_cnt = 0;
+ (void) tcp_rcv_sod_wakeup(tcp, sodp);
+ /* sod_wakeup() did the mutex_exit() */
+ } else {
+ /* Not sodirect, drain */
+ (void) tcp_rcv_drain(q, tcp);
+ }
/*
* For fused tcp loopback, back-enable peer endpoint
@@ -18315,6 +18750,21 @@ tcp_wput_accept(queue_t *q, mblk_t *mp)
listener = eager->tcp_listener;
eager->tcp_issocket = B_TRUE;
+ /*
+ * TCP is _D_SODIRECT and sockfs is directly above so
+ * save shared sodirect_t pointer (if any).
+ *
+ * If tcp_fused and sodirect enabled disable it.
+ */
+ eager->tcp_sodirect = SOD_QTOSODP(eager->tcp_rq);
+ if (eager->tcp_fused && eager->tcp_sodirect != NULL) {
+ /* Fused, disable sodirect */
+ mutex_enter(eager->tcp_sodirect->sod_lock);
+ SOD_DISABLE(eager->tcp_sodirect);
+ mutex_exit(eager->tcp_sodirect->sod_lock);
+ eager->tcp_sodirect = NULL;
+ }
+
econnp->conn_zoneid = listener->tcp_connp->conn_zoneid;
econnp->conn_allzones = listener->tcp_connp->conn_allzones;
ASSERT(econnp->conn_netstack ==
@@ -22101,6 +22551,7 @@ tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2)
tcp_fuse_disable_pair(tcp, B_FALSE);
}
tcp->tcp_issocket = B_FALSE;
+ tcp->tcp_sodirect = NULL;
TCP_STAT(tcps, tcp_sock_fallback);
DB_TYPE(mp) = M_IOCACK;
@@ -23383,6 +23834,8 @@ tcp_push_timer(void *arg)
conn_t *connp = (conn_t *)arg;
tcp_t *tcp = connp->conn_tcp;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ uint_t flags = 0; /* must init: only set on the sodirect/rcv_list paths below */
+ sodirect_t *sodp;
TCP_DBGSTAT(tcps, tcp_push_timer_cnt);
@@ -23394,9 +23847,17 @@ tcp_push_timer(void *arg)
*/
TCP_FUSE_SYNCSTR_PLUG_DRAIN(tcp);
tcp->tcp_push_tid = 0;
- if ((tcp->tcp_rcv_list != NULL) &&
- (tcp_rcv_drain(tcp->tcp_rq, tcp) == TH_ACK_NEEDED))
+
+ SOD_PTR_ENTER(tcp, sodp);
+ if (sodp != NULL) {
+ flags = tcp_rcv_sod_wakeup(tcp, sodp);
+ /* sod_wakeup() does the mutex_exit() */
+ } else if (tcp->tcp_rcv_list != NULL) {
+ flags = tcp_rcv_drain(tcp->tcp_rq, tcp);
+ }
+ if (flags == TH_ACK_NEEDED)
tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
+
TCP_FUSE_SYNCSTR_UNPLUG_DRAIN(tcp);
}
diff --git a/usr/src/uts/common/inet/tcp/tcp6ddi.c b/usr/src/uts/common/inet/tcp/tcp6ddi.c
index e724bdd022..1eda50d9a6 100644
--- a/usr/src/uts/common/inet/tcp/tcp6ddi.c
+++ b/usr/src/uts/common/inet/tcp/tcp6ddi.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -40,7 +40,7 @@
* for TCP Fusion (loopback); this is why we don't define
* D_SYNCSTR here.
*/
-#define INET_DEVMTFLAGS (D_MP|_D_DIRECT)
+#define INET_DEVMTFLAGS (D_MP|_D_DIRECT|_D_SODIRECT)
#include "../inetddi.c"
diff --git a/usr/src/uts/common/inet/tcp/tcp_fusion.c b/usr/src/uts/common/inet/tcp/tcp_fusion.c
index 2503a13e29..75851ac1f7 100644
--- a/usr/src/uts/common/inet/tcp/tcp_fusion.c
+++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c
@@ -287,6 +287,15 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph)
if ((mp = allocb(sizeof (*stropt), BPRI_HI)) == NULL)
goto failed;
+ /* If peer sodirect enabled then disable */
+ ASSERT(tcp->tcp_sodirect == NULL);
+ if (peer_tcp->tcp_sodirect != NULL) {
+ mutex_enter(peer_tcp->tcp_sodirect->sod_lock);
+ SOD_DISABLE(peer_tcp->tcp_sodirect);
+ mutex_exit(peer_tcp->tcp_sodirect->sod_lock);
+ peer_tcp->tcp_sodirect = NULL;
+ }
+
/* Fuse both endpoints */
peer_tcp->tcp_loopback_peer = tcp;
tcp->tcp_loopback_peer = peer_tcp;
diff --git a/usr/src/uts/common/inet/tcp/tcpddi.c b/usr/src/uts/common/inet/tcp/tcpddi.c
index 436786b846..dc08ad23c4 100644
--- a/usr/src/uts/common/inet/tcp/tcpddi.c
+++ b/usr/src/uts/common/inet/tcp/tcpddi.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -44,7 +44,7 @@
* for TCP Fusion (loopback); this is why we don't define
* D_SYNCSTR here.
*/
-#define INET_DEVMTFLAGS (D_MP|_D_DIRECT)
+#define INET_DEVMTFLAGS (D_MP|_D_DIRECT|_D_SODIRECT)
#include "../inetddi.c"
diff --git a/usr/src/uts/common/io/dcopy.c b/usr/src/uts/common/io/dcopy.c
new file mode 100644
index 0000000000..2dc5a311bc
--- /dev/null
+++ b/usr/src/uts/common/io/dcopy.c
@@ -0,0 +1,932 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * dcopy.c
+ * dcopy misc module
+ */
+
+#include <sys/conf.h>
+#include <sys/kmem.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/modctl.h>
+#include <sys/sysmacros.h>
+#include <sys/atomic.h>
+
+
+#include <sys/dcopy.h>
+#include <sys/dcopy_device.h>
+
+
+/* Number of entries per channel to allocate */
+uint_t dcopy_channel_size = 1024;
+
+
+typedef struct dcopy_list_s {
+ list_t dl_list;
+ kmutex_t dl_mutex;
+ uint_t dl_cnt; /* num entries on list */
+} dcopy_list_t;
+
+/* device state for register/unregister */
+struct dcopy_device_s {
+ /* DMA device drivers private pointer */
+ void *dc_device_private;
+
+ /* to track list of channels from this DMA device */
+ dcopy_list_t dc_devchan_list;
+ list_node_t dc_device_list_node;
+
+ /*
+ * dc_removing_cnt track how many channels still have to be freed up
+ * before it's safe to allow the DMA device driver to detach.
+ */
+ uint_t dc_removing_cnt;
+ dcopy_device_cb_t *dc_cb;
+
+ dcopy_device_info_t dc_info;
+
+};
+
+typedef struct dcopy_stats_s {
+ kstat_named_t cs_bytes_xfer;
+ kstat_named_t cs_cmd_alloc;
+ kstat_named_t cs_cmd_post;
+ kstat_named_t cs_cmd_poll;
+ kstat_named_t cs_notify_poll;
+ kstat_named_t cs_notify_pending;
+ kstat_named_t cs_id;
+ kstat_named_t cs_capabilities;
+} dcopy_stats_t;
+
+/* DMA channel state */
+struct dcopy_channel_s {
+ /* DMA driver channel private pointer */
+ void *ch_channel_private;
+
+ /* shortcut to device callbacks */
+ dcopy_device_cb_t *ch_cb;
+
+ /*
+ * number of outstanding allocs for this channel. used to track when
+ * it's safe to free up this channel so the DMA device driver can
+ * detach.
+ */
+ uint64_t ch_ref_cnt;
+
+ /* state for if channel needs to be removed when ch_ref_cnt gets to 0 */
+ boolean_t ch_removing;
+
+ list_node_t ch_devchan_list_node;
+ list_node_t ch_globalchan_list_node;
+
+ /*
+ * per channel list of commands actively blocking waiting for
+ * completion.
+ */
+ dcopy_list_t ch_poll_list;
+
+ /* pointer back to our device */
+ struct dcopy_device_s *ch_device;
+
+ dcopy_query_channel_t ch_info;
+
+ kstat_t *ch_kstat;
+ dcopy_stats_t ch_stat;
+};
+
+/*
+ * If grabbing both device_list mutex & globalchan_list mutex,
+ * Always grab globalchan_list mutex before device_list mutex
+ */
+typedef struct dcopy_state_s {
+ dcopy_list_t d_device_list;
+ dcopy_list_t d_globalchan_list;
+} dcopy_state_t;
+dcopy_state_t *dcopy_statep;
+
+
+/* Module Driver Info */
+static struct modlmisc dcopy_modlmisc = {
+ &mod_miscops,
+ "dcopy kernel module"
+};
+
+/* Module Linkage */
+static struct modlinkage dcopy_modlinkage = {
+ MODREV_1,
+ &dcopy_modlmisc,
+ NULL
+};
+
+static int dcopy_init();
+static void dcopy_fini();
+
+static int dcopy_list_init(dcopy_list_t *list, size_t node_size,
+ offset_t link_offset);
+static void dcopy_list_fini(dcopy_list_t *list);
+static void dcopy_list_push(dcopy_list_t *list, void *list_node);
+static void *dcopy_list_pop(dcopy_list_t *list);
+
+static void dcopy_device_cleanup(dcopy_device_handle_t device,
+ boolean_t do_callback);
+
+static int dcopy_stats_init(dcopy_handle_t channel);
+static void dcopy_stats_fini(dcopy_handle_t channel);
+
+
+/*
+ * _init()
+ */
+int
+_init()
+{
+ int e;
+
+ e = dcopy_init();
+ if (e != 0) {
+ return (e);
+ }
+
+ return (mod_install(&dcopy_modlinkage));
+}
+
+
+/*
+ * _info()
+ */
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&dcopy_modlinkage, modinfop));
+}
+
+
+/*
+ * _fini()
+ */
+int
+_fini()
+{
+ int e;
+
+ e = mod_remove(&dcopy_modlinkage);
+ if (e != 0) {
+ return (e);
+ }
+
+ dcopy_fini();
+
+ return (e);
+}
+
+/*
+ * dcopy_init()
+ */
+static int
+dcopy_init()
+{
+ int e;
+
+
+ dcopy_statep = kmem_zalloc(sizeof (*dcopy_statep), KM_SLEEP);
+
+ /* Initialize the list we use to track device register/unregister */
+ e = dcopy_list_init(&dcopy_statep->d_device_list,
+ sizeof (struct dcopy_device_s),
+ offsetof(struct dcopy_device_s, dc_device_list_node));
+ if (e != DCOPY_SUCCESS) {
+ goto dcopyinitfail_device;
+ }
+
+ /* Initialize the list we use to track all DMA channels */
+ e = dcopy_list_init(&dcopy_statep->d_globalchan_list,
+ sizeof (struct dcopy_channel_s),
+ offsetof(struct dcopy_channel_s, ch_globalchan_list_node));
+ if (e != DCOPY_SUCCESS) {
+ goto dcopyinitfail_global;
+ }
+
+ return (0);
+
+dcopyinitfail_cback:
+ dcopy_list_fini(&dcopy_statep->d_globalchan_list);
+dcopyinitfail_global:
+ dcopy_list_fini(&dcopy_statep->d_device_list);
+dcopyinitfail_device:
+ kmem_free(dcopy_statep, sizeof (*dcopy_statep));
+
+ return (-1);
+}
+
+
+/*
+ * dcopy_fini()
+ */
+static void
+dcopy_fini()
+{
+ /*
+ * if mod_remove was successful, we shouldn't have any
+ * devices/channels to worry about.
+ */
+ ASSERT(list_head(&dcopy_statep->d_globalchan_list.dl_list) == NULL);
+ ASSERT(list_head(&dcopy_statep->d_device_list.dl_list) == NULL);
+
+ dcopy_list_fini(&dcopy_statep->d_globalchan_list);
+ dcopy_list_fini(&dcopy_statep->d_device_list);
+ kmem_free(dcopy_statep, sizeof (*dcopy_statep));
+}
+
+
+/* *** EXTERNAL INTERFACE *** */
+/*
+ * dcopy_query()
+ */
+void
+dcopy_query(dcopy_query_t *query)
+{
+ query->dq_version = DCOPY_QUERY_V0;
+ query->dq_num_channels = dcopy_statep->d_globalchan_list.dl_cnt;
+}
+
+
+/*
+ * dcopy_alloc()
+ */
+/*ARGSUSED*/
+int
+dcopy_alloc(int flags, dcopy_handle_t *handle)
+{
+ dcopy_handle_t channel;
+ dcopy_list_t *list;
+
+
+ /*
+ * we don't use the dcopy_list_* code here because we need to do
+ * some non-standard stuff.
+ */
+
+ list = &dcopy_statep->d_globalchan_list;
+
+ /*
+ * if nothing is on the channel list, return DCOPY_NORESOURCES. This
+ * can happen if there aren't any DMA devices registered.
+ */
+ mutex_enter(&list->dl_mutex);
+ channel = list_head(&list->dl_list);
+ if (channel == NULL) {
+ mutex_exit(&list->dl_mutex);
+ return (DCOPY_NORESOURCES);
+ }
+
+ /*
+ * increment the reference count, and pop the channel off the head and
+ * push it on the tail. This ensures we rotate through the channels.
+ * DMA channels are shared.
+ */
+ channel->ch_ref_cnt++;
+ list_remove(&list->dl_list, channel);
+ list_insert_tail(&list->dl_list, channel);
+ mutex_exit(&list->dl_mutex);
+
+ *handle = (dcopy_handle_t)channel;
+ return (DCOPY_SUCCESS);
+}
+
+
+/*
+ * dcopy_free()
+ */
+void
+dcopy_free(dcopy_handle_t *channel)
+{
+ dcopy_device_handle_t device;
+ dcopy_list_t *list;
+ boolean_t cleanup = B_FALSE; /* init: read below even if not removing */
+
+
+ ASSERT(*channel != NULL);
+
+ /*
+ * we don't need to add the channel back to the list since we never
+ * removed it. decrement the reference count.
+ */
+ list = &dcopy_statep->d_globalchan_list;
+ mutex_enter(&list->dl_mutex);
+ (*channel)->ch_ref_cnt--;
+
+ /*
+ * if we need to remove this channel, and the reference count is down
+ * to 0, decrement the number of channels which still need to be
+ * removed on the device.
+ */
+ if ((*channel)->ch_removing && ((*channel)->ch_ref_cnt == 0)) {
+ cleanup = B_FALSE;
+ device = (*channel)->ch_device;
+ mutex_enter(&device->dc_devchan_list.dl_mutex);
+ device->dc_removing_cnt--;
+ if (device->dc_removing_cnt == 0) {
+ cleanup = B_TRUE;
+ }
+ mutex_exit(&device->dc_devchan_list.dl_mutex);
+ }
+ mutex_exit(&list->dl_mutex);
+
+ /*
+ * if there are no channels which still need to be removed, cleanup the
+ * device state and call back into the DMA device driver to tell them
+ * the device is free.
+ */
+ if (cleanup) {
+ dcopy_device_cleanup(device, B_TRUE);
+ }
+
+ *channel = NULL;
+}
+
+
+/*
+ * dcopy_query_channel()
+ */
+void
+dcopy_query_channel(dcopy_handle_t channel, dcopy_query_channel_t *query)
+{
+ *query = channel->ch_info;
+}
+
+
+/*
+ * dcopy_cmd_alloc()
+ */
+int
+dcopy_cmd_alloc(dcopy_handle_t handle, int flags, dcopy_cmd_t *cmd)
+{
+ dcopy_handle_t channel;
+ dcopy_cmd_priv_t priv;
+ int e;
+
+
+ channel = handle;
+
+ atomic_inc_64(&channel->ch_stat.cs_cmd_alloc.value.ui64);
+ e = channel->ch_cb->cb_cmd_alloc(channel->ch_channel_private, flags,
+ cmd);
+ if (e == DCOPY_SUCCESS) {
+ priv = (*cmd)->dp_private;
+ priv->pr_channel = channel;
+ /*
+ * we won't initialize the blocking state until we actually
+ * need to block.
+ */
+ priv->pr_block_init = B_FALSE;
+ }
+
+ return (e);
+}
+
+
+/*
+ * dcopy_cmd_free()
+ *    Free a DMA command allocated with dcopy_cmd_alloc(). Destroys the
+ *    lazily-initialized blocking state (cv/mutex) if dcopy_cmd_poll()
+ *    ever blocked on this command, then hands the command back to the
+ *    device driver via cb_cmd_free().
+ */
+void
+dcopy_cmd_free(dcopy_cmd_t *cmd)
+{
+	dcopy_handle_t channel;
+	dcopy_cmd_priv_t priv;
+
+
+	ASSERT(*cmd != NULL);
+
+	priv = (*cmd)->dp_private;
+	channel = priv->pr_channel;
+
+	/* if we initialized the blocking state, clean it up too */
+	if (priv->pr_block_init) {
+		cv_destroy(&priv->pr_cv);
+		mutex_destroy(&priv->pr_mutex);
+	}
+
+	channel->ch_cb->cb_cmd_free(channel->ch_channel_private, cmd);
+}
+
+
+/*
+ * dcopy_cmd_post()
+ *    Post (start) a DMA command on its channel via the device driver's
+ *    cb_cmd_post(). Bumps the post kstat and, for copy commands, the
+ *    bytes-transferred kstat. Returns DCOPY_SUCCESS or the driver's
+ *    error code.
+ */
+int
+dcopy_cmd_post(dcopy_cmd_t cmd)
+{
+	dcopy_handle_t channel;
+	int e;
+
+
+	channel = cmd->dp_private->pr_channel;
+
+	atomic_inc_64(&channel->ch_stat.cs_cmd_post.value.ui64);
+	/* only copy commands move payload bytes worth accounting */
+	if (cmd->dp_cmd == DCOPY_CMD_COPY) {
+		atomic_add_64(&channel->ch_stat.cs_bytes_xfer.value.ui64,
+		    cmd->dp.copy.cc_size);
+	}
+	e = channel->ch_cb->cb_cmd_post(channel->ch_channel_private, cmd);
+	if (e != DCOPY_SUCCESS) {
+		return (e);
+	}
+
+	return (DCOPY_SUCCESS);
+}
+
+
+/*
+ * dcopy_cmd_poll()
+ *    Poll a posted command for completion. Without DCOPY_POLL_BLOCK this
+ *    is a single, non-blocking poll of the device driver. With
+ *    DCOPY_POLL_BLOCK (only legal if the command was posted with
+ *    DCOPY_CMD_INTR), a still-pending command causes the caller to be
+ *    queued on the channel's poll list and cv_wait() until
+ *    dcopy_device_channel_notify() signals completion; we then re-poll
+ *    to pick up the final status. Returns the driver's poll status, or
+ *    DCOPY_FAILURE for an illegal blocking request.
+ */
+int
+dcopy_cmd_poll(dcopy_cmd_t cmd, int flags)
+{
+	dcopy_handle_t channel;
+	dcopy_cmd_priv_t priv;
+	int e;
+
+
+	priv = cmd->dp_private;
+	channel = priv->pr_channel;
+
+	/*
+	 * if the caller is trying to block, they needed to post the
+	 * command with DCOPY_CMD_INTR set.
+	 */
+	if ((flags & DCOPY_POLL_BLOCK) && !(cmd->dp_flags & DCOPY_CMD_INTR)) {
+		return (DCOPY_FAILURE);
+	}
+
+	atomic_inc_64(&channel->ch_stat.cs_cmd_poll.value.ui64);
+
+repoll:
+	e = channel->ch_cb->cb_cmd_poll(channel->ch_channel_private, cmd);
+	if (e == DCOPY_PENDING) {
+		/*
+		 * if the command is still active, and the blocking flag
+		 * is set.
+		 */
+		if (flags & DCOPY_POLL_BLOCK) {
+
+			/*
+			 * if we haven't initialized the state, do it now. A
+			 * command can be re-used, so it's possible it's
+			 * already been initialized.
+			 */
+			if (!priv->pr_block_init) {
+				priv->pr_block_init = B_TRUE;
+				mutex_init(&priv->pr_mutex, NULL, MUTEX_DRIVER,
+				    NULL);
+				cv_init(&priv->pr_cv, NULL, CV_DRIVER, NULL);
+				priv->pr_cmd = cmd;
+			}
+
+			/* push it on the list for blocking commands */
+			priv->pr_wait = B_TRUE;
+			dcopy_list_push(&channel->ch_poll_list, priv);
+
+			mutex_enter(&priv->pr_mutex);
+			/*
+			 * it's possible we already cleared pr_wait before we
+			 * grabbed the mutex.
+			 */
+			if (priv->pr_wait) {
+				cv_wait(&priv->pr_cv, &priv->pr_mutex);
+			}
+			mutex_exit(&priv->pr_mutex);
+
+			/*
+			 * the command has completed, go back and poll so we
+			 * get the status.
+			 */
+			goto repoll;
+		}
+	}
+
+	return (e);
+}
+
+/* *** END OF EXTERNAL INTERFACE *** */
+
+/*
+ * dcopy_list_init()
+ *    Initialize a dcopy list: its mutex, the embedded list_t (nodes of
+ *    node_size bytes with the list link at link_offset), and a zero
+ *    element count. Always returns DCOPY_SUCCESS.
+ */
+static int
+dcopy_list_init(dcopy_list_t *list, size_t node_size, offset_t link_offset)
+{
+	mutex_init(&list->dl_mutex, NULL, MUTEX_DRIVER, NULL);
+	list_create(&list->dl_list, node_size, link_offset);
+	list->dl_cnt = 0;
+
+	return (DCOPY_SUCCESS);
+}
+
+
+/*
+ * dcopy_list_fini()
+ *    Tear down a list initialized by dcopy_list_init(); the list must
+ *    already be empty (list_destroy() requirement).
+ */
+static void
+dcopy_list_fini(dcopy_list_t *list)
+{
+	list_destroy(&list->dl_list);
+	mutex_destroy(&list->dl_mutex);
+}
+
+
+/*
+ * dcopy_list_push()
+ *    Append list_node to the tail of the list and bump the element
+ *    count, under the list mutex.
+ */
+static void
+dcopy_list_push(dcopy_list_t *list, void *list_node)
+{
+	mutex_enter(&list->dl_mutex);
+	list_insert_tail(&list->dl_list, list_node);
+	list->dl_cnt++;
+	mutex_exit(&list->dl_mutex);
+}
+
+
+/*
+ * dcopy_list_pop()
+ *    Remove and return the head of the list, or NULL if the list is
+ *    empty. Element count is kept in sync under the list mutex.
+ */
+static void *
+dcopy_list_pop(dcopy_list_t *list)
+{
+	list_node_t *list_node;
+
+	mutex_enter(&list->dl_mutex);
+	list_node = list_head(&list->dl_list);
+	if (list_node == NULL) {
+		/* empty list; NULL list_node is the return value */
+		mutex_exit(&list->dl_mutex);
+		return (list_node);
+	}
+	list->dl_cnt--;
+	list_remove(&list->dl_list, list_node);
+	mutex_exit(&list->dl_mutex);
+
+	return (list_node);
+}
+
+
+/* *** DEVICE INTERFACE *** */
+/*
+ * dcopy_device_register()
+ *    Called by a DMA device driver to register its device with the dcopy
+ *    framework. Allocates framework state for each of the di_num_dma
+ *    channels (driver channel private data, kstats, poll list), links the
+ *    channels onto the per-device list, and finally publishes them on the
+ *    global channel list that dcopy_alloc() draws from. On any failure,
+ *    all partially-constructed state is torn down and DCOPY_FAILURE is
+ *    returned; on success *handle is set and DCOPY_SUCCESS returned.
+ */
+int
+dcopy_device_register(void *device_private, dcopy_device_info_t *info,
+    dcopy_device_handle_t *handle)
+{
+	struct dcopy_channel_s *channel;
+	struct dcopy_device_s *device;
+	int e;
+	int i;
+
+
+	/* initialize the per device state */
+	device = kmem_zalloc(sizeof (*device), KM_SLEEP);
+	device->dc_device_private = device_private;
+	device->dc_info = *info;
+	device->dc_removing_cnt = 0;
+	device->dc_cb = info->di_cb;
+
+	/*
+	 * we have a per device channel list so we can remove a device in the
+	 * future.
+	 */
+	e = dcopy_list_init(&device->dc_devchan_list,
+	    sizeof (struct dcopy_channel_s),
+	    offsetof(struct dcopy_channel_s, ch_devchan_list_node));
+	if (e != DCOPY_SUCCESS) {
+		goto registerfail_devchan;
+	}
+
+	/*
+	 * allocate state for each channel, allocate the channel, and then add
+	 * the device's dma channels to the device's channel list.
+	 */
+	for (i = 0; i < info->di_num_dma; i++) {
+		channel = kmem_zalloc(sizeof (*channel), KM_SLEEP);
+		channel->ch_device = device;
+		channel->ch_removing = B_FALSE;
+		channel->ch_ref_cnt = 0;
+		channel->ch_cb = info->di_cb;
+
+		e = info->di_cb->cb_channel_alloc(device_private, channel,
+		    DCOPY_SLEEP, dcopy_channel_size, &channel->ch_info,
+		    &channel->ch_channel_private);
+		if (e != DCOPY_SUCCESS) {
+			kmem_free(channel, sizeof (*channel));
+			goto registerfail_alloc;
+		}
+
+		e = dcopy_stats_init(channel);
+		if (e != DCOPY_SUCCESS) {
+			info->di_cb->cb_channel_free(
+			    &channel->ch_channel_private);
+			kmem_free(channel, sizeof (*channel));
+			goto registerfail_alloc;
+		}
+
+		e = dcopy_list_init(&channel->ch_poll_list,
+		    sizeof (struct dcopy_cmd_priv_s),
+		    offsetof(struct dcopy_cmd_priv_s, pr_poll_list_node));
+		if (e != DCOPY_SUCCESS) {
+			dcopy_stats_fini(channel);
+			info->di_cb->cb_channel_free(
+			    &channel->ch_channel_private);
+			kmem_free(channel, sizeof (*channel));
+			goto registerfail_alloc;
+		}
+
+		dcopy_list_push(&device->dc_devchan_list, channel);
+	}
+
+	/* add the device to device list */
+	dcopy_list_push(&dcopy_statep->d_device_list, device);
+
+	/*
+	 * add the device's dma channels to the global channel list (where
+	 * dcopy_alloc's come from)
+	 */
+	mutex_enter(&dcopy_statep->d_globalchan_list.dl_mutex);
+	mutex_enter(&dcopy_statep->d_device_list.dl_mutex);
+	channel = list_head(&device->dc_devchan_list.dl_list);
+	while (channel != NULL) {
+		list_insert_tail(&dcopy_statep->d_globalchan_list.dl_list,
+		    channel);
+		dcopy_statep->d_globalchan_list.dl_cnt++;
+		channel = list_next(&device->dc_devchan_list.dl_list, channel);
+	}
+	mutex_exit(&dcopy_statep->d_device_list.dl_mutex);
+	mutex_exit(&dcopy_statep->d_globalchan_list.dl_mutex);
+
+	*handle = device;
+	return (DCOPY_SUCCESS);
+
+registerfail_alloc:
+	/* free every channel already constructed on the device list */
+	channel = list_head(&device->dc_devchan_list.dl_list);
+	while (channel != NULL) {
+		/* remove from the list */
+		channel = dcopy_list_pop(&device->dc_devchan_list);
+		ASSERT(channel != NULL);
+
+		dcopy_list_fini(&channel->ch_poll_list);
+		dcopy_stats_fini(channel);
+		info->di_cb->cb_channel_free(&channel->ch_channel_private);
+		kmem_free(channel, sizeof (*channel));
+	}
+
+	dcopy_list_fini(&device->dc_devchan_list);
+registerfail_devchan:
+	kmem_free(device, sizeof (*device));
+
+	return (DCOPY_FAILURE);
+}
+
+
+/*
+ * dcopy_device_unregister()
+ *    Remove a DMA device's channels from the global allocation list. Any
+ *    channel that still has outstanding dcopy_alloc() references is
+ *    marked for deferred removal and DCOPY_PENDING is returned; the last
+ *    dcopy_free() on the device will then finish cleanup and notify the
+ *    driver via its unregister-complete callback. If nothing is busy,
+ *    clean up immediately, clear *handle, and return DCOPY_SUCCESS.
+ */
+/*ARGSUSED*/
+int
+dcopy_device_unregister(dcopy_device_handle_t *handle)
+{
+	struct dcopy_channel_s *channel;
+	dcopy_device_handle_t device;
+	boolean_t device_busy;
+
+
+	device = *handle;
+	device_busy = B_FALSE;
+
+	/*
+	 * remove the devices dma channels from the global channel list (where
+	 * dcopy_alloc's come from)
+	 */
+	mutex_enter(&dcopy_statep->d_globalchan_list.dl_mutex);
+	mutex_enter(&device->dc_devchan_list.dl_mutex);
+	channel = list_head(&device->dc_devchan_list.dl_list);
+	while (channel != NULL) {
+		/*
+		 * if the channel has outstanding allocs, mark it as having
+		 * to be removed and increment the number of channels which
+		 * need to be removed in the device state too.
+		 */
+		if (channel->ch_ref_cnt != 0) {
+			channel->ch_removing = B_TRUE;
+			device_busy = B_TRUE;
+			device->dc_removing_cnt++;
+		}
+		dcopy_statep->d_globalchan_list.dl_cnt--;
+		list_remove(&dcopy_statep->d_globalchan_list.dl_list, channel);
+		channel = list_next(&device->dc_devchan_list.dl_list, channel);
+	}
+	mutex_exit(&device->dc_devchan_list.dl_mutex);
+	mutex_exit(&dcopy_statep->d_globalchan_list.dl_mutex);
+
+	/*
+	 * if there are channels which still need to be removed, we will clean
+	 * up the device state after they are freed up.
+	 */
+	if (device_busy) {
+		return (DCOPY_PENDING);
+	}
+
+	dcopy_device_cleanup(device, B_FALSE);
+
+	*handle = NULL;
+	return (DCOPY_SUCCESS);
+}
+
+
+/*
+ * dcopy_device_cleanup()
+ *    Free all of a device's channel state (poll lists, kstats, driver
+ *    channel private data), unlink the device from the global device
+ *    list, and free the device itself. If do_callback is set (the
+ *    deferred-unregister path via dcopy_free()), notify the DMA device
+ *    driver through cb_unregister_complete() that it may now detach.
+ */
+static void
+dcopy_device_cleanup(dcopy_device_handle_t device, boolean_t do_callback)
+{
+	struct dcopy_channel_s *channel;
+
+	/*
+	 * remove all the channels in the device list, free them, and clean up
+	 * the state.
+	 */
+	mutex_enter(&dcopy_statep->d_device_list.dl_mutex);
+	channel = list_head(&device->dc_devchan_list.dl_list);
+	while (channel != NULL) {
+		device->dc_devchan_list.dl_cnt--;
+		list_remove(&device->dc_devchan_list.dl_list, channel);
+		dcopy_list_fini(&channel->ch_poll_list);
+		dcopy_stats_fini(channel);
+		channel->ch_cb->cb_channel_free(&channel->ch_channel_private);
+		kmem_free(channel, sizeof (*channel));
+		channel = list_head(&device->dc_devchan_list.dl_list);
+	}
+
+	/* remove it from the list of devices */
+	list_remove(&dcopy_statep->d_device_list.dl_list, device);
+
+	mutex_exit(&dcopy_statep->d_device_list.dl_mutex);
+
+	/*
+	 * notify the DMA device driver that the device is free to be
+	 * detached.
+	 */
+	if (do_callback) {
+		device->dc_cb->cb_unregister_complete(
+		    device->dc_device_private, DCOPY_SUCCESS);
+	}
+
+	dcopy_list_fini(&device->dc_devchan_list);
+	kmem_free(device, sizeof (*device));
+}
+
+
+/*
+ * dcopy_device_channel_notify()
+ *    Called by the DMA device driver when a channel signals completion.
+ *    Walks the channel's poll list of blocked commands in order, polling
+ *    each; completed commands are removed from the list and their
+ *    waiters cv_signal()ed. We stop at the first still-pending command
+ *    since commands on a channel complete in order.
+ */
+/*ARGSUSED*/
+void
+dcopy_device_channel_notify(dcopy_handle_t handle, int status)
+{
+	struct dcopy_channel_s *channel;
+	dcopy_list_t *poll_list;
+	dcopy_cmd_priv_t priv;
+	int e;
+
+
+	ASSERT(status == DCOPY_COMPLETION);
+	channel = handle;
+
+	poll_list = &channel->ch_poll_list;
+
+	/*
+	 * when we get a completion notification from the device, go through
+	 * all of the commands blocking on this channel and see if they have
+	 * completed. Remove the command and wake up the block thread if they
+	 * have. Once we hit a command which is still pending, we are done
+	 * polling since commands in a channel complete in order.
+	 */
+	mutex_enter(&poll_list->dl_mutex);
+	if (poll_list->dl_cnt != 0) {
+		priv = list_head(&poll_list->dl_list);
+		while (priv != NULL) {
+			atomic_inc_64(&channel->
+			    ch_stat.cs_notify_poll.value.ui64);
+			e = channel->ch_cb->cb_cmd_poll(
+			    channel->ch_channel_private,
+			    priv->pr_cmd);
+			if (e == DCOPY_PENDING) {
+				atomic_inc_64(&channel->
+				    ch_stat.cs_notify_pending.value.ui64);
+				break;
+			}
+
+			poll_list->dl_cnt--;
+			list_remove(&poll_list->dl_list, priv);
+
+			/* wake the thread blocked in dcopy_cmd_poll() */
+			mutex_enter(&priv->pr_mutex);
+			priv->pr_wait = B_FALSE;
+			cv_signal(&priv->pr_cv);
+			mutex_exit(&priv->pr_mutex);
+
+			priv = list_head(&poll_list->dl_list);
+		}
+	}
+
+	mutex_exit(&poll_list->dl_mutex);
+}
+
+
+/*
+ * dcopy_stats_init()
+ *    Create and install the per-channel "misc" kstat, named after the
+ *    DMA device driver, instance, and channel number, and backed
+ *    directly (KSTAT_FLAG_VIRTUAL) by the channel's dcopy_stats_t.
+ *    Returns DCOPY_FAILURE if kstat_create() fails, else DCOPY_SUCCESS.
+ */
+static int
+dcopy_stats_init(dcopy_handle_t channel)
+{
+/* big enough for "channel" + a decimal channel number */
+#define CHANSTRSIZE 20
+	char chanstr[CHANSTRSIZE];
+	dcopy_stats_t *stats;
+	int instance;
+	char *name;
+
+
+	stats = &channel->ch_stat;
+	name = (char *)ddi_driver_name(channel->ch_device->dc_info.di_dip);
+	instance = ddi_get_instance(channel->ch_device->dc_info.di_dip);
+
+	(void) snprintf(chanstr, CHANSTRSIZE, "channel%d",
+	    (uint32_t)channel->ch_info.qc_chan_num);
+
+	channel->ch_kstat = kstat_create(name, instance, chanstr, "misc",
+	    KSTAT_TYPE_NAMED, sizeof (dcopy_stats_t) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL);
+	if (channel->ch_kstat == NULL) {
+		return (DCOPY_FAILURE);
+	}
+	channel->ch_kstat->ks_data = stats;
+
+	kstat_named_init(&stats->cs_bytes_xfer, "bytes_xfer",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&stats->cs_cmd_alloc, "cmd_alloc",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&stats->cs_cmd_post, "cmd_post",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&stats->cs_cmd_poll, "cmd_poll",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&stats->cs_notify_poll, "notify_poll",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&stats->cs_notify_pending, "notify_pending",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&stats->cs_id, "id",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&stats->cs_capabilities, "capabilities",
+	    KSTAT_DATA_UINT64);
+
+	kstat_install(channel->ch_kstat);
+
+	/* id and capabilities are static; seed them once at create time */
+	channel->ch_stat.cs_id.value.ui64 = channel->ch_info.qc_id;
+	channel->ch_stat.cs_capabilities.value.ui64 =
+	    channel->ch_info.qc_capabilities;
+
+	return (DCOPY_SUCCESS);
+}
+
+
+/*
+ * dcopy_stats_fini()
+ *    Remove the per-channel kstat installed by dcopy_stats_init().
+ */
+static void
+dcopy_stats_fini(dcopy_handle_t channel)
+{
+	kstat_delete(channel->ch_kstat);
+}
+/* *** END OF DEVICE INTERFACE *** */
diff --git a/usr/src/uts/common/io/stream.c b/usr/src/uts/common/io/stream.c
index 28a9a4928f..90fbf3cbf1 100644
--- a/usr/src/uts/common/io/stream.c
+++ b/usr/src/uts/common/io/stream.c
@@ -23,7 +23,7 @@
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -318,8 +318,8 @@ streams_msg_init(void)
int offset;
mblk_cache = kmem_cache_create("streams_mblk",
- sizeof (mblk_t), 32, NULL, NULL, NULL, NULL, NULL,
- mblk_kmem_flags);
+ sizeof (mblk_t), 32, NULL, NULL, NULL, NULL, NULL,
+ mblk_kmem_flags);
for (sizep = dblk_sizes; (size = *sizep) != 0; sizep++) {
@@ -330,7 +330,7 @@ streams_msg_init(void)
*/
tot_size = size + sizeof (dblk_t);
ASSERT((offset + sizeof (dblk_t) + sizeof (kmem_slab_t))
- < PAGESIZE);
+ < PAGESIZE);
ASSERT((tot_size & (DBLK_CACHE_ALIGN - 1)) == 0);
} else {
@@ -346,9 +346,9 @@ streams_msg_init(void)
(void) sprintf(name, "streams_dblk_%ld", size);
cp = kmem_cache_create(name, tot_size,
- DBLK_CACHE_ALIGN, dblk_constructor,
- dblk_destructor, NULL,
- (void *)(size), NULL, dblk_kmem_flags);
+ DBLK_CACHE_ALIGN, dblk_constructor,
+ dblk_destructor, NULL,
+ (void *)(size), NULL, dblk_kmem_flags);
while (lastsize <= size) {
dblk_cache[(lastsize - 1) >> DBLK_SIZE_SHIFT] = cp;
@@ -357,13 +357,13 @@ streams_msg_init(void)
}
dblk_esb_cache = kmem_cache_create("streams_dblk_esb",
- sizeof (dblk_t), DBLK_CACHE_ALIGN,
- dblk_esb_constructor, dblk_destructor, NULL,
- (void *) sizeof (dblk_t), NULL, dblk_kmem_flags);
+ sizeof (dblk_t), DBLK_CACHE_ALIGN,
+ dblk_esb_constructor, dblk_destructor, NULL,
+ (void *) sizeof (dblk_t), NULL, dblk_kmem_flags);
fthdr_cache = kmem_cache_create("streams_fthdr",
- sizeof (fthdr_t), 32, NULL, NULL, NULL, NULL, NULL, 0);
+ sizeof (fthdr_t), 32, NULL, NULL, NULL, NULL, NULL, 0);
ftblk_cache = kmem_cache_create("streams_ftblk",
- sizeof (ftblk_t), 32, NULL, NULL, NULL, NULL, NULL, 0);
+ sizeof (ftblk_t), 32, NULL, NULL, NULL, NULL, NULL, 0);
/* Initialize Multidata caches */
mmd_init();
@@ -545,8 +545,8 @@ dblk_lastfree(mblk_t *mp, dblk_t *dbp)
dbp->db_struioflag = 0;
dbp->db_struioun.cksum.flags = 0;
- /* and the COOKED flag */
- dbp->db_flags &= ~DBLK_COOKED;
+ /* and the COOKED and/or UIOA flag(s) */
+ dbp->db_flags &= ~(DBLK_COOKED | DBLK_UIOA);
kmem_cache_free(dbp->db_cache, dbp);
}
@@ -739,7 +739,7 @@ desballoc(unsigned char *base, size_t size, uint_t pri, frtn_t *frp)
*/
if (!str_ftnever) {
mp = gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
- frp, dblk_lastfree_desb, KM_NOSLEEP);
+ frp, dblk_lastfree_desb, KM_NOSLEEP);
if (mp != NULL)
STR_FTALLOC(&DB_FTHDR(mp), FTEV_DESBALLOC, size);
@@ -857,7 +857,7 @@ bcache_create(char *name, size_t size, uint_t align)
(void) sprintf(buffer, "%s_dblk_cache", name);
bcp->dblk_cache = kmem_cache_create(buffer, sizeof (dblk_t),
DBLK_CACHE_ALIGN, bcache_dblk_constructor, bcache_dblk_destructor,
- NULL, (void *)bcp, NULL, 0);
+ NULL, (void *)bcp, NULL, 0);
return (bcp);
}
@@ -1584,7 +1584,7 @@ adjmsg(mblk_t *mp, ssize_t len)
*/
if ((save_bp != mp) &&
- (save_bp->b_wptr == save_bp->b_rptr)) {
+ (save_bp->b_wptr == save_bp->b_rptr)) {
bcont = save_bp->b_cont;
freeb(save_bp);
prev_bp->b_cont = bcont;
@@ -2129,8 +2129,8 @@ flushband(queue_t *q, unsigned char pri, int flag)
nmp = mp->b_next;
mp->b_next = mp->b_prev = NULL;
if ((mp->b_band == 0) &&
- ((flag == FLUSHALL) ||
- datamsg(mp->b_datap->db_type)))
+ ((flag == FLUSHALL) ||
+ datamsg(mp->b_datap->db_type)))
freemsg(mp);
else
(void) putq(q, mp);
@@ -2242,7 +2242,7 @@ bcanput(queue_t *q, unsigned char pri)
q->q_flag |= QWANTW;
mutex_exit(QLOCK(q));
TRACE_3(TR_FAC_STREAMS_FR, TR_BCANPUT_OUT,
- "bcanput:%p %X %d", q, pri, 0);
+ "bcanput:%p %X %d", q, pri, 0);
return (0);
}
} else { /* pri != 0 */
@@ -2252,7 +2252,7 @@ bcanput(queue_t *q, unsigned char pri)
*/
mutex_exit(QLOCK(q));
TRACE_3(TR_FAC_STREAMS_FR, TR_BCANPUT_OUT,
- "bcanput:%p %X %d", q, pri, 1);
+ "bcanput:%p %X %d", q, pri, 1);
return (1);
}
qbp = q->q_bandp;
@@ -2262,13 +2262,13 @@ bcanput(queue_t *q, unsigned char pri)
qbp->qb_flag |= QB_WANTW;
mutex_exit(QLOCK(q));
TRACE_3(TR_FAC_STREAMS_FR, TR_BCANPUT_OUT,
- "bcanput:%p %X %d", q, pri, 0);
+ "bcanput:%p %X %d", q, pri, 0);
return (0);
}
}
mutex_exit(QLOCK(q));
TRACE_3(TR_FAC_STREAMS_FR, TR_BCANPUT_OUT,
- "bcanput:%p %X %d", q, pri, 1);
+ "bcanput:%p %X %d", q, pri, 1);
return (1);
}
@@ -2847,7 +2847,7 @@ putnextctl1(queue_t *q, int type, int param)
mblk_t *bp;
if ((datamsg(type) && (type != M_DELAY)) ||
- ((bp = allocb_tryhard(1)) == NULL))
+ ((bp = allocb_tryhard(1)) == NULL))
return (0);
bp->b_datap->db_type = (unsigned char)type;
@@ -2864,7 +2864,7 @@ putnextctl(queue_t *q, int type)
mblk_t *bp;
if ((datamsg(type) && (type != M_DELAY)) ||
- ((bp = allocb_tryhard(0)) == NULL))
+ ((bp = allocb_tryhard(0)) == NULL))
return (0);
bp->b_datap->db_type = (unsigned char)type;
diff --git a/usr/src/uts/common/os/move.c b/usr/src/uts/common/os/move.c
index d5c63b167e..f4056aa02c 100644
--- a/usr/src/uts/common/os/move.c
+++ b/usr/src/uts/common/os/move.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -45,6 +44,16 @@
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/errno.h>
+#include <sys/vmsystm.h>
+#include <sys/cmn_err.h>
+#include <vm/as.h>
+#include <vm/page.h>
+
+#include <sys/dcopy.h>
+
+int64_t uioa_maxpoll = -1; /* <0 = noblock, 0 = block, >0 = block after */
+#define UIO_DCOPY_CHANNEL 0
+#define UIO_DCOPY_CMD 1
/*
* Move "n" bytes at byte address "p"; "rw" indicates the direction
@@ -277,3 +286,370 @@ uiodup(uio_t *suio, uio_t *duio, iovec_t *diov, int diov_cnt)
duio->uio_iov = diov;
return (0);
}
+
+/*
+ * Shadow state for checking if a platform has hardware asynchronous
+ * copy capability and minimum copy size, e.g. Intel's I/OAT dma engine,
+ * /dev/dcopy. The enabled flag is cleared by uioainit() when no DMA
+ * resources exist; the second field is the minimum transfer size
+ * (bytes) considered worth offloading.
+ */
+uioasync_t uioasync = {B_TRUE, 1024};
+
+/*
+ * Schedule an asynchronous move of "n" bytes at byte address "p";
+ * "rw" indicates the direction of the move. I/O parameters and
+ * async state are provided in "uioa", which is updated to reflect
+ * the data which is to be moved.
+ *
+ * Returns 0 on success or a non-zero errno on failure.
+ *
+ * Note, while the uioasync APIs are general purpose in design
+ * the current implementation is Intel I/OAT specific.
+ */
+int
+uioamove(void *p, size_t n, enum uio_rw rw, uioa_t *uioa)
+{
+	int soff, doff;
+	uint64_t pa;
+	int cnt;
+	iovec_t *iov;
+	dcopy_handle_t channel;
+	dcopy_cmd_t cmd;
+	int ret = 0;
+	int dcopy_flags;
+
+	if (!(uioa->uioa_state & UIOA_ENABLED)) {
+		/* The uioa_t isn't enabled */
+		return (ENXIO);
+	}
+
+	if (uioa->uio_segflg != UIO_USERSPACE || rw != UIO_READ) {
+		/* Only support to user-land from kernel */
+		return (ENOTSUP);
+	}
+
+
+	channel = uioa->uioa_hwst[UIO_DCOPY_CHANNEL];
+	cmd = uioa->uioa_hwst[UIO_DCOPY_CMD];
+	dcopy_flags = DCOPY_NOSLEEP;
+
+	/*
+	 * While there are source bytes and destination (uio) bytes
+	 * remaining.
+	 */
+	while (n > 0 && uioa->uio_resid > 0) {
+		iov = uioa->uio_iov;
+		if (iov->iov_len == 0l) {
+			/* Exhausted iovec; advance to the next one and its
+			 * locked-page bookkeeping. */
+			uioa->uio_iov++;
+			uioa->uio_iovcnt--;
+			uioa->uioa_lcur++;
+			uioa->uioa_lppp = uioa->uioa_lcur->uioa_ppp;
+			continue;
+		}
+		/*
+		 * While there are source bytes, schedule an async
+		 * dma into the destination, page by page.
+		 */
+		while (n > 0) {
+			/* Addr offset in page src/dst */
+			soff = (uintptr_t)p & PAGEOFFSET;
+			doff = (uintptr_t)iov->iov_base & PAGEOFFSET;
+			/* Min copy count src and dst and page sized */
+			cnt = MIN(n, iov->iov_len);
+			cnt = MIN(cnt, PAGESIZE - soff);
+			cnt = MIN(cnt, PAGESIZE - doff);
+			/* XXX if next page(s) contiguous could use multipage */
+
+			/*
+			 * if we have an old command, we want to link all
+			 * other commands to the next command we alloced so
+			 * we only need to track the last command but can
+			 * still free them all.
+			 */
+			if (cmd != NULL) {
+				dcopy_flags |= DCOPY_ALLOC_LINK;
+			}
+			ret = dcopy_cmd_alloc(channel, dcopy_flags, &cmd);
+			if (ret != DCOPY_SUCCESS) {
+				/* Error of some sort */
+				return (EIO);
+			}
+			uioa->uioa_hwst[UIO_DCOPY_CMD] = cmd;
+
+			ASSERT(cmd->dp_version == DCOPY_CMD_V0);
+			if (uioa_maxpoll >= 0) {
+				/* Blocking (>0 may be) used in uioafini() */
+				cmd->dp_flags = DCOPY_CMD_INTR;
+			} else {
+				/* Non blocking uioafini() so no intr */
+				cmd->dp_flags = DCOPY_CMD_NOFLAGS;
+			}
+			cmd->dp_cmd = DCOPY_CMD_COPY;
+			/* Source is kernel virtual; get its phys addr */
+			pa = ptob((uint64_t)hat_getpfnum(kas.a_hat, p));
+			cmd->dp.copy.cc_source = pa + soff;
+			if (uioa->uioa_lcur->uioa_pfncnt == 0) {
+				/* Have a (page_t **) */
+				pa = ptob((uint64_t)(
+				    *(page_t **)uioa->uioa_lppp)->p_pagenum);
+			} else {
+				/* Have a (pfn_t *) */
+				pa = ptob((uint64_t)(
+				    *(pfn_t *)uioa->uioa_lppp));
+			}
+			cmd->dp.copy.cc_dest = pa + doff;
+			cmd->dp.copy.cc_size = cnt;
+			ret = dcopy_cmd_post(cmd);
+			if (ret != DCOPY_SUCCESS) {
+				/* Error of some sort */
+				return (EIO);
+			}
+			ret = 0;
+
+			/* If UIOA_POLL not set, set it */
+			if (!(uioa->uioa_state & UIOA_POLL))
+				uioa->uioa_state |= UIOA_POLL;
+
+			/* Update iov, uio, and local pointers/counters */
+			iov->iov_base += cnt;
+			iov->iov_len -= cnt;
+			uioa->uio_resid -= cnt;
+			uioa->uio_loffset += cnt;
+			p = (caddr_t)p + cnt;
+			n -= cnt;
+
+			/* End of iovec? */
+			if (iov->iov_len == 0) {
+				/* Yup, next iovec */
+				break;
+			}
+
+			/* Next dst addr page? */
+			if (doff + cnt == PAGESIZE) {
+				/* Yup, next page_t */
+				uioa->uioa_lppp++;
+			}
+		}
+	}
+
+	return (ret);
+}
+
+/*
+ * Initialize a uioa_t for a given uio_t for the current user context,
+ * copy the common uio_t to the uioa_t, walk the shared iovec_t and
+ * lock down the user-land page(s) containing iovec_t data, then mapin
+ * user-land pages using segkpm.
+ *
+ * Returns 0 on success, else an errno. On failure the uioa_t is left
+ * in the UIOA_ALLOC state with all page locks undone and the DMA
+ * channel released.
+ */
+int
+uioainit(uio_t *uiop, uioa_t *uioap)
+{
+	caddr_t addr;
+	page_t **pages;
+	int off;
+	int len;
+	proc_t *procp = ttoproc(curthread);
+	struct as *as = procp->p_as;
+	iovec_t *iov = uiop->uio_iov;
+	int32_t iovcnt = uiop->uio_iovcnt;
+	uioa_page_t *locked = uioap->uioa_locked;
+	dcopy_handle_t channel;
+	int error;
+
+	if (! (uioap->uioa_state & UIOA_ALLOC)) {
+		/* Can only init() a freshly allocated uioa_t */
+		return (EINVAL);
+	}
+
+	/*
+	 * Validate the iovec count before allocating a DMA channel so an
+	 * E2BIG return doesn't leak the channel (uioafini() will not be
+	 * called for a failed init).
+	 */
+	if (iovcnt > UIOA_IOV_MAX) {
+		/* Too big? */
+		return (E2BIG);
+	}
+
+	error = dcopy_alloc(DCOPY_NOSLEEP, &channel);
+	if (error == DCOPY_NORESOURCES) {
+		/* Turn off uioa */
+		uioasync.enabled = B_FALSE;
+		return (ENODEV);
+	}
+	if (error != DCOPY_SUCCESS) {
+		/* Alloc failed */
+		return (EIO);
+	}
+
+	uioap->uioa_hwst[UIO_DCOPY_CHANNEL] = channel;
+	uioap->uioa_hwst[UIO_DCOPY_CMD] = NULL;
+
+	/* Indicate uioa_t (will be) initialized */
+	uioap->uioa_state = UIOA_INIT;
+
+	/* uio_t/uioa_t uio_t common struct copy */
+	*((uio_t *)uioap) = *uiop;
+
+	/* initialize *uiop->uio_iov */
+	uioap->uio_iov = iov;
+	uioap->uio_iovcnt = iovcnt;
+
+	/* Mark the uioap as such */
+	uioap->uio_extflg |= UIO_ASYNC;
+
+	/*
+	 * For each iovec_t, lock-down the page(s) backing the iovec_t
+	 * and save the page_t list for phys addr use in uioamove().
+	 */
+	iov = uiop->uio_iov;
+	iovcnt = uiop->uio_iovcnt;
+	while (iovcnt > 0) {
+		addr = iov->iov_base;
+		off = (uintptr_t)addr & PAGEOFFSET;
+		addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
+		len = iov->iov_len + off;
+
+		/* Lock down page(s) for the iov span */
+		if ((error = as_pagelock(as, &pages,
+		    iov->iov_base, iov->iov_len, S_WRITE)) != 0) {
+			/* Error */
+			goto cleanup;
+		}
+
+		if (pages == NULL) {
+			/*
+			 * Need page_t list, really only need
+			 * a pfn list so build one.
+			 */
+			pfn_t *pfnp;
+			int pcnt = len >> PAGESHIFT;
+
+			if (off)
+				pcnt++;
+			/* element size is sizeof (pfn_t), not the pointer */
+			if ((pfnp = kmem_alloc(pcnt * sizeof (pfn_t),
+			    KM_NOSLEEP)) == NULL) {
+				error = ENOMEM;
+				goto cleanup;
+			}
+			locked->uioa_ppp = (void **)pfnp;
+			locked->uioa_pfncnt = pcnt;
+			AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+			while (pcnt-- > 0) {
+				*pfnp++ = hat_getpfnum(as->a_hat, addr);
+				addr += PAGESIZE;
+			}
+			AS_LOCK_EXIT(as, &as->a_lock);
+		} else {
+			/* Have a page_t list, save it */
+			locked->uioa_ppp = (void **)pages;
+			locked->uioa_pfncnt = 0;
+		}
+		/* Save for as_pageunlock() in uioafini() */
+		locked->uioa_base = iov->iov_base;
+		locked->uioa_len = iov->iov_len;
+		locked++;
+
+		/* Next iovec_t */
+		iov++;
+		iovcnt--;
+	}
+	/* Initialize current pointer into uioa_locked[] and its uioa_ppp */
+	uioap->uioa_lcur = uioap->uioa_locked;
+	uioap->uioa_lppp = uioap->uioa_lcur->uioa_ppp;
+	return (0);
+
+cleanup:
+	/* Unlock any previously locked page_t(s), iovec_t by iovec_t */
+	while (locked > uioap->uioa_locked) {
+		locked--;
+		if (locked->uioa_pfncnt == 0) {
+			/* as_pagelock() returned a page_t list */
+			pages = (page_t **)locked->uioa_ppp;
+		} else {
+			/* We built a pfn_t list; free it, no page list */
+			pages = NULL;
+			kmem_free(locked->uioa_ppp,
+			    locked->uioa_pfncnt * sizeof (pfn_t));
+		}
+		as_pageunlock(as, pages, locked->uioa_base,
+		    locked->uioa_len, S_WRITE);
+	}
+
+	/* Release the DMA channel; uioafini() won't run for a failed init */
+	dcopy_free(&channel);
+	uioap->uioa_hwst[UIO_DCOPY_CHANNEL] = NULL;
+
+	/* Last indicate uioa_t still in alloc state */
+	uioap->uioa_state = UIOA_ALLOC;
+
+	return (error);
+}
+
+/*
+ * Finish processing of a uioa_t by cleaning up any pending "uioap"
+ * actions: wait for (or poll out) the last in-flight dcopy command,
+ * free the command chain and DMA channel, unlock all user pages locked
+ * by uioainit(), copy the common uio_t state back to "uiop", and return
+ * the uioa_t to the UIOA_ALLOC state.
+ *
+ * Returns 0 on success, EINVAL if the uioa_t isn't active, or EIO if
+ * the final poll/block failed.
+ */
+int
+uioafini(uio_t *uiop, uioa_t *uioap)
+{
+	int32_t iovcnt = uiop->uio_iovcnt;
+	uioa_page_t *locked = uioap->uioa_locked;
+	struct as *as = ttoproc(curthread)->p_as;
+	dcopy_handle_t channel;
+	dcopy_cmd_t cmd;
+	int ret = 0;
+
+	ASSERT(uioap->uio_extflg & UIO_ASYNC);
+
+	if (!(uioap->uioa_state & (UIOA_ENABLED|UIOA_FINI))) {
+		/* Must be an active uioa_t */
+		return (EINVAL);
+	}
+
+	channel = uioap->uioa_hwst[UIO_DCOPY_CHANNEL];
+	cmd = uioap->uioa_hwst[UIO_DCOPY_CMD];
+
+	/* XXX - why do we get cmd == NULL sometimes? */
+	if (cmd != NULL) {
+		if (uioap->uioa_state & UIOA_POLL) {
+			/* Wait for last dcopy() to finish */
+			int64_t poll = 1;
+			int poll_flag = DCOPY_POLL_NOFLAGS;
+
+			do {
+				if (uioa_maxpoll == 0 ||
+				    (uioa_maxpoll > 0 &&
+				    poll >= uioa_maxpoll)) {
+					/* Always block or after maxpoll */
+					poll_flag = DCOPY_POLL_BLOCK;
+				} else {
+					/* No block, poll */
+					poll++;
+				}
+				ret = dcopy_cmd_poll(cmd, poll_flag);
+			} while (ret == DCOPY_PENDING);
+
+			if (ret == DCOPY_COMPLETED) {
+				/* Poll/block succeeded */
+				ret = 0;
+			} else {
+				/* Poll/block failed */
+				ret = EIO;
+			}
+		}
+		dcopy_cmd_free(&cmd);
+	}
+
+	dcopy_free(&channel);
+
+	/* Unlock all page(s) iovec_t by iovec_t */
+	while (iovcnt-- > 0) {
+		page_t **pages;
+
+		if (locked->uioa_pfncnt == 0) {
+			/* A as_pagelock() returned (page_t **) */
+			pages = (page_t **)locked->uioa_ppp;
+		} else {
+			/*
+			 * Our pfn_t array; the free size must be the full
+			 * allocation (pfncnt elements), not pfncnt bytes,
+			 * or kmem_free() trips over the size mismatch.
+			 */
+			pages = NULL;
+			kmem_free(locked->uioa_ppp,
+			    locked->uioa_pfncnt * sizeof (pfn_t));
+		}
+		as_pageunlock(as, pages, locked->uioa_base, locked->uioa_len,
+		    S_WRITE);
+
+		locked++;
+	}
+	/* uioa_t->uio_t common struct copy */
+	*uiop = *((uio_t *)uioap);
+
+	/*
+	 * Last, reset uioa state to alloc.
+	 *
+	 * Note, we only initialize the state here, all other members
+	 * will be initialized in a subsequent uioainit().
+	 */
+	uioap->uioa_state = UIOA_ALLOC;
+
+	uioap->uioa_hwst[UIO_DCOPY_CMD] = NULL;
+	uioap->uioa_hwst[UIO_DCOPY_CHANNEL] = NULL;
+
+	return (ret);
+}
diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c
index d80fa67f56..53e2d81465 100644
--- a/usr/src/uts/common/os/streamio.c
+++ b/usr/src/uts/common/os/streamio.c
@@ -143,6 +143,7 @@ static uint32_t ioc_id;
static void putback(struct stdata *, queue_t *, mblk_t *, int);
static void strcleanall(struct vnode *);
static int strwsrv(queue_t *);
+static void struioainit(queue_t *, sodirect_t *, uio_t *);
/*
* qinit and module_info structures for stream head read and write queues
@@ -188,6 +189,11 @@ static boolean_t msghasdata(mblk_t *bp);
* mirror this.
* 4. ioctl monitor: sd_lock is gotten to ensure that only one
* thread is doing an ioctl at a time.
+ *
+ * Note, for sodirect case 3. is extended to (*sodirect_t.sod_enqueue)()
+ * call-back from below, further the sodirect support is for code paths
+ * called via kstgetmsg(), all other code paths ASSERT() that sodirect
+ * uioa generated mblk_t's (i.e. DBLK_UIOA) aren't processed.
*/
static int
@@ -395,6 +401,7 @@ ckreturn:
stp->sd_qn_minpsz = 0;
stp->sd_qn_maxpsz = INFPSZ - 1; /* used to check for initialization */
stp->sd_maxblk = INFPSZ;
+ stp->sd_sodirect = NULL;
qp->q_ptr = _WR(qp)->q_ptr = stp;
STREAM(qp) = STREAM(_WR(qp)) = stp;
vp->v_stream = stp;
@@ -966,11 +973,14 @@ strcleanall(struct vnode *vp)
* It is the callers responsibility to call qbackenable after
* it is finished with the message. The caller should not call
* qbackenable until after any putback calls to avoid spurious backenabling.
+ *
+ * Also, handle uioa initialization and process any DBLK_UIOA flaged messages.
*/
mblk_t *
strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,
int *errorp)
{
+ sodirect_t *sodp = stp->sd_sodirect;
mblk_t *bp;
int error;
@@ -1059,7 +1069,67 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,
}
*errorp = 0;
ASSERT(MUTEX_HELD(&stp->sd_lock));
- return (getq_noenab(q));
+ if (sodp != NULL && (sodp->sod_state & SOD_ENABLED) &&
+ (sodp->sod_uioa.uioa_state & UIOA_INIT)) {
+ /*
+ * First kstrgetmsg() call for an uioa_t so if any
+ * queued mblk_t's need to consume them before uioa
+ * from below can occur.
+ */
+ sodp->sod_uioa.uioa_state &= UIOA_CLR;
+ sodp->sod_uioa.uioa_state |= UIOA_ENABLED;
+ if (q->q_first != NULL) {
+ struioainit(q, sodp, uiop);
+ }
+ }
+
+ bp = getq_noenab(q);
+
+ if (bp != NULL && (bp->b_datap->db_flags & DBLK_UIOA)) {
+ /*
+		 * A uioa flagged mblk_t chain, already uio processed,
+		 * add it to the sodirect uioa pending free list.
+		 *
+		 * Note, a b_cont chain headed by a DBLK_UIOA enabled
+		 * mblk_t must have all mblk_t(s) DBLK_UIOA enabled.
+ */
+ mblk_t *bpt = sodp->sod_uioaft;
+
+ ASSERT(sodp != NULL);
+
+ /*
+ * Add first mblk_t of "bp" chain to current sodirect uioa
+ * free list tail mblk_t, if any, else empty list so new head.
+ */
+ if (bpt == NULL)
+ sodp->sod_uioafh = bp;
+ else
+ bpt->b_cont = bp;
+
+ /*
+ * Walk mblk_t "bp" chain to find tail and adjust rptr of
+ * each to reflect that uioamove() has consumed all data.
+ */
+ bpt = bp;
+ for (;;) {
+ bpt->b_rptr = bpt->b_wptr;
+ if (bpt->b_cont == NULL)
+ break;
+ bpt = bpt->b_cont;
+
+ ASSERT(bpt->b_datap->db_flags & DBLK_UIOA);
+ }
+ /* New sodirect uioa free list tail */
+ sodp->sod_uioaft = bpt;
+
+ /* Only 1 strget() with data returned per uioa_t */
+ if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) {
+ sodp->sod_uioa.uioa_state &= UIOA_CLR;
+ sodp->sod_uioa.uioa_state |= UIOA_FINI;
+ }
+ }
+
+ return (bp);
}
/*
@@ -1079,6 +1149,8 @@ struiocopyout(mblk_t *bp, struct uio *uiop, int *errorp)
ASSERT(bp->b_wptr >= bp->b_rptr);
do {
+ ASSERT(!(bp->b_datap->db_flags & DBLK_UIOA));
+
if ((n = MIN(uiop->uio_resid, MBLKL(bp))) != 0) {
ASSERT(n > 0);
@@ -1225,8 +1297,10 @@ strread(struct vnode *vp, struct uio *uiop, cred_t *crp)
}
first = 0;
}
+
ASSERT(MUTEX_HELD(&stp->sd_lock));
ASSERT(bp);
+ ASSERT(!(bp->b_datap->db_flags & DBLK_UIOA));
pri = bp->b_band;
/*
* Extract any mark information. If the message is not
@@ -6460,6 +6534,7 @@ strgetmsg(
bp = strget(stp, q, uiop, first, &error);
ASSERT(MUTEX_HELD(&stp->sd_lock));
if (bp != NULL) {
+ ASSERT(!(bp->b_datap->db_flags & DBLK_UIOA));
if (bp->b_datap->db_type == M_SIG) {
strsignal_nolock(stp, *bp->b_rptr,
(int32_t)bp->b_band);
@@ -7098,7 +7173,7 @@ retry:
"kstrgetmsg calls strwaitq:%p, %p",
vp, uiop);
if (((error = strwaitq(stp, waitflag, (ssize_t)0,
- fmode, timout, &done)) != 0) || done) {
+ fmode, timout, &done))) != 0 || done) {
TRACE_2(TR_FAC_STREAMS_FR, TR_KSTRGETMSG_DONE,
"kstrgetmsg error or done:%p, %p",
vp, uiop);
@@ -7132,6 +7207,7 @@ retry:
* If the caller doesn't want the mark return.
* Used to implement MSG_WAITALL in sockets.
*/
+ ASSERT(!(bp->b_datap->db_flags & DBLK_UIOA));
if (flags & MSG_NOMARK) {
putback(stp, q, bp, pri);
qbackenable(q, pri);
@@ -7170,6 +7246,8 @@ retry:
* there is indeed a shortage of memory. dupmsg() may fail
* if db_ref in any of the messages reaches its limit.
*/
+
+ ASSERT(!(bp->b_datap->db_flags & DBLK_UIOA));
if ((nbp = dupmsg(bp)) == NULL && (nbp = copymsg(bp)) == NULL) {
/*
* Restore the state of the stream head since we
@@ -7228,6 +7306,7 @@ retry:
}
}
+ ASSERT(!(bp->b_datap->db_flags & DBLK_UIOA));
bp = (stp->sd_rputdatafunc)(stp->sd_vnode, bp,
NULL, NULL, NULL, NULL);
@@ -7278,6 +7357,8 @@ retry:
*/
if (uiop == NULL) {
/* Append data to tail of mctlp */
+
+ ASSERT(!(bp->b_datap->db_flags & DBLK_UIOA));
if (mctlp != NULL) {
mblk_t **mpp = mctlp;
@@ -7286,6 +7367,14 @@ retry:
*mpp = bp;
bp = NULL;
}
+ } else if (bp->b_datap->db_flags & DBLK_UIOA) {
+ /*
+ * A uioa mblk_t chain, as uio processing has already
+ * been done we simply skip over processing.
+ */
+ bp = NULL;
+ pr = 0;
+
} else if (uiop->uio_resid >= 0 && bp) {
size_t oldresid = uiop->uio_resid;
@@ -7374,6 +7463,8 @@ retry:
* again since the flush logic in strrput_nondata()
* may have cleared it while we had sd_lock dropped.
*/
+
+ ASSERT(!(savemp->b_datap->db_flags & DBLK_UIOA));
if (type >= QPCTL) {
ASSERT(type == M_PCPROTO);
if (queclass(savemp) < QPCTL)
@@ -8445,3 +8536,82 @@ msghasdata(mblk_t *bp)
}
return (B_FALSE);
}
+
+/*
+ * Called on the first strget() of a sodirect/uioa enabled streamhead,
+ * if any mblk_t(s) are enqueued they must first be uioamove()d before uioa
+ * can be enabled for the underlying transport's use.
+ */
+void
+struioainit(queue_t *q, sodirect_t *sodp, uio_t *uiop)
+{
+ uioa_t *uioap = (uioa_t *)uiop;
+ mblk_t *bp = q->q_first;
+ mblk_t *lbp = NULL;
+ mblk_t *nbp, *wbp;
+ int len;
+ int error;
+
+ ASSERT(MUTEX_HELD(sodp->sod_lock));
+ ASSERT(&sodp->sod_uioa == uioap);
+
+ /*
+ * Walk the b_next/b_prev doubly linked list of b_cont chain(s)
+ * and schedule any M_DATA mblk_t's for uio asynchronous move.
+ */
+ do {
+ /* Next mblk_t chain */
+ nbp = bp->b_next;
+ /* Walk the chain */
+ wbp = bp;
+ do {
+ if (wbp->b_datap->db_type == M_DATA &&
+ (len = wbp->b_wptr - wbp->b_rptr) > 0) {
+ /* Have a M_DATA mblk_t with data */
+ if (len > uioap->uio_resid) {
+ /* Not enough uio space */
+ goto nospace;
+ }
+ error = uioamove(wbp->b_rptr, len,
+ UIO_READ, uioap);
+ if (!error) {
+ /* Scheduled, mark dblk_t as such */
+ wbp->b_datap->db_flags |= DBLK_UIOA;
+ } else {
+ /* Error of some sort, no more uioa */
+ uioap->uioa_state &= UIOA_CLR;
+ uioap->uioa_state |= UIOA_FINI;
+ return;
+ }
+ }
+ /* Save last wbp processed */
+ lbp = wbp;
+ } while ((wbp = wbp->b_cont) != NULL);
+ } while ((bp = nbp) != NULL);
+
+ return;
+
+nospace:
+ /* Not enough uio space, no more uioa */
+ uioap->uioa_state &= UIOA_CLR;
+ uioap->uioa_state |= UIOA_FINI;
+
+ /*
+ * If we processed 1 or more mblk_t(s) then we need to split the
+ * current mblk_t chain in 2 so that all the uioamove()ed mblk_t(s)
+ * are in the current chain and the rest are in the following new
+ * chain.
+ */
+ if (lbp != NULL) {
+ /* New end of current chain */
+ lbp->b_cont = NULL;
+
+ /* Insert new chain wbp after bp */
+ if ((wbp->b_next = nbp) != NULL)
+ nbp->b_prev = wbp;
+ else
+ q->q_last = wbp;
+ wbp->b_prev = bp;
+ bp->b_next = wbp;
+ }
+}
diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c
index 650a4cfaf9..a7750e2ec3 100644
--- a/usr/src/uts/common/os/strsubr.c
+++ b/usr/src/uts/common/os/strsubr.c
@@ -2467,13 +2467,18 @@ devflg_to_qflag(struct streamtab *stp, uint32_t devflag, uint32_t *qflagp,
/*
* Private flag used by a transport module to indicate
* to sockfs that it supports direct-access mode without
- * having to go through STREAMS.
+ * having to go through STREAMS or the transport can use
+ * sodirect_t sharing to bypass STREAMS for receive-side
+ * M_DATA processing.
*/
- if (devflag & _D_DIRECT) {
+ if (devflag & (_D_DIRECT|_D_SODIRECT)) {
/* Reject unless the module is fully-MT (no perimeter) */
if ((qflag & QMT_TYPEMASK) != QMTSAFE)
goto bad;
- qflag |= _QDIRECT;
+ if (devflag & _D_DIRECT)
+ qflag |= _QDIRECT;
+ if (devflag & _D_SODIRECT)
+ qflag |= _QSODIRECT;
}
*qflagp = qflag;
diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile
index d0d531088f..f072b5e18f 100644
--- a/usr/src/uts/common/sys/Makefile
+++ b/usr/src/uts/common/sys/Makefile
@@ -488,6 +488,7 @@ CHKHDRS= \
socket_impl.h \
socketvar.h \
sockio.h \
+ sodirect.h \
squeue.h \
squeue_impl.h \
srn.h \
diff --git a/usr/src/uts/common/sys/conf.h b/usr/src/uts/common/sys/conf.h
index 3f6300e581..435cffb35f 100644
--- a/usr/src/uts/common/sys/conf.h
+++ b/usr/src/uts/common/sys/conf.h
@@ -22,7 +22,7 @@
/* All Rights Reserved */
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -221,6 +221,9 @@ extern int cdev_prop_op(dev_t, dev_info_t *, ddi_prop_op_t,
#define D_OPEN_RETURNS_EINTR 0x100000 /* EINTR expected from open(9E) */
+#define _D_SODIRECT 0x200000 /* Private flag for transport modules used */
+ /* to enable _QSODIRECT for a STREAMS Q */
+
#endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/sys/dcopy.h b/usr/src/uts/common/sys/dcopy.h
new file mode 100644
index 0000000000..e700ed9cf6
--- /dev/null
+++ b/usr/src/uts/common/sys/dcopy.h
@@ -0,0 +1,235 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DCOPY_H
+#define _SYS_DCOPY_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+
+/*
+ * *** This interface is for private use by the IP stack only ***
+ */
+
+/* Function return status */
+#define DCOPY_FAILURE (-1)
+#define DCOPY_SUCCESS (0)
+#define DCOPY_NORESOURCES (1) /* _alloc & _cmd_alloc, _cmd_post only */
+#define DCOPY_PENDING (0x10) /* dcopy_poll(), dcopy_unregister() */
+#define DCOPY_COMPLETED (0x20) /* dcopy_poll() only */
+
+
+/* dq_version */
+#define DCOPY_QUERY_V0 0
+
+typedef struct dcopy_query_s {
+ int dq_version; /* DCOPY_QUERY_V0 */
+ uint_t dq_num_channels; /* number of dma channels */
+} dcopy_query_t;
+
+/*
+ * dcopy_query()
+ * query for the number of DMA engines usable in the system.
+ */
+void dcopy_query(dcopy_query_t *query);
+
+
+typedef struct dcopy_channel_s *dcopy_handle_t;
+
+/* dcopy_alloc() and dcopy_cmd_alloc() common flags */
+#define DCOPY_SLEEP (0)
+#define DCOPY_NOSLEEP (1 << 0)
+
+/*
+ * dcopy_alloc()
+ * Allocate a DMA channel which is used for posting DMA requests. Note: this
+ * does not give the caller exclusive access to the DMA engine. Commands
+ * posted to a channel will complete in order.
+ * flags - (DCOPY_SLEEP, DCOPY_NOSLEEP)
+ * returns => DCOPY_FAILURE, DCOPY_SUCCESS, DCOPY_NORESOURCES
+ */
+int dcopy_alloc(int flags, dcopy_handle_t *handle);
+
+/*
+ * dcopy_free()
+ * Free the DMA channel. The client can no longer use the handle to post or
+ * poll for status on posts which were previously done on this channel.
+ */
+void dcopy_free(dcopy_handle_t *handle);
+
+/* dq_version */
+#define DCOPY_QUERY_CHANNEL_V0 0
+
+/* Per DMA channel info */
+typedef struct dcopy_query_channel_s {
+ int qc_version; /* DCOPY_QUERY_CHANNEL_V0 */
+
+ /* Does DMA channel support DCA */
+ boolean_t qc_dca_supported;
+
+ /* device id and device specific capabilities */
+ uint64_t qc_id;
+ uint64_t qc_capabilities;
+
+ /*
+ * DMA channel size. This may not be the same as the number of posts
+ * that the DMA channel can handle since a post may consume 1 or more
+ * entries.
+ */
+ uint64_t qc_channel_size;
+
+ /* DMA channel number within the device. Not unique across devices */
+ uint64_t qc_chan_num;
+} dcopy_query_channel_t;
+
+/*
+ * dcopy_query_channel()
+ * query DMA engines capabilities
+ */
+void dcopy_query_channel(dcopy_handle_t handle, dcopy_query_channel_t *query);
+
+
+/* dp_version */
+#define DCOPY_CMD_V0 0
+
+/* dp_cmd */
+#define DCOPY_CMD_COPY 0x1
+
+/* dp_flags */
+/*
+ * DCOPY_CMD_QUEUE
+ * Hint to queue up the post but don't notify the DMA engine. This can be
+ * used as an optimization when multiple posts are going to be queued up and
+ * you only want to notify the DMA engine after the last post. Note, this does
+ * not mean the DMA engine won't process the request since it could notice
+ * it anyway.
+ * DCOPY_CMD_NOSTAT
+ * Don't generate a status. If this flag is used, you cannot poll for
+ * completion status on this command. This can be a useful performance
+ * optimization if you're posting multiple commands and just want to poll on
+ * the last command.
+ * DCOPY_CMD_DCA
+ * If DCA is supported, direct this and all future command data (until the
+ * next command with DCOPY_POST_DCA set) to the processor specified in
+ * dp_dca_id. This flag is ignored if DCA is not supported.
+ * DCOPY_CMD_INTR
+ * Generate an interrupt when command completes. This flag is required if
+ * the caller is going to call dcopy_cmd_poll() with DCOPY_POLL_BLOCK set
+ * for this command.
+ */
+#define DCOPY_CMD_NOFLAGS (0)
+#define DCOPY_CMD_QUEUE (1 << 0)
+#define DCOPY_CMD_NOSTAT (1 << 1)
+#define DCOPY_CMD_DCA (1 << 2)
+#define DCOPY_CMD_INTR (1 << 3)
+
+typedef struct dcopy_cmd_copy_s {
+ uint64_t cc_source; /* Source physical address */
+ uint64_t cc_dest; /* Destination physical address */
+ size_t cc_size;
+} dcopy_cmd_copy_t;
+
+typedef union dcopy_cmd_u {
+ dcopy_cmd_copy_t copy;
+} dcopy_cmd_u_t;
+
+typedef struct dcopy_cmd_priv_s *dcopy_cmd_priv_t;
+
+struct dcopy_cmd_s {
+ uint_t dp_version; /* DCOPY_CMD_V0 */
+ uint_t dp_flags;
+ uint64_t dp_cmd;
+ dcopy_cmd_u_t dp;
+ uint32_t dp_dca_id;
+ dcopy_cmd_priv_t dp_private;
+};
+typedef struct dcopy_cmd_s *dcopy_cmd_t;
+
+
+/*
+ * dcopy_cmd_alloc() specific flags
+ * DCOPY_ALLOC_LINK - when set, the caller passes in a previously alloced
+ * command in cmd. dcopy_cmd_alloc() will allocate a new command and
+ * link it to the old command. The caller can use this to build a
+ * chain of commands, keeping only the last cmd alloced. calling
+ * dcopy_cmd_free() with the last cmd alloced in the chain will free all of
+ * the commands in the chain. dcopy_cmd_post() and dcopy_cmd_poll() have
+ * no knowledge of a chain of commands. It's only used for alloc/free.
+ */
+#define DCOPY_ALLOC_LINK (1 << 16)
+
+/*
+ * dcopy_cmd_alloc()
+ * allocate a command. A command can be re-used after it completes.
+ * flags - (DCOPY_SLEEP || DCOPY_NOSLEEP), DCOPY_ALLOC_LINK
+ * returns => DCOPY_FAILURE, DCOPY_SUCCESS, DCOPY_NORESOURCES
+ */
+int dcopy_cmd_alloc(dcopy_handle_t handle, int flags, dcopy_cmd_t *cmd);
+
+/*
+ * dcopy_cmd_free()
+ * free the command. This call cannot be called after dcopy_free().
+ */
+void dcopy_cmd_free(dcopy_cmd_t *cmd);
+
+/*
+ * dcopy_cmd_post()
+ * post a command (allocated from dcopy_cmd_alloc()) to the DMA channel
+ * returns => DCOPY_FAILURE, DCOPY_SUCCESS, DCOPY_NORESOURCES
+ */
+int dcopy_cmd_post(dcopy_cmd_t cmd);
+
+/* dcopy_cmd_poll() flags */
+#define DCOPY_POLL_NOFLAGS (0)
+#define DCOPY_POLL_BLOCK (1 << 0)
+
+/*
+ * dcopy_cmd_poll()
+ * poll on completion status of a previous post. This call cannot be called
+ * after dcopy_free().
+ *
+ * if flags == DCOPY_POLL_NOFLAGS, return status can be DCOPY_FAILURE,
+ * DCOPY_PENDING, or DCOPY_COMPLETED.
+ *
+ * if flags & DCOPY_POLL_BLOCK, return status can be DCOPY_FAILURE or
+ * DCOPY_COMPLETED. DCOPY_POLL_BLOCK can only be set in base context.
+ *
+ * The command cannot be re-used or freed until the command has completed
+ * (e.g. DCOPY_FAILURE or DCOPY_COMPLETED).
+ */
+int dcopy_cmd_poll(dcopy_cmd_t cmd, int flags);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DCOPY_H */
diff --git a/usr/src/uts/common/sys/dcopy_device.h b/usr/src/uts/common/sys/dcopy_device.h
new file mode 100644
index 0000000000..25e95b2aa8
--- /dev/null
+++ b/usr/src/uts/common/sys/dcopy_device.h
@@ -0,0 +1,154 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DCOPY_DEVICE_H
+#define _SYS_DCOPY_DEVICE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <sys/dcopy.h>
+
+/*
+ * private command state. Space for this structure should be allocated during
+ * (*cb_cmd_alloc). The DMA driver must set dp_private in dcopy_cmd_t to point
+ * to the memory it allocated. Other than pr_device_cmd_private, the DMA driver
+ * should not touch any of the fields in this structure. pr_device_cmd_private
+ * is a private pointer for the DMA engine to use.
+ */
+struct dcopy_cmd_priv_s {
+ /*
+ * we only init the state used to track a command which blocks when it
+ * actually blocks. pr_block_init tells us when we need to clean it
+ * up during a cmd_free.
+ */
+ boolean_t pr_block_init;
+
+ /* dcopy_poll blocking state */
+ list_node_t pr_poll_list_node;
+ volatile boolean_t pr_wait;
+ kmutex_t pr_mutex;
+ kcondvar_t pr_cv;
+
+ /* back pointer to the command */
+ dcopy_cmd_t pr_cmd;
+
+ /* shortcut to the channel we're on */
+ struct dcopy_channel_s *pr_channel;
+
+ /* DMA driver private pointer */
+ void *pr_device_cmd_private;
+};
+
+/* cb_version */
+#define DCOPY_DEVICECB_V0 0
+
+typedef struct dcopy_device_chaninfo_s {
+ uint_t di_chan_num;
+} dcopy_device_chaninfo_t;
+
+typedef struct dcopy_device_cb_s {
+ int cb_version;
+ int cb_res1;
+
+ /* allocate/free a DMA channel. See dcopy.h for return status */
+ int (*cb_channel_alloc)(void *device_private,
+ dcopy_handle_t handle, int flags, uint_t size,
+ dcopy_query_channel_t *info, void *channel_private);
+ void (*cb_channel_free)(void *channel_private);
+
+ /* allocate/free a command. See dcopy.h for return status */
+ int (*cb_cmd_alloc)(void *channel_private, int flags,
+ dcopy_cmd_t *cmd);
+ void (*cb_cmd_free)(void *channel_private, dcopy_cmd_t *cmd);
+
+ /*
+ * post a command/poll for command status. See dcopy.h for return
+ * status
+ */
+ int (*cb_cmd_post)(void *channel_private, dcopy_cmd_t cmd);
+ int (*cb_cmd_poll)(void *channel_private, dcopy_cmd_t cmd);
+
+ /*
+ * if dcopy_device_unregister() returns DCOPY_PENDING, dcopy will
+ * call this routine when all the channels are no longer being
+ * used and have been free'd up. e.g. it's safe for the DMA driver
+ * to detach.
+ * status = DCOPY_SUCCESS || DCOPY_FAILURE
+ */
+ void (*cb_unregister_complete)(void *device_private, int status);
+} dcopy_device_cb_t;
+
+
+typedef struct dcopy_device_info_s {
+ dev_info_t *di_dip;
+ dcopy_device_cb_t *di_cb; /* must be a static array */
+ uint_t di_num_dma;
+ uint_t di_maxxfer;
+ uint_t di_capabilities;
+ uint64_t di_id;
+} dcopy_device_info_t;
+
+typedef struct dcopy_device_s *dcopy_device_handle_t;
+
+/* dcopy_device_notify() status */
+#define DCOPY_COMPLETION 0
+
+/*
+ * dcopy_device_register()
+ * register the DMA device with dcopy.
+ * return status => DCOPY_FAILURE, DCOPY_SUCCESS
+ */
+int dcopy_device_register(void *device_private, dcopy_device_info_t *info,
+ dcopy_device_handle_t *handle);
+
+/*
+ * dcopy_device_unregister()
+ * try to unregister the DMA device with dcopy. If the DMA engines are
+ * still being used by upper layer modules, DCOPY_PENDING will be returned.
+ * return status => DCOPY_FAILURE, DCOPY_SUCCESS, DCOPY_PENDING
+ * if DCOPY_PENDING, (*cb_unregister_complete)() will be called when
+ * completed.
+ */
+int dcopy_device_unregister(dcopy_device_handle_t *handle);
+
+/*
+ * dcopy_device_channel_notify()
+ * Notify dcopy of an event.
+ * dcopy_handle_t handle => what was passed into (*cb_alloc)()
+ * status => DCOPY_COMPLETION
+ */
+void dcopy_device_channel_notify(dcopy_handle_t handle, int status);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DCOPY_DEVICE_H */
diff --git a/usr/src/uts/common/sys/socketvar.h b/usr/src/uts/common/sys/socketvar.h
index 0680546ade..178a8a2905 100644
--- a/usr/src/uts/common/sys/socketvar.h
+++ b/usr/src/uts/common/sys/socketvar.h
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -50,14 +50,13 @@
#include <sys/file.h>
#include <sys/param.h>
#include <sys/zone.h>
+#include <sys/sodirect.h>
#include <inet/kssl/ksslapi.h>
#ifdef __cplusplus
extern "C" {
#endif
-
-
/*
* Internal representation used for addresses.
*/
@@ -333,6 +332,9 @@ struct sonode {
kssl_endpt_type_t so_kssl_type; /* is proxy/is proxied/none */
kssl_ent_t so_kssl_ent; /* SSL config entry */
kssl_ctx_t so_kssl_ctx; /* SSL session context */
+
+ /* != NULL for sodirect_t enabled socket */
+ sodirect_t *so_direct;
};
/* flags */
@@ -375,6 +377,7 @@ struct sonode {
#define SS_MOREDATA 0x00100000 /* NCAfs: NCA has more data */
#define SS_DIRECT 0x00200000 /* transport is directly below */
+#define SS_SODIRECT 0x00400000 /* transport supports sodirect */
#define SS_LADDR_VALID 0x01000000 /* so_laddr valid for user */
#define SS_FADDR_VALID 0x02000000 /* so_faddr valid for user */
diff --git a/usr/src/uts/common/sys/sodirect.h b/usr/src/uts/common/sys/sodirect.h
new file mode 100644
index 0000000000..49609bc5af
--- /dev/null
+++ b/usr/src/uts/common/sys/sodirect.h
@@ -0,0 +1,101 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#ifndef _SYS_SODIRECT_H
+#define _SYS_SODIRECT_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Sodirect ...
+ *
+ * Currently the sodirect_t uses the sockfs streamhead STREAMS Q directly;
+ * in the future, when we have STREAMless sockets, a sonode Q will have to
+ * be implemented; however, the sodirect KPI shouldn't need to change.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct sodirect_s {
+ uint32_t sod_state; /* State bits */
+ uint32_t sod_want; /* Pending read byte count or 0 */
+ queue_t *sod_q; /* Socket Q */
+ int (*sod_enqueue)(); /* Call to enqueue an mblk_t */
+ void (*sod_wakeup)(); /* Call to awaken a read()er, if any */
+ mblk_t *sod_uioafh; /* To be freed list head, or NULL */
+ mblk_t *sod_uioaft; /* To be freed list tail */
+ kmutex_t *sod_lock; /* Lock needed to protect all members */
+ uioa_t sod_uioa; /* Pending uio_t for uioa_t use */
+} sodirect_t;
+
+/*
+ * sod_state bits:
+ */
+
+#define SOD_DISABLED 0 /* No more sodirect */
+
+#define SOD_ENABLED 0x0001 /* sodirect_t enabled */
+
+#define SOD_WAKE_NOT 0x0010 /* Wakeup not needed */
+#define SOD_WAKE_NEED 0x0020 /* Wakeup needed */
+#define SOD_WAKE_DONE 0x0040 /* Wakeup done */
+#define SOD_WAKE_CLR ~(SOD_WAKE_NOT|SOD_WAKE_NEED|SOD_WAKE_DONE)
+
+/*
+ * Useful macros:
+ */
+
+#define SOD_QSETBE(p) ((p)->sod_q->q_flag |= QWANTW)
+#define SOD_QCLRBE(p) ((p)->sod_q->q_flag &= ~QWANTW)
+#define SOD_QEMPTY(p) ((p)->sod_q->q_first == NULL)
+#define SOD_QFULL(p) ((p)->sod_q->q_flag & QFULL)
+#define SOD_QCNT(p) ((p)->sod_q->q_count)
+
+#define SOD_DISABLE(p) (p)->sod_state &= ~SOD_ENABLED
+
+#define SOD_QTOSODP(q) (q)->q_stream->sd_sodirect
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SODIRECT_H */
diff --git a/usr/src/uts/common/sys/stream.h b/usr/src/uts/common/sys/stream.h
index 7142a1f19d..6720c14718 100644
--- a/usr/src/uts/common/sys/stream.h
+++ b/usr/src/uts/common/sys/stream.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -190,6 +190,8 @@ typedef struct queue {
#define _QASSOCIATED 0x10000000 /* queue is associated with a device */
#define _QDIRECT 0x20000000 /* Private; transport module uses */
/* direct interface to/from sockfs */
+#define _QSODIRECT 0x40000000 /* Private, transport module shares */
+ /* an sodirect_t with sockfs */
/* queue sqflags (protected by SQLOCK). */
#define Q_SQQUEUED 0x01 /* Queue is in the syncq list */
@@ -400,6 +402,7 @@ typedef struct bcache {
*/
#define DBLK_REFMIN 0x01 /* min refcnt stored in low bit */
#define DBLK_COOKED 0x02 /* message has been processed once */
+#define DBLK_UIOA 0x04 /* uioamove() is pending */
/*
* db_struioflag values:
diff --git a/usr/src/uts/common/sys/strsubr.h b/usr/src/uts/common/sys/strsubr.h
index 6be0519425..41c1fdf0b3 100644
--- a/usr/src/uts/common/sys/strsubr.h
+++ b/usr/src/uts/common/sys/strsubr.h
@@ -46,6 +46,7 @@
#include <sys/proc.h>
#include <sys/netstack.h>
#include <sys/modhash.h>
+#include <sys/sodirect.h>
#ifdef __cplusplus
extern "C" {
@@ -94,9 +95,8 @@ extern "C" {
* sd_mark
* sd_closetime
* sd_wakeq
- * sd_uiordq
- * sd_uiowrq
* sd_maxblk
+ * sd_sodirect
*
* The following fields are modified only by the allocator, which
* has exclusive access to them at that time:
@@ -244,6 +244,10 @@ typedef struct stdata {
kcondvar_t sd_zcopy_wait;
uint_t sd_copyflag; /* copy-related flags */
zoneid_t sd_anchorzone; /* Allow removal from same zone only */
+ /*
+ * Support for socket direct.
+ */
+ sodirect_t *sd_sodirect; /* pointer to shared sodirect_t */
} stdata_t;
/*
diff --git a/usr/src/uts/common/sys/uio.h b/usr/src/uts/common/sys/uio.h
index 3e9e4a5eda..4f0aff49f6 100644
--- a/usr/src/uts/common/sys/uio.h
+++ b/usr/src/uts/common/sys/uio.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -101,6 +100,49 @@ typedef struct uio {
ssize_t uio_resid; /* residual count */
} uio_t;
+/*
+ * Extended uio_t uioa_t used for asynchronous uio.
+ *
+ * Note: UIOA_IOV_MAX is defined and used as it is in "fs/vncalls.c"
+ * as there isn't a formal definition of IOV_MAX for the kernel.
+ */
+#define UIOA_IOV_MAX 16
+
+typedef struct uioa_page_s { /* locked uio_iov state */
+ int uioa_pfncnt; /* count of pfn_t(s) in *uioa_ppp */
+ void **uioa_ppp; /* page_t or pfn_t array */
+ caddr_t uioa_base; /* address base */
+ size_t uioa_len; /* span length */
+} uioa_page_t;
+
+typedef struct uioa_s {
+ iovec_t *uio_iov; /* pointer to array of iovecs */
+ int uio_iovcnt; /* number of iovecs */
+ lloff_t _uio_offset; /* file offset */
+ uio_seg_t uio_segflg; /* address space (kernel or user) */
+ uint16_t uio_fmode; /* file mode flags */
+ uint16_t uio_extflg; /* extended flags */
+ lloff_t _uio_limit; /* u-limit (maximum byte offset) */
+ ssize_t uio_resid; /* residual count */
+ /*
+ * uioa extended members.
+ */
+ uint32_t uioa_state; /* state of asynch i/o */
+ uioa_page_t *uioa_lcur; /* pointer into uioa_locked[] */
+ void **uioa_lppp; /* pointer into lcur->uioa_ppp[] */
+ void *uioa_hwst[4]; /* opaque hardware state */
+ uioa_page_t uioa_locked[UIOA_IOV_MAX]; /* Per iov locked pages */
+} uioa_t;
+
+#define UIOA_ALLOC 0x0001 /* allocated but not yet initialized */
+#define UIOA_INIT 0x0002 /* initialized but not yet enabled */
+#define UIOA_ENABLED 0x0004 /* enabled, asynch i/o active */
+#define UIOA_FINI 0x0008 /* finished waiting for uioafini() */
+
+#define UIOA_CLR (~0x000F) /* clear mutually exclusive bits */
+
+#define UIOA_POLL 0x0010 /* need dcopy_poll() */
+
#define uio_loffset _uio_offset._f
#if !defined(_LP64)
#define uio_offset _uio_offset._p._l
@@ -127,10 +169,24 @@ typedef enum uio_rw { UIO_READ, UIO_WRITE } uio_rw_t;
* access, ie, access bypassing caches, should be used. Filesystems that
* don't initialize this field could experience suboptimal performance due to
* the random data the field contains.
+ *
+ * NOTE: This flag is also used by uioasync callers to pass an extended
+ * uio_t (uioa_t), to uioasync enabled consumers. Unlike above all
+ * consumers of a uioa_t require the uio_extflg to be initialized.
*/
#define UIO_COPY_DEFAULT 0x0000 /* no special options to copy */
#define UIO_COPY_CACHED 0x0001 /* copy should not bypass caches */
+#define UIO_ASYNC 0x0002 /* uio_t is really a uioa_t */
+
+/*
+ * Global uioasync capability shadow state.
+ */
+typedef struct uioasync_s {
+ boolean_t enabled; /* Is uioasync enabled? */
+ size_t mincnt; /* Minimum byte count for use of */
+} uioasync_t;
+
#endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */
#if defined(_KERNEL)
@@ -141,6 +197,11 @@ int uwritec(struct uio *);
void uioskip(uio_t *, size_t);
int uiodup(uio_t *, uio_t *, iovec_t *, int);
+int uioamove(void *, size_t, enum uio_rw, uioa_t *);
+int uioainit(uio_t *, uioa_t *);
+int uioafini(uio_t *, uioa_t *);
+extern uioasync_t uioasync;
+
#else /* defined(_KERNEL) */
#if defined(__STDC__)
diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files
index 8ec4d23a1e..26cf951204 100644
--- a/usr/src/uts/i86pc/Makefile.files
+++ b/usr/src/uts/i86pc/Makefile.files
@@ -161,6 +161,7 @@ DBOOT_OBJS += \
#
GFX_PRIVATE_OBJS += gfx_private.o gfxp_pci.o gfxp_segmap.o \
gfxp_devmap.o gfxp_vgatext.o gfxp_vm.o vgasubr.o
+IOAT_OBJS += ioat.o ioat_rs.o ioat_ioctl.o ioat_chan.o
ISANEXUS_OBJS += isa.o dma_engine.o i8237A.o
PCI_E_MISC_OBJS += pcie.o pcie_fault.o
PCI_E_NEXUS_OBJS += npe.o npe_misc.o
diff --git a/usr/src/uts/i86pc/Makefile.i86pc.shared b/usr/src/uts/i86pc/Makefile.i86pc.shared
index 18bc7610e2..58f309b0dd 100644
--- a/usr/src/uts/i86pc/Makefile.i86pc.shared
+++ b/usr/src/uts/i86pc/Makefile.i86pc.shared
@@ -257,6 +257,7 @@ DRV_KMODS += xsvc
DRV_KMODS += mc-amd
DRV_KMODS += tzmon
DRV_KMODS += battery
+DRV_KMODS += ioat
DRV_KMODS += cpudrv
diff --git a/usr/src/uts/i86pc/Makefile.rules b/usr/src/uts/i86pc/Makefile.rules
index a411dd1ad5..004e6af2e8 100644
--- a/usr/src/uts/i86pc/Makefile.rules
+++ b/usr/src/uts/i86pc/Makefile.rules
@@ -73,6 +73,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/battery/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/ioat/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/mc/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -259,6 +263,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/%.c
$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/battery/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
+$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/ioat/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
+
$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/mc/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
diff --git a/usr/src/uts/i86pc/io/ioat/ioat.c b/usr/src/uts/i86pc/io/ioat/ioat.c
new file mode 100644
index 0000000000..7bf8a559c1
--- /dev/null
+++ b/usr/src/uts/i86pc/io/ioat/ioat.c
@@ -0,0 +1,665 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/errno.h>
+#include <sys/types.h>
+#include <sys/conf.h>
+#include <sys/kmem.h>
+#include <sys/ddi.h>
+#include <sys/stat.h>
+#include <sys/sunddi.h>
+#include <sys/file.h>
+#include <sys/open.h>
+#include <sys/modctl.h>
+#include <sys/ddi_impldefs.h>
+#include <sys/sysmacros.h>
+
+#include <sys/ioat.h>
+
+static int ioat_open(dev_t *devp, int flag, int otyp, cred_t *cred);
+static int ioat_close(dev_t devp, int flag, int otyp, cred_t *cred);
+static int ioat_attach(dev_info_t *devi, ddi_attach_cmd_t cmd);
+static int ioat_detach(dev_info_t *devi, ddi_detach_cmd_t cmd);
+static int ioat_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg,
+ void **result);
+
+static struct cb_ops ioat_cb_ops = {
+ ioat_open, /* cb_open */
+ ioat_close, /* cb_close */
+ nodev, /* cb_strategy */
+ nodev, /* cb_print */
+ nodev, /* cb_dump */
+ nodev, /* cb_read */
+ nodev, /* cb_write */
+ ioat_ioctl, /* cb_ioctl */
+ nodev, /* cb_devmap */
+ nodev, /* cb_mmap */
+ nodev, /* cb_segmap */
+ nochpoll, /* cb_chpoll */
+ ddi_prop_op, /* cb_prop_op */
+ NULL, /* cb_stream */
+ D_NEW | D_MP | D_64BIT | D_DEVMAP, /* cb_flag */
+ CB_REV
+};
+
+static struct dev_ops ioat_dev_ops = {
+ DEVO_REV, /* devo_rev */
+ 0, /* devo_refcnt */
+ ioat_getinfo, /* devo_getinfo */
+ nulldev, /* devo_identify */
+ nulldev, /* devo_probe */
+ ioat_attach, /* devo_attach */
+ ioat_detach, /* devo_detach */
+ nodev, /* devo_reset */
+ &ioat_cb_ops, /* devo_cb_ops */
+ NULL, /* devo_bus_ops */
+ NULL /* power */
+};
+
+static struct modldrv ioat_modldrv = {
+ &mod_driverops, /* Type of module. This one is a driver */
+ "ioat driver v%I%", /* Name of the module. */
+ &ioat_dev_ops, /* driver ops */
+};
+
+static struct modlinkage ioat_modlinkage = {
+ MODREV_1,
+ (void *) &ioat_modldrv,
+ NULL
+};
+
+
+void *ioat_statep;
+
+static int ioat_chip_init(ioat_state_t *state);
+static void ioat_chip_fini(ioat_state_t *state);
+static int ioat_drv_init(ioat_state_t *state);
+static void ioat_drv_fini(ioat_state_t *state);
+static uint_t ioat_isr(caddr_t parm);
+static void ioat_intr_enable(ioat_state_t *state);
+static void ioat_intr_disable(ioat_state_t *state);
+void ioat_detach_finish(ioat_state_t *state);
+
+
+ddi_device_acc_attr_t ioat_acc_attr = {
+ DDI_DEVICE_ATTR_V0, /* devacc_attr_version */
+ DDI_NEVERSWAP_ACC, /* devacc_attr_endian_flags */
+ DDI_STORECACHING_OK_ACC, /* devacc_attr_dataorder */
+ DDI_DEFAULT_ACC /* devacc_attr_access */
+};
+
+/* dcopy callback interface */
+dcopy_device_cb_t ioat_cb = {
+ DCOPY_DEVICECB_V0,
+ 0, /* reserved */
+ ioat_channel_alloc,
+ ioat_channel_free,
+ ioat_cmd_alloc,
+ ioat_cmd_free,
+ ioat_cmd_post,
+ ioat_cmd_poll,
+ ioat_unregister_complete
+};
+
+/*
+ * _init()
+ */
+int
+_init(void)
+{
+ int e;
+
+ e = ddi_soft_state_init(&ioat_statep, sizeof (ioat_state_t), 1);
+ if (e != 0) {
+ return (e);
+ }
+
+ e = mod_install(&ioat_modlinkage);
+ if (e != 0) {
+ ddi_soft_state_fini(&ioat_statep);
+ return (e);
+ }
+
+ return (0);
+}
+
+/*
+ * _info()
+ */
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&ioat_modlinkage, modinfop));
+}
+
+/*
+ * _fini()
+ */
+int
+_fini(void)
+{
+ int e;
+
+ e = mod_remove(&ioat_modlinkage);
+ if (e != 0) {
+ return (e);
+ }
+
+ ddi_soft_state_fini(&ioat_statep);
+
+ return (0);
+}
+
+/*
+ * ioat_attach()
+ */
+static int
+ioat_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ ioat_state_t *state;
+ int instance;
+ int e;
+
+
+ switch (cmd) {
+ case DDI_ATTACH:
+ break;
+
+ case DDI_RESUME:
+ instance = ddi_get_instance(dip);
+ state = ddi_get_soft_state(ioat_statep, instance);
+ if (state == NULL) {
+ return (DDI_FAILURE);
+ }
+ e = ioat_channel_resume(state);
+ if (e != DDI_SUCCESS) {
+ return (DDI_FAILURE);
+ }
+ ioat_intr_enable(state);
+ return (DDI_SUCCESS);
+
+ default:
+ return (DDI_FAILURE);
+ }
+
+ instance = ddi_get_instance(dip);
+ e = ddi_soft_state_zalloc(ioat_statep, instance);
+ if (e != DDI_SUCCESS) {
+ return (DDI_FAILURE);
+ }
+ state = ddi_get_soft_state(ioat_statep, instance);
+ if (state == NULL) {
+ goto attachfail_get_soft_state;
+ }
+
+ state->is_dip = dip;
+ state->is_instance = instance;
+
+ /* setup the registers, save away some device info */
+ e = ioat_chip_init(state);
+ if (e != DDI_SUCCESS) {
+ goto attachfail_chip_init;
+ }
+
+ /* initialize driver state, must be after chip init */
+ e = ioat_drv_init(state);
+ if (e != DDI_SUCCESS) {
+ goto attachfail_drv_init;
+ }
+
+ /* create the minor node (for the ioctl) */
+ e = ddi_create_minor_node(dip, "ioat", S_IFCHR, instance, DDI_PSEUDO,
+ 0);
+ if (e != DDI_SUCCESS) {
+ goto attachfail_minor_node;
+ }
+
+ /* Enable device interrupts */
+ ioat_intr_enable(state);
+
+ /* Report that driver was loaded */
+ ddi_report_dev(dip);
+
+ /* register with dcopy */
+ e = dcopy_device_register(state, &state->is_deviceinfo,
+ &state->is_device_handle);
+ if (e != DCOPY_SUCCESS) {
+ goto attachfail_register;
+ }
+
+ return (DDI_SUCCESS);
+
+attachfail_register:
+ ioat_intr_disable(state);
+ ddi_remove_minor_node(dip, NULL);
+attachfail_minor_node:
+ ioat_drv_fini(state);
+attachfail_drv_init:
+ ioat_chip_fini(state);
+attachfail_chip_init:
+attachfail_get_soft_state:
+ (void) ddi_soft_state_free(ioat_statep, instance);
+
+ return (DDI_FAILURE);
+}
+
+/*
+ * ioat_detach()
+ */
+static int
+ioat_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ ioat_state_t *state;
+ int instance;
+ int e;
+
+
+ instance = ddi_get_instance(dip);
+ state = ddi_get_soft_state(ioat_statep, instance);
+ if (state == NULL) {
+ return (DDI_FAILURE);
+ }
+
+ switch (cmd) {
+ case DDI_DETACH:
+ break;
+
+ case DDI_SUSPEND:
+ ioat_channel_suspend(state);
+ return (DDI_SUCCESS);
+
+ default:
+ return (DDI_FAILURE);
+ }
+
+ /*
+ * try to unregister from dcopy. Since this driver doesn't follow the
+ * traditional parent/child model, we may still be in use so we can't
+ * detach yet.
+ */
+ e = dcopy_device_unregister(&state->is_device_handle);
+ if (e != DCOPY_SUCCESS) {
+ if (e == DCOPY_PENDING) {
+ cmn_err(CE_NOTE, "device busy, performing asynchronous"
+ " detach\n");
+ }
+ return (DDI_FAILURE);
+ }
+
+ ioat_detach_finish(state);
+
+ return (DDI_SUCCESS);
+}
+
+/*
+ * ioat_getinfo()
+ */
+/*ARGSUSED*/
+static int
+ioat_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
+{
+ ioat_state_t *state;
+ int instance;
+ dev_t dev;
+ int e;
+
+
+ dev = (dev_t)arg;
+ instance = getminor(dev);
+
+ switch (cmd) {
+ case DDI_INFO_DEVT2DEVINFO:
+ state = ddi_get_soft_state(ioat_statep, instance);
+ if (state == NULL) {
+ return (DDI_FAILURE);
+ }
+ *result = (void *)state->is_dip;
+ e = DDI_SUCCESS;
+ break;
+
+ case DDI_INFO_DEVT2INSTANCE:
+ *result = (void *)(uintptr_t)instance;
+ e = DDI_SUCCESS;
+ break;
+
+ default:
+ e = DDI_FAILURE;
+ break;
+ }
+
+ return (e);
+}
+
+
+/*
+ * ioat_open()
+ */
+/*ARGSUSED*/
+static int
+ioat_open(dev_t *devp, int flag, int otyp, cred_t *cred)
+{
+ ioat_state_t *state;
+ int instance;
+
+ instance = getminor(*devp);
+ state = ddi_get_soft_state(ioat_statep, instance);
+ if (state == NULL) {
+ return (ENXIO);
+ }
+
+ return (0);
+}
+
+
+/*
+ * ioat_close()
+ */
+/*ARGSUSED*/
+static int
+ioat_close(dev_t devp, int flag, int otyp, cred_t *cred)
+{
+ return (0);
+}
+
+
+/*
+ * ioat_chip_init()
+ */
+static int
+ioat_chip_init(ioat_state_t *state)
+{
+ ddi_device_acc_attr_t attr;
+ int e;
+
+
+ attr.devacc_attr_version = DDI_DEVICE_ATTR_V0;
+ attr.devacc_attr_endian_flags = DDI_NEVERSWAP_ACC;
+ attr.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
+
+ e = ddi_regs_map_setup(state->is_dip, 1, (caddr_t *)&state->is_genregs,
+ 0, 0, &attr, &state->is_reg_handle);
+ if (e != DDI_SUCCESS) {
+ goto chipinitfail_regsmap;
+ }
+
+ /* save away ioat chip info */
+ state->is_num_channels = (uint_t)ddi_get8(state->is_reg_handle,
+ &state->is_genregs[IOAT_CHANCNT]);
+ state->is_maxxfer = (uint_t)ddi_get8(state->is_reg_handle,
+ &state->is_genregs[IOAT_XFERCAP]);
+ state->is_chanoff = (uintptr_t)ddi_get16(state->is_reg_handle,
+ (uint16_t *)&state->is_genregs[IOAT_PERPORT_OFF]);
+ state->is_cbver = (uint_t)ddi_get8(state->is_reg_handle,
+ &state->is_genregs[IOAT_CBVER]);
+ state->is_intrdelay = (uint_t)ddi_get16(state->is_reg_handle,
+ (uint16_t *)&state->is_genregs[IOAT_INTRDELAY]);
+ state->is_status = (uint_t)ddi_get16(state->is_reg_handle,
+ (uint16_t *)&state->is_genregs[IOAT_CSSTATUS]);
+ state->is_capabilities = (uint_t)ddi_get32(state->is_reg_handle,
+ (uint32_t *)&state->is_genregs[IOAT_DMACAPABILITY]);
+
+ if (state->is_cbver & 0x10) {
+ state->is_ver = IOAT_CBv1;
+ } else if (state->is_cbver & 0x20) {
+ state->is_ver = IOAT_CBv2;
+ } else {
+ goto chipinitfail_version;
+ }
+
+ return (DDI_SUCCESS);
+
+chipinitfail_version:
+ ddi_regs_map_free(&state->is_reg_handle);
+chipinitfail_regsmap:
+ return (DDI_FAILURE);
+}
+
+
+/*
+ * ioat_chip_fini()
+ */
+static void
+ioat_chip_fini(ioat_state_t *state)
+{
+ ddi_regs_map_free(&state->is_reg_handle);
+}
+
+
+/*
+ * ioat_drv_init()
+ */
+static int
+ioat_drv_init(ioat_state_t *state)
+{
+ ddi_acc_handle_t handle;
+ int e;
+
+
+ mutex_init(&state->is_mutex, NULL, MUTEX_DRIVER, NULL);
+
+ state->is_deviceinfo.di_dip = state->is_dip;
+ state->is_deviceinfo.di_num_dma = state->is_num_channels;
+ state->is_deviceinfo.di_maxxfer = state->is_maxxfer;
+ state->is_deviceinfo.di_capabilities = state->is_capabilities;
+ state->is_deviceinfo.di_cb = &ioat_cb;
+
+ e = pci_config_setup(state->is_dip, &handle);
+ if (e != DDI_SUCCESS) {
+ goto drvinitfail_config_setup;
+ }
+
+ /* read in Vendor ID */
+ state->is_deviceinfo.di_id = (uint64_t)pci_config_get16(handle, 0);
+ state->is_deviceinfo.di_id = state->is_deviceinfo.di_id << 16;
+
+ /* read in Device ID */
+ state->is_deviceinfo.di_id |= (uint64_t)pci_config_get16(handle, 2);
+ state->is_deviceinfo.di_id = state->is_deviceinfo.di_id << 32;
+
+ /* Add in chipset version */
+ state->is_deviceinfo.di_id |= (uint64_t)state->is_cbver;
+ pci_config_teardown(&handle);
+
+ e = ddi_intr_hilevel(state->is_dip, 0);
+ if (e != 0) {
+ cmn_err(CE_WARN, "hilevel interrupt not supported\n");
+ goto drvinitfail_hilevel;
+ }
+
+ /* we don't support MSIs for v2 yet */
+ e = ddi_add_intr(state->is_dip, 0, NULL, NULL, ioat_isr,
+ (caddr_t)state);
+ if (e != DDI_SUCCESS) {
+ goto drvinitfail_add_intr;
+ }
+
+ e = ddi_get_iblock_cookie(state->is_dip, 0, &state->is_iblock_cookie);
+ if (e != DDI_SUCCESS) {
+ goto drvinitfail_iblock_cookie;
+ }
+
+ e = ioat_channel_init(state);
+ if (e != DDI_SUCCESS) {
+ goto drvinitfail_channel_init;
+ }
+
+ return (DDI_SUCCESS);
+
+drvinitfail_channel_init:
+drvinitfail_iblock_cookie:
+ ddi_remove_intr(state->is_dip, 0, state->is_iblock_cookie);
+drvinitfail_add_intr:
+drvinitfail_hilevel:
+drvinitfail_config_setup:
+ mutex_destroy(&state->is_mutex);
+
+ return (DDI_FAILURE);
+}
+
+
+/*
+ * ioat_drv_fini()
+ */
+static void
+ioat_drv_fini(ioat_state_t *state)
+{
+ ioat_channel_fini(state);
+ ddi_remove_intr(state->is_dip, 0, state->is_iblock_cookie);
+ mutex_destroy(&state->is_mutex);
+}
+
+
+/*
+ * ioat_unregister_complete()
+ */
+void
+ioat_unregister_complete(void *device_private, int status)
+{
+ ioat_state_t *state;
+
+
+ state = device_private;
+
+ if (status != DCOPY_SUCCESS) {
+ cmn_err(CE_WARN, "asynchronous detach aborted\n");
+ return;
+ }
+
+ cmn_err(CE_CONT, "detach completing\n");
+ ioat_detach_finish(state);
+}
+
+
+/*
+ * ioat_detach_finish()
+ */
+void
+ioat_detach_finish(ioat_state_t *state)
+{
+ ioat_intr_disable(state);
+ ddi_remove_minor_node(state->is_dip, NULL);
+ ioat_drv_fini(state);
+ ioat_chip_fini(state);
+ (void) ddi_soft_state_free(ioat_statep, state->is_instance);
+}
+
+
+/*
+ * ioat_intr_enable()
+ */
+static void
+ioat_intr_enable(ioat_state_t *state)
+{
+ uint32_t intr_status;
+
+
+ /* Clear any pending interrupts */
+ intr_status = ddi_get32(state->is_reg_handle,
+ (uint32_t *)&state->is_genregs[IOAT_ATTNSTATUS]);
+ if (intr_status != 0) {
+ ddi_put32(state->is_reg_handle,
+ (uint32_t *)&state->is_genregs[IOAT_ATTNSTATUS],
+ intr_status);
+ }
+
+ /* Enable interrupts on the device */
+ ddi_put8(state->is_reg_handle, &state->is_genregs[IOAT_INTRCTL],
+ IOAT_INTRCTL_MASTER_EN);
+}
+
+
+/*
+ * ioat_intr_disable()
+ */
+static void
+ioat_intr_disable(ioat_state_t *state)
+{
+ /*
+ * disable interrupts on the device. A read of the interrupt control
+ * register clears the enable bit.
+ */
+ (void) ddi_get8(state->is_reg_handle,
+ &state->is_genregs[IOAT_INTRCTL]);
+}
+
+
+/*
+ * ioat_isr()
+ */
+static uint_t
+ioat_isr(caddr_t parm)
+{
+ uint32_t intr_status;
+ ioat_state_t *state;
+ uint8_t intrctrl;
+ uint32_t chan;
+ uint_t r;
+ int i;
+
+ state = (ioat_state_t *)parm;
+
+ intrctrl = ddi_get8(state->is_reg_handle,
+ &state->is_genregs[IOAT_INTRCTL]);
+ /* master interrupt enable should always be set */
+ ASSERT(intrctrl & IOAT_INTRCTL_MASTER_EN);
+
+ /* If the interrupt status bit isn't set, it's not ours */
+ if (!(intrctrl & IOAT_INTRCTL_INTR_STAT)) {
+ /* re-set master interrupt enable (since it clears on read) */
+ ddi_put8(state->is_reg_handle,
+ &state->is_genregs[IOAT_INTRCTL], intrctrl);
+ return (DDI_INTR_UNCLAIMED);
+ }
+
+ /* see which channels generated the interrupt */
+ intr_status = ddi_get32(state->is_reg_handle,
+ (uint32_t *)&state->is_genregs[IOAT_ATTNSTATUS]);
+
+ /* call the intr handler for the channels */
+ r = DDI_INTR_UNCLAIMED;
+ chan = 1;
+ for (i = 0; i < state->is_num_channels; i++) {
+ if (intr_status & chan) {
+ ioat_channel_intr(&state->is_channel[i]);
+ r = DDI_INTR_CLAIMED;
+ }
+ chan = chan << 1;
+ }
+
+ /*
+ * if interrupt status bit was set, there should have been an
+ * attention status bit set too.
+ */
+ ASSERT(r == DDI_INTR_CLAIMED);
+
+ /* re-set master interrupt enable (since it clears on read) */
+ ddi_put8(state->is_reg_handle, &state->is_genregs[IOAT_INTRCTL],
+ intrctrl);
+
+ return (r);
+}
diff --git a/usr/src/uts/i86pc/io/ioat/ioat.conf b/usr/src/uts/i86pc/io/ioat/ioat.conf
new file mode 100644
index 0000000000..49d948eddb
--- /dev/null
+++ b/usr/src/uts/i86pc/io/ioat/ioat.conf
@@ -0,0 +1,30 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#ident "%Z%%M% %I% %E% SMI"
+
+#
+# force attach this driver to support misc/driver
+ddi-forceattach=1;
+
diff --git a/usr/src/uts/i86pc/io/ioat/ioat_chan.c b/usr/src/uts/i86pc/io/ioat/ioat_chan.c
new file mode 100644
index 0000000000..8615f9a7ad
--- /dev/null
+++ b/usr/src/uts/i86pc/io/ioat/ioat_chan.c
@@ -0,0 +1,1319 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/errno.h>
+#include <sys/types.h>
+#include <sys/conf.h>
+#include <sys/kmem.h>
+#include <sys/ddi.h>
+#include <sys/stat.h>
+#include <sys/sunddi.h>
+#include <sys/file.h>
+#include <sys/open.h>
+#include <sys/modctl.h>
+#include <sys/ddi_impldefs.h>
+#include <sys/sysmacros.h>
+#include <vm/hat.h>
+#include <vm/as.h>
+#include <sys/mach_mmu.h>
+#ifdef __xpv
+#include <sys/hypervisor.h>
+#endif
+
+#include <sys/ioat.h>
+
+
+extern ddi_device_acc_attr_t ioat_acc_attr;
+
+/* dma attr for the descriptor rings */
+ddi_dma_attr_t ioat_desc_dma_attr = {
+ DMA_ATTR_V0, /* dma_attr_version */
+ 0x0, /* dma_attr_addr_lo */
+ 0xffffffffffffffff, /* dma_attr_addr_hi */
+ 0xffffffff, /* dma_attr_count_max */
+ 0x1000, /* dma_attr_align */
+ 0x1, /* dma_attr_burstsizes */
+ 0x1, /* dma_attr_minxfer */
+ 0xffffffff, /* dma_attr_maxxfer */
+ 0xffffffff, /* dma_attr_seg */
+ 0x1, /* dma_attr_sgllen */
+ 0x1, /* dma_attr_granular */
+ 0x0, /* dma_attr_flags */
+};
+
+/* dma attr for the completion buffers */
+ddi_dma_attr_t ioat_cmpl_dma_attr = {
+ DMA_ATTR_V0, /* dma_attr_version */
+ 0x0, /* dma_attr_addr_lo */
+ 0xffffffffffffffff, /* dma_attr_addr_hi */
+ 0xffffffff, /* dma_attr_count_max */
+ 0x40, /* dma_attr_align */
+ 0x1, /* dma_attr_burstsizes */
+ 0x1, /* dma_attr_minxfer */
+ 0xffffffff, /* dma_attr_maxxfer */
+ 0xffffffff, /* dma_attr_seg */
+ 0x1, /* dma_attr_sgllen */
+ 0x1, /* dma_attr_granular */
+ 0x0, /* dma_attr_flags */
+};
+
+static int ioat_completion_alloc(ioat_channel_t channel);
+static void ioat_completion_free(ioat_channel_t channel);
+static void ioat_channel_start(ioat_channel_t channel);
+static void ioat_channel_reset(ioat_channel_t channel);
+
+int ioat_ring_alloc(ioat_channel_t channel, uint_t desc_cnt);
+void ioat_ring_free(ioat_channel_t channel);
+void ioat_ring_seed(ioat_channel_t channel, ioat_chan_dma_desc_t *desc);
+int ioat_ring_reserve(ioat_channel_t channel, ioat_channel_ring_t *ring,
+ dcopy_cmd_t cmd);
+
+static void ioat_cmd_post_copy(ioat_channel_ring_t *ring, uint64_t src_addr,
+ uint64_t dest_addr, uint32_t size, uint32_t ctrl);
+static void ioat_cmd_post_dca(ioat_channel_ring_t *ring, uint32_t dca_id);
+
+
+/*
+ * ioat_channel_init()
+ */
+int
+ioat_channel_init(ioat_state_t *state)
+{
+ int i;
+
+ /*
+ * initialize each dma channel's state which doesn't change across
+ * channel alloc/free.
+ */
+ state->is_chansize = sizeof (struct ioat_channel_s) *
+ state->is_num_channels;
+ state->is_channel = kmem_zalloc(state->is_chansize, KM_SLEEP);
+ for (i = 0; i < state->is_num_channels; i++) {
+ state->is_channel[i].ic_state = state;
+ state->is_channel[i].ic_regs = (uint8_t *)
+ ((uintptr_t)state->is_genregs +
+ (uintptr_t)(IOAT_CHANNELREG_OFFSET * (i + 1)));
+ }
+
+	/* initialize the allocator (from 0 to state->is_num_channels) */
+ ioat_rs_init(state, 0, state->is_num_channels, &state->is_channel_rs);
+
+ return (DDI_SUCCESS);
+}
+
+
+/*
+ * ioat_channel_fini()
+ */
+void
+ioat_channel_fini(ioat_state_t *state)
+{
+ ioat_rs_fini(&state->is_channel_rs);
+ kmem_free(state->is_channel, state->is_chansize);
+}
+
+
+/*
+ * ioat_channel_alloc()
+ * NOTE: We intentionally don't handle DCOPY_SLEEP (if no channels are
+ * available)
+ */
+/*ARGSUSED*/
+int
+ioat_channel_alloc(void *device_private, dcopy_handle_t handle, int flags,
+ uint_t size, dcopy_query_channel_t *info, void *channel_private)
+{
+#define CHANSTRSIZE 20
+ struct ioat_channel_s *channel;
+ char chanstr[CHANSTRSIZE];
+ ioat_channel_t *chan;
+ ioat_state_t *state;
+ size_t cmd_size;
+ uint_t chan_num;
+ uint32_t estat;
+ int e;
+
+
+ state = (ioat_state_t *)device_private;
+ chan = (ioat_channel_t *)channel_private;
+
+ /* allocate a H/W channel */
+ e = ioat_rs_alloc(state->is_channel_rs, &chan_num);
+ if (e != DDI_SUCCESS) {
+ return (DCOPY_NORESOURCES);
+ }
+
+ channel = &state->is_channel[chan_num];
+ channel->ic_inuse = B_TRUE;
+ channel->ic_chan_num = chan_num;
+ channel->ic_ver = state->is_ver;
+ channel->ic_dca_active = B_FALSE;
+ channel->ic_channel_state = IOAT_CHANNEL_OK;
+ channel->ic_dcopy_handle = handle;
+
+#ifdef DEBUG
+ {
+ /* if we're cbv2, verify that the V2 compatibility bit is set */
+ uint16_t reg;
+ if (channel->ic_ver == IOAT_CBv2) {
+ reg = ddi_get16(state->is_reg_handle,
+ (uint16_t *)&channel->ic_regs[IOAT_CHAN_COMP]);
+ ASSERT(reg & 0x2);
+ }
+ }
+#endif
+
+ /*
+ * Configure DMA channel
+ * Channel In Use
+ * Error Interrupt Enable
+ * Any Error Abort Enable
+ * Error Completion Enable
+ */
+ ddi_put16(state->is_reg_handle,
+ (uint16_t *)&channel->ic_regs[IOAT_CHAN_CTL], 0x011C);
+
+ /* check channel error register, clear any errors */
+ estat = ddi_get32(state->is_reg_handle,
+ (uint32_t *)&channel->ic_regs[IOAT_CHAN_ERR]);
+ if (estat != 0) {
+#ifdef DEBUG
+ cmn_err(CE_CONT, "cleared errors (0x%x) before channel (%d) "
+ "enable\n", estat, channel->ic_chan_num);
+#endif
+ ddi_put32(state->is_reg_handle,
+ (uint32_t *)&channel->ic_regs[IOAT_CHAN_ERR], estat);
+ }
+
+ /* allocate and initialize the descriptor buf */
+ e = ioat_ring_alloc(channel, size);
+ if (e != DDI_SUCCESS) {
+ goto chinitfail_desc_alloc;
+ }
+
+ /* allocate and initialize the completion space */
+ e = ioat_completion_alloc(channel);
+ if (e != DDI_SUCCESS) {
+ goto chinitfail_completion_alloc;
+ }
+
+ /* setup kmem_cache for commands */
+ cmd_size = sizeof (struct dcopy_cmd_s) +
+ sizeof (struct dcopy_cmd_priv_s) +
+ sizeof (struct ioat_cmd_private_s);
+ (void) snprintf(chanstr, CHANSTRSIZE, "ioat%dchan%dcmd",
+ state->is_instance, channel->ic_chan_num);
+ channel->ic_cmd_cache = kmem_cache_create(chanstr, cmd_size, 64,
+ NULL, NULL, NULL, NULL, NULL, 0);
+ if (channel->ic_cmd_cache == NULL) {
+ goto chinitfail_kmem_cache;
+ }
+
+ /* start-up the channel */
+ ioat_channel_start(channel);
+
+ /* fill in the channel info returned to dcopy */
+ info->qc_version = DCOPY_QUERY_CHANNEL_V0;
+ info->qc_id = state->is_deviceinfo.di_id;
+ info->qc_capabilities = (uint64_t)state->is_capabilities;
+ info->qc_channel_size = (uint64_t)size;
+ info->qc_chan_num = (uint64_t)channel->ic_chan_num;
+ if (channel->ic_ver == IOAT_CBv1) {
+ info->qc_dca_supported = B_FALSE;
+ } else {
+ if (info->qc_capabilities & IOAT_DMACAP_DCA) {
+ info->qc_dca_supported = B_TRUE;
+ } else {
+ info->qc_dca_supported = B_FALSE;
+ }
+ }
+
+ *chan = channel;
+
+ return (DCOPY_SUCCESS);
+
+chinitfail_kmem_cache:
+ ioat_completion_free(channel);
+chinitfail_completion_alloc:
+ ioat_ring_free(channel);
+chinitfail_desc_alloc:
+ return (DCOPY_FAILURE);
+}
+
+
+/*
+ * ioat_channel_suspend()
+ */
+/*ARGSUSED*/
+void
+ioat_channel_suspend(ioat_state_t *state)
+{
+ /*
+ * normally you would disable interrupts and reset the H/W here. But
+ * since the suspend framework doesn't know who is using us, it may
+ * not suspend their I/O before us. Since we won't actively be doing
+ * any DMA or interrupts unless someone asks us to, it's safe to not
+ * do anything here.
+ */
+}
+
+
+/*
+ * ioat_channel_resume()
+ */
+int
+ioat_channel_resume(ioat_state_t *state)
+{
+ ioat_channel_ring_t *ring;
+ ioat_channel_t channel;
+ uint32_t estat;
+ int i;
+
+
+ for (i = 0; i < state->is_num_channels; i++) {
+ channel = &state->is_channel[i];
+ ring = channel->ic_ring;
+
+ if (!channel->ic_inuse) {
+ continue;
+ }
+
+ /*
+ * Configure DMA channel
+ * Channel In Use
+ * Error Interrupt Enable
+ * Any Error Abort Enable
+ * Error Completion Enable
+ */
+ ddi_put16(state->is_reg_handle,
+ (uint16_t *)&channel->ic_regs[IOAT_CHAN_CTL], 0x011C);
+
+ /* check channel error register, clear any errors */
+ estat = ddi_get32(state->is_reg_handle,
+ (uint32_t *)&channel->ic_regs[IOAT_CHAN_ERR]);
+ if (estat != 0) {
+#ifdef DEBUG
+ cmn_err(CE_CONT, "cleared errors (0x%x) before channel"
+ " (%d) enable\n", estat, channel->ic_chan_num);
+#endif
+ ddi_put32(state->is_reg_handle,
+ (uint32_t *)&channel->ic_regs[IOAT_CHAN_ERR],
+ estat);
+ }
+
+ /* Re-initialize the ring */
+ bzero(ring->cr_desc, channel->ic_desc_alloc_size);
+ /* write the physical address into the chain address register */
+ if (channel->ic_ver == IOAT_CBv1) {
+ ddi_put32(state->is_reg_handle,
+ (uint32_t *)&channel->ic_regs[IOAT_V1_CHAN_ADDR_LO],
+ (uint32_t)(ring->cr_phys_desc & 0xffffffff));
+ ddi_put32(state->is_reg_handle,
+ (uint32_t *)&channel->ic_regs[IOAT_V1_CHAN_ADDR_HI],
+ (uint32_t)(ring->cr_phys_desc >> 32));
+ } else {
+ ASSERT(channel->ic_ver == IOAT_CBv2);
+ ddi_put32(state->is_reg_handle,
+ (uint32_t *)&channel->ic_regs[IOAT_V2_CHAN_ADDR_LO],
+ (uint32_t)(ring->cr_phys_desc & 0xffffffff));
+ ddi_put32(state->is_reg_handle,
+ (uint32_t *)&channel->ic_regs[IOAT_V2_CHAN_ADDR_HI],
+ (uint32_t)(ring->cr_phys_desc >> 32));
+ }
+
+ /* re-initialize the completion buffer */
+ bzero((void *)channel->ic_cmpl, channel->ic_cmpl_alloc_size);
+ /* write the phys addr into the completion address register */
+ ddi_put32(state->is_reg_handle,
+ (uint32_t *)&channel->ic_regs[IOAT_CHAN_CMPL_LO],
+ (uint32_t)(channel->ic_phys_cmpl & 0xffffffff));
+ ddi_put32(state->is_reg_handle,
+ (uint32_t *)&channel->ic_regs[IOAT_CHAN_CMPL_HI],
+ (uint32_t)(channel->ic_phys_cmpl >> 32));
+
+ /* start-up the channel */
+ ioat_channel_start(channel);
+
+ }
+
+ return (DDI_SUCCESS);
+}
+
+
+/*
+ * ioat_channel_free()
+ */
+void
+ioat_channel_free(void *channel_private)
+{
+ struct ioat_channel_s *channel;
+ ioat_channel_t *chan;
+ ioat_state_t *state;
+ uint_t chan_num;
+
+
+ chan = (ioat_channel_t *)channel_private;
+ channel = *chan;
+
+ state = channel->ic_state;
+ chan_num = channel->ic_chan_num;
+
+ /* disable the interrupts */
+ ddi_put16(state->is_reg_handle,
+ (uint16_t *)&channel->ic_regs[IOAT_CHAN_CTL], 0x0);
+
+ ioat_channel_reset(channel);
+
+ /* cleanup command cache */
+ kmem_cache_destroy(channel->ic_cmd_cache);
+
+ /* clean-up/free-up the completion space and descriptors */
+ ioat_completion_free(channel);
+ ioat_ring_free(channel);
+
+ channel->ic_inuse = B_FALSE;
+
+ /* free the H/W DMA engine */
+ ioat_rs_free(state->is_channel_rs, chan_num);
+
+ *chan = NULL;
+}
+
+
+/*
+ * ioat_channel_intr()
+ */
+void
+ioat_channel_intr(ioat_channel_t channel)
+{
+ ioat_state_t *state;
+ uint16_t chanctrl;
+ uint32_t chanerr;
+ uint32_t status;
+
+
+ state = channel->ic_state;
+
+ if (channel->ic_ver == IOAT_CBv1) {
+ status = ddi_get32(state->is_reg_handle,
+ (uint32_t *)&channel->ic_regs[IOAT_V1_CHAN_STS_LO]);
+ } else {
+ ASSERT(channel->ic_ver == IOAT_CBv2);
+ status = ddi_get32(state->is_reg_handle,
+ (uint32_t *)&channel->ic_regs[IOAT_V2_CHAN_STS_LO]);
+ }
+
+	/* if the status isn't ACTIVE or IDLE, the channel has failed */
+ if (status & IOAT_CHAN_STS_FAIL_MASK) {
+ chanerr = ddi_get32(state->is_reg_handle,
+ (uint32_t *)&channel->ic_regs[IOAT_CHAN_ERR]);
+ cmn_err(CE_WARN, "channel(%d) fatal failure! "
+ "chanstat_lo=0x%X; chanerr=0x%X\n",
+ channel->ic_chan_num, status, chanerr);
+ channel->ic_channel_state = IOAT_CHANNEL_IN_FAILURE;
+ ioat_channel_reset(channel);
+
+ return;
+ }
+
+ /*
+ * clear interrupt disable bit if set (it's a RW1C). Read it back to
+ * ensure the write completes.
+ */
+ chanctrl = ddi_get16(state->is_reg_handle,
+ (uint16_t *)&channel->ic_regs[IOAT_CHAN_CTL]);
+ ddi_put16(state->is_reg_handle,
+ (uint16_t *)&channel->ic_regs[IOAT_CHAN_CTL], chanctrl);
+ (void) ddi_get16(state->is_reg_handle,
+ (uint16_t *)&channel->ic_regs[IOAT_CHAN_CTL]);
+
+ /* tell dcopy we have seen a completion on this channel */
+ dcopy_device_channel_notify(channel->ic_dcopy_handle, DCOPY_COMPLETION);
+}
+
+
+/*
+ * ioat_channel_start()
+ */
+void
+ioat_channel_start(ioat_channel_t channel)
+{
+ ioat_chan_dma_desc_t desc;
+
+ /* set the first descriptor up as a NULL descriptor */
+ bzero(&desc, sizeof (desc));
+ desc.dd_size = 0;
+ desc.dd_ctrl = IOAT_DESC_CTRL_OP_DMA | IOAT_DESC_DMACTRL_NULL |
+ IOAT_DESC_CTRL_CMPL;
+ desc.dd_next_desc = 0x0;
+
+ /* setup the very first descriptor */
+ ioat_ring_seed(channel, &desc);
+}
+
+
+/*
+ * ioat_channel_reset()
+ */
+void
+ioat_channel_reset(ioat_channel_t channel)
+{
+ ioat_state_t *state;
+
+ state = channel->ic_state;
+
+ /* hit the reset bit */
+ if (channel->ic_ver == IOAT_CBv1) {
+ ddi_put8(state->is_reg_handle,
+ &channel->ic_regs[IOAT_V1_CHAN_CMD], 0x20);
+ } else {
+ ASSERT(channel->ic_ver == IOAT_CBv2);
+ ddi_put8(state->is_reg_handle,
+ &channel->ic_regs[IOAT_V2_CHAN_CMD], 0x20);
+ }
+}
+
+
+/*
+ * ioat_completion_alloc()
+ */
+int
+ioat_completion_alloc(ioat_channel_t channel)
+{
+ ioat_state_t *state;
+ size_t real_length;
+ uint_t cookie_cnt;
+ int e;
+
+
+ state = channel->ic_state;
+
+ /*
+ * allocate memory for the completion status, zero it out, and get
+ * the paddr. We'll allocate a physically contiguous cache line.
+ */
+ e = ddi_dma_alloc_handle(state->is_dip, &ioat_cmpl_dma_attr,
+ DDI_DMA_SLEEP, NULL, &channel->ic_cmpl_dma_handle);
+ if (e != DDI_SUCCESS) {
+ goto cmplallocfail_alloc_handle;
+ }
+ channel->ic_cmpl_alloc_size = 64;
+ e = ddi_dma_mem_alloc(channel->ic_cmpl_dma_handle,
+ channel->ic_cmpl_alloc_size, &ioat_acc_attr,
+ DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL,
+ (caddr_t *)&channel->ic_cmpl, &real_length,
+ &channel->ic_cmpl_handle);
+ if (e != DDI_SUCCESS) {
+ goto cmplallocfail_mem_alloc;
+ }
+ bzero((void *)channel->ic_cmpl, channel->ic_cmpl_alloc_size);
+ e = ddi_dma_addr_bind_handle(channel->ic_cmpl_dma_handle, NULL,
+ (caddr_t)channel->ic_cmpl, channel->ic_cmpl_alloc_size,
+ DDI_DMA_RDWR | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL,
+ &channel->ic_cmpl_cookie, &cookie_cnt);
+ if (e != DDI_SUCCESS) {
+ goto cmplallocfail_addr_bind;
+ }
+ ASSERT(cookie_cnt == 1);
+ ASSERT(channel->ic_cmpl_cookie.dmac_size ==
+ channel->ic_cmpl_alloc_size);
+ channel->ic_phys_cmpl = channel->ic_cmpl_cookie.dmac_laddress;
+
+ /* write the physical address into the completion address register */
+ ddi_put32(state->is_reg_handle,
+ (uint32_t *)&channel->ic_regs[IOAT_CHAN_CMPL_LO],
+ (uint32_t)(channel->ic_phys_cmpl & 0xffffffff));
+ ddi_put32(state->is_reg_handle,
+ (uint32_t *)&channel->ic_regs[IOAT_CHAN_CMPL_HI],
+ (uint32_t)(channel->ic_phys_cmpl >> 32));
+
+ return (DDI_SUCCESS);
+
+cmplallocfail_addr_bind:
+ ddi_dma_mem_free(&channel->ic_desc_handle);
+cmplallocfail_mem_alloc:
+ ddi_dma_free_handle(&channel->ic_desc_dma_handle);
+cmplallocfail_alloc_handle:
+ return (DDI_FAILURE);
+}
+
+
+/*
+ * ioat_completion_free()
+ *    Tear down the channel's completion area: clear the completion address
+ *    registers so the hardware stops writing status, then unbind and free
+ *    the DMA memory and handle that back the completion buffer.
+ */
+void
+ioat_completion_free(ioat_channel_t channel)
+{
+	ioat_state_t *state;
+
+	state = channel->ic_state;
+
+	/* reset the completion address register (low then high 32 bits) */
+	ddi_put32(state->is_reg_handle,
+	    (uint32_t *)&channel->ic_regs[IOAT_CHAN_CMPL_LO], 0x0);
+	ddi_put32(state->is_reg_handle,
+	    (uint32_t *)&channel->ic_regs[IOAT_CHAN_CMPL_HI], 0x0);
+
+	/* unbind, then free up the memory, dma handle (reverse alloc order) */
+	(void) ddi_dma_unbind_handle(channel->ic_cmpl_dma_handle);
+	ddi_dma_mem_free(&channel->ic_cmpl_handle);
+	ddi_dma_free_handle(&channel->ic_cmpl_dma_handle);
+}
+
+/*
+ * ioat_ring_alloc()
+ *    Allocate and initialize the channel's descriptor ring: the ring
+ *    bookkeeping structure, its mutexes, a physically contiguous DMA
+ *    buffer holding the descriptors, and the hardware chain-address
+ *    registers pointing at that buffer.  desc_cnt is the requested number
+ *    of usable descriptors.  Returns DCOPY_SUCCESS or DCOPY_FAILURE.
+ */
+int
+ioat_ring_alloc(ioat_channel_t channel, uint_t desc_cnt)
+{
+	ioat_channel_ring_t *ring;
+	ioat_state_t *state;
+	size_t real_length;
+	uint_t cookie_cnt;
+	int e;
+
+
+	state = channel->ic_state;
+
+	ring = kmem_zalloc(sizeof (ioat_channel_ring_t), KM_SLEEP);
+	channel->ic_ring = ring;
+	ring->cr_chan = channel;
+	ring->cr_post_cnt = 0;
+
+	/* mutexes must interoperate with the interrupt handler */
+	mutex_init(&ring->cr_cmpl_mutex, NULL, MUTEX_DRIVER,
+	    channel->ic_state->is_iblock_cookie);
+	mutex_init(&ring->cr_desc_mutex, NULL, MUTEX_DRIVER,
+	    channel->ic_state->is_iblock_cookie);
+
+	/*
+	 * allocate memory for the ring, zero it out, and get the paddr.
+	 * We'll allocate a physically contiguous chunck of memory which
+	 * simplifies the completion logic.
+	 */
+	e = ddi_dma_alloc_handle(state->is_dip, &ioat_desc_dma_attr,
+	    DDI_DMA_SLEEP, NULL, &channel->ic_desc_dma_handle);
+	if (e != DDI_SUCCESS) {
+		goto ringallocfail_alloc_handle;
+	}
+	/*
+	 * allocate one extra descriptor so we can simplify the empty/full
+	 * logic. Then round that number up to a whole multiple of 4.
+	 */
+	channel->ic_chan_desc_cnt = ((desc_cnt + 1) + 3) & ~0x3;
+	ring->cr_desc_last = channel->ic_chan_desc_cnt - 1;
+	channel->ic_desc_alloc_size = channel->ic_chan_desc_cnt *
+	    sizeof (ioat_chan_desc_t);
+	e = ddi_dma_mem_alloc(channel->ic_desc_dma_handle,
+	    channel->ic_desc_alloc_size, &ioat_acc_attr,
+	    DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL,
+	    (caddr_t *)&ring->cr_desc, &real_length, &channel->ic_desc_handle);
+	if (e != DDI_SUCCESS) {
+		goto ringallocfail_mem_alloc;
+	}
+	bzero(ring->cr_desc, channel->ic_desc_alloc_size);
+	e = ddi_dma_addr_bind_handle(channel->ic_desc_dma_handle, NULL,
+	    (caddr_t)ring->cr_desc, channel->ic_desc_alloc_size,
+	    DDI_DMA_RDWR | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL,
+	    &channel->ic_desc_cookies, &cookie_cnt);
+	if (e != DDI_SUCCESS) {
+		goto ringallocfail_addr_bind;
+	}
+	/* dma attrs require a single contiguous cookie covering the ring */
+	ASSERT(cookie_cnt == 1);
+	ASSERT(channel->ic_desc_cookies.dmac_size ==
+	    channel->ic_desc_alloc_size);
+	ring->cr_phys_desc = channel->ic_desc_cookies.dmac_laddress;
+
+	/* write the physical address into the chain address register */
+	if (channel->ic_ver == IOAT_CBv1) {
+		ddi_put32(state->is_reg_handle,
+		    (uint32_t *)&channel->ic_regs[IOAT_V1_CHAN_ADDR_LO],
+		    (uint32_t)(ring->cr_phys_desc & 0xffffffff));
+		ddi_put32(state->is_reg_handle,
+		    (uint32_t *)&channel->ic_regs[IOAT_V1_CHAN_ADDR_HI],
+		    (uint32_t)(ring->cr_phys_desc >> 32));
+	} else {
+		/* CBv1 and CBv2 keep this register at different offsets */
+		ASSERT(channel->ic_ver == IOAT_CBv2);
+		ddi_put32(state->is_reg_handle,
+		    (uint32_t *)&channel->ic_regs[IOAT_V2_CHAN_ADDR_LO],
+		    (uint32_t)(ring->cr_phys_desc & 0xffffffff));
+		ddi_put32(state->is_reg_handle,
+		    (uint32_t *)&channel->ic_regs[IOAT_V2_CHAN_ADDR_HI],
+		    (uint32_t)(ring->cr_phys_desc >> 32));
+	}
+
+	return (DCOPY_SUCCESS);
+
+	/* unwind in reverse order of setup */
+ringallocfail_addr_bind:
+	ddi_dma_mem_free(&channel->ic_desc_handle);
+ringallocfail_mem_alloc:
+	ddi_dma_free_handle(&channel->ic_desc_dma_handle);
+ringallocfail_alloc_handle:
+	mutex_destroy(&ring->cr_desc_mutex);
+	mutex_destroy(&ring->cr_cmpl_mutex);
+	kmem_free(channel->ic_ring, sizeof (ioat_channel_ring_t));
+
+	return (DCOPY_FAILURE);
+}
+
+
+/*
+ * ioat_ring_free()
+ *    Undo ioat_ring_alloc(): clear the hardware chain-address registers,
+ *    unbind/free the descriptor DMA resources, then destroy the ring
+ *    mutexes and free the ring bookkeeping structure.
+ */
+void
+ioat_ring_free(ioat_channel_t channel)
+{
+	ioat_state_t *state;
+
+
+	state = channel->ic_state;
+
+	/* reset the chain address register (offset differs by HW version) */
+	if (channel->ic_ver == IOAT_CBv1) {
+		ddi_put32(state->is_reg_handle,
+		    (uint32_t *)&channel->ic_regs[IOAT_V1_CHAN_ADDR_LO], 0x0);
+		ddi_put32(state->is_reg_handle,
+		    (uint32_t *)&channel->ic_regs[IOAT_V1_CHAN_ADDR_HI], 0x0);
+	} else {
+		ASSERT(channel->ic_ver == IOAT_CBv2);
+		ddi_put32(state->is_reg_handle,
+		    (uint32_t *)&channel->ic_regs[IOAT_V2_CHAN_ADDR_LO], 0x0);
+		ddi_put32(state->is_reg_handle,
+		    (uint32_t *)&channel->ic_regs[IOAT_V2_CHAN_ADDR_HI], 0x0);
+	}
+
+	/* unbind, then free up the memory, dma handle */
+	(void) ddi_dma_unbind_handle(channel->ic_desc_dma_handle);
+	ddi_dma_mem_free(&channel->ic_desc_handle);
+	ddi_dma_free_handle(&channel->ic_desc_dma_handle);
+
+	mutex_destroy(&channel->ic_ring->cr_desc_mutex);
+	mutex_destroy(&channel->ic_ring->cr_cmpl_mutex);
+	kmem_free(channel->ic_ring, sizeof (ioat_channel_ring_t));
+
+}
+
+
+/*
+ * ioat_ring_seed()
+ *    write the first descriptor in the ring and kick off the engine.
+ *    Resets the completion/descriptor tracking state, copies *in_desc into
+ *    slot 0, then starts the channel: CBv1 via the start bit in the channel
+ *    command register, CBv2 by linking slot 0 to an empty slot 1 and
+ *    writing a descriptor count of 1.
+ */
+void
+ioat_ring_seed(ioat_channel_t channel, ioat_chan_dma_desc_t *in_desc)
+{
+	ioat_channel_ring_t *ring;
+	ioat_chan_dma_desc_t *desc;
+	ioat_chan_dma_desc_t *prev;
+	ioat_state_t *state;
+
+
+	state = channel->ic_state;
+	ring = channel->ic_ring;
+
+	/* init the completion state */
+	ring->cr_cmpl_gen = 0x0;
+	ring->cr_cmpl_last = 0x0;
+
+	/* write in the descriptor and init the descriptor state */
+	ring->cr_post_cnt++;
+	channel->ic_ring->cr_desc[0] = *(ioat_chan_desc_t *)in_desc;
+	ring->cr_desc_gen = 0;
+	ring->cr_desc_prev = 0;
+	ring->cr_desc_next = 1;
+
+	if (channel->ic_ver == IOAT_CBv1) {
+		/* hit the start bit */
+		ddi_put8(state->is_reg_handle,
+		    &channel->ic_regs[IOAT_V1_CHAN_CMD], 0x1);
+	} else {
+		/*
+		 * if this is CBv2, link the descriptor to an empty
+		 * descriptor
+		 */
+		ASSERT(ring->cr_chan->ic_ver == IOAT_CBv2);
+		desc = (ioat_chan_dma_desc_t *)
+		    &ring->cr_desc[ring->cr_desc_next];
+		prev = (ioat_chan_dma_desc_t *)
+		    &ring->cr_desc[ring->cr_desc_prev];
+
+		desc->dd_ctrl = 0;
+		desc->dd_next_desc = 0x0;
+
+		/* descriptors are 64 bytes, hence the << 6 */
+		prev->dd_next_desc = ring->cr_phys_desc +
+		    (ring->cr_desc_next << 6);
+
+		/* tell the CBv2 engine one descriptor is ready */
+		ddi_put16(state->is_reg_handle,
+		    (uint16_t *)&channel->ic_regs[IOAT_V2_CHAN_CNT],
+		    (uint16_t)1);
+	}
+
+}
+
+
+/*
+ * ioat_cmd_alloc()
+ *    Allocate a dcopy command from the channel's kmem cache.  The cache
+ *    object holds the dcopy_cmd_s, its dcopy private state, and the ioat
+ *    private state back to back; the pointers are wired up here.  With
+ *    DCOPY_ALLOC_LINK, the command previously in *cmd is chained behind
+ *    the new one so a single ioat_cmd_free() releases the whole chain.
+ *    Returns DCOPY_SUCCESS or DCOPY_NORESOURCES (*cmd is NULL on failure).
+ */
+int
+ioat_cmd_alloc(void *private, int flags, dcopy_cmd_t *cmd)
+{
+	ioat_cmd_private_t *priv;
+	ioat_channel_t channel;
+	dcopy_cmd_t oldcmd;
+	int kmflag;
+
+
+	channel = (ioat_channel_t)private;
+
+	if (flags & DCOPY_NOSLEEP) {
+		kmflag = KM_NOSLEEP;
+	} else {
+		kmflag = KM_SLEEP;
+	}
+
+	/* save the command passed in, in case DCOPY_ALLOC_LINK is set */
+	oldcmd = *cmd;
+
+	*cmd = kmem_cache_alloc(channel->ic_cmd_cache, kmflag);
+	if (*cmd == NULL) {
+		return (DCOPY_NORESOURCES);
+	}
+
+	/* setup the dcopy and ioat private state pointers */
+	(*cmd)->dp_version = DCOPY_CMD_V0;
+	(*cmd)->dp_cmd = 0;
+	(*cmd)->dp_private = (struct dcopy_cmd_priv_s *)
+	    ((uintptr_t)(*cmd) + sizeof (struct dcopy_cmd_s));
+	(*cmd)->dp_private->pr_device_cmd_private =
+	    (struct ioat_cmd_private_s *)((uintptr_t)(*cmd)->dp_private +
+	    sizeof (struct dcopy_cmd_priv_s));
+
+	/*
+	 * if DCOPY_ALLOC_LINK is set, link the old command to the new one
+	 * just allocated.
+	 */
+	priv = (*cmd)->dp_private->pr_device_cmd_private;
+	if (flags & DCOPY_ALLOC_LINK) {
+		priv->ip_next = oldcmd;
+	} else {
+		priv->ip_next = NULL;
+	}
+
+	return (DCOPY_SUCCESS);
+}
+
+
+/*
+ * ioat_cmd_free()
+ *    Return a command -- and, if it was allocated with DCOPY_ALLOC_LINK,
+ *    every command chained behind it via ip_next -- to the channel's kmem
+ *    cache.  *cmdp is set to NULL before returning.
+ */
+void
+ioat_cmd_free(void *private, dcopy_cmd_t *cmdp)
+{
+	ioat_channel_t chan;
+	dcopy_cmd_t curr;
+	dcopy_cmd_t link;
+
+
+	chan = (ioat_channel_t)private;
+
+	/*
+	 * walk the chain built by DCOPY_ALLOC_LINK (see ioat_cmd_alloc()),
+	 * grabbing each command's successor before handing it back to the
+	 * cache.
+	 */
+	for (curr = *cmdp; curr != NULL; curr = link) {
+		link = curr->dp_private->pr_device_cmd_private->ip_next;
+		kmem_cache_free(chan->ic_cmd_cache, curr);
+	}
+
+	*cmdp = NULL;
+}
+
+
+/*
+ * ioat_cmd_post()
+ *    Post a DMA copy to the channel's descriptor ring.  The copy is split
+ *    into one descriptor per page since a descriptor must not cross a page
+ *    boundary.  A DCA context descriptor is prepended when supported and
+ *    requested.  Unless DCOPY_CMD_QUEUE is set, the DMA engine is notified
+ *    that new descriptors are available.  Completion tracking state is
+ *    recorded in the command's ioat private state unless DCOPY_CMD_NOSTAT.
+ *    Returns DCOPY_SUCCESS, DCOPY_NORESOURCES (ring full), or
+ *    DCOPY_FAILURE (channel in fatal-failure state).
+ *
+ *    Bugfix: the two early-return paths below used to call
+ *    mutex_exit(&ring->cr_cmpl_mutex) even though this function acquires
+ *    cr_desc_mutex -- releasing a mutex that was never taken and leaving
+ *    cr_desc_mutex held forever.  Both now release cr_desc_mutex.
+ */
+int
+ioat_cmd_post(void *private, dcopy_cmd_t cmd)
+{
+	ioat_channel_ring_t *ring;
+	ioat_cmd_private_t *priv;
+	ioat_channel_t channel;
+	ioat_state_t *state;
+	uint64_t dest_paddr;
+	uint64_t src_paddr;
+	uint64_t dest_addr;
+	uint32_t dest_size;
+	uint64_t src_addr;
+	uint32_t src_size;
+	size_t xfer_size;
+	uint32_t ctrl;
+	size_t size;
+	int e;
+
+
+	channel = (ioat_channel_t)private;
+	priv = cmd->dp_private->pr_device_cmd_private;
+
+	state = channel->ic_state;
+	ring = channel->ic_ring;
+
+	mutex_enter(&ring->cr_desc_mutex);
+
+	/* if the channel has had a fatal failure, return failure */
+	if (channel->ic_channel_state == IOAT_CHANNEL_IN_FAILURE) {
+		mutex_exit(&ring->cr_desc_mutex);
+		return (DCOPY_FAILURE);
+	}
+
+	/* make sure we have space for the descriptors */
+	e = ioat_ring_reserve(channel, ring, cmd);
+	if (e != DCOPY_SUCCESS) {
+		mutex_exit(&ring->cr_desc_mutex);
+		return (DCOPY_NORESOURCES);
+	}
+
+	/* if we support DCA, and the DCA flag is set, post a DCA desc */
+	if ((channel->ic_ver == IOAT_CBv2) &&
+	    (cmd->dp_flags & DCOPY_CMD_DCA)) {
+		ioat_cmd_post_dca(ring, cmd->dp_dca_id);
+	}
+
+	/*
+	 * the dma copy may have to be broken up into multiple descriptors
+	 * since we can't cross a page boundary.
+	 */
+	ASSERT(cmd->dp_version == DCOPY_CMD_V0);
+	ASSERT(cmd->dp_cmd == DCOPY_CMD_COPY);
+	src_addr = cmd->dp.copy.cc_source;
+	dest_addr = cmd->dp.copy.cc_dest;
+	size = cmd->dp.copy.cc_size;
+	while (size > 0) {
+		src_paddr = pa_to_ma(src_addr);
+		dest_paddr = pa_to_ma(dest_addr);
+
+		/* adjust for any offset into the page */
+		if ((src_addr & PAGEOFFSET) == 0) {
+			src_size = PAGESIZE;
+		} else {
+			src_size = PAGESIZE - (src_addr & PAGEOFFSET);
+		}
+		if ((dest_addr & PAGEOFFSET) == 0) {
+			dest_size = PAGESIZE;
+		} else {
+			dest_size = PAGESIZE - (dest_addr & PAGEOFFSET);
+		}
+
+		/* take the smallest of the three */
+		xfer_size = MIN(src_size, dest_size);
+		xfer_size = MIN(xfer_size, size);
+
+		/*
+		 * if this is the last descriptor, and we are supposed to
+		 * generate a completion, generate a completion. same logic
+		 * for interrupt.
+		 */
+		ctrl = 0;
+		if (xfer_size == size) {
+			if (!(cmd->dp_flags & DCOPY_CMD_NOSTAT)) {
+				ctrl |= IOAT_DESC_CTRL_CMPL;
+			}
+			if ((cmd->dp_flags & DCOPY_CMD_INTR)) {
+				ctrl |= IOAT_DESC_CTRL_INTR;
+			}
+		}
+
+		ioat_cmd_post_copy(ring, src_paddr, dest_paddr, xfer_size,
+		    ctrl);
+
+		/* go to the next page */
+		src_addr += xfer_size;
+		dest_addr += xfer_size;
+		size -= xfer_size;
+	}
+
+	/*
+	 * if we are going to create a completion, save away the state so we
+	 * can poll on it.
+	 */
+	if (!(cmd->dp_flags & DCOPY_CMD_NOSTAT)) {
+		priv->ip_generation = ring->cr_desc_gen_prev;
+		priv->ip_index = ring->cr_desc_prev;
+	}
+
+	/* if queue not defined, tell the DMA engine about it */
+	if (!(cmd->dp_flags & DCOPY_CMD_QUEUE)) {
+		if (channel->ic_ver == IOAT_CBv1) {
+			/* CBv1: append command */
+			ddi_put8(state->is_reg_handle,
+			    (uint8_t *)&channel->ic_regs[IOAT_V1_CHAN_CMD],
+			    0x2);
+		} else {
+			/* CBv2: write the cumulative descriptor count */
+			ASSERT(channel->ic_ver == IOAT_CBv2);
+			ddi_put16(state->is_reg_handle,
+			    (uint16_t *)&channel->ic_regs[IOAT_V2_CHAN_CNT],
+			    (uint16_t)(ring->cr_post_cnt & 0xFFFF));
+		}
+	}
+
+	mutex_exit(&ring->cr_desc_mutex);
+
+	return (DCOPY_SUCCESS);
+}
+
+
+/*
+ * ioat_cmd_post_dca()
+ *    Post a DCA (Direct Cache Access) context descriptor into the ring.
+ *    Caller must hold cr_desc_mutex (called from ioat_cmd_post()).
+ */
+static void
+ioat_cmd_post_dca(ioat_channel_ring_t *ring, uint32_t dca_id)
+{
+	ioat_chan_dca_desc_t *desc;
+	ioat_chan_dca_desc_t *prev;
+	ioat_channel_t channel;
+
+
+	channel = ring->cr_chan;
+	desc = (ioat_chan_dca_desc_t *)&ring->cr_desc[ring->cr_desc_next];
+	prev = (ioat_chan_dca_desc_t *)&ring->cr_desc[ring->cr_desc_prev];
+
+	/* keep track of the number of descs posted for cbv2 */
+	ring->cr_post_cnt++;
+
+	/*
+	 * post a context change descriptor. If dca has never been used on
+	 * this channel, or if the id doesn't match the last id used on this
+	 * channel, set CONTEXT_CHANGE bit and dca id, set dca state to active,
+	 * and save away the id we're using.
+	 */
+	desc->dd_ctrl = IOAT_DESC_CTRL_OP_CNTX;
+	desc->dd_next_desc = 0x0;
+	if (!channel->ic_dca_active || (channel->ic_dca_current != dca_id)) {
+		channel->ic_dca_active = B_TRUE;
+		channel->ic_dca_current = dca_id;
+		desc->dd_ctrl |= IOAT_DESC_CTRL_CNTX_CHNG;
+		desc->dd_cntx = dca_id;
+	}
+
+	/*
+	 * Put the descriptors physical address in the previous descriptor.
+	 * The << 6 offsets below rely on descriptors being exactly 64 bytes.
+	 */
+	/*LINTED:E_TRUE_LOGICAL_EXPR*/
+	ASSERT(sizeof (ioat_chan_dca_desc_t) == 64);
+
+	/* sync the current desc */
+	(void) ddi_dma_sync(channel->ic_desc_dma_handle,
+	    ring->cr_desc_next << 6, 64, DDI_DMA_SYNC_FORDEV);
+
+	/* update the previous desc and sync it too */
+	prev->dd_next_desc = ring->cr_phys_desc +
+	    (ring->cr_desc_next << 6);
+	(void) ddi_dma_sync(channel->ic_desc_dma_handle,
+	    ring->cr_desc_prev << 6, 64, DDI_DMA_SYNC_FORDEV);
+
+	/* save the current desc_next and gen for the completion */
+	ring->cr_desc_prev = ring->cr_desc_next;
+	ring->cr_desc_gen_prev = ring->cr_desc_gen;
+
+	/* increment next/gen so it points to the next free desc */
+	ring->cr_desc_next++;
+	if (ring->cr_desc_next > ring->cr_desc_last) {
+		ring->cr_desc_next = 0;
+		ring->cr_desc_gen++;
+	}
+
+	/*
+	 * if this is CBv2, link the descriptor to an empty descriptor. Since
+	 * we always leave one desc empty to detect full, this works out.
+	 */
+	if (ring->cr_chan->ic_ver == IOAT_CBv2) {
+		desc = (ioat_chan_dca_desc_t *)
+		    &ring->cr_desc[ring->cr_desc_next];
+		prev = (ioat_chan_dca_desc_t *)
+		    &ring->cr_desc[ring->cr_desc_prev];
+		desc->dd_ctrl = 0;
+		desc->dd_next_desc = 0x0;
+
+		prev->dd_next_desc = ring->cr_phys_desc +
+		    (ring->cr_desc_next << 6);
+	}
+}
+
+
+/*
+ * ioat_cmd_post_copy()
+ *    Write one DMA copy descriptor (src_addr -> dest_addr, size bytes,
+ *    with ctrl flags) into the next free ring slot and link it into the
+ *    hardware chain.  Caller must hold cr_desc_mutex.
+ */
+static void
+ioat_cmd_post_copy(ioat_channel_ring_t *ring, uint64_t src_addr,
+    uint64_t dest_addr, uint32_t size, uint32_t ctrl)
+{
+	ioat_chan_dma_desc_t *desc;
+	ioat_chan_dma_desc_t *prev;
+	ioat_channel_t channel;
+
+
+	channel = ring->cr_chan;
+	desc = (ioat_chan_dma_desc_t *)&ring->cr_desc[ring->cr_desc_next];
+	prev = (ioat_chan_dma_desc_t *)&ring->cr_desc[ring->cr_desc_prev];
+
+	/* keep track of the number of descs posted for cbv2 */
+	ring->cr_post_cnt++;
+
+	/* write in the DMA desc */
+	desc->dd_ctrl = IOAT_DESC_CTRL_OP_DMA | ctrl;
+	desc->dd_size = size;
+	desc->dd_src_paddr = src_addr;
+	desc->dd_dest_paddr = dest_addr;
+	desc->dd_next_desc = 0x0;
+
+	/*
+	 * Put the descriptors physical address in the previous descriptor.
+	 * The << 6 offsets below rely on descriptors being exactly 64 bytes.
+	 */
+	/*LINTED:E_TRUE_LOGICAL_EXPR*/
+	ASSERT(sizeof (ioat_chan_dma_desc_t) == 64);
+
+	/* sync the current desc */
+	(void) ddi_dma_sync(channel->ic_desc_dma_handle,
+	    ring->cr_desc_next << 6, 64, DDI_DMA_SYNC_FORDEV);
+
+	/* update the previous desc and sync it too */
+	prev->dd_next_desc = ring->cr_phys_desc +
+	    (ring->cr_desc_next << 6);
+	(void) ddi_dma_sync(channel->ic_desc_dma_handle,
+	    ring->cr_desc_prev << 6, 64, DDI_DMA_SYNC_FORDEV);
+
+	/* save the current desc_next and gen for the completion */
+	ring->cr_desc_prev = ring->cr_desc_next;
+	ring->cr_desc_gen_prev = ring->cr_desc_gen;
+
+	/* increment next/gen so it points to the next free desc */
+	ring->cr_desc_next++;
+	if (ring->cr_desc_next > ring->cr_desc_last) {
+		ring->cr_desc_next = 0;
+		ring->cr_desc_gen++;
+	}
+
+	/*
+	 * if this is CBv2, link the descriptor to an empty descriptor. Since
+	 * we always leave one desc empty to detect full, this works out.
+	 */
+	if (ring->cr_chan->ic_ver == IOAT_CBv2) {
+		desc = (ioat_chan_dma_desc_t *)
+		    &ring->cr_desc[ring->cr_desc_next];
+		prev = (ioat_chan_dma_desc_t *)
+		    &ring->cr_desc[ring->cr_desc_prev];
+		desc->dd_size = 0;
+		desc->dd_ctrl = 0;
+		desc->dd_next_desc = 0x0;
+
+		prev->dd_next_desc = ring->cr_phys_desc +
+		    (ring->cr_desc_next << 6);
+	}
+}
+
+
+/*
+ * ioat_cmd_poll()
+ *    Check whether a previously posted command has completed.  Also usable
+ *    with cmd == NULL purely to advance the software view of the hardware
+ *    completion pointer (see ioat_ring_reserve()).
+ *    Returns DCOPY_COMPLETED, DCOPY_PENDING, or DCOPY_FAILURE.
+ */
+int
+ioat_cmd_poll(void *private, dcopy_cmd_t cmd)
+{
+	ioat_channel_ring_t *ring;
+	ioat_cmd_private_t *priv;
+	ioat_channel_t channel;
+	uint64_t generation;
+	uint64_t last_cmpl;
+
+
+	channel = (ioat_channel_t)private;
+	priv = cmd->dp_private->pr_device_cmd_private;
+
+	ring = channel->ic_ring;
+	ASSERT(ring != NULL);
+
+	mutex_enter(&ring->cr_cmpl_mutex);
+
+	/* if the channel had a fatal failure, fail all polls */
+	if ((channel->ic_channel_state == IOAT_CHANNEL_IN_FAILURE) ||
+	    IOAT_CMPL_FAILED(channel)) {
+		mutex_exit(&ring->cr_cmpl_mutex);
+		return (DCOPY_FAILURE);
+	}
+
+	/*
+	 * if the current completion is the same as the last time we read one,
+	 * post is still pending, nothing further to do. We track completions
+	 * as indexes into the ring since post uses VAs and the H/W returns
+	 * PAs. We grab a snapshot of generation and last_cmpl in the mutex.
+	 */
+	(void) ddi_dma_sync(channel->ic_cmpl_dma_handle, 0, 0,
+	    DDI_DMA_SYNC_FORCPU);
+	last_cmpl = IOAT_CMPL_INDEX(channel);
+	if (last_cmpl != ring->cr_cmpl_last) {
+		/*
+		 * if we wrapped the ring, increment the generation. Store
+		 * the last cmpl. This logic assumes a physically contiguous
+		 * ring.
+		 */
+		if (last_cmpl < ring->cr_cmpl_last) {
+			ring->cr_cmpl_gen++;
+		}
+		ring->cr_cmpl_last = last_cmpl;
+		generation = ring->cr_cmpl_gen;
+
+	} else {
+		/* no progress; still need the generation snapshot below */
+		generation = ring->cr_cmpl_gen;
+	}
+
+	mutex_exit(&ring->cr_cmpl_mutex);
+
+	/*
+	 * if cmd isn't passed in, we'll return. Useful for updating the
+	 * consumer pointer (ring->cr_cmpl_last).
+	 */
+	if (cmd == NULL) {
+		return (DCOPY_PENDING);
+	}
+
+	/*
+	 * if the post's generation is old, this post has completed. No reason
+	 * to go check the last completion. if the generation is the same
+	 * and if the post is before or = to the last completion processed,
+	 * the post has completed.
+	 */
+	if (priv->ip_generation < generation) {
+		return (DCOPY_COMPLETED);
+	} else if ((priv->ip_generation == generation) &&
+	    (priv->ip_index <= last_cmpl)) {
+		return (DCOPY_COMPLETED);
+	}
+
+	return (DCOPY_PENDING);
+}
+
+
+/*
+ * ioat_ring_reserve()
+ *    Verify the ring has room for all descriptors this command will need
+ *    (an optional DCA descriptor plus one DMA descriptor per page of the
+ *    copy).  Walks the slots the command would consume and, when a slot
+ *    looks busy, re-polls the hardware completion pointer once before
+ *    declaring the ring full.  Caller must hold cr_desc_mutex.
+ *    Returns DCOPY_SUCCESS or DCOPY_NORESOURCES.
+ */
+int
+ioat_ring_reserve(ioat_channel_t channel, ioat_channel_ring_t *ring,
+    dcopy_cmd_t cmd)
+{
+	uint64_t dest_addr;
+	uint32_t dest_size;
+	uint64_t src_addr;
+	uint32_t src_size;
+	size_t xfer_size;
+	uint64_t desc;
+	int num_desc;
+	size_t size;
+	int i;
+
+
+	/*
+	 * figure out how many descriptors we need. This can include a dca
+	 * desc and multiple desc for a dma copy.
+	 */
+	num_desc = 0;
+	if ((channel->ic_ver == IOAT_CBv2) &&
+	    (cmd->dp_flags & DCOPY_CMD_DCA)) {
+		num_desc++;
+	}
+	src_addr = cmd->dp.copy.cc_source;
+	dest_addr = cmd->dp.copy.cc_dest;
+	size = cmd->dp.copy.cc_size;
+	/* mirrors the page-split loop in ioat_cmd_post(); keep in sync */
+	while (size > 0) {
+		num_desc++;
+
+		/* adjust for any offset into the page */
+		if ((src_addr & PAGEOFFSET) == 0) {
+			src_size = PAGESIZE;
+		} else {
+			src_size = PAGESIZE - (src_addr & PAGEOFFSET);
+		}
+		if ((dest_addr & PAGEOFFSET) == 0) {
+			dest_size = PAGESIZE;
+		} else {
+			dest_size = PAGESIZE - (dest_addr & PAGEOFFSET);
+		}
+
+		/* take the smallest of the three */
+		xfer_size = MIN(src_size, dest_size);
+		xfer_size = MIN(xfer_size, size);
+
+		/* go to the next page */
+		src_addr += xfer_size;
+		dest_addr += xfer_size;
+		size -= xfer_size;
+	}
+
+	/* Make sure we have space for these descriptors */
+	desc = ring->cr_desc_next;
+	for (i = 0; i < num_desc; i++) {
+
+		/*
+		 * if this is the last descriptor in the ring, see if the
+		 * last completed descriptor is #0.
+		 */
+		if (desc == ring->cr_desc_last) {
+			if (ring->cr_cmpl_last == 0) {
+				/*
+				 * if we think the ring is full, update where
+				 * the H/W really is and check for full again.
+				 */
+				(void) ioat_cmd_poll(channel, NULL);
+				if (ring->cr_cmpl_last == 0) {
+					return (DCOPY_NORESOURCES);
+				}
+			}
+
+			/*
+			 * go to the next descriptor which is zero in this
+			 * case.
+			 */
+			desc = 0;
+
+			/*
+			 * if this is not the last descriptor in the ring, see if
+			 * the last completion we saw was the next descriptor.
+			 */
+		} else {
+			if ((desc + 1) == ring->cr_cmpl_last) {
+				/*
+				 * if we think the ring is full, update where
+				 * the H/W really is and check for full again.
+				 */
+				(void) ioat_cmd_poll(channel, NULL);
+				if ((desc + 1) == ring->cr_cmpl_last) {
+					return (DCOPY_NORESOURCES);
+				}
+			}
+
+			/* go to the next descriptor */
+			desc++;
+		}
+	}
+
+	return (DCOPY_SUCCESS);
+}
diff --git a/usr/src/uts/i86pc/io/ioat/ioat_ioctl.c b/usr/src/uts/i86pc/io/ioat/ioat_ioctl.c
new file mode 100644
index 0000000000..70640dac4f
--- /dev/null
+++ b/usr/src/uts/i86pc/io/ioat/ioat_ioctl.c
@@ -0,0 +1,343 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/errno.h>
+#include <sys/types.h>
+#include <sys/conf.h>
+#include <sys/kmem.h>
+#include <sys/ddi.h>
+#include <sys/stat.h>
+#include <sys/sunddi.h>
+#include <sys/file.h>
+#include <sys/open.h>
+#include <sys/modctl.h>
+#include <sys/ddi_impldefs.h>
+#include <sys/sysmacros.h>
+
+#include <vm/hat.h>
+#include <vm/as.h>
+
+#include <sys/ioat.h>
+
+
+extern void *ioat_statep;
+#define ptob64(x) (((uint64_t)(x)) << PAGESHIFT)
+
+static int ioat_ioctl_rdreg(ioat_state_t *state, void *arg, int mode);
+#ifdef DEBUG
+static int ioat_ioctl_wrreg(ioat_state_t *state, void *arg, int mode);
+static int ioat_ioctl_test(ioat_state_t *state, void *arg, int mode);
+#endif
+
+/*
+ * ioat_ioctl()
+ *    Top-level ioctl entry point for the ioat driver.  Requires driver
+ *    privilege (drv_priv()) and a valid soft-state instance; dispatches on
+ *    cmd.  The write-register and self-test ioctls exist only in DEBUG
+ *    kernels.
+ */
+/*ARGSUSED*/
+int
+ioat_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred, int *rval)
+{
+	ioat_state_t *state;
+	int instance;
+	int e;
+
+
+	/* all of the ioat ioctls are privileged */
+	if (drv_priv(cred) != 0) {
+		return (EPERM);
+	}
+
+	/* map the minor number to our per-instance soft state */
+	instance = getminor(dev);
+	if (instance == -1) {
+		return (EBADF);
+	}
+	state = ddi_get_soft_state(ioat_statep, instance);
+	if (state == NULL) {
+		return (EBADF);
+	}
+
+	switch (cmd) {
+	case IOAT_IOCTL_READ_REG:
+		e = ioat_ioctl_rdreg(state, (void *)arg, mode);
+		break;
+#ifdef DEBUG
+	case IOAT_IOCTL_WRITE_REG:
+		e = ioat_ioctl_wrreg(state, (void *)arg, mode);
+		break;
+	case IOAT_IOCTL_TEST:
+		e = ioat_ioctl_test(state, (void *)arg, mode);
+		break;
+#endif
+	default:
+		e = ENXIO;
+		break;
+	}
+
+	return (e);
+}
+
+
+/*
+ * ioat_ioctl_rdreg()
+ *    Handler for IOAT_IOCTL_READ_REG: copy in an ioat_ioctl_rdreg_t,
+ *    read the requested MMIO register, and copy the result back out.
+ *    Returns 0, or EFAULT on copyin/copyout failure or an unsupported
+ *    access size.
+ */
+static int
+ioat_ioctl_rdreg(ioat_state_t *state, void *arg, int mode)
+{
+	ioat_ioctl_rdreg_t rdreg;
+	int e;
+
+
+	e = ddi_copyin(arg, &rdreg, sizeof (ioat_ioctl_rdreg_t), mode);
+	if (e != 0) {
+		return (EFAULT);
+	}
+
+	/*
+	 * read a device register, where size is read size in bits, addr is
+	 * the offset into MMIO registers.  Note: rdreg.addr is not range
+	 * checked here; this ioctl is root-only (drv_priv() in ioat_ioctl()).
+	 */
+	switch (rdreg.size) {
+	case 8:
+		rdreg.data = (uint64_t)ddi_get8(state->is_reg_handle,
+		    (uint8_t *)&state->is_genregs[rdreg.addr]);
+		break;
+	case 16:
+		rdreg.data = (uint64_t)ddi_get16(state->is_reg_handle,
+		    (uint16_t *)&state->is_genregs[rdreg.addr]);
+		break;
+	case 32:
+		rdreg.data = (uint64_t)ddi_get32(state->is_reg_handle,
+		    (uint32_t *)&state->is_genregs[rdreg.addr]);
+		break;
+	case 64:
+		rdreg.data = (uint64_t)ddi_get64(state->is_reg_handle,
+		    (uint64_t *)&state->is_genregs[rdreg.addr]);
+		break;
+	default:
+		/* unsupported width */
+		return (EFAULT);
+	}
+
+	e = ddi_copyout(&rdreg, arg, sizeof (ioat_ioctl_rdreg_t), mode);
+	if (e != 0) {
+		return (EFAULT);
+	}
+
+	return (0);
+}
+
+
+#ifdef DEBUG
+/*
+ * ioat_ioctl_wrreg()
+ *    Handler for IOAT_IOCTL_WRITE_REG (DEBUG kernels only): copy in an
+ *    ioat_ioctl_wrreg_t and write the value to the requested MMIO register.
+ *    Returns 0, or EFAULT on copyin failure or an unsupported access size.
+ */
+static int
+ioat_ioctl_wrreg(ioat_state_t *state, void *arg, int mode)
+{
+	ioat_ioctl_wrreg_t wrreg;
+	int e;
+
+
+	e = ddi_copyin(arg, &wrreg, sizeof (ioat_ioctl_wrreg_t), mode);
+	if (e != 0) {
+		return (EFAULT);
+	}
+
+	/*
+	 * write a device register, where size is write size in bits, addr is
+	 * the offset into MMIO registers.  As with rdreg, addr is trusted;
+	 * this ioctl is root-only and DEBUG-only.
+	 */
+	switch (wrreg.size) {
+	case 8:
+		ddi_put8(state->is_reg_handle,
+		    (uint8_t *)&state->is_genregs[wrreg.addr],
+		    (uint8_t)wrreg.data);
+		break;
+	case 16:
+		ddi_put16(state->is_reg_handle,
+		    (uint16_t *)&state->is_genregs[wrreg.addr],
+		    (uint16_t)wrreg.data);
+		break;
+	case 32:
+		ddi_put32(state->is_reg_handle,
+		    (uint32_t *)&state->is_genregs[wrreg.addr],
+		    (uint32_t)wrreg.data);
+		break;
+	case 64:
+		ddi_put64(state->is_reg_handle,
+		    (uint64_t *)&state->is_genregs[wrreg.addr],
+		    (uint64_t)wrreg.data);
+		break;
+	default:
+		/* unsupported width */
+		return (EFAULT);
+	}
+
+	return (0);
+}
+
+
+/*
+ * ioat_ioctl_test()
+ *    DEBUG-only self test: allocate a dcopy channel, post 32 linked
+ *    dest-to-dest copies followed by one source-to-dest copy, poll the
+ *    last command to completion, and verify the destination data matches
+ *    the source pattern.  Returns 0 on success, -1 on any failure.
+ *
+ *    Bugfixes over the original error handling:
+ *    - a dcopy_cmd_alloc() failure after dcopy_alloc() succeeded used to
+ *      jump to testfail_alloc, leaking the channel and any commands already
+ *      allocated (dcopy_cmd_alloc() NULLs *cmd on failure, which would have
+ *      orphaned the chain head -- we now save it in prevcmd first);
+ *    - a data-compare failure returned -1 without freeing the test buffer;
+ *    - the dead testfail_data_compare label is gone.
+ */
+/*ARGSUSED*/
+static int
+ioat_ioctl_test(ioat_state_t *state, void *arg, int mode)
+{
+	dcopy_handle_t channel;
+	dcopy_cmd_t prevcmd;
+	dcopy_cmd_t cmd;
+	uint8_t *source;
+	uint_t buf_size;
+	uint_t poll_cnt;
+	uint8_t *dest;
+	uint8_t *buf;
+	int flags;
+	int i;
+	int e;
+
+
+	/* allocate 2 paged aligned 4k pages */
+	buf_size = 0x1000;
+	buf = kmem_zalloc((buf_size * 2) + 0x1000, KM_SLEEP);
+	source = (uint8_t *)(((uintptr_t)buf + PAGEOFFSET) & PAGEMASK);
+	dest = source + buf_size;
+
+	/* Init source buffer with a recognizable byte pattern */
+	for (i = 0; i < buf_size; i++) {
+		source[i] = (uint8_t)(i & 0xFF);
+	}
+
+	/* allocate a DMA channel */
+	e = dcopy_alloc(DCOPY_SLEEP, &channel);
+	if (e != DCOPY_SUCCESS) {
+		cmn_err(CE_CONT, "dcopy_alloc() failed\n");
+		goto testfail_alloc;
+	}
+
+	/*
+	 * post 32 DMA copy's from dest to dest. These will complete in order
+	 * so they won't stomp on each other. We don't care about the data
+	 * right now which is why we go dest to dest.
+	 */
+	prevcmd = NULL;
+	flags = DCOPY_SLEEP;
+	for (i = 0; i < 32; i++) {
+		/*
+		 * if this is the second command, link the commands from here
+		 * on out. We only want to keep track of the last command. We
+		 * will poll on the last command completing (which infers that
+		 * the other commands completed). If any of the previous
+		 * commands fail, so will the last one. Linking the commands
+		 * also allows us to only call free for the last command. free
+		 * will free up the entire chain of commands.
+		 */
+		if (i == 1) {
+			flags |= DCOPY_ALLOC_LINK;
+			prevcmd = cmd;
+		} else if (i > 1) {
+			prevcmd = cmd;
+		}
+		e = dcopy_cmd_alloc(channel, flags, &cmd);
+		if (e != DCOPY_SUCCESS) {
+			cmn_err(CE_CONT, "dcopy_cmd_alloc() failed\n");
+			/* restore the chain head so prior cmds get freed */
+			cmd = prevcmd;
+			goto testfail_cmd_alloc;
+		}
+
+		ASSERT(cmd->dp_version == DCOPY_CMD_V0);
+		cmd->dp_cmd = DCOPY_CMD_COPY;
+		cmd->dp_flags = DCOPY_CMD_NOFLAGS;
+
+		/* do a bunch of dest to dest DMA's */
+		cmd->dp.copy.cc_source = ptob64(hat_getpfnum(kas.a_hat,
+		    (caddr_t)source)) + ((uintptr_t)dest & PAGEOFFSET);
+		cmd->dp.copy.cc_dest = ptob64(hat_getpfnum(kas.a_hat,
+		    (caddr_t)dest)) + ((uintptr_t)dest & PAGEOFFSET);
+		cmd->dp.copy.cc_size = PAGESIZE;
+
+		e = dcopy_cmd_post(cmd);
+		if (e != DCOPY_SUCCESS) {
+			cmn_err(CE_CONT, "dcopy_post() failed\n");
+			goto testfail_post;
+		}
+	}
+
+	prevcmd = cmd;
+	e = dcopy_cmd_alloc(channel, flags, &cmd);
+	if (e != DCOPY_SUCCESS) {
+		cmn_err(CE_CONT, "dcopy_cmd_alloc() failed\n");
+		cmd = prevcmd;
+		goto testfail_cmd_alloc;
+	}
+
+	/* now queue up the DMA we are going to check status and data for */
+	cmd->dp_cmd = DCOPY_CMD_COPY;
+	cmd->dp_flags = DCOPY_CMD_INTR;
+	cmd->dp.copy.cc_source = ptob64(hat_getpfnum(kas.a_hat,
+	    (caddr_t)source)) + ((uintptr_t)source & PAGEOFFSET);
+	cmd->dp.copy.cc_dest = ptob64(hat_getpfnum(kas.a_hat,
+	    (caddr_t)dest)) + ((uintptr_t)dest & PAGEOFFSET);
+	cmd->dp.copy.cc_size = PAGESIZE;
+	e = dcopy_cmd_post(cmd);
+	if (e != DCOPY_SUCCESS) {
+		cmn_err(CE_CONT, "dcopy_post() failed\n");
+		goto testfail_post;
+	}
+
+	/* check the status of the last command; spin then block */
+	poll_cnt = 0;
+	flags = DCOPY_POLL_NOFLAGS;
+	while ((e = dcopy_cmd_poll(cmd, flags)) == DCOPY_PENDING) {
+		poll_cnt++;
+		if (poll_cnt >= 16) {
+			flags |= DCOPY_POLL_BLOCK;
+		}
+	}
+	if (e != DCOPY_COMPLETED) {
+		cmn_err(CE_CONT, "dcopy_poll() failed\n");
+		goto testfail_post;
+	}
+
+	/* since the cmd's are linked we only need to pass in the last cmd */
+	dcopy_cmd_free(&cmd);
+	dcopy_free(&channel);
+
+	/* verify the data */
+	for (i = 0; i < PAGESIZE; i++) {
+		if (dest[i] != (uint8_t)(i & 0xFF)) {
+			cmn_err(CE_CONT,
+			    "dcopy_data_compare() failed, %p[%d]: %x, %x\n",
+			    (void *)dest, i, dest[i], i & 0xFF);
+			/* free the test buffer before bailing */
+			kmem_free(buf, (buf_size * 2) + 0x1000);
+			return (-1);
+		}
+	}
+
+	kmem_free(buf, (buf_size * 2) + 0x1000);
+
+	return (0);
+
+testfail_post:
+testfail_cmd_alloc:
+	/* free the entire chain of commands (NULL-safe cmd on i == 0) */
+	if (cmd != NULL) {
+		dcopy_cmd_free(&cmd);
+	}
+	dcopy_free(&channel);
+testfail_alloc:
+	kmem_free(buf, (buf_size * 2) + 0x1000);
+
+	return (-1);
+}
+#endif
diff --git a/usr/src/uts/i86pc/io/ioat/ioat_rs.c b/usr/src/uts/i86pc/io/ioat/ioat_rs.c
new file mode 100644
index 0000000000..6d12798fda
--- /dev/null
+++ b/usr/src/uts/i86pc/io/ioat/ioat_rs.c
@@ -0,0 +1,246 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/kmem.h>
+#include <sys/types.h>
+#include <sys/conf.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+
+#include <sys/ioat.h>
+
+
+/* structure used to keep track of resources (round-robin bitmap allocator) */
+typedef struct ioat_rs_s {
+	/*
+	 * Bounds of resource allocation. We will start allocating at rs_min
+	 * and rollover at rs_max+1 (rs_max is included). e.g. for rs_min=0
+	 * and rs_max=7, we will have 8 total resources which can be alloced.
+	 */
+	uint_t rs_min;
+	uint_t rs_max;
+
+	/*
+	 * rs_free points to an array of 64-bit values used to track resource
+	 * allocation (bit set = resource free). rs_free_size is the free
+	 * buffer size in bytes.
+	 */
+	uint64_t *rs_free;
+	uint_t rs_free_size;
+
+	/*
+	 * rs_last tracks the last alloc'd resource. This allows us to do a
+	 * round robin allocation.
+	 */
+	uint_t rs_last;
+
+	kmutex_t rs_mutex;
+} ioat_rs_t;
+
+
+/*
+ * ioat_rs_init()
+ *    Initialize the resource structure. This structure will be protected
+ *    by a mutex at the iblock_cookie passed in. init() returns a handle to
+ *    be used for the rest of the resource functions. This code is written
+ *    assuming that min_val will be close to 0. Therefore, we will allocate
+ *    the free buffer only taking max_val into account.
+ */
+void
+ioat_rs_init(ioat_state_t *state, uint_t min_val, uint_t max_val,
+    ioat_rs_hdl_t *handle)
+{
+	ioat_rs_t *rstruct;
+	uint_t array_size;
+	uint_t index;
+
+
+	ASSERT(handle != NULL);
+	ASSERT(min_val < max_val);
+
+	/* alloc space for resource structure */
+	rstruct = kmem_alloc(sizeof (ioat_rs_t), KM_SLEEP);
+
+	/*
+	 * Resources run min_val..max_val inclusive, so the bitmap needs
+	 * max_val+1 bits, i.e. ceil((max_val + 1) / 64) 64-bit words, which
+	 * is always (max_val >> 6) + 1.  (8 bytes per uint64_t.)
+	 *
+	 * Bugfix: the previous "64-bit aligned" special case allocated one
+	 * word too few whenever max_val was a multiple of 64, which would
+	 * have let ioat_rs_alloc()/ioat_rs_free() index one word past the
+	 * end of rs_free for resource number max_val.
+	 */
+	rstruct->rs_free_size = ((max_val >> 6) + 1) * 8;
+	rstruct->rs_free = kmem_alloc(rstruct->rs_free_size, KM_SLEEP);
+
+	/* Initialize resource structure */
+	rstruct->rs_min = min_val;
+	rstruct->rs_last = min_val;
+	rstruct->rs_max = max_val;
+	mutex_init(&rstruct->rs_mutex, NULL, MUTEX_DRIVER,
+	    state->is_iblock_cookie);
+
+	/* Mark all resources as free (bit set = free) */
+	array_size = rstruct->rs_free_size >> 3;
+	for (index = 0; index < array_size; index++) {
+		rstruct->rs_free[index] = (uint64_t)0xFFFFFFFFFFFFFFFF;
+	}
+
+	/* setup handle which is returned from this function */
+	*handle = rstruct;
+}
+
+
+/*
+ * ioat_rs_fini()
+ *    Tear down a resource structure created by ioat_rs_init(), releasing
+ *    the free bitmap, the mutex, and the structure itself.  Takes a pointer
+ *    to the handle and NULLs the caller's handle out so stale uses are
+ *    easier to catch.
+ */
+void
+ioat_rs_fini(ioat_rs_hdl_t *handle)
+{
+	ioat_rs_t *rs;
+
+
+	ASSERT(handle != NULL);
+
+	rs = (ioat_rs_t *)*handle;
+
+	/* clear the caller's handle first; helps catch use-after-fini bugs */
+	*handle = NULL;
+
+	mutex_destroy(&rs->rs_mutex);
+	kmem_free(rs->rs_free, rs->rs_free_size);
+	kmem_free(rs, sizeof (ioat_rs_t));
+}
+
+
+/*
+ * ioat_rs_alloc()
+ *    alloc a resource. If alloc fails, we are out of resources.
+ *    Round-robin scan of the free bitmap starting at rs_last; on success
+ *    *resource holds the allocated resource number and DDI_SUCCESS is
+ *    returned, otherwise DDI_FAILURE.
+ */
+int
+ioat_rs_alloc(ioat_rs_hdl_t handle, uint_t *resource)
+{
+	ioat_rs_t *rstruct;
+	uint_t array_idx;
+	uint64_t free;
+	uint_t index;
+	uint_t last;
+	uint_t min;
+	uint_t max;
+
+
+	ASSERT(handle != NULL);
+	ASSERT(resource != NULL);
+
+	rstruct = (ioat_rs_t *)handle;
+
+	mutex_enter(&rstruct->rs_mutex);
+	min = rstruct->rs_min;
+	max = rstruct->rs_max;
+
+	/*
+	 * Find a free resource. This will return out of the loop once it finds
+	 * a free resource. There are a total of 'max'-'min'+1 resources.
+	 * Performs a round robin allocation.
+	 */
+	for (index = min; index <= max; index++) {
+
+		/* each rs_free word tracks 64 resources; split the index */
+		array_idx = rstruct->rs_last >> 6;
+		free = rstruct->rs_free[array_idx];
+		last = rstruct->rs_last & 0x3F;
+
+		/* if the next resource to check is free (bit set = free) */
+		if ((free & ((uint64_t)1 << last)) != 0) {
+			/* we are using this resource */
+			*resource = rstruct->rs_last;
+
+			/* take it out of the free list */
+			rstruct->rs_free[array_idx] &= ~((uint64_t)1 << last);
+
+			/*
+			 * increment the last count so we start checking the
+			 * next resource on the next alloc(). Note the rollover
+			 * at 'max'+1.
+			 */
+			rstruct->rs_last++;
+			if (rstruct->rs_last > max) {
+				rstruct->rs_last = rstruct->rs_min;
+			}
+
+			/* unlock the resource structure */
+			mutex_exit(&rstruct->rs_mutex);
+
+			return (DDI_SUCCESS);
+		}
+
+		/*
+		 * This resource is not free, lets go to the next one. Note the
+		 * rollover at 'max'+1.
+		 */
+		rstruct->rs_last++;
+		if (rstruct->rs_last > max) {
+			rstruct->rs_last = rstruct->rs_min;
+		}
+	}
+
+	mutex_exit(&rstruct->rs_mutex);
+
+	return (DDI_FAILURE);
+}
+
+
+/*
+ * ioat_rs_free()
+ *    Return a previously alloc'd resource to the free bitmap so a later
+ *    ioat_rs_alloc() can hand it out again.
+ */
+void
+ioat_rs_free(ioat_rs_hdl_t handle, uint_t resource)
+{
+	ioat_rs_t *rs;
+	uint_t word;
+	uint_t bit;
+
+
+	ASSERT(handle != NULL);
+
+	rs = (ioat_rs_t *)handle;
+	ASSERT(resource >= rs->rs_min);
+	ASSERT(resource <= rs->rs_max);
+
+	/* each uint64_t in rs_free tracks 64 resources */
+	word = resource >> 6;
+	bit = resource & 0x3F;
+
+	/* set the resource's bit back in the free map (bit set = free) */
+	mutex_enter(&rs->rs_mutex);
+	rs->rs_free[word] |= ((uint64_t)1 << bit);
+	mutex_exit(&rs->rs_mutex);
+}
diff --git a/usr/src/uts/i86pc/ioat/Makefile b/usr/src/uts/i86pc/ioat/Makefile
new file mode 100644
index 0000000000..2dcd6e898e
--- /dev/null
+++ b/usr/src/uts/i86pc/ioat/Makefile
@@ -0,0 +1,97 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# uts/i86pc/ioat/Makefile
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#pragma ident "%Z%%M% %I% %E% SMI"
+#
+# This makefile drives the production of the ioat driver kernel
+# module.
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE = ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE = ioat
+OBJECTS = $(IOAT_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(IOAT_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(ROOT_PSM_DRV_DIR)/$(MODULE)
+CONF_SRCDIR = $(UTSBASE)/i86pc/io/ioat
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/i86pc/Makefile.i86pc
+
+#
+# Define targets
+#
+ALL_TARGET = $(BINARY) $(SRC_CONFILE)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)
+
+LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN
+
+#
+# Dependency
+#
+LDFLAGS += -dy -Nmisc/dcopy
+
+#
+# Override defaults to build a unique, local modstubs.o.
+#
+MODSTUBS_DIR = $(OBJS_DIR)
+CLEANFILES += $(MODSTUBS_O)
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/i86pc/Makefile.targ
+
diff --git a/usr/src/uts/i86pc/sys/ioat.h b/usr/src/uts/i86pc/sys/ioat.h
new file mode 100644
index 0000000000..1e32b54ebd
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/ioat.h
@@ -0,0 +1,359 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_IOAT_H
+#define _SYS_IOAT_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <sys/dcopy.h>
+#include <sys/dcopy_device.h>
+
+
+/* ioat ioctls */
+#define IOATIOC ('T'<< 8)
+typedef enum {
+ IOAT_IOCTL_WRITE_REG = (IOATIOC | 0x0),
+ IOAT_IOCTL_READ_REG = (IOATIOC | 0x1),
+ IOAT_IOCTL_TEST = (IOATIOC | 0x2)
+} ioat_ioctl_enum_t;
+
+typedef struct ioat_ioctl_reg_s {
+ uint_t size;
+ uint_t addr;
+ uint64_t data;
+} ioat_ioctl_reg_t;
+typedef ioat_ioctl_reg_t ioat_ioctl_wrreg_t;
+typedef ioat_ioctl_reg_t ioat_ioctl_rdreg_t;
+
+#ifdef _KERNEL
+/* *** Driver Private Below *** */
+
+/* IOAT_DMACAPABILITY flags */
+#define IOAT_DMACAP_PAGEBREAK 0x1
+#define IOAT_DMACAP_CRC 0x2
+#define IOAT_DMACAP_MARKERSKIP 0x4
+#define IOAT_DMACAP_XOR 0x8
+#define IOAT_DMACAP_DCA 0x10
+
+/* IOAT_INTRCTL bits */
+#define IOAT_INTRCTL_MASTER_EN 0x1
+#define IOAT_INTRCTL_INTR_STAT 0x2
+
+/* MMIO Registers */
+#define IOAT_CHANCNT 0x0 /* 8-bit */
+#define IOAT_XFERCAP 0x1 /* 8-bit */
+#define IOAT_GENCTRL 0x2 /* 8-bit */
+#define IOAT_INTRCTL 0x3 /* 8-bit */
+#define IOAT_ATTNSTATUS 0x4 /* 32-bit */
+#define IOAT_CBVER 0x8 /* 8-bit */
+#define IOAT_PERPORT_OFF 0xA /* 16-bit */
+#define IOAT_INTRDELAY 0xC /* 16-bit */
+#define IOAT_CSSTATUS 0xE /* 16-bit */
+#define IOAT_DMACAPABILITY 0x10 /* 32-bit */
+
+#define IOAT_CHANNELREG_OFFSET 0x80
+
+/* Channel Registers */
+#define IOAT_CHAN_CTL 0x0 /* 16-bit */
+#define IOAT_CHAN_COMP 0x2 /* 16-bit */
+#define IOAT_CHAN_CMPL_LO 0x18 /* 32-bit */
+#define IOAT_CHAN_CMPL_HI 0x1C /* 32-bit */
+#define IOAT_CHAN_ERR 0x28 /* 32-bit */
+#define IOAT_CHAN_ERRMASK 0x2C /* 32-bit */
+#define IOAT_CHAN_DCACTRL 0x30 /* 32-bit */
+
+#define IOAT_V1_CHAN_STS_LO 0x4 /* 32-bit */
+#define IOAT_V1_CHAN_STS_HI 0x8 /* 32-bit */
+#define IOAT_V1_CHAN_ADDR_LO 0x0C /* 32-bit */
+#define IOAT_V1_CHAN_ADDR_HI 0x10 /* 32-bit */
+#define IOAT_V1_CHAN_CMD 0x14 /* 8-bit */
+
+#define IOAT_V2_CHAN_CMD 0x4 /* 8-bit */
+#define IOAT_V2_CHAN_CNT 0x6 /* 16-bit */
+#define IOAT_V2_CHAN_STS_LO 0x8 /* 32-bit */
+#define IOAT_V2_CHAN_STS_HI 0xC /* 32-bit */
+#define IOAT_V2_CHAN_ADDR_LO 0x10 /* 32-bit */
+#define IOAT_V2_CHAN_ADDR_HI 0x14 /* 32-bit */
+
+#define IOAT_CHAN_STS_ADDR_MASK 0xFFFFFFFFFFFFFFC0
+#define IOAT_CHAN_STS_XFER_MASK 0x3F
+#define IOAT_CHAN_STS_FAIL_MASK 0x6
+#define IOAT_CMPL_INDEX(channel) \
+ (((*channel->ic_cmpl & IOAT_CHAN_STS_ADDR_MASK) - \
+ ring->cr_phys_desc) >> 6)
+#define IOAT_CMPL_FAILED(channel) \
+ (*channel->ic_cmpl & IOAT_CHAN_STS_FAIL_MASK)
+
+
+typedef struct ioat_chan_desc_s {
+ uint32_t dd_res0;
+ uint32_t dd_ctrl;
+ uint64_t dd_res1;
+ uint64_t dd_res2;
+ uint64_t dd_next_desc;
+ uint64_t dd_res4;
+ uint64_t dd_res5;
+ uint64_t dd_res6;
+ uint64_t dd_res7;
+} ioat_chan_desc_t;
+
+/* dca dd_ctrl bits */
+#define IOAT_DESC_CTRL_OP_CNTX ((uint32_t)0xFF << 24)
+#define IOAT_DESC_CTRL_CNTX_CHNG 0x1
+typedef struct ioat_chan_dca_desc_s {
+ uint32_t dd_cntx;
+ uint32_t dd_ctrl;
+ uint64_t dd_res1;
+ uint64_t dd_res2;
+ uint64_t dd_next_desc;
+ uint64_t dd_res4;
+ uint64_t dd_res5;
+ uint64_t dd_res6;
+ uint64_t dd_res7;
+} ioat_chan_dca_desc_t;
+
+/* dma dd_ctrl bits */
+#define IOAT_DESC_CTRL_OP_DMA (0x0 << 24)
+#define IOAT_DESC_DMACTRL_NULL 0x20
+#define IOAT_DESC_CTRL_FENCE 0x10
+#define IOAT_DESC_CTRL_CMPL 0x8
+#define IOAT_DESC_CTRL_INTR 0x1
+typedef struct ioat_chan_dma_desc_s {
+ uint32_t dd_size;
+ uint32_t dd_ctrl;
+ uint64_t dd_src_paddr;
+ uint64_t dd_dest_paddr;
+ uint64_t dd_next_desc;
+ uint64_t dd_next_src_paddr; /* v2 only */
+ uint64_t dd_next_dest_paddr; /* v2 only */
+ uint64_t dd_res6;
+ uint64_t dd_res7;
+} ioat_chan_dma_desc_t;
+
+
+typedef enum {
+ IOAT_CBv1,
+ IOAT_CBv2
+} ioat_version_t;
+
+/* ioat private data per command */
+typedef struct ioat_cmd_private_s {
+ uint64_t ip_generation;
+ uint64_t ip_index;
+ dcopy_cmd_t ip_next;
+} ioat_cmd_private_t;
+
+/* descriptor ring state */
+typedef struct ioat_channel_ring_s {
+ /* protects cr_cmpl_gen & cr_cmpl_last */
+ kmutex_t cr_cmpl_mutex;
+
+ /* desc ring generation for the last completion we saw */
+ uint64_t cr_cmpl_gen;
+
+ /* last descriptor index we saw complete */
+ uint64_t cr_cmpl_last;
+
+ /* protects cr_desc_* */
+ kmutex_t cr_desc_mutex;
+
+ /*
+ * last descriptor posted. used to update its next pointer when we
+	 * add a new desc. Also used to track the completion (See comment for
+ * cr_desc_gen_prev).
+ */
+ uint64_t cr_desc_prev;
+
+ /* where to put the next descriptor */
+ uint64_t cr_desc_next;
+
+ /* what the current desc ring generation is */
+ uint64_t cr_desc_gen;
+
+ /*
+ * used during cmd_post to track the last desc posted. cr_desc_next
+ * and cr_desc_gen will be pointing to the next free desc after
+ * writing the descriptor to the ring. But we want to track the
+ * completion for the last descriptor posted.
+ */
+ uint64_t cr_desc_gen_prev;
+
+ /* the last desc in the ring (for wrap) */
+ uint64_t cr_desc_last;
+
+ /* pointer to the head of the ring */
+ ioat_chan_desc_t *cr_desc;
+
+ /* physical address of the head of the ring */
+ uint64_t cr_phys_desc;
+
+ /* back pointer to the channel state */
+ struct ioat_channel_s *cr_chan;
+
+ /* for CB v2, number of desc posted (written to IOAT_V2_CHAN_CNT) */
+ uint_t cr_post_cnt;
+} ioat_channel_ring_t;
+
+/* track channel state so we can handle a failure */
+typedef enum {
+ IOAT_CHANNEL_OK = 0,
+ IOAT_CHANNEL_IN_FAILURE = 1
+} ic_channel_state_t;
+
+typedef struct ioat_channel_s *ioat_channel_t;
+struct ioat_channel_s {
+ /* channel's ring state */
+ ioat_channel_ring_t *ic_ring;
+
+ /* IOAT_CBv1 || IOAT_CBv2 */
+ ioat_version_t ic_ver;
+
+ /*
+	 * state to determine if it's OK to post to the channel and if all
+ * future polls should return failure.
+ */
+ ic_channel_state_t ic_channel_state;
+
+ /* channel command cache (*_cmd_alloc, *_cmd_free, etc) */
+ kmem_cache_t *ic_cmd_cache;
+
+ /* dcopy state for dcopy_device_channel_notify() call */
+ dcopy_handle_t ic_dcopy_handle;
+
+ /* location in memory where completions are DMA'ed into */
+ volatile uint64_t *ic_cmpl;
+
+ /* channel specific registers */
+ uint8_t *ic_regs;
+
+ /* if this channel is using DCA */
+ boolean_t ic_dca_active;
+
+ /* DCA ID the channel is currently pointing to */
+ uint32_t ic_dca_current;
+
+	/* device's channel number */
+ uint_t ic_chan_num;
+
+ /* number of descriptors in ring */
+ uint_t ic_chan_desc_cnt;
+
+ /* descriptor ring alloc state */
+ ddi_dma_handle_t ic_desc_dma_handle;
+ size_t ic_desc_alloc_size;
+ ddi_acc_handle_t ic_desc_handle;
+ ddi_dma_cookie_t ic_desc_cookies;
+
+ /* completion buffer alloc state */
+ ddi_dma_handle_t ic_cmpl_dma_handle;
+ size_t ic_cmpl_alloc_size;
+ ddi_acc_handle_t ic_cmpl_handle;
+ ddi_dma_cookie_t ic_cmpl_cookie;
+ uint64_t ic_phys_cmpl;
+
+ /* if inuse, we need to re-init the channel during resume */
+ boolean_t ic_inuse;
+
+ /* backpointer to driver state */
+ struct ioat_state_s *ic_state;
+};
+
+typedef struct ioat_rs_s *ioat_rs_hdl_t;
+
+/* driver state */
+typedef struct ioat_state_s {
+ dev_info_t *is_dip;
+ int is_instance;
+
+ kmutex_t is_mutex;
+
+ /* register handle and pointer to registers */
+ ddi_acc_handle_t is_reg_handle;
+ uint8_t *is_genregs;
+
+ /* IOAT_CBv1 || IOAT_CBv2 */
+ ioat_version_t is_ver;
+
+ /* channel state */
+ ioat_channel_t is_channel;
+ size_t is_chansize;
+ ioat_rs_hdl_t is_channel_rs;
+
+ ddi_iblock_cookie_t is_iblock_cookie;
+
+ /* device info */
+ uint_t is_chanoff;
+ uint_t is_num_channels;
+ uint_t is_maxxfer;
+ uint_t is_cbver;
+ uint_t is_intrdelay;
+ uint_t is_status;
+ uint_t is_capabilities;
+
+ /* dcopy_device_register()/dcopy_device_unregister() state */
+ dcopy_device_handle_t is_device_handle;
+ dcopy_device_info_t is_deviceinfo;
+} ioat_state_t;
+
+
+int ioat_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred,
+ int *rval);
+
+void ioat_rs_init(ioat_state_t *state, uint_t min_val, uint_t max_val,
+ ioat_rs_hdl_t *handle);
+void ioat_rs_fini(ioat_rs_hdl_t *handle);
+int ioat_rs_alloc(ioat_rs_hdl_t handle, uint_t *rs);
+void ioat_rs_free(ioat_rs_hdl_t handle, uint_t rs);
+
+int ioat_channel_init(ioat_state_t *state);
+void ioat_channel_fini(ioat_state_t *state);
+void ioat_channel_suspend(ioat_state_t *state);
+int ioat_channel_resume(ioat_state_t *state);
+
+int ioat_channel_alloc(void *device_private, dcopy_handle_t handle, int flags,
+ uint_t size, dcopy_query_channel_t *info, void *channel_private);
+void ioat_channel_free(void *channel_private);
+void ioat_channel_intr(ioat_channel_t channel);
+int ioat_cmd_alloc(void *channel, int flags, dcopy_cmd_t *cmd);
+void ioat_cmd_free(void *channel, dcopy_cmd_t *cmd);
+int ioat_cmd_post(void *channel, dcopy_cmd_t cmd);
+int ioat_cmd_poll(void *channel, dcopy_cmd_t cmd);
+void ioat_unregister_complete(void *device_private, int status);
+
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_IOAT_H */
diff --git a/usr/src/uts/i86xpv/Makefile.files b/usr/src/uts/i86xpv/Makefile.files
index 974f7038b5..0e24d203d3 100644
--- a/usr/src/uts/i86xpv/Makefile.files
+++ b/usr/src/uts/i86xpv/Makefile.files
@@ -179,12 +179,13 @@ DBOOT_OBJS += \
#
# driver & misc modules
#
-ISANEXUS_OBJS += isa.o dma_engine.o i8237A.o
-DOMCAPS_OBJS += domcaps.o
BALLOON_OBJS += balloon_drv.o
+DOMCAPS_OBJS += domcaps.o
EVTCHN_OBJS += evtchn_dev.o
GFX_PRIVATE_OBJS += gfx_private.o gfxp_pci.o gfxp_segmap.o \
gfxp_devmap.o gfxp_vgatext.o gfxp_vm.o vgasubr.o
+IOAT_OBJS += ioat.o ioat_rs.o ioat_ioctl.o ioat_chan.o
+ISANEXUS_OBJS += isa.o dma_engine.o i8237A.o
PCI_E_MISC_OBJS += pcie.o pcie_fault.o
PCI_E_NEXUS_OBJS += npe.o npe_misc.o
PCI_E_NEXUS_OBJS += pci_common.o pci_kstats.o pci_tools.o
diff --git a/usr/src/uts/i86xpv/Makefile.i86xpv.shared b/usr/src/uts/i86xpv/Makefile.i86xpv.shared
index 506c8035a0..472edeb8c7 100644
--- a/usr/src/uts/i86xpv/Makefile.i86xpv.shared
+++ b/usr/src/uts/i86xpv/Makefile.i86xpv.shared
@@ -240,6 +240,7 @@ MACH_NOT_YET_KMODS = $(AUTOCONF_OBJS)
#
DRV_KMODS += rootnex
+DRV_KMODS += ioat
DRV_KMODS += isa
DRV_KMODS += pci
DRV_KMODS += npe
diff --git a/usr/src/uts/i86xpv/Makefile.rules b/usr/src/uts/i86xpv/Makefile.rules
index 7b758fd3f6..1f8aeba9e2 100644
--- a/usr/src/uts/i86xpv/Makefile.rules
+++ b/usr/src/uts/i86xpv/Makefile.rules
@@ -20,7 +20,7 @@
#
#
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# ident "%Z%%M% %I% %E% SMI"
@@ -57,6 +57,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/i86xpv/io/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/ioat/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/pci/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -208,6 +212,9 @@ DBOOT_DEFS += -D__xpv
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/cpr/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
+$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/ioat/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
+
$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/pci/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
diff --git a/usr/src/uts/i86xpv/ioat/Makefile b/usr/src/uts/i86xpv/ioat/Makefile
new file mode 100644
index 0000000000..54354aedc7
--- /dev/null
+++ b/usr/src/uts/i86xpv/ioat/Makefile
@@ -0,0 +1,97 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# uts/i86xpv/ioat/Makefile
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#pragma ident "%Z%%M% %I% %E% SMI"
+#
+# This makefile drives the production of the ioat driver kernel
+# module.
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE = ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE = ioat
+OBJECTS = $(IOAT_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(IOAT_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(ROOT_PSM_DRV_DIR)/$(MODULE)
+CONF_SRCDIR = $(UTSBASE)/i86pc/io/ioat
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/i86xpv/Makefile.i86xpv
+
+#
+# Define targets
+#
+ALL_TARGET = $(BINARY) $(SRC_CONFILE)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)
+
+LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN
+
+#
+# Dependency
+#
+LDFLAGS += -dy -Nmisc/dcopy
+
+#
+# Override defaults to build a unique, local modstubs.o.
+#
+MODSTUBS_DIR = $(OBJS_DIR)
+CLEANFILES += $(MODSTUBS_O)
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/i86xpv/Makefile.targ
+
diff --git a/usr/src/uts/intel/Makefile.files b/usr/src/uts/intel/Makefile.files
index 9a756bd90d..a1a4a1d66e 100644
--- a/usr/src/uts/intel/Makefile.files
+++ b/usr/src/uts/intel/Makefile.files
@@ -138,6 +138,7 @@ CMDK_OBJS += cmdk.o
CMLB_OBJS += cmlb.o
CPUNEX_OBJS += cpunex.o
DADK_OBJS += dadk.o
+DCOPY_OBJS += dcopy.o
DNET_OBJS += dnet.o mii.o
FD_OBJS += fd.o
GDA_OBJS += gda.o
diff --git a/usr/src/uts/intel/Makefile.intel.shared b/usr/src/uts/intel/Makefile.intel.shared
index f52893df93..f1cbff8530 100644
--- a/usr/src/uts/intel/Makefile.intel.shared
+++ b/usr/src/uts/intel/Makefile.intel.shared
@@ -525,6 +525,7 @@ MISC_KMODS += cmlb
MISC_KMODS += consconfig
MISC_KMODS += ctf
MISC_KMODS += dadk
+MISC_KMODS += dcopy
MISC_KMODS += diaudio
MISC_KMODS += dls
MISC_KMODS += drm
diff --git a/usr/src/uts/intel/dcopy/Makefile b/usr/src/uts/intel/dcopy/Makefile
new file mode 100644
index 0000000000..e321465ec1
--- /dev/null
+++ b/usr/src/uts/intel/dcopy/Makefile
@@ -0,0 +1,84 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# uts/intel/dcopy/Makefile
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#ident "%Z%%M% %I% %E% SMI"
+#
+# This makefile drives the production of the dcopy
+# kernel module.
+#
+# intel architecture dependent
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE = ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE = dcopy
+OBJECTS = $(DCOPY_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(DCOPY_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(ROOT_MISC_DIR)/$(MODULE)
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/intel/Makefile.intel
+
+#
+# Define targets
+#
+ALL_TARGET = $(BINARY)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/intel/Makefile.targ
diff --git a/usr/src/uts/intel/ia32/ml/modstubs.s b/usr/src/uts/intel/ia32/ml/modstubs.s
index fd7a606594..5ae7072e82 100644
--- a/usr/src/uts/intel/ia32/ml/modstubs.s
+++ b/usr/src/uts/intel/ia32/ml/modstubs.s
@@ -1313,6 +1313,22 @@ fcnname/**/_info: \
END_MODULE(kssl);
#endif
+/*
+ * Stubs for dcopy, for Intel IOAT KAPIs
+ */
+#ifndef DCOPY_MODULE
+ MODULE(dcopy,misc);
+ NO_UNLOAD_STUB(dcopy, dcopy_query, nomod_minus_one);
+ NO_UNLOAD_STUB(dcopy, dcopy_query_channel, nomod_minus_one);
+ NO_UNLOAD_STUB(dcopy, dcopy_alloc, nomod_minus_one);
+ NO_UNLOAD_STUB(dcopy, dcopy_free, nomod_minus_one);
+ NO_UNLOAD_STUB(dcopy, dcopy_cmd_alloc, nomod_minus_one);
+ NO_UNLOAD_STUB(dcopy, dcopy_cmd_free, nomod_void);
+ NO_UNLOAD_STUB(dcopy, dcopy_cmd_post, nomod_minus_one);
+ NO_UNLOAD_STUB(dcopy, dcopy_cmd_poll, nomod_minus_one);
+ END_MODULE(dcopy);
+#endif
+
/ this is just a marker for the area of text that contains stubs
ENTRY_NP(stubs_end)
diff --git a/usr/src/uts/sparc/ml/modstubs.s b/usr/src/uts/sparc/ml/modstubs.s
index 8e4e06a008..b1936c4172 100644
--- a/usr/src/uts/sparc/ml/modstubs.s
+++ b/usr/src/uts/sparc/ml/modstubs.s
@@ -1265,6 +1265,22 @@ stubs_base:
END_MODULE(kssl);
#endif
+/*
+ * Stubs for dcopy, for Intel IOAT KAPIs
+ */
+#ifndef DCOPY_MODULE
+ MODULE(dcopy,misc);
+ NO_UNLOAD_STUB(dcopy, dcopy_query, nomod_minus_one);
+ NO_UNLOAD_STUB(dcopy, dcopy_query_channel, nomod_minus_one);
+ NO_UNLOAD_STUB(dcopy, dcopy_alloc, nomod_minus_one);
+ NO_UNLOAD_STUB(dcopy, dcopy_free, nomod_minus_one);
+ NO_UNLOAD_STUB(dcopy, dcopy_cmd_alloc, nomod_minus_one);
+ NO_UNLOAD_STUB(dcopy, dcopy_cmd_free, nomod_void);
+ NO_UNLOAD_STUB(dcopy, dcopy_cmd_post, nomod_minus_one);
+ NO_UNLOAD_STUB(dcopy, dcopy_cmd_poll, nomod_minus_one);
+ END_MODULE(dcopy);
+#endif
+
! this is just a marker for the area of text that contains stubs
.seg ".text"
.global stubs_end