diff options
-rw-r--r-- | usr/src/pkgdefs/Makefile | 1 | ||||
-rw-r--r-- | usr/src/pkgdefs/SUNWdcopy/Makefile (renamed from deleted_files/usr/src/pkgdefs/SUNWdcopy/Makefile) | 0 | ||||
-rw-r--r-- | usr/src/pkgdefs/SUNWdcopy/pkginfo.tmpl (renamed from deleted_files/usr/src/pkgdefs/SUNWdcopy/pkginfo.tmpl) | 0 | ||||
-rw-r--r-- | usr/src/pkgdefs/SUNWdcopy/postinstall.tmpl (renamed from deleted_files/usr/src/pkgdefs/SUNWdcopy/postinstall.tmpl) | 0 | ||||
-rw-r--r-- | usr/src/pkgdefs/SUNWdcopy/preremove.tmpl (renamed from deleted_files/usr/src/pkgdefs/SUNWdcopy/preremove.tmpl) | 0 | ||||
-rw-r--r-- | usr/src/pkgdefs/SUNWdcopy/prototype_com (renamed from deleted_files/usr/src/pkgdefs/SUNWdcopy/prototype_com) | 0 | ||||
-rw-r--r-- | usr/src/pkgdefs/SUNWdcopy/prototype_i386 (renamed from deleted_files/usr/src/pkgdefs/SUNWdcopy/prototype_i386) | 0 | ||||
-rw-r--r-- | usr/src/pkgdefs/SUNWhea/prototype_com | 1 | ||||
-rw-r--r-- | usr/src/uts/common/fs/sockfs/socksctp.c | 6 | ||||
-rwxr-xr-x | usr/src/uts/common/fs/sockfs/socksdp.c | 6 | ||||
-rw-r--r-- | usr/src/uts/common/fs/sockfs/sockstr.c | 170 | ||||
-rw-r--r-- | usr/src/uts/common/fs/sockfs/socksubr.c | 25 | ||||
-rw-r--r-- | usr/src/uts/common/fs/sockfs/socktpi.c | 127 | ||||
-rw-r--r-- | usr/src/uts/common/fs/sockfs/sockvnops.c | 10 | ||||
-rw-r--r-- | usr/src/uts/common/inet/tcp.h | 10 | ||||
-rw-r--r-- | usr/src/uts/common/inet/tcp/tcp.c | 536 | ||||
-rw-r--r-- | usr/src/uts/common/inet/tcp/tcp6ddi.c | 4 | ||||
-rw-r--r-- | usr/src/uts/common/inet/tcp/tcp_fusion.c | 14 | ||||
-rw-r--r-- | usr/src/uts/common/inet/tcp/tcpddi.c | 4 | ||||
-rw-r--r-- | usr/src/uts/common/io/dcopy.c (renamed from deleted_files/usr/src/uts/common/io/dcopy.c) | 6 | ||||
-rw-r--r-- | usr/src/uts/common/io/stream.c | 50 | ||||
-rw-r--r-- | usr/src/uts/common/os/move.c | 400 | ||||
-rw-r--r-- | usr/src/uts/common/os/streamio.c | 176 | ||||
-rw-r--r-- | usr/src/uts/common/os/strsubr.c | 11 | ||||
-rw-r--r-- | usr/src/uts/common/sys/Makefile | 1 | ||||
-rw-r--r-- | usr/src/uts/common/sys/conf.h | 5 | ||||
-rw-r--r-- | usr/src/uts/common/sys/dcopy.h (renamed from deleted_files/usr/src/uts/common/sys/dcopy.h) | 4 | ||||
-rw-r--r-- | usr/src/uts/common/sys/dcopy_device.h (renamed from deleted_files/usr/src/uts/common/sys/dcopy_device.h) | 0 | ||||
-rw-r--r-- | usr/src/uts/common/sys/socketvar.h | 9 | ||||
-rw-r--r-- | usr/src/uts/common/sys/sodirect.h (renamed from deleted_files/usr/src/uts/common/sys/sodirect.h) | 0 | ||||
-rw-r--r-- | usr/src/uts/common/sys/stream.h | 3 | ||||
-rw-r--r-- | usr/src/uts/common/sys/strsubr.h | 8 | ||||
-rw-r--r-- | usr/src/uts/common/sys/uio.h | 69 | ||||
-rw-r--r-- | usr/src/uts/i86pc/Makefile.files | 1 | ||||
-rw-r--r-- | usr/src/uts/i86pc/Makefile.i86pc.shared | 1 | ||||
-rw-r--r-- | usr/src/uts/i86pc/Makefile.rules | 7 | ||||
-rw-r--r-- | usr/src/uts/i86pc/io/ioat/ioat.c (renamed from deleted_files/usr/src/uts/i86pc/io/ioat/ioat.c) | 0 | ||||
-rw-r--r-- | usr/src/uts/i86pc/io/ioat/ioat.conf (renamed from deleted_files/usr/src/uts/i86pc/io/ioat/ioat.conf) | 0 | ||||
-rw-r--r-- | usr/src/uts/i86pc/io/ioat/ioat_chan.c (renamed from deleted_files/usr/src/uts/i86pc/io/ioat/ioat_chan.c) | 0 | ||||
-rw-r--r-- | usr/src/uts/i86pc/io/ioat/ioat_ioctl.c (renamed from deleted_files/usr/src/uts/i86pc/io/ioat/ioat_ioctl.c) | 0 | ||||
-rw-r--r-- | usr/src/uts/i86pc/io/ioat/ioat_rs.c (renamed from deleted_files/usr/src/uts/i86pc/io/ioat/ioat_rs.c) | 0 | ||||
-rw-r--r-- | usr/src/uts/i86pc/ioat/Makefile (renamed from deleted_files/usr/src/uts/i86pc/ioat/Makefile) | 0 | ||||
-rw-r--r-- | usr/src/uts/i86pc/sys/ioat.h (renamed from deleted_files/usr/src/uts/i86pc/sys/ioat.h) | 0 | ||||
-rw-r--r-- | usr/src/uts/i86xpv/Makefile.files | 5 | ||||
-rw-r--r-- | usr/src/uts/i86xpv/Makefile.i86xpv.shared | 1 | ||||
-rw-r--r-- | usr/src/uts/i86xpv/Makefile.rules | 7 | ||||
-rw-r--r-- | usr/src/uts/i86xpv/ioat/Makefile (renamed from deleted_files/usr/src/uts/i86xpv/ioat/Makefile) | 0 | ||||
-rw-r--r-- | usr/src/uts/intel/Makefile.files | 1 | ||||
-rw-r--r-- | usr/src/uts/intel/Makefile.intel.shared | 1 | ||||
-rw-r--r-- | usr/src/uts/intel/dcopy/Makefile (renamed from deleted_files/usr/src/uts/intel/dcopy/Makefile) | 0 | ||||
-rw-r--r-- | usr/src/uts/intel/ia32/ml/modstubs.s | 16 | ||||
-rw-r--r-- | usr/src/uts/sparc/ml/modstubs.s | 16 |
52 files changed, 1588 insertions, 124 deletions
diff --git a/usr/src/pkgdefs/Makefile b/usr/src/pkgdefs/Makefile index b0e0d4b520..067ddb9ae0 100644 --- a/usr/src/pkgdefs/Makefile +++ b/usr/src/pkgdefs/Makefile @@ -125,6 +125,7 @@ i386_SUBDIRS= \ SUNWgrub \ SUNWgrubS \ SUNWhxge \ + SUNWdcopy \ SUNWipw \ SUNWiwi \ SUNWiwk \ diff --git a/deleted_files/usr/src/pkgdefs/SUNWdcopy/Makefile b/usr/src/pkgdefs/SUNWdcopy/Makefile index 3431d26eb9..3431d26eb9 100644 --- a/deleted_files/usr/src/pkgdefs/SUNWdcopy/Makefile +++ b/usr/src/pkgdefs/SUNWdcopy/Makefile diff --git a/deleted_files/usr/src/pkgdefs/SUNWdcopy/pkginfo.tmpl b/usr/src/pkgdefs/SUNWdcopy/pkginfo.tmpl index 3b9f1d87d6..3b9f1d87d6 100644 --- a/deleted_files/usr/src/pkgdefs/SUNWdcopy/pkginfo.tmpl +++ b/usr/src/pkgdefs/SUNWdcopy/pkginfo.tmpl diff --git a/deleted_files/usr/src/pkgdefs/SUNWdcopy/postinstall.tmpl b/usr/src/pkgdefs/SUNWdcopy/postinstall.tmpl index cdb1f395bf..cdb1f395bf 100644 --- a/deleted_files/usr/src/pkgdefs/SUNWdcopy/postinstall.tmpl +++ b/usr/src/pkgdefs/SUNWdcopy/postinstall.tmpl diff --git a/deleted_files/usr/src/pkgdefs/SUNWdcopy/preremove.tmpl b/usr/src/pkgdefs/SUNWdcopy/preremove.tmpl index 2526218df9..2526218df9 100644 --- a/deleted_files/usr/src/pkgdefs/SUNWdcopy/preremove.tmpl +++ b/usr/src/pkgdefs/SUNWdcopy/preremove.tmpl diff --git a/deleted_files/usr/src/pkgdefs/SUNWdcopy/prototype_com b/usr/src/pkgdefs/SUNWdcopy/prototype_com index 34626771bc..34626771bc 100644 --- a/deleted_files/usr/src/pkgdefs/SUNWdcopy/prototype_com +++ b/usr/src/pkgdefs/SUNWdcopy/prototype_com diff --git a/deleted_files/usr/src/pkgdefs/SUNWdcopy/prototype_i386 b/usr/src/pkgdefs/SUNWdcopy/prototype_i386 index 77bcc81a7e..77bcc81a7e 100644 --- a/deleted_files/usr/src/pkgdefs/SUNWdcopy/prototype_i386 +++ b/usr/src/pkgdefs/SUNWdcopy/prototype_i386 diff --git a/usr/src/pkgdefs/SUNWhea/prototype_com b/usr/src/pkgdefs/SUNWhea/prototype_com index d41415c209..bf0e84c13b 100644 --- a/usr/src/pkgdefs/SUNWhea/prototype_com +++ b/usr/src/pkgdefs/SUNWhea/prototype_com @@ 
-1218,6 +1218,7 @@ f none usr/include/sys/socket.h 644 root bin f none usr/include/sys/socket_impl.h 644 root bin f none usr/include/sys/socketvar.h 644 root bin f none usr/include/sys/sockio.h 644 root bin +f none usr/include/sys/sodirect.h 644 root bin f none usr/include/sys/sservice.h 644 root bin f none usr/include/sys/squeue.h 644 root bin f none usr/include/sys/squeue_impl.h 644 root bin diff --git a/usr/src/uts/common/fs/sockfs/socksctp.c b/usr/src/uts/common/fs/sockfs/socksctp.c index 5478bbfda0..8f9ca22255 100644 --- a/usr/src/uts/common/fs/sockfs/socksctp.c +++ b/usr/src/uts/common/fs/sockfs/socksctp.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -171,6 +171,8 @@ sosctp_sock_constructor(void *buf, void *cdrarg, int kmflags) so->so_nl7c_uri = NULL; so->so_nl7c_rcv_mp = NULL; + so->so_direct = NULL; + vp = vn_alloc(kmflags); if (vp == NULL) { return (-1); @@ -204,6 +206,8 @@ sosctp_sock_destructor(void *buf, void *cdrarg) struct sonode *so = &ss->ss_so; struct vnode *vp = SOTOV(so); + ASSERT(so->so_direct == NULL); + ASSERT(so->so_nl7c_flags == 0); ASSERT(so->so_nl7c_uri == NULL); ASSERT(so->so_nl7c_rcv_mp == NULL); diff --git a/usr/src/uts/common/fs/sockfs/socksdp.c b/usr/src/uts/common/fs/sockfs/socksdp.c index 09ab4d0b49..b8482b90b1 100755 --- a/usr/src/uts/common/fs/sockfs/socksdp.c +++ b/usr/src/uts/common/fs/sockfs/socksdp.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -133,6 +133,8 @@ sosdp_sock_constructor(void *buf, void *cdrarg, int kmflags) so->so_nl7c_uri = NULL; so->so_nl7c_rcv_mp = NULL; + so->so_direct = NULL; + vp = vn_alloc(kmflags); if (vp == NULL) { return (-1); @@ -159,6 +161,8 @@ sosdp_sock_destructor(void *buf, void *cdrarg) struct sonode *so = &ss->ss_so; struct vnode *vp = SOTOV(so); + ASSERT(so->so_direct == NULL); + ASSERT(so->so_nl7c_flags == 0); ASSERT(so->so_nl7c_uri == NULL); ASSERT(so->so_nl7c_rcv_mp == NULL); diff --git a/usr/src/uts/common/fs/sockfs/sockstr.c b/usr/src/uts/common/fs/sockfs/sockstr.c index eb540644be..1e3d0aaa5d 100644 --- a/usr/src/uts/common/fs/sockfs/sockstr.c +++ b/usr/src/uts/common/fs/sockfs/sockstr.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -69,6 +69,8 @@ #include <c2/audit.h> +#include <sys/dcopy.h> + int so_default_version = SOV_SOCKSTREAM; #ifdef DEBUG @@ -119,6 +121,26 @@ static mblk_t *strsock_misc(vnode_t *vp, mblk_t *mp, static int tlitosyserr(int terr); /* + * Sodirect kmem_cache and put/wakeup functions. + */ +struct kmem_cache *socktpi_sod_cache; +static int sodput(sodirect_t *, mblk_t *); +static void sodwakeup(sodirect_t *); + +/* + * Called by sockinit() when sockfs is loaded. + */ +int +sostr_init() +{ + /* Allocate sodirect_t kmem_cache */ + socktpi_sod_cache = kmem_cache_create("socktpi_sod_cache", + sizeof (sodirect_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + + return (0); +} + +/* * Convert a socket to a stream. Invoked when the illusory sockmod * is popped from the stream. * Change the stream head back to default operation without losing @@ -468,6 +490,34 @@ so_strinit(struct sonode *so, struct sonode *tso) stp->sd_qn_minpsz = 0; mutex_exit(&stp->sd_lock); + /* + * If sodirect capable allocate and initialize sodirect_t. + * Note, SS_SODIRECT is set in socktpi_open(). 
+ */ + if (so->so_state & SS_SODIRECT) { + sodirect_t *sodp; + + ASSERT(so->so_direct == NULL); + + sodp = kmem_cache_alloc(socktpi_sod_cache, KM_SLEEP); + sodp->sod_state = SOD_ENABLED | SOD_WAKE_NOT; + sodp->sod_want = 0; + sodp->sod_q = RD(stp->sd_wrq); + sodp->sod_enqueue = sodput; + sodp->sod_wakeup = sodwakeup; + sodp->sod_uioafh = NULL; + sodp->sod_uioaft = NULL; + sodp->sod_lock = &stp->sd_lock; + /* + * Remainder of the sod_uioa members are left uninitialized + * but will be initialized later by uioainit() before uioa + * is enabled. + */ + sodp->sod_uioa.uioa_state = UIOA_ALLOC; + so->so_direct = sodp; + stp->sd_sodirect = sodp; + } + return (0); } @@ -2872,3 +2922,121 @@ tlitosyserr(int terr) else return (tli_errs[terr]); } + +/* + * Sockfs sodirect STREAMS read put procedure. Called from sodirect enabled + * transport driver/module with an mblk_t chain. + * + * Note, we in-line putq() for the fast-path cases of q is empty, q_last and + * bp are of type M_DATA. All other cases we call putq(). + * + * On success a zero will be returned, else an errno will be returned. + */ +int +sodput(sodirect_t *sodp, mblk_t *bp) +{ + queue_t *q = sodp->sod_q; + struct stdata *stp = (struct stdata *)q->q_ptr; + mblk_t *nbp; + int ret; + mblk_t *last = q->q_last; + int bytecnt = 0; + int mblkcnt = 0; + + + ASSERT(MUTEX_HELD(sodp->sod_lock)); + + if (stp->sd_flag == STREOF) { + ret = 0; + goto error; + } + + if (q->q_first == NULL) { + /* Q empty, really fast fast-path */ + bp->b_prev = NULL; + bp->b_next = NULL; + q->q_first = bp; + q->q_last = bp; + + } else if (last->b_datap->db_type == M_DATA && + bp->b_datap->db_type == M_DATA) { + /* + * Last mblk_t chain and bp are both type M_DATA so + * in-line putq() here, if the DBLK_UIOA states match + * add bp to the end of the current last chain, else + * start a new last chain with bp.
+ */ + if ((last->b_datap->db_flags & DBLK_UIOA) == + (bp->b_datap->db_flags & DBLK_UIOA)) { + /* Added to end */ + while ((nbp = last->b_cont) != NULL) + last = nbp; + last->b_cont = bp; + } else { + /* New last */ + last->b_next = bp; + bp->b_next = NULL; + bp->b_prev = last; + q->q_last = bp; + } + } else { + /* + * Can't use q_last so just call putq(). + */ + (void) putq(q, bp); + return (0); + } + + /* Count bytes and mblk_t's */ + do { + bytecnt += MBLKL(bp); + mblkcnt++; + } while ((bp = bp->b_cont) != NULL); + q->q_count += bytecnt; + q->q_mblkcnt += mblkcnt; + + /* Check for QFULL */ + if (q->q_count >= q->q_hiwat + sodp->sod_want || + q->q_mblkcnt >= q->q_hiwat) { + q->q_flag |= QFULL; + } + + return (0); + +error: + do { + if ((nbp = bp->b_next) != NULL) + bp->b_next = NULL; + freemsg(bp); + } while ((bp = nbp) != NULL); + + return (ret); +} + +/* + * Sockfs sodirect read wakeup. Called from a sodirect enabled transport + * driver/module to indicate that read-side data is available. + * + * On return the sodirect_t.lock mutex will be exited so this must be the + * last sodirect_t call to guarantee atomic access of *sodp. + */ +void +sodwakeup(sodirect_t *sodp) +{ + queue_t *q = sodp->sod_q; + struct stdata *stp = (struct stdata *)q->q_ptr; + + ASSERT(MUTEX_HELD(sodp->sod_lock)); + + if (stp->sd_flag & RSLEEP) { + stp->sd_flag &= ~RSLEEP; + cv_broadcast(&q->q_wait); + } + + if (stp->sd_rput_opt & SR_POLLIN) { + stp->sd_rput_opt &= ~SR_POLLIN; + mutex_exit(sodp->sod_lock); + pollwakeup(&stp->sd_pollist, POLLIN | POLLRDNORM); + } else + mutex_exit(sodp->sod_lock); +} diff --git a/usr/src/uts/common/fs/sockfs/socksubr.c b/usr/src/uts/common/fs/sockfs/socksubr.c index 9a6e9147e3..c857c34225 100644 --- a/usr/src/uts/common/fs/sockfs/socksubr.c +++ b/usr/src/uts/common/fs/sockfs/socksubr.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. */ @@ -44,6 +44,7 @@ #include <sys/file.h> #include <sys/open.h> #include <sys/user.h> +#include <sys/uio.h> #include <sys/termios.h> #include <sys/stream.h> #include <sys/strsubr.h> @@ -90,6 +91,7 @@ #define SO_LOCK_WAKEUP_TIME 3000 /* Wakeup time in milliseconds */ static struct kmem_cache *socktpi_cache, *socktpi_unix_cache; +struct kmem_cache *socktpi_sod_cache; dev_t sockdev; /* For fsid in getattr */ @@ -105,6 +107,8 @@ extern void sendfile_init(); extern void nl7c_init(void); +extern int sostr_init(); + #define ADRSTRLEN (2 * sizeof (void *) + 1) /* * kernel structure for passing the sockinfo data back up to the user. @@ -523,6 +527,15 @@ sockfree(struct sonode *so) so->so_nl7c_flags = 0; } + if (so->so_direct != NULL) { + sodirect_t *sodp = so->so_direct; + + ASSERT(sodp->sod_uioafh == NULL); + + so->so_direct = NULL; + kmem_cache_free(socktpi_sod_cache, sodp); + } + ASSERT(so->so_ux_bound_vp == NULL); if ((mp = so->so_unbind_mp) != NULL) { freemsg(mp); @@ -567,6 +580,8 @@ socktpi_constructor(void *buf, void *cdrarg, int kmflags) struct sonode *so = buf; struct vnode *vp; + so->so_direct = NULL; + so->so_nl7c_flags = 0; so->so_nl7c_uri = NULL; so->so_nl7c_rcv_mp = NULL; @@ -606,6 +621,8 @@ socktpi_destructor(void *buf, void *cdrarg) struct sonode *so = buf; struct vnode *vp = SOTOV(so); + ASSERT(so->so_direct == NULL); + ASSERT(so->so_nl7c_flags == 0); ASSERT(so->so_nl7c_uri == NULL); ASSERT(so->so_nl7c_rcv_mp == NULL); @@ -713,6 +730,12 @@ sockinit(int fstype, char *name) goto failure; } + error = sostr_init(); + if (error != 0) { + err_str = NULL; + goto failure; + } + /* * Create sonode caches. We create a special one for AF_UNIX so * that we can track them for netstat(1m). 
diff --git a/usr/src/uts/common/fs/sockfs/socktpi.c b/usr/src/uts/common/fs/sockfs/socktpi.c index d6f9ebb57f..e632e234e2 100644 --- a/usr/src/uts/common/fs/sockfs/socktpi.c +++ b/usr/src/uts/common/fs/sockfs/socktpi.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -58,6 +58,7 @@ #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/sockio.h> +#include <sys/sodirect.h> #include <netinet/in.h> #include <sys/un.h> #include <sys/strsun.h> @@ -186,6 +187,9 @@ extern mblk_t *strsock_kssl_output(vnode_t *, mblk_t *, strwakeup_t *, static int sotpi_unbind(struct sonode *, int); +extern int sodput(sodirect_t *, mblk_t *); +extern void sodwakeup(sodirect_t *); + /* TPI sockfs sonode operations */ static int sotpi_accept(struct sonode *, int, struct sonode **); static int sotpi_bind(struct sonode *, struct sockaddr *, socklen_t, @@ -2910,11 +2914,13 @@ sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) t_uscalar_t namelen; int so_state = so->so_state; /* Snapshot */ ssize_t saved_resid; - int error; rval_t rval; int flags; clock_t timout; int first; + int error = 0; + struct uio *suiop = NULL; + sodirect_t *sodp = so->so_direct; flags = msg->msg_flags; msg->msg_flags = 0; @@ -3062,6 +3068,53 @@ sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) opflag = pflag; first = 1; + if (uiop->uio_resid >= uioasync.mincnt && + sodp != NULL && (sodp->sod_state & SOD_ENABLED) && + uioasync.enabled && !(flags & MSG_PEEK) && + !(so_state & SS_CANTRCVMORE)) { + /* + * Big enough I/O for uioa min setup and an sodirect socket + * and sodirect enabled and uioa enabled and I/O will be done + * and not EOF so initialize the sodirect_t uioa_t with "uiop". 
+ */ + mutex_enter(sodp->sod_lock); + if (!uioainit(uiop, &sodp->sod_uioa)) { + /* + * Successful uioainit() so the uio_t part of the + * uioa_t will be used for all uio_t work to follow, + * we save the original "uiop" in "suiop". + */ + suiop = uiop; + uiop = (uio_t *)&sodp->sod_uioa; + /* + * Before returning to the caller the passed in uio_t + * "uiop" will be updated via a call to uioafini() + * below. + * + * Note, the uioa.uioa_state isn't set to UIOA_ENABLED + * here as first we have to uioamove() any currently + * queued M_DATA mblk_t(s) so it will be done in + * kstrgetmsg(). + */ + } + /* + * In either uioainit() success or not case note the number + * of uio bytes the caller wants for sod framework and/or + * transport (e.g. TCP) strategy. + */ + sodp->sod_want = uiop->uio_resid; + mutex_exit(sodp->sod_lock); + } else if (sodp != NULL && (sodp->sod_state & SOD_ENABLED)) { + /* + * No uioa but still using sodirect so note the number of + * uio bytes the caller wants for sodirect framework and/or + * transport (e.g. TCP) strategy. + * + * Note, sod_lock not held, only writer is in this function + * and only one thread at a time so not needed just to init. + */ + sodp->sod_want = uiop->uio_resid; + } retry: saved_resid = uiop->uio_resid; pri = 0; @@ -3091,10 +3144,7 @@ retry: eprintsoline(so, error); break; } - mutex_enter(&so->so_lock); - so_unlock_read(so); /* Clear SOREADLOCKED */ - mutex_exit(&so->so_lock); - return (error); + goto out; } /* * For datagrams the MOREDATA flag is used to set MSG_TRUNC. 
@@ -3137,9 +3187,7 @@ retry: pflag = opflag | MSG_NOMARK; goto retry; } - so_unlock_read(so); /* Clear SOREADLOCKED */ - mutex_exit(&so->so_lock); - return (0); + goto out_locked; } /* strsock_proto has already verified length and alignment */ @@ -3179,9 +3227,7 @@ retry: pflag = opflag | MSG_NOMARK; goto retry; } - so_unlock_read(so); /* Clear SOREADLOCKED */ - mutex_exit(&so->so_lock); - return (0); + goto out_locked; } case T_UNITDATA_IND: { void *addr; @@ -3207,7 +3253,7 @@ retry: freemsg(mp); error = EPROTO; eprintsoline(so, error); - goto err; + goto out; } if (so->so_family == AF_UNIX) { /* @@ -3236,7 +3282,7 @@ retry: freemsg(mp); error = EPROTO; eprintsoline(so, error); - goto err; + goto out; } if (so->so_family == AF_UNIX) so_getopt_srcaddr(opt, optlen, &addr, &addrlen); @@ -3283,17 +3329,14 @@ retry: msg->msg_namelen); kmem_free(control, controllen); eprintsoline(so, error); - goto err; + goto out; } msg->msg_control = control; msg->msg_controllen = controllen; } freemsg(mp); - mutex_enter(&so->so_lock); - so_unlock_read(so); /* Clear SOREADLOCKED */ - mutex_exit(&so->so_lock); - return (0); + goto out; } case T_OPTDATA_IND: { struct T_optdata_req *tdr; @@ -3322,7 +3365,7 @@ retry: freemsg(mp); error = EPROTO; eprintsoline(so, error); - goto err; + goto out; } ncontrollen = so_cmsglen(mp, opt, optlen, @@ -3350,7 +3393,7 @@ retry: freemsg(mp); kmem_free(control, controllen); eprintsoline(so, error); - goto err; + goto out; } msg->msg_control = control; msg->msg_controllen = controllen; @@ -3382,9 +3425,7 @@ retry: pflag = opflag | MSG_NOMARK; goto retry; } - so_unlock_read(so); /* Clear SOREADLOCKED */ - mutex_exit(&so->so_lock); - return (0); + goto out_locked; } case T_EXDATA_IND: { dprintso(so, 1, @@ -3441,10 +3482,7 @@ retry: eprintsoline(so, error); } #endif /* SOCK_DEBUG */ - mutex_enter(&so->so_lock); - so_unlock_read(so); /* Clear SOREADLOCKED */ - mutex_exit(&so->so_lock); - return (error); + goto out; } ASSERT(mp); tpr = (union T_primitives 
*)mp->b_rptr; @@ -3490,11 +3528,40 @@ retry: freemsg(mp); error = EPROTO; eprintsoline(so, error); - goto err; + goto out; } /* NOTREACHED */ -err: +out: mutex_enter(&so->so_lock); +out_locked: + if (sodp != NULL) { + /* Finish any sodirect and uioa processing */ + mutex_enter(sodp->sod_lock); + if (suiop != NULL) { + /* Finish any uioa_t processing */ + int ret; + + ASSERT(uiop == (uio_t *)&sodp->sod_uioa); + ret = uioafini(suiop, (uioa_t *)uiop); + if (error == 0 && ret != 0) { + /* If no error yet, set it */ + error = ret; + } + if ((mp = sodp->sod_uioafh) != NULL) { + sodp->sod_uioafh = NULL; + sodp->sod_uioaft = NULL; + freemsg(mp); + } + } + if (!(sodp->sod_state & SOD_WAKE_NOT)) { + /* Awoke */ + sodp->sod_state &= SOD_WAKE_CLR; + sodp->sod_state |= SOD_WAKE_NOT; + } + /* Last, clear sod_want value */ + sodp->sod_want = 0; + mutex_exit(sodp->sod_lock); + } so_unlock_read(so); /* Clear SOREADLOCKED */ mutex_exit(&so->so_lock); return (error); diff --git a/usr/src/uts/common/fs/sockfs/sockvnops.c b/usr/src/uts/common/fs/sockfs/sockvnops.c index 6c122c679d..c85a76d6e6 100644 --- a/usr/src/uts/common/fs/sockfs/sockvnops.c +++ b/usr/src/uts/common/fs/sockfs/sockvnops.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -239,6 +239,10 @@ socktpi_open(struct vnode **vpp, int flag, struct cred *cr, * udp case, when some other module is autopushed * above it, or for some reasons the expected module * isn't purely D_MP (which is the main requirement). + * + * Else, SS_DIRECT is valid. If the read-side Q has + * _QSODIRECT set then and uioasync is enabled then + * set SS_SODIRECT to enable sodirect. 
*/ if (!socktpi_direct || !(tq->q_flag & _QDIRECT) || !(_OTHERQ(tq)->q_flag & _QDIRECT)) { @@ -255,6 +259,10 @@ socktpi_open(struct vnode **vpp, int flag, struct cred *cr, return (error); } } + } else if ((_OTHERQ(tq)->q_flag & _QSODIRECT) && + uioasync.enabled) { + /* Enable sodirect */ + so->so_state |= SS_SODIRECT; } } } else { diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h index aa5ba3a075..26e1b12f4e 100644 --- a/usr/src/uts/common/inet/tcp.h +++ b/usr/src/uts/common/inet/tcp.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -37,6 +37,7 @@ extern "C" { #include <netinet/ip6.h> #include <netinet/tcp.h> #include <sys/socket.h> +#include <sys/sodirect.h> #include <sys/multidata.h> #include <sys/md5.h> #include <inet/common.h> @@ -598,6 +599,13 @@ typedef struct tcp_s { */ boolean_t tcp_flow_stopped; + /* + * tcp_sodirect is used by tcp on the receive side to push mblk_t(s) + * directly to sockfs. Also, to schedule asynchronous copyout directly + * to a pending user-land uio buffer. + */ + sodirect_t *tcp_sodirect; + #ifdef DEBUG pc_t tcmp_stk[15]; #endif diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index 470f6cad1d..766a7db59e 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -66,6 +66,8 @@ const char tcp_version[] = "%Z%%M% %I% %E% SMI"; #include <sys/isa_defs.h> #include <sys/md5.h> #include <sys/random.h> +#include <sys/sodirect.h> +#include <sys/uio.h> #include <netinet/in.h> #include <netinet/tcp.h> #include <netinet/ip6.h> @@ -216,6 +218,23 @@ const char tcp_version[] = "%Z%%M% %I% %E% SMI"; * behaviour. Once tcp_issocket is unset, its never set for the * life of that connection. * + * In support of on-board asynchronous DMA hardware (e.g. 
Intel I/OAT) + * two consoldiation private KAPIs are used to enqueue M_DATA mblk_t's + * directly to the socket (sodirect) and start an asynchronous copyout + * to a user-land receive-side buffer (uioa) when a blocking socket read + * (e.g. read, recv, ...) is pending. + * + * This is accomplished when tcp_issocket is set and tcp_sodirect is not + * NULL so points to an sodirect_t and if marked enabled then we enqueue + * all mblk_t's directly to the socket. + * + * Further, if the sodirect_t sod_uioa and if marked enabled (due to a + * blocking socket read, e.g. user-land read, recv, ...) then an asynchronous + * copyout will be started directly to the user-land uio buffer. Also, as we + * have a pending read, TCP's push logic can take into account the number of + * bytes to be received and only awake the blocked read()er when the uioa_t + * byte count has been satisfied. + * * IPsec notes : * * Since a packet is always executed on the correct TCP perimeter @@ -246,6 +265,37 @@ squeue_func_t tcp_squeue_close_proc; squeue_func_t tcp_squeue_wput_proc; /* + * Macros for sodirect: + * + * SOD_PTR_ENTER(tcp, sodp) - for the tcp_t pointer "tcp" set the + * sodirect_t pointer "sodp" to the socket/tcp shared sodirect_t + * if it exists and is enabled, else to NULL. Note, in the current + * sodirect implementation the sod_lock must not be held across any + * STREAMS call (e.g. putnext) else a "recursive mutex_enter" PANIC + * will result as sod_lock is the streamhead stdata.sd_lock. + * + * SOD_NOT_ENABLED(tcp) - return true if not a sodirect tcp_t or the + * sodirect_t isn't enabled, usefull for ASSERT()ing that a recieve + * side tcp code path dealing with a tcp_rcv_list or putnext() isn't + * being used when sodirect code paths should be. 
+ */ + +#define SOD_PTR_ENTER(tcp, sodp) \ + (sodp) = (tcp)->tcp_sodirect; \ + \ + if ((sodp) != NULL) { \ + mutex_enter((sodp)->sod_lock); \ + if (!((sodp)->sod_state & SOD_ENABLED)) { \ + mutex_exit((sodp)->sod_lock); \ + (sodp) = NULL; \ + } \ + } + +#define SOD_NOT_ENABLED(tcp) \ + ((tcp)->tcp_sodirect == NULL || \ + !((tcp)->tcp_sodirect->sod_state & SOD_ENABLED)) + +/* * This controls how tiny a write must be before we try to copy it * into the the mblk on the tail of the transmit queue. Not much * speedup is observed for values larger than sixteen. Zero will @@ -3808,6 +3858,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) mblk_t *mp; queue_t *q; tcp_stack_t *tcps = tcp->tcp_tcps; + sodirect_t *sodp; TCP_CLD_STAT(tag); @@ -3872,6 +3923,13 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) return (-1); } + /* If sodirect, not anymore */ + SOD_PTR_ENTER(tcp, sodp); + if (sodp != NULL) { + tcp->tcp_sodirect = NULL; + mutex_exit(sodp->sod_lock); + } + q = tcp->tcp_rq; /* Trash all inbound data */ @@ -4236,6 +4294,11 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) */ /* FALLTHRU */ default: + if (tcp->tcp_sodirect != NULL) { + /* Ok, no more sodirect */ + tcp->tcp_sodirect = NULL; + } + if (tcp->tcp_fused) tcp_unfuse(tcp); @@ -6381,6 +6444,15 @@ tcp_connect(tcp_t *tcp, mblk_t *mp) *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport; } + if (tcp->tcp_issocket) { + /* + * TCP is _D_SODIRECT and sockfs is directly above so save + * the shared sonode sodirect_t pointer (if any) to enable + * TCP sodirect. 
+ */ + tcp->tcp_sodirect = SOD_QTOSODP(tcp->tcp_rq); + } + switch (tcp->tcp_state) { case TCPS_IDLE: /* @@ -8190,6 +8262,9 @@ tcp_reinit_values(tcp) ASSERT(!tcp->tcp_kssl_pending); PRESERVE(tcp->tcp_kssl_ent); + /* Sodirect */ + tcp->tcp_sodirect = NULL; + tcp->tcp_closemp_used = B_FALSE; #ifdef DEBUG @@ -8282,6 +8357,9 @@ tcp_init_values(tcp_t *tcp) tcp->tcp_fuse_rcv_unread_hiwater = 0; tcp->tcp_fuse_rcv_unread_cnt = 0; + /* Sodirect */ + tcp->tcp_sodirect = NULL; + /* Initialize the header template */ if (tcp->tcp_ipversion == IPV4_VERSION) { err = tcp_header_init_ipv4(tcp); @@ -11680,6 +11758,9 @@ tcp_rcv_drain(queue_t *q, tcp_t *tcp) if (tcp->tcp_listener != NULL) return (ret); + /* Can't be sodirect enabled */ + ASSERT(SOD_NOT_ENABLED(tcp)); + /* * Handle two cases here: we are currently fused or we were * previously fused and have some urgent data to be delivered @@ -11779,6 +11860,216 @@ tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len) } /* + * The tcp_rcv_sod_XXX() functions enqueue data directly to the socket + * above, in addition when uioa is enabled schedule an asynchronous uio + * prior to enqueuing. They implement the combined semantics of the + * tcp_rcv_XXX() functions, tcp_rcv_list push logic, and STREAMS putnext() + * canputnext(), i.e. flow-control with backenable. + * + * tcp_sod_wakeup() is called where tcp_rcv_drain() would be called in the + * non sodirect connection but as there are no tcp_rcv_list mblk_t's we deal + * with the rcv_wnd and push timer and call the sodirect wakeup function. + * + * Must be called with sodp->sod_lock held and will return with the lock + * released.
+ */ +static uint_t +tcp_rcv_sod_wakeup(tcp_t *tcp, sodirect_t *sodp) +{ + queue_t *q = tcp->tcp_rq; + uint_t thwin; + tcp_stack_t *tcps = tcp->tcp_tcps; + uint_t ret = 0; + + /* Can't be an eager connection */ + ASSERT(tcp->tcp_listener == NULL); + + /* Caller must have lock held */ + ASSERT(MUTEX_HELD(sodp->sod_lock)); + + /* Sodirect mode so must not be a tcp_rcv_list */ + ASSERT(tcp->tcp_rcv_list == NULL); + + if (SOD_QFULL(sodp)) { + /* Q is full, mark Q for need backenable */ + SOD_QSETBE(sodp); + } + /* Last advertised rwnd, i.e. rwnd last sent in a packet */ + thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)) + << tcp->tcp_rcv_ws; + /* This is peer's calculated send window (our available rwnd). */ + thwin -= tcp->tcp_rnxt - tcp->tcp_rack; + /* + * Increase the receive window to max. But we need to do receiver + * SWS avoidance. This means that we need to check the increase + * of receive window is at least 1 MSS. + */ + if (!SOD_QFULL(sodp) && (q->q_hiwat - thwin >= tcp->tcp_mss)) { + /* + * If the window that the other side knows is less than max + * deferred acks segments, send an update immediately. + */ + if (thwin < tcp->tcp_rack_cur_max * tcp->tcp_mss) { + BUMP_MIB(&tcps->tcps_mib, tcpOutWinUpdate); + ret = TH_ACK_NEEDED; + } + tcp->tcp_rwnd = q->q_hiwat; + } + + if (!SOD_QEMPTY(sodp)) { + /* Wakeup to socket */ + sodp->sod_state &= SOD_WAKE_CLR; + sodp->sod_state |= SOD_WAKE_DONE; + (sodp->sod_wakeup)(sodp); + /* wakeup() does the mutex_exit() */ + } else { + /* Q is empty, no need to wake */ + sodp->sod_state &= SOD_WAKE_CLR; + sodp->sod_state |= SOD_WAKE_NOT; + mutex_exit(sodp->sod_lock); + } + + /* No need for the push timer now. */ + if (tcp->tcp_push_tid != 0) { + (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid); + tcp->tcp_push_tid = 0; + } + + return (ret); +} + +/* + * Called where tcp_rcv_enqueue()/putnext(RD(q)) would be.
For M_DATA + * mblk_t's if uioa enabled then start a uioa asynchronous copy directly + * to the user-land buffer and flag the mblk_t as such. + * + * Also, handle tcp_rwnd. + */ +uint_t +tcp_rcv_sod_enqueue(tcp_t *tcp, sodirect_t *sodp, mblk_t *mp, uint_t seg_len) +{ + uioa_t *uioap = &sodp->sod_uioa; + boolean_t qfull; + uint_t thwin; + + /* Can't be an eager connection */ + ASSERT(tcp->tcp_listener == NULL); + + /* Caller must have lock held */ + ASSERT(MUTEX_HELD(sodp->sod_lock)); + + /* Sodirect mode so must not be a tcp_rcv_list */ + ASSERT(tcp->tcp_rcv_list == NULL); + + /* Passed in segment length must be equal to mblk_t chain data size */ + ASSERT(seg_len == msgdsize(mp)); + + if (DB_TYPE(mp) != M_DATA) { + /* Only process M_DATA mblk_t's */ + goto enq; + } + if (uioap->uioa_state & UIOA_ENABLED) { + /* Uioa is enabled */ + mblk_t *mp1 = mp; + + if (seg_len > uioap->uio_resid) { + /* + * There isn't enough uio space for the mblk_t chain + * so disable uioa such that this and any additional + * mblk_t data is handled by the socket and schedule + * the socket for wakeup to finish this uioa. + */ + uioap->uioa_state &= UIOA_CLR; + uioap->uioa_state |= UIOA_FINI; + if (sodp->sod_state & SOD_WAKE_NOT) { + sodp->sod_state &= SOD_WAKE_CLR; + sodp->sod_state |= SOD_WAKE_NEED; + } + goto enq; + } + do { + uint32_t len = MBLKL(mp1); + + if (!uioamove(mp1->b_rptr, len, UIO_READ, uioap)) { + /* Scheduled, mark dblk_t as such */ + DB_FLAGS(mp1) |= DBLK_UIOA; + } else { + /* Error, turn off async processing */ + uioap->uioa_state &= UIOA_CLR; + uioap->uioa_state |= UIOA_FINI; + break; + } + } while ((mp1 = mp1->b_cont) != NULL); + + if (mp1 != NULL || uioap->uio_resid == 0) { + /* + * Not all mblk_t(s) uioamoved (error) or all uio + * space has been consumed so schedule the socket + * for wakeup to finish this uio. 
+ */ + sodp->sod_state &= SOD_WAKE_CLR; + sodp->sod_state |= SOD_WAKE_NEED; + } + } else if (uioap->uioa_state & UIOA_FINI) { + /* + * Post UIO_ENABLED waiting for socket to finish processing + * so just enqueue and update tcp_rwnd. + */ + if (SOD_QFULL(sodp)) + tcp->tcp_rwnd -= seg_len; + } else if (sodp->sod_want > 0) { + /* + * Uioa isn't enabled but sodirect has a pending read(). + */ + if (SOD_QCNT(sodp) + seg_len >= sodp->sod_want) { + if (sodp->sod_state & SOD_WAKE_NOT) { + /* Schedule socket for wakeup */ + sodp->sod_state &= SOD_WAKE_CLR; + sodp->sod_state |= SOD_WAKE_NEED; + } + tcp->tcp_rwnd -= seg_len; + } + } else if (SOD_QCNT(sodp) + seg_len >= tcp->tcp_rq->q_hiwat >> 3) { + /* + * No pending sodirect read() so used the default + * TCP push logic to guess that a push is needed. + */ + if (sodp->sod_state & SOD_WAKE_NOT) { + /* Schedule socket for wakeup */ + sodp->sod_state &= SOD_WAKE_CLR; + sodp->sod_state |= SOD_WAKE_NEED; + } + tcp->tcp_rwnd -= seg_len; + } else { + /* Just update tcp_rwnd */ + tcp->tcp_rwnd -= seg_len; + } +enq: + qfull = SOD_QFULL(sodp); + + (sodp->sod_enqueue)(sodp, mp); + + if (! qfull && SOD_QFULL(sodp)) { + /* Wasn't QFULL, now QFULL, need back-enable */ + SOD_QSETBE(sodp); + } + + /* + * Check to see if remote avail swnd < mss due to delayed ACK, + * first get advertised rwnd. + */ + thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)); + /* Minus delayed ACK count */ + thwin -= tcp->tcp_rnxt - tcp->tcp_rack; + if (thwin < tcp->tcp_mss) { + /* Remote avail swnd < mss, need ACK now */ + return (TH_ACK_NEEDED); + } + + return (0); +} + +/* * DEFAULT TCP ENTRY POINT via squeue on READ side. * * This is the default entry function into TCP on the read side. 
TCP is @@ -14976,13 +15267,39 @@ est: tcp_rcv_enqueue(tcp, mp, seg_len); } } else { + sodirect_t *sodp = tcp->tcp_sodirect; + + /* + * If an sodirect connection and an enabled sodirect_t then + * sodp will be set to point to the tcp_t/sonode_t shared + * sodirect_t and the sodirect_t's lock will be held. + */ + if (sodp != NULL) { + mutex_enter(sodp->sod_lock); + if (!(sodp->sod_state & SOD_ENABLED)) { + mutex_exit(sodp->sod_lock); + sodp = NULL; + } else if (tcp->tcp_kssl_ctx != NULL && + DB_TYPE(mp) == M_DATA) { + mutex_exit(sodp->sod_lock); + sodp = NULL; + } + } if (mp->b_datap->db_type != M_DATA || (flags & TH_MARKNEXT_NEEDED)) { - if (tcp->tcp_rcv_list != NULL) { + if (sodp != NULL) { + if (!SOD_QEMPTY(sodp) && + (sodp->sod_state & SOD_WAKE_NOT)) { + flags |= tcp_rcv_sod_wakeup(tcp, sodp); + /* sod_wakeup() did the mutex_exit() */ + mutex_enter(sodp->sod_lock); + } + } else if (tcp->tcp_rcv_list != NULL) { flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); } ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); + if (flags & TH_MARKNEXT_NEEDED) { #ifdef DEBUG (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, @@ -15001,10 +15318,42 @@ est: mblk_t *, mp); tcp_kssl_input(tcp, mp); } else { + if (sodp) { + /* + * Done with sodirect, use putnext + * to push this non M_DATA headed + * mblk_t chain. + */ + mutex_exit(sodp->sod_lock); + } putnext(tcp->tcp_rq, mp); if (!canputnext(tcp->tcp_rq)) tcp->tcp_rwnd -= seg_len; } + } else if ((tcp->tcp_kssl_ctx != NULL) && + (DB_TYPE(mp) == M_DATA)) { + /* Do SSL processing first */ + DTRACE_PROBE1(kssl_mblk__ksslinput_data2, + mblk_t *, mp); + tcp_kssl_input(tcp, mp); + } else if (sodp != NULL) { + /* + * Sodirect so all mblk_t's are queued on the + * socket directly, check for wakeup of blocked + * reader (if any), and last if flow-controled. 
+ */ + flags |= tcp_rcv_sod_enqueue(tcp, sodp, mp, seg_len); + if ((sodp->sod_state & SOD_WAKE_NEED) || + (flags & (TH_PUSH|TH_FIN))) { + flags |= tcp_rcv_sod_wakeup(tcp, sodp); + /* sod_wakeup() did the mutex_exit() */ + } else { + if (SOD_QFULL(sodp)) { + /* Q is full, need backenable */ + SOD_QSETBE(sodp); + } + mutex_exit(sodp->sod_lock); + } } else if ((flags & (TH_PUSH|TH_FIN)) || tcp->tcp_rcv_cnt + seg_len >= tcp->tcp_rq->q_hiwat >> 3) { if (tcp->tcp_rcv_list != NULL) { @@ -15024,41 +15373,33 @@ est: tcp_rcv_enqueue(tcp, mp, seg_len); flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); } else { - /* Does this need SSL processing first? */ - if ((tcp->tcp_kssl_ctx != NULL) && - (DB_TYPE(mp) == M_DATA)) { - DTRACE_PROBE1( - kssl_mblk__ksslinput_data2, - mblk_t *, mp); - tcp_kssl_input(tcp, mp); - } else { - putnext(tcp->tcp_rq, mp); - if (!canputnext(tcp->tcp_rq)) - tcp->tcp_rwnd -= seg_len; - } + putnext(tcp->tcp_rq, mp); + if (!canputnext(tcp->tcp_rq)) + tcp->tcp_rwnd -= seg_len; } } else { /* * Enqueue all packets when processing an mblk * from the co queue and also enqueue normal packets. - * For packets which belong to SSL stream do SSL - * processing first. */ - if ((tcp->tcp_kssl_ctx != NULL) && - (DB_TYPE(mp) == M_DATA)) { - DTRACE_PROBE1(kssl_mblk__tcpksslin3, - mblk_t *, mp); - tcp_kssl_input(tcp, mp); - } else { - tcp_rcv_enqueue(tcp, mp, seg_len); - } + tcp_rcv_enqueue(tcp, mp, seg_len); } /* * Make sure the timer is running if we have data waiting * for a push bit. This provides resiliency against * implementations that do not correctly generate push bits. + * + * Note, for sodirect if Q isn't empty and there's not a + * pending wakeup then we need a timer. Also note that sodp + * is assumed to be still valid after exit()ing the sod_lock + * above and while the SOD state can change it can only change + * such that the Q is empty now even though data was added + * above. 
*/ - if (tcp->tcp_rcv_list != NULL && tcp->tcp_push_tid == 0) { + if (((sodp != NULL && !SOD_QEMPTY(sodp) && + (sodp->sod_state & SOD_WAKE_NOT)) || + (sodp == NULL && tcp->tcp_rcv_list != NULL)) && + tcp->tcp_push_tid == 0) { /* * The connection may be closed at this point, so don't * do anything for a detached tcp. @@ -15070,6 +15411,7 @@ est: tcps->tcps_push_timer_interval)); } } + xmit_check: /* Is there anything left to do? */ ASSERT(!(flags & TH_MARKNEXT_NEEDED)); @@ -15145,13 +15487,26 @@ ack_check: /* * Send up any queued data and then send the mark message */ - if (tcp->tcp_rcv_list != NULL) { - flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); - } - ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); + sodirect_t *sodp; + + SOD_PTR_ENTER(tcp, sodp); mp1 = tcp->tcp_urp_mark_mp; tcp->tcp_urp_mark_mp = NULL; + if (sodp != NULL) { + + ASSERT(tcp->tcp_rcv_list == NULL); + + flags |= tcp_rcv_sod_wakeup(tcp, sodp); + /* sod_wakeup() does the mutex_exit() */ + } else if (tcp->tcp_rcv_list != NULL) { + flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); + + ASSERT(tcp->tcp_rcv_list == NULL || + tcp->tcp_fused_sigurg); + + } + putnext(tcp->tcp_rq, mp1); #ifdef DEBUG (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_rput: sending zero-length %s %s", @@ -15159,7 +15514,6 @@ ack_check: "MSGNOTMARKNEXT"), tcp_display(tcp, NULL, DISP_PORT_ONLY)); #endif /* DEBUG */ - putnext(tcp->tcp_rq, mp1); flags &= ~TH_SEND_URP_MARK; } if (flags & TH_ACK_NEEDED) { @@ -15197,14 +15551,32 @@ ack_check: * In the eager case tcp_rsrv will do this when run * after tcp_accept is done. 
*/ + sodirect_t *sodp; + ASSERT(tcp->tcp_listener == NULL); - if (tcp->tcp_rcv_list != NULL) { + + SOD_PTR_ENTER(tcp, sodp); + if (sodp != NULL) { + /* No more sodirect */ + tcp->tcp_sodirect = NULL; + if (!SOD_QEMPTY(sodp)) { + /* Mblk(s) to process, notify */ + flags |= tcp_rcv_sod_wakeup(tcp, sodp); + /* sod_wakeup() does the mutex_exit() */ + } else { + /* Nothing to process */ + mutex_exit(sodp->sod_lock); + } + } else if (tcp->tcp_rcv_list != NULL) { /* * Push any mblk(s) enqueued from co processing. */ flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); + + ASSERT(tcp->tcp_rcv_list == NULL || + tcp->tcp_fused_sigurg); } - ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); + if ((mp1 = mi_tpi_ordrel_ind()) != NULL) { tcp->tcp_ordrel_done = B_TRUE; putnext(tcp->tcp_rq, mp1); @@ -15974,6 +16346,8 @@ tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2) queue_t *q = tcp->tcp_rq; uint_t thwin; tcp_stack_t *tcps = tcp->tcp_tcps; + sodirect_t *sodp; + boolean_t fc; freeb(mp); @@ -16024,7 +16398,27 @@ tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2) return; } - if (canputnext(q)) { + SOD_PTR_ENTER(tcp, sodp); + if (sodp != NULL) { + /* An sodirect connection */ + if (SOD_QFULL(sodp)) { + /* Flow-controlled, need another back-enable */ + fc = B_TRUE; + SOD_QSETBE(sodp); + } else { + /* Not flow-controlled */ + fc = B_FALSE; + } + mutex_exit(sodp->sod_lock); + } else if (canputnext(q)) { + /* STREAMS, not flow-controlled */ + fc = B_FALSE; + } else { + /* STREAMS, flow-controlled */ + fc = B_TRUE; + } + if (!fc) { + /* Not flow-controlled, open rwnd */ tcp->tcp_rwnd = q->q_hiwat; thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)) << tcp->tcp_rcv_ws; @@ -16043,13 +16437,32 @@ tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2) BUMP_MIB(&tcps->tcps_mib, tcpOutWinUpdate); } } + /* Handle a failure to allocate a T_ORDREL_IND here */ if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) { ASSERT(tcp->tcp_listener == NULL); - if (tcp->tcp_rcv_list != NULL) { - (void) tcp_rcv_drain(q, 
tcp); + + SOD_PTR_ENTER(tcp, sodp); + if (sodp != NULL) { + /* No more sodirect */ + tcp->tcp_sodirect = NULL; + if (!SOD_QEMPTY(sodp)) { + /* Notify mblk(s) to process */ + (void) tcp_rcv_sod_wakeup(tcp, sodp); + /* sod_wakeup() does the mutex_exit() */ + } else { + /* Nothing to process */ + mutex_exit(sodp->sod_lock); + } + } else if (tcp->tcp_rcv_list != NULL) { + /* + * Push any mblk(s) enqueued from co processing. + */ + (void) tcp_rcv_drain(tcp->tcp_rq, tcp); + ASSERT(tcp->tcp_rcv_list == NULL || + tcp->tcp_fused_sigurg); } - ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); + mp = mi_tpi_ordrel_ind(); if (mp) { tcp->tcp_ordrel_done = B_TRUE; @@ -18097,6 +18510,8 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) */ if (tcp->tcp_rcv_list != NULL) { /* We drain directly in case of fused tcp loopback */ + sodirect_t *sodp; + if (!tcp->tcp_fused && canputnext(q)) { tcp->tcp_rwnd = q->q_hiwat; thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)) @@ -18112,7 +18527,26 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) } } - (void) tcp_rcv_drain(q, tcp); + + SOD_PTR_ENTER(tcp, sodp); + if (sodp != NULL) { + /* Sodirect, move from rcv_list */ + ASSERT(!tcp->tcp_fused); + while ((mp = tcp->tcp_rcv_list) != NULL) { + tcp->tcp_rcv_list = mp->b_next; + mp->b_next = NULL; + (void) tcp_rcv_sod_enqueue(tcp, sodp, mp, + msgdsize(mp)); + } + tcp->tcp_rcv_last_head = NULL; + tcp->tcp_rcv_last_tail = NULL; + tcp->tcp_rcv_cnt = 0; + (void) tcp_rcv_sod_wakeup(tcp, sodp); + /* sod_wakeup() did the mutex_exit() */ + } else { + /* Not sodirect, drain */ + (void) tcp_rcv_drain(q, tcp); + } /* * For fused tcp loopback, back-enable peer endpoint @@ -18304,6 +18738,21 @@ tcp_wput_accept(queue_t *q, mblk_t *mp) listener = eager->tcp_listener; eager->tcp_issocket = B_TRUE; + /* + * TCP is _D_SODIRECT and sockfs is directly above so + * save shared sodirect_t pointer (if any). + * + * If tcp_fused and sodirect enabled disable it. 
+ */ + eager->tcp_sodirect = SOD_QTOSODP(eager->tcp_rq); + if (eager->tcp_fused && eager->tcp_sodirect != NULL) { + /* Fused, disable sodirect */ + mutex_enter(eager->tcp_sodirect->sod_lock); + SOD_DISABLE(eager->tcp_sodirect); + mutex_exit(eager->tcp_sodirect->sod_lock); + eager->tcp_sodirect = NULL; + } + econnp->conn_zoneid = listener->tcp_connp->conn_zoneid; econnp->conn_allzones = listener->tcp_connp->conn_allzones; ASSERT(econnp->conn_netstack == @@ -22140,6 +22589,7 @@ tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2) tcp_fuse_disable_pair(tcp, B_FALSE); } tcp->tcp_issocket = B_FALSE; + tcp->tcp_sodirect = NULL; TCP_STAT(tcps, tcp_sock_fallback); DB_TYPE(mp) = M_IOCACK; @@ -23420,6 +23870,8 @@ tcp_push_timer(void *arg) conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; tcp_stack_t *tcps = tcp->tcp_tcps; + uint_t flags; + sodirect_t *sodp; TCP_DBGSTAT(tcps, tcp_push_timer_cnt); @@ -23431,9 +23883,17 @@ tcp_push_timer(void *arg) */ TCP_FUSE_SYNCSTR_PLUG_DRAIN(tcp); tcp->tcp_push_tid = 0; - if ((tcp->tcp_rcv_list != NULL) && - (tcp_rcv_drain(tcp->tcp_rq, tcp) == TH_ACK_NEEDED)) + + SOD_PTR_ENTER(tcp, sodp); + if (sodp != NULL) { + flags = tcp_rcv_sod_wakeup(tcp, sodp); + /* sod_wakeup() does the mutex_exit() */ + } else if (tcp->tcp_rcv_list != NULL) { + flags = tcp_rcv_drain(tcp->tcp_rq, tcp); + } + if (flags == TH_ACK_NEEDED) tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); + TCP_FUSE_SYNCSTR_UNPLUG_DRAIN(tcp); } diff --git a/usr/src/uts/common/inet/tcp/tcp6ddi.c b/usr/src/uts/common/inet/tcp/tcp6ddi.c index e724bdd022..1eda50d9a6 100644 --- a/usr/src/uts/common/inet/tcp/tcp6ddi.c +++ b/usr/src/uts/common/inet/tcp/tcp6ddi.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -40,7 +40,7 @@ * for TCP Fusion (loopback); this is why we don't define * D_SYNCSTR here. 
*/ -#define INET_DEVMTFLAGS (D_MP|_D_DIRECT) +#define INET_DEVMTFLAGS (D_MP|_D_DIRECT|_D_SODIRECT) #include "../inetddi.c" diff --git a/usr/src/uts/common/inet/tcp/tcp_fusion.c b/usr/src/uts/common/inet/tcp/tcp_fusion.c index 2503a13e29..5e2a8b23cb 100644 --- a/usr/src/uts/common/inet/tcp/tcp_fusion.c +++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c @@ -287,6 +287,20 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph) if ((mp = allocb(sizeof (*stropt), BPRI_HI)) == NULL) goto failed; + /* If either tcp or peer_tcp sodirect enabled then disable */ + if (tcp->tcp_sodirect != NULL) { + mutex_enter(tcp->tcp_sodirect->sod_lock); + SOD_DISABLE(tcp->tcp_sodirect); + mutex_exit(tcp->tcp_sodirect->sod_lock); + tcp->tcp_sodirect = NULL; + } + if (peer_tcp->tcp_sodirect != NULL) { + mutex_enter(peer_tcp->tcp_sodirect->sod_lock); + SOD_DISABLE(peer_tcp->tcp_sodirect); + mutex_exit(peer_tcp->tcp_sodirect->sod_lock); + peer_tcp->tcp_sodirect = NULL; + } + /* Fuse both endpoints */ peer_tcp->tcp_loopback_peer = tcp; tcp->tcp_loopback_peer = peer_tcp; diff --git a/usr/src/uts/common/inet/tcp/tcpddi.c b/usr/src/uts/common/inet/tcp/tcpddi.c index 436786b846..dc08ad23c4 100644 --- a/usr/src/uts/common/inet/tcp/tcpddi.c +++ b/usr/src/uts/common/inet/tcp/tcpddi.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -44,7 +44,7 @@ * for TCP Fusion (loopback); this is why we don't define * D_SYNCSTR here. 
*/ -#define INET_DEVMTFLAGS (D_MP|_D_DIRECT) +#define INET_DEVMTFLAGS (D_MP|_D_DIRECT|_D_SODIRECT) #include "../inetddi.c" diff --git a/deleted_files/usr/src/uts/common/io/dcopy.c b/usr/src/uts/common/io/dcopy.c index 2dc5a311bc..02163c7e9e 100644 --- a/deleted_files/usr/src/uts/common/io/dcopy.c +++ b/usr/src/uts/common/io/dcopy.c @@ -689,6 +689,10 @@ dcopy_device_register(void *device_private, dcopy_device_info_t *info, mutex_exit(&dcopy_statep->d_globalchan_list.dl_mutex); *handle = device; + + /* last call-back into kernel for dcopy KAPI enabled */ + uioa_dcopy_enable(); + return (DCOPY_SUCCESS); registerfail_alloc: @@ -723,6 +727,8 @@ dcopy_device_unregister(dcopy_device_handle_t *handle) dcopy_device_handle_t device; boolean_t device_busy; + /* first call-back into kernel for dcopy KAPI disable */ + uioa_dcopy_disable(); device = *handle; device_busy = B_FALSE; diff --git a/usr/src/uts/common/io/stream.c b/usr/src/uts/common/io/stream.c index 28a9a4928f..90fbf3cbf1 100644 --- a/usr/src/uts/common/io/stream.c +++ b/usr/src/uts/common/io/stream.c @@ -23,7 +23,7 @@ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -318,8 +318,8 @@ streams_msg_init(void) int offset; mblk_cache = kmem_cache_create("streams_mblk", - sizeof (mblk_t), 32, NULL, NULL, NULL, NULL, NULL, - mblk_kmem_flags); + sizeof (mblk_t), 32, NULL, NULL, NULL, NULL, NULL, + mblk_kmem_flags); for (sizep = dblk_sizes; (size = *sizep) != 0; sizep++) { @@ -330,7 +330,7 @@ streams_msg_init(void) */ tot_size = size + sizeof (dblk_t); ASSERT((offset + sizeof (dblk_t) + sizeof (kmem_slab_t)) - < PAGESIZE); + < PAGESIZE); ASSERT((tot_size & (DBLK_CACHE_ALIGN - 1)) == 0); } else { @@ -346,9 +346,9 @@ streams_msg_init(void) (void) sprintf(name, "streams_dblk_%ld", size); cp = kmem_cache_create(name, tot_size, - DBLK_CACHE_ALIGN, dblk_constructor, - dblk_destructor, NULL, - (void *)(size), NULL, dblk_kmem_flags); + DBLK_CACHE_ALIGN, dblk_constructor, + dblk_destructor, NULL, + (void *)(size), NULL, dblk_kmem_flags); while (lastsize <= size) { dblk_cache[(lastsize - 1) >> DBLK_SIZE_SHIFT] = cp; @@ -357,13 +357,13 @@ streams_msg_init(void) } dblk_esb_cache = kmem_cache_create("streams_dblk_esb", - sizeof (dblk_t), DBLK_CACHE_ALIGN, - dblk_esb_constructor, dblk_destructor, NULL, - (void *) sizeof (dblk_t), NULL, dblk_kmem_flags); + sizeof (dblk_t), DBLK_CACHE_ALIGN, + dblk_esb_constructor, dblk_destructor, NULL, + (void *) sizeof (dblk_t), NULL, dblk_kmem_flags); fthdr_cache = kmem_cache_create("streams_fthdr", - sizeof (fthdr_t), 32, NULL, NULL, NULL, NULL, NULL, 0); + sizeof (fthdr_t), 32, NULL, NULL, NULL, NULL, NULL, 0); ftblk_cache = kmem_cache_create("streams_ftblk", - sizeof (ftblk_t), 32, NULL, NULL, NULL, NULL, NULL, 0); + sizeof (ftblk_t), 32, NULL, NULL, NULL, NULL, NULL, 0); /* Initialize Multidata caches */ mmd_init(); @@ -545,8 +545,8 @@ dblk_lastfree(mblk_t *mp, dblk_t *dbp) dbp->db_struioflag = 0; dbp->db_struioun.cksum.flags = 0; - /* and the COOKED flag */ - dbp->db_flags &= ~DBLK_COOKED; + /* and the COOKED and/or UIOA flag(s) */ + dbp->db_flags &= ~(DBLK_COOKED | DBLK_UIOA); 
kmem_cache_free(dbp->db_cache, dbp); } @@ -739,7 +739,7 @@ desballoc(unsigned char *base, size_t size, uint_t pri, frtn_t *frp) */ if (!str_ftnever) { mp = gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0), - frp, dblk_lastfree_desb, KM_NOSLEEP); + frp, dblk_lastfree_desb, KM_NOSLEEP); if (mp != NULL) STR_FTALLOC(&DB_FTHDR(mp), FTEV_DESBALLOC, size); @@ -857,7 +857,7 @@ bcache_create(char *name, size_t size, uint_t align) (void) sprintf(buffer, "%s_dblk_cache", name); bcp->dblk_cache = kmem_cache_create(buffer, sizeof (dblk_t), DBLK_CACHE_ALIGN, bcache_dblk_constructor, bcache_dblk_destructor, - NULL, (void *)bcp, NULL, 0); + NULL, (void *)bcp, NULL, 0); return (bcp); } @@ -1584,7 +1584,7 @@ adjmsg(mblk_t *mp, ssize_t len) */ if ((save_bp != mp) && - (save_bp->b_wptr == save_bp->b_rptr)) { + (save_bp->b_wptr == save_bp->b_rptr)) { bcont = save_bp->b_cont; freeb(save_bp); prev_bp->b_cont = bcont; @@ -2129,8 +2129,8 @@ flushband(queue_t *q, unsigned char pri, int flag) nmp = mp->b_next; mp->b_next = mp->b_prev = NULL; if ((mp->b_band == 0) && - ((flag == FLUSHALL) || - datamsg(mp->b_datap->db_type))) + ((flag == FLUSHALL) || + datamsg(mp->b_datap->db_type))) freemsg(mp); else (void) putq(q, mp); @@ -2242,7 +2242,7 @@ bcanput(queue_t *q, unsigned char pri) q->q_flag |= QWANTW; mutex_exit(QLOCK(q)); TRACE_3(TR_FAC_STREAMS_FR, TR_BCANPUT_OUT, - "bcanput:%p %X %d", q, pri, 0); + "bcanput:%p %X %d", q, pri, 0); return (0); } } else { /* pri != 0 */ @@ -2252,7 +2252,7 @@ bcanput(queue_t *q, unsigned char pri) */ mutex_exit(QLOCK(q)); TRACE_3(TR_FAC_STREAMS_FR, TR_BCANPUT_OUT, - "bcanput:%p %X %d", q, pri, 1); + "bcanput:%p %X %d", q, pri, 1); return (1); } qbp = q->q_bandp; @@ -2262,13 +2262,13 @@ bcanput(queue_t *q, unsigned char pri) qbp->qb_flag |= QB_WANTW; mutex_exit(QLOCK(q)); TRACE_3(TR_FAC_STREAMS_FR, TR_BCANPUT_OUT, - "bcanput:%p %X %d", q, pri, 0); + "bcanput:%p %X %d", q, pri, 0); return (0); } } mutex_exit(QLOCK(q)); TRACE_3(TR_FAC_STREAMS_FR, TR_BCANPUT_OUT, - 
"bcanput:%p %X %d", q, pri, 1); + "bcanput:%p %X %d", q, pri, 1); return (1); } @@ -2847,7 +2847,7 @@ putnextctl1(queue_t *q, int type, int param) mblk_t *bp; if ((datamsg(type) && (type != M_DELAY)) || - ((bp = allocb_tryhard(1)) == NULL)) + ((bp = allocb_tryhard(1)) == NULL)) return (0); bp->b_datap->db_type = (unsigned char)type; @@ -2864,7 +2864,7 @@ putnextctl(queue_t *q, int type) mblk_t *bp; if ((datamsg(type) && (type != M_DELAY)) || - ((bp = allocb_tryhard(0)) == NULL)) + ((bp = allocb_tryhard(0)) == NULL)) return (0); bp->b_datap->db_type = (unsigned char)type; diff --git a/usr/src/uts/common/os/move.c b/usr/src/uts/common/os/move.c index d5c63b167e..163a4cc2e5 100644 --- a/usr/src/uts/common/os/move.c +++ b/usr/src/uts/common/os/move.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -45,6 +44,16 @@ #include <sys/systm.h> #include <sys/uio.h> #include <sys/errno.h> +#include <sys/vmsystm.h> +#include <sys/cmn_err.h> +#include <vm/as.h> +#include <vm/page.h> + +#include <sys/dcopy.h> + +int64_t uioa_maxpoll = -1; /* <0 = noblock, 0 = block, >0 = block after */ +#define UIO_DCOPY_CHANNEL 0 +#define UIO_DCOPY_CMD 1 /* * Move "n" bytes at byte address "p"; "rw" indicates the direction @@ -277,3 +286,386 @@ uiodup(uio_t *suio, uio_t *duio, iovec_t *diov, int diov_cnt) duio->uio_iov = diov; return (0); } + +/* + * Shadow state for checking if a platform has hardware asynchronous + * copy capability and minimum copy size, e.g. Intel's I/OAT dma engine, + * + * Dcopy does a call-back to uioa_dcopy_enable() when a dma device calls + * into dcopy to register and uioa_dcopy_disable() when the device calls + * into dcopy to unregister. + */ +uioasync_t uioasync = {B_FALSE, 1024}; + +void +uioa_dcopy_enable() +{ + uioasync.enabled = B_TRUE; +} + +void +uioa_dcopy_disable() +{ + uioasync.enabled = B_FALSE; +} + +/* + * Schedule an asynchronous move of "n" bytes at byte address "p", + * "rw" indicates the direction of the move, I/O parameters and + * async state are provided in "uioa" which is update to reflect + * the data which is to be moved. + * + * Returns 0 on success or a non-zero errno on failure. + * + * Note, while the uioasync APIs are general purpose in design + * the current implementation is Intel I/OAT specific. 
+ */ +int +uioamove(void *p, size_t n, enum uio_rw rw, uioa_t *uioa) +{ + int soff, doff; + uint64_t pa; + int cnt; + iovec_t *iov; + dcopy_handle_t channel; + dcopy_cmd_t cmd; + int ret = 0; + int dcopy_flags; + + if (!(uioa->uioa_state & UIOA_ENABLED)) { + /* The uioa_t isn't enabled */ + return (ENXIO); + } + + if (uioa->uio_segflg != UIO_USERSPACE || rw != UIO_READ) { + /* Only support to user-land from kernel */ + return (ENOTSUP); + } + + + channel = uioa->uioa_hwst[UIO_DCOPY_CHANNEL]; + cmd = uioa->uioa_hwst[UIO_DCOPY_CMD]; + dcopy_flags = DCOPY_NOSLEEP; + + /* + * While source bytes and destination bytes. + */ + while (n > 0 && uioa->uio_resid > 0) { + iov = uioa->uio_iov; + if (iov->iov_len == 0l) { + uioa->uio_iov++; + uioa->uio_iovcnt--; + uioa->uioa_lcur++; + uioa->uioa_lppp = uioa->uioa_lcur->uioa_ppp; + continue; + } + /* + * While source bytes schedule an async + * dma for destination page by page. + */ + while (n > 0) { + /* Addr offset in page src/dst */ + soff = (uintptr_t)p & PAGEOFFSET; + doff = (uintptr_t)iov->iov_base & PAGEOFFSET; + /* Min copy count src and dst and page sized */ + cnt = MIN(n, iov->iov_len); + cnt = MIN(cnt, PAGESIZE - soff); + cnt = MIN(cnt, PAGESIZE - doff); + /* XXX if next page(s) contiguous could use multipage */ + + /* + * if we have an old command, we want to link all + * other commands to the next command we alloced so + * we only need to track the last command but can + * still free them all. 
+ */ + if (cmd != NULL) { + dcopy_flags |= DCOPY_ALLOC_LINK; + } + ret = dcopy_cmd_alloc(channel, dcopy_flags, &cmd); + if (ret != DCOPY_SUCCESS) { + /* Error of some sort */ + return (EIO); + } + uioa->uioa_hwst[UIO_DCOPY_CMD] = cmd; + + ASSERT(cmd->dp_version == DCOPY_CMD_V0); + if (uioa_maxpoll >= 0) { + /* Blocking (>0 may be) used in uioafini() */ + cmd->dp_flags = DCOPY_CMD_INTR; + } else { + /* Non blocking uioafini() so no intr */ + cmd->dp_flags = DCOPY_CMD_NOFLAGS; + } + cmd->dp_cmd = DCOPY_CMD_COPY; + pa = ptob((uint64_t)hat_getpfnum(kas.a_hat, p)); + cmd->dp.copy.cc_source = pa + soff; + if (uioa->uioa_lcur->uioa_pfncnt == 0) { + /* Have a (page_t **) */ + pa = ptob((uint64_t)( + *(page_t **)uioa->uioa_lppp)->p_pagenum); + } else { + /* Have a (pfn_t *) */ + pa = ptob((uint64_t)( + *(pfn_t *)uioa->uioa_lppp)); + } + cmd->dp.copy.cc_dest = pa + doff; + cmd->dp.copy.cc_size = cnt; + ret = dcopy_cmd_post(cmd); + if (ret != DCOPY_SUCCESS) { + /* Error of some sort */ + return (EIO); + } + ret = 0; + + /* If UIOA_POLL not set, set it */ + if (!(uioa->uioa_state & UIOA_POLL)) + uioa->uioa_state |= UIOA_POLL; + + /* Update iov, uio, and local pointers/counters */ + iov->iov_base += cnt; + iov->iov_len -= cnt; + uioa->uio_resid -= cnt; + uioa->uio_loffset += cnt; + p = (caddr_t)p + cnt; + n -= cnt; + + /* End of iovec? */ + if (iov->iov_len == 0) { + /* Yup, next iovec */ + break; + } + + /* Next dst addr page? */ + if (doff + cnt == PAGESIZE) { + /* Yup, next page_t */ + uioa->uioa_lppp++; + } + } + } + + return (ret); +} + +/* + * Initialize a uioa_t for a given uio_t for the current user context, + * copy the common uio_t to the uioa_t, walk the shared iovec_t and + * lock down the user-land page(s) containing iovec_t data, then mapin + * user-land pages using segkpm. 
+ */ +int +uioainit(uio_t *uiop, uioa_t *uioap) +{ + caddr_t addr; + page_t **pages; + int off; + int len; + proc_t *procp = ttoproc(curthread); + struct as *as = procp->p_as; + iovec_t *iov = uiop->uio_iov; + int32_t iovcnt = uiop->uio_iovcnt; + uioa_page_t *locked = uioap->uioa_locked; + dcopy_handle_t channel; + int error; + + if (! (uioap->uioa_state & UIOA_ALLOC)) { + /* Can only init() a freshly allocated uioa_t */ + return (EINVAL); + } + + error = dcopy_alloc(DCOPY_NOSLEEP, &channel); + if (error == DCOPY_NORESOURCES) { + /* Turn off uioa */ + uioasync.enabled = B_FALSE; + return (ENODEV); + } + if (error != DCOPY_SUCCESS) { + /* Alloc failed */ + return (EIO); + } + + uioap->uioa_hwst[UIO_DCOPY_CHANNEL] = channel; + uioap->uioa_hwst[UIO_DCOPY_CMD] = NULL; + + /* Indicate uioa_t (will be) initialized */ + uioap->uioa_state = UIOA_INIT; + + /* uio_t/uioa_t uio_t common struct copy */ + *((uio_t *)uioap) = *uiop; + + /* initialize *uiop->uio_iov */ + if (iovcnt > UIOA_IOV_MAX) { + /* Too big? */ + return (E2BIG); + } + uioap->uio_iov = iov; + uioap->uio_iovcnt = iovcnt; + + /* Mark the uioap as such */ + uioap->uio_extflg |= UIO_ASYNC; + + /* + * For each iovec_t, lock-down the page(s) backing the iovec_t + * and save the page_t list for phys addr use in uioamove(). + */ + iov = uiop->uio_iov; + iovcnt = uiop->uio_iovcnt; + while (iovcnt > 0) { + addr = iov->iov_base; + off = (uintptr_t)addr & PAGEOFFSET; + addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); + len = iov->iov_len + off; + + /* Lock down page(s) for the iov span */ + if ((error = as_pagelock(as, &pages, + iov->iov_base, iov->iov_len, S_WRITE)) != 0) { + /* Error */ + goto cleanup; + } + + if (pages == NULL) { + /* + * Need page_t list, really only need + * a pfn list so build one. 
+ */ + pfn_t *pfnp; + int pcnt = len >> PAGESHIFT; + + if (off) + pcnt++; + if ((pfnp = kmem_alloc(pcnt * sizeof (pfnp), + KM_NOSLEEP)) == NULL) { + error = ENOMEM; + goto cleanup; + } + locked->uioa_ppp = (void **)pfnp; + locked->uioa_pfncnt = pcnt; + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + while (pcnt-- > 0) { + *pfnp++ = hat_getpfnum(as->a_hat, addr); + addr += PAGESIZE; + } + AS_LOCK_EXIT(as, &as->a_lock); + } else { + /* Have a page_t list, save it */ + locked->uioa_ppp = (void **)pages; + locked->uioa_pfncnt = 0; + } + /* Save for as_pageunlock() in uioafini() */ + locked->uioa_base = iov->iov_base; + locked->uioa_len = iov->iov_len; + locked++; + + /* Next iovec_t */ + iov++; + iovcnt--; + } + /* Initialize curret pointer into uioa_locked[] and it's uioa_ppp */ + uioap->uioa_lcur = uioap->uioa_locked; + uioap->uioa_lppp = uioap->uioa_lcur->uioa_ppp; + return (0); + +cleanup: + /* Unlock any previously locked page_t(s) */ + while (locked > uioap->uioa_locked) { + locked--; + as_pageunlock(as, (page_t **)locked->uioa_ppp, + locked->uioa_base, locked->uioa_len, S_WRITE); + } + + /* Last indicate uioa_t still in alloc state */ + uioap->uioa_state = UIOA_ALLOC; + + return (error); +} + +/* + * Finish processing of a uioa_t by cleanup any pending "uioap" actions. + */ +int +uioafini(uio_t *uiop, uioa_t *uioap) +{ + int32_t iovcnt = uiop->uio_iovcnt; + uioa_page_t *locked = uioap->uioa_locked; + struct as *as = ttoproc(curthread)->p_as; + dcopy_handle_t channel; + dcopy_cmd_t cmd; + int ret = 0; + + ASSERT(uioap->uio_extflg & UIO_ASYNC); + + if (!(uioap->uioa_state & (UIOA_ENABLED|UIOA_FINI))) { + /* Must be an active uioa_t */ + return (EINVAL); + } + + channel = uioap->uioa_hwst[UIO_DCOPY_CHANNEL]; + cmd = uioap->uioa_hwst[UIO_DCOPY_CMD]; + + /* XXX - why do we get cmd == NULL sometimes? 
*/ + if (cmd != NULL) { + if (uioap->uioa_state & UIOA_POLL) { + /* Wait for last dcopy() to finish */ + int64_t poll = 1; + int poll_flag = DCOPY_POLL_NOFLAGS; + + do { + if (uioa_maxpoll == 0 || + (uioa_maxpoll > 0 && + poll >= uioa_maxpoll)) { + /* Always block or after maxpoll */ + poll_flag = DCOPY_POLL_BLOCK; + } else { + /* No block, poll */ + poll++; + } + ret = dcopy_cmd_poll(cmd, poll_flag); + } while (ret == DCOPY_PENDING); + + if (ret == DCOPY_COMPLETED) { + /* Poll/block succeeded */ + ret = 0; + } else { + /* Poll/block failed */ + ret = EIO; + } + } + dcopy_cmd_free(&cmd); + } + + dcopy_free(&channel); + + /* Unlock all page(s) iovec_t by iovec_t */ + while (iovcnt-- > 0) { + page_t **pages; + + if (locked->uioa_pfncnt == 0) { + /* A as_pagelock() returned (page_t **) */ + pages = (page_t **)locked->uioa_ppp; + } else { + /* Our pfn_t array */ + pages = NULL; + kmem_free(locked->uioa_ppp, locked->uioa_pfncnt * + sizeof (pfn_t *)); + } + as_pageunlock(as, pages, locked->uioa_base, locked->uioa_len, + S_WRITE); + + locked++; + } + /* uioa_t->uio_t common struct copy */ + *uiop = *((uio_t *)uioap); + + /* + * Last, reset uioa state to alloc. + * + * Note, we only initialize the state here, all other members + * will be initialized in a subsequent uioainit(). 
+ */ + uioap->uioa_state = UIOA_ALLOC; + + uioap->uioa_hwst[UIO_DCOPY_CMD] = NULL; + uioap->uioa_hwst[UIO_DCOPY_CHANNEL] = NULL; + + return (ret); +} diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c index 3fcbf8634b..a1cada7964 100644 --- a/usr/src/uts/common/os/streamio.c +++ b/usr/src/uts/common/os/streamio.c @@ -144,6 +144,7 @@ static void putback(struct stdata *, queue_t *, mblk_t *, int); static void strcleanall(struct vnode *); static int strwsrv(queue_t *); static int strdocmd(struct stdata *, struct strcmd *, cred_t *); +static void struioainit(queue_t *, sodirect_t *, uio_t *); /* * qinit and module_info structures for stream head read and write queues @@ -189,6 +190,11 @@ static boolean_t msghasdata(mblk_t *bp); * mirror this. * 4. ioctl monitor: sd_lock is gotten to ensure that only one * thread is doing an ioctl at a time. + * + * Note, for sodirect case 3. is extended to (*sodirect_t.sod_enqueue)() + * call-back from below, further the sodirect support is for code paths + * called via kstgetmsg(), all other code paths ASSERT() that sodirect + * uioa generated mblk_t's (i.e. DBLK_UIOA) aren't processed. */ static int @@ -397,6 +403,7 @@ ckreturn: stp->sd_qn_minpsz = 0; stp->sd_qn_maxpsz = INFPSZ - 1; /* used to check for initialization */ stp->sd_maxblk = INFPSZ; + stp->sd_sodirect = NULL; qp->q_ptr = _WR(qp)->q_ptr = stp; STREAM(qp) = STREAM(_WR(qp)) = stp; vp->v_stream = stp; @@ -970,11 +977,14 @@ strcleanall(struct vnode *vp) * It is the callers responsibility to call qbackenable after * it is finished with the message. The caller should not call * qbackenable until after any putback calls to avoid spurious backenabling. + * + * Also, handle uioa initialization and process any DBLK_UIOA flaged messages. 
*/ mblk_t * strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, int *errorp) { + sodirect_t *sodp = stp->sd_sodirect; mblk_t *bp; int error; @@ -1063,7 +1073,67 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, } *errorp = 0; ASSERT(MUTEX_HELD(&stp->sd_lock)); - return (getq_noenab(q)); + if (sodp != NULL && (sodp->sod_state & SOD_ENABLED) && + (sodp->sod_uioa.uioa_state & UIOA_INIT)) { + /* + * First kstrgetmsg() call for an uioa_t so if any + * queued mblk_t's need to consume them before uioa + * from below can occur. + */ + sodp->sod_uioa.uioa_state &= UIOA_CLR; + sodp->sod_uioa.uioa_state |= UIOA_ENABLED; + if (q->q_first != NULL) { + struioainit(q, sodp, uiop); + } + } + + bp = getq_noenab(q); + + if (bp != NULL && (bp->b_datap->db_flags & DBLK_UIOA)) { + /* + * A uioa flaged mblk_t chain, already uio processed, + * add it to the sodirect uioa pending free list. + * + * Note, a b_cont chain headed by a DBLK_UIOA enable + * mblk_t must have all mblk_t(s) DBLK_UIOA enabled. + */ + mblk_t *bpt = sodp->sod_uioaft; + + ASSERT(sodp != NULL); + + /* + * Add first mblk_t of "bp" chain to current sodirect uioa + * free list tail mblk_t, if any, else empty list so new head. + */ + if (bpt == NULL) + sodp->sod_uioafh = bp; + else + bpt->b_cont = bp; + + /* + * Walk mblk_t "bp" chain to find tail and adjust rptr of + * each to reflect that uioamove() has consumed all data. 
+ */ + bpt = bp; + for (;;) { + bpt->b_rptr = bpt->b_wptr; + if (bpt->b_cont == NULL) + break; + bpt = bpt->b_cont; + + ASSERT(bpt->b_datap->db_flags & DBLK_UIOA); + } + /* New sodirect uioa free list tail */ + sodp->sod_uioaft = bpt; + + /* Only 1 strget() with data returned per uioa_t */ + if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) { + sodp->sod_uioa.uioa_state &= UIOA_CLR; + sodp->sod_uioa.uioa_state |= UIOA_FINI; + } + } + + return (bp); } /* @@ -1083,6 +1153,8 @@ struiocopyout(mblk_t *bp, struct uio *uiop, int *errorp) ASSERT(bp->b_wptr >= bp->b_rptr); do { + ASSERT(!(bp->b_datap->db_flags & DBLK_UIOA)); + if ((n = MIN(uiop->uio_resid, MBLKL(bp))) != 0) { ASSERT(n > 0); @@ -1229,8 +1301,10 @@ strread(struct vnode *vp, struct uio *uiop, cred_t *crp) } first = 0; } + ASSERT(MUTEX_HELD(&stp->sd_lock)); ASSERT(bp); + ASSERT(!(bp->b_datap->db_flags & DBLK_UIOA)); pri = bp->b_band; /* * Extract any mark information. If the message is not @@ -6650,6 +6724,7 @@ strgetmsg( bp = strget(stp, q, uiop, first, &error); ASSERT(MUTEX_HELD(&stp->sd_lock)); if (bp != NULL) { + ASSERT(!(bp->b_datap->db_flags & DBLK_UIOA)); if (bp->b_datap->db_type == M_SIG) { strsignal_nolock(stp, *bp->b_rptr, (int32_t)bp->b_band); @@ -7288,7 +7363,7 @@ retry: "kstrgetmsg calls strwaitq:%p, %p", vp, uiop); if (((error = strwaitq(stp, waitflag, (ssize_t)0, - fmode, timout, &done)) != 0) || done) { + fmode, timout, &done))) != 0 || done) { TRACE_2(TR_FAC_STREAMS_FR, TR_KSTRGETMSG_DONE, "kstrgetmsg error or done:%p, %p", vp, uiop); @@ -7360,6 +7435,8 @@ retry: * there is indeed a shortage of memory. dupmsg() may fail * if db_ref in any of the messages reaches its limit. 
*/ + + ASSERT(!(bp->b_datap->db_flags & DBLK_UIOA)); if ((nbp = dupmsg(bp)) == NULL && (nbp = copymsg(bp)) == NULL) { /* * Restore the state of the stream head since we @@ -7418,6 +7495,7 @@ retry: } } + ASSERT(!(bp->b_datap->db_flags & DBLK_UIOA)); bp = (stp->sd_rputdatafunc)(stp->sd_vnode, bp, NULL, NULL, NULL, NULL); @@ -7468,6 +7546,8 @@ retry: */ if (uiop == NULL) { /* Append data to tail of mctlp */ + + ASSERT(bp == NULL || !(bp->b_datap->db_flags & DBLK_UIOA)); if (mctlp != NULL) { mblk_t **mpp = mctlp; @@ -7476,6 +7556,14 @@ retry: *mpp = bp; bp = NULL; } + } else if (bp && (bp->b_datap->db_flags & DBLK_UIOA)) { + /* + * A uioa mblk_t chain, as uio processing has already + * been done we simple skip over processing. + */ + bp = NULL; + pr = 0; + } else if (uiop->uio_resid >= 0 && bp) { size_t oldresid = uiop->uio_resid; @@ -7564,6 +7652,8 @@ retry: * again since the flush logic in strrput_nondata() * may have cleared it while we had sd_lock dropped. */ + + ASSERT(!(savemp->b_datap->db_flags & DBLK_UIOA)); if (type >= QPCTL) { ASSERT(type == M_PCPROTO); if (queclass(savemp) < QPCTL) @@ -8635,3 +8725,85 @@ msghasdata(mblk_t *bp) } return (B_FALSE); } + +/* + * Called on the first strget() of a sodirect/uioa enabled streamhead, + * if any mblk_t(s) enqueued they must first be uioamove()d before uioa + * can be enabled for the underlying transport's use. + */ +void +struioainit(queue_t *q, sodirect_t *sodp, uio_t *uiop) +{ + uioa_t *uioap = (uioa_t *)uiop; + mblk_t *bp = q->q_first; + mblk_t *lbp = NULL; + mblk_t *nbp, *wbp; + int len; + int error; + + ASSERT(MUTEX_HELD(sodp->sod_lock)); + ASSERT(&sodp->sod_uioa == uioap); + + /* + * Walk the b_next/b_prev doubly linked list of b_cont chain(s) + * and schedule any M_DATA mblk_t's for uio asynchronous move. 
+ */ + do { + /* Next mblk_t chain */ + nbp = bp->b_next; + /* Walk the chain */ + wbp = bp; + do { + if (wbp->b_datap->db_type != M_DATA) { + /* Not M_DATA, no more uioa */ + goto nouioa; + } + if ((len = wbp->b_wptr - wbp->b_rptr) > 0) { + /* Have a M_DATA mblk_t with data */ + if (len > uioap->uio_resid) { + /* Not enough uio sapce */ + goto nouioa; + } + error = uioamove(wbp->b_rptr, len, + UIO_READ, uioap); + if (!error) { + /* Scheduled, mark dblk_t as such */ + wbp->b_datap->db_flags |= DBLK_UIOA; + } else { + /* Error of some sort, no more uioa */ + uioap->uioa_state &= UIOA_CLR; + uioap->uioa_state |= UIOA_FINI; + return; + } + } + /* Save last wbp processed */ + lbp = wbp; + } while ((wbp = wbp->b_cont) != NULL); + } while ((bp = nbp) != NULL); + + return; + +nouioa: + /* No more uioa */ + uioap->uioa_state &= UIOA_CLR; + uioap->uioa_state |= UIOA_FINI; + + /* + * If we processed 1 or more mblk_t(s) then we need to split the + * current mblk_t chain in 2 so that all the uioamove()ed mblk_t(s) + * are in the current chain and the rest are in the following new + * chain. + */ + if (lbp != NULL) { + /* New end of current chain */ + lbp->b_cont = NULL; + + /* Insert new chain wbp after bp */ + if ((wbp->b_next = nbp) != NULL) + nbp->b_prev = wbp; + else + q->q_last = wbp; + wbp->b_prev = bp; + bp->b_next = wbp; + } +} diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c index 650a4cfaf9..a7750e2ec3 100644 --- a/usr/src/uts/common/os/strsubr.c +++ b/usr/src/uts/common/os/strsubr.c @@ -2467,13 +2467,18 @@ devflg_to_qflag(struct streamtab *stp, uint32_t devflag, uint32_t *qflagp, /* * Private flag used by a transport module to indicate * to sockfs that it supports direct-access mode without - * having to go through STREAMS. + * having to go through STREAMS or the transport can use + * sodirect_t sharing to bypass STREAMS for receive-side + * M_DATA processing. 
*/ - if (devflag & _D_DIRECT) { + if (devflag & (_D_DIRECT|_D_SODIRECT)) { /* Reject unless the module is fully-MT (no perimeter) */ if ((qflag & QMT_TYPEMASK) != QMTSAFE) goto bad; - qflag |= _QDIRECT; + if (devflag & _D_DIRECT) + qflag |= _QDIRECT; + if (devflag & _D_SODIRECT) + qflag |= _QSODIRECT; } *qflagp = qflag; diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index 37b908076b..728860594a 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -487,6 +487,7 @@ CHKHDRS= \ socket_impl.h \ socketvar.h \ sockio.h \ + sodirect.h \ squeue.h \ squeue_impl.h \ srn.h \ diff --git a/usr/src/uts/common/sys/conf.h b/usr/src/uts/common/sys/conf.h index 3f6300e581..435cffb35f 100644 --- a/usr/src/uts/common/sys/conf.h +++ b/usr/src/uts/common/sys/conf.h @@ -22,7 +22,7 @@ /* All Rights Reserved */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -221,6 +221,9 @@ extern int cdev_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, #define D_OPEN_RETURNS_EINTR 0x100000 /* EINTR expected from open(9E) */ +#define _D_SODIRECT 0x200000 /* Private flag for transport modules used */ + /* to enable _QSODIRECT for a STREAMS Q */ + #endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */ #ifdef __cplusplus diff --git a/deleted_files/usr/src/uts/common/sys/dcopy.h b/usr/src/uts/common/sys/dcopy.h index e700ed9cf6..09e72e84e0 100644 --- a/deleted_files/usr/src/uts/common/sys/dcopy.h +++ b/usr/src/uts/common/sys/dcopy.h @@ -39,6 +39,10 @@ extern "C" { * *** This interface is for private use by the IP stack only *** */ +/* Private dcopy/uioa interface for dcopy to enable/disable dcopy KAPI */ +extern void uioa_dcopy_enable(); +extern void uioa_dcopy_disable(); + /* Function return status */ #define DCOPY_FAILURE (-1) #define DCOPY_SUCCESS (0) diff --git a/deleted_files/usr/src/uts/common/sys/dcopy_device.h b/usr/src/uts/common/sys/dcopy_device.h index 25e95b2aa8..25e95b2aa8 100644 --- a/deleted_files/usr/src/uts/common/sys/dcopy_device.h +++ b/usr/src/uts/common/sys/dcopy_device.h diff --git a/usr/src/uts/common/sys/socketvar.h b/usr/src/uts/common/sys/socketvar.h index 0680546ade..178a8a2905 100644 --- a/usr/src/uts/common/sys/socketvar.h +++ b/usr/src/uts/common/sys/socketvar.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -50,14 +50,13 @@ #include <sys/file.h> #include <sys/param.h> #include <sys/zone.h> +#include <sys/sodirect.h> #include <inet/kssl/ksslapi.h> #ifdef __cplusplus extern "C" { #endif - - /* * Internal representation used for addresses. 
*/ @@ -333,6 +332,9 @@ struct sonode { kssl_endpt_type_t so_kssl_type; /* is proxy/is proxied/none */ kssl_ent_t so_kssl_ent; /* SSL config entry */ kssl_ctx_t so_kssl_ctx; /* SSL session context */ + + /* != NULL for sodirect_t enabled socket */ + sodirect_t *so_direct; }; /* flags */ @@ -375,6 +377,7 @@ struct sonode { #define SS_MOREDATA 0x00100000 /* NCAfs: NCA has more data */ #define SS_DIRECT 0x00200000 /* transport is directly below */ +#define SS_SODIRECT 0x00400000 /* transport supports sodirect */ #define SS_LADDR_VALID 0x01000000 /* so_laddr valid for user */ #define SS_FADDR_VALID 0x02000000 /* so_faddr valid for user */ diff --git a/deleted_files/usr/src/uts/common/sys/sodirect.h b/usr/src/uts/common/sys/sodirect.h index 49609bc5af..49609bc5af 100644 --- a/deleted_files/usr/src/uts/common/sys/sodirect.h +++ b/usr/src/uts/common/sys/sodirect.h diff --git a/usr/src/uts/common/sys/stream.h b/usr/src/uts/common/sys/stream.h index 3eca2fefdf..0da91f7d38 100644 --- a/usr/src/uts/common/sys/stream.h +++ b/usr/src/uts/common/sys/stream.h @@ -190,6 +190,8 @@ typedef struct queue { #define _QASSOCIATED 0x10000000 /* queue is associated with a device */ #define _QDIRECT 0x20000000 /* Private; transport module uses */ /* direct interface to/from sockfs */ +#define _QSODIRECT 0x40000000 /* Private, transport module shares */ + /* an sodirect_t with sockfs */ /* queue sqflags (protected by SQLOCK). 
*/ #define Q_SQQUEUED 0x01 /* Queue is in the syncq list */ @@ -400,6 +402,7 @@ typedef struct bcache { */ #define DBLK_REFMIN 0x01 /* min refcnt stored in low bit */ #define DBLK_COOKED 0x02 /* message has been processed once */ +#define DBLK_UIOA 0x04 /* uioamove() is pending */ /* * db_struioflag values: diff --git a/usr/src/uts/common/sys/strsubr.h b/usr/src/uts/common/sys/strsubr.h index df489c3dff..71c26a3212 100644 --- a/usr/src/uts/common/sys/strsubr.h +++ b/usr/src/uts/common/sys/strsubr.h @@ -46,6 +46,7 @@ #include <sys/proc.h> #include <sys/netstack.h> #include <sys/modhash.h> +#include <sys/sodirect.h> #ifdef __cplusplus extern "C" { @@ -94,9 +95,8 @@ extern "C" { * sd_mark * sd_closetime * sd_wakeq - * sd_uiordq - * sd_uiowrq * sd_maxblk + * sd_sodirect * * The following fields are modified only by the allocator, which * has exclusive access to them at that time: @@ -245,6 +245,10 @@ typedef struct stdata { uint_t sd_copyflag; /* copy-related flags */ zoneid_t sd_anchorzone; /* Allow removal from same zone only */ struct msgb *sd_cmdblk; /* reply from _I_CMD */ + /* + * Support for socket direct. + */ + sodirect_t *sd_sodirect; /* pointer to shared sodirect_t */ } stdata_t; /* diff --git a/usr/src/uts/common/sys/uio.h b/usr/src/uts/common/sys/uio.h index 3e9e4a5eda..4f0aff49f6 100644 --- a/usr/src/uts/common/sys/uio.h +++ b/usr/src/uts/common/sys/uio.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -101,6 +100,49 @@ typedef struct uio { ssize_t uio_resid; /* residual count */ } uio_t; +/* + * Extended uio_t uioa_t used for asynchronous uio. + * + * Note: UIOA_IOV_MAX is defined and used as it is in "fs/vncalls.c" + * as there isn't a formal definition of IOV_MAX for the kernel. + */ +#define UIOA_IOV_MAX 16 + +typedef struct uioa_page_s { /* locked uio_iov state */ + int uioa_pfncnt; /* count of pfn_t(s) in *uioa_ppp */ + void **uioa_ppp; /* page_t or pfn_t arrary */ + caddr_t uioa_base; /* address base */ + size_t uioa_len; /* span length */ +} uioa_page_t; + +typedef struct uioa_s { + iovec_t *uio_iov; /* pointer to array of iovecs */ + int uio_iovcnt; /* number of iovecs */ + lloff_t _uio_offset; /* file offset */ + uio_seg_t uio_segflg; /* address space (kernel or user) */ + uint16_t uio_fmode; /* file mode flags */ + uint16_t uio_extflg; /* extended flags */ + lloff_t _uio_limit; /* u-limit (maximum byte offset) */ + ssize_t uio_resid; /* residual count */ + /* + * uioa extended members. 
+ */ + uint32_t uioa_state; /* state of asynch i/o */ + uioa_page_t *uioa_lcur; /* pointer into uioa_locked[] */ + void **uioa_lppp; /* pointer into lcur->uioa_ppp[] */ + void *uioa_hwst[4]; /* opaque hardware state */ + uioa_page_t uioa_locked[UIOA_IOV_MAX]; /* Per iov locked pages */ +} uioa_t; + +#define UIOA_ALLOC 0x0001 /* allocated but not yet initialized */ +#define UIOA_INIT 0x0002 /* initialized but not yet enabled */ +#define UIOA_ENABLED 0x0004 /* enabled, asynch i/o active */ +#define UIOA_FINI 0x0008 /* finished waiting for uioafini() */ + +#define UIOA_CLR (~0x000F) /* clear mutually exclusive bits */ + +#define UIOA_POLL 0x0010 /* need dcopy_poll() */ + #define uio_loffset _uio_offset._f #if !defined(_LP64) #define uio_offset _uio_offset._p._l @@ -127,10 +169,24 @@ typedef enum uio_rw { UIO_READ, UIO_WRITE } uio_rw_t; * access, ie, access bypassing caches, should be used. Filesystems that * don't initialize this field could experience suboptimal performance due to * the random data the field contains. + * + * NOTE: This flag is also used by uioasync callers to pass an extended + * uio_t (uioa_t), to uioasync enabled consumers. Unlike above all + * consumers of a uioa_t require the uio_extflg to be initialized. */ #define UIO_COPY_DEFAULT 0x0000 /* no special options to copy */ #define UIO_COPY_CACHED 0x0001 /* copy should not bypass caches */ +#define UIO_ASYNC 0x0002 /* uio_t is really a uioa_t */ + +/* + * Global uioasync capability shadow state. + */ +typedef struct uioasync_s { + boolean_t enabled; /* Is uioasync enabled? 
*/ + size_t mincnt; /* Minimum byte count for use of */ +} uioasync_t; + #endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */ #if defined(_KERNEL) @@ -141,6 +197,11 @@ int uwritec(struct uio *); void uioskip(uio_t *, size_t); int uiodup(uio_t *, uio_t *, iovec_t *, int); +int uioamove(void *, size_t, enum uio_rw, uioa_t *); +int uioainit(uio_t *, uioa_t *); +int uioafini(uio_t *, uioa_t *); +extern uioasync_t uioasync; + #else /* defined(_KERNEL) */ #if defined(__STDC__) diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files index 5ae521c687..0e30822c0d 100644 --- a/usr/src/uts/i86pc/Makefile.files +++ b/usr/src/uts/i86pc/Makefile.files @@ -161,6 +161,7 @@ DBOOT_OBJS += \ # GFX_PRIVATE_OBJS += gfx_private.o gfxp_pci.o gfxp_segmap.o \ gfxp_devmap.o gfxp_vgatext.o gfxp_vm.o vgasubr.o +IOAT_OBJS += ioat.o ioat_rs.o ioat_ioctl.o ioat_chan.o ISANEXUS_OBJS += isa.o dma_engine.o i8237A.o PCI_E_MISC_OBJS += pcie.o pcie_fault.o PCI_E_NEXUS_OBJS += npe.o npe_misc.o diff --git a/usr/src/uts/i86pc/Makefile.i86pc.shared b/usr/src/uts/i86pc/Makefile.i86pc.shared index 1e1c6abe1d..4d188a4dfd 100644 --- a/usr/src/uts/i86pc/Makefile.i86pc.shared +++ b/usr/src/uts/i86pc/Makefile.i86pc.shared @@ -257,6 +257,7 @@ DRV_KMODS += xsvc DRV_KMODS += mc-amd DRV_KMODS += tzmon DRV_KMODS += acpi_drv +DRV_KMODS += ioat DRV_KMODS += cpudrv diff --git a/usr/src/uts/i86pc/Makefile.rules b/usr/src/uts/i86pc/Makefile.rules index 78d3832d9b..9195b8ffb7 100644 --- a/usr/src/uts/i86pc/Makefile.rules +++ b/usr/src/uts/i86pc/Makefile.rules @@ -73,6 +73,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/acpi_drv/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/ioat/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/mc/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -259,6 +263,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/acpi_drv/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) 
+$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/ioat/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/mc/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) diff --git a/deleted_files/usr/src/uts/i86pc/io/ioat/ioat.c b/usr/src/uts/i86pc/io/ioat/ioat.c index 7bf8a559c1..7bf8a559c1 100644 --- a/deleted_files/usr/src/uts/i86pc/io/ioat/ioat.c +++ b/usr/src/uts/i86pc/io/ioat/ioat.c diff --git a/deleted_files/usr/src/uts/i86pc/io/ioat/ioat.conf b/usr/src/uts/i86pc/io/ioat/ioat.conf index 49d948eddb..49d948eddb 100644 --- a/deleted_files/usr/src/uts/i86pc/io/ioat/ioat.conf +++ b/usr/src/uts/i86pc/io/ioat/ioat.conf diff --git a/deleted_files/usr/src/uts/i86pc/io/ioat/ioat_chan.c b/usr/src/uts/i86pc/io/ioat/ioat_chan.c index 8615f9a7ad..8615f9a7ad 100644 --- a/deleted_files/usr/src/uts/i86pc/io/ioat/ioat_chan.c +++ b/usr/src/uts/i86pc/io/ioat/ioat_chan.c diff --git a/deleted_files/usr/src/uts/i86pc/io/ioat/ioat_ioctl.c b/usr/src/uts/i86pc/io/ioat/ioat_ioctl.c index 70640dac4f..70640dac4f 100644 --- a/deleted_files/usr/src/uts/i86pc/io/ioat/ioat_ioctl.c +++ b/usr/src/uts/i86pc/io/ioat/ioat_ioctl.c diff --git a/deleted_files/usr/src/uts/i86pc/io/ioat/ioat_rs.c b/usr/src/uts/i86pc/io/ioat/ioat_rs.c index 6d12798fda..6d12798fda 100644 --- a/deleted_files/usr/src/uts/i86pc/io/ioat/ioat_rs.c +++ b/usr/src/uts/i86pc/io/ioat/ioat_rs.c diff --git a/deleted_files/usr/src/uts/i86pc/ioat/Makefile b/usr/src/uts/i86pc/ioat/Makefile index 2dcd6e898e..2dcd6e898e 100644 --- a/deleted_files/usr/src/uts/i86pc/ioat/Makefile +++ b/usr/src/uts/i86pc/ioat/Makefile diff --git a/deleted_files/usr/src/uts/i86pc/sys/ioat.h b/usr/src/uts/i86pc/sys/ioat.h index 1e32b54ebd..1e32b54ebd 100644 --- a/deleted_files/usr/src/uts/i86pc/sys/ioat.h +++ b/usr/src/uts/i86pc/sys/ioat.h diff --git a/usr/src/uts/i86xpv/Makefile.files b/usr/src/uts/i86xpv/Makefile.files index 7c2ce261fd..88392b1855 100644 --- a/usr/src/uts/i86xpv/Makefile.files +++ b/usr/src/uts/i86xpv/Makefile.files @@ -179,12 +179,13 @@ 
DBOOT_OBJS += \ # # driver & misc modules # -ISANEXUS_OBJS += isa.o dma_engine.o i8237A.o -DOMCAPS_OBJS += domcaps.o BALLOON_OBJS += balloon_drv.o +DOMCAPS_OBJS += domcaps.o EVTCHN_OBJS += evtchn_dev.o GFX_PRIVATE_OBJS += gfx_private.o gfxp_pci.o gfxp_segmap.o \ gfxp_devmap.o gfxp_vgatext.o gfxp_vm.o vgasubr.o +IOAT_OBJS += ioat.o ioat_rs.o ioat_ioctl.o ioat_chan.o +ISANEXUS_OBJS += isa.o dma_engine.o i8237A.o PCI_E_MISC_OBJS += pcie.o pcie_fault.o PCI_E_NEXUS_OBJS += npe.o npe_misc.o PCI_E_NEXUS_OBJS += pci_common.o pci_kstats.o pci_tools.o diff --git a/usr/src/uts/i86xpv/Makefile.i86xpv.shared b/usr/src/uts/i86xpv/Makefile.i86xpv.shared index 0461221216..6c41c1141d 100644 --- a/usr/src/uts/i86xpv/Makefile.i86xpv.shared +++ b/usr/src/uts/i86xpv/Makefile.i86xpv.shared @@ -240,6 +240,7 @@ MACH_NOT_YET_KMODS = $(AUTOCONF_OBJS) # DRV_KMODS += rootnex +DRV_KMODS += ioat DRV_KMODS += isa DRV_KMODS += pci DRV_KMODS += npe diff --git a/usr/src/uts/i86xpv/Makefile.rules b/usr/src/uts/i86xpv/Makefile.rules index cd22ca6278..8da47ca9d1 100644 --- a/usr/src/uts/i86xpv/Makefile.rules +++ b/usr/src/uts/i86xpv/Makefile.rules @@ -57,6 +57,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/i86xpv/io/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/ioat/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/pci/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -215,6 +219,9 @@ DBOOT_DEFS += -D__xpv $(LINTS_DIR)/%.ln: $(UTSBASE)/common/cpr/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/ioat/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/pci/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) diff --git a/deleted_files/usr/src/uts/i86xpv/ioat/Makefile b/usr/src/uts/i86xpv/ioat/Makefile index 54354aedc7..54354aedc7 100644 --- a/deleted_files/usr/src/uts/i86xpv/ioat/Makefile +++ b/usr/src/uts/i86xpv/ioat/Makefile diff --git a/usr/src/uts/intel/Makefile.files b/usr/src/uts/intel/Makefile.files 
index 9a756bd90d..a1a4a1d66e 100644 --- a/usr/src/uts/intel/Makefile.files +++ b/usr/src/uts/intel/Makefile.files @@ -138,6 +138,7 @@ CMDK_OBJS += cmdk.o CMLB_OBJS += cmlb.o CPUNEX_OBJS += cpunex.o DADK_OBJS += dadk.o +DCOPY_OBJS += dcopy.o DNET_OBJS += dnet.o mii.o FD_OBJS += fd.o GDA_OBJS += gda.o diff --git a/usr/src/uts/intel/Makefile.intel.shared b/usr/src/uts/intel/Makefile.intel.shared index e8eef62150..476bd301e1 100644 --- a/usr/src/uts/intel/Makefile.intel.shared +++ b/usr/src/uts/intel/Makefile.intel.shared @@ -528,6 +528,7 @@ MISC_KMODS += cmlb MISC_KMODS += consconfig MISC_KMODS += ctf MISC_KMODS += dadk +MISC_KMODS += dcopy MISC_KMODS += dls MISC_KMODS += drm MISC_KMODS += fssnap_if diff --git a/deleted_files/usr/src/uts/intel/dcopy/Makefile b/usr/src/uts/intel/dcopy/Makefile index e321465ec1..e321465ec1 100644 --- a/deleted_files/usr/src/uts/intel/dcopy/Makefile +++ b/usr/src/uts/intel/dcopy/Makefile diff --git a/usr/src/uts/intel/ia32/ml/modstubs.s b/usr/src/uts/intel/ia32/ml/modstubs.s index fd7a606594..5ae7072e82 100644 --- a/usr/src/uts/intel/ia32/ml/modstubs.s +++ b/usr/src/uts/intel/ia32/ml/modstubs.s @@ -1313,6 +1313,22 @@ fcnname/**/_info: \ END_MODULE(kssl); #endif +/* + * Stubs for dcopy, for Intel IOAT KAPIs + */ +#ifndef DCOPY_MODULE + MODULE(dcopy,misc); + NO_UNLOAD_STUB(dcopy, dcopy_query, nomod_minus_one); + NO_UNLOAD_STUB(dcopy, dcopy_query_channel, nomod_minus_one); + NO_UNLOAD_STUB(dcopy, dcopy_alloc, nomod_minus_one); + NO_UNLOAD_STUB(dcopy, dcopy_free, nomod_minus_one); + NO_UNLOAD_STUB(dcopy, dcopy_cmd_alloc, nomod_minus_one); + NO_UNLOAD_STUB(dcopy, dcopy_cmd_free, nomod_void); + NO_UNLOAD_STUB(dcopy, dcopy_cmd_post, nomod_minus_one); + NO_UNLOAD_STUB(dcopy, dcopy_cmd_poll, nomod_minus_one); + END_MODULE(dcopy); +#endif + / this is just a marker for the area of text that contains stubs ENTRY_NP(stubs_end) diff --git a/usr/src/uts/sparc/ml/modstubs.s b/usr/src/uts/sparc/ml/modstubs.s index 8e4e06a008..b1936c4172 100644 --- 
a/usr/src/uts/sparc/ml/modstubs.s +++ b/usr/src/uts/sparc/ml/modstubs.s @@ -1265,6 +1265,22 @@ stubs_base: END_MODULE(kssl); #endif +/* + * Stubs for dcopy, for Intel IOAT KAPIs + */ +#ifndef DCOPY_MODULE + MODULE(dcopy,misc); + NO_UNLOAD_STUB(dcopy, dcopy_query, nomod_minus_one); + NO_UNLOAD_STUB(dcopy, dcopy_query_channel, nomod_minus_one); + NO_UNLOAD_STUB(dcopy, dcopy_alloc, nomod_minus_one); + NO_UNLOAD_STUB(dcopy, dcopy_free, nomod_minus_one); + NO_UNLOAD_STUB(dcopy, dcopy_cmd_alloc, nomod_minus_one); + NO_UNLOAD_STUB(dcopy, dcopy_cmd_free, nomod_void); + NO_UNLOAD_STUB(dcopy, dcopy_cmd_post, nomod_minus_one); + NO_UNLOAD_STUB(dcopy, dcopy_cmd_poll, nomod_minus_one); + END_MODULE(dcopy); +#endif + ! this is just a marker for the area of text that contains stubs .seg ".text" .global stubs_end |