Diffstat (limited to 'usr/src/uts/common')
23 files changed, 2923 insertions, 121 deletions
diff --git a/usr/src/uts/common/fs/sockfs/sockstr.c b/usr/src/uts/common/fs/sockfs/sockstr.c index eb540644be..1a50324bc0 100644 --- a/usr/src/uts/common/fs/sockfs/sockstr.c +++ b/usr/src/uts/common/fs/sockfs/sockstr.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -69,6 +69,8 @@ #include <c2/audit.h> +#include <sys/dcopy.h> + int so_default_version = SOV_SOCKSTREAM; #ifdef DEBUG @@ -119,6 +121,36 @@ static mblk_t *strsock_misc(vnode_t *vp, mblk_t *mp, static int tlitosyserr(int terr); /* + * Sodirect kmem_cache and put/wakeup functions. + */ +struct kmem_cache *socktpi_sod_cache; +static int sodput(sodirect_t *, mblk_t *); +static void sodwakeup(sodirect_t *); + +/* + * Called by sockinit() when sockfs is loaded. + * + * Check for uioasync dcopy support and, if supported, + * allocate the sodirect_t kmem_cache socktpi_sod_cache. + */ +int +sostr_init() +{ + if (uioasync.enabled == B_TRUE && modload("misc", "dcopy") == -1) { + /* No dcopy KAPI driver, disable uioa */ + uioasync.enabled = B_FALSE; + } + + if (uioasync.enabled == B_TRUE) { + /* Uioasync enabled so sodirect will be used */ + socktpi_sod_cache = kmem_cache_create("socktpi_sod_cache", + sizeof (sodirect_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + } + + return (0); +} + +/* * Convert a socket to a stream. Invoked when the illusory sockmod * is popped from the stream. * Change the stream head back to default operation without losing @@ -468,6 +500,34 @@ so_strinit(struct sonode *so, struct sonode *tso) stp->sd_qn_minpsz = 0; mutex_exit(&stp->sd_lock); + /* + * If sodirect capable, allocate and initialize a sodirect_t. + * Note, SS_SODIRECT is set in socktpi_open(). + */ + if (so->so_state & SS_SODIRECT) { + sodirect_t *sodp; + + ASSERT(so->so_direct == NULL); + + sodp = kmem_cache_alloc(socktpi_sod_cache, KM_SLEEP); + sodp->sod_state = SOD_ENABLED | SOD_WAKE_NOT; + sodp->sod_want = 0; + sodp->sod_q = RD(stp->sd_wrq); + sodp->sod_enqueue = sodput; + sodp->sod_wakeup = sodwakeup; + sodp->sod_uioafh = NULL; + sodp->sod_uioaft = NULL; + sodp->sod_lock = &stp->sd_lock; + /* + * Remainder of the sod_uioa members are left uninitialized + * but will be initialized later by uioainit() before uioa + * is enabled. + */ + sodp->sod_uioa.uioa_state = UIOA_ALLOC; + so->so_direct = sodp; + stp->sd_sodirect = sodp; + } + return (0); } @@ -2872,3 +2932,121 @@ tlitosyserr(int terr) else return (tli_errs[terr]); } + +/* + * Sockfs sodirect STREAMS read put procedure. Called from a sodirect enabled + * transport driver/module with an mblk_t chain. + * + * Note, we in-line putq() for the fast-path cases where q is empty or where + * q_last and bp are both of type M_DATA. In all other cases we call putq(). + * + * On success zero is returned, else an errno is returned.
+ */ +int +sodput(sodirect_t *sodp, mblk_t *bp) +{ + queue_t *q = sodp->sod_q; + struct stdata *stp = (struct stdata *)q->q_ptr; + mblk_t *nbp; + int ret; + mblk_t *last = q->q_last; + int bytecnt = 0; + int mblkcnt = 0; + + + ASSERT(MUTEX_HELD(sodp->sod_lock)); + + if (stp->sd_flag == STREOF) { + ret = 0; + goto error; + } + + if (q->q_first == NULL) { + /* Q empty, really fast fast-path */ + bp->b_prev = NULL; + bp->b_next = NULL; + q->q_first = bp; + q->q_last = bp; + + } else if (last->b_datap->db_type == M_DATA && + bp->b_datap->db_type == M_DATA) { + /* + * Last mblk_t chain and bp are both type M_DATA so + * in-line putq() here: if the DBLK_UIOA states match, + * add bp to the end of the current last chain, else + * start a new last chain with bp. + */ + if ((last->b_datap->db_flags & DBLK_UIOA) == + (bp->b_datap->db_flags & DBLK_UIOA)) { + /* Added to end */ + while ((nbp = last->b_cont) != NULL) + last = nbp; + last->b_cont = bp; + } else { + /* New last */ + last->b_next = bp; + bp->b_next = NULL; + bp->b_prev = last; + q->q_last = bp; + } + } else { + /* + * Can't use q_last so just call putq(). + */ + (void) putq(q, bp); + return (0); + } + + /* Count bytes and mblk_t's */ + do { + bytecnt += MBLKL(bp); + mblkcnt++; + } while ((bp = bp->b_cont) != NULL); + q->q_count += bytecnt; + q->q_mblkcnt += mblkcnt; + + /* Check for QFULL */ + if (q->q_count >= q->q_hiwat + sodp->sod_want || + q->q_mblkcnt >= q->q_hiwat) { + q->q_flag |= QFULL; + } + + return (0); + +error: + do { + if ((nbp = bp->b_next) != NULL) + bp->b_next = NULL; + freemsg(bp); + } while ((bp = nbp) != NULL); + + return (ret); +} + +/* + * Sockfs sodirect read wakeup. Called from a sodirect enabled transport + * driver/module to indicate that read-side data is available. + * + * On return the sodirect_t sod_lock mutex will be exited, so this must be the + * last sodirect_t call to guarantee atomic access of *sodp. + */ +void +sodwakeup(sodirect_t *sodp) +{ + queue_t *q = sodp->sod_q; + struct stdata *stp = (struct stdata *)q->q_ptr; + + ASSERT(MUTEX_HELD(sodp->sod_lock)); + + if (stp->sd_flag & RSLEEP) { + stp->sd_flag &= ~RSLEEP; + cv_broadcast(&q->q_wait); + } + + if (stp->sd_rput_opt & SR_POLLIN) { + stp->sd_rput_opt &= ~SR_POLLIN; + mutex_exit(sodp->sod_lock); + pollwakeup(&stp->sd_pollist, POLLIN | POLLRDNORM); + } else + mutex_exit(sodp->sod_lock); +} diff --git a/usr/src/uts/common/fs/sockfs/socksubr.c b/usr/src/uts/common/fs/sockfs/socksubr.c index 9a6e9147e3..c857c34225 100644 --- a/usr/src/uts/common/fs/sockfs/socksubr.c +++ b/usr/src/uts/common/fs/sockfs/socksubr.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -44,6 +44,7 @@ #include <sys/file.h> #include <sys/open.h> #include <sys/user.h> +#include <sys/uio.h> #include <sys/termios.h> #include <sys/stream.h> #include <sys/strsubr.h> @@ -90,6 +91,7 @@ #define SO_LOCK_WAKEUP_TIME 3000 /* Wakeup time in milliseconds */ static struct kmem_cache *socktpi_cache, *socktpi_unix_cache; +struct kmem_cache *socktpi_sod_cache; dev_t sockdev; /* For fsid in getattr */ @@ -105,6 +107,8 @@ extern void sendfile_init(); extern void nl7c_init(void); +extern int sostr_init(); + #define ADRSTRLEN (2 * sizeof (void *) + 1) /* * kernel structure for passing the sockinfo data back up to the user.
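To make the sodput()/sodwakeup() contract above concrete, here is a minimal caller-side sketch; it is illustrative only and not part of this changeset. The function name xport_rput_sod() and its fallback queue argument are assumptions, and error handling is elided; only sod_lock, sod_state, SOD_ENABLED, sod_enqueue, and sod_wakeup come from the code above.

/*
 * Illustrative sketch only, not part of the changeset: a hypothetical
 * sodirect enabled transport handing a read-side mblk_t chain to the
 * shared sodirect_t's entry points (sodput()/sodwakeup() in sockstr.c).
 * Assumes <sys/stream.h> and <sys/sodirect.h>.
 */
static void
xport_rput_sod(queue_t *q, sodirect_t *sodp, mblk_t *mp)
{
	mutex_enter(sodp->sod_lock);
	if (!(sodp->sod_state & SOD_ENABLED)) {
		/* Not (or no longer) sodirect, normal STREAMS delivery */
		mutex_exit(sodp->sod_lock);
		putnext(q, mp);
		return;
	}
	/* Enqueue directly on the socket (sodput()), errno ignored here */
	(void) (sodp->sod_enqueue)(sodp, mp);
	/* Wake any blocked reader; sodwakeup() exits sod_lock for us */
	(sodp->sod_wakeup)(sodp);
}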
@@ -523,6 +527,15 @@ sockfree(struct sonode *so) so->so_nl7c_flags = 0; } + if (so->so_direct != NULL) { + sodirect_t *sodp = so->so_direct; + + ASSERT(sodp->sod_uioafh == NULL); + + so->so_direct = NULL; + kmem_cache_free(socktpi_sod_cache, sodp); + } + ASSERT(so->so_ux_bound_vp == NULL); if ((mp = so->so_unbind_mp) != NULL) { freemsg(mp); @@ -567,6 +580,8 @@ socktpi_constructor(void *buf, void *cdrarg, int kmflags) struct sonode *so = buf; struct vnode *vp; + so->so_direct = NULL; + so->so_nl7c_flags = 0; so->so_nl7c_uri = NULL; so->so_nl7c_rcv_mp = NULL; @@ -606,6 +621,8 @@ socktpi_destructor(void *buf, void *cdrarg) struct sonode *so = buf; struct vnode *vp = SOTOV(so); + ASSERT(so->so_direct == NULL); + ASSERT(so->so_nl7c_flags == 0); ASSERT(so->so_nl7c_uri == NULL); ASSERT(so->so_nl7c_rcv_mp == NULL); @@ -713,6 +730,12 @@ sockinit(int fstype, char *name) goto failure; } + error = sostr_init(); + if (error != 0) { + err_str = NULL; + goto failure; + } + /* * Create sonode caches. We create a special one for AF_UNIX so * that we can track them for netstat(1m). diff --git a/usr/src/uts/common/fs/sockfs/socktpi.c b/usr/src/uts/common/fs/sockfs/socktpi.c index d6f9ebb57f..e632e234e2 100644 --- a/usr/src/uts/common/fs/sockfs/socktpi.c +++ b/usr/src/uts/common/fs/sockfs/socktpi.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -58,6 +58,7 @@ #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/sockio.h> +#include <sys/sodirect.h> #include <netinet/in.h> #include <sys/un.h> #include <sys/strsun.h> @@ -186,6 +187,9 @@ extern mblk_t *strsock_kssl_output(vnode_t *, mblk_t *, strwakeup_t *, static int sotpi_unbind(struct sonode *, int); +extern int sodput(sodirect_t *, mblk_t *); +extern void sodwakeup(sodirect_t *); + /* TPI sockfs sonode operations */ static int sotpi_accept(struct sonode *, int, struct sonode **); static int sotpi_bind(struct sonode *, struct sockaddr *, socklen_t, @@ -2910,11 +2914,13 @@ sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) t_uscalar_t namelen; int so_state = so->so_state; /* Snapshot */ ssize_t saved_resid; - int error; rval_t rval; int flags; clock_t timout; int first; + int error = 0; + struct uio *suiop = NULL; + sodirect_t *sodp = so->so_direct; flags = msg->msg_flags; msg->msg_flags = 0; @@ -3062,6 +3068,53 @@ sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) opflag = pflag; first = 1; + if (uiop->uio_resid >= uioasync.mincnt && + sodp != NULL && (sodp->sod_state & SOD_ENABLED) && + uioasync.enabled && !(flags & MSG_PEEK) && + !(so_state & SS_CANTRCVMORE)) { + /* + * The I/O is big enough for uioa min setup, this is an + * sodirect socket with both sodirect and uioa enabled, the + * I/O will be done, and we're not at EOF, so initialize the + * sodirect_t uioa_t with "uiop". + */ + mutex_enter(sodp->sod_lock); + if (!uioainit(uiop, &sodp->sod_uioa)) { + /* + * uioainit() succeeded, so the uio_t part of the + * uioa_t will be used for all uio_t work to follow; + * we save the original "uiop" in "suiop". + */ + suiop = uiop; + uiop = (uio_t *)&sodp->sod_uioa; + /* + * Before returning to the caller the passed in uio_t + * "uiop" will be updated via a call to uioafini() + * below. + * + * Note, the uioa.uioa_state isn't set to UIOA_ENABLED + * here as first we have to uioamove() any currently + * queued M_DATA mblk_t(s) so it will be done in + * kstrgetmsg().
+ */ + } + /* + * Whether or not uioainit() succeeded, note the number + * of uio bytes the caller wants for the sodirect framework + * and/or transport (e.g. TCP) strategy. + */ + sodp->sod_want = uiop->uio_resid; + mutex_exit(sodp->sod_lock); + } else if (sodp != NULL && (sodp->sod_state & SOD_ENABLED)) { + /* + * No uioa, but still using sodirect, so note the number of + * uio bytes the caller wants for the sodirect framework + * and/or transport (e.g. TCP) strategy. + * + * Note, sod_lock is not held; the only writer is in this + * function and only one thread at a time, so the lock isn't + * needed just to init. + */ + sodp->sod_want = uiop->uio_resid; + } retry: saved_resid = uiop->uio_resid; pri = 0; @@ -3091,10 +3144,7 @@ retry: eprintsoline(so, error); break; } - mutex_enter(&so->so_lock); - so_unlock_read(so); /* Clear SOREADLOCKED */ - mutex_exit(&so->so_lock); - return (error); + goto out; } /* * For datagrams the MOREDATA flag is used to set MSG_TRUNC. @@ -3137,9 +3187,7 @@ retry: pflag = opflag | MSG_NOMARK; goto retry; } - so_unlock_read(so); /* Clear SOREADLOCKED */ - mutex_exit(&so->so_lock); - return (0); + goto out_locked; } /* strsock_proto has already verified length and alignment */ @@ -3179,9 +3227,7 @@ retry: pflag = opflag | MSG_NOMARK; goto retry; } - so_unlock_read(so); /* Clear SOREADLOCKED */ - mutex_exit(&so->so_lock); - return (0); + goto out_locked; } case T_UNITDATA_IND: { void *addr; @@ -3207,7 +3253,7 @@ retry: freemsg(mp); error = EPROTO; eprintsoline(so, error); - goto err; + goto out; } if (so->so_family == AF_UNIX) { /* @@ -3236,7 +3282,7 @@ retry: freemsg(mp); error = EPROTO; eprintsoline(so, error); - goto err; + goto out; } if (so->so_family == AF_UNIX) so_getopt_srcaddr(opt, optlen, &addr, &addrlen); @@ -3283,17 +3329,14 @@ retry: msg->msg_namelen); kmem_free(control, controllen); eprintsoline(so, error); - goto err; + goto out; } msg->msg_control = control; msg->msg_controllen = controllen; } freemsg(mp); - mutex_enter(&so->so_lock); - so_unlock_read(so); /* Clear SOREADLOCKED */ - mutex_exit(&so->so_lock); - return (0); + goto out; } case T_OPTDATA_IND: { struct T_optdata_req *tdr; @@ -3322,7 +3365,7 @@ retry: freemsg(mp); error = EPROTO; eprintsoline(so, error); - goto err; + goto out; } ncontrollen = so_cmsglen(mp, opt, optlen, @@ -3350,7 +3393,7 @@ retry: freemsg(mp); kmem_free(control, controllen); eprintsoline(so, error); - goto err; + goto out; } msg->msg_control = control; msg->msg_controllen = controllen; @@ -3382,9 +3425,7 @@ retry: pflag = opflag | MSG_NOMARK; goto retry; } - so_unlock_read(so); /* Clear SOREADLOCKED */ - mutex_exit(&so->so_lock); - return (0); + goto out_locked; } case T_EXDATA_IND: { dprintso(so, 1, @@ -3441,10 +3482,7 @@ retry: eprintsoline(so, error); } #endif /* SOCK_DEBUG */ - mutex_enter(&so->so_lock); - so_unlock_read(so); /* Clear SOREADLOCKED */ - mutex_exit(&so->so_lock); - return (error); + goto out; } ASSERT(mp); tpr = (union T_primitives *)mp->b_rptr; @@ -3490,11 +3528,40 @@ retry: freemsg(mp); error = EPROTO; eprintsoline(so, error); - goto err; + goto out; } /* NOTREACHED */ -err: +out: mutex_enter(&so->so_lock); +out_locked: + if (sodp != NULL) { + /* Finish any sodirect and uioa processing */ + mutex_enter(sodp->sod_lock); + if (suiop != NULL) { + /* Finish any uioa_t processing */ + int ret; + + ASSERT(uiop == (uio_t *)&sodp->sod_uioa); + ret = uioafini(suiop, (uioa_t *)uiop); + if (error == 0 && ret != 0) { + /* If no error yet, set it */ + error = ret; + } + if ((mp = sodp->sod_uioafh) != NULL) { + sodp->sod_uioafh = NULL; +
sodp->sod_uioaft = NULL; + freemsg(mp); + } + } + if (!(sodp->sod_state & SOD_WAKE_NOT)) { + /* Awoke */ + sodp->sod_state &= SOD_WAKE_CLR; + sodp->sod_state |= SOD_WAKE_NOT; + } + /* Last, clear sod_want value */ + sodp->sod_want = 0; + mutex_exit(sodp->sod_lock); + } so_unlock_read(so); /* Clear SOREADLOCKED */ mutex_exit(&so->so_lock); return (error); diff --git a/usr/src/uts/common/fs/sockfs/sockvnops.c b/usr/src/uts/common/fs/sockfs/sockvnops.c index 6c122c679d..c85a76d6e6 100644 --- a/usr/src/uts/common/fs/sockfs/sockvnops.c +++ b/usr/src/uts/common/fs/sockfs/sockvnops.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -239,6 +239,10 @@ socktpi_open(struct vnode **vpp, int flag, struct cred *cr, * udp case, when some other module is autopushed * above it, or for some reasons the expected module * isn't purely D_MP (which is the main requirement). + * + * Else, SS_DIRECT is valid. If the read-side Q has + * _QSODIRECT set and uioasync is enabled, then set + * SS_SODIRECT to enable sodirect. */ if (!socktpi_direct || !(tq->q_flag & _QDIRECT) || !(_OTHERQ(tq)->q_flag & _QDIRECT)) { @@ -255,6 +259,10 @@ socktpi_open(struct vnode **vpp, int flag, struct cred *cr, return (error); } } + } else if ((_OTHERQ(tq)->q_flag & _QSODIRECT) && + uioasync.enabled) { + /* Enable sodirect */ + so->so_state |= SS_SODIRECT; } } } else { diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h index aa5ba3a075..26e1b12f4e 100644 --- a/usr/src/uts/common/inet/tcp.h +++ b/usr/src/uts/common/inet/tcp.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -37,6 +37,7 @@ extern "C" { #include <netinet/ip6.h> #include <netinet/tcp.h> #include <sys/socket.h> +#include <sys/sodirect.h> #include <sys/multidata.h> #include <sys/md5.h> #include <inet/common.h> @@ -598,6 +599,13 @@ typedef struct tcp_s { */ boolean_t tcp_flow_stopped; + /* + * tcp_sodirect is used by tcp on the receive side to push mblk_t(s) + * directly to sockfs, and to schedule asynchronous copyout directly + * to a pending user-land uio buffer. + */ + sodirect_t *tcp_sodirect; + #ifdef DEBUG pc_t tcmp_stk[15]; #endif diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index 12b781c0bc..a729e2d066 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -66,6 +66,8 @@ const char tcp_version[] = "%Z%%M% %I% %E% SMI"; #include <sys/isa_defs.h> #include <sys/md5.h> #include <sys/random.h> +#include <sys/sodirect.h> +#include <sys/uio.h> #include <netinet/in.h> #include <netinet/tcp.h> #include <netinet/ip6.h> @@ -216,6 +218,23 @@ const char tcp_version[] = "%Z%%M% %I% %E% SMI"; * behaviour. Once tcp_issocket is unset, its never set for the * life of that connection. * + * In support of on-board asynchronous DMA hardware (e.g. Intel I/OAT) + * two consolidation private KAPIs are used to enqueue M_DATA mblk_t's + * directly to the socket (sodirect) and start an asynchronous copyout + * to a user-land receive-side buffer (uioa) when a blocking socket read + * (e.g. read, recv, ...) is pending.
+ * + * This is accomplished when tcp_issocket is set and tcp_sodirect is not + * NULL (i.e. it points to an sodirect_t); if that sodirect_t is marked + * enabled then we enqueue all mblk_t's directly to the socket. + * + * Further, if the sodirect_t's sod_uioa is marked enabled (due to a + * blocking socket read, e.g. user-land read, recv, ...) then an asynchronous + * copyout will be started directly to the user-land uio buffer. Also, as we + * have a pending read, TCP's push logic can take into account the number of + * bytes to be received and only awake the blocked read()er when the uioa_t + * byte count has been satisfied. + * * IPsec notes : * * Since a packet is always executed on the correct TCP perimeter @@ -246,6 +265,37 @@ squeue_func_t tcp_squeue_close_proc; squeue_func_t tcp_squeue_wput_proc; /* + * Macros for sodirect: + * + * SOD_PTR_ENTER(tcp, sodp) - for the tcp_t pointer "tcp" set the + * sodirect_t pointer "sodp" to the socket/tcp shared sodirect_t + * if it exists and is enabled, else to NULL. Note, in the current + * sodirect implementation the sod_lock must not be held across any + * STREAMS call (e.g. putnext) else a "recursive mutex_enter" PANIC + * will result as sod_lock is the streamhead stdata.sd_lock. + * + * SOD_NOT_ENABLED(tcp) - return true if not a sodirect tcp_t or the + * sodirect_t isn't enabled, useful for ASSERT()ing that a receive + * side tcp code path dealing with a tcp_rcv_list or putnext() isn't + * being used when sodirect code paths should be. + */ + +#define SOD_PTR_ENTER(tcp, sodp) \ + (sodp) = (tcp)->tcp_sodirect; \ + \ + if ((sodp) != NULL) { \ + mutex_enter((sodp)->sod_lock); \ + if (!((sodp)->sod_state & SOD_ENABLED)) { \ + mutex_exit((sodp)->sod_lock); \ + (sodp) = NULL; \ + } \ + } + +#define SOD_NOT_ENABLED(tcp) \ + ((tcp)->tcp_sodirect == NULL || \ + !((tcp)->tcp_sodirect->sod_state & SOD_ENABLED)) + +/* * This controls how tiny a write must be before we try to copy it * into the the mblk on the tail of the transmit queue. Not much * speedup is observed for values larger than sixteen. Zero will @@ -3808,6 +3858,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) mblk_t *mp; queue_t *q; tcp_stack_t *tcps = tcp->tcp_tcps; + sodirect_t *sodp; TCP_CLD_STAT(tag); @@ -3872,6 +3923,13 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) return (-1); } + /* If sodirect, not anymore */ + SOD_PTR_ENTER(tcp, sodp); + if (sodp != NULL) { + tcp->tcp_sodirect = NULL; + mutex_exit(sodp->sod_lock); + } + q = tcp->tcp_rq; /* Trash all inbound data */ @@ -4236,6 +4294,11 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) */ /* FALLTHRU */ default: + if (tcp->tcp_sodirect != NULL) { + /* Ok, no more sodirect */ + tcp->tcp_sodirect = NULL; + } + if (tcp->tcp_fused) tcp_unfuse(tcp); @@ -6381,6 +6444,15 @@ tcp_connect(tcp_t *tcp, mblk_t *mp) *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport; } + if (tcp->tcp_issocket) { + /* + * TCP is _D_SODIRECT and sockfs is directly above so save + * the shared sonode sodirect_t pointer (if any) to enable + * TCP sodirect.
+ */ + tcp->tcp_sodirect = SOD_QTOSODP(tcp->tcp_rq); + } + switch (tcp->tcp_state) { case TCPS_IDLE: /* @@ -8190,6 +8262,9 @@ tcp_reinit_values(tcp) ASSERT(!tcp->tcp_kssl_pending); PRESERVE(tcp->tcp_kssl_ent); + /* Sodirect */ + tcp->tcp_sodirect = NULL; + tcp->tcp_closemp_used = B_FALSE; #ifdef DEBUG @@ -8282,6 +8357,9 @@ tcp_init_values(tcp_t *tcp) tcp->tcp_fuse_rcv_unread_hiwater = 0; tcp->tcp_fuse_rcv_unread_cnt = 0; + /* Sodirect */ + tcp->tcp_sodirect = NULL; + /* Initialize the header template */ if (tcp->tcp_ipversion == IPV4_VERSION) { err = tcp_header_init_ipv4(tcp); @@ -11691,6 +11769,9 @@ tcp_rcv_drain(queue_t *q, tcp_t *tcp) if (tcp->tcp_listener != NULL) return (ret); + /* Can't be sodirect enabled */ + ASSERT(SOD_NOT_ENABLED(tcp)); + /* * Handle two cases here: we are currently fused or we were * previously fused and have some urgent data to be delivered @@ -11770,6 +11851,9 @@ tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len) ASSERT(seg_len == msgdsize(mp)); ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_rcv_last_head != NULL); + /* Can't be sodirect enabled */ + ASSERT(SOD_NOT_ENABLED(tcp)); + if (tcp->tcp_rcv_list == NULL) { ASSERT(tcp->tcp_rcv_last_head == NULL); tcp->tcp_rcv_list = mp; @@ -11790,6 +11874,216 @@ } /* + * The tcp_rcv_sod_XXX() functions enqueue data directly to the socket + * above; in addition, when uioa is enabled, they schedule an asynchronous + * uio prior to enqueuing. They implement the combined semantics of the + * tcp_rcv_XXX() functions, tcp_rcv_list push logic, and STREAMS putnext()/ + * canputnext(), i.e. flow-control with backenable. + * + * tcp_rcv_sod_wakeup() is called where tcp_rcv_drain() would be called for + * a non-sodirect connection, but as there are no tcp_rcv_list mblk_t's we + * deal with the rcv_wnd and push timer and call the sodirect wakeup function. + * + * Must be called with sodp->sod_lock held and will return with the lock + * released. + */ +static uint_t +tcp_rcv_sod_wakeup(tcp_t *tcp, sodirect_t *sodp) +{ + queue_t *q = tcp->tcp_rq; + uint_t thwin; + tcp_stack_t *tcps = tcp->tcp_tcps; + uint_t ret = 0; + + /* Can't be an eager connection */ + ASSERT(tcp->tcp_listener == NULL); + + /* Caller must have lock held */ + ASSERT(MUTEX_HELD(sodp->sod_lock)); + + /* Sodirect mode so must not be a tcp_rcv_list */ + ASSERT(tcp->tcp_rcv_list == NULL); + + if (SOD_QFULL(sodp)) { + /* Q is full, mark Q for need backenable */ + SOD_QSETBE(sodp); + } + /* Last advertised rwnd, i.e. rwnd last sent in a packet */ + thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)) + << tcp->tcp_rcv_ws; + /* This is peer's calculated send window (our available rwnd). */ + thwin -= tcp->tcp_rnxt - tcp->tcp_rack; + /* + * Increase the receive window to max. But we need to do receiver + * SWS avoidance. This means that we need to check that the increase + * of the receive window is at least 1 MSS. + */ + if (!SOD_QFULL(sodp) && (q->q_hiwat - thwin >= tcp->tcp_mss)) { + /* + * If the window that the other side knows is less than max + * deferred acks segments, send an update immediately.
+ */ + if (thwin < tcp->tcp_rack_cur_max * tcp->tcp_mss) { + BUMP_MIB(&tcps->tcps_mib, tcpOutWinUpdate); + ret = TH_ACK_NEEDED; + } + tcp->tcp_rwnd = q->q_hiwat; + } + + if (!SOD_QEMPTY(sodp)) { + /* Wakeup to socket */ + sodp->sod_state &= SOD_WAKE_CLR; + sodp->sod_state |= SOD_WAKE_DONE; + (sodp->sod_wakeup)(sodp); + /* wakeup() does the mutex_exit() */ + } else { + /* Q is empty, no need to wake */ + sodp->sod_state &= SOD_WAKE_CLR; + sodp->sod_state |= SOD_WAKE_NOT; + mutex_exit(sodp->sod_lock); + } + + /* No need for the push timer now. */ + if (tcp->tcp_push_tid != 0) { + (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid); + tcp->tcp_push_tid = 0; + } + + return (ret); +} + +/* + * Called where tcp_rcv_enqueue()/putnext(RD(q)) would be. For M_DATA + * mblk_t's, if uioa is enabled, start a uioa asynchronous copy directly + * to the user-land buffer and flag the mblk_t as such. + * + * Also, handle tcp_rwnd. + */ +uint_t +tcp_rcv_sod_enqueue(tcp_t *tcp, sodirect_t *sodp, mblk_t *mp, uint_t seg_len) +{ + uioa_t *uioap = &sodp->sod_uioa; + boolean_t qfull; + uint_t thwin; + + /* Can't be an eager connection */ + ASSERT(tcp->tcp_listener == NULL); + + /* Caller must have lock held */ + ASSERT(MUTEX_HELD(sodp->sod_lock)); + + /* Sodirect mode so must not be a tcp_rcv_list */ + ASSERT(tcp->tcp_rcv_list == NULL); + + /* Passed in segment length must be equal to mblk_t chain data size */ + ASSERT(seg_len == msgdsize(mp)); + + if (DB_TYPE(mp) != M_DATA) { + /* Only process M_DATA mblk_t's */ + goto enq; + } + if (uioap->uioa_state & UIOA_ENABLED) { + /* Uioa is enabled */ + mblk_t *mp1 = mp; + + if (seg_len > uioap->uio_resid) { + /* + * There isn't enough uio space for the mblk_t chain, + * so disable uioa such that this and any additional + * mblk_t data is handled by the socket, and schedule + * the socket for wakeup to finish this uioa. + */ + uioap->uioa_state &= UIOA_CLR; + uioap->uioa_state |= UIOA_FINI; + if (sodp->sod_state & SOD_WAKE_NOT) { + sodp->sod_state &= SOD_WAKE_CLR; + sodp->sod_state |= SOD_WAKE_NEED; + } + goto enq; + } + do { + uint32_t len = MBLKL(mp1); + + if (!uioamove(mp1->b_rptr, len, UIO_READ, uioap)) { + /* Scheduled, mark dblk_t as such */ + DB_FLAGS(mp1) |= DBLK_UIOA; + } else { + /* Error, turn off async processing */ + uioap->uioa_state &= UIOA_CLR; + uioap->uioa_state |= UIOA_FINI; + break; + } + } while ((mp1 = mp1->b_cont) != NULL); + + if (mp1 != NULL || uioap->uio_resid == 0) { + /* + * Not all mblk_t(s) uioamoved (error) or all uio + * space has been consumed, so schedule the socket + * for wakeup to finish this uio. + */ + sodp->sod_state &= SOD_WAKE_CLR; + sodp->sod_state |= SOD_WAKE_NEED; + } + } else if (uioap->uioa_state & UIOA_FINI) { + /* + * Post UIO_ENABLED waiting for socket to finish processing + * so just enqueue and update tcp_rwnd. + */ + if (SOD_QFULL(sodp)) + tcp->tcp_rwnd -= seg_len; + } else if (sodp->sod_want > 0) { + /* + * Uioa isn't enabled but sodirect has a pending read(). + */ + if (SOD_QCNT(sodp) + seg_len >= sodp->sod_want) { + if (sodp->sod_state & SOD_WAKE_NOT) { + /* Schedule socket for wakeup */ + sodp->sod_state &= SOD_WAKE_CLR; + sodp->sod_state |= SOD_WAKE_NEED; + } + tcp->tcp_rwnd -= seg_len; + } + } else if (SOD_QCNT(sodp) + seg_len >= tcp->tcp_rq->q_hiwat >> 3) { + /* + * No pending sodirect read() so use the default + * TCP push logic to guess that a push is needed.
+ */ + if (sodp->sod_state & SOD_WAKE_NOT) { + /* Schedule socket for wakeup */ + sodp->sod_state &= SOD_WAKE_CLR; + sodp->sod_state |= SOD_WAKE_NEED; + } + tcp->tcp_rwnd -= seg_len; + } else { + /* Just update tcp_rwnd */ + tcp->tcp_rwnd -= seg_len; + } +enq: + qfull = SOD_QFULL(sodp); + + (sodp->sod_enqueue)(sodp, mp); + + if (! qfull && SOD_QFULL(sodp)) { + /* Wasn't QFULL, now QFULL, need back-enable */ + SOD_QSETBE(sodp); + } + + /* + * Check to see if remote avail swnd < mss due to delayed ACK, + * first get advertised rwnd. + */ + thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)); + /* Minus delayed ACK count */ + thwin -= tcp->tcp_rnxt - tcp->tcp_rack; + if (thwin < tcp->tcp_mss) { + /* Remote avail swnd < mss, need ACK now */ + return (TH_ACK_NEEDED); + } + + return (0); +} + +/* * DEFAULT TCP ENTRY POINT via squeue on READ side. * * This is the default entry function into TCP on the read side. TCP is @@ -14987,13 +15281,39 @@ est: tcp_rcv_enqueue(tcp, mp, seg_len); } } else { + sodirect_t *sodp = tcp->tcp_sodirect; + + /* + * If an sodirect connection and an enabled sodirect_t then + * sodp will be set to point to the tcp_t/sonode_t shared + * sodirect_t and the sodirect_t's lock will be held. + */ + if (sodp != NULL) { + mutex_enter(sodp->sod_lock); + if (!(sodp->sod_state & SOD_ENABLED)) { + mutex_exit(sodp->sod_lock); + sodp = NULL; + } else if (tcp->tcp_kssl_ctx != NULL && + DB_TYPE(mp) == M_DATA) { + mutex_exit(sodp->sod_lock); + sodp = NULL; + } + } if (mp->b_datap->db_type != M_DATA || (flags & TH_MARKNEXT_NEEDED)) { - if (tcp->tcp_rcv_list != NULL) { + if (sodp != NULL) { + if (!SOD_QEMPTY(sodp) && + (sodp->sod_state & SOD_WAKE_NOT)) { + flags |= tcp_rcv_sod_wakeup(tcp, sodp); + /* sod_wakeup() did the mutex_exit() */ + mutex_enter(sodp->sod_lock); + } + } else if (tcp->tcp_rcv_list != NULL) { flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); } ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); + if (flags & TH_MARKNEXT_NEEDED) { #ifdef DEBUG (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, @@ -15011,11 +15331,40 @@ est: DTRACE_PROBE1(kssl_mblk__ksslinput_data1, mblk_t *, mp); tcp_kssl_input(tcp, mp); + } else if (sodp) { + flags |= tcp_rcv_sod_enqueue( + tcp, sodp, mp, seg_len); + flags |= tcp_rcv_sod_wakeup(tcp, sodp); + /* sod_wakeup() did the mutex_exit() */ } else { putnext(tcp->tcp_rq, mp); if (!canputnext(tcp->tcp_rq)) tcp->tcp_rwnd -= seg_len; } + } else if ((tcp->tcp_kssl_ctx != NULL) && + (DB_TYPE(mp) == M_DATA)) { + /* Do SSL processing first */ + DTRACE_PROBE1(kssl_mblk__ksslinput_data2, + mblk_t *, mp); + tcp_kssl_input(tcp, mp); + } else if (sodp != NULL) { + /* + * Sodirect so all mblk_t's are queued on the + * socket directly, check for wakeup of blocked + * reader (if any), and last if flow-controled. + */ + flags |= tcp_rcv_sod_enqueue(tcp, sodp, mp, seg_len); + if ((sodp->sod_state & SOD_WAKE_NEED) || + (flags & (TH_PUSH|TH_FIN))) { + flags |= tcp_rcv_sod_wakeup(tcp, sodp); + /* sod_wakeup() did the mutex_exit() */ + } else { + if (SOD_QFULL(sodp)) { + /* Q is full, need backenable */ + SOD_QSETBE(sodp); + } + mutex_exit(sodp->sod_lock); + } } else if ((flags & (TH_PUSH|TH_FIN)) || tcp->tcp_rcv_cnt + seg_len >= tcp->tcp_rq->q_hiwat >> 3) { if (tcp->tcp_rcv_list != NULL) { @@ -15035,41 +15384,33 @@ est: tcp_rcv_enqueue(tcp, mp, seg_len); flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); } else { - /* Does this need SSL processing first? 
*/ - if ((tcp->tcp_kssl_ctx != NULL) && - (DB_TYPE(mp) == M_DATA)) { - DTRACE_PROBE1( - kssl_mblk__ksslinput_data2, - mblk_t *, mp); - tcp_kssl_input(tcp, mp); - } else { - putnext(tcp->tcp_rq, mp); - if (!canputnext(tcp->tcp_rq)) - tcp->tcp_rwnd -= seg_len; - } + putnext(tcp->tcp_rq, mp); + if (!canputnext(tcp->tcp_rq)) + tcp->tcp_rwnd -= seg_len; } } else { /* * Enqueue all packets when processing an mblk * from the co queue and also enqueue normal packets. - * For packets which belong to SSL stream do SSL - * processing first. */ - if ((tcp->tcp_kssl_ctx != NULL) && - (DB_TYPE(mp) == M_DATA)) { - DTRACE_PROBE1(kssl_mblk__tcpksslin3, - mblk_t *, mp); - tcp_kssl_input(tcp, mp); - } else { - tcp_rcv_enqueue(tcp, mp, seg_len); - } + tcp_rcv_enqueue(tcp, mp, seg_len); } /* * Make sure the timer is running if we have data waiting * for a push bit. This provides resiliency against * implementations that do not correctly generate push bits. + * + * Note, for sodirect if Q isn't empty and there's not a + * pending wakeup then we need a timer. Also note that sodp + * is assumed to be still valid after exit()ing the sod_lock + * above and while the SOD state can change it can only change + * such that the Q is empty now even though data was added + * above. */ - if (tcp->tcp_rcv_list != NULL && tcp->tcp_push_tid == 0) { + if (((sodp != NULL && !SOD_QEMPTY(sodp) && + (sodp->sod_state & SOD_WAKE_NOT)) || + (sodp == NULL && tcp->tcp_rcv_list != NULL)) && + tcp->tcp_push_tid == 0) { /* * The connection may be closed at this point, so don't * do anything for a detached tcp. @@ -15081,6 +15422,7 @@ est: tcps->tcps_push_timer_interval)); } } + xmit_check: /* Is there anything left to do? */ ASSERT(!(flags & TH_MARKNEXT_NEEDED)); @@ -15156,13 +15498,27 @@ ack_check: /* * Send up any queued data and then send the mark message */ - if (tcp->tcp_rcv_list != NULL) { - flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); - } - ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); + sodirect_t *sodp; + + SOD_PTR_ENTER(tcp, sodp); mp1 = tcp->tcp_urp_mark_mp; tcp->tcp_urp_mark_mp = NULL; + if (sodp != NULL) { + + ASSERT(tcp->tcp_rcv_list == NULL); + + flags |= tcp_rcv_sod_enqueue(tcp, sodp, mp1, 0); + flags |= tcp_rcv_sod_wakeup(tcp, sodp); + /* sod_wakeup() does the mutex_exit() */ + } else if (tcp->tcp_rcv_list != NULL) { + flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); + + ASSERT(tcp->tcp_rcv_list == NULL || + tcp->tcp_fused_sigurg); + + putnext(tcp->tcp_rq, mp1); + } #ifdef DEBUG (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_rput: sending zero-length %s %s", @@ -15170,7 +15526,6 @@ ack_check: "MSGNOTMARKNEXT"), tcp_display(tcp, NULL, DISP_PORT_ONLY)); #endif /* DEBUG */ - putnext(tcp->tcp_rq, mp1); flags &= ~TH_SEND_URP_MARK; } if (flags & TH_ACK_NEEDED) { @@ -15208,14 +15563,32 @@ ack_check: * In the eager case tcp_rsrv will do this when run * after tcp_accept is done. */ + sodirect_t *sodp; + ASSERT(tcp->tcp_listener == NULL); - if (tcp->tcp_rcv_list != NULL) { + + SOD_PTR_ENTER(tcp, sodp); + if (sodp != NULL) { + /* No more sodirect */ + tcp->tcp_sodirect = NULL; + if (!SOD_QEMPTY(sodp)) { + /* Mblk(s) to process, notify */ + flags |= tcp_rcv_sod_wakeup(tcp, sodp); + /* sod_wakeup() does the mutex_exit() */ + } else { + /* Nothing to process */ + mutex_exit(sodp->sod_lock); + } + } else if (tcp->tcp_rcv_list != NULL) { /* * Push any mblk(s) enqueued from co processing. 
*/ flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); + + ASSERT(tcp->tcp_rcv_list == NULL || + tcp->tcp_fused_sigurg); } - ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); + if ((mp1 = mi_tpi_ordrel_ind()) != NULL) { tcp->tcp_ordrel_done = B_TRUE; putnext(tcp->tcp_rq, mp1); @@ -15985,6 +16358,8 @@ tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2) queue_t *q = tcp->tcp_rq; uint_t thwin; tcp_stack_t *tcps = tcp->tcp_tcps; + sodirect_t *sodp; + boolean_t fc; freeb(mp); @@ -16035,7 +16410,27 @@ tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2) return; } - if (canputnext(q)) { + SOD_PTR_ENTER(tcp, sodp); + if (sodp != NULL) { + /* An sodirect connection */ + if (SOD_QFULL(sodp)) { + /* Flow-controlled, need another back-enable */ + fc = B_TRUE; + SOD_QSETBE(sodp); + } else { + /* Not flow-controlled */ + fc = B_FALSE; + } + mutex_exit(sodp->sod_lock); + } else if (canputnext(q)) { + /* STREAMS, not flow-controlled */ + fc = B_FALSE; + } else { + /* STREAMS, flow-controlled */ + fc = B_TRUE; + } + if (!fc) { + /* Not flow-controlled, open rwnd */ tcp->tcp_rwnd = q->q_hiwat; thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)) << tcp->tcp_rcv_ws; @@ -16054,13 +16449,32 @@ tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2) BUMP_MIB(&tcps->tcps_mib, tcpOutWinUpdate); } } + /* Handle a failure to allocate a T_ORDREL_IND here */ if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) { ASSERT(tcp->tcp_listener == NULL); - if (tcp->tcp_rcv_list != NULL) { - (void) tcp_rcv_drain(q, tcp); + + SOD_PTR_ENTER(tcp, sodp); + if (sodp != NULL) { + /* No more sodirect */ + tcp->tcp_sodirect = NULL; + if (!SOD_QEMPTY(sodp)) { + /* Notify mblk(s) to process */ + (void) tcp_rcv_sod_wakeup(tcp, sodp); + /* sod_wakeup() does the mutex_exit() */ + } else { + /* Nothing to process */ + mutex_exit(sodp->sod_lock); + } + } else if (tcp->tcp_rcv_list != NULL) { + /* + * Push any mblk(s) enqueued from co processing. + */ + (void) tcp_rcv_drain(tcp->tcp_rq, tcp); + ASSERT(tcp->tcp_rcv_list == NULL || + tcp->tcp_fused_sigurg); } - ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); + mp = mi_tpi_ordrel_ind(); if (mp) { tcp->tcp_ordrel_done = B_TRUE; @@ -18108,6 +18522,8 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) */ if (tcp->tcp_rcv_list != NULL) { /* We drain directly in case of fused tcp loopback */ + sodirect_t *sodp; + if (!tcp->tcp_fused && canputnext(q)) { tcp->tcp_rwnd = q->q_hiwat; thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)) @@ -18123,7 +18539,26 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) } } - (void) tcp_rcv_drain(q, tcp); + + SOD_PTR_ENTER(tcp, sodp); + if (sodp != NULL) { + /* Sodirect, move from rcv_list */ + ASSERT(!tcp->tcp_fused); + while ((mp = tcp->tcp_rcv_list) != NULL) { + tcp->tcp_rcv_list = mp->b_next; + mp->b_next = NULL; + (void) tcp_rcv_sod_enqueue(tcp, sodp, mp, + msgdsize(mp)); + } + tcp->tcp_rcv_last_head = NULL; + tcp->tcp_rcv_last_tail = NULL; + tcp->tcp_rcv_cnt = 0; + (void) tcp_rcv_sod_wakeup(tcp, sodp); + /* sod_wakeup() did the mutex_exit() */ + } else { + /* Not sodirect, drain */ + (void) tcp_rcv_drain(q, tcp); + } /* * For fused tcp loopback, back-enable peer endpoint @@ -18315,6 +18750,21 @@ tcp_wput_accept(queue_t *q, mblk_t *mp) listener = eager->tcp_listener; eager->tcp_issocket = B_TRUE; + /* + * TCP is _D_SODIRECT and sockfs is directly above so + * save shared sodirect_t pointer (if any). + * + * If tcp_fused and sodirect enabled disable it. 
+ */ + eager->tcp_sodirect = SOD_QTOSODP(eager->tcp_rq); + if (eager->tcp_fused && eager->tcp_sodirect != NULL) { + /* Fused, disable sodirect */ + mutex_enter(eager->tcp_sodirect->sod_lock); + SOD_DISABLE(eager->tcp_sodirect); + mutex_exit(eager->tcp_sodirect->sod_lock); + eager->tcp_sodirect = NULL; + } + econnp->conn_zoneid = listener->tcp_connp->conn_zoneid; econnp->conn_allzones = listener->tcp_connp->conn_allzones; ASSERT(econnp->conn_netstack == @@ -22101,6 +22551,7 @@ tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2) tcp_fuse_disable_pair(tcp, B_FALSE); } tcp->tcp_issocket = B_FALSE; + tcp->tcp_sodirect = NULL; TCP_STAT(tcps, tcp_sock_fallback); DB_TYPE(mp) = M_IOCACK; @@ -23383,6 +23834,8 @@ tcp_push_timer(void *arg) conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; tcp_stack_t *tcps = tcp->tcp_tcps; + uint_t flags = 0; + sodirect_t *sodp; TCP_DBGSTAT(tcps, tcp_push_timer_cnt); @@ -23394,9 +23847,17 @@ */ TCP_FUSE_SYNCSTR_PLUG_DRAIN(tcp); tcp->tcp_push_tid = 0; - if ((tcp->tcp_rcv_list != NULL) && - (tcp_rcv_drain(tcp->tcp_rq, tcp) == TH_ACK_NEEDED)) + + SOD_PTR_ENTER(tcp, sodp); + if (sodp != NULL) { + flags = tcp_rcv_sod_wakeup(tcp, sodp); + /* sod_wakeup() does the mutex_exit() */ + } else if (tcp->tcp_rcv_list != NULL) { + flags = tcp_rcv_drain(tcp->tcp_rq, tcp); + } + if (flags == TH_ACK_NEEDED) tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); + TCP_FUSE_SYNCSTR_UNPLUG_DRAIN(tcp); } diff --git a/usr/src/uts/common/inet/tcp/tcp6ddi.c b/usr/src/uts/common/inet/tcp/tcp6ddi.c index e724bdd022..1eda50d9a6 100644 --- a/usr/src/uts/common/inet/tcp/tcp6ddi.c +++ b/usr/src/uts/common/inet/tcp/tcp6ddi.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -40,7 +40,7 @@ * for TCP Fusion (loopback); this is why we don't define * D_SYNCSTR here. */ -#define INET_DEVMTFLAGS (D_MP|_D_DIRECT) +#define INET_DEVMTFLAGS (D_MP|_D_DIRECT|_D_SODIRECT) #include "../inetddi.c" diff --git a/usr/src/uts/common/inet/tcp/tcp_fusion.c b/usr/src/uts/common/inet/tcp/tcp_fusion.c index 2503a13e29..75851ac1f7 100644 --- a/usr/src/uts/common/inet/tcp/tcp_fusion.c +++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c @@ -287,6 +287,15 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph) if ((mp = allocb(sizeof (*stropt), BPRI_HI)) == NULL) goto failed; + /* If peer sodirect enabled then disable */ + ASSERT(tcp->tcp_sodirect == NULL); + if (peer_tcp->tcp_sodirect != NULL) { + mutex_enter(peer_tcp->tcp_sodirect->sod_lock); + SOD_DISABLE(peer_tcp->tcp_sodirect); + mutex_exit(peer_tcp->tcp_sodirect->sod_lock); + peer_tcp->tcp_sodirect = NULL; + } + /* Fuse both endpoints */ peer_tcp->tcp_loopback_peer = tcp; tcp->tcp_loopback_peer = peer_tcp; diff --git a/usr/src/uts/common/inet/tcp/tcpddi.c b/usr/src/uts/common/inet/tcp/tcpddi.c index 436786b846..dc08ad23c4 100644 --- a/usr/src/uts/common/inet/tcp/tcpddi.c +++ b/usr/src/uts/common/inet/tcp/tcpddi.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -44,7 +44,7 @@ * for TCP Fusion (loopback); this is why we don't define * D_SYNCSTR here.
*/ -#define INET_DEVMTFLAGS (D_MP|_D_DIRECT) +#define INET_DEVMTFLAGS (D_MP|_D_DIRECT|_D_SODIRECT) #include "../inetddi.c" diff --git a/usr/src/uts/common/io/dcopy.c b/usr/src/uts/common/io/dcopy.c new file mode 100644 index 0000000000..2dc5a311bc --- /dev/null +++ b/usr/src/uts/common/io/dcopy.c @@ -0,0 +1,932 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * dcopy.c + * dcopy misc module + */ + +#include <sys/conf.h> +#include <sys/kmem.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/modctl.h> +#include <sys/sysmacros.h> +#include <sys/atomic.h> + + +#include <sys/dcopy.h> +#include <sys/dcopy_device.h> + + +/* Number of entries per channel to allocate */ +uint_t dcopy_channel_size = 1024; + + +typedef struct dcopy_list_s { + list_t dl_list; + kmutex_t dl_mutex; + uint_t dl_cnt; /* num entries on list */ +} dcopy_list_t; + +/* device state for register/unregister */ +struct dcopy_device_s { + /* DMA device driver's private pointer */ + void *dc_device_private; + + /* to track list of channels from this DMA device */ + dcopy_list_t dc_devchan_list; + list_node_t dc_device_list_node; + + /* + * dc_removing_cnt tracks how many channels still have to be freed up + * before it's safe to allow the DMA device driver to detach. + */ + uint_t dc_removing_cnt; + dcopy_device_cb_t *dc_cb; + + dcopy_device_info_t dc_info; + +}; + +typedef struct dcopy_stats_s { + kstat_named_t cs_bytes_xfer; + kstat_named_t cs_cmd_alloc; + kstat_named_t cs_cmd_post; + kstat_named_t cs_cmd_poll; + kstat_named_t cs_notify_poll; + kstat_named_t cs_notify_pending; + kstat_named_t cs_id; + kstat_named_t cs_capabilities; +} dcopy_stats_t; + +/* DMA channel state */ +struct dcopy_channel_s { + /* DMA driver channel private pointer */ + void *ch_channel_private; + + /* shortcut to device callbacks */ + dcopy_device_cb_t *ch_cb; + + /* + * number of outstanding allocs for this channel. Used to track when + * it's safe to free up this channel so the DMA device driver can + * detach. + */ + uint64_t ch_ref_cnt; + + /* state for whether the channel needs to be removed when ch_ref_cnt gets to 0 */ + boolean_t ch_removing; + + list_node_t ch_devchan_list_node; + list_node_t ch_globalchan_list_node; + + /* + * per channel list of commands actively blocking waiting for + * completion.
+ */ + dcopy_list_t ch_poll_list; + + /* pointer back to our device */ + struct dcopy_device_s *ch_device; + + dcopy_query_channel_t ch_info; + + kstat_t *ch_kstat; + dcopy_stats_t ch_stat; +}; + +/* + * If grabbing both device_list mutex & globalchan_list mutex, + * always grab globalchan_list mutex before device_list mutex + */ +typedef struct dcopy_state_s { + dcopy_list_t d_device_list; + dcopy_list_t d_globalchan_list; +} dcopy_state_t; +dcopy_state_t *dcopy_statep; + + +/* Module Driver Info */ +static struct modlmisc dcopy_modlmisc = { + &mod_miscops, + "dcopy kernel module" +}; + +/* Module Linkage */ +static struct modlinkage dcopy_modlinkage = { + MODREV_1, + &dcopy_modlmisc, + NULL +}; + +static int dcopy_init(); +static void dcopy_fini(); + +static int dcopy_list_init(dcopy_list_t *list, size_t node_size, + offset_t link_offset); +static void dcopy_list_fini(dcopy_list_t *list); +static void dcopy_list_push(dcopy_list_t *list, void *list_node); +static void *dcopy_list_pop(dcopy_list_t *list); + +static void dcopy_device_cleanup(dcopy_device_handle_t device, + boolean_t do_callback); + +static int dcopy_stats_init(dcopy_handle_t channel); +static void dcopy_stats_fini(dcopy_handle_t channel); + + +/* + * _init() + */ +int +_init() +{ + int e; + + e = dcopy_init(); + if (e != 0) { + return (e); + } + + return (mod_install(&dcopy_modlinkage)); +} + + +/* + * _info() + */ +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&dcopy_modlinkage, modinfop)); +} + + +/* + * _fini() + */ +int +_fini() +{ + int e; + + e = mod_remove(&dcopy_modlinkage); + if (e != 0) { + return (e); + } + + dcopy_fini(); + + return (e); +} + +/* + * dcopy_init() + */ +static int +dcopy_init() +{ + int e; + + + dcopy_statep = kmem_zalloc(sizeof (*dcopy_statep), KM_SLEEP); + + /* Initialize the list we use to track device register/unregister */ + e = dcopy_list_init(&dcopy_statep->d_device_list, + sizeof (struct dcopy_device_s), + offsetof(struct dcopy_device_s, dc_device_list_node)); + if (e != DCOPY_SUCCESS) { + goto dcopyinitfail_device; + } + + /* Initialize the list we use to track all DMA channels */ + e = dcopy_list_init(&dcopy_statep->d_globalchan_list, + sizeof (struct dcopy_channel_s), + offsetof(struct dcopy_channel_s, ch_globalchan_list_node)); + if (e != DCOPY_SUCCESS) { + goto dcopyinitfail_global; + } + + return (0); + +dcopyinitfail_cback: + dcopy_list_fini(&dcopy_statep->d_globalchan_list); +dcopyinitfail_global: + dcopy_list_fini(&dcopy_statep->d_device_list); +dcopyinitfail_device: + kmem_free(dcopy_statep, sizeof (*dcopy_statep)); + + return (-1); +} + + +/* + * dcopy_fini() + */ +static void +dcopy_fini() +{ + /* + * if mod_remove was successful, we shouldn't have any + * devices/channels to worry about. + */ + ASSERT(list_head(&dcopy_statep->d_globalchan_list.dl_list) == NULL); + ASSERT(list_head(&dcopy_statep->d_device_list.dl_list) == NULL); + + dcopy_list_fini(&dcopy_statep->d_globalchan_list); + dcopy_list_fini(&dcopy_statep->d_device_list); + kmem_free(dcopy_statep, sizeof (*dcopy_statep)); +} + + +/* *** EXTERNAL INTERFACE *** */ +/* + * dcopy_query() + */ +void +dcopy_query(dcopy_query_t *query) +{ + query->dq_version = DCOPY_QUERY_V0; + query->dq_num_channels = dcopy_statep->d_globalchan_list.dl_cnt; +} + + +/* + * dcopy_alloc() + */ +/*ARGSUSED*/ +int +dcopy_alloc(int flags, dcopy_handle_t *handle) +{ + dcopy_handle_t channel; + dcopy_list_t *list; + + + /* + * we don't use the dcopy_list_* code here because we need to do + * some non-standard stuff.
+ */ + + list = &dcopy_statep->d_globalchan_list; + + /* + * if nothing is on the channel list, return DCOPY_NORESOURCES. This + * can happen if there aren't any DMA devices registered. + */ + mutex_enter(&list->dl_mutex); + channel = list_head(&list->dl_list); + if (channel == NULL) { + mutex_exit(&list->dl_mutex); + return (DCOPY_NORESOURCES); + } + + /* + * increment the reference count, and pop the channel off the head and + * push it on the tail. This ensures we rotate through the channels. + * DMA channels are shared. + */ + channel->ch_ref_cnt++; + list_remove(&list->dl_list, channel); + list_insert_tail(&list->dl_list, channel); + mutex_exit(&list->dl_mutex); + + *handle = (dcopy_handle_t)channel; + return (DCOPY_SUCCESS); +} + + +/* + * dcopy_free() + */ +void +dcopy_free(dcopy_handle_t *channel) +{ + dcopy_device_handle_t device; + dcopy_list_t *list; + boolean_t cleanup; + + + ASSERT(*channel != NULL); + + /* + * we don't need to add the channel back to the list since we never + * removed it. decrement the reference count. + */ + list = &dcopy_statep->d_globalchan_list; + mutex_enter(&list->dl_mutex); + (*channel)->ch_ref_cnt--; + + /* + * if we need to remove this channel, and the reference count is down + * to 0, decrement the number of channels which still need to be + * removed on the device. + */ + cleanup = B_FALSE; + if ((*channel)->ch_removing && ((*channel)->ch_ref_cnt == 0)) { + device = (*channel)->ch_device; + mutex_enter(&device->dc_devchan_list.dl_mutex); + device->dc_removing_cnt--; + if (device->dc_removing_cnt == 0) { + cleanup = B_TRUE; + } + mutex_exit(&device->dc_devchan_list.dl_mutex); + } + mutex_exit(&list->dl_mutex); + + /* + * if there are no channels which still need to be removed, clean up + * the device state and call back into the DMA device driver to tell + * them the device is free. + */ + if (cleanup) { + dcopy_device_cleanup(device, B_TRUE); + } + + *channel = NULL; +} + + +/* + * dcopy_query_channel() + */ +void +dcopy_query_channel(dcopy_handle_t channel, dcopy_query_channel_t *query) +{ + *query = channel->ch_info; +} + + +/* + * dcopy_cmd_alloc() + */ +int +dcopy_cmd_alloc(dcopy_handle_t handle, int flags, dcopy_cmd_t *cmd) +{ + dcopy_handle_t channel; + dcopy_cmd_priv_t priv; + int e; + + + channel = handle; + + atomic_inc_64(&channel->ch_stat.cs_cmd_alloc.value.ui64); + e = channel->ch_cb->cb_cmd_alloc(channel->ch_channel_private, flags, + cmd); + if (e == DCOPY_SUCCESS) { + priv = (*cmd)->dp_private; + priv->pr_channel = channel; + /* + * we won't initialize the blocking state until we actually + * need to block.
+ */ + priv->pr_block_init = B_FALSE; + } + + return (e); +} + + +/* + * dcopy_cmd_free() + */ +void +dcopy_cmd_free(dcopy_cmd_t *cmd) +{ + dcopy_handle_t channel; + dcopy_cmd_priv_t priv; + + + ASSERT(*cmd != NULL); + + priv = (*cmd)->dp_private; + channel = priv->pr_channel; + + /* if we initialized the blocking state, clean it up too */ + if (priv->pr_block_init) { + cv_destroy(&priv->pr_cv); + mutex_destroy(&priv->pr_mutex); + } + + channel->ch_cb->cb_cmd_free(channel->ch_channel_private, cmd); +} + + +/* + * dcopy_cmd_post() + */ +int +dcopy_cmd_post(dcopy_cmd_t cmd) +{ + dcopy_handle_t channel; + int e; + + + channel = cmd->dp_private->pr_channel; + + atomic_inc_64(&channel->ch_stat.cs_cmd_post.value.ui64); + if (cmd->dp_cmd == DCOPY_CMD_COPY) { + atomic_add_64(&channel->ch_stat.cs_bytes_xfer.value.ui64, + cmd->dp.copy.cc_size); + } + e = channel->ch_cb->cb_cmd_post(channel->ch_channel_private, cmd); + if (e != DCOPY_SUCCESS) { + return (e); + } + + return (DCOPY_SUCCESS); +} + + +/* + * dcopy_cmd_poll() + */ +int +dcopy_cmd_poll(dcopy_cmd_t cmd, int flags) +{ + dcopy_handle_t channel; + dcopy_cmd_priv_t priv; + int e; + + + priv = cmd->dp_private; + channel = priv->pr_channel; + + /* + * if the caller is trying to block, they needed to post the + * command with DCOPY_CMD_INTR set. + */ + if ((flags & DCOPY_POLL_BLOCK) && !(cmd->dp_flags & DCOPY_CMD_INTR)) { + return (DCOPY_FAILURE); + } + + atomic_inc_64(&channel->ch_stat.cs_cmd_poll.value.ui64); + +repoll: + e = channel->ch_cb->cb_cmd_poll(channel->ch_channel_private, cmd); + if (e == DCOPY_PENDING) { + /* + * if the command is still active, and the blocking flag + * is set. + */ + if (flags & DCOPY_POLL_BLOCK) { + + /* + * if we haven't initialized the state, do it now. A + * command can be re-used, so it's possible it's + * already been initialized. + */ + if (!priv->pr_block_init) { + priv->pr_block_init = B_TRUE; + mutex_init(&priv->pr_mutex, NULL, MUTEX_DRIVER, + NULL); + cv_init(&priv->pr_cv, NULL, CV_DRIVER, NULL); + priv->pr_cmd = cmd; + } + + /* push it on the list for blocking commands */ + priv->pr_wait = B_TRUE; + dcopy_list_push(&channel->ch_poll_list, priv); + + mutex_enter(&priv->pr_mutex); + /* + * it's possible we already cleared pr_wait before we + * grabbed the mutex. + */ + if (priv->pr_wait) { + cv_wait(&priv->pr_cv, &priv->pr_mutex); + } + mutex_exit(&priv->pr_mutex); + + /* + * the command has completed, go back and poll so we + * get the status. 
+ */ + goto repoll; + } + } + + return (e); +} + +/* *** END OF EXTERNAL INTERFACE *** */ + +/* + * dcopy_list_init() + */ +static int +dcopy_list_init(dcopy_list_t *list, size_t node_size, offset_t link_offset) +{ + mutex_init(&list->dl_mutex, NULL, MUTEX_DRIVER, NULL); + list_create(&list->dl_list, node_size, link_offset); + list->dl_cnt = 0; + + return (DCOPY_SUCCESS); +} + + +/* + * dcopy_list_fini() + */ +static void +dcopy_list_fini(dcopy_list_t *list) +{ + list_destroy(&list->dl_list); + mutex_destroy(&list->dl_mutex); +} + + +/* + * dcopy_list_push() + */ +static void +dcopy_list_push(dcopy_list_t *list, void *list_node) +{ + mutex_enter(&list->dl_mutex); + list_insert_tail(&list->dl_list, list_node); + list->dl_cnt++; + mutex_exit(&list->dl_mutex); +} + + +/* + * dcopy_list_pop() + */ +static void * +dcopy_list_pop(dcopy_list_t *list) +{ + list_node_t *list_node; + + mutex_enter(&list->dl_mutex); + list_node = list_head(&list->dl_list); + if (list_node == NULL) { + mutex_exit(&list->dl_mutex); + return (list_node); + } + list->dl_cnt--; + list_remove(&list->dl_list, list_node); + mutex_exit(&list->dl_mutex); + + return (list_node); +} + + +/* *** DEVICE INTERFACE *** */ +/* + * dcopy_device_register() + */ +int +dcopy_device_register(void *device_private, dcopy_device_info_t *info, + dcopy_device_handle_t *handle) +{ + struct dcopy_channel_s *channel; + struct dcopy_device_s *device; + int e; + int i; + + + /* initialize the per device state */ + device = kmem_zalloc(sizeof (*device), KM_SLEEP); + device->dc_device_private = device_private; + device->dc_info = *info; + device->dc_removing_cnt = 0; + device->dc_cb = info->di_cb; + + /* + * we have a per device channel list so we can remove a device in the + * future. + */ + e = dcopy_list_init(&device->dc_devchan_list, + sizeof (struct dcopy_channel_s), + offsetof(struct dcopy_channel_s, ch_devchan_list_node)); + if (e != DCOPY_SUCCESS) { + goto registerfail_devchan; + } + + /* + * allocate state for each channel, allocate the channel, and then add + * the device's dma channels to the device's channel list.
+ */ + for (i = 0; i < info->di_num_dma; i++) { + channel = kmem_zalloc(sizeof (*channel), KM_SLEEP); + channel->ch_device = device; + channel->ch_removing = B_FALSE; + channel->ch_ref_cnt = 0; + channel->ch_cb = info->di_cb; + + e = info->di_cb->cb_channel_alloc(device_private, channel, + DCOPY_SLEEP, dcopy_channel_size, &channel->ch_info, + &channel->ch_channel_private); + if (e != DCOPY_SUCCESS) { + kmem_free(channel, sizeof (*channel)); + goto registerfail_alloc; + } + + e = dcopy_stats_init(channel); + if (e != DCOPY_SUCCESS) { + info->di_cb->cb_channel_free( + &channel->ch_channel_private); + kmem_free(channel, sizeof (*channel)); + goto registerfail_alloc; + } + + e = dcopy_list_init(&channel->ch_poll_list, + sizeof (struct dcopy_cmd_priv_s), + offsetof(struct dcopy_cmd_priv_s, pr_poll_list_node)); + if (e != DCOPY_SUCCESS) { + dcopy_stats_fini(channel); + info->di_cb->cb_channel_free( + &channel->ch_channel_private); + kmem_free(channel, sizeof (*channel)); + goto registerfail_alloc; + } + + dcopy_list_push(&device->dc_devchan_list, channel); + } + + /* add the device to device list */ + dcopy_list_push(&dcopy_statep->d_device_list, device); + + /* + * add the device's dma channels to the global channel list (where + * dcopy_alloc's come from) + */ + mutex_enter(&dcopy_statep->d_globalchan_list.dl_mutex); + mutex_enter(&dcopy_statep->d_device_list.dl_mutex); + channel = list_head(&device->dc_devchan_list.dl_list); + while (channel != NULL) { + list_insert_tail(&dcopy_statep->d_globalchan_list.dl_list, + channel); + dcopy_statep->d_globalchan_list.dl_cnt++; + channel = list_next(&device->dc_devchan_list.dl_list, channel); + } + mutex_exit(&dcopy_statep->d_device_list.dl_mutex); + mutex_exit(&dcopy_statep->d_globalchan_list.dl_mutex); + + *handle = device; + return (DCOPY_SUCCESS); + +registerfail_alloc: + channel = list_head(&device->dc_devchan_list.dl_list); + while (channel != NULL) { + /* remove from the list */ + channel = dcopy_list_pop(&device->dc_devchan_list); + ASSERT(channel != NULL); + + dcopy_list_fini(&channel->ch_poll_list); + dcopy_stats_fini(channel); + info->di_cb->cb_channel_free(&channel->ch_channel_private); + kmem_free(channel, sizeof (*channel)); + } + + dcopy_list_fini(&device->dc_devchan_list); +registerfail_devchan: + kmem_free(device, sizeof (*device)); + + return (DCOPY_FAILURE); +} + + +/* + * dcopy_device_unregister() + */ +/*ARGSUSED*/ +int +dcopy_device_unregister(dcopy_device_handle_t *handle) +{ + struct dcopy_channel_s *channel; + dcopy_device_handle_t device; + boolean_t device_busy; + + + device = *handle; + device_busy = B_FALSE; + + /* + * remove the device's dma channels from the global channel list (where + * dcopy_alloc's come from) + */ + mutex_enter(&dcopy_statep->d_globalchan_list.dl_mutex); + mutex_enter(&device->dc_devchan_list.dl_mutex); + channel = list_head(&device->dc_devchan_list.dl_list); + while (channel != NULL) { + /* + * if the channel has outstanding allocs, mark it as having + * to be removed and increment the number of channels which + * need to be removed in the device state too.
+ */
+ if (channel->ch_ref_cnt != 0) {
+ channel->ch_removing = B_TRUE;
+ device_busy = B_TRUE;
+ device->dc_removing_cnt++;
+ }
+ dcopy_statep->d_globalchan_list.dl_cnt--;
+ list_remove(&dcopy_statep->d_globalchan_list.dl_list, channel);
+ channel = list_next(&device->dc_devchan_list.dl_list, channel);
+ }
+ mutex_exit(&device->dc_devchan_list.dl_mutex);
+ mutex_exit(&dcopy_statep->d_globalchan_list.dl_mutex);
+
+ /*
+ * if there are channels which still need to be removed, we will clean
+ * up the device state after they are freed up.
+ */
+ if (device_busy) {
+ return (DCOPY_PENDING);
+ }
+
+ dcopy_device_cleanup(device, B_FALSE);
+
+ *handle = NULL;
+ return (DCOPY_SUCCESS);
+}
+
+
+/*
+ * dcopy_device_cleanup()
+ */
+static void
+dcopy_device_cleanup(dcopy_device_handle_t device, boolean_t do_callback)
+{
+ struct dcopy_channel_s *channel;
+
+ /*
+ * remove all the channels in the device list, free them, and clean up
+ * the state.
+ */
+ mutex_enter(&dcopy_statep->d_device_list.dl_mutex);
+ channel = list_head(&device->dc_devchan_list.dl_list);
+ while (channel != NULL) {
+ device->dc_devchan_list.dl_cnt--;
+ list_remove(&device->dc_devchan_list.dl_list, channel);
+ dcopy_list_fini(&channel->ch_poll_list);
+ dcopy_stats_fini(channel);
+ channel->ch_cb->cb_channel_free(&channel->ch_channel_private);
+ kmem_free(channel, sizeof (*channel));
+ channel = list_head(&device->dc_devchan_list.dl_list);
+ }
+
+ /* remove it from the list of devices */
+ list_remove(&dcopy_statep->d_device_list.dl_list, device);
+
+ mutex_exit(&dcopy_statep->d_device_list.dl_mutex);
+
+ /*
+ * notify the DMA device driver that the device is free to be
+ * detached.
+ */
+ if (do_callback) {
+ device->dc_cb->cb_unregister_complete(
+ device->dc_device_private, DCOPY_SUCCESS);
+ }
+
+ dcopy_list_fini(&device->dc_devchan_list);
+ kmem_free(device, sizeof (*device));
+}
+
+
+/*
+ * dcopy_device_channel_notify()
+ */
+/*ARGSUSED*/
+void
+dcopy_device_channel_notify(dcopy_handle_t handle, int status)
+{
+ struct dcopy_channel_s *channel;
+ dcopy_list_t *poll_list;
+ dcopy_cmd_priv_t priv;
+ int e;
+
+
+ ASSERT(status == DCOPY_COMPLETION);
+ channel = handle;
+
+ poll_list = &channel->ch_poll_list;
+
+ /*
+ * when we get a completion notification from the device, go through
+ * all of the commands blocking on this channel and see if they have
+ * completed. Remove the command and wake up the blocked thread if they
+ * have. Once we hit a command which is still pending, we are done
+ * polling since commands in a channel complete in order.
+ */ + mutex_enter(&poll_list->dl_mutex); + if (poll_list->dl_cnt != 0) { + priv = list_head(&poll_list->dl_list); + while (priv != NULL) { + atomic_inc_64(&channel-> + ch_stat.cs_notify_poll.value.ui64); + e = channel->ch_cb->cb_cmd_poll( + channel->ch_channel_private, + priv->pr_cmd); + if (e == DCOPY_PENDING) { + atomic_inc_64(&channel-> + ch_stat.cs_notify_pending.value.ui64); + break; + } + + poll_list->dl_cnt--; + list_remove(&poll_list->dl_list, priv); + + mutex_enter(&priv->pr_mutex); + priv->pr_wait = B_FALSE; + cv_signal(&priv->pr_cv); + mutex_exit(&priv->pr_mutex); + + priv = list_head(&poll_list->dl_list); + } + } + + mutex_exit(&poll_list->dl_mutex); +} + + +/* + * dcopy_stats_init() + */ +static int +dcopy_stats_init(dcopy_handle_t channel) +{ +#define CHANSTRSIZE 20 + char chanstr[CHANSTRSIZE]; + dcopy_stats_t *stats; + int instance; + char *name; + + + stats = &channel->ch_stat; + name = (char *)ddi_driver_name(channel->ch_device->dc_info.di_dip); + instance = ddi_get_instance(channel->ch_device->dc_info.di_dip); + + (void) snprintf(chanstr, CHANSTRSIZE, "channel%d", + (uint32_t)channel->ch_info.qc_chan_num); + + channel->ch_kstat = kstat_create(name, instance, chanstr, "misc", + KSTAT_TYPE_NAMED, sizeof (dcopy_stats_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (channel->ch_kstat == NULL) { + return (DCOPY_FAILURE); + } + channel->ch_kstat->ks_data = stats; + + kstat_named_init(&stats->cs_bytes_xfer, "bytes_xfer", + KSTAT_DATA_UINT64); + kstat_named_init(&stats->cs_cmd_alloc, "cmd_alloc", + KSTAT_DATA_UINT64); + kstat_named_init(&stats->cs_cmd_post, "cmd_post", + KSTAT_DATA_UINT64); + kstat_named_init(&stats->cs_cmd_poll, "cmd_poll", + KSTAT_DATA_UINT64); + kstat_named_init(&stats->cs_notify_poll, "notify_poll", + KSTAT_DATA_UINT64); + kstat_named_init(&stats->cs_notify_pending, "notify_pending", + KSTAT_DATA_UINT64); + kstat_named_init(&stats->cs_id, "id", + KSTAT_DATA_UINT64); + kstat_named_init(&stats->cs_capabilities, "capabilities", + KSTAT_DATA_UINT64); + + kstat_install(channel->ch_kstat); + + channel->ch_stat.cs_id.value.ui64 = channel->ch_info.qc_id; + channel->ch_stat.cs_capabilities.value.ui64 = + channel->ch_info.qc_capabilities; + + return (DCOPY_SUCCESS); +} + + +/* + * dcopy_stats_fini() + */ +static void +dcopy_stats_fini(dcopy_handle_t channel) +{ + kstat_delete(channel->ch_kstat); +} +/* *** END OF DEVICE INTERFACE *** */ diff --git a/usr/src/uts/common/io/stream.c b/usr/src/uts/common/io/stream.c index 28a9a4928f..90fbf3cbf1 100644 --- a/usr/src/uts/common/io/stream.c +++ b/usr/src/uts/common/io/stream.c @@ -23,7 +23,7 @@ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -318,8 +318,8 @@ streams_msg_init(void) int offset; mblk_cache = kmem_cache_create("streams_mblk", - sizeof (mblk_t), 32, NULL, NULL, NULL, NULL, NULL, - mblk_kmem_flags); + sizeof (mblk_t), 32, NULL, NULL, NULL, NULL, NULL, + mblk_kmem_flags); for (sizep = dblk_sizes; (size = *sizep) != 0; sizep++) { @@ -330,7 +330,7 @@ streams_msg_init(void) */ tot_size = size + sizeof (dblk_t); ASSERT((offset + sizeof (dblk_t) + sizeof (kmem_slab_t)) - < PAGESIZE); + < PAGESIZE); ASSERT((tot_size & (DBLK_CACHE_ALIGN - 1)) == 0); } else { @@ -346,9 +346,9 @@ streams_msg_init(void) (void) sprintf(name, "streams_dblk_%ld", size); cp = kmem_cache_create(name, tot_size, - DBLK_CACHE_ALIGN, dblk_constructor, - dblk_destructor, NULL, - (void *)(size), NULL, dblk_kmem_flags); + DBLK_CACHE_ALIGN, dblk_constructor, + dblk_destructor, NULL, + (void *)(size), NULL, dblk_kmem_flags); while (lastsize <= size) { dblk_cache[(lastsize - 1) >> DBLK_SIZE_SHIFT] = cp; @@ -357,13 +357,13 @@ streams_msg_init(void) } dblk_esb_cache = kmem_cache_create("streams_dblk_esb", - sizeof (dblk_t), DBLK_CACHE_ALIGN, - dblk_esb_constructor, dblk_destructor, NULL, - (void *) sizeof (dblk_t), NULL, dblk_kmem_flags); + sizeof (dblk_t), DBLK_CACHE_ALIGN, + dblk_esb_constructor, dblk_destructor, NULL, + (void *) sizeof (dblk_t), NULL, dblk_kmem_flags); fthdr_cache = kmem_cache_create("streams_fthdr", - sizeof (fthdr_t), 32, NULL, NULL, NULL, NULL, NULL, 0); + sizeof (fthdr_t), 32, NULL, NULL, NULL, NULL, NULL, 0); ftblk_cache = kmem_cache_create("streams_ftblk", - sizeof (ftblk_t), 32, NULL, NULL, NULL, NULL, NULL, 0); + sizeof (ftblk_t), 32, NULL, NULL, NULL, NULL, NULL, 0); /* Initialize Multidata caches */ mmd_init(); @@ -545,8 +545,8 @@ dblk_lastfree(mblk_t *mp, dblk_t *dbp) dbp->db_struioflag = 0; dbp->db_struioun.cksum.flags = 0; - /* and the COOKED flag */ - dbp->db_flags &= ~DBLK_COOKED; + /* and the COOKED and/or UIOA flag(s) */ + dbp->db_flags &= ~(DBLK_COOKED | DBLK_UIOA); kmem_cache_free(dbp->db_cache, dbp); } @@ -739,7 +739,7 @@ desballoc(unsigned char *base, size_t size, uint_t pri, frtn_t *frp) */ if (!str_ftnever) { mp = gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0), - frp, dblk_lastfree_desb, KM_NOSLEEP); + frp, dblk_lastfree_desb, KM_NOSLEEP); if (mp != NULL) STR_FTALLOC(&DB_FTHDR(mp), FTEV_DESBALLOC, size); @@ -857,7 +857,7 @@ bcache_create(char *name, size_t size, uint_t align) (void) sprintf(buffer, "%s_dblk_cache", name); bcp->dblk_cache = kmem_cache_create(buffer, sizeof (dblk_t), DBLK_CACHE_ALIGN, bcache_dblk_constructor, bcache_dblk_destructor, - NULL, (void *)bcp, NULL, 0); + NULL, (void *)bcp, NULL, 0); return (bcp); } @@ -1584,7 +1584,7 @@ adjmsg(mblk_t *mp, ssize_t len) */ if ((save_bp != mp) && - (save_bp->b_wptr == save_bp->b_rptr)) { + (save_bp->b_wptr == save_bp->b_rptr)) { bcont = save_bp->b_cont; freeb(save_bp); prev_bp->b_cont = bcont; @@ -2129,8 +2129,8 @@ flushband(queue_t *q, unsigned char pri, int flag) nmp = mp->b_next; mp->b_next = mp->b_prev = NULL; if ((mp->b_band == 0) && - ((flag == FLUSHALL) || - datamsg(mp->b_datap->db_type))) + ((flag == FLUSHALL) || + datamsg(mp->b_datap->db_type))) freemsg(mp); else (void) putq(q, mp); @@ -2242,7 +2242,7 @@ bcanput(queue_t *q, unsigned char pri) q->q_flag |= QWANTW; mutex_exit(QLOCK(q)); TRACE_3(TR_FAC_STREAMS_FR, TR_BCANPUT_OUT, - "bcanput:%p %X %d", q, pri, 0); + "bcanput:%p %X %d", q, pri, 0); return (0); } } else { /* pri != 0 */ @@ -2252,7 +2252,7 @@ bcanput(queue_t *q, unsigned char pri) */ mutex_exit(QLOCK(q)); TRACE_3(TR_FAC_STREAMS_FR, 
TR_BCANPUT_OUT,
- "bcanput:%p %X %d", q, pri, 1);
+ "bcanput:%p %X %d", q, pri, 1);
 return (1);
 }
 qbp = q->q_bandp;
@@ -2262,13 +2262,13 @@ bcanput(queue_t *q, unsigned char pri)
 qbp->qb_flag |= QB_WANTW;
 mutex_exit(QLOCK(q));
 TRACE_3(TR_FAC_STREAMS_FR, TR_BCANPUT_OUT,
- "bcanput:%p %X %d", q, pri, 0);
+ "bcanput:%p %X %d", q, pri, 0);
 return (0);
 }
 }
 mutex_exit(QLOCK(q));
 TRACE_3(TR_FAC_STREAMS_FR, TR_BCANPUT_OUT,
- "bcanput:%p %X %d", q, pri, 1);
+ "bcanput:%p %X %d", q, pri, 1);
 return (1);
 }
@@ -2847,7 +2847,7 @@ putnextctl1(queue_t *q, int type, int param)
 mblk_t *bp;
 if ((datamsg(type) && (type != M_DELAY)) ||
- ((bp = allocb_tryhard(1)) == NULL))
+ ((bp = allocb_tryhard(1)) == NULL))
 return (0);
 bp->b_datap->db_type = (unsigned char)type;
@@ -2864,7 +2864,7 @@ putnextctl(queue_t *q, int type)
 mblk_t *bp;
 if ((datamsg(type) && (type != M_DELAY)) ||
- ((bp = allocb_tryhard(0)) == NULL))
+ ((bp = allocb_tryhard(0)) == NULL))
 return (0);
 bp->b_datap->db_type = (unsigned char)type;
diff --git a/usr/src/uts/common/os/move.c b/usr/src/uts/common/os/move.c
index d5c63b167e..f4056aa02c 100644
--- a/usr/src/uts/common/os/move.c
+++ b/usr/src/uts/common/os/move.c
@@ -2,9 +2,8 @@
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
 * CDDL HEADER END
 */
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
@@ -45,6 +44,16 @@
 #include <sys/systm.h>
 #include <sys/uio.h>
 #include <sys/errno.h>
+#include <sys/vmsystm.h>
+#include <sys/cmn_err.h>
+#include <vm/as.h>
+#include <vm/page.h>
+
+#include <sys/dcopy.h>
+
+int64_t uioa_maxpoll = -1; /* <0 = noblock, 0 = block, >0 = block after */
+#define UIO_DCOPY_CHANNEL 0
+#define UIO_DCOPY_CMD 1
 /*
 * Move "n" bytes at byte address "p"; "rw" indicates the direction
@@ -277,3 +286,370 @@ uiodup(uio_t *suio, uio_t *duio, iovec_t *diov, int diov_cnt)
 duio->uio_iov = diov;
 return (0);
 }
+
+/*
+ * Shadow state for checking if a platform has hardware asynchronous
+ * copy capability and minimum copy size, e.g. Intel's I/OAT dma engine,
+ * /dev/dcopy.
+ */
+uioasync_t uioasync = {B_TRUE, 1024};
+
+/*
+ * Schedule an asynchronous move of "n" bytes at byte address "p";
+ * "rw" indicates the direction of the move. I/O parameters and
+ * async state are provided in "uioa", which is updated to reflect
+ * the data which is to be moved.
+ *
+ * Returns 0 on success or a non-zero errno on failure.
+ *
+ * Note, while the uioasync APIs are general purpose in design,
+ * the current implementation is Intel I/OAT specific.
+ */
+int
+uioamove(void *p, size_t n, enum uio_rw rw, uioa_t *uioa)
+{
+ int soff, doff;
+ uint64_t pa;
+ int cnt;
+ iovec_t *iov;
+ dcopy_handle_t channel;
+ dcopy_cmd_t cmd;
+ int ret = 0;
+ int dcopy_flags;
+
+ if (!(uioa->uioa_state & UIOA_ENABLED)) {
+ /* The uioa_t isn't enabled */
+ return (ENXIO);
+ }
+
+ if (uioa->uio_segflg != UIO_USERSPACE || rw != UIO_READ) {
+ /* Only moves from kernel to user-land are supported */
+ return (ENOTSUP);
+ }
+
+
+ channel = uioa->uioa_hwst[UIO_DCOPY_CHANNEL];
+ cmd = uioa->uioa_hwst[UIO_DCOPY_CMD];
+ dcopy_flags = DCOPY_NOSLEEP;
+
+ /*
+ * While there are source bytes and destination space.
+ */
+ while (n > 0 && uioa->uio_resid > 0) {
+ iov = uioa->uio_iov;
+ if (iov->iov_len == 0) {
+ uioa->uio_iov++;
+ uioa->uio_iovcnt--;
+ uioa->uioa_lcur++;
+ uioa->uioa_lppp = uioa->uioa_lcur->uioa_ppp;
+ continue;
+ }
+ /*
+ * While there are source bytes, schedule an async
+ * DMA for the destination, page by page.
+ */
+ while (n > 0) {
+ /* Addr offset in page src/dst */
+ soff = (uintptr_t)p & PAGEOFFSET;
+ doff = (uintptr_t)iov->iov_base & PAGEOFFSET;
+ /* Min copy count of src, dst, and page-bounded spans */
+ cnt = MIN(n, iov->iov_len);
+ cnt = MIN(cnt, PAGESIZE - soff);
+ cnt = MIN(cnt, PAGESIZE - doff);
+ /* XXX if next page(s) contiguous could use multipage */
+
+ /*
+ * if we have an old command, link all prior
+ * commands to the next command we alloc so we
+ * only need to track the last command but can
+ * still free them all.
+ */
+ if (cmd != NULL) {
+ dcopy_flags |= DCOPY_ALLOC_LINK;
+ }
+ ret = dcopy_cmd_alloc(channel, dcopy_flags, &cmd);
+ if (ret != DCOPY_SUCCESS) {
+ /* Error of some sort */
+ return (EIO);
+ }
+ uioa->uioa_hwst[UIO_DCOPY_CMD] = cmd;
+
+ ASSERT(cmd->dp_version == DCOPY_CMD_V0);
+ if (uioa_maxpoll >= 0) {
+ /* Blocking (possibly after maxpoll) in uioafini() */
+ cmd->dp_flags = DCOPY_CMD_INTR;
+ } else {
+ /* Non-blocking uioafini(), so no intr */
+ cmd->dp_flags = DCOPY_CMD_NOFLAGS;
+ }
+ cmd->dp_cmd = DCOPY_CMD_COPY;
+ pa = ptob((uint64_t)hat_getpfnum(kas.a_hat, p));
+ cmd->dp.copy.cc_source = pa + soff;
+ if (uioa->uioa_lcur->uioa_pfncnt == 0) {
+ /* Have a (page_t **) */
+ pa = ptob((uint64_t)(
+ *(page_t **)uioa->uioa_lppp)->p_pagenum);
+ } else {
+ /* Have a (pfn_t *) */
+ pa = ptob((uint64_t)(
+ *(pfn_t *)uioa->uioa_lppp));
+ }
+ cmd->dp.copy.cc_dest = pa + doff;
+ cmd->dp.copy.cc_size = cnt;
+ ret = dcopy_cmd_post(cmd);
+ if (ret != DCOPY_SUCCESS) {
+ /* Error of some sort */
+ return (EIO);
+ }
+ ret = 0;
+
+ /* If UIOA_POLL not set, set it */
+ if (!(uioa->uioa_state & UIOA_POLL))
+ uioa->uioa_state |= UIOA_POLL;
+
+ /* Update iov, uio, and local pointers/counters */
+ iov->iov_base += cnt;
+ iov->iov_len -= cnt;
+ uioa->uio_resid -= cnt;
+ uioa->uio_loffset += cnt;
+ p = (caddr_t)p + cnt;
+ n -= cnt;
+
+ /* End of iovec? */
+ if (iov->iov_len == 0) {
+ /* Yup, next iovec */
+ break;
+ }
+
+ /* Next dst addr page? */
+ if (doff + cnt == PAGESIZE) {
+ /* Yup, next page_t */
+ uioa->uioa_lppp++;
+ }
+ }
+ }
+
+ return (ret);
+}
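+
+/*
+ * Illustrative sketch only (a hypothetical caller, not taken from this
+ * file): a consumer holding an enabled uioa_t would schedule each kernel
+ * buffer fragment and treat a zero return as "scheduled", not "copied":
+ *
+ *	if ((error = uioamove(kaddr, len, UIO_READ, uioap)) != 0)
+ *		(fall back to a synchronous uiomove() path)
+ *
+ * Completion isn't guaranteed until uioafini() polls the hardware, so
+ * the data may not yet be visible in user-land when this returns.
+ */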
+
+/*
+ * Initialize a uioa_t for a given uio_t for the current user context,
+ * copy the common uio_t to the uioa_t, walk the shared iovec_t and
+ * lock down the user-land page(s) containing iovec_t data, then map in
+ * the user-land pages using segkpm.
+ */
+int
+uioainit(uio_t *uiop, uioa_t *uioap)
+{
+ caddr_t addr;
+ page_t **pages;
+ int off;
+ int len;
+ proc_t *procp = ttoproc(curthread);
+ struct as *as = procp->p_as;
+ iovec_t *iov = uiop->uio_iov;
+ int32_t iovcnt = uiop->uio_iovcnt;
+ uioa_page_t *locked = uioap->uioa_locked;
+ dcopy_handle_t channel;
+ int error;
+
+ if (!(uioap->uioa_state & UIOA_ALLOC)) {
+ /* Can only init() a freshly allocated uioa_t */
+ return (EINVAL);
+ }
+
+ error = dcopy_alloc(DCOPY_NOSLEEP, &channel);
+ if (error == DCOPY_NORESOURCES) {
+ /* Turn off uioa */
+ uioasync.enabled = B_FALSE;
+ return (ENODEV);
+ }
+ if (error != DCOPY_SUCCESS) {
+ /* Alloc failed */
+ return (EIO);
+ }
+
+ uioap->uioa_hwst[UIO_DCOPY_CHANNEL] = channel;
+ uioap->uioa_hwst[UIO_DCOPY_CMD] = NULL;
+
+ /* Indicate uioa_t (will be) initialized */
+ uioap->uioa_state = UIOA_INIT;
+
+ /* uio_t/uioa_t common struct copy */
+ *((uio_t *)uioap) = *uiop;
+
+ /* initialize the uioa_t's iovec state */
+ if (iovcnt > UIOA_IOV_MAX) {
+ /* Too big? */
+ return (E2BIG);
+ }
+ uioap->uio_iov = iov;
+ uioap->uio_iovcnt = iovcnt;
+
+ /* Mark the uioap as such */
+ uioap->uio_extflg |= UIO_ASYNC;
+
+ /*
+ * For each iovec_t, lock-down the page(s) backing the iovec_t
+ * and save the page_t list for phys addr use in uioamove().
+ */
+ iov = uiop->uio_iov;
+ iovcnt = uiop->uio_iovcnt;
+ while (iovcnt > 0) {
+ addr = iov->iov_base;
+ off = (uintptr_t)addr & PAGEOFFSET;
+ addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
+ len = iov->iov_len + off;
+
+ /* Lock down page(s) for the iov span */
+ if ((error = as_pagelock(as, &pages,
+ iov->iov_base, iov->iov_len, S_WRITE)) != 0) {
+ /* Error */
+ goto cleanup;
+ }
+
+ if (pages == NULL) {
+ /*
+ * Need a page_t list, really only need
+ * a pfn list so build one.
+ */
+ pfn_t *pfnp;
+ int pcnt = len >> PAGESHIFT;
+
+ if (off)
+ pcnt++;
+ if ((pfnp = kmem_alloc(pcnt * sizeof (pfn_t),
+ KM_NOSLEEP)) == NULL) {
+ error = ENOMEM;
+ goto cleanup;
+ }
+ locked->uioa_ppp = (void **)pfnp;
+ locked->uioa_pfncnt = pcnt;
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+ while (pcnt-- > 0) {
+ *pfnp++ = hat_getpfnum(as->a_hat, addr);
+ addr += PAGESIZE;
+ }
+ AS_LOCK_EXIT(as, &as->a_lock);
+ } else {
+ /* Have a page_t list, save it */
+ locked->uioa_ppp = (void **)pages;
+ locked->uioa_pfncnt = 0;
+ }
+ /* Save for as_pageunlock() in uioafini() */
+ locked->uioa_base = iov->iov_base;
+ locked->uioa_len = iov->iov_len;
+ locked++;
+
+ /* Next iovec_t */
+ iov++;
+ iovcnt--;
+ }
+ /* Initialize current pointer into uioa_locked[] and its uioa_ppp */
+ uioap->uioa_lcur = uioap->uioa_locked;
+ uioap->uioa_lppp = uioap->uioa_lcur->uioa_ppp;
+ return (0);
+
+cleanup:
+ /* Unlock any previously locked page_t(s) */
+ while (locked > uioap->uioa_locked) {
+ locked--;
+ as_pageunlock(as, (page_t **)locked->uioa_ppp,
+ locked->uioa_base, locked->uioa_len, S_WRITE);
+ }
+
+ /* Last, indicate the uioa_t is still in the alloc state */
+ uioap->uioa_state = UIOA_ALLOC;
+
+ return (error);
+}
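+
+/*
+ * Illustrative lifecycle sketch only (hypothetical caller): a uioa_t is
+ * carried from UIOA_ALLOC through init, enable, move, and fini. The
+ * UIOA_ENABLED transition belongs to the consumer, as sockfs does in
+ * strget():
+ *
+ *	uioap->uioa_state = UIOA_ALLOC;		(allocator)
+ *	error = uioainit(uiop, uioap);		(UIOA_ALLOC -> UIOA_INIT)
+ *	uioap->uioa_state &= UIOA_CLR;
+ *	uioap->uioa_state |= UIOA_ENABLED;	(consumer enables uioa)
+ *	error = uioamove(p, n, UIO_READ, uioap);
+ *	error = uioafini(uiop, uioap);		(poll, unlock, -> UIOA_ALLOC)
+ */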
+/*
+ * Finish processing of a uioa_t by cleaning up any pending "uioap" actions.
+ */
+int
+uioafini(uio_t *uiop, uioa_t *uioap)
+{
+ int32_t iovcnt = uiop->uio_iovcnt;
+ uioa_page_t *locked = uioap->uioa_locked;
+ struct as *as = ttoproc(curthread)->p_as;
+ dcopy_handle_t channel;
+ dcopy_cmd_t cmd;
+ int ret = 0;
+
+ ASSERT(uioap->uio_extflg & UIO_ASYNC);
+
+ if (!(uioap->uioa_state & (UIOA_ENABLED|UIOA_FINI))) {
+ /* Must be an active uioa_t */
+ return (EINVAL);
+ }
+
+ channel = uioap->uioa_hwst[UIO_DCOPY_CHANNEL];
+ cmd = uioap->uioa_hwst[UIO_DCOPY_CMD];
+
+ /* XXX - why do we get cmd == NULL sometimes? */
+ if (cmd != NULL) {
+ if (uioap->uioa_state & UIOA_POLL) {
+ /* Wait for last dcopy() to finish */
+ int64_t poll = 1;
+ int poll_flag = DCOPY_POLL_NOFLAGS;
+
+ do {
+ if (uioa_maxpoll == 0 ||
+ (uioa_maxpoll > 0 &&
+ poll >= uioa_maxpoll)) {
+ /* Always block, or block after maxpoll */
+ poll_flag = DCOPY_POLL_BLOCK;
+ } else {
+ /* No block, poll */
+ poll++;
+ }
+ ret = dcopy_cmd_poll(cmd, poll_flag);
+ } while (ret == DCOPY_PENDING);
+
+ if (ret == DCOPY_COMPLETED) {
+ /* Poll/block succeeded */
+ ret = 0;
+ } else {
+ /* Poll/block failed */
+ ret = EIO;
+ }
+ }
+ dcopy_cmd_free(&cmd);
+ }
+
+ dcopy_free(&channel);
+
+ /* Unlock all page(s), iovec_t by iovec_t */
+ while (iovcnt-- > 0) {
+ page_t **pages;
+
+ if (locked->uioa_pfncnt == 0) {
+ /* An as_pagelock() returned (page_t **) */
+ pages = (page_t **)locked->uioa_ppp;
+ } else {
+ /* Our pfn_t array */
+ pages = NULL;
+ kmem_free(locked->uioa_ppp,
+ locked->uioa_pfncnt * sizeof (pfn_t));
+ }
+ as_pageunlock(as, pages, locked->uioa_base, locked->uioa_len,
+ S_WRITE);
+
+ locked++;
+ }
+ /* uioa_t->uio_t common struct copy */
+ *uiop = *((uio_t *)uioap);
+
+ /*
+ * Last, reset uioa state to alloc.
+ *
+ * Note, we only initialize the state here, all other members
+ * will be initialized in a subsequent uioainit().
+ */
+ uioap->uioa_state = UIOA_ALLOC;
+
+ uioap->uioa_hwst[UIO_DCOPY_CMD] = NULL;
+ uioap->uioa_hwst[UIO_DCOPY_CHANNEL] = NULL;
+
+ return (ret);
+}
diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c
index d80fa67f56..53e2d81465 100644
--- a/usr/src/uts/common/os/streamio.c
+++ b/usr/src/uts/common/os/streamio.c
@@ -143,6 +143,7 @@ static uint32_t ioc_id;
 static void putback(struct stdata *, queue_t *, mblk_t *, int);
 static void strcleanall(struct vnode *);
 static int strwsrv(queue_t *);
+static void struioainit(queue_t *, sodirect_t *, uio_t *);
 /*
 * qinit and module_info structures for stream head read and write queues
@@ -188,6 +189,11 @@ static boolean_t msghasdata(mblk_t *bp);
 * mirror this.
 * 4. ioctl monitor: sd_lock is gotten to ensure that only one
 * thread is doing an ioctl at a time.
+ *
+ * Note, for the sodirect case 3. is extended to the
+ * (*sodirect_t.sod_enqueue)() call-back from below. Further, sodirect
+ * support is only for code paths called via kstrgetmsg(); all other code
+ * paths ASSERT() that uioa generated mblk_t's (DBLK_UIOA) aren't processed.
 */
 static int
@@ -395,6 +401,7 @@ ckreturn:
 stp->sd_qn_minpsz = 0;
 stp->sd_qn_maxpsz = INFPSZ - 1; /* used to check for initialization */
 stp->sd_maxblk = INFPSZ;
+ stp->sd_sodirect = NULL;
 qp->q_ptr = _WR(qp)->q_ptr = stp;
 STREAM(qp) = STREAM(_WR(qp)) = stp;
 vp->v_stream = stp;
@@ -966,11 +973,14 @@ strcleanall(struct vnode *vp)
 * It is the callers responsibility to call qbackenable after
 * it is finished with the message. The caller should not call
 * qbackenable until after any putback calls to avoid spurious backenabling.
+ *
+ * Also, handle uioa initialization and process any DBLK_UIOA flagged messages.
 */
mblk_t *
strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,
 int *errorp)
{
+ sodirect_t *sodp = stp->sd_sodirect;
 mblk_t *bp;
 int error;
@@ -1059,7 +1069,67 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,
 }
 *errorp = 0;
 ASSERT(MUTEX_HELD(&stp->sd_lock));
- return (getq_noenab(q));
+ if (sodp != NULL && (sodp->sod_state & SOD_ENABLED) &&
+ (sodp->sod_uioa.uioa_state & UIOA_INIT)) {
+ /*
+ * First kstrgetmsg() call for an uioa_t, so if there are
+ * any queued mblk_t's they need to be consumed before uioa
+ * from below can occur.
+ */
+ sodp->sod_uioa.uioa_state &= UIOA_CLR;
+ sodp->sod_uioa.uioa_state |= UIOA_ENABLED;
+ if (q->q_first != NULL) {
+ struioainit(q, sodp, uiop);
+ }
+ }
+
+ bp = getq_noenab(q);
+
+ if (bp != NULL && (bp->b_datap->db_flags & DBLK_UIOA)) {
+ /*
+ * A uioa flagged mblk_t chain, already uio processed,
+ * add it to the sodirect uioa pending free list.
+ *
+ * Note, a b_cont chain headed by a DBLK_UIOA enabled
+ * mblk_t must have all mblk_t(s) DBLK_UIOA enabled.
+ */
+ mblk_t *bpt;
+
+ ASSERT(sodp != NULL);
+ bpt = sodp->sod_uioaft;
+
+ /*
+ * Add the first mblk_t of the "bp" chain to the current sodirect
+ * uioa free list tail mblk_t, if any; else the list is empty so
+ * it becomes the new head.
+ */
+ if (bpt == NULL)
+ sodp->sod_uioafh = bp;
+ else
+ bpt->b_cont = bp;
+
+ /*
+ * Walk mblk_t "bp" chain to find tail and adjust rptr of
+ * each to reflect that uioamove() has consumed all data.
+ */
+ bpt = bp;
+ for (;;) {
+ bpt->b_rptr = bpt->b_wptr;
+ if (bpt->b_cont == NULL)
+ break;
+ bpt = bpt->b_cont;
+
+ ASSERT(bpt->b_datap->db_flags & DBLK_UIOA);
+ }
+ /* New sodirect uioa free list tail */
+ sodp->sod_uioaft = bpt;
+
+ /* Only 1 strget() with data returned per uioa_t */
+ if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) {
+ sodp->sod_uioa.uioa_state &= UIOA_CLR;
+ sodp->sod_uioa.uioa_state |= UIOA_FINI;
+ }
+ }
+
+ return (bp);
 }
 /*
@@ -1079,6 +1149,8 @@ struiocopyout(mblk_t *bp, struct uio *uiop, int *errorp)
 ASSERT(bp->b_wptr >= bp->b_rptr);
 do {
+ ASSERT(!(bp->b_datap->db_flags & DBLK_UIOA));
+
 if ((n = MIN(uiop->uio_resid, MBLKL(bp))) != 0) {
 ASSERT(n > 0);
@@ -1225,8 +1297,10 @@ strread(struct vnode *vp, struct uio *uiop, cred_t *crp)
 }
 first = 0;
 }
+
 ASSERT(MUTEX_HELD(&stp->sd_lock));
 ASSERT(bp);
+ ASSERT(!(bp->b_datap->db_flags & DBLK_UIOA));
 pri = bp->b_band;
 /*
 * Extract any mark information. If the message is not
@@ -6460,6 +6534,7 @@ strgetmsg(
 bp = strget(stp, q, uiop, first, &error);
 ASSERT(MUTEX_HELD(&stp->sd_lock));
 if (bp != NULL) {
+ ASSERT(!(bp->b_datap->db_flags & DBLK_UIOA));
 if (bp->b_datap->db_type == M_SIG) {
 strsignal_nolock(stp, *bp->b_rptr,
 (int32_t)bp->b_band);
@@ -7098,7 +7173,7 @@ retry:
 "kstrgetmsg calls strwaitq:%p, %p",
 vp, uiop);
 if (((error = strwaitq(stp, waitflag, (ssize_t)0,
- fmode, timout, &done)) != 0) || done) {
+ fmode, timout, &done))) != 0 || done) {
 TRACE_2(TR_FAC_STREAMS_FR, TR_KSTRGETMSG_DONE,
 "kstrgetmsg error or done:%p, %p",
 vp, uiop);
@@ -7132,6 +7207,7 @@ retry:
 * If the caller doesn't want the mark return.
 * Used to implement MSG_WAITALL in sockets.
 */
+ ASSERT(!(bp->b_datap->db_flags & DBLK_UIOA));
 if (flags & MSG_NOMARK) {
 putback(stp, q, bp, pri);
 qbackenable(q, pri);
@@ -7170,6 +7246,8 @@ retry:
 * there is indeed a shortage of memory. dupmsg() may fail
 * if db_ref in any of the messages reaches its limit.
 */
+
+ ASSERT(!(bp->b_datap->db_flags & DBLK_UIOA));
 if ((nbp = dupmsg(bp)) == NULL &&
 (nbp = copymsg(bp)) == NULL) {
 /*
 * Restore the state of the stream head since we
@@ -7228,6 +7306,7 @@ retry:
 }
 }
+ ASSERT(!(bp->b_datap->db_flags & DBLK_UIOA));
 bp = (stp->sd_rputdatafunc)(stp->sd_vnode, bp,
 NULL, NULL, NULL, NULL);
@@ -7278,6 +7357,8 @@ retry:
 */
 if (uiop == NULL) {
 /* Append data to tail of mctlp */
+
+ ASSERT(!(bp->b_datap->db_flags & DBLK_UIOA));
 if (mctlp != NULL) {
 mblk_t **mpp = mctlp;
@@ -7286,6 +7367,14 @@ retry:
 *mpp = bp;
 bp = NULL;
 }
+ } else if (bp->b_datap->db_flags & DBLK_UIOA) {
+ /*
+ * A uioa mblk_t chain; as uio processing has already
+ * been done we simply skip over it.
+ */
+ bp = NULL;
+ pr = 0;
+
 } else if (uiop->uio_resid >= 0 && bp) {
 size_t oldresid = uiop->uio_resid;
@@ -7374,6 +7463,8 @@ retry:
 * again since the flush logic in strrput_nondata()
 * may have cleared it while we had sd_lock dropped.
 */
+
+ ASSERT(!(savemp->b_datap->db_flags & DBLK_UIOA));
 if (type >= QPCTL) {
 ASSERT(type == M_PCPROTO);
 if (queclass(savemp) < QPCTL)
@@ -8445,3 +8536,82 @@ msghasdata(mblk_t *bp)
 }
 return (B_FALSE);
 }
+
+/*
+ * Called on the first strget() of a sodirect/uioa enabled streamhead;
+ * if any mblk_t(s) are enqueued they must first be uioamove()d before
+ * uioa can be enabled for the underlying transport's use.
+ */
+void
+struioainit(queue_t *q, sodirect_t *sodp, uio_t *uiop)
+{
+ uioa_t *uioap = (uioa_t *)uiop;
+ mblk_t *bp = q->q_first;
+ mblk_t *lbp = NULL;
+ mblk_t *nbp, *wbp;
+ int len;
+ int error;
+
+ ASSERT(MUTEX_HELD(sodp->sod_lock));
+ ASSERT(&sodp->sod_uioa == uioap);
+
+ /*
+ * Walk the b_next/b_prev doubly linked list of b_cont chain(s)
+ * and schedule any M_DATA mblk_t's for uio asynchronous move.
+ */
+ do {
+ /* Next mblk_t chain */
+ nbp = bp->b_next;
+ /* Walk the chain */
+ wbp = bp;
+ do {
+ if (wbp->b_datap->db_type == M_DATA &&
+ (len = wbp->b_wptr - wbp->b_rptr) > 0) {
+ /* Have an M_DATA mblk_t with data */
+ if (len > uioap->uio_resid) {
+ /* Not enough uio space */
+ goto nospace;
+ }
+ error = uioamove(wbp->b_rptr, len,
+ UIO_READ, uioap);
+ if (!error) {
+ /* Scheduled, mark dblk_t as such */
+ wbp->b_datap->db_flags |= DBLK_UIOA;
+ } else {
+ /* Error of some sort, no more uioa */
+ uioap->uioa_state &= UIOA_CLR;
+ uioap->uioa_state |= UIOA_FINI;
+ return;
+ }
+ }
+ /* Save last wbp processed */
+ lbp = wbp;
+ } while ((wbp = wbp->b_cont) != NULL);
+ } while ((bp = nbp) != NULL);
+
+ return;
+
+nospace:
+ /* Not enough uio space, no more uioa */
+ uioap->uioa_state &= UIOA_CLR;
+ uioap->uioa_state |= UIOA_FINI;
+
+ /*
+ * If we processed 1 or more mblk_t(s) then we need to split the
+ * current mblk_t chain in 2 so that all the uioamove()ed mblk_t(s)
+ * are in the current chain and the rest are in the following new
+ * chain.
+ */
+ if (lbp != NULL) {
+ /* New end of current chain */
+ lbp->b_cont = NULL;
+
+ /* Insert new chain wbp after bp */
+ if ((wbp->b_next = nbp) != NULL)
+ nbp->b_prev = wbp;
+ else
+ q->q_last = wbp;
+ wbp->b_prev = bp;
+ bp->b_next = wbp;
+ }
+}
diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c
index 650a4cfaf9..a7750e2ec3 100644
--- a/usr/src/uts/common/os/strsubr.c
+++ b/usr/src/uts/common/os/strsubr.c
@@ -2467,13 +2467,18 @@ devflg_to_qflag(struct streamtab *stp, uint32_t devflag, uint32_t *qflagp,
 /*
 * Private flag used by a transport module to indicate
 * to sockfs that it supports direct-access mode without
- * having to go through STREAMS.
+ * having to go through STREAMS or the transport can use + * sodirect_t sharing to bypass STREAMS for receive-side + * M_DATA processing. */ - if (devflag & _D_DIRECT) { + if (devflag & (_D_DIRECT|_D_SODIRECT)) { /* Reject unless the module is fully-MT (no perimeter) */ if ((qflag & QMT_TYPEMASK) != QMTSAFE) goto bad; - qflag |= _QDIRECT; + if (devflag & _D_DIRECT) + qflag |= _QDIRECT; + if (devflag & _D_SODIRECT) + qflag |= _QSODIRECT; } *qflagp = qflag; diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index d0d531088f..f072b5e18f 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -488,6 +488,7 @@ CHKHDRS= \ socket_impl.h \ socketvar.h \ sockio.h \ + sodirect.h \ squeue.h \ squeue_impl.h \ srn.h \ diff --git a/usr/src/uts/common/sys/conf.h b/usr/src/uts/common/sys/conf.h index 3f6300e581..435cffb35f 100644 --- a/usr/src/uts/common/sys/conf.h +++ b/usr/src/uts/common/sys/conf.h @@ -22,7 +22,7 @@ /* All Rights Reserved */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -221,6 +221,9 @@ extern int cdev_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, #define D_OPEN_RETURNS_EINTR 0x100000 /* EINTR expected from open(9E) */ +#define _D_SODIRECT 0x200000 /* Private flag for transport modules used */ + /* to enable _QSODIRECT for a STREAMS Q */ + #endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/dcopy.h b/usr/src/uts/common/sys/dcopy.h new file mode 100644 index 0000000000..e700ed9cf6 --- /dev/null +++ b/usr/src/uts/common/sys/dcopy.h @@ -0,0 +1,235 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DCOPY_H +#define _SYS_DCOPY_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> + +/* + * *** This interface is for private use by the IP stack only *** + */ + +/* Function return status */ +#define DCOPY_FAILURE (-1) +#define DCOPY_SUCCESS (0) +#define DCOPY_NORESOURCES (1) /* _alloc & _cmd_alloc, _cmd_post only */ +#define DCOPY_PENDING (0x10) /* dcopy_poll(), dcopy_unregister() */ +#define DCOPY_COMPLETED (0x20) /* dcopy_poll() only */ + + +/* dq_version */ +#define DCOPY_QUERY_V0 0 + +typedef struct dcopy_query_s { + int dq_version; /* DCOPY_QUERY_V0 */ + uint_t dq_num_channels; /* number of dma channels */ +} dcopy_query_t; + +/* + * dcopy_query() + * query for the number of DMA engines usable in the system. 
+ */
+void dcopy_query(dcopy_query_t *query);
+
+
+typedef struct dcopy_channel_s *dcopy_handle_t;
+
+/* dcopy_alloc() and dcopy_cmd_alloc() common flags */
+#define DCOPY_SLEEP (0)
+#define DCOPY_NOSLEEP (1 << 0)
+
+/*
+ * dcopy_alloc()
+ * Allocate a DMA channel which is used for posting DMA requests. Note: this
+ * does not give the caller exclusive access to the DMA engine. Commands
+ * posted to a channel will complete in order.
+ * flags - (DCOPY_SLEEP, DCOPY_NOSLEEP)
+ * returns => DCOPY_FAILURE, DCOPY_SUCCESS, DCOPY_NORESOURCES
+ */
+int dcopy_alloc(int flags, dcopy_handle_t *handle);
+
+/*
+ * dcopy_free()
+ * Free the DMA channel. The client can no longer use the handle to post or
+ * poll for status on posts which were previously done on this channel.
+ */
+void dcopy_free(dcopy_handle_t *handle);
+
+/* qc_version */
+#define DCOPY_QUERY_CHANNEL_V0 0
+
+/* Per DMA channel info */
+typedef struct dcopy_query_channel_s {
+ int qc_version; /* DCOPY_QUERY_CHANNEL_V0 */
+
+ /* Does DMA channel support DCA */
+ boolean_t qc_dca_supported;
+
+ /* device id and device specific capabilities */
+ uint64_t qc_id;
+ uint64_t qc_capabilities;
+
+ /*
+ * DMA channel size. This may not be the same as the number of posts
+ * that the DMA channel can handle since a post may consume 1 or more
+ * entries.
+ */
+ uint64_t qc_channel_size;
+
+ /* DMA channel number within the device. Not unique across devices */
+ uint64_t qc_chan_num;
+} dcopy_query_channel_t;
+
+/*
+ * dcopy_query_channel()
+ * query a DMA engine's capabilities
+ */
+void dcopy_query_channel(dcopy_handle_t handle, dcopy_query_channel_t *query);
+
+
+/* dp_version */
+#define DCOPY_CMD_V0 0
+
+/* dp_cmd */
+#define DCOPY_CMD_COPY 0x1
+
+/* dp_flags */
+/*
+ * DCOPY_CMD_QUEUE
+ * Hint to queue up the post but don't notify the DMA engine. This can be
+ * used as an optimization when multiple posts are going to be queued up and
+ * you only want to notify the DMA engine after the last post. Note, this does
+ * not mean the DMA engine won't process the request since it could notice
+ * it anyway.
+ * DCOPY_CMD_NOSTAT
+ * Don't generate a status. If this flag is used, you cannot poll for
+ * completion status on this command. This can be a useful performance
+ * optimization if you're posting multiple commands and just want to poll on
+ * the last one.
+ * DCOPY_CMD_DCA
+ * If DCA is supported, direct this and all future command data (until the
+ * next command with DCOPY_CMD_DCA set) to the processor specified in
+ * dp_dca_id. This flag is ignored if DCA is not supported.
+ * DCOPY_CMD_INTR
+ * Generate an interrupt when command completes. This flag is required if
+ * the caller is going to call dcopy_cmd_poll() with DCOPY_POLL_BLOCK set
+ * for this command.
+ */
+#define DCOPY_CMD_NOFLAGS (0)
+#define DCOPY_CMD_QUEUE (1 << 0)
+#define DCOPY_CMD_NOSTAT (1 << 1)
+#define DCOPY_CMD_DCA (1 << 2)
+#define DCOPY_CMD_INTR (1 << 3)
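+
+/*
+ * Illustrative client sketch only (hypothetical caller; dp_version is
+ * set by dcopy_cmd_alloc()): post one blocking copy of "size" bytes
+ * between physical addresses "src_pa" and "dst_pa":
+ *
+ *	dcopy_handle_t chan;
+ *	dcopy_cmd_t cmd;
+ *
+ *	if (dcopy_alloc(DCOPY_SLEEP, &chan) != DCOPY_SUCCESS)
+ *		return (error);
+ *	if (dcopy_cmd_alloc(chan, DCOPY_SLEEP, &cmd) != DCOPY_SUCCESS) {
+ *		dcopy_free(&chan);
+ *		return (error);
+ *	}
+ *	cmd->dp_flags = DCOPY_CMD_INTR;	(required for DCOPY_POLL_BLOCK)
+ *	cmd->dp_cmd = DCOPY_CMD_COPY;
+ *	cmd->dp.copy.cc_source = src_pa;
+ *	cmd->dp.copy.cc_dest = dst_pa;
+ *	cmd->dp.copy.cc_size = size;
+ *	if (dcopy_cmd_post(cmd) == DCOPY_SUCCESS)
+ *		(void) dcopy_cmd_poll(cmd, DCOPY_POLL_BLOCK);
+ *	dcopy_cmd_free(&cmd);
+ *	dcopy_free(&chan);
+ */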
+
+typedef struct dcopy_cmd_copy_s {
+ uint64_t cc_source; /* Source physical address */
+ uint64_t cc_dest; /* Destination physical address */
+ size_t cc_size;
+} dcopy_cmd_copy_t;
+
+typedef union dcopy_cmd_u {
+ dcopy_cmd_copy_t copy;
+} dcopy_cmd_u_t;
+
+typedef struct dcopy_cmd_priv_s *dcopy_cmd_priv_t;
+
+struct dcopy_cmd_s {
+ uint_t dp_version; /* DCOPY_CMD_V0 */
+ uint_t dp_flags;
+ uint64_t dp_cmd;
+ dcopy_cmd_u_t dp;
+ uint32_t dp_dca_id;
+ dcopy_cmd_priv_t dp_private;
+};
+typedef struct dcopy_cmd_s *dcopy_cmd_t;
+
+
+/*
+ * dcopy_cmd_alloc() specific flags
+ * DCOPY_ALLOC_LINK - when set, the caller passes in a previously alloced
+ * command in cmd. dcopy_cmd_alloc() will allocate a new command and
+ * link it to the old command. The caller can use this to build a
+ * chain of commands, keeping only the last cmd alloced. Calling
+ * dcopy_cmd_free() with the last cmd alloced in the chain will free all of
+ * the commands in the chain. dcopy_cmd_post() and dcopy_cmd_poll() have
+ * no knowledge of a chain of commands. It's only used for alloc/free.
+ */
+#define DCOPY_ALLOC_LINK (1 << 16)
+
+/*
+ * dcopy_cmd_alloc()
+ * allocate a command. A command can be re-used after it completes.
+ * flags - (DCOPY_SLEEP || DCOPY_NOSLEEP), DCOPY_ALLOC_LINK
+ * returns => DCOPY_FAILURE, DCOPY_SUCCESS, DCOPY_NORESOURCES
+ */
+int dcopy_cmd_alloc(dcopy_handle_t handle, int flags, dcopy_cmd_t *cmd);
+
+/*
+ * dcopy_cmd_free()
+ * free the command. This routine cannot be called after dcopy_free().
+ */
+void dcopy_cmd_free(dcopy_cmd_t *cmd);
+
+/*
+ * dcopy_cmd_post()
+ * post a command (allocated from dcopy_cmd_alloc()) to the DMA channel
+ * returns => DCOPY_FAILURE, DCOPY_SUCCESS, DCOPY_NORESOURCES
+ */
+int dcopy_cmd_post(dcopy_cmd_t cmd);
+
+/* dcopy_cmd_poll() flags */
+#define DCOPY_POLL_NOFLAGS (0)
+#define DCOPY_POLL_BLOCK (1 << 0)
+
+/*
+ * dcopy_cmd_poll()
+ * poll on completion status of a previous post. This routine cannot be
+ * called after dcopy_free().
+ *
+ * if flags == DCOPY_POLL_NOFLAGS, return status can be DCOPY_FAILURE,
+ * DCOPY_PENDING, or DCOPY_COMPLETED.
+ *
+ * if flags & DCOPY_POLL_BLOCK, return status can be DCOPY_FAILURE or
+ * DCOPY_COMPLETED. DCOPY_POLL_BLOCK can only be set in base context.
+ *
+ * The command cannot be re-used or freed until the command has completed
+ * (e.g. DCOPY_FAILURE or DCOPY_COMPLETED).
+ */
+int dcopy_cmd_poll(dcopy_cmd_t cmd, int flags);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DCOPY_H */
diff --git a/usr/src/uts/common/sys/dcopy_device.h b/usr/src/uts/common/sys/dcopy_device.h
new file mode 100644
index 0000000000..25e95b2aa8
--- /dev/null
+++ b/usr/src/uts/common/sys/dcopy_device.h
@@ -0,0 +1,154 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DCOPY_DEVICE_H
+#define _SYS_DCOPY_DEVICE_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <sys/dcopy.h>
+
+/*
+ * private command state. Space for this structure should be allocated during
+ * (*cb_cmd_alloc). The DMA driver must set dp_private in dcopy_cmd_t to point
+ * to the memory it allocated. Other than pr_device_cmd_private, the DMA driver
+ * should not touch any of the fields in this structure. pr_device_cmd_private
+ * is a private pointer for the DMA engine to use.
+ */
+struct dcopy_cmd_priv_s {
+ /*
+ * we only init the state used to track a command which blocks when it
+ * actually blocks. pr_block_init tells us when we need to clean it
+ * up during a cmd_free.
+ */
+ boolean_t pr_block_init;
+
+ /* dcopy_poll blocking state */
+ list_node_t pr_poll_list_node;
+ volatile boolean_t pr_wait;
+ kmutex_t pr_mutex;
+ kcondvar_t pr_cv;
+
+ /* back pointer to the command */
+ dcopy_cmd_t pr_cmd;
+
+ /* shortcut to the channel we're on */
+ struct dcopy_channel_s *pr_channel;
+
+ /* DMA driver private pointer */
+ void *pr_device_cmd_private;
+};
+
+/* cb_version */
+#define DCOPY_DEVICECB_V0 0
+
+typedef struct dcopy_device_chaninfo_s {
+ uint_t di_chan_num;
+} dcopy_device_chaninfo_t;
+
+typedef struct dcopy_device_cb_s {
+ int cb_version;
+ int cb_res1;
+
+ /* allocate/free a DMA channel. See dcopy.h for return status */
+ int (*cb_channel_alloc)(void *device_private,
+ dcopy_handle_t handle, int flags, uint_t size,
+ dcopy_query_channel_t *info, void *channel_private);
+ void (*cb_channel_free)(void *channel_private);
+
+ /* allocate/free a command. See dcopy.h for return status */
+ int (*cb_cmd_alloc)(void *channel_private, int flags,
+ dcopy_cmd_t *cmd);
+ void (*cb_cmd_free)(void *channel_private, dcopy_cmd_t *cmd);
+
+ /*
+ * post a command/poll for command status. See dcopy.h for return
+ * status
+ */
+ int (*cb_cmd_post)(void *channel_private, dcopy_cmd_t cmd);
+ int (*cb_cmd_poll)(void *channel_private, dcopy_cmd_t cmd);
+
+ /*
+ * if dcopy_device_unregister() returns DCOPY_PENDING, dcopy will
+ * call this routine when all the channels are no longer being
+ * used and have been freed up, e.g. it's safe for the DMA driver
+ * to detach.
+ * status = DCOPY_SUCCESS || DCOPY_FAILURE
+ */
+ void (*cb_unregister_complete)(void *device_private, int status);
+} dcopy_device_cb_t;
+
+
+typedef struct dcopy_device_info_s {
+ dev_info_t *di_dip;
+ dcopy_device_cb_t *di_cb; /* must be a static array */
+ uint_t di_num_dma;
+ uint_t di_maxxfer;
+ uint_t di_capabilities;
+ uint64_t di_id;
+} dcopy_device_info_t;
+
+typedef struct dcopy_device_s *dcopy_device_handle_t;
+
+/* dcopy_device_channel_notify() status */
+#define DCOPY_COMPLETION 0
+
+/*
+ * dcopy_device_register()
+ * register the DMA device with dcopy.
+ * return status => DCOPY_FAILURE, DCOPY_SUCCESS
+ */
+int dcopy_device_register(void *device_private, dcopy_device_info_t *info,
+ dcopy_device_handle_t *handle);
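+
+/*
+ * Illustrative sketch only (a hypothetical DMA driver's attach path;
+ * the mydrv_* names are not part of this interface):
+ *
+ *	static dcopy_device_cb_t mydrv_cb = {
+ *		DCOPY_DEVICECB_V0, 0,
+ *		mydrv_channel_alloc, mydrv_channel_free,
+ *		mydrv_cmd_alloc, mydrv_cmd_free,
+ *		mydrv_cmd_post, mydrv_cmd_poll,
+ *		mydrv_unregister_complete
+ *	};
+ *	dcopy_device_info_t info;
+ *	dcopy_device_handle_t handle;
+ *
+ *	info.di_dip = dip;
+ *	info.di_cb = &mydrv_cb;
+ *	info.di_num_dma = nchannels;
+ *	(also fill in di_maxxfer, di_capabilities, and di_id)
+ *	if (dcopy_device_register(soft_state, &info, &handle) !=
+ *	    DCOPY_SUCCESS)
+ *		(fail the attach)
+ */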
+/*
+ * dcopy_device_unregister()
+ * try to unregister the DMA device with dcopy. If the DMA engines are
+ * still being used by upper layer modules, DCOPY_PENDING will be returned.
+ * return status => DCOPY_FAILURE, DCOPY_SUCCESS, DCOPY_PENDING
+ * if DCOPY_PENDING, (*cb_unregister_complete)() will be called when
+ * completed.
+ */
+int dcopy_device_unregister(dcopy_device_handle_t *handle);
+
+/*
+ * dcopy_device_channel_notify()
+ * Notify dcopy of an event.
+ * dcopy_handle_t handle => what was passed into (*cb_channel_alloc)()
+ * status => DCOPY_COMPLETION
+ */
+void dcopy_device_channel_notify(dcopy_handle_t handle, int status);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DCOPY_DEVICE_H */
diff --git a/usr/src/uts/common/sys/socketvar.h b/usr/src/uts/common/sys/socketvar.h
index 0680546ade..178a8a2905 100644
--- a/usr/src/uts/common/sys/socketvar.h
+++ b/usr/src/uts/common/sys/socketvar.h
@@ -20,7 +20,7 @@
 */
 /*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
@@ -50,14 +50,13 @@
 #include <sys/file.h>
 #include <sys/param.h>
 #include <sys/zone.h>
+#include <sys/sodirect.h>
 #include <inet/kssl/ksslapi.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
-
-
 /*
 * Internal representation used for addresses.
 */
@@ -333,6 +332,9 @@ struct sonode {
 kssl_endpt_type_t so_kssl_type; /* is proxy/is proxied/none */
 kssl_ent_t so_kssl_ent; /* SSL config entry */
 kssl_ctx_t so_kssl_ctx; /* SSL session context */
+
+ /* != NULL for sodirect_t enabled socket */
+ sodirect_t *so_direct;
};
 /* flags */
@@ -375,6 +377,7 @@ struct sonode {
 #define SS_MOREDATA 0x00100000 /* NCAfs: NCA has more data */
 #define SS_DIRECT 0x00200000 /* transport is directly below */
+#define SS_SODIRECT 0x00400000 /* transport supports sodirect */
 #define SS_LADDR_VALID 0x01000000 /* so_laddr valid for user */
 #define SS_FADDR_VALID 0x02000000 /* so_faddr valid for user */
diff --git a/usr/src/uts/common/sys/sodirect.h b/usr/src/uts/common/sys/sodirect.h
new file mode 100644
index 0000000000..49609bc5af
--- /dev/null
+++ b/usr/src/uts/common/sys/sodirect.h
@@ -0,0 +1,101 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#ifndef _SYS_SODIRECT_H
+#define _SYS_SODIRECT_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Sodirect ...
+ *
+ * Currently the sodirect_t uses the sockfs streamhead STREAMS Q directly;
+ * in the future, when we have STREAMless sockets, a sonode Q will have to
+ * be implemented, however the sodirect KPI shouldn't need to change.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct sodirect_s {
+ uint32_t sod_state; /* State bits */
+ uint32_t sod_want; /* Pending read byte count or 0 */
+ queue_t *sod_q; /* Socket Q */
+ int (*sod_enqueue)(); /* Call to enqueue an mblk_t */
+ void (*sod_wakeup)(); /* Call to awaken a read()er, if any */
+ mblk_t *sod_uioafh; /* To be freed list head, or NULL */
+ mblk_t *sod_uioaft; /* To be freed list tail */
+ kmutex_t *sod_lock; /* Lock needed to protect all members */
+ uioa_t sod_uioa; /* Pending uio_t for uioa_t use */
+} sodirect_t;
+
+/*
+ * sod_state bits:
+ */
+
+#define SOD_DISABLED 0 /* No more sodirect */
+
+#define SOD_ENABLED 0x0001 /* sodirect_t enabled */
+
+#define SOD_WAKE_NOT 0x0010 /* Wakeup not needed */
+#define SOD_WAKE_NEED 0x0020 /* Wakeup needed */
+#define SOD_WAKE_DONE 0x0040 /* Wakeup done */
+#define SOD_WAKE_CLR ~(SOD_WAKE_NOT|SOD_WAKE_NEED|SOD_WAKE_DONE)
+
+/*
+ * Useful macros:
+ */
+
+#define SOD_QSETBE(p) ((p)->sod_q->q_flag |= QWANTW)
+#define SOD_QCLRBE(p) ((p)->sod_q->q_flag &= ~QWANTW)
+#define SOD_QEMPTY(p) ((p)->sod_q->q_first == NULL)
+#define SOD_QFULL(p) ((p)->sod_q->q_flag & QFULL)
+#define SOD_QCNT(p) ((p)->sod_q->q_count)
+
+#define SOD_DISABLE(p) (p)->sod_state &= ~SOD_ENABLED
+
+#define SOD_QTOSODP(q) (q)->q_stream->sd_sodirect
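+
+/*
+ * Illustrative sketch only (hypothetical transport receive path, not
+ * part of this header): an sodirect aware transport hands an M_DATA
+ * mblk_t chain to sockfs through the shared sodirect_t, assuming the
+ * sod_wakeup call-back releases sod_lock on the caller's behalf:
+ *
+ *	sodirect_t *sodp = SOD_QTOSODP(q);
+ *
+ *	mutex_enter(sodp->sod_lock);
+ *	if (sodp->sod_state & SOD_ENABLED) {
+ *		(void) (*sodp->sod_enqueue)(sodp, mp);
+ *		(*sodp->sod_wakeup)(sodp);
+ *	} else {
+ *		mutex_exit(sodp->sod_lock);
+ *		putnext(q, mp);
+ *	}
+ */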
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SODIRECT_H */
diff --git a/usr/src/uts/common/sys/stream.h b/usr/src/uts/common/sys/stream.h
index 7142a1f19d..6720c14718 100644
--- a/usr/src/uts/common/sys/stream.h
+++ b/usr/src/uts/common/sys/stream.h
@@ -19,7 +19,7 @@
 * CDDL HEADER END
 */
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
@@ -190,6 +192,8 @@ typedef struct queue {
 #define _QASSOCIATED 0x10000000 /* queue is associated with a device */
 #define _QDIRECT 0x20000000 /* Private; transport module uses */
 /* direct interface to/from sockfs */
+#define _QSODIRECT 0x40000000 /* Private, transport module shares */
+ /* an sodirect_t with sockfs */
 /* queue sqflags (protected by SQLOCK). */
 #define Q_SQQUEUED 0x01 /* Queue is in the syncq list */
@@ -400,6 +402,7 @@ typedef struct bcache {
 */
 #define DBLK_REFMIN 0x01 /* min refcnt stored in low bit */
 #define DBLK_COOKED 0x02 /* message has been processed once */
+#define DBLK_UIOA 0x04 /* uioamove() is pending */
 /*
 * db_struioflag values:
diff --git a/usr/src/uts/common/sys/strsubr.h b/usr/src/uts/common/sys/strsubr.h
index 6be0519425..41c1fdf0b3 100644
--- a/usr/src/uts/common/sys/strsubr.h
+++ b/usr/src/uts/common/sys/strsubr.h
@@ -46,6 +46,7 @@
 #include <sys/proc.h>
 #include <sys/netstack.h>
 #include <sys/modhash.h>
+#include <sys/sodirect.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -94,9 +95,8 @@ extern "C" {
 * sd_mark
 * sd_closetime
 * sd_wakeq
- * sd_uiordq
- * sd_uiowrq
 * sd_maxblk
+ * sd_sodirect
 *
 * The following fields are modified only by the allocator, which
 * has exclusive access to them at that time:
@@ -244,6 +244,10 @@ typedef struct stdata {
 kcondvar_t sd_zcopy_wait;
 uint_t sd_copyflag; /* copy-related flags */
 zoneid_t sd_anchorzone; /* Allow removal from same zone only */
+ /*
+ * Support for socket direct.
+ */
+ sodirect_t *sd_sodirect; /* pointer to shared sodirect_t */
 } stdata_t;
 /*
diff --git a/usr/src/uts/common/sys/uio.h b/usr/src/uts/common/sys/uio.h
index 3e9e4a5eda..4f0aff49f6 100644
--- a/usr/src/uts/common/sys/uio.h
+++ b/usr/src/uts/common/sys/uio.h
@@ -2,9 +2,8 @@
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
@@ -101,6 +100,49 @@ typedef struct uio {
 ssize_t uio_resid; /* residual count */
 } uio_t;
+/*
+ * Extended uio_t uioa_t used for asynchronous uio.
+ *
+ * Note: UIOA_IOV_MAX is defined and used as it is in "fs/vncalls.c"
+ * as there isn't a formal definition of IOV_MAX for the kernel.
+ */
+#define UIOA_IOV_MAX 16
+
+typedef struct uioa_page_s { /* locked uio_iov state */
+ int uioa_pfncnt; /* count of pfn_t(s) in *uioa_ppp */
+ void **uioa_ppp; /* page_t or pfn_t array */
+ caddr_t uioa_base; /* address base */
+ size_t uioa_len; /* span length */
+} uioa_page_t;
+
+typedef struct uioa_s {
+ iovec_t *uio_iov; /* pointer to array of iovecs */
+ int uio_iovcnt; /* number of iovecs */
+ lloff_t _uio_offset; /* file offset */
+ uio_seg_t uio_segflg; /* address space (kernel or user) */
+ uint16_t uio_fmode; /* file mode flags */
+ uint16_t uio_extflg; /* extended flags */
+ lloff_t _uio_limit; /* u-limit (maximum byte offset) */
+ ssize_t uio_resid; /* residual count */
+ /*
+ * uioa extended members.
+ */
+ uint32_t uioa_state; /* state of asynch i/o */
+ uioa_page_t *uioa_lcur; /* pointer into uioa_locked[] */
+ void **uioa_lppp; /* pointer into lcur->uioa_ppp[] */
+ void *uioa_hwst[4]; /* opaque hardware state */
+ uioa_page_t uioa_locked[UIOA_IOV_MAX]; /* Per iov locked pages */
+} uioa_t;
+
+#define UIOA_ALLOC 0x0001 /* allocated but not yet initialized */
+#define UIOA_INIT 0x0002 /* initialized but not yet enabled */
+#define UIOA_ENABLED 0x0004 /* enabled, asynch i/o active */
+#define UIOA_FINI 0x0008 /* finished waiting for uioafini() */
+
+#define UIOA_CLR (~0x000F) /* clear mutually exclusive bits */
+
+#define UIOA_POLL 0x0010 /* need dcopy_poll() */
+
 #define uio_loffset _uio_offset._f
 #if !defined(_LP64)
 #define uio_offset _uio_offset._p._l
@@ -127,10 +169,24 @@ typedef enum uio_rw { UIO_READ, UIO_WRITE } uio_rw_t;
 * access, ie, access bypassing caches, should be used. Filesystems that
 * don't initialize this field could experience suboptimal performance due to
 * the random data the field contains.
+ *
+ * NOTE: This flag is also used by uioasync callers to pass an extended
+ * uio_t (uioa_t) to uioasync enabled consumers. Unlike the above, all
+ * consumers of a uioa_t require uio_extflg to be initialized.
 */
 #define UIO_COPY_DEFAULT 0x0000 /* no special options to copy */
 #define UIO_COPY_CACHED 0x0001 /* copy should not bypass caches */
+#define UIO_ASYNC 0x0002 /* uio_t is really a uioa_t */
+
+/*
+ * Global uioasync capability shadow state.
+ */
+typedef struct uioasync_s {
+ boolean_t enabled; /* Is uioasync enabled? */
+ size_t mincnt; /* Minimum byte count for use of uioasync */
+} uioasync_t;
+
 #endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */
 #if defined(_KERNEL)
@@ -141,6 +197,11 @@ int uwritec(struct uio *);
 void uioskip(uio_t *, size_t);
 int uiodup(uio_t *, uio_t *, iovec_t *, int);
+int uioamove(void *, size_t, enum uio_rw, uioa_t *);
+int uioainit(uio_t *, uioa_t *);
+int uioafini(uio_t *, uioa_t *);
+extern uioasync_t uioasync;
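+
+/*
+ * Illustrative sketch only (hypothetical kernel consumer; "buf", "n",
+ * and "uiop" are the caller's, and "uioa" is assumed to have been set
+ * to UIOA_ALLOC by its allocator): choose the asynchronous path only
+ * when the platform supports it and the transfer is large enough:
+ *
+ *	if (uioasync.enabled && n >= uioasync.mincnt &&
+ *	    uioainit(uiop, &uioa) == 0) {
+ *		uioa.uioa_state &= UIOA_CLR;
+ *		uioa.uioa_state |= UIOA_ENABLED;
+ *		error = uioamove(buf, n, UIO_READ, &uioa);
+ *		...
+ *		error = uioafini(uiop, &uioa);
+ *	} else {
+ *		error = uiomove(buf, n, UIO_READ, uiop);
+ *	}
+ */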
+
 #else /* defined(_KERNEL) */
 #if defined(__STDC__)