diff options
| author | chunli zhang - Sun Microsystems - Irvine United States <Chunli.Zhang@Sun.COM> | 2010-01-18 10:34:16 -0800 | 
|---|---|---|
| committer | chunli zhang - Sun Microsystems - Irvine United States <Chunli.Zhang@Sun.COM> | 2010-01-18 10:34:16 -0800 | 
| commit | c242f9a02a2ef021449275ae0a1d2581ee77231d (patch) | |
| tree | 6d298bebb8ff9febd9acf936d402f67a6d67d358 /usr/src | |
| parent | bce54adf407df0723facaef4e2147ed69b922786 (diff) | |
| download | illumos-joyent-c242f9a02a2ef021449275ae0a1d2581ee77231d.tar.gz | |
6873106 Need a mechanism to share buffers between fs modules
Diffstat (limited to 'usr/src')
27 files changed, 990 insertions, 84 deletions
| diff --git a/usr/src/cmd/stat/fsstat/fsstat.c b/usr/src/cmd/stat/fsstat/fsstat.c index 1869ff4fe6..31b2f5e054 100644 --- a/usr/src/cmd/stat/fsstat/fsstat.c +++ b/usr/src/cmd/stat/fsstat/fsstat.c @@ -19,7 +19,7 @@   * CDDL HEADER END   */  /* - * Copyright 2009 Sun Microsystems, Inc.  All rights reserved. + * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */ @@ -428,6 +428,8 @@ vop_display(char *name, vopstats_t *oldvsp, vopstats_t *newvsp, int dispflag)  	PRINT_VOPSTAT(niceflag, setsecattr);  	PRINT_VOPSTAT(niceflag, shrlock);  	PRINT_VOPSTAT(niceflag, vnevent); +	PRINT_VOPSTAT(niceflag, reqzcbuf); +	PRINT_VOPSTAT(niceflag, retzcbuf);  	if (niceflag) {  		/* Make it easier on the eyes */ diff --git a/usr/src/uts/common/fs/fem.c b/usr/src/uts/common/fs/fem.c index b7d1413bd7..95248d1077 100644 --- a/usr/src/uts/common/fs/fem.c +++ b/usr/src/uts/common/fs/fem.c @@ -19,10 +19,9 @@   * CDDL HEADER END   */  /* - * Copyright 2008 Sun Microsystems, Inc.  All rights reserved. + * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */ -#pragma ident	"%Z%%M%	%I%	%E% SMI"  #include <sys/types.h>  #include <sys/atomic.h> @@ -124,6 +123,8 @@ static fs_operation_trans_def_t	fem_opdef[] = {  	_FEMOPDEF(GETSECATTR,	getsecattr),  	_FEMOPDEF(SHRLOCK,	shrlock),  	_FEMOPDEF(VNEVENT,	vnevent), +	_FEMOPDEF(REQZCBUF,	reqzcbuf), +	_FEMOPDEF(RETZCBUF,	retzcbuf),  	{ NULL, 0, NULL, NULL }  }; @@ -176,6 +177,8 @@ static struct fs_operation_def fem_guard_ops[] = {  	_FEMGUARD(GETSECATTR,	getsecattr),  	_FEMGUARD(SHRLOCK,	shrlock),  	_FEMGUARD(VNEVENT,	vnevent), +	_FEMGUARD(REQZCBUF,	reqzcbuf), +	_FEMGUARD(RETZCBUF,	retzcbuf),  	{ NULL, NULL }  }; @@ -1645,6 +1648,61 @@ vhead_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *cname,  }  static int +vhead_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuiop, cred_t *cr, +    caller_context_t *ct) +{ +	femarg_t	farg; +	struct fem_list	*femsp; +	int		(*func)(); +	void		*arg0; +	int		errc; + +	if ((femsp = fem_lock(vp->v_femhead)) == NULL) { +		func = (int (*)()) (vp->v_op->vop_reqzcbuf); +		arg0 = vp; +		fem_unlock(vp->v_femhead); +		errc = (*func)(arg0, ioflag, xuiop, cr, ct); +	} else { +		fem_addref(femsp); +		fem_unlock(vp->v_femhead); +		farg.fa_vnode.vp = vp; +		farg.fa_fnode = femsp->feml_nodes + femsp->feml_tos; +		vsop_find(&farg, &func, int, &arg0, vop_reqzcbuf, +		    femop_reqzcbuf); +		errc = (*func)(arg0, ioflag, xuiop, cr, ct); +		fem_release(femsp); +	} +	return (errc); +} + +static int +vhead_retzcbuf(vnode_t *vp, xuio_t *xuiop, cred_t *cr, caller_context_t *ct) +{ +	femarg_t	farg; +	struct fem_list	*femsp; +	int		(*func)(); +	void		*arg0; +	int		errc; + +	if ((femsp = fem_lock(vp->v_femhead)) == NULL) { +		func = (int (*)()) (vp->v_op->vop_retzcbuf); +		arg0 = vp; +		fem_unlock(vp->v_femhead); +		errc = (*func)(arg0, xuiop, cr, ct); +	} else { +		fem_addref(femsp); +		fem_unlock(vp->v_femhead); +		farg.fa_vnode.vp = vp; +		farg.fa_fnode = femsp->feml_nodes + femsp->feml_tos; +		vsop_find(&farg, &func, int, &arg0, vop_retzcbuf, +		    femop_retzcbuf); +		errc = (*func)(arg0, xuiop, cr, ct); +		fem_release(femsp); +	} +	return (errc); +} + +static int  fshead_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)  {  	fsemarg_t	farg; @@ -1942,6 +2000,8 @@ static struct fs_operation_def fhead_vn_spec[] = {  	{ VOPNAME_GETSECATTR, (femop_t *)vhead_getsecattr },  	{ VOPNAME_SHRLOCK, (femop_t *)vhead_shrlock },  	{ VOPNAME_VNEVENT, (femop_t *)vhead_vnevent }, +	{ VOPNAME_REQZCBUF, (femop_t *)vhead_reqzcbuf }, +	{ VOPNAME_RETZCBUF, (femop_t *)vhead_retzcbuf },  	{	NULL,	NULL	}  }; @@ -2642,6 +2702,35 @@ vnext_vnevent(femarg_t *vf, vnevent_t vnevent, vnode_t *dvp, char *cname,  }  int +vnext_reqzcbuf(femarg_t *vf, enum uio_rw ioflag, xuio_t *xuiop, cred_t *cr, +    caller_context_t *ct) +{ +	int (*func)() = NULL; +	void *arg0 = NULL; + +	ASSERT(vf != NULL); +	vf->fa_fnode--; +	vsop_find(vf, &func, int, &arg0, vop_reqzcbuf, femop_reqzcbuf); +	ASSERT(func != NULL); +	ASSERT(arg0 != NULL); +	return ((*func)(arg0, ioflag, xuiop, cr, ct)); +} + +int +vnext_retzcbuf(femarg_t *vf, xuio_t *xuiop, cred_t *cr, caller_context_t *ct) +{ +	int (*func)() = NULL; +	void *arg0 = NULL; + +	ASSERT(vf != NULL); +	vf->fa_fnode--; +	vsop_find(vf, &func, int, &arg0, vop_retzcbuf, femop_retzcbuf); +	ASSERT(func != NULL); +	ASSERT(arg0 != NULL); +	return ((*func)(arg0, xuiop, cr, ct)); +} + +int  vfsnext_mount(fsemarg_t *vf, vnode_t *mvp, struct mounta *uap, cred_t *cr)  {  	int (*func)() = NULL; diff --git a/usr/src/uts/common/fs/nfs/nfs3_srv.c b/usr/src/uts/common/fs/nfs/nfs3_srv.c index 71ebdb2d74..b8e63c183d 100644 --- a/usr/src/uts/common/fs/nfs/nfs3_srv.c +++ b/usr/src/uts/common/fs/nfs/nfs3_srv.c @@ -19,7 +19,7 @@   * CDDL HEADER END   */  /* - * Copyright 2009 Sun Microsystems, Inc.  All rights reserved. + * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */ @@ -87,6 +87,8 @@ static void	vattr_to_pre_op_attr(struct vattr *, pre_op_attr *);  static void	vattr_to_wcc_data(struct vattr *, struct vattr *, wcc_data *);  static int	rdma_setup_read_data3(READ3args *, READ3resok *); +extern int nfs_loaned_buffers; +  u_longlong_t nfs3_srv_caller_id;  /* ARGSUSED */ @@ -994,6 +996,9 @@ rfs3_read(READ3args *args, READ3res *resp, struct exportinfo *exi,  	int in_crit = 0;  	int need_rwunlock = 0;  	caller_context_t ct; +	int rdma_used = 0; +	int loaned_buffers; +	struct uio *uiop;  	vap = NULL; @@ -1007,6 +1012,12 @@ rfs3_read(READ3args *args, READ3res *resp, struct exportinfo *exi,  		goto out;  	} +	if (args->wlist) +		rdma_used = 1; + +	/* use loaned buffers for TCP */ +	loaned_buffers = (nfs_loaned_buffers && !rdma_used) ? 1 : 0; +  	if (is_system_labeled()) {  		bslabel_t *clabel = req->rq_label; @@ -1136,12 +1147,38 @@ rfs3_read(READ3args *args, READ3res *resp, struct exportinfo *exi,  	if (args->count > rfs3_tsize(req))  		args->count = rfs3_tsize(req); +	if (loaned_buffers) { +		uiop = (uio_t *)rfs_setup_xuio(vp); +		ASSERT(uiop != NULL); +		uiop->uio_segflg = UIO_SYSSPACE; +		uiop->uio_loffset = args->offset; +		uiop->uio_resid = args->count; + +		/* Jump to do the read if successful */ +		if (VOP_REQZCBUF(vp, UIO_READ, (xuio_t *)uiop, cr, &ct) == 0) { +			/* +			 * Need to hold the vnode until after VOP_RETZCBUF() +			 * is called. +			 */ +			VN_HOLD(vp); +			goto doio_read; +		} + +		DTRACE_PROBE2(nfss__i__reqzcbuf_failed, int, +		    uiop->uio_loffset, int, uiop->uio_resid); + +		uiop->uio_extflg = 0; +		/* failure to setup for zero copy */ +		rfs_free_xuio((void *)uiop); +		loaned_buffers = 0; +	} +  	/*  	 * If returning data via RDMA Write, then grab the chunk list.  	 * If we aren't returning READ data w/RDMA_WRITE, then grab  	 * a mblk.  	 */ -	if (args->wlist) { +	if (rdma_used) {  		mp = NULL;  		(void) rdma_get_wchunk(req, &iov, args->wlist);  	} else { @@ -1167,11 +1204,14 @@ rfs3_read(READ3args *args, READ3res *resp, struct exportinfo *exi,  	uio.uio_extflg = UIO_COPY_CACHED;  	uio.uio_loffset = args->offset;  	uio.uio_resid = args->count; +	uiop = &uio; -	error = VOP_READ(vp, &uio, 0, cr, &ct); +doio_read: +	error = VOP_READ(vp, uiop, 0, cr, &ct);  	if (error) { -		freeb(mp); +		if (mp) +			freemsg(mp);  		/* check if a monitor detected a delegation conflict */  		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {  			resp->status = NFS3ERR_JUKEBOX; @@ -1180,6 +1220,12 @@ rfs3_read(READ3args *args, READ3res *resp, struct exportinfo *exi,  		goto out;  	} +	/* make mblk using zc buffers */ +	if (loaned_buffers) { +		mp = uio_to_mblk(uiop); +		ASSERT(mp != NULL); +	} +  	va.va_mask = AT_ALL;  	error = VOP_GETATTR(vp, &va, 0, cr, &ct); @@ -1205,16 +1251,20 @@ rfs3_read(READ3args *args, READ3res *resp, struct exportinfo *exi,  	resp->status = NFS3_OK;  	vattr_to_post_op_attr(vap, &resp->resok.file_attributes); -	resp->resok.count = args->count - uio.uio_resid; +	resp->resok.count = args->count - uiop->uio_resid;  	if (!error && offset + resp->resok.count == va.va_size)  		resp->resok.eof = TRUE;  	else  		resp->resok.eof = FALSE;  	resp->resok.data.data_len = resp->resok.count; + +	if (mp) +		rfs_rndup_mblks(mp, resp->resok.count, loaned_buffers); +  	resp->resok.data.mp = mp;  	resp->resok.size = (uint_t)args->count; -	if (args->wlist) { +	if (rdma_used) {  		resp->resok.data.data_val = (caddr_t)iov.iov_base;  		if (!rdma_setup_read_data3(args, &(resp->resok))) {  			resp->status = NFS3ERR_INVAL; @@ -1260,7 +1310,7 @@ rfs3_read_free(READ3res *resp)  	if (resp->status == NFS3_OK) {  		mp = resp->resok.data.mp;  		if (mp != NULL) -			freeb(mp); +			freemsg(mp);  	}  } diff --git a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c index 501c2dbd9e..2111e9fabf 100644 --- a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c +++ b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c @@ -1003,7 +1003,7 @@ errout:  static int nfs3_dynamic = 0;	/* global variable to enable dynamic retrans. */  static ushort_t nfs3_max_threads = 8;	/* max number of active async threads */ -static uint_t nfs3_bsize = 32 * 1024;	/* client `block' size */ +uint_t nfs3_bsize = 32 * 1024;	/* client `block' size */  static uint_t nfs3_async_clusters = 1;	/* # of reqs from each async queue */  static uint_t nfs3_cots_timeo = NFS_COTS_TIMEO; diff --git a/usr/src/uts/common/fs/nfs/nfs3_xdr.c b/usr/src/uts/common/fs/nfs/nfs3_xdr.c index e8fd857848..cdec8ffc96 100644 --- a/usr/src/uts/common/fs/nfs/nfs3_xdr.c +++ b/usr/src/uts/common/fs/nfs/nfs3_xdr.c @@ -19,7 +19,7 @@   * CDDL HEADER END   */  /* - * Copyright 2009 Sun Microsystems, Inc.  All rights reserved. + * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */ @@ -1320,16 +1320,9 @@ xdr_READ3res(XDR *xdrs, READ3res *objp)  	}  	if (xdrs->x_op == XDR_ENCODE) { -		int i, rndup;  		mp = resokp->data.mp;  		if (mp != NULL && xdrs->x_ops == &xdrmblk_ops) { -			mp->b_wptr += resokp->count; -			rndup = BYTES_PER_XDR_UNIT - -			    (resokp->data.data_len % BYTES_PER_XDR_UNIT); -			if (rndup != BYTES_PER_XDR_UNIT) -				for (i = 0; i < rndup; i++) -					*mp->b_wptr++ = '\0';  			if (xdrmblk_putmblk(xdrs, mp, resokp->count) == TRUE) {  				resokp->data.mp = NULL;  				return (TRUE); diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv.c b/usr/src/uts/common/fs/nfs/nfs4_srv.c index ac584c9d62..62474ee7f6 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_srv.c +++ b/usr/src/uts/common/fs/nfs/nfs4_srv.c @@ -19,7 +19,7 @@   * CDDL HEADER END   */  /* - * Copyright 2009 Sun Microsystems, Inc.  All rights reserved. + * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */ @@ -84,6 +84,8 @@ static int rfs4_maxlock_tries = RFS4_MAXLOCK_TRIES;  #define	RFS4_LOCK_DELAY 10	/* Milliseconds */  static clock_t  rfs4_lock_delay = RFS4_LOCK_DELAY;  extern struct svc_ops rdma_svc_ops; +extern int nfs_loaned_buffers; +/* End of Tunables */  static int rdma_setup_read_data4(READ4args *, READ4res *); @@ -3140,9 +3142,12 @@ rfs4_op_read(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req,  	bool_t *deleg = &cs->deleg;  	nfsstat4 stat;  	int in_crit = 0; -	mblk_t *mp; +	mblk_t *mp = NULL;  	int alloc_err = 0; +	int rdma_used = 0; +	int loaned_buffers;  	caller_context_t ct; +	struct uio *uiop;  	DTRACE_NFSV4_2(op__read__start, struct compound_state *, cs,  	    READ4args, args); @@ -3183,6 +3188,12 @@ rfs4_op_read(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req,  		goto out;  	} +	if (args->wlist) +		rdma_used = 1; + +	/* use loaned buffers for TCP */ +	loaned_buffers = (nfs_loaned_buffers && !rdma_used) ? 1 : 0; +  	va.va_mask = AT_MODE|AT_SIZE|AT_UID;  	verror = VOP_GETATTR(vp, &va, 0, cs->cr, &ct); @@ -3250,11 +3261,38 @@ rfs4_op_read(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req,  	if (args->count > rfs4_tsize(req))  		args->count = rfs4_tsize(req); +	if (loaned_buffers) { +		uiop = (uio_t *)rfs_setup_xuio(vp); +		ASSERT(uiop != NULL); +		uiop->uio_segflg = UIO_SYSSPACE; +		uiop->uio_loffset = args->offset; +		uiop->uio_resid = args->count; + +		/* Jump to do the read if successful */ +		if (!VOP_REQZCBUF(vp, UIO_READ, (xuio_t *)uiop, cs->cr, &ct)) { +			/* +			 * Need to hold the vnode until after VOP_RETZCBUF() +			 * is called. +			 */ +			VN_HOLD(vp); +			goto doio_read; +		} + +		DTRACE_PROBE2(nfss__i__reqzcbuf_failed, int, +		    uiop->uio_loffset, int, uiop->uio_resid); + +		uiop->uio_extflg = 0; + +		/* failure to setup for zero copy */ +		rfs_free_xuio((void *)uiop); +		loaned_buffers = 0; +	} +  	/*  	 * If returning data via RDMA Write, then grab the chunk list. If we  	 * aren't returning READ data w/RDMA_WRITE, then grab a mblk.  	 */ -	if (args->wlist) { +	if (rdma_used) {  		mp = NULL;  		(void) rdma_get_wchunk(req, &iov, args->wlist);  	} else { @@ -3287,27 +3325,38 @@ rfs4_op_read(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req,  	uio.uio_extflg = UIO_COPY_CACHED;  	uio.uio_loffset = args->offset;  	uio.uio_resid = args->count; +	uiop = &uio; -	error = do_io(FREAD, vp, &uio, 0, cs->cr, &ct); +doio_read: +	error = do_io(FREAD, vp, uiop, 0, cs->cr, &ct);  	va.va_mask = AT_SIZE;  	verror = VOP_GETATTR(vp, &va, 0, cs->cr, &ct);  	if (error) { -		freeb(mp); +		if (mp) +			freemsg(mp);  		*cs->statusp = resp->status = puterrno4(error);  		goto out;  	} +	/* make mblk using zc buffers */ +	if (loaned_buffers) { +		mp = uio_to_mblk(uiop); +		ASSERT(mp != NULL); +	} +  	*cs->statusp = resp->status = NFS4_OK; -	ASSERT(uio.uio_resid >= 0); -	resp->data_len = args->count - uio.uio_resid; +	ASSERT(uiop->uio_resid >= 0); +	resp->data_len = args->count - uiop->uio_resid;  	if (mp) {  		resp->data_val = (char *)mp->b_datap->db_base; +		rfs_rndup_mblks(mp, resp->data_len, loaned_buffers);  	} else {  		resp->data_val = (caddr_t)iov.iov_base;  	} +  	resp->mblk = mp;  	if (!verror && offset + resp->data_len == va.va_size) @@ -3315,7 +3364,7 @@ rfs4_op_read(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req,  	else  		resp->eof = FALSE; -	if (args->wlist) { +	if (rdma_used) {  		if (!rdma_setup_read_data4(args, resp)) {  			*cs->statusp = resp->status = NFS4ERR_INVAL;  		} @@ -3337,7 +3386,7 @@ rfs4_op_read_free(nfs_resop4 *resop)  	READ4res	*resp = &resop->nfs_resop4_u.opread;  	if (resp->status == NFS4_OK && resp->mblk != NULL) { -		freeb(resp->mblk); +		freemsg(resp->mblk);  		resp->mblk = NULL;  		resp->data_val = NULL;  		resp->data_len = 0; diff --git a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c index d6ac9bf407..040fbed7bd 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c +++ b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c @@ -2159,7 +2159,7 @@ restore_svp(mntinfo4_t *mi, servinfo4_t *svp, servinfo4_t *origsvp)  }  static ushort_t nfs4_max_threads = 8;	/* max number of active async threads */ -static uint_t nfs4_bsize = 32 * 1024;	/* client `block' size */ +uint_t nfs4_bsize = 32 * 1024;	/* client `block' size */  static uint_t nfs4_async_clusters = 1;	/* # of reqs from each async queue */  static uint_t nfs4_cots_timeo = NFS_COTS_TIMEO; diff --git a/usr/src/uts/common/fs/nfs/nfs4_xdr.c b/usr/src/uts/common/fs/nfs/nfs4_xdr.c index e2e14cff8a..08e9546cf3 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_xdr.c +++ b/usr/src/uts/common/fs/nfs/nfs4_xdr.c @@ -19,7 +19,7 @@   * CDDL HEADER END   */  /* - * Copyright 2009 Sun Microsystems, Inc.  All rights reserved. + * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */ @@ -3350,7 +3350,6 @@ xdr_READ4args(XDR *xdrs, READ4args *objp)  static bool_t  xdr_READ4res(XDR *xdrs, READ4res *objp)  { -	int i, rndup;  	mblk_t *mp;  	if (xdrs->x_op == XDR_DECODE) @@ -3378,12 +3377,6 @@ xdr_READ4res(XDR *xdrs, READ4res *objp)  	mp = objp->mblk;  	if (mp != NULL && xdrs->x_ops == &xdrmblk_ops) { -		mp->b_wptr += objp->data_len; -		rndup = BYTES_PER_XDR_UNIT - -		    (objp->data_len % BYTES_PER_XDR_UNIT); -		if (rndup != BYTES_PER_XDR_UNIT) -			for (i = 0; i < rndup; i++) -				*mp->b_wptr++ = '\0';  		if (xdrmblk_putmblk(xdrs, mp, objp->data_len) == TRUE) {  			objp->mblk = NULL;  			return (TRUE); diff --git a/usr/src/uts/common/fs/nfs/nfs_server.c b/usr/src/uts/common/fs/nfs/nfs_server.c index dc7a23b583..2f6e2bc8be 100644 --- a/usr/src/uts/common/fs/nfs/nfs_server.c +++ b/usr/src/uts/common/fs/nfs/nfs_server.c @@ -19,7 +19,7 @@   * CDDL HEADER END   */  /* - * Copyright 2009 Sun Microsystems, Inc.  All rights reserved. + * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */ @@ -106,6 +106,9 @@ static struct modlinkage modlinkage = {  char _depends_on[] = "misc/klmmod"; +kmem_cache_t *nfs_xuio_cache; +int nfs_loaned_buffers = 0; +  int  _init(void)  { @@ -139,6 +142,11 @@ _init(void)  	/* setup DSS paths here; must be done before initial server startup */  	rfs4_dss_paths = rfs4_dss_oldpaths = NULL; +	/* initialize the copy reduction caches */ + +	nfs_xuio_cache = kmem_cache_create("nfs_xuio_cache", +	    sizeof (nfs_xuio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); +  	return (status);  } @@ -3215,3 +3223,140 @@ do_rfs_label_check(bslabel_t *clabel, vnode_t *vp, int flag,  	label_rele(tslabel);  	return (result);  } + +/* + * Callback function to return the loaned buffers. + * Calls VOP_RETZCBUF() only after all uio_iov[] + * buffers are returned. nu_ref maintains the count. + */ +void +rfs_free_xuio(void *free_arg) +{ +	uint_t ref; +	nfs_xuio_t *nfsuiop = (nfs_xuio_t *)free_arg; + +	ref = atomic_dec_uint_nv(&nfsuiop->nu_ref); + +	/* +	 * Call VOP_RETZCBUF() only when all the iov buffers +	 * are sent OTW. +	 */ +	if (ref != 0) +		return; + +	if (((uio_t *)nfsuiop)->uio_extflg & UIO_XUIO) { +		(void) VOP_RETZCBUF(nfsuiop->nu_vp, (xuio_t *)free_arg, NULL, +		    NULL); +		VN_RELE(nfsuiop->nu_vp); +	} + +	kmem_cache_free(nfs_xuio_cache, free_arg); +} + +xuio_t * +rfs_setup_xuio(vnode_t *vp) +{ +	nfs_xuio_t *nfsuiop; + +	nfsuiop = kmem_cache_alloc(nfs_xuio_cache, KM_SLEEP); + +	bzero(nfsuiop, sizeof (nfs_xuio_t)); +	nfsuiop->nu_vp = vp; + +	/* +	 * ref count set to 1. more may be added +	 * if multiple mblks refer to multiple iov's. +	 * This is done in uio_to_mblk(). +	 */ + +	nfsuiop->nu_ref = 1; + +	nfsuiop->nu_frtn.free_func = rfs_free_xuio; +	nfsuiop->nu_frtn.free_arg = (char *)nfsuiop; + +	nfsuiop->nu_uio.xu_type = UIOTYPE_ZEROCOPY; + +	return (&nfsuiop->nu_uio); +} + +mblk_t * +uio_to_mblk(uio_t *uiop) +{ +	struct iovec *iovp; +	int i; +	mblk_t *mp, *mp1; +	nfs_xuio_t *nfsuiop = (nfs_xuio_t *)uiop; + +	if (uiop->uio_iovcnt == 0) +		return (NULL); + +	iovp = uiop->uio_iov; +	mp = mp1 = esballoca((uchar_t *)iovp->iov_base, iovp->iov_len, +	    BPRI_MED, &nfsuiop->nu_frtn); +	ASSERT(mp != NULL); + +	mp->b_wptr += iovp->iov_len; +	mp->b_datap->db_type = M_DATA; + +	for (i = 1; i < uiop->uio_iovcnt; i++) { +		iovp = (uiop->uio_iov + i); + +		mp1->b_cont = esballoca( +		    (uchar_t *)iovp->iov_base, iovp->iov_len, BPRI_MED, +		    &nfsuiop->nu_frtn); + +		mp1 = mp1->b_cont; +		ASSERT(mp1 != NULL); +		mp1->b_wptr += iovp->iov_len; +		mp1->b_datap->db_type = M_DATA; +	} + +	nfsuiop->nu_ref = uiop->uio_iovcnt; + +	return (mp); +} + +void +rfs_rndup_mblks(mblk_t *mp, uint_t len, int buf_loaned) +{ +	int i, rndup; +	int alloc_err = 0; +	mblk_t *rmp; + +	rndup = BYTES_PER_XDR_UNIT - (len % BYTES_PER_XDR_UNIT); + +	/* single mblk_t non copy-reduction case */ +	if (!buf_loaned) { +		mp->b_wptr += len; +		if (rndup != BYTES_PER_XDR_UNIT) { +			for (i = 0; i < rndup; i++) +				*mp->b_wptr++ = '\0'; +		} +		return; +	} + +	/* no need for extra rndup */ +	if (rndup == BYTES_PER_XDR_UNIT) +		return; + +	while (mp->b_cont) +		mp = mp->b_cont; + +	/* +	 * In case of copy-reduction mblks, the size of the mblks +	 * are fixed and are of the size of the loaned buffers. +	 * Allocate a roundup mblk and chain it to the data +	 * buffers. This is sub-optimal, but not expected to +	 * happen in regular common workloads. +	 */ + +	rmp = allocb_wait(rndup, BPRI_MED, STR_NOSIG, &alloc_err); +	ASSERT(rmp != NULL); +	ASSERT(alloc_err == 0); + +	for (i = 0; i < rndup; i++) +		*rmp->b_wptr++ = '\0'; + +	rmp->b_datap->db_type = M_DATA; +	mp->b_cont = rmp; +} diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c index 2f7aa751ad..acdfdb36a1 100644 --- a/usr/src/uts/common/fs/vnode.c +++ b/usr/src/uts/common/fs/vnode.c @@ -19,7 +19,7 @@   * CDDL HEADER END   */  /* - * Copyright 2009 Sun Microsystems, Inc.  All rights reserved. + * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */ @@ -362,6 +362,12 @@ static const fs_operation_trans_def_t vn_ops_table[] = {  	    (fs_generic_func_p) fs_vnevent_nosupport,  	    (fs_generic_func_p) fs_vnevent_nosupport, +	VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf), +	    fs_nosys, fs_nosys, + +	VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf), +	    fs_nosys, fs_nosys, +  	NULL, 0, NULL, NULL  }; @@ -522,6 +528,10 @@ create_vopstats_template()  	kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);  	/* VOP_VNEVENT */  	kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64); +	/* VOP_REQZCBUF */ +	kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64); +	/* VOP_RETZCBUF */ +	kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);  	return (vsp);  } @@ -4151,6 +4161,31 @@ fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,  	return (err);  } +int +fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr, +    caller_context_t *ct) +{ +	int err; + +	if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0) +		return (ENOTSUP); +	err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct); +	VOPSTATS_UPDATE(vp, reqzcbuf); +	return (err); +} + +int +fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct) +{ +	int err; + +	if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0) +		return (ENOTSUP); +	err = (*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct); +	VOPSTATS_UPDATE(vp, retzcbuf); +	return (err); +} +  /*   * Default destructor   *	Needed because NULL destructor means that the key is unused diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 9c4fb291ca..8e03c48a23 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -19,7 +19,7 @@   * CDDL HEADER END   */  /* - * Copyright 2009 Sun Microsystems, Inc.  All rights reserved. + * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */ @@ -1241,14 +1241,31 @@ arc_return_buf(arc_buf_t *buf, void *tag)  {  	arc_buf_hdr_t *hdr = buf->b_hdr; -	ASSERT(hdr->b_state == arc_anon);  	ASSERT(buf->b_data != NULL); -	VERIFY(refcount_remove(&hdr->b_refcnt, arc_onloan_tag) == 0); -	VERIFY(refcount_add(&hdr->b_refcnt, tag) == 1); +	(void) refcount_add(&hdr->b_refcnt, tag); +	(void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);  	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);  } +/* Detach an arc_buf from a dbuf (tag) */ +void +arc_loan_inuse_buf(arc_buf_t *buf, void *tag) +{ +	arc_buf_hdr_t *hdr; + +	rw_enter(&buf->b_lock, RW_WRITER); +	ASSERT(buf->b_data != NULL); +	hdr = buf->b_hdr; +	(void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); +	(void) refcount_remove(&hdr->b_refcnt, tag); +	buf->b_efunc = NULL; +	buf->b_private = NULL; + +	atomic_add_64(&arc_loaned_bytes, hdr->b_size); +	rw_exit(&buf->b_lock); +} +  static arc_buf_t *  arc_buf_clone(arc_buf_t *from)  { diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index ed2dc455de..b1f20af319 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -19,7 +19,7 @@   * CDDL HEADER END   */  /* - * Copyright 2009 Sun Microsystems, Inc.  All rights reserved. + * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */ @@ -406,6 +406,29 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)  	}  } +/* + * Loan out an arc_buf for read.  Return the loaned arc_buf. + */ +arc_buf_t * +dbuf_loan_arcbuf(dmu_buf_impl_t *db) +{ +	arc_buf_t *abuf; + +	mutex_enter(&db->db_mtx); +	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { +		int blksz = db->db.db_size; +		mutex_exit(&db->db_mtx); +		abuf = arc_loan_buf(db->db_dnode->dn_objset->os_spa, blksz); +		bcopy(db->db.db_data, abuf->b_data, blksz); +	} else { +		abuf = db->db_buf; +		arc_loan_inuse_buf(abuf, db); +		dbuf_set_data(db, NULL); +		mutex_exit(&db->db_mtx); +	} +	return (abuf); +} +  uint64_t  dbuf_whichblock(dnode_t *dn, uint64_t offset)  { @@ -1162,7 +1185,6 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)  	ASSERT(db->db_blkid != DB_BONUS_BLKID);  	mutex_enter(&db->db_mtx); -  	/*  	 * If this buffer is not dirty, we're done.  	 */ @@ -1341,9 +1363,11 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)  		(void) dbuf_dirty(db, tx);  		bcopy(buf->b_data, db->db.db_data, db->db.db_size);  		VERIFY(arc_buf_remove_ref(buf, db) == 1); +		xuio_stat_wbuf_copied();  		return;  	} +	xuio_stat_wbuf_nocopy();  	if (db->db_state == DB_CACHED) {  		dbuf_dirty_record_t *dr = db->db_last_dirty; diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index d3dfc21ac1..2d0927bd44 100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -19,7 +19,7 @@   * CDDL HEADER END   */  /* - * Copyright 2009 Sun Microsystems, Inc.  All rights reserved. + * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */ @@ -661,12 +661,136 @@ dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,  	dmu_buf_rele_array(dbp, numbufs, FTAG);  } +/* + * DMU support for xuio + */ +kstat_t *xuio_ksp = NULL; + +int +dmu_xuio_init(xuio_t *xuio, int nblk) +{ +	dmu_xuio_t *priv; +	uio_t *uio = &xuio->xu_uio; + +	uio->uio_iovcnt = nblk; +	uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP); + +	priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP); +	priv->cnt = nblk; +	priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP); +	priv->iovp = uio->uio_iov; +	XUIO_XUZC_PRIV(xuio) = priv; + +	if (XUIO_XUZC_RW(xuio) == UIO_READ) +		XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk); +	else +		XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); + +	return (0); +} + +void +dmu_xuio_fini(xuio_t *xuio) +{ +	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); +	int nblk = priv->cnt; + +	kmem_free(priv->iovp, nblk * sizeof (iovec_t)); +	kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); +	kmem_free(priv, sizeof (dmu_xuio_t)); + +	if (XUIO_XUZC_RW(xuio) == UIO_READ) +		XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk); +	else +		XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); +} + +/* + * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } + * and increase priv->next by 1. + */ +int +dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) +{ +	struct iovec *iov; +	uio_t *uio = &xuio->xu_uio; +	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); +	int i = priv->next++; + +	ASSERT(i < priv->cnt); +	ASSERT(off + n <= arc_buf_size(abuf)); +	iov = uio->uio_iov + i; +	iov->iov_base = (char *)abuf->b_data + off; +	iov->iov_len = n; +	priv->bufs[i] = abuf; +	return (0); +} + +int +dmu_xuio_cnt(xuio_t *xuio) +{ +	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); +	return (priv->cnt); +} + +arc_buf_t * +dmu_xuio_arcbuf(xuio_t *xuio, int i) +{ +	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + +	ASSERT(i < priv->cnt); +	return (priv->bufs[i]); +} + +void +dmu_xuio_clear(xuio_t *xuio, int i) +{ +	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + +	ASSERT(i < priv->cnt); +	priv->bufs[i] = NULL; +} + +static void +xuio_stat_init(void) +{ +	xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc", +	    KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t), +	    KSTAT_FLAG_VIRTUAL); +	if (xuio_ksp != NULL) { +		xuio_ksp->ks_data = &xuio_stats; +		kstat_install(xuio_ksp); +	} +} + +static void +xuio_stat_fini(void) +{ +	if (xuio_ksp != NULL) { +		kstat_delete(xuio_ksp); +		xuio_ksp = NULL; +	} +} + +void +xuio_stat_wbuf_copied() +{ +	XUIOSTAT_BUMP(xuiostat_wbuf_copied); +} + +void +xuio_stat_wbuf_nocopy() +{ +	XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); +} +  #ifdef _KERNEL  int  dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)  {  	dmu_buf_t **dbp;  	int numbufs, i, err; +	xuio_t *xuio = NULL;  	/*  	 * NB: we could do this block-at-a-time, but it's nice @@ -677,6 +801,9 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)  	if (err)  		return (err); +	if (uio->uio_extflg == UIO_XUIO) +		xuio = (xuio_t *)uio; +  	for (i = 0; i < numbufs; i++) {  		int tocpy;  		int bufoff; @@ -687,8 +814,24 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)  		bufoff = uio->uio_loffset - db->db_offset;  		tocpy = (int)MIN(db->db_size - bufoff, size); -		err = uiomove((char *)db->db_data + bufoff, tocpy, -		    UIO_READ, uio); +		if (xuio) { +			dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; +			arc_buf_t *dbuf_abuf = dbi->db_buf; +			arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); +			err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); +			if (!err) { +				uio->uio_resid -= tocpy; +				uio->uio_loffset += tocpy; +			} + +			if (abuf == dbuf_abuf) +				XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); +			else +				XUIOSTAT_BUMP(xuiostat_rbuf_copied); +		} else { +			err = uiomove((char *)db->db_data + bufoff, tocpy, +			    UIO_READ, uio); +		}  		if (err)  			break; @@ -857,6 +1000,7 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,  		dmu_write(dn->dn_objset, dn->dn_object, offset, blksz,  		    buf->b_data, tx);  		dmu_return_arcbuf(buf); +		XUIOSTAT_BUMP(xuiostat_wbuf_copied);  	}  } @@ -1369,6 +1513,7 @@ dmu_init(void)  	zfetch_init();  	arc_init();  	l2arc_init(); +	xuio_stat_init();  }  void @@ -1379,4 +1524,5 @@ dmu_fini(void)  	dnode_fini();  	dbuf_fini();  	l2arc_fini(); +	xuio_stat_fini();  } diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h index a4f4964e11..c528fac1a6 100644 --- a/usr/src/uts/common/fs/zfs/sys/arc.h +++ b/usr/src/uts/common/fs/zfs/sys/arc.h @@ -19,7 +19,7 @@   * CDDL HEADER END   */  /* - * Copyright 2009 Sun Microsystems, Inc.  All rights reserved. + * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */ @@ -87,6 +87,7 @@ arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,      arc_buf_contents_t type);  arc_buf_t *arc_loan_buf(spa_t *spa, int size);  void arc_return_buf(arc_buf_t *buf, void *tag); +void arc_loan_inuse_buf(arc_buf_t *buf, void *tag);  void arc_buf_add_ref(arc_buf_t *buf, void *tag);  int arc_buf_remove_ref(arc_buf_t *buf, void *tag);  int arc_buf_size(arc_buf_t *buf); diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h index 6e2a66a2fa..d99ade07f8 100644 --- a/usr/src/uts/common/fs/zfs/sys/dbuf.h +++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h @@ -19,7 +19,7 @@   * CDDL HEADER END   */  /* - * Copyright 2009 Sun Microsystems, Inc.  All rights reserved. + * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */ @@ -267,6 +267,7 @@ void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);  void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);  void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);  dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);  void dbuf_clear(dmu_buf_impl_t *db);  void dbuf_evict(dmu_buf_impl_t *db); diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h index e229ca3bd8..b41bc96c38 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h @@ -19,7 +19,7 @@   * CDDL HEADER END   */  /* - * Copyright 2009 Sun Microsystems, Inc.  All rights reserved. + * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */ @@ -45,6 +45,7 @@ extern "C" {  #endif  struct uio; +struct xuio;  struct page;  struct vnode;  struct spa; @@ -500,6 +501,15 @@ struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);  void dmu_return_arcbuf(struct arc_buf *buf);  void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,      dmu_tx_t *tx); +int dmu_xuio_init(struct xuio *uio, int niov); +void dmu_xuio_fini(struct xuio *uio); +int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off, +    size_t n); +int dmu_xuio_cnt(struct xuio *uio); +struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i); +void dmu_xuio_clear(struct xuio *uio, int i); +void xuio_stat_wbuf_copied(); +void xuio_stat_wbuf_nocopy();  extern int zfs_prefetch_disable; diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h index 1e16da6b97..22f9f5f8c8 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h @@ -19,7 +19,7 @@   * CDDL HEADER END   */  /* - * Copyright 2009 Sun Microsystems, Inc.  All rights reserved. + * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */ @@ -232,6 +232,39 @@ extern "C" {  struct objset;  struct dmu_pool; +typedef struct dmu_xuio { +	int next; +	int cnt; +	struct arc_buf **bufs; +	iovec_t *iovp; +} dmu_xuio_t; + +typedef struct xuio_stats { +	/* loaned yet not returned arc_buf */ +	kstat_named_t xuiostat_onloan_rbuf; +	kstat_named_t xuiostat_onloan_wbuf; +	/* whether a copy is made when loaning out a read buffer */ +	kstat_named_t xuiostat_rbuf_copied; +	kstat_named_t xuiostat_rbuf_nocopy; +	/* whether a copy is made when assigning a write buffer */ +	kstat_named_t xuiostat_wbuf_copied; +	kstat_named_t xuiostat_wbuf_nocopy; +} xuio_stats_t; + +static xuio_stats_t xuio_stats = { +	{ "onloan_read_buf",	KSTAT_DATA_UINT64 }, +	{ "onloan_write_buf",	KSTAT_DATA_UINT64 }, +	{ "read_buf_copied",	KSTAT_DATA_UINT64 }, +	{ "read_buf_nocopy",	KSTAT_DATA_UINT64 }, +	{ "write_buf_copied",	KSTAT_DATA_UINT64 }, +	{ "write_buf_nocopy",	KSTAT_DATA_UINT64 } +}; + +#define	XUIOSTAT_INCR(stat, val)	\ +	atomic_add_64(&xuio_stats.stat.value.ui64, (val)) +#define	XUIOSTAT_BUMP(stat)	XUIOSTAT_INCR(stat, 1) + +  #ifdef	__cplusplus  }  #endif diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c index 0a262cbe21..6759a812ed 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c @@ -19,7 +19,7 @@   * CDDL HEADER END   */  /* - * Copyright 2009 Sun Microsystems, Inc.  All rights reserved. + * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */ @@ -1115,6 +1115,7 @@ zfs_domount(vfs_t *vfsp, char *osname)  		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);  		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);  	} +	vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);  	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {  		uint64_t pval; diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c index 9d9fe50aa9..d59c7625ec 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vnops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c @@ -19,7 +19,7 @@   * CDDL HEADER END   */  /* - * Copyright 2009 Sun Microsystems, Inc.  All rights reserved. + * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */ @@ -447,6 +447,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)  	ssize_t		n, nbytes;  	int		error;  	rl_t		*rl; +	xuio_t		*xuio = NULL;  	ZFS_ENTER(zfsvfs);  	ZFS_VERIFY_ZP(zp); @@ -507,6 +508,35 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)  	ASSERT(uio->uio_loffset < zp->z_phys->zp_size);  	n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset); +	if ((uio->uio_extflg == UIO_XUIO) && +	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { +		int nblk; +		int blksz = zp->z_blksz; +		uint64_t offset = uio->uio_loffset; + +		xuio = (xuio_t *)uio; +		if ((ISP2(blksz))) { +			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, +			    blksz)) / blksz; +		} else { +			ASSERT(offset + n <= blksz); +			nblk = 1; +		} +		dmu_xuio_init(xuio, nblk); + +		if (vn_has_cached_data(vp)) { +			/* +			 * For simplicity, we always allocate a full buffer +			 * even if we only expect to read a portion of a block. +			 */ +			while (--nblk >= 0) { +				dmu_xuio_add(xuio, +				    dmu_request_arcbuf(zp->z_dbuf, blksz), +				    0, blksz); +			} +		} +	} +  	while (n > 0) {  		nbytes = MIN(n, zfs_read_chunk_size -  		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); @@ -524,7 +554,6 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)  		n -= nbytes;  	} -  out:  	zfs_range_unlock(rl); @@ -570,6 +599,12 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)  	uint64_t	pflags;  	int		error;  	arc_buf_t	*abuf; +	iovec_t		*aiov; +	xuio_t		*xuio = NULL; +	int		i_iov = 0; +	int		iovcnt = uio->uio_iovcnt; +	iovec_t		*iovp = uio->uio_iov; +	int		write_eof;  	/*  	 * Fasttrack empty write @@ -619,8 +654,13 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)  	/*  	 * Pre-fault the pages to ensure slow (eg NFS) pages  	 * don't hold up txg. +	 * Skip this if uio contains loaned arc_buf.  	 */ -	uio_prefaultpages(n, uio); +	if ((uio->uio_extflg == UIO_XUIO) && +	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) +		xuio = (xuio_t *)uio; +	else +		uio_prefaultpages(n, uio);  	/*  	 * If in append mode, set the io offset pointer to eof. @@ -659,6 +699,9 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)  	if ((woff + n) > limit || woff > (limit - n))  		n = limit - woff; +	/* Will this write extend the file length? */ +	write_eof = (woff + n > zp->z_phys->zp_size); +  	end_size = MAX(zp->z_phys->zp_size, woff + n);  	/* @@ -669,7 +712,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)  	while (n > 0) {  		abuf = NULL;  		woff = uio->uio_loffset; -  again:  		if (zfs_usergroup_overquota(zfsvfs,  		    B_FALSE, zp->z_phys->zp_uid) || @@ -681,16 +723,28 @@ again:  			break;  		} -		/* -		 * If dmu_assign_arcbuf() is expected to execute with minimum -		 * overhead loan an arc buffer and copy user data to it before -		 * we enter a txg.  This avoids holding a txg forever while we -		 * pagefault on a hanging NFS server mapping. -		 */ -		if (abuf == NULL && n >= max_blksz && +		if (xuio && abuf == NULL) { +			ASSERT(i_iov < iovcnt); +			aiov = &iovp[i_iov]; +			abuf = dmu_xuio_arcbuf(xuio, i_iov); +			dmu_xuio_clear(xuio, i_iov); +			DTRACE_PROBE3(zfs_cp_write, int, i_iov, +			    iovec_t *, aiov, arc_buf_t *, abuf); +			ASSERT((aiov->iov_base == abuf->b_data) || +			    ((char *)aiov->iov_base - (char *)abuf->b_data + +			    aiov->iov_len == arc_buf_size(abuf))); +			i_iov++; +		} else if (abuf == NULL && n >= max_blksz &&  		    woff >= zp->z_phys->zp_size &&  		    P2PHASE(woff, max_blksz) == 0 &&  		    zp->z_blksz == max_blksz) { +			/* +			 * This write covers a full block.  "Borrow" a buffer +			 * from the dmu so that we can fill it before we enter +			 * a transaction.  This avoids the possibility of +			 * holding up the transaction if the data copy hangs +			 * up on a pagefault (e.g., from an NFS server mapping). +			 */  			size_t cbytes;  			abuf = dmu_request_arcbuf(zp->z_dbuf, max_blksz); @@ -755,8 +809,24 @@ again:  			tx_bytes -= uio->uio_resid;  		} else {  			tx_bytes = nbytes; -			ASSERT(tx_bytes == max_blksz); -			dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx); +			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); +			/* +			 * If this is not a full block write, but we are +			 * extending the file past EOF and this data starts +			 * block-aligned, use assign_arcbuf().  Otherwise, +			 * write via dmu_write(). +			 */ +			if (tx_bytes < max_blksz && (!write_eof || +			    aiov->iov_base != abuf->b_data)) { +				ASSERT(xuio); +				dmu_write(zfsvfs->z_os, zp->z_id, woff, +				    aiov->iov_len, aiov->iov_base, tx); +				dmu_return_arcbuf(abuf); +				xuio_stat_wbuf_copied(); +			} else { +				ASSERT(xuio || tx_bytes == max_blksz); +				dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx); +			}  			ASSERT(tx_bytes <= uio->uio_resid);  			uioskip(uio, tx_bytes);  		} @@ -4571,6 +4641,160 @@ zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,  }  /* + * Tunable, both must be a power of 2. + * + * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf + * zcr_blksz_max: if set to less than the file block size, allow loaning out of + *                an arcbuf for a partial block read + */ +int zcr_blksz_min = (1 << 10);	/* 1K */ +int zcr_blksz_max = (1 << 17);	/* 128K */ + +/*ARGSUSED*/ +static int +zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr, +    caller_context_t *ct) +{ +	znode_t	*zp = VTOZ(vp); +	zfsvfs_t *zfsvfs = zp->z_zfsvfs; +	int max_blksz = zfsvfs->z_max_blksz; +	uio_t *uio = &xuio->xu_uio; +	ssize_t size = uio->uio_resid; +	offset_t offset = uio->uio_loffset; +	int blksz; +	int fullblk, i; +	arc_buf_t *abuf; +	ssize_t maxsize; +	int preamble, postamble; + +	if (xuio->xu_type != UIOTYPE_ZEROCOPY) +		return (EINVAL); + +	ZFS_ENTER(zfsvfs); +	ZFS_VERIFY_ZP(zp); +	switch (ioflag) { +	case UIO_WRITE: +		/* +		 * Loan out an arc_buf for write if write size is bigger than +		 * max_blksz, and the file's block size is also max_blksz. +		 */ +		blksz = max_blksz; +		if (size < blksz || zp->z_blksz != blksz) { +			ZFS_EXIT(zfsvfs); +			return (EINVAL); +		} +		/* +		 * Caller requests buffers for write before knowing where the +		 * write offset might be (e.g. NFS TCP write). +		 */ +		if (offset == -1) { +			preamble = 0; +		} else { +			preamble = P2PHASE(offset, blksz); +			if (preamble) { +				preamble = blksz - preamble; +				size -= preamble; +			} +		} + +		postamble = P2PHASE(size, blksz); +		size -= postamble; + +		fullblk = size / blksz; +		dmu_xuio_init(xuio, +		    (preamble != 0) + fullblk + (postamble != 0)); +		DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble, +		    int, postamble, int, +		    (preamble != 0) + fullblk + (postamble != 0)); + +		/* +		 * Have to fix iov base/len for partial buffers.  They +		 * currently represent full arc_buf's. +		 */ +		if (preamble) { +			/* data begins in the middle of the arc_buf */ +			abuf = dmu_request_arcbuf(zp->z_dbuf, blksz); +			ASSERT(abuf); +			dmu_xuio_add(xuio, abuf, blksz - preamble, preamble); +		} + +		for (i = 0; i < fullblk; i++) { +			abuf = dmu_request_arcbuf(zp->z_dbuf, blksz); +			ASSERT(abuf); +			dmu_xuio_add(xuio, abuf, 0, blksz); +		} + +		if (postamble) { +			/* data ends in the middle of the arc_buf */ +			abuf = dmu_request_arcbuf(zp->z_dbuf, blksz); +			ASSERT(abuf); +			dmu_xuio_add(xuio, abuf, 0, postamble); +		} +		break; +	case UIO_READ: +		/* +		 * Loan out an arc_buf for read if the read size is larger than +		 * the current file block size.  Block alignment is not +		 * considered.  Partial arc_buf will be loaned out for read. +		 */ +		blksz = zp->z_blksz; +		if (blksz < zcr_blksz_min) +			blksz = zcr_blksz_min; +		if (blksz > zcr_blksz_max) +			blksz = zcr_blksz_max; +		/* avoid potential complexity of dealing with it */ +		if (blksz > max_blksz) { +			ZFS_EXIT(zfsvfs); +			return (EINVAL); +		} + +		maxsize = zp->z_phys->zp_size - uio->uio_loffset; +		if (size > maxsize) +			size = maxsize; + +		if (size < blksz || vn_has_cached_data(vp)) { +			ZFS_EXIT(zfsvfs); +			return (EINVAL); +		} +		break; +	default: +		ZFS_EXIT(zfsvfs); +		return (EINVAL); +	} + +	uio->uio_extflg = UIO_XUIO; +	XUIO_XUZC_RW(xuio) = ioflag; +	ZFS_EXIT(zfsvfs); +	return (0); +} + +/*ARGSUSED*/ +static int +zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct) +{ +	int i; +	arc_buf_t *abuf; +	int ioflag = XUIO_XUZC_RW(xuio); + +	ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY); + +	i = dmu_xuio_cnt(xuio); +	while (i-- > 0) { +		abuf = dmu_xuio_arcbuf(xuio, i); +		/* +		 * if abuf == NULL, it must be a write buffer +		 * that has been returned in zfs_write(). +		 */ +		if (abuf) +			dmu_return_arcbuf(abuf); +		ASSERT(abuf || ioflag == UIO_WRITE); +	} + +	dmu_xuio_fini(xuio); +	return (0); +} + +/*   * Predeclare these here so that the compiler assumes that   * this is an "old style" function declaration that does   * not include arguments => we won't get type mismatch errors @@ -4653,6 +4877,8 @@ const fs_operation_def_t zfs_fvnodeops_template[] = {  	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },  	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },  	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support }, +	VOPNAME_REQZCBUF, 	{ .vop_reqzcbuf = zfs_reqzcbuf }, +	VOPNAME_RETZCBUF, 	{ .vop_retzcbuf = zfs_retzcbuf },  	NULL,			NULL  }; diff --git a/usr/src/uts/common/nfs/nfs.h b/usr/src/uts/common/nfs/nfs.h index 77e5a397c2..1905e47c4f 100644 --- a/usr/src/uts/common/nfs/nfs.h +++ b/usr/src/uts/common/nfs/nfs.h @@ -19,7 +19,7 @@   * CDDL HEADER END   */  /* - * Copyright 2009 Sun Microsystems, Inc.  All rights reserved. + * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */ @@ -1447,6 +1447,7 @@ struct READ3resok {  #ifdef _KERNEL  	uint_t wlist_len;  	struct clist *wlist; +	frtn_t zcopy;  #endif  };  typedef struct READ3resok READ3resok; @@ -2322,6 +2323,24 @@ extern int do_xattr_exists_check(vnode_t *, ulong_t *, cred_t *);  extern ts_label_t	*nfs_getflabel(vnode_t *, struct exportinfo *);  extern boolean_t	do_rfs_label_check(bslabel_t *, vnode_t *, int,  			    struct exportinfo *); + +/* + * Copy Reduction support. + * xuio_t wrapper with additional private data. + */ + +typedef struct nfs_xuio { +	xuio_t nu_uio; +	vnode_t *nu_vp; +	uint_t nu_ref; +	frtn_t nu_frtn; +} nfs_xuio_t; + +xuio_t *rfs_setup_xuio(vnode_t *); +mblk_t *uio_to_mblk(uio_t *); +void rfs_rndup_mblks(mblk_t *, uint_t, int); +void rfs_free_xuio(void *); +  #endif	/* _KERNEL */  #ifdef	__cplusplus diff --git a/usr/src/uts/common/rpc/rpcmod.c b/usr/src/uts/common/rpc/rpcmod.c index cab50d67cd..891045d7f2 100644 --- a/usr/src/uts/common/rpc/rpcmod.c +++ b/usr/src/uts/common/rpc/rpcmod.c @@ -19,7 +19,7 @@   * CDDL HEADER END   */  /* - * Copyright 2009 Sun Microsystems, Inc.  All rights reserved. + * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */  /* Copyright (c) 1990 Mentat Inc. */ @@ -1059,8 +1059,6 @@ rpcmod_release(queue_t *q, mblk_t *bp)  #define	MIR_SVC_ORDREL_TIMEOUT	(10 * (60 * 1000L))	/* 10 minutes */  #define	MIR_LASTFRAG	0x80000000	/* Record marker */ -#define	DLEN(mp) (mp->b_cont ? msgdsize(mp) : (mp->b_wptr - mp->b_rptr)) -  #define	MIR_SVC_QUIESCED(mir)	\  	(mir->mir_ref_cnt == 0 && mir->mir_inrservice == 0) diff --git a/usr/src/uts/common/rpc/xdr.h b/usr/src/uts/common/rpc/xdr.h index 4ef63d6baf..3db775893c 100644 --- a/usr/src/uts/common/rpc/xdr.h +++ b/usr/src/uts/common/rpc/xdr.h @@ -18,7 +18,7 @@   *   * CDDL HEADER END   * - * Copyright 2009 Sun Microsystems, Inc.  All rights reserved. + * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */  /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -585,6 +585,8 @@ extern uint_t xdrrec_readbytes();  #endif  #else +#define	DLEN(mp) (mp->b_cont ? msgdsize(mp) : (mp->b_wptr - mp->b_rptr)) +  extern void	xdrmem_create(XDR *, caddr_t, uint_t, enum xdr_op);  extern void	xdrmblk_init(XDR *, mblk_t *, enum xdr_op, int);  extern bool_t	xdrmblk_getmblk(XDR *, mblk_t **, uint_t *); diff --git a/usr/src/uts/common/rpc/xdr_mblk.c b/usr/src/uts/common/rpc/xdr_mblk.c index 053edb7603..0b06b827e0 100644 --- a/usr/src/uts/common/rpc/xdr_mblk.c +++ b/usr/src/uts/common/rpc/xdr_mblk.c @@ -19,7 +19,7 @@   * CDDL HEADER END   */  /* - * Copyright 2008 Sun Microsystems, Inc.  All rights reserved. + * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */ @@ -361,20 +361,24 @@ xdrmblk_putbytes(XDR *xdrs, caddr_t addr, int len)   * not a multiple of BYTES_PER_XDR_UNIT, the caller has the option   * of making the data a BYTES_PER_XDR_UNIT multiple (b_wptr - b_rptr is   * a BYTES_PER_XDR_UNIT multiple), but in this case the caller has to ensure - * that the filler bytes are initialized to zero. Note: Doesn't to work for - * chained mblks. + * that the filler bytes are initialized to zero.   */  bool_t  xdrmblk_putmblk(XDR *xdrs, mblk_t *m, uint_t len)  {  	int32_t llen = (int32_t)len; -	if (((m->b_wptr - m->b_rptr) % BYTES_PER_XDR_UNIT) != 0) +	if ((DLEN(m) % BYTES_PER_XDR_UNIT) != 0)  		return (FALSE);  	if (!xdrmblk_putint32(xdrs, &llen))  		return (FALSE); +  	/* LINTED pointer alignment */  	((mblk_t *)xdrs->x_base)->b_cont = m; + +	/* base points to the last mblk */ +	while (m->b_cont) +		m = m->b_cont;  	xdrs->x_base = (caddr_t)m;  	xdrs->x_handy = 0;  	return (TRUE); diff --git a/usr/src/uts/common/sys/fem.h b/usr/src/uts/common/sys/fem.h index 84defb057c..9b3cd142e4 100644 --- a/usr/src/uts/common/sys/fem.h +++ b/usr/src/uts/common/sys/fem.h @@ -19,15 +19,13 @@   * CDDL HEADER END   */  /* - * Copyright 2008 Sun Microsystems, Inc.  All rights reserved. + * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */  #ifndef _SYS_FEM_H  #define	_SYS_FEM_H -#pragma ident	"%Z%%M%	%I%	%E% SMI" -  #include <sys/types.h>  #include <sys/mutex.h>  #include <sys/pathname.h> @@ -260,7 +258,13 @@ struct fem_head {  			struct shrlock *shr, int flag, cred_t *cr,	\  			caller_context_t *ct);				\  	int (*femop_vnevent)(femarg_t *vf, vnevent_t vnevent,		\ -			vnode_t *dvp, char *cname, caller_context_t *ct) +			vnode_t *dvp, char *cname, 			\ +			caller_context_t *ct);				\ +	int (*femop_reqzcbuf)(femarg_t *vf, enum uio_rw ioflag,		\ +			xuio_t *xuio, cred_t *cr,			\ +			caller_context_t *ct);				\ +	int (*femop_retzcbuf)(femarg_t *vf, xuio_t *xuio, cred_t *cr,	\ +			caller_context_t *ct)  	/* NB: No ";" */  struct fem { @@ -392,6 +396,10 @@ extern int vnext_shrlock(femarg_t *vf, int cmd, struct shrlock *shr,  			int flag, cred_t *cr, caller_context_t *ct);  extern int vnext_vnevent(femarg_t *vf, vnevent_t vevent, vnode_t *dvp,  			char *cname, caller_context_t *ct); +extern int vnext_reqzcbuf(femarg_t *vf, enum uio_rw ioflag, xuio_t *xuiop, +			cred_t *cr, caller_context_t *ct); +extern int vnext_retzcbuf(femarg_t *vf, xuio_t *xuiop, cred_t *cr, +			caller_context_t *ct);  extern int vfsnext_mount(fsemarg_t *vf, vnode_t *mvp, struct mounta *uap,  			cred_t *cr); diff --git a/usr/src/uts/common/sys/uio.h b/usr/src/uts/common/sys/uio.h index 248443f9a5..7255a2fa67 100644 --- a/usr/src/uts/common/sys/uio.h +++ b/usr/src/uts/common/sys/uio.h @@ -19,7 +19,7 @@   * CDDL HEADER END   */  /* - * Copyright 2009 Sun Microsystems, Inc.  All rights reserved. + * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */ @@ -133,6 +133,49 @@ typedef struct uioa_s {  	uioa_page_t	uioa_locked[UIOA_IOV_MAX]; /* Per iov locked pages */  } uioa_t; +/* + * uio extensions + * + * PSARC 2009/478: Copy Reduction Interfaces + */ +typedef enum xuio_type { +	UIOTYPE_ASYNCIO, +	UIOTYPE_ZEROCOPY +} xuio_type_t; + +typedef struct xuio { +	uio_t xu_uio;		/* Embedded UIO structure */ + +	/* Extended uio fields */ +	enum xuio_type xu_type;	/* What kind of uio structure? */ +	union { +		/* Async I/O Support, intend to replace uioa_t. */ +		struct { +			uint32_t xu_a_state;	/* state of async i/o */ +			/* bytes that have been uioamove()ed */ +			ssize_t xu_a_mbytes; +			uioa_page_t *xu_a_lcur;	/* pointer into uioa_locked[] */ +			/* pointer into lcur->uioa_ppp[] */ +			void **xu_a_lppp; +			void *xu_a_hwst[4];	/* opaque hardware state */ +			/* Per iov locked pages */ +			uioa_page_t xu_a_locked[UIOA_IOV_MAX]; +		} xu_aio; + +		/* +		 * Copy Reduction Support -- facilate loaning / returning of +		 * filesystem cache buffers. +		 */ +		struct { +			int xu_zc_rw;	/* read or write buffer */ +			void *xu_zc_priv;	/* fs specific */ +		} xu_zc; +	} xu_ext; +} xuio_t; + +#define	XUIO_XUZC_PRIV(xuio)    xuio->xu_ext.xu_zc.xu_zc_priv +#define	XUIO_XUZC_RW(xuio)	xuio->xu_ext.xu_zc.xu_zc_rw +  #define	UIOA_ALLOC	0x0001		/* allocated but not yet initialized */  #define	UIOA_INIT	0x0002		/* initialized but not yet enabled */  #define	UIOA_ENABLED	0x0004		/* enabled, asynch i/o active */ @@ -177,6 +220,7 @@ typedef enum uio_rw { UIO_READ, UIO_WRITE } uio_rw_t;  #define	UIO_COPY_CACHED		0x0001	/* copy should not bypass caches */  #define	UIO_ASYNC		0x0002	/* uio_t is really a uioa_t */ +#define	UIO_XUIO		0x0004	/* Structure is xuio_t */  /*   * Global uioasync capability shadow state. diff --git a/usr/src/uts/common/sys/vfs.h b/usr/src/uts/common/sys/vfs.h index 2e9679cf97..bae4e5b87f 100644 --- a/usr/src/uts/common/sys/vfs.h +++ b/usr/src/uts/common/sys/vfs.h @@ -19,7 +19,7 @@   * CDDL HEADER END   */  /* - * Copyright 2009 Sun Microsystems, Inc.  All rights reserved. + * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */ @@ -302,7 +302,8 @@ typedef	uint64_t	vfs_feature_t;  #define	VFSFT_SYSATTR_VIEWS	0x100000040	/* Supports sysattr view i/f */  #define	VFSFT_ACCESS_FILTER	0x100000080	/* dirents filtered by access */  #define	VFSFT_REPARSE		0x100000100	/* Supports reparse point */ - +#define	VFSFT_ZEROCOPY_SUPPORTED	0x100000200 +				/* Support loaning /returning cache buffer */  /*   * Argument structure for mount(2).   * diff --git a/usr/src/uts/common/sys/vnode.h b/usr/src/uts/common/sys/vnode.h index 97504aabf3..8b75225a64 100644 --- a/usr/src/uts/common/sys/vnode.h +++ b/usr/src/uts/common/sys/vnode.h @@ -19,7 +19,7 @@   * CDDL HEADER END   */  /* - * Copyright 2009 Sun Microsystems, Inc.  All rights reserved. + * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.   * Use is subject to license terms.   */ @@ -118,6 +118,8 @@ typedef struct vopstats {  	kstat_named_t	ngetsecattr;	/* VOP_GETSECATTR */  	kstat_named_t	nshrlock;	/* VOP_SHRLOCK */  	kstat_named_t	nvnevent;	/* VOP_VNEVENT */ +	kstat_named_t	nreqzcbuf;	/* VOP_REQZCBUF */ +	kstat_named_t	nretzcbuf;	/* VOP_RETZCBUF */  } vopstats_t;  /* @@ -900,7 +902,11 @@ struct taskq;  	int	(*vop_shrlock)(vnode_t *, int, struct shrlock *,	\  				int, cred_t *, caller_context_t *);	\  	int	(*vop_vnevent)(vnode_t *, vnevent_t, vnode_t *,		\ -				char *, caller_context_t *) +				char *, caller_context_t *);		\ +	int	(*vop_reqzcbuf)(vnode_t *, enum uio_rw, xuio_t *,	\ +				cred_t *, caller_context_t *);		\ +	int	(*vop_retzcbuf)(vnode_t *, xuio_t *, cred_t *,		\ +				caller_context_t *)  	/* NB: No ";" */  /* @@ -997,6 +1003,9 @@ extern int	fop_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,  				caller_context_t *);  extern int	fop_vnevent(vnode_t *, vnevent_t, vnode_t *, char *,  				caller_context_t *); +extern int	fop_reqzcbuf(vnode_t *, enum uio_rw, xuio_t *, cred_t *, +				caller_context_t *); +extern int	fop_retzcbuf(vnode_t *, xuio_t *, cred_t *, caller_context_t *);  #endif	/* _KERNEL */ @@ -1088,6 +1097,10 @@ extern int	fop_vnevent(vnode_t *, vnevent_t, vnode_t *, char *,  	fop_shrlock(vp, cmd, shr, f, cr, ct)  #define	VOP_VNEVENT(vp, vnevent, dvp, fnm, ct) \  	fop_vnevent(vp, vnevent, dvp, fnm, ct) +#define	VOP_REQZCBUF(vp, rwflag, xuiop, cr, ct) \ +	fop_reqzcbuf(vp, rwflag, xuiop, cr, ct) +#define	VOP_RETZCBUF(vp, xuiop, cr, ct) \ +	fop_retzcbuf(vp, xuiop, cr, ct)  #define	VOPNAME_OPEN		"open"  #define	VOPNAME_CLOSE		"close" @@ -1133,6 +1146,8 @@ extern int	fop_vnevent(vnode_t *, vnevent_t, vnode_t *, char *,  #define	VOPNAME_SETSECATTR	"setsecattr"  #define	VOPNAME_SHRLOCK		"shrlock"  #define	VOPNAME_VNEVENT		"vnevent" +#define	VOPNAME_REQZCBUF	"reqzcbuf" +#define	VOPNAME_RETZCBUF	"retzcbuf"  /*   * Flags for VOP_LOOKUP | 
