diff options
| author | chunli zhang - Sun Microsystems - Irvine United States <Chunli.Zhang@Sun.COM> | 2010-01-18 10:34:16 -0800 |
|---|---|---|
| committer | chunli zhang - Sun Microsystems - Irvine United States <Chunli.Zhang@Sun.COM> | 2010-01-18 10:34:16 -0800 |
| commit | c242f9a02a2ef021449275ae0a1d2581ee77231d (patch) | |
| tree | 6d298bebb8ff9febd9acf936d402f67a6d67d358 | |
| parent | bce54adf407df0723facaef4e2147ed69b922786 (diff) | |
| download | illumos-joyent-c242f9a02a2ef021449275ae0a1d2581ee77231d.tar.gz | |
6873106 Need a mechanism to share buffers between fs modules
27 files changed, 990 insertions, 84 deletions
diff --git a/usr/src/cmd/stat/fsstat/fsstat.c b/usr/src/cmd/stat/fsstat/fsstat.c index 1869ff4fe6..31b2f5e054 100644 --- a/usr/src/cmd/stat/fsstat/fsstat.c +++ b/usr/src/cmd/stat/fsstat/fsstat.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -428,6 +428,8 @@ vop_display(char *name, vopstats_t *oldvsp, vopstats_t *newvsp, int dispflag) PRINT_VOPSTAT(niceflag, setsecattr); PRINT_VOPSTAT(niceflag, shrlock); PRINT_VOPSTAT(niceflag, vnevent); + PRINT_VOPSTAT(niceflag, reqzcbuf); + PRINT_VOPSTAT(niceflag, retzcbuf); if (niceflag) { /* Make it easier on the eyes */ diff --git a/usr/src/uts/common/fs/fem.c b/usr/src/uts/common/fs/fem.c index b7d1413bd7..95248d1077 100644 --- a/usr/src/uts/common/fs/fem.c +++ b/usr/src/uts/common/fs/fem.c @@ -19,10 +19,9 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" #include <sys/types.h> #include <sys/atomic.h> @@ -124,6 +123,8 @@ static fs_operation_trans_def_t fem_opdef[] = { _FEMOPDEF(GETSECATTR, getsecattr), _FEMOPDEF(SHRLOCK, shrlock), _FEMOPDEF(VNEVENT, vnevent), + _FEMOPDEF(REQZCBUF, reqzcbuf), + _FEMOPDEF(RETZCBUF, retzcbuf), { NULL, 0, NULL, NULL } }; @@ -176,6 +177,8 @@ static struct fs_operation_def fem_guard_ops[] = { _FEMGUARD(GETSECATTR, getsecattr), _FEMGUARD(SHRLOCK, shrlock), _FEMGUARD(VNEVENT, vnevent), + _FEMGUARD(REQZCBUF, reqzcbuf), + _FEMGUARD(RETZCBUF, retzcbuf), { NULL, NULL } }; @@ -1645,6 +1648,61 @@ vhead_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *cname, } static int +vhead_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuiop, cred_t *cr, + caller_context_t *ct) +{ + femarg_t farg; + struct fem_list *femsp; + int (*func)(); + void *arg0; + int errc; + + if ((femsp = fem_lock(vp->v_femhead)) == NULL) { + func = (int (*)()) (vp->v_op->vop_reqzcbuf); + arg0 = vp; + fem_unlock(vp->v_femhead); + errc = (*func)(arg0, ioflag, xuiop, cr, ct); + } else { + fem_addref(femsp); + fem_unlock(vp->v_femhead); + farg.fa_vnode.vp = vp; + farg.fa_fnode = femsp->feml_nodes + femsp->feml_tos; + vsop_find(&farg, &func, int, &arg0, vop_reqzcbuf, + femop_reqzcbuf); + errc = (*func)(arg0, ioflag, xuiop, cr, ct); + fem_release(femsp); + } + return (errc); +} + +static int +vhead_retzcbuf(vnode_t *vp, xuio_t *xuiop, cred_t *cr, caller_context_t *ct) +{ + femarg_t farg; + struct fem_list *femsp; + int (*func)(); + void *arg0; + int errc; + + if ((femsp = fem_lock(vp->v_femhead)) == NULL) { + func = (int (*)()) (vp->v_op->vop_retzcbuf); + arg0 = vp; + fem_unlock(vp->v_femhead); + errc = (*func)(arg0, xuiop, cr, ct); + } else { + fem_addref(femsp); + fem_unlock(vp->v_femhead); + farg.fa_vnode.vp = vp; + farg.fa_fnode = femsp->feml_nodes + femsp->feml_tos; + vsop_find(&farg, &func, int, &arg0, vop_retzcbuf, + femop_retzcbuf); + errc = (*func)(arg0, xuiop, cr, ct); + 
fem_release(femsp); + } + return (errc); +} + +static int fshead_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) { fsemarg_t farg; @@ -1942,6 +2000,8 @@ static struct fs_operation_def fhead_vn_spec[] = { { VOPNAME_GETSECATTR, (femop_t *)vhead_getsecattr }, { VOPNAME_SHRLOCK, (femop_t *)vhead_shrlock }, { VOPNAME_VNEVENT, (femop_t *)vhead_vnevent }, + { VOPNAME_REQZCBUF, (femop_t *)vhead_reqzcbuf }, + { VOPNAME_RETZCBUF, (femop_t *)vhead_retzcbuf }, { NULL, NULL } }; @@ -2642,6 +2702,35 @@ vnext_vnevent(femarg_t *vf, vnevent_t vnevent, vnode_t *dvp, char *cname, } int +vnext_reqzcbuf(femarg_t *vf, enum uio_rw ioflag, xuio_t *xuiop, cred_t *cr, + caller_context_t *ct) +{ + int (*func)() = NULL; + void *arg0 = NULL; + + ASSERT(vf != NULL); + vf->fa_fnode--; + vsop_find(vf, &func, int, &arg0, vop_reqzcbuf, femop_reqzcbuf); + ASSERT(func != NULL); + ASSERT(arg0 != NULL); + return ((*func)(arg0, ioflag, xuiop, cr, ct)); +} + +int +vnext_retzcbuf(femarg_t *vf, xuio_t *xuiop, cred_t *cr, caller_context_t *ct) +{ + int (*func)() = NULL; + void *arg0 = NULL; + + ASSERT(vf != NULL); + vf->fa_fnode--; + vsop_find(vf, &func, int, &arg0, vop_retzcbuf, femop_retzcbuf); + ASSERT(func != NULL); + ASSERT(arg0 != NULL); + return ((*func)(arg0, xuiop, cr, ct)); +} + +int vfsnext_mount(fsemarg_t *vf, vnode_t *mvp, struct mounta *uap, cred_t *cr) { int (*func)() = NULL; diff --git a/usr/src/uts/common/fs/nfs/nfs3_srv.c b/usr/src/uts/common/fs/nfs/nfs3_srv.c index 71ebdb2d74..b8e63c183d 100644 --- a/usr/src/uts/common/fs/nfs/nfs3_srv.c +++ b/usr/src/uts/common/fs/nfs/nfs3_srv.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -87,6 +87,8 @@ static void vattr_to_pre_op_attr(struct vattr *, pre_op_attr *); static void vattr_to_wcc_data(struct vattr *, struct vattr *, wcc_data *); static int rdma_setup_read_data3(READ3args *, READ3resok *); +extern int nfs_loaned_buffers; + u_longlong_t nfs3_srv_caller_id; /* ARGSUSED */ @@ -994,6 +996,9 @@ rfs3_read(READ3args *args, READ3res *resp, struct exportinfo *exi, int in_crit = 0; int need_rwunlock = 0; caller_context_t ct; + int rdma_used = 0; + int loaned_buffers; + struct uio *uiop; vap = NULL; @@ -1007,6 +1012,12 @@ rfs3_read(READ3args *args, READ3res *resp, struct exportinfo *exi, goto out; } + if (args->wlist) + rdma_used = 1; + + /* use loaned buffers for TCP */ + loaned_buffers = (nfs_loaned_buffers && !rdma_used) ? 1 : 0; + if (is_system_labeled()) { bslabel_t *clabel = req->rq_label; @@ -1136,12 +1147,38 @@ rfs3_read(READ3args *args, READ3res *resp, struct exportinfo *exi, if (args->count > rfs3_tsize(req)) args->count = rfs3_tsize(req); + if (loaned_buffers) { + uiop = (uio_t *)rfs_setup_xuio(vp); + ASSERT(uiop != NULL); + uiop->uio_segflg = UIO_SYSSPACE; + uiop->uio_loffset = args->offset; + uiop->uio_resid = args->count; + + /* Jump to do the read if successful */ + if (VOP_REQZCBUF(vp, UIO_READ, (xuio_t *)uiop, cr, &ct) == 0) { + /* + * Need to hold the vnode until after VOP_RETZCBUF() + * is called. + */ + VN_HOLD(vp); + goto doio_read; + } + + DTRACE_PROBE2(nfss__i__reqzcbuf_failed, int, + uiop->uio_loffset, int, uiop->uio_resid); + + uiop->uio_extflg = 0; + /* failure to setup for zero copy */ + rfs_free_xuio((void *)uiop); + loaned_buffers = 0; + } + /* * If returning data via RDMA Write, then grab the chunk list. * If we aren't returning READ data w/RDMA_WRITE, then grab * a mblk. 
*/ - if (args->wlist) { + if (rdma_used) { mp = NULL; (void) rdma_get_wchunk(req, &iov, args->wlist); } else { @@ -1167,11 +1204,14 @@ rfs3_read(READ3args *args, READ3res *resp, struct exportinfo *exi, uio.uio_extflg = UIO_COPY_CACHED; uio.uio_loffset = args->offset; uio.uio_resid = args->count; + uiop = &uio; - error = VOP_READ(vp, &uio, 0, cr, &ct); +doio_read: + error = VOP_READ(vp, uiop, 0, cr, &ct); if (error) { - freeb(mp); + if (mp) + freemsg(mp); /* check if a monitor detected a delegation conflict */ if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) { resp->status = NFS3ERR_JUKEBOX; @@ -1180,6 +1220,12 @@ rfs3_read(READ3args *args, READ3res *resp, struct exportinfo *exi, goto out; } + /* make mblk using zc buffers */ + if (loaned_buffers) { + mp = uio_to_mblk(uiop); + ASSERT(mp != NULL); + } + va.va_mask = AT_ALL; error = VOP_GETATTR(vp, &va, 0, cr, &ct); @@ -1205,16 +1251,20 @@ rfs3_read(READ3args *args, READ3res *resp, struct exportinfo *exi, resp->status = NFS3_OK; vattr_to_post_op_attr(vap, &resp->resok.file_attributes); - resp->resok.count = args->count - uio.uio_resid; + resp->resok.count = args->count - uiop->uio_resid; if (!error && offset + resp->resok.count == va.va_size) resp->resok.eof = TRUE; else resp->resok.eof = FALSE; resp->resok.data.data_len = resp->resok.count; + + if (mp) + rfs_rndup_mblks(mp, resp->resok.count, loaned_buffers); + resp->resok.data.mp = mp; resp->resok.size = (uint_t)args->count; - if (args->wlist) { + if (rdma_used) { resp->resok.data.data_val = (caddr_t)iov.iov_base; if (!rdma_setup_read_data3(args, &(resp->resok))) { resp->status = NFS3ERR_INVAL; @@ -1260,7 +1310,7 @@ rfs3_read_free(READ3res *resp) if (resp->status == NFS3_OK) { mp = resp->resok.data.mp; if (mp != NULL) - freeb(mp); + freemsg(mp); } } diff --git a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c index 501c2dbd9e..2111e9fabf 100644 --- a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c +++ 
b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c @@ -1003,7 +1003,7 @@ errout: static int nfs3_dynamic = 0; /* global variable to enable dynamic retrans. */ static ushort_t nfs3_max_threads = 8; /* max number of active async threads */ -static uint_t nfs3_bsize = 32 * 1024; /* client `block' size */ +uint_t nfs3_bsize = 32 * 1024; /* client `block' size */ static uint_t nfs3_async_clusters = 1; /* # of reqs from each async queue */ static uint_t nfs3_cots_timeo = NFS_COTS_TIMEO; diff --git a/usr/src/uts/common/fs/nfs/nfs3_xdr.c b/usr/src/uts/common/fs/nfs/nfs3_xdr.c index e8fd857848..cdec8ffc96 100644 --- a/usr/src/uts/common/fs/nfs/nfs3_xdr.c +++ b/usr/src/uts/common/fs/nfs/nfs3_xdr.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -1320,16 +1320,9 @@ xdr_READ3res(XDR *xdrs, READ3res *objp) } if (xdrs->x_op == XDR_ENCODE) { - int i, rndup; mp = resokp->data.mp; if (mp != NULL && xdrs->x_ops == &xdrmblk_ops) { - mp->b_wptr += resokp->count; - rndup = BYTES_PER_XDR_UNIT - - (resokp->data.data_len % BYTES_PER_XDR_UNIT); - if (rndup != BYTES_PER_XDR_UNIT) - for (i = 0; i < rndup; i++) - *mp->b_wptr++ = '\0'; if (xdrmblk_putmblk(xdrs, mp, resokp->count) == TRUE) { resokp->data.mp = NULL; return (TRUE); diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv.c b/usr/src/uts/common/fs/nfs/nfs4_srv.c index ac584c9d62..62474ee7f6 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_srv.c +++ b/usr/src/uts/common/fs/nfs/nfs4_srv.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -84,6 +84,8 @@ static int rfs4_maxlock_tries = RFS4_MAXLOCK_TRIES; #define RFS4_LOCK_DELAY 10 /* Milliseconds */ static clock_t rfs4_lock_delay = RFS4_LOCK_DELAY; extern struct svc_ops rdma_svc_ops; +extern int nfs_loaned_buffers; +/* End of Tunables */ static int rdma_setup_read_data4(READ4args *, READ4res *); @@ -3140,9 +3142,12 @@ rfs4_op_read(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, bool_t *deleg = &cs->deleg; nfsstat4 stat; int in_crit = 0; - mblk_t *mp; + mblk_t *mp = NULL; int alloc_err = 0; + int rdma_used = 0; + int loaned_buffers; caller_context_t ct; + struct uio *uiop; DTRACE_NFSV4_2(op__read__start, struct compound_state *, cs, READ4args, args); @@ -3183,6 +3188,12 @@ rfs4_op_read(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, goto out; } + if (args->wlist) + rdma_used = 1; + + /* use loaned buffers for TCP */ + loaned_buffers = (nfs_loaned_buffers && !rdma_used) ? 1 : 0; + va.va_mask = AT_MODE|AT_SIZE|AT_UID; verror = VOP_GETATTR(vp, &va, 0, cs->cr, &ct); @@ -3250,11 +3261,38 @@ rfs4_op_read(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, if (args->count > rfs4_tsize(req)) args->count = rfs4_tsize(req); + if (loaned_buffers) { + uiop = (uio_t *)rfs_setup_xuio(vp); + ASSERT(uiop != NULL); + uiop->uio_segflg = UIO_SYSSPACE; + uiop->uio_loffset = args->offset; + uiop->uio_resid = args->count; + + /* Jump to do the read if successful */ + if (!VOP_REQZCBUF(vp, UIO_READ, (xuio_t *)uiop, cs->cr, &ct)) { + /* + * Need to hold the vnode until after VOP_RETZCBUF() + * is called. + */ + VN_HOLD(vp); + goto doio_read; + } + + DTRACE_PROBE2(nfss__i__reqzcbuf_failed, int, + uiop->uio_loffset, int, uiop->uio_resid); + + uiop->uio_extflg = 0; + + /* failure to setup for zero copy */ + rfs_free_xuio((void *)uiop); + loaned_buffers = 0; + } + /* * If returning data via RDMA Write, then grab the chunk list. If we * aren't returning READ data w/RDMA_WRITE, then grab a mblk. 
*/ - if (args->wlist) { + if (rdma_used) { mp = NULL; (void) rdma_get_wchunk(req, &iov, args->wlist); } else { @@ -3287,27 +3325,38 @@ rfs4_op_read(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, uio.uio_extflg = UIO_COPY_CACHED; uio.uio_loffset = args->offset; uio.uio_resid = args->count; + uiop = &uio; - error = do_io(FREAD, vp, &uio, 0, cs->cr, &ct); +doio_read: + error = do_io(FREAD, vp, uiop, 0, cs->cr, &ct); va.va_mask = AT_SIZE; verror = VOP_GETATTR(vp, &va, 0, cs->cr, &ct); if (error) { - freeb(mp); + if (mp) + freemsg(mp); *cs->statusp = resp->status = puterrno4(error); goto out; } + /* make mblk using zc buffers */ + if (loaned_buffers) { + mp = uio_to_mblk(uiop); + ASSERT(mp != NULL); + } + *cs->statusp = resp->status = NFS4_OK; - ASSERT(uio.uio_resid >= 0); - resp->data_len = args->count - uio.uio_resid; + ASSERT(uiop->uio_resid >= 0); + resp->data_len = args->count - uiop->uio_resid; if (mp) { resp->data_val = (char *)mp->b_datap->db_base; + rfs_rndup_mblks(mp, resp->data_len, loaned_buffers); } else { resp->data_val = (caddr_t)iov.iov_base; } + resp->mblk = mp; if (!verror && offset + resp->data_len == va.va_size) @@ -3315,7 +3364,7 @@ rfs4_op_read(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, else resp->eof = FALSE; - if (args->wlist) { + if (rdma_used) { if (!rdma_setup_read_data4(args, resp)) { *cs->statusp = resp->status = NFS4ERR_INVAL; } @@ -3337,7 +3386,7 @@ rfs4_op_read_free(nfs_resop4 *resop) READ4res *resp = &resop->nfs_resop4_u.opread; if (resp->status == NFS4_OK && resp->mblk != NULL) { - freeb(resp->mblk); + freemsg(resp->mblk); resp->mblk = NULL; resp->data_val = NULL; resp->data_len = 0; diff --git a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c index d6ac9bf407..040fbed7bd 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c +++ b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c @@ -2159,7 +2159,7 @@ restore_svp(mntinfo4_t *mi, servinfo4_t *svp, servinfo4_t *origsvp) } static ushort_t 
nfs4_max_threads = 8; /* max number of active async threads */ -static uint_t nfs4_bsize = 32 * 1024; /* client `block' size */ +uint_t nfs4_bsize = 32 * 1024; /* client `block' size */ static uint_t nfs4_async_clusters = 1; /* # of reqs from each async queue */ static uint_t nfs4_cots_timeo = NFS_COTS_TIMEO; diff --git a/usr/src/uts/common/fs/nfs/nfs4_xdr.c b/usr/src/uts/common/fs/nfs/nfs4_xdr.c index e2e14cff8a..08e9546cf3 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_xdr.c +++ b/usr/src/uts/common/fs/nfs/nfs4_xdr.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -3350,7 +3350,6 @@ xdr_READ4args(XDR *xdrs, READ4args *objp) static bool_t xdr_READ4res(XDR *xdrs, READ4res *objp) { - int i, rndup; mblk_t *mp; if (xdrs->x_op == XDR_DECODE) @@ -3378,12 +3377,6 @@ xdr_READ4res(XDR *xdrs, READ4res *objp) mp = objp->mblk; if (mp != NULL && xdrs->x_ops == &xdrmblk_ops) { - mp->b_wptr += objp->data_len; - rndup = BYTES_PER_XDR_UNIT - - (objp->data_len % BYTES_PER_XDR_UNIT); - if (rndup != BYTES_PER_XDR_UNIT) - for (i = 0; i < rndup; i++) - *mp->b_wptr++ = '\0'; if (xdrmblk_putmblk(xdrs, mp, objp->data_len) == TRUE) { objp->mblk = NULL; return (TRUE); diff --git a/usr/src/uts/common/fs/nfs/nfs_server.c b/usr/src/uts/common/fs/nfs/nfs_server.c index dc7a23b583..2f6e2bc8be 100644 --- a/usr/src/uts/common/fs/nfs/nfs_server.c +++ b/usr/src/uts/common/fs/nfs/nfs_server.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -106,6 +106,9 @@ static struct modlinkage modlinkage = { char _depends_on[] = "misc/klmmod"; +kmem_cache_t *nfs_xuio_cache; +int nfs_loaned_buffers = 0; + int _init(void) { @@ -139,6 +142,11 @@ _init(void) /* setup DSS paths here; must be done before initial server startup */ rfs4_dss_paths = rfs4_dss_oldpaths = NULL; + /* initialize the copy reduction caches */ + + nfs_xuio_cache = kmem_cache_create("nfs_xuio_cache", + sizeof (nfs_xuio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + return (status); } @@ -3215,3 +3223,140 @@ do_rfs_label_check(bslabel_t *clabel, vnode_t *vp, int flag, label_rele(tslabel); return (result); } + +/* + * Callback function to return the loaned buffers. + * Calls VOP_RETZCBUF() only after all uio_iov[] + * buffers are returned. nu_ref maintains the count. + */ +void +rfs_free_xuio(void *free_arg) +{ + uint_t ref; + nfs_xuio_t *nfsuiop = (nfs_xuio_t *)free_arg; + + ref = atomic_dec_uint_nv(&nfsuiop->nu_ref); + + /* + * Call VOP_RETZCBUF() only when all the iov buffers + * are sent OTW. + */ + if (ref != 0) + return; + + if (((uio_t *)nfsuiop)->uio_extflg & UIO_XUIO) { + (void) VOP_RETZCBUF(nfsuiop->nu_vp, (xuio_t *)free_arg, NULL, + NULL); + VN_RELE(nfsuiop->nu_vp); + } + + kmem_cache_free(nfs_xuio_cache, free_arg); +} + +xuio_t * +rfs_setup_xuio(vnode_t *vp) +{ + nfs_xuio_t *nfsuiop; + + nfsuiop = kmem_cache_alloc(nfs_xuio_cache, KM_SLEEP); + + bzero(nfsuiop, sizeof (nfs_xuio_t)); + nfsuiop->nu_vp = vp; + + /* + * ref count set to 1. more may be added + * if multiple mblks refer to multiple iov's. + * This is done in uio_to_mblk(). 
+ */ + + nfsuiop->nu_ref = 1; + + nfsuiop->nu_frtn.free_func = rfs_free_xuio; + nfsuiop->nu_frtn.free_arg = (char *)nfsuiop; + + nfsuiop->nu_uio.xu_type = UIOTYPE_ZEROCOPY; + + return (&nfsuiop->nu_uio); +} + +mblk_t * +uio_to_mblk(uio_t *uiop) +{ + struct iovec *iovp; + int i; + mblk_t *mp, *mp1; + nfs_xuio_t *nfsuiop = (nfs_xuio_t *)uiop; + + if (uiop->uio_iovcnt == 0) + return (NULL); + + iovp = uiop->uio_iov; + mp = mp1 = esballoca((uchar_t *)iovp->iov_base, iovp->iov_len, + BPRI_MED, &nfsuiop->nu_frtn); + ASSERT(mp != NULL); + + mp->b_wptr += iovp->iov_len; + mp->b_datap->db_type = M_DATA; + + for (i = 1; i < uiop->uio_iovcnt; i++) { + iovp = (uiop->uio_iov + i); + + mp1->b_cont = esballoca( + (uchar_t *)iovp->iov_base, iovp->iov_len, BPRI_MED, + &nfsuiop->nu_frtn); + + mp1 = mp1->b_cont; + ASSERT(mp1 != NULL); + mp1->b_wptr += iovp->iov_len; + mp1->b_datap->db_type = M_DATA; + } + + nfsuiop->nu_ref = uiop->uio_iovcnt; + + return (mp); +} + +void +rfs_rndup_mblks(mblk_t *mp, uint_t len, int buf_loaned) +{ + int i, rndup; + int alloc_err = 0; + mblk_t *rmp; + + rndup = BYTES_PER_XDR_UNIT - (len % BYTES_PER_XDR_UNIT); + + /* single mblk_t non copy-reduction case */ + if (!buf_loaned) { + mp->b_wptr += len; + if (rndup != BYTES_PER_XDR_UNIT) { + for (i = 0; i < rndup; i++) + *mp->b_wptr++ = '\0'; + } + return; + } + + /* no need for extra rndup */ + if (rndup == BYTES_PER_XDR_UNIT) + return; + + while (mp->b_cont) + mp = mp->b_cont; + + /* + * In case of copy-reduction mblks, the size of the mblks + * are fixed and are of the size of the loaned buffers. + * Allocate a roundup mblk and chain it to the data + * buffers. This is sub-optimal, but not expected to + * happen in regular common workloads. 
+ */ + + rmp = allocb_wait(rndup, BPRI_MED, STR_NOSIG, &alloc_err); + ASSERT(rmp != NULL); + ASSERT(alloc_err == 0); + + for (i = 0; i < rndup; i++) + *rmp->b_wptr++ = '\0'; + + rmp->b_datap->db_type = M_DATA; + mp->b_cont = rmp; +} diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c index 2f7aa751ad..acdfdb36a1 100644 --- a/usr/src/uts/common/fs/vnode.c +++ b/usr/src/uts/common/fs/vnode.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -362,6 +362,12 @@ static const fs_operation_trans_def_t vn_ops_table[] = { (fs_generic_func_p) fs_vnevent_nosupport, (fs_generic_func_p) fs_vnevent_nosupport, + VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf), + fs_nosys, fs_nosys, + + VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf), + fs_nosys, fs_nosys, + NULL, 0, NULL, NULL }; @@ -522,6 +528,10 @@ create_vopstats_template() kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64); /* VOP_VNEVENT */ kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64); + /* VOP_REQZCBUF */ + kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64); + /* VOP_RETZCBUF */ + kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64); return (vsp); } @@ -4151,6 +4161,31 @@ fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm, return (err); } +int +fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr, + caller_context_t *ct) +{ + int err; + + if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0) + return (ENOTSUP); + err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct); + VOPSTATS_UPDATE(vp, reqzcbuf); + return (err); +} + +int +fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct) +{ + int err; + + if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0) + return (ENOTSUP); + err = 
(*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct); + VOPSTATS_UPDATE(vp, retzcbuf); + return (err); +} + /* * Default destructor * Needed because NULL destructor means that the key is unused diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 9c4fb291ca..8e03c48a23 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -1241,14 +1241,31 @@ arc_return_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(hdr->b_state == arc_anon); ASSERT(buf->b_data != NULL); - VERIFY(refcount_remove(&hdr->b_refcnt, arc_onloan_tag) == 0); - VERIFY(refcount_add(&hdr->b_refcnt, tag) == 1); + (void) refcount_add(&hdr->b_refcnt, tag); + (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); atomic_add_64(&arc_loaned_bytes, -hdr->b_size); } +/* Detach an arc_buf from a dbuf (tag) */ +void +arc_loan_inuse_buf(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr; + + rw_enter(&buf->b_lock, RW_WRITER); + ASSERT(buf->b_data != NULL); + hdr = buf->b_hdr; + (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); + (void) refcount_remove(&hdr->b_refcnt, tag); + buf->b_efunc = NULL; + buf->b_private = NULL; + + atomic_add_64(&arc_loaned_bytes, hdr->b_size); + rw_exit(&buf->b_lock); +} + static arc_buf_t * arc_buf_clone(arc_buf_t *from) { diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index ed2dc455de..b1f20af319 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -406,6 +406,29 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) } } +/* + * Loan out an arc_buf for read. Return the loaned arc_buf. + */ +arc_buf_t * +dbuf_loan_arcbuf(dmu_buf_impl_t *db) +{ + arc_buf_t *abuf; + + mutex_enter(&db->db_mtx); + if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { + int blksz = db->db.db_size; + mutex_exit(&db->db_mtx); + abuf = arc_loan_buf(db->db_dnode->dn_objset->os_spa, blksz); + bcopy(db->db.db_data, abuf->b_data, blksz); + } else { + abuf = db->db_buf; + arc_loan_inuse_buf(abuf, db); + dbuf_set_data(db, NULL); + mutex_exit(&db->db_mtx); + } + return (abuf); +} + uint64_t dbuf_whichblock(dnode_t *dn, uint64_t offset) { @@ -1162,7 +1185,6 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(db->db_blkid != DB_BONUS_BLKID); mutex_enter(&db->db_mtx); - /* * If this buffer is not dirty, we're done. */ @@ -1341,9 +1363,11 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) (void) dbuf_dirty(db, tx); bcopy(buf->b_data, db->db.db_data, db->db.db_size); VERIFY(arc_buf_remove_ref(buf, db) == 1); + xuio_stat_wbuf_copied(); return; } + xuio_stat_wbuf_nocopy(); if (db->db_state == DB_CACHED) { dbuf_dirty_record_t *dr = db->db_last_dirty; diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index d3dfc21ac1..2d0927bd44 100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -661,12 +661,136 @@ dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_buf_rele_array(dbp, numbufs, FTAG); } +/* + * DMU support for xuio + */ +kstat_t *xuio_ksp = NULL; + +int +dmu_xuio_init(xuio_t *xuio, int nblk) +{ + dmu_xuio_t *priv; + uio_t *uio = &xuio->xu_uio; + + uio->uio_iovcnt = nblk; + uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP); + + priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP); + priv->cnt = nblk; + priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP); + priv->iovp = uio->uio_iov; + XUIO_XUZC_PRIV(xuio) = priv; + + if (XUIO_XUZC_RW(xuio) == UIO_READ) + XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk); + else + XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); + + return (0); +} + +void +dmu_xuio_fini(xuio_t *xuio) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + int nblk = priv->cnt; + + kmem_free(priv->iovp, nblk * sizeof (iovec_t)); + kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); + kmem_free(priv, sizeof (dmu_xuio_t)); + + if (XUIO_XUZC_RW(xuio) == UIO_READ) + XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk); + else + XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); +} + +/* + * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } + * and increase priv->next by 1. 
+ */ +int +dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) +{ + struct iovec *iov; + uio_t *uio = &xuio->xu_uio; + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + int i = priv->next++; + + ASSERT(i < priv->cnt); + ASSERT(off + n <= arc_buf_size(abuf)); + iov = uio->uio_iov + i; + iov->iov_base = (char *)abuf->b_data + off; + iov->iov_len = n; + priv->bufs[i] = abuf; + return (0); +} + +int +dmu_xuio_cnt(xuio_t *xuio) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + return (priv->cnt); +} + +arc_buf_t * +dmu_xuio_arcbuf(xuio_t *xuio, int i) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + + ASSERT(i < priv->cnt); + return (priv->bufs[i]); +} + +void +dmu_xuio_clear(xuio_t *xuio, int i) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + + ASSERT(i < priv->cnt); + priv->bufs[i] = NULL; +} + +static void +xuio_stat_init(void) +{ + xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc", + KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (xuio_ksp != NULL) { + xuio_ksp->ks_data = &xuio_stats; + kstat_install(xuio_ksp); + } +} + +static void +xuio_stat_fini(void) +{ + if (xuio_ksp != NULL) { + kstat_delete(xuio_ksp); + xuio_ksp = NULL; + } +} + +void +xuio_stat_wbuf_copied() +{ + XUIOSTAT_BUMP(xuiostat_wbuf_copied); +} + +void +xuio_stat_wbuf_nocopy() +{ + XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); +} + #ifdef _KERNEL int dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) { dmu_buf_t **dbp; int numbufs, i, err; + xuio_t *xuio = NULL; /* * NB: we could do this block-at-a-time, but it's nice @@ -677,6 +801,9 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) if (err) return (err); + if (uio->uio_extflg == UIO_XUIO) + xuio = (xuio_t *)uio; + for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; @@ -687,8 +814,24 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) bufoff = uio->uio_loffset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); - err = 
uiomove((char *)db->db_data + bufoff, tocpy, - UIO_READ, uio); + if (xuio) { + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + arc_buf_t *dbuf_abuf = dbi->db_buf; + arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); + err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); + if (!err) { + uio->uio_resid -= tocpy; + uio->uio_loffset += tocpy; + } + + if (abuf == dbuf_abuf) + XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); + else + XUIOSTAT_BUMP(xuiostat_rbuf_copied); + } else { + err = uiomove((char *)db->db_data + bufoff, tocpy, + UIO_READ, uio); + } if (err) break; @@ -857,6 +1000,7 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, dmu_write(dn->dn_objset, dn->dn_object, offset, blksz, buf->b_data, tx); dmu_return_arcbuf(buf); + XUIOSTAT_BUMP(xuiostat_wbuf_copied); } } @@ -1369,6 +1513,7 @@ dmu_init(void) zfetch_init(); arc_init(); l2arc_init(); + xuio_stat_init(); } void @@ -1379,4 +1524,5 @@ dmu_fini(void) dnode_fini(); dbuf_fini(); l2arc_fini(); + xuio_stat_fini(); } diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h index a4f4964e11..c528fac1a6 100644 --- a/usr/src/uts/common/fs/zfs/sys/arc.h +++ b/usr/src/uts/common/fs/zfs/sys/arc.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -87,6 +87,7 @@ arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type); arc_buf_t *arc_loan_buf(spa_t *spa, int size); void arc_return_buf(arc_buf_t *buf, void *tag); +void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); void arc_buf_add_ref(arc_buf_t *buf, void *tag); int arc_buf_remove_ref(arc_buf_t *buf, void *tag); int arc_buf_size(arc_buf_t *buf); diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h index 6e2a66a2fa..d99ade07f8 100644 --- a/usr/src/uts/common/fs/zfs/sys/dbuf.h +++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -267,6 +267,7 @@ void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); void dbuf_clear(dmu_buf_impl_t *db); void dbuf_evict(dmu_buf_impl_t *db); diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h index e229ca3bd8..b41bc96c38 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -45,6 +45,7 @@ extern "C" { #endif struct uio; +struct xuio; struct page; struct vnode; struct spa; @@ -500,6 +501,15 @@ struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); void dmu_return_arcbuf(struct arc_buf *buf); void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); +int dmu_xuio_init(struct xuio *uio, int niov); +void dmu_xuio_fini(struct xuio *uio); +int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off, + size_t n); +int dmu_xuio_cnt(struct xuio *uio); +struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i); +void dmu_xuio_clear(struct xuio *uio, int i); +void xuio_stat_wbuf_copied(); +void xuio_stat_wbuf_nocopy(); extern int zfs_prefetch_disable; diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h index 1e16da6b97..22f9f5f8c8 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -232,6 +232,39 @@ extern "C" { struct objset; struct dmu_pool; +typedef struct dmu_xuio { + int next; + int cnt; + struct arc_buf **bufs; + iovec_t *iovp; +} dmu_xuio_t; + +typedef struct xuio_stats { + /* loaned yet not returned arc_buf */ + kstat_named_t xuiostat_onloan_rbuf; + kstat_named_t xuiostat_onloan_wbuf; + /* whether a copy is made when loaning out a read buffer */ + kstat_named_t xuiostat_rbuf_copied; + kstat_named_t xuiostat_rbuf_nocopy; + /* whether a copy is made when assigning a write buffer */ + kstat_named_t xuiostat_wbuf_copied; + kstat_named_t xuiostat_wbuf_nocopy; +} xuio_stats_t; + +static xuio_stats_t xuio_stats = { + { "onloan_read_buf", KSTAT_DATA_UINT64 }, + { "onloan_write_buf", KSTAT_DATA_UINT64 }, + { "read_buf_copied", KSTAT_DATA_UINT64 }, + { "read_buf_nocopy", KSTAT_DATA_UINT64 }, + { "write_buf_copied", KSTAT_DATA_UINT64 }, + { "write_buf_nocopy", KSTAT_DATA_UINT64 } +}; + +#define XUIOSTAT_INCR(stat, val) \ + atomic_add_64(&xuio_stats.stat.value.ui64, (val)) +#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1) + + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c index 0a262cbe21..6759a812ed 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -1115,6 +1115,7 @@ zfs_domount(vfs_t *vfsp, char *osname) vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); } + vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); if (dmu_objset_is_snapshot(zfsvfs->z_os)) { uint64_t pval; diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c index 9d9fe50aa9..d59c7625ec 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vnops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -447,6 +447,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) ssize_t n, nbytes; int error; rl_t *rl; + xuio_t *xuio = NULL; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); @@ -507,6 +508,35 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) ASSERT(uio->uio_loffset < zp->z_phys->zp_size); n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset); + if ((uio->uio_extflg == UIO_XUIO) && + (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { + int nblk; + int blksz = zp->z_blksz; + uint64_t offset = uio->uio_loffset; + + xuio = (xuio_t *)uio; + if ((ISP2(blksz))) { + nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, + blksz)) / blksz; + } else { + ASSERT(offset + n <= blksz); + nblk = 1; + } + dmu_xuio_init(xuio, nblk); + + if (vn_has_cached_data(vp)) { + /* + * For simplicity, we always allocate a full buffer + * even if we only expect to read a portion of a block. 
+ */ + while (--nblk >= 0) { + dmu_xuio_add(xuio, + dmu_request_arcbuf(zp->z_dbuf, blksz), + 0, blksz); + } + } + } + while (n > 0) { nbytes = MIN(n, zfs_read_chunk_size - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); @@ -524,7 +554,6 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) n -= nbytes; } - out: zfs_range_unlock(rl); @@ -570,6 +599,12 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) uint64_t pflags; int error; arc_buf_t *abuf; + iovec_t *aiov; + xuio_t *xuio = NULL; + int i_iov = 0; + int iovcnt = uio->uio_iovcnt; + iovec_t *iovp = uio->uio_iov; + int write_eof; /* * Fasttrack empty write @@ -619,8 +654,13 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. + * Skip this if uio contains loaned arc_buf. */ - uio_prefaultpages(n, uio); + if ((uio->uio_extflg == UIO_XUIO) && + (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) + xuio = (xuio_t *)uio; + else + uio_prefaultpages(n, uio); /* * If in append mode, set the io offset pointer to eof. @@ -659,6 +699,9 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) if ((woff + n) > limit || woff > (limit - n)) n = limit - woff; + /* Will this write extend the file length? */ + write_eof = (woff + n > zp->z_phys->zp_size); + end_size = MAX(zp->z_phys->zp_size, woff + n); /* @@ -669,7 +712,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) while (n > 0) { abuf = NULL; woff = uio->uio_loffset; - again: if (zfs_usergroup_overquota(zfsvfs, B_FALSE, zp->z_phys->zp_uid) || @@ -681,16 +723,28 @@ again: break; } - /* - * If dmu_assign_arcbuf() is expected to execute with minimum - * overhead loan an arc buffer and copy user data to it before - * we enter a txg. This avoids holding a txg forever while we - * pagefault on a hanging NFS server mapping. 
- */ - if (abuf == NULL && n >= max_blksz && + if (xuio && abuf == NULL) { + ASSERT(i_iov < iovcnt); + aiov = &iovp[i_iov]; + abuf = dmu_xuio_arcbuf(xuio, i_iov); + dmu_xuio_clear(xuio, i_iov); + DTRACE_PROBE3(zfs_cp_write, int, i_iov, + iovec_t *, aiov, arc_buf_t *, abuf); + ASSERT((aiov->iov_base == abuf->b_data) || + ((char *)aiov->iov_base - (char *)abuf->b_data + + aiov->iov_len == arc_buf_size(abuf))); + i_iov++; + } else if (abuf == NULL && n >= max_blksz && woff >= zp->z_phys->zp_size && P2PHASE(woff, max_blksz) == 0 && zp->z_blksz == max_blksz) { + /* + * This write covers a full block. "Borrow" a buffer + * from the dmu so that we can fill it before we enter + * a transaction. This avoids the possibility of + * holding up the transaction if the data copy hangs + * up on a pagefault (e.g., from an NFS server mapping). + */ size_t cbytes; abuf = dmu_request_arcbuf(zp->z_dbuf, max_blksz); @@ -755,8 +809,24 @@ again: tx_bytes -= uio->uio_resid; } else { tx_bytes = nbytes; - ASSERT(tx_bytes == max_blksz); - dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx); + ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); + /* + * If this is not a full block write, but we are + * extending the file past EOF and this data starts + * block-aligned, use assign_arcbuf(). Otherwise, + * write via dmu_write(). + */ + if (tx_bytes < max_blksz && (!write_eof || + aiov->iov_base != abuf->b_data)) { + ASSERT(xuio); + dmu_write(zfsvfs->z_os, zp->z_id, woff, + aiov->iov_len, aiov->iov_base, tx); + dmu_return_arcbuf(abuf); + xuio_stat_wbuf_copied(); + } else { + ASSERT(xuio || tx_bytes == max_blksz); + dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx); + } ASSERT(tx_bytes <= uio->uio_resid); uioskip(uio, tx_bytes); } @@ -4571,6 +4641,160 @@ zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, } /* + * Tunable, both must be a power of 2. 
+ * + * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf + * zcr_blksz_max: if set to less than the file block size, allow loaning out of + * an arcbuf for a partial block read + */ +int zcr_blksz_min = (1 << 10); /* 1K */ +int zcr_blksz_max = (1 << 17); /* 128K */ + +/*ARGSUSED*/ +static int +zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int max_blksz = zfsvfs->z_max_blksz; + uio_t *uio = &xuio->xu_uio; + ssize_t size = uio->uio_resid; + offset_t offset = uio->uio_loffset; + int blksz; + int fullblk, i; + arc_buf_t *abuf; + ssize_t maxsize; + int preamble, postamble; + + if (xuio->xu_type != UIOTYPE_ZEROCOPY) + return (EINVAL); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + switch (ioflag) { + case UIO_WRITE: + /* + * Loan out an arc_buf for write if the write size is at least + * max_blksz, and the file's block size is also max_blksz. + */ + blksz = max_blksz; + if (size < blksz || zp->z_blksz != blksz) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + /* + * Caller requests buffers for write before knowing where the + * write offset might be (e.g. NFS TCP write). + */ + if (offset == -1) { + preamble = 0; + } else { + preamble = P2PHASE(offset, blksz); + if (preamble) { + preamble = blksz - preamble; + size -= preamble; + } + } + + postamble = P2PHASE(size, blksz); + size -= postamble; + + fullblk = size / blksz; + dmu_xuio_init(xuio, + (preamble != 0) + fullblk + (postamble != 0)); + DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble, + int, postamble, int, + (preamble != 0) + fullblk + (postamble != 0)); + + /* + * Have to fix iov base/len for partial buffers. They + * currently represent full arc_buf's. 
+ */ + if (preamble) { + /* data begins in the middle of the arc_buf */ + abuf = dmu_request_arcbuf(zp->z_dbuf, blksz); + ASSERT(abuf); + dmu_xuio_add(xuio, abuf, blksz - preamble, preamble); + } + + for (i = 0; i < fullblk; i++) { + abuf = dmu_request_arcbuf(zp->z_dbuf, blksz); + ASSERT(abuf); + dmu_xuio_add(xuio, abuf, 0, blksz); + } + + if (postamble) { + /* data ends in the middle of the arc_buf */ + abuf = dmu_request_arcbuf(zp->z_dbuf, blksz); + ASSERT(abuf); + dmu_xuio_add(xuio, abuf, 0, postamble); + } + break; + case UIO_READ: + /* + * Loan out an arc_buf for read if the read size is larger than + * the current file block size. Block alignment is not + * considered. Partial arc_buf will be loaned out for read. + */ + blksz = zp->z_blksz; + if (blksz < zcr_blksz_min) + blksz = zcr_blksz_min; + if (blksz > zcr_blksz_max) + blksz = zcr_blksz_max; + /* avoid potential complexity of dealing with it */ + if (blksz > max_blksz) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + maxsize = zp->z_phys->zp_size - uio->uio_loffset; + if (size > maxsize) + size = maxsize; + + if (size < blksz || vn_has_cached_data(vp)) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + break; + default: + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + uio->uio_extflg = UIO_XUIO; + XUIO_XUZC_RW(xuio) = ioflag; + ZFS_EXIT(zfsvfs); + return (0); +} + +/*ARGSUSED*/ +static int +zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct) +{ + int i; + arc_buf_t *abuf; + int ioflag = XUIO_XUZC_RW(xuio); + + ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY); + + i = dmu_xuio_cnt(xuio); + while (i-- > 0) { + abuf = dmu_xuio_arcbuf(xuio, i); + /* + * if abuf == NULL, it must be a write buffer + * that has been returned in zfs_write(). 
+ */ + if (abuf) + dmu_return_arcbuf(abuf); + ASSERT(abuf || ioflag == UIO_WRITE); + } + + dmu_xuio_fini(xuio); + return (0); +} + +/* * Predeclare these here so that the compiler assumes that * this is an "old style" function declaration that does * not include arguments => we won't get type mismatch errors @@ -4653,6 +4877,8 @@ const fs_operation_def_t zfs_fvnodeops_template[] = { VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, + VOPNAME_REQZCBUF, { .vop_reqzcbuf = zfs_reqzcbuf }, + VOPNAME_RETZCBUF, { .vop_retzcbuf = zfs_retzcbuf }, NULL, NULL }; diff --git a/usr/src/uts/common/nfs/nfs.h b/usr/src/uts/common/nfs/nfs.h index 77e5a397c2..1905e47c4f 100644 --- a/usr/src/uts/common/nfs/nfs.h +++ b/usr/src/uts/common/nfs/nfs.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -1447,6 +1447,7 @@ struct READ3resok { #ifdef _KERNEL uint_t wlist_len; struct clist *wlist; + frtn_t zcopy; #endif }; typedef struct READ3resok READ3resok; @@ -2322,6 +2323,24 @@ extern int do_xattr_exists_check(vnode_t *, ulong_t *, cred_t *); extern ts_label_t *nfs_getflabel(vnode_t *, struct exportinfo *); extern boolean_t do_rfs_label_check(bslabel_t *, vnode_t *, int, struct exportinfo *); + +/* + * Copy Reduction support. + * xuio_t wrapper with additional private data. 
+ */ + +typedef struct nfs_xuio { + xuio_t nu_uio; + vnode_t *nu_vp; + uint_t nu_ref; + frtn_t nu_frtn; +} nfs_xuio_t; + +xuio_t *rfs_setup_xuio(vnode_t *); +mblk_t *uio_to_mblk(uio_t *); +void rfs_rndup_mblks(mblk_t *, uint_t, int); +void rfs_free_xuio(void *); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/rpc/rpcmod.c b/usr/src/uts/common/rpc/rpcmod.c index cab50d67cd..891045d7f2 100644 --- a/usr/src/uts/common/rpc/rpcmod.c +++ b/usr/src/uts/common/rpc/rpcmod.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -1059,8 +1059,6 @@ rpcmod_release(queue_t *q, mblk_t *bp) #define MIR_SVC_ORDREL_TIMEOUT (10 * (60 * 1000L)) /* 10 minutes */ #define MIR_LASTFRAG 0x80000000 /* Record marker */ -#define DLEN(mp) (mp->b_cont ? msgdsize(mp) : (mp->b_wptr - mp->b_rptr)) - #define MIR_SVC_QUIESCED(mir) \ (mir->mir_ref_cnt == 0 && mir->mir_inrservice == 0) diff --git a/usr/src/uts/common/rpc/xdr.h b/usr/src/uts/common/rpc/xdr.h index 4ef63d6baf..3db775893c 100644 --- a/usr/src/uts/common/rpc/xdr.h +++ b/usr/src/uts/common/rpc/xdr.h @@ -18,7 +18,7 @@ * * CDDL HEADER END * - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -585,6 +585,8 @@ extern uint_t xdrrec_readbytes(); #endif #else +#define DLEN(mp) (mp->b_cont ? 
msgdsize(mp) : (mp->b_wptr - mp->b_rptr)) + extern void xdrmem_create(XDR *, caddr_t, uint_t, enum xdr_op); extern void xdrmblk_init(XDR *, mblk_t *, enum xdr_op, int); extern bool_t xdrmblk_getmblk(XDR *, mblk_t **, uint_t *); diff --git a/usr/src/uts/common/rpc/xdr_mblk.c b/usr/src/uts/common/rpc/xdr_mblk.c index 053edb7603..0b06b827e0 100644 --- a/usr/src/uts/common/rpc/xdr_mblk.c +++ b/usr/src/uts/common/rpc/xdr_mblk.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -361,20 +361,24 @@ xdrmblk_putbytes(XDR *xdrs, caddr_t addr, int len) * not a multiple of BYTES_PER_XDR_UNIT, the caller has the option * of making the data a BYTES_PER_XDR_UNIT multiple (b_wptr - b_rptr is * a BYTES_PER_XDR_UNIT multiple), but in this case the caller has to ensure - * that the filler bytes are initialized to zero. Note: Doesn't to work for - * chained mblks. + * that the filler bytes are initialized to zero. */ bool_t xdrmblk_putmblk(XDR *xdrs, mblk_t *m, uint_t len) { int32_t llen = (int32_t)len; - if (((m->b_wptr - m->b_rptr) % BYTES_PER_XDR_UNIT) != 0) + if ((DLEN(m) % BYTES_PER_XDR_UNIT) != 0) return (FALSE); if (!xdrmblk_putint32(xdrs, &llen)) return (FALSE); + /* LINTED pointer alignment */ ((mblk_t *)xdrs->x_base)->b_cont = m; + + /* base points to the last mblk */ + while (m->b_cont) + m = m->b_cont; xdrs->x_base = (caddr_t)m; xdrs->x_handy = 0; return (TRUE); diff --git a/usr/src/uts/common/sys/fem.h b/usr/src/uts/common/sys/fem.h index 84defb057c..9b3cd142e4 100644 --- a/usr/src/uts/common/sys/fem.h +++ b/usr/src/uts/common/sys/fem.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_FEM_H #define _SYS_FEM_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/mutex.h> #include <sys/pathname.h> @@ -260,7 +258,13 @@ struct fem_head { struct shrlock *shr, int flag, cred_t *cr, \ caller_context_t *ct); \ int (*femop_vnevent)(femarg_t *vf, vnevent_t vnevent, \ - vnode_t *dvp, char *cname, caller_context_t *ct) + vnode_t *dvp, char *cname, \ + caller_context_t *ct); \ + int (*femop_reqzcbuf)(femarg_t *vf, enum uio_rw ioflag, \ + xuio_t *xuio, cred_t *cr, \ + caller_context_t *ct); \ + int (*femop_retzcbuf)(femarg_t *vf, xuio_t *xuio, cred_t *cr, \ + caller_context_t *ct) /* NB: No ";" */ struct fem { @@ -392,6 +396,10 @@ extern int vnext_shrlock(femarg_t *vf, int cmd, struct shrlock *shr, int flag, cred_t *cr, caller_context_t *ct); extern int vnext_vnevent(femarg_t *vf, vnevent_t vevent, vnode_t *dvp, char *cname, caller_context_t *ct); +extern int vnext_reqzcbuf(femarg_t *vf, enum uio_rw ioflag, xuio_t *xuiop, + cred_t *cr, caller_context_t *ct); +extern int vnext_retzcbuf(femarg_t *vf, xuio_t *xuiop, cred_t *cr, + caller_context_t *ct); extern int vfsnext_mount(fsemarg_t *vf, vnode_t *mvp, struct mounta *uap, cred_t *cr); diff --git a/usr/src/uts/common/sys/uio.h b/usr/src/uts/common/sys/uio.h index 248443f9a5..7255a2fa67 100644 --- a/usr/src/uts/common/sys/uio.h +++ b/usr/src/uts/common/sys/uio.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -133,6 +133,49 @@ typedef struct uioa_s { uioa_page_t uioa_locked[UIOA_IOV_MAX]; /* Per iov locked pages */ } uioa_t; +/* + * uio extensions + * + * PSARC 2009/478: Copy Reduction Interfaces + */ +typedef enum xuio_type { + UIOTYPE_ASYNCIO, + UIOTYPE_ZEROCOPY +} xuio_type_t; + +typedef struct xuio { + uio_t xu_uio; /* Embedded UIO structure */ + + /* Extended uio fields */ + enum xuio_type xu_type; /* What kind of uio structure? */ + union { + /* Async I/O Support, intended to replace uioa_t. */ + struct { + uint32_t xu_a_state; /* state of async i/o */ + /* bytes that have been uioamove()ed */ + ssize_t xu_a_mbytes; + uioa_page_t *xu_a_lcur; /* pointer into uioa_locked[] */ + /* pointer into lcur->uioa_ppp[] */ + void **xu_a_lppp; + void *xu_a_hwst[4]; /* opaque hardware state */ + /* Per iov locked pages */ + uioa_page_t xu_a_locked[UIOA_IOV_MAX]; + } xu_aio; + + /* + * Copy Reduction Support -- facilitate loaning / returning of + * filesystem cache buffers. + */ + struct { + int xu_zc_rw; /* read or write buffer */ + void *xu_zc_priv; /* fs specific */ + } xu_zc; + } xu_ext; +} xuio_t; + +#define XUIO_XUZC_PRIV(xuio) ((xuio)->xu_ext.xu_zc.xu_zc_priv) /* lvalue */ +#define XUIO_XUZC_RW(xuio) ((xuio)->xu_ext.xu_zc.xu_zc_rw) /* lvalue */ + #define UIOA_ALLOC 0x0001 /* allocated but not yet initialized */ #define UIOA_INIT 0x0002 /* initialized but not yet enabled */ #define UIOA_ENABLED 0x0004 /* enabled, asynch i/o active */ @@ -177,6 +220,7 @@ typedef enum uio_rw { UIO_READ, UIO_WRITE } uio_rw_t; #define UIO_COPY_CACHED 0x0001 /* copy should not bypass caches */ #define UIO_ASYNC 0x0002 /* uio_t is really a uioa_t */ +#define UIO_XUIO 0x0004 /* Structure is xuio_t */ /* * Global uioasync capability shadow state. diff --git a/usr/src/uts/common/sys/vfs.h b/usr/src/uts/common/sys/vfs.h index 2e9679cf97..bae4e5b87f 100644 --- a/usr/src/uts/common/sys/vfs.h +++ b/usr/src/uts/common/sys/vfs.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -302,7 +302,8 @@ typedef uint64_t vfs_feature_t; #define VFSFT_SYSATTR_VIEWS 0x100000040 /* Supports sysattr view i/f */ #define VFSFT_ACCESS_FILTER 0x100000080 /* dirents filtered by access */ #define VFSFT_REPARSE 0x100000100 /* Supports reparse point */ - +#define VFSFT_ZEROCOPY_SUPPORTED 0x100000200 + /* Supports loaning/returning cache buffers */ /* * Argument structure for mount(2). * diff --git a/usr/src/uts/common/sys/vnode.h b/usr/src/uts/common/sys/vnode.h index 97504aabf3..8b75225a64 100644 --- a/usr/src/uts/common/sys/vnode.h +++ b/usr/src/uts/common/sys/vnode.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -118,6 +118,8 @@ typedef struct vopstats { kstat_named_t ngetsecattr; /* VOP_GETSECATTR */ kstat_named_t nshrlock; /* VOP_SHRLOCK */ kstat_named_t nvnevent; /* VOP_VNEVENT */ + kstat_named_t nreqzcbuf; /* VOP_REQZCBUF */ + kstat_named_t nretzcbuf; /* VOP_RETZCBUF */ } vopstats_t; /* @@ -900,7 +902,11 @@ struct taskq; int (*vop_shrlock)(vnode_t *, int, struct shrlock *, \ int, cred_t *, caller_context_t *); \ int (*vop_vnevent)(vnode_t *, vnevent_t, vnode_t *, \ - char *, caller_context_t *) + char *, caller_context_t *); \ + int (*vop_reqzcbuf)(vnode_t *, enum uio_rw, xuio_t *, \ + cred_t *, caller_context_t *); \ + int (*vop_retzcbuf)(vnode_t *, xuio_t *, cred_t *, \ + caller_context_t *) /* NB: No ";" */ /* @@ -997,6 +1003,9 @@ extern int fop_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *, caller_context_t *); extern int fop_vnevent(vnode_t *, vnevent_t, vnode_t *, char *, caller_context_t *); +extern int fop_reqzcbuf(vnode_t *, enum uio_rw, xuio_t *, cred_t *, + caller_context_t *); +extern int fop_retzcbuf(vnode_t *, xuio_t *, cred_t *, caller_context_t *); #endif /* _KERNEL 
*/ @@ -1088,6 +1097,10 @@ extern int fop_vnevent(vnode_t *, vnevent_t, vnode_t *, char *, fop_shrlock(vp, cmd, shr, f, cr, ct) #define VOP_VNEVENT(vp, vnevent, dvp, fnm, ct) \ fop_vnevent(vp, vnevent, dvp, fnm, ct) +#define VOP_REQZCBUF(vp, rwflag, xuiop, cr, ct) \ + fop_reqzcbuf(vp, rwflag, xuiop, cr, ct) +#define VOP_RETZCBUF(vp, xuiop, cr, ct) \ + fop_retzcbuf(vp, xuiop, cr, ct) #define VOPNAME_OPEN "open" #define VOPNAME_CLOSE "close" @@ -1133,6 +1146,8 @@ extern int fop_vnevent(vnode_t *, vnevent_t, vnode_t *, char *, #define VOPNAME_SETSECATTR "setsecattr" #define VOPNAME_SHRLOCK "shrlock" #define VOPNAME_VNEVENT "vnevent" +#define VOPNAME_REQZCBUF "reqzcbuf" +#define VOPNAME_RETZCBUF "retzcbuf" /* * Flags for VOP_LOOKUP |
