author     chunli zhang - Sun Microsystems - Irvine United States <Chunli.Zhang@Sun.COM>  2010-01-18 10:34:16 -0800
committer  chunli zhang - Sun Microsystems - Irvine United States <Chunli.Zhang@Sun.COM>  2010-01-18 10:34:16 -0800
commit     c242f9a02a2ef021449275ae0a1d2581ee77231d (patch)
tree       6d298bebb8ff9febd9acf936d402f67a6d67d358 /usr/src
parent     bce54adf407df0723facaef4e2147ed69b922786 (diff)
download   illumos-joyent-c242f9a02a2ef021449275ae0a1d2581ee77231d.tar.gz
6873106 Need a mechanism to share buffers between fs modules
Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/cmd/stat/fsstat/fsstat.c          |   4
-rw-r--r--  usr/src/uts/common/fs/fem.c               |  93
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs3_srv.c      |  64
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs3_vfsops.c   |   2
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs3_xdr.c      |   9
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs4_srv.c      |  67
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs4_vfsops.c   |   2
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs4_xdr.c      |   9
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs_server.c    | 147
-rw-r--r--  usr/src/uts/common/fs/vnode.c             |  37
-rw-r--r--  usr/src/uts/common/fs/zfs/arc.c           |  25
-rw-r--r--  usr/src/uts/common/fs/zfs/dbuf.c          |  28
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu.c           | 152
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/arc.h       |   3
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dbuf.h      |   3
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu.h       |  12
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu_impl.h  |  35
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_vfsops.c    |   3
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_vnops.c     | 252
-rw-r--r--  usr/src/uts/common/nfs/nfs.h              |  21
-rw-r--r--  usr/src/uts/common/rpc/rpcmod.c           |   4
-rw-r--r--  usr/src/uts/common/rpc/xdr.h              |   4
-rw-r--r--  usr/src/uts/common/rpc/xdr_mblk.c         |  12
-rw-r--r--  usr/src/uts/common/sys/fem.h              |  16
-rw-r--r--  usr/src/uts/common/sys/uio.h              |  46
-rw-r--r--  usr/src/uts/common/sys/vfs.h              |   5
-rw-r--r--  usr/src/uts/common/sys/vnode.h            |  19
27 files changed, 990 insertions(+), 84 deletions(-)
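
This commit introduces two new vnode operations, VOP_REQZCBUF() and VOP_RETZCBUF(), that let a consumer borrow ("loan") file-system cache buffers for a read or write instead of copying the data through a private buffer. ZFS implements the operations, the NFSv3/v4 servers consume them for TCP reads, and the VFS, FEM and XDR layers are extended to carry the loaned buffers through to the wire. The sketch below condenses the read-side flow added to rfs3_read()/rfs4_op_read(); it is illustrative only, send_reply() stands in for "encode and transmit the mblk chain", and error handling for the loaned buffers is elided:

/*
 * Sketch of the zero-copy read path added by this commit (not part of the
 * change itself; send_reply() is a placeholder).
 */
static int
zc_read(vnode_t *vp, offset_t off, size_t len, cred_t *cr, caller_context_t *ct)
{
    xuio_t *xuio = rfs_setup_xuio(vp);      /* wrap a uio_t in an nfs_xuio_t */
    uio_t *uiop = &xuio->xu_uio;
    mblk_t *mp;
    int error;

    uiop->uio_segflg = UIO_SYSSPACE;
    uiop->uio_loffset = off;
    uiop->uio_resid = len;

    /* Ask the filesystem to loan out its cache buffers. */
    if ((error = VOP_REQZCBUF(vp, UIO_READ, xuio, cr, ct)) != 0) {
        uiop->uio_extflg = 0;
        rfs_free_xuio(xuio);                /* real code falls back to the copy path */
        return (error);
    }
    VN_HOLD(vp);                            /* held until VOP_RETZCBUF() */

    error = VOP_READ(vp, uiop, 0, cr, ct);  /* fills the loaned buffers */
    if (error == 0) {
        mp = uio_to_mblk(uiop);             /* wrap each iov as an esballoca() mblk */
        rfs_rndup_mblks(mp, len - uiop->uio_resid, 1);
        send_reply(mp);                     /* freeing the chain runs rfs_free_xuio(), */
    }                                       /* which calls VOP_RETZCBUF() and VN_RELE() */
    return (error);
}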
diff --git a/usr/src/cmd/stat/fsstat/fsstat.c b/usr/src/cmd/stat/fsstat/fsstat.c
index 1869ff4fe6..31b2f5e054 100644
--- a/usr/src/cmd/stat/fsstat/fsstat.c
+++ b/usr/src/cmd/stat/fsstat/fsstat.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -428,6 +428,8 @@ vop_display(char *name, vopstats_t *oldvsp, vopstats_t *newvsp, int dispflag)
PRINT_VOPSTAT(niceflag, setsecattr);
PRINT_VOPSTAT(niceflag, shrlock);
PRINT_VOPSTAT(niceflag, vnevent);
+ PRINT_VOPSTAT(niceflag, reqzcbuf);
+ PRINT_VOPSTAT(niceflag, retzcbuf);
if (niceflag) {
/* Make it easier on the eyes */
diff --git a/usr/src/uts/common/fs/fem.c b/usr/src/uts/common/fs/fem.c
index b7d1413bd7..95248d1077 100644
--- a/usr/src/uts/common/fs/fem.c
+++ b/usr/src/uts/common/fs/fem.c
@@ -19,10 +19,9 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/types.h>
#include <sys/atomic.h>
@@ -124,6 +123,8 @@ static fs_operation_trans_def_t fem_opdef[] = {
_FEMOPDEF(GETSECATTR, getsecattr),
_FEMOPDEF(SHRLOCK, shrlock),
_FEMOPDEF(VNEVENT, vnevent),
+ _FEMOPDEF(REQZCBUF, reqzcbuf),
+ _FEMOPDEF(RETZCBUF, retzcbuf),
{ NULL, 0, NULL, NULL }
};
@@ -176,6 +177,8 @@ static struct fs_operation_def fem_guard_ops[] = {
_FEMGUARD(GETSECATTR, getsecattr),
_FEMGUARD(SHRLOCK, shrlock),
_FEMGUARD(VNEVENT, vnevent),
+ _FEMGUARD(REQZCBUF, reqzcbuf),
+ _FEMGUARD(RETZCBUF, retzcbuf),
{ NULL, NULL }
};
@@ -1645,6 +1648,61 @@ vhead_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *cname,
}
static int
+vhead_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuiop, cred_t *cr,
+ caller_context_t *ct)
+{
+ femarg_t farg;
+ struct fem_list *femsp;
+ int (*func)();
+ void *arg0;
+ int errc;
+
+ if ((femsp = fem_lock(vp->v_femhead)) == NULL) {
+ func = (int (*)()) (vp->v_op->vop_reqzcbuf);
+ arg0 = vp;
+ fem_unlock(vp->v_femhead);
+ errc = (*func)(arg0, ioflag, xuiop, cr, ct);
+ } else {
+ fem_addref(femsp);
+ fem_unlock(vp->v_femhead);
+ farg.fa_vnode.vp = vp;
+ farg.fa_fnode = femsp->feml_nodes + femsp->feml_tos;
+ vsop_find(&farg, &func, int, &arg0, vop_reqzcbuf,
+ femop_reqzcbuf);
+ errc = (*func)(arg0, ioflag, xuiop, cr, ct);
+ fem_release(femsp);
+ }
+ return (errc);
+}
+
+static int
+vhead_retzcbuf(vnode_t *vp, xuio_t *xuiop, cred_t *cr, caller_context_t *ct)
+{
+ femarg_t farg;
+ struct fem_list *femsp;
+ int (*func)();
+ void *arg0;
+ int errc;
+
+ if ((femsp = fem_lock(vp->v_femhead)) == NULL) {
+ func = (int (*)()) (vp->v_op->vop_retzcbuf);
+ arg0 = vp;
+ fem_unlock(vp->v_femhead);
+ errc = (*func)(arg0, xuiop, cr, ct);
+ } else {
+ fem_addref(femsp);
+ fem_unlock(vp->v_femhead);
+ farg.fa_vnode.vp = vp;
+ farg.fa_fnode = femsp->feml_nodes + femsp->feml_tos;
+ vsop_find(&farg, &func, int, &arg0, vop_retzcbuf,
+ femop_retzcbuf);
+ errc = (*func)(arg0, xuiop, cr, ct);
+ fem_release(femsp);
+ }
+ return (errc);
+}
+
+static int
fshead_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
{
fsemarg_t farg;
@@ -1942,6 +2000,8 @@ static struct fs_operation_def fhead_vn_spec[] = {
{ VOPNAME_GETSECATTR, (femop_t *)vhead_getsecattr },
{ VOPNAME_SHRLOCK, (femop_t *)vhead_shrlock },
{ VOPNAME_VNEVENT, (femop_t *)vhead_vnevent },
+ { VOPNAME_REQZCBUF, (femop_t *)vhead_reqzcbuf },
+ { VOPNAME_RETZCBUF, (femop_t *)vhead_retzcbuf },
{ NULL, NULL }
};
@@ -2642,6 +2702,35 @@ vnext_vnevent(femarg_t *vf, vnevent_t vnevent, vnode_t *dvp, char *cname,
}
int
+vnext_reqzcbuf(femarg_t *vf, enum uio_rw ioflag, xuio_t *xuiop, cred_t *cr,
+ caller_context_t *ct)
+{
+ int (*func)() = NULL;
+ void *arg0 = NULL;
+
+ ASSERT(vf != NULL);
+ vf->fa_fnode--;
+ vsop_find(vf, &func, int, &arg0, vop_reqzcbuf, femop_reqzcbuf);
+ ASSERT(func != NULL);
+ ASSERT(arg0 != NULL);
+ return ((*func)(arg0, ioflag, xuiop, cr, ct));
+}
+
+int
+vnext_retzcbuf(femarg_t *vf, xuio_t *xuiop, cred_t *cr, caller_context_t *ct)
+{
+ int (*func)() = NULL;
+ void *arg0 = NULL;
+
+ ASSERT(vf != NULL);
+ vf->fa_fnode--;
+ vsop_find(vf, &func, int, &arg0, vop_retzcbuf, femop_retzcbuf);
+ ASSERT(func != NULL);
+ ASSERT(arg0 != NULL);
+ return ((*func)(arg0, xuiop, cr, ct));
+}
+
+int
vfsnext_mount(fsemarg_t *vf, vnode_t *mvp, struct mounta *uap, cred_t *cr)
{
int (*func)() = NULL;
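
The fem.c additions above wire the two new operations into the file-event-monitoring framework: vhead_reqzcbuf()/vhead_retzcbuf() dispatch from the vnode head, the new _FEMOPDEF/_FEMGUARD entries register the operations, and vnext_reqzcbuf()/vnext_retzcbuf() let an interposed monitor forward the call down the stack. A monitor with no interest in zero-copy requests can simply pass them through; a hypothetical pass-through pair (monitor_reqzcbuf/monitor_retzcbuf are illustrative names, not part of the commit) might look like:

static int
monitor_reqzcbuf(femarg_t *vf, enum uio_rw ioflag, xuio_t *xuiop, cred_t *cr,
    caller_context_t *ct)
{
    /* monitor-specific accounting could be done here */
    return (vnext_reqzcbuf(vf, ioflag, xuiop, cr, ct));
}

static int
monitor_retzcbuf(femarg_t *vf, xuio_t *xuiop, cred_t *cr, caller_context_t *ct)
{
    return (vnext_retzcbuf(vf, xuiop, cr, ct));
}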
diff --git a/usr/src/uts/common/fs/nfs/nfs3_srv.c b/usr/src/uts/common/fs/nfs/nfs3_srv.c
index 71ebdb2d74..b8e63c183d 100644
--- a/usr/src/uts/common/fs/nfs/nfs3_srv.c
+++ b/usr/src/uts/common/fs/nfs/nfs3_srv.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -87,6 +87,8 @@ static void vattr_to_pre_op_attr(struct vattr *, pre_op_attr *);
static void vattr_to_wcc_data(struct vattr *, struct vattr *, wcc_data *);
static int rdma_setup_read_data3(READ3args *, READ3resok *);
+extern int nfs_loaned_buffers;
+
u_longlong_t nfs3_srv_caller_id;
/* ARGSUSED */
@@ -994,6 +996,9 @@ rfs3_read(READ3args *args, READ3res *resp, struct exportinfo *exi,
int in_crit = 0;
int need_rwunlock = 0;
caller_context_t ct;
+ int rdma_used = 0;
+ int loaned_buffers;
+ struct uio *uiop;
vap = NULL;
@@ -1007,6 +1012,12 @@ rfs3_read(READ3args *args, READ3res *resp, struct exportinfo *exi,
goto out;
}
+ if (args->wlist)
+ rdma_used = 1;
+
+ /* use loaned buffers for TCP */
+ loaned_buffers = (nfs_loaned_buffers && !rdma_used) ? 1 : 0;
+
if (is_system_labeled()) {
bslabel_t *clabel = req->rq_label;
@@ -1136,12 +1147,38 @@ rfs3_read(READ3args *args, READ3res *resp, struct exportinfo *exi,
if (args->count > rfs3_tsize(req))
args->count = rfs3_tsize(req);
+ if (loaned_buffers) {
+ uiop = (uio_t *)rfs_setup_xuio(vp);
+ ASSERT(uiop != NULL);
+ uiop->uio_segflg = UIO_SYSSPACE;
+ uiop->uio_loffset = args->offset;
+ uiop->uio_resid = args->count;
+
+ /* Jump to do the read if successful */
+ if (VOP_REQZCBUF(vp, UIO_READ, (xuio_t *)uiop, cr, &ct) == 0) {
+ /*
+ * Need to hold the vnode until after VOP_RETZCBUF()
+ * is called.
+ */
+ VN_HOLD(vp);
+ goto doio_read;
+ }
+
+ DTRACE_PROBE2(nfss__i__reqzcbuf_failed, int,
+ uiop->uio_loffset, int, uiop->uio_resid);
+
+ uiop->uio_extflg = 0;
+ /* failure to setup for zero copy */
+ rfs_free_xuio((void *)uiop);
+ loaned_buffers = 0;
+ }
+
/*
* If returning data via RDMA Write, then grab the chunk list.
* If we aren't returning READ data w/RDMA_WRITE, then grab
* a mblk.
*/
- if (args->wlist) {
+ if (rdma_used) {
mp = NULL;
(void) rdma_get_wchunk(req, &iov, args->wlist);
} else {
@@ -1167,11 +1204,14 @@ rfs3_read(READ3args *args, READ3res *resp, struct exportinfo *exi,
uio.uio_extflg = UIO_COPY_CACHED;
uio.uio_loffset = args->offset;
uio.uio_resid = args->count;
+ uiop = &uio;
- error = VOP_READ(vp, &uio, 0, cr, &ct);
+doio_read:
+ error = VOP_READ(vp, uiop, 0, cr, &ct);
if (error) {
- freeb(mp);
+ if (mp)
+ freemsg(mp);
/* check if a monitor detected a delegation conflict */
if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
resp->status = NFS3ERR_JUKEBOX;
@@ -1180,6 +1220,12 @@ rfs3_read(READ3args *args, READ3res *resp, struct exportinfo *exi,
goto out;
}
+ /* make mblk using zc buffers */
+ if (loaned_buffers) {
+ mp = uio_to_mblk(uiop);
+ ASSERT(mp != NULL);
+ }
+
va.va_mask = AT_ALL;
error = VOP_GETATTR(vp, &va, 0, cr, &ct);
@@ -1205,16 +1251,20 @@ rfs3_read(READ3args *args, READ3res *resp, struct exportinfo *exi,
resp->status = NFS3_OK;
vattr_to_post_op_attr(vap, &resp->resok.file_attributes);
- resp->resok.count = args->count - uio.uio_resid;
+ resp->resok.count = args->count - uiop->uio_resid;
if (!error && offset + resp->resok.count == va.va_size)
resp->resok.eof = TRUE;
else
resp->resok.eof = FALSE;
resp->resok.data.data_len = resp->resok.count;
+
+ if (mp)
+ rfs_rndup_mblks(mp, resp->resok.count, loaned_buffers);
+
resp->resok.data.mp = mp;
resp->resok.size = (uint_t)args->count;
- if (args->wlist) {
+ if (rdma_used) {
resp->resok.data.data_val = (caddr_t)iov.iov_base;
if (!rdma_setup_read_data3(args, &(resp->resok))) {
resp->status = NFS3ERR_INVAL;
@@ -1260,7 +1310,7 @@ rfs3_read_free(READ3res *resp)
if (resp->status == NFS3_OK) {
mp = resp->resok.data.mp;
if (mp != NULL)
- freeb(mp);
+ freemsg(mp);
}
}
diff --git a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c
index 501c2dbd9e..2111e9fabf 100644
--- a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c
+++ b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c
@@ -1003,7 +1003,7 @@ errout:
static int nfs3_dynamic = 0; /* global variable to enable dynamic retrans. */
static ushort_t nfs3_max_threads = 8; /* max number of active async threads */
-static uint_t nfs3_bsize = 32 * 1024; /* client `block' size */
+uint_t nfs3_bsize = 32 * 1024; /* client `block' size */
static uint_t nfs3_async_clusters = 1; /* # of reqs from each async queue */
static uint_t nfs3_cots_timeo = NFS_COTS_TIMEO;
diff --git a/usr/src/uts/common/fs/nfs/nfs3_xdr.c b/usr/src/uts/common/fs/nfs/nfs3_xdr.c
index e8fd857848..cdec8ffc96 100644
--- a/usr/src/uts/common/fs/nfs/nfs3_xdr.c
+++ b/usr/src/uts/common/fs/nfs/nfs3_xdr.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -1320,16 +1320,9 @@ xdr_READ3res(XDR *xdrs, READ3res *objp)
}
if (xdrs->x_op == XDR_ENCODE) {
- int i, rndup;
mp = resokp->data.mp;
if (mp != NULL && xdrs->x_ops == &xdrmblk_ops) {
- mp->b_wptr += resokp->count;
- rndup = BYTES_PER_XDR_UNIT -
- (resokp->data.data_len % BYTES_PER_XDR_UNIT);
- if (rndup != BYTES_PER_XDR_UNIT)
- for (i = 0; i < rndup; i++)
- *mp->b_wptr++ = '\0';
if (xdrmblk_putmblk(xdrs, mp, resokp->count) == TRUE) {
resokp->data.mp = NULL;
return (TRUE);
diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv.c b/usr/src/uts/common/fs/nfs/nfs4_srv.c
index ac584c9d62..62474ee7f6 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_srv.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_srv.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -84,6 +84,8 @@ static int rfs4_maxlock_tries = RFS4_MAXLOCK_TRIES;
#define RFS4_LOCK_DELAY 10 /* Milliseconds */
static clock_t rfs4_lock_delay = RFS4_LOCK_DELAY;
extern struct svc_ops rdma_svc_ops;
+extern int nfs_loaned_buffers;
+/* End of Tunables */
static int rdma_setup_read_data4(READ4args *, READ4res *);
@@ -3140,9 +3142,12 @@ rfs4_op_read(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req,
bool_t *deleg = &cs->deleg;
nfsstat4 stat;
int in_crit = 0;
- mblk_t *mp;
+ mblk_t *mp = NULL;
int alloc_err = 0;
+ int rdma_used = 0;
+ int loaned_buffers;
caller_context_t ct;
+ struct uio *uiop;
DTRACE_NFSV4_2(op__read__start, struct compound_state *, cs,
READ4args, args);
@@ -3183,6 +3188,12 @@ rfs4_op_read(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req,
goto out;
}
+ if (args->wlist)
+ rdma_used = 1;
+
+ /* use loaned buffers for TCP */
+ loaned_buffers = (nfs_loaned_buffers && !rdma_used) ? 1 : 0;
+
va.va_mask = AT_MODE|AT_SIZE|AT_UID;
verror = VOP_GETATTR(vp, &va, 0, cs->cr, &ct);
@@ -3250,11 +3261,38 @@ rfs4_op_read(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req,
if (args->count > rfs4_tsize(req))
args->count = rfs4_tsize(req);
+ if (loaned_buffers) {
+ uiop = (uio_t *)rfs_setup_xuio(vp);
+ ASSERT(uiop != NULL);
+ uiop->uio_segflg = UIO_SYSSPACE;
+ uiop->uio_loffset = args->offset;
+ uiop->uio_resid = args->count;
+
+ /* Jump to do the read if successful */
+ if (!VOP_REQZCBUF(vp, UIO_READ, (xuio_t *)uiop, cs->cr, &ct)) {
+ /*
+ * Need to hold the vnode until after VOP_RETZCBUF()
+ * is called.
+ */
+ VN_HOLD(vp);
+ goto doio_read;
+ }
+
+ DTRACE_PROBE2(nfss__i__reqzcbuf_failed, int,
+ uiop->uio_loffset, int, uiop->uio_resid);
+
+ uiop->uio_extflg = 0;
+
+ /* failure to setup for zero copy */
+ rfs_free_xuio((void *)uiop);
+ loaned_buffers = 0;
+ }
+
/*
* If returning data via RDMA Write, then grab the chunk list. If we
* aren't returning READ data w/RDMA_WRITE, then grab a mblk.
*/
- if (args->wlist) {
+ if (rdma_used) {
mp = NULL;
(void) rdma_get_wchunk(req, &iov, args->wlist);
} else {
@@ -3287,27 +3325,38 @@ rfs4_op_read(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req,
uio.uio_extflg = UIO_COPY_CACHED;
uio.uio_loffset = args->offset;
uio.uio_resid = args->count;
+ uiop = &uio;
- error = do_io(FREAD, vp, &uio, 0, cs->cr, &ct);
+doio_read:
+ error = do_io(FREAD, vp, uiop, 0, cs->cr, &ct);
va.va_mask = AT_SIZE;
verror = VOP_GETATTR(vp, &va, 0, cs->cr, &ct);
if (error) {
- freeb(mp);
+ if (mp)
+ freemsg(mp);
*cs->statusp = resp->status = puterrno4(error);
goto out;
}
+ /* make mblk using zc buffers */
+ if (loaned_buffers) {
+ mp = uio_to_mblk(uiop);
+ ASSERT(mp != NULL);
+ }
+
*cs->statusp = resp->status = NFS4_OK;
- ASSERT(uio.uio_resid >= 0);
- resp->data_len = args->count - uio.uio_resid;
+ ASSERT(uiop->uio_resid >= 0);
+ resp->data_len = args->count - uiop->uio_resid;
if (mp) {
resp->data_val = (char *)mp->b_datap->db_base;
+ rfs_rndup_mblks(mp, resp->data_len, loaned_buffers);
} else {
resp->data_val = (caddr_t)iov.iov_base;
}
+
resp->mblk = mp;
if (!verror && offset + resp->data_len == va.va_size)
@@ -3315,7 +3364,7 @@ rfs4_op_read(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req,
else
resp->eof = FALSE;
- if (args->wlist) {
+ if (rdma_used) {
if (!rdma_setup_read_data4(args, resp)) {
*cs->statusp = resp->status = NFS4ERR_INVAL;
}
@@ -3337,7 +3386,7 @@ rfs4_op_read_free(nfs_resop4 *resop)
READ4res *resp = &resop->nfs_resop4_u.opread;
if (resp->status == NFS4_OK && resp->mblk != NULL) {
- freeb(resp->mblk);
+ freemsg(resp->mblk);
resp->mblk = NULL;
resp->data_val = NULL;
resp->data_len = 0;
diff --git a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c
index d6ac9bf407..040fbed7bd 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c
@@ -2159,7 +2159,7 @@ restore_svp(mntinfo4_t *mi, servinfo4_t *svp, servinfo4_t *origsvp)
}
static ushort_t nfs4_max_threads = 8; /* max number of active async threads */
-static uint_t nfs4_bsize = 32 * 1024; /* client `block' size */
+uint_t nfs4_bsize = 32 * 1024; /* client `block' size */
static uint_t nfs4_async_clusters = 1; /* # of reqs from each async queue */
static uint_t nfs4_cots_timeo = NFS_COTS_TIMEO;
diff --git a/usr/src/uts/common/fs/nfs/nfs4_xdr.c b/usr/src/uts/common/fs/nfs/nfs4_xdr.c
index e2e14cff8a..08e9546cf3 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_xdr.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_xdr.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -3350,7 +3350,6 @@ xdr_READ4args(XDR *xdrs, READ4args *objp)
static bool_t
xdr_READ4res(XDR *xdrs, READ4res *objp)
{
- int i, rndup;
mblk_t *mp;
if (xdrs->x_op == XDR_DECODE)
@@ -3378,12 +3377,6 @@ xdr_READ4res(XDR *xdrs, READ4res *objp)
mp = objp->mblk;
if (mp != NULL && xdrs->x_ops == &xdrmblk_ops) {
- mp->b_wptr += objp->data_len;
- rndup = BYTES_PER_XDR_UNIT -
- (objp->data_len % BYTES_PER_XDR_UNIT);
- if (rndup != BYTES_PER_XDR_UNIT)
- for (i = 0; i < rndup; i++)
- *mp->b_wptr++ = '\0';
if (xdrmblk_putmblk(xdrs, mp, objp->data_len) == TRUE) {
objp->mblk = NULL;
return (TRUE);
diff --git a/usr/src/uts/common/fs/nfs/nfs_server.c b/usr/src/uts/common/fs/nfs/nfs_server.c
index dc7a23b583..2f6e2bc8be 100644
--- a/usr/src/uts/common/fs/nfs/nfs_server.c
+++ b/usr/src/uts/common/fs/nfs/nfs_server.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -106,6 +106,9 @@ static struct modlinkage modlinkage = {
char _depends_on[] = "misc/klmmod";
+kmem_cache_t *nfs_xuio_cache;
+int nfs_loaned_buffers = 0;
+
int
_init(void)
{
@@ -139,6 +142,11 @@ _init(void)
/* setup DSS paths here; must be done before initial server startup */
rfs4_dss_paths = rfs4_dss_oldpaths = NULL;
+ /* initialize the copy reduction caches */
+
+ nfs_xuio_cache = kmem_cache_create("nfs_xuio_cache",
+ sizeof (nfs_xuio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
return (status);
}
@@ -3215,3 +3223,140 @@ do_rfs_label_check(bslabel_t *clabel, vnode_t *vp, int flag,
label_rele(tslabel);
return (result);
}
+
+/*
+ * Callback function to return the loaned buffers.
+ * Calls VOP_RETZCBUF() only after all uio_iov[]
+ * buffers are returned. nu_ref maintains the count.
+ */
+void
+rfs_free_xuio(void *free_arg)
+{
+ uint_t ref;
+ nfs_xuio_t *nfsuiop = (nfs_xuio_t *)free_arg;
+
+ ref = atomic_dec_uint_nv(&nfsuiop->nu_ref);
+
+ /*
+ * Call VOP_RETZCBUF() only when all the iov buffers
+ * are sent OTW.
+ */
+ if (ref != 0)
+ return;
+
+ if (((uio_t *)nfsuiop)->uio_extflg & UIO_XUIO) {
+ (void) VOP_RETZCBUF(nfsuiop->nu_vp, (xuio_t *)free_arg, NULL,
+ NULL);
+ VN_RELE(nfsuiop->nu_vp);
+ }
+
+ kmem_cache_free(nfs_xuio_cache, free_arg);
+}
+
+xuio_t *
+rfs_setup_xuio(vnode_t *vp)
+{
+ nfs_xuio_t *nfsuiop;
+
+ nfsuiop = kmem_cache_alloc(nfs_xuio_cache, KM_SLEEP);
+
+ bzero(nfsuiop, sizeof (nfs_xuio_t));
+ nfsuiop->nu_vp = vp;
+
+ /*
+ * ref count set to 1. more may be added
+ * if multiple mblks refer to multiple iov's.
+ * This is done in uio_to_mblk().
+ */
+
+ nfsuiop->nu_ref = 1;
+
+ nfsuiop->nu_frtn.free_func = rfs_free_xuio;
+ nfsuiop->nu_frtn.free_arg = (char *)nfsuiop;
+
+ nfsuiop->nu_uio.xu_type = UIOTYPE_ZEROCOPY;
+
+ return (&nfsuiop->nu_uio);
+}
+
+mblk_t *
+uio_to_mblk(uio_t *uiop)
+{
+ struct iovec *iovp;
+ int i;
+ mblk_t *mp, *mp1;
+ nfs_xuio_t *nfsuiop = (nfs_xuio_t *)uiop;
+
+ if (uiop->uio_iovcnt == 0)
+ return (NULL);
+
+ iovp = uiop->uio_iov;
+ mp = mp1 = esballoca((uchar_t *)iovp->iov_base, iovp->iov_len,
+ BPRI_MED, &nfsuiop->nu_frtn);
+ ASSERT(mp != NULL);
+
+ mp->b_wptr += iovp->iov_len;
+ mp->b_datap->db_type = M_DATA;
+
+ for (i = 1; i < uiop->uio_iovcnt; i++) {
+ iovp = (uiop->uio_iov + i);
+
+ mp1->b_cont = esballoca(
+ (uchar_t *)iovp->iov_base, iovp->iov_len, BPRI_MED,
+ &nfsuiop->nu_frtn);
+
+ mp1 = mp1->b_cont;
+ ASSERT(mp1 != NULL);
+ mp1->b_wptr += iovp->iov_len;
+ mp1->b_datap->db_type = M_DATA;
+ }
+
+ nfsuiop->nu_ref = uiop->uio_iovcnt;
+
+ return (mp);
+}
+
+void
+rfs_rndup_mblks(mblk_t *mp, uint_t len, int buf_loaned)
+{
+ int i, rndup;
+ int alloc_err = 0;
+ mblk_t *rmp;
+
+ rndup = BYTES_PER_XDR_UNIT - (len % BYTES_PER_XDR_UNIT);
+
+ /* single mblk_t non copy-reduction case */
+ if (!buf_loaned) {
+ mp->b_wptr += len;
+ if (rndup != BYTES_PER_XDR_UNIT) {
+ for (i = 0; i < rndup; i++)
+ *mp->b_wptr++ = '\0';
+ }
+ return;
+ }
+
+ /* no need for extra rndup */
+ if (rndup == BYTES_PER_XDR_UNIT)
+ return;
+
+ while (mp->b_cont)
+ mp = mp->b_cont;
+
+ /*
+ * In case of copy-reduction mblks, the size of the mblks
+ * are fixed and are of the size of the loaned buffers.
+ * Allocate a roundup mblk and chain it to the data
+ * buffers. This is sub-optimal, but not expected to
+ * happen in regular common workloads.
+ */
+
+ rmp = allocb_wait(rndup, BPRI_MED, STR_NOSIG, &alloc_err);
+ ASSERT(rmp != NULL);
+ ASSERT(alloc_err == 0);
+
+ for (i = 0; i < rndup; i++)
+ *rmp->b_wptr++ = '\0';
+
+ rmp->b_datap->db_type = M_DATA;
+ mp->b_cont = rmp;
+}
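
The buffer lifetime in the functions above is driven by STREAMS free routines: uio_to_mblk() wraps each iovec in an esballoca() mblk whose frtn_t callback is rfs_free_xuio(), and sets nu_ref to uio_iovcnt, one reference per mblk. As the transport frees each mblk after transmission, rfs_free_xuio() decrements nu_ref; only the final release calls VOP_RETZCBUF() to hand the loaned buffers back to the filesystem and VN_RELE() to drop the hold taken in the read path. This is also why the read code switches from freeb() to freemsg(): the reply may now be a chain of mblks rather than a single block.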
diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c
index 2f7aa751ad..acdfdb36a1 100644
--- a/usr/src/uts/common/fs/vnode.c
+++ b/usr/src/uts/common/fs/vnode.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -362,6 +362,12 @@ static const fs_operation_trans_def_t vn_ops_table[] = {
(fs_generic_func_p) fs_vnevent_nosupport,
(fs_generic_func_p) fs_vnevent_nosupport,
+ VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf),
+ fs_nosys, fs_nosys,
+
NULL, 0, NULL, NULL
};
@@ -522,6 +528,10 @@ create_vopstats_template()
kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
/* VOP_VNEVENT */
kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
+ /* VOP_REQZCBUF */
+ kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
+ /* VOP_RETZCBUF */
+ kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);
return (vsp);
}
@@ -4151,6 +4161,31 @@ fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
return (err);
}
+int
+fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
+ caller_context_t *ct)
+{
+ int err;
+
+ if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
+ return (ENOTSUP);
+ err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct);
+ VOPSTATS_UPDATE(vp, reqzcbuf);
+ return (err);
+}
+
+int
+fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
+{
+ int err;
+
+ if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
+ return (ENOTSUP);
+ err = (*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct);
+ VOPSTATS_UPDATE(vp, retzcbuf);
+ return (err);
+}
+
/*
* Default destructor
* Needed because NULL destructor means that the key is unused
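
Note that fop_reqzcbuf() and fop_retzcbuf() refuse the request with ENOTSUP unless the underlying filesystem has advertised VFSFT_ZEROCOPY_SUPPORTED (which zfs_domount() now sets; see zfs_vfsops.c below). A consumer therefore probes for support simply by calling VOP_REQZCBUF() and falling back to the ordinary copy path on any non-zero return, exactly as the NFS server code above does.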
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index 9c4fb291ca..8e03c48a23 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -1241,14 +1241,31 @@ arc_return_buf(arc_buf_t *buf, void *tag)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
- ASSERT(hdr->b_state == arc_anon);
ASSERT(buf->b_data != NULL);
- VERIFY(refcount_remove(&hdr->b_refcnt, arc_onloan_tag) == 0);
- VERIFY(refcount_add(&hdr->b_refcnt, tag) == 1);
+ (void) refcount_add(&hdr->b_refcnt, tag);
+ (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
}
+/* Detach an arc_buf from a dbuf (tag) */
+void
+arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr;
+
+ rw_enter(&buf->b_lock, RW_WRITER);
+ ASSERT(buf->b_data != NULL);
+ hdr = buf->b_hdr;
+ (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
+ (void) refcount_remove(&hdr->b_refcnt, tag);
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+
+ atomic_add_64(&arc_loaned_bytes, hdr->b_size);
+ rw_exit(&buf->b_lock);
+}
+
static arc_buf_t *
arc_buf_clone(arc_buf_t *from)
{
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index ed2dc455de..b1f20af319 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -406,6 +406,29 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
}
}
+/*
+ * Loan out an arc_buf for read. Return the loaned arc_buf.
+ */
+arc_buf_t *
+dbuf_loan_arcbuf(dmu_buf_impl_t *db)
+{
+ arc_buf_t *abuf;
+
+ mutex_enter(&db->db_mtx);
+ if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
+ int blksz = db->db.db_size;
+ mutex_exit(&db->db_mtx);
+ abuf = arc_loan_buf(db->db_dnode->dn_objset->os_spa, blksz);
+ bcopy(db->db.db_data, abuf->b_data, blksz);
+ } else {
+ abuf = db->db_buf;
+ arc_loan_inuse_buf(abuf, db);
+ dbuf_set_data(db, NULL);
+ mutex_exit(&db->db_mtx);
+ }
+ return (abuf);
+}
+
uint64_t
dbuf_whichblock(dnode_t *dn, uint64_t offset)
{
@@ -1162,7 +1185,6 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
ASSERT(db->db_blkid != DB_BONUS_BLKID);
mutex_enter(&db->db_mtx);
-
/*
* If this buffer is not dirty, we're done.
*/
@@ -1341,9 +1363,11 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
(void) dbuf_dirty(db, tx);
bcopy(buf->b_data, db->db.db_data, db->db.db_size);
VERIFY(arc_buf_remove_ref(buf, db) == 1);
+ xuio_stat_wbuf_copied();
return;
}
+ xuio_stat_wbuf_nocopy();
if (db->db_state == DB_CACHED) {
dbuf_dirty_record_t *dr = db->db_last_dirty;
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index d3dfc21ac1..2d0927bd44 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -661,12 +661,136 @@ dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_buf_rele_array(dbp, numbufs, FTAG);
}
+/*
+ * DMU support for xuio
+ */
+kstat_t *xuio_ksp = NULL;
+
+int
+dmu_xuio_init(xuio_t *xuio, int nblk)
+{
+ dmu_xuio_t *priv;
+ uio_t *uio = &xuio->xu_uio;
+
+ uio->uio_iovcnt = nblk;
+ uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);
+
+ priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
+ priv->cnt = nblk;
+ priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
+ priv->iovp = uio->uio_iov;
+ XUIO_XUZC_PRIV(xuio) = priv;
+
+ if (XUIO_XUZC_RW(xuio) == UIO_READ)
+ XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
+ else
+ XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);
+
+ return (0);
+}
+
+void
+dmu_xuio_fini(xuio_t *xuio)
+{
+ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+ int nblk = priv->cnt;
+
+ kmem_free(priv->iovp, nblk * sizeof (iovec_t));
+ kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
+ kmem_free(priv, sizeof (dmu_xuio_t));
+
+ if (XUIO_XUZC_RW(xuio) == UIO_READ)
+ XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
+ else
+ XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
+}
+
+/*
+ * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
+ * and increase priv->next by 1.
+ */
+int
+dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
+{
+ struct iovec *iov;
+ uio_t *uio = &xuio->xu_uio;
+ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+ int i = priv->next++;
+
+ ASSERT(i < priv->cnt);
+ ASSERT(off + n <= arc_buf_size(abuf));
+ iov = uio->uio_iov + i;
+ iov->iov_base = (char *)abuf->b_data + off;
+ iov->iov_len = n;
+ priv->bufs[i] = abuf;
+ return (0);
+}
+
+int
+dmu_xuio_cnt(xuio_t *xuio)
+{
+ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+ return (priv->cnt);
+}
+
+arc_buf_t *
+dmu_xuio_arcbuf(xuio_t *xuio, int i)
+{
+ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+
+ ASSERT(i < priv->cnt);
+ return (priv->bufs[i]);
+}
+
+void
+dmu_xuio_clear(xuio_t *xuio, int i)
+{
+ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+
+ ASSERT(i < priv->cnt);
+ priv->bufs[i] = NULL;
+}
+
+static void
+xuio_stat_init(void)
+{
+ xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (xuio_ksp != NULL) {
+ xuio_ksp->ks_data = &xuio_stats;
+ kstat_install(xuio_ksp);
+ }
+}
+
+static void
+xuio_stat_fini(void)
+{
+ if (xuio_ksp != NULL) {
+ kstat_delete(xuio_ksp);
+ xuio_ksp = NULL;
+ }
+}
+
+void
+xuio_stat_wbuf_copied()
+{
+ XUIOSTAT_BUMP(xuiostat_wbuf_copied);
+}
+
+void
+xuio_stat_wbuf_nocopy()
+{
+ XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
+}
+
#ifdef _KERNEL
int
dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
{
dmu_buf_t **dbp;
int numbufs, i, err;
+ xuio_t *xuio = NULL;
/*
* NB: we could do this block-at-a-time, but it's nice
@@ -677,6 +801,9 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
if (err)
return (err);
+ if (uio->uio_extflg == UIO_XUIO)
+ xuio = (xuio_t *)uio;
+
for (i = 0; i < numbufs; i++) {
int tocpy;
int bufoff;
@@ -687,8 +814,24 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
bufoff = uio->uio_loffset - db->db_offset;
tocpy = (int)MIN(db->db_size - bufoff, size);
- err = uiomove((char *)db->db_data + bufoff, tocpy,
- UIO_READ, uio);
+ if (xuio) {
+ dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+ arc_buf_t *dbuf_abuf = dbi->db_buf;
+ arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
+ err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
+ if (!err) {
+ uio->uio_resid -= tocpy;
+ uio->uio_loffset += tocpy;
+ }
+
+ if (abuf == dbuf_abuf)
+ XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
+ else
+ XUIOSTAT_BUMP(xuiostat_rbuf_copied);
+ } else {
+ err = uiomove((char *)db->db_data + bufoff, tocpy,
+ UIO_READ, uio);
+ }
if (err)
break;
@@ -857,6 +1000,7 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
dmu_write(dn->dn_objset, dn->dn_object, offset, blksz,
buf->b_data, tx);
dmu_return_arcbuf(buf);
+ XUIOSTAT_BUMP(xuiostat_wbuf_copied);
}
}
@@ -1369,6 +1513,7 @@ dmu_init(void)
zfetch_init();
arc_init();
l2arc_init();
+ xuio_stat_init();
}
void
@@ -1379,4 +1524,5 @@ dmu_fini(void)
dnode_fini();
dbuf_fini();
l2arc_fini();
+ xuio_stat_fini();
}
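
The new counters are installed as a virtual kstat named xuio_stats under the zfs module, so the behaviour of the loan path can be observed at run time (for example with kstat -m zfs -n xuio_stats): onloan_read_buf/onloan_write_buf track buffers currently out on loan, while the *_copied/*_nocopy pairs record whether loaning a read buffer or assigning a write buffer required a bcopy.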
diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h
index a4f4964e11..c528fac1a6 100644
--- a/usr/src/uts/common/fs/zfs/sys/arc.h
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -87,6 +87,7 @@ arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,
arc_buf_contents_t type);
arc_buf_t *arc_loan_buf(spa_t *spa, int size);
void arc_return_buf(arc_buf_t *buf, void *tag);
+void arc_loan_inuse_buf(arc_buf_t *buf, void *tag);
void arc_buf_add_ref(arc_buf_t *buf, void *tag);
int arc_buf_remove_ref(arc_buf_t *buf, void *tag);
int arc_buf_size(arc_buf_t *buf);
diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h
index 6e2a66a2fa..d99ade07f8 100644
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -267,6 +267,7 @@ void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
void dbuf_clear(dmu_buf_impl_t *db);
void dbuf_evict(dmu_buf_impl_t *db);
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
index e229ca3bd8..b41bc96c38 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -45,6 +45,7 @@ extern "C" {
#endif
struct uio;
+struct xuio;
struct page;
struct vnode;
struct spa;
@@ -500,6 +501,15 @@ struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
void dmu_return_arcbuf(struct arc_buf *buf);
void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
dmu_tx_t *tx);
+int dmu_xuio_init(struct xuio *uio, int niov);
+void dmu_xuio_fini(struct xuio *uio);
+int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off,
+ size_t n);
+int dmu_xuio_cnt(struct xuio *uio);
+struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i);
+void dmu_xuio_clear(struct xuio *uio, int i);
+void xuio_stat_wbuf_copied();
+void xuio_stat_wbuf_nocopy();
extern int zfs_prefetch_disable;
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h
index 1e16da6b97..22f9f5f8c8 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -232,6 +232,39 @@ extern "C" {
struct objset;
struct dmu_pool;
+typedef struct dmu_xuio {
+ int next;
+ int cnt;
+ struct arc_buf **bufs;
+ iovec_t *iovp;
+} dmu_xuio_t;
+
+typedef struct xuio_stats {
+ /* loaned yet not returned arc_buf */
+ kstat_named_t xuiostat_onloan_rbuf;
+ kstat_named_t xuiostat_onloan_wbuf;
+ /* whether a copy is made when loaning out a read buffer */
+ kstat_named_t xuiostat_rbuf_copied;
+ kstat_named_t xuiostat_rbuf_nocopy;
+ /* whether a copy is made when assigning a write buffer */
+ kstat_named_t xuiostat_wbuf_copied;
+ kstat_named_t xuiostat_wbuf_nocopy;
+} xuio_stats_t;
+
+static xuio_stats_t xuio_stats = {
+ { "onloan_read_buf", KSTAT_DATA_UINT64 },
+ { "onloan_write_buf", KSTAT_DATA_UINT64 },
+ { "read_buf_copied", KSTAT_DATA_UINT64 },
+ { "read_buf_nocopy", KSTAT_DATA_UINT64 },
+ { "write_buf_copied", KSTAT_DATA_UINT64 },
+ { "write_buf_nocopy", KSTAT_DATA_UINT64 }
+};
+
+#define XUIOSTAT_INCR(stat, val) \
+ atomic_add_64(&xuio_stats.stat.value.ui64, (val))
+#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1)
+
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
index 0a262cbe21..6759a812ed 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -1115,6 +1115,7 @@ zfs_domount(vfs_t *vfsp, char *osname)
vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
}
+ vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
uint64_t pval;
diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c
index 9d9fe50aa9..d59c7625ec 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -447,6 +447,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
ssize_t n, nbytes;
int error;
rl_t *rl;
+ xuio_t *xuio = NULL;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
@@ -507,6 +508,35 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
+ if ((uio->uio_extflg == UIO_XUIO) &&
+ (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
+ int nblk;
+ int blksz = zp->z_blksz;
+ uint64_t offset = uio->uio_loffset;
+
+ xuio = (xuio_t *)uio;
+ if ((ISP2(blksz))) {
+ nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
+ blksz)) / blksz;
+ } else {
+ ASSERT(offset + n <= blksz);
+ nblk = 1;
+ }
+ dmu_xuio_init(xuio, nblk);
+
+ if (vn_has_cached_data(vp)) {
+ /*
+ * For simplicity, we always allocate a full buffer
+ * even if we only expect to read a portion of a block.
+ */
+ while (--nblk >= 0) {
+ dmu_xuio_add(xuio,
+ dmu_request_arcbuf(zp->z_dbuf, blksz),
+ 0, blksz);
+ }
+ }
+ }
+
while (n > 0) {
nbytes = MIN(n, zfs_read_chunk_size -
P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
@@ -524,7 +554,6 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
n -= nbytes;
}
-
out:
zfs_range_unlock(rl);
@@ -570,6 +599,12 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
uint64_t pflags;
int error;
arc_buf_t *abuf;
+ iovec_t *aiov;
+ xuio_t *xuio = NULL;
+ int i_iov = 0;
+ int iovcnt = uio->uio_iovcnt;
+ iovec_t *iovp = uio->uio_iov;
+ int write_eof;
/*
* Fasttrack empty write
@@ -619,8 +654,13 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
/*
* Pre-fault the pages to ensure slow (eg NFS) pages
* don't hold up txg.
+ * Skip this if uio contains loaned arc_buf.
*/
- uio_prefaultpages(n, uio);
+ if ((uio->uio_extflg == UIO_XUIO) &&
+ (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
+ xuio = (xuio_t *)uio;
+ else
+ uio_prefaultpages(n, uio);
/*
* If in append mode, set the io offset pointer to eof.
@@ -659,6 +699,9 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
if ((woff + n) > limit || woff > (limit - n))
n = limit - woff;
+ /* Will this write extend the file length? */
+ write_eof = (woff + n > zp->z_phys->zp_size);
+
end_size = MAX(zp->z_phys->zp_size, woff + n);
/*
@@ -669,7 +712,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
while (n > 0) {
abuf = NULL;
woff = uio->uio_loffset;
-
again:
if (zfs_usergroup_overquota(zfsvfs,
B_FALSE, zp->z_phys->zp_uid) ||
@@ -681,16 +723,28 @@ again:
break;
}
- /*
- * If dmu_assign_arcbuf() is expected to execute with minimum
- * overhead loan an arc buffer and copy user data to it before
- * we enter a txg. This avoids holding a txg forever while we
- * pagefault on a hanging NFS server mapping.
- */
- if (abuf == NULL && n >= max_blksz &&
+ if (xuio && abuf == NULL) {
+ ASSERT(i_iov < iovcnt);
+ aiov = &iovp[i_iov];
+ abuf = dmu_xuio_arcbuf(xuio, i_iov);
+ dmu_xuio_clear(xuio, i_iov);
+ DTRACE_PROBE3(zfs_cp_write, int, i_iov,
+ iovec_t *, aiov, arc_buf_t *, abuf);
+ ASSERT((aiov->iov_base == abuf->b_data) ||
+ ((char *)aiov->iov_base - (char *)abuf->b_data +
+ aiov->iov_len == arc_buf_size(abuf)));
+ i_iov++;
+ } else if (abuf == NULL && n >= max_blksz &&
woff >= zp->z_phys->zp_size &&
P2PHASE(woff, max_blksz) == 0 &&
zp->z_blksz == max_blksz) {
+ /*
+ * This write covers a full block. "Borrow" a buffer
+ * from the dmu so that we can fill it before we enter
+ * a transaction. This avoids the possibility of
+ * holding up the transaction if the data copy hangs
+ * up on a pagefault (e.g., from an NFS server mapping).
+ */
size_t cbytes;
abuf = dmu_request_arcbuf(zp->z_dbuf, max_blksz);
@@ -755,8 +809,24 @@ again:
tx_bytes -= uio->uio_resid;
} else {
tx_bytes = nbytes;
- ASSERT(tx_bytes == max_blksz);
- dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx);
+ ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
+ /*
+ * If this is not a full block write, but we are
+ * extending the file past EOF and this data starts
+ * block-aligned, use assign_arcbuf(). Otherwise,
+ * write via dmu_write().
+ */
+ if (tx_bytes < max_blksz && (!write_eof ||
+ aiov->iov_base != abuf->b_data)) {
+ ASSERT(xuio);
+ dmu_write(zfsvfs->z_os, zp->z_id, woff,
+ aiov->iov_len, aiov->iov_base, tx);
+ dmu_return_arcbuf(abuf);
+ xuio_stat_wbuf_copied();
+ } else {
+ ASSERT(xuio || tx_bytes == max_blksz);
+ dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx);
+ }
ASSERT(tx_bytes <= uio->uio_resid);
uioskip(uio, tx_bytes);
}
@@ -4571,6 +4641,160 @@ zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
}
/*
+ * Tunable, both must be a power of 2.
+ *
+ * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf
+ * zcr_blksz_max: if set to less than the file block size, allow loaning out of
+ * an arcbuf for a partial block read
+ */
+int zcr_blksz_min = (1 << 10); /* 1K */
+int zcr_blksz_max = (1 << 17); /* 128K */
+
+/*ARGSUSED*/
+static int
+zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
+ caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int max_blksz = zfsvfs->z_max_blksz;
+ uio_t *uio = &xuio->xu_uio;
+ ssize_t size = uio->uio_resid;
+ offset_t offset = uio->uio_loffset;
+ int blksz;
+ int fullblk, i;
+ arc_buf_t *abuf;
+ ssize_t maxsize;
+ int preamble, postamble;
+
+ if (xuio->xu_type != UIOTYPE_ZEROCOPY)
+ return (EINVAL);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+ switch (ioflag) {
+ case UIO_WRITE:
+ /*
+ * Loan out an arc_buf for write if write size is bigger than
+ * max_blksz, and the file's block size is also max_blksz.
+ */
+ blksz = max_blksz;
+ if (size < blksz || zp->z_blksz != blksz) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+ /*
+ * Caller requests buffers for write before knowing where the
+ * write offset might be (e.g. NFS TCP write).
+ */
+ if (offset == -1) {
+ preamble = 0;
+ } else {
+ preamble = P2PHASE(offset, blksz);
+ if (preamble) {
+ preamble = blksz - preamble;
+ size -= preamble;
+ }
+ }
+
+ postamble = P2PHASE(size, blksz);
+ size -= postamble;
+
+ fullblk = size / blksz;
+ dmu_xuio_init(xuio,
+ (preamble != 0) + fullblk + (postamble != 0));
+ DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
+ int, postamble, int,
+ (preamble != 0) + fullblk + (postamble != 0));
+
+ /*
+ * Have to fix iov base/len for partial buffers. They
+ * currently represent full arc_buf's.
+ */
+ if (preamble) {
+ /* data begins in the middle of the arc_buf */
+ abuf = dmu_request_arcbuf(zp->z_dbuf, blksz);
+ ASSERT(abuf);
+ dmu_xuio_add(xuio, abuf, blksz - preamble, preamble);
+ }
+
+ for (i = 0; i < fullblk; i++) {
+ abuf = dmu_request_arcbuf(zp->z_dbuf, blksz);
+ ASSERT(abuf);
+ dmu_xuio_add(xuio, abuf, 0, blksz);
+ }
+
+ if (postamble) {
+ /* data ends in the middle of the arc_buf */
+ abuf = dmu_request_arcbuf(zp->z_dbuf, blksz);
+ ASSERT(abuf);
+ dmu_xuio_add(xuio, abuf, 0, postamble);
+ }
+ break;
+ case UIO_READ:
+ /*
+ * Loan out an arc_buf for read if the read size is larger than
+ * the current file block size. Block alignment is not
+ * considered. Partial arc_buf will be loaned out for read.
+ */
+ blksz = zp->z_blksz;
+ if (blksz < zcr_blksz_min)
+ blksz = zcr_blksz_min;
+ if (blksz > zcr_blksz_max)
+ blksz = zcr_blksz_max;
+ /* avoid potential complexity of dealing with it */
+ if (blksz > max_blksz) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ maxsize = zp->z_phys->zp_size - uio->uio_loffset;
+ if (size > maxsize)
+ size = maxsize;
+
+ if (size < blksz || vn_has_cached_data(vp)) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+ break;
+ default:
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ uio->uio_extflg = UIO_XUIO;
+ XUIO_XUZC_RW(xuio) = ioflag;
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
+{
+ int i;
+ arc_buf_t *abuf;
+ int ioflag = XUIO_XUZC_RW(xuio);
+
+ ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
+
+ i = dmu_xuio_cnt(xuio);
+ while (i-- > 0) {
+ abuf = dmu_xuio_arcbuf(xuio, i);
+ /*
+ * if abuf == NULL, it must be a write buffer
+ * that has been returned in zfs_write().
+ */
+ if (abuf)
+ dmu_return_arcbuf(abuf);
+ ASSERT(abuf || ioflag == UIO_WRITE);
+ }
+
+ dmu_xuio_fini(xuio);
+ return (0);
+}
+
+/*
* Predeclare these here so that the compiler assumes that
* this is an "old style" function declaration that does
* not include arguments => we won't get type mismatch errors
@@ -4653,6 +4877,8 @@ const fs_operation_def_t zfs_fvnodeops_template[] = {
VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
+ VOPNAME_REQZCBUF, { .vop_reqzcbuf = zfs_reqzcbuf },
+ VOPNAME_RETZCBUF, { .vop_retzcbuf = zfs_retzcbuf },
NULL, NULL
};
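
For a UIO_WRITE request, zfs_reqzcbuf() above carves the byte range into at most one partial leading buffer (preamble), some number of full blocks, and at most one partial trailing buffer (postamble), and loans one arc_buf per piece. A worked example, assuming the file already uses 128K blocks:

    offset = 100K, size = 300K, blksz = 128K
    preamble  = blksz - P2PHASE(offset, blksz) = 128K - 100K = 28K
    size     -= preamble                               /* 272K left */
    postamble = P2PHASE(size, blksz)           = 16K
    size     -= postamble                              /* 256K left */
    fullblk   = size / blksz                   = 2
    dmu_xuio_init(xuio, 1 + 2 + 1)                     /* 4 loaned buffers */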
diff --git a/usr/src/uts/common/nfs/nfs.h b/usr/src/uts/common/nfs/nfs.h
index 77e5a397c2..1905e47c4f 100644
--- a/usr/src/uts/common/nfs/nfs.h
+++ b/usr/src/uts/common/nfs/nfs.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -1447,6 +1447,7 @@ struct READ3resok {
#ifdef _KERNEL
uint_t wlist_len;
struct clist *wlist;
+ frtn_t zcopy;
#endif
};
typedef struct READ3resok READ3resok;
@@ -2322,6 +2323,24 @@ extern int do_xattr_exists_check(vnode_t *, ulong_t *, cred_t *);
extern ts_label_t *nfs_getflabel(vnode_t *, struct exportinfo *);
extern boolean_t do_rfs_label_check(bslabel_t *, vnode_t *, int,
struct exportinfo *);
+
+/*
+ * Copy Reduction support.
+ * xuio_t wrapper with additional private data.
+ */
+
+typedef struct nfs_xuio {
+ xuio_t nu_uio;
+ vnode_t *nu_vp;
+ uint_t nu_ref;
+ frtn_t nu_frtn;
+} nfs_xuio_t;
+
+xuio_t *rfs_setup_xuio(vnode_t *);
+mblk_t *uio_to_mblk(uio_t *);
+void rfs_rndup_mblks(mblk_t *, uint_t, int);
+void rfs_free_xuio(void *);
+
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/rpc/rpcmod.c b/usr/src/uts/common/rpc/rpcmod.c
index cab50d67cd..891045d7f2 100644
--- a/usr/src/uts/common/rpc/rpcmod.c
+++ b/usr/src/uts/common/rpc/rpcmod.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -1059,8 +1059,6 @@ rpcmod_release(queue_t *q, mblk_t *bp)
#define MIR_SVC_ORDREL_TIMEOUT (10 * (60 * 1000L)) /* 10 minutes */
#define MIR_LASTFRAG 0x80000000 /* Record marker */
-#define DLEN(mp) (mp->b_cont ? msgdsize(mp) : (mp->b_wptr - mp->b_rptr))
-
#define MIR_SVC_QUIESCED(mir) \
(mir->mir_ref_cnt == 0 && mir->mir_inrservice == 0)
diff --git a/usr/src/uts/common/rpc/xdr.h b/usr/src/uts/common/rpc/xdr.h
index 4ef63d6baf..3db775893c 100644
--- a/usr/src/uts/common/rpc/xdr.h
+++ b/usr/src/uts/common/rpc/xdr.h
@@ -18,7 +18,7 @@
*
* CDDL HEADER END
*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -585,6 +585,8 @@ extern uint_t xdrrec_readbytes();
#endif
#else
+#define DLEN(mp) (mp->b_cont ? msgdsize(mp) : (mp->b_wptr - mp->b_rptr))
+
extern void xdrmem_create(XDR *, caddr_t, uint_t, enum xdr_op);
extern void xdrmblk_init(XDR *, mblk_t *, enum xdr_op, int);
extern bool_t xdrmblk_getmblk(XDR *, mblk_t **, uint_t *);
diff --git a/usr/src/uts/common/rpc/xdr_mblk.c b/usr/src/uts/common/rpc/xdr_mblk.c
index 053edb7603..0b06b827e0 100644
--- a/usr/src/uts/common/rpc/xdr_mblk.c
+++ b/usr/src/uts/common/rpc/xdr_mblk.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -361,20 +361,24 @@ xdrmblk_putbytes(XDR *xdrs, caddr_t addr, int len)
* not a multiple of BYTES_PER_XDR_UNIT, the caller has the option
* of making the data a BYTES_PER_XDR_UNIT multiple (b_wptr - b_rptr is
* a BYTES_PER_XDR_UNIT multiple), but in this case the caller has to ensure
- * that the filler bytes are initialized to zero. Note: Doesn't to work for
- * chained mblks.
+ * that the filler bytes are initialized to zero.
*/
bool_t
xdrmblk_putmblk(XDR *xdrs, mblk_t *m, uint_t len)
{
int32_t llen = (int32_t)len;
- if (((m->b_wptr - m->b_rptr) % BYTES_PER_XDR_UNIT) != 0)
+ if ((DLEN(m) % BYTES_PER_XDR_UNIT) != 0)
return (FALSE);
if (!xdrmblk_putint32(xdrs, &llen))
return (FALSE);
+
/* LINTED pointer alignment */
((mblk_t *)xdrs->x_base)->b_cont = m;
+
+ /* base points to the last mblk */
+ while (m->b_cont)
+ m = m->b_cont;
xdrs->x_base = (caddr_t)m;
xdrs->x_handy = 0;
return (TRUE);
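
The xdr_mblk.c change is what allows the loaned buffers to reach the wire: xdrmblk_putmblk() previously rejected chained mblks, but the zero-copy read path now hands it exactly that (one esballoca() mblk per loaned buffer, plus possibly a small roundup mblk from rfs_rndup_mblks()). The length check therefore uses DLEN(), which this commit moves from rpcmod.c to xdr.h so it is visible here, and x_base is advanced to the last mblk in the chain so that subsequent encoding appends after the data.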
diff --git a/usr/src/uts/common/sys/fem.h b/usr/src/uts/common/sys/fem.h
index 84defb057c..9b3cd142e4 100644
--- a/usr/src/uts/common/sys/fem.h
+++ b/usr/src/uts/common/sys/fem.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_FEM_H
#define _SYS_FEM_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/mutex.h>
#include <sys/pathname.h>
@@ -260,7 +258,13 @@ struct fem_head {
struct shrlock *shr, int flag, cred_t *cr, \
caller_context_t *ct); \
int (*femop_vnevent)(femarg_t *vf, vnevent_t vnevent, \
- vnode_t *dvp, char *cname, caller_context_t *ct)
+ vnode_t *dvp, char *cname, \
+ caller_context_t *ct); \
+ int (*femop_reqzcbuf)(femarg_t *vf, enum uio_rw ioflag, \
+ xuio_t *xuio, cred_t *cr, \
+ caller_context_t *ct); \
+ int (*femop_retzcbuf)(femarg_t *vf, xuio_t *xuio, cred_t *cr, \
+ caller_context_t *ct)
/* NB: No ";" */
struct fem {
@@ -392,6 +396,10 @@ extern int vnext_shrlock(femarg_t *vf, int cmd, struct shrlock *shr,
int flag, cred_t *cr, caller_context_t *ct);
extern int vnext_vnevent(femarg_t *vf, vnevent_t vevent, vnode_t *dvp,
char *cname, caller_context_t *ct);
+extern int vnext_reqzcbuf(femarg_t *vf, enum uio_rw ioflag, xuio_t *xuiop,
+ cred_t *cr, caller_context_t *ct);
+extern int vnext_retzcbuf(femarg_t *vf, xuio_t *xuiop, cred_t *cr,
+ caller_context_t *ct);
extern int vfsnext_mount(fsemarg_t *vf, vnode_t *mvp, struct mounta *uap,
cred_t *cr);
diff --git a/usr/src/uts/common/sys/uio.h b/usr/src/uts/common/sys/uio.h
index 248443f9a5..7255a2fa67 100644
--- a/usr/src/uts/common/sys/uio.h
+++ b/usr/src/uts/common/sys/uio.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -133,6 +133,49 @@ typedef struct uioa_s {
uioa_page_t uioa_locked[UIOA_IOV_MAX]; /* Per iov locked pages */
} uioa_t;
+/*
+ * uio extensions
+ *
+ * PSARC 2009/478: Copy Reduction Interfaces
+ */
+typedef enum xuio_type {
+ UIOTYPE_ASYNCIO,
+ UIOTYPE_ZEROCOPY
+} xuio_type_t;
+
+typedef struct xuio {
+ uio_t xu_uio; /* Embedded UIO structure */
+
+ /* Extended uio fields */
+ enum xuio_type xu_type; /* What kind of uio structure? */
+ union {
+ /* Async I/O Support, intend to replace uioa_t. */
+ struct {
+ uint32_t xu_a_state; /* state of async i/o */
+ /* bytes that have been uioamove()ed */
+ ssize_t xu_a_mbytes;
+ uioa_page_t *xu_a_lcur; /* pointer into uioa_locked[] */
+ /* pointer into lcur->uioa_ppp[] */
+ void **xu_a_lppp;
+ void *xu_a_hwst[4]; /* opaque hardware state */
+ /* Per iov locked pages */
+ uioa_page_t xu_a_locked[UIOA_IOV_MAX];
+ } xu_aio;
+
+ /*
+ * Copy Reduction Support -- facilate loaning / returning of
+ * filesystem cache buffers.
+ */
+ struct {
+ int xu_zc_rw; /* read or write buffer */
+ void *xu_zc_priv; /* fs specific */
+ } xu_zc;
+ } xu_ext;
+} xuio_t;
+
+#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv
+#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw
+
#define UIOA_ALLOC 0x0001 /* allocated but not yet initialized */
#define UIOA_INIT 0x0002 /* initialized but not yet enabled */
#define UIOA_ENABLED 0x0004 /* enabled, asynch i/o active */
@@ -177,6 +220,7 @@ typedef enum uio_rw { UIO_READ, UIO_WRITE } uio_rw_t;
#define UIO_COPY_CACHED 0x0001 /* copy should not bypass caches */
#define UIO_ASYNC 0x0002 /* uio_t is really a uioa_t */
+#define UIO_XUIO 0x0004 /* Structure is xuio_t */
/*
* Global uioasync capability shadow state.
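
The xuio_t defined above embeds a plain uio_t as its first member, so a zero-copy request travels through the existing VOP_READ()/VOP_WRITE() interfaces unchanged; the receiving filesystem recognises it by the extension flag and the xuio type, as zfs_read()/zfs_write() do. A minimal sketch of that test (uio_is_zerocopy is an illustrative helper, not part of the commit):

static boolean_t
uio_is_zerocopy(uio_t *uio)
{
    /* UIO_XUIO marks an embedded xuio_t; also check it asks for zero copy */
    return (uio->uio_extflg == UIO_XUIO &&
        ((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY ? B_TRUE : B_FALSE);
}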
diff --git a/usr/src/uts/common/sys/vfs.h b/usr/src/uts/common/sys/vfs.h
index 2e9679cf97..bae4e5b87f 100644
--- a/usr/src/uts/common/sys/vfs.h
+++ b/usr/src/uts/common/sys/vfs.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -302,7 +302,8 @@ typedef uint64_t vfs_feature_t;
#define VFSFT_SYSATTR_VIEWS 0x100000040 /* Supports sysattr view i/f */
#define VFSFT_ACCESS_FILTER 0x100000080 /* dirents filtered by access */
#define VFSFT_REPARSE 0x100000100 /* Supports reparse point */
-
+#define VFSFT_ZEROCOPY_SUPPORTED 0x100000200
+ /* Support loaning /returning cache buffer */
/*
* Argument structure for mount(2).
*
diff --git a/usr/src/uts/common/sys/vnode.h b/usr/src/uts/common/sys/vnode.h
index 97504aabf3..8b75225a64 100644
--- a/usr/src/uts/common/sys/vnode.h
+++ b/usr/src/uts/common/sys/vnode.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -118,6 +118,8 @@ typedef struct vopstats {
kstat_named_t ngetsecattr; /* VOP_GETSECATTR */
kstat_named_t nshrlock; /* VOP_SHRLOCK */
kstat_named_t nvnevent; /* VOP_VNEVENT */
+ kstat_named_t nreqzcbuf; /* VOP_REQZCBUF */
+ kstat_named_t nretzcbuf; /* VOP_RETZCBUF */
} vopstats_t;
/*
@@ -900,7 +902,11 @@ struct taskq;
int (*vop_shrlock)(vnode_t *, int, struct shrlock *, \
int, cred_t *, caller_context_t *); \
int (*vop_vnevent)(vnode_t *, vnevent_t, vnode_t *, \
- char *, caller_context_t *)
+ char *, caller_context_t *); \
+ int (*vop_reqzcbuf)(vnode_t *, enum uio_rw, xuio_t *, \
+ cred_t *, caller_context_t *); \
+ int (*vop_retzcbuf)(vnode_t *, xuio_t *, cred_t *, \
+ caller_context_t *)
/* NB: No ";" */
/*
@@ -997,6 +1003,9 @@ extern int fop_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
caller_context_t *);
extern int fop_vnevent(vnode_t *, vnevent_t, vnode_t *, char *,
caller_context_t *);
+extern int fop_reqzcbuf(vnode_t *, enum uio_rw, xuio_t *, cred_t *,
+ caller_context_t *);
+extern int fop_retzcbuf(vnode_t *, xuio_t *, cred_t *, caller_context_t *);
#endif /* _KERNEL */
@@ -1088,6 +1097,10 @@ extern int fop_vnevent(vnode_t *, vnevent_t, vnode_t *, char *,
fop_shrlock(vp, cmd, shr, f, cr, ct)
#define VOP_VNEVENT(vp, vnevent, dvp, fnm, ct) \
fop_vnevent(vp, vnevent, dvp, fnm, ct)
+#define VOP_REQZCBUF(vp, rwflag, xuiop, cr, ct) \
+ fop_reqzcbuf(vp, rwflag, xuiop, cr, ct)
+#define VOP_RETZCBUF(vp, xuiop, cr, ct) \
+ fop_retzcbuf(vp, xuiop, cr, ct)
#define VOPNAME_OPEN "open"
#define VOPNAME_CLOSE "close"
@@ -1133,6 +1146,8 @@ extern int fop_vnevent(vnode_t *, vnevent_t, vnode_t *, char *,
#define VOPNAME_SETSECATTR "setsecattr"
#define VOPNAME_SHRLOCK "shrlock"
#define VOPNAME_VNEVENT "vnevent"
+#define VOPNAME_REQZCBUF "reqzcbuf"
+#define VOPNAME_RETZCBUF "retzcbuf"
/*
* Flags for VOP_LOOKUP