summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJason King <jason.brian.king@gmail.com>2021-04-17 09:08:24 +0000
committerAndy Fiddaman <omnios@citrus-it.co.uk>2021-10-07 09:11:03 +0000
commitaa693e996c2928c92cccd8a3efe91373e85a6967 (patch)
tree23d7431e48a5194bf8ae93968c3caedc6c8bc7a6
parent2d2dd8359f765a17f6caaa2d37d86837c0c40915 (diff)
downloadillumos-gate-aa693e996c2928c92cccd8a3efe91373e85a6967.tar.gz
13380 Add virtio-9p (aka VirtFS) filesystem sharing to bhyve
Portions contributed by: Andy Fiddaman <andy@omnios.org>
Reviewed by: Jason King <jason.brian.king@gmail.com>
Reviewed by: Jorge Schrauwen <sjorge@blackdot.be>
Approved by: Robert Mustacchi <rm@fingolfin.org>
-rw-r--r--exception_lists/cstyle2
-rw-r--r--exception_lists/hdrchk1
-rw-r--r--exception_lists/packaging6
-rw-r--r--exception_lists/wscheck2
-rw-r--r--usr/src/cmd/bhyve/Makefile5
-rw-r--r--usr/src/cmd/bhyve/README.sync6
-rw-r--r--usr/src/cmd/bhyve/pci_virtio_9p.c406
-rw-r--r--usr/src/lib/Makefile3
-rw-r--r--usr/src/lib/lib9p/COPYRIGHT47
-rw-r--r--usr/src/lib/lib9p/COPYRIGHT.descrip1
-rw-r--r--usr/src/lib/lib9p/Makefile41
-rw-r--r--usr/src/lib/lib9p/Makefile.com77
-rw-r--r--usr/src/lib/lib9p/amd64/Makefile19
-rw-r--r--usr/src/lib/lib9p/common/backend/backend.h69
-rw-r--r--usr/src/lib/lib9p/common/backend/fs.c3238
-rw-r--r--usr/src/lib/lib9p/common/backend/fs.h37
-rw-r--r--usr/src/lib/lib9p/common/connection.c215
-rw-r--r--usr/src/lib/lib9p/common/fcall.h624
-rw-r--r--usr/src/lib/lib9p/common/fid.h160
-rw-r--r--usr/src/lib/lib9p/common/genacl.c806
-rw-r--r--usr/src/lib/lib9p/common/genacl.h316
-rw-r--r--usr/src/lib/lib9p/common/hashtable.c276
-rw-r--r--usr/src/lib/lib9p/common/hashtable.h107
-rw-r--r--usr/src/lib/lib9p/common/illumos_endian.h26
-rw-r--r--usr/src/lib/lib9p/common/lib9p.h249
-rw-r--r--usr/src/lib/lib9p/common/lib9p_impl.h78
-rw-r--r--usr/src/lib/lib9p/common/linux_errno.h247
-rw-r--r--usr/src/lib/lib9p/common/log.c67
-rw-r--r--usr/src/lib/lib9p/common/log.h46
-rw-r--r--usr/src/lib/lib9p/common/pack.c996
-rw-r--r--usr/src/lib/lib9p/common/request.c1446
-rw-r--r--usr/src/lib/lib9p/common/rfuncs.c320
-rw-r--r--usr/src/lib/lib9p/common/rfuncs.h83
-rw-r--r--usr/src/lib/lib9p/common/sbuf/sbuf.c65
-rw-r--r--usr/src/lib/lib9p/common/sbuf/sbuf.h51
-rw-r--r--usr/src/lib/lib9p/common/threadpool.c469
-rw-r--r--usr/src/lib/lib9p/common/threadpool.h118
-rw-r--r--usr/src/lib/lib9p/common/transport/socket.c593
-rw-r--r--usr/src/lib/lib9p/common/transport/socket.h39
-rw-r--r--usr/src/lib/lib9p/common/utils.c1363
-rw-r--r--usr/src/lib/lib9p/mapfile-vers58
-rw-r--r--usr/src/man/man1m/bhyve.1m22
-rw-r--r--usr/src/man/man4/bhyve_config.415
-rw-r--r--usr/src/pkg/manifests/system-library-bhyve.mf4
44 files changed, 12810 insertions, 9 deletions
diff --git a/exception_lists/cstyle b/exception_lists/cstyle
index bf1856d5f0..3b15aa6700 100644
--- a/exception_lists/cstyle
+++ b/exception_lists/cstyle
@@ -1357,6 +1357,7 @@ usr/src/cmd/bhyve/pci_lpc.[ch]
usr/src/cmd/bhyve/pci_nvme.c
usr/src/cmd/bhyve/pci_passthru.c
usr/src/cmd/bhyve/pci_uart.c
+usr/src/cmd/bhyve/pci_virtio_9p.c
usr/src/cmd/bhyve/pci_virtio_block.c
usr/src/cmd/bhyve/pci_virtio_console.c
usr/src/cmd/bhyve/pci_virtio_net.c
@@ -1390,3 +1391,4 @@ usr/src/uts/i86pc/io/vmm/amd/amdvi_*.[ch]
usr/src/uts/i86pc/io/vmm/amd/ivrs_*.c
usr/src/uts/i86pc/sys/vmm.h
usr/src/uts/i86pc/sys/vmm_dev.h
+usr/src/lib/lib9p/common/*
diff --git a/exception_lists/hdrchk b/exception_lists/hdrchk
index fc022b3782..0c9c154ff0 100644
--- a/exception_lists/hdrchk
+++ b/exception_lists/hdrchk
@@ -433,3 +433,4 @@ usr/src/uts/i86pc/io/vmm/vmm_util.h
usr/src/uts/i86pc/io/vmm/x86.h
usr/src/uts/i86pc/sys/vmm.h
usr/src/uts/i86pc/sys/vmm_dev.h
+usr/src/lib/lib9p/common/*
diff --git a/exception_lists/packaging b/exception_lists/packaging
index 591b4b9711..47acb0988a 100644
--- a/exception_lists/packaging
+++ b/exception_lists/packaging
@@ -862,6 +862,12 @@ usr/lib/sparcv9/libdwarf.so sparc
usr/lib/libdwarf.so
#
+# lib9p is private
+#
+usr/include/lib9p.h
+usr/lib/amd64/lib9p.so i386
+
+#
# We're not quite ready to ship ctfconvert and ctfmerge
#
usr/bin/ctfconvert
diff --git a/exception_lists/wscheck b/exception_lists/wscheck
index 462546802f..fdebb77910 100644
--- a/exception_lists/wscheck
+++ b/exception_lists/wscheck
@@ -69,6 +69,7 @@ usr/src/cmd/bhyve/pci_lpc.[ch]
usr/src/cmd/bhyve/pci_nvme.c
usr/src/cmd/bhyve/pci_passthru.c
usr/src/cmd/bhyve/pci_uart.c
+usr/src/cmd/bhyve/pci_virtio_9p.c
usr/src/cmd/bhyve/pci_virtio_block.c
usr/src/cmd/bhyve/pci_virtio_console.c
usr/src/cmd/bhyve/pci_virtio_net.c
@@ -95,3 +96,4 @@ usr/src/cmd/bhyve/xmsr.[ch]
usr/src/cmd/bhyvectl/bhyvectl.c
usr/src/contrib/bhyve/*
usr/src/lib/libvmmapi/common/vmmapi.[ch]
+usr/src/lib/lib9p/common/*
diff --git a/usr/src/cmd/bhyve/Makefile b/usr/src/cmd/bhyve/Makefile
index bbc966d67f..4e54c6be42 100644
--- a/usr/src/cmd/bhyve/Makefile
+++ b/usr/src/cmd/bhyve/Makefile
@@ -56,6 +56,7 @@ SRCS = acpi.c \
pci_nvme.c \
pci_passthru.c \
pci_uart.c \
+ pci_virtio_9p.c \
pci_virtio_block.c \
pci_virtio_console.c \
pci_virtio_net.c \
@@ -115,6 +116,7 @@ CPPFLAGS = -I$(COMPAT)/bhyve -I$(CONTRIB)/bhyve \
-I$(COMPAT)/bhyve/amd64 -I$(CONTRIB)/bhyve/amd64 \
-I$(CONTRIB)/bhyve/dev/usb/controller \
-I$(CONTRIB)/bhyve/dev/mii \
+ -I$(SRC)/lib/lib9p/common \
-I$(SRC)/uts/common/io/e1000api \
$(CPPFLAGS.master) \
-I$(SRC)/uts/i86pc/io/vmm \
@@ -128,6 +130,8 @@ pci_nvme.o := SMOFF += kmalloc_wrong_size
pci_passthru.o := CERRWARN += -_gcc10=-Wno-address-of-packed-member
+pci_virtio_9p.o := SMOFF += kmalloc_wrong_size
+
pci_xhci.o := CERRWARN += -_gcc10=-Wno-address-of-packed-member
SMOFF += all_func_returns,leaks,no_if_block
@@ -136,6 +140,7 @@ SMOFF += all_func_returns,leaks,no_if_block
CSTD= $(CSTD_GNU99)
$(PROG) := LDLIBS += \
+ -l9p \
-lsocket \
-lnsl \
-ldlpi \
diff --git a/usr/src/cmd/bhyve/README.sync b/usr/src/cmd/bhyve/README.sync
index 4f71c1420e..bec61410ee 100644
--- a/usr/src/cmd/bhyve/README.sync
+++ b/usr/src/cmd/bhyve/README.sync
@@ -24,12 +24,6 @@ The draft Save/Restore functionality, added in FreeBSD commit
yet. It is not built by default in FreeBSD, so we're not interested in taking
it until it successfully endures more in-depth testing.
-The VirtFS filesystem sharing feature, added in FreeBSD commit
-100353cfbf882e23c911300ebd0cb458bd3ee975, has not been synced into illumos bhyve
-yet. It depends on the userland lib9p which needs a fair amount of work to
-build and run on illumos. The integration of this feature is being tracked in
-https://www.illumos.org/issues/13380
-
The stub usr/src/compat/bhyve/stdatomic.h file only includes enough glue
to satisfy the use of <stdatomic.h> in usr/src/cmd/bhyve/rfb.c, and in
particular assumes that atomic variables are sized as an int. If other bhyve
diff --git a/usr/src/cmd/bhyve/pci_virtio_9p.c b/usr/src/cmd/bhyve/pci_virtio_9p.c
new file mode 100644
index 0000000000..b3fdb2db2c
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_virtio_9p.c
@@ -0,0 +1,406 @@
+/*-
+ * Copyright (c) 2015 iXsystems Inc.
+ * Copyright (c) 2017-2018 Jakub Klama <jceel@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * VirtIO filesystem passthrough using 9p protocol.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+#include <sys/uio.h>
+#ifndef WITHOUT_CAPSICUM
+#include <sys/capsicum.h>
+#endif
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <pthread.h>
+
+#include <lib9p.h>
+#include <backend/fs.h>
+
+#include "bhyverun.h"
+#include "config.h"
+#include "debug.h"
+#include "pci_emul.h"
+#include "virtio.h"
+
+#ifndef __FreeBSD__
+#include "privileges.h"
+#endif
+
+#define VT9P_MAX_IOV 128
+#define VT9P_RINGSZ 256
+#define VT9P_MAXTAGSZ 256
+#define VT9P_CONFIGSPACESZ (VT9P_MAXTAGSZ + sizeof(uint16_t))
+
+static int pci_vt9p_debug;
+#define DPRINTF(params) if (pci_vt9p_debug) printf params
+#define WPRINTF(params) printf params
+
+/*
+ * Per-device softc
+ */
+struct pci_vt9p_softc {
+ struct virtio_softc vsc_vs;
+ struct vqueue_info vsc_vq;
+ pthread_mutex_t vsc_mtx;
+ uint64_t vsc_cfg;
+ uint64_t vsc_features;
+ char * vsc_rootpath;
+ struct pci_vt9p_config * vsc_config;
+ struct l9p_backend * vsc_fs_backend;
+ struct l9p_server * vsc_server;
+ struct l9p_connection * vsc_conn;
+};
+
+struct pci_vt9p_request {
+ struct pci_vt9p_softc * vsr_sc;
+ struct iovec * vsr_iov;
+ size_t vsr_niov;
+ size_t vsr_respidx;
+ size_t vsr_iolen;
+ uint16_t vsr_idx;
+};
+
+struct pci_vt9p_config {
+ uint16_t tag_len;
+ char tag[0];
+} __attribute__((packed));
+
+static int pci_vt9p_send(struct l9p_request *, const struct iovec *,
+ const size_t, const size_t, void *);
+static void pci_vt9p_drop(struct l9p_request *, const struct iovec *, size_t,
+ void *);
+static void pci_vt9p_reset(void *);
+static void pci_vt9p_notify(void *, struct vqueue_info *);
+static int pci_vt9p_cfgread(void *, int, int, uint32_t *);
+static void pci_vt9p_neg_features(void *, uint64_t);
+
+static struct virtio_consts vt9p_vi_consts = {
+ "vt9p", /* our name */
+ 1, /* we support 1 virtqueue */
+ VT9P_CONFIGSPACESZ, /* config reg size */
+ pci_vt9p_reset, /* reset */
+ pci_vt9p_notify, /* device-wide qnotify */
+ pci_vt9p_cfgread, /* read virtio config */
+ NULL, /* write virtio config */
+ pci_vt9p_neg_features, /* apply negotiated features */
+ (1 << 0), /* our capabilities */
+};
+
+
+static void
+pci_vt9p_reset(void *vsc)
+{
+ struct pci_vt9p_softc *sc;
+
+ sc = vsc;
+
+ DPRINTF(("vt9p: device reset requested !\n"));
+ vi_reset_dev(&sc->vsc_vs);
+}
+
+static void
+pci_vt9p_neg_features(void *vsc, uint64_t negotiated_features)
+{
+ struct pci_vt9p_softc *sc = vsc;
+
+ sc->vsc_features = negotiated_features;
+}
+
+static int
+pci_vt9p_cfgread(void *vsc, int offset, int size, uint32_t *retval)
+{
+ struct pci_vt9p_softc *sc = vsc;
+ void *ptr;
+
+ ptr = (uint8_t *)sc->vsc_config + offset;
+ memcpy(retval, ptr, size);
+ return (0);
+}
+
+static int
+pci_vt9p_get_buffer(struct l9p_request *req, struct iovec *iov, size_t *niov,
+ void *arg)
+{
+ struct pci_vt9p_request *preq = req->lr_aux;
+ size_t n = preq->vsr_niov - preq->vsr_respidx;
+
+ memcpy(iov, preq->vsr_iov + preq->vsr_respidx,
+ n * sizeof(struct iovec));
+ *niov = n;
+ return (0);
+}
+
+static int
+pci_vt9p_send(struct l9p_request *req, const struct iovec *iov,
+ const size_t niov, const size_t iolen, void *arg)
+{
+ struct pci_vt9p_request *preq = req->lr_aux;
+ struct pci_vt9p_softc *sc = preq->vsr_sc;
+
+ preq->vsr_iolen = iolen;
+
+ pthread_mutex_lock(&sc->vsc_mtx);
+ vq_relchain(&sc->vsc_vq, preq->vsr_idx, preq->vsr_iolen);
+ vq_endchains(&sc->vsc_vq, 1);
+ pthread_mutex_unlock(&sc->vsc_mtx);
+ free(preq);
+ return (0);
+}
+
+static void
+pci_vt9p_drop(struct l9p_request *req, const struct iovec *iov, size_t niov,
+ void *arg)
+{
+ struct pci_vt9p_request *preq = req->lr_aux;
+ struct pci_vt9p_softc *sc = preq->vsr_sc;
+
+ pthread_mutex_lock(&sc->vsc_mtx);
+ vq_relchain(&sc->vsc_vq, preq->vsr_idx, 0);
+ vq_endchains(&sc->vsc_vq, 1);
+ pthread_mutex_unlock(&sc->vsc_mtx);
+ free(preq);
+}
+
+/*
+ * Queue-notify callback: drain every pending descriptor chain from the
+ * virtqueue and hand each one to lib9p as an incoming request.
+ *
+ * vsc is the per-device softc; vq is the queue being notified.
+ *
+ * For each chain a pci_vt9p_request is allocated to carry the descriptor
+ * index and iovec layout; it is released by pci_vt9p_send() or
+ * pci_vt9p_drop() when lib9p finishes with the request.
+ */
+static void
+pci_vt9p_notify(void *vsc, struct vqueue_info *vq)
+{
+	struct iovec iov[VT9P_MAX_IOV];
+	struct pci_vt9p_softc *sc;
+	struct pci_vt9p_request *preq;
+	uint16_t idx, n, i;
+	uint16_t flags[VT9P_MAX_IOV];
+
+	sc = vsc;
+
+	while (vq_has_descs(vq)) {
+		/*
+		 * Allocate the request tracking structure before taking the
+		 * chain from the queue.  If the allocation fails we stop
+		 * without having consumed a chain that we would then have no
+		 * way to complete (presumably vq_getchain() advances the
+		 * queue -- confirm against virtio.c).
+		 */
+		preq = calloc(1, sizeof(struct pci_vt9p_request));
+#ifndef __FreeBSD__
+		if (preq == NULL) {
+			EPRINTLN("virtio-9p: allocation failure: %s",
+			    strerror(errno));
+			break;
+		}
+#endif
+		n = vq_getchain(vq, &idx, iov, VT9P_MAX_IOV, flags);
+
+		preq->vsr_sc = sc;
+		preq->vsr_idx = idx;
+		preq->vsr_iov = iov;
+		preq->vsr_niov = n;
+		preq->vsr_respidx = 0;
+
+		/*
+		 * Count readable descriptors: everything before the first
+		 * device-writable descriptor is the request, the remainder
+		 * is the response buffer.
+		 */
+		for (i = 0; i < n; i++) {
+			if (flags[i] & VRING_DESC_F_WRITE)
+				break;
+
+			preq->vsr_respidx++;
+		}
+
+		for (i = 0; i < n; i++) {
+			DPRINTF(("vt9p: vt9p_notify(): desc%d base=%p, "
+			    "len=%zu, flags=0x%04x\r\n", i, iov[i].iov_base,
+			    iov[i].iov_len, flags[i]));
+		}
+
+		l9p_connection_recv(sc->vsc_conn, iov, preq->vsr_respidx, preq);
+	}
+}
+
+/*
+ * Parse a legacy (comma separated) option string into config nodes on nvl.
+ *
+ * A token of the form "name=path" sets the "sharename" and "path" nodes;
+ * any other token is recorded as a boolean node set to true.  Only one
+ * "name=path" token is permitted.
+ *
+ * Returns 0 on success (including when opts is NULL) and -1 if the option
+ * string cannot be duplicated or more than one share name is given.
+ */
+static int
+pci_vt9p_legacy_config(nvlist_t *nvl, const char *opts)
+{
+	char *sharename = NULL, *tofree, *token, *tokens;
+	int ret = 0;
+
+	if (opts == NULL)
+		return (0);
+
+	/* strsep() modifies its argument, so work on a private copy. */
+	tokens = tofree = strdup(opts);
+	if (tofree == NULL) {
+		EPRINTLN("virtio-9p: option allocation failure: %s",
+		    strerror(errno));
+		return (-1);
+	}
+
+	while ((token = strsep(&tokens, ",")) != NULL) {
+		if (strchr(token, '=') != NULL) {
+			if (sharename != NULL) {
+				EPRINTLN(
+			    "virtio-9p: more than one share name given");
+				/* Fall out of the loop so the copy is freed. */
+				ret = -1;
+				break;
+			}
+
+			sharename = strsep(&token, "=");
+			set_config_value_node(nvl, "sharename", sharename);
+			set_config_value_node(nvl, "path", token);
+		} else
+			set_config_bool_node(nvl, token, true);
+	}
+	free(tofree);
+
+	return (ret);
+}
+
+/*
+ * Initialise a virtio-9p device instance from its configuration nodes.
+ *
+ * Reads the "ro", "path" and "sharename" nodes from nvl, opens the shared
+ * directory, allocates the softc and its config-space image (tag length
+ * followed by the tag), sets up the lib9p backend/server/connection and
+ * finally links the device into the virtio framework and PCI config space.
+ *
+ * Returns 0 on success, -1 if the shared directory cannot be opened and 1
+ * on any other failure.  Failures that occur before the lib9p backend has
+ * been created release the directory descriptor and the softc allocations.
+ */
+static int
+pci_vt9p_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
+{
+	struct pci_vt9p_softc *sc = NULL;
+	const char *value;
+	const char *sharename;
+	int rootfd;
+	int err;
+	bool ro;
+#ifndef WITHOUT_CAPSICUM
+	cap_rights_t rootcap;
+#endif
+
+	ro = get_config_bool_node_default(nvl, "ro", false);
+
+#ifndef __FreeBSD__
+	illumos_priv_add_min(PRIV_FILE_DAC_READ, "vt9p");
+	illumos_priv_add_min(PRIV_FILE_DAC_SEARCH, "vt9p");
+
+	/* Privileges that modify the filesystem are only for r/w shares. */
+	if (!ro) {
+		illumos_priv_add_min(PRIV_FILE_CHOWN, "vt9p");
+		illumos_priv_add_min(PRIV_FILE_CHOWN_SELF, "vt9p");
+		illumos_priv_add_min(PRIV_FILE_WRITE, "vt9p");
+		illumos_priv_add_min(PRIV_FILE_DAC_WRITE, "vt9p");
+		illumos_priv_add_min(PRIV_FILE_OWNER, "vt9p");
+		illumos_priv_add_min(PRIV_FILE_LINK_ANY, "vt9p");
+	}
+#endif
+
+	value = get_config_value_node(nvl, "path");
+	if (value == NULL) {
+		EPRINTLN("virtio-9p: path required");
+		return (1);
+	}
+	rootfd = open(value, O_DIRECTORY);
+	if (rootfd < 0) {
+		EPRINTLN("virtio-9p: failed to open '%s': %s", value,
+		    strerror(errno));
+		return (-1);
+	}
+
+	/* From here on, every local failure must release rootfd. */
+	sharename = get_config_value_node(nvl, "sharename");
+	if (sharename == NULL) {
+		EPRINTLN("virtio-9p: share name required");
+		goto fail;
+	}
+	/* The tag is not NUL terminated, so it may fill the whole buffer. */
+	if (strlen(sharename) > VT9P_MAXTAGSZ) {
+		EPRINTLN("virtio-9p: share name too long");
+		goto fail;
+	}
+
+	sc = calloc(1, sizeof(struct pci_vt9p_softc));
+#ifndef __FreeBSD__
+	if (sc == NULL) {
+		EPRINTLN("virtio-9p: soft state allocation failure: %s",
+		    strerror(errno));
+		goto fail;
+	}
+#endif
+	sc->vsc_config = calloc(1, sizeof(struct pci_vt9p_config) +
+	    VT9P_MAXTAGSZ);
+#ifndef __FreeBSD__
+	/* Check the config allocation itself, not the softc pointer. */
+	if (sc->vsc_config == NULL) {
+		EPRINTLN("virtio-9p: vsc_config allocation failure: %s",
+		    strerror(errno));
+		goto fail;
+	}
+#endif
+
+#ifndef WITHOUT_CAPSICUM
+	cap_rights_init(&rootcap,
+	    CAP_LOOKUP, CAP_ACL_CHECK, CAP_ACL_DELETE, CAP_ACL_GET,
+	    CAP_ACL_SET, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_FSTAT,
+	    CAP_CREATE, CAP_FCHMODAT, CAP_FCHOWNAT, CAP_FTRUNCATE,
+	    CAP_LINKAT_SOURCE, CAP_LINKAT_TARGET, CAP_MKDIRAT, CAP_MKNODAT,
+	    CAP_PREAD, CAP_PWRITE, CAP_RENAMEAT_SOURCE, CAP_RENAMEAT_TARGET,
+	    CAP_SEEK, CAP_SYMLINKAT, CAP_UNLINKAT, CAP_EXTATTR_DELETE,
+	    CAP_EXTATTR_GET, CAP_EXTATTR_LIST, CAP_EXTATTR_SET,
+	    CAP_FUTIMES, CAP_FSTATFS, CAP_FSYNC, CAP_FPATHCONF);
+
+	if (cap_rights_limit(rootfd, &rootcap) != 0)
+		goto fail;
+#endif
+
+	sc->vsc_config->tag_len = (uint16_t)strlen(sharename);
+	memcpy(sc->vsc_config->tag, sharename, sc->vsc_config->tag_len);
+
+	if (l9p_backend_fs_init(&sc->vsc_fs_backend, rootfd, ro) != 0) {
+		errno = ENXIO;
+		goto fail;
+	}
+
+	/*
+	 * The mutex is first needed once the device is linked into the
+	 * virtio framework below; creating it here keeps the failure path
+	 * above free of mutex teardown.
+	 */
+	pthread_mutex_init(&sc->vsc_mtx, NULL);
+
+	/*
+	 * NOTE(review): from this point rootfd has been handed to the lib9p
+	 * backend and sc is referenced by lib9p/virtio state, so the error
+	 * returns below deliberately do not free them here.
+	 */
+	if (l9p_server_init(&sc->vsc_server, sc->vsc_fs_backend) != 0) {
+		errno = ENXIO;
+		return (1);
+	}
+
+	if (l9p_connection_init(sc->vsc_server, &sc->vsc_conn) != 0) {
+		errno = EIO;
+		return (1);
+	}
+
+	sc->vsc_conn->lc_msize = L9P_MAX_IOV * PAGE_SIZE;
+	sc->vsc_conn->lc_lt.lt_get_response_buffer = pci_vt9p_get_buffer;
+	sc->vsc_conn->lc_lt.lt_send_response = pci_vt9p_send;
+	sc->vsc_conn->lc_lt.lt_drop_response = pci_vt9p_drop;
+
+	vi_softc_linkup(&sc->vsc_vs, &vt9p_vi_consts, sc, pi, &sc->vsc_vq);
+	sc->vsc_vs.vs_mtx = &sc->vsc_mtx;
+	sc->vsc_vq.vq_qsize = VT9P_RINGSZ;
+
+	/* initialize config space */
+	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_9P);
+	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
+	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_9P);
+	pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);
+
+	if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix()))
+		return (1);
+	vi_set_io_bar(&sc->vsc_vs, 0);
+
+	return (0);
+
+fail:
+	/* Preserve errno for the caller across the cleanup calls. */
+	err = errno;
+	if (sc != NULL) {
+		free(sc->vsc_config);
+		free(sc);
+	}
+	(void) close(rootfd);
+	errno = err;
+	return (1);
+}
+
+struct pci_devemu pci_de_v9p = {
+ .pe_emu = "virtio-9p",
+ .pe_legacy_config = pci_vt9p_legacy_config,
+ .pe_init = pci_vt9p_init,
+ .pe_barwrite = vi_pci_write,
+ .pe_barread = vi_pci_read
+};
+PCI_EMUL_SET(pci_de_v9p);
diff --git a/usr/src/lib/Makefile b/usr/src/lib/Makefile
index 5c796c3caf..2673e008d8 100644
--- a/usr/src/lib/Makefile
+++ b/usr/src/lib/Makefile
@@ -274,6 +274,7 @@ SUBDIRS += \
$($(MACH)_SUBDIRS)
i386_SUBDIRS= \
+ lib9p \
libfdisk \
libppt \
libsaveargs \
@@ -489,6 +490,7 @@ HDRSUBDIRS= \
$($(MACH)_HDRSUBDIRS)
i386_HDRSUBDIRS= \
+ lib9p \
libfdisk \
libppt \
libsaveargs \
@@ -581,6 +583,7 @@ gss_mechs/mech_krb5: libgss libresolv2 pkcs11 libkstat
gss_mechs/mech_spnego: gss_mechs/mech_krb5
hal: dbusdeps
krb5: gss_mechs/mech_krb5 libtecla libldap5
+lib9p: libsec libcustr
libads: libnsl
libadt_jni: libbsm
libadutils: libldap5 libresolv2
diff --git a/usr/src/lib/lib9p/COPYRIGHT b/usr/src/lib/lib9p/COPYRIGHT
new file mode 100644
index 0000000000..b02f09aabd
--- /dev/null
+++ b/usr/src/lib/lib9p/COPYRIGHT
@@ -0,0 +1,47 @@
+Copyright 2016 Jakub Klama <jceel@FreeBSD.org>
+All rights reserved
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted providing that the following conditions
+are met:
+1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+Some parts of the code are based on libixp (http://libs.suckless.org/libixp)
+library code released under following license:
+
+© 2005-2006 Anselm R. Garbe <garbeam@gmail.com>
+© 2006-2010 Kris Maglione <maglione.k at Gmail>
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/usr/src/lib/lib9p/COPYRIGHT.descrip b/usr/src/lib/lib9p/COPYRIGHT.descrip
new file mode 100644
index 0000000000..d854795482
--- /dev/null
+++ b/usr/src/lib/lib9p/COPYRIGHT.descrip
@@ -0,0 +1 @@
+lib9p library
diff --git a/usr/src/lib/lib9p/Makefile b/usr/src/lib/lib9p/Makefile
new file mode 100644
index 0000000000..65f8a88fae
--- /dev/null
+++ b/usr/src/lib/lib9p/Makefile
@@ -0,0 +1,41 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
+#
+
+include ../Makefile.lib
+
+$(BUILD64)SUBDIRS += $(MACH64)
+
+HDRS = lib9p.h
+HDRDIR = common
+CHECKHDRS =
+
+all:= TARGET= all
+install:= TARGET= install
+clean:= TARGET= clean
+clobber:= TARGET= clobber
+
+.KEEP_STATE:
+
+all install clean clobber: $(SUBDIRS)
+
+install_h: $(ROOTHDRS)
+check: $(CHECKHDRS)
+
+$(SUBDIRS): FRC
+ cd $@; pwd; $(MAKE) $(TARGET)
+
+FRC:
+
+include ../Makefile.targ
diff --git a/usr/src/lib/lib9p/Makefile.com b/usr/src/lib/lib9p/Makefile.com
new file mode 100644
index 0000000000..b04b210796
--- /dev/null
+++ b/usr/src/lib/lib9p/Makefile.com
@@ -0,0 +1,77 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
+#
+
+LIBRARY= lib9p.a
+VERS= .1
+
+OBJECTS= backend/fs.o \
+ connection.o \
+ genacl.o \
+ hashtable.o \
+ log.o \
+ pack.o \
+ request.o \
+ rfuncs.o \
+ sbuf/sbuf.o \
+ threadpool.o \
+ transport/socket.o \
+ utils.o
+HDRS = lib9p.h
+
+LOBJDIRS= backend transport sbuf
+
+include ../../Makefile.lib
+
+LIBS = $(DYNLIB)
+LDLIBS += -lc -lcustr -lsocket -lsec -lnvpair
+
+SRCDIR = ..
+
+CSTD = $(CSTD_GNU99)
+
+CFLAGS += $(CCVERBOSE)
+
+CPPFLAGS += -D__illumos__
+CPPFLAGS += -D_POSIX_PTHREAD_SEMANTICS -D__EXTENSIONS__
+CPPFLAGS += -I../common -I../common/backend
+$(NOT_RELEASE_BUILD)CPPFLAGS += -DL9P_DEBUG=L9P_DEBUG
+
+SMOFF += all_func_returns
+
+.KEEP_STATE:
+
+all: $(LIBS)
+
+$(LIBS): mkpicdirs
+
+mkpicdirs:
+ @mkdir -p $(LOBJDIRS:%=pics/%)
+
+pics/%.o: ../common/%.c
+ $(COMPILE.c) -o $@ $<
+ $(POST_PROCESS_O)
+
+pics/backend/%.o: ../common/backend/%.c
+ $(COMPILE.c) -o $@ $<
+ $(POST_PROCESS_O)
+
+pics/transport/%.o: ../common/transport/%.c
+ $(COMPILE.c) -o $@ $<
+ $(POST_PROCESS_O)
+
+$(ROOTHDRDIR)/%.h: ../common/%.h
+ $(INS.file)
+
+include ../../Makefile.targ
diff --git a/usr/src/lib/lib9p/amd64/Makefile b/usr/src/lib/lib9p/amd64/Makefile
new file mode 100644
index 0000000000..c3510fdb62
--- /dev/null
+++ b/usr/src/lib/lib9p/amd64/Makefile
@@ -0,0 +1,19 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
+#
+
+include ../Makefile.com
+include ../../Makefile.lib.64
+
+install: all $(ROOTLIBS64) $(ROOTLINKS64)
diff --git a/usr/src/lib/lib9p/common/backend/backend.h b/usr/src/lib/lib9p/common/backend/backend.h
new file mode 100644
index 0000000000..2b4bf2d8e4
--- /dev/null
+++ b/usr/src/lib/lib9p/common/backend/backend.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2016 Jakub Klama <jceel@FreeBSD.org>
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+
+#ifndef LIB9P_BACKEND_H
+#define LIB9P_BACKEND_H
+
+struct l9p_backend {
+ void *softc;
+ void (*freefid)(void *, struct l9p_fid *);
+ int (*attach)(void *, struct l9p_request *);
+ int (*clunk)(void *, struct l9p_fid *);
+ int (*create)(void *, struct l9p_request *);
+ int (*open)(void *, struct l9p_request *);
+ int (*read)(void *, struct l9p_request *);
+ int (*remove)(void *, struct l9p_fid *);
+ int (*stat)(void *, struct l9p_request *);
+ int (*walk)(void *, struct l9p_request *);
+ int (*write)(void *, struct l9p_request *);
+ int (*wstat)(void *, struct l9p_request *);
+ int (*statfs)(void *, struct l9p_request *);
+ int (*lopen)(void *, struct l9p_request *);
+ int (*lcreate)(void *, struct l9p_request *);
+ int (*symlink)(void *, struct l9p_request *);
+ int (*mknod)(void *, struct l9p_request *);
+ int (*rename)(void *, struct l9p_request *);
+ int (*readlink)(void *, struct l9p_request *);
+ int (*getattr)(void *, struct l9p_request *);
+ int (*setattr)(void *, struct l9p_request *);
+ int (*xattrwalk)(void *, struct l9p_request *);
+ int (*xattrcreate)(void *, struct l9p_request *);
+ int (*xattrread)(void *, struct l9p_request *);
+ int (*xattrwrite)(void *, struct l9p_request *);
+ int (*xattrclunk)(void *, struct l9p_fid *);
+ int (*readdir)(void *, struct l9p_request *);
+ int (*fsync)(void *, struct l9p_request *);
+ int (*lock)(void *, struct l9p_request *);
+ int (*getlock)(void *, struct l9p_request *);
+ int (*link)(void *, struct l9p_request *);
+ int (*mkdir)(void *, struct l9p_request *);
+ int (*renameat)(void *, struct l9p_request *);
+ int (*unlinkat)(void *, struct l9p_request *);
+};
+
+#endif /* LIB9P_BACKEND_H */
diff --git a/usr/src/lib/lib9p/common/backend/fs.c b/usr/src/lib/lib9p/common/backend/fs.c
new file mode 100644
index 0000000000..4b7764cd86
--- /dev/null
+++ b/usr/src/lib/lib9p/common/backend/fs.c
@@ -0,0 +1,3238 @@
+/*
+ * Copyright 2016 Jakub Klama <jceel@FreeBSD.org>
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Copyright 2021 Joyent, Inc.
+ */
+
+/*
+ * Based on libixp code: ©2007-2010 Kris Maglione <maglione.k at Gmail>
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <assert.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <sys/param.h>
+#include <sys/queue.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <dirent.h>
+#include <pwd.h>
+#include <grp.h>
+#include <libgen.h>
+#include <pthread.h>
+#include "../lib9p.h"
+#include "../lib9p_impl.h"
+#include "../fid.h"
+#include "../log.h"
+#include "../rfuncs.h"
+#include "../genacl.h"
+#include "backend.h"
+#include "fs.h"
+
+#if defined(WITH_CASPER)
+ #include <libcasper.h>
+ #include <casper/cap_pwd.h>
+ #include <casper/cap_grp.h>
+#endif
+
+#if defined(__FreeBSD__)
+ #include <sys/param.h>
+ #if __FreeBSD_version >= 1000000
+ #define HAVE_BINDAT
+ #endif
+#endif
+
+#if defined(__FreeBSD__)
+ #define HAVE_BIRTHTIME
+#endif
+
+#if defined(__APPLE__)
+ #include <sys/syscall.h>
+ #include "Availability.h"
+ #define ACL_TYPE_NFS4 ACL_TYPE_EXTENDED
+#endif
+
+#if defined (__illumos__)
+ #include <sys/sysmacros.h>
+ #include <sys/statvfs.h>
+ #include <sys/un.h>
+ #include <attr.h>
+ #include <sys/nvpair.h>
+#endif
+
+struct fs_softc { /* fs backend state; the void *softc given to the fs_* ops is cast to this */
+ int fs_rootfd; /* fd of the exported root; fs_attach() fstat()s it and opens the root fid relative to it */
+ bool fs_readonly; /* when true, fs_nde() refuses to create new entries (EROFS) */
+#if defined(__illumos__)
+ /*
+ * On illumos, the file creation time (birthtime) is stored (on
+ * supported filesystems -- i.e. zfs) in an extended attribute.
+ * If for some reason the fs doesn't support extended attributes,
+ * we skip trying to read the creation time.
+ */
+ bool fs_hasxattr;
+#endif
+#if defined(WITH_CASPER)
+ cap_channel_t *fs_cappwd; /* channel used by fs_getpwuid() / fs_attach() for passwd lookups */
+ cap_channel_t *fs_capgrp; /* channel used by fs_getgrgid() for group lookups */
+#endif
+};
+
+struct fs_fid { /* per-fid backend data, hung off l9p_fid.lo_aux */
+ DIR *ff_dir; /* open directory stream, if any; fs_clunk() closedir()s it */
+ int ff_dirfd; /* directory fd that ff_name is resolved against (openat/fstatat) */
+ int ff_fd; /* open file descriptor, or -1 when not open (open_fid() starts at -1) */
+ int ff_flags; /* FF_* flags, e.g. FF_NO_NFSV4_ACL */
+ char *ff_name; /* strdup()ed path name, allocated in open_fid() */
+ struct fs_authinfo *ff_ai; /* shared authinfo; open_fid() takes a reference on it */
+ pthread_mutex_t ff_mtx; /* initialized in open_fid() */
+ struct l9p_acl *ff_acl; /* cached ACL if any */
+};
+
+#if defined(__FreeBSD__) /* FreeBSD: fold the two f_fsid.val[] words into one 64-bit id */
+# define STATFS_FSID(_s) \
+ (((uint64_t)(_s)->f_fsid.val[0] << 32) | (uint64_t)(_s)->f_fsid.val[1])
+
+# define STAT_ATIME(_s) ((_s)->st_atimespec) /* access timestamp member */
+# define STAT_MTIME(_s) ((_s)->st_mtimespec) /* modification timestamp member */
+# define STAT_CTIME(_s) ((_s)->st_ctimespec) /* status-change timestamp member */
+#elif defined (__illumos__) /* illumos: f_fsid is used as-is (dostatfs() is handed a struct statvfs here) */
+# define STATFS_FSID(_s) ((_s)->f_fsid)
+
+# define STAT_ATIME(_s) ((_s)->st_atim) /* access timestamp member */
+# define STAT_MTIME(_s) ((_s)->st_mtim) /* modification timestamp member */
+# define STAT_CTIME(_s) ((_s)->st_ctim) /* status-change timestamp member */
+#else
+#error "Port me"
+#endif
+
+#define FF_NO_NFSV4_ACL 0x01 /* don't go looking for NFSv4 ACLs */
+/* FF_NO_POSIX_ACL 0x02 -- not yet */
+
+/*
+ * Our authinfo consists of:
+ *
+ * - a reference count
+ * - a uid
+ * - a gid-set
+ *
+ * The "default" gid is the first gid in the gid-set, provided the
+ * set size is at least 1. The set-size may be zero, though.
+ *
+ * Adjustments to the ref-count must be atomic, once it's shared.
+ * It would be nice to use C11 atomics here but they are not common
+ * enough to all systems just yet; for now, we use a mutex.
+ *
+ * Note that some ops (Linux style ones) pass an effective gid for
+ * the op, in which case, that gid may override. To achieve this
+ * effect, permissions testing functions also take an extra gid.
+ * If this gid is (gid_t)-1 it is not used and only the remaining
+ * gids take part.
+ *
+ * The uid may also be (uid_t)-1, meaning "no uid was available
+ * at all at attach time". In this case, new files inherit parent
+ * directory uids.
+ *
+ * The refcount is simply the number of "openfile"s using this
+ * authinfo (so that when the last ref goes away, we can free it).
+ *
+ * There are also master ACL flags (same as in ff_flags).
+ */
+struct fs_authinfo {
+ pthread_mutex_t ai_mtx; /* lock for refcnt */
+ uint32_t ai_refcnt; /* number of fs_fids sharing this authinfo; bumped in open_fid() */
+ int ai_flags; /* master ACL flags (FF_* values); dropacl() copies them into ff_flags */
+ uid_t ai_uid; /* uid established at attach time, or (uid_t)-1 if none */
+ int ai_ngids; /* number of entries in ai_gids; may be 0 */
+ gid_t ai_gids[]; /* NB: flexible array member */
+};
+
+/*
+ * We have a global-static mutex for single-threading Tattach
+ * requests, which use getpwnam (and indirectly, getgr* functions)
+ * which are not reentrant.
+ */
+static bool fs_attach_mutex_inited;
+static pthread_mutex_t fs_attach_mutex;
+
+static pthread_mutexattr_t fs_mutexattr;
+
+/*
+ * Internal functions (except inline functions).
+ */
+static struct passwd *fs_getpwuid(struct fs_softc *, uid_t, struct r_pgdata *);
+static struct group *fs_getgrgid(struct fs_softc *, gid_t, struct r_pgdata *);
+static int fs_buildname(struct l9p_fid *, char *, char *, size_t);
+static int fs_pdir(struct fs_softc *, struct l9p_fid *, char *, size_t,
+ struct stat *st);
+static int fs_dpf(char *, char *, size_t);
+static int fs_oflags_dotu(int, int *);
+static int fs_oflags_dotl(uint32_t, int *, enum l9p_omode *);
+static int fs_nde(struct fs_softc *, struct l9p_fid *, bool, gid_t,
+ struct stat *, uid_t *, gid_t *);
+static struct fs_fid *open_fid(int, const char *, struct fs_authinfo *, bool);
+static void dostat(struct fs_softc *, struct l9p_stat *, char *,
+ struct stat *, bool dotu);
+#ifdef __illumos__
+static void getcrtime(struct fs_softc *, int, const char *, uint64_t *,
+ uint64_t *);
+static void dostatfs(struct l9p_statfs *, struct statvfs *, long);
+#define ACL_TYPE_NFS4 1
+acl_t *acl_get_fd_np(int fd, int type);
+#else
+static void dostatfs(struct l9p_statfs *, struct statfs *, long);
+#endif
+static void fillacl(struct fs_fid *ff);
+static struct l9p_acl *getacl(struct fs_fid *ff, int fd, const char *path);
+static void dropacl(struct fs_fid *ff);
+static struct l9p_acl *look_for_nfsv4_acl(struct fs_fid *ff, int fd,
+ const char *path);
+static int check_access(int32_t,
+ struct l9p_acl *, struct stat *, struct l9p_acl *, struct stat *,
+ struct fs_authinfo *, gid_t);
+static void generate_qid(struct stat *, struct l9p_qid *);
+
+static int fs_icreate(void *, struct l9p_fid *, char *, int,
+ bool, mode_t, gid_t, struct stat *);
+static int fs_iopen(void *, struct l9p_fid *, int, enum l9p_omode,
+ gid_t, struct stat *);
+static int fs_imkdir(void *, struct l9p_fid *, char *,
+ bool, mode_t, gid_t, struct stat *);
+static int fs_imkfifo(void *, struct l9p_fid *, char *,
+ bool, mode_t, gid_t, struct stat *);
+static int fs_imknod(void *, struct l9p_fid *, char *,
+ bool, mode_t, dev_t, gid_t, struct stat *);
+static int fs_imksocket(void *, struct l9p_fid *, char *,
+ bool, mode_t, gid_t, struct stat *);
+static int fs_isymlink(void *, struct l9p_fid *, char *, char *,
+ gid_t, struct stat *);
+
+/*
+ * Internal functions implementing backend.
+ */
+static int fs_attach(void *, struct l9p_request *);
+static int fs_clunk(void *, struct l9p_fid *);
+static int fs_create(void *, struct l9p_request *);
+static int fs_open(void *, struct l9p_request *);
+static int fs_read(void *, struct l9p_request *);
+static int fs_remove(void *, struct l9p_fid *);
+static int fs_stat(void *, struct l9p_request *);
+static int fs_walk(void *, struct l9p_request *);
+static int fs_write(void *, struct l9p_request *);
+static int fs_wstat(void *, struct l9p_request *);
+static int fs_statfs(void *, struct l9p_request *);
+static int fs_lopen(void *, struct l9p_request *);
+static int fs_lcreate(void *, struct l9p_request *);
+static int fs_symlink(void *, struct l9p_request *);
+static int fs_mknod(void *, struct l9p_request *);
+static int fs_rename(void *, struct l9p_request *);
+static int fs_readlink(void *, struct l9p_request *);
+static int fs_getattr(void *, struct l9p_request *);
+static int fs_setattr(void *, struct l9p_request *);
+static int fs_xattrwalk(void *, struct l9p_request *);
+static int fs_xattrcreate(void *, struct l9p_request *);
+static int fs_readdir(void *, struct l9p_request *);
+static int fs_fsync(void *, struct l9p_request *);
+static int fs_lock(void *, struct l9p_request *);
+static int fs_getlock(void *, struct l9p_request *);
+static int fs_link(void *, struct l9p_request *);
+static int fs_renameat(void *, struct l9p_request *);
+static int fs_unlinkat(void *, struct l9p_request *);
+static void fs_freefid(void *, struct l9p_fid *);
+
+/*
+ * Convert from 9p2000 open/create mode to Unix-style O_* flags.
+ * This includes 9p2000.u extensions, but not 9p2000.L protocol,
+ * which has entirely different open, create, etc., flag bits.
+ *
+ * The <mode> given here is the one-byte (uint8_t) "mode"
+ * argument to Tcreate or Topen, so it can have at most 8 bits.
+ *
+ * https://swtch.com/plan9port/man/man9/open.html and
+ * http://plan9.bell-labs.com/magic/man2html/5/open
+ * both say:
+ *
+ * The [low two bits of the] mode field determines the
+ * type of I/O ... [I]f mode has the OTRUNC (0x10) bit
+ * set, the file is to be truncated, which requires write
+ * permission ...; if the mode has the ORCLOSE (0x40) bit
+ * set, the file is to be removed when the fid is clunked,
+ * which requires permission to remove the file from its
+ * directory. All other bits in mode should be zero. It
+ * is illegal to write a directory, truncate it, or
+ * attempt to remove it on close.
+ *
+ * 9P2000.u may add ODIRECT (0x80); this is not completely clear.
+ * The fcall.h header defines OCEXEC (0x20) as well, but it makes
+ * no sense to send this to a server. There seem to be no bits
+ * 0x04 and 0x08.
+ *
+ * We always turn on O_NOCTTY since as a server, we never want
+ * to gain a controlling terminal. We always turn on O_NOFOLLOW
+ * for reasons described elsewhere.
+ */
+static int
+fs_oflags_dotu(int mode, int *aflags)
+{
+	int oflags;
+
+	/* Translate the access-mode field of the 9P mode byte first. */
+	switch (mode & L9P_OACCMODE) {
+
+	case L9P_OWRITE:
+		oflags = O_WRONLY;
+		break;
+
+	case L9P_ORDWR:
+		oflags = O_RDWR;
+		break;
+
+	case L9P_OEXEC:
+		/* Open-for-exec is a read-only open; it cannot truncate. */
+		if ((mode & L9P_OTRUNC) != 0)
+			return (EINVAL);
+		oflags = O_RDONLY;
+		break;
+
+	case L9P_OREAD:
+	default:
+		oflags = O_RDONLY;
+		break;
+	}
+
+	/* Always on: no controlling terminal, no following of symlinks. */
+	oflags |= O_NOCTTY | O_NOFOLLOW;
+
+	if ((mode & L9P_OTRUNC) != 0)
+		oflags |= O_TRUNC;
+
+	/*
+	 * Strip every bit that has now been dealt with:
+	 *   OTRUNC   - translated just above
+	 *   access mode - translated by the switch
+	 *   ORCLOSE  - handled by the caller
+	 *   OCEXEC   - meaningless in a server
+	 *   ODIRECT  - not applicable here
+	 * Whatever remains is something we could not translate;
+	 * report that as EINVAL so the problem is noticed.
+	 */
+	mode &= ~(L9P_OTRUNC | L9P_OACCMODE | L9P_ORCLOSE | L9P_OCEXEC |
+	    L9P_ODIRECT);
+	if (mode != 0) {
+		L9P_LOG(L9P_INFO,
+		    "fs_oflags_dotu: untranslated bits: %#x",
+		    (unsigned)mode);
+		return (EINVAL);
+	}
+
+	*aflags = oflags;
+	return (0);
+}
+
+/*
+ * Convert from 9P2000.L (Linux) open mode bits to O_* flags.
+ * See fs_oflags_dotu above.
+ *
+ * Linux currently does not have open-for-exec, but there is a
+ * proposal for it using O_PATH|O_NOFOLLOW, now handled here.
+ *
+ * We may eventually also set L9P_ORCLOSE for L_O_TMPFILE.
+ */
+static int
+fs_oflags_dotl(uint32_t l_mode, int *aflags, enum l9p_omode *ap9)
+{
+	int flags;
+	enum l9p_omode p9;
+/*
+ * The Linux access mode lives in the low two bits of the wire flags.
+ * The host's O_ACCMODE must not be used as the mask: on some systems
+ * (illumos, where it also covers O_SEARCH and O_EXEC) it is wider than
+ * two bits and would both pull unrelated Linux flag bits into <flags>
+ * and clear them from l_mode before they are examined below.
+ */
+#define	LINUX_O_ACCMODE	0x3U
+#define	CLEAR(theirs)	l_mode &= ~(uint32_t)(theirs)
+#define	CONVERT(theirs, ours) \
+	do { \
+		if (l_mode & (theirs)) { \
+			CLEAR(theirs); \
+			flags |= ours; \
+		} \
+	} while (0)
+
+	/*
+	 * Linux O_RDONLY, O_WRONLY, O_RDWR (0,1,2) match the host values,
+	 * so the two-bit access mode can be used directly.  The value 3
+	 * is not a valid access mode.
+	 */
+	flags = (int)(l_mode & LINUX_O_ACCMODE);
+	if (flags == 3)
+		return (EINVAL);
+	CLEAR(LINUX_O_ACCMODE);
+
+	/* L_O_PATH together with L_O_NOFOLLOW means "open for exec". */
+	if ((l_mode & (L9P_L_O_PATH | L9P_L_O_NOFOLLOW)) ==
+	    (L9P_L_O_PATH | L9P_L_O_NOFOLLOW)) {
+		CLEAR(L9P_L_O_PATH | L9P_L_O_NOFOLLOW);
+		p9 = L9P_OEXEC;
+	} else {
+		/*
+		 * Slightly dirty, but same dirt, really, as
+		 * setting flags from the access-mode bits.
+		 */
+		p9 = (enum l9p_omode)flags;
+	}
+
+	/* turn L_O_TMPFILE into L9P_ORCLOSE in *p9? */
+	if (l_mode & L9P_L_O_TRUNC)
+		p9 |= L9P_OTRUNC;	/* but don't CLEAR yet */
+
+	flags |= O_NOCTTY | O_NOFOLLOW;
+
+	/*
+	 * L_O_CREAT seems to be noise, since we get separate open
+	 * and create.  But it is actually set sometimes.  We just
+	 * throw it out here; create ops must set it themselves and
+	 * open ops have no permissions bits and hence cannot create.
+	 *
+	 * L_O_EXCL does make sense on create ops, i.e., we can
+	 * take a create op with or without L_O_EXCL.  We pass that
+	 * through.
+	 */
+	CLEAR(L9P_L_O_CREAT);
+	CONVERT(L9P_L_O_EXCL, O_EXCL);
+	CONVERT(L9P_L_O_TRUNC, O_TRUNC);
+	CONVERT(L9P_L_O_DIRECTORY, O_DIRECTORY);
+	CONVERT(L9P_L_O_APPEND, O_APPEND);
+	CONVERT(L9P_L_O_NONBLOCK, O_NONBLOCK);
+
+	/*
+	 * Discard these as useless noise at our (server) end.
+	 * (NOATIME might be useful but we can only set it on a
+	 * per-mount basis.)
+	 */
+	CLEAR(L9P_L_O_CLOEXEC);
+	CLEAR(L9P_L_O_DIRECT);
+	CLEAR(L9P_L_O_DSYNC);
+	CLEAR(L9P_L_O_FASYNC);
+	CLEAR(L9P_L_O_LARGEFILE);
+	CLEAR(L9P_L_O_NOATIME);
+	CLEAR(L9P_L_O_NOCTTY);
+	CLEAR(L9P_L_O_NOFOLLOW);
+	CLEAR(L9P_L_O_SYNC);
+
+	/* Anything still set is a flag we do not know how to translate. */
+	if (l_mode != 0) {
+		L9P_LOG(L9P_INFO,
+		    "fs_oflags_dotl: untranslated bits: %#x",
+		    (unsigned)l_mode);
+		return (EINVAL);
+	}
+
+	*aflags = flags;
+	*ap9 = p9;
+	return (0);
+#undef CLEAR
+#undef CONVERT
+#undef LINUX_O_ACCMODE
+}
+
+/*
+ * Look up the passwd entry for <uid>, via the Casper passwd channel
+ * when built WITH_CASPER and via r_getpwuid() otherwise.  <pg> is
+ * passed through to the underlying lookup routine.
+ */
+static struct passwd *
+fs_getpwuid(struct fs_softc *sc, uid_t uid, struct r_pgdata *pg)
+{
+	struct passwd *pw;
+
+#if !defined(WITH_CASPER)
+	(void)sc;
+	pw = r_getpwuid(uid, pg);
+#else
+	pw = r_cap_getpwuid(sc->fs_cappwd, uid, pg);
+#endif
+	return (pw);
+}
+
+/*
+ * Look up the group entry for <gid>, via the Casper group channel
+ * when built WITH_CASPER and via r_getgrgid() otherwise.  <pg> is
+ * passed through to the underlying lookup routine.
+ */
+static struct group *
+fs_getgrgid(struct fs_softc *sc, gid_t gid, struct r_pgdata *pg)
+{
+	struct group *gr;
+
+#if !defined(WITH_CASPER)
+	(void)sc;
+	gr = r_getgrgid(gid, pg);
+#else
+	gr = r_cap_getgrgid(sc->fs_capgrp, gid, pg);
+#endif
+	return (gr);
+}
+
+/*
+ * Build full name of file by appending given name to directory name.
+ */
+static int
+fs_buildname(struct l9p_fid *dir, char *name, char *buf, size_t size)
+{
+	struct fs_fid *parent;
+	size_t dirlen, namelen;
+	char *p;
+
+	parent = dir->lo_aux;
+	assert(parent != NULL);
+
+	dirlen = strlen(parent->ff_name);
+	namelen = strlen(name);
+
+	/* Space needed: directory name, '/', file name, terminating '\0'. */
+	if (dirlen + 1 + namelen + 1 > size)
+		return (ENAMETOOLONG);
+
+	/* <buf> is only written once we know everything fits. */
+	p = buf;
+	memcpy(p, parent->ff_name, dirlen);
+	p += dirlen;
+	*p++ = '/';
+	memcpy(p, name, namelen + 1);	/* copies the '\0' as well */
+	return (0);
+}
+
+/*
+ * Build parent name of file by splitting it off. Return an error
+ * if the given fid represents the root, so that there is no such
+ * parent, or if the discovered parent is not a directory.
+ */
+static int
+fs_pdir(struct fs_softc *sc __unused, struct l9p_fid *fid, char *buf,
+    size_t size, struct stat *st)
+{
+	struct fs_fid *ff = fid->lo_aux;
+	char *parent;
+
+	assert(ff != NULL);
+
+	/* Split the parent path off into <buf>; NULL means it did not fit. */
+	parent = r_dirname(ff->ff_name, buf, size);
+	if (parent == NULL)
+		return (ENAMETOOLONG);
+
+	/* Examine the parent itself, never the target of a symlink. */
+	if (fstatat(ff->ff_dirfd, parent, st, AT_SYMLINK_NOFOLLOW) != 0)
+		return (errno);
+
+	return (S_ISDIR(st->st_mode) ? 0 : ENOTDIR);
+}
+
+/*
+ * Like fs_buildname() but for adding a file name to a buffer
+ * already holding a directory name. Essentially does
+ * strcat(dbuf, "/");
+ * strcat(dbuf, fname);
+ * but with size checking and an ENAMETOOLONG error as needed.
+ *
+ * (Think of the function name as "directory plus-equals file".)
+ */
+static int
+fs_dpf(char *dbuf, char *fname, size_t size)
+{
+	const size_t dirlen = strlen(dbuf);
+	const size_t need = strlen(fname) + 1;	/* name plus its '\0' */
+	char *tail;
+
+	/* Existing directory text, a '/', then the name and terminator. */
+	if (dirlen + 1 + need > size)
+		return (ENAMETOOLONG);
+
+	tail = dbuf + dirlen;
+	*tail++ = '/';
+	memcpy(tail, fname, need);
+	return (0);
+}
+
+/*
+ * Prepare to create a new directory entry (open with O_CREAT,
+ * mkdir, etc -- any operation that creates a new inode),
+ * operating in parent data <dir>, based on authinfo <ai> and
+ * effective gid <egid>.
+ *
+ * The new entity should be owned by user/group <*nuid, *ngid>,
+ * if it's really a new entity. It will be a directory if isdir.
+ *
+ * Returns an error number if the entry should not be created
+ * (e.g., read-only file system or no permission to write in
+ * parent directory). Always sets *nuid and *ngid on success:
+ * in the worst case, when there is no available ID, this will
+ * use the parent directory's IDs. Fills in <*st> on success.
+ */
+static int
+fs_nde(struct fs_softc *sc, struct l9p_fid *dir, bool isdir, gid_t egid,
+    struct stat *st, uid_t *nuid, gid_t *ngid)
+{
+	struct fs_fid *parent;
+	struct fs_authinfo *auth;
+	int32_t wanted;
+
+	/* Nothing can be created on a read-only export. */
+	if (sc->fs_readonly)
+		return (EROFS);
+
+	parent = dir->lo_aux;
+	assert(parent != NULL);
+
+	/* The parent must exist and must be a directory. */
+	if (fstatat(parent->ff_dirfd, parent->ff_name, st,
+	    AT_SYMLINK_NOFOLLOW) != 0)
+		return (errno);
+	if (!S_ISDIR(st->st_mode))
+		return (ENOTDIR);
+
+	auth = parent->ff_ai;
+	fillacl(parent);
+
+	/* Any refusal from the access check is reported as EPERM. */
+	if (isdir)
+		wanted = L9P_ACE_ADD_SUBDIRECTORY;
+	else
+		wanted = L9P_ACE_ADD_FILE;
+	if (check_access(wanted, parent->ff_acl, st, NULL, NULL, auth,
+	    egid) != 0)
+		return (EPERM);
+
+	/* Owner: the attach-time uid, else inherit from the parent. */
+	if (auth->ai_uid != (uid_t)-1)
+		*nuid = auth->ai_uid;
+	else
+		*nuid = st->st_uid;
+
+	/*
+	 * Group: the effective gid if one was supplied, else the first
+	 * gid of the authinfo's set, else inherit from the parent.
+	 */
+	if (egid != (gid_t)-1)
+		*ngid = egid;
+	else if (auth->ai_ngids > 0)
+		*ngid = auth->ai_gids[0];
+	else
+		*ngid = st->st_gid;
+
+	return (0);
+}
+
+/*
+ * Allocate new open-file data structure to attach to a fid.
+ *
+ * The new file's authinfo is the same as the old one's, and
+ * we gain a reference.
+ */
+static struct fs_fid *
+open_fid(int dirfd, const char *path, struct fs_authinfo *ai, bool creating)
+{
+	struct fs_fid *ff;
+	pthread_mutexattr_t *mattr;
+	uint32_t refs;
+
+#ifdef __illumos__
+	mattr = &fs_mutexattr;
+#else
+	mattr = NULL;
+#endif
+
+	ff = l9p_calloc(1, sizeof(*ff));
+	if (pthread_mutex_init(&ff->ff_mtx, mattr) != 0)
+		goto free_ff;
+
+	ff->ff_fd = -1;		/* nothing is open yet */
+	ff->ff_dirfd = dirfd;
+	ff->ff_name = strdup(path);
+	if (ff->ff_name == NULL)
+		goto destroy_mtx;
+
+	/* Take our reference on the shared authinfo under its lock. */
+	if (pthread_mutex_lock(&ai->ai_mtx) != 0)
+		goto free_name;
+	refs = ++ai->ai_refcnt;
+	(void) pthread_mutex_unlock(&ai->ai_mtx);
+
+	/*
+	 * A count of exactly 1 means we hold the *first* reference.
+	 * That is legitimate only while the authinfo is being created;
+	 * otherwise something has gone wrong.  This will not catch
+	 * every bad (re)use of a freed authinfo, but it may catch some.
+	 */
+	assert(refs > 1 || creating);
+	L9P_LOG(L9P_DEBUG, "authinfo %p now used by %lu",
+	    (void *)ai, (u_long)refs);
+	ff->ff_ai = ai;
+	return (ff);
+
+	/* Failure: unwind whatever was set up, newest first. */
+free_name:
+	free(ff->ff_name);
+destroy_mtx:
+	(void) pthread_mutex_destroy(&ff->ff_mtx);
+free_ff:
+	free(ff);
+	return (NULL);
+}
+
+/*
+ * Fill in the 9P stat structure <s> for the file called <name>, whose
+ * stat(2) data is in <buf>.
+ *
+ * <s> is zeroed first.  When <dotu> is false the owner and group are
+ * reported as strdup()ed names (NULL if the id cannot be resolved);
+ * when it is true they are reported numerically and s->extension
+ * carries the symlink target or a "b MAJ MIN" / "c MAJ MIN" device
+ * description where applicable.  s->extension is NULL whenever it
+ * could not be produced.
+ */
+static void
+dostat(struct fs_softc *sc, struct l9p_stat *s, char *name,
+    struct stat *buf, bool dotu)
+{
+	struct passwd *user;
+	struct group *group;
+
+	memset(s, 0, sizeof(struct l9p_stat));
+
+	generate_qid(buf, &s->qid);
+
+	s->type = 0;
+	s->dev = 0;
+	s->mode = buf->st_mode & 0777;
+
+	/* Map the Unix file type onto the 9P DM* mode bits. */
+	if (S_ISDIR(buf->st_mode))
+		s->mode |= L9P_DMDIR;
+
+	if (S_ISLNK(buf->st_mode) && dotu)
+		s->mode |= L9P_DMSYMLINK;
+
+	if (S_ISCHR(buf->st_mode) || S_ISBLK(buf->st_mode))
+		s->mode |= L9P_DMDEVICE;
+
+	if (S_ISSOCK(buf->st_mode))
+		s->mode |= L9P_DMSOCKET;
+
+	if (S_ISFIFO(buf->st_mode))
+		s->mode |= L9P_DMNAMEDPIPE;
+
+	s->atime = (uint32_t)buf->st_atime;
+	s->mtime = (uint32_t)buf->st_mtime;
+	s->length = (uint64_t)buf->st_size;
+
+	s->name = r_basename(name, NULL, 0);
+
+	if (!dotu) {
+		struct r_pgdata udata, gdata;
+
+		user = fs_getpwuid(sc, buf->st_uid, &udata);
+		group = fs_getgrgid(sc, buf->st_gid, &gdata);
+		s->uid = user != NULL ? strdup(user->pw_name) : NULL;
+		s->gid = group != NULL ? strdup(group->gr_name) : NULL;
+		s->muid = user != NULL ? strdup(user->pw_name) : NULL;
+		/* The names were copied above; release the lookup data. */
+		r_pgfree(&udata);
+		r_pgfree(&gdata);
+	} else {
+		/*
+		 * When using 9P2000.u, we don't need to bother about
+		 * providing user and group names in textual form.
+		 */
+		s->n_uid = buf->st_uid;
+		s->n_gid = buf->st_gid;
+		s->n_muid = buf->st_uid;
+
+		if (S_ISLNK(buf->st_mode)) {
+			char target[MAXPATHLEN];
+			ssize_t ret = readlink(name, target, MAXPATHLEN);
+
+			if (ret < 0) {
+				s->extension = NULL;
+				return;
+			}
+
+			s->extension = strndup(target, (size_t)ret);
+		}
+
+		/*
+		 * If asprintf() fails, what it leaves in s->extension is
+		 * not something we can rely on, so explicitly reset it
+		 * to NULL: consumers must see either a valid string or
+		 * no extension at all.
+		 */
+		if (S_ISBLK(buf->st_mode)) {
+			if (asprintf(&s->extension, "b %d %d",
+			    major(buf->st_rdev), minor(buf->st_rdev)) < 0)
+				s->extension = NULL;
+		}
+
+		if (S_ISCHR(buf->st_mode)) {
+			if (asprintf(&s->extension, "c %d %d",
+			    major(buf->st_rdev), minor(buf->st_rdev)) < 0)
+				s->extension = NULL;
+		}
+	}
+}
+
+/*
+ * Translate the host's file system statistics <in> into the 9P
+ * statfs reply <out>.  <namelen> is supplied by the caller since it
+ * is not taken from <in>.
+ */
+static void
+#ifdef __illumos__
+dostatfs(struct l9p_statfs *out, struct statvfs *in, long namelen)
+#else
+dostatfs(struct l9p_statfs *out, struct statfs *in, long namelen)
+#endif
+{
+
+	out->type = L9P_FSTYPE;
+	out->namelen = (uint32_t)namelen;
+	out->fsid = STATFS_FSID(in);
+
+	out->bsize = in->f_bsize;
+	out->files = in->f_files;
+	out->ffree = in->f_ffree;
+
+#ifdef __illumos__
+	/*
+	 * Rescale the block counts by f_frsize / f_bsize so that they
+	 * are expressed in the f_bsize units reported in out->bsize.
+	 */
+	out->blocks = in->f_blocks * in->f_frsize / in->f_bsize;
+	out->bfree = in->f_bfree * in->f_frsize / in->f_bsize;
+	out->bavail = in->f_bavail * in->f_frsize / in->f_bsize;
+#else
+	out->blocks = in->f_blocks;
+	out->bfree = in->f_bfree;
+	out->bavail = in->f_bavail;
+#endif
+}
+
+/*
+ * Build the 9P qid for the file described by <buf>: the path is the
+ * inode number, the version is always 0, and the type is derived
+ * from the file type in st_mode.
+ */
+static void
+generate_qid(struct stat *buf, struct l9p_qid *qid)
+{
+	qid->path = buf->st_ino;
+	qid->version = 0;
+
+	/*
+	 * Start from a clean slate: the type bits are OR-ed in below, so
+	 * without this a qid that is uninitialized, or is being reused
+	 * for a different file, would keep whatever bits it had before.
+	 */
+	qid->type = 0;
+
+	if (S_ISREG(buf->st_mode))
+		qid->type |= L9P_QTFILE;
+
+	if (S_ISDIR(buf->st_mode))
+		qid->type |= L9P_QTDIR;
+
+	if (S_ISLNK(buf->st_mode))
+		qid->type |= L9P_QTSYMLINK;
+}
+
+/*
+ * Fill in ff->ff_acl if it's not set yet. Skip if the "don't use
+ * ACLs" flag is set, and use the flag to remember failure so
+ * we don't bother retrying either.
+ */
+static void
+fillacl(struct fs_fid *ff)
+{
+
+	/* Already cached: nothing to do. */
+	if (ff->ff_acl != NULL)
+		return;
+
+	/* ACL lookups are disabled, or an earlier lookup came up empty. */
+	if ((ff->ff_flags & FF_NO_NFSV4_ACL) != 0)
+		return;
+
+	ff->ff_acl = look_for_nfsv4_acl(ff, ff->ff_fd, ff->ff_name);
+
+	/* Remember a miss in the flags so that we do not retry. */
+	if (ff->ff_acl == NULL)
+		ff->ff_flags |= FF_NO_NFSV4_ACL;
+}
+
+/*
+ * Get an ACL given fd and/or path name. We check for the "don't get
+ * ACL" flag in the given ff_fid data structure first, but don't set
+ * the flag here. The fillacl() code is similar but will set the
+ * flag; it also uses the ff_fd and ff_name directly.
+ *
+ * (This is used to get ACLs for parent directories, for instance.)
+ */
+static struct l9p_acl *
+getacl(struct fs_fid *ff, int fd, const char *path)
+{
+	struct l9p_acl *acl = NULL;
+
+	/* Honor the "no ACLs" flag, but never modify it here. */
+	if ((ff->ff_flags & FF_NO_NFSV4_ACL) == 0)
+		acl = look_for_nfsv4_acl(ff, fd, path);
+
+	return (acl);
+}
+
+/*
+ * Drop cached ff->ff_acl, e.g., after moving from one directory to
+ * another, where inherited ACLs might change.
+ */
+static void
+dropacl(struct fs_fid *ff)
+{
+	struct l9p_acl *stale = ff->ff_acl;
+
+	/* Forget the cached ACL and go back to the authinfo's flags. */
+	ff->ff_acl = NULL;
+	ff->ff_flags = ff->ff_ai->ai_flags;
+
+	l9p_acl_free(stale);
+}
+
+/*
+ * Check to see if we can find NFSv4 ACLs for the given file.
+ * If we have an open fd, we can use that, otherwise we need
+ * to use the path.
+ */
+/*
+ * Returns the converted ACL, or NULL if the file has no NFSv4 ACL, if
+ * the file could not be opened, or if the ACL could not be retrieved
+ * or converted.  When <fd> is negative the file is opened by <path>
+ * relative to ff->ff_dirfd for the duration of the call; a descriptor
+ * supplied by the caller is left open.
+ */
+static struct l9p_acl *
+look_for_nfsv4_acl(struct fs_fid *ff, int fd, const char *path)
+{
+	struct l9p_acl *acl = NULL;
+#ifdef __illumos__
+	acl_t *sysacl;
+#else
+	acl_t sysacl;
+#endif
+	int doclose = 0;
+
+	if (fd < 0) {
+		fd = openat(ff->ff_dirfd, path, 0);
+		if (fd < 0) {
+			/*
+			 * Without a descriptor there is nothing to query.
+			 * Report the real open failure rather than passing
+			 * -1 on and logging a misleading EBADF.
+			 */
+			L9P_LOG(L9P_ERROR,
+			    "error opening %s to retrieve NFSv4 ACL: %s",
+			    path, strerror(errno));
+			return (NULL);
+		}
+		doclose = 1;
+	}
+
+	sysacl = acl_get_fd_np(fd, ACL_TYPE_NFS4);
+	if (sysacl == NULL) {
+		/*
+		 * EINVAL means no NFSv4 ACLs apply for this file.
+		 * Other error numbers indicate some kind of problem.
+		 */
+		if (errno != EINVAL) {
+			L9P_LOG(L9P_ERROR,
+			    "error retrieving NFSv4 ACL from "
+			    "fdesc %d (%s): %s", fd,
+			    path, strerror(errno));
+		}
+	} else {
+#if defined(HAVE_FREEBSD_ACLS)
+		acl = l9p_freebsd_nfsv4acl_to_acl(sysacl);
+#elif defined(HAVE__ILLUMOS_ACLS)
+		acl = l9p_illumos_nfsv4acl_to_acl(sysacl);
+#else
+		acl = NULL; /* XXX need a l9p_darwin_acl_to_acl */
+#endif
+		acl_free(sysacl);
+	}
+
+	/* Only close a descriptor that was opened here. */
+	if (doclose)
+		close(fd);
+
+	return (acl);
+}
+
+/*
+ * Verify that the user whose authinfo is in <ai> and effective
+ * group ID is <egid> ((gid_t)-1 means no egid supplied) has
+ * permission to do something.
+ *
+ * The "something" may be rather complex: we allow NFSv4 style
+ * operation masks here, and provide parent and child ACLs and
+ * stat data. At most one of pacl+pst and cacl+cst can be NULL,
+ * unless ACLs are not supported; then pacl and cacl can both
+ * be NULL but pst or cst must be non-NULL depending on the
+ * operation.
+ */
+static int
+check_access(int32_t opmask,
+    struct l9p_acl *pacl, struct stat *pst,
+    struct l9p_acl *cacl, struct stat *cst,
+    struct fs_authinfo *ai, gid_t egid)
+{
+	struct l9p_acl_check_args ca;
+
+	/* Who is asking: uid, effective gid and supplementary gids. */
+	ca.aca_uid = ai->ai_uid;
+	ca.aca_gid = egid;
+	ca.aca_groups = ai->ai_gids;
+	ca.aca_ngroups = (size_t)ai->ai_ngids;
+	ca.aca_superuser = true;
+
+	/* What is being checked: parent and child ACLs and stat data. */
+	ca.aca_parent = pacl;
+	ca.aca_pstat = pst;
+	ca.aca_child = cacl;
+	ca.aca_cstat = cst;
+
+	/*
+	 * If there is any ACL at all, decide by ACL alone and ignore
+	 * the Unix permission bits; with no ACL, fall back on the
+	 * st_mode bits from the stat data.
+	 */
+	if (pacl != NULL || cacl != NULL)
+		ca.aca_aclmode = L9P_ACM_NFS_ACL | L9P_ACM_ZFS_ACL;
+	else
+		ca.aca_aclmode = L9P_ACM_STAT_MODE;
+
+	return (l9p_acl_check_access(opmask, &ca));
+}
+
+/*
+ * Handle Tattach: establish who the client is, build the reference
+ * counted authinfo (uid plus gid-set) for the new session, and attach
+ * a fid for the export root (sc->fs_rootfd) to req->lr_fid.
+ *
+ * Returns 0 on success with the root qid filled into the response,
+ * or an errno value.  Failures to identify the user or to stat the
+ * root are reported as EPERM, except ENOENT and ENOTDIR which are
+ * passed through.
+ */
+static int
+fs_attach(void *softc, struct l9p_request *req)
+{
+	struct fs_authinfo *ai;
+	struct fs_softc *sc = (struct fs_softc *)softc;
+	struct fs_fid *file;
+	struct passwd *pwd;
+	struct stat st;
+	struct r_pgdata udata;
+	uint32_t n_uname;
+	gid_t *gids;
+	uid_t uid;
+	int error;
+	int ngroups;
+	bool have_udata = false;	/* udata was filled by fs_getpwuid() */
+
+	assert(req->lr_fid != NULL);
+
+	/*
+	 * Single-thread pwd/group related items.  We have a reentrant
+	 * r_getpwuid but not a reentrant r_getpwnam, and l9p_getgrlist
+	 * may use non-reentrant C library getgr* routines.
+	 */
+	if ((error = pthread_mutex_lock(&fs_attach_mutex)) != 0)
+		return (error);
+
+	n_uname = req->lr_req.tattach.n_uname;
+	if (n_uname != L9P_NONUNAME) {
+		uid = (uid_t)n_uname;
+		pwd = fs_getpwuid(sc, uid, &udata);
+		have_udata = true;
+#if defined(L9P_DEBUG)
+		if (pwd == NULL)
+			L9P_LOG(L9P_DEBUG,
+			    "Tattach: uid %ld: no such user", (long)uid);
+#endif
+	} else {
+		uid = (uid_t)-1;
+#if defined(WITH_CASPER)
+		pwd = cap_getpwnam(sc->fs_cappwd, req->lr_req.tattach.uname);
+#else
+		pwd = getpwnam(req->lr_req.tattach.uname);
+#endif
+#if defined(L9P_DEBUG)
+		if (pwd == NULL)
+			L9P_LOG(L9P_DEBUG,
+			    "Tattach: %s: no such user",
+			    req->lr_req.tattach.uname);
+#endif
+	}
+
+	/*
+	 * If caller didn't give a numeric UID, pick it up from pwd
+	 * if possible.  If that doesn't work we can't continue.
+	 *
+	 * Note that pwd also supplies the group set.  This assumes
+	 * the server has the right mapping; this needs improvement.
+	 * We do at least support ai->ai_ngids==0 properly now though.
+	 */
+	if (uid == (uid_t)-1 && pwd != NULL)
+		uid = pwd->pw_uid;
+	if (uid == (uid_t)-1)
+		error = EPERM;
+	else {
+		error = 0;
+		if (fstat(sc->fs_rootfd, &st) != 0)
+			error = errno;
+		else if (!S_ISDIR(st.st_mode))
+			error = ENOTDIR;
+	}
+	if (error) {
+		/* pwd is not used past this point. */
+		if (have_udata)
+			r_pgfree(&udata);
+		(void) pthread_mutex_unlock(&fs_attach_mutex);
+		L9P_LOG(L9P_DEBUG,
+		    "Tattach: denying uid=%ld access to rootdir: %s",
+		    (long)uid, strerror(error));
+		/*
+		 * Pass ENOENT and ENOTDIR through for diagnosis;
+		 * others become EPERM.  This should not leak too
+		 * much security.
+		 */
+		return (error == ENOENT || error == ENOTDIR ? error : EPERM);
+	}
+
+	if (pwd != NULL) {
+		/*
+		 * This either succeeds and fills in ngroups and
+		 * returns non-NULL, or fails and sets ngroups to 0
+		 * and returns NULL.  Either way ngroups is correct.
+		 */
+		gids = l9p_getgrlist(pwd->pw_name, pwd->pw_gid, &ngroups);
+	} else {
+		gids = NULL;
+		ngroups = 0;
+	}
+
+	/*
+	 * That was the last use of pwd.  Release the lookup data that
+	 * fs_getpwuid() filled in, as is done after the same call
+	 * elsewhere in this file.
+	 */
+	if (have_udata)
+		r_pgfree(&udata);
+
+	/*
+	 * Done with pwd and group related items that may use
+	 * non-reentrant C library routines; allow other threads in.
+	 */
+	(void) pthread_mutex_unlock(&fs_attach_mutex);
+
+	ai = malloc(sizeof(*ai) + (size_t)ngroups * sizeof(gid_t));
+	if (ai == NULL) {
+		free(gids);
+		return (ENOMEM);
+	}
+#ifdef __illumos__
+	error = pthread_mutex_init(&ai->ai_mtx, &fs_mutexattr);
+#else
+	error = pthread_mutex_init(&ai->ai_mtx, NULL);
+#endif
+	if (error) {
+		free(gids);
+		free(ai);
+		return (error);
+	}
+	ai->ai_refcnt = 0;
+	ai->ai_uid = uid;
+	ai->ai_flags = 0;	/* XXX for now */
+	ai->ai_ngids = ngroups;
+	/*
+	 * gids is NULL when there is no group list, and memcpy() must
+	 * not be given a null pointer even for a zero-length copy.
+	 */
+	if (ngroups > 0)
+		memcpy(ai->ai_gids, gids, (size_t)ngroups * sizeof(gid_t));
+	free(gids);
+
+	/* The root fid takes the first reference on the new authinfo. */
+	file = open_fid(sc->fs_rootfd, ".", ai, true);
+	if (file == NULL) {
+		(void) pthread_mutex_destroy(&ai->ai_mtx);
+		free(ai);
+		return (ENOMEM);
+	}
+
+	req->lr_fid->lo_aux = file;
+	generate_qid(&st, &req->lr_resp.rattach.qid);
+	return (0);
+}
+
+/*
+ * Clunk: close whatever the fid currently has open.  An open
+ * directory stream takes precedence; otherwise the plain file
+ * descriptor is closed if there is one.  Always returns 0.
+ */
+static int
+fs_clunk(void *softc __unused, struct l9p_fid *fid)
+{
+	struct fs_fid *ff = fid->lo_aux;
+
+	assert(ff != NULL);
+
+	if (ff->ff_dir) {
+		closedir(ff->ff_dir);
+		ff->ff_dir = NULL;
+		return (0);
+	}
+
+	if (ff->ff_fd != -1) {
+		close(ff->ff_fd);
+		ff->ff_fd = -1;
+	}
+
+	return (0);
+}
+
+/*
+ * Create ops.
+ *
+ * We are to create a new file under some existing path,
+ * where the new file's name is in the Tcreate request and the
+ * existing path is due to a fid-based file (req->lr_fid).
+ *
+ * One op (create regular file) sets file->fd, the rest do not.
+ */
+/*
+ * Handle Tcreate.  The DM* bits in the requested permissions select
+ * what kind of object to create (directory, symlink, FIFO, socket,
+ * device node, or plain file) and the matching internal helper does
+ * the work.  On success the new object's qid is placed in the
+ * response.  Returns 0 or an errno value.
+ */
+static int
+fs_create(void *softc, struct l9p_request *req)
+{
+	struct l9p_fid *dir;
+	struct stat st;
+	uint32_t dmperm;
+	mode_t perm;
+	char *name;
+	int error;
+
+	dir = req->lr_fid;
+	name = req->lr_req.tcreate.name;
+	dmperm = req->lr_req.tcreate.perm;
+	perm = (mode_t)(dmperm & 0777);
+
+	if (dmperm & L9P_DMDIR)
+		error = fs_imkdir(softc, dir, name, true,
+		    perm, (gid_t)-1, &st);
+	else if (dmperm & L9P_DMSYMLINK)
+		error = fs_isymlink(softc, dir, name,
+		    req->lr_req.tcreate.extension, (gid_t)-1, &st);
+	else if (dmperm & L9P_DMNAMEDPIPE)
+		error = fs_imkfifo(softc, dir, name, true,
+		    perm, (gid_t)-1, &st);
+	else if (dmperm & L9P_DMSOCKET)
+		error = fs_imksocket(softc, dir, name, true,
+		    perm, (gid_t)-1, &st);
+	else if (dmperm & L9P_DMDEVICE) {
+		const char *ext = req->lr_req.tcreate.extension;
+		unsigned int major, minor;
+		char type;
+		dev_t dev;
+
+		/*
+		 * The device description is carried in the extension
+		 * string.  A request that sets the device bit without
+		 * supplying one cannot be honored, and must not reach
+		 * sscanf() as a null pointer.
+		 */
+		if (ext == NULL)
+			return (EINVAL);
+
+		/*
+		 * ??? Should this be testing < 3?  For now, allow a single
+		 * integer mode with minor==0 implied.
+		 */
+		minor = 0;
+		if (sscanf(ext, "%c %u %u", &type, &major, &minor) < 2) {
+			return (EINVAL);
+		}
+
+		switch (type) {
+		case 'b':
+			perm |= S_IFBLK;
+			break;
+		case 'c':
+			perm |= S_IFCHR;
+			break;
+		default:
+			return (EINVAL);
+		}
+		dev = makedev(major, minor);
+		error = fs_imknod(softc, dir, name, true, perm, dev,
+		    (gid_t)-1, &st);
+	} else {
+		enum l9p_omode p9;
+		int flags;
+
+		/* Plain file: translate the 9P open mode, then create. */
+		p9 = req->lr_req.tcreate.mode;
+		error = fs_oflags_dotu(p9, &flags);
+		if (error)
+			return (error);
+		error = fs_icreate(softc, dir, name, flags,
+		    true, perm, (gid_t)-1, &st);
+		req->lr_resp.rcreate.iounit = req->lr_conn->lc_max_io_size;
+	}
+
+	if (error == 0)
+		generate_qid(&st, &req->lr_resp.rcreate.qid);
+
+	return (error);
+}
+
+/*
+ * https://swtch.com/plan9port/man/man9/open.html and
+ * http://plan9.bell-labs.com/magic/man2html/5/open
+ * say that permissions are actually
+ * perm & (~0666 | (dir.perm & 0666))
+ * for files, and
+ * perm & (~0777 | (dir.perm & 0777))
+ * for directories. That is, the parent directory may
+ * take away permissions granted by the operation.
+ *
+ * This seems a bit restrictive; probably
+ * there should be a control knob for this.
+ */
+static inline mode_t
+fs_p9perm(mode_t perm, mode_t dir_perm, bool isdir)
+{
+	/* Directories are limited within 0777, other files within 0666. */
+	const int mask = isdir ? 0777 : 0666;
+
+	/*
+	 * Bits outside <mask> pass through untouched; bits inside it
+	 * survive only where the parent directory has them set too.
+	 */
+	return (perm & (~mask | (dir_perm & mask)));
+}
+
+/*
+ * Internal form of create (plain file).
+ *
+ * Our caller takes care of splitting off all the special
+ * types of create (mknod, etc), so this is purely for files.
+ * We receive the fs_softc <softc>, the directory fid <dir>
+ * in which the new file is to be created, the name of the
+ * new file, a flag <isp9> indicating whether to do plan9 style
+ * permissions or Linux style permissions, the permissions <perm>,
+ * an effective group id <egid>, and a pointer to a stat structure
+ * <st> to fill in describing the final result on success.
+ *
+ * On successful create, the fid switches to the newly created
+ * file, which is now open; its associated file-name changes too.
+ *
+ * Note that the original (dir) fid is never currently open,
+ * so there is nothing to close.
+ */
+static int
+fs_icreate(void *softc, struct l9p_fid *dir, char *name, int flags,
+    bool isp9, mode_t perm, gid_t egid, struct stat *st)
+{
+	struct fs_fid *ff = dir->lo_aux;
+	char path[MAXPATHLEN];
+	char *saved;
+	uid_t uid;
+	gid_t gid;
+	int fd, error;
+
+	/* Compose the path of the new entry from the directory and name. */
+	error = fs_buildname(dir, name, path, sizeof(path));
+	if (error != 0)
+		return (error);
+
+	/* Heap copy of the path; it becomes ff_name if the create works. */
+	saved = strdup(path);
+	if (saved == NULL)
+		return (ENOMEM);
+
+	/* Permission check on the parent; also yields the new owner ids. */
+	error = fs_nde(softc, dir, false, egid, st, &uid, &gid);
+	if (error != 0)
+		goto fail;
+
+	/* Plan9-style creates are further masked by the parent's mode. */
+	if (isp9)
+		perm = fs_p9perm(perm, st->st_mode, false);
+
+	/* O_EXCL: the create must fail if the name already exists. */
+	fd = openat(ff->ff_dirfd, path, flags | O_CREAT | O_EXCL, perm);
+	if (fd < 0) {
+		error = errno;
+		goto fail;
+	}
+
+	/*
+	 * Set mode and ownership explicitly, then report the final state
+	 * through <st>.  The first failing call decides the error.
+	 */
+	if (fchmod(fd, perm) != 0 ||
+	    fchown(fd, uid, gid) != 0 ||
+	    fstat(fd, st) != 0) {
+		error = errno;
+		(void) close(fd);
+		/* the partially set up file is left in place */
+		goto fail;
+	}
+
+	/* The fid now refers to the new, open file rather than the dir. */
+	free(ff->ff_name);
+	ff->ff_name = saved;
+	ff->ff_fd = fd;
+	return (0);
+
+fail:
+	free(saved);
+	return (error);
+}
+
+/*
+ * Internal form of open: stat file and verify permissions (from p9
+ * argument), then open the file-or-directory, leaving the internal
+ * fs_fid fields set up. If we cannot open the file, return a
+ * suitable error number, and leave everything unchanged.
+ *
+ * To mitigate the race between permissions testing and the actual
+ * open, we can stat the file twice (once with lstat() before open,
+ * then with fstat() after). We assume O_NOFOLLOW is set in flags,
+ * so if some other race-winner substitutes in a symlink we won't
+ * open it here. (However, embedded symlinks, if they occur, are
+ * still an issue. Ideally we would like to have an O_NEVERFOLLOW
+ * that fails on embedded symlinks, and a way to pass this to
+ * lstat() as well.)
+ *
+ * When we use opendir() we cannot pass O_NOFOLLOW, so we must rely
+ * on substitution-detection via fstat(). To simplify the code we
+ * just always re-check.
+ *
+ * (For a proper fix in the future, we can require openat(), keep
+ * each parent directory open during walk etc, and allow only final
+ * name components with O_NOFOLLOW.)
+ *
+ * On successful return, st has been filled in.
+ */
+static int
+fs_iopen(void *softc, struct l9p_fid *fid, int flags, enum l9p_omode p9,
+    gid_t egid __unused, struct stat *st)
+{
+	struct fs_softc *sc = softc;
+	struct fs_fid *file;
+	struct stat first;
+	int32_t op;
+	char *name;
+	int error;
+	int fd;
+	DIR *dirp;
+
+	/* Forbid write ops on read-only file system. */
+	if (sc->fs_readonly) {
+		if ((flags & O_TRUNC) != 0)
+			return (EROFS);
+		if ((flags & O_ACCMODE) != O_RDONLY)
+			return (EROFS);
+		if (p9 & L9P_ORCLOSE)
+			return (EROFS);
+	}
+
+	file = fid->lo_aux;
+	assert(file != NULL);
+	name = file->ff_name;
+
+	/* First look: never follow, and refuse to open a symlink at all. */
+	if (fstatat(file->ff_dirfd, name, &first, AT_SYMLINK_NOFOLLOW) != 0)
+		return (errno);
+	if (S_ISLNK(first.st_mode))
+		return (EPERM);
+
+	/* Can we rely on O_APPEND here?  Best not, can be cleared. */
+	switch (flags & O_ACCMODE) {
+	case O_RDONLY:
+		op = L9P_ACE_READ_DATA;
+		break;
+	case O_WRONLY:
+		op = L9P_ACE_WRITE_DATA;
+		break;
+	case O_RDWR:
+		op = L9P_ACE_READ_DATA | L9P_ACE_WRITE_DATA;
+		break;
+	default:
+		return (EINVAL);
+	}
+	fillacl(file);
+	error = check_access(op, NULL, NULL, file->ff_acl, &first,
+	    file->ff_ai, (gid_t)-1);
+	if (error)
+		return (error);
+
+	if (S_ISDIR(first.st_mode)) {
+		/* Forbid write or truncate on directory. */
+		if ((flags & O_ACCMODE) != O_RDONLY || (flags & O_TRUNC))
+			return (EPERM);
+		fd = openat(file->ff_dirfd, name, O_DIRECTORY);
+		if (fd < 0)
+			return (EPERM);
+		dirp = fdopendir(fd);
+		if (dirp == NULL) {
+			/*
+			 * fdopendir() only takes ownership of the
+			 * descriptor on success, so release it here
+			 * rather than leaking it.
+			 */
+			(void) close(fd);
+			return (EPERM);
+		}
+		fd = dirfd(dirp);
+	} else {
+		dirp = NULL;
+		fd = openat(file->ff_dirfd, name, flags);
+		if (fd < 0)
+			return (EPERM);
+	}
+
+	/*
+	 * We have a valid fd, and maybe non-null dirp.  Re-check
+	 * the file, and fail if st_dev or st_ino changed.
+	 */
+	if (fstat(fd, st) != 0 ||
+	    first.st_dev != st->st_dev ||
+	    first.st_ino != st->st_ino) {
+		/* closedir() also closes the underlying descriptor. */
+		if (dirp != NULL)
+			(void) closedir(dirp);
+		else
+			(void) close(fd);
+		return (EPERM);
+	}
+	if (dirp != NULL)
+		file->ff_dir = dirp;
+	else
+		file->ff_fd = fd;
+	return (0);
+}
+
+/*
+ * Internal form of mkdir (common code for all forms).
+ * We receive the fs_softc <softc>, the directory fid <dir>
+ * in which the new entry is to be created, the name of the
+ * new entry, a flag <isp9> indicating whether to do plan9 style
+ * permissions or Linux style permissions, the permissions <perm>,
+ * an effective group id <egid>, and a pointer to a stat structure
+ * <st> to fill in describing the final result on success.
+ *
+ * See also fs_icreate() above.
+ */
+static int
+fs_imkdir(void *softc, struct l9p_fid *dir, char *name,
+    bool isp9, mode_t perm, gid_t egid, struct stat *st)
+{
+	struct fs_fid *parent = dir->lo_aux;
+	char path[MAXPATHLEN];
+	uid_t uid;
+	gid_t gid;
+	int fd, error;
+
+	if ((error = fs_buildname(dir, name, path, sizeof(path))) != 0)
+		return (error);
+
+	/* Check the parent and work out who will own the new directory. */
+	if ((error = fs_nde(softc, dir, true, egid, st, &uid, &gid)) != 0)
+		return (error);
+
+	if (isp9)
+		perm = fs_p9perm(perm, st->st_mode, true);
+
+	if (mkdirat(parent->ff_dirfd, path, perm) != 0)
+		return (errno);
+
+	/*
+	 * Open what we just made (refusing a symlink in its place) so
+	 * that owner, mode and the reported stat all go through one fd.
+	 */
+	fd = openat(parent->ff_dirfd, path,
+	    O_DIRECTORY | O_RDONLY | O_NOFOLLOW);
+	if (fd < 0)
+		return (errno);
+
+	if (fchown(fd, uid, gid) != 0 ||
+	    fchmod(fd, perm) != 0 ||
+	    fstat(fd, st) != 0)
+		error = errno;	/* the directory itself is left in place */
+
+	(void) close(fd);
+	return (error);
+}
+
+#ifdef __APPLE__
+/*
+ * This is an undocumented OS X syscall. It would be best to avoid it,
+ * but there doesn't seem to be another safe way to implement mknodat.
+ * Dear Apple, please implement mknodat before you remove this syscall.
+ */
+static int
+fs_ifchdir_thread_local(int fd)
+{
+	int rv;
+
+	/* The syscall is flagged deprecated; silence just this one use. */
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+	rv = syscall(SYS___pthread_fchdir, fd);
+#pragma clang diagnostic pop
+
+	return (rv);
+}
+#endif
+
+/*
+ * Internal form of mknod (special device).
+ *
+ * The device type (S_IFBLK, S_IFCHR) is included in the <mode> parameter;
+ * <dev> is the device number for the new node.  <isp9> selects plan9
+ * style permission masking against the parent directory.  On success
+ * <st> describes the new node.
+ *
+ * Returns 0 or an errno value.  If a step after the node has been
+ * created fails, the node is left in place.
+ */
+static int
+fs_imknod(void *softc, struct l9p_fid *dir, char *name,
+    bool isp9, mode_t mode, dev_t dev, gid_t egid, struct stat *st)
+{
+	struct fs_fid *ff;
+	mode_t perm;
+	gid_t gid;
+	uid_t uid;
+	char newname[MAXPATHLEN];
+	int error;
+#ifdef __APPLE__
+	int mknod_errno;
+#endif
+
+	ff = dir->lo_aux;
+	error = fs_buildname(dir, name, newname, sizeof(newname));
+	if (error)
+		return (error);
+
+	error = fs_nde(softc, dir, false, egid, st, &uid, &gid);
+	if (error)
+		return (error);
+
+	/* Keep the file-type bits of <mode>; only the low 0777 is masked. */
+	if (isp9) {
+		perm = fs_p9perm(mode & 0777, st->st_mode, false);
+		mode = (mode & ~0777) | perm;
+	} else {
+		perm = mode & 0777;
+	}
+
+#ifdef __APPLE__
+	/*
+	 * Emulate mknodat(): switch this thread's cwd to the directory
+	 * fd, create the node by relative name, then switch back.
+	 * Callers expect an errno value, never a raw -1.
+	 */
+	if (fs_ifchdir_thread_local(ff->ff_dirfd) < 0)
+		return (errno);
+	error = mknod(newname, mode, dev);
+	mknod_errno = errno;
+	/* Stop using the thread-local cwd */
+	(void) fs_ifchdir_thread_local(-1);
+	if (error < 0)
+		return (mknod_errno);
+#else
+	if (mknodat(ff->ff_dirfd, newname, mode, dev) != 0)
+		return (errno);
+#endif
+
+	/* We cannot open the new name; race to use l* syscalls. */
+	if (fchownat(ff->ff_dirfd, newname, uid, gid,
+	    AT_SYMLINK_NOFOLLOW) != 0 ||
+	    fchmodat(ff->ff_dirfd, newname, perm, 0) != 0 ||
+	    fstatat(ff->ff_dirfd, newname, st, AT_SYMLINK_NOFOLLOW) != 0)
+		error = errno;
+	else if ((st->st_mode & S_IFMT) != (mode & S_IFMT))
+		error = EPERM;	/* ??? lost a race anyway */
+
+	/* if (error) unlink(newname) ? */
+
+	return (error);
+}
+
+/*
+ * Internal form of mkfifo.
+ */
+static int
+fs_imkfifo(void *softc, struct l9p_fid *dir, char *name,
+ bool isp9, mode_t perm, gid_t egid, struct stat *st)
+{
+ struct fs_fid *ff;
+ gid_t gid;
+ uid_t uid;
+ char newname[MAXPATHLEN];
+ int error;
+
+ ff = dir->lo_aux;
+ error = fs_buildname(dir, name, newname, sizeof(newname));
+ if (error)
+ return (error);
+
+ error = fs_nde(softc, dir, false, egid, st, &uid, &gid);
+ if (error)
+ return (error);
+
+ if (isp9)
+ perm = fs_p9perm(perm, st->st_mode, false);
+
+ if (mkfifo(newname, perm) != 0)
+ return (errno);
+
+ /* We cannot open the new name; race to use l* syscalls. */
+ if (fchownat(ff->ff_dirfd, newname, uid, gid, AT_SYMLINK_NOFOLLOW) != 0 ||
+ fchmodat(ff->ff_dirfd, newname, perm, 0) != 0 ||
+ fstatat(ff->ff_dirfd, newname, st, AT_SYMLINK_NOFOLLOW) != 0)
+ error = errno;
+ else if (!S_ISFIFO(st->st_mode))
+ error = EPERM; /* ??? lost a race anyway */
+
+ /* if (error) unlink(newname) ? */
+
+ return (error);
+}
+
+/*
+ * Internal form of mksocket.
+ *
+ * This is a bit different because of the horrible socket naming
+ * system (bind() with sockaddr_un sun_path): the socket inode is
+ * created by binding a temporary socket to the new name, after which
+ * the temporary socket is closed again.
+ *
+ * <isp9> selects plan9 style permission masking against the parent
+ * directory.  On success <st> describes the new socket inode.
+ * Returns 0 or an errno value; ENAMETOOLONG if the name cannot fit
+ * in sun_path.
+ */
+static int
+fs_imksocket(void *softc, struct l9p_fid *dir, char *name,
+    bool isp9, mode_t perm, gid_t egid, struct stat *st)
+{
+	struct fs_fid *ff;
+	struct sockaddr_un un;
+	char *path;
+	char newname[MAXPATHLEN];
+	gid_t gid;
+	uid_t uid;
+	int error = 0, s, fd, slen;
+
+	ff = dir->lo_aux;
+	error = fs_buildname(dir, name, newname, sizeof(newname));
+	if (error)
+		return (error);
+
+	error = fs_nde(softc, dir, false, egid, st, &uid, &gid);
+	if (error)
+		return (error);
+
+	if (isp9)
+		perm = fs_p9perm(perm, st->st_mode, false);
+
+	s = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (s < 0)
+		return (errno);
+
+	path = newname;
+	fd = -1;
+#ifdef HAVE_BINDAT
+	/* Try bindat() if needed. */
+	if (strlen(path) >= sizeof(un.sun_path)) {
+		fd = openat(ff->ff_dirfd, ff->ff_name,
+		    O_RDONLY | O_DIRECTORY | O_NOFOLLOW);
+		if (fd >= 0)
+			path = name;
+	}
+#endif
+
+	/*
+	 * Can only create the socket if the path will fit.
+	 * Even if we are using bindat() there are limits
+	 * (the API for AF_UNIX sockets is ... not good).
+	 *
+	 * Note: in theory we can fill sun_path to the end
+	 * (omitting a terminating '\0') but in at least one
+	 * Unix-like system, this was known to behave oddly,
+	 * so we test for ">=" rather than just ">".
+	 */
+	if (strlen(path) >= sizeof(un.sun_path)) {
+		error = ENAMETOOLONG;
+		goto out;
+	}
+
+	/*
+	 * Fill in the address completely before measuring it:
+	 * SUN_LEN() runs strlen() over sun_path, so the path must
+	 * already be there.  The length check above guarantees that
+	 * strncpy() leaves sun_path NUL-terminated.
+	 */
+	memset(&un, 0, sizeof(un));
+	un.sun_family = AF_UNIX;
+	strncpy(un.sun_path, path, sizeof(un.sun_path));
+#ifndef __illumos__
+	slen = un.sun_len = sizeof(struct sockaddr_un);
+#else
+	slen = SUN_LEN(&un);
+#endif
+
+#ifdef HAVE_BINDAT
+	if (fd >= 0) {
+		if (bindat(fd, s, (struct sockaddr *)&un, slen) < 0)
+			error = errno;
+		goto out;	/* done now, for good or ill */
+	}
+#endif
+
+	/*
+	 * NOTE(review): bind() resolves sun_path against the process
+	 * cwd, while the fix-up calls below resolve newname against
+	 * ff_dirfd -- confirm the two agree.
+	 */
+	if (bind(s, (struct sockaddr *)&un, slen) < 0)
+		error = errno;
+out:
+
+	if (error == 0) {
+		/*
+		 * We believe we created the socket-inode.  Fix
+		 * permissions etc.  Note that we cannot use
+		 * fstat() on the socket descriptor: it succeeds,
+		 * but we get bogus data!
+		 */
+		if (fchownat(ff->ff_dirfd, newname, uid, gid,
+		    AT_SYMLINK_NOFOLLOW) != 0 ||
+		    fchmodat(ff->ff_dirfd, newname, perm, 0) != 0 ||
+		    fstatat(ff->ff_dirfd, newname, st,
+		    AT_SYMLINK_NOFOLLOW) != 0)
+			error = errno;
+		else if (!S_ISSOCK(st->st_mode))
+			error = EPERM;	/* ??? lost a race anyway */
+
+		/* if (error) unlink(newname) ? */
+	}
+
+	/*
+	 * It's not clear which error should override, although
+	 * ideally we should never see either close() call fail.
+	 * In any case we do want to try to close both fd and s,
+	 * always.  Let's set error only if it is not already set,
+	 * so that all exit paths can use the same code.
+	 */
+	if (fd >= 0 && close(fd) != 0)
+		if (error == 0)
+			error = errno;
+	if (close(s) != 0)
+		if (error == 0)
+			error = errno;
+
+	return (error);
+}
+
+/*
+ * Internal form of symlink.
+ *
+ * Note that symlinks are presumed to carry no permission bits.
+ * They do have owners, however (who may be charged for quotas).
+ */
+static int
+fs_isymlink(void *softc, struct l9p_fid *dir, char *name,
+    char *symtgt, gid_t egid, struct stat *st)
+{
+	struct fs_fid *parent = dir->lo_aux;
+	char path[MAXPATHLEN];
+	uid_t uid;
+	gid_t gid;
+	int error;
+
+	if ((error = fs_buildname(dir, name, path, sizeof(path))) != 0)
+		return (error);
+
+	/* Check the parent and work out who will own the new link. */
+	if ((error = fs_nde(softc, dir, false, egid, st, &uid, &gid)) != 0)
+		return (error);
+
+	if (symlinkat(symtgt, parent->ff_dirfd, path) != 0)
+		return (errno);
+
+	/*
+	 * The link cannot be opened, so ownership and the final stat go
+	 * by name, without following.  A failure from here on leaves the
+	 * link in place.
+	 */
+	if (fchownat(parent->ff_dirfd, path, uid, gid,
+	    AT_SYMLINK_NOFOLLOW) != 0)
+		return (errno);
+	if (fstatat(parent->ff_dirfd, path, st, AT_SYMLINK_NOFOLLOW) != 0)
+		return (errno);
+
+	/* Something other than our symlink is there now: lost a race. */
+	if (!S_ISLNK(st->st_mode))
+		return (EPERM);
+
+	return (0);
+}
+
+/*
+ * Topen: translate the 9P open mode to open(2) flags, open the fid via
+ * fs_iopen(), and fill in the qid and iounit of the reply.
+ */
+static int
+fs_open(void *softc, struct l9p_request *req)
+{
+	enum l9p_omode omode = req->lr_req.topen.mode;
+	struct stat sb;
+	int oflags, rc;
+
+	rc = fs_oflags_dotu(omode, &oflags);
+	if (rc == 0)
+		rc = fs_iopen(softc, req->lr_fid, oflags, omode,
+		    (gid_t)-1, &sb);
+	if (rc != 0)
+		return (rc);
+
+	generate_qid(&sb, &req->lr_resp.ropen.qid);
+	req->lr_resp.ropen.iounit = req->lr_conn->lc_max_io_size;
+	return (0);
+}
+
+/*
+ * Helper for directory read. We want to run an lstat on each
+ * file name within the directory. This is a lot faster if we
+ * have lstatat (or fstatat with AT_SYMLINK_NOFOLLOW), but not
+ * all systems do, so hide the ifdef-ed code in an inline function.
+ */
+static inline int
+fs_lstatat(struct fs_fid *file, char *name, struct stat *st)
+{
+	/* Resolve <name> relative to the fid's open directory stream. */
+	int dfd = dirfd(file->ff_dir);
+
+	return (fstatat(dfd, name, st, AT_SYMLINK_NOFOLLOW));
+}
+
+/*
+ * Tread: read from an open fid.
+ *
+ * If the fid holds an open directory stream, pack one stat record per
+ * directory entry into the reply until the entries run out or a record
+ * no longer fits.  Otherwise do a positional read of the file into the
+ * request's data iovecs and report the byte count.
+ *
+ * Returns 0 or an errno value.
+ */
+static int
+fs_read(void *softc, struct l9p_request *req)
+{
+	struct l9p_stat l9stat;
+	struct fs_softc *sc;
+	struct fs_fid *file;
+	bool dotu = req->lr_conn->lc_version >= L9P_2000U;
+	ssize_t ret;
+
+	sc = softc;
+	file = req->lr_fid->lo_aux;
+	assert(file != NULL);
+
+	if (file->ff_dir != NULL) {
+		struct dirent *d;
+		struct stat st;
+		struct l9p_message msg;
+		long o;
+		int err;
+
+		if ((err = pthread_mutex_lock(&file->ff_mtx)) != 0)
+			return (err);
+
+		/*
+		 * Must use telldir before readdir since seekdir
+		 * takes cookie values.  Unfortunately this wastes
+		 * a lot of time (and memory) building unneeded
+		 * cookies that can only be flushed by closing
+		 * the directory.
+		 *
+		 * NB: FreeBSD libc seekdir has SINGLEUSE defined,
+		 * so in fact, we can discard the cookies by
+		 * calling seekdir on them.  This clears up wasted
+		 * memory at the cost of even more wasted time...
+		 *
+		 * XXX: readdir/telldir/seekdir not thread safe
+		 */
+		l9p_init_msg(&msg, req, L9P_PACK);
+		for (;;) {
+			o = telldir(file->ff_dir);
+			d = readdir(file->ff_dir);
+			if (d == NULL)
+				break;
+			/* Entries we cannot stat are silently skipped. */
+			if (fs_lstatat(file, d->d_name, &st))
+				continue;
+			dostat(sc, &l9stat, d->d_name, &st, dotu);
+			if (l9p_pack_stat(&msg, req, &l9stat) != 0) {
+				/*
+				 * Did not fit: rewind so the next read
+				 * starts with this same entry.
+				 */
+				seekdir(file->ff_dir, o);
+				break;
+			}
+#if defined(__FreeBSD__)
+			seekdir(file->ff_dir, o);
+			(void) readdir(file->ff_dir);
+#endif
+		}
+
+		(void) pthread_mutex_unlock(&file->ff_mtx);
+	} else {
+		size_t niov = l9p_truncate_iov(req->lr_data_iov,
+		    req->lr_data_niov, req->lr_req.io.count);
+
+#if defined(__FreeBSD__) || defined(__illumos__)
+		ret = preadv(file->ff_fd, req->lr_data_iov, niov,
+		    req->lr_req.io.offset);
+#else
+		/* XXX: not thread safe, should really use aio_listio. */
+		if (lseek(file->ff_fd, (off_t)req->lr_req.io.offset,
+		    SEEK_SET) < 0)
+			return (errno);
+
+		/*
+		 * Keep the signed result: narrowing it to uint32_t here
+		 * would turn -1 into a large count and hide the error
+		 * from the check below.
+		 */
+		ret = readv(file->ff_fd, req->lr_data_iov, (int)niov);
+#endif
+
+		if (ret < 0)
+			return (errno);
+
+		req->lr_resp.io.count = (uint32_t)ret;
+	}
+
+	return (0);
+}
+
+/*
+ * Remove the file or directory that <fid> refers to.
+ *
+ * Refused with EROFS on a read-only file system.  Otherwise the
+ * parent directory and the object itself are examined, unlink access
+ * is checked, and the object is removed with unlinkat() (as a
+ * directory removal when the object is a directory).
+ *
+ * Returns 0 or an errno value.
+ */
+static int
+fs_remove(void *softc, struct l9p_fid *fid)
+{
+	struct fs_softc *sc = softc;
+	struct l9p_acl *parent_acl;
+	struct fs_fid *file;
+	struct stat pst, cst;
+	char dirname[MAXPATHLEN];
+	int error;
+
+	if (sc->fs_readonly)
+		return (EROFS);
+
+	error = fs_pdir(sc, fid, dirname, sizeof(dirname), &pst);
+	if (error)
+		return (error);
+
+	file = fid->lo_aux;
+	/*
+	 * Report the stat failure itself; <error> is still 0 here, so
+	 * returning it would claim success without removing anything.
+	 */
+	if (fstatat(file->ff_dirfd, file->ff_name, &cst,
+	    AT_SYMLINK_NOFOLLOW) != 0)
+		return (errno);
+
+	parent_acl = getacl(file, -1, dirname);
+	fillacl(file);
+
+	error = check_access(L9P_ACOP_UNLINK,
+	    parent_acl, &pst, file->ff_acl, &cst, file->ff_ai, (gid_t)-1);
+	l9p_acl_free(parent_acl);
+	if (error)
+		return (error);
+
+	if (unlinkat(file->ff_dirfd, file->ff_name,
+	    S_ISDIR(cst.st_mode) ? AT_REMOVEDIR : 0) != 0)
+		error = errno;
+
+	return (error);
+}
+
+/*
+ * Tstat: stat the fid's path (without following a final symlink) and
+ * convert the result into the Rstat reply.
+ */
+static int
+fs_stat(void *softc, struct l9p_request *req)
+{
+	struct fs_softc *sc = softc;
+	struct fs_fid *ff = req->lr_fid->lo_aux;
+	bool dotu = req->lr_conn->lc_version >= L9P_2000U;
+	struct stat sb;
+
+	assert(ff);
+
+	if (fstatat(ff->ff_dirfd, ff->ff_name, &sb, AT_SYMLINK_NOFOLLOW) != 0)
+		return (errno);
+
+	dostat(sc, &req->lr_resp.rstat.stat, ff->ff_name, &sb, dotu);
+	return (0);
+}
+
+/*
+ * Twalk: starting from the path of req->lr_fid, step through each of
+ * the req->lr_req.twalk.wname[] components in turn, recording a qid
+ * for every component successfully reached.  A new fs_fid for the last
+ * path reached is then installed in req->lr_newfid.
+ *
+ * An error is returned only if the very first component fails (or the
+ * new fs_fid cannot be allocated); otherwise the walk stops at the
+ * first failure and reports how far it got in rwalk.nwqid.
+ */
+static int
+fs_walk(void *softc, struct l9p_request *req)
+{
+ struct l9p_acl *acl;
+ struct fs_authinfo *ai;
+ struct fs_fid *file = req->lr_fid->lo_aux;
+ struct fs_fid *newfile;
+ struct stat st;
+ size_t clen, namelen, need;
+ char *comp, *succ, *next, *swtmp;
+ bool atroot;
+ bool dotdot;
+ int i, nwname;
+ int error = 0;
+ char namebufs[2][MAXPATHLEN];
+
+ /*
+ * https://swtch.com/plan9port/man/man9/walk.html:
+ *
+ * It is legal for nwname to be zero, in which case newfid
+ * will represent the same file as fid and the walk will
+ * usually succeed; this is equivalent to walking to dot.
+ * [Aside: it's not clear if we should test S_ISDIR here.]
+ * ...
+ * The name ".." ... represents the parent directory.
+ * The name "." ... is not used in the protocol.
+ * ... A walk of the name ".." in the root directory
+ * of the server is equivalent to a walk with no name
+ * elements.
+ *
+ * Note that req.twalk.nwname never exceeds L9P_MAX_WELEM,
+ * so it is safe to convert to plain int.
+ *
+ * We are to return an error only if the first walk fails,
+ * else stop at the end of the names or on the first error.
+ * The final fid is based on the last name successfully
+ * walked.
+ *
+ * Note that we *do* get Twalk requests with nwname==0 on files.
+ *
+ * Set up "successful name" buffer pointer with base fid name,
+ * initially. We'll swap each new success into it as we go.
+ *
+ * Invariant: atroot and stat data correspond to current
+ * (succ) path.
+ */
+ succ = namebufs[0];
+ next = namebufs[1];
+ namelen = strlcpy(succ, file->ff_name, MAXPATHLEN);
+ if (namelen >= MAXPATHLEN)
+ return (ENAMETOOLONG);
+ if (fstatat(file->ff_dirfd, succ, &st, AT_SYMLINK_NOFOLLOW) < 0)
+ return (errno);
+ ai = file->ff_ai;
+ atroot = strlen(succ) == 0; /* XXX? */
+ fillacl(file);
+ acl = file->ff_acl;
+
+ nwname = (int)req->lr_req.twalk.nwname;
+
+ for (i = 0; i < nwname; i++) {
+ /*
+ * Must have execute permission to search a directory.
+ * Then, look up each component in its directory-so-far.
+ * Check for ".." along the way, handling specially
+ * as needed. Forbid "/" in name components.
+ *
+ */
+ if (!S_ISDIR(st.st_mode)) {
+ error = ENOTDIR;
+ goto out;
+ }
+ error = check_access(L9P_ACE_EXECUTE,
+ NULL, NULL, acl, &st, ai, (gid_t)-1);
+ if (error) {
+ L9P_LOG(L9P_DEBUG,
+ "Twalk: denying dir-walk on \"%s\" for uid %u",
+ succ, (unsigned)ai->ai_uid);
+ error = EPERM;
+ goto out;
+ }
+ comp = req->lr_req.twalk.wname[i];
+ if (strchr(comp, '/') != NULL) {
+ error = EINVAL;
+ break;
+ }
+
+ clen = strlen(comp);
+ dotdot = false;
+
+ /*
+ * Build next pathname (into "next"). If "..",
+ * just strip one name component off the success
+ * name so far. Since we know this name fits, the
+ * stripped down version also fits. Otherwise,
+ * the name is the base name plus '/' plus the
+ * component name plus terminating '\0'; this may
+ * or may not fit.
+ */
+ if (comp[0] == '.') {
+ /* A bare "." is rejected outright. */
+ if (clen == 1) {
+ error = EINVAL;
+ break;
+ }
+ if (comp[1] == '.' && clen == 2)
+ dotdot = true;
+ }
+ if (dotdot) {
+ /*
+ * It's not clear how ".." at root should
+ * be handled when i > 0. Obeying the man
+ * page exactly, we reset i to 0 and stop,
+ * declaring terminal success.
+ *
+ * Otherwise, we just climbed up one level
+ * so adjust "atroot".
+ */
+ if (atroot) {
+ i = 0;
+ break;
+ }
+ (void) r_dirname(succ, next, MAXPATHLEN);
+ namelen = strlen(next);
+ atroot = strlen(next) == 0; /* XXX? */
+ } else {
+ need = namelen + 1 + clen + 1;
+ if (need > MAXPATHLEN) {
+ error = ENAMETOOLONG;
+ break;
+ }
+ memcpy(next, succ, namelen);
+ next[namelen++] = '/';
+ memcpy(&next[namelen], comp, clen + 1);
+ namelen += clen;
+ /*
+ * Since name is never ".", we are necessarily
+ * descending below the root now.
+ */
+ atroot = false;
+ }
+
+ /* Any failure to stat the candidate is reported as ENOENT. */
+ if (fstatat(file->ff_dirfd, next, &st, AT_SYMLINK_NOFOLLOW) < 0) {
+ error = ENOENT;
+ break;
+ }
+
+ /*
+ * Success: generate qid and swap this
+ * successful name into place. Update acl.
+ */
+ generate_qid(&st, &req->lr_resp.rwalk.wqid[i]);
+ swtmp = succ;
+ succ = next;
+ next = swtmp;
+ if (acl != NULL && acl != file->ff_acl)
+ l9p_acl_free(acl);
+ /*
+ * NOTE(review): after the swap above, "next" holds the previous
+ * path and "succ" the one just reached, yet the ACL is fetched
+ * for "next" -- confirm whether "succ" was intended.
+ */
+ acl = getacl(file, -1, next);
+ }
+
+ /*
+ * Fail only if we failed on the first name.
+ * Otherwise we succeeded on something, and "succ"
+ * points to the last successful name in namebufs[].
+ */
+ if (error) {
+ if (i == 0)
+ goto out;
+ error = 0;
+ }
+
+ newfile = open_fid(file->ff_dirfd, succ, ai, false);
+ if (newfile == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ if (req->lr_newfid == req->lr_fid) {
+ /*
+ * Before overwriting fid->lo_aux, free the old value.
+ * Note that this doesn't free the l9p_fid data,
+ * just the fs_fid data. (But it does ditch ff_acl.)
+ */
+ if (acl == file->ff_acl)
+ acl = NULL;
+ fs_freefid(softc, req->lr_fid);
+ file = NULL;
+ /*
+ * NOTE(review): with file now NULL, an acl obtained from
+ * getacl() is neither handed to newfile below nor freed at
+ * "out" -- confirm whether that leaks.
+ */
+ }
+ req->lr_newfid->lo_aux = newfile;
+ if (file != NULL && acl != file->ff_acl) {
+ newfile->ff_acl = acl;
+ acl = NULL;
+ }
+ req->lr_resp.rwalk.nwqid = (uint16_t)i;
+out:
+ if (file != NULL && acl != file->ff_acl)
+ l9p_acl_free(acl);
+ return (error);
+}
+
+/*
+ * Twrite: positional write of the request's data iovecs to the open
+ * file behind the fid.  Refused with EROFS on a read-only file system.
+ * The number of bytes written is reported in the reply.
+ */
+static int
+fs_write(void *softc, struct l9p_request *req)
+{
+	struct fs_softc *sc = softc;
+	struct fs_fid *ff = req->lr_fid->lo_aux;
+	size_t iovcnt;
+	ssize_t nwritten;
+
+	assert(ff != NULL);
+
+	if (sc->fs_readonly)
+		return (EROFS);
+
+	/* Trim the iovec list to the byte count the client asked for. */
+	iovcnt = l9p_truncate_iov(req->lr_data_iov,
+	    req->lr_data_niov, req->lr_req.io.count);
+
+#if defined(__FreeBSD__) || defined(__illumos__)
+	nwritten = pwritev(ff->ff_fd, req->lr_data_iov, iovcnt,
+	    req->lr_req.io.offset);
+#else
+	/* XXX: not thread safe, should really use aio_listio. */
+	if (lseek(ff->ff_fd, (off_t)req->lr_req.io.offset, SEEK_SET) < 0)
+		return (errno);
+
+	nwritten = writev(ff->ff_fd, req->lr_data_iov, (int)iovcnt);
+#endif
+
+	if (nwritten < 0)
+		return (errno);
+
+	req->lr_resp.io.count = (uint32_t)nwritten;
+	return (0);
+}
+
+/*
+ * Twstat: apply the changes requested in req->lr_req.twstat.stat to the
+ * file behind the fid.  Fields holding the all-ones "don't touch" value
+ * are skipped.  In order: length (truncate), owner/group (9P2000.u and
+ * later only), mode, and finally a rename within the same directory.
+ *
+ * Refused with EROFS on a read-only file system.  Returns 0 or an
+ * errno value; changes made before a failing step are not undone.
+ */
+static int
+fs_wstat(void *softc, struct l9p_request *req)
+{
+ struct fs_softc *sc = softc;
+ struct l9p_stat *l9stat = &req->lr_req.twstat.stat;
+ struct l9p_fid *fid;
+ struct fs_fid *file;
+ int error = 0;
+
+ fid = req->lr_fid;
+ file = fid->lo_aux;
+ assert(file != NULL);
+
+ /*
+ * XXX:
+ *
+ * stat(9P) sez:
+ *
+ * Either all the changes in wstat request happen, or none of them
+ * does: if the request succeeds, all changes were made; if it fails,
+ * none were.
+ *
+ * Atomicity is clearly missing in current implementation.
+ */
+
+ if (sc->fs_readonly)
+ return (EROFS);
+
+ if (l9stat->atime != (uint32_t)~0) {
+ /* XXX: not implemented, ignore */
+ }
+
+ if (l9stat->mtime != (uint32_t)~0) {
+ /* XXX: not implemented, ignore */
+ }
+
+ /* Changing the dev field is never allowed. */
+ if (l9stat->dev != (uint32_t)~0) {
+ error = EPERM;
+ goto out;
+ }
+
+ if (l9stat->length != (uint64_t)~0) {
+ /* A fid with an open directory stream cannot be truncated. */
+ if (file->ff_dir != NULL) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * NOTE(review): truncate() resolves ff_name against the process
+ * cwd, whereas the other calls in this function resolve it
+ * against ff_dirfd -- confirm the two agree.
+ */
+ if (truncate(file->ff_name, (off_t)l9stat->length) != 0) {
+ error = errno;
+ goto out;
+ }
+ }
+
+ /* Owner/group are only carried by 9P2000.u and later. */
+ if (req->lr_conn->lc_version >= L9P_2000U) {
+ if (fchownat(file->ff_dirfd, file->ff_name, l9stat->n_uid,
+ l9stat->n_gid, AT_SYMLINK_NOFOLLOW) != 0) {
+ error = errno;
+ goto out;
+ }
+ }
+
+ /* Only the low 0777 permission bits of the mode are applied. */
+ if (l9stat->mode != (uint32_t)~0) {
+ if (fchmodat(file->ff_dirfd, file->ff_name,
+ l9stat->mode & 0777, 0) != 0) {
+ error = errno;
+ goto out;
+ }
+ }
+
+ /* A non-empty name requests a rename. */
+ if (strlen(l9stat->name) > 0) {
+ struct l9p_acl *parent_acl;
+ struct stat st;
+ char *tmp;
+ char newname[MAXPATHLEN];
+
+ /*
+ * Rename-within-directory: it's not deleting anything,
+ * but we need write permission on the directory. This
+ * should suffice.
+ */
+ error = fs_pdir(softc, fid, newname, sizeof(newname), &st);
+ if (error)
+ goto out;
+ parent_acl = getacl(file, -1, newname);
+ error = check_access(L9P_ACE_ADD_FILE,
+ parent_acl, &st, NULL, NULL, file->ff_ai, (gid_t)-1);
+ l9p_acl_free(parent_acl);
+ if (error)
+ goto out;
+ error = fs_dpf(newname, l9stat->name, sizeof(newname));
+ if (error)
+ goto out;
+ /* Heap copy of the new path; becomes ff_name on success. */
+ tmp = strdup(newname);
+ if (tmp == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ if (renameat(file->ff_dirfd, file->ff_name, file->ff_dirfd,
+ tmp) != 0) {
+ error = errno;
+ free(tmp);
+ goto out;
+ }
+ /* Successful rename, update file->ff_name. ACL can stay. */
+ free(file->ff_name);
+ file->ff_name = tmp;
+ }
+out:
+ return (error);
+}
+
+/*
+ * Tstatfs: report file system statistics for the file system holding
+ * the fid's path.
+ *
+ * The path is checked for "read data" access, opened, queried with
+ * fstatvfs() (illumos) or fstatfs() (elsewhere) and fpathconf(), and
+ * the results are converted into the Rstatfs reply.  The temporary
+ * descriptor is closed on every path.
+ *
+ * Returns 0 or an errno value.
+ */
+static int
+fs_statfs(void *softc __unused, struct l9p_request *req)
+{
+	struct fs_fid *file;
+	struct stat st;
+#ifdef __illumos__
+	struct statvfs f;
+#else
+	struct statfs f;
+#endif
+	long name_max;
+	int error;
+	int fd;
+
+	file = req->lr_fid->lo_aux;
+	assert(file);
+
+	if (fstatat(file->ff_dirfd, file->ff_name, &st,
+	    AT_SYMLINK_NOFOLLOW) != 0)
+		return (errno);
+
+	/*
+	 * Not entirely clear what access to require; we'll go
+	 * for "read data".
+	 */
+	fillacl(file);
+	error = check_access(L9P_ACE_READ_DATA, NULL, NULL,
+	    file->ff_acl, &st, file->ff_ai, (gid_t)-1);
+	if (error)
+		return (error);
+
+	fd = openat(file->ff_dirfd, file->ff_name, 0);
+	if (fd < 0)
+		return (errno);
+
+#ifdef __illumos__
+	error = fstatvfs(fd, &f);
+#else
+	error = fstatfs(fd, &f);
+#endif
+	if (error != 0) {
+		/* Save errno before close() can disturb it; don't leak fd. */
+		error = errno;
+		(void) close(fd);
+		return (error);
+	}
+
+	name_max = fpathconf(fd, _PC_NAME_MAX);
+	error = errno;
+	(void) close(fd);
+
+	if (name_max == -1)
+		return (error);
+
+	dostatfs(&req->lr_resp.rstatfs.statfs, &f, name_max);
+
+	return (0);
+}
+
+/*
+ * Tlopen (9P2000.L open): translate the Linux-style open flags, open
+ * the fid via fs_iopen(), and fill in the qid and iounit of the reply.
+ */
+static int
+fs_lopen(void *softc, struct l9p_request *req)
+{
+	struct stat sb;
+	enum l9p_omode omode;
+	int oflags, rc;
+
+	rc = fs_oflags_dotl(req->lr_req.tlopen.flags, &oflags, &omode);
+	if (rc != 0)
+		return (rc);
+
+	rc = fs_iopen(softc, req->lr_fid, oflags, omode,
+	    req->lr_req.tlopen.gid, &sb);
+	if (rc != 0)
+		return (rc);
+
+	generate_qid(&sb, &req->lr_resp.rlopen.qid);
+	req->lr_resp.rlopen.iounit = req->lr_conn->lc_max_io_size;
+	return (0);
+}
+
+/*
+ * Tlcreate (9P2000.L create): create and open a plain file in the
+ * directory fid using Linux-style permissions (no plan9 masking).
+ * The iounit of the reply is filled in whether or not the create
+ * succeeded; the qid only on success.
+ */
+static int
+fs_lcreate(void *softc, struct l9p_request *req)
+{
+	struct stat sb;
+	enum l9p_omode omode;
+	mode_t perm;
+	int oflags, rc;
+
+	rc = fs_oflags_dotl(req->lr_req.tlcreate.flags, &oflags, &omode);
+	if (rc != 0)
+		return (rc);
+
+	/* Only the low 0777 bits are honoured. (? set-id bits?) */
+	perm = (mode_t)req->lr_req.tlcreate.mode & 0777;
+
+	rc = fs_icreate(softc, req->lr_fid, req->lr_req.tlcreate.name,
+	    oflags, false, perm, req->lr_req.tlcreate.gid, &sb);
+	if (rc == 0)
+		generate_qid(&sb, &req->lr_resp.rlcreate.qid);
+
+	req->lr_resp.rlcreate.iounit = req->lr_conn->lc_max_io_size;
+	return (rc);
+}
+
+/*
+ * Tsymlink: create a symbolic link in the directory fid via
+ * fs_isymlink() and, on success, return its qid.
+ */
+static int
+fs_symlink(void *softc, struct l9p_request *req)
+{
+	struct stat sb;
+	int rc;
+
+	rc = fs_isymlink(softc, req->lr_fid, req->lr_req.tsymlink.name,
+	    req->lr_req.tsymlink.symtgt, req->lr_req.tsymlink.gid, &sb);
+	if (rc != 0)
+		return (rc);
+
+	generate_qid(&sb, &req->lr_resp.rsymlink.qid);
+	return (0);
+}
+
+/*
+ * Tmknod: create a special file in the directory fid.
+ *
+ * The file-type bits of the requested mode select the kind of node:
+ * block and character devices go to fs_imknod(), FIFOs to
+ * fs_imkfifo(), and sockets to fs_imksocket().  Any other type is
+ * rejected with EINVAL.  Linux-style permissions are used (no plan9
+ * masking).  On success the qid of the new node is returned.
+ */
+static int
+fs_mknod(void *softc, struct l9p_request *req)
+{
+	struct l9p_fid *dir;
+	struct stat st;
+	uint32_t mode, major, minor;
+	dev_t dev;
+	gid_t gid;
+	char *name;
+	int error;
+
+	dir = req->lr_fid;
+	name = req->lr_req.tmknod.name;
+	mode = req->lr_req.tmknod.mode;
+	gid = req->lr_req.tmknod.gid;
+
+	switch (mode & S_IFMT) {
+	case S_IFBLK:
+	case S_IFCHR:
+		mode = (mode & S_IFMT) | (mode & 0777);	/* ??? */
+		/* The device number is built from both request fields. */
+		major = req->lr_req.tmknod.major;
+		minor = req->lr_req.tmknod.minor;
+		dev = makedev(major, minor);
+		error = fs_imknod(softc, dir, name, false,
+		    (mode_t)mode, dev, gid, &st);
+		break;
+
+	case S_IFIFO:
+		error = fs_imkfifo(softc, dir, name, false,
+		    (mode_t)(mode & 0777), gid, &st);
+		break;
+
+	case S_IFSOCK:
+		error = fs_imksocket(softc, dir, name, false,
+		    (mode_t)(mode & 0777), gid, &st);
+		break;
+
+	default:
+		error = EINVAL;
+		break;
+	}
+	if (error == 0)
+		generate_qid(&st, &req->lr_resp.rmknod.qid);
+	return (error);
+}
+
+/*
+ * Trename: move the file behind req->lr_fid into the directory behind
+ * req->lr_fid2 under the name req->lr_req.trename.name.
+ *
+ * Refused with EROFS on a read-only file system.  When the target
+ * directory differs from the current parent, unlink access on the old
+ * parent is checked; add-file/add-subdirectory access on the target
+ * directory is always checked.  On success the fid's stored name is
+ * updated and its cached ACL is dropped.
+ *
+ * Returns 0 or an errno value.
+ */
+static int
+fs_rename(void *softc, struct l9p_request *req)
+{
+ struct fs_softc *sc = softc;
+ struct fs_authinfo *ai;
+ struct l9p_acl *oparent_acl;
+ struct l9p_fid *fid, *f2;
+ struct fs_fid *file, *f2ff;
+ struct stat cst, opst, npst;
+ int32_t op;
+ bool reparenting;
+ char *tmp;
+ char olddir[MAXPATHLEN], newname[MAXPATHLEN];
+ int error;
+
+ if (sc->fs_readonly)
+ return (EROFS);
+
+ /*
+ * Note: lr_fid represents the file that is to be renamed,
+ * so we must locate its parent directory and verify that
+ * both this parent directory and the new directory f2 are
+ * writable. But if the new parent directory is the same
+ * path as the old parent directory, our job is simpler.
+ */
+ fid = req->lr_fid;
+ file = fid->lo_aux;
+ assert(file != NULL);
+ ai = file->ff_ai;
+
+ error = fs_pdir(sc, fid, olddir, sizeof(olddir), &opst);
+ if (error)
+ return (error);
+
+ f2 = req->lr_fid2;
+ f2ff = f2->lo_aux;
+ assert(f2ff != NULL);
+
+ /* Same parent is decided purely by comparing the path strings. */
+ reparenting = strcmp(olddir, f2ff->ff_name) != 0;
+
+ fillacl(file);
+ fillacl(f2ff);
+
+ if (fstatat(file->ff_dirfd, file->ff_name, &cst,
+ AT_SYMLINK_NOFOLLOW) != 0)
+ return (errno);
+
+ /*
+ * Are we moving from olddir? If so, we're unlinking
+ * from it, in terms of ACL access.
+ */
+ if (reparenting) {
+ oparent_acl = getacl(file, -1, olddir);
+ error = check_access(L9P_ACOP_UNLINK,
+ oparent_acl, &opst, file->ff_acl, &cst, ai, (gid_t)-1);
+ l9p_acl_free(oparent_acl);
+ if (error)
+ return (error);
+ }
+
+ /*
+ * Now check that we're allowed to "create" a file or directory in
+ * f2. (Should we do this, too, only if reparenting? Maybe check
+ * for dir write permission if not reparenting -- but that's just
+ * add-file/add-subdir, which means doing this always.)
+ */
+ if (fstatat(f2ff->ff_dirfd, f2ff->ff_name, &npst,
+ AT_SYMLINK_NOFOLLOW) != 0)
+ return (errno);
+
+ op = S_ISDIR(cst.st_mode) ? L9P_ACE_ADD_SUBDIRECTORY : L9P_ACE_ADD_FILE;
+ error = check_access(op, f2ff->ff_acl, &npst, NULL, NULL,
+ ai, (gid_t)-1);
+ if (error)
+ return (error);
+
+ /*
+ * Directories OK, file systems not R/O, etc; build final name.
+ * f2ff->ff_name cannot exceed MAXPATHLEN, but out of general
+ * paranoia, let's double check anyway.
+ */
+ if (strlcpy(newname, f2ff->ff_name, sizeof(newname)) >= sizeof(newname))
+ return (ENAMETOOLONG);
+ error = fs_dpf(newname, req->lr_req.trename.name, sizeof(newname));
+ if (error)
+ return (error);
+ /* Heap copy of the new path; becomes ff_name on success. */
+ tmp = strdup(newname);
+ if (tmp == NULL)
+ return (ENOMEM);
+
+ /*
+ * NOTE(review): the new name was built from f2ff's path but is
+ * resolved against file->ff_dirfd rather than f2ff->ff_dirfd;
+ * presumably both fids share one directory fd -- confirm.
+ */
+ if (renameat(file->ff_dirfd, file->ff_name, file->ff_dirfd, tmp) != 0) {
+ error = errno;
+ free(tmp);
+ return (error);
+ }
+
+ /* file has been renamed but old fid is not clunked */
+ free(file->ff_name);
+ file->ff_name = tmp;
+
+ dropacl(file);
+ return (0);
+}
+
+/*
+ * Treadlink: read the target of the symbolic link named by the fid and
+ * hand it back as a newly allocated string in rreadlink.target.
+ *
+ * Returns 0 on success, the readlinkat() errno if the link cannot be
+ * read, or ENOMEM if the target fills the local buffer or cannot be
+ * duplicated.
+ */
+static int
+fs_readlink(void *softc __unused, struct l9p_request *req)
+{
+	struct fs_fid *ff;
+	char target[MAXPATHLEN];
+	ssize_t n;
+
+	ff = req->lr_fid->lo_aux;
+	assert(ff);
+
+	n = readlinkat(ff->ff_dirfd, ff->ff_name, target, sizeof(target));
+	if (n < 0)
+		return (errno);
+
+	/* A completely filled buffer is treated as "does not fit". */
+	if ((size_t)n >= sizeof(target))
+		return (ENOMEM);	/* todo: allocate dynamically */
+
+	/* readlinkat() does not NUL-terminate; strndup() does. */
+	req->lr_resp.rreadlink.target = strndup(target, (size_t)n);
+	if (req->lr_resp.rreadlink.target == NULL)
+		return (ENOMEM);
+
+	return (0);
+}
+
+/*
+ * Tgetattr: stat the file named by the fid (without following a final
+ * symlink) and copy the fields selected by tgetattr.request_mask into
+ * the rgetattr response.
+ *
+ * "valid" accumulates one bit per field group actually provided and is
+ * stored in rgetattr.valid on every exit path; if the stat fails it is
+ * left at 0 and the errno from fstatat() is returned.
+ *
+ * NOTE(review): softc is tagged __unused but is passed to getcrtime()
+ * in the __illumos__ branch below.
+ */
+static int
+fs_getattr(void *softc __unused, struct l9p_request *req)
+{
+ uint64_t mask, valid;
+ struct fs_fid *file;
+ struct stat st;
+ int error = 0;
+
+ file = req->lr_fid->lo_aux;
+ assert(file);
+
+ valid = 0;
+ if (fstatat(file->ff_dirfd, file->ff_name, &st, AT_SYMLINK_NOFOLLOW)) {
+ error = errno;
+ goto out;
+ }
+ /* ?? Can we provide items not-requested? If so, can skip tests. */
+ mask = req->lr_req.tgetattr.request_mask;
+ if (mask & L9PL_GETATTR_MODE) {
+ /* It is not clear if we need any translations. */
+ req->lr_resp.rgetattr.mode = st.st_mode;
+ valid |= L9PL_GETATTR_MODE;
+ }
+ if (mask & L9PL_GETATTR_NLINK) {
+ req->lr_resp.rgetattr.nlink = st.st_nlink;
+ valid |= L9PL_GETATTR_NLINK;
+ }
+ if (mask & L9PL_GETATTR_UID) {
+ /* provide st_uid, or file->ff_uid? */
+ req->lr_resp.rgetattr.uid = st.st_uid;
+ valid |= L9PL_GETATTR_UID;
+ }
+ if (mask & L9PL_GETATTR_GID) {
+ /* provide st_gid, or file->ff_gid? */
+ req->lr_resp.rgetattr.gid = st.st_gid;
+ valid |= L9PL_GETATTR_GID;
+ }
+ if (mask & L9PL_GETATTR_RDEV) {
+ /* It is not clear if we need any translations. */
+ req->lr_resp.rgetattr.rdev = (uint64_t)st.st_rdev;
+ valid |= L9PL_GETATTR_RDEV;
+ }
+ if (mask & L9PL_GETATTR_ATIME) {
+ req->lr_resp.rgetattr.atime_sec =
+ (uint64_t)STAT_ATIME(&st).tv_sec;
+ req->lr_resp.rgetattr.atime_nsec =
+ (uint64_t)STAT_ATIME(&st).tv_nsec;
+ valid |= L9PL_GETATTR_ATIME;
+ }
+ if (mask & L9PL_GETATTR_MTIME) {
+ req->lr_resp.rgetattr.mtime_sec =
+ (uint64_t)STAT_MTIME(&st).tv_sec;
+ req->lr_resp.rgetattr.mtime_nsec =
+ (uint64_t)STAT_MTIME(&st).tv_nsec;
+ valid |= L9PL_GETATTR_MTIME;
+ }
+ if (mask & L9PL_GETATTR_CTIME) {
+ req->lr_resp.rgetattr.ctime_sec =
+ (uint64_t)STAT_CTIME(&st).tv_sec;
+ req->lr_resp.rgetattr.ctime_nsec =
+ (uint64_t)STAT_CTIME(&st).tv_nsec;
+ valid |= L9PL_GETATTR_CTIME;
+ }
+ if (mask & L9PL_GETATTR_BTIME) {
+ /*
+ * Birth time comes from struct stat where available, from
+ * getcrtime() on illumos, and is reported as zero otherwise.
+ * The BTIME bit is set in "valid" in all three cases.
+ */
+#if defined(HAVE_BIRTHTIME)
+ req->lr_resp.rgetattr.btime_sec =
+ (uint64_t)st.st_birthtim.tv_sec;
+ req->lr_resp.rgetattr.btime_nsec =
+ (uint64_t)st.st_birthtim.tv_nsec;
+#elif defined(__illumos__)
+ getcrtime(softc, file->ff_dirfd, file->ff_name,
+ &req->lr_resp.rgetattr.btime_sec,
+ &req->lr_resp.rgetattr.btime_nsec);
+#else
+ req->lr_resp.rgetattr.btime_sec = 0;
+ req->lr_resp.rgetattr.btime_nsec = 0;
+#endif
+ valid |= L9PL_GETATTR_BTIME;
+ }
+ /*
+ * No separate inode field is stored here; only the valid bit is
+ * set. Presumably the client takes the inode from the qid filled
+ * in below -- TODO confirm.
+ */
+ if (mask & L9PL_GETATTR_INO)
+ valid |= L9PL_GETATTR_INO;
+ if (mask & L9PL_GETATTR_SIZE) {
+ req->lr_resp.rgetattr.size = (uint64_t)st.st_size;
+ valid |= L9PL_GETATTR_SIZE;
+ }
+ if (mask & L9PL_GETATTR_BLOCKS) {
+ req->lr_resp.rgetattr.blksize = (uint64_t)st.st_blksize;
+ req->lr_resp.rgetattr.blocks = (uint64_t)st.st_blocks;
+ valid |= L9PL_GETATTR_BLOCKS;
+ }
+#ifndef __illumos__
+ if (mask & L9PL_GETATTR_GEN) {
+ req->lr_resp.rgetattr.gen = st.st_gen;
+ valid |= L9PL_GETATTR_GEN;
+ }
+#endif
+ /* don't know what to do with data version yet */
+
+ /* The qid is always generated, regardless of the request mask. */
+ generate_qid(&st, &req->lr_resp.rgetattr.qid);
+out:
+ req->lr_resp.rgetattr.valid = valid;
+ return (error);
+}
+
+/*
+ * Tsetattr: apply the attribute changes selected by tsetattr.valid to
+ * the file named by the fid.  Changes are applied in this order: mode,
+ * owner/group, size, then access/modification times.  The first
+ * failing step aborts the request and its errno is returned; earlier
+ * steps are not rolled back (see the atomicity note below).
+ *
+ * Returns 0 on success, EROFS for a read-only export, EISDIR when a
+ * size change is requested on a directory, or the errno of the failing
+ * system call.
+ *
+ * Should combine some of this with wstat code.
+ */
+static int
+fs_setattr(void *softc, struct l9p_request *req)
+{
+	uint64_t mask;
+	struct fs_softc *sc = softc;
+	struct timespec ts[2];
+	struct fs_fid *file;
+	struct stat st;
+	int error = 0;
+	int fd;
+	uid_t uid;
+	gid_t gid;
+
+	file = req->lr_fid->lo_aux;
+	assert(file);
+
+	if (sc->fs_readonly)
+		return (EROFS);
+
+	/*
+	 * As with WSTAT we have atomicity issues.
+	 */
+	mask = req->lr_req.tsetattr.valid;
+
+	if (fstatat(file->ff_dirfd, file->ff_name, &st, AT_SYMLINK_NOFOLLOW)) {
+		error = errno;
+		goto out;
+	}
+
+	if ((mask & L9PL_SETATTR_SIZE) && S_ISDIR(st.st_mode)) {
+		error = EISDIR;
+		goto out;
+	}
+
+	if (mask & L9PL_SETATTR_MODE) {
+		/* Only the permission bits are taken from the request. */
+		if (fchmodat(file->ff_dirfd, file->ff_name,
+		    req->lr_req.tsetattr.mode & 0777,
+		    0)) {
+			error = errno;
+			goto out;
+		}
+	}
+
+	if (mask & (L9PL_SETATTR_UID | L9PL_SETATTR_GID)) {
+		/* An id of -1 tells fchownat() to leave that id unchanged. */
+		uid = mask & L9PL_SETATTR_UID
+		    ? req->lr_req.tsetattr.uid
+		    : (uid_t)-1;
+
+		gid = mask & L9PL_SETATTR_GID
+		    ? req->lr_req.tsetattr.gid
+		    : (gid_t)-1;
+
+		if (fchownat(file->ff_dirfd, file->ff_name, uid, gid,
+		    AT_SYMLINK_NOFOLLOW)) {
+			error = errno;
+			goto out;
+		}
+	}
+
+	if (mask & L9PL_SETATTR_SIZE) {
+		/* Truncate follows symlinks, is this OK? */
+		fd = openat(file->ff_dirfd, file->ff_name, O_RDWR);
+		if (fd < 0) {
+			/*
+			 * Report the real reason the open failed instead of
+			 * the EBADF that ftruncate(-1) would produce.
+			 */
+			error = errno;
+			goto out;
+		}
+		if (ftruncate(fd, (off_t)req->lr_req.tsetattr.size)) {
+			error = errno;
+			(void) close(fd);
+			goto out;
+		}
+		(void) close(fd);
+	}
+
+	if (mask & (L9PL_SETATTR_ATIME | L9PL_SETATTR_MTIME)) {
+		/* Start from the current times so an unselected one is kept. */
+		ts[0].tv_sec = STAT_ATIME(&st).tv_sec;
+		ts[0].tv_nsec = STAT_ATIME(&st).tv_nsec;
+		ts[1].tv_sec = STAT_MTIME(&st).tv_sec;
+		ts[1].tv_nsec = STAT_MTIME(&st).tv_nsec;
+
+		if (mask & L9PL_SETATTR_ATIME) {
+			if (mask & L9PL_SETATTR_ATIME_SET) {
+				ts[0].tv_sec = req->lr_req.tsetattr.atime_sec;
+				ts[0].tv_nsec = req->lr_req.tsetattr.atime_nsec;
+			} else {
+				/* No explicit value supplied: use "now". */
+				if (clock_gettime(CLOCK_REALTIME, &ts[0]) != 0) {
+					error = errno;
+					goto out;
+				}
+			}
+		}
+
+		if (mask & L9PL_SETATTR_MTIME) {
+			if (mask & L9PL_SETATTR_MTIME_SET) {
+				ts[1].tv_sec = req->lr_req.tsetattr.mtime_sec;
+				ts[1].tv_nsec = req->lr_req.tsetattr.mtime_nsec;
+			} else {
+				/* No explicit value supplied: use "now". */
+				if (clock_gettime(CLOCK_REALTIME, &ts[1]) != 0) {
+					error = errno;
+					goto out;
+				}
+			}
+		}
+
+		if (utimensat(file->ff_dirfd, file->ff_name, ts,
+		    AT_SYMLINK_NOFOLLOW)) {
+			error = errno;
+			goto out;
+		}
+	}
+out:
+	return (error);
+}
+
+/*
+ * Txattrwalk is not implemented by this backend: both arguments are
+ * ignored and EOPNOTSUPP is always returned.
+ */
+static int
+fs_xattrwalk(void *softc __unused, struct l9p_request *req __unused)
+{
+ return (EOPNOTSUPP);
+}
+
+/*
+ * Txattrcreate is not implemented by this backend: both arguments are
+ * ignored and EOPNOTSUPP is always returned.
+ */
+static int
+fs_xattrcreate(void *softc __unused, struct l9p_request *req __unused)
+{
+ return (EOPNOTSUPP);
+}
+
+/*
+ * Treaddir: pack directory entries from the fid's open directory
+ * stream into the response message, starting at the client-supplied
+ * offset (0 rewinds the stream).
+ *
+ * The fid's ff_mtx is held while the stream is positioned and read.
+ * Entries are packed until readdir() is exhausted or l9p_pudirent()
+ * reports failure; io.count is set from msg.lm_size as of the last
+ * entry that packed completely.  Entries that cannot be lstat'ed are
+ * skipped silently.
+ *
+ * Returns ENOTDIR if the fid has no directory stream, the error from
+ * pthread_mutex_lock() if the lock cannot be taken, otherwise 0.
+ */
+static int
+fs_readdir(void *softc __unused, struct l9p_request *req)
+{
+ struct l9p_message msg;
+ struct l9p_dirent de;
+ struct fs_fid *file;
+ struct dirent *dp;
+ struct stat st;
+ uint32_t count;
+ int error = 0;
+
+ file = req->lr_fid->lo_aux;
+ assert(file);
+
+ if (file->ff_dir == NULL)
+ return (ENOTDIR);
+
+ if ((error = pthread_mutex_lock(&file->ff_mtx)) != 0)
+ return (error);
+
+ /*
+ * It's not clear whether we can use the same trick for
+ * discarding offsets here as we do in fs_read. It
+ * probably should work, we'll have to see if some
+ * client(s) use the zero-offset thing to rescan without
+ * clunking the directory first.
+ *
+ * Probably the thing to do is switch to calling
+ * getdirentries() / getdents() directly, instead of
+ * going through libc.
+ */
+ if (req->lr_req.io.offset == 0)
+ rewinddir(file->ff_dir);
+ else
+ seekdir(file->ff_dir, (long)req->lr_req.io.offset);
+
+ l9p_init_msg(&msg, req, L9P_PACK);
+ count = (uint32_t)msg.lm_size; /* in case we get no entries */
+ while ((dp = readdir(file->ff_dir)) != NULL) {
+ /*
+ * Although "." is forbidden in naming and ".." is
+ * special cased, testing shows that we must transmit
+ * them through readdir. (For ".." at root, we
+ * should perhaps alter the inode number, but not
+ * yet.)
+ */
+
+ /*
+ * TODO: we do a full lstat here; could use dp->d_*
+ * to construct the qid more efficiently, as long
+ * as dp->d_type != DT_UNKNOWN.
+ */
+ if (fs_lstatat(file, dp->d_name, &st))
+ continue;
+
+ de.qid.type = 0;
+ generate_qid(&st, &de.qid);
+ /* The stored offset is the stream position after this entry. */
+ de.offset = (uint64_t)telldir(file->ff_dir);
+#ifdef __illumos__
+ /* The entry type is derived from the stat mode on illumos. */
+ de.type = st.st_mode & S_IFMT;
+#else
+ de.type = dp->d_type;
+#endif
+ de.name = dp->d_name;
+
+ /* Update count only if we completely pack the dirent. */
+ if (l9p_pudirent(&msg, &de) < 0)
+ break;
+ count = (uint32_t)msg.lm_size;
+ }
+
+ (void) pthread_mutex_unlock(&file->ff_mtx);
+ req->lr_resp.io.count = count;
+ return (error);
+}
+
+/*
+ * Tfsync: flush the object behind the fid to stable storage.  A fid
+ * with an open directory stream is synced through that stream's
+ * descriptor; any other fid is synced through ff_fd.
+ *
+ * Returns 0 on success or the errno from fsync().
+ */
+static int
+fs_fsync(void *softc __unused, struct l9p_request *req)
+{
+	struct fs_fid *ff;
+	int syncfd;
+
+	ff = req->lr_fid->lo_aux;
+	assert(ff);
+
+	if (ff->ff_dir != NULL)
+		syncfd = dirfd(ff->ff_dir);
+	else
+		syncfd = ff->ff_fd;
+
+	if (fsync(syncfd) != 0)
+		return (errno);
+	return (0);
+}
+
+/*
+ * Tlock: no lock is actually taken.  A request carrying one of the
+ * three known lock types is answered with L9PL_LOCK_SUCCESS; any other
+ * type is rejected with EINVAL.
+ */
+static int
+fs_lock(void *softc __unused, struct l9p_request *req)
+{
+
+	switch (req->lr_req.tlock.type) {
+	case L9PL_LOCK_TYPE_RDLOCK:
+	case L9PL_LOCK_TYPE_WRLOCK:
+	case L9PL_LOCK_TYPE_UNLOCK:
+		/* Recognised type: report success without locking. */
+		req->lr_resp.rlock.status = L9PL_LOCK_SUCCESS;
+		return (0);
+	default:
+		return (EINVAL);
+	}
+}
+
+/*
+ * Tgetlock: answer a "would this lock block?" query.  The request is
+ * echoed back with its type forced to L9PL_LOCK_TYPE_UNLOCK and an
+ * empty, newly allocated client_id.
+ *
+ * Returns 0 on success, EINVAL for an unknown lock type, or ENOMEM if
+ * the client_id string cannot be allocated.
+ */
+static int
+fs_getlock(void *softc __unused, struct l9p_request *req)
+{
+
+	/*
+	 * Client wants to see if a request to lock a region would
+	 * block.  This is, of course, not atomic anyway, so the
+	 * op is useless.  QEMU simply says "unlocked!", so we do
+	 * too.
+	 */
+	switch (req->lr_req.getlock.type) {
+	case L9PL_LOCK_TYPE_RDLOCK:
+	case L9PL_LOCK_TYPE_WRLOCK:
+	case L9PL_LOCK_TYPE_UNLOCK:
+		break;
+	default:
+		return (EINVAL);
+	}
+
+	req->lr_resp.getlock = req->lr_req.getlock;
+	req->lr_resp.getlock.type = L9PL_LOCK_TYPE_UNLOCK;
+	/*
+	 * The struct copy above also copied the request's client_id
+	 * pointer, so it is overwritten unconditionally: on allocation
+	 * failure the response holds NULL rather than an alias of the
+	 * request's string.
+	 */
+	req->lr_resp.getlock.client_id = strdup("");	/* XXX what should go here? */
+	if (req->lr_resp.getlock.client_id == NULL)
+		return (ENOMEM);
+	return (0);
+}
+
+/*
+ * Tlink: create a hard link to the file named by lr_fid, called
+ * tlink.name, inside the directory named by lr_fid2.
+ *
+ * Returns 0 on success, EROFS for a read-only export, EISDIR if the
+ * source is a directory, the error from fs_buildname() or
+ * check_access(), or the errno of a failing fstatat()/linkat().
+ */
+static int
+fs_link(void *softc, struct l9p_request *req)
+{
+	struct fs_softc *sc = softc;
+	struct l9p_fid *dir;
+	struct fs_fid *file;
+	struct fs_fid *dirf;
+	struct stat fst, tdst;
+	char *name;
+	char newname[MAXPATHLEN];
+	int error;
+
+	/* Creating a link modifies the tree; refuse on read-only exports. */
+	if (sc->fs_readonly)
+		return (EROFS);
+
+	/* N.B.: lr_fid is the file to link, lr_fid2 is the target dir */
+	dir = req->lr_fid2;
+	dirf = dir->lo_aux;
+	assert(dirf != NULL);
+
+	name = req->lr_req.tlink.name;
+	error = fs_buildname(dir, name, newname, sizeof(newname));
+	if (error)
+		return (error);
+
+	file = req->lr_fid->lo_aux;
+	assert(file != NULL);
+
+	if (fstatat(dirf->ff_dirfd, dirf->ff_name, &tdst, AT_SYMLINK_NOFOLLOW) != 0 ||
+	    fstatat(file->ff_dirfd, file->ff_name, &fst, AT_SYMLINK_NOFOLLOW) != 0)
+		return (errno);
+	if (S_ISDIR(fst.st_mode))
+		return (EISDIR);
+	fillacl(dirf);
+	/*
+	 * Directories were rejected above, so the operation being
+	 * checked against the target directory is always "add file".
+	 */
+	error = check_access(L9P_ACE_ADD_FILE,
+	    dirf->ff_acl, &tdst, NULL, NULL, file->ff_ai, (gid_t)-1);
+	if (error)
+		return (error);
+
+	/*
+	 * newname was built from the target directory fid, so it is
+	 * resolved against that fid's dirfd.
+	 */
+	if (linkat(file->ff_dirfd, file->ff_name, dirf->ff_dirfd,
+	    newname, 0) != 0)
+		error = errno;
+	else
+		dropacl(file);
+
+	return (error);
+}
+
+/*
+ * Tmkdir: create directory tmkdir.name under the directory fid with
+ * the requested mode and gid by delegating to fs_imkdir(), then build
+ * the response qid from the stat data it returns.
+ *
+ * Returns whatever fs_imkdir() returns; the qid is only filled in when
+ * that is 0.
+ */
+static int
+fs_mkdir(void *softc, struct l9p_request *req)
+{
+	struct stat sb;
+	mode_t mode = (mode_t)req->lr_req.tmkdir.mode;
+	gid_t group = req->lr_req.tmkdir.gid;
+	int error;
+
+	error = fs_imkdir(softc, req->lr_fid, req->lr_req.tmkdir.name, false,
+	    mode, group, &sb);
+	if (error != 0)
+		return (error);
+
+	generate_qid(&sb, &req->lr_resp.rmkdir.qid);
+	return (0);
+}
+
+/*
+ * Trenameat: rename trenameat.oldname in the directory named by lr_fid
+ * to trenameat.newname in the directory named by lr_fid2.
+ *
+ * The old directory must be a directory.  When the two directory fids
+ * differ and their ff_name paths differ ("reparenting"), the new
+ * directory must also be a directory and two access checks are made:
+ * unlink from the old directory and add-file/add-subdirectory in the
+ * new one.  No access check is made here when not reparenting.
+ *
+ * Returns 0 on success, EROFS for a read-only export, ENOTDIR if a
+ * directory fid does not name a directory, the error from
+ * fs_buildname() or check_access(), or the errno of a failing
+ * fstatat()/renameat().
+ */
+static int
+fs_renameat(void *softc, struct l9p_request *req)
+{
+ struct fs_softc *sc = softc;
+ struct l9p_fid *olddir, *newdir;
+ struct l9p_acl *facl;
+ struct fs_fid *off, *nff;
+ struct stat odst, ndst, fst;
+ int32_t op;
+ bool reparenting;
+ char *onp, *nnp;
+ char onb[MAXPATHLEN], nnb[MAXPATHLEN];
+ int error;
+
+ if (sc->fs_readonly)
+ return (EROFS);
+
+ olddir = req->lr_fid;
+ newdir = req->lr_fid2;
+ assert(olddir != NULL && newdir != NULL);
+ off = olddir->lo_aux;
+ nff = newdir->lo_aux;
+ assert(off != NULL && nff != NULL);
+
+ /* Build both full names up front; onb/nnb are used for every step below. */
+ onp = req->lr_req.trenameat.oldname;
+ nnp = req->lr_req.trenameat.newname;
+ error = fs_buildname(olddir, onp, onb, sizeof(onb));
+ if (error)
+ return (error);
+ error = fs_buildname(newdir, nnp, nnb, sizeof(nnb));
+ if (error)
+ return (error);
+ /* fst describes the object being renamed. */
+ if (fstatat(off->ff_dirfd, onb, &fst, AT_SYMLINK_NOFOLLOW) != 0)
+ return (errno);
+
+ reparenting = olddir != newdir &&
+ strcmp(off->ff_name, nff->ff_name) != 0;
+
+ if (fstatat(off->ff_dirfd, off->ff_name, &odst, AT_SYMLINK_NOFOLLOW) != 0)
+ return (errno);
+ if (!S_ISDIR(odst.st_mode))
+ return (ENOTDIR);
+ fillacl(off);
+
+ if (reparenting) {
+ if (fstatat(nff->ff_dirfd, nff->ff_name, &ndst, AT_SYMLINK_NOFOLLOW) != 0)
+ return (errno);
+ if (!S_ISDIR(ndst.st_mode))
+ return (ENOTDIR);
+ /* facl is a temporary ACL for the renamed object; freed right after use. */
+ facl = getacl(off, -1, onb);
+ fillacl(nff);
+
+ error = check_access(L9P_ACOP_UNLINK,
+ off->ff_acl, &odst, facl, &fst, off->ff_ai, (gid_t)-1);
+ l9p_acl_free(facl);
+ if (error)
+ return (error);
+ op = S_ISDIR(fst.st_mode) ? L9P_ACE_ADD_SUBDIRECTORY :
+ L9P_ACE_ADD_FILE;
+ error = check_access(op,
+ nff->ff_acl, &ndst, NULL, NULL, nff->ff_ai, (gid_t)-1);
+ if (error)
+ return (error);
+ }
+
+ /* error is 0 here unless renameat() fails. */
+ if (renameat(off->ff_dirfd, onb, nff->ff_dirfd, nnb))
+ error = errno;
+
+ return (error);
+}
+
+/*
+ * Tunlinkat: remove tunlinkat.name from the directory named by the
+ * fid.  When L9PL_AT_REMOVEDIR is set in the request flags the entry
+ * is removed as a directory, otherwise as a non-directory.
+ *
+ * Returns 0 on success, EROFS for a read-only export, the error from
+ * fs_buildname() or check_access(), or the errno of a failing
+ * fstatat()/unlinkat().
+ */
+static int
+fs_unlinkat(void *softc, struct l9p_request *req)
+{
+	struct fs_softc *sc = softc;
+	struct l9p_fid *dir = req->lr_fid;
+	struct fs_fid *dirff;
+	struct l9p_acl *victim_acl;
+	struct stat dirst, fst;
+	char path[MAXPATHLEN];
+	int atflag;
+	int error;
+
+	if (sc->fs_readonly)
+		return (EROFS);
+
+	dirff = dir->lo_aux;
+	assert(dirff != NULL);
+
+	error = fs_buildname(dir, req->lr_req.tunlinkat.name, path,
+	    sizeof(path));
+	if (error)
+		return (error);
+
+	/* Stat the victim first, then its directory. */
+	if (fstatat(dirff->ff_dirfd, path, &fst, AT_SYMLINK_NOFOLLOW) != 0 ||
+	    fstatat(dirff->ff_dirfd, dirff->ff_name, &dirst, AT_SYMLINK_NOFOLLOW) != 0)
+		return (errno);
+
+	fillacl(dirff);
+	victim_acl = getacl(dirff, -1, path);
+	error = check_access(L9P_ACOP_UNLINK,
+	    dirff->ff_acl, &dirst, victim_acl, &fst, dirff->ff_ai, (gid_t)-1);
+	l9p_acl_free(victim_acl);
+	if (error)
+		return (error);
+
+	/* Translate the 9P flag into the host's unlinkat() flag. */
+	atflag = (req->lr_req.tunlinkat.flags & L9PL_AT_REMOVEDIR) ?
+	    AT_REMOVEDIR : 0;
+	if (unlinkat(dirff->ff_dirfd, path, atflag) != 0)
+		error = errno;
+
+	return (error);
+}
+
+/*
+ * Backend freefid hook: release everything hanging off fid->lo_aux.
+ *
+ * Closes the file descriptor and directory stream if present, destroys
+ * the per-fid mutex, frees the name, ACL and the fs_fid itself, then
+ * drops this fid's reference on its authinfo; the authinfo is freed
+ * when its reference count reaches zero.  A fid with no lo_aux is
+ * ignored.  fid->lo_aux itself is not cleared here.
+ */
+static void
+fs_freefid(void *softc __unused, struct l9p_fid *fid)
+{
+ struct fs_fid *f = fid->lo_aux;
+ struct fs_authinfo *ai;
+ uint32_t newcount;
+
+ if (f == NULL) {
+ /* Nothing to do here */
+ return;
+ }
+
+ if (f->ff_fd != -1)
+ close(f->ff_fd);
+
+ if (f->ff_dir)
+ closedir(f->ff_dir);
+
+ (void) pthread_mutex_destroy(&f->ff_mtx);
+ free(f->ff_name);
+ /* Save the authinfo pointer before f is freed. */
+ ai = f->ff_ai;
+ l9p_acl_free(f->ff_acl);
+ free(f);
+ /* The reference count is only modified under ai_mtx. */
+ (void) pthread_mutex_lock(&ai->ai_mtx);
+ newcount = --ai->ai_refcnt;
+ (void) pthread_mutex_unlock(&ai->ai_mtx);
+ if (newcount == 0) {
+ /*
+ * We *were* the last ref, no one can have gained a ref.
+ */
+ L9P_LOG(L9P_DEBUG, "dropped last ref to authinfo %p",
+ (void *)ai);
+ (void) pthread_mutex_destroy(&ai->ai_mtx);
+ free(ai);
+ } else {
+ L9P_LOG(L9P_DEBUG, "authinfo %p now used by %lu",
+ (void *)ai, (u_long)newcount);
+ }
+}
+
+/*
+ * Create a 9P backend that serves the directory tree open at rootfd.
+ *
+ * On first use this initialises the file-scope fs_attach_mutex (an
+ * error-checking mutex on illumos).  It then allocates the backend
+ * operations table and its softc, records rootfd and the read-only
+ * flag, and prepares password/group database access.
+ *
+ * backendp	receives the new backend on success
+ * rootfd	descriptor stored as the softc's root directory
+ * ro		true to have the mutating operations return EROFS
+ *
+ * Returns 0 on success.  Returns -1 on failure: errno is set explicitly
+ * for mutex setup failures and preserved across cleanup for casper
+ * failures.
+ */
+int
+l9p_backend_fs_init(struct l9p_backend **backendp, int rootfd, bool ro)
+{
+	struct l9p_backend *backend;
+	struct fs_softc *sc;
+	int error;
+#if defined(WITH_CASPER)
+	cap_channel_t *capcas;
+#endif
+
+	if (!fs_attach_mutex_inited) {
+#ifdef __illumos__
+		if ((error = pthread_mutexattr_init(&fs_mutexattr)) != 0) {
+			errno = error;
+			return (-1);
+		}
+		if ((error = pthread_mutexattr_settype(&fs_mutexattr,
+		    PTHREAD_MUTEX_ERRORCHECK)) != 0) {
+			errno = error;
+			return (-1);
+		}
+		error = pthread_mutex_init(&fs_attach_mutex, &fs_mutexattr);
+#else
+		error = pthread_mutex_init(&fs_attach_mutex, NULL);
+#endif
+		if (error) {
+			errno = error;
+			return (-1);
+		}
+		fs_attach_mutex_inited = true;
+	}
+
+	backend = l9p_malloc(sizeof(*backend));
+	backend->attach = fs_attach;
+	backend->clunk = fs_clunk;
+	backend->create = fs_create;
+	backend->open = fs_open;
+	backend->read = fs_read;
+	backend->remove = fs_remove;
+	backend->stat = fs_stat;
+	backend->walk = fs_walk;
+	backend->write = fs_write;
+	backend->wstat = fs_wstat;
+	backend->statfs = fs_statfs;
+	backend->lopen = fs_lopen;
+	backend->lcreate = fs_lcreate;
+	backend->symlink = fs_symlink;
+	backend->mknod = fs_mknod;
+	backend->rename = fs_rename;
+	backend->readlink = fs_readlink;
+	backend->getattr = fs_getattr;
+	backend->setattr = fs_setattr;
+	backend->xattrwalk = fs_xattrwalk;
+	backend->xattrcreate = fs_xattrcreate;
+	backend->readdir = fs_readdir;
+	backend->fsync = fs_fsync;
+	backend->lock = fs_lock;
+	backend->getlock = fs_getlock;
+	backend->link = fs_link;
+	backend->mkdir = fs_mkdir;
+	backend->renameat = fs_renameat;
+	backend->unlinkat = fs_unlinkat;
+	backend->freefid = fs_freefid;
+
+	sc = l9p_malloc(sizeof(*sc));
+	sc->fs_rootfd = rootfd;
+	sc->fs_readonly = ro;
+	backend->softc = sc;
+
+#if defined(__illumos__)
+	/*
+	 * Always store a definite value: sc is not known to be zeroed
+	 * by l9p_malloc(), and getcrtime() tests this flag later.
+	 */
+	sc->fs_hasxattr = (fpathconf(rootfd, _PC_XATTR_ENABLED) > 0) ? 1 : 0;
+#endif
+
+#if defined(WITH_CASPER)
+	/*
+	 * Each failure below releases what has been set up so far and
+	 * keeps the errno of the failing call.
+	 */
+	capcas = cap_init();
+	if (capcas == NULL) {
+		error = errno;
+		free(sc);
+		free(backend);
+		errno = error;
+		return (-1);
+	}
+
+	sc->fs_cappwd = cap_service_open(capcas, "system.pwd");
+	if (sc->fs_cappwd == NULL) {
+		error = errno;
+		cap_close(capcas);
+		free(sc);
+		free(backend);
+		errno = error;
+		return (-1);
+	}
+
+	sc->fs_capgrp = cap_service_open(capcas, "system.grp");
+	if (sc->fs_capgrp == NULL) {
+		error = errno;
+		cap_close(sc->fs_cappwd);
+		cap_close(capcas);
+		free(sc);
+		free(backend);
+		errno = error;
+		return (-1);
+	}
+
+	cap_setpassent(sc->fs_cappwd, 1);
+	cap_setgroupent(sc->fs_capgrp, 1);
+	cap_close(capcas);
+#elif defined(__illumos__)
+	setpwent();
+#else
+	setpassent(1);
+#endif
+
+	*backendp = backend;
+	return (0);
+}
+
+#ifdef __illumos__
+/*
+ * illumos stand-in for acl_get_fd_np(): fetch the ACL of an open file
+ * through facl_get().  For ACL_TYPE_NFS4 the ACL_NO_TRIVIAL flag is
+ * passed; every other type uses no flags.
+ *
+ * Returns the ACL produced by facl_get(), or NULL if facl_get() fails.
+ */
+acl_t *
+acl_get_fd_np(int fd, int type)
+{
+	acl_t *aclp;
+	int flags = (type == ACL_TYPE_NFS4) ? ACL_NO_TRIVIAL : 0;
+
+	if (facl_get(fd, flags, &aclp) != 0)
+		return (NULL);
+
+	return (aclp);
+}
+
+/*
+ * Look up the creation time of fname (relative to dirfd) from its
+ * "crtime" system attribute and store it in *secp / *nsp.
+ *
+ * Both outputs are zeroed first and stay zero when the export has no
+ * extended-attribute support, the attribute list cannot be fetched,
+ * "crtime" is absent, or it does not hold exactly two values.
+ */
+static void
+getcrtime(struct fs_softc *sc, int dirfd, const char *fname, uint64_t *secp,
+    uint64_t *nsp)
+{
+	nvlist_t *attrs = NULL;
+	uint64_t *crtime = NULL;
+	uint_t cnt = 0;
+
+	*secp = 0;
+	*nsp = 0;
+
+	if (!sc->fs_hasxattr)
+		return;
+
+	if (getattrat(dirfd, XATTR_VIEW_READWRITE, fname, &attrs) != 0)
+		return;
+
+	/* "crtime" is used only when it is a two-element array. */
+	if (nvlist_lookup_uint64_array(attrs, "crtime", &crtime, &cnt) == 0 &&
+	    cnt == 2) {
+		*secp = crtime[0];
+		*nsp = crtime[1];
+	}
+
+	nvlist_free(attrs);
+}
+#endif
diff --git a/usr/src/lib/lib9p/common/backend/fs.h b/usr/src/lib/lib9p/common/backend/fs.h
new file mode 100644
index 0000000000..84b37171c2
--- /dev/null
+++ b/usr/src/lib/lib9p/common/backend/fs.h
@@ -0,0 +1,37 @@
+
+/*
+ * Copyright 2016 Chris Torek <torek@ixsystems.com>
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef LIB9P_BACKEND_FS_H
+#define LIB9P_BACKEND_FS_H
+
+#include <stdbool.h>
+#include "backend.h"
+
+/*
+ * Create a file-system backend serving the tree open at rootfd; when
+ * ro is true the backend is marked read-only.  On success stores the
+ * new backend in *backendp and returns 0; returns -1 on failure.
+ */
+int l9p_backend_fs_init(struct l9p_backend **backendp, int rootfd, bool ro);
+
+#endif /* LIB9P_BACKEND_FS_H */
diff --git a/usr/src/lib/lib9p/common/connection.c b/usr/src/lib/lib9p/common/connection.c
new file mode 100644
index 0000000000..20c27796b8
--- /dev/null
+++ b/usr/src/lib/lib9p/common/connection.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright 2016 Jakub Klama <jceel@FreeBSD.org>
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <sys/queue.h>
+#include "lib9p.h"
+#include "lib9p_impl.h"
+#include "fid.h"
+#include "hashtable.h"
+#include "log.h"
+#include "threadpool.h"
+#include "backend/backend.h"
+
+/*
+ * Allocate a server object bound to the given backend, with an empty
+ * connection list and 9P2000.L as its maximum protocol version, and
+ * return it through serverp.  Always returns 0; the allocation result
+ * is not checked here.
+ */
+int
+l9p_server_init(struct l9p_server **serverp, struct l9p_backend *backend)
+{
+	struct l9p_server *srv = l9p_calloc(1, sizeof (*srv));
+
+	LIST_INIT(&srv->ls_conns);
+	srv->ls_backend = backend;
+	srv->ls_max_version = L9P_2000L;
+
+	*serverp = srv;
+	return (0);
+}
+
+/*
+ * Allocate a connection on the given server: default msize, its own
+ * thread pool, and empty fid and request tables.  The new connection
+ * is linked at the head of the server's connection list and returned
+ * through conn.
+ *
+ * Returns 0 on success, or -1 if the allocation or the thread pool
+ * setup fails (nothing is left allocated in that case).
+ */
+int
+l9p_connection_init(struct l9p_server *server, struct l9p_connection **conn)
+{
+	struct l9p_connection *c;
+
+	assert(server != NULL);
+	assert(conn != NULL);
+
+	if ((c = calloc(1, sizeof (*c))) == NULL)
+		return (-1);
+
+	c->lc_server = server;
+	c->lc_msize = L9P_DEFAULT_MSIZE;
+
+	if (l9p_threadpool_init(&c->lc_tp, L9P_NUMTHREADS) != 0) {
+		free(c);
+		return (-1);
+	}
+
+	ht_init(&c->lc_files, 100);
+	ht_init(&c->lc_requests, 100);
+	LIST_INSERT_HEAD(&server->ls_conns, c, lc_link);
+
+	*conn = c;
+	return (0);
+}
+
+/*
+ * Unlink the connection from its server's connection list and free the
+ * connection structure.  Nothing else owned by the connection is
+ * released here.
+ */
+void
+l9p_connection_free(struct l9p_connection *conn)
+{
+
+ LIST_REMOVE(conn, lc_link);
+ free(conn);
+}
+
+/*
+ * Accept one incoming 9P message, described by niov iovecs, and hand
+ * it to the connection's thread pool.
+ *
+ * A request object is allocated, the iovecs are copied into it, the
+ * message is unpacked, the request is registered under its tag, and a
+ * response buffer is obtained from the transport.  If any step fails a
+ * warning is logged and the request is dropped without a reply.
+ *
+ * conn	connection the message arrived on
+ * iov, niov	buffers holding the raw message
+ * aux		opaque transport cookie stored in the request
+ */
+void
+l9p_connection_recv(struct l9p_connection *conn, const struct iovec *iov,
+    const size_t niov, void *aux)
+{
+	struct l9p_request *req;
+	size_t maxiov;
+	int error;
+
+	req = l9p_calloc(1, sizeof (struct l9p_request));
+	req->lr_aux = aux;
+	req->lr_conn = conn;
+
+	/*
+	 * lm_iov is a fixed-size array embedded in the request; refuse
+	 * a message with more iovecs than it can hold rather than
+	 * writing past its end.
+	 */
+	maxiov = sizeof (req->lr_req_msg.lm_iov) /
+	    sizeof (req->lr_req_msg.lm_iov[0]);
+	if (niov > maxiov) {
+		L9P_LOG(L9P_WARNING, "received message has too many iovecs");
+		free(req);
+		return;
+	}
+
+	req->lr_req_msg.lm_mode = L9P_UNPACK;
+	req->lr_req_msg.lm_niov = niov;
+	memcpy(req->lr_req_msg.lm_iov, iov, sizeof (struct iovec) * niov);
+
+	req->lr_resp_msg.lm_mode = L9P_PACK;
+
+	if (l9p_pufcall(&req->lr_req_msg, &req->lr_req, conn->lc_version) != 0) {
+		L9P_LOG(L9P_WARNING, "cannot unpack received message");
+		l9p_freefcall(&req->lr_req);
+		free(req);
+		return;
+	}
+
+	/* A non-zero ht_add() is reported as an already outstanding tag. */
+	if (ht_add(&conn->lc_requests, req->lr_req.hdr.tag, req)) {
+		L9P_LOG(L9P_WARNING, "client reusing outstanding tag %d",
+		    req->lr_req.hdr.tag);
+		l9p_freefcall(&req->lr_req);
+		free(req);
+		return;
+	}
+
+	error = conn->lc_lt.lt_get_response_buffer(req,
+	    req->lr_resp_msg.lm_iov,
+	    &req->lr_resp_msg.lm_niov,
+	    conn->lc_lt.lt_aux);
+	if (error) {
+		L9P_LOG(L9P_WARNING, "cannot obtain buffers for response");
+		ht_remove(&conn->lc_requests, req->lr_req.hdr.tag);
+		l9p_freefcall(&req->lr_req);
+		free(req);
+		return;
+	}
+
+	/*
+	 * NB: it's up to l9p_threadpool_run to decide whether
+	 * to queue the work or to run it immediately and wait
+	 * (it must do the latter for Tflush requests).
+	 */
+	l9p_threadpool_run(&conn->lc_tp, req);
+}
+
+/*
+ * Shut a connection down: stop its thread pool, answer every request
+ * still in the request table through the no-answer path, release every
+ * fid through the backend's freefid hook, and destroy both hash
+ * tables.  The connection structure itself is not freed here.
+ */
+void
+l9p_connection_close(struct l9p_connection *conn)
+{
+ struct ht_iter iter;
+ struct l9p_fid *fid;
+ struct l9p_request *req;
+
+ L9P_LOG(L9P_DEBUG, "waiting for thread pool to shut down");
+ l9p_threadpool_shutdown(&conn->lc_tp);
+
+ /* Drain pending requests (if any) */
+ L9P_LOG(L9P_DEBUG, "draining pending requests");
+ ht_iter(&conn->lc_requests, &iter);
+ while ((req = ht_next(&iter)) != NULL) {
+#ifdef notyet
+ /* XXX would be good to know if there is anyone listening */
+ if (anyone listening) {
+ /* XXX crude - ops like Tclunk should succeed */
+ req->lr_error = EINTR;
+ l9p_respond(req, false, false);
+ } else
+#endif
+ l9p_respond(req, true, false); /* use no-answer path */
+ ht_remove_at_iter(&iter);
+ }
+
+ /* Close opened files (if any) */
+ L9P_LOG(L9P_DEBUG, "closing opened files");
+ ht_iter(&conn->lc_files, &iter);
+ while ((fid = ht_next(&iter)) != NULL) {
+ /* Backend state is released first, then the fid, then its table slot. */
+ conn->lc_server->ls_backend->freefid(
+ conn->lc_server->ls_backend->softc, fid);
+ free(fid);
+ ht_remove_at_iter(&iter);
+ }
+
+ ht_destroy(&conn->lc_requests);
+ ht_destroy(&conn->lc_files);
+}
+
+/*
+ * Allocate a fid structure for fid number "fid" and enter it into the
+ * connection's fid table.
+ *
+ * Returns the new fid, or NULL (with the allocation released) if the
+ * table insert fails.
+ */
+struct l9p_fid *
+l9p_connection_alloc_fid(struct l9p_connection *conn, uint32_t fid)
+{
+	struct l9p_fid *nf;
+
+	nf = l9p_calloc(1, sizeof (struct l9p_fid));
+	nf->lo_fid = fid;
+
+	/*
+	 * Note that the new fid is not marked valid yet.
+	 * The insert here will fail if the fid number is
+	 * in use, otherwise we have an invalid fid in the
+	 * table (as desired).
+	 */
+	if (ht_add(&conn->lc_files, fid, nf) == 0)
+		return (nf);
+
+	free(nf);
+	return (NULL);
+}
+
+/*
+ * Release a fid: let the backend free its per-fid state, remove the
+ * fid from the connection's fid table, and free the structure.  The
+ * caller must already have marked the fid invalid (asserted below).
+ */
+void
+l9p_connection_remove_fid(struct l9p_connection *conn, struct l9p_fid *fid)
+{
+ struct l9p_backend *be;
+
+ /* fid should be marked invalid by this point */
+ assert(!l9p_fid_isvalid(fid));
+
+ be = conn->lc_server->ls_backend;
+ be->freefid(be->softc, fid);
+
+ /* lo_fid is read for the table removal before the fid is freed. */
+ ht_remove(&conn->lc_files, fid->lo_fid);
+ free(fid);
+}
diff --git a/usr/src/lib/lib9p/common/fcall.h b/usr/src/lib/lib9p/common/fcall.h
new file mode 100644
index 0000000000..f779ea6ad5
--- /dev/null
+++ b/usr/src/lib/lib9p/common/fcall.h
@@ -0,0 +1,624 @@
+/*
+ * Copyright 2016 Jakub Klama <jceel@FreeBSD.org>
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * Based on libixp code: ©2007-2010 Kris Maglione <maglione.k at Gmail>
+ */
+
+#ifndef LIB9P_FCALL_H
+#define LIB9P_FCALL_H
+
+#include <stdint.h>
+
+#define L9P_MAX_WELEM 256
+
+/*
+ * Function call/reply (Tfoo/Rfoo) numbers.
+ *
+ * These are protocol code numbers, so the exact values
+ * matter. However, L9P__FIRST and L9P__LAST_PLUS_1 are for
+ * debug code, and just need to encompass the entire range.
+ *
+ * Note that we rely (in the debug code) on Rfoo == Tfoo+1:
+ * every Tfoo below is given an explicit even value and the
+ * Rfoo that follows it takes the next (odd) value implicitly.
+ * Numbers skipped between pairs have no enumerator here.
+ */
+enum l9p_ftype {
+ L9P__FIRST = 6, /* NB: must be <= all legal values */
+ L9P_TLERROR = 6, /* illegal; exists for parity with Rlerror */
+ L9P_RLERROR,
+ L9P_TSTATFS = 8,
+ L9P_RSTATFS,
+ L9P_TLOPEN = 12,
+ L9P_RLOPEN,
+ L9P_TLCREATE = 14,
+ L9P_RLCREATE,
+ L9P_TSYMLINK = 16,
+ L9P_RSYMLINK,
+ L9P_TMKNOD = 18,
+ L9P_RMKNOD,
+ L9P_TRENAME = 20,
+ L9P_RRENAME,
+ L9P_TREADLINK = 22,
+ L9P_RREADLINK,
+ L9P_TGETATTR = 24,
+ L9P_RGETATTR,
+ L9P_TSETATTR = 26,
+ L9P_RSETATTR,
+ L9P_TXATTRWALK = 30,
+ L9P_RXATTRWALK,
+ L9P_TXATTRCREATE = 32,
+ L9P_RXATTRCREATE,
+ L9P_TREADDIR = 40,
+ L9P_RREADDIR,
+ L9P_TFSYNC = 50,
+ L9P_RFSYNC,
+ L9P_TLOCK = 52,
+ L9P_RLOCK,
+ L9P_TGETLOCK = 54,
+ L9P_RGETLOCK,
+ L9P_TLINK = 70,
+ L9P_RLINK,
+ L9P_TMKDIR = 72,
+ L9P_RMKDIR,
+ L9P_TRENAMEAT = 74,
+ L9P_RRENAMEAT,
+ L9P_TUNLINKAT = 76,
+ L9P_RUNLINKAT,
+ L9P_TVERSION = 100,
+ L9P_RVERSION,
+ L9P_TAUTH = 102,
+ L9P_RAUTH,
+ L9P_TATTACH = 104,
+ L9P_RATTACH,
+ L9P_TERROR = 106, /* illegal */
+ L9P_RERROR,
+ L9P_TFLUSH = 108,
+ L9P_RFLUSH,
+ L9P_TWALK = 110,
+ L9P_RWALK,
+ L9P_TOPEN = 112,
+ L9P_ROPEN,
+ L9P_TCREATE = 114,
+ L9P_RCREATE,
+ L9P_TREAD = 116,
+ L9P_RREAD,
+ L9P_TWRITE = 118,
+ L9P_RWRITE,
+ L9P_TCLUNK = 120,
+ L9P_RCLUNK,
+ L9P_TREMOVE = 122,
+ L9P_RREMOVE,
+ L9P_TSTAT = 124,
+ L9P_RSTAT,
+ L9P_TWSTAT = 126,
+ L9P_RWSTAT,
+ L9P__LAST_PLUS_1, /* NB: must be last (one past L9P_RWSTAT) */
+};
+
+/*
+ * When a Tfoo request comes over the wire, we decode it
+ * (pack.c) from wire format into a request laid out in
+ * a "union l9p_fcall" object. This object is not in wire
+ * format, but rather in something more convenient for us
+ * to operate on.
+ *
+ * We then dispatch the request (request.c, backend/fs.c) and
+ * use another "union l9p_fcall" object to build a reply.
+ * The reply is converted to wire format on the way back out
+ * (pack.c again).
+ *
+ * All sub-objects start with a header containing the request
+ * or reply type code and two-byte tag, and whether or not it
+ * is needed, a four-byte fid.
+ *
+ * What this means here is that the data structures within
+ * the union can be shared across various requests and replies.
+ * For instance, replies to OPEN, CREATE, LCREATE, LOPEN, MKDIR, and
+ * SYMLINK are all fairly similar (providing a qid and sometimes
+ * an iounit) and hence can all use the l9p_f_ropen structure.
+ * Which structures are used for which operations is somewhat
+ * arbitrary; for programming ease, if an operation shares a
+ * data structure, it still has its own name: there are union
+ * members named ropen, rcreate, rlcreate, rlopen, rmkdir, and
+ * rsymlink, even though all use struct l9p_f_ropen.
+ *
+ * The big exception to the above rule is struct l9p_f_io, which
+ * is used as both request and reply for all of READ, WRITE, and
+ * READDIR. Moreover, the READDIR reply must be pre-packed into
+ * wire format (it is handled like raw data a la READ).
+ *
+ * Some request messages (e.g., TREADLINK) fit in a header, having
+ * just type code, tag, and fid. These have no separate data
+ * structure, nor union member name. Similarly, some reply
+ * messages (e.g., RCLUNK, RREMOVE, RRENAME) have just the type
+ * code and tag.
+ */
+
+/*
+ * Type code bits in (the first byte of) a qid.
+ *
+ * Each nonzero value is a distinct single bit; L9P_QTFILE is 0,
+ * i.e. none of the type bits set.
+ */
+enum l9p_qid_type {
+ L9P_QTDIR = 0x80, /* type bit for directories */
+ L9P_QTAPPEND = 0x40, /* type bit for append only files */
+ L9P_QTEXCL = 0x20, /* type bit for exclusive use files */
+ L9P_QTMOUNT = 0x10, /* type bit for mounted channel */
+ L9P_QTAUTH = 0x08, /* type bit for authentication file */
+ L9P_QTTMP = 0x04, /* type bit for non-backed-up file */
+ L9P_QTSYMLINK = 0x02, /* type bit for symbolic link */
+ L9P_QTFILE = 0x00 /* type bits for plain file */
+};
+
+/*
+ * Extra permission bits in create and file modes (stat).
+ *
+ * L9P_DMDIR through L9P_DMSYMLINK are the corresponding L9P_QTxxx
+ * qid type bits shifted left by 24.  L9P_DMDIR is a #define rather
+ * than an enumerator -- presumably because 0x80000000 does not fit
+ * in an int-typed enumeration constant (NOTE: confirm intent).
+ */
+#define L9P_DMDIR 0x80000000
+enum {
+ L9P_DMAPPEND = 0x40000000,
+ L9P_DMEXCL = 0x20000000,
+ L9P_DMMOUNT = 0x10000000,
+ L9P_DMAUTH = 0x08000000,
+ L9P_DMTMP = 0x04000000,
+ L9P_DMSYMLINK = 0x02000000,
+ /* 9P2000.u extensions */
+ L9P_DMDEVICE = 0x00800000,
+ L9P_DMNAMEDPIPE = 0x00200000,
+ L9P_DMSOCKET = 0x00100000,
+ L9P_DMSETUID = 0x00080000,
+ L9P_DMSETGID = 0x00040000,
+};
+
+/*
+ * Open/create mode bits in 9P2000 and 9P2000.u operations
+ * (not Linux lopen and lcreate flags, which are different).
+ * Note that the mode field is only one byte wide.
+ *
+ * The access mode is a value (not a bit set) held in the low two
+ * bits, extracted with L9P_OACCMODE; the remaining enumerators are
+ * single bits or'ed in on top of it.
+ */
+enum l9p_omode {
+ L9P_OREAD = 0, /* open for read */
+ L9P_OWRITE = 1, /* write */
+ L9P_ORDWR = 2, /* read and write */
+ L9P_OEXEC = 3, /* execute, == read but check execute permission */
+ L9P_OACCMODE = 3, /* mask for the above access-mode bits */
+ L9P_OTRUNC = 16, /* or'ed in (except for exec), truncate file first */
+ L9P_OCEXEC = 32, /* or'ed in, close on exec */
+ L9P_ORCLOSE = 64, /* or'ed in, remove on close */
+ L9P_ODIRECT = 128, /* or'ed in, direct access */
+};
+
+/*
+ * Flag bits in 9P2000.L operations (Tlopen, Tlcreate). These are
+ * basically just the Linux O_* flags. The access-mode values in the
+ * bottom 2 bits (read, write, read-write) are the same as for
+ * l9p_omode, although open-for-exec is not used:
+ * instead, the client does a Tgetattr and checks the mode for
+ * execute bits, then just opens for reading.
+ *
+ * Each L_O_xxx is just the value O_xxx has on Linux in <fcntl.h>;
+ * not all are necessarily used. From observation, we do get
+ * L_O_CREAT and L_O_EXCL when creating with exclusive, and always
+ * get L_O_LARGEFILE. We do get L_O_APPEND when opening for
+ * append. We also get both L_O_DIRECT and L_O_DIRECTORY set
+ * when opening directories.
+ *
+ * We probably never get L_O_NOCTTY which makes no sense, and
+ * some of the other options may need to be handled on the client.
+ *
+ * NB: the constants below are written in octal.
+ */
+enum l9p_l_o_flags {
+ L9P_L_O_CREAT = 000000100U,
+ L9P_L_O_EXCL = 000000200U,
+ L9P_L_O_NOCTTY = 000000400U,
+ L9P_L_O_TRUNC = 000001000U,
+ L9P_L_O_APPEND = 000002000U,
+ L9P_L_O_NONBLOCK = 000004000U,
+ L9P_L_O_DSYNC = 000010000U,
+ L9P_L_O_FASYNC = 000020000U,
+ L9P_L_O_DIRECT = 000040000U,
+ L9P_L_O_LARGEFILE = 000100000U,
+ L9P_L_O_DIRECTORY = 000200000U,
+ L9P_L_O_NOFOLLOW = 000400000U,
+ L9P_L_O_NOATIME = 001000000U,
+ L9P_L_O_CLOEXEC = 002000000U,
+ L9P_L_O_SYNC = 004000000U,
+ L9P_L_O_PATH = 010000000U,
+ L9P_L_O_TMPFILE = 020000000U,
+};
+
+struct l9p_hdr { /* first member of every l9p_f_* struct below */
+ uint8_t type; /* request or reply type code */
+ uint16_t tag; /* two-byte tag */
+ uint32_t fid; /* four-byte fid; present whether or not needed */
+};
+
+struct l9p_qid { /* decoded (not wire-format) qid */
+ uint8_t type; /* type code bits, see enum l9p_qid_type */
+ uint32_t version;
+ uint64_t path;
+};
+
+struct l9p_stat { /* stat data carried by l9p_f_rstat and l9p_f_twstat */
+ uint16_t type;
+ uint32_t dev;
+ struct l9p_qid qid;
+ uint32_t mode; /* file mode; may carry the L9P_DMxxx bits */
+ uint32_t atime;
+ uint32_t mtime;
+ uint64_t length;
+ char *name;
+ char *uid;
+ char *gid;
+ char *muid;
+ char *extension; /* presumably 9P2000.u only -- TODO confirm in pack.c */
+ uint32_t n_uid; /* presumably numeric form of uid (9P2000.u) -- confirm */
+ uint32_t n_gid; /* presumably numeric form of gid (9P2000.u) -- confirm */
+ uint32_t n_muid; /* presumably numeric form of muid (9P2000.u) -- confirm */
+};
+
+#define L9P_FSTYPE 0x01021997
+
+struct l9p_statfs { /* payload of struct l9p_f_rstatfs */
+ uint32_t type; /* file system type */
+ uint32_t bsize; /* block size for I/O */
+ uint64_t blocks; /* file system size (bsize-byte blocks) */
+ uint64_t bfree; /* free blocks in fs */
+ uint64_t bavail; /* free blocks avail to non-superuser */
+ uint64_t files; /* file nodes in file system (# inodes) */
+ uint64_t ffree; /* free file nodes in fs */
+ uint64_t fsid; /* file system identifier */
+ uint32_t namelen; /* maximum length of filenames */
+};
+
+struct l9p_f_version { /* union l9p_fcall member: version */
+ struct l9p_hdr hdr;
+ uint32_t msize;
+ char *version;
+};
+
+struct l9p_f_tflush { /* union l9p_fcall member: tflush */
+ struct l9p_hdr hdr;
+ uint16_t oldtag; /* presumably the tag of the request to flush -- confirm */
+};
+
+struct l9p_f_error { /* union l9p_fcall member: error */
+ struct l9p_hdr hdr;
+ char *ename;
+ uint32_t errnum;
+};
+
+struct l9p_f_ropen { /* shared reply: ropen, rcreate, rattach, rlopen, rlcreate, rsymlink, rmknod, rmkdir */
+ struct l9p_hdr hdr;
+ struct l9p_qid qid;
+ uint32_t iounit; /* only some of the replies sharing this struct provide it */
+};
+
+struct l9p_f_rauth { /* union l9p_fcall member: rauth */
+ struct l9p_hdr hdr;
+ struct l9p_qid aqid;
+};
+
+struct l9p_f_attach { /* union l9p_fcall members: tattach, tauth */
+ struct l9p_hdr hdr;
+ uint32_t afid; /* L9P_NOFID in a Tattach with no auth fid */
+ char *uname;
+ char *aname;
+ uint32_t n_uname; /* L9P_NONUNAME in a Tattach with no n_uname */
+};
+#define L9P_NOFID ((uint32_t)-1) /* in Tattach, no auth fid */
+#define L9P_NONUNAME ((uint32_t)-1) /* in Tattach, no n_uname */
+
+struct l9p_f_tcreate { /* union l9p_fcall members: tcreate, topen */
+ struct l9p_hdr hdr;
+ uint32_t perm; /* presumably may carry L9P_DMxxx bits -- confirm */
+ char *name;
+ uint8_t mode; /* +Topen; one byte, see enum l9p_omode */
+ char *extension;
+};
+
+struct l9p_f_twalk { /* union l9p_fcall member: twalk */
+ struct l9p_hdr hdr;
+ uint32_t newfid;
+ uint16_t nwname; /* presumably the number of wname[] slots in use -- confirm */
+ char *wname[L9P_MAX_WELEM]; /* room for at most L9P_MAX_WELEM names */
+};
+
+struct l9p_f_rwalk { /* union l9p_fcall member: rwalk */
+ struct l9p_hdr hdr;
+ uint16_t nwqid; /* presumably the number of wqid[] slots in use -- confirm */
+ struct l9p_qid wqid[L9P_MAX_WELEM]; /* room for at most L9P_MAX_WELEM qids */
+};
+
+struct l9p_f_io { /* union member io: request and reply for READ, WRITE, READDIR */
+ struct l9p_hdr hdr;
+ uint64_t offset; /* Tread, Twrite, Treaddir */
+ uint32_t count; /* Tread, Twrite, Rread, Treaddir, Rreaddir */
+};
+
+struct l9p_f_rstat { /* union l9p_fcall member: rstat */
+ struct l9p_hdr hdr;
+ struct l9p_stat stat;
+};
+
+struct l9p_f_twstat { /* union l9p_fcall member: twstat; same layout as l9p_f_rstat */
+ struct l9p_hdr hdr;
+ struct l9p_stat stat;
+};
+
+struct l9p_f_rstatfs { /* union l9p_fcall member: rstatfs */
+ struct l9p_hdr hdr;
+ struct l9p_statfs statfs;
+};
+
+/*
+ * Used for Tlcreate, Tlopen, Tmkdir, Tunlinkat (union l9p_fcall
+ * members tlcreate, tlopen, tmkdir and tunlinkat).  The per-field
+ * comments list which of those requests use each field.  For
+ * Tunlinkat the flags are enum l9p_l_unlinkat_flags values.
+ */
+struct l9p_f_tlcreate {
+ struct l9p_hdr hdr;
+ char *name; /* Tlcreate, Tmkdir, Tunlinkat */
+ uint32_t flags; /* Tlcreate, Tlopen, Tmkdir, Tunlinkat */
+ uint32_t mode; /* Tlcreate, Tmkdir */
+ uint32_t gid; /* Tlcreate, Tmkdir */
+};
+
+struct l9p_f_tsymlink { /* union l9p_fcall member: tsymlink */
+ struct l9p_hdr hdr;
+ char *name;
+ char *symtgt; /* presumably the link target text -- confirm */
+ uint32_t gid;
+};
+
+struct l9p_f_tmknod { /* union l9p_fcall member: tmknod */
+ struct l9p_hdr hdr;
+ char *name;
+ uint32_t mode;
+ uint32_t major;
+ uint32_t minor;
+ uint32_t gid;
+};
+
+struct l9p_f_trename { /* union l9p_fcall member: trename */
+ struct l9p_hdr hdr;
+ uint32_t dfid; /* presumably the destination directory fid -- confirm */
+ char *name;
+};
+
+struct l9p_f_rreadlink { /* union member: rreadlink (Treadlink itself is header-only) */
+ struct l9p_hdr hdr;
+ char *target;
+};
+
+struct l9p_f_tgetattr { /* union l9p_fcall member: tgetattr */
+ struct l9p_hdr hdr;
+ uint64_t request_mask; /* enum l9pl_getattr_flags bits */
+};
+
+struct l9p_f_rgetattr { /* union l9p_fcall member: rgetattr */
+ struct l9p_hdr hdr;
+ uint64_t valid; /* enum l9pl_getattr_flags bits */
+ struct l9p_qid qid;
+ uint32_t mode;
+ uint32_t uid;
+ uint32_t gid;
+ uint64_t nlink;
+ uint64_t rdev;
+ uint64_t size;
+ uint64_t blksize;
+ uint64_t blocks;
+ uint64_t atime_sec;
+ uint64_t atime_nsec;
+ uint64_t mtime_sec;
+ uint64_t mtime_nsec;
+ uint64_t ctime_sec;
+ uint64_t ctime_nsec;
+ uint64_t btime_sec; /* cf. L9PL_GETATTR_BTIME ("birthtime") */
+ uint64_t btime_nsec;
+ uint64_t gen; /* cf. L9PL_GETATTR_GEN */
+ uint64_t data_version; /* cf. L9PL_GETATTR_DATA_VERSION */
+};
+
+/*
+ * Fields in req->request_mask and reply->valid for Tgetattr, Rgetattr.
+ * Each named attribute is a single bit; BASIC and ALL are unions of
+ * those bits.
+ */
+enum l9pl_getattr_flags {
+ L9PL_GETATTR_MODE = 0x00000001,
+ L9PL_GETATTR_NLINK = 0x00000002,
+ L9PL_GETATTR_UID = 0x00000004,
+ L9PL_GETATTR_GID = 0x00000008,
+ L9PL_GETATTR_RDEV = 0x00000010,
+ L9PL_GETATTR_ATIME = 0x00000020,
+ L9PL_GETATTR_MTIME = 0x00000040,
+ L9PL_GETATTR_CTIME = 0x00000080,
+ L9PL_GETATTR_INO = 0x00000100,
+ L9PL_GETATTR_SIZE = 0x00000200,
+ L9PL_GETATTR_BLOCKS = 0x00000400,
+ /* everything up to and including BLOCKS is BASIC */
+ L9PL_GETATTR_BASIC = L9PL_GETATTR_MODE |
+ L9PL_GETATTR_NLINK |
+ L9PL_GETATTR_UID |
+ L9PL_GETATTR_GID |
+ L9PL_GETATTR_RDEV |
+ L9PL_GETATTR_ATIME |
+ L9PL_GETATTR_MTIME |
+ L9PL_GETATTR_CTIME |
+ L9PL_GETATTR_INO |
+ L9PL_GETATTR_SIZE |
+ L9PL_GETATTR_BLOCKS, /* == 0x000007ff */
+ L9PL_GETATTR_BTIME = 0x00000800,
+ L9PL_GETATTR_GEN = 0x00001000,
+ L9PL_GETATTR_DATA_VERSION = 0x00002000,
+ /* BASIC + birthtime + gen + data-version = ALL */
+ L9PL_GETATTR_ALL = L9PL_GETATTR_BASIC |
+ L9PL_GETATTR_BTIME |
+ L9PL_GETATTR_GEN |
+ L9PL_GETATTR_DATA_VERSION, /* == 0x00003fff */
+};
+
+struct l9p_f_tsetattr { /* union l9p_fcall member: tsetattr */
+ struct l9p_hdr hdr;
+ uint32_t valid; /* enum l9pl_setattr_flags bits */
+ uint32_t mode;
+ uint32_t uid;
+ uint32_t gid;
+ uint64_t size;
+ uint64_t atime_sec; /* if valid & L9PL_SETATTR_ATIME_SET */
+ uint64_t atime_nsec; /* (else use on-server time) */
+ uint64_t mtime_sec; /* if valid & L9PL_SETATTR_MTIME_SET */
+ uint64_t mtime_nsec; /* (else use on-server time) */
+};
+
+/*
+ * Fields in req->valid for Tsetattr.  The two *_SET bits select the
+ * time supplied in the request (atime_sec/nsec, mtime_sec/nsec)
+ * rather than on-server time; see struct l9p_f_tsetattr.
+ */
+enum l9pl_setattr_flags {
+ L9PL_SETATTR_MODE = 0x00000001,
+ L9PL_SETATTR_UID = 0x00000002,
+ L9PL_SETATTR_GID = 0x00000004,
+ L9PL_SETATTR_SIZE = 0x00000008,
+ L9PL_SETATTR_ATIME = 0x00000010,
+ L9PL_SETATTR_MTIME = 0x00000020,
+ L9PL_SETATTR_CTIME = 0x00000040,
+ L9PL_SETATTR_ATIME_SET = 0x00000080, /* use request's atime_sec/nsec */
+ L9PL_SETATTR_MTIME_SET = 0x00000100, /* use request's mtime_sec/nsec */
+};
+
+struct l9p_f_txattrwalk { /* union l9p_fcall member: txattrwalk */
+ struct l9p_hdr hdr;
+ uint32_t newfid;
+ char *name;
+};
+
+struct l9p_f_rxattrwalk { /* union l9p_fcall member: rxattrwalk */
+ struct l9p_hdr hdr;
+ uint64_t size;
+};
+
+struct l9p_f_txattrcreate { /* union l9p_fcall member: txattrcreate */
+ struct l9p_hdr hdr;
+ char *name;
+ uint64_t attr_size;
+ uint32_t flags;
+};
+
+struct l9p_f_tlock { /* union l9p_fcall member: tlock */
+ struct l9p_hdr hdr;
+ uint8_t type; /* from l9pl_lock_type */
+ uint32_t flags; /* from l9pl_lock_flags */
+ uint64_t start;
+ uint64_t length;
+ uint32_t proc_id;
+ char *client_id;
+};
+
+enum l9pl_lock_type { /* values for the type field of l9p_f_tlock / l9p_f_getlock */
+ L9PL_LOCK_TYPE_RDLOCK = 0,
+ L9PL_LOCK_TYPE_WRLOCK = 1,
+ L9PL_LOCK_TYPE_UNLOCK = 2,
+};
+
+enum l9pl_lock_flags { /* bits for l9p_f_tlock.flags (named LOCK_TYPE_*, but these are flags) */
+ L9PL_LOCK_TYPE_BLOCK = 1,
+ L9PL_LOCK_TYPE_RECLAIM = 2,
+};
+
+struct l9p_f_rlock { /* union l9p_fcall member: rlock */
+ struct l9p_hdr hdr;
+ uint8_t status; /* from l9pl_lock_status */
+};
+
+enum l9pl_lock_status { /* values for l9p_f_rlock.status */
+ L9PL_LOCK_SUCCESS = 0,
+ L9PL_LOCK_BLOCKED = 1,
+ L9PL_LOCK_ERROR = 2,
+ L9PL_LOCK_GRACE = 3,
+};
+
+struct l9p_f_getlock { /* union member getlock; presumably both Tgetlock and Rgetlock -- confirm */
+ struct l9p_hdr hdr;
+ uint8_t type; /* from l9pl_lock_type */
+ uint64_t start;
+ uint64_t length;
+ uint32_t proc_id;
+ char *client_id;
+};
+
+struct l9p_f_tlink { /* union l9p_fcall member: tlink; same layout as l9p_f_trename */
+ struct l9p_hdr hdr;
+ uint32_t dfid; /* presumably the directory fid for the new link -- confirm */
+ char *name;
+};
+
+struct l9p_f_trenameat { /* union l9p_fcall member: trenameat */
+ struct l9p_hdr hdr;
+ char *oldname;
+ uint32_t newdirfid;
+ char *newname;
+};
+
+/*
+ * Flags in Tunlinkat (which re-uses f_tlcreate data structure but
+ * with different meaning).  They travel in the flags field of the
+ * tunlinkat member of union l9p_fcall.
+ */
+enum l9p_l_unlinkat_flags {
+ /* not sure if any other AT_* flags are passed through */
+ L9PL_AT_REMOVEDIR = 0x0200,
+};
+
+union l9p_fcall { /* decoded (non-wire) form of any request or reply */
+ struct l9p_hdr hdr; /* every member below begins with this header */
+ struct l9p_f_version version;
+ struct l9p_f_tflush tflush;
+ struct l9p_f_ropen ropen;
+ struct l9p_f_ropen rcreate;
+ struct l9p_f_ropen rattach;
+ struct l9p_f_error error;
+ struct l9p_f_rauth rauth;
+ struct l9p_f_attach tattach;
+ struct l9p_f_attach tauth;
+ struct l9p_f_tcreate tcreate;
+ struct l9p_f_tcreate topen;
+ struct l9p_f_twalk twalk;
+ struct l9p_f_rwalk rwalk;
+ struct l9p_f_twstat twstat;
+ struct l9p_f_rstat rstat;
+ struct l9p_f_rstatfs rstatfs;
+ struct l9p_f_tlcreate tlopen;
+ struct l9p_f_ropen rlopen;
+ struct l9p_f_tlcreate tlcreate;
+ struct l9p_f_ropen rlcreate;
+ struct l9p_f_tsymlink tsymlink;
+ struct l9p_f_ropen rsymlink;
+ struct l9p_f_tmknod tmknod;
+ struct l9p_f_ropen rmknod;
+ struct l9p_f_trename trename;
+ struct l9p_f_rreadlink rreadlink;
+ struct l9p_f_tgetattr tgetattr;
+ struct l9p_f_rgetattr rgetattr;
+ struct l9p_f_tsetattr tsetattr;
+ struct l9p_f_txattrwalk txattrwalk;
+ struct l9p_f_rxattrwalk rxattrwalk;
+ struct l9p_f_txattrcreate txattrcreate;
+ struct l9p_f_tlock tlock;
+ struct l9p_f_rlock rlock;
+ struct l9p_f_getlock getlock;
+ struct l9p_f_tlink tlink;
+ struct l9p_f_tlcreate tmkdir;
+ struct l9p_f_ropen rmkdir;
+ struct l9p_f_trenameat trenameat;
+ struct l9p_f_tlcreate tunlinkat;
+ struct l9p_f_io io; /* request and reply for READ, WRITE and READDIR */
+};
+
+#endif /* LIB9P_FCALL_H */
diff --git a/usr/src/lib/lib9p/common/fid.h b/usr/src/lib/lib9p/common/fid.h
new file mode 100644
index 0000000000..cdfdd7ec93
--- /dev/null
+++ b/usr/src/lib/lib9p/common/fid.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright 2016 Jakub Klama <jceel@FreeBSD.org>
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef LIB9P_FID_H
+#define LIB9P_FID_H
+
+#include <stdbool.h>
+
+/*
+ * Data structure for a fid. All active fids in one session
+ * are stored in a hash table; the hash table provides the
+ * iterator to process them. (See also l9p_connection in lib9p.h.)
+ *
+ * The back-end code has additional data per fid, found via
+ * lo_aux. Currently this is allocated with a separate calloc().
+ *
+ * Most fids represent a file or directory, but a few are special
+ * purpose, including the auth fid from Tauth+Tattach, and the
+ * fids used for extended attributes. We have our own set of
+ * flags here in lo_flags.
+ *
+ * Note that all new fids start as potentially-valid (reserving
+ * their 32-bit fid value), but not actually-valid. If another
+ * (threaded) op is invoked on a not-yet-valid fid, the fid cannot
+ * be used. A fid can also be locked against other threads, in
+ * which case they must wait for it: this happens during create
+ * and open, which on success result in the fid changing from a
+ * directory to a file. (At least, all this applies in principle
+ * -- we're currently single-threaded per connection so the locks
+ * are nop-ed out and the valid bit is mainly just for debug.)
+ *
+ * Fids that are "open" (the underlying file or directory is open)
+ * are marked as well.
+ *
+ * Locking is managed by the front end (request.c); validation
+ * and type-marking can be done by either side as needed.
+ *
+ * Fid types and validity are manipulated by set* and unset*
+ * functions, and tested by is* ops. Note that we only
+ * distinguish between "directory" and "not directory" at this
+ * level, i.e., symlinks and devices are just "not a directory
+ * fid". Also, fids cannot be unset as auth or xattr fids,
+ * nor can an open fid become closed, except by being clunked.
+ * While files should not normally become directories, it IS normal
+ * for directory fids to become file fids due to Twalk operations.
+ *
+ * (These accessor functions are just to leave wiggle room for
+ * different future implementations.)
+ */
+struct l9p_fid {
+ void *lo_aux; /* back-end's additional per-fid data */
+ uint32_t lo_fid; /* the 32-bit fid value this entry reserves */
+ uint32_t lo_flags; /* enum l9p_lo_flags bits; volatile atomic_t when threaded? */
+};
+
+enum l9p_lo_flags { /* bits kept in l9p_fid.lo_flags */
+ L9P_LO_ISAUTH = 0x01, /* l9p_fid_isauth / l9p_fid_setauth */
+ L9P_LO_ISDIR = 0x02, /* l9p_fid_isdir / l9p_fid_setdir / l9p_fid_unsetdir */
+ L9P_LO_ISOPEN = 0x04, /* l9p_fid_isopen / l9p_fid_setopen */
+ L9P_LO_ISVALID = 0x08, /* l9p_fid_isvalid / l9p_fid_setvalid / l9p_fid_unsetvalid */
+ L9P_LO_ISXATTR = 0x10, /* l9p_fid_isxattr / l9p_fid_setxattr */
+};
+
+/* Report whether the L9P_LO_ISAUTH bit is set in fid->lo_flags. */
+static inline bool
+l9p_fid_isauth(struct l9p_fid *fid)
+{
+	uint32_t bit = fid->lo_flags & L9P_LO_ISAUTH;
+
+	return (bit != 0);
+}
+
+/* Turn on the L9P_LO_ISAUTH bit in fid->lo_flags; other bits are kept. */
+static inline void
+l9p_fid_setauth(struct l9p_fid *fid)
+{
+	fid->lo_flags = fid->lo_flags | L9P_LO_ISAUTH;
+}
+
+/* Report whether the L9P_LO_ISDIR bit is set in fid->lo_flags. */
+static inline bool
+l9p_fid_isdir(struct l9p_fid *fid)
+{
+	uint32_t bit = fid->lo_flags & L9P_LO_ISDIR;
+
+	return (bit != 0);
+}
+
+/* Turn on the L9P_LO_ISDIR bit in fid->lo_flags; other bits are kept. */
+static inline void
+l9p_fid_setdir(struct l9p_fid *fid)
+{
+	fid->lo_flags = fid->lo_flags | L9P_LO_ISDIR;
+}
+
+/* Turn off the L9P_LO_ISDIR bit in fid->lo_flags; other bits are kept. */
+static inline void
+l9p_fid_unsetdir(struct l9p_fid *fid)
+{
+	uint32_t keep = ~(uint32_t)L9P_LO_ISDIR;
+
+	fid->lo_flags = fid->lo_flags & keep;
+}
+
+/* Report whether the L9P_LO_ISOPEN bit is set in fid->lo_flags. */
+static inline bool
+l9p_fid_isopen(struct l9p_fid *fid)
+{
+	uint32_t bit = fid->lo_flags & L9P_LO_ISOPEN;
+
+	return (bit != 0);
+}
+
+/* Turn on the L9P_LO_ISOPEN bit in fid->lo_flags; other bits are kept. */
+static inline void
+l9p_fid_setopen(struct l9p_fid *fid)
+{
+	fid->lo_flags = fid->lo_flags | L9P_LO_ISOPEN;
+}
+
+/* Report whether the L9P_LO_ISVALID bit is set in fid->lo_flags. */
+static inline bool
+l9p_fid_isvalid(struct l9p_fid *fid)
+{
+	uint32_t bit = fid->lo_flags & L9P_LO_ISVALID;
+
+	return (bit != 0);
+}
+
+/* Turn on the L9P_LO_ISVALID bit in fid->lo_flags; other bits are kept. */
+static inline void
+l9p_fid_setvalid(struct l9p_fid *fid)
+{
+	fid->lo_flags = fid->lo_flags | L9P_LO_ISVALID;
+}
+
+/* Turn off the L9P_LO_ISVALID bit in fid->lo_flags; other bits are kept. */
+static inline void
+l9p_fid_unsetvalid(struct l9p_fid *fid)
+{
+	uint32_t keep = ~(uint32_t)L9P_LO_ISVALID;
+
+	fid->lo_flags = fid->lo_flags & keep;
+}
+
+/* Report whether the L9P_LO_ISXATTR bit is set in fid->lo_flags. */
+static inline bool
+l9p_fid_isxattr(struct l9p_fid *fid)
+{
+	uint32_t bit = fid->lo_flags & L9P_LO_ISXATTR;
+
+	return (bit != 0);
+}
+
+/* Turn on the L9P_LO_ISXATTR bit in fid->lo_flags; other bits are kept. */
+static inline void
+l9p_fid_setxattr(struct l9p_fid *fid)
+{
+	fid->lo_flags = fid->lo_flags | L9P_LO_ISXATTR;
+}
+
+#endif /* LIB9P_FID_H */
diff --git a/usr/src/lib/lib9p/common/genacl.c b/usr/src/lib/lib9p/common/genacl.c
new file mode 100644
index 0000000000..a7be17ca9b
--- /dev/null
+++ b/usr/src/lib/lib9p/common/genacl.c
@@ -0,0 +1,806 @@
+/*
+ * Copyright 2016 Chris Torek <torek@ixsystems.com>
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/acl.h>
+#include <sys/stat.h>
+
+#ifdef __illumos__
+#include <sys/sysmacros.h>
+#endif
+
+#include "lib9p.h"
+#include "lib9p_impl.h"
+#include "genacl.h"
+#include "fid.h"
+#include "log.h"
+
+#ifndef __illumos__
+typedef int econvertfn(acl_entry_t, struct l9p_ace *);
+#endif
+
+#ifdef __FreeBSD__
+static struct l9p_acl *l9p_new_acl(uint32_t acetype, uint32_t aceasize);
+static struct l9p_acl *l9p_growacl(struct l9p_acl *acl, uint32_t aceasize);
+static int l9p_count_aces(acl_t sysacl);
+static struct l9p_acl *l9p_sysacl_to_acl(int, acl_t, econvertfn *);
+#endif
+static bool l9p_ingroup(gid_t tid, gid_t gid, gid_t *gids, size_t ngids);
+static int l9p_check_aces(int32_t mask, struct l9p_acl *acl, struct stat *st,
+ uid_t uid, gid_t gid, gid_t *gids, size_t ngids);
+
+void
+l9p_acl_free(struct l9p_acl *acl)
+{
+ /* The ACL is released with a single free(); free(NULL) is a no-op. */
+ free(acl);
+}
+
+/*
+ * Group-membership test: does tid (the "test id") equal the primary
+ * group gid, or any of the ngids entries of gids[]?  gids is not
+ * dereferenced when ngids is 0.
+ */
+static bool
+l9p_ingroup(gid_t tid, gid_t gid, gid_t *gids, size_t ngids)
+{
+	const gid_t *cur = gids;
+	size_t left = ngids;
+
+	/* primary group first, then the supplementary list in order */
+	if (gid == tid)
+		return (true);
+
+	while (left-- > 0) {
+		if (*cur++ == tid)
+			return (true);
+	}
+
+	return (false);
+}
+
+/* #define ACE_DEBUG */
+
+/*
+ * Evaluate one ACL against a requested access mask.
+ *
+ * Note that NFSv4 tests are done on a "first match" basis.
+ * That is, we check each ACE sequentially until we run out
+ * of ACEs, or find something explicitly denied (DENIED!),
+ * or have cleared out all our attempt-something bits. Once
+ * we come across an ALLOW entry for the bits we're trying,
+ * we clear those from the bits we're still looking for, in
+ * the order they appear.
+ *
+ * The result is either "definitely allowed" (we cleared
+ * all the bits), "definitely denied" (we hit a deny with
+ * some or all of the bits), or "unspecified". We
+ * represent these three states as +1 (positive = yes = allow),
+ * -1 (negative = no = denied), or 0 (no strong answer).
+ *
+ * For our caller's convenience, if we are called with a
+ * mask of 0, we return 0 (no answer).
+ *
+ * Arguments:
+ *	mask	access bits being requested
+ *	acl	the ACL whose acl_nace entries are scanned in order
+ *	st	stat of the object; supplies st_uid for OWNER@ entries
+ *		and st_gid for GROUP@ entries
+ *	uid	requesting user
+ *	gid, gids, ngids
+ *		requester's primary group and supplementary group list
+ *
+ * ACEs that are neither ALLOWED nor DENIED (audit, alarm) are skipped,
+ * as are identifier ACEs whose ace_idsize differs from sizeof (uid_t).
+ */
+static int
+l9p_check_aces(int32_t mask, struct l9p_acl *acl, struct stat *st,
+    uid_t uid, gid_t gid, gid_t *gids, size_t ngids)
+{
+	uint32_t i;
+	struct l9p_ace *ace;
+#ifdef ACE_DEBUG
+	const char *acetype, *allowdeny;
+	bool show_tid;
+#endif
+	bool match;
+	uid_t tid;
+
+	if (mask == 0)
+		return (0);
+
+	/* stop early once every requested bit has been allowed */
+	for (i = 0; mask != 0 && i < acl->acl_nace; i++) {
+		ace = &acl->acl_aces[i];
+		switch (ace->ace_type) {
+		case L9P_ACET_ACCESS_ALLOWED:
+		case L9P_ACET_ACCESS_DENIED:
+			break;
+		default:
+			/* audit, alarm - ignore */
+			continue;
+		}
+#ifdef ACE_DEBUG
+		show_tid = false;
+#endif
+		/* Work out whether this ACE's "who" covers the requester. */
+		if (ace->ace_flags & L9P_ACEF_OWNER) {
+#ifdef ACE_DEBUG
+			acetype = "OWNER@";
+#endif
+			match = st->st_uid == uid;
+		} else if (ace->ace_flags & L9P_ACEF_GROUP) {
+#ifdef ACE_DEBUG
+			acetype = "GROUP@";
+#endif
+			match = l9p_ingroup(st->st_gid, gid, gids, ngids);
+		} else if (ace->ace_flags & L9P_ACEF_EVERYONE) {
+#ifdef ACE_DEBUG
+			acetype = "EVERYONE@";
+#endif
+			match = true;
+		} else {
+			/* explicit user or group id stored in the ACE */
+			if (ace->ace_idsize != sizeof(tid))
+				continue;
+#ifdef ACE_DEBUG
+			show_tid = true;
+#endif
+			memcpy(&tid, &ace->ace_idbytes, sizeof(tid));
+			if (ace->ace_flags & L9P_ACEF_IDENTIFIER_GROUP) {
+#ifdef ACE_DEBUG
+				acetype = "group";
+#endif
+				match = l9p_ingroup(tid, gid, gids, ngids);
+			} else {
+#ifdef ACE_DEBUG
+				acetype = "user";
+#endif
+				match = tid == uid;
+			}
+		}
+		/*
+		 * If this ACE applies to us, check remaining bits.
+		 * If any of those bits also apply, check the type:
+		 * DENY means "stop now", ALLOW means allow these bits
+		 * and keep checking.
+		 */
+#ifdef ACE_DEBUG
+		allowdeny = ace->ace_type == L9P_ACET_ACCESS_DENIED ?
+		    "deny" : "allow";
+#endif
+		if (match && (ace->ace_mask & (uint32_t)mask) != 0) {
+#ifdef ACE_DEBUG
+			if (show_tid)
+				L9P_LOG(L9P_DEBUG,
+				    "ACE: %s %s %d: mask 0x%x ace_mask 0x%x",
+				    allowdeny, acetype, (int)tid,
+				    (u_int)mask, (u_int)ace->ace_mask);
+			else
+				L9P_LOG(L9P_DEBUG,
+				    "ACE: %s %s: mask 0x%x ace_mask 0x%x",
+				    allowdeny, acetype,
+				    (u_int)mask, (u_int)ace->ace_mask);
+#endif
+			if (ace->ace_type == L9P_ACET_ACCESS_DENIED)
+				return (-1);
+			mask &= ~ace->ace_mask;
+#ifdef ACE_DEBUG
+			L9P_LOG(L9P_DEBUG, "clear 0x%x: now mask=0x%x",
+			    (u_int)ace->ace_mask, (u_int)mask);
+#endif
+		} else {
+#ifdef ACE_DEBUG
+			if (show_tid)
+				L9P_LOG(L9P_DEBUG,
+				    "ACE: SKIP %s %s %d: "
+				    "match %d mask 0x%x ace_mask 0x%x",
+				    allowdeny, acetype, (int)tid,
+				    (int)match, (u_int)mask,
+				    (u_int)ace->ace_mask);
+			else
+				L9P_LOG(L9P_DEBUG,
+				    "ACE: SKIP %s %s: "
+				    "match %d mask 0x%x ace_mask 0x%x",
+				    allowdeny, acetype,
+				    (int)match, (u_int)mask,
+				    (u_int)ace->ace_mask);
+#endif
+		}
+	}
+
+	/* Return 1 if access definitely granted. */
+#ifdef ACE_DEBUG
+	/* cast to match the %x conversion, as at every other log site */
+	L9P_LOG(L9P_DEBUG, "ACE: end of ACEs, mask now 0x%x: %s",
+	    (u_int)mask, mask ? "no-definitive-answer" : "ALLOW");
+#endif
+	return (mask == 0 ? 1 : 0);
+}
+
+/*
+ * Test against ACLs.
+ *
+ * The return value is normally 0 (access allowed) or EPERM
+ * (access denied), so it could just be a boolean....
+ *
+ * For "make new dir in dir" and "remove dir in dir", you must
+ * set the mask to test the directory permissions (not ADD_FILE but
+ * ADD_SUBDIRECTORY, and DELETE_CHILD). For "make new file in dir"
+ * you must set the opmask to test file ADD_FILE.
+ *
+ * The L9P_ACE_DELETE flag means "can delete this thing"; it's not
+ * clear whether it should override the parent directory's ACL if
+ * any. In our case it does not, but a caller may try
+ * L9P_ACE_DELETE_CHILD (separately, on its own) and then a
+ * (second, separate) L9P_ACE_DELETE, to make the permissions work
+ * as "or" instead of "and".
+ *
+ * Pass a NULL parent/pstat if they are not applicable, e.g.,
+ * for doing operations on an existing file, such as reading or
+ * writing data or attributes. Pass in a null child/cstat if
+ * that's not applicable, such as creating a new file/dir.
+ *
+ * NB: it's probably wise to allow the owner of any file to update
+ * the ACLs of that file, but we leave that test to the caller.
+ */
+int l9p_acl_check_access(int32_t opmask, struct l9p_acl_check_args *args)
+{
+ struct l9p_acl *parent, *child;
+ struct stat *pstat, *cstat;
+ int32_t pop, cop;
+ size_t ngids;
+ uid_t uid;
+ gid_t gid, *gids;
+ int panswer, canswer;
+
+ assert(opmask != 0);
+ parent = args->aca_parent;
+ pstat = args->aca_pstat;
+ child = args->aca_child;
+ cstat = args->aca_cstat;
+ uid = args->aca_uid;
+ gid = args->aca_gid;
+ gids = args->aca_groups;
+ ngids = args->aca_ngroups;
+
+#ifdef ACE_DEBUG
+ L9P_LOG(L9P_DEBUG,
+ "l9p_acl_check_access: opmask=0x%x uid=%ld gid=%ld ngids=%zd",
+ (u_int)opmask, (long)uid, (long)gid, ngids);
+#endif
+ /*
+ * If caller said "superuser semantics", check that first.
+ * Note that we apply them regardless of ACLs.
+ */
+ if (uid == 0 && args->aca_superuser)
+ return (0);
+
+ /*
+ * If told to ignore ACLs and use only stat-based permissions,
+ * discard any non-NULL ACL pointers.
+ *
+ * This will need some fancying up when we support POSIX ACLs.
+ */
+ if ((args->aca_aclmode & L9P_ACM_NFS_ACL) == 0)
+ parent = child = NULL;
+
+ assert(parent == NULL || parent->acl_acetype == L9P_ACLTYPE_NFSv4);
+ assert(parent == NULL || pstat != NULL);
+ assert(child == NULL || child->acl_acetype == L9P_ACLTYPE_NFSv4);
+ assert(child == NULL || cstat != NULL);
+ assert(pstat != NULL || cstat != NULL);
+
+ /*
+ * If the operation is UNLINK we should have either both ACLs
+ * or no ACLs, but we won't require that here.
+ *
+ * If a parent ACL is supplied, it's a directory by definition.
+ * Make sure we're allowed to do this there, whatever this is.
+ * If a child ACL is supplied, check it too. Note that the
+ * DELETE permission only applies in the child though, not
+ * in the parent, and the DELETE_CHILD only applies in the
+ * parent.
+ */
+ pop = cop = opmask;
+ if (parent != NULL || pstat != NULL) {
+ /*
+ * Remove child-only bits from parent op and
+ * parent-only bits from child op.
+ *
+ * L9P_ACE_DELETE is child-only.
+ *
+ * L9P_ACE_DELETE_CHILD is parent-only, and three data
+ * access bits overlap with three directory access bits.
+ * We should have child==NULL && cstat==NULL, so the
+ * three data bits should be redundant, but it's
+ * both trivial and safest to remove them anyway.
+ */
+ pop &= ~L9P_ACE_DELETE;
+ cop &= ~(L9P_ACE_DELETE_CHILD | L9P_ACE_LIST_DIRECTORY |
+ L9P_ACE_ADD_FILE | L9P_ACE_ADD_SUBDIRECTORY);
+ } else {
+ /*
+ * Remove child-only bits from parent op. We need
+ * not bother since we just found we have no parent
+ * and no pstat, and hence won't actually *use* pop.
+ *
+ * pop &= ~(L9P_ACE_READ_DATA | L9P_ACE_WRITE_DATA |
+ * L9P_ACE_APPEND_DATA);
+ */
+ }
+ panswer = 0;
+ canswer = 0;
+ if (parent != NULL)
+ panswer = l9p_check_aces(pop, parent, pstat,
+ uid, gid, gids, ngids);
+ if (child != NULL)
+ canswer = l9p_check_aces(cop, child, cstat,
+ uid, gid, gids, ngids);
+
+ if (panswer || canswer) {
+ /*
+ * Got a definitive answer from parent and/or
+ * child ACLs. We're not quite done yet though.
+ */
+ if (opmask == L9P_ACOP_UNLINK) {
+ /*
+ * For UNLINK, we can get an allow from child
+ * and deny from parent, or vice versa. It's
+ * not 100% clear how to handle the two-answer
+ * case. ZFS says that if either says "allow",
+ * we allow, and if both definitely say "deny",
+ * we deny. This makes sense, so we do that
+ * here for all cases, even "strict".
+ */
+ if (panswer > 0 || canswer > 0)
+ return (0);
+ if (panswer < 0 && canswer < 0)
+ return (EPERM);
+ /* non-definitive answer from one! move on */
+ } else {
+ /*
+ * Have at least one definitive answer, and
+ * should have only one; obey whichever
+ * one it is.
+ */
+ if (panswer)
+ return (panswer < 0 ? EPERM : 0);
+ return (canswer < 0 ? EPERM : 0);
+ }
+ }
+
+ /*
+ * No definitive answer from ACLs alone. Check for ZFS style
+ * permissions checking and an "UNLINK" operation under ACLs.
+ * If so, find write-and-execute permission on parent.
+ * Note that WRITE overlaps with ADD_FILE -- that's ZFS's
+ * way of saying "allow write to dir" -- but EXECUTE is
+ * separate from LIST_DIRECTORY, so that's at least a little
+ * bit cleaner.
+ *
+ * Note also that only a definitive yes (both bits are
+ * explicitly allowed) results in granting unlink, and
+ * a definitive no (at least one bit explicitly denied)
+ * results in EPERM. Only "no answer" moves on.
+ */
+ if ((args->aca_aclmode & L9P_ACM_ZFS_ACL) &&
+ opmask == L9P_ACOP_UNLINK && parent != NULL) {
+ panswer = l9p_check_aces(L9P_ACE_ADD_FILE | L9P_ACE_EXECUTE,
+ parent, pstat, uid, gid, gids, ngids);
+ if (panswer)
+ return (panswer < 0 ? EPERM : 0);
+ }
+
+ /*
+ * No definitive answer from ACLs.
+ *
+ * Try POSIX style rwx permissions if allowed. This should
+ * be rare, occurring mainly when caller supplied no ACLs
+ * or set the mode to suppress them.
+ *
+ * The stat to check is the parent's if we don't have a child
+ * (i.e., this is a dir op), or if the DELETE_CHILD bit is set
+ * (i.e., this is an unlink or similar). Otherwise it's the
+ * child's.
+ */
+ if (args->aca_aclmode & L9P_ACM_STAT_MODE) {
+ struct stat *st;
+ int rwx, bits;
+
+ rwx = l9p_ace_mask_to_rwx(opmask);
+ if ((st = cstat) == NULL || (opmask & L9P_ACE_DELETE_CHILD))
+ st = pstat;
+ if (uid == st->st_uid)
+ bits = (st->st_mode >> 6) & 7;
+ else if (l9p_ingroup(st->st_gid, gid, gids, ngids))
+ bits = (st->st_mode >> 3) & 7;
+ else
+ bits = st->st_mode & 7;
+ /*
+ * If all the desired bits are set, we're OK.
+ */
+ if ((rwx & bits) == rwx)
+ return (0);
+ }
+
+ /* all methods have failed, return EPERM */
+ return (EPERM);
+}
+
+/*
+ * Reduce a detailed L9P_ACE_* operation mask to classic Unix
+ * permission bits: 4 (read), 2 (write) and 1 (execute), OR-ed
+ * together into a value in the range 0..7.
+ *
+ * Directory operations have no exact rwx equivalent, but listing
+ * a directory needs read permission and adding or deleting entries
+ * needs write permission, so the directory bits are folded into
+ * the read and write groups below.
+ */
+int
+l9p_ace_mask_to_rwx(int32_t opmask)
+{
+	/* any of these requested => caller needs "r" */
+	const int32_t rbits =
+	    L9P_ACE_READ_DATA | L9P_ACE_READ_NAMED_ATTRS |
+	    L9P_ACE_READ_ATTRIBUTES | L9P_ACE_READ_ACL;
+	/* any of these requested => caller needs "w" */
+	const int32_t wbits =
+	    L9P_ACE_WRITE_DATA | L9P_ACE_APPEND_DATA |
+	    L9P_ACE_ADD_FILE | L9P_ACE_ADD_SUBDIRECTORY |
+	    L9P_ACE_DELETE | L9P_ACE_DELETE_CHILD |
+	    L9P_ACE_WRITE_NAMED_ATTRS | L9P_ACE_WRITE_ATTRIBUTES |
+	    L9P_ACE_WRITE_ACL;
+	int r, w, x;
+
+	r = (opmask & rbits) != 0 ? 4 : 0;
+	w = (opmask & wbits) != 0 ? 2 : 0;
+	x = (opmask & L9P_ACE_EXECUTE) != 0 ? 1 : 0;
+
+	return (r | w | x);
+}
+
+#if defined(__FreeBSD__) || defined(__illumos__)
+/*
+ * Allocate an ACL header together with room for aceasize ACEs.
+ *
+ * The new ACL records the given acetype and capacity and starts
+ * with zero occupied entries.  The ACE slots themselves are left
+ * uninitialized.  Returns NULL if malloc() fails.
+ */
+static struct l9p_acl *
+l9p_new_acl(uint32_t acetype, uint32_t aceasize)
+{
+	struct l9p_acl *acl;
+	size_t nbytes;
+
+	nbytes = sizeof(struct l9p_acl) + aceasize * sizeof(struct l9p_ace);
+	if ((acl = malloc(nbytes)) == NULL)
+		return (NULL);
+
+	acl->acl_acetype = acetype;
+	acl->acl_nace = 0;
+	acl->acl_aceasize = aceasize;
+	return (acl);
+}
+#endif
+
+#ifdef __FreeBSD__
+/*
+ * Expand ACL to accommodate more entries.
+ *
+ * Never shrinks: if the ACL already has room for aceasize entries
+ * it is returned unchanged.  Otherwise it is reallocated and the
+ * recorded capacity (acl_aceasize) is updated, so that later calls
+ * asking for no more than that capacity do not reallocate again.
+ * Callers adding many entries should still grow in big chunks, or
+ * the total cost will be O(n**2).
+ *
+ * Returns the (possibly moved) ACL.  If reallocation fails the
+ * original ACL is freed and NULL is returned, so the caller must
+ * not touch the pointer it passed in.
+ */
+static struct l9p_acl *
+l9p_growacl(struct l9p_acl *acl, uint32_t aceasize)
+{
+	struct l9p_acl *tmp;
+	size_t asize, size;
+
+	if (acl->acl_aceasize >= aceasize)
+		return (acl);
+
+	asize = aceasize * sizeof(struct l9p_ace);
+	size = sizeof(struct l9p_acl) + asize;
+	tmp = realloc(acl, size);
+	if (tmp == NULL) {
+		free(acl);
+		return (NULL);
+	}
+
+	/* remember the new capacity; previously this was left stale */
+	tmp->acl_aceasize = aceasize;
+	return (tmp);
+}
+
+/*
+ * Count the entries in a system ACL.
+ *
+ * There is no POSIX-standard call that reports the entry count,
+ * so we simply walk the ACL once with acl_get_entry().  That makes
+ * conversion a two-pass affair, but 2n is still O(n) and ACLs are
+ * short.  (Should the ACL grow between passes, the conversion loop
+ * copes via l9p_growacl().)
+ */
+static int
+l9p_count_aces(acl_t sysacl)
+{
+	acl_entry_t entry;
+	uint32_t count = 0;
+	int which = ACL_FIRST_ENTRY;
+
+	while (acl_get_entry(sysacl, which, &entry) == 1) {
+		count++;
+		which = ACL_NEXT_ENTRY;
+	}
+
+	return ((int)count);
+}
+
+/*
+ * Build an l9p_acl of the given acetype from a system acl_t.
+ *
+ * Every system entry is handed to the convert callback together
+ * with the next free l9p_ace slot.  Entries for which the callback
+ * reports an error are skipped (their slot is reused for the next
+ * entry).  Returns NULL if the ACL cannot be allocated or grown.
+ */
+static struct l9p_acl *
+l9p_sysacl_to_acl(int acetype, acl_t sysacl, econvertfn *convert)
+{
+	struct l9p_acl *acl;
+	acl_entry_t entry;
+	uint32_t n;
+	int id;
+
+	acl = l9p_new_acl((uint32_t)acetype, (uint32_t)l9p_count_aces(sysacl));
+	if (acl == NULL)
+		return (NULL);
+
+	n = 0;
+	id = ACL_FIRST_ENTRY;
+	while (acl_get_entry(sysacl, id, &entry) == 1) {
+		id = ACL_NEXT_ENTRY;
+
+		/* make sure slot n exists before converting into it */
+		if ((acl = l9p_growacl(acl, n + 1)) == NULL)
+			return (NULL);
+
+		/* only a successful conversion consumes the slot */
+		if ((*convert)(entry, &acl->acl_aces[n]) == 0)
+			n++;
+	}
+
+	acl->acl_nace = n;
+	return (acl);
+}
+#endif
+
+#if defined(HAVE_POSIX_ACLS) && 0 /* not yet: POSIX ACL conversion is unimplemented; the "&& 0" keeps this empty stub out of the build */
+struct l9p_acl *
+l9p_posix_acl_to_acl(acl_t sysacl)
+{
+}
+#endif
+
+#if defined(HAVE_FREEBSD_ACLS)
+/*
+ * Convert one FreeBSD NFSv4 ACL entry into an l9p_ace.
+ *
+ * sysace is the system entry to read; ace is the caller-supplied
+ * slot that receives the entry type, flags, permission mask and,
+ * for ACL_USER / ACL_GROUP tags, the numeric id.
+ *
+ * Returns 0 on success.  If one of the acl_get_*() accessors fails,
+ * its return value is passed back unchanged.  ENOMEM is returned
+ * when the qualifier cannot be obtained, and EINVAL for an
+ * unrecognized tag or entry type.  Every error return happens
+ * before anything is stored through ace, so *ace is unmodified on
+ * failure.
+ */
+static int
+l9p_frombsdnfs4(acl_entry_t sysace, struct l9p_ace *ace)
+{
+	acl_tag_t tag;			/* e.g., USER_OBJ, GROUP, etc */
+	acl_entry_type_t entry_type;	/* e.g., allow/deny */
+	acl_permset_t absdperm;
+	acl_flagset_t absdflag;
+	acl_perm_t bsdperm;		/* e.g., READ_DATA */
+	acl_flag_t bsdflag;		/* e.g., FILE_INHERIT_ACE */
+	uint32_t flags, mask;
+	int error;
+	uid_t uid, *aid;
+
+	/* fetch all four components; stop at the first failure */
+	error = acl_get_tag_type(sysace, &tag);
+	if (error == 0)
+		error = acl_get_entry_type_np(sysace, &entry_type);
+	if (error == 0)
+		error = acl_get_flagset_np(sysace, &absdflag);
+	if (error == 0)
+		error = acl_get_permset(sysace, &absdperm);
+	if (error)
+		return (error);
+
+	flags = 0;
+	uid = 0;
+	aid = NULL;
+
+	/* move user/group/everyone + id-is-group-id into flags */
+	switch (tag) {
+	case ACL_USER_OBJ:
+		flags |= L9P_ACEF_OWNER;
+		break;
+	case ACL_GROUP_OBJ:
+		flags |= L9P_ACEF_GROUP;
+		break;
+	case ACL_EVERYONE:
+		flags |= L9P_ACEF_EVERYONE;
+		break;
+	case ACL_GROUP:
+		flags |= L9P_ACEF_IDENTIFIER_GROUP;
+		/* FALLTHROUGH */
+	case ACL_USER:
+		/*
+		 * The qualifier comes back in allocated storage.  Copy
+		 * the id into our local, release the storage with
+		 * acl_free() as the ACL API requires, and from here on
+		 * let aid point at the local copy.
+		 */
+		aid = acl_get_qualifier(sysace);
+		if (aid == NULL)
+			return (ENOMEM);
+		uid = *(uid_t *)aid;
+		(void) acl_free(aid);
+		aid = &uid;
+		break;
+	default:
+		return (EINVAL);	/* can't happen */
+	}
+
+	switch (entry_type) {
+
+	case ACL_ENTRY_TYPE_ALLOW:
+		ace->ace_type = L9P_ACET_ACCESS_ALLOWED;
+		break;
+
+	case ACL_ENTRY_TYPE_DENY:
+		ace->ace_type = L9P_ACET_ACCESS_DENIED;
+		break;
+
+	case ACL_ENTRY_TYPE_AUDIT:
+		ace->ace_type = L9P_ACET_SYSTEM_AUDIT;
+		break;
+
+	case ACL_ENTRY_TYPE_ALARM:
+		ace->ace_type = L9P_ACET_SYSTEM_ALARM;
+		break;
+
+	default:
+		return (EINVAL);	/* can't happen */
+	}
+
+	/* transform remaining BSD flags to internal NFS-y form */
+	bsdflag = *absdflag;
+	if (bsdflag & ACL_ENTRY_FILE_INHERIT)
+		flags |= L9P_ACEF_FILE_INHERIT_ACE;
+	if (bsdflag & ACL_ENTRY_DIRECTORY_INHERIT)
+		flags |= L9P_ACEF_DIRECTORY_INHERIT_ACE;
+	if (bsdflag & ACL_ENTRY_NO_PROPAGATE_INHERIT)
+		flags |= L9P_ACEF_NO_PROPAGATE_INHERIT_ACE;
+	if (bsdflag & ACL_ENTRY_INHERIT_ONLY)
+		flags |= L9P_ACEF_INHERIT_ONLY_ACE;
+	if (bsdflag & ACL_ENTRY_SUCCESSFUL_ACCESS)
+		flags |= L9P_ACEF_SUCCESSFUL_ACCESS_ACE_FLAG;
+	if (bsdflag & ACL_ENTRY_FAILED_ACCESS)
+		flags |= L9P_ACEF_FAILED_ACCESS_ACE_FLAG;
+	ace->ace_flags = flags;
+
+	/*
+	 * Transform BSD permissions to ace_mask.  Note that directory
+	 * vs file bits are the same in both sets, so we don't need
+	 * to worry about that, at least.
+	 *
+	 * There seem to be no BSD equivalents for WRITE_RETENTION
+	 * and WRITE_RETENTION_HOLD.
+	 */
+	mask = 0;
+	bsdperm = *absdperm;
+	if (bsdperm & ACL_READ_DATA)
+		mask |= L9P_ACE_READ_DATA;
+	if (bsdperm & ACL_WRITE_DATA)
+		mask |= L9P_ACE_WRITE_DATA;
+	if (bsdperm & ACL_APPEND_DATA)
+		mask |= L9P_ACE_APPEND_DATA;
+	if (bsdperm & ACL_READ_NAMED_ATTRS)
+		mask |= L9P_ACE_READ_NAMED_ATTRS;
+	if (bsdperm & ACL_WRITE_NAMED_ATTRS)
+		mask |= L9P_ACE_WRITE_NAMED_ATTRS;
+	if (bsdperm & ACL_EXECUTE)
+		mask |= L9P_ACE_EXECUTE;
+	if (bsdperm & ACL_DELETE_CHILD)
+		mask |= L9P_ACE_DELETE_CHILD;
+	if (bsdperm & ACL_READ_ATTRIBUTES)
+		mask |= L9P_ACE_READ_ATTRIBUTES;
+	if (bsdperm & ACL_WRITE_ATTRIBUTES)
+		mask |= L9P_ACE_WRITE_ATTRIBUTES;
+	/* L9P_ACE_WRITE_RETENTION */
+	/* L9P_ACE_WRITE_RETENTION_HOLD */
+	/* 0x00800 */
+	if (bsdperm & ACL_DELETE)
+		mask |= L9P_ACE_DELETE;
+	if (bsdperm & ACL_READ_ACL)
+		mask |= L9P_ACE_READ_ACL;
+	if (bsdperm & ACL_WRITE_ACL)
+		mask |= L9P_ACE_WRITE_ACL;
+	if (bsdperm & ACL_WRITE_OWNER)
+		mask |= L9P_ACE_WRITE_OWNER;
+	if (bsdperm & ACL_SYNCHRONIZE)
+		mask |= L9P_ACE_SYNCHRONIZE;
+	ace->ace_mask = mask;
+
+	/* fill in variable-size user or group ID bytes */
+	if (aid == NULL)
+		ace->ace_idsize = 0;
+	else {
+		ace->ace_idsize = sizeof(uid);
+		memcpy(&ace->ace_idbytes[0], aid, sizeof(uid));
+	}
+
+	return (0);
+}
+
+/*
+ * Public FreeBSD entry point: convert a system NFSv4 acl_t into an
+ * l9p_acl by running l9p_frombsdnfs4() over each of its entries.
+ * Returns whatever l9p_sysacl_to_acl() returns (NULL on failure).
+ */
+struct l9p_acl *
+l9p_freebsd_nfsv4acl_to_acl(acl_t sysacl)
+{
+	struct l9p_acl *acl;
+
+	acl = l9p_sysacl_to_acl(L9P_ACLTYPE_NFSv4, sysacl, l9p_frombsdnfs4);
+	return (acl);
+}
+#endif
+
+#if defined(HAVE_DARWIN_ACLS) && 0 /* not yet: Darwin ACL conversion is unimplemented; the "&& 0" keeps this empty stub out of the build */
+struct l9p_acl *
+l9p_darwin_nfsv4acl_to_acl(acl_t sysacl)
+{
+}
+#endif
+
+#if defined(HAVE__ILLUMOS_ACLS)
+
+/*
+ * Translation table from illumos ace_t a_flags bits (ace_flag) to
+ * the corresponding L9P_ACEF_* bits (l9_flag).  The converter walks
+ * the whole table for every entry and ORs in l9_flag for each
+ * ace_flag that is set, so the order of the rows does not matter.
+ */
+static struct {
+	uint16_t	ace_flag;
+	uint32_t	l9_flag;
+} ace_flag_tbl[] = {
+	{ ACE_FILE_INHERIT_ACE,		L9P_ACEF_FILE_INHERIT_ACE },
+	{ ACE_DIRECTORY_INHERIT_ACE,	L9P_ACEF_DIRECTORY_INHERIT_ACE },
+	{ ACE_NO_PROPAGATE_INHERIT_ACE,	L9P_ACEF_NO_PROPAGATE_INHERIT_ACE },
+	{ ACE_INHERIT_ONLY_ACE,		L9P_ACEF_INHERIT_ONLY_ACE },
+	{ ACE_SUCCESSFUL_ACCESS_ACE_FLAG,
+	    L9P_ACEF_SUCCESSFUL_ACCESS_ACE_FLAG },
+	/* previously missing: audit/alarm "failed access" flag */
+	{ ACE_FAILED_ACCESS_ACE_FLAG,	L9P_ACEF_FAILED_ACCESS_ACE_FLAG },
+	{ ACE_IDENTIFIER_GROUP,		L9P_ACEF_IDENTIFIER_GROUP },
+	/* There doesn't appear to be an equivalent for ACE_INHERITED_ACE */
+	{ ACE_OWNER,			L9P_ACEF_OWNER },
+	{ ACE_GROUP,			L9P_ACEF_GROUP },
+	{ ACE_EVERYONE,			L9P_ACEF_EVERYONE }
+};
+
+/*
+ * Convert an illumos NFSv4-style (ACE_T) acl_t into a newly
+ * allocated l9p_acl.
+ *
+ * Returns NULL if sysacl is not an ACE_T ACL, if the l9p_acl cannot
+ * be allocated, or if any entry carries an unrecognized a_type (in
+ * which case the partially built ACL is released first).
+ *
+ * On success each of the acl_cnt system entries has been converted
+ * into the matching slot of acl_aces[], and acl_nace is set to that
+ * count so that consumers of the ACL actually see the entries.
+ */
+struct l9p_acl *
+l9p_illumos_nfsv4acl_to_acl(acl_t *sysacl)
+{
+	struct l9p_acl *l9acl;
+	struct l9p_ace *l9ace;
+	ace_t *ent;
+	int i, j;
+
+	/* We only support NFSv4 ACLs.. so don't try this on UFS */
+	if (sysacl->acl_type != ACE_T)
+		return (NULL);
+
+	l9acl = l9p_new_acl(L9P_ACLTYPE_NFSv4, sysacl->acl_cnt);
+	if (l9acl == NULL)
+		return (NULL);
+
+	ent = sysacl->acl_aclp;
+	l9ace = l9acl->acl_aces;
+	for (i = 0; i < sysacl->acl_cnt; i++, ent++, l9ace++) {
+		switch (ent->a_type) {
+		case ACE_ACCESS_ALLOWED_ACE_TYPE:
+			l9ace->ace_type = L9P_ACET_ACCESS_ALLOWED;
+			break;
+		case ACE_ACCESS_DENIED_ACE_TYPE:
+			l9ace->ace_type = L9P_ACET_ACCESS_DENIED;
+			break;
+		case ACE_SYSTEM_AUDIT_ACE_TYPE:
+			l9ace->ace_type = L9P_ACET_SYSTEM_AUDIT;
+			break;
+		case ACE_SYSTEM_ALARM_ACE_TYPE:
+			l9ace->ace_type = L9P_ACET_SYSTEM_ALARM;
+			break;
+		default:
+			L9P_LOG(L9P_ERROR, "invalid ACL type");
+			l9p_acl_free(l9acl);
+			return (NULL);
+		}
+
+		/* translate each set a_flags bit via the lookup table */
+		l9ace->ace_flags = 0;
+		for (j = 0; j < ARRAY_SIZE(ace_flag_tbl); j++) {
+			if ((ent->a_flags & ace_flag_tbl[j].ace_flag) != 0)
+				l9ace->ace_flags |= ace_flag_tbl[j].l9_flag;
+		}
+
+		/*
+		 * In a bit of good fortune, the bit values for ace_t masks
+		 * and l9p masks are the same (l9p does have WRITE_RETENTION
+		 * and WRITE_RETENTION_HOLD which aren't used -- we're also
+		 * going ace_t->l9p so they don't matter in this context).
+		 */
+		l9ace->ace_mask = ent->a_access_mask;
+
+		/*
+		 * Store the id in THIS entry's id bytes.  (Copying to
+		 * l9acl->acl_aces, as was done before, overwrote the
+		 * type and flags of entry 0 on every iteration and left
+		 * every ace_idbytes unset.)
+		 */
+		l9ace->ace_idsize = sizeof (ent->a_who);
+		memcpy(l9ace->ace_idbytes, &ent->a_who, sizeof (ent->a_who));
+	}
+
+	/* l9p_new_acl() starts at zero occupied entries; record ours */
+	l9acl->acl_nace = sysacl->acl_cnt;
+
+	return (l9acl);
+}
+#endif
diff --git a/usr/src/lib/lib9p/common/genacl.h b/usr/src/lib/lib9p/common/genacl.h
new file mode 100644
index 0000000000..d74b543c19
--- /dev/null
+++ b/usr/src/lib/lib9p/common/genacl.h
@@ -0,0 +1,316 @@
+/*
+ * Copyright 2016 Chris Torek <torek@ixsystems.com>
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * General ACL support for 9P2000.L.
+ *
+ * We mostly use Linux's xattr name space and nfs4 ACL bits, as
+ * these are the most general forms available.
+ *
+ * Linux requests attributes named
+ *
+ * "system.posix_acl_default"
+ * "system.posix_acl_access"
+ *
+ * to get POSIX style ACLs, and:
+ *
+ * "system.nfs4_acl"
+ *
+ * to get NFSv4 style ACLs. The v9fs client does not explicitly
+ * ask for the latter, but if you use the Ubuntu nfs4-acl-tools
+ * package, it should be able to read and write these.
+ *
+ * For the record, the Linux kernel source code also shows:
+ *
+ * - Lustre uses "trusted.*", with "*" matching "lov", "lma",
+ * "lmv", "dmv", "link", "fid", "version", "som", "hsm", and
+ * "lfsck_namespace".
+ *
+ * - ceph has a name tree of the form "ceph.<type>.<name>" with
+ * <type,name> pairs like <"dir","entries">, <"dir","files">,
+ * <"file","layout">, and so on.
+ *
+ * - ext4 uses the POSIX names, plus some special ext4-specific
+ * goop that might not get externalized.
+ *
+ * - NFS uses both the POSIX names and the NFSv4 ACLs. However,
+ * what it mainly does is have nfsd generate fake NFSv4 ACLs
+ * from POSIX ACLs. If you run an NFS client, the client
+ * relies on the server actually implementing the ACLs, and
+ * lets nfs4-acl-tools read and write the system.nfs4_acl xattr
+ * data. If you run an NFS server off, e.g., an ext4 file system,
+ * the server looks for the system.nfs4_acl xattr, serves that
+ * out if found, and otherwise just generates the fakes.
+ *
+ * - "security.*" and "selinux.*" are reserved.
+ *
+ * - "security.capability" is the name for capabilities.
+ *
+ * - sockets use "system.sockprotoname".
+ */
+
+#if defined(__APPLE__)
+ #define HAVE_POSIX_ACLS
+ #define HAVE_DARWIN_ACLS
+#endif
+
+#if defined(__FreeBSD__)
+ #define HAVE_POSIX_ACLS
+ #define HAVE_FREEBSD_ACLS
+#endif
+
+#if defined (__illumos__)
+ #define HAVE_POSIX_ACLS
+ #define HAVE__ILLUMOS_ACLS
+#endif
+
+#include <sys/types.h>
+#include <sys/acl.h> /* XXX assumes existence of sys/acl.h */
+
+/*
+ * An ACL consists of a number of ACEs that grant some kind of
+ * "allow" or "deny" to some specific entity.
+ *
+ * The number of ACEs is potentially unlimited, although in practice
+ * they tend not to be that long.
+ *
+ * It's the responsibility of the back-end to supply the ACL
+ * for each test. However, the ACL may be in some sort of
+ * system-specific form. It's the responsibility of some
+ * (system-specific) code to translate it to *this* form, after
+ * which the backend may use l9p_acl_check_access() to get
+ * access granted or denied (and, eventually, audits and alarms
+ * recorded and raised, although that's yet to be designed).
+ *
+ * The reason for all this faffing-about with formats is so that
+ * we can *report* the ACLs using Linux 9p style xattrs.
+ */
+
+struct l9p_acl;
+struct l9p_fid;
+
+void l9p_acl_free(struct l9p_acl *);
+
+/*
+ * An ACL is made up of ACEs.
+ *
+ * Each ACE has:
+ *
+ * - a type: allow, deny, audit, alarm
+ * - a set of flags
+ * - permissions bits: a "mask"
+ * - an optional, nominally-variable-length identity
+ *
+ * The last part is especially tricky and currently has limited
+ * support here: it's always a 16 byte field on Darwin, and just
+ * a uint32_t on BSD (should be larger, really). Linux supports
+ * very large, actually-variable-size values; we'll deal with
+ * this later, maybe.
+ *
+ * We will define the mask first, below, since these are also the bits
+ * passed in for the accmask argument to l9p_acl_check_access().
+ */
+
+/*
+ * ACL entry mask, and accmask argument flags.
+ *
+ * NB: not every bit is implemented, but they are all here because
+ * they are all defined as part of an NFSv4 ACL entry, which is
+ * more or less a superset of a POSIX ACL entry. This means you
+ * can put a complete NFSv4 ACL in and we can reproduce it.
+ *
+ * Note that the LIST_DIRECTORY, ADD_FILE, and ADD_SUBDIRECTORY bits
+ * apply only to a directory, while the READ_DATA, WRITE_DATA, and
+ * APPEND_DATA bits apply only to a file. See aca_parent/aca_child
+ * below.
+ *
+ * The illumos converter in genacl.c copies a system ace_t access
+ * mask into ace_mask unchanged, so these values must stay equal to
+ * the corresponding illumos access-mask bits.
+ */
+#define L9P_ACE_READ_DATA 0x00001
+#define L9P_ACE_LIST_DIRECTORY 0x00001 /* same as READ_DATA */
+#define L9P_ACE_WRITE_DATA 0x00002
+#define L9P_ACE_ADD_FILE 0x00002 /* same as WRITE_DATA */
+#define L9P_ACE_APPEND_DATA 0x00004
+#define L9P_ACE_ADD_SUBDIRECTORY 0x00004 /* same as APPEND_DATA */
+#define L9P_ACE_READ_NAMED_ATTRS 0x00008
+#define L9P_ACE_WRITE_NAMED_ATTRS 0x00010
+#define L9P_ACE_EXECUTE 0x00020
+#define L9P_ACE_DELETE_CHILD 0x00040
+#define L9P_ACE_READ_ATTRIBUTES 0x00080
+#define L9P_ACE_WRITE_ATTRIBUTES 0x00100
+#define L9P_ACE_WRITE_RETENTION 0x00200 /* not used here */
+#define L9P_ACE_WRITE_RETENTION_HOLD 0x00400 /* not used here */
+/* 0x00800 unused? */
+#define L9P_ACE_DELETE 0x01000
+#define L9P_ACE_READ_ACL 0x02000
+#define L9P_ACE_WRITE_ACL 0x04000
+#define L9P_ACE_WRITE_OWNER 0x08000
+#define L9P_ACE_SYNCHRONIZE 0x10000 /* not used here */
+
+/*
+ * This is not an ACE bit, but is used with the access checking
+ * below. It represents a request to unlink (delete child /
+ * delete) an entity, and is equivalent to asking for *either*
+ * (not both) permission.
+ */
+#define L9P_ACOP_UNLINK (L9P_ACE_DELETE_CHILD | L9P_ACE_DELETE)
+
+/*
+ * Access checking takes a lot of arguments, so they are
+ * collected into a "struct" here.
+ *
+ * The aca_parent and aca_pstat fields may/must be NULL if the
+ * operation itself does not involve "directory" permissions.
+ * The aca_child and aca_cstat fields may/must be NULL if the
+ * operation does not involve anything *but* a directory. This
+ * is how we decide whether you're interested in L9P_ACE_READ_DATA
+ * vs L9P_ACE_LIST_DIRECTORY, for instance.
+ *
+ * Note that it's OK for both parent and child to be directories
+ * (as is the case when we're adding or deleting a subdirectory).
+ */
+struct l9p_acl_check_args {
+ uid_t aca_uid; /* the uid that is requesting access */
+ gid_t aca_gid; /* the gid that is requesting access */
+ gid_t *aca_groups; /* the additional group-set, if any */
+ size_t aca_ngroups; /* number of groups in group-set */
+ struct l9p_acl *aca_parent; /* ACLs associated with parent/dir */
+ struct stat *aca_pstat; /* stat data for parent/dir */
+ struct l9p_acl *aca_child; /* ACLs associated with file */
+ struct stat *aca_cstat; /* stat data for file */
+ int aca_aclmode; /* mode checking bits (L9P_ACM_*), see below */
+ bool aca_superuser; /* always allow uid==0 in STAT_MODE */
+};
+
+/*
+ * Access checking mode bits in aca_aclmode. If you enable
+ * ACLs, they are used first, optionally with ZFS style ACLs.
+ * This means that even if aca_superuser is set, if an ACL denies
+ * permission to uid 0, permission is really denied.
+ *
+ * NFS style ACLs run before POSIX style ACLs (though POSIX
+ * ACLs aren't done yet anyway).
+ *
+ * N.B.: you probably want L9P_ACM_ZFS_ACL, especially when operating
+ * with a ZFS file system on FreeBSD.
+ */
+#define L9P_ACM_NFS_ACL 0x0001 /* enable NFS ACL checking */
+#define L9P_ACM_ZFS_ACL 0x0002 /* use ZFS ACL unlink semantics */
+#define L9P_ACM_POSIX_ACL 0x0004 /* enable POSIX ACL checking (notyet) */
+#define L9P_ACM_STAT_MODE 0x0008 /* enable st_mode bits */
+
+/*
+ * Requests to access some file or directory must provide:
+ *
+ * - An operation. This should usually be just one bit from the
+ * L9P_ACE_* bit-sets above, or our special L9P_ACOP_UNLINK.
+ * For a few file-open operations it may be multiple bits,
+ * e.g., both read and write data.
+ * - The identity of the accessor: uid + gid + gid-set.
+ * - The type of access desired: this may be multiple bits.
+ * - The parent directory, if applicable.
+ * - The child file/dir being accessed, if applicable.
+ * - stat data for parent and/or child, if applicable.
+ *
+ * The ACLs and/or stat data of the parent and/or child get used
+ * here, so the caller must provide them. We should have a way to
+ * cache these on fids, but not yet. The parent and child
+ * arguments are a bit tricky; see the code in genacl.c.
+ */
+int l9p_acl_check_access(int32_t op, struct l9p_acl_check_args *args);
+
+/*
+ * When falling back to POSIX ACL or Unix-style permissions
+ * testing, it's nice to collapse the above detailed permissions
+ * into simple read/write/execute bits (value 0..7). We provide
+ * a small utility function that does this.
+ */
+int l9p_ace_mask_to_rwx(int32_t);
+
+/*
+ * The rest of the data in an ACE.
+ */
+
+/* type in ace_type */
+#define L9P_ACET_ACCESS_ALLOWED 0
+#define L9P_ACET_ACCESS_DENIED 1
+#define L9P_ACET_SYSTEM_AUDIT 2
+#define L9P_ACET_SYSTEM_ALARM 3
+
+/* flags in ace_flags */
+#define L9P_ACEF_FILE_INHERIT_ACE 0x001
+#define L9P_ACEF_DIRECTORY_INHERIT_ACE 0x002
+#define L9P_ACEF_NO_PROPAGATE_INHERIT_ACE 0x004
+#define L9P_ACEF_INHERIT_ONLY_ACE 0x008
+#define L9P_ACEF_SUCCESSFUL_ACCESS_ACE_FLAG 0x010
+#define L9P_ACEF_FAILED_ACCESS_ACE_FLAG 0x020
+#define L9P_ACEF_IDENTIFIER_GROUP 0x040
+#define L9P_ACEF_OWNER 0x080
+#define L9P_ACEF_GROUP 0x100
+#define L9P_ACEF_EVERYONE 0x200
+
+#if defined(__APPLE__)
+# define L9P_ACE_IDSIZE 16 /* but, how do we map Darwin uuid? */
+#else
+# define L9P_ACE_IDSIZE 4
+#endif
+
+struct l9p_ace {
+ uint16_t ace_type; /* ACL entry type: one of L9P_ACET_* */
+ uint16_t ace_flags; /* ACL entry flags: L9P_ACEF_* bits */
+ uint32_t ace_mask; /* ACL entry mask: L9P_ACE_* bits */
+ uint32_t ace_idsize; /* length of ace_idbytes in use; the FreeBSD converter stores 0 when there is no id */
+ unsigned char ace_idbytes[L9P_ACE_IDSIZE]; /* raw id bytes; the converters memcpy a uid/gid value here */
+};
+
+#define L9P_ACLTYPE_NFSv4 1 /* currently the only valid type */
+struct l9p_acl {
+ uint32_t acl_acetype; /* reserved for future expansion; converters store L9P_ACLTYPE_NFSv4 */
+ uint32_t acl_nace; /* number of occupied ACEs */
+ uint32_t acl_aceasize; /* actual size of ACE array, counted in ACE slots (not bytes) */
+ struct l9p_ace acl_aces[]; /* variable length ACE array */
+};
+
+/*
+ * These are the system-specific converters.
+ *
+ * Right now the backend needs to just find BSD NFSv4 ACLs
+ * and convert them before each operation that needs to be
+ * tested.
+ */
+#if defined(HAVE_DARWIN_ACLS)
+struct l9p_acl *l9p_darwin_nfsv4acl_to_acl(acl_t acl);
+#endif
+
+#if defined(HAVE_FREEBSD_ACLS)
+struct l9p_acl *l9p_freebsd_nfsv4acl_to_acl(acl_t acl);
+#endif
+
+#if defined(HAVE__ILLUMOS_ACLS)
+struct l9p_acl *l9p_illumos_nfsv4acl_to_acl(acl_t *acl);
+#endif
+
+#if defined(HAVE_POSIX_ACLS) && 0 /* not yet */
+struct l9p_acl *l9p_posix_acl_to_acl(acl_t acl);
+#endif
diff --git a/usr/src/lib/lib9p/common/hashtable.c b/usr/src/lib/lib9p/common/hashtable.c
new file mode 100644
index 0000000000..70db6bcc0e
--- /dev/null
+++ b/usr/src/lib/lib9p/common/hashtable.c
@@ -0,0 +1,276 @@
+/*
+ * Copyright 2016 Jakub Klama <jceel@FreeBSD.org>
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/queue.h>
+#include "lib9p_impl.h"
+#include "hashtable.h"
+
+static struct ht_item *ht_iter_advance(struct ht_iter *, struct ht_item *);
+
+/*
+ * Prepare a hash table with `size' buckets: zero the header,
+ * allocate the bucket array, initialize the rwlock and make every
+ * bucket an empty list.
+ */
+void
+ht_init(struct ht *h, ssize_t size)
+{
+	ssize_t slot;
+
+	memset(h, 0, sizeof(*h));
+	h->ht_nentries = size;
+	h->ht_entries = l9p_calloc((size_t)size, sizeof(struct ht_entry));
+	(void) pthread_rwlock_init(&h->ht_rwlock, NULL);
+
+	slot = 0;
+	while (slot < size) {
+		TAILQ_INIT(&h->ht_entries[slot].hte_items);
+		slot++;
+	}
+}
+
+/*
+ * Tear down a hash table: free every item in every bucket, destroy
+ * the rwlock and release the bucket array.  The stored data
+ * pointers themselves are not freed.
+ */
+void
+ht_destroy(struct ht *h)
+{
+	struct ht_item *victim, *following;
+	ssize_t slot;
+
+	for (slot = 0; slot < h->ht_nentries; slot++) {
+		victim = TAILQ_FIRST(&h->ht_entries[slot].hte_items);
+		while (victim != NULL) {
+			/* fetch the successor before the item goes away */
+			following = TAILQ_NEXT(victim, hti_link);
+			free(victim);
+			victim = following;
+		}
+	}
+
+	(void) pthread_rwlock_destroy(&h->ht_rwlock);
+	free(h->ht_entries);
+	h->ht_entries = NULL;
+}
+
+/*
+ * Look up `hash' while holding the table's read lock.  Returns the
+ * stored data pointer, or NULL if there is no such item or the
+ * lock could not be taken.
+ */
+void *
+ht_find(struct ht *h, uint32_t hash)
+{
+	void *found = NULL;
+
+	if (ht_rdlock(h) == 0) {
+		found = ht_find_locked(h, hash);
+		(void) ht_unlock(h);
+	}
+	return (found);
+}
+
+/*
+ * Look up `hash' in its bucket; the caller already holds a lock on
+ * the table.  Returns the stored data pointer or NULL if absent.
+ */
+void *
+ht_find_locked(struct ht *h, uint32_t hash)
+{
+	struct ht_entry *bucket;
+	struct ht_item *it;
+
+	bucket = &h->ht_entries[hash % h->ht_nentries];
+
+	for (it = TAILQ_FIRST(&bucket->hte_items); it != NULL;
+	    it = TAILQ_NEXT(it, hti_link)) {
+		if (it->hti_hash == hash)
+			return (it->hti_data);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Insert `value' under `hash', taking the write lock.
+ *
+ * Returns 0 on success.  If the write lock cannot be taken, the
+ * lock error is returned.  If an item with the same hash already
+ * exists, errno is set to EEXIST and -1 is returned.
+ */
+int
+ht_add(struct ht *h, uint32_t hash, void *value)
+{
+	struct ht_entry *bucket;
+	struct ht_item *it;
+	int rc;
+
+	rc = ht_wrlock(h);
+	if (rc != 0)
+		return (rc);
+
+	bucket = &h->ht_entries[hash % h->ht_nentries];
+
+	/* leaves `it' non-NULL only if a duplicate was found */
+	TAILQ_FOREACH(it, &bucket->hte_items, hti_link) {
+		if (it->hti_hash == hash)
+			break;
+	}
+	if (it != NULL) {
+		errno = EEXIST;
+		(void) ht_unlock(h);
+		return (-1);
+	}
+
+	it = l9p_calloc(1, sizeof(*it));
+	it->hti_hash = hash;
+	it->hti_data = value;
+	TAILQ_INSERT_TAIL(&bucket->hte_items, it, hti_link);
+	(void) ht_unlock(h);
+
+	return (0);
+}
+
+/*
+ * Remove the item stored under `hash', taking the write lock.
+ * Returns the lock error if the lock cannot be taken, otherwise
+ * the result of ht_remove_locked().
+ */
+int
+ht_remove(struct ht *h, uint32_t hash)
+{
+	int rc;
+
+	if ((rc = ht_wrlock(h)) == 0) {
+		rc = ht_remove_locked(h, hash);
+		(void) ht_unlock(h);
+	}
+	return (rc);
+}
+
+/*
+ * Remove and free the item stored under `hash'; the caller already
+ * holds the write lock.  Returns 0 on success, or -1 with errno set
+ * to ENOENT if no such item exists.
+ */
+int
+ht_remove_locked(struct ht *h, uint32_t hash)
+{
+	struct ht_entry *bucket;
+	struct ht_item *it;
+
+	bucket = &h->ht_entries[hash % h->ht_nentries];
+
+	for (it = TAILQ_FIRST(&bucket->hte_items); it != NULL;
+	    it = TAILQ_NEXT(it, hti_link)) {
+		if (it->hti_hash != hash)
+			continue;
+		/* found it: unlink, free, and stop iterating at once */
+		TAILQ_REMOVE(&bucket->hte_items, it, hti_link);
+		free(it);
+		return (0);
+	}
+
+	errno = ENOENT;
+	return (-1);
+}
+
+/*
+ * Core of iterator advancement.
+ *
+ * When there is a current item and it has a successor in the same
+ * bucket, that successor is the answer and the recorded slot is
+ * left alone.  Otherwise we scan forward from the slot after
+ * iter->htit_slot for the first non-empty bucket, record where the
+ * scan stopped in iter->htit_slot, and return that bucket's first
+ * item (NULL if the scan ran off the end of the table).
+ *
+ * A fresh iterator has htit_slot == -1, so its first scan begins
+ * at slot 0.
+ *
+ * Caller must hold a lock on the table.
+ */
+static struct ht_item *
+ht_iter_advance(struct ht_iter *iter, struct ht_item *cur)
+{
+	struct ht *table = iter->htit_parent;
+	struct ht_item *succ;
+	ssize_t idx;
+
+	succ = (cur != NULL) ? TAILQ_NEXT(cur, hti_link) : NULL;
+	if (succ != NULL)
+		return (succ);
+
+	for (idx = iter->htit_slot + 1; idx < table->ht_nentries; idx++) {
+		succ = TAILQ_FIRST(&table->ht_entries[idx].hte_items);
+		if (succ != NULL)
+			break;
+	}
+	iter->htit_slot = idx;
+
+	return (succ);
+}
+
+/*
+ * Remove the item the iterator is currently positioned on.
+ *
+ * It is an error (errno = EINVAL, return -1) if there is no current
+ * item.  If the write lock cannot be taken, the lock error is
+ * returned.  Because the successor has to be located before the
+ * current item is unlinked, the next item is pre-loaded into the
+ * iterator here; callers must therefore not use this on a table
+ * that is being changed concurrently.
+ */
+int
+ht_remove_at_iter(struct ht_iter *iter)
+{
+	struct ht_item *victim;
+	struct ht *table;
+	ssize_t home;
+	int rc;
+
+	assert(iter != NULL);
+
+	victim = iter->htit_curr;
+	if (victim == NULL) {
+		errno = EINVAL;
+		return (-1);
+	}
+
+	table = iter->htit_parent;
+	rc = ht_wrlock(table);
+	if (rc != 0)
+		return (rc);
+
+	/*
+	 * Note the victim's bucket first: advancing may move
+	 * htit_slot on to a later bucket.
+	 */
+	home = iter->htit_slot;
+	iter->htit_next = ht_iter_advance(iter, victim);
+	TAILQ_REMOVE(&table->ht_entries[home].hte_items, victim, hti_link);
+	(void) ht_unlock(table);
+
+	/* the iterator no longer sits on an item; release the old one */
+	iter->htit_curr = NULL;
+	free(victim);
+
+	return (0);
+}
+
+/*
+ * Set up an iterator over table h.  The first ht_next() call will
+ * yield the first item, the following call the next one, and so
+ * on.  Iterating over a table that is being modified is best
+ * avoided, although we try to behave semi-sensibly if it happens.
+ */
+void
+ht_iter(struct ht *h, struct ht_iter *iter)
+{
+
+	iter->htit_slot = -1;		/* which will increment to 0 */
+	iter->htit_next = NULL;
+	iter->htit_curr = NULL;
+	iter->htit_parent = h;
+}
+
+/*
+ * Step the iterator and return the data pointer of the item it
+ * lands on: the first item on the first call, the following item
+ * on later calls.  Returns NULL when there are no more items, or
+ * when the read lock needed to advance cannot be taken.
+ */
+void *
+ht_next(struct ht_iter *iter)
+{
+	struct ht_item *it;
+	struct ht *table;
+
+	it = iter->htit_next;
+	if (it != NULL) {
+		/* consume the successor pre-loaded by a removal */
+		iter->htit_next = NULL;
+	} else {
+		/* no pre-loaded next; find next from current */
+		table = iter->htit_parent;
+		if (ht_rdlock(table) != 0)
+			return (NULL);
+		it = ht_iter_advance(iter, iter->htit_curr);
+		(void) ht_unlock(table);
+	}
+
+	iter->htit_curr = it;
+	if (it == NULL)
+		return (NULL);
+	return (it->hti_data);
+}
diff --git a/usr/src/lib/lib9p/common/hashtable.h b/usr/src/lib/lib9p/common/hashtable.h
new file mode 100644
index 0000000000..60b8dfff7b
--- /dev/null
+++ b/usr/src/lib/lib9p/common/hashtable.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright 2016 Jakub Klama <jceel@FreeBSD.org>
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef LIB9P_HASHTABLE_H
+#define LIB9P_HASHTABLE_H
+
+#include <pthread.h>
+#include <sys/queue.h>
+
+/*
+ * A hash table: an array of buckets guarded by one read/write lock.
+ */
+struct ht {
+ struct ht_entry * ht_entries; /* bucket array, indexed by slot */
+ ssize_t ht_nentries; /* presumably the bucket count - confirm in ht_init */
+ pthread_rwlock_t ht_rwlock; /* taken via ht_rdlock/ht_wrlock, dropped via ht_unlock */
+};
+
+/*
+ * One bucket of the table: a tail queue of the items stored in it.
+ */
+struct ht_entry {
+ TAILQ_HEAD(, ht_item) hte_items;
+};
+
+/*
+ * One stored element, linked into its bucket's hte_items queue.
+ */
+struct ht_item {
+ uint32_t hti_hash; /* presumably the hash passed to ht_add - confirm */
+ void * hti_data; /* caller's value; this is what ht_next() returns */
+ TAILQ_ENTRY(ht_item) hti_link; /* linkage within hte_items */
+};
+
+/*
+ * Iteration state, set up by ht_iter() and stepped by ht_next().
+ */
+struct ht_iter {
+ struct ht * htit_parent; /* table being walked */
+ struct ht_item * htit_curr; /* item last returned by ht_next(), or NULL */
+ struct ht_item * htit_next; /* successor pre-located by ht_remove_at_iter(), or NULL */
+ ssize_t htit_slot; /* bucket index; -1 before the walk starts */
+};
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wthread-safety-analysis"
+#endif
+
+/*
+ * Take the table's lock for reading (shared access).
+ * Returns whatever pthread_rwlock_rdlock() returns: 0 on success,
+ * otherwise an error number.
+ */
+static inline int
+ht_rdlock(struct ht *h)
+{
+	pthread_rwlock_t *lock = &h->ht_rwlock;
+
+	return (pthread_rwlock_rdlock(lock));
+}
+
+/*
+ * Take the table's lock for writing (exclusive access).
+ * Returns whatever pthread_rwlock_wrlock() returns: 0 on success,
+ * otherwise an error number.
+ */
+static inline int
+ht_wrlock(struct ht *h)
+{
+	pthread_rwlock_t *lock = &h->ht_rwlock;
+
+	return (pthread_rwlock_wrlock(lock));
+}
+
+/*
+ * Drop the table's lock, whether it was held for reading or for
+ * writing.  Returns whatever pthread_rwlock_unlock() returns:
+ * 0 on success, otherwise an error number.
+ */
+static inline int
+ht_unlock(struct ht *h)
+{
+	pthread_rwlock_t *lock = &h->ht_rwlock;
+
+	return (pthread_rwlock_unlock(lock));
+}
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+/*
+ * Table API.  ht_remove_at_iter() and ht_next() take the table lock
+ * themselves; ht_iter() takes no lock.  The *_locked variants
+ * presumably expect the caller to already hold the lock - confirm
+ * against hashtable.c.
+ */
+void ht_init(struct ht *h, ssize_t size);
+void ht_destroy(struct ht *h);
+void *ht_find(struct ht *h, uint32_t hash);
+void *ht_find_locked(struct ht *h, uint32_t hash);
+int ht_add(struct ht *h, uint32_t hash, void *value);
+int ht_remove(struct ht *h, uint32_t hash);
+int ht_remove_locked(struct ht *h, uint32_t hash);
+int ht_remove_at_iter(struct ht_iter *iter);
+void ht_iter(struct ht *h, struct ht_iter *iter);
+void *ht_next(struct ht_iter *iter);
+
+#endif /* LIB9P_HASHTABLE_H */
diff --git a/usr/src/lib/lib9p/common/illumos_endian.h b/usr/src/lib/lib9p/common/illumos_endian.h
new file mode 100644
index 0000000000..ecb7874724
--- /dev/null
+++ b/usr/src/lib/lib9p/common/illumos_endian.h
@@ -0,0 +1,26 @@
+#ifndef __ILLUMOS_ENDIAN_H
+#define __ILLUMOS_ENDIAN_H
+
+/*
+ * Shims to make illumos' endian headers and macros compatible
+ * with FreeBSD's <sys/endian.h>
+ */
+
+# include <endian.h>
+
+/* Two distinct tag values, one per byte order. */
+# define _COMPAT_LITTLE_ENDIAN 0x12345678
+# define _COMPAT_BIG_ENDIAN 0x87654321
+
+/*
+ * At this point _LITTLE_ENDIAN / _BIG_ENDIAN are only tested for
+ * being defined; record whichever is set as a comparable
+ * _BYTE_ORDER value.
+ * NOTE(review): if neither is defined, _BYTE_ORDER is left
+ * undefined - presumably the system headers guarantee exactly one;
+ * confirm.
+ */
+# ifdef _LITTLE_ENDIAN
+# define _BYTE_ORDER _COMPAT_LITTLE_ENDIAN
+# endif
+# ifdef _BIG_ENDIAN
+# define _BYTE_ORDER _COMPAT_BIG_ENDIAN
+# endif
+
+/*
+ * Redefine both names as values, so that code can compare
+ * _BYTE_ORDER against _LITTLE_ENDIAN or _BIG_ENDIAN.  After this
+ * point both names are always defined, so an #ifdef on either one
+ * no longer identifies the byte order.
+ */
+# undef _LITTLE_ENDIAN
+# undef _BIG_ENDIAN
+# define _LITTLE_ENDIAN _COMPAT_LITTLE_ENDIAN
+# define _BIG_ENDIAN _COMPAT_BIG_ENDIAN
+
+#endif /* __ILLUMOS_ENDIAN_H */
diff --git a/usr/src/lib/lib9p/common/lib9p.h b/usr/src/lib/lib9p/common/lib9p.h
new file mode 100644
index 0000000000..3d62e99006
--- /dev/null
+++ b/usr/src/lib/lib9p/common/lib9p.h
@@ -0,0 +1,249 @@
+/*
+ * Copyright 2016 Jakub Klama <jceel@FreeBSD.org>
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+
+#ifndef LIB9P_LIB9P_H
+#define LIB9P_LIB9P_H
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <sys/uio.h>
+#include <pthread.h>
+
+#if defined(__FreeBSD__)
+#include <sys/sbuf.h>
+#else
+#include "sbuf/sbuf.h"
+#endif
+
+#include "fcall.h"
+#include "threadpool.h"
+#include "hashtable.h"
+
+/*
+ * Library-wide limits.  L9P_MAX_IOV is the element count of the
+ * lm_iov and lr_data_iov arrays below.  L9P_DEFAULT_MSIZE and
+ * L9P_NUMTHREADS are presumably the default 9P message size in bytes
+ * and the threadpool size - confirm at their use sites.
+ */
+#define L9P_DEFAULT_MSIZE 8192
+#define L9P_MAX_IOV 128
+#define L9P_NUMTHREADS 8
+
+struct l9p_request;
+struct l9p_backend;
+struct l9p_fid;
+
+/*
+ * Functions to implement underlying transport for lib9p.
+ *
+ * The transport is responsible for:
+ *
+ * - allocating a response buffer (filling in the iovec and niov)
+ * (gets req, pointer to base of iov array of size L9P_MAX_IOV,
+ * pointer to niov, lt_aux)
+ *
+ * - sending a response, when a request has a reply ready
+ * (gets req, pointer to iov, niov, actual response length, lt_aux)
+ *
+ * - dropping the response buffer, when a request has been
+ * flushed or otherwise dropped without a response
+ * (gets req, pointer to iov, niov, lt_aux)
+ *
+ * The transport is of course also responsible for feeding in
+ * request-buffers, but that happens by the transport calling
+ * l9p_connection_recv().
+ */
+struct l9p_transport {
+ void *lt_aux; /* passed as the last argument of each callback */
+ /* allocate a response buffer: fill in the iovec array and *niov */
+ int (*lt_get_response_buffer)(struct l9p_request *, struct iovec *,
+ size_t *, void *);
+ /* send a ready reply: iov, niov, actual response length */
+ int (*lt_send_response)(struct l9p_request *, const struct iovec *,
+ size_t, size_t, void *);
+ /* release the buffer of a request dropped without a response */
+ void (*lt_drop_response)(struct l9p_request *, const struct iovec *,
+ size_t, void *);
+};
+
+/*
+ * Direction of an l9p_message (see lm_mode): packing encodes an
+ * outgoing result, unpacking decodes an incoming request.
+ */
+enum l9p_pack_mode {
+ L9P_PACK,
+ L9P_UNPACK
+};
+
+/*
+ * Integer field kinds.  Each enumerator's value is the field's
+ * width in bytes, so the names can be used directly in size sums.
+ */
+enum l9p_integer_type {
+ L9P_BYTE = 1,
+ L9P_WORD = 2,
+ L9P_DWORD = 4,
+ L9P_QWORD = 8
+};
+
+/*
+ * Protocol variant in use on a connection (lc_version) or the
+ * highest one a server offers (ls_max_version).  Zero is reserved
+ * for "invalid".  Presumably the values are ordered so that
+ * variants can be compared numerically - confirm at use sites.
+ */
+enum l9p_version {
+ L9P_INVALID_VERSION = 0,
+ L9P_2000 = 1,
+ L9P_2000U = 2,
+ L9P_2000L = 3
+};
+
+/*
+ * This structure is used for unpacking (decoding) incoming
+ * requests and packing (encoding) outgoing results. It has its
+ * own copy of the iov array, with its own counters for working
+ * through that array, but it borrows the actual DATA from the
+ * original iov array associated with the original request (see
+ * below).
+ */
+struct l9p_message {
+ enum l9p_pack_mode lm_mode; /* packing or unpacking */
+ struct iovec lm_iov[L9P_MAX_IOV]; /* private copy of the iov array; data is borrowed */
+ size_t lm_niov; /* presumably the number of lm_iov entries in use */
+ size_t lm_cursor_iov; /* presumably the index of the iov being worked on */
+ size_t lm_cursor_offset; /* presumably the byte offset within that iov */
+ size_t lm_size; /* presumably bytes packed/unpacked so far - confirm in pack.c */
+};
+
+/*
+ * Data structure for a request/response pair (Tfoo/Rfoo).
+ *
+ * Note that the response is not formatted out into raw data
+ * (overwriting the request raw data) until we are really
+ * responding, with the exception of read operations Tread
+ * and Treaddir, which overlay their result-data into the
+ * iov array in the process of reading.
+ *
+ * We have room for two incoming fids, in case we are
+ * using 9P2000.L protocol. Note that nothing that uses two
+ * fids also has an output fid (newfid), so we could have a
+ * union of lr_fid2 and lr_newfid, but keeping them separate
+ * is probably a bit less error-prone. (If we want to shave
+ * memory requirements there are more places to look.)
+ *
+ * (The fid, fid2, and newfid fields should be removed via
+ * reorganization, as they are only used for smuggling data
+ * between request.c and the backend and should just be
+ * parameters to backend ops.)
+ */
+struct l9p_request {
+ struct l9p_message lr_req_msg; /* for unpacking the request */
+ struct l9p_message lr_resp_msg; /* for packing the response */
+ union l9p_fcall lr_req; /* the request, decoded/unpacked */
+ union l9p_fcall lr_resp; /* the response, not yet packed */
+
+ /* fids passed between request.c and the backend */
+ struct l9p_fid *lr_fid;
+ struct l9p_fid *lr_fid2;
+ struct l9p_fid *lr_newfid;
+
+ struct l9p_connection *lr_conn; /* containing connection */
+ void *lr_aux; /* reserved for transport layer */
+
+ struct iovec lr_data_iov[L9P_MAX_IOV]; /* iovecs for req + resp */
+ size_t lr_data_niov; /* actual size of data_iov */
+
+ int lr_error; /* result from l9p_dispatch_request */
+
+ /* protected by threadpool mutex */
+ enum l9p_workstate lr_workstate; /* threadpool: work state */
+ enum l9p_flushstate lr_flushstate; /* flush state if flushee */
+ struct l9p_worker *lr_worker; /* threadpool: worker */
+ STAILQ_ENTRY(l9p_request) lr_worklink; /* reserved to threadpool */
+
+ /* protected by tag hash table lock */
+ struct l9p_request_queue lr_flushq; /* q of flushers */
+ STAILQ_ENTRY(l9p_request) lr_flushlink; /* link w/in flush queue */
+};
+
+/* N.B.: these dirents are variable length and for .L only */
+struct l9p_dirent {
+ struct l9p_qid qid; /* qid of the entry */
+ uint64_t offset; /* presumably the directory offset of this entry - confirm */
+ uint8_t type; /* entry type code; encoding not visible here */
+ char *name; /* entry name; this is the variable-length part */
+};
+
+/*
+ * The 9pfs protocol has the notion of a "session", which is
+ * traffic between any two "Tversion" requests. All fids
+ * (lc_files, below) are specific to one particular session.
+ *
+ * We need a data structure per connection (client/server
+ * pair). This data structure lasts longer than these 9pfs
+ * sessions, but contains the request/response pairs and fids.
+ * Logically, the per-session data should be separate, but
+ * most of the time that would just require an extra
+ * indirection. Instead, a new session simply clunks all
+ * fids, and otherwise keeps using this same connection.
+ */
+struct l9p_connection {
+ struct l9p_server *lc_server; /* owning server */
+ struct l9p_transport lc_lt; /* transport callbacks for this connection */
+ struct l9p_threadpool lc_tp; /* worker threads for this connection */
+ enum l9p_version lc_version; /* protocol variant in use */
+ uint32_t lc_msize; /* presumably the negotiated message size - confirm */
+ uint32_t lc_max_io_size; /* presumably the largest I/O payload - confirm */
+ struct ht lc_files; /* fids of the current session */
+ struct ht lc_requests; /* presumably outstanding requests keyed by tag - confirm */
+ LIST_ENTRY(l9p_connection) lc_link; /* presumably linkage on ls_conns - confirm */
+};
+
+/*
+ * A server: one backend plus the list of its connections.
+ */
+struct l9p_server {
+ struct l9p_backend *ls_backend; /* backend that implements the operations */
+ enum l9p_version ls_max_version; /* presumably the highest variant offered - confirm */
+ LIST_HEAD(, l9p_connection) ls_conns; /* connections of this server */
+};
+
+int l9p_pufcall(struct l9p_message *msg, union l9p_fcall *fcall,
+ enum l9p_version version);
+ssize_t l9p_pustat(struct l9p_message *msg, struct l9p_stat *s,
+ enum l9p_version version);
+uint16_t l9p_sizeof_stat(struct l9p_stat *stat, enum l9p_version version);
+int l9p_pack_stat(struct l9p_message *msg, struct l9p_request *req,
+ struct l9p_stat *s);
+ssize_t l9p_pudirent(struct l9p_message *msg, struct l9p_dirent *de);
+
+int l9p_server_init(struct l9p_server **serverp, struct l9p_backend *backend);
+
+int l9p_connection_init(struct l9p_server *server,
+ struct l9p_connection **connp);
+void l9p_connection_free(struct l9p_connection *conn);
+void l9p_connection_recv(struct l9p_connection *conn, const struct iovec *iov,
+ size_t niov, void *aux);
+void l9p_connection_close(struct l9p_connection *conn);
+struct l9p_fid *l9p_connection_alloc_fid(struct l9p_connection *conn,
+ uint32_t fid);
+void l9p_connection_remove_fid(struct l9p_connection *conn,
+ struct l9p_fid *fid);
+
+int l9p_dispatch_request(struct l9p_request *req);
+void l9p_respond(struct l9p_request *req, bool drop, bool rmtag);
+
+void l9p_init_msg(struct l9p_message *msg, struct l9p_request *req,
+ enum l9p_pack_mode mode);
+void l9p_seek_iov(const struct iovec *iov1, size_t niov1, struct iovec *iov2,
+ size_t *niov2, size_t seek);
+size_t l9p_truncate_iov(struct iovec *iov, size_t niov, size_t length);
+void l9p_describe_fcall(union l9p_fcall *fcall, enum l9p_version version,
+ struct sbuf *sb);
+void l9p_freefcall(union l9p_fcall *fcall);
+void l9p_freestat(struct l9p_stat *stat);
+
+gid_t *l9p_getgrlist(const char *, gid_t, int *);
+
+#endif /* LIB9P_LIB9P_H */
diff --git a/usr/src/lib/lib9p/common/lib9p_impl.h b/usr/src/lib/lib9p/common/lib9p_impl.h
new file mode 100644
index 0000000000..41ff07ae18
--- /dev/null
+++ b/usr/src/lib/lib9p/common/lib9p_impl.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2016 Jakub Klama <jceel@FreeBSD.org>
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef LIB9P_LIB9P_IMPL_H
+#define LIB9P_LIB9P_IMPL_H
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifndef _KERNEL
+/*
+ * malloc() wrapper that never returns NULL: if the allocation
+ * fails it prints a diagnostic to stderr and calls abort().
+ * NOTE(review): malloc(0) is allowed to return NULL, in which case
+ * a zero-byte request also aborts here.
+ */
+static inline void *
+l9p_malloc(size_t size)
+{
+	void *r = malloc(size);
+
+	if (r == NULL) {
+		/* size_t is unsigned, so the conversion is %zu, not %zd */
+		fprintf(stderr, "cannot allocate %zu bytes: out of memory\n",
+		    size);
+		abort();
+	}
+
+	return (r);
+}
+
+/*
+ * calloc() wrapper that never returns NULL: if the allocation
+ * fails it prints a diagnostic to stderr and calls abort().
+ */
+static inline void *
+l9p_calloc(size_t n, size_t size)
+{
+	void *r = calloc(n, size);
+
+	if (r == NULL) {
+		/*
+		 * Print both factors rather than n * size: the product
+		 * can wrap around, and that is exactly the case in
+		 * which calloc() is most likely to have failed.
+		 */
+		fprintf(stderr,
+		    "cannot allocate %zu * %zu bytes: out of memory\n",
+		    n, size);
+		abort();
+	}
+
+	return (r);
+}
+
+/*
+ * realloc() wrapper that never returns NULL: if the reallocation
+ * fails it prints a diagnostic to stderr and calls abort().
+ * NOTE(review): realloc(ptr, 0) is allowed to return NULL, in which
+ * case a resize to zero also aborts here.
+ */
+static inline void *
+l9p_realloc(void *ptr, size_t newsize)
+{
+	void *r = realloc(ptr, newsize);
+
+	if (r == NULL) {
+		/* size_t is unsigned, so the conversion is %zu, not %zd */
+		fprintf(stderr, "cannot allocate %zu bytes: out of memory\n",
+		    newsize);
+		abort();
+	}
+
+	return (r);
+}
+#endif /* _KERNEL */
+
+#endif /* LIB9P_LIB9P_IMPL_H */
diff --git a/usr/src/lib/lib9p/common/linux_errno.h b/usr/src/lib/lib9p/common/linux_errno.h
new file mode 100644
index 0000000000..72778daa23
--- /dev/null
+++ b/usr/src/lib/lib9p/common/linux_errno.h
@@ -0,0 +1,247 @@
+/*
+ * Copyright 2016 Chris Torek <torek@ixsystems.com>
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef LIB9P_LINUX_ERRNO_H
+#define LIB9P_LINUX_ERRNO_H
+
+/*
+ * Linux error numbers that are outside of the original base range
+ * (which ends with ERANGE).
+ *
+ * This is pretty much the same as Linux's errno.h except that the
+ * names are prefixed with "LINUX_", and we add _STR with the
+ * string name.
+ *
+ * The string expansions were obtained with a little program to
+ * print every strerror().
+ *
+ * Note that BSD EDEADLK is 11 and BSD EAGAIN is 35, vs
+ * Linux / Plan9 EAGAIN at 11. So one value in the ERANGE
+ * range still needs translation too.
+ */
+
+/* Inside the base range, but numbered differently on BSD (see above). */
+#define LINUX_EAGAIN 11
+#define LINUX_EAGAIN_STR "Resource temporarily unavailable"
+
+#define LINUX_EDEADLK 35
+#define LINUX_EDEADLK_STR "Resource deadlock avoided"
+#define LINUX_ENAMETOOLONG 36
+#define LINUX_ENAMETOOLONG_STR "File name too long"
+#define LINUX_ENOLCK 37
+#define LINUX_ENOLCK_STR "No locks available"
+#define LINUX_ENOSYS 38
+#define LINUX_ENOSYS_STR "Function not implemented"
+#define LINUX_ENOTEMPTY 39
+#define LINUX_ENOTEMPTY_STR "Directory not empty"
+#define LINUX_ELOOP 40
+#define LINUX_ELOOP_STR "Too many levels of symbolic links"
+/* 41 unused */
+#define LINUX_ENOMSG 42
+#define LINUX_ENOMSG_STR "No message of desired type"
+#define LINUX_EIDRM 43
+#define LINUX_EIDRM_STR "Identifier removed"
+#define LINUX_ECHRNG 44
+#define LINUX_ECHRNG_STR "Channel number out of range"
+#define LINUX_EL2NSYNC 45
+#define LINUX_EL2NSYNC_STR "Level 2 not synchronized"
+#define LINUX_EL3HLT 46
+#define LINUX_EL3HLT_STR "Level 3 halted"
+#define LINUX_EL3RST 47
+#define LINUX_EL3RST_STR "Level 3 reset"
+#define LINUX_ELNRNG 48
+#define LINUX_ELNRNG_STR "Link number out of range"
+#define LINUX_EUNATCH 49
+#define LINUX_EUNATCH_STR "Protocol driver not attached"
+#define LINUX_ENOCSI 50
+#define LINUX_ENOCSI_STR "No CSI structure available"
+#define LINUX_EL2HLT 51
+#define LINUX_EL2HLT_STR "Level 2 halted"
+#define LINUX_EBADE 52
+#define LINUX_EBADE_STR "Invalid exchange"
+#define LINUX_EBADR 53
+#define LINUX_EBADR_STR "Invalid request descriptor"
+#define LINUX_EXFULL 54
+#define LINUX_EXFULL_STR "Exchange full"
+#define LINUX_ENOANO 55
+#define LINUX_ENOANO_STR "No anode"
+#define LINUX_EBADRQC 56
+#define LINUX_EBADRQC_STR "Invalid request code"
+#define LINUX_EBADSLT 57
+#define LINUX_EBADSLT_STR "Invalid slot"
+/* 58 unused */
+#define LINUX_EBFONT 59
+#define LINUX_EBFONT_STR "Bad font file format"
+#define LINUX_ENOSTR 60
+#define LINUX_ENOSTR_STR "Device not a stream"
+#define LINUX_ENODATA 61
+#define LINUX_ENODATA_STR "No data available"
+#define LINUX_ETIME 62
+#define LINUX_ETIME_STR "Timer expired"
+#define LINUX_ENOSR 63
+#define LINUX_ENOSR_STR "Out of streams resources"
+#define LINUX_ENONET 64
+#define LINUX_ENONET_STR "Machine is not on the network"
+#define LINUX_ENOPKG 65
+#define LINUX_ENOPKG_STR "Package not installed"
+#define LINUX_EREMOTE 66
+#define LINUX_EREMOTE_STR "Object is remote"
+#define LINUX_ENOLINK 67
+#define LINUX_ENOLINK_STR "Link has been severed"
+#define LINUX_EADV 68
+#define LINUX_EADV_STR "Advertise error"
+#define LINUX_ESRMNT 69
+#define LINUX_ESRMNT_STR "Srmount error"
+#define LINUX_ECOMM 70
+#define LINUX_ECOMM_STR "Communication error on send"
+#define LINUX_EPROTO 71
+#define LINUX_EPROTO_STR "Protocol error"
+#define LINUX_EMULTIHOP 72
+#define LINUX_EMULTIHOP_STR "Multihop attempted"
+#define LINUX_EDOTDOT 73
+#define LINUX_EDOTDOT_STR "RFS specific error"
+#define LINUX_EBADMSG 74
+#define LINUX_EBADMSG_STR "Bad message"
+#define LINUX_EOVERFLOW 75
+#define LINUX_EOVERFLOW_STR "Value too large for defined data type"
+#define LINUX_ENOTUNIQ 76
+#define LINUX_ENOTUNIQ_STR "Name not unique on network"
+#define LINUX_EBADFD 77
+#define LINUX_EBADFD_STR "File descriptor in bad state"
+#define LINUX_EREMCHG 78
+#define LINUX_EREMCHG_STR "Remote address changed"
+#define LINUX_ELIBACC 79
+#define LINUX_ELIBACC_STR "Can not access a needed shared library"
+#define LINUX_ELIBBAD 80
+#define LINUX_ELIBBAD_STR "Accessing a corrupted shared library"
+#define LINUX_ELIBSCN 81
+#define LINUX_ELIBSCN_STR ".lib section in a.out corrupted"
+#define LINUX_ELIBMAX 82
+#define LINUX_ELIBMAX_STR "Attempting to link in too many shared libraries"
+#define LINUX_ELIBEXEC 83
+#define LINUX_ELIBEXEC_STR "Cannot exec a shared library directly"
+#define LINUX_EILSEQ 84
+#define LINUX_EILSEQ_STR "Invalid or incomplete multibyte or wide character"
+#define LINUX_ERESTART 85
+#define LINUX_ERESTART_STR "Interrupted system call should be restarted"
+#define LINUX_ESTRPIPE 86
+#define LINUX_ESTRPIPE_STR "Streams pipe error"
+#define LINUX_EUSERS 87
+#define LINUX_EUSERS_STR "Too many users"
+#define LINUX_ENOTSOCK 88
+#define LINUX_ENOTSOCK_STR "Socket operation on non-socket"
+#define LINUX_EDESTADDRREQ 89
+#define LINUX_EDESTADDRREQ_STR "Destination address required"
+#define LINUX_EMSGSIZE 90
+#define LINUX_EMSGSIZE_STR "Message too long"
+#define LINUX_EPROTOTYPE 91
+#define LINUX_EPROTOTYPE_STR "Protocol wrong type for socket"
+#define LINUX_ENOPROTOOPT 92
+#define LINUX_ENOPROTOOPT_STR "Protocol not available"
+#define LINUX_EPROTONOSUPPORT 93
+#define LINUX_EPROTONOSUPPORT_STR "Protocol not supported"
+#define LINUX_ESOCKTNOSUPPORT 94
+#define LINUX_ESOCKTNOSUPPORT_STR "Socket type not supported"
+#define LINUX_EOPNOTSUPP 95
+#define LINUX_EOPNOTSUPP_STR "Operation not supported"
+#define LINUX_EPFNOSUPPORT 96
+#define LINUX_EPFNOSUPPORT_STR "Protocol family not supported"
+#define LINUX_EAFNOSUPPORT 97
+#define LINUX_EAFNOSUPPORT_STR "Address family not supported by protocol"
+#define LINUX_EADDRINUSE 98
+#define LINUX_EADDRINUSE_STR "Address already in use"
+#define LINUX_EADDRNOTAVAIL 99
+#define LINUX_EADDRNOTAVAIL_STR "Cannot assign requested address"
+#define LINUX_ENETDOWN 100
+#define LINUX_ENETDOWN_STR "Network is down"
+#define LINUX_ENETUNREACH 101
+#define LINUX_ENETUNREACH_STR "Network is unreachable"
+#define LINUX_ENETRESET 102
+#define LINUX_ENETRESET_STR "Network dropped connection on reset"
+#define LINUX_ECONNABORTED 103
+#define LINUX_ECONNABORTED_STR "Software caused connection abort"
+#define LINUX_ECONNRESET 104
+#define LINUX_ECONNRESET_STR "Connection reset by peer"
+#define LINUX_ENOBUFS 105
+#define LINUX_ENOBUFS_STR "No buffer space available"
+#define LINUX_EISCONN 106
+#define LINUX_EISCONN_STR "Transport endpoint is already connected"
+#define LINUX_ENOTCONN 107
+#define LINUX_ENOTCONN_STR "Transport endpoint is not connected"
+#define LINUX_ESHUTDOWN 108
+#define LINUX_ESHUTDOWN_STR "Cannot send after transport endpoint shutdown"
+#define LINUX_ETOOMANYREFS 109
+#define LINUX_ETOOMANYREFS_STR "Too many references: cannot splice"
+#define LINUX_ETIMEDOUT 110
+#define LINUX_ETIMEDOUT_STR "Connection timed out"
+#define LINUX_ECONNREFUSED 111
+#define LINUX_ECONNREFUSED_STR "Connection refused"
+#define LINUX_EHOSTDOWN 112
+#define LINUX_EHOSTDOWN_STR "Host is down"
+#define LINUX_EHOSTUNREACH 113
+#define LINUX_EHOSTUNREACH_STR "No route to host"
+#define LINUX_EALREADY 114
+#define LINUX_EALREADY_STR "Operation already in progress"
+#define LINUX_EINPROGRESS 115
+#define LINUX_EINPROGRESS_STR "Operation now in progress"
+#define LINUX_ESTALE 116
+#define LINUX_ESTALE_STR "Stale file handle"
+#define LINUX_EUCLEAN 117
+#define LINUX_EUCLEAN_STR "Structure needs cleaning"
+#define LINUX_ENOTNAM 118
+#define LINUX_ENOTNAM_STR "Not a XENIX named type file"
+#define LINUX_ENAVAIL 119
+#define LINUX_ENAVAIL_STR "No XENIX semaphores available"
+#define LINUX_EISNAM 120
+#define LINUX_EISNAM_STR "Is a named type file"
+#define LINUX_EREMOTEIO 121
+#define LINUX_EREMOTEIO_STR "Remote I/O error"
+#define LINUX_EDQUOT 122
+#define LINUX_EDQUOT_STR "Quota exceeded"
+#define LINUX_ENOMEDIUM 123
+#define LINUX_ENOMEDIUM_STR "No medium found"
+#define LINUX_EMEDIUMTYPE 124
+#define LINUX_EMEDIUMTYPE_STR "Wrong medium type"
+#define LINUX_ECANCELED 125
+#define LINUX_ECANCELED_STR "Operation canceled"
+#define LINUX_ENOKEY 126
+#define LINUX_ENOKEY_STR "Required key not available"
+#define LINUX_EKEYEXPIRED 127
+#define LINUX_EKEYEXPIRED_STR "Key has expired"
+#define LINUX_EKEYREVOKED 128
+#define LINUX_EKEYREVOKED_STR "Key has been revoked"
+#define LINUX_EKEYREJECTED 129
+#define LINUX_EKEYREJECTED_STR "Key was rejected by service"
+#define LINUX_EOWNERDEAD 130
+#define LINUX_EOWNERDEAD_STR "Owner died"
+#define LINUX_ENOTRECOVERABLE 131
+#define LINUX_ENOTRECOVERABLE_STR "State not recoverable"
+#define LINUX_ERFKILL 132
+#define LINUX_ERFKILL_STR "Operation not possible due to RF-kill"
+#define LINUX_EHWPOISON 133
+#define LINUX_EHWPOISON_STR "Memory page has hardware error"
+
+#endif /* LIB9P_LINUX_ERRNO_H */
diff --git a/usr/src/lib/lib9p/common/log.c b/usr/src/lib/lib9p/common/log.c
new file mode 100644
index 0000000000..fb2596a16f
--- /dev/null
+++ b/usr/src/lib/lib9p/common/log.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2016 Jakub Klama <jceel@FreeBSD.org>
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include "log.h"
+
+/*
+ * Labels printed by l9p_logf(), indexed by enum l9p_log_level.
+ * The order here must stay in step with the enumerators in log.h.
+ */
+static const char *l9p_log_level_names[] = {
+ "DEBUG",
+ "INFO",
+ "WARN",
+ "ERROR"
+};
+
+/*
+ * Write one log line of the form "[LEVEL]\t func: message".
+ *
+ * The destination is chosen on first use from the LIB9P_LOGGING
+ * environment variable: "stderr" selects stderr, any other value is
+ * opened as a file in append mode.  While the variable is unset, or
+ * the file cannot be opened, the call does nothing and the lookup
+ * is repeated on the next call.
+ *
+ * level - selects the label; out-of-range values print "UNKNOWN"
+ * func  - name of the calling function
+ * fmt   - printf-style format for the message, followed by its
+ *         arguments
+ *
+ * NOTE(review): the lazily-initialized static stream is not
+ * synchronized; concurrent first calls may each open the file.
+ */
+void
+l9p_logf(enum l9p_log_level level, const char *func, const char *fmt, ...)
+{
+	const size_t nlevels = sizeof (l9p_log_level_names) /
+	    sizeof (l9p_log_level_names[0]);
+	const char *dest = NULL;
+	const char *name;
+	static FILE *stream = NULL;
+	va_list ap;
+
+	if (stream == NULL) {
+		dest = getenv("LIB9P_LOGGING");
+		if (dest == NULL)
+			return;
+		else if (!strcmp(dest, "stderr"))
+			stream = stderr;
+		else {
+			stream = fopen(dest, "a");
+			if (stream == NULL)
+				return;
+		}
+	}
+
+	/*
+	 * Never index past the name table.  The cast to size_t also
+	 * turns a negative level into a large value that fails the
+	 * range check.
+	 */
+	if ((size_t)level < nlevels)
+		name = l9p_log_level_names[level];
+	else
+		name = "UNKNOWN";
+
+	va_start(ap, fmt);
+	fprintf(stream, "[%s]\t %s: ", name, func);
+	vfprintf(stream, fmt, ap);
+	fprintf(stream, "\n");
+	fflush(stream);
+	va_end(ap);
+}
diff --git a/usr/src/lib/lib9p/common/log.h b/usr/src/lib/lib9p/common/log.h
new file mode 100644
index 0000000000..b801d4017a
--- /dev/null
+++ b/usr/src/lib/lib9p/common/log.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2016 Jakub Klama <jceel@FreeBSD.org>
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef LIB9P_LOG_H
+#define LIB9P_LOG_H
+
+/*
+ * Log severities, passed as the first argument of l9p_logf().
+ * The values index the level-name table in log.c, so the order of
+ * the enumerators must match that table.
+ */
+enum l9p_log_level {
+ L9P_DEBUG,
+ L9P_INFO,
+ L9P_WARNING,
+ L9P_ERROR
+};
+
+/* Emit one log line; func is the caller's name, fmt is printf-style. */
+void l9p_logf(enum l9p_log_level level, const char *func, const char *fmt, ...);
+
+/*
+ * L9P_LOG() forwards to l9p_logf() with __func__ filled in when the
+ * preprocessor macro L9P_DEBUG is defined, and expands to nothing
+ * otherwise.
+ *
+ * NOTE(review): defined() only sees macros, not the L9P_DEBUG
+ * enumerator above, so logging is off unless the build defines such
+ * a macro.  Because the macro shares its name with the enumerator,
+ * it has to expand to something valid in the enum declaration -
+ * presumably the build uses -DL9P_DEBUG=L9P_DEBUG; confirm in the
+ * makefiles.
+ */
+#if defined(L9P_DEBUG)
+#define L9P_LOG(level, fmt, ...) l9p_logf(level, __func__, fmt, ##__VA_ARGS__)
+#else
+#define L9P_LOG(level, fmt, ...)
+#endif
+
+#endif /* LIB9P_LOG_H */
diff --git a/usr/src/lib/lib9p/common/pack.c b/usr/src/lib/lib9p/common/pack.c
new file mode 100644
index 0000000000..13ec5f02b5
--- /dev/null
+++ b/usr/src/lib/lib9p/common/pack.c
@@ -0,0 +1,996 @@
+/*
+ * Copyright 2016 Jakub Klama <jceel@FreeBSD.org>
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * Based on libixp code: ©2007-2010 Kris Maglione <maglione.k at Gmail>
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#ifdef __APPLE__
+# include "apple_endian.h"
+#elif __illumos__
+# include "illumos_endian.h"
+# include <sys/sysmacros.h>
+#else
+# include <sys/endian.h>
+#endif
+#include <sys/uio.h>
+#include "lib9p.h"
+#include "lib9p_impl.h"
+#include "log.h"
+
+/* Number of elements in a true array (not a pointer). */
+#define N(ary) (sizeof(ary) / sizeof(*(ary)))
+/*
+ * Wire size of a string: the length word plus the string bytes.  A NULL
+ * pointer counts as an empty string.  Note that `s' is evaluated twice.
+ */
+#define STRING_SIZE(s) (L9P_WORD + ((s) != NULL ? (uint16_t)strlen(s) : 0))
+/* Wire size of a qid: type byte, 32-bit version, 64-bit path. */
+#define QID_SIZE (L9P_BYTE + L9P_DWORD + L9P_QWORD)
+
+/*
+ * Forward declarations of the file-local pack/unpack primitives.  Each
+ * one works in the direction selected by the message's lm_mode and
+ * returns a byte count on success or -1 on failure.
+ */
+static ssize_t l9p_iov_io(struct l9p_message *, void *, size_t);
+static inline ssize_t l9p_pu8(struct l9p_message *, uint8_t *);
+static inline ssize_t l9p_pu16(struct l9p_message *, uint16_t *);
+static inline ssize_t l9p_pu32(struct l9p_message *, uint32_t *);
+static inline ssize_t l9p_pu64(struct l9p_message *, uint64_t *);
+static ssize_t l9p_pustring(struct l9p_message *, char **s);
+static ssize_t l9p_pustrings(struct l9p_message *, uint16_t *, char **, size_t);
+static ssize_t l9p_puqid(struct l9p_message *, struct l9p_qid *);
+static ssize_t l9p_puqids(struct l9p_message *, uint16_t *, struct l9p_qid *q);
+
+/*
+ * Copy len bytes between the caller's buffer and the iovec array held
+ * in msg, starting at msg's cursor.  msg->lm_mode selects the
+ * direction: L9P_PACK copies from buffer into the iovecs, L9P_UNPACK
+ * copies from the iovecs into buffer.
+ *
+ * On success the cursor is advanced, msg->lm_size grows by len, and
+ * len (converted to signed) is returned.  A zero-length request always
+ * succeeds and returns 0.  If the iovecs run out first, -1 is returned
+ * and msg->lm_size is not updated.
+ *
+ * After a failure the cursor is left past the last iovec, so any later
+ * call with len > 0 on the same (un-reset) msg fails as well.  Callers
+ * can therefore issue a run of transfers and test only the last one.
+ */
+static ssize_t
+l9p_iov_io(struct l9p_message *msg, void *buffer, size_t len)
+{
+	char *data = buffer;
+	size_t moved, remaining;
+
+	assert(msg != NULL);
+
+	if (len == 0)
+		return (0);
+
+	if (msg->lm_cursor_iov >= msg->lm_niov)
+		return (-1);
+
+	assert(buffer != NULL);
+
+	for (moved = 0, remaining = len; remaining > 0; ) {
+		size_t cur = msg->lm_cursor_iov;
+		size_t avail = msg->lm_iov[cur].iov_len - msg->lm_cursor_offset;
+		size_t chunk = MIN(avail, remaining);
+		char *wire = (char *)msg->lm_iov[cur].iov_base +
+		    msg->lm_cursor_offset;
+
+		if (msg->lm_mode == L9P_PACK)
+			memcpy(wire, data + moved, chunk);
+		else if (msg->lm_mode == L9P_UNPACK)
+			memcpy(data + moved, wire, chunk);
+
+		msg->lm_cursor_offset += chunk;
+		moved += chunk;
+		remaining -= chunk;
+
+		/* Room left in this iovec means the request is satisfied. */
+		if (chunk < avail)
+			continue;
+
+		/* This iovec is used up; move to the start of the next. */
+		msg->lm_cursor_iov++;
+		msg->lm_cursor_offset = 0;
+		if (remaining > 0 && msg->lm_cursor_iov >= msg->lm_niov)
+			return (-1);
+	}
+
+	msg->lm_size += moved;
+	return ((ssize_t)moved);
+}
+
+/*
+ * Pack or unpack one byte; no byte-order handling is needed.
+ *
+ * Returns 1 on success or -1 on failure.
+ */
+static inline ssize_t
+l9p_pu8(struct l9p_message *msg, uint8_t *val)
+{
+	ssize_t n = l9p_iov_io(msg, val, sizeof (*val));
+
+	return (n);
+}
+
+/*
+ * Pack or unpack a 16-bit value, which is little-endian on the wire.
+ *
+ * Returns 2 on success or -1 on failure.
+ */
+static inline ssize_t
+l9p_pu16(struct l9p_message *msg, uint16_t *val)
+{
+#if _BYTE_ORDER == _LITTLE_ENDIAN
+	/* Host order already matches wire order: transfer in place. */
+	return (l9p_iov_io(msg, val, sizeof (*val)));
+#else
+	uint16_t wire;
+	ssize_t n;
+
+	if (msg->lm_mode != L9P_PACK) {
+		/* Read the wire bytes, then swap into host order. */
+		n = l9p_iov_io(msg, val, sizeof (*val));
+		*val = le16toh(*val);
+		return (n);
+	}
+
+	/* Swap into a temporary so the caller's value is untouched. */
+	wire = htole16(*val);
+	return (l9p_iov_io(msg, &wire, sizeof (wire)));
+#endif
+}
+
+/*
+ * Pack or unpack a 32-bit value, which is little-endian on the wire.
+ *
+ * Returns 4 on success or -1 on failure.
+ */
+static inline ssize_t
+l9p_pu32(struct l9p_message *msg, uint32_t *val)
+{
+#if _BYTE_ORDER == _LITTLE_ENDIAN
+	/* Host order already matches wire order: transfer in place. */
+	return (l9p_iov_io(msg, val, sizeof (*val)));
+#else
+	uint32_t wire;
+	ssize_t n;
+
+	if (msg->lm_mode != L9P_PACK) {
+		/* Read the wire bytes, then swap into host order. */
+		n = l9p_iov_io(msg, val, sizeof (*val));
+		*val = le32toh(*val);
+		return (n);
+	}
+
+	/* Swap into a temporary so the caller's value is untouched. */
+	wire = htole32(*val);
+	return (l9p_iov_io(msg, &wire, sizeof (wire)));
+#endif
+}
+
+/*
+ * Pack or unpack a 64-bit value, which is little-endian on the wire.
+ *
+ * Returns 8 on success or -1 on failure.
+ */
+static inline ssize_t
+l9p_pu64(struct l9p_message *msg, uint64_t *val)
+{
+#if _BYTE_ORDER != _LITTLE_ENDIAN
+	uint64_t copy;
+	ssize_t ret;
+
+	if (msg->lm_mode == L9P_PACK) {
+		/* Swap into a temporary so the caller's value is untouched. */
+		copy = htole64(*val);
+		return (l9p_iov_io(msg, &copy, sizeof (uint64_t)));
+	}
+	/*
+	 * All eight bytes must be read here; a shorter read would leave
+	 * half of *val unset and the cursor four bytes short.
+	 */
+	ret = l9p_iov_io(msg, val, sizeof (uint64_t));
+	*val = le64toh(*val);
+	return (ret);
+#else
+	/* Host order already matches wire order: transfer in place. */
+	return (l9p_iov_io(msg, val, sizeof (uint64_t)));
+#endif
+}
+
+/*
+ * Pack or unpack one string.  On the wire a string is a 2-byte length
+ * followed by that many bytes, with no terminating NUL.
+ *
+ * When packing, a NULL *s is sent as an empty string.  When unpacking,
+ * a fresh NUL-terminated buffer is allocated and stored in *s; if a
+ * later step fails the buffer is left in *s and is not freed here.
+ *
+ * Returns the string length plus 2, or -1 on error: out of room,
+ * allocation failure, or an unpacked string containing a NUL byte.
+ * This can therefore fail even when every l9p_iov_io() call succeeded.
+ */
+static ssize_t
+l9p_pustring(struct l9p_message *msg, char **s)
+{
+	uint16_t len;
+
+	if (msg->lm_mode == L9P_PACK)
+		len = (*s == NULL) ? 0 : (uint16_t)strlen(*s);
+
+	if (l9p_pu16(msg, &len) < 0)
+		return (-1);
+
+	if (msg->lm_mode != L9P_UNPACK) {
+		/* Packing: just emit the bytes. */
+		if (l9p_iov_io(msg, *s, len) < 0)
+			return (-1);
+		return ((ssize_t)len + 2);
+	}
+
+	/* Unpacking: one extra zeroed byte provides the terminator. */
+	*s = l9p_calloc(1, len + 1);
+	if (*s == NULL)
+		return (-1);
+
+	if (l9p_iov_io(msg, *s, len) < 0)
+		return (-1);
+
+	/*
+	 * A NUL inside the string is illegal.  Accepting it would only
+	 * shorten the string, but it is rejected outright.
+	 */
+	if (memchr(*s, '\0', len) != NULL)
+		return (-1);
+
+	return ((ssize_t)len + 2);
+}
+
+/*
+ * Pack or unpack a counted array of strings, handling at most max of
+ * them.
+ *
+ * The two directions treat an over-large count differently:
+ *  - packing: only the count written to the wire is clamped to max;
+ *    the caller's *num is left unchanged;
+ *  - unpacking: *num itself is reduced to max.
+ *
+ * Returns the total bytes transferred, including the 2-byte count, or
+ * -1 on error.
+ */
+static ssize_t
+l9p_pustrings(struct l9p_message *msg, uint16_t *num, char **strings,
+    size_t max)
+{
+	size_t i, lim;
+	ssize_t total, n;
+	uint16_t wire_count;
+
+	if (msg->lm_mode == L9P_PACK) {
+		lim = (*num > max) ? max : *num;
+		wire_count = (uint16_t)lim;
+		total = l9p_pu16(msg, &wire_count);
+	} else {
+		total = l9p_pu16(msg, num);
+		lim = *num;
+		if (lim > max) {
+			lim = max;
+			*num = (uint16_t)max;
+		}
+	}
+	if (total < 0)
+		return (-1);
+
+	for (i = 0; i < lim; i++) {
+		n = l9p_pustring(msg, &strings[i]);
+		if (n <= 0)
+			return (-1);
+		total += n;
+	}
+
+	return (total);
+}
+
+/*
+ * Pack or unpack one qid: type byte, 32-bit version, 64-bit path.
+ *
+ * Returns QID_SIZE on success or -1 on error.
+ */
+static ssize_t
+l9p_puqid(struct l9p_message *msg, struct l9p_qid *qid)
+{
+	const int packing = (msg->lm_mode == L9P_PACK);
+	uint8_t type;
+	ssize_t r;
+
+	/* The type goes through a local uint8_t in both directions. */
+	if (packing)
+		type = qid->type;
+	r = l9p_pu8(msg, &type);
+	if (!packing)
+		qid->type = type;
+
+	/* Each later field is attempted only if the previous one worked. */
+	if (r > 0)
+		r = l9p_pu32(msg, &qid->version);
+	if (r > 0)
+		r = l9p_pu64(msg, &qid->path);
+
+	if (r <= 0)
+		return (r);
+	return (QID_SIZE);
+}
+
+/*
+ * Pack or unpack a counted array of qids.  The 2-byte count is
+ * transferred first (so *num is set from the wire when unpacking),
+ * followed by *num qids.
+ *
+ * Unlike l9p_pustrings(), no upper bound is applied to *num here, so
+ * the qids array must have room for however many entries *num names.
+ *
+ * Returns 2 + QID_SIZE * *num on success, or -1 on error.
+ */
+static ssize_t
+l9p_puqids(struct l9p_message *msg, uint16_t *num, struct l9p_qid *qids)
+{
+	size_t i, count;
+	ssize_t total, n;
+
+	total = l9p_pu16(msg, num);
+	if (total <= 0)
+		return (total);
+
+	count = *num;
+	for (i = 0; i < count; i++) {
+		n = l9p_puqid(msg, &qids[i]);
+		if (n < 0)
+			return (-1);
+		total += n;
+	}
+
+	return (total);
+}
+
+/*
+ * Pack or unpack a l9p_stat.
+ *
+ * The encoding is variable-length (it contains strings), and under
+ * L9P_2000U and later it carries an extension string and three numeric
+ * ids in addition.
+ *
+ * The individual transfers are not checked one by one.  Their results
+ * are summed and compared with the size field: once one transfer runs
+ * out of room the rest fail too, so the sum falls short.
+ *
+ * Returns the number of bytes packed/unpacked, or -1 on error.
+ */
+ssize_t
+l9p_pustat(struct l9p_message *msg, struct l9p_stat *stat,
+    enum l9p_version version)
+{
+	ssize_t r = 0;
+	/*
+	 * Start from 0 so that the final comparison never reads an
+	 * indeterminate value when unpacking and the size cannot be read.
+	 */
+	uint16_t size = 0;
+
+	/* The on-wire size field excludes the size of the size field. */
+	if (msg->lm_mode == L9P_PACK)
+		size = l9p_sizeof_stat(stat, version) - 2;
+
+	r += l9p_pu16(msg, &size);
+	r += l9p_pu16(msg, &stat->type);
+	r += l9p_pu32(msg, &stat->dev);
+	r += l9p_puqid(msg, &stat->qid);
+	r += l9p_pu32(msg, &stat->mode);
+	r += l9p_pu32(msg, &stat->atime);
+	r += l9p_pu32(msg, &stat->mtime);
+	r += l9p_pu64(msg, &stat->length);
+	r += l9p_pustring(msg, &stat->name);
+	r += l9p_pustring(msg, &stat->uid);
+	r += l9p_pustring(msg, &stat->gid);
+	r += l9p_pustring(msg, &stat->muid);
+
+	if (version >= L9P_2000U) {
+		r += l9p_pustring(msg, &stat->extension);
+		r += l9p_pu32(msg, &stat->n_uid);
+		r += l9p_pu32(msg, &stat->n_gid);
+		r += l9p_pu32(msg, &stat->n_muid);
+	}
+
+	/* Fewer bytes than the size field promises means a failure. */
+	if (r < size + 2)
+		return (-1);
+
+	return (r);
+}
+
+/*
+ * Pack or unpack one variable-length directory entry: qid, 64-bit
+ * offset, type byte, then the name string.
+ *
+ * When unpacking, the name is allocated by l9p_pustring() and the
+ * caller is responsible for freeing it.
+ *
+ * Returns the wire-format length, or -1 if we ran out of room.
+ */
+ssize_t
+l9p_pudirent(struct l9p_message *msg, struct l9p_dirent *de)
+{
+	ssize_t fixed, name;
+
+	/* Fixed-size part: results are summed and checked at the end. */
+	fixed = l9p_puqid(msg, &de->qid);
+	fixed += l9p_pu64(msg, &de->offset);
+	fixed += l9p_pu8(msg, &de->type);
+
+	/* The name is attempted regardless of the result so far. */
+	name = l9p_pustring(msg, &de->name);
+
+	if (name < 0 || fixed < QID_SIZE + 8 + 1)
+		return (-1);
+
+	return (fixed + name);
+}
+
+/*
+ * Pack or unpack a request or response (fcall).
+ *
+ * Every message starts with a 32-bit length, a type byte and a 16-bit
+ * tag; the rest of the layout is selected by the type.  Within a case,
+ * fixed-size fields are mostly transferred without checking each
+ * result: once l9p_iov_io() has run out of room every later transfer
+ * fails too, so the result of the last one is enough.  String
+ * transfers can fail for other reasons and are checked individually
+ * whenever more fields follow them.
+ *
+ * When packing, the length is not known up front: a placeholder is
+ * written first and the real value is installed at the end.
+ *
+ * A type with no case here is logged (when logging is compiled in) and
+ * otherwise treated like a message with no body.
+ *
+ * Returns 0 on success, -1 on error.  (It's up to the caller
+ * to call l9p_freefcall on our failure.)
+ */
+int
+l9p_pufcall(struct l9p_message *msg, union l9p_fcall *fcall,
+    enum l9p_version version)
+{
+	uint32_t length = 0;
+	ssize_t r;
+
+	/*
+	 * Get overall length, type, and tag, which should appear
+	 * in all messages.  If not even that works, abort immediately.
+	 */
+	l9p_pu32(msg, &length);
+	l9p_pu8(msg, &fcall->hdr.type);
+	r = l9p_pu16(msg, &fcall->hdr.tag);
+	if (r < 0)
+		return (-1);
+
+	/*
+	 * Decode remainder of message.  When unpacking, this may
+	 * allocate memory, even if we fail during the decode.
+	 * Note that the initial fcall is zeroed out, though, so
+	 * we can just freefcall() to release whatever might have
+	 * gotten allocated, if the unpack fails due to a short
+	 * packet.
+	 */
+	switch (fcall->hdr.type) {
+	case L9P_TVERSION:
+	case L9P_RVERSION:
+		l9p_pu32(msg, &fcall->version.msize);
+		r = l9p_pustring(msg, &fcall->version.version);
+		break;
+
+	case L9P_TAUTH:
+		l9p_pu32(msg, &fcall->tauth.afid);
+		r = l9p_pustring(msg, &fcall->tauth.uname);
+		if (r < 0)
+			break;
+		r = l9p_pustring(msg, &fcall->tauth.aname);
+		if (r < 0)
+			break;
+		if (version >= L9P_2000U)
+			r = l9p_pu32(msg, &fcall->tauth.n_uname);
+		break;
+
+	case L9P_RAUTH:
+		r = l9p_puqid(msg, &fcall->rauth.aqid);
+		break;
+
+	case L9P_TATTACH:
+		l9p_pu32(msg, &fcall->hdr.fid);
+		l9p_pu32(msg, &fcall->tattach.afid);
+		r = l9p_pustring(msg, &fcall->tattach.uname);
+		if (r < 0)
+			break;
+		r = l9p_pustring(msg, &fcall->tattach.aname);
+		if (r < 0)
+			break;
+		if (version >= L9P_2000U)
+			r = l9p_pu32(msg, &fcall->tattach.n_uname);
+		break;
+
+	case L9P_RATTACH:
+		r = l9p_puqid(msg, &fcall->rattach.qid);
+		break;
+
+	case L9P_RERROR:
+		r = l9p_pustring(msg, &fcall->error.ename);
+		if (r < 0)
+			break;
+		if (version >= L9P_2000U)
+			r = l9p_pu32(msg, &fcall->error.errnum);
+		break;
+
+	case L9P_RLERROR:
+		r = l9p_pu32(msg, &fcall->error.errnum);
+		break;
+
+	case L9P_TFLUSH:
+		r = l9p_pu16(msg, &fcall->tflush.oldtag);
+		break;
+
+	case L9P_RFLUSH:
+		break;
+
+	case L9P_TWALK:
+		l9p_pu32(msg, &fcall->hdr.fid);
+		l9p_pu32(msg, &fcall->twalk.newfid);
+		r = l9p_pustrings(msg, &fcall->twalk.nwname,
+		    fcall->twalk.wname, N(fcall->twalk.wname));
+		break;
+
+	case L9P_RWALK:
+		r = l9p_puqids(msg, &fcall->rwalk.nwqid, fcall->rwalk.wqid);
+		break;
+
+	case L9P_TOPEN:
+		l9p_pu32(msg, &fcall->hdr.fid);
+		r = l9p_pu8(msg, &fcall->topen.mode);
+		break;
+
+	case L9P_ROPEN:
+		l9p_puqid(msg, &fcall->ropen.qid);
+		r = l9p_pu32(msg, &fcall->ropen.iounit);
+		break;
+
+	case L9P_TCREATE:
+		l9p_pu32(msg, &fcall->hdr.fid);
+		r = l9p_pustring(msg, &fcall->tcreate.name);
+		if (r < 0)
+			break;
+		l9p_pu32(msg, &fcall->tcreate.perm);
+		r = l9p_pu8(msg, &fcall->tcreate.mode);
+		if (version >= L9P_2000U)
+			r = l9p_pustring(msg, &fcall->tcreate.extension);
+		break;
+
+	case L9P_RCREATE:
+		l9p_puqid(msg, &fcall->rcreate.qid);
+		r = l9p_pu32(msg, &fcall->rcreate.iounit);
+		break;
+
+	case L9P_TREAD:
+	case L9P_TREADDIR:
+		l9p_pu32(msg, &fcall->hdr.fid);
+		l9p_pu64(msg, &fcall->io.offset);
+		r = l9p_pu32(msg, &fcall->io.count);
+		break;
+
+	case L9P_RREAD:
+	case L9P_RREADDIR:
+		/* Only the count is handled here, not the data itself. */
+		r = l9p_pu32(msg, &fcall->io.count);
+		break;
+
+	case L9P_TWRITE:
+		l9p_pu32(msg, &fcall->hdr.fid);
+		l9p_pu64(msg, &fcall->io.offset);
+		r = l9p_pu32(msg, &fcall->io.count);
+		break;
+
+	case L9P_RWRITE:
+		r = l9p_pu32(msg, &fcall->io.count);
+		break;
+
+	case L9P_TCLUNK:
+	case L9P_TSTAT:
+	case L9P_TREMOVE:
+	case L9P_TSTATFS:
+		r = l9p_pu32(msg, &fcall->hdr.fid);
+		break;
+
+	case L9P_RCLUNK:
+	case L9P_RREMOVE:
+		break;
+
+	case L9P_RSTAT:
+		{
+			/* Outer size word that precedes the stat itself. */
+			uint16_t size = l9p_sizeof_stat(&fcall->rstat.stat,
+			    version);
+			l9p_pu16(msg, &size);
+			r = l9p_pustat(msg, &fcall->rstat.stat, version);
+		}
+		break;
+
+	case L9P_TWSTAT:
+		{
+			/*
+			 * Outer size word, as for Rstat.  It has to be
+			 * computed when packing; when unpacking it is
+			 * overwritten with the value from the wire.
+			 */
+			uint16_t size = 0;
+
+			if (msg->lm_mode == L9P_PACK) {
+				size = l9p_sizeof_stat(&fcall->twstat.stat,
+				    version);
+			}
+			l9p_pu32(msg, &fcall->hdr.fid);
+			l9p_pu16(msg, &size);
+			r = l9p_pustat(msg, &fcall->twstat.stat, version);
+		}
+		break;
+
+	case L9P_RWSTAT:
+		break;
+
+	case L9P_RSTATFS:
+		l9p_pu32(msg, &fcall->rstatfs.statfs.type);
+		l9p_pu32(msg, &fcall->rstatfs.statfs.bsize);
+		l9p_pu64(msg, &fcall->rstatfs.statfs.blocks);
+		l9p_pu64(msg, &fcall->rstatfs.statfs.bfree);
+		l9p_pu64(msg, &fcall->rstatfs.statfs.bavail);
+		l9p_pu64(msg, &fcall->rstatfs.statfs.files);
+		l9p_pu64(msg, &fcall->rstatfs.statfs.ffree);
+		l9p_pu64(msg, &fcall->rstatfs.statfs.fsid);
+		r = l9p_pu32(msg, &fcall->rstatfs.statfs.namelen);
+		break;
+
+	case L9P_TLOPEN:
+		l9p_pu32(msg, &fcall->hdr.fid);
+		r = l9p_pu32(msg, &fcall->tlopen.flags);
+		break;
+
+	case L9P_RLOPEN:
+		l9p_puqid(msg, &fcall->rlopen.qid);
+		r = l9p_pu32(msg, &fcall->rlopen.iounit);
+		break;
+
+	case L9P_TLCREATE:
+		l9p_pu32(msg, &fcall->hdr.fid);
+		r = l9p_pustring(msg, &fcall->tlcreate.name);
+		if (r < 0)
+			break;
+		l9p_pu32(msg, &fcall->tlcreate.flags);
+		l9p_pu32(msg, &fcall->tlcreate.mode);
+		r = l9p_pu32(msg, &fcall->tlcreate.gid);
+		break;
+
+	case L9P_RLCREATE:
+		l9p_puqid(msg, &fcall->rlcreate.qid);
+		r = l9p_pu32(msg, &fcall->rlcreate.iounit);
+		break;
+
+	case L9P_TSYMLINK:
+		l9p_pu32(msg, &fcall->hdr.fid);
+		r = l9p_pustring(msg, &fcall->tsymlink.name);
+		if (r < 0)
+			break;
+		r = l9p_pustring(msg, &fcall->tsymlink.symtgt);
+		if (r < 0)
+			break;
+		/*
+		 * NOTE(review): the gid is accessed through the tlcreate
+		 * view of the union rather than tsymlink.  Presumably
+		 * the two gid members overlap -- confirm in fcall.h.
+		 */
+		r = l9p_pu32(msg, &fcall->tlcreate.gid);
+		break;
+
+	case L9P_RSYMLINK:
+		r = l9p_puqid(msg, &fcall->rsymlink.qid);
+		break;
+
+	case L9P_TMKNOD:
+		l9p_pu32(msg, &fcall->hdr.fid);
+		r = l9p_pustring(msg, &fcall->tmknod.name);
+		if (r < 0)
+			break;
+		l9p_pu32(msg, &fcall->tmknod.mode);
+		l9p_pu32(msg, &fcall->tmknod.major);
+		l9p_pu32(msg, &fcall->tmknod.minor);
+		r = l9p_pu32(msg, &fcall->tmknod.gid);
+		break;
+
+	case L9P_RMKNOD:
+		r = l9p_puqid(msg, &fcall->rmknod.qid);
+		break;
+
+	case L9P_TRENAME:
+		l9p_pu32(msg, &fcall->hdr.fid);
+		l9p_pu32(msg, &fcall->trename.dfid);
+		r = l9p_pustring(msg, &fcall->trename.name);
+		break;
+
+	case L9P_RRENAME:
+		break;
+
+	case L9P_TREADLINK:
+		r = l9p_pu32(msg, &fcall->hdr.fid);
+		break;
+
+	case L9P_RREADLINK:
+		r = l9p_pustring(msg, &fcall->rreadlink.target);
+		break;
+
+	case L9P_TGETATTR:
+		l9p_pu32(msg, &fcall->hdr.fid);
+		r = l9p_pu64(msg, &fcall->tgetattr.request_mask);
+		break;
+
+	case L9P_RGETATTR:
+		l9p_pu64(msg, &fcall->rgetattr.valid);
+		l9p_puqid(msg, &fcall->rgetattr.qid);
+		l9p_pu32(msg, &fcall->rgetattr.mode);
+		l9p_pu32(msg, &fcall->rgetattr.uid);
+		l9p_pu32(msg, &fcall->rgetattr.gid);
+		l9p_pu64(msg, &fcall->rgetattr.nlink);
+		l9p_pu64(msg, &fcall->rgetattr.rdev);
+		l9p_pu64(msg, &fcall->rgetattr.size);
+		l9p_pu64(msg, &fcall->rgetattr.blksize);
+		l9p_pu64(msg, &fcall->rgetattr.blocks);
+		l9p_pu64(msg, &fcall->rgetattr.atime_sec);
+		l9p_pu64(msg, &fcall->rgetattr.atime_nsec);
+		l9p_pu64(msg, &fcall->rgetattr.mtime_sec);
+		l9p_pu64(msg, &fcall->rgetattr.mtime_nsec);
+		l9p_pu64(msg, &fcall->rgetattr.ctime_sec);
+		l9p_pu64(msg, &fcall->rgetattr.ctime_nsec);
+		l9p_pu64(msg, &fcall->rgetattr.btime_sec);
+		l9p_pu64(msg, &fcall->rgetattr.btime_nsec);
+		l9p_pu64(msg, &fcall->rgetattr.gen);
+		r = l9p_pu64(msg, &fcall->rgetattr.data_version);
+		break;
+
+	case L9P_TSETATTR:
+		l9p_pu32(msg, &fcall->hdr.fid);
+		l9p_pu32(msg, &fcall->tsetattr.valid);
+		l9p_pu32(msg, &fcall->tsetattr.mode);
+		l9p_pu32(msg, &fcall->tsetattr.uid);
+		l9p_pu32(msg, &fcall->tsetattr.gid);
+		l9p_pu64(msg, &fcall->tsetattr.size);
+		l9p_pu64(msg, &fcall->tsetattr.atime_sec);
+		l9p_pu64(msg, &fcall->tsetattr.atime_nsec);
+		l9p_pu64(msg, &fcall->tsetattr.mtime_sec);
+		r = l9p_pu64(msg, &fcall->tsetattr.mtime_nsec);
+		break;
+
+	case L9P_RSETATTR:
+		break;
+
+	case L9P_TXATTRWALK:
+		l9p_pu32(msg, &fcall->hdr.fid);
+		l9p_pu32(msg, &fcall->txattrwalk.newfid);
+		r = l9p_pustring(msg, &fcall->txattrwalk.name);
+		break;
+
+	case L9P_RXATTRWALK:
+		r = l9p_pu64(msg, &fcall->rxattrwalk.size);
+		break;
+
+	case L9P_TXATTRCREATE:
+		l9p_pu32(msg, &fcall->hdr.fid);
+		r = l9p_pustring(msg, &fcall->txattrcreate.name);
+		if (r < 0)
+			break;
+		l9p_pu64(msg, &fcall->txattrcreate.attr_size);
+		r = l9p_pu32(msg, &fcall->txattrcreate.flags);
+		break;
+
+	case L9P_RXATTRCREATE:
+		break;
+
+	case L9P_TFSYNC:
+		r = l9p_pu32(msg, &fcall->hdr.fid);
+		break;
+
+	case L9P_RFSYNC:
+		break;
+
+	case L9P_TLOCK:
+		l9p_pu32(msg, &fcall->hdr.fid);
+		l9p_pu8(msg, &fcall->tlock.type);
+		l9p_pu32(msg, &fcall->tlock.flags);
+		l9p_pu64(msg, &fcall->tlock.start);
+		l9p_pu64(msg, &fcall->tlock.length);
+		l9p_pu32(msg, &fcall->tlock.proc_id);
+		r = l9p_pustring(msg, &fcall->tlock.client_id);
+		break;
+
+	case L9P_RLOCK:
+		r = l9p_pu8(msg, &fcall->rlock.status);
+		break;
+
+	case L9P_TGETLOCK:
+		/* Same body as Rgetlock, preceded by the fid. */
+		l9p_pu32(msg, &fcall->hdr.fid);
+		/* FALLTHROUGH */
+
+	case L9P_RGETLOCK:
+		l9p_pu8(msg, &fcall->getlock.type);
+		l9p_pu64(msg, &fcall->getlock.start);
+		l9p_pu64(msg, &fcall->getlock.length);
+		l9p_pu32(msg, &fcall->getlock.proc_id);
+		r = l9p_pustring(msg, &fcall->getlock.client_id);
+		break;
+
+	case L9P_TLINK:
+		/* Here the directory fid is transferred before hdr.fid. */
+		l9p_pu32(msg, &fcall->tlink.dfid);
+		l9p_pu32(msg, &fcall->hdr.fid);
+		r = l9p_pustring(msg, &fcall->tlink.name);
+		break;
+
+	case L9P_RLINK:
+		break;
+
+	case L9P_TMKDIR:
+		l9p_pu32(msg, &fcall->hdr.fid);
+		r = l9p_pustring(msg, &fcall->tmkdir.name);
+		if (r < 0)
+			break;
+		l9p_pu32(msg, &fcall->tmkdir.mode);
+		r = l9p_pu32(msg, &fcall->tmkdir.gid);
+		break;
+
+	case L9P_RMKDIR:
+		r = l9p_puqid(msg, &fcall->rmkdir.qid);
+		break;
+
+	case L9P_TRENAMEAT:
+		l9p_pu32(msg, &fcall->hdr.fid);
+		r = l9p_pustring(msg, &fcall->trenameat.oldname);
+		if (r < 0)
+			break;
+		l9p_pu32(msg, &fcall->trenameat.newdirfid);
+		r = l9p_pustring(msg, &fcall->trenameat.newname);
+		break;
+
+	case L9P_RRENAMEAT:
+		break;
+
+	case L9P_TUNLINKAT:
+		l9p_pu32(msg, &fcall->hdr.fid);
+		r = l9p_pustring(msg, &fcall->tunlinkat.name);
+		if (r < 0)
+			break;
+		r = l9p_pu32(msg, &fcall->tunlinkat.flags);
+		break;
+
+	case L9P_RUNLINKAT:
+		break;
+
+	default:
+		/* r still holds the (successful) header result here. */
+		L9P_LOG(L9P_ERROR, "%s(): missing case for type %d",
+		    __func__, fcall->hdr.type);
+		break;
+	}
+
+	/* Check for over- or under-run, or pustring error. */
+	if (r < 0)
+		return (-1);
+
+	if (msg->lm_mode == L9P_PACK) {
+		/* Rewind to the beginning and install size at front. */
+		uint32_t len = (uint32_t)msg->lm_size;
+		msg->lm_cursor_offset = 0;
+		msg->lm_cursor_iov = 0;
+
+		/*
+		 * Subtract 4 bytes from current size, because we're
+		 * overwriting size (rewinding message to the beginning)
+		 * and writing again, which will increase it 4 more.
+		 */
+		msg->lm_size -= sizeof(uint32_t);
+
+		/*
+		 * Read replies only had their count packed above, so the
+		 * data bytes are added to the advertised length here.
+		 */
+		if (fcall->hdr.type == L9P_RREAD ||
+		    fcall->hdr.type == L9P_RREADDIR)
+			len += fcall->io.count;
+
+		l9p_pu32(msg, &len);
+	}
+
+	return (0);
+}
+
+/*
+ * Free any strings or other data malloc'ed in the process of
+ * packing or unpacking an fcall.
+ *
+ * What gets freed is chosen by fcall->hdr.type.  Message types that
+ * have no case below are left untouched.  The fcall structure itself
+ * is not freed, and the freed pointers are not cleared.
+ */
+void
+l9p_freefcall(union l9p_fcall *fcall)
+{
+	uint16_t i;
+
+	switch (fcall->hdr.type) {
+
+	case L9P_TVERSION:
+	case L9P_RVERSION:
+		free(fcall->version.version);
+		return;
+
+	case L9P_TAUTH:
+		/* Both strings are allocated when a Tauth is unpacked. */
+		free(fcall->tauth.aname);
+		free(fcall->tauth.uname);
+		return;
+
+	case L9P_TATTACH:
+		free(fcall->tattach.aname);
+		free(fcall->tattach.uname);
+		return;
+
+	case L9P_TWALK:
+		for (i = 0; i < fcall->twalk.nwname; i++)
+			free(fcall->twalk.wname[i]);
+		return;
+
+	case L9P_TCREATE:
+	case L9P_TOPEN:
+		/*
+		 * NOTE(review): Topen is released through the tcreate
+		 * view of the union; presumably the two share a layout
+		 * -- confirm in fcall.h.
+		 */
+		free(fcall->tcreate.name);
+		free(fcall->tcreate.extension);
+		return;
+
+	case L9P_RSTAT:
+		l9p_freestat(&fcall->rstat.stat);
+		return;
+
+	case L9P_TWSTAT:
+		l9p_freestat(&fcall->twstat.stat);
+		return;
+
+	case L9P_TLCREATE:
+		free(fcall->tlcreate.name);
+		return;
+
+	case L9P_TSYMLINK:
+		free(fcall->tsymlink.name);
+		free(fcall->tsymlink.symtgt);
+		return;
+
+	case L9P_TMKNOD:
+		free(fcall->tmknod.name);
+		return;
+
+	case L9P_TRENAME:
+		free(fcall->trename.name);
+		return;
+
+	case L9P_RREADLINK:
+		free(fcall->rreadlink.target);
+		return;
+
+	case L9P_TXATTRWALK:
+		free(fcall->txattrwalk.name);
+		return;
+
+	case L9P_TXATTRCREATE:
+		free(fcall->txattrcreate.name);
+		return;
+
+	case L9P_TLOCK:
+		free(fcall->tlock.client_id);
+		return;
+
+	case L9P_TGETLOCK:
+	case L9P_RGETLOCK:
+		free(fcall->getlock.client_id);
+		return;
+
+	case L9P_TLINK:
+		free(fcall->tlink.name);
+		return;
+
+	case L9P_TMKDIR:
+		free(fcall->tmkdir.name);
+		return;
+
+	case L9P_TRENAMEAT:
+		free(fcall->trenameat.oldname);
+		free(fcall->trenameat.newname);
+		return;
+
+	case L9P_TUNLINKAT:
+		free(fcall->tunlinkat.name);
+		return;
+	}
+}
+
+/*
+ * Release the five strings owned by a l9p_stat.  The structure itself
+ * is not freed and the pointers are not cleared.
+ */
+void
+l9p_freestat(struct l9p_stat *stat)
+{
+	char *owned[] = {
+		stat->name,
+		stat->extension,
+		stat->uid,
+		stat->gid,
+		stat->muid
+	};
+	size_t i;
+
+	/* free(NULL) is a no-op, so unset members need no special case. */
+	for (i = 0; i < sizeof (owned) / sizeof (owned[0]); i++)
+		free(owned[i]);
+}
+
+/*
+ * Compute the encoded size of a l9p_stat, including its leading size
+ * word.  NULL strings count as empty.  For L9P_2000U and later the
+ * extension string and three extra 32-bit fields are included.
+ *
+ * The result is accumulated in (and truncated to) 16 bits.
+ */
+uint16_t
+l9p_sizeof_stat(struct l9p_stat *stat, enum l9p_version version)
+{
+	uint16_t size;
+
+	/* Fixed-size fields. */
+	size = L9P_WORD			/* size */
+	    + L9P_WORD			/* type */
+	    + L9P_DWORD			/* dev */
+	    + QID_SIZE			/* qid */
+	    + 3 * L9P_DWORD		/* mode, atime, mtime */
+	    + L9P_QWORD;		/* length */
+
+	/* Strings present in every protocol version. */
+	size += STRING_SIZE(stat->name);
+	size += STRING_SIZE(stat->uid);
+	size += STRING_SIZE(stat->gid);
+	size += STRING_SIZE(stat->muid);
+
+	if (version < L9P_2000U)
+		return (size);
+
+	/* Additional fields: extension, then n_uid, n_gid, n_muid. */
+	size += STRING_SIZE(stat->extension);
+	size += 3 * L9P_DWORD;
+
+	return (size);
+}
diff --git a/usr/src/lib/lib9p/common/request.c b/usr/src/lib/lib9p/common/request.c
new file mode 100644
index 0000000000..99885690af
--- /dev/null
+++ b/usr/src/lib/lib9p/common/request.c
@@ -0,0 +1,1446 @@
+/*
+ * Copyright 2016 Jakub Klama <jceel@FreeBSD.org>
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <errno.h>
+#include <sys/param.h>
+#include <sys/uio.h>
+#ifdef __illumos__
+#include <sys/sysmacros.h>
+#endif
+#if defined(__FreeBSD__)
+#include <sys/sbuf.h>
+#else
+#include "sbuf/sbuf.h"
+#endif
+#include "lib9p.h"
+#include "lib9p_impl.h"
+#include "fcall.h"
+#include "fid.h"
+#include "hashtable.h"
+#include "log.h"
+#include "linux_errno.h"
+#include "backend/backend.h"
+#include "threadpool.h"
+
+#define N(x) (sizeof(x) / sizeof(x[0]))
+
+static int l9p_dispatch_tversion(struct l9p_request *req);
+static int l9p_dispatch_tattach(struct l9p_request *req);
+static int l9p_dispatch_tclunk(struct l9p_request *req);
+static int l9p_dispatch_tcreate(struct l9p_request *req);
+static int l9p_dispatch_topen(struct l9p_request *req);
+static int l9p_dispatch_tread(struct l9p_request *req);
+static int l9p_dispatch_tremove(struct l9p_request *req);
+static int l9p_dispatch_tstat(struct l9p_request *req);
+static int l9p_dispatch_twalk(struct l9p_request *req);
+static int l9p_dispatch_twrite(struct l9p_request *req);
+static int l9p_dispatch_twstat(struct l9p_request *req);
+static int l9p_dispatch_tstatfs(struct l9p_request *req);
+static int l9p_dispatch_tlopen(struct l9p_request *req);
+static int l9p_dispatch_tlcreate(struct l9p_request *req);
+static int l9p_dispatch_tsymlink(struct l9p_request *req);
+static int l9p_dispatch_tmknod(struct l9p_request *req);
+static int l9p_dispatch_trename(struct l9p_request *req);
+static int l9p_dispatch_treadlink(struct l9p_request *req);
+static int l9p_dispatch_tgetattr(struct l9p_request *req);
+static int l9p_dispatch_tsetattr(struct l9p_request *req);
+static int l9p_dispatch_txattrwalk(struct l9p_request *req);
+static int l9p_dispatch_txattrcreate(struct l9p_request *req);
+static int l9p_dispatch_treaddir(struct l9p_request *req);
+static int l9p_dispatch_tfsync(struct l9p_request *req);
+static int l9p_dispatch_tlock(struct l9p_request *req);
+static int l9p_dispatch_tgetlock(struct l9p_request *req);
+static int l9p_dispatch_tlink(struct l9p_request *req);
+static int l9p_dispatch_tmkdir(struct l9p_request *req);
+static int l9p_dispatch_trenameat(struct l9p_request *req);
+static int l9p_dispatch_tunlinkat(struct l9p_request *req);
+
+/*
+ * Each Txxx handler has a "must run" flag. If it is false,
+ * we check for a flush request before calling the handler.
+ * If a flush is already requested we can instantly fail the
+ * request with EINTR.
+ *
+ * Tclunk and Tremove must run because they make their fids
+ * become invalid. Tversion and Tattach should never get
+ * a flush request applied (it makes no sense as the connection
+ * is not really running yet), so it should be harmless to
+ * set them either way, but for now we have them as must-run.
+ * Flushing a Tflush is not really allowed either so we keep
+ * these as must-run too (although they run without being done
+ * threaded anyway).
+ *
+ * The dispatch tables below use positional initializers, so the
+ * member order here must not change.
+ */
+struct l9p_handler {
+ enum l9p_ftype type; /* request type matched against hdr.type */
+ int (*handler)(struct l9p_request *); /* returns 0 or an errno value */
+ bool must_run; /* run even if a flush was already requested */
+};
+
+/*
+ * Dispatch table for a connection that has not negotiated a version
+ * yet: the only request accepted is Tversion.
+ */
+static const struct l9p_handler l9p_handlers_no_version[] = {
+ {L9P_TVERSION, l9p_dispatch_tversion, true},
+};
+
+/*
+ * Dispatch table for plain 9P2000.
+ * Columns: request type, handler, must-run flag.
+ */
+static const struct l9p_handler l9p_handlers_base[] = {
+ {L9P_TVERSION, l9p_dispatch_tversion, true},
+ {L9P_TATTACH, l9p_dispatch_tattach, true},
+ {L9P_TCLUNK, l9p_dispatch_tclunk, true},
+ {L9P_TFLUSH, l9p_threadpool_tflush, true},
+ {L9P_TCREATE, l9p_dispatch_tcreate, false},
+ {L9P_TOPEN, l9p_dispatch_topen, false},
+ {L9P_TREAD, l9p_dispatch_tread, false},
+ {L9P_TWRITE, l9p_dispatch_twrite, false},
+ {L9P_TREMOVE, l9p_dispatch_tremove, true},
+ {L9P_TSTAT, l9p_dispatch_tstat, false},
+ {L9P_TWALK, l9p_dispatch_twalk, false},
+ {L9P_TWSTAT, l9p_dispatch_twstat, false}
+};
+/*
+ * Dispatch table for 9P2000.u.  It currently lists the same request
+ * types and handlers as the plain 9P2000 table.
+ */
+static const struct l9p_handler l9p_handlers_dotu[] = {
+ {L9P_TVERSION, l9p_dispatch_tversion, true},
+ {L9P_TATTACH, l9p_dispatch_tattach, true},
+ {L9P_TCLUNK, l9p_dispatch_tclunk, true},
+ {L9P_TFLUSH, l9p_threadpool_tflush, true},
+ {L9P_TCREATE, l9p_dispatch_tcreate, false},
+ {L9P_TOPEN, l9p_dispatch_topen, false},
+ {L9P_TREAD, l9p_dispatch_tread, false},
+ {L9P_TWRITE, l9p_dispatch_twrite, false},
+ {L9P_TREMOVE, l9p_dispatch_tremove, true},
+ {L9P_TSTAT, l9p_dispatch_tstat, false},
+ {L9P_TWALK, l9p_dispatch_twalk, false},
+ {L9P_TWSTAT, l9p_dispatch_twstat, false}
+};
+/*
+ * Dispatch table for 9P2000.L: the twelve entries of the base table
+ * followed by the additional .L requests.  Of the additions, only
+ * Tlock and Tgetlock are marked must-run.
+ */
+static const struct l9p_handler l9p_handlers_dotL[] = {
+ {L9P_TVERSION, l9p_dispatch_tversion, true},
+ {L9P_TATTACH, l9p_dispatch_tattach, true},
+ {L9P_TCLUNK, l9p_dispatch_tclunk, true},
+ {L9P_TFLUSH, l9p_threadpool_tflush, true},
+ {L9P_TCREATE, l9p_dispatch_tcreate, false},
+ {L9P_TOPEN, l9p_dispatch_topen, false},
+ {L9P_TREAD, l9p_dispatch_tread, false},
+ {L9P_TWRITE, l9p_dispatch_twrite, false},
+ {L9P_TREMOVE, l9p_dispatch_tremove, true},
+ {L9P_TSTAT, l9p_dispatch_tstat, false},
+ {L9P_TWALK, l9p_dispatch_twalk, false},
+ {L9P_TWSTAT, l9p_dispatch_twstat, false},
+ {L9P_TSTATFS, l9p_dispatch_tstatfs, false},
+ {L9P_TLOPEN, l9p_dispatch_tlopen, false},
+ {L9P_TLCREATE, l9p_dispatch_tlcreate, false},
+ {L9P_TSYMLINK, l9p_dispatch_tsymlink, false},
+ {L9P_TMKNOD, l9p_dispatch_tmknod, false},
+ {L9P_TRENAME, l9p_dispatch_trename, false},
+ {L9P_TREADLINK, l9p_dispatch_treadlink, false},
+ {L9P_TGETATTR, l9p_dispatch_tgetattr, false},
+ {L9P_TSETATTR, l9p_dispatch_tsetattr, false},
+ {L9P_TXATTRWALK, l9p_dispatch_txattrwalk, false},
+ {L9P_TXATTRCREATE, l9p_dispatch_txattrcreate, false},
+ {L9P_TREADDIR, l9p_dispatch_treaddir, false},
+ {L9P_TFSYNC, l9p_dispatch_tfsync, false},
+ {L9P_TLOCK, l9p_dispatch_tlock, true},
+ {L9P_TGETLOCK, l9p_dispatch_tgetlock, true},
+ {L9P_TLINK, l9p_dispatch_tlink, false},
+ {L9P_TMKDIR, l9p_dispatch_tmkdir, false},
+ {L9P_TRENAMEAT, l9p_dispatch_trenameat, false},
+ {L9P_TUNLINKAT, l9p_dispatch_tunlinkat, false},
+};
+
+/*
+ * NB: version index 0 is reserved for new connections, and
+ * is a protocol that handles only L9P_TVERSION. Once we get a
+ * valid version, we start a new session using its dispatch table.
+ *
+ * The table is indexed by the connection's lc_version value, and
+ * Tversion matches the client's version string against the name
+ * member, so entry order is significant.
+ */
+static const struct {
+ const char *name; /* version string compared in Tversion */
+ const struct l9p_handler *handlers; /* dispatch table for this version */
+ int n_handlers; /* number of entries in handlers */
+} l9p_versions[] = {
+ { "<none>", l9p_handlers_no_version, N(l9p_handlers_no_version) },
+ { "9P2000", l9p_handlers_base, N(l9p_handlers_base) },
+ { "9P2000.u", l9p_handlers_dotu, N(l9p_handlers_dotu), },
+ { "9P2000.L", l9p_handlers_dotL, N(l9p_handlers_dotL), },
+};
+
+/*
+ * Find the handler registered for this request's type in the
+ * dispatch table of the connection's current protocol version and
+ * run it.
+ *
+ * Returns the handler's result, EINTR if a flush was requested before
+ * the request started and the handler is not marked must-run, or
+ * ENOSYS if the request type has no handler in this version.
+ *
+ * Sending the response is left to the caller.
+ */
+int
+l9p_dispatch_request(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+#if defined(L9P_DEBUG)
+	struct sbuf *sb;
+#endif
+	const struct l9p_handler *table;
+	const struct l9p_handler *match = NULL;
+	size_t count, idx;
+	bool flush_requested;
+
+	flush_requested = req->lr_flushstate == L9P_FLUSH_REQUESTED_PRE_START;
+
+	table = l9p_versions[conn->lc_version].handlers;
+	count = (size_t)l9p_versions[conn->lc_version].n_handlers;
+
+	/* Linear search; match stays NULL when the type is not listed. */
+	for (idx = 0; idx < count; idx++) {
+		if (table[idx].type == req->lr_req.hdr.type) {
+			match = &table[idx];
+			break;
+		}
+	}
+
+#if defined(L9P_DEBUG)
+	sb = sbuf_new_auto();
+	if (flush_requested) {
+		sbuf_cat(sb, "FLUSH requested pre-dispatch");
+		if (match != NULL && match->must_run)
+			sbuf_cat(sb, ", but must run");
+		sbuf_cat(sb, ": ");
+	}
+	l9p_describe_fcall(&req->lr_req, conn->lc_version, sb);
+	sbuf_finish(sb);
+
+	L9P_LOG(L9P_DEBUG, "%s", sbuf_data(sb));
+	sbuf_delete(sb);
+#endif
+
+	if (match == NULL) {
+		L9P_LOG(L9P_WARNING, "unknown request of type %d",
+		    req->lr_req.hdr.type);
+		return (ENOSYS);
+	}
+
+	/* An already-flushed request is only run if it is must-run. */
+	if (flush_requested && !match->must_run)
+		return (EINTR);
+
+	return (match->handler(req));
+}
+
+/*
+ * Map a host errno value to the one reported to 9P2000/9P2000.u
+ * clients.  Values listed in the table are replaced; other values
+ * up to ERANGE pass through unchanged; everything else becomes EIO.
+ */
+static inline int
+e29p(int errnum)
+{
+	static int const table[] = {
+		[ENOTEMPTY] = EPERM,
+		[EDQUOT] = EPERM,
+		[ENOSYS] = EPERM, /* ??? */
+	};
+	int mapped = 0;
+
+	/* Unlisted slots of the sparse table are zero, i.e. "no entry". */
+	if ((size_t)errnum < N(table))
+		mapped = table[errnum];
+	if (mapped != 0)
+		return (mapped);
+
+	return (errnum <= ERANGE ? errnum : EIO); /* ??? */
+}
+
+/*
+ * Translate BSD errno to Linux errno.
+ *
+ * Lookup order: a negative value is returned negated (taken to be a
+ * raw Linux errno); a value with a non-zero table entry is replaced
+ * by that entry; any other value up to ERANGE is returned unchanged;
+ * anything else is logged and reported as LINUX_ENOTRECOVERABLE.
+ */
+static inline int
+e2linux(int errnum)
+{
+ /* Sparse table indexed by host errno; zero means "no entry". */
+ static int const table[] = {
+ [EDEADLK] = LINUX_EDEADLK,
+ [EAGAIN] = LINUX_EAGAIN,
+ [EINPROGRESS] = LINUX_EINPROGRESS,
+ [EALREADY] = LINUX_EALREADY,
+ [ENOTSOCK] = LINUX_ENOTSOCK,
+ [EDESTADDRREQ] = LINUX_EDESTADDRREQ,
+ [EMSGSIZE] = LINUX_EMSGSIZE,
+ [EPROTOTYPE] = LINUX_EPROTOTYPE,
+ [ENOPROTOOPT] = LINUX_ENOPROTOOPT,
+ [EPROTONOSUPPORT] = LINUX_EPROTONOSUPPORT,
+ [ESOCKTNOSUPPORT] = LINUX_ESOCKTNOSUPPORT,
+ [EOPNOTSUPP] = LINUX_EOPNOTSUPP,
+ [EPFNOSUPPORT] = LINUX_EPFNOSUPPORT,
+ [EAFNOSUPPORT] = LINUX_EAFNOSUPPORT,
+ [EADDRINUSE] = LINUX_EADDRINUSE,
+ [EADDRNOTAVAIL] = LINUX_EADDRNOTAVAIL,
+ [ENETDOWN] = LINUX_ENETDOWN,
+ [ENETUNREACH] = LINUX_ENETUNREACH,
+ [ENETRESET] = LINUX_ENETRESET,
+ [ECONNABORTED] = LINUX_ECONNABORTED,
+ [ECONNRESET] = LINUX_ECONNRESET,
+ [ENOBUFS] = LINUX_ENOBUFS,
+ [EISCONN] = LINUX_EISCONN,
+ [ENOTCONN] = LINUX_ENOTCONN,
+ [ESHUTDOWN] = LINUX_ESHUTDOWN,
+ [ETOOMANYREFS] = LINUX_ETOOMANYREFS,
+ [ETIMEDOUT] = LINUX_ETIMEDOUT,
+ [ECONNREFUSED] = LINUX_ECONNREFUSED,
+ [ELOOP] = LINUX_ELOOP,
+ [ENAMETOOLONG] = LINUX_ENAMETOOLONG,
+ [EHOSTDOWN] = LINUX_EHOSTDOWN,
+ [EHOSTUNREACH] = LINUX_EHOSTUNREACH,
+ [ENOTEMPTY] = LINUX_ENOTEMPTY,
+#ifndef __illumos__
+ [EPROCLIM] = LINUX_EAGAIN,
+#endif
+ [EUSERS] = LINUX_EUSERS,
+ [EDQUOT] = LINUX_EDQUOT,
+ [ESTALE] = LINUX_ESTALE,
+ [EREMOTE] = LINUX_EREMOTE,
+ /* EBADRPC = unmappable? */
+ /* ERPCMISMATCH = unmappable? */
+ /* EPROGUNAVAIL = unmappable? */
+ /* EPROGMISMATCH = unmappable? */
+ /* EPROCUNAVAIL = unmappable? */
+ [ENOLCK] = LINUX_ENOLCK,
+ [ENOSYS] = LINUX_ENOSYS,
+ /* EFTYPE = unmappable? */
+ /* EAUTH = unmappable? */
+ /* ENEEDAUTH = unmappable? */
+ [EIDRM] = LINUX_EIDRM,
+ [ENOMSG] = LINUX_ENOMSG,
+ [EOVERFLOW] = LINUX_EOVERFLOW,
+ [ECANCELED] = LINUX_ECANCELED,
+ [EILSEQ] = LINUX_EILSEQ,
+ /* EDOOFUS = unmappable? */
+ [EBADMSG] = LINUX_EBADMSG,
+ [EMULTIHOP] = LINUX_EMULTIHOP,
+ [ENOLINK] = LINUX_ENOLINK,
+ [EPROTO] = LINUX_EPROTO,
+ /* ENOTCAPABLE = unmappable? */
+#ifdef ECAPMODE
+ /* NOTE(review): host EPERM is used here, presumably one of the shared low values. */
+ [ECAPMODE] = EPERM,
+#endif
+#ifdef ENOTRECOVERABLE
+ [ENOTRECOVERABLE] = LINUX_ENOTRECOVERABLE,
+#endif
+#ifdef EOWNERDEAD
+ [EOWNERDEAD] = LINUX_EOWNERDEAD,
+#endif
+ };
+
+ /*
+ * In case we want to return a raw Linux errno, allow negative
+ * values a la Linux kernel internals.
+ *
+ * Values up to ERANGE are shared across systems (see
+ * linux_errno.h), except for EAGAIN.
+ */
+ if (errnum < 0)
+ return (-errnum);
+
+ /* The table is consulted first so that overrides such as EAGAIN win. */
+ if ((size_t)errnum < N(table) && table[errnum] != 0)
+ return (table[errnum]);
+
+ if (errnum <= ERANGE)
+ return (errnum);
+
+ L9P_LOG(L9P_WARNING, "cannot map errno %d to anything reasonable",
+ errnum);
+
+ return (LINUX_ENOTRECOVERABLE); /* ??? */
+}
+
+/*
+ * Send response to request, or possibly just drop request.
+ * We also need to know whether to remove the request from
+ * the tag hash table.
+ *
+ * req:   the request being completed; req->lr_error selects between
+ *        a normal reply and an error reply.  The request and both of
+ *        its fcalls are freed before returning, so the caller must
+ *        not touch req afterwards.
+ * drop:  if true, nothing is packed or sent and the transport's
+ *        lt_drop_response callback is invoked instead.
+ * rmtag: if true, the request's tag is removed from conn->lc_requests.
+ */
+void
+l9p_respond(struct l9p_request *req, bool drop, bool rmtag)
+{
+ struct l9p_connection *conn = req->lr_conn;
+ size_t iosize;
+#if defined(L9P_DEBUG)
+ struct sbuf *sb;
+ const char *ftype;
+#endif
+ int error;
+
+ req->lr_resp.hdr.tag = req->lr_req.hdr.tag;
+
+ error = req->lr_error;
+ /* On success the reply type is the request type plus one. */
+ if (error == 0)
+ req->lr_resp.hdr.type = req->lr_req.hdr.type + 1;
+ else {
+ /* 9P2000.L reports a numeric Linux errno; the others get Rerror. */
+ if (conn->lc_version == L9P_2000L) {
+ req->lr_resp.hdr.type = L9P_RLERROR;
+ req->lr_resp.error.errnum = (uint32_t)e2linux(error);
+ } else {
+ req->lr_resp.hdr.type = L9P_RERROR;
+ req->lr_resp.error.ename = strerror(error);
+ req->lr_resp.error.errnum = (uint32_t)e29p(error);
+ }
+ }
+
+#if defined(L9P_DEBUG)
+ sb = sbuf_new_auto();
+ l9p_describe_fcall(&req->lr_resp, conn->lc_version, sb);
+ sbuf_finish(sb);
+
+ switch (req->lr_flushstate) {
+ case L9P_FLUSH_NONE:
+ default:
+ ftype = "";
+ break;
+ case L9P_FLUSH_REQUESTED_PRE_START:
+ ftype = "FLUSH requested pre-dispatch: ";
+ break;
+ case L9P_FLUSH_REQUESTED_POST_START:
+ ftype = "FLUSH requested while running: ";
+ break;
+ case L9P_FLUSH_TOOLATE:
+ ftype = "FLUSH requested too late: ";
+ break;
+ }
+ L9P_LOG(L9P_DEBUG, "%s%s%s",
+ drop ? "DROP: " : "", ftype, sbuf_data(sb));
+ sbuf_delete(sb);
+#endif
+
+ /* Packing is skipped entirely when the response is being dropped. */
+ error = drop ? 0 :
+ l9p_pufcall(&req->lr_resp_msg, &req->lr_resp, conn->lc_version);
+ if (rmtag)
+ ht_remove(&conn->lc_requests, req->lr_req.hdr.tag);
+ /* A response that cannot be packed is dropped rather than sent. */
+ if (error != 0) {
+ L9P_LOG(L9P_ERROR, "cannot pack response");
+ drop = true;
+ }
+
+ if (drop) {
+ conn->lc_lt.lt_drop_response(req,
+ req->lr_resp_msg.lm_iov, req->lr_resp_msg.lm_niov,
+ conn->lc_lt.lt_aux);
+ } else {
+ iosize = req->lr_resp_msg.lm_size;
+
+ /*
+ * Include I/O size in calculation for Rread and
+ * Rreaddir responses.
+ */
+ if (req->lr_resp.hdr.type == L9P_RREAD ||
+ req->lr_resp.hdr.type == L9P_RREADDIR)
+ iosize += req->lr_resp.io.count;
+
+ conn->lc_lt.lt_send_response(req,
+ req->lr_resp_msg.lm_iov, req->lr_resp_msg.lm_niov,
+ iosize, conn->lc_lt.lt_aux);
+ }
+
+ /* Release whatever the request and response fcalls own, then req. */
+ l9p_freefcall(&req->lr_req);
+ l9p_freefcall(&req->lr_resp);
+
+ free(req);
+}
+
+/*
+ * Prepare a message object for walking the data area of a read or
+ * write request: the request's data iovecs are copied into the
+ * message and the cursor is reset to the start.
+ *
+ * When packing, the data is created as the caller goes; when
+ * unpacking, existing data is scanned.  Readdir entries are written
+ * through this, so the mode is normally L9P_PACK, but L9P_UNPACK is
+ * accepted so that debug code can re-scan the data afterwards.
+ *
+ * The request's data iov must already point at the beginning of the
+ * data buffer (see the l9p_seek_iov call in l9p_dispatch_tread).
+ */
+void
+l9p_init_msg(struct l9p_message *msg, struct l9p_request *req,
+    enum l9p_pack_mode mode)
+{
+	/* Adopt the request's data vectors. */
+	memcpy(msg->lm_iov, req->lr_data_iov,
+	    req->lr_data_niov * sizeof (struct iovec));
+	msg->lm_niov = req->lr_data_niov;
+
+	/* Start from an empty message with the cursor at the origin. */
+	msg->lm_cursor_offset = 0;
+	msg->lm_cursor_iov = 0;
+	msg->lm_mode = mode;
+	msg->lm_size = 0;
+}
+
+/*
+ * Constraint bits accepted by fid_lookup().  Each bit either demands
+ * or rejects one property of the fid being looked up.
+ */
+enum fid_lookup_flags {
+	F_REQUIRE_OPEN	= 1 << 0,	/* fid must be marked OPEN */
+	F_REQUIRE_DIR	= 1 << 1,	/* fid must be marked ISDIR */
+	F_REQUIRE_XATTR	= 1 << 2,	/* fid must be marked XATTR */
+	F_REQUIRE_AUTH	= 1 << 3,	/* fid must be marked AUTH */
+	F_FORBID_OPEN	= 1 << 4,	/* fid must not be marked OPEN */
+	F_FORBID_DIR	= 1 << 5,	/* fid must not be marked ISDIR */
+	F_FORBID_XATTR	= 1 << 6,	/* fid must not be marked XATTR */
+	F_ALLOW_AUTH	= 1 << 7,	/* fid may be marked AUTH */
+};
+
+/*
+ * Look up a fid on a connection.  If the fid is unknown, the
+ * caller-supplied errno "err" is returned: different operations must
+ * report different errors for an invalid fid (EIO, EINVAL, or ENOENT
+ * as qemu does), so the choice is left to the caller.
+ *
+ * "flags" is a mask of fid_lookup_flags constraints: the fid must be
+ * (or must not be) open, a directory, or an xattr.
+ *
+ * Only one op has a fid that *must* be an auth fid.  Most ops forbid
+ * auth fids.  So instead of a FORBID flag there is an ALLOW flag, and
+ * forbidding is the default.
+ *
+ * On success 0 is returned and *afile is set; on any failure *afile
+ * is left untouched.
+ */
+static inline int
+fid_lookup(struct l9p_connection *conn, uint32_t fid, int err, int flags,
+    struct l9p_fid **afile)
+{
+	struct l9p_fid *fp = ht_find(&conn->lc_files, fid);
+
+	if (fp == NULL)
+		return (err);
+
+	/*
+	 * Once requests become multithreaded / async this assert must
+	 * turn into "return EINVAL" or "return err".  A way to mark a
+	 * fid as "in async op" (usable for some purposes only, until
+	 * the op completes or is aborted) may be needed as well.
+	 *
+	 * Until then it is a bug detector.
+	 */
+	assert(l9p_fid_isvalid(fp));
+
+	/*
+	 * This function is expanded inline with a constant "flags",
+	 * so tests for bits that are not set disappear entirely.
+	 */
+	if (((flags & F_REQUIRE_OPEN) && !l9p_fid_isopen(fp)) ||
+	    ((flags & F_FORBID_OPEN) && l9p_fid_isopen(fp)))
+		return (EINVAL);
+
+	if ((flags & F_REQUIRE_DIR) && !l9p_fid_isdir(fp))
+		return (ENOTDIR);
+	if ((flags & F_FORBID_DIR) && l9p_fid_isdir(fp))
+		return (EISDIR);
+
+	if (((flags & F_REQUIRE_XATTR) && !l9p_fid_isxattr(fp)) ||
+	    ((flags & F_FORBID_XATTR) && l9p_fid_isxattr(fp)))
+		return (EINVAL);
+
+	if (!l9p_fid_isauth(fp)) {
+		/* An ordinary fid cannot satisfy a demand for an auth fid. */
+		if (flags & F_REQUIRE_AUTH)
+			return (EINVAL);
+	} else if ((flags & (F_REQUIRE_AUTH | F_ALLOW_AUTH)) == 0) {
+		/* Auth fids are rejected unless required or allowed. */
+		return (EINVAL);
+	}
+
+	*afile = fp;
+	return (0);
+}
+
+/*
+ * Append one variable-size stat object to a message.
+ *
+ * Returns 0 when the whole object was packed, in which case the
+ * response's io count grows by the object's size.  Returns -1 when
+ * the object would exceed the count requested by the client, or when
+ * packing fails.
+ *
+ * A partially packed object leaves the message object useless, so
+ * the caller must pass a private l9p_message.
+ *
+ * The stat object is freed in every case.
+ */
+int
+l9p_pack_stat(struct l9p_message *msg, struct l9p_request *req,
+    struct l9p_stat *st)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	uint16_t size = l9p_sizeof_stat(st, conn->lc_version);
+	bool fits;
+	int ret;
+
+	assert(msg->lm_mode == L9P_PACK);
+
+	/* Packing is only attempted if the object fits the request. */
+	fits = !(req->lr_resp.io.count + size > req->lr_req.io.count);
+	if (fits && l9p_pustat(msg, st, conn->lc_version) >= 0) {
+		req->lr_resp.io.count += size;
+		ret = 0;
+	} else {
+		ret = -1;
+	}
+
+	l9p_freestat(st);
+	return (ret);
+}
+
+/*
+ * Handle Tversion: negotiate the protocol version and the maximum
+ * message size for this connection.
+ *
+ * The version used is the lower of the client's requested version
+ * and the server's configured maximum, and the Rversion reply names
+ * that negotiated version (not merely the string the client sent),
+ * so that both sides agree on the dialect actually in use.
+ *
+ * Returns 0 on success, ENOSYS for an unknown version string,
+ * EINVAL if the negotiated msize is too small to be usable, or
+ * ENOMEM if the reply string cannot be allocated.  Connection state
+ * is only modified on success.
+ */
+static int
+l9p_dispatch_tversion(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_server *server = conn->lc_server;
+	enum l9p_version remote_version = L9P_INVALID_VERSION;
+	enum l9p_version version;
+	uint32_t msize;
+	char *version_name;
+	size_t i;
+
+	for (i = 0; i < N(l9p_versions); i++) {
+		if (strcmp(req->lr_req.version.version,
+		    l9p_versions[i].name) == 0) {
+			remote_version = (enum l9p_version)i;
+			break;
+		}
+	}
+
+	if (remote_version == L9P_INVALID_VERSION) {
+		L9P_LOG(L9P_ERROR, "unsupported remote version: %s",
+		    req->lr_req.version.version);
+		return (ENOSYS);
+	}
+
+	L9P_LOG(L9P_INFO, "remote version: %s",
+	    l9p_versions[remote_version].name);
+	L9P_LOG(L9P_INFO, "local version: %s",
+	    l9p_versions[server->ls_max_version].name);
+
+	version = MIN(remote_version, server->ls_max_version);
+	msize = MIN(req->lr_req.version.msize, conn->lc_msize);
+
+	/*
+	 * 24 bytes are subtracted from msize below to obtain the
+	 * maximum I/O size; a smaller msize would make that unsigned
+	 * subtraction wrap around to a huge value.
+	 */
+	if (msize < 24) {
+		L9P_LOG(L9P_ERROR, "unusable msize: %u", (unsigned int)msize);
+		return (EINVAL);
+	}
+
+	/* Reply with the version we will actually speak. */
+	version_name = strdup(l9p_versions[version].name);
+	if (version_name == NULL)
+		return (ENOMEM);
+
+	conn->lc_version = version;
+	conn->lc_msize = msize;
+	conn->lc_max_io_size = msize - 24;
+	req->lr_resp.version.version = version_name;
+	req->lr_resp.version.msize = msize;
+	return (0);
+}
+
+/*
+ * Handle Tattach: allocate the new fid and hand the request to the
+ * backend's attach method.
+ *
+ * Returns 0 on success, the fid_lookup() error for a bad afid,
+ * EINVAL if the new fid cannot be allocated, or the backend's error.
+ */
+static int
+l9p_dispatch_tattach(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *be;
+	struct l9p_fid *fid;
+	int error;
+
+	/*
+	 * Tauth is not implemented yet, but handle the afid anyway.
+	 * It is resolved first because a failure here lets us return
+	 * before anything has been allocated.
+	 */
+	if (req->lr_req.tattach.afid == L9P_NOFID) {
+		req->lr_fid2 = NULL;
+	} else {
+		error = fid_lookup(conn, req->lr_req.tattach.afid, EINVAL,
+		    F_REQUIRE_AUTH, &req->lr_fid2);
+		if (error != 0)
+			return (error);
+	}
+
+	fid = l9p_connection_alloc_fid(conn, req->lr_req.hdr.fid);
+	if (fid == NULL)
+		return (EINVAL);
+
+	be = conn->lc_server->ls_backend;
+	req->lr_fid = fid;
+
+	/* Plain 9P2000 gets NONUNAME set as a convenience to the backend. */
+	if (conn->lc_version == L9P_2000)
+		req->lr_req.tattach.n_uname = L9P_NONUNAME;
+
+	error = be->attach(be->softc, req);
+	if (error != 0) {
+		/* A failed attach discards the fid again. */
+		l9p_connection_remove_fid(conn, fid);
+		return (error);
+	}
+
+	/* The fid is now live; it really ought to be a directory. */
+	l9p_fid_setvalid(fid);
+	if (req->lr_resp.rattach.qid.type & L9P_QTDIR)
+		l9p_fid_setdir(fid);
+	return (0);
+}
+
+/*
+ * Handle Tclunk: invalidate the fid, let the backend release it and
+ * remove it from the connection.  The fid is removed whether or not
+ * the backend reports an error.
+ */
+static int
+l9p_dispatch_tclunk(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *be;
+	struct l9p_fid *fid;
+	int error;
+
+	/* Clunk is the only way to get rid of an auth fid, so allow it. */
+	error = fid_lookup(conn, req->lr_req.hdr.fid, ENOENT,
+	    F_ALLOW_AUTH, &fid);
+	if (error != 0)
+		return (error);
+
+	l9p_fid_unsetvalid(fid);
+	be = conn->lc_server->ls_backend;
+
+	/*
+	 * An xattr fid implies that an xattrclunk method exists: it
+	 * could only be NULL if xattrwalk and xattrcreate were NULL
+	 * or always failed.
+	 *
+	 * Q: should xattrclunk be allowed to run async for a very
+	 * large xattr create?  That would complicate matters, so
+	 * probably not.
+	 */
+	error = l9p_fid_isxattr(fid) ?
+	    be->xattrclunk(be->softc, fid) :
+	    be->clunk(be->softc, fid);
+
+	/* The fid goes away no matter what the backend returned. */
+	l9p_connection_remove_fid(conn, fid);
+	return (error);
+}
+
+/*
+ * Handle Tcreate: create a new object in the directory named by the
+ * fid.  When a plain file is created, the fid stops being a
+ * directory fid and becomes an open file.
+ */
+static int
+l9p_dispatch_tcreate(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *be;
+	uint32_t dmperm;
+	int error;
+
+	/* The fid has to be a directory that is not open yet. */
+	error = fid_lookup(conn, req->lr_req.hdr.fid, EINVAL,
+	    F_REQUIRE_DIR | F_FORBID_OPEN, &req->lr_fid);
+	if (error != 0)
+		return (error);
+
+	be = conn->lc_server->ls_backend;
+	/* Sample the permission bits before the backend sees the request. */
+	dmperm = req->lr_req.tcreate.perm;
+#define	MKDIR_OR_SIMILAR \
+	(L9P_DMDIR | L9P_DMSYMLINK | L9P_DMNAMEDPIPE | L9P_DMSOCKET | L9P_DMDEVICE)
+
+	/*
+	 * TODO:
+	 *  - check new file name
+	 *  - break out different kinds of create (file vs mkdir etc)
+	 *  - add async file-create (leaves req->lr_fid in limbo)
+	 */
+	error = be->create(be->softc, req);
+	if (error != 0)
+		return (error);
+
+	/* Directories, symlinks, pipes, sockets, devices: fid unchanged. */
+	if ((dmperm & MKDIR_OR_SIMILAR) != 0)
+		return (0);
+
+	/* A successful file-create turns the fid into an open file. */
+	l9p_fid_unsetdir(req->lr_fid);
+	l9p_fid_setopen(req->lr_fid);
+	return (0);
+}
+
+/*
+ * Handle Topen: open the file behind a fid that is neither open
+ * already nor an xattr fid, and mark the fid open on success.
+ */
+static int
+l9p_dispatch_topen(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *be;
+	int error;
+
+	error = fid_lookup(conn, req->lr_req.hdr.fid, ENOENT,
+	    F_FORBID_OPEN | F_FORBID_XATTR, &req->lr_fid);
+	if (error != 0)
+		return (error);
+
+	be = conn->lc_server->ls_backend;
+
+	/*
+	 * TODO:
+	 *  - add async open (leaves req->lr_fid in limbo)
+	 */
+	error = be->open(be->softc, req);
+	if (error != 0)
+		return (error);
+
+	l9p_fid_setopen(req->lr_fid);
+	return (0);
+}
+
+/*
+ * Handle Tread: read from an xattr fid or from an open file.
+ * Returns EINVAL for a fid that is neither.
+ */
+static int
+l9p_dispatch_tread(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *be;
+	struct l9p_fid *fid;
+	int error;
+
+	/*
+	 * No open-state flags here: xattr fids are not open, so the
+	 * open check is done by hand below.
+	 */
+	error = fid_lookup(conn, req->lr_req.hdr.fid, EINVAL, 0, &req->lr_fid);
+	if (error != 0)
+		return (error);
+
+	/*
+	 * Position the data iov so that packed data lands directly
+	 * after the count field of the response:
+	 *
+	 * size[4] + Rread[1] + tag[2] + count[4] = 11
+	 */
+	l9p_seek_iov(req->lr_resp_msg.lm_iov, req->lr_resp_msg.lm_niov,
+	    req->lr_data_iov, &req->lr_data_niov, 11);
+
+	be = conn->lc_server->ls_backend;
+	fid = req->lr_fid;
+
+	/*
+	 * An xattr fid implies that an xattrread method exists: it
+	 * could only be NULL if xattrwalk and xattrcreate were NULL
+	 * or always failed.
+	 *
+	 * TODO:
+	 *   separate out directory-read
+	 *   allow async read
+	 */
+	if (l9p_fid_isxattr(fid))
+		return (be->xattrread(be->softc, req));
+	if (l9p_fid_isopen(fid))
+		return (be->read(be->softc, req));
+
+	return (EINVAL);
+}
+
+/*
+ * Handle Tremove: invalidate the fid, ask the backend to remove the
+ * object, and remove the fid from the connection.  The fid is removed
+ * whether or not the backend reports an error.
+ */
+static int
+l9p_dispatch_tremove(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *be;
+	struct l9p_fid *fid;
+	int result;
+
+	/*
+	 * ?? Should Tremove be allowed on auth fids?  If so, should
+	 * it simply behave like Tclunk?
+	 */
+	result = fid_lookup(conn, req->lr_req.hdr.fid, EINVAL, 0, &fid);
+	if (result != 0)
+		return (result);
+
+	l9p_fid_unsetvalid(fid);
+	be = conn->lc_server->ls_backend;
+	result = be->remove(be->softc, fid);
+
+	/* The fid goes away no matter what the backend returned. */
+	l9p_connection_remove_fid(conn, fid);
+	return (result);
+}
+
+/*
+ * Handle Tstat: ask the backend for the stat data of a fid and
+ * refresh the fid's directory flag from the returned qid.
+ *
+ * Returns 0 on success, ENOENT for an unknown fid, or the backend's
+ * error.
+ */
+static int
+l9p_dispatch_tstat(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *be;
+	struct l9p_fid *fid;
+	int error;
+
+	/* Allow Tstat on auth fid?  Seems harmless enough... */
+	error = fid_lookup(conn, req->lr_req.hdr.fid, ENOENT,
+	    F_ALLOW_AUTH, &fid);
+	if (error)
+		return (error);
+
+	be = conn->lc_server->ls_backend;
+	req->lr_fid = fid;
+	error = be->stat(be->softc, req);
+
+	if (error == 0) {
+		if (l9p_fid_isauth(fid))
+			req->lr_resp.rstat.stat.qid.type |= L9P_QTAUTH;
+
+		/*
+		 * Only *test* the directory bit.  This must be "&" and
+		 * not "&=": a compound assignment here would strip every
+		 * other type bit (including the QTAUTH bit set just
+		 * above) from the qid sent back to the client.
+		 */
+		if (req->lr_resp.rstat.stat.qid.type & L9P_QTDIR)
+			l9p_fid_setdir(fid);
+		else
+			l9p_fid_unsetdir(fid);
+	}
+
+	return (error);
+}
+
+/*
+ * Handle Twalk: walk from fid through the requested name elements
+ * and bind the result to newfid, which may be the same fid.
+ *
+ * Returns 0 on success, ENOENT for an unknown fid, EINVAL if newfid
+ * cannot be allocated, or the backend's error.  On error a freshly
+ * allocated newfid is removed again.
+ */
+static int
+l9p_dispatch_twalk(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *be;
+	struct l9p_fid *fid, *newfid;
+	uint16_t n;
+	bool isdir;
+	int error;
+
+	/* Can forbid XATTR, but cannot require DIR. */
+	error = fid_lookup(conn, req->lr_req.hdr.fid, ENOENT,
+	    F_FORBID_XATTR, &fid);
+	if (error)
+		return (error);
+
+	if (req->lr_req.twalk.hdr.fid != req->lr_req.twalk.newfid) {
+		newfid = l9p_connection_alloc_fid(conn,
+		    req->lr_req.twalk.newfid);
+		if (newfid == NULL)
+			return (EINVAL);
+	} else
+		newfid = fid;
+
+	be = conn->lc_server->ls_backend;
+	req->lr_fid = fid;
+	req->lr_newfid = newfid;
+	error = be->walk(be->softc, req);
+
+	/*
+	 * If newfid == fid, then fid itself has (potentially) changed,
+	 * but is still valid.  Otherwise set newfid valid on
+	 * success, and destroy it on error.
+	 */
+	if (newfid != fid) {
+		if (error == 0)
+			l9p_fid_setvalid(newfid);
+		else
+			l9p_connection_remove_fid(conn, newfid);
+	}
+
+	/*
+	 * If we walked any name elements, the last (n-1'th) qid
+	 * has the type (dir vs file) for the new fid.  Otherwise
+	 * the type of newfid is the same as fid.
+	 *
+	 * The directory flag is cleared as well as set: when
+	 * newfid == fid and the walk ends on a non-directory, the
+	 * fid would otherwise keep a stale ISDIR mark from the
+	 * directory it started at, and later lookups that require
+	 * or forbid a directory would decide wrongly.
+	 */
+	if (error == 0) {
+		n = req->lr_resp.rwalk.nwqid;
+		if (n > 0) {
+			isdir = (req->lr_resp.rwalk.wqid[n - 1].type &
+			    L9P_QTDIR) != 0;
+		} else {
+			isdir = l9p_fid_isdir(fid) ? true : false;
+		}
+
+		if (isdir)
+			l9p_fid_setdir(newfid);
+		else
+			l9p_fid_unsetdir(newfid);
+	}
+	return (error);
+}
+
+/*
+ * Handle Twrite: write to an xattr fid or to an open file.
+ * Returns ENOSYS when the backend lacks the matching write method,
+ * and EINVAL for a fid that is neither an xattr nor open.
+ */
+static int
+l9p_dispatch_twrite(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *be;
+	struct l9p_fid *fid;
+	int error;
+
+	/* Xattr writes rule out requiring OPEN; directories are refused. */
+	error = fid_lookup(conn, req->lr_req.hdr.fid, EINVAL,
+	    F_FORBID_DIR, &req->lr_fid);
+	if (error != 0)
+		return (error);
+
+	/*
+	 * Position the data iov at the payload to be written.  This
+	 * mirrors l9p_dispatch_tread, except that it points into the
+	 * request buffer instead of the response:
+	 *
+	 * size[4] + Twrite[1] + tag[2] + fid[4] + offset[8] + count[4] = 23
+	 */
+	l9p_seek_iov(req->lr_req_msg.lm_iov, req->lr_req_msg.lm_niov,
+	    req->lr_data_iov, &req->lr_data_niov, 23);
+
+	be = conn->lc_server->ls_backend;
+	fid = req->lr_fid;
+
+	/*
+	 * Unlike read, write and xattrwrite are optional methods
+	 * (a read-only file system need not provide them).
+	 *
+	 * TODO:
+	 *   allow async write
+	 */
+	if (l9p_fid_isxattr(fid)) {
+		if (be->xattrwrite == NULL)
+			return (ENOSYS);
+		return (be->xattrwrite(be->softc, req));
+	}
+
+	if (l9p_fid_isopen(fid)) {
+		if (be->write == NULL)
+			return (ENOSYS);
+		return (be->write(be->softc, req));
+	}
+
+	return (EINVAL);
+}
+
+/*
+ * Handle Twstat on a non-xattr fid.  Returns ENOSYS when the backend
+ * has no wstat method.
+ */
+static int
+l9p_dispatch_twstat(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *be;
+	int error;
+
+	error = fid_lookup(conn, req->lr_req.hdr.fid, EINVAL,
+	    F_FORBID_XATTR, &req->lr_fid);
+	if (error != 0)
+		return (error);
+
+	be = conn->lc_server->ls_backend;
+	if (be->wstat == NULL)
+		return (ENOSYS);
+
+	return (be->wstat(be->softc, req));
+}
+
+/*
+ * Handle Tstatfs by passing the request to the backend's statfs
+ * method.  The method is called without a NULL check.
+ */
+static int
+l9p_dispatch_tstatfs(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *be;
+	int error;
+
+	/* Should statfs be allowed on auth fids? */
+	error = fid_lookup(conn, req->lr_req.hdr.fid, EINVAL, 0, &req->lr_fid);
+	if (error != 0)
+		return (error);
+
+	be = conn->lc_server->ls_backend;
+	return (be->statfs(be->softc, req));
+}
+
+/*
+ * Handle Tlopen: open a fid that is neither open already nor an
+ * xattr fid, marking it open on success.  Returns ENOSYS when the
+ * backend has no lopen method.
+ */
+static int
+l9p_dispatch_tlopen(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *be;
+	int error;
+
+	error = fid_lookup(conn, req->lr_req.hdr.fid, ENOENT,
+	    F_FORBID_OPEN | F_FORBID_XATTR, &req->lr_fid);
+	if (error != 0)
+		return (error);
+
+	be = conn->lc_server->ls_backend;
+	if (be->lopen == NULL)
+		return (ENOSYS);
+
+	/*
+	 * TODO:
+	 *  - add async open (leaves req->lr_fid in limbo)
+	 */
+	error = be->lopen(be->softc, req);
+	if (error != 0)
+		return (error);
+
+	l9p_fid_setopen(req->lr_fid);
+	return (0);
+}
+
+/*
+ * Handle Tlcreate: create a file in the (unopened) directory named
+ * by the fid.  On success the fid loses its directory mark and is
+ * marked open.  Returns ENOSYS when the backend has no lcreate
+ * method.
+ */
+static int
+l9p_dispatch_tlcreate(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *be;
+	int error;
+
+	error = fid_lookup(conn, req->lr_req.hdr.fid, ENOENT,
+	    F_REQUIRE_DIR | F_FORBID_OPEN, &req->lr_fid);
+	if (error != 0)
+		return (error);
+
+	be = conn->lc_server->ls_backend;
+	if (be->lcreate == NULL)
+		return (ENOSYS);
+
+	/*
+	 * TODO:
+	 *  - check new file name
+	 *  - add async create (leaves req->lr_fid in limbo)
+	 */
+	error = be->lcreate(be->softc, req);
+	if (error != 0)
+		return (error);
+
+	l9p_fid_unsetdir(req->lr_fid);
+	l9p_fid_setopen(req->lr_fid);
+	return (0);
+}
+
+/*
+ * Handle Tsymlink: create a symbolic link in the directory named by
+ * the fid.  Returns ENOSYS when the backend has no symlink method.
+ */
+static int
+l9p_dispatch_tsymlink(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *be;
+	int error;
+
+	/* The containing dir is not affected; maybe allow OPEN? */
+	error = fid_lookup(conn, req->lr_req.hdr.fid, ENOENT,
+	    F_REQUIRE_DIR | F_FORBID_OPEN, &req->lr_fid);
+	if (error != 0)
+		return (error);
+
+	be = conn->lc_server->ls_backend;
+	if (be->symlink == NULL)
+		return (ENOSYS);
+
+	/*
+	 * TODO:
+	 *  - check new file name
+	 */
+	return (be->symlink(be->softc, req));
+}
+
+/*
+ * Handle Tmknod: create a node in the directory named by the fid.
+ * Returns ENOSYS when the backend has no mknod method.
+ */
+static int
+l9p_dispatch_tmknod(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *be;
+	int error;
+
+	/* The containing dir is not affected; maybe allow OPEN? */
+	error = fid_lookup(conn, req->lr_req.hdr.fid, ENOENT,
+	    F_REQUIRE_DIR | F_FORBID_OPEN, &req->lr_fid);
+	if (error != 0)
+		return (error);
+
+	be = conn->lc_server->ls_backend;
+	if (be->mknod == NULL)
+		return (ENOSYS);
+
+	/*
+	 * TODO:
+	 *  - check new file name
+	 */
+	return (be->mknod(be->softc, req));
+}
+
+/*
+ * Handle Trename: rename the object behind fid into the directory
+ * behind dfid.  Returns ENOSYS when the backend has no rename
+ * method.
+ */
+static int
+l9p_dispatch_trename(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *be;
+	int error;
+
+	/* Source: any directory or file (symlinks etc. included). */
+	error = fid_lookup(conn, req->lr_req.hdr.fid, ENOENT,
+	    F_FORBID_XATTR, &req->lr_fid);
+	if (error != 0)
+		return (error);
+
+	/* Destination dir fid is not affected; maybe allow OPEN? */
+	error = fid_lookup(conn, req->lr_req.trename.dfid, ENOENT,
+	    F_REQUIRE_DIR | F_FORBID_OPEN, &req->lr_fid2);
+	if (error != 0)
+		return (error);
+
+	be = conn->lc_server->ls_backend;
+	if (be->rename == NULL)
+		return (ENOSYS);
+
+	/*
+	 * TODO:
+	 *  - check new file name (trename.name)
+	 */
+	return (be->rename(be->softc, req));
+}
+
+/*
+ * Dispatch a Treadlink request to the backend.
+ *
+ * Returns ENOENT-style lookup errors from fid_lookup(), ENOSYS when the
+ * backend provides no readlink operation, otherwise the backend's result.
+ */
+static int
+l9p_dispatch_treadlink(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *backend;
+	int rc;
+
+	/*
+	 * Only a symlink can be read this way, and the back end still
+	 * has to verify that.  Rejecting directory fids and open fids
+	 * up front costs nothing, so do it here.
+	 */
+	rc = fid_lookup(conn, req->lr_req.hdr.fid, ENOENT,
+	    F_FORBID_DIR | F_FORBID_OPEN, &req->lr_fid);
+	if (rc != 0)
+		return (rc);
+
+	backend = conn->lc_server->ls_backend;
+	if (backend->readlink == NULL)
+		return (ENOSYS);
+	return (backend->readlink(backend->softc, req));
+}
+
+/*
+ * Dispatch a Tgetattr request to the backend.
+ *
+ * The fid is looked up with F_FORBID_XATTR and stored in req->lr_fid.
+ * Returns the lookup error, ENOSYS when the backend provides no getattr
+ * operation, or the backend's own result.
+ */
+static int
+l9p_dispatch_tgetattr(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *backend;
+	int rc;
+
+	rc = fid_lookup(conn, req->lr_req.hdr.fid, ENOENT,
+	    F_FORBID_XATTR, &req->lr_fid);
+	if (rc != 0)
+		return (rc);
+
+	backend = conn->lc_server->ls_backend;
+	if (backend->getattr == NULL)
+		return (ENOSYS);
+	return (backend->getattr(backend->softc, req));
+}
+
+/*
+ * Dispatch a Tsetattr request to the backend.
+ *
+ * The fid is looked up with F_FORBID_XATTR and stored in req->lr_fid.
+ * Returns the lookup error, ENOSYS when the backend provides no setattr
+ * operation, or the backend's own result.
+ */
+static int
+l9p_dispatch_tsetattr(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *backend;
+	int rc;
+
+	rc = fid_lookup(conn, req->lr_req.hdr.fid, ENOENT,
+	    F_FORBID_XATTR, &req->lr_fid);
+	if (rc != 0)
+		return (rc);
+
+	backend = conn->lc_server->ls_backend;
+	if (backend->setattr == NULL)
+		return (ENOSYS);
+	return (backend->setattr(backend->softc, req));
+}
+
+/*
+ * Dispatch a Txattrwalk request to the backend.
+ *
+ * Resolves the source fid, allocates the new fid named in the request,
+ * and hands both to the backend via req->lr_fid / req->lr_newfid.  On
+ * success the new fid is marked valid and xattr; on any failure after
+ * allocation it is removed from the connection again.
+ *
+ * Returns the lookup error (ENOENT), EINVAL if the new fid cannot be
+ * allocated, ENOSYS if the backend provides no xattrwalk operation, or
+ * the backend's own result.
+ */
+static int
+l9p_dispatch_txattrwalk(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *backend;
+	struct l9p_fid *srcfid, *xfid;
+	int rc;
+
+	/*
+	 * Whether the file-or-dir is open probably does not matter.
+	 * Unlike Twalk, though, the fid argument is always a file or
+	 * dir, and newfid must be supplied, must be different, and
+	 * always becomes a new xattr fid.
+	 */
+	rc = fid_lookup(conn, req->lr_req.hdr.fid, ENOENT,
+	    F_FORBID_XATTR, &srcfid);
+	if (rc != 0)
+		return (rc);
+
+	xfid = l9p_connection_alloc_fid(conn, req->lr_req.txattrwalk.newfid);
+	if (xfid == NULL)
+		return (EINVAL);
+
+	backend = conn->lc_server->ls_backend;
+	req->lr_fid = srcfid;
+	req->lr_newfid = xfid;
+
+	if (backend->xattrwalk != NULL)
+		rc = backend->xattrwalk(backend->softc, req);
+	else
+		rc = ENOSYS;
+
+	/*
+	 * Much simpler than Twalk since xfid is always brand new:
+	 * discard it on failure, otherwise tag it as a valid xattr fid.
+	 */
+	if (rc != 0) {
+		l9p_connection_remove_fid(conn, xfid);
+		return (rc);
+	}
+	l9p_fid_setvalid(xfid);
+	l9p_fid_setxattr(xfid);
+	return (0);
+}
+
+/*
+ * Dispatch a Txattrcreate request to the backend.
+ *
+ * On success the request's fid stops being a regular (file or dir) fid
+ * and becomes an xattr fid.  Returns EINVAL for a bad fid, ENOSYS when
+ * the backend provides no xattrcreate operation, or the backend's result.
+ */
+static int
+l9p_dispatch_txattrcreate(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *backend;
+	struct l9p_fid *target;
+	int rc;
+
+	/*
+	 * An open incoming fid is refused because the fid is about to
+	 * turn into an xattr fid.  Should that ever need to be allowed,
+	 * the fs code will have to cope with it.
+	 *
+	 * Curiously, qemu 9pfs uses ENOENT for a bad txattrwalk
+	 * fid, but EINVAL for txattrcreate (so we do too).
+	 */
+	rc = fid_lookup(conn, req->lr_req.hdr.fid, EINVAL,
+	    F_FORBID_XATTR | F_FORBID_OPEN, &target);
+	if (rc != 0)
+		return (rc);
+
+	backend = conn->lc_server->ls_backend;
+	req->lr_fid = target;
+
+	if (backend->xattrcreate != NULL)
+		rc = backend->xattrcreate(backend->softc, req);
+	else
+		rc = ENOSYS;
+	if (rc != 0)
+		return (rc);
+
+	/* The fid has changed from a regular fid into an xattr fid. */
+	l9p_fid_unsetdir(target);
+	l9p_fid_setxattr(target);
+	return (0);
+}
+
+/*
+ * Dispatch a Treaddir request to the backend.
+ *
+ * The fid is looked up with F_REQUIRE_DIR | F_REQUIRE_OPEN.  Before the
+ * backend runs, the request's data iov is positioned just past the fixed
+ * part of the response so the backend can pack entries directly.
+ * Returns the lookup error, ENOSYS when the backend provides no readdir
+ * operation, or the backend's own result.
+ */
+static int
+l9p_dispatch_treaddir(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *backend;
+	int rc;
+
+	rc = fid_lookup(conn, req->lr_req.hdr.fid, ENOENT,
+	    F_REQUIRE_DIR | F_REQUIRE_OPEN, &req->lr_fid);
+	if (rc != 0)
+		return (rc);
+
+	/*
+	 * Skip the response header so that packed data lands right
+	 * after the count field:
+	 *
+	 * size[4] + Rreaddir[1] + tag[2] + count[4] = 11
+	 */
+	l9p_seek_iov(req->lr_resp_msg.lm_iov, req->lr_resp_msg.lm_niov,
+	    req->lr_data_iov, &req->lr_data_niov, 11);
+
+	backend = conn->lc_server->ls_backend;
+	if (backend->readdir == NULL)
+		return (ENOSYS);
+	return (backend->readdir(backend->softc, req));
+}
+
+/*
+ * Dispatch a Tfsync request to the backend.
+ *
+ * The fid is looked up with F_REQUIRE_OPEN and stored in req->lr_fid.
+ * Returns the lookup error, ENOSYS when the backend provides no fsync
+ * operation, or the backend's own result.
+ */
+static int
+l9p_dispatch_tfsync(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *backend;
+	int rc;
+
+	rc = fid_lookup(conn, req->lr_req.hdr.fid, ENOENT,
+	    F_REQUIRE_OPEN, &req->lr_fid);
+	if (rc != 0)
+		return (rc);
+
+	backend = conn->lc_server->ls_backend;
+	if (backend->fsync == NULL)
+		return (ENOSYS);
+	return (backend->fsync(backend->softc, req));
+}
+
+/*
+ * Dispatch a Tlock request to the backend.
+ *
+ * The fid is looked up with F_REQUIRE_OPEN and stored in req->lr_fid.
+ * Returns the lookup error, ENOSYS when the backend provides no lock
+ * operation, or the backend's own result.
+ */
+static int
+l9p_dispatch_tlock(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *backend;
+	int rc;
+
+	/* Forbid directories? */
+	rc = fid_lookup(conn, req->lr_req.hdr.fid, ENOENT,
+	    F_REQUIRE_OPEN, &req->lr_fid);
+	if (rc != 0)
+		return (rc);
+
+	/* TODO: multiple client handling; perhaps async locking. */
+	backend = conn->lc_server->ls_backend;
+	if (backend->lock == NULL)
+		return (ENOSYS);
+	return (backend->lock(backend->softc, req));
+}
+
+/*
+ * Dispatch a Tgetlock request to the backend.
+ *
+ * The fid is looked up with F_REQUIRE_OPEN and stored in req->lr_fid.
+ * Returns the lookup error, ENOSYS when the backend provides no getlock
+ * operation, or the backend's own result.
+ */
+static int
+l9p_dispatch_tgetlock(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *backend;
+	int rc;
+
+	rc = fid_lookup(conn, req->lr_req.hdr.fid, ENOENT,
+	    F_REQUIRE_OPEN, &req->lr_fid);
+	if (rc != 0)
+		return (rc);
+
+	/* TODO: multiple client handling; perhaps async locking. */
+	backend = conn->lc_server->ls_backend;
+	if (backend->getlock == NULL)
+		return (ENOSYS);
+	return (backend->getlock(backend->softc, req));
+}
+
+/*
+ * Dispatch a Tlink request to the backend.
+ *
+ * The directory fid (tlink.dfid) is resolved first into req->lr_fid2,
+ * then the request's own fid into req->lr_fid.  Returns the lookup error
+ * (ENOENT), ENOSYS when the backend provides no link operation, or the
+ * backend's own result.
+ */
+static int
+l9p_dispatch_tlink(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *backend;
+	int rc;
+
+	/*
+	 * In the current scheme dfid is kept in fid2.
+	 *
+	 * Allow open dir? Target dir fid is not modified...
+	 */
+	rc = fid_lookup(conn, req->lr_req.tlink.dfid, ENOENT,
+	    F_REQUIRE_DIR | F_FORBID_OPEN, &req->lr_fid2);
+	if (rc != 0)
+		return (rc);
+
+	rc = fid_lookup(conn, req->lr_req.hdr.fid, ENOENT,
+	    F_FORBID_DIR | F_FORBID_XATTR, &req->lr_fid);
+	if (rc != 0)
+		return (rc);
+
+	backend = conn->lc_server->ls_backend;
+	if (backend->link == NULL)
+		return (ENOSYS);
+	return (backend->link(backend->softc, req));
+}
+
+/*
+ * Dispatch a Tmkdir request to the backend.
+ *
+ * The fid is looked up with F_REQUIRE_DIR | F_FORBID_OPEN and stored in
+ * req->lr_fid.  A name containing '/' is rejected with EINVAL before the
+ * backend is consulted.  Returns the lookup error (ENOENT), EINVAL for a
+ * bad name, ENOSYS when the backend provides no mkdir operation, or the
+ * backend's own result.
+ */
+static int
+l9p_dispatch_tmkdir(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *be;
+	int error;
+
+	error = fid_lookup(conn, req->lr_req.hdr.fid, ENOENT,
+	    F_REQUIRE_DIR | F_FORBID_OPEN, &req->lr_fid);
+	if (error)
+		return (error);
+
+	/*
+	 * Slashes embedded in the name are not allowed.  Read the name
+	 * through the Tmkdir view of the request union, not the Tlcreate
+	 * one, so the check does not depend on the two structures
+	 * happening to share a layout.
+	 */
+	if (strchr(req->lr_req.tmkdir.name, '/') != NULL)
+		return (EINVAL);
+
+	be = conn->lc_server->ls_backend;
+	error = be->mkdir != NULL ? be->mkdir(be->softc, req) : ENOSYS;
+	return (error);
+}
+
+/*
+ * Dispatch a Trenameat request to the backend.
+ *
+ * Both the request's fid (into req->lr_fid) and trenameat.newdirfid
+ * (into req->lr_fid2) are looked up with F_REQUIRE_DIR | F_FORBID_OPEN.
+ * Returns the lookup error (ENOENT), ENOSYS when the backend provides no
+ * renameat operation, or the backend's own result.
+ */
+static int
+l9p_dispatch_trenameat(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *backend;
+	int rc;
+
+	rc = fid_lookup(conn, req->lr_req.hdr.fid, ENOENT,
+	    F_REQUIRE_DIR | F_FORBID_OPEN, &req->lr_fid);
+	if (rc != 0)
+		return (rc);
+
+	rc = fid_lookup(conn, req->lr_req.trenameat.newdirfid, ENOENT,
+	    F_REQUIRE_DIR | F_FORBID_OPEN, &req->lr_fid2);
+	if (rc != 0)
+		return (rc);
+
+	/* TODO: check old and new names */
+	backend = conn->lc_server->ls_backend;
+	if (backend->renameat == NULL)
+		return (ENOSYS);
+	return (backend->renameat(backend->softc, req));
+}
+
+/*
+ * Dispatch a Tunlinkat request to the backend.
+ *
+ * The fid is looked up with F_REQUIRE_DIR | F_FORBID_OPEN and stored in
+ * req->lr_fid.  Returns the lookup error (ENOENT), ENOSYS when the
+ * backend provides no unlinkat operation, or the backend's own result.
+ */
+static int
+l9p_dispatch_tunlinkat(struct l9p_request *req)
+{
+	struct l9p_connection *conn = req->lr_conn;
+	struct l9p_backend *backend;
+	int rc;
+
+	rc = fid_lookup(conn, req->lr_req.hdr.fid, ENOENT,
+	    F_REQUIRE_DIR | F_FORBID_OPEN, &req->lr_fid);
+	if (rc != 0)
+		return (rc);
+
+	/* TODO: check dir-or-file name */
+	backend = conn->lc_server->ls_backend;
+	if (backend->unlinkat == NULL)
+		return (ENOSYS);
+	return (backend->unlinkat(backend->softc, req));
+}
diff --git a/usr/src/lib/lib9p/common/rfuncs.c b/usr/src/lib/lib9p/common/rfuncs.c
new file mode 100644
index 0000000000..f80e8c1541
--- /dev/null
+++ b/usr/src/lib/lib9p/common/rfuncs.c
@@ -0,0 +1,320 @@
+/*
+ * Copyright 2016 Chris Torek <chris.torek@gmail.com>
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#if defined(WITH_CASPER)
+#include <libcasper.h>
+#include <casper/cap_pwd.h>
+#include <casper/cap_grp.h>
+#endif
+
+#include "rfuncs.h"
+
+/*
+ * This is essentially a clone of the BSD basename_r function,
+ * which is like POSIX basename() but puts the result in a user
+ * supplied buffer.
+ *
+ * In BSD basename_r, the buffer must be at least MAXPATHLEN bytes
+ * long. In our case we take the size of the buffer as an argument.
+ *
+ * Note that it's impossible in general to do this without
+ * a temporary buffer since basename("foo/bar") is "bar",
+ * but basename("foo/bar/") is still "bar" -- no trailing
+ * slash is allowed.
+ *
+ * The return value is your supplied buffer <buf>, or NULL if
+ * the length of the basename of the supplied <path> equals or
+ * exceeds your indicated <bufsize>.
+ *
+ * As a special but useful case, if you supply NULL for the <buf>
+ * argument, we allocate the buffer dynamically to match the
+ * basename, i.e., the result is basically strdup()ed for you.
+ * In this case <bufsize> is ignored (recommended: pass 0 here).
+ */
+char *
+r_basename(const char *path, char *buf, size_t bufsize)
+{
+	const char *first, *last;
+	size_t n;
+
+	if (path == NULL || path[0] == '\0') {
+		/*
+		 * Treat NULL and "" as ".", the same forgiving behavior
+		 * as libc basename_r(); it also keeps the pointer
+		 * arithmetic below from running on an empty string.
+		 */
+		first = ".";
+		n = 1;
+	} else {
+		/*
+		 * Step backwards past trailing slashes, never moving
+		 * before the start of the string.  Afterwards either
+		 * *last is not a slash, or last == path and the whole
+		 * path consisted of slashes ("/", "//", ...).
+		 */
+		last = path + strlen(path) - 1;
+		while (last > path && *last == '/')
+			last--;
+
+		if (*last == '/') {
+			/* nothing but slashes: the basename is "/" */
+			first = "/";
+			n = 1;
+		} else {
+			/*
+			 * last points at the final byte of a non-empty
+			 * component.  Walk first back until it sits at
+			 * the start of the string or just after a '/';
+			 * the basename is the inclusive range
+			 * [first, last].
+			 */
+			for (first = last;
+			    first > path && first[-1] != '/'; first--)
+				;
+			n = (size_t)(last - first + 1);
+		}
+	}
+
+	if (buf != NULL) {
+		/* caller's buffer needs room for n bytes plus the NUL */
+		if (n >= bufsize) {
+			errno = ENAMETOOLONG;
+			return (NULL);
+		}
+	} else {
+		/* no buffer supplied: allocate one that fits exactly */
+		buf = malloc(n + 1);
+		if (buf == NULL)
+			return (NULL);
+	}
+	memcpy(buf, first, n);
+	buf[n] = '\0';
+	return (buf);
+}
+
+/*
+ * This is much like POSIX dirname(), but is reentrant.
+ *
+ * We examine a path, find the directory portion, and copy that
+ * to a user supplied buffer <buf> of the given size <bufsize>.
+ *
+ * Note that dirname("/foo/bar/") is "/foo", dirname("/foo") is "/",
+ * and dirname("////") is "/". However, dirname("////foo/bar") is
+ * "////foo" (we do not resolve these leading slashes away -- this
+ * matches the BSD libc behavior).
+ *
+ * The return value is your supplied buffer <buf>, or NULL if
+ * the length of the dirname of the supplied <path> equals or
+ * exceeds your indicated <bufsize>.
+ *
+ * As a special but useful case, if you supply NULL for the <buf>
+ * argument, we allocate the buffer dynamically to match the
+ * dirname, i.e., the result is basically strdup()ed for you.
+ * In this case <bufsize> is ignored (recommended: pass 0 here).
+ */
+char *
+r_dirname(const char *path, char *buf, size_t bufsize)
+{
+	const char *endp, *dirpart;
+	size_t len;
+
+	/*
+	 * NULL or empty path means ".".  This is perhaps overly
+	 * forgiving but matches libc dirname(), and avoids breaking
+	 * the code below.
+	 */
+	if (path == NULL || *path == '\0') {
+		dirpart = ".";
+		len = 1;
+	} else {
+		/*
+		 * Back up over any trailing slashes, then back up over
+		 * the final path name component.  In both loops, stop
+		 * as soon as endp==path so that we do not back out of
+		 * the buffer entirely.
+		 */
+		endp = path + strlen(path) - 1;
+		while (endp > path && *endp == '/')
+			endp--;
+		while (endp > path && *endp != '/')
+			endp--;
+
+		if (*endp != '/') {
+			/*
+			 * We ran into the start of the string without
+			 * meeting a slash, so the path is a bare name
+			 * ("foo", "foo/", ...) and its directory is ".".
+			 *
+			 * This must be decided here, before skipping the
+			 * separating slashes: for a path like "a/b" that
+			 * skip also ends with endp==path on a non-slash,
+			 * yet the directory is "a", not ".".
+			 */
+			dirpart = ".";
+			len = 1;
+		} else {
+			/*
+			 * endp is on a slash: either the separator in
+			 * front of the last component, or the leading
+			 * slash of "/foo" or "////".  Back up over the
+			 * run of slashes so that endp is the last byte
+			 * of the directory part; if the run reaches the
+			 * start of the string the directory is "/".
+			 */
+			while (endp > path && *endp == '/')
+				endp--;
+			dirpart = path;
+			len = (size_t)(endp - path + 1);
+		}
+	}
+	if (buf == NULL) {
+		/* no buffer supplied: allocate one that fits exactly */
+		buf = malloc(len + 1);
+		if (buf == NULL)
+			return (NULL);
+	} else {
+		/* caller's buffer needs room for len bytes plus the NUL */
+		if (len >= bufsize) {
+			errno = ENAMETOOLONG;
+			return (NULL);
+		}
+	}
+	memcpy(buf, dirpart, len);
+	buf[len] = '\0';
+	return (buf);
+}
+
+/*
+ * Put an r_pgdata into its starting state: no buffer yet, and a
+ * recorded size of 512 bytes.
+ */
+static void
+r_pginit(struct r_pgdata *pg)
+{
+	/* realloc() of a NULL pointer behaves like malloc() */
+	pg->r_pgbuf = NULL;
+	/* start at half size, since the first step taken is a doubling */
+	pg->r_pgbufsize = 1 << 9;
+}
+
+/*
+ * Double the scratch buffer held in <pg>.
+ *
+ * Returns 0 on success, with both r_pgbuf and r_pgbufsize describing
+ * the new, larger buffer.  Returns ENOMEM if the doubled size would
+ * reach 1 MiB or if the allocation fails; after an allocation failure
+ * r_pgbuf is NULL (reallocf() releases the old buffer).
+ */
+static int
+r_pgexpand(struct r_pgdata *pg)
+{
+	size_t nsize;
+
+	nsize = pg->r_pgbufsize << 1;
+	if (nsize >= (1 << 20))
+		return (ENOMEM);
+
+	pg->r_pgbuf = reallocf(pg->r_pgbuf, nsize);
+	if (pg->r_pgbuf == NULL)
+		return (ENOMEM);
+
+	/*
+	 * Record the new size.  Callers hand r_pgbufsize to the get*_r
+	 * functions and retry on ERANGE; without this update every retry
+	 * would offer the same size again and the loop would never end.
+	 */
+	pg->r_pgbufsize = nsize;
+	return (0);
+}
+
+/*
+ * Release the scratch buffer owned by <pg>.  The r_pgdata structure
+ * itself is not freed.
+ */
+void
+r_pgfree(struct r_pgdata *pg)
+{
+	free(pg->r_pgbuf);
+}
+
+/*
+ * Look up the passwd entry for <uid> using getpwuid_r(), growing the
+ * scratch buffer in <pg> and retrying for as long as the lookup reports
+ * ERANGE.
+ *
+ * <pg> is initialized here.  Returns the entry found, or NULL if the
+ * lookup or the buffer expansion fails or no entry exists.
+ */
+struct passwd *
+r_getpwuid(uid_t uid, struct r_pgdata *pg)
+{
+	struct passwd *found = NULL;
+	int rc;
+
+	r_pginit(pg);
+	for (;;) {
+		rc = r_pgexpand(pg);
+		if (rc == 0) {
+			rc = getpwuid_r(uid, &pg->r_pgun.un_pw,
+			    pg->r_pgbuf, pg->r_pgbufsize, &found);
+		}
+		if (rc != ERANGE)
+			break;
+	}
+
+	if (rc != 0)
+		return (NULL);
+	return (found);
+}
+
+/*
+ * Look up the group entry for <gid> using getgrgid_r(), growing the
+ * scratch buffer in <pg> and retrying for as long as the lookup reports
+ * ERANGE.
+ *
+ * <pg> is initialized here.  Returns the entry found, or NULL if the
+ * lookup or the buffer expansion fails or no entry exists.
+ */
+struct group *
+r_getgrgid(gid_t gid, struct r_pgdata *pg)
+{
+	struct group *found = NULL;
+	int rc;
+
+	r_pginit(pg);
+	for (;;) {
+		rc = r_pgexpand(pg);
+		if (rc == 0) {
+			rc = getgrgid_r(gid, &pg->r_pgun.un_gr,
+			    pg->r_pgbuf, pg->r_pgbufsize, &found);
+		}
+		if (rc != ERANGE)
+			break;
+	}
+
+	if (rc != 0)
+		return (NULL);
+	return (found);
+}
+
+#if defined(WITH_CASPER)
+/*
+ * Casper variant of r_getpwuid(): the lookup goes through
+ * cap_getpwuid_r() on channel <cap>, with the same grow-and-retry
+ * handling of ERANGE.  Returns the entry found, or NULL on failure.
+ */
+struct passwd *
+r_cap_getpwuid(cap_channel_t *cap, uid_t uid, struct r_pgdata *pg)
+{
+	struct passwd *found = NULL;
+	int rc;
+
+	r_pginit(pg);
+	for (;;) {
+		rc = r_pgexpand(pg);
+		if (rc == 0) {
+			rc = cap_getpwuid_r(cap, uid, &pg->r_pgun.un_pw,
+			    pg->r_pgbuf, pg->r_pgbufsize, &found);
+		}
+		if (rc != ERANGE)
+			break;
+	}
+
+	if (rc != 0)
+		return (NULL);
+	return (found);
+}
+
+/*
+ * Casper variant of r_getgrgid(): the lookup goes through
+ * cap_getgrgid_r() on channel <cap>, with the same grow-and-retry
+ * handling of ERANGE.  Returns the entry found, or NULL on failure.
+ */
+struct group *
+r_cap_getgrgid(cap_channel_t *cap, gid_t gid, struct r_pgdata *pg)
+{
+	struct group *found = NULL;
+	int rc;
+
+	r_pginit(pg);
+	for (;;) {
+		rc = r_pgexpand(pg);
+		if (rc == 0) {
+			rc = cap_getgrgid_r(cap, gid, &pg->r_pgun.un_gr,
+			    pg->r_pgbuf, pg->r_pgbufsize, &found);
+		}
+		if (rc != ERANGE)
+			break;
+	}
+
+	if (rc != 0)
+		return (NULL);
+	return (found);
+}
+#endif
diff --git a/usr/src/lib/lib9p/common/rfuncs.h b/usr/src/lib/lib9p/common/rfuncs.h
new file mode 100644
index 0000000000..5946f2e2b7
--- /dev/null
+++ b/usr/src/lib/lib9p/common/rfuncs.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2016 Chris Torek <chris.torek@gmail.com>
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef LIB9P_RFUNCS_H
+#define LIB9P_RFUNCS_H
+
+#if defined(__illumos__) && !defined(_POSIX_PTHREAD_SEMANTICS)
+#define _POSIX_PTHREAD_SEMANTICS 1
+#endif
+
+#include <grp.h>
+#include <pwd.h>
+#include <string.h>
+
+#if defined(WITH_CASPER)
+#include <libcasper.h>
+#endif
+
+/*
+ * Reentrant, optionally-malloc-ing versions of
+ * basename() and dirname().
+ */
+char *r_basename(const char *, char *, size_t);
+char *r_dirname(const char *, char *, size_t);
+
+/*
+ * Yuck: getpwuid, getgrgid are not thread-safe, and the
+ * POSIX replacements (getpwuid_r, getgrgid_r) are horrible.
+ * This is to allow us to loop over the get.*_r calls with ever
+ * increasing buffers until they succeed or get unreasonable
+ * (same idea as the libc code for the non-reentrant versions,
+ * although prettier).
+ *
+ * The getpwuid/getgrgid functions auto-init one of these,
+ * but the caller must call r_pgfree() when done with the
+ * return values.
+ *
+ * If we need more later, we may have to expose the init function.
+ */
+/*
+ * Working storage for one r_getpwuid()/r_getgrgid() style lookup.
+ */
+struct r_pgdata {
+ /* scratch buffer passed to the get*_r call; released by r_pgfree() */
+ char *r_pgbuf;
+ /* buffer size passed to the get*_r call along with r_pgbuf */
+ size_t r_pgbufsize;
+ /* result structure filled in by the get*_r call (one or the other) */
+ union {
+ struct passwd un_pw;
+ struct group un_gr;
+ } r_pgun;
+};
+
+/* void r_pginit(struct r_pgdata *); */
+void r_pgfree(struct r_pgdata *);
+struct passwd *r_getpwuid(uid_t, struct r_pgdata *);
+struct group *r_getgrgid(gid_t, struct r_pgdata *);
+
+#if defined(WITH_CASPER)
+struct passwd *r_cap_getpwuid(cap_channel_t *, uid_t, struct r_pgdata *);
+struct group *r_cap_getgrgid(cap_channel_t *, gid_t, struct r_pgdata *);
+#endif
+
+#endif /* LIB9P_RFUNCS_H */
diff --git a/usr/src/lib/lib9p/common/sbuf/sbuf.c b/usr/src/lib/lib9p/common/sbuf/sbuf.c
new file mode 100644
index 0000000000..55e0f88650
--- /dev/null
+++ b/usr/src/lib/lib9p/common/sbuf/sbuf.c
@@ -0,0 +1,65 @@
+/*
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * Minimal libsbuf wrapper around libcustr for illumos.
+ */
+
+#include <stdlib.h>
+#include "sbuf.h"
+
+/*
+ * Allocate a new, empty sbuf backed by a freshly allocated custr.
+ *
+ * Returns the new sbuf, or NULL if either allocation fails (nothing is
+ * leaked in that case).  Release the result with sbuf_delete().
+ *
+ * The parameter list is spelled (void) so that this definition is a
+ * proper prototype matching the declaration in sbuf.h.
+ */
+struct sbuf *
+sbuf_new_auto(void)
+{
+	struct sbuf *s;
+
+	s = malloc(sizeof (*s));
+	if (s == NULL)
+		return (NULL);
+	if (custr_alloc(&s->s_custr) != 0) {
+		/* the custr could not be created; undo our own allocation */
+		free(s);
+		return (NULL);
+	}
+	return (s);
+}
+
+/*
+ * Append printf-style formatted text to <s>.  The return value is
+ * whatever custr_append_vprintf() returns.
+ */
+int
+sbuf_printf(struct sbuf *s, const char *fmt, ...)
+{
+	va_list args;
+	int rc;
+
+	va_start(args, fmt);
+	rc = custr_append_vprintf(s->s_custr, fmt, args);
+	va_end(args);
+	return (rc);
+}
+
+/*
+ * Destroy an sbuf: free the underlying custr first, then the sbuf
+ * wrapper itself.
+ */
+void
+sbuf_delete(struct sbuf *s)
+{
+	custr_t *str = s->s_custr;
+
+	custr_free(str);
+	free(s);
+}
diff --git a/usr/src/lib/lib9p/common/sbuf/sbuf.h b/usr/src/lib/lib9p/common/sbuf/sbuf.h
new file mode 100644
index 0000000000..5b17b3113e
--- /dev/null
+++ b/usr/src/lib/lib9p/common/sbuf/sbuf.h
@@ -0,0 +1,51 @@
+/*
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * Minimal libsbuf wrapper around libcustr for illumos.
+ */
+
+#ifndef LIB9P_SBUF_H
+#define LIB9P_SBUF_H
+
+#include <stdarg.h>
+#include <libcustr.h>
+
+/*
+ * An sbuf is a thin wrapper holding a single libcustr string.
+ */
+struct sbuf
+{
+ custr_t *s_custr;
+};
+
+/*
+ * Functions implemented in sbuf.c.  Note that sbuf_data() is declared
+ * here but is provided by the macro of the same name below.
+ */
+struct sbuf *sbuf_new_auto(void);
+char *sbuf_data(struct sbuf *s);
+int sbuf_printf(struct sbuf *s, const char *fmt, ...);
+void sbuf_delete(struct sbuf *s);
+
+/*
+ * The remaining sbuf operations map directly onto libcustr calls on the
+ * wrapped string.  sbuf_finish() expands to nothing.
+ */
+#define sbuf_cat(s, str) custr_append((s)->s_custr, (str))
+#define sbuf_vprintf(s, fmt, args) \
+ custr_append_vprintf((s)->s_custr, (fmt), (args))
+#define sbuf_data(s) custr_cstr((s)->s_custr)
+#define sbuf_finish(s)
+
+#endif /* LIB9P_SBUF_H */
diff --git a/usr/src/lib/lib9p/common/threadpool.c b/usr/src/lib/lib9p/common/threadpool.c
new file mode 100644
index 0000000000..a29f2315c5
--- /dev/null
+++ b/usr/src/lib/lib9p/common/threadpool.c
@@ -0,0 +1,469 @@
+/*
+ * Copyright 2016 Jakub Klama <jceel@FreeBSD.org>
+ * All rights reserved
+ *
+ * Copyright 2020 Joyent, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <pthread.h>
+#if defined(__FreeBSD__)
+#include <pthread_np.h>
+#endif
+#include <sys/queue.h>
+#include "lib9p.h"
+#include "threadpool.h"
+
+static void l9p_threadpool_rflush(struct l9p_threadpool *tp,
+ struct l9p_request *req);
+
+/*
+ * Thread body for the responder: repeatedly takes the request at the
+ * head of the pool's reply queue and sends its response.
+ *
+ * <arg> is this thread's struct l9p_worker.  The loop ends, and the
+ * thread returns NULL, when the worker's ltw_exiting flag is seen or
+ * when locking/unlocking the pool mutex fails.  The exiting check comes
+ * before the dequeue, so replies still queued at that point are not
+ * sent by this thread.
+ */
+static void *
+l9p_responder(void *arg)
+{
+ struct l9p_threadpool *tp;
+ struct l9p_worker *worker = arg;
+ struct l9p_request *req;
+
+ tp = worker->ltw_tp;
+ for (;;) {
+ /* get next reply to send */
+
+ if (pthread_mutex_lock(&tp->ltp_mtx) != 0)
+ break;
+ /* sleep until there is a reply to send or we are told to exit */
+ while (STAILQ_EMPTY(&tp->ltp_replyq) && !worker->ltw_exiting) {
+ (void) pthread_cond_wait(&tp->ltp_reply_cv,
+ &tp->ltp_mtx);
+ }
+ if (worker->ltw_exiting) {
+ (void) pthread_mutex_unlock(&tp->ltp_mtx);
+ break;
+ }
+
+ /* off reply queue */
+ req = STAILQ_FIRST(&tp->ltp_replyq);
+ STAILQ_REMOVE_HEAD(&tp->ltp_replyq, lr_worklink);
+
+ /* request is now in final glide path, can't be Tflush-ed */
+ req->lr_workstate = L9P_WS_REPLYING;
+
+ /* any flushers waiting for this request can go now */
+ if (req->lr_flushstate != L9P_FLUSH_NONE)
+ l9p_threadpool_rflush(tp, req);
+
+ if (pthread_mutex_unlock(&tp->ltp_mtx) != 0)
+ break;
+
+ /* send response (done with the pool mutex released) */
+ l9p_respond(req, false, true);
+ }
+ return (NULL);
+}
+
+/*
+ * Thread body for a worker: repeatedly takes the request at the head of
+ * the pool's work queue, dispatches it, and moves it to the reply queue
+ * for the responder.
+ *
+ * <arg> is this thread's struct l9p_worker.  The pool mutex is held at
+ * the top of every loop iteration and released only while the request is
+ * being dispatched.  The thread returns NULL when ltw_exiting is seen or
+ * when the pool mutex cannot be locked; in the latter case after a
+ * dispatch, the request is not queued for a reply.
+ */
+static void *
+l9p_worker(void *arg)
+{
+ struct l9p_threadpool *tp;
+ struct l9p_worker *worker = arg;
+ struct l9p_request *req;
+
+ tp = worker->ltw_tp;
+ if (pthread_mutex_lock(&tp->ltp_mtx) != 0)
+ return (NULL);
+ for (;;) {
+ /* sleep until there is work or we are told to exit */
+ while (STAILQ_EMPTY(&tp->ltp_workq) && !worker->ltw_exiting) {
+ (void) pthread_cond_wait(&tp->ltp_work_cv,
+ &tp->ltp_mtx);
+ }
+ if (worker->ltw_exiting)
+ break;
+
+ /* off work queue; now work-in-progress, by us */
+ req = STAILQ_FIRST(&tp->ltp_workq);
+ STAILQ_REMOVE_HEAD(&tp->ltp_workq, lr_worklink);
+ req->lr_workstate = L9P_WS_INPROGRESS;
+ req->lr_worker = worker;
+ (void) pthread_mutex_unlock(&tp->ltp_mtx);
+
+ /* actually try the request; the result is kept for the reply */
+ req->lr_error = l9p_dispatch_request(req);
+
+ /* move to responder queue, updating work-state */
+ if (pthread_mutex_lock(&tp->ltp_mtx) != 0)
+ return (NULL);
+ req->lr_workstate = L9P_WS_RESPQUEUED;
+ req->lr_worker = NULL;
+ STAILQ_INSERT_TAIL(&tp->ltp_replyq, req, lr_worklink);
+
+ /* signal the responder */
+ (void) pthread_cond_signal(&tp->ltp_reply_cv);
+ }
+ /* reached only via the exiting break, with the mutex still held */
+ (void) pthread_mutex_unlock(&tp->ltp_mtx);
+ return (NULL);
+}
+
+/*
+ * Just before finally replying to a request that got touched by
+ * a Tflush request, we enqueue its flushers (requests of type
+ * Tflush, which are now on the flushee's lr_flushq) onto the
+ * response queue.
+ */
+/*
+ * Append every request on <req>'s lr_flushq to the pool's reply queue,
+ * in queue order, marking each one L9P_WS_RESPQUEUED.
+ *
+ * The one caller in this file, l9p_responder(), invokes this while
+ * holding the pool mutex.
+ */
+static void
+l9p_threadpool_rflush(struct l9p_threadpool *tp, struct l9p_request *req)
+{
+ struct l9p_request *flusher;
+
+ /*
+ * https://swtch.com/plan9port/man/man9/flush.html says:
+ *
+ * "Should multiple Tflushes be received for a pending
+ * request, they must be answered in order. A Rflush for
+ * any of the multiple Tflushes implies an answer for all
+ * previous ones. Therefore, should a server receive a
+ * request and then multiple flushes for that request, it
+ * need respond only to the last flush." This means
+ * we could march through the queue of flushers here,
+ * marking all but the last one as "to be dropped" rather
+ * than "to be replied-to".
+ *
+ * However, we'll leave that for later, if ever -- it
+ * should be harmless to respond to each, in order.
+ */
+ STAILQ_FOREACH(flusher, &req->lr_flushq, lr_flushlink) {
+ flusher->lr_workstate = L9P_WS_RESPQUEUED;
+#ifdef notdef
+ if (not the last) {
+ flusher->lr_flushstate = L9P_FLUSH_NOT_RUN;
+ /* or, flusher->lr_drop = true ? */
+ }
+#endif
+ /* the flusher stays on lr_flushq; it is linked via lr_worklink here */
+ STAILQ_INSERT_TAIL(&tp->ltp_replyq, flusher, lr_worklink);
+ }
+}
+
+/*
+ * Initialize thread pool <tp> and start its threads: one responder plus
+ * up to <size> workers.
+ *
+ * Returns 0 once the responder and at least one worker are running (the
+ * pool may end up with fewer than <size> workers if thread creation or
+ * allocation fails part-way).  Returns EINVAL if <size> is not positive,
+ * or the error from the failing pthread/allocation call otherwise; when
+ * the minimum set of threads could not be started the pool is shut down
+ * again before returning.
+ */
+int
+l9p_threadpool_init(struct l9p_threadpool *tp, int size)
+{
+	struct l9p_worker *worker;
+#if defined(__FreeBSD__)
+	char threadname[16];
+#endif
+	int error;
+	int i, nworkers, nresponders;
+
+	if (size <= 0)
+		return (EINVAL);
+#ifdef __illumos__
+	pthread_mutexattr_t attr;
+
+	if ((error = pthread_mutexattr_init(&attr)) != 0)
+		return (error);
+	error = pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK);
+	if (error == 0)
+		error = pthread_mutex_init(&tp->ltp_mtx, &attr);
+	/* the attribute object is no longer needed, success or not */
+	(void) pthread_mutexattr_destroy(&attr);
+#else
+	error = pthread_mutex_init(&tp->ltp_mtx, NULL);
+#endif
+	if (error)
+		return (error);
+	error = pthread_cond_init(&tp->ltp_work_cv, NULL);
+	if (error)
+		goto fail_work_cv;
+	error = pthread_cond_init(&tp->ltp_reply_cv, NULL);
+	if (error)
+		goto fail_reply_cv;
+
+	STAILQ_INIT(&tp->ltp_workq);
+	STAILQ_INIT(&tp->ltp_replyq);
+	LIST_INIT(&tp->ltp_workers);
+
+	nresponders = 0;
+	nworkers = 0;
+	/* iteration 0 creates the responder, 1..size create the workers */
+	for (i = 0; i <= size; i++) {
+		worker = calloc(1, sizeof (struct l9p_worker));
+		if (worker == NULL) {
+			/*
+			 * Record the failure so that the shutdown path
+			 * below cannot report success.
+			 */
+			error = ENOMEM;
+			break;
+		}
+		worker->ltw_tp = tp;
+		worker->ltw_responder = i == 0;
+		error = pthread_create(&worker->ltw_thread, NULL,
+		    worker->ltw_responder ? l9p_responder : l9p_worker,
+		    (void *)worker);
+		if (error) {
+			free(worker);
+			break;
+		}
+		if (worker->ltw_responder)
+			nresponders++;
+		else
+			nworkers++;
+
+#if defined(__FreeBSD__)
+		if (worker->ltw_responder) {
+			pthread_set_name_np(worker->ltw_thread, "9p-responder");
+		} else {
+			(void) snprintf(threadname, sizeof (threadname),
+			    "9p-worker:%d", i - 1);
+			pthread_set_name_np(worker->ltw_thread, threadname);
+		}
+#elif defined(__illumos__)
+		if (worker->ltw_responder) {
+			(void) pthread_setname_np(worker->ltw_thread,
+			    "9p-responder");
+		} else {
+			char threadname[PTHREAD_MAX_NAMELEN_NP];
+
+			(void) snprintf(threadname, sizeof (threadname),
+			    "9p-worker:%d", i - 1);
+			(void) pthread_setname_np(worker->ltw_thread,
+			    threadname);
+		}
+#endif
+
+		LIST_INSERT_HEAD(&tp->ltp_workers, worker, ltw_link);
+	}
+	if (nresponders == 0 || nworkers == 0) {
+		/*
+		 * Need the one responder, and at least one worker.  The
+		 * loop can only have stopped this early through one of
+		 * the failure breaks above, so error is nonzero here.
+		 */
+		l9p_threadpool_shutdown(tp);
+		return (error);
+	}
+	return (0);
+
+	/*
+	 * We could avoid these labels by having multiple destroy
+	 * paths (one for each error case), or by having booleans
+	 * for which variables were initialized.  Neither is very
+	 * appealing...
+	 */
+fail_reply_cv:
+	(void) pthread_cond_destroy(&tp->ltp_work_cv);
+fail_work_cv:
+	(void) pthread_mutex_destroy(&tp->ltp_mtx);
+
+	return (error);
+}
+
+/*
+ * Hand a request to the thread pool.
+ *
+ * A Tflush is dispatched synchronously on the calling thread, because
+ * it can cancel / kill off regular requests; it still goes through the
+ * regular dispatch mechanism.  Every other request is appended to the
+ * work queue and one waiter on the work condition variable is woken.
+ * If the pool mutex cannot be taken, the request is not queued.
+ */
+void
+l9p_threadpool_run(struct l9p_threadpool *tp, struct l9p_request *req)
+{
+	bool is_flush = (req->lr_req.hdr.type == L9P_TFLUSH);
+
+	if (is_flush) {
+		/* Not on a work queue yet, so the state may be set unlocked. */
+		req->lr_workstate = L9P_WS_IMMEDIATE;
+		(void) l9p_dispatch_request(req);
+		return;
+	}
+
+	if (pthread_mutex_lock(&tp->ltp_mtx) != 0)
+		return;
+
+	req->lr_workstate = L9P_WS_NOTSTARTED;
+	STAILQ_INSERT_TAIL(&tp->ltp_workq, req, lr_worklink);
+	(void) pthread_cond_signal(&tp->ltp_work_cv);
+	(void) pthread_mutex_unlock(&tp->ltp_mtx);
+}
+
+/*
+ * Run a Tflush request. Called via l9p_dispatch_request() since
+ * it has some debug code in it, but not called from worker thread.
+ *
+ * The request named by the Tflush's oldtag (the "flushee") is looked
+ * up in the connection's request table. If there is no such request,
+ * or it is already in L9P_WS_REPLYING, the Tflush itself is put on the
+ * reply queue right away. Otherwise the Tflush is appended to the
+ * flushee's lr_flushq, and the flushee's lr_flushstate records how far
+ * along the flushee was when the flush arrived.
+ *
+ * Returns 0 on success, or the error from ht_wrlock() or
+ * pthread_mutex_lock() if a lock could not be taken.
+ */
+int
+l9p_threadpool_tflush(struct l9p_request *req)
+{
+ struct l9p_connection *conn;
+ struct l9p_threadpool *tp;
+ struct l9p_request *flushee;
+ uint16_t oldtag;
+ enum l9p_flushstate nstate = L9P_FLUSH_NONE;
+ int err;
+
+ /*
+ * Find what we're supposed to flush (the flushee, as it were).
+ */
+ req->lr_error = 0; /* Tflush always succeeds */
+ conn = req->lr_conn;
+ tp = &conn->lc_tp;
+ oldtag = req->lr_req.tflush.oldtag;
+ if ((err = ht_wrlock(&conn->lc_requests)) != 0)
+ return (err);
+ flushee = ht_find_locked(&conn->lc_requests, oldtag);
+ if (flushee == NULL) {
+ /*
+ * Nothing to flush! The old request must have
+ * been done and gone already. Just queue this
+ * Tflush for a success reply.
+ */
+ (void) ht_unlock(&conn->lc_requests);
+ if ((err = pthread_mutex_lock(&tp->ltp_mtx)) != 0)
+ return (err);
+ goto done;
+ }
+
+ /*
+ * Found the original request. We'll need to inspect its
+ * work-state to figure out what to do.
+ *
+ * The pool mutex is taken before the request-table lock is
+ * dropped, so there is no window in which neither lock is held.
+ * NOTE(review): presumably this keeps the flushee from being
+ * retired in between -- confirm against the responder path.
+ */
+ if ((err = pthread_mutex_lock(&tp->ltp_mtx)) != 0) {
+ (void) ht_unlock(&conn->lc_requests);
+ return (err);
+ }
+ (void) ht_unlock(&conn->lc_requests);
+
+ switch (flushee->lr_workstate) {
+
+ case L9P_WS_NOTSTARTED:
+ /*
+ * Flushee is on work queue, but not yet being
+ * handled by a worker.
+ *
+ * The documentation -- see
+ * http://ericvh.github.io/9p-rfc/rfc9p2000.html
+ * https://swtch.com/plan9port/man/man9/flush.html
+ * -- says that "the server should answer the
+ * flush message immediately". However, Linux
+ * sends flush requests for operations that
+ * must finish, such as Tclunk, and it's not
+ * possible to *answer* the flush request until
+ * it has been handled (if necessary) or aborted
+ * (if allowed).
+ *
+ * We therefore now just mark the original request
+ * and let the request-handler do whatever is
+ * appropriate. NOTE: we could have a table of
+ * "requests that can be aborted without being
+ * run" vs "requests that must be run to be
+ * aborted", but for now that seems like an
+ * unnecessary complication.
+ */
+ nstate = L9P_FLUSH_REQUESTED_PRE_START;
+ break;
+
+ case L9P_WS_IMMEDIATE:
+ /*
+ * This state only applies to Tflush requests, and
+ * flushing a Tflush is illegal. But we'll do nothing
+ * special here, which will make us act like a flush
+ * request for the flushee that arrived too late to
+ * do anything about the flushee.
+ */
+ nstate = L9P_FLUSH_REQUESTED_POST_START;
+ break;
+
+ case L9P_WS_INPROGRESS:
+ /*
+ * Worker thread flushee->lr_worker is working on it.
+ * Kick it to get it out of blocking system calls.
+ * (This requires that it carefully set up some
+ * signal handlers, and may be FreeBSD-dependent,
+ * it probably cannot be handled this way on MacOS.)
+ *
+ * The kick itself is compiled out (notyet), so for now
+ * only the flush state is recorded.
+ */
+#ifdef notyet
+ pthread_kill(...);
+#endif
+ nstate = L9P_FLUSH_REQUESTED_POST_START;
+ break;
+
+ case L9P_WS_RESPQUEUED:
+ /*
+ * The flushee is already in the response queue.
+ * We'll just mark it as having had some flush
+ * action applied.
+ */
+ nstate = L9P_FLUSH_TOOLATE;
+ break;
+
+ case L9P_WS_REPLYING:
+ /*
+ * Although we found the flushee, it's too late to
+ * make us depend on it: it's already heading out
+ * the door as a reply.
+ *
+ * We don't want to do anything to the flushee.
+ * Instead, we want to work the same way as if
+ * we had never found the tag.
+ */
+ goto done;
+ }
+
+ /*
+ * Now add us to the list of Tflush-es that are waiting
+ * for the flushee (creating the list if needed, i.e., if
+ * this is the first Tflush for the flushee). We (req)
+ * will get queued for reply later, when the responder
+ * processes the flushee and calls l9p_threadpool_rflush().
+ */
+ if (flushee->lr_flushstate == L9P_FLUSH_NONE)
+ STAILQ_INIT(&flushee->lr_flushq);
+ flushee->lr_flushstate = nstate;
+ STAILQ_INSERT_TAIL(&flushee->lr_flushq, req, lr_flushlink);
+
+ (void) pthread_mutex_unlock(&tp->ltp_mtx);
+
+ return (0);
+
+done:
+ /*
+ * This immediate op is ready to be replied-to now, so just
+ * stick it onto the reply queue. Both paths that jump here
+ * arrive with the pool mutex held.
+ */
+ req->lr_workstate = L9P_WS_RESPQUEUED;
+ STAILQ_INSERT_TAIL(&tp->ltp_replyq, req, lr_worklink);
+ (void) pthread_mutex_unlock(&tp->ltp_mtx);
+ (void) pthread_cond_signal(&tp->ltp_reply_cv);
+ return (0);
+}
+
+/*
+ * Stop the pool's threads and tear down its synchronization objects.
+ *
+ * Each listed worker is flagged as exiting under the pool mutex, woken
+ * through the condition variable it waits on (the responder through the
+ * reply cv, regular workers through the work cv), joined, unlinked and
+ * freed.  A worker for which the mutex cannot be taken is skipped and
+ * stays on the list.  Always returns 0.
+ */
+int
+l9p_threadpool_shutdown(struct l9p_threadpool *tp)
+{
+	struct l9p_worker *cur, *next;
+
+	LIST_FOREACH_SAFE(cur, &tp->ltp_workers, ltw_link, next) {
+		if (pthread_mutex_lock(&tp->ltp_mtx) == 0) {
+			cur->ltw_exiting = true;
+			if (!cur->ltw_responder) {
+				(void) pthread_cond_broadcast(
+				    &tp->ltp_work_cv);
+			} else {
+				(void) pthread_cond_signal(&tp->ltp_reply_cv);
+			}
+			(void) pthread_mutex_unlock(&tp->ltp_mtx);
+
+			/* Wait for the thread to notice and leave. */
+			(void) pthread_join(cur->ltw_thread, NULL);
+			LIST_REMOVE(cur, ltw_link);
+			free(cur);
+		}
+	}
+
+	(void) pthread_cond_destroy(&tp->ltp_reply_cv);
+	(void) pthread_cond_destroy(&tp->ltp_work_cv);
+	(void) pthread_mutex_destroy(&tp->ltp_mtx);
+
+	return (0);
+}
diff --git a/usr/src/lib/lib9p/common/threadpool.h b/usr/src/lib/lib9p/common/threadpool.h
new file mode 100644
index 0000000000..2855c1c545
--- /dev/null
+++ b/usr/src/lib/lib9p/common/threadpool.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2016 Jakub Klama <jceel@FreeBSD.org>
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef LIB9P_THREADPOOL_H
+#define LIB9P_THREADPOOL_H
+
+#include <stdbool.h>
+#include <pthread.h>
+#include <sys/queue.h>
+#include "lib9p.h"
+
+STAILQ_HEAD(l9p_request_queue, l9p_request);
+
+/*
+ * Most of the workers in the threadpool run requests.
+ *
+ * One distinguished worker delivers responses from the
+ * response queue. The reason this worker exists is to
+ * guarantee response order, so that flush responses go
+ * after their flushed requests.
+ *
+ * ltp_mtx is the one lock for the whole pool: requests are
+ * inserted on ltp_workq and ltp_replyq with it held, and a
+ * worker's ltw_exiting flag is set with it held at shutdown.
+ */
+struct l9p_threadpool {
+ struct l9p_connection * ltp_conn; /* the connection */
+ struct l9p_request_queue ltp_workq; /* requests awaiting a worker */
+ struct l9p_request_queue ltp_replyq; /* requests that are done */
+ pthread_mutex_t ltp_mtx; /* locks queues and cond vars */
+ pthread_cond_t ltp_work_cv; /* to signal regular workers */
+ pthread_cond_t ltp_reply_cv; /* to signal reply-worker */
+ LIST_HEAD(, l9p_worker) ltp_workers; /* list of all workers */
+};
+
+/*
+ * All workers, including the responder, use this as their
+ * control structure. (The only thing that distinguishes the
+ * responder is that it runs different code and waits on the
+ * reply_cv.)
+ */
+struct l9p_worker {
+ struct l9p_threadpool * ltw_tp; /* pool this worker belongs to */
+ pthread_t ltw_thread; /* thread id, joined at shutdown */
+ bool ltw_exiting; /* set under ltp_mtx to ask the thread to exit */
+ bool ltw_responder; /* true for the one reply-delivering worker */
+ LIST_ENTRY(l9p_worker) ltw_link; /* linkage on ltp_workers */
+};
+
+/*
+ * Each request has a "work state" telling where the request is,
+ * in terms of workers working on it. That is, this tells us
+ * which threadpool queue, if any, the request is in now or would
+ * go in, or what's happening with it.
+ *
+ * l9p_threadpool_run() puts a regular request in NOTSTARTED when
+ * it queues it, and puts a Tflush in IMMEDIATE because a Tflush
+ * is dispatched directly by the caller rather than queued.
+ */
+enum l9p_workstate {
+ L9P_WS_NOTSTARTED, /* not yet started */
+ L9P_WS_IMMEDIATE, /* Tflush being done sans worker */
+ L9P_WS_INPROGRESS, /* worker is working on it */
+ L9P_WS_RESPQUEUED, /* worker is done, response queued */
+ L9P_WS_REPLYING, /* responder is in final reply path */
+};
+
+/*
+ * Each request has a "flush state", initially NONE meaning no
+ * Tflush affected the request.
+ *
+ * If a Tflush comes in before we ever assign a work thread,
+ * the flush state goes to FLUSH_REQUESTED_PRE_START.
+ *
+ * If a Tflush comes in after we assign a work thread, the
+ * flush state goes to FLUSH_REQUESTED_POST_START. The flush
+ * request may be too late: the request might finish anyway.
+ * Or it might be soon enough to abort. In all cases, though, the
+ * operation requesting the flush (the "flusher") must wait for
+ * the other request (the "flushee") to go through the respond
+ * path. The respond routine gets to decide whether to send a
+ * normal response, send an error, or drop the request
+ * entirely.
+ *
+ * If a Tflush comes in when the flushee's response is already
+ * queued (work state RESPQUEUED), the flush state goes to
+ * FLUSH_TOOLATE.
+ *
+ * There's one especially annoying case: what if a Tflush comes in
+ * *while* we're sending a response? In this case it's too late:
+ * the flush just waits for the fully-composed response.
+ */
+enum l9p_flushstate {
+ L9P_FLUSH_NONE = 0, /* must be zero */
+ L9P_FLUSH_REQUESTED_PRE_START, /* not even started before flush */
+ L9P_FLUSH_REQUESTED_POST_START, /* started, then someone said flush */
+ L9P_FLUSH_TOOLATE /* too late, already responding */
+};
+
+/*
+ * Thread pool interface.
+ *
+ * l9p_threadpool_run() queues a request for a worker, or dispatches it
+ * directly when it is a Tflush. l9p_threadpool_tflush() performs the
+ * Tflush bookkeeping. l9p_threadpool_shutdown() joins the workers and
+ * destroys the pool's mutex and condition variables.
+ */
+void l9p_threadpool_flushee_done(struct l9p_request *);
+int l9p_threadpool_init(struct l9p_threadpool *, int);
+void l9p_threadpool_run(struct l9p_threadpool *, struct l9p_request *);
+int l9p_threadpool_shutdown(struct l9p_threadpool *);
+int l9p_threadpool_tflush(struct l9p_request *);
+
+#endif /* LIB9P_THREADPOOL_H */
diff --git a/usr/src/lib/lib9p/common/transport/socket.c b/usr/src/lib/lib9p/common/transport/socket.c
new file mode 100644
index 0000000000..214a1c8d70
--- /dev/null
+++ b/usr/src/lib/lib9p/common/transport/socket.c
@@ -0,0 +1,593 @@
+/*
+ * Copyright 2016 Jakub Klama <jceel@FreeBSD.org>
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Copyright 2021 Joyent, Inc.
+ */
+
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <assert.h>
+#include <sys/types.h>
+#ifdef __APPLE__
+# include "../apple_endian.h"
+#elif __illumos__
+# include <sys/param.h>
+# include <port.h>
+# include "../illumos_endian.h"
+#else
+# include <sys/endian.h>
+#endif
+#include <sys/socket.h>
+#ifndef __illumos__
+# include <sys/event.h>
+#endif
+#include <sys/uio.h>
+#include <netdb.h>
+#include "../lib9p.h"
+#include "../lib9p_impl.h"
+#include "../log.h"
+#include "socket.h"
+
+/*
+ * Per-connection state for the socket transport. One is allocated in
+ * l9p_socket_accept() for every accepted client; it is handed to the
+ * connection's transport callbacks as lt_aux and to l9p_socket_thread()
+ * as its argument.
+ */
+struct l9p_socket_softc
+{
+ struct l9p_connection *ls_conn; /* connection served over ls_fd */
+ struct sockaddr ls_sockaddr; /* NOTE(review): never assigned in this file */
+ socklen_t ls_socklen; /* NOTE(review): never assigned in this file */
+ pthread_t ls_thread; /* thread running l9p_socket_thread() */
+ int ls_fd; /* accepted client socket */
+};
+
+/*
+ * Platform-specific state used to wait for activity on the listening
+ * sockets: kqueue on FreeBSD, event ports on illumos.
+ */
+#ifdef __FreeBSD__
+struct event_svr {
+ struct kevent *ev_kev; /* change list: one entry per listening socket */
+ struct kevent *ev_event; /* buffer that kevent() fills with ready events */
+ int ev_kq; /* kqueue descriptor */
+};
+#elif __illumos__
+struct event_svr {
+ port_event_t *ev_pe; /* buffer that port_getn() fills with events */
+ int ev_port; /* event port descriptor */
+};
+#else
+#error "No event server defined"
+#endif
+
+/*
+ * Forward declarations for the file-local helpers: listener setup and
+ * event loop first, then the per-connection transport callbacks, then
+ * the full-length read/write wrappers.
+ */
+static int l9p_init_event_svr(struct event_svr *, uint_t);
+static uint_t l9p_get_server_addrs(const char *, const char *,
+ struct addrinfo **);
+static uint_t l9p_bind_addrs(struct event_svr *, struct addrinfo *, uint_t,
+ int **);
+static int l9p_event_get(struct l9p_server *, struct event_svr *, uint_t,
+ void (*cb)(struct l9p_server *, int));
+static int l9p_socket_readmsg(struct l9p_socket_softc *, void **, size_t *);
+static int l9p_socket_get_response_buffer(struct l9p_request *,
+ struct iovec *, size_t *, void *);
+static int l9p_socket_send_response(struct l9p_request *, const struct iovec *,
+ const size_t, const size_t, void *);
+static void l9p_socket_drop_response(struct l9p_request *, const struct iovec *,
+ size_t, void *);
+static void *l9p_socket_thread(void *);
+static ssize_t xread(int, void *, size_t);
+static ssize_t xwrite(int, void *, size_t);
+
+/*
+ * Listen on every address that host/port resolves to and accept
+ * connections until the event loop reports a failure.
+ *
+ * This function only returns once something has gone wrong, so the
+ * return value is always -1.  All listening sockets and the event
+ * server state are released before returning.
+ */
+int
+l9p_start_server(struct l9p_server *server, const char *host, const char *port)
+{
+	struct event_svr esvr;
+	struct addrinfo *ai_list = NULL;
+	int *listeners = NULL;
+	uint_t addr_count = 0;
+	uint_t listener_count = 0;
+	uint_t idx;
+
+	addr_count = l9p_get_server_addrs(host, port, &ai_list);
+	if (addr_count == 0)
+		return (-1);
+
+	if (l9p_init_event_svr(&esvr, addr_count) != 0) {
+		freeaddrinfo(ai_list);
+		return (-1);
+	}
+
+	listener_count = l9p_bind_addrs(&esvr, ai_list, addr_count,
+	    &listeners);
+
+	/*
+	 * The address list is not needed past this point; release it and
+	 * clear the pointer so it cannot be used after the free.
+	 */
+	freeaddrinfo(ai_list);
+	ai_list = NULL;
+
+	if (listener_count != 0) {
+		/* Serve until waiting for events fails. */
+		while (l9p_event_get(server, &esvr, listener_count,
+		    l9p_socket_accept) >= 0)
+			continue;
+
+		for (idx = 0; idx < listener_count; idx++)
+			close(listeners[idx]);
+	}
+
+	free(listeners);
+
+#ifdef __FreeBSD__
+	close(esvr.ev_kq);
+	free(esvr.ev_kev);
+	free(esvr.ev_event);
+#elif __illumos__
+	close(esvr.ev_port);
+	free(esvr.ev_pe);
+#else
+#error "Port me"
+#endif
+
+	return (-1);
+}
+
+/*
+ * Resolve host/port into a list of stream-socket addresses.
+ *
+ * host, port	passed straight to getaddrinfo()
+ * resp		out: the resulting address list; the caller releases it
+ *		with freeaddrinfo().  Set to NULL if resolution fails.
+ *
+ * Returns the number of addresses in the list, or 0 after logging an
+ * error when resolution fails or yields nothing.
+ */
+static uint_t
+l9p_get_server_addrs(const char *host, const char *port, struct addrinfo **resp)
+{
+	struct addrinfo *res, hints;
+	uint_t naddrs;
+	int rc;
+
+	*resp = NULL;
+
+	memset(&hints, 0, sizeof(hints));
+	hints.ai_family = PF_UNSPEC;
+	hints.ai_socktype = SOCK_STREAM;
+	rc = getaddrinfo(host, port, &hints, resp);
+	/*
+	 * getaddrinfo() reports failure with any non-zero value; the
+	 * EAI_* codes are negative on some platforms, so a "> 0" test
+	 * would let those failures through.
+	 */
+	if (rc != 0) {
+		L9P_LOG(L9P_ERROR, "getaddrinfo(): %s", gai_strerror(rc));
+		*resp = NULL;
+		return (0);
+	}
+
+	naddrs = 0;
+	for (res = *resp; res != NULL; res = res->ai_next)
+		naddrs++;
+
+	if (naddrs == 0) {
+		L9P_LOG(L9P_ERROR, "no addresses found for %s:%s", host, port);
+	}
+
+	return (naddrs);
+}
+
+#ifdef __FreeBSD__
+/*
+ * FreeBSD: set up the kqueue-based event server.
+ *
+ * Allocates the kevent change list and the event buffer, nsockets
+ * entries each, and creates the kqueue.  Returns 0 on success.  On
+ * failure, logs the error, releases whatever was allocated (leaving
+ * the corresponding pointers NULL) and returns -1.
+ */
+static int
+l9p_init_event_svr(struct event_svr *svr, uint_t nsockets)
+{
+	svr->ev_kev = calloc(nsockets, sizeof(struct kevent));
+	if (svr->ev_kev == NULL) {
+		L9P_LOG(L9P_ERROR, "calloc(): %s", strerror(errno));
+		return (-1);
+	}
+
+	svr->ev_event = calloc(nsockets, sizeof(struct kevent));
+	if (svr->ev_event == NULL) {
+		L9P_LOG(L9P_ERROR, "calloc(): %s", strerror(errno));
+		/* The change list is ev_kev; there is no ev_key member. */
+		free(svr->ev_kev);
+		svr->ev_kev = NULL;
+		return (-1);
+	}
+
+	svr->ev_kq = kqueue();
+	if (svr->ev_kq == -1) {
+		L9P_LOG(L9P_ERROR, "kqueue(): %s", strerror(errno));
+		free(svr->ev_kev);
+		free(svr->ev_event);
+		svr->ev_kev = NULL;
+		svr->ev_event = NULL;
+		return (-1);
+	}
+
+	return (0);
+}
+#elif __illumos__
+/*
+ * illumos: set up the event-port-based event server.
+ *
+ * Allocates an event buffer of nsockets entries and creates the event
+ * port.  Returns 0 on success.  On failure, logs the error, releases
+ * the event buffer if it was allocated (leaving ev_pe NULL) and
+ * returns -1.
+ */
+static int
+l9p_init_event_svr(struct event_svr *svr, uint_t nsockets)
+{
+	svr->ev_pe = calloc(nsockets, sizeof(port_event_t));
+	if (svr->ev_pe == NULL) {
+		L9P_LOG(L9P_ERROR, "calloc(): %s", strerror(errno));
+		return (-1);
+	}
+
+	svr->ev_port = port_create();
+	if (svr->ev_port == -1) {
+		L9P_LOG(L9P_ERROR, "port_create(): %s", strerror(errno));
+		/* The caller does not clean up after a failed init. */
+		free(svr->ev_pe);
+		svr->ev_pe = NULL;
+		return (-1);
+	}
+
+	return (0);
+}
+#else
+#error "No event server defined"
+#endif
+
+/*
+ * Create, bind and listen on one socket per address in addrs, and
+ * register each listening socket with the event server.
+ *
+ * svr		event server set up by l9p_init_event_svr()
+ * addrs	address list to bind
+ * naddrs	number of entries in addrs
+ * socketsp	out: newly allocated array of naddrs descriptors.  The
+ *		first <return value> entries are listening sockets and
+ *		the remainder are -1.  Set to NULL when 0 is returned.
+ *
+ * An address that fails at any step is logged and skipped.  Returns
+ * the number of listening sockets, or 0 if none could be set up.
+ */
+static uint_t
+l9p_bind_addrs(struct event_svr *svr, struct addrinfo *addrs, uint_t naddrs,
+    int **socketsp)
+{
+	struct addrinfo *addr;
+	int *socks;
+	uint_t i, j;
+
+	/*
+	 * Work through a plain int pointer: "*socketsp[i]" parses as
+	 * "*(socketsp[i])", which indexes past the caller's pointer
+	 * instead of into the array.
+	 */
+	socks = calloc(naddrs, sizeof(int));
+	*socketsp = socks;
+	if (socks == NULL) {
+		L9P_LOG(L9P_ERROR, "calloc(): %s", strerror(errno));
+		return (0);
+	}
+
+	for (i = 0, addr = addrs; addr != NULL && i < naddrs;
+	    addr = addr->ai_next) {
+		int s;
+		int val = 1;
+
+		s = socket(addr->ai_family, addr->ai_socktype,
+		    addr->ai_protocol);
+		if (s == -1) {
+			L9P_LOG(L9P_ERROR, "socket(): %s", strerror(errno));
+			continue;
+		}
+
+		if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &val,
+		    sizeof(val)) < 0) {
+			L9P_LOG(L9P_ERROR, "setsockopt(): %s", strerror(errno));
+			close(s);
+			continue;
+		}
+
+		if (bind(s, addr->ai_addr, addr->ai_addrlen) < 0) {
+			L9P_LOG(L9P_ERROR, "bind(): %s", strerror(errno));
+			close(s);
+			continue;
+		}
+
+		if (listen(s, 10) < 0) {
+			L9P_LOG(L9P_ERROR, "listen(): %s", strerror(errno));
+			close(s);
+			continue;
+		}
+
+#ifdef __FreeBSD__
+		EV_SET(&svr->ev_kev[i], s, EVFILT_READ, EV_ADD | EV_ENABLE, 0,
+		    0, 0);
+#elif __illumos__
+		if (port_associate(svr->ev_port, PORT_SOURCE_FD, s,
+		    POLLIN|POLLHUP, NULL) < 0) {
+			L9P_LOG(L9P_ERROR, "port_associate(%d): %s", s,
+			    strerror(errno));
+			close(s);
+			continue;
+		}
+#else
+#error "Port me"
+#endif
+
+		socks[i++] = s;
+	}
+
+	if (i < 1) {
+		free(socks);
+		*socketsp = NULL;
+		return (0);
+	}
+
+	/* Mark the unused tail of the array. */
+	for (j = i; j < naddrs; j++)
+		socks[j] = -1;
+
+#ifdef __FreeBSD__
+	if (kevent(svr->ev_kq, svr->ev_kev, i, NULL, 0, NULL) < 0) {
+		L9P_LOG(L9P_ERROR, "kevent(): %s", strerror(errno));
+
+		/* Close the sockets we opened, not descriptors 0..i-1. */
+		for (j = 0; j < i; j++)
+			close(socks[j]);
+
+		free(socks);
+		*socketsp = NULL;
+
+		return (0);
+	}
+#endif
+
+	return (i);
+}
+
+#ifdef __FreeBSD__
+/*
+ * FreeBSD: wait for activity on the listening sockets and invoke cb
+ * once for every socket that is ready.
+ *
+ * l9svr	server handed through to cb
+ * esvr		event server holding the kqueue and the event buffer
+ * nsockets	capacity of the event buffer
+ * cb		called with the server and the ready descriptor
+ *
+ * Returns 0 after dispatching events or when the wait was interrupted
+ * by a signal, and -1 (after logging) on any other kevent() failure.
+ */
+static int
+l9p_event_get(struct l9p_server *l9svr, struct event_svr *esvr, uint_t nsockets,
+    void (*cb)(struct l9p_server *, int))
+{
+	int i, evs;
+
+	evs = kevent(esvr->ev_kq, NULL, 0, esvr->ev_event, nsockets, NULL);
+	if (evs < 0) {
+		if (errno == EINTR)
+			return (0);
+		L9P_LOG(L9P_ERROR, "kevent(): %s", strerror(errno));
+		return (-1);
+	}
+
+	/* The parameter is "esvr"; "sevr" was an undeclared name. */
+	for (i = 0; i < evs; i++)
+		cb(l9svr, (int)esvr->ev_event[i].ident);
+
+	return (0);
+}
+#elif __illumos__
+/*
+ * illumos: wait for activity on the listening sockets and invoke cb
+ * once for every descriptor that has an event.
+ *
+ * l9svr	server handed through to cb
+ * esvr		event server holding the event port and event buffer
+ * nsockets	capacity of the event buffer
+ * cb		called with the server and the ready descriptor
+ *
+ * Returns 0 after dispatching events or when the wait was interrupted
+ * by a signal, and -1 (after logging) on any other port_getn()
+ * failure.
+ */
+static int
+l9p_event_get(struct l9p_server *l9svr, struct event_svr *esvr, uint_t nsockets,
+    void (*cb)(struct l9p_server *, int))
+{
+	uint_t evs = 1;
+	uint_t i;
+
+	if (port_getn(esvr->ev_port, esvr->ev_pe, nsockets, &evs, NULL) < 0) {
+		if (errno == EINTR)
+			return (0);
+		L9P_LOG(L9P_ERROR, "port_getn(): %s", strerror(errno));
+		return (-1);
+	}
+
+	for (i = 0; i < evs; i++) {
+		int fd;
+
+		if (esvr->ev_pe[i].portev_source != PORT_SOURCE_FD)
+			continue;
+
+		fd = (int)esvr->ev_pe[i].portev_object;
+		cb(l9svr, fd);
+
+		/*
+		 * A PORT_SOURCE_FD association is dissolved once its event
+		 * has been retrieved, so the listener must be associated
+		 * again or it will never report another connection.
+		 */
+		if (port_associate(esvr->ev_port, PORT_SOURCE_FD, fd,
+		    POLLIN|POLLHUP, NULL) < 0) {
+			L9P_LOG(L9P_ERROR, "port_associate(%d): %s", fd,
+			    strerror(errno));
+		}
+	}
+
+	return (0);
+}
+#else
+#error "Port me"
+#endif
+
+/*
+ * Accept one pending connection on the listening socket svr_fd and
+ * start serving it.
+ *
+ * A new l9p connection and a softc are created for the client, the
+ * socket transport callbacks are installed, and a thread running
+ * l9p_socket_thread() is started.  Failures are logged; on any failure
+ * after accept() the client socket is closed again.
+ */
+void
+l9p_socket_accept(struct l9p_server *server, int svr_fd)
+{
+	struct l9p_socket_softc *sc;
+	struct l9p_connection *conn;
+	char host[NI_MAXHOST + 1];
+	char serv[NI_MAXSERV + 1];
+	/* sockaddr_storage is large enough for any family, IPv6 included. */
+	struct sockaddr_storage client_addr;
+	socklen_t client_addr_len = sizeof(client_addr);
+	int conn_fd, err;
+
+	conn_fd = accept(svr_fd, (struct sockaddr *)&client_addr,
+	    &client_addr_len);
+	if (conn_fd < 0) {
+		L9P_LOG(L9P_WARNING, "accept(): %s", strerror(errno));
+		return;
+	}
+
+	err = getnameinfo((struct sockaddr *)&client_addr, client_addr_len,
+	    host, NI_MAXHOST, serv, NI_MAXSERV,
+	    NI_NUMERICHOST | NI_NUMERICSERV);
+
+	if (err != 0) {
+		L9P_LOG(L9P_WARNING, "cannot look up client name: %s",
+		    gai_strerror(err));
+		/* Keep host/serv printable for the error log further down. */
+		host[0] = serv[0] = '?';
+		host[1] = serv[1] = '\0';
+	} else {
+		L9P_LOG(L9P_INFO, "new connection from %s:%s", host, serv);
+	}
+
+	if (l9p_connection_init(server, &conn) != 0) {
+		L9P_LOG(L9P_ERROR, "cannot create new connection");
+		(void) close(conn_fd);
+		return;
+	}
+
+	sc = l9p_calloc(1, sizeof(*sc));
+	sc->ls_conn = conn;
+	sc->ls_fd = conn_fd;
+
+	/*
+	 * Fill in transport handler functions and aux argument.
+	 */
+	conn->lc_lt.lt_aux = sc;
+	conn->lc_lt.lt_get_response_buffer = l9p_socket_get_response_buffer;
+	conn->lc_lt.lt_send_response = l9p_socket_send_response;
+	conn->lc_lt.lt_drop_response = l9p_socket_drop_response;
+
+	err = pthread_create(&sc->ls_thread, NULL, l9p_socket_thread, sc);
+	if (err) {
+		L9P_LOG(L9P_ERROR,
+		    "pthread_create (for connection from %s:%s): error %s",
+		    host, serv, strerror(err));
+		l9p_connection_close(sc->ls_conn);
+		(void) close(conn_fd);
+		free(sc);
+	}
+}
+
+/*
+ * Per-connection thread: read 9P messages from the client socket and
+ * feed each one to l9p_connection_recv() until reading fails or the
+ * peer closes the connection.  The connection, the client socket and
+ * the softc are then released.
+ *
+ * arg is the connection's struct l9p_socket_softc.  Always returns
+ * NULL.
+ */
+static void *
+l9p_socket_thread(void *arg)
+{
+	struct l9p_socket_softc *sc = (struct l9p_socket_softc *)arg;
+	struct iovec iov;
+	void *buf;
+	size_t length;
+
+	for (;;) {
+		if (l9p_socket_readmsg(sc, &buf, &length) != 0)
+			break;
+
+		iov.iov_base = buf;
+		iov.iov_len = length;
+		l9p_connection_recv(sc->ls_conn, &iov, 1, NULL);
+		free(buf);
+	}
+
+	L9P_LOG(L9P_INFO, "connection closed");
+	l9p_connection_close(sc->ls_conn);
+	/*
+	 * Nothing else in this file closes the accepted socket, so do it
+	 * here or every client leaks a descriptor.
+	 * NOTE(review): assumes l9p_connection_close() has finished with
+	 * the transport by the time it returns -- confirm.
+	 */
+	(void) close(sc->ls_fd);
+	free(sc);
+	return (NULL);
+}
+
+/*
+ * Read one complete 9P message from the connection's socket.
+ *
+ * A message begins with a 32-bit little-endian length that counts the
+ * length field itself.  That field is read first, then the remainder.
+ *
+ * sc	connection state; sc->ls_fd is the socket to read from
+ * buf	out: newly allocated buffer holding the whole message,
+ *	length field included; the caller frees it
+ * size	out: total message length in bytes
+ *
+ * Returns 0 on success.  Returns -1 on read error, end of file, short
+ * read, or a length smaller than the length field; in those cases
+ * nothing is stored through buf/size and the buffer is released here.
+ *
+ * NOTE(review): assumes l9p_malloc()/l9p_realloc() never return NULL
+ * -- confirm.  No upper bound is applied to the advertised length.
+ */
+static int
+l9p_socket_readmsg(struct l9p_socket_softc *sc, void **buf, size_t *size)
+{
+	uint32_t msize;
+	size_t toread;
+	ssize_t ret;
+	void *buffer;
+	int fd = sc->ls_fd;
+
+	/* Descriptor 0 is valid; matches l9p_socket_send_response(). */
+	assert(fd >= 0);
+
+	buffer = l9p_malloc(sizeof(uint32_t));
+
+	ret = xread(fd, buffer, sizeof(uint32_t));
+	if (ret < 0) {
+		L9P_LOG(L9P_ERROR, "read(): %s", strerror(errno));
+		goto fail;
+	}
+
+	if ((size_t)ret != sizeof(uint32_t)) {
+		if (ret == 0) {
+			L9P_LOG(L9P_DEBUG, "%p: EOF", (void *)sc->ls_conn);
+		} else {
+			L9P_LOG(L9P_ERROR,
+			    "short read: %zd bytes of %zu expected",
+			    ret, sizeof(uint32_t));
+		}
+		goto fail;
+	}
+
+	msize = le32toh(*(uint32_t *)buffer);
+	/*
+	 * The length comes from the peer.  Anything smaller than the
+	 * length field would make "toread" wrap around and let the read
+	 * below run far past the end of the buffer.
+	 */
+	if (msize < sizeof(uint32_t)) {
+		L9P_LOG(L9P_ERROR, "invalid message size %u", msize);
+		goto fail;
+	}
+	toread = msize - sizeof(uint32_t);
+	buffer = l9p_realloc(buffer, msize);
+
+	ret = xread(fd, (char *)buffer + sizeof(uint32_t), toread);
+	if (ret < 0) {
+		L9P_LOG(L9P_ERROR, "read(): %s", strerror(errno));
+		goto fail;
+	}
+
+	if (ret != (ssize_t)toread) {
+		L9P_LOG(L9P_ERROR, "short read: %zd bytes of %zu expected",
+		    ret, toread);
+		goto fail;
+	}
+
+	*size = msize;
+	*buf = buffer;
+	L9P_LOG(L9P_INFO, "%p: read complete message, buf=%p size=%u",
+	    (void *)sc->ls_conn, buffer, msize);
+
+	return (0);
+
+fail:
+	free(buffer);
+	return (-1);
+}
+
+/*
+ * Transport callback: supply the buffer a response is built in.
+ *
+ * One buffer of the connection's lc_msize bytes is allocated and
+ * described by iov[0]; *niovp is set to 1.  Always returns 0.
+ */
+static int
+l9p_socket_get_response_buffer(struct l9p_request *req, struct iovec *iov,
+    size_t *niovp, void *arg __unused)
+{
+	size_t msize = req->lr_conn->lc_msize;
+
+	iov->iov_base = l9p_malloc(msize);
+	iov->iov_len = msize;
+	*niovp = 1;
+
+	return (0);
+}
+
+/*
+ * Transport callback: write a finished response to the client socket
+ * and release its buffer.
+ *
+ * iov		iov[0].iov_base is the response buffer
+ * iolen	number of bytes of the buffer to send
+ * arg		the connection's struct l9p_socket_softc
+ *
+ * Returns 0 if all iolen bytes were written, -1 otherwise.  The buffer
+ * is freed in both cases.
+ * NOTE(review): the original freed it only on success; confirm that no
+ * caller releases the buffer again after a failed send.
+ */
+static int
+l9p_socket_send_response(struct l9p_request *req __unused,
+    const struct iovec *iov, const size_t niov __unused, const size_t iolen,
+    void *arg)
+{
+	struct l9p_socket_softc *sc = (struct l9p_socket_softc *)arg;
+	void *buf = iov[0].iov_base;
+	int rc = 0;
+
+	assert(sc->ls_fd >= 0);
+
+	L9P_LOG(L9P_DEBUG, "%p: sending reply, buf=%p, size=%zu", arg,
+	    buf, iolen);
+
+	/* Compare as ssize_t so large lengths are not truncated to int. */
+	if (xwrite(sc->ls_fd, buf, iolen) != (ssize_t)iolen) {
+		L9P_LOG(L9P_ERROR, "short write: %s", strerror(errno));
+		rc = -1;
+	}
+
+	free(buf);
+	return (rc);
+}
+
+/*
+ * Transport callback: discard a response that will not be sent.
+ * Logs the buffer address and frees iov[0].iov_base.
+ */
+static void
+l9p_socket_drop_response(struct l9p_request *req __unused,
+    const struct iovec *iov, size_t niov __unused, void *arg)
+{
+	void *unsent = iov[0].iov_base;
+
+	L9P_LOG(L9P_DEBUG, "%p: drop buf=%p", arg, unsent);
+	free(unsent);
+}
+
+/*
+ * Read exactly count bytes from fd into buf, restarting after EINTR
+ * and after partial reads.
+ *
+ * Returns the number of bytes read, which is less than count only if
+ * end of file was reached first, or -1 on any read error other than
+ * EINTR.
+ */
+static ssize_t
+xread(int fd, void *buf, size_t count)
+{
+	char *cursor = (char *)buf;
+	size_t remaining = count;
+
+	while (remaining > 0) {
+		ssize_t n = read(fd, cursor, remaining);
+
+		if (n == 0)
+			break;		/* end of file */
+
+		if (n > 0) {
+			cursor += n;
+			remaining -= (size_t)n;
+			continue;
+		}
+
+		if (errno != EINTR)
+			return (-1);
+	}
+
+	return ((ssize_t)(count - remaining));
+}
+
+/*
+ * Write exactly count bytes from buf to fd, restarting after EINTR
+ * and after partial writes.
+ *
+ * Returns the number of bytes written, which is less than count only
+ * if write() reported zero bytes, or -1 on any write error other than
+ * EINTR.
+ */
+static ssize_t
+xwrite(int fd, void *buf, size_t count)
+{
+	char *cursor = (char *)buf;
+	size_t remaining = count;
+
+	while (remaining > 0) {
+		ssize_t n = write(fd, cursor, remaining);
+
+		if (n == 0)
+			break;		/* no progress possible */
+
+		if (n > 0) {
+			cursor += n;
+			remaining -= (size_t)n;
+			continue;
+		}
+
+		if (errno != EINTR)
+			return (-1);
+	}
+
+	return ((ssize_t)(count - remaining));
+}
diff --git a/usr/src/lib/lib9p/common/transport/socket.h b/usr/src/lib/lib9p/common/transport/socket.h
new file mode 100644
index 0000000000..df950ffb7d
--- /dev/null
+++ b/usr/src/lib/lib9p/common/transport/socket.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2016 Jakub Klama <jceel@FreeBSD.org>
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef LIB9P_SOCKET_H
+#define LIB9P_SOCKET_H
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include "../lib9p.h"
+
+/*
+ * l9p_start_server() listens on every address host/port resolves to
+ * and serves connections; it only returns (-1) after a failure.
+ */
+int l9p_start_server(struct l9p_server *server, const char *host,
+ const char *port);
+/*
+ * l9p_socket_accept() accepts one pending connection on the listening
+ * descriptor serv_fd and starts a thread to serve it.
+ */
+void l9p_socket_accept(struct l9p_server *server, int serv_fd);
+
+#endif /* LIB9P_SOCKET_H */
diff --git a/usr/src/lib/lib9p/common/utils.c b/usr/src/lib/lib9p/common/utils.c
new file mode 100644
index 0000000000..10c9683c0a
--- /dev/null
+++ b/usr/src/lib/lib9p/common/utils.c
@@ -0,0 +1,1363 @@
+/*
+ * Copyright 2016 Jakub Klama <jceel@FreeBSD.org>
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted providing that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <sys/param.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#if defined(__FreeBSD__)
+#include <sys/sbuf.h>
+#else
+#include "sbuf/sbuf.h"
+#endif
+#include "lib9p.h"
+#include "fcall.h"
+#include "linux_errno.h"
+
+#ifdef __illumos__
+#include <sys/sysmacros.h>
+#include <grp.h>
+#endif
+
+#ifdef __APPLE__
+ #define GETGROUPS_GROUP_TYPE_IS_INT
+#endif
+
+/*
+ * Number of elements in a true array (not a pointer). The argument is
+ * parenthesized everywhere it is used so that any array-valued
+ * expression can be passed without precedence surprises.
+ */
+#define	N(ary)	(sizeof(ary) / sizeof(*(ary)))
+
+/*
+ * One row of a bit-description table; see l9p_describe_bits() below.
+ * A row applies when (value & db_mask) == db_match, in which case
+ * db_name is printed. A table ends with a row whose db_name is NULL.
+ */
+struct descbits {
+ uint64_t db_mask; /* bits of the value this row examines */
+ uint64_t db_match; /* required value of the masked bits */
+ const char *db_name; /* name printed on a match; NULL ends table */
+};
+
+
+static bool l9p_describe_bits(const char *, uint64_t, const char *,
+ const struct descbits *, struct sbuf *);
+static void l9p_describe_fid(const char *, uint32_t, struct sbuf *);
+static void l9p_describe_mode(const char *, uint32_t, struct sbuf *);
+static void l9p_describe_name(const char *, char *, struct sbuf *);
+static void l9p_describe_perm(const char *, uint32_t, struct sbuf *);
+static void l9p_describe_lperm(const char *, uint32_t, struct sbuf *);
+static void l9p_describe_qid(const char *, struct l9p_qid *, struct sbuf *);
+static void l9p_describe_l9stat(const char *, struct l9p_stat *,
+ enum l9p_version, struct sbuf *);
+static void l9p_describe_statfs(const char *, struct l9p_statfs *,
+ struct sbuf *);
+static void l9p_describe_time(struct sbuf *, const char *, uint64_t, uint64_t);
+static void l9p_describe_readdir(struct sbuf *, struct l9p_f_io *);
+static void l9p_describe_size(const char *, uint64_t, struct sbuf *);
+static void l9p_describe_ugid(const char *, uint32_t, struct sbuf *);
+static void l9p_describe_getattr_mask(uint64_t, struct sbuf *);
+static void l9p_describe_unlinkat_flags(const char *, uint32_t, struct sbuf *);
+static const char *lookup_linux_errno(uint32_t, char *, size_t);
+
+/*
+ * Using indexed initializers, we can have these occur in any order.
+ * Using adjacent-string concatenation ("T" #name, "R" #name), we
+ * get both Tfoo and Rfoo strings with one copy of the name.
+ * Alas, there is no stupid cpp trick to lowercase-ify, so we
+ * have to write each name twice. In which case we might as well
+ * make the second one a string in the first place and not bother
+ * with the stringizing.
+ *
+ * This table should have entries for each enum value in fcall.h.
+ *
+ * The table is indexed by (message type - L9P__FIRST). Any slot
+ * without an initializer is NULL; l9p_describe_fcall() checks for
+ * that and reports such a type as unknown.
+ */
+/* X(FOO, "foo") sets the L9P_TFOO slot to "Tfoo" and L9P_RFOO to "Rfoo". */
+#define X(NAME, name) [L9P_T##NAME - L9P__FIRST] = "T" name, \
+ [L9P_R##NAME - L9P__FIRST] = "R" name
+static const char *ftype_names[] = {
+ X(VERSION, "version"),
+ X(AUTH, "auth"),
+ X(ATTACH, "attach"),
+ X(ERROR, "error"),
+ X(LERROR, "lerror"),
+ X(FLUSH, "flush"),
+ X(WALK, "walk"),
+ X(OPEN, "open"),
+ X(CREATE, "create"),
+ X(READ, "read"),
+ X(WRITE, "write"),
+ X(CLUNK, "clunk"),
+ X(REMOVE, "remove"),
+ X(STAT, "stat"),
+ X(WSTAT, "wstat"),
+ X(STATFS, "statfs"),
+ X(LOPEN, "lopen"),
+ X(LCREATE, "lcreate"),
+ X(SYMLINK, "symlink"),
+ X(MKNOD, "mknod"),
+ X(RENAME, "rename"),
+ X(READLINK, "readlink"),
+ X(GETATTR, "getattr"),
+ X(SETATTR, "setattr"),
+ X(XATTRWALK, "xattrwalk"),
+ X(XATTRCREATE, "xattrcreate"),
+ X(READDIR, "readdir"),
+ X(FSYNC, "fsync"),
+ X(LOCK, "lock"),
+ X(GETLOCK, "getlock"),
+ X(LINK, "link"),
+ X(MKDIR, "mkdir"),
+ X(RENAMEAT, "renameat"),
+ X(UNLINKAT, "unlinkat"),
+};
+#undef X
+
+/*
+ * Build in <iov2> a view of <iov1> that starts <seek> bytes into the
+ * data described by <iov1>. Segments wholly before the seek point are
+ * dropped; the segment containing it is trimmed at the front. The
+ * resulting entries point into the same buffers as <iov1>; at most
+ * <niov1> entries are written to <iov2> and the number written is
+ * stored in *<niov2> (0 if <seek> is at or beyond the end of the data
+ * and no segment remains).
+ */
+void
+l9p_seek_iov(const struct iovec *iov1, size_t niov1, struct iovec *iov2,
+    size_t *niov2, size_t seek)
+{
+	size_t first, out;
+	size_t skip = seek;
+
+	assert(niov1 <= L9P_MAX_IOV);
+
+	/*
+	 * Step over every segment that the seek distance covers
+	 * completely. On exit, <first> is the first segment to keep and
+	 * <skip> is the number of bytes to drop from its front.
+	 */
+	for (first = 0; first < niov1; first++) {
+		if (skip < iov1[first].iov_len)
+			break;
+		skip -= iov1[first].iov_len;
+	}
+
+	/* Copy the tail; only the first copied segment is trimmed. */
+	for (out = 0; first + out < niov1; out++) {
+		const struct iovec *src = &iov1[first + out];
+
+		iov2[out].iov_base = (char *)src->iov_base + skip;
+		iov2[out].iov_len = src->iov_len - skip;
+		skip = 0;
+	}
+
+	*niov2 = out;
+}
+
+/*
+ * Shorten, in place, the data described by <iov> to at most <length>
+ * bytes. The first segment that cannot be kept whole has its iov_len
+ * cut down (possibly to 0) and the count of segments up to and
+ * including it is returned. If every segment fits, nothing is
+ * modified and <niov> is returned.
+ */
+size_t
+l9p_truncate_iov(struct iovec *iov, size_t niov, size_t length)
+{
+	size_t idx;
+	size_t budget = length;		/* bytes still allowed */
+
+	for (idx = 0; idx < niov; idx++) {
+		if (budget < iov[idx].iov_len) {
+			iov[idx].iov_len = budget;
+			return (idx + 1);
+		}
+		budget -= iov[idx].iov_len;
+	}
+
+	return (niov);
+}
+
+/*
+ * This is a wrapper for getgrouplist() that returns the group list
+ * in calloc()ed memory, and papers over FreeBSD vs Mac differences
+ * in the getgrouplist() argument types. The returned array is not
+ * freed here; that is left to the caller.
+ *
+ * Note that this function guarantees that *either*:
+ *     return value != NULL and *angroups has been set
+ * or: return value == NULL and *angroups is 0
+ */
+gid_t *
+l9p_getgrlist(const char *name, gid_t basegid, int *angroups)
+{
+#ifdef GETGROUPS_GROUP_TYPE_IS_INT
+	int i, *int_groups;
+#endif
+	gid_t *groups;
+	int ngroups;
+
+	/*
+	 * Establish the failure half of the guarantee up front, so that
+	 * every "return (NULL)" below (allocation failure as well as
+	 * getgrouplist() failure) leaves *angroups at 0.
+	 */
+	*angroups = 0;
+
+	/*
+	 * Todo, perhaps: while getgrouplist() returns -1, expand.
+	 * For now just use NGROUPS_MAX.
+	 */
+	ngroups = NGROUPS_MAX;
+	groups = calloc((size_t)ngroups, sizeof(*groups));
+#ifdef GETGROUPS_GROUP_TYPE_IS_INT
+	int_groups = groups ? calloc((size_t)ngroups, sizeof(*int_groups)) :
+	    NULL;
+	if (int_groups == NULL) {
+		free(groups);
+		groups = NULL;
+	}
+#endif
+	if (groups == NULL)
+		return (NULL);
+#ifdef GETGROUPS_GROUP_TYPE_IS_INT
+	if (getgrouplist(name, (int)basegid, int_groups, &ngroups) < 0) {
+		free(groups);
+		free(int_groups);
+		return (NULL);
+	}
+	/* Convert the int-typed result into the gid_t array we return. */
+	for (i = 0; i < ngroups; i++)
+		groups[i] = (gid_t)int_groups[i];
+	free(int_groups);
+#else
+	if (getgrouplist(name, basegid, groups, &ngroups) < 0) {
+		free(groups);
+		return (NULL);
+	}
+#endif
+	*angroups = ngroups;
+	return (groups);
+}
+
+/*
+ * For the various debug describe ops: decode bits in a bit-field-y
+ * value. For example, we might produce:
+ * value=0x3c[FOO,BAR,QUUX,?0x20]
+ * when FOO is bit 0x10, BAR is 0x08, and QUUX is 0x04 (as defined
+ * by the table). This leaves 0x20 (bit 5) as a mystery, while bits
+ * 4, 3, and 2 were decoded. (Bits 0 and 1 were 0 on input hence
+ * were not attempted here.)
+ *
+ * For general use we take a uint64_t <value>. The bit description
+ * table <db> is an array of {mask, match, str} values ending with
+ * {0, 0, NULL}.
+ *
+ * If <str> is non-NULL we'll print it, followed by <value> in hex
+ * (if str is NULL we'll print neither).
+ *
+ * For convenience, you can use a mask-and-match value, e.g., to
+ * decode a 2-bit field in bits 0 and 1 you can mask against 3 and
+ * match the values 0, 1, 2, and 3. To handle this, make sure that
+ * all table entries sharing the same mask are adjacent: once one
+ * of them matches, the following entries with that mask are skipped.
+ *
+ * If there are any nonzero undecoded bits, print them (as ?0x...)
+ * after all the decode-able bits have been handled.
+ *
+ * The <oc> argument defines the open and close bracket characters,
+ * typically "[]", that surround the decoded names. If NULL, no
+ * brackets are added, else oc[0] goes in front of the first item
+ * printed and oc[1] at the end, after any <str><value> part. If
+ * nothing is decoded and no bits are left over, no brackets are
+ * printed at all.
+ *
+ * Returns true if it printed anything (other than the implied
+ * str-and-value, that is).
+ */
+static bool
+l9p_describe_bits(const char *str, uint64_t value, const char *oc,
+ const struct descbits *db, struct sbuf *sb)
+{
+ const char *sep;
+ char bracketbuf[2] = "";
+ bool printed = false;
+
+ if (str != NULL)
+ sbuf_printf(sb, "%s0x%" PRIx64, str, value);
+
+ /* The first separator is the open bracket (or "" if there is none). */
+ if (oc != NULL)
+ bracketbuf[0] = oc[0];
+ sep = bracketbuf;
+ for (; db->db_name != NULL; db++) {
+ if ((value & db->db_mask) == db->db_match) {
+ sbuf_printf(sb, "%s%s", sep, db->db_name);
+ sep = ",";
+ printed = true;
+
+ /*
+ * Clear the field, and make sure we
+ * won't match a zero-valued field with
+ * this same mask.
+ */
+ value &= ~db->db_mask;
+ while (db[1].db_mask == db->db_mask &&
+ db[1].db_name != NULL)
+ db++;
+ }
+ }
+ /* Whatever is still set was not described by the table. */
+ if (value != 0) {
+ sbuf_printf(sb, "%s?0x%" PRIx64, sep, value);
+ printed = true;
+ }
+ /* Close the bracket only if the open bracket was actually emitted. */
+ if (printed && oc != NULL) {
+ bracketbuf[0] = oc[1];
+ sbuf_cat(sb, bracketbuf);
+ }
+ return (printed);
+}
+
+/*
+ * Append <str> followed by the file ID <fid>, in decimal, to <sb>.
+ */
+static void
+l9p_describe_fid(const char *str, uint32_t fid, struct sbuf *sb)
+{
+	(void) sbuf_printf(sb, "%s%" PRIu32, str, fid);
+}
+
+/*
+ * Append <str> followed by the numeric user or group ID <ugid>, in
+ * decimal, to <sb>.
+ */
+static void
+l9p_describe_ugid(const char *str, uint32_t ugid, struct sbuf *sb)
+{
+	(void) sbuf_printf(sb, "%s%" PRIu32, str, ugid);
+}
+
+/*
+ * Show file mode (O_RDWR, O_RDONLY, etc). The argument is
+ * an l9p_omode, not a Linux flags mode. Linux flags are
+ * decoded with l9p_describe_lflags.
+ *
+ * Output is <str>, the mode in hex, then the decoded names in
+ * square brackets (see l9p_describe_bits).
+ */
+static void
+l9p_describe_mode(const char *str, uint32_t mode, struct sbuf *sb)
+{
+ static const struct descbits bits[] = {
+ /*
+ * Access mode is a multi-bit field: these rows share one
+ * mask and must stay adjacent for l9p_describe_bits().
+ */
+ { L9P_OACCMODE, L9P_OREAD, "OREAD" },
+ { L9P_OACCMODE, L9P_OWRITE, "OWRITE" },
+ { L9P_OACCMODE, L9P_ORDWR, "ORDWR" },
+ { L9P_OACCMODE, L9P_OEXEC, "OEXEC" },
+
+ /* single-bit flags */
+ { L9P_OCEXEC, L9P_OCEXEC, "OCEXEC" },
+ { L9P_ODIRECT, L9P_ODIRECT, "ODIRECT" },
+ { L9P_ORCLOSE, L9P_ORCLOSE, "ORCLOSE" },
+ { L9P_OTRUNC, L9P_OTRUNC, "OTRUNC" },
+ { 0, 0, NULL }
+ };
+
+ (void) l9p_describe_bits(str, mode, "[]", bits, sb);
+}
+
+/*
+ * Show Linux mode/flags: <str>, the flags in hex, then the decoded
+ * names in square brackets (see l9p_describe_bits).
+ */
+static void
+l9p_describe_lflags(const char *str, uint32_t flags, struct sbuf *sb)
+{
+	static const struct descbits bits[] = {
+		/*
+		 * Access mode is a multi-bit field: these rows share one
+		 * mask and must stay adjacent for l9p_describe_bits().
+		 */
+		{ L9P_OACCMODE,	L9P_OREAD,	"O_READ" },
+		{ L9P_OACCMODE,	L9P_OWRITE,	"O_WRITE" },
+		{ L9P_OACCMODE,	L9P_ORDWR,	"O_RDWR" },
+		{ L9P_OACCMODE,	L9P_OEXEC,	"O_EXEC" },
+
+		/* single-bit flags, one row each */
+		{ L9P_L_O_APPEND,	L9P_L_O_APPEND,		"O_APPEND" },
+		{ L9P_L_O_CLOEXEC,	L9P_L_O_CLOEXEC,	"O_CLOEXEC" },
+		{ L9P_L_O_CREAT,	L9P_L_O_CREAT,		"O_CREAT" },
+		{ L9P_L_O_DIRECT,	L9P_L_O_DIRECT,		"O_DIRECT" },
+		{ L9P_L_O_DIRECTORY,	L9P_L_O_DIRECTORY,	"O_DIRECTORY" },
+		{ L9P_L_O_DSYNC,	L9P_L_O_DSYNC,		"O_DSYNC" },
+		{ L9P_L_O_EXCL,		L9P_L_O_EXCL,		"O_EXCL" },
+		{ L9P_L_O_FASYNC,	L9P_L_O_FASYNC,		"O_FASYNC" },
+		{ L9P_L_O_LARGEFILE,	L9P_L_O_LARGEFILE,	"O_LARGEFILE" },
+		{ L9P_L_O_NOATIME,	L9P_L_O_NOATIME,	"O_NOATIME" },
+		{ L9P_L_O_NOCTTY,	L9P_L_O_NOCTTY,		"O_NOCTTY" },
+		{ L9P_L_O_NOFOLLOW,	L9P_L_O_NOFOLLOW,	"O_NOFOLLOW" },
+		{ L9P_L_O_NONBLOCK,	L9P_L_O_NONBLOCK,	"O_NONBLOCK" },
+		{ L9P_L_O_PATH,		L9P_L_O_PATH,		"O_PATH" },
+		{ L9P_L_O_SYNC,		L9P_L_O_SYNC,		"O_SYNC" },
+		/* (a second, identical O_TMPFILE row used to follow here) */
+		{ L9P_L_O_TMPFILE,	L9P_L_O_TMPFILE,	"O_TMPFILE" },
+		{ L9P_L_O_TRUNC,	L9P_L_O_TRUNC,		"O_TRUNC" },
+		{ 0, 0, NULL }
+	};
+
+	(void) l9p_describe_bits(str, flags, "[]", bits, sb);
+}
+
+/*
+ * Append <str> and then a file name (or any other string that may be
+ * very long) to <sb>. A real string is wrapped in double quotes; one
+ * longer than 32 bytes is cut to its first 29 bytes followed by "...".
+ * A NULL name is shown as <null>, without quotes, so that it can be
+ * told apart from a string.
+ */
+static void
+l9p_describe_name(const char *str, char *name, struct sbuf *sb)
+{
+	enum { NAME_SHOW_MAX = 32 };
+	size_t namelen;
+
+	if (name == NULL) {
+		sbuf_printf(sb, "%s<null>", str);
+		return;
+	}
+
+	namelen = strlen(name);
+	if (namelen <= NAME_SHOW_MAX) {
+		/* Short enough to show whole. */
+		sbuf_printf(sb, "%s\"%.*s\"", str, (int)namelen, name);
+		return;
+	}
+
+	/* Too long: leave room for the three dots within the limit. */
+	sbuf_printf(sb, "%s\"%.*s...\"", str, NAME_SHOW_MAX - 3, name);
+}
+
+#define STRMODE_SIZE 12
+
+#ifdef __illumos__
+/*
+ * Local strmode() for illumos builds. Fills <bp> with an "ls -l"
+ * style rendering of <mode>: a placeholder type character, nine
+ * permission characters, a trailing space and a terminating NUL
+ * (12 bytes in all, i.e. STRMODE_SIZE).
+ */
+static void
+strmode(mode_t mode, char *bp)
+{
+	char *const sbp = bp;
+
+	/*
+	 * The single caller does not pass in the file type as part of 'mode',
+	 * and ignores the first character in the returned buffer anyway.
+	 */
+	*bp++ = '?';
+
+	/* Fully parenthesized so it is safe inside any expression. */
+#define	ONE(_cmp, _ch)	(((mode & (_cmp)) != 0) ? (_ch) : '-')
+	*bp++ = ONE(S_IRUSR, 'r');
+	*bp++ = ONE(S_IWUSR, 'w');
+	switch (mode & (S_ISUID|S_IXUSR)) {
+	case S_ISUID|S_IXUSR:
+		*bp++ = 's';
+		break;
+	case S_ISUID:
+		*bp++ = 'S';
+		break;
+	case S_IXUSR:
+		*bp++ = 'x';
+		break;
+	default:
+		*bp++ = '-';
+	}
+
+	*bp++ = ONE(S_IRGRP, 'r');
+	*bp++ = ONE(S_IWGRP, 'w');
+	/*
+	 * Switch on the setgid and group-execute bits only. Folding the
+	 * file-type bit into the switch value, as was done before, made
+	 * a mode with both the type bit and S_IXGRP set fall through
+	 * to '-'.
+	 */
+	switch (mode & (S_ISGID|S_IXGRP)) {
+	case S_ISGID|S_IXGRP:
+		*bp++ = 's';
+		break;
+	case S_ISGID:
+		/* setgid without group-execute: 'L' on a regular file */
+		*bp++ = ((mode & S_IFMT) == S_IFREG) ? 'L' : 'S';
+		break;
+	case S_IXGRP:
+		*bp++ = 'x';
+		break;
+	default:
+		*bp++ = '-';
+	}
+
+	*bp++ = ONE(S_IROTH, 'r');
+	*bp++ = ONE(S_IWOTH, 'w');
+	switch (mode & (S_ISVTX|S_IXOTH)) {
+	case S_ISVTX|S_IXOTH:
+		*bp++ = 't';
+		break;
+	case S_ISVTX:
+		*bp++ = 'T';
+		break;
+	case S_IXOTH:
+		*bp++ = 'x';
+		break;
+	default:
+		*bp++ = '-';
+	}
+
+	*bp++ = ' ';
+	*bp = '\0';
+
+	/* bp now addresses the NUL, which must lie inside the buffer. */
+	assert(bp - sbp < STRMODE_SIZE);
+#undef ONE
+}
+#endif /* __illumos__ */
+
+/*
+ * Show permissions (rwx etc) as <str> followed by the nine
+ * permission characters in angle brackets. The value is also
+ * printed in hex, before the brackets, but only when it has bits
+ * set outside the rwx bits (0777).
+ */
+static void
+l9p_describe_perm(const char *str, uint32_t mode, struct sbuf *sb)
+{
+	const uint32_t extra = mode & ~(uint32_t)0777;
+	char rwx[STRMODE_SIZE];
+
+	/* rwx[0] is the file-type slot; the permissions start at rwx[1]. */
+	strmode(mode & 0777, rwx);
+
+	if (extra == 0) {
+		sbuf_printf(sb, "%s<%.9s>", str, rwx + 1);
+		return;
+	}
+	sbuf_printf(sb, "%s0x%" PRIx32 "<%.9s>", str, mode, rwx + 1);
+}
+
+/*
+ * Show "extended" permissions: regular permissions, but also the
+ * various DM* extension bits from 9P2000.u.
+ *
+ * Output is <str>, then inside square brackets the names of any
+ * bits set outside 0777 (bits not in the table show up as ?0x...),
+ * a comma if any were printed, and the rwx permissions.
+ */
+static void
+l9p_describe_ext_perm(const char *str, uint32_t mode, struct sbuf *sb)
+{
+ static const struct descbits bits[] = {
+ { L9P_DMDIR, L9P_DMDIR, "DMDIR" },
+ { L9P_DMAPPEND, L9P_DMAPPEND, "DMAPPEND" },
+ { L9P_DMEXCL, L9P_DMEXCL, "DMEXCL" },
+ { L9P_DMMOUNT, L9P_DMMOUNT, "DMMOUNT" },
+ { L9P_DMAUTH, L9P_DMAUTH, "DMAUTH" },
+ { L9P_DMTMP, L9P_DMTMP, "DMTMP" },
+ { L9P_DMSYMLINK, L9P_DMSYMLINK, "DMSYMLINK" },
+ { L9P_DMDEVICE, L9P_DMDEVICE, "DMDEVICE" },
+ { L9P_DMNAMEDPIPE, L9P_DMNAMEDPIPE, "DMNAMEDPIPE" },
+ { L9P_DMSOCKET, L9P_DMSOCKET, "DMSOCKET" },
+ { L9P_DMSETUID, L9P_DMSETUID, "DMSETUID" },
+ { L9P_DMSETGID, L9P_DMSETGID, "DMSETGID" },
+ { 0, 0, NULL }
+ };
+ bool need_sep;
+
+ /* We print the brackets ourselves, so pass no <str> and no <oc>. */
+ sbuf_printf(sb, "%s[", str);
+ need_sep = l9p_describe_bits(NULL, mode & ~(uint32_t)0777, NULL,
+ bits, sb);
+ /* Only separate with a comma if a name was printed above. */
+ l9p_describe_perm(need_sep ? "," : "", mode & 0777, sb);
+ sbuf_cat(sb, "]");
+}
+
+/*
+ * Show Linux-specific permissions: regular permissions, but also
+ * the S_IFMT field.
+ *
+ * Output is <str>, then inside square brackets the file type name
+ * (if the S_IFMT field matches a table entry), any other bits set
+ * outside 0777 as ?0x..., a comma if anything was printed, and the
+ * rwx permissions.
+ */
+static void
+l9p_describe_lperm(const char *str, uint32_t mode, struct sbuf *sb)
+{
+ /* All rows share the S_IFMT mask, so at most one of them matches. */
+ static const struct descbits bits[] = {
+ { S_IFMT, S_IFIFO, "S_IFIFO" },
+ { S_IFMT, S_IFCHR, "S_IFCHR" },
+ { S_IFMT, S_IFDIR, "S_IFDIR" },
+ { S_IFMT, S_IFBLK, "S_IFBLK" },
+ { S_IFMT, S_IFREG, "S_IFREG" },
+ { S_IFMT, S_IFLNK, "S_IFLNK" },
+ { S_IFMT, S_IFSOCK, "S_IFSOCK" },
+#ifdef __illumos__
+ { S_IFMT, S_IFDOOR, "S_IFDOOR" },
+ { S_IFMT, S_IFPORT, "S_IFPORT" },
+#endif
+ { 0, 0, NULL }
+ };
+ bool need_sep;
+
+ /* We print the brackets ourselves, so pass no <str> and no <oc>. */
+ sbuf_printf(sb, "%s[", str);
+ need_sep = l9p_describe_bits(NULL, mode & ~(uint32_t)0777, NULL,
+ bits, sb);
+ /* Only separate with a comma if something was printed above. */
+ l9p_describe_perm(need_sep ? "," : "", mode & 0777, sb);
+ sbuf_cat(sb, "]");
+}
+
+/*
+ * Show qid (<type, version, path> tuple).
+ *
+ * Output is <str>, then "<", the type in hex with its decoded names
+ * in square brackets, the version in decimal, the path as 16 hex
+ * digits, and a closing ">". <qid> must not be NULL.
+ */
+static void
+l9p_describe_qid(const char *str, struct l9p_qid *qid, struct sbuf *sb)
+{
+ static const struct descbits bits[] = {
+ /*
+ * NB: L9P_QTFILE is 0, i.e., is implied by no
+ * other bits being set. We get this produced
+ * when we mask against 0xff and compare for
+ * L9P_QTFILE, but we must do it first so that
+ * we mask against the original (not-adjusted)
+ * value.
+ */
+ { 0xff, L9P_QTFILE, "FILE" },
+ { L9P_QTDIR, L9P_QTDIR, "DIR" },
+ { L9P_QTAPPEND, L9P_QTAPPEND, "APPEND" },
+ { L9P_QTEXCL, L9P_QTEXCL, "EXCL" },
+ { L9P_QTMOUNT, L9P_QTMOUNT, "MOUNT" },
+ { L9P_QTAUTH, L9P_QTAUTH, "AUTH" },
+ { L9P_QTTMP, L9P_QTTMP, "TMP" },
+ { L9P_QTSYMLINK, L9P_QTSYMLINK, "SYMLINK" },
+ { 0, 0, NULL }
+ };
+
+ assert(qid != NULL);
+
+ sbuf_cat(sb, str);
+ /* The "<" passed as <str> here opens the tuple; ">" is added below. */
+ (void) l9p_describe_bits("<", qid->type, "[]", bits, sb);
+ sbuf_printf(sb, ",%" PRIu32 ",0x%016" PRIx64 ">",
+ qid->version, qid->path);
+}
+
+/*
+ * Append <str> followed by <size>, in decimal, to <sb>.
+ */
+static void
+l9p_describe_size(const char *str, uint64_t size, struct sbuf *sb)
+{
+	(void) sbuf_printf(sb, "%s%" PRIu64, str, size);
+}
+
+/*
+ * Show l9stat (including 9P2000.u extensions if appropriate).
+ *
+ * <str> is printed first. atime, mtime and length are omitted when
+ * they hold the all-ones value. <version> selects whether the
+ * 9P2000.u fields are shown. <st> must not be NULL.
+ */
+static void
+l9p_describe_l9stat(const char *str, struct l9p_stat *st,
+ enum l9p_version version, struct sbuf *sb)
+{
+ bool dotu = version >= L9P_2000U;
+
+ assert(st != NULL);
+
+ sbuf_printf(sb, "%stype=0x%04" PRIx32 " dev=0x%08" PRIx32, str,
+ st->type, st->dev);
+ l9p_describe_qid(" qid=", &st->qid, sb);
+ l9p_describe_ext_perm(" mode=", st->mode, sb);
+ if (st->atime != (uint32_t)-1)
+ sbuf_printf(sb, " atime=%" PRIu32, st->atime);
+ if (st->mtime != (uint32_t)-1)
+ sbuf_printf(sb, " mtime=%" PRIu32, st->mtime);
+ if (st->length != (uint64_t)-1)
+ sbuf_printf(sb, " length=%" PRIu64, st->length);
+ l9p_describe_name(" name=", st->name, sb);
+ /*
+ * It's pretty common to have NULL uid+gid+muid. They're
+ * just noise if NULL *and* dot-u; decode only if non-null
+ * or not-dot-u.
+ */
+ if (st->uid != NULL || !dotu)
+ l9p_describe_name(" uid=", st->uid, sb);
+ if (st->gid != NULL || !dotu)
+ l9p_describe_name(" gid=", st->gid, sb);
+ if (st->muid != NULL || !dotu)
+ l9p_describe_name(" muid=", st->muid, sb);
+ if (dotu) {
+ /* 9P2000.u additions: extension string and numeric IDs. */
+ if (st->extension != NULL)
+ l9p_describe_name(" extension=", st->extension, sb);
+ sbuf_printf(sb,
+ " n_uid=%" PRIu32 " n_gid=%" PRIu32 " n_muid=%" PRIu32,
+ st->n_uid, st->n_gid, st->n_muid);
+ }
+}
+
+/*
+ * Show statfs results: <str>, then the type (hex), block size,
+ * block and file counts, file system ID (hex) and maximum name
+ * length, each as a name=value pair. <st> must not be NULL.
+ */
+static void
+l9p_describe_statfs(const char *str, struct l9p_statfs *st, struct sbuf *sb)
+{
+
+	assert(st != NULL);
+
+	/*
+	 * The format used to end in a '>' although nothing here (or
+	 * in the caller's prefix) opens a '<'; the stray character
+	 * has been dropped so the output is balanced.
+	 */
+	sbuf_printf(sb, "%stype=0x%04lx bsize=%lu blocks=%" PRIu64
+	    " bfree=%" PRIu64 " bavail=%" PRIu64 " files=%" PRIu64
+	    " ffree=%" PRIu64 " fsid=0x%" PRIx64 " namelen=%" PRIu32,
+	    str, (u_long)st->type, (u_long)st->bsize, st->blocks,
+	    st->bfree, st->bavail, st->files,
+	    st->ffree, st->fsid, st->namelen);
+}
+
+/*
+ * Decode a <seconds,nsec> timestamp: append <s> and then the time
+ * as seconds.nanoseconds (nanoseconds zero-padded to nine digits).
+ * An out-of-range nanoseconds value (more than 999999999) is
+ * reported as such rather than printed as a fraction.
+ *
+ * Perhaps should use asctime_r. For now, raw values.
+ */
+static void
+l9p_describe_time(struct sbuf *sb, const char *s, uint64_t sec, uint64_t nsec)
+{
+
+	sbuf_cat(sb, s);
+	if (nsec > 999999999) {
+		/* (the format used to end with an unbalanced ')') */
+		sbuf_printf(sb, "%" PRIu64 ".<invalid nsec %" PRIu64 ">",
+		    sec, nsec);
+	} else {
+		sbuf_printf(sb, "%" PRIu64 ".%09" PRIu64, sec, nsec);
+	}
+}
+
+/*
+ * Decode readdir data (.L format, variable length names).
+ *
+ * As built today (notyet undefined) this prints only the byte
+ * count from <io>, or an EOF marker when the count is 0. The
+ * per-entry decoding is kept under #ifdef notyet.
+ */
+static void
+l9p_describe_readdir(struct sbuf *sb, struct l9p_f_io *io)
+{
+ uint32_t count;
+#ifdef notyet
+ int i;
+ struct l9p_message msg;
+ struct l9p_dirent de;
+#endif
+
+ if ((count = io->count) == 0) {
+ sbuf_printf(sb, " EOF (count=0)");
+ return;
+ }
+
+ /*
+ * Can't do this yet because we do not have the original
+ * req.
+ *
+ * NOTE(review): the notyet code below would not compile as
+ * written: "req" is not declared in this function, and the
+ * l9p_describe_name() call is missing its sbuf argument.
+ */
+#ifdef notyet
+ sbuf_printf(sb, " count=%" PRIu32 " [", count);
+
+ l9p_init_msg(&msg, req, L9P_UNPACK);
+ for (i = 0; msg.lm_size < count; i++) {
+ if (l9p_pudirent(&msg, &de) < 0) {
+ sbuf_printf(sb, " bad count");
+ break;
+ }
+
+ sbuf_printf(sb, i ? ", " : " ");
+ l9p_describe_qid(" qid=", &de.qid, sb);
+ sbuf_printf(sb, " offset=%" PRIu64 " type=%d",
+ de.offset, de.type);
+ l9p_describe_name(" name=", de.name);
+ free(de.name);
+ }
+ sbuf_printf(sb, "]=%d dir entries", i);
+#else /* notyet */
+ sbuf_printf(sb, " count=%" PRIu32, count);
+#endif
+}
+
+/*
+ * Decode Tgetattr request_mask field. Output is " request_mask=",
+ * the mask in hex, then the decoded names in square brackets.
+ */
+static void
+l9p_describe_getattr_mask(uint64_t request_mask, struct sbuf *sb)
+{
+ static const struct descbits bits[] = {
+ /*
+ * Note: ALL and BASIC must occur first and second.
+ * This is a little dirty: it depends on the way the
+ * describe_bits code clears the values. If we
+ * match ALL, we clear all those bits and do not
+ * match BASIC; if we match BASIC, we clear all
+ * those bits and do not match individual bits. Thus
+ * if we have BASIC but not all the additional bits,
+ * we'll see, e.g., [BASIC,BTIME,GEN]; if we have
+ * all the additional bits too, we'll see [ALL].
+ *
+ * Since l9p_describe_bits() prints any leftover bits
+ * as ?0x..., we'll also spot any bits added to the
+ * protocol since we made this table.
+ */
+ { L9PL_GETATTR_ALL, L9PL_GETATTR_ALL, "ALL" },
+ { L9PL_GETATTR_BASIC, L9PL_GETATTR_BASIC, "BASIC" },
+
+ /* individual bits in BASIC */
+ { L9PL_GETATTR_MODE, L9PL_GETATTR_MODE, "MODE" },
+ { L9PL_GETATTR_NLINK, L9PL_GETATTR_NLINK, "NLINK" },
+ { L9PL_GETATTR_UID, L9PL_GETATTR_UID, "UID" },
+ { L9PL_GETATTR_GID, L9PL_GETATTR_GID, "GID" },
+ { L9PL_GETATTR_RDEV, L9PL_GETATTR_RDEV, "RDEV" },
+ { L9PL_GETATTR_ATIME, L9PL_GETATTR_ATIME, "ATIME" },
+ { L9PL_GETATTR_MTIME, L9PL_GETATTR_MTIME, "MTIME" },
+ { L9PL_GETATTR_CTIME, L9PL_GETATTR_CTIME, "CTIME" },
+ { L9PL_GETATTR_INO, L9PL_GETATTR_INO, "INO" },
+ { L9PL_GETATTR_SIZE, L9PL_GETATTR_SIZE, "SIZE" },
+ { L9PL_GETATTR_BLOCKS, L9PL_GETATTR_BLOCKS, "BLOCKS" },
+
+ /* additional bits in ALL */
+ { L9PL_GETATTR_BTIME, L9PL_GETATTR_BTIME, "BTIME" },
+ { L9PL_GETATTR_GEN, L9PL_GETATTR_GEN, "GEN" },
+ { L9PL_GETATTR_DATA_VERSION, L9PL_GETATTR_DATA_VERSION,
+ "DATA_VERSION" },
+ { 0, 0, NULL }
+ };
+
+ (void) l9p_describe_bits(" request_mask=", request_mask, "[]", bits,
+ sb);
+}
+
+/*
+ * Decode Tunlinkat flags. Output is <str>, the flags in hex, then
+ * the decoded names in square brackets; any bit other than
+ * AT_REMOVEDIR is shown as ?0x... by l9p_describe_bits().
+ */
+static void
+l9p_describe_unlinkat_flags(const char *str, uint32_t flags, struct sbuf *sb)
+{
+ static const struct descbits bits[] = {
+ { L9PL_AT_REMOVEDIR, L9PL_AT_REMOVEDIR, "AT_REMOVEDIR" },
+ { 0, 0, NULL }
+ };
+
+ (void) l9p_describe_bits(str, flags, "[]", bits, sb);
+}
+
+/*
+ * Map a Linux error number, as carried in Rlerror, to a description.
+ *
+ * Numbers that have an entry in the table below use that entry.
+ * Other numbers up to ERANGE are handed to strerror(). Anything
+ * else is formatted as "Unknown error N" into the caller's <buf>
+ * (of size <len>), which is the only case in which <buf> is used.
+ *
+ * Returns a pointer to the description; it is either a table
+ * string, the result of strerror(), or <buf>.
+ */
+static const char *
+lookup_linux_errno(uint32_t linux_errno, char *buf, size_t len)
+{
+	/*
+	 * Error numbers in the "base" range (1..ERANGE) are common
+	 * across BSD, MacOS, Linux, and Plan 9.
+	 *
+	 * Error numbers outside that range require translation.
+	 *
+	 * The table never changes, so it is static rather than being
+	 * set up again on every call. (This relies on the *_STR
+	 * macros expanding to constant expressions, as the naming
+	 * suggests.)
+	 */
+	static const char *const table[] = {
+#define	X(name)	[name] = name ## _STR
+		X(LINUX_EAGAIN),
+		X(LINUX_EDEADLK),
+		X(LINUX_ENAMETOOLONG),
+		X(LINUX_ENOLCK),
+		X(LINUX_ENOSYS),
+		X(LINUX_ENOTEMPTY),
+		X(LINUX_ELOOP),
+		X(LINUX_ENOMSG),
+		X(LINUX_EIDRM),
+		X(LINUX_ECHRNG),
+		X(LINUX_EL2NSYNC),
+		X(LINUX_EL3HLT),
+		X(LINUX_EL3RST),
+		X(LINUX_ELNRNG),
+		X(LINUX_EUNATCH),
+		X(LINUX_ENOCSI),
+		X(LINUX_EL2HLT),
+		X(LINUX_EBADE),
+		X(LINUX_EBADR),
+		X(LINUX_EXFULL),
+		X(LINUX_ENOANO),
+		X(LINUX_EBADRQC),
+		X(LINUX_EBADSLT),
+		X(LINUX_EBFONT),
+		X(LINUX_ENOSTR),
+		X(LINUX_ENODATA),
+		X(LINUX_ETIME),
+		X(LINUX_ENOSR),
+		X(LINUX_ENONET),
+		X(LINUX_ENOPKG),
+		X(LINUX_EREMOTE),
+		X(LINUX_ENOLINK),
+		X(LINUX_EADV),
+		X(LINUX_ESRMNT),
+		X(LINUX_ECOMM),
+		X(LINUX_EPROTO),
+		X(LINUX_EMULTIHOP),
+		X(LINUX_EDOTDOT),
+		X(LINUX_EBADMSG),
+		X(LINUX_EOVERFLOW),
+		X(LINUX_ENOTUNIQ),
+		X(LINUX_EBADFD),
+		X(LINUX_EREMCHG),
+		X(LINUX_ELIBACC),
+		X(LINUX_ELIBBAD),
+		X(LINUX_ELIBSCN),
+		X(LINUX_ELIBMAX),
+		X(LINUX_ELIBEXEC),
+		X(LINUX_EILSEQ),
+		X(LINUX_ERESTART),
+		X(LINUX_ESTRPIPE),
+		X(LINUX_EUSERS),
+		X(LINUX_ENOTSOCK),
+		X(LINUX_EDESTADDRREQ),
+		X(LINUX_EMSGSIZE),
+		X(LINUX_EPROTOTYPE),
+		X(LINUX_ENOPROTOOPT),
+		X(LINUX_EPROTONOSUPPORT),
+		X(LINUX_ESOCKTNOSUPPORT),
+		X(LINUX_EOPNOTSUPP),
+		X(LINUX_EPFNOSUPPORT),
+		X(LINUX_EAFNOSUPPORT),
+		X(LINUX_EADDRINUSE),
+		X(LINUX_EADDRNOTAVAIL),
+		X(LINUX_ENETDOWN),
+		X(LINUX_ENETUNREACH),
+		X(LINUX_ENETRESET),
+		X(LINUX_ECONNABORTED),
+		X(LINUX_ECONNRESET),
+		X(LINUX_ENOBUFS),
+		X(LINUX_EISCONN),
+		X(LINUX_ENOTCONN),
+		X(LINUX_ESHUTDOWN),
+		X(LINUX_ETOOMANYREFS),
+		X(LINUX_ETIMEDOUT),
+		X(LINUX_ECONNREFUSED),
+		X(LINUX_EHOSTDOWN),
+		X(LINUX_EHOSTUNREACH),
+		X(LINUX_EALREADY),
+		X(LINUX_EINPROGRESS),
+		X(LINUX_ESTALE),
+		X(LINUX_EUCLEAN),
+		X(LINUX_ENOTNAM),
+		X(LINUX_ENAVAIL),
+		X(LINUX_EISNAM),
+		X(LINUX_EREMOTEIO),
+		X(LINUX_EDQUOT),
+		X(LINUX_ENOMEDIUM),
+		X(LINUX_EMEDIUMTYPE),
+		X(LINUX_ECANCELED),
+		X(LINUX_ENOKEY),
+		X(LINUX_EKEYEXPIRED),
+		X(LINUX_EKEYREVOKED),
+		X(LINUX_EKEYREJECTED),
+		X(LINUX_EOWNERDEAD),
+		X(LINUX_ENOTRECOVERABLE),
+		X(LINUX_ERFKILL),
+		X(LINUX_EHWPOISON),
+#undef X
+	};
+
+	/* Slots with no initializer are NULL and fall through. */
+	if ((size_t)linux_errno < N(table) && table[linux_errno] != NULL)
+		return (table[linux_errno]);
+	if (linux_errno <= ERANGE)
+		return (strerror((int)linux_errno));
+	/* linux_errno is unsigned 32-bit: use the matching conversion. */
+	(void) snprintf(buf, len, "Unknown error %" PRIu32, linux_errno);
+	return (buf);
+}
+
+void
+l9p_describe_fcall(union l9p_fcall *fcall, enum l9p_version version,
+ struct sbuf *sb)
+{
+ uint64_t mask;
+ uint8_t type;
+ int i;
+
+ assert(fcall != NULL);
+ assert(sb != NULL);
+ assert(version <= L9P_2000L);
+
+ type = fcall->hdr.type;
+
+ if (type < L9P__FIRST || type >= L9P__LAST_PLUS_1 ||
+ ftype_names[type - L9P__FIRST] == NULL) {
+ const char *rr;
+
+ /*
+ * Can't say for sure that this distinction --
+ * an even number is a request, an odd one is
+ * a response -- will be maintained forever,
+ * but it's good enough for now.
+ */
+ rr = (type & 1) != 0 ? "response" : "request";
+ sbuf_printf(sb, "<unknown %s %d> tag=%d", rr, type,
+ fcall->hdr.tag);
+ } else {
+ sbuf_printf(sb, "%s tag=%d", ftype_names[type - L9P__FIRST],
+ fcall->hdr.tag);
+ }
+
+ switch (type) {
+ case L9P_TVERSION:
+ case L9P_RVERSION:
+ sbuf_printf(sb, " version=\"%s\" msize=%d", fcall->version.version,
+ fcall->version.msize);
+ return;
+
+ case L9P_TAUTH:
+ l9p_describe_fid(" afid=", fcall->hdr.fid, sb);
+ sbuf_printf(sb, " uname=\"%s\" aname=\"%s\"",
+ fcall->tauth.uname, fcall->tauth.aname);
+ return;
+
+ case L9P_TATTACH:
+ l9p_describe_fid(" fid=", fcall->hdr.fid, sb);
+ l9p_describe_fid(" afid=", fcall->tattach.afid, sb);
+ sbuf_printf(sb, " uname=\"%s\" aname=\"%s\"",
+ fcall->tattach.uname, fcall->tattach.aname);
+ if (version >= L9P_2000U)
+ sbuf_printf(sb, " n_uname=%d", fcall->tattach.n_uname);
+ return;
+
+ case L9P_RATTACH:
+ l9p_describe_qid(" ", &fcall->rattach.qid, sb);
+ return;
+
+ case L9P_RERROR:
+ sbuf_printf(sb, " ename=\"%s\" errnum=%d", fcall->error.ename,
+ fcall->error.errnum);
+ return;
+
+ case L9P_RLERROR: {
+ char unknown[50];
+
+ sbuf_printf(sb, " errnum=%d (%s)", fcall->error.errnum,
+ lookup_linux_errno(fcall->error.errnum,
+ unknown, sizeof(unknown)));
+ return;
+ }
+
+ case L9P_TFLUSH:
+ sbuf_printf(sb, " oldtag=%d", fcall->tflush.oldtag);
+ return;
+
+ case L9P_RFLUSH:
+ return;
+
+ case L9P_TWALK:
+ l9p_describe_fid(" fid=", fcall->hdr.fid, sb);
+ l9p_describe_fid(" newfid=", fcall->twalk.newfid, sb);
+ if (fcall->twalk.nwname) {
+ sbuf_cat(sb, " wname=\"");
+ for (i = 0; i < fcall->twalk.nwname; i++)
+ sbuf_printf(sb, "%s%s", i == 0 ? "" : "/",
+ fcall->twalk.wname[i]);
+ sbuf_cat(sb, "\"");
+ }
+ return;
+
+ case L9P_RWALK:
+ sbuf_printf(sb, " wqid=[");
+ for (i = 0; i < fcall->rwalk.nwqid; i++)
+ l9p_describe_qid(i == 0 ? "" : ",",
+ &fcall->rwalk.wqid[i], sb);
+ sbuf_cat(sb, "]");
+ return;
+
+ case L9P_TOPEN:
+ l9p_describe_fid(" fid=", fcall->hdr.fid, sb);
+ l9p_describe_mode(" mode=", fcall->tcreate.mode, sb);
+ return;
+
+ case L9P_ROPEN:
+ l9p_describe_qid(" qid=", &fcall->ropen.qid, sb);
+ sbuf_printf(sb, " iounit=%d", fcall->ropen.iounit);
+ return;
+
+ case L9P_TCREATE:
+ l9p_describe_fid(" fid=", fcall->hdr.fid, sb);
+ l9p_describe_name(" name=", fcall->tcreate.name, sb);
+ l9p_describe_ext_perm(" perm=", fcall->tcreate.perm, sb);
+ l9p_describe_mode(" mode=", fcall->tcreate.mode, sb);
+ if (version >= L9P_2000U && fcall->tcreate.extension != NULL)
+ l9p_describe_name(" extension=",
+ fcall->tcreate.extension, sb);
+ return;
+
+ case L9P_RCREATE:
+ l9p_describe_qid(" qid=", &fcall->rcreate.qid, sb);
+ sbuf_printf(sb, " iounit=%d", fcall->rcreate.iounit);
+ return;
+
+ case L9P_TREAD:
+ l9p_describe_fid(" fid=", fcall->hdr.fid, sb);
+ sbuf_printf(sb, " offset=%" PRIu64 " count=%" PRIu32,
+ fcall->io.offset, fcall->io.count);
+ return;
+
+ case L9P_RREAD:
+ case L9P_RWRITE:
+ sbuf_printf(sb, " count=%" PRIu32, fcall->io.count);
+ return;
+
+ case L9P_TWRITE:
+ case L9P_TREADDIR:
+ l9p_describe_fid(" fid=", fcall->hdr.fid, sb);
+ sbuf_printf(sb, " offset=%" PRIu64 " count=%" PRIu32,
+ fcall->io.offset, fcall->io.count);
+ return;
+
+ case L9P_TCLUNK:
+ l9p_describe_fid(" fid=", fcall->hdr.fid, sb);
+ return;
+
+ case L9P_RCLUNK:
+ return;
+
+ case L9P_TREMOVE:
+ l9p_describe_fid(" fid=", fcall->hdr.fid, sb);
+ return;
+
+ case L9P_RREMOVE:
+ return;
+
+ case L9P_TSTAT:
+ l9p_describe_fid(" fid=", fcall->hdr.fid, sb);
+ return;
+
+ case L9P_RSTAT:
+ l9p_describe_l9stat(" ", &fcall->rstat.stat, version, sb);
+ return;
+
+ case L9P_TWSTAT:
+ l9p_describe_fid(" fid=", fcall->hdr.fid, sb);
+ l9p_describe_l9stat(" ", &fcall->twstat.stat, version, sb);
+ return;
+
+ case L9P_RWSTAT:
+ return;
+
+ case L9P_TSTATFS:
+ l9p_describe_fid(" fid=", fcall->hdr.fid, sb);
+ return;
+
+ case L9P_RSTATFS:
+ l9p_describe_statfs(" ", &fcall->rstatfs.statfs, sb);
+ return;
+
+ case L9P_TLOPEN:
+ l9p_describe_fid(" fid=", fcall->hdr.fid, sb);
+ l9p_describe_lflags(" flags=", fcall->tlcreate.flags, sb);
+ return;
+
+ case L9P_RLOPEN:
+ l9p_describe_qid(" qid=", &fcall->rlopen.qid, sb);
+ sbuf_printf(sb, " iounit=%d", fcall->rlopen.iounit);
+ return;
+
+ case L9P_TLCREATE:
+ l9p_describe_fid(" fid=", fcall->hdr.fid, sb);
+ l9p_describe_name(" name=", fcall->tlcreate.name, sb);
+ /* confusing: "flags" is open-mode, "mode" is permissions */
+ l9p_describe_lflags(" flags=", fcall->tlcreate.flags, sb);
+ /* TLCREATE mode/permissions have S_IFREG (0x8000) set */
+ l9p_describe_lperm(" mode=", fcall->tlcreate.mode, sb);
+ l9p_describe_ugid(" gid=", fcall->tlcreate.gid, sb);
+ return;
+
+ case L9P_RLCREATE:
+ l9p_describe_qid(" qid=", &fcall->rlcreate.qid, sb);
+ sbuf_printf(sb, " iounit=%d", fcall->rlcreate.iounit);
+ return;
+
+ case L9P_TSYMLINK:
+ l9p_describe_fid(" fid=", fcall->hdr.fid, sb);
+ l9p_describe_name(" name=", fcall->tsymlink.name, sb);
+ l9p_describe_name(" symtgt=", fcall->tsymlink.symtgt, sb);
+ l9p_describe_ugid(" gid=", fcall->tsymlink.gid, sb);
+ return;
+
+ case L9P_RSYMLINK:
+ l9p_describe_qid(" qid=", &fcall->ropen.qid, sb);
+ return;
+
+ case L9P_TMKNOD:
+ l9p_describe_fid(" dfid=", fcall->hdr.fid, sb);
+ l9p_describe_name(" name=", fcall->tmknod.name, sb);
+ /*
+ * TMKNOD mode/permissions have S_IFBLK/S_IFCHR/S_IFIFO
+ * bits. The major and minor values are only meaningful
+ * for S_IFBLK and S_IFCHR, but just decode always here.
+ */
+ l9p_describe_lperm(" mode=", fcall->tmknod.mode, sb);
+ sbuf_printf(sb, " major=%u minor=%u",
+ fcall->tmknod.major, fcall->tmknod.minor);
+ l9p_describe_ugid(" gid=", fcall->tmknod.gid, sb);
+ return;
+
+ case L9P_RMKNOD:
+ l9p_describe_qid(" qid=", &fcall->rmknod.qid, sb);
+ return;
+
+ case L9P_TRENAME:
+ l9p_describe_fid(" fid=", fcall->hdr.fid, sb);
+ l9p_describe_fid(" dfid=", fcall->trename.dfid, sb);
+ l9p_describe_name(" name=", fcall->trename.name, sb);
+ return;
+
+ case L9P_RRENAME:
+ return;
+
+ case L9P_TREADLINK:
+ l9p_describe_fid(" fid=", fcall->hdr.fid, sb);
+ return;
+
+ case L9P_RREADLINK:
+ l9p_describe_name(" target=", fcall->rreadlink.target, sb);
+ return;
+
+ case L9P_TGETATTR:
+ l9p_describe_fid(" fid=", fcall->hdr.fid, sb);
+ l9p_describe_getattr_mask(fcall->tgetattr.request_mask, sb);
+ return;
+
+ case L9P_RGETATTR:
+ /* Don't need to decode bits: they're implied by the output */
+ mask = fcall->rgetattr.valid;
+ sbuf_printf(sb, " valid=0x%016" PRIx64, mask);
+ l9p_describe_qid(" qid=", &fcall->rgetattr.qid, sb);
+ if (mask & L9PL_GETATTR_MODE)
+ l9p_describe_lperm(" mode=", fcall->rgetattr.mode, sb);
+ if (mask & L9PL_GETATTR_UID)
+ l9p_describe_ugid(" uid=", fcall->rgetattr.uid, sb);
+ if (mask & L9PL_GETATTR_GID)
+ l9p_describe_ugid(" gid=", fcall->rgetattr.gid, sb);
+ if (mask & L9PL_GETATTR_NLINK)
+ sbuf_printf(sb, " nlink=%" PRIu64,
+ fcall->rgetattr.nlink);
+ if (mask & L9PL_GETATTR_RDEV)
+ sbuf_printf(sb, " rdev=0x%" PRIx64,
+ fcall->rgetattr.rdev);
+ if (mask & L9PL_GETATTR_SIZE)
+ l9p_describe_size(" size=", fcall->rgetattr.size, sb);
+ if (mask & L9PL_GETATTR_BLOCKS)
+ sbuf_printf(sb, " blksize=%" PRIu64 " blocks=%" PRIu64,
+ fcall->rgetattr.blksize, fcall->rgetattr.blocks);
+ if (mask & L9PL_GETATTR_ATIME)
+ l9p_describe_time(sb, " atime=",
+ fcall->rgetattr.atime_sec,
+ fcall->rgetattr.atime_nsec);
+ if (mask & L9PL_GETATTR_MTIME)
+ l9p_describe_time(sb, " mtime=",
+ fcall->rgetattr.mtime_sec,
+ fcall->rgetattr.mtime_nsec);
+ if (mask & L9PL_GETATTR_CTIME)
+ l9p_describe_time(sb, " ctime=",
+ fcall->rgetattr.ctime_sec,
+ fcall->rgetattr.ctime_nsec);
+ if (mask & L9PL_GETATTR_BTIME)
+ l9p_describe_time(sb, " btime=",
+ fcall->rgetattr.btime_sec,
+ fcall->rgetattr.btime_nsec);
+ if (mask & L9PL_GETATTR_GEN)
+ sbuf_printf(sb, " gen=0x%" PRIx64, fcall->rgetattr.gen);
+ if (mask & L9PL_GETATTR_DATA_VERSION)
+ sbuf_printf(sb, " data_version=0x%" PRIx64,
+ fcall->rgetattr.data_version);
+ return;
+
+ case L9P_TSETATTR:
+ /* As with RGETATTR, we'll imply decode via output. */
+ l9p_describe_fid(" fid=", fcall->hdr.fid, sb);
+ mask = fcall->tsetattr.valid;
+ /* NB: tsetattr valid mask is only 32 bits, hence %08x */
+ sbuf_printf(sb, " valid=0x%08" PRIx64, mask);
+ if (mask & L9PL_SETATTR_MODE)
+ l9p_describe_lperm(" mode=", fcall->tsetattr.mode, sb);
+ if (mask & L9PL_SETATTR_UID)
+ l9p_describe_ugid(" uid=", fcall->tsetattr.uid, sb);
+ if (mask & L9PL_SETATTR_GID)
+ l9p_describe_ugid(" uid=", fcall->tsetattr.gid, sb);
+ if (mask & L9PL_SETATTR_SIZE)
+ l9p_describe_size(" size=", fcall->tsetattr.size, sb);
+ if (mask & L9PL_SETATTR_ATIME) {
+ if (mask & L9PL_SETATTR_ATIME_SET)
+ l9p_describe_time(sb, " atime=",
+ fcall->tsetattr.atime_sec,
+ fcall->tsetattr.atime_nsec);
+ else
+ sbuf_cat(sb, " atime=now");
+ }
+ if (mask & L9PL_SETATTR_MTIME) {
+ if (mask & L9PL_SETATTR_MTIME_SET)
+ l9p_describe_time(sb, " mtime=",
+ fcall->tsetattr.mtime_sec,
+ fcall->tsetattr.mtime_nsec);
+ else
+ sbuf_cat(sb, " mtime=now");
+ }
+ if (mask & L9PL_SETATTR_CTIME)
+ sbuf_cat(sb, " ctime=now");
+ return;
+
+ case L9P_RSETATTR:
+ return;
+
+ case L9P_TXATTRWALK:
+ l9p_describe_fid(" fid=", fcall->hdr.fid, sb);
+ l9p_describe_fid(" newfid=", fcall->txattrwalk.newfid, sb);
+ l9p_describe_name(" name=", fcall->txattrwalk.name, sb);
+ return;
+
+ case L9P_RXATTRWALK:
+ l9p_describe_size(" size=", fcall->rxattrwalk.size, sb);
+ return;
+
+ case L9P_TXATTRCREATE:
+ l9p_describe_fid(" fid=", fcall->hdr.fid, sb);
+ l9p_describe_name(" name=", fcall->txattrcreate.name, sb);
+ l9p_describe_size(" size=", fcall->txattrcreate.attr_size, sb);
+ sbuf_printf(sb, " flags=%" PRIu32, fcall->txattrcreate.flags);
+ return;
+
+ case L9P_RXATTRCREATE:
+ return;
+
+ case L9P_RREADDIR:
+ l9p_describe_readdir(sb, &fcall->io);
+ return;
+
+ case L9P_TFSYNC:
+ l9p_describe_fid(" fid=", fcall->hdr.fid, sb);
+ return;
+
+ case L9P_RFSYNC:
+ return;
+
+ case L9P_TLOCK:
+ l9p_describe_fid(" fid=", fcall->hdr.fid, sb);
+ /* decode better later */
+ sbuf_printf(sb, " type=%d flags=0x%" PRIx32
+ " start=%" PRIu64 " length=%" PRIu64
+ " proc_id=0x%" PRIx32 " client_id=\"%s\"",
+ fcall->tlock.type, fcall->tlock.flags,
+ fcall->tlock.start, fcall->tlock.length,
+ fcall->tlock.proc_id, fcall->tlock.client_id);
+ return;
+
+ case L9P_RLOCK:
+ sbuf_printf(sb, " status=%d", fcall->rlock.status);
+ return;
+
+ case L9P_TGETLOCK:
+ l9p_describe_fid(" fid=", fcall->hdr.fid, sb);
+ /* FALLTHROUGH */
+
+ case L9P_RGETLOCK:
+ /* decode better later */
+ sbuf_printf(sb, " type=%d "
+ " start=%" PRIu64 " length=%" PRIu64
+ " proc_id=0x%" PRIx32 " client_id=\"%s\"",
+ fcall->getlock.type,
+ fcall->getlock.start, fcall->getlock.length,
+ fcall->getlock.proc_id, fcall->getlock.client_id);
+ return;
+
+ case L9P_TLINK:
+ l9p_describe_fid(" dfid=", fcall->tlink.dfid, sb);
+ l9p_describe_fid(" fid=", fcall->hdr.fid, sb);
+ l9p_describe_name(" name=", fcall->tlink.name, sb);
+ return;
+
+ case L9P_RLINK:
+ return;
+
+ case L9P_TMKDIR:
+ l9p_describe_fid(" fid=", fcall->hdr.fid, sb);
+ l9p_describe_name(" name=", fcall->tmkdir.name, sb);
+ /* TMKDIR mode/permissions have S_IFDIR set */
+ l9p_describe_lperm(" mode=", fcall->tmkdir.mode, sb);
+ l9p_describe_ugid(" gid=", fcall->tmkdir.gid, sb);
+ return;
+
+ case L9P_RMKDIR:
+ l9p_describe_qid(" qid=", &fcall->rmkdir.qid, sb);
+ return;
+
+ case L9P_TRENAMEAT:
+ l9p_describe_fid(" olddirfid=", fcall->hdr.fid, sb);
+ l9p_describe_name(" oldname=", fcall->trenameat.oldname,
+ sb);
+ l9p_describe_fid(" newdirfid=", fcall->trenameat.newdirfid, sb);
+ l9p_describe_name(" newname=", fcall->trenameat.newname,
+ sb);
+ return;
+
+ case L9P_RRENAMEAT:
+ return;
+
+ case L9P_TUNLINKAT:
+ l9p_describe_fid(" dirfd=", fcall->hdr.fid, sb);
+ l9p_describe_name(" name=", fcall->tunlinkat.name, sb);
+ l9p_describe_unlinkat_flags(" flags=",
+ fcall->tunlinkat.flags, sb);
+ return;
+
+ case L9P_RUNLINKAT:
+ return;
+
+ default:
+ sbuf_printf(sb, " <missing case in %s()>", __func__);
+ }
+}
diff --git a/usr/src/lib/lib9p/mapfile-vers b/usr/src/lib/lib9p/mapfile-vers
new file mode 100644
index 0000000000..9bf38cc847
--- /dev/null
+++ b/usr/src/lib/lib9p/mapfile-vers
@@ -0,0 +1,58 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
+#
+
+#
+# MAPFILE HEADER START
+#
+# WARNING: STOP NOW. DO NOT MODIFY THIS FILE.
+# Object versioning must comply with the rules detailed in
+#
+# usr/src/lib/README.mapfiles
+#
+# You should not be making modifications here until you've read the most current
+# copy of that file. If you need help, contact a gatekeeper for guidance.
+#
+# MAPFILE HEADER END
+#
+
+$mapfile_version 2
+
+SYMBOL_VERSION ILLUMOSprivate {    # the single symbol version defined by this mapfile
+ global:    # symbols listed below are exported with global scope
+ l9p_backend_fs_init;
+ l9p_connection_alloc_fid;
+ l9p_connection_close;
+ l9p_connection_free;
+ l9p_connection_init;
+ l9p_connection_recv;
+ l9p_connection_remove_fid;
+ l9p_describe_fcall;
+ l9p_dispatch_request;
+ l9p_freefcall;
+ l9p_freestat;
+ l9p_getgrlist;
+ l9p_init_msg;
+ l9p_pack_stat;
+ l9p_pudirent;
+ l9p_pufcall;
+ l9p_pustat;
+ l9p_respond;
+ l9p_seek_iov;
+ l9p_server_init;
+ l9p_sizeof_stat;
+ l9p_truncate_iov;
+ local:    # catch-all scope for everything not named under global:
+ *;    # wildcard: any other symbol is reduced to local scope (not exported)
+};
diff --git a/usr/src/man/man1m/bhyve.1m b/usr/src/man/man1m/bhyve.1m
index a6c4637538..cab588665e 100644
--- a/usr/src/man/man1m/bhyve.1m
+++ b/usr/src/man/man1m/bhyve.1m
@@ -24,7 +24,7 @@
.\"
.\" Portions Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
.\"
-.Dd March 18, 2021
+.Dd April 20, 2021
.Dt BHYVE 1M
.Os
.Sh NAME
@@ -263,6 +263,8 @@ Accelerated Virtio network interface.
Legacy Virtio network interface.
.It Li virtio-blk
Virtio block storage interface.
+.It Li virtio-9p
+Virtio 9p (VirtFS) interface.
.It Li virtio-rnd
Virtio random number generator interface.
.It Li virtio-console
@@ -390,6 +392,24 @@ Disable emulation of guest trim requests via
requests.
.El
.Pp
+9P devices:
+.Bl -tag -width 10n
+.It Xo
+.Sm off
+.Cm sharename Sy = Pa /path/to/share
+.Op Cm \&, Ar 9p-device-options
+.Sm on
+.Xc
+.El
+.Pp
+The
+.Ar 9p-device-options
+are:
+.Bl -tag -width 10n
+.It Cm ro
+Expose the share in read-only mode.
+.El
+.Pp
TTY devices:
.Bl -tag -width 10n
.It Cm stdio
diff --git a/usr/src/man/man4/bhyve_config.4 b/usr/src/man/man4/bhyve_config.4
index 23e1e33c5a..668b363115 100644
--- a/usr/src/man/man4/bhyve_config.4
+++ b/usr/src/man/man4/bhyve_config.4
@@ -25,7 +25,7 @@
.\"
.\" Portions Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
.\"
-.Dd May 6, 2021
+.Dd May 7, 2021
.Dt BHYVE_CONFIG 4
.Os
.Sh NAME
@@ -217,6 +217,8 @@ NVM Express (NVMe) controller.
PCI pass-through device.
.It Li uart
PCI 16550 serial device.
+.It Li virtio-9p
+VirtIO 9p (VirtFS) interface.
.It Li virtio-blk
VirtIO block storage interface.
.It Li virtio-console
@@ -474,6 +476,17 @@ where
.Ar N
is the device number.
.El
+.Ss VirtIO 9p Settings
+Each VirtIO 9p device exposes a single filesystem from a host path.
+.Bl -column "sharename" "Format" "Default"
+.It Sy Name Ta Sy Format Ta Sy Default Ta Sy Description
+.It Va sharename Ta string Ta Ta
+The share name exposed to the guest.
+.It Va path Ta path Ta Ta
+The path of a directory on the host to export to the guest.
+.It Va ro Ta bool Ta false Ta
+If true, the share is exposed to the guest in read-only mode.
+.El
.Ss VirtIO Console Device Settings
Each VirtIO Console device contains one or more console ports.
Each port stores its settings in a node named
diff --git a/usr/src/pkg/manifests/system-library-bhyve.mf b/usr/src/pkg/manifests/system-library-bhyve.mf
index f425c83034..c29a0ab1cf 100644
--- a/usr/src/pkg/manifests/system-library-bhyve.mf
+++ b/usr/src/pkg/manifests/system-library-bhyve.mf
@@ -14,7 +14,7 @@
#
#
-# Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
+# Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
#
set name=pkg.fmri value=pkg:/system/library/bhyve@$(PKGVERS)
@@ -30,8 +30,10 @@ dir path=usr/lib group=bin
dir path=usr/lib/$(ARCH64) group=bin
file path=lib/$(ARCH64)/libvmm.so.1
file path=lib/$(ARCH64)/libvmmapi.so.1
+file path=usr/lib/$(ARCH64)/lib9p.so.1
file path=usr/lib/$(ARCH64)/libppt.so.1
file path=usr/lib/libppt.so.1
license lic_CDDL license=lic_CDDL
+license usr/src/lib/lib9p/COPYRIGHT license=usr/src/lib/lib9p/COPYRIGHT
license usr/src/lib/libvmmapi/THIRDPARTYLICENSE \
license=usr/src/lib/libvmmapi/THIRDPARTYLICENSE