| field | value | date |
|---|---|---|
| author | Jerry Jelinek <jerry.jelinek@joyent.com> | 2019-08-19 12:06:05 +0000 |
| committer | Jerry Jelinek <jerry.jelinek@joyent.com> | 2019-08-19 12:06:05 +0000 |
| commit | 289a9bb49771505b864985403334d2f94f0ca3ec (patch) | |
| tree | 2853dbf40fb16b4ea3df020177473835c0641dcb | |
| parent | fb22979c02ec1ab84832084bea882640c366be5b (diff) | |
| parent | 2052a1fb16201e50b4c3a91ebcbeeccbc8276644 (diff) | |
| download | illumos-joyent-289a9bb49771505b864985403334d2f94f0ca3ec.tar.gz | |
[illumos-gate merge]
commit 2052a1fb16201e50b4c3a91ebcbeeccbc8276644
11568 loader: pxe.c missing initializer
commit 8d94f651a44d41a7147253bb5dad1a53941e8f50
11031 SMB3 persistent handles
commit 2f57b5e005e6dce9d124b3dbd5fdcad1cc0372d2
11532 Makefile.master: add gcc9 support flags
commit f8296c60994fb27105f37ac6f75661e4a6bdbab7
11329 improved Virtio framework
10012 vioblk should not accept an all-zero serial number
7366 vioif happily creates rx descriptors until it consumes all memory
Conflicts:
usr/src/uts/common/io/vioif/vioif.c
59 files changed, 7523 insertions, 4797 deletions
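The headline change in this merge, 11031 (SMB3 persistent handles), stores each handle's reconnect state as a packed nvlist in a named stream under the share root, and adds the `nvlprint` tool to dump such files. As a minimal illustrative sketch (not part of the commit, and assuming only the standard libnvpair API), the following shows the pack/unpack round trip that both the new kernel code and `nvlprint` rely on:

```c
#include <stdio.h>
#include <stdlib.h>
#include <libnvpair.h>

int
main(void)
{
	nvlist_t *nvl = NULL, *copy = NULL;
	char *buf = NULL;
	size_t buflen = 0;

	/* Build a small list resembling the CA-handle state. */
	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
		return (1);
	(void) nvlist_add_uint32(nvl, "info_version", 1);
	(void) nvlist_add_string(nvl, "share_name", "testca");

	/* Pack to a contiguous buffer (what gets written to the stream). */
	if (nvlist_pack(nvl, &buf, &buflen, NV_ENCODE_XDR, 0) != 0)
		return (1);

	/* Unpack and print, as nvlprint does with a file's contents. */
	if (nvlist_unpack(buf, buflen, &copy, 0) != 0)
		return (1);
	nvlist_print(stdout, copy);

	free(buf);
	nvlist_free(nvl);
	nvlist_free(copy);
	return (0);
}
```

A sketch like this would link with `-lnvpair`; `nvlprint` itself applies only the unpack-and-print half to files named on its command line. The full diff follows below.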
diff --git a/exception_lists/packaging b/exception_lists/packaging index ce7ebe91a1..c6cb2ccf99 100644 --- a/exception_lists/packaging +++ b/exception_lists/packaging @@ -600,6 +600,7 @@ usr/lib/smbsrv/libfksmbsrv.so.1 usr/lib/smbsrv/libmlsvc.so usr/lib/smbsrv/libsmb.so usr/lib/smbsrv/libsmbns.so +usr/lib/smbsrv/nvlprint usr/lib/smbsrv/test-msgbuf usr/lib/smbsrv/testoplock # diff --git a/usr/src/Makefile.master b/usr/src/Makefile.master index da8d14c660..e751a9f79f 100644 --- a/usr/src/Makefile.master +++ b/usr/src/Makefile.master @@ -372,8 +372,10 @@ CCNOAUTOINLINE= \ -_gcc=-fno-ipa-cp \ -_gcc7=-fno-ipa-icf \ -_gcc8=-fno-ipa-icf \ + -_gcc9=-fno-ipa-icf \ -_gcc7=-fno-clone-functions \ - -_gcc8=-fno-clone-functions + -_gcc8=-fno-clone-functions \ + -_gcc9=-fno-clone-functions # GCC may put functions in different named sub-sections of .text based on # their presumed calling frequency. At least in the kernel, where we actually @@ -383,7 +385,8 @@ CCNOAUTOINLINE= \ # but the application of this may move into usr/src/uts/ in future. CCNOREORDER= \ -_gcc7=-fno-reorder-functions \ - -_gcc8=-fno-reorder-functions + -_gcc8=-fno-reorder-functions \ + -_gcc9=-fno-reorder-functions # # gcc has a rather aggressive optimization on by default that infers loop @@ -394,7 +397,8 @@ CCNOREORDER= \ # CCNOAGGRESSIVELOOPS= \ -_gcc7=-fno-aggressive-loop-optimizations \ - -_gcc8=-fno-aggressive-loop-optimizations + -_gcc8=-fno-aggressive-loop-optimizations \ + -_gcc9=-fno-aggressive-loop-optimizations # One optimization the compiler might perform is to turn this: # #pragma weak foo @@ -472,7 +476,8 @@ CERRWARN += -_gcc=-Wno-array-bounds # gcc4 lacks -Wno-maybe-uninitialized CNOWARN_UNINIT = -_gcc4=-Wno-uninitialized \ -_gcc7=-Wno-maybe-uninitialized \ - -_gcc8=-Wno-maybe-uninitialized + -_gcc8=-Wno-maybe-uninitialized \ + -_gcc9=-Wno-maybe-uninitialized CERRWARN += -_smatch=-p=illumos_user include $(SRC)/Makefile.smatch diff --git a/usr/src/boot/sys/boot/i386/libi386/pxe.c b/usr/src/boot/sys/boot/i386/libi386/pxe.c index 693596559d..821d0f627d 100644 --- a/usr/src/boot/sys/boot/i386/libi386/pxe.c +++ b/usr/src/boot/sys/boot/i386/libi386/pxe.c @@ -76,16 +76,21 @@ static ssize_t pxe_netif_put(struct iodesc *desc, void *pkt, size_t len); static void pxe_netif_end(struct netif *nif); extern struct netif_stats pxe_st[]; -extern u_int16_t __bangpxeseg; -extern u_int16_t __bangpxeoff; +extern uint16_t __bangpxeseg; +extern uint16_t __bangpxeoff; extern void __bangpxeentry(void); -extern u_int16_t __pxenvseg; -extern u_int16_t __pxenvoff; +extern uint16_t __pxenvseg; +extern uint16_t __pxenvoff; extern void __pxenventry(void); struct netif_dif pxe_ifs[] = { -/* dif_unit dif_nsel dif_stats dif_private */ - {0, 1, &pxe_st[0], 0} + { + .dif_unit = 0, + .dif_nsel = 1, + .dif_stats = &pxe_st[0], + .dif_private = NULL, + .dif_used = 0 + } }; struct netif_stats pxe_st[nitems(pxe_ifs)]; @@ -218,7 +223,7 @@ pxe_init(void) pxenv_p->RMEntry.segment, pxenv_p->RMEntry.offset); } - gci_p = bio_alloc(sizeof(*gci_p)); + gci_p = bio_alloc(sizeof (*gci_p)); if (gci_p == NULL) { pxe_p = NULL; return (0); @@ -269,7 +274,7 @@ pxe_cleanup(void) if (pxe_call == NULL) return; - undi_shutdown_p = bio_alloc(sizeof(*undi_shutdown_p)); + undi_shutdown_p = bio_alloc(sizeof (*undi_shutdown_p)); if (undi_shutdown_p != NULL) { bzero(undi_shutdown_p, sizeof (*undi_shutdown_p)); pxe_call(PXENV_UNDI_SHUTDOWN, undi_shutdown_p); @@ -282,7 +287,7 @@ pxe_cleanup(void) bio_free(undi_shutdown_p, sizeof (*undi_shutdown_p)); } - unload_stack_p = 
bio_alloc(sizeof(*unload_stack_p)); + unload_stack_p = bio_alloc(sizeof (*unload_stack_p)); if (unload_stack_p != NULL) { bzero(unload_stack_p, sizeof (*unload_stack_p)); pxe_call(PXENV_UNLOAD_STACK, unload_stack_p); @@ -423,7 +428,7 @@ pxe_netif_init(struct iodesc *desc, void *machdep_hint) else desc->xid = 0; - bio_free(undi_info_p, sizeof(*undi_info_p)); + bio_free(undi_info_p, sizeof (*undi_info_p)); undi_open_p = bio_alloc(sizeof (*undi_open_p)); if (undi_open_p == NULL) return; diff --git a/usr/src/cmd/mdb/common/modules/smbsrv/smbsrv.c b/usr/src/cmd/mdb/common/modules/smbsrv/smbsrv.c index b54549eebb..4195a62149 100644 --- a/usr/src/cmd/mdb/common/modules/smbsrv/smbsrv.c +++ b/usr/src/cmd/mdb/common/modules/smbsrv/smbsrv.c @@ -1623,6 +1623,9 @@ tree_flag_bits[] = { { "FORCE_L2_OPLOCK", SMB_TREE_FORCE_L2_OPLOCK, SMB_TREE_FORCE_L2_OPLOCK }, + { "CA", + SMB_TREE_CA, + SMB_TREE_CA }, { NULL, 0, 0 } }; @@ -2334,17 +2337,26 @@ smb_kshare_walk_step(mdb_walk_state_t *wsp) * ***************************************************************************** */ +typedef struct mdb_smb_vfs { + list_node_t sv_lnd; + uint32_t sv_magic; + uint32_t sv_refcnt; + vfs_t *sv_vfsp; + vnode_t *sv_rootvp; +} mdb_smb_vfs_t; + struct smb_vfs_cb_args { uint_t opts; vnode_t vn; char path[MAXPATHLEN]; }; +/*ARGSUSED*/ static int smb_vfs_cb(uintptr_t addr, const void *data, void *varg) { struct smb_vfs_cb_args *args = varg; - const smb_vfs_t *sf = data; + mdb_smb_vfs_t sf; if (args->opts & SMB_OPT_VERBOSE) { mdb_arg_t argv; @@ -2363,16 +2375,21 @@ smb_vfs_cb(uintptr_t addr, const void *data, void *varg) * * Get the vnode v_path string if we can. */ + if (mdb_ctf_vread(&sf, SMBSRV_SCOPE "smb_vfs_t", + "mdb_smb_vfs_t", addr, 0) < 0) { + mdb_warn("failed to read struct smb_vfs at %p", addr); + return (DCMD_ERR); + } strcpy(args->path, "?"); if (mdb_vread(&args->vn, sizeof (args->vn), - (uintptr_t)sf->sv_rootvp) == sizeof (args->vn)) + (uintptr_t)sf.sv_rootvp) == sizeof (args->vn)) (void) mdb_readstr(args->path, sizeof (args->path), (uintptr_t)args->vn.v_path); mdb_printf("%-?p ", addr); - mdb_printf("%-10d ", sf->sv_refcnt); - mdb_printf("%-?p ", sf->sv_vfsp); - mdb_printf("%-?p ", sf->sv_rootvp); + mdb_printf("%-10d ", sf.sv_refcnt); + mdb_printf("%-?p ", sf.sv_vfsp); + mdb_printf("%-?p ", sf.sv_rootvp); mdb_printf("%-s\n", args->path); return (WALK_NEXT); @@ -2442,7 +2459,12 @@ smb_vfs_walk_init(mdb_walk_state_t *wsp) * OFFSETOF(smb_server_t, sv_export.e_vfs_list.ll_list); */ GET_OFFSET(sv_exp_off, smb_server_t, sv_export); - GET_OFFSET(ex_vfs_off, smb_export_t, e_vfs_list); + /* GET_OFFSET(ex_vfs_off, smb_export_t, e_vfs_list); */ + ex_vfs_off = mdb_ctf_offsetof_by_name("smb_export_t", "e_vfs_list"); + if (ex_vfs_off < 0) { + mdb_warn("cannot lookup: smb_export_t .e_vfs_list"); + return (WALK_ERR); + } GET_OFFSET(ll_off, smb_llist_t, ll_list); wsp->walk_addr += (sv_exp_off + ex_vfs_off + ll_off); diff --git a/usr/src/cmd/smbsrv/Makefile b/usr/src/cmd/smbsrv/Makefile index 8e7699c252..85d9ec05f1 100644 --- a/usr/src/cmd/smbsrv/Makefile +++ b/usr/src/cmd/smbsrv/Makefile @@ -26,7 +26,7 @@ # SUBDIRS = smbadm smbd smbstat dtrace fksmbd bind-helper \ - test-msgbuf testoplock + nvlprint testoplock test-msgbuf MSGSUBDIRS = smbadm smbstat include ../Makefile.cmd diff --git a/usr/src/cmd/smbsrv/fksmbd/fksmbd_shr.c b/usr/src/cmd/smbsrv/fksmbd/fksmbd_shr.c index 23038f1641..20f1f146b0 100644 --- a/usr/src/cmd/smbsrv/fksmbd/fksmbd_shr.c +++ b/usr/src/cmd/smbsrv/fksmbd/fksmbd_shr.c @@ -10,7 +10,7 @@ */ /* - * Copyright 2014 
Nexenta Systems, Inc. All rights reserved. + * Copyright 2018 Nexenta Systems, Inc. All rights reserved. */ /* @@ -115,6 +115,8 @@ smb_shr_load(void *args) */ new_share("test", "/var/smb/test", "fksmbd test share", SMB_SHRF_GUEST_OK); + new_share("testca", "/var/smb/test", "fksmbd test CA share", + SMB_SHRF_CA); /* Allow creating lots of shares for testing. */ shr_file = getenv("FKSMBD_SHARE_FILE"); diff --git a/usr/src/cmd/smbsrv/nvlprint/Makefile b/usr/src/cmd/smbsrv/nvlprint/Makefile new file mode 100644 index 0000000000..6e107f4219 --- /dev/null +++ b/usr/src/cmd/smbsrv/nvlprint/Makefile @@ -0,0 +1,37 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2017 Nexenta Systems, Inc. All rights reserved. +# + + +PROG= nvlprint + +include ../../Makefile.cmd +ROOTCMDDIR= $(ROOT)/usr/lib/smbsrv + +CFLAGS += $(CCVERBOSE) + +CPPFLAGS += -D_FILE_OFFSET_BITS=64 +LDLIBS += -lnvpair + +.KEEP_STATE: + +all: $(PROG) + +install: all $(ROOTCMD) + +clean: + +lint: + +include ../../Makefile.targ diff --git a/usr/src/cmd/smbsrv/nvlprint/nvlprint.c b/usr/src/cmd/smbsrv/nvlprint/nvlprint.c new file mode 100644 index 0000000000..939cedd933 --- /dev/null +++ b/usr/src/cmd/smbsrv/nvlprint/nvlprint.c @@ -0,0 +1,88 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + */ + +/* + * Print a packed nvlist from a file. 
+ */ + +#include <stdio.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include "libnvpair.h" + +char buf[65536]; + +void +dumpit(FILE *fp) +{ + struct stat st; + size_t flen; + int rlen; + nvlist_t *nvl = NULL; + int err; + + if (fstat(fileno(fp), &st) < 0) { + perror("fstat"); + return; + } + flen = (size_t)st.st_size; + if (flen > sizeof (buf)) { + (void) printf("File too large\n"); + return; + } + rlen = fread(buf, 1, flen, fp); + if (rlen <= 0) { + perror("fread"); + return; + } + if (rlen != flen) { + (void) printf("Short read %d %d \n", rlen, flen); + return; + } + + err = nvlist_unpack(buf, flen, &nvl, 0); + if (err != 0) { + (void) printf("nvlist_unpack, err=%d\n", err); + return; + } + + nvlist_print(stdout, nvl); + nvlist_free(nvl); +} + +int +main(int argc, char **argv) +{ + FILE *fp; + int i; + + if (argc < 2) { + (void) fprintf(stderr, "usage: %s {filename} [filename2...]\n", + argv[0]); + return (1); + } + for (i = 1; i < argc; i++) { + fp = fopen(argv[i], "r"); + if (fp == NULL) { + perror(argv[i]); + return (1); + } + (void) printf("%s:\n", argv[i]); + dumpit(fp); + (void) fclose(fp); + } + return (0); +} diff --git a/usr/src/lib/libfakekernel/common/clock.c b/usr/src/lib/libfakekernel/common/clock.c index 2bee02af2e..deacbd4705 100644 --- a/usr/src/lib/libfakekernel/common/clock.c +++ b/usr/src/lib/libfakekernel/common/clock.c @@ -10,7 +10,7 @@ */ /* - * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ @@ -83,3 +83,48 @@ void scalehrtime(hrtime_t *t) { } + +/* + * These functions are blatently stolen from the kernel. + * See the dissertation in the comments preceding the + * hrt2ts() and ts2hrt() functions in: + * uts/common/os/timers.c + */ +void +hrt2ts(hrtime_t hrt, timespec_t *tsp) +{ + uint32_t sec, nsec, tmp; + + tmp = (uint32_t)(hrt >> 30); + sec = tmp - (tmp >> 2); + sec = tmp - (sec >> 5); + sec = tmp + (sec >> 1); + sec = tmp - (sec >> 6) + 7; + sec = tmp - (sec >> 3); + sec = tmp + (sec >> 1); + sec = tmp + (sec >> 3); + sec = tmp + (sec >> 4); + tmp = (sec << 7) - sec - sec - sec; + tmp = (tmp << 7) - tmp - tmp - tmp; + tmp = (tmp << 7) - tmp - tmp - tmp; + nsec = (uint32_t)hrt - (tmp << 9); + while (nsec >= NANOSEC) { + nsec -= NANOSEC; + sec++; + } + tsp->tv_sec = (time_t)sec; + tsp->tv_nsec = nsec; +} + +hrtime_t +ts2hrt(const timestruc_t *tsp) +{ + hrtime_t hrt; + + hrt = tsp->tv_sec; + hrt = (hrt << 7) - hrt - hrt - hrt; + hrt = (hrt << 7) - hrt - hrt - hrt; + hrt = (hrt << 7) - hrt - hrt - hrt; + hrt = (hrt << 9) + tsp->tv_nsec; + return (hrt); +} diff --git a/usr/src/lib/libfakekernel/common/kmisc.c b/usr/src/lib/libfakekernel/common/kmisc.c index 15730d6539..70f303e035 100644 --- a/usr/src/lib/libfakekernel/common/kmisc.c +++ b/usr/src/lib/libfakekernel/common/kmisc.c @@ -10,7 +10,7 @@ */ /* - * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. * Copyright 2017 RackTop Systems. 
*/ @@ -95,6 +95,7 @@ highbit64(uint64_t i) int ddi_strtoul(const char *str, char **endp, int base, unsigned long *res) { + errno = 0; *res = strtoul(str, endp, base); if (*res == 0) return (errno); diff --git a/usr/src/lib/libfakekernel/common/mapfile-vers b/usr/src/lib/libfakekernel/common/mapfile-vers index 3950ccd4b5..731f6801a5 100644 --- a/usr/src/lib/libfakekernel/common/mapfile-vers +++ b/usr/src/lib/libfakekernel/common/mapfile-vers @@ -99,7 +99,7 @@ SYMBOL_VERSION SUNWprivate_1.1 { highbit; highbit64; - + hrt2ts; hz; issig; @@ -233,6 +233,7 @@ SYMBOL_VERSION SUNWprivate_1.1 { tick_per_msec; timeout; + ts2hrt; tsignal; uiomove; uioskip; diff --git a/usr/src/lib/libshare/smb/libshare_smb.c b/usr/src/lib/libshare/smb/libshare_smb.c index e15bb26d9a..f567e7818b 100644 --- a/usr/src/lib/libshare/smb/libshare_smb.c +++ b/usr/src/lib/libshare/smb/libshare_smb.c @@ -179,6 +179,7 @@ struct option_defs optdefs[] = { { SHOPT_GUEST, OPT_TYPE_BOOLEAN }, { SHOPT_DFSROOT, OPT_TYPE_BOOLEAN }, { SHOPT_DESCRIPTION, OPT_TYPE_STRING }, + { SHOPT_CA, OPT_TYPE_BOOLEAN }, { SHOPT_FSO, OPT_TYPE_BOOLEAN }, { SHOPT_QUOTAS, OPT_TYPE_BOOLEAN }, { SHOPT_ENCRYPT, OPT_TYPE_STRING }, @@ -2195,6 +2196,9 @@ smb_build_shareinfo(sa_share_t share, sa_resource_t resource, smb_share_t *si) if (smb_saprop_getbool(opts, SHOPT_DFSROOT, B_FALSE)) si->shr_flags |= SMB_SHRF_DFSROOT; + if (smb_saprop_getbool(opts, SHOPT_CA, B_FALSE)) + si->shr_flags |= SMB_SHRF_CA; + if (smb_saprop_getbool(opts, SHOPT_FSO, B_FALSE)) si->shr_flags |= SMB_SHRF_FSO; diff --git a/usr/src/lib/smbsrv/libfksmbsrv/Makefile.com b/usr/src/lib/smbsrv/libfksmbsrv/Makefile.com index 507122dadd..7f29003239 100644 --- a/usr/src/lib/smbsrv/libfksmbsrv/Makefile.com +++ b/usr/src/lib/smbsrv/libfksmbsrv/Makefile.com @@ -119,7 +119,6 @@ OBJS_FS_SMBSRV = \ smb_tree_connect.o \ smb_unlock_byte_range.o \ smb_user.o \ - smb_vfs.o \ smb_vops.o \ smb_vss.o \ smb_write.o \ @@ -210,8 +209,10 @@ STRIP_STABS = : # Note: need our sys includes _before_ ENVCPPFLAGS, proto etc. +# Also, like Makefile.uts, reset CPPFLAGS CPPFLAGS.first += -I../../../libfakekernel/common CPPFLAGS.first += -I../common +CPPFLAGS = $(CPPFLAGS.first) INCS += -I$(SRC)/uts/common INCS += -I$(SRC)/common/smbsrv diff --git a/usr/src/lib/smbsrv/libfksmbsrv/common/fksmb_cred.c b/usr/src/lib/smbsrv/libfksmbsrv/common/fksmb_cred.c index 7b2bb93581..030c9c6244 100644 --- a/usr/src/lib/smbsrv/libfksmbsrv/common/fksmb_cred.c +++ b/usr/src/lib/smbsrv/libfksmbsrv/common/fksmb_cred.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. 
*/ #include <sys/types.h> @@ -53,6 +53,14 @@ smb_cred_create(smb_token_t *token) return (cr); } +cred_t * +smb_kcred_create(void) +{ + cred_t *cr; + cr = CRED(); + return (cr); +} + void smb_user_setcred(smb_user_t *user, cred_t *cr, uint32_t privileges) { diff --git a/usr/src/lib/smbsrv/libfksmbsrv/common/fksmb_init.c b/usr/src/lib/smbsrv/libfksmbsrv/common/fksmb_init.c index 4f0d6bf299..dc9eff1b44 100644 --- a/usr/src/lib/smbsrv/libfksmbsrv/common/fksmb_init.c +++ b/usr/src/lib/smbsrv/libfksmbsrv/common/fksmb_init.c @@ -141,9 +141,12 @@ fksmbsrv_drv_open(void) int fksmbsrv_drv_close(void) { + smb_server_t *sv; int rc; - rc = smb_server_delete(); + rc = smb_server_lookup(&sv); + if (rc == 0) + rc = smb_server_delete(sv); if (g_init_done != 0) { smb_server_g_fini(); diff --git a/usr/src/lib/smbsrv/libmlsvc/common/smb_share.c b/usr/src/lib/smbsrv/libmlsvc/common/smb_share.c index ccd5b75c12..8a354a7da0 100644 --- a/usr/src/lib/smbsrv/libmlsvc/common/smb_share.c +++ b/usr/src/lib/smbsrv/libmlsvc/common/smb_share.c @@ -770,6 +770,10 @@ smb_shr_modify(smb_share_t *new_si) si->shr_flags &= ~SMB_SHRF_DFSROOT; si->shr_flags |= flag; + flag = (new_si->shr_flags & SMB_SHRF_CA); + si->shr_flags &= ~SMB_SHRF_CA; + si->shr_flags |= flag; + flag = (new_si->shr_flags & SMB_SHRF_FSO); si->shr_flags &= ~SMB_SHRF_FSO; si->shr_flags |= flag; @@ -1822,6 +1826,12 @@ smb_shr_sa_get(sa_share_t share, sa_resource_t resource, smb_share_t *si) free(val); } + val = smb_shr_sa_getprop(opts, SHOPT_CA); + if (val != NULL) { + smb_shr_sa_setflag(val, si, SMB_SHRF_CA); + free(val); + } + val = smb_shr_sa_getprop(opts, SHOPT_FSO); if (val != NULL) { smb_shr_sa_setflag(val, si, SMB_SHRF_FSO); @@ -2611,6 +2621,8 @@ smb_shr_encode(smb_share_t *si, nvlist_t **nvlist) rc |= nvlist_add_string(smb, SHOPT_GUEST, "true"); if ((si->shr_flags & SMB_SHRF_DFSROOT) != 0) rc |= nvlist_add_string(smb, SHOPT_DFSROOT, "true"); + if ((si->shr_flags & SMB_SHRF_CA) != 0) + rc |= nvlist_add_string(smb, SHOPT_CA, "true"); if ((si->shr_flags & SMB_SHRF_FSO) != 0) rc |= nvlist_add_string(smb, SHOPT_FSO, "true"); if ((si->shr_flags & SMB_SHRF_QUOTAS) != 0) diff --git a/usr/src/tools/quick/make-smbsrv b/usr/src/tools/quick/make-smbsrv index 9e2381288d..0aabee3812 100755 --- a/usr/src/tools/quick/make-smbsrv +++ b/usr/src/tools/quick/make-smbsrv @@ -278,6 +278,7 @@ usr/lib/libmlrpc.so.2 usr/lib/smbsrv/libmlsvc.so.1 usr/lib/smbsrv/libsmb.so.1 usr/lib/smbsrv/libsmbns.so.1 +usr/lib/smbsrv/nvlprint usr/lib/smbsrv/smbd usr/sbin/smbadm usr/sbin/smbstat diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 9d63669f58..0b4426db3a 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -1231,7 +1231,6 @@ SMBSRV_OBJS += $(SMBSRV_SHARED_OBJS) \ smb_tree_connect.o \ smb_unlock_byte_range.o \ smb_user.o \ - smb_vfs.o \ smb_vops.o \ smb_vss.o \ smb_write.o \ @@ -2097,7 +2096,7 @@ NXGE_HCALL_OBJS = \ # # Virtio core -VIRTIO_OBJS = virtio.o +VIRTIO_OBJS = virtio_main.o virtio_dma.o # Virtio block driver VIOBLK_OBJS = vioblk.o diff --git a/usr/src/uts/common/fs/smbsrv/smb2_close.c b/usr/src/uts/common/fs/smbsrv/smb2_close.c index e019a3c3da..bbb000f329 100644 --- a/usr/src/uts/common/fs/smbsrv/smb2_close.c +++ b/usr/src/uts/common/fs/smbsrv/smb2_close.c @@ -10,7 +10,7 @@ */ /* - * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + * Copyright 2018 Nexenta Systems, Inc. All rights reserved. 
*/ /* @@ -71,6 +71,8 @@ smb2_close(smb_request_t *sr) } } + if (of->dh_persist) + smb2_dh_setdoc_persistent(of); smb_ofile_close(of, 0); errout: diff --git a/usr/src/uts/common/fs/smbsrv/smb2_create.c b/usr/src/uts/common/fs/smbsrv/smb2_create.c index 6aab3c5127..582efbae28 100644 --- a/usr/src/uts/common/fs/smbsrv/smb2_create.c +++ b/usr/src/uts/common/fs/smbsrv/smb2_create.c @@ -10,7 +10,7 @@ */ /* - * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + * Copyright 2018 Nexenta Systems, Inc. All rights reserved. */ /* @@ -280,7 +280,6 @@ smb2_create(smb_request_t *sr) * many create context types are ignored too. */ op->dh_vers = SMB2_NOT_DURABLE; - op->dh_v2_flags = 0; if ((cctx.cc_in_flags & (CCTX_DH_RECONNECT|CCTX_DH_RECONNECT_V2)) != 0) { @@ -388,6 +387,9 @@ smb2_create(smb_request_t *sr) cctx.cc_in_flags &= ~CCTX_REQUEST_LEASE; } + if ((sr->tid_tree->t_flags & SMB_TREE_CA) == 0) + op->dh_v2_flags &= ~DH_PERSISTENT; + if ((cctx.cc_in_flags & (CCTX_DH_REQUEST|CCTX_DH_REQUEST_V2)) != 0) { if ((cctx.cc_in_flags & CCTX_DH_REQUEST_V2) != 0) @@ -441,15 +443,19 @@ smb2_create(smb_request_t *sr) * non-durable handles in case we get the ioctl * to set "resiliency" on this handle. */ - if (of->f_ftype == SMB_FTYPE_DISK) - smb_ofile_set_persistid(of); + if (of->f_ftype == SMB_FTYPE_DISK) { + if ((op->dh_v2_flags & DH_PERSISTENT) != 0) + smb_ofile_set_persistid_ph(of); + else + smb_ofile_set_persistid_dh(of); + } /* * [MS-SMB2] 3.3.5.9.8 * Handling the SMB2_CREATE_REQUEST_LEASE Create Context */ if ((cctx.cc_in_flags & CCTX_REQUEST_LEASE) != 0) { - status = smb2_lease_create(sr); + status = smb2_lease_create(sr, sr->session->clnt_uuid); if (status != NT_STATUS_SUCCESS) { if (op->action_taken == SMB_OACT_CREATED) { smb_ofile_set_delete_on_close(sr, of); @@ -479,7 +485,8 @@ smb2_create(smb_request_t *sr) if ((cctx.cc_in_flags & (CCTX_DH_REQUEST|CCTX_DH_REQUEST_V2)) != 0 && smb_node_is_file(of->f_node) && - ((op->op_oplock_level == SMB2_OPLOCK_LEVEL_BATCH) || + ((op->dh_v2_flags & DH_PERSISTENT) != 0 || + (op->op_oplock_level == SMB2_OPLOCK_LEVEL_BATCH) || (op->op_oplock_level == SMB2_OPLOCK_LEVEL_LEASE && (op->lease_state & OPLOCK_LEVEL_CACHE_HANDLE) != 0))) { /* @@ -489,8 +496,13 @@ smb2_create(smb_request_t *sr) (void) memcpy(of->dh_create_guid, op->create_guid, UUID_LEN); - /* no persistent handles yet */ - of->dh_persist = B_FALSE; + if ((op->dh_v2_flags & DH_PERSISTENT) != 0) { + if (smb2_dh_make_persistent(sr, of) == 0) { + of->dh_persist = B_TRUE; + } else { + op->dh_v2_flags = 0; + } + } } if (op->dh_vers != SMB2_NOT_DURABLE) { uint32_t msto; @@ -503,8 +515,11 @@ smb2_create(smb_request_t *sr) * the default timeout (in mSec.) */ msto = op->dh_timeout; - if (msto == 0) - msto = smb2_dh_def_timeout; + if (msto == 0) { + msto = (of->dh_persist) ? + smb2_persist_timeout : + smb2_dh_def_timeout; + } if (msto > smb2_dh_max_timeout) msto = smb2_dh_max_timeout; op->dh_timeout = msto; @@ -512,6 +527,7 @@ smb2_create(smb_request_t *sr) } } else { op->dh_vers = SMB2_NOT_DURABLE; + op->dh_v2_flags = 0; } /* diff --git a/usr/src/uts/common/fs/smbsrv/smb2_dispatch.c b/usr/src/uts/common/fs/smbsrv/smb2_dispatch.c index b592dc4c5f..88c4b6d600 100644 --- a/usr/src/uts/common/fs/smbsrv/smb2_dispatch.c +++ b/usr/src/uts/common/fs/smbsrv/smb2_dispatch.c @@ -979,6 +979,16 @@ cmd_done: */ if (!sr->smb2_async && sr->smb2_next_command != 0) goto cmd_start; + + /* + * If we have a durable handle, and this operation updated + * the nvlist, write it out (before smb2_send_reply). 
+ */ + if (sr->dh_nvl_dirty) { + sr->dh_nvl_dirty = B_FALSE; + smb2_dh_update_nvfile(sr); + } + smb2_send_reply(sr); if (sr->smb2_async && sr->smb2_next_command != 0) { MBC_FLUSH(&sr->reply); /* New reply buffer. */ @@ -990,6 +1000,9 @@ cleanup: if (disconnect) smb_session_disconnect(session); + /* + * Do "postwork" for oplock (and maybe other things) + */ if (sr->sr_postwork != NULL) smb2sr_run_postwork(sr); @@ -1728,6 +1741,16 @@ smb2sr_run_postwork(smb_request_t *top_sr) default: ASSERT(0); } + + /* + * If we have a durable handle, and this operation + * updated the nvlist, write it out. + */ + if (post_sr->dh_nvl_dirty) { + post_sr->dh_nvl_dirty = B_FALSE; + smb2_dh_update_nvfile(post_sr); + } + post_sr->sr_state = SMB_REQ_STATE_COMPLETED; smb_request_free(post_sr); } diff --git a/usr/src/uts/common/fs/smbsrv/smb2_durable.c b/usr/src/uts/common/fs/smbsrv/smb2_durable.c index 9ba3dd9c07..7b65924ca4 100644 --- a/usr/src/uts/common/fs/smbsrv/smb2_durable.c +++ b/usr/src/uts/common/fs/smbsrv/smb2_durable.c @@ -21,6 +21,7 @@ #include <sys/cmn_err.h> #include <sys/fcntl.h> #include <sys/nbmlock.h> +#include <sys/sid.h> #include <smbsrv/string.h> #include <smbsrv/smb_kproto.h> #include <smbsrv/smb_fsops.h> @@ -53,6 +54,48 @@ uint32_t smb2_dh_max_timeout = 300 * MILLISEC; /* mSec. */ uint32_t smb2_res_def_timeout = 120 * MILLISEC; /* mSec. */ uint32_t smb2_res_max_timeout = 300 * MILLISEC; /* mSec. */ +uint32_t smb2_persist_timeout = 300 * MILLISEC; /* mSec. */ + +/* Max. size of the file used to store a CA handle. */ +static uint32_t smb2_dh_max_cah_size = 64 * 1024; +static uint32_t smb2_ca_info_version = 1; + +/* + * Want this to have invariant layout on disk, where the + * last two uint32_t values are stored as a uint64_t + */ +struct nvlk { + uint64_t lk_start; + uint64_t lk_len; + /* (lk_pid << 32) | lk_type */ +#ifdef _BIG_ENDIAN + uint32_t lk_pid, lk_type; +#else + uint32_t lk_type, lk_pid; +#endif +}; + +static void smb2_dh_import_share(void *); +static smb_ofile_t *smb2_dh_import_handle(smb_request_t *, smb_node_t *, + uint64_t); +static int smb2_dh_read_nvlist(smb_request_t *, smb_node_t *, struct nvlist **); +static int smb2_dh_import_cred(smb_ofile_t *, char *); + +#define DH_SN_SIZE 24 /* size of DH stream name buffers */ +/* + * Build the stream name used to store a CA handle. + * i.e. ":0123456789abcdef:$CA" + * Note: smb_fsop_create adds the SUNWsmb prefix, + * so we compose the name without the prefix. + */ +static inline void +smb2_dh_make_stream_name(char *buf, size_t buflen, uint64_t id) +{ + ASSERT(buflen >= DH_SN_SIZE); + (void) snprintf(buf, buflen, + ":%016" PRIx64 ":$CA", id); +} + /* * smb_dh_should_save * @@ -80,6 +123,11 @@ uint32_t smb2_res_max_timeout = 300 * MILLISEC; /* mSec. */ * Open.OplockState == Held, and Open.IsDurable is TRUE. * * - Open.IsPersistent is TRUE. + * + * We also deal with some special cases for shutdown of the + * server, session, user, tree (in that order). Other than + * the cases above, shutdown (or forced termination) should + * destroy durable handles. */ boolean_t smb_dh_should_save(smb_ofile_t *of) @@ -87,12 +135,49 @@ smb_dh_should_save(smb_ofile_t *of) ASSERT(MUTEX_HELD(&of->f_mutex)); ASSERT(of->dh_vers != SMB2_NOT_DURABLE); - if (of->f_user->preserve_opens == SMB2_DH_PRESERVE_NONE) + /* SMB service shutting down, destroy DH */ + if (of->f_server->sv_state == SMB_SERVER_STATE_STOPPING) return (B_FALSE); - if (of->f_user->preserve_opens == SMB2_DH_PRESERVE_ALL) + /* + * SMB Session (connection) going away (server up). 
+ * If server initiated disconnect, destroy DH + * If client initiated disconnect, save all DH. + */ + if (of->f_session->s_state == SMB_SESSION_STATE_TERMINATED) + return (B_FALSE); + if (of->f_session->s_state == SMB_SESSION_STATE_DISCONNECTED) return (B_TRUE); + /* + * SMB User logoff, session still "up". + * Action depends on why/how this logoff happened, + * determined based on user->preserve_opens + */ + if (of->f_user->u_state == SMB_USER_STATE_LOGGING_OFF) { + switch (of->f_user->preserve_opens) { + case SMB2_DH_PRESERVE_NONE: + /* Server-initiated */ + return (B_FALSE); + case SMB2_DH_PRESERVE_SOME: + /* Previous session logoff. */ + goto preserve_some; + case SMB2_DH_PRESERVE_ALL: + /* Protocol logoff request */ + return (B_TRUE); + } + } + + /* + * SMB tree disconnecting (user still logged on) + * i.e. when kshare export forces disconnection. + */ + if (of->f_tree->t_state == SMB_TREE_STATE_DISCONNECTING) + return (B_FALSE); + +preserve_some: + /* preserve_opens == SMB2_DH_PRESERVE_SOME */ + switch (of->dh_vers) { case SMB2_RESILIENT: return (B_TRUE); @@ -116,6 +201,1063 @@ smb_dh_should_save(smb_ofile_t *of) } /* + * Is this stream name a CA handle? i.e. + * ":0123456789abcdef:$CA" + */ +static boolean_t +smb2_dh_match_ca_name(const char *name, uint64_t *idp) +{ + static const char suffix[] = ":$CA"; + u_longlong_t ull; + const char *p = name; + char *p2 = NULL; + int len, rc; + + if (*p++ != ':') + return (B_FALSE); + + rc = ddi_strtoull(p, &p2, 16, &ull); + if (rc != 0 || p2 != (p + 16)) + return (B_FALSE); + p += 16; + + len = sizeof (suffix) - 1; + if (strncmp(p, suffix, len) != 0) + return (B_FALSE); + p += len; + + if (*p != '\0') + return (B_FALSE); + + *idp = (uint64_t)ull; + return (B_TRUE); +} + +/* + * smb2_dh_new_ca_share + * + * Called when a new share has ca=true. Find or create the CA dir, + * and start a thread to import persistent handles. + */ +int +smb2_dh_new_ca_share(smb_server_t *sv, smb_kshare_t *shr) +{ + smb_kshare_t *shr2; + smb_request_t *sr; + + ASSERT(STYPE_ISDSK(shr->shr_type)); + + /* + * Need to lookup the kshare again, to get a hold. + * Add a function to just get the hold? + */ + shr2 = smb_kshare_lookup(sv, shr->shr_name); + if (shr2 != shr) + return (EINVAL); + + sr = smb_request_alloc(sv->sv_session, 0); + if (sr == NULL) { + /* shutting down? */ + smb_kshare_release(sv, shr); + return (EINTR); + } + sr->sr_state = SMB_REQ_STATE_SUBMITTED; + + /* + * Mark this share as "busy importing persistent handles" + * so we can hold off tree connect until that's done. + * Will clear and wakeup below. + */ + mutex_enter(&shr->shr_mutex); + shr->shr_import_busy = sr; + mutex_exit(&shr->shr_mutex); + + /* + * Start a taskq job to import any CA handles. + * The hold on the kshare is given to this job, + * which releases it when it's done. + */ + sr->arg.tcon.si = shr; /* hold from above */ + (void) taskq_dispatch( + sv->sv_worker_pool, + smb2_dh_import_share, sr, TQ_SLEEP); + + return (0); +} + +int smb2_dh_import_delay = 0; + +static void +smb2_dh_import_share(void *arg) +{ + smb_request_t *sr = arg; + smb_kshare_t *shr = sr->arg.tcon.si; + smb_node_t *snode; + cred_t *kcr = zone_kcred(); + smb_streaminfo_t *str_info = NULL; + uint64_t id; + smb_node_t *str_node; + smb_odir_t *od = NULL; + smb_ofile_t *of; + int rc; + boolean_t eof; + + sr->sr_state = SMB_REQ_STATE_ACTIVE; + + if (smb2_dh_import_delay > 0) + delay(SEC_TO_TICK(smb2_dh_import_delay)); + + /* + * Borrow the server's "root" user. 
+ * + * This takes the place of smb_session_lookup_ssnid() + * that would happen in smb2_dispatch for a normal SR. + * As usual, this hold is released in smb_request_free. + */ + sr->uid_user = sr->sr_server->sv_rootuser; + smb_user_hold_internal(sr->uid_user); + sr->user_cr = sr->uid_user->u_cred; + + /* + * Create a temporary tree connect + */ + sr->arg.tcon.path = shr->shr_name; + sr->tid_tree = smb_tree_alloc(sr, shr, shr->shr_root_node, + ACE_ALL_PERMS, 0); + if (sr->tid_tree == NULL) { + cmn_err(CE_NOTE, "smb2_dh_import_share: " + "failed connect share <%s>", shr->shr_name); + goto out; + } + snode = sr->tid_tree->t_snode; + + /* + * Get the buffers we'll use to read CA handle data. + * Stash in sr_request_buf for smb2_dh_import_handle(). + * Also a buffer for the stream name info. + */ + sr->sr_req_length = smb2_dh_max_cah_size; + sr->sr_request_buf = kmem_alloc(sr->sr_req_length, KM_SLEEP); + str_info = kmem_alloc(sizeof (smb_streaminfo_t), KM_SLEEP); + + /* + * Open the ext. attr dir under the share root and + * import CA handles for this share. + */ + if (smb_odir_openat(sr, snode, &od) != 0) { + cmn_err(CE_NOTE, "Share [%s] CA import, no xattr dir?", + shr->shr_name); + goto out; + } + + eof = B_FALSE; + do { + /* + * If the kshare gets unshared before we finish, + * bail out so we don't hold things up. + */ + if (shr->shr_flags & SMB_SHRF_REMOVED) + break; + + /* + * Read a stream name and info + */ + rc = smb_odir_read_streaminfo(sr, od, str_info, &eof); + if ((rc != 0) || (eof)) + break; + + /* + * Skip anything not a CA handle. + */ + if (!smb2_dh_match_ca_name(str_info->si_name, &id)) { + continue; + } + + /* + * Lookup stream node and import + */ + str_node = NULL; + rc = smb_fsop_lookup_name(sr, kcr, SMB_CASE_SENSITIVE, + snode, snode, str_info->si_name, &str_node); + if (rc != 0) { + cmn_err(CE_NOTE, "Share [%s] CA import, " + "lookup <%s> failed rc=%d", + shr->shr_name, str_info->si_name, rc); + continue; + } + of = smb2_dh_import_handle(sr, str_node, id); + smb_node_release(str_node); + if (of != NULL) { + smb_ofile_release(of); + of = NULL; + } + sr->fid_ofile = NULL; + + } while (!eof); + +out: + if (od != NULL) { + smb_odir_close(od); + smb_odir_release(od); + } + + if (str_info != NULL) + kmem_free(str_info, sizeof (smb_streaminfo_t)); + /* Let smb_request_free clean up sr->sr_request_buf */ + + /* + * We did a (temporary, internal) tree connect above, + * which we need to undo before we return. Note that + * smb_request_free will do the final release of + * sr->tid_tree, sr->uid_user + */ + if (sr->tid_tree != NULL) + smb_tree_disconnect(sr->tid_tree, B_FALSE); + + /* + * Wake up any waiting tree connect(s). + * See smb_tree_connect_disk(). + */ + mutex_enter(&shr->shr_mutex); + shr->shr_import_busy = NULL; + cv_broadcast(&shr->shr_cv); + mutex_exit(&shr->shr_mutex); + + smb_kshare_release(sr->sr_server, shr); + smb_request_free(sr); +} + +/* + * This returns the new ofile mostly for dtrace. 
+ */ +static smb_ofile_t * +smb2_dh_import_handle(smb_request_t *sr, smb_node_t *str_node, + uint64_t persist_id) +{ + uint8_t client_uuid[UUID_LEN]; + smb_tree_t *tree = sr->tid_tree; + smb_arg_open_t *op = &sr->arg.open; + smb_pathname_t *pn = &op->fqi.fq_path; + cred_t *kcr = zone_kcred(); + struct nvlist *nvl = NULL; + char *sidstr = NULL; + smb_ofile_t *of = NULL; + smb_attr_t *pa; + boolean_t did_open = B_FALSE; + boolean_t have_lease = B_FALSE; + hrtime_t hrt; + uint64_t *u64p; + uint64_t u64; + uint32_t u32; + uint32_t status; + char *s; + uint8_t *u8p; + uint_t alen; + int rc; + + /* + * While we're called with arg.tcon, we now want to use + * smb_arg_open for the rest of import, so clear it. + */ + bzero(op, sizeof (*op)); + op->create_disposition = FILE_OPEN; + + /* + * Read and unpack the NVL + */ + rc = smb2_dh_read_nvlist(sr, str_node, &nvl); + if (rc != 0) + return (NULL); + + /* + * Known CA info version? + */ + u32 = 0; + rc = nvlist_lookup_uint32(nvl, "info_version", &u32); + if (rc != 0 || u32 != smb2_ca_info_version) { + cmn_err(CE_NOTE, "CA import (%s/%s) bad vers=%d", + tree->t_resource, str_node->od_name, u32); + goto errout; + } + + /* + * The persist ID in the nvlist should match the one + * encoded in the file name. (not enforced) + */ + u64 = 0; + rc = nvlist_lookup_uint64(nvl, "file_persistid", &u64); + if (rc != 0 || u64 != persist_id) { + cmn_err(CE_WARN, "CA import (%s/%s) bad id=%016" PRIx64, + tree->t_resource, str_node->od_name, u64); + /* goto errout? (allow) */ + } + + /* + * Does it belong in the share being imported? + */ + s = NULL; + rc = nvlist_lookup_string(nvl, "share_name", &s); + if (rc != 0) { + cmn_err(CE_NOTE, "CA import (%s/%s) no share_name", + tree->t_resource, str_node->od_name); + goto errout; + } + if (smb_strcasecmp(s, tree->t_sharename, 0) != 0) { + /* Normal (not an error) */ +#ifdef DEBUG + cmn_err(CE_NOTE, "CA import (%s/%s) other share", + tree->t_resource, str_node->od_name); +#endif + goto errout; + } + + /* + * Get the path name (for lookup) + */ + rc = nvlist_lookup_string(nvl, "path_name", &pn->pn_path); + if (rc != 0) { + cmn_err(CE_NOTE, "CA import (%s/%s) no path_name", + tree->t_resource, str_node->od_name); + goto errout; + } + + /* + * owner sid + */ + rc = nvlist_lookup_string(nvl, "owner_sid", &sidstr); + if (rc != 0) { + cmn_err(CE_NOTE, "CA import (%s/%s) no owner_sid", + tree->t_resource, str_node->od_name); + goto errout; + } + + /* + * granted access + */ + rc = nvlist_lookup_uint32(nvl, + "granted_access", &op->desired_access); + if (rc != 0) { + cmn_err(CE_NOTE, "CA import (%s/%s) no granted_access", + tree->t_resource, str_node->od_name); + goto errout; + } + + /* + * share access + */ + rc = nvlist_lookup_uint32(nvl, + "share_access", &op->share_access); + if (rc != 0) { + cmn_err(CE_NOTE, "CA import (%s/%s) no share_access", + tree->t_resource, str_node->od_name); + goto errout; + } + + /* + * create options + */ + rc = nvlist_lookup_uint32(nvl, + "create_options", &op->create_options); + if (rc != 0) { + cmn_err(CE_NOTE, "CA import (%s/%s) no create_options", + tree->t_resource, str_node->od_name); + goto errout; + } + + /* + * create guid (client-assigned) + */ + alen = UUID_LEN; + u8p = NULL; + rc = nvlist_lookup_uint8_array(nvl, "file_guid", &u8p, &alen); + if (rc != 0 || alen != UUID_LEN) { + cmn_err(CE_NOTE, "CA import (%s/%s) bad file_guid", + tree->t_resource, str_node->od_name); + goto errout; + } + bcopy(u8p, op->create_guid, UUID_LEN); + + /* + * client uuid (identifies the client) + */ + alen = 
UUID_LEN; + u8p = NULL; + rc = nvlist_lookup_uint8_array(nvl, "client_uuid", &u8p, &alen); + if (rc != 0 || alen != UUID_LEN) { + cmn_err(CE_NOTE, "CA import (%s/%s) no client_uuid", + tree->t_resource, str_node->od_name); + goto errout; + } + bcopy(u8p, client_uuid, UUID_LEN); + + /* + * Lease key (optional) + */ + alen = SMB_LEASE_KEY_SZ; + u8p = NULL; + rc = nvlist_lookup_uint8_array(nvl, "lease_uuid", &u8p, &alen); + if (rc == 0) { + bcopy(u8p, op->lease_key, UUID_LEN); + (void) nvlist_lookup_uint32(nvl, + "lease_state", &op->lease_state); + (void) nvlist_lookup_uint16(nvl, + "lease_epoch", &op->lease_epoch); + (void) nvlist_lookup_uint16(nvl, + "lease_version", &op->lease_version); + have_lease = B_TRUE; + } else { + (void) nvlist_lookup_uint32(nvl, + "oplock_state", &op->op_oplock_state); + } + + /* + * Done getting what we need from the NV list. + * (re)open the file + */ + status = smb_common_open(sr); + if (status != 0) { + cmn_err(CE_NOTE, "CA import (%s/%s) open failed 0x%x", + tree->t_resource, str_node->od_name, status); + (void) smb_node_set_delete_on_close(str_node, kcr, 0); + goto errout; + } + of = sr->fid_ofile; + did_open = B_TRUE; + + /* + * Now restore the rest of the SMB2 level state. + * See smb2_create after smb_common_open + */ + + /* + * Setup of->f_cr with owner SID + */ + rc = smb2_dh_import_cred(of, sidstr); + if (rc != 0) { + cmn_err(CE_NOTE, "CA import (%s/%s) import cred failed", + tree->t_resource, str_node->od_name); + goto errout; + } + + /* + * Use the persist ID we previously assigned. + * Like smb_ofile_set_persistid_ph() + */ + rc = smb_ofile_insert_persistid(of, persist_id); + if (rc != 0) { + cmn_err(CE_NOTE, "CA import (%s/%s) " + "insert_persistid rc=%d", + tree->t_resource, str_node->od_name, rc); + goto errout; + } + + /* + * Like smb2_lease_create() + * + * Lease state is stored in each persistent handle, but + * only one handle has the state we want. As we import + * each handle, "upgrade" the lease if the handle we're + * importing has a "better" lease state (higher epoch or + * more cache rights). After all handles are imported, + * that will get the lease to the right state. + */ + if (have_lease) { + smb_lease_t *ls; + status = smb2_lease_create(sr, client_uuid); + if (status != 0) { + cmn_err(CE_NOTE, "CA import (%s/%s) get lease 0x%x", + tree->t_resource, str_node->od_name, status); + goto errout; + } + ls = of->f_lease; + + /* Use most current "epoch". 
*/ + mutex_enter(&ls->ls_mutex); + if (ls->ls_epoch < op->lease_epoch) + ls->ls_epoch = op->lease_epoch; + mutex_exit(&ls->ls_mutex); + + /* + * Get the lease (and oplock) + * uses op->lease_state + */ + op->op_oplock_level = SMB2_OPLOCK_LEVEL_LEASE; + smb2_lease_acquire(sr); + + } else { + /* + * No lease; maybe get an oplock + * uses: op->op_oplock_level + */ + if (op->op_oplock_state & OPLOCK_LEVEL_BATCH) { + op->op_oplock_level = SMB2_OPLOCK_LEVEL_BATCH; + } else if (op->op_oplock_state & OPLOCK_LEVEL_ONE) { + op->op_oplock_level = SMB2_OPLOCK_LEVEL_EXCLUSIVE; + } else if (op->op_oplock_state & OPLOCK_LEVEL_TWO) { + op->op_oplock_level = SMB2_OPLOCK_LEVEL_II; + } else { + op->op_oplock_level = SMB2_OPLOCK_LEVEL_NONE; + } + smb2_oplock_acquire(sr); + } + + /* + * Byte range locks + */ + alen = 0; + u64p = NULL; + if (nvlist_lookup_uint64_array(nvl, "locks", &u64p, &alen) == 0) { + uint_t i; + uint_t nlocks = alen / 3; + struct nvlk *nlp; + + nlp = (struct nvlk *)u64p; + for (i = 0; i < nlocks; i++) { + status = smb_lock_range( + sr, + nlp->lk_start, + nlp->lk_len, + nlp->lk_pid, + nlp->lk_type, + 0); + if (status != 0) { + cmn_err(CE_NOTE, "CA import (%s/%s) " + "get lock %d failed 0x%x", + tree->t_resource, + str_node->od_name, + i, status); + } + nlp++; + } + } + alen = SMB_OFILE_LSEQ_MAX; + u8p = NULL; + if (nvlist_lookup_uint8_array(nvl, "lockseq", &u8p, &alen) == 0) { + if (alen != SMB_OFILE_LSEQ_MAX) { + cmn_err(CE_NOTE, "CA import (%s/%s) " + "get lockseq bad len=%d", + tree->t_resource, + str_node->od_name, + alen); + } else { + mutex_enter(&of->f_mutex); + bcopy(u8p, of->f_lock_seq, alen); + mutex_exit(&of->f_mutex); + } + } + + /* + * Optional "sticky" times (set pending attributes) + */ + mutex_enter(&of->f_mutex); + pa = &of->f_pending_attr; + if (nvlist_lookup_hrtime(nvl, "atime", &hrt) == 0) { + hrt2ts(hrt, &pa->sa_vattr.va_atime); + pa->sa_mask |= SMB_AT_ATIME; + } + if (nvlist_lookup_hrtime(nvl, "mtime", &hrt) == 0) { + hrt2ts(hrt, &pa->sa_vattr.va_mtime); + pa->sa_mask |= SMB_AT_MTIME; + } + if (nvlist_lookup_hrtime(nvl, "ctime", &hrt) == 0) { + hrt2ts(hrt, &pa->sa_vattr.va_ctime); + pa->sa_mask |= SMB_AT_CTIME; + } + mutex_exit(&of->f_mutex); + + /* + * Make durable and persistent. + * See smb2_dh_make_persistent() + */ + of->dh_vers = SMB2_DURABLE_V2; + bcopy(op->create_guid, of->dh_create_guid, UUID_LEN); + of->dh_persist = B_TRUE; + of->dh_nvfile = str_node; + smb_node_ref(str_node); + of->dh_nvlist = nvl; + nvl = NULL; + + /* + * Now make it state orphaned... + * See smb_ofile_drop(), then + * smb_ofile_save_dh() + */ + mutex_enter(&of->f_mutex); + of->f_state = SMB_OFILE_STATE_SAVE_DH; + of->dh_timeout_offset = MSEC2NSEC(smb2_persist_timeout); + mutex_exit(&of->f_mutex); + + /* + * Finished! 
+ */ + return (of); + +errout: + if (did_open) { + smb_ofile_close(of, 0); + smb_ofile_release(of); + } else { + ASSERT(of == NULL); + } + + if (nvl != NULL) + nvlist_free(nvl); + + return (NULL); +} + +static int +smb2_dh_read_nvlist(smb_request_t *sr, smb_node_t *node, + struct nvlist **nvlpp) +{ + smb_attr_t attr; + iovec_t iov; + uio_t uio; + smb_kshare_t *shr = sr->arg.tcon.si; + cred_t *kcr = zone_kcred(); + size_t flen; + int rc; + + bzero(&attr, sizeof (attr)); + attr.sa_mask = SMB_AT_SIZE; + rc = smb_node_getattr(NULL, node, kcr, NULL, &attr); + if (rc != 0) { + cmn_err(CE_NOTE, "CA import (%s/%s) getattr rc=%d", + shr->shr_path, node->od_name, rc); + return (rc); + } + + if (attr.sa_vattr.va_size < 4 || + attr.sa_vattr.va_size > sr->sr_req_length) { + cmn_err(CE_NOTE, "CA import (%s/%s) bad size=%" PRIu64, + shr->shr_path, node->od_name, + (uint64_t)attr.sa_vattr.va_size); + return (EINVAL); + } + flen = (size_t)attr.sa_vattr.va_size; + + bzero(&uio, sizeof (uio)); + iov.iov_base = sr->sr_request_buf; + iov.iov_len = flen; + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_resid = flen; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_extflg = UIO_COPY_DEFAULT; + rc = smb_fsop_read(sr, kcr, node, NULL, &uio); + if (rc != 0) { + cmn_err(CE_NOTE, "CA import (%s/%s) read, rc=%d", + shr->shr_path, node->od_name, rc); + return (rc); + } + if (uio.uio_resid != 0) { + cmn_err(CE_NOTE, "CA import (%s/%s) short read", + shr->shr_path, node->od_name); + return (EIO); + } + + rc = nvlist_unpack(sr->sr_request_buf, flen, nvlpp, KM_SLEEP); + if (rc != 0) { + cmn_err(CE_NOTE, "CA import (%s/%s) unpack, rc=%d", + shr->shr_path, node->od_name, rc); + return (rc); + } + + return (0); +} + +/* + * Setup a vestigial credential in of->f_cr just good enough for + * smb_is_same_user to determine if the caller owned this ofile. + * At reconnect, of->f_cr will be replaced with the caller's. + */ +static int +smb2_dh_import_cred(smb_ofile_t *of, char *sidstr) +{ +#ifdef _FAKE_KERNEL + _NOTE(ARGUNUSED(sidstr)) + /* fksmbd doesn't have real credentials. */ + of->f_cr = CRED(); + crhold(of->f_cr); +#else + char tmpstr[SMB_SID_STRSZ]; + ksid_t ksid; + cred_t *cr, *oldcr; + int rc; + + (void) strlcpy(tmpstr, sidstr, sizeof (tmpstr)); + bzero(&ksid, sizeof (ksid)); + + rc = smb_sid_splitstr(tmpstr, &ksid.ks_rid); + if (rc != 0) + return (rc); + cr = crget(); + + ksid.ks_domain = ksid_lookupdomain(tmpstr); + crsetsid(cr, &ksid, KSID_USER); + ksiddomain_hold(ksid.ks_domain); + crsetsid(cr, &ksid, KSID_OWNER); + + /* + * Just to avoid leaving the KSID_GROUP slot NULL, + * put the "everyone" SID there (S-1-1-0). + */ + ksid.ks_domain = ksid_lookupdomain("S-1-1"); + ksid.ks_rid = 0; + crsetsid(cr, &ksid, KSID_GROUP); + + oldcr = of->f_cr; + of->f_cr = cr; + if (oldcr != NULL) + crfree(oldcr); +#endif + + return (0); +} + +/* + * Set Delete-on-Close (DoC) on the persistent state file so it will be + * removed when the last ref. goes away (in smb2_dh_close_persistent). + * + * This is called in just two places: + * (1) SMB2_close request -- client tells us to destroy the handle. + * (2) smb2_dh_expire -- client has forgotten about this handle. + * All other (server-initiated) close calls should leave these + * persistent state files in the file system. 
+ */ +void +smb2_dh_setdoc_persistent(smb_ofile_t *of) +{ + smb_node_t *strnode; + uint32_t status; + + mutex_enter(&of->dh_nvlock); + if ((strnode = of->dh_nvfile) != NULL) + smb_node_ref(strnode); + mutex_exit(&of->dh_nvlock); + + if (strnode != NULL) { + status = smb_node_set_delete_on_close(strnode, + zone_kcred(), SMB_CASE_SENSITIVE); + if (status != 0) { + cmn_err(CE_WARN, "Can't set DoC on CA file: %s", + strnode->od_name); + DTRACE_PROBE1(rm__ca__err, smb_ofile_t *, of); + } + smb_node_release(strnode); + } +} + +/* + * During ofile close, free the persistent handle state nvlist and + * drop our reference to the state file node (which may unlink it + * if smb2_dh_setdoc_persistent was called). + */ +void +smb2_dh_close_persistent(smb_ofile_t *of) +{ + smb_node_t *strnode; + struct nvlist *nvl; + + /* + * Clear out nvlist and stream linkage + */ + mutex_enter(&of->dh_nvlock); + strnode = of->dh_nvfile; + of->dh_nvfile = NULL; + nvl = of->dh_nvlist; + of->dh_nvlist = NULL; + mutex_exit(&of->dh_nvlock); + + if (nvl != NULL) + nvlist_free(nvl); + + if (strnode != NULL) + smb_node_release(strnode); +} + +/* + * Make this durable handle persistent. + * If we succeed, set of->dh_persist = TRUE. + */ +int +smb2_dh_make_persistent(smb_request_t *sr, smb_ofile_t *of) +{ + char fname[DH_SN_SIZE]; + char sidstr[SMB_SID_STRSZ]; + smb_attr_t attr; + smb_arg_open_t *op = &sr->arg.open; + cred_t *kcr = zone_kcred(); + smb_node_t *dnode = of->f_tree->t_snode; + smb_node_t *fnode = NULL; + ksid_t *ksid; + int rc; + + ASSERT(of->dh_nvfile == NULL); + + /* + * Create the persistent handle nvlist file. + * It's a named stream in the share root. + */ + smb2_dh_make_stream_name(fname, sizeof (fname), of->f_persistid); + + bzero(&attr, sizeof (attr)); + attr.sa_mask = SMB_AT_TYPE | SMB_AT_MODE | SMB_AT_SIZE; + attr.sa_vattr.va_type = VREG; + attr.sa_vattr.va_mode = 0640; + attr.sa_vattr.va_size = 4; + rc = smb_fsop_create(sr, kcr, dnode, fname, &attr, &fnode); + if (rc != 0) + return (rc); + + mutex_enter(&of->dh_nvlock); + + /* fnode is held. 
rele in smb2_dh_close_persistent */ + of->dh_nvfile = fnode; + (void) nvlist_alloc(&of->dh_nvlist, NV_UNIQUE_NAME, KM_SLEEP); + + /* + * Want the ksid as a string + */ + ksid = crgetsid(of->f_user->u_cred, KSID_USER); + (void) snprintf(sidstr, sizeof (sidstr), "%s-%u", + ksid->ks_domain->kd_name, ksid->ks_rid); + + /* + * Fill in the fixed parts of the nvlist + */ + (void) nvlist_add_uint32(of->dh_nvlist, + "info_version", smb2_ca_info_version); + (void) nvlist_add_string(of->dh_nvlist, + "owner_sid", sidstr); + (void) nvlist_add_string(of->dh_nvlist, + "share_name", of->f_tree->t_sharename); + (void) nvlist_add_uint64(of->dh_nvlist, + "file_persistid", of->f_persistid); + (void) nvlist_add_uint8_array(of->dh_nvlist, + "file_guid", of->dh_create_guid, UUID_LEN); + (void) nvlist_add_string(of->dh_nvlist, + "client_ipaddr", sr->session->ip_addr_str); + (void) nvlist_add_uint8_array(of->dh_nvlist, + "client_uuid", sr->session->clnt_uuid, UUID_LEN); + (void) nvlist_add_string(of->dh_nvlist, + "path_name", op->fqi.fq_path.pn_path); + (void) nvlist_add_uint32(of->dh_nvlist, + "granted_access", of->f_granted_access); + (void) nvlist_add_uint32(of->dh_nvlist, + "share_access", of->f_share_access); + (void) nvlist_add_uint32(of->dh_nvlist, + "create_options", of->f_create_options); + if (of->f_lease != NULL) { + smb_lease_t *ls = of->f_lease; + (void) nvlist_add_uint8_array(of->dh_nvlist, + "lease_uuid", ls->ls_key, 16); + (void) nvlist_add_uint32(of->dh_nvlist, + "lease_state", ls->ls_state); + (void) nvlist_add_uint16(of->dh_nvlist, + "lease_epoch", ls->ls_epoch); + (void) nvlist_add_uint16(of->dh_nvlist, + "lease_version", ls->ls_version); + } else { + (void) nvlist_add_uint32(of->dh_nvlist, + "oplock_state", of->f_oplock.og_state); + } + mutex_exit(&of->dh_nvlock); + + smb2_dh_update_locks(sr, of); + + /* Tell sr update nvlist file */ + sr->dh_nvl_dirty = B_TRUE; + + return (0); +} + +void +smb2_dh_update_nvfile(smb_request_t *sr) +{ + smb_attr_t attr; + iovec_t iov; + uio_t uio; + smb_ofile_t *of = sr->fid_ofile; + cred_t *kcr = zone_kcred(); + char *buf = NULL; + size_t buflen = 0; + uint32_t wcnt; + int rc; + + if (of == NULL || of->dh_persist == B_FALSE) + return; + + mutex_enter(&of->dh_nvlock); + if (of->dh_nvlist == NULL || of->dh_nvfile == NULL) { + mutex_exit(&of->dh_nvlock); + return; + } + + rc = nvlist_size(of->dh_nvlist, &buflen, NV_ENCODE_XDR); + if (rc != 0) + goto out; + buf = kmem_zalloc(buflen, KM_SLEEP); + + rc = nvlist_pack(of->dh_nvlist, &buf, &buflen, + NV_ENCODE_XDR, KM_SLEEP); + if (rc != 0) + goto out; + + bzero(&attr, sizeof (attr)); + attr.sa_mask = SMB_AT_SIZE; + attr.sa_vattr.va_size = buflen; + rc = smb_node_setattr(sr, of->dh_nvfile, kcr, NULL, &attr); + if (rc != 0) + goto out; + + bzero(&uio, sizeof (uio)); + iov.iov_base = (void *) buf; + iov.iov_len = buflen; + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_resid = buflen; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_extflg = UIO_COPY_DEFAULT; + rc = smb_fsop_write(sr, kcr, of->dh_nvfile, + NULL, &uio, &wcnt, 0); + if (rc == 0 && wcnt != buflen) + rc = EIO; + +out: + mutex_exit(&of->dh_nvlock); + + if (rc != 0) { + cmn_err(CE_WARN, + "clnt(%s) failed to update persistent handle, rc=%d", + sr->session->ip_addr_str, rc); + } + + if (buf != NULL) { + kmem_free(buf, buflen); + } +} + +/* + * Called after f_oplock (and lease) changes + * If lease, update: lease_state, lease_epoch + * else (oplock) update: oplock_state + */ +void +smb2_dh_update_oplock(smb_request_t *sr, smb_ofile_t *of) +{ + smb_lease_t *ls; + + 
mutex_enter(&of->dh_nvlock); + if (of->dh_nvlist == NULL) { + mutex_exit(&of->dh_nvlock); + return; + } + + if (of->f_lease != NULL) { + ls = of->f_lease; + (void) nvlist_add_uint32(of->dh_nvlist, + "lease_state", ls->ls_state); + (void) nvlist_add_uint16(of->dh_nvlist, + "lease_epoch", ls->ls_epoch); + } else { + (void) nvlist_add_uint32(of->dh_nvlist, + "oplock_state", of->f_oplock.og_state); + } + mutex_exit(&of->dh_nvlock); + + sr->dh_nvl_dirty = B_TRUE; +} + +/* + * Save locks from this ofile as an array of uint64_t, where the + * elements are triplets: (start, length, (pid << 32) | type) + * Note pid should always be zero for SMB2, so we could use + * that 32-bit spot for something else if needed. + */ +void +smb2_dh_update_locks(smb_request_t *sr, smb_ofile_t *of) +{ + uint8_t lseq[SMB_OFILE_LSEQ_MAX]; + smb_node_t *node = of->f_node; + smb_llist_t *llist = &node->n_lock_list; + size_t vec_sz; // storage size + uint_t my_cnt = 0; + uint64_t *vec = NULL; + struct nvlk *nlp; + smb_lock_t *lock; + + smb_llist_enter(llist, RW_READER); + vec_sz = (llist->ll_count + 1) * sizeof (struct nvlk); + vec = kmem_alloc(vec_sz, KM_SLEEP); + nlp = (struct nvlk *)vec; + for (lock = smb_llist_head(llist); + lock != NULL; + lock = smb_llist_next(llist, lock)) { + if (lock->l_file != of) + continue; + nlp->lk_start = lock->l_start; + nlp->lk_len = lock->l_length; + nlp->lk_pid = lock->l_pid; + nlp->lk_type = lock->l_type; + nlp++; + my_cnt++; + } + smb_llist_exit(llist); + + mutex_enter(&of->f_mutex); + bcopy(of->f_lock_seq, lseq, sizeof (lseq)); + mutex_exit(&of->f_mutex); + + mutex_enter(&of->dh_nvlock); + if (of->dh_nvlist != NULL) { + + (void) nvlist_add_uint64_array(of->dh_nvlist, + "locks", vec, my_cnt * 3); + + (void) nvlist_add_uint8_array(of->dh_nvlist, + "lockseq", lseq, sizeof (lseq)); + } + mutex_exit(&of->dh_nvlock); + + kmem_free(vec, vec_sz); + + sr->dh_nvl_dirty = B_TRUE; +} + +/* + * Save "sticky" times + */ +void +smb2_dh_update_times(smb_request_t *sr, smb_ofile_t *of, smb_attr_t *attr) +{ + hrtime_t t; + + mutex_enter(&of->dh_nvlock); + if (of->dh_nvlist == NULL) { + mutex_exit(&of->dh_nvlock); + return; + } + + if (attr->sa_mask & SMB_AT_ATIME) { + t = ts2hrt(&attr->sa_vattr.va_atime); + (void) nvlist_add_hrtime(of->dh_nvlist, "atime", t); + } + if (attr->sa_mask & SMB_AT_MTIME) { + t = ts2hrt(&attr->sa_vattr.va_mtime); + (void) nvlist_add_hrtime(of->dh_nvlist, "mtime", t); + } + if (attr->sa_mask & SMB_AT_CTIME) { + t = ts2hrt(&attr->sa_vattr.va_ctime); + (void) nvlist_add_hrtime(of->dh_nvlist, "ctime", t); + } + mutex_exit(&of->dh_nvlock); + + sr->dh_nvl_dirty = B_TRUE; +} + + +/* * Requirements for ofile found during reconnect (MS-SMB2 3.3.5.9.7): * - security descriptor must match provided descriptor * @@ -332,6 +1474,8 @@ smb2_dh_expire(void *arg) { smb_ofile_t *of = (smb_ofile_t *)arg; + if (of->dh_persist) + smb2_dh_setdoc_persistent(of); smb_ofile_close(of, 0); smb_ofile_release(of); } @@ -383,9 +1527,96 @@ smb2_durable_timers(smb_server_t *sv) } /* + * This is called when we're about to add a new open to some node. + * If we still have orphaned durable handles on this node, let's + * assume the client has lost interest in those and close them, + * otherwise we might conflict with our own orphaned handles. + * + * We need this because we import persistent handles "speculatively" + * during share import (before the client ever asks for reconnect). 
+ * That allows us to avoid any need for a "create blackout" (or + * "grace period") because the imported handles prevent unwanted + * conflicting opens from other clients. However, if some client + * "forgets" about a persistent handle (*cough* Hyper-V) and tries + * a new (conflicting) open instead of a reconnect, that might + * fail unless we expire our orphaned durables handle first. + * + * Logic similar to smb_node_open_check() + */ +void +smb2_dh_close_my_orphans(smb_request_t *sr, smb_ofile_t *new_of) +{ + smb_node_t *node = new_of->f_node; + smb_ofile_t *of; + + SMB_NODE_VALID(node); + + smb_llist_enter(&node->n_ofile_list, RW_READER); + for (of = smb_llist_head(&node->n_ofile_list); + of != NULL; + of = smb_llist_next(&node->n_ofile_list, of)) { + + /* Same client? */ + if (of->f_lease != NULL && + bcmp(sr->session->clnt_uuid, + of->f_lease->ls_clnt, 16) != 0) + continue; + + if (!smb_is_same_user(sr->user_cr, of->f_cr)) + continue; + + mutex_enter(&of->f_mutex); + if (of->f_state == SMB_OFILE_STATE_ORPHANED) { + of->f_state = SMB_OFILE_STATE_EXPIRED; + /* inline smb_ofile_hold_internal() */ + of->f_refcnt++; + smb_llist_post(&node->n_ofile_list, + of, smb2_dh_expire); + } + mutex_exit(&of->f_mutex); + } + + smb_llist_exit(&node->n_ofile_list); +} + +/* + * Called for each orphaned DH during shutdown. + * Clean out any in-memory state, but leave any + * on-disk persistent handle state in place. + */ +static void +smb2_dh_cleanup(void *arg) +{ + smb_ofile_t *of = (smb_ofile_t *)arg; + smb_node_t *strnode; + struct nvlist *nvl; + + /* + * Intentionally skip smb2_dh_close_persistent by + * clearing dh_nvfile before smb_ofile_close(). + */ + mutex_enter(&of->dh_nvlock); + strnode = of->dh_nvfile; + of->dh_nvfile = NULL; + nvl = of->dh_nvlist; + of->dh_nvlist = NULL; + mutex_exit(&of->dh_nvlock); + + if (nvl != NULL) + nvlist_free(nvl); + + if (strnode != NULL) + smb_node_release(strnode); + + smb_ofile_close(of, 0); + smb_ofile_release(of); +} + +/* * Clean out durable handles during shutdown. - * Like, smb2_durable_timers but expire all, - * and make sure the hash buckets are empty. + * + * Like, smb2_durable_timers but cleanup only in-memory state, + * and leave any persistent state there for later reconnect. 
*/ void smb2_dh_shutdown(smb_server_t *sv) @@ -410,7 +1641,7 @@ smb2_dh_shutdown(smb_server_t *sv) of->f_state = SMB_OFILE_STATE_EXPIRED; /* inline smb_ofile_hold_internal() */ of->f_refcnt++; - smb_llist_post(bucket, of, smb2_dh_expire); + smb_llist_post(bucket, of, smb2_dh_cleanup); break; default: break; diff --git a/usr/src/uts/common/fs/smbsrv/smb2_lease.c b/usr/src/uts/common/fs/smbsrv/smb2_lease.c index d2bf4805b3..95d7d9c7f1 100644 --- a/usr/src/uts/common/fs/smbsrv/smb2_lease.c +++ b/usr/src/uts/common/fs/smbsrv/smb2_lease.c @@ -122,11 +122,10 @@ smb_hash_uuid(const uint8_t *uuid) * Handling the SMB2_CREATE_REQUEST_LEASE Create Context */ uint32_t -smb2_lease_create(smb_request_t *sr) +smb2_lease_create(smb_request_t *sr, uint8_t *clnt) { smb_arg_open_t *op = &sr->arg.open; uint8_t *key = op->lease_key; - uint8_t *clnt = sr->session->clnt_uuid; smb_ofile_t *of = sr->fid_ofile; smb_hash_t *ht = sr->sr_server->sv_lease_ht; smb_llist_t *bucket; diff --git a/usr/src/uts/common/fs/smbsrv/smb2_lock.c b/usr/src/uts/common/fs/smbsrv/smb2_lock.c index c6e8236cce..cc05f96e75 100644 --- a/usr/src/uts/common/fs/smbsrv/smb2_lock.c +++ b/usr/src/uts/common/fs/smbsrv/smb2_lock.c @@ -142,6 +142,10 @@ smb2_lock(smb_request_t *sr) status = smb2_locks(sr); } + if (sr->fid_ofile->dh_persist) { + smb2_dh_update_locks(sr, sr->fid_ofile); + } + errout: sr->smb2_status = status; DTRACE_SMB2_DONE(op__Lock, smb_request_t *, sr); diff --git a/usr/src/uts/common/fs/smbsrv/smb2_negotiate.c b/usr/src/uts/common/fs/smbsrv/smb2_negotiate.c index cbdd5f9fb5..5bc7b01260 100644 --- a/usr/src/uts/common/fs/smbsrv/smb2_negotiate.c +++ b/usr/src/uts/common/fs/smbsrv/smb2_negotiate.c @@ -26,8 +26,12 @@ uint32_t smb2srv_capabilities = SMB2_CAP_DFS | SMB2_CAP_LEASING | SMB2_CAP_LARGE_MTU | + SMB2_CAP_PERSISTENT_HANDLES | SMB2_CAP_ENCRYPTION; +/* These are the only capabilities defined for SMB2.X */ +#define SMB_2X_CAPS (SMB2_CAP_DFS | SMB2_CAP_LEASING | SMB2_CAP_LARGE_MTU) + /* * These are not intended as customer tunables, but dev. & test folks * might want to adjust them (with caution). @@ -350,16 +354,26 @@ smb2_negotiate_common(smb_request_t *sr, uint16_t version) /* * [MS-SMB2] 3.3.5.4 Receiving an SMB2 NEGOTIATE Request * - * Only set CAP_ENCRYPTION if this is 3.0 or 3.0.2 and - * the client has it set. + * The SMB2.x capabilities are returned without regard for + * what capabilities the client provided in the request. + * The SMB3.x capabilities returned are the traditional + * logical AND of server and client capabilities. + * + * One additional check: If KCF is missing something we + * require for encryption, turn off that capability. 
*/ - - if (s->dialect < SMB_VERS_3_0 || - !SMB3_CLIENT_ENCRYPTS(sr) || - smb3_encrypt_init_mech(s) != 0) - s->srv_cap = smb2srv_capabilities & ~SMB2_CAP_ENCRYPTION; - else - s->srv_cap = smb2srv_capabilities; + if (s->dialect < SMB_VERS_3_0) { + /* SMB 2.x */ + s->srv_cap = smb2srv_capabilities & SMB_2X_CAPS; + } else { + /* SMB 3.0 or later */ + s->srv_cap = smb2srv_capabilities & + (SMB_2X_CAPS | s->capabilities); + if ((s->srv_cap & SMB2_CAP_ENCRYPTION) != 0 && + smb3_encrypt_init_mech(s) != 0) { + s->srv_cap &= ~SMB2_CAP_ENCRYPTION; + } + } /* * See notes above smb2_max_rwsize, smb2_old_rwsize diff --git a/usr/src/uts/common/fs/smbsrv/smb2_tree_connect.c b/usr/src/uts/common/fs/smbsrv/smb2_tree_connect.c index e11a8855f7..34a74f564b 100644 --- a/usr/src/uts/common/fs/smbsrv/smb2_tree_connect.c +++ b/usr/src/uts/common/fs/smbsrv/smb2_tree_connect.c @@ -19,6 +19,8 @@ #include <smbsrv/smb2_kproto.h> +#define SMB2_SHARE_CAP_CA SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY + smb_sdrc_t smb2_tree_connect(smb_request_t *sr) { @@ -114,6 +116,10 @@ smb2_tree_connect(smb_request_t *sr) ShareFlags = 0; Capabilities = 0; + if ((tree->t_flags & SMB_TREE_DFSROOT) != 0) + Capabilities |= SMB2_SHARE_CAP_DFS; + if ((tree->t_flags & SMB_TREE_CA) != 0) + Capabilities |= SMB2_SHARE_CAP_CA; /* * SMB2 Tree Connect reply diff --git a/usr/src/uts/common/fs/smbsrv/smb_common_open.c b/usr/src/uts/common/fs/smbsrv/smb_common_open.c index 161f2790f6..0ef06a3c3e 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_common_open.c +++ b/usr/src/uts/common/fs/smbsrv/smb_common_open.c @@ -40,9 +40,6 @@ int smb_session_ofile_max = 32768; -static volatile uint32_t smb_fids = 0; -#define SMB_UNIQ_FID() atomic_inc_32_nv(&smb_fids) - extern uint32_t smb_is_executable(char *); static void smb_delete_new_object(smb_request_t *); static int smb_set_open_attributes(smb_request_t *, smb_ofile_t *); @@ -280,6 +277,7 @@ smb_common_open(smb_request_t *sr) boolean_t fnode_shrlk = B_FALSE; boolean_t did_open = B_FALSE; boolean_t did_break_handle = B_FALSE; + boolean_t did_cleanup_orphans = B_FALSE; /* Get out now if we've been cancelled. */ mutex_enter(&sr->sr_mutex); @@ -350,10 +348,9 @@ smb_common_open(smb_request_t *sr) /* * Most of IPC open is handled in smb_opipe_open() */ - uniq_fid = SMB_UNIQ_FID(); op->create_options = 0; of = smb_ofile_alloc(sr, op, NULL, SMB_FTYPE_MESG_PIPE, - tree_fid, uniq_fid); + tree_fid); tree_fid = 0; // given to the ofile status = smb_opipe_open(sr, of); smb_threshold_exit(&sv->sv_opipe_ct); @@ -450,13 +447,6 @@ smb_common_open(smb_request_t *sr) goto errout; } - /* - * The uniq_fid is a CIFS-server-wide unique identifier for an ofile - * which is used to uniquely identify open instances for the - * VFS share reservation and POSIX locks. - */ - uniq_fid = SMB_UNIQ_FID(); - if (last_comp_found) { smb_node_unlock(dnode); @@ -584,10 +574,14 @@ smb_common_open(smb_request_t *sr) * affect the sharing checks, and may delete the file due to * DELETE_ON_CLOSE. This may block, so set the file opening * count before oplock stuff. + * + * Need the "proposed" ofile (and its TargetOplockKey) for + * correct oplock break semantics. 
*/ of = smb_ofile_alloc(sr, op, fnode, SMB_FTYPE_DISK, - tree_fid, uniq_fid); + tree_fid); tree_fid = 0; // given to the ofile + uniq_fid = of->f_uniqid; smb_node_inc_opening_count(fnode); opening_incr = B_TRUE; @@ -683,6 +677,22 @@ smb_common_open(smb_request_t *sr) } /* + * If we still have orphaned durable handles on this file, + * let's assume the client has lost interest in those and + * close them so they don't cause sharing violations. + * See longer comment at smb2_dh_close_my_orphans(). + */ + if (status == NT_STATUS_SHARING_VIOLATION && + sr->session->dialect >= SMB_VERS_2_BASE && + did_cleanup_orphans == B_FALSE) { + + did_cleanup_orphans = B_TRUE; + smb2_dh_close_my_orphans(sr, of); + + goto shrlock_again; + } + + /* * SMB1 expects a 1 sec. delay before returning a * sharing violation error. If breaking oplocks * above took less than a sec, wait some more. @@ -904,27 +914,17 @@ create: goto errout; } + /* Create done. */ smb_node_unlock(dnode); dnode_wlock = B_FALSE; created = B_TRUE; op->action_taken = SMB_OACT_CREATED; + /* Note: hold from create */ fnode = op->fqi.fq_fnode; fnode_held = B_TRUE; - smb_node_inc_opening_count(fnode); - opening_incr = B_TRUE; - - smb_node_wrlock(fnode); - fnode_wlock = B_TRUE; - - status = smb_fsop_shrlock(sr->user_cr, fnode, uniq_fid, - op->desired_access, op->share_access); - if (status != 0) - goto errout; - fnode_shrlk = B_TRUE; - if (max_requested) { smb_fsop_eaccess(sr, sr->user_cr, fnode, &max_allowed); op->desired_access |= max_allowed; @@ -937,6 +937,27 @@ create: */ op->desired_access |= (READ_CONTROL | FILE_READ_ATTRIBUTES); + /* Allocate the ofile and fill in most of it. */ + of = smb_ofile_alloc(sr, op, fnode, SMB_FTYPE_DISK, + tree_fid); + tree_fid = 0; // given to the ofile + uniq_fid = of->f_uniqid; + + smb_node_inc_opening_count(fnode); + opening_incr = B_TRUE; + + /* + * Share access checks... + */ + smb_node_wrlock(fnode); + fnode_wlock = B_TRUE; + + status = smb_fsop_shrlock(sr->user_cr, fnode, uniq_fid, + op->desired_access, op->share_access); + if (status != 0) + goto errout; + fnode_shrlk = B_TRUE; + /* * MS-FSA 2.1.5.1.1 * If the Oplock member of the DirectoryStream in @@ -951,9 +972,6 @@ create: * * The break never blocks, so ignore the return. */ - of = smb_ofile_alloc(sr, op, fnode, SMB_FTYPE_DISK, - tree_fid, uniq_fid); - tree_fid = 0; // given to the ofile (void) smb_oplock_break_PARENT(dnode, of); } @@ -1052,8 +1070,9 @@ create: errout: if (did_open) { smb_ofile_close(of, 0); - /* Don't also ofile_free */ + /* rele via sr->fid_ofile */ } else if (of != NULL) { + /* No other refs possible */ smb_ofile_free(of); } diff --git a/usr/src/uts/common/fs/smbsrv/smb_cred.c b/usr/src/uts/common/fs/smbsrv/smb_cred.c index f47f5e72a5..8431db4653 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_cred.c +++ b/usr/src/uts/common/fs/smbsrv/smb_cred.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2014 Nexenta Systems, Inc. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ /* @@ -172,3 +172,19 @@ smb_cred_set_sidlist(smb_ids_t *token_grps) return (lp); } + +/* + * Special variant of smb_cred_create() used when we need an + * SMB kcred (e.g. DH import). The returned cred must be + * from crget() so it can be passed to smb_user_setcred(). 
+ */ +cred_t * +smb_kcred_create(void) +{ + cred_t *cr; + + cr = crget(); + ASSERT(cr != NULL); + + return (cr); +} diff --git a/usr/src/uts/common/fs/smbsrv/smb_fsops.c b/usr/src/uts/common/fs/smbsrv/smb_fsops.c index 6aa4074221..1b7c3a9fa9 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_fsops.c +++ b/usr/src/uts/common/fs/smbsrv/smb_fsops.c @@ -365,6 +365,9 @@ smb_fsop_create(smb_request_t *sr, cred_t *cr, smb_node_t *dnode, * because we want to set the UID and GID on the named * stream in this case for consistency with the (unnamed * stream) file (see comments for smb_vop_setattr()). + * + * Note that some stream "types" are "restricted" and only + * internal callers (cr == kcred) can create those. */ static int smb_fsop_create_stream(smb_request_t *sr, cred_t *cr, @@ -379,6 +382,9 @@ smb_fsop_create_stream(smb_request_t *sr, cred_t *cr, int rc = 0; boolean_t fcreate = B_FALSE; + if (cr != kcr && smb_strname_restricted(sname)) + return (EACCES); + /* Look up / create the unnamed stream, fname */ rc = smb_fsop_lookup(sr, cr, flags | SMB_FOLLOW_LINKS, sr->tid_tree->t_snode, dnode, fname, &fnode); @@ -663,6 +669,9 @@ smb_fsop_mkdir( * It is assumed that a reference exists on snode coming into this routine. * * A null smb_request might be passed to this function. + * + * Note that some stream "types" are "restricted" and only + * internal callers (cr == kcred) can remove those. */ int smb_fsop_remove( @@ -698,6 +707,11 @@ smb_fsop_remove( sname = kmem_alloc(MAXNAMELEN, KM_SLEEP); if (dnode->flags & NODE_XATTR_DIR) { + if (cr != zone_kcred() && smb_strname_restricted(name)) { + rc = EACCES; + goto out; + } + fnode = dnode->n_dnode; rc = smb_vop_stream_remove(fnode->vp, name, flags, cr); @@ -709,6 +723,11 @@ smb_fsop_remove( } else if (smb_is_stream_name(name)) { smb_stream_parse_name(name, fname, sname); + if (cr != zone_kcred() && smb_strname_restricted(sname)) { + rc = EACCES; + goto out; + } + /* * Look up the unnamed stream (i.e. fname). * Unmangle processing will be done on fname @@ -719,9 +738,7 @@ smb_fsop_remove( sr->tid_tree->t_snode, dnode, fname, &fnode); if (rc != 0) { - kmem_free(fname, MAXNAMELEN); - kmem_free(sname, MAXNAMELEN); - return (rc); + goto out; } /* @@ -744,9 +761,7 @@ smb_fsop_remove( if (rc == ENOENT) { if (!SMB_TREE_SUPPORTS_SHORTNAMES(sr) || !smb_maybe_mangled(name)) { - kmem_free(fname, MAXNAMELEN); - kmem_free(sname, MAXNAMELEN); - return (rc); + goto out; } longname = kmem_alloc(MAXNAMELEN, KM_SLEEP); @@ -776,6 +791,7 @@ smb_fsop_remove( } } +out: kmem_free(fname, MAXNAMELEN); kmem_free(sname, MAXNAMELEN); @@ -1609,6 +1625,9 @@ smb_fsop_statfs( * check is performed on the named stream in case it has been * quarantined. kcred is used to avoid issues with the permissions * set on the extended attribute file representing the named stream. + * + * Note that some stream "types" are "restricted" and only + * internal callers (cr == kcred) can access those. */ int smb_fsop_access(smb_request_t *sr, cred_t *cr, smb_node_t *snode, @@ -1639,9 +1658,14 @@ smb_fsop_access(smb_request_t *sr, cred_t *cr, smb_node_t *snode, unnamed_node = SMB_IS_STREAM(snode); if (unnamed_node) { + cred_t *kcr = zone_kcred(); + ASSERT(unnamed_node->n_magic == SMB_NODE_MAGIC); ASSERT(unnamed_node->n_state != SMB_NODE_STATE_DESTROYING); + if (cr != kcr && smb_strname_restricted(snode->od_name)) + return (NT_STATUS_ACCESS_DENIED); + /* * Perform VREAD access check on the named stream in case it * is quarantined. 
kcred is passed to smb_vop_access so it @@ -1649,7 +1673,7 @@ smb_fsop_access(smb_request_t *sr, cred_t *cr, smb_node_t *snode, */ if (faccess & (FILE_READ_DATA | FILE_EXECUTE)) { error = smb_vop_access(snode->vp, VREAD, - 0, NULL, zone_kcred()); + 0, NULL, kcr); if (error) return (NT_STATUS_ACCESS_DENIED); } diff --git a/usr/src/uts/common/fs/smbsrv/smb_init.c b/usr/src/uts/common/fs/smbsrv/smb_init.c index 88d804723e..f7e1739367 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_init.c +++ b/usr/src/uts/common/fs/smbsrv/smb_init.c @@ -247,7 +247,14 @@ smb_drv_open(dev_t *devp, int flag, int otyp, cred_t *cr) static int smb_drv_close(dev_t dev, int flag, int otyp, cred_t *credp) { - return (smb_server_delete()); + smb_server_t *sv; + int rc; + + rc = smb_server_lookup(&sv); + if (rc == 0) + rc = smb_server_delete(sv); + + return (rc); } /* ARGSUSED */ diff --git a/usr/src/uts/common/fs/smbsrv/smb_kshare.c b/usr/src/uts/common/fs/smbsrv/smb_kshare.c index a43c4af02a..5c5458bca5 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_kshare.c +++ b/usr/src/uts/common/fs/smbsrv/smb_kshare.c @@ -26,8 +26,9 @@ */ #include <smbsrv/smb_door.h> -#include <smbsrv/smb_kproto.h> #include <smbsrv/smb_ktypes.h> +#include <smbsrv/smb2_kproto.h> +#include <smbsrv/smb_kstat.h> typedef struct smb_unshare { list_node_t us_lnd; @@ -36,7 +37,6 @@ typedef struct smb_unshare { static kmem_cache_t *smb_kshare_cache_share; static kmem_cache_t *smb_kshare_cache_unexport; -kmem_cache_t *smb_kshare_cache_vfs; static int smb_kshare_cmp(const void *, const void *); static void smb_kshare_hold(const void *); @@ -294,7 +294,6 @@ smb_export_stop(smb_server_t *sv) mutex_exit(&sv->sv_export.e_mutex); smb_avl_destroy(&sv->sv_export.e_share_avl); - smb_vfs_rele_all(&sv->sv_export); } void @@ -305,18 +304,12 @@ smb_kshare_g_init(void) smb_kshare_cache_unexport = kmem_cache_create("smb_unexport_cache", sizeof (smb_unshare_t), 8, NULL, NULL, NULL, NULL, NULL, 0); - - smb_kshare_cache_vfs = kmem_cache_create("smb_vfs_cache", - sizeof (smb_vfs_t), 8, NULL, NULL, NULL, NULL, NULL, 0); } void smb_kshare_init(smb_server_t *sv) { - smb_llist_constructor(&sv->sv_export.e_vfs_list, sizeof (smb_vfs_t), - offsetof(smb_vfs_t, sv_lnd)); - smb_slist_constructor(&sv->sv_export.e_unexport_list, sizeof (smb_unshare_t), offsetof(smb_unshare_t, us_lnd)); } @@ -348,10 +341,6 @@ smb_kshare_fini(smb_server_t *sv) kmem_cache_free(smb_kshare_cache_unexport, ux); } smb_slist_destructor(&sv->sv_export.e_unexport_list); - - smb_vfs_rele_all(&sv->sv_export); - - smb_llist_destructor(&sv->sv_export.e_vfs_list); } void @@ -359,7 +348,6 @@ smb_kshare_g_fini(void) { kmem_cache_destroy(smb_kshare_cache_unexport); kmem_cache_destroy(smb_kshare_cache_share); - kmem_cache_destroy(smb_kshare_cache_vfs); } @@ -684,10 +672,8 @@ smb_kshare_release(smb_server_t *sv, smb_kshare_t *shr) /* * Add the given share in the specified server. - * If the share is a disk share, smb_vfs_hold() is - * invoked to ensure that there is a hold on the - * corresponding file system before the share is - * added to shares AVL. + * If the share is a disk share, lookup the share path + * and hold the smb_node_t for the share root. 
* * If the share is an Autohome share and it is * already in the AVL only a reference count for @@ -698,7 +684,7 @@ smb_kshare_export(smb_server_t *sv, smb_kshare_t *shr) { smb_avl_t *share_avl; smb_kshare_t *auto_shr; - vnode_t *vp; + smb_node_t *snode = NULL; int rc = 0; share_avl = &sv->sv_export.e_share_avl; @@ -713,36 +699,53 @@ smb_kshare_export(smb_server_t *sv, smb_kshare_t *shr) } if ((auto_shr = smb_avl_lookup(share_avl, shr)) != NULL) { - if ((auto_shr->shr_flags & SMB_SHRF_AUTOHOME) == 0) { - smb_avl_release(share_avl, auto_shr); - return (EEXIST); + rc = EEXIST; + if ((auto_shr->shr_flags & SMB_SHRF_AUTOHOME) != 0) { + mutex_enter(&auto_shr->shr_mutex); + auto_shr->shr_autocnt++; + mutex_exit(&auto_shr->shr_mutex); + rc = 0; } - - mutex_enter(&auto_shr->shr_mutex); - auto_shr->shr_autocnt++; - mutex_exit(&auto_shr->shr_mutex); smb_avl_release(share_avl, auto_shr); - return (0); + return (rc); } - if ((rc = smb_server_sharevp(sv, shr->shr_path, &vp)) != 0) { - cmn_err(CE_WARN, "export[%s(%s)]: failed obtaining vnode (%d)", + /* + * Get the root smb_node_t for this share, held. + * This hold is normally released during AVL destroy, + * via the element destructor: smb_kshare_destroy + */ + rc = smb_server_share_lookup(sv, shr->shr_path, &snode); + if (rc != 0) { + cmn_err(CE_WARN, "export[%s(%s)]: lookup failed (%d)", shr->shr_name, shr->shr_path, rc); return (rc); } - if ((rc = smb_vfs_hold(&sv->sv_export, vp->v_vfsp)) == 0) { - if ((rc = smb_avl_add(share_avl, shr)) != 0) { - cmn_err(CE_WARN, "export[%s]: failed caching (%d)", - shr->shr_name, rc); - smb_vfs_rele(&sv->sv_export, vp->v_vfsp); + shr->shr_root_node = snode; + if ((rc = smb_avl_add(share_avl, shr)) != 0) { + cmn_err(CE_WARN, "export[%s]: failed caching (%d)", + shr->shr_name, rc); + shr->shr_root_node = NULL; + smb_node_release(snode); + return (rc); + } + + /* + * For CA shares, find or create the CA handle dir, + * and (if restarted) import persistent handles. + */ + if ((shr->shr_flags & SMB_SHRF_CA) != 0) { + rc = smb2_dh_new_ca_share(sv, shr); + if (rc != 0) { + /* Just make it a non-CA share. 
*/ + mutex_enter(&shr->shr_mutex); + shr->shr_flags &= ~SMB_SHRF_CA; + mutex_exit(&shr->shr_mutex); + rc = 0; } - } else { - cmn_err(CE_WARN, "export[%s(%s)]: failed holding VFS (%d)", - shr->shr_name, shr->shr_path, rc); } - VN_RELE(vp); return (rc); } @@ -764,8 +767,6 @@ smb_kshare_unexport(smb_server_t *sv, const char *shrname) smb_avl_t *share_avl; smb_kshare_t key; smb_kshare_t *shr; - vnode_t *vp; - int rc; boolean_t auto_unexport; share_avl = &sv->sv_export.e_share_avl; @@ -785,19 +786,12 @@ smb_kshare_unexport(smb_server_t *sv, const char *shrname) } } - if (STYPE_ISDSK(shr->shr_type)) { - if ((rc = smb_server_sharevp(sv, shr->shr_path, &vp)) != 0) { - smb_avl_release(share_avl, shr); - cmn_err(CE_WARN, "unexport[%s]: failed obtaining vnode" - " (%d)", shrname, rc); - return (rc); - } + smb_avl_remove(share_avl, shr); - smb_vfs_rele(&sv->sv_export, vp->v_vfsp); - VN_RELE(vp); - } + mutex_enter(&shr->shr_mutex); + shr->shr_flags |= SMB_SHRF_REMOVED; + mutex_exit(&shr->shr_mutex); - smb_avl_remove(share_avl, shr); smb_avl_release(share_avl, shr); return (0); @@ -892,6 +886,7 @@ smb_kshare_decode(nvlist_t *share) SMB_SHRF_DFSROOT); tmp.shr_flags |= smb_kshare_decode_bool(smb, SHOPT_QUOTAS, SMB_SHRF_QUOTAS); + tmp.shr_flags |= smb_kshare_decode_bool(smb, SHOPT_CA, SMB_SHRF_CA); tmp.shr_flags |= smb_kshare_decode_bool(smb, SHOPT_FSO, SMB_SHRF_FSO); tmp.shr_flags |= smb_kshare_decode_bool(smb, SHOPT_AUTOHOME, SMB_SHRF_AUTOHOME); @@ -1041,6 +1036,11 @@ smb_kshare_destroy(void *p) ASSERT(shr); ASSERT(shr->shr_magic == SMB_SHARE_MAGIC); + if (shr->shr_ca_dir != NULL) + smb_node_release(shr->shr_ca_dir); + if (shr->shr_root_node) + smb_node_release(shr->shr_root_node); + smb_mem_free(shr->shr_name); smb_mem_free(shr->shr_path); smb_mem_free(shr->shr_cmnt); diff --git a/usr/src/uts/common/fs/smbsrv/smb_node.c b/usr/src/uts/common/fs/smbsrv/smb_node.c index 63756f9037..3e9933d51a 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_node.c +++ b/usr/src/uts/common/fs/smbsrv/smb_node.c @@ -88,7 +88,7 @@ * course the state of the node should be tested/updated under the * protection of the mutex). */ -#include <smbsrv/smb_kproto.h> +#include <smbsrv/smb2_kproto.h> #include <smbsrv/smb_fsops.h> #include <smbsrv/smb_kstat.h> #include <sys/ddi.h> @@ -1574,10 +1574,20 @@ smb_node_setattr(smb_request_t *sr, smb_node_t *node, attr->sa_crtime; mutex_exit(&of->f_mutex); + /* * The f_pending_attr times are reapplied in * smb_ofile_close(). */ + + /* + * If this change is coming directly from a client + * (sr != NULL) and it's a persistent handle, save + * the "sticky times" in the handle. + */ + if (sr != NULL && of->dh_persist) { + smb2_dh_update_times(sr, of, attr); + } } if ((attr->sa_mask & SMB_AT_ALLOCSZ) != 0) { diff --git a/usr/src/uts/common/fs/smbsrv/smb_ofile.c b/usr/src/uts/common/fs/smbsrv/smb_ofile.c index 0142bf9164..531ca314fb 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_ofile.c +++ b/usr/src/uts/common/fs/smbsrv/smb_ofile.c @@ -280,11 +280,7 @@ #include <smbsrv/smb2_kproto.h> #include <smbsrv/smb_fsops.h> #include <sys/time.h> - -/* XXX: May need to actually assign GUIDs for these. 
*/ -/* Don't leak object addresses */ -#define SMB_OFILE_PERSISTID(of) \ - ((uintptr_t)&smb_cache_ofile ^ (uintptr_t)(of)) +#include <sys/random.h> static boolean_t smb_ofile_is_open_locked(smb_ofile_t *); static void smb_ofile_delete(void *arg); @@ -296,6 +292,14 @@ static int smb_ofile_netinfo_init(smb_ofile_t *, smb_netfileinfo_t *); static void smb_ofile_netinfo_fini(smb_netfileinfo_t *); /* + * The uniq_fid is a CIFS-server-wide unique identifier for an ofile + * which is used to uniquely identify open instances for the + * VFS share reservation and POSIX locks. + */ +static volatile uint32_t smb_fids = 0; +#define SMB_UNIQ_FID() atomic_inc_32_nv(&smb_fids) + +/* * smb_ofile_alloc * Allocate an ofile and fill in it's "up" pointers, but * do NOT link it into the tree's list of ofiles or the @@ -304,6 +308,9 @@ static void smb_ofile_netinfo_fini(smb_netfileinfo_t *); * * If we don't get as far as smb_ofile_open with this OF, * call smb_ofile_free() to free this object. + * + * Note: The following sr members may be null during + * persistent handle import: session, uid_usr, tid_tree */ smb_ofile_t * smb_ofile_alloc( @@ -311,10 +318,10 @@ smb_ofile_alloc( smb_arg_open_t *op, smb_node_t *node, /* optional (may be NULL) */ uint16_t ftype, - uint16_t tree_fid, - uint32_t uniqid) + uint16_t tree_fid) { - smb_tree_t *tree = sr->tid_tree; + smb_user_t *user = sr->uid_user; /* optional */ + smb_tree_t *tree = sr->tid_tree; /* optional */ smb_ofile_t *of; of = kmem_cache_alloc(smb_cache_ofile, KM_SLEEP); @@ -324,22 +331,28 @@ smb_ofile_alloc( mutex_init(&of->f_mutex, NULL, MUTEX_DEFAULT, NULL); list_create(&of->f_notify.nc_waiters, sizeof (smb_request_t), offsetof(smb_request_t, sr_waiters)); + mutex_init(&of->dh_nvlock, NULL, MUTEX_DEFAULT, NULL); of->f_state = SMB_OFILE_STATE_ALLOC; of->f_refcnt = 1; of->f_ftype = ftype; of->f_fid = tree_fid; /* of->f_persistid see smb2_create */ - of->f_uniqid = uniqid; + of->f_uniqid = SMB_UNIQ_FID(); of->f_opened_by_pid = sr->smb_pid; of->f_granted_access = op->desired_access; of->f_share_access = op->share_access; of->f_create_options = op->create_options; - of->f_cr = (op->create_options & FILE_OPEN_FOR_BACKUP_INTENT) ? - smb_user_getprivcred(sr->uid_user) : sr->uid_user->u_cred; - crhold(of->f_cr); - of->f_server = tree->t_server; - of->f_session = tree->t_session; + if (user != NULL) { + if ((op->create_options & FILE_OPEN_FOR_BACKUP_INTENT) != 0) + of->f_cr = smb_user_getprivcred(user); + else + of->f_cr = user->u_cred; + crhold(of->f_cr); + } + of->f_server = sr->sr_server; + of->f_session = sr->session; /* may be NULL */ + (void) memset(of->f_lock_seq, -1, SMB_OFILE_LSEQ_MAX); of->f_mode = smb_fsop_amask_to_omode(of->f_granted_access); @@ -361,11 +374,15 @@ smb_ofile_alloc( * held by our caller, until smb_ofile_open puts this * ofile on the node ofile list with smb_node_add_ofile. 
*/ - smb_user_hold_internal(sr->uid_user); - smb_tree_hold_internal(tree); - of->f_user = sr->uid_user; - of->f_tree = tree; - of->f_node = node; + if (user != NULL) { + smb_user_hold_internal(user); + of->f_user = user; + } + if (tree != NULL) { + smb_tree_hold_internal(tree); + of->f_tree = tree; + } + of->f_node = node; /* may be NULL */ return (of); } @@ -448,6 +465,9 @@ smb_ofile_close(smb_ofile_t *of, int32_t mtime_sec) return; } + /* + * Only one thread here (the one that that set f_state closing) + */ switch (of->f_ftype) { case SMB_FTYPE_BYTE_PIPE: case SMB_FTYPE_MESG_PIPE: @@ -456,6 +476,8 @@ smb_ofile_close(smb_ofile_t *of, int32_t mtime_sec) break; case SMB_FTYPE_DISK: + if (of->dh_persist) + smb2_dh_close_persistent(of); if (of->f_persistid != 0) smb_ofile_del_persistid(of); if (of->f_lease != NULL) @@ -961,6 +983,9 @@ smb_ofile_lookup_by_persistid(smb_request_t *sr, uint64_t persistid) smb_ofile_t *of; uint_t idx; + if (persistid == 0) + return (NULL); + hash = sr->sr_server->sv_persistid_ht; idx = smb_hash_uint64(hash, persistid); bucket = &hash->buckets[idx]; @@ -981,28 +1006,132 @@ smb_ofile_lookup_by_persistid(smb_request_t *sr, uint64_t persistid) } /* - * Create a (unique) persistent ID for a new ofile, - * and add this ofile to the persistid hash table. + * Create a (unique) durable/persistent ID for a new ofile, + * and add this ofile to the persistid hash table. This ID + * is referred to as the persistent ID in the protocol spec, + * so that's what we call it too, though the persistence may + * vary. "Durable" handles are persistent across reconnects + * but not server reboots. Persistent handles are persistent + * across server reboots too. + * + * Note that persistent IDs need to be unique for the lifetime of + * any given ofile. For normal (non-persistent) ofiles we can just + * use a persistent ID derived from the ofile memory address, as + * these don't ever live beyond the current OS boot lifetime. + * + * Persistent handles are re-imported after server restart, and + * generally have a different memory address after import than + * they had in the previous OS boot lifetime, so for these we + * use a randomly assigned value that won't conflict with any + * non-persistent (durable) handles. Ensuring that a randomly + * generated ID is unique requires a search of the ofiles in one + * hash bucket, which we'd rather avoid for non-persistent opens. + * + * The solution used here is to divide the persistent ID space + * in half (odd and even values) where durable opens use an ID + * derived from the ofile address (which is always even), and + * persistent opens use an ID generated randomly (always odd). + * + * smb_ofile_set_persistid_dh() sets a durable handle ID and + * smb_ofile_set_persistid_ph() sets a persistent handle ID. 
*/ void -smb_ofile_set_persistid(smb_ofile_t *of) +smb_ofile_set_persistid_dh(smb_ofile_t *of) { smb_hash_t *hash = of->f_server->sv_persistid_ht; smb_bucket_t *bucket; smb_llist_t *ll; + uint64_t persistid; uint_t idx; - of->f_persistid = SMB_OFILE_PERSISTID(of); + persistid = (uintptr_t)of; + /* Avoid showing object addresses */ + persistid ^= ((uintptr_t)&smb_cache_ofile); + /* make sure it's even */ + persistid &= ~((uint64_t)1); - idx = smb_hash_uint64(hash, of->f_persistid); + idx = smb_hash_uint64(hash, persistid); bucket = &hash->buckets[idx]; ll = &bucket->b_list; smb_llist_enter(ll, RW_WRITER); - smb_llist_insert_tail(ll, of); + if (of->f_persistid == 0) { + of->f_persistid = persistid; + smb_llist_insert_tail(ll, of); + } smb_llist_exit(ll); } void +smb_ofile_set_persistid_ph(smb_ofile_t *of) +{ + uint64_t persistid; + int rc; + +top: + (void) random_get_pseudo_bytes((uint8_t *)&persistid, + sizeof (persistid)); + if (persistid == 0) { + cmn_err(CE_NOTE, "random gave all zeros!"); + goto top; + } + /* make sure it's odd */ + persistid |= (uint64_t)1; + + /* + * Try inserting with this persistent ID. + */ + rc = smb_ofile_insert_persistid(of, persistid); + if (rc == EEXIST) + goto top; + if (rc != 0) { + cmn_err(CE_NOTE, "set persistid rc=%d", rc); + } +} + +/* + * Insert an ofile into the persistid hash table. + * If the persistent ID is in use, error. + */ +int +smb_ofile_insert_persistid(smb_ofile_t *new_of, uint64_t persistid) +{ + smb_hash_t *hash = new_of->f_server->sv_persistid_ht; + smb_bucket_t *bucket; + smb_llist_t *ll; + smb_ofile_t *of; + uint_t idx; + + ASSERT(persistid != 0); + + /* + * Look to see if this key alreay exists. + */ + idx = smb_hash_uint64(hash, persistid); + bucket = &hash->buckets[idx]; + ll = &bucket->b_list; + + smb_llist_enter(ll, RW_WRITER); + of = smb_llist_head(ll); + while (of != NULL) { + if (of->f_persistid == persistid) { + /* already in use */ + smb_llist_exit(ll); + return (EEXIST); + } + of = smb_llist_next(ll, of); + } + + /* Not found, so OK to insert. */ + if (new_of->f_persistid == 0) { + new_of->f_persistid = persistid; + smb_llist_insert_tail(ll, new_of); + } + smb_llist_exit(ll); + + return (0); +} + +void smb_ofile_del_persistid(smb_ofile_t *of) { smb_hash_t *hash = of->f_server->sv_persistid_ht; @@ -1014,7 +1143,10 @@ smb_ofile_del_persistid(smb_ofile_t *of) bucket = &hash->buckets[idx]; ll = &bucket->b_list; smb_llist_enter(ll, RW_WRITER); - smb_llist_remove(ll, of); + if (of->f_persistid != 0) { + smb_llist_remove(ll, of); + of->f_persistid = 0; + } smb_llist_exit(ll); } @@ -1390,6 +1522,7 @@ smb_ofile_free(smb_ofile_t *of) of->f_magic = (uint32_t)~SMB_OFILE_MAGIC; list_destroy(&of->f_notify.nc_waiters); + mutex_destroy(&of->dh_nvlock); mutex_destroy(&of->f_mutex); kmem_cache_free(smb_cache_ofile, of); } diff --git a/usr/src/uts/common/fs/smbsrv/smb_pathname.c b/usr/src/uts/common/fs/smbsrv/smb_pathname.c index a8f5ae3aa4..fbf003c7c0 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_pathname.c +++ b/usr/src/uts/common/fs/smbsrv/smb_pathname.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. 
*/ #include <smbsrv/smb_kproto.h> @@ -154,7 +154,7 @@ smb_pathname_reduce( pathname_t ppn; char *usepath; int lookup_flags = FOLLOW; - int trailing_slash = 0; + int trailing_slash = 0; int err = 0; int len; smb_node_t *vss_cur_node; @@ -423,6 +423,10 @@ smb_pathname(smb_request_t *sr, char *path, int flags, if ((err = pn_set(&pn, namep)) != 0) break; + /* We want the DOS attributes. */ + bzero(&attr, sizeof (attr)); + attr.sa_mask = SMB_AT_DOSATTR; + local_flags = flags & FIGNORECASE; err = smb_pathname_lookup(&pn, &rpn, local_flags, &vp, rootvp, dnode->vp, &attr, cred); @@ -1066,6 +1070,27 @@ smb_is_stream_name(char *path) } /* + * Is this stream node a "restricted" type? + */ +boolean_t +smb_strname_restricted(char *strname) +{ + char *stype; + + stype = strrchr(strname, ':'); + if (stype == NULL) + return (B_FALSE); + + /* + * Only ":$CA" is restricted (for now). + */ + if (strcmp(stype, ":$CA") == 0) + return (B_TRUE); + + return (B_FALSE); +} + +/* * smb_validate_stream_name * * B_FALSE will be returned, and the error status ser in the sr, if: @@ -1079,6 +1104,7 @@ boolean_t smb_validate_stream_name(smb_request_t *sr, smb_pathname_t *pn) { static char *strmtype[] = { + "$CA", "$DATA", "$INDEX_ALLOCATION" }; diff --git a/usr/src/uts/common/fs/smbsrv/smb_server.c b/usr/src/uts/common/fs/smbsrv/smb_server.c index 42b6f8defa..6b2390d633 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_server.c +++ b/usr/src/uts/common/fs/smbsrv/smb_server.c @@ -20,8 +20,8 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2017 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017 by Delphix. All rights reserved. + * Copyright 2018 Nexenta Systems, Inc. All rights reserved. */ /* @@ -229,12 +229,12 @@ static void smb_server_fsop_stop(smb_server_t *); static void smb_event_cancel(smb_server_t *, uint32_t); static uint32_t smb_event_alloc_txid(void); -static void smb_server_disconnect_share(smb_llist_t *, const char *); -static void smb_server_enum_users(smb_llist_t *, smb_svcenum_t *); -static void smb_server_enum_trees(smb_llist_t *, smb_svcenum_t *); -static int smb_server_session_disconnect(smb_llist_t *, const char *, +static void smb_server_disconnect_share(smb_server_t *, const char *); +static void smb_server_enum_users(smb_server_t *, smb_svcenum_t *); +static void smb_server_enum_trees(smb_server_t *, smb_svcenum_t *); +static int smb_server_session_disconnect(smb_server_t *, const char *, const char *); -static int smb_server_fclose(smb_llist_t *, uint32_t); +static int smb_server_fclose(smb_server_t *, uint32_t); static int smb_server_kstat_update(kstat_t *, int); static int smb_server_legacy_kstat_update(kstat_t *, int); static void smb_server_listener_init(smb_server_t *, smb_listener_daemon_t *, @@ -473,14 +473,8 @@ smb_server_create(void) * activity associated that server has ceased before destroying it. */ int -smb_server_delete(void) +smb_server_delete(smb_server_t *sv) { - smb_server_t *sv; - int rc; - - rc = smb_server_lookup(&sv); - if (rc != 0) - return (rc); mutex_enter(&sv->sv_mutex); switch (sv->sv_state) { @@ -608,6 +602,7 @@ smb_server_start(smb_ioc_start_t *ioc) int rc = 0; int family; smb_server_t *sv; + cred_t *ucr; rc = smb_server_lookup(&sv); if (rc) @@ -620,6 +615,31 @@ smb_server_start(smb_ioc_start_t *ioc) if ((rc = smb_server_fsop_start(sv)) != 0) break; + /* + * Note: smb_kshare_start needs sv_session. 
+ */ + sv->sv_session = smb_session_create(NULL, 0, sv, 0); + if (sv->sv_session == NULL) { + rc = ENOMEM; + break; + } + + /* + * Create a logon on the server session, + * used when importing CA shares. + */ + sv->sv_rootuser = smb_user_new(sv->sv_session); + ucr = smb_kcred_create(); + rc = smb_user_logon(sv->sv_rootuser, ucr, "", "root", + SMB_USER_FLAG_ADMIN, 0, 0); + crfree(ucr); + ucr = NULL; + if (rc != 0) { + cmn_err(CE_NOTE, "smb_server_start: " + "failed to create root user"); + break; + } + if ((rc = smb_kshare_start(sv)) != 0) break; @@ -637,9 +657,8 @@ smb_server_start(smb_ioc_start_t *ioc) sv->sv_cfg.skc_maxconnections, INT_MAX, curzone->zone_zsched, TASKQ_DYNAMIC); - sv->sv_session = smb_session_create(NULL, 0, sv, 0); - - if (sv->sv_worker_pool == NULL || sv->sv_session == NULL) { + if (sv->sv_worker_pool == NULL || + sv->sv_receiver_pool == NULL) { rc = ENOMEM; break; } @@ -904,11 +923,11 @@ smb_server_enum(smb_ioc_svcenum_t *ioc) switch (svcenum->se_type) { case SMB_SVCENUM_TYPE_USER: - smb_server_enum_users(&sv->sv_session_list, svcenum); + smb_server_enum_users(sv, svcenum); break; case SMB_SVCENUM_TYPE_TREE: case SMB_SVCENUM_TYPE_FILE: - smb_server_enum_trees(&sv->sv_session_list, svcenum); + smb_server_enum_trees(sv, svcenum); break; default: rc = EINVAL; @@ -924,7 +943,6 @@ smb_server_enum(smb_ioc_svcenum_t *ioc) int smb_server_session_close(smb_ioc_session_t *ioc) { - smb_llist_t *ll; smb_server_t *sv; int cnt; int rc; @@ -932,8 +950,7 @@ smb_server_session_close(smb_ioc_session_t *ioc) if ((rc = smb_server_lookup(&sv)) != 0) return (rc); - ll = &sv->sv_session_list; - cnt = smb_server_session_disconnect(ll, ioc->client, ioc->username); + cnt = smb_server_session_disconnect(sv, ioc->client, ioc->username); smb_server_release(sv); @@ -949,15 +966,13 @@ int smb_server_file_close(smb_ioc_fileid_t *ioc) { uint32_t uniqid = ioc->uniqid; - smb_llist_t *ll; smb_server_t *sv; int rc; if ((rc = smb_server_lookup(&sv)) != 0) return (rc); - ll = &sv->sv_session_list; - rc = smb_server_fclose(ll, uniqid); + rc = smb_server_fclose(sv, uniqid); smb_server_release(sv); return (rc); @@ -978,17 +993,16 @@ smb_server_get_session_count(smb_server_t *sv) } /* - * Gets the vnode of the specified share path. - * - * A hold on the returned vnode pointer is taken so the caller - * must call VN_RELE. + * Gets the smb_node of the specified share path. + * Node is returned held (caller must rele.) */ int -smb_server_sharevp(smb_server_t *sv, const char *shr_path, vnode_t **vp) +smb_server_share_lookup(smb_server_t *sv, const char *shr_path, + smb_node_t **nodepp) { smb_request_t *sr; smb_node_t *fnode = NULL; - smb_node_t *dnode; + smb_node_t *dnode = NULL; char last_comp[MAXNAMELEN]; int rc = 0; @@ -1025,10 +1039,7 @@ smb_server_sharevp(smb_server_t *sv, const char *shr_path, vnode_t **vp) ASSERT(fnode->vp && fnode->vp->v_vfsp); - VN_HOLD(fnode->vp); - *vp = fnode->vp; - - smb_node_release(fnode); + *nodepp = fnode; return (0); } @@ -1070,7 +1081,6 @@ int smb_server_unshare(const char *sharename) { smb_server_t *sv; - smb_llist_t *ll; int rc; if ((rc = smb_server_lookup(&sv))) @@ -1088,8 +1098,7 @@ smb_server_unshare(const char *sharename) } mutex_exit(&sv->sv_mutex); - ll = &sv->sv_session_list; - smb_server_disconnect_share(ll, sharename); + smb_server_disconnect_share(sv, sharename); smb_server_release(sv); return (0); @@ -1100,10 +1109,12 @@ smb_server_unshare(const char *sharename) * Typically called when a share has been removed. 
*/ static void -smb_server_disconnect_share(smb_llist_t *ll, const char *sharename) +smb_server_disconnect_share(smb_server_t *sv, const char *sharename) { + smb_llist_t *ll; smb_session_t *session; + ll = &sv->sv_session_list; smb_llist_enter(ll, RW_READER); session = smb_llist_head(ll); @@ -1514,9 +1525,17 @@ smb_server_shutdown(smb_server_t *sv) * normal sessions, this happens in smb_session_cancel, * but that's not called for the server session. */ + if (sv->sv_rootuser != NULL) { + smb_user_logoff(sv->sv_rootuser); + smb_user_release(sv->sv_rootuser); + sv->sv_rootuser = NULL; + } if (sv->sv_session != NULL) { smb_slist_wait_for_empty(&sv->sv_session->s_req_list); + /* Just in case import left users and trees */ + smb_session_logoff(sv->sv_session); + smb_session_delete(sv->sv_session); sv->sv_session = NULL; } @@ -1817,8 +1836,9 @@ smb_server_release(smb_server_t *sv) * Enumerate the users associated with a session list. */ static void -smb_server_enum_users(smb_llist_t *ll, smb_svcenum_t *svcenum) +smb_server_enum_users(smb_server_t *sv, smb_svcenum_t *svcenum) { + smb_llist_t *ll = &sv->sv_session_list; smb_session_t *sn; smb_llist_t *ulist; smb_user_t *user; @@ -1859,8 +1879,9 @@ smb_server_enum_users(smb_llist_t *ll, smb_svcenum_t *svcenum) * Enumerate the trees/files associated with a session list. */ static void -smb_server_enum_trees(smb_llist_t *ll, smb_svcenum_t *svcenum) +smb_server_enum_trees(smb_server_t *sv, smb_svcenum_t *svcenum) { + smb_llist_t *ll = &sv->sv_session_list; smb_session_t *sn; smb_llist_t *tlist; smb_tree_t *tree; @@ -1902,9 +1923,10 @@ smb_server_enum_trees(smb_llist_t *ll, smb_svcenum_t *svcenum) * Empty strings are treated as wildcards. */ static int -smb_server_session_disconnect(smb_llist_t *ll, +smb_server_session_disconnect(smb_server_t *sv, const char *client, const char *name) { + smb_llist_t *ll = &sv->sv_session_list; smb_session_t *sn; smb_llist_t *ulist; smb_user_t *user; @@ -1949,13 +1971,15 @@ smb_server_session_disconnect(smb_llist_t *ll, * Close a file by its unique id. */ static int -smb_server_fclose(smb_llist_t *ll, uint32_t uniqid) +smb_server_fclose(smb_server_t *sv, uint32_t uniqid) { + smb_llist_t *ll; smb_session_t *sn; smb_llist_t *tlist; smb_tree_t *tree; int rc = ENOENT; + ll = &sv->sv_session_list; smb_llist_enter(ll, RW_READER); sn = smb_llist_head(ll); diff --git a/usr/src/uts/common/fs/smbsrv/smb_session.c b/usr/src/uts/common/fs/smbsrv/smb_session.c index 205c21179b..2878df28e7 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_session.c +++ b/usr/src/uts/common/fs/smbsrv/smb_session.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2018 Nexenta Systems, Inc. All rights reserved. + * Copyright 2019 Nexenta Systems, Inc. All rights reserved. 
*/ #include <sys/atomic.h> @@ -72,8 +72,6 @@ static int smb_session_reader(smb_session_t *); static int smb_session_xprt_puthdr(smb_session_t *, uint8_t msg_type, uint32_t msg_len, uint8_t *dst, size_t dstlen); -static smb_tree_t *smb_session_get_tree(smb_session_t *, smb_tree_t *); -static void smb_session_logoff(smb_session_t *); static void smb_session_disconnect_trees(smb_session_t *); static void smb_request_init_command_mbuf(smb_request_t *sr); static void smb_session_genkey(smb_session_t *); @@ -752,7 +750,22 @@ smb_session_create(ksocket_t new_so, uint16_t port, smb_server_t *sv, smb_rwx_init(&session->s_lock); - if (new_so != NULL) { + session->s_srqueue = &sv->sv_srqueue; + smb_server_get_cfg(sv, &session->s_cfg); + + if (new_so == NULL) { + /* + * This call is creating the special "server" session, + * used for kshare export, oplock breaks, CA import. + * CA import creates temporary trees on this session + * and those should never get map/unmap up-calls, so + * force the map/unmap flags zero on this session. + * Set a "modern" dialect for CA import too, so + * pathname parse doesn't do OS/2 stuff, etc. + */ + session->s_cfg.skc_execflags = 0; + session->dialect = session->s_cfg.skc_max_protocol; + } else { if (family == AF_INET) { slen = sizeof (sin); (void) ksocket_getsockname(new_so, @@ -794,8 +807,6 @@ smb_session_create(ksocket_t new_so, uint16_t port, smb_server_t *sv, else smb_server_inc_tcp_sess(sv); } - smb_server_get_cfg(sv, &session->s_cfg); - session->s_srqueue = &sv->sv_srqueue; /* * The initial new request handler is special, @@ -1006,117 +1017,35 @@ smb_session_lookup_tree( } /* - * Find the first connected tree that matches the specified sharename. - * If the specified tree is NULL the search starts from the beginning of - * the user's tree list. If a tree is provided the search starts just - * after that tree. - */ -smb_tree_t * -smb_session_lookup_share( - smb_session_t *session, - const char *sharename, - smb_tree_t *tree) -{ - SMB_SESSION_VALID(session); - ASSERT(sharename); - - smb_llist_enter(&session->s_tree_list, RW_READER); - - if (tree) { - ASSERT3U(tree->t_magic, ==, SMB_TREE_MAGIC); - ASSERT(tree->t_session == session); - tree = smb_llist_next(&session->s_tree_list, tree); - } else { - tree = smb_llist_head(&session->s_tree_list); - } - - while (tree) { - ASSERT3U(tree->t_magic, ==, SMB_TREE_MAGIC); - ASSERT(tree->t_session == session); - if (smb_strcasecmp(tree->t_sharename, sharename, 0) == 0) { - if (smb_tree_hold(tree)) { - smb_llist_exit(&session->s_tree_list); - return (tree); - } - } - tree = smb_llist_next(&session->s_tree_list, tree); - } - - smb_llist_exit(&session->s_tree_list); - return (NULL); -} - -/* - * Find the first connected tree that matches the specified volume name. - * If the specified tree is NULL the search starts from the beginning of - * the user's tree list. If a tree is provided the search starts just - * after that tree. 
- */ -smb_tree_t * -smb_session_lookup_volume( - smb_session_t *session, - const char *name, - smb_tree_t *tree) -{ - SMB_SESSION_VALID(session); - ASSERT(name); - - smb_llist_enter(&session->s_tree_list, RW_READER); - - if (tree) { - ASSERT3U(tree->t_magic, ==, SMB_TREE_MAGIC); - ASSERT(tree->t_session == session); - tree = smb_llist_next(&session->s_tree_list, tree); - } else { - tree = smb_llist_head(&session->s_tree_list); - } - - while (tree) { - ASSERT3U(tree->t_magic, ==, SMB_TREE_MAGIC); - ASSERT(tree->t_session == session); - - if (smb_strcasecmp(tree->t_volume, name, 0) == 0) { - if (smb_tree_hold(tree)) { - smb_llist_exit(&session->s_tree_list); - return (tree); - } - } - - tree = smb_llist_next(&session->s_tree_list, tree); - } - - smb_llist_exit(&session->s_tree_list); - return (NULL); -} - -/* * Disconnect all trees that match the specified client process-id. + * Used by the SMB1 "process exit" request. */ void smb_session_close_pid( smb_session_t *session, uint32_t pid) { + smb_llist_t *tree_list = &session->s_tree_list; smb_tree_t *tree; - SMB_SESSION_VALID(session); + smb_llist_enter(tree_list, RW_READER); - tree = smb_session_get_tree(session, NULL); + tree = smb_llist_head(tree_list); while (tree) { - smb_tree_t *next; - ASSERT3U(tree->t_magic, ==, SMB_TREE_MAGIC); - ASSERT(tree->t_session == session); - smb_tree_close_pid(tree, pid); - next = smb_session_get_tree(session, tree); - smb_tree_release(tree); - tree = next; + if (smb_tree_hold(tree)) { + smb_tree_close_pid(tree, pid); + smb_tree_release(tree); + } + tree = smb_llist_next(tree_list, tree); } + + smb_llist_exit(tree_list); } static void -smb_session_tree_dtor(void *t) +smb_session_tree_dtor(void *arg) { - smb_tree_t *tree = (smb_tree_t *)t; + smb_tree_t *tree = arg; smb_tree_disconnect(tree, B_TRUE); /* release the ref acquired during the traversal loop */ @@ -1167,84 +1096,76 @@ static void smb_session_disconnect_trees( smb_session_t *session) { - smb_tree_t *tree, *next_tree; + smb_llist_t *tree_list = &session->s_tree_list; + smb_tree_t *tree; - SMB_SESSION_VALID(session); + smb_llist_enter(tree_list, RW_READER); - tree = smb_session_get_tree(session, NULL); + tree = smb_llist_head(tree_list); while (tree) { - ASSERT3U(tree->t_magic, ==, SMB_TREE_MAGIC); - ASSERT(tree->t_session == session); - smb_tree_disconnect(tree, B_TRUE); - next_tree = smb_session_get_tree(session, tree); - smb_tree_release(tree); - tree = next_tree; + if (smb_tree_hold(tree)) { + smb_llist_post(tree_list, tree, + smb_session_tree_dtor); + } + tree = smb_llist_next(tree_list, tree); } + + /* drop the lock and flush the dtor queue */ + smb_llist_exit(tree_list); } /* - * Disconnect all trees that match the specified share name. + * Variant of smb_session_tree_dtor that also + * cancels requests using this tree. 
*/ -void -smb_session_disconnect_share( - smb_session_t *session, - const char *sharename) +static void +smb_session_tree_kill(void *arg) { - smb_tree_t *tree; - smb_tree_t *next; + smb_tree_t *tree = arg; - SMB_SESSION_VALID(session); + SMB_TREE_VALID(tree); - tree = smb_session_lookup_share(session, sharename, NULL); - while (tree) { - ASSERT3U(tree->t_magic, ==, SMB_TREE_MAGIC); - ASSERT(tree->t_session == session); - smb_tree_disconnect(tree, B_TRUE); - smb_session_cancel_requests(session, tree, NULL); - next = smb_session_lookup_share(session, sharename, tree); - smb_tree_release(tree); - tree = next; - } + smb_tree_disconnect(tree, B_TRUE); + smb_session_cancel_requests(tree->t_session, tree, NULL); + + /* release the ref acquired during the traversal loop */ + smb_tree_release(tree); } /* - * Get the next connected tree in the list. A reference is taken on - * the tree, which can be released later with smb_tree_release(). - * - * If the specified tree is NULL the search starts from the beginning of - * the tree list. If a tree is provided the search starts just after - * that tree. - * - * Returns NULL if there are no connected trees in the list. + * Disconnect all trees that match the specified share name, + * and kill requests using those trees. */ -static smb_tree_t * -smb_session_get_tree( +void +smb_session_disconnect_share( smb_session_t *session, - smb_tree_t *tree) + const char *sharename) { - smb_llist_t *tree_list; + smb_llist_t *ll; + smb_tree_t *tree; SMB_SESSION_VALID(session); - tree_list = &session->s_tree_list; - smb_llist_enter(tree_list, RW_READER); + ll = &session->s_tree_list; + smb_llist_enter(ll, RW_READER); - if (tree) { - ASSERT3U(tree->t_magic, ==, SMB_TREE_MAGIC); - tree = smb_llist_next(tree_list, tree); - } else { - tree = smb_llist_head(tree_list); - } + for (tree = smb_llist_head(ll); + tree != NULL; + tree = smb_llist_next(ll, tree)) { - while (tree) { - if (smb_tree_hold(tree)) - break; + SMB_TREE_VALID(tree); + ASSERT(tree->t_session == session); - tree = smb_llist_next(tree_list, tree); + if (smb_strcasecmp(tree->t_sharename, sharename, 0) != 0) + continue; + + if (smb_tree_hold(tree)) { + smb_llist_post(ll, tree, + smb_session_tree_kill); + } } - smb_llist_exit(tree_list); - return (tree); + smb_llist_exit(ll); } /* @@ -1255,7 +1176,7 @@ smb_session_get_tree( * disconnect (SMB_SESSION_STATE_DISCONNECTED). * If client-initiated, save durable handles. 
*/ -static void +void smb_session_logoff(smb_session_t *session) { smb_llist_t *ulist; @@ -1279,9 +1200,6 @@ top: // smb_user_hold_internal(user); user->u_refcnt++; mutex_exit(&user->u_mutex); - if (user->u_session->s_state == - SMB_SESSION_STATE_DISCONNECTED) - user->preserve_opens = SMB2_DH_PRESERVE_ALL; smb_user_logoff(user); smb_user_release(user); break; diff --git a/usr/src/uts/common/fs/smbsrv/smb_srv_oplock.c b/usr/src/uts/common/fs/smbsrv/smb_srv_oplock.c index 86ce24c0b0..7c4be2f56e 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_srv_oplock.c +++ b/usr/src/uts/common/fs/smbsrv/smb_srv_oplock.c @@ -346,6 +346,11 @@ smb_oplock_async_break(void *arg) break; } + if (sr->dh_nvl_dirty) { + sr->dh_nvl_dirty = B_FALSE; + smb2_dh_update_nvfile(sr); + } + sr->sr_state = SMB_REQ_STATE_COMPLETED; smb_request_free(sr); } @@ -444,6 +449,10 @@ smb_oplock_send_brk(smb_request_t *sr) if (lease != NULL) lease->ls_state = NewLevel & CACHE_RWH; ofile->f_oplock.og_state = NewLevel; + + if (ofile->dh_persist) { + smb2_dh_update_oplock(sr, ofile); + } } /* @@ -583,6 +592,10 @@ smb_oplock_send_brk(smb_request_t *sr) if (lease != NULL) { lease->ls_state = NewLevel & CACHE_RWH; } + + if (ofile->dh_persist) { + smb2_dh_update_oplock(sr, ofile); + } } /* diff --git a/usr/src/uts/common/fs/smbsrv/smb_tree.c b/usr/src/uts/common/fs/smbsrv/smb_tree.c index 5020dec794..aedacf2123 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_tree.c +++ b/usr/src/uts/common/fs/smbsrv/smb_tree.c @@ -184,8 +184,6 @@ uint32_t smb_tree_connect_core(smb_request_t *); uint32_t smb_tree_connect_disk(smb_request_t *, smb_arg_tcon_t *); uint32_t smb_tree_connect_printq(smb_request_t *, smb_arg_tcon_t *); uint32_t smb_tree_connect_ipc(smb_request_t *, smb_arg_tcon_t *); -static smb_tree_t *smb_tree_alloc(smb_request_t *, const smb_kshare_t *, - smb_node_t *, uint32_t, uint32_t); static void smb_tree_dealloc(void *); static boolean_t smb_tree_is_connected_locked(smb_tree_t *); static char *smb_tree_get_sharename(char *); @@ -193,9 +191,7 @@ static int smb_tree_getattr(const smb_kshare_t *, smb_node_t *, smb_tree_t *); static void smb_tree_get_volname(vfs_t *, smb_tree_t *); static void smb_tree_get_flags(const smb_kshare_t *, vfs_t *, smb_tree_t *); static void smb_tree_log(smb_request_t *, const char *, const char *, ...); -static void smb_tree_close_odirs(smb_tree_t *, uint16_t); -static smb_ofile_t *smb_tree_get_ofile(smb_tree_t *, smb_ofile_t *); -static smb_odir_t *smb_tree_get_odir(smb_tree_t *, smb_odir_t *); +static void smb_tree_close_odirs(smb_tree_t *, uint32_t); static void smb_tree_set_execinfo(smb_tree_t *, smb_shr_execinfo_t *, int); static int smb_tree_enum_private(smb_tree_t *, smb_svcenum_t *); static int smb_tree_netinfo_encode(smb_tree_t *, uint8_t *, size_t, uint32_t *); @@ -303,10 +299,13 @@ out: /* * Disconnect a tree. + * + * The "do_exec" arg is obsolete and ignored. */ void smb_tree_disconnect(smb_tree_t *tree, boolean_t do_exec) { + _NOTE(ARGUNUSED(do_exec)) smb_shr_execinfo_t execinfo; ASSERT(tree->t_magic == SMB_TREE_MAGIC); @@ -314,34 +313,27 @@ smb_tree_disconnect(smb_tree_t *tree, boolean_t do_exec) mutex_enter(&tree->t_mutex); ASSERT(tree->t_refcnt); - if (smb_tree_is_connected_locked(tree)) { - /* - * Indicate that the disconnect process has started. - */ - tree->t_state = SMB_TREE_STATE_DISCONNECTING; + if (!smb_tree_is_connected_locked(tree)) { mutex_exit(&tree->t_mutex); - - if (do_exec) { - /* - * The files opened under this tree are closed. 
- */ - smb_ofile_close_all(tree, 0); - /* - * The directories opened under this tree are closed. - */ - smb_tree_close_odirs(tree, 0); - } - - mutex_enter(&tree->t_mutex); - tree->t_state = SMB_TREE_STATE_DISCONNECTED; - smb_server_dec_trees(tree->t_server); + return; } + /* + * Indicate that the disconnect process has started. + */ + tree->t_state = SMB_TREE_STATE_DISCONNECTING; mutex_exit(&tree->t_mutex); - if (do_exec && (tree->t_state == SMB_TREE_STATE_DISCONNECTED) && - (tree->t_execflags & SMB_EXEC_UNMAP)) { + /* + * The files opened under this tree are closed. + */ + smb_ofile_close_all(tree, 0); + /* + * The directories opened under this tree are closed. + */ + smb_tree_close_odirs(tree, 0); + if ((tree->t_execflags & SMB_EXEC_UNMAP) != 0) { smb_tree_set_execinfo(tree, &execinfo, SMB_EXEC_UNMAP); (void) smb_kshare_exec(tree->t_server, &execinfo); } @@ -408,7 +400,7 @@ smb_tree_release( tree->t_refcnt--; switch (tree->t_state) { - case SMB_TREE_STATE_DISCONNECTED: + case SMB_TREE_STATE_DISCONNECTING: if (tree->t_refcnt == 0) { smb_session_t *ssn = tree->t_session; tree->t_state = SMB_TREE_STATE_DISCONNECTED; @@ -417,7 +409,6 @@ smb_tree_release( } break; case SMB_TREE_STATE_CONNECTED: - case SMB_TREE_STATE_DISCONNECTING: break; default: ASSERT(0); @@ -463,31 +454,29 @@ smb_tree_has_feature(smb_tree_t *tree, uint32_t flags) int smb_tree_enum(smb_tree_t *tree, smb_svcenum_t *svcenum) { + smb_llist_t *of_list; smb_ofile_t *of; - smb_ofile_t *next; int rc = 0; - ASSERT(tree); - ASSERT(tree->t_magic == SMB_TREE_MAGIC); - if (svcenum->se_type == SMB_SVCENUM_TYPE_TREE) return (smb_tree_enum_private(tree, svcenum)); - of = smb_tree_get_ofile(tree, NULL); - while (of) { - ASSERT(of->f_tree == tree); + of_list = &tree->t_ofile_list; + smb_llist_enter(of_list, RW_READER); - rc = smb_ofile_enum(of, svcenum); - if (rc != 0) { + of = smb_llist_head(of_list); + while (of) { + if (smb_ofile_hold(of)) { + rc = smb_ofile_enum(of, svcenum); smb_ofile_release(of); - break; } - - next = smb_tree_get_ofile(tree, of); - smb_ofile_release(of); - of = next; + if (rc != 0) + break; + of = smb_llist_next(of_list, of); } + smb_llist_exit(of_list); + return (rc); } @@ -662,6 +651,9 @@ smb_tree_chkaccess(smb_request_t *sr, smb_kshare_t *shr, vnode_t *vp) return (access); } +/* How long should tree connect wait for DH import to complete? */ +int smb_tcon_import_wait = 20; /* sec. */ + /* * Connect a share for use with files and directories. */ @@ -671,16 +663,14 @@ smb_tree_connect_disk(smb_request_t *sr, smb_arg_tcon_t *tcon) char *sharename = tcon->path; const char *any = "?????"; smb_user_t *user = sr->uid_user; - smb_node_t *dnode = NULL; smb_node_t *snode = NULL; smb_kshare_t *si = tcon->si; char *service = tcon->service; - char last_component[MAXNAMELEN]; smb_tree_t *tree; - cred_t *kcr; int rc; uint32_t access; smb_shr_execinfo_t execinfo; + clock_t time; ASSERT(user); ASSERT(user->u_cred); @@ -694,34 +684,34 @@ smb_tree_connect_disk(smb_request_t *sr, smb_arg_tcon_t *tcon) /* * Check that the shared directory exists. - * Client might not have access to the path _leading_ to the share, - * so we use "kcred" to get to the share root. 
*/ - kcr = zone_kcred(); - rc = smb_pathname_reduce(sr, kcr, si->shr_path, 0, 0, &dnode, - last_component); - if (rc == 0) { - rc = smb_fsop_lookup(sr, kcr, SMB_FOLLOW_LINKS, - sr->sr_server->si_root_smb_node, dnode, last_component, - &snode); - - smb_node_release(dnode); - } - - if (rc) { - if (snode) - smb_node_release(snode); - + snode = si->shr_root_node; + if (snode == NULL) { smb_tree_log(sr, sharename, "bad path: %s", si->shr_path); return (NT_STATUS_BAD_NETWORK_NAME); } if ((access = smb_tree_chkaccess(sr, si, snode->vp)) == 0) { - smb_node_release(snode); return (NT_STATUS_ACCESS_DENIED); } /* + * Wait for DH import of persistent handles to finish. + * If we timeout, it's not clear what status to return, + * but as the share is not really available yet, let's + * return the status for "no such share". + */ + time = SEC_TO_TICK(smb_tcon_import_wait) + ddi_get_lbolt(); + mutex_enter(&si->shr_mutex); + while (si->shr_import_busy != NULL) { + if (cv_timedwait(&si->shr_cv, &si->shr_mutex, time) < 0) { + mutex_exit(&si->shr_mutex); + return (NT_STATUS_BAD_NETWORK_NAME); + } + } + mutex_exit(&si->shr_mutex); + + /* * Set up the OptionalSupport for this share. */ tcon->optional_support = SMB_SUPPORT_SEARCH_BITS; @@ -758,8 +748,6 @@ smb_tree_connect_disk(smb_request_t *sr, smb_arg_tcon_t *tcon) tree = smb_tree_alloc(sr, si, snode, access, sr->sr_cfg->skc_execflags); - smb_node_release(snode); - if (tree == NULL) return (NT_STATUS_INSUFF_SERVER_RESOURCES); @@ -769,7 +757,17 @@ smb_tree_connect_disk(smb_request_t *sr, smb_arg_tcon_t *tcon) rc = smb_kshare_exec(tree->t_server, &execinfo); if ((rc != 0) && (tree->t_execflags & SMB_EXEC_TERM)) { - smb_tree_disconnect(tree, B_FALSE); + /* + * Inline parts of: smb_tree_disconnect() + * Not using smb_tree_disconnect() for cleanup + * here because: we don't want an exec up-call, + * and there can't be any opens as we never + * returned this TID to the client. + */ + mutex_enter(&tree->t_mutex); + tree->t_state = SMB_TREE_STATE_DISCONNECTING; + mutex_exit(&tree->t_mutex); + smb_tree_release(tree); return (NT_STATUS_ACCESS_DENIED); } @@ -901,7 +899,7 @@ smb_tree_connect_ipc(smb_request_t *sr, smb_arg_tcon_t *tcon) /* * Allocate a tree. */ -static smb_tree_t * +smb_tree_t * smb_tree_alloc(smb_request_t *sr, const smb_kshare_t *si, smb_node_t *snode, uint32_t access, uint32_t execflags) { @@ -1001,6 +999,8 @@ smb_tree_dealloc(void *arg) ASSERT(tree->t_state == SMB_TREE_STATE_DISCONNECTED); ASSERT(tree->t_refcnt == 0); + smb_server_dec_trees(tree->t_server); + session = tree->t_session; smb_llist_enter(&session->s_tree_list, RW_WRITER); smb_llist_remove(&session->s_tree_list, tree); @@ -1199,6 +1199,9 @@ smb_tree_get_flags(const smb_kshare_t *si, vfs_t *vfsp, smb_tree_t *tree) if (si->shr_flags & SMB_SHRF_ABE) flags |= SMB_TREE_ABE; + if (si->shr_flags & SMB_SHRF_CA) + flags |= SMB_TREE_CA; + if (si->shr_flags & SMB_SHRF_FSO) flags |= SMB_TREE_FORCE_L2_OPLOCK; @@ -1361,83 +1364,6 @@ smb_tree_is_connected(smb_tree_t *tree) } /* - * Get the next open ofile in the list. A reference is taken on - * the ofile, which can be released later with smb_ofile_release(). - * - * If the specified ofile is NULL, search from the beginning of the - * list. Otherwise, the search starts just after that ofile. - * - * Returns NULL if there are no open files in the list. 
- */ -static smb_ofile_t * -smb_tree_get_ofile(smb_tree_t *tree, smb_ofile_t *of) -{ - smb_llist_t *ofile_list; - - ASSERT(tree); - ASSERT(tree->t_magic == SMB_TREE_MAGIC); - - ofile_list = &tree->t_ofile_list; - smb_llist_enter(ofile_list, RW_READER); - - if (of) { - ASSERT(of->f_magic == SMB_OFILE_MAGIC); - of = smb_llist_next(ofile_list, of); - } else { - of = smb_llist_head(ofile_list); - } - - while (of) { - if (smb_ofile_hold(of)) - break; - - of = smb_llist_next(ofile_list, of); - } - - smb_llist_exit(ofile_list); - return (of); -} - -/* - * smb_tree_get_odir - * - * Find the next odir in the tree's list of odirs, and obtain a - * hold on it. - * If the specified odir is NULL the search starts at the beginning - * of the tree's odir list, otherwise the search starts after the - * specified odir. - */ -static smb_odir_t * -smb_tree_get_odir(smb_tree_t *tree, smb_odir_t *od) -{ - smb_llist_t *od_list; - - ASSERT(tree); - ASSERT(tree->t_magic == SMB_TREE_MAGIC); - - od_list = &tree->t_odir_list; - smb_llist_enter(od_list, RW_READER); - - if (od) { - ASSERT(od->d_magic == SMB_ODIR_MAGIC); - od = smb_llist_next(od_list, od); - } else { - od = smb_llist_head(od_list); - } - - while (od) { - ASSERT(od->d_magic == SMB_ODIR_MAGIC); - - if (smb_odir_hold(od)) - break; - od = smb_llist_next(od_list, od); - } - - smb_llist_exit(od_list); - return (od); -} - -/* * smb_tree_close_odirs * * Close all open odirs in the tree's list which were opened by @@ -1445,25 +1371,34 @@ smb_tree_get_odir(smb_tree_t *tree, smb_odir_t *od) * If pid is zero, close all open odirs in the tree's list. */ static void -smb_tree_close_odirs(smb_tree_t *tree, uint16_t pid) +smb_tree_close_odirs(smb_tree_t *tree, uint32_t pid) { - smb_odir_t *od, *next_od; + smb_llist_t *od_list; + smb_odir_t *od; ASSERT(tree); ASSERT(tree->t_magic == SMB_TREE_MAGIC); - od = smb_tree_get_odir(tree, NULL); - while (od) { + od_list = &tree->t_odir_list; + smb_llist_enter(od_list, RW_READER); + + for (od = smb_llist_head(od_list); + od != NULL; + od = smb_llist_next(od_list, od)) { + ASSERT(od->d_magic == SMB_ODIR_MAGIC); ASSERT(od->d_tree == tree); - next_od = smb_tree_get_odir(tree, od); - if ((pid == 0) || (od->d_opened_by_pid == pid)) - smb_odir_close(od); - smb_odir_release(od); + if (pid != 0 && od->d_opened_by_pid != pid) + continue; - od = next_od; + if (smb_odir_hold(od)) { + smb_odir_close(od); + smb_odir_release(od); + } } + + smb_llist_exit(od_list); } static void diff --git a/usr/src/uts/common/fs/smbsrv/smb_user.c b/usr/src/uts/common/fs/smbsrv/smb_user.c index 0bfceb4ff4..74bb502c56 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_user.c +++ b/usr/src/uts/common/fs/smbsrv/smb_user.c @@ -303,7 +303,6 @@ smb_user_logon( * we always have an auth. socket to close. */ authsock = user->u_authsock; - ASSERT(authsock != NULL); user->u_authsock = NULL; tmo = user->u_auth_tmo; user->u_auth_tmo = NULL; @@ -325,7 +324,8 @@ smb_user_logon( (void) untimeout(tmo); /* This close can block, so not under the mutex. */ - smb_authsock_close(user, authsock); + if (authsock != NULL) + smb_authsock_close(user, authsock); return (0); } diff --git a/usr/src/uts/common/fs/smbsrv/smb_vfs.c b/usr/src/uts/common/fs/smbsrv/smb_vfs.c deleted file mode 100644 index ae631e4ffa..0000000000 --- a/usr/src/uts/common/fs/smbsrv/smb_vfs.c +++ /dev/null @@ -1,164 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). 
- * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013 Nexenta Systems, Inc. All rights reserved. - */ - -#include <sys/vfs.h> -#include <smbsrv/smb_ktypes.h> -#include <smbsrv/smb_kproto.h> - -static smb_vfs_t *smb_vfs_find(smb_export_t *, vfs_t *); -static void smb_vfs_destroy(smb_vfs_t *); - -/* - * If a hold on the specified VFS has already been taken - * then only increment the reference count of the corresponding - * smb_vfs_t structure. If no smb_vfs_t structure has been created - * yet for the specified VFS then create one and take a hold on - * the VFS. - */ -int -smb_vfs_hold(smb_export_t *se, vfs_t *vfsp) -{ - smb_vfs_t *smb_vfs; - vnode_t *rootvp; - int rc; - - if (se == NULL || vfsp == NULL) - return (EINVAL); - - smb_llist_enter(&se->e_vfs_list, RW_WRITER); - - if ((smb_vfs = smb_vfs_find(se, vfsp)) != NULL) { - smb_vfs->sv_refcnt++; - DTRACE_PROBE1(smb_vfs_hold_hit, smb_vfs_t *, smb_vfs); - smb_llist_exit(&se->e_vfs_list); - return (0); - } - - if ((rc = VFS_ROOT(vfsp, &rootvp)) != 0) { - smb_llist_exit(&se->e_vfs_list); - return (rc); - } - - smb_vfs = kmem_cache_alloc(smb_kshare_cache_vfs, KM_SLEEP); - - bzero(smb_vfs, sizeof (smb_vfs_t)); - - smb_vfs->sv_magic = SMB_VFS_MAGIC; - smb_vfs->sv_refcnt = 1; - smb_vfs->sv_vfsp = vfsp; - /* - * We have a hold on the root vnode of the file system - * from the VFS_ROOT call above. - */ - smb_vfs->sv_rootvp = rootvp; - - smb_llist_insert_head(&se->e_vfs_list, smb_vfs); - DTRACE_PROBE1(smb_vfs_hold_miss, smb_vfs_t *, smb_vfs); - smb_llist_exit(&se->e_vfs_list); - - return (0); -} - -/* - * smb_vfs_rele - * - * Decrements the reference count of the fs passed in. If the reference count - * drops to zero the smb_vfs_t structure associated with the fs is freed. - */ -void -smb_vfs_rele(smb_export_t *se, vfs_t *vfsp) -{ - smb_vfs_t *smb_vfs; - - ASSERT(vfsp); - - smb_llist_enter(&se->e_vfs_list, RW_WRITER); - smb_vfs = smb_vfs_find(se, vfsp); - DTRACE_PROBE1(smb_vfs_release, smb_vfs_t *, smb_vfs); - if (smb_vfs) { - ASSERT(smb_vfs->sv_refcnt); - if (--smb_vfs->sv_refcnt == 0) { - smb_llist_remove(&se->e_vfs_list, smb_vfs); - smb_llist_exit(&se->e_vfs_list); - smb_vfs_destroy(smb_vfs); - return; - } - } - smb_llist_exit(&se->e_vfs_list); -} - -/* - * smb_vfs_rele_all() - * - * Release all holds on root vnodes of file systems which were taken - * due to the existence of at least one enabled share on the file system. - * Called at driver close time. 
- */ -void -smb_vfs_rele_all(smb_export_t *se) -{ - smb_vfs_t *smb_vfs; - - smb_llist_enter(&se->e_vfs_list, RW_WRITER); - while ((smb_vfs = smb_llist_head(&se->e_vfs_list)) != NULL) { - - ASSERT(smb_vfs->sv_magic == SMB_VFS_MAGIC); - DTRACE_PROBE1(smb_vfs_rele_all_hit, smb_vfs_t *, smb_vfs); - smb_llist_remove(&se->e_vfs_list, smb_vfs); - smb_vfs_destroy(smb_vfs); - } - smb_llist_exit(&se->e_vfs_list); -} - -/* - * Goes through the list of smb_vfs_t structure and returns the one matching - * the vnode passed in. If no match is found a NULL pointer is returned. - * - * The list of smb_vfs_t structures has to have been entered prior calling - * this function. - */ -static smb_vfs_t * -smb_vfs_find(smb_export_t *se, vfs_t *vfsp) -{ - smb_vfs_t *smb_vfs; - - smb_vfs = smb_llist_head(&se->e_vfs_list); - while (smb_vfs) { - ASSERT(smb_vfs->sv_magic == SMB_VFS_MAGIC); - if (smb_vfs->sv_vfsp == vfsp) - return (smb_vfs); - smb_vfs = smb_llist_next(&se->e_vfs_list, smb_vfs); - } - - return (NULL); -} - -static void -smb_vfs_destroy(smb_vfs_t *smb_vfs) -{ - VN_RELE(smb_vfs->sv_rootvp); - smb_vfs->sv_magic = (uint32_t)~SMB_VFS_MAGIC; - kmem_cache_free(smb_kshare_cache_vfs, smb_vfs); -} diff --git a/usr/src/uts/common/fs/smbsrv/smb_vops.c b/usr/src/uts/common/fs/smbsrv/smb_vops.c index d2f0fd7085..4b0f99839f 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_vops.c +++ b/usr/src/uts/common/fs/smbsrv/smb_vops.c @@ -608,8 +608,14 @@ smb_vop_lookup( char *np = name; char namebuf[MAXNAMELEN]; - if (*name == '\0') - return (EINVAL); + if (*name == '\0') { + /* + * This happens creating named streams at the share root. + */ + VN_HOLD(dvp); + *vpp = dvp; + return (0); + } ASSERT(vpp); *vpp = NULL; diff --git a/usr/src/uts/common/io/vioblk/vioblk.c b/usr/src/uts/common/io/vioblk/vioblk.c index 074d886857..8801a0e760 100644 --- a/usr/src/uts/common/io/vioblk/vioblk.c +++ b/usr/src/uts/common/io/vioblk/vioblk.c @@ -22,9 +22,50 @@ /* * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, Alexey Zaytsev <alexey.zaytsev@gmail.com> - * Copyright 2017, Joyent Inc. + * Copyright 2019 Joyent Inc. */ +/* + * VIRTIO BLOCK DRIVER + * + * This driver provides support for Virtio Block devices. Each driver instance + * attaches to a single underlying block device. + * + * REQUEST CHAIN LAYOUT + * + * Every request chain sent to the I/O queue has the following structure. Each + * box in the diagram represents a descriptor entry (i.e., a DMA cookie) within + * the chain: + * + * +-0-----------------------------------------+ + * | struct virtio_blk_hdr |-----------------------\ + * | (written by driver, read by device) | | + * +-1-----------------------------------------+ | + * | optional data payload |--\ | + * | (written by driver for write requests, | | | + * | or by device for read requests) | | | + * +-2-----------------------------------------+ | | + * | ,~` : |-cookies loaned | + * |/ : ,~`| | from blkdev | + * : / | | | + * +-(N - 1)-----------------------------------+ | | + * | ... end of data payload. 
| | | + * | | | | + * | |--/ | + * +-N-----------------------------------------+ | + * | status byte | | + * | (written by device, read by driver) |--------------------\ | + * +-------------------------------------------+ | | + * | | + * The memory for the header and status bytes (i.e., 0 and N above) | | + * is allocated as a single chunk by vioblk_alloc_reqs(): | | + * | | + * +-------------------------------------------+ | | + * | struct virtio_blk_hdr |<----------------------/ + * +-------------------------------------------+ | + * | status byte |<-------------------/ + * +-------------------------------------------+ + */ #include <sys/modctl.h> #include <sys/blkdev.h> @@ -43,402 +84,429 @@ #include <sys/debug.h> #include <sys/pci.h> #include <sys/containerof.h> -#include "virtiovar.h" -#include "virtioreg.h" - -/* Feature bits */ -#define VIRTIO_BLK_F_BARRIER (1<<0) -#define VIRTIO_BLK_F_SIZE_MAX (1<<1) -#define VIRTIO_BLK_F_SEG_MAX (1<<2) -#define VIRTIO_BLK_F_GEOMETRY (1<<4) -#define VIRTIO_BLK_F_RO (1<<5) -#define VIRTIO_BLK_F_BLK_SIZE (1<<6) -#define VIRTIO_BLK_F_SCSI (1<<7) -#define VIRTIO_BLK_F_FLUSH (1<<9) -#define VIRTIO_BLK_F_TOPOLOGY (1<<10) - -/* Configuration registers */ -#define VIRTIO_BLK_CONFIG_CAPACITY 0 /* 64bit */ -#define VIRTIO_BLK_CONFIG_SIZE_MAX 8 /* 32bit */ -#define VIRTIO_BLK_CONFIG_SEG_MAX 12 /* 32bit */ -#define VIRTIO_BLK_CONFIG_GEOMETRY_C 16 /* 16bit */ -#define VIRTIO_BLK_CONFIG_GEOMETRY_H 18 /* 8bit */ -#define VIRTIO_BLK_CONFIG_GEOMETRY_S 19 /* 8bit */ -#define VIRTIO_BLK_CONFIG_BLK_SIZE 20 /* 32bit */ -#define VIRTIO_BLK_CONFIG_TOPO_PBEXP 24 /* 8bit */ -#define VIRTIO_BLK_CONFIG_TOPO_ALIGN 25 /* 8bit */ -#define VIRTIO_BLK_CONFIG_TOPO_MIN_SZ 26 /* 16bit */ -#define VIRTIO_BLK_CONFIG_TOPO_OPT_SZ 28 /* 32bit */ - -/* Command */ -#define VIRTIO_BLK_T_IN 0 -#define VIRTIO_BLK_T_OUT 1 -#define VIRTIO_BLK_T_SCSI_CMD 2 -#define VIRTIO_BLK_T_SCSI_CMD_OUT 3 -#define VIRTIO_BLK_T_FLUSH 4 -#define VIRTIO_BLK_T_FLUSH_OUT 5 -#define VIRTIO_BLK_T_GET_ID 8 -#define VIRTIO_BLK_T_BARRIER 0x80000000 - -#define VIRTIO_BLK_ID_BYTES 20 /* devid */ - -/* Statuses */ -#define VIRTIO_BLK_S_OK 0 -#define VIRTIO_BLK_S_IOERR 1 -#define VIRTIO_BLK_S_UNSUPP 2 - -#define DEF_MAXINDIRECT (128) -#define DEF_MAXSECTOR (4096) - -#define VIOBLK_POISON 0xdead0001dead0001 +#include <sys/ctype.h> +#include <sys/sysmacros.h> -/* - * Static Variables. 
- */ -static char vioblk_ident[] = "VirtIO block driver"; +#include "virtio.h" +#include "vioblk.h" -/* Request header structure */ -struct vioblk_req_hdr { - uint32_t type; /* VIRTIO_BLK_T_* */ - uint32_t ioprio; - uint64_t sector; -}; -struct vioblk_req { - struct vioblk_req_hdr hdr; - uint8_t status; - uint8_t unused[3]; - unsigned int ndmac; - ddi_dma_handle_t dmah; - ddi_dma_handle_t bd_dmah; - ddi_dma_cookie_t dmac; - bd_xfer_t *xfer; -}; +static void vioblk_get_id(vioblk_t *); +uint_t vioblk_int_handler(caddr_t, caddr_t); +static uint_t vioblk_poll(vioblk_t *); +static int vioblk_quiesce(dev_info_t *); +static int vioblk_attach(dev_info_t *, ddi_attach_cmd_t); +static int vioblk_detach(dev_info_t *, ddi_detach_cmd_t); -struct vioblk_stats { - struct kstat_named sts_rw_outofmemory; - struct kstat_named sts_rw_badoffset; - struct kstat_named sts_rw_queuemax; - struct kstat_named sts_rw_cookiesmax; - struct kstat_named sts_rw_cacheflush; - struct kstat_named sts_intr_queuemax; - struct kstat_named sts_intr_total; - struct kstat_named sts_io_errors; - struct kstat_named sts_unsupp_errors; - struct kstat_named sts_nxio_errors; + +static struct dev_ops vioblk_dev_ops = { + .devo_rev = DEVO_REV, + .devo_refcnt = 0, + + .devo_attach = vioblk_attach, + .devo_detach = vioblk_detach, + .devo_quiesce = vioblk_quiesce, + + .devo_getinfo = ddi_no_info, + .devo_identify = nulldev, + .devo_probe = nulldev, + .devo_reset = nodev, + .devo_cb_ops = NULL, + .devo_bus_ops = NULL, + .devo_power = NULL, }; -struct vioblk_lstats { - uint64_t rw_cacheflush; - uint64_t intr_total; - unsigned int rw_cookiesmax; - unsigned int intr_queuemax; - unsigned int io_errors; - unsigned int unsupp_errors; - unsigned int nxio_errors; +static struct modldrv vioblk_modldrv = { + .drv_modops = &mod_driverops, + .drv_linkinfo = "VIRTIO block driver", + .drv_dev_ops = &vioblk_dev_ops }; -struct vioblk_softc { - dev_info_t *sc_dev; /* mirrors virtio_softc->sc_dev */ - struct virtio_softc sc_virtio; - struct virtqueue *sc_vq; - bd_handle_t bd_h; - struct vioblk_req *sc_reqs; - struct vioblk_stats *ks_data; - kstat_t *sc_intrstat; - uint64_t sc_capacity; - uint64_t sc_nblks; - struct vioblk_lstats sc_stats; - short sc_blkflags; - boolean_t sc_in_poll_mode; - boolean_t sc_readonly; - int sc_blk_size; - int sc_pblk_size; - int sc_seg_max; - int sc_seg_size_max; - kmutex_t lock_devid; - kcondvar_t cv_devid; - char devid[VIRTIO_BLK_ID_BYTES + 1]; +static struct modlinkage vioblk_modlinkage = { + .ml_rev = MODREV_1, + .ml_linkage = { &vioblk_modldrv, NULL } }; -static int vioblk_get_id(struct vioblk_softc *sc); - -static int vioblk_read(void *arg, bd_xfer_t *xfer); -static int vioblk_write(void *arg, bd_xfer_t *xfer); -static int vioblk_flush(void *arg, bd_xfer_t *xfer); -static void vioblk_driveinfo(void *arg, bd_drive_t *drive); -static int vioblk_mediainfo(void *arg, bd_media_t *media); -static int vioblk_devid_init(void *, dev_info_t *, ddi_devid_t *); -uint_t vioblk_int_handler(caddr_t arg1, caddr_t arg2); - -static bd_ops_t vioblk_ops = { - BD_OPS_VERSION_0, - vioblk_driveinfo, - vioblk_mediainfo, - vioblk_devid_init, - vioblk_flush, - vioblk_read, - vioblk_write, +/* + * DMA attribute template for header and status blocks. We also make a + * per-instance copy of this template with negotiated sizes from the device for + * blkdev. 
+ */ +static const ddi_dma_attr_t vioblk_dma_attr = { + .dma_attr_version = DMA_ATTR_V0, + .dma_attr_addr_lo = 0x0000000000000000, + .dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF, + .dma_attr_count_max = 0x00000000FFFFFFFF, + .dma_attr_align = 1, + .dma_attr_burstsizes = 1, + .dma_attr_minxfer = 1, + .dma_attr_maxxfer = 0x00000000FFFFFFFF, + .dma_attr_seg = 0x00000000FFFFFFFF, + .dma_attr_sgllen = 1, + .dma_attr_granular = 1, + .dma_attr_flags = 0 }; -static int vioblk_quiesce(dev_info_t *); -static int vioblk_attach(dev_info_t *, ddi_attach_cmd_t); -static int vioblk_detach(dev_info_t *, ddi_detach_cmd_t); -static struct dev_ops vioblk_dev_ops = { - DEVO_REV, - 0, - ddi_no_info, - nulldev, /* identify */ - nulldev, /* probe */ - vioblk_attach, /* attach */ - vioblk_detach, /* detach */ - nodev, /* reset */ - NULL, /* cb_ops */ - NULL, /* bus_ops */ - NULL, /* power */ - vioblk_quiesce /* quiesce */ -}; +static vioblk_req_t * +vioblk_req_alloc(vioblk_t *vib) +{ + vioblk_req_t *vbr; + VERIFY(MUTEX_HELD(&vib->vib_mutex)); + if ((vbr = list_remove_head(&vib->vib_reqs)) == NULL) { + return (NULL); + } + vib->vib_nreqs_alloc++; -/* Standard Module linkage initialization for a Streams driver */ -extern struct mod_ops mod_driverops; + VERIFY0(vbr->vbr_status); + vbr->vbr_status |= VIOBLK_REQSTAT_ALLOCATED; -static struct modldrv modldrv = { - &mod_driverops, /* Type of module. This one is a driver */ - vioblk_ident, /* short description */ - &vioblk_dev_ops /* driver specific ops */ -}; + VERIFY3P(vbr->vbr_xfer, ==, NULL); + VERIFY3S(vbr->vbr_error, ==, 0); -static struct modlinkage modlinkage = { - MODREV_1, - { - (void *)&modldrv, - NULL, - }, -}; + return (vbr); +} -ddi_device_acc_attr_t vioblk_attr = { - DDI_DEVICE_ATTR_V0, - DDI_NEVERSWAP_ACC, /* virtio is always native byte order */ - DDI_STORECACHING_OK_ACC, - DDI_DEFAULT_ACC -}; +static void +vioblk_req_free(vioblk_t *vib, vioblk_req_t *vbr) +{ + VERIFY(MUTEX_HELD(&vib->vib_mutex)); -/* DMA attr for the header/status blocks. */ -static ddi_dma_attr_t vioblk_req_dma_attr = { - DMA_ATTR_V0, /* dma_attr version */ - 0, /* dma_attr_addr_lo */ - 0xFFFFFFFFFFFFFFFFull, /* dma_attr_addr_hi */ - 0x00000000FFFFFFFFull, /* dma_attr_count_max */ - 1, /* dma_attr_align */ - 1, /* dma_attr_burstsizes */ - 1, /* dma_attr_minxfer */ - 0xFFFFFFFFull, /* dma_attr_maxxfer */ - 0xFFFFFFFFFFFFFFFFull, /* dma_attr_seg */ - 1, /* dma_attr_sgllen */ - 1, /* dma_attr_granular */ - 0, /* dma_attr_flags */ -}; + /* + * Check that this request was allocated, then zero the status field to + * clear all status bits. + */ + VERIFY(vbr->vbr_status & VIOBLK_REQSTAT_ALLOCATED); + vbr->vbr_status = 0; -/* DMA attr for the data blocks. 
*/ -static ddi_dma_attr_t vioblk_bd_dma_attr = { - DMA_ATTR_V0, /* dma_attr version */ - 0, /* dma_attr_addr_lo */ - 0xFFFFFFFFFFFFFFFFull, /* dma_attr_addr_hi */ - 0x00000000FFFFFFFFull, /* dma_attr_count_max */ - 1, /* dma_attr_align */ - 1, /* dma_attr_burstsizes */ - 1, /* dma_attr_minxfer */ - 0, /* dma_attr_maxxfer, set in attach */ - 0xFFFFFFFFFFFFFFFFull, /* dma_attr_seg */ - 0, /* dma_attr_sgllen, set in attach */ - 1, /* dma_attr_granular */ - 0, /* dma_attr_flags */ -}; + vbr->vbr_xfer = NULL; + vbr->vbr_error = 0; + vbr->vbr_type = 0; -static int -vioblk_rw(struct vioblk_softc *sc, bd_xfer_t *xfer, int type, - uint32_t len) + list_insert_head(&vib->vib_reqs, vbr); + + VERIFY3U(vib->vib_nreqs_alloc, >, 0); + vib->vib_nreqs_alloc--; +} + +static void +vioblk_complete(vioblk_t *vib, vioblk_req_t *vbr) { - struct vioblk_req *req; - struct vq_entry *ve_hdr; - int total_cookies, write; + VERIFY(MUTEX_HELD(&vib->vib_mutex)); - write = (type == VIRTIO_BLK_T_OUT || - type == VIRTIO_BLK_T_FLUSH_OUT) ? 1 : 0; - total_cookies = 2; + VERIFY(!(vbr->vbr_status & VIOBLK_REQSTAT_COMPLETE)); + vbr->vbr_status |= VIOBLK_REQSTAT_COMPLETE; - if ((xfer->x_blkno + xfer->x_nblks) > sc->sc_nblks) { - sc->ks_data->sts_rw_badoffset.value.ui64++; - return (EINVAL); + if (vbr->vbr_type == VIRTIO_BLK_T_FLUSH) { + vib->vib_stats->vbs_rw_cacheflush.value.ui64++; } - /* allocate top entry */ - ve_hdr = vq_alloc_entry(sc->sc_vq); - if (!ve_hdr) { - sc->ks_data->sts_rw_outofmemory.value.ui64++; - return (ENOMEM); + if (vbr->vbr_xfer != NULL) { + /* + * This is a blkdev framework request. + */ + mutex_exit(&vib->vib_mutex); + bd_xfer_done(vbr->vbr_xfer, vbr->vbr_error); + mutex_enter(&vib->vib_mutex); + vbr->vbr_xfer = NULL; } +} - /* getting request */ - req = &sc->sc_reqs[ve_hdr->qe_index]; - req->hdr.type = type; - req->hdr.ioprio = 0; - req->hdr.sector = xfer->x_blkno; - req->xfer = xfer; - - /* Header */ - virtio_ve_add_indirect_buf(ve_hdr, req->dmac.dmac_laddress, - sizeof (struct vioblk_req_hdr), B_TRUE); - - /* Payload */ - if (len > 0) { - virtio_ve_add_cookie(ve_hdr, xfer->x_dmah, xfer->x_dmac, - xfer->x_ndmac, write ? B_TRUE : B_FALSE); - total_cookies += xfer->x_ndmac; +static virtio_chain_t * +vioblk_common_start(vioblk_t *vib, int type, uint64_t sector, + boolean_t polled) +{ + vioblk_req_t *vbr = NULL; + virtio_chain_t *vic = NULL; + + if ((vbr = vioblk_req_alloc(vib)) == NULL) { + vib->vib_stats->vbs_rw_outofmemory.value.ui64++; + return (NULL); + } + vbr->vbr_type = type; + + if (polled) { + /* + * Mark this command as polled so that we can wait on it + * ourselves. + */ + vbr->vbr_status |= VIOBLK_REQSTAT_POLLED; } - /* Status */ - virtio_ve_add_indirect_buf(ve_hdr, - req->dmac.dmac_laddress + sizeof (struct vioblk_req_hdr), - sizeof (uint8_t), B_FALSE); + if ((vic = virtio_chain_alloc(vib->vib_vq, KM_NOSLEEP)) == NULL) { + vib->vib_stats->vbs_rw_outofmemory.value.ui64++; + goto fail; + } - /* sending the whole chain to the device */ - virtio_push_chain(ve_hdr, B_TRUE); + struct vioblk_req_hdr vbh; + vbh.vbh_type = type; + vbh.vbh_ioprio = 0; + vbh.vbh_sector = sector; + bcopy(&vbh, virtio_dma_va(vbr->vbr_dma, 0), sizeof (vbh)); - if (sc->sc_stats.rw_cookiesmax < total_cookies) - sc->sc_stats.rw_cookiesmax = total_cookies; + virtio_chain_data_set(vic, vbr); - return (DDI_SUCCESS); + /* + * Put the header in the first descriptor. See the block comment at + * the top of the file for more details on the chain layout. 
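(For orientation only, not additional driver code: a complete chain built by this driver always carries the header in the first descriptor, the payload cookies, if any, in the middle, and the one-byte status last, so a request with N data cookies occupies N + 2 descriptors, which is also why the I/O queue is sized for vib_seg_max + 2 entries per chain. In the condensed sketch below, hdr_pa stands in for virtio_dma_cookie_pa(vbr->vbr_dma, 0), and ncookies, cookie_pa[], cookie_size[] and is_write are placeholders for the payload cookies and request direction; error handling is elided.)

	/* Sketch: descriptor 0 is the header, which the device reads. */
	(void) virtio_chain_append(vic, hdr_pa,
	    sizeof (struct vioblk_req_hdr), VIRTIO_DIR_DEVICE_READS);
	/* Descriptors 1..N: payload; direction depends on read vs. write. */
	for (uint_t n = 0; n < ncookies; n++) {
		(void) virtio_chain_append(vic, cookie_pa[n], cookie_size[n],
		    is_write ? VIRTIO_DIR_DEVICE_READS :
		    VIRTIO_DIR_DEVICE_WRITES);
	}
	/* Descriptor N + 1: the status byte, which the device writes back. */
	(void) virtio_chain_append(vic,
	    hdr_pa + sizeof (struct vioblk_req_hdr), sizeof (uint8_t),
	    VIRTIO_DIR_DEVICE_WRITES);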
+ */ + if (virtio_chain_append(vic, virtio_dma_cookie_pa(vbr->vbr_dma, 0), + sizeof (struct vioblk_req_hdr), VIRTIO_DIR_DEVICE_READS) != + DDI_SUCCESS) { + goto fail; + } + + return (vic); + +fail: + vbr->vbr_xfer = NULL; + vioblk_req_free(vib, vbr); + if (vic != NULL) { + virtio_chain_free(vic); + } + return (NULL); } -/* - * Now in polling mode. Interrupts are off, so we - * 1) poll for the already queued requests to complete. - * 2) push our request. - * 3) wait for our request to complete. - */ static int -vioblk_rw_poll(struct vioblk_softc *sc, bd_xfer_t *xfer, - int type, uint32_t len) +vioblk_common_submit(vioblk_t *vib, virtio_chain_t *vic) { - clock_t tmout; - int ret; + int r; + vioblk_req_t *vbr = virtio_chain_data(vic); - ASSERT(xfer->x_flags & BD_XFER_POLL); + VERIFY(MUTEX_HELD(&vib->vib_mutex)); - /* Prevent a hard hang. */ - tmout = drv_usectohz(30000000); - - /* Poll for an empty queue */ - while (vq_num_used(sc->sc_vq)) { - /* Check if any pending requests completed. */ - ret = vioblk_int_handler((caddr_t)&sc->sc_virtio, NULL); - if (ret != DDI_INTR_CLAIMED) { - drv_usecwait(10); - tmout -= 10; - return (ETIMEDOUT); - } + /* + * The device will write the status byte into this last descriptor. + * See the block comment at the top of the file for more details on the + * chain layout. + */ + if (virtio_chain_append(vic, virtio_dma_cookie_pa(vbr->vbr_dma, 0) + + sizeof (struct vioblk_req_hdr), sizeof (uint8_t), + VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) { + r = ENOMEM; + goto out; } - ret = vioblk_rw(sc, xfer, type, len); - if (ret) - return (ret); + virtio_dma_sync(vbr->vbr_dma, DDI_DMA_SYNC_FORDEV); + virtio_chain_submit(vic, B_TRUE); + + if (!(vbr->vbr_status & VIOBLK_REQSTAT_POLLED)) { + /* + * This is not a polled request. Our request will be freed and + * the caller notified later in vioblk_poll(). + */ + return (0); + } - tmout = drv_usectohz(30000000); - /* Poll for an empty queue again. */ - while (vq_num_used(sc->sc_vq)) { - /* Check if any pending requests completed. */ - ret = vioblk_int_handler((caddr_t)&sc->sc_virtio, NULL); - if (ret != DDI_INTR_CLAIMED) { + /* + * This is a polled request. We need to block here and wait for the + * device to complete request processing. + */ + while (!(vbr->vbr_status & VIOBLK_REQSTAT_POLL_COMPLETE)) { + if (ddi_in_panic()) { + /* + * When panicking, interrupts are disabled. We must + * poll the queue manually. + */ drv_usecwait(10); - tmout -= 10; - return (ETIMEDOUT); + (void) vioblk_poll(vib); + continue; } + + /* + * When not panicking, the device will interrupt on command + * completion and vioblk_poll() will be called to wake us up. + */ + cv_wait(&vib->vib_cv, &vib->vib_mutex); } - return (DDI_SUCCESS); + vioblk_complete(vib, vbr); + r = vbr->vbr_error; + +out: + vioblk_req_free(vib, vbr); + virtio_chain_free(vic); + return (r); } static int -vioblk_read(void *arg, bd_xfer_t *xfer) +vioblk_internal(vioblk_t *vib, int type, virtio_dma_t *dma, + uint64_t sector, virtio_direction_t dir) { - int ret; - struct vioblk_softc *sc = (void *)arg; + virtio_chain_t *vic; + vioblk_req_t *vbr; + int r; - if (xfer->x_flags & BD_XFER_POLL) { - if (!sc->sc_in_poll_mode) { - virtio_stop_vq_intr(sc->sc_vq); - sc->sc_in_poll_mode = 1; - } + VERIFY(MUTEX_HELD(&vib->vib_mutex)); - ret = vioblk_rw_poll(sc, xfer, VIRTIO_BLK_T_IN, - xfer->x_nblks * DEV_BSIZE); - } else { - if (sc->sc_in_poll_mode) { - virtio_start_vq_intr(sc->sc_vq); - sc->sc_in_poll_mode = 0; - } + /* + * Allocate a polled request. 
+ */ + if ((vic = vioblk_common_start(vib, type, sector, B_TRUE)) == NULL) { + return (ENOMEM); + } + vbr = virtio_chain_data(vic); - ret = vioblk_rw(sc, xfer, VIRTIO_BLK_T_IN, - xfer->x_nblks * DEV_BSIZE); + /* + * If there is a request payload, it goes between the header and the + * status byte. See the block comment at the top of the file for more + * detail on the chain layout. + */ + if (dma != NULL) { + for (uint_t n = 0; n < virtio_dma_ncookies(dma); n++) { + if (virtio_chain_append(vic, + virtio_dma_cookie_pa(dma, n), + virtio_dma_cookie_size(dma, n), dir) != + DDI_SUCCESS) { + r = ENOMEM; + goto out; + } + } } - return (ret); + return (vioblk_common_submit(vib, vic)); + +out: + vioblk_req_free(vib, vbr); + virtio_chain_free(vic); + return (r); } static int -vioblk_write(void *arg, bd_xfer_t *xfer) +vioblk_request(vioblk_t *vib, bd_xfer_t *xfer, int type) { - int ret; - struct vioblk_softc *sc = (void *)arg; + virtio_chain_t *vic = NULL; + vioblk_req_t *vbr = NULL; + uint_t total_cookies = 2; + boolean_t polled = (xfer->x_flags & BD_XFER_POLL) != 0; + int r; - if (xfer->x_flags & BD_XFER_POLL) { - if (!sc->sc_in_poll_mode) { - virtio_stop_vq_intr(sc->sc_vq); - sc->sc_in_poll_mode = 1; - } + VERIFY(MUTEX_HELD(&vib->vib_mutex)); + + /* + * Ensure that this request falls within the advertised size of the + * block device. Be careful to avoid overflow. + */ + if (xfer->x_nblks > SIZE_MAX - xfer->x_blkno || + (xfer->x_blkno + xfer->x_nblks) > vib->vib_nblks) { + vib->vib_stats->vbs_rw_badoffset.value.ui64++; + return (EINVAL); + } - ret = vioblk_rw_poll(sc, xfer, VIRTIO_BLK_T_OUT, - xfer->x_nblks * DEV_BSIZE); - } else { - if (sc->sc_in_poll_mode) { - virtio_start_vq_intr(sc->sc_vq); - sc->sc_in_poll_mode = 0; + if ((vic = vioblk_common_start(vib, type, xfer->x_blkno, polled)) == + NULL) { + return (ENOMEM); + } + vbr = virtio_chain_data(vic); + vbr->vbr_xfer = xfer; + + /* + * If there is a request payload, it goes between the header and the + * status byte. See the block comment at the top of the file for more + * detail on the chain layout. + */ + if ((type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_OUT) && + xfer->x_nblks > 0) { + virtio_direction_t dir = (type == VIRTIO_BLK_T_OUT) ? + VIRTIO_DIR_DEVICE_READS : VIRTIO_DIR_DEVICE_WRITES; + + for (uint_t n = 0; n < xfer->x_ndmac; n++) { + ddi_dma_cookie_t dmac; + + if (n == 0) { + /* + * The first cookie is in the blkdev request. 
+ */ + dmac = xfer->x_dmac; + } else { + ddi_dma_nextcookie(xfer->x_dmah, &dmac); + } + + if (virtio_chain_append(vic, dmac.dmac_laddress, + dmac.dmac_size, dir) != DDI_SUCCESS) { + r = ENOMEM; + goto fail; + } } - ret = vioblk_rw(sc, xfer, VIRTIO_BLK_T_OUT, - xfer->x_nblks * DEV_BSIZE); + total_cookies += xfer->x_ndmac; + + } else if (xfer->x_nblks > 0) { + dev_err(vib->vib_dip, CE_PANIC, + "request of type %d had payload length of %lu blocks", type, + xfer->x_nblks); + } + + if (vib->vib_stats->vbs_rw_cookiesmax.value.ui32 < total_cookies) { + vib->vib_stats->vbs_rw_cookiesmax.value.ui32 = total_cookies; } - return (ret); + + return (vioblk_common_submit(vib, vic)); + +fail: + vbr->vbr_xfer = NULL; + vioblk_req_free(vib, vbr); + virtio_chain_free(vic); + return (r); } static int -vioblk_flush(void *arg, bd_xfer_t *xfer) +vioblk_bd_read(void *arg, bd_xfer_t *xfer) { - int ret; - struct vioblk_softc *sc = (void *)arg; + vioblk_t *vib = arg; + int r; + + mutex_enter(&vib->vib_mutex); + r = vioblk_request(vib, xfer, VIRTIO_BLK_T_IN); + mutex_exit(&vib->vib_mutex); - ASSERT((xfer->x_flags & BD_XFER_POLL) == 0); + return (r); +} - ret = vioblk_rw(sc, xfer, VIRTIO_BLK_T_FLUSH_OUT, - xfer->x_nblks * DEV_BSIZE); +static int +vioblk_bd_write(void *arg, bd_xfer_t *xfer) +{ + vioblk_t *vib = arg; + int r; - if (!ret) - sc->sc_stats.rw_cacheflush++; + mutex_enter(&vib->vib_mutex); + r = vioblk_request(vib, xfer, VIRTIO_BLK_T_OUT); + mutex_exit(&vib->vib_mutex); - return (ret); + return (r); } +static int +vioblk_bd_flush(void *arg, bd_xfer_t *xfer) +{ + vioblk_t *vib = arg; + int r; + + mutex_enter(&vib->vib_mutex); + if (!virtio_feature_present(vib->vib_virtio, VIRTIO_BLK_F_FLUSH)) { + /* + * We don't really expect to get here, because if we did not + * negotiate the flush feature we would not have installed this + * function in the blkdev ops vector. + */ + mutex_exit(&vib->vib_mutex); + return (ENOTSUP); + } + + r = vioblk_request(vib, xfer, VIRTIO_BLK_T_FLUSH); + mutex_exit(&vib->vib_mutex); + + return (r); +} static void -vioblk_driveinfo(void *arg, bd_drive_t *drive) +vioblk_bd_driveinfo(void *arg, bd_drive_t *drive) { - struct vioblk_softc *sc = (void *)arg; + vioblk_t *vib = arg; - drive->d_qsize = sc->sc_vq->vq_num; + drive->d_qsize = vib->vib_reqs_capacity; drive->d_removable = B_FALSE; drive->d_hotpluggable = B_TRUE; drive->d_target = 0; @@ -450,8 +518,7 @@ vioblk_driveinfo(void *arg, bd_drive_t *drive) drive->d_product = "Block Device"; drive->d_product_len = strlen(drive->d_product); - (void) vioblk_get_id(sc); - drive->d_serial = sc->devid; + drive->d_serial = vib->vib_devid; drive->d_serial_len = strlen(drive->d_serial); drive->d_revision = "0000"; @@ -459,618 +526,501 @@ vioblk_driveinfo(void *arg, bd_drive_t *drive) } static int -vioblk_mediainfo(void *arg, bd_media_t *media) +vioblk_bd_mediainfo(void *arg, bd_media_t *media) { - struct vioblk_softc *sc = (void *)arg; + vioblk_t *vib = (void *)arg; - media->m_nblks = sc->sc_nblks; - media->m_blksize = sc->sc_blk_size; - media->m_readonly = sc->sc_readonly; - media->m_pblksize = sc->sc_pblk_size; + /* + * The device protocol is specified in terms of 512 byte logical + * blocks, regardless of the recommended I/O size which might be + * larger. 
+ */ + media->m_nblks = vib->vib_nblks; + media->m_blksize = DEV_BSIZE; + + media->m_readonly = vib->vib_readonly; + media->m_pblksize = vib->vib_pblk_size; return (0); } -static int -vioblk_get_id(struct vioblk_softc *sc) +static void +vioblk_get_id(vioblk_t *vib) { - clock_t deadline; - int ret; - bd_xfer_t xfer; - - deadline = ddi_get_lbolt() + (clock_t)drv_usectohz(3 * 1000000); - (void) memset(&xfer, 0, sizeof (bd_xfer_t)); - xfer.x_nblks = 1; - - ret = ddi_dma_alloc_handle(sc->sc_dev, &vioblk_bd_dma_attr, - DDI_DMA_SLEEP, NULL, &xfer.x_dmah); - if (ret != DDI_SUCCESS) - goto out_alloc; - - ret = ddi_dma_addr_bind_handle(xfer.x_dmah, NULL, (caddr_t)&sc->devid, - VIRTIO_BLK_ID_BYTES, DDI_DMA_READ | DDI_DMA_CONSISTENT, - DDI_DMA_SLEEP, NULL, &xfer.x_dmac, &xfer.x_ndmac); - if (ret != DDI_DMA_MAPPED) { - ret = DDI_FAILURE; - goto out_map; - } + virtio_dma_t *dma; + int r; - mutex_enter(&sc->lock_devid); - - ret = vioblk_rw(sc, &xfer, VIRTIO_BLK_T_GET_ID, - VIRTIO_BLK_ID_BYTES); - if (ret) { - mutex_exit(&sc->lock_devid); - goto out_rw; + if ((dma = virtio_dma_alloc(vib->vib_virtio, VIRTIO_BLK_ID_BYTES, + &vioblk_dma_attr, DDI_DMA_CONSISTENT | DDI_DMA_READ, + KM_SLEEP)) == NULL) { + return; } - /* wait for reply */ - ret = cv_timedwait(&sc->cv_devid, &sc->lock_devid, deadline); - mutex_exit(&sc->lock_devid); - - (void) ddi_dma_unbind_handle(xfer.x_dmah); - ddi_dma_free_handle(&xfer.x_dmah); + mutex_enter(&vib->vib_mutex); + if ((r = vioblk_internal(vib, VIRTIO_BLK_T_GET_ID, dma, 0, + VIRTIO_DIR_DEVICE_WRITES)) == 0) { + const char *b = virtio_dma_va(dma, 0); + uint_t pos = 0; - /* timeout */ - if (ret < 0) { - dev_err(sc->sc_dev, CE_WARN, - "Cannot get devid from the device"); - return (DDI_FAILURE); - } - - return (0); + /* + * Save the entire response for debugging purposes. + */ + bcopy(virtio_dma_va(dma, 0), vib->vib_rawid, + VIRTIO_BLK_ID_BYTES); -out_rw: - (void) ddi_dma_unbind_handle(xfer.x_dmah); -out_map: - ddi_dma_free_handle(&xfer.x_dmah); -out_alloc: - return (ret); -} + /* + * Process the returned ID. + */ + bzero(vib->vib_devid, sizeof (vib->vib_devid)); + for (uint_t n = 0; n < VIRTIO_BLK_ID_BYTES; n++) { + if (isalnum(b[n]) || b[n] == '-' || b[n] == '_') { + /* + * Accept a subset of printable ASCII + * characters. + */ + vib->vib_devid[pos++] = b[n]; + } else { + /* + * Stop processing at the first sign of + * trouble. 
+ */ + break; + } + } -static int -vioblk_devid_init(void *arg, dev_info_t *devinfo, ddi_devid_t *devid) -{ - struct vioblk_softc *sc = (void *)arg; - int ret; - - ret = vioblk_get_id(sc); - if (ret != DDI_SUCCESS) - return (ret); - - ret = ddi_devid_init(devinfo, DEVID_ATA_SERIAL, - VIRTIO_BLK_ID_BYTES, sc->devid, devid); - if (ret != DDI_SUCCESS) { - dev_err(devinfo, CE_WARN, "Cannot build devid from the device"); - return (ret); + vib->vib_devid_fetched = B_TRUE; } + mutex_exit(&vib->vib_mutex); - dev_debug(sc->sc_dev, CE_NOTE, - "devid %x%x%x%x%x%x%x%x%x%x%x%x%x%x%x%x%x%x%x%x", - sc->devid[0], sc->devid[1], sc->devid[2], sc->devid[3], - sc->devid[4], sc->devid[5], sc->devid[6], sc->devid[7], - sc->devid[8], sc->devid[9], sc->devid[10], sc->devid[11], - sc->devid[12], sc->devid[13], sc->devid[14], sc->devid[15], - sc->devid[16], sc->devid[17], sc->devid[18], sc->devid[19]); - - return (0); -} - -static void -vioblk_show_features(struct vioblk_softc *sc, const char *prefix, - uint32_t features) -{ - char buf[512]; - char *bufp = buf; - char *bufend = buf + sizeof (buf); - - /* LINTED E_PTRDIFF_OVERFLOW */ - bufp += snprintf(bufp, bufend - bufp, prefix); - - /* LINTED E_PTRDIFF_OVERFLOW */ - bufp += virtio_show_features(features, bufp, bufend - bufp); - - - /* LINTED E_PTRDIFF_OVERFLOW */ - bufp += snprintf(bufp, bufend - bufp, "Vioblk ( "); - - if (features & VIRTIO_BLK_F_BARRIER) - /* LINTED E_PTRDIFF_OVERFLOW */ - bufp += snprintf(bufp, bufend - bufp, "BARRIER "); - if (features & VIRTIO_BLK_F_SIZE_MAX) - /* LINTED E_PTRDIFF_OVERFLOW */ - bufp += snprintf(bufp, bufend - bufp, "SIZE_MAX "); - if (features & VIRTIO_BLK_F_SEG_MAX) - /* LINTED E_PTRDIFF_OVERFLOW */ - bufp += snprintf(bufp, bufend - bufp, "SEG_MAX "); - if (features & VIRTIO_BLK_F_GEOMETRY) - /* LINTED E_PTRDIFF_OVERFLOW */ - bufp += snprintf(bufp, bufend - bufp, "GEOMETRY "); - if (features & VIRTIO_BLK_F_RO) - /* LINTED E_PTRDIFF_OVERFLOW */ - bufp += snprintf(bufp, bufend - bufp, "RO "); - if (features & VIRTIO_BLK_F_BLK_SIZE) - /* LINTED E_PTRDIFF_OVERFLOW */ - bufp += snprintf(bufp, bufend - bufp, "BLK_SIZE "); - if (features & VIRTIO_BLK_F_SCSI) - /* LINTED E_PTRDIFF_OVERFLOW */ - bufp += snprintf(bufp, bufend - bufp, "SCSI "); - if (features & VIRTIO_BLK_F_FLUSH) - /* LINTED E_PTRDIFF_OVERFLOW */ - bufp += snprintf(bufp, bufend - bufp, "FLUSH "); - if (features & VIRTIO_BLK_F_TOPOLOGY) - /* LINTED E_PTRDIFF_OVERFLOW */ - bufp += snprintf(bufp, bufend - bufp, "TOPOLOGY "); - - /* LINTED E_PTRDIFF_OVERFLOW */ - bufp += snprintf(bufp, bufend - bufp, ")"); - *bufp = '\0'; - - dev_debug(sc->sc_dev, CE_NOTE, "%s", buf); + virtio_dma_free(dma); } static int -vioblk_dev_features(struct vioblk_softc *sc) +vioblk_bd_devid(void *arg, dev_info_t *dip, ddi_devid_t *devid) { - uint32_t host_features; - - host_features = virtio_negotiate_features(&sc->sc_virtio, - VIRTIO_BLK_F_RO | - VIRTIO_BLK_F_GEOMETRY | - VIRTIO_BLK_F_BLK_SIZE | - VIRTIO_BLK_F_FLUSH | - VIRTIO_BLK_F_TOPOLOGY | - VIRTIO_BLK_F_SEG_MAX | - VIRTIO_BLK_F_SIZE_MAX | - VIRTIO_F_RING_INDIRECT_DESC); - - vioblk_show_features(sc, "Host features: ", host_features); - vioblk_show_features(sc, "Negotiated features: ", - sc->sc_virtio.sc_features); - - if (!(sc->sc_virtio.sc_features & VIRTIO_F_RING_INDIRECT_DESC)) { - dev_err(sc->sc_dev, CE_NOTE, - "Host does not support RING_INDIRECT_DESC, bye."); + vioblk_t *vib = arg; + size_t len; + + if ((len = strlen(vib->vib_devid)) == 0) { + /* + * The device has no ID. 
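(An illustrative recap, not part of the change: together with vioblk_get_id() above, the identity path asks the device to fill a 20-byte VIRTIO_BLK_T_GET_ID buffer, copies the leading run of [A-Za-z0-9_-] characters into the NUL-terminated vib_devid, and then hands that string to blkdev a few lines below. Roughly, with dma, dip and devid as in the surrounding functions:)

	/* dma is the 20-byte virtio_dma_t the device fills in. */
	(void) vioblk_internal(vib, VIRTIO_BLK_T_GET_ID, dma, 0,
	    VIRTIO_DIR_DEVICE_WRITES);
	/* ... printable characters are filtered into vib->vib_devid ... */
	(void) ddi_devid_init(dip, DEVID_ATA_SERIAL,
	    strlen(vib->vib_devid), vib->vib_devid, devid);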
+ */ return (DDI_FAILURE); } - return (DDI_SUCCESS); + return (ddi_devid_init(dip, DEVID_ATA_SERIAL, len, vib->vib_devid, + devid)); } -/* ARGSUSED */ -uint_t -vioblk_int_handler(caddr_t arg1, caddr_t arg2) +/* + * As the device completes processing of a request, it returns the chain for + * that request to our I/O queue. This routine is called in two contexts: + * - from the interrupt handler, in response to notification from the device + * - synchronously in line with request processing when panicking + */ +static uint_t +vioblk_poll(vioblk_t *vib) { - struct virtio_softc *vsc = (void *)arg1; - struct vioblk_softc *sc = __containerof(vsc, - struct vioblk_softc, sc_virtio); - struct vq_entry *ve; - uint32_t len; - int i = 0, error; - - while ((ve = virtio_pull_chain(sc->sc_vq, &len))) { - struct vioblk_req *req = &sc->sc_reqs[ve->qe_index]; - bd_xfer_t *xfer = req->xfer; - uint8_t status = req->status; - uint32_t type = req->hdr.type; - - if (req->xfer == (void *)VIOBLK_POISON) { - dev_err(sc->sc_dev, CE_WARN, "Poisoned descriptor!"); - virtio_free_chain(ve); - return (DDI_INTR_CLAIMED); - } + virtio_chain_t *vic; + uint_t count = 0; + boolean_t wakeup = B_FALSE; + + VERIFY(MUTEX_HELD(&vib->vib_mutex)); - req->xfer = (void *) VIOBLK_POISON; + while ((vic = virtio_queue_poll(vib->vib_vq)) != NULL) { + vioblk_req_t *vbr = virtio_chain_data(vic); + uint8_t status; - /* Note: blkdev tears down the payload mapping for us. */ - virtio_free_chain(ve); + virtio_dma_sync(vbr->vbr_dma, DDI_DMA_SYNC_FORCPU); + + bcopy(virtio_dma_va(vbr->vbr_dma, + sizeof (struct vioblk_req_hdr)), &status, sizeof (status)); - /* returning payload back to blkdev */ switch (status) { - case VIRTIO_BLK_S_OK: - error = 0; - break; - case VIRTIO_BLK_S_IOERR: - error = EIO; - sc->sc_stats.io_errors++; - break; - case VIRTIO_BLK_S_UNSUPP: - sc->sc_stats.unsupp_errors++; - error = ENOTTY; - break; - default: - sc->sc_stats.nxio_errors++; - error = ENXIO; - break; + case VIRTIO_BLK_S_OK: + vbr->vbr_error = 0; + break; + case VIRTIO_BLK_S_IOERR: + vbr->vbr_error = EIO; + vib->vib_stats->vbs_io_errors.value.ui64++; + break; + case VIRTIO_BLK_S_UNSUPP: + vbr->vbr_error = ENOTTY; + vib->vib_stats->vbs_unsupp_errors.value.ui64++; + break; + default: + vbr->vbr_error = ENXIO; + vib->vib_stats->vbs_nxio_errors.value.ui64++; + break; + } + + count++; + + if (vbr->vbr_status & VIOBLK_REQSTAT_POLLED) { + /* + * This request must not be freed as it is being held + * by a call to vioblk_common_submit(). + */ + VERIFY(!(vbr->vbr_status & + VIOBLK_REQSTAT_POLL_COMPLETE)); + vbr->vbr_status |= VIOBLK_REQSTAT_POLL_COMPLETE; + wakeup = B_TRUE; + continue; } - if (type == VIRTIO_BLK_T_GET_ID) { - /* notify devid_init */ - mutex_enter(&sc->lock_devid); - cv_broadcast(&sc->cv_devid); - mutex_exit(&sc->lock_devid); - } else - bd_xfer_done(xfer, error); + vioblk_complete(vib, vbr); - i++; + vioblk_req_free(vib, vbr); + virtio_chain_free(vic); } - /* update stats */ - if (sc->sc_stats.intr_queuemax < i) - sc->sc_stats.intr_queuemax = i; - sc->sc_stats.intr_total++; + if (wakeup) { + /* + * Signal anybody waiting for polled command completion. 
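(The polled-completion handshake referred to here has two halves; a condensed sketch, using only names from this patch, with the locking context noted in the comments:)

	/* Submitter, in vioblk_common_submit(), holding vib_mutex: */
	while (!(vbr->vbr_status & VIOBLK_REQSTAT_POLL_COMPLETE)) {
		if (ddi_in_panic()) {
			/* Interrupts are off; drive the queue by hand. */
			drv_usecwait(10);
			(void) vioblk_poll(vib);
			continue;
		}
		cv_wait(&vib->vib_cv, &vib->vib_mutex);
	}

	/* Completion side, here in vioblk_poll(), for a POLLED request: */
	vbr->vbr_status |= VIOBLK_REQSTAT_POLL_COMPLETE;
	cv_broadcast(&vib->vib_cv);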
+ */ + cv_broadcast(&vib->vib_cv); + } - return (DDI_INTR_CLAIMED); + return (count); } -/* ARGSUSED */ uint_t -vioblk_config_handler(caddr_t arg1, caddr_t arg2) -{ - return (DDI_INTR_CLAIMED); -} - -static int -vioblk_register_ints(struct vioblk_softc *sc) +vioblk_int_handler(caddr_t arg0, caddr_t arg1) { - int ret; + vioblk_t *vib = (vioblk_t *)arg0; + uint_t count; - struct virtio_int_handler vioblk_conf_h = { - vioblk_config_handler - }; - - struct virtio_int_handler vioblk_vq_h[] = { - { vioblk_int_handler }, - { NULL }, - }; + mutex_enter(&vib->vib_mutex); + if ((count = vioblk_poll(vib)) > + vib->vib_stats->vbs_intr_queuemax.value.ui32) { + vib->vib_stats->vbs_intr_queuemax.value.ui32 = count; + } - ret = virtio_register_ints(&sc->sc_virtio, - &vioblk_conf_h, vioblk_vq_h); + vib->vib_stats->vbs_intr_total.value.ui64++; + mutex_exit(&vib->vib_mutex); - return (ret); + return (DDI_INTR_CLAIMED); } static void -vioblk_free_reqs(struct vioblk_softc *sc) +vioblk_free_reqs(vioblk_t *vib) { - int i, qsize; + VERIFY3U(vib->vib_nreqs_alloc, ==, 0); - qsize = sc->sc_vq->vq_num; + for (uint_t i = 0; i < vib->vib_reqs_capacity; i++) { + struct vioblk_req *vbr = &vib->vib_reqs_mem[i]; - for (i = 0; i < qsize; i++) { - struct vioblk_req *req = &sc->sc_reqs[i]; + VERIFY(list_link_active(&vbr->vbr_link)); + list_remove(&vib->vib_reqs, vbr); - if (req->ndmac) - (void) ddi_dma_unbind_handle(req->dmah); + VERIFY0(vbr->vbr_status); - if (req->dmah) - ddi_dma_free_handle(&req->dmah); + if (vbr->vbr_dma != NULL) { + virtio_dma_free(vbr->vbr_dma); + vbr->vbr_dma = NULL; + } } + VERIFY(list_is_empty(&vib->vib_reqs)); - kmem_free(sc->sc_reqs, sizeof (struct vioblk_req) * qsize); + if (vib->vib_reqs_mem != NULL) { + kmem_free(vib->vib_reqs_mem, + sizeof (struct vioblk_req) * vib->vib_reqs_capacity); + vib->vib_reqs_mem = NULL; + vib->vib_reqs_capacity = 0; + } } static int -vioblk_alloc_reqs(struct vioblk_softc *sc) +vioblk_alloc_reqs(vioblk_t *vib) { - int i, qsize; - int ret; - - qsize = sc->sc_vq->vq_num; - - sc->sc_reqs = kmem_zalloc(sizeof (struct vioblk_req) * qsize, KM_SLEEP); - - for (i = 0; i < qsize; i++) { - struct vioblk_req *req = &sc->sc_reqs[i]; - - ret = ddi_dma_alloc_handle(sc->sc_dev, &vioblk_req_dma_attr, - DDI_DMA_SLEEP, NULL, &req->dmah); - if (ret != DDI_SUCCESS) { - - dev_err(sc->sc_dev, CE_WARN, - "Can't allocate dma handle for req " - "buffer %d", i); - goto exit; - } + vib->vib_reqs_capacity = MIN(virtio_queue_size(vib->vib_vq), + VIRTIO_BLK_REQ_BUFS); + vib->vib_reqs_mem = kmem_zalloc( + sizeof (struct vioblk_req) * vib->vib_reqs_capacity, KM_SLEEP); + vib->vib_nreqs_alloc = 0; + + for (uint_t i = 0; i < vib->vib_reqs_capacity; i++) { + list_insert_tail(&vib->vib_reqs, &vib->vib_reqs_mem[i]); + } - ret = ddi_dma_addr_bind_handle(req->dmah, NULL, - (caddr_t)&req->hdr, + for (vioblk_req_t *vbr = list_head(&vib->vib_reqs); vbr != NULL; + vbr = list_next(&vib->vib_reqs, vbr)) { + if ((vbr->vbr_dma = virtio_dma_alloc(vib->vib_virtio, sizeof (struct vioblk_req_hdr) + sizeof (uint8_t), - DDI_DMA_RDWR | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, - NULL, &req->dmac, &req->ndmac); - if (ret != DDI_DMA_MAPPED) { - dev_err(sc->sc_dev, CE_WARN, - "Can't bind req buffer %d", i); - goto exit; + &vioblk_dma_attr, DDI_DMA_RDWR | DDI_DMA_CONSISTENT, + KM_SLEEP)) == NULL) { + goto fail; } } return (0); -exit: - vioblk_free_reqs(sc); +fail: + vioblk_free_reqs(vib); return (ENOMEM); } - -static int -vioblk_ksupdate(kstat_t *ksp, int rw) -{ - struct vioblk_softc *sc = ksp->ks_private; - - if (rw == KSTAT_WRITE) 
- return (EACCES); - - sc->ks_data->sts_rw_cookiesmax.value.ui32 = sc->sc_stats.rw_cookiesmax; - sc->ks_data->sts_intr_queuemax.value.ui32 = sc->sc_stats.intr_queuemax; - sc->ks_data->sts_unsupp_errors.value.ui32 = sc->sc_stats.unsupp_errors; - sc->ks_data->sts_nxio_errors.value.ui32 = sc->sc_stats.nxio_errors; - sc->ks_data->sts_io_errors.value.ui32 = sc->sc_stats.io_errors; - sc->ks_data->sts_rw_cacheflush.value.ui64 = sc->sc_stats.rw_cacheflush; - sc->ks_data->sts_intr_total.value.ui64 = sc->sc_stats.intr_total; - - - return (0); -} - static int -vioblk_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd) +vioblk_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { - int ret = DDI_SUCCESS; - int instance; - struct vioblk_softc *sc; - struct virtio_softc *vsc; - struct vioblk_stats *ks_data; + int instance = ddi_get_instance(dip); + vioblk_t *vib; + virtio_t *vio; + boolean_t did_mutex = B_FALSE; - instance = ddi_get_instance(devinfo); - - switch (cmd) { - case DDI_ATTACH: - break; - - case DDI_RESUME: - case DDI_PM_RESUME: - dev_err(devinfo, CE_WARN, "resume not supported yet"); + if (cmd != DDI_ATTACH) { return (DDI_FAILURE); + } - default: - dev_err(devinfo, CE_WARN, "cmd 0x%x not recognized", cmd); + if ((vio = virtio_init(dip, VIRTIO_BLK_WANTED_FEATURES, B_TRUE)) == + NULL) { + dev_err(dip, CE_WARN, "failed to start Virtio init"); return (DDI_FAILURE); } - sc = kmem_zalloc(sizeof (struct vioblk_softc), KM_SLEEP); - ddi_set_driver_private(devinfo, sc); - - vsc = &sc->sc_virtio; + vib = kmem_zalloc(sizeof (*vib), KM_SLEEP); + vib->vib_dip = dip; + vib->vib_virtio = vio; + ddi_set_driver_private(dip, vib); + list_create(&vib->vib_reqs, sizeof (vioblk_req_t), + offsetof(vioblk_req_t, vbr_link)); - /* Duplicate for faster access / less typing */ - sc->sc_dev = devinfo; - vsc->sc_dev = devinfo; + /* + * Determine how many scatter-gather entries we can use in a single + * request. + */ + vib->vib_seg_max = VIRTIO_BLK_DEFAULT_MAX_SEG; + if (virtio_feature_present(vio, VIRTIO_BLK_F_SEG_MAX)) { + vib->vib_seg_max = virtio_dev_get32(vio, + VIRTIO_BLK_CONFIG_SEG_MAX); - cv_init(&sc->cv_devid, NULL, CV_DRIVER, NULL); - mutex_init(&sc->lock_devid, NULL, MUTEX_DRIVER, NULL); + if (vib->vib_seg_max == 0 || vib->vib_seg_max == PCI_EINVAL32) { + /* + * We need to be able to use at least one data segment, + * so we'll assume that this device is just poorly + * implemented and try for one. + */ + vib->vib_seg_max = 1; + } + } /* - * Initialize interrupt kstat. This should not normally fail, since - * we don't use a persistent stat. We do it this way to avoid having - * to test for it at run time on the hot path. + * When allocating the request queue, we include two additional + * descriptors (beyond those required for request data) to account for + * the header and the status byte. 
*/ - sc->sc_intrstat = kstat_create("vioblk", instance, - "intrs", "controller", KSTAT_TYPE_NAMED, + if ((vib->vib_vq = virtio_queue_alloc(vio, VIRTIO_BLK_VIRTQ_IO, "io", + vioblk_int_handler, vib, B_FALSE, vib->vib_seg_max + 2)) == NULL) { + goto fail; + } + + if (virtio_init_complete(vio, 0) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "failed to complete Virtio init"); + goto fail; + } + + cv_init(&vib->vib_cv, NULL, CV_DRIVER, NULL); + mutex_init(&vib->vib_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio)); + did_mutex = B_TRUE; + + if ((vib->vib_kstat = kstat_create("vioblk", instance, + "statistics", "controller", KSTAT_TYPE_NAMED, sizeof (struct vioblk_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_PERSISTENT); - if (sc->sc_intrstat == NULL) { - dev_err(devinfo, CE_WARN, "kstat_create failed"); - goto exit_intrstat; + KSTAT_FLAG_PERSISTENT)) == NULL) { + dev_err(dip, CE_WARN, "kstat_create failed"); + goto fail; } - ks_data = (struct vioblk_stats *)sc->sc_intrstat->ks_data; - kstat_named_init(&ks_data->sts_rw_outofmemory, + vib->vib_stats = (vioblk_stats_t *)vib->vib_kstat->ks_data; + kstat_named_init(&vib->vib_stats->vbs_rw_outofmemory, "total_rw_outofmemory", KSTAT_DATA_UINT64); - kstat_named_init(&ks_data->sts_rw_badoffset, + kstat_named_init(&vib->vib_stats->vbs_rw_badoffset, "total_rw_badoffset", KSTAT_DATA_UINT64); - kstat_named_init(&ks_data->sts_intr_total, + kstat_named_init(&vib->vib_stats->vbs_intr_total, "total_intr", KSTAT_DATA_UINT64); - kstat_named_init(&ks_data->sts_io_errors, - "total_io_errors", KSTAT_DATA_UINT32); - kstat_named_init(&ks_data->sts_unsupp_errors, - "total_unsupp_errors", KSTAT_DATA_UINT32); - kstat_named_init(&ks_data->sts_nxio_errors, - "total_nxio_errors", KSTAT_DATA_UINT32); - kstat_named_init(&ks_data->sts_rw_cacheflush, + kstat_named_init(&vib->vib_stats->vbs_io_errors, + "total_io_errors", KSTAT_DATA_UINT64); + kstat_named_init(&vib->vib_stats->vbs_unsupp_errors, + "total_unsupp_errors", KSTAT_DATA_UINT64); + kstat_named_init(&vib->vib_stats->vbs_nxio_errors, + "total_nxio_errors", KSTAT_DATA_UINT64); + kstat_named_init(&vib->vib_stats->vbs_rw_cacheflush, "total_rw_cacheflush", KSTAT_DATA_UINT64); - kstat_named_init(&ks_data->sts_rw_cookiesmax, + kstat_named_init(&vib->vib_stats->vbs_rw_cookiesmax, "max_rw_cookies", KSTAT_DATA_UINT32); - kstat_named_init(&ks_data->sts_intr_queuemax, + kstat_named_init(&vib->vib_stats->vbs_intr_queuemax, "max_intr_queue", KSTAT_DATA_UINT32); - sc->ks_data = ks_data; - sc->sc_intrstat->ks_private = sc; - sc->sc_intrstat->ks_update = vioblk_ksupdate; - kstat_install(sc->sc_intrstat); - - /* map BAR0 */ - ret = ddi_regs_map_setup(devinfo, 1, - (caddr_t *)&sc->sc_virtio.sc_io_addr, - 0, 0, &vioblk_attr, &sc->sc_virtio.sc_ioh); - if (ret != DDI_SUCCESS) { - dev_err(devinfo, CE_WARN, "unable to map bar0: [%d]", ret); - goto exit_map; + kstat_install(vib->vib_kstat); + + vib->vib_readonly = virtio_feature_present(vio, VIRTIO_BLK_F_RO); + if ((vib->vib_nblks = virtio_dev_get64(vio, + VIRTIO_BLK_CONFIG_CAPACITY)) == UINT64_MAX) { + dev_err(dip, CE_WARN, "invalid capacity"); + goto fail; } - virtio_device_reset(&sc->sc_virtio); - virtio_set_status(&sc->sc_virtio, VIRTIO_CONFIG_DEVICE_STATUS_ACK); - virtio_set_status(&sc->sc_virtio, VIRTIO_CONFIG_DEVICE_STATUS_DRIVER); + /* + * Determine the optimal logical block size recommended by the device. + * This size is advisory; the protocol always deals in 512 byte blocks. 
+ */ + vib->vib_blk_size = DEV_BSIZE; + if (virtio_feature_present(vio, VIRTIO_BLK_F_BLK_SIZE)) { + uint32_t v = virtio_dev_get32(vio, VIRTIO_BLK_CONFIG_BLK_SIZE); - if (vioblk_register_ints(sc)) { - dev_err(devinfo, CE_WARN, "Unable to add interrupt"); - goto exit_int; + if (v != 0 && v != PCI_EINVAL32) { + vib->vib_blk_size = v; + } } - ret = vioblk_dev_features(sc); - if (ret) - goto exit_features; + /* + * The device may also provide an advisory physical block size. + */ + vib->vib_pblk_size = vib->vib_blk_size; + if (virtio_feature_present(vio, VIRTIO_BLK_F_TOPOLOGY)) { + uint8_t v = virtio_dev_get8(vio, VIRTIO_BLK_CONFIG_TOPO_PBEXP); - if (sc->sc_virtio.sc_features & VIRTIO_BLK_F_RO) - sc->sc_readonly = B_TRUE; - else - sc->sc_readonly = B_FALSE; + if (v != PCI_EINVAL8) { + vib->vib_pblk_size <<= v; + } + } - sc->sc_capacity = virtio_read_device_config_8(&sc->sc_virtio, - VIRTIO_BLK_CONFIG_CAPACITY); - sc->sc_nblks = sc->sc_capacity; + /* + * The maximum size for a cookie in a request. + */ + vib->vib_seg_size_max = VIRTIO_BLK_DEFAULT_MAX_SIZE; + if (virtio_feature_present(vio, VIRTIO_BLK_F_SIZE_MAX)) { + uint32_t v = virtio_dev_get32(vio, VIRTIO_BLK_CONFIG_SIZE_MAX); - sc->sc_blk_size = DEV_BSIZE; - if (sc->sc_virtio.sc_features & VIRTIO_BLK_F_BLK_SIZE) { - sc->sc_blk_size = virtio_read_device_config_4(&sc->sc_virtio, - VIRTIO_BLK_CONFIG_BLK_SIZE); + if (v != 0 && v != PCI_EINVAL32) { + vib->vib_seg_size_max = v; + } } - sc->sc_pblk_size = sc->sc_blk_size; - if (sc->sc_virtio.sc_features & VIRTIO_BLK_F_TOPOLOGY) { - sc->sc_pblk_size <<= virtio_read_device_config_1(&sc->sc_virtio, - VIRTIO_BLK_CONFIG_TOPO_PBEXP); + /* + * Set up the DMA attributes for blkdev to use for request data. The + * specification is not extremely clear about whether DMA-related + * parameters include or exclude the header and status descriptors. + * For now, we assume they cover only the request data and not the + * headers. + */ + vib->vib_bd_dma_attr = vioblk_dma_attr; + vib->vib_bd_dma_attr.dma_attr_sgllen = vib->vib_seg_max; + vib->vib_bd_dma_attr.dma_attr_count_max = vib->vib_seg_size_max; + vib->vib_bd_dma_attr.dma_attr_maxxfer = vib->vib_seg_max * + vib->vib_seg_size_max; + + if (vioblk_alloc_reqs(vib) != 0) { + goto fail; } - /* Flushing is not supported. */ - if (!(sc->sc_virtio.sc_features & VIRTIO_BLK_F_FLUSH)) { - vioblk_ops.o_sync_cache = NULL; + /* + * The blkdev framework does not provide a way to specify that the + * device does not support write cache flushing, except by omitting the + * "o_sync_cache" member from the ops vector. As "bd_alloc_handle()" + * makes a copy of the ops vector, we can safely assemble one on the + * stack based on negotiated features. + */ + bd_ops_t vioblk_bd_ops = { + .o_version = BD_OPS_VERSION_0, + .o_drive_info = vioblk_bd_driveinfo, + .o_media_info = vioblk_bd_mediainfo, + .o_devid_init = vioblk_bd_devid, + .o_sync_cache = vioblk_bd_flush, + .o_read = vioblk_bd_read, + .o_write = vioblk_bd_write, + }; + if (!virtio_feature_present(vio, VIRTIO_BLK_F_FLUSH)) { + vioblk_bd_ops.o_sync_cache = NULL; } - sc->sc_seg_max = DEF_MAXINDIRECT; - /* The max number of segments (cookies) in a request */ - if (sc->sc_virtio.sc_features & VIRTIO_BLK_F_SEG_MAX) { - sc->sc_seg_max = virtio_read_device_config_4(&sc->sc_virtio, - VIRTIO_BLK_CONFIG_SEG_MAX); - - /* That's what Linux does. 
*/ - if (!sc->sc_seg_max) - sc->sc_seg_max = 1; + vib->vib_bd_h = bd_alloc_handle(vib, &vioblk_bd_ops, + &vib->vib_bd_dma_attr, KM_SLEEP); - /* - * SEG_MAX corresponds to the number of _data_ - * blocks in a request - */ - sc->sc_seg_max += 2; + /* + * Enable interrupts now so that we can request the device identity. + */ + if (virtio_interrupts_enable(vio) != DDI_SUCCESS) { + goto fail; } - /* 2 descriptors taken for header/status */ - vioblk_bd_dma_attr.dma_attr_sgllen = sc->sc_seg_max - 2; + vioblk_get_id(vib); - /* The maximum size for a cookie in a request. */ - sc->sc_seg_size_max = DEF_MAXSECTOR; - if (sc->sc_virtio.sc_features & VIRTIO_BLK_F_SIZE_MAX) { - sc->sc_seg_size_max = virtio_read_device_config_4( - &sc->sc_virtio, VIRTIO_BLK_CONFIG_SIZE_MAX); + if (bd_attach_handle(dip, vib->vib_bd_h) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "Failed to attach blkdev"); + goto fail; } - /* The maximum request size */ - vioblk_bd_dma_attr.dma_attr_maxxfer = - vioblk_bd_dma_attr.dma_attr_sgllen * sc->sc_seg_size_max; - - dev_debug(devinfo, CE_NOTE, - "nblks=%" PRIu64 " blksize=%d (%d) num_seg=%d, " - "seg_size=%d, maxxfer=%" PRIu64, - sc->sc_nblks, sc->sc_blk_size, sc->sc_pblk_size, - vioblk_bd_dma_attr.dma_attr_sgllen, - sc->sc_seg_size_max, - vioblk_bd_dma_attr.dma_attr_maxxfer); - + return (DDI_SUCCESS); - sc->sc_vq = virtio_alloc_vq(&sc->sc_virtio, 0, 0, - sc->sc_seg_max, "I/O request"); - if (sc->sc_vq == NULL) { - goto exit_alloc1; +fail: + if (vib->vib_bd_h != NULL) { + (void) bd_detach_handle(vib->vib_bd_h); + bd_free_handle(vib->vib_bd_h); } - - ret = vioblk_alloc_reqs(sc); - if (ret) { - goto exit_alloc2; + if (vio != NULL) { + (void) virtio_fini(vio, B_TRUE); } - - sc->bd_h = bd_alloc_handle(sc, &vioblk_ops, &vioblk_bd_dma_attr, - KM_SLEEP); - - - virtio_set_status(&sc->sc_virtio, - VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK); - virtio_start_vq_intr(sc->sc_vq); - - ret = virtio_enable_ints(&sc->sc_virtio); - if (ret) - goto exit_enable_ints; - - ret = bd_attach_handle(devinfo, sc->bd_h); - if (ret != DDI_SUCCESS) { - dev_err(devinfo, CE_WARN, "Failed to attach blkdev"); - goto exit_attach_bd; + if (did_mutex) { + mutex_destroy(&vib->vib_mutex); + cv_destroy(&vib->vib_cv); } - - return (DDI_SUCCESS); - -exit_attach_bd: - /* - * There is no virtio_disable_ints(), it's done in virtio_release_ints. - * If they ever get split, don't forget to add a call here. 
- */ -exit_enable_ints: - virtio_stop_vq_intr(sc->sc_vq); - bd_free_handle(sc->bd_h); - vioblk_free_reqs(sc); -exit_alloc2: - virtio_free_vq(sc->sc_vq); -exit_alloc1: -exit_features: - virtio_release_ints(&sc->sc_virtio); -exit_int: - virtio_set_status(&sc->sc_virtio, VIRTIO_CONFIG_DEVICE_STATUS_FAILED); - ddi_regs_map_free(&sc->sc_virtio.sc_ioh); -exit_map: - kstat_delete(sc->sc_intrstat); -exit_intrstat: - mutex_destroy(&sc->lock_devid); - cv_destroy(&sc->cv_devid); - kmem_free(sc, sizeof (struct vioblk_softc)); + if (vib->vib_kstat != NULL) { + kstat_delete(vib->vib_kstat); + } + vioblk_free_reqs(vib); + kmem_free(vib, sizeof (*vib)); return (DDI_FAILURE); } static int -vioblk_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd) +vioblk_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { - struct vioblk_softc *sc = ddi_get_driver_private(devinfo); + vioblk_t *vib = ddi_get_driver_private(dip); - switch (cmd) { - case DDI_DETACH: - break; + if (cmd != DDI_DETACH) { + return (DDI_FAILURE); + } - case DDI_PM_SUSPEND: - cmn_err(CE_WARN, "suspend not supported yet"); + mutex_enter(&vib->vib_mutex); + if (vib->vib_nreqs_alloc > 0) { + /* + * Cannot detach while there are still outstanding requests. + */ + mutex_exit(&vib->vib_mutex); return (DDI_FAILURE); + } - default: - cmn_err(CE_WARN, "cmd 0x%x unrecognized", cmd); + if (bd_detach_handle(vib->vib_bd_h) != DDI_SUCCESS) { + mutex_exit(&vib->vib_mutex); return (DDI_FAILURE); } - (void) bd_detach_handle(sc->bd_h); - virtio_stop_vq_intr(sc->sc_vq); - virtio_release_ints(&sc->sc_virtio); - vioblk_free_reqs(sc); - virtio_free_vq(sc->sc_vq); - virtio_device_reset(&sc->sc_virtio); - ddi_regs_map_free(&sc->sc_virtio.sc_ioh); - kstat_delete(sc->sc_intrstat); - kmem_free(sc, sizeof (struct vioblk_softc)); + /* + * Tear down the Virtio framework before freeing the rest of the + * resources. This will ensure the interrupt handlers are no longer + * running. + */ + virtio_fini(vib->vib_virtio, B_FALSE); + + vioblk_free_reqs(vib); + kstat_delete(vib->vib_kstat); + + mutex_exit(&vib->vib_mutex); + mutex_destroy(&vib->vib_mutex); + + kmem_free(vib, sizeof (*vib)); return (DDI_SUCCESS); } static int -vioblk_quiesce(dev_info_t *devinfo) +vioblk_quiesce(dev_info_t *dip) { - struct vioblk_softc *sc = ddi_get_driver_private(devinfo); + vioblk_t *vib; - virtio_stop_vq_intr(sc->sc_vq); - virtio_device_reset(&sc->sc_virtio); + if ((vib = ddi_get_driver_private(dip)) == NULL) { + return (DDI_FAILURE); + } - return (DDI_SUCCESS); + return (virtio_quiesce(vib->vib_virtio)); } int @@ -1080,7 +1030,7 @@ _init(void) bd_mod_init(&vioblk_dev_ops); - if ((rv = mod_install(&modlinkage)) != 0) { + if ((rv = mod_install(&vioblk_modlinkage)) != 0) { bd_mod_fini(&vioblk_dev_ops); } @@ -1092,7 +1042,7 @@ _fini(void) { int rv; - if ((rv = mod_remove(&modlinkage)) == 0) { + if ((rv = mod_remove(&vioblk_modlinkage)) == 0) { bd_mod_fini(&vioblk_dev_ops); } @@ -1102,5 +1052,5 @@ _fini(void) int _info(struct modinfo *modinfop) { - return (mod_info(&modlinkage, modinfop)); + return (mod_info(&vioblk_modlinkage, modinfop)); } diff --git a/usr/src/uts/common/io/vioblk/vioblk.h b/usr/src/uts/common/io/vioblk/vioblk.h new file mode 100644 index 0000000000..e08fc31e8f --- /dev/null +++ b/usr/src/uts/common/io/vioblk/vioblk.h @@ -0,0 +1,212 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +/* + * VIRTIO BLOCK DRIVER + */ + +#ifndef _VIOBLK_H +#define _VIOBLK_H + +#include "virtio.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * VIRTIO BLOCK CONFIGURATION REGISTERS + * + * These are offsets into the device-specific configuration space available + * through the virtio_dev_*() family of functions. + */ +#define VIRTIO_BLK_CONFIG_CAPACITY 0x00 /* 64 R */ +#define VIRTIO_BLK_CONFIG_SIZE_MAX 0x08 /* 32 R */ +#define VIRTIO_BLK_CONFIG_SEG_MAX 0x0C /* 32 R */ +#define VIRTIO_BLK_CONFIG_GEOMETRY_C 0x10 /* 16 R */ +#define VIRTIO_BLK_CONFIG_GEOMETRY_H 0x12 /* 8 R */ +#define VIRTIO_BLK_CONFIG_GEOMETRY_S 0x13 /* 8 R */ +#define VIRTIO_BLK_CONFIG_BLK_SIZE 0x14 /* 32 R */ +#define VIRTIO_BLK_CONFIG_TOPO_PBEXP 0x18 /* 8 R */ +#define VIRTIO_BLK_CONFIG_TOPO_ALIGN 0x19 /* 8 R */ +#define VIRTIO_BLK_CONFIG_TOPO_MIN_SZ 0x1A /* 16 R */ +#define VIRTIO_BLK_CONFIG_TOPO_OPT_SZ 0x1C /* 32 R */ + +/* + * VIRTIO BLOCK VIRTQUEUES + * + * Virtio block devices have just one queue which is used to make the various + * supported I/O requests. + */ +#define VIRTIO_BLK_VIRTQ_IO 0 + +/* + * VIRTIO BLOCK FEATURE BITS + */ +#define VIRTIO_BLK_F_BARRIER (1ULL << 0) +#define VIRTIO_BLK_F_SIZE_MAX (1ULL << 1) +#define VIRTIO_BLK_F_SEG_MAX (1ULL << 2) +#define VIRTIO_BLK_F_GEOMETRY (1ULL << 4) +#define VIRTIO_BLK_F_RO (1ULL << 5) +#define VIRTIO_BLK_F_BLK_SIZE (1ULL << 6) +#define VIRTIO_BLK_F_SCSI (1ULL << 7) +#define VIRTIO_BLK_F_FLUSH (1ULL << 9) +#define VIRTIO_BLK_F_TOPOLOGY (1ULL << 10) + +/* + * These features are supported by the driver and we will request them from the + * device. + */ +#define VIRTIO_BLK_WANTED_FEATURES (VIRTIO_BLK_F_RO | \ + VIRTIO_BLK_F_BLK_SIZE | \ + VIRTIO_BLK_F_FLUSH | \ + VIRTIO_BLK_F_TOPOLOGY | \ + VIRTIO_BLK_F_SEG_MAX | \ + VIRTIO_BLK_F_SIZE_MAX) + +/* + * VIRTIO BLOCK REQUEST HEADER + * + * This structure appears at the start of each I/O request buffer. Note that + * neither the data payload nor the status byte appear in this structure as + * both are handled in separate descriptor entries. + */ +struct vioblk_req_hdr { + uint32_t vbh_type; + uint32_t vbh_ioprio; + uint64_t vbh_sector; +} __packed; + +/* + * VIRTIO BLOCK REQUEST HEADER: COMMANDS (vbh_type) + * + * Each of these is a command type, except for BARRIER which is logically + * OR-ed with one of the other types. + */ +#define VIRTIO_BLK_T_IN 0 +#define VIRTIO_BLK_T_OUT 1 +#define VIRTIO_BLK_T_SCSI_CMD 2 +#define VIRTIO_BLK_T_SCSI_CMD_OUT 3 +#define VIRTIO_BLK_T_FLUSH 4 +#define VIRTIO_BLK_T_FLUSH_OUT 5 +#define VIRTIO_BLK_T_GET_ID 8 +#define VIRTIO_BLK_T_BARRIER 0x80000000 + +/* + * The GET_ID command type does not appear in the specification, but + * implementations in the wild use a 20 byte buffer into which the device will + * write an ASCII string. The string should not be assumed to be + * NUL-terminated. + */ +#define VIRTIO_BLK_ID_BYTES 20 + +/* + * VIRTIO BLOCK REQUEST HEADER: STATUS CODES + * + * These are returned in the writeable status byte descriptor included at the + * end of each request passed to the device. + */ +#define VIRTIO_BLK_S_OK 0 +#define VIRTIO_BLK_S_IOERR 1 +#define VIRTIO_BLK_S_UNSUPP 2 + +/* + * DRIVER PARAMETERS + */ + +/* + * In the event that the device does not negotiate DMA parameters, we have to + * make a best guess. 
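(A worked note on the fallback values defined just below, not part of the header itself: when SEG_MAX and SIZE_MAX are not offered, a single request is limited to 128 data segments of at most 4096 bytes each.)

	/* Implied per-request payload ceiling under the defaults: */
	size_t max_payload = VIRTIO_BLK_DEFAULT_MAX_SEG *
	    VIRTIO_BLK_DEFAULT_MAX_SIZE;	/* 128 * 4096 = 524288 (512 KiB) */
	/* Such a request occupies 128 + 2 = 130 descriptors in its chain. */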
+ */ +#define VIRTIO_BLK_DEFAULT_MAX_SEG 128 +#define VIRTIO_BLK_DEFAULT_MAX_SIZE 4096 + +/* + * We allocate a fixed number of request buffers in advance and place them in a + * per-instance free list. + */ +#define VIRTIO_BLK_REQ_BUFS 256 + +/* + * TYPE DEFINITIONS + */ + +typedef enum vioblk_req_status { + VIOBLK_REQSTAT_ALLOCATED = (0x1 << 0), + VIOBLK_REQSTAT_INFLIGHT = (0x1 << 1), + VIOBLK_REQSTAT_COMPLETE = (0x1 << 2), + VIOBLK_REQSTAT_POLLED = (0x1 << 3), + VIOBLK_REQSTAT_POLL_COMPLETE = (0x1 << 4), +} vioblk_req_status_t; + +typedef struct vioblk_req { + vioblk_req_status_t vbr_status; + uint64_t vbr_seqno; + int vbr_type; + int vbr_error; + virtio_dma_t *vbr_dma; + bd_xfer_t *vbr_xfer; + list_node_t vbr_link; +} vioblk_req_t; + +typedef struct vioblk_stats { + struct kstat_named vbs_rw_outofmemory; + struct kstat_named vbs_rw_badoffset; + struct kstat_named vbs_rw_queuemax; + struct kstat_named vbs_rw_cookiesmax; + struct kstat_named vbs_rw_cacheflush; + struct kstat_named vbs_intr_queuemax; + struct kstat_named vbs_intr_total; + struct kstat_named vbs_io_errors; + struct kstat_named vbs_unsupp_errors; + struct kstat_named vbs_nxio_errors; +} vioblk_stats_t; + +typedef struct vioblk { + dev_info_t *vib_dip; + virtio_t *vib_virtio; + virtio_queue_t *vib_vq; + + kmutex_t vib_mutex; + kcondvar_t vib_cv; + + bd_handle_t vib_bd_h; + ddi_dma_attr_t vib_bd_dma_attr; + + list_t vib_reqs; + uint_t vib_nreqs_alloc; + uint_t vib_reqs_capacity; + vioblk_req_t *vib_reqs_mem; + + kstat_t *vib_kstat; + vioblk_stats_t *vib_stats; + + uint64_t vib_nblks; + boolean_t vib_readonly; + uint_t vib_blk_size; + uint_t vib_pblk_size; + uint_t vib_seg_max; + uint_t vib_seg_size_max; + + boolean_t vib_devid_fetched; + char vib_devid[VIRTIO_BLK_ID_BYTES + 1]; + uint8_t vib_rawid[VIRTIO_BLK_ID_BYTES]; +} vioblk_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _VIOBLK_H */ diff --git a/usr/src/uts/common/io/vioif/vioif.c b/usr/src/uts/common/io/vioif/vioif.c index ec6684f040..201e84e11b 100644 --- a/usr/src/uts/common/io/vioif/vioif.c +++ b/usr/src/uts/common/io/vioif/vioif.c @@ -41,6 +41,10 @@ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +/* + * VIRTIO NETWORK DRIVER + */ + #include <sys/types.h> #include <sys/errno.h> #include <sys/param.h> @@ -57,6 +61,7 @@ #include <sys/pci.h> #include <sys/ethernet.h> #include <sys/vlan.h> +#include <sys/sysmacros.h> #include <sys/dlpi.h> #include <sys/taskq.h> @@ -72,805 +77,625 @@ #include <sys/mac_provider.h> #include <sys/mac_ether.h> -#include "virtiovar.h" -#include "virtioreg.h" - -/* Configuration registers */ -#define VIRTIO_NET_CONFIG_MAC 0 /* 8bit x 6byte */ -#define VIRTIO_NET_CONFIG_STATUS 6 /* 16bit */ - -/* Feature bits */ -#define VIRTIO_NET_F_CSUM (1 << 0) /* Host handles pkts w/ partial csum */ -#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) /* Guest handles pkts w/ part csum */ -#define VIRTIO_NET_F_MAC (1 << 5) /* Host has given MAC address. */ -#define VIRTIO_NET_F_GSO (1 << 6) /* Host handles pkts w/ any GSO type */ -#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* Guest can handle TSOv4 in. */ -#define VIRTIO_NET_F_GUEST_TSO6 (1 << 8) /* Guest can handle TSOv6 in. */ -#define VIRTIO_NET_F_GUEST_ECN (1 << 9) /* Guest can handle TSO[6] w/ ECN in */ -#define VIRTIO_NET_F_GUEST_UFO (1 << 10) /* Guest can handle UFO in. */ -#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* Host can handle TSOv4 in. */ -#define VIRTIO_NET_F_HOST_TSO6 (1 << 12) /* Host can handle TSOv6 in. 
*/ -#define VIRTIO_NET_F_HOST_ECN (1 << 13) /* Host can handle TSO[6] w/ ECN in */ -#define VIRTIO_NET_F_HOST_UFO (1 << 14) /* Host can handle UFO in. */ -#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* Host can merge receive buffers. */ -#define VIRTIO_NET_F_STATUS (1 << 16) /* Config.status available */ -#define VIRTIO_NET_F_CTRL_VQ (1 << 17) /* Control channel available */ -#define VIRTIO_NET_F_CTRL_RX (1 << 18) /* Control channel RX mode support */ -#define VIRTIO_NET_F_CTRL_VLAN (1 << 19) /* Control channel VLAN filtering */ -#define VIRTIO_NET_F_CTRL_RX_EXTRA (1 << 20) /* Extra RX mode control support */ - -#define VIRTIO_NET_FEATURE_BITS \ - "\020" \ - "\1CSUM" \ - "\2GUEST_CSUM" \ - "\6MAC" \ - "\7GSO" \ - "\10GUEST_TSO4" \ - "\11GUEST_TSO6" \ - "\12GUEST_ECN" \ - "\13GUEST_UFO" \ - "\14HOST_TSO4" \ - "\15HOST_TSO6" \ - "\16HOST_ECN" \ - "\17HOST_UFO" \ - "\20MRG_RXBUF" \ - "\21STATUS" \ - "\22CTRL_VQ" \ - "\23CTRL_RX" \ - "\24CTRL_VLAN" \ - "\25CTRL_RX_EXTRA" - -/* Status */ -#define VIRTIO_NET_S_LINK_UP 1 - -#pragma pack(1) -/* Packet header structure */ -struct virtio_net_hdr { - uint8_t flags; - uint8_t gso_type; - uint16_t hdr_len; - uint16_t gso_size; - uint16_t csum_start; - uint16_t csum_offset; -}; -#pragma pack() +#include "virtio.h" +#include "vioif.h" -#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* flags */ -#define VIRTIO_NET_HDR_GSO_NONE 0 /* gso_type */ -#define VIRTIO_NET_HDR_GSO_TCPV4 1 /* gso_type */ -#define VIRTIO_NET_HDR_GSO_UDP 3 /* gso_type */ -#define VIRTIO_NET_HDR_GSO_TCPV6 4 /* gso_type */ -#define VIRTIO_NET_HDR_GSO_ECN 0x80 /* gso_type, |'ed */ - -/* Control virtqueue */ -#pragma pack(1) -struct virtio_net_ctrl_cmd { - uint8_t class; - uint8_t command; +static int vioif_quiesce(dev_info_t *); +static int vioif_attach(dev_info_t *, ddi_attach_cmd_t); +static int vioif_detach(dev_info_t *, ddi_detach_cmd_t); +static boolean_t vioif_has_feature(vioif_t *, uint32_t); +static void vioif_reclaim_restart(vioif_t *); +static int vioif_m_stat(void *, uint_t, uint64_t *); +static void vioif_m_stop(void *); +static int vioif_m_start(void *); +static int vioif_m_multicst(void *, boolean_t, const uint8_t *); +static int vioif_m_setpromisc(void *, boolean_t); +static int vioif_m_unicst(void *, const uint8_t *); +static mblk_t *vioif_m_tx(void *, mblk_t *); +static int vioif_m_setprop(void *, const char *, mac_prop_id_t, uint_t, + const void *); +static int vioif_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *); +static void vioif_m_propinfo(void *, const char *, mac_prop_id_t, + mac_prop_info_handle_t); +static boolean_t vioif_m_getcapab(void *, mac_capab_t, void *); +static uint_t vioif_add_rx(vioif_t *); + + +static struct cb_ops vioif_cb_ops = { + .cb_rev = CB_REV, + .cb_flag = D_MP | D_NEW, + + .cb_open = nulldev, + .cb_close = nulldev, + .cb_strategy = nodev, + .cb_print = nodev, + .cb_dump = nodev, + .cb_read = nodev, + .cb_write = nodev, + .cb_ioctl = nodev, + .cb_devmap = nodev, + .cb_mmap = nodev, + .cb_segmap = nodev, + .cb_chpoll = nochpoll, + .cb_prop_op = ddi_prop_op, + .cb_str = NULL, + .cb_aread = nodev, + .cb_awrite = nodev, }; -#pragma pack() - -#define VIRTIO_NET_CTRL_RX 0 -#define VIRTIO_NET_CTRL_RX_PROMISC 0 -#define VIRTIO_NET_CTRL_RX_ALLMULTI 1 -#define VIRTIO_NET_CTRL_MAC 1 -#define VIRTIO_NET_CTRL_MAC_TABLE_SET 0 +static struct dev_ops vioif_dev_ops = { + .devo_rev = DEVO_REV, + .devo_refcnt = 0, -#define VIRTIO_NET_CTRL_VLAN 2 -#define VIRTIO_NET_CTRL_VLAN_ADD 0 -#define VIRTIO_NET_CTRL_VLAN_DEL 1 + .devo_attach = vioif_attach, + 
.devo_detach = vioif_detach, + .devo_quiesce = vioif_quiesce, -#pragma pack(1) -struct virtio_net_ctrl_status { - uint8_t ack; -}; + .devo_cb_ops = &vioif_cb_ops, -struct virtio_net_ctrl_rx { - uint8_t onoff; + .devo_getinfo = NULL, + .devo_identify = nulldev, + .devo_probe = nulldev, + .devo_reset = nodev, + .devo_bus_ops = NULL, + .devo_power = NULL, }; -struct virtio_net_ctrl_mac_tbl { - uint32_t nentries; - uint8_t macs[][ETHERADDRL]; +static struct modldrv vioif_modldrv = { + .drv_modops = &mod_driverops, + .drv_linkinfo = "VIRTIO network driver", + .drv_dev_ops = &vioif_dev_ops }; -struct virtio_net_ctrl_vlan { - uint16_t id; -}; -#pragma pack() - -static int vioif_quiesce(dev_info_t *); -static int vioif_attach(dev_info_t *, ddi_attach_cmd_t); -static int vioif_detach(dev_info_t *, ddi_detach_cmd_t); - -DDI_DEFINE_STREAM_OPS(vioif_ops, - nulldev, /* identify */ - nulldev, /* probe */ - vioif_attach, /* attach */ - vioif_detach, /* detach */ - nodev, /* reset */ - NULL, /* cb_ops */ - D_MP, /* bus_ops */ - NULL, /* power */ - vioif_quiesce /* quiesce */); - -static char vioif_ident[] = "VirtIO ethernet driver"; - -/* Standard Module linkage initialization for a Streams driver */ -extern struct mod_ops mod_driverops; - -static struct modldrv modldrv = { - &mod_driverops, /* Type of module. This one is a driver */ - vioif_ident, /* short description */ - &vioif_ops /* driver specific ops */ +static struct modlinkage vioif_modlinkage = { + .ml_rev = MODREV_1, + .ml_linkage = { &vioif_modldrv, NULL } }; -static struct modlinkage modlinkage = { - MODREV_1, - { - (void *)&modldrv, - NULL, - }, +static mac_callbacks_t vioif_mac_callbacks = { + .mc_getstat = vioif_m_stat, + .mc_start = vioif_m_start, + .mc_stop = vioif_m_stop, + .mc_setpromisc = vioif_m_setpromisc, + .mc_multicst = vioif_m_multicst, + .mc_unicst = vioif_m_unicst, + .mc_tx = vioif_m_tx, + + .mc_callbacks = (MC_GETCAPAB | MC_SETPROP | + MC_GETPROP | MC_PROPINFO), + .mc_getcapab = vioif_m_getcapab, + .mc_setprop = vioif_m_setprop, + .mc_getprop = vioif_m_getprop, + .mc_propinfo = vioif_m_propinfo, }; -/* Interval for the periodic TX reclaim */ -uint_t vioif_reclaim_ms = 200; - -ddi_device_acc_attr_t vioif_attr = { - DDI_DEVICE_ATTR_V0, - DDI_NEVERSWAP_ACC, /* virtio is always native byte order */ - DDI_STORECACHING_OK_ACC, - DDI_DEFAULT_ACC +static const uchar_t vioif_broadcast[ETHERADDRL] = { + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; /* - * A mapping represents a binding for a single buffer that is contiguous in the - * virtual address space. + * Interval for the periodic TX reclaim. */ -struct vioif_buf_mapping { - caddr_t vbm_buf; - ddi_dma_handle_t vbm_dmah; - ddi_acc_handle_t vbm_acch; - ddi_dma_cookie_t vbm_dmac; - unsigned int vbm_ncookies; -}; +uint_t vioif_reclaim_ms = 200; /* - * Rx buffers can be loaned upstream, so the code has - * to allocate them dynamically. + * DMA attribute template for transmit and receive buffers. The SGL entry + * count will be modified before using the template. Note that these + * allocations are aligned so that VIOIF_HEADER_SKIP places the IP header in + * received frames at the correct offset for the networking stack. 
*/ -struct vioif_rx_buf { - struct vioif_softc *rb_sc; - frtn_t rb_frtn; - - struct vioif_buf_mapping rb_mapping; +ddi_dma_attr_t vioif_dma_attr_bufs = { + .dma_attr_version = DMA_ATTR_V0, + .dma_attr_addr_lo = 0x0000000000000000, + .dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF, + .dma_attr_count_max = 0x00000000FFFFFFFF, + .dma_attr_align = VIOIF_HEADER_ALIGN, + .dma_attr_burstsizes = 1, + .dma_attr_minxfer = 1, + .dma_attr_maxxfer = 0x00000000FFFFFFFF, + .dma_attr_seg = 0x00000000FFFFFFFF, + .dma_attr_sgllen = 0, + .dma_attr_granular = 1, + .dma_attr_flags = 0 }; /* - * Tx buffers have two mapping types. One, "inline", is pre-allocated and is - * used to hold the virtio_net_header. Small packets also get copied there, as - * it's faster then mapping them. Bigger packets get mapped using the "external" - * mapping array. An array is used, because a packet may consist of muptiple - * fragments, so each fragment gets bound to an entry. According to my - * observations, the number of fragments does not exceed 2, but just in case, - * a bigger, up to VIOIF_INDIRECT_MAX - 1 array is allocated. To save resources, - * the dma handles are allocated lazily in the tx path. + * DMA attributes for mapping larger transmit buffers from the networking + * stack. The requirements are quite loose, but note that the SGL entry length + * field is 32-bit. */ -struct vioif_tx_buf { - mblk_t *tb_mp; - - /* inline buffer */ - struct vioif_buf_mapping tb_inline_mapping; - - /* External buffers */ - struct vioif_buf_mapping *tb_external_mapping; - unsigned int tb_external_num; +ddi_dma_attr_t vioif_dma_attr_external = { + .dma_attr_version = DMA_ATTR_V0, + .dma_attr_addr_lo = 0x0000000000000000, + .dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF, + .dma_attr_count_max = 0x00000000FFFFFFFF, + .dma_attr_align = 1, + .dma_attr_burstsizes = 1, + .dma_attr_minxfer = 1, + .dma_attr_maxxfer = 0x00000000FFFFFFFF, + .dma_attr_seg = 0x00000000FFFFFFFF, + .dma_attr_sgllen = VIOIF_MAX_SEGS - 1, + .dma_attr_granular = 1, + .dma_attr_flags = 0 }; -struct vioif_softc { - dev_info_t *sc_dev; /* mirrors virtio_softc->sc_dev */ - struct virtio_softc sc_virtio; - - mac_handle_t sc_mac_handle; - mac_register_t *sc_macp; - - struct virtqueue *sc_rx_vq; - struct virtqueue *sc_tx_vq; - struct virtqueue *sc_ctrl_vq; - - /* TX virtqueue management resources */ - kmutex_t sc_tx_lock; - boolean_t sc_tx_corked; - boolean_t sc_tx_drain; - timeout_id_t sc_tx_reclaim_tid; - - /* Feature bits. */ - unsigned int sc_rx_csum:1; - unsigned int sc_tx_csum:1; - unsigned int sc_tx_tso4:1; - - /* - * For debugging, it is useful to know whether the MAC address we - * are using came from the host (via VIRTIO_NET_CONFIG_MAC) or - * was otherwise generated or set from within the guest. - */ - unsigned int sc_mac_from_host:1; - - int sc_mtu; - uint8_t sc_mac[ETHERADDRL]; - /* - * For rx buffers, we keep a pointer array, because the buffers - * can be loaned upstream, and we have to repopulate the array with - * new members. - */ - struct vioif_rx_buf **sc_rxbufs; - - /* - * For tx, we just allocate an array of buffers. The packet can - * either be copied into the inline buffer, or the external mapping - * could be used to map the packet - */ - struct vioif_tx_buf *sc_txbufs; - - kstat_t *sc_intrstat; - /* - * We "loan" rx buffers upstream and reuse them after they are - * freed. This lets us avoid allocations in the hot path. - */ - kmem_cache_t *sc_rxbuf_cache; - ulong_t sc_rxloan; - - /* Copying small packets turns out to be faster then mapping them. 
*/ - unsigned long sc_rxcopy_thresh; - unsigned long sc_txcopy_thresh; - - /* - * Statistics visible through mac: - */ - uint64_t sc_ipackets; - uint64_t sc_opackets; - uint64_t sc_rbytes; - uint64_t sc_obytes; - uint64_t sc_brdcstxmt; - uint64_t sc_brdcstrcv; - uint64_t sc_multixmt; - uint64_t sc_multircv; - uint64_t sc_norecvbuf; - uint64_t sc_notxbuf; - uint64_t sc_ierrors; - uint64_t sc_oerrors; - - /* - * Internal debugging statistics: - */ - uint64_t sc_rxfail_dma_handle; - uint64_t sc_rxfail_dma_buffer; - uint64_t sc_rxfail_dma_bind; - uint64_t sc_rxfail_chain_undersize; - uint64_t sc_rxfail_no_descriptors; - uint64_t sc_txfail_dma_handle; - uint64_t sc_txfail_dma_bind; - uint64_t sc_txfail_indirect_limit; -}; - -#define ETHER_HEADER_LEN sizeof (struct ether_header) - -/* MTU + the ethernet header. */ -#define MAX_PAYLOAD 65535 -#define MAX_MTU (MAX_PAYLOAD - ETHER_HEADER_LEN) -#define DEFAULT_MTU ETHERMTU /* - * Yeah, we spend 8M per device. Turns out, there is no point - * being smart and using merged rx buffers (VIRTIO_NET_F_MRG_RXBUF), - * because vhost does not support them, and we expect to be used with - * vhost in production environment. + * VIRTIO NET MAC PROPERTIES */ -/* The buffer keeps both the packet data and the virtio_net_header. */ -#define VIOIF_RX_SIZE (MAX_PAYLOAD + sizeof (struct virtio_net_hdr)) +#define VIOIF_MACPROP_TXCOPY_THRESH "_txcopy_thresh" +#define VIOIF_MACPROP_TXCOPY_THRESH_DEF 300 +#define VIOIF_MACPROP_TXCOPY_THRESH_MAX 640 -/* - * We win a bit on header alignment, but the host wins a lot - * more on moving aligned buffers. Might need more thought. - */ -#define VIOIF_IP_ALIGN 0 - -/* Maximum number of indirect descriptors, somewhat arbitrary. */ -#define VIOIF_INDIRECT_MAX 128 - -/* - * We pre-allocate a reasonably large buffer to copy small packets - * there. Bigger packets are mapped, packets with multiple - * cookies are mapped as indirect buffers. - */ -#define VIOIF_TX_INLINE_SIZE 2048 - -/* Native queue size for all queues */ -#define VIOIF_RX_QLEN 0 -#define VIOIF_TX_QLEN 0 -#define VIOIF_CTRL_QLEN 0 - -static uchar_t vioif_broadcast[ETHERADDRL] = { - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff -}; - -#define VIOIF_TX_THRESH_MAX 640 -#define VIOIF_RX_THRESH_MAX 640 - -#define CACHE_NAME_SIZE 32 - -static char vioif_txcopy_thresh[] = - "vioif_txcopy_thresh"; -static char vioif_rxcopy_thresh[] = - "vioif_rxcopy_thresh"; +#define VIOIF_MACPROP_RXCOPY_THRESH "_rxcopy_thresh" +#define VIOIF_MACPROP_RXCOPY_THRESH_DEF 300 +#define VIOIF_MACPROP_RXCOPY_THRESH_MAX 640 static char *vioif_priv_props[] = { - vioif_txcopy_thresh, - vioif_rxcopy_thresh, + VIOIF_MACPROP_TXCOPY_THRESH, + VIOIF_MACPROP_RXCOPY_THRESH, NULL }; -static void vioif_reclaim_restart(struct vioif_softc *); -/* Add up to ddi? 
*/ -static ddi_dma_cookie_t * -vioif_dma_curr_cookie(ddi_dma_handle_t dmah) +static vioif_txbuf_t * +vioif_txbuf_alloc(vioif_t *vif) { - ddi_dma_impl_t *dmah_impl = (void *) dmah; - ASSERT(dmah_impl->dmai_cookie); - return (dmah_impl->dmai_cookie); -} + vioif_txbuf_t *tb; -static void -vioif_dma_reset_cookie(ddi_dma_handle_t dmah, ddi_dma_cookie_t *dmac) -{ - ddi_dma_impl_t *dmah_impl = (void *) dmah; - dmah_impl->dmai_cookie = dmac; -} + VERIFY(MUTEX_HELD(&vif->vif_mutex)); -static link_state_t -vioif_link_state(struct vioif_softc *sc) -{ - if (sc->sc_virtio.sc_features & VIRTIO_NET_F_STATUS) { - if (virtio_read_device_config_2(&sc->sc_virtio, - VIRTIO_NET_CONFIG_STATUS) & VIRTIO_NET_S_LINK_UP) { - return (LINK_STATE_UP); - } else { - return (LINK_STATE_DOWN); - } + if ((tb = list_remove_head(&vif->vif_txbufs)) != NULL) { + vif->vif_ntxbufs_alloc++; } - return (LINK_STATE_UP); + return (tb); } -static ddi_dma_attr_t vioif_inline_buf_dma_attr = { - DMA_ATTR_V0, /* Version number */ - 0, /* low address */ - 0xFFFFFFFFFFFFFFFF, /* high address */ - 0xFFFFFFFF, /* counter register max */ - 1, /* page alignment */ - 1, /* burst sizes: 1 - 32 */ - 1, /* minimum transfer size */ - 0xFFFFFFFF, /* max transfer size */ - 0xFFFFFFFFFFFFFFF, /* address register max */ - 1, /* scatter-gather capacity */ - 1, /* device operates on bytes */ - 0, /* attr flag: set to 0 */ -}; - -static ddi_dma_attr_t vioif_mapped_buf_dma_attr = { - DMA_ATTR_V0, /* Version number */ - 0, /* low address */ - 0xFFFFFFFFFFFFFFFF, /* high address */ - 0xFFFFFFFF, /* counter register max */ - 1, /* page alignment */ - 1, /* burst sizes: 1 - 32 */ - 1, /* minimum transfer size */ - 0xFFFFFFFF, /* max transfer size */ - 0xFFFFFFFFFFFFFFF, /* address register max */ - - /* One entry is used for the virtio_net_hdr on the tx path */ - VIOIF_INDIRECT_MAX - 1, /* scatter-gather capacity */ - 1, /* device operates on bytes */ - 0, /* attr flag: set to 0 */ -}; - -static ddi_device_acc_attr_t vioif_bufattr = { - DDI_DEVICE_ATTR_V0, - DDI_NEVERSWAP_ACC, - DDI_STORECACHING_OK_ACC, - DDI_DEFAULT_ACC -}; - static void -vioif_rx_free(caddr_t free_arg) +vioif_txbuf_free(vioif_t *vif, vioif_txbuf_t *tb) { - struct vioif_rx_buf *buf = (void *) free_arg; - struct vioif_softc *sc = buf->rb_sc; + VERIFY(MUTEX_HELD(&vif->vif_mutex)); + + VERIFY3U(vif->vif_ntxbufs_alloc, >, 0); + vif->vif_ntxbufs_alloc--; - kmem_cache_free(sc->sc_rxbuf_cache, buf); - atomic_dec_ulong(&sc->sc_rxloan); + virtio_chain_clear(tb->tb_chain); + list_insert_head(&vif->vif_txbufs, tb); } -static int -vioif_rx_construct(void *buffer, void *user_arg, int kmflags) +static vioif_rxbuf_t * +vioif_rxbuf_alloc(vioif_t *vif) { - _NOTE(ARGUNUSED(kmflags)); - struct vioif_softc *sc = user_arg; - struct vioif_rx_buf *buf = buffer; - size_t len; + vioif_rxbuf_t *rb; - if (ddi_dma_alloc_handle(sc->sc_dev, &vioif_mapped_buf_dma_attr, - DDI_DMA_SLEEP, NULL, &buf->rb_mapping.vbm_dmah)) { - sc->sc_rxfail_dma_handle++; - goto exit_handle; - } - - if (ddi_dma_mem_alloc(buf->rb_mapping.vbm_dmah, - VIOIF_RX_SIZE + sizeof (struct virtio_net_hdr), - &vioif_bufattr, DDI_DMA_STREAMING, DDI_DMA_SLEEP, - NULL, &buf->rb_mapping.vbm_buf, &len, &buf->rb_mapping.vbm_acch)) { - sc->sc_rxfail_dma_buffer++; - goto exit_alloc; - } - ASSERT(len >= VIOIF_RX_SIZE); + VERIFY(MUTEX_HELD(&vif->vif_mutex)); - if (ddi_dma_addr_bind_handle(buf->rb_mapping.vbm_dmah, NULL, - buf->rb_mapping.vbm_buf, len, DDI_DMA_READ | DDI_DMA_STREAMING, - DDI_DMA_SLEEP, NULL, &buf->rb_mapping.vbm_dmac, - 
&buf->rb_mapping.vbm_ncookies)) { - sc->sc_rxfail_dma_bind++; - goto exit_bind; + if ((rb = list_remove_head(&vif->vif_rxbufs)) != NULL) { + vif->vif_nrxbufs_alloc++; } - ASSERT(buf->rb_mapping.vbm_ncookies <= VIOIF_INDIRECT_MAX); + return (rb); +} - buf->rb_sc = sc; - buf->rb_frtn.free_arg = (void *) buf; - buf->rb_frtn.free_func = vioif_rx_free; +static void +vioif_rxbuf_free(vioif_t *vif, vioif_rxbuf_t *rb) +{ + VERIFY(MUTEX_HELD(&vif->vif_mutex)); - return (0); -exit_bind: - ddi_dma_mem_free(&buf->rb_mapping.vbm_acch); -exit_alloc: - ddi_dma_free_handle(&buf->rb_mapping.vbm_dmah); -exit_handle: + VERIFY3U(vif->vif_nrxbufs_alloc, >, 0); + vif->vif_nrxbufs_alloc--; - return (ENOMEM); + virtio_chain_clear(rb->rb_chain); + list_insert_head(&vif->vif_rxbufs, rb); } static void -vioif_rx_destruct(void *buffer, void *user_arg) +vioif_rx_free_callback(caddr_t free_arg) { - _NOTE(ARGUNUSED(user_arg)); - struct vioif_rx_buf *buf = buffer; + vioif_rxbuf_t *rb = (vioif_rxbuf_t *)free_arg; + vioif_t *vif = rb->rb_vioif; + + mutex_enter(&vif->vif_mutex); + + /* + * Return this receive buffer to the free list. + */ + vioif_rxbuf_free(vif, rb); + + VERIFY3U(vif->vif_nrxbufs_onloan, >, 0); + vif->vif_nrxbufs_onloan--; - ASSERT(buf->rb_mapping.vbm_acch); - ASSERT(buf->rb_mapping.vbm_acch); + /* + * Attempt to replenish the receive queue with at least the buffer we + * just freed. There isn't a great way to deal with failure here, + * though because we'll only loan at most half of the buffers there + * should always be at least some available even if this fails. + */ + (void) vioif_add_rx(vif); - (void) ddi_dma_unbind_handle(buf->rb_mapping.vbm_dmah); - ddi_dma_mem_free(&buf->rb_mapping.vbm_acch); - ddi_dma_free_handle(&buf->rb_mapping.vbm_dmah); + mutex_exit(&vif->vif_mutex); } static void -vioif_free_mems(struct vioif_softc *sc) +vioif_free_bufs(vioif_t *vif) { - int i; + VERIFY(MUTEX_HELD(&vif->vif_mutex)); - for (i = 0; i < sc->sc_tx_vq->vq_num; i++) { - struct vioif_tx_buf *buf = &sc->sc_txbufs[i]; - int j; + VERIFY3U(vif->vif_ntxbufs_alloc, ==, 0); + for (uint_t i = 0; i < vif->vif_txbufs_capacity; i++) { + vioif_txbuf_t *tb = &vif->vif_txbufs_mem[i]; - /* Tear down the internal mapping. */ + /* + * Ensure that this txbuf is now in the free list: + */ + VERIFY(list_link_active(&tb->tb_link)); + list_remove(&vif->vif_txbufs, tb); - ASSERT(buf->tb_inline_mapping.vbm_acch); - ASSERT(buf->tb_inline_mapping.vbm_dmah); + /* + * We should not have an mblk chain at this point. + */ + VERIFY3P(tb->tb_mp, ==, NULL); - (void) ddi_dma_unbind_handle(buf->tb_inline_mapping.vbm_dmah); - ddi_dma_mem_free(&buf->tb_inline_mapping.vbm_acch); - ddi_dma_free_handle(&buf->tb_inline_mapping.vbm_dmah); + if (tb->tb_dma != NULL) { + virtio_dma_free(tb->tb_dma); + tb->tb_dma = NULL; + } - /* We should not see any in-flight buffers at this point. */ - ASSERT(!buf->tb_mp); + if (tb->tb_chain != NULL) { + virtio_chain_free(tb->tb_chain); + tb->tb_chain = NULL; + } + + if (tb->tb_dmaext != NULL) { + for (uint_t j = 0; j < tb->tb_dmaext_capacity; j++) { + if (tb->tb_dmaext[j] != NULL) { + virtio_dma_free( + tb->tb_dmaext[j]); + tb->tb_dmaext[j] = NULL; + } + } - /* Free all the dma hdnales we allocated lazily. */ - for (j = 0; buf->tb_external_mapping[j].vbm_dmah; j++) - ddi_dma_free_handle( - &buf->tb_external_mapping[j].vbm_dmah); - /* Free the external mapping array. 
*/ - kmem_free(buf->tb_external_mapping, - sizeof (struct vioif_tx_buf) * VIOIF_INDIRECT_MAX - 1); + kmem_free(tb->tb_dmaext, + sizeof (virtio_dma_t *) * tb->tb_dmaext_capacity); + tb->tb_dmaext = NULL; + tb->tb_dmaext_capacity = 0; + } + } + VERIFY(list_is_empty(&vif->vif_txbufs)); + if (vif->vif_txbufs_mem != NULL) { + kmem_free(vif->vif_txbufs_mem, + sizeof (vioif_txbuf_t) * vif->vif_txbufs_capacity); + vif->vif_txbufs_mem = NULL; + vif->vif_txbufs_capacity = 0; } - kmem_free(sc->sc_txbufs, sizeof (struct vioif_tx_buf) * - sc->sc_tx_vq->vq_num); + VERIFY3U(vif->vif_nrxbufs_alloc, ==, 0); + for (uint_t i = 0; i < vif->vif_rxbufs_capacity; i++) { + vioif_rxbuf_t *rb = &vif->vif_rxbufs_mem[i]; - for (i = 0; i < sc->sc_rx_vq->vq_num; i++) { - struct vioif_rx_buf *buf = sc->sc_rxbufs[i]; + /* + * Ensure that this rxbuf is now in the free list: + */ + VERIFY(list_link_active(&rb->rb_link)); + list_remove(&vif->vif_rxbufs, rb); - if (buf) - kmem_cache_free(sc->sc_rxbuf_cache, buf); + if (rb->rb_dma != NULL) { + virtio_dma_free(rb->rb_dma); + rb->rb_dma = NULL; + } + + if (rb->rb_chain != NULL) { + virtio_chain_free(rb->rb_chain); + rb->rb_chain = NULL; + } + } + VERIFY(list_is_empty(&vif->vif_rxbufs)); + if (vif->vif_rxbufs_mem != NULL) { + kmem_free(vif->vif_rxbufs_mem, + sizeof (vioif_rxbuf_t) * vif->vif_rxbufs_capacity); + vif->vif_rxbufs_mem = NULL; + vif->vif_rxbufs_capacity = 0; } - kmem_free(sc->sc_rxbufs, sizeof (struct vioif_rx_buf *) * - sc->sc_rx_vq->vq_num); } static int -vioif_alloc_mems(struct vioif_softc *sc) +vioif_alloc_bufs(vioif_t *vif) { - int i, txqsize, rxqsize; - size_t len; - unsigned int nsegments; + VERIFY(MUTEX_HELD(&vif->vif_mutex)); - txqsize = sc->sc_tx_vq->vq_num; - rxqsize = sc->sc_rx_vq->vq_num; + /* + * Allocate one contiguous chunk of memory for the transmit and receive + * buffer tracking objects. If the ring is unusually small, we'll + * reduce our target buffer count accordingly. + */ + vif->vif_txbufs_capacity = MIN(VIRTIO_NET_TX_BUFS, + virtio_queue_size(vif->vif_tx_vq)); + vif->vif_txbufs_mem = kmem_zalloc( + sizeof (vioif_txbuf_t) * vif->vif_txbufs_capacity, KM_SLEEP); + list_create(&vif->vif_txbufs, sizeof (vioif_txbuf_t), + offsetof(vioif_txbuf_t, tb_link)); + + vif->vif_rxbufs_capacity = MIN(VIRTIO_NET_RX_BUFS, + virtio_queue_size(vif->vif_rx_vq)); + vif->vif_rxbufs_mem = kmem_zalloc( + sizeof (vioif_rxbuf_t) * vif->vif_rxbufs_capacity, KM_SLEEP); + list_create(&vif->vif_rxbufs, sizeof (vioif_rxbuf_t), + offsetof(vioif_rxbuf_t, rb_link)); - sc->sc_txbufs = kmem_zalloc(sizeof (struct vioif_tx_buf) * txqsize, - KM_SLEEP); - if (sc->sc_txbufs == NULL) { - dev_err(sc->sc_dev, CE_WARN, - "Failed to allocate the tx buffers array"); - goto exit_txalloc; - } + /* + * Do not loan more than half of our allocated receive buffers into + * the networking stack. + */ + vif->vif_nrxbufs_onloan_max = vif->vif_rxbufs_capacity / 2; /* - * We don't allocate the rx vioif_bufs, just the pointers, as - * rx vioif_bufs can be loaned upstream, and we don't know the - * total number we need. + * Put everything in the free list straight away in order to simplify + * the use of vioif_free_bufs() for cleanup on allocation failure. 
*/ - sc->sc_rxbufs = kmem_zalloc(sizeof (struct vioif_rx_buf *) * rxqsize, - KM_SLEEP); - if (sc->sc_rxbufs == NULL) { - dev_err(sc->sc_dev, CE_WARN, - "Failed to allocate the rx buffers pointer array"); - goto exit_rxalloc; + for (uint_t i = 0; i < vif->vif_txbufs_capacity; i++) { + list_insert_tail(&vif->vif_txbufs, &vif->vif_txbufs_mem[i]); + } + for (uint_t i = 0; i < vif->vif_rxbufs_capacity; i++) { + list_insert_tail(&vif->vif_rxbufs, &vif->vif_rxbufs_mem[i]); } - for (i = 0; i < txqsize; i++) { - struct vioif_tx_buf *buf = &sc->sc_txbufs[i]; - - /* Allocate and bind an inline mapping. */ - - if (ddi_dma_alloc_handle(sc->sc_dev, - &vioif_inline_buf_dma_attr, - DDI_DMA_SLEEP, NULL, &buf->tb_inline_mapping.vbm_dmah)) { + /* + * Start from the DMA attribute template common to both transmit and + * receive buffers. The SGL entry count will be modified for each + * buffer type. + */ + ddi_dma_attr_t attr = vioif_dma_attr_bufs; - dev_err(sc->sc_dev, CE_WARN, - "Can't allocate dma handle for tx buffer %d", i); - goto exit_tx; + /* + * The transmit inline buffer is small (less than a page), so it's + * reasonable to request a single cookie. + */ + attr.dma_attr_sgllen = 1; + + for (vioif_txbuf_t *tb = list_head(&vif->vif_txbufs); tb != NULL; + tb = list_next(&vif->vif_txbufs, tb)) { + if ((tb->tb_dma = virtio_dma_alloc(vif->vif_virtio, + VIOIF_TX_INLINE_SIZE, &attr, + DDI_DMA_STREAMING | DDI_DMA_WRITE, KM_SLEEP)) == NULL) { + goto fail; } + VERIFY3U(virtio_dma_ncookies(tb->tb_dma), ==, 1); - if (ddi_dma_mem_alloc(buf->tb_inline_mapping.vbm_dmah, - VIOIF_TX_INLINE_SIZE, &vioif_bufattr, DDI_DMA_STREAMING, - DDI_DMA_SLEEP, NULL, &buf->tb_inline_mapping.vbm_buf, - &len, &buf->tb_inline_mapping.vbm_acch)) { - - dev_err(sc->sc_dev, CE_WARN, - "Can't allocate tx buffer %d", i); - goto exit_tx; + if ((tb->tb_chain = virtio_chain_alloc(vif->vif_tx_vq, + KM_SLEEP)) == NULL) { + goto fail; } - ASSERT(len >= VIOIF_TX_INLINE_SIZE); + virtio_chain_data_set(tb->tb_chain, tb); - if (ddi_dma_addr_bind_handle(buf->tb_inline_mapping.vbm_dmah, - NULL, buf->tb_inline_mapping.vbm_buf, len, - DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_SLEEP, NULL, - &buf->tb_inline_mapping.vbm_dmac, &nsegments)) { + tb->tb_dmaext_capacity = VIOIF_MAX_SEGS - 1; + tb->tb_dmaext = kmem_zalloc( + sizeof (virtio_dma_t *) * tb->tb_dmaext_capacity, + KM_SLEEP); + } - dev_err(sc->sc_dev, CE_WARN, - "Can't bind tx buffer %d", i); - goto exit_tx; + /* + * The receive buffers are larger, and we can tolerate a large number + * of segments. Adjust the SGL entry count, setting aside one segment + * for the virtio net header. + */ + attr.dma_attr_sgllen = VIOIF_MAX_SEGS - 1; + + for (vioif_rxbuf_t *rb = list_head(&vif->vif_rxbufs); rb != NULL; + rb = list_next(&vif->vif_rxbufs, rb)) { + if ((rb->rb_dma = virtio_dma_alloc(vif->vif_virtio, + VIOIF_RX_BUF_SIZE, &attr, DDI_DMA_STREAMING | DDI_DMA_READ, + KM_SLEEP)) == NULL) { + goto fail; } - /* We asked for a single segment */ - ASSERT(nsegments == 1); + if ((rb->rb_chain = virtio_chain_alloc(vif->vif_rx_vq, + KM_SLEEP)) == NULL) { + goto fail; + } + virtio_chain_data_set(rb->rb_chain, rb); /* - * We allow up to VIOIF_INDIRECT_MAX - 1 external mappings. - * In reality, I don't expect more then 2-3 used, but who - * knows. + * Ensure that the first cookie is sufficient to cover the + * header skip region plus one byte. 
*/ - buf->tb_external_mapping = kmem_zalloc( - sizeof (struct vioif_tx_buf) * VIOIF_INDIRECT_MAX - 1, - KM_SLEEP); + VERIFY3U(virtio_dma_cookie_size(rb->rb_dma, 0), >=, + VIOIF_HEADER_SKIP + 1); /* - * The external mapping's dma handles are allocate lazily, - * as we don't expect most of them to be used.. + * Ensure that the frame data begins at a location with a + * correctly aligned IP header. */ - } - - return (0); - -exit_tx: - for (i = 0; i < txqsize; i++) { - struct vioif_tx_buf *buf = &sc->sc_txbufs[i]; + VERIFY3U((uintptr_t)virtio_dma_va(rb->rb_dma, + VIOIF_HEADER_SKIP) % 4, ==, 2); - if (buf->tb_inline_mapping.vbm_dmah) - (void) ddi_dma_unbind_handle( - buf->tb_inline_mapping.vbm_dmah); - - if (buf->tb_inline_mapping.vbm_acch) - ddi_dma_mem_free( - &buf->tb_inline_mapping.vbm_acch); - - if (buf->tb_inline_mapping.vbm_dmah) - ddi_dma_free_handle( - &buf->tb_inline_mapping.vbm_dmah); - - if (buf->tb_external_mapping) - kmem_free(buf->tb_external_mapping, - sizeof (struct vioif_tx_buf) * - VIOIF_INDIRECT_MAX - 1); + rb->rb_vioif = vif; + rb->rb_frtn.free_func = vioif_rx_free_callback; + rb->rb_frtn.free_arg = (caddr_t)rb; } - kmem_free(sc->sc_rxbufs, sizeof (struct vioif_rx_buf) * rxqsize); + return (0); -exit_rxalloc: - kmem_free(sc->sc_txbufs, sizeof (struct vioif_tx_buf) * txqsize); -exit_txalloc: +fail: + vioif_free_bufs(vif); return (ENOMEM); } -/* ARGSUSED */ static int -vioif_multicst(void *arg, boolean_t add, const uint8_t *macaddr) +vioif_m_multicst(void *arg, boolean_t add, const uint8_t *mcst_addr) { - return (DDI_SUCCESS); + /* + * Even though we currently do not have support for programming + * multicast filters, or even enabling promiscuous mode, we return + * success here to avoid the networking stack falling back to link + * layer broadcast for multicast traffic. Some hypervisors already + * pass received multicast frames onto the guest, so at least on those + * systems multicast will work as expected anyway. + */ + return (0); } -/* ARGSUSED */ static int -vioif_promisc(void *arg, boolean_t on) +vioif_m_setpromisc(void *arg, boolean_t on) { - return (DDI_SUCCESS); + /* + * Even though we cannot currently enable promiscuous mode, we return + * success here to allow tools like snoop(1M) to continue to function. + */ + return (0); } -/* ARGSUSED */ static int -vioif_unicst(void *arg, const uint8_t *macaddr) +vioif_m_unicst(void *arg, const uint8_t *mac) { - return (DDI_FAILURE); + return (ENOTSUP); } static uint_t -vioif_add_rx(struct vioif_softc *sc, int kmflag) +vioif_add_rx(vioif_t *vif) { - uint_t num_added = 0; - struct vq_entry *ve; - - while ((ve = vq_alloc_entry(sc->sc_rx_vq)) != NULL) { - struct vioif_rx_buf *buf = sc->sc_rxbufs[ve->qe_index]; - - if (buf == NULL) { - /* First run, allocate the buffer. */ - buf = kmem_cache_alloc(sc->sc_rxbuf_cache, kmflag); - sc->sc_rxbufs[ve->qe_index] = buf; - } - - /* Still nothing? Bye. */ - if (buf == NULL) { - sc->sc_norecvbuf++; - vq_free_entry(sc->sc_rx_vq, ve); - break; - } - - ASSERT(buf->rb_mapping.vbm_ncookies >= 1); + VERIFY(MUTEX_HELD(&vif->vif_mutex)); + if (vif->vif_runstate != VIOIF_RUNSTATE_RUNNING) { /* - * For an unknown reason, the virtio_net_hdr must be placed - * as a separate virtio queue entry. + * If the NIC is not running, do not give the device any + * receive buffers. */ - virtio_ve_add_indirect_buf(ve, - buf->rb_mapping.vbm_dmac.dmac_laddress, - sizeof (struct virtio_net_hdr), B_FALSE); + return (0); + } - /* Add the rest of the first cookie. 
*/ - virtio_ve_add_indirect_buf(ve, - buf->rb_mapping.vbm_dmac.dmac_laddress + - sizeof (struct virtio_net_hdr), - buf->rb_mapping.vbm_dmac.dmac_size - - sizeof (struct virtio_net_hdr), B_FALSE); + uint_t num_added = 0; + vioif_rxbuf_t *rb; + while ((rb = vioif_rxbuf_alloc(vif)) != NULL) { /* - * If the buffer consists of a single cookie (unlikely for a - * 64-k buffer), we are done. Otherwise, add the rest of the - * cookies using indirect entries. + * For legacy devices, and those that have not negotiated + * VIRTIO_F_ANY_LAYOUT, the virtio net header must appear in a + * separate descriptor entry to the rest of the buffer. */ - if (buf->rb_mapping.vbm_ncookies > 1) { - ddi_dma_cookie_t *first_extra_dmac; - ddi_dma_cookie_t dmac; - first_extra_dmac = - vioif_dma_curr_cookie(buf->rb_mapping.vbm_dmah); - - ddi_dma_nextcookie(buf->rb_mapping.vbm_dmah, &dmac); - virtio_ve_add_cookie(ve, buf->rb_mapping.vbm_dmah, - dmac, buf->rb_mapping.vbm_ncookies - 1, B_FALSE); - vioif_dma_reset_cookie(buf->rb_mapping.vbm_dmah, - first_extra_dmac); + if (virtio_chain_append(rb->rb_chain, + virtio_dma_cookie_pa(rb->rb_dma, 0), + sizeof (struct virtio_net_hdr), + VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) { + goto fail; } - virtio_push_chain(ve, B_FALSE); - num_added++; - } + for (uint_t n = 0; n < virtio_dma_ncookies(rb->rb_dma); n++) { + uint64_t pa = virtio_dma_cookie_pa(rb->rb_dma, n); + size_t sz = virtio_dma_cookie_size(rb->rb_dma, n); - return (num_added); -} + if (n == 0) { + pa += VIOIF_HEADER_SKIP; + VERIFY3U(sz, >, VIOIF_HEADER_SKIP); + sz -= VIOIF_HEADER_SKIP; + } -static uint_t -vioif_populate_rx(struct vioif_softc *sc, int kmflag) -{ - uint_t num_added = vioif_add_rx(sc, kmflag); + if (virtio_chain_append(rb->rb_chain, pa, sz, + VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) { + goto fail; + } + } + + virtio_chain_submit(rb->rb_chain, B_FALSE); + num_added++; + continue; - if (num_added > 0) - virtio_sync_vq(sc->sc_rx_vq); +fail: + vioif_rxbuf_free(vif, rb); + vif->vif_norecvbuf++; + break; + } + + if (num_added > 0) { + virtio_queue_flush(vif->vif_rx_vq); + } return (num_added); } static uint_t -vioif_process_rx(struct vioif_softc *sc) +vioif_process_rx(vioif_t *vif) { - struct vq_entry *ve; - struct vioif_rx_buf *buf; + virtio_chain_t *vic; mblk_t *mphead = NULL, *lastmp = NULL, *mp; - uint32_t len; uint_t num_processed = 0; - while ((ve = virtio_pull_chain(sc->sc_rx_vq, &len))) { + VERIFY(MUTEX_HELD(&vif->vif_mutex)); - buf = sc->sc_rxbufs[ve->qe_index]; - ASSERT(buf); + while ((vic = virtio_queue_poll(vif->vif_rx_vq)) != NULL) { + /* + * We have to use the chain received length here, as the device + * does not tell us the received frame length any other way. + * In a limited survey of hypervisors, virtio network devices + * appear to provide the right value here. + */ + size_t len = virtio_chain_received_length(vic); + vioif_rxbuf_t *rb = virtio_chain_data(vic); - if (len < sizeof (struct virtio_net_hdr)) { - sc->sc_rxfail_chain_undersize++; - sc->sc_ierrors++; - virtio_free_chain(ve); + virtio_dma_sync(rb->rb_dma, DDI_DMA_SYNC_FORCPU); + + /* + * If the NIC is not running, discard any received frames. + */ + if (vif->vif_runstate != VIOIF_RUNSTATE_RUNNING) { + vioif_rxbuf_free(vif, rb); continue; } + if (len < sizeof (struct virtio_net_hdr)) { + vif->vif_rxfail_chain_undersize++; + vif->vif_ierrors++; + vioif_rxbuf_free(vif, rb); + continue; + } len -= sizeof (struct virtio_net_hdr); + /* * We copy small packets that happen to fit into a single * cookie and reuse the buffers. 
For bigger ones, we loan * the buffers upstream. */ - if (len < sc->sc_rxcopy_thresh) { - mp = allocb(len, 0); - if (mp == NULL) { - sc->sc_norecvbuf++; - sc->sc_ierrors++; - - virtio_free_chain(ve); - break; + if (len < vif->vif_rxcopy_thresh || + vif->vif_nrxbufs_onloan >= vif->vif_nrxbufs_onloan_max) { + mutex_exit(&vif->vif_mutex); + if ((mp = allocb(len, 0)) == NULL) { + mutex_enter(&vif->vif_mutex); + vif->vif_norecvbuf++; + vif->vif_ierrors++; + + vioif_rxbuf_free(vif, rb); + continue; } - bcopy((char *)buf->rb_mapping.vbm_buf + - sizeof (struct virtio_net_hdr), mp->b_rptr, len); + bcopy(virtio_dma_va(rb->rb_dma, VIOIF_HEADER_SKIP), + mp->b_rptr, len); mp->b_wptr = mp->b_rptr + len; + /* + * As the packet contents was copied rather than + * loaned, we can return the receive buffer resources + * to the free list. + */ + mutex_enter(&vif->vif_mutex); + vioif_rxbuf_free(vif, rb); + } else { - mp = desballoc((unsigned char *) - buf->rb_mapping.vbm_buf + - sizeof (struct virtio_net_hdr) + - VIOIF_IP_ALIGN, len, 0, &buf->rb_frtn); - if (mp == NULL) { - sc->sc_norecvbuf++; - sc->sc_ierrors++; - - virtio_free_chain(ve); - break; + mutex_exit(&vif->vif_mutex); + if ((mp = desballoc(virtio_dma_va(rb->rb_dma, + VIOIF_HEADER_SKIP), len, 0, + &rb->rb_frtn)) == NULL) { + mutex_enter(&vif->vif_mutex); + vif->vif_norecvbuf++; + vif->vif_ierrors++; + + vioif_rxbuf_free(vif, rb); + continue; } mp->b_wptr = mp->b_rptr + len; - atomic_inc_ulong(&sc->sc_rxloan); - /* - * Buffer loaned, we will have to allocate a new one - * for this slot. - */ - sc->sc_rxbufs[ve->qe_index] = NULL; + mutex_enter(&vif->vif_mutex); + vif->vif_nrxbufs_onloan++; } /* @@ -879,15 +704,13 @@ vioif_process_rx(struct vioif_softc *sc) */ if (mp->b_rptr[0] & 0x1) { if (bcmp(mp->b_rptr, vioif_broadcast, ETHERADDRL) != 0) - sc->sc_multircv++; + vif->vif_multircv++; else - sc->sc_brdcstrcv++; + vif->vif_brdcstrcv++; } - sc->sc_rbytes += len; - sc->sc_ipackets++; - - virtio_free_chain(ve); + vif->vif_rbytes += len; + vif->vif_ipackets++; if (lastmp == NULL) { mphead = mp; @@ -899,42 +722,56 @@ vioif_process_rx(struct vioif_softc *sc) } if (mphead != NULL) { - mac_rx(sc->sc_mac_handle, NULL, mphead); + if (vif->vif_runstate == VIOIF_RUNSTATE_RUNNING) { + mutex_exit(&vif->vif_mutex); + mac_rx(vif->vif_mac_handle, NULL, mphead); + mutex_enter(&vif->vif_mutex); + } else { + /* + * The NIC was disabled part way through our execution, + * so free the messages we allocated. + */ + freemsgchain(mphead); + } } return (num_processed); } static uint_t -vioif_reclaim_used_tx(struct vioif_softc *sc) +vioif_reclaim_used_tx(vioif_t *vif) { - struct vq_entry *ve; - uint32_t len; + virtio_chain_t *vic; uint_t num_reclaimed = 0; - while ((ve = virtio_pull_chain(sc->sc_tx_vq, &len))) { - struct vioif_tx_buf *buf; - mblk_t *mp; + VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex)); - /* We don't chain descriptors for tx, so don't expect any. */ - ASSERT(ve->qe_next == NULL); + while ((vic = virtio_queue_poll(vif->vif_tx_vq)) != NULL) { + vioif_txbuf_t *tb = virtio_chain_data(vic); - buf = &sc->sc_txbufs[ve->qe_index]; - mp = buf->tb_mp; - buf->tb_mp = NULL; + if (tb->tb_mp != NULL) { + /* + * Unbind the external mapping. 
+ */ + for (uint_t i = 0; i < tb->tb_dmaext_capacity; i++) { + if (tb->tb_dmaext[i] == NULL) { + continue; + } - if (mp != NULL) { - for (uint_t i = 0; i < buf->tb_external_num; i++) { - (void) ddi_dma_unbind_handle( - buf->tb_external_mapping[i].vbm_dmah); + virtio_dma_unbind(tb->tb_dmaext[i]); } + + freemsg(tb->tb_mp); + tb->tb_mp = NULL; } - virtio_free_chain(ve); + /* + * Return this transmit buffer to the free list for reuse. + */ + mutex_enter(&vif->vif_mutex); + vioif_txbuf_free(vif, tb); + mutex_exit(&vif->vif_mutex); - /* External mapping used, mp was not freed in vioif_send() */ - if (mp != NULL) - freemsg(mp); num_reclaimed++; } @@ -942,24 +779,24 @@ vioif_reclaim_used_tx(struct vioif_softc *sc) if (num_reclaimed > 0) { boolean_t do_update = B_FALSE; - mutex_enter(&sc->sc_tx_lock); - if (sc->sc_tx_corked) { + mutex_enter(&vif->vif_mutex); + vif->vif_stat_tx_reclaim += num_reclaimed; + if (vif->vif_tx_corked) { /* * TX was corked on a lack of available descriptors. * That dire state has passed so the TX interrupt can * be disabled and MAC can be notified that * transmission is possible again. */ - sc->sc_tx_corked = B_FALSE; - virtio_stop_vq_intr(sc->sc_tx_vq); + vif->vif_tx_corked = B_FALSE; + virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE); do_update = B_TRUE; } - mutex_exit(&sc->sc_tx_lock); - /* Notify MAC outside the above lock */ if (do_update) { - mac_tx_update(sc->sc_mac_handle); + mac_tx_update(vif->vif_mac_handle); } + mutex_exit(&vif->vif_mutex); } return (num_reclaimed); @@ -968,208 +805,196 @@ vioif_reclaim_used_tx(struct vioif_softc *sc) static void vioif_reclaim_periodic(void *arg) { - struct vioif_softc *sc = arg; + vioif_t *vif = arg; uint_t num_reclaimed; - num_reclaimed = vioif_reclaim_used_tx(sc); + num_reclaimed = vioif_reclaim_used_tx(vif); - mutex_enter(&sc->sc_tx_lock); - sc->sc_tx_reclaim_tid = 0; + mutex_enter(&vif->vif_mutex); + vif->vif_tx_reclaim_tid = 0; /* * If used descriptors were reclaimed or TX descriptors appear to be * outstanding, the ring is considered active and periodic reclamation * is necessary for now. */ - if (num_reclaimed != 0 || vq_num_used(sc->sc_tx_vq) != 0) { + if (num_reclaimed != 0 || virtio_queue_nactive(vif->vif_tx_vq) != 0) { /* Do not reschedule if the ring is being drained. 
*/ - if (!sc->sc_tx_drain) { - vioif_reclaim_restart(sc); + if (!vif->vif_tx_drain) { + vioif_reclaim_restart(vif); } } - mutex_exit(&sc->sc_tx_lock); + mutex_exit(&vif->vif_mutex); } static void -vioif_reclaim_restart(struct vioif_softc *sc) +vioif_reclaim_restart(vioif_t *vif) { - ASSERT(MUTEX_HELD(&sc->sc_tx_lock)); - ASSERT(!sc->sc_tx_drain); + VERIFY(MUTEX_HELD(&vif->vif_mutex)); + VERIFY(!vif->vif_tx_drain); - if (sc->sc_tx_reclaim_tid == 0) { - sc->sc_tx_reclaim_tid = timeout(vioif_reclaim_periodic, sc, + if (vif->vif_tx_reclaim_tid == 0) { + vif->vif_tx_reclaim_tid = timeout(vioif_reclaim_periodic, vif, MSEC_TO_TICK_ROUNDUP(vioif_reclaim_ms)); } } static void -vioif_tx_drain(struct vioif_softc *sc) +vioif_tx_drain(vioif_t *vif) { - mutex_enter(&sc->sc_tx_lock); - sc->sc_tx_drain = B_TRUE; + VERIFY(MUTEX_HELD(&vif->vif_mutex)); + VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_STOPPING); + + vif->vif_tx_drain = B_TRUE; /* Put a stop to the periodic reclaim if it is running */ - if (sc->sc_tx_reclaim_tid != 0) { - timeout_id_t tid = sc->sc_tx_reclaim_tid; + if (vif->vif_tx_reclaim_tid != 0) { + timeout_id_t tid = vif->vif_tx_reclaim_tid; /* - * With sc_tx_drain set, there is no risk that a racing + * With vif_tx_drain set, there is no risk that a racing * vioif_reclaim_periodic() call will reschedule itself. * * Being part of the mc_stop hook also guarantees that - * vioif_tx() will not be called to restart it. + * vioif_m_tx() will not be called to restart it. */ - sc->sc_tx_reclaim_tid = 0; - mutex_exit(&sc->sc_tx_lock); + vif->vif_tx_reclaim_tid = 0; + mutex_exit(&vif->vif_mutex); (void) untimeout(tid); - mutex_enter(&sc->sc_tx_lock); + mutex_enter(&vif->vif_mutex); } - virtio_stop_vq_intr(sc->sc_tx_vq); - mutex_exit(&sc->sc_tx_lock); + virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE); /* * Wait for all of the TX descriptors to be processed by the host so * they can be reclaimed. */ - while (vq_num_used(sc->sc_tx_vq) != 0) { - (void) vioif_reclaim_used_tx(sc); + while (vif->vif_ntxbufs_alloc > 0) { + mutex_exit(&vif->vif_mutex); + (void) vioif_reclaim_used_tx(vif); delay(5); + mutex_enter(&vif->vif_mutex); } - - VERIFY(!sc->sc_tx_corked); - VERIFY3U(sc->sc_tx_reclaim_tid, ==, 0); - VERIFY3U(vq_num_used(sc->sc_tx_vq), ==, 0); + VERIFY(!vif->vif_tx_corked); + VERIFY3U(vif->vif_tx_reclaim_tid, ==, 0); + VERIFY3U(virtio_queue_nactive(vif->vif_tx_vq), ==, 0); } -/* sc will be used to update stat counters. */ -/* ARGSUSED */ -static inline void -vioif_tx_inline(struct vioif_softc *sc, struct vq_entry *ve, mblk_t *mp, - size_t msg_size) +static int +vioif_tx_inline(vioif_t *vif, vioif_txbuf_t *tb, mblk_t *mp, size_t msg_size) { - struct vioif_tx_buf *buf; - buf = &sc->sc_txbufs[ve->qe_index]; - - ASSERT(buf); + VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex)); - /* Frees mp */ - mcopymsg(mp, buf->tb_inline_mapping.vbm_buf + - sizeof (struct virtio_net_hdr)); + VERIFY3U(msg_size, <=, virtio_dma_size(tb->tb_dma) - VIOIF_HEADER_SKIP); - virtio_ve_add_indirect_buf(ve, - buf->tb_inline_mapping.vbm_dmac.dmac_laddress + - sizeof (struct virtio_net_hdr), msg_size, B_TRUE); -} + /* + * Copy the message into the inline buffer and then free the message. 
+ */ + mcopymsg(mp, virtio_dma_va(tb->tb_dma, VIOIF_HEADER_SKIP)); -static inline int -vioif_tx_lazy_handle_alloc(struct vioif_softc *sc, struct vioif_tx_buf *buf, - int i) -{ - int ret = DDI_SUCCESS; - - if (!buf->tb_external_mapping[i].vbm_dmah) { - ret = ddi_dma_alloc_handle(sc->sc_dev, - &vioif_mapped_buf_dma_attr, DDI_DMA_SLEEP, NULL, - &buf->tb_external_mapping[i].vbm_dmah); - if (ret != DDI_SUCCESS) { - sc->sc_txfail_dma_handle++; - } + if (virtio_chain_append(tb->tb_chain, + virtio_dma_cookie_pa(tb->tb_dma, 0) + VIOIF_HEADER_SKIP, + msg_size, VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) { + return (DDI_FAILURE); } - return (ret); + return (DDI_SUCCESS); } -static inline int -vioif_tx_external(struct vioif_softc *sc, struct vq_entry *ve, mblk_t *mp, - size_t msg_size) +static int +vioif_tx_external(vioif_t *vif, vioif_txbuf_t *tb, mblk_t *mp, size_t msg_size) { - _NOTE(ARGUNUSED(msg_size)); + VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex)); - struct vioif_tx_buf *buf; - mblk_t *nmp; - int i, j; - int ret = DDI_SUCCESS; + mblk_t *nmp = mp; + tb->tb_ndmaext = 0; - buf = &sc->sc_txbufs[ve->qe_index]; - - ASSERT(buf); - - buf->tb_external_num = 0; - i = 0; - nmp = mp; - - while (nmp) { + while (nmp != NULL) { size_t len; - ddi_dma_cookie_t dmac; - unsigned int ncookies; - len = MBLKL(nmp); - /* - * For some reason, the network stack can - * actually send us zero-length fragments. - */ - if (len == 0) { + if ((len = MBLKL(nmp)) == 0) { + /* + * Skip any zero-length entries in the chain. + */ nmp = nmp->b_cont; continue; } - ret = vioif_tx_lazy_handle_alloc(sc, buf, i); - if (ret != DDI_SUCCESS) { - sc->sc_notxbuf++; - sc->sc_oerrors++; - goto exit_lazy_alloc; - } - ret = ddi_dma_addr_bind_handle( - buf->tb_external_mapping[i].vbm_dmah, NULL, - (caddr_t)nmp->b_rptr, len, - DDI_DMA_WRITE | DDI_DMA_STREAMING, - DDI_DMA_SLEEP, NULL, &dmac, &ncookies); - - if (ret != DDI_SUCCESS) { - sc->sc_txfail_dma_bind++; - sc->sc_oerrors++; - goto exit_bind; + if (tb->tb_ndmaext >= tb->tb_dmaext_capacity) { + mutex_enter(&vif->vif_mutex); + vif->vif_txfail_indirect_limit++; + vif->vif_notxbuf++; + mutex_exit(&vif->vif_mutex); + goto fail; } - /* Check if we still fit into the indirect table. */ - if (virtio_ve_indirect_available(ve) < ncookies) { - sc->sc_txfail_indirect_limit++; - sc->sc_notxbuf++; - sc->sc_oerrors++; - - ret = DDI_FAILURE; - goto exit_limit; + if (tb->tb_dmaext[tb->tb_ndmaext] == NULL) { + /* + * Allocate a DMA handle for this slot. 
+ */ + if ((tb->tb_dmaext[tb->tb_ndmaext] = + virtio_dma_alloc_nomem(vif->vif_virtio, + &vioif_dma_attr_external, KM_SLEEP)) == NULL) { + mutex_enter(&vif->vif_mutex); + vif->vif_notxbuf++; + mutex_exit(&vif->vif_mutex); + goto fail; + } + } + virtio_dma_t *extdma = tb->tb_dmaext[tb->tb_ndmaext++]; + + if (virtio_dma_bind(extdma, nmp->b_rptr, len, + DDI_DMA_WRITE | DDI_DMA_STREAMING, KM_SLEEP) != + DDI_SUCCESS) { + mutex_enter(&vif->vif_mutex); + vif->vif_txfail_dma_bind++; + mutex_exit(&vif->vif_mutex); + goto fail; } - virtio_ve_add_cookie(ve, buf->tb_external_mapping[i].vbm_dmah, - dmac, ncookies, B_TRUE); + for (uint_t n = 0; n < virtio_dma_ncookies(extdma); n++) { + uint64_t pa = virtio_dma_cookie_pa(extdma, n); + size_t sz = virtio_dma_cookie_size(extdma, n); + + if (virtio_chain_append(tb->tb_chain, pa, sz, + VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) { + mutex_enter(&vif->vif_mutex); + vif->vif_txfail_indirect_limit++; + vif->vif_notxbuf++; + mutex_exit(&vif->vif_mutex); + goto fail; + } + } nmp = nmp->b_cont; - i++; } - buf->tb_external_num = i; - /* Save the mp to free it when the packet is sent. */ - buf->tb_mp = mp; + /* + * We need to keep the message around until we reclaim the buffer from + * the device before freeing it. + */ + tb->tb_mp = mp; return (DDI_SUCCESS); -exit_limit: -exit_bind: -exit_lazy_alloc: - - for (j = 0; j < i; j++) { - (void) ddi_dma_unbind_handle( - buf->tb_external_mapping[j].vbm_dmah); +fail: + for (uint_t n = 0; n < tb->tb_ndmaext; n++) { + if (tb->tb_dmaext[n] != NULL) { + virtio_dma_unbind(tb->tb_dmaext[n]); + } } + tb->tb_ndmaext = 0; - return (ret); + freemsg(mp); + + return (DDI_FAILURE); } static boolean_t -vioif_send(struct vioif_softc *sc, mblk_t *mp) +vioif_send(vioif_t *vif, mblk_t *mp) { - struct vq_entry *ve; - struct vioif_tx_buf *buf; - struct virtio_net_hdr *net_header = NULL; + VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex)); + + vioif_txbuf_t *tb = NULL; + struct virtio_net_hdr *vnh = NULL; size_t msg_size = 0; uint32_t csum_start; uint32_t csum_stuff; @@ -1179,133 +1004,159 @@ vioif_send(struct vioif_softc *sc, mblk_t *mp) mblk_t *nmp; int ret; boolean_t lso_required = B_FALSE; + struct ether_header *ether = (void *)mp->b_rptr; for (nmp = mp; nmp; nmp = nmp->b_cont) msg_size += MBLKL(nmp); - if (sc->sc_tx_tso4) { + if (vif->vif_tx_tso4) { mac_lso_get(mp, &lso_mss, &lso_flags); - lso_required = (lso_flags & HW_LSO); + lso_required = (lso_flags & HW_LSO) != 0; } - ve = vq_alloc_entry(sc->sc_tx_vq); - - if (ve == NULL) { - sc->sc_notxbuf++; - /* Out of free descriptors - try later. */ - return (B_FALSE); + mutex_enter(&vif->vif_mutex); + if ((tb = vioif_txbuf_alloc(vif)) == NULL) { + vif->vif_notxbuf++; + goto fail; } - buf = &sc->sc_txbufs[ve->qe_index]; + mutex_exit(&vif->vif_mutex); - /* Use the inline buffer of the first entry for the virtio_net_hdr. */ - (void) memset(buf->tb_inline_mapping.vbm_buf, 0, - sizeof (struct virtio_net_hdr)); + /* + * Use the inline buffer for the virtio net header. Zero the portion + * of our DMA allocation prior to the packet data. + */ + vnh = virtio_dma_va(tb->tb_dma, 0); + bzero(vnh, VIOIF_HEADER_SKIP); - net_header = (struct virtio_net_hdr *)buf->tb_inline_mapping.vbm_buf; + /* + * For legacy devices, and those that have not negotiated + * VIRTIO_F_ANY_LAYOUT, the virtio net header must appear in a separate + * descriptor entry to the rest of the buffer. 
+ */ + if (virtio_chain_append(tb->tb_chain, + virtio_dma_cookie_pa(tb->tb_dma, 0), sizeof (struct virtio_net_hdr), + VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) { + mutex_enter(&vif->vif_mutex); + vif->vif_notxbuf++; + goto fail; + } - mac_hcksum_get(mp, &csum_start, &csum_stuff, NULL, - NULL, &csum_flags); + mac_hcksum_get(mp, &csum_start, &csum_stuff, NULL, NULL, &csum_flags); - /* They want us to do the TCP/UDP csum calculation. */ + /* + * They want us to do the TCP/UDP csum calculation. + */ if (csum_flags & HCK_PARTIALCKSUM) { - struct ether_header *eth_header; int eth_hsize; - /* Did we ask for it? */ - ASSERT(sc->sc_tx_csum); + /* + * Did we ask for it? + */ + ASSERT(vif->vif_tx_csum); - /* We only asked for partial csum packets. */ + /* + * We only asked for partial csum packets. + */ ASSERT(!(csum_flags & HCK_IPV4_HDRCKSUM)); ASSERT(!(csum_flags & HCK_FULLCKSUM)); - eth_header = (void *) mp->b_rptr; - if (eth_header->ether_type == htons(ETHERTYPE_VLAN)) { + if (ether->ether_type == htons(ETHERTYPE_VLAN)) { eth_hsize = sizeof (struct ether_vlan_header); } else { eth_hsize = sizeof (struct ether_header); } - net_header->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - net_header->csum_start = eth_hsize + csum_start; - net_header->csum_offset = csum_stuff - csum_start; + + vnh->vnh_flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + vnh->vnh_csum_start = eth_hsize + csum_start; + vnh->vnh_csum_offset = csum_stuff - csum_start; } - /* setup LSO fields if required */ + /* + * Setup LSO fields if required. + */ if (lso_required) { - net_header->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; - net_header->gso_size = (uint16_t)lso_mss; + vnh->vnh_gso_type = VIRTIO_NET_HDR_GSO_TCPV4; + vnh->vnh_gso_size = (uint16_t)lso_mss; } - virtio_ve_add_indirect_buf(ve, - buf->tb_inline_mapping.vbm_dmac.dmac_laddress, - sizeof (struct virtio_net_hdr), B_TRUE); - - /* meanwhile update the statistic */ - if (mp->b_rptr[0] & 0x1) { - if (bcmp(mp->b_rptr, vioif_broadcast, ETHERADDRL) != 0) - sc->sc_multixmt++; - else - sc->sc_brdcstxmt++; + /* + * The device does not maintain its own statistics about broadcast or + * multicast packets, so we have to check the destination address + * ourselves. + */ + if ((ether->ether_dhost.ether_addr_octet[0] & 0x01) != 0) { + mutex_enter(&vif->vif_mutex); + if (ether_cmp(ðer->ether_dhost, vioif_broadcast) == 0) { + vif->vif_brdcstxmt++; + } else { + vif->vif_multixmt++; + } + mutex_exit(&vif->vif_mutex); } /* - * We copy small packets into the inline buffer. The bigger ones - * get mapped using the mapped buffer. + * For small packets, copy into the preallocated inline buffer rather + * than incur the overhead of mapping. Note that both of these + * functions ensure that "mp" is freed before returning. 
*/ - if (msg_size < sc->sc_txcopy_thresh) { - vioif_tx_inline(sc, ve, mp, msg_size); + if (msg_size < vif->vif_txcopy_thresh) { + ret = vioif_tx_inline(vif, tb, mp, msg_size); } else { - /* statistic gets updated by vioif_tx_external when fail */ - ret = vioif_tx_external(sc, ve, mp, msg_size); - if (ret != DDI_SUCCESS) - goto exit_tx_external; + ret = vioif_tx_external(vif, tb, mp, msg_size); } + mp = NULL; - virtio_push_chain(ve, B_TRUE); - - sc->sc_opackets++; - sc->sc_obytes += msg_size; + mutex_enter(&vif->vif_mutex); - return (B_TRUE); + if (ret != DDI_SUCCESS) { + goto fail; + } -exit_tx_external: + vif->vif_opackets++; + vif->vif_obytes += msg_size; + mutex_exit(&vif->vif_mutex); - vq_free_entry(sc->sc_tx_vq, ve); - /* - * vioif_tx_external can fail when the buffer does not fit into the - * indirect descriptor table. Free the mp. I don't expect this ever - * to happen. - */ - freemsg(mp); + virtio_dma_sync(tb->tb_dma, DDI_DMA_SYNC_FORDEV); + virtio_chain_submit(tb->tb_chain, B_TRUE); return (B_TRUE); + +fail: + vif->vif_oerrors++; + if (tb != NULL) { + vioif_txbuf_free(vif, tb); + } + mutex_exit(&vif->vif_mutex); + + return (mp == NULL); } static mblk_t * -vioif_tx(void *arg, mblk_t *mp) +vioif_m_tx(void *arg, mblk_t *mp) { - struct vioif_softc *sc = arg; + vioif_t *vif = arg; mblk_t *nmp; /* * Prior to attempting to send any more frames, do a reclaim to pick up * any descriptors which have been processed by the host. */ - if (vq_num_used(sc->sc_tx_vq) != 0) { - (void) vioif_reclaim_used_tx(sc); + if (virtio_queue_nactive(vif->vif_tx_vq) != 0) { + (void) vioif_reclaim_used_tx(vif); } while (mp != NULL) { nmp = mp->b_next; mp->b_next = NULL; - if (!vioif_send(sc, mp)) { + if (!vioif_send(vif, mp)) { /* * If there are no descriptors available, try to * reclaim some, allowing a retry of the send if some * are found. */ mp->b_next = nmp; - if (vioif_reclaim_used_tx(sc) != 0) { + if (vioif_reclaim_used_tx(vif) != 0) { continue; } @@ -1315,106 +1166,116 @@ vioif_tx(void *arg, mblk_t *mp) * can begin again. For safety, make sure the periodic * reclaim is running as well. */ - mutex_enter(&sc->sc_tx_lock); - sc->sc_tx_corked = B_TRUE; - virtio_start_vq_intr(sc->sc_tx_vq); - vioif_reclaim_restart(sc); - mutex_exit(&sc->sc_tx_lock); + mutex_enter(&vif->vif_mutex); + vif->vif_tx_corked = B_TRUE; + virtio_queue_no_interrupt(vif->vif_tx_vq, B_FALSE); + vioif_reclaim_restart(vif); + mutex_exit(&vif->vif_mutex); return (mp); } mp = nmp; } /* Ensure the periodic reclaim has been started. */ - mutex_enter(&sc->sc_tx_lock); - vioif_reclaim_restart(sc); - mutex_exit(&sc->sc_tx_lock); + mutex_enter(&vif->vif_mutex); + vioif_reclaim_restart(vif); + mutex_exit(&vif->vif_mutex); return (NULL); } static int -vioif_start(void *arg) +vioif_m_start(void *arg) { - struct vioif_softc *sc = arg; - struct vq_entry *ve; - uint32_t len; + vioif_t *vif = arg; + + mutex_enter(&vif->vif_mutex); + + VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_STOPPED); + vif->vif_runstate = VIOIF_RUNSTATE_RUNNING; - mac_link_update(sc->sc_mac_handle, vioif_link_state(sc)); + mac_link_update(vif->vif_mac_handle, LINK_STATE_UP); - virtio_start_vq_intr(sc->sc_rx_vq); + virtio_queue_no_interrupt(vif->vif_rx_vq, B_FALSE); /* * Starting interrupts on the TX virtqueue is unnecessary at this time. * Descriptor reclamation is handling during transmit, via a periodic * timer, and when resources are tight, via the then-enabled interrupt. 
*/ - sc->sc_tx_drain = B_FALSE; + vif->vif_tx_drain = B_FALSE; /* - * Clear any data that arrived early on the receive queue and populate - * it with free buffers that the device can use moving forward. + * Add as many receive buffers as we can to the receive queue. If we + * cannot add any, it may be because we have stopped and started again + * and the descriptors are all in the queue already. */ - while ((ve = virtio_pull_chain(sc->sc_rx_vq, &len)) != NULL) { - virtio_free_chain(ve); - } - (void) vioif_populate_rx(sc, KM_SLEEP); + (void) vioif_add_rx(vif); + mutex_exit(&vif->vif_mutex); return (DDI_SUCCESS); } static void -vioif_stop(void *arg) +vioif_m_stop(void *arg) { - struct vioif_softc *sc = arg; + vioif_t *vif = arg; + + mutex_enter(&vif->vif_mutex); + + VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_RUNNING); + vif->vif_runstate = VIOIF_RUNSTATE_STOPPING; /* Ensure all TX descriptors have been processed and reclaimed */ - vioif_tx_drain(sc); + vioif_tx_drain(vif); - virtio_stop_vq_intr(sc->sc_rx_vq); + virtio_queue_no_interrupt(vif->vif_rx_vq, B_TRUE); + + vif->vif_runstate = VIOIF_RUNSTATE_STOPPED; + mutex_exit(&vif->vif_mutex); } static int -vioif_stat(void *arg, uint_t stat, uint64_t *val) +vioif_m_stat(void *arg, uint_t stat, uint64_t *val) { - struct vioif_softc *sc = arg; + vioif_t *vif = arg; switch (stat) { case MAC_STAT_IERRORS: - *val = sc->sc_ierrors; + *val = vif->vif_ierrors; break; case MAC_STAT_OERRORS: - *val = sc->sc_oerrors; + *val = vif->vif_oerrors; break; case MAC_STAT_MULTIRCV: - *val = sc->sc_multircv; + *val = vif->vif_multircv; break; case MAC_STAT_BRDCSTRCV: - *val = sc->sc_brdcstrcv; + *val = vif->vif_brdcstrcv; break; case MAC_STAT_MULTIXMT: - *val = sc->sc_multixmt; + *val = vif->vif_multixmt; break; case MAC_STAT_BRDCSTXMT: - *val = sc->sc_brdcstxmt; + *val = vif->vif_brdcstxmt; break; case MAC_STAT_IPACKETS: - *val = sc->sc_ipackets; + *val = vif->vif_ipackets; break; case MAC_STAT_RBYTES: - *val = sc->sc_rbytes; + *val = vif->vif_rbytes; break; case MAC_STAT_OPACKETS: - *val = sc->sc_opackets; + *val = vif->vif_opackets; break; case MAC_STAT_OBYTES: - *val = sc->sc_obytes; + *val = vif->vif_obytes; break; case MAC_STAT_NORCVBUF: - *val = sc->sc_norecvbuf; + *val = vif->vif_norecvbuf; break; case MAC_STAT_NOXMTBUF: - *val = sc->sc_notxbuf; + *val = vif->vif_notxbuf; break; case MAC_STAT_IFSPEED: /* always 1 Gbit */ @@ -1433,651 +1294,490 @@ vioif_stat(void *arg, uint_t stat, uint64_t *val) } static int -vioif_set_prop_private(struct vioif_softc *sc, const char *pr_name, +vioif_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, uint_t pr_valsize, const void *pr_val) { - _NOTE(ARGUNUSED(pr_valsize)); - - long result; + vioif_t *vif = arg; - if (strcmp(pr_name, vioif_txcopy_thresh) == 0) { + switch (pr_num) { + case MAC_PROP_MTU: { + int r; + uint32_t mtu; + if (pr_valsize < sizeof (mtu)) { + return (EOVERFLOW); + } + bcopy(pr_val, &mtu, sizeof (mtu)); - if (pr_val == NULL) + if (mtu < ETHERMIN || mtu > vif->vif_mtu_max) { return (EINVAL); + } - (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); + mutex_enter(&vif->vif_mutex); + if ((r = mac_maxsdu_update(vif->vif_mac_handle, mtu)) == 0) { + vif->vif_mtu = mtu; + } + mutex_exit(&vif->vif_mutex); - if (result < 0 || result > VIOIF_TX_THRESH_MAX) - return (EINVAL); - sc->sc_txcopy_thresh = result; + return (r); } - if (strcmp(pr_name, vioif_rxcopy_thresh) == 0) { - if (pr_val == NULL) - return (EINVAL); - - (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); + case MAC_PROP_PRIVATE: { 
+ long max, result; + uint_t *resp; + char *endptr; + + if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) { + max = VIOIF_MACPROP_TXCOPY_THRESH_MAX; + resp = &vif->vif_txcopy_thresh; + } else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) { + max = VIOIF_MACPROP_RXCOPY_THRESH_MAX; + resp = &vif->vif_rxcopy_thresh; + } else { + return (ENOTSUP); + } - if (result < 0 || result > VIOIF_RX_THRESH_MAX) + if (pr_val == NULL) { return (EINVAL); - sc->sc_rxcopy_thresh = result; - } - return (0); -} - -static int -vioif_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, - uint_t pr_valsize, const void *pr_val) -{ - struct vioif_softc *sc = arg; - const uint32_t *new_mtu; - int err; - - switch (pr_num) { - case MAC_PROP_MTU: - new_mtu = pr_val; + } - if (*new_mtu > MAX_MTU) { + if (ddi_strtol(pr_val, &endptr, 10, &result) != 0 || + *endptr != '\0' || result < 0 || result > max) { return (EINVAL); } - err = mac_maxsdu_update(sc->sc_mac_handle, *new_mtu); - if (err) { - return (err); - } - break; - case MAC_PROP_PRIVATE: - err = vioif_set_prop_private(sc, pr_name, - pr_valsize, pr_val); - if (err) - return (err); - break; + mutex_enter(&vif->vif_mutex); + *resp = result; + mutex_exit(&vif->vif_mutex); + + return (0); + } + default: return (ENOTSUP); } - - return (0); } static int -vioif_get_prop_private(struct vioif_softc *sc, const char *pr_name, +vioif_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, uint_t pr_valsize, void *pr_val) { - int err = ENOTSUP; - int value; + vioif_t *vif = arg; - if (strcmp(pr_name, vioif_txcopy_thresh) == 0) { + switch (pr_num) { + case MAC_PROP_PRIVATE: { + uint_t value; - value = sc->sc_txcopy_thresh; - err = 0; - goto done; - } - if (strcmp(pr_name, vioif_rxcopy_thresh) == 0) { + if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) { + value = vif->vif_txcopy_thresh; + } else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) { + value = vif->vif_rxcopy_thresh; + } else { + return (ENOTSUP); + } - value = sc->sc_rxcopy_thresh; - err = 0; - goto done; - } -done: - if (err == 0) { - (void) snprintf(pr_val, pr_valsize, "%d", value); - } - return (err); -} + if (snprintf(pr_val, pr_valsize, "%u", value) >= pr_valsize) { + return (EOVERFLOW); + } -static int -vioif_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, - uint_t pr_valsize, void *pr_val) -{ - struct vioif_softc *sc = arg; - int err = ENOTSUP; + return (0); + } - switch (pr_num) { - case MAC_PROP_PRIVATE: - err = vioif_get_prop_private(sc, pr_name, - pr_valsize, pr_val); - break; default: - break; + return (ENOTSUP); } - return (err); } static void -vioif_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, +vioif_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, mac_prop_info_handle_t prh) { - struct vioif_softc *sc = arg; + vioif_t *vif = arg; char valstr[64]; int value; switch (pr_num) { case MAC_PROP_MTU: - mac_prop_info_set_range_uint32(prh, ETHERMIN, MAX_MTU); - break; + mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW); + mac_prop_info_set_range_uint32(prh, ETHERMIN, vif->vif_mtu_max); + return; case MAC_PROP_PRIVATE: - bzero(valstr, sizeof (valstr)); - if (strcmp(pr_name, vioif_txcopy_thresh) == 0) { - value = sc->sc_txcopy_thresh; - } else if (strcmp(pr_name, vioif_rxcopy_thresh) == 0) { - value = sc->sc_rxcopy_thresh; + if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) { + value = VIOIF_MACPROP_TXCOPY_THRESH_DEF; + } else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) { + value = VIOIF_MACPROP_RXCOPY_THRESH_DEF; 
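The private property handlers accept the threshold as a decimal string and reject trailing garbage or out-of-range values before updating the soft state. The same validation pattern in stand-alone C, with strtol(3C) standing in for the kernel's ddi_strtol(9F) and an arbitrary limit of 640 used in place of the driver's VIOIF_MACPROP_*_THRESH_MAX constants:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Parse "str" as a decimal integer in [0, max]; return 0 and store the
 * value on success, or EINVAL if the string is malformed or out of range.
 */
static int
parse_threshold(const char *str, long max, unsigned int *resp)
{
        char *endptr;
        long result;

        errno = 0;
        result = strtol(str, &endptr, 10);
        if (errno != 0 || endptr == str || *endptr != '\0' ||
            result < 0 || result > max) {
                return (EINVAL);
        }

        *resp = (unsigned int)result;
        return (0);
}

int
main(void)
{
        unsigned int v;

        printf("\"300\" -> %d\n", parse_threshold("300", 640, &v));  /* accepted */
        printf("\"700\" -> %d\n", parse_threshold("700", 640, &v));  /* out of range */
        printf("\"12x\" -> %d\n", parse_threshold("12x", 640, &v));  /* trailing junk */
        return (0);
}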
} else { + /* + * We do not recognise this private property name. + */ return; } + mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW); (void) snprintf(valstr, sizeof (valstr), "%d", value); - break; + mac_prop_info_set_default_str(prh, valstr); + return; default: - break; + return; } } static boolean_t -vioif_getcapab(void *arg, mac_capab_t cap, void *cap_data) +vioif_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) { - struct vioif_softc *sc = arg; + vioif_t *vif = arg; switch (cap) { - case MAC_CAPAB_HCKSUM: - if (sc->sc_tx_csum) { - uint32_t *txflags = cap_data; - - *txflags = HCKSUM_INET_PARTIAL; - return (B_TRUE); + case MAC_CAPAB_HCKSUM: { + if (!vif->vif_tx_csum) { + return (B_FALSE); } - return (B_FALSE); - case MAC_CAPAB_LSO: - if (sc->sc_tx_tso4) { - mac_capab_lso_t *cap_lso = cap_data; - cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; - cap_lso->lso_basic_tcp_ipv4.lso_max = MAX_MTU; - return (B_TRUE); - } - return (B_FALSE); - default: - break; + *(uint32_t *)cap_data = HCKSUM_INET_PARTIAL; + + return (B_TRUE); } - return (B_FALSE); -} -static mac_callbacks_t vioif_m_callbacks = { - .mc_callbacks = (MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO), - .mc_getstat = vioif_stat, - .mc_start = vioif_start, - .mc_stop = vioif_stop, - .mc_setpromisc = vioif_promisc, - .mc_multicst = vioif_multicst, - .mc_unicst = vioif_unicst, - .mc_tx = vioif_tx, - /* Optional callbacks */ - .mc_reserved = NULL, /* reserved */ - .mc_ioctl = NULL, /* mc_ioctl */ - .mc_getcapab = vioif_getcapab, /* mc_getcapab */ - .mc_open = NULL, /* mc_open */ - .mc_close = NULL, /* mc_close */ - .mc_setprop = vioif_setprop, - .mc_getprop = vioif_getprop, - .mc_propinfo = vioif_propinfo, -}; + case MAC_CAPAB_LSO: { + if (!vif->vif_tx_tso4) { + return (B_FALSE); + } -static void -vioif_show_features(struct vioif_softc *sc, const char *prefix, - uint32_t features) -{ - char buf[512]; - char *bufp = buf; - char *bufend = buf + sizeof (buf); - - /* LINTED E_PTRDIFF_OVERFLOW */ - bufp += snprintf(bufp, bufend - bufp, prefix); - /* LINTED E_PTRDIFF_OVERFLOW */ - bufp += virtio_show_features(features, bufp, bufend - bufp); - *bufp = '\0'; - - /* Using '!' to only CE_NOTE this to the system log. */ - dev_err(sc->sc_dev, CE_NOTE, "!%s Vioif (%b)", buf, features, - VIRTIO_NET_FEATURE_BITS); -} + mac_capab_lso_t *lso = cap_data; + lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; + lso->lso_basic_tcp_ipv4.lso_max = VIOIF_RX_DATA_SIZE; -/* - * Find out which features are supported by the device and - * choose which ones we wish to use. - */ -static int -vioif_dev_features(struct vioif_softc *sc) -{ - uint32_t host_features; - - host_features = virtio_negotiate_features(&sc->sc_virtio, - VIRTIO_NET_F_CSUM | - VIRTIO_NET_F_HOST_TSO4 | - VIRTIO_NET_F_HOST_ECN | - VIRTIO_NET_F_MAC | - VIRTIO_NET_F_STATUS | - VIRTIO_F_RING_INDIRECT_DESC); - - vioif_show_features(sc, "Host features: ", host_features); - vioif_show_features(sc, "Negotiated features: ", - sc->sc_virtio.sc_features); - - if (!(sc->sc_virtio.sc_features & VIRTIO_F_RING_INDIRECT_DESC)) { - dev_err(sc->sc_dev, CE_WARN, - "Host does not support RING_INDIRECT_DESC. 
Cannot attach."); - return (DDI_FAILURE); + return (B_TRUE); } - return (DDI_SUCCESS); + default: + return (B_FALSE); + } } static boolean_t -vioif_has_feature(struct vioif_softc *sc, uint32_t feature) +vioif_has_feature(vioif_t *vif, uint32_t feature) { - return (virtio_has_feature(&sc->sc_virtio, feature)); + return (virtio_feature_present(vif->vif_virtio, feature)); } +/* + * Read the primary MAC address from the device if one is provided. If not, + * generate a random locally administered MAC address and write it back to the + * device. + */ static void -vioif_set_mac(struct vioif_softc *sc) +vioif_get_mac(vioif_t *vif) { - int i; - - for (i = 0; i < ETHERADDRL; i++) { - virtio_write_device_config_1(&sc->sc_virtio, - VIRTIO_NET_CONFIG_MAC + i, sc->sc_mac[i]); - } - sc->sc_mac_from_host = 0; -} + VERIFY(MUTEX_HELD(&vif->vif_mutex)); -/* Get the mac address out of the hardware, or make up one. */ -static void -vioif_get_mac(struct vioif_softc *sc) -{ - int i; - if (sc->sc_virtio.sc_features & VIRTIO_NET_F_MAC) { - for (i = 0; i < ETHERADDRL; i++) { - sc->sc_mac[i] = virtio_read_device_config_1( - &sc->sc_virtio, + if (vioif_has_feature(vif, VIRTIO_NET_F_MAC)) { + for (uint_t i = 0; i < ETHERADDRL; i++) { + vif->vif_mac[i] = virtio_dev_get8(vif->vif_virtio, VIRTIO_NET_CONFIG_MAC + i); } - sc->sc_mac_from_host = 1; - } else { - /* Get a few random bytes */ - (void) random_get_pseudo_bytes(sc->sc_mac, ETHERADDRL); - /* Make sure it's a unicast MAC */ - sc->sc_mac[0] &= ~1; - /* Set the "locally administered" bit */ - sc->sc_mac[1] |= 2; + vif->vif_mac_from_host = 1; - vioif_set_mac(sc); + return; + } - dev_err(sc->sc_dev, CE_NOTE, - "!Generated a random MAC address: %s", - ether_sprintf((struct ether_addr *)sc->sc_mac)); + /* Get a few random bytes */ + (void) random_get_pseudo_bytes(vif->vif_mac, ETHERADDRL); + /* Make sure it's a unicast MAC */ + vif->vif_mac[0] &= ~1; + /* Set the "locally administered" bit */ + vif->vif_mac[1] |= 2; + + /* + * Write the random MAC address back to the device. + */ + for (uint_t i = 0; i < ETHERADDRL; i++) { + virtio_dev_put8(vif->vif_virtio, VIRTIO_NET_CONFIG_MAC + i, + vif->vif_mac[i]); } + vif->vif_mac_from_host = 0; + + dev_err(vif->vif_dip, CE_NOTE, "!Generated a random MAC address: " + "%02x:%02x:%02x:%02x:%02x:%02x", + (uint_t)vif->vif_mac[0], (uint_t)vif->vif_mac[1], + (uint_t)vif->vif_mac[2], (uint_t)vif->vif_mac[3], + (uint_t)vif->vif_mac[4], (uint_t)vif->vif_mac[5]); } /* * Virtqueue interrupt handlers */ -/* ARGSUSED */ static uint_t -vioif_rx_handler(caddr_t arg1, caddr_t arg2) +vioif_rx_handler(caddr_t arg0, caddr_t arg1) { - struct virtio_softc *vsc = (void *) arg1; - struct vioif_softc *sc = __containerof(vsc, - struct vioif_softc, sc_virtio); + vioif_t *vif = (vioif_t *)arg0; + + mutex_enter(&vif->vif_mutex); + (void) vioif_process_rx(vif); /* - * The return values of these functions are not needed but they make - * debugging interrupts simpler because you can use them to detect when - * stuff was processed and repopulated in this handler. + * Attempt to replenish the receive queue. If we cannot add any + * descriptors here, it may be because all of the recently received + * packets were loaned up to the networking stack. 
*/ - (void) vioif_process_rx(sc); - (void) vioif_populate_rx(sc, KM_NOSLEEP); + (void) vioif_add_rx(vif); + mutex_exit(&vif->vif_mutex); return (DDI_INTR_CLAIMED); } -/* ARGSUSED */ static uint_t -vioif_tx_handler(caddr_t arg1, caddr_t arg2) +vioif_tx_handler(caddr_t arg0, caddr_t arg1) { - struct virtio_softc *vsc = (void *)arg1; - struct vioif_softc *sc = __containerof(vsc, - struct vioif_softc, sc_virtio); + vioif_t *vif = (vioif_t *)arg0; /* * The TX interrupt could race with other reclamation activity, so * interpreting the return value is unimportant. */ - (void) vioif_reclaim_used_tx(sc); + (void) vioif_reclaim_used_tx(vif); return (DDI_INTR_CLAIMED); } -static int -vioif_register_ints(struct vioif_softc *sc) -{ - int ret; - - struct virtio_int_handler vioif_vq_h[] = { - { vioif_rx_handler }, - { vioif_tx_handler }, - { NULL } - }; - - ret = virtio_register_ints(&sc->sc_virtio, NULL, vioif_vq_h); - - return (ret); -} - - static void -vioif_check_features(struct vioif_softc *sc) +vioif_check_features(vioif_t *vif) { - if (vioif_has_feature(sc, VIRTIO_NET_F_CSUM)) { - /* The GSO/GRO featured depend on CSUM, check them here. */ - sc->sc_tx_csum = 1; - sc->sc_rx_csum = 1; + VERIFY(MUTEX_HELD(&vif->vif_mutex)); - if (!vioif_has_feature(sc, VIRTIO_NET_F_GUEST_CSUM)) { - sc->sc_rx_csum = 0; - } - dev_err(sc->sc_dev, CE_NOTE, "!Csum enabled."); + vif->vif_tx_csum = 0; + vif->vif_tx_tso4 = 0; - if (vioif_has_feature(sc, VIRTIO_NET_F_HOST_TSO4)) { + if (vioif_has_feature(vif, VIRTIO_NET_F_CSUM)) { + /* + * The host will accept packets with partial checksums from us. + */ + vif->vif_tx_csum = 1; - sc->sc_tx_tso4 = 1; - /* - * We don't seem to have a way to ask the system - * not to send us LSO packets with Explicit - * Congestion Notification bit set, so we require - * the device to support it in order to do - * LSO. - */ - if (!vioif_has_feature(sc, VIRTIO_NET_F_HOST_ECN)) { - dev_err(sc->sc_dev, CE_NOTE, - "!TSO4 supported, but not ECN. " - "Not using LSO."); - sc->sc_tx_tso4 = 0; - } else { - dev_err(sc->sc_dev, CE_NOTE, "!LSO enabled"); - } + /* + * The legacy GSO feature represents the combination of + * HOST_TSO4, HOST_TSO6, and HOST_ECN. + */ + boolean_t gso = vioif_has_feature(vif, VIRTIO_NET_F_GSO); + boolean_t tso4 = vioif_has_feature(vif, VIRTIO_NET_F_HOST_TSO4); + boolean_t ecn = vioif_has_feature(vif, VIRTIO_NET_F_HOST_ECN); + + /* + * Explicit congestion notification (ECN) is configured + * globally; see "tcp_ecn_permitted". As we cannot currently + * request that the stack disable ECN on a per interface basis, + * we require the device to support the combination of + * segmentation offload and ECN support. + */ + if (gso || (tso4 && ecn)) { + vif->vif_tx_tso4 = 1; } } } static int -vioif_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd) +vioif_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { - int ret, instance; - struct vioif_softc *sc; - struct virtio_softc *vsc; - mac_register_t *macp; - char cache_name[CACHE_NAME_SIZE]; - - instance = ddi_get_instance(devinfo); - - switch (cmd) { - case DDI_ATTACH: - break; - - case DDI_RESUME: - case DDI_PM_RESUME: - /* We do not support suspend/resume for vioif. 
*/ - goto exit; + int ret; + vioif_t *vif; + virtio_t *vio; + mac_register_t *macp = NULL; - default: - goto exit; + if (cmd != DDI_ATTACH) { + return (DDI_FAILURE); } - sc = kmem_zalloc(sizeof (struct vioif_softc), KM_SLEEP); - ddi_set_driver_private(devinfo, sc); - - vsc = &sc->sc_virtio; + if ((vio = virtio_init(dip, VIRTIO_NET_WANTED_FEATURES, B_TRUE)) == + NULL) { + return (DDI_FAILURE); + } - /* Duplicate for less typing */ - sc->sc_dev = devinfo; - vsc->sc_dev = devinfo; + vif = kmem_zalloc(sizeof (*vif), KM_SLEEP); + vif->vif_dip = dip; + vif->vif_virtio = vio; + vif->vif_runstate = VIOIF_RUNSTATE_STOPPED; + ddi_set_driver_private(dip, vif); + + if ((vif->vif_rx_vq = virtio_queue_alloc(vio, VIRTIO_NET_VIRTQ_RX, + "rx", vioif_rx_handler, vif, B_FALSE, VIOIF_MAX_SEGS)) == NULL || + (vif->vif_tx_vq = virtio_queue_alloc(vio, VIRTIO_NET_VIRTQ_TX, + "tx", vioif_tx_handler, vif, B_FALSE, VIOIF_MAX_SEGS)) == NULL) { + goto fail; + } - /* - * Initialize interrupt kstat. - */ - sc->sc_intrstat = kstat_create("vioif", instance, "intr", "controller", - KSTAT_TYPE_INTR, 1, 0); - if (sc->sc_intrstat == NULL) { - dev_err(devinfo, CE_WARN, "kstat_create failed"); - goto exit_intrstat; - } - kstat_install(sc->sc_intrstat); - - /* map BAR 0 */ - ret = ddi_regs_map_setup(devinfo, 1, - (caddr_t *)&sc->sc_virtio.sc_io_addr, - 0, 0, &vioif_attr, &sc->sc_virtio.sc_ioh); - if (ret != DDI_SUCCESS) { - dev_err(devinfo, CE_WARN, "unable to map bar 0: %d", ret); - goto exit_map; + if (virtio_init_complete(vio, 0) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "failed to complete Virtio init"); + goto fail; } - virtio_device_reset(&sc->sc_virtio); - virtio_set_status(&sc->sc_virtio, VIRTIO_CONFIG_DEVICE_STATUS_ACK); - virtio_set_status(&sc->sc_virtio, VIRTIO_CONFIG_DEVICE_STATUS_DRIVER); + virtio_queue_no_interrupt(vif->vif_rx_vq, B_TRUE); + virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE); - ret = vioif_dev_features(sc); - if (ret) - goto exit_features; + mutex_init(&vif->vif_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio)); + mutex_enter(&vif->vif_mutex); - vsc->sc_nvqs = vioif_has_feature(sc, VIRTIO_NET_F_CTRL_VQ) ? 
3 : 2; + vioif_get_mac(vif); - (void) snprintf(cache_name, CACHE_NAME_SIZE, "vioif%d_rx", instance); - sc->sc_rxbuf_cache = kmem_cache_create(cache_name, - sizeof (struct vioif_rx_buf), 0, vioif_rx_construct, - vioif_rx_destruct, NULL, sc, NULL, KM_SLEEP); - if (sc->sc_rxbuf_cache == NULL) { - dev_err(sc->sc_dev, CE_WARN, "Can't allocate the buffer cache"); - goto exit_cache; - } + vif->vif_rxcopy_thresh = VIOIF_MACPROP_RXCOPY_THRESH_DEF; + vif->vif_txcopy_thresh = VIOIF_MACPROP_TXCOPY_THRESH_DEF; - ret = vioif_register_ints(sc); - if (ret) { - dev_err(sc->sc_dev, CE_WARN, - "Failed to allocate interrupt(s)!"); - goto exit_ints; + if (vioif_has_feature(vif, VIRTIO_NET_F_MTU)) { + vif->vif_mtu_max = virtio_dev_get16(vio, VIRTIO_NET_CONFIG_MTU); + } else { + vif->vif_mtu_max = ETHERMTU; } - /* - * Register layout determined, can now access the - * device-specific bits - */ - vioif_get_mac(sc); - - sc->sc_rx_vq = virtio_alloc_vq(&sc->sc_virtio, 0, - VIOIF_RX_QLEN, VIOIF_INDIRECT_MAX, "rx"); - if (!sc->sc_rx_vq) - goto exit_alloc1; - virtio_stop_vq_intr(sc->sc_rx_vq); - - sc->sc_tx_vq = virtio_alloc_vq(&sc->sc_virtio, 1, - VIOIF_TX_QLEN, VIOIF_INDIRECT_MAX, "tx"); - if (!sc->sc_tx_vq) - goto exit_alloc2; - virtio_stop_vq_intr(sc->sc_tx_vq); - - mutex_init(&sc->sc_tx_lock, NULL, MUTEX_DRIVER, - DDI_INTR_PRI(sc->sc_virtio.sc_intr_prio)); - - if (vioif_has_feature(sc, VIRTIO_NET_F_CTRL_VQ)) { - sc->sc_ctrl_vq = virtio_alloc_vq(&sc->sc_virtio, 2, - VIOIF_CTRL_QLEN, 0, "ctrl"); - if (!sc->sc_ctrl_vq) { - goto exit_alloc3; - } - virtio_stop_vq_intr(sc->sc_ctrl_vq); + vif->vif_mtu = ETHERMTU; + if (vif->vif_mtu > vif->vif_mtu_max) { + vif->vif_mtu = vif->vif_mtu_max; } - virtio_set_status(&sc->sc_virtio, - VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK); - - sc->sc_rxloan = 0; + vioif_check_features(vif); - /* set some reasonable-small default values */ - sc->sc_rxcopy_thresh = 300; - sc->sc_txcopy_thresh = 300; - sc->sc_mtu = ETHERMTU; + if (vioif_alloc_bufs(vif) != 0) { + mutex_exit(&vif->vif_mutex); + dev_err(dip, CE_WARN, "failed to allocate memory"); + goto fail; + } - vioif_check_features(sc); + mutex_exit(&vif->vif_mutex); - if (vioif_alloc_mems(sc) != 0) - goto exit_alloc_mems; + if (virtio_interrupts_enable(vio) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "failed to enable interrupts"); + goto fail; + } if ((macp = mac_alloc(MAC_VERSION)) == NULL) { - dev_err(devinfo, CE_WARN, "Failed to allocate a mac_register"); - goto exit_macalloc; + dev_err(dip, CE_WARN, "failed to allocate a mac_register"); + goto fail; } macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER; - macp->m_driver = sc; - macp->m_dip = devinfo; - macp->m_src_addr = sc->sc_mac; - macp->m_callbacks = &vioif_m_callbacks; + macp->m_driver = vif; + macp->m_dip = dip; + macp->m_src_addr = vif->vif_mac; + macp->m_callbacks = &vioif_mac_callbacks; macp->m_min_sdu = 0; - macp->m_max_sdu = sc->sc_mtu; + macp->m_max_sdu = vif->vif_mtu; macp->m_margin = VLAN_TAGSZ; macp->m_priv_props = vioif_priv_props; - sc->sc_macp = macp; - - /* Pre-fill the rx ring. 
*/ - (void) vioif_populate_rx(sc, KM_SLEEP); - - ret = mac_register(macp, &sc->sc_mac_handle); - if (ret != 0) { - dev_err(devinfo, CE_WARN, "vioif_attach: " - "mac_register() failed, ret=%d", ret); - goto exit_register; + if ((ret = mac_register(macp, &vif->vif_mac_handle)) != 0) { + dev_err(dip, CE_WARN, "mac_register() failed (%d)", ret); + goto fail; } + mac_free(macp); - ret = virtio_enable_ints(&sc->sc_virtio); - if (ret) { - dev_err(devinfo, CE_WARN, "Failed to enable interrupts"); - goto exit_enable_ints; - } + mac_link_update(vif->vif_mac_handle, LINK_STATE_UP); - mac_link_update(sc->sc_mac_handle, LINK_STATE_UP); return (DDI_SUCCESS); -exit_enable_ints: - (void) mac_unregister(sc->sc_mac_handle); -exit_register: - mac_free(macp); -exit_macalloc: - vioif_free_mems(sc); -exit_alloc_mems: - virtio_release_ints(&sc->sc_virtio); - if (sc->sc_ctrl_vq) - virtio_free_vq(sc->sc_ctrl_vq); -exit_alloc3: - virtio_free_vq(sc->sc_tx_vq); -exit_alloc2: - virtio_free_vq(sc->sc_rx_vq); -exit_alloc1: -exit_ints: - kmem_cache_destroy(sc->sc_rxbuf_cache); -exit_cache: -exit_features: - virtio_set_status(&sc->sc_virtio, VIRTIO_CONFIG_DEVICE_STATUS_FAILED); - ddi_regs_map_free(&sc->sc_virtio.sc_ioh); -exit_intrstat: -exit_map: - kstat_delete(sc->sc_intrstat); - kmem_free(sc, sizeof (struct vioif_softc)); -exit: +fail: + vioif_free_bufs(vif); + if (macp != NULL) { + mac_free(macp); + } + (void) virtio_fini(vio, B_TRUE); + kmem_free(vif, sizeof (*vif)); return (DDI_FAILURE); } static int -vioif_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd) +vioif_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { - struct vioif_softc *sc; + int r; + vioif_t *vif; - if ((sc = ddi_get_driver_private(devinfo)) == NULL) + if (cmd != DDI_DETACH) { return (DDI_FAILURE); + } - switch (cmd) { - case DDI_DETACH: - break; + if ((vif = ddi_get_driver_private(dip)) == NULL) { + return (DDI_FAILURE); + } - case DDI_PM_SUSPEND: - /* We do not support suspend/resume for vioif. */ + mutex_enter(&vif->vif_mutex); + if (vif->vif_runstate != VIOIF_RUNSTATE_STOPPED) { + dev_err(dip, CE_WARN, "!NIC still running, cannot detach"); + mutex_exit(&vif->vif_mutex); return (DDI_FAILURE); + } - default: + /* + * There should be no outstanding transmit buffers once the NIC is + * completely stopped. + */ + VERIFY3U(vif->vif_ntxbufs_alloc, ==, 0); + + /* + * Though we cannot claw back all of the receive buffers until we reset + * the device, we must ensure all those loaned to MAC have been + * returned before calling mac_unregister(). + */ + if (vif->vif_nrxbufs_onloan > 0) { + dev_err(dip, CE_WARN, "!%u receive buffers still loaned, " + "cannot detach", vif->vif_nrxbufs_onloan); + mutex_exit(&vif->vif_mutex); return (DDI_FAILURE); } - if (sc->sc_rxloan > 0) { - dev_err(devinfo, CE_WARN, "!Some rx buffers are still upstream," - " not detaching."); + if ((r = mac_unregister(vif->vif_mac_handle)) != 0) { + dev_err(dip, CE_WARN, "!MAC unregister failed (%d)", r); return (DDI_FAILURE); } + mac_free(vif->vif_macp); - virtio_stop_vq_intr(sc->sc_rx_vq); - virtio_stop_vq_intr(sc->sc_tx_vq); + /* + * Shut down the device so that we can recover any previously + * submitted receive buffers. 
+ */ + virtio_shutdown(vif->vif_virtio); + for (;;) { + virtio_chain_t *vic; - virtio_release_ints(&sc->sc_virtio); + if ((vic = virtio_queue_evacuate(vif->vif_rx_vq)) == NULL) { + break; + } - if (mac_unregister(sc->sc_mac_handle)) { - return (DDI_FAILURE); + vioif_rxbuf_t *rb = virtio_chain_data(vic); + vioif_rxbuf_free(vif, rb); } - mac_free(sc->sc_macp); - - vioif_free_mems(sc); - virtio_free_vq(sc->sc_rx_vq); - virtio_free_vq(sc->sc_tx_vq); + (void) virtio_fini(vif->vif_virtio, B_FALSE); - virtio_device_reset(&sc->sc_virtio); + vioif_free_bufs(vif); - ddi_regs_map_free(&sc->sc_virtio.sc_ioh); + mutex_exit(&vif->vif_mutex); + mutex_destroy(&vif->vif_mutex); - kmem_cache_destroy(sc->sc_rxbuf_cache); - kstat_delete(sc->sc_intrstat); - kmem_free(sc, sizeof (struct vioif_softc)); + kmem_free(vif, sizeof (*vif)); return (DDI_SUCCESS); } static int -vioif_quiesce(dev_info_t *devinfo) +vioif_quiesce(dev_info_t *dip) { - struct vioif_softc *sc; + vioif_t *vif; - if ((sc = ddi_get_driver_private(devinfo)) == NULL) + if ((vif = ddi_get_driver_private(dip)) == NULL) return (DDI_FAILURE); - virtio_stop_vq_intr(sc->sc_rx_vq); - virtio_stop_vq_intr(sc->sc_tx_vq); - virtio_device_reset(&sc->sc_virtio); - - return (DDI_SUCCESS); + return (virtio_quiesce(vif->vif_virtio)); } int _init(void) { - int ret = 0; + int ret; - mac_init_ops(&vioif_ops, "vioif"); + mac_init_ops(&vioif_dev_ops, "vioif"); - ret = mod_install(&modlinkage); - if (ret != DDI_SUCCESS) { - mac_fini_ops(&vioif_ops); - return (ret); + if ((ret = mod_install(&vioif_modlinkage)) != DDI_SUCCESS) { + mac_fini_ops(&vioif_dev_ops); } - return (0); + return (ret); } int @@ -2085,16 +1785,15 @@ _fini(void) { int ret; - ret = mod_remove(&modlinkage); - if (ret == DDI_SUCCESS) { - mac_fini_ops(&vioif_ops); + if ((ret = mod_remove(&vioif_modlinkage)) == DDI_SUCCESS) { + mac_fini_ops(&vioif_dev_ops); } return (ret); } int -_info(struct modinfo *pModinfo) +_info(struct modinfo *modinfop) { - return (mod_info(&modlinkage, pModinfo)); + return (mod_info(&vioif_modlinkage, modinfop)); } diff --git a/usr/src/uts/common/io/vioif/vioif.h b/usr/src/uts/common/io/vioif/vioif.h new file mode 100644 index 0000000000..51dbc1acd4 --- /dev/null +++ b/usr/src/uts/common/io/vioif/vioif.h @@ -0,0 +1,432 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +/* + * VIRTIO NETWORK DRIVER + */ + +#ifndef _VIOIF_H +#define _VIOIF_H + +#include "virtio.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * VIRTIO NETWORK CONFIGURATION REGISTERS + * + * These are offsets into the device-specific configuration space available + * through the virtio_dev_*() family of functions. + */ +#define VIRTIO_NET_CONFIG_MAC 0x00 /* 48 R/W */ +#define VIRTIO_NET_CONFIG_STATUS 0x06 /* 16 R */ +#define VIRTIO_NET_CONFIG_MAX_VQ_PAIRS 0x08 /* 16 R */ +#define VIRTIO_NET_CONFIG_MTU 0x0A /* 16 R */ + +/* + * VIRTIO NETWORK VIRTQUEUES + * + * Note that the control queue is only present if VIRTIO_NET_F_CTRL_VQ is + * negotiated with the device. 
+ */ +#define VIRTIO_NET_VIRTQ_RX 0 +#define VIRTIO_NET_VIRTQ_TX 1 +#define VIRTIO_NET_VIRTQ_CONTROL 2 + +/* + * VIRTIO NETWORK FEATURE BITS + */ + +/* + * CSUM, GUEST_CSUM: + * Partial checksum support. These features signal that the device will + * accept packets with partial checksums (CSUM), and that the driver will + * accept packets with partial checksums (GUEST_CSUM). These features + * combine the use of the VIRTIO_NET_HDR_F_NEEDS_CSUM flag, and the + * "csum_start" and "csum_offset" fields, in the virtio net header. + */ +#define VIRTIO_NET_F_CSUM (1ULL << 0) +#define VIRTIO_NET_F_GUEST_CSUM (1ULL << 1) + +/* + * MTU: + * The device offers a maximum MTU value at VIRTIO_NET_CONFIG_MTU. If + * this is not negotiated, we allow the largest possible MTU that our + * buffer allocations support in case jumbo frames are tacitly supported + * by the device. The default MTU is always 1500. + */ +#define VIRTIO_NET_F_MTU (1ULL << 3) + +/* + * MAC: + * The device has an assigned primary MAC address. If this feature bit is + * not set, the driver must provide a locally assigned MAC address. See + * IEEE 802, "48-bit universal LAN MAC addresses" for more details on + * assignment. + */ +#define VIRTIO_NET_F_MAC (1ULL << 5) + +/* + * GUEST_TSO4, GUEST_TSO6, GUEST_UFO: + * Inbound segmentation offload support. These features depend on having + * VIRTIO_NET_F_GUEST_CSUM and signal that the driver can accept large + * combined TCP (v4 or v6) packets, or reassembled UDP fragments. + */ +#define VIRTIO_NET_F_GUEST_TSO4 (1ULL << 7) +#define VIRTIO_NET_F_GUEST_TSO6 (1ULL << 8) +#define VIRTIO_NET_F_GUEST_UFO (1ULL << 10) + +/* + * GUEST_ECN: + * Depends on either VIRTIO_NET_F_GUEST_TSO4 or VIRTIO_NET_F_GUEST_TSO6. + * This feature means the driver will look for the VIRTIO_NET_HDR_GSO_ECN + * bit in the "gso_type" of the virtio net header. This bit tells the + * driver that the Explicit Congestion Notification (ECN) bit was set in + * the original TCP packets. + */ +#define VIRTIO_NET_F_GUEST_ECN (1ULL << 9) + +/* + * HOST_TSO4, HOST_TSO6, HOST_UFO: + * Outbound segmentation offload support. These features depend on having + * VIRTIO_NET_F_CSUM and signal that the device will accept large combined + * TCP (v4 or v6) packets that require segmentation offload, or large + * combined UDP packets that require fragmentation offload. + */ +#define VIRTIO_NET_F_HOST_TSO4 (1ULL << 11) +#define VIRTIO_NET_F_HOST_TSO6 (1ULL << 12) +#define VIRTIO_NET_F_HOST_UFO (1ULL << 14) + +/* + * HOST_ECN: + * Depends on either VIRTIO_NET_F_HOST_TSO4 or VIRTIO_NET_F_HOST_TSO6. + * This features means the device will accept packets that both require + * segmentation offload and have the Explicit Congestion Notification + * (ECN) bit set. If this feature is not present, the device must not + * send large segments that require ECN to be set. + */ +#define VIRTIO_NET_F_HOST_ECN (1ULL << 13) + +/* + * GSO: + * The GSO feature is, in theory, the combination of HOST_TSO4, HOST_TSO6, + * and HOST_ECN. This is only useful for legacy devices; newer devices + * should be using the more specific bits above. + */ +#define VIRTIO_NET_F_GSO (1ULL << 6) + +/* + * MRG_RXBUF: + * This feature allows the receipt of large packets without needing to + * allocate large buffers. The "virtio_net_hdr" will include an extra + * value: the number of buffers to gang together. 
+ */ +#define VIRTIO_NET_F_MRG_RXBUF (1ULL << 15) + +/* + * STATUS: + * The VIRTIO_NET_CONFIG_STATUS configuration register is available, which + * allows the driver to read the link state from the device. + */ +#define VIRTIO_NET_F_STATUS (1ULL << 16) + +/* + * CTRL_VQ, CTRL_RX, CTRL_VLAN: + * These features signal that the device exposes the control queue + * (VIRTIO_NET_VIRTQ_CONTROL), in the case of CTRL_VQ; and that the + * control queue supports extra commands (CTRL_RX, CTRL_VLAN). + */ +#define VIRTIO_NET_F_CTRL_VQ (1ULL << 17) +#define VIRTIO_NET_F_CTRL_RX (1ULL << 18) +#define VIRTIO_NET_F_CTRL_VLAN (1ULL << 19) +#define VIRTIO_NET_F_CTRL_RX_EXTRA (1ULL << 20) + +/* + * These features are supported by the driver and we will request them from the + * device. Note that we do not currently request GUEST_CSUM, as the driver + * does not presently support receiving frames with any offload features from + * the device. + */ +#define VIRTIO_NET_WANTED_FEATURES (VIRTIO_NET_F_CSUM | \ + VIRTIO_NET_F_GSO | \ + VIRTIO_NET_F_HOST_TSO4 | \ + VIRTIO_NET_F_HOST_ECN | \ + VIRTIO_NET_F_MAC | \ + VIRTIO_NET_F_MTU) + +/* + * VIRTIO NETWORK HEADER + * + * This structure appears at the start of each transmit or receive packet + * buffer. + */ +struct virtio_net_hdr { + uint8_t vnh_flags; + uint8_t vnh_gso_type; + uint16_t vnh_hdr_len; + uint16_t vnh_gso_size; + uint16_t vnh_csum_start; + uint16_t vnh_csum_offset; +} __packed; + +/* + * VIRTIO NETWORK HEADER: FLAGS (vnh_flags) + */ +#define VIRTIO_NET_HDR_F_NEEDS_CSUM 0x01 + +/* + * VIRTIO NETWORK HEADER: OFFLOAD OPTIONS (vnh_gso_type) + * + * Each of these is an offload type, except for the ECN value which is + * logically OR-ed with one of the other types. + */ +#define VIRTIO_NET_HDR_GSO_NONE 0 +#define VIRTIO_NET_HDR_GSO_TCPV4 1 +#define VIRTIO_NET_HDR_GSO_UDP 3 +#define VIRTIO_NET_HDR_GSO_TCPV6 4 +#define VIRTIO_NET_HDR_GSO_ECN 0x80 + + +/* + * DRIVER PARAMETERS + */ + +/* + * At attach, we allocate a fixed pool of buffers for receipt and transmission + * of frames. The maximum number of buffers of each type that we will allocate + * is specified here. If the ring size is smaller than this number, we will + * use the ring size instead. + */ +#define VIRTIO_NET_TX_BUFS 256 +#define VIRTIO_NET_RX_BUFS 256 + +/* + * The virtio net header and the first buffer segment share the same DMA + * allocation. We round up the virtio header size to a multiple of 4 and add 2 + * bytes so that the IP header, which starts immediately after the 14 or 18 + * byte Ethernet header, is then correctly aligned: + * + * 0 10 16 18 32/36 + * | virtio_net_hdr | %4==0 | +2 | Ethernet header (14/18 bytes) | IPv4 ... + * + * Note that for this to work correctly, the DMA allocation must also be 4 byte + * aligned. + */ +#define VIOIF_HEADER_ALIGN 4 +#define VIOIF_HEADER_SKIP (P2ROUNDUP( \ + sizeof (struct virtio_net_hdr), \ + VIOIF_HEADER_ALIGN) + 2) + +/* + * Given we are not negotiating VIRTIO_NET_F_MRG_RXBUF, the specification says + * we must be able to accept a 1514 byte packet, or if any segmentation offload + * features have been negotiated a 65550 byte packet. To keep things simple, + * we'll assume segmentation offload is possible in most cases. In addition to + * the packet payload, we need to account for the Ethernet header and the + * virtio_net_hdr. 
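These feature bits are combined in vioif_check_features() when deciding whether to advertise LSO: checksum offload is a prerequisite, and the host must either offer the legacy GSO bit or both HOST_TSO4 and HOST_ECN. A stand-alone sketch of that decision over a negotiated feature mask; lso_usable is an illustrative name, and the bit values are copied from the definitions in this header:

#include <stdint.h>
#include <stdio.h>

#define VIRTIO_NET_F_CSUM       (1ULL << 0)
#define VIRTIO_NET_F_GSO        (1ULL << 6)
#define VIRTIO_NET_F_HOST_TSO4  (1ULL << 11)
#define VIRTIO_NET_F_HOST_ECN   (1ULL << 13)

/*
 * Given the feature mask negotiated with the device, decide whether
 * TCP/IPv4 LSO can be offered: checksum offload must be present, and the
 * host must accept ECN-marked large segments, either via the legacy GSO
 * bit or via HOST_TSO4 together with HOST_ECN.
 */
static int
lso_usable(uint64_t features)
{
        if (!(features & VIRTIO_NET_F_CSUM))
                return (0);

        if (features & VIRTIO_NET_F_GSO)
                return (1);

        return ((features & VIRTIO_NET_F_HOST_TSO4) != 0 &&
            (features & VIRTIO_NET_F_HOST_ECN) != 0);
}

int
main(void)
{
        uint64_t with_ecn = VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 |
            VIRTIO_NET_F_HOST_ECN;
        uint64_t no_ecn = VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4;

        printf("TSO4+ECN:  lso=%d\n", lso_usable(with_ecn));    /* 1 */
        printf("TSO4 only: lso=%d\n", lso_usable(no_ecn));      /* 0 */
        return (0);
}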
+ */ +#define VIOIF_RX_DATA_SIZE 65550 +#define VIOIF_RX_BUF_SIZE (VIOIF_RX_DATA_SIZE + \ + sizeof (struct ether_header) + \ + VIOIF_HEADER_SKIP) + +/* + * If we assume that a large allocation will probably have mostly 4K page sized + * cookies, 64 segments allows us 256KB for a single frame. We're in control + * of the allocation we use for receive buffers, so this value only has an + * impact on the length of chain we're able to create for external transmit + * buffer mappings. + */ +#define VIOIF_MAX_SEGS 64 + +/* + * We pre-allocate a reasonably large buffer to copy small packets + * there. Bigger packets are mapped, packets with multiple + * cookies are mapped as indirect buffers. + */ +#define VIOIF_TX_INLINE_SIZE (2 * 1024) + + +/* + * TYPE DEFINITIONS + */ + +typedef struct vioif vioif_t; + +/* + * Receive buffers are allocated in advance as a combination of DMA memory and + * a descriptor chain. Receive buffers can be loaned to the networking stack + * to avoid copying, and this object contains the free routine to pass to + * desballoc(). + * + * When receive buffers are not in use, they are linked into the per-instance + * free list, "vif_rxbufs" via "rb_link". Under normal conditions, we expect + * the free list to be empty much of the time; most buffers will be in the ring + * or on loan. + */ +typedef struct vioif_rxbuf { + vioif_t *rb_vioif; + frtn_t rb_frtn; + + virtio_dma_t *rb_dma; + virtio_chain_t *rb_chain; + + list_node_t rb_link; +} vioif_rxbuf_t; + +/* + * Transmit buffers are also allocated in advance. DMA memory is allocated for + * the virtio net header, and to hold small packets. Larger packets are mapped + * from storage loaned to the driver by the network stack. + * + * When transmit buffers are not in use, they are linked into the per-instance + * free list, "vif_txbufs" via "tb_link". + */ +typedef struct vioif_txbuf { + mblk_t *tb_mp; + + /* + * Inline buffer space (VIOIF_TX_INLINE_SIZE) for storage of the virtio + * net header, and to hold copied (rather than mapped) packet data. + */ + virtio_dma_t *tb_dma; + virtio_chain_t *tb_chain; + + /* + * External buffer mapping. The capacity is fixed at allocation time, + * and "tb_ndmaext" tracks the current number of mappings. + */ + virtio_dma_t **tb_dmaext; + uint_t tb_dmaext_capacity; + uint_t tb_ndmaext; + + list_node_t tb_link; +} vioif_txbuf_t; + +typedef enum vioif_runstate { + VIOIF_RUNSTATE_STOPPED = 1, + VIOIF_RUNSTATE_STOPPING, + VIOIF_RUNSTATE_RUNNING +} vioif_runstate_t; + +/* + * Per-instance driver object. + */ +struct vioif { + dev_info_t *vif_dip; + virtio_t *vif_virtio; + + kmutex_t vif_mutex; + + /* + * The NIC is considered RUNNING between the mc_start(9E) and + * mc_stop(9E) calls. Otherwise it is STOPPING (while draining + * resources) then STOPPED. When not RUNNING, we will drop incoming + * frames and refuse to insert more receive buffers into the receive + * queue. + */ + vioif_runstate_t vif_runstate; + + mac_handle_t vif_mac_handle; + mac_register_t *vif_macp; + + virtio_queue_t *vif_rx_vq; + virtio_queue_t *vif_tx_vq; + + /* TX virtqueue management resources */ + boolean_t vif_tx_corked; + boolean_t vif_tx_drain; + timeout_id_t vif_tx_reclaim_tid; + + /* + * Configured offload features: + */ + unsigned int vif_tx_csum:1; + unsigned int vif_tx_tso4:1; + + /* + * For debugging, it is useful to know whether the MAC address we + * are using came from the host (via VIRTIO_NET_CONFIG_MAC) or + * was otherwise generated or set from within the guest. 
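Loaning a receive buffer upstream as described above rests on desballoc(9F): the returned mblk points directly at the driver's DMA memory, and the frtn_t callback fires when the network stack eventually frees it, at which point the buffer can be recycled. A schematic sketch of the pattern with hypothetical xx_* names; the driver's actual receive path differs in detail:

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>

typedef struct xx_rxbuf {
        frtn_t          xr_frtn;        /* free routine passed to desballoc */
        uchar_t         *xr_va;         /* kernel virtual address of DMA memory */
        size_t          xr_size;        /* size of the buffer */
} xx_rxbuf_t;

static void
xx_rxbuf_recycle(xx_rxbuf_t *rb)
{
        /* In a real driver: return "rb" to the free list or the RX ring. */
}

/* Called by STREAMS when the loaned mblk is finally freed upstream. */
static void
xx_rxbuf_free_cb(caddr_t arg)
{
        xx_rxbuf_recycle((xx_rxbuf_t *)arg);
}

/* Wrap the received payload in an mblk without copying it. */
static mblk_t *
xx_rxbuf_loan(xx_rxbuf_t *rb, size_t offset, size_t len)
{
        mblk_t *mp;

        rb->xr_frtn.free_func = xx_rxbuf_free_cb;
        rb->xr_frtn.free_arg = (caddr_t)rb;

        if ((mp = desballoc(rb->xr_va + offset, len, BPRI_MED,
            &rb->xr_frtn)) == NULL) {
                /* Caller falls back to copying the data instead. */
                return (NULL);
        }

        mp->b_wptr = mp->b_rptr + len;
        return (mp);
}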
+ */ + unsigned int vif_mac_from_host:1; + + uint_t vif_mtu; + uint_t vif_mtu_max; + uint8_t vif_mac[ETHERADDRL]; + + /* + * Receive buffer free list and accounting: + */ + list_t vif_rxbufs; + uint_t vif_nrxbufs_alloc; + uint_t vif_nrxbufs_onloan; + uint_t vif_nrxbufs_onloan_max; + uint_t vif_rxbufs_capacity; + vioif_rxbuf_t *vif_rxbufs_mem; + + /* + * Transmit buffer free list and accounting: + */ + list_t vif_txbufs; + uint_t vif_ntxbufs_alloc; + uint_t vif_txbufs_capacity; + vioif_txbuf_t *vif_txbufs_mem; + + /* + * These copy size thresholds are exposed as private MAC properties so + * that they can be tuned without rebooting. + */ + uint_t vif_rxcopy_thresh; + uint_t vif_txcopy_thresh; + + /* + * Statistics visible through mac: + */ + uint64_t vif_ipackets; + uint64_t vif_opackets; + uint64_t vif_rbytes; + uint64_t vif_obytes; + uint64_t vif_brdcstxmt; + uint64_t vif_brdcstrcv; + uint64_t vif_multixmt; + uint64_t vif_multircv; + uint64_t vif_norecvbuf; + uint64_t vif_notxbuf; + uint64_t vif_ierrors; + uint64_t vif_oerrors; + + /* + * Internal debugging statistics: + */ + uint64_t vif_rxfail_dma_handle; + uint64_t vif_rxfail_dma_buffer; + uint64_t vif_rxfail_dma_bind; + uint64_t vif_rxfail_chain_undersize; + uint64_t vif_rxfail_no_descriptors; + uint64_t vif_txfail_dma_handle; + uint64_t vif_txfail_dma_bind; + uint64_t vif_txfail_indirect_limit; + + uint64_t vif_stat_tx_reclaim; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _VIOIF_H */ diff --git a/usr/src/uts/common/io/virtio/virtio.c b/usr/src/uts/common/io/virtio/virtio.c deleted file mode 100644 index 19a66b8f38..0000000000 --- a/usr/src/uts/common/io/virtio/virtio.c +++ /dev/null @@ -1,1364 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2013 Nexenta Systems, Inc. All rights reserved. - * Copyright 2012 Alexey Zaytsev <alexey.zaytsev@gmail.com> - * Copyright (c) 2016 by Delphix. All rights reserved. - * Copyright 2017 Joyent, Inc. - */ - -/* Based on the NetBSD virtio driver by Minoura Makoto. */ -/* - * Copyright (c) 2010 Minoura Makoto. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -#include <sys/conf.h> -#include <sys/kmem.h> -#include <sys/debug.h> -#include <sys/modctl.h> -#include <sys/autoconf.h> -#include <sys/ddi_impldefs.h> -#include <sys/ddi.h> -#include <sys/sunddi.h> -#include <sys/sunndi.h> -#include <sys/avintr.h> -#include <sys/spl.h> -#include <sys/promif.h> -#include <sys/list.h> -#include <sys/bootconf.h> -#include <sys/bootsvcs.h> -#include <sys/sysmacros.h> -#include <sys/pci.h> - -#include "virtiovar.h" -#include "virtioreg.h" - -#define NDEVNAMES (sizeof (virtio_device_name) / sizeof (char *)) -#define MINSEG_INDIRECT 2 /* use indirect if nsegs >= this value */ -#define VIRTQUEUE_ALIGN(n) (((n)+(VIRTIO_PAGE_SIZE-1)) & \ - ~(VIRTIO_PAGE_SIZE-1)) - -void -virtio_set_status(struct virtio_softc *sc, unsigned int status) -{ - int old = 0; - - if (status != 0) { - old = ddi_get8(sc->sc_ioh, (uint8_t *)(sc->sc_io_addr + - VIRTIO_CONFIG_DEVICE_STATUS)); - } - - ddi_put8(sc->sc_ioh, (uint8_t *)(sc->sc_io_addr + - VIRTIO_CONFIG_DEVICE_STATUS), status | old); -} - -/* - * Negotiate features, save the result in sc->sc_features - */ -uint32_t -virtio_negotiate_features(struct virtio_softc *sc, uint32_t guest_features) -{ - uint32_t host_features; - uint32_t features; - - host_features = ddi_get32(sc->sc_ioh, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - (uint32_t *)(sc->sc_io_addr + VIRTIO_CONFIG_DEVICE_FEATURES)); - - dev_debug(sc->sc_dev, CE_NOTE, "host features: %x, guest features: %x", - host_features, guest_features); - - features = host_features & guest_features; - ddi_put32(sc->sc_ioh, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - (uint32_t *)(sc->sc_io_addr + VIRTIO_CONFIG_GUEST_FEATURES), - features); - - sc->sc_features = features; - - return (host_features); -} - -size_t -virtio_show_features(uint32_t features, char *buf, size_t len) -{ - char *orig_buf = buf; - char *bufend = buf + len; - - /* LINTED E_PTRDIFF_OVERFLOW */ - buf += snprintf(buf, bufend - buf, "Generic ( "); - if (features & VIRTIO_F_RING_INDIRECT_DESC) - /* LINTED E_PTRDIFF_OVERFLOW */ - buf += snprintf(buf, bufend - buf, "INDIRECT_DESC "); - - /* LINTED E_PTRDIFF_OVERFLOW */ - buf += snprintf(buf, bufend - buf, ") "); - - /* LINTED E_PTRDIFF_OVERFLOW */ - return (buf - orig_buf); -} - -boolean_t -virtio_has_feature(struct virtio_softc *sc, uint32_t feature) -{ - return (sc->sc_features & feature); -} - -/* - * Device configuration registers. 
- */ -uint8_t -virtio_read_device_config_1(struct virtio_softc *sc, unsigned int index) -{ - ASSERT(sc->sc_config_offset); - return ddi_get8(sc->sc_ioh, - (uint8_t *)(sc->sc_io_addr + sc->sc_config_offset + index)); -} - -uint16_t -virtio_read_device_config_2(struct virtio_softc *sc, unsigned int index) -{ - ASSERT(sc->sc_config_offset); - return ddi_get16(sc->sc_ioh, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - (uint16_t *)(sc->sc_io_addr + sc->sc_config_offset + index)); -} - -uint32_t -virtio_read_device_config_4(struct virtio_softc *sc, unsigned int index) -{ - ASSERT(sc->sc_config_offset); - return ddi_get32(sc->sc_ioh, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - (uint32_t *)(sc->sc_io_addr + sc->sc_config_offset + index)); -} - -uint64_t -virtio_read_device_config_8(struct virtio_softc *sc, unsigned int index) -{ - uint64_t r; - - ASSERT(sc->sc_config_offset); - r = ddi_get32(sc->sc_ioh, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - (uint32_t *)(sc->sc_io_addr + sc->sc_config_offset + - index + sizeof (uint32_t))); - - r <<= 32; - - r += ddi_get32(sc->sc_ioh, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - (uint32_t *)(sc->sc_io_addr + sc->sc_config_offset + index)); - return (r); -} - -void -virtio_write_device_config_1(struct virtio_softc *sc, unsigned int index, - uint8_t value) -{ - ASSERT(sc->sc_config_offset); - ddi_put8(sc->sc_ioh, - (uint8_t *)(sc->sc_io_addr + sc->sc_config_offset + index), value); -} - -void -virtio_write_device_config_2(struct virtio_softc *sc, unsigned int index, - uint16_t value) -{ - ASSERT(sc->sc_config_offset); - ddi_put16(sc->sc_ioh, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - (uint16_t *)(sc->sc_io_addr + sc->sc_config_offset + index), value); -} - -void -virtio_write_device_config_4(struct virtio_softc *sc, unsigned int index, - uint32_t value) -{ - ASSERT(sc->sc_config_offset); - ddi_put32(sc->sc_ioh, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - (uint32_t *)(sc->sc_io_addr + sc->sc_config_offset + index), value); -} - -void -virtio_write_device_config_8(struct virtio_softc *sc, unsigned int index, - uint64_t value) -{ - ASSERT(sc->sc_config_offset); - ddi_put32(sc->sc_ioh, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - (uint32_t *)(sc->sc_io_addr + sc->sc_config_offset + index), - value & 0xFFFFFFFF); - ddi_put32(sc->sc_ioh, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - (uint32_t *)(sc->sc_io_addr + sc->sc_config_offset + - index + sizeof (uint32_t)), value >> 32); -} - -/* - * Start/stop vq interrupt. No guarantee. - */ -void -virtio_stop_vq_intr(struct virtqueue *vq) -{ - vq->vq_avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; -} - -void -virtio_start_vq_intr(struct virtqueue *vq) -{ - vq->vq_avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT; -} - -static ddi_dma_attr_t virtio_vq_dma_attr = { - DMA_ATTR_V0, /* Version number */ - 0, /* low address */ - 0x00000FFFFFFFFFFF, /* high address. 
Has to fit into 32 bits */ - /* after page-shifting */ - 0xFFFFFFFF, /* counter register max */ - VIRTIO_PAGE_SIZE, /* page alignment required */ - 0x3F, /* burst sizes: 1 - 32 */ - 0x1, /* minimum transfer size */ - 0xFFFFFFFF, /* max transfer size */ - 0xFFFFFFFF, /* address register max */ - 1, /* no scatter-gather */ - 1, /* device operates on bytes */ - 0, /* attr flag: set to 0 */ -}; - -static ddi_dma_attr_t virtio_vq_indirect_dma_attr = { - DMA_ATTR_V0, /* Version number */ - 0, /* low address */ - 0xFFFFFFFFFFFFFFFF, /* high address */ - 0xFFFFFFFF, /* counter register max */ - 1, /* No specific alignment */ - 0x3F, /* burst sizes: 1 - 32 */ - 0x1, /* minimum transfer size */ - 0xFFFFFFFF, /* max transfer size */ - 0xFFFFFFFF, /* address register max */ - 1, /* no scatter-gather */ - 1, /* device operates on bytes */ - 0, /* attr flag: set to 0 */ -}; - -/* Same for direct and indirect descriptors. */ -static ddi_device_acc_attr_t virtio_vq_devattr = { - DDI_DEVICE_ATTR_V0, - DDI_NEVERSWAP_ACC, - DDI_STORECACHING_OK_ACC, - DDI_DEFAULT_ACC -}; - -static void -virtio_free_indirect(struct vq_entry *entry) -{ - - (void) ddi_dma_unbind_handle(entry->qe_indirect_dma_handle); - ddi_dma_mem_free(&entry->qe_indirect_dma_acch); - ddi_dma_free_handle(&entry->qe_indirect_dma_handle); - - entry->qe_indirect_descs = NULL; -} - - -static int -virtio_alloc_indirect(struct virtio_softc *sc, struct vq_entry *entry) -{ - int allocsize, num; - size_t len; - unsigned int ncookies; - int ret; - - num = entry->qe_queue->vq_indirect_num; - ASSERT(num > 1); - - allocsize = sizeof (struct vring_desc) * num; - - ret = ddi_dma_alloc_handle(sc->sc_dev, &virtio_vq_indirect_dma_attr, - DDI_DMA_SLEEP, NULL, &entry->qe_indirect_dma_handle); - if (ret != DDI_SUCCESS) { - dev_err(sc->sc_dev, CE_WARN, - "Failed to allocate dma handle for indirect descriptors, " - "entry %d, vq %d", entry->qe_index, - entry->qe_queue->vq_index); - goto out_alloc_handle; - } - - ret = ddi_dma_mem_alloc(entry->qe_indirect_dma_handle, allocsize, - &virtio_vq_devattr, DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, - (caddr_t *)&entry->qe_indirect_descs, &len, - &entry->qe_indirect_dma_acch); - if (ret != DDI_SUCCESS) { - dev_err(sc->sc_dev, CE_WARN, - "Failed to allocate dma memory for indirect descriptors, " - "entry %d, vq %d,", entry->qe_index, - entry->qe_queue->vq_index); - goto out_alloc; - } - - (void) memset(entry->qe_indirect_descs, 0xff, allocsize); - - ret = ddi_dma_addr_bind_handle(entry->qe_indirect_dma_handle, NULL, - (caddr_t)entry->qe_indirect_descs, len, - DDI_DMA_RDWR | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, - &entry->qe_indirect_dma_cookie, &ncookies); - if (ret != DDI_DMA_MAPPED) { - dev_err(sc->sc_dev, CE_WARN, - "Failed to bind dma memory for indirect descriptors, " - "entry %d, vq %d", entry->qe_index, - entry->qe_queue->vq_index); - goto out_bind; - } - - /* We asked for a single segment */ - ASSERT(ncookies == 1); - - return (0); - -out_bind: - ddi_dma_mem_free(&entry->qe_indirect_dma_acch); -out_alloc: - ddi_dma_free_handle(&entry->qe_indirect_dma_handle); -out_alloc_handle: - - return (ret); -} - -/* - * Initialize the vq structure. 
- */ -static int -virtio_init_vq(struct virtio_softc *sc, struct virtqueue *vq) -{ - int ret; - uint16_t i; - int vq_size = vq->vq_num; - int indirect_num = vq->vq_indirect_num; - - /* free slot management */ - list_create(&vq->vq_freelist, sizeof (struct vq_entry), - offsetof(struct vq_entry, qe_list)); - - for (i = 0; i < vq_size; i++) { - struct vq_entry *entry = &vq->vq_entries[i]; - list_insert_tail(&vq->vq_freelist, entry); - entry->qe_index = i; - entry->qe_desc = &vq->vq_descs[i]; - entry->qe_queue = vq; - - if (indirect_num) { - ret = virtio_alloc_indirect(sc, entry); - if (ret) - goto out_indirect; - } - } - - mutex_init(&vq->vq_freelist_lock, "virtio-freelist", MUTEX_DRIVER, - DDI_INTR_PRI(sc->sc_intr_prio)); - mutex_init(&vq->vq_avail_lock, "virtio-avail", MUTEX_DRIVER, - DDI_INTR_PRI(sc->sc_intr_prio)); - mutex_init(&vq->vq_used_lock, "virtio-used", MUTEX_DRIVER, - DDI_INTR_PRI(sc->sc_intr_prio)); - - return (0); - -out_indirect: - for (i = 0; i < vq_size; i++) { - struct vq_entry *entry = &vq->vq_entries[i]; - if (entry->qe_indirect_descs) - virtio_free_indirect(entry); - } - - return (ret); -} - -/* - * Allocate/free a vq. - */ -struct virtqueue * -virtio_alloc_vq(struct virtio_softc *sc, unsigned int index, unsigned int size, - unsigned int indirect_num, const char *name) -{ - int vq_size, allocsize1, allocsize2, allocsize = 0; - int ret; - unsigned int ncookies; - size_t len; - struct virtqueue *vq; - - ddi_put16(sc->sc_ioh, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - (uint16_t *)(sc->sc_io_addr + VIRTIO_CONFIG_QUEUE_SELECT), index); - vq_size = ddi_get16(sc->sc_ioh, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - (uint16_t *)(sc->sc_io_addr + VIRTIO_CONFIG_QUEUE_SIZE)); - if (vq_size == 0) { - dev_err(sc->sc_dev, CE_WARN, - "virtqueue dest not exist, index %d for %s\n", index, name); - goto out; - } - - vq = kmem_zalloc(sizeof (struct virtqueue), KM_SLEEP); - - /* size 0 => use native vq size, good for receive queues. */ - if (size) - vq_size = MIN(vq_size, size); - - /* allocsize1: descriptor table + avail ring + pad */ - allocsize1 = VIRTQUEUE_ALIGN(sizeof (struct vring_desc) * vq_size + - sizeof (struct vring_avail) + sizeof (uint16_t) * vq_size); - /* allocsize2: used ring + pad */ - allocsize2 = VIRTQUEUE_ALIGN(sizeof (struct vring_used) + - sizeof (struct vring_used_elem) * vq_size); - - allocsize = allocsize1 + allocsize2; - - ret = ddi_dma_alloc_handle(sc->sc_dev, &virtio_vq_dma_attr, - DDI_DMA_SLEEP, NULL, &vq->vq_dma_handle); - if (ret != DDI_SUCCESS) { - dev_err(sc->sc_dev, CE_WARN, - "Failed to allocate dma handle for vq %d", index); - goto out_alloc_handle; - } - - ret = ddi_dma_mem_alloc(vq->vq_dma_handle, allocsize, - &virtio_vq_devattr, DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, - (caddr_t *)&vq->vq_vaddr, &len, &vq->vq_dma_acch); - if (ret != DDI_SUCCESS) { - dev_err(sc->sc_dev, CE_WARN, - "Failed to allocate dma memory for vq %d", index); - goto out_alloc; - } - - ret = ddi_dma_addr_bind_handle(vq->vq_dma_handle, NULL, - (caddr_t)vq->vq_vaddr, len, DDI_DMA_RDWR | DDI_DMA_CONSISTENT, - DDI_DMA_SLEEP, NULL, &vq->vq_dma_cookie, &ncookies); - if (ret != DDI_DMA_MAPPED) { - dev_err(sc->sc_dev, CE_WARN, - "Failed to bind dma memory for vq %d", index); - goto out_bind; - } - - /* We asked for a single segment */ - ASSERT(ncookies == 1); - /* and page-ligned buffers. 
*/ - ASSERT(vq->vq_dma_cookie.dmac_laddress % VIRTIO_PAGE_SIZE == 0); - - (void) memset(vq->vq_vaddr, 0, allocsize); - - /* Make sure all zeros hit the buffer before we point the host to it */ - membar_producer(); - - /* set the vq address */ - ddi_put32(sc->sc_ioh, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - (uint32_t *)(sc->sc_io_addr + VIRTIO_CONFIG_QUEUE_ADDRESS), - (vq->vq_dma_cookie.dmac_laddress / VIRTIO_PAGE_SIZE)); - - /* remember addresses and offsets for later use */ - vq->vq_owner = sc; - vq->vq_num = vq_size; - vq->vq_index = index; - vq->vq_descs = vq->vq_vaddr; - vq->vq_availoffset = sizeof (struct vring_desc)*vq_size; - vq->vq_avail = (void *)(((char *)vq->vq_descs) + vq->vq_availoffset); - vq->vq_usedoffset = allocsize1; - vq->vq_used = (void *)(((char *)vq->vq_descs) + vq->vq_usedoffset); - - ASSERT(indirect_num == 0 || - virtio_has_feature(sc, VIRTIO_F_RING_INDIRECT_DESC)); - vq->vq_indirect_num = indirect_num; - - /* free slot management */ - vq->vq_entries = kmem_zalloc(sizeof (struct vq_entry) * vq_size, - KM_SLEEP); - - ret = virtio_init_vq(sc, vq); - if (ret) - goto out_init; - - dev_debug(sc->sc_dev, CE_NOTE, - "Allocated %d entries for vq %d:%s (%d indirect descs)", - vq_size, index, name, indirect_num * vq_size); - - return (vq); - -out_init: - kmem_free(vq->vq_entries, sizeof (struct vq_entry) * vq_size); - (void) ddi_dma_unbind_handle(vq->vq_dma_handle); -out_bind: - ddi_dma_mem_free(&vq->vq_dma_acch); -out_alloc: - ddi_dma_free_handle(&vq->vq_dma_handle); -out_alloc_handle: - kmem_free(vq, sizeof (struct virtqueue)); -out: - return (NULL); -} - -void -virtio_free_vq(struct virtqueue *vq) -{ - struct virtio_softc *sc = vq->vq_owner; - int i; - - /* tell device that there's no virtqueue any longer */ - ddi_put16(sc->sc_ioh, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - (uint16_t *)(sc->sc_io_addr + VIRTIO_CONFIG_QUEUE_SELECT), - vq->vq_index); - ddi_put32(sc->sc_ioh, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - (uint32_t *)(sc->sc_io_addr + VIRTIO_CONFIG_QUEUE_ADDRESS), 0); - - /* Free the indirect descriptors, if any. */ - for (i = 0; i < vq->vq_num; i++) { - struct vq_entry *entry = &vq->vq_entries[i]; - if (entry->qe_indirect_descs) - virtio_free_indirect(entry); - } - - kmem_free(vq->vq_entries, sizeof (struct vq_entry) * vq->vq_num); - - (void) ddi_dma_unbind_handle(vq->vq_dma_handle); - ddi_dma_mem_free(&vq->vq_dma_acch); - ddi_dma_free_handle(&vq->vq_dma_handle); - - mutex_destroy(&vq->vq_used_lock); - mutex_destroy(&vq->vq_avail_lock); - mutex_destroy(&vq->vq_freelist_lock); - - kmem_free(vq, sizeof (struct virtqueue)); -} - -/* - * Free descriptor management. 
- */ -struct vq_entry * -vq_alloc_entry(struct virtqueue *vq) -{ - struct vq_entry *qe; - - mutex_enter(&vq->vq_freelist_lock); - if (list_is_empty(&vq->vq_freelist)) { - mutex_exit(&vq->vq_freelist_lock); - return (NULL); - } - qe = list_remove_head(&vq->vq_freelist); - - ASSERT(vq->vq_used_entries >= 0); - vq->vq_used_entries++; - - mutex_exit(&vq->vq_freelist_lock); - - qe->qe_next = NULL; - qe->qe_indirect_next = 0; - (void) memset(qe->qe_desc, 0, sizeof (struct vring_desc)); - - return (qe); -} - -void -vq_free_entry(struct virtqueue *vq, struct vq_entry *qe) -{ - mutex_enter(&vq->vq_freelist_lock); - - list_insert_head(&vq->vq_freelist, qe); - vq->vq_used_entries--; - ASSERT(vq->vq_used_entries >= 0); - mutex_exit(&vq->vq_freelist_lock); -} - -/* - * We (intentionally) don't have a global vq mutex, so you are - * responsible for external locking to avoid allocting/freeing any - * entries before using the returned value. Have fun. - */ -uint_t -vq_num_used(struct virtqueue *vq) -{ - /* vq->vq_freelist_lock would not help here. */ - return (vq->vq_used_entries); -} - -static inline void -virtio_ve_set_desc(struct vring_desc *desc, uint64_t paddr, uint32_t len, - boolean_t write) -{ - desc->addr = paddr; - desc->len = len; - desc->next = 0; - desc->flags = 0; - - /* 'write' - from the driver's point of view */ - if (!write) - desc->flags = VRING_DESC_F_WRITE; -} - -void -virtio_ve_set(struct vq_entry *qe, uint64_t paddr, uint32_t len, - boolean_t write) -{ - virtio_ve_set_desc(qe->qe_desc, paddr, len, write); -} - -unsigned int -virtio_ve_indirect_available(struct vq_entry *qe) -{ - return (qe->qe_queue->vq_indirect_num - qe->qe_indirect_next); -} - -void -virtio_ve_add_indirect_buf(struct vq_entry *qe, uint64_t paddr, uint32_t len, - boolean_t write) -{ - struct vring_desc *indirect_desc; - - ASSERT(qe->qe_queue->vq_indirect_num); - ASSERT(qe->qe_indirect_next < qe->qe_queue->vq_indirect_num); - - indirect_desc = &qe->qe_indirect_descs[qe->qe_indirect_next]; - virtio_ve_set_desc(indirect_desc, paddr, len, write); - qe->qe_indirect_next++; -} - -void -virtio_ve_add_cookie(struct vq_entry *qe, ddi_dma_handle_t dma_handle, - ddi_dma_cookie_t dma_cookie, unsigned int ncookies, boolean_t write) -{ - int i; - - for (i = 0; i < ncookies; i++) { - virtio_ve_add_indirect_buf(qe, dma_cookie.dmac_laddress, - dma_cookie.dmac_size, write); - ddi_dma_nextcookie(dma_handle, &dma_cookie); - } -} - -void -virtio_sync_vq(struct virtqueue *vq) -{ - struct virtio_softc *vsc = vq->vq_owner; - - /* Make sure the avail ring update hit the buffer */ - membar_producer(); - - vq->vq_avail->idx = vq->vq_avail_idx; - - /* Make sure the avail idx update hits the buffer */ - membar_producer(); - - /* Make sure we see the flags update */ - membar_consumer(); - - if (!(vq->vq_used->flags & VRING_USED_F_NO_NOTIFY)) { - ddi_put16(vsc->sc_ioh, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - (uint16_t *)(vsc->sc_io_addr + - VIRTIO_CONFIG_QUEUE_NOTIFY), - vq->vq_index); - } -} - -void -virtio_push_chain(struct vq_entry *qe, boolean_t sync) -{ - struct virtqueue *vq = qe->qe_queue; - struct vq_entry *head = qe; - struct vring_desc *desc; - int idx; - - ASSERT(qe); - - /* - * Bind the descs together, paddr and len should be already - * set with virtio_ve_set - */ - do { - /* Bind the indirect descriptors */ - if (qe->qe_indirect_next > 1) { - uint16_t i = 0; - - /* - * Set the pointer/flags to the - * first indirect descriptor - */ - virtio_ve_set_desc(qe->qe_desc, - qe->qe_indirect_dma_cookie.dmac_laddress, - sizeof (struct 
vring_desc) * qe->qe_indirect_next, - B_FALSE); - qe->qe_desc->flags |= VRING_DESC_F_INDIRECT; - - /* For all but the last one, add the next index/flag */ - do { - desc = &qe->qe_indirect_descs[i]; - i++; - - desc->flags |= VRING_DESC_F_NEXT; - desc->next = i; - } while (i < qe->qe_indirect_next - 1); - - } - - if (qe->qe_next) { - qe->qe_desc->flags |= VRING_DESC_F_NEXT; - qe->qe_desc->next = qe->qe_next->qe_index; - } - - qe = qe->qe_next; - } while (qe); - - mutex_enter(&vq->vq_avail_lock); - idx = vq->vq_avail_idx; - vq->vq_avail_idx++; - - /* Make sure the bits hit the descriptor(s) */ - membar_producer(); - vq->vq_avail->ring[idx % vq->vq_num] = head->qe_index; - - /* Notify the device, if needed. */ - if (sync) - virtio_sync_vq(vq); - - mutex_exit(&vq->vq_avail_lock); -} - -/* - * Get a chain of descriptors from the used ring, if one is available. - */ -struct vq_entry * -virtio_pull_chain(struct virtqueue *vq, uint32_t *len) -{ - struct vq_entry *head; - int slot; - int usedidx; - - mutex_enter(&vq->vq_used_lock); - - /* No used entries? Bye. */ - if (vq->vq_used_idx == vq->vq_used->idx) { - mutex_exit(&vq->vq_used_lock); - return (NULL); - } - - usedidx = vq->vq_used_idx; - vq->vq_used_idx++; - mutex_exit(&vq->vq_used_lock); - - usedidx %= vq->vq_num; - - /* Make sure we do the next step _after_ checking the idx. */ - membar_consumer(); - - slot = vq->vq_used->ring[usedidx].id; - *len = vq->vq_used->ring[usedidx].len; - - head = &vq->vq_entries[slot]; - - return (head); -} - -void -virtio_free_chain(struct vq_entry *qe) -{ - struct vq_entry *tmp; - struct virtqueue *vq = qe->qe_queue; - - ASSERT(qe); - - do { - ASSERT(qe->qe_queue == vq); - tmp = qe->qe_next; - vq_free_entry(vq, qe); - qe = tmp; - } while (tmp != NULL); -} - -void -virtio_ventry_stick(struct vq_entry *first, struct vq_entry *second) -{ - first->qe_next = second; -} - -static int -virtio_register_msi(struct virtio_softc *sc, - struct virtio_int_handler *config_handler, - struct virtio_int_handler vq_handlers[], int intr_types) -{ - int count, actual; - int int_type; - int i; - int handler_count; - int ret; - - /* If both MSI and MSI-x are reported, prefer MSI-x. */ - int_type = DDI_INTR_TYPE_MSI; - if (intr_types & DDI_INTR_TYPE_MSIX) - int_type = DDI_INTR_TYPE_MSIX; - - /* Walk the handler table to get the number of handlers. */ - for (handler_count = 0; - vq_handlers && vq_handlers[handler_count].vh_func; - handler_count++) - ; - - /* +1 if there is a config change handler. */ - if (config_handler != NULL) - handler_count++; - - /* Number of MSIs supported by the device. */ - ret = ddi_intr_get_nintrs(sc->sc_dev, int_type, &count); - if (ret != DDI_SUCCESS) { - dev_err(sc->sc_dev, CE_WARN, "ddi_intr_get_nintrs failed"); - return (ret); - } - - /* - * Those who try to register more handlers then the device - * supports shall suffer. 
- */ - ASSERT(handler_count <= count); - - sc->sc_intr_htable = kmem_zalloc(sizeof (ddi_intr_handle_t) * - handler_count, KM_SLEEP); - - ret = ddi_intr_alloc(sc->sc_dev, sc->sc_intr_htable, int_type, 0, - handler_count, &actual, DDI_INTR_ALLOC_NORMAL); - if (ret != DDI_SUCCESS) { - dev_err(sc->sc_dev, CE_WARN, "Failed to allocate MSI: %d", ret); - goto out_msi_alloc; - } - - if (actual != handler_count) { - dev_err(sc->sc_dev, CE_WARN, - "Not enough MSI available: need %d, available %d", - handler_count, actual); - goto out_msi_available; - } - - sc->sc_intr_num = handler_count; - sc->sc_intr_config = B_FALSE; - if (config_handler != NULL) { - sc->sc_intr_config = B_TRUE; - } - - /* Assume they are all same priority */ - ret = ddi_intr_get_pri(sc->sc_intr_htable[0], &sc->sc_intr_prio); - if (ret != DDI_SUCCESS) { - dev_err(sc->sc_dev, CE_WARN, "ddi_intr_get_pri failed"); - goto out_msi_prio; - } - - /* Add the vq handlers */ - for (i = 0; vq_handlers[i].vh_func; i++) { - ret = ddi_intr_add_handler(sc->sc_intr_htable[i], - vq_handlers[i].vh_func, sc, vq_handlers[i].vh_priv); - if (ret != DDI_SUCCESS) { - dev_err(sc->sc_dev, CE_WARN, - "ddi_intr_add_handler failed"); - /* Remove the handlers that succeeded. */ - while (--i >= 0) { - (void) ddi_intr_remove_handler( - sc->sc_intr_htable[i]); - } - goto out_add_handlers; - } - } - - /* Don't forget the config handler */ - if (config_handler != NULL) { - ret = ddi_intr_add_handler(sc->sc_intr_htable[i], - config_handler->vh_func, sc, config_handler->vh_priv); - if (ret != DDI_SUCCESS) { - dev_err(sc->sc_dev, CE_WARN, - "ddi_intr_add_handler failed"); - /* Remove the handlers that succeeded. */ - while (--i >= 0) { - (void) ddi_intr_remove_handler( - sc->sc_intr_htable[i]); - } - goto out_add_handlers; - } - } - - ret = ddi_intr_get_cap(sc->sc_intr_htable[0], &sc->sc_intr_cap); - if (ret == DDI_SUCCESS) { - sc->sc_int_type = int_type; - return (DDI_SUCCESS); - } - -out_add_handlers: -out_msi_prio: -out_msi_available: - for (i = 0; i < actual; i++) - (void) ddi_intr_free(sc->sc_intr_htable[i]); -out_msi_alloc: - kmem_free(sc->sc_intr_htable, - sizeof (ddi_intr_handle_t) * handler_count); - - return (ret); -} - -struct virtio_handler_container { - int nhandlers; - struct virtio_int_handler config_handler; - struct virtio_int_handler vq_handlers[]; -}; - -uint_t -virtio_intx_dispatch(caddr_t arg1, caddr_t arg2) -{ - struct virtio_softc *sc = (void *)arg1; - struct virtio_handler_container *vhc = (void *)arg2; - uint8_t isr_status; - int i; - - isr_status = ddi_get8(sc->sc_ioh, (uint8_t *)(sc->sc_io_addr + - VIRTIO_CONFIG_ISR_STATUS)); - - if (!isr_status) - return (DDI_INTR_UNCLAIMED); - - if ((isr_status & VIRTIO_CONFIG_ISR_CONFIG_CHANGE) && - vhc->config_handler.vh_func) { - vhc->config_handler.vh_func((void *)sc, - vhc->config_handler.vh_priv); - } - - /* Notify all handlers */ - for (i = 0; i < vhc->nhandlers; i++) { - vhc->vq_handlers[i].vh_func((void *)sc, - vhc->vq_handlers[i].vh_priv); - } - - return (DDI_INTR_CLAIMED); -} - -/* - * config_handler and vq_handlers may be allocated on stack. - * Take precautions not to loose them. - */ -static int -virtio_register_intx(struct virtio_softc *sc, - struct virtio_int_handler *config_handler, - struct virtio_int_handler vq_handlers[]) -{ - int vq_handler_count; - int actual; - struct virtio_handler_container *vhc; - size_t vhc_sz; - int ret = DDI_FAILURE; - - /* Walk the handler table to get the number of handlers. 
*/ - for (vq_handler_count = 0; - vq_handlers && vq_handlers[vq_handler_count].vh_func; - vq_handler_count++) - ; - - vhc_sz = sizeof (struct virtio_handler_container) + - sizeof (struct virtio_int_handler) * vq_handler_count; - vhc = kmem_zalloc(vhc_sz, KM_SLEEP); - - vhc->nhandlers = vq_handler_count; - (void) memcpy(vhc->vq_handlers, vq_handlers, - sizeof (struct virtio_int_handler) * vq_handler_count); - - if (config_handler != NULL) { - (void) memcpy(&vhc->config_handler, config_handler, - sizeof (struct virtio_int_handler)); - } - - /* Just a single entry for a single interrupt. */ - sc->sc_intr_htable = kmem_zalloc(sizeof (ddi_intr_handle_t), KM_SLEEP); - - ret = ddi_intr_alloc(sc->sc_dev, sc->sc_intr_htable, - DDI_INTR_TYPE_FIXED, 0, 1, &actual, DDI_INTR_ALLOC_NORMAL); - if (ret != DDI_SUCCESS) { - dev_err(sc->sc_dev, CE_WARN, - "Failed to allocate a fixed interrupt: %d", ret); - goto out_int_alloc; - } - - ASSERT(actual == 1); - sc->sc_intr_num = 1; - - ret = ddi_intr_get_pri(sc->sc_intr_htable[0], &sc->sc_intr_prio); - if (ret != DDI_SUCCESS) { - dev_err(sc->sc_dev, CE_WARN, "ddi_intr_get_pri failed"); - goto out_prio; - } - - ret = ddi_intr_add_handler(sc->sc_intr_htable[0], - virtio_intx_dispatch, sc, vhc); - if (ret != DDI_SUCCESS) { - dev_err(sc->sc_dev, CE_WARN, "ddi_intr_add_handler failed"); - goto out_add_handlers; - } - - sc->sc_int_type = DDI_INTR_TYPE_FIXED; - - return (DDI_SUCCESS); - -out_add_handlers: -out_prio: - (void) ddi_intr_free(sc->sc_intr_htable[0]); -out_int_alloc: - kmem_free(sc->sc_intr_htable, sizeof (ddi_intr_handle_t)); - kmem_free(vhc, vhc_sz); - return (ret); -} - -/* - * We find out if we support MSI during this, and the register layout - * depends on the MSI (doh). Don't acces the device specific bits in - * BAR 0 before calling it! - */ -int -virtio_register_ints(struct virtio_softc *sc, - struct virtio_int_handler *config_handler, - struct virtio_int_handler vq_handlers[]) -{ - int ret; - int intr_types; - - /* Default offset until MSI-X is enabled, if ever. */ - sc->sc_config_offset = VIRTIO_CONFIG_DEVICE_CONFIG_NOMSIX; - - /* Determine which types of interrupts are supported */ - ret = ddi_intr_get_supported_types(sc->sc_dev, &intr_types); - if (ret != DDI_SUCCESS) { - dev_err(sc->sc_dev, CE_WARN, "Can't get supported int types"); - goto out_inttype; - } - - /* If we have msi, let's use them. */ - if (intr_types & (DDI_INTR_TYPE_MSIX | DDI_INTR_TYPE_MSI)) { - ret = virtio_register_msi(sc, config_handler, - vq_handlers, intr_types); - if (!ret) - return (0); - } - - /* Fall back to old-fashioned interrupts. */ - if (intr_types & DDI_INTR_TYPE_FIXED) { - dev_debug(sc->sc_dev, CE_WARN, - "Using legacy interrupts"); - - return (virtio_register_intx(sc, config_handler, vq_handlers)); - } - - dev_err(sc->sc_dev, CE_WARN, - "MSI failed and fixed interrupts not supported. Giving up."); - ret = DDI_FAILURE; - -out_inttype: - return (ret); -} - -static int -virtio_enable_msi(struct virtio_softc *sc) -{ - int ret, i; - int vq_handler_count = sc->sc_intr_num; - - /* Number of handlers, not counting the counfig. */ - if (sc->sc_intr_config) - vq_handler_count--; - - /* Enable the interrupts. Either the whole block, or one by one. 
*/ - if (sc->sc_intr_cap & DDI_INTR_FLAG_BLOCK) { - ret = ddi_intr_block_enable(sc->sc_intr_htable, - sc->sc_intr_num); - if (ret != DDI_SUCCESS) { - dev_err(sc->sc_dev, CE_WARN, - "Failed to enable MSI, falling back to INTx"); - goto out_enable; - } - } else { - for (i = 0; i < sc->sc_intr_num; i++) { - ret = ddi_intr_enable(sc->sc_intr_htable[i]); - if (ret != DDI_SUCCESS) { - dev_err(sc->sc_dev, CE_WARN, - "Failed to enable MSI %d, " - "falling back to INTx", i); - - while (--i >= 0) { - (void) ddi_intr_disable( - sc->sc_intr_htable[i]); - } - goto out_enable; - } - } - } - - /* Bind the allocated MSI to the queues and config */ - for (i = 0; i < vq_handler_count; i++) { - int check; - - ddi_put16(sc->sc_ioh, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - (uint16_t *)(sc->sc_io_addr + - VIRTIO_CONFIG_QUEUE_SELECT), i); - - ddi_put16(sc->sc_ioh, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - (uint16_t *)(sc->sc_io_addr + - VIRTIO_CONFIG_QUEUE_VECTOR), i); - - check = ddi_get16(sc->sc_ioh, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - (uint16_t *)(sc->sc_io_addr + - VIRTIO_CONFIG_QUEUE_VECTOR)); - if (check != i) { - dev_err(sc->sc_dev, CE_WARN, "Failed to bind handler " - "for VQ %d, MSI %d. Check = %x", i, i, check); - ret = ENODEV; - goto out_bind; - } - } - - if (sc->sc_intr_config) { - int check; - - ddi_put16(sc->sc_ioh, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - (uint16_t *)(sc->sc_io_addr + - VIRTIO_CONFIG_CONFIG_VECTOR), i); - - check = ddi_get16(sc->sc_ioh, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - (uint16_t *)(sc->sc_io_addr + - VIRTIO_CONFIG_CONFIG_VECTOR)); - if (check != i) { - dev_err(sc->sc_dev, CE_WARN, "Failed to bind handler " - "for Config updates, MSI %d", i); - ret = ENODEV; - goto out_bind; - } - } - - /* Configuration offset depends on whether MSI-X is used. */ - if (sc->sc_int_type == DDI_INTR_TYPE_MSIX) - sc->sc_config_offset = VIRTIO_CONFIG_DEVICE_CONFIG_MSIX; - else - ASSERT(sc->sc_int_type == DDI_INTR_TYPE_MSI); - - return (DDI_SUCCESS); - -out_bind: - /* Unbind the vqs */ - for (i = 0; i < vq_handler_count - 1; i++) { - ddi_put16(sc->sc_ioh, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - (uint16_t *)(sc->sc_io_addr + - VIRTIO_CONFIG_QUEUE_SELECT), i); - - ddi_put16(sc->sc_ioh, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - (uint16_t *)(sc->sc_io_addr + - VIRTIO_CONFIG_QUEUE_VECTOR), - VIRTIO_MSI_NO_VECTOR); - } - /* And the config */ - /* LINTED E_BAD_PTR_CAST_ALIGN */ - ddi_put16(sc->sc_ioh, (uint16_t *)(sc->sc_io_addr + - VIRTIO_CONFIG_CONFIG_VECTOR), VIRTIO_MSI_NO_VECTOR); - - /* Disable the interrupts. Either the whole block, or one by one. */ - if (sc->sc_intr_cap & DDI_INTR_FLAG_BLOCK) { - ret = ddi_intr_block_disable(sc->sc_intr_htable, - sc->sc_intr_num); - if (ret != DDI_SUCCESS) { - dev_err(sc->sc_dev, CE_WARN, - "Failed to disable MSIs, won't be able to " - "reuse next time"); - } - } else { - for (i = 0; i < sc->sc_intr_num; i++) { - ret = ddi_intr_disable(sc->sc_intr_htable[i]); - if (ret != DDI_SUCCESS) { - dev_err(sc->sc_dev, CE_WARN, - "Failed to disable interrupt %d, " - "won't be able to reuse", i); - } - } - } - - ret = DDI_FAILURE; - -out_enable: - return (ret); -} - -static int -virtio_enable_intx(struct virtio_softc *sc) -{ - int ret; - - ret = ddi_intr_enable(sc->sc_intr_htable[0]); - if (ret != DDI_SUCCESS) { - dev_err(sc->sc_dev, CE_WARN, - "Failed to enable interrupt: %d", ret); - } - - return (ret); -} - -/* - * We can't enable/disable individual handlers in the INTx case so do - * the whole bunch even in the msi case. 
- */ -int -virtio_enable_ints(struct virtio_softc *sc) -{ - - ASSERT(sc->sc_config_offset == VIRTIO_CONFIG_DEVICE_CONFIG_NOMSIX); - - /* See if we are using MSI. */ - if (sc->sc_int_type == DDI_INTR_TYPE_MSIX || - sc->sc_int_type == DDI_INTR_TYPE_MSI) - return (virtio_enable_msi(sc)); - - ASSERT(sc->sc_int_type == DDI_INTR_TYPE_FIXED); - return (virtio_enable_intx(sc)); -} - -void -virtio_release_ints(struct virtio_softc *sc) -{ - int i; - int ret; - - /* We were running with MSI, unbind them. */ - if (sc->sc_int_type == DDI_INTR_TYPE_MSIX || - sc->sc_int_type == DDI_INTR_TYPE_MSI) { - /* Unbind all vqs */ - for (i = 0; i < sc->sc_nvqs; i++) { - ddi_put16(sc->sc_ioh, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - (uint16_t *)(sc->sc_io_addr + - VIRTIO_CONFIG_QUEUE_SELECT), i); - - ddi_put16(sc->sc_ioh, - /* LINTED E_BAD_PTR_CAST_ALIGN */ - (uint16_t *)(sc->sc_io_addr + - VIRTIO_CONFIG_QUEUE_VECTOR), - VIRTIO_MSI_NO_VECTOR); - } - /* And the config */ - /* LINTED E_BAD_PTR_CAST_ALIGN */ - ddi_put16(sc->sc_ioh, (uint16_t *)(sc->sc_io_addr + - VIRTIO_CONFIG_CONFIG_VECTOR), - VIRTIO_MSI_NO_VECTOR); - - } - - /* Disable the interrupts. Either the whole block, or one by one. */ - if (sc->sc_intr_cap & DDI_INTR_FLAG_BLOCK) { - ret = ddi_intr_block_disable(sc->sc_intr_htable, - sc->sc_intr_num); - if (ret != DDI_SUCCESS) { - dev_err(sc->sc_dev, CE_WARN, - "Failed to disable MSIs, won't be able to " - "reuse next time"); - } - } else { - for (i = 0; i < sc->sc_intr_num; i++) { - ret = ddi_intr_disable(sc->sc_intr_htable[i]); - if (ret != DDI_SUCCESS) { - dev_err(sc->sc_dev, CE_WARN, - "Failed to disable interrupt %d, " - "won't be able to reuse", i); - } - } - } - - - for (i = 0; i < sc->sc_intr_num; i++) { - (void) ddi_intr_remove_handler(sc->sc_intr_htable[i]); - } - - for (i = 0; i < sc->sc_intr_num; i++) - (void) ddi_intr_free(sc->sc_intr_htable[i]); - - kmem_free(sc->sc_intr_htable, sizeof (ddi_intr_handle_t) * - sc->sc_intr_num); - - /* After disabling interrupts, the config offset is non-MSI-X. */ - sc->sc_config_offset = VIRTIO_CONFIG_DEVICE_CONFIG_NOMSIX; -} - -/* - * Module linkage information for the kernel. - */ -static struct modlmisc modlmisc = { - &mod_miscops, /* Type of module */ - "VirtIO common library module", -}; - -static struct modlinkage modlinkage = { - MODREV_1, - { - (void *)&modlmisc, - NULL - } -}; - -int -_init(void) -{ - return (mod_install(&modlinkage)); -} - -int -_fini(void) -{ - return (mod_remove(&modlinkage)); -} - -int -_info(struct modinfo *modinfop) -{ - return (mod_info(&modlinkage, modinfop)); -} diff --git a/usr/src/uts/common/io/virtio/virtio.h b/usr/src/uts/common/io/virtio/virtio.h new file mode 100644 index 0000000000..420f9ccfed --- /dev/null +++ b/usr/src/uts/common/io/virtio/virtio.h @@ -0,0 +1,342 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _VIRTIO_H +#define _VIRTIO_H + +/* + * VIRTIO FRAMEWORK + * + * This framework handles the initialisation and operation common to all Virtio + * device types; e.g., Virtio Block (vioblk), Virtio Network (vioif), etc. 
The + * framework presently provides for what is now described as a "legacy" driver + * in the current issue of the "Virtual I/O Device (VIRTIO) Version 1.1" + * specification. Though several new specifications have been released, legacy + * devices are still the most widely available on current hypervisor platforms. + * Legacy devices make use of the native byte order of the host system. + * + * FRAMEWORK INITIALISATION: STARTING + * + * Client drivers will, in their attach(9E) routine, make an early call to + * virtio_init(). This causes the framework to allocate some base resources + * and begin initialising the device. This routine confirms that the device + * will operate in the supported legacy mode as per the specification. A + * failure here means that we cannot presently support this device. + * + * Once virtio_init() returns, the initialisation phase has begun and the + * driver can examine negotiated features and set up virtqueues. The + * initialisation phase ends when the driver calls either + * virtio_init_complete() or virtio_fini(). + * + * FRAMEWORK INITIALISATION: FEATURE NEGOTIATION + * + * The virtio_init() call accepts a bitmask of desired features that the driver + * supports. The framework will negotiate the common set of features supported + * by both the driver and the device. The presence of any individual feature + * can be tested after the initialisation phase has begun using + * virtio_feature_present(). + * + * The framework will additionally negotiate some set of features that are not + * specific to a device type on behalf of the client driver; e.g., support for + * indirect descriptors. + * + * Some features allow the driver to read additional configuration values from + * the device-specific regions of the device register space. These can be + * accessed via the virtio_dev_get*() and virtio_dev_put*() family of + * functions. + * + * FRAMEWORK INITIALISATION: VIRTQUEUE CONFIGURATION + * + * During the initialisation phase, the client driver may configure some number + * of virtqueues with virtio_queue_alloc(). Once initialisation has been + * completed, no further queues can be configured without destroying the + * framework object and beginning again from scratch. + * + * When configuring a queue, the driver must know the queue index number. This + * generally comes from the section of the specification describing the + * specific device type; e.g., Virtio Network devices have a receive queue at + * index 0, and a transmit queue at index 1. The name given to the queue is + * informational and has no impact on device operation. + * + * Most queues will require an interrupt handler function. When a queue + * notification interrupt is received, the provided handler will be called with + * two arguments: first, the provided user data argument; and second, a pointer + * to the "virtio_t" object for this instance. + * + * A maximum segment count must be selected for each queue. This count is the + * upper bound on the number of scatter-gather cookies that will be accepted, + * and applies to both direct and indirect descriptor based queues. This cap + * is usually either negotiated with the device, or determined structurally + * based on the shape of the buffers required for device operation. + * + * FRAMEWORK INITIALISATION: FINISHING + * + * Once queue configuration has been completed, the client driver calls + * virtio_init_complete() to finalise resource allocation and set the device to + * the running state (DRIVER_OK). 
The framework will allocate any interrupts + * needed for queue notifications at this time. + * + * If the client driver cannot complete initialisation, the instance may + * instead be torn down with virtio_fini(). Signalling failure to this routine + * will report failure to the device instead of resetting it, which may be + * reported by the hypervisor as a fault. + * + * DESCRIPTOR CHAINS + * + * Most devices accept I/O requests from the driver through a least one queue. + * Some devices are operated by submission of synchronous requests. The device + * is expected to process the request and return some kind of status; e.g., a + * block device accepts write requests from the file system and signals when + * they have completed or failed. + * + * Other devices operate by asynchronous delivery of I/O requests to the + * driver; e.g., a network device may receive incoming frames at any time. + * Inbound asynchronous delivery is usually achieved by populating a queue with + * a series of memory buffers where the incoming data will be written by the + * device at some later time. + * + * Whether for inbound or outbound transfers, buffers are inserted into the + * ring through chains of one or more descriptors. Each descriptor has a + * transfer direction (to or from the device), and a physical address and + * length (i.e., a DMA cookie). The framework automatically manages the slight + * differences in operation between direct and indirect descriptor usage on + * behalf of the client driver. + * + * A chain of descriptors is allocated by calling virtio_chain_alloc() against + * a particular queue. This function accepts a kmem flag as per + * kmem_alloc(9F). A client driver specific void pointer may be attached to + * the chain with virtio_chain_data_set() and read back later with + * virtio_chain_data(); e.g., after it is returned by a call to + * virtio_queue_poll(). + * + * Cookies are added to a chain by calling virtio_chain_append() with the + * appropriate physical address and transfer direction. This function may fail + * if the chain is already using the maximum number of cookies for this queue. + * Client drivers are responsible for appropriate use of virtio_dma_sync() + * or ddi_dma_sync(9F) on any memory appended to a descriptor chain prior to + * chain submission. + * + * Once fully constructed and synced, a chain can be submitted to the device by + * calling virtio_chain_submit(). The caller may choose to flush the queue + * contents to the device on each submission, or to batch notifications until + * later to amortise the notification cost over more requests. If batching + * notifications, outstanding submissions can be flushed with a call to + * virtio_queue_flush(). Note that the framework will insert an appropriate + * memory barrier to ensure writes by the driver complete before making the + * submitted descriptor visible to the device. + * + * A chain may be reset for reuse with new cookies by calling + * virtio_chain_clear(). The chain may be freed completely by calling + * virtio_chain_free(). + * + * When a descriptor chain is returned to the driver by the device, it may + * include a received data length value. This value can be accessed via + * virtio_chain_received_length(). There is some suggestion in more recent + * Virtio specifications that, depending on the device type and the hypervisor + * this value may not always be accurate or useful. 
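To make the descriptor chain life cycle described above concrete, the following sketch shows how a client driver might submit a single device-readable buffer and later reclaim completed chains using the interfaces declared later in this header. It is illustrative only and not part of the commit: the DMA attribute template ("my_attr"), the soft state and the helper names are hypothetical, and the DDI_SUCCESS return convention for virtio_chain_append() is assumed from the surrounding code.

/*
 * Illustrative sketch only (not part of this change).  Assumes
 * <sys/ddi.h>, <sys/sunddi.h> and "virtio.h" are included, and that
 * "my_attr" is a driver-defined ddi_dma_attr_t template.
 */
static ddi_dma_attr_t my_attr;		/* hypothetical DMA attributes */

static int
my_submit_buf(virtio_t *vio, virtio_queue_t *viq, size_t len)
{
	virtio_dma_t *dma;
	virtio_chain_t *vic;

	/* Allocate device-readable memory (driver writes, device reads). */
	if ((dma = virtio_dma_alloc(vio, len, &my_attr,
	    DDI_DMA_WRITE | DDI_DMA_CONSISTENT, KM_SLEEP)) == NULL)
		return (DDI_FAILURE);

	if ((vic = virtio_chain_alloc(viq, KM_SLEEP)) == NULL) {
		virtio_dma_free(dma);
		return (DDI_FAILURE);
	}
	virtio_chain_data_set(vic, dma);

	/* Append one descriptor entry per DMA cookie. */
	for (uint_t n = 0; n < virtio_dma_ncookies(dma); n++) {
		if (virtio_chain_append(vic, virtio_dma_cookie_pa(dma, n),
		    virtio_dma_cookie_size(dma, n),
		    VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
			virtio_chain_free(vic);
			virtio_dma_free(dma);
			return (DDI_FAILURE);
		}
	}

	/* Flush CPU writes, then submit and notify the device at once. */
	virtio_dma_sync(dma, DDI_DMA_SYNC_FORDEV);
	virtio_chain_submit(vic, B_TRUE);

	return (DDI_SUCCESS);
}

/* Reclaim completed chains, e.g. from a queue interrupt handler. */
static void
my_reclaim(virtio_queue_t *viq)
{
	virtio_chain_t *vic;

	while ((vic = virtio_queue_poll(viq)) != NULL) {
		virtio_dma_t *dma = virtio_chain_data(vic);

		virtio_dma_sync(dma, DDI_DMA_SYNC_FORKERNEL);
		/* ... consume the returned buffer ... */
		virtio_chain_free(vic);
		virtio_dma_free(dma);
	}
}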
+ * + * VIRTQUEUE OPERATION + * + * The queue size (i.e., the number of direct descriptor entries) can be + * found with virtio_queue_size(). This value is static over the lifetime + * of the queue. + * + * The number of descriptor chains presently submitted to the device and not + * yet returned can be obtained via virtio_queue_nactive(). + * + * Over time the device will return descriptor chains to the driver in response + * to device activity. Any newly returned chains may be retrieved by the + * driver by calling virtio_queue_poll(). See the DESCRIPTOR CHAINS section + * for more detail about managing descriptor chain objects. Note that the + * framework will insert an appropriate memory barrier to ensure that writes by + * the host are complete before returning the chain to the client driver. + * + * The NO_INTERRUPT flag on a queue may be set or cleared with + * virtio_queue_no_interrupt(). Note that this flag is purely advisory, and + * may not actually stop interrupts from the device in a timely fashion. + * + * INTERRUPT MANAGEMENT + * + * A mutex used within an interrupt handler must be initialised with the + * correct interrupt priority. After the initialisation phase is complete, the + * client should use virtio_intr_pri() to get a value suitable to pass to + * mutex_init(9F). + * + * When the driver is ready to receive notifications from the device, the + * virtio_interrupts_enable() routine may be called. Interrupts may be + * disabled again by calling virtio_interrupts_disable(). Interrupt resources + * will be deallocated as part of a subsequent call to virtio_fini(). + * + * DMA MEMORY MANAGEMENT: ALLOCATION AND FREE + * + * Client drivers may allocate memory suitable for communication with the + * device by using virtio_dma_alloc(). This function accepts an allocation + * size, a DMA attribute template, a set of DMA flags, and a kmem flag. + * A "virtio_dma_t" object is returned to track and manage the allocation. + * + * The DMA flags value will be a combination of direction flags (e.g., + * DDI_DMA_READ or DDI_DMA_WRITE) and mapping flags (e.g., DDI_DMA_CONSISTENT + * or DDI_DMA_STREAMING). The kmem flag is either KM_SLEEP or KM_NOSLEEP, + * as described in kmem_alloc(9F). + * + * Memory that is no longer required can be freed using virtio_dma_free(). + * + * DMA MEMORY MANAGEMENT: BINDING WITHOUT ALLOCATION + * + * If another subsystem has loaned memory to your client driver, you may need + * to allocate and bind a handle without additional backing memory. The + * virtio_dma_alloc_nomem() function can be used for this purpose, returning a + * "virtio_dma_t" object. + * + * Once allocated, an arbitrary kernel memory location can be bound for DMA + * with virtio_dma_bind(). The binding can be subsequently undone with + * virtio_dma_unbind(), allowing the "virtio_dma_t" object to be reused for + * another binding. + * + * DMA MEMORY MANAGEMENT: VIRTUAL AND PHYSICAL ADDRESSES + * + * The total size of a mapping (with or without own backing memory) can be + * found with virtio_dma_size(). A void pointer to a kernel virtual address + * within the buffer can be obtained via virtio_dma_va(); this function accepts + * a linear offset into the VA range and performs bounds checking. + * + * The number of physical memory addresses (DMA cookies) can be found with + * virtio_dma_ncookies(). The physical address and length of each cookie can + * be found with virtio_dma_cookie_pa() and virtio_dma_cookie_size(); these + * functions are keyed on the zero-indexed cookie number. 
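The initialisation, virtqueue and interrupt management sections above can be tied together into a single attach(9E)-time sequence. The sketch below is illustrative only and not part of the commit: the soft state, interrupt handler, queue index, segment cap and feature mask are hypothetical, the boolean argument to virtio_queue_alloc() is simply left at B_FALSE, and the second argument to virtio_init_complete() is assumed (from its parameter name) to be a mask of acceptable DDI_INTR_TYPE_* values.

/*
 * Illustrative attach(9E)-time sketch only (not part of this change).
 * Assumes <sys/ddi.h>, <sys/sunddi.h> and "virtio.h" are included.
 */
struct my_softc {			/* hypothetical soft state */
	kmutex_t	sc_lock;
	virtio_t	*sc_virtio;
	virtio_queue_t	*sc_vq;
};

static uint_t
my_rx_intr(caddr_t arg1, caddr_t arg2)
{
	/* arg1 is the user data argument; arg2 is the virtio_t pointer. */
	return (DDI_INTR_CLAIMED);
}

static int
my_attach_virtio(dev_info_t *dip, struct my_softc *sc)
{
	virtio_t *vio;
	virtio_queue_t *viq;

	/* Begin the initialisation phase; no device-specific features. */
	if ((vio = virtio_init(dip, 0, B_TRUE)) == NULL)
		return (DDI_FAILURE);

	/*
	 * Queue index 0, at most 8 cookies per chain; the boolean
	 * argument is left at B_FALSE here as its meaning is not covered
	 * by this excerpt.
	 */
	if ((viq = virtio_queue_alloc(vio, 0, "my_rx", my_rx_intr,
	    (void *)sc, B_FALSE, 8)) == NULL) {
		virtio_fini(vio, B_TRUE);
		return (DDI_FAILURE);
	}

	/*
	 * Finish initialisation; the second argument is assumed to be a
	 * mask of acceptable DDI_INTR_TYPE_* values.
	 */
	if (virtio_init_complete(vio,
	    DDI_INTR_TYPE_MSIX | DDI_INTR_TYPE_FIXED) != DDI_SUCCESS) {
		virtio_fini(vio, B_TRUE);
		return (DDI_FAILURE);
	}

	/* The interrupt priority is only valid once init has completed. */
	mutex_init(&sc->sc_lock, NULL, MUTEX_DRIVER, virtio_intr_pri(vio));

	sc->sc_virtio = vio;
	sc->sc_vq = viq;

	return (virtio_interrupts_enable(vio));
}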
+ * + * DMA MEMORY MANAGEMENT: SYNCHRONISATION + * + * When passing memory to the device, or reading memory returned from the + * device, DMA synchronisation must be performed in case it is required by the + * underlying platform. A convenience wrapper exists: virtio_dma_sync(). This + * routine synchronises the entire binding and accepts the same synchronisation + * type values as ddi_dma_sync(9F). + * + * QUIESCE + * + * As quiesce(9E) merely requires that the device come to a complete stop, most + * client drivers will be able to call virtio_quiesce() without additional + * actions. This will reset the device, immediately halting all queue + * activity, and return a value suitable for returning from the client driver + * quiesce(9E) entrypoint. This routine must only be called from quiesce + * context as it performs no synchronisation with other threads. + * + * DETACH + * + * Some devices are effectively long-polled; that is, they submit some number + * of descriptor chains to the device that are not returned to the driver until + * some asynchronous event occurs such as the receipt of an incoming packet or + * a device hot plug event. When detaching the device the return of these + * outstanding buffers must be arranged. Some device types may have task + * management commands that can force the orderly return of these chains, but + * the only way to do so uniformly is to reset the device and claw back the + * memory. + * + * If the client driver has outstanding descriptors and needs a hard stop on + * device activity it can call virtio_shutdown(). This routine will bring + * queue processing to an orderly stop and then reset the device, causing it to + * cease use of any DMA resources. Once this function returns, the driver may + * call virtio_queue_evacuate() on each queue to retrieve any previously + * submitted chains. + * + * To tear down resources (e.g., interrupts and allocated memory) the client + * driver must finally call virtio_fini(). If virtio_shutdown() was not + * needed, this routine will also reset the device. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct virtio virtio_t; +typedef struct virtio_queue virtio_queue_t; +typedef struct virtio_chain virtio_chain_t; +typedef struct virtio_dma virtio_dma_t; + +typedef enum virtio_direction { + /* + * In the base specification, a descriptor is either set up to be + * written by the device or to be read by the device, but not both. 
+ */ + VIRTIO_DIR_DEVICE_WRITES = 1, + VIRTIO_DIR_DEVICE_READS +} virtio_direction_t; + +void virtio_fini(virtio_t *, boolean_t); +virtio_t *virtio_init(dev_info_t *, uint64_t, boolean_t); +int virtio_init_complete(virtio_t *, int); +int virtio_quiesce(virtio_t *); +void virtio_shutdown(virtio_t *); + +void *virtio_intr_pri(virtio_t *); + +void virtio_device_reset(virtio_t *); + +uint8_t virtio_dev_get8(virtio_t *, uintptr_t); +uint16_t virtio_dev_get16(virtio_t *, uintptr_t); +uint32_t virtio_dev_get32(virtio_t *, uintptr_t); +uint64_t virtio_dev_get64(virtio_t *, uintptr_t); + +void virtio_dev_put8(virtio_t *, uintptr_t, uint8_t); +void virtio_dev_put16(virtio_t *, uintptr_t, uint16_t); +void virtio_dev_put32(virtio_t *, uintptr_t, uint32_t); + +boolean_t virtio_feature_present(virtio_t *, uint64_t); + +virtio_queue_t *virtio_queue_alloc(virtio_t *, uint16_t, const char *, + ddi_intr_handler_t *, void *, boolean_t, uint_t); + +virtio_chain_t *virtio_queue_poll(virtio_queue_t *); +virtio_chain_t *virtio_queue_evacuate(virtio_queue_t *); +void virtio_queue_flush(virtio_queue_t *); +void virtio_queue_no_interrupt(virtio_queue_t *, boolean_t); +uint_t virtio_queue_nactive(virtio_queue_t *); +uint_t virtio_queue_size(virtio_queue_t *); + +virtio_chain_t *virtio_chain_alloc(virtio_queue_t *, int); +void virtio_chain_clear(virtio_chain_t *); +void virtio_chain_free(virtio_chain_t *); +int virtio_chain_append(virtio_chain_t *, uint64_t, size_t, virtio_direction_t); + +void *virtio_chain_data(virtio_chain_t *); +void virtio_chain_data_set(virtio_chain_t *, void *); + +void virtio_chain_submit(virtio_chain_t *, boolean_t); +size_t virtio_chain_received_length(virtio_chain_t *); + +int virtio_interrupts_enable(virtio_t *); +void virtio_interrupts_disable(virtio_t *); + +virtio_dma_t *virtio_dma_alloc(virtio_t *, size_t, const ddi_dma_attr_t *, int, + int); +virtio_dma_t *virtio_dma_alloc_nomem(virtio_t *, const ddi_dma_attr_t *, int); +void virtio_dma_free(virtio_dma_t *); +int virtio_dma_bind(virtio_dma_t *, void *, size_t, int, int); +void virtio_dma_unbind(virtio_dma_t *); +void virtio_dma_sync(virtio_dma_t *, int); + +void *virtio_dma_va(virtio_dma_t *, size_t); +size_t virtio_dma_size(virtio_dma_t *); +uint_t virtio_dma_ncookies(virtio_dma_t *); +uint64_t virtio_dma_cookie_pa(virtio_dma_t *, uint_t); +size_t virtio_dma_cookie_size(virtio_dma_t *, uint_t); + + +#ifdef __cplusplus +} +#endif + +#endif /* _VIRTIO_H */ diff --git a/usr/src/uts/common/io/virtio/virtio_dma.c b/usr/src/uts/common/io/virtio/virtio_dma.c new file mode 100644 index 0000000000..81972b5402 --- /dev/null +++ b/usr/src/uts/common/io/virtio/virtio_dma.c @@ -0,0 +1,295 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +/* + * VIRTIO FRAMEWORK: DMA ROUTINES + * + * For design and usage documentation, see the comments in "virtio.h". 
+ */ + +#include <sys/conf.h> +#include <sys/kmem.h> +#include <sys/debug.h> +#include <sys/modctl.h> +#include <sys/autoconf.h> +#include <sys/ddi_impldefs.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/sunndi.h> +#include <sys/avintr.h> +#include <sys/spl.h> +#include <sys/promif.h> +#include <sys/list.h> +#include <sys/bootconf.h> +#include <sys/bootsvcs.h> +#include <sys/sysmacros.h> +#include <sys/pci.h> + +#include "virtio.h" +#include "virtio_impl.h" + + + +void +virtio_dma_sync(virtio_dma_t *vidma, int flag) +{ + VERIFY0(ddi_dma_sync(vidma->vidma_dma_handle, 0, 0, flag)); +} + +uint_t +virtio_dma_ncookies(virtio_dma_t *vidma) +{ + return (vidma->vidma_dma_ncookies); +} + +size_t +virtio_dma_size(virtio_dma_t *vidma) +{ + return (vidma->vidma_size); +} + +void * +virtio_dma_va(virtio_dma_t *vidma, size_t offset) +{ + VERIFY3U(offset, <, vidma->vidma_size); + + return (vidma->vidma_va + offset); +} + +uint64_t +virtio_dma_cookie_pa(virtio_dma_t *vidma, uint_t cookie) +{ + VERIFY3U(cookie, <, vidma->vidma_dma_ncookies); + + return (vidma->vidma_dma_cookies[cookie].dmac_laddress); +} + +size_t +virtio_dma_cookie_size(virtio_dma_t *vidma, uint_t cookie) +{ + VERIFY3U(cookie, <, vidma->vidma_dma_ncookies); + + return (vidma->vidma_dma_cookies[cookie].dmac_size); +} + +int +virtio_dma_init_handle(virtio_t *vio, virtio_dma_t *vidma, + const ddi_dma_attr_t *attr, int kmflags) +{ + int r; + dev_info_t *dip = vio->vio_dip; + + VERIFY(kmflags == KM_SLEEP || kmflags == KM_NOSLEEP); + int (*dma_wait)(caddr_t) = (kmflags == KM_SLEEP) ? DDI_DMA_SLEEP : + DDI_DMA_DONTWAIT; + + vidma->vidma_virtio = vio; + + /* + * Ensure we don't try to allocate a second time using the same + * tracking object. + */ + VERIFY0(vidma->vidma_level); + + if ((r = ddi_dma_alloc_handle(dip, (ddi_dma_attr_t *)attr, dma_wait, + NULL, &vidma->vidma_dma_handle)) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "DMA handle allocation failed (%x)", r); + goto fail; + } + vidma->vidma_level |= VIRTIO_DMALEVEL_HANDLE_ALLOC; + + return (DDI_SUCCESS); + +fail: + virtio_dma_fini(vidma); + return (DDI_FAILURE); +} + +int +virtio_dma_init(virtio_t *vio, virtio_dma_t *vidma, size_t sz, + const ddi_dma_attr_t *attr, int dmaflags, int kmflags) +{ + int r; + dev_info_t *dip = vio->vio_dip; + caddr_t va = NULL; + + VERIFY(kmflags == KM_SLEEP || kmflags == KM_NOSLEEP); + int (*dma_wait)(caddr_t) = (kmflags == KM_SLEEP) ? DDI_DMA_SLEEP : + DDI_DMA_DONTWAIT; + + if (virtio_dma_init_handle(vio, vidma, attr, kmflags) != + DDI_SUCCESS) { + goto fail; + } + + if ((r = ddi_dma_mem_alloc(vidma->vidma_dma_handle, sz, + &virtio_acc_attr, + dmaflags & (DDI_DMA_STREAMING | DDI_DMA_CONSISTENT), + dma_wait, NULL, &va, &vidma->vidma_real_size, + &vidma->vidma_acc_handle)) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "DMA memory allocation failed (%x)", r); + goto fail; + } + vidma->vidma_level |= VIRTIO_DMALEVEL_MEMORY_ALLOC; + + /* + * Zero the memory to avoid accidental exposure of arbitrary kernel + * memory. + */ + bzero(va, vidma->vidma_real_size); + + if (virtio_dma_bind(vidma, va, sz, dmaflags, kmflags) != DDI_SUCCESS) { + goto fail; + } + + return (DDI_SUCCESS); + +fail: + virtio_dma_fini(vidma); + return (DDI_FAILURE); +} + +int +virtio_dma_bind(virtio_dma_t *vidma, void *va, size_t sz, int dmaflags, + int kmflags) +{ + int r; + dev_info_t *dip = vidma->vidma_virtio->vio_dip; + ddi_dma_cookie_t dmac; + + VERIFY(kmflags == KM_SLEEP || kmflags == KM_NOSLEEP); + int (*dma_wait)(caddr_t) = (kmflags == KM_SLEEP) ? 
DDI_DMA_SLEEP : + DDI_DMA_DONTWAIT; + + VERIFY(vidma->vidma_level & VIRTIO_DMALEVEL_HANDLE_ALLOC); + VERIFY(!(vidma->vidma_level & VIRTIO_DMALEVEL_HANDLE_BOUND)); + + vidma->vidma_va = va; + vidma->vidma_size = sz; + + if ((r = ddi_dma_addr_bind_handle(vidma->vidma_dma_handle, NULL, + vidma->vidma_va, vidma->vidma_size, dmaflags, dma_wait, NULL, + &dmac, &vidma->vidma_dma_ncookies)) != DDI_DMA_MAPPED) { + VERIFY3S(r, !=, DDI_DMA_PARTIAL_MAP); + dev_err(dip, CE_WARN, "DMA handle bind failed (%x)", r); + goto fail; + } + vidma->vidma_level |= VIRTIO_DMALEVEL_HANDLE_BOUND; + + if ((vidma->vidma_dma_cookies = kmem_alloc( + vidma->vidma_dma_ncookies * sizeof (ddi_dma_cookie_t), + kmflags)) == NULL) { + dev_err(dip, CE_WARN, "DMA cookie array allocation failure"); + goto fail; + } + vidma->vidma_level |= VIRTIO_DMALEVEL_COOKIE_ARRAY; + + vidma->vidma_dma_cookies[0] = dmac; + for (uint_t n = 1; n < vidma->vidma_dma_ncookies; n++) { + ddi_dma_nextcookie(vidma->vidma_dma_handle, + &vidma->vidma_dma_cookies[n]); + } + + return (DDI_SUCCESS); + +fail: + virtio_dma_unbind(vidma); + return (DDI_FAILURE); +} + +virtio_dma_t * +virtio_dma_alloc(virtio_t *vio, size_t sz, const ddi_dma_attr_t *attr, + int dmaflags, int kmflags) +{ + virtio_dma_t *vidma; + + if ((vidma = kmem_zalloc(sizeof (*vidma), kmflags)) == NULL) { + return (NULL); + } + + if (virtio_dma_init(vio, vidma, sz, attr, dmaflags, kmflags) != + DDI_SUCCESS) { + kmem_free(vidma, sizeof (*vidma)); + return (NULL); + } + + return (vidma); +} + +virtio_dma_t * +virtio_dma_alloc_nomem(virtio_t *vio, const ddi_dma_attr_t *attr, int kmflags) +{ + virtio_dma_t *vidma; + + if ((vidma = kmem_zalloc(sizeof (*vidma), kmflags)) == NULL) { + return (NULL); + } + + if (virtio_dma_init_handle(vio, vidma, attr, kmflags) != DDI_SUCCESS) { + kmem_free(vidma, sizeof (*vidma)); + return (NULL); + } + + return (vidma); +} + +void +virtio_dma_fini(virtio_dma_t *vidma) +{ + virtio_dma_unbind(vidma); + + if (vidma->vidma_level & VIRTIO_DMALEVEL_MEMORY_ALLOC) { + ddi_dma_mem_free(&vidma->vidma_acc_handle); + + vidma->vidma_level &= ~VIRTIO_DMALEVEL_MEMORY_ALLOC; + } + + if (vidma->vidma_level & VIRTIO_DMALEVEL_HANDLE_ALLOC) { + ddi_dma_free_handle(&vidma->vidma_dma_handle); + + vidma->vidma_level &= ~VIRTIO_DMALEVEL_HANDLE_ALLOC; + } + + VERIFY0(vidma->vidma_level); + bzero(vidma, sizeof (*vidma)); +} + +void +virtio_dma_unbind(virtio_dma_t *vidma) +{ + if (vidma->vidma_level & VIRTIO_DMALEVEL_COOKIE_ARRAY) { + kmem_free(vidma->vidma_dma_cookies, + vidma->vidma_dma_ncookies * sizeof (ddi_dma_cookie_t)); + + vidma->vidma_level &= ~VIRTIO_DMALEVEL_COOKIE_ARRAY; + } + + if (vidma->vidma_level & VIRTIO_DMALEVEL_HANDLE_BOUND) { + VERIFY3U(ddi_dma_unbind_handle(vidma->vidma_dma_handle), ==, + DDI_SUCCESS); + + vidma->vidma_level &= ~VIRTIO_DMALEVEL_HANDLE_BOUND; + } + + vidma->vidma_va = 0; + vidma->vidma_size = 0; +} + +void +virtio_dma_free(virtio_dma_t *vidma) +{ + virtio_dma_fini(vidma); + kmem_free(vidma, sizeof (*vidma)); +} diff --git a/usr/src/uts/common/io/virtio/virtio_impl.h b/usr/src/uts/common/io/virtio/virtio_impl.h new file mode 100644 index 0000000000..518667c7f4 --- /dev/null +++ b/usr/src/uts/common/io/virtio/virtio_impl.h @@ -0,0 +1,368 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _VIRTIO_IMPL_H +#define _VIRTIO_IMPL_H + +/* + * VIRTIO FRAMEWORK: FRAMEWORK-PRIVATE DEFINITIONS + * + * For design and usage documentation, see the comments in "virtio.h". + * + * NOTE: Client drivers should not use definitions from this file. + */ + +#include <sys/types.h> +#include <sys/dditypes.h> +#include <sys/list.h> +#include <sys/ccompile.h> + +#include "virtio.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern ddi_device_acc_attr_t virtio_acc_attr; +extern ddi_dma_attr_t virtio_dma_attr; + +typedef struct virtio_vq_desc virtio_vq_desc_t; +typedef struct virtio_vq_driver virtio_vq_driver_t; +typedef struct virtio_vq_device virtio_vq_device_t; +typedef struct virtio_vq_elem virtio_vq_elem_t; + +int virtio_dma_init(virtio_t *, virtio_dma_t *, size_t, const ddi_dma_attr_t *, + int, int); +void virtio_dma_fini(virtio_dma_t *); + + + +typedef enum virtio_dma_level { + VIRTIO_DMALEVEL_HANDLE_ALLOC = (1ULL << 0), + VIRTIO_DMALEVEL_MEMORY_ALLOC = (1ULL << 1), + VIRTIO_DMALEVEL_HANDLE_BOUND = (1ULL << 2), + VIRTIO_DMALEVEL_COOKIE_ARRAY = (1ULL << 3), +} virtio_dma_level_t; + +struct virtio_dma { + virtio_dma_level_t vidma_level; + virtio_t *vidma_virtio; + caddr_t vidma_va; + size_t vidma_size; + size_t vidma_real_size; + ddi_dma_handle_t vidma_dma_handle; + ddi_acc_handle_t vidma_acc_handle; + uint_t vidma_dma_ncookies; + ddi_dma_cookie_t *vidma_dma_cookies; +}; + +typedef enum virtio_initlevel { + VIRTIO_INITLEVEL_REGS = (1ULL << 0), + VIRTIO_INITLEVEL_PROVIDER = (1ULL << 1), + VIRTIO_INITLEVEL_INT_ALLOC = (1ULL << 2), + VIRTIO_INITLEVEL_INT_ADDED = (1ULL << 3), + VIRTIO_INITLEVEL_INT_ENABLED = (1ULL << 4), + VIRTIO_INITLEVEL_SHUTDOWN = (1ULL << 5), +} virtio_initlevel_t; + +struct virtio { + dev_info_t *vio_dip; + + kmutex_t vio_mutex; + + virtio_initlevel_t vio_initlevel; + + list_t vio_queues; + + ddi_acc_handle_t vio_barh; + caddr_t vio_bar; + uint_t vio_config_offset; + + uint32_t vio_features; + uint32_t vio_features_device; + + ddi_intr_handle_t *vio_interrupts; + int vio_ninterrupts; + int vio_interrupt_type; + int vio_interrupt_cap; + uint_t vio_interrupt_priority; +}; + +struct virtio_queue { + virtio_t *viq_virtio; + kmutex_t viq_mutex; + const char *viq_name; + list_node_t viq_link; + + boolean_t viq_shutdown; + boolean_t viq_indirect; + uint_t viq_max_segs; + + /* + * Each Virtio device type has some set of queues for data transfer to + * and from the host. This index is described in the specification for + * the particular device and queue type, and written to QUEUE_SELECT to + * allow interaction with the queue. For example, a network device has + * at least a receive queue with index 0, and a transmit queue with + * index 1. + */ + uint16_t viq_index; + + /* + * For legacy Virtio devices, the size and shape of the queue is + * determined entirely by the number of queue entries. + */ + uint16_t viq_size; + id_space_t *viq_descmap; + + /* + * The memory shared between the device and the driver is allocated as + * a large phyisically contiguous chunk. Access to this area is + * through three pointers to packed structures. + */ + virtio_dma_t viq_dma; + virtio_vq_desc_t *viq_dma_descs; + virtio_vq_driver_t *viq_dma_driver; + virtio_vq_device_t *viq_dma_device; + + uint16_t viq_device_index; + uint16_t viq_driver_index; + + /* + * Interrupt handler function, or NULL if not provided. 
+ */ + ddi_intr_handler_t *viq_func; + void *viq_funcarg; + boolean_t viq_handler_added; + uint_t viq_handler_index; + + /* + * When a chain is submitted to the queue, it is also stored in this + * AVL tree keyed by the index of the first descriptor in the chain. + */ + avl_tree_t viq_inflight; +}; + +struct virtio_chain { + virtio_queue_t *vic_vq; + avl_node_t vic_node; + + void *vic_data; + + uint16_t vic_head; + uint32_t vic_received_length; + + virtio_dma_t vic_indirect_dma; + uint_t vic_indirect_capacity; + uint_t vic_indirect_used; + + uint_t vic_direct_capacity; + uint_t vic_direct_used; + uint16_t vic_direct[]; +}; + +/* + * PACKED STRUCTS FOR DEVICE ACCESS + */ + +struct virtio_vq_desc { + /* + * Buffer physical address and length. + */ + uint64_t vqd_addr; + uint32_t vqd_len; + + /* + * Flags. Use with the VIRTQ_DESC_F_* family of constants. See below. + */ + uint16_t vqd_flags; + + /* + * If VIRTQ_DESC_F_NEXT is set in flags, this refers to the next + * descriptor in the chain by table index. + */ + uint16_t vqd_next; +} __packed; + +/* + * VIRTIO DESCRIPTOR FLAGS (vqd_flags) + */ + +/* + * NEXT: + * Signals that this descriptor (direct or indirect) is part of a chain. + * If populated, "vqd_next" names the next descriptor in the chain by its + * table index. + */ +#define VIRTQ_DESC_F_NEXT (1 << 0) + +/* + * WRITE: + * Determines whether this buffer is to be written by the device (WRITE is + * set) or by the driver (WRITE is not set). + */ +#define VIRTQ_DESC_F_WRITE (1 << 1) + +/* + * INDIRECT: + * This bit signals that a direct descriptor refers to an indirect + * descriptor list, rather than directly to a buffer. This bit may only + * be used in a direct descriptor; indirect descriptors are not allowed to + * refer to additional layers of indirect tables. If this bit is set, + * NEXT must be clear; indirect descriptors may not be chained. + */ +#define VIRTQ_DESC_F_INDIRECT (1 << 2) + +/* + * This structure is variously known as the "available" or "avail" ring, or the + * driver-owned portion of the queue structure. It is used by the driver to + * submit descriptor chains to the device. + */ +struct virtio_vq_driver { + uint16_t vqdr_flags; + uint16_t vqdr_index; + uint16_t vqdr_ring[]; +} __packed; + +#define VIRTQ_AVAIL_F_NO_INTERRUPT (1 << 0) + +/* + * We use the sizeof operator on this packed struct to calculate the offset of + * subsequent structs. Ensure the compiler is not adding any padding to the + * end of the struct. + */ +CTASSERT(sizeof (virtio_vq_driver_t) == + offsetof(virtio_vq_driver_t, vqdr_ring)); + +struct virtio_vq_elem { + /* + * The device returns chains of descriptors by specifying the table + * index of the first descriptor in the chain. + */ + uint32_t vqe_start; + uint32_t vqe_len; +} __packed; + +/* + * This structure is variously known as the "used" ring, or the device-owned + * portion of the queue structure. It is used by the device to return + * completed descriptor chains to the device. + */ +struct virtio_vq_device { + uint16_t vqde_flags; + uint16_t vqde_index; + virtio_vq_elem_t vqde_ring[]; +} __packed; + +#define VIRTQ_USED_F_NO_NOTIFY (1 << 0) + +/* + * BASIC CONFIGURATION + * + * Legacy devices expose both their generic and their device-specific + * configuration through PCI BAR0. This is the second entry in the register + * address space set for these devices. 
+ */ +#define VIRTIO_LEGACY_PCI_BAR0 1 + +/* + * These are offsets into the base configuration space available through the + * virtio_get*() and virtio_put*() family of functions. These offsets are for + * what the specification describes as the "legacy" mode of device operation. + */ +#define VIRTIO_LEGACY_FEATURES_DEVICE 0x00 /* 32 R */ +#define VIRTIO_LEGACY_FEATURES_DRIVER 0x04 /* 32 R/W */ +#define VIRTIO_LEGACY_QUEUE_ADDRESS 0x08 /* 32 R/W */ +#define VIRTIO_LEGACY_QUEUE_SIZE 0x0C /* 16 R */ +#define VIRTIO_LEGACY_QUEUE_SELECT 0x0E /* 16 R/W */ +#define VIRTIO_LEGACY_QUEUE_NOTIFY 0x10 /* 16 R/W */ +#define VIRTIO_LEGACY_DEVICE_STATUS 0x12 /* 8 R/W */ +#define VIRTIO_LEGACY_ISR_STATUS 0x13 /* 8 R */ + +#define VIRTIO_LEGACY_MSIX_CONFIG 0x14 /* 16 R/W */ +#define VIRTIO_LEGACY_MSIX_QUEUE 0x16 /* 16 R/W */ + +#define VIRTIO_LEGACY_CFG_OFFSET (VIRTIO_LEGACY_ISR_STATUS + 1) +#define VIRTIO_LEGACY_CFG_OFFSET_MSIX (VIRTIO_LEGACY_MSIX_QUEUE + 2) + +#define VIRTIO_LEGACY_MSI_NO_VECTOR 0xFFFF + +/* + * Bits in the Device Status byte (VIRTIO_LEGACY_DEVICE_STATUS): + */ +#define VIRTIO_STATUS_RESET 0 +#define VIRTIO_STATUS_ACKNOWLEDGE (1 << 0) +#define VIRTIO_STATUS_DRIVER (1 << 1) +#define VIRTIO_STATUS_DRIVER_OK (1 << 2) +#define VIRTIO_STATUS_FAILED (1 << 7) + +/* + * Bits in the Interrupt Service Routine Status byte + * (VIRTIO_LEGACY_ISR_STATUS): + */ +#define VIRTIO_ISR_CHECK_QUEUES (1 << 0) +#define VIRTIO_ISR_CHECK_CONFIG (1 << 1) + +/* + * Bits in the Features fields (VIRTIO_LEGACY_FEATURES_DEVICE, + * VIRTIO_LEGACY_FEATURES_DRIVER): + */ +#define VIRTIO_F_RING_INDIRECT_DESC (1ULL << 28) + +/* + * For devices operating in the legacy mode, virtqueues must be aligned on a + * "page size" of 4096 bytes; this is also called the "Queue Align" value in + * newer versions of the specification. + */ +#define VIRTIO_PAGE_SHIFT 12 +#define VIRTIO_PAGE_SIZE (1 << VIRTIO_PAGE_SHIFT) +CTASSERT(VIRTIO_PAGE_SIZE == 4096); +CTASSERT(ISP2(VIRTIO_PAGE_SIZE)); + +/* + * DMA SYNCHRONISATION WRAPPERS + */ + +/* + * Synchronise the driver-owned portion of the queue so that the device can see + * our writes. This covers the memory accessed via the "viq_dma_descs" and + * "viq_dma_device" members. + */ +#define VIRTQ_DMA_SYNC_FORDEV(viq) VERIFY0(ddi_dma_sync( \ + (viq)->viq_dma.vidma_dma_handle, \ + 0, \ + (uintptr_t)(viq)->viq_dma_device - \ + (uintptr_t)(viq)->viq_dma_descs, \ + DDI_DMA_SYNC_FORDEV)) + +/* + * Synchronise the device-owned portion of the queue so that we can see any + * writes from the device. This covers the memory accessed via the + * "viq_dma_device" member. + */ +#define VIRTQ_DMA_SYNC_FORKERNEL(viq) VERIFY0(ddi_dma_sync( \ + (viq)->viq_dma.vidma_dma_handle, \ + (uintptr_t)(viq)->viq_dma_device - \ + (uintptr_t)(viq)->viq_dma_descs, \ + (viq)->viq_dma.vidma_size - \ + (uintptr_t)(viq)->viq_dma_device - \ + (uintptr_t)(viq)->viq_dma_descs, \ + DDI_DMA_SYNC_FORKERNEL)) + +#ifdef __cplusplus +} +#endif + +#endif /* _VIRTIO_IMPL_H */ diff --git a/usr/src/uts/common/io/virtio/virtio_main.c b/usr/src/uts/common/io/virtio/virtio_main.c new file mode 100644 index 0000000000..be92dacfba --- /dev/null +++ b/usr/src/uts/common/io/virtio/virtio_main.c @@ -0,0 +1,1730 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +/* + * VIRTIO FRAMEWORK + * + * For design and usage documentation, see the comments in "virtio.h". + */ + +#include <sys/conf.h> +#include <sys/kmem.h> +#include <sys/debug.h> +#include <sys/modctl.h> +#include <sys/autoconf.h> +#include <sys/ddi_impldefs.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/sunndi.h> +#include <sys/avintr.h> +#include <sys/spl.h> +#include <sys/promif.h> +#include <sys/list.h> +#include <sys/bootconf.h> +#include <sys/bootsvcs.h> +#include <sys/sysmacros.h> +#include <sys/pci.h> + +#include "virtio.h" +#include "virtio_impl.h" + + +/* + * Linkage structures + */ +static struct modlmisc virtio_modlmisc = { + .misc_modops = &mod_miscops, + .misc_linkinfo = "VIRTIO common routines", +}; + +static struct modlinkage virtio_modlinkage = { + .ml_rev = MODREV_1, + .ml_linkage = { &virtio_modlmisc, NULL } +}; + +int +_init(void) +{ + return (mod_install(&virtio_modlinkage)); +} + +int +_fini(void) +{ + return (mod_remove(&virtio_modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&virtio_modlinkage, modinfop)); +} + + + +static void virtio_set_status(virtio_t *, uint8_t); +static int virtio_chain_append_impl(virtio_chain_t *, uint64_t, size_t, + uint16_t); +static int virtio_interrupts_setup(virtio_t *, int); +static void virtio_interrupts_teardown(virtio_t *); +static void virtio_interrupts_disable_locked(virtio_t *); +static void virtio_queue_free(virtio_queue_t *); + +/* + * We use the same device access attributes for BAR mapping and access to the + * virtqueue memory. + */ +ddi_device_acc_attr_t virtio_acc_attr = { + .devacc_attr_version = DDI_DEVICE_ATTR_V1, + .devacc_attr_endian_flags = DDI_NEVERSWAP_ACC, + .devacc_attr_dataorder = DDI_STORECACHING_OK_ACC, + .devacc_attr_access = DDI_DEFAULT_ACC +}; + + +/* + * DMA attributes for the memory given to the device for queue management. + */ +ddi_dma_attr_t virtio_dma_attr_queue = { + .dma_attr_version = DMA_ATTR_V0, + .dma_attr_addr_lo = 0x0000000000000000, + /* + * Queue memory is aligned on VIRTIO_PAGE_SIZE with the address shifted + * down by VIRTIO_PAGE_SHIFT before being passed to the device in a + * 32-bit register. + */ + .dma_attr_addr_hi = 0x00000FFFFFFFF000, + .dma_attr_count_max = 0x00000000FFFFFFFF, + .dma_attr_align = VIRTIO_PAGE_SIZE, + .dma_attr_burstsizes = 1, + .dma_attr_minxfer = 1, + .dma_attr_maxxfer = 0x00000000FFFFFFFF, + .dma_attr_seg = 0x00000000FFFFFFFF, + .dma_attr_sgllen = 1, + .dma_attr_granular = 1, + .dma_attr_flags = 0 +}; + +/* + * DMA attributes for the the allocation of indirect descriptor lists. The + * indirect list is referenced by a regular descriptor entry: the physical + * address field is 64 bits wide, but the length field is only 32 bits. Each + * descriptor is 16 bytes long. 
+ */ +ddi_dma_attr_t virtio_dma_attr_indirect = { + .dma_attr_version = DMA_ATTR_V0, + .dma_attr_addr_lo = 0x0000000000000000, + .dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF, + .dma_attr_count_max = 0x00000000FFFFFFFF, + .dma_attr_align = sizeof (struct virtio_vq_desc), + .dma_attr_burstsizes = 1, + .dma_attr_minxfer = 1, + .dma_attr_maxxfer = 0x00000000FFFFFFFF, + .dma_attr_seg = 0x00000000FFFFFFFF, + .dma_attr_sgllen = 1, + .dma_attr_granular = 1, + .dma_attr_flags = 0 +}; + + +uint8_t +virtio_get8(virtio_t *vio, uintptr_t offset) +{ + return (ddi_get8(vio->vio_barh, (uint8_t *)(vio->vio_bar + offset))); +} + +uint16_t +virtio_get16(virtio_t *vio, uintptr_t offset) +{ + return (ddi_get16(vio->vio_barh, (uint16_t *)(vio->vio_bar + offset))); +} + +uint32_t +virtio_get32(virtio_t *vio, uintptr_t offset) +{ + return (ddi_get32(vio->vio_barh, (uint32_t *)(vio->vio_bar + offset))); +} + +void +virtio_put8(virtio_t *vio, uintptr_t offset, uint8_t value) +{ + ddi_put8(vio->vio_barh, (uint8_t *)(vio->vio_bar + offset), value); +} + +void +virtio_put16(virtio_t *vio, uintptr_t offset, uint16_t value) +{ + ddi_put16(vio->vio_barh, (uint16_t *)(vio->vio_bar + offset), value); +} + +void +virtio_put32(virtio_t *vio, uintptr_t offset, uint32_t value) +{ + ddi_put32(vio->vio_barh, (uint32_t *)(vio->vio_bar + offset), value); +} + +void +virtio_fini(virtio_t *vio, boolean_t failed) +{ + mutex_enter(&vio->vio_mutex); + + virtio_interrupts_teardown(vio); + + virtio_queue_t *viq; + while ((viq = list_remove_head(&vio->vio_queues)) != NULL) { + virtio_queue_free(viq); + } + list_destroy(&vio->vio_queues); + + if (failed) { + /* + * Signal to the host that device setup failed. + */ + virtio_set_status(vio, VIRTIO_STATUS_FAILED); + } else { + virtio_device_reset(vio); + } + + /* + * We don't need to do anything for the provider initlevel, as it + * merely records the fact that virtio_init_complete() was called. + */ + vio->vio_initlevel &= ~VIRTIO_INITLEVEL_PROVIDER; + + if (vio->vio_initlevel & VIRTIO_INITLEVEL_REGS) { + /* + * Unmap PCI BAR0. + */ + ddi_regs_map_free(&vio->vio_barh); + + vio->vio_initlevel &= ~VIRTIO_INITLEVEL_REGS; + } + + /* + * Ensure we have torn down everything we set up. + */ + VERIFY0(vio->vio_initlevel); + + mutex_exit(&vio->vio_mutex); + mutex_destroy(&vio->vio_mutex); + + kmem_free(vio, sizeof (*vio)); +} + +/* + * Early device initialisation for legacy (pre-1.0 specification) virtio + * devices. + */ +virtio_t * +virtio_init(dev_info_t *dip, uint64_t driver_features, boolean_t allow_indirect) +{ + int r; + + /* + * First, confirm that this is a legacy device. + */ + ddi_acc_handle_t pci; + if (pci_config_setup(dip, &pci) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "pci_config_setup failed"); + return (NULL); + } + + uint8_t revid; + if ((revid = pci_config_get8(pci, PCI_CONF_REVID)) == PCI_EINVAL8) { + dev_err(dip, CE_WARN, "could not read config space"); + pci_config_teardown(&pci); + return (NULL); + } + + pci_config_teardown(&pci); + + /* + * The legacy specification requires that the device advertise as PCI + * Revision 0. + */ + if (revid != 0) { + dev_err(dip, CE_WARN, "PCI Revision %u incorrect for " + "legacy virtio device", (uint_t)revid); + return (NULL); + } + + virtio_t *vio = kmem_zalloc(sizeof (*vio), KM_SLEEP); + vio->vio_dip = dip; + + /* + * Map PCI BAR0 for legacy device access. 
+ */ + if ((r = ddi_regs_map_setup(dip, VIRTIO_LEGACY_PCI_BAR0, + (caddr_t *)&vio->vio_bar, 0, 0, &virtio_acc_attr, + &vio->vio_barh)) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "ddi_regs_map_setup failure (%d)", r); + kmem_free(vio, sizeof (*vio)); + return (NULL); + } + vio->vio_initlevel |= VIRTIO_INITLEVEL_REGS; + + /* + * We initialise the mutex without an interrupt priority to ease the + * implementation of some of the configuration space access routines. + * Drivers using the virtio framework MUST make a call to + * "virtio_init_complete()" prior to spawning other threads or enabling + * interrupt handlers, at which time we will destroy and reinitialise + * the mutex for use in our interrupt handlers. + */ + mutex_init(&vio->vio_mutex, NULL, MUTEX_DRIVER, NULL); + + list_create(&vio->vio_queues, sizeof (virtio_queue_t), + offsetof(virtio_queue_t, viq_link)); + + /* + * Legacy virtio devices require a few common steps before we can + * negotiate device features. + */ + virtio_device_reset(vio); + virtio_set_status(vio, VIRTIO_STATUS_ACKNOWLEDGE); + virtio_set_status(vio, VIRTIO_STATUS_DRIVER); + + /* + * Negotiate features with the device. Record the original supported + * feature set for debugging purposes. + */ + vio->vio_features_device = virtio_get32(vio, + VIRTIO_LEGACY_FEATURES_DEVICE); + if (allow_indirect) { + driver_features |= VIRTIO_F_RING_INDIRECT_DESC; + } + vio->vio_features = vio->vio_features_device & driver_features; + virtio_put32(vio, VIRTIO_LEGACY_FEATURES_DRIVER, vio->vio_features); + + /* + * The device-specific configuration begins at an offset into the BAR + * that depends on whether we have enabled MSI-X interrupts or not. + * Start out with the offset for pre-MSI-X operation so that we can + * read device configuration space prior to configuring interrupts. + */ + vio->vio_config_offset = VIRTIO_LEGACY_CFG_OFFSET; + + return (vio); +} + +/* + * This function must be called by the driver once it has completed early setup + * calls. + */ +int +virtio_init_complete(virtio_t *vio, int allowed_interrupt_types) +{ + VERIFY(!(vio->vio_initlevel & VIRTIO_INITLEVEL_PROVIDER)); + vio->vio_initlevel |= VIRTIO_INITLEVEL_PROVIDER; + + if (!list_is_empty(&vio->vio_queues)) { + /* + * Set up interrupts for the queues that have been registered. + */ + if (virtio_interrupts_setup(vio, allowed_interrupt_types) != + DDI_SUCCESS) { + return (DDI_FAILURE); + } + } + + /* + * We can allocate the mutex once we know the priority. + */ + mutex_destroy(&vio->vio_mutex); + mutex_init(&vio->vio_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio)); + for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL; + viq = list_next(&vio->vio_queues, viq)) { + mutex_destroy(&viq->viq_mutex); + mutex_init(&viq->viq_mutex, NULL, MUTEX_DRIVER, + virtio_intr_pri(vio)); + } + + virtio_set_status(vio, VIRTIO_STATUS_DRIVER_OK); + + return (DDI_SUCCESS); +} + +boolean_t +virtio_feature_present(virtio_t *vio, uint64_t feature_mask) +{ + return ((vio->vio_features & feature_mask) != 0); +} + +void * +virtio_intr_pri(virtio_t *vio) +{ + VERIFY(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ADDED); + + return (DDI_INTR_PRI(vio->vio_interrupt_priority)); +} + +/* + * Enable a bit in the device status register. Each bit signals a level of + * guest readiness to the host. Use the VIRTIO_CONFIG_DEVICE_STATUS_* + * constants for "status". To zero the status field use virtio_device_reset(). 
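+ * + * Within this file the bits are applied in sequence: virtio_init() sets + * VIRTIO_STATUS_ACKNOWLEDGE and then VIRTIO_STATUS_DRIVER, + * virtio_init_complete() sets VIRTIO_STATUS_DRIVER_OK once queues and + * interrupts have been configured, and virtio_fini() with "failed" set + * reports VIRTIO_STATUS_FAILED to the host.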
+ */ +static void +virtio_set_status(virtio_t *vio, uint8_t status) +{ + VERIFY3U(status, !=, 0); + + mutex_enter(&vio->vio_mutex); + + uint8_t old = virtio_get8(vio, VIRTIO_LEGACY_DEVICE_STATUS); + virtio_put8(vio, VIRTIO_LEGACY_DEVICE_STATUS, status | old); + + mutex_exit(&vio->vio_mutex); +} + +static void +virtio_device_reset_locked(virtio_t *vio) +{ + virtio_put8(vio, VIRTIO_LEGACY_DEVICE_STATUS, VIRTIO_STATUS_RESET); +} + +void +virtio_device_reset(virtio_t *vio) +{ + mutex_enter(&vio->vio_mutex); + virtio_device_reset_locked(vio); + mutex_exit(&vio->vio_mutex); +} + +/* + * Some queues are effectively long-polled; the driver submits a series of + * buffers and the device only returns them when there is data available. + * During detach, we need to coordinate the return of these buffers. Calling + * "virtio_shutdown()" will reset the device, then allow the removal of all + * buffers that were in flight at the time of shutdown via + * "virtio_queue_evacuate()". + */ +void +virtio_shutdown(virtio_t *vio) +{ + mutex_enter(&vio->vio_mutex); + if (vio->vio_initlevel & VIRTIO_INITLEVEL_SHUTDOWN) { + /* + * Shutdown has been performed already. + */ + mutex_exit(&vio->vio_mutex); + return; + } + + /* + * First, mark all of the queues as shutdown. This will prevent any + * further activity. + */ + for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL; + viq = list_next(&vio->vio_queues, viq)) { + mutex_enter(&viq->viq_mutex); + viq->viq_shutdown = B_TRUE; + mutex_exit(&viq->viq_mutex); + } + + /* + * Now, reset the device. This removes any queue configuration on the + * device side. + */ + virtio_device_reset_locked(vio); + vio->vio_initlevel |= VIRTIO_INITLEVEL_SHUTDOWN; + mutex_exit(&vio->vio_mutex); +} + +/* + * Common implementation of quiesce(9E) for simple Virtio-based devices. + */ +int +virtio_quiesce(virtio_t *vio) +{ + if (vio->vio_initlevel & VIRTIO_INITLEVEL_SHUTDOWN) { + /* + * Device has already been reset. + */ + return (DDI_SUCCESS); + } + + /* + * When we reset the device, it should immediately stop using any DMA + * memory we've previously passed to it. All queue configuration is + * discarded. This is good enough for quiesce(9E). + */ + virtio_device_reset_locked(vio); + + return (DDI_SUCCESS); +} + +/* + * DEVICE-SPECIFIC REGISTER ACCESS + * + * Note that these functions take the mutex to avoid racing with interrupt + * enable/disable, when the device-specific offset can potentially change. + */ + +uint8_t +virtio_dev_get8(virtio_t *vio, uintptr_t offset) +{ + mutex_enter(&vio->vio_mutex); + uint8_t r = virtio_get8(vio, vio->vio_config_offset + offset); + mutex_exit(&vio->vio_mutex); + + return (r); +} + +uint16_t +virtio_dev_get16(virtio_t *vio, uintptr_t offset) +{ + mutex_enter(&vio->vio_mutex); + uint16_t r = virtio_get16(vio, vio->vio_config_offset + offset); + mutex_exit(&vio->vio_mutex); + + return (r); +} + +uint32_t +virtio_dev_get32(virtio_t *vio, uintptr_t offset) +{ + mutex_enter(&vio->vio_mutex); + uint32_t r = virtio_get32(vio, vio->vio_config_offset + offset); + mutex_exit(&vio->vio_mutex); + + return (r); +} + +uint64_t +virtio_dev_get64(virtio_t *vio, uintptr_t offset) +{ + mutex_enter(&vio->vio_mutex); + /* + * On at least some systems, a 64-bit read or write to this BAR is not + * possible. For legacy devices, there is no generation number to use + * to determine if configuration may have changed half-way through a + * read. We need to continue to read both halves of the value until we + * read the same value at least twice. 
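+ * + * For example, if the device changes the value from A to B between our read + * of the low half and our read of the high half, we can observe a mix of the + * two (the low 32 bits of one value and the high 32 bits of the other). Such + * a torn result will not be repeated by the next complete read, so retrying + * until two consecutive reads agree filters it out.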
+ */ + uintptr_t o_lo = vio->vio_config_offset + offset; + uintptr_t o_hi = o_lo + 4; + + uint64_t val = virtio_get32(vio, o_lo) | + ((uint64_t)virtio_get32(vio, o_hi) << 32); + + for (;;) { + uint64_t tval = virtio_get32(vio, o_lo) | + ((uint64_t)virtio_get32(vio, o_hi) << 32); + + if (tval == val) { + break; + } + + val = tval; + } + + mutex_exit(&vio->vio_mutex); + return (val); +} + +void +virtio_dev_put8(virtio_t *vio, uintptr_t offset, uint8_t value) +{ + mutex_enter(&vio->vio_mutex); + virtio_put8(vio, vio->vio_config_offset + offset, value); + mutex_exit(&vio->vio_mutex); +} + +void +virtio_dev_put16(virtio_t *vio, uintptr_t offset, uint16_t value) +{ + mutex_enter(&vio->vio_mutex); + virtio_put16(vio, vio->vio_config_offset + offset, value); + mutex_exit(&vio->vio_mutex); +} + +void +virtio_dev_put32(virtio_t *vio, uintptr_t offset, uint32_t value) +{ + mutex_enter(&vio->vio_mutex); + virtio_put32(vio, vio->vio_config_offset + offset, value); + mutex_exit(&vio->vio_mutex); +} + +/* + * VIRTQUEUE MANAGEMENT + */ + +static int +virtio_inflight_compar(const void *lp, const void *rp) +{ + const virtio_chain_t *l = lp; + const virtio_chain_t *r = rp; + + if (l->vic_head < r->vic_head) { + return (-1); + } else if (l->vic_head > r->vic_head) { + return (1); + } else { + return (0); + } +} + +virtio_queue_t * +virtio_queue_alloc(virtio_t *vio, uint16_t qidx, const char *name, + ddi_intr_handler_t *func, void *funcarg, boolean_t force_direct, + uint_t max_segs) +{ + uint16_t qsz; + char space_name[256]; + + if (max_segs < 1) { + /* + * Every descriptor, direct or indirect, needs to refer to at + * least one buffer. + */ + dev_err(vio->vio_dip, CE_WARN, "queue \"%s\" (%u) " + "segment count must be at least 1", name, (uint_t)qidx); + return (NULL); + } + + mutex_enter(&vio->vio_mutex); + + if (vio->vio_initlevel & VIRTIO_INITLEVEL_PROVIDER) { + /* + * Cannot configure any more queues once initial setup is + * complete and interrupts have been allocated. + */ + dev_err(vio->vio_dip, CE_WARN, "queue \"%s\" (%u) " + "alloc after init complete", name, (uint_t)qidx); + mutex_exit(&vio->vio_mutex); + return (NULL); + } + + /* + * There is no way to negotiate a different queue size for legacy + * devices. We must read and use the native queue size of the device. + */ + virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, qidx); + if ((qsz = virtio_get16(vio, VIRTIO_LEGACY_QUEUE_SIZE)) == 0) { + /* + * A size of zero means the device does not have a queue with + * this index. + */ + dev_err(vio->vio_dip, CE_WARN, "queue \"%s\" (%u) " + "does not exist on device", name, (uint_t)qidx); + mutex_exit(&vio->vio_mutex); + return (NULL); + } + + mutex_exit(&vio->vio_mutex); + + virtio_queue_t *viq = kmem_zalloc(sizeof (*viq), KM_SLEEP); + viq->viq_virtio = vio; + viq->viq_name = name; + viq->viq_index = qidx; + viq->viq_size = qsz; + viq->viq_func = func; + viq->viq_funcarg = funcarg; + viq->viq_max_segs = max_segs; + avl_create(&viq->viq_inflight, virtio_inflight_compar, + sizeof (virtio_chain_t), offsetof(virtio_chain_t, vic_node)); + + /* + * Allocate the mutex without an interrupt priority for now, as we do + * with "vio_mutex". We'll reinitialise it in + * "virtio_init_complete()". 
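+ * The final priority is not known yet because the interrupts themselves are + * only allocated later, in virtio_interrupts_setup() as driven from + * "virtio_init_complete()".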
+ */ + mutex_init(&viq->viq_mutex, NULL, MUTEX_DRIVER, NULL); + + if (virtio_feature_present(vio, VIRTIO_F_RING_INDIRECT_DESC) && + !force_direct) { + /* + * If we were able to negotiate the indirect descriptor + * feature, and the caller has not explicitly forced the use of + * direct descriptors, we'll allocate indirect descriptor lists + * for each chain. + */ + viq->viq_indirect = B_TRUE; + } + + /* + * Track descriptor usage in an identifier space. + */ + (void) snprintf(space_name, sizeof (space_name), "%s%d_vq_%s", + ddi_get_name(vio->vio_dip), ddi_get_instance(vio->vio_dip), name); + if ((viq->viq_descmap = id_space_create(space_name, 0, qsz)) == NULL) { + dev_err(vio->vio_dip, CE_WARN, "could not allocate descriptor " + "ID space"); + virtio_queue_free(viq); + return (NULL); + } + + /* + * For legacy devices, memory for the queue has a strict layout + * determined by the queue size. + */ + size_t sz_descs = sizeof (virtio_vq_desc_t) * qsz; + size_t sz_driver = P2ROUNDUP_TYPED(sz_descs + + sizeof (virtio_vq_driver_t) + + sizeof (uint16_t) * qsz, + VIRTIO_PAGE_SIZE, size_t); + size_t sz_device = P2ROUNDUP_TYPED(sizeof (virtio_vq_device_t) + + sizeof (virtio_vq_elem_t) * qsz, + VIRTIO_PAGE_SIZE, size_t); + + if (virtio_dma_init(vio, &viq->viq_dma, sz_driver + sz_device, + &virtio_dma_attr_queue, DDI_DMA_RDWR | DDI_DMA_CONSISTENT, + KM_SLEEP) != DDI_SUCCESS) { + dev_err(vio->vio_dip, CE_WARN, "could not allocate queue " + "DMA memory"); + virtio_queue_free(viq); + return (NULL); + } + + /* + * NOTE: The viq_dma_* members below are used by + * VIRTQ_DMA_SYNC_FORDEV() and VIRTQ_DMA_SYNC_FORKERNEL() to calculate + * offsets into the DMA allocation for partial synchronisation. If the + * ordering of, or relationship between, these pointers changes, the + * macros must be kept in sync. + */ + viq->viq_dma_descs = virtio_dma_va(&viq->viq_dma, 0); + viq->viq_dma_driver = virtio_dma_va(&viq->viq_dma, sz_descs); + viq->viq_dma_device = virtio_dma_va(&viq->viq_dma, sz_driver); + + /* + * Install in the per-device list of queues. + */ + mutex_enter(&vio->vio_mutex); + for (virtio_queue_t *chkvq = list_head(&vio->vio_queues); chkvq != NULL; + chkvq = list_next(&vio->vio_queues, chkvq)) { + if (chkvq->viq_index == qidx) { + dev_err(vio->vio_dip, CE_WARN, "attempt to register " + "queue \"%s\" with same index (%d) as queue \"%s\"", + name, qidx, chkvq->viq_name); + mutex_exit(&vio->vio_mutex); + virtio_queue_free(viq); + return (NULL); + } + } + list_insert_tail(&vio->vio_queues, viq); + + /* + * Ensure the zeroing of the queue memory is visible to the host before + * we inform the device of the queue address. + */ + membar_producer(); + VIRTQ_DMA_SYNC_FORDEV(viq); + + virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, qidx); + virtio_put32(vio, VIRTIO_LEGACY_QUEUE_ADDRESS, + virtio_dma_cookie_pa(&viq->viq_dma, 0) >> VIRTIO_PAGE_SHIFT); + + mutex_exit(&vio->vio_mutex); + return (viq); +} + +static void +virtio_queue_free(virtio_queue_t *viq) +{ + virtio_t *vio = viq->viq_virtio; + + /* + * We are going to destroy the queue mutex. Make sure we've already + * removed the interrupt handlers. + */ + VERIFY(!(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ADDED)); + + mutex_enter(&viq->viq_mutex); + + /* + * If the device has not already been reset as part of a shutdown, + * detach the queue from the device now. 
+ */ + if (!viq->viq_shutdown) { + virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, viq->viq_index); + virtio_put32(vio, VIRTIO_LEGACY_QUEUE_ADDRESS, 0); + } + + virtio_dma_fini(&viq->viq_dma); + + VERIFY(avl_is_empty(&viq->viq_inflight)); + avl_destroy(&viq->viq_inflight); + if (viq->viq_descmap != NULL) { + id_space_destroy(viq->viq_descmap); + } + + mutex_exit(&viq->viq_mutex); + mutex_destroy(&viq->viq_mutex); + + kmem_free(viq, sizeof (*viq)); +} + +void +virtio_queue_no_interrupt(virtio_queue_t *viq, boolean_t stop_interrupts) +{ + mutex_enter(&viq->viq_mutex); + + if (stop_interrupts) { + viq->viq_dma_driver->vqdr_flags |= VIRTQ_AVAIL_F_NO_INTERRUPT; + } else { + viq->viq_dma_driver->vqdr_flags &= ~VIRTQ_AVAIL_F_NO_INTERRUPT; + } + VIRTQ_DMA_SYNC_FORDEV(viq); + + mutex_exit(&viq->viq_mutex); +} + +static virtio_chain_t * +virtio_queue_complete(virtio_queue_t *viq, uint_t index) +{ + VERIFY(MUTEX_HELD(&viq->viq_mutex)); + + virtio_chain_t *vic; + + virtio_chain_t search; + bzero(&search, sizeof (search)); + search.vic_head = index; + + if ((vic = avl_find(&viq->viq_inflight, &search, NULL)) == NULL) { + return (NULL); + } + avl_remove(&viq->viq_inflight, vic); + + return (vic); +} + +uint_t +virtio_queue_size(virtio_queue_t *viq) +{ + return (viq->viq_size); +} + +uint_t +virtio_queue_nactive(virtio_queue_t *viq) +{ + mutex_enter(&viq->viq_mutex); + uint_t r = avl_numnodes(&viq->viq_inflight); + mutex_exit(&viq->viq_mutex); + + return (r); +} + +virtio_chain_t * +virtio_queue_poll(virtio_queue_t *viq) +{ + mutex_enter(&viq->viq_mutex); + if (viq->viq_shutdown) { + /* + * The device has been reset by virtio_shutdown(), and queue + * processing has been halted. Any previously submitted chains + * will be evacuated using virtio_queue_evacuate(). + */ + mutex_exit(&viq->viq_mutex); + return (NULL); + } + + VIRTQ_DMA_SYNC_FORKERNEL(viq); + if (viq->viq_device_index == viq->viq_dma_device->vqde_index) { + /* + * If the device index has not changed since the last poll, + * there are no new chains to process. + */ + mutex_exit(&viq->viq_mutex); + return (NULL); + } + + /* + * We need to ensure that all reads from the descriptor (vqde_ring[]) + * and any referenced memory by the descriptor occur after we have read + * the descriptor index value above (vqde_index). + */ + membar_consumer(); + + uint16_t index = (viq->viq_device_index++) % viq->viq_size; + uint16_t start = viq->viq_dma_device->vqde_ring[index].vqe_start; + uint32_t len = viq->viq_dma_device->vqde_ring[index].vqe_len; + + virtio_chain_t *vic; + if ((vic = virtio_queue_complete(viq, start)) == NULL) { + /* + * We could not locate a chain for this descriptor index, which + * suggests that something has gone horribly wrong. + */ + dev_err(viq->viq_virtio->vio_dip, CE_PANIC, + "queue \"%s\" ring entry %u (descriptor %u) has no chain", + viq->viq_name, (uint16_t)index, (uint16_t)start); + } + + vic->vic_received_length = len; + + mutex_exit(&viq->viq_mutex); + + return (vic); +} + +/* + * After a call to "virtio_shutdown()", the driver must retrieve any previously + * submitted chains and free any associated resources. 
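+ * + * For example, a detach path might reset the device, drain each long-polled + * queue, and then tear down the framework once every queue is empty. A + * sketch, where the buffer type and its free routine stand in for whatever + * per-chain state the driver tracked with virtio_chain_data_set(): + * + *   virtio_chain_t *vic; + * + *   virtio_shutdown(vio); + *   while ((vic = virtio_queue_evacuate(viq)) != NULL) { + *           mybuf_t *mb = virtio_chain_data(vic); + * + *           mybuf_free(mb); + *           virtio_chain_free(vic); + *   } + *   virtio_fini(vio, B_FALSE);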
+ */ +virtio_chain_t * +virtio_queue_evacuate(virtio_queue_t *viq) +{ + virtio_t *vio = viq->viq_virtio; + + mutex_enter(&vio->vio_mutex); + if (!(vio->vio_initlevel & VIRTIO_INITLEVEL_SHUTDOWN)) { + dev_err(vio->vio_dip, CE_PANIC, + "virtio_queue_evacuate() without virtio_shutdown()"); + } + mutex_exit(&vio->vio_mutex); + + mutex_enter(&viq->viq_mutex); + VERIFY(viq->viq_shutdown); + + virtio_chain_t *vic = avl_first(&viq->viq_inflight); + if (vic != NULL) { + avl_remove(&viq->viq_inflight, vic); + } + + mutex_exit(&viq->viq_mutex); + + return (vic); +} + +/* + * VIRTQUEUE DESCRIPTOR CHAIN MANAGEMENT + */ + +/* + * When the device returns a descriptor chain to the driver, it may provide the + * length in bytes of data written into the chain. Client drivers should use + * this value with care; the specification suggests some device implementations + * have not always provided a useful or correct value. + */ +size_t +virtio_chain_received_length(virtio_chain_t *vic) +{ + return (vic->vic_received_length); +} + +/* + * Allocate a descriptor chain for use with this queue. The "kmflags" value + * may be KM_SLEEP or KM_NOSLEEP as per kmem_alloc(9F). + */ +virtio_chain_t * +virtio_chain_alloc(virtio_queue_t *viq, int kmflags) +{ + virtio_t *vio = viq->viq_virtio; + virtio_chain_t *vic; + uint_t cap; + + /* + * Direct descriptors are known by their index in the descriptor table + * for the queue. We use the variable-length array member at the end + * of the chain tracking object to hold the list of direct descriptors + * assigned to this chain. + */ + if (viq->viq_indirect) { + /* + * When using indirect descriptors we still need one direct + * descriptor entry to hold the physical address and length of + * the indirect descriptor table. + */ + cap = 1; + } else { + /* + * For direct descriptors we need to be able to track a + * descriptor for each possible segment in a single chain. + */ + cap = viq->viq_max_segs; + } + + size_t vicsz = sizeof (*vic) + sizeof (uint16_t) * cap; + if ((vic = kmem_zalloc(vicsz, kmflags)) == NULL) { + return (NULL); + } + vic->vic_vq = viq; + vic->vic_direct_capacity = cap; + + if (viq->viq_indirect) { + /* + * Allocate an indirect descriptor list with the appropriate + * number of entries. + */ + if (virtio_dma_init(vio, &vic->vic_indirect_dma, + sizeof (virtio_vq_desc_t) * viq->viq_max_segs, + &virtio_dma_attr_indirect, + DDI_DMA_CONSISTENT | DDI_DMA_WRITE, + kmflags) != DDI_SUCCESS) { + goto fail; + } + + /* + * Allocate a single descriptor to hold the indirect list. + * Leave the length as zero for now; it will be set to include + * any occupied entries at push time. + */ + mutex_enter(&viq->viq_mutex); + if (virtio_chain_append_impl(vic, + virtio_dma_cookie_pa(&vic->vic_indirect_dma, 0), 0, + VIRTQ_DESC_F_INDIRECT) != DDI_SUCCESS) { + mutex_exit(&viq->viq_mutex); + goto fail; + } + mutex_exit(&viq->viq_mutex); + VERIFY3U(vic->vic_direct_used, ==, 1); + + /* + * Don't set the indirect capacity until after we've installed + * the direct descriptor which points at the indirect list, or + * virtio_chain_append_impl() will be confused. 
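+ * That function chooses between the direct and indirect paths by checking + * whether "vic_indirect_capacity" is non-zero, so setting it early would send + * this bootstrap descriptor into the (still empty) indirect list rather than + * into the queue's own descriptor table.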
+ */ + vic->vic_indirect_capacity = viq->viq_max_segs; + } + + return (vic); + +fail: + virtio_dma_fini(&vic->vic_indirect_dma); + kmem_free(vic, vicsz); + return (NULL); +} + +void * +virtio_chain_data(virtio_chain_t *vic) +{ + return (vic->vic_data); +} + +void +virtio_chain_data_set(virtio_chain_t *vic, void *data) +{ + vic->vic_data = data; +} + +void +virtio_chain_clear(virtio_chain_t *vic) +{ + if (vic->vic_indirect_capacity != 0) { + /* + * There should only be one direct descriptor, which points at + * our indirect descriptor list. We don't want to clear it + * here. + */ + VERIFY3U(vic->vic_direct_capacity, ==, 1); + + if (vic->vic_indirect_used > 0) { + /* + * Clear out the indirect descriptor table. + */ + vic->vic_indirect_used = 0; + bzero(virtio_dma_va(&vic->vic_indirect_dma, 0), + virtio_dma_size(&vic->vic_indirect_dma)); + } + + } else if (vic->vic_direct_capacity > 0) { + /* + * Release any descriptors that were assigned to us previously. + */ + for (uint_t i = 0; i < vic->vic_direct_used; i++) { + id_free(vic->vic_vq->viq_descmap, vic->vic_direct[i]); + vic->vic_direct[i] = 0; + } + vic->vic_direct_used = 0; + } +} + +void +virtio_chain_free(virtio_chain_t *vic) +{ + /* + * First ensure that we have released any descriptors used by this + * chain. + */ + virtio_chain_clear(vic); + + if (vic->vic_indirect_capacity > 0) { + /* + * Release the direct descriptor that points to our indirect + * descriptor list. + */ + VERIFY3U(vic->vic_direct_capacity, ==, 1); + id_free(vic->vic_vq->viq_descmap, vic->vic_direct[0]); + + virtio_dma_fini(&vic->vic_indirect_dma); + } + + size_t vicsz = sizeof (*vic) + + vic->vic_direct_capacity * sizeof (uint16_t); + + kmem_free(vic, vicsz); +} + +static inline int +virtio_queue_descmap_alloc(virtio_queue_t *viq, uint_t *indexp) +{ + id_t index; + + if ((index = id_alloc_nosleep(viq->viq_descmap)) == -1) { + return (ENOMEM); + } + + VERIFY3S(index, >=, 0); + VERIFY3S(index, <=, viq->viq_size); + + *indexp = (uint_t)index; + return (0); +} + +static int +virtio_chain_append_impl(virtio_chain_t *vic, uint64_t pa, size_t len, + uint16_t flags) +{ + virtio_queue_t *viq = vic->vic_vq; + virtio_vq_desc_t *vqd; + uint_t index; + + /* + * We're modifying the queue-wide descriptor list so make sure we have + * the appropriate lock. + */ + VERIFY(MUTEX_HELD(&viq->viq_mutex)); + + if (vic->vic_indirect_capacity != 0) { + /* + * Use indirect descriptors. + */ + if (vic->vic_indirect_used >= vic->vic_indirect_capacity) { + return (DDI_FAILURE); + } + + vqd = virtio_dma_va(&vic->vic_indirect_dma, 0); + + if ((index = vic->vic_indirect_used++) > 0) { + /* + * Chain the current last indirect descriptor to the + * new one. + */ + vqd[index - 1].vqd_flags |= VIRTQ_DESC_F_NEXT; + vqd[index - 1].vqd_next = index; + } + + } else { + /* + * Use direct descriptors. + */ + if (vic->vic_direct_used >= vic->vic_direct_capacity) { + return (DDI_FAILURE); + } + + if (virtio_queue_descmap_alloc(viq, &index) != 0) { + return (DDI_FAILURE); + } + + vqd = virtio_dma_va(&viq->viq_dma, 0); + + if (vic->vic_direct_used > 0) { + /* + * This is not the first entry. Chain the current + * descriptor to the next one. 
+ */ + uint16_t p = vic->vic_direct[vic->vic_direct_used - 1]; + + vqd[p].vqd_flags |= VIRTQ_DESC_F_NEXT; + vqd[p].vqd_next = index; + } + vic->vic_direct[vic->vic_direct_used++] = index; + } + + vqd[index].vqd_addr = pa; + vqd[index].vqd_len = len; + vqd[index].vqd_flags = flags; + vqd[index].vqd_next = 0; + + return (DDI_SUCCESS); +} + +int +virtio_chain_append(virtio_chain_t *vic, uint64_t pa, size_t len, + virtio_direction_t dir) +{ + virtio_queue_t *viq = vic->vic_vq; + uint16_t flags = 0; + + switch (dir) { + case VIRTIO_DIR_DEVICE_WRITES: + flags |= VIRTQ_DESC_F_WRITE; + break; + + case VIRTIO_DIR_DEVICE_READS: + break; + + default: + panic("unknown direction value %u", dir); + } + + mutex_enter(&viq->viq_mutex); + int r = virtio_chain_append_impl(vic, pa, len, flags); + mutex_exit(&viq->viq_mutex); + + return (r); +} + +static void +virtio_queue_flush_locked(virtio_queue_t *viq) +{ + VERIFY(MUTEX_HELD(&viq->viq_mutex)); + + /* + * Make sure any writes we have just made to the descriptors + * (vqdr_ring[]) are visible to the device before we update the ring + * pointer (vqdr_index). + */ + membar_producer(); + viq->viq_dma_driver->vqdr_index = viq->viq_driver_index; + VIRTQ_DMA_SYNC_FORDEV(viq); + + /* + * Determine whether the device expects us to notify it of new + * descriptors. + */ + VIRTQ_DMA_SYNC_FORKERNEL(viq); + if (!(viq->viq_dma_device->vqde_flags & VIRTQ_USED_F_NO_NOTIFY)) { + virtio_put16(viq->viq_virtio, VIRTIO_LEGACY_QUEUE_NOTIFY, + viq->viq_index); + } +} + +void +virtio_queue_flush(virtio_queue_t *viq) +{ + mutex_enter(&viq->viq_mutex); + virtio_queue_flush_locked(viq); + mutex_exit(&viq->viq_mutex); +} + +void +virtio_chain_submit(virtio_chain_t *vic, boolean_t flush) +{ + virtio_queue_t *viq = vic->vic_vq; + + mutex_enter(&viq->viq_mutex); + + if (vic->vic_indirect_capacity != 0) { + virtio_vq_desc_t *vqd = virtio_dma_va(&viq->viq_dma, 0); + + VERIFY3U(vic->vic_direct_used, ==, 1); + + /* + * This is an indirect descriptor queue. The length in bytes + * of the descriptor must extend to cover the populated + * indirect descriptor entries. + */ + vqd[vic->vic_direct[0]].vqd_len = + sizeof (virtio_vq_desc_t) * vic->vic_indirect_used; + + virtio_dma_sync(&vic->vic_indirect_dma, DDI_DMA_SYNC_FORDEV); + } + + /* + * Populate the next available slot in the driver-owned ring for this + * chain. The updated value of viq_driver_index is not yet visible to + * the device until a subsequent queue flush. 
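+ * + * Callers that post several chains in a row can pass B_FALSE for "flush" on + * each submission and notify the device once at the end, e.g.: + * + *   virtio_chain_submit(vic0, B_FALSE); + *   virtio_chain_submit(vic1, B_FALSE); + *   virtio_queue_flush(viq); + * + * where vic0 and vic1 are chains previously built for "viq".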
+ */ + uint16_t index = (viq->viq_driver_index++) % viq->viq_size; + viq->viq_dma_driver->vqdr_ring[index] = vic->vic_direct[0]; + + vic->vic_head = vic->vic_direct[0]; + avl_add(&viq->viq_inflight, vic); + + if (flush) { + virtio_queue_flush_locked(vic->vic_vq); + } + + mutex_exit(&viq->viq_mutex); +} + +/* + * INTERRUPTS MANAGEMENT + */ + +static const char * +virtio_interrupt_type_name(int type) +{ + switch (type) { + case DDI_INTR_TYPE_MSIX: + return ("MSI-X"); + case DDI_INTR_TYPE_MSI: + return ("MSI"); + case DDI_INTR_TYPE_FIXED: + return ("fixed"); + default: + return ("?"); + } +} + +static int +virtio_interrupts_alloc(virtio_t *vio, int type, int nrequired) +{ + dev_info_t *dip = vio->vio_dip; + int nintrs = 0; + int navail = 0; + + VERIFY(MUTEX_HELD(&vio->vio_mutex)); + VERIFY(!(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ALLOC)); + + if (ddi_intr_get_nintrs(dip, type, &nintrs) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not count %s interrupts", + virtio_interrupt_type_name(type)); + return (DDI_FAILURE); + } + if (nintrs < 1) { + dev_err(dip, CE_WARN, "no %s interrupts supported", + virtio_interrupt_type_name(type)); + return (DDI_FAILURE); + } + + if (ddi_intr_get_navail(dip, type, &navail) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not count available %s interrupts", + virtio_interrupt_type_name(type)); + return (DDI_FAILURE); + } + if (navail < nrequired) { + dev_err(dip, CE_WARN, "need %d %s interrupts, but only %d " + "available", nrequired, virtio_interrupt_type_name(type), + navail); + return (DDI_FAILURE); + } + + VERIFY3P(vio->vio_interrupts, ==, NULL); + vio->vio_interrupts = kmem_zalloc( + sizeof (ddi_intr_handle_t) * nrequired, KM_SLEEP); + + int r; + if ((r = ddi_intr_alloc(dip, vio->vio_interrupts, type, 0, nrequired, + &vio->vio_ninterrupts, DDI_INTR_ALLOC_STRICT)) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "%s interrupt allocation failure (%d)", + virtio_interrupt_type_name(type), r); + kmem_free(vio->vio_interrupts, + sizeof (ddi_intr_handle_t) * nrequired); + vio->vio_interrupts = NULL; + return (DDI_FAILURE); + } + + vio->vio_initlevel |= VIRTIO_INITLEVEL_INT_ALLOC; + vio->vio_interrupt_type = type; + return (DDI_SUCCESS); +} + +static uint_t +virtio_shared_isr(caddr_t arg0, caddr_t arg1) +{ + virtio_t *vio = (virtio_t *)arg0; + uint_t r = DDI_INTR_UNCLAIMED; + uint8_t isr; + + mutex_enter(&vio->vio_mutex); + + /* + * Check the ISR status to see if the interrupt applies to us. Reading + * this field resets it to zero. + */ + isr = virtio_get8(vio, VIRTIO_LEGACY_ISR_STATUS); + if ((isr & VIRTIO_ISR_CHECK_QUEUES) == 0) { + goto done; + } + + for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL; + viq = list_next(&vio->vio_queues, viq)) { + if (viq->viq_func != NULL) { + mutex_exit(&vio->vio_mutex); + if (viq->viq_func(viq->viq_funcarg, arg0) == + DDI_INTR_CLAIMED) { + r = DDI_INTR_CLAIMED; + } + mutex_enter(&vio->vio_mutex); + + if (vio->vio_initlevel & VIRTIO_INITLEVEL_SHUTDOWN) { + /* + * The device was shut down while in a queue + * handler routine. + */ + goto done; + } + } + } + +done: + mutex_exit(&vio->vio_mutex); + return (r); +} + +static int +virtio_interrupts_setup(virtio_t *vio, int allow_types) +{ + dev_info_t *dip = vio->vio_dip; + int types; + int count = 0; + + mutex_enter(&vio->vio_mutex); + + /* + * Determine the number of interrupts we'd like based on the number of + * virtqueues. 
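+ * Only queues that registered a handler function with virtio_queue_alloc() + * are counted here; a queue created with a NULL handler is never assigned an + * interrupt vector.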
+ */ + for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL; + viq = list_next(&vio->vio_queues, viq)) { + if (viq->viq_func != NULL) { + count++; + } + } + + if (ddi_intr_get_supported_types(dip, &types) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not get supported interrupts"); + mutex_exit(&vio->vio_mutex); + return (DDI_FAILURE); + } + + if (allow_types != 0) { + /* + * Restrict the possible interrupt types at the request of the + * driver. + */ + types &= allow_types; + } + + /* + * Try each potential interrupt type in descending order of preference. + * Note that the specification does not appear to allow for the use of + * classical MSI, so we are limited to either MSI-X or fixed + * interrupts. + */ + if (types & DDI_INTR_TYPE_MSIX) { + if (virtio_interrupts_alloc(vio, DDI_INTR_TYPE_MSIX, + count) == DDI_SUCCESS) { + goto add_handlers; + } + } + if (types & DDI_INTR_TYPE_FIXED) { + /* + * If fixed interrupts are all that are available, we'll just + * ask for one. + */ + if (virtio_interrupts_alloc(vio, DDI_INTR_TYPE_FIXED, 1) == + DDI_SUCCESS) { + goto add_handlers; + } + } + + dev_err(dip, CE_WARN, "interrupt allocation failed"); + mutex_exit(&vio->vio_mutex); + return (DDI_FAILURE); + +add_handlers: + /* + * Ensure that we have not been given any high-level interrupts as our + * interrupt handlers do not support them. + */ + for (int i = 0; i < vio->vio_ninterrupts; i++) { + uint_t ipri; + + if (ddi_intr_get_pri(vio->vio_interrupts[i], &ipri) != + DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not determine interrupt " + "priority"); + goto fail; + } + + if (ipri >= ddi_intr_get_hilevel_pri()) { + dev_err(dip, CE_WARN, "high level interrupts not " + "supported"); + goto fail; + } + + /* + * Record the highest priority we've been allocated to use for + * mutex initialisation. + */ + if (i == 0 || ipri > vio->vio_interrupt_priority) { + vio->vio_interrupt_priority = ipri; + } + } + + /* + * Get the interrupt capabilities from the first handle to determine + * whether we need to use ddi_intr_block_enable(9F). + */ + if (ddi_intr_get_cap(vio->vio_interrupts[0], + &vio->vio_interrupt_cap) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "failed to get interrupt capabilities"); + goto fail; + } + + if (vio->vio_interrupt_type == DDI_INTR_TYPE_FIXED) { + VERIFY3S(vio->vio_ninterrupts, ==, 1); + /* + * For fixed interrupts, we need to use our shared handler to + * multiplex the per-queue handlers provided by the driver. 
+ */ + if (ddi_intr_add_handler(vio->vio_interrupts[0], + virtio_shared_isr, (caddr_t)vio, NULL) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "adding shared %s interrupt " + "handler failed", virtio_interrupt_type_name( + vio->vio_interrupt_type)); + goto fail; + } + + goto done; + } + + VERIFY3S(vio->vio_ninterrupts, ==, count); + + uint_t n = 0; + for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL; + viq = list_next(&vio->vio_queues, viq)) { + if (viq->viq_func == NULL) { + continue; + } + + if (ddi_intr_add_handler(vio->vio_interrupts[n], + viq->viq_func, (caddr_t)viq->viq_funcarg, + (caddr_t)vio) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "adding interrupt %u (%s) failed", + n, viq->viq_name); + goto fail; + } + + viq->viq_handler_index = n; + viq->viq_handler_added = B_TRUE; + n++; + } + +done: + vio->vio_initlevel |= VIRTIO_INITLEVEL_INT_ADDED; + mutex_exit(&vio->vio_mutex); + return (DDI_SUCCESS); + +fail: + virtio_interrupts_teardown(vio); + mutex_exit(&vio->vio_mutex); + return (DDI_FAILURE); +} + +static void +virtio_interrupts_teardown(virtio_t *vio) +{ + VERIFY(MUTEX_HELD(&vio->vio_mutex)); + + virtio_interrupts_disable_locked(vio); + + if (vio->vio_interrupt_type == DDI_INTR_TYPE_FIXED) { + /* + * Remove the multiplexing interrupt handler. + */ + if (vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ADDED) { + int r; + + VERIFY3S(vio->vio_ninterrupts, ==, 1); + + if ((r = ddi_intr_remove_handler( + vio->vio_interrupts[0])) != DDI_SUCCESS) { + dev_err(vio->vio_dip, CE_WARN, "removing " + "shared interrupt handler failed (%d)", r); + } + } + } else { + for (virtio_queue_t *viq = list_head(&vio->vio_queues); + viq != NULL; viq = list_next(&vio->vio_queues, viq)) { + int r; + + if (!viq->viq_handler_added) { + continue; + } + + if ((r = ddi_intr_remove_handler( + vio->vio_interrupts[viq->viq_handler_index])) != + DDI_SUCCESS) { + dev_err(vio->vio_dip, CE_WARN, "removing " + "interrupt handler (%s) failed (%d)", + viq->viq_name, r); + } + + viq->viq_handler_added = B_FALSE; + } + } + vio->vio_initlevel &= ~VIRTIO_INITLEVEL_INT_ADDED; + + if (vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ALLOC) { + for (int i = 0; i < vio->vio_ninterrupts; i++) { + int r; + + if ((r = ddi_intr_free(vio->vio_interrupts[i])) != + DDI_SUCCESS) { + dev_err(vio->vio_dip, CE_WARN, "freeing " + "interrupt %u failed (%d)", i, r); + } + } + kmem_free(vio->vio_interrupts, + sizeof (ddi_intr_handle_t) * vio->vio_ninterrupts); + vio->vio_interrupts = NULL; + vio->vio_ninterrupts = 0; + vio->vio_interrupt_type = 0; + vio->vio_interrupt_cap = 0; + vio->vio_interrupt_priority = 0; + + vio->vio_initlevel &= ~VIRTIO_INITLEVEL_INT_ALLOC; + } +} + +static void +virtio_interrupts_unwind(virtio_t *vio) +{ + VERIFY(MUTEX_HELD(&vio->vio_mutex)); + + if (vio->vio_interrupt_type == DDI_INTR_TYPE_MSIX) { + for (virtio_queue_t *viq = list_head(&vio->vio_queues); + viq != NULL; viq = list_next(&vio->vio_queues, viq)) { + if (!viq->viq_handler_added) { + continue; + } + + virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, + viq->viq_index); + virtio_put16(vio, VIRTIO_LEGACY_MSIX_QUEUE, + VIRTIO_LEGACY_MSI_NO_VECTOR); + } + } + + if (vio->vio_interrupt_cap & DDI_INTR_FLAG_BLOCK) { + (void) ddi_intr_block_disable(vio->vio_interrupts, + vio->vio_ninterrupts); + } else { + for (int i = 0; i < vio->vio_ninterrupts; i++) { + (void) ddi_intr_disable(vio->vio_interrupts[i]); + } + } + + /* + * Disabling the interrupts makes the MSI-X fields disappear from the + * BAR once more. 
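+ * This shifting offset is also why the device-specific access routines + * (virtio_dev_get8() and friends) take "vio_mutex" before applying + * "vio_config_offset".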
+ */ + vio->vio_config_offset = VIRTIO_LEGACY_CFG_OFFSET; +} + +int +virtio_interrupts_enable(virtio_t *vio) +{ + mutex_enter(&vio->vio_mutex); + if (vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ENABLED) { + mutex_exit(&vio->vio_mutex); + return (DDI_SUCCESS); + } + + int r = DDI_SUCCESS; + if (vio->vio_interrupt_cap & DDI_INTR_FLAG_BLOCK) { + r = ddi_intr_block_enable(vio->vio_interrupts, + vio->vio_ninterrupts); + } else { + for (int i = 0; i < vio->vio_ninterrupts; i++) { + if ((r = ddi_intr_enable(vio->vio_interrupts[i])) != + DDI_SUCCESS) { + /* + * Disable the interrupts we have enabled so + * far. + */ + for (i--; i >= 0; i--) { + (void) ddi_intr_disable( + vio->vio_interrupts[i]); + } + break; + } + } + } + + if (r != DDI_SUCCESS) { + mutex_exit(&vio->vio_mutex); + return (r); + } + + if (vio->vio_interrupt_type == DDI_INTR_TYPE_MSIX) { + /* + * When asked to enable the interrupts, the system enables + * MSI-X in the PCI configuration for the device. While + * enabled, the extra MSI-X configuration table fields appear + * between the general and the device-specific regions of the + * BAR. + */ + vio->vio_config_offset = VIRTIO_LEGACY_CFG_OFFSET_MSIX; + + for (virtio_queue_t *viq = list_head(&vio->vio_queues); + viq != NULL; viq = list_next(&vio->vio_queues, viq)) { + if (!viq->viq_handler_added) { + continue; + } + + uint16_t qi = viq->viq_index; + uint16_t msi = viq->viq_handler_index; + + /* + * Route interrupts for this queue to the assigned + * MSI-X vector number. + */ + virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, qi); + virtio_put16(vio, VIRTIO_LEGACY_MSIX_QUEUE, msi); + + /* + * The device may not actually accept the vector number + * we're attempting to program. We need to confirm + * that configuration was successful by re-reading the + * configuration we just wrote. + */ + if (virtio_get16(vio, VIRTIO_LEGACY_MSIX_QUEUE) != + msi) { + dev_err(vio->vio_dip, CE_WARN, + "failed to configure MSI-X vector %u for " + "queue \"%s\" (#%u)", (uint_t)msi, + viq->viq_name, (uint_t)qi); + + virtio_interrupts_unwind(vio); + mutex_exit(&vio->vio_mutex); + return (DDI_FAILURE); + } + } + } + + vio->vio_initlevel |= VIRTIO_INITLEVEL_INT_ENABLED; + + mutex_exit(&vio->vio_mutex); + return (DDI_SUCCESS); +} + +static void +virtio_interrupts_disable_locked(virtio_t *vio) +{ + VERIFY(MUTEX_HELD(&vio->vio_mutex)); + + if (!(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ENABLED)) { + return; + } + + virtio_interrupts_unwind(vio); + + vio->vio_initlevel &= ~VIRTIO_INITLEVEL_INT_ENABLED; +} + +void +virtio_interrupts_disable(virtio_t *vio) +{ + mutex_enter(&vio->vio_mutex); + virtio_interrupts_disable_locked(vio); + mutex_exit(&vio->vio_mutex); +} diff --git a/usr/src/uts/common/io/virtio/virtioreg.h b/usr/src/uts/common/io/virtio/virtioreg.h deleted file mode 100644 index 19579e96bc..0000000000 --- a/usr/src/uts/common/io/virtio/virtioreg.h +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright (c) 2010 Minoura Makoto. - * Copyright (c) 2012 Nexenta Systems, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - * Part of the file derived from `Virtio PCI Card Specification v0.8.6 DRAFT' - * Appendix A. - */ - -/* - * An interface for efficient virtio implementation. - * - * This header is BSD licensed so anyone can use the definitions - * to implement compatible drivers/servers. - * - * Copyright 2007, 2009, IBM Corporation - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of IBM nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * ``AS IS'' ANDANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - - -#ifndef __VIRTIOREG_H__ -#define __VIRTIOREG_H__ - -#include <sys/types.h> - -#define PCI_VENDOR_QUMRANET 0x1af4 -#define PCI_DEV_VIRTIO_MIN 0x1000 -#define PCI_DEV_VIRTIO_MAX 0x103f -#define VIRTIO_PCI_ABI_VERSION 0 - -/* Virtio product id (subsystem) */ -#define PCI_PRODUCT_VIRTIO_NETWORK 1 -#define PCI_PRODUCT_VIRTIO_BLOCK 2 -#define PCI_PRODUCT_VIRTIO_CONSOLE 3 -#define PCI_PRODUCT_VIRTIO_ENTROPY 4 -#define PCI_PRODUCT_VIRTIO_BALLOON 5 -#define PCI_PRODUCT_VIRTIO_9P 9 - -/* Virtio header */ -#define VIRTIO_CONFIG_DEVICE_FEATURES 0 /* 32bit */ -#define VIRTIO_CONFIG_GUEST_FEATURES 4 /* 32bit */ - -#define VIRTIO_F_NOTIFY_ON_EMPTY (1<<24) -#define VIRTIO_F_RING_INDIRECT_DESC (1<<28) -#define VIRTIO_F_BAD_FEATURE (1<<30) - -#define VIRTIO_CONFIG_QUEUE_ADDRESS 8 /* 32bit */ -#define VIRTIO_CONFIG_QUEUE_SIZE 12 /* 16bit */ -#define VIRTIO_CONFIG_QUEUE_SELECT 14 /* 16bit */ -#define VIRTIO_CONFIG_QUEUE_NOTIFY 16 /* 16bit */ -#define VIRTIO_CONFIG_DEVICE_STATUS 18 /* 8bit */ - -#define VIRTIO_CONFIG_DEVICE_STATUS_RESET 0 -#define VIRTIO_CONFIG_DEVICE_STATUS_ACK 1 -#define VIRTIO_CONFIG_DEVICE_STATUS_DRIVER 2 -#define VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK 4 -#define VIRTIO_CONFIG_DEVICE_STATUS_FAILED 128 - -#define VIRTIO_CONFIG_ISR_STATUS 19 /* 8bit */ -#define VIRTIO_CONFIG_ISR_CONFIG_CHANGE 2 - -#define VIRTIO_CONFIG_CONFIG_VECTOR 20 /* 16bit, optional */ -#define VIRTIO_CONFIG_QUEUE_VECTOR 22 - -#define VIRTIO_CONFIG_DEVICE_CONFIG_NOMSIX 20 -#define VIRTIO_CONFIG_DEVICE_CONFIG_MSIX 24 - -#define VIRTIO_MSI_NO_VECTOR 0xffff - -/* Virtqueue */ -/* This marks a buffer as continuing via the next field. */ -#define VRING_DESC_F_NEXT 1 -/* - * This marks a buffer as write-only, from the devices's perspective. - * (otherwise read-only). - */ -#define VRING_DESC_F_WRITE 2 -/* This means the buffer contains a list of buffer descriptors. */ -#define VRING_DESC_F_INDIRECT 4 - -/* - * The Host uses this in used->flags to advise the Guest: don't kick me - * when you add a buffer. It's unreliable, so it's simply an - * optimization. Guest will still kick if it's out of buffers. - */ -#define VRING_USED_F_NO_NOTIFY 1 -/* - * The Guest uses this in avail->flags to advise the Host: don't - * interrupt me when you consume a buffer. It's unreliable, so it's - * simply an optimization. - */ -#define VRING_AVAIL_F_NO_INTERRUPT 1 - -/* - * Virtio ring descriptors: 16 bytes. - * These can chain together via "next". - */ -struct vring_desc { - /* Address (guest-physical). */ - uint64_t addr; - /* Length. */ - uint32_t len; - /* The flags as indicated above. */ - uint16_t flags; - /* We chain unused descriptors via this, too */ - uint16_t next; -} __attribute__((packed)); - -struct vring_avail { - uint16_t flags; - uint16_t idx; - uint16_t ring[]; -} __attribute__((packed)); - -/* u32 is used here for ids for padding reasons. */ -struct vring_used_elem { - /* Index of start of used descriptor chain. */ - uint32_t id; - /* Total length of the descriptor chain which was written to. */ - uint32_t len; -} __attribute__((packed)); - -struct vring_used { - uint16_t flags; - uint16_t idx; - struct vring_used_elem ring[]; -} __attribute__((packed)); - - -/* Got nothing to do with the system page size, just a confusing name. 
*/ -#define VIRTIO_PAGE_SIZE (4096) - -#endif /* __VIRTIOREG_H__ */ diff --git a/usr/src/uts/common/io/virtio/virtiovar.h b/usr/src/uts/common/io/virtio/virtiovar.h deleted file mode 100644 index 17aebe3864..0000000000 --- a/usr/src/uts/common/io/virtio/virtiovar.h +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Copyright (c) 2010 Minoura Makoto. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - * Part of the file derived from `Virtio PCI Card Specification v0.8.6 DRAFT' - * Appendix A. - */ - -/* - * An interface for efficient virtio implementation. - * - * This header is BSD licensed so anyone can use the definitions - * to implement compatible drivers/servers. - * - * Copyright 2007, 2009, IBM Corporation - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of IBM nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * ``AS IS'' ANDANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * Copyright 2012 Nexenta Systems, Inc. All rights reserved. 
- */ - -#ifndef __VIRTIOVAR_H__ -#define __VIRTIOVAR_H__ - -#include <sys/types.h> -#include <sys/dditypes.h> -#include <sys/cmn_err.h> -#include <sys/list.h> - -#ifdef DEBUG -#define dev_debug(dip, fmt, arg...) \ - dev_err(dip, fmt, ##arg) -#else -#define dev_debug(dip, fmt, arg...) -#endif - -struct vq_entry { - list_node_t qe_list; - struct virtqueue *qe_queue; - uint16_t qe_index; /* index in vq_desc array */ - /* followings are used only when it is the `head' entry */ - struct vq_entry *qe_next; - struct vring_desc *qe_desc; - ddi_dma_cookie_t qe_indirect_dma_cookie; - ddi_dma_handle_t qe_indirect_dma_handle; - ddi_acc_handle_t qe_indirect_dma_acch; - struct vring_desc *qe_indirect_descs; - unsigned int qe_indirect_next; -}; - -struct virtqueue { - struct virtio_softc *vq_owner; - unsigned int vq_num; /* queue size (# of entries) */ - unsigned int vq_indirect_num; - int vq_index; /* queue number (0, 1, ...) */ - - /* vring pointers (KVA) */ - struct vring_desc *vq_descs; - struct vring_avail *vq_avail; - struct vring_used *vq_used; - - /* virtqueue allocation info */ - void *vq_vaddr; - int vq_availoffset; - int vq_usedoffset; - ddi_dma_cookie_t vq_dma_cookie; - ddi_dma_handle_t vq_dma_handle; - ddi_acc_handle_t vq_dma_acch; - - int vq_maxsegsize; - - /* free entry management */ - struct vq_entry *vq_entries; - list_t vq_freelist; - kmutex_t vq_freelist_lock; - int vq_used_entries; - - /* enqueue/dequeue status */ - uint16_t vq_avail_idx; - kmutex_t vq_avail_lock; - uint16_t vq_used_idx; - kmutex_t vq_used_lock; -}; - -struct virtio_softc { - dev_info_t *sc_dev; - - uint_t sc_intr_prio; - - ddi_acc_handle_t sc_ioh; - caddr_t sc_io_addr; - int sc_config_offset; - - uint32_t sc_features; - - int sc_nvqs; /* set by the user */ - - ddi_intr_handle_t *sc_intr_htable; - int sc_intr_num; - boolean_t sc_intr_config; - int sc_intr_cap; - int sc_int_type; -}; - -struct virtio_int_handler { - ddi_intr_handler_t *vh_func; - void *vh_priv; -}; - -/* public interface */ -uint32_t virtio_negotiate_features(struct virtio_softc *, uint32_t); -size_t virtio_show_features(uint32_t features, char *buffer, size_t len); -boolean_t virtio_has_feature(struct virtio_softc *sc, uint32_t feature); -void virtio_set_status(struct virtio_softc *sc, unsigned int); -#define virtio_device_reset(sc) virtio_set_status((sc), 0) - -uint8_t virtio_read_device_config_1(struct virtio_softc *sc, - unsigned int index); -uint16_t virtio_read_device_config_2(struct virtio_softc *sc, - unsigned int index); -uint32_t virtio_read_device_config_4(struct virtio_softc *sc, - unsigned int index); -uint64_t virtio_read_device_config_8(struct virtio_softc *sc, - unsigned int index); -void virtio_write_device_config_1(struct virtio_softc *sc, - unsigned int index, uint8_t value); -void virtio_write_device_config_2(struct virtio_softc *sc, - unsigned int index, uint16_t value); -void virtio_write_device_config_4(struct virtio_softc *sc, - unsigned int index, uint32_t value); -void virtio_write_device_config_8(struct virtio_softc *sc, - unsigned int index, uint64_t value); - -struct virtqueue *virtio_alloc_vq(struct virtio_softc *sc, - unsigned int index, unsigned int size, - unsigned int indirect_num, const char *name); -void virtio_free_vq(struct virtqueue *); -void virtio_reset(struct virtio_softc *); -struct vq_entry *vq_alloc_entry(struct virtqueue *vq); -void vq_free_entry(struct virtqueue *vq, struct vq_entry *qe); -uint_t vq_num_used(struct virtqueue *vq); -unsigned int virtio_ve_indirect_available(struct vq_entry *qe); - -void 
virtio_stop_vq_intr(struct virtqueue *); -void virtio_start_vq_intr(struct virtqueue *); - -void virtio_ve_add_cookie(struct vq_entry *qe, ddi_dma_handle_t dma_handle, - ddi_dma_cookie_t dma_cookie, unsigned int ncookies, boolean_t write); -void virtio_ve_add_indirect_buf(struct vq_entry *qe, uint64_t paddr, - uint32_t len, boolean_t write); -void virtio_ve_set(struct vq_entry *qe, uint64_t paddr, uint32_t len, - boolean_t write); - -void virtio_push_chain(struct vq_entry *qe, boolean_t sync); -struct vq_entry *virtio_pull_chain(struct virtqueue *vq, uint32_t *len); -void virtio_free_chain(struct vq_entry *ve); -void virtio_sync_vq(struct virtqueue *vq); - -int virtio_register_ints(struct virtio_softc *sc, - struct virtio_int_handler *config_handler, - struct virtio_int_handler vq_handlers[]); -void virtio_release_ints(struct virtio_softc *sc); -int virtio_enable_ints(struct virtio_softc *sc); - -#endif /* __VIRTIOVAR_H__ */ diff --git a/usr/src/uts/common/smbsrv/smb2_kproto.h b/usr/src/uts/common/smbsrv/smb2_kproto.h index 97b13af868..ed553bedcd 100644 --- a/usr/src/uts/common/smbsrv/smb2_kproto.h +++ b/usr/src/uts/common/smbsrv/smb2_kproto.h @@ -32,6 +32,7 @@ extern uint32_t smb2_dh_def_timeout; extern uint32_t smb2_dh_max_timeout; extern uint32_t smb2_res_def_timeout; extern uint32_t smb2_res_max_timeout; +extern uint32_t smb2_persist_timeout; extern int smb2_enable_dh; #define SMB3_CLIENT_ENCRYPTS(sr) \ @@ -131,7 +132,7 @@ uint32_t smb2_setinfo_quota(smb_request_t *, smb_setinfo_t *); void smb2_oplock_acquire(smb_request_t *sr); void smb2_oplock_reconnect(smb_request_t *sr); void smb2_lease_acquire(smb_request_t *sr); -uint32_t smb2_lease_create(smb_request_t *sr); +uint32_t smb2_lease_create(smb_request_t *sr, uint8_t *); void smb2_lease_rele(smb_lease_t *); void smb2_lease_init(void); void smb2_lease_fini(void); @@ -142,6 +143,15 @@ void smb2_durable_timers(smb_server_t *); uint32_t smb2_dh_reconnect(smb_request_t *); boolean_t smb_dh_should_save(smb_ofile_t *); extern void smb2_dh_shutdown(smb_server_t *); +int smb2_dh_new_ca_share(smb_server_t *, smb_kshare_t *); +void smb2_dh_close_persistent(smb_ofile_t *); +void smb2_dh_close_my_orphans(smb_request_t *, smb_ofile_t *); +int smb2_dh_make_persistent(smb_request_t *, smb_ofile_t *); +void smb2_dh_setdoc_persistent(smb_ofile_t *); +void smb2_dh_update_nvfile(smb_request_t *); +void smb2_dh_update_oplock(smb_request_t *, smb_ofile_t *); +void smb2_dh_update_locks(smb_request_t *, smb_ofile_t *); +void smb2_dh_update_times(smb_request_t *, smb_ofile_t *, smb_attr_t *); #ifdef __cplusplus } diff --git a/usr/src/uts/common/smbsrv/smb_kproto.h b/usr/src/uts/common/smbsrv/smb_kproto.h index d18ff80d5e..751f047e0c 100644 --- a/usr/src/uts/common/smbsrv/smb_kproto.h +++ b/usr/src/uts/common/smbsrv/smb_kproto.h @@ -338,6 +338,8 @@ boolean_t smb_validate_dirname(smb_request_t *, smb_pathname_t *); boolean_t smb_validate_object_name(smb_request_t *, smb_pathname_t *); boolean_t smb_validate_stream_name(smb_request_t *, smb_pathname_t *); boolean_t smb_is_stream_name(char *); +boolean_t smb_strname_restricted(char *); + void smb_stream_parse_name(char *, char *, char *); @@ -438,7 +440,7 @@ int smb_server_get_count(void); int smb_server_g_init(void); void smb_server_g_fini(void); int smb_server_create(void); -int smb_server_delete(void); +int smb_server_delete(smb_server_t *); int smb_server_configure(smb_ioc_cfg_t *); int smb_server_start(smb_ioc_start_t *); int smb_server_stop(void); @@ -451,7 +453,7 @@ int 
smb_server_numopen(smb_ioc_opennum_t *); int smb_server_enum(smb_ioc_svcenum_t *); int smb_server_session_close(smb_ioc_session_t *); int smb_server_file_close(smb_ioc_fileid_t *); -int smb_server_sharevp(smb_server_t *, const char *, vnode_t **); +int smb_server_share_lookup(smb_server_t *, const char *, smb_node_t **); int smb_server_unshare(const char *); void smb_server_logoff_ssnid(smb_request_t *, uint64_t); @@ -553,14 +555,6 @@ int smb_pathname(smb_request_t *, char *, int, smb_node_t *, smb_node_t *, smb_node_t **, smb_node_t **, cred_t *); /* - * smb_vfs functions - */ - -int smb_vfs_hold(smb_export_t *, vfs_t *); -void smb_vfs_rele(smb_export_t *, vfs_t *); -void smb_vfs_rele_all(smb_export_t *); - -/* * smb_notify.c */ uint32_t smb_notify_act1(smb_request_t *, uint32_t, uint32_t); @@ -633,6 +627,7 @@ smb_tree_t *smb_session_lookup_volume(smb_session_t *, const char *, void smb_session_close_pid(smb_session_t *, uint32_t); void smb_session_disconnect_owned_trees(smb_session_t *, smb_user_t *); void smb_session_disconnect_share(smb_session_t *, const char *); +void smb_session_logoff(smb_session_t *); void smb_session_getclient(smb_session_t *, char *, size_t); boolean_t smb_session_isclient(smb_session_t *, const char *); void smb_session_correct_keep_alive_values(smb_llist_t *, uint32_t); @@ -654,7 +649,7 @@ smb_ofile_t *smb_ofile_lookup_by_uniqid(smb_tree_t *, uint32_t); smb_ofile_t *smb_ofile_lookup_by_persistid(smb_request_t *, uint64_t); boolean_t smb_ofile_disallow_fclose(smb_ofile_t *); smb_ofile_t *smb_ofile_alloc(smb_request_t *, smb_arg_open_t *, smb_node_t *, - uint16_t, uint16_t, uint32_t); + uint16_t, uint16_t); void smb_ofile_open(smb_request_t *, smb_arg_open_t *, smb_ofile_t *); void smb_ofile_close(smb_ofile_t *, int32_t); void smb_ofile_free(smb_ofile_t *); @@ -678,7 +673,9 @@ void smb_delayed_write_timer(smb_llist_t *); void smb_ofile_set_quota_resume(smb_ofile_t *, char *); void smb_ofile_get_quota_resume(smb_ofile_t *, char *, int); void smb_ofile_del_persistid(smb_ofile_t *); -void smb_ofile_set_persistid(smb_ofile_t *); +void smb_ofile_set_persistid_dh(smb_ofile_t *); +void smb_ofile_set_persistid_ph(smb_ofile_t *); +int smb_ofile_insert_persistid(smb_ofile_t *, uint64_t); #define SMB_OFILE_GET_SESSION(of) ((of)->f_session) #define SMB_OFILE_GET_TREE(of) ((of)->f_tree) @@ -734,6 +731,7 @@ void smb_user_netinfo_fini(smb_netuserinfo_t *); int smb_user_netinfo_encode(smb_user_t *, uint8_t *, size_t, uint32_t *); smb_token_t *smb_get_token(smb_session_t *, smb_logon_t *); cred_t *smb_cred_create(smb_token_t *); +cred_t *smb_kcred_create(void); void smb_user_setcred(smb_user_t *, cred_t *, uint32_t); boolean_t smb_is_same_user(cred_t *, cred_t *); @@ -741,6 +739,7 @@ boolean_t smb_is_same_user(cred_t *, cred_t *); * SMB tree functions (file smb_tree.c) */ uint32_t smb_tree_connect(smb_request_t *); +uint32_t smb_tree_connect_disk(smb_request_t *, smb_arg_tcon_t *); void smb_tree_disconnect(smb_tree_t *, boolean_t); void smb_tree_close_pid(smb_tree_t *, uint32_t); boolean_t smb_tree_has_feature(smb_tree_t *, uint_t); @@ -751,6 +750,8 @@ void smb_tree_hold_internal(smb_tree_t *); void smb_tree_release(smb_tree_t *); smb_odir_t *smb_tree_lookup_odir(smb_request_t *, uint16_t); boolean_t smb_tree_is_connected(smb_tree_t *); +smb_tree_t *smb_tree_alloc(smb_request_t *, const smb_kshare_t *, + smb_node_t *, uint32_t, uint32_t); smb_xa_t *smb_xa_create(smb_session_t *session, smb_request_t *sr, uint32_t total_parameter_count, uint32_t total_data_count, @@ -937,7 +938,7 
void smb_threshold_wake_all(smb_cmd_threshold_t *);
/* SMB hash function prototypes */
-smb_hash_t *smb_hash_create(size_t, size_t, uint32_t num_buckets);
+smb_hash_t *smb_hash_create(size_t, size_t, uint32_t);
void smb_hash_destroy(smb_hash_t *);
uint_t smb_hash_uint64(smb_hash_t *, uint64_t);
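The smb_kproto.h hunk above replaces the vnode-based smb_server_sharevp() with a lookup that hands back an smb_node_t. A minimal caller-side sketch, assuming the usual smbsrv hold/release convention (smb_node_release() as the matching release is an assumption, not part of this commit):

/*
 * Illustrative sketch only: resolve a share's root object via the new
 * node-based lookup.  The error handling and the smb_node_release()
 * pairing are assumptions for illustration.
 */
static int
example_open_share_root(smb_server_t *sv, const char *sharename)
{
	smb_node_t *root = NULL;
	int rc;

	rc = smb_server_share_lookup(sv, sharename, &root);
	if (rc != 0)
		return (rc);		/* share unknown or not exported */

	/* ... use root as the base node for pathname lookups ... */

	smb_node_release(root);		/* assumed matching release */
	return (0);
}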
diff --git a/usr/src/uts/common/smbsrv/smb_ktypes.h b/usr/src/uts/common/smbsrv/smb_ktypes.h
index 09e52b70f7..1f8ce704fb 100644
--- a/usr/src/uts/common/smbsrv/smb_ktypes.h
+++ b/usr/src/uts/common/smbsrv/smb_ktypes.h
@@ -20,7 +20,7 @@
 */
/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
 */
/*
@@ -61,6 +61,7 @@ extern "C" {
struct __door_handle; /* <sys/door.h> */
struct edirent; /* <sys/extdirent.h> */
+struct nvlist;
struct smb_disp_entry;
struct smb_request;
@@ -476,7 +477,6 @@ typedef struct {
typedef struct smb_export {
 kmutex_t e_mutex;
 boolean_t e_ready;
- smb_llist_t e_vfs_list;
 smb_avl_t e_share_avl;
 smb_slist_t e_unexport_list;
 smb_thread_t e_unexport_thread;
@@ -629,16 +629,6 @@ typedef struct smb_lease {
 uint8_t ls_clnt[SMB_LEASE_KEY_SZ];
} smb_lease_t;
-#define SMB_VFS_MAGIC 0x534D4256 /* 'SMBV' */
-
-typedef struct smb_vfs {
- list_node_t sv_lnd;
- uint32_t sv_magic;
- uint32_t sv_refcnt;
- vfs_t *sv_vfsp;
- vnode_t *sv_rootvp;
-} smb_vfs_t;
-
#define SMB_NODE_MAGIC 0x4E4F4445 /* 'NODE' */
#define SMB_NODE_VALID(p) ASSERT((p)->n_magic == SMB_NODE_MAGIC)
@@ -703,6 +693,9 @@ typedef struct smb_node {
typedef struct smb_kshare {
 uint32_t shr_magic;
+ avl_node_t shr_link;
+ kmutex_t shr_mutex;
+ kcondvar_t shr_cv;
 char *shr_name;
 char *shr_path;
 char *shr_cmnt;
@@ -717,8 +710,9 @@ typedef struct smb_kshare {
 char *shr_access_none;
 char *shr_access_ro;
 char *shr_access_rw;
- avl_node_t shr_link;
- kmutex_t shr_mutex;
+ smb_node_t *shr_root_node;
+ smb_node_t *shr_ca_dir;
+ void *shr_import_busy;
 smb_cfg_val_t shr_encrypt; /* Share.EncryptData */
} smb_kshare_t;
@@ -984,7 +978,7 @@ typedef struct smb_session {
 unsigned char MAC_key[44];
 char ip_addr_str[INET6_ADDRSTRLEN];
 uint8_t clnt_uuid[16];
- char workstation[SMB_PI_MAX_HOST];
+ char workstation[SMB_PI_MAX_HOST];
} smb_session_t;
/*
@@ -1100,6 +1094,7 @@ typedef struct smb_user {
#define SMB_TREE_SPARSE 0x00040000
#define SMB_TREE_TRAVERSE_MOUNTS 0x00080000
#define SMB_TREE_FORCE_L2_OPLOCK 0x00100000
+#define SMB_TREE_CA 0x00200000
/* Note: SMB_TREE_... in the mdb module too. */
/*
@@ -1166,15 +1161,15 @@ typedef struct smb_tree {
 (((sr) && (sr)->tid_tree) ? \
 (((sr)->tid_tree->t_access) & (acemask)) : 0)))
-#define SMB_TREE_SUPPORTS_CATIA(sr) \
+#define SMB_TREE_SUPPORTS_CATIA(sr) \
 (((sr) && (sr)->tid_tree) ? \
 smb_tree_has_feature((sr)->tid_tree, SMB_TREE_CATIA) : 0)
-#define SMB_TREE_SUPPORTS_ABE(sr) \
+#define SMB_TREE_SUPPORTS_ABE(sr) \
 (((sr) && (sr)->tid_tree) ? \
 smb_tree_has_feature((sr)->tid_tree, SMB_TREE_ABE) : 0)
-#define SMB_TREE_IS_DFSROOT(sr) \
+#define SMB_TREE_IS_DFSROOT(sr) \
 (((sr) && (sr)->tid_tree) ? \
 smb_tree_has_feature((sr)->tid_tree, SMB_TREE_DFSROOT) : 0)
@@ -1202,7 +1197,7 @@ typedef struct smb_tree {
 (SMB_TREE_IS_READONLY((sr)) || \
 smb_node_file_is_readonly((node)))
-#define SMB_ODIR_MAGIC 0x4F444952 /* 'ODIR' */
+#define SMB_ODIR_MAGIC 0x4F444952 /* 'ODIR' */
#define SMB_ODIR_VALID(p) \
 ASSERT((p != NULL) && ((p)->d_magic == SMB_ODIR_MAGIC))
@@ -1332,7 +1327,7 @@ typedef struct smb_opipe {
#define SMB_OFLAGS_SET_DELETE_ON_CLOSE 0x0004
#define SMB_OFLAGS_LLF_POS_VALID 0x0008
-#define SMB_OFILE_MAGIC 0x4F464C45 /* 'OFLE' */
+#define SMB_OFILE_MAGIC 0x4F464C45 /* 'OFLE' */
#define SMB_OFILE_VALID(p) \
 ASSERT((p != NULL) && ((p)->f_magic == SMB_OFILE_MAGIC))
@@ -1416,6 +1411,10 @@ typedef struct smb_ofile {
 hrtime_t dh_timeout_offset; /* time offset for timeout */
 hrtime_t dh_expire_time; /* time the handle expires */
 boolean_t dh_persist;
+ kmutex_t dh_nvlock;
+ struct nvlist *dh_nvlist;
+ smb_node_t *dh_nvfile;
+ uint8_t dh_create_guid[16];
 char f_quota_resume[SMB_SID_STRSZ];
 uint8_t f_lock_seq[SMB_OFILE_LSEQ_MAX];
@@ -1441,7 +1440,7 @@ typedef struct smb_streaminfo {
 char si_name[MAXPATHLEN];
} smb_streaminfo_t;
-#define SMB_LOCK_MAGIC 0x4C4F434B /* 'LOCK' */
+#define SMB_LOCK_MAGIC 0x4C4F434B /* 'LOCK' */
typedef struct smb_lock {
 list_node_t l_lnd;
@@ -1472,7 +1471,7 @@ typedef struct smb_lock {
typedef struct vardata_block {
 uint8_t vdb_tag;
 uint32_t vdb_len;
- struct uio vdb_uio;
+ struct uio vdb_uio;
 struct iovec vdb_iovec[MAX_IOVEC];
} smb_vdb_t;
@@ -1760,7 +1759,7 @@ typedef struct smb_arg_olbrk {
 *
 */
-#define SMB_REQ_MAGIC 0x534D4252 /* 'SMBR' */
+#define SMB_REQ_MAGIC 0x534D4252 /* 'SMBR' */
#define SMB_REQ_VALID(p) ASSERT((p)->sr_magic == SMB_REQ_MAGIC)
typedef enum smb_req_state {
@@ -1810,7 +1809,7 @@ typedef struct smb_request {
 list_t sr_storage;
 struct smb_xa *r_xa;
 int andx_prev_wct;
- int cur_reply_offset;
+ int cur_reply_offset;
 int orig_request_hdr;
 unsigned int reply_seqnum; /* reply sequence number */
 unsigned char first_smb_com; /* command code */
@@ -1868,6 +1867,7 @@ typedef struct smb_request {
 uint8_t nonce[16];
 boolean_t encrypted;
+ boolean_t dh_nvl_dirty;
 boolean_t smb2_async;
 uint64_t smb2_async_id;
@@ -2068,7 +2068,7 @@ typedef enum smb_server_state {
typedef struct { /* protected by sv_mutex */
 kcondvar_t sp_cv;
- uint32_t sp_cnt;
+ uint32_t sp_cnt;
 smb_llist_t sp_list;
 smb_llist_t sp_fidlist;
} smb_spool_t;
@@ -2094,11 +2094,12 @@ typedef struct smb_server {
 krwlock_t sv_cfg_lock;
 smb_kmod_cfg_t sv_cfg;
 smb_session_t *sv_session;
+ smb_user_t *sv_rootuser;
 smb_llist_t sv_session_list;
 smb_hash_t *sv_persistid_ht;
 smb_hash_t *sv_lease_ht;
- struct smb_export sv_export;
+ smb_export_t sv_export;
 struct __door_handle *sv_lmshrd; /* Internal door for up-calls to smbd */
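The new dh_nvlock/dh_nvlist/dh_nvfile/dh_create_guid fields in smb_ofile_t (and dh_nvl_dirty in smb_request_t) suggest that SMB3 persistent-handle state is kept in a per-ofile nvlist and later flushed to a file under the share. A minimal sketch of recording one property, assuming the nvlist has already been allocated; the "create_guid" key name and the dirty-flag convention are illustrative assumptions, not taken from this commit:

#include <sys/nvpair.h>

/*
 * Illustrative sketch only: stash the create GUID of a persistent
 * handle in the ofile's nvlist and note that it needs to be written
 * out.  The key name is an assumption.
 */
static int
example_save_create_guid(smb_request_t *sr, smb_ofile_t *of)
{
	int rc;

	mutex_enter(&of->dh_nvlock);
	rc = nvlist_add_uint8_array(of->dh_nvlist, "create_guid",
	    of->dh_create_guid, sizeof (of->dh_create_guid));
	mutex_exit(&of->dh_nvlock);

	if (rc == 0)
		sr->dh_nvl_dirty = B_TRUE;	/* flush to dh_nvfile later */

	return (rc);
}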
diff --git a/usr/src/uts/common/smbsrv/smb_share.h b/usr/src/uts/common/smbsrv/smb_share.h
index 7c2219caad..090de59105 100644
--- a/usr/src/uts/common/smbsrv/smb_share.h
+++ b/usr/src/uts/common/smbsrv/smb_share.h
@@ -21,7 +21,7 @@
/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */
@@ -92,6 +92,7 @@ extern "C" {
#define SHOPT_AD_CONTAINER "ad-container"
#define SHOPT_ABE "abe"
#define SHOPT_NAME "name"
+#define SHOPT_CA "ca"
#define SHOPT_CSC "csc"
#define SHOPT_CATIA "catia"
#define SHOPT_GUEST "guestok"
@@ -185,6 +186,7 @@ extern "C" {
#define SMB_SHRF_QUOTAS 0x1000 /* Enable SMB Quotas */
#define SMB_SHRF_FSO 0x2000 /* Force Shared Oplocks */
+#define SMB_SHRF_CA 0x4000 /* Continuous Availability */
/*
 * Runtime flags
@@ -193,6 +195,7 @@ extern "C" {
#define SMB_SHRF_TRANS 0x10000000
#define SMB_SHRF_PERM 0x20000000
#define SMB_SHRF_AUTOHOME 0x40000000
+#define SMB_SHRF_REMOVED 0x80000000 /* unshared */
#define SMB_SHARE_PRINT "print$"
#define SMB_SHARE_PRINT_LEN 6
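The new "ca" share option (SMB_SHRF_CA) pairs with the SMB_TREE_CA tree flag added in smb_ktypes.h above; together they mark the shares whose opens can become SMB3 persistent handles. A sketch of how the share flag might be mapped onto a tree at connect time, assuming the kshare keeps its SMB_SHRF_* bits in a shr_flags field (not shown in these hunks) and using a made-up helper name:

/*
 * Illustrative sketch only: a share exported with the "ca" option
 * yields trees marked SMB_TREE_CA.  The shr_flags field and the
 * helper name are assumptions for illustration.
 */
static uint32_t
example_share_to_tree_flags(const smb_kshare_t *shr)
{
	uint32_t flags = 0;

	if ((shr->shr_flags & SMB_SHRF_CA) != 0)
		flags |= SMB_TREE_CA;

	return (flags);
}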
diff --git a/usr/src/uts/intel/vioblk/Makefile b/usr/src/uts/intel/vioblk/Makefile
index 5e5783fca6..ace9b626d0 100644
--- a/usr/src/uts/intel/vioblk/Makefile
+++ b/usr/src/uts/intel/vioblk/Makefile
@@ -1,90 +1,68 @@
#
-# CDDL HEADER START
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
#
+
#
# Copyright 2012 Nexenta Systems, Inc. All rights reserved.
+# Copyright 2019 Joyent, Inc.
#
#
-# Path to the base of the uts directory tree (usually /usr/src/uts).
+# Path to the base of the uts directory tree (usually /usr/src/uts).
#
-UTSBASE = ../..
+UTSBASE = ../..
#
-# Define the module and object file sets.
+# Define the module and object file sets.
#
-MODULE = vioblk
-OBJECTS = $(VIOBLK_OBJS:%=$(OBJS_DIR)/%)
-LINTS = $(VIOBLK_OBJS:%.o=$(LINTS_DIR)/%.ln)
-ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE)
+MODULE = vioblk
+OBJECTS = $(VIOBLK_OBJS:%=$(OBJS_DIR)/%)
+ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE)
#
-# Include common rules.
+# Include common rules.
#
include $(UTSBASE)/intel/Makefile.intel
#
-# Define targets
+# Define targets
#
-ALL_TARGET = $(BINARY)
-LINT_TARGET = $(MODULE).lint
-INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+ALL_TARGET = $(BINARY)
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
#
-# Overrides
+# Overrides
#
-
-INC_PATH += -I$(UTSBASE)/common/io/virtio
-
-#
-# lint pass one enforcement
-#
-CFLAGS += $(CCVERBOSE)
+INC_PATH += -I$(UTSBASE)/common/io/virtio
#
# Driver depends on virtio and blkdev
#
-LDFLAGS += -dy -N misc/virtio -N drv/blkdev
+LDFLAGS += -dy -N misc/virtio -N drv/blkdev
#
-# Default build targets.
#
.KEEP_STATE:
-def: $(DEF_DEPS)
-
-all: $(ALL_DEPS)
-
-clean: $(CLEAN_DEPS)
-
-clobber: $(CLOBBER_DEPS)
+def: $(DEF_DEPS)
-lint: $(LINT_DEPS)
+all: $(ALL_DEPS)
-modlintlib: $(MODLINTLIB_DEPS)
+clean: $(CLEAN_DEPS)
-clean.lint: $(CLEAN_LINT_DEPS)
+clobber: $(CLOBBER_DEPS)
-install: $(INSTALL_DEPS)
+install: $(INSTALL_DEPS)
#
-# Include common targets.
+# Include common targets.
#
include $(UTSBASE)/intel/Makefile.targ
diff --git a/usr/src/uts/intel/vioif/Makefile b/usr/src/uts/intel/vioif/Makefile
index ba87d97c61..a2dc4a337b 100644
--- a/usr/src/uts/intel/vioif/Makefile
+++ b/usr/src/uts/intel/vioif/Makefile
@@ -11,70 +11,58 @@
#
# Copyright 2013 Nexenta Inc. All rights reserved.
+# Copyright 2019 Joyent, Inc.
#
#
-# Path to the base of the uts directory tree (usually /usr/src/uts).
+# Path to the base of the uts directory tree (usually /usr/src/uts).
#
-UTSBASE = ../..
+UTSBASE = ../..
#
-# Define the module and object file sets.
+# Define the module and object file sets.
#
-MODULE = vioif
-OBJECTS = $(VIOIF_OBJS:%=$(OBJS_DIR)/%)
-LINTS = $(VIOIF_OBJS:%.o=$(LINTS_DIR)/%.ln)
-ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE)
+MODULE = vioif
+OBJECTS = $(VIOIF_OBJS:%=$(OBJS_DIR)/%)
+ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE)
#
-# Include common rules.
+# Include common rules.
#
include $(UTSBASE)/intel/Makefile.intel
#
-# Define targets
+# Define targets
#
-ALL_TARGET = $(BINARY)
-LINT_TARGET = $(MODULE).lint
-INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+ALL_TARGET = $(BINARY)
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
#
-# Overrides
+# Overrides
#
+INC_PATH += -I$(UTSBASE)/common/io/virtio
-INC_PATH += -I$(UTSBASE)/common/io/virtio
-
-#
-# lint pass one enforcement
-#
-CFLAGS += $(CCVERBOSE)
#
-# Driver depends on virtio and blkdev
+# Driver depends on virtio and mac
#
-LDFLAGS += -dy -N misc/virtio -N misc/mac
+LDFLAGS += -dy -N misc/virtio -N misc/mac
#
-# Default build targets.
+# Default build targets.
#
.KEEP_STATE:
-def: $(DEF_DEPS)
-
-all: $(ALL_DEPS)
-
-clean: $(CLEAN_DEPS)
-
-clobber: $(CLOBBER_DEPS)
+def: $(DEF_DEPS)
-lint: $(LINT_DEPS)
+all: $(ALL_DEPS)
-modlintlib: $(MODLINTLIB_DEPS)
+clean: $(CLEAN_DEPS)
-clean.lint: $(CLEAN_LINT_DEPS)
+clobber: $(CLOBBER_DEPS)
-install: $(INSTALL_DEPS)
+install: $(INSTALL_DEPS)
#
-# Include common targets.
+# Include common targets.
#
include $(UTSBASE)/intel/Makefile.targ
diff --git a/usr/src/uts/intel/virtio/Makefile b/usr/src/uts/intel/virtio/Makefile
index 1f6548a135..c5a0d05b6a 100644
--- a/usr/src/uts/intel/virtio/Makefile
+++ b/usr/src/uts/intel/virtio/Makefile
@@ -1,90 +1,63 @@
#
-# CDDL HEADER START
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
-# Copyright (c) 2018, Joyent, Inc.
+# Copyright 2019 Joyent, Inc.
#
#
-# Path to the base of the uts directory tree (usually /usr/src/uts).
+# Path to the base of the uts directory tree (usually /usr/src/uts).
#
-UTSBASE = ../..
+UTSBASE = ../..
#
-# Define the module and object file sets.
+# Define the module and object file sets.
#
-MODULE = virtio
-OBJECTS = $(VIRTIO_OBJS:%=$(OBJS_DIR)/%)
-LINTS = $(VIRTIO_OBJS:%.o=$(LINTS_DIR)/%.ln)
-ROOTMODULE = $(ROOT_MISC_DIR)/$(MODULE)
+MODULE = virtio
+OBJECTS = $(VIRTIO_OBJS:%=$(OBJS_DIR)/%)
+ROOTMODULE = $(ROOT_MISC_DIR)/$(MODULE)
#
-# Include common rules.
+# Include common rules.
#
include $(UTSBASE)/intel/Makefile.intel
#
-# Define targets
+# Define targets
#
-ALL_TARGET = $(BINARY)
-LINT_TARGET = $(MODULE).lint
-INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+ALL_TARGET = $(BINARY)
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
#
-# Overrides
+# Overrides
#
-
-INC_PATH += -I$(UTSBASE)/common/io/virtio
+INC_PATH += -I$(UTSBASE)/common/io/virtio
#
-# lint pass one enforcement
-#
-CFLAGS += $(CCVERBOSE)
-
-# needs work
-SMOFF += all_func_returns
-
-#
-# Default build targets.
+# Default build targets.
#
.KEEP_STATE:
-def: $(DEF_DEPS)
-
-all: $(ALL_DEPS)
-
-clean: $(CLEAN_DEPS)
-
-clobber: $(CLOBBER_DEPS)
+def: $(DEF_DEPS)
-lint: $(LINT_DEPS)
+all: $(ALL_DEPS)
-modlintlib: $(MODLINTLIB_DEPS)
+clean: $(CLEAN_DEPS)
-clean.lint: $(CLEAN_LINT_DEPS)
+clobber: $(CLOBBER_DEPS)
-install: $(INSTALL_DEPS)
+install: $(INSTALL_DEPS)
#
-# Include common targets.
+# Include common targets.
#
include $(UTSBASE)/intel/Makefile.targ