author    | mrj <none@none> | 2007-12-21 14:13:23 -0800
committer | mrj <none@none> | 2007-12-21 14:13:23 -0800
commit    | 551bc2a66868b5cb5be6b70ab9f55515e77a39a9 (patch)
tree      | a01e761c9864ea9483c468ced858a0f67edcbf93
parent    | 71a79fe7afa36dcf0de6902c2c6ef432980534d3 (diff)
download  | illumos-joyent-551bc2a66868b5cb5be6b70ab9f55515e77a39a9.tar.gz
PSARC 2007/664 Paravirtualized Drivers for Fully Virtualized xVM Domains
6525093 xnb/xnf should use hypervisor based copy for xnb->xnf data path
6608917 members of struct xnf and xnb need unique names
6609324 deadlock trying to own the HAT migrate lock
6609805 still missing XPV_DISALLOW_MIGRATE/XPV_ALLOW_MIGRATE bracketing in hat_i86.c
6616384 xnb's grant ref unmapping is inefficient
6619947 Solaris should provide a PV network driver for xVM HVM environments
6632774 panic setting up xen console
--HG--
rename : usr/src/uts/i86xpv/os/gnttab.c => usr/src/uts/common/xen/os/gnttab.c
rename : usr/src/uts/i86xpv/os/hypercall.c => usr/src/uts/common/xen/os/hypercall.c
rename : usr/src/uts/i86xpv/sys/gnttab.h => usr/src/uts/common/xen/sys/gnttab.h
rename : usr/src/uts/i86xpv/ml/hypersubr.s => usr/src/uts/intel/ia32/ml/hypersubr.s
rename : usr/src/uts/i86xpv/sys/hypervisor.h => usr/src/uts/intel/sys/hypervisor.h
rename : usr/src/uts/i86xpv/sys/xen_errno.h => usr/src/uts/intel/sys/xen_errno.h
58 files changed, 4045 insertions, 1096 deletions
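
[Editor's note: the core of this change (bugs 6525093 and 6619947) is a switch from page flipping to a hypervisor-based copy on the xnb->xnf data path. As a minimal sketch, assuming only the Xen grant-table ABI visible in the xnb.c hunks below (gnttab_copy_t, GNTCOPY_dest_gref, GNTTABOP_copy), this is how one copy descriptor is prepared. The helper name fill_copy_op is hypothetical; the commit's actual helper is setup_gop().]

    /*
     * Hypothetical sketch, not part of the commit: field-for-field
     * this mirrors setup_gop() in usr/src/uts/common/xen/io/xnb.c.
     */
    #include <sys/gnttab.h>

    static void
    fill_copy_op(gnttab_copy_t *gp, mfn_t src_mfn, uint16_t s_off,
        grant_ref_t d_ref, uint16_t d_off, uint16_t len, domid_t peer)
    {
            gp->source.u.gmfn = src_mfn;    /* source: our own page, by MFN */
            gp->source.offset = s_off;
            gp->source.domid = DOMID_SELF;

            gp->dest.u.ref = d_ref;         /* destination: peer page, by grant ref */
            gp->dest.offset = d_off;
            gp->dest.domid = peer;

            gp->len = len;
            gp->flags = GNTCOPY_dest_gref;  /* dest is a grant reference */
            gp->status = 0;                 /* filled in by the hypervisor */
    }

[A batch of such descriptors is then submitted in a single HYPERVISOR_grant_table_op(GNTTABOP_copy, ops, count) call, which is what lets xnb_copy_to_peer() below move several packets per hypercall instead of transferring page ownership per packet.]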
diff --git a/usr/src/cmd/boot/filelist/i386/filelist.ramdisk b/usr/src/cmd/boot/filelist/i386/filelist.ramdisk index 4029f595dc..7a7d78e457 100644 --- a/usr/src/cmd/boot/filelist/i386/filelist.ramdisk +++ b/usr/src/cmd/boot/filelist/i386/filelist.ramdisk @@ -16,6 +16,7 @@ etc/path_to_inst etc/rtc_config etc/system kernel +platform/i86hvm/kernel platform/i86pc/kernel platform/i86xpv/kernel platform/i86pc/ucode/GenuineIntel diff --git a/usr/src/pkgdefs/Makefile b/usr/src/pkgdefs/Makefile index 6e09175617..60c94941d6 100644 --- a/usr/src/pkgdefs/Makefile +++ b/usr/src/pkgdefs/Makefile @@ -142,7 +142,8 @@ i386_SUBDIRS= \ SUNWsi3124 \ SUNWvia823x \ SUNWwpi \ - SUNWxsvc + SUNWxsvc \ + SUNWxvmpv i386_XMODS= \ BRCMbnx \ diff --git a/usr/src/pkgdefs/SUNWhea/prototype_i386 b/usr/src/pkgdefs/SUNWhea/prototype_i386 index ec0dc2ad4b..5ab4b464d1 100644 --- a/usr/src/pkgdefs/SUNWhea/prototype_i386 +++ b/usr/src/pkgdefs/SUNWhea/prototype_i386 @@ -87,6 +87,7 @@ f none usr/include/sys/dktp/dadkio.h 644 root bin f none usr/include/sys/dktp/fdisk.h 644 root bin f none usr/include/sys/dma_engine.h 644 root bin f none usr/include/sys/fp.h 644 root bin +f none usr/include/sys/hypervisor.h 644 root bin f none usr/include/sys/i8272A.h 644 root bin f none usr/include/sys/kd.h 644 root bin f none usr/include/sys/mc.h 644 root bin @@ -120,6 +121,7 @@ f none usr/include/sys/traptrace.h 644 root bin f none usr/include/sys/tss.h 644 root bin f none usr/include/sys/x86_archext.h 644 root bin f none usr/include/sys/ucode.h 644 root bin +f none usr/include/sys/xen_errno.h 644 root bin d none usr/platform 755 root sys d none usr/platform/i86pc 755 root sys d none usr/platform/i86pc/include 755 root bin @@ -158,9 +160,7 @@ d none usr/platform/i86xpv 755 root sys d none usr/platform/i86xpv/include 755 root bin d none usr/platform/i86xpv/include/sys 755 root bin f none usr/platform/i86xpv/include/sys/balloon.h 644 root bin -f none usr/platform/i86xpv/include/sys/hypervisor.h 644 root bin f none usr/platform/i86xpv/include/sys/machprivregs.h 644 root bin -f none usr/platform/i86xpv/include/sys/xen_errno.h 644 root bin f none usr/platform/i86xpv/include/sys/xen_mmu.h 644 root bin f none usr/platform/i86xpv/include/sys/xpv_impl.h 644 root bin d none usr/platform/i86xpv/include/vm 755 root bin diff --git a/usr/src/pkgdefs/SUNWxvmpv/Makefile b/usr/src/pkgdefs/SUNWxvmpv/Makefile new file mode 100644 index 0000000000..a38f5dce9b --- /dev/null +++ b/usr/src/pkgdefs/SUNWxvmpv/Makefile @@ -0,0 +1,40 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# +# ident "%Z%%M% %I% %E% SMI" +# + +include ../Makefile.com + +TMPLFILES += postinstall preremove +DATAFILES += depend + +.KEEP_STATE: + +all: $(FILES) +install: all pkg + +include ../Makefile.targ +include ../Makefile.prtarg diff --git a/usr/src/pkgdefs/SUNWxvmpv/pkginfo.tmpl b/usr/src/pkgdefs/SUNWxvmpv/pkginfo.tmpl new file mode 100644 index 0000000000..29a5ae2c56 --- /dev/null +++ b/usr/src/pkgdefs/SUNWxvmpv/pkginfo.tmpl @@ -0,0 +1,47 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# + +PKG=SUNWxvmpv +NAME=xVM Paravirtualized Drivers +ARCH="i386" +VERSION="ONVERS,REV=0.0.0" +SUNW_PRODNAME="SunOS" +SUNW_PRODVERS="RELEASE/VERSION" +SUNW_PKGVERS="1.0" +SUNW_PKGTYPE="root" +MAXINST="1000" +CATEGORY=system +VENDOR="Sun Microsystems, Inc." +DESC="xVM Paravirtualized Drivers" +CLASSES="none preserve" +HOTLINE="Please contact your local service provider" +EMAIL="" +BASEDIR=/ +SUNW_PKG_ALLZONES="true" +SUNW_PKG_HOLLOW="true" +SUNW_PKG_THISZONE="false" diff --git a/usr/src/pkgdefs/SUNWxvmpv/postinstall.tmpl b/usr/src/pkgdefs/SUNWxvmpv/postinstall.tmpl new file mode 100644 index 0000000000..826d451125 --- /dev/null +++ b/usr/src/pkgdefs/SUNWxvmpv/postinstall.tmpl @@ -0,0 +1,34 @@ +#!/sbin/sh +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# +# ident "%Z%%M% %I% %E% SMI" +# + +include drv_utils + +pkg_drvadd -i "pci5853,1" -b "$BASEDIR" xpv || exit 1 +pkg_drvadd xpvd || exit 1 +pkg_drvadd xnf || exit 1 diff --git a/usr/src/pkgdefs/SUNWxvmpv/preremove.tmpl b/usr/src/pkgdefs/SUNWxvmpv/preremove.tmpl new file mode 100644 index 0000000000..006870f722 --- /dev/null +++ b/usr/src/pkgdefs/SUNWxvmpv/preremove.tmpl @@ -0,0 +1,33 @@ +#!/sbin/sh +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# + +include drv_utils +pkg_drvrem xnf || exit 1 +pkg_drvrem xpvd || exit 1 +pkg_drvrem xpv || exit 1 diff --git a/usr/src/pkgdefs/SUNWxvmpv/prototype_i386 b/usr/src/pkgdefs/SUNWxvmpv/prototype_i386 new file mode 100644 index 0000000000..16bf5a6a30 --- /dev/null +++ b/usr/src/pkgdefs/SUNWxvmpv/prototype_i386 @@ -0,0 +1,61 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# + +# +# This required package information file contains a list of package contents. +# The 'pkgmk' command uses this file to identify the contents of a package +# and their location on the development machine when building the package. +# Can be created via a text editor or through use of the 'pkgproto' command. 
+ +#!search <pathname pathname ...> # where to find pkg objects +#!include <filename> # include another 'prototype' file +#!default <mode> <owner> <group> # default used if not specified on entry +#!<param>=<value> # puts parameter in pkg environment + +# +# +i pkginfo +i copyright +i depend +i postinstall +i preremove + +# xVM PV drivers +d none platform 0755 root sys +d none platform/i86hvm 0755 root sys +d none platform/i86hvm/kernel 0755 root sys +d none platform/i86hvm/kernel/drv 0755 root sys +d none platform/i86hvm/kernel/drv/amd64 0755 root sys +f none platform/i86hvm/kernel/drv/amd64/xnf 0755 root sys +f none platform/i86hvm/kernel/drv/amd64/xpv 0755 root sys +f none platform/i86hvm/kernel/drv/amd64/xpvd 0755 root sys +f none platform/i86hvm/kernel/drv/xnf 0755 root sys +f none platform/i86hvm/kernel/drv/xpv 0755 root sys +f none platform/i86hvm/kernel/drv/xpv.conf 0644 root sys +f none platform/i86hvm/kernel/drv/xpvd 0755 root sys +f none platform/i86hvm/kernel/drv/xpvd.conf 0644 root sys diff --git a/usr/src/pkgdefs/etc/exception_list_i386 b/usr/src/pkgdefs/etc/exception_list_i386 index 12e706fc7d..8256e86212 100644 --- a/usr/src/pkgdefs/etc/exception_list_i386 +++ b/usr/src/pkgdefs/etc/exception_list_i386 @@ -1034,3 +1034,8 @@ usr/include/libvscan.h i386 # usr/lib/vscan/llib-lvscan i386 usr/lib/vscan/llib-lvscan.ln i386 +# +# i86hvm is not a full platform. It is just a home for paravirtualized +# drivers. There is no usr/ component to this sub-platform, but the +# directory is created in the proto area to keep other tools happy. +usr/platform/i86hvm i386 diff --git a/usr/src/tools/scripts/bfu.sh b/usr/src/tools/scripts/bfu.sh index f68fa09651..c86cb80626 100644 --- a/usr/src/tools/scripts/bfu.sh +++ b/usr/src/tools/scripts/bfu.sh @@ -2149,14 +2149,20 @@ if [ $diskless = no ]; then chgrp sys $root/platform/sun4u-us3 fi - if [ $target_isa = i386 -a $archive_type = xpv ]; then - # - # On i386, we want to apply the archives for both platforms - # (i86pc and i86xpv) if they exist. We force the platform - # to i86xpv so that both will be applied. - # - karch=i86pc - plat=i86xpv + if [ $target_isa = i386 ]; then + if [ $archive_type = xpv ]; then + # + # On i386, we want to apply the archives for both + # platforms (i86pc and i86xpv) if they exist. We + # force the platform to i86xpv so that both will be + # applied. + # + karch=i86pc + plat=i86xpv + fi + if [ ! 
-d $root/platform/i86hvm ]; then + mkdir $root/platform/i86hvm + fi fi if [ $karch != $plat -a -f ${cpiodir}/${plat}.usr$ZFIX ]; then diff --git a/usr/src/uts/common/sys/thread.h b/usr/src/uts/common/sys/thread.h index 74404edd3e..d545e093b3 100644 --- a/usr/src/uts/common/sys/thread.h +++ b/usr/src/uts/common/sys/thread.h @@ -292,6 +292,7 @@ typedef struct _kthread { uint8_t t_unpark; /* modified holding t_delay_lock */ uint8_t t_release; /* lwp_release() waked up the thread */ uint8_t t_hatdepth; /* depth of recursive hat_memloads */ + uint8_t t_xpvcntr; /* see xen_block_migrate() */ kcondvar_t t_joincv; /* cv used to wait for thread exit */ void *t_taskq; /* for threads belonging to taskq */ hrtime_t t_anttime; /* most recent time anticipatory load */ diff --git a/usr/src/uts/common/xen/io/xdb.c b/usr/src/uts/common/xen/io/xdb.c index b640010c22..33a075ac3d 100644 --- a/usr/src/uts/common/xen/io/xdb.c +++ b/usr/src/uts/common/xen/io/xdb.c @@ -50,9 +50,30 @@ #pragma ident "%Z%%M% %I% %E% SMI" -#include "xdb.h" -#include <sys/lofi.h> +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/dditypes.h> +#include <sys/sunddi.h> +#include <sys/list.h> +#include <sys/dkio.h> +#include <sys/cmlb.h> +#include <sys/vtoc.h> +#include <sys/modctl.h> +#include <sys/bootconf.h> +#include <sys/promif.h> +#include <sys/sysmacros.h> +#include <public/io/xenbus.h> +#include <xen/sys/xenbus_impl.h> +#include <xen/sys/xendev.h> +#include <sys/gnttab.h> +#include <sys/scsi/generic/inquiry.h> +#include <vm/seg_kmem.h> #include <vm/hat_i86.h> +#include <sys/gnttab.h> +#include <sys/lofi.h> +#include <io/xdf.h> +#include <io/xdb.h> static xdb_t *xdb_statep; static int xdb_debug = 0; diff --git a/usr/src/uts/common/xen/io/xdb.h b/usr/src/uts/common/xen/io/xdb.h index 81f6b5d9c2..d4d744d2ac 100644 --- a/usr/src/uts/common/xen/io/xdb.h +++ b/usr/src/uts/common/xen/io/xdb.h @@ -34,19 +34,6 @@ extern "C" { #endif -#include <sys/types.h> -#include <sys/conf.h> -#include <sys/ddi.h> -#include <sys/dditypes.h> -#include <sys/sunddi.h> -#include <sys/sunldi.h> -#include <sys/modctl.h> -#include <vm/seg_kmem.h> -#include <sys/gnttab.h> -#include <xen/sys/xenbus_impl.h> -#include <xen/sys/xendev.h> -#include "xdf.h" - #define XDB_DBG_ALL 0xf #define XDB_DBG_IO 0x1 #define XDB_DBG_INFO 0x2 diff --git a/usr/src/uts/common/xen/io/xdf.c b/usr/src/uts/common/xen/io/xdf.c index c820bb27c5..4d695ec992 100644 --- a/usr/src/uts/common/xen/io/xdf.c +++ b/usr/src/uts/common/xen/io/xdf.c @@ -33,7 +33,30 @@ #pragma ident "%Z%%M% %I% %E% SMI" -#include "xdf.h" +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/dditypes.h> +#include <sys/sunddi.h> +#include <sys/list.h> +#include <sys/cmlb.h> +#include <sys/dkio.h> +#include <sys/vtoc.h> +#include <sys/modctl.h> +#include <sys/bootconf.h> +#include <sys/promif.h> +#include <sys/sysmacros.h> +#include <sys/kstat.h> +#include <sys/mach_mmu.h> +#ifdef XPV_HVM_DRIVER +#include <sys/xpv_support.h> +#endif +#include <public/io/xenbus.h> +#include <xen/sys/xenbus_impl.h> +#include <xen/sys/xendev.h> +#include <sys/gnttab.h> +#include <sys/scsi/generic/inquiry.h> +#include <io/xdf.h> #define FLUSH_DISKCACHE 0x1 #define WRITE_BARRIER 0x2 @@ -302,6 +325,16 @@ xdf_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) ddi_iblock_cookie_t ibc; ddi_iblock_cookie_t softibc; int instance; +#if defined(XPV_HVM_DRIVER) && defined(__i386) + /* XXX: 6609126 32-bit xdf driver panics on a 64-bit dom0 */ + extern int xen_is_64bit; + + if (xen_is_64bit) { 
+ cmn_err(CE_WARN, "xdf cannot be used in 32-bit domUs on a" + " 64-bit dom0."); + return (DDI_FAILURE); + } +#endif xdfdebug = ddi_prop_get_int(DDI_DEV_T_ANY, devi, DDI_PROP_NOTPROM, "xdfdebug", 0); @@ -534,7 +567,11 @@ xdf_suspend(dev_info_t *devi) /* make sure no more I/O responses left in the ring buffer */ if ((st == XD_INIT) || (st == XD_READY)) { +#ifdef XPV_HVM_DRIVER + ec_unbind_evtchn(vdp->xdf_evtchn); +#else (void) ddi_remove_intr(devi, 0, NULL); +#endif (void) xdf_drain_io(vdp); /* * no need to teardown the ring buffer here @@ -1437,7 +1474,9 @@ xdf_drain_io(xdf_t *vdp) if (!xvdi_ring_has_incomp_request(xbr)) goto out; +#ifndef XPV_HVM_DRIVER (void) HYPERVISOR_yield(); +#endif /* * file-backed devices can be slow */ @@ -1616,12 +1655,17 @@ xdf_start_connect(xdf_t *vdp) ddi_get_name_addr(dip)); goto errout; } + vdp->xdf_evtchn = xvdi_get_evtchn(dip); +#ifdef XPV_HVM_DRIVER + ec_bind_evtchn_to_handler(vdp->xdf_evtchn, IPL_VBD, xdf_intr, vdp); +#else if (ddi_add_intr(dip, 0, NULL, NULL, xdf_intr, (caddr_t)vdp) != DDI_SUCCESS) { cmn_err(CE_WARN, "xdf_start_connect: xdf@%s: " "failed to add intr handler", ddi_get_name_addr(dip)); goto errout1; } +#endif if (xvdi_alloc_ring(dip, BLKIF_RING_SIZE, sizeof (union blkif_sring_entry), &gref, &vdp->xdf_xb_ring) != @@ -1657,7 +1701,7 @@ trans_retry: } if (rv = xenbus_printf(xbt, xsnode, "event-channel", "%u", - xvdi_get_evtchn(dip))) { + vdp->xdf_evtchn)) { cmn_err(CE_WARN, "xdf@%s: failed to write event-channel", ddi_get_name_addr(dip)); xvdi_fatal_error(dip, rv, "writing event-channel"); @@ -1694,7 +1738,11 @@ abort_trans: fail_trans: xvdi_free_ring(vdp->xdf_xb_ring); errout2: +#ifdef XPV_HVM_DRIVER + ec_unbind_evtchn(vdp->xdf_evtchn); +#else (void) ddi_remove_intr(vdp->xdf_dip, 0, NULL); +#endif errout1: xvdi_free_evtchn(dip); errout: @@ -1786,7 +1834,7 @@ xdf_post_connect(xdf_t *vdp) /* * We've created all the minor nodes via cmlb_attach() using default - * value in xdf_attach() to make it possbile to block in xdf_open(), + * value in xdf_attach() to make it possible to block in xdf_open(), * in case there's anyone (say, booting thread) ever trying to open * it before connected to backend. We will refresh all those minor * nodes w/ latest info we've got now when we are almost connected. 
@@ -1857,7 +1905,11 @@ xdf_post_connect(xdf_t *vdp) static void xdf_post_disconnect(xdf_t *vdp) { +#ifdef XPV_HVM_DRIVER + ec_unbind_evtchn(vdp->xdf_evtchn); +#else (void) ddi_remove_intr(vdp->xdf_dip, 0, NULL); +#endif xvdi_free_evtchn(vdp->xdf_dip); xvdi_free_ring(vdp->xdf_xb_ring); vdp->xdf_xb_ring = NULL; diff --git a/usr/src/uts/common/xen/io/xdf.h b/usr/src/uts/common/xen/io/xdf.h index c3992c62fc..ea796772dd 100644 --- a/usr/src/uts/common/xen/io/xdf.h +++ b/usr/src/uts/common/xen/io/xdf.h @@ -35,26 +35,6 @@ extern "C" { #endif -#include <sys/types.h> -#include <sys/conf.h> -#include <sys/ddi.h> -#include <sys/dditypes.h> -#include <sys/sunddi.h> -#include <sys/list.h> -#include <sys/dkio.h> -#include <sys/vtoc.h> -#include <sys/modctl.h> -#include <sys/bootconf.h> -#include <sys/promif.h> -#include <sys/open.h> -#include <sys/sysmacros.h> -#include <sys/kstat.h> -#include <sys/gnttab.h> -#include <xen/sys/xenbus_impl.h> -#include <xen/sys/xendev.h> -#include <sys/cmlb.h> -#include <sys/scsi/generic/inquiry.h> - #define BLKIF_RING_SIZE __RING_SIZE((blkif_sring_t *)NULL, PAGESIZE) /* @@ -108,7 +88,7 @@ enum xdf_state { }; /* - * 16 paritions + fdisk + * 16 partitions + fdisk */ #define XDF_PSHIFT 6 #define XDF_PMASK ((1 << XDF_PSHIFT) - 1) @@ -176,7 +156,7 @@ typedef struct v_req { * Status set and checked in vreq->v_status by vreq_setup() * * These flags will help us to continue the vreq setup work from last failure - * point, instead of starting from scrath after each failure. + * point, instead of starting from scratch after each failure. */ #define VREQ_INIT 0x0 #define VREQ_INIT_DONE 0x1 @@ -218,6 +198,7 @@ typedef struct xdf { int xdf_wce; char *xdf_flush_mem; char *xdf_cache_flush_block; + int xdf_evtchn; #ifdef DEBUG int xdf_dmacallback_num; #endif diff --git a/usr/src/uts/common/xen/io/xenbus_client.c b/usr/src/uts/common/xen/io/xenbus_client.c index b0e2b5e520..b0cb441332 100644 --- a/usr/src/uts/common/xen/io/xenbus_client.c +++ b/usr/src/uts/common/xen/io/xenbus_client.c @@ -55,9 +55,14 @@ #pragma ident "%Z%%M% %I% %E% SMI" +#ifdef XPV_HVM_DRIVER +#include <sys/xpv_support.h> +#include <sys/hypervisor.h> +#else #include <sys/hypervisor.h> #include <sys/xen_mmu.h> #include <sys/evtchn_impl.h> +#endif #include <sys/gnttab.h> #include <xen/sys/xenbus_impl.h> #include <sys/cmn_err.h> diff --git a/usr/src/uts/common/xen/io/xenbus_comms.c b/usr/src/uts/common/xen/io/xenbus_comms.c index ee4c162bf4..e7eb20f166 100644 --- a/usr/src/uts/common/xen/io/xenbus_comms.c +++ b/usr/src/uts/common/xen/io/xenbus_comms.c @@ -59,12 +59,18 @@ #include <sys/types.h> #include <vm/hat.h> #include <vm/as.h> -#include <sys/bootinfo.h> #include <sys/bootconf.h> -#include <vm/kboot_mmu.h> #include <vm/seg_kmem.h> +#ifdef XPV_HVM_DRIVER +#include <sys/pc_mmu.h> +#include <sys/xpv_support.h> +#include <sys/hypervisor.h> +#else +#include <vm/kboot_mmu.h> +#include <sys/bootinfo.h> #include <sys/hypervisor.h> #include <sys/evtchn_impl.h> +#endif #include <sys/condvar.h> #include <sys/mutex.h> #include <sys/atomic.h> @@ -240,10 +246,19 @@ xb_suspend(void) void xb_setup_intr(void) { +#ifdef XPV_HVM_DRIVER + ec_bind_evtchn_to_handler(xen_info->store_evtchn, IPL_XENBUS, + xenbus_intr, NULL); +#else xenbus_irq = ec_bind_evtchn_to_irq(xen_info->store_evtchn); + if (xenbus_irq < 0) { + cmn_err(CE_WARN, "Couldn't bind xenbus event channel"); + return; + } if (!add_avintr(NULL, IPL_XENBUS, (avfunc)xenbus_intr, "xenbus", xenbus_irq, NULL, NULL, NULL, NULL)) cmn_err(CE_WARN, "XENBUS add intr failed\n"); +#endif } 
/* diff --git a/usr/src/uts/common/xen/io/xenbus_dev.c b/usr/src/uts/common/xen/io/xenbus_dev.c index 57c57d886f..0eb82322b0 100644 --- a/usr/src/uts/common/xen/io/xenbus_dev.c +++ b/usr/src/uts/common/xen/io/xenbus_dev.c @@ -71,10 +71,15 @@ #include <sys/condvar.h> #include <sys/ddi.h> #include <sys/sunddi.h> +#ifdef XPV_HVM_DRIVER +#include <public/io/xenbus.h> +#include <public/io/xs_wire.h> +#include <sys/xpv_support.h> +#endif #include <sys/hypervisor.h> +#include <xen/sys/xenbus.h> #include <xen/sys/xenbus_comms.h> #include <xen/sys/xenbus_impl.h> -#include <xen/sys/xenbus.h> #include <xen/public/io/xs_wire.h> #ifdef DEBUG @@ -287,8 +292,10 @@ xenbusdrv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) xenbusdrv_dip = dip; ddi_report_dev(dip); +#ifndef XPV_HVM_DRIVER if (DOMAIN_IS_INITDOMAIN(xen_info)) xs_dom0_init(); +#endif return (DDI_SUCCESS); diff --git a/usr/src/uts/common/xen/io/xenbus_probe.c b/usr/src/uts/common/xen/io/xenbus_probe.c index 18d1e7a7d7..ebf3a12a3e 100644 --- a/usr/src/uts/common/xen/io/xenbus_probe.c +++ b/usr/src/uts/common/xen/io/xenbus_probe.c @@ -55,8 +55,10 @@ #pragma ident "%Z%%M% %I% %E% SMI" +#ifdef XPV_HVM_DRIVER +#include <sys/xpv_support.h> +#endif #include <sys/hypervisor.h> -#include <sys/evtchn_impl.h> #include <xen/sys/xenbus_impl.h> #include <xen/sys/xenbus_comms.h> #include <xen/public/io/xs_wire.h> diff --git a/usr/src/uts/common/xen/io/xenbus_xs.c b/usr/src/uts/common/xen/io/xenbus_xs.c index 04ac2988e3..39f41ecd60 100644 --- a/usr/src/uts/common/xen/io/xenbus_xs.c +++ b/usr/src/uts/common/xen/io/xenbus_xs.c @@ -78,10 +78,13 @@ #include <sys/sunddi.h> #include <sys/avintr.h> #include <sys/cmn_err.h> +#include <sys/mach_mmu.h> #include <util/sscanf.h> #define _XSD_ERRORS_DEFINED +#ifdef XPV_HVM_DRIVER +#include <sys/xpv_support.h> +#endif #include <sys/hypervisor.h> -#include <sys/mach_mmu.h> #include <sys/taskq.h> #include <sys/sdt.h> #include <xen/sys/xenbus_impl.h> diff --git a/usr/src/uts/common/xen/io/xencons.c b/usr/src/uts/common/xen/io/xencons.c index d6eb84dc91..891b2f18e5 100644 --- a/usr/src/uts/common/xen/io/xencons.c +++ b/usr/src/uts/common/xen/io/xencons.c @@ -290,9 +290,9 @@ xenconssetup(struct xencons *xcp) mutex_exit(&xcp->excl); } else { (void) xvdi_alloc_evtchn(xcp->dip); + xcp->evtchn = xvdi_get_evtchn(xcp->dip); (void) ddi_add_intr(xcp->dip, 0, NULL, NULL, xenconsintr, (caddr_t)xcp); - xcp->evtchn = xvdi_get_evtchn(xcp->dip); } } diff --git a/usr/src/uts/common/xen/io/xnb.c b/usr/src/uts/common/xen/io/xnb.c index b13a9354c2..7202754860 100644 --- a/usr/src/uts/common/xen/io/xnb.c +++ b/usr/src/uts/common/xen/io/xnb.c @@ -40,6 +40,7 @@ #include <sys/dlpi.h> #include <sys/strsubr.h> #include <sys/strsun.h> +#include <sys/types.h> #include <sys/pattr.h> #include <vm/seg_kmem.h> #include <vm/hat_i86.h> @@ -101,8 +102,17 @@ static void xnb_rxbuf_put(xnb_t *, xnb_rxbuf_t *); static void xnb_rx_notify_peer(xnb_t *); static void xnb_rx_complete(xnb_rxbuf_t *); static void xnb_rx_mark_complete(xnb_t *, RING_IDX, int16_t); -static void xnb_rx_schedule_unmop(xnb_t *, gnttab_map_grant_ref_t *); +static void xnb_rx_schedule_unmop(xnb_t *, gnttab_map_grant_ref_t *, + xnb_rxbuf_t *); static void xnb_rx_perform_pending_unmop(xnb_t *); +mblk_t *xnb_copy_to_peer(xnb_t *, mblk_t *); + +int xnb_unmop_lowwat = NET_TX_RING_SIZE >> 2; +int xnb_unmop_hiwat = NET_TX_RING_SIZE - (NET_TX_RING_SIZE >> 2); + + +boolean_t xnb_hv_copy = B_TRUE; +boolean_t xnb_explicit_pageflip_set = B_FALSE; #ifdef XNB_DEBUG #define NR_GRANT_ENTRIES \ @@ -129,12 +139,17 
@@ static char *aux_statistics[] = { "tx_too_early", "rx_too_early", "rx_allocb_failed", + "tx_allocb_failed", + "tx_foreign_page", "mac_full", "spurious_intr", "allocation_success", "allocation_failure", "small_allocation_success", "small_allocation_failure", + "other_allocation_failure", + "tx_pageboundary_crossed", + "tx_cpoparea_grown", "csum_hardware", "csum_software", }; @@ -155,23 +170,28 @@ xnb_ks_aux_update(kstat_t *ksp, int flag) * Assignment order should match that of the names in * aux_statistics. */ - (knp++)->value.ui64 = xnbp->x_stat_tx_cksum_deferred; - (knp++)->value.ui64 = xnbp->x_stat_rx_cksum_no_need; - (knp++)->value.ui64 = xnbp->x_stat_tx_notify_deferred; - (knp++)->value.ui64 = xnbp->x_stat_tx_notify_sent; - (knp++)->value.ui64 = xnbp->x_stat_rx_notify_deferred; - (knp++)->value.ui64 = xnbp->x_stat_rx_notify_sent; - (knp++)->value.ui64 = xnbp->x_stat_tx_too_early; - (knp++)->value.ui64 = xnbp->x_stat_rx_too_early; - (knp++)->value.ui64 = xnbp->x_stat_rx_allocb_failed; - (knp++)->value.ui64 = xnbp->x_stat_mac_full; - (knp++)->value.ui64 = xnbp->x_stat_spurious_intr; - (knp++)->value.ui64 = xnbp->x_stat_allocation_success; - (knp++)->value.ui64 = xnbp->x_stat_allocation_failure; - (knp++)->value.ui64 = xnbp->x_stat_small_allocation_success; - (knp++)->value.ui64 = xnbp->x_stat_small_allocation_failure; - (knp++)->value.ui64 = xnbp->x_stat_csum_hardware; - (knp++)->value.ui64 = xnbp->x_stat_csum_software; + (knp++)->value.ui64 = xnbp->xnb_stat_tx_cksum_deferred; + (knp++)->value.ui64 = xnbp->xnb_stat_rx_cksum_no_need; + (knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_deferred; + (knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_sent; + (knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_deferred; + (knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_sent; + (knp++)->value.ui64 = xnbp->xnb_stat_tx_too_early; + (knp++)->value.ui64 = xnbp->xnb_stat_rx_too_early; + (knp++)->value.ui64 = xnbp->xnb_stat_rx_allocb_failed; + (knp++)->value.ui64 = xnbp->xnb_stat_tx_allocb_failed; + (knp++)->value.ui64 = xnbp->xnb_stat_tx_foreign_page; + (knp++)->value.ui64 = xnbp->xnb_stat_mac_full; + (knp++)->value.ui64 = xnbp->xnb_stat_spurious_intr; + (knp++)->value.ui64 = xnbp->xnb_stat_allocation_success; + (knp++)->value.ui64 = xnbp->xnb_stat_allocation_failure; + (knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_success; + (knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_failure; + (knp++)->value.ui64 = xnbp->xnb_stat_other_allocation_failure; + (knp++)->value.ui64 = xnbp->xnb_stat_tx_pagebndry_crossed; + (knp++)->value.ui64 = xnbp->xnb_stat_tx_cpoparea_grown; + (knp++)->value.ui64 = xnbp->xnb_stat_csum_hardware; + (knp++)->value.ui64 = xnbp->xnb_stat_csum_software; return (0); } @@ -187,16 +207,16 @@ xnb_ks_init(xnb_t *xnbp) /* * Create and initialise kstats. 
*/ - xnbp->x_kstat_aux = kstat_create(ddi_driver_name(xnbp->x_devinfo), - ddi_get_instance(xnbp->x_devinfo), "aux_statistics", "net", + xnbp->xnb_kstat_aux = kstat_create(ddi_driver_name(xnbp->xnb_devinfo), + ddi_get_instance(xnbp->xnb_devinfo), "aux_statistics", "net", KSTAT_TYPE_NAMED, nstat, 0); - if (xnbp->x_kstat_aux == NULL) + if (xnbp->xnb_kstat_aux == NULL) return (B_FALSE); - xnbp->x_kstat_aux->ks_private = xnbp; - xnbp->x_kstat_aux->ks_update = xnb_ks_aux_update; + xnbp->xnb_kstat_aux->ks_private = xnbp; + xnbp->xnb_kstat_aux->ks_update = xnb_ks_aux_update; - knp = xnbp->x_kstat_aux->ks_data; + knp = xnbp->xnb_kstat_aux->ks_data; while (nstat > 0) { kstat_named_init(knp, *cp, KSTAT_DATA_UINT64); @@ -205,7 +225,7 @@ xnb_ks_init(xnb_t *xnbp) nstat--; } - kstat_install(xnbp->x_kstat_aux); + kstat_install(xnbp->xnb_kstat_aux); return (B_TRUE); } @@ -213,7 +233,7 @@ xnb_ks_init(xnb_t *xnbp) static void xnb_ks_free(xnb_t *xnbp) { - kstat_delete(xnbp->x_kstat_aux); + kstat_delete(xnbp->xnb_kstat_aux); } /* @@ -301,7 +321,7 @@ xnb_process_cksum_flags(xnb_t *xnbp, mblk_t *mp, uint32_t capab) 0, 0, 0, 0, HCK_FULLCKSUM, KM_NOSLEEP); - xnbp->x_stat_csum_hardware++; + xnbp->xnb_stat_csum_hardware++; return (mp); } @@ -323,7 +343,7 @@ software: * We are not able to use any offload so do the whole thing in * software. */ - xnbp->x_stat_csum_software++; + xnbp->xnb_stat_csum_software++; return (xnb_software_csum(xnbp, mp)); } @@ -336,38 +356,46 @@ xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data) xnbp = kmem_zalloc(sizeof (*xnbp), KM_SLEEP); - xnbp->x_flavour = flavour; - xnbp->x_flavour_data = flavour_data; - xnbp->x_devinfo = dip; - xnbp->x_evtchn = INVALID_EVTCHN; - xnbp->x_irq = B_FALSE; - xnbp->x_tx_ring_handle = INVALID_GRANT_HANDLE; - xnbp->x_rx_ring_handle = INVALID_GRANT_HANDLE; - xnbp->x_cksum_offload = xnb_cksum_offload; - xnbp->x_connected = B_FALSE; - xnbp->x_hotplugged = B_FALSE; - xnbp->x_detachable = B_FALSE; - xnbp->x_peer = xvdi_get_oeid(dip); - xnbp->x_rx_pages_writable = B_FALSE; - - xnbp->x_rx_buf_count = 0; - xnbp->x_rx_unmop_count = 0; - - xnbp->x_tx_va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP); - ASSERT(xnbp->x_tx_va != NULL); - - if (ddi_get_iblock_cookie(dip, 0, &xnbp->x_icookie) + xnbp->xnb_flavour = flavour; + xnbp->xnb_flavour_data = flavour_data; + xnbp->xnb_devinfo = dip; + xnbp->xnb_evtchn = INVALID_EVTCHN; + xnbp->xnb_irq = B_FALSE; + xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE; + xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE; + xnbp->xnb_cksum_offload = xnb_cksum_offload; + xnbp->xnb_connected = B_FALSE; + xnbp->xnb_hotplugged = B_FALSE; + xnbp->xnb_detachable = B_FALSE; + xnbp->xnb_peer = xvdi_get_oeid(dip); + xnbp->xnb_rx_pages_writable = B_FALSE; + + xnbp->xnb_rx_buf_count = 0; + xnbp->xnb_rx_unmop_count = 0; + + xnbp->xnb_hv_copy = B_FALSE; + + xnbp->xnb_tx_va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP); + ASSERT(xnbp->xnb_tx_va != NULL); + + if (ddi_get_iblock_cookie(dip, 0, &xnbp->xnb_icookie) != DDI_SUCCESS) goto failure; - mutex_init(&xnbp->x_tx_lock, NULL, MUTEX_DRIVER, xnbp->x_icookie); - mutex_init(&xnbp->x_rx_lock, NULL, MUTEX_DRIVER, xnbp->x_icookie); + /* allocated on demand, when/if we enter xnb_copy_to_peer() */ + xnbp->xnb_tx_cpop = NULL; + xnbp->xnb_cpop_sz = 0; + + mutex_init(&xnbp->xnb_tx_lock, NULL, MUTEX_DRIVER, + xnbp->xnb_icookie); + mutex_init(&xnbp->xnb_rx_lock, NULL, MUTEX_DRIVER, + xnbp->xnb_icookie); /* set driver private pointer now */ ddi_set_driver_private(dip, xnbp); if (!xnb_ks_init(xnbp)) - goto 
late_failure; + goto failure_1; /* * Receive notification of changes in the state of the @@ -375,35 +403,52 @@ xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data) */ if (xvdi_add_event_handler(dip, XS_OE_STATE, xnb_oe_state_change) != DDI_SUCCESS) - goto very_late_failure; + goto failure_2; /* * Receive notification of hotplug events. */ if (xvdi_add_event_handler(dip, XS_HP_STATE, xnb_hp_state_change) != DDI_SUCCESS) - goto very_late_failure; + goto failure_2; xsname = xvdi_get_xsname(dip); if (xenbus_printf(XBT_NULL, xsname, "feature-no-csum-offload", "%d", - xnbp->x_cksum_offload ? 0 : 1) != 0) - goto very_very_late_failure; + xnbp->xnb_cksum_offload ? 0 : 1) != 0) + goto failure_3; + + /* + * Use global xnb_hv_copy to export this feature. This means that + * we have to decide what to do before starting up a guest domain + */ + if (xenbus_printf(XBT_NULL, xsname, + "feature-rx-copy", "%d", xnb_hv_copy ? 1 : 0) != 0) + goto failure_3; + /* + * Linux domUs seem to depend on "feature-rx-flip" being 0 + * in addition to "feature-rx-copy" being 1. It seems strange + * to use four possible states to describe a binary decision, + * but we might as well play nice. + */ + if (xenbus_printf(XBT_NULL, xsname, + "feature-rx-flip", "%d", xnb_explicit_pageflip_set ? 1 : 0) != 0) + goto failure_3; if (xenbus_scanf(XBT_NULL, xsname, "mac", "%s", mac) != 0) { cmn_err(CE_WARN, "xnb_attach: " "cannot read mac address from %s", xsname); - goto very_very_late_failure; + goto failure_3; } - if (ether_aton(mac, xnbp->x_mac_addr) != ETHERADDRL) { + if (ether_aton(mac, xnbp->xnb_mac_addr) != ETHERADDRL) { cmn_err(CE_WARN, "xnb_attach: cannot parse mac address %s", mac); - goto very_very_late_failure; + goto failure_3; } (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait); @@ -411,18 +456,18 @@ xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data) return (DDI_SUCCESS); -very_very_late_failure: /* not that the naming is getting silly or anything */ +failure_3: xvdi_remove_event_handler(dip, NULL); -very_late_failure: +failure_2: xnb_ks_free(xnbp); -late_failure: - mutex_destroy(&xnbp->x_rx_lock); - mutex_destroy(&xnbp->x_tx_lock); +failure_1: + mutex_destroy(&xnbp->xnb_rx_lock); + mutex_destroy(&xnbp->xnb_tx_lock); failure: - vmem_free(heap_arena, xnbp->x_tx_va, PAGESIZE); + vmem_free(heap_arena, xnbp->xnb_tx_va, PAGESIZE); kmem_free(xnbp, sizeof (*xnbp)); return (DDI_FAILURE); } @@ -434,8 +479,8 @@ xnb_detach(dev_info_t *dip) xnb_t *xnbp = ddi_get_driver_private(dip); ASSERT(xnbp != NULL); - ASSERT(!xnbp->x_connected); - ASSERT(xnbp->x_rx_buf_count == 0); + ASSERT(!xnbp->xnb_connected); + ASSERT(xnbp->xnb_rx_buf_count == 0); xnb_disconnect_rings(dip); @@ -445,11 +490,15 @@ xnb_detach(dev_info_t *dip) ddi_set_driver_private(dip, NULL); - mutex_destroy(&xnbp->x_tx_lock); - mutex_destroy(&xnbp->x_rx_lock); + mutex_destroy(&xnbp->xnb_tx_lock); + mutex_destroy(&xnbp->xnb_rx_lock); + + if (xnbp->xnb_cpop_sz > 0) + kmem_free(xnbp->xnb_tx_cpop, sizeof (*xnbp->xnb_tx_cpop) + * xnbp->xnb_cpop_sz); - ASSERT(xnbp->x_tx_va != NULL); - vmem_free(heap_arena, xnbp->x_tx_va, PAGESIZE); + ASSERT(xnbp->xnb_tx_va != NULL); + vmem_free(heap_arena, xnbp->xnb_tx_va, PAGESIZE); kmem_free(xnbp, sizeof (*xnbp)); } @@ -467,29 +516,27 @@ xnb_alloc_page(xnb_t *xnbp) mutex_enter(&xnb_alloc_page_lock); if (nth == BATCH_SIZE) { if (balloon_alloc_pages(BATCH_SIZE, mfns) != BATCH_SIZE) { - xnbp->x_stat_allocation_failure++; + xnbp->xnb_stat_allocation_failure++; mutex_exit(&xnb_alloc_page_lock); /* 
* Try for a single page in low memory situations. */ if (balloon_alloc_pages(1, &mfn) != 1) { - xnbp->x_stat_small_allocation_failure++; - if ((xnbp->x_stat_small_allocation_failure - % WARNING_RATE_LIMIT) == 0) { + if ((xnbp->xnb_stat_small_allocation_failure++ + % WARNING_RATE_LIMIT) == 0) cmn_err(CE_WARN, "xnb_alloc_page: " "Cannot allocate memory to " "transfer packets to peer."); - } return (0); } else { - xnbp->x_stat_small_allocation_success++; + xnbp->xnb_stat_small_allocation_success++; return (mfn); } } nth = 0; - xnbp->x_stat_allocation_success++; + xnbp->xnb_stat_allocation_success++; } mfn = mfns[nth++]; @@ -524,6 +571,16 @@ xnb_free_page(xnb_t *xnbp, mfn_t mfn) } } +/* + * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->rx_ring) but + * using local variables. + */ +#define XNB_RING_HAS_UNCONSUMED_REQUESTS(_r) \ + ((((_r)->sring->req_prod - loop) < \ + (RING_SIZE(_r) - (loop - prod))) ? \ + ((_r)->sring->req_prod - loop) : \ + (RING_SIZE(_r) - (loop - prod))) + mblk_t * xnb_to_peer(xnb_t *xnbp, mblk_t *mp) { @@ -549,35 +606,26 @@ xnb_to_peer(xnb_t *xnbp, mblk_t *mp) * to transfer them. */ - mutex_enter(&xnbp->x_tx_lock); + mutex_enter(&xnbp->xnb_tx_lock); /* * If we are not connected to the peer or have not yet * finished hotplug it is too early to pass packets to the * peer. */ - if (!(xnbp->x_connected && xnbp->x_hotplugged)) { - mutex_exit(&xnbp->x_tx_lock); - xnbp->x_stat_tx_too_early++; + if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) { + mutex_exit(&xnbp->xnb_tx_lock); + DTRACE_PROBE(flip_tx_too_early); + xnbp->xnb_stat_tx_too_early++; return (mp); } - loop = xnbp->x_rx_ring.req_cons; - prod = xnbp->x_rx_ring.rsp_prod_pvt; - gop = xnbp->x_tx_top; - - /* - * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->x_rx_ring) but - * using local variables. - */ -#define XNB_RING_HAS_UNCONSUMED_REQUESTS(_r) \ - ((((_r)->sring->req_prod - loop) < \ - (RING_SIZE(_r) - (loop - prod))) ? \ - ((_r)->sring->req_prod - loop) : \ - (RING_SIZE(_r) - (loop - prod))) + loop = xnbp->xnb_rx_ring.req_cons; + prod = xnbp->xnb_rx_ring.rsp_prod_pvt; + gop = xnbp->xnb_tx_top; while ((mp != NULL) && - XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->x_rx_ring)) { + XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) { mfn_t mfn; pfn_t pfn; @@ -590,12 +638,12 @@ xnb_to_peer(xnb_t *xnbp, mblk_t *mp) /* 1 */ if ((mfn = xnb_alloc_page(xnbp)) == 0) { - xnbp->x_stat_xmit_defer++; + xnbp->xnb_stat_xmit_defer++; break; } /* 2 */ - rxreq = RING_GET_REQUEST(&xnbp->x_rx_ring, loop); + rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop); #ifdef XNB_DEBUG if (!(rxreq->id < NET_RX_RING_SIZE)) @@ -610,14 +658,14 @@ xnb_to_peer(xnb_t *xnbp, mblk_t *mp) /* Assign a pfn and map the new page at the allocated va. */ pfn = xen_assign_pfn(mfn); - hat_devload(kas.a_hat, xnbp->x_tx_va, PAGESIZE, + hat_devload(kas.a_hat, xnbp->xnb_tx_va, PAGESIZE, pfn, PROT_READ | PROT_WRITE, HAT_LOAD); offset = TX_BUFFER_HEADROOM; /* 3 */ len = 0; - valoop = xnbp->x_tx_va + offset; + valoop = xnbp->xnb_tx_va + offset; for (ml = mp; ml != NULL; ml = ml->b_cont) { size_t chunk = ml->b_wptr - ml->b_rptr; @@ -629,26 +677,26 @@ xnb_to_peer(xnb_t *xnbp, mblk_t *mp) ASSERT(len + offset < PAGESIZE); /* Release the pfn. 
*/ - hat_unload(kas.a_hat, xnbp->x_tx_va, PAGESIZE, + hat_unload(kas.a_hat, xnbp->xnb_tx_va, PAGESIZE, HAT_UNLOAD_UNMAP); xen_release_pfn(pfn); /* 4 */ gop->mfn = mfn; - gop->domid = xnbp->x_peer; + gop->domid = xnbp->xnb_peer; gop->ref = rxreq->gref; /* 5.1 */ - rxresp = RING_GET_RESPONSE(&xnbp->x_rx_ring, prod); + rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod); rxresp->offset = offset; rxresp->flags = 0; - cksum_flags = xnbp->x_flavour->xf_cksum_to_peer(xnbp, mp); + cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp); if (cksum_flags != 0) - xnbp->x_stat_tx_cksum_deferred++; + xnbp->xnb_stat_tx_cksum_deferred++; rxresp->flags |= cksum_flags; - rxresp->id = RING_GET_REQUEST(&xnbp->x_rx_ring, prod)->id; + rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id; rxresp->status = len; loop++; @@ -661,8 +709,8 @@ xnb_to_peer(xnb_t *xnbp, mblk_t *mp) /* * Did we actually do anything? */ - if (loop == xnbp->x_rx_ring.req_cons) { - mutex_exit(&xnbp->x_tx_lock); + if (loop == xnbp->xnb_rx_ring.req_cons) { + mutex_exit(&xnbp->xnb_tx_lock); return (mp); } @@ -674,14 +722,14 @@ xnb_to_peer(xnb_t *xnbp, mblk_t *mp) ASSERT(prev != NULL); prev->b_next = NULL; - if (HYPERVISOR_grant_table_op(GNTTABOP_transfer, xnbp->x_tx_top, - loop - xnbp->x_rx_ring.req_cons) != 0) { + if (HYPERVISOR_grant_table_op(GNTTABOP_transfer, xnbp->xnb_tx_top, + loop - xnbp->xnb_rx_ring.req_cons) != 0) { cmn_err(CE_WARN, "xnb_to_peer: transfer operation failed"); } - loop = xnbp->x_rx_ring.req_cons; - prod = xnbp->x_rx_ring.rsp_prod_pvt; - gop = xnbp->x_tx_top; + loop = xnbp->xnb_rx_ring.req_cons; + prod = xnbp->xnb_rx_ring.rsp_prod_pvt; + gop = xnbp->xnb_tx_top; while (loop < end) { int16_t status = NETIF_RSP_OKAY; @@ -716,11 +764,11 @@ xnb_to_peer(xnb_t *xnbp, mblk_t *mp) /* 5.2 */ if (status != NETIF_RSP_OKAY) { - RING_GET_RESPONSE(&xnbp->x_rx_ring, prod)->status = + RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status = status; } else { - xnbp->x_stat_opackets++; - xnbp->x_stat_obytes += len; + xnbp->xnb_stat_opackets++; + xnbp->xnb_stat_obytes += len; } loop++; @@ -728,23 +776,23 @@ xnb_to_peer(xnb_t *xnbp, mblk_t *mp) gop++; } - xnbp->x_rx_ring.req_cons = loop; - xnbp->x_rx_ring.rsp_prod_pvt = prod; + xnbp->xnb_rx_ring.req_cons = loop; + xnbp->xnb_rx_ring.rsp_prod_pvt = prod; /* 6 */ - /*LINTED: constant in conditional context*/ - RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->x_rx_ring, notify); + /* LINTED: constant in conditional context */ + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify); if (notify) { - ec_notify_via_evtchn(xnbp->x_evtchn); - xnbp->x_stat_tx_notify_sent++; + ec_notify_via_evtchn(xnbp->xnb_evtchn); + xnbp->xnb_stat_tx_notify_sent++; } else { - xnbp->x_stat_tx_notify_deferred++; + xnbp->xnb_stat_tx_notify_deferred++; } if (mp != NULL) - xnbp->x_stat_xmit_defer++; + xnbp->xnb_stat_xmit_defer++; - mutex_exit(&xnbp->x_tx_lock); + mutex_exit(&xnbp->xnb_tx_lock); /* Free mblk_t's that we consumed. */ freemsgchain(free); @@ -752,6 +800,387 @@ xnb_to_peer(xnb_t *xnbp, mblk_t *mp) return (mp); } +/* helper functions for xnb_copy_to_peer */ + +/* + * Grow the array of copy operation descriptors. + * Returns a pointer to the next available entry. + */ +gnttab_copy_t * +grow_cpop_area(xnb_t *xnbp, gnttab_copy_t *o_cpop) +{ + /* + * o_cpop (arg.1) is a ptr to the area we would like to copy + * something into but cannot, because we haven't alloc'ed it + * yet, or NULL. + * old_cpop and new_cpop (local) are pointers to old/new + * versions of xnbp->xnb_tx_cpop. 
+ */ + gnttab_copy_t *new_cpop, *old_cpop, *ret_cpop; + size_t newcount; + + ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock)); + + old_cpop = xnbp->xnb_tx_cpop; + /* + * o_cpop is a pointer into the array pointed to by old_cpop; + * it would be an error for exactly one of these pointers to be NULL. + * We shouldn't call this function if xnb_tx_cpop has already + * been allocated, but we're starting to fill it from the beginning + * again. + */ + ASSERT((o_cpop == NULL && old_cpop == NULL) || + (o_cpop != NULL && old_cpop != NULL && o_cpop != old_cpop)); + + newcount = xnbp->xnb_cpop_sz + CPOP_DEFCNT; + + new_cpop = kmem_alloc(sizeof (*new_cpop) * newcount, KM_NOSLEEP); + if (new_cpop == NULL) { + xnbp->xnb_stat_other_allocation_failure++; + return (NULL); + } + + if (o_cpop != NULL) { + size_t offset = (o_cpop - old_cpop); + + /* we only need to move the parts in use ... */ + (void) memmove(new_cpop, old_cpop, xnbp->xnb_cpop_sz * + (sizeof (*old_cpop))); + + kmem_free(old_cpop, xnbp->xnb_cpop_sz * sizeof (*old_cpop)); + + ret_cpop = new_cpop + offset; + } else { + ret_cpop = new_cpop; + } + + xnbp->xnb_tx_cpop = new_cpop; + xnbp->xnb_cpop_sz = newcount; + + xnbp->xnb_stat_tx_cpoparea_grown++; + + return (ret_cpop); +} + +/* + * Check whether an address is on a page that's foreign to this domain. + */ +static boolean_t +is_foreign(void *addr) +{ + pfn_t pfn = hat_getpfnum(kas.a_hat, addr); + + return (pfn & PFN_IS_FOREIGN_MFN ? B_TRUE : B_FALSE); +} + +/* + * Insert a newly allocated mblk into a chain, replacing the old one. + */ +static mblk_t * +replace_msg(mblk_t *mp, size_t len, mblk_t *mp_prev, mblk_t *ml_prev) +{ + uint32_t start, stuff, end, value, flags; + mblk_t *new_mp; + + new_mp = copyb(mp); + if (new_mp == NULL) + cmn_err(CE_PANIC, "replace_msg: cannot alloc new message" + "for %p, len %lu", (void *) mp, len); + + hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags); + (void) hcksum_assoc(new_mp, NULL, NULL, start, stuff, end, value, + flags, KM_NOSLEEP); + + new_mp->b_next = mp->b_next; + new_mp->b_prev = mp->b_prev; + new_mp->b_cont = mp->b_cont; + + /* Make sure we only overwrite pointers to the mblk being replaced. */ + if (mp_prev != NULL && mp_prev->b_next == mp) + mp_prev->b_next = new_mp; + + if (ml_prev != NULL && ml_prev->b_cont == mp) + ml_prev->b_cont = new_mp; + + mp->b_next = mp->b_prev = mp->b_cont = NULL; + freemsg(mp); + + return (new_mp); +} + +/* + * Set all the fields in a gnttab_copy_t. + */ +static void +setup_gop(xnb_t *xnbp, gnttab_copy_t *gp, uchar_t *rptr, + size_t s_off, size_t d_off, size_t len, grant_ref_t d_ref) +{ + ASSERT(xnbp != NULL && gp != NULL); + + gp->source.offset = s_off; + gp->source.u.gmfn = pfn_to_mfn(hat_getpfnum(kas.a_hat, (caddr_t)rptr)); + gp->source.domid = DOMID_SELF; + + gp->len = (uint16_t)len; + gp->flags = GNTCOPY_dest_gref; + gp->status = 0; + + gp->dest.u.ref = d_ref; + gp->dest.offset = d_off; + gp->dest.domid = xnbp->xnb_peer; +} + +mblk_t * +xnb_copy_to_peer(xnb_t *xnbp, mblk_t *mp) +{ + mblk_t *free = mp, *mp_prev = NULL, *saved_mp = mp; + mblk_t *ml, *ml_prev; + gnttab_copy_t *gop_cp; + boolean_t notify; + RING_IDX loop, prod; + int i; + + if (!xnbp->xnb_hv_copy) + return (xnb_to_peer(xnbp, mp)); + + /* + * For each packet the sequence of operations is: + * + * 1. get a request slot from the ring. + * 2. set up data for hypercall (see NOTE below) + * 3. have the hypervisore copy the data + * 4. update the request slot. + * 5. kick the peer. + * + * NOTE ad 2. 
+ * In order to reduce the number of hypercalls, we prepare + * several packets (mp->b_cont != NULL) for the peer and + * perform a single hypercall to transfer them. + * We also have to set up a seperate copy operation for + * every page. + * + * If we have more than one message (mp->b_next != NULL), + * we do this whole dance repeatedly. + */ + + mutex_enter(&xnbp->xnb_tx_lock); + + if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) { + mutex_exit(&xnbp->xnb_tx_lock); + DTRACE_PROBE(copy_tx_too_early); + xnbp->xnb_stat_tx_too_early++; + return (mp); + } + + loop = xnbp->xnb_rx_ring.req_cons; + prod = xnbp->xnb_rx_ring.rsp_prod_pvt; + + while ((mp != NULL) && + XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) { + netif_rx_request_t *rxreq; + netif_rx_response_t *rxresp; + size_t offset, d_offset; + size_t len; + uint16_t cksum_flags; + int16_t status = NETIF_RSP_OKAY; + int item_count; + + /* 1 */ + rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop); + +#ifdef XNB_DEBUG + if (!(rxreq->id < NET_RX_RING_SIZE)) + cmn_err(CE_PANIC, "xnb_copy_to_peer: " + "id %d out of range in request 0x%p", + rxreq->id, (void *)rxreq); + if (rxreq->gref >= NR_GRANT_ENTRIES) + cmn_err(CE_PANIC, "xnb_copy_to_peer: " + "grant ref %d out of range in request 0x%p", + rxreq->gref, (void *)rxreq); +#endif /* XNB_DEBUG */ + + /* 2 */ + d_offset = offset = TX_BUFFER_HEADROOM; + len = 0; + item_count = 0; + + gop_cp = xnbp->xnb_tx_cpop; + + /* + * We walk the b_cont pointers and set up a gop_cp + * structure for every page in every data block we have. + */ + /* 2a */ + for (ml = mp, ml_prev = NULL; ml != NULL; ml = ml->b_cont) { + size_t chunk = ml->b_wptr - ml->b_rptr; + uchar_t *r_tmp, *rpt_align; + size_t r_offset; + + /* + * If we get an mblk on a page that doesn't belong to + * this domain, get a new mblk to replace the old one. + */ + if (is_foreign(ml->b_rptr) || is_foreign(ml->b_wptr)) { + mblk_t *ml_new = replace_msg(ml, chunk, + mp_prev, ml_prev); + + /* We can still use old ml, but not *ml! */ + if (free == ml) + free = ml_new; + if (mp == ml) + mp = ml_new; + ml = ml_new; + + xnbp->xnb_stat_tx_foreign_page++; + } + + rpt_align = (uchar_t *)ALIGN2PAGE(ml->b_rptr); + r_offset = (uint16_t)(ml->b_rptr - rpt_align); + r_tmp = ml->b_rptr; + + if (d_offset + chunk > PAGESIZE) + cmn_err(CE_PANIC, "xnb_copy_to_peer: mp %p " + "(svd: %p), ml %p,rpt_alg. %p, d_offset " + "(%lu) + chunk (%lu) > PAGESIZE %d!", + (void *)mp, (void *)saved_mp, (void *)ml, + (void *)rpt_align, + d_offset, chunk, (int)PAGESIZE); + + while (chunk > 0) { + size_t part_len; + + item_count++; + if (item_count > xnbp->xnb_cpop_sz) { + gop_cp = grow_cpop_area(xnbp, gop_cp); + if (gop_cp == NULL) + goto failure; + } + /* + * If our mblk crosses a page boundary, we need + * to do a seperate copy for every page. + */ + if (r_offset + chunk > PAGESIZE) { + part_len = PAGESIZE - r_offset; + + DTRACE_PROBE3(mblk_page_crossed, + (mblk_t *), ml, int, chunk, int, + (int)r_offset); + + xnbp->xnb_stat_tx_pagebndry_crossed++; + } else { + part_len = chunk; + } + + setup_gop(xnbp, gop_cp, r_tmp, r_offset, + d_offset, part_len, rxreq->gref); + + chunk -= part_len; + + len += part_len; + d_offset += part_len; + r_tmp += part_len; + /* + * The 2nd, 3rd ... last copies will always + * start at r_tmp, therefore r_offset is 0. 
+ */ + r_offset = 0; + gop_cp++; + } + ml_prev = ml; + DTRACE_PROBE4(mblk_loop_end, (mblk_t *), ml, int, + chunk, int, len, int, item_count); + } + /* 3 */ + if (HYPERVISOR_grant_table_op(GNTTABOP_copy, xnbp->xnb_tx_cpop, + item_count) != 0) { + cmn_err(CE_WARN, "xnb_copy_to_peer: copy op. failed"); + DTRACE_PROBE(HV_granttableopfailed); + } + + /* 4 */ + rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod); + rxresp->offset = offset; + + rxresp->flags = 0; + + DTRACE_PROBE4(got_RX_rsp, int, (int)rxresp->id, int, + (int)rxresp->offset, int, (int)rxresp->flags, int, + (int)rxresp->status); + + cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp); + if (cksum_flags != 0) + xnbp->xnb_stat_tx_cksum_deferred++; + rxresp->flags |= cksum_flags; + + rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id; + rxresp->status = len; + + DTRACE_PROBE4(RX_rsp_set, int, (int)rxresp->id, int, + (int)rxresp->offset, int, (int)rxresp->flags, int, + (int)rxresp->status); + + for (i = 0; i < item_count; i++) { + if (xnbp->xnb_tx_cpop[i].status != 0) { + DTRACE_PROBE2(cpop__status__nonnull, int, + (int)xnbp->xnb_tx_cpop[i].status, + int, i); + status = NETIF_RSP_ERROR; + } + } + + /* 5.2 */ + if (status != NETIF_RSP_OKAY) { + RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status = + status; + } else { + xnbp->xnb_stat_opackets++; + xnbp->xnb_stat_obytes += len; + } + + loop++; + prod++; + mp_prev = mp; + mp = mp->b_next; + } +failure: + /* + * Did we actually do anything? + */ + if (loop == xnbp->xnb_rx_ring.req_cons) { + mutex_exit(&xnbp->xnb_tx_lock); + return (mp); + } + + /* + * Unlink the end of the 'done' list from the remainder. + */ + ASSERT(mp_prev != NULL); + mp_prev->b_next = NULL; + + xnbp->xnb_rx_ring.req_cons = loop; + xnbp->xnb_rx_ring.rsp_prod_pvt = prod; + + /* 6 */ + /* LINTED: constant in conditional context */ + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify); + if (notify) { + ec_notify_via_evtchn(xnbp->xnb_evtchn); + xnbp->xnb_stat_tx_notify_sent++; + } else { + xnbp->xnb_stat_tx_notify_deferred++; + } + + if (mp != NULL) + xnbp->xnb_stat_xmit_defer++; + + mutex_exit(&xnbp->xnb_tx_lock); + + /* Free mblk_t structs we have consumed. 
*/ + freemsgchain(free); + + return (mp); +} + /*ARGSUSED*/ static int xnb_rxbuf_constructor(void *buf, void *arg, int kmflag) @@ -803,15 +1232,15 @@ xnb_rx_notify_peer(xnb_t *xnbp) { boolean_t notify; - ASSERT(MUTEX_HELD(&xnbp->x_rx_lock)); + ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock)); - /*LINTED: constant in conditional context*/ - RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->x_tx_ring, notify); + /* LINTED: constant in conditional context */ + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_tx_ring, notify); if (notify) { - ec_notify_via_evtchn(xnbp->x_evtchn); - xnbp->x_stat_rx_notify_sent++; + ec_notify_via_evtchn(xnbp->xnb_evtchn); + xnbp->xnb_stat_rx_notify_sent++; } else { - xnbp->x_stat_rx_notify_deferred++; + xnbp->xnb_stat_rx_notify_deferred++; } } @@ -822,19 +1251,9 @@ xnb_rx_complete(xnb_rxbuf_t *rxp) ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == XNB_RXBUF_INUSE); - mutex_enter(&xnbp->x_rx_lock); - - xnb_rx_schedule_unmop(xnbp, &rxp->xr_mop); - xnb_rx_perform_pending_unmop(xnbp); - - if (xnbp->x_connected) { - xnb_rx_mark_complete(xnbp, rxp->xr_id, rxp->xr_status); - xnb_rx_notify_peer(xnbp); - } - - xnb_rxbuf_put(xnbp, rxp); - - mutex_exit(&xnbp->x_rx_lock); + mutex_enter(&xnbp->xnb_rx_lock); + xnb_rx_schedule_unmop(xnbp, &rxp->xr_mop, rxp); + mutex_exit(&xnbp->xnb_rx_lock); } static void @@ -843,15 +1262,15 @@ xnb_rx_mark_complete(xnb_t *xnbp, RING_IDX id, int16_t status) RING_IDX i; netif_tx_response_t *txresp; - ASSERT(MUTEX_HELD(&xnbp->x_rx_lock)); + ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock)); - i = xnbp->x_tx_ring.rsp_prod_pvt; + i = xnbp->xnb_tx_ring.rsp_prod_pvt; - txresp = RING_GET_RESPONSE(&xnbp->x_tx_ring, i); + txresp = RING_GET_RESPONSE(&xnbp->xnb_tx_ring, i); txresp->id = id; txresp->status = status; - xnbp->x_tx_ring.rsp_prod_pvt = i + 1; + xnbp->xnb_tx_ring.rsp_prod_pvt = i + 1; /* * Note that we don't push the change to the peer here - that @@ -859,61 +1278,75 @@ xnb_rx_mark_complete(xnb_t *xnbp, RING_IDX id, int16_t status) */ } -/* - * XXPV dme: currently pending unmap operations are stored on a - * per-instance basis. Should they be per-driver? The locking would - * have to change (obviously), but there might be an improvement from - * batching more together. Right now they are all 'done' either at - * the tail of each receive operation (copy case) or on each - * completion (non-copy case). Should that be changed to some - * interval (watermark?) to improve the chance of batching? - */ static void -xnb_rx_schedule_unmop(xnb_t *xnbp, gnttab_map_grant_ref_t *mop) +xnb_rx_schedule_unmop(xnb_t *xnbp, gnttab_map_grant_ref_t *mop, + xnb_rxbuf_t *rxp) { - gnttab_unmap_grant_ref_t *unmop; + gnttab_unmap_grant_ref_t *unmop; + int u_count; + int reqs_on_ring; - ASSERT(MUTEX_HELD(&xnbp->x_rx_lock)); - ASSERT(xnbp->x_rx_unmop_count <= NET_TX_RING_SIZE); + ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock)); + ASSERT(xnbp->xnb_rx_unmop_count < NET_TX_RING_SIZE); - unmop = &xnbp->x_rx_unmop[xnbp->x_rx_unmop_count]; - xnbp->x_rx_unmop_count++; + u_count = xnbp->xnb_rx_unmop_count++; + /* Cache data for the time when we actually unmap grant refs */ + xnbp->xnb_rx_unmop_rxp[u_count] = rxp; + + unmop = &xnbp->xnb_rx_unmop[u_count]; unmop->host_addr = mop->host_addr; unmop->dev_bus_addr = mop->dev_bus_addr; unmop->handle = mop->handle; -#ifdef XNB_DEBUG - if (xnbp->x_rx_unmop_count <= NET_TX_RING_SIZE) - ASSERT(xnbp->x_rx_unmop[xnbp->x_rx_unmop_count].host_addr - == NULL); -#endif /* XNB_DEBUG */ + /* + * We cannot check the ring once we're disconnected from it. 
Batching + * doesn't seem to be a useful optimisation in this case either, + * so we directly call into the actual unmap function. + */ + if (xnbp->xnb_connected) { + reqs_on_ring = RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring); + /* + * By tuning xnb_unmop_hiwat to N, we can emulate "N per batch" + * or (with N == 1) "immediate unmop" behaviour. + * The "> xnb_unmop_lowwat" is a guard against ring exhaustion. + */ + if (xnbp->xnb_rx_unmop_count < xnb_unmop_hiwat && + reqs_on_ring > xnb_unmop_lowwat) + return; + } + + xnb_rx_perform_pending_unmop(xnbp); } +/* + * Here we perform the actual unmapping of the data that was + * accumulated in xnb_rx_schedule_unmop(). + * Note that it is the caller's responsibility to make sure that + * there's actually something there to unmop. + */ static void xnb_rx_perform_pending_unmop(xnb_t *xnbp) { -#ifdef XNB_DEBUG RING_IDX loop; +#ifdef XNB_DEBUG gnttab_unmap_grant_ref_t *unmop; #endif /* XNB_DEBUG */ - ASSERT(MUTEX_HELD(&xnbp->x_rx_lock)); - - if (xnbp->x_rx_unmop_count == 0) - return; + ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock)); + ASSERT(xnbp->xnb_rx_unmop_count > 0); if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, - xnbp->x_rx_unmop, xnbp->x_rx_unmop_count) < 0) { + xnbp->xnb_rx_unmop, xnbp->xnb_rx_unmop_count) < 0) { cmn_err(CE_WARN, "xnb_rx_perform_pending_unmop: " "unmap grant operation failed, " - "%d pages lost", xnbp->x_rx_unmop_count); + "%d pages lost", xnbp->xnb_rx_unmop_count); } #ifdef XNB_DEBUG - for (loop = 0, unmop = xnbp->x_rx_unmop; - loop < xnbp->x_rx_unmop_count; + for (loop = 0, unmop = xnbp->xnb_rx_unmop; + loop < xnbp->xnb_rx_unmop_count; loop++, unmop++) { if (unmop->status != 0) { cmn_err(CE_WARN, "xnb_rx_perform_pending_unmop: " @@ -923,10 +1356,27 @@ xnb_rx_perform_pending_unmop(xnb_t *xnbp) } #endif /* XNB_DEBUG */ - xnbp->x_rx_unmop_count = 0; + for (loop = 0; loop < xnbp->xnb_rx_unmop_count; loop++) { + xnb_rxbuf_t *rxp = xnbp->xnb_rx_unmop_rxp[loop]; + + if (rxp == NULL) + cmn_err(CE_PANIC, + "xnb_rx_perform_pending_unmop: " + "unexpected NULL rxp (loop %d; count %d)!", + loop, xnbp->xnb_rx_unmop_count); + + if (xnbp->xnb_connected) + xnb_rx_mark_complete(xnbp, rxp->xr_id, rxp->xr_status); + xnb_rxbuf_put(xnbp, rxp); + } + if (xnbp->xnb_connected) + xnb_rx_notify_peer(xnbp); + + xnbp->xnb_rx_unmop_count = 0; #ifdef XNB_DEBUG - bzero(xnbp->x_rx_unmop, sizeof (xnbp->x_rx_unmop)); + bzero(xnbp->xnb_rx_unmop, sizeof (xnbp->xnb_rx_unmop)); + bzero(xnbp->xnb_rx_unmop_rxp, sizeof (xnbp->xnb_rx_unmop_rxp)); #endif /* XNB_DEBUG */ } @@ -935,7 +1385,7 @@ xnb_rxbuf_get(xnb_t *xnbp, int flags) { xnb_rxbuf_t *rxp; - ASSERT(MUTEX_HELD(&xnbp->x_rx_lock)); + ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock)); rxp = kmem_cache_alloc(xnb_rxbuf_cachep, flags); if (rxp != NULL) { @@ -943,13 +1393,13 @@ xnb_rxbuf_get(xnb_t *xnbp, int flags) rxp->xr_flags |= XNB_RXBUF_INUSE; rxp->xr_xnbp = xnbp; - rxp->xr_mop.dom = xnbp->x_peer; + rxp->xr_mop.dom = xnbp->xnb_peer; rxp->xr_mop.flags = GNTMAP_host_map; - if (!xnbp->x_rx_pages_writable) + if (!xnbp->xnb_rx_pages_writable) rxp->xr_mop.flags |= GNTMAP_readonly; - xnbp->x_rx_buf_count++; + xnbp->xnb_rx_buf_count++; } return (rxp); @@ -958,11 +1408,11 @@ xnb_rxbuf_get(xnb_t *xnbp, int flags) static void xnb_rxbuf_put(xnb_t *xnbp, xnb_rxbuf_t *rxp) { - ASSERT(MUTEX_HELD(&xnbp->x_rx_lock)); + ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock)); ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == XNB_RXBUF_INUSE); rxp->xr_flags &= ~XNB_RXBUF_INUSE; - xnbp->x_rx_buf_count--; + xnbp->xnb_rx_buf_count--; 
kmem_cache_free(xnb_rxbuf_cachep, rxp); } @@ -982,7 +1432,7 @@ xnb_recv(xnb_t *xnbp) * packet be destined for this host) will modify the packet * 'in place'. */ - boolean_t copy = !xnbp->x_rx_pages_writable; + boolean_t copy = !xnbp->xnb_rx_pages_writable; /* * For each individual request, the sequence of actions is: @@ -1001,21 +1451,19 @@ xnb_recv(xnb_t *xnbp) head = tail = NULL; around: - ASSERT(MUTEX_HELD(&xnbp->x_rx_lock)); + ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock)); - /*LINTED: constant in conditional context*/ - RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->x_tx_ring, work_to_do); + /* LINTED: constant in conditional context */ + RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->xnb_tx_ring, work_to_do); if (!work_to_do) { finished: - xnb_rx_notify_peer(xnbp); - return (head); } - start = xnbp->x_tx_ring.req_cons; - end = xnbp->x_tx_ring.sring->req_prod; + start = xnbp->xnb_tx_ring.req_cons; + end = xnbp->xnb_tx_ring.sring->req_prod; - for (loop = start, mop = xnbp->x_rx_mop, rxpp = xnbp->x_rx_bufp; + for (loop = start, mop = xnbp->xnb_rx_mop, rxpp = xnbp->xnb_rx_bufp; loop != end; loop++, mop++, rxpp++) { xnb_rxbuf_t *rxp; @@ -1024,12 +1472,12 @@ finished: if (rxp == NULL) break; - ASSERT(xnbp->x_rx_pages_writable || + ASSERT(xnbp->xnb_rx_pages_writable || ((rxp->xr_mop.flags & GNTMAP_readonly) == GNTMAP_readonly)); rxp->xr_mop.ref = - RING_GET_REQUEST(&xnbp->x_tx_ring, loop)->gref; + RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop)->gref; ASSERT(rxp->xr_mop.ref < NR_GRANT_ENTRIES); @@ -1043,12 +1491,12 @@ finished: end = loop; if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, - xnbp->x_rx_mop, end - start) != 0) { + xnbp->xnb_rx_mop, end - start) != 0) { cmn_err(CE_WARN, "xnb_recv: map grant operation failed"); loop = start; - rxpp = xnbp->x_rx_bufp; + rxpp = xnbp->xnb_rx_bufp; while (loop != end) { xnb_rxbuf_put(xnbp, *rxpp); @@ -1060,7 +1508,7 @@ finished: goto finished; } - for (loop = start, mop = xnbp->x_rx_mop, rxpp = xnbp->x_rx_bufp; + for (loop = start, mop = xnbp->xnb_rx_mop, rxpp = xnbp->xnb_rx_bufp; loop != end; loop++, mop++, rxpp++) { mblk_t *mp = NULL; @@ -1074,14 +1522,14 @@ finished: status = NETIF_RSP_ERROR; } - txreq = RING_GET_REQUEST(&xnbp->x_tx_ring, loop); + txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop); if (status == NETIF_RSP_OKAY) { if (copy) { mp = allocb(txreq->size, BPRI_MED); if (mp == NULL) { status = NETIF_RSP_ERROR; - xnbp->x_stat_rx_allocb_failed++; + xnbp->xnb_stat_rx_allocb_failed++; } else { bcopy((caddr_t)(uintptr_t) mop->host_addr + txreq->offset, @@ -1089,12 +1537,12 @@ finished: mp->b_wptr += txreq->size; } } else { - mp = desballoc((unsigned char *)(uintptr_t) + mp = desballoc((uchar_t *)(uintptr_t) mop->host_addr + txreq->offset, txreq->size, 0, &rxp->xr_free_rtn); if (mp == NULL) { status = NETIF_RSP_ERROR; - xnbp->x_stat_rx_allocb_failed++; + xnbp->xnb_stat_rx_allocb_failed++; } else { rxp->xr_id = txreq->id; rxp->xr_status = status; @@ -1112,20 +1560,21 @@ finished: ((txreq->flags & (NETTXF_csum_blank | NETTXF_data_validated)) != 0)) { - mp = xnbp->x_flavour->xf_cksum_from_peer(xnbp, + mp = xnbp->xnb_flavour->xf_cksum_from_peer(xnbp, mp, txreq->flags); - xnbp->x_stat_rx_cksum_no_need++; + xnbp->xnb_stat_rx_cksum_no_need++; } } if (copy || (mp == NULL)) { - xnb_rx_mark_complete(xnbp, txreq->id, status); - xnb_rx_schedule_unmop(xnbp, mop); + rxp->xr_status = status; + rxp->xr_id = txreq->id; + xnb_rx_schedule_unmop(xnbp, mop, rxp); } if (mp != NULL) { - xnbp->x_stat_ipackets++; - xnbp->x_stat_rbytes += txreq->size; + xnbp->xnb_stat_ipackets++; + 
xnbp->xnb_stat_rbytes += txreq->size; mp->b_next = NULL; if (head == NULL) { @@ -1139,23 +1588,7 @@ finished: } } - /* - * This has to be here rather than in the 'finished' code - * because we can only handle NET_TX_RING_SIZE pending unmap - * operations, which may be exceeded by multiple trips around - * the receive loop during heavy load (one trip around the - * loop cannot generate more than NET_TX_RING_SIZE unmap - * operations). - */ - xnb_rx_perform_pending_unmop(xnbp); - if (copy) { - for (loop = start, rxpp = xnbp->x_rx_bufp; - loop != end; - loop++, rxpp++) - xnb_rxbuf_put(xnbp, *rxpp); - } - - xnbp->x_tx_ring.req_cons = loop; + xnbp->xnb_tx_ring.req_cons = loop; goto around; /* NOTREACHED */ @@ -1170,26 +1603,26 @@ xnb_intr(caddr_t arg) xnb_t *xnbp = (xnb_t *)arg; mblk_t *mp; - xnbp->x_stat_intr++; + xnbp->xnb_stat_intr++; - mutex_enter(&xnbp->x_rx_lock); + mutex_enter(&xnbp->xnb_rx_lock); - ASSERT(xnbp->x_connected); + ASSERT(xnbp->xnb_connected); mp = xnb_recv(xnbp); - mutex_exit(&xnbp->x_rx_lock); + mutex_exit(&xnbp->xnb_rx_lock); - if (!xnbp->x_hotplugged) { - xnbp->x_stat_rx_too_early++; + if (!xnbp->xnb_hotplugged) { + xnbp->xnb_stat_rx_too_early++; goto fail; } if (mp == NULL) { - xnbp->x_stat_spurious_intr++; + xnbp->xnb_stat_spurious_intr++; goto fail; } - xnbp->x_flavour->xf_recv(xnbp, mp); + xnbp->xnb_flavour->xf_recv(xnbp, mp); return (DDI_INTR_CLAIMED); @@ -1210,14 +1643,14 @@ xnb_connect_rings(dev_info_t *dip) /* * Cannot attempt to connect the rings if already connected. */ - ASSERT(!xnbp->x_connected); + ASSERT(!xnbp->xnb_connected); oename = xvdi_get_oename(dip); if (xenbus_gather(XBT_NULL, oename, "event-channel", "%u", &evtchn, - "tx-ring-ref", "%lu", &xnbp->x_tx_ring_ref, - "rx-ring-ref", "%lu", &xnbp->x_rx_ring_ref, + "tx-ring-ref", "%lu", &xnbp->xnb_tx_ring_ref, + "rx-ring-ref", "%lu", &xnbp->xnb_rx_ring_ref, NULL) != 0) { cmn_err(CE_WARN, "xnb_connect_rings: " "cannot read other-end details from %s", @@ -1229,13 +1662,20 @@ xnb_connect_rings(dev_info_t *dip) "feature-tx-writable", "%d", &i) != 0) i = 0; if (i != 0) - xnbp->x_rx_pages_writable = B_TRUE; + xnbp->xnb_rx_pages_writable = B_TRUE; if (xenbus_scanf(XBT_NULL, oename, "feature-no-csum-offload", "%d", &i) != 0) i = 0; - if ((i == 1) || !xnbp->x_cksum_offload) - xnbp->x_cksum_offload = B_FALSE; + if ((i == 1) || !xnbp->xnb_cksum_offload) + xnbp->xnb_cksum_offload = B_FALSE; + + /* Check whether our peer knows and requests hypervisor copy */ + if (xenbus_scanf(XBT_NULL, oename, "request-rx-copy", "%d", &i) + != 0) + i = 0; + if (i != 0) + xnbp->xnb_hv_copy = B_TRUE; /* * 1. allocate a vaddr for the tx page, one for the rx page. 
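[Editor's note — the three xenbus_scanf() probes in xnb_connect_rings() above ("feature-tx-writable", "feature-no-csum-offload", "request-rx-copy") all follow one convention: a failed read means the key is absent from the store and the feature defaults to off. A minimal sketch of that convention, using only the xenbus_scanf() call exactly as it appears in this patch; the helper name is illustrative and is not part of the change:

static boolean_t
xnb_peer_advertises(char *oename, char *prop)
{
	int i;

	/* A missing key is not an error; treat it as 0 (feature off). */
	if (xenbus_scanf(XBT_NULL, oename, prop, "%d", &i) != 0)
		i = 0;

	return (i != 0 ? B_TRUE : B_FALSE);
}
]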
@@ -1249,57 +1689,57 @@ xnb_connect_rings(dev_info_t *dip) */ /* 1.tx */ - xnbp->x_tx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE, + xnbp->xnb_tx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE, 0, 0, 0, 0, VM_SLEEP); - ASSERT(xnbp->x_tx_ring_addr != NULL); + ASSERT(xnbp->xnb_tx_ring_addr != NULL); /* 2.tx */ - map_op.host_addr = (uint64_t)((long)xnbp->x_tx_ring_addr); + map_op.host_addr = (uint64_t)((long)xnbp->xnb_tx_ring_addr); map_op.flags = GNTMAP_host_map; - map_op.ref = xnbp->x_tx_ring_ref; - map_op.dom = xnbp->x_peer; - hat_prepare_mapping(kas.a_hat, xnbp->x_tx_ring_addr); + map_op.ref = xnbp->xnb_tx_ring_ref; + map_op.dom = xnbp->xnb_peer; + hat_prepare_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr); if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &map_op, 1) != 0 || map_op.status != 0) { cmn_err(CE_WARN, "xnb_connect_rings: cannot map tx-ring page."); goto fail; } - xnbp->x_tx_ring_handle = map_op.handle; + xnbp->xnb_tx_ring_handle = map_op.handle; - /*LINTED: constant in conditional context*/ - BACK_RING_INIT(&xnbp->x_tx_ring, - (netif_tx_sring_t *)xnbp->x_tx_ring_addr, PAGESIZE); + /* LINTED: constant in conditional context */ + BACK_RING_INIT(&xnbp->xnb_tx_ring, + (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE); /* 1.rx */ - xnbp->x_rx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE, + xnbp->xnb_rx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE, 0, 0, 0, 0, VM_SLEEP); - ASSERT(xnbp->x_rx_ring_addr != NULL); + ASSERT(xnbp->xnb_rx_ring_addr != NULL); /* 2.rx */ - map_op.host_addr = (uint64_t)((long)xnbp->x_rx_ring_addr); + map_op.host_addr = (uint64_t)((long)xnbp->xnb_rx_ring_addr); map_op.flags = GNTMAP_host_map; - map_op.ref = xnbp->x_rx_ring_ref; - map_op.dom = xnbp->x_peer; - hat_prepare_mapping(kas.a_hat, xnbp->x_rx_ring_addr); + map_op.ref = xnbp->xnb_rx_ring_ref; + map_op.dom = xnbp->xnb_peer; + hat_prepare_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr); if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &map_op, 1) != 0 || map_op.status != 0) { cmn_err(CE_WARN, "xnb_connect_rings: cannot map rx-ring page."); goto fail; } - xnbp->x_rx_ring_handle = map_op.handle; + xnbp->xnb_rx_ring_handle = map_op.handle; - /*LINTED: constant in conditional context*/ - BACK_RING_INIT(&xnbp->x_rx_ring, - (netif_rx_sring_t *)xnbp->x_rx_ring_addr, PAGESIZE); + /* LINTED: constant in conditional context */ + BACK_RING_INIT(&xnbp->xnb_rx_ring, + (netif_rx_sring_t *)xnbp->xnb_rx_ring_addr, PAGESIZE); /* 3 */ if (xvdi_bind_evtchn(dip, evtchn) != DDI_SUCCESS) { cmn_err(CE_WARN, "xnb_connect_rings: " - "cannot bind event channel %d", xnbp->x_evtchn); - xnbp->x_evtchn = INVALID_EVTCHN; + "cannot bind event channel %d", xnbp->xnb_evtchn); + xnbp->xnb_evtchn = INVALID_EVTCHN; goto fail; } - xnbp->x_evtchn = xvdi_get_evtchn(dip); + xnbp->xnb_evtchn = xvdi_get_evtchn(dip); /* * It would be good to set the state to XenbusStateConnected @@ -1307,14 +1747,14 @@ xnb_connect_rings(dev_info_t *dip) * Changing the state in the store will be noticed by the peer * and cannot be "taken back". 
*/ - mutex_enter(&xnbp->x_tx_lock); - mutex_enter(&xnbp->x_rx_lock); + mutex_enter(&xnbp->xnb_tx_lock); + mutex_enter(&xnbp->xnb_rx_lock); /* 5.1 */ - xnbp->x_connected = B_TRUE; + xnbp->xnb_connected = B_TRUE; - mutex_exit(&xnbp->x_rx_lock); - mutex_exit(&xnbp->x_tx_lock); + mutex_exit(&xnbp->xnb_rx_lock); + mutex_exit(&xnbp->xnb_tx_lock); /* 4, 6 */ if (ddi_add_intr(dip, 0, NULL, NULL, xnb_intr, (caddr_t)xnbp) @@ -1322,7 +1762,7 @@ xnb_connect_rings(dev_info_t *dip) cmn_err(CE_WARN, "xnb_connect_rings: cannot add interrupt"); goto fail; } - xnbp->x_irq = B_TRUE; + xnbp->xnb_irq = B_TRUE; /* 5.2 */ (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected); @@ -1330,13 +1770,12 @@ xnb_connect_rings(dev_info_t *dip) return (B_TRUE); fail: - mutex_enter(&xnbp->x_tx_lock); - mutex_enter(&xnbp->x_rx_lock); - - xnbp->x_connected = B_FALSE; + mutex_enter(&xnbp->xnb_tx_lock); + mutex_enter(&xnbp->xnb_rx_lock); - mutex_exit(&xnbp->x_rx_lock); - mutex_exit(&xnbp->x_tx_lock); + xnbp->xnb_connected = B_FALSE; + mutex_exit(&xnbp->xnb_rx_lock); + mutex_exit(&xnbp->xnb_tx_lock); return (B_FALSE); } @@ -1346,56 +1785,61 @@ xnb_disconnect_rings(dev_info_t *dip) { xnb_t *xnbp = ddi_get_driver_private(dip); - if (xnbp->x_irq) { + if (xnbp->xnb_irq) { ddi_remove_intr(dip, 0, NULL); - xnbp->x_irq = B_FALSE; + xnbp->xnb_irq = B_FALSE; } - if (xnbp->x_evtchn != INVALID_EVTCHN) { + if (xnbp->xnb_rx_unmop_count > 0) + xnb_rx_perform_pending_unmop(xnbp); + + if (xnbp->xnb_evtchn != INVALID_EVTCHN) { xvdi_free_evtchn(dip); - xnbp->x_evtchn = INVALID_EVTCHN; + xnbp->xnb_evtchn = INVALID_EVTCHN; } - if (xnbp->x_rx_ring_handle != INVALID_GRANT_HANDLE) { + if (xnbp->xnb_rx_ring_handle != INVALID_GRANT_HANDLE) { struct gnttab_unmap_grant_ref unmap_op; - unmap_op.host_addr = (uint64_t)(uintptr_t)xnbp->x_rx_ring_addr; + unmap_op.host_addr = (uint64_t)(uintptr_t) + xnbp->xnb_rx_ring_addr; unmap_op.dev_bus_addr = 0; - unmap_op.handle = xnbp->x_rx_ring_handle; + unmap_op.handle = xnbp->xnb_rx_ring_handle; if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unmap_op, 1) != 0) cmn_err(CE_WARN, "xnb_disconnect_rings: " "cannot unmap rx-ring page (%d)", unmap_op.status); - xnbp->x_rx_ring_handle = INVALID_GRANT_HANDLE; + xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE; } - if (xnbp->x_rx_ring_addr != NULL) { - hat_release_mapping(kas.a_hat, xnbp->x_rx_ring_addr); - vmem_free(heap_arena, xnbp->x_rx_ring_addr, PAGESIZE); - xnbp->x_rx_ring_addr = NULL; + if (xnbp->xnb_rx_ring_addr != NULL) { + hat_release_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr); + vmem_free(heap_arena, xnbp->xnb_rx_ring_addr, PAGESIZE); + xnbp->xnb_rx_ring_addr = NULL; } - if (xnbp->x_tx_ring_handle != INVALID_GRANT_HANDLE) { + if (xnbp->xnb_tx_ring_handle != INVALID_GRANT_HANDLE) { struct gnttab_unmap_grant_ref unmap_op; - unmap_op.host_addr = (uint64_t)(uintptr_t)xnbp->x_tx_ring_addr; + unmap_op.host_addr = (uint64_t)(uintptr_t) + xnbp->xnb_tx_ring_addr; unmap_op.dev_bus_addr = 0; - unmap_op.handle = xnbp->x_tx_ring_handle; + unmap_op.handle = xnbp->xnb_tx_ring_handle; if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unmap_op, 1) != 0) cmn_err(CE_WARN, "xnb_disconnect_rings: " "cannot unmap tx-ring page (%d)", unmap_op.status); - xnbp->x_tx_ring_handle = INVALID_GRANT_HANDLE; + xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE; } - if (xnbp->x_tx_ring_addr != NULL) { - hat_release_mapping(kas.a_hat, xnbp->x_tx_ring_addr); - vmem_free(heap_arena, xnbp->x_tx_ring_addr, PAGESIZE); - xnbp->x_tx_ring_addr = NULL; + if (xnbp->xnb_tx_ring_addr != NULL) 
{ + hat_release_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr); + vmem_free(heap_arena, xnbp->xnb_tx_ring_addr, PAGESIZE); + xnbp->xnb_tx_ring_addr = NULL; } } @@ -1412,9 +1856,9 @@ xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id, switch (new_state) { case XenbusStateConnected: if (xnb_connect_rings(dip)) { - xnbp->x_flavour->xf_peer_connected(xnbp); + xnbp->xnb_flavour->xf_peer_connected(xnbp); } else { - xnbp->x_flavour->xf_peer_disconnected(xnbp); + xnbp->xnb_flavour->xf_peer_disconnected(xnbp); xnb_disconnect_rings(dip); (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed); @@ -1425,7 +1869,7 @@ xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id, * Now that we've attempted to connect it's reasonable * to allow an attempt to detach. */ - xnbp->x_detachable = B_TRUE; + xnbp->xnb_detachable = B_TRUE; break; @@ -1435,16 +1879,16 @@ xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id, break; case XenbusStateClosed: - xnbp->x_flavour->xf_peer_disconnected(xnbp); + xnbp->xnb_flavour->xf_peer_disconnected(xnbp); - mutex_enter(&xnbp->x_tx_lock); - mutex_enter(&xnbp->x_rx_lock); + mutex_enter(&xnbp->xnb_tx_lock); + mutex_enter(&xnbp->xnb_rx_lock); xnb_disconnect_rings(dip); - xnbp->x_connected = B_FALSE; + xnbp->xnb_connected = B_FALSE; - mutex_exit(&xnbp->x_rx_lock); - mutex_exit(&xnbp->x_tx_lock); + mutex_exit(&xnbp->xnb_rx_lock); + mutex_exit(&xnbp->xnb_tx_lock); (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed); (void) xvdi_post_event(dip, XEN_HP_REMOVE); @@ -1455,7 +1899,7 @@ xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id, * having been through the case above, so we set it to * be sure. */ - xnbp->x_detachable = B_TRUE; + xnbp->xnb_detachable = B_TRUE; break; @@ -1478,15 +1922,15 @@ xnb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id, switch (state) { case Connected: - success = xnbp->x_flavour->xf_hotplug_connected(xnbp); + success = xnbp->xnb_flavour->xf_hotplug_connected(xnbp); - mutex_enter(&xnbp->x_tx_lock); - mutex_enter(&xnbp->x_rx_lock); + mutex_enter(&xnbp->xnb_tx_lock); + mutex_enter(&xnbp->xnb_rx_lock); - xnbp->x_hotplugged = success; + xnbp->xnb_hotplugged = success; - mutex_exit(&xnbp->x_rx_lock); - mutex_exit(&xnbp->x_tx_lock); + mutex_exit(&xnbp->xnb_rx_lock); + mutex_exit(&xnbp->xnb_tx_lock); break; default: diff --git a/usr/src/uts/common/xen/io/xnb.h b/usr/src/uts/common/xen/io/xnb.h index 16ba897727..8da45c82ea 100644 --- a/usr/src/uts/common/xen/io/xnb.h +++ b/usr/src/uts/common/xen/io/xnb.h @@ -94,84 +94,98 @@ typedef struct xnb_rxbuf { /* Per network-interface-controller driver private structure */ struct xnb { /* most interesting stuff first to assist debugging */ - dev_info_t *x_devinfo; /* System per-device info. */ + dev_info_t *xnb_devinfo; /* System per-device info. 
*/ - xnb_flavour_t *x_flavour; - void *x_flavour_data; + xnb_flavour_t *xnb_flavour; + void *xnb_flavour_data; - boolean_t x_irq; - unsigned char x_mac_addr[ETHERADDRL]; + boolean_t xnb_irq; + unsigned char xnb_mac_addr[ETHERADDRL]; - uint64_t x_stat_ipackets; - uint64_t x_stat_opackets; - uint64_t x_stat_rbytes; - uint64_t x_stat_obytes; + uint64_t xnb_stat_ipackets; + uint64_t xnb_stat_opackets; + uint64_t xnb_stat_rbytes; + uint64_t xnb_stat_obytes; - uint64_t x_stat_intr; - uint64_t x_stat_xmit_defer; + uint64_t xnb_stat_intr; + uint64_t xnb_stat_xmit_defer; - uint64_t x_stat_tx_cksum_deferred; - uint64_t x_stat_rx_cksum_no_need; + uint64_t xnb_stat_tx_cksum_deferred; + uint64_t xnb_stat_rx_cksum_no_need; - uint64_t x_stat_tx_notify_sent; - uint64_t x_stat_tx_notify_deferred; + uint64_t xnb_stat_tx_notify_sent; + uint64_t xnb_stat_tx_notify_deferred; - uint64_t x_stat_rx_notify_sent; - uint64_t x_stat_rx_notify_deferred; + uint64_t xnb_stat_rx_notify_sent; + uint64_t xnb_stat_rx_notify_deferred; - uint64_t x_stat_tx_too_early; - uint64_t x_stat_rx_too_early; - uint64_t x_stat_rx_allocb_failed; - uint64_t x_stat_mac_full; - uint64_t x_stat_spurious_intr; - uint64_t x_stat_allocation_success; - uint64_t x_stat_allocation_failure; - uint64_t x_stat_small_allocation_success; - uint64_t x_stat_small_allocation_failure; + uint64_t xnb_stat_tx_too_early; + uint64_t xnb_stat_rx_too_early; + uint64_t xnb_stat_rx_allocb_failed; + uint64_t xnb_stat_tx_allocb_failed; + uint64_t xnb_stat_tx_foreign_page; + uint64_t xnb_stat_mac_full; + uint64_t xnb_stat_spurious_intr; + uint64_t xnb_stat_allocation_success; + uint64_t xnb_stat_allocation_failure; + uint64_t xnb_stat_small_allocation_success; + uint64_t xnb_stat_small_allocation_failure; + uint64_t xnb_stat_other_allocation_failure; - uint64_t x_stat_csum_hardware; - uint64_t x_stat_csum_software; + uint64_t xnb_stat_tx_pagebndry_crossed; + uint64_t xnb_stat_tx_cpoparea_grown; - kstat_t *x_kstat_aux; + uint64_t xnb_stat_csum_hardware; + uint64_t xnb_stat_csum_software; - boolean_t x_cksum_offload; + kstat_t *xnb_kstat_aux; - ddi_iblock_cookie_t x_icookie; + boolean_t xnb_cksum_offload; - kmutex_t x_rx_lock; - kmutex_t x_tx_lock; + ddi_iblock_cookie_t xnb_icookie; - int x_rx_unmop_count; - int x_rx_buf_count; - boolean_t x_rx_pages_writable; + kmutex_t xnb_rx_lock; + kmutex_t xnb_tx_lock; - netif_rx_back_ring_t x_rx_ring; /* rx interface struct ptr */ - void *x_rx_ring_addr; - grant_ref_t x_rx_ring_ref; - grant_handle_t x_rx_ring_handle; + int xnb_rx_unmop_count; + int xnb_rx_buf_count; + boolean_t xnb_rx_pages_writable; - netif_tx_back_ring_t x_tx_ring; /* tx interface struct ptr */ - void *x_tx_ring_addr; - grant_ref_t x_tx_ring_ref; - grant_handle_t x_tx_ring_handle; + netif_rx_back_ring_t xnb_rx_ring; /* rx interface struct ptr */ + void *xnb_rx_ring_addr; + grant_ref_t xnb_rx_ring_ref; + grant_handle_t xnb_rx_ring_handle; - boolean_t x_connected; - boolean_t x_hotplugged; - boolean_t x_detachable; - int x_evtchn; /* channel to front end */ - domid_t x_peer; + netif_tx_back_ring_t xnb_tx_ring; /* tx interface struct ptr */ + void *xnb_tx_ring_addr; + grant_ref_t xnb_tx_ring_ref; + grant_handle_t xnb_tx_ring_handle; - xnb_rxbuf_t *x_rx_bufp[NET_TX_RING_SIZE]; - gnttab_map_grant_ref_t x_rx_mop[NET_TX_RING_SIZE]; - gnttab_unmap_grant_ref_t x_rx_unmop[NET_TX_RING_SIZE]; + boolean_t xnb_connected; + boolean_t xnb_hotplugged; + boolean_t xnb_detachable; + int xnb_evtchn; /* channel to front end */ + domid_t xnb_peer; - caddr_t x_tx_va; - 
gnttab_transfer_t x_tx_top[NET_RX_RING_SIZE]; + xnb_rxbuf_t *xnb_rx_bufp[NET_TX_RING_SIZE]; + gnttab_map_grant_ref_t xnb_rx_mop[NET_TX_RING_SIZE]; + gnttab_unmap_grant_ref_t xnb_rx_unmop[NET_TX_RING_SIZE]; + + /* store information for unmop */ + xnb_rxbuf_t *xnb_rx_unmop_rxp[NET_TX_RING_SIZE]; + + caddr_t xnb_tx_va; + gnttab_transfer_t xnb_tx_top[NET_RX_RING_SIZE]; + + boolean_t xnb_hv_copy; /* do we do hypervisor copy? */ + gnttab_copy_t *xnb_tx_cpop; +#define CPOP_DEFCNT 8 + size_t xnb_cpop_sz; /* in elements, not bytes */ }; extern int xnb_attach(dev_info_t *, xnb_flavour_t *, void *); extern void xnb_detach(dev_info_t *); -extern mblk_t *xnb_to_peer(xnb_t *, mblk_t *); +extern mblk_t *xnb_copy_to_peer(xnb_t *, mblk_t *); extern mblk_t *xnb_process_cksum_flags(xnb_t *, mblk_t *, uint32_t); #ifdef __cplusplus diff --git a/usr/src/uts/common/xen/io/xnbo.c b/usr/src/uts/common/xen/io/xnbo.c index a7d2190cda..723d650c55 100644 --- a/usr/src/uts/common/xen/io/xnbo.c +++ b/usr/src/uts/common/xen/io/xnbo.c @@ -63,19 +63,19 @@ static void xnbo_close_mac(xnbo_t *); static void xnbo_to_mac(xnb_t *xnbp, mblk_t *mp) { - xnbo_t *xnbop = xnbp->x_flavour_data; + xnbo_t *xnbop = xnbp->xnb_flavour_data; ASSERT(mp != NULL); if (!xnbop->o_running) { - xnbp->x_stat_rx_too_early++; + xnbp->xnb_stat_rx_too_early++; goto fail; } mp = xnbop->o_mtx->mt_fn(xnbop->o_mtx->mt_arg, mp); if (mp != NULL) { - xnbp->x_stat_mac_full++; + xnbp->xnb_stat_mac_full++; goto fail; } @@ -88,13 +88,13 @@ fail: static mblk_t * xnbo_cksum_from_peer(xnb_t *xnbp, mblk_t *mp, uint16_t flags) { - xnbo_t *xnbop = xnbp->x_flavour_data; + xnbo_t *xnbop = xnbp->xnb_flavour_data; ASSERT(mp->b_next == NULL); if ((flags & NETTXF_csum_blank) != 0) { /* - * It would be nice to ASSERT that xnbp->x_cksum_offload + * It would be nice to ASSERT that xnbp->xnb_cksum_offload * is TRUE here, but some peers insist on assuming * that it is available even when they have been told * otherwise. @@ -128,7 +128,7 @@ xnbo_cksum_to_peer(xnb_t *xnbp, mblk_t *mp) * caller must use HCK_PARTIALCKSUM. 
*/ - if (xnbp->x_cksum_offload) { + if (xnbp->xnb_cksum_offload) { uint32_t pflags, csum; /* @@ -162,7 +162,7 @@ xnbo_from_mac(void *arg, mac_resource_handle_t mrh, mblk_t *mp) { xnb_t *xnbp = arg; - mp = xnb_to_peer(xnbp, mp); + mp = xnb_copy_to_peer(xnbp, mp); if (mp != NULL) freemsgchain(mp); @@ -178,7 +178,7 @@ static void xnbo_from_mac_filter(void *arg, mac_resource_handle_t mrh, mblk_t *mp) { xnb_t *xnbp = arg; - xnbo_t *xnbop = xnbp->x_flavour_data; + xnbo_t *xnbop = xnbp->xnb_flavour_data; mblk_t *next, *keep, *keep_head, *free, *free_head; keep = keep_head = free = free_head = NULL; @@ -207,8 +207,8 @@ xnbo_from_mac_filter(void *arg, mac_resource_handle_t mrh, mblk_t *mp) continue; } - if (bcmp(hdr_info.mhi_daddr, xnbp->x_mac_addr, - sizeof (xnbp->x_mac_addr)) == 0) { + if (bcmp(hdr_info.mhi_daddr, xnbp->xnb_mac_addr, + sizeof (xnbp->xnb_mac_addr)) == 0) { ADD(keep, mp); continue; } @@ -228,7 +228,7 @@ static void xnbo_notify(void *arg, mac_notify_type_t type) { xnb_t *xnbp = arg; - xnbo_t *xnbop = xnbp->x_flavour_data; + xnbo_t *xnbop = xnbp->xnb_flavour_data; switch (type) { case MAC_NOTE_PROMISC: @@ -240,13 +240,13 @@ xnbo_notify(void *arg, mac_notify_type_t type) static boolean_t xnbo_open_mac(xnb_t *xnbp, char *mac) { - xnbo_t *xnbop = xnbp->x_flavour_data; + xnbo_t *xnbop = xnbp->xnb_flavour_data; int err, need_rx_filter, need_setphysaddr, need_promiscuous; const mac_info_t *mi; char *xsname; void (*rx_fn)(void *, mac_resource_handle_t, mblk_t *); - xsname = xvdi_get_xsname(xnbp->x_devinfo); + xsname = xvdi_get_xsname(xnbp->xnb_devinfo); if ((err = mac_open(mac, &xnbop->o_mh)) != 0) { cmn_err(CE_WARN, "xnbo_open_mac: " @@ -313,10 +313,10 @@ xnbo_open_mac(xnb_t *xnbp, char *mac) if (need_setphysaddr > 0) { struct ether_addr ea; - err = mac_unicst_set(xnbop->o_mh, xnbp->x_mac_addr); + err = mac_unicst_set(xnbop->o_mh, xnbp->xnb_mac_addr); /* Warn, but continue on. 
*/ if (err != 0) { - bcopy(xnbp->x_mac_addr, ea.ether_addr_octet, + bcopy(xnbp->xnb_mac_addr, ea.ether_addr_octet, ETHERADDRL); cmn_err(CE_WARN, "xnbo_open_mac: " "cannot set MAC address of %s to " @@ -367,7 +367,7 @@ xnbo_hotplug(xnb_t *xnbp) char *xsname; char mac[LIFNAMSIZ]; - xsname = xvdi_get_xsname(xnbp->x_devinfo); + xsname = xvdi_get_xsname(xnbp->xnb_devinfo); if (xenbus_scanf(XBT_NULL, xsname, "nic", "%s", mac) != 0) { cmn_err(CE_WARN, "xnbo_hotplug: " "cannot read nic name from %s", xsname); @@ -428,7 +428,7 @@ xnbo_connected(xnb_t *xnbp) static void xnbo_disconnected(xnb_t *xnbp) { - xnbo_close_mac(xnbp->x_flavour_data); + xnbo_close_mac(xnbp->xnb_flavour_data); } static int @@ -469,7 +469,7 @@ static int xnbo_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { xnb_t *xnbp = ddi_get_driver_private(dip); - xnbo_t *xnbop = xnbp->x_flavour_data; + xnbo_t *xnbop = xnbp->xnb_flavour_data; switch (cmd) { case DDI_DETACH: @@ -480,19 +480,19 @@ xnbo_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) return (DDI_FAILURE); } - mutex_enter(&xnbp->x_tx_lock); - mutex_enter(&xnbp->x_rx_lock); + mutex_enter(&xnbp->xnb_tx_lock); + mutex_enter(&xnbp->xnb_rx_lock); - if (!xnbp->x_detachable || xnbp->x_connected || - (xnbp->x_rx_buf_count > 0)) { - mutex_exit(&xnbp->x_rx_lock); - mutex_exit(&xnbp->x_tx_lock); + if (!xnbp->xnb_detachable || xnbp->xnb_connected || + (xnbp->xnb_rx_buf_count > 0)) { + mutex_exit(&xnbp->xnb_rx_lock); + mutex_exit(&xnbp->xnb_tx_lock); return (DDI_FAILURE); } - mutex_exit(&xnbp->x_rx_lock); - mutex_exit(&xnbp->x_tx_lock); + mutex_exit(&xnbp->xnb_rx_lock); + mutex_exit(&xnbp->xnb_tx_lock); xnbo_close_mac(xnbop); kmem_free(xnbop, sizeof (*xnbop)); diff --git a/usr/src/uts/common/xen/io/xnbu.c b/usr/src/uts/common/xen/io/xnbu.c index 1ed6067af0..fa9604194b 100644 --- a/usr/src/uts/common/xen/io/xnbu.c +++ b/usr/src/uts/common/xen/io/xnbu.c @@ -81,14 +81,14 @@ static mac_callbacks_t xnb_callbacks = { static void xnbu_to_host(xnb_t *xnbp, mblk_t *mp) { - xnbu_t *xnbup = xnbp->x_flavour_data; + xnbu_t *xnbup = xnbp->xnb_flavour_data; boolean_t sched = B_FALSE; ASSERT(mp != NULL); mac_rx(xnbup->u_mh, xnbup->u_rx_handle, mp); - mutex_enter(&xnbp->x_tx_lock); + mutex_enter(&xnbp->xnb_tx_lock); /* * If a transmit attempt failed because we ran out of ring @@ -96,12 +96,12 @@ xnbu_to_host(xnb_t *xnbp, mblk_t *mp) * path. 
*/ if (xnbup->u_need_sched && - RING_HAS_UNCONSUMED_REQUESTS(&xnbp->x_rx_ring)) { + RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) { sched = B_TRUE; xnbup->u_need_sched = B_FALSE; } - mutex_exit(&xnbp->x_tx_lock); + mutex_exit(&xnbp->xnb_tx_lock); if (sched) mac_tx_update(xnbup->u_mh); @@ -155,7 +155,7 @@ xnbu_cksum_to_peer(xnb_t *xnbp, mblk_t *mp) { uint16_t r = 0; - if (xnbp->x_cksum_offload) { + if (xnbp->xnb_cksum_offload) { uint32_t pflags; hcksum_retrieve(mp, NULL, NULL, NULL, NULL, @@ -176,7 +176,7 @@ xnbu_cksum_to_peer(xnb_t *xnbp, mblk_t *mp) static void xnbu_connected(xnb_t *xnbp) { - xnbu_t *xnbup = xnbp->x_flavour_data; + xnbu_t *xnbup = xnbp->xnb_flavour_data; mac_link_update(xnbup->u_mh, LINK_STATE_UP); /* @@ -188,7 +188,7 @@ xnbu_connected(xnb_t *xnbp) static void xnbu_disconnected(xnb_t *xnbp) { - xnbu_t *xnbup = xnbp->x_flavour_data; + xnbu_t *xnbup = xnbp->xnb_flavour_data; mac_link_update(xnbup->u_mh, LINK_STATE_DOWN); } @@ -204,9 +204,9 @@ static mblk_t * xnbu_m_send(void *arg, mblk_t *mp) { xnb_t *xnbp = arg; - xnbu_t *xnbup = xnbp->x_flavour_data; + xnbu_t *xnbup = xnbp->xnb_flavour_data; - mp = xnb_to_peer(arg, mp); + mp = xnb_copy_to_peer(arg, mp); /* XXPV dme: playing with need_sched without txlock? */ @@ -239,10 +239,10 @@ static int xnbu_m_set_mac_addr(void *arg, const uint8_t *macaddr) { xnb_t *xnbp = arg; - xnbu_t *xnbup = xnbp->x_flavour_data; + xnbu_t *xnbup = xnbp->xnb_flavour_data; - bcopy(macaddr, xnbp->x_mac_addr, ETHERADDRL); - mac_unicst_update(xnbup->u_mh, xnbp->x_mac_addr); + bcopy(macaddr, xnbp->xnb_mac_addr, ETHERADDRL); + mac_unicst_update(xnbup->u_mh, xnbp->xnb_mac_addr); return (0); } @@ -300,12 +300,12 @@ xnbu_m_stat(void *arg, uint_t stat, uint64_t *val) { xnb_t *xnbp = arg; - mutex_enter(&xnbp->x_tx_lock); - mutex_enter(&xnbp->x_rx_lock); + mutex_enter(&xnbp->xnb_tx_lock); + mutex_enter(&xnbp->xnb_rx_lock); #define map_stat(q, r) \ case (MAC_STAT_##q): \ - *val = xnbp->x_stat_##r; \ + *val = xnbp->xnb_stat_##r; \ break switch (stat) { @@ -316,16 +316,16 @@ xnbu_m_stat(void *arg, uint_t stat, uint64_t *val) map_stat(OBYTES, obytes); default: - mutex_exit(&xnbp->x_rx_lock); - mutex_exit(&xnbp->x_tx_lock); + mutex_exit(&xnbp->xnb_rx_lock); + mutex_exit(&xnbp->xnb_tx_lock); return (ENOTSUP); } #undef map_stat - mutex_exit(&xnbp->x_rx_lock); - mutex_exit(&xnbp->x_tx_lock); + mutex_exit(&xnbp->xnb_rx_lock); + mutex_exit(&xnbp->xnb_tx_lock); return (0); } @@ -343,7 +343,7 @@ static void xnbu_m_resources(void *arg) { xnb_t *xnbp = arg; - xnbu_t *xnbup = xnbp->x_flavour_data; + xnbu_t *xnbup = xnbp->xnb_flavour_data; mac_rx_fifo_t mrf; mrf.mrf_type = MAC_RX_FIFO; @@ -365,7 +365,7 @@ xnbu_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) case MAC_CAPAB_HCKSUM: { uint32_t *capab = cap_data; - if (xnbp->x_cksum_offload) + if (xnbp->xnb_cksum_offload) *capab = HCKSUM_INET_PARTIAL; else *capab = 0; @@ -428,13 +428,13 @@ xnbu_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) * used by the generic layer. 
*/ mr->m_type_ident = MAC_PLUGIN_IDENT_ETHER; - mr->m_src_addr = xnbp->x_mac_addr; + mr->m_src_addr = xnbp->xnb_mac_addr; mr->m_callbacks = &xnb_callbacks; mr->m_min_sdu = 0; mr->m_max_sdu = XNBMAXPKT; - (void) memset(xnbp->x_mac_addr, 0xff, ETHERADDRL); - xnbp->x_mac_addr[0] &= 0xfe; + (void) memset(xnbp->xnb_mac_addr, 0xff, ETHERADDRL); + xnbp->xnb_mac_addr[0] &= 0xfe; xnbup->u_need_sched = B_FALSE; /* @@ -458,7 +458,7 @@ int xnbu_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { xnb_t *xnbp = ddi_get_driver_private(dip); - xnbu_t *xnbup = xnbp->x_flavour_data; + xnbu_t *xnbup = xnbp->xnb_flavour_data; switch (cmd) { case DDI_DETACH: @@ -472,19 +472,19 @@ xnbu_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) ASSERT(xnbp != NULL); ASSERT(xnbup != NULL); - mutex_enter(&xnbp->x_tx_lock); - mutex_enter(&xnbp->x_rx_lock); + mutex_enter(&xnbp->xnb_tx_lock); + mutex_enter(&xnbp->xnb_rx_lock); - if (!xnbp->x_detachable || xnbp->x_connected || - (xnbp->x_rx_buf_count > 0)) { - mutex_exit(&xnbp->x_rx_lock); - mutex_exit(&xnbp->x_tx_lock); + if (!xnbp->xnb_detachable || xnbp->xnb_connected || + (xnbp->xnb_rx_buf_count > 0)) { + mutex_exit(&xnbp->xnb_rx_lock); + mutex_exit(&xnbp->xnb_tx_lock); return (DDI_FAILURE); } - mutex_exit(&xnbp->x_rx_lock); - mutex_exit(&xnbp->x_tx_lock); + mutex_exit(&xnbp->xnb_rx_lock); + mutex_exit(&xnbp->xnb_tx_lock); /* * Attempt to unregister the mac. diff --git a/usr/src/uts/common/xen/io/xnf.c b/usr/src/uts/common/xen/io/xnf.c index 4f457edf00..89a12e4d03 100644 --- a/usr/src/uts/common/xen/io/xnf.c +++ b/usr/src/uts/common/xen/io/xnf.c @@ -63,49 +63,42 @@ */ #include <sys/types.h> -#include <sys/hypervisor.h> -#include <sys/debug.h> #include <sys/errno.h> #include <sys/param.h> #include <sys/sysmacros.h> #include <sys/systm.h> -#include <sys/stropts.h> #include <sys/stream.h> #include <sys/strsubr.h> -#include <sys/kmem.h> #include <sys/conf.h> #include <sys/ddi.h> #include <sys/devops.h> #include <sys/sunddi.h> #include <sys/sunndi.h> -#include <sys/ksynch.h> #include <sys/dlpi.h> #include <sys/ethernet.h> #include <sys/strsun.h> #include <sys/pattr.h> -#include <inet/common.h> #include <inet/ip.h> -#include <sys/stat.h> #include <sys/modctl.h> #include <sys/mac.h> #include <sys/mac_ether.h> -#include <sys/atomic.h> -#include <sys/errno.h> -#include <sys/machsystm.h> -#include <sys/bootconf.h> -#include <sys/bootsvcs.h> #include <sys/bootinfo.h> -#include <sys/promif.h> -#include <sys/archsystm.h> -#include <sys/gnttab.h> #include <sys/mach_mmu.h> -#include <xen/public/memory.h> - -#include "xnf.h" - +#ifdef XPV_HVM_DRIVER +#include <sys/xpv_support.h> +#include <sys/hypervisor.h> +#else +#include <sys/hypervisor.h> #include <sys/evtchn_impl.h> #include <sys/balloon_impl.h> +#endif +#include <xen/public/io/netif.h> +#include <sys/gnttab.h> #include <xen/sys/xendev.h> +#include <sys/sdt.h> + +#include <io/xnf.h> + /* * Declarations and Module Linkage @@ -127,6 +120,10 @@ int xnfdebug = 0; #define xnf_btop(addr) ((addr) >> PAGESHIFT) boolean_t xnf_cksum_offload = B_TRUE; + +/* Default value for hypervisor-based copy operations */ +boolean_t xnf_rx_hvcopy = B_TRUE; + /* * Should pages used for transmit be readonly for the peer? 
*/ @@ -164,17 +161,20 @@ static void xnf_release_dma_resources(xnf_t *); static mblk_t *xnf_process_recv(xnf_t *); static void xnf_rcv_complete(struct xnf_buffer_desc *); static void xnf_release_mblks(xnf_t *); -static struct xnf_buffer_desc *xnf_alloc_xmit_buffer(xnf_t *); +static struct xnf_buffer_desc *xnf_alloc_tx_buffer(xnf_t *); static struct xnf_buffer_desc *xnf_alloc_buffer(xnf_t *); -static struct xnf_buffer_desc *xnf_get_xmit_buffer(xnf_t *); +static struct xnf_buffer_desc *xnf_get_tx_buffer(xnf_t *); static struct xnf_buffer_desc *xnf_get_buffer(xnf_t *); static void xnf_free_buffer(struct xnf_buffer_desc *); -static void xnf_free_xmit_buffer(struct xnf_buffer_desc *); +static void xnf_free_tx_buffer(struct xnf_buffer_desc *); void xnf_send_driver_status(int, int); static void rx_buffer_hang(xnf_t *, struct xnf_buffer_desc *); static int xnf_clean_tx_ring(xnf_t *); static void oe_state_change(dev_info_t *, ddi_eventcookie_t, void *, void *); +static mblk_t *xnf_process_hvcopy_recv(xnf_t *xnfp); +static boolean_t xnf_hvcopy_peer_status(dev_info_t *devinfo); +static boolean_t xnf_kstat_init(xnf_t *xnfp); /* * XXPV dme: remove MC_IOCTL? @@ -194,8 +194,8 @@ static mac_callbacks_t xnf_callbacks = { }; #define GRANT_INVALID_REF 0 -int xnf_recv_bufs_lowat = 4 * NET_RX_RING_SIZE; -int xnf_recv_bufs_hiwat = 8 * NET_RX_RING_SIZE; /* default max */ +const int xnf_rx_bufs_lowat = 4 * NET_RX_RING_SIZE; +const int xnf_rx_bufs_hiwat = 8 * NET_RX_RING_SIZE; /* default max */ /* DMA attributes for network ring buffer */ static ddi_dma_attr_t ringbuf_dma_attr = { @@ -300,134 +300,54 @@ _info(struct modinfo *modinfop) return (mod_info(&modlinkage, modinfop)); } -/* - * Statistics. - */ -/* XXPV: most of these names need re-"nice"ing */ -static char *xnf_aux_statistics[] = { - "tx_cksum_deferred", - "rx_cksum_no_need", - "intr", - "xmit_pullup", - "xmit_pagebndry", - "xmit_attempt", - "rx_no_ringbuf", - "mac_rcv_error", - "runt", -}; - -static int -xnf_kstat_aux_update(kstat_t *ksp, int flag) -{ - xnf_t *xnfp; - kstat_named_t *knp; - - if (flag != KSTAT_READ) - return (EACCES); - - xnfp = ksp->ks_private; - knp = ksp->ks_data; - - /* - * Assignment order should match that of the names in - * xnf_aux_statistics. - */ - (knp++)->value.ui64 = xnfp->stat_tx_cksum_deferred; - (knp++)->value.ui64 = xnfp->stat_rx_cksum_no_need; - - (knp++)->value.ui64 = xnfp->stat_intr; - (knp++)->value.ui64 = xnfp->stat_xmit_pullup; - (knp++)->value.ui64 = xnfp->stat_xmit_pagebndry; - (knp++)->value.ui64 = xnfp->stat_xmit_attempt; - (knp++)->value.ui64 = xnfp->stat_rx_no_ringbuf; - (knp++)->value.ui64 = xnfp->stat_mac_rcv_error; - (knp++)->value.ui64 = xnfp->stat_runt; - - return (0); -} - -static boolean_t -xnf_kstat_init(xnf_t *xnfp) -{ - int nstat = sizeof (xnf_aux_statistics) / - sizeof (xnf_aux_statistics[0]); - char **cp = xnf_aux_statistics; - kstat_named_t *knp; - - /* - * Create and initialise kstats. 
- */ - if ((xnfp->kstat_aux = kstat_create("xnf", - ddi_get_instance(xnfp->devinfo), - "aux_statistics", "net", KSTAT_TYPE_NAMED, - nstat, 0)) == NULL) - return (B_FALSE); - - xnfp->kstat_aux->ks_private = xnfp; - xnfp->kstat_aux->ks_update = xnf_kstat_aux_update; - - knp = xnfp->kstat_aux->ks_data; - while (nstat > 0) { - kstat_named_init(knp, *cp, KSTAT_DATA_UINT64); - - knp++; - cp++; - nstat--; - } - - kstat_install(xnfp->kstat_aux); - - return (B_TRUE); -} - static int xnf_setup_rings(xnf_t *xnfp) { int ix, err; RING_IDX i; - struct xnf_buffer_desc *bdesc, *rbp; - struct xenbus_device *xsd; - domid_t oeid; + struct xnf_buffer_desc *bdesc, *rbp; + struct xenbus_device *xsd; + domid_t oeid; - oeid = xvdi_get_oeid(xnfp->devinfo); - xsd = xvdi_get_xsd(xnfp->devinfo); + oeid = xvdi_get_oeid(xnfp->xnf_devinfo); + xsd = xvdi_get_xsd(xnfp->xnf_devinfo); - if (xnfp->tx_ring_ref != GRANT_INVALID_REF) - gnttab_end_foreign_access(xnfp->tx_ring_ref, 0, 0); + if (xnfp->xnf_tx_ring_ref != GRANT_INVALID_REF) + gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0); err = gnttab_grant_foreign_access(oeid, - xnf_btop(pa_to_ma(xnfp->tx_ring_phys_addr)), 0); + xnf_btop(pa_to_ma(xnfp->xnf_tx_ring_phys_addr)), 0); if (err <= 0) { err = -err; xenbus_dev_error(xsd, err, "granting access to tx ring page"); goto out; } - xnfp->tx_ring_ref = (grant_ref_t)err; + xnfp->xnf_tx_ring_ref = (grant_ref_t)err; - if (xnfp->rx_ring_ref != GRANT_INVALID_REF) - gnttab_end_foreign_access(xnfp->rx_ring_ref, 0, 0); + if (xnfp->xnf_rx_ring_ref != GRANT_INVALID_REF) + gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0); err = gnttab_grant_foreign_access(oeid, - xnf_btop(pa_to_ma(xnfp->rx_ring_phys_addr)), 0); + xnf_btop(pa_to_ma(xnfp->xnf_rx_ring_phys_addr)), 0); if (err <= 0) { err = -err; xenbus_dev_error(xsd, err, "granting access to rx ring page"); goto out; } - xnfp->rx_ring_ref = (grant_ref_t)err; + xnfp->xnf_rx_ring_ref = (grant_ref_t)err; - mutex_enter(&xnfp->intrlock); + mutex_enter(&xnfp->xnf_intrlock); /* * Cleanup the TX ring. We just clean up any valid tx_pktinfo structs * and reset the ring. Note that this can lose packets after a resume, * but we expect to stagger on. */ - mutex_enter(&xnfp->txlock); + mutex_enter(&xnfp->xnf_txlock); - for (i = 0; i < xnfp->n_xmits; i++) { - struct tx_pktinfo *txp = &xnfp->tx_pkt_info[i]; + for (i = 0; i < xnfp->xnf_n_tx; i++) { + struct tx_pktinfo *txp = &xnfp->xnf_tx_pkt_info[i]; txp->id = i + 1; @@ -446,83 +366,105 @@ xnf_setup_rings(xnf_t *xnfp) (void) ddi_dma_unbind_handle(txp->dma_handle); if (txp->bdesc != NULL) { - xnf_free_xmit_buffer(txp->bdesc); + xnf_free_tx_buffer(txp->bdesc); txp->bdesc = NULL; } (void) gnttab_end_foreign_access_ref(txp->grant_ref, - xnfp->tx_pages_readonly); - gnttab_release_grant_reference(&xnfp->gref_tx_head, + xnfp->xnf_tx_pages_readonly); + gnttab_release_grant_reference(&xnfp->xnf_gref_tx_head, txp->grant_ref); txp->grant_ref = GRANT_INVALID_REF; } - xnfp->tx_pkt_id_list = 0; - xnfp->tx_ring.rsp_cons = 0; - xnfp->tx_ring.sring->req_prod = 0; - xnfp->tx_ring.sring->rsp_prod = 0; - xnfp->tx_ring.sring->rsp_event = 1; + xnfp->xnf_tx_pkt_id_list = 0; + xnfp->xnf_tx_ring.rsp_cons = 0; + xnfp->xnf_tx_ring.sring->req_prod = 0; + xnfp->xnf_tx_ring.sring->rsp_prod = 0; + xnfp->xnf_tx_ring.sring->rsp_event = 1; - mutex_exit(&xnfp->txlock); + mutex_exit(&xnfp->xnf_txlock); /* * Rebuild the RX ring. We have to rebuild the RX ring because some of - * our pages are currently flipped out so we can't just free the RX - * buffers. 
Reclaim any unprocessed recv buffers, they won't be + * our pages are currently flipped out/granted so we can't just free + * the RX buffers. Reclaim any unprocessed recv buffers, they won't be * useable anyway since the mfn's they refer to are no longer valid. * Grant the backend domain access to each hung rx buffer. */ - i = xnfp->rx_ring.rsp_cons; - while (i++ != xnfp->rx_ring.sring->req_prod) { + i = xnfp->xnf_rx_ring.rsp_cons; + while (i++ != xnfp->xnf_rx_ring.sring->req_prod) { volatile netif_rx_request_t *rxrp; - rxrp = RING_GET_REQUEST(&xnfp->rx_ring, i); - ix = rxrp - RING_GET_REQUEST(&xnfp->rx_ring, 0); - rbp = xnfp->rxpkt_bufptr[ix]; + rxrp = RING_GET_REQUEST(&xnfp->xnf_rx_ring, i); + ix = rxrp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0); + rbp = xnfp->xnf_rxpkt_bufptr[ix]; if (rbp != NULL) { - ASSERT(rbp->grant_ref != GRANT_INVALID_REF); - gnttab_grant_foreign_transfer_ref(rbp->grant_ref, - oeid); + grant_ref_t ref = rbp->grant_ref; + + ASSERT(ref != GRANT_INVALID_REF); + if (xnfp->xnf_rx_hvcopy) { + pfn_t pfn = xnf_btop(rbp->buf_phys); + mfn_t mfn = pfn_to_mfn(pfn); + + gnttab_grant_foreign_access_ref(ref, oeid, + mfn, 0); + } else { + gnttab_grant_foreign_transfer_ref(ref, oeid); + } rxrp->id = ix; - rxrp->gref = rbp->grant_ref; + rxrp->gref = ref; } } + /* * Reset the ring pointers to initial state. * Hang buffers for any empty ring slots. */ - xnfp->rx_ring.rsp_cons = 0; - xnfp->rx_ring.sring->req_prod = 0; - xnfp->rx_ring.sring->rsp_prod = 0; - xnfp->rx_ring.sring->rsp_event = 1; + xnfp->xnf_rx_ring.rsp_cons = 0; + xnfp->xnf_rx_ring.sring->req_prod = 0; + xnfp->xnf_rx_ring.sring->rsp_prod = 0; + xnfp->xnf_rx_ring.sring->rsp_event = 1; for (i = 0; i < NET_RX_RING_SIZE; i++) { - xnfp->rx_ring.req_prod_pvt = i; - if (xnfp->rxpkt_bufptr[i] != NULL) + xnfp->xnf_rx_ring.req_prod_pvt = i; + if (xnfp->xnf_rxpkt_bufptr[i] != NULL) continue; if ((bdesc = xnf_get_buffer(xnfp)) == NULL) break; rx_buffer_hang(xnfp, bdesc); } - xnfp->rx_ring.req_prod_pvt = i; + xnfp->xnf_rx_ring.req_prod_pvt = i; /* LINTED: constant in conditional context */ - RING_PUSH_REQUESTS(&xnfp->rx_ring); + RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring); - mutex_exit(&xnfp->intrlock); + mutex_exit(&xnfp->xnf_intrlock); return (0); out: - if (xnfp->tx_ring_ref != GRANT_INVALID_REF) - gnttab_end_foreign_access(xnfp->tx_ring_ref, 0, 0); - xnfp->tx_ring_ref = GRANT_INVALID_REF; + if (xnfp->xnf_tx_ring_ref != GRANT_INVALID_REF) + gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0); + xnfp->xnf_tx_ring_ref = GRANT_INVALID_REF; - if (xnfp->rx_ring_ref != GRANT_INVALID_REF) - gnttab_end_foreign_access(xnfp->rx_ring_ref, 0, 0); - xnfp->rx_ring_ref = GRANT_INVALID_REF; + if (xnfp->xnf_rx_ring_ref != GRANT_INVALID_REF) + gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0); + xnfp->xnf_rx_ring_ref = GRANT_INVALID_REF; return (err); } + +/* Called when the upper layers free a message we passed upstream */ +static void +xnf_copy_rcv_complete(struct xnf_buffer_desc *bdesc) +{ + (void) ddi_dma_unbind_handle(bdesc->dma_handle); + ddi_dma_mem_free(&bdesc->acc_handle); + ddi_dma_free_handle(&bdesc->dma_handle); + kmem_free(bdesc, sizeof (*bdesc)); +} + + /* * Connect driver to back end, called to set up communication with * back end driver both initially and on resume after restore/migrate. 
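[Editor's note — the xnf_rx_hvcopy branch in xnf_setup_rings() above is the frontend half of the new copy path: instead of granting each rx page for *transfer* (page flipping), the page stays granted for *access* and the backend fills it with a single GNTTABOP_copy. A minimal sketch of such a copy from the backend's point of view, assuming the standard Xen public gnttab_copy_t layout; the helper and its parameters are illustrative, not code from this patch:

static int16_t
copy_to_peer_page(domid_t peer, grant_ref_t gref, mfn_t local_mfn,
    uint16_t offset, uint16_t len)
{
	gnttab_copy_t op;

	op.source.u.gmfn = local_mfn;	/* our page, named by frame number */
	op.source.domid = DOMID_SELF;
	op.source.offset = 0;
	op.dest.u.ref = gref;		/* peer page, named by grant ref */
	op.dest.domid = peer;
	op.dest.offset = offset;
	op.len = len;
	op.flags = GNTCOPY_dest_gref;	/* only the destination is a gref */

	if (HYPERVISOR_grant_table_op(GNTTABOP_copy, &op, 1) != 0)
		return (-1);

	return (op.status);	/* GNTST_okay (0) on success */
}

Because the hypervisor performs the copy, neither domain maps the other's page on the data path, which is one motivation for preferring this over page flipping.]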
@@ -533,16 +475,16 @@ xnf_be_connect(xnf_t *xnfp) char mac[ETHERADDRL * 3]; const char *message; xenbus_transaction_t xbt; - struct xenbus_device *xsd; + struct xenbus_device *xsd; char *xsname; int err, be_no_cksum_offload; - ASSERT(!xnfp->connected); + ASSERT(!xnfp->xnf_connected); - xsd = xvdi_get_xsd(xnfp->devinfo); - xsname = xvdi_get_xsname(xnfp->devinfo); + xsd = xvdi_get_xsd(xnfp->xnf_devinfo); + xsname = xvdi_get_xsname(xnfp->xnf_devinfo); - err = xenbus_scanf(XBT_NULL, xvdi_get_oename(xnfp->devinfo), "mac", + err = xenbus_scanf(XBT_NULL, xvdi_get_oename(xnfp->xnf_devinfo), "mac", "%s", (char *)&mac[0]); if (err != 0) { /* @@ -550,12 +492,12 @@ xnf_be_connect(xnf_t *xnfp) * addr. at this point */ cmn_err(CE_WARN, "%s%d: no mac address", - ddi_driver_name(xnfp->devinfo), - ddi_get_instance(xnfp->devinfo)); + ddi_driver_name(xnfp->xnf_devinfo), + ddi_get_instance(xnfp->xnf_devinfo)); return; } - if (ether_aton(mac, xnfp->mac_addr) != ETHERADDRL) { + if (ether_aton(mac, xnfp->xnf_mac_addr) != ETHERADDRL) { err = ENOENT; xenbus_dev_error(xsd, ENOENT, "parsing %s/mac", xsname); return; @@ -568,7 +510,7 @@ xnf_be_connect(xnf_t *xnfp) return; } - err = xenbus_scanf(XBT_NULL, xvdi_get_oename(xnfp->devinfo), + err = xenbus_scanf(XBT_NULL, xvdi_get_oename(xnfp->xnf_devinfo), "feature-no-csum-offload", "%d", &be_no_cksum_offload); /* * If we fail to read the store we assume that the key is @@ -581,8 +523,8 @@ xnf_be_connect(xnf_t *xnfp) * If the far end cannot do checksum offload or we do not wish * to do it, disable it. */ - if ((be_no_cksum_offload == 1) || !xnfp->cksum_offload) - xnfp->cksum_offload = B_FALSE; + if ((be_no_cksum_offload == 1) || !xnfp->xnf_cksum_offload) + xnfp->xnf_cksum_offload = B_FALSE; again: err = xenbus_transaction_start(&xbt); @@ -592,20 +534,21 @@ again: } err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u", - xnfp->tx_ring_ref); + xnfp->xnf_tx_ring_ref); if (err != 0) { message = "writing tx ring-ref"; goto abort_transaction; } err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u", - xnfp->rx_ring_ref); + xnfp->xnf_rx_ring_ref); if (err != 0) { message = "writing rx ring-ref"; goto abort_transaction; } - err = xenbus_printf(xbt, xsname, "event-channel", "%u", xnfp->evtchn); + err = xenbus_printf(xbt, xsname, "event-channel", "%u", + xnfp->xnf_evtchn); if (err != 0) { message = "writing event-channel"; goto abort_transaction; @@ -617,7 +560,7 @@ again: goto abort_transaction; } - if (!xnfp->tx_pages_readonly) { + if (!xnfp->xnf_tx_pages_readonly) { err = xenbus_printf(xbt, xsname, "feature-tx-writable", "%d", 1); if (err != 0) { @@ -627,11 +570,17 @@ again: } err = xenbus_printf(xbt, xsname, "feature-no-csum-offload", "%d", - xnfp->cksum_offload ? 0 : 1); + xnfp->xnf_cksum_offload ? 0 : 1); if (err != 0) { message = "writing feature-no-csum-offload"; goto abort_transaction; } + err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d", + xnfp->xnf_rx_hvcopy ? 
1 : 0); + if (err != 0) { + message = "writing request-rx-copy"; + goto abort_transaction; + } err = xenbus_printf(xbt, xsname, "state", "%d", XenbusStateConnected); if (err != 0) { @@ -677,20 +626,24 @@ xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd) (void) xvdi_resume(devinfo); (void) xvdi_alloc_evtchn(devinfo); + xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo); +#ifdef XPV_HVM_DRIVER + ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, + xnfp); +#else (void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp); - xnfp->evtchn = xvdi_get_evtchn(devinfo); +#endif xnf_be_connect(xnfp); /* - * Our MAC address didn't necessarily change, but - * given that we may be resuming this OS instance - * on a different machine (or on the same one and got a - * different MAC address because we didn't specify one of - * our own), it's useful to claim that - * it changed in order that IP send out a - * gratuitous ARP. + * Our MAC address may have changed if we're resuming: + * - on a different host + * - on the same one and got a different MAC address + * because we didn't specify one of our own. + * so it's useful to claim that it changed in order that + * IP send out a gratuitous ARP. */ - mac_unicst_update(xnfp->mh, xnfp->mac_addr); + mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr); return (DDI_SUCCESS); case DDI_ATTACH: @@ -710,23 +663,32 @@ xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd) macp->m_dip = devinfo; macp->m_driver = xnfp; - xnfp->devinfo = devinfo; + xnfp->xnf_devinfo = devinfo; macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER; - macp->m_src_addr = xnfp->mac_addr; + macp->m_src_addr = xnfp->xnf_mac_addr; macp->m_callbacks = &xnf_callbacks; macp->m_min_sdu = 0; macp->m_max_sdu = XNF_MAXPKT; - xnfp->running = B_FALSE; - xnfp->connected = B_FALSE; - xnfp->cksum_offload = xnf_cksum_offload; - xnfp->tx_pages_readonly = xnf_tx_pages_readonly; + xnfp->xnf_running = B_FALSE; + xnfp->xnf_connected = B_FALSE; + xnfp->xnf_cksum_offload = xnf_cksum_offload; + xnfp->xnf_tx_pages_readonly = xnf_tx_pages_readonly; + + xnfp->xnf_rx_hvcopy = xnf_hvcopy_peer_status(devinfo) && xnf_rx_hvcopy; +#ifdef XPV_HVM_DRIVER + if (!xnfp->xnf_rx_hvcopy) { + cmn_err(CE_WARN, "The xnf driver requires a dom0 that " + "supports 'feature-rx-copy'"); + goto failure; + } +#endif /* * Get the iblock cookie with which to initialize the mutexes. */ - if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->icookie) + if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie) != DDI_SUCCESS) goto failure; /* @@ -736,84 +698,94 @@ xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd) * affect the operation of any other part of the driver, * it needs to acquire the txlock mutex. 
*/ - mutex_init(&xnfp->tx_buf_mutex, - NULL, MUTEX_DRIVER, xnfp->icookie); - mutex_init(&xnfp->rx_buf_mutex, - NULL, MUTEX_DRIVER, xnfp->icookie); - mutex_init(&xnfp->txlock, - NULL, MUTEX_DRIVER, xnfp->icookie); - mutex_init(&xnfp->intrlock, - NULL, MUTEX_DRIVER, xnfp->icookie); - cv_init(&xnfp->cv, NULL, CV_DEFAULT, NULL); + mutex_init(&xnfp->xnf_tx_buf_mutex, + NULL, MUTEX_DRIVER, xnfp->xnf_icookie); + mutex_init(&xnfp->xnf_rx_buf_mutex, + NULL, MUTEX_DRIVER, xnfp->xnf_icookie); + mutex_init(&xnfp->xnf_txlock, + NULL, MUTEX_DRIVER, xnfp->xnf_icookie); + mutex_init(&xnfp->xnf_intrlock, + NULL, MUTEX_DRIVER, xnfp->xnf_icookie); + cv_init(&xnfp->xnf_cv, NULL, CV_DEFAULT, NULL); if (gnttab_alloc_grant_references(NET_TX_RING_SIZE, - &xnfp->gref_tx_head) < 0) { + &xnfp->xnf_gref_tx_head) < 0) { cmn_err(CE_WARN, "xnf%d: can't alloc tx grant refs", - ddi_get_instance(xnfp->devinfo)); - goto late_failure; + ddi_get_instance(xnfp->xnf_devinfo)); + goto failure_1; } if (gnttab_alloc_grant_references(NET_RX_RING_SIZE, - &xnfp->gref_rx_head) < 0) { + &xnfp->xnf_gref_rx_head) < 0) { cmn_err(CE_WARN, "xnf%d: can't alloc rx grant refs", - ddi_get_instance(xnfp->devinfo)); - goto late_failure; + ddi_get_instance(xnfp->xnf_devinfo)); + goto failure_1; } if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) { cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize " - "driver data structures", ddi_get_instance(xnfp->devinfo)); - goto late_failure; + "driver data structures", + ddi_get_instance(xnfp->xnf_devinfo)); + goto failure_1; } - xnfp->rx_ring.sring->rsp_event = xnfp->tx_ring.sring->rsp_event = 1; + xnfp->xnf_rx_ring.sring->rsp_event = + xnfp->xnf_tx_ring.sring->rsp_event = 1; - xnfp->tx_ring_ref = GRANT_INVALID_REF; - xnfp->rx_ring_ref = GRANT_INVALID_REF; + xnfp->xnf_tx_ring_ref = GRANT_INVALID_REF; + xnfp->xnf_rx_ring_ref = GRANT_INVALID_REF; /* set driver private pointer now */ ddi_set_driver_private(devinfo, xnfp); if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change) != DDI_SUCCESS) - goto late_failure; + goto failure_1; if (!xnf_kstat_init(xnfp)) - goto very_late_failure; + goto failure_2; /* * Allocate an event channel, add the interrupt handler and * bind it to the event channel. 
*/ (void) xvdi_alloc_evtchn(devinfo); + xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo); +#ifdef XPV_HVM_DRIVER + ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, xnfp); +#else (void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp); - xnfp->evtchn = xvdi_get_evtchn(devinfo); +#endif /* * connect to the backend */ xnf_be_connect(xnfp); - err = mac_register(macp, &xnfp->mh); + err = mac_register(macp, &xnfp->xnf_mh); mac_free(macp); macp = NULL; if (err != 0) - goto very_very_late_failure; + goto failure_3; return (DDI_SUCCESS); -very_very_late_failure: - kstat_delete(xnfp->kstat_aux); +failure_3: + kstat_delete(xnfp->xnf_kstat_aux); -very_late_failure: +failure_2: xvdi_remove_event_handler(devinfo, XS_OE_STATE); - ddi_remove_intr(devinfo, 0, xnfp->icookie); - xnfp->evtchn = INVALID_EVTCHN; +#ifdef XPV_HVM_DRIVER + ec_unbind_evtchn(xnfp->xnf_evtchn); +#else + ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie); +#endif + xnfp->xnf_evtchn = INVALID_EVTCHN; -late_failure: +failure_1: xnf_release_dma_resources(xnfp); - cv_destroy(&xnfp->cv); - mutex_destroy(&xnfp->rx_buf_mutex); - mutex_destroy(&xnfp->txlock); - mutex_destroy(&xnfp->intrlock); + cv_destroy(&xnfp->xnf_cv); + mutex_destroy(&xnfp->xnf_rx_buf_mutex); + mutex_destroy(&xnfp->xnf_txlock); + mutex_destroy(&xnfp->xnf_intrlock); failure: kmem_free(xnfp, sizeof (*xnfp)); @@ -839,17 +811,21 @@ xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd) switch (cmd) { case DDI_SUSPEND: - ddi_remove_intr(devinfo, 0, xnfp->icookie); +#ifdef XPV_HVM_DRIVER + ec_unbind_evtchn(xnfp->xnf_evtchn); +#else + ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie); +#endif xvdi_suspend(devinfo); - mutex_enter(&xnfp->intrlock); - mutex_enter(&xnfp->txlock); + mutex_enter(&xnfp->xnf_intrlock); + mutex_enter(&xnfp->xnf_txlock); - xnfp->evtchn = INVALID_EVTCHN; - xnfp->connected = B_FALSE; - mutex_exit(&xnfp->txlock); - mutex_exit(&xnfp->intrlock); + xnfp->xnf_evtchn = INVALID_EVTCHN; + xnfp->xnf_connected = B_FALSE; + mutex_exit(&xnfp->xnf_txlock); + mutex_exit(&xnfp->xnf_intrlock); return (DDI_SUCCESS); case DDI_DETACH: @@ -859,32 +835,32 @@ xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd) return (DDI_FAILURE); } - if (xnfp->connected) + if (xnfp->xnf_connected) return (DDI_FAILURE); /* Wait for receive buffers to be returned; give up after 5 seconds */ i = 50; - mutex_enter(&xnfp->rx_buf_mutex); - while (xnfp->rx_bufs_outstanding > 0) { - mutex_exit(&xnfp->rx_buf_mutex); + mutex_enter(&xnfp->xnf_rx_buf_mutex); + while (xnfp->xnf_rx_bufs_outstanding > 0) { + mutex_exit(&xnfp->xnf_rx_buf_mutex); delay(drv_usectohz(100000)); if (--i == 0) { cmn_err(CE_WARN, "xnf%d: never reclaimed all the " "receive buffers. 
Still have %d " "buffers outstanding.", - ddi_get_instance(xnfp->devinfo), - xnfp->rx_bufs_outstanding); + ddi_get_instance(xnfp->xnf_devinfo), + xnfp->xnf_rx_bufs_outstanding); return (DDI_FAILURE); } - mutex_enter(&xnfp->rx_buf_mutex); + mutex_enter(&xnfp->xnf_rx_buf_mutex); } - mutex_exit(&xnfp->rx_buf_mutex); + mutex_exit(&xnfp->xnf_rx_buf_mutex); - kstat_delete(xnfp->kstat_aux); + kstat_delete(xnfp->xnf_kstat_aux); - if (mac_unregister(xnfp->mh) != 0) + if (mac_unregister(xnfp->xnf_mh) != 0) return (DDI_FAILURE); /* Stop the receiver */ @@ -893,7 +869,11 @@ xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd) xvdi_remove_event_handler(devinfo, XS_OE_STATE); /* Remove the interrupt */ - ddi_remove_intr(devinfo, 0, xnfp->icookie); +#ifdef XPV_HVM_DRIVER + ec_unbind_evtchn(xnfp->xnf_evtchn); +#else + ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie); +#endif /* Release any pending xmit mblks */ xnf_release_mblks(xnfp); @@ -901,10 +881,10 @@ xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd) /* Release all DMA resources */ xnf_release_dma_resources(xnfp); - cv_destroy(&xnfp->cv); - mutex_destroy(&xnfp->rx_buf_mutex); - mutex_destroy(&xnfp->txlock); - mutex_destroy(&xnfp->intrlock); + cv_destroy(&xnfp->xnf_cv); + mutex_destroy(&xnfp->xnf_rx_buf_mutex); + mutex_destroy(&xnfp->xnf_txlock); + mutex_destroy(&xnfp->xnf_intrlock); kmem_free(xnfp, sizeof (*xnfp)); @@ -924,7 +904,7 @@ xnf_set_mac_addr(void *arg, const uint8_t *macaddr) if (xnfdebug & XNF_DEBUG_TRACE) printf("xnf%d: set_mac_addr(0x%p): " "%02x:%02x:%02x:%02x:%02x:%02x\n", - ddi_get_instance(xnfp->devinfo), + ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp, macaddr[0], macaddr[1], macaddr[2], macaddr[3], macaddr[4], macaddr[5]); #endif @@ -952,7 +932,7 @@ xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca) if (xnfdebug & XNF_DEBUG_TRACE) printf("xnf%d set_multicast(0x%p): " "%02x:%02x:%02x:%02x:%02x:%02x\n", - ddi_get_instance(xnfp->devinfo), + ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp, mca[0], mca[1], mca[2], mca[3], mca[4], mca[5]); #endif @@ -983,7 +963,7 @@ xnf_set_promiscuous(void *arg, boolean_t on) #ifdef XNF_DEBUG if (xnfdebug & XNF_DEBUG_TRACE) printf("xnf%d set_promiscuous(0x%p, %x)\n", - ddi_get_instance(xnfp->devinfo), + ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp, on); #endif /* @@ -1004,45 +984,46 @@ xnf_clean_tx_ring(xnf_t *xnfp) int id; grant_ref_t ref; - ASSERT(MUTEX_HELD(&xnfp->txlock)); + ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); do { /* * index of next transmission ack */ - next_resp = xnfp->tx_ring.sring->rsp_prod; + next_resp = xnfp->xnf_tx_ring.sring->rsp_prod; membar_consumer(); /* * Clean tx packets from ring that we have responses for */ - for (i = xnfp->tx_ring.rsp_cons; i != next_resp; i++) { - id = RING_GET_RESPONSE(&xnfp->tx_ring, i)->id; - reap = &xnfp->tx_pkt_info[id]; + for (i = xnfp->xnf_tx_ring.rsp_cons; i != next_resp; i++) { + id = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i)->id; + reap = &xnfp->xnf_tx_pkt_info[id]; ref = reap->grant_ref; /* * Return id to free list */ - reap->id = xnfp->tx_pkt_id_list; - xnfp->tx_pkt_id_list = id; + reap->id = xnfp->xnf_tx_pkt_id_list; + xnfp->xnf_tx_pkt_id_list = id; if (gnttab_query_foreign_access(ref) != 0) - panic("tx grant still in use" + panic("tx grant still in use " "by backend domain"); (void) ddi_dma_unbind_handle(reap->dma_handle); (void) gnttab_end_foreign_access_ref(ref, - xnfp->tx_pages_readonly); - gnttab_release_grant_reference(&xnfp->gref_tx_head, + xnfp->xnf_tx_pages_readonly); + 
gnttab_release_grant_reference(&xnfp->xnf_gref_tx_head, ref); freemsg(reap->mp); reap->mp = NULL; reap->grant_ref = GRANT_INVALID_REF; if (reap->bdesc != NULL) - xnf_free_xmit_buffer(reap->bdesc); + xnf_free_tx_buffer(reap->bdesc); reap->bdesc = NULL; } - xnfp->tx_ring.rsp_cons = next_resp; + xnfp->xnf_tx_ring.rsp_cons = next_resp; membar_enter(); - } while (next_resp != xnfp->tx_ring.sring->rsp_prod); - return (NET_TX_RING_SIZE - (xnfp->tx_ring.sring->req_prod - next_resp)); + } while (next_resp != xnfp->xnf_tx_ring.sring->rsp_prod); + return (NET_TX_RING_SIZE - (xnfp->xnf_tx_ring.sring->req_prod - + next_resp)); } /* @@ -1062,15 +1043,15 @@ xnf_pullupmsg(xnf_t *xnfp, mblk_t *mp) /* * get a xmit buffer from the xmit buffer pool */ - mutex_enter(&xnfp->rx_buf_mutex); - bdesc = xnf_get_xmit_buffer(xnfp); - mutex_exit(&xnfp->rx_buf_mutex); + mutex_enter(&xnfp->xnf_rx_buf_mutex); + bdesc = xnf_get_tx_buffer(xnfp); + mutex_exit(&xnfp->xnf_rx_buf_mutex); if (bdesc == NULL) return (bdesc); /* * Copy the data into the buffer */ - xnfp->stat_xmit_pullup++; + xnfp->xnf_stat_tx_pullup++; bp = bdesc->buf; for (mptr = mp; mptr != NULL; mptr = mptr->b_cont) { len = mptr->b_wptr - mptr->b_rptr; @@ -1112,28 +1093,28 @@ xnf_send_one(xnf_t *xnfp, mblk_t *mp) #ifdef XNF_DEBUG if (xnfdebug & XNF_DEBUG_SEND) printf("xnf%d send(0x%p, 0x%p)\n", - ddi_get_instance(xnfp->devinfo), + ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp, (void *)mp); #endif ASSERT(mp != NULL); ASSERT(mp->b_next == NULL); - ASSERT(MUTEX_HELD(&xnfp->txlock)); + ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); tx_ring_freespace = xnf_clean_tx_ring(xnfp); ASSERT(tx_ring_freespace >= 0); - oeid = xvdi_get_oeid(xnfp->devinfo); - xnfp->stat_xmit_attempt++; + oeid = xvdi_get_oeid(xnfp->xnf_devinfo); + xnfp->xnf_stat_tx_attempt++; /* * If there are no xmit ring slots available, return. */ if (tx_ring_freespace == 0) { - xnfp->stat_xmit_defer++; + xnfp->xnf_stat_tx_defer++; return (B_FALSE); /* Send should be retried */ } - slot = xnfp->tx_ring.sring->req_prod; + slot = xnfp->xnf_tx_ring.sring->req_prod; /* Count the number of mblks in message and compute packet size */ for (i = 0, mptr = mp; mptr != NULL; mptr = mptr->b_cont, i++) pktlen += (mptr->b_wptr - mptr->b_rptr); @@ -1141,7 +1122,7 @@ xnf_send_one(xnf_t *xnfp, mblk_t *mp) /* Make sure packet isn't too large */ if (pktlen > XNF_FRAMESIZE) { cmn_err(CE_WARN, "xnf%d: large packet %d bytes", - ddi_get_instance(xnfp->devinfo), pktlen); + ddi_get_instance(xnfp->xnf_devinfo), pktlen); freemsg(mp); return (B_FALSE); } @@ -1159,14 +1140,14 @@ xnf_send_one(xnf_t *xnfp, mblk_t *mp) */ if (i > xnf_max_tx_frags || page_oops) { if (page_oops) - xnfp->stat_xmit_pagebndry++; + xnfp->xnf_stat_tx_pagebndry++; if ((xmitbuf = xnf_pullupmsg(xnfp, mp)) == NULL) { /* could not allocate resources? 
*/ #ifdef XNF_DEBUG cmn_err(CE_WARN, "xnf%d: pullupmsg failed", - ddi_get_instance(xnfp->devinfo)); + ddi_get_instance(xnfp->xnf_devinfo)); #endif - xnfp->stat_xmit_defer++; + xnfp->xnf_stat_tx_defer++; return (B_FALSE); /* Retry send */ } bufaddr = xmitbuf->buf; @@ -1181,10 +1162,10 @@ xnf_send_one(xnf_t *xnfp, mblk_t *mp) /* * Get packet id from free list */ - tx_id = xnfp->tx_pkt_id_list; + tx_id = xnfp->xnf_tx_pkt_id_list; ASSERT(tx_id < NET_TX_RING_SIZE); - txp_info = &xnfp->tx_pkt_info[tx_id]; - xnfp->tx_pkt_id_list = txp_info->id; + txp_info = &xnfp->xnf_tx_pkt_info[tx_id]; + xnfp->xnf_tx_pkt_id_list = txp_info->id; txp_info->id = tx_id; /* Prepare for DMA mapping of tx buffer(s) */ @@ -1197,27 +1178,27 @@ xnf_send_one(xnf_t *xnfp, mblk_t *mp) /* * Return id to free list */ - txp_info->id = xnfp->tx_pkt_id_list; - xnfp->tx_pkt_id_list = tx_id; + txp_info->id = xnfp->xnf_tx_pkt_id_list; + xnfp->xnf_tx_pkt_id_list = tx_id; if (rc == DDI_DMA_NORESOURCES) { - xnfp->stat_xmit_defer++; + xnfp->xnf_stat_tx_defer++; return (B_FALSE); /* Retry later */ } #ifdef XNF_DEBUG cmn_err(CE_WARN, "xnf%d: bind_handle failed (%x)", - ddi_get_instance(xnfp->devinfo), rc); + ddi_get_instance(xnfp->xnf_devinfo), rc); #endif return (B_FALSE); } ASSERT(ncookies == 1); - ref = gnttab_claim_grant_reference(&xnfp->gref_tx_head); + ref = gnttab_claim_grant_reference(&xnfp->xnf_gref_tx_head); ASSERT((signed short)ref >= 0); mfn = xnf_btop(pa_to_ma((paddr_t)dma_cookie.dmac_laddress)); gnttab_grant_foreign_access_ref(ref, oeid, mfn, - xnfp->tx_pages_readonly); + xnfp->xnf_tx_pages_readonly); txp_info->grant_ref = ref; - txrp = RING_GET_REQUEST(&xnfp->tx_ring, slot); + txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot); txrp->gref = ref; txrp->size = dma_cookie.dmac_size; txrp->offset = (uintptr_t)bufaddr & PAGEOFFSET; @@ -1225,7 +1206,7 @@ xnf_send_one(xnf_t *xnfp, mblk_t *mp) txrp->flags = 0; hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &pflags); if (pflags != 0) { - ASSERT(xnfp->cksum_offload); + ASSERT(xnfp->xnf_cksum_offload); /* * If the local protocol stack requests checksum * offload we set the 'checksum blank' flag, @@ -1236,27 +1217,28 @@ xnf_send_one(xnf_t *xnfp, mblk_t *mp) * validated that the data and the checksum match. */ txrp->flags |= NETTXF_csum_blank; - xnfp->stat_tx_cksum_deferred++; + xnfp->xnf_stat_tx_cksum_deferred++; } membar_producer(); - xnfp->tx_ring.sring->req_prod = slot + 1; + xnfp->xnf_tx_ring.sring->req_prod = slot + 1; txp_info->mp = mp; txp_info->bdesc = xmitbuf; - txs_out = xnfp->tx_ring.sring->req_prod - xnfp->tx_ring.sring->rsp_prod; - if (xnfp->tx_ring.sring->req_prod - xnfp->tx_ring.rsp_cons < + txs_out = xnfp->xnf_tx_ring.sring->req_prod - + xnfp->xnf_tx_ring.sring->rsp_prod; + if (xnfp->xnf_tx_ring.sring->req_prod - xnfp->xnf_tx_ring.rsp_cons < XNF_TX_FREE_THRESH) { /* * The ring is getting full; Set up this packet * to cause an interrupt. */ - xnfp->tx_ring.sring->rsp_event = - xnfp->tx_ring.sring->rsp_prod + txs_out; + xnfp->xnf_tx_ring.sring->rsp_event = + xnfp->xnf_tx_ring.sring->rsp_prod + txs_out; } - xnfp->stat_opackets++; - xnfp->stat_obytes += pktlen; + xnfp->xnf_stat_opackets++; + xnfp->xnf_stat_obytes += pktlen; return (B_TRUE); /* successful transmit attempt */ } @@ -1268,19 +1250,19 @@ xnf_send(void *arg, mblk_t *mp) mblk_t *next; boolean_t sent_something = B_FALSE; - mutex_enter(&xnfp->txlock); + mutex_enter(&xnfp->xnf_txlock); /* * Transmission attempts should be impossible without having * previously called xnf_start(). 
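The transmit path above keeps its packet ids on an index-chained free list: while an id is unused, its `id' field stores the index of the next free id and xnf_tx_pkt_id_list holds the head; claiming an id makes the field record the id's own index, and completion pushes it back. A minimal sketch of the scheme, assuming an illustrative slot array (names and size are not the driver's, and the caller is expected to verify free space first, as xnf_send_one() does via the ring accounting):

	/*
	 * Index-chained free list: the "next" links live inside the
	 * slots themselves, so no extra memory is needed.
	 */
	#define	NSLOTS	256

	struct slot {
		int	id;		/* free: index of next free slot */
		void	*payload;	/* in use: per-packet state */
	};

	static struct slot	slots[NSLOTS];
	static int		free_head;

	static void
	slot_list_init(void)
	{
		int i;

		for (i = 0; i < NSLOTS; i++)
			slots[i].id = i + 1;	/* slot i links to i + 1 */
		free_head = 0;
	}

	static int
	slot_claim(void)
	{
		int id = free_head;

		free_head = slots[id].id;	/* unlink from free list */
		slots[id].id = id;		/* in use: records own index */
		return (id);
	}

	static void
	slot_return(int id)
	{
		slots[id].id = free_head;	/* push back onto the list */
		free_head = id;
	}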
*/ - ASSERT(xnfp->running); + ASSERT(xnfp->xnf_running); /* * Wait for getting connected to the backend */ - while (!xnfp->connected) { - cv_wait(&xnfp->cv, &xnfp->txlock); + while (!xnfp->xnf_connected) { + cv_wait(&xnfp->xnf_cv, &xnfp->xnf_txlock); } while (mp != NULL) { @@ -1297,9 +1279,9 @@ xnf_send(void *arg, mblk_t *mp) } if (sent_something) - ec_notify_via_evtchn(xnfp->evtchn); + ec_notify_via_evtchn(xnfp->xnf_evtchn); - mutex_exit(&xnfp->txlock); + mutex_exit(&xnfp->xnf_txlock); return (mp); } @@ -1313,27 +1295,33 @@ xnf_intr(caddr_t arg) xnf_t *xnfp = (xnf_t *)arg; int tx_ring_space; - mutex_enter(&xnfp->intrlock); + mutex_enter(&xnfp->xnf_intrlock); /* * If not connected to the peer or not started by the upper * layers we cannot usefully handle interrupts. */ - if (!(xnfp->connected && xnfp->running)) { - mutex_exit(&xnfp->intrlock); + if (!(xnfp->xnf_connected && xnfp->xnf_running)) { + mutex_exit(&xnfp->xnf_intrlock); + xnfp->xnf_stat_unclaimed_interrupts++; return (DDI_INTR_UNCLAIMED); } #ifdef XNF_DEBUG if (xnfdebug & XNF_DEBUG_INT) printf("xnf%d intr(0x%p)\n", - ddi_get_instance(xnfp->devinfo), (void *)xnfp); + ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp); #endif - if (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->rx_ring)) { + if (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) { mblk_t *mp; - if ((mp = xnf_process_recv(xnfp)) != NULL) - mac_rx(xnfp->mh, xnfp->rx_handle, mp); + if (xnfp->xnf_rx_hvcopy) + mp = xnf_process_hvcopy_recv(xnfp); + else + mp = xnf_process_recv(xnfp); + + if (mp != NULL) + mac_rx(xnfp->xnf_mh, xnfp->xnf_rx_handle, mp); } /* @@ -1341,32 +1329,33 @@ xnf_intr(caddr_t arg) */ #define inuse(r) ((r).sring->req_prod - (r).rsp_cons) - if ((NET_TX_RING_SIZE - inuse(xnfp->tx_ring)) < XNF_TX_FREE_THRESH) { + if ((NET_TX_RING_SIZE - inuse(xnfp->xnf_tx_ring)) < + XNF_TX_FREE_THRESH) { /* * Yes, clean it and try to start any blocked xmit * streams. */ - mutex_enter(&xnfp->txlock); + mutex_enter(&xnfp->xnf_txlock); tx_ring_space = xnf_clean_tx_ring(xnfp); - mutex_exit(&xnfp->txlock); + mutex_exit(&xnfp->xnf_txlock); if (tx_ring_space > XNF_TX_FREE_THRESH) { - mutex_exit(&xnfp->intrlock); - mac_tx_update(xnfp->mh); - mutex_enter(&xnfp->intrlock); + mutex_exit(&xnfp->xnf_intrlock); + mac_tx_update(xnfp->xnf_mh); + mutex_enter(&xnfp->xnf_intrlock); } else { /* * Schedule another tx interrupt when we have * sent enough packets to cross the threshold. */ - xnfp->tx_ring.sring->rsp_event = - xnfp->tx_ring.sring->rsp_prod + + xnfp->xnf_tx_ring.sring->rsp_event = + xnfp->xnf_tx_ring.sring->rsp_prod + XNF_TX_FREE_THRESH - tx_ring_space + 1; } } #undef inuse - xnfp->stat_intr++; - mutex_exit(&xnfp->intrlock); + xnfp->xnf_stat_interrupts++; + mutex_exit(&xnfp->xnf_intrlock); return (DDI_INTR_CLAIMED); /* indicate that the interrupt was for us */ } @@ -1381,17 +1370,17 @@ xnf_start(void *arg) #ifdef XNF_DEBUG if (xnfdebug & XNF_DEBUG_TRACE) printf("xnf%d start(0x%p)\n", - ddi_get_instance(xnfp->devinfo), (void *)xnfp); + ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp); #endif - mutex_enter(&xnfp->intrlock); - mutex_enter(&xnfp->txlock); + mutex_enter(&xnfp->xnf_intrlock); + mutex_enter(&xnfp->xnf_txlock); /* Accept packets from above. 
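The inuse() macro and the rsp_event updates above both exploit Xen's free-running ring indices: producer and consumer counters increment without wrapping masks, so occupancy is a plain unsigned subtraction, and storing rsp_prod + n into rsp_event defers the next backend interrupt until n more responses have been produced. A hedged illustration of the arithmetic (RING_IDX mirrors Xen's public io/ring.h):

	#include <stdint.h>

	typedef uint32_t RING_IDX;	/* free-running, as in io/ring.h */

	/* Occupancy; unsigned math makes the subtraction wrap-safe. */
	static unsigned int
	ring_inuse(RING_IDX req_prod, RING_IDX rsp_cons)
	{
		return (req_prod - rsp_cons);
	}

	/*
	 * Deferred notification: writing (rsp_prod + n) to rsp_event
	 * asks the peer to interrupt us only after it has produced
	 * n more responses.
	 */
	static RING_IDX
	ring_next_event(RING_IDX rsp_prod, unsigned int n)
	{
		return (rsp_prod + n);
	}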
*/ - xnfp->running = B_TRUE; + xnfp->xnf_running = B_TRUE; - mutex_exit(&xnfp->txlock); - mutex_exit(&xnfp->intrlock); + mutex_exit(&xnfp->xnf_txlock); + mutex_exit(&xnfp->xnf_intrlock); return (0); } @@ -1405,16 +1394,16 @@ xnf_stop(void *arg) #ifdef XNF_DEBUG if (xnfdebug & XNF_DEBUG_TRACE) printf("xnf%d stop(0x%p)\n", - ddi_get_instance(xnfp->devinfo), (void *)xnfp); + ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp); #endif - mutex_enter(&xnfp->intrlock); - mutex_enter(&xnfp->txlock); + mutex_enter(&xnfp->xnf_intrlock); + mutex_enter(&xnfp->xnf_txlock); - xnfp->running = B_FALSE; + xnfp->xnf_running = B_FALSE; - mutex_exit(&xnfp->txlock); - mutex_exit(&xnfp->intrlock); + mutex_exit(&xnfp->xnf_txlock); + mutex_exit(&xnfp->xnf_intrlock); } /* @@ -1428,30 +1417,203 @@ static void rx_buffer_hang(xnf_t *xnfp, struct xnf_buffer_desc *bdesc) { volatile netif_rx_request_t *reqp; - RING_IDX hang_ix; - grant_ref_t ref; - domid_t oeid; + RING_IDX hang_ix; + grant_ref_t ref; + domid_t oeid; - oeid = xvdi_get_oeid(xnfp->devinfo); + oeid = xvdi_get_oeid(xnfp->xnf_devinfo); - ASSERT(MUTEX_HELD(&xnfp->intrlock)); - reqp = RING_GET_REQUEST(&xnfp->rx_ring, xnfp->rx_ring.req_prod_pvt); - hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->rx_ring, 0)); - ASSERT(xnfp->rxpkt_bufptr[hang_ix] == NULL); + ASSERT(MUTEX_HELD(&xnfp->xnf_intrlock)); + reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring, + xnfp->xnf_rx_ring.req_prod_pvt); + hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0)); + ASSERT(xnfp->xnf_rxpkt_bufptr[hang_ix] == NULL); if (bdesc->grant_ref == GRANT_INVALID_REF) { - ref = gnttab_claim_grant_reference(&xnfp->gref_rx_head); + ref = gnttab_claim_grant_reference(&xnfp->xnf_gref_rx_head); ASSERT((signed short)ref >= 0); bdesc->grant_ref = ref; - gnttab_grant_foreign_transfer_ref(ref, oeid); + if (xnfp->xnf_rx_hvcopy) { + pfn_t pfn = xnf_btop(bdesc->buf_phys); + mfn_t mfn = pfn_to_mfn(pfn); + + gnttab_grant_foreign_access_ref(ref, oeid, mfn, 0); + } else { + gnttab_grant_foreign_transfer_ref(ref, oeid); + } } reqp->id = hang_ix; reqp->gref = bdesc->grant_ref; bdesc->id = hang_ix; - xnfp->rxpkt_bufptr[hang_ix] = bdesc; + xnfp->xnf_rxpkt_bufptr[hang_ix] = bdesc; membar_producer(); - xnfp->rx_ring.req_prod_pvt++; + xnfp->xnf_rx_ring.req_prod_pvt++; } +static mblk_t * +xnf_process_hvcopy_recv(xnf_t *xnfp) +{ + netif_rx_response_t *rxpkt; + mblk_t *mp, *head, *tail; + struct xnf_buffer_desc *bdesc; + boolean_t hwcsum = B_FALSE, notify, work_to_do; + size_t len; + + /* + * in loop over unconsumed responses, we do: + * 1. get a response + * 2. take corresponding buffer off recv. ring + * 3. indicate this by setting slot to NULL + * 4. create a new message and + * 5. copy data in, adjust ptr + * + * outside loop: + * 7. make sure no more data has arrived; kick HV + */ + + head = tail = NULL; + +loop: + while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) { + + /* 1. */ + rxpkt = RING_GET_RESPONSE(&xnfp->xnf_rx_ring, + xnfp->xnf_rx_ring.rsp_cons); + + DTRACE_PROBE4(got_PKT, int, (int)rxpkt->id, int, + (int)rxpkt->offset, + int, (int)rxpkt->flags, int, (int)rxpkt->status); + + /* + * 2. 
+ * Take buffer off of receive ring + */ + hwcsum = B_FALSE; + bdesc = xnfp->xnf_rxpkt_bufptr[rxpkt->id]; + /* 3 */ + xnfp->xnf_rxpkt_bufptr[rxpkt->id] = NULL; + ASSERT(bdesc->id == rxpkt->id); + if (rxpkt->status <= 0) { + DTRACE_PROBE4(pkt_status_negative, int, rxpkt->status, + char *, bdesc->buf, int, rxpkt->offset, + char *, ((char *)bdesc->buf) + rxpkt->offset); + mp = NULL; + xnfp->xnf_stat_errrx++; + if (rxpkt->status == 0) + xnfp->xnf_stat_runt++; + if (rxpkt->status == NETIF_RSP_ERROR) + xnfp->xnf_stat_mac_rcv_error++; + if (rxpkt->status == NETIF_RSP_DROPPED) + xnfp->xnf_stat_norxbuf++; + /* + * re-hang the buffer + */ + rx_buffer_hang(xnfp, bdesc); + } else { + grant_ref_t ref = bdesc->grant_ref; + struct xnf_buffer_desc *new_bdesc; + unsigned long off = rxpkt->offset; + + DTRACE_PROBE4(pkt_status_ok, int, rxpkt->status, + char *, bdesc->buf, int, rxpkt->offset, + char *, ((char *)bdesc->buf) + rxpkt->offset); + len = rxpkt->status; + ASSERT(off + len <= PAGEOFFSET); + if (ref == GRANT_INVALID_REF) { + mp = NULL; + new_bdesc = bdesc; + cmn_err(CE_WARN, "Bad rx grant reference %d " + "from dom %d", ref, + xvdi_get_oeid(xnfp->xnf_devinfo)); + goto luckless; + } + /* + * Release ref which we'll be re-claiming in + * rx_buffer_hang(). + */ + bdesc->grant_ref = GRANT_INVALID_REF; + (void) gnttab_end_foreign_access_ref(ref, 0); + gnttab_release_grant_reference(&xnfp->xnf_gref_rx_head, + ref); + if (rxpkt->flags & NETRXF_data_validated) + hwcsum = B_TRUE; + + /* + * XXPV for the initial implementation of HVcopy, + * create a new msg and copy in the data + */ + /* 4. */ + if ((mp = allocb(len, BPRI_MED)) == NULL) { + /* + * Couldn't get buffer to copy to, + * drop this data, and re-hang + * the buffer on the ring. + */ + xnfp->xnf_stat_norxbuf++; + DTRACE_PROBE(alloc_nix); + } else { + /* 5. */ + DTRACE_PROBE(alloc_ok); + bcopy(bdesc->buf + off, mp->b_wptr, + len); + mp->b_wptr += len; + } + new_bdesc = bdesc; +luckless: + + /* Re-hang old or hang new buffer. */ + rx_buffer_hang(xnfp, new_bdesc); + } + if (mp) { + if (hwcsum) { + /* + * See comments in xnf_process_recv(). + */ + + (void) hcksum_assoc(mp, NULL, + NULL, 0, 0, 0, 0, + HCK_FULLCKSUM | + HCK_FULLCKSUM_OK, + 0); + xnfp->xnf_stat_rx_cksum_no_need++; + } + if (head == NULL) { + head = tail = mp; + } else { + tail->b_next = mp; + tail = mp; + } + + ASSERT(mp->b_next == NULL); + + xnfp->xnf_stat_ipackets++; + xnfp->xnf_stat_rbytes += len; + } + + xnfp->xnf_rx_ring.rsp_cons++; + + xnfp->xnf_stat_hvcopy_packet_processed++; + } + + /* 7. */ + /* + * Has more data come in since we started? + */ + /* LINTED: constant in conditional context */ + RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_rx_ring, work_to_do); + if (work_to_do) + goto loop; + + /* + * Indicate to the backend that we have re-filled the receive + * ring. 
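Steps 4 and 5 of the loop above reduce to cloning the payload out of the granted page so the ring buffer can be re-hung immediately. A stripped-down sketch of that copy step using the same STREAMS calls; when allocation fails the caller drops the frame and re-hangs the buffer regardless, exactly as the driver does:

	#include <sys/stream.h>
	#include <sys/systm.h>

	/*
	 * Copy one received frame out of a ring buffer into a fresh
	 * mblk.  Returns NULL if no buffer could be allocated.
	 */
	static mblk_t *
	copy_one_frame(char *ring_buf, unsigned long off, size_t len)
	{
		mblk_t *mp;

		if ((mp = allocb(len, BPRI_MED)) == NULL)
			return (NULL);
		bcopy(ring_buf + off, mp->b_wptr, len);
		mp->b_wptr += len;	/* mark the payload as valid */
		return (mp);
	}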
+ */ + /* LINTED: constant in conditional context */ + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify); + if (notify) + ec_notify_via_evtchn(xnfp->xnf_evtchn); + + return (head); +} /* Process all queued received packets */ static mblk_t * @@ -1468,27 +1630,27 @@ xnf_process_recv(xnf_t *xnfp) head = tail = NULL; loop: - while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->rx_ring)) { + while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) { - rxpkt = RING_GET_RESPONSE(&xnfp->rx_ring, - xnfp->rx_ring.rsp_cons); + rxpkt = RING_GET_RESPONSE(&xnfp->xnf_rx_ring, + xnfp->xnf_rx_ring.rsp_cons); /* * Take buffer off of receive ring */ hwcsum = B_FALSE; - bdesc = xnfp->rxpkt_bufptr[rxpkt->id]; - xnfp->rxpkt_bufptr[rxpkt->id] = NULL; + bdesc = xnfp->xnf_rxpkt_bufptr[rxpkt->id]; + xnfp->xnf_rxpkt_bufptr[rxpkt->id] = NULL; ASSERT(bdesc->id == rxpkt->id); if (rxpkt->status <= 0) { mp = NULL; - xnfp->stat_errrcv++; + xnfp->xnf_stat_errrx++; if (rxpkt->status == 0) - xnfp->stat_runt++; + xnfp->xnf_stat_runt++; if (rxpkt->status == NETIF_RSP_ERROR) - xnfp->stat_mac_rcv_error++; + xnfp->xnf_stat_mac_rcv_error++; if (rxpkt->status == NETIF_RSP_DROPPED) - xnfp->stat_norcvbuf++; + xnfp->xnf_stat_norxbuf++; /* * re-hang the buffer */ @@ -1506,7 +1668,7 @@ loop: new_bdesc = bdesc; cmn_err(CE_WARN, "Bad rx grant reference %d " "from dom %d", ref, - xvdi_get_oeid(xnfp->devinfo)); + xvdi_get_oeid(xnfp->xnf_devinfo)); goto luckless; } bdesc->grant_ref = GRANT_INVALID_REF; @@ -1514,13 +1676,15 @@ loop: ASSERT(mfn != MFN_INVALID); ASSERT(hat_getpfnum(kas.a_hat, bdesc->buf) == PFN_INVALID); - gnttab_release_grant_reference(&xnfp->gref_rx_head, + + gnttab_release_grant_reference(&xnfp->xnf_gref_rx_head, ref); reassign_pfn(xnf_btop(bdesc->buf_phys), mfn); hat_devload(kas.a_hat, bdesc->buf, PAGESIZE, xnf_btop(bdesc->buf_phys), PROT_READ | PROT_WRITE, HAT_LOAD); balloon_drv_added(1); + if (rxpkt->flags & NETRXF_data_validated) hwcsum = B_TRUE; if (len <= xnf_rx_bcopy_thresh) { @@ -1534,14 +1698,14 @@ loop: * We send a pointer to this data upstream; * we need a new buffer to replace this one. */ - mutex_enter(&xnfp->rx_buf_mutex); + mutex_enter(&xnfp->xnf_rx_buf_mutex); new_bdesc = xnf_get_buffer(xnfp); if (new_bdesc != NULL) { - xnfp->rx_bufs_outstanding++; + xnfp->xnf_rx_bufs_outstanding++; } else { - xnfp->stat_rx_no_ringbuf++; + xnfp->xnf_stat_rx_no_ringbuf++; } - mutex_exit(&xnfp->rx_buf_mutex); + mutex_exit(&xnfp->xnf_rx_buf_mutex); } if (new_bdesc == NULL) { @@ -1556,7 +1720,7 @@ loop: * drop this data, and re-hang * the buffer on the ring. */ - xnfp->stat_norcvbuf++; + xnfp->xnf_stat_norxbuf++; } else { bcopy(bdesc->buf + off, mp->b_wptr, len); @@ -1579,7 +1743,7 @@ loop: * Couldn't get mblk to pass recv data * up with, free the old ring buffer */ - xnfp->stat_norcvbuf++; + xnfp->xnf_stat_norxbuf++; xnf_rcv_complete(bdesc); goto luckless; } @@ -1624,7 +1788,7 @@ luckless: HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0); - xnfp->stat_rx_cksum_no_need++; + xnfp->xnf_stat_rx_cksum_no_need++; } if (head == NULL) { head = tail = mp; @@ -1635,18 +1799,18 @@ luckless: ASSERT(mp->b_next == NULL); - xnfp->stat_ipackets++; - xnfp->stat_rbytes += len; + xnfp->xnf_stat_ipackets++; + xnfp->xnf_stat_rbytes += len; } - xnfp->rx_ring.rsp_cons++; + xnfp->xnf_rx_ring.rsp_cons++; } /* * Has more data come in since we started? 
*/ /* LINTED: constant in conditional context */ - RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->rx_ring, work_to_do); + RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_rx_ring, work_to_do); if (work_to_do) goto loop; @@ -1655,9 +1819,9 @@ luckless: * ring. */ /* LINTED: constant in conditional context */ - RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->rx_ring, notify); + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify); if (notify) - ec_notify_via_evtchn(xnfp->evtchn); + ec_notify_via_evtchn(xnfp->xnf_evtchn); return (head); } @@ -1671,13 +1835,13 @@ xnf_rcv_complete(struct xnf_buffer_desc *bdesc) long cnt; /* One less outstanding receive buffer */ - mutex_enter(&xnfp->rx_buf_mutex); - --xnfp->rx_bufs_outstanding; + mutex_enter(&xnfp->xnf_rx_buf_mutex); + --xnfp->xnf_rx_bufs_outstanding; /* * Return buffer to the free list, unless the free list is getting - * too large. XXX - this threshold may need tuning. + * too large. XXPV - this threshold may need tuning. */ - if (xnfp->rx_descs_free < xnf_recv_bufs_lowat) { + if (xnfp->xnf_rx_descs_free < xnf_rx_bufs_lowat) { /* * Unmap the page, and hand the machine page back * to xen so it can be re-used as a backend net buffer. @@ -1689,17 +1853,17 @@ xnf_rcv_complete(struct xnf_buffer_desc *bdesc) "hypervisor\n"); } - bdesc->next = xnfp->free_list; - xnfp->free_list = bdesc; - xnfp->rx_descs_free++; - mutex_exit(&xnfp->rx_buf_mutex); + bdesc->next = xnfp->xnf_free_list; + xnfp->xnf_free_list = bdesc; + xnfp->xnf_rx_descs_free++; + mutex_exit(&xnfp->xnf_rx_buf_mutex); } else { /* * We can return everything here since we have a free buffer * that we have not given the backing page for back to xen. */ - --xnfp->recv_buffer_count; - mutex_exit(&xnfp->rx_buf_mutex); + --xnfp->xnf_rx_buffer_count; + mutex_exit(&xnfp->xnf_rx_buf_mutex); (void) ddi_dma_unbind_handle(bdesc->dma_handle); ddi_dma_mem_free(&bdesc->acc_handle); ddi_dma_free_handle(&bdesc->dma_handle); @@ -1713,7 +1877,7 @@ xnf_rcv_complete(struct xnf_buffer_desc *bdesc) static int xnf_alloc_dma_resources(xnf_t *xnfp) { - dev_info_t *devinfo = xnfp->devinfo; + dev_info_t *devinfo = xnfp->xnf_devinfo; int i; size_t len; ddi_dma_cookie_t dma_cookie; @@ -1722,10 +1886,10 @@ xnf_alloc_dma_resources(xnf_t *xnfp) int rc; caddr_t rptr; - xnfp->n_recvs = NET_RX_RING_SIZE; - xnfp->max_recv_bufs = xnf_recv_bufs_hiwat; + xnfp->xnf_n_rx = NET_RX_RING_SIZE; + xnfp->xnf_max_rx_bufs = xnf_rx_bufs_hiwat; - xnfp->n_xmits = NET_TX_RING_SIZE; + xnfp->xnf_n_tx = NET_TX_RING_SIZE; /* * The code below allocates all the DMA data structures that @@ -1734,10 +1898,10 @@ xnf_alloc_dma_resources(xnf_t *xnfp) * First allocate handles for mapping (virtual address) pointers to * transmit data buffers to physical addresses */ - for (i = 0; i < xnfp->n_xmits; i++) { + for (i = 0; i < xnfp->xnf_n_tx; i++) { if ((rc = ddi_dma_alloc_handle(devinfo, &tx_buffer_dma_attr, DDI_DMA_SLEEP, 0, - &xnfp->tx_pkt_info[i].dma_handle)) != DDI_SUCCESS) + &xnfp->xnf_tx_pkt_info[i].dma_handle)) != DDI_SUCCESS) return (DDI_FAILURE); } @@ -1745,25 +1909,25 @@ xnf_alloc_dma_resources(xnf_t *xnfp) * Allocate page for the transmit descriptor ring. 
*/ if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr, - DDI_DMA_SLEEP, 0, &xnfp->tx_ring_dma_handle) != DDI_SUCCESS) + DDI_DMA_SLEEP, 0, &xnfp->xnf_tx_ring_dma_handle) != DDI_SUCCESS) goto alloc_error; - if (ddi_dma_mem_alloc(xnfp->tx_ring_dma_handle, + if (ddi_dma_mem_alloc(xnfp->xnf_tx_ring_dma_handle, PAGESIZE, &accattr, DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, 0, &rptr, &len, - &xnfp->tx_ring_dma_acchandle) != DDI_SUCCESS) { - ddi_dma_free_handle(&xnfp->tx_ring_dma_handle); - xnfp->tx_ring_dma_handle = NULL; + &xnfp->xnf_tx_ring_dma_acchandle) != DDI_SUCCESS) { + ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle); + xnfp->xnf_tx_ring_dma_handle = NULL; goto alloc_error; } - if ((rc = ddi_dma_addr_bind_handle(xnfp->tx_ring_dma_handle, NULL, + if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_tx_ring_dma_handle, NULL, rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) { - ddi_dma_mem_free(&xnfp->tx_ring_dma_acchandle); - ddi_dma_free_handle(&xnfp->tx_ring_dma_handle); - xnfp->tx_ring_dma_handle = NULL; - xnfp->tx_ring_dma_acchandle = NULL; + ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle); + ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle); + xnfp->xnf_tx_ring_dma_handle = NULL; + xnfp->xnf_tx_ring_dma_acchandle = NULL; if (rc == DDI_DMA_NORESOURCES) goto alloc_error; else @@ -1775,32 +1939,32 @@ xnf_alloc_dma_resources(xnf_t *xnfp) /* LINTED: constant in conditional context */ SHARED_RING_INIT((netif_tx_sring_t *)rptr); /* LINTED: constant in conditional context */ - FRONT_RING_INIT(&xnfp->tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE); - xnfp->tx_ring_phys_addr = dma_cookie.dmac_laddress; + FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE); + xnfp->xnf_tx_ring_phys_addr = dma_cookie.dmac_laddress; /* * Allocate page for the receive descriptor ring. 
*/ if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr, - DDI_DMA_SLEEP, 0, &xnfp->rx_ring_dma_handle) != DDI_SUCCESS) + DDI_DMA_SLEEP, 0, &xnfp->xnf_rx_ring_dma_handle) != DDI_SUCCESS) goto alloc_error; - if (ddi_dma_mem_alloc(xnfp->rx_ring_dma_handle, + if (ddi_dma_mem_alloc(xnfp->xnf_rx_ring_dma_handle, PAGESIZE, &accattr, DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, 0, &rptr, &len, - &xnfp->rx_ring_dma_acchandle) != DDI_SUCCESS) { - ddi_dma_free_handle(&xnfp->rx_ring_dma_handle); - xnfp->rx_ring_dma_handle = NULL; + &xnfp->xnf_rx_ring_dma_acchandle) != DDI_SUCCESS) { + ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle); + xnfp->xnf_rx_ring_dma_handle = NULL; goto alloc_error; } - if ((rc = ddi_dma_addr_bind_handle(xnfp->rx_ring_dma_handle, NULL, + if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_rx_ring_dma_handle, NULL, rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) { - ddi_dma_mem_free(&xnfp->rx_ring_dma_acchandle); - ddi_dma_free_handle(&xnfp->rx_ring_dma_handle); - xnfp->rx_ring_dma_handle = NULL; - xnfp->rx_ring_dma_acchandle = NULL; + ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle); + ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle); + xnfp->xnf_rx_ring_dma_handle = NULL; + xnfp->xnf_rx_ring_dma_acchandle = NULL; if (rc == DDI_DMA_NORESOURCES) goto alloc_error; else @@ -1812,26 +1976,26 @@ xnf_alloc_dma_resources(xnf_t *xnfp) /* LINTED: constant in conditional context */ SHARED_RING_INIT((netif_rx_sring_t *)rptr); /* LINTED: constant in conditional context */ - FRONT_RING_INIT(&xnfp->rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE); - xnfp->rx_ring_phys_addr = dma_cookie.dmac_laddress; + FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE); + xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress; /* * Preallocate receive buffers for each receive descriptor. 
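Both descriptor rings follow the same bring-up pattern: allocate one DMA-able page, initialise it as a shared ring, wrap it in a frontend view, and record its physical address so the page can later be granted to the backend. Reduced to the essential calls (a sketch built on the macros from Xen's public headers; the grant and xenstore advertisement happen elsewhere in the driver):

	#include <sys/types.h>
	#include <xen/public/io/netif.h>	/* ring types and macros */

	static void
	tx_ring_bringup(caddr_t rptr, netif_tx_front_ring_t *front)
	{
		netif_tx_sring_t *sring = (netif_tx_sring_t *)rptr;

		/* LINTED: constant in conditional context */
		SHARED_RING_INIT(sring);	/* zero the shared indices */
		/* LINTED: constant in conditional context */
		FRONT_RING_INIT(front, sring, PAGESIZE);
	}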
*/ /* Set up the "free list" of receive buffer descriptors */ - for (i = 0; i < xnfp->n_recvs; i++) { + for (i = 0; i < xnfp->xnf_n_rx; i++) { if ((bdesc = xnf_alloc_buffer(xnfp)) == NULL) goto alloc_error; - bdesc->next = xnfp->free_list; - xnfp->free_list = bdesc; + bdesc->next = xnfp->xnf_free_list; + xnfp->xnf_free_list = bdesc; } return (DDI_SUCCESS); alloc_error: cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory", - ddi_get_instance(xnfp->devinfo)); + ddi_get_instance(xnfp->xnf_devinfo)); error: xnf_release_dma_resources(xnfp); return (DDI_FAILURE); @@ -1851,28 +2015,28 @@ xnf_release_dma_resources(xnf_t *xnfp) * Free receive buffers which are currently associated with * descriptors */ - for (i = 0; i < xnfp->n_recvs; i++) { + for (i = 0; i < xnfp->xnf_n_rx; i++) { struct xnf_buffer_desc *bp; - if ((bp = xnfp->rxpkt_bufptr[i]) == NULL) + if ((bp = xnfp->xnf_rxpkt_bufptr[i]) == NULL) continue; xnf_free_buffer(bp); - xnfp->rxpkt_bufptr[i] = NULL; + xnfp->xnf_rxpkt_bufptr[i] = NULL; } /* Free the receive ring buffer */ - if (xnfp->rx_ring_dma_acchandle != NULL) { - (void) ddi_dma_unbind_handle(xnfp->rx_ring_dma_handle); - ddi_dma_mem_free(&xnfp->rx_ring_dma_acchandle); - ddi_dma_free_handle(&xnfp->rx_ring_dma_handle); - xnfp->rx_ring_dma_acchandle = NULL; + if (xnfp->xnf_rx_ring_dma_acchandle != NULL) { + (void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle); + ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle); + ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle); + xnfp->xnf_rx_ring_dma_acchandle = NULL; } /* Free the transmit ring buffer */ - if (xnfp->tx_ring_dma_acchandle != NULL) { - (void) ddi_dma_unbind_handle(xnfp->tx_ring_dma_handle); - ddi_dma_mem_free(&xnfp->tx_ring_dma_acchandle); - ddi_dma_free_handle(&xnfp->tx_ring_dma_handle); - xnfp->tx_ring_dma_acchandle = NULL; + if (xnfp->xnf_tx_ring_dma_acchandle != NULL) { + (void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle); + ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle); + ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle); + xnfp->xnf_tx_ring_dma_acchandle = NULL; } } @@ -1881,12 +2045,13 @@ xnf_release_mblks(xnf_t *xnfp) { int i; - for (i = 0; i < xnfp->n_xmits; i++) { - if (xnfp->tx_pkt_info[i].mp == NULL) + for (i = 0; i < xnfp->xnf_n_tx; i++) { + if (xnfp->xnf_tx_pkt_info[i].mp == NULL) continue; - freemsg(xnfp->tx_pkt_info[i].mp); - xnfp->tx_pkt_info[i].mp = NULL; - (void) ddi_dma_unbind_handle(xnfp->tx_pkt_info[i].dma_handle); + freemsg(xnfp->xnf_tx_pkt_info[i].mp); + xnfp->xnf_tx_pkt_info[i].mp = NULL; + (void) ddi_dma_unbind_handle( + xnfp->xnf_tx_pkt_info[i].dma_handle); } } @@ -1896,15 +2061,15 @@ xnf_release_mblks(xnf_t *xnfp) * Called with the tx_buf_mutex held. 
*/ static struct xnf_buffer_desc * -xnf_get_xmit_buffer(xnf_t *xnfp) +xnf_get_tx_buffer(xnf_t *xnfp) { struct xnf_buffer_desc *bdesc; - bdesc = xnfp->xmit_free_list; + bdesc = xnfp->xnf_tx_free_list; if (bdesc != NULL) { - xnfp->xmit_free_list = bdesc->next; + xnfp->xnf_tx_free_list = bdesc->next; } else { - bdesc = xnf_alloc_xmit_buffer(xnfp); + bdesc = xnf_alloc_tx_buffer(xnfp); } return (bdesc); } @@ -1919,10 +2084,10 @@ xnf_get_buffer(xnf_t *xnfp) { struct xnf_buffer_desc *bdesc; - bdesc = xnfp->free_list; + bdesc = xnfp->xnf_free_list; if (bdesc != NULL) { - xnfp->free_list = bdesc->next; - xnfp->rx_descs_free--; + xnfp->xnf_free_list = bdesc->next; + xnfp->xnf_rx_descs_free--; } else { bdesc = xnf_alloc_buffer(xnfp); } @@ -1933,32 +2098,45 @@ xnf_get_buffer(xnf_t *xnfp) * Free a xmit buffer back to the xmit free list */ static void -xnf_free_xmit_buffer(struct xnf_buffer_desc *bp) +xnf_free_tx_buffer(struct xnf_buffer_desc *bp) { xnf_t *xnfp = bp->xnfp; - mutex_enter(&xnfp->tx_buf_mutex); - bp->next = xnfp->xmit_free_list; - xnfp->xmit_free_list = bp; - mutex_exit(&xnfp->tx_buf_mutex); + mutex_enter(&xnfp->xnf_tx_buf_mutex); + bp->next = xnfp->xnf_tx_free_list; + xnfp->xnf_tx_free_list = bp; + mutex_exit(&xnfp->xnf_tx_buf_mutex); } /* * Put a buffer descriptor onto the head of the free list. + * for page-flip: * We can't really free these buffers back to the kernel * since we have given away their backing page to be used * by the back end net driver. + * for hvcopy: + * release all the memory */ static void -xnf_free_buffer(struct xnf_buffer_desc *bp) +xnf_free_buffer(struct xnf_buffer_desc *bdesc) { - xnf_t *xnfp = bp->xnfp; + xnf_t *xnfp = bdesc->xnfp; - mutex_enter(&xnfp->rx_buf_mutex); - bp->next = xnfp->free_list; - xnfp->free_list = bp; - xnfp->rx_descs_free++; - mutex_exit(&xnfp->rx_buf_mutex); + mutex_enter(&xnfp->xnf_rx_buf_mutex); + if (xnfp->xnf_rx_hvcopy) { + if (ddi_dma_unbind_handle(bdesc->dma_handle) != DDI_SUCCESS) + goto out; + ddi_dma_mem_free(&bdesc->acc_handle); + ddi_dma_free_handle(&bdesc->dma_handle); + kmem_free(bdesc, sizeof (*bdesc)); + xnfp->xnf_rx_buffer_count--; + } else { + bdesc->next = xnfp->xnf_free_list; + xnfp->xnf_free_list = bdesc; + xnfp->xnf_rx_descs_free++; + } +out: + mutex_exit(&xnfp->xnf_rx_buf_mutex); } /* @@ -1966,7 +2144,7 @@ xnf_free_buffer(struct xnf_buffer_desc *bp) * keep track of the buffer. Called with tx_buf_mutex held. 
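The free_rtn field initialised in xnf_alloc_buffer() below is what makes loaning receive buffers upstream possible: an esballoc-style allocation hands the stack a pointer into the driver's own page, and the registered routine (xnf_rcv_complete(), or presumably xnf_copy_rcv_complete() on the hvcopy path) reclaims the descriptor when the mblk is freed. A hedged sketch of the pattern with desballoc(); the frtn_t must outlive the mblk, which is why it is embedded in the descriptor rather than kept on the stack:

	#include <sys/stream.h>

	/* Loan a driver-owned receive buffer upstream. */
	static mblk_t *
	loan_buffer(struct xnf_buffer_desc *bdesc, size_t off, size_t len)
	{
		mblk_t *mp;

		mp = desballoc((unsigned char *)bdesc->buf + off, len,
		    BPRI_MED, &bdesc->free_rtn);
		if (mp != NULL)
			mp->b_wptr += len;	/* payload already in place */
		return (mp);
	}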
*/ static struct xnf_buffer_desc * -xnf_alloc_xmit_buffer(xnf_t *xnfp) +xnf_alloc_tx_buffer(xnf_t *xnfp) { struct xnf_buffer_desc *bdesc; size_t len; @@ -1975,7 +2153,7 @@ xnf_alloc_xmit_buffer(xnf_t *xnfp) return (NULL); /* allocate a DMA access handle for receive buffer */ - if (ddi_dma_alloc_handle(xnfp->devinfo, &tx_buffer_dma_attr, + if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &tx_buffer_dma_attr, 0, 0, &bdesc->dma_handle) != DDI_SUCCESS) goto failure; @@ -1983,14 +2161,14 @@ xnf_alloc_xmit_buffer(xnf_t *xnfp) if (ddi_dma_mem_alloc(bdesc->dma_handle, PAGESIZE, &data_accattr, DDI_DMA_STREAMING, 0, 0, &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS) - goto late_failure; + goto failure_1; bdesc->xnfp = xnfp; - xnfp->xmit_buffer_count++; + xnfp->xnf_tx_buffer_count++; return (bdesc); -late_failure: +failure_1: ddi_dma_free_handle(&bdesc->dma_handle); failure: @@ -2012,14 +2190,14 @@ xnf_alloc_buffer(xnf_t *xnfp) long cnt; pfn_t pfn; - if (xnfp->recv_buffer_count >= xnfp->max_recv_bufs) + if (xnfp->xnf_rx_buffer_count >= xnfp->xnf_max_rx_bufs) return (NULL); if ((bdesc = kmem_zalloc(sizeof (*bdesc), KM_NOSLEEP)) == NULL) return (NULL); /* allocate a DMA access handle for receive buffer */ - if (ddi_dma_alloc_handle(xnfp->devinfo, &rx_buffer_dma_attr, + if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &rx_buffer_dma_attr, 0, 0, &bdesc->dma_handle) != DDI_SUCCESS) goto failure; @@ -2027,39 +2205,46 @@ xnf_alloc_buffer(xnf_t *xnfp) if (ddi_dma_mem_alloc(bdesc->dma_handle, PAGESIZE, &data_accattr, DDI_DMA_STREAMING, 0, 0, &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS) - goto late_failure; + goto failure_1; /* bind to virtual address of buffer to get physical address */ if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL, bdesc->buf, PAGESIZE, DDI_DMA_READ | DDI_DMA_STREAMING, DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED) - goto late_late_failure; + goto failure_2; bdesc->buf_phys = dma_cookie.dmac_laddress; bdesc->xnfp = xnfp; - bdesc->free_rtn.free_func = xnf_rcv_complete; + if (xnfp->xnf_rx_hvcopy) { + bdesc->free_rtn.free_func = xnf_copy_rcv_complete; + } else { + bdesc->free_rtn.free_func = xnf_rcv_complete; + } bdesc->free_rtn.free_arg = (char *)bdesc; bdesc->grant_ref = GRANT_INVALID_REF; ASSERT(ncookies == 1); - xnfp->recv_buffer_count++; - /* - * Unmap the page, and hand the machine page back - * to xen so it can be used as a backend net buffer. - */ - pfn = xnf_btop(bdesc->buf_phys); - cnt = balloon_free_pages(1, NULL, bdesc->buf, &pfn); - if (cnt != 1) { - cmn_err(CE_WARN, "unable to give a page back to the " - "hypervisor\n"); + xnfp->xnf_rx_buffer_count++; + + if (!xnfp->xnf_rx_hvcopy) { + /* + * Unmap the page, and hand the machine page back + * to xen so it can be used as a backend net buffer. + */ + pfn = xnf_btop(bdesc->buf_phys); + cnt = balloon_free_pages(1, NULL, bdesc->buf, &pfn); + if (cnt != 1) { + cmn_err(CE_WARN, "unable to give a page back to the " + "hypervisor\n"); + } } return (bdesc); -late_late_failure: +failure_2: ddi_dma_mem_free(&bdesc->acc_handle); -late_failure: +failure_1: ddi_dma_free_handle(&bdesc->dma_handle); failure: @@ -2067,40 +2252,129 @@ failure: return (NULL); } +/* + * Statistics. 
+ */ +static char *xnf_aux_statistics[] = { + "tx_cksum_deferred", + "rx_cksum_no_need", + "interrupts", + "unclaimed_interrupts", + "tx_pullup", + "tx_pagebndry", + "tx_attempt", + "rx_no_ringbuf", + "hvcopy_packet_processed", +}; + +static int +xnf_kstat_aux_update(kstat_t *ksp, int flag) +{ + xnf_t *xnfp; + kstat_named_t *knp; + + if (flag != KSTAT_READ) + return (EACCES); + + xnfp = ksp->ks_private; + knp = ksp->ks_data; + + /* + * Assignment order must match that of the names in + * xnf_aux_statistics. + */ + (knp++)->value.ui64 = xnfp->xnf_stat_tx_cksum_deferred; + (knp++)->value.ui64 = xnfp->xnf_stat_rx_cksum_no_need; + + (knp++)->value.ui64 = xnfp->xnf_stat_interrupts; + (knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts; + (knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup; + (knp++)->value.ui64 = xnfp->xnf_stat_tx_pagebndry; + (knp++)->value.ui64 = xnfp->xnf_stat_tx_attempt; + (knp++)->value.ui64 = xnfp->xnf_stat_rx_no_ringbuf; + + (knp++)->value.ui64 = xnfp->xnf_stat_hvcopy_packet_processed; + + return (0); +} + +static boolean_t +xnf_kstat_init(xnf_t *xnfp) +{ + int nstat = sizeof (xnf_aux_statistics) / + sizeof (xnf_aux_statistics[0]); + char **cp = xnf_aux_statistics; + kstat_named_t *knp; + + /* + * Create and initialise kstats. + */ + if ((xnfp->xnf_kstat_aux = kstat_create("xnf", + ddi_get_instance(xnfp->xnf_devinfo), + "aux_statistics", "net", KSTAT_TYPE_NAMED, + nstat, 0)) == NULL) + return (B_FALSE); + + xnfp->xnf_kstat_aux->ks_private = xnfp; + xnfp->xnf_kstat_aux->ks_update = xnf_kstat_aux_update; + + knp = xnfp->xnf_kstat_aux->ks_data; + while (nstat > 0) { + kstat_named_init(knp, *cp, KSTAT_DATA_UINT64); + + knp++; + cp++; + nstat--; + } + + kstat_install(xnfp->xnf_kstat_aux); + + return (B_TRUE); +} + static int xnf_stat(void *arg, uint_t stat, uint64_t *val) { xnf_t *xnfp = arg; - mutex_enter(&xnfp->intrlock); - mutex_enter(&xnfp->txlock); + mutex_enter(&xnfp->xnf_intrlock); + mutex_enter(&xnfp->xnf_txlock); -#define map_stat(q, r) \ +#define mac_stat(q, r) \ case (MAC_STAT_##q): \ - *val = xnfp->stat_##r; \ + *val = xnfp->xnf_stat_##r; \ + break + +#define ether_stat(q, r) \ + case (ETHER_STAT_##q): \ + *val = xnfp->xnf_stat_##r; \ break switch (stat) { - map_stat(IPACKETS, ipackets); - map_stat(OPACKETS, opackets); - map_stat(RBYTES, rbytes); - map_stat(OBYTES, obytes); - map_stat(NORCVBUF, norcvbuf); - map_stat(IERRORS, errrcv); - map_stat(NOXMTBUF, xmit_defer); + mac_stat(IPACKETS, ipackets); + mac_stat(OPACKETS, opackets); + mac_stat(RBYTES, rbytes); + mac_stat(OBYTES, obytes); + mac_stat(NORCVBUF, norxbuf); + mac_stat(IERRORS, errrx); + mac_stat(NOXMTBUF, tx_defer); + + ether_stat(MACRCV_ERRORS, mac_rcv_error); + ether_stat(TOOSHORT_ERRORS, runt); default: - mutex_exit(&xnfp->txlock); - mutex_exit(&xnfp->intrlock); + mutex_exit(&xnfp->xnf_txlock); + mutex_exit(&xnfp->xnf_intrlock); return (ENOTSUP); } -#undef map_stat +#undef mac_stat +#undef ether_stat - mutex_exit(&xnfp->txlock); - mutex_exit(&xnfp->intrlock); + mutex_exit(&xnfp->xnf_txlock); + mutex_exit(&xnfp->xnf_intrlock); return (0); } @@ -2134,7 +2408,7 @@ xnf_resources(void *arg) mrf.mrf_normal_blank_time = 128; /* XXPV dme: see xnf_blank() */ mrf.mrf_normal_pkt_count = 8; /* XXPV dme: see xnf_blank() */ - xnfp->rx_handle = mac_resource_add(xnfp->mh, + xnfp->xnf_rx_handle = mac_resource_add(xnfp->xnf_mh, (mac_resource_t *)&mrf); } @@ -2166,7 +2440,7 @@ xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data) * not zero. 
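xnf_kstat_init() above is a compact instance of the named-kstat pattern: create the kstat, point ks_private at the driver state, register a ks_update callback that refreshes ks_data on each KSTAT_READ, initialise one named entry per statistic in a fixed order, then install. The same skeleton with two statistics instead of nine (the state type and names are illustrative):

	#include <sys/types.h>
	#include <sys/kstat.h>
	#include <sys/errno.h>

	typedef struct my_state {	/* hypothetical driver state */
		uint64_t ms_tx_pullup;
		uint64_t ms_rx_no_ringbuf;
	} my_state_t;

	static int
	my_kstat_update(kstat_t *ksp, int flag)
	{
		my_state_t *sp = ksp->ks_private;
		kstat_named_t *knp = ksp->ks_data;

		if (flag != KSTAT_READ)
			return (EACCES);
		/* assignment order must match the init order below */
		(knp++)->value.ui64 = sp->ms_tx_pullup;
		knp->value.ui64 = sp->ms_rx_no_ringbuf;
		return (0);
	}

	static boolean_t
	my_kstat_init(my_state_t *sp, int instance)
	{
		kstat_t *ksp;
		kstat_named_t *knp;

		if ((ksp = kstat_create("mydrv", instance, "aux_statistics",
		    "net", KSTAT_TYPE_NAMED, 2, 0)) == NULL)
			return (B_FALSE);

		ksp->ks_private = sp;
		ksp->ks_update = my_kstat_update;

		knp = ksp->ks_data;
		kstat_named_init(knp++, "tx_pullup", KSTAT_DATA_UINT64);
		kstat_named_init(knp, "rx_no_ringbuf", KSTAT_DATA_UINT64);

		kstat_install(ksp);
		return (B_TRUE);
	}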
(In fact, a Solaris dom0 is happy to deal * with a checksum of zero, but a Linux dom0 is not.) */ - if (xnfp->cksum_offload) + if (xnfp->xnf_cksum_offload) *capab = HCKSUM_INET_PARTIAL; else *capab = 0; @@ -2196,19 +2470,42 @@ oe_state_change(dev_info_t *dip, ddi_eventcookie_t id, switch (new_state) { case XenbusStateConnected: - mutex_enter(&xnfp->intrlock); - mutex_enter(&xnfp->txlock); + mutex_enter(&xnfp->xnf_intrlock); + mutex_enter(&xnfp->xnf_txlock); - xnfp->connected = B_TRUE; - cv_broadcast(&xnfp->cv); + xnfp->xnf_connected = B_TRUE; + cv_broadcast(&xnfp->xnf_cv); - mutex_exit(&xnfp->txlock); - mutex_exit(&xnfp->intrlock); + mutex_exit(&xnfp->xnf_txlock); + mutex_exit(&xnfp->xnf_intrlock); - ec_notify_via_evtchn(xnfp->evtchn); + ec_notify_via_evtchn(xnfp->xnf_evtchn); break; default: break; } } + +/* + * Check whether backend is capable of and willing to talk + * to us via hypervisor copy, as opposed to page flip. + */ +static boolean_t +xnf_hvcopy_peer_status(dev_info_t *devinfo) +{ + int be_rx_copy; + int err; + + err = xenbus_scanf(XBT_NULL, xvdi_get_oename(devinfo), + "feature-rx-copy", "%d", &be_rx_copy); + /* + * If we fail to read the store we assume that the key is + * absent, implying an older domain at the far end. Older + * domains cannot do HV copy (we assume ..). + */ + if (err != 0) + be_rx_copy = 0; + + return (be_rx_copy?B_TRUE:B_FALSE); +} diff --git a/usr/src/uts/common/xen/io/xnf.h b/usr/src/uts/common/xen/io/xnf.h index 7f664ee802..19f7898b0d 100644 --- a/usr/src/uts/common/xen/io/xnf.h +++ b/usr/src/uts/common/xen/io/xnf.h @@ -29,12 +29,6 @@ #pragma ident "%Z%%M% %I% %E% SMI" -#include <sys/types.h> -#include <sys/kstat.h> -#include <sys/hypervisor.h> -#include <xen/public/io/netif.h> -#include <xen/sys/xenbus_impl.h> - #ifdef __cplusplus extern "C" { #endif @@ -94,77 +88,83 @@ struct tx_pktinfo { /* Per network-interface-controller driver private structure */ typedef struct xnf { /* most interesting stuff first to assist debugging */ - dev_info_t *devinfo; /* System per-device info. */ - mac_handle_t mh; /* Nemo per-device info. */ - int rx_bufs_outstanding; - int tx_descs_free; - int rx_descs_free; /* count of free rx bufs */ - int n_xmits; /* No. xmit descriptors */ - int n_recvs; /* No. recv descriptors */ - int n_recv_bufs; /* No. 
recv DMA buffers */ - int tx_start_thresh_regval; - unsigned char mac_addr[ETHERADDRL]; - int max_recv_bufs; - int recv_buffer_count; - int xmit_buffer_count; - - boolean_t connected; - boolean_t running; - - boolean_t cksum_offload; - - uint64_t stat_intr; - uint64_t stat_norcvbuf; - uint64_t stat_errrcv; - - uint64_t stat_xmit_attempt; - uint64_t stat_xmit_pullup; - uint64_t stat_xmit_pagebndry; - uint64_t stat_xmit_defer; - uint64_t stat_rx_no_ringbuf; - uint64_t stat_mac_rcv_error; - uint64_t stat_runt; - - uint64_t stat_ipackets; - uint64_t stat_opackets; - uint64_t stat_rbytes; - uint64_t stat_obytes; - - uint64_t stat_tx_cksum_deferred; - uint64_t stat_rx_cksum_no_need; - - kstat_t *kstat_aux; - - struct xnf_buffer_desc *free_list; - struct xnf_buffer_desc *xmit_free_list; - int tx_pkt_id_list; /* free list of avail pkt ids */ - struct tx_pktinfo tx_pkt_info[NET_TX_RING_SIZE]; - struct xnf_buffer_desc *rxpkt_bufptr[XNF_MAX_RXDESCS]; - - mac_resource_handle_t rx_handle; - ddi_iblock_cookie_t icookie; - kmutex_t tx_buf_mutex; - kmutex_t rx_buf_mutex; - kmutex_t txlock; - kmutex_t intrlock; - boolean_t tx_pages_readonly; - - netif_tx_front_ring_t tx_ring; /* tx interface struct ptr */ - ddi_dma_handle_t tx_ring_dma_handle; - ddi_acc_handle_t tx_ring_dma_acchandle; - paddr_t tx_ring_phys_addr; - grant_ref_t tx_ring_ref; - - netif_rx_front_ring_t rx_ring; /* rx interface struct ptr */ - ddi_dma_handle_t rx_ring_dma_handle; - ddi_acc_handle_t rx_ring_dma_acchandle; - paddr_t rx_ring_phys_addr; - grant_ref_t rx_ring_ref; - - uint16_t evtchn; /* channel to back end ctlr */ - grant_ref_t gref_tx_head; /* tx grant free list */ - grant_ref_t gref_rx_head; /* rx grant free list */ - kcondvar_t cv; + dev_info_t *xnf_devinfo; /* System per-device info. */ + mac_handle_t xnf_mh; /* Nemo per-device info. */ + int xnf_rx_bufs_outstanding; + int xnf_tx_descs_free; + int xnf_rx_descs_free; /* count of free rx bufs */ + int xnf_n_tx; /* No. xmit descriptors */ + int xnf_n_rx; /* No. recv descriptors */ + int xnf_n_rx_bufs; /* No. 
recv DMA buffers */ + int xnf_tx_start_thresh_regval; + unsigned char xnf_mac_addr[ETHERADDRL]; + int xnf_max_rx_bufs; + int xnf_rx_buffer_count; + int xnf_tx_buffer_count; + + boolean_t xnf_connected; + boolean_t xnf_running; + + boolean_t xnf_cksum_offload; + + uint64_t xnf_stat_interrupts; + uint64_t xnf_stat_unclaimed_interrupts; + uint64_t xnf_stat_norxbuf; + uint64_t xnf_stat_errrx; + + uint64_t xnf_stat_tx_attempt; + uint64_t xnf_stat_tx_pullup; + uint64_t xnf_stat_tx_pagebndry; + uint64_t xnf_stat_tx_defer; + uint64_t xnf_stat_rx_no_ringbuf; + uint64_t xnf_stat_mac_rcv_error; + uint64_t xnf_stat_runt; + + uint64_t xnf_stat_ipackets; + uint64_t xnf_stat_opackets; + uint64_t xnf_stat_rbytes; + uint64_t xnf_stat_obytes; + + uint64_t xnf_stat_tx_cksum_deferred; + uint64_t xnf_stat_rx_cksum_no_need; + uint64_t xnf_stat_hvcopy_enabled; /* on/off */ + uint64_t xnf_stat_hvcopy_packet_processed; + + kstat_t *xnf_kstat_aux; + + struct xnf_buffer_desc *xnf_free_list; + struct xnf_buffer_desc *xnf_tx_free_list; + int xnf_tx_pkt_id_list; + /* free list of avail pkt ids */ + struct tx_pktinfo xnf_tx_pkt_info[NET_TX_RING_SIZE]; + struct xnf_buffer_desc *xnf_rxpkt_bufptr[XNF_MAX_RXDESCS]; + + mac_resource_handle_t xnf_rx_handle; + ddi_iblock_cookie_t xnf_icookie; + kmutex_t xnf_tx_buf_mutex; + kmutex_t xnf_rx_buf_mutex; + kmutex_t xnf_txlock; + kmutex_t xnf_intrlock; + boolean_t xnf_tx_pages_readonly; + + netif_tx_front_ring_t xnf_tx_ring; /* tx interface struct ptr */ + ddi_dma_handle_t xnf_tx_ring_dma_handle; + ddi_acc_handle_t xnf_tx_ring_dma_acchandle; + paddr_t xnf_tx_ring_phys_addr; + grant_ref_t xnf_tx_ring_ref; + + netif_rx_front_ring_t xnf_rx_ring; /* rx interface struct ptr */ + ddi_dma_handle_t xnf_rx_ring_dma_handle; + ddi_acc_handle_t xnf_rx_ring_dma_acchandle; + paddr_t xnf_rx_ring_phys_addr; + grant_ref_t xnf_rx_ring_ref; + + uint16_t xnf_evtchn; /* channel to back end ctlr */ + grant_ref_t xnf_gref_tx_head; /* tx grant free list */ + grant_ref_t xnf_gref_rx_head; /* rx grant free list */ + kcondvar_t xnf_cv; + + boolean_t xnf_rx_hvcopy; /* do we do HV copy? 
*/ } xnf_t; #ifdef __cplusplus diff --git a/usr/src/uts/common/xen/io/xpvd.c b/usr/src/uts/common/xen/io/xpvd.c index c989960444..34408e16f8 100644 --- a/usr/src/uts/common/xen/io/xpvd.c +++ b/usr/src/uts/common/xen/io/xpvd.c @@ -36,7 +36,6 @@ */ #include <sys/conf.h> -#include <sys/hypervisor.h> #include <sys/kmem.h> #include <sys/debug.h> #include <sys/modctl.h> @@ -46,18 +45,29 @@ #include <sys/ddi.h> #include <sys/sunddi.h> #include <sys/sunndi.h> -#include <sys/mach_intr.h> -#include <sys/evtchn_impl.h> #include <sys/avintr.h> #include <sys/psm.h> #include <sys/spl.h> #include <sys/promif.h> #include <sys/list.h> -#include <sys/xen_mmu.h> #include <sys/bootconf.h> #include <sys/bootsvcs.h> -#include <sys/bootinfo.h> #include <util/sscanf.h> +#include <sys/mach_intr.h> +#include <sys/bootinfo.h> +#ifdef XPV_HVM_DRIVER +#include <sys/xpv_support.h> +#include <sys/hypervisor.h> +#include <sys/archsystm.h> +#include <sys/cpu.h> +#include <public/xen.h> +#include <public/event_channel.h> +#include <public/io/xenbus.h> +#else +#include <sys/hypervisor.h> +#include <sys/evtchn_impl.h> +#include <sys/xen_mmu.h> +#endif #include <xen/sys/xenbus_impl.h> #include <xen/sys/xendev.h> @@ -173,6 +183,10 @@ static ndi_event_set_t xpvd_ndi_events = { static ndi_event_hdl_t xpvd_ndi_event_handle; +#ifdef XPV_HVM_DRIVER +static int hvm_vdev_num[26]; +#endif + /* * Hypervisor interrupt capabilities */ @@ -236,7 +250,16 @@ static int xpvd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) { extern void xvdi_watch_devices(int); - xpvd_dip = devi; + +#ifdef XPV_HVM_DRIVER + if (xen_info == NULL) { + if (ddi_hold_installed_driver(ddi_name_to_major("xpv")) == + NULL) { + cmn_err(CE_WARN, "Couldn't initialize xpv framework"); + return (DDI_FAILURE); + } + } +#endif if (ndi_event_alloc_hdl(devi, 0, &xpvd_ndi_event_handle, NDI_SLEEP) != NDI_SUCCESS) { @@ -256,6 +279,7 @@ xpvd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) else xvdi_watch_devices(XENSTORE_UP); + xpvd_dip = devi; ddi_report_dev(devi); return (DDI_SUCCESS); @@ -557,6 +581,9 @@ xpvd_intr_ops(dev_info_t *pdip, dev_info_t *rdip, ddi_intr_op_t intr_op, case DDI_INTROP_SETMASK: case DDI_INTROP_CLRMASK: +#ifdef XPV_HVM_DRIVER + return (DDI_ENOTSUP); +#else /* * Handle this here */ @@ -568,14 +595,18 @@ xpvd_intr_ops(dev_info_t *pdip, dev_info_t *rdip, ddi_intr_op_t intr_op, ec_enable_irq(hdlp->ih_vector); } break; - +#endif case DDI_INTROP_GETPENDING: +#ifdef XPV_HVM_DRIVER + return (DDI_ENOTSUP); +#else if (hdlp->ih_type != DDI_INTR_TYPE_FIXED) return (DDI_FAILURE); *(int *)result = ec_pending_irq(hdlp->ih_vector); DDI_INTR_NEXDBG((CE_CONT, "xpvd: GETPENDING returned = %x\n", *(int *)result)); break; +#endif case DDI_INTROP_NAVAIL: *(int *)result = 1; @@ -689,6 +720,11 @@ xpvd_name_child(dev_info_t *child, char *name, int namelen) int *domain, *vdev; uint_t ndomain, nvdev; char *unit_address; + int devno; +#ifdef XPV_HVM_DRIVER + char *xip; + int xenstore_id; +#endif /* * i_xpvd_parse_devname() knows the formats used by this @@ -721,11 +757,45 @@ xpvd_name_child(dev_info_t *child, char *name, int namelen) /* * Use "unit-address" property (frontend/softdev drivers). + * + * For PV domains, the disk name should be a simple number. In an + * HVM domain, it will be a string of the form hdX. In the latter + * case we convert hda to 0, hdb to 1, and so on. 
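The hdX conversion described in the comment above, and implemented just below, is plain character arithmetic on the name's third byte. A standalone illustration (a hypothetical helper; the explicit a-z guard here is an addition, where the patch instead bounds-checks the result in i_xpvd_parse_devname()):

	#include <string.h>

	/* Map an HVM disk unit-address of the form "hdX" to a number. */
	static int
	hvm_disk_devno(const char *ua)
	{
		if (strlen(ua) == 3 && ua[0] == 'h' && ua[1] == 'd' &&
		    ua[2] >= 'a' && ua[2] <= 'z')
			return (ua[2] - 'a');	/* hda -> 0, hdb -> 1, ... */
		return (-1);			/* not an hdX name */
	}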
*/ if (ddi_prop_lookup_string(DDI_DEV_T_ANY, child, DDI_PROP_DONTPASS, "unit-address", &unit_address) == DDI_PROP_SUCCESS) { - (void) snprintf(name, namelen, "%s", unit_address); + devno = -1; + if (unit_address[0] >= '0' && unit_address[0] <= '9') + (void) sscanf(unit_address, "%d", &devno); +#ifdef XPV_HVM_DRIVER + /* + * XXX: we should really check the device class here. We + * always want to set hvm_vdev_num[] - even if we somehow + * end up with a non-hdX device name. + */ + else if (strlen(unit_address) == 3 && + unit_address[0] == 'h' && unit_address[1] == 'd') { + devno = unit_address[2] - 'a'; + if (ddi_prop_lookup_string(DDI_DEV_T_ANY, child, + DDI_PROP_DONTPASS, "xenstore-id", &xip) + == DDI_PROP_SUCCESS) { + (void) sscanf(xip, "%d", &xenstore_id); + ddi_prop_free(xip); + hvm_vdev_num[devno] = xenstore_id; + } else { + devno = -1; + } + } +#endif + + if (devno < 0) { + cmn_err(CE_WARN, "Unrecognized device: %s", + unit_address); + ddi_prop_free(unit_address); + return (DDI_FAILURE); + } + (void) snprintf(name, namelen, "%x", devno); ddi_prop_free(unit_address); return (DDI_SUCCESS); } @@ -846,10 +916,20 @@ i_xpvd_parse_devname(char *name, xendev_devclass_t *devclassp, /* Frontend format is "<vdev>". */ *domp = DOMID_SELF; if (sscanf(caddr, "%x", vdevp) == 1) { +#ifdef XPV_HVM_DRIVER + if (*devclassp == XEN_VBLK) { + if (*vdevp < 0 || *vdevp > 26) { + *vdevp = -1; + goto done; + } + *vdevp = hvm_vdev_num[*vdevp]; + } +#endif ret = B_TRUE; goto done; } + done: kmem_free(device_name, len); return (ret); diff --git a/usr/src/uts/common/xen/io/xpvd.conf b/usr/src/uts/common/xen/io/xpvd.conf new file mode 100644 index 0000000000..55262457f9 --- /dev/null +++ b/usr/src/uts/common/xen/io/xpvd.conf @@ -0,0 +1,28 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# + +# ident "%Z%%M% %I% %E% SMI" + +name="xpvd" class="root"; diff --git a/usr/src/uts/i86xpv/os/gnttab.c b/usr/src/uts/common/xen/os/gnttab.c index 5284b02ea4..238c45768e 100644 --- a/usr/src/uts/i86xpv/os/gnttab.c +++ b/usr/src/uts/common/xen/os/gnttab.c @@ -58,6 +58,11 @@ #include <sys/types.h> #include <sys/archsystm.h> +#ifdef XPV_HVM_DRIVER +#include <sys/xpv_support.h> +#include <sys/mman.h> +#include <vm/hat.h> +#endif #include <sys/hypervisor.h> #include <sys/gnttab.h> #include <sys/sysmacros.h> @@ -77,11 +82,13 @@ #include <vm/hat_i86.h> #include <sys/bootconf.h> #include <sys/bootsvcs.h> +#ifndef XPV_HVM_DRIVER #include <sys/bootinfo.h> #include <sys/multiboot.h> +#include <vm/kboot_mmu.h> +#endif #include <sys/bootvfs.h> #include <sys/bootprops.h> -#include <vm/kboot_mmu.h> #include <vm/seg_kmem.h> #define cmpxchg(t, c, n) atomic_cas_16((t), (c), (n)) @@ -410,6 +417,61 @@ out: mutex_exit(&gnttab_list_lock); } +#ifdef XPV_HVM_DRIVER + +static void +gnttab_map(void) +{ + struct xen_add_to_physmap xatp; + caddr_t va; + pfn_t pfn; + int i; + + va = (caddr_t)shared; + for (i = 0; i < NR_GRANT_FRAMES; i++) { + pfn = hat_getpfnum(kas.a_hat, va); + + xatp.domid = DOMID_SELF; + xatp.idx = i; + xatp.space = XENMAPSPACE_grant_table; + xatp.gpfn = pfn; + hat_unload(kas.a_hat, va, MMU_PAGESIZE, HAT_UNLOAD); + if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp) != 0) + panic("Couldn't map grant table"); + + hat_devload(kas.a_hat, va, MMU_PAGESIZE, pfn, + PROT_READ | PROT_WRITE, + HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST); + + va += MMU_PAGESIZE; + } +} + +void +gnttab_init(void) +{ + int i; + + shared = (grant_entry_t *)xen_alloc_pages(NR_GRANT_FRAMES); + + gnttab_map(); + + for (i = NR_RESERVED_ENTRIES; i < NR_GRANT_ENTRIES; i++) + gnttab_list[i] = i + 1; + gnttab_free_count = NR_GRANT_ENTRIES - NR_RESERVED_ENTRIES; + gnttab_free_head = NR_RESERVED_ENTRIES; + + mutex_init(&gnttab_list_lock, NULL, MUTEX_DEFAULT, NULL); +} + +void +gnttab_resume(void) +{ + gnttab_map(); +} + +#else /* XPV_HVM_DRIVER */ + void gnttab_init(void) { @@ -472,6 +534,8 @@ gnttab_resume(void) } } +#endif /* XPV_HVM_DRIVER */ + void gnttab_suspend(void) { diff --git a/usr/src/uts/i86xpv/os/hypercall.c b/usr/src/uts/common/xen/os/hypercall.c index ca753bb716..fae533dfbf 100644 --- a/usr/src/uts/i86xpv/os/hypercall.c +++ b/usr/src/uts/common/xen/os/hypercall.c @@ -39,6 +39,9 @@ */ #include <sys/types.h> +#ifdef XPV_HVM_DRIVER +#include <sys/xpv_support.h> +#endif #include <sys/hypervisor.h> #include <xen/public/sched.h> @@ -215,7 +218,7 @@ HYPERVISOR_grant_table_op(uint_t cmd, void *uop, uint_t count) ret_val = __hypercall3(__HYPERVISOR_grant_table_op, (long)cmd, (ulong_t)uop, (ulong_t)count); -#if !defined(_BOOT) +#if !defined(_BOOT) && !defined(XPV_HVM_DRIVER) /* * XXPV -- * The map_grant_ref call suffers a poor design flaw. 
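Moving gnttab.c under usr/src/uts/common lets the same grant-table API serve both PV kernels and HVM driver modules; only the table mapping differs (XENMEM_add_to_physmap plus hat_devload() under XPV_HVM_DRIVER, versus the existing PV path). From a frontend driver's point of view the entry lifecycle is unchanged, as the xnf hunks earlier in this patch show. In outline (a sketch; gref_head, oeid, mfn and readonly stand in for per-instance state):

	#include <sys/types.h>
	#include <sys/debug.h>
	#include <sys/hypervisor.h>
	#include <sys/gnttab.h>

	/* One tx grant's round trip through the free list. */
	static void
	grant_roundtrip(grant_ref_t *gref_head, domid_t oeid, mfn_t mfn,
	    int readonly)
	{
		grant_ref_t ref;

		ref = gnttab_claim_grant_reference(gref_head);
		ASSERT((signed short)ref >= 0);
		gnttab_grant_foreign_access_ref(ref, oeid, mfn, readonly);

		/* ... the backend does I/O against the granted frame ... */

		(void) gnttab_end_foreign_access_ref(ref, readonly);
		gnttab_release_grant_reference(gref_head, ref);
	}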
diff --git a/usr/src/uts/common/xen/os/xvdi.c b/usr/src/uts/common/xen/os/xvdi.c index 347ca8bc3e..4eede251bd 100644 --- a/usr/src/uts/common/xen/os/xvdi.c +++ b/usr/src/uts/common/xen/os/xvdi.c @@ -43,8 +43,6 @@ */ #include <sys/conf.h> #include <sys/param.h> -#include <sys/hypervisor.h> -#include <sys/xen_mmu.h> #include <sys/kmem.h> #include <vm/seg_kmem.h> #include <sys/debug.h> @@ -57,8 +55,6 @@ #include <sys/sunndi.h> #include <sys/sunldi.h> #include <sys/fs/dv_node.h> -#include <sys/evtchn_impl.h> -#include <sys/gnttab.h> #include <sys/avintr.h> #include <sys/psm.h> #include <sys/spl.h> @@ -68,8 +64,22 @@ #include <sys/bootsvcs.h> #include <sys/bootinfo.h> #include <sys/note.h> +#ifdef XPV_HVM_DRIVER +#include <sys/xpv_support.h> +#include <sys/hypervisor.h> +#include <public/grant_table.h> +#include <public/xen.h> +#include <public/io/xenbus.h> +#include <public/io/xs_wire.h> +#include <public/event_channel.h> +#include <public/io/xenbus.h> +#else /* XPV_HVM_DRIVER */ +#include <sys/hypervisor.h> #include <sys/xen_mmu.h> #include <xen/sys/xenbus_impl.h> +#include <sys/evtchn_impl.h> +#endif /* XPV_HVM_DRIVER */ +#include <sys/gnttab.h> #include <xen/sys/xendev.h> #include <vm/hat_i86.h> #include <sys/scsi/generic/inquiry.h> @@ -79,7 +89,9 @@ static void xvdi_ring_init_sring(xendev_ring_t *); static void xvdi_ring_init_front_ring(xendev_ring_t *, size_t, size_t); +#ifndef XPV_HVM_DRIVER static void xvdi_ring_init_back_ring(xendev_ring_t *, size_t, size_t); +#endif static void xvdi_reinit_ring(dev_info_t *, grant_ref_t *, xendev_ring_t *); static int i_xvdi_add_watches(dev_info_t *); @@ -320,6 +332,19 @@ xvdi_init_dev(dev_info_t *dip) dip, "unit-address", prop_str); kmem_free(prop_str, prop_len); } +#ifdef XPV_HVM_DRIVER + /* + * The mapping between the 'dev' name and the + * device ID maintained by Xenstore has to be + * tracked explicitly in HVM domains. + */ + prop_str = strrchr(pdp->xd_xsdev.otherend, '/'); + if (prop_str != NULL) { + prop_str = ((caddr_t)prop_str) + 1; + (void) ndi_prop_update_string(DDI_DEV_T_NONE, + dip, "xenstore-id", prop_str); + } +#endif /* XPV_HVM_DRIVER */ break; default: break; @@ -342,7 +367,9 @@ xvdi_uninit_dev(dev_info_t *dip) i_xvdi_rem_watches(dip); /* tell other end to close */ - (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed); + if (pdp->xd_xsdev.otherend_id != (domid_t)-1) + (void) xvdi_switch_state(dip, XBT_NULL, + XenbusStateClosed); if (pdp->xd_xsdev.nodename != NULL) kmem_free((char *)(pdp->xd_xsdev.nodename), @@ -392,7 +419,9 @@ xvdi_bind_evtchn(dev_info_t *dip, evtchn_port_t evtchn) return (DDI_FAILURE); } } +#ifndef XPV_HVM_DRIVER pdp->xd_ispec.intrspec_vec = ec_bind_evtchn_to_irq(pdp->xd_evtchn); +#endif mutex_exit(&pdp->xd_lk); return (DDI_SUCCESS); @@ -435,7 +464,9 @@ xvdi_alloc_evtchn(dev_info_t *dip) return (DDI_FAILURE); } } +#ifndef XPV_HVM_DRIVER pdp->xd_ispec.intrspec_vec = ec_bind_evtchn_to_irq(pdp->xd_evtchn); +#endif mutex_exit(&pdp->xd_lk); return (DDI_SUCCESS); @@ -455,13 +486,16 @@ xvdi_free_evtchn(dev_info_t *dip) mutex_enter(&pdp->xd_lk); if (pdp->xd_evtchn != INVALID_EVTCHN) { +#ifndef XPV_HVM_DRIVER ec_unbind_irq(pdp->xd_ispec.intrspec_vec); - pdp->xd_evtchn = INVALID_EVTCHN; pdp->xd_ispec.intrspec_vec = 0; +#endif + pdp->xd_evtchn = INVALID_EVTCHN; } mutex_exit(&pdp->xd_lk); } +#ifndef XPV_HVM_DRIVER /* * Map an inter-domain communication ring for a virtual device. * This is used by backend drivers. 
@@ -566,6 +600,7 @@ xvdi_unmap_ring(xendev_ring_t *ring) vmem_xfree(heap_arena, ring->xr_vaddr, PAGESIZE); kmem_free(ring, sizeof (xendev_ring_t)); } +#endif /* XPV_HVM_DRIVER */ /* * Re-initialise an inter-domain communications ring for the backend domain. @@ -1961,6 +1996,7 @@ xvdi_ring_init_front_ring(xendev_ring_t *ringp, size_t nentry, size_t entrysize) ringp->xr_entry_size = entrysize; } +#ifndef XPV_HVM_DRIVER static void xvdi_ring_init_back_ring(xendev_ring_t *ringp, size_t nentry, size_t entrysize) { @@ -1975,6 +2011,7 @@ xvdi_ring_init_back_ring(xendev_ring_t *ringp, size_t nentry, size_t entrysize) ringp->xr_frontend = 0; ringp->xr_entry_size = entrysize; } +#endif /* XPV_HVM_DRIVER */ static void xendev_offline_device(void *arg) diff --git a/usr/src/uts/i86xpv/sys/gnttab.h b/usr/src/uts/common/xen/sys/gnttab.h index 7066ae3243..7066ae3243 100644 --- a/usr/src/uts/i86xpv/sys/gnttab.h +++ b/usr/src/uts/common/xen/sys/gnttab.h diff --git a/usr/src/uts/common/xen/sys/xendev.h b/usr/src/uts/common/xen/sys/xendev.h index b00a71fcf4..40a79e07e5 100644 --- a/usr/src/uts/common/xen/sys/xendev.h +++ b/usr/src/uts/common/xen/sys/xendev.h @@ -31,6 +31,11 @@ #include <sys/hypervisor.h> #include <sys/taskq.h> +#ifdef XPV_HVM_DRIVER +#include <public/io/ring.h> +#include <public/event_channel.h> +#include <public/grant_table.h> +#endif #include <xen/sys/xenbus_impl.h> #ifdef __cplusplus diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files index 00e7fabbe5..2bad82b0af 100644 --- a/usr/src/uts/i86pc/Makefile.files +++ b/usr/src/uts/i86pc/Makefile.files @@ -186,6 +186,11 @@ ROOTNEX_OBJS += rootnex.o TZMON_OBJS += tzmon.o UPPC_OBJS += uppc.o psm_common.o XSVC_OBJS += xsvc.o +XNF_OBJS += xnf.o +XPV_OBJS += xpv_support.o xvdi.o gnttab.o evtchn.o \ + xenbus_comms.o xenbus_client.o xenbus_probe.o xenbus_xs.o \ + hypercall.o hypersubr.o +XPVD_OBJS += xpvd.o # # Build up defines and paths. diff --git a/usr/src/uts/i86pc/Makefile.hvm b/usr/src/uts/i86pc/Makefile.hvm new file mode 100644 index 0000000000..3c53174cc4 --- /dev/null +++ b/usr/src/uts/i86pc/Makefile.hvm @@ -0,0 +1,67 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# uts/i86pc/Makefile.hvm +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +#ident "%Z%%M% %I% %E% SMI" +# +# This makefile provides support for building PV drivers that run +# in an HVM environment. 
+# + +ROOT_HVM_DIR = $(ROOT)/platform/i86hvm +ROOT_HVM_MOD_DIR = $(ROOT_HVM_DIR)/kernel +ROOT_HVM_DRV_DIR_32 = $(ROOT_HVM_MOD_DIR)/drv +ROOT_HVM_DRV_DIR_64 = $(ROOT_HVM_MOD_DIR)/drv/$(MACH64) +ROOT_HVM_DRV_DIR = $(ROOT_HVM_DRV_DIR_$(CLASS)) +USR_HVM_DIR = $(ROOT)/usr/platform/i86hvm + +# +# Indicate that we are building for the i86hvm semi-platform +# +CPPFLAGS += -DXPV_HVM_DRIVER +ASFLAGS += -DXPV_HVM_DRIVER + +# +# Installation targets and rules: +# +$(ROOT_HVM_DIR): + -$(INS.dir.root.sys) + +$(ROOT_HVM_MOD_DIR): $(ROOT_HVM_DIR) + -$(INS.dir.root.sys) + +$(ROOT_HVM_DRV_DIR): $(ROOT_MOD_DIR) + -$(INS.dir.root.sys) + +$(ROOT_HVM_MOD_DIR)/%: $(OBJS_DIR)/% $(ROOT_HVM_MOD_DIR) FRC + $(INS.file) + +$(ROOT_HVM_DRV_DIR)/%: $(OBJS_DIR)/% $(ROOT_HVM_DRV_DIR) FRC + $(INS.file) + +$(USR_HVM_DIR): + -$(INS.dir.root.sys) + +INSTALL_DEPS += $(ROOT_HVM_DIR) $(USR_HVM_DIR) diff --git a/usr/src/uts/i86pc/Makefile.i86pc.shared b/usr/src/uts/i86pc/Makefile.i86pc.shared index 311e8ee50b..59b73f2aca 100644 --- a/usr/src/uts/i86pc/Makefile.i86pc.shared +++ b/usr/src/uts/i86pc/Makefile.i86pc.shared @@ -253,6 +253,9 @@ DRV_KMODS += xsvc DRV_KMODS += mc-amd DRV_KMODS += tzmon DRV_KMODS += battery +DRV_KMODS += xnf +DRV_KMODS += xpv +DRV_KMODS += xpvd DRV_KMODS += cpudrv diff --git a/usr/src/uts/i86pc/Makefile.rules b/usr/src/uts/i86pc/Makefile.rules index 8ca64e2fcb..afd1209ebf 100644 --- a/usr/src/uts/i86pc/Makefile.rules +++ b/usr/src/uts/i86pc/Makefile.rules @@ -110,6 +110,13 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/tzmon/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/xpv/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/xpv/%.s + $(COMPILE.s) -o $@ $< + $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/ml/%.s $(COMPILE.s) -o $@ $< @@ -145,6 +152,14 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/gfx_private/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/xen/io/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/xen/os/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/xsvc/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -177,6 +192,9 @@ $(DBOOT_OBJS_DIR)/%.o: $(UTSBASE)/i86pc/boot/%.c $(DBOOT_OBJS_DIR)/%.o: $(UTSBASE)/i86pc/dboot/%.c $(i386_CC) $(CERRWARN) -O $(DBOOT_DEFS) $(DBOOT_CC_INCL) -c -o $@ $< +$(DBOOT_OBJS_DIR)/%.o: $(UTSBASE)/intel/ia32/%.s + $(DBOOT_AS) -P -D_ASM $(DBOOT_DEFS) $(DBOOT_AS_INCL) -o $@ $< + $(DBOOT_OBJS_DIR)/%.o: $(COMMONBASE)/util/%.c $(i386_CC) $(CERRWARN) -O $(DBOOT_DEFS) $(DBOOT_CC_INCL) -c -o $@ $< @@ -286,6 +304,12 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/psm/%.s $(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/tzmon/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/xpv/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + +$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/xpv/%.s + @($(LHEAD) $(LINT.s) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/ml/%.s @($(LHEAD) $(LINT.s) $< $(LTAIL)) @@ -316,6 +340,12 @@ $(LINTS_DIR)/%.ln: $(SRC)/common/atomic/%.c $(LINTS_DIR)/%.ln: $(SRC)/common/mc/mc-amd/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/xen/io/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/xen/os/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/io/gfx_private/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -346,4 +376,3 @@ $(DBOOT_LINTS_DIR)/%.ln: $(COMMONBASE)/util/%.c $(DBOOT_LINTS_DIR)/%.ln: $(COMMONBASE)/util/i386/%.s @($(LHEAD) $(DBOOT_LINT) 
$(DBOOT_LOCAL_LINTFLAGS) $< $(LTAIL)) - diff --git a/usr/src/uts/i86pc/io/xpv/evtchn.c b/usr/src/uts/i86pc/io/xpv/evtchn.c new file mode 100644 index 0000000000..3da34d406e --- /dev/null +++ b/usr/src/uts/i86pc/io/xpv/evtchn.c @@ -0,0 +1,450 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/xpv_support.h> +#include <sys/hypervisor.h> +#include <sys/machsystm.h> +#include <sys/mutex.h> +#include <sys/cmn_err.h> +#include <sys/dditypes.h> +#include <sys/atomic.h> +#include <sys/sysmacros.h> +#include <sys/cpu.h> +#include <sys/psw.h> +#include <sys/psm.h> +#include <sys/sdt.h> + +extern dev_info_t *xpv_dip; +static ddi_intr_handle_t *evtchn_ihp = NULL; +static ddi_softint_handle_t evtchn_to_handle[NR_EVENT_CHANNELS]; +static kmutex_t ec_lock; + +static int evtchn_callback_irq = -1; + +/* + * Xen defines structures shared between the hypervisor and domU using + * longs. Sigh. To support 32-bit domUs on a 64-bit hypervisor, we + * redefine the pending-events and masked-events bitmasks in terms of + * uint32_t's. + */ +static uint32_t *pending_events; +static uint32_t *masked_events; +static int event_array_size; +#define EVTCHN_SHIFT 5 /* log2(NBBY * sizeof (uint32_t)) */ + +/* Atomically get and clear an integer from memory. */ +#define GET_AND_CLEAR(type, size, src, targ) { \ + volatile type *_vsrc = (volatile type *)src; \ + membar_enter(); \ + do { \ + targ = *_vsrc; \ + } while (atomic_cas_## size(_vsrc, targ, 0) != targ); \ +} + +#define GET_AND_CLEAR_32(src, targ) GET_AND_CLEAR(uint32_t, 32, src, targ) +#define GET_AND_CLEAR_64(src, targ) GET_AND_CLEAR(uint64_t, 64, src, targ) + +/* Get the first and last bits set in a bitmap */ +#define GET_BOUNDS(bitmap, max, low, high) { \ + int _i; \ + low = high = -1; \ + for (_i = 0; _i <= max; _i++) \ + if (bitmap & ((uint64_t)1 << _i)) { \ + if (low == -1) \ + low = _i; \ + high = _i; \ + } \ +} + +/* + * Translate an event number into an index into the array of 32-bit + * bitmasks, and a bit within the proper word. 
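+ *
+ * For example, with EVTCHN_SHIFT of 5, event 70 yields *idx == 2 and
+ * *bit == (1 << 6), i.e. bit 6 of the third 32-bit word.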
+ */
+static void
+get_event_bit(int evt, int *idx, uint32_t *bit)
+{
+	int evb;
+
+	*idx = evt >> EVTCHN_SHIFT;
+	evb = evt & ((1ul << EVTCHN_SHIFT) - 1);
+	*bit = 1ul << evb;
+}
+
+void
+ec_bind_evtchn_to_handler(int evtchn, pri_t pri, ec_handler_fcn_t handler,
+    void *arg1)
+{
+	ddi_softint_handle_t hdl;
+
+	if (evtchn < 0 || evtchn >= NR_EVENT_CHANNELS) {
+		cmn_err(CE_WARN, "Binding invalid event channel: %d", evtchn);
+		return;
+	}
+
+	(void) ddi_intr_add_softint(xpv_dip, &hdl, pri, handler, (caddr_t)arg1);
+	mutex_enter(&ec_lock);
+	ASSERT(evtchn_to_handle[evtchn] == NULL);
+	evtchn_to_handle[evtchn] = hdl;
+	mutex_exit(&ec_lock);
+
+	/* Let the hypervisor know we're prepared to handle this event */
+	hypervisor_unmask_event(evtchn);
+}
+
+void
+ec_unbind_evtchn(int evtchn)
+{
+	evtchn_close_t close;
+	ddi_softint_handle_t hdl;
+
+	if (evtchn < 0 || evtchn >= NR_EVENT_CHANNELS) {
+		cmn_err(CE_WARN, "Unbinding invalid event channel: %d", evtchn);
+		return;
+	}
+
+	/*
+	 * Let the hypervisor know we're no longer prepared to handle this
+	 * event
+	 */
+	hypervisor_mask_event(evtchn);
+
+	/* Cleanup the event handler metadata */
+	mutex_enter(&ec_lock);
+	hdl = evtchn_to_handle[evtchn];
+	evtchn_to_handle[evtchn] = NULL;
+	mutex_exit(&ec_lock);
+
+	close.port = evtchn;
+	(void) HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
+	(void) ddi_intr_remove_softint(hdl);
+}
+
+void
+ec_notify_via_evtchn(unsigned int port)
+{
+	evtchn_send_t send;
+
+	if ((int)port == -1)
+		return;
+	send.port = port;
+	(void) HYPERVISOR_event_channel_op(EVTCHNOP_send, &send);
+}
+
+void
+hypervisor_unmask_event(unsigned int ev)
+{
+	int evi;
+	uint32_t bit;
+	volatile uint32_t *maskp;
+	evtchn_unmask_t unmask;
+
+	/*
+	 * Translate the event number into an index into the masked-events
+	 * bitmask, and set the bit to 0.
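+	 * atomic_and_32() with the complement of the bit clears just that
+	 * one bit; other events sharing the word are untouched.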
+ */ + get_event_bit(ev, &evi, &bit); + maskp = (volatile uint32_t *)&masked_events[evi]; + atomic_and_32(maskp, ~bit); + + /* Let the hypervisor know the event has been unmasked */ + unmask.port = ev; + if (HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask) != 0) + panic("xen_evtchn_unmask() failed"); +} + +/* Set a bit in an evtchan mask word */ +void +hypervisor_mask_event(uint_t ev) +{ + int evi; + uint32_t bit; + volatile uint32_t *maskp; + + get_event_bit(ev, &evi, &bit); + maskp = (volatile uint32_t *)&masked_events[evi]; + atomic_or_32(maskp, bit); +} + +void +hypervisor_clear_event(uint_t ev) +{ + int evi; + uint32_t bit; + volatile uint32_t *maskp; + + get_event_bit(ev, &evi, &bit); + maskp = (volatile uint32_t *)&pending_events[evi]; + atomic_and_32(maskp, ~bit); +} + +int +xen_alloc_unbound_evtchn(int domid, int *evtchnp) +{ + evtchn_alloc_unbound_t alloc; + int err; + + alloc.dom = DOMID_SELF; + alloc.remote_dom = (domid_t)domid; + + if ((err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, + &alloc)) == 0) { + *evtchnp = alloc.port; + /* ensure evtchn is masked till we're ready to use it */ + (void) hypervisor_mask_event(*evtchnp); + } else { + err = xen_xlate_errcode(err); + } + + return (err); +} + +int +xen_bind_interdomain(int domid, int remote_port, int *port) +{ + evtchn_bind_interdomain_t bind; + int err; + + bind.remote_dom = (domid_t)domid; + bind.remote_port = remote_port; + if ((err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, + &bind)) == 0) + *port = bind.local_port; + else + err = xen_xlate_errcode(err); + return (err); +} + +static int +ev_ffs(uint32_t bits) +{ + int i; + + if (bits == 0) + return (0); + for (i = 1; ; i++, bits >>= 1) { + if (bits & 1) + break; + } + return (i); +} + +/*ARGSUSED*/ +uint_t +evtchn_callback_fcn(caddr_t arg0, caddr_t arg1) +{ + uint32_t pending_word; + int i, j, port; + volatile struct vcpu_info *vci; + uint_t rv = DDI_INTR_UNCLAIMED; + ddi_softint_handle_t hdl; + caddr_t pending_sel_addr; + int low, high; + + vci = &HYPERVISOR_shared_info->vcpu_info[CPU->cpu_id]; + pending_sel_addr = (caddr_t)&vci->evtchn_pending_sel; +#ifndef __amd64 + /* + * More 32/64-bit ugliness. Xen defines this field as a long, so + * it ends up misaligned in a 32-bit domU. + */ + if (xen_is_64bit) + pending_sel_addr = (caddr_t) + P2ROUNDUP((uintptr_t)pending_sel_addr, sizeof (uint64_t)); +#endif + +again: + DTRACE_PROBE2(evtchn__scan__start, int, vci->evtchn_upcall_pending, + ulong_t, vci->evtchn_pending_sel); + + atomic_and_8(&vci->evtchn_upcall_pending, 0); + + /* + * Find the upper and lower bounds in which we need to search for + * pending events. + */ + if (xen_is_64bit) { + uint64_t sels; + + GET_AND_CLEAR_64((volatile uint64_t *)pending_sel_addr, sels); + + /* sels == 1 is by far the most common case. Make it fast */ + if (sels == 1) + low = high = 0; + else if (sels == 0) + return (rv); + else + GET_BOUNDS(sels, 63, low, high); + + /* + * Each bit in the pending_sels bitmap represents 2 entries + * in our forced-to-be-32-bit event channel array. + */ + low = low * 2; + high = high * 2 + 1; + } else { + uint32_t sels; + + GET_AND_CLEAR_32((volatile uint32_t *)pending_sel_addr, sels); + + /* sels == 1 is by far the most common case. 
Make it fast */ + if (sels == 1) + low = high = 0; + else if (sels == 0) + return (rv); + else + GET_BOUNDS(sels, 31, low, high); + } + + /* Scan the port list, looking for words with bits set */ + for (i = low; i <= high; i++) { + uint32_t tmp; + + GET_AND_CLEAR_32(&pending_events[i], tmp); + pending_word = tmp & ~(masked_events[i]); + + /* Scan the bits in the word, looking for pending events */ + while (pending_word != 0) { + j = ev_ffs(pending_word) - 1; + port = (i << EVTCHN_SHIFT) + j; + pending_word = pending_word & ~(1 << j); + + /* + * If there is a handler registered for this event, + * schedule a softint of the appropriate priority + * to execute it. + */ + if ((hdl = evtchn_to_handle[port]) != NULL) { + (void) ddi_intr_trigger_softint(hdl, NULL); + rv = DDI_INTR_CLAIMED; + } + } + } + DTRACE_PROBE2(evtchn__scan__end, int, vci->evtchn_upcall_pending, + ulong_t, vci->evtchn_pending_sel); + + if ((volatile uint8_t)vci->evtchn_upcall_pending || + *((volatile ulong_t *)pending_sel_addr)) + goto again; + + return (rv); +} + +static int +set_hvm_callback(int irq) +{ + struct xen_hvm_param xhp; + + xhp.domid = DOMID_SELF; + xhp.index = HVM_PARAM_CALLBACK_IRQ; + xhp.value = irq; + return (HYPERVISOR_hvm_op(HVMOP_set_param, &xhp)); +} + +void +ec_fini() +{ + int i; + + for (i = 0; i < NR_EVENT_CHANNELS; i++) + ec_unbind_evtchn(i); + + evtchn_callback_irq = -1; + if (evtchn_ihp != NULL) { + (void) ddi_intr_disable(*evtchn_ihp); + (void) ddi_intr_remove_handler(*evtchn_ihp); + (void) ddi_intr_free(*evtchn_ihp); + kmem_free(evtchn_ihp, sizeof (ddi_intr_handle_t)); + evtchn_ihp = NULL; + } +} + +int +ec_init(dev_info_t *dip) +{ + int i; + int rv, actual; + ddi_intr_handle_t *ihp; + volatile shared_info_t *si = HYPERVISOR_shared_info; + + /* + * Translate the variable-sized pending and masked event bitmasks + * into constant-sized arrays of uint32_t's. + */ + pending_events = (uint32_t *)&si->evtchn_pending[0]; + if (xen_is_64bit) + event_array_size = 2 * sizeof (uint64_t) * 8; + else + event_array_size = sizeof (uint32_t) * 8; + masked_events = &pending_events[event_array_size]; + + /* + * Clear our event handler structures and prevent the hypervisor + * from triggering any events. + */ + mutex_init(&ec_lock, NULL, MUTEX_SPIN, (void *)ipltospl(SPL7)); + for (i = 0; i < NR_EVENT_CHANNELS; i++) { + evtchn_to_handle[i] = NULL; + (void) hypervisor_mask_event(i); + } + + /* + * Allocate and initialize an interrupt handler to process the + * hypervisor's "hey you have events pending!" interrupt. + */ + ihp = kmem_zalloc(sizeof (ddi_intr_handle_t), KM_SLEEP); + rv = ddi_intr_alloc(dip, ihp, DDI_INTR_TYPE_FIXED, 0, 1, &actual, + DDI_INTR_ALLOC_NORMAL); + if (rv < 0 || actual != 1) { + cmn_err(CE_WARN, "Could not allocate evtchn interrupt: %d", + rv); + return (-1); + } + + rv = ddi_intr_add_handler(*ihp, evtchn_callback_fcn, NULL, NULL); + if (rv < 0) { + (void) ddi_intr_free(*ihp); + cmn_err(CE_WARN, "Could not attach evtchn handler"); + return (-1); + } + evtchn_ihp = ihp; + + if (ddi_intr_enable(*ihp) != DDI_SUCCESS) { + cmn_err(CE_WARN, "Could not enable evtchn interrupts\n"); + return (-1); + } + + /* Tell the hypervisor which interrupt we're waiting on. 
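+	 * The vector isn't available through a documented DDI call, so
+	 * it is dug out of the interrupt handle's implementation
+	 * structure.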
*/ + evtchn_callback_irq = ((ddi_intr_handle_impl_t *)*ihp)->ih_vector; + + if (set_hvm_callback(evtchn_callback_irq) != 0) { + cmn_err(CE_WARN, "Couldn't register evtchn callback"); + return (-1); + } + return (0); +} diff --git a/usr/src/uts/i86pc/io/xpv/xpv.conf b/usr/src/uts/i86pc/io/xpv/xpv.conf new file mode 100644 index 0000000000..d599f6f3ff --- /dev/null +++ b/usr/src/uts/i86pc/io/xpv/xpv.conf @@ -0,0 +1,28 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# ident "%Z%%M% %I% %E% SMI" + +interrupt-priorities=9; diff --git a/usr/src/uts/i86pc/io/xpv/xpv_support.c b/usr/src/uts/i86pc/io/xpv/xpv_support.c new file mode 100644 index 0000000000..fb34924319 --- /dev/null +++ b/usr/src/uts/i86pc/io/xpv/xpv_support.c @@ -0,0 +1,541 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/modctl.h> +#include <sys/types.h> +#include <sys/archsystm.h> +#include <sys/machsystm.h> +#include <sys/sunndi.h> +#include <sys/sunddi.h> +#include <sys/ddi_subrdefs.h> +#include <sys/xpv_support.h> +#include <sys/xen_errno.h> +#include <sys/hypervisor.h> +#include <sys/gnttab.h> +#include <sys/xenbus_comms.h> +#include <sys/xenbus_impl.h> +#include <xen/sys/xendev.h> +#include <sys/sysmacros.h> +#include <sys/x86_archext.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/conf.h> +#include <sys/devops.h> +#include <sys/pc_mmu.h> +#include <sys/cmn_err.h> +#include <vm/seg_kmem.h> +#include <vm/as.h> +#include <vm/hat_pte.h> +#include <vm/hat_i86.h> + +#define XPV_MINOR 0 + +/* + * This structure is ordinarily constructed by Xen. In the HVM world, we + * manually fill in the few fields the PV drivers need. + */ +start_info_t *xen_info = NULL; + +/* Xen version number. 
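+ * Extracted from cpuid leaf 0x40000001 by xen_pv_init().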
*/ +int xen_major, xen_minor; + +/* Metadata page shared between domain and Xen */ +shared_info_t *HYPERVISOR_shared_info = NULL; + +/* Page containing code to issue hypercalls. */ +extern caddr_t hypercall_page; + +/* Is the hypervisor 64-bit? */ +int xen_is_64bit = -1; + +/* virtual addr for the store_mfn page */ +caddr_t xb_addr; + +dev_info_t *xpv_dip; + +/* + * Forward declarations + */ +static int xpv_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); +static int xpv_attach(dev_info_t *, ddi_attach_cmd_t); +static int xpv_detach(dev_info_t *, ddi_detach_cmd_t); +static int xpv_open(dev_t *, int, int, cred_t *); +static int xpv_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); + +static struct cb_ops xpv_cb_ops = { + xpv_open, + nulldev, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + xpv_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + ddi_prop_op, + NULL, + D_MP, + CB_REV, + NULL, + NULL +}; + +static struct dev_ops xpv_dv_ops = { + DEVO_REV, + 0, + xpv_getinfo, + nulldev, /* identify */ + nulldev, /* probe */ + xpv_attach, + xpv_detach, + nodev, /* reset */ + &xpv_cb_ops, + NULL, /* struct bus_ops */ + NULL /* power */ +}; + +static struct modldrv modldrv = { + &mod_driverops, + "xpv driver %I%", + &xpv_dv_ops +}; + +static struct modlinkage modl = { + MODREV_1, + { + (void *)&modldrv, + NULL /* null termination */ + } +}; + +static ddi_dma_attr_t xpv_dma_attr = { + DMA_ATTR_V0, /* version of this structure */ + 0, /* lowest usable address */ + 0xffffffffffffffffULL, /* highest usable address */ + 0x7fffffff, /* maximum DMAable byte count */ + MMU_PAGESIZE, /* alignment in bytes */ + 0x7ff, /* bitmap of burst sizes */ + 1, /* minimum transfer */ + 0xffffffffU, /* maximum transfer */ + 0x7fffffffULL, /* maximum segment length */ + 1, /* maximum number of segments */ + 1, /* granularity */ + 0, /* flags (reserved) */ +}; + +static ddi_device_acc_attr_t xpv_accattr = { + DDI_DEVICE_ATTR_V0, + DDI_NEVERSWAP_ACC, + DDI_STRICTORDER_ACC +}; + +#define MAX_ALLOCATIONS 10 +static ddi_dma_handle_t xpv_dma_handle[MAX_ALLOCATIONS]; +static ddi_acc_handle_t xpv_dma_acchandle[MAX_ALLOCATIONS]; +static int xen_alloc_cnt = 0; + +void * +xen_alloc_pages(pgcnt_t cnt) +{ + size_t len; + int a = xen_alloc_cnt++; + caddr_t addr; + + ASSERT(xen_alloc_cnt < MAX_ALLOCATIONS); + if (ddi_dma_alloc_handle(xpv_dip, &xpv_dma_attr, DDI_DMA_SLEEP, 0, + &xpv_dma_handle[a]) != DDI_SUCCESS) + return (NULL); + + if (ddi_dma_mem_alloc(xpv_dma_handle[a], MMU_PAGESIZE * cnt, + &xpv_accattr, DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, 0, + &addr, &len, &xpv_dma_acchandle[a]) != DDI_SUCCESS) { + ddi_dma_free_handle(&xpv_dma_handle[a]); + cmn_err(CE_WARN, "Couldn't allocate memory for xpv devices"); + return (NULL); + } + return (addr); +} + +/* + * This function is invoked twice, first time with reprogram=0 to set up + * the xpvd portion of the device tree. The second time it is ignored. + */ +static void +xpv_enumerate(int reprogram) +{ + dev_info_t *dip; + + if (reprogram != 0) + return; + + ndi_devi_alloc_sleep(ddi_root_node(), "xpvd", + (pnode_t)DEVI_SID_NODEID, &dip); + + (void) ndi_devi_bind_driver(dip, 0); + + /* + * Too early to enumerate split device drivers in domU + * since we need to create taskq thread during enumeration. + * So, we only enumerate softdevs and console here. + */ + xendev_enum_all(dip, B_TRUE); +} + +/* + * Translate a hypervisor errcode to a Solaris error code. 
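+ * Xen hands back negated errno-style values (hence the switch on
+ * -error below); each X_* code is mapped to its native equivalent.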
+ */
+int
+xen_xlate_errcode(int error)
+{
+#define	CASE(num)	case X_##num: error = num; break
+
+	switch (-error) {
+		CASE(EPERM);	CASE(ENOENT);	CASE(ESRCH);
+		CASE(EINTR);	CASE(EIO);	CASE(ENXIO);
+		CASE(E2BIG);	CASE(ENOMEM);	CASE(EACCES);
+		CASE(EFAULT);	CASE(EBUSY);	CASE(EEXIST);
+		CASE(ENODEV);	CASE(EISDIR);	CASE(EINVAL);
+		CASE(ENOSPC);	CASE(ESPIPE);	CASE(EROFS);
+		CASE(ENOSYS);	CASE(ENOTEMPTY); CASE(EISCONN);
+		CASE(ENODATA);
+	default:
+		panic("xen_xlate_errcode: unknown error %d", error);
+	}
+	return (error);
+#undef CASE
+}
+
+/*PRINTFLIKE1*/
+void
+xen_printf(const char *fmt, ...)
+{
+	va_list adx;
+
+	va_start(adx, fmt);
+	vprintf(fmt, adx);
+	va_end(adx);
+}
+
+/*
+ * Stub functions to get the FE drivers to build, and to catch drivers that
+ * misbehave in HVM domains.
+ */
+/*ARGSUSED*/
+void
+xen_release_pfn(pfn_t pfn, caddr_t va)
+{
+	panic("xen_release_pfn() is not supported in HVM domains");
+}
+
+/*ARGSUSED*/
+void
+reassign_pfn(pfn_t pfn, mfn_t mfn)
+{
+	panic("reassign_pfn() is not supported in HVM domains");
+}
+
+/*ARGSUSED*/
+long
+balloon_free_pages(uint_t page_cnt, mfn_t *mfns, caddr_t kva, pfn_t *pfns)
+{
+	panic("balloon_free_pages() is not supported in HVM domains");
+	return (0);
+}
+
+/*ARGSUSED*/
+void
+balloon_drv_added(int64_t delta)
+{
+	panic("balloon_drv_added() is not supported in HVM domains");
+}
+
+/*
+ * Add a mapping for the machine page at the given virtual address.
+ */
+void
+kbm_map_ma(maddr_t ma, uintptr_t va, uint_t level)
+{
+	ASSERT(level == 0);
+
+	hat_devload(kas.a_hat, (caddr_t)va, MMU_PAGESIZE,
+	    mmu_btop(ma), PROT_READ | PROT_WRITE, HAT_LOAD);
+}
+
+static uint64_t
+hvm_get_param(int param_id)
+{
+	struct xen_hvm_param xhp;
+
+	xhp.domid = DOMID_SELF;
+	xhp.index = param_id;
+	if ((HYPERVISOR_hvm_op(HVMOP_get_param, &xhp) < 0))
+		return (-1);
+	return (xhp.value);
+}
+
+static int
+xen_pv_init(dev_info_t *xpv_dip)
+{
+	struct cpuid_regs cp;
+	uint32_t xen_signature[4];
+	char *xen_str;
+	struct xen_add_to_physmap xatp;
+	xen_capabilities_info_t caps;
+	pfn_t pfn;
+	uint64_t msrval;
+	int err;
+
+	/*
+	 * Xen's pseudo-cpuid function 0x40000000 returns a string
+	 * representing the Xen signature in %ebx, %ecx, and %edx.
+	 * %eax contains the maximum supported cpuid function.
+	 */
+	cp.cp_eax = 0x40000000;
+	(void) __cpuid_insn(&cp);
+	xen_signature[0] = cp.cp_ebx;
+	xen_signature[1] = cp.cp_ecx;
+	xen_signature[2] = cp.cp_edx;
+	xen_signature[3] = 0;
+	xen_str = (char *)xen_signature;
+	if (strcmp("XenVMMXenVMM", xen_str) != 0 ||
+	    cp.cp_eax < 0x40000002) {
+		cmn_err(CE_WARN,
+		    "Attempting to load Xen drivers on non-Xen system");
+		return (-1);
+	}
+
+	/*
+	 * cpuid function 0x40000001 returns the Xen version in %eax. The
+	 * top 16 bits are the major version, the bottom 16 are the minor
+	 * version.
+	 */
+	cp.cp_eax = 0x40000001;
+	(void) __cpuid_insn(&cp);
+	xen_major = cp.cp_eax >> 16;
+	xen_minor = cp.cp_eax & 0xffff;
+	if (xen_major != 3 || xen_minor != 0) {
+		cmn_err(CE_WARN, "Xen version %d.%d is not supported",
+		    xen_major, xen_minor);
+		return (-1);
+	}
+
+	/*
+	 * cpuid function 0x40000002 returns information about the
+	 * hypercall page. %eax nominally contains the number of pages
+	 * with hypercall code, but according to the Xen guys, "I'll
+	 * guarantee that remains one forever more, so you can just
+	 * allocate a single page and get quite upset if you ever see CPUID
+	 * return more than one page." %ebx contains an MSR we use to ask
+	 * Xen to remap each page at a specific pfn.
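+	 *
+	 * Writing mmu_ptob(pfn) to that MSR, as done below, asks Xen to
+	 * place the hypercall stubs in the page backing hypercall_page.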
+ */ + cp.cp_eax = 0x40000002; + (void) __cpuid_insn(&cp); + + /* + * Let Xen know where we want the hypercall page mapped. We + * already have a page allocated in the .text section to simplify + * the wrapper code. + */ + pfn = hat_getpfnum(kas.a_hat, (caddr_t)&hypercall_page); + msrval = mmu_ptob(pfn); + wrmsr(cp.cp_ebx, msrval); + + /* Fill in the xen_info data */ + xen_info = kmem_zalloc(sizeof (start_info_t), KM_SLEEP); + (void) sprintf(xen_info->magic, "xen-%d.%d", xen_major, xen_minor); + xen_info->store_mfn = (mfn_t)hvm_get_param(HVM_PARAM_STORE_PFN); + xen_info->store_evtchn = (int)hvm_get_param(HVM_PARAM_STORE_EVTCHN); + + /* Figure out whether the hypervisor is 32-bit or 64-bit. */ + if ((HYPERVISOR_xen_version(XENVER_capabilities, &caps) == 0)) { + ((char *)(caps))[sizeof (caps) - 1] = '\0'; + if (strstr(caps, "x86_64") != NULL) + xen_is_64bit = 1; + else if (strstr(caps, "x86_32") != NULL) + xen_is_64bit = 0; + } + if (xen_is_64bit < 0) { + cmn_err(CE_WARN, "Couldn't get capability info from Xen."); + return (-1); + } +#ifdef __amd64 + ASSERT(xen_is_64bit == 1); +#endif + + /* + * Allocate space for the shared_info page and tell Xen where it + * is. + */ + HYPERVISOR_shared_info = xen_alloc_pages(1); + xatp.domid = DOMID_SELF; + xatp.idx = 0; + xatp.space = XENMAPSPACE_shared_info; + xatp.gpfn = hat_getpfnum(kas.a_hat, (caddr_t)HYPERVISOR_shared_info); + if ((err = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) != 0) { + cmn_err(CE_WARN, "Could not get shared_info page from Xen." + " error: %d", err); + return (-1); + } + + /* Set up the grant tables. */ + gnttab_init(); + + /* Set up event channel support */ + if (ec_init(xpv_dip) != 0) + return (-1); + + /* Set up xenbus */ + xb_addr = vmem_alloc(heap_arena, MMU_PAGESIZE, VM_SLEEP); + xs_early_init(); + xs_domu_init(); + + return (0); +} + +static void +xen_pv_fini() +{ + if (xen_info != NULL) + kmem_free(xen_info, sizeof (start_info_t)); + ec_fini(); +} + +/*ARGSUSED*/ +static int +xpv_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) +{ + if (getminor((dev_t)arg) != XPV_MINOR) + return (DDI_FAILURE); + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = xpv_dip; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = 0; + break; + default: + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +static int +xpv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + if (ddi_create_minor_node(dip, ddi_get_name(dip), S_IFCHR, + ddi_get_instance(dip), DDI_PSEUDO, 0) != DDI_SUCCESS) + return (DDI_FAILURE); + + xpv_dip = dip; + + if (xen_pv_init(dip) != 0) + return (DDI_FAILURE); + + ddi_report_dev(dip); + + /* + * If the memscrubber attempts to scrub the pages we hand to Xen, + * the domain will panic. + */ + memscrub_disable(); + + return (DDI_SUCCESS); +} + +/* + * Attempts to reload the PV driver plumbing hang on Intel platforms, so + * we don't want to unload the framework by accident. + */ +int xpv_allow_detach = 0; + +static int +xpv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH || xpv_allow_detach == 0) + return (DDI_FAILURE); + + if (xpv_dip != NULL) { + xen_pv_fini(); + ddi_remove_minor_node(dip, NULL); + xpv_dip = NULL; + } + + return (DDI_SUCCESS); +} + +/*ARGSUSED1*/ +static int +xpv_open(dev_t *dev, int flag, int otyp, cred_t *cr) +{ + return (getminor(*dev) == XPV_MINOR ? 
0 : ENXIO); +} + +/*ARGSUSED*/ +static int +xpv_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cr, + int *rval_p) +{ + return (EINVAL); +} + +int +_init(void) +{ + int err; + + if ((err = mod_install(&modl)) != 0) + return (err); + + impl_bus_add_probe(xpv_enumerate); + return (0); +} + +int +_fini(void) +{ + int err; + + if ((err = mod_remove(&modl)) != 0) + return (err); + + impl_bus_delete_probe(xpv_enumerate); + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modl, modinfop)); +} diff --git a/usr/src/uts/i86pc/os/cpuid.c b/usr/src/uts/i86pc/os/cpuid.c index 731e885508..822ec8a4fa 100644 --- a/usr/src/uts/i86pc/os/cpuid.c +++ b/usr/src/uts/i86pc/os/cpuid.c @@ -564,6 +564,34 @@ cpuid_free_space(cpu_t *cpu) kmem_free(cpu->cpu_m.mcpu_cpi, sizeof (*cpu->cpu_m.mcpu_cpi)); } +#if !defined(__xpv) + +static void +check_for_hvm() +{ + struct cpuid_regs cp; + char *xen_str; + uint32_t xen_signature[4]; + extern int xpv_is_hvm; + + /* + * In a fully virtualized domain, Xen's pseudo-cpuid function + * 0x40000000 returns a string representing the Xen signature in + * %ebx, %ecx, and %edx. %eax contains the maximum supported cpuid + * function. + */ + cp.cp_eax = 0x40000000; + (void) __cpuid_insn(&cp); + xen_signature[0] = cp.cp_ebx; + xen_signature[1] = cp.cp_ecx; + xen_signature[2] = cp.cp_edx; + xen_signature[3] = 0; + xen_str = (char *)xen_signature; + if (strcmp("XenVMMXenVMM", xen_str) == 0 && cp.cp_eax <= 0x40000002) + xpv_is_hvm = 1; +} +#endif /* __xpv */ + uint_t cpuid_pass1(cpu_t *cpu) { @@ -1227,6 +1255,9 @@ cpuid_pass1(cpu_t *cpu) synth_info(cpi); pass1_done: +#if !defined(__xpv) + check_for_hvm(); +#endif cpi->cpi_pass = 1; return (feature); } @@ -3674,7 +3705,6 @@ void patch_tsc_read(int flag) { size_t cnt; - switch (flag) { case X86_NO_TSC: cnt = &_no_rdtsc_end - &_no_rdtsc_start; diff --git a/usr/src/uts/i86pc/os/mlsetup.c b/usr/src/uts/i86pc/os/mlsetup.c index 3f5705bbc6..f33f60e320 100644 --- a/usr/src/uts/i86pc/os/mlsetup.c +++ b/usr/src/uts/i86pc/os/mlsetup.c @@ -105,6 +105,9 @@ mlsetup(struct regs *rp) extern disp_t cpu0_disp; extern char t0stack[]; int boot_ncpus; +#if !defined(__xpv) + extern int xpv_is_hvm; +#endif ASSERT_STACK_ALIGNED(); @@ -176,8 +179,11 @@ mlsetup(struct regs *rp) * Note: tsc_read is not patched for x86 processors which do * not support "mfence". By default tsc_read will use cpuid for * serialization in such cases. + * + * The Xen hypervisor does not correctly report whether rdtscp is + * supported or not, so we must assume that it is not. */ - if (x86_feature & X86_TSCP) + if (xpv_is_hvm == 0 && (x86_feature & X86_TSCP)) patch_tsc_read(X86_HAVE_TSCP); else if (cpuid_getvendor(CPU) == X86_VENDOR_AMD && cpuid_getfamily(CPU) <= 0xf && (x86_feature & X86_SSE2) != 0) diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c index 861c1e0f3e..30864285af 100644 --- a/usr/src/uts/i86pc/os/startup.c +++ b/usr/src/uts/i86pc/os/startup.c @@ -521,6 +521,11 @@ static page_t *rd_pages; struct system_hardware system_hardware; /* + * Is this Solaris instance running in a fully virtualized xVM domain? + */ +int xpv_is_hvm = 0; + +/* * Enable some debugging messages concerning memory usage... */ static void @@ -1339,6 +1344,36 @@ startup_kmem(void) PRM_POINT("startup_kmem() done"); } +#ifndef __xpv +/* + * If we have detected that we are running in an HVM environment, we need + * to prepend the PV driver directory to the module search path. 
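+ * Prepending (rather than appending) means that the i86hvm copy of a
+ * module is always found ahead of the standard copy.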
+ */
+#define	HVM_MOD_DIR "/platform/i86hvm/kernel"
+static void
+update_default_path()
+{
+	char *current, *newpath;
+	int newlen;
+
+	/*
+	 * We are about to resync with krtld. krtld will reset its
+	 * internal module search path iff Solaris has set default_path.
+	 * We want to be sure we're prepending this new directory to the
+	 * right search path.
+	 */
+	current = (default_path == NULL) ? kobj_module_path : default_path;
+
+	newlen = strlen(HVM_MOD_DIR) + strlen(current) + 2;
+	newpath = kmem_alloc(newlen, KM_SLEEP);
+	(void) strcpy(newpath, HVM_MOD_DIR);
+	(void) strcat(newpath, " ");
+	(void) strcat(newpath, current);
+
+	default_path = newpath;
+}
+#endif
+
 static void
 startup_modules(void)
 {
@@ -1355,6 +1390,9 @@ startup_modules(void)
 	 * caused the drv_usecwait to be way too short.
 	 */
 	microfind();
+
+	if (xpv_is_hvm)
+		update_default_path();
 #endif

 /*
diff --git a/usr/src/uts/i86pc/sys/xpv_support.h b/usr/src/uts/i86pc/sys/xpv_support.h
new file mode 100644
index 0000000000..c42551b4f8
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/xpv_support.h
@@ -0,0 +1,91 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */ + +#ifndef _SYS_XPV_SUPPORT_H +#define _SYS_XPV_SUPPORT_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#define __XEN_INTERFACE_VERSION__ __XEN_LATEST_INTERFACE_VERSION__ + +#if !defined(_ASM) + +#include <sys/types.h> +#include <sys/inttypes.h> +#include <sys/dditypes.h> + +typedef ulong_t mfn_t; +typedef uint64_t maddr_t; +#define mfn_to_ma(mfn) ((maddr_t)(mfn) << MMU_PAGESHIFT) +#define MFN_INVALID (-(mfn_t)1) + +#define IPL_DEBUG 15 /* domain debug interrupt */ +#define IPL_CONS 9 +#define IPL_VIF 6 +#define IPL_VBD 5 +#define IPL_EVTCHN 1 + +#define INVALID_EVTCHN 0 + +typedef uint_t (*ec_handler_fcn_t)(); + +extern int ec_init(dev_info_t *); +extern void ec_fini(); +extern void ec_bind_evtchn_to_handler(int, pri_t, ec_handler_fcn_t, void *); +extern void ec_unbind_evtchn(int); +extern void ec_notify_via_evtchn(uint_t); +extern void hypervisor_mask_event(uint_t); +extern void hypervisor_unmask_event(uint_t); + +extern int xen_bind_interdomain(int, int, int *); +extern int xen_alloc_unbound_evtchn(int, int *); +extern int xen_xlate_errcode(int error); +extern void *xen_alloc_pages(pgcnt_t cnt); +extern void kbm_map_ma(maddr_t ma, uintptr_t va, uint_t level); + +/* + * Stub functions to allow the FE drivers to build without littering them + * with #ifdefs + */ +extern void balloon_drv_added(int64_t); +extern long balloon_free_pages(uint_t, mfn_t *, caddr_t, pfn_t *); +extern void xen_release_pfn(pfn_t, caddr_t); +extern void reassign_pfn(pfn_t, mfn_t); + +extern int xen_is_64bit; + +#define IN_XPV_PANIC() (__lintzero) + +#ifdef __cplusplus +} +#endif + +#endif /* __ASM */ +#endif /* _SYS_XPV_SUPPORT_H */ diff --git a/usr/src/uts/i86pc/vm/hat_i86.c b/usr/src/uts/i86pc/vm/hat_i86.c index 457cd5662d..a3a7957faa 100644 --- a/usr/src/uts/i86pc/vm/hat_i86.c +++ b/usr/src/uts/i86pc/vm/hat_i86.c @@ -326,7 +326,6 @@ hat_alloc(struct as *as) } init_done: - XPV_ALLOW_MIGRATE(); #if defined(__xpv) /* @@ -337,6 +336,7 @@ init_done: xen_pin(hat->hat_user_ptable, mmu.max_level); #endif #endif + XPV_ALLOW_MIGRATE(); /* * Put it at the start of the global list of all hats (used by stealing) @@ -3815,6 +3815,7 @@ hat_mempte_setup(caddr_t addr) ASSERT(IS_PAGEALIGNED(va)); ASSERT(!IN_VA_HOLE(va)); ++curthread->t_hatdepth; + XPV_DISALLOW_MIGRATE(); ht = htable_getpte(kas.a_hat, va, &entry, &oldpte, 0); if (ht == NULL) { ht = htable_create(kas.a_hat, va, 0, NULL); @@ -3835,6 +3836,7 @@ hat_mempte_setup(caddr_t addr) * return the PTE physical address to the caller. 
 */
 	htable_release(ht);
+	XPV_ALLOW_MIGRATE();
 	p = PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry);
 	--curthread->t_hatdepth;
 	return (p);
@@ -3850,6 +3852,7 @@ hat_mempte_release(caddr_t addr, hat_mempte_t pte_pa)
 {
 	htable_t *ht;

+	XPV_DISALLOW_MIGRATE();
 	/*
 	 * invalidate any left over mapping and decrement the htable valid count
 	 */
@@ -3878,6 +3881,7 @@ hat_mempte_release(caddr_t addr, hat_mempte_t pte_pa)
 	ASSERT(ht->ht_level == 0);
 	HTABLE_DEC(ht->ht_valid_cnt);
 	htable_release(ht);
+	XPV_ALLOW_MIGRATE();
 }

 /*
@@ -4266,7 +4270,9 @@ void
 hat_prepare_mapping(hat_t *hat, caddr_t addr)
 {
 	ASSERT(IS_P2ALIGNED((uintptr_t)addr, MMU_PAGESIZE));
+	XPV_DISALLOW_MIGRATE();
 	(void) htable_create(hat, (uintptr_t)addr, 0, NULL);
+	XPV_ALLOW_MIGRATE();
 }

 void
@@ -4275,10 +4281,12 @@ hat_release_mapping(hat_t *hat, caddr_t addr)
 	htable_t *ht;

 	ASSERT(IS_P2ALIGNED((uintptr_t)addr, MMU_PAGESIZE));
+	XPV_DISALLOW_MIGRATE();
 	ht = htable_lookup(hat, (uintptr_t)addr, 0);
 	ASSERT(ht != NULL);
 	ASSERT(ht->ht_busy >= 2);
 	htable_release(ht);
 	htable_release(ht);
-}
+	XPV_ALLOW_MIGRATE();
+}
 #endif
diff --git a/usr/src/uts/i86pc/xnf/Makefile b/usr/src/uts/i86pc/xnf/Makefile
new file mode 100644
index 0000000000..f582e85990
--- /dev/null
+++ b/usr/src/uts/i86pc/xnf/Makefile
@@ -0,0 +1,98 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#

+#
+# uts/i86pc/xnf/Makefile
+#
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+# This makefile drives the production of the xnf
+# network driver kernel module.
+#
+# i86pc architecture dependent
+#

+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE = ../..

+#
+# Define the module and object file sets.
+#
+MODULE		= xnf
+OBJECTS		= $(XNF_OBJS:%=$(OBJS_DIR)/%)
+LINTS		= $(XNF_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE	= $(ROOT_HVM_DRV_DIR)/$(MODULE)

+INC_PATH	+= -I$(UTSBASE)/common/xen

+#
+# Include common rules.
+#
+include $(UTSBASE)/i86pc/Makefile.i86pc
+include $(UTSBASE)/i86pc/Makefile.hvm

+#
+# Define targets
+#
+ALL_TARGET	= $(BINARY)
+LINT_TARGET	= $(MODULE).lint
+INSTALL_TARGET	= $(BINARY) $(ROOTMODULE)

+#
+# Driver depends on MAC & IP
+#
+LDFLAGS		+= -dy -Nmisc/mac -Ndrv/ip -Ndrv/xpvd -Ndrv/xpv

+CPPFLAGS	+= -D_SOLARIS
+LINTTAGS	+= -erroff=E_ASSIGN_NARROW_CONV
+LINTTAGS	+= -erroff=E_PTRDIFF_OVERFLOW
+LINTTAGS	+= -erroff=E_BAD_PTR_CAST_ALIGN

+#
+# Default build targets.
+#
+.KEEP_STATE:

+def:		$(DEF_DEPS)

+all:		$(ALL_DEPS)

+clean:		$(CLEAN_DEPS)

+clobber:	$(CLOBBER_DEPS)

+lint:		$(LINT_DEPS)

+modlintlib:	$(MODLINTLIB_DEPS)

+clean.lint:	$(CLEAN_LINT_DEPS)

+install:	$(INSTALL_DEPS)

+#
+# Include common targets.
+# +include $(UTSBASE)/i86pc/Makefile.targ diff --git a/usr/src/uts/i86pc/xpv/Makefile b/usr/src/uts/i86pc/xpv/Makefile new file mode 100644 index 0000000000..7f859166c9 --- /dev/null +++ b/usr/src/uts/i86pc/xpv/Makefile @@ -0,0 +1,101 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# uts/i86pc/xpv/Makefile +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +#ident "%Z%%M% %I% %E% SMI" +# +# This makefile drives the production of the xpv +# driver, which provides the necessary infrastructure for +# paravirtualized front-end drivers in HVM systems. +# +# i86pc implementation architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = xpv +OBJECTS = $(XPV_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(XPV_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_HVM_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/i86pc/io/xpv + +INC_PATH += -I$(UTSBASE)/common/xen -I$(UTSBASE)/../common + +# +# Include common rules. +# +include $(UTSBASE)/i86pc/Makefile.i86pc +include $(UTSBASE)/i86pc/Makefile.hvm + +# +# Define targets +# +ALL_TARGET = $(BINARY) $(CONFMOD) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) + +CPPFLAGS += -D_SOLARIS +LDFLAGS += -dy -N mach/pcplusmp + +# +# The Xen header files do not lint cleanly. Since the troublesome +# structures form part of the externally defined interface to the +# hypervisor, we're stuck with the noise. +# +LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN +LINTTAGS += -erroff=E_SUPPRESSION_DIRECTIVE_UNUSED +LINTTAGS += -erroff=E_ASSIGN_NARROW_CONV + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/i86pc/Makefile.targ diff --git a/usr/src/uts/i86pc/xpvd/Makefile b/usr/src/uts/i86pc/xpvd/Makefile new file mode 100644 index 0000000000..01e515daf4 --- /dev/null +++ b/usr/src/uts/i86pc/xpvd/Makefile @@ -0,0 +1,92 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +#ident "%Z%%M% %I% %E% SMI" +# +# This makefile drives the production of the xpvd nexus driver +# +# i86pc implementation architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = xpvd +OBJECTS = $(XPVD_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(XPVD_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_HVM_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/common/xen/io + +# +# Include common rules. +# +include $(UTSBASE)/i86pc/Makefile.i86pc +include $(UTSBASE)/i86pc/Makefile.hvm + +# +# Define targets +# +ALL_TARGET = $(BINARY) $(CONFMOD) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) + +INC_PATH += -I$(UTSBASE)/common/xen -I$(UTSBASE)/../common + +LDFLAGS += -dy -Ndrv/xpv + +LINTTAGS += -erroff=E_STATIC_UNUSED +LINTTAGS += -erroff=E_ASSIGN_NARROW_CONV +LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/i86pc/Makefile.targ diff --git a/usr/src/uts/i86xpv/Makefile.files b/usr/src/uts/i86xpv/Makefile.files index 221c580e2c..927414c3e8 100644 --- a/usr/src/uts/i86xpv/Makefile.files +++ b/usr/src/uts/i86xpv/Makefile.files @@ -209,7 +209,8 @@ XDB_OBJS += xdb.o # # Build up defines and paths. 
#
-INC_PATH	+= -I$(UTSBASE)/i86xpv -I$(UTSBASE)/i86pc -I$(SRC)/common
+INC_PATH	+= -I$(UTSBASE)/i86xpv -I$(UTSBASE)/i86pc -I$(SRC)/common \
+	-I$(UTSBASE)/common/xen

#
# Since the assym files are derived, the dependencies must be explicit for
diff --git a/usr/src/uts/i86xpv/Makefile.rules b/usr/src/uts/i86xpv/Makefile.rules
index 63fec2422d..7b758fd3f6 100644
--- a/usr/src/uts/i86xpv/Makefile.rules
+++ b/usr/src/uts/i86xpv/Makefile.rules
@@ -171,6 +171,9 @@ DBOOT_ASFLAGS = $(DBOOT_AS_XARCH_$(CLASS)) -P -D_ASM

DBOOT_LINTFLAGS_i86xpv = $(LINTFLAGS_i386_$(CLASS)) $(LINTTAGS_i386_$(CLASS))

+$(DBOOT_OBJS_DIR)/%.o: $(UTSBASE)/common/xen/os/%.c
+	$(CC) $(DBOOT_CFLAGS) $(DBOOT_DEFS) $(DBOOT_CC_INCL) -c -o $@ $<
+
$(DBOOT_OBJS_DIR)/%.o: $(UTSBASE)/i86xpv/boot/%.c
	$(CC) $(DBOOT_CFLAGS) $(DBOOT_DEFS) $(DBOOT_CC_INCL) -c -o $@ $<

@@ -186,6 +189,9 @@ $(DBOOT_OBJS_DIR)/%.o: $(COMMONBASE)/util/%.c
$(DBOOT_OBJS_DIR)/%.o: $(UTSBASE)/i86xpv/os/%.c
	$(CC) $(DBOOT_CFLAGS) $(DBOOT_DEFS) $(DBOOT_CC_INCL) -c -o $@ $<

+$(DBOOT_OBJS_DIR)/%.o: $(UTSBASE)/intel/ia32/ml/%.s
+	$(AS) $(DBOOT_ASFLAGS) $(DBOOT_DEFS) $(DBOOT_AS_INCL) -o $@ $<
+
$(DBOOT_OBJS_DIR)/%.o: $(COMMONBASE)/util/i386/%.s
	$(AS) $(DBOOT_ASFLAGS) $(DBOOT_DEFS) $(DBOOT_AS_INCL) -o $@ $<

@@ -220,6 +226,12 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/xen/io/%.c
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/xen/os/%.c
	@($(LHEAD) $(LINT.c) $< $(LTAIL))

+$(DBOOT_LINTS_DIR)/%.ln: $(UTSBASE)/intel/ia32/ml/%.s
+	@($(LHEAD) $(DBOOT_LINT) $(DBOOT_LOCAL_LINTFLAGS) $< $(LTAIL))
+
+$(DBOOT_LINTS_DIR)/%.ln: $(UTSBASE)/common/xen/os/%.c
+	@($(LHEAD) $(DBOOT_LINT) $(DBOOT_LOCAL_LINTFLAGS) $< $(LTAIL))
+
$(DBOOT_LINTS_DIR)/%.ln: $(UTSBASE)/i86xpv/os/%.c
	@($(LHEAD) $(DBOOT_LINT) $(DBOOT_LOCAL_LINTFLAGS) $< $(LTAIL))

diff --git a/usr/src/uts/i86xpv/os/xen_mmu.c b/usr/src/uts/i86xpv/os/xen_mmu.c
index eb9b6e07d9..4983e1fb62 100644
--- a/usr/src/uts/i86xpv/os/xen_mmu.c
+++ b/usr/src/uts/i86xpv/os/xen_mmu.c
@@ -55,12 +55,18 @@ caddr_t xb_addr;	/* virtual addr for the store_mfn page */

/*
- * Running on the hypervisor, we need to prevent migration while holding
- * PTE values that we might do PTE2PFN() or pa_to_ma() on, as the
- * mfn_to_pfn_mapping and mfn_list[] translation tables might change.
+ * We need to prevent migration or suspension of a domU while it's
+ * manipulating MFN values, as the MFN values will spontaneously
+ * change. The next 4 routines provide a mechanism for that.
+ * The basic idea is a reader/writer lock: readers are the threads that
+ * are manipulating MFNs; only the thread that is going to actually call
+ * HYPERVISOR_suspend() becomes a writer.
 *
- * As the suspend process uses the HAT, we need to check we don't already own
- * the lock as a writer before we try to take it as a reader.
+ * Since various places need to manipulate MFNs and also call the HAT,
+ * we track whether a thread has acquired reader status and allow it to
+ * do so again recursively. This prevents deadlocks when a migration
+ * request starts and waits for some reader, but that reader then needs
+ * to call into the HAT.
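+ *
+ * curthread->t_xpvcntr tracks the per-thread recursion depth: only the
+ * 0 -> 1 transition takes the lock, and only the 1 -> 0 transition
+ * drops it.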
 */
#define	NUM_M2P_LOCKS 128
static struct {
@@ -74,7 +80,7 @@ void
xen_block_migrate(void)
{
	if (!DOMAIN_IS_INITDOMAIN(xen_info) &&
-	    rw_owner(&m2p_lock[XM2P_HASH].m2p_rwlock) != curthread)
+	    ++curthread->t_xpvcntr == 1)
		rw_enter(&m2p_lock[XM2P_HASH].m2p_rwlock, RW_READER);
}

@@ -82,7 +88,7 @@ void
xen_allow_migrate(void)
{
	if (!DOMAIN_IS_INITDOMAIN(xen_info) &&
-	    rw_owner(&m2p_lock[XM2P_HASH].m2p_rwlock) != curthread)
+	    --curthread->t_xpvcntr == 0)
		rw_exit(&m2p_lock[XM2P_HASH].m2p_rwlock);
}

@@ -91,6 +97,8 @@ xen_start_migrate(void)
{
	int i;

+	ASSERT(curthread->t_xpvcntr == 0);
+	++curthread->t_xpvcntr;	/* this allows calls into HAT */
	for (i = 0; i < NUM_M2P_LOCKS; ++i)
		rw_enter(&m2p_lock[i].m2p_rwlock, RW_WRITER);
}
@@ -102,6 +110,8 @@ xen_end_migrate(void)

	for (i = 0; i < NUM_M2P_LOCKS; ++i)
		rw_exit(&m2p_lock[i].m2p_rwlock);
+	ASSERT(curthread->t_xpvcntr == 1);
+	--curthread->t_xpvcntr;
}

/*ARGSUSED*/
diff --git a/usr/src/uts/i86xpv/sys/Makefile b/usr/src/uts/i86xpv/sys/Makefile
index 2b9f5507bc..f559679a17 100644
--- a/usr/src/uts/i86xpv/sys/Makefile
+++ b/usr/src/uts/i86xpv/sys/Makefile
@@ -39,9 +39,7 @@ FILEMODE = 644

HDRS= \
	balloon.h \
-	hypervisor.h \
	machprivregs.h \
-	xen_errno.h \
	xen_mmu.h \
	xpv_impl.h

diff --git a/usr/src/uts/i86xpv/ml/hypersubr.s b/usr/src/uts/intel/ia32/ml/hypersubr.s
index f81536f438..c50d24f7d1 100644
--- a/usr/src/uts/i86xpv/ml/hypersubr.s
+++ b/usr/src/uts/intel/ia32/ml/hypersubr.s
@@ -27,18 +27,10 @@

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/asm_linkage.h>
-#include <sys/hypervisor.h>
-
-/*
- * XXPV grr - assembler can't deal with an instruction in a quoted string
- */
-#undef TRAP_INSTR	/* cause it's currently "int $0x82" */
-
-#if defined(__amd64)
-#define	TRAP_INSTR	syscall
-#elif defined(__i386)
-#define	TRAP_INSTR	int $0x82
+#ifdef XPV_HVM_DRIVER
+#include <sys/xpv_support.h>
#endif
+#include <sys/hypervisor.h>

/*
 * Hypervisor "system calls"
@@ -125,6 +117,63 @@ __hypercall5_int(int callnum,

#else	/* __lint */

+/*
+ * XXPV grr - assembler can't deal with an instruction in a quoted string
+ */
+#undef TRAP_INSTR	/* cause it's currently "int $0x82" */
+
+/*
+ * The method for issuing a hypercall (i.e. a system call to the
+ * hypervisor) varies from platform to platform. In 32-bit PV domains, an
+ * 'int 82' triggers the call. In 64-bit PV domains, a 'syscall' does the
+ * trick.
+ *
+ * HVM domains are more complicated. In all cases we want to execute an
+ * instruction that traps into the hypervisor (VMCALL on Intel CPUs,
+ * VMMCALL on AMD), so the opcode varies with the CPU vendor. Rather than
+ * build CPU-specific modules with the different opcodes, we use the
+ * 'hypercall page' provided by Xen. This page contains a collection of
+ * code stubs that do nothing except issue hypercalls using the proper
+ * instructions for this machine. To keep the wrapper code as simple and
+ * efficient as possible, we preallocate that page below. When the module
+ * is loaded, we ask Xen to remap the underlying PFN to that of the
+ * hypercall page.
+ *
+ * Note: this same mechanism could be used in PV domains, but using the
+ * hypercall page requires a call and several more instructions than simply
+ * issuing the proper trap.
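+ *
+ * The stubs are laid out at 32-byte intervals, which is why the HVM
+ * TRAP_INSTR sequences below shift the hypercall number left by 5 to
+ * form an offset into hypercall_page.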
+ */ +#if defined(XPV_HVM_DRIVER) + +#define HYPERCALL_PAGESIZE 0x1000 + .text + .align HYPERCALL_PAGESIZE + .globl hypercall_page + .type hypercall_page, @function +hypercall_page: + .skip HYPERCALL_PAGESIZE + .size hypercall_page, HYPERCALL_PAGESIZE +#if defined(__amd64) +#define TRAP_INSTR \ + shll $5, %eax; \ + addq $hypercall_page, %rax; \ + jmp *%rax +#else +#define TRAP_INSTR \ + shll $5, %eax; \ + addl $hypercall_page, %eax; \ + call *%eax +#endif + +#else /* XPV_HVM_DRIVER */ + +#if defined(__amd64) +#define TRAP_INSTR syscall +#elif defined(__i386) +#define TRAP_INSTR int $0x82 +#endif +#endif /* XPV_HVM_DRIVER */ + + #if defined(__amd64) ENTRY_NP(__hypercall0) diff --git a/usr/src/uts/intel/os/name_to_major b/usr/src/uts/intel/os/name_to_major index de3388721c..56e10ff720 100644 --- a/usr/src/uts/intel/os/name_to_major +++ b/usr/src/uts/intel/os/name_to_major @@ -117,6 +117,7 @@ kssl 185 mc-amd 186 tzmon 187 intel_nb5000 188 +xpv 190 xpvd 191 xnf 192 xdf 193 diff --git a/usr/src/uts/intel/sys/Makefile b/usr/src/uts/intel/sys/Makefile index 3296cb5735..0d523cc8ac 100644 --- a/usr/src/uts/intel/sys/Makefile +++ b/usr/src/uts/intel/sys/Makefile @@ -47,6 +47,7 @@ HDRS = \ fp.h \ frame.h \ inline.h \ + hypervisor.h \ kd.h \ kdi_machimpl.h \ kdi_regs.h \ @@ -92,7 +93,8 @@ HDRS = \ ucontext.h \ utrap.h \ vmparam.h \ - x86_archext.h + x86_archext.h \ + xen_errno.h CLOSEDHDRS = \ memtest.h \ diff --git a/usr/src/uts/i86xpv/sys/hypervisor.h b/usr/src/uts/intel/sys/hypervisor.h index 2810c83b1c..9f5aadd499 100644 --- a/usr/src/uts/i86xpv/sys/hypervisor.h +++ b/usr/src/uts/intel/sys/hypervisor.h @@ -58,14 +58,20 @@ extern "C" { #endif +#ifdef XPV_HVM_DRIVER +#include <sys/xpv_support.h> +#else #include <sys/xpv_impl.h> +#endif #include <sys/xen_errno.h> #if !defined(_ASM) #include <sys/processor.h> #include <sys/cpuvar.h> +#ifndef XPV_HVM_DRIVER #include <sys/xen_mmu.h> +#endif #include <sys/systm.h> #include <xen/public/callback.h> #include <xen/public/event_channel.h> @@ -133,12 +139,21 @@ extern void xen_disable_user_iopl(void); /* * A quick way to ask if we're DOM0 or not .. */ +#ifdef XPV_HVM_DRIVER + +#define DOMAIN_IS_INITDOMAIN(info) (__lintzero) +#define DOMAIN_IS_PRIVILEGED(info) (__lintzero) + +#else + #define DOMAIN_IS_INITDOMAIN(info) \ (((info)->flags & SIF_INITDOMAIN) == SIF_INITDOMAIN) #define DOMAIN_IS_PRIVILEGED(info) \ (((info)->flags & SIF_PRIVILEGED) == SIF_PRIVILEGED) +#endif + /* * start of day information passed up from the hypervisor */ diff --git a/usr/src/uts/i86xpv/sys/xen_errno.h b/usr/src/uts/intel/sys/xen_errno.h index 35a6586eaf..35a6586eaf 100644 --- a/usr/src/uts/i86xpv/sys/xen_errno.h +++ b/usr/src/uts/intel/sys/xen_errno.h |