author    Andy Fiddaman <omnios@citrus-it.co.uk>    2020-12-21 14:44:26 +0000
committer Andy Fiddaman <omnios@citrus-it.co.uk>    2021-02-02 23:16:11 +0000
commit    6960cd891105f9a002a0327e31a6182f9c6de88e (patch)
tree      12af9c786c75f0a9273388aad743d471590100bc
parent    ce8560eeb961d528e27685fcdd2ffb03e9478dbf (diff)
download  illumos-joyent-6960cd891105f9a002a0327e31a6182f9c6de88e.tar.gz
13379 bhyve upstream sync 2020 Dec
Reviewed by: Patrick Mooney <pmooney@pfmooney.com>
Reviewed by: Jorge Schrauwen <sjorge@blackdot.be>
Approved by: Robert Mustacchi <rm@fingolfin.org>
-rw-r--r--  usr/src/cmd/bhyve/Makefile | 3
-rw-r--r--  usr/src/cmd/bhyve/README.sync | 45
-rw-r--r--  usr/src/cmd/bhyve/bhyverun.c | 17
-rw-r--r--  usr/src/cmd/bhyve/pci_ahci.c | 308
-rw-r--r--  usr/src/cmd/bhyve/pci_e82545.c | 10
-rw-r--r--  usr/src/cmd/bhyve/pci_emul.c | 24
-rw-r--r--  usr/src/cmd/bhyve/pci_emul.h | 2
-rw-r--r--  usr/src/cmd/bhyve/pci_lpc.c | 23
-rw-r--r--  usr/src/cmd/bhyve/pci_nvme.c | 1979
-rw-r--r--  usr/src/cmd/bhyve/pci_passthru.c | 6
-rw-r--r--  usr/src/cmd/bhyve/pci_virtio_block.c | 2
-rw-r--r--  usr/src/cmd/bhyve/pci_virtio_net.c | 22
-rw-r--r--  usr/src/cmd/bhyve/pci_xhci.c | 7
-rw-r--r--  usr/src/cmd/bhyve/pctestdev.c | 270
-rw-r--r--  usr/src/cmd/bhyve/pctestdev.h | 43
-rw-r--r--  usr/src/cmd/bhyve/pm.c | 2
-rw-r--r--  usr/src/cmd/bhyve/smbiostbl.c | 26
-rw-r--r--  usr/src/cmd/bhyve/usb_mouse.c | 18
-rw-r--r--  usr/src/contrib/bhyve/dev/nvme/nvme.h | 677
-rw-r--r--  usr/src/contrib/bhyve/sys/ata.h | 68
-rw-r--r--  usr/src/lib/libvmmapi/common/mapfile-vers | 14
-rw-r--r--  usr/src/lib/libvmmapi/common/vmmapi.c | 27
-rw-r--r--  usr/src/lib/libvmmapi/common/vmmapi.h | 2
-rw-r--r--  usr/src/uts/i86pc/io/vmm/README.sync | 37
-rw-r--r--  usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c | 4
-rw-r--r--  usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c | 1
-rw-r--r--  usr/src/uts/i86pc/io/vmm/amd/svm.c | 1
-rw-r--r--  usr/src/uts/i86pc/io/vmm/amd/svm_support.s | 12
-rw-r--r--  usr/src/uts/i86pc/io/vmm/io/ppt.c | 121
-rw-r--r--  usr/src/uts/i86pc/io/vmm/io/ppt.h | 1
-rw-r--r--  usr/src/uts/i86pc/io/vmm/io/ppt.mapfile | 1
-rw-r--r--  usr/src/uts/i86pc/io/vmm/io/vatpit.c | 2
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c | 11
-rw-r--r--  usr/src/uts/i86pc/sys/vmm_dev.h | 1
-rw-r--r--  usr/src/uts/intel/ia32/ml/modstubs.s | 1
35 files changed, 2709 insertions, 1079 deletions
diff --git a/usr/src/cmd/bhyve/Makefile b/usr/src/cmd/bhyve/Makefile
index 2229cdf454..1c1b99c52b 100644
--- a/usr/src/cmd/bhyve/Makefile
+++ b/usr/src/cmd/bhyve/Makefile
@@ -61,6 +61,7 @@ SRCS = acpi.c \
pci_virtio_rnd.c \
pci_virtio_viona.c \
pci_xhci.c \
+ pctestdev.c \
pm.c \
post.c \
ps2kbd.c \
@@ -97,7 +98,7 @@ SRCS = acpi.c \
#pci_hda.c \
# The bhyve generic net-backend stuff has been ignored by us at the moment
-# because SmartOS users prefer to use viona for its superior network perf.
+# because illumos users prefer to use viona for its superior network perf.
#net_backends.c \
diff --git a/usr/src/cmd/bhyve/README.sync b/usr/src/cmd/bhyve/README.sync
index af90209ac3..2384413853 100644
--- a/usr/src/cmd/bhyve/README.sync
+++ b/usr/src/cmd/bhyve/README.sync
@@ -1,15 +1,42 @@
+Git commit hashes in this file refer to the official FreeBSD distributed
+public Git repository at https://git.freebsd.org/src.git
+
The bhyve kernel module and its associated userland consumers have been updated
-to the latest upstream FreeBSD sources as documented in
+to the latest upstream FreeBSD sources as of:
+
+commit 2bb4be0f86501ec0565dba3d37ce0f7d7fc9c464
+Author: grehan <grehan@FreeBSD.org>
+Date: Fri Dec 18 00:38:48 2020 +0000
+
+ Fix issues with various VNC clients.
+
+ PR: 250795
+ Submitted by: Marko Kiiskila <marko@apache.org>
+ Reviewed by: jhb (bhyve)
+ MFC after: 3 weeks
+ Relnotes: yes
+ Differential Revision: https://reviews.freebsd.org/D27605
+
+Divergence Notes:
- usr/src/uts/i86pc/io/vmm/README.sync
+A previous sync skipped commit 0ff7076bdbc6dae5ea44c0acdb567e1cede199d1, which
+introduced generic backend functionality for network devices. Without that in
+place, subsequent updates reflect the absence of that subsystem. Integrating
+net backends has not been a priority, given the common use of viona on illumos.
-The userland components in this directory have further been updated with the
-following cherry-picked updates which will need taking into account during the
-next sync.
+The draft Save/Restore functionality, added in FreeBSD commit
+483d953a86a2507355f8287c5107dc827a0ff516, has not been synced into illumos bhyve
+yet. It is not built by default in FreeBSD, so we are not interested in taking
+it until it has undergone more in-depth testing.
- commit 44a544d41c504fbe37f836eb503e4ae721daada9
- Author: grehan <grehan@FreeBSD.org>
- Date: Fri Dec 18 00:38:48 2020 +0000
+The VirtFS filesystem sharing feature, added in FreeBSD commit
+100353cfbf882e23c911300ebd0cb458bd3ee975, has not been synced into illumos bhyve
+yet. It depends on the userland lib9p which needs a fair amount of work to
+build and run on illumos. The integration of this feature is being tracked in
+https://www.illumos.org/issues/13380
- Fix issues with various VNC clients.
+The stub usr/src/compat/bhyve/stdatomic.h file only includes enough glue
+to satisfy the use of <stdatomic.h> in usr/src/cmd/bhyve/rfb.c, and in
+particular assumes that atomic variables are sized as an int. If other bhyve
+pieces start using stdatomic.h, this will need enhancing.
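As a rough illustration of that int-sized assumption, glue of the following
shape would satisfy rfb.c (a hypothetical sketch, not the actual contents of
usr/src/compat/bhyve/stdatomic.h):

    /* Hypothetical glue: assumes every atomic variable is int-sized. */
    typedef volatile unsigned int atomic_uint;
    #define atomic_init(p, v)       (*(p) = (v))
    #define atomic_load(p)          (*(p))
    #define atomic_fetch_add(p, v)  __sync_fetch_and_add((p), (v))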
diff --git a/usr/src/cmd/bhyve/bhyverun.c b/usr/src/cmd/bhyve/bhyverun.c
index 8522d85bd9..53a92f6dd7 100644
--- a/usr/src/cmd/bhyve/bhyverun.c
+++ b/usr/src/cmd/bhyve/bhyverun.c
@@ -199,6 +199,7 @@ static int gdb_port = 0;
static int guest_vmexit_on_hlt, guest_vmexit_on_pause;
static int virtio_msix = 1;
static int x2apic_mode = 0; /* default is xAPIC */
+static int destroy_on_poweroff = 0;
static int strictio;
static int strictmsr = 1;
@@ -244,7 +245,11 @@ usage(int code)
{
fprintf(stderr,
- "Usage: %s [-abehuwxACHPSWY]\n"
+#ifdef __FreeBSD__
+ "Usage: %s [-abehuwxACDHPSWY]\n"
+#else
+ "Usage: %s [-abdehuwxACDHPSWY]\n"
+#endif
" %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n"
" %*s [-g <gdb port>] [-l <lpc>]\n"
#ifdef __FreeBSD__
@@ -259,6 +264,7 @@ usage(int code)
#ifndef __FreeBSD__
" -d: suspend cpu at boot\n"
#endif
+ " -D: destroy on power-off\n"
" -e: exit on unhandled I/O access\n"
" -g: gdb port\n"
" -h: help\n"
@@ -980,6 +986,8 @@ vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
case VM_SUSPEND_RESET:
exit(0);
case VM_SUSPEND_POWEROFF:
+ if (destroy_on_poweroff)
+ vm_destroy(ctx);
exit(1);
case VM_SUSPEND_HALT:
exit(2);
@@ -1268,9 +1276,9 @@ main(int argc, char *argv[])
memflags = 0;
#ifdef __FreeBSD__
- optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:B:U:";
+ optstr = "abehuwxACDHIPSWYp:g:G:c:s:m:l:B:U:";
#else
- optstr = "abdehuwxACHIPSWYg:G:c:s:m:l:B:U:";
+ optstr = "abdehuwxACDHIPSWYg:G:c:s:m:l:B:U:";
#endif
while ((c = getopt(argc, argv, optstr)) != -1) {
switch (c) {
@@ -1283,6 +1291,9 @@ main(int argc, char *argv[])
case 'b':
bvmcons = 1;
break;
+ case 'D':
+ destroy_on_poweroff = 1;
+ break;
case 'B':
if (smbios_parse(optarg) != 0) {
errx(EX_USAGE, "invalid SMBIOS "
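A usage sketch for the new flag (the device layout and VM name below are
hypothetical):

    # destroy the VM instance when the guest powers off
    bhyve -D -c 2 -m 2048 -s 0,hostbridge -s 1,lpc -l com1,stdio myvm

With -D, a VM_SUSPEND_POWEROFF exit runs vm_destroy() before bhyve exits, so
no separate bhyvectl --destroy step is needed to clean up the instance.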
diff --git a/usr/src/cmd/bhyve/pci_ahci.c b/usr/src/cmd/bhyve/pci_ahci.c
index 57934f9c84..0d4951a61e 100644
--- a/usr/src/cmd/bhyve/pci_ahci.c
+++ b/usr/src/cmd/bhyve/pci_ahci.c
@@ -136,9 +136,9 @@ struct ahci_ioreq {
struct ahci_port {
struct blockif_ctxt *bctx;
struct pci_ahci_softc *pr_sc;
+ struct ata_params ata_ident;
uint8_t *cmd_lst;
uint8_t *rfis;
- char ident[AHCI_PORT_IDENT];
int port;
int atapi;
int reset;
@@ -983,7 +983,50 @@ handle_identify(struct ahci_port *p, int slot, uint8_t *cfis)
ahci_write_fis_d2h(p, slot, cfis,
(ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
} else {
- uint16_t buf[256];
+ ahci_write_fis_piosetup(p);
+ write_prdt(p, slot, cfis, (void*)&p->ata_ident, sizeof(struct ata_params));
+ ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY);
+ }
+}
+
+static void
+ata_identify_init(struct ahci_port* p, int atapi)
+{
+ struct ata_params* ata_ident = &p->ata_ident;
+
+ if (atapi) {
+ ata_ident->config = ATA_PROTO_ATAPI | ATA_ATAPI_TYPE_CDROM |
+ ATA_ATAPI_REMOVABLE | ATA_DRQ_FAST;
+ ata_ident->capabilities1 = ATA_SUPPORT_LBA |
+ ATA_SUPPORT_DMA;
+ ata_ident->capabilities2 = (1 << 14 | 1);
+ ata_ident->atavalid = ATA_FLAG_64_70 | ATA_FLAG_88;
+ ata_ident->obsolete62 = 0x3f;
+ ata_ident->mwdmamodes = 7;
+ if (p->xfermode & ATA_WDMA0)
+ ata_ident->mwdmamodes |= (1 << ((p->xfermode & 7) + 8));
+ ata_ident->apiomodes = 3;
+ ata_ident->mwdmamin = 0x0078;
+ ata_ident->mwdmarec = 0x0078;
+ ata_ident->pioblind = 0x0078;
+ ata_ident->pioiordy = 0x0078;
+ ata_ident->satacapabilities = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3);
+ ata_ident->satacapabilities2 = ((p->ssts & ATA_SS_SPD_MASK) >> 3);
+ ata_ident->satasupport = ATA_SUPPORT_NCQ_STREAM;
+ ata_ident->version_major = 0x3f0;
+ ata_ident->support.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET |
+ ATA_SUPPORT_RESET | ATA_SUPPORT_NOP);
+ ata_ident->support.command2 = (1 << 14);
+ ata_ident->support.extension = (1 << 14);
+ ata_ident->enabled.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET |
+ ATA_SUPPORT_RESET | ATA_SUPPORT_NOP);
+ ata_ident->enabled.extension = (1 << 14);
+ ata_ident->udmamodes = 0x7f;
+ if (p->xfermode & ATA_UDMA0)
+ ata_ident->udmamodes |= (1 << ((p->xfermode & 7) + 8));
+ ata_ident->transport_major = 0x1020;
+ ata_ident->integrity = 0x00a5;
+ } else {
uint64_t sectors;
int sectsz, psectsz, psectoff, candelete, ro;
uint16_t cyl;
@@ -995,87 +1038,84 @@ handle_identify(struct ahci_port *p, int slot, uint8_t *cfis)
sectors = blockif_size(p->bctx) / sectsz;
blockif_chs(p->bctx, &cyl, &heads, &sech);
blockif_psectsz(p->bctx, &psectsz, &psectoff);
- memset(buf, 0, sizeof(buf));
- buf[0] = 0x0040;
- buf[1] = cyl;
- buf[3] = heads;
- buf[6] = sech;
- ata_string((uint8_t *)(buf+10), p->ident, 20);
- ata_string((uint8_t *)(buf+23), "001", 8);
- ata_string((uint8_t *)(buf+27), "BHYVE SATA DISK", 40);
- buf[47] = (0x8000 | 128);
- buf[48] = 0;
- buf[49] = (1 << 8 | 1 << 9 | 1 << 11);
- buf[50] = (1 << 14);
- buf[53] = (1 << 1 | 1 << 2);
+ ata_ident->config = ATA_DRQ_FAST;
+ ata_ident->cylinders = cyl;
+ ata_ident->heads = heads;
+ ata_ident->sectors = sech;
+
+ ata_ident->sectors_intr = (0x8000 | 128);
+ ata_ident->tcg = 0;
+
+ ata_ident->capabilities1 = ATA_SUPPORT_DMA |
+ ATA_SUPPORT_LBA | ATA_SUPPORT_IORDY;
+ ata_ident->capabilities2 = (1 << 14);
+ ata_ident->atavalid = ATA_FLAG_64_70 | ATA_FLAG_88;
if (p->mult_sectors)
- buf[59] = (0x100 | p->mult_sectors);
+ ata_ident->multi = (ATA_MULTI_VALID | p->mult_sectors);
if (sectors <= 0x0fffffff) {
- buf[60] = sectors;
- buf[61] = (sectors >> 16);
+ ata_ident->lba_size_1 = sectors;
+ ata_ident->lba_size_2 = (sectors >> 16);
} else {
- buf[60] = 0xffff;
- buf[61] = 0x0fff;
+ ata_ident->lba_size_1 = 0xffff;
+ ata_ident->lba_size_2 = 0x0fff;
}
- buf[63] = 0x7;
+ ata_ident->mwdmamodes = 0x7;
if (p->xfermode & ATA_WDMA0)
- buf[63] |= (1 << ((p->xfermode & 7) + 8));
- buf[64] = 0x3;
- buf[65] = 120;
- buf[66] = 120;
- buf[67] = 120;
- buf[68] = 120;
- buf[69] = 0;
- buf[75] = 31;
- buf[76] = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3 |
- ATA_SUPPORT_NCQ);
- buf[77] = (ATA_SUPPORT_RCVSND_FPDMA_QUEUED |
- (p->ssts & ATA_SS_SPD_MASK) >> 3);
- buf[80] = 0x3f0;
- buf[81] = 0x28;
- buf[82] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE|
- ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP);
- buf[83] = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE |
- ATA_SUPPORT_FLUSHCACHE48 | 1 << 14);
- buf[84] = (1 << 14);
- buf[85] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE|
- ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP);
- buf[86] = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE |
- ATA_SUPPORT_FLUSHCACHE48 | 1 << 15);
- buf[87] = (1 << 14);
- buf[88] = 0x7f;
+ ata_ident->mwdmamodes |= (1 << ((p->xfermode & 7) + 8));
+ ata_ident->apiomodes = 0x3;
+ ata_ident->mwdmamin = 0x0078;
+ ata_ident->mwdmarec = 0x0078;
+ ata_ident->pioblind = 0x0078;
+ ata_ident->pioiordy = 0x0078;
+ ata_ident->support3 = 0;
+ ata_ident->queue = 31;
+ ata_ident->satacapabilities = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3 |
+ ATA_SUPPORT_NCQ);
+ ata_ident->satacapabilities2 = (ATA_SUPPORT_RCVSND_FPDMA_QUEUED |
+ (p->ssts & ATA_SS_SPD_MASK) >> 3);
+ ata_ident->version_major = 0x3f0;
+ ata_ident->version_minor = 0x28;
+ ata_ident->support.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE |
+ ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP);
+ ata_ident->support.command2 = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE |
+ ATA_SUPPORT_FLUSHCACHE48 | 1 << 14);
+ ata_ident->support.extension = (1 << 14);
+ ata_ident->enabled.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE |
+ ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP);
+ ata_ident->enabled.command2 = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE |
+ ATA_SUPPORT_FLUSHCACHE48 | 1 << 15);
+ ata_ident->enabled.extension = (1 << 14);
+ ata_ident->udmamodes = 0x7f;
if (p->xfermode & ATA_UDMA0)
- buf[88] |= (1 << ((p->xfermode & 7) + 8));
- buf[100] = sectors;
- buf[101] = (sectors >> 16);
- buf[102] = (sectors >> 32);
- buf[103] = (sectors >> 48);
+ ata_ident->udmamodes |= (1 << ((p->xfermode & 7) + 8));
+ ata_ident->lba_size48_1 = sectors;
+ ata_ident->lba_size48_2 = (sectors >> 16);
+ ata_ident->lba_size48_3 = (sectors >> 32);
+ ata_ident->lba_size48_4 = (sectors >> 48);
+
if (candelete && !ro) {
- buf[69] |= ATA_SUPPORT_RZAT | ATA_SUPPORT_DRAT;
- buf[105] = 1;
- buf[169] = ATA_SUPPORT_DSM_TRIM;
+ ata_ident->support3 |= ATA_SUPPORT_RZAT | ATA_SUPPORT_DRAT;
+ ata_ident->max_dsm_blocks = 1;
+ ata_ident->support_dsm = ATA_SUPPORT_DSM_TRIM;
}
- buf[106] = 0x4000;
- buf[209] = 0x4000;
+ ata_ident->pss = ATA_PSS_VALID_VALUE;
+ ata_ident->lsalign = 0x4000;
if (psectsz > sectsz) {
- buf[106] |= 0x2000;
- buf[106] |= ffsl(psectsz / sectsz) - 1;
- buf[209] |= (psectoff / sectsz);
+ ata_ident->pss |= ATA_PSS_MULTLS;
+ ata_ident->pss |= ffsl(psectsz / sectsz) - 1;
+ ata_ident->lsalign |= (psectoff / sectsz);
}
if (sectsz > 512) {
- buf[106] |= 0x1000;
- buf[117] = sectsz / 2;
- buf[118] = ((sectsz / 2) >> 16);
+ ata_ident->pss |= ATA_PSS_LSSABOVE512;
+ ata_ident->lss_1 = sectsz / 2;
+ ata_ident->lss_2 = ((sectsz / 2) >> 16);
}
- buf[119] = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14);
- buf[120] = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14);
- buf[222] = 0x1020;
- buf[255] = 0x00a5;
- ahci_checksum((uint8_t *)buf, sizeof(buf));
- ahci_write_fis_piosetup(p);
- write_prdt(p, slot, cfis, (void *)buf, sizeof(buf));
- ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY);
+ ata_ident->support2 = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14);
+ ata_ident->enabled2 = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14);
+ ata_ident->transport_major = 0x1020;
+ ata_ident->integrity = 0x00a5;
}
+ ahci_checksum((uint8_t*)ata_ident, sizeof(struct ata_params));
}
static void
@@ -1085,44 +1125,8 @@ handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis)
ahci_write_fis_d2h(p, slot, cfis,
(ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
} else {
- uint16_t buf[256];
-
- memset(buf, 0, sizeof(buf));
- buf[0] = (2 << 14 | 5 << 8 | 1 << 7 | 2 << 5);
- ata_string((uint8_t *)(buf+10), p->ident, 20);
- ata_string((uint8_t *)(buf+23), "001", 8);
- ata_string((uint8_t *)(buf+27), "BHYVE SATA DVD ROM", 40);
- buf[49] = (1 << 9 | 1 << 8);
- buf[50] = (1 << 14 | 1);
- buf[53] = (1 << 2 | 1 << 1);
- buf[62] = 0x3f;
- buf[63] = 7;
- if (p->xfermode & ATA_WDMA0)
- buf[63] |= (1 << ((p->xfermode & 7) + 8));
- buf[64] = 3;
- buf[65] = 120;
- buf[66] = 120;
- buf[67] = 120;
- buf[68] = 120;
- buf[76] = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3);
- buf[77] = ((p->ssts & ATA_SS_SPD_MASK) >> 3);
- buf[78] = (1 << 5);
- buf[80] = 0x3f0;
- buf[82] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET |
- ATA_SUPPORT_RESET | ATA_SUPPORT_NOP);
- buf[83] = (1 << 14);
- buf[84] = (1 << 14);
- buf[85] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET |
- ATA_SUPPORT_RESET | ATA_SUPPORT_NOP);
- buf[87] = (1 << 14);
- buf[88] = 0x7f;
- if (p->xfermode & ATA_UDMA0)
- buf[88] |= (1 << ((p->xfermode & 7) + 8));
- buf[222] = 0x1020;
- buf[255] = 0x00a5;
- ahci_checksum((uint8_t *)buf, sizeof(buf));
ahci_write_fis_piosetup(p);
- write_prdt(p, slot, cfis, (void *)buf, sizeof(buf));
+ write_prdt(p, slot, cfis, (void *)&p->ata_ident, sizeof(struct ata_params));
ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY);
}
}
@@ -2314,6 +2318,10 @@ pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi)
MD5_CTX mdctx;
u_char digest[16];
char *next, *next2;
+ char *bopt, *uopt, *xopts, *config;
+ FILE* fp;
+ size_t block_len;
+ int comma, optpos;
ret = 0;
@@ -2330,6 +2338,9 @@ pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi)
slots = 32;
for (p = 0; p < MAX_PORTS && opts != NULL; p++, opts = next) {
+ struct ata_params *ata_ident = &sc->port[p].ata_ident;
+ memset(ata_ident, 0, sizeof(struct ata_params));
+
/* Identify and cut off type of present port. */
if (strncmp(opts, "hd:", 3) == 0) {
atapi = 0;
@@ -2352,13 +2363,82 @@ pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi)
if (opts[0] == 0)
continue;
+ uopt = strdup(opts);
+ bopt = NULL;
+ fp = open_memstream(&bopt, &block_len);
+ comma = 0;
+ optpos = 0;
+
+ for (xopts = strtok(uopt, ",");
+ xopts != NULL;
+ xopts = strtok(NULL, ",")) {
+
+ /* Treat the first option as the block device filename. */
+ if (optpos == 0) {
+ /*
+ * Create an identifier for the backing file.
+ * Use parts of the md5 sum of the filename
+ */
+ char ident[AHCI_PORT_IDENT];
+ MD5Init(&mdctx);
+ MD5Update(&mdctx, opts, strlen(opts));
+ MD5Final(digest, &mdctx);
+ snprintf(ident, AHCI_PORT_IDENT,
+ "BHYVE-%02X%02X-%02X%02X-%02X%02X",
+ digest[0], digest[1], digest[2], digest[3], digest[4],
+ digest[5]);
+ ata_string((uint8_t*)&ata_ident->serial, ident, 20);
+ ata_string((uint8_t*)&ata_ident->revision, "001", 8);
+ if (atapi) {
+ ata_string((uint8_t*)&ata_ident->model, "BHYVE SATA DVD ROM", 40);
+ }
+ else {
+ ata_string((uint8_t*)&ata_ident->model, "BHYVE SATA DISK", 40);
+ }
+ }
+
+ if ((config = strchr(xopts, '=')) != NULL) {
+ *config++ = '\0';
+ if (!strcmp("nmrr", xopts)) {
+ ata_ident->media_rotation_rate = atoi(config);
+ }
+ else if (!strcmp("ser", xopts)) {
+ ata_string((uint8_t*)(&ata_ident->serial), config, 20);
+ }
+ else if (!strcmp("rev", xopts)) {
+ ata_string((uint8_t*)(&ata_ident->revision), config, 8);
+ }
+ else if (!strcmp("model", xopts)) {
+ ata_string((uint8_t*)(&ata_ident->model), config, 40);
+ }
+ else {
+ /* Pass all other options to blockif_open. */
+ *--config = '=';
+ fprintf(fp, "%s%s", comma ? "," : "", xopts);
+ comma = 1;
+ }
+ }
+ else {
+ /* Pass all other options to blockif_open. */
+ fprintf(fp, "%s%s", comma ? "," : "", xopts);
+ comma = 1;
+ }
+ optpos++;
+ }
+ free(uopt);
+ fclose(fp);
+
+ DPRINTF("%s\n", bopt);
+
/*
* Attempt to open the backing image. Use the PCI slot/func
* and the port number for the identifier string.
*/
snprintf(bident, sizeof(bident), "%d:%d:%d", pi->pi_slot,
pi->pi_func, p);
- bctxt = blockif_open(opts, bident);
+ bctxt = blockif_open(bopt, bident);
+ free(bopt);
+
if (bctxt == NULL) {
sc->ports = p;
ret = 1;
@@ -2380,17 +2460,7 @@ pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi)
(void) blockif_set_wce(bctxt, 1);
#endif
- /*
- * Create an identifier for the backing file.
- * Use parts of the md5 sum of the filename
- */
- MD5Init(&mdctx);
- MD5Update(&mdctx, opts, strlen(opts));
- MD5Final(digest, &mdctx);
- snprintf(sc->port[p].ident, AHCI_PORT_IDENT,
- "BHYVE-%02X%02X-%02X%02X-%02X%02X",
- digest[0], digest[1], digest[2], digest[3], digest[4],
- digest[5]);
+ ata_identify_init(&sc->port[p], atapi);
/*
* Allocate blockif request structures and add them
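A usage sketch for the per-port identity options parsed above (the path and
values are hypothetical; unrecognized options are still passed through to
blockif_open):

    # SSD-like disk with a custom serial, revision and model string
    -s 4,ahci-hd,/dev/zvol/rdsk/tank/vm0,nmrr=1,ser=SER0001,rev=1.0,model=MYDISK

nmrr sets the ATA nominal media rotation rate word, where a value of 1
conventionally reports a non-rotating (solid-state) device.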
diff --git a/usr/src/cmd/bhyve/pci_e82545.c b/usr/src/cmd/bhyve/pci_e82545.c
index 8f2c4d810f..cb7e074540 100644
--- a/usr/src/cmd/bhyve/pci_e82545.c
+++ b/usr/src/cmd/bhyve/pci_e82545.c
@@ -1660,18 +1660,18 @@ e82545_write_register(struct e82545_softc *sc, uint32_t offset, uint32_t value)
break;
case E1000_TDBAL(0):
sc->esc_TDBAL = value & ~0xF;
- if (sc->esc_tx_enabled) {
- /* Apparently legal */
+ if (sc->esc_tx_enabled)
e82545_tx_update_tdba(sc);
- }
break;
case E1000_TDBAH(0):
- //assert(!sc->esc_tx_enabled);
sc->esc_TDBAH = value;
+ if (sc->esc_tx_enabled)
+ e82545_tx_update_tdba(sc);
break;
case E1000_TDLEN(0):
- //assert(!sc->esc_tx_enabled);
sc->esc_TDLEN = value & ~0xFFF0007F;
+ if (sc->esc_tx_enabled)
+ e82545_tx_update_tdba(sc);
break;
case E1000_TDH(0):
//assert(!sc->esc_tx_enabled);
diff --git a/usr/src/cmd/bhyve/pci_emul.c b/usr/src/cmd/bhyve/pci_emul.c
index c510116e19..90602f715b 100644
--- a/usr/src/cmd/bhyve/pci_emul.c
+++ b/usr/src/cmd/bhyve/pci_emul.c
@@ -463,14 +463,6 @@ pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size,
return (-1);
}
-int
-pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type,
- uint64_t size)
-{
-
- return (pci_emul_alloc_pbar(pdi, idx, 0, type, size));
-}
-
/*
* Register (or unregister) the MMIO or I/O region associated with the BAR
* register 'idx' of an emulated pci device.
@@ -595,8 +587,8 @@ update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type)
}
int
-pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase,
- enum pcibar_type type, uint64_t size)
+pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type,
+ uint64_t size)
{
uint64_t *baseptr = NULL;
uint64_t limit = 0, lobits = 0;
@@ -636,16 +628,10 @@ pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase,
* Some drivers do not work well if the 64-bit BAR is allocated
* above 4GB. Allow for this by allocating small requests under
* 4GB unless then allocation size is larger than some arbitrary
- * number (32MB currently).
+ * number (128MB currently).
*/
- if (size > 32 * 1024 * 1024) {
- /*
- * XXX special case for device requiring peer-peer DMA
- */
- if (size == 0x100000000UL)
- baseptr = &hostbase;
- else
- baseptr = &pci_emul_membase64;
+ if (size > 128 * 1024 * 1024) {
+ baseptr = &pci_emul_membase64;
limit = PCI_EMUL_MEMLIMIT64;
mask = PCIM_BAR_MEM_BASE;
lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
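The effect of the new threshold in rough numbers (a sketch using the
pci_emul_alloc_bar() signature from this change):

    /* 64MB <= 128MB: placed in the below-4GB window for driver compat */
    pci_emul_alloc_bar(pdi, 2, PCIBAR_MEM64, 64 * 1024 * 1024);
    /* 256MB > 128MB: placed above 4GB via pci_emul_membase64 */
    pci_emul_alloc_bar(pdi, 2, PCIBAR_MEM64, 256 * 1024 * 1024);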
diff --git a/usr/src/cmd/bhyve/pci_emul.h b/usr/src/cmd/bhyve/pci_emul.h
index d3dd9a2f46..63e3c89a95 100644
--- a/usr/src/cmd/bhyve/pci_emul.h
+++ b/usr/src/cmd/bhyve/pci_emul.h
@@ -222,8 +222,6 @@ int init_pci(struct vmctx *ctx);
void pci_callback(void);
int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx,
enum pcibar_type type, uint64_t size);
-int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx,
- uint64_t hostbase, enum pcibar_type type, uint64_t size);
int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum);
int pci_emul_add_pciecap(struct pci_devinst *pi, int pcie_device_type);
void pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes,
diff --git a/usr/src/cmd/bhyve/pci_lpc.c b/usr/src/cmd/bhyve/pci_lpc.c
index 50413250d3..9c8f25e89d 100644
--- a/usr/src/cmd/bhyve/pci_lpc.c
+++ b/usr/src/cmd/bhyve/pci_lpc.c
@@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$");
#include "pci_emul.h"
#include "pci_irq.h"
#include "pci_lpc.h"
+#include "pctestdev.h"
#include "uart_emul.h"
#define IO_ICU1 0x20
@@ -83,6 +84,8 @@ static struct lpc_uart_softc {
static const char *lpc_uart_names[LPC_UART_NUM] = { "COM1", "COM2" };
+static bool pctestdev_present;
+
/*
* LPC device configuration is in the following form:
* <lpc_device_name>[,<options>]
@@ -110,6 +113,18 @@ lpc_device_parse(const char *opts)
goto done;
}
}
+ if (strcasecmp(lpcdev, pctestdev_getname()) == 0) {
+ if (pctestdev_present) {
+ EPRINTLN("More than one %s device conf is "
+ "specified; only one is allowed.",
+ pctestdev_getname());
+ } else if (pctestdev_parse(str) == 0) {
+ pctestdev_present = true;
+ error = 0;
+ free(cpy);
+ goto done;
+ }
+ }
}
done:
@@ -127,6 +142,7 @@ lpc_print_supported_devices()
printf("bootrom\n");
for (i = 0; i < LPC_UART_NUM; i++)
printf("%s\n", lpc_uart_names[i]);
+ printf("%s\n", pctestdev_getname());
}
const char *
@@ -250,6 +266,13 @@ lpc_init(struct vmctx *ctx)
sc->enabled = 1;
}
+ /* pc-testdev */
+ if (pctestdev_present) {
+ error = pctestdev_init(ctx);
+ if (error)
+ return (error);
+ }
+
return (0);
}
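A usage sketch for the new LPC device (name per pctestdev_getname(); only a
single instance is accepted):

    bhyve ... -l com1,stdio -l pc-testdev ...

The device mirrors QEMU's pc-testdev debug ports, which guest test suites
such as kvm-unit-tests use for I/O probing and test exits.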
diff --git a/usr/src/cmd/bhyve/pci_nvme.c b/usr/src/cmd/bhyve/pci_nvme.c
index 65d8d49b64..a0a8a9571d 100644
--- a/usr/src/cmd/bhyve/pci_nvme.c
+++ b/usr/src/cmd/bhyve/pci_nvme.c
@@ -3,6 +3,7 @@
*
* Copyright (c) 2017 Shunsuke Mie
* Copyright (c) 2018 Leon Dang
+ * Copyright (c) 2020 Chuck Tuffli
*
* Function crc16 Copyright (c) 2017, Fedor Uporov
* Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
@@ -33,7 +34,7 @@
* bhyve PCIe-NVMe device emulation.
*
* options:
- * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#
+ * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
*
* accepted devpath:
* /dev/blockdev
@@ -46,6 +47,7 @@
* sectsz = sector size (defaults to blockif sector size)
* ser = serial number (20-chars max)
* eui64 = IEEE Extended Unique Identifier (8 byte value)
* dsm = DataSet Management support. One of auto, enable, disable
*
*/
@@ -57,6 +59,7 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include <sys/errno.h>
#include <sys/types.h>
#include <net/ieee_oui.h>
#ifndef __FreeBSD__
@@ -86,8 +89,8 @@ __FBSDID("$FreeBSD$");
static int nvme_debug = 0;
-#define DPRINTF(params) if (nvme_debug) PRINTLN params
-#define WPRINTF(params) PRINTLN params
+#define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
+#define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
/* defaults; can be overridden */
#define NVME_MSIX_BAR 4
@@ -99,9 +102,16 @@ static int nvme_debug = 0;
#define NVME_QUEUES 16
#define NVME_MAX_QENTRIES 2048
+/* Memory Page size Minimum reported in CAP register */
+#define NVME_MPSMIN 0
+/* MPSMIN converted to bytes */
+#define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN))
#define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t))
-#define NVME_MAX_BLOCKIOVS 512
+#define NVME_MDTS 9
+/* Note the + 1 allows for the initial descriptor to not be page aligned */
+#define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1)
+#define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
/* This is a synthetic status code to indicate there is no status */
#define NVME_NO_STATUS 0xffff
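Worked numbers for the defines above (a sketch): with NVME_MPSMIN = 0 the
minimum page size is 1 << 12 = 4096 bytes, so

    NVME_MAX_DATA_SIZE = (1 << 9) * 4096 = 2 MiB    /* 512 pages */
    NVME_MAX_IOVEC     = (1 << 9) + 1    = 513      /* +1 for unaligned head */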
@@ -153,21 +163,21 @@ enum nvme_copy_dir {
struct nvme_completion_queue {
struct nvme_completion *qbase;
+ pthread_mutex_t mtx;
uint32_t size;
uint16_t tail; /* nvme progress */
uint16_t head; /* guest progress */
uint16_t intr_vec;
uint32_t intr_en;
- pthread_mutex_t mtx;
};
struct nvme_submission_queue {
struct nvme_command *qbase;
+ pthread_mutex_t mtx;
uint32_t size;
uint16_t head; /* nvme progress */
uint16_t tail; /* guest progress */
uint16_t cqid; /* completion queue id */
- int busy; /* queue is being processed */
int qpriority;
};
@@ -186,6 +196,18 @@ struct pci_nvme_blockstore {
uint32_t deallocate:1;
};
+/*
+ * Calculate the number of additional page descriptors for guest IO requests
+ * based on the advertised Max Data Transfer (MDTS) and given the number of
+ * default iovec's in a struct blockif_req.
+ *
+ * Note the + 1 allows for the initial descriptor to not be page aligned.
+ */
+#define MDTS_PAD_SIZE \
+ NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
+ NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
+ 0
+
struct pci_nvme_ioreq {
struct pci_nvme_softc *sc;
STAILQ_ENTRY(pci_nvme_ioreq) link;
@@ -199,18 +221,11 @@ struct pci_nvme_ioreq {
uint64_t prev_gpaddr;
size_t prev_size;
-
- /*
- * lock if all iovs consumed (big IO);
- * complete transaction before continuing
- */
- pthread_mutex_t mtx;
- pthread_cond_t cv;
+ size_t bytes;
struct blockif_req io_req;
- /* pad to fit up to 512 page descriptors from guest IO request */
- struct iovec iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
+ struct iovec iovpadding[MDTS_PAD_SIZE];
};
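So iovpadding[] only holds the overflow beyond the blockif default: for
example, if BLOCKIF_IOV_MAX were 128 (its value is not shown in this diff),
MDTS_PAD_SIZE would evaluate to 513 - 128 = 385 extra iovec slots.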
enum nvme_dsm_type {
@@ -222,6 +237,28 @@ enum nvme_dsm_type {
NVME_DATASET_MANAGEMENT_DISABLE,
};
+struct pci_nvme_softc;
+struct nvme_feature_obj;
+
+typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
+ struct nvme_feature_obj *,
+ struct nvme_command *,
+ struct nvme_completion *);
+
+struct nvme_feature_obj {
+ uint32_t cdw11;
+ nvme_feature_cb set;
+ nvme_feature_cb get;
+ bool namespace_specific;
+};
+
+#define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
+
+struct pci_nvme_aer {
+ STAILQ_ENTRY(pci_nvme_aer) link;
+ uint16_t cid; /* Command ID of the submitted AER */
+};
+
struct pci_nvme_softc {
struct pci_devinst *nsc_pi;
@@ -241,6 +278,7 @@ struct pci_nvme_softc {
uint32_t max_queues; /* max number of IO SQ's or CQ's */
uint32_t num_cqueues;
uint32_t num_squeues;
+ bool num_q_is_set; /* Has host set Number of Queues */
struct pci_nvme_ioreq *ioreqs;
STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
@@ -255,16 +293,26 @@ struct pci_nvme_softc {
struct nvme_completion_queue *compl_queues;
struct nvme_submission_queue *submit_queues;
- /* controller features */
- uint32_t intr_coales_aggr_time; /* 0x08: uS to delay intr */
- uint32_t intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
- uint32_t async_ev_config; /* 0x0B: async event config */
+ struct nvme_feature_obj feat[NVME_FID_MAX];
enum nvme_dsm_type dataset_management;
+
+ /* Accounting for SMART data */
+ __uint128_t read_data_units;
+ __uint128_t write_data_units;
+ __uint128_t read_commands;
+ __uint128_t write_commands;
+ uint32_t read_dunits_remainder;
+ uint32_t write_dunits_remainder;
+
+ STAILQ_HEAD(, pci_nvme_aer) aer_list;
+ uint32_t aer_count;
};
-static void pci_nvme_io_partial(struct blockif_req *br, int err);
+static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
+static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
+static void pci_nvme_io_done(struct blockif_req *, int);
/* Controller Configuration utils */
#define NVME_CC_GET_EN(cc) \
@@ -303,6 +351,19 @@ static void pci_nvme_io_partial(struct blockif_req *br, int err);
#define NVME_ONCS_DSM (NVME_CTRLR_DATA_ONCS_DSM_MASK << \
NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
+static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
+ struct nvme_feature_obj *,
+ struct nvme_command *,
+ struct nvme_completion *);
+static void nvme_feature_num_queues(struct pci_nvme_softc *,
+ struct nvme_feature_obj *,
+ struct nvme_command *,
+ struct nvme_completion *);
+static void nvme_feature_iv_config(struct pci_nvme_softc *,
+ struct nvme_feature_obj *,
+ struct nvme_command *,
+ struct nvme_completion *);
+
static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
@@ -329,14 +390,60 @@ pci_nvme_status_genc(uint16_t *status, uint16_t code)
pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}
-static __inline void
-pci_nvme_toggle_phase(uint16_t *status, int prev)
+/*
+ * Initialize the requested number of IO Submission and Completion Queues.
+ * Admin queues are allocated implicitly.
+ */
+static void
+pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
{
+ uint32_t i;
- if (prev)
- *status &= ~NVME_STATUS_P;
- else
- *status |= NVME_STATUS_P;
+ /*
+ * Allocate and initialize the Submission Queues
+ */
+ if (nsq > NVME_QUEUES) {
+ WPRINTF("%s: clamping number of SQ from %u to %u",
+ __func__, nsq, NVME_QUEUES);
+ nsq = NVME_QUEUES;
+ }
+
+ sc->num_squeues = nsq;
+
+ sc->submit_queues = calloc(sc->num_squeues + 1,
+ sizeof(struct nvme_submission_queue));
+ if (sc->submit_queues == NULL) {
+ WPRINTF("%s: SQ allocation failed", __func__);
+ sc->num_squeues = 0;
+ } else {
+ struct nvme_submission_queue *sq = sc->submit_queues;
+
+ for (i = 0; i < sc->num_squeues; i++)
+ pthread_mutex_init(&sq[i].mtx, NULL);
+ }
+
+ /*
+ * Allocate and initialize the Completion Queues
+ */
+ if (ncq > NVME_QUEUES) {
+ WPRINTF("%s: clamping number of CQ from %u to %u",
+ __func__, ncq, NVME_QUEUES);
+ ncq = NVME_QUEUES;
+ }
+
+ sc->num_cqueues = ncq;
+
+ sc->compl_queues = calloc(sc->num_cqueues + 1,
+ sizeof(struct nvme_completion_queue));
+ if (sc->compl_queues == NULL) {
+ WPRINTF("%s: CQ allocation failed", __func__);
+ sc->num_cqueues = 0;
+ } else {
+ struct nvme_completion_queue *cq = sc->compl_queues;
+
+ for (i = 0; i < sc->num_cqueues; i++)
+ pthread_mutex_init(&cq[i].mtx, NULL);
+ }
}
static void
@@ -360,7 +467,7 @@ pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
cd->mic = 0;
- cd->mdts = 9; /* max data transfer size (2^mdts * CAP.MPSMIN) */
+ cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */
cd->ver = 0x00010300;
@@ -368,6 +475,9 @@ pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
cd->acl = 2;
cd->aerl = 4;
+ /* Advertise 1, Read-only firmware slot */
+ cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
+ (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
cd->lpa = 0; /* TODO: support some simple things like SMART */
cd->elpe = 0; /* max error log page entries */
cd->npss = 1; /* number of power states support */
@@ -493,12 +603,136 @@ pci_nvme_init_logpages(struct pci_nvme_softc *sc)
memset(&sc->err_log, 0, sizeof(sc->err_log));
memset(&sc->health_log, 0, sizeof(sc->health_log));
memset(&sc->fw_log, 0, sizeof(sc->fw_log));
+
+ /* Set read/write remainder to round up according to spec */
+ sc->read_dunits_remainder = 999;
+ sc->write_dunits_remainder = 999;
+
+ /* Set nominal Health values checked by implementations */
+ sc->health_log.temperature = 310;
+ sc->health_log.available_spare = 100;
+ sc->health_log.available_spare_threshold = 10;
+}
+
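The 999 initialization implements the spec's round-up of data units (one
unit equals 1000 512-byte blocks). A sketch of the accumulation this enables
(the block-count variable is hypothetical):

    /* carry 512-byte blocks into whole 1000-block data units */
    sc->read_dunits_remainder += nblocks;
    while (sc->read_dunits_remainder >= 1000) {
        sc->read_data_units++;
        sc->read_dunits_remainder -= 1000;
    }

Starting the remainder at 999 makes the very first block complete a unit,
rounding up as required.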
+static void
+pci_nvme_init_features(struct pci_nvme_softc *sc)
+{
+
+ sc->feat[0].set = nvme_feature_invalid_cb;
+ sc->feat[0].get = nvme_feature_invalid_cb;
+
+ sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
+ sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
+ sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
+ sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
+ nvme_feature_iv_config;
+ sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
+ nvme_feature_invalid_cb;
+ sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
+ nvme_feature_invalid_cb;
+}
+
+static void
+pci_nvme_aer_init(struct pci_nvme_softc *sc)
+{
+
+ STAILQ_INIT(&sc->aer_list);
+ sc->aer_count = 0;
+}
+
+static void
+pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
+{
+ struct pci_nvme_aer *aer = NULL;
+
+ while (!STAILQ_EMPTY(&sc->aer_list)) {
+ aer = STAILQ_FIRST(&sc->aer_list);
+ STAILQ_REMOVE_HEAD(&sc->aer_list, link);
+ free(aer);
+ }
+
+ pci_nvme_aer_init(sc);
+}
+
+#ifdef __FreeBSD__
+static bool
+pci_nvme_aer_available(struct pci_nvme_softc *sc)
+{
+
+ return (!STAILQ_EMPTY(&sc->aer_list));
+}
+#else
+/* This is kept behind an ifdef while it's unused to appease the compiler. */
+#endif
+
+static bool
+pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
+{
+ struct nvme_controller_data *cd = &sc->ctrldata;
+
+ /* AERL is a zero-based value while aer_count is one-based */
+ return (sc->aer_count == (cd->aerl + 1));
+}
+
+/*
+ * Add an Async Event Request
+ *
+ * Stores an AER to be returned later if the Controller needs to notify the
+ * host of an event.
+ * Note that while the NVMe spec doesn't require Controllers to return AERs
+ * in order, this implementation does preserve the order.
+ */
+static int
+pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
+{
+ struct pci_nvme_aer *aer = NULL;
+
+ if (pci_nvme_aer_limit_reached(sc))
+ return (-1);
+
+ aer = calloc(1, sizeof(struct pci_nvme_aer));
+ if (aer == NULL)
+ return (-1);
+
+ sc->aer_count++;
+
+ /* Save the Command ID for use in the completion message */
+ aer->cid = cid;
+ STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
+
+ return (0);
}
+/*
+ * Get an Async Event Request structure
+ *
+ * Returns a pointer to an AER previously submitted by the host or NULL if
+ * no AERs exist. The caller is responsible for freeing the returned struct.
+ */
+#ifdef __FreeBSD__
+static struct pci_nvme_aer *
+pci_nvme_aer_get(struct pci_nvme_softc *sc)
+{
+ struct pci_nvme_aer *aer = NULL;
+
+ aer = STAILQ_FIRST(&sc->aer_list);
+ if (aer != NULL) {
+ STAILQ_REMOVE_HEAD(&sc->aer_list, link);
+ sc->aer_count--;
+ }
+
+ return (aer);
+}
+#else
+/* This is kept behind an ifdef while it's unused to appease the compiler. */
+#endif
+
static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
- DPRINTF(("%s", __func__));
+ uint32_t i;
+
+ DPRINTF("%s", __func__);
sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
(1 << NVME_CAP_LO_REG_CQR_SHIFT) |
@@ -511,45 +745,28 @@ pci_nvme_reset_locked(struct pci_nvme_softc *sc)
sc->regs.cc = 0;
sc->regs.csts = 0;
- sc->num_cqueues = sc->num_squeues = sc->max_queues;
- if (sc->submit_queues != NULL) {
- for (int i = 0; i < sc->num_squeues + 1; i++) {
- /*
- * The Admin Submission Queue is at index 0.
- * It must not be changed at reset otherwise the
- * emulation will be out of sync with the guest.
- */
- if (i != 0) {
- sc->submit_queues[i].qbase = NULL;
- sc->submit_queues[i].size = 0;
- sc->submit_queues[i].cqid = 0;
- }
- sc->submit_queues[i].tail = 0;
- sc->submit_queues[i].head = 0;
- sc->submit_queues[i].busy = 0;
- }
- } else
- sc->submit_queues = calloc(sc->num_squeues + 1,
- sizeof(struct nvme_submission_queue));
-
- if (sc->compl_queues != NULL) {
- for (int i = 0; i < sc->num_cqueues + 1; i++) {
- /* See Admin Submission Queue note above */
- if (i != 0) {
- sc->compl_queues[i].qbase = NULL;
- sc->compl_queues[i].size = 0;
- }
+ assert(sc->submit_queues != NULL);
- sc->compl_queues[i].tail = 0;
- sc->compl_queues[i].head = 0;
- }
- } else {
- sc->compl_queues = calloc(sc->num_cqueues + 1,
- sizeof(struct nvme_completion_queue));
+ for (i = 0; i < sc->num_squeues + 1; i++) {
+ sc->submit_queues[i].qbase = NULL;
+ sc->submit_queues[i].size = 0;
+ sc->submit_queues[i].cqid = 0;
+ sc->submit_queues[i].tail = 0;
+ sc->submit_queues[i].head = 0;
+ }
+
+ assert(sc->compl_queues != NULL);
- for (int i = 0; i < sc->num_cqueues + 1; i++)
- pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
+ for (i = 0; i < sc->num_cqueues + 1; i++) {
+ sc->compl_queues[i].qbase = NULL;
+ sc->compl_queues[i].size = 0;
+ sc->compl_queues[i].tail = 0;
+ sc->compl_queues[i].head = 0;
}
+
+ sc->num_q_is_set = false;
+
+ pci_nvme_aer_destroy(sc);
}
static void
@@ -565,23 +782,25 @@ pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
uint16_t acqs, asqs;
- DPRINTF(("%s", __func__));
+ DPRINTF("%s", __func__);
asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
sc->submit_queues[0].size = asqs;
sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
sizeof(struct nvme_command) * asqs);
- DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p",
- __func__, sc->regs.asq, sc->submit_queues[0].qbase));
+ DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
+ __func__, sc->regs.asq, sc->submit_queues[0].qbase);
acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
NVME_AQA_REG_ACQS_MASK) + 1;
sc->compl_queues[0].size = acqs;
sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
sizeof(struct nvme_completion) * acqs);
- DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p",
- __func__, sc->regs.acq, sc->compl_queues[0].qbase));
+ sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
+
+ DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
+ __func__, sc->regs.acq, sc->compl_queues[0].qbase);
}
static int
@@ -631,22 +850,63 @@ nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
return (0);
}
+/*
+ * Write a Completion Queue Entry update
+ *
+ * Write the completion and update the doorbell value
+ */
+static void
+pci_nvme_cq_update(struct pci_nvme_softc *sc,
+ struct nvme_completion_queue *cq,
+ uint32_t cdw0,
+ uint16_t cid,
+ uint16_t sqid,
+ uint16_t status)
+{
+ struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
+ struct nvme_completion *cqe;
+
+ assert(cq->qbase != NULL);
+
+ pthread_mutex_lock(&cq->mtx);
+
+ cqe = &cq->qbase[cq->tail];
+
+ /* Flip the phase bit */
+ status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
+
+ cqe->cdw0 = cdw0;
+ cqe->sqhd = sq->head;
+ cqe->sqid = sqid;
+ cqe->cid = cid;
+ cqe->status = status;
+
+ cq->tail++;
+ if (cq->tail >= cq->size) {
+ cq->tail = 0;
+ }
+
+ pthread_mutex_unlock(&cq->mtx);
+}
+
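The XOR above implements the NVMe phase tag: each new completion inverts the
P bit of the stale entry it overwrites, so the bit alternates on every pass
through the ring. A host-side consumer sketch (the ring variables and
consume() are hypothetical; the P mask is from nvme.h):

    /* entries are fresh while their phase bit matches the expected phase */
    while ((cqe->status & NVME_STATUS_P_MASK) == expected_phase) {
        consume(cqe);
        if (++head == cq_size) {
            head = 0;
            expected_phase ^= NVME_STATUS_P_MASK;
        }
        cqe = &cq_base[head];
    }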
static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
struct nvme_completion* compl)
{
uint16_t qid = command->cdw10 & 0xffff;
- DPRINTF(("%s DELETE_IO_SQ %u", __func__, qid));
- if (qid == 0 || qid > sc->num_squeues) {
- WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u",
- __func__, qid, sc->num_squeues));
+ DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
+ if (qid == 0 || qid > sc->num_squeues ||
+ (sc->submit_queues[qid].qbase == NULL)) {
+ WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
+ __func__, qid, sc->num_squeues);
pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
NVME_SC_INVALID_QUEUE_IDENTIFIER);
return (1);
}
sc->submit_queues[qid].qbase = NULL;
+ sc->submit_queues[qid].cqid = 0;
pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
return (1);
}
@@ -659,9 +919,10 @@ nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
uint16_t qid = command->cdw10 & 0xffff;
struct nvme_submission_queue *nsq;
- if ((qid == 0) || (qid > sc->num_squeues)) {
- WPRINTF(("%s queue index %u > num_squeues %u",
- __func__, qid, sc->num_squeues));
+ if ((qid == 0) || (qid > sc->num_squeues) ||
+ (sc->submit_queues[qid].qbase != NULL)) {
+ WPRINTF("%s queue index %u > num_squeues %u",
+ __func__, qid, sc->num_squeues);
pci_nvme_status_tc(&compl->status,
NVME_SCT_COMMAND_SPECIFIC,
NVME_SC_INVALID_QUEUE_IDENTIFIER);
@@ -670,26 +931,54 @@ nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
nsq = &sc->submit_queues[qid];
nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
+ DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
+ if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
+ /*
+ * Queues must specify at least two entries
+ * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
+ * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
+ */
+ pci_nvme_status_tc(&compl->status,
+ NVME_SCT_COMMAND_SPECIFIC,
+ NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
+ return (1);
+ }
+ nsq->head = nsq->tail = 0;
- nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
- sizeof(struct nvme_command) * (size_t)nsq->size);
nsq->cqid = (command->cdw11 >> 16) & 0xffff;
+ if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
+ pci_nvme_status_tc(&compl->status,
+ NVME_SCT_COMMAND_SPECIFIC,
+ NVME_SC_INVALID_QUEUE_IDENTIFIER);
+ return (1);
+ }
+
+ if (sc->compl_queues[nsq->cqid].qbase == NULL) {
+ pci_nvme_status_tc(&compl->status,
+ NVME_SCT_COMMAND_SPECIFIC,
+ NVME_SC_COMPLETION_QUEUE_INVALID);
+ return (1);
+ }
+
nsq->qpriority = (command->cdw11 >> 1) & 0x03;
- DPRINTF(("%s sq %u size %u gaddr %p cqid %u", __func__,
- qid, nsq->size, nsq->qbase, nsq->cqid));
+ nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
+ sizeof(struct nvme_command) * (size_t)nsq->size);
+
+ DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
+ qid, nsq->size, nsq->qbase, nsq->cqid);
pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
- DPRINTF(("%s completed creating IOSQ qid %u",
- __func__, qid));
+ DPRINTF("%s completed creating IOSQ qid %u",
+ __func__, qid);
} else {
/*
* Guest sent non-cont submission queue request.
* This setting is unsupported by this emulation.
*/
- WPRINTF(("%s unsupported non-contig (list-based) "
- "create i/o submission queue", __func__));
+ WPRINTF("%s unsupported non-contig (list-based) "
+ "create i/o submission queue", __func__);
pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
}
@@ -701,16 +990,27 @@ nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
struct nvme_completion* compl)
{
uint16_t qid = command->cdw10 & 0xffff;
+ uint16_t sqid;
- DPRINTF(("%s DELETE_IO_CQ %u", __func__, qid));
- if (qid == 0 || qid > sc->num_cqueues) {
- WPRINTF(("%s queue index %u / num_cqueues %u",
- __func__, qid, sc->num_cqueues));
+ DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
+ if (qid == 0 || qid > sc->num_cqueues ||
+ (sc->compl_queues[qid].qbase == NULL)) {
+ WPRINTF("%s queue index %u / num_cqueues %u",
+ __func__, qid, sc->num_cqueues);
pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
NVME_SC_INVALID_QUEUE_IDENTIFIER);
return (1);
}
+ /* Deleting an Active CQ is an error */
+ for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
+ if (sc->submit_queues[sqid].cqid == qid) {
+ pci_nvme_status_tc(&compl->status,
+ NVME_SCT_COMMAND_SPECIFIC,
+ NVME_SC_INVALID_QUEUE_DELETION);
+ return (1);
+ }
+
sc->compl_queues[qid].qbase = NULL;
pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
return (1);
@@ -720,40 +1020,58 @@ static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
struct nvme_completion* compl)
{
- if (command->cdw11 & NVME_CMD_CDW11_PC) {
- uint16_t qid = command->cdw10 & 0xffff;
- struct nvme_completion_queue *ncq;
+ struct nvme_completion_queue *ncq;
+ uint16_t qid = command->cdw10 & 0xffff;
- if ((qid == 0) || (qid > sc->num_cqueues)) {
- WPRINTF(("%s queue index %u > num_cqueues %u",
- __func__, qid, sc->num_cqueues));
- pci_nvme_status_tc(&compl->status,
- NVME_SCT_COMMAND_SPECIFIC,
- NVME_SC_INVALID_QUEUE_IDENTIFIER);
- return (1);
- }
+ /* Only support Physically Contiguous queues */
+ if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
+ WPRINTF("%s unsupported non-contig (list-based) "
+ "create i/o completion queue",
+ __func__);
- ncq = &sc->compl_queues[qid];
- ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
- ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
- ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
+ pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
+ return (1);
+ }
- ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
- command->prp1,
- sizeof(struct nvme_command) * (size_t)ncq->size);
+ if ((qid == 0) || (qid > sc->num_cqueues) ||
+ (sc->compl_queues[qid].qbase != NULL)) {
+ WPRINTF("%s queue index %u > num_cqueues %u",
+ __func__, qid, sc->num_cqueues);
+ pci_nvme_status_tc(&compl->status,
+ NVME_SCT_COMMAND_SPECIFIC,
+ NVME_SC_INVALID_QUEUE_IDENTIFIER);
+ return (1);
+ }
+
+ ncq = &sc->compl_queues[qid];
+ ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
+ ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
+ if (ncq->intr_vec > (sc->max_queues + 1)) {
+ pci_nvme_status_tc(&compl->status,
+ NVME_SCT_COMMAND_SPECIFIC,
+ NVME_SC_INVALID_INTERRUPT_VECTOR);
+ return (1);
+ }
- pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
- } else {
- /*
- * Non-contig completion queue unsupported.
+ ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
+ if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
+ /*
+ * Queues must specify at least two entries
+ * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
+ * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
*/
- WPRINTF(("%s unsupported non-contig (list-based) "
- "create i/o completion queue",
- __func__));
-
- /* 0x12 = Invalid Use of Controller Memory Buffer */
- pci_nvme_status_genc(&compl->status, 0x12);
+ pci_nvme_status_tc(&compl->status,
+ NVME_SCT_COMMAND_SPECIFIC,
+ NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
+ return (1);
}
+ ncq->head = ncq->tail = 0;
+ ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
+ command->prp1,
+ sizeof(struct nvme_command) * (size_t)ncq->size);
+
+ pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+
return (1);
}
@@ -762,33 +1080,53 @@ static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
struct nvme_completion* compl)
{
- uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 2;
+ uint32_t logsize = 0;
uint8_t logpage = command->cdw10 & 0xFF;
- DPRINTF(("%s log page %u len %u", __func__, logpage, logsize));
+ DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+ /*
+ * Command specifies the number of dwords to return in fields NUMDU
+ * and NUMDL. This is a zero-based value.
+ */
+ logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
+ logsize *= sizeof(uint32_t);
+
switch (logpage) {
case NVME_LOG_ERROR:
nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
- command->prp2, (uint8_t *)&sc->err_log, logsize,
+ command->prp2, (uint8_t *)&sc->err_log,
+ MIN(logsize, sizeof(sc->err_log)),
NVME_COPY_TO_PRP);
break;
case NVME_LOG_HEALTH_INFORMATION:
- /* TODO: present some smart info */
+ pthread_mutex_lock(&sc->mtx);
+ memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
+ sizeof(sc->health_log.data_units_read));
+ memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
+ sizeof(sc->health_log.data_units_written));
+ memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
+ sizeof(sc->health_log.host_read_commands));
+ memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
+ sizeof(sc->health_log.host_write_commands));
+ pthread_mutex_unlock(&sc->mtx);
+
nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
- command->prp2, (uint8_t *)&sc->health_log, logsize,
+ command->prp2, (uint8_t *)&sc->health_log,
+ MIN(logsize, sizeof(sc->health_log)),
NVME_COPY_TO_PRP);
break;
case NVME_LOG_FIRMWARE_SLOT:
nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
- command->prp2, (uint8_t *)&sc->fw_log, logsize,
+ command->prp2, (uint8_t *)&sc->fw_log,
+ MIN(logsize, sizeof(sc->fw_log)),
NVME_COPY_TO_PRP);
break;
default:
- WPRINTF(("%s get log page %x command not supported",
- __func__, logpage));
+ DPRINTF("%s get log page %x command not supported",
+ __func__, logpage);
pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
NVME_SC_INVALID_LOG_PAGE);
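A worked example of the new size calculation (values hypothetical): NUMDL is
CDW10[31:16] and NUMDU is CDW11[15:0], together a zero-based dword count, so

    cdw10 = 0x03ff0002, cdw11 = 0   /* log page 0x02, NUMDL 0x3ff */
    logsize = ((0 << 16) | 0x03ff) + 1 = 1024 dwords = 4096 bytes

and each copy is clamped with MIN(), so an oversized request can no longer
read past the end of the emulated log structure.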
@@ -802,9 +1140,12 @@ nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
struct nvme_completion* compl)
{
void *dest;
+ uint16_t status = 0;
+
+ DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
+ command->cdw10 & 0xFF, command->nsid);
- DPRINTF(("%s identify 0x%x nsid 0x%x", __func__,
- command->cdw10 & 0xFF, command->nsid));
+ pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
switch (command->cdw10 & 0xFF) {
case 0x00: /* return Identify Namespace data structure */
@@ -821,230 +1162,359 @@ nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
sizeof(uint32_t) * 1024);
+ /* All unused entries shall be zero */
+ bzero(dest, sizeof(uint32_t) * 1024);
((uint32_t *)dest)[0] = 1;
- ((uint32_t *)dest)[1] = 0;
break;
- case 0x11:
- pci_nvme_status_genc(&compl->status,
- NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
- return (1);
case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
- case 0x10:
- case 0x12:
- case 0x13:
- case 0x14:
- case 0x15:
+ if (command->nsid != 1) {
+ pci_nvme_status_genc(&status,
+ NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
+ break;
+ }
+ dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
+ sizeof(uint32_t) * 1024);
+ /* All bytes after the descriptor shall be zero */
+ bzero(dest, sizeof(uint32_t) * 1024);
+
+ /* Return NIDT=1 (i.e. EUI64) descriptor */
+ ((uint8_t *)dest)[0] = 1;
+ ((uint8_t *)dest)[1] = sizeof(uint64_t);
+ bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
+ break;
default:
- DPRINTF(("%s unsupported identify command requested 0x%x",
- __func__, command->cdw10 & 0xFF));
- pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
- return (1);
+ DPRINTF("%s unsupported identify command requested 0x%x",
+ __func__, command->cdw10 & 0xFF);
+ pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
+ break;
}
- pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+ compl->status = status;
return (1);
}
-static int
-nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
- struct nvme_completion* compl)
+static const char *
+nvme_fid_to_name(uint8_t fid)
+{
+ const char *name;
+
+ switch (fid) {
+ case NVME_FEAT_ARBITRATION:
+ name = "Arbitration";
+ break;
+ case NVME_FEAT_POWER_MANAGEMENT:
+ name = "Power Management";
+ break;
+ case NVME_FEAT_LBA_RANGE_TYPE:
+ name = "LBA Range Type";
+ break;
+ case NVME_FEAT_TEMPERATURE_THRESHOLD:
+ name = "Temperature Threshold";
+ break;
+ case NVME_FEAT_ERROR_RECOVERY:
+ name = "Error Recovery";
+ break;
+ case NVME_FEAT_VOLATILE_WRITE_CACHE:
+ name = "Volatile Write Cache";
+ break;
+ case NVME_FEAT_NUMBER_OF_QUEUES:
+ name = "Number of Queues";
+ break;
+ case NVME_FEAT_INTERRUPT_COALESCING:
+ name = "Interrupt Coalescing";
+ break;
+ case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
+ name = "Interrupt Vector Configuration";
+ break;
+ case NVME_FEAT_WRITE_ATOMICITY:
+ name = "Write Atomicity Normal";
+ break;
+ case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
+ name = "Asynchronous Event Configuration";
+ break;
+ case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
+ name = "Autonomous Power State Transition";
+ break;
+ case NVME_FEAT_HOST_MEMORY_BUFFER:
+ name = "Host Memory Buffer";
+ break;
+ case NVME_FEAT_TIMESTAMP:
+ name = "Timestamp";
+ break;
+ case NVME_FEAT_KEEP_ALIVE_TIMER:
+ name = "Keep Alive Timer";
+ break;
+ case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
+ name = "Host Controlled Thermal Management";
+ break;
+ case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
+ name = "Non-Operation Power State Config";
+ break;
+ case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
+ name = "Read Recovery Level Config";
+ break;
+ case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
+ name = "Predictable Latency Mode Config";
+ break;
+ case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
+ name = "Predictable Latency Mode Window";
+ break;
+ case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
+ name = "LBA Status Information Report Interval";
+ break;
+ case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
+ name = "Host Behavior Support";
+ break;
+ case NVME_FEAT_SANITIZE_CONFIG:
+ name = "Sanitize Config";
+ break;
+ case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
+ name = "Endurance Group Event Configuration";
+ break;
+ case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
+ name = "Software Progress Marker";
+ break;
+ case NVME_FEAT_HOST_IDENTIFIER:
+ name = "Host Identifier";
+ break;
+ case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
+ name = "Reservation Notification Mask";
+ break;
+ case NVME_FEAT_RESERVATION_PERSISTENCE:
+ name = "Reservation Persistence";
+ break;
+ case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
+ name = "Namespace Write Protection Config";
+ break;
+ default:
+ name = "Unknown";
+ break;
+ }
+
+ return (name);
+}
+
+static void
+nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
+ struct nvme_feature_obj *feat,
+ struct nvme_command *command,
+ struct nvme_completion *compl)
+{
+
+ pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
+}
+
+static void
+nvme_feature_iv_config(struct pci_nvme_softc *sc,
+ struct nvme_feature_obj *feat,
+ struct nvme_command *command,
+ struct nvme_completion *compl)
+{
+ uint32_t i;
+ uint32_t cdw11 = command->cdw11;
+ uint16_t iv;
+ bool cd;
+
+ pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
+
+ iv = cdw11 & 0xffff;
+ cd = cdw11 & (1 << 16);
+
+ if (iv > (sc->max_queues + 1)) {
+ return;
+ }
+
+ /* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
+ if ((iv == 0) && !cd)
+ return;
+
+ /* Requested Interrupt Vector must be used by a CQ */
+ for (i = 0; i < sc->num_cqueues + 1; i++) {
+ if (sc->compl_queues[i].intr_vec == iv) {
+ pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+ }
+ }
+
+}
+
+static void
+nvme_feature_num_queues(struct pci_nvme_softc *sc,
+ struct nvme_feature_obj *feat,
+ struct nvme_command *command,
+ struct nvme_completion *compl)
{
uint16_t nqr; /* Number of Queues Requested */
+ if (sc->num_q_is_set) {
+ WPRINTF("%s: Number of Queues already set", __func__);
+ pci_nvme_status_genc(&compl->status,
+ NVME_SC_COMMAND_SEQUENCE_ERROR);
+ return;
+ }
+
nqr = command->cdw11 & 0xFFFF;
if (nqr == 0xffff) {
- WPRINTF(("%s: Illegal NSQR value %#x", __func__, nqr));
+ WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
- return (-1);
+ return;
}
sc->num_squeues = ONE_BASED(nqr);
if (sc->num_squeues > sc->max_queues) {
- DPRINTF(("NSQR=%u is greater than max %u", sc->num_squeues,
- sc->max_queues));
+ DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
+ sc->max_queues);
sc->num_squeues = sc->max_queues;
}
nqr = (command->cdw11 >> 16) & 0xFFFF;
if (nqr == 0xffff) {
- WPRINTF(("%s: Illegal NCQR value %#x", __func__, nqr));
+ WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
- return (-1);
+ return;
}
sc->num_cqueues = ONE_BASED(nqr);
if (sc->num_cqueues > sc->max_queues) {
- DPRINTF(("NCQR=%u is greater than max %u", sc->num_cqueues,
- sc->max_queues));
+ DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
+ sc->max_queues);
sc->num_cqueues = sc->max_queues;
}
+ /* Patch the command value which will be saved on callback's return */
+ command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
- return (0);
+ sc->num_q_is_set = true;
}
static int
-nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
- struct nvme_completion* compl)
+nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
+ struct nvme_completion *compl)
{
- int feature = command->cdw10 & 0xFF;
- uint32_t iv;
+ struct nvme_feature_obj *feat;
+ uint32_t nsid = command->nsid;
+ uint8_t fid = command->cdw10 & 0xFF;
+
+ DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
+
+ if (fid >= NVME_FID_MAX) {
+ DPRINTF("%s invalid feature 0x%x", __func__, fid);
+ pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
+ return (1);
+ }
+ feat = &sc->feat[fid];
+
+ if (!feat->namespace_specific &&
+ !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
+ pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
+ NVME_SC_FEATURE_NOT_NS_SPECIFIC);
+ return (1);
+ }
- DPRINTF(("%s feature 0x%x", __func__, feature));
compl->cdw0 = 0;
+ pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
- switch (feature) {
- case NVME_FEAT_ARBITRATION:
- DPRINTF((" arbitration 0x%x", command->cdw11));
- break;
- case NVME_FEAT_POWER_MANAGEMENT:
- DPRINTF((" power management 0x%x", command->cdw11));
- break;
- case NVME_FEAT_LBA_RANGE_TYPE:
- DPRINTF((" lba range 0x%x", command->cdw11));
- break;
- case NVME_FEAT_TEMPERATURE_THRESHOLD:
- DPRINTF((" temperature threshold 0x%x", command->cdw11));
- break;
- case NVME_FEAT_ERROR_RECOVERY:
- DPRINTF((" error recovery 0x%x", command->cdw11));
- break;
- case NVME_FEAT_VOLATILE_WRITE_CACHE:
- DPRINTF((" volatile write cache 0x%x", command->cdw11));
- break;
- case NVME_FEAT_NUMBER_OF_QUEUES:
- nvme_set_feature_queues(sc, command, compl);
- break;
- case NVME_FEAT_INTERRUPT_COALESCING:
- DPRINTF((" interrupt coalescing 0x%x", command->cdw11));
+ if (feat->set)
+ feat->set(sc, feat, command, compl);
- /* in uS */
- sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;
+ if (compl->status == NVME_SC_SUCCESS)
+ feat->cdw11 = command->cdw11;
- sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
- break;
- case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
- iv = command->cdw11 & 0xFFFF;
-
- DPRINTF((" interrupt vector configuration 0x%x",
- command->cdw11));
-
- for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
- if (sc->compl_queues[i].intr_vec == iv) {
- if (command->cdw11 & (1 << 16))
- sc->compl_queues[i].intr_en |=
- NVME_CQ_INTCOAL;
- else
- sc->compl_queues[i].intr_en &=
- ~NVME_CQ_INTCOAL;
- }
- }
- break;
- case NVME_FEAT_WRITE_ATOMICITY:
- DPRINTF((" write atomicity 0x%x", command->cdw11));
- break;
- case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
- DPRINTF((" async event configuration 0x%x",
- command->cdw11));
- sc->async_ev_config = command->cdw11;
- break;
- case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
- DPRINTF((" software progress marker 0x%x",
- command->cdw11));
- break;
- case 0x0C:
- DPRINTF((" autonomous power state transition 0x%x",
- command->cdw11));
- break;
- default:
- WPRINTF(("%s invalid feature", __func__));
+ return (0);
+}
+
+static int
+nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
+ struct nvme_completion* compl)
+{
+ struct nvme_feature_obj *feat;
+ uint8_t fid = command->cdw10 & 0xFF;
+
+ DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
+
+ if (fid >= NVME_FID_MAX) {
+ DPRINTF("%s invalid feature 0x%x", __func__, fid);
pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
return (1);
}
+ compl->cdw0 = 0;
pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
- return (1);
+
+ feat = &sc->feat[fid];
+ if (feat->get) {
+ feat->get(sc, feat, command, compl);
+ }
+
+ if (compl->status == NVME_SC_SUCCESS) {
+ compl->cdw0 = feat->cdw11;
+ }
+
+ return (0);
}
static int
-nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
+nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
struct nvme_completion* compl)
{
- int feature = command->cdw10 & 0xFF;
+ uint8_t ses, lbaf, pi;
- DPRINTF(("%s feature 0x%x", __func__, feature));
+ /* Only supports Secure Erase Setting - User Data Erase */
+ ses = (command->cdw10 >> 9) & 0x7;
+ if (ses > 0x1) {
+ pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
+ return (1);
+ }
- compl->cdw0 = 0;
+ /* Only supports a single LBA Format */
+ lbaf = command->cdw10 & 0xf;
+ if (lbaf != 0) {
+ pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
+ NVME_SC_INVALID_FORMAT);
+ return (1);
+ }
- switch (feature) {
- case NVME_FEAT_ARBITRATION:
- DPRINTF((" arbitration"));
- break;
- case NVME_FEAT_POWER_MANAGEMENT:
- DPRINTF((" power management"));
- break;
- case NVME_FEAT_LBA_RANGE_TYPE:
- DPRINTF((" lba range"));
- break;
- case NVME_FEAT_TEMPERATURE_THRESHOLD:
- DPRINTF((" temperature threshold"));
- switch ((command->cdw11 >> 20) & 0x3) {
- case 0:
- /* Over temp threshold */
- compl->cdw0 = 0xFFFF;
- break;
- case 1:
- /* Under temp threshold */
- compl->cdw0 = 0;
- break;
- default:
- WPRINTF((" invalid threshold type select"));
+	/* Doesn't support Protection Information */
+ pi = (command->cdw10 >> 5) & 0x7;
+ if (pi != 0) {
+ pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
+ return (1);
+ }
+
+ if (sc->nvstore.type == NVME_STOR_RAM) {
+ if (sc->nvstore.ctx)
+ free(sc->nvstore.ctx);
+ sc->nvstore.ctx = calloc(1, sc->nvstore.size);
+ pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+ } else {
+ struct pci_nvme_ioreq *req;
+ int err;
+
+ req = pci_nvme_get_ioreq(sc);
+ if (req == NULL) {
pci_nvme_status_genc(&compl->status,
- NVME_SC_INVALID_FIELD);
+ NVME_SC_INTERNAL_DEVICE_ERROR);
+ WPRINTF("%s: unable to allocate IO req", __func__);
return (1);
}
- break;
- case NVME_FEAT_ERROR_RECOVERY:
- DPRINTF((" error recovery"));
- break;
- case NVME_FEAT_VOLATILE_WRITE_CACHE:
- DPRINTF((" volatile write cache"));
- break;
- case NVME_FEAT_NUMBER_OF_QUEUES:
- compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
-
- DPRINTF((" number of queues (submit %u, completion %u)",
- compl->cdw0 & 0xFFFF,
- (compl->cdw0 >> 16) & 0xFFFF));
+ req->nvme_sq = &sc->submit_queues[0];
+ req->sqid = 0;
+ req->opc = command->opc;
+ req->cid = command->cid;
+ req->nsid = command->nsid;
+
+ req->io_req.br_offset = 0;
+ req->io_req.br_resid = sc->nvstore.size;
+ req->io_req.br_callback = pci_nvme_io_done;
- break;
- case NVME_FEAT_INTERRUPT_COALESCING:
- DPRINTF((" interrupt coalescing"));
- break;
- case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
- DPRINTF((" interrupt vector configuration"));
- break;
- case NVME_FEAT_WRITE_ATOMICITY:
- DPRINTF((" write atomicity"));
- break;
- case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
- DPRINTF((" async event configuration"));
- sc->async_ev_config = command->cdw11;
- break;
- case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
- DPRINTF((" software progress marker"));
- break;
- case 0x0C:
- DPRINTF((" autonomous power state transition"));
- break;
- default:
- WPRINTF(("%s invalid feature 0x%x", __func__, feature));
- pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
- return (1);
+ err = blockif_delete(sc->nvstore.ctx, &req->io_req);
+ if (err) {
+ pci_nvme_status_genc(&compl->status,
+ NVME_SC_INTERNAL_DEVICE_ERROR);
+ pci_nvme_release_ioreq(sc, req);
+ }
}
- pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
return (1);
}
@@ -1052,8 +1522,8 @@ static int
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
struct nvme_completion* compl)
{
- DPRINTF(("%s submission queue %u, command ID 0x%x", __func__,
- command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));
+ DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
+ command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
/* TODO: search for the command ID and abort it */
@@ -1062,25 +1532,34 @@ nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
return (1);
}
-#ifdef __FreeBSD__
static int
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
struct nvme_command* command, struct nvme_completion* compl)
{
- DPRINTF(("%s async event request 0x%x", __func__, command->cdw11));
+ DPRINTF("%s async event request 0x%x", __func__, command->cdw11);
+
+ /* Don't exceed the Async Event Request Limit (AERL). */
+ if (pci_nvme_aer_limit_reached(sc)) {
+ pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
+ NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
+ return (1);
+ }
+
+ if (pci_nvme_aer_add(sc, command->cid)) {
+ pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
+ NVME_SC_INTERNAL_DEVICE_ERROR);
+ return (1);
+ }
/*
- * TODO: raise events when they happen based on the Set Features cmd.
+ * Raise events when they happen based on the Set Features cmd.
* These events happen async, so only set completion successful if
* there is an event reflective of the request to get event.
*/
- pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
- NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
+ compl->status = NVME_NO_STATUS;
+
return (0);
}
-#else
-/* This is kept behind an ifdef while it's unused to appease the compiler. */
-#endif /* __FreeBSD__ */
static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
@@ -1091,20 +1570,15 @@ pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
struct nvme_completion_queue *cq;
uint16_t sqhead;
- DPRINTF(("%s index %u", __func__, (uint32_t)value));
+ DPRINTF("%s index %u", __func__, (uint32_t)value);
sq = &sc->submit_queues[0];
cq = &sc->compl_queues[0];
- sqhead = atomic_load_acq_short(&sq->head);
+ pthread_mutex_lock(&sq->mtx);
- if (atomic_testandset_int(&sq->busy, 1)) {
- DPRINTF(("%s SQ busy, head %u, tail %u",
- __func__, sqhead, sq->tail));
- return;
- }
-
- DPRINTF(("sqhead %u, tail %u", sqhead, sq->tail));
+ sqhead = sq->head;
+ DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
while (sqhead != atomic_load_acq_short(&sq->tail)) {
cmd = &(sq->qbase)[sqhead];
@@ -1113,205 +1587,226 @@ pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
switch (cmd->opc) {
case NVME_OPC_DELETE_IO_SQ:
- DPRINTF(("%s command DELETE_IO_SQ", __func__));
+ DPRINTF("%s command DELETE_IO_SQ", __func__);
nvme_opc_delete_io_sq(sc, cmd, &compl);
break;
case NVME_OPC_CREATE_IO_SQ:
- DPRINTF(("%s command CREATE_IO_SQ", __func__));
+ DPRINTF("%s command CREATE_IO_SQ", __func__);
nvme_opc_create_io_sq(sc, cmd, &compl);
break;
case NVME_OPC_DELETE_IO_CQ:
- DPRINTF(("%s command DELETE_IO_CQ", __func__));
+ DPRINTF("%s command DELETE_IO_CQ", __func__);
nvme_opc_delete_io_cq(sc, cmd, &compl);
break;
case NVME_OPC_CREATE_IO_CQ:
- DPRINTF(("%s command CREATE_IO_CQ", __func__));
+ DPRINTF("%s command CREATE_IO_CQ", __func__);
nvme_opc_create_io_cq(sc, cmd, &compl);
break;
case NVME_OPC_GET_LOG_PAGE:
- DPRINTF(("%s command GET_LOG_PAGE", __func__));
+ DPRINTF("%s command GET_LOG_PAGE", __func__);
nvme_opc_get_log_page(sc, cmd, &compl);
break;
case NVME_OPC_IDENTIFY:
- DPRINTF(("%s command IDENTIFY", __func__));
+ DPRINTF("%s command IDENTIFY", __func__);
nvme_opc_identify(sc, cmd, &compl);
break;
case NVME_OPC_ABORT:
- DPRINTF(("%s command ABORT", __func__));
+ DPRINTF("%s command ABORT", __func__);
nvme_opc_abort(sc, cmd, &compl);
break;
case NVME_OPC_SET_FEATURES:
- DPRINTF(("%s command SET_FEATURES", __func__));
+ DPRINTF("%s command SET_FEATURES", __func__);
nvme_opc_set_features(sc, cmd, &compl);
break;
case NVME_OPC_GET_FEATURES:
- DPRINTF(("%s command GET_FEATURES", __func__));
+ DPRINTF("%s command GET_FEATURES", __func__);
nvme_opc_get_features(sc, cmd, &compl);
break;
+ case NVME_OPC_FIRMWARE_ACTIVATE:
+ DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
+ pci_nvme_status_tc(&compl.status,
+ NVME_SCT_COMMAND_SPECIFIC,
+ NVME_SC_INVALID_FIRMWARE_SLOT);
+ break;
case NVME_OPC_ASYNC_EVENT_REQUEST:
- DPRINTF(("%s command ASYNC_EVENT_REQ", __func__));
- /* XXX dont care, unhandled for now
+ DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
nvme_opc_async_event_req(sc, cmd, &compl);
- */
+ break;
+ case NVME_OPC_FORMAT_NVM:
+ DPRINTF("%s command FORMAT_NVM", __func__);
+			if ((sc->ctrldata.oacs &
+			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
+				pci_nvme_status_genc(&compl.status,
+				    NVME_SC_INVALID_OPCODE);
+				break;
+			}
compl.status = NVME_NO_STATUS;
+ nvme_opc_format_nvm(sc, cmd, &compl);
break;
default:
- WPRINTF(("0x%x command is not implemented",
- cmd->opc));
+ DPRINTF("0x%x command is not implemented",
+ cmd->opc);
pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
}
sqhead = (sqhead + 1) % sq->size;
if (NVME_COMPLETION_VALID(compl)) {
- struct nvme_completion *cp;
- int phase;
-
- cp = &(cq->qbase)[cq->tail];
- cp->cdw0 = compl.cdw0;
- cp->sqid = 0;
- cp->sqhd = sqhead;
- cp->cid = cmd->cid;
-
- phase = NVME_STATUS_GET_P(cp->status);
- cp->status = compl.status;
- pci_nvme_toggle_phase(&cp->status, phase);
-
- cq->tail = (cq->tail + 1) % cq->size;
+ pci_nvme_cq_update(sc, &sc->compl_queues[0],
+ compl.cdw0,
+ cmd->cid,
+ 0, /* SQID */
+ compl.status);
}
}
- DPRINTF(("setting sqhead %u", sqhead));
- atomic_store_short(&sq->head, sqhead);
- atomic_store_int(&sq->busy, 0);
+ DPRINTF("setting sqhead %u", sqhead);
+ sq->head = sqhead;
if (cq->head != cq->tail)
pci_generate_msix(sc->nsc_pi, 0);
+ pthread_mutex_unlock(&sq->mtx);
}
-static int
-pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
- uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
+/*
+ * Update the Write and Read statistics reported in SMART data
+ *
+ * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
+ * E.g. 1 data unit covers 1 - 1,000 512 byte blocks and 3 data units cover
+ * 2,001 - 3,000 512 byte blocks. Rounding up is achieved by initializing
+ * the remainder to 999.
+ */
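+/*
+ * Seeding the remainder with 999 is what rounds up: the very first
+ * 512 byte block pushes it to 1,000 and counts a full data unit, while
+ * blocks 2 through 1,000 then accumulate toward the next one.
+ */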
+static void
+pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
+ size_t bytes, uint16_t status)
{
- int iovidx;
- if (req != NULL) {
- /* concatenate contig block-iovs to minimize number of iovs */
- if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
- iovidx = req->io_req.br_iovcnt - 1;
+ pthread_mutex_lock(&sc->mtx);
+ switch (opc) {
+ case NVME_OPC_WRITE:
+ sc->write_commands++;
+ if (status != NVME_SC_SUCCESS)
+ break;
+ sc->write_dunits_remainder += (bytes / 512);
+ while (sc->write_dunits_remainder >= 1000) {
+ sc->write_data_units++;
+ sc->write_dunits_remainder -= 1000;
+ }
+ break;
+ case NVME_OPC_READ:
+ sc->read_commands++;
+ if (status != NVME_SC_SUCCESS)
+ break;
+ sc->read_dunits_remainder += (bytes / 512);
+ while (sc->read_dunits_remainder >= 1000) {
+ sc->read_data_units++;
+ sc->read_dunits_remainder -= 1000;
+ }
+ break;
+ default:
+ DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
+ break;
+ }
+ pthread_mutex_unlock(&sc->mtx);
+}
- req->io_req.br_iov[iovidx].iov_base =
- paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
- req->prev_gpaddr, size);
+/*
+ * Check if the combination of Starting LBA (slba) and Number of Logical
+ * Blocks (nlb) exceeds the range of the underlying storage.
+ *
+ * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
+ * the capacity in bytes as a uint64_t, care must be taken to avoid integer
+ * overflow.
+ */
+static bool
+pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
+ uint32_t nlb)
+{
+ size_t offset, bytes;
- req->prev_size += size;
- req->io_req.br_resid += size;
+ /* Overflow check of multiplying Starting LBA by the sector size */
+ if (slba >> (64 - nvstore->sectsz_bits))
+ return (true);
- req->io_req.br_iov[iovidx].iov_len = req->prev_size;
- } else {
- pthread_mutex_lock(&req->mtx);
+ offset = slba << nvstore->sectsz_bits;
+ bytes = nlb << nvstore->sectsz_bits;
- iovidx = req->io_req.br_iovcnt;
- if (iovidx == NVME_MAX_BLOCKIOVS) {
- int err = 0;
+ /* Overflow check of Number of Logical Blocks */
+ if ((nvstore->size - offset) < bytes)
+ return (true);
- DPRINTF(("large I/O, doing partial req"));
+ return (false);
+}
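+
+/*
+ * E.g. with 512 byte sectors (sectsz_bits == 9), any slba at or above
+ * 1 << 55 would overflow the 64-bit byte offset when shifted left by 9;
+ * slba >> (64 - 9) being non-zero catches exactly those values.
+ */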
- iovidx = 0;
- req->io_req.br_iovcnt = 0;
+static int
+pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
+ uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
+{
+ int iovidx;
- req->io_req.br_callback = pci_nvme_io_partial;
+ if (req == NULL)
+ return (-1);
- if (!do_write)
- err = blockif_read(sc->nvstore.ctx,
- &req->io_req);
- else
- err = blockif_write(sc->nvstore.ctx,
- &req->io_req);
+ if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
+ return (-1);
+ }
- /* wait until req completes before cont */
- if (err == 0)
- pthread_cond_wait(&req->cv, &req->mtx);
- }
- if (iovidx == 0) {
- req->io_req.br_offset = lba;
- req->io_req.br_resid = 0;
- req->io_req.br_param = req;
- }
+ /* concatenate contig block-iovs to minimize number of iovs */
+ if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
+ iovidx = req->io_req.br_iovcnt - 1;
- req->io_req.br_iov[iovidx].iov_base =
- paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
- gpaddr, size);
+ req->io_req.br_iov[iovidx].iov_base =
+ paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
+ req->prev_gpaddr, size);
- req->io_req.br_iov[iovidx].iov_len = size;
+ req->prev_size += size;
+ req->io_req.br_resid += size;
- req->prev_gpaddr = gpaddr;
- req->prev_size = size;
- req->io_req.br_resid += size;
+ req->io_req.br_iov[iovidx].iov_len = req->prev_size;
+ } else {
+ iovidx = req->io_req.br_iovcnt;
+ if (iovidx == 0) {
+ req->io_req.br_offset = lba;
+ req->io_req.br_resid = 0;
+ req->io_req.br_param = req;
+ }
- req->io_req.br_iovcnt++;
+ req->io_req.br_iov[iovidx].iov_base =
+ paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
+ gpaddr, size);
- pthread_mutex_unlock(&req->mtx);
- }
- } else {
- /* RAM buffer: read/write directly */
- void *p = sc->nvstore.ctx;
- void *gptr;
+ req->io_req.br_iov[iovidx].iov_len = size;
- if ((lba + size) > sc->nvstore.size) {
- WPRINTF(("%s write would overflow RAM", __func__));
- return (-1);
- }
+ req->prev_gpaddr = gpaddr;
+ req->prev_size = size;
+ req->io_req.br_resid += size;
- p = (void *)((uintptr_t)p + (uintptr_t)lba);
- gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
- if (do_write)
- memcpy(p, gptr, size);
- else
- memcpy(gptr, p, size);
+ req->io_req.br_iovcnt++;
}
+
return (0);
}
static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
struct nvme_submission_queue *sq, int sqid, uint16_t cid,
- uint32_t cdw0, uint16_t status, int ignore_busy)
+ uint32_t cdw0, uint16_t status)
{
struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
- struct nvme_completion *compl;
- int phase;
- DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
+ DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
__func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
- NVME_STATUS_GET_SC(status)));
-
- pthread_mutex_lock(&cq->mtx);
-
- assert(cq->qbase != NULL);
-
- compl = &cq->qbase[cq->tail];
-
- compl->cdw0 = cdw0;
- compl->sqid = sqid;
- compl->sqhd = atomic_load_acq_short(&sq->head);
- compl->cid = cid;
-
- // toggle phase
- phase = NVME_STATUS_GET_P(compl->status);
- compl->status = status;
- pci_nvme_toggle_phase(&compl->status, phase);
+ NVME_STATUS_GET_SC(status));
- cq->tail = (cq->tail + 1) % cq->size;
-
- pthread_mutex_unlock(&cq->mtx);
+ pci_nvme_cq_update(sc, cq,
+ 0, /* CDW0 */
+ cid,
+ sqid,
+ status);
if (cq->head != cq->tail) {
if (cq->intr_en & NVME_CQ_INTEN) {
pci_generate_msix(sc->nsc_pi, cq->intr_vec);
} else {
- DPRINTF(("%s: CQ%u interrupt disabled\n",
- __func__, sq->cqid));
+ DPRINTF("%s: CQ%u interrupt disabled",
+ __func__, sq->cqid);
}
}
}
@@ -1373,24 +1868,211 @@ pci_nvme_io_done(struct blockif_req *br, int err)
struct nvme_submission_queue *sq = req->nvme_sq;
uint16_t code, status = 0;
- DPRINTF(("%s error %d %s", __func__, err, strerror(err)));
+ DPRINTF("%s error %d %s", __func__, err, strerror(err));
/* TODO return correct error */
code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
pci_nvme_status_genc(&status, code);
- pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
+ pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
+ pci_nvme_stats_write_read_update(req->sc, req->opc,
+ req->bytes, status);
pci_nvme_release_ioreq(req->sc, req);
}
-static void
-pci_nvme_io_partial(struct blockif_req *br, int err)
+/*
+ * Implements the Flush command. The specification states:
+ * If a volatile write cache is not present, Flush commands complete
+ * successfully and have no effect
+ * in the description of the Volatile Write Cache (VWC) field of the Identify
+ * Controller data. Therefore, set status to Success if the command is
+ * not supported (i.e. RAM or as indicated by the blockif).
+ */
+static bool
+nvme_opc_flush(struct pci_nvme_softc *sc,
+ struct nvme_command *cmd,
+ struct pci_nvme_blockstore *nvstore,
+ struct pci_nvme_ioreq *req,
+ uint16_t *status)
{
- struct pci_nvme_ioreq *req = br->br_param;
+ bool pending = false;
+
+ if (nvstore->type == NVME_STOR_RAM) {
+ pci_nvme_status_genc(status, NVME_SC_SUCCESS);
+ } else {
+ int err;
+
+ req->io_req.br_callback = pci_nvme_io_done;
+
+ err = blockif_flush(nvstore->ctx, &req->io_req);
+ switch (err) {
+ case 0:
+ pending = true;
+ break;
+ case EOPNOTSUPP:
+ pci_nvme_status_genc(status, NVME_SC_SUCCESS);
+ break;
+ default:
+ pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
+ }
+ }
- DPRINTF(("%s error %d %s", __func__, err, strerror(err)));
+ return (pending);
+}
+
+static uint16_t
+nvme_write_read_ram(struct pci_nvme_softc *sc,
+ struct pci_nvme_blockstore *nvstore,
+ uint64_t prp1, uint64_t prp2,
+ size_t offset, uint64_t bytes,
+ bool is_write)
+{
+ uint8_t *buf = nvstore->ctx;
+ enum nvme_copy_dir dir;
+ uint16_t status = 0;
+
+ if (is_write)
+ dir = NVME_COPY_TO_PRP;
+ else
+ dir = NVME_COPY_FROM_PRP;
- pthread_cond_signal(&req->cv);
+ if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
+ buf + offset, bytes, dir))
+ pci_nvme_status_genc(&status,
+ NVME_SC_DATA_TRANSFER_ERROR);
+ else
+ pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
+
+ return (status);
+}
+
+static uint16_t
+nvme_write_read_blockif(struct pci_nvme_softc *sc,
+ struct pci_nvme_blockstore *nvstore,
+ struct pci_nvme_ioreq *req,
+ uint64_t prp1, uint64_t prp2,
+ size_t offset, uint64_t bytes,
+ bool is_write)
+{
+ uint64_t size;
+ int err;
+ uint16_t status = NVME_NO_STATUS;
+
+ size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
+ if (pci_nvme_append_iov_req(sc, req, prp1,
+ size, is_write, offset)) {
+ pci_nvme_status_genc(&status,
+ NVME_SC_DATA_TRANSFER_ERROR);
+ goto out;
+ }
+
+ offset += size;
+ bytes -= size;
+
+ if (bytes == 0) {
+ ;
+ } else if (bytes <= PAGE_SIZE) {
+ size = bytes;
+ if (pci_nvme_append_iov_req(sc, req, prp2,
+ size, is_write, offset)) {
+ pci_nvme_status_genc(&status,
+ NVME_SC_DATA_TRANSFER_ERROR);
+ goto out;
+ }
+ } else {
+ void *vmctx = sc->nsc_pi->pi_vmctx;
+ uint64_t *prp_list = &prp2;
+ uint64_t *last = prp_list;
+
+		/* PRP2 is a pointer to a physical region page list */
+ while (bytes) {
+ /* Last entry in list points to the next list */
+ if (prp_list == last) {
+ uint64_t prp = *prp_list;
+
+ prp_list = paddr_guest2host(vmctx, prp,
+ PAGE_SIZE - (prp % PAGE_SIZE));
+ last = prp_list + (NVME_PRP2_ITEMS - 1);
+ }
+
+ size = MIN(bytes, PAGE_SIZE);
+
+ if (pci_nvme_append_iov_req(sc, req, *prp_list,
+ size, is_write, offset)) {
+ pci_nvme_status_genc(&status,
+ NVME_SC_DATA_TRANSFER_ERROR);
+ goto out;
+ }
+
+ offset += size;
+ bytes -= size;
+
+ prp_list++;
+ }
+ }
+ req->io_req.br_callback = pci_nvme_io_done;
+ if (is_write)
+ err = blockif_write(nvstore->ctx, &req->io_req);
+ else
+ err = blockif_read(nvstore->ctx, &req->io_req);
+
+ if (err)
+ pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
+out:
+ return (status);
+}
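+
+/*
+ * PRP note for the routine above: PRP1 maps from the starting offset up
+ * to the first page boundary, and PRP2 either maps the one remaining
+ * page directly or points to a guest page holding page-sized PRP list
+ * entries, where the final slot of a full list chains to the next list
+ * page (the prp_list == last test).
+ */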
+
+static bool
+nvme_opc_write_read(struct pci_nvme_softc *sc,
+ struct nvme_command *cmd,
+ struct pci_nvme_blockstore *nvstore,
+ struct pci_nvme_ioreq *req,
+ uint16_t *status)
+{
+ uint64_t lba, nblocks, bytes = 0;
+ size_t offset;
+ bool is_write = cmd->opc == NVME_OPC_WRITE;
+ bool pending = false;
+
+ lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
+ nblocks = (cmd->cdw12 & 0xFFFF) + 1;
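+	/* NLB in CDW12 is zero-based, hence the additional block */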
+ if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
+ WPRINTF("%s command would exceed LBA range", __func__);
+ pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
+ goto out;
+ }
+
+ bytes = nblocks << nvstore->sectsz_bits;
+ if (bytes > NVME_MAX_DATA_SIZE) {
+ WPRINTF("%s command would exceed MDTS", __func__);
+ pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
+ goto out;
+ }
+
+ offset = lba << nvstore->sectsz_bits;
+
+ req->bytes = bytes;
+ req->io_req.br_offset = lba;
+
+ /* PRP bits 1:0 must be zero */
+ cmd->prp1 &= ~0x3UL;
+ cmd->prp2 &= ~0x3UL;
+
+ if (nvstore->type == NVME_STOR_RAM) {
+ *status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
+ cmd->prp2, offset, bytes, is_write);
+ } else {
+ *status = nvme_write_read_blockif(sc, nvstore, req,
+ cmd->prp1, cmd->prp2, offset, bytes, is_write);
+
+ if (*status == NVME_NO_STATUS)
+ pending = true;
+ }
+out:
+ if (!pending)
+ pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
+
+ return (pending);
}
static void
@@ -1427,29 +2109,54 @@ pci_nvme_dealloc_sm(struct blockif_req *br, int err)
if (done) {
pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
- req->cid, 0, status, 0);
+ req->cid, 0, status);
pci_nvme_release_ioreq(sc, req);
}
}
-static int
+static bool
nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
struct nvme_command *cmd,
struct pci_nvme_blockstore *nvstore,
struct pci_nvme_ioreq *req,
uint16_t *status)
{
- int err = -1;
+ struct nvme_dsm_range *range = NULL;
+ uint32_t nr, r, non_zero, dr;
+ int err;
+ bool pending = false;
if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
goto out;
}
+ nr = cmd->cdw10 & 0xff;
+
+ /* copy locally because a range entry could straddle PRPs */
+ range = calloc(1, NVME_MAX_DSM_TRIM);
+ if (range == NULL) {
+ pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
+ goto out;
+ }
+ nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
+ (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
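+	/*
+	 * Each nvme_dsm_range entry is 16 bytes (context attributes, LBA
+	 * count, starting LBA), so a 4 KiB NVME_MAX_DSM_TRIM buffer holds
+	 * the largest table the zero-based 8-bit NR field can describe.
+	 */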
+
+ /* Check for invalid ranges and the number of non-zero lengths */
+ non_zero = 0;
+ for (r = 0; r <= nr; r++) {
+ if (pci_nvme_out_of_range(nvstore,
+ range[r].starting_lba, range[r].length)) {
+ pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
+ goto out;
+ }
+ if (range[r].length != 0)
+ non_zero++;
+ }
+
if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
- struct nvme_dsm_range *range;
- uint32_t nr, r;
- int sectsz = sc->nvstore.sectsz;
+ size_t offset, bytes;
+ int sectsz_bits = sc->nvstore.sectsz_bits;
/*
* DSM calls are advisory only, and compliant controllers
@@ -1460,23 +2167,20 @@ nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
goto out;
}
- if (req == NULL) {
- pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
+ /* If all ranges have a zero length, return Success */
+ if (non_zero == 0) {
+ pci_nvme_status_genc(status, NVME_SC_SUCCESS);
goto out;
}
- /* copy locally because a range entry could straddle PRPs */
- range = calloc(1, NVME_MAX_DSM_TRIM);
- if (range == NULL) {
+ if (req == NULL) {
pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
goto out;
}
- nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
- (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
- req->opc = cmd->opc;
- req->cid = cmd->cid;
- req->nsid = cmd->nsid;
+ offset = range[0].starting_lba << sectsz_bits;
+ bytes = range[0].length << sectsz_bits;
+
/*
* If the request is for more than a single range, store
* the ranges in the br_iov. Optimize for the common case
@@ -1484,20 +2188,29 @@ nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
*
* Note that NVMe Number of Ranges is a zero based value
*/
- nr = cmd->cdw10 & 0xff;
-
req->io_req.br_iovcnt = 0;
- req->io_req.br_offset = range[0].starting_lba * sectsz;
- req->io_req.br_resid = range[0].length * sectsz;
+ req->io_req.br_offset = offset;
+ req->io_req.br_resid = bytes;
if (nr == 0) {
req->io_req.br_callback = pci_nvme_io_done;
} else {
struct iovec *iov = req->io_req.br_iov;
- for (r = 0; r <= nr; r++) {
- iov[r].iov_base = (void *)(range[r].starting_lba * sectsz);
- iov[r].iov_len = range[r].length * sectsz;
+ for (r = 0, dr = 0; r <= nr; r++) {
+ offset = range[r].starting_lba << sectsz_bits;
+ bytes = range[r].length << sectsz_bits;
+ if (bytes == 0)
+ continue;
+
+ if ((nvstore->size - offset) < bytes) {
+ pci_nvme_status_genc(status,
+ NVME_SC_LBA_OUT_OF_RANGE);
+ goto out;
+ }
+ iov[dr].iov_base = (void *)offset;
+ iov[dr].iov_len = bytes;
+ dr++;
}
req->io_req.br_callback = pci_nvme_dealloc_sm;
@@ -1506,17 +2219,18 @@ nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
* prev_size to track the number of entries
*/
req->prev_gpaddr = 0;
- req->prev_size = r;
+ req->prev_size = dr;
}
err = blockif_delete(nvstore->ctx, &req->io_req);
if (err)
pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
-
- free(range);
+ else
+ pending = true;
}
out:
- return (err);
+ free(range);
+ return (pending);
}
static void
@@ -1525,221 +2239,105 @@ pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
struct nvme_submission_queue *sq;
uint16_t status = 0;
uint16_t sqhead;
- int err;
/* handle all submissions up to sq->tail index */
sq = &sc->submit_queues[idx];
- if (atomic_testandset_int(&sq->busy, 1)) {
- DPRINTF(("%s sqid %u busy", __func__, idx));
- return;
- }
+ pthread_mutex_lock(&sq->mtx);
- sqhead = atomic_load_acq_short(&sq->head);
-
- DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p",
- idx, sqhead, sq->tail, sq->qbase));
+ sqhead = sq->head;
+ DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
+ idx, sqhead, sq->tail, sq->qbase);
while (sqhead != atomic_load_acq_short(&sq->tail)) {
struct nvme_command *cmd;
- struct pci_nvme_ioreq *req = NULL;
- uint64_t lba;
- uint64_t nblocks, bytes, size, cpsz;
+ struct pci_nvme_ioreq *req;
+ uint32_t nsid;
+ bool pending;
- /* TODO: support scatter gather list handling */
+ pending = false;
+ req = NULL;
+ status = 0;
cmd = &sq->qbase[sqhead];
sqhead = (sqhead + 1) % sq->size;
- lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
-
- if (cmd->opc == NVME_OPC_FLUSH) {
- pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
- pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
- status, 1);
-
- continue;
- } else if (cmd->opc == 0x08) {
- /* TODO: write zeroes */
- WPRINTF(("%s write zeroes lba 0x%lx blocks %u",
- __func__, lba, cmd->cdw12 & 0xFFFF));
- pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
- pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
- status, 1);
-
- continue;
- }
-
- if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
- req = pci_nvme_get_ioreq(sc);
- req->nvme_sq = sq;
- req->sqid = idx;
- }
-
- if (cmd->opc == NVME_OPC_DATASET_MANAGEMENT) {
- if (nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, req,
- &status)) {
- pci_nvme_set_completion(sc, sq, idx, cmd->cid,
- 0, status, 1);
- if (req)
- pci_nvme_release_ioreq(sc, req);
- }
- continue;
- }
-
- nblocks = (cmd->cdw12 & 0xFFFF) + 1;
-
- bytes = nblocks * sc->nvstore.sectsz;
-
- /*
- * If data starts mid-page and flows into the next page, then
- * increase page count
- */
-
- DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
- "(%lu-bytes)",
- sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
- cmd->opc == NVME_OPC_WRITE ?
- "WRITE" : "READ",
- lba, nblocks, bytes));
-
- cmd->prp1 &= ~(0x03UL);
- cmd->prp2 &= ~(0x03UL);
-
- DPRINTF((" prp1 0x%lx prp2 0x%lx", cmd->prp1, cmd->prp2));
-
- size = bytes;
- lba *= sc->nvstore.sectsz;
-
- cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE);
-
- if (cpsz > bytes)
- cpsz = bytes;
-
- if (req != NULL) {
- req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) |
- cmd->cdw10;
- req->opc = cmd->opc;
- req->cid = cmd->cid;
- req->nsid = cmd->nsid;
- }
-
- err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz,
- cmd->opc == NVME_OPC_WRITE, lba);
- lba += cpsz;
- size -= cpsz;
-
- if (size == 0)
- goto iodone;
-
- if (size <= PAGE_SIZE) {
- /* prp2 is second (and final) page in transfer */
-
- err = pci_nvme_append_iov_req(sc, req, cmd->prp2,
- size,
- cmd->opc == NVME_OPC_WRITE,
- lba);
- } else {
- uint64_t *prp_list;
- int i;
-
- /* prp2 is pointer to a physical region page list */
- prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx,
- cmd->prp2, PAGE_SIZE);
-
- i = 0;
- while (size != 0) {
- cpsz = MIN(size, PAGE_SIZE);
-
- /*
- * Move to linked physical region page list
- * in last item.
- */
- if (i == (NVME_PRP2_ITEMS-1) &&
- size > PAGE_SIZE) {
- assert((prp_list[i] & (PAGE_SIZE-1)) == 0);
- prp_list = paddr_guest2host(
- sc->nsc_pi->pi_vmctx,
- prp_list[i], PAGE_SIZE);
- i = 0;
- }
- if (prp_list[i] == 0) {
- WPRINTF(("PRP2[%d] = 0 !!!", i));
- err = 1;
- break;
- }
-
- err = pci_nvme_append_iov_req(sc, req,
- prp_list[i], cpsz,
- cmd->opc == NVME_OPC_WRITE, lba);
- if (err)
- break;
-
- lba += cpsz;
- size -= cpsz;
- i++;
- }
- }
-
-iodone:
- if (sc->nvstore.type == NVME_STOR_RAM) {
- uint16_t code, status = 0;
-
- code = err ? NVME_SC_LBA_OUT_OF_RANGE :
- NVME_SC_SUCCESS;
- pci_nvme_status_genc(&status, code);
-
- pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
- status, 1);
+ nsid = le32toh(cmd->nsid);
+ if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
+ pci_nvme_status_genc(&status,
+ NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
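+			/* Set Do Not Retry: this NSID can never become valid */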
+ status |=
+ NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
+ goto complete;
+ }
- continue;
+ req = pci_nvme_get_ioreq(sc);
+ if (req == NULL) {
+ pci_nvme_status_genc(&status,
+ NVME_SC_INTERNAL_DEVICE_ERROR);
+ WPRINTF("%s: unable to allocate IO req", __func__);
+ goto complete;
}
+ req->nvme_sq = sq;
+ req->sqid = idx;
+ req->opc = cmd->opc;
+ req->cid = cmd->cid;
+ req->nsid = cmd->nsid;
-
- if (err)
- goto do_error;
-
- req->io_req.br_callback = pci_nvme_io_done;
-
- err = 0;
switch (cmd->opc) {
+ case NVME_OPC_FLUSH:
+ pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
+ req, &status);
+ break;
+ case NVME_OPC_WRITE:
case NVME_OPC_READ:
- err = blockif_read(sc->nvstore.ctx, &req->io_req);
+ pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
+ req, &status);
break;
- case NVME_OPC_WRITE:
- err = blockif_write(sc->nvstore.ctx, &req->io_req);
+ case NVME_OPC_WRITE_ZEROES:
+ /* TODO: write zeroes
+ WPRINTF("%s write zeroes lba 0x%lx blocks %u",
+ __func__, lba, cmd->cdw12 & 0xFFFF); */
+ pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
break;
- default:
- WPRINTF(("%s unhandled io command 0x%x",
- __func__, cmd->opc));
- err = 1;
+ case NVME_OPC_DATASET_MANAGEMENT:
+ pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
+ req, &status);
+ break;
+ default:
+ WPRINTF("%s unhandled io command 0x%x",
+ __func__, cmd->opc);
+ pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
}
-
-do_error:
- if (err) {
- uint16_t status = 0;
-
- pci_nvme_status_genc(&status,
- NVME_SC_DATA_TRANSFER_ERROR);
-
+complete:
+ if (!pending) {
pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
- status, 1);
- pci_nvme_release_ioreq(sc, req);
+ status);
+ if (req != NULL)
+ pci_nvme_release_ioreq(sc, req);
}
}
- atomic_store_short(&sq->head, sqhead);
- atomic_store_int(&sq->busy, 0);
+ sq->head = sqhead;
+
+ pthread_mutex_unlock(&sq->mtx);
}
static void
pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
uint64_t idx, int is_sq, uint64_t value)
{
- DPRINTF(("nvme doorbell %lu, %s, val 0x%lx",
- idx, is_sq ? "SQ" : "CQ", value & 0xFFFF));
+ DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
+ idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
if (is_sq) {
+ if (idx > sc->num_squeues) {
+ WPRINTF("%s queue index %lu overflow from "
+ "guest (max %u)",
+ __func__, idx, sc->num_squeues);
+ return;
+ }
+
atomic_store_short(&sc->submit_queues[idx].tail,
(uint16_t)value);
@@ -1748,22 +2346,23 @@ pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
} else {
/* submission queue; handle new entries in SQ */
if (idx > sc->num_squeues) {
- WPRINTF(("%s SQ index %lu overflow from "
+ WPRINTF("%s SQ index %lu overflow from "
"guest (max %u)",
- __func__, idx, sc->num_squeues));
+ __func__, idx, sc->num_squeues);
return;
}
pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
}
} else {
if (idx > sc->num_cqueues) {
- WPRINTF(("%s queue index %lu overflow from "
+ WPRINTF("%s queue index %lu overflow from "
"guest (max %u)",
- __func__, idx, sc->num_cqueues));
+ __func__, idx, sc->num_cqueues);
return;
}
- sc->compl_queues[idx].head = (uint16_t)value;
+ atomic_store_short(&sc->compl_queues[idx].head,
+ (uint16_t)value);
}
}
@@ -1774,46 +2373,46 @@ pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
switch (offset) {
case NVME_CR_CAP_LOW:
- DPRINTF(("%s %s NVME_CR_CAP_LOW", func, s));
+ DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
break;
case NVME_CR_CAP_HI:
- DPRINTF(("%s %s NVME_CR_CAP_HI", func, s));
+ DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
break;
case NVME_CR_VS:
- DPRINTF(("%s %s NVME_CR_VS", func, s));
+ DPRINTF("%s %s NVME_CR_VS", func, s);
break;
case NVME_CR_INTMS:
- DPRINTF(("%s %s NVME_CR_INTMS", func, s));
+ DPRINTF("%s %s NVME_CR_INTMS", func, s);
break;
case NVME_CR_INTMC:
- DPRINTF(("%s %s NVME_CR_INTMC", func, s));
+ DPRINTF("%s %s NVME_CR_INTMC", func, s);
break;
case NVME_CR_CC:
- DPRINTF(("%s %s NVME_CR_CC", func, s));
+ DPRINTF("%s %s NVME_CR_CC", func, s);
break;
case NVME_CR_CSTS:
- DPRINTF(("%s %s NVME_CR_CSTS", func, s));
+ DPRINTF("%s %s NVME_CR_CSTS", func, s);
break;
case NVME_CR_NSSR:
- DPRINTF(("%s %s NVME_CR_NSSR", func, s));
+ DPRINTF("%s %s NVME_CR_NSSR", func, s);
break;
case NVME_CR_AQA:
- DPRINTF(("%s %s NVME_CR_AQA", func, s));
+ DPRINTF("%s %s NVME_CR_AQA", func, s);
break;
case NVME_CR_ASQ_LOW:
- DPRINTF(("%s %s NVME_CR_ASQ_LOW", func, s));
+ DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
break;
case NVME_CR_ASQ_HI:
- DPRINTF(("%s %s NVME_CR_ASQ_HI", func, s));
+ DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
break;
case NVME_CR_ACQ_LOW:
- DPRINTF(("%s %s NVME_CR_ACQ_LOW", func, s));
+ DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
break;
case NVME_CR_ACQ_HI:
- DPRINTF(("%s %s NVME_CR_ACQ_HI", func, s));
+ DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
break;
default:
- DPRINTF(("unknown nvme bar-0 offset 0x%lx", offset));
+ DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
}
}
@@ -1830,9 +2429,9 @@ pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
int is_sq = (belloffset % 8) < 4;
if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
- WPRINTF(("guest attempted an overflow write offset "
+ WPRINTF("guest attempted an overflow write offset "
"0x%lx, val 0x%lx in %s",
- offset, value, __func__));
+ offset, value, __func__);
return;
}
@@ -1840,13 +2439,13 @@ pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
return;
}
- DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx",
- offset, size, value));
+ DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
+ offset, size, value);
if (size != 4) {
- WPRINTF(("guest wrote invalid size %d (offset 0x%lx, "
+ WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
"val 0x%lx) to bar0 in %s",
- size, offset, value, __func__));
+ size, offset, value, __func__);
/* TODO: shutdown device */
return;
}
@@ -1872,12 +2471,12 @@ pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
case NVME_CR_CC:
ccreg = (uint32_t)value;
- DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
+ DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
"iocqes %u",
__func__,
NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
- NVME_CC_GET_IOCQES(ccreg)));
+ NVME_CC_GET_IOCQES(ccreg));
if (NVME_CC_GET_SHN(ccreg)) {
/* perform shutdown - flush out data to backend */
@@ -1931,8 +2530,8 @@ pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
(value << 32);
break;
default:
- DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d",
- __func__, offset, value, size));
+ DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
+ __func__, offset, value, size);
}
pthread_mutex_unlock(&sc->mtx);
}
@@ -1945,8 +2544,8 @@ pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
if (baridx == pci_msix_table_bar(pi) ||
baridx == pci_msix_pba_bar(pi)) {
- DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
- " value 0x%lx", baridx, offset, size, value));
+ DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
+ " value 0x%lx", baridx, offset, size, value);
pci_emul_msix_twrite(pi, offset, size, value);
return;
@@ -1958,8 +2557,8 @@ pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
break;
default:
- DPRINTF(("%s unknown baridx %d, val 0x%lx",
- __func__, baridx, value));
+ DPRINTF("%s unknown baridx %d, val 0x%lx",
+ __func__, baridx, value);
}
}
@@ -1977,7 +2576,7 @@ static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
pthread_mutex_unlock(&sc->mtx);
} else {
value = 0;
- WPRINTF(("pci_nvme: read invalid offset %ld", offset));
+ WPRINTF("pci_nvme: read invalid offset %ld", offset);
}
switch (size) {
@@ -1992,8 +2591,8 @@ static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
break;
}
- DPRINTF((" nvme-read offset 0x%lx, size %d -> value 0x%x",
- offset, size, (uint32_t)value));
+ DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x",
+ offset, size, (uint32_t)value);
return (value);
}
@@ -2008,8 +2607,8 @@ pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
if (baridx == pci_msix_table_bar(pi) ||
baridx == pci_msix_pba_bar(pi)) {
- DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
- baridx, offset, size));
+ DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
+ baridx, offset, size);
return pci_emul_msix_tread(pi, offset, size);
}
@@ -2019,7 +2618,7 @@ pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
return pci_nvme_read_bar_0(sc, offset, size);
default:
- DPRINTF(("unknown bar %d, 0x%lx", baridx, offset));
+ DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
}
return (0);
@@ -2162,10 +2761,7 @@ pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
for (int i = 0; i < sc->ioslots; i++) {
STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
- pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
- pthread_cond_init(&sc->ioreqs[i].cv, NULL);
}
- sc->intr_coales_aggr_thresh = 1;
pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
@@ -2185,30 +2781,30 @@ pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
2 * sizeof(uint32_t) * (sc->max_queues + 1);
pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
- DPRINTF(("nvme membar size: %u", pci_membar_sz));
+ DPRINTF("nvme membar size: %u", pci_membar_sz);
error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
if (error) {
- WPRINTF(("%s pci alloc mem bar failed", __func__));
+ WPRINTF("%s pci alloc mem bar failed", __func__);
goto done;
}
error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
if (error) {
- WPRINTF(("%s pci add msixcap failed", __func__));
+ WPRINTF("%s pci add msixcap failed", __func__);
goto done;
}
error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
if (error) {
- WPRINTF(("%s pci add Express capability failed", __func__));
+ WPRINTF("%s pci add Express capability failed", __func__);
goto done;
}
pthread_mutex_init(&sc->mtx, NULL);
sem_init(&sc->iosemlock, 0, sc->ioslots);
- pci_nvme_reset(sc);
+ pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
/*
* Controller data depends on Namespace data so initialize Namespace
* data first.
@@ -2216,6 +2812,11 @@ pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
pci_nvme_init_ctrldata(sc);
pci_nvme_init_logpages(sc);
+ pci_nvme_init_features(sc);
+
+ pci_nvme_aer_init(sc);
+
+ pci_nvme_reset(sc);
pci_lintr_request(pi);
diff --git a/usr/src/cmd/bhyve/pci_passthru.c b/usr/src/cmd/bhyve/pci_passthru.c
index 664d07b731..c777c56cb1 100644
--- a/usr/src/cmd/bhyve/pci_passthru.c
+++ b/usr/src/cmd/bhyve/pci_passthru.c
@@ -622,7 +622,7 @@ cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
sc->psc_bar[i].addr = base;
/* Allocate the BAR in the guest I/O or MMIO space */
- error = pci_emul_alloc_pbar(pi, i, base, bartype, size);
+ error = pci_emul_alloc_bar(pi, i, bartype, size);
if (error)
return (-1);
@@ -849,6 +849,10 @@ passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
if (error)
err(1, "vm_setup_pptdev_msix");
}
+ } else {
+ error = vm_disable_pptdev_msix(ctx, sc->pptfd);
+ if (error)
+ err(1, "vm_disable_pptdev_msix");
}
return (0);
}
diff --git a/usr/src/cmd/bhyve/pci_virtio_block.c b/usr/src/cmd/bhyve/pci_virtio_block.c
index a34bd864be..27d743a770 100644
--- a/usr/src/cmd/bhyve/pci_virtio_block.c
+++ b/usr/src/cmd/bhyve/pci_virtio_block.c
@@ -510,7 +510,7 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
sc->vbsc_cfg.vbc_writeback = 0;
sc->vbsc_cfg.max_discard_sectors = VTBLK_MAX_DISCARD_SECT;
sc->vbsc_cfg.max_discard_seg = VTBLK_MAX_DISCARD_SEG;
- sc->vbsc_cfg.discard_sector_alignment = sectsz / VTBLK_BSIZE;
+ sc->vbsc_cfg.discard_sector_alignment = MAX(sectsz, sts) / VTBLK_BSIZE;
/*
* Should we move some of this into virtio.c? Could
diff --git a/usr/src/cmd/bhyve/pci_virtio_net.c b/usr/src/cmd/bhyve/pci_virtio_net.c
index 3a1cc46a06..ded9ca90ea 100644
--- a/usr/src/cmd/bhyve/pci_virtio_net.c
+++ b/usr/src/cmd/bhyve/pci_virtio_net.c
@@ -179,6 +179,7 @@ struct pci_vtnet_softc {
struct nm_desc *vsc_nmd;
int vsc_rx_ready;
+ bool features_negotiated; /* protected by rx_mtx */
int resetting; /* protected by tx_mtx */
uint64_t vsc_features; /* negotiated features */
@@ -228,6 +229,8 @@ pci_vtnet_reset(void *vsc)
/* Acquire the RX lock to block RX processing. */
pthread_mutex_lock(&sc->rx_mtx);
+ sc->features_negotiated = false;
+
/* Set sc->resetting and give a chance to the TX thread to stop. */
pthread_mutex_lock(&sc->tx_mtx);
sc->resetting = 1;
@@ -348,6 +351,11 @@ pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
assert(sc->vsc_dlpifd != -1);
#endif
+ /* Features must be negotiated */
+ if (!sc->features_negotiated) {
+ return;
+ }
+
/*
* But, will be called when the rx ring hasn't yet
* been set up.
@@ -558,6 +566,11 @@ pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc)
*/
assert(sc->vsc_nmd != NULL);
+ /* Features must be negotiated */
+ if (!sc->features_negotiated) {
+ return;
+ }
+
/*
* But, will be called when the rx ring hasn't yet
* been set up.
@@ -678,11 +691,14 @@ pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq)
/*
* A qnotify means that the rx process can now begin.
+ * Enable RX only if features are negotiated.
*/
- if (sc->vsc_rx_ready == 0) {
+ pthread_mutex_lock(&sc->rx_mtx);
+ if (sc->vsc_rx_ready == 0 && sc->features_negotiated) {
sc->vsc_rx_ready = 1;
vq_kick_disable(vq);
}
+ pthread_mutex_unlock(&sc->rx_mtx);
}
static void
@@ -1132,6 +1148,10 @@ pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features)
/* non-merge rx header is 2 bytes shorter */
sc->rx_vhdrlen -= 2;
}
+
+ pthread_mutex_lock(&sc->rx_mtx);
+ sc->features_negotiated = true;
+ pthread_mutex_unlock(&sc->rx_mtx);
}
struct pci_devemu pci_de_vnet = {
diff --git a/usr/src/cmd/bhyve/pci_xhci.c b/usr/src/cmd/bhyve/pci_xhci.c
index b92be4dec3..587e80a91c 100644
--- a/usr/src/cmd/bhyve/pci_xhci.c
+++ b/usr/src/cmd/bhyve/pci_xhci.c
@@ -1849,6 +1849,9 @@ retry:
DPRINTF(("pci_xhci[%d]: xfer->ndata %u", __LINE__, xfer->ndata));
+ if (xfer->ndata <= 0)
+ goto errout;
+
if (epid == 1) {
err = USB_ERR_NOT_STARTED;
if (dev->dev_ue->ue_request != NULL)
@@ -1863,6 +1866,7 @@ retry:
err = USB_TO_XHCI_ERR(err);
if ((err == XHCI_TRB_ERROR_SUCCESS) ||
+ (err == XHCI_TRB_ERROR_STALL) ||
(err == XHCI_TRB_ERROR_SHORT_PKT)) {
err = pci_xhci_xfer_complete(sc, xfer, slot, epid, &do_intr);
if (err != XHCI_TRB_ERROR_SUCCESS)
@@ -2813,7 +2817,8 @@ pci_xhci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
sc->hcsparams2 = XHCI_SET_HCSP2_ERSTMAX(XHCI_ERST_MAX) |
XHCI_SET_HCSP2_IST(0x04);
sc->hcsparams3 = 0; /* no latency */
- sc->hccparams1 = XHCI_SET_HCCP1_NSS(1) | /* no 2nd-streams */
+ sc->hccparams1 = XHCI_SET_HCCP1_AC64(1) | /* 64-bit addrs */
+ XHCI_SET_HCCP1_NSS(1) | /* no 2nd-streams */
XHCI_SET_HCCP1_SPC(1) | /* short packet */
XHCI_SET_HCCP1_MAXPSA(XHCI_STREAMS_MAX);
sc->hccparams2 = XHCI_SET_HCCP2_LEC(1) |
diff --git a/usr/src/cmd/bhyve/pctestdev.c b/usr/src/cmd/bhyve/pctestdev.c
new file mode 100644
index 0000000000..be445e5c75
--- /dev/null
+++ b/usr/src/cmd/bhyve/pctestdev.c
@@ -0,0 +1,270 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Adam Fenn <adam@fenn.io>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Emulation of selected legacy test/debug interfaces expected by KVM-unit-tests
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <machine/vmm.h>
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <vmmapi.h>
+
+#include "debug.h"
+#include "inout.h"
+#include "mem.h"
+#include "pctestdev.h"
+
+#define DEBUGEXIT_BASE 0xf4
+#define DEBUGEXIT_LEN 4
+#define DEBUGEXIT_NAME "isa-debug-exit"
+
+#define IOMEM_BASE 0xff000000
+#define IOMEM_LEN 0x10000
+#define IOMEM_NAME "pc-testdev-iomem"
+
+#define IOPORT_BASE 0xe0
+#define IOPORT_LEN 4
+#define IOPORT_NAME "pc-testdev-ioport"
+
+#define IRQ_BASE 0x2000
+#define IRQ_IOAPIC_PINCOUNT_MIN 24
+#define IRQ_IOAPIC_PINCOUNT_MAX 32
+#define IRQ_NAME "pc-testdev-irq-line"
+
+#define PCTESTDEV_NAME "pc-testdev"
+
+static bool pctestdev_inited;
+static uint8_t pctestdev_iomem_buf[IOMEM_LEN];
+static uint32_t pctestdev_ioport_data;
+
+static int pctestdev_debugexit_io(struct vmctx *ctx, int vcpu, int in,
+ int port, int bytes, uint32_t *eax, void *arg);
+static int pctestdev_iomem_io(struct vmctx *ctx, int vcpu, int dir,
+ uint64_t addr, int size, uint64_t *val, void *arg1,
+ long arg2);
+static int pctestdev_ioport_io(struct vmctx *ctx, int vcpu, int in,
+ int port, int bytes, uint32_t *eax, void *arg);
+static int pctestdev_irq_io(struct vmctx *ctx, int vcpu, int in,
+ int port, int bytes, uint32_t *eax, void *arg);
+
+const char *
+pctestdev_getname(void)
+{
+ return (PCTESTDEV_NAME);
+}
+
+int
+pctestdev_parse(const char *opts)
+{
+ if (opts != NULL && *opts != '\0')
+ return (-1);
+
+ return (0);
+}
+
+int
+pctestdev_init(struct vmctx *ctx)
+{
+ struct mem_range iomem;
+ struct inout_port debugexit, ioport, irq;
+ int err, pincount;
+
+ if (pctestdev_inited) {
+ EPRINTLN("Only one pc-testdev device is allowed.");
+
+ return (-1);
+ }
+
+ err = vm_ioapic_pincount(ctx, &pincount);
+ if (err != 0) {
+ EPRINTLN("pc-testdev: Failed to obtain IOAPIC pin count.");
+
+ return (-1);
+ }
+ if (pincount < IRQ_IOAPIC_PINCOUNT_MIN ||
+ pincount > IRQ_IOAPIC_PINCOUNT_MAX) {
+ EPRINTLN("pc-testdev: Unsupported IOAPIC pin count: %d.",
+ pincount);
+
+ return (-1);
+ }
+
+ debugexit.name = DEBUGEXIT_NAME;
+ debugexit.port = DEBUGEXIT_BASE;
+ debugexit.size = DEBUGEXIT_LEN;
+ debugexit.flags = IOPORT_F_INOUT;
+ debugexit.handler = pctestdev_debugexit_io;
+ debugexit.arg = NULL;
+
+ iomem.name = IOMEM_NAME;
+ iomem.flags = MEM_F_RW | MEM_F_IMMUTABLE;
+ iomem.handler = pctestdev_iomem_io;
+ iomem.arg1 = NULL;
+ iomem.arg2 = 0;
+ iomem.base = IOMEM_BASE;
+ iomem.size = IOMEM_LEN;
+
+ ioport.name = IOPORT_NAME;
+ ioport.port = IOPORT_BASE;
+ ioport.size = IOPORT_LEN;
+ ioport.flags = IOPORT_F_INOUT;
+ ioport.handler = pctestdev_ioport_io;
+ ioport.arg = NULL;
+
+ irq.name = IRQ_NAME;
+ irq.port = IRQ_BASE;
+ irq.size = pincount;
+ irq.flags = IOPORT_F_INOUT;
+ irq.handler = pctestdev_irq_io;
+ irq.arg = NULL;
+
+ err = register_inout(&debugexit);
+ if (err != 0)
+ goto fail;
+
+ err = register_inout(&ioport);
+ if (err != 0)
+ goto fail_after_debugexit_reg;
+
+ err = register_inout(&irq);
+ if (err != 0)
+ goto fail_after_ioport_reg;
+
+ err = register_mem(&iomem);
+ if (err != 0)
+ goto fail_after_irq_reg;
+
+ pctestdev_inited = true;
+
+ return (0);
+
+fail_after_irq_reg:
+ (void)unregister_inout(&irq);
+
+fail_after_ioport_reg:
+ (void)unregister_inout(&ioport);
+
+fail_after_debugexit_reg:
+ (void)unregister_inout(&debugexit);
+
+fail:
+ return (err);
+}
+
+static int
+pctestdev_debugexit_io(struct vmctx *ctx, int vcpu, int in, int port,
+ int bytes, uint32_t *eax, void *arg)
+{
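+	/*
+	 * Matches QEMU's isa-debug-exit device: a guest write of N ends
+	 * the bhyve process with exit status (N << 1) | 1, so a write of
+	 * 0 yields exit code 1 for the test harness.
+	 */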
+ if (in)
+ *eax = 0;
+ else
+ exit((*eax << 1) | 1);
+
+ return (0);
+}
+
+static int
+pctestdev_iomem_io(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
+ int size, uint64_t *val, void *arg1, long arg2)
+{
+ uint64_t offset;
+
+ if (addr + size > IOMEM_BASE + IOMEM_LEN)
+ return (-1);
+
+ offset = addr - IOMEM_BASE;
+ if (dir == MEM_F_READ) {
+ (void)memcpy(val, pctestdev_iomem_buf + offset, size);
+ } else {
+ assert(dir == MEM_F_WRITE);
+ (void)memcpy(pctestdev_iomem_buf + offset, val, size);
+ }
+
+ return (0);
+}
+
+static int
+pctestdev_ioport_io(struct vmctx *ctx, int vcpu, int in, int port,
+ int bytes, uint32_t *eax, void *arg)
+{
+ uint32_t mask;
+ int lsb;
+
+ if (port + bytes > IOPORT_BASE + IOPORT_LEN)
+ return (-1);
+
+ lsb = (port & 0x3) * 8;
+ mask = (-1UL >> (32 - (bytes * 8))) << lsb;
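+	/* e.g. a 2-byte access at port 0xe2: lsb == 16, mask == 0xffff0000 */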
+
+ if (in)
+ *eax = (pctestdev_ioport_data & mask) >> lsb;
+ else {
+ pctestdev_ioport_data &= ~mask;
+ pctestdev_ioport_data |= *eax << lsb;
+ }
+
+ return (0);
+}
+
+static int
+pctestdev_irq_io(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ int irq;
+
+ if (bytes != 1)
+ return (-1);
+
+ if (in) {
+ *eax = 0;
+ return (0);
+ } else {
+ irq = port - IRQ_BASE;
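+		/* Lines 0-15 drive ATPIC and IOAPIC pins; 16+ are IOAPIC only */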
+ if (irq < 16) {
+ if (*eax)
+ return (vm_isa_assert_irq(ctx, irq, irq));
+ else
+ return (vm_isa_deassert_irq(ctx, irq, irq));
+ } else {
+ if (*eax)
+ return (vm_ioapic_assert_irq(ctx, irq));
+ else
+ return (vm_ioapic_deassert_irq(ctx, irq));
+ }
+ }
+}
diff --git a/usr/src/cmd/bhyve/pctestdev.h b/usr/src/cmd/bhyve/pctestdev.h
new file mode 100644
index 0000000000..c1c940146e
--- /dev/null
+++ b/usr/src/cmd/bhyve/pctestdev.h
@@ -0,0 +1,43 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Adam Fenn <adam@fenn.io>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Emulation of selected legacy test/debug interfaces expected by KVM-unit-tests
+ */
+
+#ifndef _PCTESTDEV_H_
+#define _PCTESTDEV_H_
+
+struct vmctx;
+
+const char *pctestdev_getname(void);
+int pctestdev_init(struct vmctx *ctx);
+int pctestdev_parse(const char *opts);
+
+#endif
diff --git a/usr/src/cmd/bhyve/pm.c b/usr/src/cmd/bhyve/pm.c
index fa162faab1..d2732242f9 100644
--- a/usr/src/cmd/bhyve/pm.c
+++ b/usr/src/cmd/bhyve/pm.c
@@ -211,7 +211,7 @@ pm1_enable_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
* the global lock, but ACPI-CA whines profusely if it
* can't set GBL_EN.
*/
- pm1_enable = *eax & (PM1_PWRBTN_EN | PM1_GBL_EN);
+ pm1_enable = *eax & (PM1_RTC_EN | PM1_PWRBTN_EN | PM1_GBL_EN);
sci_update(ctx);
}
pthread_mutex_unlock(&pm_lock);
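With the widened mask a guest can now arm the RTC alarm as an SCI source;
writes that set PM1_RTC_EN were previously discarded, so RTC wake events
never asserted an interrupt. A sketch of the write-side filter with the
ACPI-defined bit positions spelled out (the real PM1_* macros live
elsewhere in pm.c; pm1_enable_filter is a name invented here):

	#include <stdint.h>

	#define PM1_GBL_EN	(1 << 5)	/* global-lock release event */
	#define PM1_PWRBTN_EN	(1 << 8)	/* power button event */
	#define PM1_RTC_EN	(1 << 10)	/* RTC alarm event */

	static uint16_t
	pm1_enable_filter(uint16_t guest_val)
	{
		/* only events the device model can actually deliver stick */
		return (guest_val & (PM1_RTC_EN | PM1_PWRBTN_EN | PM1_GBL_EN));
	}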
diff --git a/usr/src/cmd/bhyve/smbiostbl.c b/usr/src/cmd/bhyve/smbiostbl.c
index 8af8a85755..3df2012f10 100644
--- a/usr/src/cmd/bhyve/smbiostbl.c
+++ b/usr/src/cmd/bhyve/smbiostbl.c
@@ -52,6 +52,10 @@ __FBSDID("$FreeBSD$");
#define SMBIOS_BASE 0xF1000
+#define FIRMWARE_VERSION "13.0"
+/* The SMBIOS specification defines the date format to be mm/dd/yyyy */
+#define FIRMWARE_RELEASE_DATE "11/10/2020"
+
 /* (BHYVE_ACPI_BASE - SMBIOS_BASE) */
#define SMBIOS_MAX_LENGTH (0xF2400 - 0xF1000)
@@ -324,9 +328,9 @@ struct smbios_table_type0 smbios_type0_template = {
};
const char *smbios_type0_strings[] = {
- "BHYVE", /* vendor string */
- "1.00", /* bios version string */
- "03/14/2014", /* bios release date string */
+ "BHYVE", /* vendor string */
+ FIRMWARE_VERSION, /* bios version string */
+ FIRMWARE_RELEASE_DATE, /* bios release date string */
NULL
};
@@ -347,12 +351,12 @@ static int smbios_type1_initializer(struct smbios_structure *template_entry,
uint16_t *n, uint16_t *size);
const char *smbios_type1_strings[] = {
- " ", /* manufacturer string */
- "BHYVE", /* product name string */
- "1.0", /* version string */
- "None", /* serial number string */
- "None", /* sku string */
- " ", /* family name string */
+ "illumos", /* manufacturer string */
+ "BHYVE", /* product name string */
+ "1.0", /* version string */
+ "None", /* serial number string */
+ "None", /* sku string */
+ "Virtual Machine", /* family name string */
NULL
};
@@ -375,7 +379,7 @@ struct smbios_table_type3 smbios_type3_template = {
};
const char *smbios_type3_strings[] = {
- " ", /* manufacturer string */
+ "illumos", /* manufacturer string */
"1.0", /* version string */
"None", /* serial number string */
"None", /* asset tag string */
@@ -755,7 +759,7 @@ smbios_type19_initializer(struct smbios_structure *template_entry,
type19 = (struct smbios_table_type19 *)curaddr;
type19->arrayhand = type16_handle;
type19->xsaddr = 4*GB;
- type19->xeaddr = guest_himem;
+ type19->xeaddr = type19->xsaddr + guest_himem;
}
return (0);
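The type 19 fix corrects the extended range of the above-4-GiB memory
array mapping: xsaddr and xeaddr are absolute byte addresses, while
guest_himem is the amount of memory above the 4 GiB boundary, so the old
code placed the end of the range below its start. A worked example for a
guest with 8 GiB of high memory:

	uint64_t xsaddr = 4ULL * GB;		/* 0x100000000 */
	uint64_t xeaddr = xsaddr + guest_himem;	/* 0x300000000 for 8 GiB */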
diff --git a/usr/src/cmd/bhyve/usb_mouse.c b/usr/src/cmd/bhyve/usb_mouse.c
index 8af86fcdc7..7790fe0ec9 100644
--- a/usr/src/cmd/bhyve/usb_mouse.c
+++ b/usr/src/cmd/bhyve/usb_mouse.c
@@ -72,7 +72,7 @@ enum {
};
static const char *umouse_desc_strings[] = {
- "\x04\x09",
+ "\x09\x04",
"BHYVE",
"HID Tablet",
"01",
@@ -388,7 +388,7 @@ umouse_request(void *scarg, struct usb_data_xfer *xfer)
"sizeof(umouse_dev_desc) %lu",
len, sizeof(umouse_dev_desc)));
if ((value & 0xFF) != 0) {
- err = USB_ERR_IOERROR;
+ err = USB_ERR_STALLED;
goto done;
}
if (len > sizeof(umouse_dev_desc)) {
@@ -403,7 +403,7 @@ umouse_request(void *scarg, struct usb_data_xfer *xfer)
case UDESC_CONFIG:
DPRINTF(("umouse: (->UDESC_CONFIG)"));
if ((value & 0xFF) != 0) {
- err = USB_ERR_IOERROR;
+ err = USB_ERR_STALLED;
goto done;
}
if (len > sizeof(umouse_confd)) {
@@ -472,7 +472,7 @@ umouse_request(void *scarg, struct usb_data_xfer *xfer)
default:
DPRINTF(("umouse: unknown(%d)->ERROR", value >> 8));
- err = USB_ERR_IOERROR;
+ err = USB_ERR_STALLED;
goto done;
}
eshort = data->blen > 0;
@@ -496,7 +496,7 @@ umouse_request(void *scarg, struct usb_data_xfer *xfer)
break;
default:
DPRINTF(("umouse: IO ERROR"));
- err = USB_ERR_IOERROR;
+ err = USB_ERR_STALLED;
goto done;
}
eshort = data->blen > 0;
@@ -507,7 +507,7 @@ umouse_request(void *scarg, struct usb_data_xfer *xfer)
if (index != 0) {
DPRINTF(("umouse get_interface, invalid index %d",
index));
- err = USB_ERR_IOERROR;
+ err = USB_ERR_STALLED;
goto done;
}
@@ -578,7 +578,7 @@ umouse_request(void *scarg, struct usb_data_xfer *xfer)
case UREQ(UR_SET_FEATURE, UT_WRITE_INTERFACE):
case UREQ(UR_SET_FEATURE, UT_WRITE_ENDPOINT):
DPRINTF(("umouse: (UR_CLEAR_FEATURE, UT_WRITE_INTERFACE)"));
- err = USB_ERR_IOERROR;
+ err = USB_ERR_STALLED;
goto done;
case UREQ(UR_SET_INTERFACE, UT_WRITE_INTERFACE):
@@ -617,7 +617,7 @@ umouse_request(void *scarg, struct usb_data_xfer *xfer)
memcpy(data->buf, &sc->um_report, len);
data->bdone += len;
} else {
- err = USB_ERR_IOERROR;
+ err = USB_ERR_STALLED;
goto done;
}
eshort = data->blen > 0;
@@ -659,7 +659,7 @@ umouse_request(void *scarg, struct usb_data_xfer *xfer)
default:
DPRINTF(("**** umouse request unhandled"));
- err = USB_ERR_IOERROR;
+ err = USB_ERR_STALLED;
break;
}
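The USB_ERR_IOERROR to USB_ERR_STALLED conversions throughout this file
align the tablet with chapter 9 of the USB 2.0 specification: a device
that receives a request it does not support must answer with a protocol
STALL on the control pipe, which the host driver treats as a recoverable
"not supported" indication, whereas an I/O error surfaces as a transport
fault. A minimal sketch of the convention (handle_request and the enum
values are illustrative stand-ins, not the real usb_error_t constants):

	#include <stdint.h>

	enum usb_err { USB_OK, USB_STALLED };	/* illustrative values */

	static enum usb_err
	handle_request(uint16_t req)
	{
		switch (req) {
		case 0x0680:	/* GET_DESCRIPTOR, device-to-host standard */
			return (USB_OK);
		default:
			/* USB 2.0 9.2.7: unsupported requests stall the pipe */
			return (USB_STALLED);
		}
	}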
diff --git a/usr/src/contrib/bhyve/dev/nvme/nvme.h b/usr/src/contrib/bhyve/dev/nvme/nvme.h
index c7f6496426..6fbf2b758f 100644
--- a/usr/src/contrib/bhyve/dev/nvme/nvme.h
+++ b/usr/src/contrib/bhyve/dev/nvme/nvme.h
@@ -30,6 +30,18 @@
* Copyright 2019 Joyent, Inc.
*/
+/*
+ * illumos port notes:
+ *
+ * The upstream version of this file uses conditionals of the form
+ * #if _BYTE_ORDER != _LITTLE_ENDIAN
+ * Rather than carry the file verbatim from upstream with only that
+ * little bit changed, the conditionals are locally patched below.
+ *
+ * There is also a static assertion which has been commented out due to a
+ * problem with smatch.
+ */
+
#ifndef __NVME_H__
#define __NVME_H__
@@ -42,6 +54,8 @@
#define NVME_PASSTHROUGH_CMD _IOWR('n', 0, struct nvme_pt_command)
#define NVME_RESET_CONTROLLER _IO('n', 1)
+#define NVME_GET_NSID _IOR('n', 2, struct nvme_get_nsid)
+#define NVME_GET_MAX_XFER_SIZE _IOR('n', 3, uint64_t)
#define NVME_IO_TEST _IOWR('n', 100, struct nvme_io_test)
#define NVME_BIO_TEST _IOWR('n', 101, struct nvme_io_test)
@@ -59,8 +73,8 @@
*/
#define NVME_GLOBAL_NAMESPACE_TAG ((uint32_t)0xFFFFFFFF)
-/* Cap nvme to 1MB transfers driver explodes with larger sizes */
-#define NVME_MAX_XFER_SIZE (MAXPHYS < (1<<20) ? MAXPHYS : (1<<20))
+/* Cap transfers at what a page-sized PRP list can address (4KB -> 2MB). */
+#define NVME_MAX_XFER_SIZE MIN(maxphys, (PAGE_SIZE/8*PAGE_SIZE))
/* Register field definitions */
#define NVME_CAP_LO_REG_MQES_SHIFT (0)
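The new bound reflects what a single page-sized PRP list can address:
each 8-byte entry maps one page of guest memory, so with 4 KiB pages a
list holds 512 entries and the largest transfer a command can describe
is 2 MiB, further clamped by the host's maxphys. The arithmetic in
isolation:

	#define PAGE_SIZE	4096
	#define PRP_ENTRIES	(PAGE_SIZE / 8)			/* 512 */
	#define MAX_PRP_XFER	(PRP_ENTRIES * PAGE_SIZE)	/* 2 MiB */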
@@ -71,15 +85,51 @@
#define NVME_CAP_LO_REG_AMS_MASK (0x3)
#define NVME_CAP_LO_REG_TO_SHIFT (24)
#define NVME_CAP_LO_REG_TO_MASK (0xFF)
+#define NVME_CAP_LO_MQES(x) \
+ (((x) >> NVME_CAP_LO_REG_MQES_SHIFT) & NVME_CAP_LO_REG_MQES_MASK)
+#define NVME_CAP_LO_CQR(x) \
+ (((x) >> NVME_CAP_LO_REG_CQR_SHIFT) & NVME_CAP_LO_REG_CQR_MASK)
+#define NVME_CAP_LO_AMS(x) \
+ (((x) >> NVME_CAP_LO_REG_AMS_SHIFT) & NVME_CAP_LO_REG_AMS_MASK)
+#define NVME_CAP_LO_TO(x) \
+ (((x) >> NVME_CAP_LO_REG_TO_SHIFT) & NVME_CAP_LO_REG_TO_MASK)
#define NVME_CAP_HI_REG_DSTRD_SHIFT (0)
#define NVME_CAP_HI_REG_DSTRD_MASK (0xF)
+#define NVME_CAP_HI_REG_NSSRS_SHIFT (4)
+#define NVME_CAP_HI_REG_NSSRS_MASK (0x1)
+#define NVME_CAP_HI_REG_CSS_SHIFT (5)
+#define NVME_CAP_HI_REG_CSS_MASK (0xff)
#define NVME_CAP_HI_REG_CSS_NVM_SHIFT (5)
#define NVME_CAP_HI_REG_CSS_NVM_MASK (0x1)
+#define NVME_CAP_HI_REG_BPS_SHIFT (13)
+#define NVME_CAP_HI_REG_BPS_MASK (0x1)
#define NVME_CAP_HI_REG_MPSMIN_SHIFT (16)
#define NVME_CAP_HI_REG_MPSMIN_MASK (0xF)
#define NVME_CAP_HI_REG_MPSMAX_SHIFT (20)
#define NVME_CAP_HI_REG_MPSMAX_MASK (0xF)
+#define NVME_CAP_HI_REG_PMRS_SHIFT (24)
+#define NVME_CAP_HI_REG_PMRS_MASK (0x1)
+#define NVME_CAP_HI_REG_CMBS_SHIFT (25)
+#define NVME_CAP_HI_REG_CMBS_MASK (0x1)
+#define NVME_CAP_HI_DSTRD(x) \
+ (((x) >> NVME_CAP_HI_REG_DSTRD_SHIFT) & NVME_CAP_HI_REG_DSTRD_MASK)
+#define NVME_CAP_HI_NSSRS(x) \
+ (((x) >> NVME_CAP_HI_REG_NSSRS_SHIFT) & NVME_CAP_HI_REG_NSSRS_MASK)
+#define NVME_CAP_HI_CSS(x) \
+ (((x) >> NVME_CAP_HI_REG_CSS_SHIFT) & NVME_CAP_HI_REG_CSS_MASK)
+#define NVME_CAP_HI_CSS_NVM(x) \
+ (((x) >> NVME_CAP_HI_REG_CSS_NVM_SHIFT) & NVME_CAP_HI_REG_CSS_NVM_MASK)
+#define NVME_CAP_HI_BPS(x) \
+ (((x) >> NVME_CAP_HI_REG_BPS_SHIFT) & NVME_CAP_HI_REG_BPS_MASK)
+#define NVME_CAP_HI_MPSMIN(x) \
+ (((x) >> NVME_CAP_HI_REG_MPSMIN_SHIFT) & NVME_CAP_HI_REG_MPSMIN_MASK)
+#define NVME_CAP_HI_MPSMAX(x) \
+ (((x) >> NVME_CAP_HI_REG_MPSMAX_SHIFT) & NVME_CAP_HI_REG_MPSMAX_MASK)
+#define NVME_CAP_HI_PMRS(x) \
+ (((x) >> NVME_CAP_HI_REG_PMRS_SHIFT) & NVME_CAP_HI_REG_PMRS_MASK)
+#define NVME_CAP_HI_CMBS(x) \
+ (((x) >> NVME_CAP_HI_REG_CMBS_SHIFT) & NVME_CAP_HI_REG_CMBS_MASK)
#define NVME_CC_REG_EN_SHIFT (0)
#define NVME_CC_REG_EN_MASK (0x1)
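The block of function-like macros added above encodes the header's
field-access convention: registers are kept as plain integers and each
field is recovered with a shift-and-mask, so callers no longer open-code
the expressions. A usage sketch (nvme_read4 is a hypothetical register
read helper, not part of this header):

	uint32_t cap_hi = nvme_read4(ctrlr, 0x4);	/* CAP, upper dword */
	uint32_t dstrd = NVME_CAP_HI_DSTRD(cap_hi);	/* doorbell stride */
	uint32_t mpsmin = NVME_CAP_HI_MPSMIN(cap_hi);	/* page = 2^(12+n) */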
@@ -102,6 +152,10 @@
#define NVME_CSTS_REG_CFS_MASK (0x1)
#define NVME_CSTS_REG_SHST_SHIFT (2)
#define NVME_CSTS_REG_SHST_MASK (0x3)
+#define NVME_CSTS_REG_NVSRO_SHIFT (4)
+#define NVME_CSTS_REG_NVSRO_MASK (0x1)
+#define NVME_CSTS_REG_PP_SHIFT (5)
+#define NVME_CSTS_REG_PP_MASK (0x1)
#define NVME_CSTS_GET_SHST(csts) (((csts) >> NVME_CSTS_REG_SHST_SHIFT) & NVME_CSTS_REG_SHST_MASK)
@@ -110,6 +164,36 @@
#define NVME_AQA_REG_ACQS_SHIFT (16)
#define NVME_AQA_REG_ACQS_MASK (0xFFF)
+#define NVME_PMRCAP_REG_RDS_SHIFT (3)
+#define NVME_PMRCAP_REG_RDS_MASK (0x1)
+#define NVME_PMRCAP_REG_WDS_SHIFT (4)
+#define NVME_PMRCAP_REG_WDS_MASK (0x1)
+#define NVME_PMRCAP_REG_BIR_SHIFT (5)
+#define NVME_PMRCAP_REG_BIR_MASK (0x7)
+#define NVME_PMRCAP_REG_PMRTU_SHIFT (8)
+#define NVME_PMRCAP_REG_PMRTU_MASK (0x3)
+#define NVME_PMRCAP_REG_PMRWBM_SHIFT (10)
+#define NVME_PMRCAP_REG_PMRWBM_MASK (0xf)
+#define NVME_PMRCAP_REG_PMRTO_SHIFT (16)
+#define NVME_PMRCAP_REG_PMRTO_MASK (0xff)
+#define NVME_PMRCAP_REG_CMSS_SHIFT (24)
+#define NVME_PMRCAP_REG_CMSS_MASK (0x1)
+
+#define NVME_PMRCAP_RDS(x) \
+ (((x) >> NVME_PMRCAP_REG_RDS_SHIFT) & NVME_PMRCAP_REG_RDS_MASK)
+#define NVME_PMRCAP_WDS(x) \
+ (((x) >> NVME_PMRCAP_REG_WDS_SHIFT) & NVME_PMRCAP_REG_WDS_MASK)
+#define NVME_PMRCAP_BIR(x) \
+ (((x) >> NVME_PMRCAP_REG_BIR_SHIFT) & NVME_PMRCAP_REG_BIR_MASK)
+#define NVME_PMRCAP_PMRTU(x) \
+ (((x) >> NVME_PMRCAP_REG_PMRTU_SHIFT) & NVME_PMRCAP_REG_PMRTU_MASK)
+#define NVME_PMRCAP_PMRWBM(x) \
+ (((x) >> NVME_PMRCAP_REG_PMRWBM_SHIFT) & NVME_PMRCAP_REG_PMRWBM_MASK)
+#define NVME_PMRCAP_PMRTO(x) \
+ (((x) >> NVME_PMRCAP_REG_PMRTO_SHIFT) & NVME_PMRCAP_REG_PMRTO_MASK)
+#define NVME_PMRCAP_CMSS(x) \
+ (((x) >> NVME_PMRCAP_REG_CMSS_SHIFT) & NVME_PMRCAP_REG_CMSS_MASK)
+
/* Command field definitions */
#define NVME_CMD_FUSE_SHIFT (8)
@@ -121,6 +205,8 @@
#define NVME_STATUS_SC_MASK (0xFF)
#define NVME_STATUS_SCT_SHIFT (9)
#define NVME_STATUS_SCT_MASK (0x7)
+#define NVME_STATUS_CRD_SHIFT (12)
+#define NVME_STATUS_CRD_MASK (0x3)
#define NVME_STATUS_M_SHIFT (14)
#define NVME_STATUS_M_MASK (0x1)
#define NVME_STATUS_DNR_SHIFT (15)
@@ -161,6 +247,9 @@
/* SR-IOV Virtual Function */
#define NVME_CTRLR_DATA_MIC_SRIOVVF_SHIFT (2)
#define NVME_CTRLR_DATA_MIC_SRIOVVF_MASK (0x1)
+/* Asymmetric Namespace Access Reporting */
+#define NVME_CTRLR_DATA_MIC_ANAR_SHIFT (3)
+#define NVME_CTRLR_DATA_MIC_ANAR_MASK (0x1)
/** OACS - optional admin command support */
/* supports security send/receive commands */
@@ -190,6 +279,9 @@
/* supports Doorbell Buffer Config */
#define NVME_CTRLR_DATA_OACS_DBBUFFER_SHIFT (8)
#define NVME_CTRLR_DATA_OACS_DBBUFFER_MASK (0x1)
+/* supports Get LBA Status */
+#define NVME_CTRLR_DATA_OACS_GETLBA_SHIFT (9)
+#define NVME_CTRLR_DATA_OACS_GETLBA_MASK (0x1)
/** firmware updates */
/* first slot is read-only */
@@ -198,6 +290,9 @@
/* number of firmware slots */
#define NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT (1)
#define NVME_CTRLR_DATA_FRMW_NUM_SLOTS_MASK (0x7)
+/* firmware activation without reset */
+#define NVME_CTRLR_DATA_FRMW_ACT_WO_RESET_SHIFT (4)
+#define NVME_CTRLR_DATA_FRMW_ACT_WO_RESET_MASK (0x1)
/** log page attributes */
/* per namespace smart/health log page */
@@ -214,6 +309,26 @@
#define NVME_CTRLR_DATA_APSTA_APST_SUPP_SHIFT (0)
#define NVME_CTRLR_DATA_APSTA_APST_SUPP_MASK (0x1)
+/** Sanitize Capabilities */
+/* Crypto Erase Support */
+#define NVME_CTRLR_DATA_SANICAP_CES_SHIFT (0)
+#define NVME_CTRLR_DATA_SANICAP_CES_MASK (0x1)
+/* Block Erase Support */
+#define NVME_CTRLR_DATA_SANICAP_BES_SHIFT (1)
+#define NVME_CTRLR_DATA_SANICAP_BES_MASK (0x1)
+/* Overwrite Support */
+#define NVME_CTRLR_DATA_SANICAP_OWS_SHIFT (2)
+#define NVME_CTRLR_DATA_SANICAP_OWS_MASK (0x1)
+/* No-Deallocate Inhibited */
+#define NVME_CTRLR_DATA_SANICAP_NDI_SHIFT (29)
+#define NVME_CTRLR_DATA_SANICAP_NDI_MASK (0x1)
+/* No-Deallocate Modifies Media After Sanitize */
+#define NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT (30)
+#define NVME_CTRLR_DATA_SANICAP_NODMMAS_MASK (0x3)
+#define NVME_CTRLR_DATA_SANICAP_NODMMAS_UNDEF (0)
+#define NVME_CTRLR_DATA_SANICAP_NODMMAS_NO (1)
+#define NVME_CTRLR_DATA_SANICAP_NODMMAS_YES (2)
+
/** submission queue entry size */
#define NVME_CTRLR_DATA_SQES_MIN_SHIFT (0)
#define NVME_CTRLR_DATA_SQES_MIN_MASK (0xF)
@@ -241,6 +356,8 @@
#define NVME_CTRLR_DATA_ONCS_RESERV_MASK (0x1)
#define NVME_CTRLR_DATA_ONCS_TIMESTAMP_SHIFT (6)
#define NVME_CTRLR_DATA_ONCS_TIMESTAMP_MASK (0x1)
+#define NVME_CTRLR_DATA_ONCS_VERIFY_SHIFT (7)
+#define NVME_CTRLR_DATA_ONCS_VERIFY_MASK (0x1)
/** Fused Operation Support */
#define NVME_CTRLR_DATA_FUSES_CNW_SHIFT (0)
@@ -255,8 +372,15 @@
#define NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_MASK (0x1)
/** volatile write cache */
+/* volatile write cache present */
#define NVME_CTRLR_DATA_VWC_PRESENT_SHIFT (0)
#define NVME_CTRLR_DATA_VWC_PRESENT_MASK (0x1)
+/* flush all namespaces supported */
+#define NVME_CTRLR_DATA_VWC_ALL_SHIFT (1)
+#define NVME_CTRLR_DATA_VWC_ALL_MASK (0x3)
+#define NVME_CTRLR_DATA_VWC_ALL_UNKNOWN (0)
+#define NVME_CTRLR_DATA_VWC_ALL_NO (2)
+#define NVME_CTRLR_DATA_VWC_ALL_YES (3)
/** namespace features */
/* thin provisioning */
@@ -271,6 +395,9 @@
/* NGUID and EUI64 fields are not reusable */
#define NVME_NS_DATA_NSFEAT_NO_ID_REUSE_SHIFT (3)
#define NVME_NS_DATA_NSFEAT_NO_ID_REUSE_MASK (0x1)
+/* NPWG, NPWA, NPDG, NPDA, and NOWS are valid */
+#define NVME_NS_DATA_NSFEAT_NPVALID_SHIFT (4)
+#define NVME_NS_DATA_NSFEAT_NPVALID_MASK (0x1)
/** formatted lba size */
#define NVME_NS_DATA_FLBAS_FORMAT_SHIFT (0)
@@ -351,6 +478,20 @@
#define NVME_NS_DATA_FPI_SUPP_SHIFT (7)
#define NVME_NS_DATA_FPI_SUPP_MASK (0x1)
+/** Deallocate Logical Block Features */
+/* deallocated logical block read behavior */
+#define NVME_NS_DATA_DLFEAT_READ_SHIFT (0)
+#define NVME_NS_DATA_DLFEAT_READ_MASK (0x07)
+#define NVME_NS_DATA_DLFEAT_READ_NR (0x00)
+#define NVME_NS_DATA_DLFEAT_READ_00 (0x01)
+#define NVME_NS_DATA_DLFEAT_READ_FF (0x02)
+/* supports the Deallocate bit in the Write Zeroes command */
+#define NVME_NS_DATA_DLFEAT_DWZ_SHIFT (3)
+#define NVME_NS_DATA_DLFEAT_DWZ_MASK (0x01)
+/* Guard field for deallocated logical blocks is set to the CRC */
+#define NVME_NS_DATA_DLFEAT_GCRC_SHIFT (4)
+#define NVME_NS_DATA_DLFEAT_GCRC_MASK (0x01)
+
/** lba format support */
/* metadata size */
#define NVME_NS_DATA_LBAF_MS_SHIFT (0)
@@ -370,11 +511,42 @@ enum nvme_critical_warning_state {
NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP = 0x10,
};
#define NVME_CRIT_WARN_ST_RESERVED_MASK (0xE0)
+#define NVME_ASYNC_EVENT_NS_ATTRIBUTE (0x100)
+#define NVME_ASYNC_EVENT_FW_ACTIVATE (0x200)
/* slot for current FW */
#define NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT (0)
#define NVME_FIRMWARE_PAGE_AFI_SLOT_MASK (0x7)
+/* Commands Supported and Effects */
+#define NVME_CE_PAGE_CSUP_SHIFT (0)
+#define NVME_CE_PAGE_CSUP_MASK (0x1)
+#define NVME_CE_PAGE_LBCC_SHIFT (1)
+#define NVME_CE_PAGE_LBCC_MASK (0x1)
+#define NVME_CE_PAGE_NCC_SHIFT (2)
+#define NVME_CE_PAGE_NCC_MASK (0x1)
+#define NVME_CE_PAGE_NIC_SHIFT (3)
+#define NVME_CE_PAGE_NIC_MASK (0x1)
+#define NVME_CE_PAGE_CCC_SHIFT (4)
+#define NVME_CE_PAGE_CCC_MASK (0x1)
+#define NVME_CE_PAGE_CSE_SHIFT (16)
+#define NVME_CE_PAGE_CSE_MASK (0x7)
+#define NVME_CE_PAGE_UUID_SHIFT (19)
+#define NVME_CE_PAGE_UUID_MASK (0x1)
+
+/* Sanitize Status */
+#define NVME_SS_PAGE_SSTAT_STATUS_SHIFT (0)
+#define NVME_SS_PAGE_SSTAT_STATUS_MASK (0x7)
+#define NVME_SS_PAGE_SSTAT_STATUS_NEVER (0)
+#define NVME_SS_PAGE_SSTAT_STATUS_COMPLETED (1)
+#define NVME_SS_PAGE_SSTAT_STATUS_INPROG (2)
+#define NVME_SS_PAGE_SSTAT_STATUS_FAILED (3)
+#define NVME_SS_PAGE_SSTAT_STATUS_COMPLETEDWD (4)
+#define NVME_SS_PAGE_SSTAT_PASSES_SHIFT (3)
+#define NVME_SS_PAGE_SSTAT_PASSES_MASK (0x1f)
+#define NVME_SS_PAGE_SSTAT_GDE_SHIFT (8)
+#define NVME_SS_PAGE_SSTAT_GDE_MASK (0x1)
+
/* CC register SHN field values */
enum shn_value {
NVME_SHN_NORMAL = 0x1,
@@ -390,34 +562,37 @@ enum shst_value {
struct nvme_registers
{
- /** controller capabilities */
- uint32_t cap_lo;
- uint32_t cap_hi;
-
- uint32_t vs; /* version */
- uint32_t intms; /* interrupt mask set */
- uint32_t intmc; /* interrupt mask clear */
-
- /** controller configuration */
- uint32_t cc;
-
- uint32_t reserved1;
-
- /** controller status */
- uint32_t csts;
-
- uint32_t reserved2;
-
- /** admin queue attributes */
- uint32_t aqa;
-
- uint64_t asq; /* admin submission queue base addr */
- uint64_t acq; /* admin completion queue base addr */
- uint32_t reserved3[0x3f2];
-
+ uint32_t cap_lo; /* controller capabilities */
+ uint32_t cap_hi;
+ uint32_t vs; /* version */
+ uint32_t intms; /* interrupt mask set */
+ uint32_t intmc; /* interrupt mask clear */
+ uint32_t cc; /* controller configuration */
+ uint32_t reserved1;
+ uint32_t csts; /* controller status */
+ uint32_t nssr; /* NVM Subsystem Reset */
+ uint32_t aqa; /* admin queue attributes */
+ uint64_t asq; /* admin submission queue base addr */
+ uint64_t acq; /* admin completion queue base addr */
+ uint32_t cmbloc; /* Controller Memory Buffer Location */
+ uint32_t cmbsz; /* Controller Memory Buffer Size */
+ uint32_t bpinfo; /* Boot Partition Information */
+ uint32_t bprsel; /* Boot Partition Read Select */
+ uint64_t bpmbl; /* Boot Partition Memory Buffer Location */
+ uint64_t cmbmsc; /* Controller Memory Buffer Memory Space Control */
+ uint32_t cmbsts; /* Controller Memory Buffer Status */
+ uint8_t reserved3[3492]; /* 5Ch - DFFh */
+ uint32_t pmrcap; /* Persistent Memory Capabilities */
+ uint32_t pmrctl; /* Persistent Memory Region Control */
+ uint32_t pmrsts; /* Persistent Memory Region Status */
+ uint32_t pmrebs; /* Persistent Memory Region Elasticity Buffer Size */
+ uint32_t pmrswtp; /* Persistent Memory Region Sustained Write Throughput */
+ uint32_t pmrmsc_lo; /* Persistent Memory Region Controller Memory Space Control */
+ uint32_t pmrmsc_hi;
+ uint8_t reserved4[484]; /* E1Ch - FFFh */
struct {
- uint32_t sq_tdbl; /* submission queue tail doorbell */
- uint32_t cq_hdbl; /* completion queue head doorbell */
+ uint32_t sq_tdbl; /* submission queue tail doorbell */
+ uint32_t cq_hdbl; /* completion queue head doorbell */
} doorbell[1] __packed;
} __packed;
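The rewritten struct fills in every register the NVMe base specification
defines through the persistent-memory-region group, with sized reserved
spans keeping the doorbell array at its architectural offset of 0x1000.
The doorbell for a given queue is then found by the usual stride
computation (a sketch, not a function this header provides):

	#include <stdint.h>

	static inline uint32_t
	sq_doorbell_offset(uint32_t qid, uint32_t dstrd)
	{
		/* SQ tail doorbells are the even slots; stride is 4 << DSTRD */
		return (0x1000 + (2 * qid) * (4 << dstrd));
	}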
@@ -458,7 +633,6 @@ struct nvme_command
_Static_assert(sizeof(struct nvme_command) == 16 * 4, "bad size for nvme_command");
struct nvme_completion {
-
/* dword 0 */
uint32_t cdw0; /* command-specific */
@@ -492,6 +666,7 @@ enum nvme_status_code_type {
NVME_SCT_GENERIC = 0x0,
NVME_SCT_COMMAND_SPECIFIC = 0x1,
NVME_SCT_MEDIA_ERROR = 0x2,
+ NVME_SCT_PATH_RELATED = 0x3,
/* 0x3-0x6 - reserved */
NVME_SCT_VENDOR_SPECIFIC = 0x7,
};
@@ -530,6 +705,9 @@ enum nvme_generic_command_status_code {
NVME_SC_SANITIZE_IN_PROGRESS = 0x1d,
NVME_SC_SGL_DATA_BLOCK_GRAN_INVALID = 0x1e,
NVME_SC_NOT_SUPPORTED_IN_CMB = 0x1f,
+ NVME_SC_NAMESPACE_IS_WRITE_PROTECTED = 0x20,
+ NVME_SC_COMMAND_INTERRUPTED = 0x21,
+ NVME_SC_TRANSIENT_TRANSPORT_ERROR = 0x22,
NVME_SC_LBA_OUT_OF_RANGE = 0x80,
NVME_SC_CAPACITY_EXCEEDED = 0x81,
@@ -569,12 +747,15 @@ enum nvme_command_specific_status_code {
NVME_SC_NS_NOT_ATTACHED = 0x1a,
NVME_SC_THIN_PROV_NOT_SUPPORTED = 0x1b,
NVME_SC_CTRLR_LIST_INVALID = 0x1c,
- NVME_SC_SELT_TEST_IN_PROGRESS = 0x1d,
+ NVME_SC_SELF_TEST_IN_PROGRESS = 0x1d,
NVME_SC_BOOT_PART_WRITE_PROHIB = 0x1e,
NVME_SC_INVALID_CTRLR_ID = 0x1f,
NVME_SC_INVALID_SEC_CTRLR_STATE = 0x20,
NVME_SC_INVALID_NUM_OF_CTRLR_RESRC = 0x21,
NVME_SC_INVALID_RESOURCE_ID = 0x22,
+ NVME_SC_SANITIZE_PROHIBITED_WPMRE = 0x23,
+ NVME_SC_ANA_GROUP_ID_INVALID = 0x24,
+ NVME_SC_ANA_ATTACH_FAILED = 0x25,
NVME_SC_CONFLICTING_ATTRIBUTES = 0x80,
NVME_SC_INVALID_PROTECTION_INFO = 0x81,
@@ -593,6 +774,17 @@ enum nvme_media_error_status_code {
NVME_SC_DEALLOCATED_OR_UNWRITTEN = 0x87,
};
+/* path related status codes */
+enum nvme_path_related_status_code {
+ NVME_SC_INTERNAL_PATH_ERROR = 0x00,
+ NVME_SC_ASYMMETRIC_ACCESS_PERSISTENT_LOSS = 0x01,
+ NVME_SC_ASYMMETRIC_ACCESS_INACCESSIBLE = 0x02,
+ NVME_SC_ASYMMETRIC_ACCESS_TRANSITION = 0x03,
+ NVME_SC_CONTROLLER_PATHING_ERROR = 0x60,
+ NVME_SC_HOST_PATHING_ERROR = 0x70,
+	NVME_SC_COMMAND_ABORTED_BY_HOST		= 0x71,
+};
+
/* admin opcodes */
enum nvme_admin_opcode {
NVME_OPC_DELETE_IO_SQ = 0x00,
@@ -612,20 +804,27 @@ enum nvme_admin_opcode {
/* 0x0e-0x0f - reserved */
NVME_OPC_FIRMWARE_ACTIVATE = 0x10,
NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD = 0x11,
+ /* 0x12-0x13 - reserved */
NVME_OPC_DEVICE_SELF_TEST = 0x14,
NVME_OPC_NAMESPACE_ATTACHMENT = 0x15,
+ /* 0x16-0x17 - reserved */
NVME_OPC_KEEP_ALIVE = 0x18,
NVME_OPC_DIRECTIVE_SEND = 0x19,
NVME_OPC_DIRECTIVE_RECEIVE = 0x1a,
+ /* 0x1b - reserved */
NVME_OPC_VIRTUALIZATION_MANAGEMENT = 0x1c,
NVME_OPC_NVME_MI_SEND = 0x1d,
NVME_OPC_NVME_MI_RECEIVE = 0x1e,
+ /* 0x1f-0x7b - reserved */
NVME_OPC_DOORBELL_BUFFER_CONFIG = 0x7c,
NVME_OPC_FORMAT_NVM = 0x80,
NVME_OPC_SECURITY_SEND = 0x81,
NVME_OPC_SECURITY_RECEIVE = 0x82,
+ /* 0x83 - reserved */
NVME_OPC_SANITIZE = 0x84,
+ /* 0x85 - reserved */
+ NVME_OPC_GET_LBA_STATUS = 0x86,
};
/* nvme nvm opcodes */
@@ -636,11 +835,11 @@ enum nvme_nvm_opcode {
/* 0x03 - reserved */
NVME_OPC_WRITE_UNCORRECTABLE = 0x04,
NVME_OPC_COMPARE = 0x05,
- /* 0x06 - reserved */
+ /* 0x06-0x07 - reserved */
NVME_OPC_WRITE_ZEROES = 0x08,
- /* 0x07 - reserved */
NVME_OPC_DATASET_MANAGEMENT = 0x09,
- /* 0x0a-0x0c - reserved */
+ /* 0x0a-0x0b - reserved */
+ NVME_OPC_VERIFY = 0x0c,
NVME_OPC_RESERVATION_REGISTER = 0x0d,
NVME_OPC_RESERVATION_REPORT = 0x0e,
/* 0x0f-0x10 - reserved */
@@ -668,10 +867,21 @@ enum nvme_feature {
NVME_FEAT_KEEP_ALIVE_TIMER = 0x0F,
NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT = 0x10,
NVME_FEAT_NON_OP_POWER_STATE_CONFIG = 0x11,
- /* 0x12-0x77 - reserved */
+ NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG = 0x12,
+ NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG = 0x13,
+ NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW = 0x14,
+ NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES = 0x15,
+ NVME_FEAT_HOST_BEHAVIOR_SUPPORT = 0x16,
+ NVME_FEAT_SANITIZE_CONFIG = 0x17,
+ NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION = 0x18,
+ /* 0x19-0x77 - reserved */
/* 0x78-0x7f - NVMe Management Interface */
NVME_FEAT_SOFTWARE_PROGRESS_MARKER = 0x80,
- /* 0x81-0xBF - command set specific (reserved) */
+ NVME_FEAT_HOST_IDENTIFIER = 0x81,
+ NVME_FEAT_RESERVATION_NOTIFICATION_MASK = 0x82,
+ NVME_FEAT_RESERVATION_PERSISTENCE = 0x83,
+ NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG = 0x84,
+ /* 0x85-0xBF - command set specific (reserved) */
/* 0xC0-0xFF - vendor specific */
};
@@ -717,7 +927,6 @@ _Static_assert(sizeof(struct nvme_power_state) == 32, "bad size for nvme_power_s
#define NVME_FIRMWARE_REVISION_LENGTH 8
struct nvme_controller_data {
-
/* bytes 0-255: controller capabilities and features */
/** pci vendor id */
@@ -765,12 +974,27 @@ struct nvme_controller_data {
/** Controller Attributes */
uint32_t ctratt; /* bitfield really */
- uint8_t reserved1[12];
+ /** Read Recovery Levels Supported */
+ uint16_t rrls;
+
+ uint8_t reserved1[9];
+
+ /** Controller Type */
+ uint8_t cntrltype;
/** FRU Globally Unique Identifier */
uint8_t fguid[16];
- uint8_t reserved2[128];
+ /** Command Retry Delay Time 1 */
+ uint16_t crdt1;
+
+ /** Command Retry Delay Time 2 */
+ uint16_t crdt2;
+
+ /** Command Retry Delay Time 3 */
+ uint16_t crdt3;
+
+ uint8_t reserved2[122];
/* bytes 256-511: admin command set attributes */
@@ -850,7 +1074,34 @@ struct nvme_controller_data {
/** Sanitize Capabilities */
uint32_t sanicap; /* Really a bitfield */
- uint8_t reserved3[180];
+ /** Host Memory Buffer Minimum Descriptor Entry Size */
+ uint32_t hmminds;
+
+ /** Host Memory Maximum Descriptors Entries */
+ uint16_t hmmaxd;
+
+ /** NVM Set Identifier Maximum */
+ uint16_t nsetidmax;
+
+ /** Endurance Group Identifier Maximum */
+ uint16_t endgidmax;
+
+ /** ANA Transition Time */
+ uint8_t anatt;
+
+ /** Asymmetric Namespace Access Capabilities */
+ uint8_t anacap;
+
+ /** ANA Group Identifier Maximum */
+ uint32_t anagrpmax;
+
+ /** Number of ANA Group Identifiers */
+ uint32_t nanagrpid;
+
+ /** Persistent Event Log Size */
+ uint32_t pels;
+
+ uint8_t reserved3[156];
/* bytes 512-703: nvm command set attributes */
/** submission queue entry size */
@@ -885,7 +1136,9 @@ struct nvme_controller_data {
/** NVM Vendor Specific Command Configuration */
uint8_t nvscc;
- uint8_t reserved5;
+
+ /** Namespace Write Protection Capabilities */
+ uint8_t nwpc;
/** Atomic Compare & Write Unit */
uint16_t acwu;
@@ -894,8 +1147,11 @@ struct nvme_controller_data {
/** SGL Support */
uint32_t sgls;
+ /** Maximum Number of Allowed Namespaces */
+ uint32_t mnan;
+
 	/* bytes 544-767: Reserved */
- uint8_t reserved7[228];
+ uint8_t reserved7[224];
/** NVM Subsystem NVMe Qualified Name */
uint8_t subnqn[256];
@@ -916,7 +1172,6 @@ struct nvme_controller_data {
_Static_assert(sizeof(struct nvme_controller_data) == 4096, "bad size for nvme_controller_data");
struct nvme_namespace_data {
-
/** namespace size */
uint64_t nsze;
@@ -980,8 +1235,38 @@ struct nvme_namespace_data {
/** NVM Capacity */
uint8_t nvmcap[16];
- /* bytes 64-103: Reserved */
- uint8_t reserved5[40];
+ /** Namespace Preferred Write Granularity */
+ uint16_t npwg;
+
+ /** Namespace Preferred Write Alignment */
+ uint16_t npwa;
+
+ /** Namespace Preferred Deallocate Granularity */
+ uint16_t npdg;
+
+ /** Namespace Preferred Deallocate Alignment */
+ uint16_t npda;
+
+ /** Namespace Optimal Write Size */
+ uint16_t nows;
+
+ /* bytes 74-91: Reserved */
+ uint8_t reserved5[18];
+
+ /** ANA Group Identifier */
+ uint32_t anagrpid;
+
+ /* bytes 96-98: Reserved */
+ uint8_t reserved6[3];
+
+ /** Namespace Attributes */
+ uint8_t nsattr;
+
+ /** NVM Set Identifier */
+ uint16_t nvmsetid;
+
+ /** Endurance Group Identifier */
+ uint16_t endgid;
/** Namespace Globally Unique Identifier */
uint8_t nguid[16];
@@ -992,7 +1277,7 @@ struct nvme_namespace_data {
/** lba format support */
uint32_t lbaf[16];
- uint8_t reserved6[192];
+ uint8_t reserved7[192];
uint8_t vendor_specific[3712];
} __packed __aligned(4);
@@ -1000,16 +1285,27 @@ struct nvme_namespace_data {
 _Static_assert(sizeof(struct nvme_namespace_data) == 4096, "bad size for nvme_namespace_data");
enum nvme_log_page {
-
/* 0x00 - reserved */
NVME_LOG_ERROR = 0x01,
NVME_LOG_HEALTH_INFORMATION = 0x02,
NVME_LOG_FIRMWARE_SLOT = 0x03,
NVME_LOG_CHANGED_NAMESPACE = 0x04,
NVME_LOG_COMMAND_EFFECT = 0x05,
+ NVME_LOG_DEVICE_SELF_TEST = 0x06,
+ NVME_LOG_TELEMETRY_HOST_INITIATED = 0x07,
+ NVME_LOG_TELEMETRY_CONTROLLER_INITIATED = 0x08,
+ NVME_LOG_ENDURANCE_GROUP_INFORMATION = 0x09,
+ NVME_LOG_PREDICTABLE_LATENCY_PER_NVM_SET = 0x0a,
+ NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE = 0x0b,
+	NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS	= 0x0c,
+ NVME_LOG_PERSISTENT_EVENT_LOG = 0x0d,
+ NVME_LOG_LBA_STATUS_INFORMATION = 0x0e,
+ NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE = 0x0f,
 	/* 0x10-0x7F - reserved */
/* 0x80-0xBF - I/O command set specific */
NVME_LOG_RES_NOTIFICATION = 0x80,
+ NVME_LOG_SANITIZE_STATUS = 0x81,
+ /* 0x82-0xBF - reserved */
/* 0xC0-0xFF - vendor specific */
/*
@@ -1029,7 +1325,6 @@ enum nvme_log_page {
};
struct nvme_error_information_entry {
-
uint64_t error_count;
uint16_t sqid;
uint16_t cid;
@@ -1038,13 +1333,16 @@ struct nvme_error_information_entry {
uint64_t lba;
uint32_t nsid;
uint8_t vendor_specific;
- uint8_t reserved[35];
+ uint8_t trtype;
+ uint16_t reserved30;
+ uint64_t csi;
+ uint16_t ttsi;
+ uint8_t reserved[22];
} __packed __aligned(4);
_Static_assert(sizeof(struct nvme_error_information_entry) == 64, "bad size for nvme_error_information_entry");
struct nvme_health_information_page {
-
uint8_t critical_warning;
uint16_t temperature;
uint8_t available_spare;
@@ -1074,8 +1372,16 @@ struct nvme_health_information_page {
uint32_t warning_temp_time;
uint32_t error_temp_time;
uint16_t temp_sensor[8];
-
- uint8_t reserved2[296];
+ /* Thermal Management Temperature 1 Transition Count */
+ uint32_t tmt1tc;
+ /* Thermal Management Temperature 2 Transition Count */
+ uint32_t tmt2tc;
+ /* Total Time For Thermal Management Temperature 1 */
+ uint32_t ttftmt1;
+ /* Total Time For Thermal Management Temperature 2 */
+ uint32_t ttftmt2;
+
+ uint8_t reserved2[280];
} __packed __aligned(4);
/* Currently sparse/smatch incorrectly packs this struct in some situations. */
@@ -1084,7 +1390,6 @@ _Static_assert(sizeof(struct nvme_health_information_page) == 512, "bad size for
#endif
struct nvme_firmware_page {
-
uint8_t afi;
uint8_t reserved[7];
uint64_t revision[7]; /* revisions for 7 slots */
@@ -1099,6 +1404,43 @@ struct nvme_ns_list {
_Static_assert(sizeof(struct nvme_ns_list) == 4096, "bad size for nvme_ns_list");
+struct nvme_command_effects_page {
+ uint32_t acs[256];
+ uint32_t iocs[256];
+ uint8_t reserved[2048];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_command_effects_page) == 4096,
+ "bad size for nvme_command_effects_page");
+
+struct nvme_res_notification_page {
+ uint64_t log_page_count;
+ uint8_t log_page_type;
+ uint8_t available_log_pages;
+ uint8_t reserved2;
+ uint32_t nsid;
+ uint8_t reserved[48];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_res_notification_page) == 64,
+ "bad size for nvme_res_notification_page");
+
+struct nvme_sanitize_status_page {
+ uint16_t sprog;
+ uint16_t sstat;
+ uint32_t scdw10;
+ uint32_t etfo;
+ uint32_t etfbe;
+ uint32_t etfce;
+ uint32_t etfownd;
+ uint32_t etfbewnd;
+ uint32_t etfcewnd;
+ uint8_t reserved[480];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_sanitize_status_page) == 512,
+ "bad size for nvme_sanitize_status_page");
+
struct intel_log_temp_stats
{
uint64_t current;
@@ -1114,10 +1456,59 @@ struct intel_log_temp_stats
_Static_assert(sizeof(struct intel_log_temp_stats) == 13 * 8, "bad size for intel_log_temp_stats");
+struct nvme_resv_reg_ctrlr
+{
+ uint16_t ctrlr_id; /* Controller ID */
+ uint8_t rcsts; /* Reservation Status */
+ uint8_t reserved3[5];
+ uint64_t hostid; /* Host Identifier */
+ uint64_t rkey; /* Reservation Key */
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_resv_reg_ctrlr) == 24, "bad size for nvme_resv_reg_ctrlr");
+
+struct nvme_resv_reg_ctrlr_ext
+{
+ uint16_t ctrlr_id; /* Controller ID */
+ uint8_t rcsts; /* Reservation Status */
+ uint8_t reserved3[5];
+ uint64_t rkey; /* Reservation Key */
+ uint64_t hostid[2]; /* Host Identifier */
+ uint8_t reserved32[32];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_resv_reg_ctrlr_ext) == 64, "bad size for nvme_resv_reg_ctrlr_ext");
+
+struct nvme_resv_status
+{
+ uint32_t gen; /* Generation */
+ uint8_t rtype; /* Reservation Type */
+ uint8_t regctl[2]; /* Number of Registered Controllers */
+ uint8_t reserved7[2];
+ uint8_t ptpls; /* Persist Through Power Loss State */
+ uint8_t reserved10[14];
+ struct nvme_resv_reg_ctrlr ctrlr[0];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_resv_status) == 24, "bad size for nvme_resv_status");
+
+struct nvme_resv_status_ext
+{
+ uint32_t gen; /* Generation */
+ uint8_t rtype; /* Reservation Type */
+ uint8_t regctl[2]; /* Number of Registered Controllers */
+ uint8_t reserved7[2];
+ uint8_t ptpls; /* Persist Through Power Loss State */
+ uint8_t reserved10[14];
+ uint8_t reserved24[40];
+ struct nvme_resv_reg_ctrlr_ext ctrlr[0];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_resv_status_ext) == 64, "bad size for nvme_resv_status_ext");
+
#define NVME_TEST_MAX_THREADS 128
struct nvme_io_test {
-
enum nvme_nvm_opcode opc;
uint32_t size;
uint32_t time; /* in seconds */
@@ -1127,7 +1518,6 @@ struct nvme_io_test {
};
enum nvme_io_test_flags {
-
/*
* Specifies whether dev_refthread/dev_relthread should be
* called during NVME_BIO_TEST. Ignored for other test
@@ -1137,7 +1527,6 @@ enum nvme_io_test_flags {
};
struct nvme_pt_command {
-
/*
* cmd is used to specify a passthrough command to a controller or
* namespace.
@@ -1189,6 +1578,17 @@ struct nvme_pt_command {
struct mtx * driver_lock;
};
+struct nvme_get_nsid {
+ char cdev[SPECNAMELEN + 1];
+ uint32_t nsid;
+};
+
+struct nvme_hmb_desc {
+ uint64_t addr;
+ uint32_t size;
+ uint32_t reserved;
+};
+
#define nvme_completion_is_error(cpl) \
(NVME_STATUS_GET_SC((cpl)->status) != 0 || NVME_STATUS_GET_SCT((cpl)->status) != 0)
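The nvme_get_nsid structure pairs with the NVME_GET_NSID ioctl added
near the top of this header: given an open namespace device, it reports
the owning controller's device name and the namespace ID. A userland
usage sketch (the device path is illustrative):

	struct nvme_get_nsid gn;
	int fd = open("/dev/nvme0ns1", O_RDWR);	/* path is an assumption */

	if (fd >= 0 && ioctl(fd, NVME_GET_NSID, &gn) == 0)
		printf("controller %s, nsid %u\n", gn.cdev, gn.nsid);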
@@ -1197,6 +1597,7 @@ void nvme_strvis(uint8_t *dst, const uint8_t *src, int dstlen, int srclen);
#ifdef _KERNEL
struct bio;
+struct thread;
struct nvme_namespace;
struct nvme_controller;
@@ -1223,6 +1624,8 @@ int nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
/* Admin functions */
void nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr,
uint8_t feature, uint32_t cdw11,
+ uint32_t cdw12, uint32_t cdw13,
+ uint32_t cdw14, uint32_t cdw15,
void *payload, uint32_t payload_size,
nvme_cb_fn_t cb_fn, void *cb_arg);
void nvme_ctrlr_cmd_get_feature(struct nvme_controller *ctrlr,
@@ -1264,6 +1667,13 @@ void nvme_unregister_consumer(struct nvme_consumer *consumer);
device_t nvme_ctrlr_get_device(struct nvme_controller *ctrlr);
const struct nvme_controller_data *
nvme_ctrlr_get_data(struct nvme_controller *ctrlr);
+static inline bool
+nvme_ctrlr_has_dataset_mgmt(const struct nvme_controller_data *cd)
+{
+ /* Assumes cd was byte swapped by nvme_controller_data_swapbytes() */
+ return ((cd->oncs >> NVME_CTRLR_DATA_ONCS_DSM_SHIFT) &
+ NVME_CTRLR_DATA_ONCS_DSM_MASK);
+}
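The new inline helper hides the ONCS bit test behind a readable name;
bit 2 of ONCS advertises Dataset Management support. A usage sketch,
assuming ctrlr is an attached controller whose identify data has already
been fetched and byte-swapped:

	const struct nvme_controller_data *cd = nvme_ctrlr_get_data(ctrlr);

	if (nvme_ctrlr_has_dataset_mgmt(cd)) {
		/* safe to issue NVME_OPC_DATASET_MANAGEMENT (TRIM) */
	}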
/* Namespace helper functions */
uint32_t nvme_ns_get_max_io_xfer_size(struct nvme_namespace *ns);
@@ -1279,6 +1689,8 @@ uint32_t nvme_ns_get_stripesize(struct nvme_namespace *ns);
int nvme_ns_bio_process(struct nvme_namespace *ns, struct bio *bp,
nvme_cb_fn_t cb_fn);
+int nvme_ns_ioctl_process(struct nvme_namespace *ns, u_long cmd,
+ caddr_t arg, int flag, struct thread *td);
/*
* Command building helper functions -- shared with CAM
@@ -1335,8 +1747,9 @@ extern int nvme_use_nvd;
 /* Endianness conversion functions for NVMe structs */
static inline
-void nvme_completion_swapbytes(struct nvme_completion *s)
+void nvme_completion_swapbytes(struct nvme_completion *s __unused)
{
+#ifndef _LITTLE_ENDIAN
s->cdw0 = le32toh(s->cdw0);
/* omit rsvd1 */
@@ -1344,22 +1757,26 @@ void nvme_completion_swapbytes(struct nvme_completion *s)
s->sqid = le16toh(s->sqid);
/* omit cid */
s->status = le16toh(s->status);
+#endif
}
static inline
-void nvme_power_state_swapbytes(struct nvme_power_state *s)
+void nvme_power_state_swapbytes(struct nvme_power_state *s __unused)
{
+#ifndef _LITTLE_ENDIAN
s->mp = le16toh(s->mp);
s->enlat = le32toh(s->enlat);
s->exlat = le32toh(s->exlat);
s->idlp = le16toh(s->idlp);
s->actp = le16toh(s->actp);
+#endif
}
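Every conversion helper in this header now follows the same shape: the
body is compiled out entirely on little-endian hosts, and the __unused
annotation keeps the then-untouched parameter from tripping
-Wunused-parameter. The pattern in miniature:

	static inline void
	example_swapbytes(uint16_t *v __unused)
	{
	#ifndef _LITTLE_ENDIAN
		*v = le16toh(*v);	/* only big-endian hosts swap */
	#endif
	}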
static inline
-void nvme_controller_data_swapbytes(struct nvme_controller_data *s)
+void nvme_controller_data_swapbytes(struct nvme_controller_data *s __unused)
{
+#ifndef _LITTLE_ENDIAN
int i;
s->vid = le16toh(s->vid);
@@ -1370,6 +1787,10 @@ void nvme_controller_data_swapbytes(struct nvme_controller_data *s)
s->rtd3e = le32toh(s->rtd3e);
s->oaes = le32toh(s->oaes);
s->ctratt = le32toh(s->ctratt);
+ s->rrls = le16toh(s->rrls);
+ s->crdt1 = le16toh(s->crdt1);
+ s->crdt2 = le16toh(s->crdt2);
+ s->crdt3 = le16toh(s->crdt3);
s->oacs = le16toh(s->oacs);
s->wctemp = le16toh(s->wctemp);
s->cctemp = le16toh(s->cctemp);
@@ -1383,6 +1804,13 @@ void nvme_controller_data_swapbytes(struct nvme_controller_data *s)
s->mntmt = le16toh(s->mntmt);
s->mxtmt = le16toh(s->mxtmt);
s->sanicap = le32toh(s->sanicap);
+ s->hmminds = le32toh(s->hmminds);
+ s->hmmaxd = le16toh(s->hmmaxd);
+ s->nsetidmax = le16toh(s->nsetidmax);
+ s->endgidmax = le16toh(s->endgidmax);
+ s->anagrpmax = le32toh(s->anagrpmax);
+ s->nanagrpid = le32toh(s->nanagrpid);
+ s->pels = le32toh(s->pels);
s->maxcmd = le16toh(s->maxcmd);
s->nn = le32toh(s->nn);
s->oncs = le16toh(s->oncs);
@@ -1391,13 +1819,16 @@ void nvme_controller_data_swapbytes(struct nvme_controller_data *s)
s->awupf = le16toh(s->awupf);
s->acwu = le16toh(s->acwu);
s->sgls = le32toh(s->sgls);
+ s->mnan = le32toh(s->mnan);
for (i = 0; i < 32; i++)
nvme_power_state_swapbytes(&s->power_state[i]);
+#endif
}
static inline
-void nvme_namespace_data_swapbytes(struct nvme_namespace_data *s)
+void nvme_namespace_data_swapbytes(struct nvme_namespace_data *s __unused)
{
+#ifndef _LITTLE_ENDIAN
int i;
s->nsze = le64toh(s->nsze);
@@ -1410,13 +1841,24 @@ void nvme_namespace_data_swapbytes(struct nvme_namespace_data *s)
s->nabo = le16toh(s->nabo);
s->nabspf = le16toh(s->nabspf);
s->noiob = le16toh(s->noiob);
+ s->npwg = le16toh(s->npwg);
+ s->npwa = le16toh(s->npwa);
+ s->npdg = le16toh(s->npdg);
+ s->npda = le16toh(s->npda);
+ s->nows = le16toh(s->nows);
+ s->anagrpid = le32toh(s->anagrpid);
+ s->nvmsetid = le16toh(s->nvmsetid);
+ s->endgid = le16toh(s->endgid);
for (i = 0; i < 16; i++)
s->lbaf[i] = le32toh(s->lbaf[i]);
+#endif
}
static inline
-void nvme_error_information_entry_swapbytes(struct nvme_error_information_entry *s)
+void nvme_error_information_entry_swapbytes(
+ struct nvme_error_information_entry *s __unused)
{
+#ifndef _LITTLE_ENDIAN
s->error_count = le64toh(s->error_count);
s->sqid = le16toh(s->sqid);
@@ -1425,18 +1867,14 @@ void nvme_error_information_entry_swapbytes(struct nvme_error_information_entry
s->error_location = le16toh(s->error_location);
s->lba = le64toh(s->lba);
s->nsid = le32toh(s->nsid);
+ s->csi = le64toh(s->csi);
+ s->ttsi = le16toh(s->ttsi);
+#endif
}
static inline
-void nvme_le128toh(void *p)
+void nvme_le128toh(void *p __unused)
{
- /*
- * Upstream, this uses the following comparison:
- * #if _BYTE_ORDER != _LITTLE_ENDIAN
- *
- * Rather than keep this file in compat with only that little bit
- * changed, we'll just float a little patch here for now.
- */
#ifndef _LITTLE_ENDIAN
/* Swap 16 bytes in place */
char *tmp = (char*)p;
@@ -1447,14 +1885,14 @@ void nvme_le128toh(void *p)
tmp[i] = tmp[15-i];
tmp[15-i] = b;
}
-#else
- (void)p;
#endif
}
static inline
-void nvme_health_information_page_swapbytes(struct nvme_health_information_page *s)
+void nvme_health_information_page_swapbytes(
+ struct nvme_health_information_page *s __unused)
{
+#ifndef _LITTLE_ENDIAN
int i;
s->temperature = le16toh(s->temperature);
@@ -1472,30 +1910,80 @@ void nvme_health_information_page_swapbytes(struct nvme_health_information_page
s->error_temp_time = le32toh(s->error_temp_time);
for (i = 0; i < 8; i++)
s->temp_sensor[i] = le16toh(s->temp_sensor[i]);
+ s->tmt1tc = le32toh(s->tmt1tc);
+ s->tmt2tc = le32toh(s->tmt2tc);
+ s->ttftmt1 = le32toh(s->ttftmt1);
+ s->ttftmt2 = le32toh(s->ttftmt2);
+#endif
}
-
static inline
-void nvme_firmware_page_swapbytes(struct nvme_firmware_page *s)
+void nvme_firmware_page_swapbytes(struct nvme_firmware_page *s __unused)
{
+#ifndef _LITTLE_ENDIAN
int i;
for (i = 0; i < 7; i++)
s->revision[i] = le64toh(s->revision[i]);
+#endif
}
static inline
-void nvme_ns_list_swapbytes(struct nvme_ns_list *s)
+void nvme_ns_list_swapbytes(struct nvme_ns_list *s __unused)
{
+#ifndef _LITTLE_ENDIAN
int i;
for (i = 0; i < 1024; i++)
s->ns[i] = le32toh(s->ns[i]);
+#endif
+}
+
+static inline
+void nvme_command_effects_page_swapbytes(
+ struct nvme_command_effects_page *s __unused)
+{
+#ifndef _LITTLE_ENDIAN
+ int i;
+
+ for (i = 0; i < 256; i++)
+ s->acs[i] = le32toh(s->acs[i]);
+ for (i = 0; i < 256; i++)
+ s->iocs[i] = le32toh(s->iocs[i]);
+#endif
+}
+
+static inline
+void nvme_res_notification_page_swapbytes(
+ struct nvme_res_notification_page *s __unused)
+{
+#ifndef _LITTLE_ENDIAN
+ s->log_page_count = le64toh(s->log_page_count);
+ s->nsid = le32toh(s->nsid);
+#endif
+}
+
+static inline
+void nvme_sanitize_status_page_swapbytes(
+ struct nvme_sanitize_status_page *s __unused)
+{
+#ifndef _LITTLE_ENDIAN
+ s->sprog = le16toh(s->sprog);
+ s->sstat = le16toh(s->sstat);
+ s->scdw10 = le32toh(s->scdw10);
+ s->etfo = le32toh(s->etfo);
+ s->etfbe = le32toh(s->etfbe);
+ s->etfce = le32toh(s->etfce);
+ s->etfownd = le32toh(s->etfownd);
+ s->etfbewnd = le32toh(s->etfbewnd);
+ s->etfcewnd = le32toh(s->etfcewnd);
+#endif
}
static inline
-void intel_log_temp_stats_swapbytes(struct intel_log_temp_stats *s)
+void intel_log_temp_stats_swapbytes(struct intel_log_temp_stats *s __unused)
{
+#ifndef _LITTLE_ENDIAN
s->current = le64toh(s->current);
s->overtemp_flag_last = le64toh(s->overtemp_flag_last);
@@ -1506,6 +1994,43 @@ void intel_log_temp_stats_swapbytes(struct intel_log_temp_stats *s)
s->max_oper_temp = le64toh(s->max_oper_temp);
s->min_oper_temp = le64toh(s->min_oper_temp);
s->est_offset = le64toh(s->est_offset);
+#endif
+}
+
+static inline
+void nvme_resv_status_swapbytes(struct nvme_resv_status *s __unused,
+ size_t size __unused)
+{
+#ifndef _LITTLE_ENDIAN
+ u_int i, n;
+
+ s->gen = le32toh(s->gen);
+ n = (s->regctl[1] << 8) | s->regctl[0];
+	n = MIN(n, (size - sizeof(*s)) / sizeof(s->ctrlr[0]));
+ for (i = 0; i < n; i++) {
+ s->ctrlr[i].ctrlr_id = le16toh(s->ctrlr[i].ctrlr_id);
+ s->ctrlr[i].hostid = le64toh(s->ctrlr[i].hostid);
+ s->ctrlr[i].rkey = le64toh(s->ctrlr[i].rkey);
+ }
+#endif
+}
+
+static inline
+void nvme_resv_status_ext_swapbytes(struct nvme_resv_status_ext *s __unused,
+ size_t size __unused)
+{
+#ifndef _LITTLE_ENDIAN
+ u_int i, n;
+
+ s->gen = le32toh(s->gen);
+ n = (s->regctl[1] << 8) | s->regctl[0];
+	n = MIN(n, (size - sizeof(*s)) / sizeof(s->ctrlr[0]));
+ for (i = 0; i < n; i++) {
+ s->ctrlr[i].ctrlr_id = le16toh(s->ctrlr[i].ctrlr_id);
+ s->ctrlr[i].rkey = le64toh(s->ctrlr[i].rkey);
+ nvme_le128toh((void *)s->ctrlr[i].hostid);
+ }
+#endif
}
#endif /* __NVME_H__ */
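The two reservation-status helpers deal with a variable-length log: the
controller reports the number of registered-controller records in
regctl, stored as two discrete bytes to keep the structure packing
explicit. The count is reassembled and clamped against the caller's
buffer before any record is swapped, so a malfunctioning controller
cannot steer the loop past the allocation:

	n = (s->regctl[1] << 8) | s->regctl[0];	/* 16-bit count from bytes */
	n = MIN(n, (size - sizeof(*s)) / sizeof(s->ctrlr[0]));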
diff --git a/usr/src/contrib/bhyve/sys/ata.h b/usr/src/contrib/bhyve/sys/ata.h
index 223bd7b3eb..83eb089dbe 100644
--- a/usr/src/contrib/bhyve/sys/ata.h
+++ b/usr/src/contrib/bhyve/sys/ata.h
@@ -1,4 +1,6 @@
/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
* Copyright (c) 2000 - 2008 Søren Schmidt <sos@FreeBSD.org>
* All rights reserved.
*
@@ -44,6 +46,7 @@ struct ata_params {
#define ATA_ATAPI_TYPE_TAPE 0x0100 /* streaming tape */
#define ATA_ATAPI_TYPE_CDROM 0x0500 /* CD-ROM device */
#define ATA_ATAPI_TYPE_OPTICAL 0x0700 /* optical disk */
+#define ATA_ATAPI_REMOVABLE 0x0080
#define ATA_DRQ_MASK 0x0060
#define ATA_DRQ_SLOW 0x0000 /* cpu 3 ms delay */
#define ATA_DRQ_INTR 0x0020 /* interrupt 10 ms delay */
@@ -64,12 +67,13 @@ struct ata_params {
/*023*/ u_int8_t revision[8]; /* firmware revision */
/*027*/ u_int8_t model[40]; /* model name */
/*047*/ u_int16_t sectors_intr; /* sectors per interrupt */
-/*048*/ u_int16_t usedmovsd; /* double word read/write? */
+/*048*/ u_int16_t tcg; /* Trusted Computing Group */
+#define ATA_SUPPORT_TCG 0x0001
/*049*/ u_int16_t capabilities1;
#define ATA_SUPPORT_DMA 0x0100
#define ATA_SUPPORT_LBA 0x0200
-#define ATA_SUPPORT_IORDY 0x0400
-#define ATA_SUPPORT_IORDYDIS 0x0800
+#define ATA_SUPPORT_IORDYDIS 0x0400
+#define ATA_SUPPORT_IORDY 0x0800
#define ATA_SUPPORT_OVERLAP 0x4000
/*050*/ u_int16_t capabilities2;
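The transposed IORDY defines matter for anything parsing IDENTIFY data:
in word 49 the ATA specification puts "IORDY may be disabled" at bit 10
and "IORDY supported" at bit 11, the reverse of what the old values
claimed. A sketch of a corrected capability check:

	#include <stdbool.h>

	static bool
	ata_has_iordy(const struct ata_params *ident)
	{
		/* word 49 bit 11: device supports IORDY flow control */
		return ((ident->capabilities1 & ATA_SUPPORT_IORDY) != 0);
	}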
@@ -90,6 +94,12 @@ struct ata_params {
/*057*/ u_int16_t current_size_1;
/*058*/ u_int16_t current_size_2;
/*059*/ u_int16_t multi;
+#define ATA_SUPPORT_BLOCK_ERASE_EXT 0x8000
+#define ATA_SUPPORT_OVERWRITE_EXT 0x4000
+#define ATA_SUPPORT_CRYPTO_SCRAMBLE_EXT 0x2000
+#define ATA_SUPPORT_SANITIZE 0x1000
+#define ATA_SUPPORT_SANITIZE_ALLOWED 0x0800
+#define ATA_SUPPORT_ANTIFREEZE_LOCK_EXT 0x0400
#define ATA_MULTI_VALID 0x0100
/*060*/ u_int16_t lba_size_1;
@@ -105,6 +115,7 @@ struct ata_params {
/*069*/ u_int16_t support3;
#define ATA_SUPPORT_RZAT 0x0020
#define ATA_SUPPORT_DRAT 0x4000
+#define ATA_ENCRYPTS_ALL_USER_DATA 0x0010 /* Self-encrypting drive */
#define ATA_SUPPORT_ZONE_MASK 0x0003
#define ATA_SUPPORT_ZONE_NR 0x0000
#define ATA_SUPPORT_ZONE_HOST_AWARE 0x0001
@@ -133,7 +144,8 @@ struct ata_params {
/*77*/ u_int16_t satacapabilities2;
#define ATA_SATA_CURR_GEN_MASK 0x0006
#define ATA_SUPPORT_NCQ_STREAM 0x0010
-#define ATA_SUPPORT_NCQ_QMANAGEMENT 0x0020
+#define ATA_SUPPORT_NCQ_NON_DATA 0x0020
+#define ATA_SUPPORT_NCQ_QMANAGEMENT ATA_SUPPORT_NCQ_NON_DATA
#define ATA_SUPPORT_RCVSND_FPDMA_QUEUED 0x0040
/*78*/ u_int16_t satasupport;
#define ATA_SUPPORT_NONZERO 0x0002
@@ -142,6 +154,7 @@ struct ata_params {
#define ATA_SUPPORT_INORDERDATA 0x0010
#define ATA_SUPPORT_ASYNCNOTIF 0x0020
#define ATA_SUPPORT_SOFTSETPRESERVE 0x0040
+#define ATA_SUPPORT_NCQ_AUTOSENSE 0x0080
/*79*/ u_int16_t sataenabled;
#define ATA_ENABLED_DAPST 0x0080
@@ -234,12 +247,15 @@ struct ata_params {
#define ATA_SUPPORT_FREEFALL 0x0020
#define ATA_SUPPORT_SENSE_REPORT 0x0040
#define ATA_SUPPORT_EPC 0x0080
+#define ATA_SUPPORT_AMAX_ADDR 0x0100
+#define ATA_SUPPORT_DSN 0x0200
/*120*/ u_int16_t enabled2;
#define ATA_ENABLED_WRITEREADVERIFY 0x0002
#define ATA_ENABLED_WRITEUNCORREXT 0x0004
#define ATA_ENABLED_FREEFALL 0x0020
#define ATA_ENABLED_SENSE_REPORT 0x0040
#define ATA_ENABLED_EPC 0x0080
+#define ATA_ENABLED_DSN 0x0200
u_int16_t reserved121[6];
/*127*/ u_int16_t removable_status;
/*128*/ u_int16_t security_status;
@@ -257,10 +273,23 @@ struct ata_params {
/*162*/ u_int16_t cfa_kms_support;
/*163*/ u_int16_t cfa_trueide_modes;
/*164*/ u_int16_t cfa_memory_modes;
- u_int16_t reserved165[4];
+ u_int16_t reserved165[3];
+/*168*/ u_int16_t form_factor;
+#define ATA_FORM_FACTOR_MASK 0x000f
+#define ATA_FORM_FACTOR_NOT_REPORTED 0x0000
+#define ATA_FORM_FACTOR_5_25 0x0001
+#define ATA_FORM_FACTOR_3_5 0x0002
+#define ATA_FORM_FACTOR_2_5 0x0003
+#define ATA_FORM_FACTOR_1_8 0x0004
+#define ATA_FORM_FACTOR_SUB_1_8 0x0005
+#define ATA_FORM_FACTOR_MSATA 0x0006
+#define ATA_FORM_FACTOR_M_2 0x0007
+#define ATA_FORM_FACTOR_MICRO_SSD 0x0008
+#define ATA_FORM_FACTOR_C_FAST 0x0009
/*169*/ u_int16_t support_dsm;
#define ATA_SUPPORT_DSM_TRIM 0x0001
- u_int16_t reserved170[6];
+/*170*/ u_int8_t product_id[8]; /* Additional Product Identifier */
+ u_int16_t reserved174[2];
/*176*/ u_int8_t media_serial[60];
/*206*/ u_int16_t sct;
u_int16_t reserved207[2];
@@ -283,7 +312,7 @@ struct ata_params {
/*223*/ u_int16_t transport_minor;
u_int16_t reserved224[31];
/*255*/ u_int16_t integrity;
-} __packed;
+} __packed __aligned(2);
/* ATA Dataset Management */
#define ATA_DSM_BLK_SIZE 512
@@ -355,7 +384,6 @@ struct ata_params {
#define ATA_SA600 0x49
#define ATA_DMA_MAX 0x4f
-
/* ATA commands */
#define ATA_NOP 0x00 /* NOP */
#define ATA_NF_FLUSHQUEUE 0x00 /* flush queued cmd's */
@@ -391,6 +419,12 @@ struct ata_params {
#define ATA_READ_LOG_DMA_EXT 0x47 /* read log DMA ext - PIO Data-In */
#define ATA_ZAC_MANAGEMENT_IN 0x4a /* ZAC management in */
#define ATA_ZM_REPORT_ZONES 0x00 /* report zones */
+#define ATA_WRITE_LOG_DMA_EXT 0x57 /* WRITE LOG DMA EXT */
+#define ATA_TRUSTED_NON_DATA 0x5b /* TRUSTED NON-DATA */
+#define ATA_TRUSTED_RECEIVE 0x5c /* TRUSTED RECEIVE */
+#define ATA_TRUSTED_RECEIVE_DMA 0x5d /* TRUSTED RECEIVE DMA */
+#define ATA_TRUSTED_SEND 0x5e /* TRUSTED SEND */
+#define ATA_TRUSTED_SEND_DMA 0x5f /* TRUSTED SEND DMA */
#define ATA_READ_FPDMA_QUEUED 0x60 /* read DMA NCQ */
#define ATA_WRITE_FPDMA_QUEUED 0x61 /* write DMA NCQ */
#define ATA_NCQ_NON_DATA 0x63 /* NCQ non-data command */
@@ -410,15 +444,22 @@ struct ata_params {
#define ATA_RFPDMA_ZAC_MGMT_IN 0x02 /* NCQ ZAC mgmt in w/data */
#define ATA_SEP_ATTN 0x67 /* SEP request */
#define ATA_SEEK 0x70 /* seek */
+#define ATA_AMAX_ADDR 0x78 /* Accessible Max Address */
+#define ATA_AMAX_ADDR_GET 0x00 /* GET NATIVE MAX ADDRESS EXT */
+#define ATA_AMAX_ADDR_SET 0x01 /* SET ACCESSIBLE MAX ADDRESS EXT */
+#define ATA_AMAX_ADDR_FREEZE 0x02 /* FREEZE ACCESSIBLE MAX ADDRESS EXT */
#define ATA_ZAC_MANAGEMENT_OUT 0x9f /* ZAC management out */
#define ATA_ZM_CLOSE_ZONE 0x01 /* close zone */
#define ATA_ZM_FINISH_ZONE 0x02 /* finish zone */
#define ATA_ZM_OPEN_ZONE 0x03 /* open zone */
#define ATA_ZM_RWP 0x04 /* reset write pointer */
+#define ATA_DOWNLOAD_MICROCODE 0x92 /* DOWNLOAD MICROCODE */
+#define ATA_DOWNLOAD_MICROCODE_DMA 0x93 /* DOWNLOAD MICROCODE DMA */
#define ATA_PACKET_CMD 0xa0 /* packet command */
#define ATA_ATAPI_IDENTIFY 0xa1 /* get ATAPI params*/
#define ATA_SERVICE 0xa2 /* service command */
#define ATA_SMART_CMD 0xb0 /* SMART command */
+#define ATA_SANITIZE 0xb4 /* sanitize device */
#define ATA_CFA_ERASE 0xc0 /* CFA erase */
#define ATA_READ_MUL 0xc4 /* read multi */
#define ATA_WRITE_MUL 0xc5 /* write multi */
@@ -437,8 +478,11 @@ struct ata_params {
#define ATA_CHECK_POWER_MODE 0xe5 /* device power mode */
#define ATA_SLEEP 0xe6 /* sleep */
#define ATA_FLUSHCACHE 0xe7 /* flush cache to disk */
+#define ATA_WRITE_BUFFER 0xe8 /* write buffer */
#define ATA_WRITE_PM 0xe8 /* write portmultiplier */
+#define ATA_READ_BUFFER_DMA 0xe9 /* read buffer DMA */
#define ATA_FLUSHCACHE48 0xea /* flush cache to disk */
+#define ATA_WRITE_BUFFER_DMA 0xeb /* write buffer DMA */
#define ATA_ATA_IDENTIFY 0xec /* get ATA params */
#define ATA_SETFEATURES 0xef /* features command */
#define ATA_SF_ENAB_WCACHE 0x02 /* enable write cache */
@@ -463,7 +507,6 @@ struct ata_params {
#define ATA_SF_DIS_SRVIRQ 0xde /* disable service interrupt */
#define ATA_SF_LPSAERC 0x62 /* Long Phys Sect Align ErrRep*/
#define ATA_SF_DSN 0x63 /* Device Stats Notification */
-#define ATA_CHECK_POWER_MODE 0xe5 /* Check Power Mode */
#define ATA_SECURITY_SET_PASSWORD 0xf1 /* set drive password */
#define ATA_SECURITY_UNLOCK 0xf2 /* unlock drive using passwd */
#define ATA_SECURITY_ERASE_PREPARE 0xf3 /* prepare to erase drive */
@@ -473,7 +516,6 @@ struct ata_params {
#define ATA_READ_NATIVE_MAX_ADDRESS 0xf8 /* read native max address */
#define ATA_SET_MAX_ADDRESS 0xf9 /* set max address */
-
/* ATAPI commands */
#define ATAPI_TEST_UNIT_READY 0x00 /* check if device is ready */
#define ATAPI_REZERO 0x01 /* rewind */
@@ -534,7 +576,6 @@ struct ata_params {
#define ATAPI_READ_CD 0xbe /* read data */
#define ATAPI_POLL_DSC 0xff /* poll DSC status bit */
-
struct ata_ioc_devices {
int channel;
char name[2][32];
@@ -585,7 +626,7 @@ struct atapi_sense {
u_int8_t specific; /* sense key specific */
#define ATA_SENSE_SPEC_VALID 0x80
#define ATA_SENSE_SPEC_MASK 0x7f
-
+
u_int8_t specific1; /* sense key specific */
u_int8_t specific2; /* sense key specific */
} __packed;
@@ -682,7 +723,7 @@ struct atapi_sense {
#define ATA_IDL_ATA_STRINGS 0x05 /* ATA Strings */
#define ATA_IDL_SECURITY 0x06 /* Security */
#define ATA_IDL_PARALLEL_ATA 0x07 /* Parallel ATA */
-#define ATA_IDL_SERIAL_ATA 0x08 /* Seiral ATA */
+#define ATA_IDL_SERIAL_ATA 0x08 /* Serial ATA */
#define ATA_IDL_ZDI 0x09 /* Zoned Device Information */
struct ata_gp_log_dir {
@@ -965,7 +1006,6 @@ struct ata_security_password {
#define IOCATAGSPINDOWN _IOR('a', 104, int)
#define IOCATASSPINDOWN _IOW('a', 105, int)
-
struct ata_ioc_raid_config {
int lun;
int type;
diff --git a/usr/src/lib/libvmmapi/common/mapfile-vers b/usr/src/lib/libvmmapi/common/mapfile-vers
index 26cfd15426..be0a055490 100644
--- a/usr/src/lib/libvmmapi/common/mapfile-vers
+++ b/usr/src/lib/libvmmapi/common/mapfile-vers
@@ -34,7 +34,6 @@ $mapfile_version 2
SYMBOL_VERSION ILLUMOSprivate {
global:
vcpu_reset;
- vm_active_cpus;
vm_activate_cpu;
vm_active_cpus;
vm_apicid2vcpu;
@@ -46,19 +45,18 @@ SYMBOL_VERSION ILLUMOSprivate {
vm_copy_teardown;
vm_copyin;
vm_copyout;
- vm_create_devmem;
vm_create;
vm_create_devmem;
vm_debug_cpus;
vm_destroy;
- vm_destroy;
+ vm_disable_pptdev_msix;
vm_get_capability;
vm_get_desc;
vm_get_device_fd;
vm_get_devmem_offset;
vm_get_gpa_pmap;
- vm_get_hpet_capabilities;
vm_get_highmem_size;
+ vm_get_hpet_capabilities;
vm_get_intinfo;
vm_get_lowmem_limit;
vm_get_lowmem_size;
@@ -77,10 +75,6 @@ SYMBOL_VERSION ILLUMOSprivate {
vm_inject_exception;
vm_inject_fault;
vm_inject_nmi;
- vm_isa_assert_irq;
- vm_isa_deassert_irq;
- vm_isa_pulse_irq;
- vm_isa_set_irq_trigger;
vm_ioapic_assert_irq;
vm_ioapic_deassert_irq;
vm_ioapic_pincount;
@@ -98,8 +92,10 @@ SYMBOL_VERSION ILLUMOSprivate {
vm_mmap_memseg;
vm_open;
vm_parse_memsize;
+ vm_pmtmr_set_location;
vm_reinit;
vm_restart_instruction;
+ vm_resume_cpu;
vm_rtc_gettime;
vm_rtc_read;
vm_rtc_settime;
@@ -119,9 +115,7 @@ SYMBOL_VERSION ILLUMOSprivate {
vm_suspend;
vm_suspend_cpu;
vm_suspended_cpus;
- vm_resume_cpu;
vm_unassign_pptdev;
- vm_pmtmr_set_location;
vm_wrlock_cycle;
vm_get_run_state;
vm_set_run_state;
diff --git a/usr/src/lib/libvmmapi/common/vmmapi.c b/usr/src/lib/libvmmapi/common/vmmapi.c
index 0b22ca7522..fcb098a74f 100644
--- a/usr/src/lib/libvmmapi/common/vmmapi.c
+++ b/usr/src/lib/libvmmapi/common/vmmapi.c
@@ -1162,7 +1162,22 @@ vm_get_pptdev_limits(struct vmctx *ctx, int bus, int slot, int func,
return (error);
}
+
+int
+vm_disable_pptdev_msix(struct vmctx *ctx, int bus, int slot, int func)
+{
+ struct vm_pptdev ppt;
+
+ bzero(&ppt, sizeof(ppt));
+ ppt.bus = bus;
+ ppt.slot = slot;
+ ppt.func = func;
+
+ return ioctl(ctx->fd, VM_PPTDEV_DISABLE_MSIX, &ppt);
+}
+
#else /* __FreeBSD__ */
+
int
vm_assign_pptdev(struct vmctx *ctx, int pptfd)
{
@@ -1238,6 +1253,15 @@ vm_get_pptdev_limits(struct vmctx *ctx, int pptfd, int *msi_limit,
*msix_limit = pptlimits.msix_limit;
return (error);
}
+
+int
+vm_disable_pptdev_msix(struct vmctx *ctx, int pptfd)
+{
+ struct vm_pptdev pptdev;
+
+ pptdev.pptfd = pptfd;
+ return (ioctl(ctx->fd, VM_PPTDEV_DISABLE_MSIX, &pptdev));
+}
#endif /* __FreeBSD__ */
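Both variants of the new wrapper issue the VM_PPTDEV_DISABLE_MSIX ioctl
so that a passthrough device's MSI-X configuration can be torn down when
the guest disables it; the FreeBSD form addresses the device by
bus/slot/function while the illumos form uses the ppt device file
descriptor. A usage sketch of the illumos variant, assuming pptfd is an
open ppt handle already bound to the VM:

	if (vm_disable_pptdev_msix(ctx, pptfd) != 0)
		err(EXIT_FAILURE, "could not disable MSI-X on passthru device");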
uint64_t *
@@ -1905,7 +1929,8 @@ vm_get_ioctls(size_t *len)
VM_ISA_DEASSERT_IRQ, VM_ISA_PULSE_IRQ, VM_ISA_SET_IRQ_TRIGGER,
VM_SET_CAPABILITY, VM_GET_CAPABILITY, VM_BIND_PPTDEV,
VM_UNBIND_PPTDEV, VM_MAP_PPTDEV_MMIO, VM_PPTDEV_MSI,
- VM_PPTDEV_MSIX, VM_INJECT_NMI, VM_STATS, VM_STAT_DESC,
+ VM_PPTDEV_MSIX, VM_PPTDEV_DISABLE_MSIX,
+ VM_INJECT_NMI, VM_STATS, VM_STAT_DESC,
VM_SET_X2APIC_STATE, VM_GET_X2APIC_STATE,
VM_GET_HPET_CAPABILITIES, VM_GET_GPA_PMAP, VM_GLA2GPA,
VM_GLA2GPA_NOFAULT,
diff --git a/usr/src/lib/libvmmapi/common/vmmapi.h b/usr/src/lib/libvmmapi/common/vmmapi.h
index f7aaa02087..72e43a4e3d 100644
--- a/usr/src/lib/libvmmapi/common/vmmapi.h
+++ b/usr/src/lib/libvmmapi/common/vmmapi.h
@@ -225,6 +225,7 @@ int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot,
int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot,
int func, int idx, uint64_t addr, uint64_t msg,
uint32_t vector_control);
+int vm_disable_pptdev_msix(struct vmctx *ctx, int bus, int slot, int func);
int vm_get_pptdev_limits(struct vmctx *ctx, int bus, int slot, int func,
int *msi_limit, int *msix_limit);
#else /* __FreeBSD__ */
@@ -236,6 +237,7 @@ int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int pptfd,
uint64_t addr, uint64_t msg, int numvec);
int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int pptfd,
int idx, uint64_t addr, uint64_t msg, uint32_t vector_control);
+int vm_disable_pptdev_msix(struct vmctx *ctx, int pptfd);
int vm_get_pptdev_limits(struct vmctx *ctx, int pptfd, int *msi_limit,
int *msix_limit);
#endif /* __FreeBSD__ */
diff --git a/usr/src/uts/i86pc/io/vmm/README.sync b/usr/src/uts/i86pc/io/vmm/README.sync
deleted file mode 100644
index 1b766008a8..0000000000
--- a/usr/src/uts/i86pc/io/vmm/README.sync
+++ /dev/null
@@ -1,37 +0,0 @@
-The bhyve kernel module and its associated userland consumers have been updated
-to the latest upstream FreeBSD sources as of:
-
-commit 8ade7383cafed0f7555cac16ef7f9e956e46eaeb
-Author: grehan <grehan@FreeBSD.org>
-Date: Mon May 25 06:25:31 2020 +0000
-
- Fix pci-passthru MSI issues with OpenBSD guests
-
- - Return 2 x 16-bit registers in the correct byte order
- for a 4-byte read that spans the CMD/STATUS register.
- This reversal was hiding the capabilities-list, which prevented
- the MSI capability from being found for XHCI passthru.
-
- - Reorganize MSI/MSI-x config writes so that a 4-byte write at the
- capability offset would have the read-only portion skipped.
- This prevented MSI interrupts from being enabled.
-
- Reported and extensively tested by Anatoli (me at anatoli dot ws)
-
- PR: 245392
- Reported by: Anatoli (me at anatoli dot ws)
- Reviewed by: jhb (bhyve)
- Approved by: jhb, bz (mentor)
- MFC after: 1 week
- Differential Revision: https://reviews.freebsd.org/D24951
-
-Divergence Notes:
-A previous sync skipped commit c8edafdabc27533d9c51eddc2896e772c16d965c which
-introduced a generic backend functionality to network devices. Without that in
-place, subsequent updates reflect the absence of that subsystem. Integrating
-net backends has not been a priority, given the common use of viona on illumos.
-
-The draft Save/Restore functionality, added in FreeBSD commit
-d3e4e512238b072fb9282e024610b981ba679869, has not been synced into illumos bhyve
-yet. It is not built by default in FreeBSD, so we're not interested in taking
-it until it successfully endures more in-depth testing.
diff --git a/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c b/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c
index a01b06446d..e2f298ae09 100644
--- a/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c
+++ b/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c
@@ -356,7 +356,6 @@ amdvi_cmd_inv_iommu_pages(struct amdvi_softc *softc, uint16_t domain_id,
cmd = amdvi_get_cmd_tail(softc);
KASSERT(cmd != NULL, ("Cmd is NULL"));
-
cmd->opcode = AMDVI_INVD_PAGE_OPCODE;
cmd->word1 = domain_id;
/*
@@ -729,7 +728,6 @@ amdvi_print_pci_cap(device_t dev)
struct amdvi_softc *softc;
uint32_t off, cap;
-
softc = device_get_softc(dev);
off = softc->cap_off;
@@ -869,7 +867,6 @@ amdvi_alloc_intr_resources(struct amdvi_softc *softc)
return (0);
}
-
static void
amdvi_print_dev_cap(struct amdvi_softc *softc)
{
@@ -1121,7 +1118,6 @@ amdvi_free_ptp(uint64_t *ptp, int level)
amdvi_free_ptp((uint64_t *)PHYS_TO_DMAP(ptp[i]
& AMDVI_PT_MASK), level - 1);
-
}
free(ptp, M_AMDVI);
diff --git a/usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c b/usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c
index 11925582ef..96241be8f4 100644
--- a/usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c
+++ b/usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c
@@ -105,7 +105,6 @@ ivrs_hdr_iterate_tbl(ivhd_iter_t iter, void *arg)
default:
printf("AMD-Vi:Not IVHD/IVMD type(%d)", ivrs_hdr->Type);
-
}
ivrs_hdr = (ACPI_IVRS_HEADER *)((uint8_t *)ivrs_hdr +
diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm.c b/usr/src/uts/i86pc/io/vmm/amd/svm.c
index 94dce3fa47..62823b3a65 100644
--- a/usr/src/uts/i86pc/io/vmm/amd/svm.c
+++ b/usr/src/uts/i86pc/io/vmm/amd/svm.c
@@ -475,7 +475,6 @@ vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa,
svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
}
-
/*
* Intercept everything when tracing guest exceptions otherwise
* just intercept machine check exception.
diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_support.s b/usr/src/uts/i86pc/io/vmm/amd/svm_support.s
index c1537b1544..278dd5c5cb 100644
--- a/usr/src/uts/i86pc/io/vmm/amd/svm_support.s
+++ b/usr/src/uts/i86pc/io/vmm/amd/svm_support.s
@@ -37,10 +37,6 @@
/* Porting note: This is named 'svm_support.S' upstream. */
-#define VMLOAD .byte 0x0f, 0x01, 0xda
-#define VMRUN .byte 0x0f, 0x01, 0xd8
-#define VMSAVE .byte 0x0f, 0x01, 0xdb
-
/*
* Flush scratch registers to avoid lingering guest state being used for
@@ -87,7 +83,7 @@ ENTRY_NP(svm_launch)
movq %rsi, SVMSTK_RSI(%rsp)
movq %rdi, SVMSTK_RDI(%rsp)
- /* VMLOAD and VMRUN expect the VMCB physaddr in %rax */
+ /* Save the physical address of the VMCB in %rax */
movq %rdi, %rax
/* Restore guest state. */
@@ -106,9 +102,9 @@ ENTRY_NP(svm_launch)
movq SCTX_RDI(%rsi), %rdi
movq SCTX_RSI(%rsi), %rsi /* %rsi must be restored last */
- VMLOAD
- VMRUN
- VMSAVE
+ vmload %rax
+ vmrun %rax
+ vmsave %rax
/* Grab the svm_regctx pointer */
movq SVMSTK_RSI(%rsp), %rax
diff --git a/usr/src/uts/i86pc/io/vmm/io/ppt.c b/usr/src/uts/i86pc/io/vmm/io/ppt.c
index 2f715bcc42..02446862ea 100644
--- a/usr/src/uts/i86pc/io/vmm/io/ppt.c
+++ b/usr/src/uts/i86pc/io/vmm/io/ppt.c
@@ -825,31 +825,44 @@ fail:
return (B_FALSE);
}
-
-static struct pptdev *
-ppt_findf(int fd)
+static int
+ppt_findf(struct vm *vm, int fd, struct pptdev **pptp)
{
struct pptdev *ppt = NULL;
file_t *fp;
vattr_t va;
+ int err = 0;
- if ((fp = getf(fd)) == NULL) {
- return (NULL);
- }
+ ASSERT(MUTEX_HELD(&pptdev_mtx));
+
+ if ((fp = getf(fd)) == NULL)
+ return (EBADF);
va.va_mask = AT_RDEV;
if (VOP_GETATTR(fp->f_vnode, &va, NO_FOLLOW, fp->f_cred, NULL) != 0 ||
- getmajor(va.va_rdev) != ppt_major)
+ getmajor(va.va_rdev) != ppt_major) {
+ err = EBADF;
goto fail;
+ }
ppt = ddi_get_soft_state(ppt_state, getminor(va.va_rdev));
- if (ppt != NULL)
- return (ppt);
+ if (ppt == NULL) {
+ err = EBADF;
+ goto fail;
+ }
+
+ if (ppt->vm != vm) {
+ err = EBUSY;
+ goto fail;
+ }
+
+ *pptp = ppt;
+ return (0);
fail:
releasef(fd);
- return (NULL);
+ return (err);
}
static void
@@ -992,16 +1005,11 @@ ppt_assign_device(struct vm *vm, int pptfd)
int err = 0;
mutex_enter(&pptdev_mtx);
- ppt = ppt_findf(pptfd);
- if (ppt == NULL) {
+ /* Passing NULL requires the device to be unowned. */
+ err = ppt_findf(NULL, pptfd, &ppt);
+ if (err != 0) {
mutex_exit(&pptdev_mtx);
- return (EBADF);
- }
-
- /* Only one VM may own a device at any given time */
- if (ppt->vm != NULL && ppt->vm != vm) {
- err = EBUSY;
- goto done;
+ return (err);
}
if (pci_save_config_regs(ppt->pptd_dip) != DDI_SUCCESS) {
@@ -1091,20 +1099,14 @@ ppt_unassign_device(struct vm *vm, int pptfd)
int err = 0;
mutex_enter(&pptdev_mtx);
- ppt = ppt_findf(pptfd);
- if (ppt == NULL) {
+ err = ppt_findf(vm, pptfd, &ppt);
+ if (err != 0) {
mutex_exit(&pptdev_mtx);
- return (EBADF);
+ return (err);
}
- /* If this device is not owned by this 'vm' then bail out. */
- if (ppt->vm != vm) {
- err = EBUSY;
- goto done;
- }
ppt_do_unassign(ppt);
-done:
releasef(pptfd);
mutex_exit(&pptdev_mtx);
return (err);
@@ -1135,14 +1137,10 @@ ppt_map_mmio(struct vm *vm, int pptfd, vm_paddr_t gpa, size_t len,
int err = 0;
mutex_enter(&pptdev_mtx);
- ppt = ppt_findf(pptfd);
- if (ppt == NULL) {
+ err = ppt_findf(vm, pptfd, &ppt);
+ if (err != 0) {
mutex_exit(&pptdev_mtx);
- return (EBADF);
- }
- if (ppt->vm != vm) {
- err = EBUSY;
- goto done;
+ return (err);
}
/*
@@ -1208,13 +1206,14 @@ ppt_setup_msi(struct vm *vm, int vcpu, int pptfd, uint64_t addr, uint64_t msg,
return (EINVAL);
mutex_enter(&pptdev_mtx);
- ppt = ppt_findf(pptfd);
- if (ppt == NULL) {
+ err = ppt_findf(vm, pptfd, &ppt);
+ if (err != 0) {
mutex_exit(&pptdev_mtx);
- return (EBADF);
+ return (err);
}
- if (ppt->vm != vm) {
- /* Make sure we own this device */
+
+ /* Reject attempts to enable MSI while MSI-X is active. */
+ if (ppt->msix.num_msgs != 0 && numvec != 0) {
err = EBUSY;
goto done;
}
@@ -1308,13 +1307,14 @@ ppt_setup_msix(struct vm *vm, int vcpu, int pptfd, int idx, uint64_t addr,
int err = 0;
mutex_enter(&pptdev_mtx);
- ppt = ppt_findf(pptfd);
- if (ppt == NULL) {
+ err = ppt_findf(vm, pptfd, &ppt);
+ if (err != 0) {
mutex_exit(&pptdev_mtx);
- return (EBADF);
+ return (err);
}
- /* Make sure we own this device */
- if (ppt->vm != vm) {
+
+ /* Reject attempts to enable MSI-X while MSI is active. */
+ if (ppt->msi.num_msgs != 0) {
err = EBUSY;
goto done;
}
@@ -1410,14 +1410,10 @@ ppt_get_limits(struct vm *vm, int pptfd, int *msilimit, int *msixlimit)
int err = 0;
mutex_enter(&pptdev_mtx);
- ppt = ppt_findf(pptfd);
- if (ppt == NULL) {
+ err = ppt_findf(vm, pptfd, &ppt);
+ if (err != 0) {
mutex_exit(&pptdev_mtx);
- return (EBADF);
- }
- if (ppt->vm != vm) {
- err = EBUSY;
- goto done;
+ return (err);
}
if (ddi_intr_get_navail(ppt->pptd_dip, DDI_INTR_TYPE_MSI,
@@ -1429,7 +1425,26 @@ ppt_get_limits(struct vm *vm, int pptfd, int *msilimit, int *msixlimit)
*msixlimit = -1;
}
-done:
+ releasef(pptfd);
+ mutex_exit(&pptdev_mtx);
+ return (err);
+}
+
+int
+ppt_disable_msix(struct vm *vm, int pptfd)
+{
+ struct pptdev *ppt;
+ int err = 0;
+
+ mutex_enter(&pptdev_mtx);
+ err = ppt_findf(vm, pptfd, &ppt);
+ if (err != 0) {
+ mutex_exit(&pptdev_mtx);
+ return (err);
+ }
+
+ ppt_teardown_msix(ppt);
+
releasef(pptfd);
mutex_exit(&pptdev_mtx);
return (err);
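
Condensed from the hunks above, every ppt entry point now shares a single
lookup-and-ownership pattern: ppt_findf() returns EBADF for a descriptor
that does not name a bound ppt device and EBUSY for a device owned by a
different VM, and on success it holds the file reference for the caller:

	mutex_enter(&pptdev_mtx);
	err = ppt_findf(vm, pptfd, &ppt);
	if (err != 0) {
		mutex_exit(&pptdev_mtx);
		return (err);
	}

	/* ... operate on ppt ... */

	releasef(pptfd);
	mutex_exit(&pptdev_mtx);
	return (err);
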
diff --git a/usr/src/uts/i86pc/io/vmm/io/ppt.h b/usr/src/uts/i86pc/io/vmm/io/ppt.h
index 979c0e18ac..72a768c085 100644
--- a/usr/src/uts/i86pc/io/vmm/io/ppt.h
+++ b/usr/src/uts/i86pc/io/vmm/io/ppt.h
@@ -38,6 +38,7 @@ int ppt_setup_msi(struct vm *vm, int vcpu, int pptfd, uint64_t addr,
uint64_t msg, int numvec);
int ppt_setup_msix(struct vm *vm, int vcpu, int pptfd, int idx, uint64_t addr,
uint64_t msg, uint32_t vector_control);
+int ppt_disable_msix(struct vm *vm, int pptfd);
int ppt_assigned_devices(struct vm *vm);
boolean_t ppt_is_mmio(struct vm *vm, vm_paddr_t gpa);
int ppt_get_limits(struct vm *vm, int pptfd, int *msilimit, int *msixlimit);
diff --git a/usr/src/uts/i86pc/io/vmm/io/ppt.mapfile b/usr/src/uts/i86pc/io/vmm/io/ppt.mapfile
index 708818d78e..1b08b06b58 100644
--- a/usr/src/uts/i86pc/io/vmm/io/ppt.mapfile
+++ b/usr/src/uts/i86pc/io/vmm/io/ppt.mapfile
@@ -41,6 +41,7 @@ SYMBOL_VERSION ILLUMOSprivate {
ppt_assigned_devices;
ppt_is_mmio;
ppt_assign_device;
+ ppt_disable_msix;
ppt_unassign_device;
ppt_unassign_all;
ppt_map_mmio;
diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpit.c b/usr/src/uts/i86pc/io/vmm/io/vatpit.c
index d8cfc1beb6..024aa076f7 100644
--- a/usr/src/uts/i86pc/io/vmm/io/vatpit.c
+++ b/usr/src/uts/i86pc/io/vmm/io/vatpit.c
@@ -76,7 +76,6 @@ struct vatpit_callout_arg {
int channel_num;
};
-
struct channel {
int mode;
uint16_t initial; /* initial counter value */
@@ -293,7 +292,6 @@ pit_readback(struct vatpit *vatpit, uint8_t cmd)
return (error);
}
-
static int
vatpit_update_mode(struct vatpit *vatpit, uint8_t val)
{
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
index cebcaf0fdb..557d32b764 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
@@ -490,6 +490,7 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
case VM_RTC_WRITE:
case VM_RTC_SETTIME:
case VM_RTC_GETTIME:
+ case VM_PPTDEV_DISABLE_MSIX:
#ifndef __FreeBSD__
case VM_DEVMEM_GETOFFSET:
#endif
@@ -616,6 +617,16 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
pptmsix.vector_control);
break;
}
+ case VM_PPTDEV_DISABLE_MSIX: {
+ struct vm_pptdev pptdev;
+
+ if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
+ error = EFAULT;
+ break;
+ }
+ error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd);
+ break;
+ }
case VM_MAP_PPTDEV_MMIO: {
struct vm_pptdev_mmio pptmmio;
diff --git a/usr/src/uts/i86pc/sys/vmm_dev.h b/usr/src/uts/i86pc/sys/vmm_dev.h
index f5d031bfd4..f4a68636b3 100644
--- a/usr/src/uts/i86pc/sys/vmm_dev.h
+++ b/usr/src/uts/i86pc/sys/vmm_dev.h
@@ -368,6 +368,7 @@ struct vm_run_state {
#define VM_SUSPEND_CPU (VMM_IOC_BASE | 0x1d)
#define VM_RESUME_CPU (VMM_IOC_BASE | 0x1e)
+#define VM_PPTDEV_DISABLE_MSIX (VMM_IOC_BASE | 0x1f)
#define VM_DEVMEM_GETOFFSET (VMM_IOC_BASE | 0xff)
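
With the command number defined, the operation reaches the kernel as an
ordinary ioctl carrying a struct vm_pptdev, mirroring the libvmmapi wrapper
at the top of this change. A sketch, assuming vmfd is an open VM control
device and that the header install paths match the in-tree locations:

	#include <sys/ioctl.h>
	#include <sys/vmm_dev.h>

	static int
	pptdev_disable_msix(int vmfd, int pptfd)
	{
		struct vm_pptdev pptdev;

		pptdev.pptfd = pptfd;	/* fd of the assigned ppt device */
		/* On failure errno is EBADF (not a ppt fd) or EBUSY (owned elsewhere). */
		return (ioctl(vmfd, VM_PPTDEV_DISABLE_MSIX, &pptdev));
	}
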
diff --git a/usr/src/uts/intel/ia32/ml/modstubs.s b/usr/src/uts/intel/ia32/ml/modstubs.s
index 49c0cce31c..070083d8f1 100644
--- a/usr/src/uts/intel/ia32/ml/modstubs.s
+++ b/usr/src/uts/intel/ia32/ml/modstubs.s
@@ -1301,6 +1301,7 @@ fcnname/**/_info: \
WSTUB(ppt, ppt_map_mmio, nomod_einval);
WSTUB(ppt, ppt_setup_msi, nomod_einval);
WSTUB(ppt, ppt_setup_msix, nomod_einval);
+ WSTUB(ppt, ppt_disable_msix, nomod_einval);
WSTUB(ppt, ppt_assigned_devices, nomod_zero);
WSTUB(ppt, ppt_is_mmio, nomod_zero);
WSTUB(ppt, ppt_assign_device, nomod_einval);