author      Andy Fiddaman <omnios@citrus-it.co.uk>    2020-12-21 14:44:26 +0000
committer   Andy Fiddaman <omnios@citrus-it.co.uk>    2021-02-02 23:16:11 +0000
commit      6960cd891105f9a002a0327e31a6182f9c6de88e (patch)
tree        12af9c786c75f0a9273388aad743d471590100bc
parent      ce8560eeb961d528e27685fcdd2ffb03e9478dbf (diff)
download    illumos-joyent-6960cd891105f9a002a0327e31a6182f9c6de88e.tar.gz
13379 bhyve upstream sync 2020 Dec
Reviewed by: Patrick Mooney <pmooney@pfmooney.com>
Reviewed by: Jorge Schrauwen <sjorge@blackdot.be>
Approved by: Robert Mustacchi <rm@fingolfin.org>
35 files changed, 2709 insertions, 1079 deletions
diff --git a/usr/src/cmd/bhyve/Makefile b/usr/src/cmd/bhyve/Makefile
index 2229cdf454..1c1b99c52b 100644
--- a/usr/src/cmd/bhyve/Makefile
+++ b/usr/src/cmd/bhyve/Makefile
@@ -61,6 +61,7 @@ SRCS =	acpi.c			\
 	pci_virtio_rnd.c	\
 	pci_virtio_viona.c	\
 	pci_xhci.c		\
+	pctestdev.c		\
 	pm.c			\
 	post.c			\
 	ps2kbd.c		\
@@ -97,7 +98,7 @@ SRCS =	acpi.c			\
 #pci_hda.c \
 # The bhyve generic net-backend stuff has been ignored by us at the moment
-# because SmartOS users prefer to use viona for its superior network perf.
+# because illumos users prefer to use viona for its superior network perf.
 #net_backends.c \
diff --git a/usr/src/cmd/bhyve/README.sync b/usr/src/cmd/bhyve/README.sync
index af90209ac3..2384413853 100644
--- a/usr/src/cmd/bhyve/README.sync
+++ b/usr/src/cmd/bhyve/README.sync
@@ -1,15 +1,42 @@
+Git commit hashes in this file refer to the official FreeBSD distributed
+public Git repository at https://git.freebsd.org/src.git
+
 The bhyve kernel module and its associated userland consumers have been updated
-to the latest upstream FreeBSD sources as documented in
+to the latest upstream FreeBSD sources as of:
+
+commit 2bb4be0f86501ec0565dba3d37ce0f7d7fc9c464
+Author: grehan <grehan@FreeBSD.org>
+Date:   Fri Dec 18 00:38:48 2020 +0000
+
+    Fix issues with various VNC clients.
+
+    PR:            250795
+    Submitted by:  Marko Kiiskila <marko@apache.org>
+    Reviewed by:   jhb (bhyve)
+    MFC after:     3 weeks
+    Relnotes:      yes
+    Differential Revision:  https://reviews.freebsd.org/D27605
+
+Divergence Notes:
 
-	usr/src/uts/i86pc/io/vmm/README.sync
+A previous sync skipped commit 0ff7076bdbc6dae5ea44c0acdb567e1cede199d1 which
+introduced a generic backend functionality to network devices. Without that in
+place, subsequent updates reflect the absence of that subsystem. Integrating
+net backends has not been a priority, given the common use of viona on illumos.
 
-The userland components in this directory have further been updated with the
-following cherry-picked updates which will need taking into account during the
-next sync.
+The draft Save/Restore functionality, added in FreeBSD commit
+483d953a86a2507355f8287c5107dc827a0ff516, has not been synced into illumos
+bhyve yet.  It is not built by default in FreeBSD, so we're not interested in
+taking it until it successfully endures more in-depth testing.
 
-	commit 44a544d41c504fbe37f836eb503e4ae721daada9
-	Author: grehan <grehan@FreeBSD.org>
-	Date:   Fri Dec 18 00:38:48 2020 +0000
+The VirtFS filesystem sharing feature, added in FreeBSD commit
+100353cfbf882e23c911300ebd0cb458bd3ee975, has not been synced into illumos
+bhyve yet.  It depends on the userland lib9p which needs a fair amount of work
+to build and run on illumos.  The integration of this feature is being tracked
+in https://www.illumos.org/issues/13380
 
-	Fix issues with various VNC clients.
+The stub usr/src/compat/bhyve/stdatomic.h file only includes enough glue
+to satisfy the use of <stdatomic.h> in usr/src/cmd/bhyve/rfb.c, and in
+particular assumes that atomic variables are sized as an int.  If other bhyve
+pieces start using stdatomic.h, this will need enhancing.
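To make the stdatomic.h constraint above concrete, here is a minimal sketch
(not part of the commit; the identifier names are hypothetical) of the
int-sized usage that the stub has to satisfy:

    #include <stdatomic.h>

    /*
     * Hypothetical rfb.c-style counter.  An _Atomic int is the only shape
     * the stub stdatomic.h currently supports (sizeof == sizeof(int)).
     */
    static _Atomic int rfb_clients;

    static void
    rfb_client_attach(void)
    {
    	atomic_fetch_add(&rfb_clients, 1);
    }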
diff --git a/usr/src/cmd/bhyve/bhyverun.c b/usr/src/cmd/bhyve/bhyverun.c
index 8522d85bd9..53a92f6dd7 100644
--- a/usr/src/cmd/bhyve/bhyverun.c
+++ b/usr/src/cmd/bhyve/bhyverun.c
@@ -199,6 +199,7 @@ static int gdb_port = 0;
 static int guest_vmexit_on_hlt, guest_vmexit_on_pause;
 static int virtio_msix = 1;
 static int x2apic_mode = 0;	/* default is xAPIC */
+static int destroy_on_poweroff = 0;
 
 static int strictio;
 static int strictmsr = 1;
@@ -244,7 +245,11 @@ usage(int code)
 {
 
 	fprintf(stderr,
-	    "Usage: %s [-abehuwxACHPSWY]\n"
+#ifdef __FreeBSD__
+	    "Usage: %s [-abehuwxACDHPSWY]\n"
+#else
+	    "Usage: %s [-abdehuwxACDHPSWY]\n"
+#endif
	    "       %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n"
	    "       %*s [-g <gdb port>] [-l <lpc>]\n"
 #ifdef __FreeBSD__
@@ -259,6 +264,7 @@ usage(int code)
 #ifndef __FreeBSD__
	    "       -d: suspend cpu at boot\n"
 #endif
+	    "       -D: destroy on power-off\n"
	    "       -e: exit on unhandled I/O access\n"
	    "       -g: gdb port\n"
	    "       -h: help\n"
@@ -980,6 +986,8 @@ vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 	case VM_SUSPEND_RESET:
 		exit(0);
 	case VM_SUSPEND_POWEROFF:
+		if (destroy_on_poweroff)
+			vm_destroy(ctx);
 		exit(1);
 	case VM_SUSPEND_HALT:
 		exit(2);
@@ -1268,9 +1276,9 @@ main(int argc, char *argv[])
 	memflags = 0;
 #ifdef __FreeBSD__
-	optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:B:U:";
+	optstr = "abehuwxACDHIPSWYp:g:G:c:s:m:l:B:U:";
 #else
-	optstr = "abdehuwxACHIPSWYg:G:c:s:m:l:B:U:";
+	optstr = "abdehuwxACDHIPSWYg:G:c:s:m:l:B:U:";
 #endif
 	while ((c = getopt(argc, argv, optstr)) != -1) {
 		switch (c) {
@@ -1283,6 +1291,9 @@ main(int argc, char *argv[])
 		case 'b':
 			bvmcons = 1;
 			break;
+		case 'D':
+			destroy_on_poweroff = 1;
+			break;
 		case 'B':
 			if (smbios_parse(optarg) != 0) {
 				errx(EX_USAGE, "invalid SMBIOS "
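With the new -D flag, a guest-initiated power-off also destroys the VM
instance instead of leaving it behind.  An illustrative invocation (device
slots and paths here are placeholders, not taken from this commit) would be:

    bhyve -D -c 2 -m 2048 -s 0,hostbridge -s 3,ahci-hd,/data/disk0.img -l com1,stdio myvm

Without -D, the instance persists after the guest powers off and must be
cleaned up separately with bhyvectl --vm=myvm --destroy.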
diff --git a/usr/src/cmd/bhyve/pci_ahci.c b/usr/src/cmd/bhyve/pci_ahci.c
index 57934f9c84..0d4951a61e 100644
--- a/usr/src/cmd/bhyve/pci_ahci.c
+++ b/usr/src/cmd/bhyve/pci_ahci.c
@@ -136,9 +136,9 @@ struct ahci_ioreq {
 struct ahci_port {
 	struct blockif_ctxt *bctx;
 	struct pci_ahci_softc *pr_sc;
+	struct ata_params ata_ident;
 	uint8_t *cmd_lst;
 	uint8_t *rfis;
-	char ident[AHCI_PORT_IDENT];
 	int port;
 	int atapi;
 	int reset;
@@ -983,7 +983,50 @@ handle_identify(struct ahci_port *p, int slot, uint8_t *cfis)
 		ahci_write_fis_d2h(p, slot, cfis,
 		    (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
 	} else {
-		uint16_t buf[256];
+		ahci_write_fis_piosetup(p);
+		write_prdt(p, slot, cfis, (void *)&p->ata_ident,
+		    sizeof(struct ata_params));
+		ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY);
+	}
+}
+
+static void
+ata_identify_init(struct ahci_port *p, int atapi)
+{
+	struct ata_params *ata_ident = &p->ata_ident;
+
+	if (atapi) {
+		ata_ident->config = ATA_PROTO_ATAPI | ATA_ATAPI_TYPE_CDROM |
+		    ATA_ATAPI_REMOVABLE | ATA_DRQ_FAST;
+		ata_ident->capabilities1 = ATA_SUPPORT_LBA | ATA_SUPPORT_DMA;
+		ata_ident->capabilities2 = (1 << 14 | 1);
+		ata_ident->atavalid = ATA_FLAG_64_70 | ATA_FLAG_88;
+		ata_ident->obsolete62 = 0x3f;
+		ata_ident->mwdmamodes = 7;
+		if (p->xfermode & ATA_WDMA0)
+			ata_ident->mwdmamodes |= (1 << ((p->xfermode & 7) + 8));
+		ata_ident->apiomodes = 3;
+		ata_ident->mwdmamin = 0x0078;
+		ata_ident->mwdmarec = 0x0078;
+		ata_ident->pioblind = 0x0078;
+		ata_ident->pioiordy = 0x0078;
+		ata_ident->satacapabilities =
+		    (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3);
+		ata_ident->satacapabilities2 =
+		    ((p->ssts & ATA_SS_SPD_MASK) >> 3);
+		ata_ident->satasupport = ATA_SUPPORT_NCQ_STREAM;
+		ata_ident->version_major = 0x3f0;
+		ata_ident->support.command1 = (ATA_SUPPORT_POWERMGT |
+		    ATA_SUPPORT_PACKET | ATA_SUPPORT_RESET | ATA_SUPPORT_NOP);
+		ata_ident->support.command2 = (1 << 14);
+		ata_ident->support.extension = (1 << 14);
+		ata_ident->enabled.command1 = (ATA_SUPPORT_POWERMGT |
+		    ATA_SUPPORT_PACKET | ATA_SUPPORT_RESET | ATA_SUPPORT_NOP);
+		ata_ident->enabled.extension = (1 << 14);
+		ata_ident->udmamodes = 0x7f;
+		if (p->xfermode & ATA_UDMA0)
+			ata_ident->udmamodes |= (1 << ((p->xfermode & 7) + 8));
+		ata_ident->transport_major = 0x1020;
+		ata_ident->integrity = 0x00a5;
+	} else {
 		uint64_t sectors;
 		int sectsz, psectsz, psectoff, candelete, ro;
 		uint16_t cyl;
@@ -995,87 +1038,84 @@ handle_identify(struct ahci_port *p, int slot, uint8_t *cfis)
 		sectors = blockif_size(p->bctx) / sectsz;
 		blockif_chs(p->bctx, &cyl, &heads, &sech);
 		blockif_psectsz(p->bctx, &psectsz, &psectoff);
-		memset(buf, 0, sizeof(buf));
-		buf[0] = 0x0040;
-		buf[1] = cyl;
-		buf[3] = heads;
-		buf[6] = sech;
-		ata_string((uint8_t *)(buf+10), p->ident, 20);
-		ata_string((uint8_t *)(buf+23), "001", 8);
-		ata_string((uint8_t *)(buf+27), "BHYVE SATA DISK", 40);
-		buf[47] = (0x8000 | 128);
-		buf[48] = 0;
-		buf[49] = (1 << 8 | 1 << 9 | 1 << 11);
-		buf[50] = (1 << 14);
-		buf[53] = (1 << 1 | 1 << 2);
+		ata_ident->config = ATA_DRQ_FAST;
+		ata_ident->cylinders = cyl;
+		ata_ident->heads = heads;
+		ata_ident->sectors = sech;
+
+		ata_ident->sectors_intr = (0x8000 | 128);
+		ata_ident->tcg = 0;
+
+		ata_ident->capabilities1 = ATA_SUPPORT_DMA |
+		    ATA_SUPPORT_LBA | ATA_SUPPORT_IORDY;
+		ata_ident->capabilities2 = (1 << 14);
+		ata_ident->atavalid = ATA_FLAG_64_70 | ATA_FLAG_88;
 		if (p->mult_sectors)
-			buf[59] = (0x100 | p->mult_sectors);
+			ata_ident->multi = (ATA_MULTI_VALID | p->mult_sectors);
 		if (sectors <= 0x0fffffff) {
-			buf[60] = sectors;
-			buf[61] = (sectors >> 16);
+			ata_ident->lba_size_1 = sectors;
+			ata_ident->lba_size_2 = (sectors >> 16);
 		} else {
-			buf[60] = 0xffff;
-			buf[61] = 0x0fff;
+			ata_ident->lba_size_1 = 0xffff;
+			ata_ident->lba_size_2 = 0x0fff;
 		}
-		buf[63] = 0x7;
+		ata_ident->mwdmamodes = 0x7;
 		if (p->xfermode & ATA_WDMA0)
-			buf[63] |= (1 << ((p->xfermode & 7) + 8));
-		buf[64] = 0x3;
-		buf[65] = 120;
-		buf[66] = 120;
-		buf[67] = 120;
-		buf[68] = 120;
-		buf[69] = 0;
-		buf[75] = 31;
-		buf[76] = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3 |
-		    ATA_SUPPORT_NCQ);
-		buf[77] = (ATA_SUPPORT_RCVSND_FPDMA_QUEUED |
-		    (p->ssts & ATA_SS_SPD_MASK) >> 3);
-		buf[80] = 0x3f0;
-		buf[81] = 0x28;
-		buf[82] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE|
-		    ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP);
-		buf[83] = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE |
-		    ATA_SUPPORT_FLUSHCACHE48 | 1 << 14);
-		buf[84] = (1 << 14);
-		buf[85] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE|
-		    ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP);
-		buf[86] = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE |
-		    ATA_SUPPORT_FLUSHCACHE48 | 1 << 15);
-		buf[87] = (1 << 14);
-		buf[88] = 0x7f;
+			ata_ident->mwdmamodes |= (1 << ((p->xfermode & 7) + 8));
+		ata_ident->apiomodes = 0x3;
+		ata_ident->mwdmamin = 0x0078;
+		ata_ident->mwdmarec = 0x0078;
+		ata_ident->pioblind = 0x0078;
+		ata_ident->pioiordy = 0x0078;
+		ata_ident->support3 = 0;
+		ata_ident->queue = 31;
+		ata_ident->satacapabilities = (ATA_SATA_GEN1 | ATA_SATA_GEN2 |
+		    ATA_SATA_GEN3 | ATA_SUPPORT_NCQ);
+		ata_ident->satacapabilities2 = (ATA_SUPPORT_RCVSND_FPDMA_QUEUED |
+		    (p->ssts & ATA_SS_SPD_MASK) >> 3);
+		ata_ident->version_major = 0x3f0;
+		ata_ident->version_minor = 0x28;
+		ata_ident->support.command1 = (ATA_SUPPORT_POWERMGT |
+		    ATA_SUPPORT_WRITECACHE | ATA_SUPPORT_LOOKAHEAD |
+		    ATA_SUPPORT_NOP);
+		ata_ident->support.command2 = (ATA_SUPPORT_ADDRESS48 |
+		    ATA_SUPPORT_FLUSHCACHE | ATA_SUPPORT_FLUSHCACHE48 | 1 << 14);
+		ata_ident->support.extension = (1 << 14);
+		ata_ident->enabled.command1 = (ATA_SUPPORT_POWERMGT |
+		    ATA_SUPPORT_WRITECACHE | ATA_SUPPORT_LOOKAHEAD |
+		    ATA_SUPPORT_NOP);
+		ata_ident->enabled.command2 = (ATA_SUPPORT_ADDRESS48 |
+		    ATA_SUPPORT_FLUSHCACHE | ATA_SUPPORT_FLUSHCACHE48 | 1 << 15);
+		ata_ident->enabled.extension = (1 << 14);
+		ata_ident->udmamodes = 0x7f;
 		if (p->xfermode & ATA_UDMA0)
-			buf[88] |= (1 << ((p->xfermode & 7) + 8));
-		buf[100] = sectors;
-		buf[101] = (sectors >> 16);
-		buf[102] = (sectors >> 32);
-		buf[103] = (sectors >> 48);
+			ata_ident->udmamodes |= (1 << ((p->xfermode & 7) + 8));
+		ata_ident->lba_size48_1 = sectors;
+		ata_ident->lba_size48_2 = (sectors >> 16);
+		ata_ident->lba_size48_3 = (sectors >> 32);
+		ata_ident->lba_size48_4 = (sectors >> 48);
+
 		if (candelete && !ro) {
-			buf[69] |= ATA_SUPPORT_RZAT | ATA_SUPPORT_DRAT;
-			buf[105] = 1;
-			buf[169] = ATA_SUPPORT_DSM_TRIM;
+			ata_ident->support3 |= ATA_SUPPORT_RZAT |
+			    ATA_SUPPORT_DRAT;
+			ata_ident->max_dsm_blocks = 1;
+			ata_ident->support_dsm = ATA_SUPPORT_DSM_TRIM;
 		}
-		buf[106] = 0x4000;
-		buf[209] = 0x4000;
+		ata_ident->pss = ATA_PSS_VALID_VALUE;
+		ata_ident->lsalign = 0x4000;
 		if (psectsz > sectsz) {
-			buf[106] |= 0x2000;
-			buf[106] |= ffsl(psectsz / sectsz) - 1;
-			buf[209] |= (psectoff / sectsz);
+			ata_ident->pss |= ATA_PSS_MULTLS;
+			ata_ident->pss |= ffsl(psectsz / sectsz) - 1;
+			ata_ident->lsalign |= (psectoff / sectsz);
 		}
 		if (sectsz > 512) {
-			buf[106] |= 0x1000;
-			buf[117] = sectsz / 2;
-			buf[118] = ((sectsz / 2) >> 16);
+			ata_ident->pss |= ATA_PSS_LSSABOVE512;
+			ata_ident->lss_1 = sectsz / 2;
+			ata_ident->lss_2 = ((sectsz / 2) >> 16);
 		}
-		buf[119] = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14);
-		buf[120] = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14);
-		buf[222] = 0x1020;
-		buf[255] = 0x00a5;
-		ahci_checksum((uint8_t *)buf, sizeof(buf));
-		ahci_write_fis_piosetup(p);
-		write_prdt(p, slot, cfis, (void *)buf, sizeof(buf));
-		ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY);
+		ata_ident->support2 = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14);
+		ata_ident->enabled2 = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14);
+		ata_ident->transport_major = 0x1020;
+		ata_ident->integrity = 0x00a5;
 	}
+	ahci_checksum((uint8_t *)ata_ident, sizeof(struct ata_params));
 }
 
 static void
@@ -1085,44 +1125,8 @@ handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis)
 		ahci_write_fis_d2h(p, slot, cfis,
 		    (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
 	} else {
-		uint16_t buf[256];
-
-		memset(buf, 0, sizeof(buf));
-		buf[0] = (2 << 14 | 5 << 8 | 1 << 7 | 2 << 5);
-		ata_string((uint8_t *)(buf+10), p->ident, 20);
-		ata_string((uint8_t *)(buf+23), "001", 8);
-		ata_string((uint8_t *)(buf+27), "BHYVE SATA DVD ROM", 40);
-		buf[49] = (1 << 9 | 1 << 8);
-		buf[50] = (1 << 14 | 1);
-		buf[53] = (1 << 2 | 1 << 1);
-		buf[62] = 0x3f;
-		buf[63] = 7;
-		if (p->xfermode & ATA_WDMA0)
-			buf[63] |= (1 << ((p->xfermode & 7) + 8));
-		buf[64] = 3;
-		buf[65] = 120;
-		buf[66] = 120;
-		buf[67] = 120;
-		buf[68] = 120;
-		buf[76] = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3);
-		buf[77] = ((p->ssts & ATA_SS_SPD_MASK) >> 3);
-		buf[78] = (1 << 5);
-		buf[80] = 0x3f0;
-		buf[82] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET |
-		    ATA_SUPPORT_RESET | ATA_SUPPORT_NOP);
-		buf[83] = (1 << 14);
-		buf[84] = (1 << 14);
-		buf[85] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET |
-		    ATA_SUPPORT_RESET | ATA_SUPPORT_NOP);
-		buf[87] = (1 << 14);
-		buf[88] = 0x7f;
-		if (p->xfermode & ATA_UDMA0)
-			buf[88] |= (1 << ((p->xfermode & 7) + 8));
-		buf[222] = 0x1020;
-		buf[255] = 0x00a5;
-		ahci_checksum((uint8_t *)buf, sizeof(buf));
 		ahci_write_fis_piosetup(p);
-		write_prdt(p, slot, cfis, (void *)buf, sizeof(buf));
+		write_prdt(p, slot, cfis, (void *)&p->ata_ident,
+		    sizeof(struct ata_params));
 		ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY);
 	}
 }
@@ -2314,6 +2318,10 @@ pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi)
 	MD5_CTX mdctx;
 	u_char digest[16];
 	char *next, *next2;
+	char *bopt, *uopt, *xopts, *config;
+	FILE *fp;
+	size_t block_len;
+	int comma, optpos;
 
 	ret = 0;
@@ -2330,6 +2338,9 @@ pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi)
 	slots = 32;
 	for (p = 0; p < MAX_PORTS && opts != NULL; p++, opts = next) {
+		struct ata_params *ata_ident = &sc->port[p].ata_ident;
+		memset(ata_ident, 0, sizeof(struct ata_params));
+
 		/* Identify and cut off type of present port. */
 		if (strncmp(opts, "hd:", 3) == 0) {
 			atapi = 0;
@@ -2352,13 +2363,82 @@ pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi)
 		if (opts[0] == 0)
 			continue;
 
+		uopt = strdup(opts);
+		bopt = NULL;
+		fp = open_memstream(&bopt, &block_len);
+		comma = 0;
+		optpos = 0;
+
+		for (xopts = strtok(uopt, ",");
+		    xopts != NULL;
+		    xopts = strtok(NULL, ",")) {
+
+			/* First option assume as block filename. */
+			if (optpos == 0) {
+				/*
+				 * Create an identifier for the backing file.
+				 * Use parts of the md5 sum of the filename
+				 */
+				char ident[AHCI_PORT_IDENT];
+
+				MD5Init(&mdctx);
+				MD5Update(&mdctx, opts, strlen(opts));
+				MD5Final(digest, &mdctx);
+				snprintf(ident, AHCI_PORT_IDENT,
+				    "BHYVE-%02X%02X-%02X%02X-%02X%02X",
+				    digest[0], digest[1], digest[2],
+				    digest[3], digest[4], digest[5]);
+				ata_string((uint8_t *)&ata_ident->serial,
+				    ident, 20);
+				ata_string((uint8_t *)&ata_ident->revision,
+				    "001", 8);
+				if (atapi) {
+					ata_string((uint8_t *)&ata_ident->model,
+					    "BHYVE SATA DVD ROM", 40);
+				} else {
+					ata_string((uint8_t *)&ata_ident->model,
+					    "BHYVE SATA DISK", 40);
+				}
+			}
+
+			if ((config = strchr(xopts, '=')) != NULL) {
+				*config++ = '\0';
+				if (!strcmp("nmrr", xopts)) {
+					ata_ident->media_rotation_rate =
+					    atoi(config);
+				} else if (!strcmp("ser", xopts)) {
+					ata_string((uint8_t *)
+					    (&ata_ident->serial), config, 20);
+				} else if (!strcmp("rev", xopts)) {
+					ata_string((uint8_t *)
+					    (&ata_ident->revision), config, 8);
+				} else if (!strcmp("model", xopts)) {
+					ata_string((uint8_t *)
+					    (&ata_ident->model), config, 40);
+				} else {
+					/* Pass all other options to blockif_open. */
+					*--config = '=';
+					fprintf(fp, "%s%s", comma ? "," : "",
+					    xopts);
+					comma = 1;
+				}
+			} else {
+				/* Pass all other options to blockif_open. */
+				fprintf(fp, "%s%s", comma ? "," : "", xopts);
+				comma = 1;
+			}
+			optpos++;
+		}
+		free(uopt);
+		fclose(fp);
+
+		DPRINTF("%s\n", bopt);
+
 		/*
 		 * Attempt to open the backing image. Use the PCI slot/func
 		 * and the port number for the identifier string.
 		 */
 		snprintf(bident, sizeof(bident), "%d:%d:%d", pi->pi_slot,
 		    pi->pi_func, p);
-		bctxt = blockif_open(opts, bident);
+		bctxt = blockif_open(bopt, bident);
+		free(bopt);
+
 		if (bctxt == NULL) {
 			sc->ports = p;
 			ret = 1;
@@ -2380,17 +2460,7 @@ pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi)
 		(void) blockif_set_wce(bctxt, 1);
 #endif
 
-		/*
-		 * Create an identifier for the backing file.
-		 * Use parts of the md5 sum of the filename
-		 */
-		MD5Init(&mdctx);
-		MD5Update(&mdctx, opts, strlen(opts));
-		MD5Final(digest, &mdctx);
-		snprintf(sc->port[p].ident, AHCI_PORT_IDENT,
-		    "BHYVE-%02X%02X-%02X%02X-%02X%02X",
-		    digest[0], digest[1], digest[2], digest[3], digest[4],
-		    digest[5]);
+		ata_identify_init(&sc->port[p], atapi);
 
 		/*
 		 * Allocate blockif request structures and add them
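The new AHCI option parsing above lets a configuration override the
synthesized identify data before the remaining options are passed through to
blockif_open.  An illustrative slot configuration (path and values are
placeholders, not from this commit):

    -s 3,ahci-hd,/dev/zvol/rdsk/tank/vm0,ser=SN000001,rev=1.0,model=Virtual Disk,nmrr=1

Here nmrr=1 sets the ATA media rotation rate word to 1, which advertises a
non-rotating (SSD-like) medium to the guest; unrecognized key=value options
still flow to blockif_open unchanged.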
 */
-	if (size > 32 * 1024 * 1024) {
-		/*
-		 * XXX special case for device requiring peer-peer DMA
-		 */
-		if (size == 0x100000000UL)
-			baseptr = &hostbase;
-		else
-			baseptr = &pci_emul_membase64;
+	if (size > 128 * 1024 * 1024) {
+		baseptr = &pci_emul_membase64;
 		limit = PCI_EMUL_MEMLIMIT64;
 		mask = PCIM_BAR_MEM_BASE;
 		lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
diff --git a/usr/src/cmd/bhyve/pci_emul.h b/usr/src/cmd/bhyve/pci_emul.h
index d3dd9a2f46..63e3c89a95 100644
--- a/usr/src/cmd/bhyve/pci_emul.h
+++ b/usr/src/cmd/bhyve/pci_emul.h
@@ -222,8 +222,6 @@ int	init_pci(struct vmctx *ctx);
 void	pci_callback(void);
 int	pci_emul_alloc_bar(struct pci_devinst *pdi, int idx,
	    enum pcibar_type type, uint64_t size);
-int	pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx,
-	    uint64_t hostbase, enum pcibar_type type, uint64_t size);
 int	pci_emul_add_msicap(struct pci_devinst *pi, int msgnum);
 int	pci_emul_add_pciecap(struct pci_devinst *pi, int pcie_device_type);
 void	pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes,
diff --git a/usr/src/cmd/bhyve/pci_lpc.c b/usr/src/cmd/bhyve/pci_lpc.c
index 50413250d3..9c8f25e89d 100644
--- a/usr/src/cmd/bhyve/pci_lpc.c
+++ b/usr/src/cmd/bhyve/pci_lpc.c
@@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$");
 #include "pci_emul.h"
 #include "pci_irq.h"
 #include "pci_lpc.h"
+#include "pctestdev.h"
 #include "uart_emul.h"
 
 #define	IO_ICU1		0x20
@@ -83,6 +84,8 @@ static struct lpc_uart_softc {
 
 static const char *lpc_uart_names[LPC_UART_NUM] = { "COM1", "COM2" };
 
+static bool pctestdev_present;
+
 /*
  * LPC device configuration is in the following form:
  * <lpc_device_name>[,<options>]
@@ -110,6 +113,18 @@ lpc_device_parse(const char *opts)
 				goto done;
 			}
 		}
+		if (strcasecmp(lpcdev, pctestdev_getname()) == 0) {
+			if (pctestdev_present) {
+				EPRINTLN("More than one %s device conf is "
+				    "specified; only one is allowed.",
+				    pctestdev_getname());
+			} else if (pctestdev_parse(str) == 0) {
+				pctestdev_present = true;
+				error = 0;
+				free(cpy);
+				goto done;
+			}
+		}
 	}
 
 done:
@@ -127,6 +142,7 @@ lpc_print_supported_devices()
 	printf("bootrom\n");
 	for (i = 0; i < LPC_UART_NUM; i++)
 		printf("%s\n", lpc_uart_names[i]);
+	printf("%s\n", pctestdev_getname());
 }
 
 const char *
@@ -250,6 +266,13 @@ lpc_init(struct vmctx *ctx)
 		sc->enabled = 1;
 	}
 
+	/* pc-testdev */
+	if (pctestdev_present) {
+		error = pctestdev_init(ctx);
+		if (error)
+			return (error);
+	}
+
 	return (0);
 }
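Assuming pctestdev_getname() returns "pc-testdev" as it does in upstream
FreeBSD (an assumption, since pctestdev.c itself is not shown in this diff),
the new LPC device would be enabled with an option of the form:

    -l pc-testdev

This exposes the synthetic test-device I/O ports that test suites in the
KVM-unit-tests style expect, which is why only a single instance is allowed.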
diff --git a/usr/src/cmd/bhyve/pci_nvme.c b/usr/src/cmd/bhyve/pci_nvme.c
index 65d8d49b64..a0a8a9571d 100644
--- a/usr/src/cmd/bhyve/pci_nvme.c
+++ b/usr/src/cmd/bhyve/pci_nvme.c
@@ -3,6 +3,7 @@
  *
  * Copyright (c) 2017 Shunsuke Mie
  * Copyright (c) 2018 Leon Dang
+ * Copyright (c) 2020 Chuck Tuffli
  *
  * Function crc16 Copyright (c) 2017, Fedor Uporov
  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
@@ -33,7 +34,7 @@
  * bhyve PCIe-NVMe device emulation.
  *
  * options:
- *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#
+ *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
 *
 * accepted devpath:
 *   /dev/blockdev
 *	sectsz = sector size (defaults to blockif sector size)
 *	ser = serial number (20-chars max)
 *	eui64 = IEEE Extended Unique Identifier (8 byte value)
+ *	dsm = DataSet Management support. Option is one of auto, enable, disable
 *
 */
@@ -57,6 +59,7 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include <sys/errno.h>
 #include <sys/types.h>
 #include <net/ieee_oui.h>
 #ifndef __FreeBSD__
@@ -86,8 +89,8 @@ __FBSDID("$FreeBSD$");
 
 static int nvme_debug = 0;
-#define	DPRINTF(params) if (nvme_debug) PRINTLN params
-#define	WPRINTF(params) PRINTLN params
+#define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
+#define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
 
 /* defaults; can be overridden */
 #define	NVME_MSIX_BAR		4
@@ -99,9 +102,16 @@ static int nvme_debug = 0;
 #define	NVME_QUEUES		16
 #define	NVME_MAX_QENTRIES	2048
+/* Memory Page size Minimum reported in CAP register */
+#define	NVME_MPSMIN		0
+/* MPSMIN converted to bytes */
+#define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))
 
 #define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
-#define	NVME_MAX_BLOCKIOVS	512
+#define	NVME_MDTS		9
+/* Note the + 1 allows for the initial descriptor to not be page aligned */
+#define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
+#define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
 
 /* This is a synthetic status code to indicate there is no status */
 #define	NVME_NO_STATUS		0xffff
@@ -153,21 +163,21 @@ enum nvme_copy_dir {
 
 struct nvme_completion_queue {
 	struct nvme_completion *qbase;
+	pthread_mutex_t	mtx;
 	uint32_t	size;
 	uint16_t	tail; /* nvme progress */
 	uint16_t	head; /* guest progress */
 	uint16_t	intr_vec;
 	uint32_t	intr_en;
-	pthread_mutex_t	mtx;
 };
 
 struct nvme_submission_queue {
 	struct nvme_command *qbase;
+	pthread_mutex_t	mtx;
 	uint32_t	size;
 	uint16_t	head; /* nvme progress */
 	uint16_t	tail; /* guest progress */
 	uint16_t	cqid; /* completion queue id */
-	int		busy; /* queue is being processed */
 	int		qpriority;
 };
 
@@ -186,6 +196,18 @@ struct pci_nvme_blockstore {
 	uint32_t	deallocate:1;
 };
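The new dsm= option documented in the header comment above can be exercised
with a configuration like the following (path and serial are placeholders,
not taken from this commit):

    -s 4,nvme,/dev/zvol/rdsk/tank/nvme0,ser=SN123456,dsm=enable

With dsm=auto the emulation decides whether to advertise DataSet Management
support, presumably from whether the backing store reports delete (TRIM)
capability; enable and disable force the answer either way.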
+/*
+ * Calculate the number of additional page descriptors for guest IO requests
+ * based on the advertised Max Data Transfer (MDTS) and given the number of
+ * default iovec's in a struct blockif_req.
+ *
+ * Note the + 1 allows for the initial descriptor to not be page aligned.
+ */
+#define MDTS_PAD_SIZE \
+	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
+	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
+	  0 )
+
 struct pci_nvme_ioreq {
 	struct pci_nvme_softc *sc;
 	STAILQ_ENTRY(pci_nvme_ioreq) link;
@@ -199,18 +221,11 @@ struct pci_nvme_ioreq {
 
 	uint64_t	prev_gpaddr;
 	size_t		prev_size;
-
-	/*
-	 * lock if all iovs consumed (big IO);
-	 * complete transaction before continuing
-	 */
-	pthread_mutex_t	mtx;
-	pthread_cond_t	cv;
+	size_t		bytes;
 
 	struct blockif_req io_req;
 
-	/* pad to fit up to 512 page descriptors from guest IO request */
-	struct iovec	iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
+	struct iovec	iovpadding[MDTS_PAD_SIZE];
 };
 
 enum nvme_dsm_type {
@@ -222,6 +237,28 @@ enum nvme_dsm_type {
 	NVME_DATASET_MANAGEMENT_DISABLE,
 };
 
+struct pci_nvme_softc;
+struct nvme_feature_obj;
+
+typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
+    struct nvme_feature_obj *,
+    struct nvme_command *,
+    struct nvme_completion *);
+
+struct nvme_feature_obj {
+	uint32_t	cdw11;
+	nvme_feature_cb	set;
+	nvme_feature_cb	get;
+	bool		namespace_specific;
+};
+
+#define NVME_FID_MAX	(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
+
+struct pci_nvme_aer {
+	STAILQ_ENTRY(pci_nvme_aer) link;
+	uint16_t	cid;	/* Command ID of the submitted AER */
+};
+
 struct pci_nvme_softc {
 	struct pci_devinst *nsc_pi;
@@ -241,6 +278,7 @@ struct pci_nvme_softc {
 	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
 	uint32_t	num_cqueues;
 	uint32_t	num_squeues;
+	bool		num_q_is_set;	/* Has host set Number of Queues */
 
 	struct pci_nvme_ioreq *ioreqs;
 	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
@@ -255,16 +293,26 @@ struct pci_nvme_softc {
 	struct nvme_completion_queue *compl_queues;
 	struct nvme_submission_queue *submit_queues;
 
-	/* controller features */
-	uint32_t	intr_coales_aggr_time;   /* 0x08: uS to delay intr */
-	uint32_t	intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
-	uint32_t	async_ev_config;         /* 0x0B: async event config */
+	struct nvme_feature_obj feat[NVME_FID_MAX];
 
 	enum nvme_dsm_type dataset_management;
+
+	/* Accounting for SMART data */
+	__uint128_t	read_data_units;
+	__uint128_t	write_data_units;
+	__uint128_t	read_commands;
+	__uint128_t	write_commands;
+	uint32_t	read_dunits_remainder;
+	uint32_t	write_dunits_remainder;
+
+	STAILQ_HEAD(, pci_nvme_aer) aer_list;
+	uint32_t	aer_count;
 };
 
-static void pci_nvme_io_partial(struct blockif_req *br, int err);
+static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
+static void pci_nvme_release_ioreq(struct pci_nvme_softc *,
+    struct pci_nvme_ioreq *);
+static void pci_nvme_io_done(struct blockif_req *, int);
 
 /* Controller Configuration utils */
 #define	NVME_CC_GET_EN(cc) \
@@ -303,6 +351,19 @@ static void pci_nvme_io_partial(struct blockif_req *br, int err);
 #define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
 
+static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
+    struct nvme_feature_obj *,
+    struct nvme_command *,
+    struct nvme_completion *);
+static void nvme_feature_num_queues(struct pci_nvme_softc *,
+    struct nvme_feature_obj *,
+    struct nvme_command *,
+    struct nvme_completion *);
+static void nvme_feature_iv_config(struct pci_nvme_softc *,
+    struct nvme_feature_obj *,
+    struct nvme_command *,
+    struct nvme_completion *);
+
 static __inline void
 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
 {
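The MDTS arithmetic above works out as follows (a worked example, assuming
BLOCKIF_IOV_MAX is 128 as in FreeBSD's blockif; the illumos value is not
shown in this diff):

    /*
     * NVME_MDTS = 9 and NVME_MPSMIN = 0, so:
     *   NVME_MPSMIN_BYTES  = 1 << (12 + 0)   = 4096 bytes
     *   NVME_MAX_DATA_SIZE = (1 << 9) * 4096 = 2 MiB per transfer
     *   NVME_MAX_IOVEC     = (1 << 9) + 1    = 513 descriptors
     * With BLOCKIF_IOV_MAX == 128, MDTS_PAD_SIZE pads each ioreq with
     * 513 - 128 = 385 extra iovec entries, replacing the old fixed
     * NVME_MAX_BLOCKIOVS value of 512.
     */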
@@ -329,14 +390,60 @@ pci_nvme_status_genc(uint16_t *status, uint16_t code)
 	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
 }
 
-static __inline void
-pci_nvme_toggle_phase(uint16_t *status, int prev)
+/*
+ * Initialize the requested number of IO Submission and Completion Queues.
+ * Admin queues are allocated implicitly.
+ */
+static void
+pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
 {
+	uint32_t i;
 
-	if (prev)
-		*status &= ~NVME_STATUS_P;
-	else
-		*status |= NVME_STATUS_P;
+	/*
+	 * Allocate and initialize the Submission Queues
+	 */
+	if (nsq > NVME_QUEUES) {
+		WPRINTF("%s: clamping number of SQ from %u to %u",
+		    __func__, nsq, NVME_QUEUES);
+		nsq = NVME_QUEUES;
+	}
+
+	sc->num_squeues = nsq;
+
+	sc->submit_queues = calloc(sc->num_squeues + 1,
+	    sizeof(struct nvme_submission_queue));
+	if (sc->submit_queues == NULL) {
+		WPRINTF("%s: SQ allocation failed", __func__);
+		sc->num_squeues = 0;
+	} else {
+		struct nvme_submission_queue *sq = sc->submit_queues;
+
+		for (i = 0; i < sc->num_squeues; i++)
+			pthread_mutex_init(&sq[i].mtx, NULL);
+	}
+
+	/*
+	 * Allocate and initialize the Completion Queues
+	 */
+	if (ncq > NVME_QUEUES) {
+		WPRINTF("%s: clamping number of CQ from %u to %u",
+		    __func__, ncq, NVME_QUEUES);
+		ncq = NVME_QUEUES;
+	}
+
+	sc->num_cqueues = ncq;
+
+	sc->compl_queues = calloc(sc->num_cqueues + 1,
+	    sizeof(struct nvme_completion_queue));
+	if (sc->compl_queues == NULL) {
+		WPRINTF("%s: CQ allocation failed", __func__);
+		sc->num_cqueues = 0;
+	} else {
+		struct nvme_completion_queue *cq = sc->compl_queues;
+
+		for (i = 0; i < sc->num_cqueues; i++)
+			pthread_mutex_init(&cq[i].mtx, NULL);
+	}
 }
 
 static void
@@ -360,7 +467,7 @@ pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
 
 	cd->mic = 0;
 
-	cd->mdts = 9;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
+	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
 
 	cd->ver = 0x00010300;
 
@@ -368,6 +475,9 @@ pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
 	cd->acl = 2;
 	cd->aerl = 4;
 
+	/* Advertise 1, Read-only firmware slot */
+	cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
+	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
 	cd->elpe = 0;	/* max error log page entries */
 	cd->npss = 1;	/* number of power states support */
@@ -493,12 +603,136 @@ pci_nvme_init_logpages(struct pci_nvme_softc *sc)
 
 	memset(&sc->err_log, 0, sizeof(sc->err_log));
 	memset(&sc->health_log, 0, sizeof(sc->health_log));
 	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
+
+	/* Set read/write remainder to round up according to spec */
+	sc->read_dunits_remainder = 999;
+	sc->write_dunits_remainder = 999;
+
+	/* Set nominal Health values checked by implementations */
+	sc->health_log.temperature = 310;
+	sc->health_log.available_spare = 100;
+	sc->health_log.available_spare_threshold = 10;
+}
+
+static void
+pci_nvme_init_features(struct pci_nvme_softc *sc)
+{
+
+	sc->feat[0].set = nvme_feature_invalid_cb;
+	sc->feat[0].get = nvme_feature_invalid_cb;
+
+	sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
+	sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
+	sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
+	sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
+	    nvme_feature_iv_config;
+	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
+	    nvme_feature_invalid_cb;
+	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
+	    nvme_feature_invalid_cb;
+}
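The feature table initialized above turns Get/Set Features dispatch into
data: an entry with no callback simply stores or returns cdw11, while an
entry with a callback validates first.  A minimal sketch of extending the
same pattern (the handler name is hypothetical, and the assumption is that
NVME_SC_FEATURE_NOT_CHANGEABLE exists in the nvme.h headers, as it does in
FreeBSD):

    /* Hypothetical handler: reject any attempt to change this feature. */
    static void
    feat_read_only(struct pci_nvme_softc *sc, struct nvme_feature_obj *feat,
        struct nvme_command *command, struct nvme_completion *compl)
    {
    	pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
    	    NVME_SC_FEATURE_NOT_CHANGEABLE);
    }

    /* Registration would then be one line in pci_nvme_init_features():
     *	sc->feat[NVME_FEAT_TIMESTAMP].set = feat_read_only;
     */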
+static void
+pci_nvme_aer_init(struct pci_nvme_softc *sc)
+{
+
+	STAILQ_INIT(&sc->aer_list);
+	sc->aer_count = 0;
+}
+
+static void
+pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
+{
+	struct pci_nvme_aer *aer = NULL;
+
+	while (!STAILQ_EMPTY(&sc->aer_list)) {
+		aer = STAILQ_FIRST(&sc->aer_list);
+		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
+		free(aer);
+	}
+
+	pci_nvme_aer_init(sc);
+}
+
+#ifdef __FreeBSD__
+static bool
+pci_nvme_aer_available(struct pci_nvme_softc *sc)
+{
+
+	return (!STAILQ_EMPTY(&sc->aer_list));
+}
+#else
+/* This is kept behind an ifdef while it's unused to appease the compiler. */
+#endif
+
+static bool
+pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
+{
+	struct nvme_controller_data *cd = &sc->ctrldata;
+
+	/* AERL is a zero-based value while aer_count is one-based */
+	return (sc->aer_count == (cd->aerl + 1));
+}
+
+/*
+ * Add an Async Event Request
+ *
+ * Stores an AER to be returned later if the Controller needs to notify the
+ * host of an event.
+ * Note that while the NVMe spec doesn't require Controllers to return AER's
+ * in order, this implementation does preserve the order.
+ */
+static int
+pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
+{
+	struct pci_nvme_aer *aer = NULL;
+
+	if (pci_nvme_aer_limit_reached(sc))
+		return (-1);
+
+	aer = calloc(1, sizeof(struct pci_nvme_aer));
+	if (aer == NULL)
+		return (-1);
+
+	sc->aer_count++;
+
+	/* Save the Command ID for use in the completion message */
+	aer->cid = cid;
+	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
+
+	return (0);
+}
+
+/*
+ * Get an Async Event Request structure
+ *
+ * Returns a pointer to an AER previously submitted by the host or NULL if
+ * no AER's exist. Caller is responsible for freeing the returned struct.
+ */
+#ifdef __FreeBSD__
+static struct pci_nvme_aer *
+pci_nvme_aer_get(struct pci_nvme_softc *sc)
+{
+	struct pci_nvme_aer *aer = NULL;
+
+	aer = STAILQ_FIRST(&sc->aer_list);
+	if (aer != NULL) {
+		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
+		sc->aer_count--;
+	}
+
+	return (aer);
+}
+#else
+/* This is kept behind an ifdef while it's unused to appease the compiler. */
+#endif
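A quick worked example of the AER accounting above (editorial, derived only
from values visible in this diff): pci_nvme_init_ctrldata sets cd->aerl to 4,
and AERL is zero-based, so the guest may keep aerl + 1 = 5 Async Event
Request commands outstanding.  pci_nvme_aer_limit_reached fires when
aer_count reaches 5, and a sixth request is completed with
ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED rather than being queued.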
 static void
 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
 {
-	DPRINTF(("%s", __func__));
+	uint32_t i;
+
+	DPRINTF("%s", __func__);
 
 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
@@ -511,45 +745,28 @@ pci_nvme_reset_locked(struct pci_nvme_softc *sc)
 	sc->regs.cc = 0;
 	sc->regs.csts = 0;
 
-	sc->num_cqueues = sc->num_squeues = sc->max_queues;
-	if (sc->submit_queues != NULL) {
-		for (int i = 0; i < sc->num_squeues + 1; i++) {
-			/*
-			 * The Admin Submission Queue is at index 0.
-			 * It must not be changed at reset otherwise the
-			 * emulation will be out of sync with the guest.
-			 */
-			if (i != 0) {
-				sc->submit_queues[i].qbase = NULL;
-				sc->submit_queues[i].size = 0;
-				sc->submit_queues[i].cqid = 0;
-			}
-			sc->submit_queues[i].tail = 0;
-			sc->submit_queues[i].head = 0;
-			sc->submit_queues[i].busy = 0;
-		}
-	} else
-		sc->submit_queues = calloc(sc->num_squeues + 1,
-		    sizeof(struct nvme_submission_queue));
-
-	if (sc->compl_queues != NULL) {
-		for (int i = 0; i < sc->num_cqueues + 1; i++) {
-			/* See Admin Submission Queue note above */
-			if (i != 0) {
-				sc->compl_queues[i].qbase = NULL;
-				sc->compl_queues[i].size = 0;
-			}
+	assert(sc->submit_queues != NULL);
 
-			sc->compl_queues[i].tail = 0;
-			sc->compl_queues[i].head = 0;
-		}
-	} else {
-		sc->compl_queues = calloc(sc->num_cqueues + 1,
-		    sizeof(struct nvme_completion_queue));
+	for (i = 0; i < sc->num_squeues + 1; i++) {
+		sc->submit_queues[i].qbase = NULL;
+		sc->submit_queues[i].size = 0;
+		sc->submit_queues[i].cqid = 0;
+		sc->submit_queues[i].tail = 0;
+		sc->submit_queues[i].head = 0;
+	}
+
+	assert(sc->compl_queues != NULL);
 
-		for (int i = 0; i < sc->num_cqueues + 1; i++)
-			pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
+	for (i = 0; i < sc->num_cqueues + 1; i++) {
+		sc->compl_queues[i].qbase = NULL;
+		sc->compl_queues[i].size = 0;
+		sc->compl_queues[i].tail = 0;
+		sc->compl_queues[i].head = 0;
 	}
+
+	sc->num_q_is_set = false;
+
+	pci_nvme_aer_destroy(sc);
 }
 
 static void
@@ -565,23 +782,25 @@ pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
 {
 	uint16_t acqs, asqs;
 
-	DPRINTF(("%s", __func__));
+	DPRINTF("%s", __func__);
 
 	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
 	sc->submit_queues[0].size = asqs;
 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
	    sizeof(struct nvme_command) * asqs);
-	DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p",
-	    __func__, sc->regs.asq, sc->submit_queues[0].qbase));
+	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
+	    __func__, sc->regs.asq, sc->submit_queues[0].qbase);
 
 	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
	    NVME_AQA_REG_ACQS_MASK) + 1;
 	sc->compl_queues[0].size = acqs;
 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
	    sizeof(struct nvme_completion) * acqs);
-	DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p",
-	    __func__, sc->regs.acq, sc->compl_queues[0].qbase));
+	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
+
+	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
+	    __func__, sc->regs.acq, sc->compl_queues[0].qbase);
 }
 
 static int
@@ -631,22 +850,63 @@ nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
 	return (0);
 }
 
+/*
+ * Write a Completion Queue Entry update
+ *
+ * Write the completion and update the doorbell value
+ */
+static void
+pci_nvme_cq_update(struct pci_nvme_softc *sc,
+		struct nvme_completion_queue *cq,
+		uint32_t cdw0,
+		uint16_t cid,
+		uint16_t sqid,
+		uint16_t status)
+{
+	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
+	struct nvme_completion *cqe;
+
+	assert(cq->qbase != NULL);
+
+	pthread_mutex_lock(&cq->mtx);
+
+	cqe = &cq->qbase[cq->tail];
+
+	/* Flip the phase bit */
+	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
+
+	cqe->cdw0 = cdw0;
+	cqe->sqhd = sq->head;
+	cqe->sqid = sqid;
+	cqe->cid = cid;
+	cqe->status = status;
+
+	cq->tail++;
+	if (cq->tail >= cq->size) {
+		cq->tail = 0;
+	}
+
+	pthread_mutex_unlock(&cq->mtx);
+}
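The phase-flip line in pci_nvme_cq_update is worth a worked example
(editorial commentary, not code from the commit):

    /*
     * On the first pass through a zeroed CQ the stale entry has P == 0,
     * so (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK yields P == 1
     * and the fresh entry is written with the phase bit set.  After the
     * queue wraps, stale entries carry P == 1 and new completions are
     * written with P == 0.  The host therefore detects new entries simply
     * by watching for the phase bit to differ from the previous pass,
     * which is why the old pci_nvme_toggle_phase helper could be removed.
     */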
 static int
 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
 {
 	uint16_t qid = command->cdw10 & 0xffff;
 
-	DPRINTF(("%s DELETE_IO_SQ %u", __func__, qid));
-	if (qid == 0 || qid > sc->num_squeues) {
-		WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u",
-		    __func__, qid, sc->num_squeues));
+	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
+	if (qid == 0 || qid > sc->num_squeues ||
+	    (sc->submit_queues[qid].qbase == NULL)) {
+		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
+		    __func__, qid, sc->num_squeues);
 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
 		return (1);
 	}
 
 	sc->submit_queues[qid].qbase = NULL;
+	sc->submit_queues[qid].cqid = 0;
 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
 	return (1);
 }
@@ -659,9 +919,10 @@ nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
 	uint16_t qid = command->cdw10 & 0xffff;
 	struct nvme_submission_queue *nsq;
 
-	if ((qid == 0) || (qid > sc->num_squeues)) {
-		WPRINTF(("%s queue index %u > num_squeues %u",
-		    __func__, qid, sc->num_squeues));
+	if ((qid == 0) || (qid > sc->num_squeues) ||
+	    (sc->submit_queues[qid].qbase != NULL)) {
+		WPRINTF("%s queue index %u > num_squeues %u",
+		    __func__, qid, sc->num_squeues);
 		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
@@ -670,26 +931,54 @@ nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
 	nsq = &sc->submit_queues[qid];
 	nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
+	DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
+	if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
+		/*
+		 * Queues must specify at least two entries
+		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
+		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
+		 */
+		pci_nvme_status_tc(&compl->status,
+		    NVME_SCT_COMMAND_SPECIFIC,
+		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
+		return (1);
+	}
+
 	nsq->head = nsq->tail = 0;
-	nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
-	    sizeof(struct nvme_command) * (size_t)nsq->size);
 
 	nsq->cqid = (command->cdw11 >> 16) & 0xffff;
+	if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
+		pci_nvme_status_tc(&compl->status,
+		    NVME_SCT_COMMAND_SPECIFIC,
+		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
+		return (1);
+	}
+
+	if (sc->compl_queues[nsq->cqid].qbase == NULL) {
+		pci_nvme_status_tc(&compl->status,
+		    NVME_SCT_COMMAND_SPECIFIC,
+		    NVME_SC_COMPLETION_QUEUE_INVALID);
+		return (1);
+	}
+
 	nsq->qpriority = (command->cdw11 >> 1) & 0x03;
 
-	DPRINTF(("%s sq %u size %u gaddr %p cqid %u", __func__,
-	    qid, nsq->size, nsq->qbase, nsq->cqid));
+	nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
+	    sizeof(struct nvme_command) * (size_t)nsq->size);
+
+	DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
+	    qid, nsq->size, nsq->qbase, nsq->cqid);
 
 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
 
-	DPRINTF(("%s completed creating IOSQ qid %u",
-	    __func__, qid));
+	DPRINTF("%s completed creating IOSQ qid %u",
+	    __func__, qid);
 	} else {
 		/*
 		 * Guest sent non-cont submission queue request.
 		 * This setting is unsupported by this emulation.
 		 */
-		WPRINTF(("%s unsupported non-contig (list-based) "
-		    "create i/o submission queue", __func__));
+		WPRINTF("%s unsupported non-contig (list-based) "
+		    "create i/o submission queue", __func__);
 
 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
 	}
@@ -701,16 +990,27 @@ nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
 {
 	uint16_t qid = command->cdw10 & 0xffff;
+	uint16_t sqid;
 
-	DPRINTF(("%s DELETE_IO_CQ %u", __func__, qid));
-	if (qid == 0 || qid > sc->num_cqueues) {
-		WPRINTF(("%s queue index %u / num_cqueues %u",
-		    __func__, qid, sc->num_cqueues));
+	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
+	if (qid == 0 || qid > sc->num_cqueues ||
+	    (sc->compl_queues[qid].qbase == NULL)) {
+		WPRINTF("%s queue index %u / num_cqueues %u",
+		    __func__, qid, sc->num_cqueues);
 		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
 		return (1);
 	}
 
+	/* Deleting an Active CQ is an error */
+	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
+		if (sc->submit_queues[sqid].cqid == qid) {
+			pci_nvme_status_tc(&compl->status,
+			    NVME_SCT_COMMAND_SPECIFIC,
+			    NVME_SC_INVALID_QUEUE_DELETION);
+			return (1);
+		}
+
 	sc->compl_queues[qid].qbase = NULL;
 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
 	return (1);
@@ -720,40 +1020,58 @@ static int
 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
 {
-	if (command->cdw11 & NVME_CMD_CDW11_PC) {
-		uint16_t qid = command->cdw10 & 0xffff;
-		struct nvme_completion_queue *ncq;
+	struct nvme_completion_queue *ncq;
+	uint16_t qid = command->cdw10 & 0xffff;
 
-		if ((qid == 0) || (qid > sc->num_cqueues)) {
-			WPRINTF(("%s queue index %u > num_cqueues %u",
-			    __func__, qid, sc->num_cqueues));
+	/* Only support Physically Contiguous queues */
+	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
+		WPRINTF("%s unsupported non-contig (list-based) "
+		    "create i/o completion queue",
+		    __func__);
-			pci_nvme_status_tc(&compl->status,
-			    NVME_SCT_COMMAND_SPECIFIC,
-			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
-			return (1);
-		}
+		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
+		return (1);
+	}
 
-		ncq = &sc->compl_queues[qid];
-		ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
-		ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
-		ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
+	if ((qid == 0) || (qid > sc->num_cqueues) ||
+	    (sc->compl_queues[qid].qbase != NULL)) {
+		WPRINTF("%s queue index %u > num_cqueues %u",
+		    __func__, qid, sc->num_cqueues);
+		pci_nvme_status_tc(&compl->status,
+		    NVME_SCT_COMMAND_SPECIFIC,
+		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
+		return (1);
+	}
+
+	ncq = &sc->compl_queues[qid];
+	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
+	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
+	if (ncq->intr_vec > (sc->max_queues + 1)) {
+		pci_nvme_status_tc(&compl->status,
+		    NVME_SCT_COMMAND_SPECIFIC,
+		    NVME_SC_INVALID_INTERRUPT_VECTOR);
+		return (1);
+	}
 
-		ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
-		    command->prp1,
-		    sizeof(struct nvme_command) * (size_t)ncq->size);
-
-		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
-	} else {
-		/*
-		 * Non-contig completion queue unsupported.
-		 */
-		WPRINTF(("%s unsupported non-contig (list-based) "
-		    "create i/o completion queue",
-		    __func__));
-
-		/* 0x12 = Invalid Use of Controller Memory Buffer */
-		pci_nvme_status_genc(&compl->status, 0x12);
+	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
+	if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
+		/*
+		 * Queues must specify at least two entries
+		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
+		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
+		 */
+		pci_nvme_status_tc(&compl->status,
+		    NVME_SCT_COMMAND_SPECIFIC,
+		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
+		return (1);
 	}
+
+	ncq->head = ncq->tail = 0;
+	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
+	    command->prp1,
+	    sizeof(struct nvme_command) * (size_t)ncq->size);
+
+	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+
 	return (1);
 }
 
@@ -762,33 +1080,53 @@ static int
 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
 {
-	uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 2;
+	uint32_t logsize = 0;
 	uint8_t logpage = command->cdw10 & 0xFF;
 
-	DPRINTF(("%s log page %u len %u", __func__, logpage, logsize));
+	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
 
 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
 
+	/*
+	 * Command specifies the number of dwords to return in fields NUMDU
+	 * and NUMDL. This is a zero-based value.
+	 */
+	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
+	logsize *= sizeof(uint32_t);
+
 	switch (logpage) {
 	case NVME_LOG_ERROR:
 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
-		    command->prp2, (uint8_t *)&sc->err_log, logsize,
+		    command->prp2, (uint8_t *)&sc->err_log,
+		    MIN(logsize, sizeof(sc->err_log)),
 		    NVME_COPY_TO_PRP);
 		break;
 	case NVME_LOG_HEALTH_INFORMATION:
-		/* TODO: present some smart info */
+		pthread_mutex_lock(&sc->mtx);
+		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
+		    sizeof(sc->health_log.data_units_read));
+		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
+		    sizeof(sc->health_log.data_units_written));
+		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
+		    sizeof(sc->health_log.host_read_commands));
+		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
+		    sizeof(sc->health_log.host_write_commands));
+		pthread_mutex_unlock(&sc->mtx);
+
 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
-		    command->prp2, (uint8_t *)&sc->health_log, logsize,
+		    command->prp2, (uint8_t *)&sc->health_log,
+		    MIN(logsize, sizeof(sc->health_log)),
 		    NVME_COPY_TO_PRP);
 		break;
 	case NVME_LOG_FIRMWARE_SLOT:
 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
-		    command->prp2, (uint8_t *)&sc->fw_log, logsize,
+		    command->prp2, (uint8_t *)&sc->fw_log,
+		    MIN(logsize, sizeof(sc->fw_log)),
 		    NVME_COPY_TO_PRP);
 		break;
 	default:
-		WPRINTF(("%s get log page %x command not supported",
-		    __func__, logpage));
+		DPRINTF("%s get log page %x command not supported",
+		    __func__, logpage);
 
 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
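A worked example of the new Get Log Page size computation (editorial, using
only fields named in the hunk above):

    /*
     * A host asking for the 512-byte SMART page sets NUMDL (cdw10 bits
     * 31:16) to 127, i.e. 128 dwords zero-based, and NUMDU (cdw11 bits
     * 15:0) to 0:
     *   logsize = ((0 << 16) | 127) + 1 = 128 dwords = 512 bytes
     * The MIN(logsize, sizeof(...)) guard then keeps an oversized request
     * from copying past the end of the emulated log structure, which the
     * old fixed computation did not protect against.
     */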
@@ -802,9 +1140,12 @@ static int
 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
 {
 	void *dest;
+	uint16_t status = 0;
+
+	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
+	    command->cdw10 & 0xFF, command->nsid);
 
-	DPRINTF(("%s identify 0x%x nsid 0x%x", __func__,
-	    command->cdw10 & 0xFF, command->nsid));
+	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
 
 	switch (command->cdw10 & 0xFF) {
 	case 0x00: /* return Identify Namespace data structure */
@@ -821,230 +1162,359 @@ nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
+		/* All unused entries shall be zero */
+		bzero(dest, sizeof(uint32_t) * 1024);
 		((uint32_t *)dest)[0] = 1;
-		((uint32_t *)dest)[1] = 0;
 		break;
-	case 0x11:
-		pci_nvme_status_genc(&compl->status,
-		    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
-		return (1);
 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
-	case 0x10:
-	case 0x12:
-	case 0x13:
-	case 0x14:
-	case 0x15:
+		if (command->nsid != 1) {
+			pci_nvme_status_genc(&status,
+			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
+			break;
+		}
+		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
+		    sizeof(uint32_t) * 1024);
+		/* All bytes after the descriptor shall be zero */
+		bzero(dest, sizeof(uint32_t) * 1024);
+
+		/* Return NIDT=1 (i.e. EUI64) descriptor */
+		((uint8_t *)dest)[0] = 1;
+		((uint8_t *)dest)[1] = sizeof(uint64_t);
+		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4,
+		    sizeof(uint64_t));
+		break;
 	default:
-		DPRINTF(("%s unsupported identify command requested 0x%x",
-		    __func__, command->cdw10 & 0xFF));
-		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
-		return (1);
+		DPRINTF("%s unsupported identify command requested 0x%x",
+		    __func__, command->cdw10 & 0xFF);
+		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
+		break;
 	}
 
-	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+	compl->status = status;
 	return (1);
 }
-static int
-nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
-	struct nvme_completion* compl)
+static const char *
+nvme_fid_to_name(uint8_t fid)
+{
+	const char *name;
+
+	switch (fid) {
+	case NVME_FEAT_ARBITRATION:
+		name = "Arbitration";
+		break;
+	case NVME_FEAT_POWER_MANAGEMENT:
+		name = "Power Management";
+		break;
+	case NVME_FEAT_LBA_RANGE_TYPE:
+		name = "LBA Range Type";
+		break;
+	case NVME_FEAT_TEMPERATURE_THRESHOLD:
+		name = "Temperature Threshold";
+		break;
+	case NVME_FEAT_ERROR_RECOVERY:
+		name = "Error Recovery";
+		break;
+	case NVME_FEAT_VOLATILE_WRITE_CACHE:
+		name = "Volatile Write Cache";
+		break;
+	case NVME_FEAT_NUMBER_OF_QUEUES:
+		name = "Number of Queues";
+		break;
+	case NVME_FEAT_INTERRUPT_COALESCING:
+		name = "Interrupt Coalescing";
+		break;
+	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
+		name = "Interrupt Vector Configuration";
+		break;
+	case NVME_FEAT_WRITE_ATOMICITY:
+		name = "Write Atomicity Normal";
+		break;
+	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
+		name = "Asynchronous Event Configuration";
+		break;
+	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
+		name = "Autonomous Power State Transition";
+		break;
+	case NVME_FEAT_HOST_MEMORY_BUFFER:
+		name = "Host Memory Buffer";
+		break;
+	case NVME_FEAT_TIMESTAMP:
+		name = "Timestamp";
+		break;
+	case NVME_FEAT_KEEP_ALIVE_TIMER:
+		name = "Keep Alive Timer";
+		break;
+	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
+		name = "Host Controlled Thermal Management";
+		break;
+	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
+		name = "Non-Operation Power State Config";
+		break;
+	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
+		name = "Read Recovery Level Config";
+		break;
+	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
+		name = "Predictable Latency Mode Config";
+		break;
+	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
+		name = "Predictable Latency Mode Window";
+		break;
+	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
+		name = "LBA Status Information Report Interval";
+		break;
+	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
+		name = "Host Behavior Support";
+		break;
+	case NVME_FEAT_SANITIZE_CONFIG:
+		name = "Sanitize Config";
+		break;
+	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
+		name = "Endurance Group Event Configuration";
+		break;
+	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
+		name = "Software Progress Marker";
+		break;
+	case NVME_FEAT_HOST_IDENTIFIER:
+		name = "Host Identifier";
+		break;
+	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
+		name = "Reservation Notification Mask";
+		break;
+	case NVME_FEAT_RESERVATION_PERSISTENCE:
+		name = "Reservation Persistence";
+		break;
+	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
+		name = "Namespace Write Protection Config";
+		break;
+	default:
+		name = "Unknown";
+		break;
+	}
+
+	return (name);
+}
+
+static void
+nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
+    struct nvme_feature_obj *feat,
+    struct nvme_command *command,
+    struct nvme_completion *compl)
+{
+
+	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
+}
+
+static void
+nvme_feature_iv_config(struct pci_nvme_softc *sc,
+    struct nvme_feature_obj *feat,
+    struct nvme_command *command,
+    struct nvme_completion *compl)
+{
+	uint32_t i;
+	uint32_t cdw11 = command->cdw11;
+	uint16_t iv;
+	bool cd;
+
+	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
+
+	iv = cdw11 & 0xffff;
+	cd = cdw11 & (1 << 16);
+
+	if (iv > (sc->max_queues + 1)) {
+		return;
+	}
+
+	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
+	if ((iv == 0) && !cd)
+		return;
+
+	/* Requested Interrupt Vector must be used by a CQ */
+	for (i = 0; i < sc->num_cqueues + 1; i++) {
+		if (sc->compl_queues[i].intr_vec == iv) {
+			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+		}
+	}
+}
+
+static void
+nvme_feature_num_queues(struct pci_nvme_softc *sc,
+    struct nvme_feature_obj *feat,
+    struct nvme_command *command,
+    struct nvme_completion *compl)
 {
 	uint16_t nqr;	/* Number of Queues Requested */
 
+	if (sc->num_q_is_set) {
+		WPRINTF("%s: Number of Queues already set", __func__);
+		pci_nvme_status_genc(&compl->status,
+		    NVME_SC_COMMAND_SEQUENCE_ERROR);
+		return;
+	}
+
 	nqr = command->cdw11 & 0xFFFF;
 	if (nqr == 0xffff) {
-		WPRINTF(("%s: Illegal NSQR value %#x", __func__, nqr));
+		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
-		return (-1);
+		return;
 	}
 
 	sc->num_squeues = ONE_BASED(nqr);
 	if (sc->num_squeues > sc->max_queues) {
-		DPRINTF(("NSQR=%u is greater than max %u", sc->num_squeues,
-		    sc->max_queues));
+		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
+		    sc->max_queues);
 		sc->num_squeues = sc->max_queues;
 	}
 
 	nqr = (command->cdw11 >> 16) & 0xFFFF;
 	if (nqr == 0xffff) {
-		WPRINTF(("%s: Illegal NCQR value %#x", __func__, nqr));
+		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
-		return (-1);
+		return;
 	}
 
 	sc->num_cqueues = ONE_BASED(nqr);
 	if (sc->num_cqueues > sc->max_queues) {
-		DPRINTF(("NCQR=%u is greater than max %u", sc->num_cqueues,
-		    sc->max_queues));
+		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
+		    sc->max_queues);
 		sc->num_cqueues = sc->max_queues;
 	}
 
+	/* Patch the command value which will be saved on callback's return */
+	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
 
-	return (0);
+	sc->num_q_is_set = true;
 }
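A worked example of the Number of Queues handling above (editorial, using
only values visible in this diff):

    /*
     * A host that wants 4 SQs and 4 CQs writes cdw11 = 0x00030003, since
     * both NSQR and NCQR are zero-based.  ONE_BASED(3) = 4 for each, and
     * with max_queues defaulting to NVME_QUEUES (16) nothing is clamped,
     * so the completion returns cdw0 = NVME_FEATURE_NUM_QUEUES(sc), the
     * granted counts encoded zero-based again (0x00030003 here).  A second
     * Set Features of this FID now fails with COMMAND SEQUENCE ERROR
     * because num_q_is_set stays latched until controller reset.
     */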
 static int
-nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
-	struct nvme_completion* compl)
+nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
+	struct nvme_completion *compl)
 {
-	int feature = command->cdw10 & 0xFF;
-	uint32_t iv;
+	struct nvme_feature_obj *feat;
+	uint32_t nsid = command->nsid;
+	uint8_t fid = command->cdw10 & 0xFF;
+
+	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
+
+	if (fid >= NVME_FID_MAX) {
+		DPRINTF("%s invalid feature 0x%x", __func__, fid);
+		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
+		return (1);
+	}
+	feat = &sc->feat[fid];
+
+	if (!feat->namespace_specific &&
+	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
+		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
+		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
+		return (1);
+	}
 
-	DPRINTF(("%s feature 0x%x", __func__, feature));
 	compl->cdw0 = 0;
+	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
 
-	switch (feature) {
-	case NVME_FEAT_ARBITRATION:
-		DPRINTF((" arbitration 0x%x", command->cdw11));
-		break;
-	case NVME_FEAT_POWER_MANAGEMENT:
-		DPRINTF((" power management 0x%x", command->cdw11));
-		break;
-	case NVME_FEAT_LBA_RANGE_TYPE:
-		DPRINTF((" lba range 0x%x", command->cdw11));
-		break;
-	case NVME_FEAT_TEMPERATURE_THRESHOLD:
-		DPRINTF((" temperature threshold 0x%x", command->cdw11));
-		break;
-	case NVME_FEAT_ERROR_RECOVERY:
-		DPRINTF((" error recovery 0x%x", command->cdw11));
-		break;
-	case NVME_FEAT_VOLATILE_WRITE_CACHE:
-		DPRINTF((" volatile write cache 0x%x", command->cdw11));
-		break;
-	case NVME_FEAT_NUMBER_OF_QUEUES:
-		nvme_set_feature_queues(sc, command, compl);
-		break;
-	case NVME_FEAT_INTERRUPT_COALESCING:
-		DPRINTF((" interrupt coalescing 0x%x", command->cdw11));
+	if (feat->set)
+		feat->set(sc, feat, command, compl);
 
-		/* in uS */
-		sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;
+	if (compl->status == NVME_SC_SUCCESS)
+		feat->cdw11 = command->cdw11;
 
-		sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
-		break;
-	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
-		iv = command->cdw11 & 0xFFFF;
-
-		DPRINTF((" interrupt vector configuration 0x%x",
-		    command->cdw11));
-
-		for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
-			if (sc->compl_queues[i].intr_vec == iv) {
-				if (command->cdw11 & (1 << 16))
-					sc->compl_queues[i].intr_en |=
-					    NVME_CQ_INTCOAL;
-				else
-					sc->compl_queues[i].intr_en &=
-					    ~NVME_CQ_INTCOAL;
-			}
-		}
-		break;
-	case NVME_FEAT_WRITE_ATOMICITY:
-		DPRINTF((" write atomicity 0x%x", command->cdw11));
-		break;
-	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
-		DPRINTF((" async event configuration 0x%x",
-		    command->cdw11));
-		sc->async_ev_config = command->cdw11;
-		break;
-	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
-		DPRINTF((" software progress marker 0x%x",
-		    command->cdw11));
-		break;
-	case 0x0C:
-		DPRINTF((" autonomous power state transition 0x%x",
-		    command->cdw11));
-		break;
-	default:
-		WPRINTF(("%s invalid feature", __func__));
-		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
-		return (1);
-	}
-
-	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
-	return (1);
+	return (0);
 }
 
 static int
 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
 {
+	struct nvme_feature_obj *feat;
+	uint8_t fid = command->cdw10 & 0xFF;
+
+	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
+
+	if (fid >= NVME_FID_MAX) {
+		DPRINTF("%s invalid feature 0x%x", __func__, fid);
+		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
+		return (1);
+	}
+
+	compl->cdw0 = 0;
+	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+
+	feat = &sc->feat[fid];
+	if (feat->get) {
+		feat->get(sc, feat, command, compl);
+	}
+
+	if (compl->status ==
NVME_SC_SUCCESS) { + compl->cdw0 = feat->cdw11; + } + + return (0); } static int -nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, +nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { - int feature = command->cdw10 & 0xFF; + uint8_t ses, lbaf, pi; - DPRINTF(("%s feature 0x%x", __func__, feature)); + /* Only supports Secure Erase Setting - User Data Erase */ + ses = (command->cdw10 >> 9) & 0x7; + if (ses > 0x1) { + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (1); + } - compl->cdw0 = 0; + /* Only supports a single LBA Format */ + lbaf = command->cdw10 & 0xf; + if (lbaf != 0) { + pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_INVALID_FORMAT); + return (1); + } - switch (feature) { - case NVME_FEAT_ARBITRATION: - DPRINTF((" arbitration")); - break; - case NVME_FEAT_POWER_MANAGEMENT: - DPRINTF((" power management")); - break; - case NVME_FEAT_LBA_RANGE_TYPE: - DPRINTF((" lba range")); - break; - case NVME_FEAT_TEMPERATURE_THRESHOLD: - DPRINTF((" temperature threshold")); - switch ((command->cdw11 >> 20) & 0x3) { - case 0: - /* Over temp threshold */ - compl->cdw0 = 0xFFFF; - break; - case 1: - /* Under temp threshold */ - compl->cdw0 = 0; - break; - default: - WPRINTF((" invalid threshold type select")); + /* Doesn't support Protection Information */ + pi = (command->cdw10 >> 5) & 0x7; + if (pi != 0) { + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (1); + } + + if (sc->nvstore.type == NVME_STOR_RAM) { + if (sc->nvstore.ctx) + free(sc->nvstore.ctx); + sc->nvstore.ctx = calloc(1, sc->nvstore.size); + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + } else { + struct pci_nvme_ioreq *req; + int err; + + req = pci_nvme_get_ioreq(sc); + if (req == NULL) { pci_nvme_status_genc(&compl->status, - NVME_SC_INVALID_FIELD); + NVME_SC_INTERNAL_DEVICE_ERROR); + WPRINTF("%s: unable to allocate IO req", __func__); return (1); } - break; - case NVME_FEAT_ERROR_RECOVERY: - DPRINTF((" error recovery")); - break; - case NVME_FEAT_VOLATILE_WRITE_CACHE: - DPRINTF((" volatile write cache")); - break; - case NVME_FEAT_NUMBER_OF_QUEUES: - compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); - - DPRINTF((" number of queues (submit %u, completion %u)", - compl->cdw0 & 0xFFFF, - (compl->cdw0 >> 16) & 0xFFFF)); + req->nvme_sq = &sc->submit_queues[0]; + req->sqid = 0; + req->opc = command->opc; + req->cid = command->cid; + req->nsid = command->nsid; + + req->io_req.br_offset = 0; + req->io_req.br_resid = sc->nvstore.size; + req->io_req.br_callback = pci_nvme_io_done; - break; - case NVME_FEAT_INTERRUPT_COALESCING: - DPRINTF((" interrupt coalescing")); - break; - case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: - DPRINTF((" interrupt vector configuration")); - break; - case NVME_FEAT_WRITE_ATOMICITY: - DPRINTF((" write atomicity")); - break; - case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: - DPRINTF((" async event configuration")); - sc->async_ev_config = command->cdw11; - break; - case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: - DPRINTF((" software progress marker")); - break; - case 0x0C: - DPRINTF((" autonomous power state transition")); - break; - default: - WPRINTF(("%s invalid feature 0x%x", __func__, feature)); - pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); - return (1); + err = blockif_delete(sc->nvstore.ctx, &req->io_req); + if (err) { + pci_nvme_status_genc(&compl->status, + NVME_SC_INTERNAL_DEVICE_ERROR); + pci_nvme_release_ioreq(sc, req); + } } - 
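nvme_opc_format_nvm above screens three CDW10 fields before touching the media: Secure Erase Settings in bits 11:9, Protection Information in bits 7:5, and the LBA Format index in bits 3:0. A small sketch of just that decoding (field positions per the NVMe 1.x Format NVM command; the helper name is mine):

#include <stdint.h>
#include <stdio.h>

/* Decode the Format NVM fields checked above. */
static void
decode_format_cdw10(uint32_t cdw10)
{
    uint8_t ses  = (cdw10 >> 9) & 0x7;  /* Secure Erase Settings */
    uint8_t pi   = (cdw10 >> 5) & 0x7;  /* Protection Information */
    uint8_t lbaf = cdw10 & 0xf;         /* LBA Format index */

    /* The emulation accepts SES 0-1 and requires pi == 0, lbaf == 0. */
    printf("ses=%u pi=%u lbaf=%u -> %s\n", ses, pi, lbaf,
        (ses <= 1 && pi == 0 && lbaf == 0) ? "accepted" : "rejected");
}

int
main(void)
{
    decode_format_cdw10(1 << 9);            /* User Data Erase */
    decode_format_cdw10((2 << 9) | 0x1);    /* Crypto erase + LBAF 1 */
    return (0);
}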
pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); return (1); } @@ -1052,8 +1522,8 @@ static int nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { - DPRINTF(("%s submission queue %u, command ID 0x%x", __func__, - command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF)); + DPRINTF("%s submission queue %u, command ID 0x%x", __func__, + command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF); /* TODO: search for the command ID and abort it */ @@ -1062,25 +1532,34 @@ nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command, return (1); } -#ifdef __FreeBSD__ static int nvme_opc_async_event_req(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { - DPRINTF(("%s async event request 0x%x", __func__, command->cdw11)); + DPRINTF("%s async event request 0x%x", __func__, command->cdw11); + + /* Don't exceed the Async Event Request Limit (AERL). */ + if (pci_nvme_aer_limit_reached(sc)) { + pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); + return (1); + } + + if (pci_nvme_aer_add(sc, command->cid)) { + pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, + NVME_SC_INTERNAL_DEVICE_ERROR); + return (1); + } /* - * TODO: raise events when they happen based on the Set Features cmd. + * Raise events when they happen based on the Set Features cmd. * These events happen async, so only set completion successful if * there is an event reflective of the request to get event. */ - pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, - NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); + compl->status = NVME_NO_STATUS; + return (0); } -#else -/* This is kept behind an ifdef while it's unused to appease the compiler. 
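nvme_opc_async_event_req now parks the command rather than failing it: the CID is queued via pci_nvme_aer_add (bounded by pci_nvme_aer_limit_reached, both added earlier in this patch), and the completion is suppressed with NVME_NO_STATUS until an event actually fires. A hedged sketch of that bookkeeping, assuming a simple fixed limit and a STAILQ:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/queue.h>

#define AER_LIMIT 4     /* assumed; the controller advertises AERL */

struct aer {
    uint16_t cid;       /* command ID, completed when an event posts */
    STAILQ_ENTRY(aer) link;
};

static STAILQ_HEAD(, aer) aer_list = STAILQ_HEAD_INITIALIZER(aer_list);
static unsigned aer_count;

static int
aer_add(uint16_t cid)
{
    struct aer *a;

    if (aer_count >= AER_LIMIT)
        return (-1);    /* AER Limit Exceeded */
    if ((a = malloc(sizeof (*a))) == NULL)
        return (-1);    /* Internal Device Error */
    a->cid = cid;
    STAILQ_INSERT_TAIL(&aer_list, a, link);
    aer_count++;
    return (0);
}

int
main(void)
{
    for (uint16_t cid = 1; cid <= 5; cid++)
        printf("cid %u: %s\n", cid,
            aer_add(cid) == 0 ? "parked" : "limit exceeded");
    return (0);
}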
*/ -#endif /* __FreeBSD__ */ static void pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) @@ -1091,20 +1570,15 @@ pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) struct nvme_completion_queue *cq; uint16_t sqhead; - DPRINTF(("%s index %u", __func__, (uint32_t)value)); + DPRINTF("%s index %u", __func__, (uint32_t)value); sq = &sc->submit_queues[0]; cq = &sc->compl_queues[0]; - sqhead = atomic_load_acq_short(&sq->head); + pthread_mutex_lock(&sq->mtx); - if (atomic_testandset_int(&sq->busy, 1)) { - DPRINTF(("%s SQ busy, head %u, tail %u", - __func__, sqhead, sq->tail)); - return; - } - - DPRINTF(("sqhead %u, tail %u", sqhead, sq->tail)); + sqhead = sq->head; + DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); while (sqhead != atomic_load_acq_short(&sq->tail)) { cmd = &(sq->qbase)[sqhead]; @@ -1113,205 +1587,226 @@ pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) switch (cmd->opc) { case NVME_OPC_DELETE_IO_SQ: - DPRINTF(("%s command DELETE_IO_SQ", __func__)); + DPRINTF("%s command DELETE_IO_SQ", __func__); nvme_opc_delete_io_sq(sc, cmd, &compl); break; case NVME_OPC_CREATE_IO_SQ: - DPRINTF(("%s command CREATE_IO_SQ", __func__)); + DPRINTF("%s command CREATE_IO_SQ", __func__); nvme_opc_create_io_sq(sc, cmd, &compl); break; case NVME_OPC_DELETE_IO_CQ: - DPRINTF(("%s command DELETE_IO_CQ", __func__)); + DPRINTF("%s command DELETE_IO_CQ", __func__); nvme_opc_delete_io_cq(sc, cmd, &compl); break; case NVME_OPC_CREATE_IO_CQ: - DPRINTF(("%s command CREATE_IO_CQ", __func__)); + DPRINTF("%s command CREATE_IO_CQ", __func__); nvme_opc_create_io_cq(sc, cmd, &compl); break; case NVME_OPC_GET_LOG_PAGE: - DPRINTF(("%s command GET_LOG_PAGE", __func__)); + DPRINTF("%s command GET_LOG_PAGE", __func__); nvme_opc_get_log_page(sc, cmd, &compl); break; case NVME_OPC_IDENTIFY: - DPRINTF(("%s command IDENTIFY", __func__)); + DPRINTF("%s command IDENTIFY", __func__); nvme_opc_identify(sc, cmd, &compl); break; case NVME_OPC_ABORT: - DPRINTF(("%s command ABORT", __func__)); + DPRINTF("%s command ABORT", __func__); nvme_opc_abort(sc, cmd, &compl); break; case NVME_OPC_SET_FEATURES: - DPRINTF(("%s command SET_FEATURES", __func__)); + DPRINTF("%s command SET_FEATURES", __func__); nvme_opc_set_features(sc, cmd, &compl); break; case NVME_OPC_GET_FEATURES: - DPRINTF(("%s command GET_FEATURES", __func__)); + DPRINTF("%s command GET_FEATURES", __func__); nvme_opc_get_features(sc, cmd, &compl); break; + case NVME_OPC_FIRMWARE_ACTIVATE: + DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); + pci_nvme_status_tc(&compl.status, + NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_INVALID_FIRMWARE_SLOT); + break; case NVME_OPC_ASYNC_EVENT_REQUEST: - DPRINTF(("%s command ASYNC_EVENT_REQ", __func__)); - /* XXX dont care, unhandled for now + DPRINTF("%s command ASYNC_EVENT_REQ", __func__); nvme_opc_async_event_req(sc, cmd, &compl); - */ + break; + case NVME_OPC_FORMAT_NVM: + DPRINTF("%s command FORMAT_NVM", __func__); + if ((sc->ctrldata.oacs & + (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) { + pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); + } compl.status = NVME_NO_STATUS; + nvme_opc_format_nvm(sc, cmd, &compl); break; default: - WPRINTF(("0x%x command is not implemented", - cmd->opc)); + DPRINTF("0x%x command is not implemented", + cmd->opc); pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); } sqhead = (sqhead + 1) % sq->size; if (NVME_COMPLETION_VALID(compl)) { - struct nvme_completion *cp; - int phase; - - cp = &(cq->qbase)[cq->tail]; - cp->cdw0 = compl.cdw0; - 
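The admin-queue loop above replaces the old lock-free busy flag with the queue mutex: head is read and written only under sq->mtx, while the guest-owned tail is still sampled with an acquire load on each pass so doorbell writes that land mid-processing are noticed. The synchronization skeleton, using C11 atomics in place of the illumos atomic_load_acq_short:

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct subq {
    pthread_mutex_t mtx;
    uint16_t head;          /* consumer index, protected by mtx */
    _Atomic uint16_t tail;  /* producer index, doorbell-written */
    uint16_t size;
};

static void
process_sq(struct subq *sq)
{
    uint16_t head;

    pthread_mutex_lock(&sq->mtx);
    head = sq->head;
    while (head != atomic_load_explicit(&sq->tail,
        memory_order_acquire)) {
        /* ...execute the command at qbase[head], post a CQ entry... */
        head = (head + 1) % sq->size;
    }
    sq->head = head;
    pthread_mutex_unlock(&sq->mtx);
}

int
main(void)
{
    struct subq sq = { PTHREAD_MUTEX_INITIALIZER, 0, 3, 16 };

    process_sq(&sq);
    printf("head caught up to %u\n", sq.head);
    return (0);
}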
cp->sqid = 0; - cp->sqhd = sqhead; - cp->cid = cmd->cid; - - phase = NVME_STATUS_GET_P(cp->status); - cp->status = compl.status; - pci_nvme_toggle_phase(&cp->status, phase); - - cq->tail = (cq->tail + 1) % cq->size; + pci_nvme_cq_update(sc, &sc->compl_queues[0], + compl.cdw0, + cmd->cid, + 0, /* SQID */ + compl.status); } } - DPRINTF(("setting sqhead %u", sqhead)); - atomic_store_short(&sq->head, sqhead); - atomic_store_int(&sq->busy, 0); + DPRINTF("setting sqhead %u", sqhead); + sq->head = sqhead; if (cq->head != cq->tail) pci_generate_msix(sc->nsc_pi, 0); + pthread_mutex_unlock(&sq->mtx); } -static int -pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req, - uint64_t gpaddr, size_t size, int do_write, uint64_t lba) +/* + * Update the Write and Read statistics reported in SMART data + * + * NVMe defines "data unit" as thousands of 512 byte blocks, rounded up. + * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000 + * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999. + */ +static void +pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc, + size_t bytes, uint16_t status) { - int iovidx; - if (req != NULL) { - /* concatenate contig block-iovs to minimize number of iovs */ - if ((req->prev_gpaddr + req->prev_size) == gpaddr) { - iovidx = req->io_req.br_iovcnt - 1; + pthread_mutex_lock(&sc->mtx); + switch (opc) { + case NVME_OPC_WRITE: + sc->write_commands++; + if (status != NVME_SC_SUCCESS) + break; + sc->write_dunits_remainder += (bytes / 512); + while (sc->write_dunits_remainder >= 1000) { + sc->write_data_units++; + sc->write_dunits_remainder -= 1000; + } + break; + case NVME_OPC_READ: + sc->read_commands++; + if (status != NVME_SC_SUCCESS) + break; + sc->read_dunits_remainder += (bytes / 512); + while (sc->read_dunits_remainder >= 1000) { + sc->read_data_units++; + sc->read_dunits_remainder -= 1000; + } + break; + default: + DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc); + break; + } + pthread_mutex_unlock(&sc->mtx); +} - req->io_req.br_iov[iovidx].iov_base = - paddr_guest2host(req->sc->nsc_pi->pi_vmctx, - req->prev_gpaddr, size); +/* + * Check if the combination of Starting LBA (slba) and Number of Logical + * Blocks (nlb) exceeds the range of the underlying storage. + * + * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores + * the capacity in bytes as a uint64_t, care must be taken to avoid integer + * overflow. 
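To make the rounding concrete: with the remainder seeded to 999, the very first 512-byte block tips the counter to one data unit, and each subsequent 1,000 blocks adds another, matching the spec's 1 - 1,000 blocks -> 1 unit bucketing. A worked check:

#include <stdint.h>
#include <stdio.h>

static uint64_t data_units;
static uint64_t dunits_rem = 999;   /* seed forces round-up */

static void
account(uint64_t bytes)
{
    dunits_rem += bytes / 512;
    while (dunits_rem >= 1000) {
        data_units++;
        dunits_rem -= 1000;
    }
}

int
main(void)
{
    account(512);           /* 1 block -> 1 data unit */
    printf("%llu\n", (unsigned long long)data_units);
    account(512 * 1000);    /* blocks 2..1001 -> 2 data units total */
    printf("%llu\n", (unsigned long long)data_units);
    return (0);
}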
+ */ +static bool +pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, + uint32_t nlb) +{ + size_t offset, bytes; - req->prev_size += size; - req->io_req.br_resid += size; + /* Overflow check of multiplying Starting LBA by the sector size */ + if (slba >> (64 - nvstore->sectsz_bits)) + return (true); - req->io_req.br_iov[iovidx].iov_len = req->prev_size; - } else { - pthread_mutex_lock(&req->mtx); + offset = slba << nvstore->sectsz_bits; + bytes = nlb << nvstore->sectsz_bits; - iovidx = req->io_req.br_iovcnt; - if (iovidx == NVME_MAX_BLOCKIOVS) { - int err = 0; + /* Overflow check of Number of Logical Blocks */ + if ((nvstore->size - offset) < bytes) + return (true); - DPRINTF(("large I/O, doing partial req")); + return (false); +} - iovidx = 0; - req->io_req.br_iovcnt = 0; +static int +pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req, + uint64_t gpaddr, size_t size, int do_write, uint64_t lba) +{ + int iovidx; - req->io_req.br_callback = pci_nvme_io_partial; + if (req == NULL) + return (-1); - if (!do_write) - err = blockif_read(sc->nvstore.ctx, - &req->io_req); - else - err = blockif_write(sc->nvstore.ctx, - &req->io_req); + if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { + return (-1); + } - /* wait until req completes before cont */ - if (err == 0) - pthread_cond_wait(&req->cv, &req->mtx); - } - if (iovidx == 0) { - req->io_req.br_offset = lba; - req->io_req.br_resid = 0; - req->io_req.br_param = req; - } + /* concatenate contig block-iovs to minimize number of iovs */ + if ((req->prev_gpaddr + req->prev_size) == gpaddr) { + iovidx = req->io_req.br_iovcnt - 1; - req->io_req.br_iov[iovidx].iov_base = - paddr_guest2host(req->sc->nsc_pi->pi_vmctx, - gpaddr, size); + req->io_req.br_iov[iovidx].iov_base = + paddr_guest2host(req->sc->nsc_pi->pi_vmctx, + req->prev_gpaddr, size); - req->io_req.br_iov[iovidx].iov_len = size; + req->prev_size += size; + req->io_req.br_resid += size; - req->prev_gpaddr = gpaddr; - req->prev_size = size; - req->io_req.br_resid += size; + req->io_req.br_iov[iovidx].iov_len = req->prev_size; + } else { + iovidx = req->io_req.br_iovcnt; + if (iovidx == 0) { + req->io_req.br_offset = lba; + req->io_req.br_resid = 0; + req->io_req.br_param = req; + } - req->io_req.br_iovcnt++; + req->io_req.br_iov[iovidx].iov_base = + paddr_guest2host(req->sc->nsc_pi->pi_vmctx, + gpaddr, size); - pthread_mutex_unlock(&req->mtx); - } - } else { - /* RAM buffer: read/write directly */ - void *p = sc->nvstore.ctx; - void *gptr; + req->io_req.br_iov[iovidx].iov_len = size; - if ((lba + size) > sc->nvstore.size) { - WPRINTF(("%s write would overflow RAM", __func__)); - return (-1); - } + req->prev_gpaddr = gpaddr; + req->prev_size = size; + req->io_req.br_resid += size; - p = (void *)((uintptr_t)p + (uintptr_t)lba); - gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size); - if (do_write) - memcpy(p, gptr, size); - else - memcpy(gptr, p, size); + req->io_req.br_iovcnt++; } + return (0); } static void pci_nvme_set_completion(struct pci_nvme_softc *sc, struct nvme_submission_queue *sq, int sqid, uint16_t cid, - uint32_t cdw0, uint16_t status, int ignore_busy) + uint32_t cdw0, uint16_t status) { struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; - struct nvme_completion *compl; - int phase; - DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", + DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), - NVME_STATUS_GET_SC(status))); - - pthread_mutex_lock(&cq->mtx); 
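Two details in pci_nvme_out_of_range are worth spelling out: the slba >> (64 - sectsz_bits) test rejects any starting LBA whose byte offset would not fit in 64 bits, and comparing capacity minus offset against the length avoids computing offset + bytes, which could itself wrap. A standalone sketch (the explicit offset bound is an addition of mine for safety; sectsz_bits of 9 means 512-byte sectors):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
out_of_range(uint64_t cap, unsigned sectsz_bits, uint64_t slba, uint32_t nlb)
{
    uint64_t offset, bytes;

    /* slba << sectsz_bits would lose high bits */
    if (slba >> (64 - sectsz_bits))
        return (true);

    offset = slba << sectsz_bits;
    bytes = (uint64_t)nlb << sectsz_bits;

    /* bound offset first so the subtraction below cannot wrap */
    if (offset > cap || (cap - offset) < bytes)
        return (true);
    return (false);
}

int
main(void)
{
    /* 1 MiB device, 512-byte sectors: LBAs 0..2047 are valid */
    printf("%d\n", out_of_range(1 << 20, 9, 2047, 1));          /* 0 */
    printf("%d\n", out_of_range(1 << 20, 9, 2047, 2));          /* 1 */
    printf("%d\n", out_of_range(1 << 20, 9, UINT64_MAX >> 8, 1)); /* 1 */
    return (0);
}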
- - assert(cq->qbase != NULL); - - compl = &cq->qbase[cq->tail]; - - compl->cdw0 = cdw0; - compl->sqid = sqid; - compl->sqhd = atomic_load_acq_short(&sq->head); - compl->cid = cid; - - // toggle phase - phase = NVME_STATUS_GET_P(compl->status); - compl->status = status; - pci_nvme_toggle_phase(&compl->status, phase); + NVME_STATUS_GET_SC(status)); - cq->tail = (cq->tail + 1) % cq->size; - - pthread_mutex_unlock(&cq->mtx); + pci_nvme_cq_update(sc, cq, + 0, /* CDW0 */ + cid, + sqid, + status); if (cq->head != cq->tail) { if (cq->intr_en & NVME_CQ_INTEN) { pci_generate_msix(sc->nsc_pi, cq->intr_vec); } else { - DPRINTF(("%s: CQ%u interrupt disabled\n", - __func__, sq->cqid)); + DPRINTF("%s: CQ%u interrupt disabled", + __func__, sq->cqid); } } } @@ -1373,24 +1868,211 @@ pci_nvme_io_done(struct blockif_req *br, int err) struct nvme_submission_queue *sq = req->nvme_sq; uint16_t code, status = 0; - DPRINTF(("%s error %d %s", __func__, err, strerror(err))); + DPRINTF("%s error %d %s", __func__, err, strerror(err)); /* TODO return correct error */ code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; pci_nvme_status_genc(&status, code); - pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0); + pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status); + pci_nvme_stats_write_read_update(req->sc, req->opc, + req->bytes, status); pci_nvme_release_ioreq(req->sc, req); } -static void -pci_nvme_io_partial(struct blockif_req *br, int err) +/* + * Implements the Flush command. The specification states: + * If a volatile write cache is not present, Flush commands complete + * successfully and have no effect + * in the description of the Volatile Write Cache (VWC) field of the Identify + * Controller data. Therefore, set status to Success if the command is + * not supported (i.e. RAM or as indicated by the blockif). 
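The comment above boils down to three outcomes for a flush handed to the backend: accepted (completion arrives later through the br_callback), not supported (equivalent to having no volatile write cache, so the command succeeds immediately), or a genuine failure. A compact sketch of that mapping:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Returns true when completion is deferred to the I/O callback. */
static bool
map_flush(int err, const char **outcome)
{
    switch (err) {
    case 0:
        *outcome = "pending";       /* callback completes it */
        return (true);
    case EOPNOTSUPP:
        *outcome = "success";       /* no volatile write cache */
        return (false);
    default:
        *outcome = "internal device error";
        return (false);
    }
}

int
main(void)
{
    const char *s;

    (void) map_flush(EOPNOTSUPP, &s);
    printf("EOPNOTSUPP -> %s\n", s);
    return (0);
}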
+ */ +static bool +nvme_opc_flush(struct pci_nvme_softc *sc, + struct nvme_command *cmd, + struct pci_nvme_blockstore *nvstore, + struct pci_nvme_ioreq *req, + uint16_t *status) { - struct pci_nvme_ioreq *req = br->br_param; + bool pending = false; + + if (nvstore->type == NVME_STOR_RAM) { + pci_nvme_status_genc(status, NVME_SC_SUCCESS); + } else { + int err; + + req->io_req.br_callback = pci_nvme_io_done; + + err = blockif_flush(nvstore->ctx, &req->io_req); + switch (err) { + case 0: + pending = true; + break; + case EOPNOTSUPP: + pci_nvme_status_genc(status, NVME_SC_SUCCESS); + break; + default: + pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); + } + } - DPRINTF(("%s error %d %s", __func__, err, strerror(err))); + return (pending); +} + +static uint16_t +nvme_write_read_ram(struct pci_nvme_softc *sc, + struct pci_nvme_blockstore *nvstore, + uint64_t prp1, uint64_t prp2, + size_t offset, uint64_t bytes, + bool is_write) +{ + uint8_t *buf = nvstore->ctx; + enum nvme_copy_dir dir; + uint16_t status = 0; + + if (is_write) + dir = NVME_COPY_TO_PRP; + else + dir = NVME_COPY_FROM_PRP; - pthread_cond_signal(&req->cv); + if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, + buf + offset, bytes, dir)) + pci_nvme_status_genc(&status, + NVME_SC_DATA_TRANSFER_ERROR); + else + pci_nvme_status_genc(&status, NVME_SC_SUCCESS); + + return (status); +} + +static uint16_t +nvme_write_read_blockif(struct pci_nvme_softc *sc, + struct pci_nvme_blockstore *nvstore, + struct pci_nvme_ioreq *req, + uint64_t prp1, uint64_t prp2, + size_t offset, uint64_t bytes, + bool is_write) +{ + uint64_t size; + int err; + uint16_t status = NVME_NO_STATUS; + + size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); + if (pci_nvme_append_iov_req(sc, req, prp1, + size, is_write, offset)) { + pci_nvme_status_genc(&status, + NVME_SC_DATA_TRANSFER_ERROR); + goto out; + } + + offset += size; + bytes -= size; + + if (bytes == 0) { + ; + } else if (bytes <= PAGE_SIZE) { + size = bytes; + if (pci_nvme_append_iov_req(sc, req, prp2, + size, is_write, offset)) { + pci_nvme_status_genc(&status, + NVME_SC_DATA_TRANSFER_ERROR); + goto out; + } + } else { + void *vmctx = sc->nsc_pi->pi_vmctx; + uint64_t *prp_list = &prp2; + uint64_t *last = prp_list; + + /* PRP2 is pointer to a physical region page list */ + while (bytes) { + /* Last entry in list points to the next list */ + if (prp_list == last) { + uint64_t prp = *prp_list; + + prp_list = paddr_guest2host(vmctx, prp, + PAGE_SIZE - (prp % PAGE_SIZE)); + last = prp_list + (NVME_PRP2_ITEMS - 1); + } + + size = MIN(bytes, PAGE_SIZE); + + if (pci_nvme_append_iov_req(sc, req, *prp_list, + size, is_write, offset)) { + pci_nvme_status_genc(&status, + NVME_SC_DATA_TRANSFER_ERROR); + goto out; + } + + offset += size; + bytes -= size; + + prp_list++; + } + } + req->io_req.br_callback = pci_nvme_io_done; + if (is_write) + err = blockif_write(nvstore->ctx, &req->io_req); + else + err = blockif_read(nvstore->ctx, &req->io_req); + + if (err) + pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); +out: + return (status); +} + +static bool +nvme_opc_write_read(struct pci_nvme_softc *sc, + struct nvme_command *cmd, + struct pci_nvme_blockstore *nvstore, + struct pci_nvme_ioreq *req, + uint16_t *status) +{ + uint64_t lba, nblocks, bytes = 0; + size_t offset; + bool is_write = cmd->opc == NVME_OPC_WRITE; + bool pending = false; + + lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; + nblocks = (cmd->cdw12 & 0xFFFF) + 1; + if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { + WPRINTF("%s command 
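The PRP handling in nvme_write_read_blockif above follows the NVMe layout: PRP1 maps from its own offset to the end of that page, and PRP2 is either the final page (for transfers of at most two pages) or a pointer to a page of 8-byte PRP entries whose last slot chains to the next list. A sketch of the size arithmetic alone, assuming 4 KiB pages (guest memory mapping and I/O submission elided):

#include <stdint.h>
#include <stdio.h>

#define PGSZ 4096ULL

static void
prp_split(uint64_t prp1, uint64_t bytes)
{
    uint64_t chunk = PGSZ - (prp1 % PGSZ); /* PRP1: to end of its page */

    if (chunk > bytes)
        chunk = bytes;
    printf("PRP1 covers %llu bytes", (unsigned long long)chunk);
    bytes -= chunk;

    if (bytes == 0)
        printf("\n");
    else if (bytes <= PGSZ)
        printf(", PRP2 is the final page (%llu bytes)\n",
            (unsigned long long)bytes);
    else
        printf(", PRP2 points to a list of %llu entries\n",
            (unsigned long long)((bytes + PGSZ - 1) / PGSZ));
}

int
main(void)
{
    prp_split(0x1200, 512);     /* fits entirely in PRP1 */
    prp_split(0x1000, 8192);    /* PRP1 page + final page via PRP2 */
    prp_split(0x1000, 65536);   /* PRP1 page + 15-entry PRP list */
    return (0);
}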
would exceed LBA range", __func__); + pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); + goto out; + } + + bytes = nblocks << nvstore->sectsz_bits; + if (bytes > NVME_MAX_DATA_SIZE) { + WPRINTF("%s command would exceed MDTS", __func__); + pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); + goto out; + } + + offset = lba << nvstore->sectsz_bits; + + req->bytes = bytes; + req->io_req.br_offset = lba; + + /* PRP bits 1:0 must be zero */ + cmd->prp1 &= ~0x3UL; + cmd->prp2 &= ~0x3UL; + + if (nvstore->type == NVME_STOR_RAM) { + *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, + cmd->prp2, offset, bytes, is_write); + } else { + *status = nvme_write_read_blockif(sc, nvstore, req, + cmd->prp1, cmd->prp2, offset, bytes, is_write); + + if (*status == NVME_NO_STATUS) + pending = true; + } +out: + if (!pending) + pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); + + return (pending); } static void @@ -1427,29 +2109,54 @@ pci_nvme_dealloc_sm(struct blockif_req *br, int err) if (done) { pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, - req->cid, 0, status, 0); + req->cid, 0, status); pci_nvme_release_ioreq(sc, req); } } -static int +static bool nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, struct nvme_command *cmd, struct pci_nvme_blockstore *nvstore, struct pci_nvme_ioreq *req, uint16_t *status) { - int err = -1; + struct nvme_dsm_range *range = NULL; + uint32_t nr, r, non_zero, dr; + int err; + bool pending = false; if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); goto out; } + nr = cmd->cdw10 & 0xff; + + /* copy locally because a range entry could straddle PRPs */ + range = calloc(1, NVME_MAX_DSM_TRIM); + if (range == NULL) { + pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); + goto out; + } + nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, + (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); + + /* Check for invalid ranges and the number of non-zero lengths */ + non_zero = 0; + for (r = 0; r <= nr; r++) { + if (pci_nvme_out_of_range(nvstore, + range[r].starting_lba, range[r].length)) { + pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); + goto out; + } + if (range[r].length != 0) + non_zero++; + } + if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { - struct nvme_dsm_range *range; - uint32_t nr, r; - int sectsz = sc->nvstore.sectsz; + size_t offset, bytes; + int sectsz_bits = sc->nvstore.sectsz_bits; /* * DSM calls are advisory only, and compliant controllers @@ -1460,23 +2167,20 @@ nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, goto out; } - if (req == NULL) { - pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); + /* If all ranges have a zero length, return Success */ + if (non_zero == 0) { + pci_nvme_status_genc(status, NVME_SC_SUCCESS); goto out; } - /* copy locally because a range entry could straddle PRPs */ - range = calloc(1, NVME_MAX_DSM_TRIM); - if (range == NULL) { + if (req == NULL) { pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); goto out; } - nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, - (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); - req->opc = cmd->opc; - req->cid = cmd->cid; - req->nsid = cmd->nsid; + offset = range[0].starting_lba << sectsz_bits; + bytes = range[0].length << sectsz_bits; + /* * If the request is for more than a single range, store * the ranges in the br_iov. 
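nvme_opc_write_read above assembles the 64-bit starting LBA from CDW10 (low word) and CDW11 (high word), takes the zero-based block count from the low 16 bits of CDW12, and rejects anything larger than the advertised maximum transfer size before doing any mapping. A sketch of the decode (the 2 MiB cap here is an assumption standing in for NVME_MAX_DATA_SIZE):

#include <stdint.h>
#include <stdio.h>

#define MAX_DATA_SIZE (2UL << 20)   /* assumed MDTS-derived cap */

struct cmd { uint32_t cdw10, cdw11, cdw12; };

static int
decode_rw(const struct cmd *c, unsigned sectsz_bits,
    uint64_t *lba, uint64_t *bytes)
{
    uint64_t nblocks;

    *lba = ((uint64_t)c->cdw11 << 32) | c->cdw10;
    nblocks = (c->cdw12 & 0xFFFF) + 1;  /* NLB is zero-based */
    *bytes = nblocks << sectsz_bits;
    return (*bytes > MAX_DATA_SIZE ? -1 : 0);
}

int
main(void)
{
    struct cmd c = { .cdw10 = 0x100, .cdw11 = 1, .cdw12 = 7 };
    uint64_t lba, bytes;
    int rc = decode_rw(&c, 9, &lba, &bytes);

    printf("lba=0x%llx bytes=%llu rc=%d\n",
        (unsigned long long)lba, (unsigned long long)bytes, rc);
    return (0);
}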
Optimize for the common case @@ -1484,20 +2188,29 @@ nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, * * Note that NVMe Number of Ranges is a zero based value */ - nr = cmd->cdw10 & 0xff; - req->io_req.br_iovcnt = 0; - req->io_req.br_offset = range[0].starting_lba * sectsz; - req->io_req.br_resid = range[0].length * sectsz; + req->io_req.br_offset = offset; + req->io_req.br_resid = bytes; if (nr == 0) { req->io_req.br_callback = pci_nvme_io_done; } else { struct iovec *iov = req->io_req.br_iov; - for (r = 0; r <= nr; r++) { - iov[r].iov_base = (void *)(range[r].starting_lba * sectsz); - iov[r].iov_len = range[r].length * sectsz; + for (r = 0, dr = 0; r <= nr; r++) { + offset = range[r].starting_lba << sectsz_bits; + bytes = range[r].length << sectsz_bits; + if (bytes == 0) + continue; + + if ((nvstore->size - offset) < bytes) { + pci_nvme_status_genc(status, + NVME_SC_LBA_OUT_OF_RANGE); + goto out; + } + iov[dr].iov_base = (void *)offset; + iov[dr].iov_len = bytes; + dr++; } req->io_req.br_callback = pci_nvme_dealloc_sm; @@ -1506,17 +2219,18 @@ nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, * prev_size to track the number of entries */ req->prev_gpaddr = 0; - req->prev_size = r; + req->prev_size = dr; } err = blockif_delete(nvstore->ctx, &req->io_req); if (err) pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); - - free(range); + else + pending = true; } out: - return (err); + free(range); + return (pending); } static void @@ -1525,221 +2239,105 @@ pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) struct nvme_submission_queue *sq; uint16_t status = 0; uint16_t sqhead; - int err; /* handle all submissions up to sq->tail index */ sq = &sc->submit_queues[idx]; - if (atomic_testandset_int(&sq->busy, 1)) { - DPRINTF(("%s sqid %u busy", __func__, idx)); - return; - } + pthread_mutex_lock(&sq->mtx); - sqhead = atomic_load_acq_short(&sq->head); - - DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p", - idx, sqhead, sq->tail, sq->qbase)); + sqhead = sq->head; + DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", + idx, sqhead, sq->tail, sq->qbase); while (sqhead != atomic_load_acq_short(&sq->tail)) { struct nvme_command *cmd; - struct pci_nvme_ioreq *req = NULL; - uint64_t lba; - uint64_t nblocks, bytes, size, cpsz; + struct pci_nvme_ioreq *req; + uint32_t nsid; + bool pending; - /* TODO: support scatter gather list handling */ + pending = false; + req = NULL; + status = 0; cmd = &sq->qbase[sqhead]; sqhead = (sqhead + 1) % sq->size; - lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; - - if (cmd->opc == NVME_OPC_FLUSH) { - pci_nvme_status_genc(&status, NVME_SC_SUCCESS); - pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, - status, 1); - - continue; - } else if (cmd->opc == 0x08) { - /* TODO: write zeroes */ - WPRINTF(("%s write zeroes lba 0x%lx blocks %u", - __func__, lba, cmd->cdw12 & 0xFFFF)); - pci_nvme_status_genc(&status, NVME_SC_SUCCESS); - pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, - status, 1); - - continue; - } - - if (sc->nvstore.type == NVME_STOR_BLOCKIF) { - req = pci_nvme_get_ioreq(sc); - req->nvme_sq = sq; - req->sqid = idx; - } - - if (cmd->opc == NVME_OPC_DATASET_MANAGEMENT) { - if (nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, req, - &status)) { - pci_nvme_set_completion(sc, sq, idx, cmd->cid, - 0, status, 1); - if (req) - pci_nvme_release_ioreq(sc, req); - } - continue; - } - - nblocks = (cmd->cdw12 & 0xFFFF) + 1; - - bytes = nblocks * sc->nvstore.sectsz; - - /* - * If data starts mid-page and flows into the next page, then - * 
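The deallocate path above packs only non-empty ranges into the blockif iov array, repurposing iov_base/iov_len to carry byte offsets and lengths, and counts the packed entries in dr so zero-length ranges (legal per the spec) are skipped. A reduced sketch of that compaction, assuming 512-byte sectors:

#include <stdint.h>
#include <stdio.h>

struct range { uint64_t slba; uint32_t nlb; };
struct ext   { uint64_t off;  uint64_t len; };

/* Convert DSM ranges to byte extents, dropping zero-length entries. */
static unsigned
pack(const struct range *r, unsigned n, struct ext *out, unsigned bits)
{
    unsigned i, dr = 0;

    for (i = 0; i < n; i++) {
        if (r[i].nlb == 0)
            continue;
        out[dr].off = r[i].slba << bits;
        out[dr].len = (uint64_t)r[i].nlb << bits;
        dr++;
    }
    return (dr);
}

int
main(void)
{
    struct range r[3] = { { 0, 8 }, { 100, 0 }, { 200, 16 } };
    struct ext e[3];
    unsigned n = pack(r, 3, e, 9);

    printf("%u extents, first %llu+%llu\n", n,
        (unsigned long long)e[0].off, (unsigned long long)e[0].len);
    return (0);
}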
increase page count - */ - - DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu " - "(%lu-bytes)", - sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size, - cmd->opc == NVME_OPC_WRITE ? - "WRITE" : "READ", - lba, nblocks, bytes)); - - cmd->prp1 &= ~(0x03UL); - cmd->prp2 &= ~(0x03UL); - - DPRINTF((" prp1 0x%lx prp2 0x%lx", cmd->prp1, cmd->prp2)); - - size = bytes; - lba *= sc->nvstore.sectsz; - - cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE); - - if (cpsz > bytes) - cpsz = bytes; - - if (req != NULL) { - req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) | - cmd->cdw10; - req->opc = cmd->opc; - req->cid = cmd->cid; - req->nsid = cmd->nsid; - } - - err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz, - cmd->opc == NVME_OPC_WRITE, lba); - lba += cpsz; - size -= cpsz; - - if (size == 0) - goto iodone; - - if (size <= PAGE_SIZE) { - /* prp2 is second (and final) page in transfer */ - - err = pci_nvme_append_iov_req(sc, req, cmd->prp2, - size, - cmd->opc == NVME_OPC_WRITE, - lba); - } else { - uint64_t *prp_list; - int i; - - /* prp2 is pointer to a physical region page list */ - prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx, - cmd->prp2, PAGE_SIZE); - - i = 0; - while (size != 0) { - cpsz = MIN(size, PAGE_SIZE); - - /* - * Move to linked physical region page list - * in last item. - */ - if (i == (NVME_PRP2_ITEMS-1) && - size > PAGE_SIZE) { - assert((prp_list[i] & (PAGE_SIZE-1)) == 0); - prp_list = paddr_guest2host( - sc->nsc_pi->pi_vmctx, - prp_list[i], PAGE_SIZE); - i = 0; - } - if (prp_list[i] == 0) { - WPRINTF(("PRP2[%d] = 0 !!!", i)); - err = 1; - break; - } - - err = pci_nvme_append_iov_req(sc, req, - prp_list[i], cpsz, - cmd->opc == NVME_OPC_WRITE, lba); - if (err) - break; - - lba += cpsz; - size -= cpsz; - i++; - } - } - -iodone: - if (sc->nvstore.type == NVME_STOR_RAM) { - uint16_t code, status = 0; - - code = err ? 
NVME_SC_LBA_OUT_OF_RANGE : - NVME_SC_SUCCESS; - pci_nvme_status_genc(&status, code); - - pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, - status, 1); + nsid = le32toh(cmd->nsid); + if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { + pci_nvme_status_genc(&status, + NVME_SC_INVALID_NAMESPACE_OR_FORMAT); + status |= + NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT; + goto complete; + } - continue; + req = pci_nvme_get_ioreq(sc); + if (req == NULL) { + pci_nvme_status_genc(&status, + NVME_SC_INTERNAL_DEVICE_ERROR); + WPRINTF("%s: unable to allocate IO req", __func__); + goto complete; } + req->nvme_sq = sq; + req->sqid = idx; + req->opc = cmd->opc; + req->cid = cmd->cid; + req->nsid = cmd->nsid; - - if (err) - goto do_error; - - req->io_req.br_callback = pci_nvme_io_done; - - err = 0; switch (cmd->opc) { + case NVME_OPC_FLUSH: + pending = nvme_opc_flush(sc, cmd, &sc->nvstore, + req, &status); + break; + case NVME_OPC_WRITE: case NVME_OPC_READ: - err = blockif_read(sc->nvstore.ctx, &req->io_req); + pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, + req, &status); break; - case NVME_OPC_WRITE: - err = blockif_write(sc->nvstore.ctx, &req->io_req); + case NVME_OPC_WRITE_ZEROES: + /* TODO: write zeroes + WPRINTF("%s write zeroes lba 0x%lx blocks %u", + __func__, lba, cmd->cdw12 & 0xFFFF); */ + pci_nvme_status_genc(&status, NVME_SC_SUCCESS); break; - default: - WPRINTF(("%s unhandled io command 0x%x", - __func__, cmd->opc)); - err = 1; + case NVME_OPC_DATASET_MANAGEMENT: + pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, + req, &status); + break; + default: + WPRINTF("%s unhandled io command 0x%x", + __func__, cmd->opc); + pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); } - -do_error: - if (err) { - uint16_t status = 0; - - pci_nvme_status_genc(&status, - NVME_SC_DATA_TRANSFER_ERROR); - +complete: + if (!pending) { pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, - status, 1); - pci_nvme_release_ioreq(sc, req); + status); + if (req != NULL) + pci_nvme_release_ioreq(sc, req); } } - atomic_store_short(&sq->head, sqhead); - atomic_store_int(&sq->busy, 0); + sq->head = sqhead; + + pthread_mutex_unlock(&sq->mtx); } static void pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc, uint64_t idx, int is_sq, uint64_t value) { - DPRINTF(("nvme doorbell %lu, %s, val 0x%lx", - idx, is_sq ? "SQ" : "CQ", value & 0xFFFF)); + DPRINTF("nvme doorbell %lu, %s, val 0x%lx", + idx, is_sq ? 
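The I/O path above now validates the namespace ID before allocating a request and, on failure, sets the Do Not Retry (DNR) bit so the guest will not resubmit a command that can never succeed. A sketch of how such a status word is composed (bit positions follow the NVMe completion status layout used in this patch; bit 0 is the phase tag):

#include <stdint.h>
#include <stdio.h>

#define SC_SHIFT    1   /* Status Code, bits 8:1 */
#define SCT_SHIFT   9   /* Status Code Type, bits 11:9 */
#define DNR_SHIFT   15  /* Do Not Retry, bit 15 */

static uint16_t
make_status(uint8_t sct, uint8_t sc, int dnr)
{
    return ((uint16_t)((sct << SCT_SHIFT) | (sc << SC_SHIFT) |
        ((dnr ? 1 : 0) << DNR_SHIFT)));
}

int
main(void)
{
    /* Generic SCT, Invalid Namespace or Format (0x0b), DNR set */
    printf("0x%04x\n", make_status(0, 0x0b, 1));    /* 0x8016 */
    return (0);
}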
"SQ" : "CQ", value & 0xFFFF); if (is_sq) { + if (idx > sc->num_squeues) { + WPRINTF("%s queue index %lu overflow from " + "guest (max %u)", + __func__, idx, sc->num_squeues); + return; + } + atomic_store_short(&sc->submit_queues[idx].tail, (uint16_t)value); @@ -1748,22 +2346,23 @@ pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc, } else { /* submission queue; handle new entries in SQ */ if (idx > sc->num_squeues) { - WPRINTF(("%s SQ index %lu overflow from " + WPRINTF("%s SQ index %lu overflow from " "guest (max %u)", - __func__, idx, sc->num_squeues)); + __func__, idx, sc->num_squeues); return; } pci_nvme_handle_io_cmd(sc, (uint16_t)idx); } } else { if (idx > sc->num_cqueues) { - WPRINTF(("%s queue index %lu overflow from " + WPRINTF("%s queue index %lu overflow from " "guest (max %u)", - __func__, idx, sc->num_cqueues)); + __func__, idx, sc->num_cqueues); return; } - sc->compl_queues[idx].head = (uint16_t)value; + atomic_store_short(&sc->compl_queues[idx].head, + (uint16_t)value); } } @@ -1774,46 +2373,46 @@ pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) switch (offset) { case NVME_CR_CAP_LOW: - DPRINTF(("%s %s NVME_CR_CAP_LOW", func, s)); + DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); break; case NVME_CR_CAP_HI: - DPRINTF(("%s %s NVME_CR_CAP_HI", func, s)); + DPRINTF("%s %s NVME_CR_CAP_HI", func, s); break; case NVME_CR_VS: - DPRINTF(("%s %s NVME_CR_VS", func, s)); + DPRINTF("%s %s NVME_CR_VS", func, s); break; case NVME_CR_INTMS: - DPRINTF(("%s %s NVME_CR_INTMS", func, s)); + DPRINTF("%s %s NVME_CR_INTMS", func, s); break; case NVME_CR_INTMC: - DPRINTF(("%s %s NVME_CR_INTMC", func, s)); + DPRINTF("%s %s NVME_CR_INTMC", func, s); break; case NVME_CR_CC: - DPRINTF(("%s %s NVME_CR_CC", func, s)); + DPRINTF("%s %s NVME_CR_CC", func, s); break; case NVME_CR_CSTS: - DPRINTF(("%s %s NVME_CR_CSTS", func, s)); + DPRINTF("%s %s NVME_CR_CSTS", func, s); break; case NVME_CR_NSSR: - DPRINTF(("%s %s NVME_CR_NSSR", func, s)); + DPRINTF("%s %s NVME_CR_NSSR", func, s); break; case NVME_CR_AQA: - DPRINTF(("%s %s NVME_CR_AQA", func, s)); + DPRINTF("%s %s NVME_CR_AQA", func, s); break; case NVME_CR_ASQ_LOW: - DPRINTF(("%s %s NVME_CR_ASQ_LOW", func, s)); + DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); break; case NVME_CR_ASQ_HI: - DPRINTF(("%s %s NVME_CR_ASQ_HI", func, s)); + DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); break; case NVME_CR_ACQ_LOW: - DPRINTF(("%s %s NVME_CR_ACQ_LOW", func, s)); + DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); break; case NVME_CR_ACQ_HI: - DPRINTF(("%s %s NVME_CR_ACQ_HI", func, s)); + DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); break; default: - DPRINTF(("unknown nvme bar-0 offset 0x%lx", offset)); + DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); } } @@ -1830,9 +2429,9 @@ pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, int is_sq = (belloffset % 8) < 4; if (belloffset > ((sc->max_queues+1) * 8 - 4)) { - WPRINTF(("guest attempted an overflow write offset " + WPRINTF("guest attempted an overflow write offset " "0x%lx, val 0x%lx in %s", - offset, value, __func__)); + offset, value, __func__); return; } @@ -1840,13 +2439,13 @@ pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, return; } - DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx", - offset, size, value)); + DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", + offset, size, value); if (size != 4) { - WPRINTF(("guest wrote invalid size %d (offset 0x%lx, " + WPRINTF("guest wrote invalid size %d (offset 0x%lx, " "val 0x%lx) to bar0 in %s", - 
size, offset, value, __func__)); + size, offset, value, __func__); /* TODO: shutdown device */ return; } @@ -1872,12 +2471,12 @@ pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, case NVME_CR_CC: ccreg = (uint32_t)value; - DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u " + DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " "iocqes %u", __func__, NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), - NVME_CC_GET_IOCQES(ccreg))); + NVME_CC_GET_IOCQES(ccreg)); if (NVME_CC_GET_SHN(ccreg)) { /* perform shutdown - flush out data to backend */ @@ -1931,8 +2530,8 @@ pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, (value << 32); break; default: - DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d", - __func__, offset, value, size)); + DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", + __func__, offset, value, size); } pthread_mutex_unlock(&sc->mtx); } @@ -1945,8 +2544,8 @@ pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { - DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, " - " value 0x%lx", baridx, offset, size, value)); + DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " + " value 0x%lx", baridx, offset, size, value); pci_emul_msix_twrite(pi, offset, size, value); return; @@ -1958,8 +2557,8 @@ pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, break; default: - DPRINTF(("%s unknown baridx %d, val 0x%lx", - __func__, baridx, value)); + DPRINTF("%s unknown baridx %d, val 0x%lx", + __func__, baridx, value); } } @@ -1977,7 +2576,7 @@ static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, pthread_mutex_unlock(&sc->mtx); } else { value = 0; - WPRINTF(("pci_nvme: read invalid offset %ld", offset)); + WPRINTF("pci_nvme: read invalid offset %ld", offset); } switch (size) { @@ -1992,8 +2591,8 @@ static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, break; } - DPRINTF((" nvme-read offset 0x%lx, size %d -> value 0x%x", - offset, size, (uint32_t)value)); + DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", + offset, size, (uint32_t)value); return (value); } @@ -2008,8 +2607,8 @@ pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { - DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d", - baridx, offset, size)); + DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", + baridx, offset, size); return pci_emul_msix_tread(pi, offset, size); } @@ -2019,7 +2618,7 @@ pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, return pci_nvme_read_bar_0(sc, offset, size); default: - DPRINTF(("unknown bar %d, 0x%lx", baridx, offset)); + DPRINTF("unknown bar %d, 0x%lx", baridx, offset); } return (0); @@ -2162,10 +2761,7 @@ pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); for (int i = 0; i < sc->ioslots; i++) { STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); - pthread_mutex_init(&sc->ioreqs[i].mtx, NULL); - pthread_cond_init(&sc->ioreqs[i].cv, NULL); } - sc->intr_coales_aggr_thresh = 1; pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); @@ -2185,30 +2781,30 @@ pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) 2 * sizeof(uint32_t) * (sc->max_queues + 1); pci_membar_sz = MAX(pci_membar_sz, 
NVME_MMIO_SPACE_MIN); - DPRINTF(("nvme membar size: %u", pci_membar_sz)); + DPRINTF("nvme membar size: %u", pci_membar_sz); error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); if (error) { - WPRINTF(("%s pci alloc mem bar failed", __func__)); + WPRINTF("%s pci alloc mem bar failed", __func__); goto done; } error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); if (error) { - WPRINTF(("%s pci add msixcap failed", __func__)); + WPRINTF("%s pci add msixcap failed", __func__); goto done; } error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); if (error) { - WPRINTF(("%s pci add Express capability failed", __func__)); + WPRINTF("%s pci add Express capability failed", __func__); goto done; } pthread_mutex_init(&sc->mtx, NULL); sem_init(&sc->iosemlock, 0, sc->ioslots); - pci_nvme_reset(sc); + pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); /* * Controller data depends on Namespace data so initialize Namespace * data first. @@ -2216,6 +2812,11 @@ pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); pci_nvme_init_ctrldata(sc); pci_nvme_init_logpages(sc); + pci_nvme_init_features(sc); + + pci_nvme_aer_init(sc); + + pci_nvme_reset(sc); pci_lintr_request(pi); diff --git a/usr/src/cmd/bhyve/pci_passthru.c b/usr/src/cmd/bhyve/pci_passthru.c index 664d07b731..c777c56cb1 100644 --- a/usr/src/cmd/bhyve/pci_passthru.c +++ b/usr/src/cmd/bhyve/pci_passthru.c @@ -622,7 +622,7 @@ cfginitbar(struct vmctx *ctx, struct passthru_softc *sc) sc->psc_bar[i].addr = base; /* Allocate the BAR in the guest I/O or MMIO space */ - error = pci_emul_alloc_pbar(pi, i, base, bartype, size); + error = pci_emul_alloc_bar(pi, i, bartype, size); if (error) return (-1); @@ -849,6 +849,10 @@ passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, if (error) err(1, "vm_setup_pptdev_msix"); } + } else { + error = vm_disable_pptdev_msix(ctx, sc->pptfd); + if (error) + err(1, "vm_disable_pptdev_msix"); } return (0); } diff --git a/usr/src/cmd/bhyve/pci_virtio_block.c b/usr/src/cmd/bhyve/pci_virtio_block.c index a34bd864be..27d743a770 100644 --- a/usr/src/cmd/bhyve/pci_virtio_block.c +++ b/usr/src/cmd/bhyve/pci_virtio_block.c @@ -510,7 +510,7 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) sc->vbsc_cfg.vbc_writeback = 0; sc->vbsc_cfg.max_discard_sectors = VTBLK_MAX_DISCARD_SECT; sc->vbsc_cfg.max_discard_seg = VTBLK_MAX_DISCARD_SEG; - sc->vbsc_cfg.discard_sector_alignment = sectsz / VTBLK_BSIZE; + sc->vbsc_cfg.discard_sector_alignment = MAX(sectsz, sts) / VTBLK_BSIZE; /* * Should we move some of this into virtio.c? Could diff --git a/usr/src/cmd/bhyve/pci_virtio_net.c b/usr/src/cmd/bhyve/pci_virtio_net.c index 3a1cc46a06..ded9ca90ea 100644 --- a/usr/src/cmd/bhyve/pci_virtio_net.c +++ b/usr/src/cmd/bhyve/pci_virtio_net.c @@ -179,6 +179,7 @@ struct pci_vtnet_softc { struct nm_desc *vsc_nmd; int vsc_rx_ready; + bool features_negotiated; /* protected by rx_mtx */ int resetting; /* protected by tx_mtx */ uint64_t vsc_features; /* negotiated features */ @@ -228,6 +229,8 @@ pci_vtnet_reset(void *vsc) /* Acquire the RX lock to block RX processing. */ pthread_mutex_lock(&sc->rx_mtx); + sc->features_negotiated = false; + /* Set sc->resetting and give a chance to the TX thread to stop. 
*/ pthread_mutex_lock(&sc->tx_mtx); sc->resetting = 1; @@ -348,6 +351,11 @@ pci_vtnet_tap_rx(struct pci_vtnet_softc *sc) assert(sc->vsc_dlpifd != -1); #endif + /* Features must be negotiated */ + if (!sc->features_negotiated) { + return; + } + /* * But, will be called when the rx ring hasn't yet * been set up. @@ -558,6 +566,11 @@ pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc) */ assert(sc->vsc_nmd != NULL); + /* Features must be negotiated */ + if (!sc->features_negotiated) { + return; + } + /* * But, will be called when the rx ring hasn't yet * been set up. @@ -678,11 +691,14 @@ pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq) /* * A qnotify means that the rx process can now begin. + * Enable RX only if features are negotiated. */ - if (sc->vsc_rx_ready == 0) { + pthread_mutex_lock(&sc->rx_mtx); + if (sc->vsc_rx_ready == 0 && sc->features_negotiated) { sc->vsc_rx_ready = 1; vq_kick_disable(vq); } + pthread_mutex_unlock(&sc->rx_mtx); } static void @@ -1132,6 +1148,10 @@ pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features) /* non-merge rx header is 2 bytes shorter */ sc->rx_vhdrlen -= 2; } + + pthread_mutex_lock(&sc->rx_mtx); + sc->features_negotiated = true; + pthread_mutex_unlock(&sc->rx_mtx); } struct pci_devemu pci_de_vnet = { diff --git a/usr/src/cmd/bhyve/pci_xhci.c b/usr/src/cmd/bhyve/pci_xhci.c index b92be4dec3..587e80a91c 100644 --- a/usr/src/cmd/bhyve/pci_xhci.c +++ b/usr/src/cmd/bhyve/pci_xhci.c @@ -1849,6 +1849,9 @@ retry: DPRINTF(("pci_xhci[%d]: xfer->ndata %u", __LINE__, xfer->ndata)); + if (xfer->ndata <= 0) + goto errout; + if (epid == 1) { err = USB_ERR_NOT_STARTED; if (dev->dev_ue->ue_request != NULL) @@ -1863,6 +1866,7 @@ retry: err = USB_TO_XHCI_ERR(err); if ((err == XHCI_TRB_ERROR_SUCCESS) || + (err == XHCI_TRB_ERROR_STALL) || (err == XHCI_TRB_ERROR_SHORT_PKT)) { err = pci_xhci_xfer_complete(sc, xfer, slot, epid, &do_intr); if (err != XHCI_TRB_ERROR_SUCCESS) @@ -2813,7 +2817,8 @@ pci_xhci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) sc->hcsparams2 = XHCI_SET_HCSP2_ERSTMAX(XHCI_ERST_MAX) | XHCI_SET_HCSP2_IST(0x04); sc->hcsparams3 = 0; /* no latency */ - sc->hccparams1 = XHCI_SET_HCCP1_NSS(1) | /* no 2nd-streams */ + sc->hccparams1 = XHCI_SET_HCCP1_AC64(1) | /* 64-bit addrs */ + XHCI_SET_HCCP1_NSS(1) | /* no 2nd-streams */ XHCI_SET_HCCP1_SPC(1) | /* short packet */ XHCI_SET_HCCP1_MAXPSA(XHCI_STREAMS_MAX); sc->hccparams2 = XHCI_SET_HCCP2_LEC(1) | diff --git a/usr/src/cmd/bhyve/pctestdev.c b/usr/src/cmd/bhyve/pctestdev.c new file mode 100644 index 0000000000..be445e5c75 --- /dev/null +++ b/usr/src/cmd/bhyve/pctestdev.c @@ -0,0 +1,270 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Adam Fenn <adam@fenn.io> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Emulation of selected legacy test/debug interfaces expected by KVM-unit-tests + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/mman.h> +#include <machine/vmm.h> + +#include <assert.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <vmmapi.h> + +#include "debug.h" +#include "inout.h" +#include "mem.h" +#include "pctestdev.h" + +#define DEBUGEXIT_BASE 0xf4 +#define DEBUGEXIT_LEN 4 +#define DEBUGEXIT_NAME "isa-debug-exit" + +#define IOMEM_BASE 0xff000000 +#define IOMEM_LEN 0x10000 +#define IOMEM_NAME "pc-testdev-iomem" + +#define IOPORT_BASE 0xe0 +#define IOPORT_LEN 4 +#define IOPORT_NAME "pc-testdev-ioport" + +#define IRQ_BASE 0x2000 +#define IRQ_IOAPIC_PINCOUNT_MIN 24 +#define IRQ_IOAPIC_PINCOUNT_MAX 32 +#define IRQ_NAME "pc-testdev-irq-line" + +#define PCTESTDEV_NAME "pc-testdev" + +static bool pctestdev_inited; +static uint8_t pctestdev_iomem_buf[IOMEM_LEN]; +static uint32_t pctestdev_ioport_data; + +static int pctestdev_debugexit_io(struct vmctx *ctx, int vcpu, int in, + int port, int bytes, uint32_t *eax, void *arg); +static int pctestdev_iomem_io(struct vmctx *ctx, int vcpu, int dir, + uint64_t addr, int size, uint64_t *val, void *arg1, + long arg2); +static int pctestdev_ioport_io(struct vmctx *ctx, int vcpu, int in, + int port, int bytes, uint32_t *eax, void *arg); +static int pctestdev_irq_io(struct vmctx *ctx, int vcpu, int in, + int port, int bytes, uint32_t *eax, void *arg); + +const char * +pctestdev_getname(void) +{ + return (PCTESTDEV_NAME); +} + +int +pctestdev_parse(const char *opts) +{ + if (opts != NULL && *opts != '\0') + return (-1); + + return (0); +} + +int +pctestdev_init(struct vmctx *ctx) +{ + struct mem_range iomem; + struct inout_port debugexit, ioport, irq; + int err, pincount; + + if (pctestdev_inited) { + EPRINTLN("Only one pc-testdev device is allowed."); + + return (-1); + } + + err = vm_ioapic_pincount(ctx, &pincount); + if (err != 0) { + EPRINTLN("pc-testdev: Failed to obtain IOAPIC pin count."); + + return (-1); + } + if (pincount < IRQ_IOAPIC_PINCOUNT_MIN || + pincount > IRQ_IOAPIC_PINCOUNT_MAX) { + EPRINTLN("pc-testdev: Unsupported IOAPIC pin count: %d.", + pincount); + + return (-1); + } + + debugexit.name = DEBUGEXIT_NAME; + debugexit.port = DEBUGEXIT_BASE; + debugexit.size = DEBUGEXIT_LEN; + debugexit.flags = IOPORT_F_INOUT; + debugexit.handler = pctestdev_debugexit_io; + debugexit.arg = NULL; + + iomem.name = IOMEM_NAME; + iomem.flags = MEM_F_RW | MEM_F_IMMUTABLE; + iomem.handler = pctestdev_iomem_io; + iomem.arg1 = NULL; + iomem.arg2 = 0; + iomem.base = IOMEM_BASE; + iomem.size = IOMEM_LEN; + + ioport.name = IOPORT_NAME; + ioport.port = IOPORT_BASE; + ioport.size = IOPORT_LEN; + ioport.flags = IOPORT_F_INOUT; + ioport.handler = pctestdev_ioport_io; + ioport.arg = NULL; + + irq.name = IRQ_NAME; + irq.port = IRQ_BASE; + irq.size = pincount; + irq.flags = IOPORT_F_INOUT; + irq.handler = pctestdev_irq_io; + irq.arg = 
NULL; + + err = register_inout(&debugexit); + if (err != 0) + goto fail; + + err = register_inout(&ioport); + if (err != 0) + goto fail_after_debugexit_reg; + + err = register_inout(&irq); + if (err != 0) + goto fail_after_ioport_reg; + + err = register_mem(&iomem); + if (err != 0) + goto fail_after_irq_reg; + + pctestdev_inited = true; + + return (0); + +fail_after_irq_reg: + (void)unregister_inout(&irq); + +fail_after_ioport_reg: + (void)unregister_inout(&ioport); + +fail_after_debugexit_reg: + (void)unregister_inout(&debugexit); + +fail: + return (err); +} + +static int +pctestdev_debugexit_io(struct vmctx *ctx, int vcpu, int in, int port, + int bytes, uint32_t *eax, void *arg) +{ + if (in) + *eax = 0; + else + exit((*eax << 1) | 1); + + return (0); +} + +static int +pctestdev_iomem_io(struct vmctx *ctx, int vcpu, int dir, uint64_t addr, + int size, uint64_t *val, void *arg1, long arg2) +{ + uint64_t offset; + + if (addr + size > IOMEM_BASE + IOMEM_LEN) + return (-1); + + offset = addr - IOMEM_BASE; + if (dir == MEM_F_READ) { + (void)memcpy(val, pctestdev_iomem_buf + offset, size); + } else { + assert(dir == MEM_F_WRITE); + (void)memcpy(pctestdev_iomem_buf + offset, val, size); + } + + return (0); +} + +static int +pctestdev_ioport_io(struct vmctx *ctx, int vcpu, int in, int port, + int bytes, uint32_t *eax, void *arg) +{ + uint32_t mask; + int lsb; + + if (port + bytes > IOPORT_BASE + IOPORT_LEN) + return (-1); + + lsb = (port & 0x3) * 8; + mask = (-1UL >> (32 - (bytes * 8))) << lsb; + + if (in) + *eax = (pctestdev_ioport_data & mask) >> lsb; + else { + pctestdev_ioport_data &= ~mask; + pctestdev_ioport_data |= *eax << lsb; + } + + return (0); +} + +static int +pctestdev_irq_io(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int irq; + + if (bytes != 1) + return (-1); + + if (in) { + *eax = 0; + return (0); + } else { + irq = port - IRQ_BASE; + if (irq < 16) { + if (*eax) + return (vm_isa_assert_irq(ctx, irq, irq)); + else + return (vm_isa_deassert_irq(ctx, irq, irq)); + } else { + if (*eax) + return (vm_ioapic_assert_irq(ctx, irq)); + else + return (vm_ioapic_deassert_irq(ctx, irq)); + } + } +} diff --git a/usr/src/cmd/bhyve/pctestdev.h b/usr/src/cmd/bhyve/pctestdev.h new file mode 100644 index 0000000000..c1c940146e --- /dev/null +++ b/usr/src/cmd/bhyve/pctestdev.h @@ -0,0 +1,43 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Adam Fenn <adam@fenn.io> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
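pctestdev_ioport_io above emulates one 32-bit register reachable at any of four consecutive ports with 1-, 2-, or 4-byte accesses: the low two port bits select the starting byte lane, and a width-derived mask isolates the lanes touched. A worked example of the lane math (the mask here is built from a 32-bit constant so 1-byte accesses mask correctly on LP64 as well):

#include <stdint.h>
#include <stdio.h>

static uint32_t reg;    /* backing 32-bit register, ports 0xe0-0xe3 */

static uint32_t
lane_mask(int port, int bytes, int *lsb)
{
    *lsb = (port & 0x3) * 8;
    return ((0xffffffffu >> (32 - bytes * 8)) << *lsb);
}

static uint32_t
rd(int port, int bytes)
{
    int lsb;
    uint32_t mask = lane_mask(port, bytes, &lsb);

    return ((reg & mask) >> lsb);
}

static void
wr(int port, int bytes, uint32_t val)
{
    int lsb;
    uint32_t mask = lane_mask(port, bytes, &lsb);

    reg = (reg & ~mask) | ((val << lsb) & mask);
}

int
main(void)
{
    wr(0xe0, 4, 0xaabbccdd);
    printf("0x%02x\n", rd(0xe2, 1));    /* byte lane 2: 0xbb */
    wr(0xe1, 2, 0x1122);                /* lanes 1 and 2 */
    printf("0x%08x\n", rd(0xe0, 4));    /* 0xaa1122dd */
    return (0);
}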
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Emulation of selected legacy test/debug interfaces expected by KVM-unit-tests + */ + +#ifndef _PCTESTDEV_H_ +#define _PCTESTDEV_H_ + +struct vmctx; + +const char *pctestdev_getname(void); +int pctestdev_init(struct vmctx *ctx); +int pctestdev_parse(const char *opts); + +#endif diff --git a/usr/src/cmd/bhyve/pm.c b/usr/src/cmd/bhyve/pm.c index fa162faab1..d2732242f9 100644 --- a/usr/src/cmd/bhyve/pm.c +++ b/usr/src/cmd/bhyve/pm.c @@ -211,7 +211,7 @@ pm1_enable_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, * the global lock, but ACPI-CA whines profusely if it * can't set GBL_EN. */ - pm1_enable = *eax & (PM1_PWRBTN_EN | PM1_GBL_EN); + pm1_enable = *eax & (PM1_RTC_EN | PM1_PWRBTN_EN | PM1_GBL_EN); sci_update(ctx); } pthread_mutex_unlock(&pm_lock); diff --git a/usr/src/cmd/bhyve/smbiostbl.c b/usr/src/cmd/bhyve/smbiostbl.c index 8af8a85755..3df2012f10 100644 --- a/usr/src/cmd/bhyve/smbiostbl.c +++ b/usr/src/cmd/bhyve/smbiostbl.c @@ -52,6 +52,10 @@ __FBSDID("$FreeBSD$"); #define SMBIOS_BASE 0xF1000 +#define FIRMWARE_VERSION "13.0" +/* The SMBIOS specification defines the date format to be mm/dd/yyyy */ +#define FIRMWARE_RELEASE_DATE "11/10/2020" + /* BHYVE_ACPI_BASE - SMBIOS_BASE) */ #define SMBIOS_MAX_LENGTH (0xF2400 - 0xF1000) @@ -324,9 +328,9 @@ struct smbios_table_type0 smbios_type0_template = { }; const char *smbios_type0_strings[] = { - "BHYVE", /* vendor string */ - "1.00", /* bios version string */ - "03/14/2014", /* bios release date string */ + "BHYVE", /* vendor string */ + FIRMWARE_VERSION, /* bios version string */ + FIRMWARE_RELEASE_DATE, /* bios release date string */ NULL }; @@ -347,12 +351,12 @@ static int smbios_type1_initializer(struct smbios_structure *template_entry, uint16_t *n, uint16_t *size); const char *smbios_type1_strings[] = { - " ", /* manufacturer string */ - "BHYVE", /* product name string */ - "1.0", /* version string */ - "None", /* serial number string */ - "None", /* sku string */ - " ", /* family name string */ + "illumos", /* manufacturer string */ + "BHYVE", /* product name string */ + "1.0", /* version string */ + "None", /* serial number string */ + "None", /* sku string */ + "Virtual Machine", /* family name string */ NULL }; @@ -375,7 +379,7 @@ struct smbios_table_type3 smbios_type3_template = { }; const char *smbios_type3_strings[] = { - " ", /* manufacturer string */ + "illumos", /* manufacturer string */ "1.0", /* version string */ "None", /* serial number string */ "None", /* asset tag string */ @@ -755,7 +759,7 @@ smbios_type19_initializer(struct smbios_structure *template_entry, type19 = (struct smbios_table_type19 *)curaddr; type19->arrayhand = type16_handle; type19->xsaddr = 4*GB; - type19->xeaddr = guest_himem; + type19->xeaddr = type19->xsaddr + guest_himem; } return (0); diff --git a/usr/src/cmd/bhyve/usb_mouse.c b/usr/src/cmd/bhyve/usb_mouse.c index 8af86fcdc7..7790fe0ec9 100644 --- a/usr/src/cmd/bhyve/usb_mouse.c +++ b/usr/src/cmd/bhyve/usb_mouse.c @@ -72,7 +72,7 
@@ enum { }; static const char *umouse_desc_strings[] = { - "\x04\x09", + "\x09\x04", "BHYVE", "HID Tablet", "01", @@ -388,7 +388,7 @@ umouse_request(void *scarg, struct usb_data_xfer *xfer) "sizeof(umouse_dev_desc) %lu", len, sizeof(umouse_dev_desc))); if ((value & 0xFF) != 0) { - err = USB_ERR_IOERROR; + err = USB_ERR_STALLED; goto done; } if (len > sizeof(umouse_dev_desc)) { @@ -403,7 +403,7 @@ umouse_request(void *scarg, struct usb_data_xfer *xfer) case UDESC_CONFIG: DPRINTF(("umouse: (->UDESC_CONFIG)")); if ((value & 0xFF) != 0) { - err = USB_ERR_IOERROR; + err = USB_ERR_STALLED; goto done; } if (len > sizeof(umouse_confd)) { @@ -472,7 +472,7 @@ umouse_request(void *scarg, struct usb_data_xfer *xfer) default: DPRINTF(("umouse: unknown(%d)->ERROR", value >> 8)); - err = USB_ERR_IOERROR; + err = USB_ERR_STALLED; goto done; } eshort = data->blen > 0; @@ -496,7 +496,7 @@ umouse_request(void *scarg, struct usb_data_xfer *xfer) break; default: DPRINTF(("umouse: IO ERROR")); - err = USB_ERR_IOERROR; + err = USB_ERR_STALLED; goto done; } eshort = data->blen > 0; @@ -507,7 +507,7 @@ umouse_request(void *scarg, struct usb_data_xfer *xfer) if (index != 0) { DPRINTF(("umouse get_interface, invalid index %d", index)); - err = USB_ERR_IOERROR; + err = USB_ERR_STALLED; goto done; } @@ -578,7 +578,7 @@ umouse_request(void *scarg, struct usb_data_xfer *xfer) case UREQ(UR_SET_FEATURE, UT_WRITE_INTERFACE): case UREQ(UR_SET_FEATURE, UT_WRITE_ENDPOINT): DPRINTF(("umouse: (UR_CLEAR_FEATURE, UT_WRITE_INTERFACE)")); - err = USB_ERR_IOERROR; + err = USB_ERR_STALLED; goto done; case UREQ(UR_SET_INTERFACE, UT_WRITE_INTERFACE): @@ -617,7 +617,7 @@ umouse_request(void *scarg, struct usb_data_xfer *xfer) memcpy(data->buf, &sc->um_report, len); data->bdone += len; } else { - err = USB_ERR_IOERROR; + err = USB_ERR_STALLED; goto done; } eshort = data->blen > 0; @@ -659,7 +659,7 @@ umouse_request(void *scarg, struct usb_data_xfer *xfer) default: DPRINTF(("**** umouse request unhandled")); - err = USB_ERR_IOERROR; + err = USB_ERR_STALLED; break; } diff --git a/usr/src/contrib/bhyve/dev/nvme/nvme.h b/usr/src/contrib/bhyve/dev/nvme/nvme.h index c7f6496426..6fbf2b758f 100644 --- a/usr/src/contrib/bhyve/dev/nvme/nvme.h +++ b/usr/src/contrib/bhyve/dev/nvme/nvme.h @@ -30,6 +30,18 @@ * Copyright 2019 Joyent, Inc. */ +/* + * illumos port notes: + * + * The upstream version of this file uses conditionals of the form + * #if _BYTE_ORDER != _LITTLE_ENDIAN + * Rather than keep this file in compat with only that little bit changed, + * this is locally patched below. + * + * There is also a static assertion which has been commented out due to a + * problem with smatch. + */ + #ifndef __NVME_H__ #define __NVME_H__ @@ -42,6 +54,8 @@ #define NVME_PASSTHROUGH_CMD _IOWR('n', 0, struct nvme_pt_command) #define NVME_RESET_CONTROLLER _IO('n', 1) +#define NVME_GET_NSID _IOR('n', 2, struct nvme_get_nsid) +#define NVME_GET_MAX_XFER_SIZE _IOR('n', 3, uint64_t) #define NVME_IO_TEST _IOWR('n', 100, struct nvme_io_test) #define NVME_BIO_TEST _IOWR('n', 101, struct nvme_io_test) @@ -59,8 +73,8 @@ */ #define NVME_GLOBAL_NAMESPACE_TAG ((uint32_t)0xFFFFFFFF) -/* Cap nvme to 1MB transfers driver explodes with larger sizes */ -#define NVME_MAX_XFER_SIZE (MAXPHYS < (1<<20) ? MAXPHYS : (1<<20)) +/* Cap transfers by the maximum addressable by page-sized PRP (4KB -> 2MB). 
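The usb_mouse string-descriptor change above ("\x04\x09" to "\x09\x04") is a byte-order fix: string descriptor index 0 carries a list of 16-bit LANGID codes, and 0x0409 (US English) must be stored little-endian, low byte first. A quick check:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    const unsigned char langid[] = "\x09\x04";  /* corrected order */
    uint16_t code;

    /* USB descriptors are little-endian; assemble accordingly. */
    code = (uint16_t)(langid[0] | (langid[1] << 8));
    printf("LANGID 0x%04x\n", code);    /* 0x0409, en-US */
    return (0);
}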
*/ +#define NVME_MAX_XFER_SIZE MIN(maxphys, (PAGE_SIZE/8*PAGE_SIZE)) /* Register field definitions */ #define NVME_CAP_LO_REG_MQES_SHIFT (0) @@ -71,15 +85,51 @@ #define NVME_CAP_LO_REG_AMS_MASK (0x3) #define NVME_CAP_LO_REG_TO_SHIFT (24) #define NVME_CAP_LO_REG_TO_MASK (0xFF) +#define NVME_CAP_LO_MQES(x) \ + (((x) >> NVME_CAP_LO_REG_MQES_SHIFT) & NVME_CAP_LO_REG_MQES_MASK) +#define NVME_CAP_LO_CQR(x) \ + (((x) >> NVME_CAP_LO_REG_CQR_SHIFT) & NVME_CAP_LO_REG_CQR_MASK) +#define NVME_CAP_LO_AMS(x) \ + (((x) >> NVME_CAP_LO_REG_AMS_SHIFT) & NVME_CAP_LO_REG_AMS_MASK) +#define NVME_CAP_LO_TO(x) \ + (((x) >> NVME_CAP_LO_REG_TO_SHIFT) & NVME_CAP_LO_REG_TO_MASK) #define NVME_CAP_HI_REG_DSTRD_SHIFT (0) #define NVME_CAP_HI_REG_DSTRD_MASK (0xF) +#define NVME_CAP_HI_REG_NSSRS_SHIFT (4) +#define NVME_CAP_HI_REG_NSSRS_MASK (0x1) +#define NVME_CAP_HI_REG_CSS_SHIFT (5) +#define NVME_CAP_HI_REG_CSS_MASK (0xff) #define NVME_CAP_HI_REG_CSS_NVM_SHIFT (5) #define NVME_CAP_HI_REG_CSS_NVM_MASK (0x1) +#define NVME_CAP_HI_REG_BPS_SHIFT (13) +#define NVME_CAP_HI_REG_BPS_MASK (0x1) #define NVME_CAP_HI_REG_MPSMIN_SHIFT (16) #define NVME_CAP_HI_REG_MPSMIN_MASK (0xF) #define NVME_CAP_HI_REG_MPSMAX_SHIFT (20) #define NVME_CAP_HI_REG_MPSMAX_MASK (0xF) +#define NVME_CAP_HI_REG_PMRS_SHIFT (24) +#define NVME_CAP_HI_REG_PMRS_MASK (0x1) +#define NVME_CAP_HI_REG_CMBS_SHIFT (25) +#define NVME_CAP_HI_REG_CMBS_MASK (0x1) +#define NVME_CAP_HI_DSTRD(x) \ + (((x) >> NVME_CAP_HI_REG_DSTRD_SHIFT) & NVME_CAP_HI_REG_DSTRD_MASK) +#define NVME_CAP_HI_NSSRS(x) \ + (((x) >> NVME_CAP_HI_REG_NSSRS_SHIFT) & NVME_CAP_HI_REG_NSSRS_MASK) +#define NVME_CAP_HI_CSS(x) \ + (((x) >> NVME_CAP_HI_REG_CSS_SHIFT) & NVME_CAP_HI_REG_CSS_MASK) +#define NVME_CAP_HI_CSS_NVM(x) \ + (((x) >> NVME_CAP_HI_REG_CSS_NVM_SHIFT) & NVME_CAP_HI_REG_CSS_NVM_MASK) +#define NVME_CAP_HI_BPS(x) \ + (((x) >> NVME_CAP_HI_REG_BPS_SHIFT) & NVME_CAP_HI_REG_BPS_MASK) +#define NVME_CAP_HI_MPSMIN(x) \ + (((x) >> NVME_CAP_HI_REG_MPSMIN_SHIFT) & NVME_CAP_HI_REG_MPSMIN_MASK) +#define NVME_CAP_HI_MPSMAX(x) \ + (((x) >> NVME_CAP_HI_REG_MPSMAX_SHIFT) & NVME_CAP_HI_REG_MPSMAX_MASK) +#define NVME_CAP_HI_PMRS(x) \ + (((x) >> NVME_CAP_HI_REG_PMRS_SHIFT) & NVME_CAP_HI_REG_PMRS_MASK) +#define NVME_CAP_HI_CMBS(x) \ + (((x) >> NVME_CAP_HI_REG_CMBS_SHIFT) & NVME_CAP_HI_REG_CMBS_MASK) #define NVME_CC_REG_EN_SHIFT (0) #define NVME_CC_REG_EN_MASK (0x1) @@ -102,6 +152,10 @@ #define NVME_CSTS_REG_CFS_MASK (0x1) #define NVME_CSTS_REG_SHST_SHIFT (2) #define NVME_CSTS_REG_SHST_MASK (0x3) +#define NVME_CSTS_REG_NVSRO_SHIFT (4) +#define NVME_CSTS_REG_NVSRO_MASK (0x1) +#define NVME_CSTS_REG_PP_SHIFT (5) +#define NVME_CSTS_REG_PP_MASK (0x1) #define NVME_CSTS_GET_SHST(csts) (((csts) >> NVME_CSTS_REG_SHST_SHIFT) & NVME_CSTS_REG_SHST_MASK) @@ -110,6 +164,36 @@ #define NVME_AQA_REG_ACQS_SHIFT (16) #define NVME_AQA_REG_ACQS_MASK (0xFFF) +#define NVME_PMRCAP_REG_RDS_SHIFT (3) +#define NVME_PMRCAP_REG_RDS_MASK (0x1) +#define NVME_PMRCAP_REG_WDS_SHIFT (4) +#define NVME_PMRCAP_REG_WDS_MASK (0x1) +#define NVME_PMRCAP_REG_BIR_SHIFT (5) +#define NVME_PMRCAP_REG_BIR_MASK (0x7) +#define NVME_PMRCAP_REG_PMRTU_SHIFT (8) +#define NVME_PMRCAP_REG_PMRTU_MASK (0x3) +#define NVME_PMRCAP_REG_PMRWBM_SHIFT (10) +#define NVME_PMRCAP_REG_PMRWBM_MASK (0xf) +#define NVME_PMRCAP_REG_PMRTO_SHIFT (16) +#define NVME_PMRCAP_REG_PMRTO_MASK (0xff) +#define NVME_PMRCAP_REG_CMSS_SHIFT (24) +#define NVME_PMRCAP_REG_CMSS_MASK (0x1) + +#define NVME_PMRCAP_RDS(x) \ + (((x) >> NVME_PMRCAP_REG_RDS_SHIFT) & NVME_PMRCAP_REG_RDS_MASK) +#define 
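
Sync note (nvme.h, NVME_MAX_XFER_SIZE): the new cap follows from PRP arithmetic. A page-sized PRP list holds PAGE_SIZE/8 eight-byte entries and each entry maps one page, so with 4 KiB pages a single list describes at most 512 * 4 KiB = 2 MiB (ignoring the PRP1 offset case). A worked check of that arithmetic:

    #include <assert.h>

    #define PAGE_SIZE       4096UL

    int
    main(void)
    {
            unsigned long entries = PAGE_SIZE / 8;          /* 512 PRP entries */
            unsigned long max_xfer = entries * PAGE_SIZE;   /* bytes per list */

            assert(max_xfer == 2UL * 1024 * 1024);          /* 2 MiB */
            return (0);
    }
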
NVME_PMRCAP_WDS(x) \ + (((x) >> NVME_PMRCAP_REG_WDS_SHIFT) & NVME_PMRCAP_REG_WDS_MASK) +#define NVME_PMRCAP_BIR(x) \ + (((x) >> NVME_PMRCAP_REG_BIR_SHIFT) & NVME_PMRCAP_REG_BIR_MASK) +#define NVME_PMRCAP_PMRTU(x) \ + (((x) >> NVME_PMRCAP_REG_PMRTU_SHIFT) & NVME_PMRCAP_REG_PMRTU_MASK) +#define NVME_PMRCAP_PMRWBM(x) \ + (((x) >> NVME_PMRCAP_REG_PMRWBM_SHIFT) & NVME_PMRCAP_REG_PMRWBM_MASK) +#define NVME_PMRCAP_PMRTO(x) \ + (((x) >> NVME_PMRCAP_REG_PMRTO_SHIFT) & NVME_PMRCAP_REG_PMRTO_MASK) +#define NVME_PMRCAP_CMSS(x) \ + (((x) >> NVME_PMRCAP_REG_CMSS_SHIFT) & NVME_PMRCAP_REG_CMSS_MASK) + /* Command field definitions */ #define NVME_CMD_FUSE_SHIFT (8) @@ -121,6 +205,8 @@ #define NVME_STATUS_SC_MASK (0xFF) #define NVME_STATUS_SCT_SHIFT (9) #define NVME_STATUS_SCT_MASK (0x7) +#define NVME_STATUS_CRD_SHIFT (12) +#define NVME_STATUS_CRD_MASK (0x3) #define NVME_STATUS_M_SHIFT (14) #define NVME_STATUS_M_MASK (0x1) #define NVME_STATUS_DNR_SHIFT (15) @@ -161,6 +247,9 @@ /* SR-IOV Virtual Function */ #define NVME_CTRLR_DATA_MIC_SRIOVVF_SHIFT (2) #define NVME_CTRLR_DATA_MIC_SRIOVVF_MASK (0x1) +/* Asymmetric Namespace Access Reporting */ +#define NVME_CTRLR_DATA_MIC_ANAR_SHIFT (3) +#define NVME_CTRLR_DATA_MIC_ANAR_MASK (0x1) /** OACS - optional admin command support */ /* supports security send/receive commands */ @@ -190,6 +279,9 @@ /* supports Doorbell Buffer Config */ #define NVME_CTRLR_DATA_OACS_DBBUFFER_SHIFT (8) #define NVME_CTRLR_DATA_OACS_DBBUFFER_MASK (0x1) +/* supports Get LBA Status */ +#define NVME_CTRLR_DATA_OACS_GETLBA_SHIFT (9) +#define NVME_CTRLR_DATA_OACS_GETLBA_MASK (0x1) /** firmware updates */ /* first slot is read-only */ @@ -198,6 +290,9 @@ /* number of firmware slots */ #define NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT (1) #define NVME_CTRLR_DATA_FRMW_NUM_SLOTS_MASK (0x7) +/* firmware activation without reset */ +#define NVME_CTRLR_DATA_FRMW_ACT_WO_RESET_SHIFT (4) +#define NVME_CTRLR_DATA_FRMW_ACT_WO_RESET_MASK (0x1) /** log page attributes */ /* per namespace smart/health log page */ @@ -214,6 +309,26 @@ #define NVME_CTRLR_DATA_APSTA_APST_SUPP_SHIFT (0) #define NVME_CTRLR_DATA_APSTA_APST_SUPP_MASK (0x1) +/** Sanitize Capabilities */ +/* Crypto Erase Support */ +#define NVME_CTRLR_DATA_SANICAP_CES_SHIFT (0) +#define NVME_CTRLR_DATA_SANICAP_CES_MASK (0x1) +/* Block Erase Support */ +#define NVME_CTRLR_DATA_SANICAP_BES_SHIFT (1) +#define NVME_CTRLR_DATA_SANICAP_BES_MASK (0x1) +/* Overwrite Support */ +#define NVME_CTRLR_DATA_SANICAP_OWS_SHIFT (2) +#define NVME_CTRLR_DATA_SANICAP_OWS_MASK (0x1) +/* No-Deallocate Inhibited */ +#define NVME_CTRLR_DATA_SANICAP_NDI_SHIFT (29) +#define NVME_CTRLR_DATA_SANICAP_NDI_MASK (0x1) +/* No-Deallocate Modifies Media After Sanitize */ +#define NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT (30) +#define NVME_CTRLR_DATA_SANICAP_NODMMAS_MASK (0x3) +#define NVME_CTRLR_DATA_SANICAP_NODMMAS_UNDEF (0) +#define NVME_CTRLR_DATA_SANICAP_NODMMAS_NO (1) +#define NVME_CTRLR_DATA_SANICAP_NODMMAS_YES (2) + /** submission queue entry size */ #define NVME_CTRLR_DATA_SQES_MIN_SHIFT (0) #define NVME_CTRLR_DATA_SQES_MIN_MASK (0xF) @@ -241,6 +356,8 @@ #define NVME_CTRLR_DATA_ONCS_RESERV_MASK (0x1) #define NVME_CTRLR_DATA_ONCS_TIMESTAMP_SHIFT (6) #define NVME_CTRLR_DATA_ONCS_TIMESTAMP_MASK (0x1) +#define NVME_CTRLR_DATA_ONCS_VERIFY_SHIFT (7) +#define NVME_CTRLR_DATA_ONCS_VERIFY_MASK (0x1) /** Fused Operation Support */ #define NVME_CTRLR_DATA_FUSES_CNW_SHIFT (0) @@ -255,8 +372,15 @@ #define NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_MASK (0x1) /** volatile write cache */ +/* volatile write 
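
Sync note (nvme.h, register accessors): the new NVME_*(x) macros all follow the same shift-and-mask shape, ((x) >> SHIFT) & MASK, pairing each field with its *_SHIFT/*_MASK constants. For example, CAP.MPSMIN encodes the controller's minimum page size as a power of two above 4 KiB; a sketch using the definitions above (the 2^(12+MPSMIN) interpretation comes from the NVMe specification):

    #include <stdint.h>
    #include <stdio.h>

    #define NVME_CAP_HI_REG_MPSMIN_SHIFT    (16)
    #define NVME_CAP_HI_REG_MPSMIN_MASK     (0xF)
    #define NVME_CAP_HI_MPSMIN(x) \
            (((x) >> NVME_CAP_HI_REG_MPSMIN_SHIFT) & NVME_CAP_HI_REG_MPSMIN_MASK)

    int
    main(void)
    {
            uint32_t cap_hi = 0;    /* hypothetical CAP[63:32] value */

            printf("min page size: %u bytes\n",
                1u << (12 + NVME_CAP_HI_MPSMIN(cap_hi)));
            return (0);
    }
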
cache present */ #define NVME_CTRLR_DATA_VWC_PRESENT_SHIFT (0) #define NVME_CTRLR_DATA_VWC_PRESENT_MASK (0x1) +/* flush all namespaces supported */ +#define NVME_CTRLR_DATA_VWC_ALL_SHIFT (1) +#define NVME_CTRLR_DATA_VWC_ALL_MASK (0x3) +#define NVME_CTRLR_DATA_VWC_ALL_UNKNOWN (0) +#define NVME_CTRLR_DATA_VWC_ALL_NO (2) +#define NVME_CTRLR_DATA_VWC_ALL_YES (3) /** namespace features */ /* thin provisioning */ @@ -271,6 +395,9 @@ /* NGUID and EUI64 fields are not reusable */ #define NVME_NS_DATA_NSFEAT_NO_ID_REUSE_SHIFT (3) #define NVME_NS_DATA_NSFEAT_NO_ID_REUSE_MASK (0x1) +/* NPWG, NPWA, NPDG, NPDA, and NOWS are valid */ +#define NVME_NS_DATA_NSFEAT_NPVALID_SHIFT (4) +#define NVME_NS_DATA_NSFEAT_NPVALID_MASK (0x1) /** formatted lba size */ #define NVME_NS_DATA_FLBAS_FORMAT_SHIFT (0) @@ -351,6 +478,20 @@ #define NVME_NS_DATA_FPI_SUPP_SHIFT (7) #define NVME_NS_DATA_FPI_SUPP_MASK (0x1) +/** Deallocate Logical Block Features */ +/* deallocated logical block read behavior */ +#define NVME_NS_DATA_DLFEAT_READ_SHIFT (0) +#define NVME_NS_DATA_DLFEAT_READ_MASK (0x07) +#define NVME_NS_DATA_DLFEAT_READ_NR (0x00) +#define NVME_NS_DATA_DLFEAT_READ_00 (0x01) +#define NVME_NS_DATA_DLFEAT_READ_FF (0x02) +/* supports the Deallocate bit in the Write Zeroes */ +#define NVME_NS_DATA_DLFEAT_DWZ_SHIFT (3) +#define NVME_NS_DATA_DLFEAT_DWZ_MASK (0x01) +/* Guard field for deallocated logical blocks is set to the CRC */ +#define NVME_NS_DATA_DLFEAT_GCRC_SHIFT (4) +#define NVME_NS_DATA_DLFEAT_GCRC_MASK (0x01) + /** lba format support */ /* metadata size */ #define NVME_NS_DATA_LBAF_MS_SHIFT (0) @@ -370,11 +511,42 @@ enum nvme_critical_warning_state { NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP = 0x10, }; #define NVME_CRIT_WARN_ST_RESERVED_MASK (0xE0) +#define NVME_ASYNC_EVENT_NS_ATTRIBUTE (0x100) +#define NVME_ASYNC_EVENT_FW_ACTIVATE (0x200) /* slot for current FW */ #define NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT (0) #define NVME_FIRMWARE_PAGE_AFI_SLOT_MASK (0x7) +/* Commands Supported and Effects */ +#define NVME_CE_PAGE_CSUP_SHIFT (0) +#define NVME_CE_PAGE_CSUP_MASK (0x1) +#define NVME_CE_PAGE_LBCC_SHIFT (1) +#define NVME_CE_PAGE_LBCC_MASK (0x1) +#define NVME_CE_PAGE_NCC_SHIFT (2) +#define NVME_CE_PAGE_NCC_MASK (0x1) +#define NVME_CE_PAGE_NIC_SHIFT (3) +#define NVME_CE_PAGE_NIC_MASK (0x1) +#define NVME_CE_PAGE_CCC_SHIFT (4) +#define NVME_CE_PAGE_CCC_MASK (0x1) +#define NVME_CE_PAGE_CSE_SHIFT (16) +#define NVME_CE_PAGE_CSE_MASK (0x7) +#define NVME_CE_PAGE_UUID_SHIFT (19) +#define NVME_CE_PAGE_UUID_MASK (0x1) + +/* Sanitize Status */ +#define NVME_SS_PAGE_SSTAT_STATUS_SHIFT (0) +#define NVME_SS_PAGE_SSTAT_STATUS_MASK (0x7) +#define NVME_SS_PAGE_SSTAT_STATUS_NEVER (0) +#define NVME_SS_PAGE_SSTAT_STATUS_COMPLETED (1) +#define NVME_SS_PAGE_SSTAT_STATUS_INPROG (2) +#define NVME_SS_PAGE_SSTAT_STATUS_FAILED (3) +#define NVME_SS_PAGE_SSTAT_STATUS_COMPLETEDWD (4) +#define NVME_SS_PAGE_SSTAT_PASSES_SHIFT (3) +#define NVME_SS_PAGE_SSTAT_PASSES_MASK (0x1f) +#define NVME_SS_PAGE_SSTAT_GDE_SHIFT (8) +#define NVME_SS_PAGE_SSTAT_GDE_MASK (0x1) + /* CC register SHN field values */ enum shn_value { NVME_SHN_NORMAL = 0x1, @@ -390,34 +562,37 @@ enum shst_value { struct nvme_registers { - /** controller capabilities */ - uint32_t cap_lo; - uint32_t cap_hi; - - uint32_t vs; /* version */ - uint32_t intms; /* interrupt mask set */ - uint32_t intmc; /* interrupt mask clear */ - - /** controller configuration */ - uint32_t cc; - - uint32_t reserved1; - - /** controller status */ - uint32_t csts; - - uint32_t reserved2; - - /** admin queue 
attributes */ - uint32_t aqa; - - uint64_t asq; /* admin submission queue base addr */ - uint64_t acq; /* admin completion queue base addr */ - uint32_t reserved3[0x3f2]; - + uint32_t cap_lo; /* controller capabilities */ + uint32_t cap_hi; + uint32_t vs; /* version */ + uint32_t intms; /* interrupt mask set */ + uint32_t intmc; /* interrupt mask clear */ + uint32_t cc; /* controller configuration */ + uint32_t reserved1; + uint32_t csts; /* controller status */ + uint32_t nssr; /* NVM Subsystem Reset */ + uint32_t aqa; /* admin queue attributes */ + uint64_t asq; /* admin submission queue base addr */ + uint64_t acq; /* admin completion queue base addr */ + uint32_t cmbloc; /* Controller Memory Buffer Location */ + uint32_t cmbsz; /* Controller Memory Buffer Size */ + uint32_t bpinfo; /* Boot Partition Information */ + uint32_t bprsel; /* Boot Partition Read Select */ + uint64_t bpmbl; /* Boot Partition Memory Buffer Location */ + uint64_t cmbmsc; /* Controller Memory Buffer Memory Space Control */ + uint32_t cmbsts; /* Controller Memory Buffer Status */ + uint8_t reserved3[3492]; /* 5Ch - DFFh */ + uint32_t pmrcap; /* Persistent Memory Capabilities */ + uint32_t pmrctl; /* Persistent Memory Region Control */ + uint32_t pmrsts; /* Persistent Memory Region Status */ + uint32_t pmrebs; /* Persistent Memory Region Elasticity Buffer Size */ + uint32_t pmrswtp; /* Persistent Memory Region Sustained Write Throughput */ + uint32_t pmrmsc_lo; /* Persistent Memory Region Controller Memory Space Control */ + uint32_t pmrmsc_hi; + uint8_t reserved4[484]; /* E1Ch - FFFh */ struct { - uint32_t sq_tdbl; /* submission queue tail doorbell */ - uint32_t cq_hdbl; /* completion queue head doorbell */ + uint32_t sq_tdbl; /* submission queue tail doorbell */ + uint32_t cq_hdbl; /* completion queue head doorbell */ } doorbell[1] __packed; } __packed; @@ -458,7 +633,6 @@ struct nvme_command _Static_assert(sizeof(struct nvme_command) == 16 * 4, "bad size for nvme_command"); struct nvme_completion { - /* dword 0 */ uint32_t cdw0; /* command-specific */ @@ -492,6 +666,7 @@ enum nvme_status_code_type { NVME_SCT_GENERIC = 0x0, NVME_SCT_COMMAND_SPECIFIC = 0x1, NVME_SCT_MEDIA_ERROR = 0x2, + NVME_SCT_PATH_RELATED = 0x3, /* 0x3-0x6 - reserved */ NVME_SCT_VENDOR_SPECIFIC = 0x7, }; @@ -530,6 +705,9 @@ enum nvme_generic_command_status_code { NVME_SC_SANITIZE_IN_PROGRESS = 0x1d, NVME_SC_SGL_DATA_BLOCK_GRAN_INVALID = 0x1e, NVME_SC_NOT_SUPPORTED_IN_CMB = 0x1f, + NVME_SC_NAMESPACE_IS_WRITE_PROTECTED = 0x20, + NVME_SC_COMMAND_INTERRUPTED = 0x21, + NVME_SC_TRANSIENT_TRANSPORT_ERROR = 0x22, NVME_SC_LBA_OUT_OF_RANGE = 0x80, NVME_SC_CAPACITY_EXCEEDED = 0x81, @@ -569,12 +747,15 @@ enum nvme_command_specific_status_code { NVME_SC_NS_NOT_ATTACHED = 0x1a, NVME_SC_THIN_PROV_NOT_SUPPORTED = 0x1b, NVME_SC_CTRLR_LIST_INVALID = 0x1c, - NVME_SC_SELT_TEST_IN_PROGRESS = 0x1d, + NVME_SC_SELF_TEST_IN_PROGRESS = 0x1d, NVME_SC_BOOT_PART_WRITE_PROHIB = 0x1e, NVME_SC_INVALID_CTRLR_ID = 0x1f, NVME_SC_INVALID_SEC_CTRLR_STATE = 0x20, NVME_SC_INVALID_NUM_OF_CTRLR_RESRC = 0x21, NVME_SC_INVALID_RESOURCE_ID = 0x22, + NVME_SC_SANITIZE_PROHIBITED_WPMRE = 0x23, + NVME_SC_ANA_GROUP_ID_INVALID = 0x24, + NVME_SC_ANA_ATTACH_FAILED = 0x25, NVME_SC_CONFLICTING_ATTRIBUTES = 0x80, NVME_SC_INVALID_PROTECTION_INFO = 0x81, @@ -593,6 +774,17 @@ enum nvme_media_error_status_code { NVME_SC_DEALLOCATED_OR_UNWRITTEN = 0x87, }; +/* path related status codes */ +enum nvme_path_related_status_code { + NVME_SC_INTERNAL_PATH_ERROR = 0x00, + 
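
Sync note (nvme.h, struct nvme_registers): the rewritten struct tracks the NVMe 1.4 register map: the controller-memory-buffer registers end at offset 0x58, reserved3 pads to 0xE00 where the persistent-memory-region block sits, and reserved4 pads to 0x1000 where the doorbell pairs begin. Compile-time checks of those pad sizes, assuming the header above is included:

    #include <stddef.h>

    _Static_assert(offsetof(struct nvme_registers, cmbsts) == 0x58,
        "CMBSTS belongs at offset 0x58");
    _Static_assert(offsetof(struct nvme_registers, pmrcap) == 0xe00,
        "PMR block begins at offset 0xe00");
    _Static_assert(offsetof(struct nvme_registers, doorbell) == 0x1000,
        "doorbell registers begin at offset 0x1000");
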
NVME_SC_ASYMMETRIC_ACCESS_PERSISTENT_LOSS = 0x01, + NVME_SC_ASYMMETRIC_ACCESS_INACCESSIBLE = 0x02, + NVME_SC_ASYMMETRIC_ACCESS_TRANSITION = 0x03, + NVME_SC_CONTROLLER_PATHING_ERROR = 0x60, + NVME_SC_HOST_PATHING_ERROR = 0x70, + NVME_SC_COMMAND_ABOTHED_BY_HOST = 0x71, +}; + /* admin opcodes */ enum nvme_admin_opcode { NVME_OPC_DELETE_IO_SQ = 0x00, @@ -612,20 +804,27 @@ enum nvme_admin_opcode { /* 0x0e-0x0f - reserved */ NVME_OPC_FIRMWARE_ACTIVATE = 0x10, NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD = 0x11, + /* 0x12-0x13 - reserved */ NVME_OPC_DEVICE_SELF_TEST = 0x14, NVME_OPC_NAMESPACE_ATTACHMENT = 0x15, + /* 0x16-0x17 - reserved */ NVME_OPC_KEEP_ALIVE = 0x18, NVME_OPC_DIRECTIVE_SEND = 0x19, NVME_OPC_DIRECTIVE_RECEIVE = 0x1a, + /* 0x1b - reserved */ NVME_OPC_VIRTUALIZATION_MANAGEMENT = 0x1c, NVME_OPC_NVME_MI_SEND = 0x1d, NVME_OPC_NVME_MI_RECEIVE = 0x1e, + /* 0x1f-0x7b - reserved */ NVME_OPC_DOORBELL_BUFFER_CONFIG = 0x7c, NVME_OPC_FORMAT_NVM = 0x80, NVME_OPC_SECURITY_SEND = 0x81, NVME_OPC_SECURITY_RECEIVE = 0x82, + /* 0x83 - reserved */ NVME_OPC_SANITIZE = 0x84, + /* 0x85 - reserved */ + NVME_OPC_GET_LBA_STATUS = 0x86, }; /* nvme nvm opcodes */ @@ -636,11 +835,11 @@ enum nvme_nvm_opcode { /* 0x03 - reserved */ NVME_OPC_WRITE_UNCORRECTABLE = 0x04, NVME_OPC_COMPARE = 0x05, - /* 0x06 - reserved */ + /* 0x06-0x07 - reserved */ NVME_OPC_WRITE_ZEROES = 0x08, - /* 0x07 - reserved */ NVME_OPC_DATASET_MANAGEMENT = 0x09, - /* 0x0a-0x0c - reserved */ + /* 0x0a-0x0b - reserved */ + NVME_OPC_VERIFY = 0x0c, NVME_OPC_RESERVATION_REGISTER = 0x0d, NVME_OPC_RESERVATION_REPORT = 0x0e, /* 0x0f-0x10 - reserved */ @@ -668,10 +867,21 @@ enum nvme_feature { NVME_FEAT_KEEP_ALIVE_TIMER = 0x0F, NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT = 0x10, NVME_FEAT_NON_OP_POWER_STATE_CONFIG = 0x11, - /* 0x12-0x77 - reserved */ + NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG = 0x12, + NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG = 0x13, + NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW = 0x14, + NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES = 0x15, + NVME_FEAT_HOST_BEHAVIOR_SUPPORT = 0x16, + NVME_FEAT_SANITIZE_CONFIG = 0x17, + NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION = 0x18, + /* 0x19-0x77 - reserved */ /* 0x78-0x7f - NVMe Management Interface */ NVME_FEAT_SOFTWARE_PROGRESS_MARKER = 0x80, - /* 0x81-0xBF - command set specific (reserved) */ + NVME_FEAT_HOST_IDENTIFIER = 0x81, + NVME_FEAT_RESERVATION_NOTIFICATION_MASK = 0x82, + NVME_FEAT_RESERVATION_PERSISTENCE = 0x83, + NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG = 0x84, + /* 0x85-0xBF - command set specific (reserved) */ /* 0xC0-0xFF - vendor specific */ }; @@ -717,7 +927,6 @@ _Static_assert(sizeof(struct nvme_power_state) == 32, "bad size for nvme_power_s #define NVME_FIRMWARE_REVISION_LENGTH 8 struct nvme_controller_data { - /* bytes 0-255: controller capabilities and features */ /** pci vendor id */ @@ -765,12 +974,27 @@ struct nvme_controller_data { /** Controller Attributes */ uint32_t ctratt; /* bitfield really */ - uint8_t reserved1[12]; + /** Read Recovery Levels Supported */ + uint16_t rrls; + + uint8_t reserved1[9]; + + /** Controller Type */ + uint8_t cntrltype; /** FRU Globally Unique Identifier */ uint8_t fguid[16]; - uint8_t reserved2[128]; + /** Command Retry Delay Time 1 */ + uint16_t crdt1; + + /** Command Retry Delay Time 2 */ + uint16_t crdt2; + + /** Command Retry Delay Time 3 */ + uint16_t crdt3; + + uint8_t reserved2[122]; /* bytes 256-511: admin command set attributes */ @@ -850,7 +1074,34 @@ struct nvme_controller_data { /** Sanitize Capabilities */ uint32_t sanicap; /* Really a 
bitfield */ - uint8_t reserved3[180]; + /** Host Memory Buffer Minimum Descriptor Entry Size */ + uint32_t hmminds; + + /** Host Memory Maximum Descriptors Entries */ + uint16_t hmmaxd; + + /** NVM Set Identifier Maximum */ + uint16_t nsetidmax; + + /** Endurance Group Identifier Maximum */ + uint16_t endgidmax; + + /** ANA Transition Time */ + uint8_t anatt; + + /** Asymmetric Namespace Access Capabilities */ + uint8_t anacap; + + /** ANA Group Identifier Maximum */ + uint32_t anagrpmax; + + /** Number of ANA Group Identifiers */ + uint32_t nanagrpid; + + /** Persistent Event Log Size */ + uint32_t pels; + + uint8_t reserved3[156]; /* bytes 512-703: nvm command set attributes */ /** submission queue entry size */ @@ -885,7 +1136,9 @@ struct nvme_controller_data { /** NVM Vendor Specific Command Configuration */ uint8_t nvscc; - uint8_t reserved5; + + /** Namespace Write Protection Capabilities */ + uint8_t nwpc; /** Atomic Compare & Write Unit */ uint16_t acwu; @@ -894,8 +1147,11 @@ struct nvme_controller_data { /** SGL Support */ uint32_t sgls; + /** Maximum Number of Allowed Namespaces */ + uint32_t mnan; + /* bytes 540-767: Reserved */ - uint8_t reserved7[228]; + uint8_t reserved7[224]; /** NVM Subsystem NVMe Qualified Name */ uint8_t subnqn[256]; @@ -916,7 +1172,6 @@ struct nvme_controller_data { _Static_assert(sizeof(struct nvme_controller_data) == 4096, "bad size for nvme_controller_data"); struct nvme_namespace_data { - /** namespace size */ uint64_t nsze; @@ -980,8 +1235,38 @@ struct nvme_namespace_data { /** NVM Capacity */ uint8_t nvmcap[16]; - /* bytes 64-103: Reserved */ - uint8_t reserved5[40]; + /** Namespace Preferred Write Granularity */ + uint16_t npwg; + + /** Namespace Preferred Write Alignment */ + uint16_t npwa; + + /** Namespace Preferred Deallocate Granularity */ + uint16_t npdg; + + /** Namespace Preferred Deallocate Alignment */ + uint16_t npda; + + /** Namespace Optimal Write Size */ + uint16_t nows; + + /* bytes 74-91: Reserved */ + uint8_t reserved5[18]; + + /** ANA Group Identifier */ + uint32_t anagrpid; + + /* bytes 96-98: Reserved */ + uint8_t reserved6[3]; + + /** Namespace Attributes */ + uint8_t nsattr; + + /** NVM Set Identifier */ + uint16_t nvmsetid; + + /** Endurance Group Identifier */ + uint16_t endgid; /** Namespace Globally Unique Identifier */ uint8_t nguid[16]; @@ -992,7 +1277,7 @@ struct nvme_namespace_data { /** lba format support */ uint32_t lbaf[16]; - uint8_t reserved6[192]; + uint8_t reserved7[192]; uint8_t vendor_specific[3712]; } __packed __aligned(4); @@ -1000,16 +1285,27 @@ struct nvme_namespace_data { _Static_assert(sizeof(struct nvme_namespace_data) == 4096, "bad size for nvme_namepsace_data"); enum nvme_log_page { - /* 0x00 - reserved */ NVME_LOG_ERROR = 0x01, NVME_LOG_HEALTH_INFORMATION = 0x02, NVME_LOG_FIRMWARE_SLOT = 0x03, NVME_LOG_CHANGED_NAMESPACE = 0x04, NVME_LOG_COMMAND_EFFECT = 0x05, + NVME_LOG_DEVICE_SELF_TEST = 0x06, + NVME_LOG_TELEMETRY_HOST_INITIATED = 0x07, + NVME_LOG_TELEMETRY_CONTROLLER_INITIATED = 0x08, + NVME_LOG_ENDURANCE_GROUP_INFORMATION = 0x09, + NVME_LOG_PREDICTABLE_LATENCY_PER_NVM_SET = 0x0a, + NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE = 0x0b, + NVME_LOG_ASYMMETRIC_NAMESPAVE_ACCESS = 0x0c, + NVME_LOG_PERSISTENT_EVENT_LOG = 0x0d, + NVME_LOG_LBA_STATUS_INFORMATION = 0x0e, + NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE = 0x0f, /* 0x06-0x7F - reserved */ /* 0x80-0xBF - I/O command set specific */ NVME_LOG_RES_NOTIFICATION = 0x80, + NVME_LOG_SANITIZE_STATUS = 0x81, + /* 0x82-0xBF - reserved */ /* 0xC0-0xFF - vendor 
specific */ /* @@ -1029,7 +1325,6 @@ enum nvme_log_page { }; struct nvme_error_information_entry { - uint64_t error_count; uint16_t sqid; uint16_t cid; @@ -1038,13 +1333,16 @@ struct nvme_error_information_entry { uint64_t lba; uint32_t nsid; uint8_t vendor_specific; - uint8_t reserved[35]; + uint8_t trtype; + uint16_t reserved30; + uint64_t csi; + uint16_t ttsi; + uint8_t reserved[22]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_error_information_entry) == 64, "bad size for nvme_error_information_entry"); struct nvme_health_information_page { - uint8_t critical_warning; uint16_t temperature; uint8_t available_spare; @@ -1074,8 +1372,16 @@ struct nvme_health_information_page { uint32_t warning_temp_time; uint32_t error_temp_time; uint16_t temp_sensor[8]; - - uint8_t reserved2[296]; + /* Thermal Management Temperature 1 Transition Count */ + uint32_t tmt1tc; + /* Thermal Management Temperature 2 Transition Count */ + uint32_t tmt2tc; + /* Total Time For Thermal Management Temperature 1 */ + uint32_t ttftmt1; + /* Total Time For Thermal Management Temperature 2 */ + uint32_t ttftmt2; + + uint8_t reserved2[280]; } __packed __aligned(4); /* Currently sparse/smatch incorrectly packs this struct in some situations. */ @@ -1084,7 +1390,6 @@ _Static_assert(sizeof(struct nvme_health_information_page) == 512, "bad size for #endif struct nvme_firmware_page { - uint8_t afi; uint8_t reserved[7]; uint64_t revision[7]; /* revisions for 7 slots */ @@ -1099,6 +1404,43 @@ struct nvme_ns_list { _Static_assert(sizeof(struct nvme_ns_list) == 4096, "bad size for nvme_ns_list"); +struct nvme_command_effects_page { + uint32_t acs[256]; + uint32_t iocs[256]; + uint8_t reserved[2048]; +} __packed __aligned(4); + +_Static_assert(sizeof(struct nvme_command_effects_page) == 4096, + "bad size for nvme_command_effects_page"); + +struct nvme_res_notification_page { + uint64_t log_page_count; + uint8_t log_page_type; + uint8_t available_log_pages; + uint8_t reserved2; + uint32_t nsid; + uint8_t reserved[48]; +} __packed __aligned(4); + +_Static_assert(sizeof(struct nvme_res_notification_page) == 64, + "bad size for nvme_res_notification_page"); + +struct nvme_sanitize_status_page { + uint16_t sprog; + uint16_t sstat; + uint32_t scdw10; + uint32_t etfo; + uint32_t etfbe; + uint32_t etfce; + uint32_t etfownd; + uint32_t etfbewnd; + uint32_t etfcewnd; + uint8_t reserved[480]; +} __packed __aligned(4); + +_Static_assert(sizeof(struct nvme_sanitize_status_page) == 512, + "bad size for nvme_sanitize_status_page"); + struct intel_log_temp_stats { uint64_t current; @@ -1114,10 +1456,59 @@ struct intel_log_temp_stats _Static_assert(sizeof(struct intel_log_temp_stats) == 13 * 8, "bad size for intel_log_temp_stats"); +struct nvme_resv_reg_ctrlr +{ + uint16_t ctrlr_id; /* Controller ID */ + uint8_t rcsts; /* Reservation Status */ + uint8_t reserved3[5]; + uint64_t hostid; /* Host Identifier */ + uint64_t rkey; /* Reservation Key */ +} __packed __aligned(4); + +_Static_assert(sizeof(struct nvme_resv_reg_ctrlr) == 24, "bad size for nvme_resv_reg_ctrlr"); + +struct nvme_resv_reg_ctrlr_ext +{ + uint16_t ctrlr_id; /* Controller ID */ + uint8_t rcsts; /* Reservation Status */ + uint8_t reserved3[5]; + uint64_t rkey; /* Reservation Key */ + uint64_t hostid[2]; /* Host Identifier */ + uint8_t reserved32[32]; +} __packed __aligned(4); + +_Static_assert(sizeof(struct nvme_resv_reg_ctrlr_ext) == 64, "bad size for nvme_resv_reg_ctrlr_ext"); + +struct nvme_resv_status +{ + uint32_t gen; /* Generation */ + uint8_t rtype; /* 
Reservation Type */ + uint8_t regctl[2]; /* Number of Registered Controllers */ + uint8_t reserved7[2]; + uint8_t ptpls; /* Persist Through Power Loss State */ + uint8_t reserved10[14]; + struct nvme_resv_reg_ctrlr ctrlr[0]; +} __packed __aligned(4); + +_Static_assert(sizeof(struct nvme_resv_status) == 24, "bad size for nvme_resv_status"); + +struct nvme_resv_status_ext +{ + uint32_t gen; /* Generation */ + uint8_t rtype; /* Reservation Type */ + uint8_t regctl[2]; /* Number of Registered Controllers */ + uint8_t reserved7[2]; + uint8_t ptpls; /* Persist Through Power Loss State */ + uint8_t reserved10[14]; + uint8_t reserved24[40]; + struct nvme_resv_reg_ctrlr_ext ctrlr[0]; +} __packed __aligned(4); + +_Static_assert(sizeof(struct nvme_resv_status_ext) == 64, "bad size for nvme_resv_status_ext"); + #define NVME_TEST_MAX_THREADS 128 struct nvme_io_test { - enum nvme_nvm_opcode opc; uint32_t size; uint32_t time; /* in seconds */ @@ -1127,7 +1518,6 @@ struct nvme_io_test { }; enum nvme_io_test_flags { - /* * Specifies whether dev_refthread/dev_relthread should be * called during NVME_BIO_TEST. Ignored for other test @@ -1137,7 +1527,6 @@ enum nvme_io_test_flags { }; struct nvme_pt_command { - /* * cmd is used to specify a passthrough command to a controller or * namespace. @@ -1189,6 +1578,17 @@ struct nvme_pt_command { struct mtx * driver_lock; }; +struct nvme_get_nsid { + char cdev[SPECNAMELEN + 1]; + uint32_t nsid; +}; + +struct nvme_hmb_desc { + uint64_t addr; + uint32_t size; + uint32_t reserved; +}; + #define nvme_completion_is_error(cpl) \ (NVME_STATUS_GET_SC((cpl)->status) != 0 || NVME_STATUS_GET_SCT((cpl)->status) != 0) @@ -1197,6 +1597,7 @@ void nvme_strvis(uint8_t *dst, const uint8_t *src, int dstlen, int srclen); #ifdef _KERNEL struct bio; +struct thread; struct nvme_namespace; struct nvme_controller; @@ -1223,6 +1624,8 @@ int nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr, /* Admin functions */ void nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr, uint8_t feature, uint32_t cdw11, + uint32_t cdw12, uint32_t cdw13, + uint32_t cdw14, uint32_t cdw15, void *payload, uint32_t payload_size, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_get_feature(struct nvme_controller *ctrlr, @@ -1264,6 +1667,13 @@ void nvme_unregister_consumer(struct nvme_consumer *consumer); device_t nvme_ctrlr_get_device(struct nvme_controller *ctrlr); const struct nvme_controller_data * nvme_ctrlr_get_data(struct nvme_controller *ctrlr); +static inline bool +nvme_ctrlr_has_dataset_mgmt(const struct nvme_controller_data *cd) +{ + /* Assumes cd was byte swapped by nvme_controller_data_swapbytes() */ + return ((cd->oncs >> NVME_CTRLR_DATA_ONCS_DSM_SHIFT) & + NVME_CTRLR_DATA_ONCS_DSM_MASK); +} /* Namespace helper functions */ uint32_t nvme_ns_get_max_io_xfer_size(struct nvme_namespace *ns); @@ -1279,6 +1689,8 @@ uint32_t nvme_ns_get_stripesize(struct nvme_namespace *ns); int nvme_ns_bio_process(struct nvme_namespace *ns, struct bio *bp, nvme_cb_fn_t cb_fn); +int nvme_ns_ioctl_process(struct nvme_namespace *ns, u_long cmd, + caddr_t arg, int flag, struct thread *td); /* * Command building helper functions -- shared with CAM @@ -1335,8 +1747,9 @@ extern int nvme_use_nvd; /* Endianess conversion functions for NVMe structs */ static inline -void nvme_completion_swapbytes(struct nvme_completion *s) +void nvme_completion_swapbytes(struct nvme_completion *s __unused) { +#ifndef _LITTLE_ENDIAN s->cdw0 = le32toh(s->cdw0); /* omit rsvd1 */ @@ -1344,22 +1757,26 @@ void 
nvme_completion_swapbytes(struct nvme_completion *s) s->sqid = le16toh(s->sqid); /* omit cid */ s->status = le16toh(s->status); +#endif } static inline -void nvme_power_state_swapbytes(struct nvme_power_state *s) +void nvme_power_state_swapbytes(struct nvme_power_state *s __unused) { +#ifndef _LITTLE_ENDIAN s->mp = le16toh(s->mp); s->enlat = le32toh(s->enlat); s->exlat = le32toh(s->exlat); s->idlp = le16toh(s->idlp); s->actp = le16toh(s->actp); +#endif } static inline -void nvme_controller_data_swapbytes(struct nvme_controller_data *s) +void nvme_controller_data_swapbytes(struct nvme_controller_data *s __unused) { +#ifndef _LITTLE_ENDIAN int i; s->vid = le16toh(s->vid); @@ -1370,6 +1787,10 @@ void nvme_controller_data_swapbytes(struct nvme_controller_data *s) s->rtd3e = le32toh(s->rtd3e); s->oaes = le32toh(s->oaes); s->ctratt = le32toh(s->ctratt); + s->rrls = le16toh(s->rrls); + s->crdt1 = le16toh(s->crdt1); + s->crdt2 = le16toh(s->crdt2); + s->crdt3 = le16toh(s->crdt3); s->oacs = le16toh(s->oacs); s->wctemp = le16toh(s->wctemp); s->cctemp = le16toh(s->cctemp); @@ -1383,6 +1804,13 @@ void nvme_controller_data_swapbytes(struct nvme_controller_data *s) s->mntmt = le16toh(s->mntmt); s->mxtmt = le16toh(s->mxtmt); s->sanicap = le32toh(s->sanicap); + s->hmminds = le32toh(s->hmminds); + s->hmmaxd = le16toh(s->hmmaxd); + s->nsetidmax = le16toh(s->nsetidmax); + s->endgidmax = le16toh(s->endgidmax); + s->anagrpmax = le32toh(s->anagrpmax); + s->nanagrpid = le32toh(s->nanagrpid); + s->pels = le32toh(s->pels); s->maxcmd = le16toh(s->maxcmd); s->nn = le32toh(s->nn); s->oncs = le16toh(s->oncs); @@ -1391,13 +1819,16 @@ void nvme_controller_data_swapbytes(struct nvme_controller_data *s) s->awupf = le16toh(s->awupf); s->acwu = le16toh(s->acwu); s->sgls = le32toh(s->sgls); + s->mnan = le32toh(s->mnan); for (i = 0; i < 32; i++) nvme_power_state_swapbytes(&s->power_state[i]); +#endif } static inline -void nvme_namespace_data_swapbytes(struct nvme_namespace_data *s) +void nvme_namespace_data_swapbytes(struct nvme_namespace_data *s __unused) { +#ifndef _LITTLE_ENDIAN int i; s->nsze = le64toh(s->nsze); @@ -1410,13 +1841,24 @@ void nvme_namespace_data_swapbytes(struct nvme_namespace_data *s) s->nabo = le16toh(s->nabo); s->nabspf = le16toh(s->nabspf); s->noiob = le16toh(s->noiob); + s->npwg = le16toh(s->npwg); + s->npwa = le16toh(s->npwa); + s->npdg = le16toh(s->npdg); + s->npda = le16toh(s->npda); + s->nows = le16toh(s->nows); + s->anagrpid = le32toh(s->anagrpid); + s->nvmsetid = le16toh(s->nvmsetid); + s->endgid = le16toh(s->endgid); for (i = 0; i < 16; i++) s->lbaf[i] = le32toh(s->lbaf[i]); +#endif } static inline -void nvme_error_information_entry_swapbytes(struct nvme_error_information_entry *s) +void nvme_error_information_entry_swapbytes( + struct nvme_error_information_entry *s __unused) { +#ifndef _LITTLE_ENDIAN s->error_count = le64toh(s->error_count); s->sqid = le16toh(s->sqid); @@ -1425,18 +1867,14 @@ void nvme_error_information_entry_swapbytes(struct nvme_error_information_entry s->error_location = le16toh(s->error_location); s->lba = le64toh(s->lba); s->nsid = le32toh(s->nsid); + s->csi = le64toh(s->csi); + s->ttsi = le16toh(s->ttsi); +#endif } static inline -void nvme_le128toh(void *p) +void nvme_le128toh(void *p __unused) { - /* - * Upstream, this uses the following comparison: - * #if _BYTE_ORDER != _LITTLE_ENDIAN - * - * Rather than keep this file in compat with only that little bit - * changed, we'll just float a little patch here for now. 
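
Sync note (nvme.h, swapbytes helpers): on little-endian hosts these functions now compile to empty bodies; each body is wrapped in #ifndef _LITTLE_ENDIAN and the parameter is marked __unused so the little-endian build does not warn about the untouched argument. The shape of the pattern, reduced to a single field (example_swapbytes is illustrative; _LITTLE_ENDIAN is assumed to come from illumos' <sys/isa_defs.h> on x86, and le16toh() from the compat endian headers used by this file):

    #include <stdint.h>
    #include <sys/isa_defs.h>

    struct example { uint16_t field; };

    static inline void
    example_swapbytes(struct example *s __unused)
    {
    #ifndef _LITTLE_ENDIAN
            /* NVMe data is little-endian on the wire; only big-endian hosts swap. */
            s->field = le16toh(s->field);
    #endif
    }
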
- */ #ifndef _LITTLE_ENDIAN /* Swap 16 bytes in place */ char *tmp = (char*)p; @@ -1447,14 +1885,14 @@ void nvme_le128toh(void *p) tmp[i] = tmp[15-i]; tmp[15-i] = b; } -#else - (void)p; #endif } static inline -void nvme_health_information_page_swapbytes(struct nvme_health_information_page *s) +void nvme_health_information_page_swapbytes( + struct nvme_health_information_page *s __unused) { +#ifndef _LITTLE_ENDIAN int i; s->temperature = le16toh(s->temperature); @@ -1472,30 +1910,80 @@ void nvme_health_information_page_swapbytes(struct nvme_health_information_page s->error_temp_time = le32toh(s->error_temp_time); for (i = 0; i < 8; i++) s->temp_sensor[i] = le16toh(s->temp_sensor[i]); + s->tmt1tc = le32toh(s->tmt1tc); + s->tmt2tc = le32toh(s->tmt2tc); + s->ttftmt1 = le32toh(s->ttftmt1); + s->ttftmt2 = le32toh(s->ttftmt2); +#endif } - static inline -void nvme_firmware_page_swapbytes(struct nvme_firmware_page *s) +void nvme_firmware_page_swapbytes(struct nvme_firmware_page *s __unused) { +#ifndef _LITTLE_ENDIAN int i; for (i = 0; i < 7; i++) s->revision[i] = le64toh(s->revision[i]); +#endif } static inline -void nvme_ns_list_swapbytes(struct nvme_ns_list *s) +void nvme_ns_list_swapbytes(struct nvme_ns_list *s __unused) { +#ifndef _LITTLE_ENDIAN int i; for (i = 0; i < 1024; i++) s->ns[i] = le32toh(s->ns[i]); +#endif +} + +static inline +void nvme_command_effects_page_swapbytes( + struct nvme_command_effects_page *s __unused) +{ +#ifndef _LITTLE_ENDIAN + int i; + + for (i = 0; i < 256; i++) + s->acs[i] = le32toh(s->acs[i]); + for (i = 0; i < 256; i++) + s->iocs[i] = le32toh(s->iocs[i]); +#endif +} + +static inline +void nvme_res_notification_page_swapbytes( + struct nvme_res_notification_page *s __unused) +{ +#ifndef _LITTLE_ENDIAN + s->log_page_count = le64toh(s->log_page_count); + s->nsid = le32toh(s->nsid); +#endif +} + +static inline +void nvme_sanitize_status_page_swapbytes( + struct nvme_sanitize_status_page *s __unused) +{ +#ifndef _LITTLE_ENDIAN + s->sprog = le16toh(s->sprog); + s->sstat = le16toh(s->sstat); + s->scdw10 = le32toh(s->scdw10); + s->etfo = le32toh(s->etfo); + s->etfbe = le32toh(s->etfbe); + s->etfce = le32toh(s->etfce); + s->etfownd = le32toh(s->etfownd); + s->etfbewnd = le32toh(s->etfbewnd); + s->etfcewnd = le32toh(s->etfcewnd); +#endif } static inline -void intel_log_temp_stats_swapbytes(struct intel_log_temp_stats *s) +void intel_log_temp_stats_swapbytes(struct intel_log_temp_stats *s __unused) { +#ifndef _LITTLE_ENDIAN s->current = le64toh(s->current); s->overtemp_flag_last = le64toh(s->overtemp_flag_last); @@ -1506,6 +1994,43 @@ void intel_log_temp_stats_swapbytes(struct intel_log_temp_stats *s) s->max_oper_temp = le64toh(s->max_oper_temp); s->min_oper_temp = le64toh(s->min_oper_temp); s->est_offset = le64toh(s->est_offset); +#endif +} + +static inline +void nvme_resv_status_swapbytes(struct nvme_resv_status *s __unused, + size_t size __unused) +{ +#ifndef _LITTLE_ENDIAN + u_int i, n; + + s->gen = le32toh(s->gen); + n = (s->regctl[1] << 8) | s->regctl[0]; + n = MIN(n, (size - sizeof(s)) / sizeof(s->ctrlr[0])); + for (i = 0; i < n; i++) { + s->ctrlr[i].ctrlr_id = le16toh(s->ctrlr[i].ctrlr_id); + s->ctrlr[i].hostid = le64toh(s->ctrlr[i].hostid); + s->ctrlr[i].rkey = le64toh(s->ctrlr[i].rkey); + } +#endif +} + +static inline +void nvme_resv_status_ext_swapbytes(struct nvme_resv_status_ext *s __unused, + size_t size __unused) +{ +#ifndef _LITTLE_ENDIAN + u_int i, n; + + s->gen = le32toh(s->gen); + n = (s->regctl[1] << 8) | s->regctl[0]; + n = MIN(n, (size - sizeof(s)) / 
sizeof(s->ctrlr[0])); + for (i = 0; i < n; i++) { + s->ctrlr[i].ctrlr_id = le16toh(s->ctrlr[i].ctrlr_id); + s->ctrlr[i].rkey = le64toh(s->ctrlr[i].rkey); + nvme_le128toh((void *)s->ctrlr[i].hostid); + } +#endif } #endif /* __NVME_H__ */ diff --git a/usr/src/contrib/bhyve/sys/ata.h b/usr/src/contrib/bhyve/sys/ata.h index 223bd7b3eb..83eb089dbe 100644 --- a/usr/src/contrib/bhyve/sys/ata.h +++ b/usr/src/contrib/bhyve/sys/ata.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2000 - 2008 Søren Schmidt <sos@FreeBSD.org> * All rights reserved. * @@ -44,6 +46,7 @@ struct ata_params { #define ATA_ATAPI_TYPE_TAPE 0x0100 /* streaming tape */ #define ATA_ATAPI_TYPE_CDROM 0x0500 /* CD-ROM device */ #define ATA_ATAPI_TYPE_OPTICAL 0x0700 /* optical disk */ +#define ATA_ATAPI_REMOVABLE 0x0080 #define ATA_DRQ_MASK 0x0060 #define ATA_DRQ_SLOW 0x0000 /* cpu 3 ms delay */ #define ATA_DRQ_INTR 0x0020 /* interrupt 10 ms delay */ @@ -64,12 +67,13 @@ struct ata_params { /*023*/ u_int8_t revision[8]; /* firmware revision */ /*027*/ u_int8_t model[40]; /* model name */ /*047*/ u_int16_t sectors_intr; /* sectors per interrupt */ -/*048*/ u_int16_t usedmovsd; /* double word read/write? */ +/*048*/ u_int16_t tcg; /* Trusted Computing Group */ +#define ATA_SUPPORT_TCG 0x0001 /*049*/ u_int16_t capabilities1; #define ATA_SUPPORT_DMA 0x0100 #define ATA_SUPPORT_LBA 0x0200 -#define ATA_SUPPORT_IORDY 0x0400 -#define ATA_SUPPORT_IORDYDIS 0x0800 +#define ATA_SUPPORT_IORDYDIS 0x0400 +#define ATA_SUPPORT_IORDY 0x0800 #define ATA_SUPPORT_OVERLAP 0x4000 /*050*/ u_int16_t capabilities2; @@ -90,6 +94,12 @@ struct ata_params { /*057*/ u_int16_t current_size_1; /*058*/ u_int16_t current_size_2; /*059*/ u_int16_t multi; +#define ATA_SUPPORT_BLOCK_ERASE_EXT 0x8000 +#define ATA_SUPPORT_OVERWRITE_EXT 0x4000 +#define ATA_SUPPORT_CRYPTO_SCRAMBLE_EXT 0x2000 +#define ATA_SUPPORT_SANITIZE 0x1000 +#define ATA_SUPPORT_SANITIZE_ALLOWED 0x0800 +#define ATA_SUPPORT_ANTIFREEZE_LOCK_EXT 0x0400 #define ATA_MULTI_VALID 0x0100 /*060*/ u_int16_t lba_size_1; @@ -105,6 +115,7 @@ struct ata_params { /*069*/ u_int16_t support3; #define ATA_SUPPORT_RZAT 0x0020 #define ATA_SUPPORT_DRAT 0x4000 +#define ATA_ENCRYPTS_ALL_USER_DATA 0x0010 /* Self-encrypting drive */ #define ATA_SUPPORT_ZONE_MASK 0x0003 #define ATA_SUPPORT_ZONE_NR 0x0000 #define ATA_SUPPORT_ZONE_HOST_AWARE 0x0001 @@ -133,7 +144,8 @@ struct ata_params { /*77*/ u_int16_t satacapabilities2; #define ATA_SATA_CURR_GEN_MASK 0x0006 #define ATA_SUPPORT_NCQ_STREAM 0x0010 -#define ATA_SUPPORT_NCQ_QMANAGEMENT 0x0020 +#define ATA_SUPPORT_NCQ_NON_DATA 0x0020 +#define ATA_SUPPORT_NCQ_QMANAGEMENT ATA_SUPPORT_NCQ_NON_DATA #define ATA_SUPPORT_RCVSND_FPDMA_QUEUED 0x0040 /*78*/ u_int16_t satasupport; #define ATA_SUPPORT_NONZERO 0x0002 @@ -142,6 +154,7 @@ struct ata_params { #define ATA_SUPPORT_INORDERDATA 0x0010 #define ATA_SUPPORT_ASYNCNOTIF 0x0020 #define ATA_SUPPORT_SOFTSETPRESERVE 0x0040 +#define ATA_SUPPORT_NCQ_AUTOSENSE 0x0080 /*79*/ u_int16_t sataenabled; #define ATA_ENABLED_DAPST 0x0080 @@ -234,12 +247,15 @@ struct ata_params { #define ATA_SUPPORT_FREEFALL 0x0020 #define ATA_SUPPORT_SENSE_REPORT 0x0040 #define ATA_SUPPORT_EPC 0x0080 +#define ATA_SUPPORT_AMAX_ADDR 0x0100 +#define ATA_SUPPORT_DSN 0x0200 /*120*/ u_int16_t enabled2; #define ATA_ENABLED_WRITEREADVERIFY 0x0002 #define ATA_ENABLED_WRITEUNCORREXT 0x0004 #define ATA_ENABLED_FREEFALL 0x0020 #define ATA_ENABLED_SENSE_REPORT 0x0040 #define ATA_ENABLED_EPC 0x0080 +#define ATA_ENABLED_DSN 0x0200 u_int16_t 
reserved121[6]; /*127*/ u_int16_t removable_status; /*128*/ u_int16_t security_status; @@ -257,10 +273,23 @@ struct ata_params { /*162*/ u_int16_t cfa_kms_support; /*163*/ u_int16_t cfa_trueide_modes; /*164*/ u_int16_t cfa_memory_modes; - u_int16_t reserved165[4]; + u_int16_t reserved165[3]; +/*168*/ u_int16_t form_factor; +#define ATA_FORM_FACTOR_MASK 0x000f +#define ATA_FORM_FACTOR_NOT_REPORTED 0x0000 +#define ATA_FORM_FACTOR_5_25 0x0001 +#define ATA_FORM_FACTOR_3_5 0x0002 +#define ATA_FORM_FACTOR_2_5 0x0003 +#define ATA_FORM_FACTOR_1_8 0x0004 +#define ATA_FORM_FACTOR_SUB_1_8 0x0005 +#define ATA_FORM_FACTOR_MSATA 0x0006 +#define ATA_FORM_FACTOR_M_2 0x0007 +#define ATA_FORM_FACTOR_MICRO_SSD 0x0008 +#define ATA_FORM_FACTOR_C_FAST 0x0009 /*169*/ u_int16_t support_dsm; #define ATA_SUPPORT_DSM_TRIM 0x0001 - u_int16_t reserved170[6]; +/*170*/ u_int8_t product_id[8]; /* Additional Product Identifier */ + u_int16_t reserved174[2]; /*176*/ u_int8_t media_serial[60]; /*206*/ u_int16_t sct; u_int16_t reserved207[2]; @@ -283,7 +312,7 @@ struct ata_params { /*223*/ u_int16_t transport_minor; u_int16_t reserved224[31]; /*255*/ u_int16_t integrity; -} __packed; +} __packed __aligned(2); /* ATA Dataset Management */ #define ATA_DSM_BLK_SIZE 512 @@ -355,7 +384,6 @@ struct ata_params { #define ATA_SA600 0x49 #define ATA_DMA_MAX 0x4f - /* ATA commands */ #define ATA_NOP 0x00 /* NOP */ #define ATA_NF_FLUSHQUEUE 0x00 /* flush queued cmd's */ @@ -391,6 +419,12 @@ struct ata_params { #define ATA_READ_LOG_DMA_EXT 0x47 /* read log DMA ext - PIO Data-In */ #define ATA_ZAC_MANAGEMENT_IN 0x4a /* ZAC management in */ #define ATA_ZM_REPORT_ZONES 0x00 /* report zones */ +#define ATA_WRITE_LOG_DMA_EXT 0x57 /* WRITE LOG DMA EXT */ +#define ATA_TRUSTED_NON_DATA 0x5b /* TRUSTED NON-DATA */ +#define ATA_TRUSTED_RECEIVE 0x5c /* TRUSTED RECEIVE */ +#define ATA_TRUSTED_RECEIVE_DMA 0x5d /* TRUSTED RECEIVE DMA */ +#define ATA_TRUSTED_SEND 0x5e /* TRUSTED SEND */ +#define ATA_TRUSTED_SEND_DMA 0x5f /* TRUSTED SEND DMA */ #define ATA_READ_FPDMA_QUEUED 0x60 /* read DMA NCQ */ #define ATA_WRITE_FPDMA_QUEUED 0x61 /* write DMA NCQ */ #define ATA_NCQ_NON_DATA 0x63 /* NCQ non-data command */ @@ -410,15 +444,22 @@ struct ata_params { #define ATA_RFPDMA_ZAC_MGMT_IN 0x02 /* NCQ ZAC mgmt in w/data */ #define ATA_SEP_ATTN 0x67 /* SEP request */ #define ATA_SEEK 0x70 /* seek */ +#define ATA_AMAX_ADDR 0x78 /* Accessible Max Address */ +#define ATA_AMAX_ADDR_GET 0x00 /* GET NATIVE MAX ADDRESS EXT */ +#define ATA_AMAX_ADDR_SET 0x01 /* SET ACCESSIBLE MAX ADDRESS EXT */ +#define ATA_AMAX_ADDR_FREEZE 0x02 /* FREEZE ACCESSIBLE MAX ADDRESS EXT */ #define ATA_ZAC_MANAGEMENT_OUT 0x9f /* ZAC management out */ #define ATA_ZM_CLOSE_ZONE 0x01 /* close zone */ #define ATA_ZM_FINISH_ZONE 0x02 /* finish zone */ #define ATA_ZM_OPEN_ZONE 0x03 /* open zone */ #define ATA_ZM_RWP 0x04 /* reset write pointer */ +#define ATA_DOWNLOAD_MICROCODE 0x92 /* DOWNLOAD MICROCODE */ +#define ATA_DOWNLOAD_MICROCODE_DMA 0x93 /* DOWNLOAD MICROCODE DMA */ #define ATA_PACKET_CMD 0xa0 /* packet command */ #define ATA_ATAPI_IDENTIFY 0xa1 /* get ATAPI params*/ #define ATA_SERVICE 0xa2 /* service command */ #define ATA_SMART_CMD 0xb0 /* SMART command */ +#define ATA_SANITIZE 0xb4 /* sanitize device */ #define ATA_CFA_ERASE 0xc0 /* CFA erase */ #define ATA_READ_MUL 0xc4 /* read multi */ #define ATA_WRITE_MUL 0xc5 /* write multi */ @@ -437,8 +478,11 @@ struct ata_params { #define ATA_CHECK_POWER_MODE 0xe5 /* device power mode */ #define ATA_SLEEP 0xe6 /* sleep */ #define ATA_FLUSHCACHE 
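
Sync note (sys/ata.h): the capabilities1 change fixes swapped bit definitions in IDENTIFY word 49; per the ATA/ATAPI command set, bit 10 (0x0400) means "IORDY may be disabled" and bit 11 (0x0800) means "IORDY supported". A reader that now tests the right bit:

    #include <stdint.h>

    #define ATA_SUPPORT_IORDYDIS    0x0400  /* word 49, bit 10 */
    #define ATA_SUPPORT_IORDY       0x0800  /* word 49, bit 11 */

    /* 'ident' is the 256-word IDENTIFY DEVICE buffer. */
    static int
    ata_has_iordy(const uint16_t *ident)
    {
            return ((ident[49] & ATA_SUPPORT_IORDY) != 0);
    }
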
0xe7 /* flush cache to disk */ +#define ATA_WRITE_BUFFER 0xe8 /* write buffer */ #define ATA_WRITE_PM 0xe8 /* write portmultiplier */ +#define ATA_READ_BUFFER_DMA 0xe9 /* read buffer DMA */ #define ATA_FLUSHCACHE48 0xea /* flush cache to disk */ +#define ATA_WRITE_BUFFER_DMA 0xeb /* write buffer DMA */ #define ATA_ATA_IDENTIFY 0xec /* get ATA params */ #define ATA_SETFEATURES 0xef /* features command */ #define ATA_SF_ENAB_WCACHE 0x02 /* enable write cache */ @@ -463,7 +507,6 @@ struct ata_params { #define ATA_SF_DIS_SRVIRQ 0xde /* disable service interrupt */ #define ATA_SF_LPSAERC 0x62 /* Long Phys Sect Align ErrRep*/ #define ATA_SF_DSN 0x63 /* Device Stats Notification */ -#define ATA_CHECK_POWER_MODE 0xe5 /* Check Power Mode */ #define ATA_SECURITY_SET_PASSWORD 0xf1 /* set drive password */ #define ATA_SECURITY_UNLOCK 0xf2 /* unlock drive using passwd */ #define ATA_SECURITY_ERASE_PREPARE 0xf3 /* prepare to erase drive */ @@ -473,7 +516,6 @@ struct ata_params { #define ATA_READ_NATIVE_MAX_ADDRESS 0xf8 /* read native max address */ #define ATA_SET_MAX_ADDRESS 0xf9 /* set max address */ - /* ATAPI commands */ #define ATAPI_TEST_UNIT_READY 0x00 /* check if device is ready */ #define ATAPI_REZERO 0x01 /* rewind */ @@ -534,7 +576,6 @@ struct ata_params { #define ATAPI_READ_CD 0xbe /* read data */ #define ATAPI_POLL_DSC 0xff /* poll DSC status bit */ - struct ata_ioc_devices { int channel; char name[2][32]; @@ -585,7 +626,7 @@ struct atapi_sense { u_int8_t specific; /* sense key specific */ #define ATA_SENSE_SPEC_VALID 0x80 #define ATA_SENSE_SPEC_MASK 0x7f - + u_int8_t specific1; /* sense key specific */ u_int8_t specific2; /* sense key specific */ } __packed; @@ -682,7 +723,7 @@ struct atapi_sense { #define ATA_IDL_ATA_STRINGS 0x05 /* ATA Strings */ #define ATA_IDL_SECURITY 0x06 /* Security */ #define ATA_IDL_PARALLEL_ATA 0x07 /* Parallel ATA */ -#define ATA_IDL_SERIAL_ATA 0x08 /* Seiral ATA */ +#define ATA_IDL_SERIAL_ATA 0x08 /* Serial ATA */ #define ATA_IDL_ZDI 0x09 /* Zoned Device Information */ struct ata_gp_log_dir { @@ -965,7 +1006,6 @@ struct ata_security_password { #define IOCATAGSPINDOWN _IOR('a', 104, int) #define IOCATASSPINDOWN _IOW('a', 105, int) - struct ata_ioc_raid_config { int lun; int type; diff --git a/usr/src/lib/libvmmapi/common/mapfile-vers b/usr/src/lib/libvmmapi/common/mapfile-vers index 26cfd15426..be0a055490 100644 --- a/usr/src/lib/libvmmapi/common/mapfile-vers +++ b/usr/src/lib/libvmmapi/common/mapfile-vers @@ -34,7 +34,6 @@ $mapfile_version 2 SYMBOL_VERSION ILLUMOSprivate { global: vcpu_reset; - vm_active_cpus; vm_activate_cpu; vm_active_cpus; vm_apicid2vcpu; @@ -46,19 +45,18 @@ SYMBOL_VERSION ILLUMOSprivate { vm_copy_teardown; vm_copyin; vm_copyout; - vm_create_devmem; vm_create; vm_create_devmem; vm_debug_cpus; vm_destroy; - vm_destroy; + vm_disable_pptdev_msix; vm_get_capability; vm_get_desc; vm_get_device_fd; vm_get_devmem_offset; vm_get_gpa_pmap; - vm_get_hpet_capabilities; vm_get_highmem_size; + vm_get_hpet_capabilities; vm_get_intinfo; vm_get_lowmem_limit; vm_get_lowmem_size; @@ -77,10 +75,6 @@ SYMBOL_VERSION ILLUMOSprivate { vm_inject_exception; vm_inject_fault; vm_inject_nmi; - vm_isa_assert_irq; - vm_isa_deassert_irq; - vm_isa_pulse_irq; - vm_isa_set_irq_trigger; vm_ioapic_assert_irq; vm_ioapic_deassert_irq; vm_ioapic_pincount; @@ -98,8 +92,10 @@ SYMBOL_VERSION ILLUMOSprivate { vm_mmap_memseg; vm_open; vm_parse_memsize; + vm_pmtmr_set_location; vm_reinit; vm_restart_instruction; + vm_resume_cpu; vm_rtc_gettime; vm_rtc_read; vm_rtc_settime; @@ -119,9 
+115,7 @@ SYMBOL_VERSION ILLUMOSprivate { vm_suspend; vm_suspend_cpu; vm_suspended_cpus; - vm_resume_cpu; vm_unassign_pptdev; - vm_pmtmr_set_location; vm_wrlock_cycle; vm_get_run_state; vm_set_run_state; diff --git a/usr/src/lib/libvmmapi/common/vmmapi.c b/usr/src/lib/libvmmapi/common/vmmapi.c index 0b22ca7522..fcb098a74f 100644 --- a/usr/src/lib/libvmmapi/common/vmmapi.c +++ b/usr/src/lib/libvmmapi/common/vmmapi.c @@ -1162,7 +1162,22 @@ vm_get_pptdev_limits(struct vmctx *ctx, int bus, int slot, int func, return (error); } + +int +vm_disable_pptdev_msix(struct vmctx *ctx, int bus, int slot, int func) +{ + struct vm_pptdev ppt; + + bzero(&ppt, sizeof(ppt)); + ppt.bus = bus; + ppt.slot = slot; + ppt.func = func; + + return ioctl(ctx->fd, VM_PPTDEV_DISABLE_MSIX, &ppt); +} + #else /* __FreeBSD__ */ + int vm_assign_pptdev(struct vmctx *ctx, int pptfd) { @@ -1238,6 +1253,15 @@ vm_get_pptdev_limits(struct vmctx *ctx, int pptfd, int *msi_limit, *msix_limit = pptlimits.msix_limit; return (error); } + +int +vm_disable_pptdev_msix(struct vmctx *ctx, int pptfd) +{ + struct vm_pptdev pptdev; + + pptdev.pptfd = pptfd; + return (ioctl(ctx->fd, VM_PPTDEV_DISABLE_MSIX, &pptdev)); +} #endif /* __FreeBSD__ */ uint64_t * @@ -1905,7 +1929,8 @@ vm_get_ioctls(size_t *len) VM_ISA_DEASSERT_IRQ, VM_ISA_PULSE_IRQ, VM_ISA_SET_IRQ_TRIGGER, VM_SET_CAPABILITY, VM_GET_CAPABILITY, VM_BIND_PPTDEV, VM_UNBIND_PPTDEV, VM_MAP_PPTDEV_MMIO, VM_PPTDEV_MSI, - VM_PPTDEV_MSIX, VM_INJECT_NMI, VM_STATS, VM_STAT_DESC, + VM_PPTDEV_MSIX, VM_PPTDEV_DISABLE_MSIX, + VM_INJECT_NMI, VM_STATS, VM_STAT_DESC, VM_SET_X2APIC_STATE, VM_GET_X2APIC_STATE, VM_GET_HPET_CAPABILITIES, VM_GET_GPA_PMAP, VM_GLA2GPA, VM_GLA2GPA_NOFAULT, diff --git a/usr/src/lib/libvmmapi/common/vmmapi.h b/usr/src/lib/libvmmapi/common/vmmapi.h index f7aaa02087..72e43a4e3d 100644 --- a/usr/src/lib/libvmmapi/common/vmmapi.h +++ b/usr/src/lib/libvmmapi/common/vmmapi.h @@ -225,6 +225,7 @@ int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); +int vm_disable_pptdev_msix(struct vmctx *ctx, int bus, int slot, int func); int vm_get_pptdev_limits(struct vmctx *ctx, int bus, int slot, int func, int *msi_limit, int *msix_limit); #else /* __FreeBSD__ */ @@ -236,6 +237,7 @@ int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int pptfd, uint64_t addr, uint64_t msg, int numvec); int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int pptfd, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); +int vm_disable_pptdev_msix(struct vmctx *ctx, int pptfd); int vm_get_pptdev_limits(struct vmctx *ctx, int pptfd, int *msi_limit, int *msix_limit); #endif /* __FreeBSD__ */ diff --git a/usr/src/uts/i86pc/io/vmm/README.sync b/usr/src/uts/i86pc/io/vmm/README.sync deleted file mode 100644 index 1b766008a8..0000000000 --- a/usr/src/uts/i86pc/io/vmm/README.sync +++ /dev/null @@ -1,37 +0,0 @@ -The bhyve kernel module and its associated userland consumers have been updated -to the latest upstream FreeBSD sources as of: - -commit 8ade7383cafed0f7555cac16ef7f9e956e46eaeb -Author: grehan <grehan@FreeBSD.org> -Date: Mon May 25 06:25:31 2020 +0000 - - Fix pci-passthru MSI issues with OpenBSD guests - - - Return 2 x 16-bit registers in the correct byte order - for a 4-byte read that spans the CMD/STATUS register. - This reversal was hiding the capabilities-list, which prevented - the MSI capability from being found for XHCI passthru. 
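
Sync note (libvmmapi): vm_disable_pptdev_msix() is a thin wrapper around the new VM_PPTDEV_DISABLE_MSIX ioctl; the illumos variant identifies the device by its ppt file descriptor rather than by bus/slot/function. A hypothetical caller (pptdev_fallback_to_msi and its surrounding logic are illustrative, not part of the library):

    #include <vmmapi.h>

    /*
     * Tear down MSI-X on a passed-through device, e.g. before switching
     * the guest back to classic MSI.  ctx and pptfd are assumed to be an
     * open VM context and ppt device descriptor obtained elsewhere.
     */
    static int
    pptdev_fallback_to_msi(struct vmctx *ctx, int pptfd)
    {
            int error;

            error = vm_disable_pptdev_msix(ctx, pptfd);
            if (error != 0)
                    return (error);
            /* ... vm_setup_pptdev_msi() may follow ... */
            return (0);
    }
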
- - - Reorganize MSI/MSI-x config writes so that a 4-byte write at the - capability offset would have the read-only portion skipped. - This prevented MSI interrupts from being enabled. - - Reported and extensively tested by Anatoli (me at anatoli dot ws) - - PR: 245392 - Reported by: Anatoli (me at anatoli dot ws) - Reviewed by: jhb (bhyve) - Approved by: jhb, bz (mentor) - MFC after: 1 week - Differential Revision: https://reviews.freebsd.org/D24951 - -Divergence Notes: -A previous sync skipped commit c8edafdabc27533d9c51eddc2896e772c16d965c which -introduced a generic backend functionality to network devices. Without that in -place, subsequent updates reflect the absence of that subsystem. Integrating -net backends has not been a priority, given the common use of viona on illumos. - -The draft Save/Restore functionality, added in FreeBSD commit -d3e4e512238b072fb9282e024610b981ba679869, has not been synced into illumos bhyve -yet. It is not built by default in FreeBSD, so we're not interested in taking -it until it successfully endures more in-depth testing. diff --git a/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c b/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c index a01b06446d..e2f298ae09 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c +++ b/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c @@ -356,7 +356,6 @@ amdvi_cmd_inv_iommu_pages(struct amdvi_softc *softc, uint16_t domain_id, cmd = amdvi_get_cmd_tail(softc); KASSERT(cmd != NULL, ("Cmd is NULL")); - cmd->opcode = AMDVI_INVD_PAGE_OPCODE; cmd->word1 = domain_id; /* @@ -729,7 +728,6 @@ amdvi_print_pci_cap(device_t dev) struct amdvi_softc *softc; uint32_t off, cap; - softc = device_get_softc(dev); off = softc->cap_off; @@ -869,7 +867,6 @@ amdvi_alloc_intr_resources(struct amdvi_softc *softc) return (0); } - static void amdvi_print_dev_cap(struct amdvi_softc *softc) { @@ -1121,7 +1118,6 @@ amdvi_free_ptp(uint64_t *ptp, int level) amdvi_free_ptp((uint64_t *)PHYS_TO_DMAP(ptp[i] & AMDVI_PT_MASK), level - 1); - } free(ptp, M_AMDVI); diff --git a/usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c b/usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c index 11925582ef..96241be8f4 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c +++ b/usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c @@ -105,7 +105,6 @@ ivrs_hdr_iterate_tbl(ivhd_iter_t iter, void *arg) default: printf("AMD-Vi:Not IVHD/IVMD type(%d)", ivrs_hdr->Type); - } ivrs_hdr = (ACPI_IVRS_HEADER *)((uint8_t *)ivrs_hdr + diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm.c b/usr/src/uts/i86pc/io/vmm/amd/svm.c index 94dce3fa47..62823b3a65 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/svm.c +++ b/usr/src/uts/i86pc/io/vmm/amd/svm.c @@ -475,7 +475,6 @@ vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa, svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); } - /* * Intercept everything when tracing guest exceptions otherwise * just intercept machine check exception. diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_support.s b/usr/src/uts/i86pc/io/vmm/amd/svm_support.s index c1537b1544..278dd5c5cb 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/svm_support.s +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_support.s @@ -37,10 +37,6 @@ /* Porting note: This is named 'svm_support.S' upstream. 
*/ -#define VMLOAD .byte 0x0f, 0x01, 0xda -#define VMRUN .byte 0x0f, 0x01, 0xd8 -#define VMSAVE .byte 0x0f, 0x01, 0xdb - /* * Flush scratch registers to avoid lingering guest state being used for @@ -87,7 +83,7 @@ ENTRY_NP(svm_launch) movq %rsi, SVMSTK_RSI(%rsp) movq %rdi, SVMSTK_RDI(%rsp) - /* VMLOAD and VMRUN expect the VMCB physaddr in %rax */ + /* Save the physical address of the VMCB in %rax */ movq %rdi, %rax /* Restore guest state. */ @@ -106,9 +102,9 @@ ENTRY_NP(svm_launch) movq SCTX_RDI(%rsi), %rdi movq SCTX_RSI(%rsi), %rsi /* %rsi must be restored last */ - VMLOAD - VMRUN - VMSAVE + vmload %rax + vmrun %rax + vmsave %rax /* Grab the svm_regctx pointer */ movq SVMSTK_RSI(%rsp), %rax diff --git a/usr/src/uts/i86pc/io/vmm/io/ppt.c b/usr/src/uts/i86pc/io/vmm/io/ppt.c index 2f715bcc42..02446862ea 100644 --- a/usr/src/uts/i86pc/io/vmm/io/ppt.c +++ b/usr/src/uts/i86pc/io/vmm/io/ppt.c @@ -825,31 +825,44 @@ fail: return (B_FALSE); } - -static struct pptdev * -ppt_findf(int fd) +static int +ppt_findf(struct vm *vm, int fd, struct pptdev **pptp) { struct pptdev *ppt = NULL; file_t *fp; vattr_t va; + int err = 0; - if ((fp = getf(fd)) == NULL) { - return (NULL); - } + ASSERT(MUTEX_HELD(&pptdev_mtx)); + + if ((fp = getf(fd)) == NULL) + return (EBADF); va.va_mask = AT_RDEV; if (VOP_GETATTR(fp->f_vnode, &va, NO_FOLLOW, fp->f_cred, NULL) != 0 || - getmajor(va.va_rdev) != ppt_major) + getmajor(va.va_rdev) != ppt_major) { + err = EBADF; goto fail; + } ppt = ddi_get_soft_state(ppt_state, getminor(va.va_rdev)); - if (ppt != NULL) - return (ppt); + if (ppt == NULL) { + err = EBADF; + goto fail; + } + + if (ppt->vm != vm) { + err = EBUSY; + goto fail; + } + + *pptp = ppt; + return (0); fail: releasef(fd); - return (NULL); + return (err); } static void @@ -992,16 +1005,11 @@ ppt_assign_device(struct vm *vm, int pptfd) int err = 0; mutex_enter(&pptdev_mtx); - ppt = ppt_findf(pptfd); - if (ppt == NULL) { + /* Passing NULL requires the device to be unowned. */ + err = ppt_findf(NULL, pptfd, &ppt); + if (err != 0) { mutex_exit(&pptdev_mtx); - return (EBADF); - } - - /* Only one VM may own a device at any given time */ - if (ppt->vm != NULL && ppt->vm != vm) { - err = EBUSY; - goto done; + return (err); } if (pci_save_config_regs(ppt->pptd_dip) != DDI_SUCCESS) { @@ -1091,20 +1099,14 @@ ppt_unassign_device(struct vm *vm, int pptfd) int err = 0; mutex_enter(&pptdev_mtx); - ppt = ppt_findf(pptfd); - if (ppt == NULL) { + err = ppt_findf(vm, pptfd, &ppt); + if (err != 0) { mutex_exit(&pptdev_mtx); - return (EBADF); + return (err); } - /* If this device is not owned by this 'vm' then bail out. */ - if (ppt->vm != vm) { - err = EBUSY; - goto done; - } ppt_do_unassign(ppt); -done: releasef(pptfd); mutex_exit(&pptdev_mtx); return (err); @@ -1135,14 +1137,10 @@ ppt_map_mmio(struct vm *vm, int pptfd, vm_paddr_t gpa, size_t len, int err = 0; mutex_enter(&pptdev_mtx); - ppt = ppt_findf(pptfd); - if (ppt == NULL) { + err = ppt_findf(vm, pptfd, &ppt); + if (err != 0) { mutex_exit(&pptdev_mtx); - return (EBADF); - } - if (ppt->vm != vm) { - err = EBUSY; - goto done; + return (err); } /* @@ -1208,13 +1206,14 @@ ppt_setup_msi(struct vm *vm, int vcpu, int pptfd, uint64_t addr, uint64_t msg, return (EINVAL); mutex_enter(&pptdev_mtx); - ppt = ppt_findf(pptfd); - if (ppt == NULL) { + err = ppt_findf(vm, pptfd, &ppt); + if (err != 0) { mutex_exit(&pptdev_mtx); - return (EBADF); + return (err); } - if (ppt->vm != vm) { - /* Make sure we own this device */ + + /* Reject attempts to enable MSI while MSI-X is active. 
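
Sync note (svm_support.s): the hand-assembled VMLOAD/VMRUN/VMSAVE byte macros are gone; the assembler now emits the same encodings from the mnemonics, with the VMCB physical address passed explicitly in %rax. For reference, the bytes the removed macros produced (values taken from the deleted defines above):

    #include <stdint.h>

    static const uint8_t vmrun_enc[]  = { 0x0f, 0x01, 0xd8 };   /* vmrun %rax */
    static const uint8_t vmload_enc[] = { 0x0f, 0x01, 0xda };   /* vmload %rax */
    static const uint8_t vmsave_enc[] = { 0x0f, 0x01, 0xdb };   /* vmsave %rax */
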
*/ + if (ppt->msix.num_msgs != 0 && numvec != 0) { err = EBUSY; goto done; } @@ -1308,13 +1307,14 @@ ppt_setup_msix(struct vm *vm, int vcpu, int pptfd, int idx, uint64_t addr, int err = 0; mutex_enter(&pptdev_mtx); - ppt = ppt_findf(pptfd); - if (ppt == NULL) { + err = ppt_findf(vm, pptfd, &ppt); + if (err != 0) { mutex_exit(&pptdev_mtx); - return (EBADF); + return (err); } - /* Make sure we own this device */ - if (ppt->vm != vm) { + + /* Reject attempts to enable MSI-X while MSI is active. */ + if (ppt->msi.num_msgs != 0) { err = EBUSY; goto done; } @@ -1410,14 +1410,10 @@ ppt_get_limits(struct vm *vm, int pptfd, int *msilimit, int *msixlimit) int err = 0; mutex_enter(&pptdev_mtx); - ppt = ppt_findf(pptfd); - if (ppt == NULL) { + err = ppt_findf(vm, pptfd, &ppt); + if (err != 0) { mutex_exit(&pptdev_mtx); - return (EBADF); - } - if (ppt->vm != vm) { - err = EBUSY; - goto done; + return (err); } if (ddi_intr_get_navail(ppt->pptd_dip, DDI_INTR_TYPE_MSI, @@ -1429,7 +1425,26 @@ ppt_get_limits(struct vm *vm, int pptfd, int *msilimit, int *msixlimit) *msixlimit = -1; } -done: + releasef(pptfd); + mutex_exit(&pptdev_mtx); + return (err); +} + +int +ppt_disable_msix(struct vm *vm, int pptfd) +{ + struct pptdev *ppt; + int err = 0; + + mutex_enter(&pptdev_mtx); + err = ppt_findf(vm, pptfd, &ppt); + if (err != 0) { + mutex_exit(&pptdev_mtx); + return (err); + } + + ppt_teardown_msix(ppt); + releasef(pptfd); mutex_exit(&pptdev_mtx); return (err); diff --git a/usr/src/uts/i86pc/io/vmm/io/ppt.h b/usr/src/uts/i86pc/io/vmm/io/ppt.h index 979c0e18ac..72a768c085 100644 --- a/usr/src/uts/i86pc/io/vmm/io/ppt.h +++ b/usr/src/uts/i86pc/io/vmm/io/ppt.h @@ -38,6 +38,7 @@ int ppt_setup_msi(struct vm *vm, int vcpu, int pptfd, uint64_t addr, uint64_t msg, int numvec); int ppt_setup_msix(struct vm *vm, int vcpu, int pptfd, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); +int ppt_disable_msix(struct vm *vm, int pptfd); int ppt_assigned_devices(struct vm *vm); boolean_t ppt_is_mmio(struct vm *vm, vm_paddr_t gpa); int ppt_get_limits(struct vm *vm, int pptfd, int *msilimit, int *msixlimit); diff --git a/usr/src/uts/i86pc/io/vmm/io/ppt.mapfile b/usr/src/uts/i86pc/io/vmm/io/ppt.mapfile index 708818d78e..1b08b06b58 100644 --- a/usr/src/uts/i86pc/io/vmm/io/ppt.mapfile +++ b/usr/src/uts/i86pc/io/vmm/io/ppt.mapfile @@ -41,6 +41,7 @@ SYMBOL_VERSION ILLUMOSprivate { ppt_assigned_devices; ppt_is_mmio; ppt_assign_device; + ppt_disable_msix; ppt_unassign_device; ppt_unassign_all; ppt_map_mmio; diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpit.c b/usr/src/uts/i86pc/io/vmm/io/vatpit.c index d8cfc1beb6..024aa076f7 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vatpit.c +++ b/usr/src/uts/i86pc/io/vmm/io/vatpit.c @@ -76,7 +76,6 @@ struct vatpit_callout_arg { int channel_num; }; - struct channel { int mode; uint16_t initial; /* initial counter value */ @@ -293,7 +292,6 @@ pit_readback(struct vatpit *vatpit, uint8_t cmd) return (error); } - static int vatpit_update_mode(struct vatpit *vatpit, uint8_t val) { diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c index cebcaf0fdb..557d32b764 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c @@ -490,6 +490,7 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, case VM_RTC_WRITE: case VM_RTC_SETTIME: case VM_RTC_GETTIME: + case VM_PPTDEV_DISABLE_MSIX: #ifndef __FreeBSD__ case VM_DEVMEM_GETOFFSET: #endif @@ -616,6 +617,16 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int 
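
Sync note (io/ppt.c): ppt_findf() now folds the ownership check into the lookup and returns an errno instead of a pointer: EBADF for a descriptor that is not a ppt device, EBUSY when the device belongs to a different VM, and 0 with *pptp set (and the file held via getf()) on success; passing vm == NULL demands an unowned device, which only ppt_assign_device() uses. Every entry point now follows the same shape (sketch; ppt_do_something is illustrative):

    static int
    ppt_do_something(struct vm *vm, int pptfd)
    {
            struct pptdev *ppt;
            int err;

            mutex_enter(&pptdev_mtx);
            err = ppt_findf(vm, pptfd, &ppt);   /* holds the fd on success */
            if (err != 0) {
                    mutex_exit(&pptdev_mtx);
                    return (err);
            }

            /* ... operate on ppt ... */

            releasef(pptfd);                    /* drop the getf() hold */
            mutex_exit(&pptdev_mtx);
            return (0);
    }
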
md, pptmsix.vector_control); break; } + case VM_PPTDEV_DISABLE_MSIX: { + struct vm_pptdev pptdev; + + if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { + error = EFAULT; + break; + } + error = ppt_disable_msix(sc->vmm_vm, pptdev.pptfd); + break; + } case VM_MAP_PPTDEV_MMIO: { struct vm_pptdev_mmio pptmmio; diff --git a/usr/src/uts/i86pc/sys/vmm_dev.h b/usr/src/uts/i86pc/sys/vmm_dev.h index f5d031bfd4..f4a68636b3 100644 --- a/usr/src/uts/i86pc/sys/vmm_dev.h +++ b/usr/src/uts/i86pc/sys/vmm_dev.h @@ -368,6 +368,7 @@ struct vm_run_state { #define VM_SUSPEND_CPU (VMM_IOC_BASE | 0x1d) #define VM_RESUME_CPU (VMM_IOC_BASE | 0x1e) +#define VM_PPTDEV_DISABLE_MSIX (VMM_IOC_BASE | 0x1f) #define VM_DEVMEM_GETOFFSET (VMM_IOC_BASE | 0xff) diff --git a/usr/src/uts/intel/ia32/ml/modstubs.s b/usr/src/uts/intel/ia32/ml/modstubs.s index 49c0cce31c..070083d8f1 100644 --- a/usr/src/uts/intel/ia32/ml/modstubs.s +++ b/usr/src/uts/intel/ia32/ml/modstubs.s @@ -1301,6 +1301,7 @@ fcnname/**/_info: \ WSTUB(ppt, ppt_map_mmio, nomod_einval); WSTUB(ppt, ppt_setup_msi, nomod_einval); WSTUB(ppt, ppt_setup_msix, nomod_einval); + WSTUB(ppt, ppt_disable_msix, nomod_einval); WSTUB(ppt, ppt_assigned_devices, nomod_zero); WSTUB(ppt, ppt_is_mmio, nomod_zero); WSTUB(ppt, ppt_assign_device, nomod_einval);
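
Sync note (VM_PPTDEV_DISABLE_MSIX plumbing): adding the ioctl touches every layer shown above: the request number in vmm_dev.h, the copyin-and-dispatch case in vmm_sol_dev.c, the ppt_disable_msix() implementation with its ppt.h declaration and ppt.mapfile export, the weak stub in modstubs.s so vmm can link without the ppt module loaded, and the libvmmapi wrapper. A hypothetical raw invocation that bypasses libvmmapi (the header path and initialization are assumptions; struct vm_pptdev carries the pptfd on illumos, per the diff):

    #include <sys/ioctl.h>
    #include <sys/vmm_dev.h>

    static int
    disable_msix_raw(int vmfd, int pptfd)
    {
            struct vm_pptdev pptdev;

            pptdev.pptfd = pptfd;
            return (ioctl(vmfd, VM_PPTDEV_DISABLE_MSIX, &pptdev));
    }
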