88 files changed, 7788 insertions, 620 deletions
diff --git a/usr/contrib/freebsd/dev/nvme/nvme.h b/usr/contrib/freebsd/dev/nvme/nvme.h
new file mode 100644
index 0000000000..73d4e2d2db
--- /dev/null
+++ b/usr/contrib/freebsd/dev/nvme/nvme.h
@@ -0,0 +1,1506 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (C) 2012-2013 Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef __NVME_H__
+#define __NVME_H__
+
+#ifdef _KERNEL
+#include <sys/types.h>
+#endif
+
+#include <sys/param.h>
+#include <sys/endian.h>
+
+#define NVME_PASSTHROUGH_CMD _IOWR('n', 0, struct nvme_pt_command)
+#define NVME_RESET_CONTROLLER _IO('n', 1)
+
+#define NVME_IO_TEST _IOWR('n', 100, struct nvme_io_test)
+#define NVME_BIO_TEST _IOWR('n', 101, struct nvme_io_test)
+
+/*
+ * Macros to deal with NVME revisions, as defined in the VS register
+ */
+#define NVME_REV(x, y) (((x) << 16) | ((y) << 8))
+#define NVME_MAJOR(r) (((r) >> 16) & 0xffff)
+#define NVME_MINOR(r) (((r) >> 8) & 0xff)
+
+/*
+ * Used to mark a command to apply to all namespaces, or to retrieve global
+ * log pages.
+ */
+#define NVME_GLOBAL_NAMESPACE_TAG ((uint32_t)0xFFFFFFFF)
+
+/* Cap nvme to 1MB transfers; the driver explodes with larger sizes */
+#define NVME_MAX_XFER_SIZE (MAXPHYS < (1<<20) ? MAXPHYS : (1<<20))
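As a quick orientation to these macros: the VS register packs the version into its upper bytes, so decoding is a pair of shifts (a minimal sketch; the value and the printf are illustrative, only NVME_MAJOR/NVME_MINOR come from the header above):

	uint32_t vs = 0x00010200;	/* example VS register value */
	printf("NVMe %u.%u\n", NVME_MAJOR(vs), NVME_MINOR(vs));	/* prints "NVMe 1.2" */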
+
+/* Register field definitions */
+#define NVME_CAP_LO_REG_MQES_SHIFT (0)
+#define NVME_CAP_LO_REG_MQES_MASK (0xFFFF)
+#define NVME_CAP_LO_REG_CQR_SHIFT (16)
+#define NVME_CAP_LO_REG_CQR_MASK (0x1)
+#define NVME_CAP_LO_REG_AMS_SHIFT (17)
+#define NVME_CAP_LO_REG_AMS_MASK (0x3)
+#define NVME_CAP_LO_REG_TO_SHIFT (24)
+#define NVME_CAP_LO_REG_TO_MASK (0xFF)
+
+#define NVME_CAP_HI_REG_DSTRD_SHIFT (0)
+#define NVME_CAP_HI_REG_DSTRD_MASK (0xF)
+#define NVME_CAP_HI_REG_CSS_NVM_SHIFT (5)
+#define NVME_CAP_HI_REG_CSS_NVM_MASK (0x1)
+#define NVME_CAP_HI_REG_MPSMIN_SHIFT (16)
+#define NVME_CAP_HI_REG_MPSMIN_MASK (0xF)
+#define NVME_CAP_HI_REG_MPSMAX_SHIFT (20)
+#define NVME_CAP_HI_REG_MPSMAX_MASK (0xF)
+
+#define NVME_CC_REG_EN_SHIFT (0)
+#define NVME_CC_REG_EN_MASK (0x1)
+#define NVME_CC_REG_CSS_SHIFT (4)
+#define NVME_CC_REG_CSS_MASK (0x7)
+#define NVME_CC_REG_MPS_SHIFT (7)
+#define NVME_CC_REG_MPS_MASK (0xF)
+#define NVME_CC_REG_AMS_SHIFT (11)
+#define NVME_CC_REG_AMS_MASK (0x7)
+#define NVME_CC_REG_SHN_SHIFT (14)
+#define NVME_CC_REG_SHN_MASK (0x3)
+#define NVME_CC_REG_IOSQES_SHIFT (16)
+#define NVME_CC_REG_IOSQES_MASK (0xF)
+#define NVME_CC_REG_IOCQES_SHIFT (20)
+#define NVME_CC_REG_IOCQES_MASK (0xF)
+
+#define NVME_CSTS_REG_RDY_SHIFT (0)
+#define NVME_CSTS_REG_RDY_MASK (0x1)
+#define NVME_CSTS_REG_CFS_SHIFT (1)
+#define NVME_CSTS_REG_CFS_MASK (0x1)
+#define NVME_CSTS_REG_SHST_SHIFT (2)
+#define NVME_CSTS_REG_SHST_MASK (0x3)
+
+#define NVME_CSTS_GET_SHST(csts) (((csts) >> NVME_CSTS_REG_SHST_SHIFT) & NVME_CSTS_REG_SHST_MASK)
+
+#define NVME_AQA_REG_ASQS_SHIFT (0)
+#define NVME_AQA_REG_ASQS_MASK (0xFFF)
+#define NVME_AQA_REG_ACQS_SHIFT (16)
+#define NVME_AQA_REG_ACQS_MASK (0xFFF)
+
+/* Command field definitions */
+
+#define NVME_CMD_FUSE_SHIFT (8)
+#define NVME_CMD_FUSE_MASK (0x3)
+
+#define NVME_STATUS_P_SHIFT (0)
+#define NVME_STATUS_P_MASK (0x1)
+#define NVME_STATUS_SC_SHIFT (1)
+#define NVME_STATUS_SC_MASK (0xFF)
+#define NVME_STATUS_SCT_SHIFT (9)
+#define NVME_STATUS_SCT_MASK (0x7)
+#define NVME_STATUS_M_SHIFT (14)
+#define NVME_STATUS_M_MASK (0x1)
+#define NVME_STATUS_DNR_SHIFT (15)
+#define NVME_STATUS_DNR_MASK (0x1)
+
+#define NVME_STATUS_GET_P(st) (((st) >> NVME_STATUS_P_SHIFT) & NVME_STATUS_P_MASK)
+#define NVME_STATUS_GET_SC(st) (((st) >> NVME_STATUS_SC_SHIFT) & NVME_STATUS_SC_MASK)
+#define NVME_STATUS_GET_SCT(st) (((st) >> NVME_STATUS_SCT_SHIFT) & NVME_STATUS_SCT_MASK)
+#define NVME_STATUS_GET_M(st) (((st) >> NVME_STATUS_M_SHIFT) & NVME_STATUS_M_MASK)
+#define NVME_STATUS_GET_DNR(st) (((st) >> NVME_STATUS_DNR_SHIFT) & NVME_STATUS_DNR_MASK)
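Every field here follows the same shift-then-mask pattern; a completion's 16-bit status word, for example, unpacks like this (a sketch; cpl would be a completed struct nvme_completion, defined further down in this header):

	uint16_t st = cpl.status;
	if (NVME_STATUS_GET_SC(st) != 0 || NVME_STATUS_GET_SCT(st) != 0) {
		/* SCT selects the status-code table, SC the entry within it */
		fprintf(stderr, "nvme error: sct=%u sc=0x%02x dnr=%u\n",
		    NVME_STATUS_GET_SCT(st), NVME_STATUS_GET_SC(st),
		    NVME_STATUS_GET_DNR(st));
	}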
+
+#define NVME_PWR_ST_MPS_SHIFT (0)
+#define NVME_PWR_ST_MPS_MASK (0x1)
+#define NVME_PWR_ST_NOPS_SHIFT (1)
+#define NVME_PWR_ST_NOPS_MASK (0x1)
+#define NVME_PWR_ST_RRT_SHIFT (0)
+#define NVME_PWR_ST_RRT_MASK (0x1F)
+#define NVME_PWR_ST_RRL_SHIFT (0)
+#define NVME_PWR_ST_RRL_MASK (0x1F)
+#define NVME_PWR_ST_RWT_SHIFT (0)
+#define NVME_PWR_ST_RWT_MASK (0x1F)
+#define NVME_PWR_ST_RWL_SHIFT (0)
+#define NVME_PWR_ST_RWL_MASK (0x1F)
+#define NVME_PWR_ST_IPS_SHIFT (6)
+#define NVME_PWR_ST_IPS_MASK (0x3)
+#define NVME_PWR_ST_APW_SHIFT (0)
+#define NVME_PWR_ST_APW_MASK (0x7)
+#define NVME_PWR_ST_APS_SHIFT (6)
+#define NVME_PWR_ST_APS_MASK (0x3)
+
+/** Controller Multi-path I/O and Namespace Sharing Capabilities */
+/* More than one port */
+#define NVME_CTRLR_DATA_MIC_MPORTS_SHIFT (0)
+#define NVME_CTRLR_DATA_MIC_MPORTS_MASK (0x1)
+/* More than one controller */
+#define NVME_CTRLR_DATA_MIC_MCTRLRS_SHIFT (1)
+#define NVME_CTRLR_DATA_MIC_MCTRLRS_MASK (0x1)
+/* SR-IOV Virtual Function */
+#define NVME_CTRLR_DATA_MIC_SRIOVVF_SHIFT (2)
+#define NVME_CTRLR_DATA_MIC_SRIOVVF_MASK (0x1)
+
+/** OACS - optional admin command support */
+/* supports security send/receive commands */
+#define NVME_CTRLR_DATA_OACS_SECURITY_SHIFT (0)
+#define NVME_CTRLR_DATA_OACS_SECURITY_MASK (0x1)
+/* supports format nvm command */
+#define NVME_CTRLR_DATA_OACS_FORMAT_SHIFT (1)
+#define NVME_CTRLR_DATA_OACS_FORMAT_MASK (0x1)
+/* supports firmware activate/download commands */
+#define NVME_CTRLR_DATA_OACS_FIRMWARE_SHIFT (2)
+#define NVME_CTRLR_DATA_OACS_FIRMWARE_MASK (0x1)
+/* supports namespace management commands */
+#define NVME_CTRLR_DATA_OACS_NSMGMT_SHIFT (3)
+#define NVME_CTRLR_DATA_OACS_NSMGMT_MASK (0x1)
+/* supports Device Self-test command */
+#define NVME_CTRLR_DATA_OACS_SELFTEST_SHIFT (4)
+#define NVME_CTRLR_DATA_OACS_SELFTEST_MASK (0x1)
+/* supports Directives */
+#define NVME_CTRLR_DATA_OACS_DIRECTIVES_SHIFT (5)
+#define NVME_CTRLR_DATA_OACS_DIRECTIVES_MASK (0x1)
+/* supports NVMe-MI Send/Receive */
+#define NVME_CTRLR_DATA_OACS_NVMEMI_SHIFT (6)
+#define NVME_CTRLR_DATA_OACS_NVMEMI_MASK (0x1)
+/* supports Virtualization Management */
+#define NVME_CTRLR_DATA_OACS_VM_SHIFT (7)
+#define NVME_CTRLR_DATA_OACS_VM_MASK (0x1)
+/* supports Doorbell Buffer Config */
+#define NVME_CTRLR_DATA_OACS_DBBUFFER_SHIFT (8)
+#define NVME_CTRLR_DATA_OACS_DBBUFFER_MASK (0x1)
+
+/** firmware updates */
+/* first slot is read-only */
+#define NVME_CTRLR_DATA_FRMW_SLOT1_RO_SHIFT (0)
+#define NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK (0x1)
+/* number of firmware slots */
+#define NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT (1)
+#define NVME_CTRLR_DATA_FRMW_NUM_SLOTS_MASK (0x7)
+
+/** log page attributes */
+/* per namespace smart/health log page */
+#define NVME_CTRLR_DATA_LPA_NS_SMART_SHIFT (0)
+#define NVME_CTRLR_DATA_LPA_NS_SMART_MASK (0x1)
+
+/** AVSCC - admin vendor specific command configuration */
+/* admin vendor specific commands use spec format */
+#define NVME_CTRLR_DATA_AVSCC_SPEC_FORMAT_SHIFT (0)
+#define NVME_CTRLR_DATA_AVSCC_SPEC_FORMAT_MASK (0x1)
+
+/** Autonomous Power State Transition Attributes */
+/* Autonomous Power State Transitions supported */
+#define NVME_CTRLR_DATA_APSTA_APST_SUPP_SHIFT (0)
+#define NVME_CTRLR_DATA_APSTA_APST_SUPP_MASK (0x1)
+
+/** submission queue entry size */
+#define NVME_CTRLR_DATA_SQES_MIN_SHIFT (0)
+#define NVME_CTRLR_DATA_SQES_MIN_MASK (0xF)
+#define NVME_CTRLR_DATA_SQES_MAX_SHIFT (4)
+#define NVME_CTRLR_DATA_SQES_MAX_MASK (0xF)
+
+/** completion queue entry size */
+#define NVME_CTRLR_DATA_CQES_MIN_SHIFT (0)
+#define NVME_CTRLR_DATA_CQES_MIN_MASK (0xF)
+#define NVME_CTRLR_DATA_CQES_MAX_SHIFT (4)
+#define NVME_CTRLR_DATA_CQES_MAX_MASK (0xF)
+
+/** optional nvm command support */
+#define NVME_CTRLR_DATA_ONCS_COMPARE_SHIFT (0)
+#define NVME_CTRLR_DATA_ONCS_COMPARE_MASK (0x1)
+#define NVME_CTRLR_DATA_ONCS_WRITE_UNC_SHIFT (1)
+#define NVME_CTRLR_DATA_ONCS_WRITE_UNC_MASK (0x1)
+#define NVME_CTRLR_DATA_ONCS_DSM_SHIFT (2)
+#define NVME_CTRLR_DATA_ONCS_DSM_MASK (0x1)
+#define NVME_CTRLR_DATA_ONCS_WRZERO_SHIFT (3)
+#define NVME_CTRLR_DATA_ONCS_WRZERO_MASK (0x1)
+#define NVME_CTRLR_DATA_ONCS_SAVEFEAT_SHIFT (4)
+#define NVME_CTRLR_DATA_ONCS_SAVEFEAT_MASK (0x1)
+#define NVME_CTRLR_DATA_ONCS_RESERV_SHIFT (5)
+#define NVME_CTRLR_DATA_ONCS_RESERV_MASK (0x1)
+#define NVME_CTRLR_DATA_ONCS_TIMESTAMP_SHIFT (6)
+#define NVME_CTRLR_DATA_ONCS_TIMESTAMP_MASK (0x1)
+
+/** Fused Operation Support */
+#define NVME_CTRLR_DATA_FUSES_CNW_SHIFT (0)
+#define NVME_CTRLR_DATA_FUSES_CNW_MASK (0x1)
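The identify-controller fields decode the same way; for instance, testing whether a controller advertises Dataset Management before issuing TRIM might look like this (sketch; cdata is a struct nvme_controller_data, defined further down):

	int has_dsm = (cdata.oncs >> NVME_CTRLR_DATA_ONCS_DSM_SHIFT) &
	    NVME_CTRLR_DATA_ONCS_DSM_MASK;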
+
+/** Format NVM Attributes */
+#define NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT (0)
+#define NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK (0x1)
+#define NVME_CTRLR_DATA_FNA_ERASE_ALL_SHIFT (1)
+#define NVME_CTRLR_DATA_FNA_ERASE_ALL_MASK (0x1)
+#define NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_SHIFT (2)
+#define NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_MASK (0x1)
+
+/** volatile write cache */
+#define NVME_CTRLR_DATA_VWC_PRESENT_SHIFT (0)
+#define NVME_CTRLR_DATA_VWC_PRESENT_MASK (0x1)
+
+/** namespace features */
+/* thin provisioning */
+#define NVME_NS_DATA_NSFEAT_THIN_PROV_SHIFT (0)
+#define NVME_NS_DATA_NSFEAT_THIN_PROV_MASK (0x1)
+/* NAWUN, NAWUPF, and NACWU fields are valid */
+#define NVME_NS_DATA_NSFEAT_NA_FIELDS_SHIFT (1)
+#define NVME_NS_DATA_NSFEAT_NA_FIELDS_MASK (0x1)
+/* Deallocated or Unwritten Logical Block errors supported */
+#define NVME_NS_DATA_NSFEAT_DEALLOC_SHIFT (2)
+#define NVME_NS_DATA_NSFEAT_DEALLOC_MASK (0x1)
+/* NGUID and EUI64 fields are not reusable */
+#define NVME_NS_DATA_NSFEAT_NO_ID_REUSE_SHIFT (3)
+#define NVME_NS_DATA_NSFEAT_NO_ID_REUSE_MASK (0x1)
+
+/** formatted lba size */
+#define NVME_NS_DATA_FLBAS_FORMAT_SHIFT (0)
+#define NVME_NS_DATA_FLBAS_FORMAT_MASK (0xF)
+#define NVME_NS_DATA_FLBAS_EXTENDED_SHIFT (4)
+#define NVME_NS_DATA_FLBAS_EXTENDED_MASK (0x1)
+
+/** metadata capabilities */
+/* metadata can be transferred as part of data prp list */
+#define NVME_NS_DATA_MC_EXTENDED_SHIFT (0)
+#define NVME_NS_DATA_MC_EXTENDED_MASK (0x1)
+/* metadata can be transferred with separate metadata pointer */
+#define NVME_NS_DATA_MC_POINTER_SHIFT (1)
+#define NVME_NS_DATA_MC_POINTER_MASK (0x1)
+
+/** end-to-end data protection capabilities */
+/* protection information type 1 */
+#define NVME_NS_DATA_DPC_PIT1_SHIFT (0)
+#define NVME_NS_DATA_DPC_PIT1_MASK (0x1)
+/* protection information type 2 */
+#define NVME_NS_DATA_DPC_PIT2_SHIFT (1)
+#define NVME_NS_DATA_DPC_PIT2_MASK (0x1)
+/* protection information type 3 */
+#define NVME_NS_DATA_DPC_PIT3_SHIFT (2)
+#define NVME_NS_DATA_DPC_PIT3_MASK (0x1)
+/* first eight bytes of metadata */
+#define NVME_NS_DATA_DPC_MD_START_SHIFT (3)
+#define NVME_NS_DATA_DPC_MD_START_MASK (0x1)
+/* last eight bytes of metadata */
+#define NVME_NS_DATA_DPC_MD_END_SHIFT (4)
+#define NVME_NS_DATA_DPC_MD_END_MASK (0x1)
+
+/** end-to-end data protection type settings */
+/* protection information type */
+#define NVME_NS_DATA_DPS_PIT_SHIFT (0)
+#define NVME_NS_DATA_DPS_PIT_MASK (0x7)
+/* 1 == protection info transferred at start of metadata */
+/* 0 == protection info transferred at end of metadata */
+#define NVME_NS_DATA_DPS_MD_START_SHIFT (3)
+#define NVME_NS_DATA_DPS_MD_START_MASK (0x1)
+
+/** Namespace Multi-path I/O and Namespace Sharing Capabilities */
+/* the namespace may be attached to two or more controllers */
+#define NVME_NS_DATA_NMIC_MAY_BE_SHARED_SHIFT (0)
+#define NVME_NS_DATA_NMIC_MAY_BE_SHARED_MASK (0x1)
+
+/** Reservation Capabilities */
+/* Persist Through Power Loss */
+#define NVME_NS_DATA_RESCAP_PTPL_SHIFT (0)
+#define NVME_NS_DATA_RESCAP_PTPL_MASK (0x1)
+/* supports the Write Exclusive */
+#define NVME_NS_DATA_RESCAP_WR_EX_SHIFT (1)
+#define NVME_NS_DATA_RESCAP_WR_EX_MASK (0x1)
+/* supports the Exclusive Access */
+#define NVME_NS_DATA_RESCAP_EX_AC_SHIFT (2)
+#define NVME_NS_DATA_RESCAP_EX_AC_MASK (0x1)
+/* supports the Write Exclusive – Registrants Only */
+#define NVME_NS_DATA_RESCAP_WR_EX_RO_SHIFT (3)
+#define NVME_NS_DATA_RESCAP_WR_EX_RO_MASK (0x1)
+/* supports the Exclusive Access - Registrants Only */
+#define NVME_NS_DATA_RESCAP_EX_AC_RO_SHIFT (4)
+#define NVME_NS_DATA_RESCAP_EX_AC_RO_MASK (0x1)
+/* supports the Write Exclusive – All Registrants */
+#define NVME_NS_DATA_RESCAP_WR_EX_AR_SHIFT (5)
+#define NVME_NS_DATA_RESCAP_WR_EX_AR_MASK (0x1)
+/* supports the Exclusive Access - All Registrants */
+#define NVME_NS_DATA_RESCAP_EX_AC_AR_SHIFT (6)
+#define NVME_NS_DATA_RESCAP_EX_AC_AR_MASK (0x1)
+/* Ignore Existing Key is used as defined in revision 1.3 or later */
+#define NVME_NS_DATA_RESCAP_IEKEY13_SHIFT (7)
+#define NVME_NS_DATA_RESCAP_IEKEY13_MASK (0x1)
+
+/** Format Progress Indicator */
+/* percentage of the Format NVM command that remains to be completed */
+#define NVME_NS_DATA_FPI_PERC_SHIFT (0)
+#define NVME_NS_DATA_FPI_PERC_MASK (0x7f)
+/* namespace supports the Format Progress Indicator */
+#define NVME_NS_DATA_FPI_SUPP_SHIFT (7)
+#define NVME_NS_DATA_FPI_SUPP_MASK (0x1)
+
+/** lba format support */
+/* metadata size */
+#define NVME_NS_DATA_LBAF_MS_SHIFT (0)
+#define NVME_NS_DATA_LBAF_MS_MASK (0xFFFF)
+/* lba data size */
+#define NVME_NS_DATA_LBAF_LBADS_SHIFT (16)
+#define NVME_NS_DATA_LBAF_LBADS_MASK (0xFF)
+/* relative performance */
+#define NVME_NS_DATA_LBAF_RP_SHIFT (24)
+#define NVME_NS_DATA_LBAF_RP_MASK (0x3)
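FLBAS and LBAF combine to give the namespace sector size: the format index from flbas selects an lbaf[] entry, whose LBADS field is a power-of-two exponent (sketch; nsdata is a struct nvme_namespace_data, defined further down):

	uint8_t fmt = (nsdata.flbas >> NVME_NS_DATA_FLBAS_FORMAT_SHIFT) &
	    NVME_NS_DATA_FLBAS_FORMAT_MASK;
	uint8_t lbads = (nsdata.lbaf[fmt] >> NVME_NS_DATA_LBAF_LBADS_SHIFT) &
	    NVME_NS_DATA_LBAF_LBADS_MASK;
	uint32_t sector_size = 1u << lbads;	/* e.g. lbads == 9 -> 512 bytes */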
+
+enum nvme_critical_warning_state {
+	NVME_CRIT_WARN_ST_AVAILABLE_SPARE = 0x1,
+	NVME_CRIT_WARN_ST_TEMPERATURE = 0x2,
+	NVME_CRIT_WARN_ST_DEVICE_RELIABILITY = 0x4,
+	NVME_CRIT_WARN_ST_READ_ONLY = 0x8,
+	NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP = 0x10,
+};
+#define NVME_CRIT_WARN_ST_RESERVED_MASK (0xE0)
+
+/* slot for current FW */
+#define NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT (0)
+#define NVME_FIRMWARE_PAGE_AFI_SLOT_MASK (0x7)
+
+/* CC register SHN field values */
+enum shn_value {
+	NVME_SHN_NORMAL = 0x1,
+	NVME_SHN_ABRUPT = 0x2,
+};
+
+/* CSTS register SHST field values */
+enum shst_value {
+	NVME_SHST_NORMAL = 0x0,
+	NVME_SHST_OCCURRING = 0x1,
+	NVME_SHST_COMPLETE = 0x2,
+};
+
+struct nvme_registers
+{
+	/** controller capabilities */
+	uint32_t cap_lo;
+	uint32_t cap_hi;
+
+	uint32_t vs;	/* version */
+	uint32_t intms;	/* interrupt mask set */
+	uint32_t intmc;	/* interrupt mask clear */
+
+	/** controller configuration */
+	uint32_t cc;
+
+	uint32_t reserved1;
+
+	/** controller status */
+	uint32_t csts;
+
+	uint32_t reserved2;
+
+	/** admin queue attributes */
+	uint32_t aqa;
+
+	uint64_t asq;	/* admin submission queue base addr */
+	uint64_t acq;	/* admin completion queue base addr */
+	uint32_t reserved3[0x3f2];
+
+	struct {
+		uint32_t sq_tdbl;	/* submission queue tail doorbell */
+		uint32_t cq_hdbl;	/* completion queue head doorbell */
+	} doorbell[1] __packed;
+} __packed;
+
+_Static_assert(sizeof(struct nvme_registers) == 0x1008, "bad size for nvme_registers");
+
+struct nvme_command
+{
+	/* dword 0 */
+	uint8_t opc;	/* opcode */
+	uint8_t fuse;	/* fused operation */
+	uint16_t cid;	/* command identifier */
+
+	/* dword 1 */
+	uint32_t nsid;	/* namespace identifier */
+
+	/* dword 2-3 */
+	uint32_t rsvd2;
+	uint32_t rsvd3;
+
+	/* dword 4-5 */
+	uint64_t mptr;	/* metadata pointer */
+
+	/* dword 6-7 */
+	uint64_t prp1;	/* prp entry 1 */
+
+	/* dword 8-9 */
+	uint64_t prp2;	/* prp entry 2 */
+
+	/* dword 10-15 */
+	uint32_t cdw10;	/* command-specific */
+	uint32_t cdw11;	/* command-specific */
+	uint32_t cdw12;	/* command-specific */
+	uint32_t cdw13;	/* command-specific */
+	uint32_t cdw14;	/* command-specific */
+	uint32_t cdw15;	/* command-specific */
+} __packed;
+
+_Static_assert(sizeof(struct nvme_command) == 16 * 4, "bad size for nvme_command");
+
+struct nvme_completion {
+
+	/* dword 0 */
+	uint32_t cdw0;	/* command-specific */
+
+	/* dword 1 */
+	uint32_t rsvd1;
+
+	/* dword 2 */
+	uint16_t sqhd;	/* submission queue head pointer */
+	uint16_t sqid;	/* submission queue identifier */
+
+	/* dword 3 */
+	uint16_t cid;	/* command identifier */
+	uint16_t status;
+} __packed;
+
+_Static_assert(sizeof(struct nvme_completion) == 4 * 4, "bad size for nvme_completion");
+
+struct nvme_dsm_range {
+	uint32_t attributes;
+	uint32_t length;
+	uint64_t starting_lba;
+} __packed;
+
+/* Largest DSM Trim that can be done */
+#define NVME_MAX_DSM_TRIM 4096
+
+_Static_assert(sizeof(struct nvme_dsm_range) == 16, "bad size for nvme_dsm_range");
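A Dataset Management (TRIM) payload is simply an array of these 16-byte ranges, capped at NVME_MAX_DSM_TRIM bytes; a single-range deallocate request could be built like this (sketch; NVME_DSM_ATTR_DEALLOCATE is defined a little further down, and the fields are little-endian on the wire):

	struct nvme_dsm_range r = {
		.attributes = NVME_DSM_ATTR_DEALLOCATE,	/* deallocate (TRIM) */
		.length = 2048,		/* range length, in logical blocks */
		.starting_lba = 0,
	};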
+
+/* status code types */
+enum nvme_status_code_type {
+	NVME_SCT_GENERIC = 0x0,
+	NVME_SCT_COMMAND_SPECIFIC = 0x1,
+	NVME_SCT_MEDIA_ERROR = 0x2,
+	/* 0x3-0x6 - reserved */
+	NVME_SCT_VENDOR_SPECIFIC = 0x7,
+};
+
+/* generic command status codes */
+enum nvme_generic_command_status_code {
+	NVME_SC_SUCCESS = 0x00,
+	NVME_SC_INVALID_OPCODE = 0x01,
+	NVME_SC_INVALID_FIELD = 0x02,
+	NVME_SC_COMMAND_ID_CONFLICT = 0x03,
+	NVME_SC_DATA_TRANSFER_ERROR = 0x04,
+	NVME_SC_ABORTED_POWER_LOSS = 0x05,
+	NVME_SC_INTERNAL_DEVICE_ERROR = 0x06,
+	NVME_SC_ABORTED_BY_REQUEST = 0x07,
+	NVME_SC_ABORTED_SQ_DELETION = 0x08,
+	NVME_SC_ABORTED_FAILED_FUSED = 0x09,
+	NVME_SC_ABORTED_MISSING_FUSED = 0x0a,
+	NVME_SC_INVALID_NAMESPACE_OR_FORMAT = 0x0b,
+	NVME_SC_COMMAND_SEQUENCE_ERROR = 0x0c,
+	NVME_SC_INVALID_SGL_SEGMENT_DESCR = 0x0d,
+	NVME_SC_INVALID_NUMBER_OF_SGL_DESCR = 0x0e,
+	NVME_SC_DATA_SGL_LENGTH_INVALID = 0x0f,
+	NVME_SC_METADATA_SGL_LENGTH_INVALID = 0x10,
+	NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID = 0x11,
+	NVME_SC_INVALID_USE_OF_CMB = 0x12,
+	NVME_SC_PRP_OFFET_INVALID = 0x13,
+	NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED = 0x14,
+	NVME_SC_OPERATION_DENIED = 0x15,
+	NVME_SC_SGL_OFFSET_INVALID = 0x16,
+	/* 0x17 - reserved */
+	NVME_SC_HOST_ID_INCONSISTENT_FORMAT = 0x18,
+	NVME_SC_KEEP_ALIVE_TIMEOUT_EXPIRED = 0x19,
+	NVME_SC_KEEP_ALIVE_TIMEOUT_INVALID = 0x1a,
+	NVME_SC_ABORTED_DUE_TO_PREEMPT = 0x1b,
+	NVME_SC_SANITIZE_FAILED = 0x1c,
+	NVME_SC_SANITIZE_IN_PROGRESS = 0x1d,
+	NVME_SC_SGL_DATA_BLOCK_GRAN_INVALID = 0x1e,
+	NVME_SC_NOT_SUPPORTED_IN_CMB = 0x1f,
+
+	NVME_SC_LBA_OUT_OF_RANGE = 0x80,
+	NVME_SC_CAPACITY_EXCEEDED = 0x81,
+	NVME_SC_NAMESPACE_NOT_READY = 0x82,
+	NVME_SC_RESERVATION_CONFLICT = 0x83,
+	NVME_SC_FORMAT_IN_PROGRESS = 0x84,
+};
+
+/* command specific status codes */
+enum nvme_command_specific_status_code {
+	NVME_SC_COMPLETION_QUEUE_INVALID = 0x00,
+	NVME_SC_INVALID_QUEUE_IDENTIFIER = 0x01,
+	NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED = 0x02,
+	NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED = 0x03,
+	/* 0x04 - reserved */
+	NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED = 0x05,
+	NVME_SC_INVALID_FIRMWARE_SLOT = 0x06,
+	NVME_SC_INVALID_FIRMWARE_IMAGE = 0x07,
+	NVME_SC_INVALID_INTERRUPT_VECTOR = 0x08,
+	NVME_SC_INVALID_LOG_PAGE = 0x09,
+	NVME_SC_INVALID_FORMAT = 0x0a,
+	NVME_SC_FIRMWARE_REQUIRES_RESET = 0x0b,
+	NVME_SC_INVALID_QUEUE_DELETION = 0x0c,
+	NVME_SC_FEATURE_NOT_SAVEABLE = 0x0d,
+	NVME_SC_FEATURE_NOT_CHANGEABLE = 0x0e,
+	NVME_SC_FEATURE_NOT_NS_SPECIFIC = 0x0f,
+	NVME_SC_FW_ACT_REQUIRES_NVMS_RESET = 0x10,
+	NVME_SC_FW_ACT_REQUIRES_RESET = 0x11,
+	NVME_SC_FW_ACT_REQUIRES_TIME = 0x12,
+	NVME_SC_FW_ACT_PROHIBITED = 0x13,
+	NVME_SC_OVERLAPPING_RANGE = 0x14,
+	NVME_SC_NS_INSUFFICIENT_CAPACITY = 0x15,
+	NVME_SC_NS_ID_UNAVAILABLE = 0x16,
+	/* 0x17 - reserved */
+	NVME_SC_NS_ALREADY_ATTACHED = 0x18,
+	NVME_SC_NS_IS_PRIVATE = 0x19,
+	NVME_SC_NS_NOT_ATTACHED = 0x1a,
+	NVME_SC_THIN_PROV_NOT_SUPPORTED = 0x1b,
+	NVME_SC_CTRLR_LIST_INVALID = 0x1c,
+	NVME_SC_SELT_TEST_IN_PROGRESS = 0x1d,
+	NVME_SC_BOOT_PART_WRITE_PROHIB = 0x1e,
+	NVME_SC_INVALID_CTRLR_ID = 0x1f,
+	NVME_SC_INVALID_SEC_CTRLR_STATE = 0x20,
+	NVME_SC_INVALID_NUM_OF_CTRLR_RESRC = 0x21,
+	NVME_SC_INVALID_RESOURCE_ID = 0x22,
+
+	NVME_SC_CONFLICTING_ATTRIBUTES = 0x80,
+	NVME_SC_INVALID_PROTECTION_INFO = 0x81,
+	NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE = 0x82,
+};
+
+/* media error status codes */
+enum nvme_media_error_status_code {
+	NVME_SC_WRITE_FAULTS = 0x80,
+	NVME_SC_UNRECOVERED_READ_ERROR = 0x81,
+	NVME_SC_GUARD_CHECK_ERROR = 0x82,
+	NVME_SC_APPLICATION_TAG_CHECK_ERROR = 0x83,
+	NVME_SC_REFERENCE_TAG_CHECK_ERROR = 0x84,
+	NVME_SC_COMPARE_FAILURE = 0x85,
+	NVME_SC_ACCESS_DENIED = 0x86,
+	NVME_SC_DEALLOCATED_OR_UNWRITTEN = 0x87,
+};
+
+/* admin opcodes */
+enum nvme_admin_opcode {
+	NVME_OPC_DELETE_IO_SQ = 0x00,
+	NVME_OPC_CREATE_IO_SQ = 0x01,
+	NVME_OPC_GET_LOG_PAGE = 0x02,
+	/* 0x03 - reserved */
+	NVME_OPC_DELETE_IO_CQ = 0x04,
+	NVME_OPC_CREATE_IO_CQ = 0x05,
+	NVME_OPC_IDENTIFY = 0x06,
+	/* 0x07 - reserved */
+	NVME_OPC_ABORT = 0x08,
+	NVME_OPC_SET_FEATURES = 0x09,
+	NVME_OPC_GET_FEATURES = 0x0a,
+	/* 0x0b - reserved */
+	NVME_OPC_ASYNC_EVENT_REQUEST = 0x0c,
+	NVME_OPC_NAMESPACE_MANAGEMENT = 0x0d,
+	/* 0x0e-0x0f - reserved */
+	NVME_OPC_FIRMWARE_ACTIVATE = 0x10,
+	NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD = 0x11,
+	NVME_OPC_DEVICE_SELF_TEST = 0x14,
+	NVME_OPC_NAMESPACE_ATTACHMENT = 0x15,
+	NVME_OPC_KEEP_ALIVE = 0x18,
+	NVME_OPC_DIRECTIVE_SEND = 0x19,
+	NVME_OPC_DIRECTIVE_RECEIVE = 0x1a,
+	NVME_OPC_VIRTUALIZATION_MANAGEMENT = 0x1c,
+	NVME_OPC_NVME_MI_SEND = 0x1d,
+	NVME_OPC_NVME_MI_RECEIVE = 0x1e,
+	NVME_OPC_DOORBELL_BUFFER_CONFIG = 0x7c,
+
+	NVME_OPC_FORMAT_NVM = 0x80,
+	NVME_OPC_SECURITY_SEND = 0x81,
+	NVME_OPC_SECURITY_RECEIVE = 0x82,
+	NVME_OPC_SANITIZE = 0x84,
+};
+
+/* nvme nvm opcodes */
+enum nvme_nvm_opcode {
+	NVME_OPC_FLUSH = 0x00,
+	NVME_OPC_WRITE = 0x01,
+	NVME_OPC_READ = 0x02,
+	/* 0x03 - reserved */
+	NVME_OPC_WRITE_UNCORRECTABLE = 0x04,
+	NVME_OPC_COMPARE = 0x05,
+	/* 0x06 - reserved */
+	/* 0x07 - reserved */
+	NVME_OPC_WRITE_ZEROES = 0x08,
+	NVME_OPC_DATASET_MANAGEMENT = 0x09,
+	/* 0x0a-0x0c - reserved */
+	NVME_OPC_RESERVATION_REGISTER = 0x0d,
+	NVME_OPC_RESERVATION_REPORT = 0x0e,
+	/* 0x0f-0x10 - reserved */
+	NVME_OPC_RESERVATION_ACQUIRE = 0x11,
+	/* 0x12-0x14 - reserved */
+	NVME_OPC_RESERVATION_RELEASE = 0x15,
+};
+
+enum nvme_feature {
+	/* 0x00 - reserved */
+	NVME_FEAT_ARBITRATION = 0x01,
+	NVME_FEAT_POWER_MANAGEMENT = 0x02,
+	NVME_FEAT_LBA_RANGE_TYPE = 0x03,
+	NVME_FEAT_TEMPERATURE_THRESHOLD = 0x04,
+	NVME_FEAT_ERROR_RECOVERY = 0x05,
+	NVME_FEAT_VOLATILE_WRITE_CACHE = 0x06,
+	NVME_FEAT_NUMBER_OF_QUEUES = 0x07,
+	NVME_FEAT_INTERRUPT_COALESCING = 0x08,
+	NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION = 0x09,
+	NVME_FEAT_WRITE_ATOMICITY = 0x0A,
+	NVME_FEAT_ASYNC_EVENT_CONFIGURATION = 0x0B,
+	NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION = 0x0C,
+	NVME_FEAT_HOST_MEMORY_BUFFER = 0x0D,
+	NVME_FEAT_TIMESTAMP = 0x0E,
+	NVME_FEAT_KEEP_ALIVE_TIMER = 0x0F,
+	NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT = 0x10,
+	NVME_FEAT_NON_OP_POWER_STATE_CONFIG = 0x11,
+	/* 0x12-0x77 - reserved */
+	/* 0x78-0x7f - NVMe Management Interface */
+	NVME_FEAT_SOFTWARE_PROGRESS_MARKER = 0x80,
+	/* 0x81-0xBF - command set specific (reserved) */
+	/* 0xC0-0xFF - vendor specific */
+};
+
+enum nvme_dsm_attribute {
+	NVME_DSM_ATTR_INTEGRAL_READ = 0x1,
+	NVME_DSM_ATTR_INTEGRAL_WRITE = 0x2,
+	NVME_DSM_ATTR_DEALLOCATE = 0x4,
+};
+
+enum nvme_activate_action {
+	NVME_AA_REPLACE_NO_ACTIVATE = 0x0,
+	NVME_AA_REPLACE_ACTIVATE = 0x1,
+	NVME_AA_ACTIVATE = 0x2,
+};
+
+struct nvme_power_state {
+	/** Maximum Power */
+	uint16_t mp;	/* Maximum Power */
+	uint8_t ps_rsvd1;
+	uint8_t mps_nops;	/* Max Power Scale, Non-Operational State */
+
+	uint32_t enlat;	/* Entry Latency */
+	uint32_t exlat;	/* Exit Latency */
+
+	uint8_t rrt;	/* Relative Read Throughput */
+	uint8_t rrl;	/* Relative Read Latency */
+	uint8_t rwt;	/* Relative Write Throughput */
+	uint8_t rwl;	/* Relative Write Latency */
+
+	uint16_t idlp;	/* Idle Power */
+	uint8_t ips;	/* Idle Power Scale */
+	uint8_t ps_rsvd8;
+
+	uint16_t actp;	/* Active Power */
+	uint8_t apw_aps;	/* Active Power Workload, Active Power Scale */
+	uint8_t ps_rsvd10[9];
+} __packed;
+
+_Static_assert(sizeof(struct nvme_power_state) == 32, "bad size for nvme_power_state");
+
+#define NVME_SERIAL_NUMBER_LENGTH 20
+#define NVME_MODEL_NUMBER_LENGTH 40
+#define NVME_FIRMWARE_REVISION_LENGTH 8
+
+struct nvme_controller_data {
+
+	/* bytes 0-255: controller capabilities and features */
+
+	/** pci vendor id */
+	uint16_t vid;
+
+	/** pci subsystem vendor id */
+	uint16_t ssvid;
+
+	/** serial number */
+	uint8_t sn[NVME_SERIAL_NUMBER_LENGTH];
+
+	/** model number */
+	uint8_t mn[NVME_MODEL_NUMBER_LENGTH];
+
+	/** firmware revision */
+	uint8_t fr[NVME_FIRMWARE_REVISION_LENGTH];
+
+	/** recommended arbitration burst */
+	uint8_t rab;
+
+	/** ieee oui identifier */
+	uint8_t ieee[3];
+
+	/** multi-interface capabilities */
+	uint8_t mic;
+
+	/** maximum data transfer size */
+	uint8_t mdts;
+
+	/** Controller ID */
+	uint16_t ctrlr_id;
+
+	/** Version */
+	uint32_t ver;
+
+	/** RTD3 Resume Latency */
+	uint32_t rtd3r;
+
+	/** RTD3 Enter Latency */
+	uint32_t rtd3e;
+
+	/** Optional Asynchronous Events Supported */
+	uint32_t oaes;	/* bitfield really */
+
+	/** Controller Attributes */
+	uint32_t ctratt;	/* bitfield really */
+
+	uint8_t reserved1[12];
+
+	/** FRU Globally Unique Identifier */
+	uint8_t fguid[16];
+
+	uint8_t reserved2[128];
+
+	/* bytes 256-511: admin command set attributes */
+
+	/** optional admin command support */
+	uint16_t oacs;
+
+	/** abort command limit */
+	uint8_t acl;
+
+	/** asynchronous event request limit */
+	uint8_t aerl;
+
+	/** firmware updates */
+	uint8_t frmw;
+
+	/** log page attributes */
+	uint8_t lpa;
+
+	/** error log page entries */
+	uint8_t elpe;
+
+	/** number of power states supported */
+	uint8_t npss;
+
+	/** admin vendor specific command configuration */
+	uint8_t avscc;
+
+	/** Autonomous Power State Transition Attributes */
+	uint8_t apsta;
+
+	/** Warning Composite Temperature Threshold */
+	uint16_t wctemp;
+
+	/** Critical Composite Temperature Threshold */
+	uint16_t cctemp;
+
+	/** Maximum Time for Firmware Activation */
+	uint16_t mtfa;
+
+	/** Host Memory Buffer Preferred Size */
+	uint32_t hmpre;
+
+	/** Host Memory Buffer Minimum Size */
+	uint32_t hmmin;
+
+	/** Name space capabilities */
+	struct {
+		/* if nsmgmt, report tnvmcap and unvmcap */
+		uint8_t tnvmcap[16];
+		uint8_t unvmcap[16];
+	} __packed untncap;
+
+	/** Replay Protected Memory Block Support */
+	uint32_t rpmbs;	/* Really a bitfield */
+
+	/** Extended Device Self-test Time */
+	uint16_t edstt;
+
+	/** Device Self-test Options */
+	uint8_t dsto;	/* Really a bitfield */
+
+	/** Firmware Update Granularity */
+	uint8_t fwug;
+
+	/** Keep Alive Support */
+	uint16_t kas;
+
+	/** Host Controlled Thermal Management Attributes */
+	uint16_t hctma;	/* Really a bitfield */
+
+	/** Minimum Thermal Management Temperature */
+	uint16_t mntmt;
+
+	/** Maximum Thermal Management Temperature */
+	uint16_t mxtmt;
+
+	/** Sanitize Capabilities */
+	uint32_t sanicap;	/* Really a bitfield */
+
+	uint8_t reserved3[180];
+	/* bytes 512-703: nvm command set attributes */
+
+	/** submission queue entry size */
+	uint8_t sqes;
+
+	/** completion queue entry size */
+	uint8_t cqes;
+
+	/** Maximum Outstanding Commands */
+	uint16_t maxcmd;
+
+	/** number of namespaces */
+	uint32_t nn;
+
+	/** optional nvm command support */
+	uint16_t oncs;
+
+	/** fused operation support */
+	uint16_t fuses;
+
+	/** format nvm attributes */
+	uint8_t fna;
+
+	/** volatile write cache */
+	uint8_t vwc;
+
+	/** Atomic Write Unit Normal */
+	uint16_t awun;
+
+	/** Atomic Write Unit Power Fail */
+	uint16_t awupf;
+
+	/** NVM Vendor Specific Command Configuration */
+	uint8_t nvscc;
+	uint8_t reserved5;
+
+	/** Atomic Compare & Write Unit */
+	uint16_t acwu;
+	uint16_t reserved6;
+
+	/** SGL Support */
+	uint32_t sgls;
+
+	/* bytes 540-767: Reserved */
+	uint8_t reserved7[228];
+
+	/** NVM Subsystem NVMe Qualified Name */
+	uint8_t subnqn[256];
+
+	/* bytes 1024-1791: Reserved */
+	uint8_t reserved8[768];
+
+	/* bytes 1792-2047: NVMe over Fabrics specification */
+	uint8_t reserved9[256];
+
+	/* bytes 2048-3071: power state descriptors */
+	struct nvme_power_state power_state[32];
+
+	/* bytes 3072-4095: vendor specific */
+	uint8_t vs[1024];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_controller_data) == 4096, "bad size for nvme_controller_data");
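One non-obvious field above: mdts is encoded as a power-of-two multiple of the controller's minimum page size (CAP.MPSMIN), with zero meaning no limit. A sketch of the usual decoding per the NVMe spec (cap_hi and cdata are values previously read from the device):

	uint32_t mpsmin = (cap_hi >> NVME_CAP_HI_REG_MPSMIN_SHIFT) &
	    NVME_CAP_HI_REG_MPSMIN_MASK;
	uint64_t min_page = 1ull << (12 + mpsmin);	/* bytes */
	uint64_t max_xfer = (cdata.mdts == 0) ?
	    UINT64_MAX : min_page << cdata.mdts;	/* max transfer, bytes */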
+
+struct nvme_namespace_data {
+
+	/** namespace size */
+	uint64_t nsze;
+
+	/** namespace capacity */
+	uint64_t ncap;
+
+	/** namespace utilization */
+	uint64_t nuse;
+
+	/** namespace features */
+	uint8_t nsfeat;
+
+	/** number of lba formats */
+	uint8_t nlbaf;
+
+	/** formatted lba size */
+	uint8_t flbas;
+
+	/** metadata capabilities */
+	uint8_t mc;
+
+	/** end-to-end data protection capabilities */
+	uint8_t dpc;
+
+	/** end-to-end data protection type settings */
+	uint8_t dps;
+
+	/** Namespace Multi-path I/O and Namespace Sharing Capabilities */
+	uint8_t nmic;
+
+	/** Reservation Capabilities */
+	uint8_t rescap;
+
+	/** Format Progress Indicator */
+	uint8_t fpi;
+
+	/** Deallocate Logical Block Features */
+	uint8_t dlfeat;
+
+	/** Namespace Atomic Write Unit Normal */
+	uint16_t nawun;
+
+	/** Namespace Atomic Write Unit Power Fail */
+	uint16_t nawupf;
+
+	/** Namespace Atomic Compare & Write Unit */
+	uint16_t nacwu;
+
+	/** Namespace Atomic Boundary Size Normal */
+	uint16_t nabsn;
+
+	/** Namespace Atomic Boundary Offset */
+	uint16_t nabo;
+
+	/** Namespace Atomic Boundary Size Power Fail */
+	uint16_t nabspf;
+
+	/** Namespace Optimal IO Boundary */
+	uint16_t noiob;
+
+	/** NVM Capacity */
+	uint8_t nvmcap[16];
+
+	/* bytes 64-103: Reserved */
+	uint8_t reserved5[40];
+
+	/** Namespace Globally Unique Identifier */
+	uint8_t nguid[16];
+
+	/** IEEE Extended Unique Identifier */
+	uint8_t eui64[8];
+
+	/** lba format support */
+	uint32_t lbaf[16];
+
+	uint8_t reserved6[192];
+
+	uint8_t vendor_specific[3712];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_namespace_data) == 4096, "bad size for nvme_namespace_data");
+
+enum nvme_log_page {
+
+	/* 0x00 - reserved */
+	NVME_LOG_ERROR = 0x01,
+	NVME_LOG_HEALTH_INFORMATION = 0x02,
+	NVME_LOG_FIRMWARE_SLOT = 0x03,
+	NVME_LOG_CHANGED_NAMESPACE = 0x04,
+	NVME_LOG_COMMAND_EFFECT = 0x05,
+	/* 0x06-0x7F - reserved */
+	/* 0x80-0xBF - I/O command set specific */
+	NVME_LOG_RES_NOTIFICATION = 0x80,
+	/* 0xC0-0xFF - vendor specific */
+
+	/*
+	 * The following are Intel Specific log pages, but they seem
+	 * to be widely implemented.
+	 */
+	INTEL_LOG_READ_LAT_LOG = 0xc1,
+	INTEL_LOG_WRITE_LAT_LOG = 0xc2,
+	INTEL_LOG_TEMP_STATS = 0xc5,
+	INTEL_LOG_ADD_SMART = 0xca,
+	INTEL_LOG_DRIVE_MKT_NAME = 0xdd,
+
+	/*
+	 * HGST log page, with lots of sub pages.
+	 */
+	HGST_INFO_LOG = 0xc1,
+};
+
+struct nvme_error_information_entry {
+
+	uint64_t error_count;
+	uint16_t sqid;
+	uint16_t cid;
+	uint16_t status;
+	uint16_t error_location;
+	uint64_t lba;
+	uint32_t nsid;
+	uint8_t vendor_specific;
+	uint8_t reserved[35];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_error_information_entry) == 64, "bad size for nvme_error_information_entry");
+
+struct nvme_health_information_page {
+
+	uint8_t critical_warning;
+	uint16_t temperature;
+	uint8_t available_spare;
+	uint8_t available_spare_threshold;
+	uint8_t percentage_used;
+
+	uint8_t reserved[26];
+
+	/*
+	 * Note that the following are 128-bit values, but are
+	 * defined as an array of 2 64-bit values.
+	 */
+	/* Data Units Read is always in 512-byte units. */
+	uint64_t data_units_read[2];
+	/* Data Units Written is always in 512-byte units. */
+	uint64_t data_units_written[2];
+	/* For NVM command set, this includes Compare commands. */
+	uint64_t host_read_commands[2];
+	uint64_t host_write_commands[2];
+	/* Controller Busy Time is reported in minutes. */
+	uint64_t controller_busy_time[2];
+	uint64_t power_cycles[2];
+	uint64_t power_on_hours[2];
+	uint64_t unsafe_shutdowns[2];
+	uint64_t media_errors[2];
+	uint64_t num_error_info_log_entries[2];
+	uint32_t warning_temp_time;
+	uint32_t error_temp_time;
+	uint16_t temp_sensor[8];
+
+	uint8_t reserved2[296];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_health_information_page) == 512, "bad size for nvme_health_information_page");
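The health page reports the composite temperature in kelvins, so display code usually converts (sketch; hp is an already byte-swapped struct nvme_health_information_page):

	int temp_c = (int)hp.temperature - 273;	/* kelvins -> degrees Celsius */
	printf("composite temp: %d C, spare: %u%%\n", temp_c, hp.available_spare);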
+
+struct nvme_firmware_page {
+
+	uint8_t afi;
+	uint8_t reserved[7];
+	uint64_t revision[7];	/* revisions for 7 slots */
+	uint8_t reserved2[448];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_firmware_page) == 512, "bad size for nvme_firmware_page");
+
+struct nvme_ns_list {
+	uint32_t ns[1024];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_ns_list) == 4096, "bad size for nvme_ns_list");
+
+struct intel_log_temp_stats
+{
+	uint64_t current;
+	uint64_t overtemp_flag_last;
+	uint64_t overtemp_flag_life;
+	uint64_t max_temp;
+	uint64_t min_temp;
+	uint64_t _rsvd[5];
+	uint64_t max_oper_temp;
+	uint64_t min_oper_temp;
+	uint64_t est_offset;
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct intel_log_temp_stats) == 13 * 8, "bad size for intel_log_temp_stats");
+
+#define NVME_TEST_MAX_THREADS 128
+
+struct nvme_io_test {
+
+	enum nvme_nvm_opcode opc;
+	uint32_t size;
+	uint32_t time;	/* in seconds */
+	uint32_t num_threads;
+	uint32_t flags;
+	uint64_t io_completed[NVME_TEST_MAX_THREADS];
+};
+
+enum nvme_io_test_flags {
+
+	/*
+	 * Specifies whether dev_refthread/dev_relthread should be
+	 * called during NVME_BIO_TEST. Ignored for other test
+	 * types.
+	 */
+	NVME_TEST_FLAG_REFTHREAD = 0x1,
+};
+
+struct nvme_pt_command {
+
+	/*
+	 * cmd is used to specify a passthrough command to a controller or
+	 * namespace.
+	 *
+	 * The following fields from cmd may be specified by the caller:
+	 *	* opc (opcode)
+	 *	* nsid (namespace id) - for admin commands only
+	 *	* cdw10-cdw15
+	 *
+	 * Remaining fields must be set to 0 by the caller.
+	 */
+	struct nvme_command cmd;
+
+	/*
+	 * cpl returns completion status for the passthrough command
+	 * specified by cmd.
+	 *
+	 * The following fields will be filled out by the driver, for
+	 * consumption by the caller:
+	 *	* cdw0
+	 *	* status (except for phase)
+	 *
+	 * Remaining fields will be set to 0 by the driver.
+	 */
+	struct nvme_completion cpl;
+
+	/* buf is the data buffer associated with this passthrough command. */
+	void * buf;
+
+	/*
+	 * len is the length of the data buffer associated with this
+	 * passthrough command.
+	 */
+	uint32_t len;
+
+	/*
+	 * is_read = 1 if the passthrough command will read data into the
+	 * supplied buffer from the controller.
+	 *
+	 * is_read = 0 if the passthrough command will write data from the
+	 * supplied buffer to the controller.
+	 */
+	uint32_t is_read;
+
+	/*
+	 * driver_lock is used by the driver only. It must be set to 0
+	 * by the caller.
+	 */
+	struct mtx * driver_lock;
+};
+
+#define nvme_completion_is_error(cpl) \
+	(NVME_STATUS_GET_SC((cpl)->status) != 0 || NVME_STATUS_GET_SCT((cpl)->status) != 0)
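The comments above spell out the userland contract; on FreeBSD the mechanism is driven through the NVME_PASSTHROUGH_CMD ioctl. A minimal identify-controller sketch, assuming a FreeBSD-style /dev/nvme0 node and the usual fcntl/err includes (the illumos port may expose the device differently; error handling abbreviated):

	struct nvme_pt_command pt;
	struct nvme_controller_data cdata;

	memset(&pt, 0, sizeof(pt));
	pt.cmd.opc = NVME_OPC_IDENTIFY;
	pt.cmd.cdw10 = htole32(1);	/* CNS 1: identify controller */
	pt.buf = &cdata;
	pt.len = sizeof(cdata);
	pt.is_read = 1;

	int fd = open("/dev/nvme0", O_RDWR);
	if (fd < 0 || ioctl(fd, NVME_PASSTHROUGH_CMD, &pt) < 0 ||
	    nvme_completion_is_error(&pt.cpl))
		err(1, "identify controller failed");
	nvme_controller_data_swapbytes(&cdata);	/* defined near the end of this header */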
+
+void nvme_strvis(uint8_t *dst, const uint8_t *src, int dstlen, int srclen);
+
+#ifdef _KERNEL
+
+struct bio;
+
+struct nvme_namespace;
+struct nvme_controller;
+struct nvme_consumer;
+
+typedef void (*nvme_cb_fn_t)(void *, const struct nvme_completion *);
+
+typedef void *(*nvme_cons_ns_fn_t)(struct nvme_namespace *, void *);
+typedef void *(*nvme_cons_ctrlr_fn_t)(struct nvme_controller *);
+typedef void (*nvme_cons_async_fn_t)(void *, const struct nvme_completion *,
+    uint32_t, void *, uint32_t);
+typedef void (*nvme_cons_fail_fn_t)(void *);
+
+enum nvme_namespace_flags {
+	NVME_NS_DEALLOCATE_SUPPORTED = 0x1,
+	NVME_NS_FLUSH_SUPPORTED = 0x2,
+};
+
+int nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
+    struct nvme_pt_command *pt,
+    uint32_t nsid, int is_user_buffer,
+    int is_admin_cmd);
+
+/* Admin functions */
+void nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr,
+    uint8_t feature, uint32_t cdw11,
+    void *payload, uint32_t payload_size,
+    nvme_cb_fn_t cb_fn, void *cb_arg);
+void nvme_ctrlr_cmd_get_feature(struct nvme_controller *ctrlr,
+    uint8_t feature, uint32_t cdw11,
+    void *payload, uint32_t payload_size,
+    nvme_cb_fn_t cb_fn, void *cb_arg);
+void nvme_ctrlr_cmd_get_log_page(struct nvme_controller *ctrlr,
+    uint8_t log_page, uint32_t nsid,
+    void *payload, uint32_t payload_size,
+    nvme_cb_fn_t cb_fn, void *cb_arg);
+
+/* NVM I/O functions */
+int nvme_ns_cmd_write(struct nvme_namespace *ns, void *payload,
+    uint64_t lba, uint32_t lba_count, nvme_cb_fn_t cb_fn,
+    void *cb_arg);
+int nvme_ns_cmd_write_bio(struct nvme_namespace *ns, struct bio *bp,
+    nvme_cb_fn_t cb_fn, void *cb_arg);
+int nvme_ns_cmd_read(struct nvme_namespace *ns, void *payload,
+    uint64_t lba, uint32_t lba_count, nvme_cb_fn_t cb_fn,
+    void *cb_arg);
+int nvme_ns_cmd_read_bio(struct nvme_namespace *ns, struct bio *bp,
+    nvme_cb_fn_t cb_fn, void *cb_arg);
+int nvme_ns_cmd_deallocate(struct nvme_namespace *ns, void *payload,
+    uint8_t num_ranges, nvme_cb_fn_t cb_fn,
+    void *cb_arg);
+int nvme_ns_cmd_flush(struct nvme_namespace *ns, nvme_cb_fn_t cb_fn,
+    void *cb_arg);
+int nvme_ns_dump(struct nvme_namespace *ns, void *virt, off_t offset,
+    size_t len);
+
+/* Registration functions */
+struct nvme_consumer *
+nvme_register_consumer(nvme_cons_ns_fn_t ns_fn,
+    nvme_cons_ctrlr_fn_t ctrlr_fn,
+    nvme_cons_async_fn_t async_fn,
+    nvme_cons_fail_fn_t fail_fn);
+void nvme_unregister_consumer(struct nvme_consumer *consumer);
+
+/* Controller helper functions */
+device_t nvme_ctrlr_get_device(struct nvme_controller *ctrlr);
+const struct nvme_controller_data *
+    nvme_ctrlr_get_data(struct nvme_controller *ctrlr);
+
+/* Namespace helper functions */
+uint32_t nvme_ns_get_max_io_xfer_size(struct nvme_namespace *ns);
+uint32_t nvme_ns_get_sector_size(struct nvme_namespace *ns);
+uint64_t nvme_ns_get_num_sectors(struct nvme_namespace *ns);
+uint64_t nvme_ns_get_size(struct nvme_namespace *ns);
+uint32_t nvme_ns_get_flags(struct nvme_namespace *ns);
+const char * nvme_ns_get_serial_number(struct nvme_namespace *ns);
+const char * nvme_ns_get_model_number(struct nvme_namespace *ns);
+const struct nvme_namespace_data *
+    nvme_ns_get_data(struct nvme_namespace *ns);
+uint32_t nvme_ns_get_stripesize(struct nvme_namespace *ns);
+
+int nvme_ns_bio_process(struct nvme_namespace *ns, struct bio *bp,
+    nvme_cb_fn_t cb_fn);
+
+/*
+ * Command building helper functions -- shared with CAM
+ * These functions assume allocator zeros out cmd structure
+ * CAM's xpt_get_ccb and the request allocator for nvme both
+ * do zero'd allocations.
+ */
+static inline
+void nvme_ns_flush_cmd(struct nvme_command *cmd, uint32_t nsid)
+{
+
+	cmd->opc = NVME_OPC_FLUSH;
+	cmd->nsid = htole32(nsid);
+}
+
+static inline
+void nvme_ns_rw_cmd(struct nvme_command *cmd, uint32_t rwcmd, uint32_t nsid,
+    uint64_t lba, uint32_t count)
+{
+	cmd->opc = rwcmd;
+	cmd->nsid = htole32(nsid);
+	cmd->cdw10 = htole32(lba & 0xffffffffu);
+	cmd->cdw11 = htole32(lba >> 32);
+	cmd->cdw12 = htole32(count-1);
+}
+
+static inline
+void nvme_ns_write_cmd(struct nvme_command *cmd, uint32_t nsid,
+    uint64_t lba, uint32_t count)
+{
+	nvme_ns_rw_cmd(cmd, NVME_OPC_WRITE, nsid, lba, count);
+}
+
+static inline
+void nvme_ns_read_cmd(struct nvme_command *cmd, uint32_t nsid,
+    uint64_t lba, uint32_t count)
+{
+	nvme_ns_rw_cmd(cmd, NVME_OPC_READ, nsid, lba, count);
+}
+
+static inline
+void nvme_ns_trim_cmd(struct nvme_command *cmd, uint32_t nsid,
+    uint32_t num_ranges)
+{
+	cmd->opc = NVME_OPC_DATASET_MANAGEMENT;
+	cmd->nsid = htole32(nsid);
+	cmd->cdw10 = htole32(num_ranges - 1);
+	cmd->cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE);
+}
+
+extern int nvme_use_nvd;
+
+#endif /* _KERNEL */
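Because the builders assume a zeroed command, using them is two lines; for example, a 16-block read at LBA 0x1000 on namespace 1 (sketch; note the 0's-based count, so cdw12 ends up holding count - 1):

	struct nvme_command cmd = { 0 };
	nvme_ns_read_cmd(&cmd, 1, 0x1000, 16);	/* nsid 1, 16 LBAs from 0x1000 */
	/* prp1/prp2 must still be pointed at the data buffer before submission */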
+
+/* Endianness conversion functions for NVMe structs */
+static inline
+void nvme_completion_swapbytes(struct nvme_completion *s)
+{
+
+	s->cdw0 = le32toh(s->cdw0);
+	/* omit rsvd1 */
+	s->sqhd = le16toh(s->sqhd);
+	s->sqid = le16toh(s->sqid);
+	/* omit cid */
+	s->status = le16toh(s->status);
+}
+
+static inline
+void nvme_power_state_swapbytes(struct nvme_power_state *s)
+{
+
+	s->mp = le16toh(s->mp);
+	s->enlat = le32toh(s->enlat);
+	s->exlat = le32toh(s->exlat);
+	s->idlp = le16toh(s->idlp);
+	s->actp = le16toh(s->actp);
+}
+
+static inline
+void nvme_controller_data_swapbytes(struct nvme_controller_data *s)
+{
+	int i;
+
+	s->vid = le16toh(s->vid);
+	s->ssvid = le16toh(s->ssvid);
+	s->ctrlr_id = le16toh(s->ctrlr_id);
+	s->ver = le32toh(s->ver);
+	s->rtd3r = le32toh(s->rtd3r);
+	s->rtd3e = le32toh(s->rtd3e);
+	s->oaes = le32toh(s->oaes);
+	s->ctratt = le32toh(s->ctratt);
+	s->oacs = le16toh(s->oacs);
+	s->wctemp = le16toh(s->wctemp);
+	s->cctemp = le16toh(s->cctemp);
+	s->mtfa = le16toh(s->mtfa);
+	s->hmpre = le32toh(s->hmpre);
+	s->hmmin = le32toh(s->hmmin);
+	s->rpmbs = le32toh(s->rpmbs);
+	s->edstt = le16toh(s->edstt);
+	s->kas = le16toh(s->kas);
+	s->hctma = le16toh(s->hctma);
+	s->mntmt = le16toh(s->mntmt);
+	s->mxtmt = le16toh(s->mxtmt);
+	s->sanicap = le32toh(s->sanicap);
+	s->maxcmd = le16toh(s->maxcmd);
+	s->nn = le32toh(s->nn);
+	s->oncs = le16toh(s->oncs);
+	s->fuses = le16toh(s->fuses);
+	s->awun = le16toh(s->awun);
+	s->awupf = le16toh(s->awupf);
+	s->acwu = le16toh(s->acwu);
+	s->sgls = le32toh(s->sgls);
+	for (i = 0; i < 32; i++)
+		nvme_power_state_swapbytes(&s->power_state[i]);
+}
+
+static inline
+void nvme_namespace_data_swapbytes(struct nvme_namespace_data *s)
+{
+	int i;
+
+	s->nsze = le64toh(s->nsze);
+	s->ncap = le64toh(s->ncap);
+	s->nuse = le64toh(s->nuse);
+	s->nawun = le16toh(s->nawun);
+	s->nawupf = le16toh(s->nawupf);
+	s->nacwu = le16toh(s->nacwu);
+	s->nabsn = le16toh(s->nabsn);
+	s->nabo = le16toh(s->nabo);
+	s->nabspf = le16toh(s->nabspf);
+	s->noiob = le16toh(s->noiob);
+	for (i = 0; i < 16; i++)
+		s->lbaf[i] = le32toh(s->lbaf[i]);
+}
+
+static inline
+void nvme_error_information_entry_swapbytes(struct nvme_error_information_entry *s)
+{
+
+	s->error_count = le64toh(s->error_count);
+	s->sqid = le16toh(s->sqid);
+	s->cid = le16toh(s->cid);
+	s->status = le16toh(s->status);
+	s->error_location = le16toh(s->error_location);
+	s->lba = le64toh(s->lba);
+	s->nsid = le32toh(s->nsid);
+}
+
+static inline
+void nvme_le128toh(void *p)
+{
+	/*
+	 * Upstream, this uses the following comparison:
+	 * #if _BYTE_ORDER != _LITTLE_ENDIAN
+	 *
+	 * Rather than keep this file in compat with only that little bit
+	 * changed, we'll just float a little patch here for now.
+	 */
+#ifndef _LITTLE_ENDIAN
+	/* Swap 16 bytes in place */
+	char *tmp = (char*)p;
+	char b;
+	int i;
+	for (i = 0; i < 8; i++) {
+		b = tmp[i];
+		tmp[i] = tmp[15-i];
+		tmp[15-i] = b;
+	}
+#else
+	(void)p;
+#endif
+}
+
+static inline
+void nvme_health_information_page_swapbytes(struct nvme_health_information_page *s)
+{
+	int i;
+
+	s->temperature = le16toh(s->temperature);
+	nvme_le128toh((void *)s->data_units_read);
+	nvme_le128toh((void *)s->data_units_written);
+	nvme_le128toh((void *)s->host_read_commands);
+	nvme_le128toh((void *)s->host_write_commands);
+	nvme_le128toh((void *)s->controller_busy_time);
+	nvme_le128toh((void *)s->power_cycles);
+	nvme_le128toh((void *)s->power_on_hours);
+	nvme_le128toh((void *)s->unsafe_shutdowns);
+	nvme_le128toh((void *)s->media_errors);
+	nvme_le128toh((void *)s->num_error_info_log_entries);
+	s->warning_temp_time = le32toh(s->warning_temp_time);
+	s->error_temp_time = le32toh(s->error_temp_time);
+	for (i = 0; i < 8; i++)
+		s->temp_sensor[i] = le16toh(s->temp_sensor[i]);
+}
+
+
+static inline
+void nvme_firmware_page_swapbytes(struct nvme_firmware_page *s)
+{
+	int i;
+
+	for (i = 0; i < 7; i++)
+		s->revision[i] = le64toh(s->revision[i]);
+}
+
+static inline
+void nvme_ns_list_swapbytes(struct nvme_ns_list *s)
+{
+	int i;
+
+	for (i = 0; i < 1024; i++)
+		s->ns[i] = le32toh(s->ns[i]);
+}
+
+static inline
+void intel_log_temp_stats_swapbytes(struct intel_log_temp_stats *s)
+{
+
+	s->current = le64toh(s->current);
+	s->overtemp_flag_last = le64toh(s->overtemp_flag_last);
+	s->overtemp_flag_life = le64toh(s->overtemp_flag_life);
+	s->max_temp = le64toh(s->max_temp);
+	s->min_temp = le64toh(s->min_temp);
+	/* omit _rsvd[] */
+	s->max_oper_temp = le64toh(s->max_oper_temp);
+	s->min_oper_temp = le64toh(s->min_oper_temp);
+	s->est_offset = le64toh(s->est_offset);
+}
+
+#endif /* __NVME_H__ */
diff --git a/usr/src/cmd/bhyve/Makefile b/usr/src/cmd/bhyve/Makefile
index a8c772c7f8..0ad066e6d4 100644
--- a/usr/src/cmd/bhyve/Makefile
+++ b/usr/src/cmd/bhyve/Makefile
@@ -51,6 +51,7 @@ SRCS = acpi.c \
 	pci_hostbridge.c \
 	pci_irq.c \
 	pci_lpc.c \
+	pci_nvme.c \
 	pci_passthru.c \
 	pci_uart.c \
 	pci_virtio_block.c \
@@ -76,8 +77,16 @@ SRCS = acpi.c \
 	vmm_instruction_emul.c \
 	xmsr.c \
 	spinup_ap.c \
+	iov.c \
 	bhyve_sol_glue.c
 
+# The virtio-scsi driver appears to include a slew of materials from FreeBSD's
+# native SCSI implementation. We will omit that complexity for now.
+	#ctl_util.c \
+	#ctl_scsi_all.c \
+	#pci_virtio_scsi.c \
+
+
 OBJS = $(SRCS:.c=.o)
 
 CLOBBERFILES = $(ROOTUSRSBINPROG) $(ZHYVE)
@@ -109,6 +118,13 @@ CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \
 # Disable the crypto code until it is wired up
 CPPFLAGS += -DNO_OPENSSL
 
+pci_nvme.o := CERRWARN += -_gcc=-Wno-pointer-sign
+
+# Force c99 for everything
+CSTD= $(CSTD_GNU99)
+C99MODE= -xc99=%all
+C99LMODE= -Xc99=%all
+
 $(PROG) := LDLIBS += -lsocket -lnsl -ldlpi -ldladm -lmd -luuid -lvmmapi -lz
 $(ZHYVE_PROG) := LDLIBS += -lnvpair
 $(MEVENT_TEST_PROG) := LDLIBS += -lsocket
diff --git a/usr/src/cmd/bhyve/acpi.c b/usr/src/cmd/bhyve/acpi.c
index 518ff34d69..309ba98a11 100644
--- a/usr/src/cmd/bhyve/acpi.c
+++ b/usr/src/cmd/bhyve/acpi.c
@@ -118,18 +118,14 @@ struct basl_fio {
 };
 
 #define EFPRINTF(...) \
-	err = fprintf(__VA_ARGS__); if (err < 0) goto err_exit;
+	if (fprintf(__VA_ARGS__) < 0) goto err_exit;
 
 #define EFFLUSH(x) \
-	err = fflush(x); if (err != 0) goto err_exit;
+	if (fflush(x) != 0) goto err_exit;
 
 static int
 basl_fwrite_rsdp(FILE *fp)
 {
-	int err;
-
-	err = 0;
-
 	EFPRINTF(fp, "/*\n");
 	EFPRINTF(fp, " * bhyve RSDP template\n");
 	EFPRINTF(fp, " */\n");
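Note that the rewritten EFPRINTF/EFFLUSH expand to bare if statements, so they would bind surprisingly under an un-braced outer if/else. Shown purely as an illustration of the conventional guard (a hypothetical variant, not what this change applies):

	#define EFPRINTF(...) \
		do { \
			if (fprintf(__VA_ARGS__) < 0) \
				goto err_exit; \
		} while (0)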
if (dsdt_indent_level != 0) EFPRINTF(dsdt_fp, "%*c", dsdt_indent_level * 2, ' '); va_start(ap, fmt); - if (vfprintf(dsdt_fp, fmt, ap) < 0) + if (vfprintf(dsdt_fp, fmt, ap) < 0) { + va_end(ap); goto err_exit; + } va_end(ap); } EFPRINTF(dsdt_fp, "\n"); @@ -735,9 +707,6 @@ dsdt_fixed_mem32(uint32_t base, uint32_t length) static int basl_fwrite_dsdt(FILE *fp) { - int err; - - err = 0; dsdt_fp = fp; dsdt_error = 0; dsdt_indent_level = 0; @@ -916,7 +885,7 @@ basl_make_templates(void) int len; err = 0; - + /* * */ diff --git a/usr/src/cmd/bhyve/ahci.h b/usr/src/cmd/bhyve/ahci.h index 1fd3bff99c..691d4bd438 100644 --- a/usr/src/cmd/bhyve/ahci.h +++ b/usr/src/cmd/bhyve/ahci.h @@ -33,292 +33,292 @@ #define _AHCI_H_ /* ATA register defines */ -#define ATA_DATA 0 /* (RW) data */ - -#define ATA_FEATURE 1 /* (W) feature */ -#define ATA_F_DMA 0x01 /* enable DMA */ -#define ATA_F_OVL 0x02 /* enable overlap */ - -#define ATA_COUNT 2 /* (W) sector count */ - -#define ATA_SECTOR 3 /* (RW) sector # */ -#define ATA_CYL_LSB 4 /* (RW) cylinder# LSB */ -#define ATA_CYL_MSB 5 /* (RW) cylinder# MSB */ -#define ATA_DRIVE 6 /* (W) Sector/Drive/Head */ -#define ATA_D_LBA 0x40 /* use LBA addressing */ -#define ATA_D_IBM 0xa0 /* 512 byte sectors, ECC */ - -#define ATA_COMMAND 7 /* (W) command */ - -#define ATA_ERROR 8 /* (R) error */ -#define ATA_E_ILI 0x01 /* illegal length */ -#define ATA_E_NM 0x02 /* no media */ -#define ATA_E_ABORT 0x04 /* command aborted */ -#define ATA_E_MCR 0x08 /* media change request */ -#define ATA_E_IDNF 0x10 /* ID not found */ -#define ATA_E_MC 0x20 /* media changed */ -#define ATA_E_UNC 0x40 /* uncorrectable data */ -#define ATA_E_ICRC 0x80 /* UDMA crc error */ -#define ATA_E_ATAPI_SENSE_MASK 0xf0 /* ATAPI sense key mask */ - -#define ATA_IREASON 9 /* (R) interrupt reason */ -#define ATA_I_CMD 0x01 /* cmd (1) | data (0) */ -#define ATA_I_IN 0x02 /* read (1) | write (0) */ -#define ATA_I_RELEASE 0x04 /* released bus (1) */ -#define ATA_I_TAGMASK 0xf8 /* tag mask */ - -#define ATA_STATUS 10 /* (R) status */ -#define ATA_ALTSTAT 11 /* (R) alternate status */ -#define ATA_S_ERROR 0x01 /* error */ -#define ATA_S_INDEX 0x02 /* index */ -#define ATA_S_CORR 0x04 /* data corrected */ -#define ATA_S_DRQ 0x08 /* data request */ -#define ATA_S_DSC 0x10 /* drive seek completed */ -#define ATA_S_SERVICE 0x10 /* drive needs service */ -#define ATA_S_DWF 0x20 /* drive write fault */ -#define ATA_S_DMA 0x20 /* DMA ready */ -#define ATA_S_READY 0x40 /* drive ready */ -#define ATA_S_BUSY 0x80 /* busy */ - -#define ATA_CONTROL 12 /* (W) control */ -#define ATA_A_IDS 0x02 /* disable interrupts */ -#define ATA_A_RESET 0x04 /* RESET controller */ -#define ATA_A_4BIT 0x08 /* 4 head bits */ -#define ATA_A_HOB 0x80 /* High Order Byte enable */ +#define ATA_DATA 0 /* (RW) data */ + +#define ATA_FEATURE 1 /* (W) feature */ +#define ATA_F_DMA 0x01 /* enable DMA */ +#define ATA_F_OVL 0x02 /* enable overlap */ + +#define ATA_COUNT 2 /* (W) sector count */ + +#define ATA_SECTOR 3 /* (RW) sector # */ +#define ATA_CYL_LSB 4 /* (RW) cylinder# LSB */ +#define ATA_CYL_MSB 5 /* (RW) cylinder# MSB */ +#define ATA_DRIVE 6 /* (W) Sector/Drive/Head */ +#define ATA_D_LBA 0x40 /* use LBA addressing */ +#define ATA_D_IBM 0xa0 /* 512 byte sectors, ECC */ + +#define ATA_COMMAND 7 /* (W) command */ + +#define ATA_ERROR 8 /* (R) error */ +#define ATA_E_ILI 0x01 /* illegal length */ +#define ATA_E_NM 0x02 /* no media */ +#define ATA_E_ABORT 0x04 /* command aborted */ +#define ATA_E_MCR 0x08 /* media change request */ +#define ATA_E_IDNF 
0x10 /* ID not found */ +#define ATA_E_MC 0x20 /* media changed */ +#define ATA_E_UNC 0x40 /* uncorrectable data */ +#define ATA_E_ICRC 0x80 /* UDMA crc error */ +#define ATA_E_ATAPI_SENSE_MASK 0xf0 /* ATAPI sense key mask */ + +#define ATA_IREASON 9 /* (R) interrupt reason */ +#define ATA_I_CMD 0x01 /* cmd (1) | data (0) */ +#define ATA_I_IN 0x02 /* read (1) | write (0) */ +#define ATA_I_RELEASE 0x04 /* released bus (1) */ +#define ATA_I_TAGMASK 0xf8 /* tag mask */ + +#define ATA_STATUS 10 /* (R) status */ +#define ATA_ALTSTAT 11 /* (R) alternate status */ +#define ATA_S_ERROR 0x01 /* error */ +#define ATA_S_INDEX 0x02 /* index */ +#define ATA_S_CORR 0x04 /* data corrected */ +#define ATA_S_DRQ 0x08 /* data request */ +#define ATA_S_DSC 0x10 /* drive seek completed */ +#define ATA_S_SERVICE 0x10 /* drive needs service */ +#define ATA_S_DWF 0x20 /* drive write fault */ +#define ATA_S_DMA 0x20 /* DMA ready */ +#define ATA_S_READY 0x40 /* drive ready */ +#define ATA_S_BUSY 0x80 /* busy */ + +#define ATA_CONTROL 12 /* (W) control */ +#define ATA_A_IDS 0x02 /* disable interrupts */ +#define ATA_A_RESET 0x04 /* RESET controller */ +#define ATA_A_4BIT 0x08 /* 4 head bits */ +#define ATA_A_HOB 0x80 /* High Order Byte enable */ /* SATA register defines */ -#define ATA_SSTATUS 13 -#define ATA_SS_DET_MASK 0x0000000f -#define ATA_SS_DET_NO_DEVICE 0x00000000 -#define ATA_SS_DET_DEV_PRESENT 0x00000001 -#define ATA_SS_DET_PHY_ONLINE 0x00000003 -#define ATA_SS_DET_PHY_OFFLINE 0x00000004 - -#define ATA_SS_SPD_MASK 0x000000f0 -#define ATA_SS_SPD_NO_SPEED 0x00000000 -#define ATA_SS_SPD_GEN1 0x00000010 -#define ATA_SS_SPD_GEN2 0x00000020 -#define ATA_SS_SPD_GEN3 0x00000030 - -#define ATA_SS_IPM_MASK 0x00000f00 -#define ATA_SS_IPM_NO_DEVICE 0x00000000 -#define ATA_SS_IPM_ACTIVE 0x00000100 -#define ATA_SS_IPM_PARTIAL 0x00000200 -#define ATA_SS_IPM_SLUMBER 0x00000600 -#define ATA_SS_IPM_DEVSLEEP 0x00000800 - -#define ATA_SERROR 14 -#define ATA_SE_DATA_CORRECTED 0x00000001 -#define ATA_SE_COMM_CORRECTED 0x00000002 -#define ATA_SE_DATA_ERR 0x00000100 -#define ATA_SE_COMM_ERR 0x00000200 -#define ATA_SE_PROT_ERR 0x00000400 -#define ATA_SE_HOST_ERR 0x00000800 -#define ATA_SE_PHY_CHANGED 0x00010000 -#define ATA_SE_PHY_IERROR 0x00020000 -#define ATA_SE_COMM_WAKE 0x00040000 -#define ATA_SE_DECODE_ERR 0x00080000 -#define ATA_SE_PARITY_ERR 0x00100000 -#define ATA_SE_CRC_ERR 0x00200000 -#define ATA_SE_HANDSHAKE_ERR 0x00400000 -#define ATA_SE_LINKSEQ_ERR 0x00800000 -#define ATA_SE_TRANSPORT_ERR 0x01000000 -#define ATA_SE_UNKNOWN_FIS 0x02000000 -#define ATA_SE_EXCHANGED 0x04000000 - -#define ATA_SCONTROL 15 -#define ATA_SC_DET_MASK 0x0000000f -#define ATA_SC_DET_IDLE 0x00000000 -#define ATA_SC_DET_RESET 0x00000001 -#define ATA_SC_DET_DISABLE 0x00000004 - -#define ATA_SC_SPD_MASK 0x000000f0 -#define ATA_SC_SPD_NO_SPEED 0x00000000 -#define ATA_SC_SPD_SPEED_GEN1 0x00000010 -#define ATA_SC_SPD_SPEED_GEN2 0x00000020 -#define ATA_SC_SPD_SPEED_GEN3 0x00000030 - -#define ATA_SC_IPM_MASK 0x00000f00 -#define ATA_SC_IPM_NONE 0x00000000 -#define ATA_SC_IPM_DIS_PARTIAL 0x00000100 -#define ATA_SC_IPM_DIS_SLUMBER 0x00000200 -#define ATA_SC_IPM_DIS_DEVSLEEP 0x00000400 - -#define ATA_SACTIVE 16 - -#define AHCI_MAX_PORTS 32 -#define AHCI_MAX_SLOTS 32 -#define AHCI_MAX_IRQS 16 +#define ATA_SSTATUS 13 +#define ATA_SS_DET_MASK 0x0000000f +#define ATA_SS_DET_NO_DEVICE 0x00000000 +#define ATA_SS_DET_DEV_PRESENT 0x00000001 +#define ATA_SS_DET_PHY_ONLINE 0x00000003 +#define ATA_SS_DET_PHY_OFFLINE 0x00000004 + +#define ATA_SS_SPD_MASK 0x000000f0 
+#define ATA_SS_SPD_NO_SPEED 0x00000000 +#define ATA_SS_SPD_GEN1 0x00000010 +#define ATA_SS_SPD_GEN2 0x00000020 +#define ATA_SS_SPD_GEN3 0x00000030 + +#define ATA_SS_IPM_MASK 0x00000f00 +#define ATA_SS_IPM_NO_DEVICE 0x00000000 +#define ATA_SS_IPM_ACTIVE 0x00000100 +#define ATA_SS_IPM_PARTIAL 0x00000200 +#define ATA_SS_IPM_SLUMBER 0x00000600 +#define ATA_SS_IPM_DEVSLEEP 0x00000800 + +#define ATA_SERROR 14 +#define ATA_SE_DATA_CORRECTED 0x00000001 +#define ATA_SE_COMM_CORRECTED 0x00000002 +#define ATA_SE_DATA_ERR 0x00000100 +#define ATA_SE_COMM_ERR 0x00000200 +#define ATA_SE_PROT_ERR 0x00000400 +#define ATA_SE_HOST_ERR 0x00000800 +#define ATA_SE_PHY_CHANGED 0x00010000 +#define ATA_SE_PHY_IERROR 0x00020000 +#define ATA_SE_COMM_WAKE 0x00040000 +#define ATA_SE_DECODE_ERR 0x00080000 +#define ATA_SE_PARITY_ERR 0x00100000 +#define ATA_SE_CRC_ERR 0x00200000 +#define ATA_SE_HANDSHAKE_ERR 0x00400000 +#define ATA_SE_LINKSEQ_ERR 0x00800000 +#define ATA_SE_TRANSPORT_ERR 0x01000000 +#define ATA_SE_UNKNOWN_FIS 0x02000000 +#define ATA_SE_EXCHANGED 0x04000000 + +#define ATA_SCONTROL 15 +#define ATA_SC_DET_MASK 0x0000000f +#define ATA_SC_DET_IDLE 0x00000000 +#define ATA_SC_DET_RESET 0x00000001 +#define ATA_SC_DET_DISABLE 0x00000004 + +#define ATA_SC_SPD_MASK 0x000000f0 +#define ATA_SC_SPD_NO_SPEED 0x00000000 +#define ATA_SC_SPD_SPEED_GEN1 0x00000010 +#define ATA_SC_SPD_SPEED_GEN2 0x00000020 +#define ATA_SC_SPD_SPEED_GEN3 0x00000030 + +#define ATA_SC_IPM_MASK 0x00000f00 +#define ATA_SC_IPM_NONE 0x00000000 +#define ATA_SC_IPM_DIS_PARTIAL 0x00000100 +#define ATA_SC_IPM_DIS_SLUMBER 0x00000200 +#define ATA_SC_IPM_DIS_DEVSLEEP 0x00000400 + +#define ATA_SACTIVE 16 + +#define AHCI_MAX_PORTS 32 +#define AHCI_MAX_SLOTS 32 +#define AHCI_MAX_IRQS 16 /* SATA AHCI v1.0 register defines */ -#define AHCI_CAP 0x00 -#define AHCI_CAP_NPMASK 0x0000001f -#define AHCI_CAP_SXS 0x00000020 -#define AHCI_CAP_EMS 0x00000040 -#define AHCI_CAP_CCCS 0x00000080 -#define AHCI_CAP_NCS 0x00001F00 -#define AHCI_CAP_NCS_SHIFT 8 -#define AHCI_CAP_PSC 0x00002000 -#define AHCI_CAP_SSC 0x00004000 -#define AHCI_CAP_PMD 0x00008000 -#define AHCI_CAP_FBSS 0x00010000 -#define AHCI_CAP_SPM 0x00020000 -#define AHCI_CAP_SAM 0x00080000 -#define AHCI_CAP_ISS 0x00F00000 -#define AHCI_CAP_ISS_SHIFT 20 -#define AHCI_CAP_SCLO 0x01000000 -#define AHCI_CAP_SAL 0x02000000 -#define AHCI_CAP_SALP 0x04000000 -#define AHCI_CAP_SSS 0x08000000 -#define AHCI_CAP_SMPS 0x10000000 -#define AHCI_CAP_SSNTF 0x20000000 -#define AHCI_CAP_SNCQ 0x40000000 -#define AHCI_CAP_64BIT 0x80000000 - -#define AHCI_GHC 0x04 -#define AHCI_GHC_AE 0x80000000 -#define AHCI_GHC_MRSM 0x00000004 -#define AHCI_GHC_IE 0x00000002 -#define AHCI_GHC_HR 0x00000001 - -#define AHCI_IS 0x08 -#define AHCI_PI 0x0c -#define AHCI_VS 0x10 - -#define AHCI_CCCC 0x14 -#define AHCI_CCCC_TV_MASK 0xffff0000 -#define AHCI_CCCC_TV_SHIFT 16 -#define AHCI_CCCC_CC_MASK 0x0000ff00 -#define AHCI_CCCC_CC_SHIFT 8 -#define AHCI_CCCC_INT_MASK 0x000000f8 -#define AHCI_CCCC_INT_SHIFT 3 -#define AHCI_CCCC_EN 0x00000001 -#define AHCI_CCCP 0x18 - -#define AHCI_EM_LOC 0x1C -#define AHCI_EM_CTL 0x20 -#define AHCI_EM_MR 0x00000001 -#define AHCI_EM_TM 0x00000100 -#define AHCI_EM_RST 0x00000200 -#define AHCI_EM_LED 0x00010000 -#define AHCI_EM_SAFTE 0x00020000 -#define AHCI_EM_SES2 0x00040000 -#define AHCI_EM_SGPIO 0x00080000 -#define AHCI_EM_SMB 0x01000000 -#define AHCI_EM_XMT 0x02000000 -#define AHCI_EM_ALHD 0x04000000 -#define AHCI_EM_PM 0x08000000 - -#define AHCI_CAP2 0x24 -#define AHCI_CAP2_BOH 0x00000001 -#define AHCI_CAP2_NVMP 
0x00000002 -#define AHCI_CAP2_APST 0x00000004 -#define AHCI_CAP2_SDS 0x00000008 -#define AHCI_CAP2_SADM 0x00000010 -#define AHCI_CAP2_DESO 0x00000020 - -#define AHCI_OFFSET 0x100 -#define AHCI_STEP 0x80 - -#define AHCI_P_CLB 0x00 -#define AHCI_P_CLBU 0x04 -#define AHCI_P_FB 0x08 -#define AHCI_P_FBU 0x0c -#define AHCI_P_IS 0x10 -#define AHCI_P_IE 0x14 -#define AHCI_P_IX_DHR 0x00000001 -#define AHCI_P_IX_PS 0x00000002 -#define AHCI_P_IX_DS 0x00000004 -#define AHCI_P_IX_SDB 0x00000008 -#define AHCI_P_IX_UF 0x00000010 -#define AHCI_P_IX_DP 0x00000020 -#define AHCI_P_IX_PC 0x00000040 -#define AHCI_P_IX_MP 0x00000080 - -#define AHCI_P_IX_PRC 0x00400000 -#define AHCI_P_IX_IPM 0x00800000 -#define AHCI_P_IX_OF 0x01000000 -#define AHCI_P_IX_INF 0x04000000 -#define AHCI_P_IX_IF 0x08000000 -#define AHCI_P_IX_HBD 0x10000000 -#define AHCI_P_IX_HBF 0x20000000 -#define AHCI_P_IX_TFE 0x40000000 -#define AHCI_P_IX_CPD 0x80000000 - -#define AHCI_P_CMD 0x18 -#define AHCI_P_CMD_ST 0x00000001 -#define AHCI_P_CMD_SUD 0x00000002 -#define AHCI_P_CMD_POD 0x00000004 -#define AHCI_P_CMD_CLO 0x00000008 -#define AHCI_P_CMD_FRE 0x00000010 -#define AHCI_P_CMD_CCS_MASK 0x00001f00 -#define AHCI_P_CMD_CCS_SHIFT 8 -#define AHCI_P_CMD_ISS 0x00002000 -#define AHCI_P_CMD_FR 0x00004000 -#define AHCI_P_CMD_CR 0x00008000 -#define AHCI_P_CMD_CPS 0x00010000 -#define AHCI_P_CMD_PMA 0x00020000 -#define AHCI_P_CMD_HPCP 0x00040000 -#define AHCI_P_CMD_MPSP 0x00080000 -#define AHCI_P_CMD_CPD 0x00100000 -#define AHCI_P_CMD_ESP 0x00200000 -#define AHCI_P_CMD_FBSCP 0x00400000 -#define AHCI_P_CMD_APSTE 0x00800000 -#define AHCI_P_CMD_ATAPI 0x01000000 -#define AHCI_P_CMD_DLAE 0x02000000 -#define AHCI_P_CMD_ALPE 0x04000000 -#define AHCI_P_CMD_ASP 0x08000000 -#define AHCI_P_CMD_ICC_MASK 0xf0000000 -#define AHCI_P_CMD_NOOP 0x00000000 -#define AHCI_P_CMD_ACTIVE 0x10000000 -#define AHCI_P_CMD_PARTIAL 0x20000000 -#define AHCI_P_CMD_SLUMBER 0x60000000 -#define AHCI_P_CMD_DEVSLEEP 0x80000000 - -#define AHCI_P_TFD 0x20 -#define AHCI_P_SIG 0x24 -#define AHCI_P_SSTS 0x28 -#define AHCI_P_SCTL 0x2c -#define AHCI_P_SERR 0x30 -#define AHCI_P_SACT 0x34 -#define AHCI_P_CI 0x38 -#define AHCI_P_SNTF 0x3C -#define AHCI_P_FBS 0x40 -#define AHCI_P_FBS_EN 0x00000001 -#define AHCI_P_FBS_DEC 0x00000002 -#define AHCI_P_FBS_SDE 0x00000004 -#define AHCI_P_FBS_DEV 0x00000f00 -#define AHCI_P_FBS_DEV_SHIFT 8 -#define AHCI_P_FBS_ADO 0x0000f000 -#define AHCI_P_FBS_ADO_SHIFT 12 -#define AHCI_P_FBS_DWE 0x000f0000 -#define AHCI_P_FBS_DWE_SHIFT 16 -#define AHCI_P_DEVSLP 0x44 -#define AHCI_P_DEVSLP_ADSE 0x00000001 -#define AHCI_P_DEVSLP_DSP 0x00000002 -#define AHCI_P_DEVSLP_DETO 0x000003fc -#define AHCI_P_DEVSLP_DETO_SHIFT 2 -#define AHCI_P_DEVSLP_MDAT 0x00007c00 -#define AHCI_P_DEVSLP_MDAT_SHIFT 10 -#define AHCI_P_DEVSLP_DITO 0x01ff8000 -#define AHCI_P_DEVSLP_DITO_SHIFT 15 -#define AHCI_P_DEVSLP_DM 0x0e000000 -#define AHCI_P_DEVSLP_DM_SHIFT 25 +#define AHCI_CAP 0x00 +#define AHCI_CAP_NPMASK 0x0000001f +#define AHCI_CAP_SXS 0x00000020 +#define AHCI_CAP_EMS 0x00000040 +#define AHCI_CAP_CCCS 0x00000080 +#define AHCI_CAP_NCS 0x00001F00 +#define AHCI_CAP_NCS_SHIFT 8 +#define AHCI_CAP_PSC 0x00002000 +#define AHCI_CAP_SSC 0x00004000 +#define AHCI_CAP_PMD 0x00008000 +#define AHCI_CAP_FBSS 0x00010000 +#define AHCI_CAP_SPM 0x00020000 +#define AHCI_CAP_SAM 0x00080000 +#define AHCI_CAP_ISS 0x00F00000 +#define AHCI_CAP_ISS_SHIFT 20 +#define AHCI_CAP_SCLO 0x01000000 +#define AHCI_CAP_SAL 0x02000000 +#define AHCI_CAP_SALP 0x04000000 +#define AHCI_CAP_SSS 0x08000000 +#define AHCI_CAP_SMPS 
0x10000000 +#define AHCI_CAP_SSNTF 0x20000000 +#define AHCI_CAP_SNCQ 0x40000000 +#define AHCI_CAP_64BIT 0x80000000 + +#define AHCI_GHC 0x04 +#define AHCI_GHC_AE 0x80000000 +#define AHCI_GHC_MRSM 0x00000004 +#define AHCI_GHC_IE 0x00000002 +#define AHCI_GHC_HR 0x00000001 + +#define AHCI_IS 0x08 +#define AHCI_PI 0x0c +#define AHCI_VS 0x10 + +#define AHCI_CCCC 0x14 +#define AHCI_CCCC_TV_MASK 0xffff0000 +#define AHCI_CCCC_TV_SHIFT 16 +#define AHCI_CCCC_CC_MASK 0x0000ff00 +#define AHCI_CCCC_CC_SHIFT 8 +#define AHCI_CCCC_INT_MASK 0x000000f8 +#define AHCI_CCCC_INT_SHIFT 3 +#define AHCI_CCCC_EN 0x00000001 +#define AHCI_CCCP 0x18 + +#define AHCI_EM_LOC 0x1C +#define AHCI_EM_CTL 0x20 +#define AHCI_EM_MR 0x00000001 +#define AHCI_EM_TM 0x00000100 +#define AHCI_EM_RST 0x00000200 +#define AHCI_EM_LED 0x00010000 +#define AHCI_EM_SAFTE 0x00020000 +#define AHCI_EM_SES2 0x00040000 +#define AHCI_EM_SGPIO 0x00080000 +#define AHCI_EM_SMB 0x01000000 +#define AHCI_EM_XMT 0x02000000 +#define AHCI_EM_ALHD 0x04000000 +#define AHCI_EM_PM 0x08000000 + +#define AHCI_CAP2 0x24 +#define AHCI_CAP2_BOH 0x00000001 +#define AHCI_CAP2_NVMP 0x00000002 +#define AHCI_CAP2_APST 0x00000004 +#define AHCI_CAP2_SDS 0x00000008 +#define AHCI_CAP2_SADM 0x00000010 +#define AHCI_CAP2_DESO 0x00000020 + +#define AHCI_OFFSET 0x100 +#define AHCI_STEP 0x80 + +#define AHCI_P_CLB 0x00 +#define AHCI_P_CLBU 0x04 +#define AHCI_P_FB 0x08 +#define AHCI_P_FBU 0x0c +#define AHCI_P_IS 0x10 +#define AHCI_P_IE 0x14 +#define AHCI_P_IX_DHR 0x00000001 +#define AHCI_P_IX_PS 0x00000002 +#define AHCI_P_IX_DS 0x00000004 +#define AHCI_P_IX_SDB 0x00000008 +#define AHCI_P_IX_UF 0x00000010 +#define AHCI_P_IX_DP 0x00000020 +#define AHCI_P_IX_PC 0x00000040 +#define AHCI_P_IX_MP 0x00000080 + +#define AHCI_P_IX_PRC 0x00400000 +#define AHCI_P_IX_IPM 0x00800000 +#define AHCI_P_IX_OF 0x01000000 +#define AHCI_P_IX_INF 0x04000000 +#define AHCI_P_IX_IF 0x08000000 +#define AHCI_P_IX_HBD 0x10000000 +#define AHCI_P_IX_HBF 0x20000000 +#define AHCI_P_IX_TFE 0x40000000 +#define AHCI_P_IX_CPD 0x80000000 + +#define AHCI_P_CMD 0x18 +#define AHCI_P_CMD_ST 0x00000001 +#define AHCI_P_CMD_SUD 0x00000002 +#define AHCI_P_CMD_POD 0x00000004 +#define AHCI_P_CMD_CLO 0x00000008 +#define AHCI_P_CMD_FRE 0x00000010 +#define AHCI_P_CMD_CCS_MASK 0x00001f00 +#define AHCI_P_CMD_CCS_SHIFT 8 +#define AHCI_P_CMD_ISS 0x00002000 +#define AHCI_P_CMD_FR 0x00004000 +#define AHCI_P_CMD_CR 0x00008000 +#define AHCI_P_CMD_CPS 0x00010000 +#define AHCI_P_CMD_PMA 0x00020000 +#define AHCI_P_CMD_HPCP 0x00040000 +#define AHCI_P_CMD_MPSP 0x00080000 +#define AHCI_P_CMD_CPD 0x00100000 +#define AHCI_P_CMD_ESP 0x00200000 +#define AHCI_P_CMD_FBSCP 0x00400000 +#define AHCI_P_CMD_APSTE 0x00800000 +#define AHCI_P_CMD_ATAPI 0x01000000 +#define AHCI_P_CMD_DLAE 0x02000000 +#define AHCI_P_CMD_ALPE 0x04000000 +#define AHCI_P_CMD_ASP 0x08000000 +#define AHCI_P_CMD_ICC_MASK 0xf0000000 +#define AHCI_P_CMD_NOOP 0x00000000 +#define AHCI_P_CMD_ACTIVE 0x10000000 +#define AHCI_P_CMD_PARTIAL 0x20000000 +#define AHCI_P_CMD_SLUMBER 0x60000000 +#define AHCI_P_CMD_DEVSLEEP 0x80000000 + +#define AHCI_P_TFD 0x20 +#define AHCI_P_SIG 0x24 +#define AHCI_P_SSTS 0x28 +#define AHCI_P_SCTL 0x2c +#define AHCI_P_SERR 0x30 +#define AHCI_P_SACT 0x34 +#define AHCI_P_CI 0x38 +#define AHCI_P_SNTF 0x3C +#define AHCI_P_FBS 0x40 +#define AHCI_P_FBS_EN 0x00000001 +#define AHCI_P_FBS_DEC 0x00000002 +#define AHCI_P_FBS_SDE 0x00000004 +#define AHCI_P_FBS_DEV 0x00000f00 +#define AHCI_P_FBS_DEV_SHIFT 8 +#define AHCI_P_FBS_ADO 0x0000f000 +#define AHCI_P_FBS_ADO_SHIFT 12 
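AHCI_OFFSET (0x100) and AHCI_STEP (0x80), defined earlier in this block, describe the per-port register layout: each port owns a 0x80-byte window starting 0x100 into the HBA's MMIO region, and the AHCI_P_* values are offsets within that window. A minimal sketch of the address arithmetic (the helper name is hypothetical; the constants are the ones defined above):

	/* Byte offset of a per-port register within the HBA MMIO region. */
	static inline uint32_t
	ahci_port_reg_offset(uint32_t port, uint32_t reg)
	{
		return (AHCI_OFFSET + port * AHCI_STEP + reg);
	}

For example, PxCMD of port 2 sits at 0x100 + 2 * 0x80 + 0x18 = 0x218; both the emulation and guest drivers locate port registers this way.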
+#define AHCI_P_FBS_DWE 0x000f0000 +#define AHCI_P_FBS_DWE_SHIFT 16 +#define AHCI_P_DEVSLP 0x44 +#define AHCI_P_DEVSLP_ADSE 0x00000001 +#define AHCI_P_DEVSLP_DSP 0x00000002 +#define AHCI_P_DEVSLP_DETO 0x000003fc +#define AHCI_P_DEVSLP_DETO_SHIFT 2 +#define AHCI_P_DEVSLP_MDAT 0x00007c00 +#define AHCI_P_DEVSLP_MDAT_SHIFT 10 +#define AHCI_P_DEVSLP_DITO 0x01ff8000 +#define AHCI_P_DEVSLP_DITO_SHIFT 15 +#define AHCI_P_DEVSLP_DM 0x0e000000 +#define AHCI_P_DEVSLP_DM_SHIFT 25 /* Just to be sure, if building as module. */ #if MAXPHYS < 512 * 1024 #undef MAXPHYS -#define MAXPHYS 512 * 1024 +#define MAXPHYS 512 * 1024 #endif /* Pessimistic prognosis on number of required S/G entries */ -#define AHCI_SG_ENTRIES (roundup(btoc(MAXPHYS) + 1, 8)) +#define AHCI_SG_ENTRIES (roundup(btoc(MAXPHYS) + 1, 8)) /* Command list. 32 commands. First, 1Kbyte aligned. */ -#define AHCI_CL_OFFSET 0 -#define AHCI_CL_SIZE 32 +#define AHCI_CL_OFFSET 0 +#define AHCI_CL_SIZE 32 /* Command tables. Up to 32 commands, Each, 128byte aligned. */ -#define AHCI_CT_OFFSET (AHCI_CL_OFFSET + AHCI_CL_SIZE * AHCI_MAX_SLOTS) -#define AHCI_CT_SIZE (128 + AHCI_SG_ENTRIES * 16) +#define AHCI_CT_OFFSET (AHCI_CL_OFFSET + AHCI_CL_SIZE * AHCI_MAX_SLOTS) +#define AHCI_CT_SIZE (128 + AHCI_SG_ENTRIES * 16) /* Total main work area. */ -#define AHCI_WORK_SIZE (AHCI_CT_OFFSET + AHCI_CT_SIZE * ch->numslots) +#define AHCI_WORK_SIZE (AHCI_CT_OFFSET + AHCI_CT_SIZE * ch->numslots) #endif /* _AHCI_H_ */ diff --git a/usr/src/cmd/bhyve/atkbdc.c b/usr/src/cmd/bhyve/atkbdc.c index 8e71b0507c..1c1838c2e8 100644 --- a/usr/src/cmd/bhyve/atkbdc.c +++ b/usr/src/cmd/bhyve/atkbdc.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * Copyright (c) 2015 Nahanni Systems Inc. * All rights reserved. @@ -45,6 +47,7 @@ __FBSDID("$FreeBSD$"); #include <pthread_np.h> #include "acpi.h" +#include "atkbdc.h" #include "inout.h" #include "pci_emul.h" #include "pci_irq.h" diff --git a/usr/src/cmd/bhyve/bhyvegc.c b/usr/src/cmd/bhyve/bhyvegc.c index 11cc2b1fb4..4bd49ded79 100644 --- a/usr/src/cmd/bhyve/bhyvegc.c +++ b/usr/src/cmd/bhyve/bhyvegc.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * All rights reserved. * diff --git a/usr/src/cmd/bhyve/bhyvegc.h b/usr/src/cmd/bhyve/bhyvegc.h index fa2ab68d9e..11323586df 100644 --- a/usr/src/cmd/bhyve/bhyvegc.h +++ b/usr/src/cmd/bhyve/bhyvegc.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * All rights reserved. 
* diff --git a/usr/src/cmd/bhyve/bhyverun.c b/usr/src/cmd/bhyve/bhyverun.c index b12fba0800..317d640a2c 100644 --- a/usr/src/cmd/bhyve/bhyverun.c +++ b/usr/src/cmd/bhyve/bhyverun.c @@ -145,14 +145,14 @@ static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); static struct vm_exit vmexit[VM_MAXCPU]; struct bhyvestats { - uint64_t vmexit_bogus; + uint64_t vmexit_bogus; uint64_t vmexit_reqidle; - uint64_t vmexit_hlt; - uint64_t vmexit_pause; - uint64_t vmexit_mtrap; - uint64_t vmexit_inst_emul; - uint64_t cpu_switch_rotate; - uint64_t cpu_switch_direct; + uint64_t vmexit_hlt; + uint64_t vmexit_pause; + uint64_t vmexit_mtrap; + uint64_t vmexit_inst_emul; + uint64_t cpu_switch_rotate; + uint64_t cpu_switch_direct; } stats; struct mt_vmm_info { @@ -180,7 +180,7 @@ usage(int code) #endif " -a: local apic is in xAPIC mode (deprecated)\n" " -A: create ACPI tables\n" - " -c: number of cpus and/or topology specification" + " -c: number of cpus and/or topology specification\n" " -C: include guest memory in core file\n" " -e: exit on unhandled I/O access\n" " -g: gdb port\n" @@ -228,6 +228,8 @@ topology_parse(const char *opt) c = 1, n = 1, s = 1, t = 1; ns = false, scts = false; str = strdup(opt); + if (str == NULL) + goto out; while ((cp = strsep(&str, ",")) != NULL) { if (sscanf(cp, "%i%n", &tmp, &chk) == 1) { @@ -253,11 +255,14 @@ topology_parse(const char *opt) } else if (cp[0] == '\0') continue; else - return (-1); + goto out; /* Any trailing garbage causes an error */ if (cp[chk] != '\0') - return (-1); + goto out; } + free(str); + str = NULL; + /* * Range check 1 <= n <= UINT16_MAX all values */ @@ -283,6 +288,10 @@ topology_parse(const char *opt) cores = c; threads = t; return(0); + +out: + free(str); + return (-1); } #ifndef WITHOUT_CAPSICUM @@ -462,7 +471,7 @@ fbsdrun_deletecpu(struct vmctx *ctx, int vcpu) if (!CPU_ISSET(vcpu, &cpumask)) { fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu); - exit(1); + exit(4); } CPU_CLR_ATOMIC(vcpu, &cpumask); @@ -478,7 +487,7 @@ vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, * put guest-driven debug here */ #endif - return (VMEXIT_CONTINUE); + return (VMEXIT_CONTINUE); } static int @@ -808,7 +817,7 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip) if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", exitcode); - exit(1); + exit(4); } rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu); @@ -819,7 +828,7 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip) case VMEXIT_ABORT: abort(); default: - exit(1); + exit(4); } } fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); @@ -851,7 +860,7 @@ fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp); if (err < 0) { fprintf(stderr, "VM exit on HLT not supported\n"); - exit(1); + exit(4); } vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1); if (cpu == BSP) @@ -866,7 +875,7 @@ fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) if (err < 0) { fprintf(stderr, "SMP mux requested, no pause support\n"); - exit(1); + exit(4); } vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1); if (cpu == BSP) @@ -880,7 +889,7 @@ fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) if (err) { fprintf(stderr, "Unable to set x2apic state (%d)\n", err); - exit(1); + exit(4); } #ifdef __FreeBSD__ @@ -918,7 +927,7 @@ do_open(const char *vmname) } } else { perror("vm_create"); - exit(1); + exit(4); } } else { if (!romboot) { @@ -927,14 +936,14 @@ 
do_open(const char *vmname) * bootrom must be configured to boot it. */ fprintf(stderr, "virtual machine cannot be booted\n"); - exit(1); + exit(4); } } ctx = vm_open(vmname); if (ctx == NULL) { perror("vm_open"); - exit(1); + exit(4); } #ifndef WITHOUT_CAPSICUM @@ -956,7 +965,7 @@ do_open(const char *vmname) error = vm_reinit(ctx); if (error) { perror("vm_reinit"); - exit(1); + exit(4); } } error = vm_set_topology(ctx, sockets, cores, threads, maxcpus); @@ -1062,14 +1071,20 @@ main(int argc, char *argv[]) gdb_port = atoi(optarg); break; case 'l': - if (lpc_device_parse(optarg) != 0) { + if (strncmp(optarg, "help", strlen(optarg)) == 0) { + lpc_print_supported_devices(); + exit(0); + } else if (lpc_device_parse(optarg) != 0) { errx(EX_USAGE, "invalid lpc device " "configuration '%s'", optarg); } break; case 's': - if (pci_parse_slot(optarg) != 0) - exit(1); + if (strncmp(optarg, "help", strlen(optarg)) == 0) { + pci_print_supported_devices(); + exit(0); + } else if (pci_parse_slot(optarg) != 0) + exit(4); else break; case 'S': @@ -1135,7 +1150,7 @@ main(int argc, char *argv[]) if (guest_ncpus > max_vcpus) { fprintf(stderr, "%d vCPUs requested but only %d available\n", guest_ncpus, max_vcpus); - exit(1); + exit(4); } fbsdrun_set_capabilities(ctx, BSP); @@ -1157,13 +1172,13 @@ main(int argc, char *argv[]) #endif if (err) { fprintf(stderr, "Unable to setup memory (%d)\n", errno); - exit(1); + exit(4); } error = init_msr(); if (error) { fprintf(stderr, "init_msr error %d", error); - exit(1); + exit(4); } init_mem(); @@ -1178,8 +1193,10 @@ main(int argc, char *argv[]) /* * Exit if a device emulation finds an error in its initialization */ - if (init_pci(ctx) != 0) - exit(1); + if (init_pci(ctx) != 0) { + perror("device emulation initialization error"); + exit(4); + } if (dbg_port != 0) init_dbgport(dbg_port); @@ -1196,7 +1213,7 @@ main(int argc, char *argv[]) if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) { fprintf(stderr, "ROM boot failed: unrestricted guest " "capability not available\n"); - exit(1); + exit(4); } error = vcpu_reset(ctx, BSP); assert(error == 0); @@ -1210,8 +1227,10 @@ main(int argc, char *argv[]) */ if (mptgen) { error = mptable_build(ctx, guest_ncpus); - if (error) - exit(1); + if (error) { + perror("failed to build the guest tables"); + exit(4); + } } error = smbios_build(ctx); @@ -1225,21 +1244,21 @@ main(int argc, char *argv[]) if (lpc_bootrom()) fwctl_init(); + /* + * Change the proc title to include the VM name. + */ + setproctitle("%s", vmname); + #ifndef WITHOUT_CAPSICUM caph_cache_catpages(); if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); - if (cap_enter() == -1 && errno != ENOSYS) + if (caph_enter() == -1) errx(EX_OSERR, "cap_enter() failed"); #endif - /* - * Change the proc title to include the VM name.
- */ - setproctitle("%s", vmname); - #ifndef __FreeBSD__ /* * If applicable, wait for bhyveconsole @@ -1269,5 +1288,5 @@ main(int argc, char *argv[]) */ mevent_dispatch(); - exit(1); + exit(4); } diff --git a/usr/src/cmd/bhyve/block_if.c b/usr/src/cmd/bhyve/block_if.c index 53d8507f8e..81a305493e 100644 --- a/usr/src/cmd/bhyve/block_if.c +++ b/usr/src/cmd/bhyve/block_if.c @@ -117,8 +117,8 @@ struct blockif_ctxt { int bc_psectoff; int bc_closing; pthread_t bc_btid[BLOCKIF_NUMTHR]; - pthread_mutex_t bc_mtx; - pthread_cond_t bc_cond; + pthread_mutex_t bc_mtx; + pthread_cond_t bc_cond; /* Request elements and free/pending/busy queues */ TAILQ_HEAD(, blockif_elem) bc_freeq; diff --git a/usr/src/cmd/bhyve/block_if.h b/usr/src/cmd/bhyve/block_if.h index 265048d90f..d01e5c9213 100644 --- a/usr/src/cmd/bhyve/block_if.h +++ b/usr/src/cmd/bhyve/block_if.h @@ -53,12 +53,12 @@ #endif struct blockif_req { - struct iovec br_iov[BLOCKIF_IOV_MAX]; int br_iovcnt; off_t br_offset; ssize_t br_resid; void (*br_callback)(struct blockif_req *req, int err); void *br_param; + struct iovec br_iov[BLOCKIF_IOV_MAX]; }; struct blockif_ctxt; diff --git a/usr/src/cmd/bhyve/bootrom.c b/usr/src/cmd/bhyve/bootrom.c index 5e4e0e93a2..b8c63828c8 100644 --- a/usr/src/cmd/bhyve/bootrom.c +++ b/usr/src/cmd/bhyve/bootrom.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Neel Natu <neel@freebsd.org> * All rights reserved. * diff --git a/usr/src/cmd/bhyve/bootrom.h b/usr/src/cmd/bhyve/bootrom.h index af150d3255..7fb12181dd 100644 --- a/usr/src/cmd/bhyve/bootrom.h +++ b/usr/src/cmd/bhyve/bootrom.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Neel Natu <neel@freebsd.org> * All rights reserved. * @@ -33,6 +35,6 @@ struct vmctx; -int bootrom_init(struct vmctx *ctx, const char *romfile); +int bootrom_init(struct vmctx *ctx, const char *romfile); #endif diff --git a/usr/src/cmd/bhyve/console.c b/usr/src/cmd/bhyve/console.c index ebb9c921bf..2567f69959 100644 --- a/usr/src/cmd/bhyve/console.c +++ b/usr/src/cmd/bhyve/console.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * All rights reserved. * diff --git a/usr/src/cmd/bhyve/console.h b/usr/src/cmd/bhyve/console.h index 47193938a6..0d0a854866 100644 --- a/usr/src/cmd/bhyve/console.h +++ b/usr/src/cmd/bhyve/console.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * All rights reserved. 
* @@ -35,19 +37,19 @@ typedef void (*fb_render_func_t)(struct bhyvegc *gc, void *arg); typedef void (*kbd_event_func_t)(int down, uint32_t keysym, void *arg); typedef void (*ptr_event_func_t)(uint8_t mask, int x, int y, void *arg); -void console_init(int w, int h, void *fbaddr); +void console_init(int w, int h, void *fbaddr); -void console_set_fbaddr(void *fbaddr); +void console_set_fbaddr(void *fbaddr); struct bhyvegc_image *console_get_image(void); -void console_fb_register(fb_render_func_t render_cb, void *arg); -void console_refresh(void); +void console_fb_register(fb_render_func_t render_cb, void *arg); +void console_refresh(void); -void console_kbd_register(kbd_event_func_t event_cb, void *arg, int pri); -void console_key_event(int down, uint32_t keysym); +void console_kbd_register(kbd_event_func_t event_cb, void *arg, int pri); +void console_key_event(int down, uint32_t keysym); -void console_ptr_register(ptr_event_func_t event_cb, void *arg, int pri); -void console_ptr_event(uint8_t button, int x, int y); +void console_ptr_register(ptr_event_func_t event_cb, void *arg, int pri); +void console_ptr_event(uint8_t button, int x, int y); #endif /* _CONSOLE_H_ */ diff --git a/usr/src/cmd/bhyve/consport.c b/usr/src/cmd/bhyve/consport.c index 7996e4ffab..f630cec1f3 100644 --- a/usr/src/cmd/bhyve/consport.c +++ b/usr/src/cmd/bhyve/consport.c @@ -78,14 +78,14 @@ ttyopen(void) static bool tty_char_available(void) { - fd_set rfds; - struct timeval tv; - - FD_ZERO(&rfds); - FD_SET(STDIN_FILENO, &rfds); - tv.tv_sec = 0; - tv.tv_usec = 0; - if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) { + fd_set rfds; + struct timeval tv; + + FD_ZERO(&rfds); + FD_SET(STDIN_FILENO, &rfds); + tv.tv_sec = 0; + tv.tv_usec = 0; + if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) { return (true); } else { return (false); diff --git a/usr/src/cmd/bhyve/dbgport.c b/usr/src/cmd/bhyve/dbgport.c index d6c5f9383e..6b3d26336f 100644 --- a/usr/src/cmd/bhyve/dbgport.c +++ b/usr/src/cmd/bhyve/dbgport.c @@ -139,8 +139,8 @@ init_dbgport(int sport) conn_fd = -1; if ((listen_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { - perror("socket"); - exit(1); + perror("cannot create socket"); + exit(4); } #ifdef __FreeBSD__ @@ -153,18 +153,18 @@ init_dbgport(int sport) reuse = 1; if (setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)) < 0) { - perror("setsockopt"); - exit(1); + perror("cannot set socket options"); + exit(4); } if (bind(listen_fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) { - perror("bind"); - exit(1); + perror("cannot bind socket"); + exit(4); } if (listen(listen_fd, 1) < 0) { - perror("listen"); - exit(1); + perror("cannot listen socket"); + exit(4); } #ifndef WITHOUT_CAPSICUM diff --git a/usr/src/cmd/bhyve/fwctl.c b/usr/src/cmd/bhyve/fwctl.c index 9e90c1ade6..00d6ef8681 100644 --- a/usr/src/cmd/bhyve/fwctl.c +++ b/usr/src/cmd/bhyve/fwctl.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Peter Grehan <grehan@freebsd.org> * All rights reserved. * @@ -373,7 +375,7 @@ fwctl_request(uint32_t value) /* Verify size */ if (value < 12) { printf("msg size error"); - exit(1); + exit(4); } rinfo.req_size = value; rinfo.req_count = 1; diff --git a/usr/src/cmd/bhyve/fwctl.h b/usr/src/cmd/bhyve/fwctl.h index f5f8d131ab..6dad244811 100644 --- a/usr/src/cmd/bhyve/fwctl.h +++ b/usr/src/cmd/bhyve/fwctl.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Peter Grehan <grehan@freebsd.org> * All rights reserved. 
* diff --git a/usr/src/cmd/bhyve/inout.c b/usr/src/cmd/bhyve/inout.c index 693c4fdbac..b460ee2988 100644 --- a/usr/src/cmd/bhyve/inout.c +++ b/usr/src/cmd/bhyve/inout.c @@ -68,21 +68,21 @@ static int default_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { - if (in) { - switch (bytes) { - case 4: - *eax = 0xffffffff; - break; - case 2: - *eax = 0xffff; - break; - case 1: - *eax = 0xff; - break; - } - } - - return (0); + if (in) { + switch (bytes) { + case 4: + *eax = 0xffffffff; + break; + case 2: + *eax = 0xffff; + break; + case 1: + *eax = 0xff; + break; + } + } + + return (0); } static void diff --git a/usr/src/cmd/bhyve/iov.c b/usr/src/cmd/bhyve/iov.c new file mode 100644 index 0000000000..c564bd8ae5 --- /dev/null +++ b/usr/src/cmd/bhyve/iov.c @@ -0,0 +1,141 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Jakub Klama <jceel@FreeBSD.org>. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/uio.h> + +#include <stdlib.h> +#include <string.h> +#include "iov.h" + +void +seek_iov(struct iovec *iov1, size_t niov1, struct iovec *iov2, size_t *niov2, + size_t seek) +{ + size_t remainder = 0; + size_t left = seek; + size_t i, j; + + for (i = 0; i < niov1; i++) { + size_t toseek = MIN(left, iov1[i].iov_len); + left -= toseek; + + if (toseek == iov1[i].iov_len) + continue; + + if (left == 0) { + remainder = toseek; + break; + } + } + + for (j = i; j < niov1; j++) { + iov2[j - i].iov_base = (char *)iov1[j].iov_base + remainder; + iov2[j - i].iov_len = iov1[j].iov_len - remainder; + remainder = 0; + } + + *niov2 = j - i; +} + +size_t +count_iov(struct iovec *iov, size_t niov) +{ + size_t i, total = 0; + + for (i = 0; i < niov; i++) + total += iov[i].iov_len; + + return (total); +} + +size_t +truncate_iov(struct iovec *iov, size_t niov, size_t length) +{ + size_t i, done = 0; + + for (i = 0; i < niov; i++) { + size_t toseek = MIN(length - done, iov[i].iov_len); + done += toseek; + + if (toseek < iov[i].iov_len) { + iov[i].iov_len = toseek; + return (i + 1); + } + } + + return (niov); +} + +ssize_t +iov_to_buf(struct iovec *iov, size_t niov, void **buf) +{ + size_t i, ptr = 0, total = 0; + + for (i = 0; i < niov; i++) { + total += iov[i].iov_len; + *buf = realloc(*buf, total); + if (*buf == NULL) + return (-1); + + memcpy(*buf + ptr, iov[i].iov_base, iov[i].iov_len); + ptr += iov[i].iov_len; + } + + return (total); +} + +ssize_t +buf_to_iov(void *buf, size_t buflen, struct iovec *iov, size_t niov, + size_t seek) +{ + struct iovec *diov; + size_t ndiov, i; + uintptr_t off = 0; + + if (seek > 0) { + diov = malloc(sizeof(struct iovec) * niov); + seek_iov(iov, niov, diov, &ndiov, seek); + } else { + diov = iov; + ndiov = niov; + } + + for (i = 0; i < ndiov; i++) { + memcpy(diov[i].iov_base, buf + off, diov[i].iov_len); + off += diov[i].iov_len; + } + + return ((ssize_t)off); +} + diff --git a/usr/src/cmd/bhyve/iov.h b/usr/src/cmd/bhyve/iov.h new file mode 100644 index 0000000000..87fa4c1dcf --- /dev/null +++ b/usr/src/cmd/bhyve/iov.h @@ -0,0 +1,43 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Jakub Klama <jceel@FreeBSD.org>. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IOV_H_ +#define _IOV_H_ + +void seek_iov(struct iovec *iov1, size_t niov1, struct iovec *iov2, + size_t *niov2, size_t seek); +size_t truncate_iov(struct iovec *iov, size_t niov, size_t length); +size_t count_iov(struct iovec *iov, size_t niov); +ssize_t iov_to_buf(struct iovec *iov, size_t niov, void **buf); +ssize_t buf_to_iov(void *buf, size_t buflen, struct iovec *iov, size_t niov, + size_t seek); + +#endif /* _IOV_H_ */ diff --git a/usr/src/cmd/bhyve/mem.c b/usr/src/cmd/bhyve/mem.c index 105d37cf56..85e56af10b 100644 --- a/usr/src/cmd/bhyve/mem.c +++ b/usr/src/cmd/bhyve/mem.c @@ -38,15 +38,16 @@ __FBSDID("$FreeBSD$"); #include <sys/types.h> -#include <sys/tree.h> #include <sys/errno.h> +#include <sys/tree.h> #include <machine/vmm.h> #include <machine/vmm_instruction_emul.h> -#include <stdio.h> -#include <stdlib.h> #include <assert.h> +#include <err.h> #include <pthread.h> +#include <stdio.h> +#include <stdlib.h> #include "mem.h" @@ -123,6 +124,7 @@ mmio_rb_add(struct mmio_rb_tree *rbt, struct mmio_rb_range *new) static void mmio_rb_dump(struct mmio_rb_tree *rbt) { + int perror; struct mmio_rb_range *np; pthread_rwlock_rdlock(&mmio_rwlock); @@ -130,7 +132,8 @@ mmio_rb_dump(struct mmio_rb_tree *rbt) printf(" %lx:%lx, %s\n", np->mr_base, np->mr_end, np->mr_param.name); } - pthread_rwlock_unlock(&mmio_rwlock); + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); } #endif @@ -166,7 +169,7 @@ access_memory(struct vmctx *ctx, int vcpu, uint64_t paddr, mem_cb_t *cb, void *arg) { struct mmio_rb_range *entry; - int err, immutable; + int err, perror, immutable; pthread_rwlock_rdlock(&mmio_rwlock); /* @@ -184,7 +187,8 @@ access_memory(struct vmctx *ctx, int vcpu, uint64_t paddr, mem_cb_t *cb, /* Update the per-vCPU cache */ mmio_hint[vcpu] = entry; } else if (mmio_rb_lookup(&mmio_rb_fallback, paddr, &entry)) { - pthread_rwlock_unlock(&mmio_rwlock); + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); return (ESRCH); } } @@ -203,13 +207,18 @@ access_memory(struct vmctx *ctx, int vcpu, uint64_t paddr, mem_cb_t *cb, * config space window as 'immutable' the deadlock can be avoided. 
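 *
 * A hedged sketch of how such a range is registered (the handler,
 * name, and addresses below are illustrative only; the struct fields,
 * MEM_F_* flags, and register_mem() come from mem.h):
 *
 *	static int
 *	scratch_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
 *	    int size, uint64_t *val, void *arg1, long arg2)
 *	{
 *		if (dir == MEM_F_WRITE)
 *			return (0);
 *		*val = 0;
 *		return (0);
 *	}
 *
 *	static struct mem_range scratch_mr = {
 *		.name = "scratch-window",
 *		.flags = MEM_F_RW | MEM_F_IMMUTABLE,
 *		.handler = scratch_handler,
 *		.base = 0xE0000000,
 *		.size = 0x1000,
 *	};
 *
 *	error = register_mem(&scratch_mr);
 *
 * Because the registration is immutable, the rwlock can be dropped
 * before scratch_handler runs, so the handler may itself call
 * register_mem() or unregister_mem() without self-deadlock.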
*/ immutable = (entry->mr_param.flags & MEM_F_IMMUTABLE); - if (immutable) - pthread_rwlock_unlock(&mmio_rwlock); + if (immutable) { + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); + } err = cb(ctx, vcpu, paddr, &entry->mr_param, arg); - if (!immutable) - pthread_rwlock_unlock(&mmio_rwlock); + if (!immutable) { + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); + } + return (err); } @@ -272,24 +281,27 @@ static int register_mem_int(struct mmio_rb_tree *rbt, struct mem_range *memp) { struct mmio_rb_range *entry, *mrp; - int err; + int err, perror; err = 0; mrp = malloc(sizeof(struct mmio_rb_range)); - - if (mrp != NULL) { + if (mrp == NULL) { + warn("%s: couldn't allocate memory for mrp\n", + __func__); + err = ENOMEM; + } else { mrp->mr_param = *memp; mrp->mr_base = memp->base; mrp->mr_end = memp->base + memp->size - 1; pthread_rwlock_wrlock(&mmio_rwlock); if (mmio_rb_lookup(rbt, memp->base, &entry) != 0) err = mmio_rb_add(rbt, mrp); - pthread_rwlock_unlock(&mmio_rwlock); + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); if (err) free(mrp); - } else - err = ENOMEM; + } return (err); } @@ -313,7 +325,7 @@ unregister_mem(struct mem_range *memp) { struct mem_range *mr; struct mmio_rb_range *entry = NULL; - int err, i; + int err, perror, i; pthread_rwlock_wrlock(&mmio_rwlock); err = mmio_rb_lookup(&mmio_rb_root, memp->base, &entry); @@ -330,7 +342,8 @@ unregister_mem(struct mem_range *memp) mmio_hint[i] = NULL; } } - pthread_rwlock_unlock(&mmio_rwlock); + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); if (entry) free(entry); diff --git a/usr/src/cmd/bhyve/mem.h b/usr/src/cmd/bhyve/mem.h index f386d67749..596c0b0cf3 100644 --- a/usr/src/cmd/bhyve/mem.h +++ b/usr/src/cmd/bhyve/mem.h @@ -55,7 +55,7 @@ struct mem_range { void init_mem(void); int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie, struct vm_guest_paging *paging); - + int read_mem(struct vmctx *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size); int register_mem(struct mem_range *memp); diff --git a/usr/src/cmd/bhyve/mevent.c b/usr/src/cmd/bhyve/mevent.c index edd5cf14cb..4ad33a9f01 100644 --- a/usr/src/cmd/bhyve/mevent.c +++ b/usr/src/cmd/bhyve/mevent.c @@ -82,7 +82,7 @@ static int mevent_timid = 43; static int mevent_pipefd[2]; static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER; -struct mevent { +struct mevent { void (*me_func)(int, enum ev_type, void *); #define me_msecs me_fd int me_fd; @@ -101,7 +101,7 @@ struct mevent { struct sigevent me_sigev; boolean_t me_auto_requeue; #endif - LIST_ENTRY(mevent) me_list; + LIST_ENTRY(mevent) me_list; }; static LIST_HEAD(listhead, mevent) global_head, change_head; diff --git a/usr/src/cmd/bhyve/mevent_test.c b/usr/src/cmd/bhyve/mevent_test.c index 22e3561fed..4da3adb5ae 100644 --- a/usr/src/cmd/bhyve/mevent_test.c +++ b/usr/src/cmd/bhyve/mevent_test.c @@ -164,7 +164,7 @@ echoer(void *param) mev = mevent_add(fd, EVF_READ, echoer_callback, &sync); if (mev == NULL) { printf("Could not allocate echoer event\n"); - exit(1); + exit(4); } while (!pthread_cond_wait(&sync.e_cond, &sync.e_mt)) { @@ -219,27 +219,27 @@ acceptor(void *param) int news; int s; - if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0) { - perror("socket"); - exit(1); - } + if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + perror("cannot create socket"); + exit(4); + } #ifdef __FreeBSD__ - sin.sin_len = sizeof(sin); + sin.sin_len = sizeof(sin); #endif - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = 
htonl(INADDR_ANY); - sin.sin_port = htons(TEST_PORT); - - if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) { - perror("bind"); - exit(1); - } - - if (listen(s, 1) < 0) { - perror("listen"); - exit(1); - } + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(TEST_PORT); + + if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) { + perror("cannot bind socket"); + exit(4); + } + + if (listen(s, 1) < 0) { + perror("cannot listen socket"); + exit(4); + } (void) mevent_add(s, EVF_READ, acceptor_callback, NULL); diff --git a/usr/src/cmd/bhyve/pci_e82545.c b/usr/src/cmd/bhyve/pci_e82545.c index 121c0fc773..3f5a6ef0c5 100644 --- a/usr/src/cmd/bhyve/pci_e82545.c +++ b/usr/src/cmd/bhyve/pci_e82545.c @@ -1,4 +1,6 @@ /* + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org> * Copyright (c) 2015 Peter Grehan <grehan@freebsd.org> * Copyright (c) 2013 Jeremiah Lott, Avere Systems @@ -345,8 +347,8 @@ struct e82545_softc { #define E82545_NVM_MODE_OPADDR 0x0 #define E82545_NVM_MODE_DATAIN 0x1 #define E82545_NVM_MODE_DATAOUT 0x2 - /* EEPROM data */ - uint16_t eeprom_data[E82545_NVM_EEPROM_SIZE]; + /* EEPROM data */ + uint16_t eeprom_data[E82545_NVM_EEPROM_SIZE]; }; static void e82545_reset(struct e82545_softc *sc, int dev); @@ -1495,7 +1497,7 @@ e82545_rx_disable(struct e82545_softc *sc) static void e82545_write_ra(struct e82545_softc *sc, int reg, uint32_t wval) { - struct eth_uni *eu; + struct eth_uni *eu; int idx; idx = reg >> 1; @@ -1521,7 +1523,7 @@ e82545_write_ra(struct e82545_softc *sc, int reg, uint32_t wval) static uint32_t e82545_read_ra(struct e82545_softc *sc, int reg) { - struct eth_uni *eu; + struct eth_uni *eu; uint32_t retval; int idx; @@ -1765,12 +1767,12 @@ e82545_read_register(struct e82545_softc *sc, uint32_t offset) { uint32_t retval; int ridx; - + if (offset & 0x3) { DPRINTF("Unaligned register read offset:0x%x\r\n", offset); return 0; } - + DPRINTF("Register read: 0x%x\r\n", offset); switch (offset) { @@ -2247,7 +2249,7 @@ e82545_open_tap(struct e82545_softc *sc, char *opts) sc->esc_tapfd = open(tbuf, O_RDWR); if (sc->esc_tapfd == -1) { DPRINTF("unable to open tap device %s\n", opts); - exit(1); + exit(4); } /* diff --git a/usr/src/cmd/bhyve/pci_emul.c b/usr/src/cmd/bhyve/pci_emul.c index 8af6a37498..03db632e37 100644 --- a/usr/src/cmd/bhyve/pci_emul.c +++ b/usr/src/cmd/bhyve/pci_emul.c @@ -250,6 +250,17 @@ done: return (error); } +void +pci_print_supported_devices() +{ + struct pci_devemu **pdpp, *pdp; + + SET_FOREACH(pdpp, pci_devemu_set) { + pdp = *pdpp; + printf("%s\n", pdp->pe_emu); + } +} + static int pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset) { @@ -882,7 +893,7 @@ msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, { uint16_t msgctrl, rwmask; int off; - + off = offset - capoff; /* Message Control Register */ if (off == 2 && bytes == 2) { @@ -895,8 +906,8 @@ msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE; pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK; pci_lintr_update(pi); - } - + } + CFGWRITE(pi, offset, val, bytes); } @@ -1355,11 +1366,11 @@ pci_bus_write_dsdt(int bus) dsdt_line("Name (PPRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_pirq_prt_entry, NULL); - dsdt_line("})"); + dsdt_line("})"); dsdt_line("Name (APRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_apic_prt_entry, NULL); - dsdt_line("})"); + dsdt_line("})"); dsdt_line("Method 
(_PRT, 0, NotSerialized)"); dsdt_line("{"); dsdt_line(" If (PICM)"); @@ -1750,7 +1761,7 @@ pci_emul_cmdsts_write(struct pci_devinst *pi, int coff, uint32_t new, int bytes) * interrupt. */ pci_lintr_update(pi); -} +} static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, diff --git a/usr/src/cmd/bhyve/pci_emul.h b/usr/src/cmd/bhyve/pci_emul.h index 0a1dd39f57..0053caed99 100644 --- a/usr/src/cmd/bhyve/pci_emul.h +++ b/usr/src/cmd/bhyve/pci_emul.h @@ -241,6 +241,7 @@ int pci_msix_table_bar(struct pci_devinst *pi); int pci_msix_pba_bar(struct pci_devinst *pi); int pci_msi_maxmsgnum(struct pci_devinst *pi); int pci_parse_slot(char *opt); +void pci_print_supported_devices(); void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr); int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum); int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, diff --git a/usr/src/cmd/bhyve/pci_fbuf.c b/usr/src/cmd/bhyve/pci_fbuf.c index 8478f6e531..5a04c41e54 100644 --- a/usr/src/cmd/bhyve/pci_fbuf.c +++ b/usr/src/cmd/bhyve/pci_fbuf.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Nahanni Systems, Inc. * Copyright 2018 Joyent, Inc. * All rights reserved. diff --git a/usr/src/cmd/bhyve/pci_irq.c b/usr/src/cmd/bhyve/pci_irq.c index 4ae9ff3582..4ecb3eddb0 100644 --- a/usr/src/cmd/bhyve/pci_irq.c +++ b/usr/src/cmd/bhyve/pci_irq.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Hudson River Trading LLC * Written by: John H. Baldwin <jhb@FreeBSD.org> * All rights reserved. diff --git a/usr/src/cmd/bhyve/pci_irq.h b/usr/src/cmd/bhyve/pci_irq.h index aa1a6c356b..1ae56efc8f 100644 --- a/usr/src/cmd/bhyve/pci_irq.h +++ b/usr/src/cmd/bhyve/pci_irq.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Hudson River Trading LLC * Written by: John H. Baldwin <jhb@FreeBSD.org> * All rights reserved. diff --git a/usr/src/cmd/bhyve/pci_lpc.c b/usr/src/cmd/bhyve/pci_lpc.c index 70bfed96f6..b7ddb772a1 100644 --- a/usr/src/cmd/bhyve/pci_lpc.c +++ b/usr/src/cmd/bhyve/pci_lpc.c @@ -118,6 +118,16 @@ done: return (error); } +void +lpc_print_supported_devices() +{ + size_t i; + + printf("bootrom\n"); + for (i = 0; i < LPC_UART_NUM; i++) + printf("%s\n", lpc_uart_names[i]); +} + const char * lpc_bootrom(void) { diff --git a/usr/src/cmd/bhyve/pci_lpc.h b/usr/src/cmd/bhyve/pci_lpc.h index 8cab52f372..9041f79c50 100644 --- a/usr/src/cmd/bhyve/pci_lpc.h +++ b/usr/src/cmd/bhyve/pci_lpc.h @@ -68,6 +68,7 @@ struct lpc_sysres { #define SYSRES_MEM(base, length) LPC_SYSRES(LPC_SYSRES_MEM, base, length) int lpc_device_parse(const char *opt); +void lpc_print_supported_devices(); char *lpc_pirq_name(int pin); void lpc_pirq_routed(void); const char *lpc_bootrom(void); diff --git a/usr/src/cmd/bhyve/pci_nvme.c b/usr/src/cmd/bhyve/pci_nvme.c new file mode 100644 index 0000000000..571f916a9d --- /dev/null +++ b/usr/src/cmd/bhyve/pci_nvme.c @@ -0,0 +1,1873 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2017 Shunsuke Mie + * Copyright (c) 2018 Leon Dang + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * bhyve PCIe-NVMe device emulation. + * + * options: + * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z + * + * accepted devpath: + * /dev/blockdev + * /path/to/image + * ram=size_in_MiB + * + * maxq = max number of queues + * qsz = max elements in each queue + * ioslots = max number of concurrent io requests + * sectsz = sector size (defaults to blockif sector size) + * ser = serial number (20-chars max) + * + */ + +/* TODO: + - create async event for smart and log + - intr coalesce + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#include <assert.h> +#include <pthread.h> +#include <semaphore.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <machine/atomic.h> +#include <machine/vmm.h> +#include <vmmapi.h> + +#include <dev/nvme/nvme.h> + +#include "bhyverun.h" +#include "block_if.h" +#include "pci_emul.h" + + +static int nvme_debug = 0; +#define DPRINTF(params) if (nvme_debug) printf params +#define WPRINTF(params) printf params + +/* defaults; can be overridden */ +#define NVME_MSIX_BAR 4 + +#define NVME_IOSLOTS 8 + +#define NVME_QUEUES 16 +#define NVME_MAX_QENTRIES 2048 + +#define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) +#define NVME_MAX_BLOCKIOVS 512 + +/* helpers */ + +#define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) + +enum nvme_controller_register_offsets { + NVME_CR_CAP_LOW = 0x00, + NVME_CR_CAP_HI = 0x04, + NVME_CR_VS = 0x08, + NVME_CR_INTMS = 0x0c, + NVME_CR_INTMC = 0x10, + NVME_CR_CC = 0x14, + NVME_CR_CSTS = 0x1c, + NVME_CR_NSSR = 0x20, + NVME_CR_AQA = 0x24, + NVME_CR_ASQ_LOW = 0x28, + NVME_CR_ASQ_HI = 0x2c, + NVME_CR_ACQ_LOW = 0x30, + NVME_CR_ACQ_HI = 0x34, +}; + +enum nvme_cmd_cdw11 { + NVME_CMD_CDW11_PC = 0x0001, + NVME_CMD_CDW11_IEN = 0x0002, + NVME_CMD_CDW11_IV = 0xFFFF0000, +}; + +#define NVME_CQ_INTEN 0x01 +#define NVME_CQ_INTCOAL 0x02 + +struct nvme_completion_queue { + struct nvme_completion *qbase; + uint32_t size; + uint16_t tail; /* nvme progress */ + uint16_t head; /* guest progress */ + uint16_t intr_vec; + uint32_t intr_en; + pthread_mutex_t mtx; +}; + +struct nvme_submission_queue { + struct nvme_command *qbase; + uint32_t size; + uint16_t head; /* nvme progress */ + uint16_t tail; /* guest progress */ + uint16_t cqid; /* completion queue id */ + int busy; /* queue is being processed */ + int qpriority; +}; + +enum nvme_storage_type { + NVME_STOR_BLOCKIF = 0, + NVME_STOR_RAM = 1, +}; + +struct 
pci_nvme_blockstore { + enum nvme_storage_type type; + void *ctx; + uint64_t size; + uint32_t sectsz; + uint32_t sectsz_bits; +}; + +struct pci_nvme_ioreq { + struct pci_nvme_softc *sc; + struct pci_nvme_ioreq *next; + struct nvme_submission_queue *nvme_sq; + uint16_t sqid; + + /* command information */ + uint16_t opc; + uint16_t cid; + uint32_t nsid; + + uint64_t prev_gpaddr; + size_t prev_size; + + /* + * lock if all iovs consumed (big IO); + * complete transaction before continuing + */ + pthread_mutex_t mtx; + pthread_cond_t cv; + + struct blockif_req io_req; + + /* pad to fit up to 512 page descriptors from guest IO request */ + struct iovec iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX]; +}; + +struct pci_nvme_softc { + struct pci_devinst *nsc_pi; + + pthread_mutex_t mtx; + + struct nvme_registers regs; + + struct nvme_namespace_data nsdata; + struct nvme_controller_data ctrldata; + + struct pci_nvme_blockstore nvstore; + + uint16_t max_qentries; /* max entries per queue */ + uint32_t max_queues; + uint32_t num_cqueues; + uint32_t num_squeues; + + struct pci_nvme_ioreq *ioreqs; + struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */ + uint32_t pending_ios; + uint32_t ioslots; + sem_t iosemlock; + + /* status and guest memory mapped queues */ + struct nvme_completion_queue *compl_queues; + struct nvme_submission_queue *submit_queues; + + /* controller features */ + uint32_t intr_coales_aggr_time; /* 0x08: uS to delay intr */ + uint32_t intr_coales_aggr_thresh; /* 0x08: compl-Q entries */ + uint32_t async_ev_config; /* 0x0B: async event config */ +}; + + +static void pci_nvme_io_partial(struct blockif_req *br, int err); + +/* Controller Configuration utils */ +#define NVME_CC_GET_EN(cc) \ + ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) +#define NVME_CC_GET_CSS(cc) \ + ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) +#define NVME_CC_GET_SHN(cc) \ + ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) +#define NVME_CC_GET_IOSQES(cc) \ + ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) +#define NVME_CC_GET_IOCQES(cc) \ + ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) + +#define NVME_CC_WRITE_MASK \ + ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ + (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ + (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) + +#define NVME_CC_NEN_WRITE_MASK \ + ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ + (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ + (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) + +/* Controller Status utils */ +#define NVME_CSTS_GET_RDY(sts) \ + ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) + +#define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) + +/* Completion Queue status word utils */ +#define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) +#define NVME_STATUS_MASK \ + ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ + (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) + +static __inline void +cpywithpad(char *dst, int dst_size, const char *src, char pad) +{ + int len = strnlen(src, dst_size); + memcpy(dst, src, len); + memset(dst + len, pad, dst_size - len); +} + +static __inline void +pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) +{ + + *status &= ~NVME_STATUS_MASK; + *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | + (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; +} + +static __inline void +pci_nvme_status_genc(uint16_t *status, uint16_t code) +{ + + pci_nvme_status_tc(status, NVME_SCT_GENERIC, 
code); +} + +static __inline void +pci_nvme_toggle_phase(uint16_t *status, int prev) +{ + + if (prev) + *status &= ~NVME_STATUS_P; + else + *status |= NVME_STATUS_P; +} + +static void +pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) +{ + struct nvme_controller_data *cd = &sc->ctrldata; + + cd->vid = 0xFB5D; + cd->ssvid = 0x0000; + + cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); + cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); + + /* Num of submission commands that we can handle at a time (2^rab) */ + cd->rab = 4; + + /* FreeBSD OUI */ + cd->ieee[0] = 0x58; + cd->ieee[1] = 0x9c; + cd->ieee[2] = 0xfc; + + cd->mic = 0; + + cd->mdts = 9; /* max data transfer size (2^mdts * CAP.MPSMIN) */ + + cd->ver = 0x00010300; + + cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; + cd->acl = 2; + cd->aerl = 4; + + cd->lpa = 0; /* TODO: support some simple things like SMART */ + cd->elpe = 0; /* max error log page entries */ + cd->npss = 1; /* number of power states support */ + + /* Warning Composite Temperature Threshold */ + cd->wctemp = 0x0157; + + cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | + (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); + cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | + (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); + cd->nn = 1; /* number of namespaces */ + + cd->fna = 0x03; + + cd->power_state[0].mp = 10; +} + +static void +pci_nvme_init_nsdata(struct pci_nvme_softc *sc) +{ + struct nvme_namespace_data *nd; + + nd = &sc->nsdata; + + nd->nsze = sc->nvstore.size / sc->nvstore.sectsz; + nd->ncap = nd->nsze; + nd->nuse = nd->nsze; + + /* Get LBA and backstore information from backing store */ + nd->nlbaf = 1; + /* LBA data-sz = 2^lbads */ + nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; + + nd->flbas = 0; +} + +static void +pci_nvme_reset_locked(struct pci_nvme_softc *sc) +{ + DPRINTF(("%s\r\n", __func__)); + + sc->regs.cap_lo = (sc->max_qentries & NVME_CAP_LO_REG_MQES_MASK) | + (1 << NVME_CAP_LO_REG_CQR_SHIFT) | + (60 << NVME_CAP_LO_REG_TO_SHIFT); + + sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; + + sc->regs.vs = 0x00010300; /* NVMe v1.3 */ + + sc->regs.cc = 0; + sc->regs.csts = 0; + + sc->num_cqueues = sc->num_squeues = sc->max_queues; + if (sc->submit_queues != NULL) { + for (int i = 0; i <= sc->max_queues; i++) { + /* + * The Admin Submission Queue is at index 0. + * It must not be changed at reset otherwise the + * emulation will be out of sync with the guest. 
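+ *
+ * For context, the reset handshake the guest drives (sketched from
+ * the NVMe 1.3 spec, not from this code) is:
+ *   1. write CC.EN = 0 and poll until CSTS.RDY clears;
+ *   2. reprogram AQA/ASQ/ACQ with the admin queue sizes and bases;
+ *   3. write CC.EN = 1, at which point the emulation remaps the
+ *      admin queues from the guest addresses supplied in step 2
+ *      (see pci_nvme_init_controller() below).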
+ */ + if (i != 0) { + sc->submit_queues[i].qbase = NULL; + sc->submit_queues[i].size = 0; + sc->submit_queues[i].cqid = 0; + + sc->compl_queues[i].qbase = NULL; + sc->compl_queues[i].size = 0; + } + sc->submit_queues[i].tail = 0; + sc->submit_queues[i].head = 0; + sc->submit_queues[i].busy = 0; + + sc->compl_queues[i].tail = 0; + sc->compl_queues[i].head = 0; + } + } else + sc->submit_queues = calloc(sc->max_queues + 1, + sizeof(struct nvme_submission_queue)); + + if (sc->compl_queues == NULL) { + sc->compl_queues = calloc(sc->max_queues + 1, + sizeof(struct nvme_completion_queue)); + + for (int i = 0; i <= sc->num_cqueues; i++) + pthread_mutex_init(&sc->compl_queues[i].mtx, NULL); + } +} + +static void +pci_nvme_reset(struct pci_nvme_softc *sc) +{ + pthread_mutex_lock(&sc->mtx); + pci_nvme_reset_locked(sc); + pthread_mutex_unlock(&sc->mtx); +} + +static void +pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) +{ + uint16_t acqs, asqs; + + DPRINTF(("%s\r\n", __func__)); + + asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1; + sc->submit_queues[0].size = asqs; + sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, + sizeof(struct nvme_command) * asqs); + + DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n", + __func__, sc->regs.asq, sc->submit_queues[0].qbase)); + + acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & + NVME_AQA_REG_ACQS_MASK) + 1; + sc->compl_queues[0].size = acqs; + sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, + sizeof(struct nvme_completion) * acqs); + DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n", + __func__, sc->regs.acq, sc->compl_queues[0].qbase)); +} + +static int +nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + uint16_t qid = command->cdw10 & 0xffff; + + DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid)); + if (qid == 0 || qid > sc->num_cqueues) { + WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n", + __func__, qid, sc->num_squeues)); + pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_INVALID_QUEUE_IDENTIFIER); + return (1); + } + + sc->submit_queues[qid].qbase = NULL; + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +static int +nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + if (command->cdw11 & NVME_CMD_CDW11_PC) { + uint16_t qid = command->cdw10 & 0xffff; + struct nvme_submission_queue *nsq; + + if (qid > sc->num_squeues) { + WPRINTF(("%s queue index %u > num_squeues %u\r\n", + __func__, qid, sc->num_squeues)); + pci_nvme_status_tc(&compl->status, + NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_INVALID_QUEUE_IDENTIFIER); + return (1); + } + + nsq = &sc->submit_queues[qid]; + nsq->size = ((command->cdw10 >> 16) & 0xffff) + 1; + + nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, + sizeof(struct nvme_command) * (size_t)nsq->size); + nsq->cqid = (command->cdw11 >> 16) & 0xffff; + nsq->qpriority = (command->cdw11 >> 1) & 0x03; + + DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__, + qid, nsq->size, nsq->qbase, nsq->cqid)); + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + + DPRINTF(("%s completed creating IOSQ qid %u\r\n", + __func__, qid)); + } else { + /* + * Guest sent non-cont submission queue request. + * This setting is unsupported by this emulation. 
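+ *
+ * For reference, the contiguous case handled above unpacks the
+ * Create I/O Submission Queue command as follows (layout per the
+ * NVMe 1.3 spec; this restates the shifts used in the code rather
+ * than adding behaviour):
+ *   cdw10[15:0]  QID   - index into sc->submit_queues[]
+ *   cdw10[31:16] QSIZE - zero-based, hence the "+ 1" on nsq->size
+ *   cdw11[0]     PC    - NVME_CMD_CDW11_PC, must be set in this path
+ *   cdw11[2:1]   QPRIO - stored in nsq->qpriority
+ *   cdw11[31:16] CQID  - bound completion queue, nsq->cqid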
+ */ + WPRINTF(("%s unsupported non-contig (list-based) " + "create i/o submission queue\r\n", __func__)); + + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + } + return (1); +} + +static int +nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + uint16_t qid = command->cdw10 & 0xffff; + + DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid)); + if (qid == 0 || qid > sc->num_cqueues) { + WPRINTF(("%s queue index %u / num_cqueues %u\r\n", + __func__, qid, sc->num_cqueues)); + pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_INVALID_QUEUE_IDENTIFIER); + return (1); + } + + sc->compl_queues[qid].qbase = NULL; + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +static int +nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + if (command->cdw11 & NVME_CMD_CDW11_PC) { + uint16_t qid = command->cdw10 & 0xffff; + struct nvme_completion_queue *ncq; + + if (qid > sc->num_cqueues) { + WPRINTF(("%s queue index %u > num_cqueues %u\r\n", + __func__, qid, sc->num_cqueues)); + pci_nvme_status_tc(&compl->status, + NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_INVALID_QUEUE_IDENTIFIER); + return (1); + } + + ncq = &sc->compl_queues[qid]; + ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; + ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; + ncq->size = ((command->cdw10 >> 16) & 0xffff) + 1; + + ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, + command->prp1, + sizeof(struct nvme_command) * (size_t)ncq->size); + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + } else { + /* + * Non-contig completion queue unsupported. + */ + WPRINTF(("%s unsupported non-contig (list-based) " + "create i/o completion queue\r\n", + __func__)); + + /* 0x12 = Invalid Use of Controller Memory Buffer */ + pci_nvme_status_genc(&compl->status, 0x12); + } + + return (1); +} + +static int +nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 2; + uint8_t logpage = command->cdw10 & 0xFF; +#ifdef __FreeBSD__ + void *data; +#else + /* Our compiler grumbles about this, despite it being OK */ + void *data = NULL; +#endif + + DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize)); + + if (logpage >= 1 && logpage <= 3) + data = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, + PAGE_SIZE); + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + + switch (logpage) { + case 0x01: /* Error information */ + memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize); + break; + case 0x02: /* SMART/Health information */ + /* TODO: present some smart info */ + memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize); + break; + case 0x03: /* Firmware slot information */ + memset(data, 0, logsize > PAGE_SIZE ? 
PAGE_SIZE : logsize); + break; + default: + WPRINTF(("%s get log page %x command not supported\r\n", + __func__, logpage)); + + pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_INVALID_LOG_PAGE); + } + + return (1); +} + +static int +nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + void *dest; + + DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__, + command->cdw10 & 0xFF, command->nsid)); + + switch (command->cdw10 & 0xFF) { + case 0x00: /* return Identify Namespace data structure */ + dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, + sizeof(sc->nsdata)); + memcpy(dest, &sc->nsdata, sizeof(sc->nsdata)); + break; + case 0x01: /* return Identify Controller data structure */ + dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, + sizeof(sc->ctrldata)); + memcpy(dest, &sc->ctrldata, sizeof(sc->ctrldata)); + break; + case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ + dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, + sizeof(uint32_t) * 1024); + ((uint32_t *)dest)[0] = 1; + ((uint32_t *)dest)[1] = 0; + break; + case 0x11: + pci_nvme_status_genc(&compl->status, + NVME_SC_INVALID_NAMESPACE_OR_FORMAT); + return (1); + case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ + case 0x10: + case 0x12: + case 0x13: + case 0x14: + case 0x15: + default: + DPRINTF(("%s unsupported identify command requested 0x%x\r\n", + __func__, command->cdw10 & 0xFF)); + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (1); + } + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +static int +nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + int feature = command->cdw10 & 0xFF; + uint32_t iv; + + DPRINTF(("%s feature 0x%x\r\n", __func__, feature)); + compl->cdw0 = 0; + + switch (feature) { + case NVME_FEAT_ARBITRATION: + DPRINTF((" arbitration 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_POWER_MANAGEMENT: + DPRINTF((" power management 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_LBA_RANGE_TYPE: + DPRINTF((" lba range 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_TEMPERATURE_THRESHOLD: + DPRINTF((" temperature threshold 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_ERROR_RECOVERY: + DPRINTF((" error recovery 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_VOLATILE_WRITE_CACHE: + DPRINTF((" volatile write cache 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_NUMBER_OF_QUEUES: + sc->num_squeues = command->cdw11 & 0xFFFF; + sc->num_cqueues = (command->cdw11 >> 16) & 0xFFFF; + DPRINTF((" number of queues (submit %u, completion %u)\r\n", + sc->num_squeues, sc->num_cqueues)); + + if (sc->num_squeues == 0 || sc->num_squeues > sc->max_queues) + sc->num_squeues = sc->max_queues; + if (sc->num_cqueues == 0 || sc->num_cqueues > sc->max_queues) + sc->num_cqueues = sc->max_queues; + + compl->cdw0 = (sc->num_squeues & 0xFFFF) | + ((sc->num_cqueues & 0xFFFF) << 16); + + break; + case NVME_FEAT_INTERRUPT_COALESCING: + DPRINTF((" interrupt coalescing 0x%x\r\n", command->cdw11)); + + /* in uS */ + sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100; + + sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF; + break; + case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: + iv = command->cdw11 & 0xFFFF; + + DPRINTF((" interrupt vector configuration 0x%x\r\n", + command->cdw11)); + + for (uint32_t i = 0; i <= sc->num_cqueues; i++) { + if 
(sc->compl_queues[i].intr_vec == iv) {
+				if (command->cdw11 & (1 << 16))
+					sc->compl_queues[i].intr_en |=
+					    NVME_CQ_INTCOAL;
+				else
+					sc->compl_queues[i].intr_en &=
+					    ~NVME_CQ_INTCOAL;
+			}
+		}
+		break;
+	case NVME_FEAT_WRITE_ATOMICITY:
+		DPRINTF((" write atomicity 0x%x\r\n", command->cdw11));
+		break;
+	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
+		DPRINTF((" async event configuration 0x%x\r\n",
+		    command->cdw11));
+		sc->async_ev_config = command->cdw11;
+		break;
+	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
+		DPRINTF((" software progress marker 0x%x\r\n",
+		    command->cdw11));
+		break;
+	case 0x0C:
+		DPRINTF((" autonomous power state transition 0x%x\r\n",
+		    command->cdw11));
+		break;
+	default:
+		WPRINTF(("%s invalid feature\r\n", __func__));
+		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
+		return (1);
+	}
+
+	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+	return (1);
+}
+
+static int
+nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
+	struct nvme_completion* compl)
+{
+	int feature = command->cdw10 & 0xFF;
+
+	DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
+
+	compl->cdw0 = 0;
+
+	switch (feature) {
+	case NVME_FEAT_ARBITRATION:
+		DPRINTF((" arbitration\r\n"));
+		break;
+	case NVME_FEAT_POWER_MANAGEMENT:
+		DPRINTF((" power management\r\n"));
+		break;
+	case NVME_FEAT_LBA_RANGE_TYPE:
+		DPRINTF((" lba range\r\n"));
+		break;
+	case NVME_FEAT_TEMPERATURE_THRESHOLD:
+		DPRINTF((" temperature threshold\r\n"));
+		switch ((command->cdw11 >> 20) & 0x3) {
+		case 0:
+			/* Over temp threshold */
+			compl->cdw0 = 0xFFFF;
+			break;
+		case 1:
+			/* Under temp threshold */
+			compl->cdw0 = 0;
+			break;
+		default:
+			WPRINTF((" invalid threshold type select\r\n"));
+			pci_nvme_status_genc(&compl->status,
+			    NVME_SC_INVALID_FIELD);
+			return (1);
+		}
+		break;
+	case NVME_FEAT_ERROR_RECOVERY:
+		DPRINTF((" error recovery\r\n"));
+		break;
+	case NVME_FEAT_VOLATILE_WRITE_CACHE:
+		DPRINTF((" volatile write cache\r\n"));
+		break;
+	case NVME_FEAT_NUMBER_OF_QUEUES:
+		compl->cdw0 = 0;
+		if (sc->num_squeues == 0)
+			compl->cdw0 |= sc->max_queues & 0xFFFF;
+		else
+			compl->cdw0 |= sc->num_squeues & 0xFFFF;
+
+		if (sc->num_cqueues == 0)
+			compl->cdw0 |= (sc->max_queues & 0xFFFF) << 16;
+		else
+			compl->cdw0 |= (sc->num_cqueues & 0xFFFF) << 16;
+
+		DPRINTF((" number of queues (submit %u, completion %u)\r\n",
+		    compl->cdw0 & 0xFFFF,
+		    (compl->cdw0 >> 16) & 0xFFFF));
+
+		break;
+	case NVME_FEAT_INTERRUPT_COALESCING:
+		DPRINTF((" interrupt coalescing\r\n"));
+		break;
+	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
+		DPRINTF((" interrupt vector configuration\r\n"));
+		break;
+	case NVME_FEAT_WRITE_ATOMICITY:
+		DPRINTF((" write atomicity\r\n"));
+		break;
+	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
+		DPRINTF((" async event configuration\r\n"));
+		/* Get Features reports, rather than sets, the value */
+		compl->cdw0 = sc->async_ev_config;
+		break;
+	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
+		DPRINTF((" software progress marker\r\n"));
+		break;
+	case 0x0C:
+		DPRINTF((" autonomous power state transition\r\n"));
+		break;
+	default:
+		WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature));
+		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
+		return (1);
+	}
+
+	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+	return (1);
+}
+
+static int
+nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
+	struct nvme_completion* compl)
+{
+	DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__,
+	    command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));
+
+	/* TODO: search for the command ID and
abort it */ + + compl->cdw0 = 1; + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +#ifdef __FreeBSD__ +static int +nvme_opc_async_event_req(struct pci_nvme_softc* sc, + struct nvme_command* command, struct nvme_completion* compl) +{ + DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11)); + + /* + * TODO: raise events when they happen based on the Set Features cmd. + * These events happen async, so only set completion successful if + * there is an event reflective of the request to get event. + */ + pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); + return (0); +} +#else +/* This is kept behind an ifdef while it's unused to appease the compiler. */ +#endif /* __FreeBSD__ */ + +static void +pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) +{ + struct nvme_completion compl; + struct nvme_command *cmd; + struct nvme_submission_queue *sq; + struct nvme_completion_queue *cq; + int do_intr = 0; + uint16_t sqhead; + + DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value)); + + sq = &sc->submit_queues[0]; + + sqhead = atomic_load_acq_short(&sq->head); + + if (atomic_testandset_int(&sq->busy, 1)) { + DPRINTF(("%s SQ busy, head %u, tail %u\r\n", + __func__, sqhead, sq->tail)); + return; + } + + DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail)); + + while (sqhead != atomic_load_acq_short(&sq->tail)) { + cmd = &(sq->qbase)[sqhead]; + compl.status = 0; + + switch (cmd->opc) { + case NVME_OPC_DELETE_IO_SQ: + DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__)); + do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl); + break; + case NVME_OPC_CREATE_IO_SQ: + DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__)); + do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl); + break; + case NVME_OPC_DELETE_IO_CQ: + DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__)); + do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl); + break; + case NVME_OPC_CREATE_IO_CQ: + DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__)); + do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl); + break; + case NVME_OPC_GET_LOG_PAGE: + DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__)); + do_intr |= nvme_opc_get_log_page(sc, cmd, &compl); + break; + case NVME_OPC_IDENTIFY: + DPRINTF(("%s command IDENTIFY\r\n", __func__)); + do_intr |= nvme_opc_identify(sc, cmd, &compl); + break; + case NVME_OPC_ABORT: + DPRINTF(("%s command ABORT\r\n", __func__)); + do_intr |= nvme_opc_abort(sc, cmd, &compl); + break; + case NVME_OPC_SET_FEATURES: + DPRINTF(("%s command SET_FEATURES\r\n", __func__)); + do_intr |= nvme_opc_set_features(sc, cmd, &compl); + break; + case NVME_OPC_GET_FEATURES: + DPRINTF(("%s command GET_FEATURES\r\n", __func__)); + do_intr |= nvme_opc_get_features(sc, cmd, &compl); + break; + case NVME_OPC_ASYNC_EVENT_REQUEST: + DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__)); + /* XXX dont care, unhandled for now + do_intr |= nvme_opc_async_event_req(sc, cmd, &compl); + */ + break; + default: + WPRINTF(("0x%x command is not implemented\r\n", + cmd->opc)); + } + + /* for now skip async event generation */ + if (cmd->opc != NVME_OPC_ASYNC_EVENT_REQUEST) { + struct nvme_completion *cp; + int phase; + + cq = &sc->compl_queues[0]; + + cp = &(cq->qbase)[cq->tail]; + cp->sqid = 0; + cp->sqhd = sqhead; + cp->cid = cmd->cid; + + phase = NVME_STATUS_GET_P(cp->status); + cp->status = compl.status; + pci_nvme_toggle_phase(&cp->status, phase); + + cq->tail = (cq->tail + 1) % cq->size; + } + sqhead = (sqhead + 1) % sq->size; + } + + 
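+	/*
+	 * Editorial sketch (not part of the original change): completions
+	 * posted in the loop above are discovered by the guest via the
+	 * phase bit alone, with no register read required. With
+	 * hypothetical names, a consumer would do roughly:
+	 *
+	 *	expected = 1;	phase expected on the first pass
+	 *	while (NVME_STATUS_GET_P(cq[head].status) == expected) {
+	 *		consume(&cq[head]);
+	 *		if (++head == cq_size) {
+	 *			head = 0;
+	 *			expected ^= 1;
+	 *		}
+	 *	}
+	 *
+	 * which is the counterpart of the pci_nvme_toggle_phase() call made
+	 * for each entry here.
+	 */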
DPRINTF(("setting sqhead %u\r\n", sqhead)); + atomic_store_short(&sq->head, sqhead); + atomic_store_int(&sq->busy, 0); + + if (do_intr) + pci_generate_msix(sc->nsc_pi, 0); + +} + +static int +pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req, + uint64_t gpaddr, size_t size, int do_write, uint64_t lba) +{ + int iovidx; + + if (req != NULL) { + /* concatenate contig block-iovs to minimize number of iovs */ + if ((req->prev_gpaddr + req->prev_size) == gpaddr) { + iovidx = req->io_req.br_iovcnt - 1; + + req->io_req.br_iov[iovidx].iov_base = + paddr_guest2host(req->sc->nsc_pi->pi_vmctx, + req->prev_gpaddr, size); + + req->prev_size += size; + req->io_req.br_resid += size; + + req->io_req.br_iov[iovidx].iov_len = req->prev_size; + } else { + pthread_mutex_lock(&req->mtx); + + iovidx = req->io_req.br_iovcnt; + if (iovidx == NVME_MAX_BLOCKIOVS) { + int err = 0; + + DPRINTF(("large I/O, doing partial req\r\n")); + + iovidx = 0; + req->io_req.br_iovcnt = 0; + + req->io_req.br_callback = pci_nvme_io_partial; + + if (!do_write) + err = blockif_read(sc->nvstore.ctx, + &req->io_req); + else +#ifdef __FreeBSD__ + err = blockif_write(sc->nvstore.ctx, + &req->io_req); +#else + err = blockif_write(sc->nvstore.ctx, + &req->io_req, B_FALSE); + /* + * XXX: Is a follow-up needed for proper sync + * detection here or later flush behavior? + */ +#endif + + /* wait until req completes before cont */ + if (err == 0) + pthread_cond_wait(&req->cv, &req->mtx); + } + if (iovidx == 0) { + req->io_req.br_offset = lba; + req->io_req.br_resid = 0; + req->io_req.br_param = req; + } + + req->io_req.br_iov[iovidx].iov_base = + paddr_guest2host(req->sc->nsc_pi->pi_vmctx, + gpaddr, size); + + req->io_req.br_iov[iovidx].iov_len = size; + + req->prev_gpaddr = gpaddr; + req->prev_size = size; + req->io_req.br_resid += size; + + req->io_req.br_iovcnt++; + + pthread_mutex_unlock(&req->mtx); + } + } else { + /* RAM buffer: read/write directly */ + void *p = sc->nvstore.ctx; + void *gptr; + + if ((lba + size) > sc->nvstore.size) { + WPRINTF(("%s write would overflow RAM\r\n", __func__)); + return (-1); + } + + p = (void *)((uintptr_t)p + (uintptr_t)lba); + gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size); + if (do_write) + memcpy(p, gptr, size); + else + memcpy(gptr, p, size); + } + return (0); +} + +static void +pci_nvme_set_completion(struct pci_nvme_softc *sc, + struct nvme_submission_queue *sq, int sqid, uint16_t cid, + uint32_t cdw0, uint16_t status, int ignore_busy) +{ + struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; + struct nvme_completion *compl; + int do_intr = 0; + int phase; + + DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n", + __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), + NVME_STATUS_GET_SC(status))); + + pthread_mutex_lock(&cq->mtx); + + assert(cq->qbase != NULL); + + compl = &cq->qbase[cq->tail]; + + compl->sqhd = atomic_load_acq_short(&sq->head); + compl->sqid = sqid; + compl->cid = cid; + + // toggle phase + phase = NVME_STATUS_GET_P(compl->status); + compl->status = status; + pci_nvme_toggle_phase(&compl->status, phase); + + cq->tail = (cq->tail + 1) % cq->size; + + if (cq->intr_en & NVME_CQ_INTEN) + do_intr = 1; + + pthread_mutex_unlock(&cq->mtx); + + if (ignore_busy || !atomic_load_acq_int(&sq->busy)) + if (do_intr) + pci_generate_msix(sc->nsc_pi, cq->intr_vec); +} + +static void +pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) +{ + req->sc = NULL; + req->nvme_sq = NULL; + req->sqid = 0; + + 
pthread_mutex_lock(&sc->mtx);
+
+	req->next = sc->ioreqs_free;
+	sc->ioreqs_free = req;
+	sc->pending_ios--;
+
+	/* when no more IO pending, can set to ready if device reset/enabled */
+	if (sc->pending_ios == 0 &&
+	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
+		sc->regs.csts |= NVME_CSTS_RDY;
+
+	pthread_mutex_unlock(&sc->mtx);
+
+	sem_post(&sc->iosemlock);
+}
+
+static struct pci_nvme_ioreq *
+pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
+{
+	struct pci_nvme_ioreq *req = NULL;
+
+	sem_wait(&sc->iosemlock);
+	pthread_mutex_lock(&sc->mtx);
+
+	req = sc->ioreqs_free;
+	assert(req != NULL);
+
+	sc->ioreqs_free = req->next;
+
+	req->next = NULL;
+	req->sc = sc;
+
+	sc->pending_ios++;
+
+	pthread_mutex_unlock(&sc->mtx);
+
+	req->io_req.br_iovcnt = 0;
+	req->io_req.br_offset = 0;
+	req->io_req.br_resid = 0;
+	req->io_req.br_param = req;
+	req->prev_gpaddr = 0;
+	req->prev_size = 0;
+
+	return req;
+}
+
+static void
+pci_nvme_io_done(struct blockif_req *br, int err)
+{
+	struct pci_nvme_ioreq *req = br->br_param;
+	struct nvme_submission_queue *sq = req->nvme_sq;
+	uint16_t code, status = 0;
+
+	DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
+
+	/* TODO return correct error */
+	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
+	pci_nvme_status_genc(&status, code);
+
+	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
+	pci_nvme_release_ioreq(req->sc, req);
+}
+
+static void
+pci_nvme_io_partial(struct blockif_req *br, int err)
+{
+	struct pci_nvme_ioreq *req = br->br_param;
+
+	DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
+
+	pthread_cond_signal(&req->cv);
+}
+
+
+static void
+pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
+{
+	struct nvme_submission_queue *sq;
+	uint16_t status = 0;
+	uint16_t sqhead;
+	int err;
+
+	/* handle all submissions up to sq->tail index */
+	sq = &sc->submit_queues[idx];
+
+	if (atomic_testandset_int(&sq->busy, 1)) {
+		DPRINTF(("%s sqid %u busy\r\n", __func__, idx));
+		return;
+	}
+
+	sqhead = atomic_load_acq_short(&sq->head);
+
+	DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n",
+	    idx, sqhead, sq->tail, sq->qbase));
+
+	while (sqhead != atomic_load_acq_short(&sq->tail)) {
+		struct nvme_command *cmd;
+		struct pci_nvme_ioreq *req = NULL;
+		uint64_t lba;
+		uint64_t nblocks, bytes, size, cpsz;
+
+		/* TODO: support scatter gather list handling */
+
+		cmd = &sq->qbase[sqhead];
+		sqhead = (sqhead + 1) % sq->size;
+
+		lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
+
+		if (cmd->opc == NVME_OPC_FLUSH) {
+			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
+			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
+			    status, 1);
+
+			continue;
+		} else if (cmd->opc == 0x08) {
+			/* TODO: write zeroes */
+			WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n",
+			    __func__, lba, cmd->cdw12 & 0xFFFF));
+			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
+			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
+			    status, 1);
+
+			continue;
+		}
+
+		nblocks = (cmd->cdw12 & 0xFFFF) + 1;
+
+		bytes = nblocks * sc->nvstore.sectsz;
+
+		if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
+			req = pci_nvme_get_ioreq(sc);
+			req->nvme_sq = sq;
+			req->sqid = idx;
+		}
+
+		/*
+		 * If data starts mid-page and flows into the next page, then
+		 * increase page count
+		 */
+
+		DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
+		    "(%lu-bytes)\r\n",
+		    sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
+		    cmd->opc == NVME_OPC_WRITE ?
+ "WRITE" : "READ", + lba, nblocks, bytes)); + + cmd->prp1 &= ~(0x03UL); + cmd->prp2 &= ~(0x03UL); + + DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2)); + + size = bytes; + lba *= sc->nvstore.sectsz; + + cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE); + + if (cpsz > bytes) + cpsz = bytes; + + if (req != NULL) { + req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) | + cmd->cdw10; + req->opc = cmd->opc; + req->cid = cmd->cid; + req->nsid = cmd->nsid; + } + + err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz, + cmd->opc == NVME_OPC_WRITE, lba); + lba += cpsz; + size -= cpsz; + + if (size == 0) + goto iodone; + + if (size <= PAGE_SIZE) { + /* prp2 is second (and final) page in transfer */ + + err = pci_nvme_append_iov_req(sc, req, cmd->prp2, + size, + cmd->opc == NVME_OPC_WRITE, + lba); + } else { + uint64_t *prp_list; + int i; + + /* prp2 is pointer to a physical region page list */ + prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx, + cmd->prp2, PAGE_SIZE); + + i = 0; + while (size != 0) { + cpsz = MIN(size, PAGE_SIZE); + + /* + * Move to linked physical region page list + * in last item. + */ + if (i == (NVME_PRP2_ITEMS-1) && + size > PAGE_SIZE) { + assert((prp_list[i] & (PAGE_SIZE-1)) == 0); + prp_list = paddr_guest2host( + sc->nsc_pi->pi_vmctx, + prp_list[i], PAGE_SIZE); + i = 0; + } + if (prp_list[i] == 0) { + WPRINTF(("PRP2[%d] = 0 !!!\r\n", i)); + err = 1; + break; + } + + err = pci_nvme_append_iov_req(sc, req, + prp_list[i], cpsz, + cmd->opc == NVME_OPC_WRITE, lba); + if (err) + break; + + lba += cpsz; + size -= cpsz; + i++; + } + } + +iodone: + if (sc->nvstore.type == NVME_STOR_RAM) { + uint16_t code, status = 0; + + code = err ? NVME_SC_LBA_OUT_OF_RANGE : + NVME_SC_SUCCESS; + pci_nvme_status_genc(&status, code); + + pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, + status, 1); + + continue; + } + + + if (err) + goto do_error; + + req->io_req.br_callback = pci_nvme_io_done; + + err = 0; + switch (cmd->opc) { + case NVME_OPC_READ: + err = blockif_read(sc->nvstore.ctx, &req->io_req); + break; + case NVME_OPC_WRITE: +#ifdef __FreeBSD__ + err = blockif_write(sc->nvstore.ctx, &req->io_req); +#else + /* XXX: Should this be sync? */ + err = blockif_write(sc->nvstore.ctx, &req->io_req, + B_FALSE); +#endif + break; + default: + WPRINTF(("%s unhandled io command 0x%x\r\n", + __func__, cmd->opc)); + err = 1; + } + +do_error: + if (err) { + uint16_t status = 0; + + pci_nvme_status_genc(&status, + NVME_SC_DATA_TRANSFER_ERROR); + + pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, + status, 1); + pci_nvme_release_ioreq(sc, req); + } + } + + atomic_store_short(&sq->head, sqhead); + atomic_store_int(&sq->busy, 0); +} + +static void +pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc, + uint64_t idx, int is_sq, uint64_t value) +{ + DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n", + idx, is_sq ? 
"SQ" : "CQ", value & 0xFFFF)); + + if (is_sq) { + atomic_store_short(&sc->submit_queues[idx].tail, + (uint16_t)value); + + if (idx == 0) { + pci_nvme_handle_admin_cmd(sc, value); + } else { + /* submission queue; handle new entries in SQ */ + if (idx > sc->num_squeues) { + WPRINTF(("%s SQ index %lu overflow from " + "guest (max %u)\r\n", + __func__, idx, sc->num_squeues)); + return; + } + pci_nvme_handle_io_cmd(sc, (uint16_t)idx); + } + } else { + if (idx > sc->num_cqueues) { + WPRINTF(("%s queue index %lu overflow from " + "guest (max %u)\r\n", + __func__, idx, sc->num_cqueues)); + return; + } + + sc->compl_queues[idx].head = (uint16_t)value; + } +} + +static void +pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) +{ + const char *s = iswrite ? "WRITE" : "READ"; + + switch (offset) { + case NVME_CR_CAP_LOW: + DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, s)); + break; + case NVME_CR_CAP_HI: + DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, s)); + break; + case NVME_CR_VS: + DPRINTF(("%s %s NVME_CR_VS\r\n", func, s)); + break; + case NVME_CR_INTMS: + DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, s)); + break; + case NVME_CR_INTMC: + DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, s)); + break; + case NVME_CR_CC: + DPRINTF(("%s %s NVME_CR_CC\r\n", func, s)); + break; + case NVME_CR_CSTS: + DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, s)); + break; + case NVME_CR_NSSR: + DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, s)); + break; + case NVME_CR_AQA: + DPRINTF(("%s %s NVME_CR_AQA\r\n", func, s)); + break; + case NVME_CR_ASQ_LOW: + DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, s)); + break; + case NVME_CR_ASQ_HI: + DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, s)); + break; + case NVME_CR_ACQ_LOW: + DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, s)); + break; + case NVME_CR_ACQ_HI: + DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, s)); + break; + default: + DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset)); + } + +} + +static void +pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, + uint64_t offset, int size, uint64_t value) +{ + uint32_t ccreg; + + if (offset >= NVME_DOORBELL_OFFSET) { + uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; + uint64_t idx = belloffset / 8; /* door bell size = 2*int */ + int is_sq = (belloffset % 8) < 4; + + if (belloffset > ((sc->max_queues+1) * 8 - 4)) { + WPRINTF(("guest attempted an overflow write offset " + "0x%lx, val 0x%lx in %s", + offset, value, __func__)); + return; + } + + pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); + return; + } + + DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n", + offset, size, value)); + + if (size != 4) { + WPRINTF(("guest wrote invalid size %d (offset 0x%lx, " + "val 0x%lx) to bar0 in %s", + size, offset, value, __func__)); + /* TODO: shutdown device */ + return; + } + + pci_nvme_bar0_reg_dumps(__func__, offset, 1); + + pthread_mutex_lock(&sc->mtx); + + switch (offset) { + case NVME_CR_CAP_LOW: + case NVME_CR_CAP_HI: + /* readonly */ + break; + case NVME_CR_VS: + /* readonly */ + break; + case NVME_CR_INTMS: + /* MSI-X, so ignore */ + break; + case NVME_CR_INTMC: + /* MSI-X, so ignore */ + break; + case NVME_CR_CC: + ccreg = (uint32_t)value; + + DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u " + "iocqes %u\r\n", + __func__, + NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), + NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), + NVME_CC_GET_IOCQES(ccreg))); + + if (NVME_CC_GET_SHN(ccreg)) { + /* perform shutdown - flush out data to backend */ + sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << + 
NVME_CSTS_REG_SHST_SHIFT); + sc->regs.csts |= NVME_SHST_COMPLETE << + NVME_CSTS_REG_SHST_SHIFT; + } + if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { + if (NVME_CC_GET_EN(ccreg) == 0) + /* transition 1-> causes controller reset */ + pci_nvme_reset_locked(sc); + else + pci_nvme_init_controller(ctx, sc); + } + + /* Insert the iocqes, iosqes and en bits from the write */ + sc->regs.cc &= ~NVME_CC_WRITE_MASK; + sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; + if (NVME_CC_GET_EN(ccreg) == 0) { + /* Insert the ams, mps and css bit fields */ + sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; + sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; + sc->regs.csts &= ~NVME_CSTS_RDY; + } else if (sc->pending_ios == 0) { + sc->regs.csts |= NVME_CSTS_RDY; + } + break; + case NVME_CR_CSTS: + break; + case NVME_CR_NSSR: + /* ignore writes; don't support subsystem reset */ + break; + case NVME_CR_AQA: + sc->regs.aqa = (uint32_t)value; + break; + case NVME_CR_ASQ_LOW: + sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | + (0xFFFFF000 & value); + break; + case NVME_CR_ASQ_HI: + sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | + (value << 32); + break; + case NVME_CR_ACQ_LOW: + sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | + (0xFFFFF000 & value); + break; + case NVME_CR_ACQ_HI: + sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | + (value << 32); + break; + default: + DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n", + __func__, offset, value, size)); + } + pthread_mutex_unlock(&sc->mtx); +} + +static void +pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct pci_nvme_softc* sc = pi->pi_arg; + + if (baridx == pci_msix_table_bar(pi) || + baridx == pci_msix_pba_bar(pi)) { + DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, " + " value 0x%lx\r\n", baridx, offset, size, value)); + + pci_emul_msix_twrite(pi, offset, size, value); + return; + } + + switch (baridx) { + case 0: + pci_nvme_write_bar_0(ctx, sc, offset, size, value); + break; + + default: + DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n", + __func__, baridx, value)); + } +} + +static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, + uint64_t offset, int size) +{ + uint64_t value; + + pci_nvme_bar0_reg_dumps(__func__, offset, 0); + + if (offset < NVME_DOORBELL_OFFSET) { + void *p = &(sc->regs); + pthread_mutex_lock(&sc->mtx); + memcpy(&value, (void *)((uintptr_t)p + offset), size); + pthread_mutex_unlock(&sc->mtx); + } else { + value = 0; + WPRINTF(("pci_nvme: read invalid offset %ld\r\n", offset)); + } + + switch (size) { + case 1: + value &= 0xFF; + break; + case 2: + value &= 0xFFFF; + break; + case 4: + value &= 0xFFFFFFFF; + break; + } + + DPRINTF((" nvme-read offset 0x%lx, size %d -> value 0x%x\r\n", + offset, size, (uint32_t)value)); + + return (value); +} + + + +static uint64_t +pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + struct pci_nvme_softc* sc = pi->pi_arg; + + if (baridx == pci_msix_table_bar(pi) || + baridx == pci_msix_pba_bar(pi)) { + DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n", + baridx, offset, size)); + + return pci_emul_msix_tread(pi, offset, size); + } + + switch (baridx) { + case 0: + return pci_nvme_read_bar_0(sc, offset, size); + + default: + DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset)); + } + + return (0); +} + + +static int +pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts) +{ + char 
bident[sizeof("XX:X:X")]; + char *uopt, *xopts, *config; + uint32_t sectsz; + int optidx; + + sc->max_queues = NVME_QUEUES; + sc->max_qentries = NVME_MAX_QENTRIES; + sc->ioslots = NVME_IOSLOTS; + sc->num_squeues = sc->max_queues; + sc->num_cqueues = sc->max_queues; + sectsz = 0; + + uopt = strdup(opts); + optidx = 0; + snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), + "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); + for (xopts = strtok(uopt, ","); + xopts != NULL; + xopts = strtok(NULL, ",")) { + + if ((config = strchr(xopts, '=')) != NULL) + *config++ = '\0'; + + if (!strcmp("maxq", xopts)) { + sc->max_queues = atoi(config); + } else if (!strcmp("qsz", xopts)) { + sc->max_qentries = atoi(config); + } else if (!strcmp("ioslots", xopts)) { + sc->ioslots = atoi(config); + } else if (!strcmp("sectsz", xopts)) { + sectsz = atoi(config); + } else if (!strcmp("ser", xopts)) { + /* + * This field indicates the Product Serial Number in + * 7-bit ASCII, unused bytes should be space characters. + * Ref: NVMe v1.3c. + */ + cpywithpad((char *)sc->ctrldata.sn, + sizeof(sc->ctrldata.sn), config, ' '); + } else if (!strcmp("ram", xopts)) { + uint64_t sz = strtoull(&xopts[4], NULL, 10); + + sc->nvstore.type = NVME_STOR_RAM; + sc->nvstore.size = sz * 1024 * 1024; + sc->nvstore.ctx = calloc(1, sc->nvstore.size); + sc->nvstore.sectsz = 4096; + sc->nvstore.sectsz_bits = 12; + if (sc->nvstore.ctx == NULL) { + perror("Unable to allocate RAM"); + free(uopt); + return (-1); + } + } else if (optidx == 0) { + snprintf(bident, sizeof(bident), "%d:%d", + sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); + sc->nvstore.ctx = blockif_open(xopts, bident); + if (sc->nvstore.ctx == NULL) { + perror("Could not open backing file"); + free(uopt); + return (-1); + } + sc->nvstore.type = NVME_STOR_BLOCKIF; + sc->nvstore.size = blockif_size(sc->nvstore.ctx); + } else { + fprintf(stderr, "Invalid option %s\n", xopts); + free(uopt); + return (-1); + } + + optidx++; + } + free(uopt); + + if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) { + fprintf(stderr, "backing store not specified\n"); + return (-1); + } + if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) + sc->nvstore.sectsz = sectsz; + else if (sc->nvstore.type != NVME_STOR_RAM) + sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); + for (sc->nvstore.sectsz_bits = 9; + (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; + sc->nvstore.sectsz_bits++); + + if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) + sc->max_queues = NVME_QUEUES; + + if (sc->max_qentries <= 0) { + fprintf(stderr, "Invalid qsz option\n"); + return (-1); + } + if (sc->ioslots <= 0) { + fprintf(stderr, "Invalid ioslots option\n"); + return (-1); + } + + return (0); +} + +static int +pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct pci_nvme_softc *sc; + uint32_t pci_membar_sz; + int error; + + error = 0; + + sc = calloc(1, sizeof(struct pci_nvme_softc)); + pi->pi_arg = sc; + sc->nsc_pi = pi; + + error = pci_nvme_parse_opts(sc, opts); + if (error < 0) + goto done; + else + error = 0; + + sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); + for (int i = 0; i < sc->ioslots; i++) { + if (i < (sc->ioslots-1)) + sc->ioreqs[i].next = &sc->ioreqs[i+1]; + pthread_mutex_init(&sc->ioreqs[i].mtx, NULL); + pthread_cond_init(&sc->ioreqs[i].cv, NULL); + } + sc->ioreqs_free = sc->ioreqs; + sc->intr_coales_aggr_thresh = 1; + + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); + pci_set_cfgdata8(pi, PCIR_CLASS, 
PCIC_STORAGE); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); + pci_set_cfgdata8(pi, PCIR_PROGIF, + PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); + + /* allocate size of nvme registers + doorbell space for all queues */ + pci_membar_sz = sizeof(struct nvme_registers) + + 2*sizeof(uint32_t)*(sc->max_queues); + + DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz)); + + error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); + if (error) { + WPRINTF(("%s pci alloc mem bar failed\r\n", __func__)); + goto done; + } + + error = pci_emul_add_msixcap(pi, sc->max_queues, NVME_MSIX_BAR); + if (error) { + WPRINTF(("%s pci add msixcap failed\r\n", __func__)); + goto done; + } + + pthread_mutex_init(&sc->mtx, NULL); + sem_init(&sc->iosemlock, 0, sc->ioslots); + + pci_nvme_reset(sc); + pci_nvme_init_ctrldata(sc); + pci_nvme_init_nsdata(sc); + + pci_lintr_request(pi); + +done: + return (error); +} + + +struct pci_devemu pci_de_nvme = { + .pe_emu = "nvme", + .pe_init = pci_nvme_init, + .pe_barwrite = pci_nvme_write, + .pe_barread = pci_nvme_read +}; +PCI_EMUL_SET(pci_de_nvme); diff --git a/usr/src/cmd/bhyve/pci_virtio_block.c b/usr/src/cmd/bhyve/pci_virtio_block.c index d2f6ac7785..d272a96d71 100644 --- a/usr/src/cmd/bhyve/pci_virtio_block.c +++ b/usr/src/cmd/bhyve/pci_virtio_block.c @@ -127,9 +127,9 @@ struct virtio_blk_hdr { #define VBH_OP_WRITE 1 #define VBH_OP_FLUSH 4 #define VBH_OP_FLUSH_OUT 5 -#define VBH_OP_IDENT 8 +#define VBH_OP_IDENT 8 #define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */ - uint32_t vbh_type; + uint32_t vbh_type; uint32_t vbh_ioprio; uint64_t vbh_sector; } __packed; @@ -143,8 +143,8 @@ static int pci_vtblk_debug; struct pci_vtblk_ioreq { struct blockif_req io_req; - struct pci_vtblk_softc *io_sc; - uint8_t *io_status; + struct pci_vtblk_softc *io_sc; + uint8_t *io_status; uint16_t io_idx; }; @@ -169,7 +169,7 @@ static int pci_vtblk_cfgwrite(void *, int, int, uint32_t); static struct virtio_consts vtblk_vi_consts = { "vtblk", /* our name */ 1, /* we support 1 virtqueue */ - sizeof(struct vtblk_config), /* config reg size */ + sizeof(struct vtblk_config), /* config reg size */ pci_vtblk_reset, /* reset */ pci_vtblk_notify, /* device-wide qnotify */ pci_vtblk_cfgread, /* read PCI config */ @@ -275,7 +275,7 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) } io->io_req.br_resid = iolen; - DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld\n\r", + DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld\n\r", writeop ? "write" : "read/ident", iolen, i - 1, io->io_req.br_offset)); @@ -340,7 +340,7 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) */ snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func); bctxt = blockif_open(opts, bident); - if (bctxt == NULL) { + if (bctxt == NULL) { perror("Could not open backing file"); return (1); } @@ -374,7 +374,7 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) */ MD5Init(&mdctx); MD5Update(&mdctx, opts, strlen(opts)); - MD5Final(digest, &mdctx); + MD5Final(digest, &mdctx); sprintf(sc->vbsc_ident, "BHYVE-%02X%02X-%02X%02X-%02X%02X", digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); diff --git a/usr/src/cmd/bhyve/pci_virtio_console.c b/usr/src/cmd/bhyve/pci_virtio_console.c index c4ee10d53a..e1448780f1 100644 --- a/usr/src/cmd/bhyve/pci_virtio_console.c +++ b/usr/src/cmd/bhyve/pci_virtio_console.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2016 iXsystems Inc. 
* All rights reserved. * @@ -316,7 +318,7 @@ pci_vtcon_sock_add(struct pci_vtcon_softc *sc, const char *name, sun.sun_family = AF_UNIX; sun.sun_len = sizeof(struct sockaddr_un); strcpy(pathcopy, path); - strncpy(sun.sun_path, basename(pathcopy), sizeof(sun.sun_path)); + strlcpy(sun.sun_path, basename(pathcopy), sizeof(sun.sun_path)); free(pathcopy); if (bindat(fd, s, (struct sockaddr *)&sun, sun.sun_len) < 0) { @@ -326,7 +328,7 @@ pci_vtcon_sock_add(struct pci_vtcon_softc *sc, const char *name, #else /* __FreeBSD__ */ /* Do a simple bind rather than the FreeBSD bindat() */ addr.sun_family = AF_UNIX; - (void) strncpy(addr.sun_path, path, sizeof (addr.sun_path)); + (void) strlcpy(addr.sun_path, path, sizeof (addr.sun_path)); if (bind(fd, (struct sockaddr *)&addr, sizeof (addr)) < 0) { error = -1; goto out; @@ -594,22 +596,15 @@ pci_vtcon_notify_tx(void *vsc, struct vqueue_info *vq) struct pci_vtcon_softc *sc; struct pci_vtcon_port *port; struct iovec iov[1]; -#ifdef __FreeBSD__ uint16_t idx, n; -#else - uint16_t idx; -#endif uint16_t flags[8]; sc = vsc; port = pci_vtcon_vq_to_port(sc, vq); while (vq_has_descs(vq)) { -#ifdef __FreeBSD__ n = vq_getchain(vq, &idx, iov, 1, flags); -#else - vq_getchain(vq, &idx, iov, 1, flags); -#endif + assert(n >= 1); if (port != NULL) port->vsp_cb(port, port->vsp_arg, iov, 1); @@ -681,7 +676,7 @@ pci_vtcon_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) while ((opt = strsep(&opts, ",")) != NULL) { portname = strsep(&opt, "="); - portpath = strdup(opt); + portpath = opt; /* create port */ if (pci_vtcon_sock_add(sc, portname, portpath) < 0) { diff --git a/usr/src/cmd/bhyve/pci_virtio_net.c b/usr/src/cmd/bhyve/pci_virtio_net.c index a3fe72474b..f5eadf4a2c 100644 --- a/usr/src/cmd/bhyve/pci_virtio_net.c +++ b/usr/src/cmd/bhyve/pci_virtio_net.c @@ -822,24 +822,24 @@ pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq) static int pci_vtnet_parsemac(char *mac_str, uint8_t *mac_addr) { - struct ether_addr *ea; - char *tmpstr; - char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 }; + struct ether_addr *ea; + char *tmpstr; + char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 }; - tmpstr = strsep(&mac_str,"="); - - if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) { - ea = ether_aton(mac_str); + tmpstr = strsep(&mac_str,"="); - if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) || - memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) { + if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) { + ea = ether_aton(mac_str); + + if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) || + memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) { fprintf(stderr, "Invalid MAC %s\n", mac_str); - return (EINVAL); - } else - memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN); - } + return (EINVAL); + } else + memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN); + } - return (0); + return (0); } #endif /* __FreeBSD__ */ @@ -1104,8 +1104,9 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) pthread_mutex_init(&sc->tx_mtx, NULL); pthread_cond_init(&sc->tx_cond, NULL); pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc); - snprintf(tname, sizeof(tname), "%s vtnet%d tx", vmname, pi->pi_slot); - pthread_set_name_np(sc->tx_tid, tname); + snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot, + pi->pi_func); + pthread_set_name_np(sc->tx_tid, tname); return (0); } diff --git a/usr/src/cmd/bhyve/pci_virtio_rnd.c b/usr/src/cmd/bhyve/pci_virtio_rnd.c index 4ce749053c..44bc55e003 100644 --- a/usr/src/cmd/bhyve/pci_virtio_rnd.c +++ b/usr/src/cmd/bhyve/pci_virtio_rnd.c @@ 
-1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Nahanni Systems Inc. * All rights reserved. * diff --git a/usr/src/cmd/bhyve/pci_virtio_scsi.c b/usr/src/cmd/bhyve/pci_virtio_scsi.c new file mode 100644 index 0000000000..aa906bb854 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_virtio_scsi.c @@ -0,0 +1,718 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Jakub Klama <jceel@FreeBSD.org>. + * Copyright (c) 2018 Marcelo Araujo <araujo@FreeBSD.org>. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/linker_set.h> +#include <sys/types.h> +#include <sys/uio.h> +#include <sys/time.h> +#include <sys/queue.h> +#include <sys/sbuf.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> +#include <assert.h> +#include <pthread.h> +#include <pthread_np.h> + +#include <cam/scsi/scsi_all.h> +#include <cam/scsi/scsi_message.h> +#include <cam/ctl/ctl.h> +#include <cam/ctl/ctl_io.h> +#include <cam/ctl/ctl_backend.h> +#include <cam/ctl/ctl_ioctl.h> +#include <cam/ctl/ctl_util.h> +#include <cam/ctl/ctl_scsi_all.h> +#include <camlib.h> + +#include "bhyverun.h" +#include "pci_emul.h" +#include "virtio.h" +#include "iov.h" + +#define VTSCSI_RINGSZ 64 +#define VTSCSI_REQUESTQ 1 +#define VTSCSI_THR_PER_Q 16 +#define VTSCSI_MAXQ (VTSCSI_REQUESTQ + 2) +#define VTSCSI_MAXSEG 64 + +#define VTSCSI_IN_HEADER_LEN(_sc) \ + (sizeof(struct pci_vtscsi_req_cmd_rd) + _sc->vss_config.cdb_size) + +#define VTSCSI_OUT_HEADER_LEN(_sc) \ + (sizeof(struct pci_vtscsi_req_cmd_wr) + _sc->vss_config.sense_size) + +#define VIRTIO_SCSI_MAX_CHANNEL 0 +#define VIRTIO_SCSI_MAX_TARGET 0 +#define VIRTIO_SCSI_MAX_LUN 16383 + +#define VIRTIO_SCSI_F_INOUT (1 << 0) +#define VIRTIO_SCSI_F_HOTPLUG (1 << 1) +#define VIRTIO_SCSI_F_CHANGE (1 << 2) + +static int pci_vtscsi_debug = 0; +#define DPRINTF(params) if (pci_vtscsi_debug) printf params +#define WPRINTF(params) printf params + +struct pci_vtscsi_config { + uint32_t num_queues; + uint32_t seg_max; + uint32_t max_sectors; + uint32_t cmd_per_lun; + uint32_t event_info_size; + uint32_t sense_size; + uint32_t cdb_size; + uint16_t max_channel; + uint16_t max_target; + uint32_t max_lun; +} __attribute__((packed)); + +struct pci_vtscsi_queue { + struct pci_vtscsi_softc * vsq_sc; + struct vqueue_info * vsq_vq; + int vsq_ctl_fd; + pthread_mutex_t vsq_mtx; + pthread_mutex_t vsq_qmtx; + pthread_cond_t vsq_cv; + STAILQ_HEAD(, pci_vtscsi_request) vsq_requests; + LIST_HEAD(, pci_vtscsi_worker) vsq_workers; +}; + +struct pci_vtscsi_worker { + struct pci_vtscsi_queue * vsw_queue; + pthread_t vsw_thread; + bool vsw_exiting; + LIST_ENTRY(pci_vtscsi_worker) vsw_link; +}; + +struct pci_vtscsi_request { + struct pci_vtscsi_queue * vsr_queue; + struct iovec vsr_iov_in[VTSCSI_MAXSEG]; + int vsr_niov_in; + struct iovec vsr_iov_out[VTSCSI_MAXSEG]; + int vsr_niov_out; + uint32_t vsr_idx; + STAILQ_ENTRY(pci_vtscsi_request) vsr_link; +}; + +/* + * Per-device softc + */ +struct pci_vtscsi_softc { + struct virtio_softc vss_vs; + struct vqueue_info vss_vq[VTSCSI_MAXQ]; + struct pci_vtscsi_queue vss_queues[VTSCSI_REQUESTQ]; + pthread_mutex_t vss_mtx; + int vss_iid; + int vss_ctl_fd; + uint32_t vss_features; + struct pci_vtscsi_config vss_config; +}; + +#define VIRTIO_SCSI_T_TMF 0 +#define VIRTIO_SCSI_T_TMF_ABORT_TASK 0 +#define VIRTIO_SCSI_T_TMF_ABORT_TASK_SET 1 +#define VIRTIO_SCSI_T_TMF_CLEAR_ACA 2 +#define VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET 3 +#define VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET 4 +#define VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET 5 +#define VIRTIO_SCSI_T_TMF_QUERY_TASK 6 +#define VIRTIO_SCSI_T_TMF_QUERY_TASK_SET 7 + +/* command-specific response values */ +#define VIRTIO_SCSI_S_FUNCTION_COMPLETE 0 +#define VIRTIO_SCSI_S_FUNCTION_SUCCEEDED 10 +#define VIRTIO_SCSI_S_FUNCTION_REJECTED 11 + +struct pci_vtscsi_ctrl_tmf { + uint32_t type; + uint32_t subtype; + uint8_t lun[8]; + uint64_t id; + uint8_t response; +} __attribute__((packed)); 
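+
+/*
+ * Editorial sketch (an assumption based on the virtio-scsi spec, not part
+ * of the original change): the 8-byte LUN field above is encoded by the
+ * guest as
+ *
+ *	lun[0] = 1;			single-level LUN structure
+ *	lun[1] = target;
+ *	lun[2] = 0x40 | (lun_id >> 8);	flat-space addressing
+ *	lun[3] = lun_id & 0xff;		remaining bytes are zero
+ *
+ * which is why pci_vtscsi_get_lun() below recovers the LUN as
+ * ((lun[2] << 8) | lun[3]) & 0x3fff.
+ */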
+ +#define VIRTIO_SCSI_T_AN_QUERY 1 +#define VIRTIO_SCSI_EVT_ASYNC_OPERATIONAL_CHANGE 2 +#define VIRTIO_SCSI_EVT_ASYNC_POWER_MGMT 4 +#define VIRTIO_SCSI_EVT_ASYNC_EXTERNAL_REQUEST 8 +#define VIRTIO_SCSI_EVT_ASYNC_MEDIA_CHANGE 16 +#define VIRTIO_SCSI_EVT_ASYNC_MULTI_HOST 32 +#define VIRTIO_SCSI_EVT_ASYNC_DEVICE_BUSY 64 + +struct pci_vtscsi_ctrl_an { + uint32_t type; + uint8_t lun[8]; + uint32_t event_requested; + uint32_t event_actual; + uint8_t response; +} __attribute__((packed)); + +/* command-specific response values */ +#define VIRTIO_SCSI_S_OK 0 +#define VIRTIO_SCSI_S_OVERRUN 1 +#define VIRTIO_SCSI_S_ABORTED 2 +#define VIRTIO_SCSI_S_BAD_TARGET 3 +#define VIRTIO_SCSI_S_RESET 4 +#define VIRTIO_SCSI_S_BUSY 5 +#define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6 +#define VIRTIO_SCSI_S_TARGET_FAILURE 7 +#define VIRTIO_SCSI_S_NEXUS_FAILURE 8 +#define VIRTIO_SCSI_S_FAILURE 9 +#define VIRTIO_SCSI_S_INCORRECT_LUN 12 + +/* task_attr */ +#define VIRTIO_SCSI_S_SIMPLE 0 +#define VIRTIO_SCSI_S_ORDERED 1 +#define VIRTIO_SCSI_S_HEAD 2 +#define VIRTIO_SCSI_S_ACA 3 + +struct pci_vtscsi_event { + uint32_t event; + uint8_t lun[8]; + uint32_t reason; +} __attribute__((packed)); + +struct pci_vtscsi_req_cmd_rd { + uint8_t lun[8]; + uint64_t id; + uint8_t task_attr; + uint8_t prio; + uint8_t crn; + uint8_t cdb[]; +} __attribute__((packed)); + +struct pci_vtscsi_req_cmd_wr { + uint32_t sense_len; + uint32_t residual; + uint16_t status_qualifier; + uint8_t status; + uint8_t response; + uint8_t sense[]; +} __attribute__((packed)); + +static void *pci_vtscsi_proc(void *); +static void pci_vtscsi_reset(void *); +static void pci_vtscsi_neg_features(void *, uint64_t); +static int pci_vtscsi_cfgread(void *, int, int, uint32_t *); +static int pci_vtscsi_cfgwrite(void *, int, int, uint32_t); +static inline int pci_vtscsi_get_lun(uint8_t *); +static int pci_vtscsi_control_handle(struct pci_vtscsi_softc *, void *, size_t); +static int pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *, + struct pci_vtscsi_ctrl_tmf *); +static int pci_vtscsi_an_handle(struct pci_vtscsi_softc *, + struct pci_vtscsi_ctrl_an *); +static int pci_vtscsi_request_handle(struct pci_vtscsi_queue *, struct iovec *, + int, struct iovec *, int); +static void pci_vtscsi_controlq_notify(void *, struct vqueue_info *); +static void pci_vtscsi_eventq_notify(void *, struct vqueue_info *); +static void pci_vtscsi_requestq_notify(void *, struct vqueue_info *); +static int pci_vtscsi_init_queue(struct pci_vtscsi_softc *, + struct pci_vtscsi_queue *, int); +static int pci_vtscsi_init(struct vmctx *, struct pci_devinst *, char *); + +static struct virtio_consts vtscsi_vi_consts = { + "vtscsi", /* our name */ + VTSCSI_MAXQ, /* we support 2+n virtqueues */ + sizeof(struct pci_vtscsi_config), /* config reg size */ + pci_vtscsi_reset, /* reset */ + NULL, /* device-wide qnotify */ + pci_vtscsi_cfgread, /* read virtio config */ + pci_vtscsi_cfgwrite, /* write virtio config */ + pci_vtscsi_neg_features, /* apply negotiated features */ + 0, /* our capabilities */ +}; + +static void * +pci_vtscsi_proc(void *arg) +{ + struct pci_vtscsi_worker *worker = (struct pci_vtscsi_worker *)arg; + struct pci_vtscsi_queue *q = worker->vsw_queue; + struct pci_vtscsi_request *req; + int iolen; + + for (;;) { + pthread_mutex_lock(&q->vsq_mtx); + + while (STAILQ_EMPTY(&q->vsq_requests) + && !worker->vsw_exiting) + pthread_cond_wait(&q->vsq_cv, &q->vsq_mtx); + + if (worker->vsw_exiting) + break; + + req = STAILQ_FIRST(&q->vsq_requests); + STAILQ_REMOVE_HEAD(&q->vsq_requests, vsr_link); + + 
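+		/*
+		 * Editorial note (not part of the original change): the
+		 * queue mutex is dropped before the potentially slow CTL
+		 * ioctl below so the other VTSCSI_THR_PER_Q workers can keep
+		 * taking requests; only the virtqueue post itself is
+		 * serialized again, under vsq_qmtx.
+		 */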
pthread_mutex_unlock(&q->vsq_mtx); + iolen = pci_vtscsi_request_handle(q, req->vsr_iov_in, + req->vsr_niov_in, req->vsr_iov_out, req->vsr_niov_out); + + pthread_mutex_lock(&q->vsq_qmtx); + vq_relchain(q->vsq_vq, req->vsr_idx, iolen); + vq_endchains(q->vsq_vq, 0); + pthread_mutex_unlock(&q->vsq_qmtx); + + DPRINTF(("virtio-scsi: request <idx=%d> completed\n", + req->vsr_idx)); + free(req); + } + + pthread_mutex_unlock(&q->vsq_mtx); + return (NULL); +} + +static void +pci_vtscsi_reset(void *vsc) +{ + struct pci_vtscsi_softc *sc; + + sc = vsc; + + DPRINTF(("vtscsi: device reset requested\n")); + vi_reset_dev(&sc->vss_vs); + + /* initialize config structure */ + sc->vss_config = (struct pci_vtscsi_config){ + .num_queues = VTSCSI_REQUESTQ, + .seg_max = VTSCSI_MAXSEG, + .max_sectors = 2, + .cmd_per_lun = 1, + .event_info_size = sizeof(struct pci_vtscsi_event), + .sense_size = 96, + .cdb_size = 32, + .max_channel = VIRTIO_SCSI_MAX_CHANNEL, + .max_target = VIRTIO_SCSI_MAX_TARGET, + .max_lun = VIRTIO_SCSI_MAX_LUN + }; +} + +static void +pci_vtscsi_neg_features(void *vsc, uint64_t negotiated_features) +{ + struct pci_vtscsi_softc *sc = vsc; + + sc->vss_features = negotiated_features; +} + +static int +pci_vtscsi_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct pci_vtscsi_softc *sc = vsc; + void *ptr; + + ptr = (uint8_t *)&sc->vss_config + offset; + memcpy(retval, ptr, size); + return (0); +} + +static int +pci_vtscsi_cfgwrite(void *vsc, int offset, int size, uint32_t val) +{ + + return (0); +} + +static inline int +pci_vtscsi_get_lun(uint8_t *lun) +{ + + return (((lun[2] << 8) | lun[3]) & 0x3fff); +} + +static int +pci_vtscsi_control_handle(struct pci_vtscsi_softc *sc, void *buf, + size_t bufsize) +{ + struct pci_vtscsi_ctrl_tmf *tmf; + struct pci_vtscsi_ctrl_an *an; + uint32_t type; + + type = *(uint32_t *)buf; + + if (type == VIRTIO_SCSI_T_TMF) { + tmf = (struct pci_vtscsi_ctrl_tmf *)buf; + return (pci_vtscsi_tmf_handle(sc, tmf)); + } + + if (type == VIRTIO_SCSI_T_AN_QUERY) { + an = (struct pci_vtscsi_ctrl_an *)buf; + return (pci_vtscsi_an_handle(sc, an)); + } + + return (0); +} + +static int +pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *sc, + struct pci_vtscsi_ctrl_tmf *tmf) +{ + union ctl_io *io; + int err; + + io = ctl_scsi_alloc_io(sc->vss_iid); + ctl_scsi_zero_io(io); + + io->io_hdr.io_type = CTL_IO_TASK; + io->io_hdr.nexus.targ_port = tmf->lun[1]; + io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(tmf->lun); + io->taskio.tag_type = CTL_TAG_SIMPLE; + io->taskio.tag_num = (uint32_t)tmf->id; + + switch (tmf->subtype) { + case VIRTIO_SCSI_T_TMF_ABORT_TASK: + io->taskio.task_action = CTL_TASK_ABORT_TASK; + break; + + case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET: + io->taskio.task_action = CTL_TASK_ABORT_TASK_SET; + break; + + case VIRTIO_SCSI_T_TMF_CLEAR_ACA: + io->taskio.task_action = CTL_TASK_CLEAR_ACA; + break; + + case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET: + io->taskio.task_action = CTL_TASK_CLEAR_TASK_SET; + break; + + case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET: + io->taskio.task_action = CTL_TASK_I_T_NEXUS_RESET; + break; + + case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET: + io->taskio.task_action = CTL_TASK_LUN_RESET; + break; + + case VIRTIO_SCSI_T_TMF_QUERY_TASK: + io->taskio.task_action = CTL_TASK_QUERY_TASK; + break; + + case VIRTIO_SCSI_T_TMF_QUERY_TASK_SET: + io->taskio.task_action = CTL_TASK_QUERY_TASK_SET; + break; + } + + if (pci_vtscsi_debug) { + struct sbuf *sb = sbuf_new_auto(); + ctl_io_sbuf(io, sb); + sbuf_finish(sb); + DPRINTF(("pci_virtio_scsi: %s", sbuf_data(sb))); + 
sbuf_delete(sb);
+	}
+
+	err = ioctl(sc->vss_ctl_fd, CTL_IO, io);
+	if (err != 0)
+		WPRINTF(("CTL_IO: err=%d (%s)\n", errno, strerror(errno)));
+
+	tmf->response = io->taskio.task_status;
+	ctl_scsi_free_io(io);
+	return (1);
+}
+
+static int
+pci_vtscsi_an_handle(struct pci_vtscsi_softc *sc,
+    struct pci_vtscsi_ctrl_an *an)
+{
+
+	return (0);
+}
+
+static int
+pci_vtscsi_request_handle(struct pci_vtscsi_queue *q, struct iovec *iov_in,
+    int niov_in, struct iovec *iov_out, int niov_out)
+{
+	struct pci_vtscsi_softc *sc = q->vsq_sc;
+	struct pci_vtscsi_req_cmd_rd *cmd_rd = NULL;
+	struct pci_vtscsi_req_cmd_wr *cmd_wr;
+	struct iovec data_iov_in[VTSCSI_MAXSEG], data_iov_out[VTSCSI_MAXSEG];
+	union ctl_io *io;
+	size_t data_niov_in, data_niov_out;
+	void *ext_data_ptr = NULL;
+	uint32_t ext_data_len = 0, ext_sg_entries = 0;
+	int err;
+
+	seek_iov(iov_in, niov_in, data_iov_in, &data_niov_in,
+	    VTSCSI_IN_HEADER_LEN(sc));
+	seek_iov(iov_out, niov_out, data_iov_out, &data_niov_out,
+	    VTSCSI_OUT_HEADER_LEN(sc));
+
+	truncate_iov(iov_in, niov_in, VTSCSI_IN_HEADER_LEN(sc));
+	truncate_iov(iov_out, niov_out, VTSCSI_OUT_HEADER_LEN(sc));
+	iov_to_buf(iov_in, niov_in, (void **)&cmd_rd);
+
+	cmd_wr = malloc(VTSCSI_OUT_HEADER_LEN(sc));
+	io = ctl_scsi_alloc_io(sc->vss_iid);
+	ctl_scsi_zero_io(io);
+
+	io->io_hdr.nexus.targ_port = cmd_rd->lun[1];
+	io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(cmd_rd->lun);
+
+	io->io_hdr.io_type = CTL_IO_SCSI;
+
+	if (data_niov_in > 0) {
+		ext_data_ptr = (void *)data_iov_in;
+		ext_sg_entries = data_niov_in;
+		ext_data_len = count_iov(data_iov_in, data_niov_in);
+		io->io_hdr.flags |= CTL_FLAG_DATA_OUT;
+	} else if (data_niov_out > 0) {
+		ext_data_ptr = (void *)data_iov_out;
+		ext_sg_entries = data_niov_out;
+		ext_data_len = count_iov(data_iov_out, data_niov_out);
+		io->io_hdr.flags |= CTL_FLAG_DATA_IN;
+	}
+
+	io->scsiio.sense_len = sc->vss_config.sense_size;
+	io->scsiio.tag_num = (uint32_t)cmd_rd->id;
+	io->scsiio.tag_type = CTL_TAG_SIMPLE;
+	io->scsiio.ext_sg_entries = ext_sg_entries;
+	io->scsiio.ext_data_ptr = ext_data_ptr;
+	io->scsiio.ext_data_len = ext_data_len;
+	io->scsiio.ext_data_filled = 0;
+	io->scsiio.cdb_len = sc->vss_config.cdb_size;
+	memcpy(io->scsiio.cdb, cmd_rd->cdb, sc->vss_config.cdb_size);
+
+	if (pci_vtscsi_debug) {
+		struct sbuf *sb = sbuf_new_auto();
+		ctl_io_sbuf(io, sb);
+		sbuf_finish(sb);
+		DPRINTF(("pci_virtio_scsi: %s", sbuf_data(sb)));
+		sbuf_delete(sb);
+	}
+
+	err = ioctl(q->vsq_ctl_fd, CTL_IO, io);
+	if (err != 0) {
+		WPRINTF(("CTL_IO: err=%d (%s)\n", errno, strerror(errno)));
+		cmd_wr->response = VIRTIO_SCSI_S_FAILURE;
+	} else {
+		cmd_wr->sense_len = MIN(io->scsiio.sense_len,
+		    sc->vss_config.sense_size);
+		cmd_wr->residual = io->scsiio.residual;
+		cmd_wr->status = io->scsiio.scsi_status;
+		cmd_wr->response = VIRTIO_SCSI_S_OK;
+		memcpy(&cmd_wr->sense, &io->scsiio.sense_data,
+		    cmd_wr->sense_len);
+	}
+
+	buf_to_iov(cmd_wr, VTSCSI_OUT_HEADER_LEN(sc), iov_out, niov_out, 0);
+	free(cmd_rd);
+	free(cmd_wr);
+	/* Capture the returned length before the CTL I/O is freed */
+	err = VTSCSI_OUT_HEADER_LEN(sc) + io->scsiio.ext_data_filled;
+	ctl_scsi_free_io(io);
+	return (err);
+}
+
+static void
+pci_vtscsi_controlq_notify(void *vsc, struct vqueue_info *vq)
+{
+	struct pci_vtscsi_softc *sc;
+	struct iovec iov[VTSCSI_MAXSEG];
+	uint16_t idx, n;
+	void *buf = NULL;
+	size_t bufsize;
+	int iolen;
+
+	sc = vsc;
+
+	while (vq_has_descs(vq)) {
+		n = vq_getchain(vq, &idx, iov, VTSCSI_MAXSEG, NULL);
+		bufsize = iov_to_buf(iov, n, &buf);
+		iolen = pci_vtscsi_control_handle(sc, buf, bufsize);
+		buf_to_iov(buf + bufsize - iolen,
iolen, iov, n, iolen); + + /* + * Release this chain and handle more + */ + vq_relchain(vq, idx, iolen); + } + vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ +} + +static void +pci_vtscsi_eventq_notify(void *vsc, struct vqueue_info *vq) +{ + + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; +} + +static void +pci_vtscsi_requestq_notify(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtscsi_softc *sc; + struct pci_vtscsi_queue *q; + struct pci_vtscsi_request *req; + struct iovec iov[VTSCSI_MAXSEG]; + uint16_t flags[VTSCSI_MAXSEG]; + uint16_t idx, n, i; + int readable; + + sc = vsc; + q = &sc->vss_queues[vq->vq_num - 2]; + + while (vq_has_descs(vq)) { + readable = 0; + n = vq_getchain(vq, &idx, iov, VTSCSI_MAXSEG, flags); + + /* Count readable descriptors */ + for (i = 0; i < n; i++) { + if (flags[i] & VRING_DESC_F_WRITE) + break; + + readable++; + } + + req = calloc(1, sizeof(struct pci_vtscsi_request)); + req->vsr_idx = idx; + req->vsr_queue = q; + req->vsr_niov_in = readable; + req->vsr_niov_out = n - readable; + memcpy(req->vsr_iov_in, iov, + req->vsr_niov_in * sizeof(struct iovec)); + memcpy(req->vsr_iov_out, iov + readable, + req->vsr_niov_out * sizeof(struct iovec)); + + pthread_mutex_lock(&q->vsq_mtx); + STAILQ_INSERT_TAIL(&q->vsq_requests, req, vsr_link); + pthread_cond_signal(&q->vsq_cv); + pthread_mutex_unlock(&q->vsq_mtx); + + DPRINTF(("virtio-scsi: request <idx=%d> enqueued\n", idx)); + } +} + +static int +pci_vtscsi_init_queue(struct pci_vtscsi_softc *sc, + struct pci_vtscsi_queue *queue, int num) +{ + struct pci_vtscsi_worker *worker; + char threadname[16]; + int i; + + queue->vsq_sc = sc; + queue->vsq_ctl_fd = open("/dev/cam/ctl", O_RDWR); + queue->vsq_vq = &sc->vss_vq[num + 2]; + + if (queue->vsq_ctl_fd < 0) { + WPRINTF(("cannot open /dev/cam/ctl: %s\n", strerror(errno))); + return (-1); + } + + pthread_mutex_init(&queue->vsq_mtx, NULL); + pthread_mutex_init(&queue->vsq_qmtx, NULL); + pthread_cond_init(&queue->vsq_cv, NULL); + STAILQ_INIT(&queue->vsq_requests); + LIST_INIT(&queue->vsq_workers); + + for (i = 0; i < VTSCSI_THR_PER_Q; i++) { + worker = calloc(1, sizeof(struct pci_vtscsi_worker)); + worker->vsw_queue = queue; + + pthread_create(&worker->vsw_thread, NULL, &pci_vtscsi_proc, + (void *)worker); + + sprintf(threadname, "virtio-scsi:%d-%d", num, i); + pthread_set_name_np(worker->vsw_thread, threadname); + LIST_INSERT_HEAD(&queue->vsq_workers, worker, vsw_link); + } + + return (0); +} + +static int +pci_vtscsi_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct pci_vtscsi_softc *sc; + char *optname = NULL; + char *opt; + int i; + + sc = calloc(1, sizeof(struct pci_vtscsi_softc)); + sc->vss_ctl_fd = open("/dev/cam/ctl", O_RDWR); + + if (sc->vss_ctl_fd < 0) { + WPRINTF(("cannot open /dev/cam/ctl: %s\n", strerror(errno))); + return (1); + } + + while ((opt = strsep(&opts, ",")) != NULL) { + if ((optname = strsep(&opt, "=")) != NULL) { + if (strcmp(optname, "iid") == 0) { + sc->vss_iid = strtoul(opt, NULL, 10); + } + } + } + + vi_softc_linkup(&sc->vss_vs, &vtscsi_vi_consts, sc, pi, sc->vss_vq); + sc->vss_vs.vs_mtx = &sc->vss_mtx; + + /* controlq */ + sc->vss_vq[0].vq_qsize = VTSCSI_RINGSZ; + sc->vss_vq[0].vq_notify = pci_vtscsi_controlq_notify; + + /* eventq */ + sc->vss_vq[1].vq_qsize = VTSCSI_RINGSZ; + sc->vss_vq[1].vq_notify = pci_vtscsi_eventq_notify; + + /* request queues */ + for (i = 2; i < VTSCSI_MAXQ; i++) { + sc->vss_vq[i].vq_qsize = VTSCSI_RINGSZ; + sc->vss_vq[i].vq_notify = pci_vtscsi_requestq_notify; + 
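+		/*
+		 * Editorial note (not part of the original change):
+		 * virtqueues 0 and 1 are the control and event queues, so
+		 * request virtqueue i backs vss_queues[i - 2], matching the
+		 * vq->vq_num - 2 lookup in pci_vtscsi_requestq_notify().
+		 */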
pci_vtscsi_init_queue(sc, &sc->vss_queues[i - 2], i - 2); + } + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_SCSI); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_SCSI); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); + + if (vi_intr_init(&sc->vss_vs, 1, fbsdrun_virtio_msix())) + return (1); + vi_set_io_bar(&sc->vss_vs, 0); + + return (0); +} + + +struct pci_devemu pci_de_vscsi = { + .pe_emu = "virtio-scsi", + .pe_init = pci_vtscsi_init, + .pe_barwrite = vi_pci_write, + .pe_barread = vi_pci_read +}; +PCI_EMUL_SET(pci_de_vscsi); diff --git a/usr/src/cmd/bhyve/pci_xhci.c b/usr/src/cmd/bhyve/pci_xhci.c index 1cb2246486..be87453bf1 100644 --- a/usr/src/cmd/bhyve/pci_xhci.c +++ b/usr/src/cmd/bhyve/pci_xhci.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Leon Dang <ldang@nahannisys.com> * Copyright 2018 Joyent, Inc. * All rights reserved. @@ -2227,12 +2229,12 @@ pci_xhci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, sc = pi->pi_arg; - assert(baridx == 0); + assert(baridx == 0); - pthread_mutex_lock(&sc->mtx); + pthread_mutex_lock(&sc->mtx); if (offset < XHCI_CAPLEN) /* read only registers */ - WPRINTF(("pci_xhci: write RO-CAPs offset %ld\r\n", offset)); + WPRINTF(("pci_xhci: write RO-CAPs offset %ld\r\n", offset)); else if (offset < sc->dboff) pci_xhci_hostop_write(sc, offset, value); else if (offset < sc->rtsoff) @@ -2240,9 +2242,9 @@ pci_xhci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, else if (offset < sc->regsend) pci_xhci_rtsregs_write(sc, offset, value); else - WPRINTF(("pci_xhci: write invalid offset %ld\r\n", offset)); + WPRINTF(("pci_xhci: write invalid offset %ld\r\n", offset)); - pthread_mutex_unlock(&sc->mtx); + pthread_mutex_unlock(&sc->mtx); } static uint64_t @@ -2450,9 +2452,9 @@ pci_xhci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, sc = pi->pi_arg; - assert(baridx == 0); + assert(baridx == 0); - pthread_mutex_lock(&sc->mtx); + pthread_mutex_lock(&sc->mtx); if (offset < XHCI_CAPLEN) value = pci_xhci_hostcap_read(sc, offset); else if (offset < sc->dboff) @@ -2465,10 +2467,10 @@ pci_xhci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, value = pci_xhci_xecp_read(sc, offset); else { value = 0; - WPRINTF(("pci_xhci: read invalid offset %ld\r\n", offset)); + WPRINTF(("pci_xhci: read invalid offset %ld\r\n", offset)); } - pthread_mutex_unlock(&sc->mtx); + pthread_mutex_unlock(&sc->mtx); switch (size) { case 1: diff --git a/usr/src/cmd/bhyve/pci_xhci.h b/usr/src/cmd/bhyve/pci_xhci.h index d5f05af5d0..7502f9396a 100644 --- a/usr/src/cmd/bhyve/pci_xhci.h +++ b/usr/src/cmd/bhyve/pci_xhci.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Leon Dang <ldang@nahannisys.com> * All rights reserved. * diff --git a/usr/src/cmd/bhyve/ps2kbd.c b/usr/src/cmd/bhyve/ps2kbd.c index ec3bb9814c..ae82957ffa 100644 --- a/usr/src/cmd/bhyve/ps2kbd.c +++ b/usr/src/cmd/bhyve/ps2kbd.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * Copyright (c) 2015 Nahanni Systems Inc. * All rights reserved. 
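The request path above extracts the CTL target port from cmd_rd->lun[1] and the LUN via pci_vtscsi_get_lun(), whose body is not shown in this hunk. A minimal sketch of that decode, assuming the virtio-scsi single-level LUN convention (byte 0 is always 1, byte 1 is the target, bytes 2-3 carry the LUN with the 0x40 flag set in the high byte); example_vtscsi_decode_lun is an illustrative name, not part of the patch:

static int
example_vtscsi_decode_lun(const uint8_t lun[8], uint16_t *target,
    uint32_t *lunid)
{
	if (lun[0] != 1)
		return (-1);		/* not the single-level format */
	*target = lun[1];
	/* Strip the 0x40 "flat addressing" flag from the high byte. */
	*lunid = (((uint32_t)lun[2] << 8) | lun[3]) & 0x3fff;
	return (0);
}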
diff --git a/usr/src/cmd/bhyve/ps2kbd.h b/usr/src/cmd/bhyve/ps2kbd.h index 34c31b1ea8..17be6d0466 100644 --- a/usr/src/cmd/bhyve/ps2kbd.h +++ b/usr/src/cmd/bhyve/ps2kbd.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * All rights reserved. * diff --git a/usr/src/cmd/bhyve/ps2mouse.c b/usr/src/cmd/bhyve/ps2mouse.c index cea7210e2a..b2e08262b1 100644 --- a/usr/src/cmd/bhyve/ps2mouse.c +++ b/usr/src/cmd/bhyve/ps2mouse.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * Copyright (c) 2015 Nahanni Systems Inc. * All rights reserved. diff --git a/usr/src/cmd/bhyve/ps2mouse.h b/usr/src/cmd/bhyve/ps2mouse.h index 10d5698a30..59430b01e2 100644 --- a/usr/src/cmd/bhyve/ps2mouse.h +++ b/usr/src/cmd/bhyve/ps2mouse.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * All rights reserved. * diff --git a/usr/src/cmd/bhyve/rfb.c b/usr/src/cmd/bhyve/rfb.c index e8c74766fe..f761646fc7 100644 --- a/usr/src/cmd/bhyve/rfb.c +++ b/usr/src/cmd/bhyve/rfb.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * Copyright (c) 2015 Leon Dang * Copyright 2018 Joyent, Inc. @@ -77,11 +79,11 @@ static int rfb_debug = 0; #define AUTH_LENGTH 16 #define PASSWD_LENGTH 8 -#define SECURITY_TYPE_NONE 1 -#define SECURITY_TYPE_VNC_AUTH 2 +#define SECURITY_TYPE_NONE 1 +#define SECURITY_TYPE_VNC_AUTH 2 -#define AUTH_FAILED_UNAUTH 1 -#define AUTH_FAILED_ERROR 2 +#define AUTH_FAILED_UNAUTH 1 +#define AUTH_FAILED_ERROR 2 struct rfb_softc { int sfd; @@ -143,12 +145,12 @@ struct rfb_pixfmt_msg { #define RFB_ENCODING_ZLIB 6 #define RFB_ENCODING_RESIZE -223 -#define RFB_MAX_WIDTH 2000 -#define RFB_MAX_HEIGHT 1200 +#define RFB_MAX_WIDTH 2000 +#define RFB_MAX_HEIGHT 1200 #define RFB_ZLIB_BUFSZ RFB_MAX_WIDTH*RFB_MAX_HEIGHT*4 /* percentage changes to screen before sending the entire screen */ -#define RFB_SEND_ALL_THRESH 25 +#define RFB_SEND_ALL_THRESH 25 struct rfb_enc_msg { uint8_t type; @@ -309,7 +311,7 @@ rfb_send_rect(struct rfb_softc *rc, int cfd, struct bhyvegc_image *gc, int x, int y, int w, int h) { struct rfb_srvr_updt_msg supdt_msg; - struct rfb_srvr_rect_hdr srect_hdr; + struct rfb_srvr_rect_hdr srect_hdr; unsigned long zlen; ssize_t nwrite, total; int err; @@ -469,9 +471,9 @@ doraw: return (nwrite); } -#define PIX_PER_CELL 32 +#define PIX_PER_CELL 32 #define PIXCELL_SHIFT 5 -#define PIXCELL_MASK 0x1F +#define PIXCELL_MASK 0x1F static int rfb_send_screen(struct rfb_softc *rc, int cfd, int all) @@ -717,7 +719,7 @@ rfb_wr_thr(void *arg) tv.tv_usec = 10000; err = select(cfd+1, &rfds, NULL, NULL, &tv); - if (err < 0) + if (err < 0) return (NULL); /* Determine if its time to push screen; ~24hz */ diff --git a/usr/src/cmd/bhyve/rfb.h b/usr/src/cmd/bhyve/rfb.h index 94d937e5b8..990e2075ac 100644 --- a/usr/src/cmd/bhyve/rfb.h +++ b/usr/src/cmd/bhyve/rfb.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * Copyright 2018 Joyent, Inc. * All rights reserved. 
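The PIX_PER_CELL/PIXCELL_SHIFT/PIXCELL_MASK macros retabbed in the rfb.c hunk above divide each scanline into 32-pixel cells so rfb_send_screen() can skip unchanged regions. A sketch of the indexing, assuming per-cell checksums as the server maintains (crc_now/crc_prev are illustrative names):

/* Map pixel column x to its cell and test whether that cell changed. */
static int
example_cell_dirty(uint32_t *crc_now, uint32_t *crc_prev, int x)
{
	int cell = x >> PIXCELL_SHIFT;	/* x / PIX_PER_CELL */

	if (crc_now[cell] == crc_prev[cell])
		return (0);		/* clean cell, nothing to send */
	crc_prev[cell] = crc_now[cell];
	return (1);
}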
diff --git a/usr/src/cmd/bhyve/rtc.c b/usr/src/cmd/bhyve/rtc.c index 73b5610771..09ca3f61ae 100644 --- a/usr/src/cmd/bhyve/rtc.c +++ b/usr/src/cmd/bhyve/rtc.c @@ -51,7 +51,7 @@ __FBSDID("$FreeBSD$"); #define RTC_HMEM_SB 0x5c #define RTC_HMEM_MSB 0x5d -#define m_64KB (64*1024) +#define m_64KB (64*1024) #define m_16MB (16*1024*1024) #define m_4GB (4ULL*1024*1024*1024) diff --git a/usr/src/cmd/bhyve/sockstream.c b/usr/src/cmd/bhyve/sockstream.c index 1789206ff3..b592bce9aa 100644 --- a/usr/src/cmd/bhyve/sockstream.c +++ b/usr/src/cmd/bhyve/sockstream.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Nahanni Systems, Inc. * All rights reserved. * @@ -82,5 +84,3 @@ stream_write(int fd, const void *buf, ssize_t nbytes) } return (len); } - - diff --git a/usr/src/cmd/bhyve/sockstream.h b/usr/src/cmd/bhyve/sockstream.h index bb0b3b06eb..ecea849471 100644 --- a/usr/src/cmd/bhyve/sockstream.h +++ b/usr/src/cmd/bhyve/sockstream.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Nahanni Systems, Inc. * All rights reserved. * diff --git a/usr/src/cmd/bhyve/task_switch.c b/usr/src/cmd/bhyve/task_switch.c index 6138bcdef8..b5950a19d8 100644 --- a/usr/src/cmd/bhyve/task_switch.c +++ b/usr/src/cmd/bhyve/task_switch.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Neel Natu <neel@freebsd.org> * All rights reserved. * diff --git a/usr/src/cmd/bhyve/uart_emul.c b/usr/src/cmd/bhyve/uart_emul.c index 40eefa069a..656a48f93c 100644 --- a/usr/src/cmd/bhyve/uart_emul.c +++ b/usr/src/cmd/bhyve/uart_emul.c @@ -81,7 +81,7 @@ __FBSDID("$FreeBSD$"); #define COM1_BASE 0x3F8 #define COM1_IRQ 4 #define COM2_BASE 0x2F8 -#define COM2_IRQ 3 +#define COM2_IRQ 3 #define DEFAULT_RCLK 1843200 #define DEFAULT_BAUD 9600 @@ -94,7 +94,7 @@ __FBSDID("$FreeBSD$"); #define MSR_DELTA_MASK 0x0f #ifndef REG_SCR -#define REG_SCR com_scr +#define REG_SCR com_scr #endif #define FIFOSZ 16 diff --git a/usr/src/cmd/bhyve/usb_emul.c b/usr/src/cmd/bhyve/usb_emul.c index 3dc12a5c3c..6ecdd9530e 100644 --- a/usr/src/cmd/bhyve/usb_emul.c +++ b/usr/src/cmd/bhyve/usb_emul.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Nahanni Systems Inc. * All rights reserved. * diff --git a/usr/src/cmd/bhyve/usb_emul.h b/usr/src/cmd/bhyve/usb_emul.h index 083557f64f..e55a421b6f 100644 --- a/usr/src/cmd/bhyve/usb_emul.h +++ b/usr/src/cmd/bhyve/usb_emul.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Leon Dang <ldang@nahannisys.com> * Copyright 2018 Joyent, Inc. * All rights reserved. diff --git a/usr/src/cmd/bhyve/usb_mouse.c b/usr/src/cmd/bhyve/usb_mouse.c index e9fc77ed8a..e613012071 100644 --- a/usr/src/cmd/bhyve/usb_mouse.c +++ b/usr/src/cmd/bhyve/usb_mouse.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Leon Dang <ldang@nahannisys.com> * All rights reserved. 
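The uart_emul.c constants above follow standard 16550 clocking: the programmed divisor relates the reference clock to the line rate as baud = rclk / (16 * divisor). With DEFAULT_RCLK (1843200) and DEFAULT_BAUD (9600) the divisor works out to 12; a one-line sketch:

/* 16550 baud-rate divisor, e.g. 1843200 / (16 * 9600) == 12. */
static unsigned int
example_uart_divisor(unsigned int rclk, unsigned int baud)
{
	return (rclk / (16 * baud));
}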
* @@ -220,16 +222,16 @@ struct umouse_bos_desc umouse_bosd = { HSETW(.wTotalLength, sizeof(umouse_bosd)), .bNumDeviceCaps = 1, }, - .usbssd = { - .bLength = sizeof(umouse_bosd.usbssd), - .bDescriptorType = UDESC_DEVICE_CAPABILITY, - .bDevCapabilityType = 3, - .bmAttributes = 0, - HSETW(.wSpeedsSupported, 0x08), - .bFunctionalitySupport = 3, - .bU1DevExitLat = 0xa, /* dummy - not used */ - .wU2DevExitLat = { 0x20, 0x00 }, - } + .usbssd = { + .bLength = sizeof(umouse_bosd.usbssd), + .bDescriptorType = UDESC_DEVICE_CAPABILITY, + .bDevCapabilityType = 3, + .bmAttributes = 0, + HSETW(.wSpeedsSupported, 0x08), + .bFunctionalitySupport = 3, + .bU1DevExitLat = 0xa, /* dummy - not used */ + .wU2DevExitLat = { 0x20, 0x00 }, + } }; diff --git a/usr/src/cmd/bhyve/vga.c b/usr/src/cmd/bhyve/vga.c index a5f68ec543..314ddeb1e8 100644 --- a/usr/src/cmd/bhyve/vga.c +++ b/usr/src/cmd/bhyve/vga.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * All rights reserved. * diff --git a/usr/src/cmd/bhyve/vga.h b/usr/src/cmd/bhyve/vga.h index 4364f1b17a..36c6dc15fa 100644 --- a/usr/src/cmd/bhyve/vga.h +++ b/usr/src/cmd/bhyve/vga.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * All rights reserved. * @@ -38,8 +40,8 @@ #define GEN_MISC_OUTPUT_PORT 0x3cc #define GEN_INPUT_STS1_MONO_PORT 0x3ba #define GEN_INPUT_STS1_COLOR_PORT 0x3da -#define GEN_IS1_VR 0x08 /* Vertical retrace */ -#define GEN_IS1_DE 0x01 /* Display enable not */ +#define GEN_IS1_VR 0x08 /* Vertical retrace */ +#define GEN_IS1_DE 0x01 /* Display enable not */ /* Attribute controller registers. */ #define ATC_IDX_PORT 0x3c0 @@ -49,14 +51,14 @@ #define ATC_PALETTE0 0 #define ATC_PALETTE15 15 #define ATC_MODE_CONTROL 16 -#define ATC_MC_IPS 0x80 /* Internal palette size */ -#define ATC_MC_GA 0x01 /* Graphics/alphanumeric */ +#define ATC_MC_IPS 0x80 /* Internal palette size */ +#define ATC_MC_GA 0x01 /* Graphics/alphanumeric */ #define ATC_OVERSCAN_COLOR 17 #define ATC_COLOR_PLANE_ENABLE 18 #define ATC_HORIZ_PIXEL_PANNING 19 #define ATC_COLOR_SELECT 20 -#define ATC_CS_C67 0x0c /* Color select bits 6+7 */ -#define ATC_CS_C45 0x03 /* Color select bits 4+5 */ +#define ATC_CS_C67 0x0c /* Color select bits 6+7 */ +#define ATC_CS_C45 0x03 /* Color select bits 4+5 */ /* Sequencer registers. 
*/ #define SEQ_IDX_PORT 0x3c4 @@ -66,22 +68,22 @@ #define SEQ_RESET_ASYNC 0x1 #define SEQ_RESET_SYNC 0x2 #define SEQ_CLOCKING_MODE 1 -#define SEQ_CM_SO 0x20 /* Screen off */ -#define SEQ_CM_89 0x01 /* 8/9 dot clock */ +#define SEQ_CM_SO 0x20 /* Screen off */ +#define SEQ_CM_89 0x01 /* 8/9 dot clock */ #define SEQ_MAP_MASK 2 #define SEQ_CHAR_MAP_SELECT 3 -#define SEQ_CMS_SAH 0x20 /* Char map A bit 2 */ -#define SEQ_CMS_SAH_SHIFT 5 -#define SEQ_CMS_SA 0x0c /* Char map A bits 0+1 */ -#define SEQ_CMS_SA_SHIFT 2 -#define SEQ_CMS_SBH 0x10 /* Char map B bit 2 */ -#define SEQ_CMS_SBH_SHIFT 4 -#define SEQ_CMS_SB 0x03 /* Char map B bits 0+1 */ -#define SEQ_CMS_SB_SHIFT 0 +#define SEQ_CMS_SAH 0x20 /* Char map A bit 2 */ +#define SEQ_CMS_SAH_SHIFT 5 +#define SEQ_CMS_SA 0x0c /* Char map A bits 0+1 */ +#define SEQ_CMS_SA_SHIFT 2 +#define SEQ_CMS_SBH 0x10 /* Char map B bit 2 */ +#define SEQ_CMS_SBH_SHIFT 4 +#define SEQ_CMS_SB 0x03 /* Char map B bits 0+1 */ +#define SEQ_CMS_SB_SHIFT 0 #define SEQ_MEMORY_MODE 4 -#define SEQ_MM_C4 0x08 /* Chain 4 */ -#define SEQ_MM_OE 0x04 /* Odd/even */ -#define SEQ_MM_EM 0x02 /* Extended memory */ +#define SEQ_MM_C4 0x08 /* Chain 4 */ +#define SEQ_MM_OE 0x04 /* Odd/even */ +#define SEQ_MM_EM 0x02 /* Extended memory */ /* Graphics controller registers. */ #define GC_IDX_PORT 0x3ce @@ -93,13 +95,13 @@ #define GC_DATA_ROTATE 3 #define GC_READ_MAP_SELECT 4 #define GC_MODE 5 -#define GC_MODE_OE 0x10 /* Odd/even */ -#define GC_MODE_C4 0x04 /* Chain 4 */ +#define GC_MODE_OE 0x10 /* Odd/even */ +#define GC_MODE_C4 0x04 /* Chain 4 */ #define GC_MISCELLANEOUS 6 -#define GC_MISC_GM 0x01 /* Graphics/alphanumeric */ -#define GC_MISC_MM 0x0c /* memory map */ -#define GC_MISC_MM_SHIFT 2 +#define GC_MISC_GM 0x01 /* Graphics/alphanumeric */ +#define GC_MISC_MM 0x0c /* memory map */ +#define GC_MISC_MM_SHIFT 2 #define GC_COLOR_DONT_CARE 7 #define GC_BIT_MASK 8 @@ -117,36 +119,36 @@ #define CRTC_END_HORIZ_RETRACE 5 #define CRTC_VERT_TOTAL 6 #define CRTC_OVERFLOW 7 -#define CRTC_OF_VRS9 0x80 /* VRS bit 9 */ -#define CRTC_OF_VRS9_SHIFT 7 -#define CRTC_OF_VDE9 0x40 /* VDE bit 9 */ -#define CRTC_OF_VDE9_SHIFT 6 -#define CRTC_OF_VRS8 0x04 /* VRS bit 8 */ -#define CRTC_OF_VRS8_SHIFT 2 -#define CRTC_OF_VDE8 0x02 /* VDE bit 8 */ -#define CRTC_OF_VDE8_SHIFT 1 +#define CRTC_OF_VRS9 0x80 /* VRS bit 9 */ +#define CRTC_OF_VRS9_SHIFT 7 +#define CRTC_OF_VDE9 0x40 /* VDE bit 9 */ +#define CRTC_OF_VDE9_SHIFT 6 +#define CRTC_OF_VRS8 0x04 /* VRS bit 8 */ +#define CRTC_OF_VRS8_SHIFT 2 +#define CRTC_OF_VDE8 0x02 /* VDE bit 8 */ +#define CRTC_OF_VDE8_SHIFT 1 #define CRTC_PRESET_ROW_SCAN 8 #define CRTC_MAX_SCAN_LINE 9 -#define CRTC_MSL_MSL 0x1f +#define CRTC_MSL_MSL 0x1f #define CRTC_CURSOR_START 10 -#define CRTC_CS_CO 0x20 /* Cursor off */ -#define CRTC_CS_CS 0x1f /* Cursor start */ +#define CRTC_CS_CO 0x20 /* Cursor off */ +#define CRTC_CS_CS 0x1f /* Cursor start */ #define CRTC_CURSOR_END 11 -#define CRTC_CE_CE 0x1f /* Cursor end */ +#define CRTC_CE_CE 0x1f /* Cursor end */ #define CRTC_START_ADDR_HIGH 12 #define CRTC_START_ADDR_LOW 13 #define CRTC_CURSOR_LOC_HIGH 14 #define CRTC_CURSOR_LOC_LOW 15 #define CRTC_VERT_RETRACE_START 16 #define CRTC_VERT_RETRACE_END 17 -#define CRTC_VRE_MASK 0xf +#define CRTC_VRE_MASK 0xf #define CRTC_VERT_DISP_END 18 #define CRTC_OFFSET 19 #define CRTC_UNDERLINE_LOC 20 #define CRTC_START_VERT_BLANK 21 #define CRTC_END_VERT_BLANK 22 #define CRTC_MODE_CONTROL 23 -#define CRTC_MC_TE 0x80 /* Timing enable */ +#define CRTC_MC_TE 0x80 /* Timing enable */ #define CRTC_LINE_COMPARE 24 /* 
DAC registers */ diff --git a/usr/src/cmd/bhyve/virtio.c b/usr/src/cmd/bhyve/virtio.c index fc0525c9ee..4c85000796 100644 --- a/usr/src/cmd/bhyve/virtio.c +++ b/usr/src/cmd/bhyve/virtio.c @@ -51,7 +51,7 @@ __FBSDID("$FreeBSD$"); * front of virtio-based device softc" constraint, let's use * this to convert. */ -#define DEV_SOFTC(vs) ((void *)(vs)) +#define DEV_SOFTC(vs) ((void *)(vs)) /* * Link a virtio_softc to its constants, the device softc, and diff --git a/usr/src/cmd/bhyve/virtio.h b/usr/src/cmd/bhyve/virtio.h index f59d823448..a2c3362ec2 100644 --- a/usr/src/cmd/bhyve/virtio.h +++ b/usr/src/cmd/bhyve/virtio.h @@ -188,7 +188,7 @@ struct vring_used { /* * PFN register shift amount */ -#define VRING_PFN 12 +#define VRING_PFN 12 /* * Virtio device types @@ -215,6 +215,7 @@ struct vring_used { #define VIRTIO_DEV_BLOCK 0x1001 #define VIRTIO_DEV_CONSOLE 0x1003 #define VIRTIO_DEV_RANDOM 0x1005 +#define VIRTIO_DEV_SCSI 0x1008 /* * PCI config space constants. @@ -225,19 +226,19 @@ struct vring_used { * If MSI-X is not enabled, those two registers disappear and * the remaining configuration registers start at offset 20. */ -#define VTCFG_R_HOSTCAP 0 -#define VTCFG_R_GUESTCAP 4 -#define VTCFG_R_PFN 8 -#define VTCFG_R_QNUM 12 -#define VTCFG_R_QSEL 14 -#define VTCFG_R_QNOTIFY 16 -#define VTCFG_R_STATUS 18 -#define VTCFG_R_ISR 19 -#define VTCFG_R_CFGVEC 20 -#define VTCFG_R_QVEC 22 -#define VTCFG_R_CFG0 20 /* No MSI-X */ -#define VTCFG_R_CFG1 24 /* With MSI-X */ -#define VTCFG_R_MSIX 20 +#define VTCFG_R_HOSTCAP 0 +#define VTCFG_R_GUESTCAP 4 +#define VTCFG_R_PFN 8 +#define VTCFG_R_QNUM 12 +#define VTCFG_R_QSEL 14 +#define VTCFG_R_QNOTIFY 16 +#define VTCFG_R_STATUS 18 +#define VTCFG_R_ISR 19 +#define VTCFG_R_CFGVEC 20 +#define VTCFG_R_QVEC 22 +#define VTCFG_R_CFG0 20 /* No MSI-X */ +#define VTCFG_R_CFG1 24 /* With MSI-X */ +#define VTCFG_R_MSIX 20 /* * Bits in VTCFG_R_STATUS. Guests need not actually set any of these, @@ -256,7 +257,7 @@ struct vring_used { #define VTCFG_ISR_QUEUES 0x01 /* re-scan queues */ #define VTCFG_ISR_CONF_CHANGED 0x80 /* configuration changed */ -#define VIRTIO_MSI_NO_VECTOR 0xFFFF +#define VIRTIO_MSI_NO_VECTOR 0xFFFF /* * Feature flags. 
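Two details of the legacy virtio register layout above are easy to trip over: the guest writes a page frame number to VTCFG_R_PFN, so the device recovers the ring's physical address by shifting left by VRING_PFN (12); and the device-specific config region begins at VTCFG_R_CFG1 (24) when MSI-X is enabled but at VTCFG_R_CFG0 (20) when it is not. A sketch of both computations:

/* Ring physical address from the value the guest wrote to VTCFG_R_PFN. */
static uint64_t
example_vring_physaddr(uint32_t pfn)
{
	return ((uint64_t)pfn << VRING_PFN);
}

/* Start of device-specific config space; it moves when MSI-X is on. */
static int
example_cfg_offset(int msix_enabled)
{
	return (msix_enabled ? VTCFG_R_CFG1 : VTCFG_R_CFG0);
}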
diff --git a/usr/src/cmd/bhyvectl/bhyvectl.c b/usr/src/cmd/bhyvectl/bhyvectl.c index 5f8932efa8..d7179d5874 100644 --- a/usr/src/cmd/bhyvectl/bhyvectl.c +++ b/usr/src/cmd/bhyvectl/bhyvectl.c @@ -868,7 +868,7 @@ get_all_registers(struct vmctx *ctx, int vcpu) if (error == 0) printf("rflags[%d]\t0x%016lx\n", vcpu, rflags); } - + return (error); } @@ -1135,7 +1135,7 @@ get_misc_vmcs(struct vmctx *ctx, int vcpu) vcpu, u64); } } - + if (!error && (get_tpr_threshold || get_all)) { uint64_t threshold; error = vm_get_vmcs_field(ctx, vcpu, VMCS_TPR_THRESHOLD, @@ -1153,7 +1153,7 @@ get_misc_vmcs(struct vmctx *ctx, int vcpu) vcpu, insterr); } } - + if (!error && (get_exit_ctls || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_CTLS, &ctl); if (error == 0) @@ -1201,7 +1201,7 @@ get_misc_vmcs(struct vmctx *ctx, int vcpu) if (error == 0) printf("host_rsp[%d]\t\t0x%016lx\n", vcpu, rsp); } - + if (!error && (get_vmcs_link || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_LINK_POINTER, &addr); if (error == 0) diff --git a/usr/src/compat/freebsd/amd64/machine/atomic.h b/usr/src/compat/freebsd/amd64/machine/atomic.h index 0b5998880e..6d8235d37c 100644 --- a/usr/src/compat/freebsd/amd64/machine/atomic.h +++ b/usr/src/compat/freebsd/amd64/machine/atomic.h @@ -18,6 +18,17 @@ #define _COMPAT_FREEBSD_AMD64_MACHINE_ATOMIC_H_ static __inline u_int +atomic_load_acq_short(volatile u_short *p) +{ + u_short res; + + res = *p; + __asm volatile("" : : : "memory"); + + return (res); +} + +static __inline u_int atomic_load_acq_int(volatile u_int *p) { u_int res; @@ -96,6 +107,23 @@ atomic_cmpset_long(volatile u_long *dst, u_long expect, u_long src) return (res); } +static __inline int +atomic_testandset_int(volatile u_int *p, u_int v) +{ + u_char res; + + __asm __volatile( + " lock ; " + " btsl %2,%1 ; " + " setc %0 ; " + "# atomic_testandset_int" + : "=q" (res), /* 0 */ + "+m" (*p) /* 1 */ + : "Ir" (v & 0x1f) /* 2 */ + : "cc"); + return (res); +} + /* * Atomically add the value of v to the integer pointed to by p and return * the previous value of *p. @@ -188,6 +216,13 @@ atomic_swap_long(volatile u_long *p, u_long v) return (v); } + +#define atomic_store_short(p, v) \ + (*(volatile u_short *)(p) = (u_short)(v)) +#define atomic_store_int(p, v) \ + (*(volatile u_int *)(p) = (u_int)(v)) + + #define atomic_readandclear_int(p) atomic_swap_int(p, 0) #define atomic_readandclear_long(p) atomic_swap_long(p, 0) diff --git a/usr/src/compat/freebsd/amd64/machine/reg.h b/usr/src/compat/freebsd/amd64/machine/reg.h new file mode 100644 index 0000000000..4a73463603 --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/reg.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. 
+ */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_REG_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_REG_H_ + +#define DBREG_DR6_RESERVED1 0xffff0ff0 +#define DBREG_DR7_RESERVED1 0x0400 + + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_REG_H_ */ diff --git a/usr/src/compat/freebsd/sys/endian.h b/usr/src/compat/freebsd/sys/endian.h index a31bff55d6..24ea02d251 100644 --- a/usr/src/compat/freebsd/sys/endian.h +++ b/usr/src/compat/freebsd/sys/endian.h @@ -11,6 +11,7 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_ENDIAN_H_ @@ -122,4 +123,14 @@ le64enc(void *pp, uint64_t u) le32enc(p + 4, (uint32_t)(u >> 32)); } +#ifdef _LITTLE_ENDIAN +#define htole16(x) ((uint16_t)(x)) +#define htole32(x) ((uint32_t)(x)) +#define htole64(x) ((uint64_t)(x)) + +#define le16toh(x) ((uint16_t)(x)) +#define le32toh(x) ((uint32_t)(x)) +#define le64toh(x) ((uint64_t)(x)) +#endif + #endif /* _COMPAT_FREEBSD_SYS_ENDIAN_H_ */ diff --git a/usr/src/lib/libvmmapi/common/vmmapi.c b/usr/src/lib/libvmmapi/common/vmmapi.c index c34bb60de6..7d20a3b323 100644 --- a/usr/src/lib/libvmmapi/common/vmmapi.c +++ b/usr/src/lib/libvmmapi/common/vmmapi.c @@ -77,8 +77,11 @@ __FBSDID("$FreeBSD$"); #ifndef __FreeBSD__ /* shim to no-op for now */ -#define MAP_NOCORE 0 -#define MAP_ALIGNED_SUPER 0 +#define MAP_NOCORE 0 +#define MAP_ALIGNED_SUPER 0 + +/* Rely on PROT_NONE for guard purposes */ +#define MAP_GUARD (MAP_PRIVATE | MAP_ANON | MAP_NORESERVE) #endif /* @@ -135,19 +138,19 @@ vm_do_ctl(int cmd, const char *name) static int vm_device_open(const char *name) { - int fd, len; - char *vmfile; + int fd, len; + char *vmfile; len = strlen("/dev/vmm/") + strlen(name) + 1; vmfile = malloc(len); assert(vmfile != NULL); snprintf(vmfile, len, "/dev/vmm/%s", name); - /* Open the device file */ - fd = open(vmfile, O_RDWR, 0); + /* Open the device file */ + fd = open(vmfile, O_RDWR, 0); free(vmfile); - return (fd); + return (fd); } int @@ -425,7 +428,7 @@ vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms) size_t objsize, len; vm_paddr_t gpa; char *baseaddr, *ptr; - int error, flags; + int error; assert(vms == VM_MMAP_ALL); @@ -454,16 +457,7 @@ vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms) * and the adjoining guard regions. */ len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE; - flags = MAP_PRIVATE | MAP_ANON | MAP_NOCORE | MAP_ALIGNED_SUPER; -#ifndef __FreeBSD__ - /* - * There is no need to reserve swap for the guest physical memory and - * guard regions. Actual memory is allocated and mapped later through - * vm_alloc_memseg() and setup_memory_segment(). - */ - flags |= MAP_NORESERVE; -#endif - ptr = mmap(NULL, len, PROT_NONE, flags, -1, 0); + ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0); if (ptr == MAP_FAILED) return (-1); @@ -607,8 +601,8 @@ vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len) * adjoining guard regions. 
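+ * The resulting layout is [guard | mapping | guard]: both guards are
+ * VM_MMAP_GUARD_SIZE of PROT_NONE (MAP_GUARD above is shimmed to
+ * MAP_PRIVATE|MAP_ANON|MAP_NORESERVE on illumos), so a stray access
+ * just past either end faults instead of touching unrelated memory.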
*/ len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE; - flags = MAP_PRIVATE | MAP_ANON | MAP_NOCORE | MAP_ALIGNED_SUPER; - base = mmap(NULL, len2, PROT_NONE, flags, -1, 0); + base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, + 0); if (base == MAP_FAILED) goto done; @@ -997,7 +991,7 @@ vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val) vmcap.cpuid = vcpu; vmcap.captype = cap; vmcap.capval = val; - + return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap)); } @@ -1801,4 +1795,3 @@ vm_get_ioctls(size_t *len) return (NULL); } #endif /* __FreeBSD__ */ - diff --git a/usr/src/uts/i86pc/io/vmm/README.sync b/usr/src/uts/i86pc/io/vmm/README.sync index 667f34b9de..e9a2479b13 100644 --- a/usr/src/uts/i86pc/io/vmm/README.sync +++ b/usr/src/uts/i86pc/io/vmm/README.sync @@ -1,22 +1,18 @@ The bhyve kernel module and its associated userland consumers have been updated to the latest upstream FreeBSD sources as of: -commit 0fac2150fc0f1befa5803ca010ed63a6335847ad -Author: grehan <grehan@FreeBSD.org> -Date: Fri May 4 01:36:49 2018 +0000 +commit f81459bd8363602ed5e436f10288320419e80ccf +Author: andrew <andrew@FreeBSD.org> +Date: Thu Sep 27 11:16:19 2018 +0000 - Allow arbitrary numbers of columns for VNC server screen resolution. + Handle a guest executing a vm instruction by trapping and raising an + undefined instruction exception. Previously we would exit the guest, + however an unprivileged user could execute these. - The prior code only allowed multiples of 32 for the - numbers of columns. Remove this restriction to allow - a forthcoming UEFI firmware update to allow arbitrary - x,y resolutions. + Found with: syzkaller + Reviewed by: araujo, tychon (previous version) + Approved by: re (kib) + MFC after: 1 week + Differential Revision: https://reviews.freebsd.org/D17192 - (the code for handling rows already supported non mult-32 values) - - Reviewed by: Leon Dang (original author) - MFC after: 3 weeks - Differential Revision: https://reviews.freebsd.org/D15274 - - -Which corresponds to SVN revision: 333235 +Which corresponds to SVN revision: 338957 diff --git a/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c b/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c new file mode 100644 index 0000000000..f6b6e60363 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c @@ -0,0 +1,1461 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016, Anish Gupta (anish@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/malloc.h> +#include <sys/pcpu.h> +#include <sys/rman.h> +#include <sys/smp.h> +#include <sys/sysctl.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <dev/pci/pcivar.h> +#include <dev/pci/pcireg.h> + +#include <machine/resource.h> +#include <machine/vmm.h> +#include <machine/pmap.h> +#include <machine/vmparam.h> +#include <machine/pci_cfgreg.h> + +#include "pcib_if.h" + +#include "io/iommu.h" +#include "amdvi_priv.h" + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, amdvi, CTLFLAG_RW, NULL, NULL); + +#define MOD_INC(a, s, m) (((a) + (s)) % ((m) * (s))) +#define MOD_DEC(a, s, m) (((a) - (s)) % ((m) * (s))) + +/* Print RID or device ID in PCI string format. */ +#define RID2PCI_STR(d) PCI_RID2BUS(d), PCI_RID2SLOT(d), PCI_RID2FUNC(d) + +static void amdvi_dump_cmds(struct amdvi_softc *softc); +static void amdvi_print_dev_cap(struct amdvi_softc *softc); + +MALLOC_DEFINE(M_AMDVI, "amdvi", "amdvi"); + +extern device_t *ivhd_devs; + +extern int ivhd_count; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, count, CTLFLAG_RDTUN, &ivhd_count, + 0, NULL); + +static int amdvi_enable_user = 0; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, enable, CTLFLAG_RDTUN, + &amdvi_enable_user, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi_enable", &amdvi_enable_user); + +#ifdef AMDVI_ATS_ENABLE +/* XXX: ATS is not tested. */ +static int amdvi_enable_iotlb = 1; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, iotlb_enabled, CTLFLAG_RDTUN, + &amdvi_enable_iotlb, 0, NULL); +TUNABLE_INT("hw.vmm.enable_iotlb", &amdvi_enable_iotlb); +#endif + +static int amdvi_host_ptp = 1; /* Use page tables for host. */ +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, host_ptp, CTLFLAG_RDTUN, + &amdvi_host_ptp, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi.host_ptp", &amdvi_host_ptp); + +/* Page table level used <= supported by h/w[v1=7]. */ +static int amdvi_ptp_level = 4; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, ptp_level, CTLFLAG_RDTUN, + &amdvi_ptp_level, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi.ptp_level", &amdvi_ptp_level); + +/* Disable fault event reporting. */ +static int amdvi_disable_io_fault = 0; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, disable_io_fault, CTLFLAG_RDTUN, + &amdvi_disable_io_fault, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi.disable_io_fault", &amdvi_disable_io_fault); + +static uint32_t amdvi_dom_id = 0; /* 0 is reserved for host. */ +SYSCTL_UINT(_hw_vmm_amdvi, OID_AUTO, domain_id, CTLFLAG_RD, + &amdvi_dom_id, 0, NULL); +/* + * Device table entry. + * Bus(256) x Dev(32) x Fun(8) x DTE(256 bits or 32 bytes). + * = 256 * 2 * PAGE_SIZE. 
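+ * That is: 65536 requester IDs (256 buses x 32 slots x 8 functions),
+ * each with a 32-byte DTE, comes to 65536 * 32 = 0x200000 bytes (2 MB),
+ * which is exactly what the CTASSERTs below pin down.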
+ */ +static struct amdvi_dte amdvi_dte[PCI_NUM_DEV_MAX] __aligned(PAGE_SIZE); +CTASSERT(PCI_NUM_DEV_MAX == 0x10000); +CTASSERT(sizeof(amdvi_dte) == 0x200000); + +static SLIST_HEAD (, amdvi_domain) dom_head; + +static inline uint32_t +amdvi_pci_read(struct amdvi_softc *softc, int off) +{ + + return (pci_cfgregread(PCI_RID2BUS(softc->pci_rid), + PCI_RID2SLOT(softc->pci_rid), PCI_RID2FUNC(softc->pci_rid), + off, 4)); +} + +#ifdef AMDVI_ATS_ENABLE +/* XXX: Should be in pci.c */ +/* + * Check if device has ATS capability and its enabled. + * If ATS is absent or disabled, return (-1), otherwise ATS + * queue length. + */ +static int +amdvi_find_ats_qlen(uint16_t devid) +{ + device_t dev; + uint32_t off, cap; + int qlen = -1; + + dev = pci_find_bsf(PCI_RID2BUS(devid), PCI_RID2SLOT(devid), + PCI_RID2FUNC(devid)); + + if (!dev) { + return (-1); + } +#define PCIM_ATS_EN BIT(31) + + if (pci_find_extcap(dev, PCIZ_ATS, &off) == 0) { + cap = pci_read_config(dev, off + 4, 4); + qlen = (cap & 0x1F); + qlen = qlen ? qlen : 32; + printf("AMD-Vi: PCI device %d.%d.%d ATS %s qlen=%d\n", + RID2PCI_STR(devid), + (cap & PCIM_ATS_EN) ? "enabled" : "Disabled", + qlen); + qlen = (cap & PCIM_ATS_EN) ? qlen : -1; + } + + return (qlen); +} + +/* + * Check if an endpoint device support device IOTLB or ATS. + */ +static inline bool +amdvi_dev_support_iotlb(struct amdvi_softc *softc, uint16_t devid) +{ + struct ivhd_dev_cfg *cfg; + int qlen, i; + bool pci_ats, ivhd_ats; + + qlen = amdvi_find_ats_qlen(devid); + if (qlen < 0) + return (false); + + KASSERT(softc, ("softc is NULL")); + cfg = softc->dev_cfg; + + ivhd_ats = false; + for (i = 0; i < softc->dev_cfg_cnt; i++) { + if ((cfg->start_id <= devid) && (cfg->end_id >= devid)) { + ivhd_ats = cfg->enable_ats; + break; + } + cfg++; + } + + pci_ats = (qlen < 0) ? false : true; + if (pci_ats != ivhd_ats) + device_printf(softc->dev, + "BIOS bug: mismatch in ATS setting for %d.%d.%d," + "ATS inv qlen = %d\n", RID2PCI_STR(devid), qlen); + + /* Ignore IVRS setting and respect PCI setting. */ + return (pci_ats); +} +#endif + +/* Enable IOTLB support for IOMMU if its supported. */ +static inline void +amdvi_hw_enable_iotlb(struct amdvi_softc *softc) +{ +#ifndef AMDVI_ATS_ENABLE + softc->iotlb = false; +#else + bool supported; + + supported = (softc->ivhd_flag & IVHD_FLAG_IOTLB) ? true : false; + + if (softc->pci_cap & AMDVI_PCI_CAP_IOTLB) { + if (!supported) + device_printf(softc->dev, "IOTLB disabled by BIOS.\n"); + + if (supported && !amdvi_enable_iotlb) { + device_printf(softc->dev, "IOTLB disabled by user.\n"); + supported = false; + } + } else + supported = false; + + softc->iotlb = supported; + +#endif +} + +static int +amdvi_init_cmd(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl = softc->ctrl; + + ctrl->cmd.len = 8; /* Use 256 command buffer entries. */ + softc->cmd_max = 1 << ctrl->cmd.len; + + softc->cmd = malloc(sizeof(struct amdvi_cmd) * + softc->cmd_max, M_AMDVI, M_WAITOK | M_ZERO); + + if ((uintptr_t)softc->cmd & PAGE_MASK) + panic("AMDVi: Command buffer not aligned on page boundary."); + + ctrl->cmd.base = vtophys(softc->cmd) / PAGE_SIZE; + /* + * XXX: Reset the h/w pointers in case IOMMU is restarting, + * h/w doesn't clear these pointers based on empirical data. + */ + ctrl->cmd_tail = 0; + ctrl->cmd_head = 0; + + return (0); +} + +/* + * Note: Update tail pointer after we have written the command since tail + * pointer update cause h/w to execute new commands, see section 3.3 + * of AMD IOMMU spec ver 2.0. 
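+ * The head/tail registers hold byte offsets, so MOD_INC() advances the
+ * tail by sizeof(struct amdvi_cmd) and wraps it modulo the ring size
+ * (cmd_max entries); the IOMMU begins fetching as soon as the tail moves.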
+ */ +/* Get the command tail pointer w/o updating it. */ +static struct amdvi_cmd * +amdvi_get_cmd_tail(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_cmd *tail; + + KASSERT(softc, ("softc is NULL")); + KASSERT(softc->cmd != NULL, ("cmd is NULL")); + + ctrl = softc->ctrl; + KASSERT(ctrl != NULL, ("ctrl is NULL")); + + tail = (struct amdvi_cmd *)((uint8_t *)softc->cmd + + ctrl->cmd_tail); + + return (tail); +} + +/* + * Update the command tail pointer which will start command execution. + */ +static void +amdvi_update_cmd_tail(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + int size; + + size = sizeof(struct amdvi_cmd); + KASSERT(softc->cmd != NULL, ("cmd is NULL")); + + ctrl = softc->ctrl; + KASSERT(ctrl != NULL, ("ctrl is NULL")); + + ctrl->cmd_tail = MOD_INC(ctrl->cmd_tail, size, softc->cmd_max); + softc->total_cmd++; + +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "cmd_tail: %s Tail:0x%x, Head:0x%x.\n", + ctrl->cmd_tail, + ctrl->cmd_head); +#endif + +} + +/* + * Various commands supported by IOMMU. + */ + +/* Completion wait command. */ +static void +amdvi_cmd_cmp(struct amdvi_softc *softc, const uint64_t data) +{ + struct amdvi_cmd *cmd; + uint64_t pa; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + + pa = vtophys(&softc->cmp_data); + cmd->opcode = AMDVI_CMP_WAIT_OPCODE; + cmd->word0 = (pa & 0xFFFFFFF8) | + (AMDVI_CMP_WAIT_STORE); + //(AMDVI_CMP_WAIT_FLUSH | AMDVI_CMP_WAIT_STORE); + cmd->word1 = (pa >> 32) & 0xFFFFF; + cmd->addr = data; + + amdvi_update_cmd_tail(softc); +} + +/* Invalidate device table entry. */ +static void +amdvi_cmd_inv_dte(struct amdvi_softc *softc, uint16_t devid) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + cmd->opcode = AMDVI_INVD_DTE_OPCODE; + cmd->word0 = devid; + amdvi_update_cmd_tail(softc); +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidated DTE:0x%x\n", devid); +#endif +} + +/* Invalidate IOMMU page, use for invalidation of domain. */ +static void +amdvi_cmd_inv_iommu_pages(struct amdvi_softc *softc, uint16_t domain_id, + uint64_t addr, bool guest_nested, + bool pde, bool page) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + + + cmd->opcode = AMDVI_INVD_PAGE_OPCODE; + cmd->word1 = domain_id; + /* + * Invalidate all addresses for this domain. + */ + cmd->addr = addr; + cmd->addr |= pde ? AMDVI_INVD_PAGE_PDE : 0; + cmd->addr |= page ? AMDVI_INVD_PAGE_S : 0; + + amdvi_update_cmd_tail(softc); +} + +#ifdef AMDVI_ATS_ENABLE +/* Invalidate device IOTLB. */ +static void +amdvi_cmd_inv_iotlb(struct amdvi_softc *softc, uint16_t devid) +{ + struct amdvi_cmd *cmd; + int qlen; + + if (!softc->iotlb) + return; + + qlen = amdvi_find_ats_qlen(devid); + if (qlen < 0) { + panic("AMDVI: Invalid ATS qlen(%d) for device %d.%d.%d\n", + qlen, RID2PCI_STR(devid)); + } + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidate IOTLB devID 0x%x" + " Qlen:%d\n", devid, qlen); +#endif + cmd->opcode = AMDVI_INVD_IOTLB_OPCODE; + cmd->word0 = devid; + cmd->word1 = qlen; + cmd->addr = AMDVI_INVD_IOTLB_ALL_ADDR | + AMDVI_INVD_IOTLB_S; + amdvi_update_cmd_tail(softc); +} +#endif + +#ifdef notyet /* For Interrupt Remap. 
*/ +static void +amdvi_cmd_inv_intr_map(struct amdvi_softc *softc, + uint16_t devid) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + cmd->opcode = AMDVI_INVD_INTR_OPCODE; + cmd->word0 = devid; + amdvi_update_cmd_tail(softc); +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidate INTR map of devID 0x%x\n", devid); +#endif +} +#endif + +/* Invalidate domain using INVALIDATE_IOMMU_PAGES command. */ +static void +amdvi_inv_domain(struct amdvi_softc *softc, uint16_t domain_id) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + + /* + * See section 3.3.3 of IOMMU spec rev 2.0, software note + * for invalidating domain. + */ + amdvi_cmd_inv_iommu_pages(softc, domain_id, AMDVI_INVD_PAGE_ALL_ADDR, + false, true, true); + +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidate domain:0x%x\n", domain_id); + +#endif +} + +static bool +amdvi_cmp_wait(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + const uint64_t VERIFY = 0xA5A5; + volatile uint64_t *read; + int i; + bool status; + + ctrl = softc->ctrl; + read = &softc->cmp_data; + *read = 0; + amdvi_cmd_cmp(softc, VERIFY); + /* Wait for h/w to update completion data. */ + for (i = 0; i < 100 && (*read != VERIFY); i++) { + DELAY(1000); /* 1 ms */ + } + status = (VERIFY == softc->cmp_data) ? true : false; + +#ifdef AMDVI_DEBUG_CMD + if (status) + device_printf(softc->dev, "CMD completion DONE Tail:0x%x, " + "Head:0x%x, loop:%d.\n", ctrl->cmd_tail, + ctrl->cmd_head, loop); +#endif + return (status); +} + +static void +amdvi_wait(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + int i; + + KASSERT(softc, ("softc is NULL")); + + ctrl = softc->ctrl; + KASSERT(ctrl != NULL, ("ctrl is NULL")); + /* Don't wait if h/w is not enabled. */ + if ((ctrl->control & AMDVI_CTRL_EN) == 0) + return; + + for (i = 0; i < 10; i++) { + if (amdvi_cmp_wait(softc)) + return; + } + + device_printf(softc->dev, "Error: completion failed" + " tail:0x%x, head:0x%x.\n", + ctrl->cmd_tail, ctrl->cmd_head); + amdvi_dump_cmds(softc); +} + +static void +amdvi_dump_cmds(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_cmd *cmd; + int off, i; + + ctrl = softc->ctrl; + device_printf(softc->dev, "Dump all the commands:\n"); + /* + * If h/w is stuck in completion, it is the previous command, + * start dumping from previous command onward. + */ + off = MOD_DEC(ctrl->cmd_head, sizeof(struct amdvi_cmd), + softc->cmd_max); + for (i = 0; off != ctrl->cmd_tail && + i < softc->cmd_max; i++) { + cmd = (struct amdvi_cmd *)((uint8_t *)softc->cmd + off); + printf(" [CMD%d, off:0x%x] opcode= 0x%x 0x%x" + " 0x%x 0x%lx\n", i, off, cmd->opcode, + cmd->word0, cmd->word1, cmd->addr); + off = (off + sizeof(struct amdvi_cmd)) % + (softc->cmd_max * sizeof(struct amdvi_cmd)); + } +} + +static int +amdvi_init_event(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + + ctrl = softc->ctrl; + ctrl->event.len = 8; + softc->event_max = 1 << ctrl->event.len; + softc->event = malloc(sizeof(struct amdvi_event) * + softc->event_max, M_AMDVI, M_WAITOK | M_ZERO); + if ((uintptr_t)softc->event & PAGE_MASK) { + device_printf(softc->dev, "Event buffer not aligned on page."); + return (false); + } + ctrl->event.base = vtophys(softc->event) / PAGE_SIZE; + + /* Reset the pointers. 
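+ * (Like cmd_head/cmd_tail, these are byte offsets into the ring that
+ * the hardware does not reliably clear across a restart.)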
*/
+	ctrl->evt_head = 0;
+	ctrl->evt_tail = 0;
+
+	return (0);
+}
+
+static inline void
+amdvi_decode_evt_flag(uint16_t flag)
+{
+
+	flag &= AMDVI_EVENT_FLAG_MASK;
+	printf(" 0x%b]\n", flag,
+	    "\020"
+	    "\001GN"
+	    "\002NX"
+	    "\003US"
+	    "\004I"
+	    "\005PR"
+	    "\006RW"
+	    "\007PE"
+	    "\010RZ"
+	    "\011TR"
+	    );
+}
+
+/* See section 2.5.4 of AMD IOMMU spec ver 2.62. */
+static inline void
+amdvi_decode_evt_flag_type(uint8_t type)
+{
+
+	switch (AMDVI_EVENT_FLAG_TYPE(type)) {
+	case 0:
+		printf("RSVD\n");
+		break;
+	case 1:
+		printf("Master Abort\n");
+		break;
+	case 2:
+		printf("Target Abort\n");
+		break;
+	case 3:
+		printf("Data Err\n");
+		break;
+	default:
+		break;
+	}
+}
+
+static void
+amdvi_decode_inv_dte_evt(uint16_t devid, uint16_t domid, uint64_t addr,
+    uint16_t flag)
+{
+
+	printf("\t[INVALID_DTE EVT: devId:0x%x DomId:0x%x"
+	    " Addr:0x%lx",
+	    devid, domid, addr);
+	amdvi_decode_evt_flag(flag);
+}
+
+static void
+amdvi_decode_pf_evt(uint16_t devid, uint16_t domid, uint64_t addr,
+    uint16_t flag)
+{
+
+	printf("\t[IO_PAGE_FAULT EVT: devId:0x%x DomId:0x%x"
+	    " Addr:0x%lx",
+	    devid, domid, addr);
+	amdvi_decode_evt_flag(flag);
+}
+
+static void
+amdvi_decode_dte_hwerr_evt(uint16_t devid, uint16_t domid,
+    uint64_t addr, uint16_t flag)
+{
+
+	printf("\t[DEV_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x"
+	    " Addr:0x%lx", devid, domid, addr);
+	amdvi_decode_evt_flag(flag);
+	amdvi_decode_evt_flag_type(flag);
+}
+
+static void
+amdvi_decode_page_hwerr_evt(uint16_t devid, uint16_t domid, uint64_t addr,
+    uint16_t flag)
+{
+
+	printf("\t[PAGE_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x"
+	    " Addr:0x%lx", devid, domid, addr);
+	amdvi_decode_evt_flag(flag);
+	amdvi_decode_evt_flag_type(AMDVI_EVENT_FLAG_TYPE(flag));
+}
+
+static void
+amdvi_decode_evt(struct amdvi_event *evt)
+{
+	struct amdvi_cmd *cmd;
+
+	switch (evt->opcode) {
+	case AMDVI_EVENT_INVALID_DTE:
+		amdvi_decode_inv_dte_evt(evt->devid, evt->pasid_domid,
+		    evt->addr, evt->flag);
+		break;
+
+	case AMDVI_EVENT_PFAULT:
+		amdvi_decode_pf_evt(evt->devid, evt->pasid_domid,
+		    evt->addr, evt->flag);
+		break;
+
+	case AMDVI_EVENT_DTE_HW_ERROR:
+		amdvi_decode_dte_hwerr_evt(evt->devid, evt->pasid_domid,
+		    evt->addr, evt->flag);
+		break;
+
+	case AMDVI_EVENT_PAGE_HW_ERROR:
+		amdvi_decode_page_hwerr_evt(evt->devid, evt->pasid_domid,
+		    evt->addr, evt->flag);
+		break;
+
+	case AMDVI_EVENT_ILLEGAL_CMD:
+		/* FALL THROUGH */
+	case AMDVI_EVENT_CMD_HW_ERROR:
+		printf("\t[%s EVT]\n", (evt->opcode == AMDVI_EVENT_ILLEGAL_CMD) ?
+ "ILLEGAL CMD" : "CMD HW ERR"); + cmd = (struct amdvi_cmd *)PHYS_TO_DMAP(evt->addr); + printf("\tCMD opcode= 0x%x 0x%x 0x%x 0x%lx\n", + cmd->opcode, cmd->word0, cmd->word1, cmd->addr); + break; + + case AMDVI_EVENT_IOTLB_TIMEOUT: + printf("\t[IOTLB_INV_TIMEOUT devid:0x%x addr:0x%lx]\n", + evt->devid, evt->addr); + break; + + case AMDVI_EVENT_INVALID_DTE_REQ: + printf("\t[INV_DTE devid:0x%x addr:0x%lx type:0x%x tr:%d]\n", + evt->devid, evt->addr, evt->flag >> 9, + (evt->flag >> 8) & 1); + break; + + case AMDVI_EVENT_INVALID_PPR_REQ: + case AMDVI_EVENT_COUNTER_ZERO: + printf("AMD-Vi: v2 events.\n"); + break; + + default: + printf("Unsupported AMD-Vi event:%d\n", evt->opcode); + } +} + +static void +amdvi_print_events(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_event *event; + int i, size; + + ctrl = softc->ctrl; + size = sizeof(struct amdvi_event); + for (i = 0; i < softc->event_max; i++) { + event = &softc->event[ctrl->evt_head / size]; + if (!event->opcode) + break; + device_printf(softc->dev, "\t[Event%d: Head:0x%x Tail:0x%x]\n", + i, ctrl->evt_head, ctrl->evt_tail); + amdvi_decode_evt(event); + ctrl->evt_head = MOD_INC(ctrl->evt_head, size, + softc->event_max); + } +} + +static int +amdvi_init_dte(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + + ctrl = softc->ctrl; + ctrl->dte.base = vtophys(amdvi_dte) / PAGE_SIZE; + ctrl->dte.size = 0x1FF; /* 2MB device table. */ + + return (0); +} + +/* + * Not all capabilities of IOMMU are available in ACPI IVHD flag + * or EFR entry, read directly from device. + */ +static int +amdvi_print_pci_cap(device_t dev) +{ + struct amdvi_softc *softc; + uint32_t off, cap; + + + softc = device_get_softc(dev); + off = softc->cap_off; + + /* + * Section 3.7.1 of IOMMU sepc rev 2.0. + * Read capability from device. + */ + cap = amdvi_pci_read(softc, off); + + /* Make sure capability type[18:16] is 3. 
*/ + KASSERT((((cap >> 16) & 0x7) == 0x3), + ("Not a IOMMU capability 0x%x@0x%x", cap, off)); + + softc->pci_cap = cap >> 24; + device_printf(softc->dev, "PCI cap 0x%x@0x%x feature:%b\n", + cap, off, softc->pci_cap, + "\20\1IOTLB\2HT\3NPCache\4EFR\5CapExt"); + + return (0); +} + +static void +amdvi_event_intr(void *arg) +{ + struct amdvi_softc *softc; + struct amdvi_ctrl *ctrl; + + softc = (struct amdvi_softc *)arg; + ctrl = softc->ctrl; + device_printf(softc->dev, "EVT INTR %ld Status:0x%x" + " EVT Head:0x%x Tail:0x%x]\n", softc->event_intr_cnt++, + ctrl->status, ctrl->evt_head, ctrl->evt_tail); + printf(" [CMD Total 0x%lx] Tail:0x%x, Head:0x%x.\n", + softc->total_cmd, ctrl->cmd_tail, ctrl->cmd_head); + + amdvi_print_events(softc); + ctrl->status &= AMDVI_STATUS_EV_OF | AMDVI_STATUS_EV_INTR; +} + +static void +amdvi_free_evt_intr_res(device_t dev) +{ + + struct amdvi_softc *softc; + + softc = device_get_softc(dev); + if (softc->event_tag != NULL) { + bus_teardown_intr(dev, softc->event_res, softc->event_tag); + } + if (softc->event_res != NULL) { + bus_release_resource(dev, SYS_RES_IRQ, softc->event_rid, + softc->event_res); + } + bus_delete_resource(dev, SYS_RES_IRQ, softc->event_rid); + PCIB_RELEASE_MSI(device_get_parent(device_get_parent(dev)), + dev, 1, &softc->event_irq); +} + +static bool +amdvi_alloc_intr_resources(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + device_t dev, pcib; + device_t mmio_dev; + uint64_t msi_addr; + uint32_t msi_data; + int err; + + dev = softc->dev; + pcib = device_get_parent(device_get_parent(dev)); + mmio_dev = pci_find_bsf(PCI_RID2BUS(softc->pci_rid), + PCI_RID2SLOT(softc->pci_rid), PCI_RID2FUNC(softc->pci_rid)); + if (device_is_attached(mmio_dev)) { + device_printf(dev, + "warning: IOMMU device is claimed by another driver %s\n", + device_get_driver(mmio_dev)->name); + } + + softc->event_irq = -1; + softc->event_rid = 0; + + /* + * Section 3.7.1 of IOMMU rev 2.0. With MSI, there is only one + * interrupt. XXX: Enable MSI/X support. + */ + err = PCIB_ALLOC_MSI(pcib, dev, 1, 1, &softc->event_irq); + if (err) { + device_printf(dev, + "Couldn't find event MSI IRQ resource.\n"); + return (ENOENT); + } + + err = bus_set_resource(dev, SYS_RES_IRQ, softc->event_rid, + softc->event_irq, 1); + if (err) { + device_printf(dev, "Couldn't set event MSI resource.\n"); + return (ENXIO); + } + + softc->event_res = bus_alloc_resource_any(dev, SYS_RES_IRQ, + &softc->event_rid, RF_ACTIVE); + if (!softc->event_res) { + device_printf(dev, + "Unable to allocate event INTR resource.\n"); + return (ENOMEM); + } + + if (bus_setup_intr(dev, softc->event_res, + INTR_TYPE_MISC | INTR_MPSAFE, NULL, amdvi_event_intr, + softc, &softc->event_tag)) { + device_printf(dev, "Fail to setup event intr\n"); + bus_release_resource(softc->dev, SYS_RES_IRQ, + softc->event_rid, softc->event_res); + softc->event_res = NULL; + return (ENXIO); + } + + bus_describe_intr(dev, softc->event_res, softc->event_tag, + "fault"); + + err = PCIB_MAP_MSI(pcib, dev, softc->event_irq, &msi_addr, + &msi_data); + if (err) { + device_printf(dev, + "Event interrupt config failed, err=%d.\n", + err); + amdvi_free_evt_intr_res(softc->dev); + return (err); + } + + /* Clear interrupt status bits. */ + ctrl = softc->ctrl; + ctrl->status &= AMDVI_STATUS_EV_OF | AMDVI_STATUS_EV_INTR; + + /* Now enable MSI interrupt. 
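+ * The msi_addr/msi_data pair programmed here is the one returned by
+ * PCIB_MAP_MSI above.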
*/
+	pci_enable_msi(mmio_dev, msi_addr, msi_data);
+	return (0);
+}
+
+
+static void
+amdvi_print_dev_cap(struct amdvi_softc *softc)
+{
+	struct ivhd_dev_cfg *cfg;
+	int i;
+
+	cfg = softc->dev_cfg;
+	for (i = 0; i < softc->dev_cfg_cnt; i++) {
+		device_printf(softc->dev, "device [0x%x - 0x%x] "
+		    "config:%b%s\n", cfg->start_id, cfg->end_id,
+		    cfg->data,
+		    "\020\001INIT\002ExtInt\003NMI"
+		    "\007LINT0\010LINT1",
+		    cfg->enable_ats ? "ATS enabled" : "");
+		cfg++;
+	}
+}
+
+static int
+amdvi_handle_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct amdvi_softc *softc;
+	int result, type, error = 0;
+
+	softc = (struct amdvi_softc *)arg1;
+	type = arg2;
+
+	switch (type) {
+	case 0:
+		result = softc->ctrl->cmd_head;
+		error = sysctl_handle_int(oidp, &result, 0,
+		    req);
+		break;
+	case 1:
+		result = softc->ctrl->cmd_tail;
+		error = sysctl_handle_int(oidp, &result, 0,
+		    req);
+		break;
+	case 2:
+		result = softc->ctrl->evt_head;
+		error = sysctl_handle_int(oidp, &result, 0,
+		    req);
+		break;
+	case 3:
+		result = softc->ctrl->evt_tail;
+		error = sysctl_handle_int(oidp, &result, 0,
+		    req);
+		break;
+
+	default:
+		device_printf(softc->dev, "Unknown sysctl:%d\n", type);
+	}
+
+	return (error);
+}
+
+static void
+amdvi_add_sysctl(struct amdvi_softc *softc)
+{
+	struct sysctl_oid_list *child;
+	struct sysctl_ctx_list *ctx;
+	device_t dev;
+
+	dev = softc->dev;
+	ctx = device_get_sysctl_ctx(dev);
+	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
+
+	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "event_intr_count", CTLFLAG_RD,
+	    &softc->event_intr_cnt, "Event interrupt count");
+	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "command_count", CTLFLAG_RD,
+	    &softc->total_cmd, "Command submitted count");
+	SYSCTL_ADD_U16(ctx, child, OID_AUTO, "pci_rid", CTLFLAG_RD,
+	    &softc->pci_rid, 0, "IOMMU RID");
+	SYSCTL_ADD_U16(ctx, child, OID_AUTO, "start_dev_rid", CTLFLAG_RD,
+	    &softc->start_dev_rid, 0, "Start of device under this IOMMU");
+	SYSCTL_ADD_U16(ctx, child, OID_AUTO, "end_dev_rid", CTLFLAG_RD,
+	    &softc->end_dev_rid, 0, "End of device under this IOMMU");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "command_head",
+	    CTLTYPE_UINT | CTLFLAG_RD, softc, 0,
+	    amdvi_handle_sysctl, "IU", "Command head");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "command_tail",
+	    CTLTYPE_UINT | CTLFLAG_RD, softc, 1,
+	    amdvi_handle_sysctl, "IU", "Command tail");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "event_head",
+	    CTLTYPE_UINT | CTLFLAG_RD, softc, 2,
+	    amdvi_handle_sysctl, "IU", "Event head");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "event_tail",
+	    CTLTYPE_UINT | CTLFLAG_RD, softc, 3,
+	    amdvi_handle_sysctl, "IU", "Event tail");
+}
+
+int
+amdvi_setup_hw(struct amdvi_softc *softc)
+{
+	device_t dev;
+	int status;
+
+	dev = softc->dev;
+
+	amdvi_hw_enable_iotlb(softc);
+
+	amdvi_print_dev_cap(softc);
+
+	if ((status = amdvi_print_pci_cap(dev)) != 0) {
+		device_printf(dev, "Couldn't read PCI capability.\n");
+		return (status);
+	}
+	if ((status = amdvi_init_cmd(softc)) != 0) {
+		device_printf(dev, "Couldn't configure command buffer.\n");
+		return (status);
+	}
+	if ((status = amdvi_init_event(softc)) != 0) {
+		device_printf(dev, "Couldn't configure event buffer.\n");
+		return (status);
+	}
+	if ((status = amdvi_init_dte(softc)) != 0) {
+		device_printf(dev, "Couldn't configure device table.\n");
+		return (status);
+	}
+	if ((status = amdvi_alloc_intr_resources(softc)) != 0) {
+		return (status);
+	}
+	amdvi_add_sysctl(softc);
+	return (0);
+}
+
+int
+amdvi_teardown_hw(struct amdvi_softc *softc)
+{
+	device_t dev;
+
+	dev = softc->dev;
+
+	/*
+	 * Called after disable, h/w
is stopped by now, free all the resources. + */ + amdvi_free_evt_intr_res(dev); + + if (softc->cmd) + free(softc->cmd, M_AMDVI); + + if (softc->event) + free(softc->event, M_AMDVI); + + return (0); +} + +/*********** bhyve interfaces *********************/ +static int +amdvi_init(void) +{ + if (!ivhd_count) { + return (EIO); + } + if (!amdvi_enable_user && ivhd_count) { + printf("bhyve: Found %d AMD-Vi/IOMMU device(s), " + "use hw.vmm.amdvi.enable=1 to enable pass-through.\n", + ivhd_count); + return (EINVAL); + } + return (0); +} + +static void +amdvi_cleanup(void) +{ + /* Nothing. */ +} + +static uint16_t +amdvi_domainId(void) +{ + + /* + * If we hit maximum domain limit, rollover leaving host + * domain(0). + * XXX: make sure that this domain is not used. + */ + if (amdvi_dom_id == AMDVI_MAX_DOMAIN) + amdvi_dom_id = 1; + + return ((uint16_t)amdvi_dom_id++); +} + +static void +amdvi_do_inv_domain(uint16_t domain_id, bool create) +{ + struct amdvi_softc *softc; + int i; + + for (i = 0; i < ivhd_count; i++) { + softc = device_get_softc(ivhd_devs[i]); + KASSERT(softc, ("softc is NULL")); + /* + * If not present pages are cached, invalidate page after + * creating domain. + */ +#if 0 + if (create && ((softc->pci_cap & AMDVI_PCI_CAP_NPCACHE) == 0)) + continue; +#endif + amdvi_inv_domain(softc, domain_id); + amdvi_wait(softc); + } +} + +static void * +amdvi_create_domain(vm_paddr_t maxaddr) +{ + struct amdvi_domain *dom; + + dom = malloc(sizeof(struct amdvi_domain), M_AMDVI, M_ZERO | M_WAITOK); + dom->id = amdvi_domainId(); + //dom->maxaddr = maxaddr; +#ifdef AMDVI_DEBUG_CMD + printf("Created domain #%d\n", dom->id); +#endif + /* + * Host domain(#0) don't create translation table. + */ + if (dom->id || amdvi_host_ptp) + dom->ptp = malloc(PAGE_SIZE, M_AMDVI, M_WAITOK | M_ZERO); + + dom->ptp_level = amdvi_ptp_level; + + amdvi_do_inv_domain(dom->id, true); + SLIST_INSERT_HEAD(&dom_head, dom, next); + + return (dom); +} + +static void +amdvi_free_ptp(uint64_t *ptp, int level) +{ + int i; + + if (level < 1) + return; + + for (i = 0; i < NPTEPG ; i++) { + if ((ptp[i] & AMDVI_PT_PRESENT) == 0) + continue; + /* XXX: Add super-page or PTE mapping > 4KB. */ +#ifdef notyet + /* Super-page mapping. 
*/
+	if (AMDVI_PD_SUPER(ptp[i]))
+		continue;
+#endif
+
+		amdvi_free_ptp((uint64_t *)PHYS_TO_DMAP(ptp[i]
+		    & AMDVI_PT_MASK), level - 1);
+
+	}
+
+	free(ptp, M_AMDVI);
+}
+
+static void
+amdvi_destroy_domain(void *arg)
+{
+	struct amdvi_domain *domain;
+
+	domain = (struct amdvi_domain *)arg;
+	KASSERT(domain, ("domain is NULL"));
+#ifdef AMDVI_DEBUG_CMD
+	printf("Destroying domain %d\n", domain->id);
+#endif
+	if (domain->ptp)
+		amdvi_free_ptp(domain->ptp, domain->ptp_level);
+
+	amdvi_do_inv_domain(domain->id, false);
+	SLIST_REMOVE(&dom_head, domain, amdvi_domain, next);
+	free(domain, M_AMDVI);
+}
+
+static uint64_t
+amdvi_set_pt(uint64_t *pt, int level, vm_paddr_t gpa,
+    vm_paddr_t hpa, uint64_t pg_size, bool create)
+{
+	uint64_t *page, pa;
+	int shift, index;
+	const int PT_SHIFT = 9;
+	const int PT_INDEX_MASK = (1 << PT_SHIFT) - 1;	/* Based on PT_SHIFT */
+
+	if (!pg_size)
+		return (0);
+
+	if (hpa & (pg_size - 1)) {
+		printf("HPA is not size aligned.\n");
+		return (0);
+	}
+	if (gpa & (pg_size - 1)) {
+		printf("GPA is not size aligned.\n");
+		return (0);
+	}
+	shift = PML4SHIFT;
+	while ((shift > PAGE_SHIFT) && (pg_size < (1UL << shift))) {
+		index = (gpa >> shift) & PT_INDEX_MASK;
+
+		if ((pt[index] == 0) && create) {
+			page = malloc(PAGE_SIZE, M_AMDVI, M_WAITOK | M_ZERO);
+			pa = vtophys(page);
+			pt[index] = pa | AMDVI_PT_PRESENT | AMDVI_PT_RW |
+			    ((level - 1) << AMDVI_PD_LEVEL_SHIFT);
+		}
+#ifdef AMDVI_DEBUG_PTE
+		if ((gpa % 0x1000000) == 0)
+			printf("[level%d, shift = %d]PTE:0x%lx\n",
+			    level, shift, pt[index]);
+#endif
+#define PTE2PA(x)	((uint64_t)(x) & AMDVI_PT_MASK)
+		pa = PTE2PA(pt[index]);
+		pt = (uint64_t *)PHYS_TO_DMAP(pa);
+		shift -= PT_SHIFT;
+		level--;
+	}
+
+	/* Leaf entry. */
+	index = (gpa >> shift) & PT_INDEX_MASK;
+
+	if (create) {
+		pt[index] = hpa | AMDVI_PT_RW | AMDVI_PT_PRESENT;
+	} else
+		pt[index] = 0;
+
+#ifdef AMDVI_DEBUG_PTE
+	if ((gpa % 0x1000000) == 0)
+		printf("[Last level%d, shift = %d]PTE:0x%lx\n",
+		    level, shift, pt[index]);
+#endif
+	return (1ULL << shift);
+}
+
+static uint64_t
+amdvi_update_mapping(struct amdvi_domain *domain, vm_paddr_t gpa,
+    vm_paddr_t hpa, uint64_t size, bool create)
+{
+	uint64_t mapped, *ptp, len;
+	int level;
+
+	KASSERT(domain, ("domain is NULL"));
+	level = domain->ptp_level;
+	KASSERT(level, ("Page table level is 0"));
+
+	ptp = domain->ptp;
+	KASSERT(ptp, ("PTP is NULL"));
+	mapped = 0;
+	while (mapped < size) {
+		len = amdvi_set_pt(ptp, level, gpa + mapped, hpa + mapped,
+		    PAGE_SIZE, create);
+		if (!len) {
+			printf("Error: Couldn't map HPA:0x%lx GPA:0x%lx\n",
+			    hpa, gpa);
+			return (0);
+		}
+		mapped += len;
+	}
+
+	return (mapped);
+}
+
+static uint64_t
+amdvi_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa,
+    uint64_t len)
+{
+	struct amdvi_domain *domain;
+
+	domain = (struct amdvi_domain *)arg;
+
+	if (domain->id && !domain->ptp) {
+		printf("ptp is NULL\n");
+		return (-1);
+	}
+
+	/*
+	 * If host domain is created w/o page table, skip IOMMU page
+	 * table set-up.
+	 */
+	if (domain->ptp)
+		return (amdvi_update_mapping(domain, gpa, hpa, len, true));
+	else
+		return (len);
+}
+
+static uint64_t
+amdvi_destroy_mapping(void *arg, vm_paddr_t gpa, uint64_t len)
+{
+	struct amdvi_domain *domain;
+
+	domain = (struct amdvi_domain *)arg;
+	/*
+	 * If host domain is created w/o page table, skip IOMMU page
+	 * table set-up.
+
+static uint64_t
+amdvi_destroy_mapping(void *arg, vm_paddr_t gpa, uint64_t len)
+{
+	struct amdvi_domain *domain;
+
+	domain = (struct amdvi_domain *)arg;
+	/*
+	 * If the host domain was created w/o a page table, there is no
+	 * IOMMU page table to tear down.
+	 */
+	if (domain->ptp)
+		return (amdvi_update_mapping(domain, gpa, 0, len, false));
+	return (len);
+}
+
+static struct amdvi_softc *
+amdvi_find_iommu(uint16_t devid)
+{
+	struct amdvi_softc *softc;
+	int i;
+
+	for (i = 0; i < ivhd_count; i++) {
+		softc = device_get_softc(ivhd_devs[i]);
+		if ((devid >= softc->start_dev_rid) &&
+		    (devid <= softc->end_dev_rid))
+			return (softc);
+	}
+
+	/*
+	 * XXX: BIOS bug: the device is not in the IVRS table; assume it
+	 * belongs to the first IOMMU.
+	 */
+	printf("BIOS bug: device(%d.%d.%d) doesn't have an IVHD entry.\n",
+	    RID2PCI_STR(devid));
+
+	return (device_get_softc(ivhd_devs[0]));
+}
+
+/*
+ * Set up a device table entry.
+ * Per IOMMU spec Rev 2.0, section 3.2.2.2, some of the fields must
+ * be set concurrently, e.g. the read and write bits.
+ */
+static void
+amdvi_set_dte(struct amdvi_domain *domain, uint16_t devid, bool enable)
+{
+	struct amdvi_softc *softc;
+	struct amdvi_dte *temp;
+
+	KASSERT(domain, ("domain is NULL for pci_rid:0x%x\n", devid));
+
+	softc = amdvi_find_iommu(devid);
+	KASSERT(softc, ("softc is NULL for pci_rid:0x%x\n", devid));
+
+	temp = &amdvi_dte[devid];
+
+#ifdef AMDVI_ATS_ENABLE
+	/* If the IOMMU and the device support IOTLB, enable it. */
+	if (amdvi_dev_support_iotlb(softc, devid) && softc->iotlb)
+		temp->iotlb_enable = 1;
+#endif
+
+	/* Avoid duplicate I/O faults. */
+	temp->sup_second_io_fault = 1;
+	temp->sup_all_io_fault = amdvi_disable_io_fault;
+
+	temp->dt_valid = 1;
+	temp->domain_id = domain->id;
+
+	if (enable) {
+		if (domain->ptp) {
+			temp->pt_base = vtophys(domain->ptp) >> 12;
+			temp->pt_level = amdvi_ptp_level;
+		}
+		/*
+		 * XXX: The page table valid [TV] bit must be set even if the
+		 * host domain page tables are not enabled.
+		 */
+		temp->pt_valid = 1;
+		temp->read_allow = 1;
+		temp->write_allow = 1;
+	}
+}
+
+static void
+amdvi_inv_device(uint16_t devid)
+{
+	struct amdvi_softc *softc;
+
+	softc = amdvi_find_iommu(devid);
+	KASSERT(softc, ("softc is NULL"));
+
+	amdvi_cmd_inv_dte(softc, devid);
+#ifdef AMDVI_ATS_ENABLE
+	if (amdvi_dev_support_iotlb(softc, devid))
+		amdvi_cmd_inv_iotlb(softc, devid);
+#endif
+	amdvi_wait(softc);
+}
+
+static void
+amdvi_add_device(void *arg, uint16_t devid)
+{
+	struct amdvi_domain *domain;
+
+	domain = (struct amdvi_domain *)arg;
+	KASSERT(domain != NULL, ("domain is NULL"));
+#ifdef AMDVI_DEBUG_CMD
+	printf("Assigning device(%d.%d.%d) to domain:%d\n",
+	    RID2PCI_STR(devid), domain->id);
+#endif
+	amdvi_set_dte(domain, devid, true);
+	amdvi_inv_device(devid);
+}
+
+static void
+amdvi_remove_device(void *arg, uint16_t devid)
+{
+	struct amdvi_domain *domain;
+
+	domain = (struct amdvi_domain *)arg;
+#ifdef AMDVI_DEBUG_CMD
+	printf("Remove device(0x%x) from domain:%d\n",
+	    devid, domain->id);
+#endif
+	amdvi_set_dte(domain, devid, false);
+	amdvi_inv_device(devid);
+}
+
+static void
+amdvi_enable(void)
+{
+	struct amdvi_ctrl *ctrl;
+	struct amdvi_softc *softc;
+	uint64_t val;
+	int i;
+
+	for (i = 0; i < ivhd_count; i++) {
+		softc = device_get_softc(ivhd_devs[i]);
+		KASSERT(softc, ("softc is NULL\n"));
+		ctrl = softc->ctrl;
+		KASSERT(ctrl, ("ctrl is NULL\n"));
+
+		val = (AMDVI_CTRL_EN |
+		    AMDVI_CTRL_CMD |
+		    AMDVI_CTRL_ELOG |
+		    AMDVI_CTRL_ELOGINT |
+		    AMDVI_CTRL_INV_TO_1S);
+
+		if (softc->ivhd_flag & IVHD_FLAG_COH)
+			val |= AMDVI_CTRL_COH;
+		if (softc->ivhd_flag & IVHD_FLAG_HTT)
+			val |= AMDVI_CTRL_HTT;
+		if (softc->ivhd_flag & IVHD_FLAG_RPPW)
+			val |= AMDVI_CTRL_RPPW;
+		if (softc->ivhd_flag & IVHD_FLAG_PPW)
+			val |= AMDVI_CTRL_PPW;
+		if (softc->ivhd_flag & IVHD_FLAG_ISOC)
+			val |= AMDVI_CTRL_ISOC;
+
+		ctrl->control = val;
+	}
+}
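amdvi_enable() above translates the ACPI IVHD flags into control-register bits with a chain of ifs. For comparison, the same mapping in table-driven form; this is purely an illustrative sketch, with the IVHD_FLAG_*/AMDVI_CTRL_* bit values inlined from amdvi_priv.h (introduced later in this change) so it compiles standalone:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define BIT(n)	(1ULL << (n))

/* Values inlined from amdvi_priv.h: IVHD_FLAG_* -> AMDVI_CTRL_*. */
static const struct {
	uint32_t ivhd_flag;
	uint64_t ctrl_bit;
} flag_map[] = {
	{ BIT(0) /* HTT */,  BIT(1)  /* CTRL_HTT */ },
	{ BIT(1) /* PPW */,  BIT(8)  /* CTRL_PPW */ },
	{ BIT(2) /* RPPW */, BIT(9)  /* CTRL_RPPW */ },
	{ BIT(3) /* ISOC */, BIT(11) /* CTRL_ISOC */ },
	{ BIT(5) /* COH */,  BIT(10) /* CTRL_COH */ },
};

static uint64_t
ctrl_bits_for_ivhd(uint32_t ivhd_flag)
{
	uint64_t val = 0;
	size_t i;

	for (i = 0; i < sizeof(flag_map) / sizeof(flag_map[0]); i++)
		if (ivhd_flag & flag_map[i].ivhd_flag)
			val |= flag_map[i].ctrl_bit;
	return (val);
}

int
main(void)
{
	/* HTT and COH set: expect control bits 1 and 10 -> 0x402. */
	printf("0x%llx\n",
	    (unsigned long long)ctrl_bits_for_ivhd(BIT(0) | BIT(5)));
	return (0);
}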
+
+static void
+amdvi_disable(void)
+{
+	struct amdvi_ctrl *ctrl;
+	struct amdvi_softc *softc;
+	int i;
+
+	for (i = 0; i < ivhd_count; i++) {
+		softc = device_get_softc(ivhd_devs[i]);
+		KASSERT(softc, ("softc is NULL\n"));
+		ctrl = softc->ctrl;
+		KASSERT(ctrl, ("ctrl is NULL\n"));
+
+		ctrl->control = 0;
+	}
+}
+
+static void
+amdvi_inv_tlb(void *arg)
+{
+	struct amdvi_domain *domain;
+
+	domain = (struct amdvi_domain *)arg;
+	KASSERT(domain, ("domain is NULL"));
+	amdvi_do_inv_domain(domain->id, false);
+}
+
+struct iommu_ops iommu_ops_amd = {
+	amdvi_init,
+	amdvi_cleanup,
+	amdvi_enable,
+	amdvi_disable,
+	amdvi_create_domain,
+	amdvi_destroy_domain,
+	amdvi_create_mapping,
+	amdvi_destroy_mapping,
+	amdvi_add_device,
+	amdvi_remove_device,
+	amdvi_inv_tlb
+};
diff --git a/usr/src/uts/i86pc/io/vmm/amd/amdvi_priv.h b/usr/src/uts/i86pc/io/vmm/amd/amdvi_priv.h
new file mode 100644
index 0000000000..6ee6c36632
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/amd/amdvi_priv.h
@@ -0,0 +1,431 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2016 Anish Gupta (anish@freebsd.org)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _AMDVI_PRIV_H_
+#define _AMDVI_PRIV_H_
+
+#include <contrib/dev/acpica/include/acpi.h>
+
+#define	BIT(n)			(1ULL << (n))
+/*
+ * Return the value of bits [n:m] of x, where n and m are bit positions
+ * and n >= m.
+ */
+#define	REG_BITS(x, n, m)	(((x) >> (m)) & \
+				    ((1 << (((n) - (m)) + 1)) - 1))
+
+/*
+ * IOMMU PCI capability.
+ */
+#define	AMDVI_PCI_CAP_IOTLB	BIT(0)	/* IOTLB is supported. */
+#define	AMDVI_PCI_CAP_HT	BIT(1)	/* HyperTransport tunnel support. */
+#define	AMDVI_PCI_CAP_NPCACHE	BIT(2)	/* Not-present page cached. */
+#define	AMDVI_PCI_CAP_EFR	BIT(3)	/* Extended features. */
+#define	AMDVI_PCI_CAP_EXT	BIT(4)	/* Miscellaneous information reg. */
+
+/*
+ * IOMMU extended features.
+ */
+#define	AMDVI_EX_FEA_PREFSUP	BIT(0)	/* Prefetch command support. */
+#define	AMDVI_EX_FEA_PPRSUP	BIT(1)	/* PPR support. */
+#define	AMDVI_EX_FEA_XTSUP	BIT(2)	/* Reserved. */
+#define	AMDVI_EX_FEA_NXSUP	BIT(3)	/* No-execute. */
+#define	AMDVI_EX_FEA_GTSUP	BIT(4)	/* Guest translation support. */
+#define	AMDVI_EX_FEA_EFRW	BIT(5)	/* Reserved. */
+#define	AMDVI_EX_FEA_IASUP	BIT(6)	/* Invalidate all command supp.
*/ +#define AMDVI_EX_FEA_GASUP BIT(7) /* Guest APIC or AVIC support. */ +#define AMDVI_EX_FEA_HESUP BIT(8) /* Hardware Error. */ +#define AMDVI_EX_FEA_PCSUP BIT(9) /* Performance counters support. */ +/* XXX: add more EFER bits. */ + +/* + * Device table entry or DTE + * NOTE: Must be 256-bits/32 bytes aligned. + */ +struct amdvi_dte { + uint32_t dt_valid:1; /* Device Table valid. */ + uint32_t pt_valid:1; /* Page translation valid. */ + uint16_t :7; /* Reserved[8:2] */ + uint8_t pt_level:3; /* Paging level, 0 to disable. */ + uint64_t pt_base:40; /* Page table root pointer. */ + uint8_t :3; /* Reserved[54:52] */ + uint8_t gv_valid:1; /* Revision 2, GVA to SPA. */ + uint8_t gv_level:2; /* Revision 2, GLX level. */ + uint8_t gv_cr3_lsb:3; /* Revision 2, GCR3[14:12] */ + uint8_t read_allow:1; /* I/O read enabled. */ + uint8_t write_allow:1; /* I/O write enabled. */ + uint8_t :1; /* Reserved[63] */ + uint16_t domain_id:16; /* Domain ID */ + uint16_t gv_cr3_lsb2:16; /* Revision 2, GCR3[30:15] */ + uint8_t iotlb_enable:1; /* Device support IOTLB */ + uint8_t sup_second_io_fault:1; /* Suppress subsequent I/O faults. */ + uint8_t sup_all_io_fault:1; /* Suppress all I/O page faults. */ + uint8_t IOctl:2; /* Port I/O control. */ + uint8_t iotlb_cache_disable:1; /* IOTLB cache hints. */ + uint8_t snoop_disable:1; /* Snoop disable. */ + uint8_t allow_ex:1; /* Allow exclusion. */ + uint8_t sysmgmt:2; /* System management message.*/ + uint8_t :1; /* Reserved[106] */ + uint32_t gv_cr3_msb:21; /* Revision 2, GCR3[51:31] */ + uint8_t intmap_valid:1; /* Interrupt map valid. */ + uint8_t intmap_len:4; /* Interrupt map table length. */ + uint8_t intmap_ign:1; /* Ignore unmapped interrupts. */ + uint64_t intmap_base:46; /* IntMap base. */ + uint8_t :4; /* Reserved[183:180] */ + uint8_t init_pass:1; /* INIT pass through or PT */ + uint8_t extintr_pass:1; /* External Interrupt PT */ + uint8_t nmi_pass:1; /* NMI PT */ + uint8_t :1; /* Reserved[187] */ + uint8_t intr_ctrl:2; /* Interrupt control */ + uint8_t lint0_pass:1; /* LINT0 PT */ + uint8_t lint1_pass:1; /* LINT1 PT */ + uint64_t :64; /* Reserved[255:192] */ +} __attribute__((__packed__)); +CTASSERT(sizeof(struct amdvi_dte) == 32); + +/* + * IOMMU command entry. + */ +struct amdvi_cmd { + uint32_t word0; + uint32_t word1:28; + uint8_t opcode:4; + uint64_t addr; +} __attribute__((__packed__)); + +/* Command opcodes. */ +#define AMDVI_CMP_WAIT_OPCODE 0x1 /* Completion wait. */ +#define AMDVI_INVD_DTE_OPCODE 0x2 /* Invalidate device table entry. */ +#define AMDVI_INVD_PAGE_OPCODE 0x3 /* Invalidate pages. */ +#define AMDVI_INVD_IOTLB_OPCODE 0x4 /* Invalidate IOTLB pages. */ +#define AMDVI_INVD_INTR_OPCODE 0x5 /* Invalidate Interrupt table. */ +#define AMDVI_PREFETCH_PAGES_OPCODE 0x6 /* Prefetch IOMMU pages. */ +#define AMDVI_COMP_PPR_OPCODE 0x7 /* Complete PPR request. */ +#define AMDVI_INV_ALL_OPCODE 0x8 /* Invalidate all. */ + +/* Completion wait attributes. */ +#define AMDVI_CMP_WAIT_STORE BIT(0) /* Write back data. */ +#define AMDVI_CMP_WAIT_INTR BIT(1) /* Completion wait interrupt. */ +#define AMDVI_CMP_WAIT_FLUSH BIT(2) /* Flush queue. */ + +/* Invalidate page. */ +#define AMDVI_INVD_PAGE_S BIT(0) /* Invalidation size. */ +#define AMDVI_INVD_PAGE_PDE BIT(1) /* Invalidate PDE. */ +#define AMDVI_INVD_PAGE_GN_GVA BIT(2) /* GPA or GVA. */ + +#define AMDVI_INVD_PAGE_ALL_ADDR (0x7FFFFFFFFFFFFULL << 12) + +/* Invalidate IOTLB. */ +#define AMDVI_INVD_IOTLB_S BIT(0) /* Invalidation size 4k or addr */ +#define AMDVI_INVD_IOTLB_GN_GVA BIT(2) /* GPA or GVA. 
*/ + +#define AMDVI_INVD_IOTLB_ALL_ADDR (0x7FFFFFFFFFFFFULL << 12) +/* XXX: add more command entries. */ + +/* + * IOMMU event entry. + */ +struct amdvi_event { + uint16_t devid; + uint16_t pasid_hi; + uint16_t pasid_domid; /* PASID low or DomainID */ + uint16_t flag:12; + uint8_t opcode:4; + uint64_t addr; +} __attribute__((__packed__)); +CTASSERT(sizeof(struct amdvi_event) == 16); + +/* Various event types. */ +#define AMDVI_EVENT_INVALID_DTE 0x1 +#define AMDVI_EVENT_PFAULT 0x2 +#define AMDVI_EVENT_DTE_HW_ERROR 0x3 +#define AMDVI_EVENT_PAGE_HW_ERROR 0x4 +#define AMDVI_EVENT_ILLEGAL_CMD 0x5 +#define AMDVI_EVENT_CMD_HW_ERROR 0x6 +#define AMDVI_EVENT_IOTLB_TIMEOUT 0x7 +#define AMDVI_EVENT_INVALID_DTE_REQ 0x8 +#define AMDVI_EVENT_INVALID_PPR_REQ 0x9 +#define AMDVI_EVENT_COUNTER_ZERO 0xA + +#define AMDVI_EVENT_FLAG_MASK 0x1FF /* Mask for event flags. */ +#define AMDVI_EVENT_FLAG_TYPE(x) (((x) >> 9) & 0x3) + +/* + * IOMMU control block. + */ +struct amdvi_ctrl { + struct { + uint16_t size:9; + uint16_t :3; + uint64_t base:40; /* Devtable register base. */ + uint16_t :12; + } dte; + struct { + uint16_t :12; + uint64_t base:40; + uint8_t :4; + uint8_t len:4; + uint8_t :4; + } cmd; + struct { + uint16_t :12; + uint64_t base:40; + uint8_t :4; + uint8_t len:4; + uint8_t :4; + } event; + uint16_t control :13; + uint64_t :51; + struct { + uint8_t enable:1; + uint8_t allow:1; + uint16_t :10; + uint64_t base:40; + uint16_t :12; + uint16_t :12; + uint64_t limit:40; + uint16_t :12; + } excl; + /* + * Revision 2 only. + */ + uint64_t ex_feature; + struct { + uint16_t :12; + uint64_t base:40; + uint8_t :4; + uint8_t len:4; + uint8_t :4; + } ppr; + uint64_t first_event; + uint64_t second_event; + uint64_t event_status; + /* Revision 2 only, end. */ + uint8_t pad1[0x1FA8]; /* Padding. */ + uint32_t cmd_head:19; + uint64_t :45; + uint32_t cmd_tail:19; + uint64_t :45; + uint32_t evt_head:19; + uint64_t :45; + uint32_t evt_tail:19; + uint64_t :45; + uint32_t status:19; + uint64_t :45; + uint64_t pad2; + uint8_t :4; + uint16_t ppr_head:15; + uint64_t :45; + uint8_t :4; + uint16_t ppr_tail:15; + uint64_t :45; + uint8_t pad3[0x1FC0]; /* Padding. */ + + /* XXX: More for rev2. */ +} __attribute__((__packed__)); +CTASSERT(offsetof(struct amdvi_ctrl, pad1)== 0x58); +CTASSERT(offsetof(struct amdvi_ctrl, pad2)== 0x2028); +CTASSERT(offsetof(struct amdvi_ctrl, pad3)== 0x2040); + +#define AMDVI_MMIO_V1_SIZE (4 * PAGE_SIZE) /* v1 size */ +/* + * AMF IOMMU v2 size including event counters + */ +#define AMDVI_MMIO_V2_SIZE (8 * PAGE_SIZE) + +CTASSERT(sizeof(struct amdvi_ctrl) == 0x4000); +CTASSERT(sizeof(struct amdvi_ctrl) == AMDVI_MMIO_V1_SIZE); + +/* IVHD flag */ +#define IVHD_FLAG_HTT BIT(0) /* Hypertransport Tunnel. */ +#define IVHD_FLAG_PPW BIT(1) /* Pass posted write. */ +#define IVHD_FLAG_RPPW BIT(2) /* Response pass posted write. */ +#define IVHD_FLAG_ISOC BIT(3) /* Isoc support. */ +#define IVHD_FLAG_IOTLB BIT(4) /* IOTLB support. */ +#define IVHD_FLAG_COH BIT(5) /* Coherent control, default 1 */ +#define IVHD_FLAG_PFS BIT(6) /* Prefetch IOMMU pages. */ +#define IVHD_FLAG_PPRS BIT(7) /* Peripheral page support. */ + +/* IVHD device entry data setting. */ +#define IVHD_DEV_LINT0_PASS BIT(6) /* LINT0 interrupts. */ +#define IVHD_DEV_LINT1_PASS BIT(7) /* LINT1 interrupts. */ + +/* Bit[5:4] for System Mgmt. Bit3 is reserved. */ +#define IVHD_DEV_INIT_PASS BIT(0) /* INIT */ +#define IVHD_DEV_EXTINTR_PASS BIT(1) /* ExtInt */ +#define IVHD_DEV_NMI_PASS BIT(2) /* NMI */ + +/* IVHD 8-byte extended data settings. 
*/ +#define IVHD_DEV_EXT_ATS_DISABLE BIT(31) /* Disable ATS */ + +/* IOMMU control register. */ +#define AMDVI_CTRL_EN BIT(0) /* IOMMU enable. */ +#define AMDVI_CTRL_HTT BIT(1) /* Hypertransport tunnel enable. */ +#define AMDVI_CTRL_ELOG BIT(2) /* Event log enable. */ +#define AMDVI_CTRL_ELOGINT BIT(3) /* Event log interrupt. */ +#define AMDVI_CTRL_COMINT BIT(4) /* Completion wait interrupt. */ +#define AMDVI_CTRL_PPW BIT(8) +#define AMDVI_CTRL_RPPW BIT(9) +#define AMDVI_CTRL_COH BIT(10) +#define AMDVI_CTRL_ISOC BIT(11) +#define AMDVI_CTRL_CMD BIT(12) /* Command buffer enable. */ +#define AMDVI_CTRL_PPRLOG BIT(13) +#define AMDVI_CTRL_PPRINT BIT(14) +#define AMDVI_CTRL_PPREN BIT(15) +#define AMDVI_CTRL_GTE BIT(16) /* Guest translation enable. */ +#define AMDVI_CTRL_GAE BIT(17) /* Guest APIC enable. */ + +/* Invalidation timeout. */ +#define AMDVI_CTRL_INV_NO_TO 0 /* No timeout. */ +#define AMDVI_CTRL_INV_TO_1ms 1 /* 1 ms */ +#define AMDVI_CTRL_INV_TO_10ms 2 /* 10 ms */ +#define AMDVI_CTRL_INV_TO_100ms 3 /* 100 ms */ +#define AMDVI_CTRL_INV_TO_1S 4 /* 1 second */ +#define AMDVI_CTRL_INV_TO_10S 5 /* 10 second */ +#define AMDVI_CTRL_INV_TO_100S 6 /* 100 second */ + +/* + * Max number of PCI devices. + * 256 bus x 32 slot/devices x 8 functions. + */ +#define PCI_NUM_DEV_MAX 0x10000 + +/* Maximum number of domains supported by IOMMU. */ +#define AMDVI_MAX_DOMAIN (BIT(16) - 1) + +/* + * IOMMU Page Table attributes. + */ +#define AMDVI_PT_PRESENT BIT(0) +#define AMDVI_PT_COHERENT BIT(60) +#define AMDVI_PT_READ BIT(61) +#define AMDVI_PT_WRITE BIT(62) + +#define AMDVI_PT_RW (AMDVI_PT_READ | AMDVI_PT_WRITE) +#define AMDVI_PT_MASK 0xFFFFFFFFFF000UL /* Only [51:12] for PA */ + +#define AMDVI_PD_LEVEL_SHIFT 9 +#define AMDVI_PD_SUPER(x) (((x) >> AMDVI_PD_LEVEL_SHIFT) == 7) +/* + * IOMMU Status, offset 0x2020 + */ +#define AMDVI_STATUS_EV_OF BIT(0) /* Event overflow. */ +#define AMDVI_STATUS_EV_INTR BIT(1) /* Event interrupt. */ +/* Completion wait command completed. */ +#define AMDVI_STATUS_CMP BIT(2) + +#define IVRS_CTRL_RID 1 /* MMIO RID */ + +/* ACPI IVHD */ +struct ivhd_dev_cfg { + uint32_t start_id; + uint32_t end_id; + uint8_t data; /* Device configuration. */ + bool enable_ats; /* ATS enabled for the device. */ + int ats_qlen; /* ATS invalidation queue depth. */ +}; + +struct amdvi_domain { + uint64_t *ptp; /* Highest level page table */ + int ptp_level; /* Level of page tables */ + u_int id; /* Domain id */ + SLIST_ENTRY (amdvi_domain) next; +}; + +/* + * I/O Virtualization Hardware Definition Block (IVHD) type 0x10 (legacy) + * uses ACPI_IVRS_HARDWARE define in contrib/dev/acpica/include/actbl2.h + * New IVHD types 0x11 and 0x40 as defined in AMD IOMMU spec[48882] are missing in + * ACPI code. These new types add extra field EFR(Extended Feature Register). + * XXX : Use definition from ACPI when it is available. + */ +typedef struct acpi_ivrs_hardware_efr_sup +{ + ACPI_IVRS_HEADER Header; + UINT16 CapabilityOffset; /* Offset for IOMMU control fields */ + UINT64 BaseAddress; /* IOMMU control registers */ + UINT16 PciSegmentGroup; + UINT16 Info; /* MSI number and unit ID */ + UINT32 Attr; /* IOMMU Feature */ + UINT64 ExtFR; /* IOMMU Extended Feature */ + UINT64 Reserved; /* v1 feature or v2 attribute */ +} __attribute__ ((__packed__)) ACPI_IVRS_HARDWARE_EFRSUP; +CTASSERT(sizeof(ACPI_IVRS_HARDWARE_EFRSUP) == 40); + +/* + * Different type of IVHD. + * XXX: Use AcpiIvrsType once new IVHD types are available. +*/ +enum IvrsType +{ + IVRS_TYPE_HARDWARE_LEGACY = 0x10, /* Legacy without EFRi support. 
*/ + IVRS_TYPE_HARDWARE_EFR = 0x11, /* With EFR support. */ + IVRS_TYPE_HARDWARE_MIXED = 0x40, /* Mixed with EFR support. */ +}; + +/* + * AMD IOMMU softc. + */ +struct amdvi_softc { + struct amdvi_ctrl *ctrl; /* Control area. */ + device_t dev; /* IOMMU device. */ + enum IvrsType ivhd_type; /* IOMMU IVHD type. */ + bool iotlb; /* IOTLB supported by IOMMU */ + struct amdvi_cmd *cmd; /* Command descriptor area. */ + int cmd_max; /* Max number of commands. */ + uint64_t cmp_data; /* Command completion write back. */ + struct amdvi_event *event; /* Event descriptor area. */ + struct resource *event_res; /* Event interrupt resource. */ + void *event_tag; /* Event interrupt tag. */ + int event_max; /* Max number of events. */ + int event_irq; + int event_rid; + /* ACPI various flags. */ + uint32_t ivhd_flag; /* ACPI IVHD flag. */ + uint32_t ivhd_feature; /* ACPI v1 Reserved or v2 attribute. */ + uint64_t ext_feature; /* IVHD EFR */ + /* PCI related. */ + uint16_t cap_off; /* PCI Capability offset. */ + uint8_t pci_cap; /* PCI capability. */ + uint16_t pci_seg; /* IOMMU PCI domain/segment. */ + uint16_t pci_rid; /* PCI BDF of IOMMU */ + /* Device range under this IOMMU. */ + uint16_t start_dev_rid; /* First device under this IOMMU. */ + uint16_t end_dev_rid; /* Last device under this IOMMU. */ + + /* BIOS provided device configuration for end points. */ + struct ivhd_dev_cfg dev_cfg[10]; + int dev_cfg_cnt; + + /* Software statistics. */ + uint64_t event_intr_cnt; /* Total event INTR count. */ + uint64_t total_cmd; /* Total number of commands. */ +}; + +int amdvi_setup_hw(struct amdvi_softc *softc); +int amdvi_teardown_hw(struct amdvi_softc *softc); +#endif /* _AMDVI_PRIV_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c b/usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c new file mode 100644 index 0000000000..370c20fb01 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c @@ -0,0 +1,735 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016, Anish Gupta (anish@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_acpi.h" +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/malloc.h> + +#include <machine/vmparam.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <contrib/dev/acpica/include/acpi.h> +#include <contrib/dev/acpica/include/accommon.h> +#include <dev/acpica/acpivar.h> + +#include "io/iommu.h" +#include "amdvi_priv.h" + +device_t *ivhd_devs; /* IVHD or AMD-Vi device list. */ +int ivhd_count; /* Number of IVHD header. */ +/* + * Cached IVHD header list. + * Single entry for each IVHD, filtered the legacy one. + */ +ACPI_IVRS_HARDWARE *ivhd_hdrs[10]; + +extern int amdvi_ptp_level; /* Page table levels. */ + +typedef int (*ivhd_iter_t)(ACPI_IVRS_HEADER *ptr, void *arg); +/* + * Iterate IVRS table for IVHD and IVMD device type. + */ +static void +ivrs_hdr_iterate_tbl(ivhd_iter_t iter, void *arg) +{ + ACPI_TABLE_IVRS *ivrs; + ACPI_IVRS_HEADER *ivrs_hdr, *end; + ACPI_STATUS status; + + status = AcpiGetTable(ACPI_SIG_IVRS, 1, (ACPI_TABLE_HEADER **)&ivrs); + if (ACPI_FAILURE(status)) + return; + + if (ivrs->Header.Length == 0) { + return; + } + + ivrs_hdr = (ACPI_IVRS_HEADER *)(ivrs + 1); + end = (ACPI_IVRS_HEADER *)((char *)ivrs + ivrs->Header.Length); + + while (ivrs_hdr < end) { + if ((uint8_t *)ivrs_hdr + ivrs_hdr->Length > (uint8_t *)end) { + printf("AMD-Vi:IVHD/IVMD is corrupted, length : %d\n", + ivrs_hdr->Length); + break; + } + + switch (ivrs_hdr->Type) { + case IVRS_TYPE_HARDWARE_LEGACY: /* Legacy */ + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + if (!iter(ivrs_hdr, arg)) + return; + break; + + case ACPI_IVRS_TYPE_MEMORY1: + case ACPI_IVRS_TYPE_MEMORY2: + case ACPI_IVRS_TYPE_MEMORY3: + if (!iter(ivrs_hdr, arg)) + return; + + break; + + default: + printf("AMD-Vi:Not IVHD/IVMD type(%d)", ivrs_hdr->Type); + + } + + ivrs_hdr = (ACPI_IVRS_HEADER *)((uint8_t *)ivrs_hdr + + ivrs_hdr->Length); + } +} + +static bool +ivrs_is_ivhd(UINT8 type) +{ + + switch(type) { + case IVRS_TYPE_HARDWARE_LEGACY: + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + return (true); + + default: + return (false); + } +} + +/* Count the number of AMD-Vi devices in the system. */ +static int +ivhd_count_iter(ACPI_IVRS_HEADER * ivrs_he, void *arg) +{ + + if (ivrs_is_ivhd(ivrs_he->Type)) + ivhd_count++; + + return (1); +} + +struct find_ivrs_hdr_args { + int i; + ACPI_IVRS_HEADER *ptr; +}; + +static int +ivrs_hdr_find_iter(ACPI_IVRS_HEADER * ivrs_hdr, void *args) +{ + struct find_ivrs_hdr_args *fi; + + fi = (struct find_ivrs_hdr_args *)args; + if (ivrs_is_ivhd(ivrs_hdr->Type)) { + if (fi->i == 0) { + fi->ptr = ivrs_hdr; + return (0); + } + fi->i--; + } + + return (1); +} + +static ACPI_IVRS_HARDWARE * +ivhd_find_by_index(int idx) +{ + struct find_ivrs_hdr_args fi; + + fi.i = idx; + fi.ptr = NULL; + + ivrs_hdr_iterate_tbl(ivrs_hdr_find_iter, &fi); + + return ((ACPI_IVRS_HARDWARE *)fi.ptr); +} + +static void +ivhd_dev_add_entry(struct amdvi_softc *softc, uint32_t start_id, + uint32_t end_id, uint8_t cfg, bool ats) +{ + struct ivhd_dev_cfg *dev_cfg; + + /* If device doesn't have special data, don't add it. */ + if (!cfg) + return; + + dev_cfg = &softc->dev_cfg[softc->dev_cfg_cnt++]; + dev_cfg->start_id = start_id; + dev_cfg->end_id = end_id; + dev_cfg->data = cfg; + dev_cfg->enable_ats = ats; +} + +/* + * Record device attributes as suggested by BIOS. 
+
+/*
+ * Record device attributes as suggested by the BIOS.
+ */
+static int
+ivhd_dev_parse(ACPI_IVRS_HARDWARE *ivhd, struct amdvi_softc *softc)
+{
+	ACPI_IVRS_DE_HEADER *de;
+	uint8_t *p, *end;
+	int range_start_id = 0, range_end_id = 0;
+	uint32_t *extended;
+	uint8_t all_data = 0, range_data = 0;
+	bool range_enable_ats = false, enable_ats;
+
+	softc->start_dev_rid = ~0;
+	softc->end_dev_rid = 0;
+
+	switch (ivhd->Header.Type) {
+	case IVRS_TYPE_HARDWARE_LEGACY:
+		p = (uint8_t *)ivhd + sizeof(ACPI_IVRS_HARDWARE);
+		break;
+
+	case IVRS_TYPE_HARDWARE_EFR:
+	case IVRS_TYPE_HARDWARE_MIXED:
+		p = (uint8_t *)ivhd + sizeof(ACPI_IVRS_HARDWARE_EFRSUP);
+		break;
+
+	default:
+		device_printf(softc->dev,
+		    "unknown type: 0x%x\n", ivhd->Header.Type);
+		return (-1);
+	}
+
+	end = (uint8_t *)ivhd + ivhd->Header.Length;
+
+	while (p < end) {
+		de = (ACPI_IVRS_DE_HEADER *)p;
+		softc->start_dev_rid = MIN(softc->start_dev_rid, de->Id);
+		softc->end_dev_rid = MAX(softc->end_dev_rid, de->Id);
+		switch (de->Type) {
+		case ACPI_IVRS_TYPE_ALL:
+			all_data = de->DataSetting;
+			break;
+
+		case ACPI_IVRS_TYPE_SELECT:
+		case ACPI_IVRS_TYPE_ALIAS_SELECT:
+		case ACPI_IVRS_TYPE_EXT_SELECT:
+			enable_ats = false;
+			if (de->Type == ACPI_IVRS_TYPE_EXT_SELECT) {
+				extended = (uint32_t *)(de + 1);
+				enable_ats =
+				    (*extended & IVHD_DEV_EXT_ATS_DISABLE) ?
+				    false : true;
+			}
+			ivhd_dev_add_entry(softc, de->Id, de->Id,
+			    de->DataSetting | all_data, enable_ats);
+			break;
+
+		case ACPI_IVRS_TYPE_START:
+		case ACPI_IVRS_TYPE_ALIAS_START:
+		case ACPI_IVRS_TYPE_EXT_START:
+			range_start_id = de->Id;
+			range_data = de->DataSetting;
+			if (de->Type == ACPI_IVRS_TYPE_EXT_START) {
+				extended = (uint32_t *)(de + 1);
+				range_enable_ats =
+				    (*extended & IVHD_DEV_EXT_ATS_DISABLE) ?
+				    false : true;
+			}
+			break;
+
+		case ACPI_IVRS_TYPE_END:
+			range_end_id = de->Id;
+			ivhd_dev_add_entry(softc, range_start_id, range_end_id,
+			    range_data | all_data, range_enable_ats);
+			range_start_id = range_end_id = 0;
+			range_data = 0;
+			all_data = 0;
+			break;
+
+		case ACPI_IVRS_TYPE_PAD4:
+			break;
+
+		case ACPI_IVRS_TYPE_SPECIAL:
+			/* HPET or IOAPIC */
+			break;
+		default:
+			if ((de->Type < 5) ||
+			    (de->Type >= ACPI_IVRS_TYPE_PAD8))
+				device_printf(softc->dev,
+				    "Unknown dev entry:0x%x\n", de->Type);
+		}
+
+		if (softc->dev_cfg_cnt >=
+		    (sizeof(softc->dev_cfg) / sizeof(softc->dev_cfg[0]))) {
+			device_printf(softc->dev,
+			    "WARN: too many device entries.\n");
+			return (EINVAL);
+		}
+		if (de->Type < 0x40)
+			p += sizeof(ACPI_IVRS_DEVICE4);
+		else if (de->Type < 0x80)
+			p += sizeof(ACPI_IVRS_DEVICE8A);
+		else {
+			printf("Variable size IVHD type 0x%x not supported\n",
+			    de->Type);
+			break;
+		}
+	}
+
+	KASSERT((softc->end_dev_rid >= softc->start_dev_rid),
+	    ("Device end[0x%x] < start[0x%x]\n",
+	    softc->end_dev_rid, softc->start_dev_rid));
+
+	return (0);
+}
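ivhd_dev_parse() and ivhd_identify() lean on the REG_BITS() helper from amdvi_priv.h to slice fixed-width fields out of packed registers. A small worked example of the macro's [n:m] semantics (the test values are illustrative, not spec data):

#include <assert.h>

#define REG_BITS(x, n, m)	(((x) >> (m)) & \
				    ((1 << (((n) - (m)) + 1)) - 1))

int
main(void)
{
	/* Bits [21:15] of 0x003f8000 are all set: 7 bits -> 0x7f. */
	assert(REG_BITS(0x003f8000u, 21, 15) == 0x7f);
	/* Bits [7:4] of 0xf0 -> 0xf. */
	assert(REG_BITS(0xf0u, 7, 4) == 0xf);
	return (0);
}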
+
+static bool
+ivhd_is_newer(ACPI_IVRS_HEADER *old, ACPI_IVRS_HEADER *new)
+{
+	/*
+	 * Newer IVRS header types take precedence.
+	 */
+	if ((old->DeviceId == new->DeviceId) &&
+	    (old->Type == IVRS_TYPE_HARDWARE_LEGACY) &&
+	    ((new->Type == IVRS_TYPE_HARDWARE_EFR) ||
+	    (new->Type == IVRS_TYPE_HARDWARE_MIXED))) {
+		return (true);
+	}
+
+	return (false);
+}
+
+static void
+ivhd_identify(driver_t *driver, device_t parent)
+{
+	ACPI_TABLE_IVRS *ivrs;
+	ACPI_IVRS_HARDWARE *ivhd;
+	ACPI_STATUS status;
+	int i, count = 0;
+	uint32_t ivrs_ivinfo;
+
+	if (acpi_disabled("ivhd"))
+		return;
+
+	status = AcpiGetTable(ACPI_SIG_IVRS, 1, (ACPI_TABLE_HEADER **)&ivrs);
+	if (ACPI_FAILURE(status))
+		return;
+
+	if (ivrs->Header.Length == 0) {
+		return;
+	}
+
+	ivrs_ivinfo = ivrs->Info;
+	printf("AMD-Vi: IVRS Info VAsize = %d PAsize = %d GVAsize = %d"
+	    " flags:%b\n",
+	    REG_BITS(ivrs_ivinfo, 21, 15), REG_BITS(ivrs_ivinfo, 14, 8),
+	    REG_BITS(ivrs_ivinfo, 7, 5), REG_BITS(ivrs_ivinfo, 22, 22),
+	    "\020\001EFRSup");
+
+	ivrs_hdr_iterate_tbl(ivhd_count_iter, NULL);
+	if (!ivhd_count)
+		return;
+
+	for (i = 0; i < ivhd_count; i++) {
+		ivhd = ivhd_find_by_index(i);
+		KASSERT(ivhd, ("ivhd%d is NULL\n", i));
+		ivhd_hdrs[i] = ivhd;
+	}
+
+	/*
+	 * Scan for both legacy and non-legacy device types for the same
+	 * AMD-Vi device and let the newer entry override the older one.
+	 */
+	for (i = ivhd_count - 1; i > 0; i--) {
+		if (ivhd_is_newer(&ivhd_hdrs[i - 1]->Header,
+		    &ivhd_hdrs[i]->Header)) {
+			ivhd_hdrs[i - 1] = ivhd_hdrs[i];
+			ivhd_count--;
+		}
+	}
+
+	ivhd_devs = malloc(sizeof(device_t) * ivhd_count, M_DEVBUF,
+	    M_WAITOK | M_ZERO);
+	for (i = 0; i < ivhd_count; i++) {
+		ivhd = ivhd_hdrs[i];
+		KASSERT(ivhd, ("ivhd%d is NULL\n", i));
+
+		/*
+		 * Use a high order to ensure that this driver is probed after
+		 * the Host-PCI bridge and the root PCI bus.
+		 */
+		ivhd_devs[i] = BUS_ADD_CHILD(parent,
+		    ACPI_DEV_BASE_ORDER + 10 * 10, "ivhd", i);
+
+		/*
+		 * XXX: If the device was not destroyed earlier, the add will
+		 * fail; locate the old device instance instead.
+		 */
+		if (ivhd_devs[i] == NULL) {
+			ivhd_devs[i] = device_find_child(parent, "ivhd", i);
+			if (ivhd_devs[i] == NULL) {
+				printf("AMD-Vi: can't find ivhd%d\n", i);
+				break;
+			}
+		}
+		count++;
+	}
+
+	/*
+	 * Update the device count in case some devices failed to attach.
+	 */
+	ivhd_count = count;
+}
+
+static int
+ivhd_probe(device_t dev)
+{
+	ACPI_IVRS_HARDWARE *ivhd;
+	int unit;
+
+	if (acpi_get_handle(dev) != NULL)
+		return (ENXIO);
+
+	unit = device_get_unit(dev);
+	KASSERT((unit < ivhd_count),
+	    ("ivhd unit %d > count %d", unit, ivhd_count));
+	ivhd = ivhd_hdrs[unit];
+	KASSERT(ivhd, ("ivhd is NULL"));
+
+	switch (ivhd->Header.Type) {
+	case IVRS_TYPE_HARDWARE_EFR:
+		device_set_desc(dev, "AMD-Vi/IOMMU ivhd with EFR");
+		break;
+
+	case IVRS_TYPE_HARDWARE_MIXED:
+		device_set_desc(dev, "AMD-Vi/IOMMU ivhd in mixed format");
+		break;
+
+	case IVRS_TYPE_HARDWARE_LEGACY:
+	default:
+		device_set_desc(dev, "AMD-Vi/IOMMU ivhd");
+		break;
+	}
+
+	return (BUS_PROBE_NOWILDCARD);
+}
+
+static void
+ivhd_print_flag(device_t dev, enum IvrsType ivhd_type, uint8_t flag)
+{
+	/*
+	 * The legacy IVHD type has two extra high bits in the flag which
+	 * have been moved to the EFR for non-legacy devices.
+ */ + switch (ivhd_type) { + case IVRS_TYPE_HARDWARE_LEGACY: + device_printf(dev, "Flag:%b\n", flag, + "\020" + "\001HtTunEn" + "\002PassPW" + "\003ResPassPW" + "\004Isoc" + "\005IotlbSup" + "\006Coherent" + "\007PreFSup" + "\008PPRSup"); + break; + + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + device_printf(dev, "Flag:%b\n", flag, + "\020" + "\001HtTunEn" + "\002PassPW" + "\003ResPassPW" + "\004Isoc" + "\005IotlbSup" + "\006Coherent"); + break; + + default: + device_printf(dev, "Can't decode flag of ivhd type :0x%x\n", + ivhd_type); + break; + } +} + +/* + * Feature in legacy IVHD type(0x10) and attribute in newer type(0x11 and 0x40). + */ +static void +ivhd_print_feature(device_t dev, enum IvrsType ivhd_type, uint32_t feature) +{ + switch (ivhd_type) { + case IVRS_TYPE_HARDWARE_LEGACY: + device_printf(dev, "Features(type:0x%x) HATS = %d GATS = %d" + " MsiNumPPR = %d PNBanks= %d PNCounters= %d\n", + ivhd_type, + REG_BITS(feature, 31, 30), + REG_BITS(feature, 29, 28), + REG_BITS(feature, 27, 23), + REG_BITS(feature, 22, 17), + REG_BITS(feature, 16, 13)); + device_printf(dev, "max PASID = %d GLXSup = %d Feature:%b\n", + REG_BITS(feature, 12, 8), + REG_BITS(feature, 4, 3), + feature, + "\020" + "\002NXSup" + "\003GTSup" + "\004<b4>" + "\005IASup" + "\006GASup" + "\007HESup"); + break; + + /* Fewer features or attributes are reported in non-legacy type. */ + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + device_printf(dev, "Features(type:0x%x) MsiNumPPR = %d" + " PNBanks= %d PNCounters= %d\n", + ivhd_type, + REG_BITS(feature, 27, 23), + REG_BITS(feature, 22, 17), + REG_BITS(feature, 16, 13)); + break; + + default: /* Other ivhd type features are not decoded. */ + device_printf(dev, "Can't decode ivhd type :0x%x\n", ivhd_type); + } +} + +/* Print extended features of IOMMU. */ +static void +ivhd_print_ext_feature(device_t dev, uint64_t ext_feature) +{ + uint32_t ext_low, ext_high; + + if (!ext_feature) + return; + + ext_low = ext_feature; + device_printf(dev, "Extended features[31:0]:%b " + "HATS = 0x%x GATS = 0x%x " + "GLXSup = 0x%x SmiFSup = 0x%x SmiFRC = 0x%x " + "GAMSup = 0x%x DualPortLogSup = 0x%x DualEventLogSup = 0x%x\n", + (int)ext_low, + "\020" + "\001PreFSup" + "\002PPRSup" + "\003<b2>" + "\004NXSup" + "\005GTSup" + "\006<b5>" + "\007IASup" + "\008GASup" + "\009HESup" + "\010PCSup", + REG_BITS(ext_low, 11, 10), + REG_BITS(ext_low, 13, 12), + REG_BITS(ext_low, 15, 14), + REG_BITS(ext_low, 17, 16), + REG_BITS(ext_low, 20, 18), + REG_BITS(ext_low, 23, 21), + REG_BITS(ext_low, 25, 24), + REG_BITS(ext_low, 29, 28)); + + ext_high = ext_feature >> 32; + device_printf(dev, "Extended features[62:32]:%b " + "Max PASID: 0x%x DevTblSegSup = 0x%x " + "MarcSup = 0x%x\n", + (int)(ext_high), + "\020" + "\006USSup" + "\009PprOvrflwEarlySup" + "\010PPRAutoRspSup" + "\013BlKStopMrkSup" + "\014PerfOptSup" + "\015MsiCapMmioSup" + "\017GIOSup" + "\018HASup" + "\019EPHSup" + "\020AttrFWSup" + "\021HDSup" + "\023InvIotlbSup", + REG_BITS(ext_high, 5, 0), + REG_BITS(ext_high, 8, 7), + REG_BITS(ext_high, 11, 10)); +} + +static int +ivhd_print_cap(struct amdvi_softc *softc, ACPI_IVRS_HARDWARE * ivhd) +{ + device_t dev; + int max_ptp_level; + + dev = softc->dev; + + ivhd_print_flag(dev, softc->ivhd_type, softc->ivhd_flag); + ivhd_print_feature(dev, softc->ivhd_type, softc->ivhd_feature); + ivhd_print_ext_feature(dev, softc->ext_feature); + max_ptp_level = 7; + /* Make sure device support minimum page level as requested by user. 
+ */
+	if (max_ptp_level < amdvi_ptp_level) {
+		device_printf(dev, "insufficient PTP level:%d\n",
+		    max_ptp_level);
+		return (EINVAL);
+	} else {
+		device_printf(softc->dev,
+		    "supported paging level:%d, will use only: %d\n",
+		    max_ptp_level, amdvi_ptp_level);
+	}
+
+	device_printf(softc->dev, "device range: 0x%x - 0x%x\n",
+	    softc->start_dev_rid, softc->end_dev_rid);
+
+	return (0);
+}
+
+static int
+ivhd_attach(device_t dev)
+{
+	ACPI_IVRS_HARDWARE *ivhd;
+	ACPI_IVRS_HARDWARE_EFRSUP *ivhd_efr;
+	struct amdvi_softc *softc;
+	int status, unit;
+
+	unit = device_get_unit(dev);
+	KASSERT((unit < ivhd_count),
+	    ("ivhd unit %d > count %d", unit, ivhd_count));
+	/* Make sure it's the same device attach was called for. */
+	KASSERT((ivhd_devs[unit] == dev),
+	    ("Not the same device, old %p new %p", ivhd_devs[unit], dev));
+
+	softc = device_get_softc(dev);
+	softc->dev = dev;
+	ivhd = ivhd_hdrs[unit];
+	KASSERT(ivhd, ("ivhd is NULL"));
+
+	softc->ivhd_type = ivhd->Header.Type;
+	softc->pci_seg = ivhd->PciSegmentGroup;
+	softc->pci_rid = ivhd->Header.DeviceId;
+	softc->ivhd_flag = ivhd->Header.Flags;
+	/*
+	 * On the legacy IVHD type (0x10) this field is documented as a
+	 * feature; in the newer types it is an attribute.
+	 */
+	softc->ivhd_feature = ivhd->Reserved;
+	/*
+	 * The PCI capability block carries additional capabilities that
+	 * are not part of the IVRS.
+	 */
+	softc->cap_off = ivhd->CapabilityOffset;
+
+#ifdef notyet
+	/* IVHD Info bits [4:0] are the event MSI/X number. */
+	softc->event_msix = ivhd->Info & 0x1F;
+#endif
+	switch (ivhd->Header.Type) {
+	case IVRS_TYPE_HARDWARE_EFR:
+	case IVRS_TYPE_HARDWARE_MIXED:
+		ivhd_efr = (ACPI_IVRS_HARDWARE_EFRSUP *)ivhd;
+		softc->ext_feature = ivhd_efr->ExtFR;
+		break;
+	}
+
+	softc->ctrl = (struct amdvi_ctrl *)PHYS_TO_DMAP(ivhd->BaseAddress);
+	status = ivhd_dev_parse(ivhd, softc);
+	if (status != 0) {
+		device_printf(dev,
+		    "endpoint device parsing error=%d\n", status);
+	}
+
+	status = ivhd_print_cap(softc, ivhd);
+	if (status != 0) {
+		return (status);
+	}
+
+	status = amdvi_setup_hw(softc);
+	if (status != 0) {
+		device_printf(dev, "couldn't be initialised, error=%d\n",
+		    status);
+		return (status);
+	}
+
+	return (0);
+}
+
+static int
+ivhd_detach(device_t dev)
+{
+	struct amdvi_softc *softc;
+
+	softc = device_get_softc(dev);
+
+	amdvi_teardown_hw(softc);
+
+	/*
+	 * XXX: delete the device;
+	 * don't allow detach, return EBUSY.
+	 */
+	return (0);
+}
+
+static int
+ivhd_suspend(device_t dev)
+{
+
+	return (0);
+}
+
+static int
+ivhd_resume(device_t dev)
+{
+
+	return (0);
+}
+
+static device_method_t ivhd_methods[] = {
+	DEVMETHOD(device_identify, ivhd_identify),
+	DEVMETHOD(device_probe, ivhd_probe),
+	DEVMETHOD(device_attach, ivhd_attach),
+	DEVMETHOD(device_detach, ivhd_detach),
+	DEVMETHOD(device_suspend, ivhd_suspend),
+	DEVMETHOD(device_resume, ivhd_resume),
+	DEVMETHOD_END
+};
+
+static driver_t ivhd_driver = {
+	"ivhd",
+	ivhd_methods,
+	sizeof(struct amdvi_softc),
+};
+
+static devclass_t ivhd_devclass;
+
+/*
+ * Load this module at the end, after PCI re-probing, to configure
+ * interrupts.
+ */
+DRIVER_MODULE_ORDERED(ivhd, acpi, ivhd_driver, ivhd_devclass, 0, 0,
+    SI_ORDER_ANY);
+MODULE_DEPEND(ivhd, acpi, 1, 1, 1);
+MODULE_DEPEND(ivhd, pci, 1, 1, 1);
diff --git a/usr/src/uts/i86pc/io/vmm/amd/npt.c b/usr/src/uts/i86pc/io/vmm/amd/npt.c
index e1c1b79e1b..e61464a964 100644
--- a/usr/src/uts/i86pc/io/vmm/amd/npt.c
+++ b/usr/src/uts/i86pc/io/vmm/amd/npt.c
@@ -1,4 +1,6 @@
 /*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
  * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com)
  * All rights reserved.
* diff --git a/usr/src/uts/i86pc/io/vmm/amd/npt.h b/usr/src/uts/i86pc/io/vmm/amd/npt.h index 5966474711..35530d7833 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/npt.h +++ b/usr/src/uts/i86pc/io/vmm/amd/npt.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) * All rights reserved. * diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm.c b/usr/src/uts/i86pc/io/vmm/amd/svm.c index cb6251a791..9c22fc2532 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/svm.c +++ b/usr/src/uts/i86pc/io/vmm/amd/svm.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com) * All rights reserved. * @@ -50,6 +52,7 @@ __FBSDID("$FreeBSD$"); #include <machine/cpufunc.h> #include <machine/psl.h> #include <machine/md_var.h> +#include <machine/reg.h> #include <machine/specialreg.h> #include <machine/smp.h> #include <machine/vmm.h> @@ -528,8 +531,8 @@ vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa, PAT_VALUE(7, PAT_UNCACHEABLE); /* Set up DR6/7 to power-on state */ - state->dr6 = 0xffff0ff0; - state->dr7 = 0x400; + state->dr6 = DBREG_DR6_RESERVED1; + state->dr7 = DBREG_DR7_RESERVED1; } /* diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm.h b/usr/src/uts/i86pc/io/vmm/amd/svm.h index 2f4277df2f..c78f7eb067 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/svm.h +++ b/usr/src/uts/i86pc/io/vmm/amd/svm.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) * All rights reserved. * diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_msr.c b/usr/src/uts/i86pc/io/vmm/amd/svm_msr.c index 49208a351c..0417983233 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/svm_msr.c +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_msr.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014, Neel Natu (neel@freebsd.org) * All rights reserved. * diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_msr.h b/usr/src/uts/i86pc/io/vmm/amd/svm_msr.h index 07716c86de..1dba8101ab 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/svm_msr.h +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_msr.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Neel Natu (neel@freebsd.org) * All rights reserved. * diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h b/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h index 9377bf529a..8735353bb4 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) * All rights reserved. * diff --git a/usr/src/uts/i86pc/io/vmm/amd/vmcb.c b/usr/src/uts/i86pc/io/vmm/amd/vmcb.c index b1232c713d..5075b69867 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/vmcb.c +++ b/usr/src/uts/i86pc/io/vmm/amd/vmcb.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) * All rights reserved. * diff --git a/usr/src/uts/i86pc/io/vmm/amd/vmcb.h b/usr/src/uts/i86pc/io/vmm/amd/vmcb.h index 163f48f010..ec7caa91f9 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/vmcb.h +++ b/usr/src/uts/i86pc/io/vmm/amd/vmcb.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) * All rights reserved. 
* diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.c b/usr/src/uts/i86pc/io/vmm/intel/vmx.c index 1df2271b3a..769780e0d9 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx.c +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.c @@ -67,6 +67,7 @@ __FBSDID("$FreeBSD$"); #include <machine/psl.h> #include <machine/cpufunc.h> #include <machine/md_var.h> +#include <machine/reg.h> #include <machine/segments.h> #include <machine/smp.h> #include <machine/specialreg.h> @@ -227,6 +228,15 @@ static u_int vpid_alloc_failed; SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD, &vpid_alloc_failed, 0, NULL); +static int guest_l1d_flush; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RD, + &guest_l1d_flush, 0, NULL); +static int guest_l1d_flush_sw; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush_sw, CTLFLAG_RD, + &guest_l1d_flush_sw, 0, NULL); + +static struct msr_entry msr_load_list[1] __aligned(16); + /* * The definitions of SDT probes for VMX. */ @@ -297,6 +307,9 @@ SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor, SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait, "struct vmx *", "int", "struct vm_exit *"); +SDT_PROBE_DEFINE3(vmm, vmx, exit, vminsn, + "struct vmx *", "int", "struct vm_exit *"); + SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown, "struct vmx *", "int", "struct vm_exit *", "uint32_t"); @@ -627,6 +640,9 @@ vmx_cleanup(void) vpid_unr = NULL; } + if (nmi_flush_l1d_sw == 1) + nmi_flush_l1d_sw = 0; + smp_rendezvous(NULL, vmx_disable, NULL, NULL); return (0); @@ -886,6 +902,36 @@ vmx_init(int ipinum) return (error); } +#ifdef __FreeBSD__ + guest_l1d_flush = (cpu_ia32_arch_caps & IA32_ARCH_CAP_RDCL_NO) == 0; + TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush); + + /* + * L1D cache flush is enabled. Use IA32_FLUSH_CMD MSR when + * available. Otherwise fall back to the software flush + * method which loads enough data from the kernel text to + * flush existing L1D content, both on VMX entry and on NMI + * return. 
+ */ + if (guest_l1d_flush) { + if ((cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) == 0) { + guest_l1d_flush_sw = 1; + TUNABLE_INT_FETCH("hw.vmm.l1d_flush_sw", + &guest_l1d_flush_sw); + } + if (guest_l1d_flush_sw) { + if (nmi_flush_l1d_sw <= 1) + nmi_flush_l1d_sw = 1; + } else { + msr_load_list[0].index = MSR_IA32_FLUSH_CMD; + msr_load_list[0].val = IA32_FLUSH_CMD_L1D; + } + } +#else + /* L1D flushing is taken care of by ht_acquire() and friends */ + guest_l1d_flush = 0; +#endif /* __FreeBSD__ */ + /* * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 */ @@ -1109,6 +1155,15 @@ vmx_vminit(struct vm *vm, pmap_t pmap) #endif error += vmwrite(VMCS_VPID, vpid[i]); + if (guest_l1d_flush && !guest_l1d_flush_sw) { + vmcs_write(VMCS_ENTRY_MSR_LOAD, pmap_kextract( + (vm_offset_t)&msr_load_list[0])); + vmcs_write(VMCS_ENTRY_MSR_LOAD_COUNT, + nitems(msr_load_list)); + vmcs_write(VMCS_EXIT_MSR_STORE, 0); + vmcs_write(VMCS_EXIT_MSR_STORE_COUNT, 0); + } + /* exception bitmap */ if (vcpu_trace_exceptions(vm, i)) exc_bitmap = 0xffffffff; @@ -1116,8 +1171,8 @@ vmx_vminit(struct vm *vm, pmap_t pmap) exc_bitmap = 1 << IDT_MC; error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap); - vmx->ctx[i].guest_dr6 = 0xffff0ff0; - error += vmwrite(VMCS_GUEST_DR7, 0x400); + vmx->ctx[i].guest_dr6 = DBREG_DR6_RESERVED1; + error += vmwrite(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1); if (virtual_interrupt_delivery) { error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS); @@ -2976,6 +3031,19 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_MWAIT; break; + case EXIT_REASON_VMCALL: + case EXIT_REASON_VMCLEAR: + case EXIT_REASON_VMLAUNCH: + case EXIT_REASON_VMPTRLD: + case EXIT_REASON_VMPTRST: + case EXIT_REASON_VMREAD: + case EXIT_REASON_VMRESUME: + case EXIT_REASON_VMWRITE: + case EXIT_REASON_VMXOFF: + case EXIT_REASON_VMXON: + SDT_PROBE3(vmm, vmx, exit, vminsn, vmx, vcpu, vmexit); + vmexit->exitcode = VM_EXITCODE_VMINSN; + break; default: SDT_PROBE4(vmm, vmx, exit, unknown, vmx, vcpu, vmexit, reason); diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpic.c b/usr/src/uts/i86pc/io/vmm/io/vatpic.c index 6e94f5bd9a..ba4cd7785e 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vatpic.c +++ b/usr/src/uts/i86pc/io/vmm/io/vatpic.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * All rights reserved. * diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpit.h b/usr/src/uts/i86pc/io/vmm/io/vatpit.h index 12f2db2c61..4bf9fe048d 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vatpit.h +++ b/usr/src/uts/i86pc/io/vmm/io/vatpit.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * Copyright (c) 2011 NetApp, Inc. * All rights reserved. diff --git a/usr/src/uts/i86pc/io/vmm/io/vpmtmr.c b/usr/src/uts/i86pc/io/vmm/io/vpmtmr.c index 1e7bb93d7b..4df909777d 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vpmtmr.c +++ b/usr/src/uts/i86pc/io/vmm/io/vpmtmr.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014, Neel Natu (neel@freebsd.org) * All rights reserved. 
* diff --git a/usr/src/uts/i86pc/io/vmm/io/vpmtmr.h b/usr/src/uts/i86pc/io/vmm/io/vpmtmr.h index 039a28145b..e6562da5c0 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vpmtmr.h +++ b/usr/src/uts/i86pc/io/vmm/io/vpmtmr.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Neel Natu (neel@freebsd.org) * All rights reserved. * diff --git a/usr/src/uts/i86pc/io/vmm/io/vrtc.c b/usr/src/uts/i86pc/io/vmm/io/vrtc.c index 0d61631626..f12d22fc26 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vrtc.c +++ b/usr/src/uts/i86pc/io/vmm/io/vrtc.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014, Neel Natu (neel@freebsd.org) * All rights reserved. * diff --git a/usr/src/uts/i86pc/io/vmm/io/vrtc.h b/usr/src/uts/i86pc/io/vmm/io/vrtc.h index ffab3a5af0..13abbedeb9 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vrtc.h +++ b/usr/src/uts/i86pc/io/vmm/io/vrtc.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Neel Natu (neel@freebsd.org) * All rights reserved. * diff --git a/usr/src/uts/i86pc/io/vmm/vmm.c b/usr/src/uts/i86pc/io/vmm/vmm.c index 4d8cf1748d..9a4bbad9c1 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm.c +++ b/usr/src/uts/i86pc/io/vmm/vmm.c @@ -991,8 +991,8 @@ sysmem_mapping(struct vm *vm, struct mem_map *mm) return (false); } -static vm_paddr_t -sysmem_maxaddr(struct vm *vm) +vm_paddr_t +vmm_sysmem_maxaddr(struct vm *vm) { struct mem_map *mm; vm_paddr_t maxaddr; @@ -1127,7 +1127,7 @@ vm_assign_pptdev(struct vm *vm, int pptfd) if (ppt_assigned_devices(vm) == 0) { KASSERT(vm->iommu == NULL, ("vm_assign_pptdev: iommu must be NULL")); - maxaddr = sysmem_maxaddr(vm); + maxaddr = vmm_sysmem_maxaddr(vm); vm->iommu = iommu_create_domain(maxaddr); if (vm->iommu == NULL) return (ENXIO); @@ -2190,6 +2190,7 @@ restart: break; case VM_EXITCODE_MONITOR: case VM_EXITCODE_MWAIT: + case VM_EXITCODE_VMINSN: vm_inject_ud(vm, vcpuid); break; #ifndef __FreeBSD__ diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ioport.c b/usr/src/uts/i86pc/io/vmm/vmm_ioport.c index 934e01a38f..3d08fd5e85 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_ioport.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_ioport.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * All rights reserved. * diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ioport.h b/usr/src/uts/i86pc/io/vmm/vmm_ioport.h index ba51989b1a..14e315f400 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_ioport.h +++ b/usr/src/uts/i86pc/io/vmm/vmm_ioport.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * All rights reserved. * diff --git a/usr/src/uts/i86pc/sys/vmm.h b/usr/src/uts/i86pc/sys/vmm.h index c200a5eb33..163c0781cf 100644 --- a/usr/src/uts/i86pc/sys/vmm.h +++ b/usr/src/uts/i86pc/sys/vmm.h @@ -240,6 +240,7 @@ int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, struct vm_object **objptr); +vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm); void *vm_gpa_hold(struct vm *, int vcpuid, vm_paddr_t gpa, size_t len, int prot, void **cookie); void vm_gpa_release(void *cookie); @@ -587,6 +588,7 @@ enum vm_exitcode { VM_EXITCODE_SVM, VM_EXITCODE_REQIDLE, VM_EXITCODE_DEBUG, + VM_EXITCODE_VMINSN, #ifndef __FreeBSD__ VM_EXITCODE_HT, #endif |
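The VM_EXITCODE_VMINSN plumbing above routes the ten VMX instructions to vmm.c, which injects #UD into the guest. Well-behaved guests never reach that path: they test the VMX CPUID bit first, which bhyve leaves clear. An illustrative user-space probe of that bit (GCC/Clang <cpuid.h>; CPUID.1:ECX bit 5 per the Intel SDM):

#include <stdio.h>
#include <cpuid.h>

int
main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Leaf 1, ECX bit 5 advertises VMX support. */
	if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) && (ecx & (1u << 5)))
		printf("VMX advertised\n");
	else
		printf("no VMX; VMXON and friends would raise #UD\n");
	return (0);
}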

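Relatedly, the guest_l1d_flush path in the vmx.c hunk above uses the VM-entry MSR-load list so that the CPU itself writes IA32_FLUSH_CMD on every VM entry when the hardware flush is available. A sketch of the structure being configured there (entry layout per the Intel SDM's MSR-load area; the MSR number 0x10b is the architectural IA32_FLUSH_CMD, and the field names are assumptions for illustration):

#include <stdint.h>
#include <stdio.h>

/* One entry of the VM-entry MSR-load area (Intel SDM layout). */
struct msr_entry {
	uint32_t index;		/* MSR number */
	uint32_t reserved;
	uint64_t val;		/* value loaded on VM entry */
};

#define MSR_IA32_FLUSH_CMD	0x10b		/* architectural MSR */
#define IA32_FLUSH_CMD_L1D	(1ULL << 0)	/* flush the L1 data cache */

static struct msr_entry msr_load_list[1] __attribute__((aligned(16)));

int
main(void)
{
	/* Mirrors the vmx_init() hunk: the CPU performs this MSR write
	 * automatically on every VM entry once the list is installed via
	 * VMCS_ENTRY_MSR_LOAD / VMCS_ENTRY_MSR_LOAD_COUNT. */
	msr_load_list[0].index = MSR_IA32_FLUSH_CMD;
	msr_load_list[0].val = IA32_FLUSH_CMD_L1D;
	printf("MSR 0x%x <- 0x%llx on each VM entry\n",
	    msr_load_list[0].index,
	    (unsigned long long)msr_load_list[0].val);
	return (0);
}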