summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- usr/src/cmd/fm/modules/common/fabric-xlate/fabric-xlate.h 31
-rw-r--r-- usr/src/cmd/fm/modules/common/fabric-xlate/fx_fabric.c 55
-rw-r--r-- usr/src/cmd/fm/modules/common/fabric-xlate/fx_subr.c 20
-rw-r--r-- usr/src/cmd/mdb/common/modules/genunix/ldi.c 7
-rw-r--r-- usr/src/uts/common/io/nvme/nvme.c 96
-rw-r--r-- usr/src/uts/common/io/nvme/nvme_var.h 25
-rw-r--r-- usr/src/uts/common/io/pciex/hotplug/pciehpc.c 15
-rw-r--r-- usr/src/uts/common/io/pciex/pcie.c 7
-rw-r--r-- usr/src/uts/common/io/pciex/pcie_fault.c 76
-rw-r--r-- usr/src/uts/common/os/ddi_hp_impl.c 237
-rw-r--r-- usr/src/uts/common/os/ddi_hp_ndi.c 22
-rw-r--r-- usr/src/uts/common/sys/ddi_hp.h 8
-rw-r--r-- usr/src/uts/common/sys/ddi_hp_impl.h 6
-rw-r--r-- usr/src/uts/common/sys/pcie_impl.h 14
-rw-r--r-- usr/src/uts/i86pc/io/pci/pci_common.h 14
-rw-r--r-- usr/src/uts/i86pc/io/pciex/npe.c 147
-rw-r--r-- usr/src/uts/intel/io/hotplug/pcicfg/pcicfg.c 53
17 files changed, 758 insertions, 75 deletions
diff --git a/usr/src/cmd/fm/modules/common/fabric-xlate/fabric-xlate.h b/usr/src/cmd/fm/modules/common/fabric-xlate/fabric-xlate.h
index f33ea9ecd6..96e1a956af 100644
--- a/usr/src/cmd/fm/modules/common/fabric-xlate/fabric-xlate.h
+++ b/usr/src/cmd/fm/modules/common/fabric-xlate/fabric-xlate.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
#ifndef _FABRIC_XLATE_H
@@ -31,6 +32,7 @@
#include <sys/types.h>
#include <sys/pcie.h>
#include <sys/fm/io/pci.h>
+#include <limits.h>
#ifdef __cplusplus
extern "C" {
@@ -45,6 +47,17 @@ extern "C" {
#define PF_ADDR_PIO (1 << 1)
#define PF_ADDR_CFG (1 << 2)
+
+/*
+ * The fabric ereport preparation functions (fab_prep_*) in fab_erpt_tbl_t
+ * structures may return an error if the ereport could not be set up properly.
+ * Typically, these errors are errnos. It is possible that based on incoming
+ * ereport payload data, we might not want to generate an ereport at all: In
+ * this case, the preparation functions may instead return PF_EREPORT_IGNORE,
+ * which is set at a high value so as not to collide with the errnos.
+ */
+#define PF_EREPORT_IGNORE INT_MAX
+
extern fmd_xprt_t *fab_fmd_xprt; /* FMD transport layer handle */
extern char fab_buf[];
@@ -121,8 +134,21 @@ typedef struct fab_data {
uint16_t pcie_rp_ctl; /* root complex control register */
uint32_t pcie_rp_err_status; /* pcie root complex error status reg */
uint32_t pcie_rp_err_cmd; /* pcie root complex error cmd reg */
- uint16_t pcie_rp_ce_src_id; /* pcie root complex ce sourpe id */
- uint16_t pcie_rp_ue_src_id; /* pcie root complex ue sourpe id */
+ uint16_t pcie_rp_ce_src_id; /* pcie root complex ce source id */
+ uint16_t pcie_rp_ue_src_id; /* pcie root complex ue source id */
+
+ /*
+ * The slot register values refer to the registers of the component's
+ * parent slot, not the component itself.
+ *
+ * You should only use the register values -- i.e.,
+ * pcie_slot_{cap,control,status} -- if pcie_slot_data_valid is set to
+ * true.
+ */
+ boolean_t pcie_slot_data_valid; /* true if slot data is valid */
+ uint32_t pcie_slot_cap; /* pcie slot capabilities */
+ uint16_t pcie_slot_control; /* pcie slot control */
+ uint16_t pcie_slot_status; /* pcie slot status */
/* Flags */
boolean_t pcie_rp_send_all; /* need to send ereports on all rps */
@@ -131,7 +157,6 @@ typedef struct fab_data {
typedef struct fab_erpt_tbl {
const char *err_class; /* Final Ereport Class */
uint32_t reg_bit; /* Error Bit Mask */
- /* Pointer to function that prepares the ereport body */
const char *tgt_class; /* Target Ereport Class */
} fab_erpt_tbl_t;
diff --git a/usr/src/cmd/fm/modules/common/fabric-xlate/fx_fabric.c b/usr/src/cmd/fm/modules/common/fabric-xlate/fx_fabric.c
index 69ecf1aa8d..14ae738863 100644
--- a/usr/src/cmd/fm/modules/common/fabric-xlate/fx_fabric.c
+++ b/usr/src/cmd/fm/modules/common/fabric-xlate/fx_fabric.c
@@ -22,10 +22,13 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2019 Joyent, Inc.
*/
#include <stddef.h>
#include <strings.h>
#include <sys/fm/util.h>
+#include <sys/pcie.h>
#include "fabric-xlate.h"
@@ -271,6 +274,24 @@ fab_pci_fabric_to_data(fmd_hdl_t *hdl, nvlist_t *nvl, fab_data_t *data)
FAB_LOOKUP(32, "pcie_adv_rp_command", &data->pcie_rp_err_cmd);
FAB_LOOKUP(16, "pcie_adv_rp_ce_src_id", &data->pcie_rp_ce_src_id);
FAB_LOOKUP(16, "pcie_adv_rp_ue_src_id", &data->pcie_rp_ue_src_id);
+
+ /*
+ * PCIe Parent Slot Registers
+ *
+ * These are only passed in the ereport if the parent PCIe component
+ * supports the registers and the registers have valid data. As such, we
+ * look up one slot register value first: If that value is present in
+ * the input ereport data, then we know the others should be there as
+ * well. We also set the pcie_slot_data_valid flag to ensure we know
+ * the slot register data is safe to use in the module.
+ */
+ data->pcie_slot_data_valid = B_FALSE;
+ if (nvlist_lookup_uint32(nvl, "pcie_slot_cap", &data->pcie_slot_cap) ==
+ 0) {
+ FAB_LOOKUP(16, "pcie_slot_control", &data->pcie_slot_control);
+ FAB_LOOKUP(16, "pcie_slot_status", &data->pcie_slot_status);
+ data->pcie_slot_data_valid = B_TRUE;
+ }
}
static int
@@ -358,6 +379,38 @@ fab_prep_pcie_ue_erpt(fmd_hdl_t *hdl, fab_data_t *data, nvlist_t *erpt,
PCIE_AER_CTL_FST_ERR_PTR_MASK);
int err = fab_prep_basic_erpt(hdl, data->nvl, erpt, B_FALSE);
+ if (data->pcie_slot_data_valid) {
+ (void) nvlist_add_uint32(erpt, "pcie_slot_cap",
+ data->pcie_slot_cap);
+ (void) nvlist_add_uint16(erpt, "pcie_slot_control",
+ data->pcie_slot_control);
+ (void) nvlist_add_uint16(erpt, "pcie_slot_status",
+ data->pcie_slot_status);
+
+ /*
+ * It is possible to see uncorrectable errors for a slot that
+ * are related to the slot's child device being physically
+ * removed from the slot. As such, in the case that the slot
+ * reports that it is empty, we do not want to generate an
+ * ereport for all errors. Generating an ereport here will cause
+ * the eft module to fault the device and io-retire to
+ * subsequently retire the device. Retiring the device makes
+ * little sense given that the device is physically gone; more
+ * confusingly, if plugged back into the slot, it would be
+ * marked retired already.
+ *
+ * The only error ignored for this case is Completion Timeout.
+ * It is possible more errors should be ignored, and if they
+ * are seen in the field it might be worth broadening the set
+ * of ignored errors.
+ */
+ if (tbl->reg_bit == PCIE_AER_UCE_TO &&
+ ((data->pcie_slot_status &
+ PCIE_SLOTSTS_PRESENCE_DETECTED) == 0x0)) {
+ return (PF_EREPORT_IGNORE);
+ }
+ }
+
/* Generate an ereport for this error bit. */
(void) snprintf(fab_buf, FM_MAX_CLASS, "ereport.io.%s.%s",
PCIEX_ERROR_SUBCLASS, class);
@@ -776,7 +829,7 @@ fab_xlate_pcie_erpts(fmd_hdl_t *hdl, fab_data_t *data)
fmd_hdl_debug(hdl, "Sending Ereports Now");
- /* Go through the error logs and send the relavant reports */
+ /* Go through the error logs and send the relevant reports */
for (tbl = fab_master_err_tbl; tbl->erpt_tbl; tbl++) {
fab_send_erpt(hdl, data, tbl);
}
diff --git a/usr/src/cmd/fm/modules/common/fabric-xlate/fx_subr.c b/usr/src/cmd/fm/modules/common/fabric-xlate/fx_subr.c
index 8593144b28..94678dbd47 100644
--- a/usr/src/cmd/fm/modules/common/fabric-xlate/fx_subr.c
+++ b/usr/src/cmd/fm/modules/common/fabric-xlate/fx_subr.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
#include <strings.h>
#include <fm/topo_hc.h>
@@ -185,6 +186,7 @@ fab_send_erpt(fmd_hdl_t *hdl, fab_data_t *data, fab_err_tbl_t *tbl)
fab_erpt_tbl_t *erpt_tbl, *entry;
nvlist_t *erpt;
uint32_t reg;
+ int err;
erpt_tbl = tbl->erpt_tbl;
if (tbl->reg_size == 16) {
@@ -200,7 +202,9 @@ fab_send_erpt(fmd_hdl_t *hdl, fab_data_t *data, fab_err_tbl_t *tbl)
if (nvlist_alloc(&erpt, NV_UNIQUE_NAME, 0) != 0)
goto done;
- if (tbl->fab_prep(hdl, data, erpt, entry) != 0) {
+
+ err = tbl->fab_prep(hdl, data, erpt, entry);
+ if (err != 0 && err != PF_EREPORT_IGNORE) {
fmd_hdl_debug(hdl, "Prepping ereport failed: "
"class = %s\n", entry->err_class);
nvlist_free(erpt);
@@ -394,7 +398,7 @@ fab_find_rppath_by_devbdf(fmd_hdl_t *hdl, nvlist_t *nvl, pcie_req_id_t bdf)
xmlXPathObjectPtr xpathObj;
xmlNodeSetPtr nodes;
xmlNodePtr devNode;
- char *retval, *temp;
+ char *retval, *temp;
char query[500];
int i, size, bus, dev, fn;
char *hcpath;
@@ -577,7 +581,7 @@ fail:
char *
fab_find_bdf(fmd_hdl_t *hdl, nvlist_t *nvl, pcie_req_id_t bdf)
{
- char *retval;
+ char *retval;
char query[500];
int bus, dev, fn;
char rcpath[255];
@@ -705,7 +709,7 @@ found:
propgroup:
/* Retrive the "dev" propval and return */
for (devNode = devNode->children; devNode; devNode = devNode->next) {
- char *tprop;
+ char *tprop;
tprop = GET_PROP(devNode, "name");
if (STRCMP(devNode->name, "propval") &&
@@ -866,8 +870,8 @@ fab_pr(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl)
char *
fab_get_rpdev(fmd_hdl_t *hdl)
{
- char *retval;
- char query[500];
+ char *retval;
+ char query[500];
(void) snprintf(query, sizeof (query), "//propval["
"@name='extended-capabilities' and contains(@value, '%s')]"
@@ -888,8 +892,8 @@ fab_send_erpt_all_rps(fmd_hdl_t *hdl, nvlist_t *erpt)
{
xmlXPathObjectPtr xpathObj;
xmlNodeSetPtr nodes;
- char *rppath, *hbpath;
- char query[600];
+ char *rppath, *hbpath;
+ char query[600];
nvlist_t *detector, *nvl;
uint_t i, size;
size_t len;
diff --git a/usr/src/cmd/mdb/common/modules/genunix/ldi.c b/usr/src/cmd/mdb/common/modules/genunix/ldi.c
index a3ceb64421..3e4f11ba28 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/ldi.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/ldi.c
@@ -25,7 +25,7 @@
*/
/*
- * Copyright (c) 2018, Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
#include <sys/types.h>
@@ -290,7 +290,8 @@ ldi_ident(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
}
static void
-ldi_handle_header(int refs, int ident) {
+ldi_handle_header(int refs, int ident)
+{
mdb_printf("%-?s ", "HANDLE");
if (refs)
@@ -369,7 +370,7 @@ ldi_handle(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
int refs = 1;
if (mdb_getopts(argc, argv,
- 'i', MDB_OPT_SETBITS, TRUE, &ident) != argc)
+ 'i', MDB_OPT_SETBITS, TRUE, &ident, NULL) != argc)
return (DCMD_USAGE);
if (ident)
diff --git a/usr/src/uts/common/io/nvme/nvme.c b/usr/src/uts/common/io/nvme/nvme.c
index 5af89e3874..44fbf8ea89 100644
--- a/usr/src/uts/common/io/nvme/nvme.c
+++ b/usr/src/uts/common/io/nvme/nvme.c
@@ -13,7 +13,7 @@
* Copyright 2018 Nexenta Systems, Inc.
* Copyright 2016 Tegile Systems, Inc. All rights reserved.
* Copyright (c) 2016 The MathWorks, Inc. All rights reserved.
- * Copyright 2018 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
* Copyright 2019 Western Digital Corporation.
*/
@@ -58,7 +58,7 @@
* but they share some driver state: the command array (holding pointers to
* commands currently being processed by the hardware) and the active command
* counter. Access to a submission queue and the shared state is protected by
- * nq_mutex, completion queue is protected by ncq_mutex.
+ * nq_mutex; completion queue is protected by ncq_mutex.
*
* When a command is submitted to a queue pair the active command counter is
* incremented and a pointer to the command is stored in the command array. The
@@ -201,6 +201,23 @@
* device.
*
*
+ * NVMe Hotplug:
+ *
+ * The driver supports hot removal. The driver uses the NDI event framework
+ * to register a callback, nvme_remove_callback, to clean up when a disk is
+ * removed. In particular, the driver will unqueue outstanding I/O commands and
+ * set n_dead on the softstate to true so that other operations, such as ioctls
+ * and command submissions, fail as well.
+ *
+ * While the callback registration relies on the NDI event framework, the
+ * removal event itself is kicked off in the PCIe hotplug framework, when the
+ * PCIe bridge driver ("pcieb") gets a hotplug interrupt indicating that a
+ * device was removed from the slot.
+ *
+ * The NVMe driver instance itself will remain until the final close of the
+ * device.
+ *
+ *
* Driver Configuration:
*
* The following driver properties can be changed to control some aspects of the
@@ -1017,6 +1034,10 @@ nvme_submit_admin_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
static int
nvme_submit_io_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
{
+ if (cmd->nc_nvme->n_dead) {
+ return (EIO);
+ }
+
if (sema_tryp(&qp->nq_sema) == 0)
return (EAGAIN);
@@ -3181,6 +3202,47 @@ nvme_fm_errcb(dev_info_t *dip, ddi_fm_error_t *fm_error, const void *arg)
return (fm_error->fme_status);
}
+static void
+nvme_remove_callback(dev_info_t *dip, ddi_eventcookie_t cookie, void *a,
+ void *b)
+{
+ nvme_t *nvme = a;
+
+ nvme->n_dead = B_TRUE;
+
+ /*
+ * Fail all outstanding commands, including those in the admin queue
+ * (queue 0).
+ */
+ for (uint_t i = 0; i < nvme->n_ioq_count + 1; i++) {
+ nvme_qpair_t *qp = nvme->n_ioq[i];
+
+ mutex_enter(&qp->nq_mutex);
+ for (size_t j = 0; j < qp->nq_nentry; j++) {
+ nvme_cmd_t *cmd = qp->nq_cmd[j];
+ nvme_cmd_t *u_cmd;
+
+ if (cmd == NULL) {
+ continue;
+ }
+
+ /*
+ * Since we have the queue lock held the entire time we
+ * iterate over it, it's not possible for the queue to
+ * change underneath us. Thus, we don't need to check
+ * that the return value of nvme_unqueue_cmd matches the
+ * requested cmd to unqueue.
+ */
+ u_cmd = nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid);
+ taskq_dispatch_ent((taskq_t *)cmd->nc_nvme->n_cmd_taskq,
+ cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
+
+ ASSERT3P(u_cmd, ==, cmd);
+ }
+ mutex_exit(&qp->nq_mutex);
+ }
+}
+
static int
nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
@@ -3203,6 +3265,17 @@ nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
ddi_set_driver_private(dip, nvme);
nvme->n_dip = dip;
+ /* Set up event handlers for hot removal. */
+ if (ddi_get_eventcookie(nvme->n_dip, DDI_DEVI_REMOVE_EVENT,
+ &nvme->n_rm_cookie) != DDI_SUCCESS) {
+ goto fail;
+ }
+ if (ddi_add_event_handler(nvme->n_dip, nvme->n_rm_cookie,
+ nvme_remove_callback, nvme, &nvme->n_ev_rm_cb_id) !=
+ DDI_SUCCESS) {
+ goto fail;
+ }
+
mutex_init(&nvme->n_minor.nm_mutex, NULL, MUTEX_DRIVER, NULL);
nvme->n_strict_version = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
@@ -3510,6 +3583,12 @@ nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
if (nvme->n_product != NULL)
strfree(nvme->n_product);
+ /* Clean up hot removal event handler. */
+ if (nvme->n_ev_rm_cb_id != NULL) {
+ (void) ddi_remove_event_handler(nvme->n_ev_rm_cb_id);
+ }
+ nvme->n_ev_rm_cb_id = NULL;
+
ddi_soft_state_free(nvme_state, instance);
return (DDI_SUCCESS);
@@ -3692,6 +3771,11 @@ static int
nvme_bd_mediainfo(void *arg, bd_media_t *media)
{
nvme_namespace_t *ns = arg;
+ nvme_t *nvme = ns->ns_nvme;
+
+ if (nvme->n_dead) {
+ return (EIO);
+ }
media->m_nblks = ns->ns_block_count;
media->m_blksize = ns->ns_block_size;
@@ -3712,8 +3796,9 @@ nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc)
boolean_t poll;
int ret;
- if (nvme->n_dead)
+ if (nvme->n_dead) {
return (EIO);
+ }
cmd = nvme_create_nvm_cmd(ns, opc, xfer);
if (cmd == NULL)
@@ -3794,6 +3879,11 @@ static int
nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid)
{
nvme_namespace_t *ns = arg;
+ nvme_t *nvme = ns->ns_nvme;
+
+ if (nvme->n_dead) {
+ return (EIO);
+ }
/*LINTED: E_BAD_PTR_CAST_ALIGN*/
if (*(uint64_t *)ns->ns_eui64 != 0) {
diff --git a/usr/src/uts/common/io/nvme/nvme_var.h b/usr/src/uts/common/io/nvme/nvme_var.h
index 6f3b53d3ec..7e2d1783d5 100644
--- a/usr/src/uts/common/io/nvme/nvme_var.h
+++ b/usr/src/uts/common/io/nvme/nvme_var.h
@@ -12,7 +12,7 @@
/*
* Copyright 2018 Nexenta Systems, Inc.
* Copyright 2016 The MathWorks, Inc. All rights reserved.
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
* Copyright 2019 Western Digital Corporation.
*/
@@ -111,20 +111,23 @@ struct nvme_cq {
struct nvme_qpair {
size_t nq_nentry;
+ /* submission fields */
nvme_dma_t *nq_sqdma;
nvme_sqe_t *nq_sq;
uint_t nq_sqhead;
uint_t nq_sqtail;
uintptr_t nq_sqtdbl;
+ /* completion */
nvme_cq_t *nq_cq;
- nvme_cmd_t **nq_cmd;
- uint16_t nq_next_cmd;
- uint_t nq_active_cmds;
+ /* shared structures for completion and submission */
+ nvme_cmd_t **nq_cmd; /* active command array */
+ uint16_t nq_next_cmd; /* next potential empty queue slot */
+ uint_t nq_active_cmds; /* number of active cmds */
- kmutex_t nq_mutex;
- ksema_t nq_sema;
+ kmutex_t nq_mutex; /* protects shared state */
+ ksema_t nq_sema; /* semaphore to ensure q always has >= 1 empty slot */
};
struct nvme {
@@ -179,12 +182,17 @@ struct nvme {
int n_pagesize;
int n_namespace_count;
- uint_t n_ioq_count;
+ uint_t n_ioq_count; /* number of I/O command queues */
uint_t n_cq_count;
nvme_identify_ctrl_t *n_idctl;
+ /* Pointer to the admin queue, which is always queue 0 in n_ioq. */
nvme_qpair_t *n_adminq;
+ /*
+ * All command queues, including the admin queue.
+ * Its length is: n_ioq_count + 1.
+ */
nvme_qpair_t **n_ioq;
nvme_cq_t **n_cq;
@@ -242,6 +250,9 @@ struct nvme {
uint32_t n_vendor_event;
uint32_t n_unknown_event;
+ /* hot removal NDI event handling */
+ ddi_eventcookie_t n_rm_cookie;
+ ddi_callback_id_t n_ev_rm_cb_id;
};
struct nvme_namespace {
diff --git a/usr/src/uts/common/io/pciex/hotplug/pciehpc.c b/usr/src/uts/common/io/pciex/hotplug/pciehpc.c
index 5ce219bd2f..3e4beda495 100644
--- a/usr/src/uts/common/io/pciex/hotplug/pciehpc.c
+++ b/usr/src/uts/common/io/pciex/hotplug/pciehpc.c
@@ -396,6 +396,21 @@ pciehpc_intr(dev_info_t *dip)
control & ~PCIE_SLOTCTL_PWR_FAULT_EN);
/*
+ * If supported, notify the child device driver that the
+ * device is being removed.
+ */
+ dev_info_t *cdip = ddi_get_child(dip);
+ if (cdip != NULL) {
+ ddi_eventcookie_t rm_cookie;
+ if (ddi_get_eventcookie(cdip,
+ DDI_DEVI_REMOVE_EVENT,
+ &rm_cookie) == DDI_SUCCESS) {
+ ndi_post_event(dip, cdip, rm_cookie,
+ NULL);
+ }
+ }
+
+ /*
* Ask DDI Hotplug framework to change state to Empty
*/
(void) ndi_hp_state_change_req(dip,
diff --git a/usr/src/uts/common/io/pciex/pcie.c b/usr/src/uts/common/io/pciex/pcie.c
index 4eff14d563..3be7aeac6e 100644
--- a/usr/src/uts/common/io/pciex/pcie.c
+++ b/usr/src/uts/common/io/pciex/pcie.c
@@ -786,6 +786,13 @@ pcie_init_pfd(dev_info_t *dip)
PCIE_ZALLOC(pf_pcix_ecc_regs_t);
}
}
+
+ PCIE_SLOT_REG(pfd_p) = PCIE_ZALLOC(pf_pcie_slot_regs_t);
+ PCIE_SLOT_REG(pfd_p)->pcie_slot_regs_valid = B_FALSE;
+ PCIE_SLOT_REG(pfd_p)->pcie_slot_cap = 0;
+ PCIE_SLOT_REG(pfd_p)->pcie_slot_control = 0;
+ PCIE_SLOT_REG(pfd_p)->pcie_slot_status = 0;
+
} else if (PCIE_IS_PCIX(bus_p)) {
if (PCIE_IS_BDG(bus_p)) {
PCIX_BDG_ERR_REG(pfd_p) =
diff --git a/usr/src/uts/common/io/pciex/pcie_fault.c b/usr/src/uts/common/io/pciex/pcie_fault.c
index 6a335db3e2..3f14041e80 100644
--- a/usr/src/uts/common/io/pciex/pcie_fault.c
+++ b/usr/src/uts/common/io/pciex/pcie_fault.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2017, Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
#include <sys/sysmacros.h>
@@ -200,7 +200,7 @@ pf_eh_exit(pcie_bus_t *bus_p)
* for the root_pfd_p.
*
* "Root Complexes" such as NPE and PX should call scan_fabric using itself as
- * the rdip. PCIe Root ports should call pf_scan_fabric using it's parent as
+ * the rdip. PCIe Root ports should call pf_scan_fabric using its parent as
* the rdip.
*
* Scan fabric initiated from RCs are likely due to a fabric message, traps or
@@ -587,6 +587,35 @@ pf_pcie_regs_gather(pf_data_t *pfd_p, pcie_bus_t *bus_p)
PCIE_ROOTCTL);
}
+ /*
+ * For eligible components, we gather Slot Register state.
+ *
+ * Eligible components are:
+ * - a Downstream Port or a Root Port with the Slot Implemented
+ * capability bit set
+ * - hotplug capable
+ *
+ * Slot register state is useful, for instance, to determine whether the
+ * Slot's child device is physically present (via the Slot Status
+ * register).
+ */
+ if ((PCIE_IS_SWD(bus_p) || PCIE_IS_ROOT(bus_p)) &&
+ PCIE_IS_HOTPLUG_ENABLED(PCIE_BUS2DIP(bus_p))) {
+ pf_pcie_slot_regs_t *pcie_slot_regs = PCIE_SLOT_REG(pfd_p);
+ pcie_slot_regs->pcie_slot_cap = PCIE_CAP_GET(32, bus_p,
+ PCIE_SLOTCAP);
+ pcie_slot_regs->pcie_slot_control = PCIE_CAP_GET(16, bus_p,
+ PCIE_SLOTCTL);
+ pcie_slot_regs->pcie_slot_status = PCIE_CAP_GET(16, bus_p,
+ PCIE_SLOTSTS);
+
+ if (pcie_slot_regs->pcie_slot_cap != PCI_EINVAL32 &&
+ pcie_slot_regs->pcie_slot_control != PCI_EINVAL16 &&
+ pcie_slot_regs->pcie_slot_status != PCI_EINVAL16) {
+ pcie_slot_regs->pcie_slot_regs_valid = B_TRUE;
+ }
+ }
+
if (!PCIE_HAS_AER(bus_p))
return;
@@ -838,7 +867,7 @@ pf_pci_find_rp_fault(pf_data_t *pfd_p, pcie_bus_t *bus_p)
* Check to see if an error has been received that
* requires a scan of the fabric. Count the number of
* faults seen. If MUL CE/FE_NFE that counts for
- * atleast 2 faults, so just return with full_scan.
+ * at least 2 faults, so just return with full_scan.
*/
if ((root_err & PCIE_AER_RE_STS_MUL_CE_RCVD) ||
(root_err & PCIE_AER_RE_STS_MUL_FE_NFE_RCVD)) {
@@ -1232,7 +1261,7 @@ const pf_fab_err_tbl_t pcie_rp_tbl[] = {
{PCIE_AER_UCE_FCP, pf_panic,
PF_AFFECTED_SELF | PF_AFFECTED_CHILDREN, 0},
- {PCIE_AER_UCE_TO, pf_panic,
+ {PCIE_AER_UCE_TO, pf_analyse_to,
PF_AFFECTED_ADDR, PF_AFFECTED_CHILDREN},
{PCIE_AER_UCE_CA, pf_no_panic,
@@ -1916,16 +1945,35 @@ pf_analyse_sc(ddi_fm_error_t *derr, uint32_t bit, pf_data_t *dq_head_p,
/*
* PCIe Timeout error analyser. This error can be forgiven if it is marked as
* CE Advisory. If it is marked as advisory, this means the HW can recover
- * and/or retry the transaction automatically.
+ * and/or retry the transaction automatically. Additionally, if a device's
+ * parent slot reports that it is no longer physically present, we do not panic,
+ * as one would not expect a missing device to respond to a command.
*/
/* ARGSUSED */
static int
pf_analyse_to(ddi_fm_error_t *derr, uint32_t bit, pf_data_t *dq_head_p,
pf_data_t *pfd_p)
{
+ dev_info_t *rpdip = PCIE_PFD2BUS(pfd_p)->bus_rp_dip;
+ pf_data_t *rppfd = PCIE_DIP2PFD(rpdip);
+ pf_pcie_slot_regs_t *p_pcie_slot_regs;
+
if (HAS_AER_LOGS(pfd_p, bit) && CE_ADVISORY(pfd_p))
return (PF_ERR_NO_PANIC);
+ p_pcie_slot_regs = PCIE_SLOT_REG(rppfd);
+ if (p_pcie_slot_regs->pcie_slot_regs_valid) {
+ /*
+ * If the device is reported gone from its parent slot, then it
+ * is expected that any outstanding commands would time out. In
+ * this case, do not panic.
+ */
+ if ((p_pcie_slot_regs->pcie_slot_status &
+ PCIE_SLOTSTS_PRESENCE_DETECTED) == 0x0) {
+ return (PF_ERR_NO_PANIC);
+ }
+ }
+
return (PF_ERR_PANIC);
}
@@ -2970,6 +3018,24 @@ pf_send_ereport(ddi_fm_error_t *derr, pf_impl_t *impl)
NULL);
}
+ /*
+ * Slot Status registers
+ *
+ * Since we only gather these for certain types of components,
+ * only put these registers into the ereport if we have valid
+ * data.
+ */
+ if (PCIE_SLOT_REG(pfd_p)->pcie_slot_regs_valid) {
+ fm_payload_set(ereport,
+ "pcie_slot_cap", DATA_TYPE_UINT32,
+ PCIE_SLOT_REG(pfd_p)->pcie_slot_cap,
+ "pcie_slot_control", DATA_TYPE_UINT16,
+ PCIE_SLOT_REG(pfd_p)->pcie_slot_control,
+ "pcie_slot_status", DATA_TYPE_UINT16,
+ PCIE_SLOT_REG(pfd_p)->pcie_slot_status,
+ NULL);
+ }
+
generic:
/* IOV related information */
if (!PCIE_BDG_IS_UNASSIGNED(PCIE_PFD2BUS(impl->pf_dq_head_p))) {
diff --git a/usr/src/uts/common/os/ddi_hp_impl.c b/usr/src/uts/common/os/ddi_hp_impl.c
index 79165af9ff..38e575dbfd 100644
--- a/usr/src/uts/common/os/ddi_hp_impl.c
+++ b/usr/src/uts/common/os/ddi_hp_impl.c
@@ -21,12 +21,239 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2019 Joyent, Inc.
*/
/*
* Sun DDI hotplug implementation specific functions
*/
+/*
+ * HOTPLUG FRAMEWORK
+ *
+ * The hotplug framework (also referred to as "SHP", for "Solaris Hotplug
+ * Framework") refers to a large set of userland and kernel interfaces,
+ * including those in this file, that provide functionality related to device
+ * hotplug.
+ *
+ * Hotplug is a broad term that refers to both removal and insertion of devices
+ * on a live system. Such operations can have varying levels of notification to
+ * the system. Coordinated hotplug means that the operating system is notified
+ * in advance that a device will have a hotplug operation performed on it.
+ * Non-coordinated hotplug, also called "surprise removal", does not have such
+ * notification; the device is simply removed from or inserted into the system.
+ *
+ * The goals of a correct hotplug operation will vary based on the device. In
+ * general, though, we want the system to gracefully notice the device change
+ * and clean up (or create) any relevant structures related to using the device
+ * in the system.
+ *
+ * The goals of the hotplug framework are to provide common interfaces for nexus
+ * drivers, device drivers, and userland programs to build a foundation for
+ * implementing hotplug for a variety of devices. Notably, common support for
+ * PCIe devices is available. See also: the nexus driver for PCIe devices at
+ * uts/i86pc/io/pciex/npe.c.
+ *
+ *
+ * TERMINOLOGY
+ *
+ * The following terms may be useful when exploring hotplug-related code.
+ *
+ * PHYSICAL HOTPLUG
+ * Refers to hotplug operations on a physical hardware receptacle.
+ *
+ * VIRTUAL HOTPLUG
+ * Refers to hotplug operations on an arbitrary device node in the device
+ * tree.
+ *
+ * CONNECTION (often abbreviated "cn")
+ * A place where either physical or virtual hotplug happens. This is a more
+ * generic term to refer to "connectors" and "ports", which represent
+ * physical and virtual places where hotplug happens, respectively.
+ *
+ * CONNECTOR
+ * A place where physical hotplug happens. For example: a PCIe slot, a USB
+ * port, a SAS port, and a Fibre Channel port are all connectors.
+ *
+ * PORT
+ * A place where virtual hotplug happens. A port refers to an arbitrary
+ * place under a nexus dev_info node in the device tree.
+ *
+ *
+ * CONNECTION STATE MACHINE
+ *
+ * Connections have the states below. Connectors and ports are grouped into
+ * the same state machine. It is worth noting that the edges here are incomplete
+ * -- it is possible for a connection to move straight from ENABLED to EMPTY,
+ * for instance, if there is a surprise removal of its device.
+ *
+ * State changes are kicked off in two ways:
+ * - Through the nexus driver interface, ndi_hp_state_change_req. PCIe
+ * nexus drivers that pass a hotplug interrupt through to pciehpc will kick
+ * off state changes in this way.
+ * - Through coordinated removal, ddihp_modctl. Both cfgadm(1M) and
+ * hotplug(1M) pass state change requests through hotplugd, which uses
+ * modctl to request state changes to the DDI hotplug framework. That
+ * interface is ultimately implemented by ddihp_modctl.
+ *
+ * (start)
+ * |
+ * v
+ * EMPTY no component plugged into connector
+ * ^
+ * v
+ * PRESENT component plugged into connector
+ * ^
+ * v
+ * POWERED connector is powered
+ * ^
+ * v
+ * ENABLED connector is fully functional
+ * |
+ * .
+ * .
+ * .
+ * v
+ * (create port)
+ * |
+ * v
+ * PORT EMPTY port has no device occupying it
+ * ^
+ * v
+ * PORT PRESENT port occupied by device
+ *
+ *
+ * ARCHITECTURE DIAGRAM
+ *
+ * The following is a non-exhaustive summary of various components in the system
+ * that implement pieces of the hotplug framework. More detailed descriptions
+ * of some key components are below.
+ *
+ * +------------+
+ * | cfgadm(1M) |
+ * +------------+
+ * |
+ * +-------------------+
+ * | SHP cfgadm plugin |
+ * +-------------------+
+ * |
+ * +-------------+ +------------+
+ * | hotplug(1M) |----------| libhotplug |
+ * +-------------+ +------------+
+ * |
+ * +----------+
+ * | hotplugd |
+ * +----------+
+ * |
+ * +----------------+
+ * | modctl (HP op) |
+ * +----------------+
+ * |
+ * |
+ * User |
+ * =============================|===============================================
+ * Kernel |
+ * |
+ * |
+ * +------------------------+ +----------------+
+ * | DDI hotplug interfaces | --- | Device Drivers |
+ * +------------------------+ +----------------+
+ * | |
+ * | +------------------------+
+ * | | NDI hotplug interfaces |
+ * | +------------------------+
+ * | |
+ * | |
+ * +-------------+ +--------------+ +---------------------------+
+ * | `bus_hp_op` | -- |"pcie" module | --- | "npe" (PCIe nexus driver) |
+ * +-------------+ +--------------+ +---------------------------+
+ * | |
+ * | +-------------------+
+ * | | PCIe configurator |
+ * | +-------------------+
+ * |
+ * +-------------------------------------+
+ * | "pciehpc" (PCIe hotplug controller) |
+ * +-------------------------------------+
+ *
+ *
+ * .
+ * .
+ * .
+ * .
+ * .
+ * |
+ * |
+ * +-----------------------------------+
+ * | I/O Subsystem |
+ * | (LDI notifications and contracts) |
+ * +-----------------------------------+
+ *
+ *
+ * KEY HOTPLUG SOFTWARE COMPONENTS
+ *
+ * CFGADM(1M)
+ *
+ * cfgadm is the canonical tool for hotplug operations. It can be used to
+ * list connections on the system and change their state in a coordinated
+ * fashion. For more information, see its manual page.
+ *
+ *
+ * HOTPLUG(1M)
+ *
+ * hotplug is a command line tool for managing hotplug connections for
+ * connectors. For more information, see its manual page.
+ *
+ *
+ * DDI HOTPLUG INTERFACES
+ *
+ * This part of the framework provides interfaces for changing device state
+ * for connectors, including onlining and offlining child devices. Many of
+ * these functions are defined in this file.
+ *
+ *
+ * NDI HOTPLUG INTERFACES
+ *
+ * Nexus drivers can define their own hotplug bus implementations by
+ * defining a bus_hp_op entry point. This entry point must implement
+ * a set of hotplug related commands, including getting, probing, and
+ * changing connection state, as well as port creation and removal.
+ *
+ * Nexus drivers may also want to use the following interfaces for
+ * implementing hotplug. Note that the PCIe Hotplug Controller ("pciehpc")
+ * already takes care of using these:
+ * ndi_hp_{register,unregister}
+ * ndi_hp_state_change_req
+ * ndi_hp_walk_cn
+ *
+ * PCIe nexus drivers should use the common entry point pcie_hp_common_ops,
+ * which implements hotplug commands for PCIe devices, calling into other
+ * parts of the framework as needed.
+ *
+ *
+ * NPE DRIVER ("npe")
+ *
+ * npe is the common nexus driver for PCIe devices on x86. It implements
+ * hotplug using the NDI interfaces. For more information, see
+ * uts/i86pc/io/pciex/npe.c.
+ *
+ * The equivalent driver for SPARC is "px".
+ *
+ *
+ * PCIe HOTPLUG CONTROLLER DRIVER ("pciehpc")
+ *
+ * All hotplug-capable PCIe buses will initialize their own PCIe HPC,
+ * including the pcieb and ppb drivers. The controller maintains
+ * hotplug-related state about the slots on its bus, including their status
+ * and port state. It also features a common implementation of handling
+ * hotplug-related PCIe interrupts.
+ *
+ * For more information, see its interfaces in
+ * uts/common/sys/hotplug/pci/pciehpc.h.
+ *
+ */
+
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/file.h>
@@ -163,7 +390,9 @@ done:
}
/*
- * Return the state of Hotplug Connection (CN)
+ * Fetch the state of Hotplug Connection (CN).
+ * This function will also update the state and last changed timestamp in the
+ * connection handle structure if the state has changed.
*/
int
ddihp_cn_getstate(ddi_hp_cn_handle_t *hdlp)
@@ -597,7 +826,7 @@ ddihp_cn_pre_change_state(ddi_hp_cn_handle_t *hdlp,
curr_state == DDI_HP_CN_STATE_ENABLED) {
/*
* If the Connection goes to a lower state from ENABLED,
- * then offline all children under it.
+ * then offline all children under it.
*/
rv = ddihp_cn_change_children_state(hdlp, B_FALSE);
if (rv != DDI_SUCCESS) {
@@ -640,7 +869,7 @@ ddihp_cn_pre_change_state(ddi_hp_cn_handle_t *hdlp,
}
/*
- * Jobs after change state of a Connector: update last change time,
+ * Jobs after change state of a Connector: update state, last change time,
* probe, online, sysevent, etc.
*/
static int
@@ -813,7 +1042,7 @@ ddihp_cn_change_children_state(ddi_hp_cn_handle_t *hdlp, boolean_t online)
NDI_SUCCESS) {
cmn_err(CE_WARN,
"(%s%d):"
- " failed to dettach driver for the device"
+ " failed to detach driver for the device"
" (%s%d) in the Connection %s\n",
ddi_driver_name(dip), ddi_get_instance(dip),
ddi_driver_name(cdip),
diff --git a/usr/src/uts/common/os/ddi_hp_ndi.c b/usr/src/uts/common/os/ddi_hp_ndi.c
index a41a12fc74..73c62dc6b9 100644
--- a/usr/src/uts/common/os/ddi_hp_ndi.c
+++ b/usr/src/uts/common/os/ddi_hp_ndi.c
@@ -21,6 +21,8 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2019 Joyent, Inc.
*/
/*
@@ -380,13 +382,19 @@ ddihp_cn_req_handler(ddi_hp_cn_handle_t *hdlp,
ASSERT(DEVI_BUSY_OWNED(dip));
- if (ddihp_cn_getstate(hdlp) != DDI_SUCCESS) {
- DDI_HP_NEXDBG((CE_CONT, "ddihp_cn_req_handler: dip %p, "
- "hdlp %p ddi_cn_getstate failed\n", (void *)dip,
- (void *)hdlp));
-
- return (NDI_UNCLAIMED);
- }
+ /*
+ * We do not want to fetch the state first, as calling ddihp_cn_getstate
+ * will update the cn_state member of the connection handle. The
+ * connector's hotplug operations rely on this value to know how
+ * target_state compares to the last known state of the device and make
+ * decisions about whether to clean up, post sysevents about the state
+ * change, and so on.
+ *
+ * Instead, just carry out the request to change the state. The
+ * connector's hotplug operations will update the state in the
+ * connection handle after they complete their necessary state change
+ * actions.
+ */
if (hdlp->cn_info.cn_state != target_state) {
ddi_hp_cn_state_t result_state = 0;
diff --git a/usr/src/uts/common/sys/ddi_hp.h b/usr/src/uts/common/sys/ddi_hp.h
index eadb88ed49..b88762a9f5 100644
--- a/usr/src/uts/common/sys/ddi_hp.h
+++ b/usr/src/uts/common/sys/ddi_hp.h
@@ -21,6 +21,8 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2019 Joyent, Inc.
*/
#ifndef _SYS_DDI_HP_H
@@ -28,6 +30,9 @@
/*
* Sun DDI hotplug support definitions
+ *
+ * See the big theory statement in uts/common/os/ddi_hp_impl.c for more
+ * information.
*/
#ifdef __cplusplus
@@ -73,7 +78,8 @@ typedef enum {
/*
* ddi_hp_cn_info_t
*
- * Hotplug Connection (CN) information structure
+ * Hotplug Connection (CN) information structure.
+ * A Connection is either a Connector or a Port.
*/
typedef struct ddi_hp_cn_info {
char *cn_name; /* Name of the Connection */
diff --git a/usr/src/uts/common/sys/ddi_hp_impl.h b/usr/src/uts/common/sys/ddi_hp_impl.h
index fb220119dd..b52df77cac 100644
--- a/usr/src/uts/common/sys/ddi_hp_impl.h
+++ b/usr/src/uts/common/sys/ddi_hp_impl.h
@@ -21,6 +21,12 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+/*
+ * See the big theory statement in uts/common/os/ddi_hp_impl.c for more
+ * information about the structures and functions defined here.
*/
#ifndef _SYS_DDI_HP_IMPL_H
diff --git a/usr/src/uts/common/sys/pcie_impl.h b/usr/src/uts/common/sys/pcie_impl.h
index d1d13625c2..442c55043c 100644
--- a/usr/src/uts/common/sys/pcie_impl.h
+++ b/usr/src/uts/common/sys/pcie_impl.h
@@ -166,6 +166,7 @@ extern "C" {
#define PCIE_ADV_BDG_HDR(pfd_p, n) PCIE_ADV_BDG_REG(pfd_p)->pcie_sue_hdr[n]
#define PCIE_ADV_RP_REG(pfd_p) \
PCIE_ADV_REG(pfd_p)->pcie_ext.pcie_adv_rp_regs
+#define PCIE_SLOT_REG(pfd_p) pfd_p->pe_pcie_slot_regs
#define PFD_AFFECTED_DEV(pfd_p) pfd_p->pe_affected_dev
#define PFD_SET_AFFECTED_FLAG(pfd_p, aff_flag) \
PFD_AFFECTED_DEV(pfd_p)->pe_affected_flags = aff_flag
@@ -262,6 +263,18 @@ typedef struct pf_pcie_err_regs {
pf_pcie_adv_err_regs_t *pcie_adv_regs; /* pcie aer regs */
} pf_pcie_err_regs_t;
+/*
+ * Slot register values for hotplug-capable Downstream Ports or Root Ports with
+ * the Slot Implemented capability bit set. We gather these to help determine
+ * whether the slot's child device is physically present.
+ */
+typedef struct pf_pcie_slot_regs {
+ boolean_t pcie_slot_regs_valid; /* true if register values are valid */
+ uint32_t pcie_slot_cap; /* pcie slot capabilities register */
+ uint16_t pcie_slot_control; /* pcie slot control register */
+ uint16_t pcie_slot_status; /* pcie slot status register */
+} pf_pcie_slot_regs_t;
+
typedef enum {
PF_INTR_TYPE_NONE = 0,
PF_INTR_TYPE_FABRIC = 1, /* Fabric Message */
@@ -431,6 +444,7 @@ struct pf_data {
pf_pcie_err_regs_t *pe_pcie_regs; /* PCIe error reg */
} pe_ext;
pf_pcix_bdg_err_regs_t *pe_pcix_bdg_regs; /* PCI-X bridge regs */
+ pf_pcie_slot_regs_t *pe_pcie_slot_regs; /* PCIe slot regs */
pf_data_t *pe_prev; /* Next error in queue */
pf_data_t *pe_next; /* Next error in queue */
boolean_t pe_rber_fatal;
diff --git a/usr/src/uts/i86pc/io/pci/pci_common.h b/usr/src/uts/i86pc/io/pci/pci_common.h
index 63fe4bb165..d5fa3bfd55 100644
--- a/usr/src/uts/i86pc/io/pci/pci_common.h
+++ b/usr/src/uts/i86pc/io/pci/pci_common.h
@@ -22,6 +22,8 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2019 Joyent, Inc.
*/
#ifndef _PCI_PCI_COMMON_H
@@ -33,7 +35,7 @@ extern "C" {
/*
* Common header file with definitions shared between
- * pci(7d) and npe(7d)
+ * pci(7D) and npe(7D)
*/
/* State structure. */
@@ -45,12 +47,18 @@ typedef struct pci_state {
kmutex_t pci_mutex;
kmutex_t pci_peek_poke_mutex;
kmutex_t pci_err_mutex;
+
+ /*
+ * The following members are only used by npe(7D).
+ * See uts/i86pc/io/pciex/npe.c for more information.
+ */
+ ndi_event_hdl_t pci_ndi_event_hdl;
} pci_state_t;
/*
* These are the access routines.
- * The pci_bus_map sets the handle to point to these in pci(7d).
- * The npe_bus_map sets the handle to point to these in npe(7d).
+ * The pci_bus_map sets the handle to point to these in pci(7D).
+ * The npe_bus_map sets the handle to point to these in npe(7D).
*/
uint8_t pci_config_rd8(ddi_acc_impl_t *hdlp, uint8_t *addr);
uint16_t pci_config_rd16(ddi_acc_impl_t *hdlp, uint16_t *addr);
diff --git a/usr/src/uts/i86pc/io/pciex/npe.c b/usr/src/uts/i86pc/io/pciex/npe.c
index 4ef393ddb0..fcb68164ee 100644
--- a/usr/src/uts/i86pc/io/pciex/npe.c
+++ b/usr/src/uts/i86pc/io/pciex/npe.c
@@ -26,11 +26,35 @@
/*
* Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
/*
- * Host to PCI-Express local bus driver
+ * npe (Nexus PCIe driver): Host to PCI-Express local bus driver
+ *
+ * npe serves as the driver for PCIe Root Complexes and as the nexus driver
+ * for PCIe devices. See also: npe(7D). For more information about hotplug,
+ * see the big theory statement at uts/common/os/ddi_hp_impl.c.
+ *
+ *
+ * NDI EVENT HANDLING SUPPORT
+ *
+ * npe supports NDI event handling. The only available event is surprise
+ * removal of a device. Child drivers can register surprise removal event
+ * callbacks by requesting an event cookie using ddi_get_eventcookie for
+ * the DDI_DEVI_REMOVE_EVENT and adding their callback using
+ * ddi_add_event_handler. For an example, see the nvme driver in
+ * uts/common/io/nvme/nvme.c.
+ *
+ * The NDI events in npe are retrieved using NDI_EVENT_NOPASS, which
+ * prevents them from being propagated up the tree once they reach the npe's
+ * bus_get_eventcookie operations. This is important because npe maintains
+ * the state of PCIe devices and their receptacles, via the PCIe hotplug
+ * controller driver (pciehpc).
+ *
+ * Hot removal events are ultimately posted by the PCIe hotplug controller
+ * interrupt handler for hotplug events. Events are posted using the
+ * ndi_post_event interface.
*/
#include <sys/conf.h>
@@ -72,6 +96,15 @@ static int npe_intr_ops(dev_info_t *, dev_info_t *, ddi_intr_op_t,
ddi_intr_handle_impl_t *, void *);
static int npe_fm_init(dev_info_t *, dev_info_t *, int,
ddi_iblock_cookie_t *);
+static int npe_bus_get_eventcookie(dev_info_t *, dev_info_t *, char *,
+ ddi_eventcookie_t *);
+static int npe_bus_add_eventcall(dev_info_t *, dev_info_t *,
+ ddi_eventcookie_t, void (*)(dev_info_t *,
+ ddi_eventcookie_t, void *, void *),
+ void *, ddi_callback_id_t *);
+static int npe_bus_remove_eventcall(dev_info_t *, ddi_callback_id_t);
+static int npe_bus_post_event(dev_info_t *, dev_info_t *,
+ ddi_eventcookie_t, void *);
static int npe_fm_callback(dev_info_t *, ddi_fm_error_t *, const void *);
@@ -102,10 +135,10 @@ struct bus_ops npe_bus_ops = {
ddi_dma_mctl,
npe_ctlops,
ddi_bus_prop_op,
- 0, /* (*bus_get_eventcookie)(); */
- 0, /* (*bus_add_eventcall)(); */
- 0, /* (*bus_remove_eventcall)(); */
- 0, /* (*bus_post_event)(); */
+ npe_bus_get_eventcookie,
+ npe_bus_add_eventcall,
+ npe_bus_remove_eventcall,
+ npe_bus_post_event,
0, /* (*bus_intr_ctl)(); */
0, /* (*bus_config)(); */
0, /* (*bus_unconfig)(); */
@@ -271,12 +304,27 @@ npe_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
return (ret);
}
+/*
+ * See big theory statement at the top of this file for more information about
+ * surprise removal events.
+ */
+#define NPE_EVENT_TAG_HOT_REMOVAL 0
+static ndi_event_definition_t npe_ndi_event_defs[1] = {
+ {NPE_EVENT_TAG_HOT_REMOVAL, DDI_DEVI_REMOVE_EVENT, EPL_KERNEL,
+ NDI_EVENT_POST_TO_ALL}
+};
+
+static ndi_event_set_t npe_ndi_events = {
+ NDI_EVENTS_REV1, ARRAY_SIZE(npe_ndi_event_defs), npe_ndi_event_defs
+};
+
/*ARGSUSED*/
static int
npe_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
int instance = ddi_get_instance(devi);
pci_state_t *pcip = NULL;
+ int ret;
if (cmd == DDI_RESUME) {
/*
@@ -316,6 +364,22 @@ npe_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
if (pcie_init(devi, NULL) != DDI_SUCCESS)
goto fail1;
+ ret = ndi_event_alloc_hdl(pcip->pci_dip, NULL, &pcip->pci_ndi_event_hdl,
+ NDI_SLEEP);
+ if (ret == NDI_SUCCESS) {
+ ret = ndi_event_bind_set(pcip->pci_ndi_event_hdl,
+ &npe_ndi_events, NDI_SLEEP);
+ if (ret != NDI_SUCCESS) {
+ dev_err(pcip->pci_dip, CE_WARN, "npe: failed to bind "
+ "NDI event set (error=%d)", ret);
+ goto fail1;
+ }
+ } else {
+ dev_err(pcip->pci_dip, CE_WARN, "npe: failed to allocate "
+ "event handle (error=%d)", ret);
+ goto fail1;
+ }
+
/* Second arg: initialize for pci_express root nexus */
if (pcitool_init(devi, B_TRUE) != DDI_SUCCESS)
goto fail2;
@@ -352,11 +416,36 @@ npe_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
{
int instance = ddi_get_instance(devi);
pci_state_t *pcip;
+ int ret;
pcip = ddi_get_soft_state(npe_statep, ddi_get_instance(devi));
switch (cmd) {
case DDI_DETACH:
+
+ /*
+ * Clean up event handling first, to ensure there are no
+ * outstanding callbacks registered.
+ */
+ ret = ndi_event_unbind_set(pcip->pci_ndi_event_hdl,
+ &npe_ndi_events, NDI_SLEEP);
+ if (ret == NDI_SUCCESS) {
+ /* ndi_event_free_hdl always succeeds. */
+ (void) ndi_event_free_hdl(pcip->pci_ndi_event_hdl);
+ } else {
+ /*
+ * The event set will only fail to unbind if there are
+ * outstanding callbacks registered for it, which
+ * probably means a child driver still has one
+ * registered and thus was not cleaned up properly
+ * before npe's detach routine was called. Consequently,
+ * we should fail the detach here.
+ */
+ dev_err(pcip->pci_dip, CE_WARN, "npe: failed to "
+ "unbind NDI event set (error=%d)", ret);
+ return (DDI_FAILURE);
+ }
+
pcie_fab_fini_bus(devi, PCIE_BUS_INITIAL);
/* Uninitialize pcitool support. */
@@ -373,6 +462,7 @@ npe_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
ddi_fm_fini(devi);
ddi_soft_state_free(npe_statep, instance);
+
return (DDI_SUCCESS);
case DDI_SUSPEND:
@@ -414,7 +504,7 @@ static int
npe_bus_map(dev_info_t *dip, dev_info_t *rdip, ddi_map_req_t *mp,
off_t offset, off_t len, caddr_t *vaddrp)
{
- int rnumber;
+ int rnumber;
int space;
ddi_acc_impl_t *ap;
ddi_acc_hdl_t *hp;
@@ -1111,6 +1201,49 @@ npe_fm_init(dev_info_t *dip, dev_info_t *tdip, int cap,
return (pcip->pci_fmcap);
}
+static int
+npe_bus_get_eventcookie(dev_info_t *dip, dev_info_t *rdip, char *eventname,
+ ddi_eventcookie_t *cookiep)
+{
+ pci_state_t *pcip = ddi_get_soft_state(npe_statep,
+ ddi_get_instance(dip));
+
+ return (ndi_event_retrieve_cookie(pcip->pci_ndi_event_hdl, rdip,
+ eventname, cookiep, NDI_EVENT_NOPASS));
+}
+
+static int
+npe_bus_add_eventcall(dev_info_t *dip, dev_info_t *rdip,
+ ddi_eventcookie_t cookie, void (*callback)(dev_info_t *dip,
+ ddi_eventcookie_t cookie, void *arg, void *bus_impldata),
+ void *arg, ddi_callback_id_t *cb_id)
+{
+ pci_state_t *pcip = ddi_get_soft_state(npe_statep,
+ ddi_get_instance(dip));
+
+ return (ndi_event_add_callback(pcip->pci_ndi_event_hdl, rdip, cookie,
+ callback, arg, NDI_SLEEP, cb_id));
+}
+
+static int
+npe_bus_remove_eventcall(dev_info_t *dip, ddi_callback_id_t cb_id)
+{
+ pci_state_t *pcip = ddi_get_soft_state(npe_statep,
+ ddi_get_instance(dip));
+ return (ndi_event_remove_callback(pcip->pci_ndi_event_hdl, cb_id));
+}
+
+static int
+npe_bus_post_event(dev_info_t *dip, dev_info_t *rdip,
+ ddi_eventcookie_t cookie, void *impl_data)
+{
+ pci_state_t *pcip = ddi_get_soft_state(npe_statep,
+ ddi_get_instance(dip));
+ return (ndi_event_do_callback(pcip->pci_ndi_event_hdl, rdip, cookie,
+ impl_data));
+
+}
+
/*ARGSUSED*/
static int
npe_fm_callback(dev_info_t *dip, ddi_fm_error_t *derr, const void *no_used)
diff --git a/usr/src/uts/intel/io/hotplug/pcicfg/pcicfg.c b/usr/src/uts/intel/io/hotplug/pcicfg/pcicfg.c
index b482117c7c..3f890d8f07 100644
--- a/usr/src/uts/intel/io/hotplug/pcicfg/pcicfg.c
+++ b/usr/src/uts/intel/io/hotplug/pcicfg/pcicfg.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2019, Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
/*
@@ -1251,7 +1251,7 @@ pcicfg_ntbridge_unconfigure_child(dev_info_t *new_device, uint_t devno)
{
dev_info_t *new_ntbridgechild;
- int len, bus;
+ int len, bus;
uint16_t vid;
ddi_acc_handle_t config_handle;
pci_bus_range_t pci_bus_range;
@@ -1368,7 +1368,7 @@ pcicfg_is_ntbridge(dev_info_t *dip)
static uint_t
pcicfg_ntbridge_child(dev_info_t *dip)
{
- int len, val, rc = DDI_FAILURE;
+ int len, val, rc = DDI_FAILURE;
dev_info_t *anode = dip;
/*
@@ -1398,7 +1398,7 @@ pcicfg_ntbridge_child(dev_info_t *dip)
static uint_t
pcicfg_get_ntbridge_child_range(dev_info_t *dip, uint64_t *boundbase,
- uint64_t *boundlen, uint_t space_type)
+ uint64_t *boundlen, uint_t space_type)
{
int length, found = DDI_FAILURE, acount, i, ibridge;
pci_regspec_t *assigned;
@@ -1584,6 +1584,7 @@ static int
pcicfg_teardown_device(dev_info_t *dip, pcicfg_flags_t flags, boolean_t is_pcie)
{
ddi_acc_handle_t handle;
+ int ret;
/*
* Free up resources associated with 'dip'
@@ -1596,10 +1597,20 @@ pcicfg_teardown_device(dev_info_t *dip, pcicfg_flags_t flags, boolean_t is_pcie)
/*
* disable the device
*/
- if (pcicfg_config_setup(dip, &handle) != PCICFG_SUCCESS)
+
+ ret = pcicfg_config_setup(dip, &handle);
+ if (ret == PCICFG_SUCCESS) {
+ pcicfg_device_off(handle);
+ pcicfg_config_teardown(&handle);
+ } else if (ret != PCICFG_NODEVICE) {
+ /*
+ * PCICFG_NODEVICE is not treated as a failure here: the device
+ * may no longer exist -- for instance, if it has been pulled
+ * from a hotpluggable slot -- so there is just less to clean
+ * up. Any other error from config setup fails the teardown.
+ */
return (PCICFG_FAILURE);
- pcicfg_device_off(handle);
- pcicfg_config_teardown(&handle);
+ }
if (is_pcie) {
/*
@@ -2401,8 +2412,7 @@ pcicfg_get_mem(pcicfg_phdl_t *entry, uint32_t length, uint64_t *ans)
}
static void
-pcicfg_get_io(pcicfg_phdl_t *entry,
- uint32_t length, uint32_t *ans)
+pcicfg_get_io(pcicfg_phdl_t *entry, uint32_t length, uint32_t *ans)
{
uint32_t new_io;
uint64_t io_last;
@@ -3189,7 +3199,7 @@ pcicfg_device_off(ddi_acc_handle_t config_handle)
*/
static int
pcicfg_set_standard_props(dev_info_t *dip, ddi_acc_handle_t config_handle,
- uint8_t pcie_dev)
+ uint8_t pcie_dev)
{
int ret;
uint16_t cap_id_loc, val;
@@ -3361,7 +3371,7 @@ pcicfg_set_busnode_props(dev_info_t *dip, uint8_t pcie_device_type)
static int
pcicfg_set_childnode_props(dev_info_t *dip, ddi_acc_handle_t config_handle,
- uint8_t pcie_dev)
+ uint8_t pcie_dev)
{
int ret;
@@ -3521,8 +3531,8 @@ pcicfg_set_childnode_props(dev_info_t *dip, ddi_acc_handle_t config_handle,
* Program the bus numbers into the bridge
*/
static void
-pcicfg_set_bus_numbers(ddi_acc_handle_t config_handle,
-uint_t primary, uint_t secondary, uint_t subordinate)
+pcicfg_set_bus_numbers(ddi_acc_handle_t config_handle, uint_t primary,
+ uint_t secondary, uint_t subordinate)
{
DEBUG3("Setting bridge bus-range %d,%d,%d\n", primary, secondary,
subordinate);
@@ -3547,8 +3557,7 @@ uint_t primary, uint_t secondary, uint_t subordinate)
* Put bridge registers into initial state
*/
static void
-pcicfg_setup_bridge(pcicfg_phdl_t *entry,
- ddi_acc_handle_t handle)
+pcicfg_setup_bridge(pcicfg_phdl_t *entry, ddi_acc_handle_t handle)
{
/*
* The highest bus seen during probing is the max-subordinate bus
@@ -3607,8 +3616,7 @@ pcicfg_setup_bridge(pcicfg_phdl_t *entry,
}
static void
-pcicfg_update_bridge(pcicfg_phdl_t *entry,
- ddi_acc_handle_t handle)
+pcicfg_update_bridge(pcicfg_phdl_t *entry, ddi_acc_handle_t handle)
{
uint_t length;
@@ -3853,11 +3861,10 @@ failedconfig:
* Sizing the BARs and update "reg" property
*/
static int
-pcicfg_populate_reg_props(dev_info_t *new_child,
- ddi_acc_handle_t config_handle)
+pcicfg_populate_reg_props(dev_info_t *new_child, ddi_acc_handle_t config_handle)
{
int i;
- uint32_t request;
+ uint32_t request;
i = PCI_CONF_BASE0;
@@ -5079,7 +5086,7 @@ pcicfg_config_teardown(ddi_acc_handle_t *handle)
static int
pcicfg_add_config_reg(dev_info_t *dip,
- uint_t bus, uint_t device, uint_t func)
+ uint_t bus, uint_t device, uint_t func)
{
int reg[10] = { PCI_ADDR_CONFIG, 0, 0, 0, 0};
@@ -5104,8 +5111,8 @@ pcicfg_ari_configure(dev_info_t *dip)
#ifdef DEBUG
static void
-debug(char *fmt, uintptr_t a1, uintptr_t a2, uintptr_t a3,
- uintptr_t a4, uintptr_t a5)
+debug(char *fmt, uintptr_t a1, uintptr_t a2, uintptr_t a3, uintptr_t a4,
+ uintptr_t a5)
{
if (pcicfg_debug > 1) {
prom_printf("pcicfg: ");