diff options
| author | Jordan Paige Hendricks <jordan.hendricks@joyent.com> | 2019-03-25 17:22:13 +0000 |
|---|---|---|
| committer | Jordan Paige Hendricks <jordan.hendricks@joyent.com> | 2019-08-27 18:52:13 +0000 |
| commit | 175e9b5e7534aad7d965dc4a8c6e9b0dd8e6f1f0 (patch) | |
| tree | 90d539b4766a111948ee8f654bea62269f8446ca | |
| parent | ba347c6543e6ff6ffba5d804e1705af24514e945 (diff) | |
| download | illumos-joyent-175e9b5e7534aad7d965dc4a8c6e9b0dd8e6f1f0.tar.gz | |
OS-5553 Want NVMe Hotplug Support
OS-7968 x86 pci configurator should not fail device teardown if device is gone
OS-7969 DDI hotplug request handler resets connection handle state before performing state change operations
OS-7691 ldi_handle dcmd segfaults occasionally
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Rob Johnston <rob.johnston@joyent.com>
Approved by: Joshua M. Clulow <jmc@joyent.com>
| -rw-r--r-- | usr/src/cmd/fm/modules/common/fabric-xlate/fabric-xlate.h | 31 | ||||
| -rw-r--r-- | usr/src/cmd/fm/modules/common/fabric-xlate/fx_fabric.c | 55 | ||||
| -rw-r--r-- | usr/src/cmd/fm/modules/common/fabric-xlate/fx_subr.c | 20 | ||||
| -rw-r--r-- | usr/src/cmd/mdb/common/modules/genunix/ldi.c | 7 | ||||
| -rw-r--r-- | usr/src/uts/common/io/nvme/nvme.c | 96 | ||||
| -rw-r--r-- | usr/src/uts/common/io/nvme/nvme_var.h | 25 | ||||
| -rw-r--r-- | usr/src/uts/common/io/pciex/hotplug/pciehpc.c | 15 | ||||
| -rw-r--r-- | usr/src/uts/common/io/pciex/pcie.c | 7 | ||||
| -rw-r--r-- | usr/src/uts/common/io/pciex/pcie_fault.c | 76 | ||||
| -rw-r--r-- | usr/src/uts/common/os/ddi_hp_impl.c | 237 | ||||
| -rw-r--r-- | usr/src/uts/common/os/ddi_hp_ndi.c | 22 | ||||
| -rw-r--r-- | usr/src/uts/common/sys/ddi_hp.h | 8 | ||||
| -rw-r--r-- | usr/src/uts/common/sys/ddi_hp_impl.h | 6 | ||||
| -rw-r--r-- | usr/src/uts/common/sys/pcie_impl.h | 14 | ||||
| -rw-r--r-- | usr/src/uts/i86pc/io/pci/pci_common.h | 14 | ||||
| -rw-r--r-- | usr/src/uts/i86pc/io/pciex/npe.c | 147 | ||||
| -rw-r--r-- | usr/src/uts/intel/io/hotplug/pcicfg/pcicfg.c | 53 |
17 files changed, 758 insertions, 75 deletions
diff --git a/usr/src/cmd/fm/modules/common/fabric-xlate/fabric-xlate.h b/usr/src/cmd/fm/modules/common/fabric-xlate/fabric-xlate.h index f33ea9ecd6..96e1a956af 100644 --- a/usr/src/cmd/fm/modules/common/fabric-xlate/fabric-xlate.h +++ b/usr/src/cmd/fm/modules/common/fabric-xlate/fabric-xlate.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #ifndef _FABRIC_XLATE_H @@ -31,6 +32,7 @@ #include <sys/types.h> #include <sys/pcie.h> #include <sys/fm/io/pci.h> +#include <limits.h> #ifdef __cplusplus extern "C" { @@ -45,6 +47,17 @@ extern "C" { #define PF_ADDR_PIO (1 << 1) #define PF_ADDR_CFG (1 << 2) + +/* + * The fabric ereport preparation functions (fab_prep_*) in fab_erpt_tbl_t + * structures may return an error if the ereport could not be set up properly. + * Typically, these errors are errnos. It is possible that based on incoming + * ereport payload data, we might not want to generate an ereport at all: In + * this case, the preparation functions may instead return PF_EREPORT_IGNORE, + * which is set at a high value so as not to collide with the errnos. + */ +#define PF_EREPORT_IGNORE INT_MAX + extern fmd_xprt_t *fab_fmd_xprt; /* FMD transport layer handle */ extern char fab_buf[]; @@ -121,8 +134,21 @@ typedef struct fab_data { uint16_t pcie_rp_ctl; /* root complex control register */ uint32_t pcie_rp_err_status; /* pcie root complex error status reg */ uint32_t pcie_rp_err_cmd; /* pcie root complex error cmd reg */ - uint16_t pcie_rp_ce_src_id; /* pcie root complex ce sourpe id */ - uint16_t pcie_rp_ue_src_id; /* pcie root complex ue sourpe id */ + uint16_t pcie_rp_ce_src_id; /* pcie root complex ce source id */ + uint16_t pcie_rp_ue_src_id; /* pcie root complex ue source id */ + + /* + * The slot register values refer to the registers of the component's + * parent slot, not the component itself. 
+ * + * You should only use the register values -- i.e., + * pcie_slot_{cap,control,status} -- if pcie_slot_data_valid is set to + * true. + */ + boolean_t pcie_slot_data_valid; /* true if slot data is valid */ + uint32_t pcie_slot_cap; /* pcie slot capabilities */ + uint16_t pcie_slot_control; /* pcie slot control */ + uint16_t pcie_slot_status; /* pcie slot status */ /* Flags */ boolean_t pcie_rp_send_all; /* need to send ereports on all rps */ @@ -131,7 +157,6 @@ typedef struct fab_data { typedef struct fab_erpt_tbl { const char *err_class; /* Final Ereport Class */ uint32_t reg_bit; /* Error Bit Mask */ - /* Pointer to function that prepares the ereport body */ const char *tgt_class; /* Target Ereport Class */ } fab_erpt_tbl_t; diff --git a/usr/src/cmd/fm/modules/common/fabric-xlate/fx_fabric.c b/usr/src/cmd/fm/modules/common/fabric-xlate/fx_fabric.c index 69ecf1aa8d..14ae738863 100644 --- a/usr/src/cmd/fm/modules/common/fabric-xlate/fx_fabric.c +++ b/usr/src/cmd/fm/modules/common/fabric-xlate/fx_fabric.c @@ -22,10 +22,13 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2019 Joyent, Inc. */ #include <stddef.h> #include <strings.h> #include <sys/fm/util.h> +#include <sys/pcie.h> #include "fabric-xlate.h" @@ -271,6 +274,24 @@ fab_pci_fabric_to_data(fmd_hdl_t *hdl, nvlist_t *nvl, fab_data_t *data) FAB_LOOKUP(32, "pcie_adv_rp_command", &data->pcie_rp_err_cmd); FAB_LOOKUP(16, "pcie_adv_rp_ce_src_id", &data->pcie_rp_ce_src_id); FAB_LOOKUP(16, "pcie_adv_rp_ue_src_id", &data->pcie_rp_ue_src_id); + + /* + * PCIe Parent Slot Registers + * + * These are only passed in the ereport if the parent PCIe component + * supports the registers and the registers have valid data. As such, we + * look up one slot register value first: If that value is present in + * the input ereport data, then we know the others should be there as + * well. 
We also set the pcie_slot_data_valid flag to ensure we know + * the slot register data is safe to use in the module. + */ + data->pcie_slot_data_valid = B_FALSE; + if (nvlist_lookup_uint32(nvl, "pcie_slot_cap", &data->pcie_slot_cap) == + 0) { + FAB_LOOKUP(16, "pcie_slot_control", &data->pcie_slot_control); + FAB_LOOKUP(16, "pcie_slot_status", &data->pcie_slot_status); + data->pcie_slot_data_valid = B_TRUE; + } } static int @@ -358,6 +379,38 @@ fab_prep_pcie_ue_erpt(fmd_hdl_t *hdl, fab_data_t *data, nvlist_t *erpt, PCIE_AER_CTL_FST_ERR_PTR_MASK); int err = fab_prep_basic_erpt(hdl, data->nvl, erpt, B_FALSE); + if (data->pcie_slot_data_valid) { + (void) nvlist_add_uint32(erpt, "pcie_slot_cap", + data->pcie_slot_cap); + (void) nvlist_add_uint16(erpt, "pcie_slot_control", + data->pcie_slot_control); + (void) nvlist_add_uint16(erpt, "pcie_slot_status", + data->pcie_slot_status); + + /* + * It is possible to see uncorrectable errors for a slot that + * are related to the slot's child device being physically + * removed from the slot. As such, in the case that the slot + * reports that it is empty, we do not want to generate an + * ereport for all errors. Generating an ereport here will cause + * the eft module to fault the device and io-retire to + * subsequently retire the device. Retiring the device makes + * little sense given that the device is physically gone; more + * confusingly, if plugged back into the slot, it would be + * marked retired already. + * + * The only error ignored for this case is Completion Timeout. + * It is possible more errors should be ignored, and if they + * are seen in the field it might be worth broadening the set + * of ignored errors. + */ + if (tbl->reg_bit == PCIE_AER_UCE_TO && + ((data->pcie_slot_status & + PCIE_SLOTSTS_PRESENCE_DETECTED) == 0x0)) { + return (PF_EREPORT_IGNORE); + } + } + /* Generate an ereport for this error bit. 
*/ (void) snprintf(fab_buf, FM_MAX_CLASS, "ereport.io.%s.%s", PCIEX_ERROR_SUBCLASS, class); @@ -776,7 +829,7 @@ fab_xlate_pcie_erpts(fmd_hdl_t *hdl, fab_data_t *data) fmd_hdl_debug(hdl, "Sending Ereports Now"); - /* Go through the error logs and send the relavant reports */ + /* Go through the error logs and send the relevant reports */ for (tbl = fab_master_err_tbl; tbl->erpt_tbl; tbl++) { fab_send_erpt(hdl, data, tbl); } diff --git a/usr/src/cmd/fm/modules/common/fabric-xlate/fx_subr.c b/usr/src/cmd/fm/modules/common/fabric-xlate/fx_subr.c index 8593144b28..94678dbd47 100644 --- a/usr/src/cmd/fm/modules/common/fabric-xlate/fx_subr.c +++ b/usr/src/cmd/fm/modules/common/fabric-xlate/fx_subr.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #include <strings.h> #include <fm/topo_hc.h> @@ -185,6 +186,7 @@ fab_send_erpt(fmd_hdl_t *hdl, fab_data_t *data, fab_err_tbl_t *tbl) fab_erpt_tbl_t *erpt_tbl, *entry; nvlist_t *erpt; uint32_t reg; + int err; erpt_tbl = tbl->erpt_tbl; if (tbl->reg_size == 16) { @@ -200,7 +202,9 @@ fab_send_erpt(fmd_hdl_t *hdl, fab_data_t *data, fab_err_tbl_t *tbl) if (nvlist_alloc(&erpt, NV_UNIQUE_NAME, 0) != 0) goto done; - if (tbl->fab_prep(hdl, data, erpt, entry) != 0) { + + err = tbl->fab_prep(hdl, data, erpt, entry); + if (err != 0 && err != PF_EREPORT_IGNORE) { fmd_hdl_debug(hdl, "Prepping ereport failed: " "class = %s\n", entry->err_class); nvlist_free(erpt); @@ -394,7 +398,7 @@ fab_find_rppath_by_devbdf(fmd_hdl_t *hdl, nvlist_t *nvl, pcie_req_id_t bdf) xmlXPathObjectPtr xpathObj; xmlNodeSetPtr nodes; xmlNodePtr devNode; - char *retval, *temp; + char *retval, *temp; char query[500]; int i, size, bus, dev, fn; char *hcpath; @@ -577,7 +581,7 @@ fail: char * fab_find_bdf(fmd_hdl_t *hdl, nvlist_t *nvl, pcie_req_id_t bdf) { - char *retval; + char *retval; char query[500]; int bus, dev, fn; char rcpath[255]; @@ -705,7 +709,7 @@ found: propgroup: /* Retrive the "dev" 
propval and return */ for (devNode = devNode->children; devNode; devNode = devNode->next) { - char *tprop; + char *tprop; tprop = GET_PROP(devNode, "name"); if (STRCMP(devNode->name, "propval") && @@ -866,8 +870,8 @@ fab_pr(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl) char * fab_get_rpdev(fmd_hdl_t *hdl) { - char *retval; - char query[500]; + char *retval; + char query[500]; (void) snprintf(query, sizeof (query), "//propval[" "@name='extended-capabilities' and contains(@value, '%s')]" @@ -888,8 +892,8 @@ fab_send_erpt_all_rps(fmd_hdl_t *hdl, nvlist_t *erpt) { xmlXPathObjectPtr xpathObj; xmlNodeSetPtr nodes; - char *rppath, *hbpath; - char query[600]; + char *rppath, *hbpath; + char query[600]; nvlist_t *detector, *nvl; uint_t i, size; size_t len; diff --git a/usr/src/cmd/mdb/common/modules/genunix/ldi.c b/usr/src/cmd/mdb/common/modules/genunix/ldi.c index a3ceb64421..3e4f11ba28 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/ldi.c +++ b/usr/src/cmd/mdb/common/modules/genunix/ldi.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2018, Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #include <sys/types.h> @@ -290,7 +290,8 @@ ldi_ident(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) } static void -ldi_handle_header(int refs, int ident) { +ldi_handle_header(int refs, int ident) +{ mdb_printf("%-?s ", "HANDLE"); if (refs) @@ -369,7 +370,7 @@ ldi_handle(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) int refs = 1; if (mdb_getopts(argc, argv, - 'i', MDB_OPT_SETBITS, TRUE, &ident) != argc) + 'i', MDB_OPT_SETBITS, TRUE, &ident, NULL) != argc) return (DCMD_USAGE); if (ident) diff --git a/usr/src/uts/common/io/nvme/nvme.c b/usr/src/uts/common/io/nvme/nvme.c index 5af89e3874..44fbf8ea89 100644 --- a/usr/src/uts/common/io/nvme/nvme.c +++ b/usr/src/uts/common/io/nvme/nvme.c @@ -13,7 +13,7 @@ * Copyright 2018 Nexenta Systems, Inc. * Copyright 2016 Tegile Systems, Inc. All rights reserved. * Copyright (c) 2016 The MathWorks, Inc. All rights reserved. 
- * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright 2019 Western Digital Corporation. */ @@ -58,7 +58,7 @@ * but they share some driver state: the command array (holding pointers to * commands currently being processed by the hardware) and the active command * counter. Access to a submission queue and the shared state is protected by - * nq_mutex, completion queue is protected by ncq_mutex. + * nq_mutex; completion queue is protected by ncq_mutex. * * When a command is submitted to a queue pair the active command counter is * incremented and a pointer to the command is stored in the command array. The @@ -201,6 +201,23 @@ * device. * * + * NVMe Hotplug: + * + * The driver supports hot removal. The driver uses the NDI event framework + * to register a callback, nvme_remove_callback, to clean up when a disk is + * removed. In particular, the driver will unqueue outstanding I/O commands and + * set n_dead on the softstate to true so that other operations, such as ioctls + * and command submissions, fail as well. + * + * While the callback registration relies on the NDI event framework, the + * removal event itself is kicked off in the PCIe hotplug framework, when the + * PCIe bridge driver ("pcieb") gets a hotplug interrupt indicating that a + * device was removed from the slot. + * + * The NVMe driver instance itself will remain until the final close of the + * device. 
+ * + * * Driver Configuration: * * The following driver properties can be changed to control some aspects of the @@ -1017,6 +1034,10 @@ nvme_submit_admin_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd) static int nvme_submit_io_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd) { + if (cmd->nc_nvme->n_dead) { + return (EIO); + } + if (sema_tryp(&qp->nq_sema) == 0) return (EAGAIN); @@ -3181,6 +3202,47 @@ nvme_fm_errcb(dev_info_t *dip, ddi_fm_error_t *fm_error, const void *arg) return (fm_error->fme_status); } +static void +nvme_remove_callback(dev_info_t *dip, ddi_eventcookie_t cookie, void *a, + void *b) +{ + nvme_t *nvme = a; + + nvme->n_dead = B_TRUE; + + /* + * Fail all outstanding commands, including those in the admin queue + * (queue 0). + */ + for (uint_t i = 0; i < nvme->n_ioq_count + 1; i++) { + nvme_qpair_t *qp = nvme->n_ioq[i]; + + mutex_enter(&qp->nq_mutex); + for (size_t j = 0; j < qp->nq_nentry; j++) { + nvme_cmd_t *cmd = qp->nq_cmd[j]; + nvme_cmd_t *u_cmd; + + if (cmd == NULL) { + continue; + } + + /* + * Since we have the queue lock held the entire time we + * iterate over it, it's not possible for the queue to + * change underneath us. Thus, we don't need to check + * that the return value of nvme_unqueue_cmd matches the + * requested cmd to unqueue. + */ + u_cmd = nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid); + taskq_dispatch_ent((taskq_t *)cmd->nc_nvme->n_cmd_taskq, + cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent); + + ASSERT3P(u_cmd, ==, cmd); + } + mutex_exit(&qp->nq_mutex); + } +} + static int nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { @@ -3203,6 +3265,17 @@ nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) ddi_set_driver_private(dip, nvme); nvme->n_dip = dip; + /* Set up event handlers for hot removal. 
*/ + if (ddi_get_eventcookie(nvme->n_dip, DDI_DEVI_REMOVE_EVENT, + &nvme->n_rm_cookie) != DDI_SUCCESS) { + goto fail; + } + if (ddi_add_event_handler(nvme->n_dip, nvme->n_rm_cookie, + nvme_remove_callback, nvme, &nvme->n_ev_rm_cb_id) != + DDI_SUCCESS) { + goto fail; + } + mutex_init(&nvme->n_minor.nm_mutex, NULL, MUTEX_DRIVER, NULL); nvme->n_strict_version = ddi_prop_get_int(DDI_DEV_T_ANY, dip, @@ -3510,6 +3583,12 @@ nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) if (nvme->n_product != NULL) strfree(nvme->n_product); + /* Clean up hot removal event handler. */ + if (nvme->n_ev_rm_cb_id != NULL) { + (void) ddi_remove_event_handler(nvme->n_ev_rm_cb_id); + } + nvme->n_ev_rm_cb_id = NULL; + ddi_soft_state_free(nvme_state, instance); return (DDI_SUCCESS); @@ -3692,6 +3771,11 @@ static int nvme_bd_mediainfo(void *arg, bd_media_t *media) { nvme_namespace_t *ns = arg; + nvme_t *nvme = ns->ns_nvme; + + if (nvme->n_dead) { + return (EIO); + } media->m_nblks = ns->ns_block_count; media->m_blksize = ns->ns_block_size; @@ -3712,8 +3796,9 @@ nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc) boolean_t poll; int ret; - if (nvme->n_dead) + if (nvme->n_dead) { return (EIO); + } cmd = nvme_create_nvm_cmd(ns, opc, xfer); if (cmd == NULL) @@ -3794,6 +3879,11 @@ static int nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid) { nvme_namespace_t *ns = arg; + nvme_t *nvme = ns->ns_nvme; + + if (nvme->n_dead) { + return (EIO); + } /*LINTED: E_BAD_PTR_CAST_ALIGN*/ if (*(uint64_t *)ns->ns_eui64 != 0) { diff --git a/usr/src/uts/common/io/nvme/nvme_var.h b/usr/src/uts/common/io/nvme/nvme_var.h index 6f3b53d3ec..7e2d1783d5 100644 --- a/usr/src/uts/common/io/nvme/nvme_var.h +++ b/usr/src/uts/common/io/nvme/nvme_var.h @@ -12,7 +12,7 @@ /* * Copyright 2018 Nexenta Systems, Inc. * Copyright 2016 The MathWorks, Inc. All rights reserved. - * Copyright 2017 Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright 2019 Western Digital Corporation. 
*/ @@ -111,20 +111,23 @@ struct nvme_cq { struct nvme_qpair { size_t nq_nentry; + /* submission fields */ nvme_dma_t *nq_sqdma; nvme_sqe_t *nq_sq; uint_t nq_sqhead; uint_t nq_sqtail; uintptr_t nq_sqtdbl; + /* completion */ nvme_cq_t *nq_cq; - nvme_cmd_t **nq_cmd; - uint16_t nq_next_cmd; - uint_t nq_active_cmds; + /* shared structures for completion and submission */ + nvme_cmd_t **nq_cmd; /* active command array */ + uint16_t nq_next_cmd; /* next potential empty queue slot */ + uint_t nq_active_cmds; /* number of active cmds */ - kmutex_t nq_mutex; - ksema_t nq_sema; + kmutex_t nq_mutex; /* protects shared state */ + ksema_t nq_sema; /* semaphore to ensure q always has >= 1 empty slot */ }; struct nvme { @@ -179,12 +182,17 @@ struct nvme { int n_pagesize; int n_namespace_count; - uint_t n_ioq_count; + uint_t n_ioq_count; /* number of I/O command queues */ uint_t n_cq_count; nvme_identify_ctrl_t *n_idctl; + /* Pointer to the admin queue, which is always queue 0 in n_ioq. */ nvme_qpair_t *n_adminq; + /* + * All command queues, including the admin queue. + * Its length is: n_ioq_count + 1. + */ nvme_qpair_t **n_ioq; nvme_cq_t **n_cq; @@ -242,6 +250,9 @@ struct nvme { uint32_t n_vendor_event; uint32_t n_unknown_event; + /* hot removal NDI event handling */ + ddi_eventcookie_t n_rm_cookie; + ddi_callback_id_t n_ev_rm_cb_id; }; struct nvme_namespace { diff --git a/usr/src/uts/common/io/pciex/hotplug/pciehpc.c b/usr/src/uts/common/io/pciex/hotplug/pciehpc.c index 5ce219bd2f..3e4beda495 100644 --- a/usr/src/uts/common/io/pciex/hotplug/pciehpc.c +++ b/usr/src/uts/common/io/pciex/hotplug/pciehpc.c @@ -396,6 +396,21 @@ pciehpc_intr(dev_info_t *dip) control & ~PCIE_SLOTCTL_PWR_FAULT_EN); /* + * If supported, notify the child device driver that the + * device is being removed. 
+ */ + dev_info_t *cdip = ddi_get_child(dip); + if (cdip != NULL) { + ddi_eventcookie_t rm_cookie; + if (ddi_get_eventcookie(cdip, + DDI_DEVI_REMOVE_EVENT, + &rm_cookie) == DDI_SUCCESS) { + ndi_post_event(dip, cdip, rm_cookie, + NULL); + } + } + + /* * Ask DDI Hotplug framework to change state to Empty */ (void) ndi_hp_state_change_req(dip, diff --git a/usr/src/uts/common/io/pciex/pcie.c b/usr/src/uts/common/io/pciex/pcie.c index 4eff14d563..3be7aeac6e 100644 --- a/usr/src/uts/common/io/pciex/pcie.c +++ b/usr/src/uts/common/io/pciex/pcie.c @@ -786,6 +786,13 @@ pcie_init_pfd(dev_info_t *dip) PCIE_ZALLOC(pf_pcix_ecc_regs_t); } } + + PCIE_SLOT_REG(pfd_p) = PCIE_ZALLOC(pf_pcie_slot_regs_t); + PCIE_SLOT_REG(pfd_p)->pcie_slot_regs_valid = B_FALSE; + PCIE_SLOT_REG(pfd_p)->pcie_slot_cap = 0; + PCIE_SLOT_REG(pfd_p)->pcie_slot_control = 0; + PCIE_SLOT_REG(pfd_p)->pcie_slot_status = 0; + } else if (PCIE_IS_PCIX(bus_p)) { if (PCIE_IS_BDG(bus_p)) { PCIX_BDG_ERR_REG(pfd_p) = diff --git a/usr/src/uts/common/io/pciex/pcie_fault.c b/usr/src/uts/common/io/pciex/pcie_fault.c index 6a335db3e2..3f14041e80 100644 --- a/usr/src/uts/common/io/pciex/pcie_fault.c +++ b/usr/src/uts/common/io/pciex/pcie_fault.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #include <sys/sysmacros.h> @@ -200,7 +200,7 @@ pf_eh_exit(pcie_bus_t *bus_p) * for the root_pfd_p. * * "Root Complexes" such as NPE and PX should call scan_fabric using itself as - * the rdip. PCIe Root ports should call pf_scan_fabric using it's parent as + * the rdip. PCIe Root ports should call pf_scan_fabric using its parent as * the rdip. * * Scan fabric initiated from RCs are likely due to a fabric message, traps or @@ -587,6 +587,35 @@ pf_pcie_regs_gather(pf_data_t *pfd_p, pcie_bus_t *bus_p) PCIE_ROOTCTL); } + /* + * For eligible components, we gather Slot Register state. 
+ * + * Eligible components are: + * - a Downstream Port or a Root Port with the Slot Implemented + * capability bit set + * - hotplug capable + * + * Slot register state is useful, for instance, to determine whether the + * Slot's child device is physically present (via the Slot Status + * register). + */ + if ((PCIE_IS_SWD(bus_p) || PCIE_IS_ROOT(bus_p)) && + PCIE_IS_HOTPLUG_ENABLED(PCIE_BUS2DIP(bus_p))) { + pf_pcie_slot_regs_t *pcie_slot_regs = PCIE_SLOT_REG(pfd_p); + pcie_slot_regs->pcie_slot_cap = PCIE_CAP_GET(32, bus_p, + PCIE_SLOTCAP); + pcie_slot_regs->pcie_slot_control = PCIE_CAP_GET(16, bus_p, + PCIE_SLOTCTL); + pcie_slot_regs->pcie_slot_status = PCIE_CAP_GET(16, bus_p, + PCIE_SLOTSTS); + + if (pcie_slot_regs->pcie_slot_cap != PCI_EINVAL32 && + pcie_slot_regs->pcie_slot_control != PCI_EINVAL16 && + pcie_slot_regs->pcie_slot_status != PCI_EINVAL16) { + pcie_slot_regs->pcie_slot_regs_valid = B_TRUE; + } + } + if (!PCIE_HAS_AER(bus_p)) return; @@ -838,7 +867,7 @@ pf_pci_find_rp_fault(pf_data_t *pfd_p, pcie_bus_t *bus_p) * Check to see if an error has been received that * requires a scan of the fabric. Count the number of * faults seen. If MUL CE/FE_NFE that counts for - * atleast 2 faults, so just return with full_scan. + * at least 2 faults, so just return with full_scan. */ if ((root_err & PCIE_AER_RE_STS_MUL_CE_RCVD) || (root_err & PCIE_AER_RE_STS_MUL_FE_NFE_RCVD)) { @@ -1232,7 +1261,7 @@ const pf_fab_err_tbl_t pcie_rp_tbl[] = { {PCIE_AER_UCE_FCP, pf_panic, PF_AFFECTED_SELF | PF_AFFECTED_CHILDREN, 0}, - {PCIE_AER_UCE_TO, pf_panic, + {PCIE_AER_UCE_TO, pf_analyse_to, PF_AFFECTED_ADDR, PF_AFFECTED_CHILDREN}, {PCIE_AER_UCE_CA, pf_no_panic, @@ -1916,16 +1945,35 @@ pf_analyse_sc(ddi_fm_error_t *derr, uint32_t bit, pf_data_t *dq_head_p, /* * PCIe Timeout error analyser. This error can be forgiven if it is marked as * CE Advisory. If it is marked as advisory, this means the HW can recover - * and/or retry the transaction automatically. 
+ * and/or retry the transaction automatically. Additionally, if a device's + * parent slot reports that it is no longer physically present, we do not panic, + * as one would not expect a missing device to respond to a command. */ /* ARGSUSED */ static int pf_analyse_to(ddi_fm_error_t *derr, uint32_t bit, pf_data_t *dq_head_p, pf_data_t *pfd_p) { + dev_info_t *rpdip = PCIE_PFD2BUS(pfd_p)->bus_rp_dip; + pf_data_t *rppfd = PCIE_DIP2PFD(rpdip); + pf_pcie_slot_regs_t *p_pcie_slot_regs; + if (HAS_AER_LOGS(pfd_p, bit) && CE_ADVISORY(pfd_p)) return (PF_ERR_NO_PANIC); + p_pcie_slot_regs = PCIE_SLOT_REG(rppfd); + if (p_pcie_slot_regs->pcie_slot_regs_valid) { + /* + * If the device is reported gone from its parent slot, then it + * is expected that any outstanding commands would time out. In + * this case, do not panic. + */ + if ((p_pcie_slot_regs->pcie_slot_status & + PCIE_SLOTSTS_PRESENCE_DETECTED) == 0x0) { + return (PF_ERR_NO_PANIC); + } + } + return (PF_ERR_PANIC); } @@ -2970,6 +3018,24 @@ pf_send_ereport(ddi_fm_error_t *derr, pf_impl_t *impl) NULL); } + /* + * Slot Status registers + * + * Since we only gather these for certain types of components, + * only put these registers into the ereport if we have valid + * data. + */ + if (PCIE_SLOT_REG(pfd_p)->pcie_slot_regs_valid) { + fm_payload_set(ereport, + "pcie_slot_cap", DATA_TYPE_UINT32, + PCIE_SLOT_REG(pfd_p)->pcie_slot_cap, + "pcie_slot_control", DATA_TYPE_UINT16, + PCIE_SLOT_REG(pfd_p)->pcie_slot_control, + "pcie_slot_status", DATA_TYPE_UINT16, + PCIE_SLOT_REG(pfd_p)->pcie_slot_status, + NULL); + } + generic: /* IOV related information */ if (!PCIE_BDG_IS_UNASSIGNED(PCIE_PFD2BUS(impl->pf_dq_head_p))) { diff --git a/usr/src/uts/common/os/ddi_hp_impl.c b/usr/src/uts/common/os/ddi_hp_impl.c index 79165af9ff..38e575dbfd 100644 --- a/usr/src/uts/common/os/ddi_hp_impl.c +++ b/usr/src/uts/common/os/ddi_hp_impl.c @@ -21,12 +21,239 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. + * + * Copyright 2019 Joyent, Inc. */ /* * Sun DDI hotplug implementation specific functions */ +/* + * HOTPLUG FRAMEWORK + * + * The hotplug framework (also referred to as "SHP", for "Solaris Hotplug + * Framework") refers to a large set of userland and kernel interfaces, + * including those in this file, that provide functionality related to device + * hotplug. + * + * Hotplug is a broad term that refers to both removal and insertion of devices + * on a live system. Such operations can have varying levels of notification to + * the system. Coordinated hotplug means that the operating system is notified + * in advance that a device will have a hotplug operation performed on it. + * Non-coordinated hotplug, also called "surprise removal", does not have such + * notification, and the device is simply removed or inserted from the system. + * + * The goals of a correct hotplug operation will vary based on the device. In + * general, though, we want the system to gracefully notice the device change + * and clean up (or create) any relevant structures related to using the device + * in the system. + * + * The goals of the hotplug framework are to provide common interfaces for nexus + * drivers, device drivers, and userland programs to build a foundation for + * implementing hotplug for a variety of devices. Notably, common support for + * PCIe devices is available. See also: the nexus driver for PCIe devices at + * uts/i86pc/io/pciex/npe.c. + * + * + * TERMINOLOGY + * + * The following terms may be useful when exploring hotplug-related code. + * + * PHYSICAL HOTPLUG + * Refers to hotplug operations on a physical hardware receptacle. + * + * VIRTUAL HOTPLUG + * Refers to hotplug operations on an arbitrary device node in the device + * tree. + * + * CONNECTION (often abbreviated "cn") + * A place where either physical or virtual hotplug happens. 
This is a more + * generic term to refer to "connectors" and "ports", which represent + * physical and virtual places where hotplug happens, respectively. + * + * CONNECTOR + * A place where physical hotplug happens. For example: a PCIe slot, a USB + * port, a SAS port, and a fiber channel port are all connectors. + * + * PORT + * A place where virtual hotplug happens. A port refers to an arbitrary + * place under a nexus dev_info node in the device tree. + * + * + * CONNECTION STATE MACHINE + * + * Connections have the states below. Connectors and ports are grouped into + * the same state machine. It is worth noting that the edges here are incomplete + * -- it is possible for a connection to move straight from ENABLED to EMPTY, + * for instance, if there is a surprise removal of its device. + * + * State changes are kicked off through two ways: + * - Through the nexus driver interface, ndi_hp_state_change_req. PCIe + * nexus drivers that pass a hotplug interrupt through to pciehpc will kick + * off state changes in this way. + * - Through coordinated removal, ddihp_modctl. Both cfgadm(1M) and + * hotplug(1M) pass state change requests through hotplugd, which uses + * modctl to request state changes to the DDI hotplug framework. That + * interface is ultimately implemented by ddihp_modctl. + * + * (start) + * | + * v + * EMPTY no component plugged into connector + * ^ + * v + * PRESENT component plugged into connector + * ^ + * v + * POWERED connector is powered + * ^ + * v + * ENABLED connector is fully functional + * | + * . + * . + * . + * v + * (create port) + * | + * v + * PORT EMPTY port has no device occupying it + * ^ + * v + * PORT PRESENT port occupied by device + * + * + * ARCHITECTURE DIAGRAM + * + * The following is a non-exhaustive summary of various components in the system + * that implement pieces of the hotplug framework. More detailed descriptions + * of some key components are below. 
+ * + * +------------+ + * | cfgadm(1M) | + * +------------+ + * | + * +-------------------+ + * | SHP cfgadm plugin | + * +-------------------+ + * | + * +-------------+ +------------+ + * | hotplug(1M) |----------| libhotplug | + * +-------------+ +------------+ + * | + * +----------+ + * | hotplugd | + * +----------+ + * | + * +----------------+ + * | modctl (HP op) | + * +----------------+ + * | + * | + * User | + * =============================|=============================================== + * Kernel | + * | + * | + * +------------------------+ +----------------+ + * | DDI hotplug interfaces | --- | Device Drivers | + * +------------------------+ +----------------+ + * | | + * | +------------------------+ + * | | NDI hotplug interfaces | + * | +------------------------+ + * | | + * | | + * +-------------+ +--------------+ +---------------------------+ + * | `bus_hp_op` | -- |"pcie" module | --- | "npe" (PCIe nexus driver) | + * +-------------+ +--------------+ +---------------------------+ + * | | + * | +-------------------+ + * | | PCIe configurator | + * | +-------------------+ + * | + * +-------------------------------------+ + * | "pciehpc" (PCIe hotplug controller) | + * +-------------------------------------+ + * + * + * . + * . + * . + * . + * . + * | + * | + * +-----------------------------------+ + * | I/O Subsystem | + * | (LDI notifications and contracts) | + * +-----------------------------------+ + * + * + * KEY HOTPLUG SOFTWARE COMPONENTS + * + * CFGADM(1M) + * + * cfgadm is the canonical tool for hotplug operations. It can be used to + * list connections on the system and change their state in a coordinated + * fashion. For more information, see its manual page. + * + * + * HOTPLUG(1M) + * + * hotplug is a command line tool for managing hotplug connections for + * connectors. For more information, see its manual page. 
+ * + * + * DDI HOTPLUG INTERFACES + * + * This part of the framework provides interfaces for changing device state + * for connectors, including onlining and offlining child devices. Many of + * these functions are defined in this file. + * + * + * NDI HOTPLUG INTERFACES + * + * Nexus drivers can define their own hotplug bus implementations by + * defining a bus_hp_op entry point. This entry point must implement + * a set of hotplug related commands, including getting, probing, and + * changing connection state, as well as port creation and removal. + * + * Nexus drivers may also want to use the following interfaces for + * implementing hotplug. Note that the PCIe Hotplug Controller ("pciehpc") + * already takes care of using these: + * ndi_hp_{register,unregister} + * ndi_hp_state_change_req + * ndi_hp_walk_cn + * + * PCIe nexus drivers should use the common entry point pcie_hp_common_ops, + * which implements hotplug commands for PCIe devices, calling into other + * parts of the framework as needed. + * + * + * NPE DRIVER ("npe") + * + * npe is the common nexus driver for PCIe devices on x86. It implements + * hotplug using the NDI interfaces. For more information, see + * uts/i86pc/io/pciex/npe.c. + * + * The equivalent driver for SPARC is "px". + * + * + * PCIe HOTPLUG CONTROLLER DRIVER ("pciehpc") + * + * All hotplug-capable PCIe buses will initialize their own PCIe HPC, + * including the pcieb and ppb drivers. The controller maintains + * hotplug-related state about the slots on its bus, including their status + * and port state. It also features a common implementation of handling + * hotplug-related PCIe interrupts. + * + * For more information, see its interfaces in + * uts/common/sys/hotplug/pci/pciehpc.h. + * + */ + #include <sys/sysmacros.h> #include <sys/types.h> #include <sys/file.h> @@ -163,7 +390,9 @@ done: } /* - * Return the state of Hotplug Connection (CN) + * Fetch the state of Hotplug Connection (CN). 
+ * This function will also update the state and last changed timestamp in the + * connection handle structure if the state has changed. */ int ddihp_cn_getstate(ddi_hp_cn_handle_t *hdlp) @@ -597,7 +826,7 @@ ddihp_cn_pre_change_state(ddi_hp_cn_handle_t *hdlp, curr_state == DDI_HP_CN_STATE_ENABLED) { /* * If the Connection goes to a lower state from ENABLED, - * then offline all children under it. + * then offline all children under it. */ rv = ddihp_cn_change_children_state(hdlp, B_FALSE); if (rv != DDI_SUCCESS) { @@ -640,7 +869,7 @@ ddihp_cn_pre_change_state(ddi_hp_cn_handle_t *hdlp, } /* - * Jobs after change state of a Connector: update last change time, + * Jobs after change state of a Connector: update state, last change time, * probe, online, sysevent, etc. */ static int @@ -813,7 +1042,7 @@ ddihp_cn_change_children_state(ddi_hp_cn_handle_t *hdlp, boolean_t online) NDI_SUCCESS) { cmn_err(CE_WARN, "(%s%d):" - " failed to dettach driver for the device" + " failed to detach driver for the device" " (%s%d) in the Connection %s\n", ddi_driver_name(dip), ddi_get_instance(dip), ddi_driver_name(cdip), diff --git a/usr/src/uts/common/os/ddi_hp_ndi.c b/usr/src/uts/common/os/ddi_hp_ndi.c index a41a12fc74..73c62dc6b9 100644 --- a/usr/src/uts/common/os/ddi_hp_ndi.c +++ b/usr/src/uts/common/os/ddi_hp_ndi.c @@ -21,6 +21,8 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2019 Joyent, Inc. */ /* @@ -380,13 +382,19 @@ ddihp_cn_req_handler(ddi_hp_cn_handle_t *hdlp, ASSERT(DEVI_BUSY_OWNED(dip)); - if (ddihp_cn_getstate(hdlp) != DDI_SUCCESS) { - DDI_HP_NEXDBG((CE_CONT, "ddihp_cn_req_handler: dip %p, " - "hdlp %p ddi_cn_getstate failed\n", (void *)dip, - (void *)hdlp)); - - return (NDI_UNCLAIMED); - } + /* + * We do not want to fetch the state first, as calling ddihp_cn_getstate + * will update the cn_state member of the connection handle. 
The + * connector's hotplug operations rely on this value to know how + * target_state compares to the last known state of the device and make + * decisions about whether to clean up, post sysevents about the state + * change, and so on. + * + * Instead, just carry out the request to change the state. The + * connector's hotplug operations will update the state in the + * connection handle after they complete their necessary state change + * actions. + */ if (hdlp->cn_info.cn_state != target_state) { ddi_hp_cn_state_t result_state = 0; diff --git a/usr/src/uts/common/sys/ddi_hp.h b/usr/src/uts/common/sys/ddi_hp.h index eadb88ed49..b88762a9f5 100644 --- a/usr/src/uts/common/sys/ddi_hp.h +++ b/usr/src/uts/common/sys/ddi_hp.h @@ -21,6 +21,8 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2019 Joyent, Inc. */ #ifndef _SYS_DDI_HP_H @@ -28,6 +30,9 @@ /* * Sun DDI hotplug support definitions + * + * See the big theory statement in uts/common/os/ddi_hp_impl.c for more + * information. */ #ifdef __cplusplus @@ -73,7 +78,8 @@ typedef enum { /* * ddi_hp_cn_info_t * - * Hotplug Connection (CN) information structure + * Hotplug Connection (CN) information structure. + * A Connection is either a Connector or a Port. */ typedef struct ddi_hp_cn_info { char *cn_name; /* Name of the Connection */ diff --git a/usr/src/uts/common/sys/ddi_hp_impl.h b/usr/src/uts/common/sys/ddi_hp_impl.h index fb220119dd..b52df77cac 100644 --- a/usr/src/uts/common/sys/ddi_hp_impl.h +++ b/usr/src/uts/common/sys/ddi_hp_impl.h @@ -21,6 +21,12 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. + */ + +/* + * See the big theory statement in uts/common/os/ddi_hp_impl.c for more + * information about the structures and functions defined here. 
*/ #ifndef _SYS_DDI_HP_IMPL_H diff --git a/usr/src/uts/common/sys/pcie_impl.h b/usr/src/uts/common/sys/pcie_impl.h index d1d13625c2..442c55043c 100644 --- a/usr/src/uts/common/sys/pcie_impl.h +++ b/usr/src/uts/common/sys/pcie_impl.h @@ -166,6 +166,7 @@ extern "C" { #define PCIE_ADV_BDG_HDR(pfd_p, n) PCIE_ADV_BDG_REG(pfd_p)->pcie_sue_hdr[n] #define PCIE_ADV_RP_REG(pfd_p) \ PCIE_ADV_REG(pfd_p)->pcie_ext.pcie_adv_rp_regs +#define PCIE_SLOT_REG(pfd_p) pfd_p->pe_pcie_slot_regs #define PFD_AFFECTED_DEV(pfd_p) pfd_p->pe_affected_dev #define PFD_SET_AFFECTED_FLAG(pfd_p, aff_flag) \ PFD_AFFECTED_DEV(pfd_p)->pe_affected_flags = aff_flag @@ -262,6 +263,18 @@ typedef struct pf_pcie_err_regs { pf_pcie_adv_err_regs_t *pcie_adv_regs; /* pcie aer regs */ } pf_pcie_err_regs_t; +/* + * Slot register values for hotplug-capable Downstream Ports or Root Ports with + * the Slot Implemented capability bit set. We gather these to help determine + * whether the slot's child device is physically present. + */ +typedef struct pf_pcie_slot_regs { + boolean_t pcie_slot_regs_valid; /* true if register values are valid */ + uint32_t pcie_slot_cap; /* pcie slot capabilities register */ + uint16_t pcie_slot_control; /* pcie slot control register */ + uint16_t pcie_slot_status; /* pcie slot status register */ +} pf_pcie_slot_regs_t; + typedef enum { PF_INTR_TYPE_NONE = 0, PF_INTR_TYPE_FABRIC = 1, /* Fabric Message */ @@ -431,6 +444,7 @@ struct pf_data { pf_pcie_err_regs_t *pe_pcie_regs; /* PCIe error reg */ } pe_ext; pf_pcix_bdg_err_regs_t *pe_pcix_bdg_regs; /* PCI-X bridge regs */ + pf_pcie_slot_regs_t *pe_pcie_slot_regs; /* PCIe slot regs */ pf_data_t *pe_prev; /* Next error in queue */ pf_data_t *pe_next; /* Next error in queue */ boolean_t pe_rber_fatal; diff --git a/usr/src/uts/i86pc/io/pci/pci_common.h b/usr/src/uts/i86pc/io/pci/pci_common.h index 63fe4bb165..d5fa3bfd55 100644 --- a/usr/src/uts/i86pc/io/pci/pci_common.h +++ b/usr/src/uts/i86pc/io/pci/pci_common.h @@ -22,6 +22,8 @@ /* * 
Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2019 Joyent, Inc. */ #ifndef _PCI_PCI_COMMON_H @@ -33,7 +35,7 @@ extern "C" { /* * Common header file with definitions shared between - * pci(7d) and npe(7d) + * pci(7D) and npe(7D) */ /* State structure. */ @@ -45,12 +47,18 @@ typedef struct pci_state { kmutex_t pci_mutex; kmutex_t pci_peek_poke_mutex; kmutex_t pci_err_mutex; + + /* + * The following members are only used by npe(7D). + * See uts/i86pc/io/pciex/npe.c for more information. + */ + ndi_event_hdl_t pci_ndi_event_hdl; } pci_state_t; /* * These are the access routines. - * The pci_bus_map sets the handle to point to these in pci(7d). - * The npe_bus_map sets the handle to point to these in npe(7d). + * The pci_bus_map sets the handle to point to these in pci(7D). + * The npe_bus_map sets the handle to point to these in npe(7D). */ uint8_t pci_config_rd8(ddi_acc_impl_t *hdlp, uint8_t *addr); uint16_t pci_config_rd16(ddi_acc_impl_t *hdlp, uint16_t *addr); diff --git a/usr/src/uts/i86pc/io/pciex/npe.c b/usr/src/uts/i86pc/io/pciex/npe.c index 4ef393ddb0..fcb68164ee 100644 --- a/usr/src/uts/i86pc/io/pciex/npe.c +++ b/usr/src/uts/i86pc/io/pciex/npe.c @@ -26,11 +26,35 @@ /* * Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved. - * Copyright 2016 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ /* - * Host to PCI-Express local bus driver + * npe (Nexus PCIe driver): Host to PCI-Express local bus driver + * + * npe serves as the driver for PCIe Root Complexes and as the nexus driver + * for PCIe devices. See also: npe(7D). For more information about hotplug, + * see the big theory statement at uts/common/os/ddi_hp_impl.c. + * + * + * NDI EVENT HANDLING SUPPORT + * + * npe supports NDI event handling. The only available event is surprise + * removal of a device. 
Child drivers can register surprise removal event + * callbacks by requesting an event cookie using ddi_get_eventcookie for + * the DDI_DEVI_REMOVE_EVENT and add their callback using + * ddi_add_event_handler. For an example, see the nvme driver in + * uts/common/io/nvme/nvme.c. + * + * The NDI events in npe are retrieved using NDI_EVENT_NOPASS, which + * prevents them from being propagated up the tree once they reach the npe's + * bus_get_eventcookie operations. This is important because npe maintains + * the state of PCIe devices and their receptacles, via the PCIe hotplug + * controller driver (pciehpc). + * + * Hot removal events are ultimately posted by the PCIe hotplug controller + * interrupt handler for hotplug events. Events are posted using the + * ndi_post_event interface. */ #include <sys/conf.h> @@ -72,6 +96,15 @@ static int npe_intr_ops(dev_info_t *, dev_info_t *, ddi_intr_op_t, ddi_intr_handle_impl_t *, void *); static int npe_fm_init(dev_info_t *, dev_info_t *, int, ddi_iblock_cookie_t *); +static int npe_bus_get_eventcookie(dev_info_t *, dev_info_t *, char *, + ddi_eventcookie_t *); +static int npe_bus_add_eventcall(dev_info_t *, dev_info_t *, + ddi_eventcookie_t, void (*)(dev_info_t *, + ddi_eventcookie_t, void *, void *), + void *, ddi_callback_id_t *); +static int npe_bus_remove_eventcall(dev_info_t *, ddi_callback_id_t); +static int npe_bus_post_event(dev_info_t *, dev_info_t *, + ddi_eventcookie_t, void *); static int npe_fm_callback(dev_info_t *, ddi_fm_error_t *, const void *); @@ -102,10 +135,10 @@ struct bus_ops npe_bus_ops = { ddi_dma_mctl, npe_ctlops, ddi_bus_prop_op, - 0, /* (*bus_get_eventcookie)(); */ - 0, /* (*bus_add_eventcall)(); */ - 0, /* (*bus_remove_eventcall)(); */ - 0, /* (*bus_post_event)(); */ + npe_bus_get_eventcookie, + npe_bus_add_eventcall, + npe_bus_remove_eventcall, + npe_bus_post_event, 0, /* (*bus_intr_ctl)(); */ 0, /* (*bus_config)(); */ 0, /* (*bus_unconfig)(); */ @@ -271,12 +304,27 @@ npe_info(dev_info_t *dip, 
ddi_info_cmd_t cmd, void *arg, void **result) return (ret); } +/* + * See big theory statement at the top of this file for more information about + * surprise removal events. + */ +#define NPE_EVENT_TAG_HOT_REMOVAL 0 +static ndi_event_definition_t npe_ndi_event_defs[1] = { + {NPE_EVENT_TAG_HOT_REMOVAL, DDI_DEVI_REMOVE_EVENT, EPL_KERNEL, + NDI_EVENT_POST_TO_ALL} +}; + +static ndi_event_set_t npe_ndi_events = { + NDI_EVENTS_REV1, ARRAY_SIZE(npe_ndi_event_defs), npe_ndi_event_defs +}; + /*ARGSUSED*/ static int npe_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) { int instance = ddi_get_instance(devi); pci_state_t *pcip = NULL; + int ret; if (cmd == DDI_RESUME) { /* @@ -316,6 +364,22 @@ npe_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) if (pcie_init(devi, NULL) != DDI_SUCCESS) goto fail1; + ret = ndi_event_alloc_hdl(pcip->pci_dip, NULL, &pcip->pci_ndi_event_hdl, + NDI_SLEEP); + if (ret == NDI_SUCCESS) { + ret = ndi_event_bind_set(pcip->pci_ndi_event_hdl, + &npe_ndi_events, NDI_SLEEP); + if (ret != NDI_SUCCESS) { + dev_err(pcip->pci_dip, CE_WARN, "npe: failed to bind " + "NDI event set (error=%d)", ret); + goto fail1; + } + } else { + dev_err(pcip->pci_dip, CE_WARN, "npe: failed to allocate " + "event handle (error=%d)", ret); + goto fail1; + } + /* Second arg: initialize for pci_express root nexus */ if (pcitool_init(devi, B_TRUE) != DDI_SUCCESS) goto fail2; @@ -352,11 +416,36 @@ npe_detach(dev_info_t *devi, ddi_detach_cmd_t cmd) { int instance = ddi_get_instance(devi); pci_state_t *pcip; + int ret; pcip = ddi_get_soft_state(npe_statep, ddi_get_instance(devi)); switch (cmd) { case DDI_DETACH: + + /* + * Clean up event handling first, to ensure there are no + * outstanding callbacks registered. + */ + ret = ndi_event_unbind_set(pcip->pci_ndi_event_hdl, + &npe_ndi_events, NDI_SLEEP); + if (ret == NDI_SUCCESS) { + /* ndi_event_free_hdl always succeeds. 
*/ + (void) ndi_event_free_hdl(pcip->pci_ndi_event_hdl); + } else { + /* + * The event set will only fail to unbind if there are + * outstanding callbacks registered for it, which + * probably means a child driver still has one + * registered and thus was not cleaned up properly + * before npe's detach routine was called. Consequently, + * we should fail the detach here. + */ + dev_err(pcip->pci_dip, CE_WARN, "npe: failed to " + "unbind NDI event set (error=%d)", ret); + return (DDI_FAILURE); + } + pcie_fab_fini_bus(devi, PCIE_BUS_INITIAL); /* Uninitialize pcitool support. */ @@ -373,6 +462,7 @@ npe_detach(dev_info_t *devi, ddi_detach_cmd_t cmd) ddi_fm_fini(devi); ddi_soft_state_free(npe_statep, instance); + return (DDI_SUCCESS); case DDI_SUSPEND: @@ -414,7 +504,7 @@ static int npe_bus_map(dev_info_t *dip, dev_info_t *rdip, ddi_map_req_t *mp, off_t offset, off_t len, caddr_t *vaddrp) { - int rnumber; + int rnumber; int space; ddi_acc_impl_t *ap; ddi_acc_hdl_t *hp; @@ -1111,6 +1201,49 @@ npe_fm_init(dev_info_t *dip, dev_info_t *tdip, int cap, return (pcip->pci_fmcap); } +static int +npe_bus_get_eventcookie(dev_info_t *dip, dev_info_t *rdip, char *eventname, + ddi_eventcookie_t *cookiep) +{ + pci_state_t *pcip = ddi_get_soft_state(npe_statep, + ddi_get_instance(dip)); + + return (ndi_event_retrieve_cookie(pcip->pci_ndi_event_hdl, rdip, + eventname, cookiep, NDI_EVENT_NOPASS)); +} + +static int +npe_bus_add_eventcall(dev_info_t *dip, dev_info_t *rdip, + ddi_eventcookie_t cookie, void (*callback)(dev_info_t *dip, + ddi_eventcookie_t cookie, void *arg, void *bus_impldata), + void *arg, ddi_callback_id_t *cb_id) +{ + pci_state_t *pcip = ddi_get_soft_state(npe_statep, + ddi_get_instance(dip)); + + return (ndi_event_add_callback(pcip->pci_ndi_event_hdl, rdip, cookie, + callback, arg, NDI_SLEEP, cb_id)); +} + +static int +npe_bus_remove_eventcall(dev_info_t *dip, ddi_callback_id_t cb_id) +{ + pci_state_t *pcip = ddi_get_soft_state(npe_statep, + ddi_get_instance(dip)); + 
return (ndi_event_remove_callback(pcip->pci_ndi_event_hdl, cb_id)); +} + +static int +npe_bus_post_event(dev_info_t *dip, dev_info_t *rdip, + ddi_eventcookie_t cookie, void *impl_data) +{ + pci_state_t *pcip = ddi_get_soft_state(npe_statep, + ddi_get_instance(dip)); + return (ndi_event_do_callback(pcip->pci_ndi_event_hdl, rdip, cookie, + impl_data)); + +} + /*ARGSUSED*/ static int npe_fm_callback(dev_info_t *dip, ddi_fm_error_t *derr, const void *no_used) diff --git a/usr/src/uts/intel/io/hotplug/pcicfg/pcicfg.c b/usr/src/uts/intel/io/hotplug/pcicfg/pcicfg.c index b482117c7c..3f890d8f07 100644 --- a/usr/src/uts/intel/io/hotplug/pcicfg/pcicfg.c +++ b/usr/src/uts/intel/io/hotplug/pcicfg/pcicfg.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2019, Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ /* @@ -1251,7 +1251,7 @@ pcicfg_ntbridge_unconfigure_child(dev_info_t *new_device, uint_t devno) { dev_info_t *new_ntbridgechild; - int len, bus; + int len, bus; uint16_t vid; ddi_acc_handle_t config_handle; pci_bus_range_t pci_bus_range; @@ -1368,7 +1368,7 @@ pcicfg_is_ntbridge(dev_info_t *dip) static uint_t pcicfg_ntbridge_child(dev_info_t *dip) { - int len, val, rc = DDI_FAILURE; + int len, val, rc = DDI_FAILURE; dev_info_t *anode = dip; /* @@ -1398,7 +1398,7 @@ pcicfg_ntbridge_child(dev_info_t *dip) static uint_t pcicfg_get_ntbridge_child_range(dev_info_t *dip, uint64_t *boundbase, - uint64_t *boundlen, uint_t space_type) + uint64_t *boundlen, uint_t space_type) { int length, found = DDI_FAILURE, acount, i, ibridge; pci_regspec_t *assigned; @@ -1584,6 +1584,7 @@ static int pcicfg_teardown_device(dev_info_t *dip, pcicfg_flags_t flags, boolean_t is_pcie) { ddi_acc_handle_t handle; + int ret; /* * Free up resources associated with 'dip' @@ -1596,10 +1597,20 @@ pcicfg_teardown_device(dev_info_t *dip, pcicfg_flags_t flags, boolean_t is_pcie) /* * disable the device */ - if (pcicfg_config_setup(dip, 
&handle) != PCICFG_SUCCESS) + + ret = pcicfg_config_setup(dip, &handle); + if (ret == PCICFG_SUCCESS) { + pcicfg_device_off(handle); + pcicfg_config_teardown(&handle); + } else if (ret != PCICFG_NODEVICE) { + /* + * It is possible the device no longer exists -- for instance, + * if the device has been pulled from a hotpluggable slot on the + * system. In this case, do not fail the teardown, though there + * is less to clean up. + */ return (PCICFG_FAILURE); - pcicfg_device_off(handle); - pcicfg_config_teardown(&handle); + } if (is_pcie) { /* @@ -2401,8 +2412,7 @@ pcicfg_get_mem(pcicfg_phdl_t *entry, uint32_t length, uint64_t *ans) } static void -pcicfg_get_io(pcicfg_phdl_t *entry, - uint32_t length, uint32_t *ans) +pcicfg_get_io(pcicfg_phdl_t *entry, uint32_t length, uint32_t *ans) { uint32_t new_io; uint64_t io_last; @@ -3189,7 +3199,7 @@ pcicfg_device_off(ddi_acc_handle_t config_handle) */ static int pcicfg_set_standard_props(dev_info_t *dip, ddi_acc_handle_t config_handle, - uint8_t pcie_dev) + uint8_t pcie_dev) { int ret; uint16_t cap_id_loc, val; @@ -3361,7 +3371,7 @@ pcicfg_set_busnode_props(dev_info_t *dip, uint8_t pcie_device_type) static int pcicfg_set_childnode_props(dev_info_t *dip, ddi_acc_handle_t config_handle, - uint8_t pcie_dev) + uint8_t pcie_dev) { int ret; @@ -3521,8 +3531,8 @@ pcicfg_set_childnode_props(dev_info_t *dip, ddi_acc_handle_t config_handle, * Program the bus numbers into the bridge */ static void -pcicfg_set_bus_numbers(ddi_acc_handle_t config_handle, -uint_t primary, uint_t secondary, uint_t subordinate) +pcicfg_set_bus_numbers(ddi_acc_handle_t config_handle, uint_t primary, + uint_t secondary, uint_t subordinate) { DEBUG3("Setting bridge bus-range %d,%d,%d\n", primary, secondary, subordinate); @@ -3547,8 +3557,7 @@ uint_t primary, uint_t secondary, uint_t subordinate) * Put bridge registers into initial state */ static void -pcicfg_setup_bridge(pcicfg_phdl_t *entry, - ddi_acc_handle_t handle) +pcicfg_setup_bridge(pcicfg_phdl_t 
*entry, ddi_acc_handle_t handle) { /* * The highest bus seen during probing is the max-subordinate bus @@ -3607,8 +3616,7 @@ pcicfg_setup_bridge(pcicfg_phdl_t *entry, } static void -pcicfg_update_bridge(pcicfg_phdl_t *entry, - ddi_acc_handle_t handle) +pcicfg_update_bridge(pcicfg_phdl_t *entry, ddi_acc_handle_t handle) { uint_t length; @@ -3853,11 +3861,10 @@ failedconfig: * Sizing the BARs and update "reg" property */ static int -pcicfg_populate_reg_props(dev_info_t *new_child, - ddi_acc_handle_t config_handle) +pcicfg_populate_reg_props(dev_info_t *new_child, ddi_acc_handle_t config_handle) { int i; - uint32_t request; + uint32_t request; i = PCI_CONF_BASE0; @@ -5079,7 +5086,7 @@ pcicfg_config_teardown(ddi_acc_handle_t *handle) static int pcicfg_add_config_reg(dev_info_t *dip, - uint_t bus, uint_t device, uint_t func) + uint_t bus, uint_t device, uint_t func) { int reg[10] = { PCI_ADDR_CONFIG, 0, 0, 0, 0}; @@ -5104,8 +5111,8 @@ pcicfg_ari_configure(dev_info_t *dip) #ifdef DEBUG static void -debug(char *fmt, uintptr_t a1, uintptr_t a2, uintptr_t a3, - uintptr_t a4, uintptr_t a5) +debug(char *fmt, uintptr_t a1, uintptr_t a2, uintptr_t a3, uintptr_t a4, + uintptr_t a5) { if (pcicfg_debug > 1) { prom_printf("pcicfg: "); |
