diff options
| author | Jordan Paige Hendricks <jordan.hendricks@joyent.com> | 2019-03-25 17:22:13 +0000 |
|---|---|---|
| committer | Robert Mustacchi <rm@fingolfin.org> | 2021-04-05 17:17:41 -0700 |
| commit | ffb6483089015eb90be1f5e7fc2a96c9929546a6 (patch) | |
| tree | bf7931c50a83cba1557b932f66c2c2d1b6d89141 | |
| parent | 8054a0e4c809d98ffb44f17b9a8b932ca2c24b2c (diff) | |
| download | illumos-joyent-ffb6483089015eb90be1f5e7fc2a96c9929546a6.tar.gz | |
11698 Want NVMe Hotplug Support
11699 x86 pci configurator should not fail device teardown if device is gone
11700 DDI hotplug request handler resets connection handle state before performing state change operations
11701 ldi_handle dcmd segfaults occasionally
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Rob Johnston <rob.johnston@joyent.com>
Reviewed by: Paul Winder <paul@winder.uk.net>
Approved by: Dan McDonald <danmcd@joyent.com>
| -rw-r--r-- | usr/src/cmd/fm/modules/common/fabric-xlate/fabric-xlate.h | 31 | ||||
| -rw-r--r-- | usr/src/cmd/fm/modules/common/fabric-xlate/fx_fabric.c | 55 | ||||
| -rw-r--r-- | usr/src/cmd/fm/modules/common/fabric-xlate/fx_subr.c | 20 | ||||
| -rw-r--r-- | usr/src/uts/common/io/nvme/nvme.c | 110 | ||||
| -rw-r--r-- | usr/src/uts/common/io/nvme/nvme_var.h | 22 | ||||
| -rw-r--r-- | usr/src/uts/common/io/pciex/hotplug/pciehpc.c | 15 | ||||
| -rw-r--r-- | usr/src/uts/common/io/pciex/pcie.c | 7 | ||||
| -rw-r--r-- | usr/src/uts/common/io/pciex/pcie_fault.c | 76 | ||||
| -rw-r--r-- | usr/src/uts/common/os/ddi_hp_impl.c | 237 | ||||
| -rw-r--r-- | usr/src/uts/common/os/ddi_hp_ndi.c | 22 | ||||
| -rw-r--r-- | usr/src/uts/common/sys/ddi_hp.h | 8 | ||||
| -rw-r--r-- | usr/src/uts/common/sys/ddi_hp_impl.h | 6 | ||||
| -rw-r--r-- | usr/src/uts/common/sys/pcie_impl.h | 14 | ||||
| -rw-r--r-- | usr/src/uts/i86pc/io/pci/pci_common.h | 14 | ||||
| -rw-r--r-- | usr/src/uts/i86pc/io/pciex/npe.c | 147 | ||||
| -rw-r--r-- | usr/src/uts/intel/io/hotplug/pcicfg/pcicfg.c | 53 |
16 files changed, 768 insertions, 69 deletions
diff --git a/usr/src/cmd/fm/modules/common/fabric-xlate/fabric-xlate.h b/usr/src/cmd/fm/modules/common/fabric-xlate/fabric-xlate.h index f33ea9ecd6..96e1a956af 100644 --- a/usr/src/cmd/fm/modules/common/fabric-xlate/fabric-xlate.h +++ b/usr/src/cmd/fm/modules/common/fabric-xlate/fabric-xlate.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #ifndef _FABRIC_XLATE_H @@ -31,6 +32,7 @@ #include <sys/types.h> #include <sys/pcie.h> #include <sys/fm/io/pci.h> +#include <limits.h> #ifdef __cplusplus extern "C" { @@ -45,6 +47,17 @@ extern "C" { #define PF_ADDR_PIO (1 << 1) #define PF_ADDR_CFG (1 << 2) + +/* + * The fabric ereport preparation functions (fab_prep_*) in fab_erpt_tbl_t + * structures may return an error if the ereport could not be set up properly. + * Typically, these errors are errnos. It is possible that based on incoming + * ereport payload data, we might not want to generate an ereport at all: In + * this case, the preparation functions may instead return PF_EREPORT_IGNORE, + * which is set at a high value so as not to collide with the errnos. + */ +#define PF_EREPORT_IGNORE INT_MAX + extern fmd_xprt_t *fab_fmd_xprt; /* FMD transport layer handle */ extern char fab_buf[]; @@ -121,8 +134,21 @@ typedef struct fab_data { uint16_t pcie_rp_ctl; /* root complex control register */ uint32_t pcie_rp_err_status; /* pcie root complex error status reg */ uint32_t pcie_rp_err_cmd; /* pcie root complex error cmd reg */ - uint16_t pcie_rp_ce_src_id; /* pcie root complex ce sourpe id */ - uint16_t pcie_rp_ue_src_id; /* pcie root complex ue sourpe id */ + uint16_t pcie_rp_ce_src_id; /* pcie root complex ce source id */ + uint16_t pcie_rp_ue_src_id; /* pcie root complex ue source id */ + + /* + * The slot register values refer to the registers of the component's + * parent slot, not the component itself. 
+ * + * You should only use the register values -- i.e., + * pcie_slot_{cap,control,status} -- if pcie_slot_data_valid is set to + * true. + */ + boolean_t pcie_slot_data_valid; /* true if slot data is valid */ + uint32_t pcie_slot_cap; /* pcie slot capabilities */ + uint16_t pcie_slot_control; /* pcie slot control */ + uint16_t pcie_slot_status; /* pcie slot status */ /* Flags */ boolean_t pcie_rp_send_all; /* need to send ereports on all rps */ @@ -131,7 +157,6 @@ typedef struct fab_data { typedef struct fab_erpt_tbl { const char *err_class; /* Final Ereport Class */ uint32_t reg_bit; /* Error Bit Mask */ - /* Pointer to function that prepares the ereport body */ const char *tgt_class; /* Target Ereport Class */ } fab_erpt_tbl_t; diff --git a/usr/src/cmd/fm/modules/common/fabric-xlate/fx_fabric.c b/usr/src/cmd/fm/modules/common/fabric-xlate/fx_fabric.c index 69ecf1aa8d..14ae738863 100644 --- a/usr/src/cmd/fm/modules/common/fabric-xlate/fx_fabric.c +++ b/usr/src/cmd/fm/modules/common/fabric-xlate/fx_fabric.c @@ -22,10 +22,13 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2019 Joyent, Inc. */ #include <stddef.h> #include <strings.h> #include <sys/fm/util.h> +#include <sys/pcie.h> #include "fabric-xlate.h" @@ -271,6 +274,24 @@ fab_pci_fabric_to_data(fmd_hdl_t *hdl, nvlist_t *nvl, fab_data_t *data) FAB_LOOKUP(32, "pcie_adv_rp_command", &data->pcie_rp_err_cmd); FAB_LOOKUP(16, "pcie_adv_rp_ce_src_id", &data->pcie_rp_ce_src_id); FAB_LOOKUP(16, "pcie_adv_rp_ue_src_id", &data->pcie_rp_ue_src_id); + + /* + * PCIe Parent Slot Registers + * + * These are only passed in the ereport if the parent PCIe component + * supports the registers and the registers have valid data. As such, we + * look up one slot register value first: If that value is present in + * the input ereport data, then we know the others should be there as + * well. 
We also set the pcie_slot_data_valid flag to ensure we know + * the slot register data is safe to use in the module. + */ + data->pcie_slot_data_valid = B_FALSE; + if (nvlist_lookup_uint32(nvl, "pcie_slot_cap", &data->pcie_slot_cap) == + 0) { + FAB_LOOKUP(16, "pcie_slot_control", &data->pcie_slot_control); + FAB_LOOKUP(16, "pcie_slot_status", &data->pcie_slot_status); + data->pcie_slot_data_valid = B_TRUE; + } } static int @@ -358,6 +379,38 @@ fab_prep_pcie_ue_erpt(fmd_hdl_t *hdl, fab_data_t *data, nvlist_t *erpt, PCIE_AER_CTL_FST_ERR_PTR_MASK); int err = fab_prep_basic_erpt(hdl, data->nvl, erpt, B_FALSE); + if (data->pcie_slot_data_valid) { + (void) nvlist_add_uint32(erpt, "pcie_slot_cap", + data->pcie_slot_cap); + (void) nvlist_add_uint16(erpt, "pcie_slot_control", + data->pcie_slot_control); + (void) nvlist_add_uint16(erpt, "pcie_slot_status", + data->pcie_slot_status); + + /* + * It is possible to see uncorrectable errors for a slot that + * are related to the slot's child device being physically + * removed from the slot. As such, in the case that the slot + * reports that it is empty, we do not want to generate an + * ereport for all errors. Generating an ereport here will cause + * the eft module to fault the device and io-retire to + * subsequently retire the device. Retiring the device makes + * little sense given that the device is physically gone; more + * confusingly, if plugged back into the slot, it would be + * marked retired already. + * + * The only error ignored for this case is Completion Timeout. + * It is possible more errors should be ignored, and if they + * are seen in the field it might be worth broadening the set + * of ignored errors. + */ + if (tbl->reg_bit == PCIE_AER_UCE_TO && + ((data->pcie_slot_status & + PCIE_SLOTSTS_PRESENCE_DETECTED) == 0x0)) { + return (PF_EREPORT_IGNORE); + } + } + /* Generate an ereport for this error bit. 
*/ (void) snprintf(fab_buf, FM_MAX_CLASS, "ereport.io.%s.%s", PCIEX_ERROR_SUBCLASS, class); @@ -776,7 +829,7 @@ fab_xlate_pcie_erpts(fmd_hdl_t *hdl, fab_data_t *data) fmd_hdl_debug(hdl, "Sending Ereports Now"); - /* Go through the error logs and send the relavant reports */ + /* Go through the error logs and send the relevant reports */ for (tbl = fab_master_err_tbl; tbl->erpt_tbl; tbl++) { fab_send_erpt(hdl, data, tbl); } diff --git a/usr/src/cmd/fm/modules/common/fabric-xlate/fx_subr.c b/usr/src/cmd/fm/modules/common/fabric-xlate/fx_subr.c index 8593144b28..94678dbd47 100644 --- a/usr/src/cmd/fm/modules/common/fabric-xlate/fx_subr.c +++ b/usr/src/cmd/fm/modules/common/fabric-xlate/fx_subr.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #include <strings.h> #include <fm/topo_hc.h> @@ -185,6 +186,7 @@ fab_send_erpt(fmd_hdl_t *hdl, fab_data_t *data, fab_err_tbl_t *tbl) fab_erpt_tbl_t *erpt_tbl, *entry; nvlist_t *erpt; uint32_t reg; + int err; erpt_tbl = tbl->erpt_tbl; if (tbl->reg_size == 16) { @@ -200,7 +202,9 @@ fab_send_erpt(fmd_hdl_t *hdl, fab_data_t *data, fab_err_tbl_t *tbl) if (nvlist_alloc(&erpt, NV_UNIQUE_NAME, 0) != 0) goto done; - if (tbl->fab_prep(hdl, data, erpt, entry) != 0) { + + err = tbl->fab_prep(hdl, data, erpt, entry); + if (err != 0 && err != PF_EREPORT_IGNORE) { fmd_hdl_debug(hdl, "Prepping ereport failed: " "class = %s\n", entry->err_class); nvlist_free(erpt); @@ -394,7 +398,7 @@ fab_find_rppath_by_devbdf(fmd_hdl_t *hdl, nvlist_t *nvl, pcie_req_id_t bdf) xmlXPathObjectPtr xpathObj; xmlNodeSetPtr nodes; xmlNodePtr devNode; - char *retval, *temp; + char *retval, *temp; char query[500]; int i, size, bus, dev, fn; char *hcpath; @@ -577,7 +581,7 @@ fail: char * fab_find_bdf(fmd_hdl_t *hdl, nvlist_t *nvl, pcie_req_id_t bdf) { - char *retval; + char *retval; char query[500]; int bus, dev, fn; char rcpath[255]; @@ -705,7 +709,7 @@ found: propgroup: /* Retrive the "dev" 
propval and return */ for (devNode = devNode->children; devNode; devNode = devNode->next) { - char *tprop; + char *tprop; tprop = GET_PROP(devNode, "name"); if (STRCMP(devNode->name, "propval") && @@ -866,8 +870,8 @@ fab_pr(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl) char * fab_get_rpdev(fmd_hdl_t *hdl) { - char *retval; - char query[500]; + char *retval; + char query[500]; (void) snprintf(query, sizeof (query), "//propval[" "@name='extended-capabilities' and contains(@value, '%s')]" @@ -888,8 +892,8 @@ fab_send_erpt_all_rps(fmd_hdl_t *hdl, nvlist_t *erpt) { xmlXPathObjectPtr xpathObj; xmlNodeSetPtr nodes; - char *rppath, *hbpath; - char query[600]; + char *rppath, *hbpath; + char query[600]; nvlist_t *detector, *nvl; uint_t i, size; size_t len; diff --git a/usr/src/uts/common/io/nvme/nvme.c b/usr/src/uts/common/io/nvme/nvme.c index e37d0598c3..8215adaed6 100644 --- a/usr/src/uts/common/io/nvme/nvme.c +++ b/usr/src/uts/common/io/nvme/nvme.c @@ -59,7 +59,7 @@ * but they share some driver state: the command array (holding pointers to * commands currently being processed by the hardware) and the active command * counter. Access to a submission queue and the shared state is protected by - * nq_mutex, completion queue is protected by ncq_mutex. + * nq_mutex; completion queue is protected by ncq_mutex. * * When a command is submitted to a queue pair the active command counter is * incremented and a pointer to the command is stored in the command array. The @@ -202,6 +202,23 @@ * device. * * + * NVMe Hotplug: + * + * The driver supports hot removal. The driver uses the NDI event framework + * to register a callback, nvme_remove_callback, to clean up when a disk is + * removed. In particular, the driver will unqueue outstanding I/O commands and + * set n_dead on the softstate to true so that other operations, such as ioctls + * and command submissions, fail as well. 
+ * + * While the callback registration relies on the NDI event framework, the + * removal event itself is kicked off in the PCIe hotplug framework, when the + * PCIe bridge driver ("pcieb") gets a hotplug interrupt indicating that a + * device was removed from the slot. + * + * The NVMe driver instance itself will remain until the final close of the + * device. + * + * * DDI UFM Support * * The driver supports the DDI UFM framework for reporting information about @@ -1066,6 +1083,10 @@ nvme_submit_admin_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd) static int nvme_submit_io_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd) { + if (cmd->nc_nvme->n_dead) { + return (EIO); + } + if (sema_tryp(&qp->nq_sema) == 0) return (EAGAIN); @@ -1082,6 +1103,22 @@ nvme_submit_cmd_common(nvme_qpair_t *qp, nvme_cmd_t *cmd) cmd->nc_completed = B_FALSE; /* + * Now that we hold the queue pair lock, we must check whether or not + * the controller has been listed as dead (e.g. was removed due to + * hotplug). This is necessary as otherwise we could race with + * nvme_remove_callback(). Because this has not been enqueued, we don't + * call nvme_unqueue_cmd(), which is why we must manually release the + * semaphore. + */ + if (cmd->nc_nvme->n_dead) { + taskq_dispatch_ent(qp->nq_cq->ncq_cmd_taskq, cmd->nc_callback, + cmd, TQ_NOSLEEP, &cmd->nc_tqent); + sema_v(&qp->nq_sema); + mutex_exit(&qp->nq_mutex); + return; + } + + /* * Try to insert the cmd into the active cmd array at the nq_next_cmd * slot. If the slot is already occupied advance to the next slot and * try again. This can happen for long running commands like async event @@ -3267,6 +3304,47 @@ nvme_fm_errcb(dev_info_t *dip, ddi_fm_error_t *fm_error, const void *arg) return (fm_error->fme_status); } +static void +nvme_remove_callback(dev_info_t *dip, ddi_eventcookie_t cookie, void *a, + void *b) +{ + nvme_t *nvme = a; + + nvme->n_dead = B_TRUE; + + /* + * Fail all outstanding commands, including those in the admin queue + * (queue 0). 
+ */ + for (uint_t i = 0; i < nvme->n_ioq_count + 1; i++) { + nvme_qpair_t *qp = nvme->n_ioq[i]; + + mutex_enter(&qp->nq_mutex); + for (size_t j = 0; j < qp->nq_nentry; j++) { + nvme_cmd_t *cmd = qp->nq_cmd[j]; + nvme_cmd_t *u_cmd; + + if (cmd == NULL) { + continue; + } + + /* + * Since we have the queue lock held the entire time we + * iterate over it, it's not possible for the queue to + * change underneath us. Thus, we don't need to check + * that the return value of nvme_unqueue_cmd matches the + * requested cmd to unqueue. + */ + u_cmd = nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid); + taskq_dispatch_ent(qp->nq_cq->ncq_cmd_taskq, + cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent); + + ASSERT3P(u_cmd, ==, cmd); + } + mutex_exit(&qp->nq_mutex); + } +} + static int nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { @@ -3290,6 +3368,17 @@ nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) ddi_set_driver_private(dip, nvme); nvme->n_dip = dip; + /* Set up event handlers for hot removal. */ + if (ddi_get_eventcookie(nvme->n_dip, DDI_DEVI_REMOVE_EVENT, + &nvme->n_rm_cookie) != DDI_SUCCESS) { + goto fail; + } + if (ddi_add_event_handler(nvme->n_dip, nvme->n_rm_cookie, + nvme_remove_callback, nvme, &nvme->n_ev_rm_cb_id) != + DDI_SUCCESS) { + goto fail; + } + mutex_init(&nvme->n_minor.nm_mutex, NULL, MUTEX_DRIVER, NULL); nvme->n_strict_version = ddi_prop_get_int(DDI_DEV_T_ANY, dip, @@ -3603,6 +3692,12 @@ nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) if (nvme->n_product != NULL) strfree(nvme->n_product); + /* Clean up hot removal event handler. 
*/ + if (nvme->n_ev_rm_cb_id != NULL) { + (void) ddi_remove_event_handler(nvme->n_ev_rm_cb_id); + } + nvme->n_ev_rm_cb_id = NULL; + ddi_soft_state_free(nvme_state, instance); return (DDI_SUCCESS); @@ -3891,6 +3986,11 @@ static int nvme_bd_mediainfo(void *arg, bd_media_t *media) { nvme_namespace_t *ns = arg; + nvme_t *nvme = ns->ns_nvme; + + if (nvme->n_dead) { + return (EIO); + } media->m_nblks = ns->ns_block_count; media->m_blksize = ns->ns_block_size; @@ -3911,8 +4011,9 @@ nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc) boolean_t poll; int ret; - if (nvme->n_dead) + if (nvme->n_dead) { return (EIO); + } cmd = nvme_create_nvm_cmd(ns, opc, xfer); if (cmd == NULL) @@ -3993,6 +4094,11 @@ static int nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid) { nvme_namespace_t *ns = arg; + nvme_t *nvme = ns->ns_nvme; + + if (nvme->n_dead) { + return (EIO); + } /*LINTED: E_BAD_PTR_CAST_ALIGN*/ if (*(uint64_t *)ns->ns_eui64 != 0) { diff --git a/usr/src/uts/common/io/nvme/nvme_var.h b/usr/src/uts/common/io/nvme/nvme_var.h index fb8f4ba771..ea378b8be4 100644 --- a/usr/src/uts/common/io/nvme/nvme_var.h +++ b/usr/src/uts/common/io/nvme/nvme_var.h @@ -114,20 +114,23 @@ struct nvme_cq { struct nvme_qpair { size_t nq_nentry; + /* submission fields */ nvme_dma_t *nq_sqdma; nvme_sqe_t *nq_sq; uint_t nq_sqhead; uint_t nq_sqtail; uintptr_t nq_sqtdbl; + /* completion */ nvme_cq_t *nq_cq; - nvme_cmd_t **nq_cmd; - uint16_t nq_next_cmd; - uint_t nq_active_cmds; + /* shared structures for completion and submission */ + nvme_cmd_t **nq_cmd; /* active command array */ + uint16_t nq_next_cmd; /* next potential empty queue slot */ + uint_t nq_active_cmds; /* number of active cmds */ - kmutex_t nq_mutex; - ksema_t nq_sema; + kmutex_t nq_mutex; /* protects shared state */ + ksema_t nq_sema; /* semaphore to ensure q always has >= 1 empty slot */ }; struct nvme { @@ -188,7 +191,12 @@ struct nvme { nvme_identify_ctrl_t *n_idctl; + /* Pointer to the admin queue, which is 
always queue 0 in n_ioq. */ nvme_qpair_t *n_adminq; + /* + * All command queues, including the admin queue. + * Its length is: n_ioq_count + 1. + */ nvme_qpair_t **n_ioq; nvme_cq_t **n_cq; @@ -244,6 +252,10 @@ struct nvme { uint32_t n_vendor_event; uint32_t n_unknown_event; + /* hot removal NDI event handling */ + ddi_eventcookie_t n_rm_cookie; + ddi_callback_id_t n_ev_rm_cb_id; + /* DDI UFM handle */ ddi_ufm_handle_t *n_ufmh; /* Cached Firmware Slot Information log page */ diff --git a/usr/src/uts/common/io/pciex/hotplug/pciehpc.c b/usr/src/uts/common/io/pciex/hotplug/pciehpc.c index 5ce219bd2f..3e4beda495 100644 --- a/usr/src/uts/common/io/pciex/hotplug/pciehpc.c +++ b/usr/src/uts/common/io/pciex/hotplug/pciehpc.c @@ -396,6 +396,21 @@ pciehpc_intr(dev_info_t *dip) control & ~PCIE_SLOTCTL_PWR_FAULT_EN); /* + * If supported, notify the child device driver that the + * device is being removed. + */ + dev_info_t *cdip = ddi_get_child(dip); + if (cdip != NULL) { + ddi_eventcookie_t rm_cookie; + if (ddi_get_eventcookie(cdip, + DDI_DEVI_REMOVE_EVENT, + &rm_cookie) == DDI_SUCCESS) { + ndi_post_event(dip, cdip, rm_cookie, + NULL); + } + } + + /* * Ask DDI Hotplug framework to change state to Empty */ (void) ndi_hp_state_change_req(dip, diff --git a/usr/src/uts/common/io/pciex/pcie.c b/usr/src/uts/common/io/pciex/pcie.c index 22f191943c..35a0190be7 100644 --- a/usr/src/uts/common/io/pciex/pcie.c +++ b/usr/src/uts/common/io/pciex/pcie.c @@ -845,6 +845,13 @@ pcie_init_pfd(dev_info_t *dip) PCIE_ZALLOC(pf_pcix_ecc_regs_t); } } + + PCIE_SLOT_REG(pfd_p) = PCIE_ZALLOC(pf_pcie_slot_regs_t); + PCIE_SLOT_REG(pfd_p)->pcie_slot_regs_valid = B_FALSE; + PCIE_SLOT_REG(pfd_p)->pcie_slot_cap = 0; + PCIE_SLOT_REG(pfd_p)->pcie_slot_control = 0; + PCIE_SLOT_REG(pfd_p)->pcie_slot_status = 0; + } else if (PCIE_IS_PCIX(bus_p)) { if (PCIE_IS_BDG(bus_p)) { PCIX_BDG_ERR_REG(pfd_p) = diff --git a/usr/src/uts/common/io/pciex/pcie_fault.c b/usr/src/uts/common/io/pciex/pcie_fault.c index 
90563c1d1a..de558a9fc6 100644 --- a/usr/src/uts/common/io/pciex/pcie_fault.c +++ b/usr/src/uts/common/io/pciex/pcie_fault.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #include <sys/sysmacros.h> @@ -200,7 +200,7 @@ pf_eh_exit(pcie_bus_t *bus_p) * for the root_pfd_p. * * "Root Complexes" such as NPE and PX should call scan_fabric using itself as - * the rdip. PCIe Root ports should call pf_scan_fabric using it's parent as + * the rdip. PCIe Root ports should call pf_scan_fabric using its parent as * the rdip. * * Scan fabric initiated from RCs are likely due to a fabric message, traps or @@ -587,6 +587,35 @@ pf_pcie_regs_gather(pf_data_t *pfd_p, pcie_bus_t *bus_p) PCIE_ROOTCTL); } + /* + * For eligible components, we gather Slot Register state. + * + * Eligible components are: + * - a Downstream Port or a Root Port with the Slot Implemented + * capability bit set + * - hotplug capable + * + * Slot register state is useful, for instance, to determine whether the + * Slot's child device is physically present (via the Slot Status + * register). 
+ */ + if ((PCIE_IS_SWD(bus_p) || PCIE_IS_ROOT(bus_p)) && + PCIE_IS_HOTPLUG_ENABLED(PCIE_BUS2DIP(bus_p))) { + pf_pcie_slot_regs_t *pcie_slot_regs = PCIE_SLOT_REG(pfd_p); + pcie_slot_regs->pcie_slot_cap = PCIE_CAP_GET(32, bus_p, + PCIE_SLOTCAP); + pcie_slot_regs->pcie_slot_control = PCIE_CAP_GET(16, bus_p, + PCIE_SLOTCTL); + pcie_slot_regs->pcie_slot_status = PCIE_CAP_GET(16, bus_p, + PCIE_SLOTSTS); + + if (pcie_slot_regs->pcie_slot_cap != PCI_EINVAL32 && + pcie_slot_regs->pcie_slot_control != PCI_EINVAL16 && + pcie_slot_regs->pcie_slot_status != PCI_EINVAL16) { + pcie_slot_regs->pcie_slot_regs_valid = B_TRUE; + } + } + if (!PCIE_HAS_AER(bus_p)) return; @@ -838,7 +867,7 @@ pf_pci_find_rp_fault(pf_data_t *pfd_p, pcie_bus_t *bus_p) * Check to see if an error has been received that * requires a scan of the fabric. Count the number of * faults seen. If MUL CE/FE_NFE that counts for - * atleast 2 faults, so just return with full_scan. + * at least 2 faults, so just return with full_scan. */ if ((root_err & PCIE_AER_RE_STS_MUL_CE_RCVD) || (root_err & PCIE_AER_RE_STS_MUL_FE_NFE_RCVD)) { @@ -1232,7 +1261,7 @@ const pf_fab_err_tbl_t pcie_rp_tbl[] = { {PCIE_AER_UCE_FCP, pf_panic, PF_AFFECTED_SELF | PF_AFFECTED_CHILDREN, 0}, - {PCIE_AER_UCE_TO, pf_panic, + {PCIE_AER_UCE_TO, pf_analyse_to, PF_AFFECTED_ADDR, PF_AFFECTED_CHILDREN}, {PCIE_AER_UCE_CA, pf_no_panic, @@ -1916,16 +1945,35 @@ pf_analyse_sc(ddi_fm_error_t *derr, uint32_t bit, pf_data_t *dq_head_p, /* * PCIe Timeout error analyser. This error can be forgiven if it is marked as * CE Advisory. If it is marked as advisory, this means the HW can recover - * and/or retry the transaction automatically. + * and/or retry the transaction automatically. Additionally, if a device's + * parent slot reports that it is no longer physically present, we do not panic, + * as one would not expect a missing device to respond to a command. 
*/ /* ARGSUSED */ static int pf_analyse_to(ddi_fm_error_t *derr, uint32_t bit, pf_data_t *dq_head_p, pf_data_t *pfd_p) { + dev_info_t *rpdip = PCIE_PFD2BUS(pfd_p)->bus_rp_dip; + pf_data_t *rppfd = PCIE_DIP2PFD(rpdip); + pf_pcie_slot_regs_t *p_pcie_slot_regs; + if (HAS_AER_LOGS(pfd_p, bit) && CE_ADVISORY(pfd_p)) return (PF_ERR_NO_PANIC); + p_pcie_slot_regs = PCIE_SLOT_REG(rppfd); + if (p_pcie_slot_regs->pcie_slot_regs_valid) { + /* + * If the device is reported gone from its parent slot, then it + * is expected that any outstanding commands would time out. In + * this case, do not panic. + */ + if ((p_pcie_slot_regs->pcie_slot_status & + PCIE_SLOTSTS_PRESENCE_DETECTED) == 0x0) { + return (PF_ERR_NO_PANIC); + } + } + return (PF_ERR_PANIC); } @@ -2970,6 +3018,24 @@ pf_send_ereport(ddi_fm_error_t *derr, pf_impl_t *impl) NULL); } + /* + * Slot Status registers + * + * Since we only gather these for certain types of components, + * only put these registers into the ereport if we have valid + * data. + */ + if (PCIE_SLOT_REG(pfd_p)->pcie_slot_regs_valid) { + fm_payload_set(ereport, + "pcie_slot_cap", DATA_TYPE_UINT32, + PCIE_SLOT_REG(pfd_p)->pcie_slot_cap, + "pcie_slot_control", DATA_TYPE_UINT16, + PCIE_SLOT_REG(pfd_p)->pcie_slot_control, + "pcie_slot_status", DATA_TYPE_UINT16, + PCIE_SLOT_REG(pfd_p)->pcie_slot_status, + NULL); + } + generic: /* IOV related information */ if (!PCIE_BDG_IS_UNASSIGNED(PCIE_PFD2BUS(impl->pf_dq_head_p))) { diff --git a/usr/src/uts/common/os/ddi_hp_impl.c b/usr/src/uts/common/os/ddi_hp_impl.c index 79165af9ff..38e575dbfd 100644 --- a/usr/src/uts/common/os/ddi_hp_impl.c +++ b/usr/src/uts/common/os/ddi_hp_impl.c @@ -21,12 +21,239 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2019 Joyent, Inc. 
*/ /* * Sun DDI hotplug implementation specific functions */ +/* + * HOTPLUG FRAMEWORK + * + * The hotplug framework (also referred to as "SHP", for "Solaris Hotplug + * Framework") refers to a large set of userland and kernel interfaces, + * including those in this file, that provide functionality related to device + * hotplug. + * + * Hotplug is a broad term that refers to both removal and insertion of devices + * on a live system. Such operations can have varying levels of notification to + * the system. Coordinated hotplug means that the operating system is notified + * in advance that a device will have a hotplug operation performed on it. + * Non-coordinated hotplug, also called "surprise removal", does not have such + * notification, and the device is simply removed or inserted from the system. + * + * The goals of a correct hotplug operation will vary based on the device. In + * general, though, we want the system to gracefully notice the device change + * and clean up (or create) any relevant structures related to using the device + * in the system. + * + * The goals of the hotplug framework are to provide common interfaces for nexus + * drivers, device drivers, and userland programs to build a foundation for + * implementing hotplug for a variety of devices. Notably, common support for + * PCIe devices is available. See also: the nexus driver for PCIe devices at + * uts/i86pc/io/pciex/npe.c. + * + * + * TERMINOLOGY + * + * The following terms may be useful when exploring hotplug-related code. + * + * PHYSICAL HOTPLUG + * Refers to hotplug operations on a physical hardware receptacle. + * + * VIRTUAL HOTPLUG + * Refers to hotplug operations on an arbitrary device node in the device + * tree. + * + * CONNECTION (often abbreviated "cn") + * A place where either physical or virtual hotplug happens. This is a more + * generic term to refer to "connectors" and "ports", which represent + * physical and virtual places where hotplug happens, respectively. 
+ * + * CONNECTOR + * A place where physical hotplug happens. For example: a PCIe slot, a USB + * port, a SAS port, and a fiber channel port are all connectors. + * + * PORT + * A place where virtual hotplug happens. A port refers to an arbitrary + * place under a nexus dev_info node in the device tree. + * + * + * CONNECTION STATE MACHINE + * + * Connections have the states below. Connectors and ports are grouped into + * the same state machine. It is worth noting that the edges here are incomplete + * -- it is possible for a connection to move straight from ENABLED to EMPTY, + * for instance, if there is a surprise removal of its device. + * + * State changes are kicked off through two ways: + * - Through the nexus driver interface, ndi_hp_state_change_req. PCIe + * nexus drivers that pass a hotplug interrupt through to pciehpc will kick + * off state changes in this way. + * - Through coordinated removal, ddihp_modctl. Both cfgadm(1M) and + * hotplug(1M) pass state change requests through hotplugd, which uses + * modctl to request state changes to the DDI hotplug framework. That + * interface is ultimately implemented by ddihp_modctl. + * + * (start) + * | + * v + * EMPTY no component plugged into connector + * ^ + * v + * PRESENT component plugged into connector + * ^ + * v + * POWERED connector is powered + * ^ + * v + * ENABLED connector is fully functional + * | + * . + * . + * . + * v + * (create port) + * | + * v + * PORT EMPTY port has no device occupying it + * ^ + * v + * PORT PRESENT port occupied by device + * + * + * ARCHITECTURE DIAGRAM + * + * The following is a non-exhaustive summary of various components in the system + * that implement pieces of the hotplug framework. More detailed descriptions + * of some key components are below. 
+ * + * +------------+ + * | cfgadm(1M) | + * +------------+ + * | + * +-------------------+ + * | SHP cfgadm plugin | + * +-------------------+ + * | + * +-------------+ +------------+ + * | hotplug(1M) |----------| libhotplug | + * +-------------+ +------------+ + * | + * +----------+ + * | hotplugd | + * +----------+ + * | + * +----------------+ + * | modctl (HP op) | + * +----------------+ + * | + * | + * User | + * =============================|=============================================== + * Kernel | + * | + * | + * +------------------------+ +----------------+ + * | DDI hotplug interfaces | --- | Device Drivers | + * +------------------------+ +----------------+ + * | | + * | +------------------------+ + * | | NDI hotplug interfaces | + * | +------------------------+ + * | | + * | | + * +-------------+ +--------------+ +---------------------------+ + * | `bus_hp_op` | -- |"pcie" module | --- | "npe" (PCIe nexus driver) | + * +-------------+ +--------------+ +---------------------------+ + * | | + * | +-------------------+ + * | | PCIe configurator | + * | +-------------------+ + * | + * +-------------------------------------+ + * | "pciehpc" (PCIe hotplug controller) | + * +-------------------------------------+ + * + * + * . + * . + * . + * . + * . + * | + * | + * +-----------------------------------+ + * | I/O Subsystem | + * | (LDI notifications and contracts) | + * +-----------------------------------+ + * + * + * KEY HOTPLUG SOFTWARE COMPONENTS + * + * CFGADM(1M) + * + * cfgadm is the canonical tool for hotplug operations. It can be used to + * list connections on the system and change their state in a coordinated + * fashion. For more information, see its manual page. + * + * + * HOTPLUG(1M) + * + * hotplug is a command line tool for managing hotplug connections for + * connectors. For more information, see its manual page. 
+ * + * + * DDI HOTPLUG INTERFACES + * + * This part of the framework provides interfaces for changing device state + * for connectors, including onlining and offlining child devices. Many of + * these functions are defined in this file. + * + * + * NDI HOTPLUG INTERFACES + * + * Nexus drivers can define their own hotplug bus implementations by + * defining a bus_hp_op entry point. This entry point must implement + * a set of hotplug related commands, including getting, probing, and + * changing connection state, as well as port creation and removal. + * + * Nexus drivers may also want to use the following interfaces for + * implementing hotplug. Note that the PCIe Hotplug Controller ("pciehpc") + * already takes care of using these: + * ndi_hp_{register,unregister} + * ndi_hp_state_change_req + * ndi_hp_walk_cn + * + * PCIe nexus drivers should use the common entry point pcie_hp_common_ops, + * which implements hotplug commands for PCIe devices, calling into other + * parts of the framework as needed. + * + * + * NPE DRIVER ("npe") + * + * npe is the common nexus driver for PCIe devices on x86. It implements + * hotplug using the NDI interfaces. For more information, see + * uts/i86pc/io/pciex/npe.c. + * + * The equivalent driver for SPARC is "px". + * + * + * PCIe HOTPLUG CONTROLLER DRIVER ("pciehpc") + * + * All hotplug-capable PCIe buses will initialize their own PCIe HPC, + * including the pcieb and ppb drivers. The controller maintains + * hotplug-related state about the slots on its bus, including their status + * and port state. It also features a common implementation of handling + * hotplug-related PCIe interrupts. + * + * For more information, see its interfaces in + * uts/common/sys/hotplug/pci/pciehpc.h. + * + */ + #include <sys/sysmacros.h> #include <sys/types.h> #include <sys/file.h> @@ -163,7 +390,9 @@ done: } /* - * Return the state of Hotplug Connection (CN) + * Fetch the state of Hotplug Connection (CN). 
+ * This function will also update the state and last changed timestamp in the + * connection handle structure if the state has changed. */ int ddihp_cn_getstate(ddi_hp_cn_handle_t *hdlp) @@ -597,7 +826,7 @@ ddihp_cn_pre_change_state(ddi_hp_cn_handle_t *hdlp, curr_state == DDI_HP_CN_STATE_ENABLED) { /* * If the Connection goes to a lower state from ENABLED, - * then offline all children under it. + * then offline all children under it. */ rv = ddihp_cn_change_children_state(hdlp, B_FALSE); if (rv != DDI_SUCCESS) { @@ -640,7 +869,7 @@ ddihp_cn_pre_change_state(ddi_hp_cn_handle_t *hdlp, } /* - * Jobs after change state of a Connector: update last change time, + * Jobs after change state of a Connector: update state, last change time, * probe, online, sysevent, etc. */ static int @@ -813,7 +1042,7 @@ ddihp_cn_change_children_state(ddi_hp_cn_handle_t *hdlp, boolean_t online) NDI_SUCCESS) { cmn_err(CE_WARN, "(%s%d):" - " failed to dettach driver for the device" + " failed to detach driver for the device" " (%s%d) in the Connection %s\n", ddi_driver_name(dip), ddi_get_instance(dip), ddi_driver_name(cdip), diff --git a/usr/src/uts/common/os/ddi_hp_ndi.c b/usr/src/uts/common/os/ddi_hp_ndi.c index a41a12fc74..73c62dc6b9 100644 --- a/usr/src/uts/common/os/ddi_hp_ndi.c +++ b/usr/src/uts/common/os/ddi_hp_ndi.c @@ -21,6 +21,8 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2019 Joyent, Inc. */ /* @@ -380,13 +382,19 @@ ddihp_cn_req_handler(ddi_hp_cn_handle_t *hdlp, ASSERT(DEVI_BUSY_OWNED(dip)); - if (ddihp_cn_getstate(hdlp) != DDI_SUCCESS) { - DDI_HP_NEXDBG((CE_CONT, "ddihp_cn_req_handler: dip %p, " - "hdlp %p ddi_cn_getstate failed\n", (void *)dip, - (void *)hdlp)); - - return (NDI_UNCLAIMED); - } + /* + * We do not want to fetch the state first, as calling ddihp_cn_getstate + * will update the cn_state member of the connection handle. 
The + * connector's hotplug operations rely on this value to know how + * target_state compares to the last known state of the device and make + * decisions about whether to clean up, post sysevents about the state + * change, and so on. + * + * Instead, just carry out the request to change the state. The + * connector's hotplug operations will update the state in the + * connection handle after they complete their necessary state change + * actions. + */ if (hdlp->cn_info.cn_state != target_state) { ddi_hp_cn_state_t result_state = 0; diff --git a/usr/src/uts/common/sys/ddi_hp.h b/usr/src/uts/common/sys/ddi_hp.h index eadb88ed49..b88762a9f5 100644 --- a/usr/src/uts/common/sys/ddi_hp.h +++ b/usr/src/uts/common/sys/ddi_hp.h @@ -21,6 +21,8 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2019 Joyent, Inc. */ #ifndef _SYS_DDI_HP_H @@ -28,6 +30,9 @@ /* * Sun DDI hotplug support definitions + * + * See the big theory statement in uts/common/os/ddi_hp_impl.c for more + * information. */ #ifdef __cplusplus @@ -73,7 +78,8 @@ typedef enum { /* * ddi_hp_cn_info_t * - * Hotplug Connection (CN) information structure + * Hotplug Connection (CN) information structure. + * A Connection is either a Connector or a Port. */ typedef struct ddi_hp_cn_info { char *cn_name; /* Name of the Connection */ diff --git a/usr/src/uts/common/sys/ddi_hp_impl.h b/usr/src/uts/common/sys/ddi_hp_impl.h index fb220119dd..b52df77cac 100644 --- a/usr/src/uts/common/sys/ddi_hp_impl.h +++ b/usr/src/uts/common/sys/ddi_hp_impl.h @@ -21,6 +21,12 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. + */ + +/* + * See the big theory statement in uts/common/os/ddi_hp_impl.c for more + * information about the structures and functions defined here. 
*/ #ifndef _SYS_DDI_HP_IMPL_H diff --git a/usr/src/uts/common/sys/pcie_impl.h b/usr/src/uts/common/sys/pcie_impl.h index d1d13625c2..442c55043c 100644 --- a/usr/src/uts/common/sys/pcie_impl.h +++ b/usr/src/uts/common/sys/pcie_impl.h @@ -166,6 +166,7 @@ extern "C" { #define PCIE_ADV_BDG_HDR(pfd_p, n) PCIE_ADV_BDG_REG(pfd_p)->pcie_sue_hdr[n] #define PCIE_ADV_RP_REG(pfd_p) \ PCIE_ADV_REG(pfd_p)->pcie_ext.pcie_adv_rp_regs +#define PCIE_SLOT_REG(pfd_p) pfd_p->pe_pcie_slot_regs #define PFD_AFFECTED_DEV(pfd_p) pfd_p->pe_affected_dev #define PFD_SET_AFFECTED_FLAG(pfd_p, aff_flag) \ PFD_AFFECTED_DEV(pfd_p)->pe_affected_flags = aff_flag @@ -262,6 +263,18 @@ typedef struct pf_pcie_err_regs { pf_pcie_adv_err_regs_t *pcie_adv_regs; /* pcie aer regs */ } pf_pcie_err_regs_t; +/* + * Slot register values for hotplug-capable Downstream Ports or Root Ports with + * the Slot Implemented capability bit set. We gather these to help determine + * whether the slot's child device is physically present. + */ +typedef struct pf_pcie_slot_regs { + boolean_t pcie_slot_regs_valid; /* true if register values are valid */ + uint32_t pcie_slot_cap; /* pcie slot capabilities register */ + uint16_t pcie_slot_control; /* pcie slot control register */ + uint16_t pcie_slot_status; /* pcie slot status register */ +} pf_pcie_slot_regs_t; + typedef enum { PF_INTR_TYPE_NONE = 0, PF_INTR_TYPE_FABRIC = 1, /* Fabric Message */ @@ -431,6 +444,7 @@ struct pf_data { pf_pcie_err_regs_t *pe_pcie_regs; /* PCIe error reg */ } pe_ext; pf_pcix_bdg_err_regs_t *pe_pcix_bdg_regs; /* PCI-X bridge regs */ + pf_pcie_slot_regs_t *pe_pcie_slot_regs; /* PCIe slot regs */ pf_data_t *pe_prev; /* Next error in queue */ pf_data_t *pe_next; /* Next error in queue */ boolean_t pe_rber_fatal; diff --git a/usr/src/uts/i86pc/io/pci/pci_common.h b/usr/src/uts/i86pc/io/pci/pci_common.h index 63fe4bb165..d5fa3bfd55 100644 --- a/usr/src/uts/i86pc/io/pci/pci_common.h +++ b/usr/src/uts/i86pc/io/pci/pci_common.h @@ -22,6 +22,8 @@ /* * 
Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2019 Joyent, Inc. */ #ifndef _PCI_PCI_COMMON_H @@ -33,7 +35,7 @@ extern "C" { /* * Common header file with definitions shared between - * pci(7d) and npe(7d) + * pci(7D) and npe(7D) */ /* State structure. */ @@ -45,12 +47,18 @@ typedef struct pci_state { kmutex_t pci_mutex; kmutex_t pci_peek_poke_mutex; kmutex_t pci_err_mutex; + + /* + * The following members are only used by npe(7D). + * See uts/i86pc/io/pciex/npe.c for more information. + */ + ndi_event_hdl_t pci_ndi_event_hdl; } pci_state_t; /* * These are the access routines. - * The pci_bus_map sets the handle to point to these in pci(7d). - * The npe_bus_map sets the handle to point to these in npe(7d). + * The pci_bus_map sets the handle to point to these in pci(7D). + * The npe_bus_map sets the handle to point to these in npe(7D). */ uint8_t pci_config_rd8(ddi_acc_impl_t *hdlp, uint8_t *addr); uint16_t pci_config_rd16(ddi_acc_impl_t *hdlp, uint16_t *addr); diff --git a/usr/src/uts/i86pc/io/pciex/npe.c b/usr/src/uts/i86pc/io/pciex/npe.c index 4ef393ddb0..fcb68164ee 100644 --- a/usr/src/uts/i86pc/io/pciex/npe.c +++ b/usr/src/uts/i86pc/io/pciex/npe.c @@ -26,11 +26,35 @@ /* * Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved. - * Copyright 2016 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ /* - * Host to PCI-Express local bus driver + * npe (Nexus PCIe driver): Host to PCI-Express local bus driver + * + * npe serves as the driver for PCIe Root Complexes and as the nexus driver + * for PCIe devices. See also: npe(7D). For more information about hotplug, + * see the big theory statement at uts/common/os/ddi_hp_impl.c. + * + * + * NDI EVENT HANDLING SUPPORT + * + * npe supports NDI event handling. The only available event is surprise + * removal of a device. 
Child drivers can register surprise removal event + * callbacks by requesting an event cookie using ddi_get_eventcookie for + * the DDI_DEVI_REMOVE_EVENT and add their callback using + * ddi_add_event_handler. For an example, see the nvme driver in + * uts/common/io/nvme/nvme.c. + * + * The NDI events in npe are retrieved using NDI_EVENT_NOPASS, which + * prevents them from being propagated up the tree once they reach the npe's + * bus_get_eventcookie operations. This is important because npe maintains + * the state of PCIe devices and their receptacles, via the PCIe hotplug + * controller driver (pciehpc). + * + * Hot removal events are ultimately posted by the PCIe hotplug controller + * interrupt handler for hotplug events. Events are posted using the + * ndi_post_event interface. */ #include <sys/conf.h> @@ -72,6 +96,15 @@ static int npe_intr_ops(dev_info_t *, dev_info_t *, ddi_intr_op_t, ddi_intr_handle_impl_t *, void *); static int npe_fm_init(dev_info_t *, dev_info_t *, int, ddi_iblock_cookie_t *); +static int npe_bus_get_eventcookie(dev_info_t *, dev_info_t *, char *, + ddi_eventcookie_t *); +static int npe_bus_add_eventcall(dev_info_t *, dev_info_t *, + ddi_eventcookie_t, void (*)(dev_info_t *, + ddi_eventcookie_t, void *, void *), + void *, ddi_callback_id_t *); +static int npe_bus_remove_eventcall(dev_info_t *, ddi_callback_id_t); +static int npe_bus_post_event(dev_info_t *, dev_info_t *, + ddi_eventcookie_t, void *); static int npe_fm_callback(dev_info_t *, ddi_fm_error_t *, const void *); @@ -102,10 +135,10 @@ struct bus_ops npe_bus_ops = { ddi_dma_mctl, npe_ctlops, ddi_bus_prop_op, - 0, /* (*bus_get_eventcookie)(); */ - 0, /* (*bus_add_eventcall)(); */ - 0, /* (*bus_remove_eventcall)(); */ - 0, /* (*bus_post_event)(); */ + npe_bus_get_eventcookie, + npe_bus_add_eventcall, + npe_bus_remove_eventcall, + npe_bus_post_event, 0, /* (*bus_intr_ctl)(); */ 0, /* (*bus_config)(); */ 0, /* (*bus_unconfig)(); */ @@ -271,12 +304,27 @@ npe_info(dev_info_t *dip, 
ddi_info_cmd_t cmd, void *arg, void **result) return (ret); } +/* + * See big theory statement at the top of this file for more information about + * surprise removal events. + */ +#define NPE_EVENT_TAG_HOT_REMOVAL 0 +static ndi_event_definition_t npe_ndi_event_defs[1] = { + {NPE_EVENT_TAG_HOT_REMOVAL, DDI_DEVI_REMOVE_EVENT, EPL_KERNEL, + NDI_EVENT_POST_TO_ALL} +}; + +static ndi_event_set_t npe_ndi_events = { + NDI_EVENTS_REV1, ARRAY_SIZE(npe_ndi_event_defs), npe_ndi_event_defs +}; + /*ARGSUSED*/ static int npe_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) { int instance = ddi_get_instance(devi); pci_state_t *pcip = NULL; + int ret; if (cmd == DDI_RESUME) { /* @@ -316,6 +364,22 @@ npe_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) if (pcie_init(devi, NULL) != DDI_SUCCESS) goto fail1; + ret = ndi_event_alloc_hdl(pcip->pci_dip, NULL, &pcip->pci_ndi_event_hdl, + NDI_SLEEP); + if (ret == NDI_SUCCESS) { + ret = ndi_event_bind_set(pcip->pci_ndi_event_hdl, + &npe_ndi_events, NDI_SLEEP); + if (ret != NDI_SUCCESS) { + dev_err(pcip->pci_dip, CE_WARN, "npe: failed to bind " + "NDI event set (error=%d)", ret); + goto fail1; + } + } else { + dev_err(pcip->pci_dip, CE_WARN, "npe: failed to allocate " + "event handle (error=%d)", ret); + goto fail1; + } + /* Second arg: initialize for pci_express root nexus */ if (pcitool_init(devi, B_TRUE) != DDI_SUCCESS) goto fail2; @@ -352,11 +416,36 @@ npe_detach(dev_info_t *devi, ddi_detach_cmd_t cmd) { int instance = ddi_get_instance(devi); pci_state_t *pcip; + int ret; pcip = ddi_get_soft_state(npe_statep, ddi_get_instance(devi)); switch (cmd) { case DDI_DETACH: + + /* + * Clean up event handling first, to ensure there are no + * outstanding callbacks registered. + */ + ret = ndi_event_unbind_set(pcip->pci_ndi_event_hdl, + &npe_ndi_events, NDI_SLEEP); + if (ret == NDI_SUCCESS) { + /* ndi_event_free_hdl always succeeds. 
*/ + (void) ndi_event_free_hdl(pcip->pci_ndi_event_hdl); + } else { + /* + * The event set will only fail to unbind if there are + * outstanding callbacks registered for it, which + * probably means a child driver still has one + * registered and thus was not cleaned up properly + * before npe's detach routine was called. Consequently, + * we should fail the detach here. + */ + dev_err(pcip->pci_dip, CE_WARN, "npe: failed to " + "unbind NDI event set (error=%d)", ret); + return (DDI_FAILURE); + } + pcie_fab_fini_bus(devi, PCIE_BUS_INITIAL); /* Uninitialize pcitool support. */ @@ -373,6 +462,7 @@ npe_detach(dev_info_t *devi, ddi_detach_cmd_t cmd) ddi_fm_fini(devi); ddi_soft_state_free(npe_statep, instance); + return (DDI_SUCCESS); case DDI_SUSPEND: @@ -414,7 +504,7 @@ static int npe_bus_map(dev_info_t *dip, dev_info_t *rdip, ddi_map_req_t *mp, off_t offset, off_t len, caddr_t *vaddrp) { - int rnumber; + int rnumber; int space; ddi_acc_impl_t *ap; ddi_acc_hdl_t *hp; @@ -1111,6 +1201,49 @@ npe_fm_init(dev_info_t *dip, dev_info_t *tdip, int cap, return (pcip->pci_fmcap); } +static int +npe_bus_get_eventcookie(dev_info_t *dip, dev_info_t *rdip, char *eventname, + ddi_eventcookie_t *cookiep) +{ + pci_state_t *pcip = ddi_get_soft_state(npe_statep, + ddi_get_instance(dip)); + + return (ndi_event_retrieve_cookie(pcip->pci_ndi_event_hdl, rdip, + eventname, cookiep, NDI_EVENT_NOPASS)); +} + +static int +npe_bus_add_eventcall(dev_info_t *dip, dev_info_t *rdip, + ddi_eventcookie_t cookie, void (*callback)(dev_info_t *dip, + ddi_eventcookie_t cookie, void *arg, void *bus_impldata), + void *arg, ddi_callback_id_t *cb_id) +{ + pci_state_t *pcip = ddi_get_soft_state(npe_statep, + ddi_get_instance(dip)); + + return (ndi_event_add_callback(pcip->pci_ndi_event_hdl, rdip, cookie, + callback, arg, NDI_SLEEP, cb_id)); +} + +static int +npe_bus_remove_eventcall(dev_info_t *dip, ddi_callback_id_t cb_id) +{ + pci_state_t *pcip = ddi_get_soft_state(npe_statep, + ddi_get_instance(dip)); + 
return (ndi_event_remove_callback(pcip->pci_ndi_event_hdl, cb_id)); +} + +static int +npe_bus_post_event(dev_info_t *dip, dev_info_t *rdip, + ddi_eventcookie_t cookie, void *impl_data) +{ + pci_state_t *pcip = ddi_get_soft_state(npe_statep, + ddi_get_instance(dip)); + return (ndi_event_do_callback(pcip->pci_ndi_event_hdl, rdip, cookie, + impl_data)); + +} + /*ARGSUSED*/ static int npe_fm_callback(dev_info_t *dip, ddi_fm_error_t *derr, const void *no_used) diff --git a/usr/src/uts/intel/io/hotplug/pcicfg/pcicfg.c b/usr/src/uts/intel/io/hotplug/pcicfg/pcicfg.c index b482117c7c..3f890d8f07 100644 --- a/usr/src/uts/intel/io/hotplug/pcicfg/pcicfg.c +++ b/usr/src/uts/intel/io/hotplug/pcicfg/pcicfg.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2019, Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ /* @@ -1251,7 +1251,7 @@ pcicfg_ntbridge_unconfigure_child(dev_info_t *new_device, uint_t devno) { dev_info_t *new_ntbridgechild; - int len, bus; + int len, bus; uint16_t vid; ddi_acc_handle_t config_handle; pci_bus_range_t pci_bus_range; @@ -1368,7 +1368,7 @@ pcicfg_is_ntbridge(dev_info_t *dip) static uint_t pcicfg_ntbridge_child(dev_info_t *dip) { - int len, val, rc = DDI_FAILURE; + int len, val, rc = DDI_FAILURE; dev_info_t *anode = dip; /* @@ -1398,7 +1398,7 @@ pcicfg_ntbridge_child(dev_info_t *dip) static uint_t pcicfg_get_ntbridge_child_range(dev_info_t *dip, uint64_t *boundbase, - uint64_t *boundlen, uint_t space_type) + uint64_t *boundlen, uint_t space_type) { int length, found = DDI_FAILURE, acount, i, ibridge; pci_regspec_t *assigned; @@ -1584,6 +1584,7 @@ static int pcicfg_teardown_device(dev_info_t *dip, pcicfg_flags_t flags, boolean_t is_pcie) { ddi_acc_handle_t handle; + int ret; /* * Free up resources associated with 'dip' @@ -1596,10 +1597,20 @@ pcicfg_teardown_device(dev_info_t *dip, pcicfg_flags_t flags, boolean_t is_pcie) /* * disable the device */ - if (pcicfg_config_setup(dip, 
&handle) != PCICFG_SUCCESS) + + ret = pcicfg_config_setup(dip, &handle); + if (ret == PCICFG_SUCCESS) { + pcicfg_device_off(handle); + pcicfg_config_teardown(&handle); + } else if (ret != PCICFG_NODEVICE) { + /* + * It is possible the device no longer exists -- for instance, + * if the device has been pulled from a hotpluggable slot on the + * system. In this case, do not fail the teardown, though there + * is less to clean up. + */ return (PCICFG_FAILURE); - pcicfg_device_off(handle); - pcicfg_config_teardown(&handle); + } if (is_pcie) { /* @@ -2401,8 +2412,7 @@ pcicfg_get_mem(pcicfg_phdl_t *entry, uint32_t length, uint64_t *ans) } static void -pcicfg_get_io(pcicfg_phdl_t *entry, - uint32_t length, uint32_t *ans) +pcicfg_get_io(pcicfg_phdl_t *entry, uint32_t length, uint32_t *ans) { uint32_t new_io; uint64_t io_last; @@ -3189,7 +3199,7 @@ pcicfg_device_off(ddi_acc_handle_t config_handle) */ static int pcicfg_set_standard_props(dev_info_t *dip, ddi_acc_handle_t config_handle, - uint8_t pcie_dev) + uint8_t pcie_dev) { int ret; uint16_t cap_id_loc, val; @@ -3361,7 +3371,7 @@ pcicfg_set_busnode_props(dev_info_t *dip, uint8_t pcie_device_type) static int pcicfg_set_childnode_props(dev_info_t *dip, ddi_acc_handle_t config_handle, - uint8_t pcie_dev) + uint8_t pcie_dev) { int ret; @@ -3521,8 +3531,8 @@ pcicfg_set_childnode_props(dev_info_t *dip, ddi_acc_handle_t config_handle, * Program the bus numbers into the bridge */ static void -pcicfg_set_bus_numbers(ddi_acc_handle_t config_handle, -uint_t primary, uint_t secondary, uint_t subordinate) +pcicfg_set_bus_numbers(ddi_acc_handle_t config_handle, uint_t primary, + uint_t secondary, uint_t subordinate) { DEBUG3("Setting bridge bus-range %d,%d,%d\n", primary, secondary, subordinate); @@ -3547,8 +3557,7 @@ uint_t primary, uint_t secondary, uint_t subordinate) * Put bridge registers into initial state */ static void -pcicfg_setup_bridge(pcicfg_phdl_t *entry, - ddi_acc_handle_t handle) +pcicfg_setup_bridge(pcicfg_phdl_t 
*entry, ddi_acc_handle_t handle) { /* * The highest bus seen during probing is the max-subordinate bus @@ -3607,8 +3616,7 @@ pcicfg_setup_bridge(pcicfg_phdl_t *entry, } static void -pcicfg_update_bridge(pcicfg_phdl_t *entry, - ddi_acc_handle_t handle) +pcicfg_update_bridge(pcicfg_phdl_t *entry, ddi_acc_handle_t handle) { uint_t length; @@ -3853,11 +3861,10 @@ failedconfig: * Sizing the BARs and update "reg" property */ static int -pcicfg_populate_reg_props(dev_info_t *new_child, - ddi_acc_handle_t config_handle) +pcicfg_populate_reg_props(dev_info_t *new_child, ddi_acc_handle_t config_handle) { int i; - uint32_t request; + uint32_t request; i = PCI_CONF_BASE0; @@ -5079,7 +5086,7 @@ pcicfg_config_teardown(ddi_acc_handle_t *handle) static int pcicfg_add_config_reg(dev_info_t *dip, - uint_t bus, uint_t device, uint_t func) + uint_t bus, uint_t device, uint_t func) { int reg[10] = { PCI_ADDR_CONFIG, 0, 0, 0, 0}; @@ -5104,8 +5111,8 @@ pcicfg_ari_configure(dev_info_t *dip) #ifdef DEBUG static void -debug(char *fmt, uintptr_t a1, uintptr_t a2, uintptr_t a3, - uintptr_t a4, uintptr_t a5) +debug(char *fmt, uintptr_t a1, uintptr_t a2, uintptr_t a3, uintptr_t a4, + uintptr_t a5) { if (pcicfg_debug > 1) { prom_printf("pcicfg: "); |
