diff options
Diffstat (limited to 'usr/src/uts/sun4/io/px/px_fm.c')
-rw-r--r-- | usr/src/uts/sun4/io/px/px_fm.c | 624 |
1 files changed, 451 insertions, 173 deletions
diff --git a/usr/src/uts/sun4/io/px/px_fm.c b/usr/src/uts/sun4/io/px/px_fm.c index 727d0351a6..65c1d78d29 100644 --- a/usr/src/uts/sun4/io/px/px_fm.c +++ b/usr/src/uts/sun4/io/px/px_fm.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -33,6 +33,7 @@ #include <sys/sunddi.h> #include <sys/fm/protocol.h> #include <sys/fm/util.h> +#include <sys/fm/io/pci.h> #include <sys/membar.h> #include "px_obj.h" @@ -43,26 +44,35 @@ (PCIE_AER_UCE_TRAINING | PCIE_AER_UCE_SD | PCIE_AER_UCE_CA | \ PCIE_AER_UCE_UC | PCIE_AER_UCE_UR) -static void px_err_fill_pfd(dev_info_t *rpdip, px_err_pcie_t *regs); +/* + * Global panicing state variabled used to control if further error handling + * should occur. If the system is already panic'ing or if PX itself has + * recommended panic'ing the system, no further error handling should occur to + * prevent the system from hanging. + */ +boolean_t px_panicing = B_FALSE; + +static pf_data_t *px_get_pfd(px_t *px_p); + static int px_pcie_ptlp(dev_info_t *dip, ddi_fm_error_t *derr, px_err_pcie_t *regs); #if defined(DEBUG) -static void px_pcie_log(dev_info_t *dip, px_err_pcie_t *regs, int severity); +static void px_pcie_log(dev_info_t *dip, px_err_pcie_t *regs); #else /* DEBUG */ #define px_pcie_log 0 && #endif /* DEBUG */ -/* external functions */ -extern int pci_xcap_locate(ddi_acc_handle_t h, uint16_t id, uint16_t *base_p); -extern int pci_lcap_locate(ddi_acc_handle_t h, uint8_t id, uint16_t *base_p); - /* * Initialize px FMA support */ int px_fm_attach(px_t *px_p) { + int i; + dev_info_t *dip = px_p->px_dip; + pcie_bus_t *bus_p; + px_p->px_fm_cap = DDI_FM_EREPORT_CAPABLE | DDI_FM_ERRCB_CAPABLE | DDI_FM_ACCCHK_CAPABLE | DDI_FM_DMACHK_CAPABLE; @@ -75,7 +85,7 @@ px_fm_attach(px_t *px_p) /* * check parents' capability */ - ddi_fm_init(px_p->px_dip, &px_p->px_fm_cap, &px_p->px_fm_ibc); + ddi_fm_init(dip, &px_p->px_fm_cap, &px_p->px_fm_ibc); /* * parents need to be ereport and error handling capable @@ -89,10 +99,22 @@ px_fm_attach(px_t *px_p) mutex_init(&px_p->px_fm_mutex, NULL, MUTEX_DRIVER, (void *)px_p->px_fm_ibc); + + pcie_rc_init_bus(dip); + + px_p->px_pfd_idx = 0; + for (i = 0; i < 5; i++) + pcie_rc_init_pfd(dip, &px_p->px_pfd_arr[i]); + PCIE_DIP2PFD(dip) = px_p->px_pfd_arr; + + bus_p = PCIE_DIP2BUS(dip); + bus_p->bus_rp_bdf = px_p->px_bdf; + bus_p->bus_rp_dip = dip; + /* * register error callback in parent */ - ddi_fm_handler_register(px_p->px_dip, px_fm_callback, px_p); + ddi_fm_handler_register(dip, px_fm_callback, px_p); return (DDI_SUCCESS); } @@ -103,9 +125,14 @@ px_fm_attach(px_t *px_p) void px_fm_detach(px_t *px_p) { + int i; + ddi_fm_handler_unregister(px_p->px_dip); mutex_destroy(&px_p->px_fm_mutex); ddi_fm_fini(px_p->px_dip); + for (i = 0; i < 5; i++) + pcie_rc_fini_pfd(&px_p->px_pfd_arr[i]); + pcie_rc_fini_bus(px_p->px_dip); } /* @@ -113,9 +140,10 @@ px_fm_detach(px_t *px_p) * protection. */ void -px_fm_acc_setup(ddi_map_req_t *mp, dev_info_t *rdip) +px_fm_acc_setup(ddi_map_req_t *mp, dev_info_t *rdip, pci_regspec_t *rp) { uchar_t fflag; + ndi_err_t *errp; ddi_acc_hdl_t *hp; ddi_acc_impl_t *ap; @@ -143,6 +171,13 @@ px_fm_acc_setup(ddi_map_req_t *mp, dev_info_t *rdip) ap->ahi_rep_put16 = i_ddi_prot_rep_put16; ap->ahi_rep_put32 = i_ddi_prot_rep_put32; ap->ahi_rep_put64 = i_ddi_prot_rep_put64; + impl_acc_err_init(hp); + errp = ((ddi_acc_impl_t *)hp)->ahi_err; + if ((rp->pci_phys_hi & PCI_REG_ADDR_M) == + PCI_ADDR_CONFIG) + errp->err_cf = px_err_cfg_hdl_check; + else + errp->err_cf = px_err_pio_hdl_check; break; case DDI_CAUTIOUS_ACC : ap->ahi_get8 = i_ddi_caut_get8; @@ -161,6 +196,13 @@ px_fm_acc_setup(ddi_map_req_t *mp, dev_info_t *rdip) ap->ahi_rep_put16 = i_ddi_caut_rep_put16; ap->ahi_rep_put32 = i_ddi_caut_rep_put32; ap->ahi_rep_put64 = i_ddi_caut_rep_put64; + impl_acc_err_init(hp); + errp = ((ddi_acc_impl_t *)hp)->ahi_err; + if ((rp->pci_phys_hi & PCI_REG_ADDR_M) == + PCI_ADDR_CONFIG) + errp->err_cf = px_err_cfg_hdl_check; + else + errp->err_cf = px_err_pio_hdl_check; break; default: break; @@ -221,6 +263,24 @@ px_bus_exit(dev_info_t *dip, ddi_acc_handle_t handle) mutex_exit(&pec_p->pec_pokefault_mutex); } +static uint64_t +px_in_addr_range(dev_info_t *dip, px_ranges_t *ranges_p, uint64_t addr) +{ + uint64_t addr_low, addr_high; + + addr_low = ((uint64_t)ranges_p->parent_high << 32) | + (uint64_t)ranges_p->parent_low; + addr_high = addr_low + ((uint64_t)ranges_p->size_high << 32) + + (uint64_t)ranges_p->size_low; + + DBG(DBG_ERR_INTR, dip, "Addr: 0x%llx high: 0x%llx low: 0x%llx\n", + addr, addr_high, addr_low); + + if ((addr < addr_high) && (addr >= addr_low)) + return (addr_low); + + return (0); +} /* * PCI error callback which is registered with our parent to call @@ -234,8 +294,9 @@ px_fm_callback(dev_info_t *dip, ddi_fm_error_t *derr, const void *impl_data) dev_info_t *pdip = ddi_get_parent(dip); px_t *px_p = (px_t *)impl_data; int i, acc_type = 0; - int lookup, rc_err, fab_err = PF_NO_PANIC; - uint32_t addr, addr_high, addr_low; + int lookup, rc_err, fab_err; + uint64_t addr, base_addr; + uint64_t fault_addr = (uint64_t)derr->fme_bus_specific; pcie_req_id_t bdf; px_ranges_t *ranges_p; int range_len; @@ -250,11 +311,10 @@ px_fm_callback(dev_info_t *dip, ddi_fm_error_t *derr, const void *impl_data) return (DDI_FM_FATAL); i_ddi_fm_handler_exit(pdip); - mutex_enter(&px_p->px_fm_mutex); - px_p->px_fm_mutex_owner = curthread; - - addr_high = (uint32_t)((uint64_t)derr->fme_bus_specific >> 32); - addr_low = (uint32_t)((uint64_t)derr->fme_bus_specific); + if (px_fm_enter(px_p) != DDI_SUCCESS) { + i_ddi_fm_handler_enter(pdip); + return (DDI_FM_FATAL); + } /* * Make sure this failed load came from this PCIe port. Check by @@ -263,21 +323,20 @@ px_fm_callback(dev_info_t *dip, ddi_fm_error_t *derr, const void *impl_data) range_len = px_p->px_ranges_length / sizeof (px_ranges_t); i = 0; for (ranges_p = px_p->px_ranges_p; i < range_len; i++, ranges_p++) { - if (ranges_p->parent_high == addr_high) { + base_addr = px_in_addr_range(dip, ranges_p, fault_addr); + if (base_addr) { switch (ranges_p->child_high & PCI_ADDR_MASK) { case PCI_ADDR_CONFIG: - acc_type = PF_CFG_ADDR; + acc_type = PF_ADDR_CFG; addr = NULL; - bdf = (pcie_req_id_t)(addr_low >> 12); + bdf = (pcie_req_id_t)((fault_addr >> 12) & + 0xFFFF); break; case PCI_ADDR_IO: - acc_type = PF_IO_ADDR; - addr = addr_low; - bdf = NULL; - break; + case PCI_ADDR_MEM64: case PCI_ADDR_MEM32: - acc_type = PF_DMA_ADDR; - addr = addr_low; + acc_type = PF_ADDR_PIO; + addr = fault_addr - base_addr; bdf = NULL; break; } @@ -287,39 +346,31 @@ px_fm_callback(dev_info_t *dip, ddi_fm_error_t *derr, const void *impl_data) /* This address doesn't belong to this leaf, just return with OK */ if (!acc_type) { - px_p->px_fm_mutex_owner = NULL; - mutex_exit(&px_p->px_fm_mutex); + px_fm_exit(px_p); i_ddi_fm_handler_enter(pdip); return (DDI_FM_OK); - } else if (acc_type == PF_IO_ADDR) { - px_p->px_fm_mutex_owner = NULL; - mutex_exit(&px_p->px_fm_mutex); - i_ddi_fm_handler_enter(pdip); - return (DDI_FM_FATAL); } rc_err = px_err_cmn_intr(px_p, derr, PX_TRAP_CALL, PX_FM_BLOCK_ALL); - lookup = pf_hdl_lookup(dip, derr->fme_ena, acc_type, addr, bdf); + lookup = pf_hdl_lookup(dip, derr->fme_ena, acc_type, (uint64_t)addr, + bdf); - if (!px_lib_is_in_drain_state(px_p)) { - /* - * This is to ensure that device corresponding to the addr of - * the failed PIO/CFG load gets scanned. - */ - px_rp_en_q(px_p, bdf, addr, - (PCI_STAT_R_MAST_AB | PCI_STAT_R_TARG_AB)); - fab_err = pf_scan_fabric(dip, derr, px_p->px_dq_p, - &px_p->px_dq_tail); - } + px_rp_en_q(px_p, bdf, addr, + (PCI_STAT_R_MAST_AB | PCI_STAT_R_TARG_AB)); - px_p->px_fm_mutex_owner = NULL; - mutex_exit(&px_p->px_fm_mutex); + fab_err = px_scan_fabric(px_p, dip, derr); + + px_fm_exit(px_p); i_ddi_fm_handler_enter(pdip); - if ((rc_err & (PX_PANIC | PX_PROTECTED)) || (fab_err & PF_PANIC) || + if (!px_die) + return (DDI_FM_OK); + + if ((rc_err & (PX_PANIC | PX_PROTECTED)) || + (fab_err & PF_ERR_FATAL_FLAGS) || (lookup == PF_HDL_NOTFOUND)) return (DDI_FM_FATAL); - else if ((rc_err == PX_NO_ERROR) && (fab_err == PF_NO_ERROR)) + else if ((rc_err == PX_NO_ERROR) && (fab_err == PF_ERR_NO_ERROR)) return (DDI_FM_OK); return (DDI_FM_NONFATAL); @@ -341,11 +392,13 @@ uint_t px_err_fabric_intr(px_t *px_p, msgcode_t msg_code, pcie_req_id_t rid) { dev_info_t *rpdip = px_p->px_dip; - int rc_err, fab_err = PF_NO_PANIC; + int rc_err, fab_err; ddi_fm_error_t derr; + uint32_t rp_status; + uint16_t ce_source, ue_source; - mutex_enter(&px_p->px_fm_mutex); - px_p->px_fm_mutex_owner = curthread; + if (px_fm_enter(px_p) != DDI_SUCCESS) + goto done; /* Create the derr */ bzero(&derr, sizeof (ddi_fm_error_t)); @@ -353,26 +406,72 @@ px_err_fabric_intr(px_t *px_p, msgcode_t msg_code, pcie_req_id_t rid) derr.fme_ena = fm_ena_generate(0, FM_ENA_FMT1); derr.fme_flag = DDI_FM_ERR_UNEXPECTED; + px_err_safeacc_check(px_p, &derr); + + if (msg_code == PCIE_MSG_CODE_ERR_COR) { + rp_status = PCIE_AER_RE_STS_CE_RCVD; + ce_source = rid; + ue_source = 0; + } else { + rp_status = PCIE_AER_RE_STS_FE_NFE_RCVD; + ce_source = 0; + ue_source = rid; + if (msg_code == PCIE_MSG_CODE_ERR_NONFATAL) + rp_status |= PCIE_AER_RE_STS_NFE_MSGS_RCVD; + else { + rp_status |= PCIE_AER_RE_STS_FE_MSGS_RCVD; + rp_status |= PCIE_AER_RE_STS_FIRST_UC_FATAL; + } + } + + if (derr.fme_flag == DDI_FM_ERR_UNEXPECTED) { + ddi_fm_ereport_post(rpdip, PCI_ERROR_SUBCLASS "." PCIEX_FABRIC, + derr.fme_ena, + DDI_NOSLEEP, FM_VERSION, DATA_TYPE_UINT8, 0, + FIRE_PRIMARY, DATA_TYPE_BOOLEAN_VALUE, B_TRUE, + "pcie_adv_rp_status", DATA_TYPE_UINT32, rp_status, + "pcie_adv_rp_command", DATA_TYPE_UINT32, 0, + "pcie_adv_rp_ce_src_id", DATA_TYPE_UINT16, ce_source, + "pcie_adv_rp_ue_src_id", DATA_TYPE_UINT16, ue_source, + NULL); + } + /* Ensure that the rid of the fabric message will get scanned. */ px_rp_en_q(px_p, rid, NULL, NULL); rc_err = px_err_cmn_intr(px_p, &derr, PX_INTR_CALL, PX_FM_BLOCK_PCIE); /* call rootport dispatch */ - if (!px_lib_is_in_drain_state(px_p)) { - fab_err = pf_scan_fabric(rpdip, &derr, px_p->px_dq_p, - &px_p->px_dq_tail); - } - - px_p->px_fm_mutex_owner = NULL; - mutex_exit(&px_p->px_fm_mutex); + fab_err = px_scan_fabric(px_p, rpdip, &derr); - px_err_panic(rc_err, PX_RC, fab_err); + px_err_panic(rc_err, PX_RC, fab_err, B_TRUE); + px_fm_exit(px_p); + px_err_panic(rc_err, PX_RC, fab_err, B_FALSE); +done: return (DDI_INTR_CLAIMED); } /* + * px_scan_fabric: + * + * Check for drain state and if there is anything to scan. + */ +int +px_scan_fabric(px_t *px_p, dev_info_t *rpdip, ddi_fm_error_t *derr) { + int fab_err = 0; + + ASSERT(MUTEX_HELD(&px_p->px_fm_mutex)); + + if (!px_lib_is_in_drain_state(px_p) && px_p->px_pfd_idx) { + fab_err = pf_scan_fabric(rpdip, derr, px_p->px_pfd_arr); + px_p->px_pfd_idx = 0; + } + + return (fab_err); +} + +/* * px_err_safeacc_check: * Check to see if a peek/poke and cautious access is currently being * done on a particular leaf. @@ -456,103 +555,79 @@ px_err_check_eq(dev_info_t *dip) return (PX_NO_PANIC); } -static void -px_err_fill_pfd(dev_info_t *rpdip, px_err_pcie_t *regs) +/* ARGSUSED */ +int +px_err_check_pcie(dev_info_t *dip, ddi_fm_error_t *derr, px_err_pcie_t *regs) { - px_t *px_p = DIP_TO_STATE(rpdip); - pf_data_t pf_data = {0}; - pcie_req_id_t fault_bdf = 0; - uint32_t fault_addr = 0; - uint16_t s_status = 0; - - pf_data.rp_bdf = px_p->px_bdf; + px_t *px_p = DIP_TO_STATE(dip); + pf_data_t *pfd_p = px_get_pfd(px_p); + int i; + pf_pcie_adv_err_regs_t *adv_reg = PCIE_ADV_REG(pfd_p); /* * set RC s_status in PCI term to coordinate with downstream fabric * errors ananlysis. */ if (regs->primary_ue & PCIE_AER_UCE_UR) - s_status = PCI_STAT_R_MAST_AB; + PCI_BDG_ERR_REG(pfd_p)->pci_bdg_sec_stat = PCI_STAT_R_MAST_AB; if (regs->primary_ue & PCIE_AER_UCE_CA) - s_status = PCI_STAT_R_TARG_AB; + PCI_BDG_ERR_REG(pfd_p)->pci_bdg_sec_stat = PCI_STAT_R_TARG_AB; if (regs->primary_ue & (PCIE_AER_UCE_PTLP | PCIE_AER_UCE_ECRC)) - s_status = PCI_STAT_PERROR; - - if (regs->primary_ue & (PCIE_AER_UCE_UR | PCIE_AER_UCE_CA)) { - pf_data.aer_h0 = regs->rx_hdr1; - pf_data.aer_h1 = regs->rx_hdr2; - pf_data.aer_h2 = regs->rx_hdr3; - pf_data.aer_h3 = regs->rx_hdr4; - - pf_tlp_decode(rpdip, &pf_data, &fault_bdf, NULL, NULL); - } else if (regs->primary_ue & PCIE_AER_UCE_PTLP) { - pcie_tlp_hdr_t *tlp_p; - - pf_data.aer_h0 = regs->rx_hdr1; - pf_data.aer_h1 = regs->rx_hdr2; - pf_data.aer_h2 = regs->rx_hdr3; - pf_data.aer_h3 = regs->rx_hdr4; - - tlp_p = (pcie_tlp_hdr_t *)&pf_data.aer_h0; - if (tlp_p->type == PCIE_TLP_TYPE_CPL) - pf_tlp_decode(rpdip, &pf_data, &fault_bdf, NULL, NULL); - - pf_data.aer_h0 = regs->tx_hdr1; - pf_data.aer_h1 = regs->tx_hdr2; - pf_data.aer_h2 = regs->tx_hdr3; - pf_data.aer_h3 = regs->tx_hdr4; - - pf_tlp_decode(rpdip, &pf_data, NULL, &fault_addr, NULL); - } + PCI_BDG_ERR_REG(pfd_p)->pci_bdg_sec_stat = PCI_STAT_PERROR; - px_rp_en_q(px_p, fault_bdf, fault_addr, s_status); -} - -int -px_err_check_pcie(dev_info_t *dip, ddi_fm_error_t *derr, px_err_pcie_t *regs) -{ - uint32_t ce_reg, ue_reg; - int err = PX_NO_ERROR; - - ce_reg = regs->ce_reg; - if (ce_reg) - err |= (ce_reg & px_fabric_die_rc_ce) ? PX_PANIC : PX_NO_ERROR; - - ue_reg = regs->ue_reg; - if (!ue_reg) + if (!regs->primary_ue) goto done; - if (ue_reg & PCIE_AER_UCE_PTLP) - err |= px_pcie_ptlp(dip, derr, regs); + adv_reg->pcie_ce_status = regs->ce_reg; + adv_reg->pcie_ue_status = regs->ue_reg | regs->primary_ue; + PCIE_ADV_HDR(pfd_p, 0) = regs->rx_hdr1; + PCIE_ADV_HDR(pfd_p, 1) = regs->rx_hdr2; + PCIE_ADV_HDR(pfd_p, 2) = regs->rx_hdr3; + PCIE_ADV_HDR(pfd_p, 3) = regs->rx_hdr4; + for (i = regs->primary_ue; i != 1; i = i >> 1) + adv_reg->pcie_adv_ctl++; - if (ue_reg & PX_PCIE_PANIC_BITS) - err |= PX_PANIC; + if (regs->primary_ue & (PCIE_AER_UCE_UR | PCIE_AER_UCE_CA)) { + if (pf_tlp_decode(PCIE_DIP2BUS(dip), adv_reg) == DDI_SUCCESS) + PCIE_ROOT_FAULT(pfd_p)->fault_bdf = + adv_reg->pcie_ue_tgt_bdf; + } else if (regs->primary_ue & PCIE_AER_UCE_PTLP) { + if (pf_tlp_decode(PCIE_DIP2BUS(dip), adv_reg) == DDI_SUCCESS) { + PCIE_ROOT_FAULT(pfd_p)->fault_bdf = + adv_reg->pcie_ue_tgt_bdf; + if (adv_reg->pcie_ue_tgt_trans == + PF_ADDR_PIO) + PCIE_ROOT_FAULT(pfd_p)->fault_addr = + adv_reg->pcie_ue_tgt_addr; + } - if (ue_reg & PX_PCIE_NO_PANIC_BITS) - err |= PX_NO_PANIC; + /* + * Normally for Poisoned Completion TLPs we can look at the + * transmit log header for the original request and the original + * address, however this doesn't seem to be working. HW BUG. + */ + } - /* Scan the fabric to clean up error bits, for the following errors. */ - if (ue_reg & (PCIE_AER_UCE_PTLP | PCIE_AER_UCE_CA | PCIE_AER_UCE_UR)) - px_err_fill_pfd(dip, regs); done: - px_pcie_log(dip, regs, err); - return (err); + px_pcie_log(dip, regs); + + /* Return No Error here and let the pcie misc module analyse it */ + return (PX_NO_ERROR); } #if defined(DEBUG) static void -px_pcie_log(dev_info_t *dip, px_err_pcie_t *regs, int severity) +px_pcie_log(dev_info_t *dip, px_err_pcie_t *regs) { DBG(DBG_ERR_INTR, dip, - "A PCIe RC error has occured with a severity of \"%s\"\n" + "A PCIe RC error has occured\n" "\tCE: 0x%x UE: 0x%x Primary UE: 0x%x\n" "\tTX Hdr: 0x%x 0x%x 0x%x 0x%x\n\tRX Hdr: 0x%x 0x%x 0x%x 0x%x\n", - (severity & PX_PANIC) ? "PANIC" : "NO PANIC", regs->ce_reg, - regs->ue_reg, regs->primary_ue, regs->tx_hdr1, regs->tx_hdr2, - regs->tx_hdr3, regs->tx_hdr4, regs->rx_hdr1, regs->rx_hdr2, - regs->rx_hdr3, regs->rx_hdr4); + regs->ce_reg, regs->ue_reg, regs->primary_ue, + regs->tx_hdr1, regs->tx_hdr2, regs->tx_hdr3, regs->tx_hdr4, + regs->rx_hdr1, regs->rx_hdr2, regs->rx_hdr3, regs->rx_hdr4); } -#endif /* DEBUG */ +#endif /* * look through poisoned TLP cases and suggest panic/no panic depend on @@ -561,12 +636,12 @@ px_pcie_log(dev_info_t *dip, px_err_pcie_t *regs, int severity) static int px_pcie_ptlp(dev_info_t *dip, ddi_fm_error_t *derr, px_err_pcie_t *regs) { - px_t *px_p = DIP_TO_STATE(dip); - pf_data_t pf_data; + pf_pcie_adv_err_regs_t adv_reg; pcie_req_id_t bdf; - uint32_t addr, trans_type; + uint64_t addr; + uint32_t trans_type; int tlp_sts, tlp_cmd; - int sts = PF_HDL_NOTFOUND; + int lookup = PF_HDL_NOTFOUND; if (regs->primary_ue != PCIE_AER_UCE_PTLP) return (PX_PANIC); @@ -574,18 +649,21 @@ px_pcie_ptlp(dev_info_t *dip, ddi_fm_error_t *derr, px_err_pcie_t *regs) if (!regs->rx_hdr1) goto done; - pf_data.rp_bdf = px_p->px_bdf; - pf_data.aer_h0 = regs->rx_hdr1; - pf_data.aer_h1 = regs->rx_hdr2; - pf_data.aer_h2 = regs->rx_hdr3; - pf_data.aer_h3 = regs->rx_hdr4; + adv_reg.pcie_ue_hdr[0] = regs->rx_hdr1; + adv_reg.pcie_ue_hdr[1] = regs->rx_hdr2; + adv_reg.pcie_ue_hdr[2] = regs->rx_hdr3; + adv_reg.pcie_ue_hdr[3] = regs->rx_hdr4; - tlp_sts = pf_tlp_decode(dip, &pf_data, &bdf, &addr, &trans_type); - tlp_cmd = ((pcie_tlp_hdr_t *)(&pf_data.aer_h0))->type; + tlp_sts = pf_tlp_decode(PCIE_DIP2BUS(dip), &adv_reg); + tlp_cmd = ((pcie_tlp_hdr_t *)(adv_reg.pcie_ue_hdr))->type; if (tlp_sts == DDI_FAILURE) goto done; + bdf = adv_reg.pcie_ue_tgt_bdf; + addr = adv_reg.pcie_ue_tgt_addr; + trans_type = adv_reg.pcie_ue_tgt_trans; + switch (tlp_cmd) { case PCIE_TLP_TYPE_CPL: case PCIE_TLP_TYPE_CPLLK: @@ -594,24 +672,58 @@ px_pcie_ptlp(dev_info_t *dip, ddi_fm_error_t *derr, px_err_pcie_t *regs) * from the RX TLP, and the original address from the TX TLP. */ if (regs->tx_hdr1) { - pf_data.aer_h0 = regs->tx_hdr1; - pf_data.aer_h1 = regs->tx_hdr2; - pf_data.aer_h2 = regs->tx_hdr3; - pf_data.aer_h3 = regs->tx_hdr4; + adv_reg.pcie_ue_hdr[0] = regs->tx_hdr1; + adv_reg.pcie_ue_hdr[1] = regs->tx_hdr2; + adv_reg.pcie_ue_hdr[2] = regs->tx_hdr3; + adv_reg.pcie_ue_hdr[3] = regs->tx_hdr4; - sts = pf_tlp_decode(dip, &pf_data, NULL, &addr, - &trans_type); + lookup = pf_tlp_decode(PCIE_DIP2BUS(dip), &adv_reg); + if (lookup != DDI_SUCCESS) + break; + addr = adv_reg.pcie_ue_tgt_addr; + trans_type = adv_reg.pcie_ue_tgt_trans; } /* FALLTHRU */ case PCIE_TLP_TYPE_IO: case PCIE_TLP_TYPE_MEM: case PCIE_TLP_TYPE_MEMLK: - sts = pf_hdl_lookup(dip, derr->fme_ena, trans_type, addr, bdf); + lookup = pf_hdl_lookup(dip, derr->fme_ena, trans_type, addr, + bdf); break; default: - sts = PF_HDL_NOTFOUND; + lookup = PF_HDL_NOTFOUND; } done: - return (sts == PF_HDL_NOTFOUND ? PX_PANIC : PX_NO_PANIC); + return (lookup == PF_HDL_FOUND ? PX_NO_PANIC : PX_PANIC); +} + +/* + * px_get_pdf automatically allocates a RC pf_data_t and returns a pointer to + * it. This function should be used when an error requires a fabric scan. + */ +static pf_data_t * +px_get_pfd(px_t *px_p) { + int idx = px_p->px_pfd_idx++; + pf_data_t *pfd_p = &px_p->px_pfd_arr[idx]; + + /* Clear Old Data */ + PCIE_ROOT_FAULT(pfd_p)->fault_bdf = 0; + PCIE_ROOT_FAULT(pfd_p)->fault_addr = 0; + PCI_BDG_ERR_REG(pfd_p)->pci_bdg_sec_stat = 0; + PCIE_ADV_REG(pfd_p)->pcie_ce_status = 0; + PCIE_ADV_REG(pfd_p)->pcie_ue_status = 0; + + pfd_p->pe_next = NULL; + + if (idx > 0) { + px_p->px_pfd_arr[idx - 1].pe_next = pfd_p; + pfd_p->pe_prev = &px_p->px_pfd_arr[idx - 1]; + } else { + pfd_p->pe_prev = NULL; + } + + pfd_p->pe_valid = B_TRUE; + + return (pfd_p); } /* @@ -627,47 +739,208 @@ done: * (ie S-TA/MA, R-TA) * Either the fault bdf or addr may be NULL, but not both. */ -int px_foo = 0; void px_rp_en_q(px_t *px_p, pcie_req_id_t fault_bdf, uint32_t fault_addr, uint16_t s_status) { - pf_data_t pf_data = {0}; + pf_data_t *pfd_p; if (!fault_bdf && !fault_addr) return; - pf_data.dev_type = PCIE_PCIECAP_DEV_TYPE_ROOT; - if (px_foo) { - pf_data.fault_bdf = px_foo; - px_foo = 0; - } else - pf_data.fault_bdf = fault_bdf; + pfd_p = px_get_pfd(px_p); + + PCIE_ROOT_FAULT(pfd_p)->fault_bdf = fault_bdf; + PCIE_ROOT_FAULT(pfd_p)->fault_addr = (uint64_t)fault_addr; + PCI_BDG_ERR_REG(pfd_p)->pci_bdg_sec_stat = s_status; +} + + +/* + * Find and Mark CFG Handles as failed associated with the given BDF. We should + * always know the BDF for CFG accesses, since it is encoded in the address of + * the TLP. Since there can be multiple cfg handles, mark them all as failed. + */ +/* ARGSUSED */ +int +px_err_cfg_hdl_check(dev_info_t *dip, const void *handle, const void *arg1, + const void *arg2) +{ + int status = DDI_FM_FATAL; + uint32_t addr = *(uint32_t *)arg1; + uint16_t bdf = *(uint16_t *)arg2; + pcie_bus_t *bus_p; + + DBG(DBG_ERR_INTR, dip, "Check CFG Hdl: dip 0x%p addr 0x%x bdf=0x%x\n", + dip, addr, bdf); - pf_data.bdf = px_p->px_bdf; - pf_data.rp_bdf = px_p->px_bdf; - pf_data.fault_addr = fault_addr; - pf_data.s_status = s_status; - pf_data.send_erpt = PF_SEND_ERPT_NO; + bus_p = PCIE_DIP2BUS(dip); - (void) pf_en_dq(&pf_data, px_p->px_dq_p, &px_p->px_dq_tail, -1); + /* + * Because CFG and IO Acc Handlers are on the same cache list and both + * types of hdls gets called for both types of errors. For this checker + * only mark the device as "Non-Fatal" if the addr == NULL and bdf != + * NULL. + */ + status = (!addr && (bus_p->bus_bdf == bdf)) ? DDI_FM_NONFATAL : + DDI_FM_FATAL; + + return (status); +} + +/* + * Find and Mark all ACC Handles associated with a give address and BDF as + * failed. If the BDF != NULL, then check to see if the device has a ACC Handle + * associated with ADDR. If the handle is not found, mark all the handles as + * failed. If the BDF == NULL, mark the handle as failed if it is associated + * with ADDR. + */ +int +px_err_pio_hdl_check(dev_info_t *dip, const void *handle, const void *arg1, + const void *arg2) +{ + dev_info_t *px_dip = PCIE_DIP2BUS(dip)->bus_rp_dip; + px_t *px_p = INST_TO_STATE(ddi_get_instance(px_dip)); + px_ranges_t *ranges_p; + int range_len; + ddi_acc_handle_t ap = (ddi_acc_handle_t)handle; + ddi_acc_hdl_t *hp = impl_acc_hdl_get(ap); + int i, status = DDI_FM_FATAL; + uint64_t fault_addr = *(uint64_t *)arg1; + uint16_t bdf = *(uint16_t *)arg2; + uint64_t base_addr, range_addr; + uint_t size; + + DBG(DBG_ERR_INTR, dip, "Check PIO Hdl: dip 0x%x addr 0x%x bdf=0x%x\n", + dip, fault_addr, bdf); + + /* Normalize the base addr to the addr and strip off the HB info. */ + base_addr = (hp->ah_pfn << MMU_PAGESHIFT) + hp->ah_offset; + range_len = px_p->px_ranges_length / sizeof (px_ranges_t); + i = 0; + for (ranges_p = px_p->px_ranges_p; i < range_len; i++, ranges_p++) { + range_addr = px_in_addr_range(dip, ranges_p, base_addr); + if (range_addr) { + switch (ranges_p->child_high & PCI_ADDR_MASK) { + case PCI_ADDR_IO: + case PCI_ADDR_MEM64: + case PCI_ADDR_MEM32: + base_addr = base_addr - range_addr; + break; + } + break; + } + } + + /* + * Mark the handle as failed if the ADDR is mapped, or if we + * know the BDF and ADDR == 0. + */ + size = hp->ah_len; + if (((fault_addr >= base_addr) && (fault_addr < (base_addr + size))) || + ((fault_addr == NULL) && (bdf == PCIE_DIP2BUS(dip)->bus_bdf))) + status = DDI_FM_NONFATAL; + + return (status); +} + +/* + * Find and Mark all DNA Handles associated with a give address and BDF as + * failed. If the BDF != NULL, then check to see if the device has a DMA Handle + * associated with ADDR. If the handle is not found, mark all the handles as + * failed. If the BDF == NULL, mark the handle as failed if it is associated + * with ADDR. + */ +int +px_err_dma_hdl_check(dev_info_t *dip, const void *handle, const void *arg1, + const void *arg2) +{ + ddi_dma_impl_t *pcie_dp; + int status = DDI_FM_FATAL; + uint32_t addr = *(uint32_t *)arg1; + uint16_t bdf = *(uint16_t *)arg2; + uint32_t base_addr; + uint_t size; + + DBG(DBG_ERR_INTR, dip, "Check PIO Hdl: dip 0x%x addr 0x%x bdf=0x%x\n", + dip, addr, bdf); + + pcie_dp = (ddi_dma_impl_t *)handle; + base_addr = (uint32_t)pcie_dp->dmai_mapping; + size = pcie_dp->dmai_size; + + /* + * Mark the handle as failed if the ADDR is mapped, or if we + * know the BDF and ADDR == 0. + */ + if (((addr >= base_addr) && (addr < (base_addr + size))) || + ((addr == NULL) && (bdf != NULL))) + status = DDI_FM_NONFATAL; + + return (status); +} + +int +px_fm_enter(px_t *px_p) { + if (px_panicing || (px_p->px_fm_mutex_owner == curthread)) + return (DDI_FAILURE); + + mutex_enter(&px_p->px_fm_mutex); + /* + * In rare cases when trap occurs and in the middle of scanning the + * fabric, a PIO will fail in the scan fabric. The CPU error handling + * code will correctly panic the system, while a mondo for the failed + * PIO may also show up. Normally the mondo will try to grab the mutex + * and wait until the callback finishes. But in this rare case, + * mutex_enter actually suceeds also continues to scan the fabric. + * + * This code below is designed specifically to check for this case. If + * we successfully grab the px_fm_mutex, the px_fm_mutex_owner better be + * NULL. If it isn't that means we are in the rare corner case. Return + * DDI_FAILURE, this should prevent PX from doing anymore error + * handling. + */ + if (px_p->px_fm_mutex_owner) { + return (DDI_FAILURE); + } + + px_p->px_fm_mutex_owner = curthread; + + if (px_panicing) { + px_fm_exit(px_p); + return (DDI_FAILURE); + } + return (DDI_SUCCESS); +} + +void +px_fm_exit(px_t *px_p) { + px_p->px_fm_mutex_owner = NULL; + mutex_exit(&px_p->px_fm_mutex); } /* * Panic if the err tunable is set and that we are not already in the middle * of panic'ing. + * + * rc_err = Error severity of PX specific errors + * msg = Where the error was detected + * fabric_err = Error severity of PCIe Fabric errors + * isTest = Test if error severity causes panic */ #define MSZ (sizeof (fm_msg) -strlen(fm_msg) - 1) void -px_err_panic(int err, int msg, int fab_err) +px_err_panic(int rc_err, int msg, int fabric_err, boolean_t isTest) { char fm_msg[96] = ""; int ferr = PX_NO_ERROR; - if (panicstr) + if (panicstr) { + px_panicing = B_TRUE; return; + } - if (!(err & px_die)) + if (!(rc_err & px_die)) goto fabric; if (msg & PX_RC) (void) strncat(fm_msg, px_panic_rc_msg, MSZ); @@ -677,17 +950,22 @@ px_err_panic(int err, int msg, int fab_err) (void) strncat(fm_msg, px_panic_hb_msg, MSZ); fabric: - if (fab_err & PF_PANIC) + if (fabric_err & PF_ERR_FATAL_FLAGS) ferr = PX_PANIC; - else if (fab_err & ~(PF_PANIC | PF_NO_ERROR)) + else if (fabric_err & ~(PF_ERR_FATAL_FLAGS | PF_ERR_NO_ERROR)) ferr = PX_NO_PANIC; if (ferr & px_die) { - if (strlen(fm_msg)) + if (strlen(fm_msg)) { (void) strncat(fm_msg, " and", MSZ); + } (void) strncat(fm_msg, px_panic_fab_msg, MSZ); } - if (strlen(fm_msg)) - fm_panic("Fatal error has occured in:%s.", fm_msg); + if (strlen(fm_msg)) { + px_panicing = B_TRUE; + if (!isTest) + fm_panic("Fatal error has occured in:%s.(0x%x)(0x%x)", + fm_msg, rc_err, fabric_err); + } } |