From f237ed540aa1cb834711c90e9bb71c244f362653 Mon Sep 17 00:00:00 2001 From: "Joshua M. Clulow" Date: Thu, 25 Feb 2016 10:51:36 +0000 Subject: XXX move to ddi_periodic, panic on firmware hang --- usr/src/uts/common/io/cpqary3/cpqary3.c | 21 +-- usr/src/uts/common/io/cpqary3/cpqary3.h | 12 +- usr/src/uts/common/io/cpqary3/cpqary3_isr.c | 36 +---- usr/src/uts/common/io/cpqary3/cpqary3_talk2ctlr.c | 10 +- usr/src/uts/common/io/cpqary3/cpqary3_util.c | 157 +++++++++++----------- 5 files changed, 108 insertions(+), 128 deletions(-) diff --git a/usr/src/uts/common/io/cpqary3/cpqary3.c b/usr/src/uts/common/io/cpqary3/cpqary3.c index ff4aada92c..f95befda10 100644 --- a/usr/src/uts/common/io/cpqary3/cpqary3.c +++ b/usr/src/uts/common/io/cpqary3/cpqary3.c @@ -412,11 +412,6 @@ cpqary3_attach(dev_info_t *dip, ddi_attach_cmd_t attach_cmd) } - /* Register a timeout driver-routine to be called every 2 secs */ - cpqary3p->tick_tmout_id = timeout(cpqary3_tick_hdlr, - (caddr_t)cpqary3p, drv_usectohz(CPQARY3_TICKTMOUT_VALUE)); - cleanstatus |= CPQARY3_TICK_TMOUT_REGD; - /* Register Software Interrupt Handler */ if (ddi_add_softintr(dip, DDI_SOFTINT_HIGH, &cpqary3p->cpqary3_softintr_id, &cpqary3p->sw_iblock_cookie, NULL, @@ -439,6 +434,13 @@ cpqary3_attach(dev_info_t *dip, ddi_attach_cmd_t attach_cmd) if (cpqary3p->host_support & 0x4) cpqary3_lockup_intr_onoff(cpqary3p, CPQARY3_LOCKUP_INTR_ENABLE); + /* + * Register a periodic function to be called every 15 seconds. + */ + cpqary3p->cpq_periodic = ddi_periodic_add(cpqary3_periodic, cpqary3p, + 15 * NANOSEC, DDI_IPL_0); + cleanstatus |= CPQARY3_TICK_TMOUT_REGD; + /* * We have come with hmaeventd - which logs the storage events on * console as well as in IML. 
So we are commenting the NOE support in @@ -661,17 +663,16 @@ cpqary3_cleanup(cpqary3_t *cpqary3p, uint32_t status) * any register/memory mapping */ + if ((status & CPQARY3_TICK_TMOUT_REGD) && cpqary3p->cpq_periodic) { + ddi_periodic_delete(cpqary3p->cpq_periodic); + } + if (status & CPQARY3_INTR_HDLR_SET) ddi_remove_intr(cpqary3p->dip, 0, cpqary3p->hw_iblock_cookie); if (status & CPQARY3_SW_INTR_HDLR_SET) ddi_remove_softintr(cpqary3p->cpqary3_softintr_id); - if ((status & CPQARY3_TICK_TMOUT_REGD) && cpqary3p->tick_tmout_id) { - VERIFY(untimeout(cpqary3p->tick_tmout_id) >= 0); - cpqary3p->tick_tmout_id = NULL; - } - if (status & CPQARY3_CREATE_MINOR_NODE) { (void) sprintf(node_name, "cpqary3%d", cpqary3p->instance); ddi_remove_minor_node(cpqary3p->dip, node_name); diff --git a/usr/src/uts/common/io/cpqary3/cpqary3.h b/usr/src/uts/common/io/cpqary3/cpqary3.h index f7c229eb69..3f35d34ca4 100644 --- a/usr/src/uts/common/io/cpqary3/cpqary3.h +++ b/usr/src/uts/common/io/cpqary3/cpqary3.h @@ -84,8 +84,6 @@ extern "C" { #define CPQARY3_CLEAN_ALL 0x0FFF -#define CPQARY3_TICKTMOUT_VALUE 180000000 /* 180 seconds */ - /* * Defines for Maximum and Default Settings. 
*/ @@ -155,6 +153,7 @@ extern "C" { #define RETURN_VOID_IF_NULL(x) if (NULL == x) return #define RETURN_NULL_IF_NULL(x) if (NULL == x) return (NULL) #define RETURN_FAILURE_IF_NULL(x) if (NULL == x) return (CPQARY3_FAILURE) +#define CPQARY3_SEC2HZ(x) drv_usectohz((x) * 1000000) /* * Macros for memory allocation/deallocations @@ -283,10 +282,12 @@ typedef struct cpqary3_per_controller { /* Controller Specific Information */ int8_t hba_name[38]; ulong_t num_of_targets; - uint32_t heartbeat; uint32_t board_id; cpqary3_bd_t *bddef; + uint32_t cpq_last_heartbeat; + clock_t cpq_last_heartbeat_lbolt; + /* Condition Variables used */ kcondvar_t cv_immediate_wait; kcondvar_t cv_noe_wait; @@ -309,7 +310,7 @@ typedef struct cpqary3_per_controller { kmutex_t sw_mutex; /* s/w mutex */ ddi_softintr_t cpqary3_softintr_id; /* s/w intr identifier */ uint8_t swintr_flag; - timeout_id_t tick_tmout_id; /* timeout identifier */ + ddi_periodic_t cpq_periodic; uint8_t cpqary3_tick_hdlr; scsi_hba_tran_t *hba_tran; /* transport structure */ cpqary3_cmdmemlist_t *cmdmemlistp; /* database - Memory Pool */ @@ -452,7 +453,7 @@ typedef struct cpqary3_ioctlreq { void cpqary3_init_hbatran(cpqary3_t *); void cpqary3_read_conf_file(dev_info_t *, cpqary3_t *); -void cpqary3_tick_hdlr(void *); +void cpqary3_periodic(void *); void cpqary3_flush_cache(cpqary3_t *); void cpqary3_intr_onoff(cpqary3_t *, uint8_t); void cpqary3_lockup_intr_onoff(cpqary3_t *, uint8_t); @@ -487,6 +488,7 @@ void cpqary3_synccmd_free(cpqary3_t *, cpqary3_cmdpvt_t *); int cpqary3_synccmd_send(cpqary3_t *, cpqary3_cmdpvt_t *, clock_t, int); uint8_t cpqary3_poll_retrieve(cpqary3_t *cpqary3p, uint32_t poll_tag); uint8_t cpqary3_build_cmdlist(cpqary3_cmdpvt_t *cpqary3_cmdpvtp, uint32_t tid); +void cpqary3_lockup_check(cpqary3_t *); #ifdef __cplusplus } diff --git a/usr/src/uts/common/io/cpqary3/cpqary3_isr.c b/usr/src/uts/common/io/cpqary3/cpqary3_isr.c index d62e93159d..76e5242c9e 100644 --- 
a/usr/src/uts/common/io/cpqary3/cpqary3_isr.c +++ b/usr/src/uts/common/io/cpqary3/cpqary3_isr.c @@ -50,44 +50,14 @@ cpqary3_hw_isr(caddr_t per_ctlr) */ if (cpqary3p->check_ctlr_intr(cpqary3p) != CPQARY3_SUCCESS) { /* - * The Outbound Post List FIFO is not empty, so we must - * service this interrupt. + * Check to see if the firmware has come to rest. If it has, + * this routine will panic the system. */ - goto service; - } + cpqary3_lockup_check(cpqary3p); - if (CPQARY3_FAILURE == cpqary3p->check_ctlr_intr(cpqary3p)) { - if (cpqary3p->heartbeat == - DDI_GET32(cpqary3p, &ctp->HeartBeat)) { - if (0x2 & ddi_get32(cpqary3p->odr_handle, - (uint32_t *)cpqary3p->odr)) { - spr0 = ddi_get32(cpqary3p->spr0_handle, - (uint32_t *)cpqary3p->spr0); - spr0 = spr0 >> 16; - cmn_err(CE_WARN, "CPQary3 : %s HBA firmware " - "Locked !!! Lockup Code: 0x%x", - cpqary3p->hba_name, spr0); - cmn_err(CE_WARN, "CPQary3 : Please reboot " - "the system"); - ddi_put32(cpqary3p->odr_cl_handle, - (uint32_t *)cpqary3p->odr_cl, 0x2); - cpqary3_intr_onoff(cpqary3p, - CPQARY3_INTR_DISABLE); - if (cpqary3p->host_support & 0x4) { - cpqary3_lockup_intr_onoff(cpqary3p, - CPQARY3_LOCKUP_INTR_DISABLE); - } - cpqary3p->controller_lockup = CPQARY3_TRUE; - } - return (DDI_INTR_CLAIMED); - } return (DDI_INTR_UNCLAIMED); } -service: - - /* PERF */ - /* * We decided that we will have only one retrieve function for * both simple and performant mode. 
To achieve this we have to mimic diff --git a/usr/src/uts/common/io/cpqary3/cpqary3_talk2ctlr.c b/usr/src/uts/common/io/cpqary3/cpqary3_talk2ctlr.c index 2382ac22c5..7275c656a1 100644 --- a/usr/src/uts/common/io/cpqary3/cpqary3_talk2ctlr.c +++ b/usr/src/uts/common/io/cpqary3/cpqary3_talk2ctlr.c @@ -638,7 +638,6 @@ cpqary3_init_ctlr(cpqary3_t *cpqary3p) * Zero the Upper 32 Address in the Controller */ DDI_PUT32(cpqary3p, &ctp->HostWrite.Upper32Addr, 0x00000000); - cpqary3p->heartbeat = DDI_GET32(cpqary3p, &ctp->HeartBeat); /* Set the controller interrupt check routine */ cpqary3p->check_ctlr_intr = cpqary3_check_simple_ctlr_intr; @@ -815,7 +814,6 @@ cpqary3_init_ctlr(cpqary3_t *cpqary3p) */ DDI_PUT32(cpqary3p, &ctp->HostWrite.Upper32Addr, 0x00000000); - cpqary3p->heartbeat = DDI_GET32(cpqary3p, &ctp->HeartBeat); /* Set the controller interrupt check routine */ @@ -838,6 +836,14 @@ cpqary3_init_ctlr(cpqary3_t *cpqary3p) DDI_GET32(cpqary3p, &ctp->HostDrvrSupport); } + /* + * Read initial controller heartbeat value and mark the current + * reading time. + */ + cpqary3p->cpq_last_heartbeat = ddi_get32(cpqary3p->ct_handle, + &ctp->HeartBeat); + cpqary3p->cpq_last_heartbeat_lbolt = ddi_get_lbolt(); + return (CPQARY3_SUCCESS); } diff --git a/usr/src/uts/common/io/cpqary3/cpqary3_util.c b/usr/src/uts/common/io/cpqary3/cpqary3_util.c index b4a017d5ce..ed8323e387 100644 --- a/usr/src/uts/common/io/cpqary3/cpqary3_util.c +++ b/usr/src/uts/common/io/cpqary3/cpqary3_util.c @@ -86,111 +86,112 @@ cpqary3_read_conf_file(dev_info_t *dip, cpqary3_t *cpqary3p) void cpqary3_lockup_check(cpqary3_t *cpq) { + /* + * Read the current controller heartbeat value. 
+ */ + uint32_t heartbeat = ddi_get32(cpq->ct_handle, &cpq->ct->HeartBeat); + + /* + * Check to see if the value is the same as last time we looked: + */ + if (heartbeat != cpq->cpq_last_heartbeat) { + /* + * The heartbeat value has changed, which suggests that the + * firmware in the controller has not yet come to a complete + * stop. Record the new value, as well as the current time. + */ + cpq->cpq_last_heartbeat = heartbeat; + cpq->cpq_last_heartbeat_lbolt = ddi_get_lbolt(); + return; + } + + /* + * The controller _might_ have been able to signal to us that it + * has locked up. This is a truly unfathomable state of affairs: + * If the firmware can tell it has flown off the rails, why not + * simply reset the controller? + */ + uint32_t odr = ddi_get32(cpq->odr_handle, cpq->odr); + uint32_t spr = ddi_get32(cpq->spr0_handle, cpq->spr0); + if ((odr & CISS_ODR_BIT_LOCKUP) != 0) { + dev_err(cpq->dip, CE_PANIC, "HP SmartArray firmware has " + "reported a critical fault (odr %08x spr %08x)", + odr, spr); + } + + clock_t expiry = cpq->cpq_last_heartbeat_lbolt + CPQARY3_SEC2HZ(60); + if (ddi_get_lbolt() >= expiry) { + dev_err(cpq->dip, CE_PANIC, "HP SmartArray firmware has " + "stopped responding (odr %08x spr %08x)", + odr, spr); + } } /* - * Function : cpqary3_tick_hdlr - * Description : This routine is called once in 60 seconds to detect any + * Function : cpqary3_periodic + * Description : This routine is called once in 15 seconds to detect any * command that is pending with the controller and has * timed out. - * Once invoked, it re-initializes itself such that it is - * invoked after an interval of 60 seconds. 
* Called By : kernel * Parameters : per_controller * Calls : None * Return Values: None */ void -cpqary3_tick_hdlr(void *arg) +cpqary3_periodic(void *arg) { - clock_t cpqary3_lbolt; - clock_t cpqary3_ticks; - cpqary3_t *ctlr; - cpqary3_pkt_t *pktp; - struct scsi_pkt *scsi_pktp; - cpqary3_cmdpvt_t *local; - volatile CfgTable_t *ctp; - uint32_t i; - uint32_t no_cmds = 0; + cpqary3_t *cpq = arg; + uint32_t no_cmds; - /* - * The per-controller shall be passed as argument. - * Read the HeartBeat of the controller. - * if the current heartbeat is the same as the one recorded earlier, - * the f/w has locked up!!! - */ + cpqary3_lockup_check(cpq); - if (NULL == (ctlr = (cpqary3_t *)arg)) - return; - - ctp = (CfgTable_t *)ctlr->ct; - - /* CONTROLLER_LOCKUP */ - if (ctlr->heartbeat == DDI_GET32(ctlr, &ctp->HeartBeat)) { - if (ctlr->lockup_logged == CPQARY3_FALSE) { - cmn_err(CE_WARN, "CPQary3 : " - "%s HBA firmware Locked !!!", ctlr->hba_name); - cmn_err(CE_WARN, "CPQary3 : " - "Please reboot the system"); - cpqary3_intr_onoff(ctlr, CPQARY3_INTR_DISABLE); - if (ctlr->host_support & 0x4) - cpqary3_lockup_intr_onoff(ctlr, - CPQARY3_LOCKUP_INTR_DISABLE); - ctlr->controller_lockup = CPQARY3_TRUE; - ctlr->lockup_logged = CPQARY3_TRUE; - } - } - /* CONTROLLER_LOCKUP */ - no_cmds = (uint32_t)((ctlr->ctlr_maxcmds / 3) * - NO_OF_CMDLIST_IN_A_BLK); - mutex_enter(&ctlr->sw_mutex); + mutex_enter(&cpq->sw_mutex); + no_cmds = (uint32_t)((cpq->ctlr_maxcmds / 3) * NO_OF_CMDLIST_IN_A_BLK); + for (uint32_t i = 0; i < no_cmds; i++) { + cpqary3_cmdpvt_t *local = &cpq->cmdmemlistp->pool[i]; + cpqary3_pkt_t *pktp; + struct scsi_pkt *scsi_pktp; + clock_t cpqary3_lbolt; - for (i = 0; i < no_cmds; i++) { - local = &ctlr->cmdmemlistp->pool[i]; ASSERT(local != NULL); - pktp = MEM2PVTPKT(local); - - if (!pktp) + if ((pktp = MEM2PVTPKT(local)) == NULL) { continue; + } if ((local->cmdpvt_flag == CPQARY3_TIMEOUT) || (local->cmdpvt_flag == CPQARY3_RESET)) { continue; } - if (local->occupied == 
CPQARY3_OCCUPIED) { - scsi_pktp = pktp->scsi_cmd_pkt; - cpqary3_lbolt = ddi_get_lbolt(); - if ((scsi_pktp) && (scsi_pktp->pkt_time)) { - cpqary3_ticks = cpqary3_lbolt - - pktp->cmd_start_time; - - if ((drv_hztousec(cpqary3_ticks)/1000000) > - scsi_pktp->pkt_time) { - scsi_pktp->pkt_reason = CMD_TIMEOUT; - scsi_pktp->pkt_statistics = - STAT_TIMEOUT; - scsi_pktp->pkt_state = STATE_GOT_BUS | - STATE_GOT_TARGET | STATE_SENT_CMD; - local->cmdpvt_flag = CPQARY3_TIMEOUT; - - /* This should always be the case */ - if (scsi_pktp->pkt_comp) { - mutex_exit(&ctlr->sw_mutex); - (*scsi_pktp->pkt_comp) - (scsi_pktp); - mutex_enter(&ctlr->sw_mutex); - continue; - } + if (local->occupied != CPQARY3_OCCUPIED) { + continue; + } + + scsi_pktp = pktp->scsi_cmd_pkt; + cpqary3_lbolt = ddi_get_lbolt(); + if ((scsi_pktp) && (scsi_pktp->pkt_time)) { + clock_t cpqary3_ticks = cpqary3_lbolt - + pktp->cmd_start_time; + + if ((drv_hztousec(cpqary3_ticks) / 1000000) > + scsi_pktp->pkt_time) { + scsi_pktp->pkt_reason = CMD_TIMEOUT; + scsi_pktp->pkt_statistics = STAT_TIMEOUT; + scsi_pktp->pkt_state = STATE_GOT_BUS | + STATE_GOT_TARGET | STATE_SENT_CMD; + local->cmdpvt_flag = CPQARY3_TIMEOUT; + + /* This should always be the case */ + if (scsi_pktp->pkt_comp != NULL) { + mutex_exit(&cpq->sw_mutex); + (*scsi_pktp->pkt_comp)(scsi_pktp); + mutex_enter(&cpq->sw_mutex); + continue; } } } } - - ctlr->heartbeat = DDI_GET32(ctlr, &ctp->HeartBeat); - mutex_exit(&ctlr->sw_mutex); - ctlr->tick_tmout_id = timeout(cpqary3_tick_hdlr, - (caddr_t)ctlr, drv_usectohz(CPQARY3_TICKTMOUT_VALUE)); + mutex_exit(&cpq->sw_mutex); } /* -- cgit v1.2.3