diff options
| author | Winson Wang - Sun Microsystems - Beijing China <Zhen.W@Sun.COM> | 2009-12-21 08:56:45 +0800 |
|---|---|---|
| committer | Winson Wang - Sun Microsystems - Beijing China <Zhen.W@Sun.COM> | 2009-12-21 08:56:45 +0800 |
| commit | 3a84c50f71e0942a55e90913f6b44878d5062621 (patch) | |
| tree | bf84e71dd463e80a2718d2b6acaf85fe765f5a1b /usr/src | |
| parent | 05cffdd1e5bacc2be9e9bd3721a3af9fe838769a (diff) | |
| download | illumos-gate-3a84c50f71e0942a55e90913f6b44878d5062621.tar.gz | |
6888015 rge tx performance with RTL8168B is very low and cpu usage is high.
6905195 rge opackets stats is wrong and using pci_lcap_locate to check whether it is the pcie card
6906394 rge reads write-only register
6906401 rge will hang because of reading corrupted tx descriptors
6906408 rge using pci MMIO regs (rather than IO) will reduce cpu usage
Diffstat (limited to 'usr/src')
| -rw-r--r-- | usr/src/uts/common/io/rge/rge.h | 44 | ||||
| -rw-r--r-- | usr/src/uts/common/io/rge/rge_chip.c | 182 | ||||
| -rw-r--r-- | usr/src/uts/common/io/rge/rge_hw.h | 6 | ||||
| -rw-r--r-- | usr/src/uts/common/io/rge/rge_main.c | 2 | ||||
| -rw-r--r-- | usr/src/uts/common/io/rge/rge_rxtx.c | 50 |
5 files changed, 194 insertions, 90 deletions
diff --git a/usr/src/uts/common/io/rge/rge.h b/usr/src/uts/common/io/rge/rge.h index 4a58da1c92..d6236ab5d7 100644 --- a/usr/src/uts/common/io/rge/rge.h +++ b/usr/src/uts/common/io/rge/rge.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -36,6 +36,7 @@ extern "C" { #include <sys/strsubr.h> #include <sys/stat.h> #include <sys/pci.h> +#include <sys/pci_cap.h> #include <sys/note.h> #include <sys/modctl.h> #include <sys/kstat.h> @@ -316,19 +317,23 @@ typedef struct sw_sbd { } sw_sbd_t; -#define HW_RBD_INIT(rbd, slot) \ - rbd->flags_len |= RGE_BSWAP_32(BD_FLAG_HW_OWN); \ - rbd->vlan_tag = 0; \ - if (slot == (RGE_RECV_SLOTS -1)) \ - rbd->flags_len |= RGE_BSWAP_32(BD_FLAG_EOR); -#define HW_SBD_INIT(sbd, slot) \ - sbd->flags_len = 0; \ - if (slot == (RGE_SEND_SLOTS -1)) \ - sbd->flags_len |= RGE_BSWAP_32(BD_FLAG_EOR); -#define HW_SBD_SET(sbd, slot) \ - sbd->flags_len |= RGE_BSWAP_32(SBD_FLAG_TX_PKT); \ - if (slot == (RGE_SEND_SLOTS -1)) \ - sbd->flags_len |= RGE_BSWAP_32(BD_FLAG_EOR); +#define HW_RBD_INIT(rbd, slot) { \ + (rbd)->vlan_tag = 0; \ + if ((slot) == (RGE_RECV_SLOTS -1)) { \ + (rbd)->flags_len |= \ + RGE_BSWAP_32(BD_FLAG_EOR | BD_FLAG_HW_OWN); \ + } else { \ + (rbd)->flags_len |= RGE_BSWAP_32(BD_FLAG_HW_OWN); \ + } \ +} +#define HW_SBD_SET(sbd, slot) { \ + if ((slot) == (RGE_SEND_SLOTS -1)) { \ + (sbd)->flags_len |= \ + RGE_BSWAP_32(BD_FLAG_EOR | SBD_FLAG_TX_PKT); \ + } else { \ + (sbd)->flags_len |= RGE_BSWAP_32(SBD_FLAG_TX_PKT); \ + } \ +} /* * Describes the characteristics of a specific chip @@ -478,6 +483,17 @@ typedef struct rge { enum rge_chip_state rge_chip_state; /* definitions above */ boolean_t suspended; + + /* + * Polling + */ +#define TX_COALESC max(RGE_BUF_SLOTS/32LL, 8) +#define RX_COALESC 8LL +#define CLK_TICK 100 + clock_t curr_tick; + clock_t tick_delta; + uint64_t last_opackets; + 
uint64_t last_rpackets; } rge_t; /* diff --git a/usr/src/uts/common/io/rge/rge_chip.c b/usr/src/uts/common/io/rge/rge_chip.c index f043cf5cb0..1bf894e3e2 100644 --- a/usr/src/uts/common/io/rge/rge_chip.c +++ b/usr/src/uts/common/io/rge/rge_chip.c @@ -703,20 +703,8 @@ rge_chip_ident(rge_t *rgep) val32 = rge_reg_get32(rgep, TX_CONFIG_REG); val32 &= HW_VERSION_ID_0 | HW_VERSION_ID_1; chip->mac_ver = val32; - switch (chip->mac_ver) { - case MAC_VER_8168: - case MAC_VER_8168B_B: - case MAC_VER_8168B_C: - case MAC_VER_8168C: - case MAC_VER_8101E: - case MAC_VER_8101E_B: - chip->is_pcie = B_TRUE; - break; - - default: - chip->is_pcie = B_FALSE; - break; - } + chip->is_pcie = pci_lcap_locate(rgep->cfg_handle, + PCI_CAP_ID_PCI_E, &val16) == DDI_SUCCESS; /* * Read and record PHY version @@ -775,6 +763,11 @@ rge_chip_ident(rge_t *rgep) chip->rxconfig = RX_CONFIG_DEFAULT; chip->txconfig = TX_CONFIG_DEFAULT; + /* interval to update statistics for polling mode */ + rgep->tick_delta = drv_usectohz(1000*1000/CLK_TICK); + + /* ensure we are not in polling mode */ + rgep->curr_tick = ddi_get_lbolt() - 2*rgep->tick_delta; RGE_TRACE(("%s: MAC version = %x, PHY version = %x", rgep->ifname, chip->mac_ver, chip->phy_ver)); } @@ -884,34 +877,31 @@ rge_chip_init(rge_t *rgep) uint32_t *hashp; chip_id_t *chip = &rgep->chipid; - if (chip->is_pcie) { - /* - * Increase the threshold voltage of RX sensitivity - */ - if (chip->mac_ver != MAC_VER_8168 && - chip->mac_ver != MAC_VER_8168C && - chip->mac_ver != MAC_VER_8101E_B) - rge_ephy_put16(rgep, 0x01, 0x1bd3); + /* + * Increase the threshold voltage of RX sensitivity + */ + if (chip->mac_ver == MAC_VER_8168B_B || + chip->mac_ver == MAC_VER_8168B_C || + chip->mac_ver == MAC_VER_8101E || + chip->mac_ver == MAC_VER_8101E_C) { + rge_ephy_put16(rgep, 0x01, 0x1bd3); + } + if (chip->mac_ver == MAC_VER_8168 || + chip->mac_ver == MAC_VER_8168B_B) { val16 = rge_reg_get8(rgep, PHY_STATUS_REG); val16 = 0x12<<8 | val16; - if (rgep->chipid.mac_ver != 
MAC_VER_8101E && - rgep->chipid.mac_ver != MAC_VER_8101E_B && - rgep->chipid.mac_ver != MAC_VER_8101E_C && - rgep->chipid.mac_ver != MAC_VER_8168B_C && - rgep->chipid.mac_ver != MAC_VER_8168C) { - rge_reg_put16(rgep, PHY_STATUS_REG, val16); - rge_reg_put32(rgep, RT_CSI_DATA_REG, 0x00021c01); - rge_reg_put32(rgep, RT_CSI_ACCESS_REG, 0x8000f088); - rge_reg_put32(rgep, RT_CSI_DATA_REG, 0x00004000); - rge_reg_put32(rgep, RT_CSI_ACCESS_REG, 0x8000f0b0); - rge_reg_put32(rgep, RT_CSI_ACCESS_REG, 0x0000f068); - val32 = rge_reg_get32(rgep, RT_CSI_DATA_REG); - val32 |= 0x7000; - val32 &= 0xffff5fff; - rge_reg_put32(rgep, RT_CSI_DATA_REG, val32); - rge_reg_put32(rgep, RT_CSI_ACCESS_REG, 0x8000f068); - } + rge_reg_put16(rgep, PHY_STATUS_REG, val16); + rge_reg_put32(rgep, RT_CSI_DATA_REG, 0x00021c01); + rge_reg_put32(rgep, RT_CSI_ACCESS_REG, 0x8000f088); + rge_reg_put32(rgep, RT_CSI_DATA_REG, 0x00004000); + rge_reg_put32(rgep, RT_CSI_ACCESS_REG, 0x8000f0b0); + rge_reg_put32(rgep, RT_CSI_ACCESS_REG, 0x0000f068); + val32 = rge_reg_get32(rgep, RT_CSI_DATA_REG); + val32 |= 0x7000; + val32 &= 0xffff5fff; + rge_reg_put32(rgep, RT_CSI_DATA_REG, val32); + rge_reg_put32(rgep, RT_CSI_ACCESS_REG, 0x8000f068); } /* @@ -1067,6 +1057,9 @@ rge_chip_start(rge_t *rgep) * Enable interrupt */ rgep->int_mask = RGE_INT_MASK; + if (rgep->chipid.is_pcie) { + rgep->int_mask |= NO_TXDESC_INT; + } rge_reg_put16(rgep, INT_MASK_REG, rgep->int_mask); /* @@ -1288,7 +1281,7 @@ void rge_tx_trigger(rge_t *rgep); void rge_tx_trigger(rge_t *rgep) { - rge_reg_set8(rgep, TX_RINGS_POLL_REG, NORMAL_TX_RING_POLL); + rge_reg_put8(rgep, TX_RINGS_POLL_REG, NORMAL_TX_RING_POLL); } void rge_hw_stats_dump(rge_t *rgep); @@ -1345,6 +1338,14 @@ rge_intr(caddr_t arg1, caddr_t arg2) { rge_t *rgep = (rge_t *)arg1; uint16_t int_status; + clock_t now; + uint32_t tx_pkts; + uint32_t rx_pkts; + uint32_t poll_rate; + uint32_t opt_pkts; + uint32_t opt_intrs; + boolean_t update_int_mask = B_FALSE; + uint32_t itimer; 
_NOTE(ARGUNUSED(arg2)) @@ -1370,11 +1371,93 @@ rge_intr(caddr_t arg1, caddr_t arg2) * Clear interrupt * For PCIE chipset, we need disable interrupt first. */ - if (rgep->chipid.is_pcie) + if (rgep->chipid.is_pcie) { rge_reg_put16(rgep, INT_MASK_REG, INT_MASK_NONE); + update_int_mask = B_TRUE; + } rge_reg_put16(rgep, INT_STATUS_REG, int_status); /* + * Calculate optimal polling interval + */ + now = ddi_get_lbolt(); + if (now - rgep->curr_tick >= rgep->tick_delta && + (rgep->param_link_speed == RGE_SPEED_1000M || + rgep->param_link_speed == RGE_SPEED_100M)) { + /* number of rx and tx packets in the last tick */ + tx_pkts = rgep->stats.opackets - rgep->last_opackets; + rx_pkts = rgep->stats.rpackets - rgep->last_rpackets; + + rgep->last_opackets = rgep->stats.opackets; + rgep->last_rpackets = rgep->stats.rpackets; + + /* restore interrupt mask */ + rgep->int_mask |= TX_OK_INT | RX_OK_INT; + if (rgep->chipid.is_pcie) { + rgep->int_mask |= NO_TXDESC_INT; + } + + /* optimal number of packets in a tick */ + if (rgep->param_link_speed == RGE_SPEED_1000M) { + opt_pkts = (1000*1000*1000/8)/ETHERMTU/CLK_TICK; + } else { + opt_pkts = (100*1000*1000/8)/ETHERMTU/CLK_TICK; + } + + /* + * calculate polling interval based on rx and tx packets + * in the last tick + */ + poll_rate = 0; + if (now - rgep->curr_tick < 2*rgep->tick_delta) { + opt_intrs = opt_pkts/TX_COALESC; + if (tx_pkts > opt_intrs) { + poll_rate = max(tx_pkts/TX_COALESC, opt_intrs); + rgep->int_mask &= ~(TX_OK_INT | NO_TXDESC_INT); + } + + opt_intrs = opt_pkts/RX_COALESC; + if (rx_pkts > opt_intrs) { + opt_intrs = max(rx_pkts/RX_COALESC, opt_intrs); + poll_rate = max(opt_intrs, poll_rate); + rgep->int_mask &= ~RX_OK_INT; + } + /* ensure poll_rate reasonable */ + poll_rate = min(poll_rate, opt_pkts*4); + } + + if (poll_rate) { + /* move to polling mode */ + if (rgep->chipid.is_pcie) { + itimer = (TIMER_CLK_PCIE/CLK_TICK)/poll_rate; + } else { + itimer = (TIMER_CLK_PCI/CLK_TICK)/poll_rate; + } + } else { + /* move to 
normal mode */ + itimer = 0; + } + RGE_DEBUG(("%s: poll: itimer:%d int_mask:0x%x", + __func__, itimer, rgep->int_mask)); + rge_reg_put32(rgep, TIMER_INT_REG, itimer); + + /* update timestamp for statistics */ + rgep->curr_tick = now; + + /* reset timer */ + int_status |= TIME_OUT_INT; + + update_int_mask = B_TRUE; + } + + if (int_status & TIME_OUT_INT) { + rge_reg_put32(rgep, TIMER_COUNT_REG, 0); + } + + /* flush post writes */ + (void) rge_reg_get16(rgep, INT_STATUS_REG); + + /* * Cable link change interrupt */ if (int_status & LINK_CHANGE_INT) { @@ -1390,9 +1473,22 @@ rge_intr(caddr_t arg1, caddr_t arg2) rge_receive(rgep); /* - * Re-enable interrupt for PCIE chipset + * Transmit interrupt + */ + if (int_status & TX_ERR_INT) { + RGE_REPORT((rgep, "tx error happened, resetting the chip ")); + mutex_enter(rgep->genlock); + rgep->rge_chip_state = RGE_CHIP_ERROR; + mutex_exit(rgep->genlock); + } else if ((rgep->chipid.is_pcie && (int_status & NO_TXDESC_INT)) || + ((int_status & TX_OK_INT) && rgep->tx_free < RGE_SEND_SLOTS/8)) { + (void) ddi_intr_trigger_softint(rgep->resched_hdl, NULL); + } + + /* + * Re-enable interrupt for PCIE chipset or install new int_mask */ - if (rgep->chipid.is_pcie) + if (update_int_mask) rge_reg_put16(rgep, INT_MASK_REG, rgep->int_mask); return (DDI_INTR_CLAIMED); /* indicate it was our interrupt */ diff --git a/usr/src/uts/common/io/rge/rge_hw.h b/usr/src/uts/common/io/rge/rge_hw.h index 2bc6241149..eee4d26337 100644 --- a/usr/src/uts/common/io/rge/rge_hw.h +++ b/usr/src/uts/common/io/rge/rge_hw.h @@ -136,7 +136,9 @@ extern "C" { #define INT_MASK_NONE 0x0000 #define RGE_RX_INT (RX_OK_INT | RX_ERR_INT | \ NO_RXDESC_INT) -#define RGE_INT_MASK (RGE_RX_INT | LINK_CHANGE_INT) +#define RGE_INT_MASK (TX_OK_INT | TX_ERR_INT | \ + RGE_RX_INT | LINK_CHANGE_INT | \ + TIME_OUT_INT) /* * Transmit configuration register @@ -213,6 +215,8 @@ extern "C" { * Timer count register */ #define TIMER_COUNT_REG 0x0048 +#define TIMER_CLK_PCIE (125*1000*1000) 
+#define TIMER_CLK_PCI (33*1000*1000) /* * Missed packet counter: indicates the number of packets diff --git a/usr/src/uts/common/io/rge/rge_main.c b/usr/src/uts/common/io/rge/rge_main.c index e181e1ae87..1e2eab2342 100644 --- a/usr/src/uts/common/io/rge/rge_main.c +++ b/usr/src/uts/common/io/rge/rge_main.c @@ -1638,7 +1638,7 @@ rge_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd) /* * Map operating registers */ - err = ddi_regs_map_setup(devinfo, 1, &regs, + err = ddi_regs_map_setup(devinfo, 2, &regs, 0, 0, &rge_reg_accattr, &rgep->io_handle); if (err != DDI_SUCCESS) { rge_problem(rgep, "ddi_regs_map_setup() failed"); diff --git a/usr/src/uts/common/io/rge/rge_rxtx.c b/usr/src/uts/common/io/rge/rge_rxtx.c index 09d23825d3..916b2e8aef 100644 --- a/usr/src/uts/common/io/rge/rge_rxtx.c +++ b/usr/src/uts/common/io/rge/rge_rxtx.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -611,8 +611,6 @@ rge_send(rge_t *rgep, mblk_t *mp) { struct ether_vlan_header *ehp; uint16_t tci; - rge_hw_stats_t *bstp; - uint8_t counter; ASSERT(mp->b_next == NULL); @@ -623,7 +621,6 @@ rge_send(rge_t *rgep, mblk_t *mp) RGE_DEBUG(("rge_send: no free slots")); rgep->stats.defer++; rgep->resched_needed = B_TRUE; - (void) ddi_intr_trigger_softint(rgep->resched_hdl, NULL); return (B_FALSE); } @@ -651,35 +648,9 @@ rge_send(rge_t *rgep, mblk_t *mp) mutex_enter(rgep->tx_lock); if (--rgep->tx_flow == 0) { DMA_SYNC(rgep->tx_desc, DDI_DMA_SYNC_FORDEV); - rge_tx_trigger(rgep); - rgep->stats.opackets ++; - if (rgep->tx_free < RGE_SEND_SLOTS/2) - rge_send_recycle(rgep); rgep->tc_tail = rgep->tx_next; - - /* - * It's observed that in current Realtek PCI-E chips, tx - * request of the second fragment for upper layer packets - * will be ignored if the hardware transmission is in - * progress and will not be processed when the tx engine - * is idle. 
So one solution is to re-issue the requests - * if the hardware and the software tx packets statistics - * are inconsistent. - */ - if (rgep->chipid.is_pcie && rgep->stats.tx_pre_ismax) { - for (counter = 0; counter < 10; counter ++) { - mutex_enter(rgep->genlock); - rge_hw_stats_dump(rgep); - mutex_exit(rgep->genlock); - bstp = rgep->hw_stats; - if (rgep->stats.opackets - != RGE_BSWAP_64(bstp->rcv_ok)) - rge_tx_trigger(rgep); - else - break; - } - } } + rgep->stats.opackets++; mutex_exit(rgep->tx_lock); return (B_TRUE); @@ -695,6 +666,19 @@ rge_reschedule(caddr_t arg1, caddr_t arg2) rge_send_recycle(rgep); + if (rgep->chipid.is_pcie && rgep->tx_free != RGE_SEND_SLOTS) { + /* + * It's observed that in current Realtek PCI-E chips, tx + * request of the second fragment for upper layer packets + * will be ignored if the hardware transmission is in + * progress and will not be processed when the tx engine + * is idle. So one solution is to re-issue the requests + * if there are untransmitted packets after tx interrupts + * occur. + */ + rge_tx_trigger(rgep); + } + return (DDI_INTR_CLAIMED); } @@ -706,6 +690,7 @@ rge_m_tx(void *arg, mblk_t *mp) { rge_t *rgep = arg; /* private device info */ mblk_t *next; + mblk_t *mp_org = mp; ASSERT(mp != NULL); @@ -728,6 +713,9 @@ rge_m_tx(void *arg, mblk_t *mp) mp = next; } + if (mp != mp_org) { + rge_tx_trigger(rgep); + } rw_exit(rgep->errlock); return (mp); |
