summaryrefslogtreecommitdiff
path: root/usr/src
diff options
context:
space:
mode:
author Winson Wang - Sun Microsystems - Beijing China <Zhen.W@Sun.COM> 2009-12-21 08:56:45 +0800
committer Winson Wang - Sun Microsystems - Beijing China <Zhen.W@Sun.COM> 2009-12-21 08:56:45 +0800
commit 3a84c50f71e0942a55e90913f6b44878d5062621 (patch)
tree bf84e71dd463e80a2718d2b6acaf85fe765f5a1b /usr/src
parent 05cffdd1e5bacc2be9e9bd3721a3af9fe838769a (diff)
download illumos-gate-3a84c50f71e0942a55e90913f6b44878d5062621.tar.gz
6888015 rge tx performance with RTL8168B is very low and cpu usage is high.
6905195 rge opackets stats is wrong and using pci_lcap_locate to check whether it is the pcie card
6906394 rge reads write-only register
6906401 rge will hang because of reading corrupted tx descriptors
6906408 rge using pci MMIO regs (rather than IO) will reduce cpu usage
Diffstat (limited to 'usr/src')
-rw-r--r-- usr/src/uts/common/io/rge/rge.h 44
-rw-r--r-- usr/src/uts/common/io/rge/rge_chip.c 182
-rw-r--r-- usr/src/uts/common/io/rge/rge_hw.h 6
-rw-r--r-- usr/src/uts/common/io/rge/rge_main.c 2
-rw-r--r-- usr/src/uts/common/io/rge/rge_rxtx.c 50
5 files changed, 194 insertions, 90 deletions
diff --git a/usr/src/uts/common/io/rge/rge.h b/usr/src/uts/common/io/rge/rge.h
index 4a58da1c92..d6236ab5d7 100644
--- a/usr/src/uts/common/io/rge/rge.h
+++ b/usr/src/uts/common/io/rge/rge.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -36,6 +36,7 @@ extern "C" {
#include <sys/strsubr.h>
#include <sys/stat.h>
#include <sys/pci.h>
+#include <sys/pci_cap.h>
#include <sys/note.h>
#include <sys/modctl.h>
#include <sys/kstat.h>
@@ -316,19 +317,23 @@ typedef struct sw_sbd {
} sw_sbd_t;
-#define HW_RBD_INIT(rbd, slot) \
- rbd->flags_len |= RGE_BSWAP_32(BD_FLAG_HW_OWN); \
- rbd->vlan_tag = 0; \
- if (slot == (RGE_RECV_SLOTS -1)) \
- rbd->flags_len |= RGE_BSWAP_32(BD_FLAG_EOR);
-#define HW_SBD_INIT(sbd, slot) \
- sbd->flags_len = 0; \
- if (slot == (RGE_SEND_SLOTS -1)) \
- sbd->flags_len |= RGE_BSWAP_32(BD_FLAG_EOR);
-#define HW_SBD_SET(sbd, slot) \
- sbd->flags_len |= RGE_BSWAP_32(SBD_FLAG_TX_PKT); \
- if (slot == (RGE_SEND_SLOTS -1)) \
- sbd->flags_len |= RGE_BSWAP_32(BD_FLAG_EOR);
+#define HW_RBD_INIT(rbd, slot) { \
+ (rbd)->vlan_tag = 0; \
+ if ((slot) == (RGE_RECV_SLOTS -1)) { \
+ (rbd)->flags_len |= \
+ RGE_BSWAP_32(BD_FLAG_EOR | BD_FLAG_HW_OWN); \
+ } else { \
+ (rbd)->flags_len |= RGE_BSWAP_32(BD_FLAG_HW_OWN); \
+ } \
+}
+#define HW_SBD_SET(sbd, slot) { \
+ if ((slot) == (RGE_SEND_SLOTS -1)) { \
+ (sbd)->flags_len |= \
+ RGE_BSWAP_32(BD_FLAG_EOR | SBD_FLAG_TX_PKT); \
+ } else { \
+ (sbd)->flags_len |= RGE_BSWAP_32(SBD_FLAG_TX_PKT); \
+ } \
+}
/*
* Describes the characteristics of a specific chip
@@ -478,6 +483,17 @@ typedef struct rge {
enum rge_chip_state rge_chip_state; /* definitions above */
boolean_t suspended;
+
+ /*
+ * Polling
+ */
+#define TX_COALESC max(RGE_BUF_SLOTS/32LL, 8)
+#define RX_COALESC 8LL
+#define CLK_TICK 100
+ clock_t curr_tick;
+ clock_t tick_delta;
+ uint64_t last_opackets;
+ uint64_t last_rpackets;
} rge_t;
/*
diff --git a/usr/src/uts/common/io/rge/rge_chip.c b/usr/src/uts/common/io/rge/rge_chip.c
index f043cf5cb0..1bf894e3e2 100644
--- a/usr/src/uts/common/io/rge/rge_chip.c
+++ b/usr/src/uts/common/io/rge/rge_chip.c
@@ -703,20 +703,8 @@ rge_chip_ident(rge_t *rgep)
val32 = rge_reg_get32(rgep, TX_CONFIG_REG);
val32 &= HW_VERSION_ID_0 | HW_VERSION_ID_1;
chip->mac_ver = val32;
- switch (chip->mac_ver) {
- case MAC_VER_8168:
- case MAC_VER_8168B_B:
- case MAC_VER_8168B_C:
- case MAC_VER_8168C:
- case MAC_VER_8101E:
- case MAC_VER_8101E_B:
- chip->is_pcie = B_TRUE;
- break;
-
- default:
- chip->is_pcie = B_FALSE;
- break;
- }
+ chip->is_pcie = pci_lcap_locate(rgep->cfg_handle,
+ PCI_CAP_ID_PCI_E, &val16) == DDI_SUCCESS;
/*
* Read and record PHY version
@@ -775,6 +763,11 @@ rge_chip_ident(rge_t *rgep)
chip->rxconfig = RX_CONFIG_DEFAULT;
chip->txconfig = TX_CONFIG_DEFAULT;
+ /* interval to update statistics for polling mode */
+ rgep->tick_delta = drv_usectohz(1000*1000/CLK_TICK);
+
+ /* ensure we are not in polling mode */
+ rgep->curr_tick = ddi_get_lbolt() - 2*rgep->tick_delta;
RGE_TRACE(("%s: MAC version = %x, PHY version = %x",
rgep->ifname, chip->mac_ver, chip->phy_ver));
}
@@ -884,34 +877,31 @@ rge_chip_init(rge_t *rgep)
uint32_t *hashp;
chip_id_t *chip = &rgep->chipid;
- if (chip->is_pcie) {
- /*
- * Increase the threshold voltage of RX sensitivity
- */
- if (chip->mac_ver != MAC_VER_8168 &&
- chip->mac_ver != MAC_VER_8168C &&
- chip->mac_ver != MAC_VER_8101E_B)
- rge_ephy_put16(rgep, 0x01, 0x1bd3);
+ /*
+ * Increase the threshold voltage of RX sensitivity
+ */
+ if (chip->mac_ver == MAC_VER_8168B_B ||
+ chip->mac_ver == MAC_VER_8168B_C ||
+ chip->mac_ver == MAC_VER_8101E ||
+ chip->mac_ver == MAC_VER_8101E_C) {
+ rge_ephy_put16(rgep, 0x01, 0x1bd3);
+ }
+ if (chip->mac_ver == MAC_VER_8168 ||
+ chip->mac_ver == MAC_VER_8168B_B) {
val16 = rge_reg_get8(rgep, PHY_STATUS_REG);
val16 = 0x12<<8 | val16;
- if (rgep->chipid.mac_ver != MAC_VER_8101E &&
- rgep->chipid.mac_ver != MAC_VER_8101E_B &&
- rgep->chipid.mac_ver != MAC_VER_8101E_C &&
- rgep->chipid.mac_ver != MAC_VER_8168B_C &&
- rgep->chipid.mac_ver != MAC_VER_8168C) {
- rge_reg_put16(rgep, PHY_STATUS_REG, val16);
- rge_reg_put32(rgep, RT_CSI_DATA_REG, 0x00021c01);
- rge_reg_put32(rgep, RT_CSI_ACCESS_REG, 0x8000f088);
- rge_reg_put32(rgep, RT_CSI_DATA_REG, 0x00004000);
- rge_reg_put32(rgep, RT_CSI_ACCESS_REG, 0x8000f0b0);
- rge_reg_put32(rgep, RT_CSI_ACCESS_REG, 0x0000f068);
- val32 = rge_reg_get32(rgep, RT_CSI_DATA_REG);
- val32 |= 0x7000;
- val32 &= 0xffff5fff;
- rge_reg_put32(rgep, RT_CSI_DATA_REG, val32);
- rge_reg_put32(rgep, RT_CSI_ACCESS_REG, 0x8000f068);
- }
+ rge_reg_put16(rgep, PHY_STATUS_REG, val16);
+ rge_reg_put32(rgep, RT_CSI_DATA_REG, 0x00021c01);
+ rge_reg_put32(rgep, RT_CSI_ACCESS_REG, 0x8000f088);
+ rge_reg_put32(rgep, RT_CSI_DATA_REG, 0x00004000);
+ rge_reg_put32(rgep, RT_CSI_ACCESS_REG, 0x8000f0b0);
+ rge_reg_put32(rgep, RT_CSI_ACCESS_REG, 0x0000f068);
+ val32 = rge_reg_get32(rgep, RT_CSI_DATA_REG);
+ val32 |= 0x7000;
+ val32 &= 0xffff5fff;
+ rge_reg_put32(rgep, RT_CSI_DATA_REG, val32);
+ rge_reg_put32(rgep, RT_CSI_ACCESS_REG, 0x8000f068);
}
/*
@@ -1067,6 +1057,9 @@ rge_chip_start(rge_t *rgep)
* Enable interrupt
*/
rgep->int_mask = RGE_INT_MASK;
+ if (rgep->chipid.is_pcie) {
+ rgep->int_mask |= NO_TXDESC_INT;
+ }
rge_reg_put16(rgep, INT_MASK_REG, rgep->int_mask);
/*
@@ -1288,7 +1281,7 @@ void rge_tx_trigger(rge_t *rgep);
void
rge_tx_trigger(rge_t *rgep)
{
- rge_reg_set8(rgep, TX_RINGS_POLL_REG, NORMAL_TX_RING_POLL);
+ rge_reg_put8(rgep, TX_RINGS_POLL_REG, NORMAL_TX_RING_POLL);
}
void rge_hw_stats_dump(rge_t *rgep);
@@ -1345,6 +1338,14 @@ rge_intr(caddr_t arg1, caddr_t arg2)
{
rge_t *rgep = (rge_t *)arg1;
uint16_t int_status;
+ clock_t now;
+ uint32_t tx_pkts;
+ uint32_t rx_pkts;
+ uint32_t poll_rate;
+ uint32_t opt_pkts;
+ uint32_t opt_intrs;
+ boolean_t update_int_mask = B_FALSE;
+ uint32_t itimer;
_NOTE(ARGUNUSED(arg2))
@@ -1370,11 +1371,93 @@ rge_intr(caddr_t arg1, caddr_t arg2)
* Clear interrupt
* For PCIE chipset, we need disable interrupt first.
*/
- if (rgep->chipid.is_pcie)
+ if (rgep->chipid.is_pcie) {
rge_reg_put16(rgep, INT_MASK_REG, INT_MASK_NONE);
+ update_int_mask = B_TRUE;
+ }
rge_reg_put16(rgep, INT_STATUS_REG, int_status);
/*
+ * Calculate optimal polling interval
+ */
+ now = ddi_get_lbolt();
+ if (now - rgep->curr_tick >= rgep->tick_delta &&
+ (rgep->param_link_speed == RGE_SPEED_1000M ||
+ rgep->param_link_speed == RGE_SPEED_100M)) {
+ /* number of rx and tx packets in the last tick */
+ tx_pkts = rgep->stats.opackets - rgep->last_opackets;
+ rx_pkts = rgep->stats.rpackets - rgep->last_rpackets;
+
+ rgep->last_opackets = rgep->stats.opackets;
+ rgep->last_rpackets = rgep->stats.rpackets;
+
+ /* restore interrupt mask */
+ rgep->int_mask |= TX_OK_INT | RX_OK_INT;
+ if (rgep->chipid.is_pcie) {
+ rgep->int_mask |= NO_TXDESC_INT;
+ }
+
+ /* optimal number of packets in a tick */
+ if (rgep->param_link_speed == RGE_SPEED_1000M) {
+ opt_pkts = (1000*1000*1000/8)/ETHERMTU/CLK_TICK;
+ } else {
+ opt_pkts = (100*1000*1000/8)/ETHERMTU/CLK_TICK;
+ }
+
+ /*
+ * calculate polling interval based on rx and tx packets
+ * in the last tick
+ */
+ poll_rate = 0;
+ if (now - rgep->curr_tick < 2*rgep->tick_delta) {
+ opt_intrs = opt_pkts/TX_COALESC;
+ if (tx_pkts > opt_intrs) {
+ poll_rate = max(tx_pkts/TX_COALESC, opt_intrs);
+ rgep->int_mask &= ~(TX_OK_INT | NO_TXDESC_INT);
+ }
+
+ opt_intrs = opt_pkts/RX_COALESC;
+ if (rx_pkts > opt_intrs) {
+ opt_intrs = max(rx_pkts/RX_COALESC, opt_intrs);
+ poll_rate = max(opt_intrs, poll_rate);
+ rgep->int_mask &= ~RX_OK_INT;
+ }
+ /* ensure poll_rate reasonable */
+ poll_rate = min(poll_rate, opt_pkts*4);
+ }
+
+ if (poll_rate) {
+ /* move to polling mode */
+ if (rgep->chipid.is_pcie) {
+ itimer = (TIMER_CLK_PCIE/CLK_TICK)/poll_rate;
+ } else {
+ itimer = (TIMER_CLK_PCI/CLK_TICK)/poll_rate;
+ }
+ } else {
+ /* move to normal mode */
+ itimer = 0;
+ }
+ RGE_DEBUG(("%s: poll: itimer:%d int_mask:0x%x",
+ __func__, itimer, rgep->int_mask));
+ rge_reg_put32(rgep, TIMER_INT_REG, itimer);
+
+ /* update timestamp for statistics */
+ rgep->curr_tick = now;
+
+ /* reset timer */
+ int_status |= TIME_OUT_INT;
+
+ update_int_mask = B_TRUE;
+ }
+
+ if (int_status & TIME_OUT_INT) {
+ rge_reg_put32(rgep, TIMER_COUNT_REG, 0);
+ }
+
+ /* flush post writes */
+ (void) rge_reg_get16(rgep, INT_STATUS_REG);
+
+ /*
* Cable link change interrupt
*/
if (int_status & LINK_CHANGE_INT) {
@@ -1390,9 +1473,22 @@ rge_intr(caddr_t arg1, caddr_t arg2)
rge_receive(rgep);
/*
- * Re-enable interrupt for PCIE chipset
+ * Transmit interrupt
+ */
+ if (int_status & TX_ERR_INT) {
+ RGE_REPORT((rgep, "tx error happened, resetting the chip "));
+ mutex_enter(rgep->genlock);
+ rgep->rge_chip_state = RGE_CHIP_ERROR;
+ mutex_exit(rgep->genlock);
+ } else if ((rgep->chipid.is_pcie && (int_status & NO_TXDESC_INT)) ||
+ ((int_status & TX_OK_INT) && rgep->tx_free < RGE_SEND_SLOTS/8)) {
+ (void) ddi_intr_trigger_softint(rgep->resched_hdl, NULL);
+ }
+
+ /*
+ * Re-enable interrupt for PCIE chipset or install new int_mask
*/
- if (rgep->chipid.is_pcie)
+ if (update_int_mask)
rge_reg_put16(rgep, INT_MASK_REG, rgep->int_mask);
return (DDI_INTR_CLAIMED); /* indicate it was our interrupt */
diff --git a/usr/src/uts/common/io/rge/rge_hw.h b/usr/src/uts/common/io/rge/rge_hw.h
index 2bc6241149..eee4d26337 100644
--- a/usr/src/uts/common/io/rge/rge_hw.h
+++ b/usr/src/uts/common/io/rge/rge_hw.h
@@ -136,7 +136,9 @@ extern "C" {
#define INT_MASK_NONE 0x0000
#define RGE_RX_INT (RX_OK_INT | RX_ERR_INT | \
NO_RXDESC_INT)
-#define RGE_INT_MASK (RGE_RX_INT | LINK_CHANGE_INT)
+#define RGE_INT_MASK (TX_OK_INT | TX_ERR_INT | \
+ RGE_RX_INT | LINK_CHANGE_INT | \
+ TIME_OUT_INT)
/*
* Transmit configuration register
@@ -213,6 +215,8 @@ extern "C" {
* Timer count register
*/
#define TIMER_COUNT_REG 0x0048
+#define TIMER_CLK_PCIE (125*1000*1000)
+#define TIMER_CLK_PCI (33*1000*1000)
/*
* Missed packet counter: indicates the number of packets
diff --git a/usr/src/uts/common/io/rge/rge_main.c b/usr/src/uts/common/io/rge/rge_main.c
index e181e1ae87..1e2eab2342 100644
--- a/usr/src/uts/common/io/rge/rge_main.c
+++ b/usr/src/uts/common/io/rge/rge_main.c
@@ -1638,7 +1638,7 @@ rge_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
/*
* Map operating registers
*/
- err = ddi_regs_map_setup(devinfo, 1, &regs,
+ err = ddi_regs_map_setup(devinfo, 2, &regs,
0, 0, &rge_reg_accattr, &rgep->io_handle);
if (err != DDI_SUCCESS) {
rge_problem(rgep, "ddi_regs_map_setup() failed");
diff --git a/usr/src/uts/common/io/rge/rge_rxtx.c b/usr/src/uts/common/io/rge/rge_rxtx.c
index 09d23825d3..916b2e8aef 100644
--- a/usr/src/uts/common/io/rge/rge_rxtx.c
+++ b/usr/src/uts/common/io/rge/rge_rxtx.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -611,8 +611,6 @@ rge_send(rge_t *rgep, mblk_t *mp)
{
struct ether_vlan_header *ehp;
uint16_t tci;
- rge_hw_stats_t *bstp;
- uint8_t counter;
ASSERT(mp->b_next == NULL);
@@ -623,7 +621,6 @@ rge_send(rge_t *rgep, mblk_t *mp)
RGE_DEBUG(("rge_send: no free slots"));
rgep->stats.defer++;
rgep->resched_needed = B_TRUE;
- (void) ddi_intr_trigger_softint(rgep->resched_hdl, NULL);
return (B_FALSE);
}
@@ -651,35 +648,9 @@ rge_send(rge_t *rgep, mblk_t *mp)
mutex_enter(rgep->tx_lock);
if (--rgep->tx_flow == 0) {
DMA_SYNC(rgep->tx_desc, DDI_DMA_SYNC_FORDEV);
- rge_tx_trigger(rgep);
- rgep->stats.opackets ++;
- if (rgep->tx_free < RGE_SEND_SLOTS/2)
- rge_send_recycle(rgep);
rgep->tc_tail = rgep->tx_next;
-
- /*
- * It's observed that in current Realtek PCI-E chips, tx
- * request of the second fragment for upper layer packets
- * will be ignored if the hardware transmission is in
- * progress and will not be processed when the tx engine
- * is idle. So one solution is to re-issue the requests
- * if the hardware and the software tx packets statistics
- * are inconsistent.
- */
- if (rgep->chipid.is_pcie && rgep->stats.tx_pre_ismax) {
- for (counter = 0; counter < 10; counter ++) {
- mutex_enter(rgep->genlock);
- rge_hw_stats_dump(rgep);
- mutex_exit(rgep->genlock);
- bstp = rgep->hw_stats;
- if (rgep->stats.opackets
- != RGE_BSWAP_64(bstp->rcv_ok))
- rge_tx_trigger(rgep);
- else
- break;
- }
- }
}
+ rgep->stats.opackets++;
mutex_exit(rgep->tx_lock);
return (B_TRUE);
@@ -695,6 +666,19 @@ rge_reschedule(caddr_t arg1, caddr_t arg2)
rge_send_recycle(rgep);
+ if (rgep->chipid.is_pcie && rgep->tx_free != RGE_SEND_SLOTS) {
+ /*
+ * It's observed that in current Realtek PCI-E chips, tx
+ * request of the second fragment for upper layer packets
+ * will be ignored if the hardware transmission is in
+ * progress and will not be processed when the tx engine
+ * is idle. So one solution is to re-issue the requests
+ * if there are untransmitted packets after tx interrupts
+ * occur.
+ */
+ rge_tx_trigger(rgep);
+ }
+
return (DDI_INTR_CLAIMED);
}
@@ -706,6 +690,7 @@ rge_m_tx(void *arg, mblk_t *mp)
{
rge_t *rgep = arg; /* private device info */
mblk_t *next;
+ mblk_t *mp_org = mp;
ASSERT(mp != NULL);
@@ -728,6 +713,9 @@ rge_m_tx(void *arg, mblk_t *mp)
mp = next;
}
+ if (mp != mp_org) {
+ rge_tx_trigger(rgep);
+ }
rw_exit(rgep->errlock);
return (mp);