Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/man/man7d/mlxcx.7d                |  39
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx.c       |  75
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx.conf    |  20
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx.h       |  56
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx_gld.c   |  82
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx_intr.c  | 353
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx_reg.h   |  23
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx_ring.c  | 380
8 files changed, 706 insertions(+), 322 deletions(-)
diff --git a/usr/src/man/man7d/mlxcx.7d b/usr/src/man/man7d/mlxcx.7d index 5373b5bec5..d7b0cf8ad9 100644 --- a/usr/src/man/man7d/mlxcx.7d +++ b/usr/src/man/man7d/mlxcx.7d @@ -11,7 +11,7 @@ .\" .\" Copyright 2020 the University of Queensland .\" -.Dd January 17, 2020 +.Dd April 9, 2020 .Dt MLXCX 7D .Os .Sh NAME @@ -94,8 +94,11 @@ property determines the number of entries on Completion Queues for the device. The number of entries is calculated as .Li (1 << cq_size_shift) , so a value of 9 would mean 512 entries are created on each Event Queue. -The default value is -.Sy 10 . +The default value is device dependent, +.Sy 10 +for devices with maximum supported speed of 10Gb/s or less and +.Sy 12 +for devices with higher supported speeds. This should be kept very close to the value set for .Sy rq_size_shift and @@ -116,8 +119,11 @@ The number of descriptors is calculated as .Dv (1 << rq_size_shift) , so a value of 9 would mean 512 descriptors are created on each Receive Queue. This sets the number of packets on RX rings advertised to MAC. -The default value is -.Sy 10 . +The default value is device dependent, +.Sy 10 +for devices with maximum supported speed of 10Gb/s or less and +.Sy 12 +for devices with higher supported speeds. .Ed .It Sy sq_size_shift .Bd -filled -compact @@ -134,8 +140,11 @@ The number of descriptors is calculated as .Dv (1 << sq_size_shift) , so a value of 9 would mean 512 descriptors are created on each Send Queue. This sets the number of packets on RX rings advertised to MAC. -The default value is -.Sy 11 . +The default value is device dependent, +.Sy 11 +for devices with maximum supported speed of 10Gb/s or less and +.Sy 13 +for devices with higher supported speeds. Note that large packets often occupy more than one descriptor slot on the SQ, so it is sometimes a good idea to increase this if using a large MTU. .Ed @@ -325,6 +334,22 @@ is seldom worth using them for small packets. The default value is .Sy 2048 . .Ed +.It Sy rx_limit_per_completion +.Bd -filled -compact +Minimum: +.Sy 16 | +Maximum: +.Sy 4096 +.Ed +.Bd -filled +The +.Sy rx_limit_per_completion +property determines the maximum number of packets that +will be processed on a given completion ring during a single interrupt. +This is done to try and guarantee some amount of liveness in the system. +The default value is +.Sy 256 . +.Ed .El .Sh FILES .Bl -tag -width Pa diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.c b/usr/src/uts/common/io/mlxcx/mlxcx.c index 12a8d52b3f..c90fa0969b 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx.c @@ -12,6 +12,7 @@ /* * Copyright 2020, The University of Queensland * Copyright (c) 2018, Joyent, Inc. + * Copyright 2020 RackTop Systems, Inc. */ /* @@ -453,23 +454,68 @@ uint_t mlxcx_doorbell_tries = MLXCX_DOORBELL_TRIES_DFLT; uint_t mlxcx_stuck_intr_count = MLXCX_STUCK_INTR_COUNT_DFLT; static void -mlxcx_load_props(mlxcx_t *mlxp) +mlxcx_load_prop_defaults(mlxcx_t *mlxp) { mlxcx_drv_props_t *p = &mlxp->mlx_props; + mlxcx_port_t *port = &mlxp->mlx_ports[0]; + + VERIFY((mlxp->mlx_attach & MLXCX_ATTACH_PORTS) != 0); + VERIFY((mlxp->mlx_attach & (MLXCX_ATTACH_CQS | MLXCX_ATTACH_WQS)) == 0); + + /* + * Currently we have different queue size defaults for two + * categories of queues. One set for devices which support a + * maximum speed of 10Gb/s, and another for those above that. 
+ */ + if ((port->mlp_max_proto & (MLXCX_PROTO_25G | MLXCX_PROTO_40G | + MLXCX_PROTO_50G | MLXCX_PROTO_100G)) != 0) { + p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_25G; + p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_25G; + p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_25G; + } else if ((port->mlp_max_proto & (MLXCX_PROTO_100M | MLXCX_PROTO_1G | + MLXCX_PROTO_10G)) != 0) { + p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT; + p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT; + p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT; + } else { + mlxcx_warn(mlxp, "Encountered a port with a speed we don't " + "recognize. Proto: 0x%x", port->mlp_max_proto); + p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT; + p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT; + p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT; + } +} + +/* + * Properties which may have different defaults based on hardware + * characteristics. + */ +static void +mlxcx_load_model_props(mlxcx_t *mlxp) +{ + mlxcx_drv_props_t *p = &mlxp->mlx_props; + + mlxcx_load_prop_defaults(mlxp); - p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, - DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift", - MLXCX_EQ_SIZE_SHIFT_DFLT); p->mldp_cq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cq_size_shift", - MLXCX_CQ_SIZE_SHIFT_DFLT); + p->mldp_cq_size_shift_default); p->mldp_sq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "sq_size_shift", - MLXCX_SQ_SIZE_SHIFT_DFLT); + p->mldp_sq_size_shift_default); p->mldp_rq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rq_size_shift", - MLXCX_RQ_SIZE_SHIFT_DFLT); + p->mldp_rq_size_shift_default); +} + +static void +mlxcx_load_props(mlxcx_t *mlxp) +{ + mlxcx_drv_props_t *p = &mlxp->mlx_props; + p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, + DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift", + MLXCX_EQ_SIZE_SHIFT_DFLT); p->mldp_cqemod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_period_usec", MLXCX_CQEMOD_PERIOD_USEC_DFLT); @@ -521,6 +567,19 @@ mlxcx_load_props(mlxcx_t *mlxp) p->mldp_wq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "wq_check_interval_sec", MLXCX_WQ_CHECK_INTERVAL_SEC_DFLT); + + p->mldp_rx_per_cq = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, + DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_limit_per_completion", + MLXCX_RX_PER_CQ_DEFAULT); + + if (p->mldp_rx_per_cq < MLXCX_RX_PER_CQ_MIN || + p->mldp_rx_per_cq > MLXCX_RX_PER_CQ_MAX) { + mlxcx_warn(mlxp, "!rx_limit_per_completion = %u is " + "out of range. Defaulting to: %d. Valid values are from " + "%d to %d", p->mldp_rx_per_cq, MLXCX_RX_PER_CQ_DEFAULT, + MLXCX_RX_PER_CQ_MIN, MLXCX_RX_PER_CQ_MAX); + p->mldp_rx_per_cq = MLXCX_RX_PER_CQ_DEFAULT; + } } void @@ -2595,6 +2654,8 @@ mlxcx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) } mlxp->mlx_attach |= MLXCX_ATTACH_PORTS; + mlxcx_load_model_props(mlxp); + /* * Set up, enable and arm the rest of the interrupt EQs which will * service events from CQs. diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.conf b/usr/src/uts/common/io/mlxcx/mlxcx.conf index 3569c4e5f5..321820a47b 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx.conf +++ b/usr/src/uts/common/io/mlxcx/mlxcx.conf @@ -12,6 +12,7 @@ # # Copyright 2018, Joyent, Inc. 
# Copyright 2020, The University of Queensland +# Copyright 2020 RackTop Systems, Inc. # # @@ -23,10 +24,15 @@ # Sizing of event and completion queues. # # The number of entries on each queue will be (1 << *_size_shift) -- so -# a value of 9 would mean 512 entries. +# a value of 10 would mean 1024 entries. # #eq_size_shift = 9; + +# The default for devices with a maximum supported speed up to 10Gb/s #cq_size_shift = 10; +# +# The default for devices with a maximum supported speed above 10Gb/s +#cq_size_shift = 12; # # Sizing of send and receive queues. @@ -35,8 +41,13 @@ # advertise to MAC. It also determines how many packet buffers we will allocate # when starting the interface. # +# The defaults for devices with a maximum supported speed up to 10Gb/s #sq_size_shift = 11; #rq_size_shift = 10; +# +# The defaults for devices with a maximum supported speed above 10Gb/s +#sq_size_shift = 13; +#rq_size_shift = 12; # # Number and configuration of TX groups and rings. @@ -99,3 +110,10 @@ #eq_check_interval_sec = 30; #cq_check_interval_sec = 300; #wq_check_interval_sec = 300; + +# +# To provide some level of moderation and aid latencies, after +# "rx_limit_per_completion" packets are received in a single completion +# event, the interrupt handler will pass the chain up the receive stack. +# +#rx_limit_per_completion = 256; diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.h b/usr/src/uts/common/io/mlxcx/mlxcx.h index 3b58989961..bf07691095 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx.h +++ b/usr/src/uts/common/io/mlxcx/mlxcx.h @@ -12,6 +12,7 @@ /* * Copyright 2020, The University of Queensland * Copyright (c) 2018, Joyent, Inc. + * Copyright 2020 RackTop Systems, Inc. */ /* @@ -32,6 +33,7 @@ #include <sys/ddifm.h> #include <sys/id_space.h> #include <sys/list.h> +#include <sys/taskq_impl.h> #include <sys/stddef.h> #include <sys/stream.h> #include <sys/strsun.h> @@ -89,18 +91,36 @@ extern "C" { * Queues will be sized to (1 << *Q_SIZE_SHIFT) entries long. */ #define MLXCX_EQ_SIZE_SHIFT_DFLT 9 + +/* + * The CQ, SQ and RQ sizes can effect throughput on higher speed interfaces. + * EQ less so, as it only takes a single EQ entry to indicate there are + * multiple completions on the CQ. + * + * Particularly on the Rx side, the RQ (and corresponding CQ) would run + * low on available entries. A symptom of this is the refill taskq running + * frequently. A larger RQ (and CQ) alleviates this, and as there is a + * close relationship between SQ and CQ size, the SQ is increased too. + */ #define MLXCX_CQ_SIZE_SHIFT_DFLT 10 +#define MLXCX_CQ_SIZE_SHIFT_25G 12 /* * Default to making SQs bigger than RQs for 9k MTU, since most packets will * spill over into more than one slot. RQ WQEs are always 1 slot. */ #define MLXCX_SQ_SIZE_SHIFT_DFLT 11 +#define MLXCX_SQ_SIZE_SHIFT_25G 13 + #define MLXCX_RQ_SIZE_SHIFT_DFLT 10 +#define MLXCX_RQ_SIZE_SHIFT_25G 12 #define MLXCX_CQ_HWM_GAP 16 #define MLXCX_CQ_LWM_GAP 24 +#define MLXCX_WQ_HWM_GAP MLXCX_CQ_HWM_GAP +#define MLXCX_WQ_LWM_GAP MLXCX_CQ_LWM_GAP + #define MLXCX_RQ_REFILL_STEP 64 /* @@ -135,6 +155,14 @@ extern "C" { #define MLXCX_CQ_CHECK_INTERVAL_SEC_DFLT 300 #define MLXCX_EQ_CHECK_INTERVAL_SEC_DFLT 30 +/* + * After this many packets, the packets received so far are passed to + * the mac layer. 
+ */ +#define MLXCX_RX_PER_CQ_DEFAULT 256 +#define MLXCX_RX_PER_CQ_MIN 16 +#define MLXCX_RX_PER_CQ_MAX 4096 + #define MLXCX_DOORBELL_TRIES_DFLT 3 extern uint_t mlxcx_doorbell_tries; @@ -417,6 +445,11 @@ typedef struct mlxcx_buffer { size_t mlb_used; mblk_t *mlb_tx_mp; + /* + * The number of work queue basic blocks this buf uses. + */ + uint_t mlb_wqebbs; + mlxcx_t *mlb_mlx; mlxcx_buffer_state_t mlb_state; uint_t mlb_wqe_index; @@ -495,6 +528,8 @@ typedef enum { MLXCX_WQ_DESTROYED = 1 << 3, MLXCX_WQ_TEARDOWN = 1 << 4, MLXCX_WQ_BUFFERS = 1 << 5, + MLXCX_WQ_REFILLING = 1 << 6, + MLXCX_WQ_BLOCKED_MAC = 1 << 7 } mlxcx_workq_state_t; typedef enum { @@ -540,12 +575,18 @@ struct mlxcx_work_queue { }; uint64_t mlwq_pc; /* producer counter */ + uint64_t mlwq_wqebb_used; + size_t mlwq_bufhwm; + size_t mlwq_buflwm; + mlxcx_dma_buffer_t mlwq_doorbell_dma; mlxcx_workq_doorbell_t *mlwq_doorbell; mlxcx_buf_shard_t *mlwq_bufs; mlxcx_buf_shard_t *mlwq_foreign_bufs; + taskq_ent_t mlwq_tqe; + boolean_t mlwq_fm_repd_qstate; }; @@ -773,6 +814,8 @@ struct mlxcx_ring_group { mlxcx_flow_group_t *mlg_rx_vlan_promisc_fg; list_t mlg_rx_vlans; + taskq_t *mlg_refill_tq; + /* * Flow table for separating out by protocol before hashing */ @@ -856,8 +899,11 @@ typedef struct { typedef struct { uint_t mldp_eq_size_shift; uint_t mldp_cq_size_shift; + uint_t mldp_cq_size_shift_default; uint_t mldp_rq_size_shift; + uint_t mldp_rq_size_shift_default; uint_t mldp_sq_size_shift; + uint_t mldp_sq_size_shift_default; uint_t mldp_cqemod_period_usec; uint_t mldp_cqemod_count; uint_t mldp_intrmod_period_usec; @@ -865,6 +911,7 @@ typedef struct { uint_t mldp_rx_ngroups_small; uint_t mldp_rx_nrings_per_large_group; uint_t mldp_rx_nrings_per_small_group; + uint_t mldp_rx_per_cq; uint_t mldp_tx_ngroups; uint_t mldp_tx_nrings_per_group; uint_t mldp_ftbl_root_size_shift; @@ -1098,6 +1145,7 @@ extern boolean_t mlxcx_intr_setup(mlxcx_t *); extern void mlxcx_intr_teardown(mlxcx_t *); extern void mlxcx_arm_eq(mlxcx_t *, mlxcx_event_queue_t *); extern void mlxcx_arm_cq(mlxcx_t *, mlxcx_completion_queue_t *); +extern void mlxcx_update_cqci(mlxcx_t *, mlxcx_completion_queue_t *); extern mblk_t *mlxcx_rx_poll(mlxcx_t *, mlxcx_completion_queue_t *, size_t); @@ -1109,8 +1157,6 @@ extern boolean_t mlxcx_register_mac(mlxcx_t *); /* * From mlxcx_ring.c */ -extern boolean_t mlxcx_cq_alloc_dma(mlxcx_t *, mlxcx_completion_queue_t *); -extern void mlxcx_cq_rele_dma(mlxcx_t *, mlxcx_completion_queue_t *); extern boolean_t mlxcx_wq_alloc_dma(mlxcx_t *, mlxcx_work_queue_t *); extern void mlxcx_wq_rele_dma(mlxcx_t *, mlxcx_work_queue_t *); @@ -1118,7 +1164,7 @@ extern boolean_t mlxcx_buf_create(mlxcx_t *, mlxcx_buf_shard_t *, mlxcx_buffer_t **); extern boolean_t mlxcx_buf_create_foreign(mlxcx_t *, mlxcx_buf_shard_t *, mlxcx_buffer_t **); -extern void mlxcx_buf_take(mlxcx_t *, mlxcx_work_queue_t *, mlxcx_buffer_t **); +extern mlxcx_buffer_t *mlxcx_buf_take(mlxcx_t *, mlxcx_work_queue_t *); extern size_t mlxcx_buf_take_n(mlxcx_t *, mlxcx_work_queue_t *, mlxcx_buffer_t **, size_t); extern boolean_t mlxcx_buf_loan(mlxcx_t *, mlxcx_buffer_t *); @@ -1126,8 +1172,8 @@ extern void mlxcx_buf_return(mlxcx_t *, mlxcx_buffer_t *); extern void mlxcx_buf_return_chain(mlxcx_t *, mlxcx_buffer_t *, boolean_t); extern void mlxcx_buf_destroy(mlxcx_t *, mlxcx_buffer_t *); -extern boolean_t mlxcx_buf_bind_or_copy(mlxcx_t *, mlxcx_work_queue_t *, - mblk_t *, size_t, mlxcx_buffer_t **); +extern mlxcx_buffer_t *mlxcx_buf_bind_or_copy(mlxcx_t *, mlxcx_work_queue_t *, + mblk_t *, 
size_t); extern boolean_t mlxcx_rx_group_setup(mlxcx_t *, mlxcx_ring_group_t *); extern boolean_t mlxcx_tx_group_setup(mlxcx_t *, mlxcx_ring_group_t *); diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c index 7b01702376..a1d50659c1 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c @@ -430,15 +430,10 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp) } } - if (!mlxcx_buf_bind_or_copy(mlxp, sq, kmp, take, &b)) { - /* - * Something went really wrong, and we probably will never be - * able to TX again (all our buffers are broken and DMA is - * failing). Drop the packet on the floor -- FMA should be - * reporting this error elsewhere. - */ - freemsg(mp); - return (NULL); + b = mlxcx_buf_bind_or_copy(mlxp, sq, kmp, take); + if (b == NULL) { + atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC); + return (mp); } mutex_enter(&sq->mlwq_mtx); @@ -467,18 +462,20 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp) */ if (cq->mlcq_bufcnt >= cq->mlcq_bufhwm) { atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_BLOCKED_MAC); - mutex_exit(&sq->mlwq_mtx); - mlxcx_buf_return_chain(mlxp, b, B_TRUE); - return (mp); + goto blocked; + } + + if (sq->mlwq_wqebb_used >= sq->mlwq_bufhwm) { + atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC); + goto blocked; } ok = mlxcx_sq_add_buffer(mlxp, sq, inline_hdrs, inline_hdrlen, chkflags, b); if (!ok) { atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_BLOCKED_MAC); - mutex_exit(&sq->mlwq_mtx); - mlxcx_buf_return_chain(mlxp, b, B_TRUE); - return (mp); + atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC); + goto blocked; } /* @@ -493,6 +490,11 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp) mutex_exit(&sq->mlwq_mtx); return (NULL); + +blocked: + mutex_exit(&sq->mlwq_mtx); + mlxcx_buf_return_chain(mlxp, b, B_TRUE); + return (mp); } static int @@ -862,9 +864,8 @@ mlxcx_mac_ring_intr_disable(mac_intr_handle_t intrh) { mlxcx_completion_queue_t *cq = (mlxcx_completion_queue_t *)intrh; - atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_POLLING); mutex_enter(&cq->mlcq_mtx); - VERIFY(cq->mlcq_state & MLXCX_CQ_POLLING); + atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_POLLING); mutex_exit(&cq->mlcq_mtx); return (0); @@ -1061,56 +1062,43 @@ mlxcx_mac_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, case MAC_PROP_EN_100GFDX_CAP: mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); mac_prop_info_set_default_uint8(prh, - (port->mlp_oper_proto & - (MLXCX_PROTO_100GBASE_CR4 | MLXCX_PROTO_100GBASE_SR4 | - MLXCX_PROTO_100GBASE_KR4)) != 0); + (port->mlp_oper_proto & MLXCX_PROTO_100G) != 0); break; case MAC_PROP_ADV_50GFDX_CAP: case MAC_PROP_EN_50GFDX_CAP: mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); mac_prop_info_set_default_uint8(prh, - (port->mlp_oper_proto & - (MLXCX_PROTO_50GBASE_CR2 | MLXCX_PROTO_50GBASE_KR2 | - MLXCX_PROTO_50GBASE_SR2)) != 0); + (port->mlp_oper_proto & MLXCX_PROTO_50G) != 0); break; case MAC_PROP_ADV_40GFDX_CAP: case MAC_PROP_EN_40GFDX_CAP: mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); mac_prop_info_set_default_uint8(prh, - (port->mlp_oper_proto & - (MLXCX_PROTO_40GBASE_SR4 | MLXCX_PROTO_40GBASE_LR4_ER4 | - MLXCX_PROTO_40GBASE_CR4 | MLXCX_PROTO_40GBASE_KR4)) - != 0); + (port->mlp_oper_proto & MLXCX_PROTO_40G) != 0); break; case MAC_PROP_ADV_25GFDX_CAP: case MAC_PROP_EN_25GFDX_CAP: mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); mac_prop_info_set_default_uint8(prh, - (port->mlp_oper_proto & - (MLXCX_PROTO_25GBASE_CR | MLXCX_PROTO_25GBASE_KR | - MLXCX_PROTO_25GBASE_SR)) != 0); + (port->mlp_oper_proto & 
MLXCX_PROTO_25G) != 0); break; case MAC_PROP_ADV_10GFDX_CAP: case MAC_PROP_EN_10GFDX_CAP: mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); mac_prop_info_set_default_uint8(prh, - (port->mlp_oper_proto & - (MLXCX_PROTO_10GBASE_CX4 | MLXCX_PROTO_10GBASE_KX4 | - MLXCX_PROTO_10GBASE_KR | MLXCX_PROTO_10GBASE_CR | - MLXCX_PROTO_10GBASE_SR | MLXCX_PROTO_10GBASE_ER_LR)) != 0); + (port->mlp_oper_proto & MLXCX_PROTO_10G) != 0); break; case MAC_PROP_ADV_1000FDX_CAP: case MAC_PROP_EN_1000FDX_CAP: mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); mac_prop_info_set_default_uint8(prh, - (port->mlp_oper_proto & (MLXCX_PROTO_1000BASE_KX | - MLXCX_PROTO_SGMII)) != 0); + (port->mlp_oper_proto & MLXCX_PROTO_1G) != 0); break; case MAC_PROP_ADV_100FDX_CAP: case MAC_PROP_EN_100FDX_CAP: mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); mac_prop_info_set_default_uint8(prh, - (port->mlp_oper_proto & MLXCX_PROTO_SGMII_100BASE) != 0); + (port->mlp_oper_proto & MLXCX_PROTO_100M) != 0); break; default: break; @@ -1252,8 +1240,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, break; } *(uint8_t *)pr_val = (port->mlp_max_proto & - (MLXCX_PROTO_100GBASE_CR4 | MLXCX_PROTO_100GBASE_SR4 | - MLXCX_PROTO_100GBASE_KR4)) != 0; + MLXCX_PROTO_100G) != 0; break; case MAC_PROP_ADV_50GFDX_CAP: case MAC_PROP_EN_50GFDX_CAP: @@ -1262,8 +1249,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, break; } *(uint8_t *)pr_val = (port->mlp_max_proto & - (MLXCX_PROTO_50GBASE_CR2 | MLXCX_PROTO_50GBASE_KR2 | - MLXCX_PROTO_50GBASE_SR2)) != 0; + MLXCX_PROTO_50G) != 0; break; case MAC_PROP_ADV_40GFDX_CAP: case MAC_PROP_EN_40GFDX_CAP: @@ -1272,8 +1258,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, break; } *(uint8_t *)pr_val = (port->mlp_max_proto & - (MLXCX_PROTO_40GBASE_SR4 | MLXCX_PROTO_40GBASE_LR4_ER4 | - MLXCX_PROTO_40GBASE_CR4 | MLXCX_PROTO_40GBASE_KR4)) != 0; + MLXCX_PROTO_40G) != 0; break; case MAC_PROP_ADV_25GFDX_CAP: case MAC_PROP_EN_25GFDX_CAP: @@ -1282,8 +1267,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, break; } *(uint8_t *)pr_val = (port->mlp_max_proto & - (MLXCX_PROTO_25GBASE_CR | MLXCX_PROTO_25GBASE_KR | - MLXCX_PROTO_25GBASE_SR)) != 0; + MLXCX_PROTO_25G) != 0; break; case MAC_PROP_ADV_10GFDX_CAP: case MAC_PROP_EN_10GFDX_CAP: @@ -1292,9 +1276,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, break; } *(uint8_t *)pr_val = (port->mlp_max_proto & - (MLXCX_PROTO_10GBASE_CX4 | MLXCX_PROTO_10GBASE_KX4 | - MLXCX_PROTO_10GBASE_KR | MLXCX_PROTO_10GBASE_CR | - MLXCX_PROTO_10GBASE_SR | MLXCX_PROTO_10GBASE_ER_LR)) != 0; + MLXCX_PROTO_10G) != 0; break; case MAC_PROP_ADV_1000FDX_CAP: case MAC_PROP_EN_1000FDX_CAP: @@ -1303,7 +1285,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, break; } *(uint8_t *)pr_val = (port->mlp_max_proto & - (MLXCX_PROTO_1000BASE_KX | MLXCX_PROTO_SGMII)) != 0; + MLXCX_PROTO_1G) != 0; break; case MAC_PROP_ADV_100FDX_CAP: case MAC_PROP_EN_100FDX_CAP: @@ -1312,7 +1294,7 @@ mlxcx_mac_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, break; } *(uint8_t *)pr_val = (port->mlp_max_proto & - MLXCX_PROTO_SGMII_100BASE) != 0; + MLXCX_PROTO_100M) != 0; break; default: ret = ENOTSUP; diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_intr.c b/usr/src/uts/common/io/mlxcx/mlxcx_intr.c index 0516f86d6b..4dc4291b08 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_intr.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_intr.c @@ -11,6 +11,7 @@ /* * Copyright (c) 2020, the University 
of Queensland + * Copyright 2020 RackTop Systems, Inc. */ /* @@ -26,6 +27,11 @@ #include <mlxcx.h> +/* + * CTASSERT(s) to cover bad values which would induce bugs. + */ +CTASSERT(MLXCX_CQ_LWM_GAP >= MLXCX_CQ_HWM_GAP); + void mlxcx_intr_teardown(mlxcx_t *mlxp) { @@ -190,6 +196,31 @@ mlxcx_cq_next(mlxcx_completion_queue_t *mlcq) } void +mlxcx_update_cqci(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) +{ + ddi_fm_error_t err; + uint_t try = 0; + + mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc); + +retry: + MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV); + ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle, &err, + DDI_FME_VERSION); + if (err.fme_status != DDI_FM_OK) { + if (try++ < mlxcx_doorbell_tries) { + ddi_fm_dma_err_clear( + mlcq->mlcq_doorbell_dma.mxdb_dma_handle, + DDI_FME_VERSION); + goto retry; + } else { + ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST); + return; + } + } +} + +void mlxcx_arm_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) { bits32_t dbval = new_bits32(); @@ -538,14 +569,15 @@ mlxcx_intr_0(caddr_t arg, caddr_t arg2) if (!(mleq->mleq_state & MLXCX_EQ_ALLOC) || !(mleq->mleq_state & MLXCX_EQ_CREATED) || (mleq->mleq_state & MLXCX_EQ_DESTROYED)) { - mlxcx_warn(mlxp, "int0 on bad eq state"); + mlxcx_warn(mlxp, "int %d on bad eq state", + mleq->mleq_intr_index); mutex_exit(&mleq->mleq_mtx); return (DDI_INTR_UNCLAIMED); } ent = mlxcx_eq_next(mleq); if (ent == NULL) { - mlxcx_warn(mlxp, "spurious int 0?"); + mlxcx_warn(mlxp, "spurious int %d", mleq->mleq_intr_index); mutex_exit(&mleq->mleq_mtx); return (DDI_INTR_UNCLAIMED); } @@ -574,8 +606,8 @@ mlxcx_intr_0(caddr_t arg, caddr_t arg2) mlxcx_report_module_error(mlxp, &ent->mleqe_port_mod); break; default: - mlxcx_warn(mlxp, "unhandled event 0x%x on int0", - ent->mleqe_event_type); + mlxcx_warn(mlxp, "unhandled event 0x%x on int %d", + ent->mleqe_event_type, mleq->mleq_intr_index); } } @@ -591,46 +623,56 @@ mlxcx_intr_0(caddr_t arg, caddr_t arg2) return (DDI_INTR_CLAIMED); } -mblk_t * -mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim) +static boolean_t +mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp, + size_t bytelim) { - mlxcx_buffer_t *buf; - mblk_t *mp, *cmp, *nmp; + mlxcx_work_queue_t *wq = mlcq->mlcq_wq; mlxcx_completionq_ent_t *cent; + mblk_t *mp, *cmp, *nmp; + mlxcx_buffer_t *buf; + boolean_t found, added; size_t bytes = 0; - boolean_t found; - - ASSERT(mutex_owned(&mlcq->mlcq_mtx)); + uint_t rx_frames = 0; + uint_t comp_cnt = 0; + int64_t wqebbs, bufcnt; - ASSERT(mlcq->mlcq_wq != NULL); - ASSERT3U(mlcq->mlcq_wq->mlwq_type, ==, MLXCX_WQ_TYPE_RECVQ); + *mpp = NULL; if (!(mlcq->mlcq_state & MLXCX_CQ_ALLOC) || !(mlcq->mlcq_state & MLXCX_CQ_CREATED) || (mlcq->mlcq_state & MLXCX_CQ_DESTROYED) || (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)) { - return (NULL); + return (B_FALSE); } - ASSERT(mlcq->mlcq_state & MLXCX_CQ_POLLING); - nmp = cmp = mp = NULL; - cent = mlxcx_cq_next(mlcq); - for (; cent != NULL; cent = mlxcx_cq_next(mlcq)) { + wqebbs = 0; + bufcnt = 0; + for (cent = mlxcx_cq_next(mlcq); cent != NULL; + cent = mlxcx_cq_next(mlcq)) { /* * Teardown and ring stop can atomic_or this flag * into our state if they want us to stop early. 
*/ if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN) - break; + return (B_FALSE); + comp_cnt++; if (cent->mlcqe_opcode == MLXCX_CQE_OP_REQ && cent->mlcqe_send_wqe_opcode == MLXCX_WQE_OP_NOP) { /* NOP */ + atomic_dec_64(&wq->mlwq_wqebb_used); goto nextcq; } +lookagain: + /* + * Generally the buffer we're looking for will be + * at the front of the list, so this loop won't + * need to look far. + */ buf = list_head(&mlcq->mlcq_buffers); found = B_FALSE; while (buf != NULL) { @@ -641,36 +683,118 @@ mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim) } buf = list_next(&mlcq->mlcq_buffers, buf); } + if (!found) { + /* + * If there's any buffers waiting on the + * buffers_b list, then merge those into + * the main list and have another look. + * + * The wq enqueue routines push new buffers + * into buffers_b so that they can avoid + * taking the mlcq_mtx and blocking us for + * every single packet. + */ + added = B_FALSE; + mutex_enter(&mlcq->mlcq_bufbmtx); + if (!list_is_empty(&mlcq->mlcq_buffers_b)) { + list_move_tail(&mlcq->mlcq_buffers, + &mlcq->mlcq_buffers_b); + added = B_TRUE; + } + mutex_exit(&mlcq->mlcq_bufbmtx); + if (added) + goto lookagain; + buf = list_head(&mlcq->mlcq_buffers); mlxcx_warn(mlxp, "got completion on CQ %x but " "no buffer matching wqe found: %x (first " "buffer counter = %x)", mlcq->mlcq_num, from_be16(cent->mlcqe_wqe_counter), - buf == NULL ? UINT32_MAX : buf->mlb_wqe_index); + buf == NULL ? UINT32_MAX : + buf->mlb_wqe_index); mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE); goto nextcq; } + + /* + * The buf is likely to be freed below, count this now. + */ + wqebbs += buf->mlb_wqebbs; + list_remove(&mlcq->mlcq_buffers, buf); - atomic_dec_64(&mlcq->mlcq_bufcnt); + bufcnt++; - nmp = mlxcx_rx_completion(mlxp, mlcq, cent, buf); - if (nmp != NULL) { + switch (mlcq->mlcq_wq->mlwq_type) { + case MLXCX_WQ_TYPE_SENDQ: + mlxcx_tx_completion(mlxp, mlcq, cent, buf); + break; + case MLXCX_WQ_TYPE_RECVQ: + nmp = mlxcx_rx_completion(mlxp, mlcq, cent, buf); bytes += from_be32(cent->mlcqe_byte_cnt); - if (cmp != NULL) { - cmp->b_next = nmp; - cmp = nmp; - } else { - mp = cmp = nmp; + if (nmp != NULL) { + if (cmp != NULL) { + cmp->b_next = nmp; + cmp = nmp; + } else { + mp = cmp = nmp; + } + + rx_frames++; } + break; } -nextcq: - mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc); - if (bytelim != 0 && bytes > bytelim) + /* + * Update the consumer index with what has been processed, + * followed by driver counters. It is important to tell the + * hardware first, otherwise when we throw more packets at + * it, it may get an overflow error. + * We do this whenever we've processed enough to bridge the + * high->low water mark. + */ + if (bufcnt > (MLXCX_CQ_LWM_GAP - MLXCX_CQ_HWM_GAP)) { + mlxcx_update_cqci(mlxp, mlcq); + /* + * Both these variables are incremented using + * atomics as they are modified in other code paths + * (Eg during tx) which hold different locks. 
+ */ + atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt); + atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs); + wqebbs = 0; + bufcnt = 0; + comp_cnt = 0; + } +nextcq: + if (rx_frames > mlxp->mlx_props.mldp_rx_per_cq || + (bytelim != 0 && bytes > bytelim)) break; } + if (comp_cnt > 0) { + mlxcx_update_cqci(mlxp, mlcq); + atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt); + atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs); + } + + *mpp = mp; + return (B_TRUE); +} + + +mblk_t * +mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim) +{ + mblk_t *mp = NULL; + + ASSERT(mutex_owned(&mlcq->mlcq_mtx)); + + ASSERT(mlcq->mlcq_wq != NULL); + ASSERT3U(mlcq->mlcq_wq->mlwq_type, ==, MLXCX_WQ_TYPE_RECVQ); + + (void) mlxcx_process_cq(mlxp, mlcq, &mp, bytelim); + return (mp); } @@ -680,11 +804,10 @@ mlxcx_intr_n(caddr_t arg, caddr_t arg2) mlxcx_t *mlxp = (mlxcx_t *)arg; mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2; mlxcx_eventq_ent_t *ent; - mlxcx_completionq_ent_t *cent; mlxcx_completion_queue_t *mlcq, probe; - mlxcx_buffer_t *buf; - mblk_t *mp, *cmp, *nmp; - boolean_t found, tellmac = B_FALSE, added; + mlxcx_work_queue_t *mlwq; + mblk_t *mp = NULL; + boolean_t tellmac = B_FALSE; mutex_enter(&mleq->mleq_mtx); @@ -729,10 +852,12 @@ mlxcx_intr_n(caddr_t arg, caddr_t arg2) if (mlcq == NULL) continue; + mlwq = mlcq->mlcq_wq; + /* * The polling function might have the mutex and stop us from - * getting the lock here, so we increment the event counter - * atomically from outside. + * getting the lock in mlxcx_process_cq(), so we increment + * the event counter atomically from outside. * * This way at the end of polling when we go back to interrupts * from this CQ, the event counter is still correct. @@ -746,145 +871,57 @@ mlxcx_intr_n(caddr_t arg, caddr_t arg2) if (mutex_tryenter(&mlcq->mlcq_mtx) == 0) { /* - * If we failed to take the mutex because the polling - * function has it, just move on. We don't want to - * block other CQs behind this one. + * If we failed to take the mutex because the + * polling function has it, just move on. + * We don't want to block other CQs behind + * this one. */ if (mlcq->mlcq_state & MLXCX_CQ_POLLING) - continue; + goto update_eq; + /* Otherwise we will wait. */ mutex_enter(&mlcq->mlcq_mtx); } - if (!(mlcq->mlcq_state & MLXCX_CQ_ALLOC) || - !(mlcq->mlcq_state & MLXCX_CQ_CREATED) || - (mlcq->mlcq_state & MLXCX_CQ_DESTROYED) || - (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN) || - (mlcq->mlcq_state & MLXCX_CQ_POLLING)) { - mutex_exit(&mlcq->mlcq_mtx); - continue; - } - - nmp = cmp = mp = NULL; - tellmac = B_FALSE; - - cent = mlxcx_cq_next(mlcq); - for (; cent != NULL; cent = mlxcx_cq_next(mlcq)) { + if ((mlcq->mlcq_state & MLXCX_CQ_POLLING) == 0 && + mlxcx_process_cq(mlxp, mlcq, &mp, 0)) { /* - * Teardown and ring stop can atomic_or this flag - * into our state if they want us to stop early. + * The ring is not in polling mode and we processed + * some completion queue entries. */ - if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN) - break; - if (mlcq->mlcq_state & MLXCX_CQ_POLLING) - break; - - if (cent->mlcqe_opcode == MLXCX_CQE_OP_REQ && - cent->mlcqe_send_wqe_opcode == MLXCX_WQE_OP_NOP) { - /* NOP */ - goto nextcq; + if ((mlcq->mlcq_state & MLXCX_CQ_BLOCKED_MAC) != 0 && + mlcq->mlcq_bufcnt < mlcq->mlcq_buflwm) { + atomic_and_uint(&mlcq->mlcq_state, + ~MLXCX_CQ_BLOCKED_MAC); + tellmac = B_TRUE; } -lookagain: - /* - * Generally the buffer we're looking for will be - * at the front of the list, so this loop won't - * need to look far. 
- */ - buf = list_head(&mlcq->mlcq_buffers); - found = B_FALSE; - while (buf != NULL) { - if ((buf->mlb_wqe_index & UINT16_MAX) == - from_be16(cent->mlcqe_wqe_counter)) { - found = B_TRUE; - break; - } - buf = list_next(&mlcq->mlcq_buffers, buf); - } - if (!found) { - /* - * If there's any buffers waiting on the - * buffers_b list, then merge those into - * the main list and have another look. - * - * The wq enqueue routines push new buffers - * into buffers_b so that they can avoid - * taking the mlcq_mtx and blocking us for - * every single packet. - */ - added = B_FALSE; - mutex_enter(&mlcq->mlcq_bufbmtx); - if (!list_is_empty(&mlcq->mlcq_buffers_b)) { - list_move_tail(&mlcq->mlcq_buffers, - &mlcq->mlcq_buffers_b); - added = B_TRUE; - } - mutex_exit(&mlcq->mlcq_bufbmtx); - if (added) - goto lookagain; - } - if (!found) { - buf = list_head(&mlcq->mlcq_buffers); - mlxcx_warn(mlxp, "got completion on CQ %x but " - "no buffer matching wqe found: %x (first " - "buffer counter = %x)", mlcq->mlcq_num, - from_be16(cent->mlcqe_wqe_counter), - buf == NULL ? UINT32_MAX : - buf->mlb_wqe_index); - mlxcx_fm_ereport(mlxp, - DDI_FM_DEVICE_INVAL_STATE); - goto nextcq; + if ((mlwq->mlwq_state & MLXCX_WQ_BLOCKED_MAC) != 0 && + mlwq->mlwq_wqebb_used < mlwq->mlwq_buflwm) { + atomic_and_uint(&mlwq->mlwq_state, + ~MLXCX_WQ_BLOCKED_MAC); + tellmac = B_TRUE; } - list_remove(&mlcq->mlcq_buffers, buf); - atomic_dec_64(&mlcq->mlcq_bufcnt); - switch (mlcq->mlcq_wq->mlwq_type) { - case MLXCX_WQ_TYPE_SENDQ: - mlxcx_tx_completion(mlxp, mlcq, cent, buf); - break; - case MLXCX_WQ_TYPE_RECVQ: - nmp = mlxcx_rx_completion(mlxp, mlcq, cent, - buf); - if (nmp != NULL) { - if (cmp != NULL) { - cmp->b_next = nmp; - cmp = nmp; - } else { - mp = cmp = nmp; - } - } - break; - } + mlxcx_arm_cq(mlxp, mlcq); -nextcq: - /* - * Update the "doorbell" consumer counter for the queue - * every time. Unlike a UAR write, this is relatively - * cheap and doesn't require us to go out on the bus - * straight away (since it's our memory). - */ - mlcq->mlcq_doorbell->mlcqd_update_ci = - to_be24(mlcq->mlcq_cc); + mutex_exit(&mlcq->mlcq_mtx); - if ((mlcq->mlcq_state & MLXCX_CQ_BLOCKED_MAC) && - mlcq->mlcq_bufcnt < mlcq->mlcq_buflwm) { - mlcq->mlcq_state &= ~MLXCX_CQ_BLOCKED_MAC; - tellmac = B_TRUE; + if (tellmac) { + mac_tx_ring_update(mlxp->mlx_mac_hdl, + mlcq->mlcq_mac_hdl); + tellmac = B_FALSE; } - } - mlxcx_arm_cq(mlxp, mlcq); - mutex_exit(&mlcq->mlcq_mtx); - - if (tellmac) { - mac_tx_ring_update(mlxp->mlx_mac_hdl, - mlcq->mlcq_mac_hdl); - } - if (mp != NULL) { - mac_rx_ring(mlxp->mlx_mac_hdl, mlcq->mlcq_mac_hdl, - mp, mlcq->mlcq_mac_gen); + if (mp != NULL) { + mac_rx_ring(mlxp->mlx_mac_hdl, + mlcq->mlcq_mac_hdl, mp, mlcq->mlcq_mac_gen); + } + } else { + mutex_exit(&mlcq->mlcq_mtx); } +update_eq: /* * Updating the consumer counter for an EQ requires a write * to the UAR, which is possibly expensive. diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_reg.h b/usr/src/uts/common/io/mlxcx/mlxcx_reg.h index 76d0da30e7..f65280d41d 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_reg.h +++ b/usr/src/uts/common/io/mlxcx/mlxcx_reg.h @@ -12,6 +12,7 @@ /* * Copyright 2020, The University of Queensland * Copyright (c) 2018, Joyent, Inc. + * Copyright 2020 RackTop Systems, Inc. 
*/ #ifndef _MLXCX_REG_H @@ -2259,6 +2260,28 @@ typedef enum { MLXCX_PROTO_50GBASE_KR2 = 1UL << 31, } mlxcx_eth_proto_t; +#define MLXCX_PROTO_100M MLXCX_PROTO_SGMII_100BASE + +#define MLXCX_PROTO_1G (MLXCX_PROTO_1000BASE_KX | MLXCX_PROTO_SGMII) + +#define MLXCX_PROTO_10G (MLXCX_PROTO_10GBASE_CX4 | \ + MLXCX_PROTO_10GBASE_KX4 | MLXCX_PROTO_10GBASE_KR | \ + MLXCX_PROTO_10GBASE_CR | MLXCX_PROTO_10GBASE_SR | \ + MLXCX_PROTO_10GBASE_ER_LR) + +#define MLXCX_PROTO_25G (MLXCX_PROTO_25GBASE_CR | \ + MLXCX_PROTO_25GBASE_KR | MLXCX_PROTO_25GBASE_SR) + +#define MLXCX_PROTO_40G (MLXCX_PROTO_40GBASE_SR4 | \ + MLXCX_PROTO_40GBASE_LR4_ER4 | MLXCX_PROTO_40GBASE_CR4 | \ + MLXCX_PROTO_40GBASE_KR4) + +#define MLXCX_PROTO_50G (MLXCX_PROTO_50GBASE_CR2 | \ + MLXCX_PROTO_50GBASE_KR2 | MLXCX_PROTO_50GBASE_SR2) + +#define MLXCX_PROTO_100G (MLXCX_PROTO_100GBASE_CR4 | \ + MLXCX_PROTO_100GBASE_SR4 | MLXCX_PROTO_100GBASE_KR4) + typedef enum { MLXCX_AUTONEG_DISABLE_CAP = 1 << 5, MLXCX_AUTONEG_DISABLE = 1 << 6 diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c index 8337545b57..da609ed28c 100644 --- a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c +++ b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c @@ -12,6 +12,7 @@ /* * Copyright 2020, The University of Queensland * Copyright (c) 2018, Joyent, Inc. + * Copyright 2020 RackTop Systems, Inc. */ /* @@ -113,8 +114,9 @@ mlxcx_wq_rele_dma(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) mlwq->mlwq_state &= ~MLXCX_CQ_ALLOC; } -boolean_t -mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) +static boolean_t +mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, + uint_t ent_shift) { ddi_device_acc_attr_t acc; ddi_dma_attr_t attr; @@ -123,7 +125,7 @@ mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) VERIFY0(mlcq->mlcq_state & MLXCX_EQ_ALLOC); - mlcq->mlcq_entshift = mlxp->mlx_props.mldp_cq_size_shift; + mlcq->mlcq_entshift = ent_shift; mlcq->mlcq_nents = (1 << mlcq->mlcq_entshift); sz = mlcq->mlcq_nents * sizeof (mlxcx_completionq_ent_t); ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0); @@ -165,7 +167,7 @@ mlxcx_cq_alloc_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) return (B_TRUE); } -void +static void mlxcx_cq_rele_dma(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) { VERIFY(mlcq->mlcq_state & MLXCX_CQ_ALLOC); @@ -331,7 +333,7 @@ mlxcx_cq_teardown(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq) static boolean_t mlxcx_cq_setup(mlxcx_t *mlxp, mlxcx_event_queue_t *eq, - mlxcx_completion_queue_t **cqp) + mlxcx_completion_queue_t **cqp, uint_t ent_shift) { mlxcx_completion_queue_t *cq; @@ -350,7 +352,7 @@ mlxcx_cq_setup(mlxcx_t *mlxp, mlxcx_event_queue_t *eq, mutex_enter(&cq->mlcq_mtx); - if (!mlxcx_cq_alloc_dma(mlxp, cq)) { + if (!mlxcx_cq_alloc_dma(mlxp, cq, ent_shift)) { mutex_exit(&cq->mlcq_mtx); return (B_FALSE); } @@ -413,6 +415,9 @@ mlxcx_rq_setup(mlxcx_t *mlxp, mlxcx_completion_queue_t *cq, return (B_FALSE); } + wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP; + wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP; + mutex_exit(&wq->mlwq_mtx); mutex_enter(&cq->mlcq_mtx); @@ -459,6 +464,9 @@ mlxcx_sq_setup(mlxcx_t *mlxp, mlxcx_port_t *port, mlxcx_completion_queue_t *cq, return (B_FALSE); } + wq->mlwq_bufhwm = wq->mlwq_nents - MLXCX_WQ_HWM_GAP; + wq->mlwq_buflwm = wq->mlwq_nents - MLXCX_WQ_LWM_GAP; + mutex_exit(&wq->mlwq_mtx); mutex_enter(&cq->mlcq_mtx); @@ -471,6 +479,35 @@ mlxcx_sq_setup(mlxcx_t *mlxp, mlxcx_port_t *port, mlxcx_completion_queue_t *cq, return (B_TRUE); } +/* + * Before we tear down the 
queues associated with the rx group, + * flag each cq as being torn down and wake up any tasks. + */ +static void +mlxcx_quiesce_rx_cqs(mlxcx_t *mlxp, mlxcx_ring_group_t *g) +{ + mlxcx_work_queue_t *wq; + mlxcx_completion_queue_t *cq; + mlxcx_buf_shard_t *s; + uint_t i; + + mutex_enter(&g->mlg_mtx); + + for (i = 0; i < g->mlg_nwqs; ++i) { + wq = &g->mlg_wqs[i]; + cq = wq->mlwq_cq; + if (cq != NULL) { + s = wq->mlwq_bufs; + mutex_enter(&s->mlbs_mtx); + atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_TEARDOWN); + cv_broadcast(&s->mlbs_free_nonempty); + mutex_exit(&s->mlbs_mtx); + } + } + + mutex_exit(&g->mlg_mtx); +} + void mlxcx_teardown_rx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g) { @@ -551,6 +588,7 @@ mlxcx_teardown_rx_group(mlxcx_t *mlxp, mlxcx_ring_group_t *g) } mutex_exit(&wq->mlwq_mtx); } + taskq_destroy(g->mlg_refill_tq); g->mlg_state &= ~MLXCX_GROUP_RUNNING; } @@ -662,8 +700,16 @@ mlxcx_teardown_groups(mlxcx_t *mlxp) if (!(g->mlg_state & MLXCX_GROUP_INIT)) continue; ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_RX); + mlxcx_quiesce_rx_cqs(mlxp, g); + } + + for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) { + g = &mlxp->mlx_rx_groups[i]; + if (!(g->mlg_state & MLXCX_GROUP_INIT)) + continue; mlxcx_teardown_rx_group(mlxp, g); } + kmem_free(mlxp->mlx_rx_groups, mlxp->mlx_rx_groups_size); mlxp->mlx_rx_groups = NULL; @@ -674,6 +720,7 @@ mlxcx_teardown_groups(mlxcx_t *mlxp) ASSERT3S(g->mlg_type, ==, MLXCX_GROUP_TX); mlxcx_teardown_tx_group(mlxp, g); } + kmem_free(mlxp->mlx_tx_groups, mlxp->mlx_tx_groups_size); mlxp->mlx_tx_groups = NULL; } @@ -687,6 +734,7 @@ mlxcx_rx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g) mlxcx_flow_table_t *ft; mlxcx_flow_group_t *fg; mlxcx_flow_entry_t *fe; + uint_t ent_shift; uint_t i, j; ASSERT3S(g->mlg_state, ==, 0); @@ -730,10 +778,18 @@ mlxcx_rx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g) } } - if (!mlxcx_cq_setup(mlxp, eq, &cq)) { + /* + * A single completion is indicated for each rq entry as + * it is used. So, the number of cq entries never needs + * to be larger than the rq. + */ + ent_shift = MIN(mlxp->mlx_props.mldp_cq_size_shift, + mlxp->mlx_props.mldp_rq_size_shift); + if (!mlxcx_cq_setup(mlxp, eq, &cq, ent_shift)) { g->mlg_nwqs = i; break; } + cq->mlcq_stats = &g->mlg_port->mlp_stats; rq = &g->mlg_wqs[i]; @@ -1182,6 +1238,7 @@ mlxcx_rx_group_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g) mlxcx_flow_table_t *ft; mlxcx_flow_group_t *fg; mlxcx_flow_entry_t *fe; + char tq_name[TASKQ_NAMELEN]; mutex_enter(&g->mlg_mtx); @@ -1194,6 +1251,23 @@ mlxcx_rx_group_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g) g->mlg_state |= MLXCX_GROUP_RUNNING; + snprintf(tq_name, sizeof (tq_name), "%s_refill_%d_%ld", + ddi_driver_name(mlxp->mlx_dip), mlxp->mlx_inst, + g - &mlxp->mlx_rx_groups[0]); + + /* + * Create one refill taskq per group with one thread per work queue. + * The refill task may block waiting for resources, so by effectively + * having one thread per work queue we avoid work queues blocking each + * other. 
+ */ + if ((g->mlg_refill_tq = taskq_create(tq_name, g->mlg_nwqs, minclsyspri, + g->mlg_nwqs, INT_MAX, TASKQ_PREPOPULATE)) == NULL) { + mlxcx_warn(mlxp, "failed to create rq refill task queue"); + mutex_exit(&g->mlg_mtx); + return (B_FALSE); + } + if (g == &mlxp->mlx_rx_groups[0]) { ft = g->mlg_port->mlp_rx_flow; mutex_enter(&ft->mlft_mtx); @@ -1207,6 +1281,8 @@ mlxcx_rx_group_start(mlxcx_t *mlxp, mlxcx_ring_group_t *g) fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = g->mlg_rx_hash_ft; if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { mutex_exit(&ft->mlft_mtx); + g->mlg_state &= ~MLXCX_GROUP_RUNNING; + taskq_destroy(g->mlg_refill_tq); mutex_exit(&g->mlg_mtx); return (B_FALSE); } @@ -1273,8 +1349,10 @@ mlxcx_tx_group_setup(mlxcx_t *mlxp, mlxcx_ring_group_t *g) } } - if (!mlxcx_cq_setup(mlxp, eq, &cq)) + if (!mlxcx_cq_setup(mlxp, eq, &cq, + mlxp->mlx_props.mldp_cq_size_shift)) return (B_FALSE); + cq->mlcq_stats = &g->mlg_port->mlp_stats; sq = &g->mlg_wqs[i]; @@ -1409,6 +1487,11 @@ mlxcx_sq_add_nop(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) ent0 = &mlwq->mlwq_send_ent[index]; start_pc = mlwq->mlwq_pc; ++mlwq->mlwq_pc; + /* + * This counter is manipulated in the interrupt handler, which + * does not hold the mlwq_mtx, hence the atomic. + */ + atomic_inc_64(&mlwq->mlwq_wqebb_used); bzero(ent0, sizeof (mlxcx_sendq_ent_t)); ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_NOP; @@ -1441,7 +1524,7 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, uint8_t *inlinehdrs, size_t inlinelen, uint32_t chkflags, mlxcx_buffer_t *b0) { - uint_t index, first, ents = 0; + uint_t index, first, ents; mlxcx_completion_queue_t *cq; mlxcx_sendq_ent_t *ent0; mlxcx_sendq_extra_ent_t *ent; @@ -1449,8 +1532,10 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, uint_t ptri, nptr; const ddi_dma_cookie_t *c; size_t rem; + uint64_t wqebb_used; mlxcx_buffer_t *b; ddi_fm_error_t err; + boolean_t rv; ASSERT(mutex_owned(&mlwq->mlwq_mtx)); ASSERT3P(b0->mlb_tx_head, ==, b0); @@ -1460,16 +1545,10 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1); ent0 = &mlwq->mlwq_send_ent[index]; b0->mlb_wqe_index = mlwq->mlwq_pc; - ++mlwq->mlwq_pc; - ++ents; + ents = 1; first = index; - mutex_enter(&cq->mlcq_bufbmtx); - list_insert_tail(&cq->mlcq_buffers_b, b0); - atomic_inc_64(&cq->mlcq_bufcnt); - mutex_exit(&cq->mlcq_bufbmtx); - bzero(ent0, sizeof (mlxcx_sendq_ent_t)); ent0->mlsqe_control.mlcs_opcode = MLXCX_WQE_OP_SEND; ent0->mlsqe_control.mlcs_qp_or_sq = to_be24(mlwq->mlwq_num); @@ -1502,6 +1581,16 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, MLXCX_SQE_ETH_CSFLAG_L4_CHECKSUM); } + /* + * mlwq_wqebb_used is only incremented whilst holding + * the mlwq_mtx mutex, but it is decremented (atomically) in + * the interrupt context *not* under mlwq_mtx mutex. + * So, now take a snapshot of the number of used wqes which will + * be a conistent maximum we can use whilst iterating through + * the buffers and DMA cookies. 
+ */ + wqebb_used = mlwq->mlwq_wqebb_used; + b = b0; ptri = 0; nptr = sizeof (ent0->mlsqe_data) / sizeof (mlxcx_wqe_data_seg_t); @@ -1513,9 +1602,12 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, while (rem > 0 && (c = mlxcx_dma_cookie_iter(&b->mlb_dma, c)) != NULL) { if (ptri >= nptr) { - index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1); + if ((ents + wqebb_used) >= mlwq->mlwq_nents) + return (B_FALSE); + + index = (mlwq->mlwq_pc + ents) & + (mlwq->mlwq_nents - 1); ent = &mlwq->mlwq_send_extra_ent[index]; - ++mlwq->mlwq_pc; ++ents; seg = ent->mlsqe_data; @@ -1548,6 +1640,10 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, } } + b0->mlb_wqebbs = ents; + mlwq->mlwq_pc += ents; + atomic_add_64(&mlwq->mlwq_wqebb_used, ents); + for (; ptri < nptr; ++ptri, ++seg) { seg->mlds_lkey = to_be32(MLXCX_NULL_LKEY); seg->mlds_byte_count = to_be32(0); @@ -1566,10 +1662,24 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, if (err.fme_status != DDI_FM_OK) { return (B_FALSE); } - if (!mlxcx_sq_ring_dbell(mlxp, mlwq, first)) { - return (B_FALSE); + + /* + * Hold the bufmtx whilst ringing the doorbell, to prevent + * the buffer from being moved to another list, so we can + * safely remove it should the ring fail. + */ + mutex_enter(&cq->mlcq_bufbmtx); + + list_insert_tail(&cq->mlcq_buffers_b, b0); + if ((rv = mlxcx_sq_ring_dbell(mlxp, mlwq, first))) { + atomic_inc_64(&cq->mlcq_bufcnt); + } else { + list_remove(&cq->mlcq_buffers_b, b0); } - return (B_TRUE); + + mutex_exit(&cq->mlcq_bufbmtx); + + return (rv); } boolean_t @@ -1604,8 +1714,10 @@ mlxcx_rq_add_buffers(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, index = mlwq->mlwq_pc & (mlwq->mlwq_nents - 1); ent = &mlwq->mlwq_recv_ent[index]; buf->mlb_wqe_index = mlwq->mlwq_pc; + buf->mlb_wqebbs = 1; ++mlwq->mlwq_pc; + atomic_inc_64(&mlwq->mlwq_wqebb_used); mutex_enter(&cq->mlcq_bufbmtx); list_insert_tail(&cq->mlcq_buffers, buf); @@ -1666,11 +1778,53 @@ mlxcx_rq_add_buffers(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq, return (B_TRUE); } +static void +mlxcx_rq_refill_task(void *arg) +{ + mlxcx_work_queue_t *wq = arg; + mlxcx_completion_queue_t *cq = wq->mlwq_cq; + mlxcx_t *mlxp = wq->mlwq_mlx; + mlxcx_buf_shard_t *s = wq->mlwq_bufs; + boolean_t refill; + + do { + /* + * Wait until there are some free buffers. + */ + mutex_enter(&s->mlbs_mtx); + while (list_is_empty(&s->mlbs_free) && + (cq->mlcq_state & MLXCX_CQ_TEARDOWN) == 0) + cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx); + mutex_exit(&s->mlbs_mtx); + + mutex_enter(&cq->mlcq_mtx); + mutex_enter(&wq->mlwq_mtx); + + if ((cq->mlcq_state & MLXCX_CQ_TEARDOWN) != 0) { + refill = B_FALSE; + wq->mlwq_state &= ~MLXCX_WQ_REFILLING; + } else { + mlxcx_rq_refill(mlxp, wq); + + if (cq->mlcq_bufcnt < MLXCX_RQ_REFILL_STEP) { + refill = B_TRUE; + } else { + refill = B_FALSE; + wq->mlwq_state &= ~MLXCX_WQ_REFILLING; + } + } + + mutex_exit(&wq->mlwq_mtx); + mutex_exit(&cq->mlcq_mtx); + } while (refill); +} + void mlxcx_rq_refill(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) { size_t target, current, want, done, n; mlxcx_completion_queue_t *cq; + mlxcx_ring_group_t *g; mlxcx_buffer_t *b[MLXCX_RQ_REFILL_STEP]; uint_t i; @@ -1697,10 +1851,24 @@ mlxcx_rq_refill(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq) while (!(mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) && done < want) { n = mlxcx_buf_take_n(mlxp, mlwq, b, MLXCX_RQ_REFILL_STEP); if (n == 0) { - mlxcx_warn(mlxp, "!exiting rq refill early, done %u " - "but wanted %u", done, want); + /* + * We didn't get any buffers from the free queue. 
+ * It might not be an issue, schedule a taskq + * to wait for free buffers if the completion + * queue is low. + */ + if (current < MLXCX_RQ_REFILL_STEP && + (mlwq->mlwq_state & MLXCX_WQ_REFILLING) == 0) { + mlwq->mlwq_state |= MLXCX_WQ_REFILLING; + g = mlwq->mlwq_group; + taskq_dispatch_ent(g->mlg_refill_tq, + mlxcx_rq_refill_task, mlwq, TQ_NOSLEEP, + &mlwq->mlwq_tqe); + } + return; } + if (mlwq->mlwq_state & MLXCX_WQ_TEARDOWN) { for (i = 0; i < n; ++i) mlxcx_buf_return(mlxp, b[i]); @@ -1826,6 +1994,7 @@ mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mlxcx_completionq_ent_t *ent, mlxcx_buffer_t *buf) { uint32_t chkflags = 0; + uint_t wqe_index; ddi_fm_error_t err; ASSERT(mutex_owned(&mlcq->mlcq_mtx)); @@ -1868,6 +2037,12 @@ mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, return (NULL); } + /* + * mlxcx_buf_loan() will set mlb_wqe_index to zero. + * Remember it for later. + */ + wqe_index = buf->mlb_wqe_index; + if (!mlxcx_buf_loan(mlxp, buf)) { mlxcx_warn(mlxp, "!loan failed, dropping packet"); mlxcx_buf_return(mlxp, buf); @@ -1894,7 +2069,7 @@ mlxcx_rx_completion(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, * Don't check if a refill is needed on every single completion, * since checking involves taking the RQ lock. */ - if ((buf->mlb_wqe_index & 0x7) == 0) { + if ((wqe_index & 0x7) == 0) { mlxcx_work_queue_t *wq = mlcq->mlcq_wq; ASSERT(wq != NULL); mutex_enter(&wq->mlwq_mtx); @@ -1981,39 +2156,66 @@ mlxcx_buf_create_foreign(mlxcx_t *mlxp, mlxcx_buf_shard_t *shard, return (B_TRUE); } -static void -mlxcx_buf_take_foreign(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, - mlxcx_buffer_t **bp) +static mlxcx_buffer_t * +mlxcx_buf_take_foreign(mlxcx_t *mlxp, mlxcx_work_queue_t *wq) { mlxcx_buffer_t *b; mlxcx_buf_shard_t *s = wq->mlwq_foreign_bufs; mutex_enter(&s->mlbs_mtx); - while (list_is_empty(&s->mlbs_free)) - cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx); - b = list_remove_head(&s->mlbs_free); - ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE); - ASSERT(b->mlb_foreign); - b->mlb_state = MLXCX_BUFFER_ON_WQ; - list_insert_tail(&s->mlbs_busy, b); + if ((b = list_remove_head(&s->mlbs_free)) != NULL) { + ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE); + ASSERT(b->mlb_foreign); + b->mlb_state = MLXCX_BUFFER_ON_WQ; + list_insert_tail(&s->mlbs_busy, b); + } mutex_exit(&s->mlbs_mtx); - *bp = b; + return (b); } -boolean_t +static mlxcx_buffer_t * +mlxcx_copy_data(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, uint8_t *rptr, size_t sz) +{ + ddi_fm_error_t err; + mlxcx_buffer_t *b; + uint_t attempts = 0; + +copyb: + if ((b = mlxcx_buf_take(mlxp, wq)) == NULL) + return (NULL); + + ASSERT3U(b->mlb_dma.mxdb_len, >=, sz); + bcopy(rptr, b->mlb_dma.mxdb_va, sz); + + MLXCX_DMA_SYNC(b->mlb_dma, DDI_DMA_SYNC_FORDEV); + + ddi_fm_dma_err_get(b->mlb_dma.mxdb_dma_handle, &err, + DDI_FME_VERSION); + if (err.fme_status != DDI_FM_OK) { + ddi_fm_dma_err_clear(b->mlb_dma.mxdb_dma_handle, + DDI_FME_VERSION); + mlxcx_buf_return(mlxp, b); + if (++attempts > MLXCX_BUF_BIND_MAX_ATTEMTPS) { + return (NULL); + } + goto copyb; + } + + return (b); +} + +mlxcx_buffer_t * mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, - mblk_t *mpb, size_t off, mlxcx_buffer_t **bp) + mblk_t *mpb, size_t off) { mlxcx_buffer_t *b, *b0 = NULL; boolean_t first = B_TRUE; - ddi_fm_error_t err; mblk_t *mp; uint8_t *rptr; size_t sz; size_t ncookies = 0; boolean_t ret; - uint_t attempts = 0; for (mp = mpb; mp != NULL; mp = mp->b_cont) { rptr = mp->b_rptr; @@ -2024,31 +2226,24 @@ mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, 
mlxcx_work_queue_t *wq, rptr += off; sz -= off; - if (sz < mlxp->mlx_props.mldp_tx_bind_threshold) - goto copyb; - - mlxcx_buf_take_foreign(mlxp, wq, &b); - ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off, B_FALSE); + if (sz < mlxp->mlx_props.mldp_tx_bind_threshold) { + b = mlxcx_copy_data(mlxp, wq, rptr, sz); + if (b == NULL) + goto failed; + } else { + b = mlxcx_buf_take_foreign(mlxp, wq); + if (b == NULL) + goto failed; - if (!ret) { - mlxcx_buf_return(mlxp, b); + ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off, + B_FALSE); -copyb: - mlxcx_buf_take(mlxp, wq, &b); - ASSERT3U(b->mlb_dma.mxdb_len, >=, sz); - bcopy(rptr, b->mlb_dma.mxdb_va, sz); - MLXCX_DMA_SYNC(b->mlb_dma, DDI_DMA_SYNC_FORDEV); - ddi_fm_dma_err_get(b->mlb_dma.mxdb_dma_handle, &err, - DDI_FME_VERSION); - if (err.fme_status != DDI_FM_OK) { - ddi_fm_dma_err_clear(b->mlb_dma.mxdb_dma_handle, - DDI_FME_VERSION); + if (!ret) { mlxcx_buf_return(mlxp, b); - if (++attempts > MLXCX_BUF_BIND_MAX_ATTEMTPS) { - *bp = NULL; - return (B_FALSE); - } - goto copyb; + + b = mlxcx_copy_data(mlxp, wq, rptr, sz); + if (b == NULL) + goto failed; } } @@ -2082,54 +2277,44 @@ copyb: ASSERT3U(ncookies, <=, MLXCX_SQE_MAX_PTRS); - *bp = b0; - return (B_TRUE); + return (b0); + +failed: + if (b0 != NULL) + mlxcx_buf_return_chain(mlxp, b0, B_TRUE); + + return (NULL); } -void -mlxcx_buf_take(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, mlxcx_buffer_t **bp) +mlxcx_buffer_t * +mlxcx_buf_take(mlxcx_t *mlxp, mlxcx_work_queue_t *wq) { mlxcx_buffer_t *b; mlxcx_buf_shard_t *s = wq->mlwq_bufs; mutex_enter(&s->mlbs_mtx); - while (list_is_empty(&s->mlbs_free)) - cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx); - b = list_remove_head(&s->mlbs_free); - ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE); - b->mlb_state = MLXCX_BUFFER_ON_WQ; - list_insert_tail(&s->mlbs_busy, b); + if ((b = list_remove_head(&s->mlbs_free)) != NULL) { + ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE); + b->mlb_state = MLXCX_BUFFER_ON_WQ; + list_insert_tail(&s->mlbs_busy, b); + } mutex_exit(&s->mlbs_mtx); - *bp = b; + return (b); } -#define MLXCX_BUF_TAKE_N_TIMEOUT_USEC 5000 -#define MLXCX_BUF_TAKE_N_MAX_RETRIES 3 - size_t mlxcx_buf_take_n(mlxcx_t *mlxp, mlxcx_work_queue_t *wq, mlxcx_buffer_t **bp, size_t nbufs) { mlxcx_buffer_t *b; - size_t done = 0, empty = 0; - clock_t wtime = drv_usectohz(MLXCX_BUF_TAKE_N_TIMEOUT_USEC); + size_t done = 0; mlxcx_buf_shard_t *s; s = wq->mlwq_bufs; mutex_enter(&s->mlbs_mtx); - while (done < nbufs) { - while (list_is_empty(&s->mlbs_free)) { - (void) cv_reltimedwait(&s->mlbs_free_nonempty, - &s->mlbs_mtx, wtime, TR_MILLISEC); - if (list_is_empty(&s->mlbs_free) && - empty++ >= MLXCX_BUF_TAKE_N_MAX_RETRIES) { - mutex_exit(&s->mlbs_mtx); - return (done); - } - } - b = list_remove_head(&s->mlbs_free); + while (done < nbufs && (b = list_remove_head(&s->mlbs_free)) != NULL) { ASSERT3U(b->mlb_state, ==, MLXCX_BUFFER_FREE); b->mlb_state = MLXCX_BUFFER_ON_WQ; list_insert_tail(&s->mlbs_busy, b); @@ -2187,13 +2372,26 @@ mlxcx_buf_return(mlxcx_t *mlxp, mlxcx_buffer_t *b) VERIFY3U(oldstate, !=, MLXCX_BUFFER_FREE); ASSERT3P(b->mlb_mlx, ==, mlxp); + + /* + * The mlbs_mtx held below is a heavily contended lock, so it is + * imperative we do as much of the buffer clean up outside the lock + * as is possible. 
+ */ b->mlb_state = MLXCX_BUFFER_FREE; b->mlb_wqe_index = 0; b->mlb_tx_head = NULL; b->mlb_tx_mp = NULL; b->mlb_used = 0; + b->mlb_wqebbs = 0; ASSERT(list_is_empty(&b->mlb_tx_chain)); + if (b->mlb_foreign) { + if (b->mlb_dma.mxdb_flags & MLXCX_DMABUF_BOUND) { + mlxcx_dma_unbind(mlxp, &b->mlb_dma); + } + } + mutex_enter(&s->mlbs_mtx); switch (oldstate) { case MLXCX_BUFFER_INIT: @@ -2215,12 +2413,6 @@ mlxcx_buf_return(mlxcx_t *mlxp, mlxcx_buffer_t *b) break; } - if (b->mlb_foreign) { - if (b->mlb_dma.mxdb_flags & MLXCX_DMABUF_BOUND) { - mlxcx_dma_unbind(mlxp, &b->mlb_dma); - } - } - list_insert_tail(&s->mlbs_free, b); cv_signal(&s->mlbs_free_nonempty); |