author:    Ryan Zezeski <ryan@zinascii.com>  2020-08-25 00:52:37 -0600
committer: Dan McDonald <danmcd@joyent.com>  2021-11-23 13:18:50 -0500
commit:    6f443ebc1fb4fec01d6e8fa8ca4648182ed215bb
tree:      5c4551c6d6caaaf138fe369af872c3fc31d02c8a
parent:    a28480febf31f0e61debac062a55216a98a05a92
download:  illumos-joyent-6f443ebc1fb4fec01d6e8fa8ca4648182ed215bb.tar.gz
13689 Want AWS ENA driver
Reviewed by: Robert Mustacchi <rm@fingolfin.org>
Approved by: Dan McDonald <danmcd@joyent.com>
22 files changed, 8158 insertions, 1 deletions
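The tunables documented in the new ena.7d manual page below are read from /kernel/drv/ena.conf, which uses the usual driver.conf(4) name=value; syntax. A minimal illustrative snippet, with example values (not the driver's defaults) chosen inside the bounds the manual page documents:

# /kernel/drv/ena.conf -- example values only
# Limit each Rx/Tx ring to 512 descriptors (the default is the device maximum).
rx_queue_num_descs=512;
tx_queue_num_descs=512;
# Handle at most 128 frames per Rx interrupt (documented range 16-4096, default 256).
rx_queue_intr_limit=128;

As the manual page notes, edits to this file only affect instances attached afterwards; a running instance must be reloaded, or the system rebooted, for a change to take effect.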
diff --git a/usr/src/man/man7d/Makefile b/usr/src/man/man7d/Makefile index 9da7d4b205..af38c7a9bd 100644 --- a/usr/src/man/man7d/Makefile +++ b/usr/src/man/man7d/Makefile @@ -16,6 +16,7 @@ # Copyright 2016 Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org> # Copyright 2018 Nexenta Systems, Inc. # Copyright 2020 Peter Tribble +# Copyright 2021 Oxide Computer Company # include $(SRC)/Makefile.master @@ -46,6 +47,7 @@ _MANFILES= aac.7d \ dtrace.7d \ e1000g.7d \ ehci.7d \ + ena.7d \ fasttrap.7d \ fbt.7d \ fcip.7d \ diff --git a/usr/src/man/man7d/ena.7d b/usr/src/man/man7d/ena.7d new file mode 100644 index 0000000000..d4070e1745 --- /dev/null +++ b/usr/src/man/man7d/ena.7d @@ -0,0 +1,135 @@ +.\" +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. +.\" +.\" +.\" Copyright 2021 Oxide Computer Company +.\" +.Dd Nov 17, 2021 +.Dt ENA 7D +.Os +.Sh NAME +.Nm ena +.Nd Driver for the AWS Elastic Network Adapter +.Sh SYNOPSIS +.Pa /dev/net/ena* +.Sh DESCRIPTION +The +.Sy ena +driver is a GLDv3 NIC driver for the AWS Elastic Network Adapter +family of virtual devices. +The driver supports: +.Bl -dash -offset indent +.It +Jumbo frames up to 9216 bytes. +.It +Multiple Rx and Tx rings. +.El +.Pp +By design, this driver does not support VNICs. +A given ENA device can only ever receive traffic for a single unicast +MAC address and IP address combination, as determined by the AWS configuration. +There is no support for promiscuous mode, or for receiving traffic for +additional unicast or multicast addresses. +.Sh CONFIGURATION +The +.Sy ena.conf +file contains user configurable parameters, each of which is described +below. +This file is read when an ENA device is found and an instance of the +driver is attached to it. +Changes made to this file do not affect running instances. +Only instances attached after the changes will see the effects of +those changes. +Therefore, if you want your change to take effect on a running +instance, you must somehow reload it. +That could be done by a manual reloading of the driver or a system +reboot. +.Sh PROPERTIES +The configuration file can be found at +.Pa /kernel/drv/ena.conf . +.Bl -hang -width Ds +.It Sy rx_queue_num_descs +.Bd -filled -compact +Minimum: +.Sy 64 | +Maximum: +.Sy device dependent +.Ed +.Bd -filled -compact +Default: +.Sy device maximum +.Ed +.Bd -filled +The +.Sy rx_queue_num_descs +property determines the number of descriptors provided by the Rx queue. +Currently a single descriptor is equal to a single packet, but in the +future it may be that a single packet consumes multiple descriptors. +.Ed +.It Sy rx_queue_intr_limit +.Bd -filled -compact +Minimum: +.Sy 16 | +Maximum: +.Sy 4096 +.Ed +.Bd -filled -compact +Default: +.Sy 256 +.Ed +.Bd -filled +The +.Sy rx_queue_intr_limit +property determines the number frames an Rx interrupt will attempt to +process before returning and claiming the interrupt. +This is meant to keep the ENA Rx interrupt handler from consuming too +much system time. +In general, when a NIC becomes saturated with packets, the +.Sy MAC +layer will switch the driver into polling mode to reduce interrupt +load. 
+.Ed +.It Sy tx_queue_num_descs +.Bd -filled -compact +Minimum: +.Sy 64 | +Maximum: +.Sy device dependent +.Ed +.Bd -filled -compact +Default: +.Sy device maximum +.Ed +.Bd -filled +The +.Sy tx_queue_num_descs +property determines the number of descriptors provided by the Tx queue. +Currently a single descriptor is equal to a single packet, but in the +future it may be that a single packet consumes multiple descriptors. +.Ed +.El +.Sh FILES +.Bl -tag -width Pa +.It Pa /kernel/drv/amd64/ena +Device driver (x86) +.It Pa /kernel/drv/ena.conf +Driver configuration file containing user-configurable options +.El +.Sh INTERFACE STABILITY +The tunables in +.Pa ena.conf +are considered +.Sy Evolving +and may change in the future. +.Sh SEE ALSO +.Xr dladm 1M , +.Xr snoop 1M , +.Xr driver.conf 4 , +.Xr dlpi 7P diff --git a/usr/src/pkg/manifests/driver-network-ena.p5m b/usr/src/pkg/manifests/driver-network-ena.p5m new file mode 100644 index 0000000000..cd64e9c504 --- /dev/null +++ b/usr/src/pkg/manifests/driver-network-ena.p5m @@ -0,0 +1,36 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2021 Oxide Computer Company +# + +<include global_zone_only_component> +set name=pkg.fmri value=pkg:/driver/network/ena@$(PKGVERS) +set name=pkg.summary value="AWS ENA Ethernet Driver" +set name=pkg.description value="AWS ENA Ethernet Driver" +set name=info.classification \ + value=org.opensolaris.category.2008:Drivers/Networking +set name=variant.arch value=i386 +dir path=kernel group=sys +dir path=kernel/drv group=sys +dir path=kernel/drv/$(ARCH64) group=sys +file path=kernel/drv/$(ARCH64)/ena group=sys +file path=kernel/drv/ena.conf group=sys +dir path=usr/share/man +dir path=usr/share/man/man7d +file path=usr/share/man/man7d/ena.7d +driver name=ena perms="* 0666 root sys" clone_perms="ena 0666 root sys" \ + alias=pciex1d0f,ec2 \ + alias=pciex1d0f,1ec2 \ + alias=pciex1d0f,ec20 \ + alias=pciex1d0f,ec21 +license lic_CDDL license=lic_CDDL diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index d768802685..00af839874 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -29,6 +29,7 @@ # Copyright 2016 OmniTI Computer Consulting, Inc. All rights reserved. # Copyright 2016 Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org> # Copyright 2020 RackTop Systems, Inc. +# Copyright 2021 Oxide Computer Company # # @@ -2288,3 +2289,10 @@ BNX_OBJS += \ # MLXCX_OBJS += mlxcx.o mlxcx_dma.o mlxcx_cmd.o mlxcx_intr.o mlxcx_gld.o \ mlxcx_ring.o mlxcx_sensor.o + +# +# ena(7D) +# +ENA_OBJS += ena.o ena_admin.o ena_dma.o ena_gld.o ena_hw.o ena_intr.o \ + ena_stats.o ena_tx.o ena_rx.o + diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules index 32a80767b2..78f01a1f9f 100644 --- a/usr/src/uts/common/Makefile.rules +++ b/usr/src/uts/common/Makefile.rules @@ -26,7 +26,7 @@ # Copyright 2019 Joyent, Inc. # Copyright 2018 Nexenta Systems, Inc. # Copyright (c) 2017 by Delphix. All rights reserved. 
-# Copyright 2020 Oxide Computer Company +# Copyright 2021 Oxide Computer Company # # @@ -777,6 +777,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/elxl/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/ena/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/fcoe/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) diff --git a/usr/src/uts/common/io/ena/ena.c b/usr/src/uts/common/io/ena/ena.c new file mode 100644 index 0000000000..b42f6350af --- /dev/null +++ b/usr/src/uts/common/io/ena/ena.c @@ -0,0 +1,1944 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2021 Oxide Computer Company + */ + +#include "ena_hw.h" +#include "ena.h" + +/* + * Elastic Network Adapter (ENA) Driver + * ------------------------------------ + * + * The ena driver provides support for the AWS ENA device, also + * referred to as their "enhanced networking". This device is present + * on "Nitro"-based instances. It presents itself with the following + * PCI Vendor/Device IDs + * + * o 1d0f:0ec2 -- ENA PF + * o 1d0f:1ec2 -- ENA PF (Reserved) + * o 1d0f:ec20 -- ENA VF + * o 1d0f:ec21 -- ENA VF (Reserved) + * + * This driver provides support for only the essential features needed + * to drive traffic on an ENA device. Support for the following + * features IS NOT currently implemented. + * + * o Admin Queue Interrupts: queue completion events are always polled + * o AENQ keep alive + * o FMA + * o Rx checksum offloads + * o Tx checksum offloads + * o Tx DMA bind (borrow buffers) + * o Rx DMA bind (loaned buffers) + * o TSO + * o RSS + * o Low Latency Queues (LLQ) + * o Support for different Tx complection policies + * o More controlled Tx recycling and Rx refill + * + * Even without these features the ena driver should perform + * reasonably well. + * + * Driver vs. Hardware Types + * ------------------------- + * + * To properly communicate with the ENA device the driver must + * populate memory (registers and buffers) with specific types. These + * types are defined by the device and are found under the "common" + * (ena_com) code of the AWS Linux and FreeBSD drivers [1]. We have + * simplified this a bit by defining all device-specific types in the + * ena_hw.h file. Furthermore, all device-specific types are given an + * "enahw" prefix. This makes it clear when we are dealing with a + * device type and when we are dealing with a driver type. + * + * [1]: https://github.com/amzn/amzn-drivers + * + * Groups, Rings (Queues), and Interrupts + * -------------------------------------- + * + * The ENA device presents one mac group. This single mac group + * represents the single unicast address that this device represents + * in your AWS instance. The ENA device presents no option for + * configuring additional MAC addresses, multicast, or promisc mode -- + * you receive only what AWS wants you to receive. + * + * This single mac group may have one or more rings. The ENA driver + * refers to rings as queues, for no special reason other than it was + * the dominant language in the Linux and FreeBSD drivers, and it + * spilled over into this port. 
The upper bound on number of queues is + * presented by the device. However, we don't just go with whatever + * number of queues the device reports; but rather we limit the queues + * based on other factors such as an absolute maximum, number of + * online CPUs, and number of available interrupts. The upper bound is + * calculated by ena_set_max_io_queues(), and that is used and + * possibly further restricted in ena_attach_intr_alloc(). As this + * point, ultimately, it is the number of available interrupts (minus + * one for the admin queue) that determines the number of queues: one + * Tx and one Rx on each I/O interrupt. + * + * NOTE: Perhaps it is overly restrictive to limit the number of + * queues to the number of I/O interrupts. Something worth considering + * on larger instances if they present far less interrupts than they + * do queues + CPUs. + * + * The ENA device presents MSI-X interrupts only. During attach the + * driver queries the number of available interrupts and sets aside + * one for admin/AENQ (vector 0) and the rest for I/O (vector 1 to N). + * This means that a Tx/Rx queue at index 0 will map to vector 1, and + * so on. + * + * NOTE: The ENA driver currently doesn't make use of the Admin Queue + * interrupt. This interrupt is used to notify a the driver that a + * command response is read. The ENA driver always polls the Admin + * Queue for responses. + * + * Tx Queue Workings + * ----------------- + * + * A single Tx queue (ena_txq_t) is made up of one submission queue + * (SQ) and its paired completion queue (CQ). These two queues form a + * logical descriptor ring which is used to send packets out of the + * device -- where each SQ entry describes the packet to be sent + * (enahw_tx_desc_t) and each CQ entry describes the result of sending + * a packet (enahw_tx_cdesc_t). For this to work the host and device + * must agree on which descriptors are currently owned by the host + * (free for sending) and which are owned by the device (pending + * device completion). This state is tracked on the host side via head + * and tail indexes along with a phase value. + * + * The head and tail values represent the head and tail of the FIFO + * queue of pending packets -- the next packet to be sent by the + * device is head, and all descriptors up to tail are ready for + * sending. The phase allows the host to determine which CQ + * descriptors represent completed events when using per-SQ completion + * events (as opposed to queue head pointer updates). As the queues + * represent a logical ring buffer, the phase must alternate on + * wrap-around. The device initializes the phase to zero, and the host + * starts with a phase of 1. The first packet descriptor writes, and + * their corresponding completions, are indicated with a phase of 1. + * + * + * For example, the diagram below represents the SQ/CQ state after the + * first 6 packets have been sent by the host and 2 of them have been + * completed by the device (and these completions have been processed + * by the driver). In this state the host could send 4 more packets + * before needing to wait on completion events. 
+ * + * + * +---+---+---+---+---+---+---+---+ + * SQ | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | phase = 1 + * +---+---+---+---+---+---+---+---+ + * ^ + * | + * tail + * head + * | + * v + * +---+---+---+---+---+---+---+---+ + * CQ | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | phase = 1 + * +---+---+---+---+---+---+---+---+ + * + * + * The next diagram shows how the state changes as 5 more packets are + * sent (for a total of 11) and 7 more are completed (for a total of + * 9). Notice that as the SQ and CQ have wrapped around their phases + * have been complemented. In this state the host could send 6 more + * packets before needing to wait on completion events. + * + * +---+---+---+---+---+---+---+---+ + * SQ | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | phase = 0 + * +---+---+---+---+---+---+---+---+ + * ^ + * | + * tail + * head + * | + * v + * +---+---+---+---+---+---+---+---+ + * CQ | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | phase = 0 + * +---+---+---+---+---+---+---+---+ + * + * + * Currently, all packets are copied for Tx. At ring start we allocate + * a Tx Control Buffer (TCB) for each queue descriptor. Each TCB has + * DMA buffer associated with it; and each buffer is large enough to + * hold the MTU. Therefore, Tx descriptors and TCBs currently have a + * 1:1 mapping. When a packet is sent, the mblk's buffer is copied to + * the TCB's DMA buffer, and a new descriptor is written to the SQ + * describing said TCB buffer. If and when we add more advanced + * features like DMA binding of mblks and TSO, this 1:1 guarantee will + * no longer hold. + * + * Rx Queue Workings + * ----------------- + * + * In terms of implementing the logical descriptor ring, the Rx queues + * are very much like the Tx queues. There is a paired SQ and CQ for + * each logical ring. The difference is that in Rx the SQ is for + * handing buffers to the device to fill, and the CQ is for describing + * the contents of those buffers for a given received frame. At Rx + * ring start we allocate a Rx Control Buffer (RCB) for each + * descriptor in the ring. Each RCB has a DMA buffer associated with + * it; and each buffer is large enough to hold the MTU. For each + * received frame we copy the contents out of the RCB and into its own + * mblk, immediately returning the RCB for reuse. As with Tx, this + * gives us a simple 1:1 mapping currently, but if more advanced + * features are implemented later this could change. + * + * Asynchronous Event Notification Queue (AENQ) + * -------------------------------------------- + * + * Each ENA device comes with a mechanism for sending out-of-band + * notifications to the driver. This includes events like link state + * changes, fatal errors, and a watchdog/keep alive signal. The AENQ + * delivery mechanism is via interrupt, handled by the ena_aenq_work() + * function, which dispatches via the eaenq_hdlrs table. If no handler + * is registered, the ena_aenq_default_hdlr() handler is used. A given + * device may not support all the different event types + * (enahw_aenq_groups_t); and the driver may choose to enable a subset + * of the supported events. During attach we call ena_setup_aenq() to + * negotiate the supported/enabled events. The enabled group is stored + * at ena_aenq_enabled_groups. + * + * Queues and Unsigned Wraparound + * ------------------------------ + * + * All the queues use a uint16_t value as their head/tail values, e.g. + * the Rx queue's er_cq_head_idx value. You might notice that we only + * ever increment these values, letting them perform implicit unsigned + * integer wraparound. 
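A minimal sketch of this head/tail/phase bookkeeping, using hypothetical names (cq_desc_t, DESC_PHASE, NUM_DESCS, handle_completion) rather than the driver's own types; the loop mirrors the pattern ena_aenq_work() uses later in this file, and the free-slot arithmetic reproduces the "4 more packets" figure from the first diagram:

#include <stdint.h>

/* Hypothetical stand-ins; the real driver keeps this state per queue. */
typedef struct { uint8_t flags; } cq_desc_t;
#define	DESC_PHASE(cd)	((cd)->flags & 0x1)
#define	NUM_DESCS	8	/* power of two, so masking works as modulo */

static cq_desc_t cq_descs[NUM_DESCS];
static uint16_t cq_head;	/* only ever incremented; wraps at 2^16 */
static uint16_t cq_phase = 1;	/* host starts at 1, complements on ring wrap */

static void
handle_completion(cq_desc_t *cd)
{
	(void) cd;	/* placeholder for per-completion work */
}

/* Walk the CQ, consuming every descriptor whose phase matches ours. */
static void
process_cq(void)
{
	for (;;) {
		uint16_t idx = cq_head & (NUM_DESCS - 1);
		cq_desc_t *cd = &cq_descs[idx];

		if (DESC_PHASE(cd) != cq_phase)
			break;		/* still owned by the device */

		handle_completion(cd);
		cq_head++;		/* implicit uint16_t wraparound */

		if ((cq_head & (NUM_DESCS - 1)) == 0)
			cq_phase = !cq_phase;
	}
}

/*
 * Free SQ slots: unsigned subtraction stays correct across wraparound.
 * With the first diagram above (tail = 6, head = 2, 8 descriptors) this
 * yields 4, matching the text.
 */
static uint16_t
sq_space_avail(uint16_t sq_tail, uint16_t sq_head)
{
	return (NUM_DESCS - (uint16_t)(sq_tail - sq_head));
}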
This is intended. This is the same behavior as + * the common code, and seems to be what the hardware expects. Of + * course, when accessing our own descriptor arrays we must make sure + * to first perform a modulo of this value or risk running off into + * space. + * + * Attach Sequencing + * ----------------- + * + * Most drivers implement their attach/detach/cleanup functions as a + * sequential stream of function calls used to allocate and initialize + * resources in an order determined by the device's programming manual + * combined with any requirements imposed by the kernel and its + * relevant modules. These functions can become quite long. It is + * often hard to see the order in which steps are taken, and even + * harder to tell if detach/cleanup undoes them in the correct order, + * or even if it undoes them at all! The only sure way to understand + * the flow is to take good notes while closely inspecting each line + * of code. Even then, it's easy for attach and detach to get out of + * sync. + * + * Some more recent drivers have improved on this situation by using a + * bit vector to track the sequence of events in attach/detach. Each + * bit is declared in as an enum value, in the same order it is + * expected attach would run, and thus detach would run in the exact + * opposite order. This has three main benefits: + * + * 1. It makes it easier to determine sequence order at a + * glance. + * + * 2. It gives a better idea of what state the device is in during + * debugging (the sequence bit vector is kept with the instance + * state). + * + * 3. The detach function can verify that all sequence bits are + * cleared, indicating that everything done in attach was + * successfully undone. + * + * These are great improvements. However, the attach/detach functions + * can still become unruly, and there is still no guarantee that + * detach is done in opposite order of attach (this is not always + * strictly required, but is probably the best way to write detach). + * There is still a lot of boilerplate and chance for programmer + * error. + * + * The ena driver takes the sequence idea a bit further, creating a + * descriptor table of the attach sequence (ena_attach_tbl). This + * table is used by attach/detach to generically, declaratively, and + * programmaticaly enforce the precise sequence order and verify that + * anything that is done is undone. This provides several benefits: + * + * o Correct order is enforced implicitly by the descriptor table. + * It is impossible for the detach sequence to run in any other + * order other than opposite that of attach. + * + * o It is obvious what the precise attach sequence is. While the + * bit vector enum helps a lot with this it doesn't prevent + * programmer error. With the sequence defined as a declarative + * table it makes it easy for the programmer to see the order and + * know it's followed exactly. + * + * o It is impossible to modify the attach sequence without also + * specifying a callback for its dual in the detach sequence. + * + * o Common and repetitive code like error checking, logging, and bit + * vector modification is eliminated and centralized, again + * reducing the chance of programmer error. + * + * The ena attach sequence is defined under ena_attach_seq_t. The + * descriptor table is defined under ena_attach_tbl. + */ + +/* + * These are some basic data layout invariants on which development + * assumptions where made. + */ +CTASSERT(sizeof (enahw_aenq_desc_t) == 64); +/* TODO: Why doesn't this work? 
*/ +/* CTASSERT(sizeof (enahw_tx_data_desc_t) == 64); */ +CTASSERT(sizeof (enahw_tx_data_desc_t) == sizeof (enahw_tx_meta_desc_t)); +CTASSERT(sizeof (enahw_tx_data_desc_t) == sizeof (enahw_tx_desc_t)); +CTASSERT(sizeof (enahw_tx_meta_desc_t) == sizeof (enahw_tx_desc_t)); +/* + * We add this here as an extra safety check to make sure that any + * addition to the AENQ group enum also updates the groups array num + * value. + */ +CTASSERT(ENAHW_AENQ_GROUPS_ARR_NUM == 6); + +/* + * Amazon does not specify the endianess of the ENA device. We assume + * it's the same as the bus, and we assume the CPU/bus is always + * little endian. + */ +#ifdef _BIG_ENDIAN +#error "ENA driver is little-endian only" +#endif + +/* + * These values are used to communicate the driver version to the AWS + * hypervisor via the ena_set_host_info() function. We don't know what + * exactly AWS does with this info, but it's fairly safe to assume + * it's used solely for debug/informational purposes. The Linux driver + * updates these values frequently as bugs are fixed and features are + * added. + */ +#define ENA_DRV_VER_MAJOR 1 +#define ENA_DRV_VER_MINOR 0 +#define ENA_DRV_VER_SUBMINOR 0 + +uint64_t ena_admin_cmd_timeout_ns = ENA_ADMIN_CMD_DEF_TIMEOUT; + +/* + * Log an error message. We leave the destination (console or system + * log) up to the caller + */ +void +ena_err(const ena_t *ena, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + if (ena != NULL && ena->ena_dip != NULL) { + vdev_err(ena->ena_dip, CE_WARN, fmt, ap); + } else { + vcmn_err(CE_WARN, fmt, ap); + } + va_end(ap); +} + +/* + * Set this to B_TRUE to enable debug messages. + */ +boolean_t ena_debug = B_FALSE; + +/* + * Log a debug message. We force all debug messages to go to the + * system log. + */ +void +ena_dbg(const ena_t *ena, const char *fmt, ...) 
+{ + va_list ap; + + if (ena_debug) { + char msg[1024]; + + va_start(ap, fmt); + (void) vsnprintf(msg, sizeof (msg), fmt, ap); + va_end(ap); + + if (ena != NULL && ena->ena_dip != NULL) { + dev_err(ena->ena_dip, CE_NOTE, "!%s", msg); + } else { + cmn_err(CE_NOTE, "!%s", msg); + } + } +} + +ena_aenq_grpstr_t ena_groups_str[ENAHW_AENQ_GROUPS_ARR_NUM] = { + { .eag_type = ENAHW_AENQ_GROUP_LINK_CHANGE, .eag_str = "LINK CHANGE" }, + { .eag_type = ENAHW_AENQ_GROUP_FATAL_ERROR, .eag_str = "FATAL ERROR" }, + { .eag_type = ENAHW_AENQ_GROUP_WARNING, .eag_str = "WARNING" }, + { + .eag_type = ENAHW_AENQ_GROUP_NOTIFICATION, + .eag_str = "NOTIFICATION" + }, + { .eag_type = ENAHW_AENQ_GROUP_KEEP_ALIVE, .eag_str = "KEEP ALIVE" }, + { + .eag_type = ENAHW_AENQ_GROUP_REFRESH_CAPABILITIES, + .eag_str = "REFRESH CAPABILITIES" + }, +}; + +void +ena_aenq_work(ena_t *ena) +{ + ena_aenq_t *aenq = &ena->ena_aenq; + uint16_t head_mod = aenq->eaenq_head & (aenq->eaenq_num_descs - 1); + boolean_t processed = B_FALSE; + enahw_aenq_desc_t *desc = &aenq->eaenq_descs[head_mod]; + uint64_t ts; + + ts = ((uint64_t)desc->ead_ts_high << 32) | (uint64_t)desc->ead_ts_low; + ENA_DMA_SYNC(aenq->eaenq_dma, DDI_DMA_SYNC_FORKERNEL); + + while (ENAHW_AENQ_DESC_PHASE(desc) == aenq->eaenq_phase) { + ena_aenq_hdlr_t hdlr; + + ASSERT3U(desc->ead_group, <, ENAHW_AENQ_GROUPS_ARR_NUM); + processed = B_TRUE; + ena_dbg(ena, "AENQ Group: (0x%x) %s Syndrome: 0x%x ts: %" PRIu64 + " us", desc->ead_group, + ena_groups_str[desc->ead_group].eag_str, desc->ead_syndrome, + ts); + + hdlr = ena->ena_aenq.eaenq_hdlrs[desc->ead_group]; + hdlr(ena, desc); + + aenq->eaenq_head++; + head_mod = aenq->eaenq_head & (aenq->eaenq_num_descs - 1); + + if (head_mod == 0) { + aenq->eaenq_phase = !aenq->eaenq_phase; + } + + desc = &aenq->eaenq_descs[head_mod]; + } + + if (processed) { + ena_hw_bar_write32(ena, ENAHW_REG_AENQ_HEAD_DB, + aenq->eaenq_head); + } +} + +/* + * Use for attach sequences which perform no resource allocation (or + * global state modification) and thus require no subsequent + * deallocation. 
+ */ +static void +ena_no_cleanup(ena_t *ena) +{ +} + +static boolean_t +ena_attach_pci(ena_t *ena) +{ + ddi_acc_handle_t hdl; + + if (pci_config_setup(ena->ena_dip, &hdl) != 0) { + return (B_FALSE); + } + + ena->ena_pci_hdl = hdl; + ena->ena_pci_vid = pci_config_get16(hdl, PCI_CONF_VENID); + ena->ena_pci_did = pci_config_get16(hdl, PCI_CONF_DEVID); + ena->ena_pci_rev = pci_config_get8(hdl, PCI_CONF_REVID); + ena->ena_pci_svid = pci_config_get16(hdl, PCI_CONF_SUBVENID); + ena->ena_pci_sdid = pci_config_get16(hdl, PCI_CONF_SUBSYSID); + ena_dbg(ena, "vid: 0x%x did: 0x%x rev: 0x%x svid: 0x%x sdid: 0x%x", + ena->ena_pci_vid, ena->ena_pci_did, ena->ena_pci_rev, + ena->ena_pci_svid, ena->ena_pci_sdid); + + return (B_TRUE); +} + +static void +ena_cleanup_pci(ena_t *ena) +{ + pci_config_teardown(&ena->ena_pci_hdl); +} + +static void +ena_cleanup_regs_map(ena_t *ena) +{ + ddi_regs_map_free(&ena->ena_reg_hdl); +} + +static boolean_t +ena_attach_regs_map(ena_t *ena) +{ + int ret = 0; + + if (ddi_dev_regsize(ena->ena_dip, ENA_REG_NUMBER, &ena->ena_reg_size) != + DDI_SUCCESS) { + ena_err(ena, "failed to get register set %d size", + ENA_REG_NUMBER); + return (B_FALSE); + } + + ena_dbg(ena, "register size: %ld", ena->ena_reg_size); + bzero(&ena->ena_reg_attr, sizeof (ena->ena_reg_attr)); + ena->ena_reg_attr.devacc_attr_version = DDI_DEVICE_ATTR_V1; + ena->ena_reg_attr.devacc_attr_endian_flags = DDI_NEVERSWAP_ACC; + ena->ena_reg_attr.devacc_attr_dataorder = DDI_STRICTORDER_ACC; + + /* + * This function can return several different failure values, + * so we make sure to capture its return value for the purpose + * of logging. + */ + ret = ddi_regs_map_setup(ena->ena_dip, ENA_REG_NUMBER, + &ena->ena_reg_base, 0, ena->ena_reg_size, &ena->ena_reg_attr, + &ena->ena_reg_hdl); + + if (ret != DDI_SUCCESS) { + ena_err(ena, "failed to map register set %d: %d", + ENA_REG_NUMBER, ret); + return (B_FALSE); + } + + ena_dbg(ena, "registers mapped to base: 0x%p", + (void *)ena->ena_reg_base); + + return (B_TRUE); +} + +/* + * Free any resources related to the admin submission queue. + */ +static void +ena_admin_sq_free(ena_t *ena) +{ + ena_dma_free(&ena->ena_aq.ea_sq.eas_dma); +} + +/* + * Initialize the admin submission queue. + */ +static boolean_t +ena_admin_sq_init(ena_t *ena) +{ + ena_adminq_t *aq = &ena->ena_aq; + ena_dma_buf_t *dma = &aq->ea_sq.eas_dma; + size_t size = aq->ea_qlen * sizeof (*aq->ea_sq.eas_entries); + uint32_t addr_low, addr_high, wval; + ena_dma_conf_t conf = { + .edc_size = size, + .edc_align = ENAHW_ADMIN_SQ_DESC_BUF_ALIGNMENT, + .edc_sgl = 1, + .edc_endian = DDI_NEVERSWAP_ACC, + .edc_stream = B_FALSE, + }; + + if (!ena_dma_alloc(ena, dma, &conf, size)) { + ena_err(ena, "failed to allocate DMA for Admin SQ"); + return (B_FALSE); + } + + aq->ea_sq.eas_entries = (void *)dma->edb_va; + aq->ea_sq.eas_tail = 0; + aq->ea_sq.eas_phase = 1; + aq->ea_sq.eas_dbaddr = + (uint32_t *)(ena->ena_reg_base + ENAHW_REG_ASQ_DB); + ENA_DMA_VERIFY_ADDR(ena, dma->edb_cookie->dmac_laddress); + addr_low = (uint32_t)(dma->edb_cookie->dmac_laddress); + addr_high = (uint32_t)(dma->edb_cookie->dmac_laddress >> 32); + ena_hw_bar_write32(ena, ENAHW_REG_ASQ_BASE_LO, addr_low); + ena_hw_bar_write32(ena, ENAHW_REG_ASQ_BASE_HI, addr_high); + wval = ENAHW_ASQ_CAPS_DEPTH(aq->ea_qlen) | + ENAHW_ASQ_CAPS_ENTRY_SIZE(sizeof (*aq->ea_sq.eas_entries)); + ena_hw_bar_write32(ena, ENAHW_REG_ASQ_CAPS, wval); + return (B_TRUE); +} + +/* + * Free any resources related to the admin completion queue. 
+ */ +static void +ena_admin_cq_free(ena_t *ena) +{ + ena_dma_free(&ena->ena_aq.ea_cq.eac_dma); +} + +/* + * Initialize the admin completion queue. + */ +static boolean_t +ena_admin_cq_init(ena_t *ena) +{ + ena_adminq_t *aq = &ena->ena_aq; + ena_dma_buf_t *dma = &aq->ea_cq.eac_dma; + size_t size = aq->ea_qlen * sizeof (*aq->ea_cq.eac_entries); + uint32_t addr_low, addr_high, wval; + ena_dma_conf_t conf = { + .edc_size = size, + .edc_align = ENAHW_ADMIN_CQ_DESC_BUF_ALIGNMENT, + .edc_sgl = 1, + .edc_endian = DDI_NEVERSWAP_ACC, + .edc_stream = B_FALSE, + }; + + if (!ena_dma_alloc(ena, dma, &conf, size)) { + ena_err(ena, "failed to allocate DMA for Admin CQ"); + return (B_FALSE); + } + + aq->ea_cq.eac_entries = (void *)dma->edb_va; + aq->ea_cq.eac_head = 0; + aq->ea_cq.eac_phase = 1; + ENA_DMA_VERIFY_ADDR(ena, dma->edb_cookie->dmac_laddress); + addr_low = (uint32_t)(dma->edb_cookie->dmac_laddress); + addr_high = (uint32_t)(dma->edb_cookie->dmac_laddress >> 32); + ena_hw_bar_write32(ena, ENAHW_REG_ACQ_BASE_LO, addr_low); + ena_hw_bar_write32(ena, ENAHW_REG_ACQ_BASE_HI, addr_high); + wval = ENAHW_ACQ_CAPS_DEPTH(aq->ea_qlen) | + ENAHW_ACQ_CAPS_ENTRY_SIZE(sizeof (*aq->ea_cq.eac_entries)); + ena_hw_bar_write32(ena, ENAHW_REG_ACQ_CAPS, wval); + return (B_TRUE); +} + +static void +ena_aenq_default_hdlr(void *data, enahw_aenq_desc_t *desc) +{ + ena_t *ena = data; + + ena->ena_aenq_stat.eaes_default.value.ui64++; + ena_dbg(ena, "unimplemented handler for aenq group: %s", + ena_groups_str[desc->ead_group].eag_str); +} + +static void +ena_aenq_link_change_hdlr(void *data, enahw_aenq_desc_t *desc) +{ + ena_t *ena = data; + boolean_t is_up = (desc->ead_payload.link_change.flags & + ENAHW_AENQ_LINK_CHANGE_LINK_STATUS_MASK) != 0; + + /* + * The interupts are not enabled until after we register mac, + * so the mac handle should be valid. + */ + ASSERT3U(ena->ena_attach_seq, >=, ENA_ATTACH_MAC_REGISTER); + ena->ena_aenq_stat.eaes_link_change.value.ui64++; + + mutex_enter(&ena->ena_lock); + + /* + * Notify mac only on an actual change in status. + */ + if (ena->ena_link_up != is_up) { + if (is_up) { + mac_link_update(ena->ena_mh, LINK_STATE_UP); + } else { + mac_link_update(ena->ena_mh, LINK_STATE_DOWN); + } + } + + ena->ena_link_up = is_up; + + mutex_exit(&ena->ena_lock); +} + +/* + * Free any resources related to the Async Event Notification Queue. + */ +static void +ena_aenq_free(ena_t *ena) +{ + ena_dma_free(&ena->ena_aenq.eaenq_dma); +} + +static void +ena_aenq_set_def_hdlrs(ena_aenq_t *aenq) +{ + aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_LINK_CHANGE] = ena_aenq_default_hdlr; + aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_FATAL_ERROR] = ena_aenq_default_hdlr; + aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_WARNING] = ena_aenq_default_hdlr; + aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_NOTIFICATION] = + ena_aenq_default_hdlr; + aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_KEEP_ALIVE] = ena_aenq_default_hdlr; + aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_REFRESH_CAPABILITIES] = + ena_aenq_default_hdlr; +} +/* + * Initialize the Async Event Notification Queue. 
+ */ +static boolean_t +ena_aenq_init(ena_t *ena) +{ + ena_aenq_t *aenq = &ena->ena_aenq; + size_t size; + uint32_t addr_low, addr_high, wval; + ena_dma_conf_t conf; + + aenq->eaenq_num_descs = ENA_AENQ_NUM_DESCS; + size = aenq->eaenq_num_descs * sizeof (*aenq->eaenq_descs); + + conf = (ena_dma_conf_t) { + .edc_size = size, + .edc_align = ENAHW_AENQ_DESC_BUF_ALIGNMENT, + .edc_sgl = 1, + .edc_endian = DDI_NEVERSWAP_ACC, + .edc_stream = B_FALSE, + }; + + if (!ena_dma_alloc(ena, &aenq->eaenq_dma, &conf, size)) { + ena_err(ena, "failed to allocate DMA for AENQ"); + return (B_FALSE); + } + + aenq->eaenq_descs = (void *)aenq->eaenq_dma.edb_va; + aenq->eaenq_head = 0; + aenq->eaenq_phase = 1; + bzero(aenq->eaenq_descs, size); + ena_aenq_set_def_hdlrs(aenq); + + aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_LINK_CHANGE] = + ena_aenq_link_change_hdlr; + + ENA_DMA_VERIFY_ADDR(ena, aenq->eaenq_dma.edb_cookie->dmac_laddress); + addr_low = (uint32_t)(aenq->eaenq_dma.edb_cookie->dmac_laddress); + addr_high = (uint32_t)(aenq->eaenq_dma.edb_cookie->dmac_laddress >> 32); + ena_hw_bar_write32(ena, ENAHW_REG_AENQ_BASE_LO, addr_low); + ena_hw_bar_write32(ena, ENAHW_REG_AENQ_BASE_HI, addr_high); + ENA_DMA_SYNC(aenq->eaenq_dma, DDI_DMA_SYNC_FORDEV); + wval = ENAHW_AENQ_CAPS_DEPTH(aenq->eaenq_num_descs) | + ENAHW_AENQ_CAPS_ENTRY_SIZE(sizeof (*aenq->eaenq_descs)); + ena_hw_bar_write32(ena, ENAHW_REG_AENQ_CAPS, wval); + return (B_TRUE); +} + +/* + * We limit the max number of I/O queues based on several aspects of + * the underlying hardware. + * + * 1. The absolute upper limit is set by ENAHW_MAX_NUM_IO_QUEUES, + * which comes from the common code and presumably is based on device + * constraints. + * + * 2. Next we latch the number of I/O queues to the number of online + * CPUs. The idea being that each queue is a parallel work stream, + * and having more queues than CPUs to flush them will not improve + * performance. The number of online CPUs can change dynamically, + * and that's okay, everything should still work fine, it just + * might not be ideal. + * + * 3. Next we latch the number of I/O queues to the smallest of the + * max Tx queues and max Rx queues. We could probably loosen this + * restriction in the future, and have separate max I/O queues for + * Tx and Rx. This is what Linux does, and seems like a fine place + * to start. + */ +static void +ena_set_max_io_queues(ena_t *ena) +{ + uint32_t max = ENAHW_MAX_NUM_IO_QUEUES; + + max = MIN(ncpus_online, max); + /* + * Supposedly a device could present a different number of SQs + * and CQs. This driver is desinged in a way that requires + * each SQ to have a corresponding and dedicated CQ (how would + * it work otherwise). Therefore, we must check both values + * and find the minimum between them. + */ + max = MIN(ena->ena_tx_max_sq_num, max); + max = MIN(ena->ena_tx_max_cq_num, max); + max = MIN(ena->ena_rx_max_sq_num, max); + max = MIN(ena->ena_rx_max_cq_num, max); + + + /* This shouldn't happen, but just in case. */ + if (max == 0) { + max = 1; + } + + ena->ena_max_io_queues = max; +} + +/* + * We require that an Rx or Tx buffer be able to hold the maximum MTU + * along with the maximum frame header length. In this case we know + * ENA is presenting us an Ethernet frame so we add the size of an + * Ethernet VLAN header. Rx has the additional requirement of needing + * additional margin for the sake of IP header alignment. 
+ */ +static void +ena_update_buf_sizes(ena_t *ena) +{ + ena->ena_max_frame_hdr = sizeof (struct ether_vlan_header); + ena->ena_max_frame_total = ena->ena_max_frame_hdr + ena->ena_mtu; + ena->ena_tx_buf_sz = P2ROUNDUP_TYPED(ena->ena_max_frame_total, + ena->ena_page_sz, uint32_t); + ena->ena_rx_buf_sz = P2ROUNDUP_TYPED(ena->ena_max_frame_total + + ENA_RX_BUF_IPHDR_ALIGNMENT, ena->ena_page_sz, uint32_t); +} + +static boolean_t +ena_get_offloads(ena_t *ena) +{ + int ret = 0; + enahw_resp_desc_t resp; + enahw_feat_offload_t *feat = &resp.erd_resp.erd_get_feat.ergf_offload; + + ena->ena_tx_l3_ipv4_csum = B_FALSE; + + ena->ena_tx_l4_ipv4_part_csum = B_FALSE; + ena->ena_tx_l4_ipv4_full_csum = B_FALSE; + ena->ena_tx_l4_ipv4_lso = B_FALSE; + + ena->ena_tx_l4_ipv6_part_csum = B_FALSE; + ena->ena_tx_l4_ipv6_full_csum = B_FALSE; + ena->ena_tx_l4_ipv6_lso = B_FALSE; + + ena->ena_rx_l3_ipv4_csum = B_FALSE; + ena->ena_rx_l4_ipv4_csum = B_FALSE; + ena->ena_rx_l4_ipv6_csum = B_FALSE; + ena->ena_rx_hash = B_FALSE; + + bzero(&resp, sizeof (resp)); + ret = ena_get_feature(ena, &resp, ENAHW_FEAT_STATELESS_OFFLOAD_CONFIG, + ENAHW_FEAT_STATELESS_OFFLOAD_CONFIG_VER); + + if (ret == ENOTSUP) { + /* + * In this case the device does not support querying + * for hardware offloads. We take that as a sign that + * the device provides no offloads. + */ + return (B_TRUE); + } else if (ret != 0) { + ena_err(ena, "error getting stateless offload: %d", ret); + return (B_FALSE); + } + + ena->ena_tx_l3_ipv4_csum = ENAHW_FEAT_OFFLOAD_TX_L3_IPV4_CSUM(feat); + + ena->ena_tx_l4_ipv4_part_csum = + ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_PART(feat); + ena->ena_tx_l4_ipv4_full_csum = + ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_FULL(feat); + ena->ena_tx_l4_ipv4_lso = ENAHW_FEAT_OFFLOAD_TSO_IPV4(feat); + + ena->ena_tx_l4_ipv6_part_csum = + ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_PART(feat); + ena->ena_tx_l4_ipv6_full_csum = + ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_FULL(feat); + ena->ena_tx_l4_ipv6_lso = ENAHW_FEAT_OFFLOAD_TSO_IPV6(feat); + + ena->ena_rx_l3_ipv4_csum = ENAHW_FEAT_OFFLOAD_RX_L3_IPV4_CSUM(feat); + ena->ena_rx_l4_ipv4_csum = ENAHW_FEAT_OFFLOAD_RX_L4_IPV4_CSUM(feat); + ena->ena_rx_l4_ipv6_csum = ENAHW_FEAT_OFFLOAD_RX_L4_IPV6_CSUM(feat); + return (B_TRUE); +} + +static int +ena_get_prop(ena_t *ena, char *propname, const int minval, const int maxval, + const int defval) +{ + int value = ddi_prop_get_int(DDI_DEV_T_ANY, ena->ena_dip, + DDI_PROP_DONTPASS, propname, defval); + + if (value > maxval) { + ena_err(ena, "user value %s=%d exceeded maximum, setting to %d", + propname, value, maxval); + value = maxval; + } + + if (value < minval) { + ena_err(ena, "user value %s=%d below minimum, setting to %d", + propname, value, minval); + value = minval; + } + + return (value); +} + +static boolean_t +ena_set_mtu(ena_t *ena) +{ + int ret = 0; + enahw_cmd_desc_t cmd; + enahw_feat_mtu_t *feat = &cmd.ecd_cmd.ecd_set_feat.ecsf_feat.ecsf_mtu; + enahw_resp_desc_t resp; + + bzero(&cmd, sizeof (cmd)); + bzero(&resp, sizeof (resp)); + feat->efm_mtu = ena->ena_mtu; + + if ((ret = ena_set_feature(ena, &cmd, &resp, ENAHW_FEAT_MTU, + ENAHW_FEAT_MTU_VER)) != 0) { + ena_err(ena, "failed to set device MTU to %u: %d", ena->ena_mtu, + ret); + return (B_FALSE); + } + + return (B_TRUE); +} + +static void +ena_get_link_config(ena_t *ena) +{ + enahw_resp_desc_t resp; + enahw_feat_link_conf_t *feat = + &resp.erd_resp.erd_get_feat.ergf_link_conf; + boolean_t full_duplex; + + bzero(&resp, sizeof (resp)); + + if (ena_get_feature(ena, &resp, ENAHW_FEAT_LINK_CONFIG, + 
ENAHW_FEAT_LINK_CONFIG_VER) != 0) { + /* + * Some ENA devices do no support this feature. In + * those cases we report a 1Gbps link, full duplex. + * For the most accurate information on bandwidth + * limits see the official AWS documentation. + */ + ena->ena_link_speed_mbits = 1 * 1000 * 1000; + ena->ena_link_speeds = ENAHW_LINK_SPEED_1G; + ena->ena_link_duplex = LINK_DUPLEX_FULL; + ena->ena_link_autoneg = B_TRUE; + return; + } + + ena->ena_link_speed_mbits = feat->eflc_speed; + ena->ena_link_speeds = feat->eflc_supported; + full_duplex = ENAHW_FEAT_LINK_CONF_FULL_DUPLEX(feat); + ena->ena_link_duplex = full_duplex ? LINK_DUPLEX_FULL : + LINK_DUPLEX_HALF; + ena->ena_link_autoneg = ENAHW_FEAT_LINK_CONF_AUTONEG(feat); +} + +/* + * Retrieve all configuration values which are modifiable via + * ena.conf, and set ena_t members accordingly. While the conf values + * have priority, they may be implicitly modified by the driver to + * meet resource constraints on a given platform. If no value is + * specified in the conf file, the driver will attempt to use the + * largest value supported. While there should be no value large + * enough, keep in mind that ena_get_prop() will cast the values to an + * int. + * + * This function should be called after the device is initialized, + * admin queue is established, and the hardware features/capabs have + * been queried; it should be called before mac registration. + */ +static boolean_t +ena_attach_read_conf(ena_t *ena) +{ + uint32_t gcv; /* Greatest Common Value */ + + /* + * We expect that the queue lengths are the same for both the + * CQ and SQ, but technically the device could return + * different lengths. For now the driver locks them together. + */ + gcv = min(ena->ena_rx_max_sq_num_descs, ena->ena_rx_max_cq_num_descs); + ASSERT3U(gcv, <=, INT_MAX); + ena->ena_rxq_num_descs = ena_get_prop(ena, ENA_PROP_RXQ_NUM_DESCS, + ENA_PROP_RXQ_NUM_DESCS_MIN, gcv, gcv); + + ena->ena_rxq_intr_limit = ena_get_prop(ena, ENA_PROP_RXQ_INTR_LIMIT, + ENA_PROP_RXQ_INTR_LIMIT_MIN, ENA_PROP_RXQ_INTR_LIMIT_MAX, + ENA_PROP_RXQ_INTR_LIMIT_DEF); + + gcv = min(ena->ena_tx_max_sq_num_descs, ena->ena_tx_max_cq_num_descs); + ASSERT3U(gcv, <=, INT_MAX); + ena->ena_txq_num_descs = ena_get_prop(ena, ENA_PROP_TXQ_NUM_DESCS, + ENA_PROP_TXQ_NUM_DESCS_MIN, gcv, gcv); + + return (B_TRUE); +} + +/* + * Perform any necessary device configuration after the driver.conf + * has been read. + */ +static boolean_t +ena_attach_dev_cfg(ena_t *ena) +{ + ASSERT3U(ena->ena_attach_seq, >=, ENA_ATTACH_READ_CONF); + + if (!ena_set_mtu(ena)) { + /* + * We don't expect this to fail, but we try a fallback + * first before failing the attach sequence. 
+ */ + ena->ena_mtu = 1500; + ena_err(ena, "trying fallback MTU: %u", ena->ena_mtu); + + if (!ena_set_mtu(ena)) { + return (B_FALSE); + } + } + + return (B_TRUE); +} + +static boolean_t +ena_check_versions(ena_t *ena) +{ + uint32_t dev_vsn = ena_hw_bar_read32(ena, ENAHW_REG_VERSION); + uint32_t ctrl_vsn = + ena_hw_bar_read32(ena, ENAHW_REG_CONTROLLER_VERSION); + + ena->ena_dev_major_vsn = ENAHW_DEV_MAJOR_VSN(dev_vsn); + ena->ena_dev_minor_vsn = ENAHW_DEV_MINOR_VSN(dev_vsn); + + ena->ena_ctrl_major_vsn = ENAHW_CTRL_MAJOR_VSN(ctrl_vsn); + ena->ena_ctrl_minor_vsn = ENAHW_CTRL_MINOR_VSN(ctrl_vsn); + ena->ena_ctrl_subminor_vsn = ENAHW_CTRL_SUBMINOR_VSN(ctrl_vsn); + ena->ena_ctrl_impl_id = ENAHW_CTRL_IMPL_ID(ctrl_vsn); + + if (ena->ena_ctrl_subminor_vsn < ENA_CTRL_SUBMINOR_VSN_MIN) { + ena_err(ena, "unsupported controller version: %u.%u.%u", + ena->ena_ctrl_major_vsn, ena->ena_ctrl_minor_vsn, + ena->ena_ctrl_subminor_vsn); + return (B_FALSE); + } + + return (B_TRUE); +} + +boolean_t +ena_setup_aenq(ena_t *ena) +{ + enahw_cmd_desc_t cmd; + enahw_feat_aenq_t *cmd_feat = + &cmd.ecd_cmd.ecd_set_feat.ecsf_feat.ecsf_aenq; + enahw_resp_desc_t resp; + enahw_feat_aenq_t *resp_feat = &resp.erd_resp.erd_get_feat.ergf_aenq; + enahw_aenq_groups_t to_enable; + + bzero(&resp, sizeof (resp)); + if (ena_get_feature(ena, &resp, ENAHW_FEAT_AENQ_CONFIG, + ENAHW_FEAT_AENQ_CONFIG_VER) != 0) { + return (B_FALSE); + } + + to_enable = BIT(ENAHW_AENQ_GROUP_LINK_CHANGE) | + BIT(ENAHW_AENQ_GROUP_FATAL_ERROR) | + BIT(ENAHW_AENQ_GROUP_WARNING) | + BIT(ENAHW_AENQ_GROUP_NOTIFICATION); + to_enable &= resp_feat->efa_supported_groups; + + bzero(&cmd, sizeof (cmd)); + bzero(&resp, sizeof (cmd)); + cmd_feat->efa_enabled_groups = to_enable; + + if (ena_set_feature(ena, &cmd, &resp, ENAHW_FEAT_AENQ_CONFIG, + ENAHW_FEAT_AENQ_CONFIG_VER) != 0) { + return (B_FALSE); + } + + bzero(&resp, sizeof (resp)); + if (ena_get_feature(ena, &resp, ENAHW_FEAT_AENQ_CONFIG, + ENAHW_FEAT_AENQ_CONFIG_VER) != 0) { + return (B_FALSE); + } + + ena->ena_aenq_supported_groups = resp_feat->efa_supported_groups; + ena->ena_aenq_enabled_groups = resp_feat->efa_enabled_groups; + + for (uint_t i = 0; i < ENAHW_AENQ_GROUPS_ARR_NUM; i++) { + ena_aenq_grpstr_t *grpstr = &ena_groups_str[i]; + boolean_t supported = BIT(grpstr->eag_type) & + resp_feat->efa_supported_groups; + boolean_t enabled = BIT(grpstr->eag_type) & + resp_feat->efa_enabled_groups; + + ena_dbg(ena, "%s supported: %s enabled: %s", grpstr->eag_str, + supported ? "Y" : "N", enabled ? "Y" : "N"); + } + + return (B_TRUE); +} + +/* + * Free all resources allocated as part of ena_device_init(). 
+ */ +static void +ena_cleanup_device_init(ena_t *ena) +{ + ena_adminq_t *aq = &ena->ena_aq; + + ena_free_host_info(ena); + mutex_destroy(&aq->ea_sq_lock); + mutex_destroy(&aq->ea_cq_lock); + mutex_destroy(&aq->ea_stat_lock); + list_destroy(&aq->ea_cmd_ctxs_free); + kmem_free(aq->ea_cmd_ctxs, sizeof (ena_cmd_ctx_t) * aq->ea_qlen); + ena_admin_sq_free(ena); + ena_admin_cq_free(ena); + ena_aenq_free(ena); + ena_stat_device_basic_cleanup(ena); + ena_stat_device_extended_cleanup(ena); + ena_stat_aenq_cleanup(ena); +} + +static boolean_t +ena_attach_device_init(ena_t *ena) +{ + ena_adminq_t *aq = &ena->ena_aq; + uint32_t rval, wval; + uint8_t dma_width; + hrtime_t timeout, cmd_timeout; + hrtime_t expired; + enahw_resp_desc_t resp; + enahw_feat_dev_attr_t *feat = &resp.erd_resp.erd_get_feat.ergf_dev_attr; + uint8_t *maddr; + uint32_t supported_features; + int ret = 0; + + rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS); + if ((rval & ENAHW_DEV_STS_READY_MASK) == 0) { + ena_err(ena, "device is not ready"); + return (B_FALSE); + } + + rval = ena_hw_bar_read32(ena, ENAHW_REG_CAPS); + + /* + * The device stores the reset timeout at 100ms resolution; we + * normalize that to nanoseconds. + */ + timeout = MSEC2NSEC(ENAHW_CAPS_RESET_TIMEOUT(rval) * 100); + + if (timeout == 0) { + ena_err(ena, "device gave invalid reset timeout"); + return (B_FALSE); + } + + expired = gethrtime() + timeout; + + wval = ENAHW_DEV_CTL_DEV_RESET_MASK; + wval |= (ENAHW_RESET_NORMAL << ENAHW_DEV_CTL_RESET_REASON_SHIFT) & + ENAHW_DEV_CTL_RESET_REASON_MASK; + ena_hw_bar_write32(ena, ENAHW_REG_DEV_CTL, wval); + + /* + * Make sure reset is in progress. + */ + while (1) { + rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS); + + if ((rval & ENAHW_DEV_STS_RESET_IN_PROGRESS_MASK) != 0) { + break; + } + + if (gethrtime() > expired) { + ena_err(ena, "device reset start timed out"); + return (B_FALSE); + } + + /* Sleep for 100 milliseconds. */ + delay(drv_usectohz(100 * 1000)); + } + + /* + * Reset the timeout counter for the next device request. + */ + expired = gethrtime() + timeout; + + /* + * Wait for the device reset to finish. + */ + ena_hw_bar_write32(ena, ENAHW_REG_DEV_CTL, 0); + while (1) { + rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS); + + if ((rval & ENAHW_DEV_STS_RESET_IN_PROGRESS_MASK) == 0) { + break; + } + + if (gethrtime() > expired) { + ena_err(ena, "device reset timed out"); + return (B_FALSE); + } + + /* Sleep for 100 milliseconds. */ + delay(drv_usectohz(100 * 1000)); + } + + if (!ena_check_versions(ena)) { + return (B_FALSE); + } + + rval = ena_hw_bar_read32(ena, ENAHW_REG_CAPS); + dma_width = ENAHW_CAPS_DMA_ADDR_WIDTH(rval); + ena->ena_dma_width = dma_width; + + /* + * As we are not using an interrupt for admin queue completion + * signaling, we do not need a priority on these mutexes. If + * that changes, we will have to rejigger some code to create + * the admin queue interrupt before this function. 
+ */ + mutex_init(&aq->ea_sq_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&aq->ea_cq_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&aq->ea_stat_lock, NULL, MUTEX_DRIVER, NULL); + aq->ea_qlen = ENA_ADMINQ_DEPTH; + aq->ea_pending_cmds = 0; + + aq->ea_cmd_ctxs = kmem_zalloc(sizeof (ena_cmd_ctx_t) * aq->ea_qlen, + KM_SLEEP); + list_create(&aq->ea_cmd_ctxs_free, sizeof (ena_cmd_ctx_t), + offsetof(ena_cmd_ctx_t, ectx_node)); + + for (uint_t i = 0; i < aq->ea_qlen; i++) { + ena_cmd_ctx_t *ctx = &aq->ea_cmd_ctxs[i]; + + ctx->ectx_id = i; + ctx->ectx_pending = B_FALSE; + ctx->ectx_cmd_opcode = ENAHW_CMD_NONE; + ctx->ectx_resp = NULL; + list_insert_tail(&aq->ea_cmd_ctxs_free, ctx); + } + + /* + * The value stored in the device register is in the + * resolution of 100 milliseconds. We normalize that to + * nanoseconds. + */ + cmd_timeout = MSEC2NSEC(ENAHW_CAPS_ADMIN_CMD_TIMEOUT(rval) * 100); + aq->ea_cmd_timeout_ns = max(cmd_timeout, ena_admin_cmd_timeout_ns); + + if (aq->ea_cmd_timeout_ns == 0) { + aq->ea_cmd_timeout_ns = ENA_ADMIN_CMD_DEF_TIMEOUT; + } + + if (!ena_admin_sq_init(ena)) { + return (B_FALSE); + } + + if (!ena_admin_cq_init(ena)) { + return (B_FALSE); + } + + if (!ena_aenq_init(ena)) { + return (B_FALSE); + } + + /* + * While the Linux driver prefers to use interrupts to deliver + * admin queue completions, we just poll -- it seems to work + * just fine. + */ + ena_hw_bar_write32(ena, ENAHW_REG_INTERRUPT_MASK, 0); + aq->ea_poll_mode = B_TRUE; + + bzero(&resp, sizeof (resp)); + ret = ena_get_feature(ena, &resp, ENAHW_FEAT_DEVICE_ATTRIBUTES, + ENAHW_FEAT_DEVICE_ATTRIBUTES_VER); + + if (ret != 0) { + ena_err(ena, "failed to get device attributes: %d", ret); + return (B_FALSE); + } + + ena_dbg(ena, "impl ID: %u", feat->efda_impl_id); + ena_dbg(ena, "device version: %u", feat->efda_device_version); + ena_dbg(ena, "supported features: 0x%x", + feat->efda_supported_features); + ena_dbg(ena, "phys addr width: %u", feat->efda_phys_addr_width); + ena_dbg(ena, "virt addr width: %u", feat->efda_virt_addr_with); + maddr = feat->efda_mac_addr; + ena_dbg(ena, "mac addr: %x:%x:%x:%x:%x:%x", maddr[0], maddr[1], + maddr[2], maddr[3], maddr[4], maddr[5]); + ena_dbg(ena, "max MTU: %u", feat->efda_max_mtu); + + bcopy(maddr, ena->ena_mac_addr, ETHERADDRL); + ena->ena_max_mtu = feat->efda_max_mtu; + supported_features = feat->efda_supported_features; + ena->ena_supported_features = supported_features; + feat = NULL; + bzero(&resp, sizeof (resp)); + + if (supported_features & BIT(ENAHW_FEAT_MAX_QUEUES_EXT)) { + enahw_feat_max_queue_ext_t *feat_mqe = + &resp.erd_resp.erd_get_feat.ergf_max_queue_ext; + + ret = ena_get_feature(ena, &resp, ENAHW_FEAT_MAX_QUEUES_EXT, + ENAHW_FEAT_MAX_QUEUES_EXT_VER); + + if (ret != 0) { + ena_err(ena, "failed to query max queues ext: %d", ret); + return (B_FALSE); + } + + ena->ena_tx_max_sq_num = feat_mqe->efmqe_max_tx_sq_num; + ena->ena_tx_max_sq_num_descs = feat_mqe->efmqe_max_tx_sq_depth; + ena->ena_tx_max_cq_num = feat_mqe->efmqe_max_tx_cq_num; + ena->ena_tx_max_cq_num_descs = feat_mqe->efmqe_max_tx_cq_depth; + ena->ena_tx_max_desc_per_pkt = + feat_mqe->efmqe_max_per_packet_tx_descs; + ena->ena_tx_max_hdr_len = feat_mqe->efmqe_max_tx_header_size; + + ena->ena_rx_max_sq_num = feat_mqe->efmqe_max_rx_sq_num; + ena->ena_rx_max_sq_num_descs = feat_mqe->efmqe_max_rx_sq_depth; + ena->ena_rx_max_cq_num = feat_mqe->efmqe_max_rx_cq_num; + ena->ena_rx_max_cq_num_descs = feat_mqe->efmqe_max_rx_cq_depth; + ena->ena_rx_max_desc_per_pkt = + feat_mqe->efmqe_max_per_packet_rx_descs; + + 
ena_set_max_io_queues(ena); + } else { + enahw_feat_max_queue_t *feat_mq = + &resp.erd_resp.erd_get_feat.ergf_max_queue; + + ret = ena_get_feature(ena, &resp, ENAHW_FEAT_MAX_QUEUES_NUM, + ENAHW_FEAT_MAX_QUEUES_NUM_VER); + + if (ret != 0) { + ena_err(ena, "failed to query max queues: %d", ret); + return (B_FALSE); + } + + ena->ena_tx_max_sq_num = feat_mq->efmq_max_sq_num; + ena->ena_tx_max_sq_num_descs = feat_mq->efmq_max_sq_depth; + ena->ena_tx_max_cq_num = feat_mq->efmq_max_cq_num; + ena->ena_tx_max_cq_num_descs = feat_mq->efmq_max_cq_depth; + ena->ena_tx_max_desc_per_pkt = + feat_mq->efmq_max_per_packet_tx_descs; + ena->ena_tx_max_hdr_len = feat_mq->efmq_max_header_size; + + ena->ena_rx_max_sq_num = feat_mq->efmq_max_sq_num; + ena->ena_rx_max_sq_num_descs = feat_mq->efmq_max_sq_depth; + ena->ena_rx_max_cq_num = feat_mq->efmq_max_cq_num; + ena->ena_rx_max_cq_num_descs = feat_mq->efmq_max_cq_depth; + ena->ena_rx_max_desc_per_pkt = + feat_mq->efmq_max_per_packet_rx_descs; + + ena_set_max_io_queues(ena); + } + + ena->ena_mtu = ena->ena_max_mtu; + ena_update_buf_sizes(ena); + /* + * We could use ENAHW_FEAT_HW_HINTS to determine actual SGL + * sizes, for now we just force everything to use one + * segment. + */ + ena->ena_tx_sgl_max_sz = 1; + ena->ena_rx_sgl_max_sz = 1; + + if (!ena_init_host_info(ena)) { + return (B_FALSE); + } + + if (!ena_setup_aenq(ena)) { + return (B_FALSE); + } + + ena_get_link_config(ena); + + if (!ena_get_offloads(ena)) { + return (B_FALSE); + } + + if (!ena_stat_device_basic_init(ena)) { + return (B_FALSE); + } + + if (!ena_stat_device_extended_init(ena)) { + return (B_FALSE); + } + + if (!ena_stat_aenq_init(ena)) { + return (B_FALSE); + } + + return (B_TRUE); +} + +static void +ena_cleanup_intr_alloc(ena_t *ena) +{ + for (int i = 0; i < ena->ena_num_intrs; i++) { + int ret = ddi_intr_free(ena->ena_intr_handles[i]); + if (ret != DDI_SUCCESS) { + ena_err(ena, "failed to free interrupt %d: %d", i, ret); + } + } + + if (ena->ena_intr_handles != NULL) { + kmem_free(ena->ena_intr_handles, ena->ena_intr_handles_sz); + ena->ena_intr_handles = NULL; + ena->ena_intr_handles_sz = 0; + } +} + +/* + * The Linux driver supports only MSI-X interrupts. We do the same, + * with the assumption that it's the only type of interrupt the device + * can present. + */ +static boolean_t +ena_attach_intr_alloc(ena_t *ena) +{ + int ret; + int types; + int min, req, ideal, avail, actual; + + ret = ddi_intr_get_supported_types(ena->ena_dip, &types); + if (ret != DDI_SUCCESS) { + ena_err(ena, "failed to get interrupt types: %d", ret); + return (B_FALSE); + } + + ena_dbg(ena, "supported interrupt types: 0x%x", types); + if ((types & DDI_INTR_TYPE_MSIX) == 0) { + ena_err(ena, "the ena driver only supports MSI-X interrupts"); + return (B_FALSE); + } + + /* One for I/O, one for adminq. 
*/ + min = 2; + ideal = ena->ena_max_io_queues + 1; + ret = ddi_intr_get_nintrs(ena->ena_dip, DDI_INTR_TYPE_MSIX, &avail); + if (ret != DDI_SUCCESS) { + ena_err(ena, "failed to get number of MSI-X interrupts: %d", + ret); + return (B_FALSE); + } + + if (avail < min) { + ena_err(ena, "number of MSI-X interrupts is %d, but the driver " + "requires a minimum of %d", avail, min); + return (B_FALSE); + } + + ena_dbg(ena, "%d MSI-X interrupts available", avail); + + ret = ddi_intr_get_navail(ena->ena_dip, DDI_INTR_TYPE_MSIX, &avail); + if (ret != DDI_SUCCESS) { + ena_err(ena, "failed to get available interrupts: %d", ret); + return (B_FALSE); + } + + if (avail < min) { + ena_err(ena, "number of available MSI-X interrupts is %d, " + "but the driver requires a minimum of %d", avail, min); + return (B_FALSE); + } + + req = MIN(ideal, avail); + ena->ena_intr_handles_sz = req * sizeof (ddi_intr_handle_t); + ena->ena_intr_handles = kmem_zalloc(ena->ena_intr_handles_sz, KM_SLEEP); + + ret = ddi_intr_alloc(ena->ena_dip, ena->ena_intr_handles, + DDI_INTR_TYPE_MSIX, 0, req, &actual, DDI_INTR_ALLOC_NORMAL); + if (ret != DDI_SUCCESS) { + ena_err(ena, "failed to allocate %d MSI-X interrupts: %d", + req, ret); + return (B_FALSE); + } + + if (actual < min) { + ena_err(ena, "number of allocated interrupts is %d, but the " + "driver requires a minimum of %d", actual, min); + return (B_FALSE); + } + + ena->ena_num_intrs = actual; + + ret = ddi_intr_get_cap(ena->ena_intr_handles[0], &ena->ena_intr_caps); + if (ret != DDI_SUCCESS) { + ena_err(ena, "failed to get interrupt capability: %d", ret); + return (B_FALSE); + } + + ret = ddi_intr_get_pri(ena->ena_intr_handles[0], &ena->ena_intr_pri); + if (ret != DDI_SUCCESS) { + ena_err(ena, "failed to get interrupt priority: %d", ret); + return (B_FALSE); + } + + ena_dbg(ena, "MSI-X interrupts allocated: %d, cap: 0x%x, pri: %u", + actual, ena->ena_intr_caps, ena->ena_intr_pri); + + /* + * The ena_lock should not be held in the datapath, but it is + * held as part of the AENQ handler, which runs in interrupt + * context. Therefore, we delayed the initilization of this + * mutex until after the interrupts are allocated. + */ + mutex_init(&ena->ena_lock, NULL, MUTEX_DRIVER, + DDI_INTR_PRI(ena->ena_intr_pri)); + + return (B_TRUE); +} + +/* + * Allocate the parent Rx queue structures. More importantly, this is + * NOT allocating the queue descriptors or data buffers. Those are + * allocated on demand as queues are started. + */ +static boolean_t +ena_attach_alloc_rxqs(ena_t *ena) +{ + /* We rely on the interrupt priority for initializing the mutexes. */ + VERIFY3U(ena->ena_attach_seq, >=, ENA_ATTACH_INTR_ALLOC); + ena->ena_num_rxqs = ena->ena_num_intrs - 1; + ASSERT3U(ena->ena_num_rxqs, >, 0); + ena->ena_rxqs = kmem_zalloc(ena->ena_num_rxqs * sizeof (*ena->ena_rxqs), + KM_SLEEP); + + for (uint_t i = 0; i < ena->ena_num_rxqs; i++) { + ena_rxq_t *rxq = &ena->ena_rxqs[i]; + + rxq->er_rxqs_idx = i; + /* The 0th vector is for Admin + AENQ. 
*/ + rxq->er_intr_vector = i + 1; + rxq->er_mrh = NULL; + + mutex_init(&rxq->er_lock, NULL, MUTEX_DRIVER, + DDI_INTR_PRI(ena->ena_intr_pri)); + mutex_init(&rxq->er_stat_lock, NULL, MUTEX_DRIVER, + DDI_INTR_PRI(ena->ena_intr_pri)); + + rxq->er_ena = ena; + rxq->er_sq_num_descs = ena->ena_rxq_num_descs; + rxq->er_cq_num_descs = ena->ena_rxq_num_descs; + + if (!ena_stat_rxq_init(rxq)) { + return (B_FALSE); + } + + if (!ena_alloc_rxq(rxq)) { + return (B_FALSE); + } + } + + return (B_TRUE); +} + +static void +ena_cleanup_rxqs(ena_t *ena) +{ + for (uint_t i = 0; i < ena->ena_num_rxqs; i++) { + ena_rxq_t *rxq = &ena->ena_rxqs[i]; + + ena_cleanup_rxq(rxq); + mutex_destroy(&rxq->er_lock); + mutex_destroy(&rxq->er_stat_lock); + ena_stat_rxq_cleanup(rxq); + } + + kmem_free(ena->ena_rxqs, ena->ena_num_rxqs * sizeof (*ena->ena_rxqs)); +} + +/* + * Allocate the parent Tx queue structures. More importantly, this is + * NOT allocating the queue descriptors or data buffers. Those are + * allocated on demand as a queue is started. + */ +static boolean_t +ena_attach_alloc_txqs(ena_t *ena) +{ + /* We rely on the interrupt priority for initializing the mutexes. */ + VERIFY3U(ena->ena_attach_seq, >=, ENA_ATTACH_INTR_ALLOC); + ena->ena_num_txqs = ena->ena_num_intrs - 1; + ASSERT3U(ena->ena_num_txqs, >, 0); + ena->ena_txqs = kmem_zalloc(ena->ena_num_txqs * sizeof (*ena->ena_txqs), + KM_SLEEP); + + for (uint_t i = 0; i < ena->ena_num_txqs; i++) { + ena_txq_t *txq = &ena->ena_txqs[i]; + + txq->et_txqs_idx = i; + /* The 0th vector is for Admin + AENQ. */ + txq->et_intr_vector = i + 1; + txq->et_mrh = NULL; + + mutex_init(&txq->et_lock, NULL, MUTEX_DRIVER, + DDI_INTR_PRI(ena->ena_intr_pri)); + mutex_init(&txq->et_stat_lock, NULL, MUTEX_DRIVER, + DDI_INTR_PRI(ena->ena_intr_pri)); + + txq->et_ena = ena; + txq->et_sq_num_descs = ena->ena_txq_num_descs; + txq->et_cq_num_descs = ena->ena_txq_num_descs; + + if (!ena_stat_txq_init(txq)) { + return (B_FALSE); + } + + if (!ena_alloc_txq(txq)) { + return (B_FALSE); + } + } + + return (B_TRUE); +} + +static void +ena_cleanup_txqs(ena_t *ena) +{ + for (uint_t i = 0; i < ena->ena_num_rxqs; i++) { + ena_txq_t *txq = &ena->ena_txqs[i]; + + ena_cleanup_txq(txq); + mutex_destroy(&txq->et_lock); + mutex_destroy(&txq->et_stat_lock); + ena_stat_txq_cleanup(txq); + } + + kmem_free(ena->ena_txqs, ena->ena_num_txqs * sizeof (*ena->ena_txqs)); +} + +ena_attach_desc_t ena_attach_tbl[ENA_ATTACH_NUM_ENTRIES] = { + { + .ead_seq = ENA_ATTACH_PCI, + .ead_name = "PCI config", + .ead_attach_fn = ena_attach_pci, + .ead_attach_hard_fail = B_TRUE, + .ead_cleanup_fn = ena_cleanup_pci, + }, + + { + .ead_seq = ENA_ATTACH_REGS, + .ead_name = "BAR mapping", + .ead_attach_fn = ena_attach_regs_map, + .ead_attach_hard_fail = B_TRUE, + .ead_cleanup_fn = ena_cleanup_regs_map, + }, + + { + .ead_seq = ENA_ATTACH_DEV_INIT, + .ead_name = "device initialization", + .ead_attach_fn = ena_attach_device_init, + .ead_attach_hard_fail = B_TRUE, + .ead_cleanup_fn = ena_cleanup_device_init, + }, + + { + .ead_seq = ENA_ATTACH_READ_CONF, + .ead_name = "ena.conf", + .ead_attach_fn = ena_attach_read_conf, + .ead_attach_hard_fail = B_TRUE, + .ead_cleanup_fn = ena_no_cleanup, + }, + + { + .ead_seq = ENA_ATTACH_DEV_CFG, + .ead_name = "device config", + .ead_attach_fn = ena_attach_dev_cfg, + .ead_attach_hard_fail = B_TRUE, + .ead_cleanup_fn = ena_no_cleanup, + }, + + { + .ead_seq = ENA_ATTACH_INTR_ALLOC, + .ead_name = "interrupt allocation", + .ead_attach_fn = ena_attach_intr_alloc, + .ead_attach_hard_fail = B_TRUE, + 
.ead_cleanup_fn = ena_cleanup_intr_alloc, + }, + + { + .ead_seq = ENA_ATTACH_INTR_HDLRS, + .ead_name = "interrupt handlers", + .ead_attach_fn = ena_intr_add_handlers, + .ead_attach_hard_fail = B_TRUE, + .ead_cleanup_fn = ena_intr_remove_handlers, + }, + + { + .ead_seq = ENA_ATTACH_TXQS_ALLOC, + .ead_name = "Tx queues", + .ead_attach_fn = ena_attach_alloc_txqs, + .ead_attach_hard_fail = B_TRUE, + .ead_cleanup_fn = ena_cleanup_txqs, + }, + + { + .ead_seq = ENA_ATTACH_RXQS_ALLOC, + .ead_name = "Rx queues", + .ead_attach_fn = ena_attach_alloc_rxqs, + .ead_attach_hard_fail = B_TRUE, + .ead_cleanup_fn = ena_cleanup_rxqs, + }, + + /* + * The chance of mac_unregister() failure poses a problem to + * cleanup. We address interrupt disablement and mac + * unregistration explicitly in the attach/detach routines. + */ + { + .ead_seq = ENA_ATTACH_MAC_REGISTER, + .ead_name = "mac registration", + .ead_attach_fn = ena_mac_register, + .ead_attach_hard_fail = B_TRUE, + .ead_cleanup_fn = ena_no_cleanup, + }, + + { + .ead_seq = ENA_ATTACH_INTRS_ENABLE, + .ead_name = "enable interrupts", + .ead_attach_fn = ena_intrs_enable, + .ead_attach_hard_fail = B_TRUE, + .ead_cleanup_fn = ena_no_cleanup, + } +}; + +/* + * This function undoes any work done by ena_attach(), either in + * response to a failed attach or a planned detach. At the end of this + * function ena_attach_seq should be zero, otherwise it means + * something has not be freed/uninitialized. + */ +static void +ena_cleanup(ena_t *ena) +{ + if (ena == NULL || ena->ena_attach_seq == 0) { + return; + } + + /* + * We VERIFY this because if the seq is greater than entries + * we drift into space and execute god knows what. + */ + VERIFY3U(ena->ena_attach_seq, <, ENA_ATTACH_NUM_ENTRIES); + + while (ena->ena_attach_seq > 0) { + int idx = ena->ena_attach_seq - 1; + ena_attach_desc_t *desc = &ena_attach_tbl[idx]; + + ena_dbg(ena, "running cleanup sequence: %s (%d)", + desc->ead_name, idx); + + desc->ead_cleanup_fn(ena); + ena->ena_attach_seq--; + } + + ASSERT3U(ena->ena_attach_seq, ==, 0); + mutex_destroy(&ena->ena_lock); +} + +static int +ena_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + ena_t *ena; + + if (cmd != DDI_ATTACH) { + return (DDI_FAILURE); + } + + ena = kmem_zalloc(sizeof (ena_t), KM_SLEEP); + ena->ena_instance = ddi_get_instance(dip); + ena->ena_dip = dip; + ena->ena_instance = ddi_get_instance(dip); + ena->ena_page_sz = ddi_ptob(dip, 1); + + for (int i = 0; i < ENA_ATTACH_NUM_ENTRIES; i++) { + boolean_t success; + ena_attach_desc_t *desc = &ena_attach_tbl[i]; + + ena_dbg(ena, "running attach sequence: %s (%d)", desc->ead_name, + i); + + if (!(success = desc->ead_attach_fn(ena))) { + ena_err(ena, "attach sequence failed: %s (%d)", + desc->ead_name, i); + + if (ena->ena_attach_seq == ENA_ATTACH_MAC_REGISTER) { + /* + * In this specific case + * ENA_ATTACH_INTRS_ENABLE has failed, + * and we may or may not be able to + * unregister the mac, depending on if + * something in userspace has created + * a client on top. + * + * NOTE: Something that would be nice + * to add to mac is the ability to + * register a provider separate from + * "publishing" it to the rest of the + * system. This would allow a driver + * to register its mac, do some + * additional work that might fail, + * and then unregister if that work + * fails without concern for any + * chance of failure when calling + * unregister. 
This would remove the + * complexity of the situation we are + * trying to address here, as we would + * know that until the mac has been + * "published", there is no chance for + * mac_unregister() to fail. + */ + if (ena_mac_unregister(ena) != 0) { + return (DDI_FAILURE); + } + + ena->ena_attach_seq--; + } else { + /* + * Since the ead_seq is predicated on + * successful ead_attach_fn we must + * run the specific cleanup handler + * before calling the global cleanup + * routine. This also means that all + * cleanup functions must be able to + * deal with partial success of the + * corresponding ead_attach_fn. + */ + desc->ead_cleanup_fn(ena); + } + + ena_cleanup(ena); + kmem_free(ena, sizeof (ena_t)); + return (DDI_FAILURE); + } + + if (success) { + ena_dbg(ena, "attach sequence completed: %s (%d)", + desc->ead_name, i); + } + + ena->ena_attach_seq = desc->ead_seq; + } + + /* + * Now that interrupts are enabled make sure to tell the + * device that all AENQ descriptors are ready for writing. + */ + ena_hw_bar_write32(ena, ENAHW_REG_AENQ_HEAD_DB, + ena->ena_aenq.eaenq_num_descs); + + ddi_set_driver_private(dip, ena); + return (DDI_SUCCESS); +} + +static int +ena_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + ena_t *ena = ddi_get_driver_private(dip); + + if (ena == NULL) { + return (DDI_FAILURE); + } + + /* + * Before we can proceed to cleanup we have to treat + * mac_unregister() explicitly -- if there are still + * outstanding clients, then we can't proceed with detach or + * cleanup. + */ + + /* + * Why this would fail I don't know, but if we proceed to mac + * unregister, then there is a good chance we will panic in + * the Rx interrupt handler when calling mac_rx_ring() + */ + if (!ena_intrs_disable(ena)) { + return (DDI_FAILURE); + } + + /* We can't detach if clients are actively using the device. */ + if (ena_mac_unregister(ena) != 0) { + (void) ena_intrs_enable(ena); + return (DDI_FAILURE); + } + + /* + * At this point we can proceed with the rest of cleanup on a + * best-effort basis. 
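+ * Setting ena_attach_seq to ENA_ATTACH_RXQS_ALLOC makes
+ * ena_cleanup() unwind from the Rx queues downward; the mac
+ * unregistration and interrupt disablement were already handled
+ * explicitly above, and their table entries use ena_no_cleanup
+ * in any case.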
+ */ + ena->ena_attach_seq = ENA_ATTACH_RXQS_ALLOC; + ena_cleanup(ena); + ddi_set_driver_private(dip, NULL); + kmem_free(ena, sizeof (ena_t)); + return (DDI_SUCCESS); +} + +static struct cb_ops ena_cb_ops = { + .cb_open = nodev, + .cb_close = nodev, + .cb_strategy = nodev, + .cb_print = nodev, + .cb_dump = nodev, + .cb_read = nodev, + .cb_write = nodev, + .cb_ioctl = nodev, + .cb_devmap = nodev, + .cb_mmap = nodev, + .cb_segmap = nodev, + .cb_chpoll = nochpoll, + .cb_prop_op = ddi_prop_op, + .cb_flag = D_MP, + .cb_rev = CB_REV, + .cb_aread = nodev, + .cb_awrite = nodev +}; + +static struct dev_ops ena_dev_ops = { + .devo_rev = DEVO_REV, + .devo_refcnt = 0, + .devo_getinfo = NULL, + .devo_identify = nulldev, + .devo_probe = nulldev, + .devo_attach = ena_attach, + .devo_detach = ena_detach, + .devo_reset = nodev, + .devo_quiesce = ddi_quiesce_not_supported, + .devo_cb_ops = &ena_cb_ops +}; + +static struct modldrv ena_modldrv = { + .drv_modops = &mod_driverops, + .drv_linkinfo = "AWS ENA Ethernet", + .drv_dev_ops = &ena_dev_ops +}; + +static struct modlinkage ena_modlinkage = { + .ml_rev = MODREV_1, + .ml_linkage = { &ena_modldrv, NULL } +}; + +int +_init(void) +{ + int ret; + + mac_init_ops(&ena_dev_ops, ENA_MODULE_NAME); + + if ((ret = mod_install(&ena_modlinkage)) != 0) { + mac_fini_ops(&ena_dev_ops); + return (ret); + } + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&ena_modlinkage, modinfop)); +} + +int +_fini(void) +{ + int ret; + + if ((ret = mod_remove(&ena_modlinkage)) != 0) { + return (ret); + } + + mac_fini_ops(&ena_dev_ops); + return (ret); +} diff --git a/usr/src/uts/common/io/ena/ena.conf b/usr/src/uts/common/io/ena/ena.conf new file mode 100644 index 0000000000..64ee011d7c --- /dev/null +++ b/usr/src/uts/common/io/ena/ena.conf @@ -0,0 +1,50 @@ +# +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2021 Oxide Computer Company +# + +# +# Driver .conf file for AWS Elastic Network Adapter. See ena(7D) for +# valid options. +# + +# +# rx_queue_num_descs +# +# The number of descriptors provided by each Rx queue. +# +# Range: 64 - <device maximum> +# Default: <device maximum> +# +# rx_queue_num_descs = 1024; + +# +# rx_queue_intr_limit +# +# The number of frames that may be read by a single Rx interrupt. +# +# Range: 16 - 4096 +# Default: 256 +# +# rx_queue_intr_limit = 256; + +# +# tx_queue_num_descs +# +# The number of descriptors provided by each Tx queue. +# +# Range: 64 - <device maximum> +# Default: <device maximum> +# +# tx_queue_num_descs = 1024;
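These properties are consumed by the ena_attach_read_conf step of the attach sequence; its body is not part of this hunk. As a rough, hypothetical sketch of how such properties are typically fetched with ddi_prop_get_int(9F) and clamped (the ena_prop_u16() helper is invented here, and exactly which device limit acts as the maximum is an assumption; the ENA_PROP_* macros and ena_t fields are the ones declared in ena.h below):

static uint16_t
ena_prop_u16(ena_t *ena, const char *name, uint16_t min, uint16_t max,
    uint16_t def)
{
	/* Fall back to def when the property is not set in ena.conf. */
	int val = ddi_prop_get_int(DDI_DEV_T_ANY, ena->ena_dip,
	    DDI_PROP_DONTPASS, (char *)name, def);

	/* Clamp to the documented minimum and the device's maximum. */
	if (val < min) {
		val = min;
	}

	if (val > max) {
		val = max;
	}

	return ((uint16_t)val);
}

With a helper like that, rx_queue_num_descs would be read roughly as ena_prop_u16(ena, ENA_PROP_RXQ_NUM_DESCS, ENA_PROP_RXQ_NUM_DESCS_MIN, ena->ena_rx_max_cq_num_descs, ena->ena_rx_max_cq_num_descs), matching the documented default of the device maximum.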
\ No newline at end of file diff --git a/usr/src/uts/common/io/ena/ena.h b/usr/src/uts/common/io/ena/ena.h new file mode 100644 index 0000000000..467da40f4b --- /dev/null +++ b/usr/src/uts/common/io/ena/ena.h @@ -0,0 +1,848 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2021 Oxide Computer Company + */ + +#ifndef _ENA_H +#define _ENA_H + +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/types.h> +#include <sys/atomic.h> +#include <sys/list.h> +#include <sys/time.h> +#include <sys/modctl.h> +#include <sys/conf.h> +#include <sys/cpuvar.h> +#include <sys/pci.h> +#include <sys/sysmacros.h> +#include <sys/mac.h> +#include <sys/mac_ether.h> +#include <sys/mac_provider.h> +#include <sys/pattr.h> +#include <sys/strsun.h> +#include <sys/ethernet.h> +#include <sys/vlan.h> +#include <sys/utsname.h> +#include "ena_hw.h" + +/* + * AWS ENA Ethernet Driver + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define ENA_MODULE_NAME "ena" + +/* + * The minimum supported ENA device controller version. + */ +#define ENA_CTRL_MAJOR_VSN_MIN 0 +#define ENA_CTRL_MINOR_VSN_MIN 0 +#define ENA_CTRL_SUBMINOR_VSN_MIN 1 + +#define ENA_MODULE_VER_MAJOR 1 +#define ENA_MODULE_VER_MINOR 0 +#define ENA_MODULE_VER_SUBMINOR 0 + +/* + * The Linux driver doesn't document what the specification version + * number controls or the contract around version changes. The best we + * can do is use the same version that they use and port version + * changes as they come (the last one was in 2018). + * + * common: ENA_COMMON_SPEC_VERSION_{MAJOR,MINOR} + */ +#define ENA_SPEC_VERSION_MAJOR 2 +#define ENA_SPEC_VERSION_MINOR 0 + + +/* This represents BAR 0. */ +#define ENA_REG_NUMBER 1 + +/* + * A sentinel value passed as argument to ena_ring_rx() to indicate + * the Rx ring is being read in interrupt mode, not polling mode. + */ +#define ENA_INTERRUPT_MODE -1 + +#define ENA_RX_BUF_IPHDR_ALIGNMENT 2 +#define ENA_ADMINQ_DEPTH 32 +#define ENA_AENQ_NUM_DESCS 32 + +/* Convert milliseconds to nanoseconds. */ +#define ENA_MS_TO_NS(ms) ((ms) * 1000000ul) + +/* + * The default amount of time we will wait for an admin command to + * complete, specified in microseconds. In this case, 500 milliseconds. + */ +#define ENA_ADMIN_CMD_DEF_TIMEOUT MSEC2NSEC(500) + +/* + * Property macros. + */ +#define ENA_PROP_RXQ_NUM_DESCS "rx_queue_num_descs" +#define ENA_PROP_RXQ_NUM_DESCS_MIN 64 + +#define ENA_PROP_TXQ_NUM_DESCS "tx_queue_num_descs" +#define ENA_PROP_TXQ_NUM_DESCS_MIN 64 + +#define ENA_PROP_RXQ_INTR_LIMIT "rx_queue_intr_limit" +#define ENA_PROP_RXQ_INTR_LIMIT_MIN 16 +#define ENA_PROP_RXQ_INTR_LIMIT_MAX 4096 +#define ENA_PROP_RXQ_INTR_LIMIT_DEF 256 + +#define ENA_DMA_BIT_MASK(x) ((1ULL << (x)) - 1ULL) +#define ENA_DMA_VERIFY_ADDR(ena, phys_addr) \ + VERIFY3U(ENA_DMA_BIT_MASK((ena)->ena_dma_width) & (phys_addr), \ + ==, (phys_addr)) + +typedef struct ena_dma_conf { + size_t edc_size; + uint64_t edc_align; + int edc_sgl; + uchar_t edc_endian; + boolean_t edc_stream; +} ena_dma_conf_t; + +typedef struct ena_dma_buf { + caddr_t edb_va; + size_t edb_len; + /* + * The length given by DMA engine, kept around for debugging + * purposes. 
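+ * In other words, edb_len is the size the caller asked for,
+ * while edb_real_len is the (possibly larger) size that
+ * ddi_dma_mem_alloc(9F) actually returned; see ena_dma_alloc().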
+ */ + size_t edb_real_len; + size_t edb_used_len; + ddi_acc_handle_t edb_acc_hdl; + ddi_dma_handle_t edb_dma_hdl; + const ddi_dma_cookie_t *edb_cookie; +} ena_dma_buf_t; + +/* + * We always sync the entire range, and therefore expect success. + */ +#ifdef DEBUG +#define ENA_DMA_SYNC(buf, flag) \ + ASSERT0(ddi_dma_sync((buf).edb_dma_hdl, 0, 0, (flag))) +#else /* DEBUG */ +#define ENA_DMA_SYNC(buf, flag) \ + ((void)ddi_dma_sync((buf).edb_dma_hdl, 0, 0, (flag))) +#endif + +typedef struct ena_aenq_grpstr { + enahw_aenq_groups_t eag_type; + const char *eag_str; +} ena_aenq_grpstr_t; + +typedef struct ena_aenq_synstr { + enahw_aenq_syndrome_t eas_type; + const char *eas_str; +} ena_aenq_synstr_t; + +typedef void (*ena_aenq_hdlr_t)(void *data, enahw_aenq_desc_t *desc); + +typedef struct ena_aenq { + enahw_aenq_desc_t *eaenq_descs; + ena_dma_buf_t eaenq_dma; + ena_aenq_hdlr_t eaenq_hdlrs[ENAHW_AENQ_GROUPS_ARR_NUM]; + uint16_t eaenq_num_descs; + uint16_t eaenq_head; + uint8_t eaenq_phase; +} ena_aenq_t; + +typedef struct ena_admin_sq { + enahw_cmd_desc_t *eas_entries; + ena_dma_buf_t eas_dma; + uint32_t *eas_dbaddr; + uint16_t eas_tail; + uint8_t eas_phase; +} ena_admin_sq_t; + +typedef struct ena_admin_cq { + enahw_resp_desc_t *eac_entries; + ena_dma_buf_t eac_dma; + uint16_t eac_head; + uint8_t eac_phase; +} ena_admin_cq_t; + +/* + * The command context is used to track outstanding requests and match + * them to device responses. + */ +typedef struct ena_cmd_ctx { + list_node_t ectx_node; + + /* + * The index into ea_cmd_ctxs where this ctx lives. Used as + * the command ID value in the command descriptor. This allows + * us to match a response to its associated context. + */ + uint16_t ectx_id; + + /* Is the command pending? */ + boolean_t ectx_pending; + + /* The type of command associated with this context. */ + enahw_cmd_opcode_t ectx_cmd_opcode; + + /* + * The location to copy the full response to. This is + * specified by the caller of the command during + * submission. + */ + enahw_resp_desc_t *ectx_resp; +} ena_cmd_ctx_t; + +/* + * The admin queue, the queue through which commands are sent to the + * device. + * + * WO: Write Once (at initialization) + * + * In general, only a single lock needs to be held in order to access + * the different parts of the admin queue: + * + * sq_lock: Any data deailng with submitting admin commands, which + * includes acquiring a command context. + * + * cq_lock: Any data dealing with reading command responses. + * + * stat_lock: For accessing statistics. + * + * In some cases, the ectx_lock/stat_lock may be held in tandem with + * either the SQ or CQ lock. In that case, the SQ/CQ lock is always + * entered first. 
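+ * For example, ena_admin_submit_cmd() takes ea_stat_lock to
+ * update the submission counters while it is still holding
+ * ea_sq_lock.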
+ */ +typedef struct ena_adminq { + kmutex_t ea_sq_lock; /* WO */ + kmutex_t ea_cq_lock; /* WO */ + kmutex_t ea_stat_lock; /* WO */ + + hrtime_t ea_cmd_timeout_ns; /* WO */ + + uint16_t ea_qlen; /* WO */ + boolean_t ea_poll_mode; /* WO */ + + ena_cmd_ctx_t *ea_cmd_ctxs; /* WO */ + list_t ea_cmd_ctxs_free; /* ea_sq_lock */ + uint16_t ea_pending_cmds; /* ea_sq_lock */ + ena_admin_sq_t ea_sq; /* eq_sq_lock */ + ena_admin_cq_t ea_cq; /* eq_cq_lock */ + + /* ea_stat_lock */ + struct ena_adminq_stats { + uint64_t cmds_fail; + uint64_t cmds_submitted; + uint64_t cmds_success; + uint64_t queue_full; + } ea_stats; +} ena_adminq_t; + +typedef enum ena_attach_seq { + ENA_ATTACH_PCI = 1, /* PCI config space */ + ENA_ATTACH_REGS, /* BAR mapping */ + ENA_ATTACH_DEV_INIT, /* ENA device initialization */ + ENA_ATTACH_READ_CONF, /* Read driver conf file */ + ENA_ATTACH_DEV_CFG, /* Set any needed device config */ + ENA_ATTACH_INTR_ALLOC, /* interrupt handles allocated */ + ENA_ATTACH_INTR_HDLRS, /* intr handlers set */ + ENA_ATTACH_TXQS_ALLOC, /* Tx Queues allocated */ + ENA_ATTACH_RXQS_ALLOC, /* Tx Queues allocated */ + ENA_ATTACH_MAC_REGISTER, /* registered with mac */ + ENA_ATTACH_INTRS_ENABLE, /* interrupts are enabled */ + ENA_ATTACH_END +} ena_attach_seq_t; + +#define ENA_ATTACH_SEQ_FIRST (ENA_ATTACH_PCI) +#define ENA_ATTACH_NUM_ENTRIES (ENA_ATTACH_END - 1) + +struct ena; +typedef boolean_t (*ena_attach_fn_t)(struct ena *); +typedef void (*ena_cleanup_fn_t)(struct ena *); + +typedef struct ena_attach_desc { + ena_attach_seq_t ead_seq; + const char *ead_name; + ena_attach_fn_t ead_attach_fn; + boolean_t ead_attach_hard_fail; + ena_cleanup_fn_t ead_cleanup_fn; +} ena_attach_desc_t; + +typedef enum { + ENA_TCB_NONE, + ENA_TCB_COPY +} ena_tcb_type_t; + +/* + * The TCB is used to track information relating to the Tx of a + * packet. At the moment we support copy only. + */ +typedef struct ena_tx_control_block { + mblk_t *etcb_mp; + ena_tcb_type_t etcb_type; + ena_dma_buf_t etcb_dma; +} ena_tx_control_block_t; + +typedef enum ena_txq_state { + ENA_TXQ_STATE_NONE = 0, + ENA_TXQ_STATE_HOST_ALLOC = 1 << 0, + ENA_TXQ_STATE_CQ_CREATED = 1 << 1, + ENA_TXQ_STATE_SQ_CREATED = 1 << 2, + ENA_TXQ_STATE_READY = 1 << 3, /* TxQ ready and waiting */ + ENA_TXQ_STATE_RUNNING = 1 << 4, /* intrs enabled */ +} ena_txq_state_t; + +typedef struct ena_txq_stat { + /* Number of times mac_ether_offload_info() has failed. */ + kstat_named_t ets_hck_meoifail; + + /* + * Total number of times the ring was blocked due to + * insufficient descriptors, or unblocked due to recycling + * descriptors. + */ + kstat_named_t ets_blocked; + kstat_named_t ets_unblocked; + + /* The total number descriptors that have been recycled. */ + kstat_named_t ets_recycled; + + /* + * Number of bytes and packets that have been _submitted_ to + * the device. + */ + kstat_named_t ets_bytes; + kstat_named_t ets_packets; +} ena_txq_stat_t; + +/* + * A transmit queue, made up of a Submission Queue (SQ) and Completion + * Queue (CQ) to form a logical descriptor ring for sending packets. + * + * Write Once (WO) + * + * This value is written once, before the datapath is activated, in + * a function which is controlled by mac(9E). Some values may be + * written earlier, during ena attach, like et_ena and + * et_sq_num_descs. + * + * Tx Mutex (TM) -- et_lock + * + * This value is protected by the Tx queue's mutex. Some values may + * be initialized in a WO path, but also continually updated as part + * of normal datapath operation, such as et_sq_avail_descs. 
These + * values need mutex protection. + */ +typedef struct ena_txq { + kmutex_t et_lock; /* WO */ + + struct ena *et_ena; /* WO */ + uint_t et_txqs_idx; /* WO */ + mac_ring_handle_t et_mrh; /* WO */ + uint64_t et_m_gen_num; /* TM */ + ena_txq_state_t et_state; /* WO */ + uint16_t et_intr_vector; /* WO */ + + enahw_tx_desc_t *et_sq_descs; /* TM */ + ena_dma_buf_t et_sq_dma; /* WO */ + + /* Is the Tx queue currently in a blocked state? */ + boolean_t et_blocked; /* TM */ + + /* + * The number of descriptors owned by this ring. This value + * never changes after initialization. + */ + uint16_t et_sq_num_descs; /* WO */ + + /* + * The number of descriptors currently available for Tx + * submission. When this value reaches zero the ring must + * block until device notifies us of freed descriptors. + */ + uint16_t et_sq_avail_descs; /* TM */ + + /* + * The current tail index of the queue (the first free + * descriptor for host Tx submission). After initialization, + * this value only increments, relying on unsigned wrap + * around. The ENA device seems to expect this behavior, + * performing its own modulo on the value for the purposes of + * indexing, much like the driver code needs to do in order to + * access the proper TCB entry. + */ + uint16_t et_sq_tail_idx; /* TM */ + + /* + * The phase is used to know which CQ descriptors may be + * reclaimed. This is explained further in ena.c. + */ + uint16_t et_sq_phase; /* TM */ + uint16_t et_sq_hw_idx; /* WO */ + + /* + * The "doorbell" address is how the host indicates to the + * device which descriptors are ready for Tx processing. + */ + uint32_t *et_sq_db_addr; /* WO */ + + /* + * The TCBs track host Tx information, like a pointer to the + * mblk being submitted. Currently we maintain a 1:1 mapping + * of SQ descriptors to TCBs as Tx is copy only. + */ + ena_tx_control_block_t *et_tcbs; /* TM */ + + enahw_tx_cdesc_t *et_cq_descs; /* TM */ + ena_dma_buf_t et_cq_dma; /* WO */ + uint16_t et_cq_num_descs; /* WO */ + uint16_t et_cq_head_idx; /* TM */ + uint16_t et_cq_phase; /* TM */ + uint16_t et_cq_hw_idx; /* WO */ + + /* + * This address is used to control the CQ interrupts. + */ + uint32_t *et_cq_unmask_addr; /* WO */ + uint32_t *et_cq_head_db_addr; /* WO (currently unused) */ + uint32_t *et_cq_numa_addr; /* WO (currently unused) */ + + /* + * This mutex protects the Tx queue stats. This mutex may be + * entered while et_lock is held, but et_lock is not required + * to access/modify the stats. However, if both locks are + * held, then et_lock must be entered first. + */ + kmutex_t et_stat_lock; + ena_txq_stat_t et_stat; + kstat_t *et_kstat; +} ena_txq_t; + +typedef enum ena_rxq_state { + ENA_RXQ_STATE_NONE = 0, + ENA_RXQ_STATE_HOST_ALLOC = 1 << 0, + ENA_RXQ_STATE_CQ_CREATED = 1 << 1, + ENA_RXQ_STATE_SQ_CREATED = 1 << 2, + ENA_RXQ_STATE_READY = 1 << 3, /* RxQ ready and waiting */ + ENA_RXQ_STATE_RUNNING = 1 << 4, /* intrs enabled */ +} ena_rxq_state_t; + +typedef struct ena_rx_ctrl_block { + ena_dma_buf_t ercb_dma; + uint8_t ercb_offset; + uint16_t ercb_length; +} ena_rx_ctrl_block_t; + +typedef enum { + ENA_RXQ_MODE_POLLING = 1, + ENA_RXQ_MODE_INTR = 2, +} ena_rxq_mode_t; + +typedef struct ena_rxq_stat_t { + /* The total number of packets/bytes received on this queue. */ + kstat_named_t ers_packets; + kstat_named_t ers_bytes; + + /* + * At this time we expect all incoming frames to fit in a + * single buffer/descriptor. In some rare event that the + * device doesn't cooperate this stat is incremented. 
+ */ + kstat_named_t ers_multi_desc; + + /* + * The total number of times we failed to allocate a new mblk + * for an incoming frame. + */ + kstat_named_t ers_allocb_fail; + + /* + * The total number of times the Rx interrupt handler reached + * its maximum limit for number of packets to process in a + * single interrupt. If you see this number increase + * continuously at a steady rate, then it may be an indication + * the driver is not entering polling mode. + */ + kstat_named_t ers_intr_limit; + + /* + * The total number of times the device detected an incorrect + * IPv4 header checksum. + */ + kstat_named_t ers_hck_ipv4_err; + + /* + * The total number of times the device detected an incorrect + * L4/ULP checksum. + */ + kstat_named_t ers_hck_l4_err; +} ena_rxq_stat_t; + +/* + * A receive queue, made up of a Submission Queue (SQ) and Completion + * Queue (CQ) to form a logical descriptor ring for receiving packets. + * + * Write Once (WO) + * + * This value is written once, before the datapath is activated, in + * a function which is controlled by mac(9E). + * + * Rx Mutex (RM) -- er_lock + * + * This value is protected by the Rx queue's mutex. Some values may + * be initialized in a WO path, but also continually updated as part + * of normal datapath operation, such as er_sq_avail_descs. These + * values need mutex protection. + */ +typedef struct ena_rxq { + kmutex_t er_lock; + + struct ena *er_ena; /* WO */ + uint_t er_rxqs_idx; /* WO */ + mac_ring_handle_t er_mrh; /* WO */ + uint64_t er_m_gen_num; /* WO */ + ena_rxq_state_t er_state; /* WO */ + uint16_t er_intr_vector; /* WO */ + ena_rxq_mode_t er_mode; /* RM */ + uint16_t er_intr_limit; /* RM */ + + enahw_rx_desc_t *er_sq_descs; /* RM */ + ena_dma_buf_t er_sq_dma; /* WO */ + uint16_t er_sq_num_descs; /* WO */ + uint16_t er_sq_avail_descs; /* RM */ + uint16_t er_sq_tail_idx; /* RM */ + uint16_t er_sq_phase; /* RM */ + uint16_t er_sq_hw_idx; /* WO */ + uint32_t *er_sq_db_addr; /* WO */ + + enahw_rx_cdesc_t *er_cq_descs; /* RM */ + ena_dma_buf_t er_cq_dma; /* WO */ + uint16_t er_cq_num_descs; /* WO */ + uint16_t er_cq_head_idx; /* RM */ + uint16_t er_cq_phase; /* RM */ + uint16_t er_cq_hw_idx; /* WO */ + uint32_t *er_cq_unmask_addr; /* WO */ + uint32_t *er_cq_head_db_addr; /* WO (currently unused) */ + uint32_t *er_cq_numa_addr; /* WO (currently unused) */ + + ena_rx_ctrl_block_t *er_rcbs; /* RM */ + + kmutex_t er_stat_lock; + ena_rxq_stat_t er_stat; + kstat_t *er_kstat; +} ena_rxq_t; + +/* These are stats based off of enahw_resp_basic_stats_t. */ +typedef struct ena_basic_stat { + kstat_named_t ebs_tx_bytes; + kstat_named_t ebs_tx_pkts; + kstat_named_t ebs_tx_drops; + + kstat_named_t ebs_rx_bytes; + kstat_named_t ebs_rx_pkts; + kstat_named_t ebs_rx_drops; +} ena_basic_stat_t; + +/* These are stats based off of enahw_resp_eni_stats_t. */ +typedef struct ena_extended_stat { + kstat_named_t ees_bw_in_exceeded; + kstat_named_t ees_bw_out_exceeded; + kstat_named_t ees_pps_exceeded; + kstat_named_t ees_conns_exceeded; + kstat_named_t ees_linklocal_exceeded; +} ena_extended_stat_t; + +/* These stats monitor which AENQ handlers have been called. */ +typedef struct ena_aenq_stat { + kstat_named_t eaes_default; + kstat_named_t eaes_link_change; +} ena_aenq_stat_t; + +#define ENA_STATE_PRIMORDIAL 0x1u +#define ENA_STATE_RUNNING 0x2u + +/* + * This structure contains the per-instance (PF of VF) state of the + * device. 
+ */ +typedef struct ena { + dev_info_t *ena_dip; + int ena_instance; + + /* + * Global lock, used to synchronize administration changes to + * the ena_t. This lock should not be held in the datapath. + */ + kmutex_t ena_lock; + ena_attach_seq_t ena_attach_seq; + + /* + * We use atomic ops for ena_state so that datapath consumers + * do not need to enter ena_lock. + */ + uint32_t ena_state; + + /* + * PCI config space and BAR handle. + */ + ddi_acc_handle_t ena_pci_hdl; + off_t ena_reg_size; + caddr_t ena_reg_base; + ddi_device_acc_attr_t ena_reg_attr; + ddi_acc_handle_t ena_reg_hdl; + + /* + * Vendor information. + */ + uint16_t ena_pci_vid; + uint16_t ena_pci_did; + uint8_t ena_pci_rev; + uint16_t ena_pci_svid; + uint16_t ena_pci_sdid; + + /* + * Device and controller versions. + */ + uint32_t ena_dev_major_vsn; + uint32_t ena_dev_minor_vsn; + uint32_t ena_ctrl_major_vsn; + uint32_t ena_ctrl_minor_vsn; + uint32_t ena_ctrl_subminor_vsn; + uint32_t ena_ctrl_impl_id; + + /* + * Interrupts + */ + int ena_num_intrs; + ddi_intr_handle_t *ena_intr_handles; + size_t ena_intr_handles_sz; + int ena_intr_caps; + uint_t ena_intr_pri; + + mac_handle_t ena_mh; + + size_t ena_page_sz; + + /* + * The MTU and data layer frame sizes. + */ + uint32_t ena_mtu; + uint32_t ena_max_frame_hdr; + uint32_t ena_max_frame_total; + + /* The size (in bytes) of the Rx/Tx data buffers. */ + uint32_t ena_tx_buf_sz; + uint32_t ena_rx_buf_sz; + + /* + * The maximum number of Scatter Gather List segments the + * device can address. + */ + uint8_t ena_tx_sgl_max_sz; + uint8_t ena_rx_sgl_max_sz; + + /* The number of descriptors per Rx/Tx queue. */ + uint16_t ena_rxq_num_descs; + uint16_t ena_txq_num_descs; + + /* + * The maximum number of frames which may be read per Rx + * interrupt. + */ + uint16_t ena_rxq_intr_limit; + + /* The Rx/Tx data queues (rings). */ + ena_rxq_t *ena_rxqs; + uint16_t ena_num_rxqs; + ena_txq_t *ena_txqs; + uint16_t ena_num_txqs; + + /* These statistics are device-wide. */ + kstat_t *ena_device_basic_kstat; + kstat_t *ena_device_extended_kstat; + + /* + * This tracks AENQ-related stats, it is implicitly + * device-wide. + */ + ena_aenq_stat_t ena_aenq_stat; + kstat_t *ena_aenq_kstat; + + /* + * The Admin Queue, through which call device commands are + * sent. + */ + ena_adminq_t ena_aq; + + ena_aenq_t ena_aenq; + ena_dma_buf_t ena_host_info; + + /* + * Hardware info + */ + uint32_t ena_supported_features; + uint8_t ena_dma_width; + boolean_t ena_link_up; + boolean_t ena_link_autoneg; + boolean_t ena_link_full_duplex; + link_duplex_t ena_link_duplex; + uint64_t ena_link_speed_mbits; + enahw_link_speeds_t ena_link_speeds; + link_state_t ena_link_state; + uint32_t ena_aenq_supported_groups; + uint32_t ena_aenq_enabled_groups; + + uint32_t ena_tx_max_sq_num; + uint32_t ena_tx_max_sq_num_descs; + uint32_t ena_tx_max_cq_num; + uint32_t ena_tx_max_cq_num_descs; + uint16_t ena_tx_max_desc_per_pkt; + uint32_t ena_tx_max_hdr_len; + + uint32_t ena_rx_max_sq_num; + uint32_t ena_rx_max_sq_num_descs; + uint32_t ena_rx_max_cq_num; + uint32_t ena_rx_max_cq_num_descs; + uint16_t ena_rx_max_desc_per_pkt; + + /* This is calculated from the Rx/Tx queue nums. 
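+ * It also drives MSI-X allocation: ena_attach_intr_alloc() asks
+ * for up to ena_max_io_queues + 1 vectors, the extra vector
+ * being the Admin/AENQ one.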
*/ + uint16_t ena_max_io_queues; + + /* Hardware Offloads */ + boolean_t ena_tx_l3_ipv4_csum; + + boolean_t ena_tx_l4_ipv4_part_csum; + boolean_t ena_tx_l4_ipv4_full_csum; + boolean_t ena_tx_l4_ipv4_lso; + + boolean_t ena_tx_l4_ipv6_part_csum; + boolean_t ena_tx_l4_ipv6_full_csum; + boolean_t ena_tx_l4_ipv6_lso; + + boolean_t ena_rx_l3_ipv4_csum; + boolean_t ena_rx_l4_ipv4_csum; + boolean_t ena_rx_l4_ipv6_csum; + boolean_t ena_rx_hash; + + uint32_t ena_max_mtu; + uint8_t ena_mac_addr[ETHERADDRL]; +} ena_t; + +/* + * Logging functions. + */ +/*PRINTFLIKE2*/ +extern void ena_err(const ena_t *, const char *, ...) __KPRINTFLIKE(2); +/*PRINTFLIKE2*/ +extern void ena_dbg(const ena_t *, const char *, ...) __KPRINTFLIKE(2); + +extern uint32_t ena_hw_bar_read32(const ena_t *, const uint16_t); +extern uint32_t ena_hw_abs_read32(const ena_t *, uint32_t *); +extern void ena_hw_bar_write32(const ena_t *, const uint16_t, const uint32_t); +extern void ena_hw_abs_write32(const ena_t *, uint32_t *, const uint32_t); + +/* + * Stats + */ +extern void ena_stat_device_basic_cleanup(ena_t *); +extern boolean_t ena_stat_device_basic_init(ena_t *); + +extern void ena_stat_device_extended_cleanup(ena_t *); +extern boolean_t ena_stat_device_extended_init(ena_t *); + +extern void ena_stat_aenq_cleanup(ena_t *); +extern boolean_t ena_stat_aenq_init(ena_t *); + +extern void ena_stat_rxq_cleanup(ena_rxq_t *); +extern boolean_t ena_stat_rxq_init(ena_rxq_t *); +extern void ena_stat_txq_cleanup(ena_txq_t *); +extern boolean_t ena_stat_txq_init(ena_txq_t *); + +/* + * DMA + */ +extern boolean_t ena_dma_alloc(ena_t *, ena_dma_buf_t *, ena_dma_conf_t *, + size_t); +extern void ena_dma_free(ena_dma_buf_t *); +extern void ena_set_dma_addr(const ena_t *, const uint64_t, enahw_addr_t *); +extern void ena_set_dma_addr_values(const ena_t *, const uint64_t, uint32_t *, + uint16_t *); + +/* + * Interrupts + */ +extern boolean_t ena_intr_add_handlers(ena_t *); +extern void ena_intr_remove_handlers(ena_t *); +extern void ena_tx_intr_work(ena_txq_t *); +extern void ena_rx_intr_work(ena_rxq_t *); +extern void ena_aenq_work(ena_t *); +extern boolean_t ena_intrs_disable(ena_t *); +extern boolean_t ena_intrs_enable(ena_t *); + +/* + * MAC + */ +extern boolean_t ena_mac_register(ena_t *); +extern int ena_mac_unregister(ena_t *); +extern void ena_ring_tx_stop(mac_ring_driver_t); +extern int ena_ring_tx_start(mac_ring_driver_t, uint64_t); +extern mblk_t *ena_ring_tx(void *, mblk_t *); +extern void ena_ring_rx_stop(mac_ring_driver_t); +extern int ena_ring_rx_start(mac_ring_driver_t rh, uint64_t gen_num); +extern int ena_m_stat(void *, uint_t, uint64_t *); +extern mblk_t *ena_ring_rx_poll(void *, int); +extern int ena_ring_rx_stat(mac_ring_driver_t, uint_t, uint64_t *); +extern int ena_ring_tx_stat(mac_ring_driver_t, uint_t, uint64_t *); + +/* + * Admin API + */ +extern int ena_admin_submit_cmd(ena_t *, enahw_cmd_desc_t *, + enahw_resp_desc_t *, ena_cmd_ctx_t **); +extern int ena_admin_poll_for_resp(ena_t *, ena_cmd_ctx_t *); +extern void ena_free_host_info(ena_t *); +extern boolean_t ena_init_host_info(ena_t *); +extern int ena_create_cq(ena_t *, uint16_t, uint64_t, boolean_t, uint32_t, + uint16_t *, uint32_t **, uint32_t **, uint32_t **); +extern int ena_destroy_cq(ena_t *, uint16_t); +extern int ena_create_sq(ena_t *, uint16_t, uint64_t, boolean_t, uint16_t, + uint16_t *, uint32_t **); +extern int ena_destroy_sq(ena_t *, uint16_t, boolean_t); +extern int ena_set_feature(ena_t *, enahw_cmd_desc_t *, + enahw_resp_desc_t *, const 
enahw_feature_id_t, const uint8_t); +extern int ena_get_feature(ena_t *, enahw_resp_desc_t *, + const enahw_feature_id_t, const uint8_t); +extern int ena_admin_get_basic_stats(ena_t *, enahw_resp_desc_t *); +extern int ena_admin_get_eni_stats(ena_t *, enahw_resp_desc_t *); +extern int enahw_resp_status_to_errno(ena_t *, enahw_resp_status_t); + +/* + * Rx/Tx allocations + */ +extern boolean_t ena_alloc_rxq(ena_rxq_t *); +extern void ena_cleanup_rxq(ena_rxq_t *); +extern boolean_t ena_alloc_txq(ena_txq_t *); +extern void ena_cleanup_txq(ena_txq_t *); + +extern ena_aenq_grpstr_t ena_groups_str[]; + +#ifdef __cplusplus +} +#endif + +#endif /* _ENA_H */ diff --git a/usr/src/uts/common/io/ena/ena_admin.c b/usr/src/uts/common/io/ena/ena_admin.c new file mode 100644 index 0000000000..55e5b48901 --- /dev/null +++ b/usr/src/uts/common/io/ena/ena_admin.c @@ -0,0 +1,674 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2021 Oxide Computer Company + */ + +/* + * This file contains everything having to do with communicating with + * the admin queue for sending commands to the device. + */ + +#include "ena_hw.h" +#include "ena.h" + +/* + * Mark the context as complete (a response has been received). + */ +static void +ena_complete_cmd_ctx(ena_cmd_ctx_t *ctx, enahw_resp_desc_t *hwresp) +{ + bcopy(hwresp, ctx->ectx_resp, sizeof (*hwresp)); + ctx->ectx_pending = B_FALSE; +} + +/* + * Reset and release the context back to the free list. + */ +static void +ena_release_cmd_ctx(ena_t *ena, ena_cmd_ctx_t *ctx) +{ + ASSERT(ctx->ectx_pending == B_FALSE); + ctx->ectx_resp = NULL; + ctx->ectx_cmd_opcode = ENAHW_CMD_NONE; + + mutex_enter(&ena->ena_aq.ea_sq_lock); + list_insert_head(&ena->ena_aq.ea_cmd_ctxs_free, ctx); + ena->ena_aq.ea_pending_cmds--; + mutex_exit(&ena->ena_aq.ea_sq_lock); +} + +/* + * Acquire the next avaiable command context. + */ +static ena_cmd_ctx_t * +ena_acquire_cmd_ctx(ena_adminq_t *aq) +{ + VERIFY(MUTEX_HELD(&aq->ea_sq_lock)); + ASSERT3U(aq->ea_pending_cmds, <, aq->ea_qlen); + ena_cmd_ctx_t *ctx = list_remove_head(&aq->ea_cmd_ctxs_free); + + ctx->ectx_pending = B_TRUE; + return (ctx); +} + +/* + * Submit a command to the admin queue. 
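+ * The caller supplies the command and a response buffer and
+ * receives a context in return; the command has not completed
+ * until that context is passed to ena_admin_poll_for_resp().
+ * Typical usage, as in ena_create_cq():
+ *
+ *     if ((ret = ena_admin_submit_cmd(ena, &cmd, &resp, &ctx)) != 0)
+ *             return (ret);
+ *
+ *     if ((ret = ena_admin_poll_for_resp(ena, ctx)) != 0)
+ *             return (ret);
+ *
+ * ENOSPC is returned when the admin queue is already full of
+ * pending commands.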
+ */ +int +ena_admin_submit_cmd(ena_t *ena, enahw_cmd_desc_t *cmd, enahw_resp_desc_t *resp, + ena_cmd_ctx_t **ctx) +{ + VERIFY3U(cmd->ecd_opcode, !=, 0); + ena_adminq_t *aq = &ena->ena_aq; + ena_admin_sq_t *sq = &aq->ea_sq; + uint16_t modulo_mask = aq->ea_qlen - 1; + ena_cmd_ctx_t *lctx = NULL; + + mutex_enter(&aq->ea_sq_lock); + uint16_t tail_mod = sq->eas_tail & modulo_mask; + + if (aq->ea_pending_cmds >= aq->ea_qlen) { + mutex_enter(&aq->ea_stat_lock); + aq->ea_stats.queue_full++; + mutex_exit(&aq->ea_stat_lock); + mutex_exit(&aq->ea_sq_lock); + return (ENOSPC); + } + + lctx = ena_acquire_cmd_ctx(aq); + lctx->ectx_cmd_opcode = cmd->ecd_opcode; + lctx->ectx_resp = resp; + + cmd->ecd_flags = sq->eas_phase & ENAHW_CMD_PHASE_MASK; + ENAHW_CMD_ID(cmd, lctx->ectx_id); + bcopy(cmd, &sq->eas_entries[tail_mod], sizeof (*cmd)); + ENA_DMA_SYNC(sq->eas_dma, DDI_DMA_SYNC_FORDEV); + sq->eas_tail++; + aq->ea_pending_cmds++; + + mutex_enter(&aq->ea_stat_lock); + aq->ea_stats.cmds_submitted++; + mutex_exit(&aq->ea_stat_lock); + + DTRACE_PROBE4(cmd__submit, enahw_cmd_desc_t *, cmd, ena_cmd_ctx_t *, + lctx, uint16_t, tail_mod, uint8_t, sq->eas_phase); + + if ((sq->eas_tail & modulo_mask) == 0) { + sq->eas_phase = !sq->eas_phase; + } + + ena_hw_abs_write32(ena, sq->eas_dbaddr, sq->eas_tail); + mutex_exit(&aq->ea_sq_lock); + *ctx = lctx; + return (0); +} + +/* + * Read a single response from the admin queue. + */ +static void +ena_admin_read_resp(ena_t *ena, enahw_resp_desc_t *hwresp) +{ + ena_adminq_t *aq = &ena->ena_aq; + ena_admin_cq_t *cq = &aq->ea_cq; + ena_cmd_ctx_t *ctx = NULL; + uint16_t modulo_mask = aq->ea_qlen - 1; + VERIFY(MUTEX_HELD(&aq->ea_cq_lock)); + + uint16_t head_mod = cq->eac_head & modulo_mask; + uint8_t phase = cq->eac_phase & ENAHW_RESP_PHASE_MASK; + uint16_t cmd_id = ENAHW_RESP_CMD_ID(hwresp); + ctx = &aq->ea_cmd_ctxs[cmd_id]; + ASSERT3U(ctx->ectx_id, ==, cmd_id); + ena_complete_cmd_ctx(ctx, hwresp); + + if (hwresp->erd_status != ENAHW_RESP_SUCCESS) { + mutex_enter(&aq->ea_stat_lock); + aq->ea_stats.cmds_fail++; + mutex_exit(&aq->ea_stat_lock); + DTRACE_PROBE4(cmd__fail, enahw_resp_desc_t *, hwresp, + ena_cmd_ctx_t *, ctx, uint16_t, head_mod, uint8_t, phase); + return; + } + + DTRACE_PROBE4(cmd__success, enahw_resp_desc_t *, hwresp, + ena_cmd_ctx_t *, ctx, uint16_t, head_mod, uint8_t, phase); + mutex_enter(&aq->ea_stat_lock); + aq->ea_stats.cmds_success++; + mutex_exit(&aq->ea_stat_lock); +} + +static void +ena_admin_process_responses(ena_t *ena) +{ + ena_adminq_t *aq = &ena->ena_aq; + ena_admin_cq_t *cq = &aq->ea_cq; + uint16_t modulo_mask = aq->ea_qlen - 1; + enahw_resp_desc_t *hwresp; + + mutex_enter(&aq->ea_cq_lock); + uint16_t head_mod = cq->eac_head & modulo_mask; + uint8_t phase = cq->eac_phase & ENAHW_RESP_PHASE_MASK; + + ENA_DMA_SYNC(cq->eac_dma, DDI_DMA_SYNC_FORKERNEL); + hwresp = &cq->eac_entries[head_mod]; + while ((hwresp->erd_flags & ENAHW_RESP_PHASE_MASK) == phase) { + ena_admin_read_resp(ena, hwresp); + + cq->eac_head++; + head_mod = cq->eac_head & modulo_mask; + + if (head_mod == 0) { + phase = !phase; + } + + hwresp = &cq->eac_entries[head_mod]; + } + + cq->eac_phase = phase; + mutex_exit(&aq->ea_cq_lock); +} + +/* + * Wait for the command described by ctx to complete by polling for + * status updates. 
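+ * If no response arrives within ea_cmd_timeout_ns the driver
+ * panics (see the comment in the loop below); otherwise the
+ * device's response status is translated into an errno value
+ * and the context is released before returning.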
+ */ +int +ena_admin_poll_for_resp(ena_t *ena, ena_cmd_ctx_t *ctx) +{ + int ret = 0; + hrtime_t expire = gethrtime() + ena->ena_aq.ea_cmd_timeout_ns; + + while (1) { + ena_admin_process_responses(ena); + + if (!ctx->ectx_pending) { + break; + } + + /* Wait for 1 millisecond. */ + delay(drv_usectohz(1000)); + + if (gethrtime() > expire) { + /* + * We have no visibility into the device to + * confirm it is making progress on this + * command. At this point the driver and + * device cannot agree on the state of the + * world: perhaps the device is still making + * progress but not fast enough, perhaps the + * device completed the command but there was + * a failure to deliver the reply, perhaps the + * command failed but once again the reply was + * not delivered. With this unknown state the + * best thing to do is to reset the device and + * start from scratch. But as we don't have + * that capability at the moment the next best + * thing to do is to spin or panic; we choose + * to panic. + */ + panic("timed out waiting for admin response"); + } + } + + ret = enahw_resp_status_to_errno(ena, ctx->ectx_resp->erd_status); + ena_release_cmd_ctx(ena, ctx); + return (ret); +} + +void +ena_free_host_info(ena_t *ena) +{ + ena_dma_free(&ena->ena_host_info); +} + +boolean_t +ena_init_host_info(ena_t *ena) +{ + enahw_host_info_t *ehi; + int ret = 0; + int *regs; + uint_t nregs; + ena_dma_buf_t *hi_dma; + enahw_cmd_desc_t cmd; + enahw_feat_host_attr_t *ha_cmd = + &cmd.ecd_cmd.ecd_set_feat.ecsf_feat.ecsf_host_attr; + enahw_resp_desc_t resp; + ena_dma_conf_t conf = { + .edc_size = ENAHW_HOST_INFO_ALLOC_SZ, + .edc_align = ENAHW_HOST_INFO_ALIGNMENT, + .edc_sgl = 1, + .edc_endian = DDI_NEVERSWAP_ACC, + .edc_stream = B_FALSE, + }; + + hi_dma = &ena->ena_host_info; + + if (!ena_dma_alloc(ena, hi_dma, &conf, 4096)) { + ena_err(ena, "failed to allocate DMA for host info"); + return (B_FALSE); + } + + ehi = (void *)hi_dma->edb_va; + ehi->ehi_ena_spec_version = + ((ENA_SPEC_VERSION_MAJOR << ENAHW_HOST_INFO_SPEC_MAJOR_SHIFT) | + (ENA_SPEC_VERSION_MINOR)); + + ehi->ehi_bdf = 0; + if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, ena->ena_dip, + DDI_PROP_DONTPASS, "reg", ®s, &nregs) == DDI_PROP_SUCCESS) { + if (nregs != 0) { + ehi->ehi_bdf |= PCI_REG_BUS_G(regs[0]) << 8; + ehi->ehi_bdf |= PCI_REG_DEV_G(regs[0]) << 3; + ehi->ehi_bdf |= PCI_REG_FUNC_G(regs[0]); + } + + ddi_prop_free(regs); + } + + /* + * There is no illumos OS type, it would be nice to ping + * someone at Amazon and see if we can't get one added. + */ + ehi->ehi_os_type = ENAHW_OS_FREEBSD; + ehi->ehi_kernel_ver = 511; /* If you know you know */ + (void) strlcpy((char *)ehi->ehi_kernel_ver_str, utsname.version, + sizeof (ehi->ehi_kernel_ver_str)); + ehi->ehi_os_dist = 0; /* What everyone else does. */ + ehi->ehi_driver_ver = + (ENA_MODULE_VER_MAJOR) | + (ENA_MODULE_VER_MINOR << ENAHW_HOST_INFO_MINOR_SHIFT) | + (ENA_MODULE_VER_SUBMINOR << ENAHW_HOST_INFO_SUB_MINOR_SHIFT); + ehi->ehi_num_cpus = ncpus_online; + + /* + * ENA devices are not created equal. Some will support + * features not found in others. This field tells the device + * which features the driver supports. + * + * ENAHW_HOST_INFO_RX_OFFSET + * + * Some ENA devices will write the frame data at an offset + * in the buffer, presumably for alignment purposes. We + * support this feature for the sole reason that the Linux + * driver does as well. 
+ * + * ENAHW_HOST_INFO_INTERRUPT_MODERATION + * + * Based on the Linux history this flag indicates that the + * driver "supports interrupt moderation properly". What + * that means is anyone's guess. The Linux driver seems to + * have some "adaptive" interrupt moderation, so perhaps + * it's that? In any case, FreeBSD doesn't bother with + * setting this flag, so we'll leave it be for now as well. + * + * If you're curious to know if the device supports + * interrupt moderation: the FEAT_INTERRUPT_MODERATION flag + * will be set in ena_hw.eh_supported_features. + * + * ENAHW_HOST_INFO_RX_BUF_MIRRORING + * + * Support traffic mirroring by allowing the hypervisor to + * read the buffer memory directly. This probably has to do + * with AWS flow logs, allowing more efficient mirroring. + * But it's hard to say for sure given we only have the + * Linux commit log to go off of. In any case, the only + * requirement for this feature is that the Rx DMA buffers + * be read/write, which they are. + * + * ENAHW_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY + * + * The device supports the retrieving and updating of the + * RSS function and hash key. As we don't yet implement RSS + * this is disabled. + */ + ehi->ehi_driver_supported_features = + ENAHW_HOST_INFO_RX_OFFSET_MASK | + ENAHW_HOST_INFO_RX_BUF_MIRRORING_MASK; + + ENA_DMA_SYNC(*hi_dma, DDI_DMA_SYNC_FORDEV); + bzero(&cmd, sizeof (cmd)); + ena_set_dma_addr(ena, hi_dma->edb_cookie->dmac_laddress, + &ha_cmd->efha_os_addr); + + /* + * You might notice the "debug area" is not allocated or + * configured, that is on purpose. + * + * The "debug area" is a region of host memory that contains + * the String Set (SS) tables used to report statistics to + * tools like ethtool (on Linux). This table consists of one + * of more entries of a 32-byte string (the name of the + * statistic) along with its associated 64-bit value. The + * stats reported here contain both the host-side stats as + * well as device-reported stats (ENAHW_GET_STATS_TYPE_ENI). I + * believe the reason for calling it the "debug area" is that + * it can be accessed from outside of the guest, allowing an + * AWS user (?) or Amazon employee to get basic information + * about the state of the device from the guest's point of + * view. + * + * In the fullness of time, our driver should probably support + * this aspect of ENA. For the time being, all testing + * indicates the driver and device function fine without it. + */ + + ret = ena_set_feature(ena, &cmd, &resp, ENAHW_FEAT_HOST_ATTR_CONFIG, + ENAHW_FEAT_HOST_ATTR_CONFIG_VER); + if (ret != 0) { + ena_err(ena, "failed to set host attributes: %d", ret); + ena_dma_free(hi_dma); + return (B_FALSE); + } + + return (B_TRUE); +} + +int +ena_create_cq(ena_t *ena, uint16_t num_descs, uint64_t phys_addr, + boolean_t is_tx, uint32_t vector, uint16_t *hw_index, + uint32_t **unmask_addr, uint32_t **headdb, uint32_t **numanode) +{ + int ret; + enahw_cmd_desc_t cmd; + enahw_cmd_create_cq_t *cmd_cq = &cmd.ecd_cmd.ecd_create_cq; + enahw_resp_desc_t resp; + enahw_resp_create_cq_t *resp_cq = &resp.erd_resp.erd_create_cq; + ena_cmd_ctx_t *ctx = NULL; + uint8_t desc_size = is_tx ? 
sizeof (enahw_tx_cdesc_t) : + sizeof (enahw_rx_cdesc_t); + + bzero(&cmd, sizeof (cmd)); + bzero(&resp, sizeof (resp)); + + cmd.ecd_opcode = ENAHW_CMD_CREATE_CQ; + ENAHW_CMD_CREATE_CQ_INTERRUPT_MODE_ENABLE(cmd_cq); + ASSERT3U(desc_size % 4, ==, 0); + ENAHW_CMD_CREATE_CQ_DESC_SIZE_WORDS(cmd_cq, desc_size / 4); + cmd_cq->ecq_num_descs = num_descs; + cmd_cq->ecq_msix_vector = vector; + ena_set_dma_addr(ena, phys_addr, &cmd_cq->ecq_addr); + + if ((ret = ena_admin_submit_cmd(ena, &cmd, &resp, &ctx)) != 0) { + ena_err(ena, "failed to submit Create CQ command: %d", ret); + return (ret); + } + + if ((ret = ena_admin_poll_for_resp(ena, ctx)) != 0) { + ena_err(ena, "failed to Create CQ: %d", ret); + return (ret); + } + + *hw_index = resp_cq->ercq_idx; + *unmask_addr = (uint32_t *)(ena->ena_reg_base + + resp_cq->ercq_interrupt_mask_reg_offset); + + if (resp_cq->ercq_head_db_reg_offset != 0) { + *headdb = (uint32_t *)(ena->ena_reg_base + + resp_cq->ercq_head_db_reg_offset); + } else { + *headdb = NULL; + } + + if (resp_cq->ercq_numa_node_reg_offset != 0) { + *numanode = (uint32_t *)(ena->ena_reg_base + + resp_cq->ercq_numa_node_reg_offset); + } else { + *numanode = NULL; + } + + return (0); +} + +int +ena_destroy_cq(ena_t *ena, uint16_t hw_idx) +{ + enahw_cmd_desc_t cmd; + enahw_resp_desc_t resp; + ena_cmd_ctx_t *ctx = NULL; + int ret; + + bzero(&cmd, sizeof (cmd)); + bzero(&resp, sizeof (resp)); + cmd.ecd_opcode = ENAHW_CMD_DESTROY_CQ; + cmd.ecd_cmd.ecd_destroy_cq.edcq_idx = hw_idx; + + if ((ret = ena_admin_submit_cmd(ena, &cmd, &resp, &ctx)) != 0) { + ena_err(ena, "failed to submit Destroy CQ command: %d", ret); + return (ret); + } + + if ((ret = ena_admin_poll_for_resp(ena, ctx)) != 0) { + ena_err(ena, "failed to Destroy CQ: %d", ret); + return (ret); + } + + return (0); +} + +int +ena_create_sq(ena_t *ena, uint16_t num_descs, uint64_t phys_addr, + boolean_t is_tx, uint16_t cq_index, uint16_t *hw_index, uint32_t **db_addr) +{ + int ret; + enahw_cmd_desc_t cmd; + enahw_cmd_create_sq_t *cmd_sq = &cmd.ecd_cmd.ecd_create_sq; + enahw_resp_desc_t resp; + enahw_resp_create_sq_t *resp_sq = &resp.erd_resp.erd_create_sq; + enahw_sq_direction_t dir = + is_tx ? ENAHW_SQ_DIRECTION_TX : ENAHW_SQ_DIRECTION_RX; + ena_cmd_ctx_t *ctx = NULL; + + if (!ISP2(num_descs)) { + ena_err(ena, "the number of descs must be a power of 2, but " + " is %d", num_descs); + return (B_FALSE); + } + + bzero(&cmd, sizeof (cmd)); + bzero(&resp, sizeof (resp)); + cmd.ecd_opcode = ENAHW_CMD_CREATE_SQ; + ENAHW_CMD_CREATE_SQ_DIR(cmd_sq, dir); + ENAHW_CMD_CREATE_SQ_PLACEMENT_POLICY(cmd_sq, + ENAHW_PLACEMENT_POLICY_HOST); + ENAHW_CMD_CREATE_SQ_COMPLETION_POLICY(cmd_sq, + ENAHW_COMPLETION_POLICY_DESC); + /* + * We limit all SQ descriptor rings to an SGL of 1, therefore + * they are always physically contiguous. + */ + ENAHW_CMD_CREATE_SQ_PHYSMEM_CONTIG(cmd_sq); + cmd_sq->ecsq_cq_idx = cq_index; + cmd_sq->ecsq_num_descs = num_descs; + + /* + * If we ever use a non-host placement policy, then guard this + * code against placement type (this value should not be set + * for device placement). 
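+ * Today the driver always uses ENAHW_PLACEMENT_POLICY_HOST (set
+ * above), so the ring's base address is always supplied.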
+ */ + ena_set_dma_addr(ena, phys_addr, &cmd_sq->ecsq_base); + + if ((ret = ena_admin_submit_cmd(ena, &cmd, &resp, &ctx)) != 0) { + ena_err(ena, "failed to submit Create SQ command: %d", ret); + return (ret); + } + + if ((ret = ena_admin_poll_for_resp(ena, ctx)) != 0) { + ena_err(ena, "failed to Create SQ: %d", ret); + return (ret); + } + + *hw_index = resp_sq->ersq_idx; + *db_addr = (uint32_t *)(ena->ena_reg_base + + resp_sq->ersq_db_reg_offset); + return (0); +} + +int +ena_destroy_sq(ena_t *ena, uint16_t hw_idx, boolean_t is_tx) +{ + enahw_cmd_desc_t cmd; + enahw_cmd_destroy_sq_t *cmd_sq = &cmd.ecd_cmd.ecd_destroy_sq; + enahw_resp_desc_t resp; + ena_cmd_ctx_t *ctx = NULL; + int ret; + + bzero(&cmd, sizeof (cmd)); + bzero(&resp, sizeof (resp)); + cmd.ecd_opcode = ENAHW_CMD_DESTROY_SQ; + cmd_sq->edsq_idx = hw_idx; + ENAHW_CMD_DESTROY_SQ_DIR(cmd_sq, is_tx); + + if ((ret = ena_admin_submit_cmd(ena, &cmd, &resp, &ctx)) != 0) { + ena_err(ena, "failed to submit Destroy SQ command: %d", ret); + return (ret); + } + + if ((ret = ena_admin_poll_for_resp(ena, ctx)) != 0) { + ena_err(ena, "failed Destroy SQ: %d", ret); + return (ret); + } + + return (0); +} + +/* + * Determine if a given feature is available on this device. + */ +static boolean_t +ena_is_feature_avail(ena_t *ena, const enahw_feature_id_t feat_id) +{ + VERIFY3U(feat_id, <=, ENAHW_FEAT_NUM); + uint32_t mask = 1U << feat_id; + + /* + * The device attributes feature is always supported, as + * indicated by the common code. + */ + if (feat_id == ENAHW_FEAT_DEVICE_ATTRIBUTES) { + return (B_TRUE); + } + + return ((ena->ena_supported_features & mask) != 0); +} + +int +ena_set_feature(ena_t *ena, enahw_cmd_desc_t *cmd, enahw_resp_desc_t *resp, + const enahw_feature_id_t feat_id, const uint8_t feat_ver) +{ + enahw_cmd_set_feat_t *cmd_sf = &cmd->ecd_cmd.ecd_set_feat; + ena_cmd_ctx_t *ctx = NULL; + int ret = 0; + + if (!ena_is_feature_avail(ena, feat_id)) { + ena_err(ena, "attempted to set unsupported feature: 0x%x %d" + " (0x%x)", feat_id, feat_ver, ena->ena_supported_features); + return (ENOTSUP); + } + + cmd->ecd_opcode = ENAHW_CMD_SET_FEATURE; + cmd_sf->ecsf_comm.efc_id = feat_id; + cmd_sf->ecsf_comm.efc_version = feat_ver; + cmd_sf->ecsf_comm.efc_flags = 0; + + if ((ret = ena_admin_submit_cmd(ena, cmd, resp, &ctx)) != 0) { + ena_err(ena, "failed to submit Set Feature command: %d", ret); + return (ret); + } + + return (ena_admin_poll_for_resp(ena, ctx)); +} + +int +ena_get_feature(ena_t *ena, enahw_resp_desc_t *resp, + const enahw_feature_id_t feat_id, const uint8_t feat_ver) +{ + enahw_cmd_desc_t cmd; + enahw_cmd_get_feat_t *cmd_gf = &cmd.ecd_cmd.ecd_get_feat; + ena_cmd_ctx_t *ctx = NULL; + int ret = 0; + + if (!ena_is_feature_avail(ena, feat_id)) { + return (ENOTSUP); + } + + bzero(&cmd, sizeof (cmd)); + cmd.ecd_opcode = ENAHW_CMD_GET_FEATURE; + cmd_gf->ecgf_comm.efc_id = feat_id; + cmd_gf->ecgf_comm.efc_version = feat_ver; + ENAHW_GET_FEAT_FLAGS_GET_CURR_VAL(cmd_gf); + + if ((ret = ena_admin_submit_cmd(ena, &cmd, resp, &ctx)) != 0) { + ena_err(ena, "failed to submit Get Feature command: %d", ret); + return (ret); + } + + return (ena_admin_poll_for_resp(ena, ctx)); +} + +int +ena_admin_get_basic_stats(ena_t *ena, enahw_resp_desc_t *resp) +{ + int ret = 0; + enahw_cmd_desc_t cmd; + enahw_cmd_get_stats_t *cmd_stats = &cmd.ecd_cmd.ecd_get_stats; + ena_cmd_ctx_t *ctx = NULL; + + bzero(&cmd, sizeof (cmd)); + bzero(resp, sizeof (*resp)); + cmd.ecd_opcode = ENAHW_CMD_GET_STATS; + cmd_stats->ecgs_type = ENAHW_GET_STATS_TYPE_BASIC; + 
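+ /* Scope the request to this device's own Ethernet-level stats. */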
cmd_stats->ecgs_scope = ENAHW_GET_STATS_SCOPE_ETH; + cmd_stats->ecgs_device_id = ENAHW_CMD_GET_STATS_MY_DEVICE_ID; + + if ((ret = ena_admin_submit_cmd(ena, &cmd, resp, &ctx)) != 0) { + ena_err(ena, "failed to submit Get Basic Stats command: %d", + ret); + return (ret); + } + + if ((ret = ena_admin_poll_for_resp(ena, ctx)) != 0) { + ena_err(ena, "failed to Get Basic Stats: %d", ret); + return (ret); + } + + return (0); +} + +int +ena_admin_get_eni_stats(ena_t *ena, enahw_resp_desc_t *resp) +{ + int ret = 0; + enahw_cmd_desc_t cmd; + enahw_cmd_get_stats_t *cmd_stats = &cmd.ecd_cmd.ecd_get_stats; + ena_cmd_ctx_t *ctx = NULL; + + bzero(&cmd, sizeof (cmd)); + bzero(resp, sizeof (*resp)); + cmd.ecd_opcode = ENAHW_CMD_GET_STATS; + cmd_stats->ecgs_type = ENAHW_GET_STATS_TYPE_ENI; + cmd_stats->ecgs_scope = ENAHW_GET_STATS_SCOPE_ETH; + cmd_stats->ecgs_device_id = ENAHW_CMD_GET_STATS_MY_DEVICE_ID; + + if ((ret = ena_admin_submit_cmd(ena, &cmd, resp, &ctx)) != 0) { + ena_err(ena, "failed to submit Get ENI Stats command: %d", ret); + return (ret); + } + + if ((ret = ena_admin_poll_for_resp(ena, ctx)) != 0) { + ena_err(ena, "failed to Get ENI Stats: %d", ret); + return (ret); + } + + return (0); +} diff --git a/usr/src/uts/common/io/ena/ena_dma.c b/usr/src/uts/common/io/ena/ena_dma.c new file mode 100644 index 0000000000..48f39b9dbb --- /dev/null +++ b/usr/src/uts/common/io/ena/ena_dma.c @@ -0,0 +1,191 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2021 Oxide Computer Company + */ + +#include "ena.h" + +/* + * Create DMA attributes based on the conf parameter. + */ +void +ena_dma_attr(const ena_t *ena, ddi_dma_attr_t *attrp, + const ena_dma_conf_t *conf) +{ + bzero(attrp, sizeof (*attrp)); + + /* + * Round up maximums to next page. This is what the Linux and + * FreeBSD driver do, so we follow suit. + */ + const size_t size_up = + P2ROUNDUP_TYPED(conf->edc_size, ena->ena_page_sz, size_t); + + attrp->dma_attr_version = DMA_ATTR_V0; + + /* + * The device tells us the window it supports in terms of + * number of bits, we convert that to the appropriate mask. + */ + ASSERT3U(ena->ena_dma_width, >=, 32); + ASSERT3U(ena->ena_dma_width, <=, 48); + attrp->dma_attr_addr_lo = 0x0; + attrp->dma_attr_addr_hi = ENA_DMA_BIT_MASK(ena->ena_dma_width); + + /* + * This indicates the amount of data that can fit in one + * cookie/segment. We allow the entire object to live in one + * segment, when possible. + * + * NOTE: This value must be _one less_ than the desired max + * (i.e. a value of 4095 indicates a max of 4096). + */ + attrp->dma_attr_count_max = size_up - 1; + + /* + * The alignment of the starting address. + */ + attrp->dma_attr_align = conf->edc_align; + + /* + * The segment boundary dictates the address which a segment + * cannot cross. In this case there is no boundary. + */ + attrp->dma_attr_seg = UINT64_MAX; + + /* + * Allow a burst size of the entire object. + */ + attrp->dma_attr_burstsizes = size_up; + + /* + * Minimum and maximum amount of data we can send. This isn't + * strictly limited by PCI in hardware, as it'll just make the + * appropriate number of requests. 
Simiarly, PCIe allows for + * an arbitrary granularity. We set this to one, as it's + * really a matter of what hardware is requesting from us. + */ + attrp->dma_attr_minxfer = 0x1; + attrp->dma_attr_maxxfer = size_up; + attrp->dma_attr_granular = 0x1; + + /* + * The maximum length of the Scatter Gather List, aka the + * maximum number of segments a device can address in a + * transfer. + */ + attrp->dma_attr_sgllen = conf->edc_sgl; +} + +void +ena_dma_free(ena_dma_buf_t *edb) +{ + if (edb->edb_cookie != NULL) { + (void) ddi_dma_unbind_handle(edb->edb_dma_hdl); + edb->edb_cookie = NULL; + edb->edb_real_len = 0; + } + + if (edb->edb_acc_hdl != NULL) { + ddi_dma_mem_free(&edb->edb_acc_hdl); + edb->edb_acc_hdl = NULL; + edb->edb_va = NULL; + } + + if (edb->edb_dma_hdl != NULL) { + ddi_dma_free_handle(&edb->edb_dma_hdl); + edb->edb_dma_hdl = NULL; + } + + edb->edb_len = 0; +} + +boolean_t +ena_dma_alloc(ena_t *ena, ena_dma_buf_t *edb, ena_dma_conf_t *conf, size_t size) +{ + int ret; + size_t size_allocated; + ddi_dma_attr_t attr; + ddi_device_acc_attr_t acc; + uint_t flags = + conf->edc_stream ? DDI_DMA_STREAMING : DDI_DMA_CONSISTENT; + + ena_dma_attr(ena, &attr, conf); + + acc.devacc_attr_version = DDI_DEVICE_ATTR_V1; + acc.devacc_attr_endian_flags = conf->edc_endian; + acc.devacc_attr_dataorder = DDI_STRICTORDER_ACC; + + ret = ddi_dma_alloc_handle(ena->ena_dip, &attr, DDI_DMA_DONTWAIT, NULL, + &edb->edb_dma_hdl); + if (ret != DDI_SUCCESS) { + ena_err(ena, "!failed to allocate DMA handle: %d", ret); + return (B_FALSE); + } + + ret = ddi_dma_mem_alloc(edb->edb_dma_hdl, size, &acc, flags, + DDI_DMA_DONTWAIT, NULL, &edb->edb_va, &size_allocated, + &edb->edb_acc_hdl); + if (ret != DDI_SUCCESS) { + ena_err(ena, "!failed to allocate %lu bytes of DMA " + "memory: %d", size, ret); + ena_dma_free(edb); + return (B_FALSE); + } + + bzero(edb->edb_va, size_allocated); + + ret = ddi_dma_addr_bind_handle(edb->edb_dma_hdl, NULL, edb->edb_va, + size_allocated, DDI_DMA_RDWR | flags, DDI_DMA_DONTWAIT, NULL, NULL, + NULL); + if (ret != DDI_SUCCESS) { + ena_err(ena, "!failed to bind %lu bytes of DMA " + "memory: %d", size_allocated, ret); + ena_dma_free(edb); + return (B_FALSE); + } + + edb->edb_len = size; + edb->edb_real_len = size_allocated; + edb->edb_cookie = ddi_dma_cookie_one(edb->edb_dma_hdl); + return (B_TRUE); +} + +/* + * Write the physical DMA address to the ENA hardware address pointer. + * While the DMA engine should guarantee that the allocation is within + * the specified range, we double check here to catch programmer error + * and avoid hard-to-debug situations. + */ +void +ena_set_dma_addr(const ena_t *ena, const uint64_t phys_addr, + enahw_addr_t *hwaddrp) +{ + ENA_DMA_VERIFY_ADDR(ena, phys_addr); + hwaddrp->ea_low = (uint32_t)phys_addr; + hwaddrp->ea_high = (uint16_t)(phys_addr >> 32); +} + +/* + * The same as the above function, but writes the phsyical address to + * the supplied value pointers instead. Mostly used as a sanity check + * that the address fits in the reported DMA width. 
+ */ +void +ena_set_dma_addr_values(const ena_t *ena, const uint64_t phys_addr, + uint32_t *dst_low, uint16_t *dst_high) +{ + ENA_DMA_VERIFY_ADDR(ena, phys_addr); + *dst_low = (uint32_t)phys_addr; + *dst_high = (uint16_t)(phys_addr >> 32); +} diff --git a/usr/src/uts/common/io/ena/ena_gld.c b/usr/src/uts/common/io/ena/ena_gld.c new file mode 100644 index 0000000000..2c27d0d31c --- /dev/null +++ b/usr/src/uts/common/io/ena/ena_gld.c @@ -0,0 +1,465 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2021 Oxide Computer Company + */ +#include "ena.h" + +/* + * Group/Ring callbacks + */ + +/* + * The ena driver supports only a single mac address: the one assigned + * to it by the hypervisor. If mac requests an address besides this + * one, then return ENOTSUP. This will prevent VNICs from being + * created, as it should. + */ +static int +ena_group_add_mac(void *arg, const uint8_t *mac_addr) +{ + ena_t *ena = arg; + + if (ETHER_IS_MULTICAST(mac_addr)) { + return (EINVAL); + } + + if (bcmp(ena->ena_mac_addr, mac_addr, ETHERADDRL) == 0) { + return (0); + } + + return (ENOTSUP); +} + +static int +ena_group_rem_mac(void *arg, const uint8_t *mac_addr) +{ + ena_t *ena = arg; + + if (ETHER_IS_MULTICAST(mac_addr)) { + return (EINVAL); + } + + if (bcmp(ena->ena_mac_addr, mac_addr, ETHERADDRL) == 0) { + return (0); + } + + return (ENOTSUP); +} + +static int +ena_ring_rx_intr_disable(mac_intr_handle_t mih) +{ + ena_rxq_t *rxq = (ena_rxq_t *)mih; + uint32_t intr_ctrl; + + mutex_enter(&rxq->er_lock); + intr_ctrl = ena_hw_abs_read32(rxq->er_ena, rxq->er_cq_unmask_addr); + ENAHW_REG_INTR_MASK(intr_ctrl); + ena_hw_abs_write32(rxq->er_ena, rxq->er_cq_unmask_addr, intr_ctrl); + rxq->er_mode = ENA_RXQ_MODE_POLLING; + mutex_exit(&rxq->er_lock); + return (0); +} + +static int +ena_ring_rx_intr_enable(mac_intr_handle_t mih) +{ + ena_rxq_t *rxq = (ena_rxq_t *)mih; + uint32_t intr_ctrl; + + mutex_enter(&rxq->er_lock); + intr_ctrl = ena_hw_abs_read32(rxq->er_ena, rxq->er_cq_unmask_addr); + ENAHW_REG_INTR_UNMASK(intr_ctrl); + ena_hw_abs_write32(rxq->er_ena, rxq->er_cq_unmask_addr, intr_ctrl); + rxq->er_mode = ENA_RXQ_MODE_INTR; + mutex_exit(&rxq->er_lock); + return (0); +} + +static void +ena_fill_rx_group(void *arg, mac_ring_type_t rtype, const int index, + mac_group_info_t *infop, mac_group_handle_t gh) +{ + ena_t *ena = arg; + + VERIFY3S(rtype, ==, MAC_RING_TYPE_RX); + /* + * Typically you pass an Rx group data structure as + * mgi_driver, but given we should only ever have one group we + * just pass the top-level ena_t. + */ + infop->mgi_driver = (mac_group_driver_t)ena; + infop->mgi_start = NULL; + infop->mgi_stop = NULL; + infop->mgi_addmac = ena_group_add_mac; + infop->mgi_remmac = ena_group_rem_mac; + infop->mgi_count = ena->ena_num_intrs - 1; +} + +static void +ena_fill_tx_ring(void *arg, mac_ring_type_t rtype, const int group_index, + const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh) +{ + ena_t *ena = arg; + ena_txq_t *txq = &(ena->ena_txqs[ring_index]); + + VERIFY3S(rtype, ==, MAC_RING_TYPE_TX); + VERIFY3S(ring_index, <, ena->ena_num_txqs); + /* Link driver Tx queue to mac ring handle and vice versa. 
*/ + txq->et_mrh = rh; + infop->mri_driver = (mac_ring_driver_t)txq; + infop->mri_start = ena_ring_tx_start; + infop->mri_stop = ena_ring_tx_stop; + infop->mri_tx = ena_ring_tx; + infop->mri_stat = ena_ring_tx_stat; +} + +static void +ena_fill_rx_ring(void *arg, mac_ring_type_t rtype, const int group_index, + const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh) +{ + ena_t *ena = arg; + ena_rxq_t *rxq = &(ena->ena_rxqs[ring_index]); + + VERIFY3S(rtype, ==, MAC_RING_TYPE_RX); + VERIFY3S(ring_index, <, ena->ena_num_rxqs); + rxq->er_mrh = rh; + infop->mri_driver = (mac_ring_driver_t)rxq; + infop->mri_start = ena_ring_rx_start; + infop->mri_stop = ena_ring_rx_stop; + infop->mri_poll = ena_ring_rx_poll; + infop->mri_stat = ena_ring_rx_stat; + infop->mri_intr.mi_handle = (mac_intr_handle_t)rxq; + infop->mri_intr.mi_enable = ena_ring_rx_intr_enable; + infop->mri_intr.mi_disable = ena_ring_rx_intr_disable; + infop->mri_intr.mi_ddi_handle = + ena->ena_intr_handles[rxq->er_intr_vector]; +} + +static int +ena_m_start(void *arg) +{ + ena_t *ena = arg; + + atomic_or_32(&ena->ena_state, ENA_STATE_RUNNING); + return (0); +} + +static void +ena_m_stop(void *arg) +{ + ena_t *ena = arg; + atomic_and_32(&ena->ena_state, ~ENA_STATE_RUNNING); +} + +/* + * As discussed in ena_group_add_mac(), ENA only supports a single MAC + * address, and therefore we prevent VNICs from being created. That + * means there is no chance for promisc to be used as a means for + * implementing VNIC support on ENA, as we never allow them to be + * created in the first place. + * + * As for promisc itself, returning success is about the best we can + * do. There is no promisc API for an ENA device -- you get only the + * exact traffic AWS wants you to see. + */ +static int +ena_m_setpromisc(void *arg, boolean_t on) +{ + return (0); +} + +/* + * Similarly to promisc, there is no multicast API for an ENA + * device. + */ +static int +ena_m_multicast(void *arg, boolean_t add, const uint8_t *multicast_address) +{ + return (0); +} + +static boolean_t +ena_m_getcapab(void *arg, mac_capab_t capab, void *cap_data) +{ + ena_t *ena = arg; + mac_capab_rings_t *cap_rings; + + switch (capab) { + case MAC_CAPAB_RINGS: + cap_rings = cap_data; + cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; + cap_rings->mr_gaddring = NULL; + cap_rings->mr_gremring = NULL; + ASSERT3U(ena->ena_num_intrs, >=, 2); + + switch (cap_rings->mr_type) { + case MAC_RING_TYPE_TX: + /* + * We use pseudo Tx groups for now. + */ + cap_rings->mr_gnum = 0; + cap_rings->mr_rnum = ena->ena_num_intrs - 1; + cap_rings->mr_rget = ena_fill_tx_ring; + break; + case MAC_RING_TYPE_RX: + cap_rings->mr_rnum = ena->ena_num_intrs - 1; + cap_rings->mr_rget = ena_fill_rx_ring; + /* + * The ENA device provides no means to add mac + * filters or set promisc mode; it's only + * meant to receive its pre-designated unicast + * address. However, we still want rings as + * the device does provide multiple queues and + * RSS. 
+ */ + cap_rings->mr_gnum = 1; + cap_rings->mr_gget = ena_fill_rx_group; + break; + } + + break; + + case MAC_CAPAB_HCKSUM: + case MAC_CAPAB_LSO: + return (B_FALSE); + default: + return (B_FALSE); + } + + return (B_TRUE); +} + +static int +ena_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_valsize, const void *pr_val) +{ + return (ENOTSUP); +} + +static int +ena_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_valsize, void *pr_val) +{ + ena_t *ena = arg; + int ret = 0; + uint64_t speed; + uint8_t *u8; + + mutex_enter(&ena->ena_lock); + + switch (pr_num) { + case MAC_PROP_DUPLEX: + if (pr_valsize < sizeof (link_duplex_t)) { + ret = EOVERFLOW; + break; + } + + bcopy(&ena->ena_link_duplex, pr_val, sizeof (link_duplex_t)); + break; + + case MAC_PROP_SPEED: + if (pr_valsize < sizeof (uint64_t)) { + ret = EOVERFLOW; + break; + } + + speed = ena->ena_link_speed_mbits * 1000000ULL; + bcopy(&speed, pr_val, sizeof (speed)); + break; + + case MAC_PROP_STATUS: + if (pr_valsize < sizeof (link_state_t)) { + ret = EOVERFLOW; + break; + } + + bcopy(&ena->ena_link_state, pr_val, sizeof (link_state_t)); + break; + + case MAC_PROP_AUTONEG: + if (pr_valsize < sizeof (uint8_t)) { + ret = EOVERFLOW; + break; + } + + u8 = pr_val; + *u8 = (ena->ena_link_autoneg ? 0 : 1); + break; + + case MAC_PROP_MTU: + if (pr_valsize < sizeof (uint32_t)) { + ret = EOVERFLOW; + break; + } + + bcopy(&ena->ena_mtu, pr_val, sizeof (uint32_t)); + break; + + case MAC_PROP_ADV_1000FDX_CAP: + case MAC_PROP_EN_1000FDX_CAP: + if (pr_valsize < sizeof (uint8_t)) { + ret = EOVERFLOW; + break; + } + + u8 = pr_val; + *u8 = (ena->ena_link_speeds & ENAHW_LINK_SPEED_1G) != 0; + break; + + case MAC_PROP_ADV_2500FDX_CAP: + case MAC_PROP_EN_2500FDX_CAP: + if (pr_valsize < sizeof (uint8_t)) { + ret = EOVERFLOW; + break; + } + + u8 = pr_val; + *u8 = (ena->ena_link_speeds & ENAHW_LINK_SPEED_2_HALF_G) != 0; + break; + + case MAC_PROP_ADV_5000FDX_CAP: + case MAC_PROP_EN_5000FDX_CAP: + if (pr_valsize < sizeof (uint8_t)) { + ret = EOVERFLOW; + break; + } + + u8 = pr_val; + *u8 = (ena->ena_link_speeds & ENAHW_LINK_SPEED_5G) != 0; + break; + + case MAC_PROP_ADV_10GFDX_CAP: + case MAC_PROP_EN_10GFDX_CAP: + if (pr_valsize < sizeof (uint8_t)) { + ret = EOVERFLOW; + break; + } + + u8 = pr_val; + *u8 = (ena->ena_link_speeds & ENAHW_LINK_SPEED_10G) != 0; + break; + + case MAC_PROP_ADV_25GFDX_CAP: + case MAC_PROP_EN_25GFDX_CAP: + if (pr_valsize < sizeof (uint8_t)) { + ret = EOVERFLOW; + break; + } + + u8 = pr_val; + *u8 = (ena->ena_link_speeds & ENAHW_LINK_SPEED_25G) != 0; + break; + + case MAC_PROP_ADV_40GFDX_CAP: + case MAC_PROP_EN_40GFDX_CAP: + if (pr_valsize < sizeof (uint8_t)) { + ret = EOVERFLOW; + break; + } + + u8 = pr_val; + *u8 = (ena->ena_link_speeds & ENAHW_LINK_SPEED_40G) != 0; + break; + + case MAC_PROP_ADV_100GFDX_CAP: + case MAC_PROP_EN_100GFDX_CAP: + if (pr_valsize < sizeof (uint8_t)) { + ret = EOVERFLOW; + break; + } + + u8 = pr_val; + *u8 = (ena->ena_link_speeds & ENAHW_LINK_SPEED_100G) != 0; + break; + + default: + ret = ENOTSUP; + break; + } + + mutex_exit(&ena->ena_lock); + return (ret); +} + +static void +ena_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, + mac_prop_info_handle_t prh) +{ +} + +static mac_callbacks_t ena_m_callbacks = { + .mc_callbacks = MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO, + .mc_getstat = ena_m_stat, + .mc_start = ena_m_start, + .mc_stop = ena_m_stop, + .mc_setpromisc = ena_m_setpromisc, + .mc_multicst = ena_m_multicast, + .mc_getcapab 
= ena_m_getcapab, + .mc_setprop = ena_m_setprop, + .mc_getprop = ena_m_getprop, + .mc_propinfo = ena_m_propinfo, +}; + +int +ena_mac_unregister(ena_t *ena) +{ + if (ena->ena_mh == NULL) { + return (0); + } + + return (mac_unregister(ena->ena_mh)); +} + +boolean_t +ena_mac_register(ena_t *ena) +{ + int ret; + mac_register_t *regp; + + if ((regp = mac_alloc(MAC_VERSION)) == NULL) { + ena_err(ena, "failed to allocate MAC handle"); + return (B_FALSE); + } + + regp->m_type_ident = MAC_PLUGIN_IDENT_ETHER; + regp->m_driver = ena; + regp->m_dip = ena->ena_dip; + regp->m_instance = 0; + regp->m_src_addr = ena->ena_mac_addr; + regp->m_dst_addr = NULL; + regp->m_callbacks = &ena_m_callbacks; + regp->m_min_sdu = 0; + regp->m_max_sdu = ena->ena_mtu; + regp->m_pdata = NULL; + regp->m_pdata_size = 0; + regp->m_priv_props = NULL; + regp->m_margin = VLAN_TAGSZ; + regp->m_v12n = MAC_VIRT_LEVEL1; + + if ((ret = mac_register(regp, &ena->ena_mh)) != 0) { + ena_err(ena, "failed to register ena with mac: %d", ret); + } + + mac_free(regp); + + if (ret == 0) { + /* + * Until we get the first AENQ link change event, we + * do not actually know the status of the link. + */ + mac_link_update(ena->ena_mh, LINK_STATE_UNKNOWN); + } + + return (ret == 0); +} diff --git a/usr/src/uts/common/io/ena/ena_hw.c b/usr/src/uts/common/io/ena/ena_hw.c new file mode 100644 index 0000000000..f37b4100df --- /dev/null +++ b/usr/src/uts/common/io/ena/ena_hw.c @@ -0,0 +1,93 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2021 Oxide Computer Company + */ + +#include "ena_hw.h" +#include "ena.h" + +uint32_t +ena_hw_bar_read32(const ena_t *ena, const uint16_t offset) +{ + caddr_t addr = ena->ena_reg_base + offset; + return (ena_hw_abs_read32(ena, (uint32_t *)addr)); +} + +uint32_t +ena_hw_abs_read32(const ena_t *ena, uint32_t *addr) +{ + VERIFY3U(addr, >=, ena->ena_reg_base); + VERIFY3U(addr, <, ena->ena_reg_base + (ena->ena_reg_size - 4)); + + return (ddi_get32(ena->ena_reg_hdl, addr)); +} + +void +ena_hw_bar_write32(const ena_t *ena, const uint16_t offset, const uint32_t val) +{ + caddr_t addr = ena->ena_reg_base + offset; + ena_hw_abs_write32(ena, (uint32_t *)addr, val); +} + +void +ena_hw_abs_write32(const ena_t *ena, uint32_t *addr, const uint32_t val) +{ + VERIFY3P(ena, !=, NULL); + VERIFY3P(addr, !=, NULL); + VERIFY3U(addr, >=, ena->ena_reg_base); + VERIFY3U(addr, <, ena->ena_reg_base + (ena->ena_reg_size - 4)); + + ddi_put32(ena->ena_reg_hdl, addr, val); +} + +int +enahw_resp_status_to_errno(ena_t *ena, enahw_resp_status_t status) +{ + int ret = 0; + + switch (status) { + case ENAHW_RESP_SUCCESS: + break; + + case ENAHW_RESP_RESOURCE_ALLOCATION_FAILURE: + ret = ENOMEM; + break; + + case ENAHW_RESP_UNSUPPORTED_OPCODE: + ret = ENOTSUP; + break; + + case ENAHW_RESP_BAD_OPCODE: + case ENAHW_RESP_MALFORMED_REQUEST: + case ENAHW_RESP_ILLEGAL_PARAMETER: + ret = EINVAL; + break; + + case ENAHW_RESP_RESOURCE_BUSY: + ret = EAGAIN; + break; + + case ENAHW_RESP_UNKNOWN_ERROR: + default: + /* + * If the device presents us with an "unknwon error" + * code, or the status code is undefined, then we log + * an error and convert it to EIO. 
+ */ + ena_err(ena, "unexpected status code: %d", status); + ret = EIO; + break; + } + + return (ret); +} diff --git a/usr/src/uts/common/io/ena/ena_hw.h b/usr/src/uts/common/io/ena/ena_hw.h new file mode 100644 index 0000000000..fbd67851b4 --- /dev/null +++ b/usr/src/uts/common/io/ena/ena_hw.h @@ -0,0 +1,1930 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2021 Oxide Computer Company + */ + +/* + * This file declares all constants and structures dealing with the + * physical ENA device. It is based on the ena_com code of the public + * Linux and FreeBSD drivers. While this file is based on the common + * code it doesn't share the same type names. Where it is useful, a + * "common" reference is added to include the name of the type as + * defined in the common code. + * + * The Linux driver defines enq_admin_aq_entry as the top-level type + * for admin command descriptors. From this type you can access the + * common bits shared by every descriptor (ena_admin_aq_common_desc) + * as well as the control buffer (ena_admin_ctrl_buff_info) which is + * present for _some_ commands. Other than that, this top-level type + * treats the rest of the data as an opaque array of unsigned 32-bit + * integers. Then, for each individual command, the Linux driver + * defines a dedicated type, each of which contains the following: + * + * 1. The common descriptor: ena_admin_aq_common_desc. + * + * 2. The optional control buffer desc: ena_admin_ctrl_buff_info. + * + * 3. The command-specific data. + * + * 4. Optional padding to make sure all commands are 64 bytes in size. + * + * Furthermore, there may be further common types for commands which + * are made up of several sub-commands, e.g. the get/set feature + * commands. + * + * Finally, when a command is passed to the common function for + * executing commands (ena_com_execute_admin_command()), it is cast as + * a pointer to the top-level type: ena_admin_aq_entry. + * + * This works for the Linux driver just fine, but it causes lots of + * repetition in the structure definitions and also means there is no + * easy way to determine all valid commands. This ENA driver has + * turned the Linux approach inside out -- the top-level type is a + * union of all possible commands: enahw_cmd_desc_t. Each command may + * then further sub-type via unions to represent its sub-commands. + * This same treatment was given to the response descriptor: + * enahw_resp_desc_t. + * + * What is the point of knowing all this? Well, when referencing the + * common type in the comment above the enahw_ type, you need to keep + * in mind that the Linux/common type will include all the common + * descriptor bits, whereas these types do not. + * + * The common code DOES NOT pack any of these structures, and thus + * neither do we. That means these structures all rely on natural + * compiler alignment, just as the common code does. In ena.c you will + * find CTASSERTs for many of these structures, to verify they are of + * the expected size. 
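+ *
+ * As a hedged illustration of those size checks (ena.c holds the
+ * authoritative assertions), they take a form along the lines of:
+ *
+ *     CTASSERT(sizeof (enahw_cmd_desc_t) == 64);
+ *     CTASSERT(sizeof (enahw_resp_desc_t) == 64);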
+ */ + +#ifndef _ENA_HW_H +#define _ENA_HW_H + +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/types.h> +#include <sys/debug.h> +#include <sys/ethernet.h> + +/* + * The common code sets the upper limit of I/O queues to 128. In this + * case a "queue" is a SQ+CQ pair that forms a logical queue or ring + * for sending or receiving packets. Thus, at maximum, we may expect + * 128 Tx rings, and 128 Rx rings; though, practically speaking, the + * number of rings will often be limited by number of CPUs or + * available interrupts. + * + * common: ENA_MAX_NUM_IO_QUEUES + */ +#define ENAHW_MAX_NUM_IO_QUEUES 128 + +/* + * Generate a 32-bit bitmask where the bits between high (inclusive) + * and low (inclusive) are set to 1. + */ +#define GENMASK(h, l) (((~0U) - (1U << (l)) + 1) & (~0U >> (32 - 1 - (h)))) + +/* + * Generate a 64-bit bitmask where bit b is set to 1. + */ +#define BIT(b) (1UL << (b)) + +#define ENAHW_DMA_ADMINQ_ALIGNMENT 8 + +#define ENAHW_ADMIN_CQ_DESC_BUF_ALIGNMENT 8 +#define ENAHW_ADMIN_SQ_DESC_BUF_ALIGNMENT 8 +#define ENAHW_AENQ_DESC_BUF_ALIGNMENT 8 +#define ENAHW_HOST_INFO_ALIGNMENT 8 +#define ENAHW_HOST_INFO_ALLOC_SZ 4096 +#define ENAHW_IO_CQ_DESC_BUF_ALIGNMENT 4096 +#define ENAHW_IO_SQ_DESC_BUF_ALIGNMENT 8 + +/* + * BAR0 register offsets. + * + * Any register not defined in the common code was marked as a gap, + * using the hex address of the register as suffix. The idea is to + * make it clear where the gaps are and allow the + * ena_hw_update_reg_cache() function to display any bits stored in + * these gaps in case they turn out to be interesting later. + */ +#define ENAHW_REG_VERSION 0x0 +#define ENAHW_REG_CONTROLLER_VERSION 0x4 +#define ENAHW_REG_CAPS 0x8 +#define ENAHW_REG_CAPS_EXT 0xc +#define ENAHW_REG_ASQ_BASE_LO 0x10 +#define ENAHW_REG_ASQ_BASE_HI 0x14 +#define ENAHW_REG_ASQ_CAPS 0x18 +#define ENAHW_REG_GAP_1C 0x1c +#define ENAHW_REG_ACQ_BASE_LO 0x20 +#define ENAHW_REG_ACQ_BASE_HI 0x24 +#define ENAHW_REG_ACQ_CAPS 0x28 +#define ENAHW_REG_ASQ_DB 0x2c +#define ENAHW_REG_ACQ_TAIL 0x30 +#define ENAHW_REG_AENQ_CAPS 0x34 +#define ENAHW_REG_AENQ_BASE_LO 0x38 +#define ENAHW_REG_AENQ_BASE_HI 0x3c +#define ENAHW_REG_AENQ_HEAD_DB 0x40 +#define ENAHW_REG_AENQ_TAIL 0x44 +#define ENAHW_REG_GAP_48 0x48 +#define ENAHW_REG_INTERRUPT_MASK 0x4c +#define ENAHW_REG_GAP_50 0x50 +#define ENAHW_REG_DEV_CTL 0x54 +#define ENAHW_REG_DEV_STS 0x58 +#define ENAHW_REG_MMIO_REG_READ 0x5c +#define ENAHW_REG_MMIO_RESP_LO 0x60 +#define ENAHW_REG_MMIO_RESP_HI 0x64 +#define ENAHW_REG_RSS_IND_ENTRY_UPDATE 0x68 +#define ENAHW_NUM_REGS ((ENAHW_REG_RSS_IND_ENTRY_UPDATE / 4) + 1) + +/* + * Device Version (Register 0x0) + */ +#define ENAHW_DEV_MINOR_VSN_MASK 0xff +#define ENAHW_DEV_MAJOR_VSN_SHIFT 8 +#define ENAHW_DEV_MAJOR_VSN_MASK 0xff00 + +#define ENAHW_DEV_MAJOR_VSN(vsn) \ + (((vsn) & ENAHW_DEV_MAJOR_VSN_MASK) >> ENAHW_DEV_MAJOR_VSN_SHIFT) +#define ENAHW_DEV_MINOR_VSN(vsn) \ + ((vsn) & ENAHW_DEV_MINOR_VSN_MASK) + +/* + * Controller Version (Register 0x4) + */ +#define ENAHW_CTRL_SUBMINOR_VSN_MASK 0xff +#define ENAHW_CTRL_MINOR_VSN_SHIFT 8 +#define ENAHW_CTRL_MINOR_VSN_MASK 0xff00 +#define ENAHW_CTRL_MAJOR_VSN_SHIFT 16 +#define ENAHW_CTRL_MAJOR_VSN_MASK 0xff0000 +#define ENAHW_CTRL_IMPL_ID_SHIFT 24 +#define ENAHW_CTRL_IMPL_ID_MASK 0xff000000 + +#define ENAHW_CTRL_MAJOR_VSN(vsn) \ + (((vsn) & ENAHW_CTRL_MAJOR_VSN_MASK) >> ENAHW_CTRL_MAJOR_VSN_SHIFT) +#define ENAHW_CTRL_MINOR_VSN(vsn) \ + (((vsn) & ENAHW_CTRL_MINOR_VSN_MASK) >> ENAHW_CTRL_MINOR_VSN_SHIFT) +#define ENAHW_CTRL_SUBMINOR_VSN(vsn) \ + 
((vsn) & ENAHW_CTRL_SUBMINOR_VSN_MASK) +#define ENAHW_CTRL_IMPL_ID(vsn) \ + (((vsn) & ENAHW_CTRL_IMPL_ID_MASK) >> ENAHW_CTRL_IMPL_ID_SHIFT) + +/* + * Device Caps (Register 0x8) + */ +#define ENAHW_CAPS_CONTIGUOUS_QUEUE_REQUIRED_MASK 0x1 +#define ENAHW_CAPS_RESET_TIMEOUT_SHIFT 1 +#define ENAHW_CAPS_RESET_TIMEOUT_MASK 0x3e +#define ENAHW_CAPS_RESET_TIMEOUT(v) \ + (((v) & ENAHW_CAPS_RESET_TIMEOUT_MASK) >> \ + ENAHW_CAPS_RESET_TIMEOUT_SHIFT) +#define ENAHW_CAPS_DMA_ADDR_WIDTH_SHIFT 8 +#define ENAHW_CAPS_DMA_ADDR_WIDTH_MASK 0xff00 +#define ENAHW_CAPS_DMA_ADDR_WIDTH(v) \ + (((v) & ENAHW_CAPS_DMA_ADDR_WIDTH_MASK) >> \ + ENAHW_CAPS_DMA_ADDR_WIDTH_SHIFT) +#define ENAHW_CAPS_ADMIN_CMD_TIMEOUT_SHIFT 16 +#define ENAHW_CAPS_ADMIN_CMD_TIMEOUT_MASK 0xf0000 +#define ENAHW_CAPS_ADMIN_CMD_TIMEOUT(v) \ + (((v) & ENAHW_CAPS_ADMIN_CMD_TIMEOUT_MASK) >> \ + ENAHW_CAPS_ADMIN_CMD_TIMEOUT_SHIFT) + +enum enahw_reset_reason_types { + ENAHW_RESET_NORMAL = 0, + ENAHW_RESET_KEEP_ALIVE_TO = 1, + ENAHW_RESET_ADMIN_TO = 2, + ENAHW_RESET_MISS_TX_CMPL = 3, + ENAHW_RESET_INV_RX_REQ_ID = 4, + ENAHW_RESET_INV_TX_REQ_ID = 5, + ENAHW_RESET_TOO_MANY_RX_DESCS = 6, + ENAHW_RESET_INIT_ERR = 7, + ENAHW_RESET_DRIVER_INVALID_STATE = 8, + ENAHW_RESET_OS_TRIGGER = 9, + ENAHW_RESET_OS_NETDEV_WD = 10, + ENAHW_RESET_SHUTDOWN = 11, + ENAHW_RESET_USER_TRIGGER = 12, + ENAHW_RESET_GENERIC = 13, + ENAHW_RESET_MISS_INTERRUPT = 14, + ENAHW_RESET_LAST, +}; + +/* + * Admin Submission Queue Caps (Register 0x18) + */ +#define ENAHW_ASQ_CAPS_DEPTH_MASK 0xffff +#define ENAHW_ASQ_CAPS_ENTRY_SIZE_SHIFT 16 +#define ENAHW_ASQ_CAPS_ENTRY_SIZE_MASK 0xffff0000 + +#define ENAHW_ASQ_CAPS_DEPTH(x) ((x) & ENAHW_ASQ_CAPS_DEPTH_MASK) + +#define ENAHW_ASQ_CAPS_ENTRY_SIZE(x) \ + (((x) << ENAHW_ASQ_CAPS_ENTRY_SIZE_SHIFT) & \ + ENAHW_ASQ_CAPS_ENTRY_SIZE_MASK) + +/* + * Admin Completion Queue Caps (Register 0x28) + */ +#define ENAHW_ACQ_CAPS_DEPTH_MASK 0xffff +#define ENAHW_ACQ_CAPS_ENTRY_SIZE_SHIFT 16 +#define ENAHW_ACQ_CAPS_ENTRY_SIZE_MASK 0xffff0000 + +#define ENAHW_ACQ_CAPS_DEPTH(x) ((x) & ENAHW_ACQ_CAPS_DEPTH_MASK) + +#define ENAHW_ACQ_CAPS_ENTRY_SIZE(x) \ + (((x) << ENAHW_ACQ_CAPS_ENTRY_SIZE_SHIFT) & \ + ENAHW_ACQ_CAPS_ENTRY_SIZE_MASK) + +/* + * Asynchronous Event Notification Queue Caps (Register 0x34) + */ +#define ENAHW_AENQ_CAPS_DEPTH_MASK 0xffff +#define ENAHW_AENQ_CAPS_ENTRY_SIZE_SHIFT 16 +#define ENAHW_AENQ_CAPS_ENTRY_SIZE_MASK 0xffff0000 + +#define ENAHW_AENQ_CAPS_DEPTH(x) ((x) & ENAHW_AENQ_CAPS_DEPTH_MASK) + +#define ENAHW_AENQ_CAPS_ENTRY_SIZE(x) \ + (((x) << ENAHW_AENQ_CAPS_ENTRY_SIZE_SHIFT) & \ + ENAHW_AENQ_CAPS_ENTRY_SIZE_MASK) + +/* + * Interrupt Mask (Register 0x4c) + */ +#define ENAHW_INTR_UNMASK 0x0 +#define ENAHW_INTR_MASK 0x1 + +/* + * Device Control (Register 0x54) + */ +#define ENAHW_DEV_CTL_DEV_RESET_MASK 0x1 +#define ENAHW_DEV_CTL_AQ_RESTART_SHIFT 1 +#define ENAHW_DEV_CTL_AQ_RESTART_MASK 0x2 +#define ENAHW_DEV_CTL_QUIESCENT_SHIFT 2 +#define ENAHW_DEV_CTL_QUIESCENT_MASK 0x4 +#define ENAHW_DEV_CTL_IO_RESUME_SHIFT 3 +#define ENAHW_DEV_CTL_IO_RESUME_MASK 0x8 +#define ENAHW_DEV_CTL_RESET_REASON_SHIFT 28 +#define ENAHW_DEV_CTL_RESET_REASON_MASK 0xf0000000 + +/* + * Device Status (Register 0x58) + */ +#define ENAHW_DEV_STS_READY_MASK 0x1 +#define ENAHW_DEV_STS_AQ_RESTART_IN_PROGRESS_SHIFT 1 +#define ENAHW_DEV_STS_AQ_RESTART_IN_PROGRESS_MASK 0x2 +#define ENAHW_DEV_STS_AQ_RESTART_FINISHED_SHIFT 2 +#define ENAHW_DEV_STS_AQ_RESTART_FINISHED_MASK 0x4 +#define ENAHW_DEV_STS_RESET_IN_PROGRESS_SHIFT 3 +#define ENAHW_DEV_STS_RESET_IN_PROGRESS_MASK 0x8 +#define 
ENAHW_DEV_STS_RESET_FINISHED_SHIFT 4 +#define ENAHW_DEV_STS_RESET_FINISHED_MASK 0x10 +#define ENAHW_DEV_STS_FATAL_ERROR_SHIFT 5 +#define ENAHW_DEV_STS_FATAL_ERROR_MASK 0x20 +#define ENAHW_DEV_STS_QUIESCENT_STATE_IN_PROGRESS_SHIFT 6 +#define ENAHW_DEV_STS_QUIESCENT_STATE_IN_PROGRESS_MASK 0x40 +#define ENAHW_DEV_STS_QUIESCENT_STATE_ACHIEVED_SHIFT 7 +#define ENAHW_DEV_STS_QUIESCENT_STATE_ACHIEVED_MASK 0x80 + +/* common: ena_admin_aenq_common_desc */ +typedef struct enahw_aenq_desc { + uint16_t ead_group; + uint16_t ead_syndrome; + uint8_t ead_flags; + uint8_t ead_rsvd1[3]; + uint32_t ead_ts_low; + uint32_t ead_ts_high; + + union { + uint32_t raw[12]; + + struct { + uint32_t flags; + } link_change; + + struct { + uint32_t rx_drops_low; + uint32_t rx_drops_high; + uint32_t tx_drops_low; + uint32_t tx_drops_high; + } keep_alive; + } ead_payload; +} enahw_aenq_desc_t; + +#define ENAHW_AENQ_DESC_PHASE_MASK BIT(0) + +#define ENAHW_AENQ_DESC_PHASE(desc) \ + ((desc)->ead_flags & ENAHW_AENQ_DESC_PHASE_MASK) + +#define ENAHW_AENQ_LINK_CHANGE_LINK_STATUS_MASK BIT(0) + +/* + * Asynchronous Event Notification Queue groups. + * + * Note: These values represent the bit position of each feature as + * returned by ENAHW_FEAT_AENQ_CONFIG. We encode them this way so that + * they can double as an index into the AENQ handlers array. + * + * common: ena_admin_aenq_group + */ +typedef enum enahw_aenq_groups { + ENAHW_AENQ_GROUP_LINK_CHANGE = 0, + ENAHW_AENQ_GROUP_FATAL_ERROR = 1, + ENAHW_AENQ_GROUP_WARNING = 2, + ENAHW_AENQ_GROUP_NOTIFICATION = 3, + ENAHW_AENQ_GROUP_KEEP_ALIVE = 4, + ENAHW_AENQ_GROUP_REFRESH_CAPABILITIES = 5, + ENAHW_AENQ_GROUPS_ARR_NUM = 6, +} enahw_aenq_groups_t; + +/* + * The reason for ENAHW_AENQ_GROUP_NOFIFICATION. + * + * common: ena_admin_aenq_notification_syndrome + */ +typedef enum enahw_aenq_syndrome { + ENAHW_AENQ_SYNDROME_UPDATE_HINTS = 2, +} enahw_aenq_syndrome_t; + +/* + * ENA devices use a 48-bit memory space. + * + * common: ena_common_mem_addr + */ +typedef struct enahw_addr { + uint32_t ea_low; + uint16_t ea_high; + uint16_t ea_rsvd; /* must be zero */ +} enahw_addr_t; + +/* common: ena_admin_ctrl_buff_info */ +struct enahw_ctrl_buff { + uint32_t ecb_length; + enahw_addr_t ecb_addr; +}; + +/* common: ena_admin_get_set_feature_common_desc */ +struct enahw_feat_common { + /* + * 1:0 Select which value you want. + * + * 0x1 = Current value. + * 0x3 = Default value. + * + * Note: Linux seems to set this to 0 to get the value, + * not sure if that's a bug or just another way to get the + * current value. + * + * 7:3 Reserved. + */ + uint8_t efc_flags; + + /* An id from enahw_feature_id_t. */ + uint8_t efc_id; + + /* + * Each feature is versioned, allowing upgrades to the feature + * set without breaking backwards compatibility. The driver + * uses this field to specify which version it supports + * (starting from zero). Linux doesn't document this very well + * and sets this value to 0 for most features. We define a set + * of macros, underneath the enahw_feature_id_t type, clearly + * documenting the version we support for each feature. + */ + uint8_t efc_version; + uint8_t efc_rsvd; +}; + +/* common: ena_admin_get_feat_cmd */ +typedef struct enahw_cmd_get_feat { + struct enahw_ctrl_buff ecgf_ctrl_buf; + struct enahw_feat_common ecgf_comm; + uint32_t egcf_unused[11]; +} enahw_cmd_get_feat_t; + +/* + * N.B. Linux sets efc_flags to 0 (via memset) when reading the + * current value, but the comments say it should be 0x1. We follow the + * comments. 
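+ *
+ * Illustrative use of the macro below (hypothetical caller, not
+ * taken verbatim from the driver):
+ *
+ *     enahw_cmd_desc_t cmd = { 0 };
+ *
+ *     ENAHW_GET_FEAT_FLAGS_GET_CURR_VAL(&cmd.ecd_cmd.ecd_get_feat);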
+ */ +#define ENAHW_GET_FEAT_FLAGS_GET_CURR_VAL(desc) \ + ((desc)->ecgf_comm.efc_flags) |= 0x1 +#define ENAHW_GET_FEAT_FLAGS_GET_DEF_VAL(desc) \ + ((desc)->ecgf_comm.efc_flags) |= 0x3 + +/* + * Set the MTU of the device. This value does not include the L2 + * headers or trailers, only the payload. + * + * common: ena_admin_set_feature_mtu_desc + */ +typedef struct enahw_feat_mtu { + uint32_t efm_mtu; +} enahw_feat_mtu_t; + +/* common: ena_admin_set_feature_host_attr_desc */ +typedef struct enahw_feat_host_attr { + enahw_addr_t efha_os_addr; + enahw_addr_t efha_debug_addr; + uint32_t efha_debug_sz; +} enahw_feat_host_attr_t; + +/* + * ENAHW_FEAT_AENQ_CONFIG + * + * common: ena_admin_feature_aenq_desc + */ +typedef struct enahw_feat_aenq { + /* Bitmask of AENQ groups this device supports. */ + uint32_t efa_supported_groups; + + /* Bitmask of AENQ groups currently enabled. */ + uint32_t efa_enabled_groups; +} enahw_feat_aenq_t; + +/* common: ena_admin_set_feat_cmd */ +typedef struct enahw_cmd_set_feat { + struct enahw_ctrl_buff ecsf_ctrl_buf; + struct enahw_feat_common ecsf_comm; + + union { + uint32_t ecsf_raw[11]; + enahw_feat_host_attr_t ecsf_host_attr; + enahw_feat_mtu_t ecsf_mtu; + enahw_feat_aenq_t ecsf_aenq; + } ecsf_feat; +} enahw_cmd_set_feat_t; + +/* + * Used to populate the host information buffer which the Nitro + * hypervisor supposedly uses for display, debugging, and possibly + * other purposes. + * + * common: ena_admin_host_info + */ +typedef struct enahw_host_info { + uint32_t ehi_os_type; + uint8_t ehi_os_dist_str[128]; + uint32_t ehi_os_dist; + uint8_t ehi_kernel_ver_str[32]; + uint32_t ehi_kernel_ver; + uint32_t ehi_driver_ver; + uint32_t ehi_supported_net_features[2]; + uint16_t ehi_ena_spec_version; + uint16_t ehi_bdf; + uint16_t ehi_num_cpus; + uint16_t ehi_rsvd; + uint32_t ehi_driver_supported_features; +} enahw_host_info_t; + +#define ENAHW_HOST_INFO_MAJOR_MASK GENMASK(7, 0) +#define ENAHW_HOST_INFO_MINOR_SHIFT 8 +#define ENAHW_HOST_INFO_MINOR_MASK GENMASK(15, 8) +#define ENAHW_HOST_INFO_SUB_MINOR_SHIFT 16 +#define ENAHW_HOST_INFO_SUB_MINOR_MASK GENMASK(23, 16) +#define ENAHW_HOST_INFO_SPEC_MAJOR_SHIFT 8 +#define ENAHW_HOST_INFO_MODULE_TYPE_SHIFT 24 +#define ENAHW_HOST_INFO_MODULE_TYPE_MASK GENMASK(31, 24) +#define ENAHW_HOST_INFO_FUNCTION_MASK GENMASK(2, 0) +#define ENAHW_HOST_INFO_DEVICE_SHIFT 3 +#define ENAHW_HOST_INFO_DEVICE_MASK GENMASK(7, 3) +#define ENAHW_HOST_INFO_BUS_SHIFT 8 +#define ENAHW_HOST_INFO_BUS_MASK GENMASK(15, 8) +#define ENAHW_HOST_INFO_RX_OFFSET_SHIFT 1 +#define ENAHW_HOST_INFO_RX_OFFSET_MASK BIT(1) +#define ENAHW_HOST_INFO_INTERRUPT_MODERATION_SHIFT 2 +#define ENAHW_HOST_INFO_INTERRUPT_MODERATION_MASK BIT(2) +#define ENAHW_HOST_INFO_RX_BUF_MIRRORING_SHIFT 3 +#define ENAHW_HOST_INFO_RX_BUF_MIRRORING_MASK BIT(3) +#define ENAHW_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_SHIFT 4 +#define ENAHW_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_MASK BIT(4) + +/* common: ena_admin_os_type */ +enum enahw_os_type { + ENAHW_OS_LINUX = 1, + ENAHW_OS_WIN = 2, + ENAHW_OS_DPDK = 3, + ENAHW_OS_FREEBSD = 4, + ENAHW_OS_IPXE = 5, + ENAHW_OS_ESXI = 6, + ENAHW_OS_MACOS = 7, + ENAHW_OS_GROUPS_NUM = 7, +}; + +/* + * Create I/O Completion Queue + * + * A completion queue is where the device writes responses to I/O + * requests. The admin completion queue must be created before such a + * command can be issued, see ena_admin_cq_init(). 
+ * + * common: ena_admin_aq_create_cq_cmd + */ +typedef struct enahw_cmd_create_cq { + /* + * 7-6 reserved + * + * 5 interrupt mode: when set the device sends an interrupt + * for each completion, otherwise the driver must poll + * the queue. + * + * 4-0 reserved + */ + uint8_t ecq_caps_1; + + /* + * 7-5 reserved + * + * 4-0 CQ entry size (in words): the size of a single CQ entry + * in multiples of 32-bit words. + * + * NOTE: According to the common code the "valid" values + * are 4 or 8 -- this is incorrect. The valid values are + * 2 and 4. The common code does have an "extended" Rx + * completion descriptor, ena_eth_io_rx_cdesc_ext, that + * is 32 bytes and thus would use a value of 8, but it is + * not used by the Linux or FreeBSD drivers, so we do not + * bother with it. + * + * Type Bytes Value + * enahw_tx_cdesc_t 8 2 + * enahw_rx_cdesc_t 16 4 + */ + uint8_t ecq_caps_2; + + /* The number of CQ entries, must be a power of 2. */ + uint16_t ecq_num_descs; + + /* The MSI-X vector assigned to this CQ. */ + uint32_t ecq_msix_vector; + + /* + * The CQ's physical base address. The CQ memory must be + * physically contiguous. + */ + enahw_addr_t ecq_addr; +} enahw_cmd_create_cq_t; + +#define ENAHW_CMD_CREATE_CQ_INTERRUPT_MODE_ENABLED_SHIFT 5 +#define ENAHW_CMD_CREATE_CQ_INTERRUPT_MODE_ENABLED_MASK (BIT(5)) +#define ENAHW_CMD_CREATE_CQ_DESC_SIZE_WORDS_MASK (GENMASK(4, 0)) + +#define ENAHW_CMD_CREATE_CQ_INTERRUPT_MODE_ENABLE(cmd) \ + ((cmd)->ecq_caps_1 |= ENAHW_CMD_CREATE_CQ_INTERRUPT_MODE_ENABLED_MASK) + +#define ENAHW_CMD_CREATE_CQ_DESC_SIZE_WORDS(cmd, val) \ + (((cmd)->ecq_caps_2) |= \ + ((val) & ENAHW_CMD_CREATE_CQ_DESC_SIZE_WORDS_MASK)) + +/* + * Destroy Completion Queue + * + * common: ena_admin_aq_destroy_cq_cmd + */ +typedef struct enahw_cmd_destroy_cq { + uint16_t edcq_idx; + uint16_t edcq_rsvd; +} enahw_cmd_destroy_cq_t; + +/* + * common: ena_admin_aq_create_sq_cmd + */ +typedef struct enahw_cmd_create_sq { + /* + * 7-5 direction: 0x1 = Tx, 0x2 = Rx + * 4-0 reserved + */ + uint8_t ecsq_dir; + uint8_t ecsq_rsvd1; + + /* + * 7 reserved + * + * 6-4 completion policy: How are completion events generated. + * + * See enahw_completion_policy_type_t for a description of + * the various values. + * + * 3-0 placement policy: Where the descriptor ring and + * headers reside. + * + * See enahw_placement_policy_t for a description of the + * various values. + */ + uint8_t ecsq_caps_2; + + /* + * 7-1 reserved + * + * 0 physically contiguous: When set indicates the descriptor + * ring memory is physically contiguous. + */ + uint8_t ecsq_caps_3; + + /* + * The index of the associated Completion Queue (CQ). The CQ + * must be created before the SQ. + */ + uint16_t ecsq_cq_idx; + + /* The number of descriptors in this SQ. */ + uint16_t ecsq_num_descs; + + /* + * The base physical address of the SQ. This should not be set + * for LLQ. Must be page aligned. + */ + enahw_addr_t ecsq_base; + + /* + * The physical address of the head write-back pointer. Valid + * only when the completion policy is set to one of the head + * write-back modes (0x2 or 0x3). Must be cacheline size + * aligned. + */ + enahw_addr_t ecsq_head_wb; + uint32_t ecsq_rsvdw2; + uint32_t ecsq_rsvdw3; +} enahw_cmd_create_sq_t; + +typedef enum enahw_sq_direction { + ENAHW_SQ_DIRECTION_TX = 1, + ENAHW_SQ_DIRECTION_RX = 2, +} enahw_sq_direction_t; + +typedef enum enahw_placement_policy { + /* Descriptors and headers are in host memory. 
*/ + ENAHW_PLACEMENT_POLICY_HOST = 1, + + /* + * Descriptors and headers are in device memory (a.k.a Low + * Latency Queue). + */ + ENAHW_PLACEMENT_POLICY_DEV = 3, +} enahw_placement_policy_t; + +/* + * DESC: Write a CQ entry for each SQ descriptor. + * + * DESC_ON_DEMAND: Write a CQ entry when requested by the SQ descriptor. + * + * HEAD_ON_DEMAND: Update head pointer when requested by the SQ + * descriptor. + * + * HEAD: Update head pointer for each SQ descriptor. + * + */ +typedef enum enahw_completion_policy_type { + ENAHW_COMPLETION_POLICY_DESC = 0, + ENAHW_COMPLETION_POLICY_DESC_ON_DEMAND = 1, + ENAHW_COMPLETION_POLICY_HEAD_ON_DEMAND = 2, + ENAHW_COMPLETION_POLICY_HEAD = 3, +} enahw_completion_policy_type_t; + +#define ENAHW_CMD_CREATE_SQ_DIR_SHIFT 5 +#define ENAHW_CMD_CREATE_SQ_DIR_MASK GENMASK(7, 5) +#define ENAHW_CMD_CREATE_SQ_PLACEMENT_POLICY_MASK GENMASK(3, 0) +#define ENAHW_CMD_CREATE_SQ_COMPLETION_POLICY_SHIFT 4 +#define ENAHW_CMD_CREATE_SQ_COMPLETION_POLICY_MASK GENMASK(6, 4) +#define ENAHW_CMD_CREATE_SQ_PHYSMEM_CONTIG_MASK BIT(0) + +#define ENAHW_CMD_CREATE_SQ_DIR(cmd, val) \ + (((cmd)->ecsq_dir) |= (((val) << ENAHW_CMD_CREATE_SQ_DIR_SHIFT) & \ + ENAHW_CMD_CREATE_SQ_DIR_MASK)) + +#define ENAHW_CMD_CREATE_SQ_PLACEMENT_POLICY(cmd, val) \ + (((cmd)->ecsq_caps_2) |= \ + ((val) & ENAHW_CMD_CREATE_SQ_PLACEMENT_POLICY_MASK)) + +#define ENAHW_CMD_CREATE_SQ_COMPLETION_POLICY(cmd, val) \ + (((cmd)->ecsq_caps_2) |= \ + (((val) << ENAHW_CMD_CREATE_SQ_COMPLETION_POLICY_SHIFT) & \ + ENAHW_CMD_CREATE_SQ_COMPLETION_POLICY_MASK)) + +#define ENAHW_CMD_CREATE_SQ_PHYSMEM_CONTIG(cmd) \ + ((cmd)->ecsq_caps_3 |= ENAHW_CMD_CREATE_SQ_PHYSMEM_CONTIG_MASK) + +/* common: ena_admin_sq */ +typedef struct enahw_cmd_destroy_sq { + uint16_t edsq_idx; + uint8_t edsq_dir; /* Tx/Rx */ + uint8_t edsq_rsvd; +} enahw_cmd_destroy_sq_t; + +#define ENAHW_CMD_DESTROY_SQ_DIR_SHIFT 5 +#define ENAHW_CMD_DESTROY_SQ_DIR_MASK GENMASK(7, 5) + +#define ENAHW_CMD_DESTROY_SQ_DIR(cmd, val) \ + (((cmd)->edsq_dir) |= (((val) << ENAHW_CMD_DESTROY_SQ_DIR_SHIFT) & \ + ENAHW_CMD_DESTROY_SQ_DIR_MASK)) + +/* common: ena_admin_aq_get_stats_cmd */ +typedef struct enahw_cmd_get_stats { + struct enahw_ctrl_buff ecgs_ctrl_buf; + uint8_t ecgs_type; + uint8_t ecgs_scope; + uint16_t ecgs_rsvd; + uint16_t ecgs_queue_idx; + + /* + * The device ID for which to query stats from. The sentinel + * value 0xFFFF indicates a query of the current device. + * According to the common docs, a "privileged device" may + * query stats for other ENA devices. However the definition + * of this "privilege device" is not expanded upon. + */ + uint16_t ecgs_device_id; +} enahw_cmd_get_stats_t; + +/* Query the stats for my device. */ +#define ENAHW_CMD_GET_STATS_MY_DEVICE_ID 0xFFFF + +/* + * BASIC: Returns enahw_resp_basic_stats. + * + * EXTENDED: According to the Linux documentation returns a buffer in + * "string format" with additional statistics per queue and per device ID. + * + * ENI: According to the Linux documentation it returns "extra HW + * stats for a specific network interfaces". 
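+ *
+ * (For example, the driver's ena_admin_get_eni_stats() requests the
+ * ENI type with ENAHW_GET_STATS_SCOPE_ETH and the
+ * ENAHW_CMD_GET_STATS_MY_DEVICE_ID sentinel.)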
+ * + * common: ena_admin_get_stats_type + */ +typedef enum enahw_get_stats_type { + ENAHW_GET_STATS_TYPE_BASIC = 0, + ENAHW_GET_STATS_TYPE_EXTENDED = 1, + ENAHW_GET_STATS_TYPE_ENI = 2, +} enahw_get_stats_type_t; + +/* common: ena_admin_get_stats_scope */ +typedef enum enahw_get_stats_scope { + ENAHW_GET_STATS_SCOPE_QUEUE = 0, + ENAHW_GET_STATS_SCOPE_ETH = 1, +} enahw_get_stats_scope_t; + +/* common: ena_admin_aq_entry */ +typedef struct enahw_cmd_desc { + uint16_t ecd_cmd_id; + uint8_t ecd_opcode; + uint8_t ecd_flags; + + union { + uint32_t ecd_raw[15]; + enahw_cmd_get_feat_t ecd_get_feat; + enahw_cmd_set_feat_t ecd_set_feat; + enahw_cmd_create_cq_t ecd_create_cq; + enahw_cmd_destroy_cq_t ecd_destroy_cq; + enahw_cmd_create_sq_t ecd_create_sq; + enahw_cmd_destroy_sq_t ecd_destroy_sq; + enahw_cmd_get_stats_t ecd_get_stats; + } ecd_cmd; + +} enahw_cmd_desc_t; + +/* + * top level commands that may be sent to the Admin Queue. + * + * common: ena_admin_aq_opcode + */ +typedef enum ena_cmd_opcode { + ENAHW_CMD_NONE = 0, + ENAHW_CMD_CREATE_SQ = 1, + ENAHW_CMD_DESTROY_SQ = 2, + ENAHW_CMD_CREATE_CQ = 3, + ENAHW_CMD_DESTROY_CQ = 4, + ENAHW_CMD_GET_FEATURE = 8, + ENAHW_CMD_SET_FEATURE = 9, + ENAHW_CMD_GET_STATS = 11, +} enahw_cmd_opcode_t; + +/* common: ENA_ADMIN_AQ_COMMON_DESC */ +#define ENAHW_CMD_ID_MASK GENMASK(11, 0) +#define ENAHW_CMD_PHASE_MASK BIT(0) + +#define ENAHW_CMD_ID(desc, id) \ + (((desc)->ecd_cmd_id) |= ((id) & ENAHW_CMD_ID_MASK)) + +/* + * Subcommands for ENA_ADMIN_{GET,SET}_FEATURE. + * + * common: ena_admin_aq_feature_id + */ +typedef enum enahw_feature_id { + ENAHW_FEAT_DEVICE_ATTRIBUTES = 1, + ENAHW_FEAT_MAX_QUEUES_NUM = 2, + ENAHW_FEAT_HW_HINTS = 3, + ENAHW_FEAT_LLQ = 4, + ENAHW_FEAT_EXTRA_PROPERTIES_STRINGS = 5, + ENAHW_FEAT_EXTRA_PROPERTIES_FLAGS = 6, + ENAHW_FEAT_MAX_QUEUES_EXT = 7, + ENAHW_FEAT_RSS_HASH_FUNCTION = 10, + ENAHW_FEAT_STATELESS_OFFLOAD_CONFIG = 11, + ENAHW_FEAT_RSS_INDIRECTION_TABLE_CONFIG = 12, + ENAHW_FEAT_MTU = 14, + ENAHW_FEAT_RSS_HASH_INPUT = 18, + ENAHW_FEAT_INTERRUPT_MODERATION = 20, + ENAHW_FEAT_AENQ_CONFIG = 26, + ENAHW_FEAT_LINK_CONFIG = 27, + ENAHW_FEAT_HOST_ATTR_CONFIG = 28, + ENAHW_FEAT_NUM = 32, +} enahw_feature_id_t; + +/* + * The following macros define the maximum version we support for each + * feature. These are the feature versions we use to communicate with + * the feature command. Linux has these values spread throughout the + * code at the various callsites of ena_com_get_feature(). We choose + * to centralize our feature versions to make it easier to audit. 
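+ *
+ * For example (illustrative), a Get Feature command for
+ * ENAHW_FEAT_MAX_QUEUES_EXT would carry ecgf_comm.efc_id set to that
+ * feature ID and ecgf_comm.efc_version set to
+ * ENAHW_FEAT_MAX_QUEUES_EXT_VER, currently 1.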
+ */ +#define ENAHW_FEAT_DEVICE_ATTRIBUTES_VER 0 +#define ENAHW_FEAT_MAX_QUEUES_NUM_VER 0 +#define ENAHW_FEAT_HW_HINTS_VER 0 +#define ENAHW_FEAT_LLQ_VER 0 +#define ENAHW_FEAT_EXTRA_PROPERTIES_STRINGS_VER 0 +#define ENAHW_FEAT_EXTRA_PROPERTIES_FLAGS_VER 0 +#define ENAHW_FEAT_MAX_QUEUES_EXT_VER 1 +#define ENAHW_FEAT_RSS_HASH_FUNCTION_VER 0 +#define ENAHW_FEAT_STATELESS_OFFLOAD_CONFIG_VER 0 +#define ENAHW_FEAT_RSS_INDIRECTION_TABLE_CONFIG_VER 0 +#define ENAHW_FEAT_MTU_VER 0 +#define ENAHW_FEAT_RSS_HASH_INPUT_VER 0 +#define ENAHW_FEAT_INTERRUPT_MODERATION_VER 0 +#define ENAHW_FEAT_AENQ_CONFIG_VER 0 +#define ENAHW_FEAT_LINK_CONFIG_VER 0 +#define ENAHW_FEAT_HOST_ATTR_CONFIG_VER 0 + +/* common: ena_admin_link_types */ +typedef enum enahw_link_speeds { + ENAHW_LINK_SPEED_1G = 0x1, + ENAHW_LINK_SPEED_2_HALF_G = 0x2, + ENAHW_LINK_SPEED_5G = 0x4, + ENAHW_LINK_SPEED_10G = 0x8, + ENAHW_LINK_SPEED_25G = 0x10, + ENAHW_LINK_SPEED_40G = 0x20, + ENAHW_LINK_SPEED_50G = 0x40, + ENAHW_LINK_SPEED_100G = 0x80, + ENAHW_LINK_SPEED_200G = 0x100, + ENAHW_LINK_SPEED_400G = 0x200, +} enahw_link_speeds_t; + +/* + * Response to ENAHW_FEAT_HW_HINTS. + * + * Hints from the device to the driver about what values to use for + * various communications between the two. A value of 0 indicates + * there is no hint and the driver should provide its own default. All + * timeout values are in milliseconds. + * + * common: ena_admin_ena_hw_hints + */ +typedef struct enahw_device_hints { + /* + * The amount of time the driver should wait for an MMIO read + * reply before giving up and returning an error. + */ + uint16_t edh_mmio_read_timeout; + + /* + * If the driver has not seen an AENQ keep alive in this + * timeframe, then consider the device hung and perform a + * reset. + */ + uint16_t edh_keep_alive_timeout; + + /* + * The timeperiod in which we expect a Tx to report + * completion, otherwise it is considered "missed". Initiate a + * device reset when the number of missed completions is + * greater than the threshold. + */ + uint16_t edh_tx_comp_timeout; + uint16_t edh_missed_tx_reset_threshold; + + /* + * The timeperiod in which we expect an admin command to + * report completion. + */ + uint16_t edh_admin_comp_timeout; + + /* + * Used by Linux to set the netdevice 'watchdog_timeo' value. + * This value is used by the networking stack to determine + * when a pending transmission has stalled. This is similar to + * the keep alive timeout, except its viewing progress from + * the perspective of the network stack itself. This differnce + * is subtle but important: the device could be in a state + * where it has a functioning keep alive heartbeat, but has a + * stuck Tx queue impeding forward progress of the networking + * stack (which in many cases results in a scenario + * indistinguishable form a complete host hang). + * + * The mac layer does not currently provide such + * functionality, though it could and should be extended to + * support such a feature. + */ + uint16_t edh_net_wd_timeout; + + /* + * The maximum number of cookies/segments allowed in a DMA + * scatter-gather list. + */ + uint16_t edh_max_tx_sgl; + uint16_t edh_max_rx_sgl; + + uint16_t reserved[8]; +} enahw_device_hints_t; + +/* + * Response to ENAHW_FEAT_DEVICE_ATTRIBUTES. + * + * common: ena_admin_device_attr_feature_desc + */ +typedef struct enahw_feat_dev_attr { + uint32_t efda_impl_id; + uint32_t efda_device_version; + + /* + * Bitmap representing supported get/set feature subcommands + * (enahw_feature_id). 
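+ *
+ * For example (illustrative, assuming bit position equals feature
+ * ID), a device supporting the MTU subcommand would leave
+ * (efda_supported_features & BIT(ENAHW_FEAT_MTU)) non-zero.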
+ */ + uint32_t efda_supported_features; + uint32_t efda_rsvd1; + + /* Number of bits used for physical/vritual address. */ + uint32_t efda_phys_addr_width; + uint32_t efda_virt_addr_with; + + /* The unicast MAC address in network byte order. */ + uint8_t efda_mac_addr[6]; + uint8_t efda_rsvd2[2]; + uint32_t efda_max_mtu; +} enahw_feat_dev_attr_t; + +/* + * Response to ENAHW_FEAT_MAX_QUEUES_NUM. + * + * common: ena_admin_queue_feature_desc + */ +typedef struct enahw_feat_max_queue { + uint32_t efmq_max_sq_num; + uint32_t efmq_max_sq_depth; + uint32_t efmq_max_cq_num; + uint32_t efmq_max_cq_depth; + uint32_t efmq_max_legacy_llq_num; + uint32_t efmq_max_legacy_llq_depth; + uint32_t efmq_max_header_size; + + /* + * The maximum number of descriptors a single Tx packet may + * span. This includes the meta descriptor. + */ + uint16_t efmq_max_per_packet_tx_descs; + + /* + * The maximum number of descriptors a single Rx packet may span. + */ + uint16_t efmq_max_per_packet_rx_descs; +} enahw_feat_max_queue_t; + +/* + * Response to ENAHW_FEAT_MAX_QUEUES_EXT. + * + * common: ena_admin_queue_ext_feature_desc + */ +typedef struct enahw_feat_max_queue_ext { + uint8_t efmqe_version; + uint8_t efmqe_rsvd[3]; + + uint32_t efmqe_max_tx_sq_num; + uint32_t efmqe_max_tx_cq_num; + uint32_t efmqe_max_rx_sq_num; + uint32_t efmqe_max_rx_cq_num; + uint32_t efmqe_max_tx_sq_depth; + uint32_t efmqe_max_tx_cq_depth; + uint32_t efmqe_max_rx_sq_depth; + uint32_t efmqe_max_rx_cq_depth; + uint32_t efmqe_max_tx_header_size; + + /* + * The maximum number of descriptors a single Tx packet may + * span. This includes the meta descriptor. + */ + uint16_t efmqe_max_per_packet_tx_descs; + + /* + * The maximum number of descriptors a single Rx packet may span. + */ + uint16_t efmqe_max_per_packet_rx_descs; +} enahw_feat_max_queue_ext_t; + +/* + * Response to ENA_ADMIN_LINK_CONFIG. + * + * common: ena_admin_get_feature_link_desc + */ +typedef struct enahw_feat_link_conf { + /* Link speed in Mbit/s. */ + uint32_t eflc_speed; + + /* Bit field of enahw_link_speeds_t. */ + uint32_t eflc_supported; + + /* + * 31-2: reserved + * 1: duplex - Full Duplex + * 0: autoneg + */ + uint32_t eflc_flags; +} enahw_feat_link_conf_t; + +#define ENAHW_FEAT_LINK_CONF_AUTONEG_MASK BIT(0) +#define ENAHW_FEAT_LINK_CONF_DUPLEX_SHIFT 1 +#define ENAHW_FEAT_LINK_CONF_DUPLEX_MASK BIT(1) + +#define ENAHW_FEAT_LINK_CONF_AUTONEG(f) \ + ((f)->eflc_flags & ENAHW_FEAT_LINK_CONF_AUTONEG_MASK) + +#define ENAHW_FEAT_LINK_CONF_FULL_DUPLEX(f) \ + ((((f)->eflc_flags & ENAHW_FEAT_LINK_CONF_DUPLEX_MASK) >> \ + ENAHW_FEAT_LINK_CONF_DUPLEX_SHIFT) == 1) + +/* + * Response to ENAHW_FEAT_STATELESS_OFFLOAD_CONFIG. + * + * common: ena_admin_feature_offload_desc + */ +typedef struct enahw_feat_offload { + /* + * 0 : Tx IPv4 Header Checksum + * 1 : Tx L4/IPv4 Partial Checksum + * + * The L4 checksum field should be initialized with pseudo + * header checksum. + * + * 2 : Tx L4/IPv4 Checksum Full + * 3 : Tx L4/IPv6 Partial Checksum + * + * The L4 checksum field should be initialized with pseudo + * header checksum. + * + * 4 : Tx L4/IPv6 Checksum Full + * 5 : TCP/IPv4 LSO (aka TSO) + * 6 : TCP/IPv6 LSO (aka TSO) + * 7 : LSO ECN + */ + uint32_t efo_tx; + + /* + * Receive side supported stateless offload. + * + * 0 : Rx IPv4 Header Checksum + * 1 : Rx TCP/UDP + IPv4 Full Checksum + * 2 : Rx TCP/UDP + IPv6 Full Checksum + * 3 : Rx hash calculation + */ + uint32_t efo_rx_supported; + + /* Linux seems to only check rx_supported. 
*/ + uint32_t efo_rx_enabled; +} enahw_feat_offload_t; + +/* Feature Offloads */ +#define ENAHW_FEAT_OFFLOAD_TX_L3_IPV4_CSUM_MASK BIT(0) +#define ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_PART_SHIFT 1 +#define ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_PART_MASK BIT(1) +#define ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_FULL_SHIFT 2 +#define ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_FULL_MASK BIT(2) +#define ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_PART_SHIFT 3 +#define ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_PART_MASK BIT(3) +#define ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_FULL_SHIFT 4 +#define ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_FULL_MASK BIT(4) +#define ENAHW_FEAT_OFFLOAD_TSO_IPV4_SHIFT 5 +#define ENAHW_FEAT_OFFLOAD_TSO_IPV4_MASK BIT(5) +#define ENAHW_FEAT_OFFLOAD_TSO_IPV6_SHIFT 6 +#define ENAHW_FEAT_OFFLOAD_TSO_IPV6_MASK BIT(6) +#define ENAHW_FEAT_OFFLOAD_TSO_ECN_SHIFT 7 +#define ENAHW_FEAT_OFFLOAD_TSO_ECN_MASK BIT(7) +#define ENAHW_FEAT_OFFLOAD_RX_L3_IPV4_CSUM_MASK BIT(0) +#define ENAHW_FEAT_OFFLOAD_RX_L4_IPV4_CSUM_SHIFT 1 +#define ENAHW_FEAT_OFFLOAD_RX_L4_IPV4_CSUM_MASK BIT(1) +#define ENAHW_FEAT_OFFLOAD_RX_L4_IPV6_CSUM_SHIFT 2 +#define ENAHW_FEAT_OFFLOAD_RX_L4_IPV6_CSUM_MASK BIT(2) +#define ENAHW_FEAT_OFFLOAD_RX_HASH_SHIFT 3 +#define ENAHW_FEAT_OFFLOAD_RX_HASH_MASK BIT(3) + +#define ENAHW_FEAT_OFFLOAD_TX_L3_IPV4_CSUM(f) \ + (((f)->efo_tx & ENAHW_FEAT_OFFLOAD_TX_L3_IPV4_CSUM_MASK) != 0) + +#define ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_PART(f) \ + (((f)->efo_tx & ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_PART_MASK) != 0) + +#define ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_FULL(f) \ + (((f)->efo_tx & ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_FULL_MASK) != 0) + +#define ENAHW_FEAT_OFFLOAD_TSO_IPV4(f) \ + (((f)->efo_tx & ENAHW_FEAT_OFFLOAD_TSO_IPV4_MASK) != 0) + +#define ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_PART(f) \ + (((f)->efo_tx & ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_PART_MASK) != 0) + +#define ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_FULL(f) \ + (((f)->efo_tx & ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_FULL_MASK) != 0) + +#define ENAHW_FEAT_OFFLOAD_TSO_IPV6(f) \ + (((f)->efo_tx & ENAHW_FEAT_OFFLOAD_TSO_IPV6_MASK) != 0) + +#define ENAHW_FEAT_OFFLOAD_RX_L3_IPV4_CSUM(f) \ + (((f)->efo_rx_supported & ENAHW_FEAT_OFFLOAD_RX_L3_IPV4_CSUM_MASK) != 0) + +#define ENAHW_FEAT_OFFLOAD_RX_L4_IPV4_CSUM(f) \ + (((f)->efo_rx_supported & ENAHW_FEAT_OFFLOAD_RX_L4_IPV4_CSUM_MASK) != 0) + +#define ENAHW_FEAT_OFFLOAD_RX_L4_IPV6_CSUM(f) \ + (((f)->efo_rx_supported & ENAHW_FEAT_OFFLOAD_RX_L4_IPV6_CSUM_MASK) != 0) + +typedef union enahw_resp_get_feat { + uint32_t ergf_raw[14]; + enahw_feat_dev_attr_t ergf_dev_attr; + enahw_feat_max_queue_t ergf_max_queue; + enahw_feat_max_queue_ext_t ergf_max_queue_ext; + enahw_feat_aenq_t ergf_aenq; + enahw_feat_link_conf_t ergf_link_conf; + enahw_feat_offload_t ergf_offload; +} enahw_resp_get_feat_u; + +/* + * common: ena_admin_acq_create_cq_resp_desc + */ +typedef struct enahw_resp_create_cq { + /* + * The hardware's index for this queue. + */ + uint16_t ercq_idx; + + /* + * Apparently the number of descriptors granted may be + * different than that requested. 
+ */ + uint16_t ercq_actual_num_descs; + uint32_t ercq_numa_node_reg_offset; + uint32_t ercq_head_db_reg_offset; /* doorbell */ + uint32_t ercq_interrupt_mask_reg_offset; /* stop intr */ +} enahw_resp_create_cq_t; + +/* common: ena_admin_acq_create_sq_resp_desc */ +typedef struct enahw_resp_create_sq { + uint16_t ersq_idx; + uint16_t ersq_rsvdw1; + uint32_t ersq_db_reg_offset; + uint32_t ersq_llq_descs_reg_offset; + uint32_t ersq_llq_headers_reg_offset; +} enahw_resp_create_sq_t; + +/* common: ena_admin_basic_stats */ +typedef struct enahw_resp_basic_stats { + uint32_t erbs_tx_bytes_low; + uint32_t erbs_tx_bytes_high; + uint32_t erbs_tx_pkts_low; + uint32_t erbs_tx_pkts_high; + uint32_t erbs_rx_bytes_low; + uint32_t erbs_rx_bytes_high; + uint32_t erbs_rx_pkts_low; + uint32_t erbs_rx_pkts_high; + uint32_t erbs_rx_drops_low; + uint32_t erbs_rx_drops_high; + uint32_t erbs_tx_drops_low; + uint32_t erbs_tx_drops_high; +} enahw_resp_basic_stats_t; + +/* common: ena_admin_eni_stats */ +typedef struct enahw_resp_eni_stats { + /* + * The number of inbound packets dropped due to aggregate + * inbound bandwidth allowance being exceeded. + */ + uint64_t eres_bw_in_exceeded; + + /* + * The number of outbound packets dropped due to aggregated outbound + * bandwidth allowance being exceeded. + */ + uint64_t eres_bw_out_exceeded; + + /* + * The number of packets dropped due to the Packets Per Second + * allowance being exceeded. + */ + uint64_t eres_pps_exceeded; + + /* + * The number of packets dropped due to connection tracking + * allowance being exceeded and leading to failure in + * establishment of new connections. + */ + uint64_t eres_conns_exceeded; + + /* + * The number of packets dropped due to linklocal packet rate + * allowance being exceeded. + */ + uint64_t eres_linklocal_exceeded; +} enahw_resp_eni_stats_t; + +/* + * common: ena_admin_acq_entry + */ +typedef struct enahw_resp_desc { + /* The index of the completed command. */ + uint16_t erd_cmd_id; + + /* The status of the command (enahw_resp_status_t). */ + uint8_t erd_status; + + /* + * 7-1 Reserved + * 0 Phase + */ + uint8_t erd_flags; + + /* Extended status. */ + uint16_t erd_ext_status; + + /* + * The AQ entry (enahw_cmd_desc) index which has been consumed + * by the device and can be reused. However, this field is not + * used in the other drivers, and it seems to be redundant + * with the erd_idx field. + */ + uint16_t erd_sq_head_idx; + + union { + uint32_t raw[14]; + enahw_resp_get_feat_u erd_get_feat; + enahw_resp_create_cq_t erd_create_cq; + /* destroy_cq: No command-specific response. */ + enahw_resp_create_sq_t erd_create_sq; + /* destroy_sq: No command-specific response. */ + enahw_resp_basic_stats_t erd_basic_stats; + enahw_resp_eni_stats_t erd_eni_stats; + } erd_resp; +} enahw_resp_desc_t; + +/* common: ENA_ADMIN_ACQ_COMMON_DESC */ +#define ENAHW_RESP_CMD_ID_MASK GENMASK(11, 0) +#define ENAHW_RESP_PHASE_MASK 0x1 + +#define ENAHW_RESP_CMD_ID(desc) \ + (((desc)->erd_cmd_id) & ENAHW_RESP_CMD_ID_MASK) + +/* + * The response status of an Admin Queue command. + * + * common: ena_admin_aq_completion_status + */ +typedef enum enahw_resp_status { + ENAHW_RESP_SUCCESS = 0, + ENAHW_RESP_RESOURCE_ALLOCATION_FAILURE = 1, + ENAHW_RESP_BAD_OPCODE = 2, + ENAHW_RESP_UNSUPPORTED_OPCODE = 3, + ENAHW_RESP_MALFORMED_REQUEST = 4, + /* + * At this place in the common code it mentions that there is + * "additional status" in the reponse descriptor's + * erd_ext_status field. 
As the common code never actually + * uses this field it's hard to know the exact meaning of the + * comment. My best guess is the illegal parameter error + * stores additional context in the erd_ext_status field. But + * how to interpret that additional context is anyone's guess. + */ + ENAHW_RESP_ILLEGAL_PARAMETER = 5, + ENAHW_RESP_UNKNOWN_ERROR = 6, + ENAHW_RESP_RESOURCE_BUSY = 7, +} enahw_resp_status_t; + +/* + * Not really a device structure, more of a helper to debug register values. + */ +typedef struct enahw_reg_nv { + char *ern_name; + uint32_t ern_offset; + uint32_t ern_value; +} enahw_reg_nv_t; + +/* + * I/O macros and strcutures. + * ------------------------- + */ + +/* + * The device's L3 and L4 protocol numbers. These are specific to the + * ENA device and not to be confused with IANA protocol numbers. + * + * common: ena_eth_io_l3_proto_index + */ +typedef enum enahw_io_l3_proto { + ENAHW_IO_L3_PROTO_UNKNOWN = 0, + ENAHW_IO_L3_PROTO_IPV4 = 8, + ENAHW_IO_L3_PROTO_IPV6 = 11, + ENAHW_IO_L3_PROTO_FCOE = 21, + ENAHW_IO_L3_PROTO_ROCE = 22, +} enahw_io_l3_proto_t; + +/* common: ena_eth_io_l4_proto_index */ +typedef enum enahw_io_l4_proto { + ENAHW_IO_L4_PROTO_UNKNOWN = 0, + ENAHW_IO_L4_PROTO_TCP = 12, + ENAHW_IO_L4_PROTO_UDP = 13, + ENAHW_IO_L4_PROTO_ROUTEABLE_ROCE = 23, +} enahw_io_l4_proto_t; + +/* common: ena_eth_io_tx_desc */ +typedef struct enahw_tx_data_desc { + /* + * 15-0 Buffer Length (LENGTH) + * + * The buffer length in bytes. This should NOT include the + * Ethernet FCS bytes. + * + * 21-16 Request ID High Bits [15-10] (REQ_ID_HI) + * 22 Reserved Zero + * 23 Metadata Flag always zero (META_DESC) + * + * This flag indicates if the descriptor is a metadata + * descriptor or not. In this case we are defining the Tx + * descriptor, so it's always zero. + * + * 24 Phase bit (PHASE) + * 25 Reserved Zero + * 26 First Descriptor Bit (FIRST) + * + * Indicates this is the first descriptor for the frame. + * + * 27 Last Descriptor Bit (LAST) + * + * Indicates this is the last descriptor for the frame. + * + * 28 Completion Request Bit (COMP_REQ) + * + * Indicates if completion should be posted after the + * frame is transmitted. This bit is only valid on the + * first descriptor. + * + * 31-29 Reserved Zero + */ + uint32_t etd_len_ctrl; + + /* + * 3-0 L3 Protocol Number (L3_PROTO_IDX) + * + * The L3 protocol type, one of enahw_io_l3_proto_t. This + * field is required when L3_CSUM_EN or TSO_EN is set. + * + * 4 Don't Fragment Bit (DF) + * + * The value of IPv4 DF. This value must copy the value + * found in the packet's IPv4 header. + * + * 6-5 Reserved Zero + * 7 TSO Bit (TSO_EN) + * + * Enable TCP Segment Offload. + * + * 12-8 L4 Protocol Number (L4_PROTO_IDX) + * + * The L4 protocol type, one of enahw_io_l4_proto_t. This + * field is required when L4_CSUM_EN or TSO_EN are + * set. + * + * 13 L3 Checksum Offload (L3_CSUM_EN) + * + * Enable IPv4 header checksum offload. + * + * 14 L4 Checksum Offload (L4_CSUM_EN) + * + * Enable TCP/UDP checksum offload. + * + * 15 Ethernet FCS Disable (ETHERNET_FCS_DIS) + * + * Disable the device's Ethernet Frame Check sequence. + * + * 16 Reserved Zero + * 17 L4 Partial Checksum Present (L4_CSUM_PARTIAL) + * + * When set it indicates the host has already provided + * the pseudo-header checksum. Otherwise, it is up to the + * device to calculate it. + * + * When set and using TSO the host stack must remember + * not to include the TCP segment length in the supplied + * pseudo-header. 
+ *
+ * The host stack should provide the pseudo-header
+ * checksum when using IPv6 with Routing Headers.
+ *
+ * 21-18 Reserved Zero
+ * 31-22 Request ID Low [9-0] (REQ_ID_LO)
+ */
+ uint32_t etd_meta_ctrl;
+
+ /* The low 32 bits of the buffer address. */
+ uint32_t etd_buff_addr_lo;
+
+ /*
+ * address high and header size
+ *
+ * 15-0 Buffer Address High [47-32] (ADDR_HI)
+ *
+ * The upper 16 bits of the buffer address.
+ *
+ * 23-16 Reserved Zero
+ * 31-24 Header Length (HEADER_LENGTH)
+ *
+ * This field has dubious documentation in the
+ * common/Linux driver code, even contradicting itself in
+ * the same sentence. Here's what it says, verbatim:
+ *
+ * > Header length. For Low Latency Queues, this fields
+ * > indicates the number of bytes written to the
+ * > headers' memory. For normal queues, if packet is TCP
+ * > or UDP, and longer than max_header_size, then this
+ * > field should be set to the sum of L4 header offset
+ * > and L4 header size(without options), otherwise, this
+ * > field should be set to 0. For both modes, this field
+ * > must not exceed the max_header_size. max_header_size
+ * > value is reported by the Max Queues Feature
+ * > descriptor
+ *
+ * Here's what one _might_ ascertain from the above.
+ *
+ * 1. This field should always be set in the case of
+ * LLQs/device placement.
+ *
+ * 2. This field must _never_ exceed the max header size
+ * as reported by feature detection. In our code this
+ * would be efmq_max_header_size for older ENA devices
+ * and efmqe_max_tx_header_size for newer ones. One
+ * empirical data point from a t3.small (with newer
+ * device) is a max Tx header size of 128 bytes.
+ *
+ * 3. If the packet is TCP or UDP, and the packet (or the
+ * headers?) is longer than the max header size, then
+ * this field should be set to the total header size
+ * with the exception of TCP header options.
+ * Otherwise, if the packet is not TCP or UDP, or if
+ * the packet (or header length?) _does not_ exceed
+ * the max header size, then set this value to 0.
+ *
+ * One might think, based on (3), that when the header
+ * size exceeds the max this field needs to be set, but
+ * that contradicts (2), which dictates that the total
+ * header size can never exceed the max. Sure enough, the
+ * Linux code drops all packets with headers that exceed
+ * the max. So in that case it would mean that "and
+ * longer than max_header_size" is referring to the total
+ * packet length. So for most workloads, the TCP/UDP
+ * packets should have this field set, to indicate their
+ * header length. This matches with Linux, which seems to
+ * set header length regardless of IP protocol.
+ *
+ * However, the FreeBSD code tells a different story. In
+ * its non-LLQ Tx path it has the following comment,
+ * verbatim:
+ *
+ * > header_len is just a hint for the device. Because
+ * > FreeBSD is not giving us information about packet
+ * > header length and it is not guaranteed that all
+ * > packet headers will be in the 1st mbuf, setting
+ * > header_len to 0 is making the device ignore this
+ * > value and resolve header on it's own.
+ *
+ * According to this we can just set the value to zero
+ * and let the device figure it out. This maps better to
+ * illumos, where we also allow the header to potentially
+ * span multiple mblks (though we do have access to the
+ * header sizes via mac_ether_offload_info_t).
+ * + * The upshot: for now we take advantage of the device's + * ability to determine the header length on its own, at + * the potential cost of some performance (not measured). + */ + uint32_t etd_buff_addr_hi_hdr_sz; +} enahw_tx_data_desc_t; + +#define ENAHW_TX_DESC_LENGTH_MASK GENMASK(15, 0) +#define ENAHW_TX_DESC_REQ_ID_HI_SHIFT 16 +#define ENAHW_TX_DESC_REQ_ID_HI_MASK GENMASK(21, 16) +#define ENAHW_TX_DESC_META_DESC_SHIFT 23 +#define ENAHW_TX_DESC_META_DESC_MASK BIT(23) +#define ENAHW_TX_DESC_PHASE_SHIFT 24 +#define ENAHW_TX_DESC_PHASE_MASK BIT(24) +#define ENAHW_TX_DESC_FIRST_SHIFT 26 +#define ENAHW_TX_DESC_FIRST_MASK BIT(26) +#define ENAHW_TX_DESC_LAST_SHIFT 27 +#define ENAHW_TX_DESC_LAST_MASK BIT(27) +#define ENAHW_TX_DESC_COMP_REQ_SHIFT 28 +#define ENAHW_TX_DESC_COMP_REQ_MASK BIT(28) +#define ENAHW_TX_DESC_L3_PROTO_IDX_MASK GENMASK(3, 0) +#define ENAHW_TX_DESC_DF_SHIFT 4 +#define ENAHW_TX_DESC_DF_MASK BIT(4) +#define ENAHW_TX_DESC_TSO_EN_SHIFT 7 +#define ENAHW_TX_DESC_TSO_EN_MASK BIT(7) +#define ENAHW_TX_DESC_L4_PROTO_IDX_SHIFT 8 +#define ENAHW_TX_DESC_L4_PROTO_IDX_MASK GENMASK(12, 8) +#define ENAHW_TX_DESC_L3_CSUM_EN_SHIFT 13 +#define ENAHW_TX_DESC_L3_CSUM_EN_MASK BIT(13) +#define ENAHW_TX_DESC_L4_CSUM_EN_SHIFT 14 +#define ENAHW_TX_DESC_L4_CSUM_EN_MASK BIT(14) +#define ENAHW_TX_DESC_ETHERNET_FCS_DIS_SHIFT 15 +#define ENAHW_TX_DESC_ETHERNET_FCS_DIS_MASK BIT(15) +#define ENAHW_TX_DESC_L4_CSUM_PARTIAL_SHIFT 17 +#define ENAHW_TX_DESC_L4_CSUM_PARTIAL_MASK BIT(17) +#define ENAHW_TX_DESC_REQ_ID_LO_SHIFT 22 +#define ENAHW_TX_DESC_REQ_ID_LO_MASK GENMASK(31, 22) +#define ENAHW_TX_DESC_ADDR_HI_MASK GENMASK(15, 0) +#define ENAHW_TX_DESC_HEADER_LENGTH_SHIFT 24 +#define ENAHW_TX_DESC_HEADER_LENGTH_MASK GENMASK(31, 24) + +#define ENAHW_TX_DESC_LENGTH(desc, len) \ + (((desc)->etd_len_ctrl) |= ((len) & ENAHW_TX_DESC_LENGTH_MASK)) + +#define ENAHW_TX_DESC_FIRST_ON(desc) \ + (((desc)->etd_len_ctrl) |= ENAHW_TX_DESC_FIRST_MASK) + +#define ENAHW_TX_DESC_FIRST_OFF(desc) \ + (((desc)->etd_len_ctrl) &= ~ENAHW_TX_DESC_FIRST_MASK) + +#define ENAHW_TX_DESC_REQID_HI(desc, reqid) \ + (((desc)->etd_len_ctrl) |= \ + ((((reqid) >> 10) << ENAHW_TX_DESC_REQ_ID_HI_SHIFT) & \ + ENAHW_TX_DESC_REQ_ID_HI_MASK)) + +#define ENAHW_TX_DESC_REQID_LO(desc, reqid) \ + (((desc)->etd_meta_ctrl) |= \ + (((reqid) << ENAHW_TX_DESC_REQ_ID_LO_SHIFT) & \ + ENAHW_TX_DESC_REQ_ID_LO_MASK)) + +#define ENAHW_TX_DESC_PHASE(desc, phase) \ + (((desc)->etd_len_ctrl) |= (((phase) << ENAHW_TX_DESC_PHASE_SHIFT) & \ + ENAHW_TX_DESC_PHASE_MASK)) + +#define ENAHW_TX_DESC_LAST_ON(desc) \ + (((desc)->etd_len_ctrl) |= ENAHW_TX_DESC_LAST_MASK) + +#define ENAHW_TX_DESC_LAST_OFF(desc) \ + (((desc)->etd_len_ctrl) &= ~ENAHW_TX_DESC_LAST_MASK) + +#define ENAHW_TX_DESC_COMP_REQ_ON(desc) \ + (((desc)->etd_len_ctrl) |= ENAHW_TX_DESC_COMP_REQ_MASK) + +#define ENAHW_TX_DESC_COMP_REQ_OFF(desc) \ + (((desc)->etd_len_ctrl) &= ~ENAHW_TX_DESC_COMP_REQ_MASK) + +#define ENAHW_TX_DESC_META_DESC_ON(desc) \ + (((desc)->etd_len_ctrl) |= ENAHW_TX_DESC_META_DESC_MASK) + +#define ENAHW_TX_DESC_META_DESC_OFF(desc) \ + (((desc)->etd_len_ctrl) &= ~ENAHW_TX_DESC_META_DESC_MASK) + +#define ENAHW_TX_DESC_ADDR_LO(desc, addr) \ + (((desc)->etd_buff_addr_lo) = (addr)) + +#define ENAHW_TX_DESC_ADDR_HI(desc, addr) \ + (((desc)->etd_buff_addr_hi_hdr_sz) |= \ + (((addr) >> 32) & ENAHW_TX_DESC_ADDR_HI_MASK)) + +#define ENAHW_TX_DESC_HEADER_LENGTH(desc, len) \ + (((desc)->etd_buff_addr_hi_hdr_sz) |= \ + (((len) << ENAHW_TX_DESC_HEADER_LENGTH_SHIFT) & \ + ENAHW_TX_DESC_HEADER_LENGTH_MASK)) 
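An illustrative aside on the REQ_ID fields: the 16-bit Tx request ID is split across two different words of the data descriptor, which is easy to misread. The sketch below is not part of the driver source above; the helper name is made up for illustration, and it assumes GENMASK(h, l) expands to the usual contiguous bit mask, so that ENAHW_TX_DESC_REQ_ID_HI_MASK is 0x003f0000 and ENAHW_TX_DESC_REQ_ID_LO_MASK is 0xffc00000.

	#include <stdint.h>

	/*
	 * Mirrors ENAHW_TX_DESC_REQID_HI/LO above: bits [9:0] of the
	 * request ID land in etd_meta_ctrl[31:22], and bits [15:10]
	 * land in etd_len_ctrl[21:16].
	 */
	static void
	pack_req_id(uint32_t *len_ctrl, uint32_t *meta_ctrl, uint16_t reqid)
	{
		*len_ctrl |= (((uint32_t)reqid >> 10) << 16) & 0x003f0000;
		*meta_ctrl |= ((uint32_t)reqid << 22) & 0xffc00000;
	}

No reassembly is needed on the completion side: the device hands the ID back as a single 16-bit value in the Tx completion descriptor's etc_req_id field (see enahw_tx_cdesc below), which is how the Tx completion path looks up the matching control block.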
+
+#define ENAHW_TX_DESC_DF_ON(desc) \
+ ((desc)->etd_meta_ctrl |= ENAHW_TX_DESC_DF_MASK)
+
+#define ENAHW_TX_DESC_TSO_OFF(desc) \
+ (((desc)->etd_meta_ctrl) &= ~ENAHW_TX_DESC_TSO_EN_MASK)
+
+#define ENAHW_TX_DESC_L3_CSUM_OFF(desc) \
+ (((desc)->etd_meta_ctrl) &= ~ENAHW_TX_DESC_L3_CSUM_EN_MASK)
+
+#define ENAHW_TX_DESC_L4_CSUM_OFF(desc) \
+ (((desc)->etd_meta_ctrl) &= ~ENAHW_TX_DESC_L4_CSUM_EN_MASK)
+
+#define ENAHW_TX_DESC_L4_CSUM_PARTIAL_ON(desc) \
+ (((desc)->etd_meta_ctrl) |= ENAHW_TX_DESC_L4_CSUM_PARTIAL_MASK)
+
+/* common: ena_eth_io_tx_meta_desc */
+typedef struct enahw_tx_meta_desc {
+ /*
+ * 9-0 Request ID Low [9-0] (REQ_ID_LO)
+ * 13-10 Reserved Zero
+ * 14 Extended Metadata Valid (EXT_VALID)
+ *
+ * When set this descriptor contains valid extended
+ * metadata. The extended metadata includes the L3/L4
+ * length and offset fields as well as the MSS bits. This
+ * is needed for TSO.
+ *
+ * 15 Reserved Zero
+ * 19-16 MSS High Bits (MSS_HI)
+ * 20 Meta Type (ETH_META_TYPE)
+ *
+ * If enabled this is an extended metadata descriptor.
+ * This seems redundant with EXT_VALID.
+ *
+ * 21 Meta Store (META_STORE)
+ *
+ * Store the extended metadata in the queue cache.
+ *
+ * 22 Reserved Zero
+ * 23 Metadata Flag (META_DESC) -- always one
+ * 24 Phase (PHASE)
+ * 25 Reserved Zero
+ * 26 First Descriptor Bit (FIRST)
+ * 27 Last Descriptor Bit (LAST)
+ * 28 Completion Request Bit (COMP_REQ)
+ * 31-29 Reserved Zero
+ */
+ uint32_t etmd_len_ctrl;
+
+ /*
+ * 5-0 Request ID High Bits [15-10] (REQ_ID_HI)
+ * 31-6 Reserved Zero
+ */
+ uint32_t etmd_word1;
+
+ /*
+ * 7-0 L3 Header Length (L3_HDR_LEN)
+ * 15-8 L3 Header Offset (L3_HDR_OFF)
+ * 21-16 L4 Header Length in Words (L4_HDR_LEN_IN_WORDS)
+ *
+ * Specifies the L4 header length in words. The device
+ * assumes the L4 header follows directly after the L3
+ * header and that the L4 offset is equal to L3_HDR_OFF +
+ * L3_HDR_LEN.
+ *
+ * 31-22 MSS Low Bits (MSS_LO)
+ */
+ uint32_t etmd_word2;
+ uint32_t etmd_reserved;
+} enahw_tx_meta_desc_t;
+
+/* common: N/A */
+typedef union enahw_tx_desc {
+ enahw_tx_data_desc_t etd_data;
+ enahw_tx_meta_desc_t etd_meta;
+} enahw_tx_desc_t;
+
+/* common: ena_eth_io_tx_cdesc */
+typedef struct enahw_tx_cdesc {
+ /*
+ * 15-0 Request ID Bits
+ * 16 Reserved Zero
+ */
+ uint16_t etc_req_id;
+
+ /*
+ * Presumably the status of the Tx, though the Linux driver
+ * never checks this field.
+ */
+ uint8_t etc_status;
+
+ /*
+ * 0 Phase
+ * 7-1 Reserved Zero
+ */
+ uint8_t etc_flags;
+
+ /*
+ * This isn't documented or used in the Linux driver, but
+ * these probably store the submission queue ID and the
+ * submission queue head index.
+ */
+ uint16_t etc_sub_qid;
+ uint16_t etc_sq_head_idx;
+} enahw_tx_cdesc_t;
+
+#define ENAHW_TX_CDESC_PHASE_SHIFT 0
+#define ENAHW_TX_CDESC_PHASE_MASK BIT(0)
+
+#define ENAHW_TX_CDESC_GET_PHASE(cdesc) \
+ ((cdesc)->etc_flags & ENAHW_TX_CDESC_PHASE_MASK)
+
+/* common: ena_eth_io_rx_desc */
+typedef struct enahw_rx_desc {
+ /*
+ * The length of the buffer provided by the host, in bytes.
+ * Use the value of 0 to indicate 64K.
+ */
+ uint16_t erd_length;
+ uint8_t erd_reserved1;
+
+ /*
+ * 0 Phase (PHASE)
+ * 1 Reserved Zero
+ * 2 First (FIRST)
+ *
+ * Indicates this is the first descriptor for the frame.
+ *
+ * 3 Last (LAST)
+ *
+ * Indicates this is the last descriptor for the frame.
+ *
+ * 4 Completion Request (COMP_REQ)
+ *
+ * Indicates that a completion request should be generated
+ * for this descriptor.
+ * + * 7-5 Reserved Zero + */ + uint8_t erd_ctrl; + + /* + * 15-0 Request ID + * 16 Reserved 0 + */ + uint16_t erd_req_id; + uint16_t erd_reserved2; + + /* The physical address of the buffer provided by the host. */ + uint32_t erd_buff_addr_lo; + uint16_t erd_buff_addr_hi; + uint16_t erd_reserved3; +} enahw_rx_desc_t; + +#define ENAHW_RX_DESC_PHASE_MASK BIT(0) +#define ENAHW_RX_DESC_FIRST_SHIFT 2 +#define ENAHW_RX_DESC_FIRST_MASK BIT(2) +#define ENAHW_RX_DESC_LAST_SHIFT 3 +#define ENAHW_RX_DESC_LAST_MASK BIT(3) +#define ENAHW_RX_DESC_COMP_REQ_SHIFT 4 +#define ENAHW_RX_DESC_COMP_REQ_MASK BIT(4) + +#define ENAHW_RX_DESC_SET_PHASE(desc, val) \ + ((desc)->erd_ctrl |= ((val) & ENAHW_RX_DESC_PHASE_MASK)) + +#define ENAHW_RX_DESC_SET_FIRST(desc) \ + ((desc)->erd_ctrl |= ENAHW_RX_DESC_FIRST_MASK) + +#define ENAHW_RX_DESC_SET_LAST(desc) \ + ((desc)->erd_ctrl |= ENAHW_RX_DESC_LAST_MASK) + +#define ENAHW_RX_DESC_SET_COMP_REQ(desc) \ + ((desc)->erd_ctrl |= ENAHW_RX_DESC_COMP_REQ_MASK) + +/* + * Ethernet parsing information is only valid when last == 1. + * + * common: ena_eth_io_rx_cdesc_base + */ +typedef struct enahw_rx_cdesc { + /* + * 4-0 L3 Protocol Number (L3_PROTO) + * + * The L3 protocol type, one of enahw_io_l3_proto_t. + * + * 6-5 (SRC_VLAN_CNT) + * 7 Reserved Zero + * 12-8 L4 Protocol Number (L4_PROTO) + * 13 L3 Checksum Error (L3_CSUM_ERR) + * + * When set either the L3 checksum failed to match or the + * controller didn't attempt to validate the checksum. + * This bit is valid only when L3_PROTO indicates an IPv4 + * packet. + * + * 14 L4 Checksum Error (L4_CSUM_ERR) + * + * When set either the L4 checksum failed to match or the + * controller didn't attempt to validate the checksum. + * This bit is valid only when L4_PROTO indicates a + * TCP/UDP packet, IPV4_FRAG is not set, and + * L4_CSUM_CHECKED is set. + * + * 15 IPv4 Fragmented (IPV4_FRAG) + * 16 L4 Checksum Validated (L4_CSUM_CHECKED) + * + * When set it indicates the device attempted to validate + * the L4 checksum. + * + * 23-17 Reserved Zero + * 24 Phase (PHASE) + * 25 (L3_CSUM2) + * + * According to the Linux source this is the "second + * checksum engine result". It's never checked. + * + * 26 First Descriptor Bit (FIRST) + * + * Indicates the first descriptor for the frame. + * + * 27 Last Descriptor Bit (LAST) + * + * Indicates the last descriptor for the frame. + * + * 29-28 Reserved Zero + * 30 Buffer Type (BUFFER) + * + * When enabled indicates this is a data descriptor. + * Otherwse, it is a metadata descriptor. + * + * 31 : reserved31 + */ + uint32_t erc_status; + uint16_t erc_length; + uint16_t erc_req_id; + + /* 32-bit hash result */ + uint32_t erc_hash; + uint16_t erc_sub_qid; + + /* + * The device may choose to offset the start of the header + * data (which implies this value only applies to the first + * descriptor). When and why the device does this is not + * documented in the common code. The most likely case would + * be for IP header alignment. 
+ */ + uint8_t erc_offset; + uint8_t erc_reserved; +} enahw_rx_cdesc_t; + +#define ENAHW_RX_CDESC_L3_PROTO_MASK GENMASK(4, 0) +#define ENAHW_RX_CDESC_SRC_VLAN_CNT_SHIFT 5 +#define ENAHW_RX_CDESC_SRC_VLAN_CNT_MASK GENMASK(6, 5) +#define ENAHW_RX_CDESC_L4_PROTO_SHIFT 8 +#define ENAHW_RX_CDESC_L4_PROTO_MASK GENMASK(12, 8) +#define ENAHW_RX_CDESC_L3_CSUM_ERR_SHIFT 13 +#define ENAHW_RX_CDESC_L3_CSUM_ERR_MASK BIT(13) +#define ENAHW_RX_CDESC_L4_CSUM_ERR_SHIFT 14 +#define ENAHW_RX_CDESC_L4_CSUM_ERR_MASK BIT(14) +#define ENAHW_RX_CDESC_IPV4_FRAG_SHIFT 15 +#define ENAHW_RX_CDESC_IPV4_FRAG_MASK BIT(15) +#define ENAHW_RX_CDESC_L4_CSUM_CHECKED_SHIFT 16 +#define ENAHW_RX_CDESC_L4_CSUM_CHECKED_MASK BIT(16) +#define ENAHW_RX_CDESC_PHASE_SHIFT 24 +#define ENAHW_RX_CDESC_PHASE_MASK BIT(24) +#define ENAHW_RX_CDESC_L3_CSUM2_SHIFT 25 +#define ENAHW_RX_CDESC_L3_CSUM2_MASK BIT(25) +#define ENAHW_RX_CDESC_FIRST_SHIFT 26 +#define ENAHW_RX_CDESC_FIRST_MASK BIT(26) +#define ENAHW_RX_CDESC_LAST_SHIFT 27 +#define ENAHW_RX_CDESC_LAST_MASK BIT(27) +#define ENAHW_RX_CDESC_BUFFER_SHIFT 30 +#define ENAHW_RX_CDESC_BUFFER_MASK BIT(30) + +#define ENAHW_RX_CDESC_L3_PROTO(desc) \ + ((desc)->erc_status & ENAHW_RX_CDESC_L3_PROTO_MASK) + +#define ENAHW_RX_CDESC_L3_CSUM_ERR(desc) \ + ((((desc)->erc_status & ENAHW_RX_CDESC_L3_CSUM_ERR_MASK) >> \ + ENAHW_RX_CDESC_L3_CSUM_ERR_SHIFT) != 0) + +#define ENAHW_RX_CDESC_L4_PROTO(desc) \ + (((desc)->erc_status & ENAHW_RX_CDESC_L4_PROTO_MASK) >> \ + ENAHW_RX_CDESC_L4_PROTO_SHIFT) + +#define ENAHW_RX_CDESC_L4_CSUM_CHECKED(desc) \ + ((((desc)->erc_status & ENAHW_RX_CDESC_L4_CSUM_CHECKED_MASK) >> \ + ENAHW_RX_CDESC_L4_CSUM_CHECKED_SHIFT) != 0) + +#define ENAHW_RX_CDESC_L4_CSUM_ERR(desc) \ + ((((desc)->erc_status & ENAHW_RX_CDESC_L4_CSUM_ERR_MASK) >> \ + ENAHW_RX_CDESC_L4_CSUM_ERR_SHIFT) != 0) + +#define ENAHW_RX_CDESC_PHASE(desc) \ + (((desc)->erc_status & ENAHW_RX_CDESC_PHASE_MASK) >> \ + ENAHW_RX_CDESC_PHASE_SHIFT) + +#define ENAHW_RX_CDESC_FIRST(desc) \ + ((((desc)->erc_status & ENAHW_RX_CDESC_FIRST_MASK) >> \ + ENAHW_RX_CDESC_FIRST_SHIFT) == 1) + +#define ENAHW_RX_CDESC_LAST(desc) \ + ((((desc)->erc_status & ENAHW_RX_CDESC_LAST_MASK) >> \ + ENAHW_RX_CDESC_LAST_SHIFT) == 1) + +/* + * Controls for the interrupt register mapped to each Rx/Tx CQ. + */ +#define ENAHW_REG_INTR_RX_DELAY_MASK GENMASK(14, 0) +#define ENAHW_REG_INTR_TX_DELAY_SHIFT 15 +#define ENAHW_REG_INTR_TX_DELAY_MASK GENMASK(29, 15) +#define ENAHW_REG_INTR_UNMASK_SHIFT 30 +#define ENAHW_REG_INTR_UNMASK_MASK BIT(30) + +#define ENAHW_REG_INTR_UNMASK(val) \ + ((val) |= ENAHW_REG_INTR_UNMASK_MASK) + +#define ENAHW_REG_INTR_MASK(val) \ + ((val) &= ~ENAHW_REG_INTR_UNMASK_MASK) + +#endif /* _ENA_HW_H */ diff --git a/usr/src/uts/common/io/ena/ena_intr.c b/usr/src/uts/common/io/ena/ena_intr.c new file mode 100644 index 0000000000..2650609cfa --- /dev/null +++ b/usr/src/uts/common/io/ena/ena_intr.c @@ -0,0 +1,175 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2021 Oxide Computer Company + */ +#include "ena.h" + +/* + * We currently limit the number of Tx/Rx queues to the number of + * available interrupts (minus one for the admin queue). 
+ */ +static uint_t +ena_io_intr(caddr_t arg1, caddr_t arg2) +{ + ena_t *ena = (ena_t *)arg1; + uint16_t vector = (uintptr_t)(void *)arg2; + ASSERT3U(vector, >, 0); + ASSERT3U(vector, <, ena->ena_num_intrs); + ena_txq_t *txq = &ena->ena_txqs[vector - 1]; + ena_rxq_t *rxq = &ena->ena_rxqs[vector - 1]; + uint32_t intr_ctrl; + + ASSERT3P(txq, !=, NULL); + ASSERT3P(rxq, !=, NULL); + ena_tx_intr_work(txq); + ena_rx_intr_work(rxq); + + /* + * The Rx/Tx queue share the same interrupt, only need to + * unmask interrupts for one of them. + */ + intr_ctrl = ena_hw_abs_read32(ena, txq->et_cq_unmask_addr); + ENAHW_REG_INTR_UNMASK(intr_ctrl); + ena_hw_abs_write32(ena, txq->et_cq_unmask_addr, intr_ctrl); + return (DDI_INTR_CLAIMED); +} + +static uint_t +ena_admin_intr(caddr_t arg1, caddr_t arg2) +{ + ena_t *ena = (ena_t *)arg1; + + ena_aenq_work(ena); + return (DDI_INTR_CLAIMED); +} + +void +ena_intr_remove_handlers(ena_t *ena) +{ + for (int i = 0; i < ena->ena_num_intrs; i++) { + int ret = ddi_intr_remove_handler(ena->ena_intr_handles[i]); + + /* Nothing we can really do except log. */ + if (ret != DDI_SUCCESS) { + ena_err(ena, "failed to remove interrupt handler for " + "vector %d: %d", i, ret); + } + } +} + +/* + * The ena driver uses separate interrupt handlers for the admin queue + * and I/O queues. + */ +boolean_t +ena_intr_add_handlers(ena_t *ena) +{ + ASSERT3S(ena->ena_num_intrs, >=, 2); + if (ddi_intr_add_handler(ena->ena_intr_handles[0], ena_admin_intr, ena, + (void *)(uintptr_t)0) != DDI_SUCCESS) { + ena_err(ena, "failed to add admin interrupt handler"); + return (B_FALSE); + } + + for (int i = 1; i < ena->ena_num_intrs; i++) { + caddr_t vector = (void *)(uintptr_t)(i); + int ret = ddi_intr_add_handler(ena->ena_intr_handles[i], + ena_io_intr, ena, vector); + + if (ret != DDI_SUCCESS) { + ena_err(ena, "failed to add I/O interrupt handler " + "for vector %u", i); + + /* + * If we fail to add any I/O handler, then all + * successfully added handlers are removed, + * including the admin handler. For example, + * when i=2 we remove handler 1 (the first I/O + * handler), and when i=1 we remove handler 0 + * (the admin handler). + */ + while (i >= 1) { + i--; + (void) ddi_intr_remove_handler( + ena->ena_intr_handles[i]); + } + + return (B_FALSE); + } + } + + return (B_TRUE); +} + +boolean_t +ena_intrs_disable(ena_t *ena) +{ + int ret; + + if (ena->ena_intr_caps & DDI_INTR_FLAG_BLOCK) { + if ((ret = ddi_intr_block_disable(ena->ena_intr_handles, + ena->ena_num_intrs)) != DDI_SUCCESS) { + ena_err(ena, "failed to block disable interrupts: %d", + ret); + return (B_FALSE); + } + } else { + for (int i = 0; i < ena->ena_num_intrs; i++) { + ret = ddi_intr_disable(ena->ena_intr_handles[i]); + if (ret != DDI_SUCCESS) { + ena_err(ena, "failed to disable interrupt " + "%d: %d", i, ret); + return (B_FALSE); + } + } + } + + return (B_TRUE); +} + +boolean_t +ena_intrs_enable(ena_t *ena) +{ + int ret; + + if (ena->ena_intr_caps & DDI_INTR_FLAG_BLOCK) { + if ((ret = ddi_intr_block_enable(ena->ena_intr_handles, + ena->ena_num_intrs)) != DDI_SUCCESS) { + ena_err(ena, "failed to block enable interrupts: %d", + ret); + return (B_FALSE); + } + } else { + for (int i = 0; i < ena->ena_num_intrs; i++) { + if ((ret = ddi_intr_enable(ena->ena_intr_handles[i])) != + DDI_SUCCESS) { + ena_err(ena, "failed to enable interrupt " + "%d: %d", i, ret); + + /* + * If we fail to enable any interrupt, + * then all interrupts are disabled. 
+ */ + while (i >= 1) { + i--; + (void) ddi_intr_disable( + ena->ena_intr_handles[i]); + } + + return (B_FALSE); + } + } + } + + return (B_TRUE); +} diff --git a/usr/src/uts/common/io/ena/ena_rx.c b/usr/src/uts/common/io/ena/ena_rx.c new file mode 100644 index 0000000000..7f0b7db94a --- /dev/null +++ b/usr/src/uts/common/io/ena/ena_rx.c @@ -0,0 +1,531 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2021 Oxide Computer Company + */ +#include "ena.h" + +static void +ena_refill_rx(ena_rxq_t *rxq, uint16_t num) +{ + VERIFY3P(rxq, !=, NULL); + ASSERT(MUTEX_HELD(&rxq->er_lock)); + ASSERT3U(num, <=, rxq->er_sq_num_descs); + uint16_t tail_mod = rxq->er_sq_tail_idx & (rxq->er_sq_num_descs - 1); + + while (num != 0) { + enahw_rx_desc_t *desc = &rxq->er_sq_descs[tail_mod]; + ena_rx_ctrl_block_t *rcb = &rxq->er_rcbs[tail_mod]; + uint16_t phase = rxq->er_sq_phase; + + VERIFY3U(tail_mod, <, rxq->er_sq_num_descs); + VERIFY3P(desc, !=, NULL); + VERIFY3P(rcb, !=, NULL); + VERIFY3P(desc, >=, rxq->er_sq_descs); + VERIFY3P(desc, <=, + (rxq->er_sq_descs + rxq->er_sq_num_descs - 1)); + + desc->erd_length = rcb->ercb_dma.edb_len; + desc->erd_req_id = tail_mod; + VERIFY3P(rcb->ercb_dma.edb_cookie, !=, NULL); + ena_set_dma_addr_values(rxq->er_ena, + rcb->ercb_dma.edb_cookie->dmac_laddress, + &desc->erd_buff_addr_lo, &desc->erd_buff_addr_hi); + ENAHW_RX_DESC_SET_PHASE(desc, phase); + ENAHW_RX_DESC_SET_FIRST(desc); + ENAHW_RX_DESC_SET_LAST(desc); + ENAHW_RX_DESC_SET_COMP_REQ(desc); + DTRACE_PROBE1(ena__refill__rx, enahw_rx_desc_t *, desc); + rxq->er_sq_tail_idx++; + tail_mod = rxq->er_sq_tail_idx & (rxq->er_sq_num_descs - 1); + + if (tail_mod == 0) { + rxq->er_sq_phase = !rxq->er_sq_phase; + } + + num--; + } + + ENA_DMA_SYNC(rxq->er_sq_dma, DDI_DMA_SYNC_FORDEV); + ena_hw_abs_write32(rxq->er_ena, rxq->er_sq_db_addr, + rxq->er_sq_tail_idx); +} + +void +ena_free_rx_dma(ena_rxq_t *rxq) +{ + if (rxq->er_rcbs != NULL) { + for (uint_t i = 0; i < rxq->er_sq_num_descs; i++) { + ena_rx_ctrl_block_t *rcb = &rxq->er_rcbs[i]; + ena_dma_free(&rcb->ercb_dma); + } + + kmem_free(rxq->er_rcbs, + sizeof (*rxq->er_rcbs) * rxq->er_sq_num_descs); + + rxq->er_rcbs = NULL; + } + + ena_dma_free(&rxq->er_cq_dma); + rxq->er_cq_descs = NULL; + rxq->er_cq_num_descs = 0; + + ena_dma_free(&rxq->er_sq_dma); + rxq->er_sq_descs = NULL; + rxq->er_sq_num_descs = 0; + + rxq->er_state &= ~ENA_RXQ_STATE_HOST_ALLOC; +} + +static int +ena_alloc_rx_dma(ena_rxq_t *rxq) +{ + ena_t *ena = rxq->er_ena; + size_t cq_descs_sz; + size_t sq_descs_sz; + ena_dma_conf_t conf; + int err = 0; + + cq_descs_sz = rxq->er_cq_num_descs * sizeof (*rxq->er_cq_descs); + sq_descs_sz = rxq->er_sq_num_descs * sizeof (*rxq->er_sq_descs); + conf = (ena_dma_conf_t) { + .edc_size = sq_descs_sz, + .edc_align = ENAHW_IO_SQ_DESC_BUF_ALIGNMENT, + .edc_sgl = 1, + .edc_endian = DDI_NEVERSWAP_ACC, + .edc_stream = B_FALSE, + }; + + if (!ena_dma_alloc(ena, &rxq->er_sq_dma, &conf, sq_descs_sz)) { + return (ENOMEM); + } + + rxq->er_sq_descs = (void *)rxq->er_sq_dma.edb_va; + rxq->er_rcbs = kmem_zalloc(sizeof (*rxq->er_rcbs) * + rxq->er_sq_num_descs, KM_SLEEP); + + for (uint_t i = 0; i < 
rxq->er_sq_num_descs; i++) { + ena_rx_ctrl_block_t *rcb = &rxq->er_rcbs[i]; + ena_dma_conf_t buf_conf = { + .edc_size = ena->ena_rx_buf_sz, + .edc_align = 1, + .edc_sgl = ena->ena_rx_sgl_max_sz, + .edc_endian = DDI_NEVERSWAP_ACC, + .edc_stream = B_TRUE, + }; + + if (!ena_dma_alloc(ena, &rcb->ercb_dma, &buf_conf, + ena->ena_rx_buf_sz)) { + err = ENOMEM; + goto error; + } + } + + conf = (ena_dma_conf_t) { + .edc_size = cq_descs_sz, + .edc_align = ENAHW_IO_CQ_DESC_BUF_ALIGNMENT, + .edc_sgl = 1, + .edc_endian = DDI_NEVERSWAP_ACC, + .edc_stream = B_FALSE, + }; + + if (!ena_dma_alloc(ena, &rxq->er_cq_dma, &conf, cq_descs_sz)) { + err = ENOMEM; + goto error; + } + + rxq->er_cq_descs = (void *)rxq->er_cq_dma.edb_va; + rxq->er_state |= ENA_RXQ_STATE_HOST_ALLOC; + return (0); + +error: + ena_free_rx_dma(rxq); + return (err); +} + +boolean_t +ena_alloc_rxq(ena_rxq_t *rxq) +{ + int ret = 0; + ena_t *ena = rxq->er_ena; + uint16_t cq_hw_idx, sq_hw_idx; + uint32_t *cq_unmask_addr, *cq_headdb, *cq_numanode; + uint32_t *sq_db_addr; + + /* + * First, allocate the Rx data buffers. + */ + if ((ret = ena_alloc_rx_dma(rxq)) != 0) { + ena_err(ena, "failed to allocate Rx queue %u data buffers: %d", + rxq->er_rxqs_idx, ret); + return (B_FALSE); + } + + ASSERT(rxq->er_state & ENA_RXQ_STATE_HOST_ALLOC); + + /* + * Second, create the Completion Queue. + */ + ret = ena_create_cq(ena, rxq->er_cq_num_descs, + rxq->er_cq_dma.edb_cookie->dmac_laddress, B_FALSE, + rxq->er_intr_vector, &cq_hw_idx, &cq_unmask_addr, &cq_headdb, + &cq_numanode); + + if (ret != 0) { + ena_err(ena, "failed to create Rx CQ %u: %d", rxq->er_rxqs_idx, + ret); + return (B_FALSE); + } + + /* The phase must always start on 1. */ + rxq->er_cq_phase = 1; + rxq->er_cq_head_idx = 0; + rxq->er_cq_hw_idx = cq_hw_idx; + rxq->er_cq_unmask_addr = cq_unmask_addr; + rxq->er_cq_head_db_addr = cq_headdb; + rxq->er_cq_numa_addr = cq_numanode; + rxq->er_state |= ENA_RXQ_STATE_CQ_CREATED; + + /* + * Third, create the Submission Queue to match with the above + * CQ. At this time we force the SQ and CQ to have the same + * number of descriptors as we only use a 1:1 completion + * policy. However, in the future, we could loosen this and + * use an on-demand completion policy and the two could have a + * different number of descriptors. + */ + ASSERT3U(rxq->er_sq_num_descs, ==, rxq->er_cq_num_descs); + ret = ena_create_sq(ena, rxq->er_sq_num_descs, + rxq->er_sq_dma.edb_cookie->dmac_laddress, B_FALSE, cq_hw_idx, + &sq_hw_idx, &sq_db_addr); + + if (ret != 0) { + ena_err(ena, "failed to create Rx SQ %u: %d", rxq->er_rxqs_idx, + ret); + return (B_FALSE); + } + + ASSERT3P(sq_db_addr, !=, NULL); + rxq->er_sq_hw_idx = sq_hw_idx; + rxq->er_sq_db_addr = sq_db_addr; + /* The phase must always start on 1. 
*/ + rxq->er_sq_phase = 1; + rxq->er_sq_tail_idx = 0; + rxq->er_sq_avail_descs = rxq->er_sq_num_descs; + rxq->er_mode = ENA_RXQ_MODE_INTR; + rxq->er_state |= ENA_RXQ_STATE_SQ_CREATED; + + return (B_TRUE); +} + +void +ena_cleanup_rxq(ena_rxq_t *rxq) +{ + int ret = 0; + ena_t *ena = rxq->er_ena; + + if ((rxq->er_state & ENA_RXQ_STATE_SQ_CREATED) != 0) { + ret = ena_destroy_sq(ena, rxq->er_sq_hw_idx, B_FALSE); + + if (ret != 0) { + ena_err(ena, "failed to destroy Rx SQ %u: %d", + rxq->er_rxqs_idx, ret); + } + + rxq->er_sq_hw_idx = 0; + rxq->er_sq_db_addr = NULL; + rxq->er_sq_tail_idx = 0; + rxq->er_sq_phase = 0; + rxq->er_state &= ~ENA_RXQ_STATE_SQ_CREATED; + } + + if ((rxq->er_state & ENA_RXQ_STATE_CQ_CREATED) != 0) { + ret = ena_destroy_cq(ena, rxq->er_cq_hw_idx); + + if (ret != 0) { + ena_err(ena, "failed to destroy Rx CQ %u: %d", + rxq->er_rxqs_idx, ret); + } + + rxq->er_cq_hw_idx = 0; + rxq->er_cq_head_idx = 0; + rxq->er_cq_phase = 0; + rxq->er_cq_head_db_addr = NULL; + rxq->er_cq_unmask_addr = NULL; + rxq->er_cq_numa_addr = NULL; + rxq->er_state &= ~ENA_RXQ_STATE_CQ_CREATED; + } + + ena_free_rx_dma(rxq); + ASSERT3S(rxq->er_state, ==, ENA_RXQ_STATE_NONE); +} + +void +ena_ring_rx_stop(mac_ring_driver_t rh) +{ + ena_rxq_t *rxq = (ena_rxq_t *)rh; + uint32_t intr_ctrl; + + intr_ctrl = ena_hw_abs_read32(rxq->er_ena, rxq->er_cq_unmask_addr); + ENAHW_REG_INTR_MASK(intr_ctrl); + ena_hw_abs_write32(rxq->er_ena, rxq->er_cq_unmask_addr, intr_ctrl); + + rxq->er_state &= ~ENA_RXQ_STATE_RUNNING; + rxq->er_state &= ~ENA_RXQ_STATE_READY; +} + +int +ena_ring_rx_start(mac_ring_driver_t rh, uint64_t gen_num) +{ + ena_rxq_t *rxq = (ena_rxq_t *)rh; + ena_t *ena = rxq->er_ena; + uint32_t intr_ctrl; + + mutex_enter(&rxq->er_lock); + ena_refill_rx(rxq, rxq->er_sq_num_descs); + rxq->er_m_gen_num = gen_num; + rxq->er_intr_limit = ena->ena_rxq_intr_limit; + mutex_exit(&rxq->er_lock); + + rxq->er_state |= ENA_RXQ_STATE_READY; + + intr_ctrl = ena_hw_abs_read32(ena, rxq->er_cq_unmask_addr); + ENAHW_REG_INTR_UNMASK(intr_ctrl); + ena_hw_abs_write32(ena, rxq->er_cq_unmask_addr, intr_ctrl); + rxq->er_state |= ENA_RXQ_STATE_RUNNING; + return (0); +} + +mblk_t * +ena_ring_rx(ena_rxq_t *rxq, int poll_bytes) +{ + ena_t *ena = rxq->er_ena; + uint16_t head_mod = rxq->er_cq_head_idx & (rxq->er_cq_num_descs - 1); + uint64_t total_bytes = 0; + uint64_t num_frames = 0; + enahw_rx_cdesc_t *cdesc; + boolean_t polling = B_TRUE; + mblk_t *head = NULL; + mblk_t *tail = NULL; + + ASSERT(MUTEX_HELD(&rxq->er_lock)); + ENA_DMA_SYNC(rxq->er_cq_dma, DDI_DMA_SYNC_FORKERNEL); + + if (poll_bytes == ENA_INTERRUPT_MODE) { + polling = B_FALSE; + } + + cdesc = &rxq->er_cq_descs[head_mod]; + VERIFY3P(cdesc, >=, rxq->er_cq_descs); + VERIFY3P(cdesc, <=, (rxq->er_cq_descs + rxq->er_cq_num_descs - 1)); + + while (ENAHW_RX_CDESC_PHASE(cdesc) == rxq->er_cq_phase) { + boolean_t first, last; + ena_rx_ctrl_block_t *rcb; + uint16_t req_id; + mblk_t *mp; + enahw_io_l3_proto_t l3proto; + enahw_io_l4_proto_t l4proto; + boolean_t l4csum_checked; + uint32_t hflags = 0; + + VERIFY3U(head_mod, <, rxq->er_cq_num_descs); + /* + * Currently, all incoming frames fit in a single Rx + * buffer (erd_length > total frame size). In the + * future, if we decide to loan buffers which are + * smaller, we will need to modify this code to read + * one or more descriptors (based on frame size). + * + * For this reason we do not expect any frame to span + * multiple descriptors. 
Therefore, we drop any data + * not delivered as a single descriptor, i.e., where + * 'first' and 'last' are both true. + */ + first = ENAHW_RX_CDESC_FIRST(cdesc); + last = ENAHW_RX_CDESC_LAST(cdesc); + + if (!first || !last) { + mutex_enter(&rxq->er_stat_lock); + rxq->er_stat.ers_multi_desc.value.ui64++; + mutex_exit(&rxq->er_stat_lock); + goto next_desc; + } + + req_id = cdesc->erc_req_id; + VERIFY3U(req_id, <, rxq->er_cq_num_descs); + rcb = &rxq->er_rcbs[req_id]; + rcb->ercb_offset = cdesc->erc_offset; + rcb->ercb_length = cdesc->erc_length; + ASSERT3U(rcb->ercb_length, <=, ena->ena_max_frame_total); + mp = allocb(rcb->ercb_length + ENA_RX_BUF_IPHDR_ALIGNMENT, 0); + + /* + * If we can't allocate an mblk, things are looking + * grim. Forget about this frame and move on. + */ + if (mp == NULL) { + mutex_enter(&rxq->er_stat_lock); + rxq->er_stat.ers_allocb_fail.value.ui64++; + mutex_exit(&rxq->er_stat_lock); + goto next_desc; + } + + /* + * As we pull frames we need to link them together as + * one chain to be delivered up to mac. + */ + if (head == NULL) { + head = mp; + } else { + tail->b_next = mp; + } + + tail = mp; + + /* + * We need to make sure the bytes are copied to the + * correct offset to achieve 4-byte IP header + * alignment. + * + * If we start using desballoc on the buffers, then we + * will need to make sure to apply this offset to the + * DMA buffers as well. Though it may be the case the + * device does this implicitly and that's what + * cdesc->erc_offset is for; we don't know because + * it's not documented. + */ + mp->b_wptr += ENA_RX_BUF_IPHDR_ALIGNMENT; + mp->b_rptr += ENA_RX_BUF_IPHDR_ALIGNMENT; + bcopy(rcb->ercb_dma.edb_va + rcb->ercb_offset, mp->b_wptr, + rcb->ercb_length); + mp->b_wptr += rcb->ercb_length; + total_bytes += rcb->ercb_length; + VERIFY3P(mp->b_wptr, >, mp->b_rptr); + VERIFY3P(mp->b_wptr, <=, mp->b_datap->db_lim); + + l3proto = ENAHW_RX_CDESC_L3_PROTO(cdesc); + l4proto = ENAHW_RX_CDESC_L4_PROTO(cdesc); + + /* + * When it comes to bad TCP/IP checksums we do not + * discard the packet at this level. Instead, we let + * it percolate up for further processing and tracking + * by the upstream TCP/IP stack. + */ + if (ena->ena_rx_l3_ipv4_csum && + l3proto == ENAHW_IO_L3_PROTO_IPV4) { + boolean_t l3_csum_err = + ENAHW_RX_CDESC_L3_CSUM_ERR(cdesc); + + if (l3_csum_err) { + mutex_enter(&rxq->er_stat_lock); + rxq->er_stat.ers_hck_ipv4_err.value.ui64++; + mutex_exit(&rxq->er_stat_lock); + } else { + hflags |= HCK_IPV4_HDRCKSUM_OK; + } + } + + l4csum_checked = ENAHW_RX_CDESC_L4_CSUM_CHECKED(cdesc); + + if (ena->ena_rx_l4_ipv4_csum && l4csum_checked && + l4proto == ENAHW_IO_L4_PROTO_TCP) { + boolean_t l4_csum_err = + ENAHW_RX_CDESC_L4_CSUM_ERR(cdesc); + + if (l4_csum_err) { + mutex_enter(&rxq->er_stat_lock); + rxq->er_stat.ers_hck_l4_err.value.ui64++; + mutex_exit(&rxq->er_stat_lock); + } else { + hflags |= HCK_FULLCKSUM_OK; + } + } + + if (hflags != 0) { + mac_hcksum_set(mp, 0, 0, 0, 0, hflags); + } + +next_desc: + /* + * Technically, if we arrived here due to a failure, + * then we did not read a new frame. However, we count + * it all the same anyways in order to count it as + * progress to the interrupt work limit. The failure + * stats will allow us to differentiate good frames + * from bad. 
+ */ + num_frames++; + rxq->er_cq_head_idx++; + head_mod = rxq->er_cq_head_idx & (rxq->er_cq_num_descs - 1); + + if (head_mod == 0) { + rxq->er_cq_phase = !rxq->er_cq_phase; + } + + if (polling && (total_bytes > poll_bytes)) { + break; + } else if (!polling && (num_frames >= rxq->er_intr_limit)) { + mutex_enter(&rxq->er_stat_lock); + rxq->er_stat.ers_intr_limit.value.ui64++; + mutex_exit(&rxq->er_stat_lock); + break; + } + + cdesc = &rxq->er_cq_descs[head_mod]; + VERIFY3P(cdesc, >=, rxq->er_cq_descs); + VERIFY3P(cdesc, <=, + (rxq->er_cq_descs + rxq->er_cq_num_descs - 1)); + } + + mutex_enter(&rxq->er_stat_lock); + rxq->er_stat.ers_packets.value.ui64 += num_frames; + rxq->er_stat.ers_bytes.value.ui64 += total_bytes; + mutex_exit(&rxq->er_stat_lock); + + DTRACE_PROBE4(rx__frames, mblk_t *, head, boolean_t, polling, uint64_t, + num_frames, uint64_t, total_bytes); + ena_refill_rx(rxq, num_frames); + return (head); +} + +void +ena_rx_intr_work(ena_rxq_t *rxq) +{ + mblk_t *mp; + + mutex_enter(&rxq->er_lock); + mp = ena_ring_rx(rxq, ENA_INTERRUPT_MODE); + mutex_exit(&rxq->er_lock); + + if (mp == NULL) { + return; + } + + mac_rx_ring(rxq->er_ena->ena_mh, rxq->er_mrh, mp, rxq->er_m_gen_num); +} + +mblk_t * +ena_ring_rx_poll(void *rh, int poll_bytes) +{ + ena_rxq_t *rxq = rh; + mblk_t *mp; + + ASSERT3S(poll_bytes, >, 0); + + mutex_enter(&rxq->er_lock); + mp = ena_ring_rx(rxq, poll_bytes); + mutex_exit(&rxq->er_lock); + + return (mp); +} diff --git a/usr/src/uts/common/io/ena/ena_stats.c b/usr/src/uts/common/io/ena/ena_stats.c new file mode 100644 index 0000000000..c8ef7ae260 --- /dev/null +++ b/usr/src/uts/common/io/ena/ena_stats.c @@ -0,0 +1,475 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2021 Oxide Computer Company + */ +#include "ena.h" + +/* + * The ENA device provides the following hardware stats. It appears + * that all stats are available at both a device-level and + * queue-level. However, Linux and FreeBSD don't implement queue + * scope. It's not clear how one would implement queue scope because + * there is nothing in the common code describing how to determine the + * queue index number. Both the SQ and CQ have device index values, + * but for a given logical queue they don't always match and so it's + * not clear what value to use for querying the stats. Therefore, + * device-wide basic and extended stats come from the device, while + * queue/ring stats come from driver. + * + * From empirical testing, these statistics appear to be cumulative. + * However, this guarantee is not explicitly documented anywhere in + * the common code that the author could find. + * + * BASIC (ENAHW_GET_STATS_TYPE_BASIC) + * + * - Rx packets/bytes + * - Rx drops + * - Tx packets/bytes + * - Tx drops + * + * EXTENDED (ENAHW_GET_STATS_TYPE_EXTENDED) + * + * There is no structure defined for these stats in the Linux + * driver. Based on the FreeBSD driver, it looks like extended + * stats are simply a buffer of C strings? Come back to this + * later. 
+ * + * ENI (ENAHW_GET_STATS_TYPE_ENI) + * + * - Rx Bandwidth Allowance Exceeded + * - Tx Bandwidth Allowance Exceeded + * - PPS Allowance Exceeded (presumably for combined Rx/Tx) + * - Connection Tracking PPS Allowance Exceeded + * - Link-local PPS Alloance Exceeded + */ + +static int +ena_stat_device_basic_update(kstat_t *ksp, int rw) +{ + ena_t *ena = ksp->ks_private; + ena_basic_stat_t *ebs = ksp->ks_data; + enahw_resp_desc_t resp; + enahw_resp_basic_stats_t *stats = &resp.erd_resp.erd_basic_stats; + int ret = 0; + + if (rw == KSTAT_WRITE) { + return (EACCES); + } + + if ((ret = ena_admin_get_basic_stats(ena, &resp)) != 0) { + return (ret); + } + + mutex_enter(&ena->ena_lock); + + ebs->ebs_tx_bytes.value.ui64 = + ((uint64_t)stats->erbs_tx_bytes_high << 32) | + (uint64_t)stats->erbs_tx_bytes_low; + ebs->ebs_tx_pkts.value.ui64 = + ((uint64_t)stats->erbs_tx_pkts_high << 32) | + (uint64_t)stats->erbs_tx_pkts_low; + ebs->ebs_tx_drops.value.ui64 = + ((uint64_t)stats->erbs_tx_drops_high << 32) | + (uint64_t)stats->erbs_tx_drops_low; + + ebs->ebs_rx_bytes.value.ui64 = + ((uint64_t)stats->erbs_rx_bytes_high << 32) | + (uint64_t)stats->erbs_rx_bytes_low; + ebs->ebs_rx_pkts.value.ui64 = + ((uint64_t)stats->erbs_rx_pkts_high << 32) | + (uint64_t)stats->erbs_rx_pkts_low; + ebs->ebs_rx_drops.value.ui64 = + ((uint64_t)stats->erbs_rx_drops_high << 32) | + (uint64_t)stats->erbs_rx_drops_low; + + mutex_exit(&ena->ena_lock); + + return (0); +} + +void +ena_stat_device_basic_cleanup(ena_t *ena) +{ + if (ena->ena_device_basic_kstat != NULL) { + kstat_delete(ena->ena_device_basic_kstat); + ena->ena_device_basic_kstat = NULL; + } +} + +boolean_t +ena_stat_device_basic_init(ena_t *ena) +{ + kstat_t *ksp = kstat_create(ENA_MODULE_NAME, + ddi_get_instance(ena->ena_dip), "device_basic", "net", + KSTAT_TYPE_NAMED, + sizeof (ena_basic_stat_t) / sizeof (kstat_named_t), 0); + ena_basic_stat_t *ebs = NULL; + + if (ksp == NULL) { + ena_err(ena, "!failed to create device_basic kstats"); + return (B_FALSE); + } + + ena->ena_device_basic_kstat = ksp; + ebs = ksp->ks_data; + ksp->ks_update = ena_stat_device_basic_update; + ksp->ks_private = ena; + + kstat_named_init(&ebs->ebs_tx_bytes, "tx_bytes", KSTAT_DATA_UINT64); + ebs->ebs_tx_bytes.value.ui64 = 0; + kstat_named_init(&ebs->ebs_tx_pkts, "tx_packets", KSTAT_DATA_UINT64); + ebs->ebs_tx_pkts.value.ui64 = 0; + kstat_named_init(&ebs->ebs_tx_drops, "tx_drops", KSTAT_DATA_UINT64); + ebs->ebs_tx_drops.value.ui64 = 0; + + kstat_named_init(&ebs->ebs_rx_bytes, "rx_bytes", KSTAT_DATA_UINT64); + ebs->ebs_rx_bytes.value.ui64 = 0; + kstat_named_init(&ebs->ebs_rx_pkts, "rx_packets", KSTAT_DATA_UINT64); + ebs->ebs_rx_pkts.value.ui64 = 0; + kstat_named_init(&ebs->ebs_rx_drops, "rx_drops", KSTAT_DATA_UINT64); + ebs->ebs_rx_drops.value.ui64 = 0; + + kstat_install(ena->ena_device_basic_kstat); + return (B_TRUE); +} + +int +ena_stat_device_extended_update(kstat_t *ksp, int rw) +{ + ena_t *ena = ksp->ks_private; + ena_extended_stat_t *ees = ksp->ks_data; + enahw_resp_desc_t resp; + enahw_resp_eni_stats_t *stats = &resp.erd_resp.erd_eni_stats; + int ret = 0; + + if (rw == KSTAT_WRITE) { + return (EACCES); + } + + if ((ret = ena_admin_get_eni_stats(ena, &resp)) != 0) { + return (ret); + } + + mutex_enter(&ena->ena_lock); + + ees->ees_bw_in_exceeded.value.ui64 = stats->eres_bw_in_exceeded; + ees->ees_bw_out_exceeded.value.ui64 = stats->eres_bw_out_exceeded; + ees->ees_pps_exceeded.value.ui64 = stats->eres_pps_exceeded; + ees->ees_conns_exceeded.value.ui64 = stats->eres_conns_exceeded; + 
ees->ees_linklocal_exceeded.value.ui64 = stats->eres_linklocal_exceeded; + + mutex_exit(&ena->ena_lock); + + return (0); +} + +void +ena_stat_device_extended_cleanup(ena_t *ena) +{ + if (ena->ena_device_extended_kstat != NULL) { + kstat_delete(ena->ena_device_extended_kstat); + ena->ena_device_extended_kstat = NULL; + } +} + +boolean_t +ena_stat_device_extended_init(ena_t *ena) +{ + kstat_t *ksp = kstat_create(ENA_MODULE_NAME, + ddi_get_instance(ena->ena_dip), "device_ext", "net", + KSTAT_TYPE_NAMED, + sizeof (ena_extended_stat_t) / sizeof (kstat_named_t), 0); + ena_extended_stat_t *ees; + + if (ksp == NULL) { + ena_err(ena, "!failed to create device_ext kstats"); + return (B_FALSE); + } + + ena->ena_device_extended_kstat = ksp; + ees = ksp->ks_data; + ksp->ks_update = ena_stat_device_extended_update; + ksp->ks_private = ena; + + kstat_named_init(&ees->ees_bw_in_exceeded, "bw_in_exceeded", + KSTAT_DATA_UINT64); + ees->ees_bw_in_exceeded.value.ui64 = 0; + + kstat_named_init(&ees->ees_bw_out_exceeded, "bw_out_exceeded", + KSTAT_DATA_UINT64); + ees->ees_bw_out_exceeded.value.ui64 = 0; + + kstat_named_init(&ees->ees_pps_exceeded, "pps_exceeded", + KSTAT_DATA_UINT64); + ees->ees_pps_exceeded.value.ui64 = 0; + + kstat_named_init(&ees->ees_conns_exceeded, "conns_exceeded", + KSTAT_DATA_UINT64); + ees->ees_conns_exceeded.value.ui64 = 0; + + kstat_named_init(&ees->ees_linklocal_exceeded, "linklocal_exceeded", + KSTAT_DATA_UINT64); + ees->ees_linklocal_exceeded.value.ui64 = 0; + + kstat_install(ena->ena_device_extended_kstat); + return (B_TRUE); +} + +void +ena_stat_aenq_cleanup(ena_t *ena) +{ + if (ena->ena_aenq_kstat != NULL) { + kstat_delete(ena->ena_aenq_kstat); + ena->ena_aenq_kstat = NULL; + } +} + +boolean_t +ena_stat_aenq_init(ena_t *ena) +{ + kstat_t *ksp = kstat_create(ENA_MODULE_NAME, + ddi_get_instance(ena->ena_dip), "aenq", "net", KSTAT_TYPE_NAMED, + sizeof (ena_aenq_stat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + ena_aenq_stat_t *eas = &ena->ena_aenq_stat; + + if (ksp == NULL) { + ena_err(ena, "!failed to create aenq kstats"); + return (B_FALSE); + } + + ena->ena_aenq_kstat = ksp; + ksp->ks_data = eas; + + kstat_named_init(&eas->eaes_default, "default", KSTAT_DATA_UINT64); + eas->eaes_default.value.ui64 = 0; + + kstat_named_init(&eas->eaes_link_change, "link_change", + KSTAT_DATA_UINT64); + eas->eaes_link_change.value.ui64 = 0; + + kstat_install(ena->ena_aenq_kstat); + return (B_TRUE); +} + +void +ena_stat_txq_cleanup(ena_txq_t *txq) +{ + if (txq->et_kstat != NULL) { + kstat_delete(txq->et_kstat); + txq->et_kstat = NULL; + } +} + +boolean_t +ena_stat_txq_init(ena_txq_t *txq) +{ + ena_t *ena = txq->et_ena; + kstat_t *ksp; + char buf[128]; + ena_txq_stat_t *ets = &txq->et_stat; + + (void) snprintf(buf, sizeof (buf), "txq_%d", txq->et_txqs_idx); + + ksp = kstat_create(ENA_MODULE_NAME, ddi_get_instance(ena->ena_dip), buf, + "net", KSTAT_TYPE_NAMED, + sizeof (ena_txq_stat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (ksp == NULL) { + ena_err(ena, "!failed to create %s kstats", buf); + return (B_FALSE); + } + + txq->et_kstat = ksp; + ksp->ks_data = ets; + + kstat_named_init(&ets->ets_hck_meoifail, "meoi_fail", + KSTAT_DATA_UINT64); + ets->ets_hck_meoifail.value.ui64 = 0; + + kstat_named_init(&ets->ets_blocked, "blocked", KSTAT_DATA_UINT64); + ets->ets_blocked.value.ui64 = 0; + + kstat_named_init(&ets->ets_unblocked, "unblocked", KSTAT_DATA_UINT64); + ets->ets_unblocked.value.ui64 = 0; + + kstat_named_init(&ets->ets_recycled, "recycled", KSTAT_DATA_UINT64); + 
ets->ets_recycled.value.ui64 = 0; + + kstat_named_init(&ets->ets_bytes, "bytes", KSTAT_DATA_UINT64); + ets->ets_bytes.value.ui64 = 0; + + kstat_named_init(&ets->ets_packets, "packets", KSTAT_DATA_UINT64); + ets->ets_packets.value.ui64 = 0; + + kstat_install(txq->et_kstat); + return (B_TRUE); +} + +void +ena_stat_rxq_cleanup(ena_rxq_t *rxq) +{ + if (rxq->er_kstat != NULL) { + kstat_delete(rxq->er_kstat); + rxq->er_kstat = NULL; + } +} + +boolean_t +ena_stat_rxq_init(ena_rxq_t *rxq) +{ + ena_t *ena = rxq->er_ena; + kstat_t *ksp; + char buf[128]; + ena_rxq_stat_t *ers = &rxq->er_stat; + + (void) snprintf(buf, sizeof (buf), "rxq_%d", rxq->er_rxqs_idx); + + ksp = kstat_create(ENA_MODULE_NAME, ddi_get_instance(ena->ena_dip), buf, + "net", KSTAT_TYPE_NAMED, + sizeof (ena_rxq_stat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (ksp == NULL) { + ena_err(ena, "!failed to create %s kstats", buf); + return (B_FALSE); + } + + rxq->er_kstat = ksp; + ksp->ks_data = ers; + + kstat_named_init(&ers->ers_packets, "packets", KSTAT_DATA_UINT64); + ers->ers_packets.value.ui64 = 0; + + kstat_named_init(&ers->ers_bytes, "bytes", KSTAT_DATA_UINT64); + ers->ers_bytes.value.ui64 = 0; + + kstat_named_init(&ers->ers_multi_desc, "multi_desc", KSTAT_DATA_UINT64); + ers->ers_multi_desc.value.ui64 = 0; + + kstat_named_init(&ers->ers_allocb_fail, "allocb_fail", + KSTAT_DATA_UINT64); + ers->ers_allocb_fail.value.ui64 = 0; + + kstat_named_init(&ers->ers_intr_limit, "intr_limit", KSTAT_DATA_UINT64); + ers->ers_intr_limit.value.ui64 = 0; + + kstat_named_init(&ers->ers_hck_ipv4_err, "hck_ipv4_err", + KSTAT_DATA_UINT64); + ers->ers_hck_ipv4_err.value.ui64 = 0; + + kstat_named_init(&ers->ers_hck_l4_err, "hck_l4_err", KSTAT_DATA_UINT64); + ers->ers_hck_l4_err.value.ui64 = 0; + + kstat_install(rxq->er_kstat); + return (B_TRUE); +} + +int +ena_ring_rx_stat(mac_ring_driver_t rh, uint_t stat, uint64_t *val) +{ + int ret = 0; + ena_rxq_t *rxq = (ena_rxq_t *)rh; + + mutex_enter(&rxq->er_stat_lock); + + switch (stat) { + case MAC_STAT_RBYTES: + *val = rxq->er_stat.ers_bytes.value.ui64; + break; + case MAC_STAT_IPACKETS: + *val = rxq->er_stat.ers_packets.value.ui64; + break; + default: + *val = 0; + ret = ENOTSUP; + } + + mutex_exit(&rxq->er_stat_lock); + return (ret); +} + +int +ena_ring_tx_stat(mac_ring_driver_t rh, uint_t stat, uint64_t *val) +{ + int ret = 0; + ena_txq_t *txq = (ena_txq_t *)rh; + + mutex_enter(&txq->et_stat_lock); + + switch (stat) { + case MAC_STAT_OBYTES: + *val = txq->et_stat.ets_bytes.value.ui64; + break; + case MAC_STAT_OPACKETS: + *val = txq->et_stat.ets_packets.value.ui64; + break; + default: + *val = 0; + ret = ENOTSUP; + } + + mutex_exit(&txq->et_stat_lock); + return (ret); +} + +int +ena_m_stat(void *arg, uint_t stat, uint64_t *val) +{ + ena_t *ena = arg; + ena_basic_stat_t *ebs = ena->ena_device_basic_kstat->ks_data; + int ret = 0; + + ret = ena_stat_device_basic_update(ena->ena_device_basic_kstat, + KSTAT_READ); + + if (ret != 0) { + return (ret); + } + + mutex_enter(&ena->ena_lock); + + /* + * The ENA device does not provide a lot of the stats that a + * traditional NIC device would. 
+ */ + switch (stat) { + case MAC_STAT_NORCVBUF: + *val = ebs->ebs_rx_drops.value.ui64; + break; + + case MAC_STAT_RBYTES: + *val = ebs->ebs_rx_bytes.value.ui64; + break; + + case MAC_STAT_IPACKETS: + *val = ebs->ebs_rx_pkts.value.ui64; + break; + + case MAC_STAT_OBYTES: + *val = ebs->ebs_tx_bytes.value.ui64; + break; + + case MAC_STAT_OPACKETS: + *val = ebs->ebs_tx_pkts.value.ui64; + break; + + default: + ret = ENOTSUP; + break; + } + + mutex_exit(&ena->ena_lock); + return (ret); +} diff --git a/usr/src/uts/common/io/ena/ena_tx.c b/usr/src/uts/common/io/ena/ena_tx.c new file mode 100644 index 0000000000..30773496b0 --- /dev/null +++ b/usr/src/uts/common/io/ena/ena_tx.c @@ -0,0 +1,534 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2021 Oxide Computer Company + */ +#include "ena.h" + +void +ena_free_tx_dma(ena_txq_t *txq) +{ + if (txq->et_tcbs != NULL) { + for (uint_t i = 0; i < txq->et_sq_num_descs; i++) { + ena_tx_control_block_t *tcb = &txq->et_tcbs[i]; + ena_dma_free(&tcb->etcb_dma); + } + + kmem_free(txq->et_tcbs, + sizeof (*txq->et_tcbs) * txq->et_sq_num_descs); + + txq->et_tcbs = NULL; + + } + + ena_dma_free(&txq->et_cq_dma); + txq->et_cq_descs = NULL; + + ena_dma_free(&txq->et_sq_dma); + txq->et_sq_descs = NULL; + + txq->et_state &= ~ENA_TXQ_STATE_HOST_ALLOC; +} + +static int +ena_alloc_tx_dma(ena_txq_t *txq) +{ + ena_t *ena = txq->et_ena; + size_t cq_descs_sz; + size_t sq_descs_sz; + int err = 0; + ena_dma_conf_t conf; + + ASSERT0(txq->et_state & ENA_TXQ_STATE_HOST_ALLOC); + ASSERT3P(ena, !=, NULL); + + cq_descs_sz = txq->et_cq_num_descs * sizeof (*txq->et_cq_descs); + sq_descs_sz = txq->et_sq_num_descs * sizeof (*txq->et_sq_descs); + + conf = (ena_dma_conf_t) { + .edc_size = sq_descs_sz, + .edc_align = ENAHW_IO_SQ_DESC_BUF_ALIGNMENT, + .edc_sgl = 1, + .edc_endian = DDI_NEVERSWAP_ACC, + .edc_stream = B_FALSE, + }; + + if (!ena_dma_alloc(ena, &txq->et_sq_dma, &conf, sq_descs_sz)) { + return (ENOMEM); + } + + bzero(txq->et_sq_dma.edb_va, sq_descs_sz); + txq->et_sq_descs = (void *)txq->et_sq_dma.edb_va; + txq->et_tcbs = kmem_zalloc(sizeof (*txq->et_tcbs) * + txq->et_sq_num_descs, KM_SLEEP); + + for (uint_t i = 0; i < txq->et_sq_num_descs; i++) { + ena_tx_control_block_t *tcb = &txq->et_tcbs[i]; + ena_dma_conf_t buf_conf = { + .edc_size = ena->ena_tx_buf_sz, + .edc_align = 1, + .edc_sgl = ena->ena_tx_sgl_max_sz, + .edc_endian = DDI_NEVERSWAP_ACC, + .edc_stream = B_TRUE, + }; + + if (!ena_dma_alloc(ena, &tcb->etcb_dma, &buf_conf, + ena->ena_tx_buf_sz)) { + err = ENOMEM; + goto error; + } + } + + conf = (ena_dma_conf_t) { + .edc_size = cq_descs_sz, + .edc_align = ENAHW_IO_CQ_DESC_BUF_ALIGNMENT, + .edc_sgl = 1, + .edc_endian = DDI_NEVERSWAP_ACC, + .edc_stream = B_FALSE, + }; + + if (!ena_dma_alloc(ena, &txq->et_cq_dma, &conf, cq_descs_sz)) { + err = ENOMEM; + goto error; + } + + bzero(txq->et_cq_dma.edb_va, cq_descs_sz); + txq->et_cq_descs = (void *)txq->et_cq_dma.edb_va; + txq->et_state |= ENA_TXQ_STATE_HOST_ALLOC; + return (0); + +error: + ena_free_tx_dma(txq); + return (err); +} + +boolean_t +ena_alloc_txq(ena_txq_t *txq) +{ + int ret = 0; + ena_t *ena = txq->et_ena; + uint16_t cq_hw_idx, 
sq_hw_idx;
+ uint32_t *cq_unmask_addr, *cq_headdb, *cq_numanode;
+ uint32_t *sq_db_addr;
+
+ ASSERT3U(txq->et_cq_num_descs, >, 0);
+
+ /*
+ * First, allocate the Tx data buffers.
+ */
+ if ((ret = ena_alloc_tx_dma(txq)) != 0) {
+ ena_err(ena, "failed to allocate Tx queue %u data buffers: %d",
+ txq->et_txqs_idx, ret);
+ return (B_FALSE);
+ }
+
+ ASSERT(txq->et_state & ENA_TXQ_STATE_HOST_ALLOC);
+
+ /*
+ * Second, create the Completion Queue.
+ */
+ ret = ena_create_cq(ena, txq->et_cq_num_descs,
+ txq->et_cq_dma.edb_cookie->dmac_laddress, B_TRUE,
+ txq->et_intr_vector, &cq_hw_idx, &cq_unmask_addr, &cq_headdb,
+ &cq_numanode);
+
+ if (ret != 0) {
+ ena_err(ena, "failed to create Tx CQ %u: %d", txq->et_txqs_idx,
+ ret);
+ return (B_FALSE);
+ }
+
+ txq->et_cq_hw_idx = cq_hw_idx;
+ txq->et_cq_phase = 1;
+ txq->et_cq_unmask_addr = cq_unmask_addr;
+ txq->et_cq_head_db_addr = cq_headdb;
+ txq->et_cq_numa_addr = cq_numanode;
+ txq->et_state |= ENA_TXQ_STATE_CQ_CREATED;
+
+ /*
+ * Third, create the Submission Queue to match with the above
+ * CQ. At this time we force the SQ and CQ to have the same
+ * number of descriptors as we only use a 1:1 completion
+ * policy. However, in the future, we could loosen this and
+ * use an on-demand completion policy and the two could have a
+ * different number of descriptors.
+ */
+ ASSERT3U(txq->et_sq_num_descs, ==, txq->et_cq_num_descs);
+
+ ret = ena_create_sq(ena, txq->et_sq_num_descs,
+ txq->et_sq_dma.edb_cookie->dmac_laddress, B_TRUE, cq_hw_idx,
+ &sq_hw_idx, &sq_db_addr);
+
+ if (ret != 0) {
+ ena_err(ena, "failed to create Tx SQ %u: %d", txq->et_txqs_idx,
+ ret);
+ return (B_FALSE);
+ }
+
+ txq->et_sq_hw_idx = sq_hw_idx;
+ txq->et_sq_db_addr = sq_db_addr;
+ /* The phase must always start on 1. */
+ txq->et_sq_phase = 1;
+ txq->et_sq_avail_descs = txq->et_sq_num_descs;
+ txq->et_blocked = B_FALSE;
+ txq->et_state |= ENA_TXQ_STATE_SQ_CREATED;
+
+ return (B_TRUE);
+}
+
+void
+ena_cleanup_txq(ena_txq_t *txq)
+{
+ int ret = 0;
+ ena_t *ena = txq->et_ena;
+
+ if ((txq->et_state & ENA_TXQ_STATE_SQ_CREATED) != 0) {
+ ret = ena_destroy_sq(ena, txq->et_sq_hw_idx, B_TRUE);
+
+ if (ret != 0) {
+ ena_err(ena, "failed to destroy Tx SQ %u: %d",
+ txq->et_txqs_idx, ret);
+ }
+
+ txq->et_sq_hw_idx = 0;
+ txq->et_sq_db_addr = NULL;
+ txq->et_sq_tail_idx = 0;
+ txq->et_sq_phase = 0;
+ txq->et_state &= ~ENA_TXQ_STATE_SQ_CREATED;
+ }
+
+ if ((txq->et_state & ENA_TXQ_STATE_CQ_CREATED) != 0) {
+ ret = ena_destroy_cq(ena, txq->et_cq_hw_idx);
+
+ if (ret != 0) {
+ ena_err(ena, "failed to destroy Tx CQ %u: %d",
+ txq->et_txqs_idx, ret);
+ }
+
+ txq->et_cq_hw_idx = 0;
+ txq->et_cq_head_idx = 0;
+ txq->et_cq_phase = 0;
+ txq->et_cq_head_db_addr = NULL;
+ txq->et_cq_unmask_addr = NULL;
+ txq->et_cq_numa_addr = NULL;
+ txq->et_state &= ~ENA_TXQ_STATE_CQ_CREATED;
+ }
+
+ ena_free_tx_dma(txq);
+ VERIFY3S(txq->et_state, ==, ENA_TXQ_STATE_NONE);
+}
+
+void
+ena_ring_tx_stop(mac_ring_driver_t rh)
+{
+ ena_txq_t *txq = (ena_txq_t *)rh;
+ uint32_t intr_ctrl;
+
+ intr_ctrl = ena_hw_abs_read32(txq->et_ena, txq->et_cq_unmask_addr);
+ ENAHW_REG_INTR_MASK(intr_ctrl);
+ ena_hw_abs_write32(txq->et_ena, txq->et_cq_unmask_addr, intr_ctrl);
+
+ txq->et_state &= ~ENA_TXQ_STATE_RUNNING;
+ txq->et_state &= ~ENA_TXQ_STATE_READY;
+}
+
+int
+ena_ring_tx_start(mac_ring_driver_t rh, uint64_t gen_num)
+{
+ ena_txq_t *txq = (ena_txq_t *)rh;
+ ena_t *ena = txq->et_ena;
+ uint32_t intr_ctrl;
+
+ mutex_enter(&txq->et_lock);
+ txq->et_m_gen_num = gen_num;
+ mutex_exit(&txq->et_lock);
+
+ txq->et_state |=
ENA_TXQ_STATE_READY; + + intr_ctrl = ena_hw_abs_read32(ena, txq->et_cq_unmask_addr); + ENAHW_REG_INTR_UNMASK(intr_ctrl); + ena_hw_abs_write32(ena, txq->et_cq_unmask_addr, intr_ctrl); + txq->et_state |= ENA_TXQ_STATE_RUNNING; + return (0); +} + +static void +ena_tx_copy_fragment(ena_tx_control_block_t *tcb, const mblk_t *mp, + const size_t off, const size_t len) +{ + const void *soff = mp->b_rptr + off; + void *doff = + (void *)(tcb->etcb_dma.edb_va + tcb->etcb_dma.edb_used_len); + + VERIFY3U(len, >, 0); + VERIFY3P(soff, >=, mp->b_rptr); + VERIFY3P(soff, <=, mp->b_wptr); + VERIFY3U(len, <=, MBLKL(mp)); + VERIFY3U((uintptr_t)soff + len, <=, (uintptr_t)mp->b_wptr); + VERIFY3U(tcb->etcb_dma.edb_used_len + len, <, tcb->etcb_dma.edb_len); + + bcopy(soff, doff, len); + tcb->etcb_type = ENA_TCB_COPY; + tcb->etcb_dma.edb_used_len += len; +} + +ena_tx_control_block_t * +ena_pull_tcb(const ena_txq_t *txq, mblk_t *mp) +{ + mblk_t *nmp = mp; + ena_t *ena = txq->et_ena; + ena_tx_control_block_t *tcb = NULL; + const uint16_t tail_mod = + txq->et_sq_tail_idx & (txq->et_sq_num_descs - 1); + + ASSERT(MUTEX_HELD(&txq->et_lock)); + VERIFY3U(msgsize(mp), <, ena->ena_tx_buf_sz); + + while (nmp != NULL) { + const size_t nmp_len = MBLKL(nmp); + + if (nmp_len == 0) { + nmp = nmp->b_cont; + continue; + } + + /* For now TCB is bound to SQ desc. */ + if (tcb == NULL) { + tcb = &txq->et_tcbs[tail_mod]; + } + + ena_tx_copy_fragment(tcb, nmp, 0, nmp_len); + nmp = nmp->b_cont; + } + + ENA_DMA_SYNC(tcb->etcb_dma, DDI_DMA_SYNC_FORDEV); + VERIFY3P(nmp, ==, NULL); + VERIFY3P(tcb, !=, NULL); + return (tcb); +} + +static void +ena_fill_tx_data_desc(ena_txq_t *txq, ena_tx_control_block_t *tcb, + uint16_t tail, uint8_t phase, enahw_tx_data_desc_t *desc, + mac_ether_offload_info_t *meo, size_t mlen) +{ + VERIFY3U(mlen, <=, ENAHW_TX_DESC_LENGTH_MASK); + +#ifdef DEBUG + /* + * If there is no header for the specific layer it will be set + * to zero, thus we elide the meoi_flags check here. + */ + size_t hdr_len = meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen; + ASSERT3U(hdr_len, <=, txq->et_ena->ena_tx_max_hdr_len); +#endif + + bzero(desc, sizeof (*desc)); + ENAHW_TX_DESC_FIRST_ON(desc); + ENAHW_TX_DESC_LENGTH(desc, mlen); + ENAHW_TX_DESC_REQID_HI(desc, tail); + ENAHW_TX_DESC_REQID_LO(desc, tail); + ENAHW_TX_DESC_PHASE(desc, phase); + ENAHW_TX_DESC_DF_ON(desc); + ENAHW_TX_DESC_LAST_ON(desc); + ENAHW_TX_DESC_COMP_REQ_ON(desc); + ENAHW_TX_DESC_META_DESC_OFF(desc); + ENAHW_TX_DESC_ADDR_LO(desc, tcb->etcb_dma.edb_cookie->dmac_laddress); + ENAHW_TX_DESC_ADDR_HI(desc, tcb->etcb_dma.edb_cookie->dmac_laddress); + /* + * NOTE: Please see the block comment above + * etd_buff_addr_hi_hdr_sz to see why this is set to 0. + */ + ENAHW_TX_DESC_HEADER_LENGTH(desc, 0); + ENAHW_TX_DESC_TSO_OFF(desc); + ENAHW_TX_DESC_L3_CSUM_OFF(desc); + ENAHW_TX_DESC_L4_CSUM_OFF(desc); + /* + * Enabling this bit tells the device NOT to calculate the + * pseudo header checksum. + */ + ENAHW_TX_DESC_L4_CSUM_PARTIAL_ON(desc); +} + +static void +ena_submit_tx(ena_txq_t *txq, uint16_t desc_idx) +{ + ena_hw_abs_write32(txq->et_ena, txq->et_sq_db_addr, desc_idx); +} + +/* + * For now we do the simplest thing possible. All Tx uses bcopy to + * pre-allocated buffers, no checksum, no TSO, etc. 
+
+/*
+ * For now we do the simplest thing possible. All Tx uses bcopy to
+ * pre-allocated buffers, no checksum, no TSO, etc.
+ */
+mblk_t *
+ena_ring_tx(void *arg, mblk_t *mp)
+{
+        ena_txq_t *txq = arg;
+        ena_t *ena = txq->et_ena;
+        mac_ether_offload_info_t meo;
+        enahw_tx_data_desc_t *desc;
+        ena_tx_control_block_t *tcb;
+        const uint16_t tail_mod =
+            txq->et_sq_tail_idx & (txq->et_sq_num_descs - 1);
+
+        VERIFY3P(mp->b_next, ==, NULL);
+        VERIFY(txq->et_blocked == B_FALSE);
+
+        /*
+         * The ena_state value is written by atomic operations. The
+         * et_state value is currently Write Once, but if that changes
+         * it should also be written with atomics.
+         */
+        if (!(ena->ena_state & ENA_STATE_RUNNING) ||
+            !(txq->et_state & ENA_TXQ_STATE_RUNNING)) {
+                freemsg(mp);
+                return (NULL);
+        }
+
+        if (mac_ether_offload_info(mp, &meo) != 0) {
+                freemsg(mp);
+                mutex_enter(&txq->et_stat_lock);
+                txq->et_stat.ets_hck_meoifail.value.ui64++;
+                mutex_exit(&txq->et_stat_lock);
+                return (NULL);
+        }
+
+        mutex_enter(&txq->et_lock);
+
+        /*
+         * For the moment there is a 1:1 mapping between Tx descs and
+         * Tx contexts. Currently Tx is copy only, and each context
+         * buffer is guaranteed to be as large as MTU + frame header,
+         * see ena_update_buf_sizes().
+         */
+        if (txq->et_sq_avail_descs == 0) {
+                txq->et_blocked = B_TRUE;
+                mutex_enter(&txq->et_stat_lock);
+                txq->et_stat.ets_blocked.value.ui64++;
+                mutex_exit(&txq->et_stat_lock);
+                mutex_exit(&txq->et_lock);
+                return (mp);
+        }
+
+        ASSERT3U(meo.meoi_len, <=, ena->ena_max_frame_total);
+        tcb = ena_pull_tcb(txq, mp);
+        ASSERT3P(tcb, !=, NULL);
+        tcb->etcb_mp = mp;
+        txq->et_sq_avail_descs--;
+
+        /* Fill in the Tx descriptor. */
+        desc = &(txq->et_sq_descs[tail_mod].etd_data);
+        ena_fill_tx_data_desc(txq, tcb, tail_mod, txq->et_sq_phase, desc, &meo,
+            meo.meoi_len);
+        DTRACE_PROBE3(tx__submit, ena_tx_control_block_t *, tcb, uint16_t,
+            tail_mod, enahw_tx_data_desc_t *, desc);
+
+        /*
+         * Remember, we submit the raw tail value to the device, the
+         * hardware performs its own modulo (like we did to get
+         * tail_mod).
+         */
+        txq->et_sq_tail_idx++;
+        ena_submit_tx(txq, txq->et_sq_tail_idx);
+
+        mutex_enter(&txq->et_stat_lock);
+        txq->et_stat.ets_packets.value.ui64++;
+        txq->et_stat.ets_bytes.value.ui64 += meo.meoi_len;
+        mutex_exit(&txq->et_stat_lock);
+
+        if ((txq->et_sq_tail_idx & (txq->et_sq_num_descs - 1)) == 0) {
+                txq->et_sq_phase = !txq->et_sq_phase;
+        }
+
+        mutex_exit(&txq->et_lock);
+        return (NULL);
+}
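ena_ring_tx() above flips et_sq_phase every time the tail index wraps back to slot 0, and ena_tx_intr_work() below consumes completion descriptors only while their phase matches et_cq_phase. The following minimal, self-contained model shows the flip; the four-entry ring is a made-up example, not a driver value.

/*
 * Sketch of the phase-bit convention: the phase starts at 1 and is
 * inverted each time the ring index wraps back to slot 0.  This is a
 * minimal model of the logic above, not driver code.
 */
#include <stdint.h>
#include <assert.h>

int
main(void)
{
	const uint16_t num_descs = 4;	/* power of two, as required */
	uint16_t tail = 0;
	uint8_t phase = 1;		/* phase must always start on 1 */

	for (int i = 0; i < 8; i++) {
		tail++;
		if ((tail & (num_descs - 1)) == 0)
			phase = !phase;
	}
	/* After two full wraps (8 entries) the phase is back to 1. */
	assert(phase == 1);
	return (0);
}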
+
+void
+ena_tx_intr_work(ena_txq_t *txq)
+{
+        uint16_t head_mod;
+        enahw_tx_cdesc_t *cdesc;
+        ena_tx_control_block_t *tcb;
+        uint16_t req_id;
+        uint64_t recycled = 0;
+        boolean_t unblocked = B_FALSE;
+
+        mutex_enter(&txq->et_lock);
+        head_mod = txq->et_cq_head_idx & (txq->et_cq_num_descs - 1);
+        ENA_DMA_SYNC(txq->et_cq_dma, DDI_DMA_SYNC_FORKERNEL);
+        cdesc = &txq->et_cq_descs[head_mod];
+
+        /* Recycle any completed descriptors. */
+        while (ENAHW_TX_CDESC_GET_PHASE(cdesc) == txq->et_cq_phase) {
+                mblk_t *mp;
+
+                /* Get the corresponding TCB. */
+                req_id = cdesc->etc_req_id;
+                /*
+                 * It would be nice to make this a device reset
+                 * instead.
+                 */
+                VERIFY3U(req_id, <, txq->et_sq_num_descs);
+                tcb = &txq->et_tcbs[req_id];
+                DTRACE_PROBE2(tx__complete, uint16_t, req_id,
+                    ena_tx_control_block_t *, tcb);
+
+                /* Free the associated mblk. */
+                tcb->etcb_dma.edb_used_len = 0;
+                mp = tcb->etcb_mp;
+                /* Make this a device reset instead. */
+                VERIFY3P(mp, !=, NULL);
+                freemsg(mp);
+                tcb->etcb_mp = NULL;
+
+                /* Add this descriptor back to the free list. */
+                txq->et_sq_avail_descs++;
+                txq->et_cq_head_idx++;
+
+                /* Check for phase rollover. */
+                head_mod = txq->et_cq_head_idx & (txq->et_cq_num_descs - 1);
+
+                if (head_mod == 0) {
+                        txq->et_cq_phase = !txq->et_cq_phase;
+                }
+
+                if (txq->et_blocked) {
+                        txq->et_blocked = B_FALSE;
+                        unblocked = B_TRUE;
+                        mac_tx_ring_update(txq->et_ena->ena_mh, txq->et_mrh);
+                }
+
+                recycled++;
+                cdesc = &txq->et_cq_descs[head_mod];
+        }
+
+        /*
+         * If the device provided a head doorbell register, then we
+         * need to update it to let the device know we are done
+         * reading these CQ entries.
+         */
+        if (txq->et_cq_head_db_addr != NULL) {
+                ena_hw_abs_write32(txq->et_ena, txq->et_cq_head_db_addr,
+                    head_mod);
+        }
+
+        mutex_exit(&txq->et_lock);
+
+        /* Update stats. */
+        mutex_enter(&txq->et_stat_lock);
+        txq->et_stat.ets_recycled.value.ui64 += recycled;
+        if (unblocked) {
+                txq->et_stat.ets_unblocked.value.ui64++;
+        }
+        mutex_exit(&txq->et_stat_lock);
+}
diff --git a/usr/src/uts/common/mapfiles/ddi.mapfile b/usr/src/uts/common/mapfiles/ddi.mapfile
index 798367c7e3..a9f4f2d730 100644
--- a/usr/src/uts/common/mapfiles/ddi.mapfile
+++ b/usr/src/uts/common/mapfiles/ddi.mapfile
@@ -12,6 +12,7 @@
 #
 # Copyright 2020 Joyent, Inc.
 # Copyright 2020 RackTop Systems, Inc.
+# Copyright 2021 Oxide Computer Company
 #
 #
@@ -78,6 +79,7 @@ SYMBOL_SCOPE {
 ddi_dma_addr_bind_handle { FLAGS = EXTERN };
 ddi_dma_alloc_handle { FLAGS = EXTERN };
 ddi_dma_cookie_iter { FLAGS = EXTERN };
+ddi_dma_cookie_one { FLAGS = EXTERN };
 ddi_dma_free_handle { FLAGS = EXTERN };
 ddi_dma_mem_alloc { FLAGS = EXTERN };
 ddi_dma_mem_free { FLAGS = EXTERN };
@@ -153,6 +155,7 @@ SYMBOL_SCOPE {
 dev_err { FLAGS = EXTERN };
 drv_usectohz { FLAGS = EXTERN };
 drv_usecwait { FLAGS = EXTERN };
+ffs { FLAGS = EXTERN };
 fm_ena_generate { FLAGS = EXTERN };
 freeb { FLAGS = EXTERN };
 freemsg { FLAGS = EXTERN };
@@ -168,6 +171,7 @@ SYMBOL_SCOPE {
 list_create { FLAGS = EXTERN };
 list_destroy { FLAGS = EXTERN };
 list_head { FLAGS = EXTERN };
+list_insert_head { FLAGS = EXTERN };
 list_insert_tail { FLAGS = EXTERN };
 list_next { FLAGS = EXTERN };
 list_remove { FLAGS = EXTERN };
@@ -219,9 +223,12 @@ SYMBOL_SCOPE {
 strcat { FLAGS = EXTERN };
 strcmp { FLAGS = EXTERN };
 strcpy { FLAGS = EXTERN };
+strlcpy { FLAGS = EXTERN };
 strlen { FLAGS = EXTERN };
 timeout { FLAGS = EXTERN };
 untimeout { FLAGS = EXTERN };
+vcmn_err { FLAGS = EXTERN };
+vdev_err { FLAGS = EXTERN };
 vsnprintf { FLAGS = EXTERN };
 vsprintf { FLAGS = EXTERN };
 };
diff --git a/usr/src/uts/common/mapfiles/kernel.mapfile b/usr/src/uts/common/mapfiles/kernel.mapfile
index 21a691dca2..6fcc1fa371 100644
--- a/usr/src/uts/common/mapfiles/kernel.mapfile
+++ b/usr/src/uts/common/mapfiles/kernel.mapfile
@@ -11,6 +11,7 @@
 #
 # Copyright 2016 Joyent, Inc.
+# Copyright 2021 Oxide Computer Company
 #
 #
@@ -40,4 +41,6 @@ SYMBOL_SCOPE {
 servicing_interrupt { FLAGS = EXTERN };
 fnvlist_alloc { FLAGS = EXTERN };
 fnvlist_add_string { FLAGS = EXTERN };
+ncpus_online { FLAGS = EXTERN };
+utsname { FLAGS = EXTERN };
 };
diff --git a/usr/src/uts/common/sys/ethernet.h b/usr/src/uts/common/sys/ethernet.h
index 5b9de2f2bf..4febb8915f 100644
--- a/usr/src/uts/common/sys/ethernet.h
+++ b/usr/src/uts/common/sys/ethernet.h
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright 2014 Garrett D'Amore <garrett@damore.org>
+ * Copyright 2021 Oxide Computer Company
  *
  * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
@@ -140,6 +141,8 @@ struct ether_vlan_extinfo {
 #endif
 
 #ifdef _KERNEL
+#define	ETHER_IS_MULTICAST(addr) (((addr)[0] & 0x01) != 0)
+
 extern int localetheraddr(struct ether_addr *, struct ether_addr *);
 extern char *ether_sprintf(struct ether_addr *);
 extern int ether_aton(char *, uchar_t *);
diff --git a/usr/src/uts/intel/Makefile.intel b/usr/src/uts/intel/Makefile.intel
index cd5eabf7c5..4d1d2664c3 100644
--- a/usr/src/uts/intel/Makefile.intel
+++ b/usr/src/uts/intel/Makefile.intel
@@ -27,6 +27,7 @@
 # Copyright 2018 Nexenta Systems, Inc.
 # Copyright 2019 RackTop Systems
 # Copyright 2019 Peter Tribble.
+# Copyright 2021 Oxide Computer Company
 #
 #
@@ -385,6 +386,7 @@ DRV_KMODS += dmfe
 DRV_KMODS += e1000g
 DRV_KMODS += efe
 DRV_KMODS += elxl
+DRV_KMODS += ena
 DRV_KMODS += hme
 DRV_KMODS += mxfe
 DRV_KMODS += nge
diff --git a/usr/src/uts/intel/ena/Makefile b/usr/src/uts/intel/ena/Makefile
new file mode 100644
index 0000000000..bef9878cc0
--- /dev/null
+++ b/usr/src/uts/intel/ena/Makefile
@@ -0,0 +1,47 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2021 Oxide Computer Company
+#
+
+UTSBASE = ../..
+
+MODULE = ena
+OBJECTS = $(ENA_OBJS:%=$(OBJS_DIR)/%)
+ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE)
+CONF_SRCDIR = $(UTSBASE)/common/io/ena
+
+include $(UTSBASE)/intel/Makefile.intel
+
+CPPFLAGS += -I$(UTSBASE)/common/io/ena
+
+ALL_TARGET = $(BINARY) $(CONFMOD)
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)
+
+LDFLAGS += -dy -N misc/mac
+
+MAPFILES += ddi mac kernel
+
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+install: $(INSTALL_DEPS)
+
+include $(UTSBASE)/Makefile.mapfile
+include $(UTSBASE)/intel/Makefile.targ
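The ETHER_IS_MULTICAST() macro added to sys/ethernet.h above simply tests the group bit, the least-significant bit of the first octet of a destination MAC address. The usage sketch below re-defines the macro locally and uses made-up addresses so it compiles as a stand-alone userland program; it is an illustration, not driver code.

/*
 * Hedged usage sketch for the ETHER_IS_MULTICAST() macro added to
 * sys/ethernet.h in this change.  Example addresses only.
 */
#include <stdio.h>
#include <stdint.h>

#define	ETHER_IS_MULTICAST(addr) (((addr)[0] & 0x01) != 0)

int
main(void)
{
	uint8_t bcast[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
	uint8_t ucast[6] = { 0x02, 0x00, 0x5e, 0x00, 0x00, 0x01 };

	/* The broadcast address has the group bit set. */
	printf("bcast multicast? %d\n", ETHER_IS_MULTICAST(bcast));
	/* 02:... is locally administered but not a group address. */
	printf("ucast multicast? %d\n", ETHER_IS_MULTICAST(ucast));
	return (0);
}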