author     Ryan Zezeski <ryan@zinascii.com>          2020-08-25 00:52:37 -0600
committer  Dan McDonald <danmcd@joyent.com>          2021-11-23 13:18:50 -0500
commit     6f443ebc1fb4fec01d6e8fa8ca4648182ed215bb (patch)
tree       5c4551c6d6caaaf138fe369af872c3fc31d02c8a
parent     a28480febf31f0e61debac062a55216a98a05a92 (diff)
download   illumos-joyent-6f443ebc1fb4fec01d6e8fa8ca4648182ed215bb.tar.gz
13689 Want AWS ENA driver
Reviewed by: Robert Mustacchi <rm@fingolfin.org>
Approved by: Dan McDonald <danmcd@joyent.com>
-rw-r--r--  usr/src/man/man7d/Makefile                       2
-rw-r--r--  usr/src/man/man7d/ena.7d                       135
-rw-r--r--  usr/src/pkg/manifests/driver-network-ena.p5m    36
-rw-r--r--  usr/src/uts/common/Makefile.files                8
-rw-r--r--  usr/src/uts/common/Makefile.rules                6
-rw-r--r--  usr/src/uts/common/io/ena/ena.c               1944
-rw-r--r--  usr/src/uts/common/io/ena/ena.conf              50
-rw-r--r--  usr/src/uts/common/io/ena/ena.h                848
-rw-r--r--  usr/src/uts/common/io/ena/ena_admin.c          674
-rw-r--r--  usr/src/uts/common/io/ena/ena_dma.c            191
-rw-r--r--  usr/src/uts/common/io/ena/ena_gld.c            465
-rw-r--r--  usr/src/uts/common/io/ena/ena_hw.c              93
-rw-r--r--  usr/src/uts/common/io/ena/ena_hw.h            1930
-rw-r--r--  usr/src/uts/common/io/ena/ena_intr.c           175
-rw-r--r--  usr/src/uts/common/io/ena/ena_rx.c             531
-rw-r--r--  usr/src/uts/common/io/ena/ena_stats.c          475
-rw-r--r--  usr/src/uts/common/io/ena/ena_tx.c             534
-rw-r--r--  usr/src/uts/common/mapfiles/ddi.mapfile          7
-rw-r--r--  usr/src/uts/common/mapfiles/kernel.mapfile       3
-rw-r--r--  usr/src/uts/common/sys/ethernet.h                3
-rw-r--r--  usr/src/uts/intel/Makefile.intel                 2
-rw-r--r--  usr/src/uts/intel/ena/Makefile                  47
22 files changed, 8158 insertions, 1 deletion
diff --git a/usr/src/man/man7d/Makefile b/usr/src/man/man7d/Makefile
index 9da7d4b205..af38c7a9bd 100644
--- a/usr/src/man/man7d/Makefile
+++ b/usr/src/man/man7d/Makefile
@@ -16,6 +16,7 @@
# Copyright 2016 Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
# Copyright 2018 Nexenta Systems, Inc.
# Copyright 2020 Peter Tribble
+# Copyright 2021 Oxide Computer Company
#
include $(SRC)/Makefile.master
@@ -46,6 +47,7 @@ _MANFILES= aac.7d \
dtrace.7d \
e1000g.7d \
ehci.7d \
+ ena.7d \
fasttrap.7d \
fbt.7d \
fcip.7d \
diff --git a/usr/src/man/man7d/ena.7d b/usr/src/man/man7d/ena.7d
new file mode 100644
index 0000000000..d4070e1745
--- /dev/null
+++ b/usr/src/man/man7d/ena.7d
@@ -0,0 +1,135 @@
+.\"
+.\" This file and its contents are supplied under the terms of the
+.\" Common Development and Distribution License ("CDDL"), version 1.0.
+.\" You may only use this file in accordance with the terms of version
+.\" 1.0 of the CDDL.
+.\"
+.\" A full copy of the text of the CDDL should have accompanied this
+.\" source. A copy of the CDDL is also available via the Internet at
+.\" http://www.illumos.org/license/CDDL.
+.\"
+.\"
+.\" Copyright 2021 Oxide Computer Company
+.\"
+.Dd November 17, 2021
+.Dt ENA 7D
+.Os
+.Sh NAME
+.Nm ena
+.Nd Driver for the AWS Elastic Network Adapter
+.Sh SYNOPSIS
+.Pa /dev/net/ena*
+.Sh DESCRIPTION
+The
+.Sy ena
+driver is a GLDv3 NIC driver for the AWS Elastic Network Adapter
+family of virtual devices.
+The driver supports:
+.Bl -dash -offset indent
+.It
+Jumbo frames up to 9216 bytes.
+.It
+Multiple Rx and Tx rings.
+.El
+.Pp
+By design, this driver does not support VNICs.
+A given ENA device can only ever receive traffic for a single unicast
+MAC address and IP address combination, as determined by the AWS configuration.
+There is no support for promiscuous mode, or for receiving traffic for
+additional unicast or multicast addresses.
+.Sh CONFIGURATION
+The
+.Sy ena.conf
+file contains user-configurable parameters, each of which is described
+below.
+This file is read when an ENA device is found and an instance of the
+driver is attached to it.
+Changes made to this file do not affect running instances.
+Only instances attached after the changes will see the effects of
+those changes.
+Therefore, for a change to take effect on a running instance, the
+driver instance must be reloaded.
+That can be done by manually reloading the driver or by rebooting the
+system.
+.Sh PROPERTIES
+The configuration file can be found at
+.Pa /kernel/drv/ena.conf .
+.Bl -hang -width Ds
+.It Sy rx_queue_num_descs
+.Bd -filled -compact
+Minimum:
+.Sy 64 |
+Maximum:
+.Sy device dependent
+.Ed
+.Bd -filled -compact
+Default:
+.Sy device maximum
+.Ed
+.Bd -filled
+The
+.Sy rx_queue_num_descs
+property determines the number of descriptors provided by the Rx queue.
+Currently a single descriptor is equal to a single packet, but in the
+future it may be that a single packet consumes multiple descriptors.
+.Ed
+.It Sy rx_queue_intr_limit
+.Bd -filled -compact
+Minimum:
+.Sy 16 |
+Maximum:
+.Sy 4096
+.Ed
+.Bd -filled -compact
+Default:
+.Sy 256
+.Ed
+.Bd -filled
+The
+.Sy rx_queue_intr_limit
+property determines the number of frames an Rx interrupt will attempt to
+process before returning and claiming the interrupt.
+This is meant to keep the ENA Rx interrupt handler from consuming too
+much system time.
+In general, when a NIC becomes saturated with packets, the
+.Sy MAC
+layer will switch the driver into polling mode to reduce interrupt
+load.
+.Ed
+.It Sy tx_queue_num_descs
+.Bd -filled -compact
+Minimum:
+.Sy 64 |
+Maximum:
+.Sy device dependent
+.Ed
+.Bd -filled -compact
+Default:
+.Sy device maximum
+.Ed
+.Bd -filled
+The
+.Sy tx_queue_num_descs
+property determines the number of descriptors provided by the Tx queue.
+Currently a single descriptor is equal to a single packet, but in the
+future it may be that a single packet consumes multiple descriptors.
+.Ed
+.El
+.Sh FILES
+.Bl -tag -width Pa
+.It Pa /kernel/drv/amd64/ena
+Device driver (x86)
+.It Pa /kernel/drv/ena.conf
+Driver configuration file containing user-configurable options
+.El
+.Sh INTERFACE STABILITY
+The tunables in
+.Pa ena.conf
+are considered
+.Sy Evolving
+and may change in the future.
+.Sh SEE ALSO
+.Xr dladm 1M ,
+.Xr snoop 1M ,
+.Xr driver.conf 4 ,
+.Xr dlpi 7P
diff --git a/usr/src/pkg/manifests/driver-network-ena.p5m b/usr/src/pkg/manifests/driver-network-ena.p5m
new file mode 100644
index 0000000000..cd64e9c504
--- /dev/null
+++ b/usr/src/pkg/manifests/driver-network-ena.p5m
@@ -0,0 +1,36 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2021 Oxide Computer Company
+#
+
+<include global_zone_only_component>
+set name=pkg.fmri value=pkg:/driver/network/ena@$(PKGVERS)
+set name=pkg.summary value="AWS ENA Ethernet Driver"
+set name=pkg.description value="AWS ENA Ethernet Driver"
+set name=info.classification \
+ value=org.opensolaris.category.2008:Drivers/Networking
+set name=variant.arch value=i386
+dir path=kernel group=sys
+dir path=kernel/drv group=sys
+dir path=kernel/drv/$(ARCH64) group=sys
+file path=kernel/drv/$(ARCH64)/ena group=sys
+file path=kernel/drv/ena.conf group=sys
+dir path=usr/share/man
+dir path=usr/share/man/man7d
+file path=usr/share/man/man7d/ena.7d
+driver name=ena perms="* 0666 root sys" clone_perms="ena 0666 root sys" \
+ alias=pciex1d0f,ec2 \
+ alias=pciex1d0f,1ec2 \
+ alias=pciex1d0f,ec20 \
+ alias=pciex1d0f,ec21
+license lic_CDDL license=lic_CDDL
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index d768802685..00af839874 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -29,6 +29,7 @@
# Copyright 2016 OmniTI Computer Consulting, Inc. All rights reserved.
# Copyright 2016 Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
# Copyright 2020 RackTop Systems, Inc.
+# Copyright 2021 Oxide Computer Company
#
#
@@ -2288,3 +2289,10 @@ BNX_OBJS += \
#
MLXCX_OBJS += mlxcx.o mlxcx_dma.o mlxcx_cmd.o mlxcx_intr.o mlxcx_gld.o \
mlxcx_ring.o mlxcx_sensor.o
+
+#
+# ena(7D)
+#
+ENA_OBJS += ena.o ena_admin.o ena_dma.o ena_gld.o ena_hw.o ena_intr.o \
+ ena_stats.o ena_tx.o ena_rx.o
+
diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules
index 32a80767b2..78f01a1f9f 100644
--- a/usr/src/uts/common/Makefile.rules
+++ b/usr/src/uts/common/Makefile.rules
@@ -26,7 +26,7 @@
# Copyright 2019 Joyent, Inc.
# Copyright 2018 Nexenta Systems, Inc.
# Copyright (c) 2017 by Delphix. All rights reserved.
-# Copyright 2020 Oxide Computer Company
+# Copyright 2021 Oxide Computer Company
#
#
@@ -777,6 +777,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/elxl/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/ena/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/fcoe/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
diff --git a/usr/src/uts/common/io/ena/ena.c b/usr/src/uts/common/io/ena/ena.c
new file mode 100644
index 0000000000..b42f6350af
--- /dev/null
+++ b/usr/src/uts/common/io/ena/ena.c
@@ -0,0 +1,1944 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2021 Oxide Computer Company
+ */
+
+#include "ena_hw.h"
+#include "ena.h"
+
+/*
+ * Elastic Network Adapter (ENA) Driver
+ * ------------------------------------
+ *
+ * The ena driver provides support for the AWS ENA device, also
+ * referred to as their "enhanced networking". This device is present
+ * on "Nitro"-based instances. It presents itself with the following
+ * PCI Vendor/Device IDs
+ *
+ * o 1d0f:0ec2 -- ENA PF
+ * o 1d0f:1ec2 -- ENA PF (Reserved)
+ * o 1d0f:ec20 -- ENA VF
+ * o 1d0f:ec21 -- ENA VF (Reserved)
+ *
+ * This driver provides support for only the essential features needed
+ * to drive traffic on an ENA device. Support for the following
+ * features IS NOT currently implemented.
+ *
+ * o Admin Queue Interrupts: queue completion events are always polled
+ * o AENQ keep alive
+ * o FMA
+ * o Rx checksum offloads
+ * o Tx checksum offloads
+ * o Tx DMA bind (borrow buffers)
+ * o Rx DMA bind (loaned buffers)
+ * o TSO
+ * o RSS
+ * o Low Latency Queues (LLQ)
+ * o Support for different Tx completion policies
+ * o More controlled Tx recycling and Rx refill
+ *
+ * Even without these features the ena driver should perform
+ * reasonably well.
+ *
+ * Driver vs. Hardware Types
+ * -------------------------
+ *
+ * To properly communicate with the ENA device the driver must
+ * populate memory (registers and buffers) with specific types. These
+ * types are defined by the device and are found under the "common"
+ * (ena_com) code of the AWS Linux and FreeBSD drivers [1]. We have
+ * simplified this a bit by defining all device-specific types in the
+ * ena_hw.h file. Furthermore, all device-specific types are given an
+ * "enahw" prefix. This makes it clear when we are dealing with a
+ * device type and when we are dealing with a driver type.
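+ * For example, enahw_tx_desc_t describes a Tx submission descriptor
+ * exactly as the device lays it out, while ena_txq_t is a purely
+ * driver-side construct.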
+ *
+ * [1]: https://github.com/amzn/amzn-drivers
+ *
+ * Groups, Rings (Queues), and Interrupts
+ * --------------------------------------
+ *
+ * The ENA device presents one mac group. This single mac group
+ * represents the single unicast address that this device represents
+ * in your AWS instance. The ENA device presents no option for
+ * configuring additional MAC addresses, multicast, or promisc mode --
+ * you receive only what AWS wants you to receive.
+ *
+ * This single mac group may have one or more rings. The ENA driver
+ * refers to rings as queues, for no special reason other than it was
+ * the dominant language in the Linux and FreeBSD drivers, and it
+ * spilled over into this port. The upper bound on number of queues is
+ * presented by the device. However, we don't just go with whatever
+ * number of queues the device reports; but rather we limit the queues
+ * based on other factors such as an absolute maximum, number of
+ * online CPUs, and number of available interrupts. The upper bound is
+ * calculated by ena_set_max_io_queues(), and that is used and
+ * possibly further restricted in ena_attach_intr_alloc(). As this
+ * point, ultimately, it is the number of available interrupts (minus
+ * one for the admin queue) that determines the number of queues: one
+ * Tx and one Rx on each I/O interrupt.
+ *
+ * NOTE: Perhaps it is overly restrictive to limit the number of
+ * queues to the number of I/O interrupts. Something worth considering
+ * on larger instances if they present far fewer interrupts than they
+ * do queues + CPUs.
+ *
+ * The ENA device presents MSI-X interrupts only. During attach the
+ * driver queries the number of available interrupts and sets aside
+ * one for admin/AENQ (vector 0) and the rest for I/O (vector 1 to N).
+ * This means that a Tx/Rx queue at index 0 will map to vector 1, and
+ * so on.
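+ * For example, on an instance presenting eight usable MSI-X vectors,
+ * the driver would use vector 0 for admin/AENQ and vectors 1-7 for
+ * I/O, giving seven Tx queues and seven Rx queues (assuming the
+ * device and CPU counts allow that many).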
+ *
+ * NOTE: The ENA driver currently doesn't make use of the Admin Queue
+ * interrupt. This interrupt is used to notify the driver that a
+ * command response is ready. The ENA driver always polls the Admin
+ * Queue for responses.
+ *
+ * Tx Queue Workings
+ * -----------------
+ *
+ * A single Tx queue (ena_txq_t) is made up of one submission queue
+ * (SQ) and its paired completion queue (CQ). These two queues form a
+ * logical descriptor ring which is used to send packets out of the
+ * device -- where each SQ entry describes the packet to be sent
+ * (enahw_tx_desc_t) and each CQ entry describes the result of sending
+ * a packet (enahw_tx_cdesc_t). For this to work the host and device
+ * must agree on which descriptors are currently owned by the host
+ * (free for sending) and which are owned by the device (pending
+ * device completion). This state is tracked on the host side via head
+ * and tail indexes along with a phase value.
+ *
+ * The head and tail values represent the head and tail of the FIFO
+ * queue of pending packets -- the next packet to be sent by the
+ * device is head, and all descriptors up to tail are ready for
+ * sending. The phase allows the host to determine which CQ
+ * descriptors represent completed events when using per-SQ completion
+ * events (as opposed to queue head pointer updates). As the queues
+ * represent a logical ring buffer, the phase must alternate on
+ * wrap-around. The device initializes the phase to zero, and the host
+ * starts with a phase of 1. The first packet descriptor writes, and
+ * their corresponding completions, are indicated with a phase of 1.
+ *
+ *
+ * For example, the diagram below represents the SQ/CQ state after the
+ * first 6 packets have been sent by the host and 2 of them have been
+ * completed by the device (and these completions have been processed
+ * by the driver). In this state the host could send 4 more packets
+ * before needing to wait on completion events.
+ *
+ *
+ * +---+---+---+---+---+---+---+---+
+ * SQ | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | phase = 1
+ * +---+---+---+---+---+---+---+---+
+ * ^
+ * |
+ * tail
+ * head
+ * |
+ * v
+ * +---+---+---+---+---+---+---+---+
+ * CQ | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | phase = 1
+ * +---+---+---+---+---+---+---+---+
+ *
+ *
+ * The next diagram shows how the state changes as 5 more packets are
+ * sent (for a total of 11) and 7 more are completed (for a total of
+ * 9). Notice that as the SQ and CQ have wrapped around their phases
+ * have been complemented. In this state the host could send 6 more
+ * packets before needing to wait on completion events.
+ *
+ * +---+---+---+---+---+---+---+---+
+ * SQ | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | phase = 0
+ * +---+---+---+---+---+---+---+---+
+ * ^
+ * |
+ * tail
+ * head
+ * |
+ * v
+ * +---+---+---+---+---+---+---+---+
+ * CQ | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | phase = 0
+ * +---+---+---+---+---+---+---+---+
+ *
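+ * In code, completion processing amounts to walking the CQ until the
+ * phase bit of the next descriptor no longer matches the phase the
+ * host expects, flipping the expected phase on wrap-around. As a
+ * rough sketch (the identifiers here are illustrative, not
+ * necessarily the driver's exact names):
+ *
+ *     while (phase_of(cdesc) == expected_phase) {
+ *             recycle the TCB referenced by the completion;
+ *             head++;
+ *             head_mod = head & (num_descs - 1);
+ *             if (head_mod == 0)
+ *                     expected_phase = !expected_phase;
+ *             cdesc = &cq_descs[head_mod];
+ *     }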
+ *
+ * Currently, all packets are copied for Tx. At ring start we allocate
+ * a Tx Control Buffer (TCB) for each queue descriptor. Each TCB has a
+ * DMA buffer associated with it; and each buffer is large enough to
+ * hold the MTU. Therefore, Tx descriptors and TCBs currently have a
+ * 1:1 mapping. When a packet is sent, the mblk's buffer is copied to
+ * the TCB's DMA buffer, and a new descriptor is written to the SQ
+ * describing said TCB buffer. If and when we add more advanced
+ * features like DMA binding of mblks and TSO, this 1:1 guarantee will
+ * no longer hold.
+ *
+ * Rx Queue Workings
+ * -----------------
+ *
+ * In terms of implementing the logical descriptor ring, the Rx queues
+ * are very much like the Tx queues. There is a paired SQ and CQ for
+ * each logical ring. The difference is that in Rx the SQ is for
+ * handing buffers to the device to fill, and the CQ is for describing
+ * the contents of those buffers for a given received frame. At Rx
+ * ring start we allocate an Rx Control Buffer (RCB) for each
+ * descriptor in the ring. Each RCB has a DMA buffer associated with
+ * it; and each buffer is large enough to hold the MTU. For each
+ * received frame we copy the contents out of the RCB and into its own
+ * mblk, immediately returning the RCB for reuse. As with Tx, this
+ * gives us a simple 1:1 mapping currently, but if more advanced
+ * features are implemented later this could change.
+ *
+ * Asynchronous Event Notification Queue (AENQ)
+ * --------------------------------------------
+ *
+ * Each ENA device comes with a mechanism for sending out-of-band
+ * notifications to the driver. This includes events like link state
+ * changes, fatal errors, and a watchdog/keep alive signal. The AENQ
+ * delivery mechanism is via interrupt, handled by the ena_aenq_work()
+ * function, which dispatches via the eaenq_hdlrs table. If no handler
+ * is registered, the ena_aenq_default_hdlr() handler is used. A given
+ * device may not support all the different event types
+ * (enahw_aenq_groups_t); and the driver may choose to enable a subset
+ * of the supported events. During attach we call ena_setup_aenq() to
+ * negotiate the supported/enabled events. The enabled group is stored
+ * at ena_aenq_enabled_groups.
+ *
+ * Queues and Unsigned Wraparound
+ * ------------------------------
+ *
+ * All the queues use a uint16_t value as their head/tail values, e.g.
+ * the Rx queue's er_cq_head_idx value. You might notice that we only
+ * ever increment these values, letting them perform implicit unsigned
+ * integer wraparound. This is intended. This is the same behavior as
+ * the common code, and seems to be what the hardware expects. Of
+ * course, when accessing our own descriptor arrays we must make sure
+ * to first perform a modulo of this value or risk running off into
+ * space.
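+ * For example (illustrative numbers), with a ring of 1024 descriptors
+ * a head value of 65535 maps to array index 65535 & 1023 = 1023;
+ * incrementing the head wraps it to 0, which maps to array index 0.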
+ *
+ * Attach Sequencing
+ * -----------------
+ *
+ * Most drivers implement their attach/detach/cleanup functions as a
+ * sequential stream of function calls used to allocate and initialize
+ * resources in an order determined by the device's programming manual
+ * combined with any requirements imposed by the kernel and its
+ * relevant modules. These functions can become quite long. It is
+ * often hard to see the order in which steps are taken, and even
+ * harder to tell if detach/cleanup undoes them in the correct order,
+ * or even if it undoes them at all! The only sure way to understand
+ * the flow is to take good notes while closely inspecting each line
+ * of code. Even then, it's easy for attach and detach to get out of
+ * sync.
+ *
+ * Some more recent drivers have improved on this situation by using a
+ * bit vector to track the sequence of events in attach/detach. Each
+ * bit is declared as an enum value, in the same order attach is
+ * expected to run, and thus detach would run in the exact
+ * opposite order. This has three main benefits:
+ *
+ * 1. It makes it easier to determine sequence order at a
+ * glance.
+ *
+ * 2. It gives a better idea of what state the device is in during
+ * debugging (the sequence bit vector is kept with the instance
+ * state).
+ *
+ * 3. The detach function can verify that all sequence bits are
+ * cleared, indicating that everything done in attach was
+ * successfully undone.
+ *
+ * These are great improvements. However, the attach/detach functions
+ * can still become unruly, and there is still no guarantee that
+ * detach is done in opposite order of attach (this is not always
+ * strictly required, but is probably the best way to write detach).
+ * There is still a lot of boilerplate and chance for programmer
+ * error.
+ *
+ * The ena driver takes the sequence idea a bit further, creating a
+ * descriptor table of the attach sequence (ena_attach_tbl). This
+ * table is used by attach/detach to generically, declaratively, and
+ * programmatically enforce the precise sequence order and verify that
+ * anything that is done is undone. This provides several benefits:
+ *
+ * o Correct order is enforced implicitly by the descriptor table.
+ * It is impossible for the detach sequence to run in any order
+ * other than the opposite of attach.
+ *
+ * o It is obvious what the precise attach sequence is. While the
+ * bit vector enum helps a lot with this, it doesn't prevent
+ * programmer error. With the sequence defined as a declarative
+ * table it makes it easy for the programmer to see the order and
+ * know it's followed exactly.
+ *
+ * o It is impossible to modify the attach sequence without also
+ * specifying a callback for its dual in the detach sequence.
+ *
+ * o Common and repetitive code like error checking, logging, and bit
+ * vector modification is eliminated and centralized, again
+ * reducing the chance of programmer error.
+ *
+ * The ena attach sequence is defined under ena_attach_seq_t. The
+ * descriptor table is defined under ena_attach_tbl.
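+ *
+ * In rough outline (a sketch of the idea rather than the literal
+ * code), attach walks the table forward, unwinding in reverse on
+ * failure, and detach walks the whole table in reverse:
+ *
+ *     for (i = 0; i < ENA_ATTACH_NUM_ENTRIES; i++)
+ *             if (!ena_attach_tbl[i].ead_attach_fn(ena))
+ *                     undo entries [0, i) in reverse and fail
+ *
+ *     for (i = ENA_ATTACH_NUM_ENTRIES; i > 0; i--)
+ *             ena_attach_tbl[i - 1].ead_cleanup_fn(ena)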
+ */
+
+/*
+ * These are some basic data layout invariants on which development
+ * assumptions were made.
+ */
+CTASSERT(sizeof (enahw_aenq_desc_t) == 64);
+/* TODO: Why doesn't this work? */
+/* CTASSERT(sizeof (enahw_tx_data_desc_t) == 64); */
+CTASSERT(sizeof (enahw_tx_data_desc_t) == sizeof (enahw_tx_meta_desc_t));
+CTASSERT(sizeof (enahw_tx_data_desc_t) == sizeof (enahw_tx_desc_t));
+CTASSERT(sizeof (enahw_tx_meta_desc_t) == sizeof (enahw_tx_desc_t));
+/*
+ * We add this here as an extra safety check to make sure that any
+ * addition to the AENQ group enum also updates the groups array num
+ * value.
+ */
+CTASSERT(ENAHW_AENQ_GROUPS_ARR_NUM == 6);
+
+/*
+ * Amazon does not specify the endianness of the ENA device. We assume
+ * it's the same as the bus, and we assume the CPU/bus is always
+ * little endian.
+ */
+#ifdef _BIG_ENDIAN
+#error "ENA driver is little-endian only"
+#endif
+
+/*
+ * These values are used to communicate the driver version to the AWS
+ * hypervisor via the ena_set_host_info() function. We don't know what
+ * exactly AWS does with this info, but it's fairly safe to assume
+ * it's used solely for debug/informational purposes. The Linux driver
+ * updates these values frequently as bugs are fixed and features are
+ * added.
+ */
+#define ENA_DRV_VER_MAJOR 1
+#define ENA_DRV_VER_MINOR 0
+#define ENA_DRV_VER_SUBMINOR 0
+
+uint64_t ena_admin_cmd_timeout_ns = ENA_ADMIN_CMD_DEF_TIMEOUT;
+
+/*
+ * Log an error message. We leave the destination (console or system
+ * log) up to the caller.
+ */
+void
+ena_err(const ena_t *ena, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ if (ena != NULL && ena->ena_dip != NULL) {
+ vdev_err(ena->ena_dip, CE_WARN, fmt, ap);
+ } else {
+ vcmn_err(CE_WARN, fmt, ap);
+ }
+ va_end(ap);
+}
+
+/*
+ * Set this to B_TRUE to enable debug messages.
+ */
+boolean_t ena_debug = B_FALSE;
+
+/*
+ * Log a debug message. We force all debug messages to go to the
+ * system log.
+ */
+void
+ena_dbg(const ena_t *ena, const char *fmt, ...)
+{
+ va_list ap;
+
+ if (ena_debug) {
+ char msg[1024];
+
+ va_start(ap, fmt);
+ (void) vsnprintf(msg, sizeof (msg), fmt, ap);
+ va_end(ap);
+
+ if (ena != NULL && ena->ena_dip != NULL) {
+ dev_err(ena->ena_dip, CE_NOTE, "!%s", msg);
+ } else {
+ cmn_err(CE_NOTE, "!%s", msg);
+ }
+ }
+}
+
+ena_aenq_grpstr_t ena_groups_str[ENAHW_AENQ_GROUPS_ARR_NUM] = {
+ { .eag_type = ENAHW_AENQ_GROUP_LINK_CHANGE, .eag_str = "LINK CHANGE" },
+ { .eag_type = ENAHW_AENQ_GROUP_FATAL_ERROR, .eag_str = "FATAL ERROR" },
+ { .eag_type = ENAHW_AENQ_GROUP_WARNING, .eag_str = "WARNING" },
+ {
+ .eag_type = ENAHW_AENQ_GROUP_NOTIFICATION,
+ .eag_str = "NOTIFICATION"
+ },
+ { .eag_type = ENAHW_AENQ_GROUP_KEEP_ALIVE, .eag_str = "KEEP ALIVE" },
+ {
+ .eag_type = ENAHW_AENQ_GROUP_REFRESH_CAPABILITIES,
+ .eag_str = "REFRESH CAPABILITIES"
+ },
+};
+
+void
+ena_aenq_work(ena_t *ena)
+{
+ ena_aenq_t *aenq = &ena->ena_aenq;
+ uint16_t head_mod = aenq->eaenq_head & (aenq->eaenq_num_descs - 1);
+ boolean_t processed = B_FALSE;
+ enahw_aenq_desc_t *desc = &aenq->eaenq_descs[head_mod];
+ uint64_t ts;
+
+ ts = ((uint64_t)desc->ead_ts_high << 32) | (uint64_t)desc->ead_ts_low;
+ ENA_DMA_SYNC(aenq->eaenq_dma, DDI_DMA_SYNC_FORKERNEL);
+
+ while (ENAHW_AENQ_DESC_PHASE(desc) == aenq->eaenq_phase) {
+ ena_aenq_hdlr_t hdlr;
+
+ ASSERT3U(desc->ead_group, <, ENAHW_AENQ_GROUPS_ARR_NUM);
+ processed = B_TRUE;
+ ena_dbg(ena, "AENQ Group: (0x%x) %s Syndrome: 0x%x ts: %" PRIu64
+ " us", desc->ead_group,
+ ena_groups_str[desc->ead_group].eag_str, desc->ead_syndrome,
+ ts);
+
+ hdlr = ena->ena_aenq.eaenq_hdlrs[desc->ead_group];
+ hdlr(ena, desc);
+
+ aenq->eaenq_head++;
+ head_mod = aenq->eaenq_head & (aenq->eaenq_num_descs - 1);
+
+ if (head_mod == 0) {
+ aenq->eaenq_phase = !aenq->eaenq_phase;
+ }
+
+ desc = &aenq->eaenq_descs[head_mod];
+ }
+
+ if (processed) {
+ ena_hw_bar_write32(ena, ENAHW_REG_AENQ_HEAD_DB,
+ aenq->eaenq_head);
+ }
+}
+
+/*
+ * Use for attach sequences which perform no resource allocation (or
+ * global state modification) and thus require no subsequent
+ * deallocation.
+ */
+static void
+ena_no_cleanup(ena_t *ena)
+{
+}
+
+static boolean_t
+ena_attach_pci(ena_t *ena)
+{
+ ddi_acc_handle_t hdl;
+
+ if (pci_config_setup(ena->ena_dip, &hdl) != 0) {
+ return (B_FALSE);
+ }
+
+ ena->ena_pci_hdl = hdl;
+ ena->ena_pci_vid = pci_config_get16(hdl, PCI_CONF_VENID);
+ ena->ena_pci_did = pci_config_get16(hdl, PCI_CONF_DEVID);
+ ena->ena_pci_rev = pci_config_get8(hdl, PCI_CONF_REVID);
+ ena->ena_pci_svid = pci_config_get16(hdl, PCI_CONF_SUBVENID);
+ ena->ena_pci_sdid = pci_config_get16(hdl, PCI_CONF_SUBSYSID);
+ ena_dbg(ena, "vid: 0x%x did: 0x%x rev: 0x%x svid: 0x%x sdid: 0x%x",
+ ena->ena_pci_vid, ena->ena_pci_did, ena->ena_pci_rev,
+ ena->ena_pci_svid, ena->ena_pci_sdid);
+
+ return (B_TRUE);
+}
+
+static void
+ena_cleanup_pci(ena_t *ena)
+{
+ pci_config_teardown(&ena->ena_pci_hdl);
+}
+
+static void
+ena_cleanup_regs_map(ena_t *ena)
+{
+ ddi_regs_map_free(&ena->ena_reg_hdl);
+}
+
+static boolean_t
+ena_attach_regs_map(ena_t *ena)
+{
+ int ret = 0;
+
+ if (ddi_dev_regsize(ena->ena_dip, ENA_REG_NUMBER, &ena->ena_reg_size) !=
+ DDI_SUCCESS) {
+ ena_err(ena, "failed to get register set %d size",
+ ENA_REG_NUMBER);
+ return (B_FALSE);
+ }
+
+ ena_dbg(ena, "register size: %ld", ena->ena_reg_size);
+ bzero(&ena->ena_reg_attr, sizeof (ena->ena_reg_attr));
+ ena->ena_reg_attr.devacc_attr_version = DDI_DEVICE_ATTR_V1;
+ ena->ena_reg_attr.devacc_attr_endian_flags = DDI_NEVERSWAP_ACC;
+ ena->ena_reg_attr.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
+
+ /*
+ * This function can return several different failure values,
+ * so we make sure to capture its return value for the purpose
+ * of logging.
+ */
+ ret = ddi_regs_map_setup(ena->ena_dip, ENA_REG_NUMBER,
+ &ena->ena_reg_base, 0, ena->ena_reg_size, &ena->ena_reg_attr,
+ &ena->ena_reg_hdl);
+
+ if (ret != DDI_SUCCESS) {
+ ena_err(ena, "failed to map register set %d: %d",
+ ENA_REG_NUMBER, ret);
+ return (B_FALSE);
+ }
+
+ ena_dbg(ena, "registers mapped to base: 0x%p",
+ (void *)ena->ena_reg_base);
+
+ return (B_TRUE);
+}
+
+/*
+ * Free any resources related to the admin submission queue.
+ */
+static void
+ena_admin_sq_free(ena_t *ena)
+{
+ ena_dma_free(&ena->ena_aq.ea_sq.eas_dma);
+}
+
+/*
+ * Initialize the admin submission queue.
+ */
+static boolean_t
+ena_admin_sq_init(ena_t *ena)
+{
+ ena_adminq_t *aq = &ena->ena_aq;
+ ena_dma_buf_t *dma = &aq->ea_sq.eas_dma;
+ size_t size = aq->ea_qlen * sizeof (*aq->ea_sq.eas_entries);
+ uint32_t addr_low, addr_high, wval;
+ ena_dma_conf_t conf = {
+ .edc_size = size,
+ .edc_align = ENAHW_ADMIN_SQ_DESC_BUF_ALIGNMENT,
+ .edc_sgl = 1,
+ .edc_endian = DDI_NEVERSWAP_ACC,
+ .edc_stream = B_FALSE,
+ };
+
+ if (!ena_dma_alloc(ena, dma, &conf, size)) {
+ ena_err(ena, "failed to allocate DMA for Admin SQ");
+ return (B_FALSE);
+ }
+
+ aq->ea_sq.eas_entries = (void *)dma->edb_va;
+ aq->ea_sq.eas_tail = 0;
+ aq->ea_sq.eas_phase = 1;
+ aq->ea_sq.eas_dbaddr =
+ (uint32_t *)(ena->ena_reg_base + ENAHW_REG_ASQ_DB);
+ ENA_DMA_VERIFY_ADDR(ena, dma->edb_cookie->dmac_laddress);
+ addr_low = (uint32_t)(dma->edb_cookie->dmac_laddress);
+ addr_high = (uint32_t)(dma->edb_cookie->dmac_laddress >> 32);
+ ena_hw_bar_write32(ena, ENAHW_REG_ASQ_BASE_LO, addr_low);
+ ena_hw_bar_write32(ena, ENAHW_REG_ASQ_BASE_HI, addr_high);
+ wval = ENAHW_ASQ_CAPS_DEPTH(aq->ea_qlen) |
+ ENAHW_ASQ_CAPS_ENTRY_SIZE(sizeof (*aq->ea_sq.eas_entries));
+ ena_hw_bar_write32(ena, ENAHW_REG_ASQ_CAPS, wval);
+ return (B_TRUE);
+}
+
+/*
+ * Free any resources related to the admin completion queue.
+ */
+static void
+ena_admin_cq_free(ena_t *ena)
+{
+ ena_dma_free(&ena->ena_aq.ea_cq.eac_dma);
+}
+
+/*
+ * Initialize the admin completion queue.
+ */
+static boolean_t
+ena_admin_cq_init(ena_t *ena)
+{
+ ena_adminq_t *aq = &ena->ena_aq;
+ ena_dma_buf_t *dma = &aq->ea_cq.eac_dma;
+ size_t size = aq->ea_qlen * sizeof (*aq->ea_cq.eac_entries);
+ uint32_t addr_low, addr_high, wval;
+ ena_dma_conf_t conf = {
+ .edc_size = size,
+ .edc_align = ENAHW_ADMIN_CQ_DESC_BUF_ALIGNMENT,
+ .edc_sgl = 1,
+ .edc_endian = DDI_NEVERSWAP_ACC,
+ .edc_stream = B_FALSE,
+ };
+
+ if (!ena_dma_alloc(ena, dma, &conf, size)) {
+ ena_err(ena, "failed to allocate DMA for Admin CQ");
+ return (B_FALSE);
+ }
+
+ aq->ea_cq.eac_entries = (void *)dma->edb_va;
+ aq->ea_cq.eac_head = 0;
+ aq->ea_cq.eac_phase = 1;
+ ENA_DMA_VERIFY_ADDR(ena, dma->edb_cookie->dmac_laddress);
+ addr_low = (uint32_t)(dma->edb_cookie->dmac_laddress);
+ addr_high = (uint32_t)(dma->edb_cookie->dmac_laddress >> 32);
+ ena_hw_bar_write32(ena, ENAHW_REG_ACQ_BASE_LO, addr_low);
+ ena_hw_bar_write32(ena, ENAHW_REG_ACQ_BASE_HI, addr_high);
+ wval = ENAHW_ACQ_CAPS_DEPTH(aq->ea_qlen) |
+ ENAHW_ACQ_CAPS_ENTRY_SIZE(sizeof (*aq->ea_cq.eac_entries));
+ ena_hw_bar_write32(ena, ENAHW_REG_ACQ_CAPS, wval);
+ return (B_TRUE);
+}
+
+static void
+ena_aenq_default_hdlr(void *data, enahw_aenq_desc_t *desc)
+{
+ ena_t *ena = data;
+
+ ena->ena_aenq_stat.eaes_default.value.ui64++;
+ ena_dbg(ena, "unimplemented handler for aenq group: %s",
+ ena_groups_str[desc->ead_group].eag_str);
+}
+
+static void
+ena_aenq_link_change_hdlr(void *data, enahw_aenq_desc_t *desc)
+{
+ ena_t *ena = data;
+ boolean_t is_up = (desc->ead_payload.link_change.flags &
+ ENAHW_AENQ_LINK_CHANGE_LINK_STATUS_MASK) != 0;
+
+ /*
+ * The interrupts are not enabled until after we register mac,
+ * so the mac handle should be valid.
+ */
+ ASSERT3U(ena->ena_attach_seq, >=, ENA_ATTACH_MAC_REGISTER);
+ ena->ena_aenq_stat.eaes_link_change.value.ui64++;
+
+ mutex_enter(&ena->ena_lock);
+
+ /*
+ * Notify mac only on an actual change in status.
+ */
+ if (ena->ena_link_up != is_up) {
+ if (is_up) {
+ mac_link_update(ena->ena_mh, LINK_STATE_UP);
+ } else {
+ mac_link_update(ena->ena_mh, LINK_STATE_DOWN);
+ }
+ }
+
+ ena->ena_link_up = is_up;
+
+ mutex_exit(&ena->ena_lock);
+}
+
+/*
+ * Free any resources related to the Async Event Notification Queue.
+ */
+static void
+ena_aenq_free(ena_t *ena)
+{
+ ena_dma_free(&ena->ena_aenq.eaenq_dma);
+}
+
+static void
+ena_aenq_set_def_hdlrs(ena_aenq_t *aenq)
+{
+ aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_LINK_CHANGE] = ena_aenq_default_hdlr;
+ aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_FATAL_ERROR] = ena_aenq_default_hdlr;
+ aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_WARNING] = ena_aenq_default_hdlr;
+ aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_NOTIFICATION] =
+ ena_aenq_default_hdlr;
+ aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_KEEP_ALIVE] = ena_aenq_default_hdlr;
+ aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_REFRESH_CAPABILITIES] =
+ ena_aenq_default_hdlr;
+}
+/*
+ * Initialize the Async Event Notification Queue.
+ */
+static boolean_t
+ena_aenq_init(ena_t *ena)
+{
+ ena_aenq_t *aenq = &ena->ena_aenq;
+ size_t size;
+ uint32_t addr_low, addr_high, wval;
+ ena_dma_conf_t conf;
+
+ aenq->eaenq_num_descs = ENA_AENQ_NUM_DESCS;
+ size = aenq->eaenq_num_descs * sizeof (*aenq->eaenq_descs);
+
+ conf = (ena_dma_conf_t) {
+ .edc_size = size,
+ .edc_align = ENAHW_AENQ_DESC_BUF_ALIGNMENT,
+ .edc_sgl = 1,
+ .edc_endian = DDI_NEVERSWAP_ACC,
+ .edc_stream = B_FALSE,
+ };
+
+ if (!ena_dma_alloc(ena, &aenq->eaenq_dma, &conf, size)) {
+ ena_err(ena, "failed to allocate DMA for AENQ");
+ return (B_FALSE);
+ }
+
+ aenq->eaenq_descs = (void *)aenq->eaenq_dma.edb_va;
+ aenq->eaenq_head = 0;
+ aenq->eaenq_phase = 1;
+ bzero(aenq->eaenq_descs, size);
+ ena_aenq_set_def_hdlrs(aenq);
+
+ aenq->eaenq_hdlrs[ENAHW_AENQ_GROUP_LINK_CHANGE] =
+ ena_aenq_link_change_hdlr;
+
+ ENA_DMA_VERIFY_ADDR(ena, aenq->eaenq_dma.edb_cookie->dmac_laddress);
+ addr_low = (uint32_t)(aenq->eaenq_dma.edb_cookie->dmac_laddress);
+ addr_high = (uint32_t)(aenq->eaenq_dma.edb_cookie->dmac_laddress >> 32);
+ ena_hw_bar_write32(ena, ENAHW_REG_AENQ_BASE_LO, addr_low);
+ ena_hw_bar_write32(ena, ENAHW_REG_AENQ_BASE_HI, addr_high);
+ ENA_DMA_SYNC(aenq->eaenq_dma, DDI_DMA_SYNC_FORDEV);
+ wval = ENAHW_AENQ_CAPS_DEPTH(aenq->eaenq_num_descs) |
+ ENAHW_AENQ_CAPS_ENTRY_SIZE(sizeof (*aenq->eaenq_descs));
+ ena_hw_bar_write32(ena, ENAHW_REG_AENQ_CAPS, wval);
+ return (B_TRUE);
+}
+
+/*
+ * We limit the max number of I/O queues based on several aspects of
+ * the underlying hardware.
+ *
+ * 1. The absolute upper limit is set by ENAHW_MAX_NUM_IO_QUEUES,
+ * which comes from the common code and presumably is based on device
+ * constraints.
+ *
+ * 2. Next we latch the number of I/O queues to the number of online
+ * CPUs. The idea being that each queue is a parallel work stream,
+ * and having more queues than CPUs to flush them will not improve
+ * performance. The number of online CPUs can change dynamically,
+ * and that's okay, everything should still work fine, it just
+ * might not be ideal.
+ *
+ * 3. Next we latch the number of I/O queues to the smallest of the
+ * max Tx queues and max Rx queues. We could probably loosen this
+ * restriction in the future, and have separate max I/O queues for
+ * Tx and Rx. This is what Linux does, and seems like a fine place
+ * to start.
+ */
+static void
+ena_set_max_io_queues(ena_t *ena)
+{
+ uint32_t max = ENAHW_MAX_NUM_IO_QUEUES;
+
+ max = MIN(ncpus_online, max);
+ /*
+ * Supposedly a device could present a different number of SQs
+ * and CQs. This driver is designed in a way that requires
+ * each SQ to have a corresponding and dedicated CQ (how would
+ * it work otherwise). Therefore, we must check both values
+ * and find the minimum between them.
+ */
+ max = MIN(ena->ena_tx_max_sq_num, max);
+ max = MIN(ena->ena_tx_max_cq_num, max);
+ max = MIN(ena->ena_rx_max_sq_num, max);
+ max = MIN(ena->ena_rx_max_cq_num, max);
+
+
+ /* This shouldn't happen, but just in case. */
+ if (max == 0) {
+ max = 1;
+ }
+
+ ena->ena_max_io_queues = max;
+}
+
+/*
+ * We require that an Rx or Tx buffer be able to hold the maximum MTU
+ * along with the maximum frame header length. In this case we know
+ * ENA is presenting us an Ethernet frame so we add the size of an
+ * Ethernet VLAN header. Rx has the additional requirement of needing
+ * additional margin for the sake of IP header alignment.
+ */
+static void
+ena_update_buf_sizes(ena_t *ena)
+{
+ ena->ena_max_frame_hdr = sizeof (struct ether_vlan_header);
+ ena->ena_max_frame_total = ena->ena_max_frame_hdr + ena->ena_mtu;
+ ena->ena_tx_buf_sz = P2ROUNDUP_TYPED(ena->ena_max_frame_total,
+ ena->ena_page_sz, uint32_t);
+ ena->ena_rx_buf_sz = P2ROUNDUP_TYPED(ena->ena_max_frame_total +
+ ENA_RX_BUF_IPHDR_ALIGNMENT, ena->ena_page_sz, uint32_t);
+}
+
+static boolean_t
+ena_get_offloads(ena_t *ena)
+{
+ int ret = 0;
+ enahw_resp_desc_t resp;
+ enahw_feat_offload_t *feat = &resp.erd_resp.erd_get_feat.ergf_offload;
+
+ ena->ena_tx_l3_ipv4_csum = B_FALSE;
+
+ ena->ena_tx_l4_ipv4_part_csum = B_FALSE;
+ ena->ena_tx_l4_ipv4_full_csum = B_FALSE;
+ ena->ena_tx_l4_ipv4_lso = B_FALSE;
+
+ ena->ena_tx_l4_ipv6_part_csum = B_FALSE;
+ ena->ena_tx_l4_ipv6_full_csum = B_FALSE;
+ ena->ena_tx_l4_ipv6_lso = B_FALSE;
+
+ ena->ena_rx_l3_ipv4_csum = B_FALSE;
+ ena->ena_rx_l4_ipv4_csum = B_FALSE;
+ ena->ena_rx_l4_ipv6_csum = B_FALSE;
+ ena->ena_rx_hash = B_FALSE;
+
+ bzero(&resp, sizeof (resp));
+ ret = ena_get_feature(ena, &resp, ENAHW_FEAT_STATELESS_OFFLOAD_CONFIG,
+ ENAHW_FEAT_STATELESS_OFFLOAD_CONFIG_VER);
+
+ if (ret == ENOTSUP) {
+ /*
+ * In this case the device does not support querying
+ * for hardware offloads. We take that as a sign that
+ * the device provides no offloads.
+ */
+ return (B_TRUE);
+ } else if (ret != 0) {
+ ena_err(ena, "error getting stateless offload: %d", ret);
+ return (B_FALSE);
+ }
+
+ ena->ena_tx_l3_ipv4_csum = ENAHW_FEAT_OFFLOAD_TX_L3_IPV4_CSUM(feat);
+
+ ena->ena_tx_l4_ipv4_part_csum =
+ ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_PART(feat);
+ ena->ena_tx_l4_ipv4_full_csum =
+ ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_FULL(feat);
+ ena->ena_tx_l4_ipv4_lso = ENAHW_FEAT_OFFLOAD_TSO_IPV4(feat);
+
+ ena->ena_tx_l4_ipv6_part_csum =
+ ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_PART(feat);
+ ena->ena_tx_l4_ipv6_full_csum =
+ ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_FULL(feat);
+ ena->ena_tx_l4_ipv6_lso = ENAHW_FEAT_OFFLOAD_TSO_IPV6(feat);
+
+ ena->ena_rx_l3_ipv4_csum = ENAHW_FEAT_OFFLOAD_RX_L3_IPV4_CSUM(feat);
+ ena->ena_rx_l4_ipv4_csum = ENAHW_FEAT_OFFLOAD_RX_L4_IPV4_CSUM(feat);
+ ena->ena_rx_l4_ipv6_csum = ENAHW_FEAT_OFFLOAD_RX_L4_IPV6_CSUM(feat);
+ return (B_TRUE);
+}
+
+static int
+ena_get_prop(ena_t *ena, char *propname, const int minval, const int maxval,
+ const int defval)
+{
+ int value = ddi_prop_get_int(DDI_DEV_T_ANY, ena->ena_dip,
+ DDI_PROP_DONTPASS, propname, defval);
+
+ if (value > maxval) {
+ ena_err(ena, "user value %s=%d exceeded maximum, setting to %d",
+ propname, value, maxval);
+ value = maxval;
+ }
+
+ if (value < minval) {
+ ena_err(ena, "user value %s=%d below minimum, setting to %d",
+ propname, value, minval);
+ value = minval;
+ }
+
+ return (value);
+}
+
+static boolean_t
+ena_set_mtu(ena_t *ena)
+{
+ int ret = 0;
+ enahw_cmd_desc_t cmd;
+ enahw_feat_mtu_t *feat = &cmd.ecd_cmd.ecd_set_feat.ecsf_feat.ecsf_mtu;
+ enahw_resp_desc_t resp;
+
+ bzero(&cmd, sizeof (cmd));
+ bzero(&resp, sizeof (resp));
+ feat->efm_mtu = ena->ena_mtu;
+
+ if ((ret = ena_set_feature(ena, &cmd, &resp, ENAHW_FEAT_MTU,
+ ENAHW_FEAT_MTU_VER)) != 0) {
+ ena_err(ena, "failed to set device MTU to %u: %d", ena->ena_mtu,
+ ret);
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+}
+
+static void
+ena_get_link_config(ena_t *ena)
+{
+ enahw_resp_desc_t resp;
+ enahw_feat_link_conf_t *feat =
+ &resp.erd_resp.erd_get_feat.ergf_link_conf;
+ boolean_t full_duplex;
+
+ bzero(&resp, sizeof (resp));
+
+ if (ena_get_feature(ena, &resp, ENAHW_FEAT_LINK_CONFIG,
+ ENAHW_FEAT_LINK_CONFIG_VER) != 0) {
+ /*
+ * Some ENA devices do not support this feature. In
+ * those cases we report a 1Gbps link, full duplex.
+ * For the most accurate information on bandwidth
+ * limits see the official AWS documentation.
+ */
+ ena->ena_link_speed_mbits = 1 * 1000 * 1000;
+ ena->ena_link_speeds = ENAHW_LINK_SPEED_1G;
+ ena->ena_link_duplex = LINK_DUPLEX_FULL;
+ ena->ena_link_autoneg = B_TRUE;
+ return;
+ }
+
+ ena->ena_link_speed_mbits = feat->eflc_speed;
+ ena->ena_link_speeds = feat->eflc_supported;
+ full_duplex = ENAHW_FEAT_LINK_CONF_FULL_DUPLEX(feat);
+ ena->ena_link_duplex = full_duplex ? LINK_DUPLEX_FULL :
+ LINK_DUPLEX_HALF;
+ ena->ena_link_autoneg = ENAHW_FEAT_LINK_CONF_AUTONEG(feat);
+}
+
+/*
+ * Retrieve all configuration values which are modifiable via
+ * ena.conf, and set ena_t members accordingly. While the conf values
+ * have priority, they may be implicitly modified by the driver to
+ * meet resource constraints on a given platform. If no value is
+ * specified in the conf file, the driver will attempt to use the
+ * largest value supported. While no supported value should be large
+ * enough to overflow an int, keep in mind that ena_get_prop() will
+ * cast the values to an int.
+ *
+ * This function should be called after the device is initialized,
+ * admin queue is established, and the hardware features/capabs have
+ * been queried; it should be called before mac registration.
+ */
+static boolean_t
+ena_attach_read_conf(ena_t *ena)
+{
+ uint32_t gcv; /* Greatest Common Value */
+
+ /*
+ * We expect that the queue lengths are the same for both the
+ * CQ and SQ, but technically the device could return
+ * different lengths. For now the driver locks them together.
+ */
+ gcv = min(ena->ena_rx_max_sq_num_descs, ena->ena_rx_max_cq_num_descs);
+ ASSERT3U(gcv, <=, INT_MAX);
+ ena->ena_rxq_num_descs = ena_get_prop(ena, ENA_PROP_RXQ_NUM_DESCS,
+ ENA_PROP_RXQ_NUM_DESCS_MIN, gcv, gcv);
+
+ ena->ena_rxq_intr_limit = ena_get_prop(ena, ENA_PROP_RXQ_INTR_LIMIT,
+ ENA_PROP_RXQ_INTR_LIMIT_MIN, ENA_PROP_RXQ_INTR_LIMIT_MAX,
+ ENA_PROP_RXQ_INTR_LIMIT_DEF);
+
+ gcv = min(ena->ena_tx_max_sq_num_descs, ena->ena_tx_max_cq_num_descs);
+ ASSERT3U(gcv, <=, INT_MAX);
+ ena->ena_txq_num_descs = ena_get_prop(ena, ENA_PROP_TXQ_NUM_DESCS,
+ ENA_PROP_TXQ_NUM_DESCS_MIN, gcv, gcv);
+
+ return (B_TRUE);
+}
+
+/*
+ * Perform any necessary device configuration after the driver.conf
+ * has been read.
+ */
+static boolean_t
+ena_attach_dev_cfg(ena_t *ena)
+{
+ ASSERT3U(ena->ena_attach_seq, >=, ENA_ATTACH_READ_CONF);
+
+ if (!ena_set_mtu(ena)) {
+ /*
+ * We don't expect this to fail, but we try a fallback
+ * first before failing the attach sequence.
+ */
+ ena->ena_mtu = 1500;
+ ena_err(ena, "trying fallback MTU: %u", ena->ena_mtu);
+
+ if (!ena_set_mtu(ena)) {
+ return (B_FALSE);
+ }
+ }
+
+ return (B_TRUE);
+}
+
+static boolean_t
+ena_check_versions(ena_t *ena)
+{
+ uint32_t dev_vsn = ena_hw_bar_read32(ena, ENAHW_REG_VERSION);
+ uint32_t ctrl_vsn =
+ ena_hw_bar_read32(ena, ENAHW_REG_CONTROLLER_VERSION);
+
+ ena->ena_dev_major_vsn = ENAHW_DEV_MAJOR_VSN(dev_vsn);
+ ena->ena_dev_minor_vsn = ENAHW_DEV_MINOR_VSN(dev_vsn);
+
+ ena->ena_ctrl_major_vsn = ENAHW_CTRL_MAJOR_VSN(ctrl_vsn);
+ ena->ena_ctrl_minor_vsn = ENAHW_CTRL_MINOR_VSN(ctrl_vsn);
+ ena->ena_ctrl_subminor_vsn = ENAHW_CTRL_SUBMINOR_VSN(ctrl_vsn);
+ ena->ena_ctrl_impl_id = ENAHW_CTRL_IMPL_ID(ctrl_vsn);
+
+ if (ena->ena_ctrl_subminor_vsn < ENA_CTRL_SUBMINOR_VSN_MIN) {
+ ena_err(ena, "unsupported controller version: %u.%u.%u",
+ ena->ena_ctrl_major_vsn, ena->ena_ctrl_minor_vsn,
+ ena->ena_ctrl_subminor_vsn);
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+}
+
+boolean_t
+ena_setup_aenq(ena_t *ena)
+{
+ enahw_cmd_desc_t cmd;
+ enahw_feat_aenq_t *cmd_feat =
+ &cmd.ecd_cmd.ecd_set_feat.ecsf_feat.ecsf_aenq;
+ enahw_resp_desc_t resp;
+ enahw_feat_aenq_t *resp_feat = &resp.erd_resp.erd_get_feat.ergf_aenq;
+ enahw_aenq_groups_t to_enable;
+
+ bzero(&resp, sizeof (resp));
+ if (ena_get_feature(ena, &resp, ENAHW_FEAT_AENQ_CONFIG,
+ ENAHW_FEAT_AENQ_CONFIG_VER) != 0) {
+ return (B_FALSE);
+ }
+
+ to_enable = BIT(ENAHW_AENQ_GROUP_LINK_CHANGE) |
+ BIT(ENAHW_AENQ_GROUP_FATAL_ERROR) |
+ BIT(ENAHW_AENQ_GROUP_WARNING) |
+ BIT(ENAHW_AENQ_GROUP_NOTIFICATION);
+ to_enable &= resp_feat->efa_supported_groups;
+
+ bzero(&cmd, sizeof (cmd));
+ bzero(&resp, sizeof (cmd));
+ cmd_feat->efa_enabled_groups = to_enable;
+
+ if (ena_set_feature(ena, &cmd, &resp, ENAHW_FEAT_AENQ_CONFIG,
+ ENAHW_FEAT_AENQ_CONFIG_VER) != 0) {
+ return (B_FALSE);
+ }
+
+ bzero(&resp, sizeof (resp));
+ if (ena_get_feature(ena, &resp, ENAHW_FEAT_AENQ_CONFIG,
+ ENAHW_FEAT_AENQ_CONFIG_VER) != 0) {
+ return (B_FALSE);
+ }
+
+ ena->ena_aenq_supported_groups = resp_feat->efa_supported_groups;
+ ena->ena_aenq_enabled_groups = resp_feat->efa_enabled_groups;
+
+ for (uint_t i = 0; i < ENAHW_AENQ_GROUPS_ARR_NUM; i++) {
+ ena_aenq_grpstr_t *grpstr = &ena_groups_str[i];
+ boolean_t supported = BIT(grpstr->eag_type) &
+ resp_feat->efa_supported_groups;
+ boolean_t enabled = BIT(grpstr->eag_type) &
+ resp_feat->efa_enabled_groups;
+
+ ena_dbg(ena, "%s supported: %s enabled: %s", grpstr->eag_str,
+ supported ? "Y" : "N", enabled ? "Y" : "N");
+ }
+
+ return (B_TRUE);
+}
+
+/*
+ * Free all resources allocated as part of ena_device_init().
+ */
+static void
+ena_cleanup_device_init(ena_t *ena)
+{
+ ena_adminq_t *aq = &ena->ena_aq;
+
+ ena_free_host_info(ena);
+ mutex_destroy(&aq->ea_sq_lock);
+ mutex_destroy(&aq->ea_cq_lock);
+ mutex_destroy(&aq->ea_stat_lock);
+ list_destroy(&aq->ea_cmd_ctxs_free);
+ kmem_free(aq->ea_cmd_ctxs, sizeof (ena_cmd_ctx_t) * aq->ea_qlen);
+ ena_admin_sq_free(ena);
+ ena_admin_cq_free(ena);
+ ena_aenq_free(ena);
+ ena_stat_device_basic_cleanup(ena);
+ ena_stat_device_extended_cleanup(ena);
+ ena_stat_aenq_cleanup(ena);
+}
+
+static boolean_t
+ena_attach_device_init(ena_t *ena)
+{
+ ena_adminq_t *aq = &ena->ena_aq;
+ uint32_t rval, wval;
+ uint8_t dma_width;
+ hrtime_t timeout, cmd_timeout;
+ hrtime_t expired;
+ enahw_resp_desc_t resp;
+ enahw_feat_dev_attr_t *feat = &resp.erd_resp.erd_get_feat.ergf_dev_attr;
+ uint8_t *maddr;
+ uint32_t supported_features;
+ int ret = 0;
+
+ rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS);
+ if ((rval & ENAHW_DEV_STS_READY_MASK) == 0) {
+ ena_err(ena, "device is not ready");
+ return (B_FALSE);
+ }
+
+ rval = ena_hw_bar_read32(ena, ENAHW_REG_CAPS);
+
+ /*
+ * The device stores the reset timeout at 100ms resolution; we
+ * normalize that to nanoseconds.
+ */
+ timeout = MSEC2NSEC(ENAHW_CAPS_RESET_TIMEOUT(rval) * 100);
+
+ if (timeout == 0) {
+ ena_err(ena, "device gave invalid reset timeout");
+ return (B_FALSE);
+ }
+
+ expired = gethrtime() + timeout;
+
+ wval = ENAHW_DEV_CTL_DEV_RESET_MASK;
+ wval |= (ENAHW_RESET_NORMAL << ENAHW_DEV_CTL_RESET_REASON_SHIFT) &
+ ENAHW_DEV_CTL_RESET_REASON_MASK;
+ ena_hw_bar_write32(ena, ENAHW_REG_DEV_CTL, wval);
+
+ /*
+ * Make sure reset is in progress.
+ */
+ while (1) {
+ rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS);
+
+ if ((rval & ENAHW_DEV_STS_RESET_IN_PROGRESS_MASK) != 0) {
+ break;
+ }
+
+ if (gethrtime() > expired) {
+ ena_err(ena, "device reset start timed out");
+ return (B_FALSE);
+ }
+
+ /* Sleep for 100 milliseconds. */
+ delay(drv_usectohz(100 * 1000));
+ }
+
+ /*
+ * Reset the timeout counter for the next device request.
+ */
+ expired = gethrtime() + timeout;
+
+ /*
+ * Wait for the device reset to finish.
+ */
+ ena_hw_bar_write32(ena, ENAHW_REG_DEV_CTL, 0);
+ while (1) {
+ rval = ena_hw_bar_read32(ena, ENAHW_REG_DEV_STS);
+
+ if ((rval & ENAHW_DEV_STS_RESET_IN_PROGRESS_MASK) == 0) {
+ break;
+ }
+
+ if (gethrtime() > expired) {
+ ena_err(ena, "device reset timed out");
+ return (B_FALSE);
+ }
+
+ /* Sleep for 100 milliseconds. */
+ delay(drv_usectohz(100 * 1000));
+ }
+
+ if (!ena_check_versions(ena)) {
+ return (B_FALSE);
+ }
+
+ rval = ena_hw_bar_read32(ena, ENAHW_REG_CAPS);
+ dma_width = ENAHW_CAPS_DMA_ADDR_WIDTH(rval);
+ ena->ena_dma_width = dma_width;
+
+ /*
+ * As we are not using an interrupt for admin queue completion
+ * signaling, we do not need a priority on these mutexes. If
+ * that changes, we will have to rejigger some code to create
+ * the admin queue interrupt before this function.
+ */
+ mutex_init(&aq->ea_sq_lock, NULL, MUTEX_DRIVER, NULL);
+ mutex_init(&aq->ea_cq_lock, NULL, MUTEX_DRIVER, NULL);
+ mutex_init(&aq->ea_stat_lock, NULL, MUTEX_DRIVER, NULL);
+ aq->ea_qlen = ENA_ADMINQ_DEPTH;
+ aq->ea_pending_cmds = 0;
+
+ aq->ea_cmd_ctxs = kmem_zalloc(sizeof (ena_cmd_ctx_t) * aq->ea_qlen,
+ KM_SLEEP);
+ list_create(&aq->ea_cmd_ctxs_free, sizeof (ena_cmd_ctx_t),
+ offsetof(ena_cmd_ctx_t, ectx_node));
+
+ for (uint_t i = 0; i < aq->ea_qlen; i++) {
+ ena_cmd_ctx_t *ctx = &aq->ea_cmd_ctxs[i];
+
+ ctx->ectx_id = i;
+ ctx->ectx_pending = B_FALSE;
+ ctx->ectx_cmd_opcode = ENAHW_CMD_NONE;
+ ctx->ectx_resp = NULL;
+ list_insert_tail(&aq->ea_cmd_ctxs_free, ctx);
+ }
+
+ /*
+ * The value stored in the device register is at a
+ * resolution of 100 milliseconds. We normalize that to
+ * nanoseconds.
+ */
+ cmd_timeout = MSEC2NSEC(ENAHW_CAPS_ADMIN_CMD_TIMEOUT(rval) * 100);
+ aq->ea_cmd_timeout_ns = max(cmd_timeout, ena_admin_cmd_timeout_ns);
+
+ if (aq->ea_cmd_timeout_ns == 0) {
+ aq->ea_cmd_timeout_ns = ENA_ADMIN_CMD_DEF_TIMEOUT;
+ }
+
+ if (!ena_admin_sq_init(ena)) {
+ return (B_FALSE);
+ }
+
+ if (!ena_admin_cq_init(ena)) {
+ return (B_FALSE);
+ }
+
+ if (!ena_aenq_init(ena)) {
+ return (B_FALSE);
+ }
+
+ /*
+ * While the Linux driver prefers to use interrupts to deliver
+ * admin queue completions, we just poll -- it seems to work
+ * just fine.
+ */
+ ena_hw_bar_write32(ena, ENAHW_REG_INTERRUPT_MASK, 0);
+ aq->ea_poll_mode = B_TRUE;
+
+ bzero(&resp, sizeof (resp));
+ ret = ena_get_feature(ena, &resp, ENAHW_FEAT_DEVICE_ATTRIBUTES,
+ ENAHW_FEAT_DEVICE_ATTRIBUTES_VER);
+
+ if (ret != 0) {
+ ena_err(ena, "failed to get device attributes: %d", ret);
+ return (B_FALSE);
+ }
+
+ ena_dbg(ena, "impl ID: %u", feat->efda_impl_id);
+ ena_dbg(ena, "device version: %u", feat->efda_device_version);
+ ena_dbg(ena, "supported features: 0x%x",
+ feat->efda_supported_features);
+ ena_dbg(ena, "phys addr width: %u", feat->efda_phys_addr_width);
+ ena_dbg(ena, "virt addr width: %u", feat->efda_virt_addr_with);
+ maddr = feat->efda_mac_addr;
+ ena_dbg(ena, "mac addr: %x:%x:%x:%x:%x:%x", maddr[0], maddr[1],
+ maddr[2], maddr[3], maddr[4], maddr[5]);
+ ena_dbg(ena, "max MTU: %u", feat->efda_max_mtu);
+
+ bcopy(maddr, ena->ena_mac_addr, ETHERADDRL);
+ ena->ena_max_mtu = feat->efda_max_mtu;
+ supported_features = feat->efda_supported_features;
+ ena->ena_supported_features = supported_features;
+ feat = NULL;
+ bzero(&resp, sizeof (resp));
+
+ if (supported_features & BIT(ENAHW_FEAT_MAX_QUEUES_EXT)) {
+ enahw_feat_max_queue_ext_t *feat_mqe =
+ &resp.erd_resp.erd_get_feat.ergf_max_queue_ext;
+
+ ret = ena_get_feature(ena, &resp, ENAHW_FEAT_MAX_QUEUES_EXT,
+ ENAHW_FEAT_MAX_QUEUES_EXT_VER);
+
+ if (ret != 0) {
+ ena_err(ena, "failed to query max queues ext: %d", ret);
+ return (B_FALSE);
+ }
+
+ ena->ena_tx_max_sq_num = feat_mqe->efmqe_max_tx_sq_num;
+ ena->ena_tx_max_sq_num_descs = feat_mqe->efmqe_max_tx_sq_depth;
+ ena->ena_tx_max_cq_num = feat_mqe->efmqe_max_tx_cq_num;
+ ena->ena_tx_max_cq_num_descs = feat_mqe->efmqe_max_tx_cq_depth;
+ ena->ena_tx_max_desc_per_pkt =
+ feat_mqe->efmqe_max_per_packet_tx_descs;
+ ena->ena_tx_max_hdr_len = feat_mqe->efmqe_max_tx_header_size;
+
+ ena->ena_rx_max_sq_num = feat_mqe->efmqe_max_rx_sq_num;
+ ena->ena_rx_max_sq_num_descs = feat_mqe->efmqe_max_rx_sq_depth;
+ ena->ena_rx_max_cq_num = feat_mqe->efmqe_max_rx_cq_num;
+ ena->ena_rx_max_cq_num_descs = feat_mqe->efmqe_max_rx_cq_depth;
+ ena->ena_rx_max_desc_per_pkt =
+ feat_mqe->efmqe_max_per_packet_rx_descs;
+
+ ena_set_max_io_queues(ena);
+ } else {
+ enahw_feat_max_queue_t *feat_mq =
+ &resp.erd_resp.erd_get_feat.ergf_max_queue;
+
+ ret = ena_get_feature(ena, &resp, ENAHW_FEAT_MAX_QUEUES_NUM,
+ ENAHW_FEAT_MAX_QUEUES_NUM_VER);
+
+ if (ret != 0) {
+ ena_err(ena, "failed to query max queues: %d", ret);
+ return (B_FALSE);
+ }
+
+ ena->ena_tx_max_sq_num = feat_mq->efmq_max_sq_num;
+ ena->ena_tx_max_sq_num_descs = feat_mq->efmq_max_sq_depth;
+ ena->ena_tx_max_cq_num = feat_mq->efmq_max_cq_num;
+ ena->ena_tx_max_cq_num_descs = feat_mq->efmq_max_cq_depth;
+ ena->ena_tx_max_desc_per_pkt =
+ feat_mq->efmq_max_per_packet_tx_descs;
+ ena->ena_tx_max_hdr_len = feat_mq->efmq_max_header_size;
+
+ ena->ena_rx_max_sq_num = feat_mq->efmq_max_sq_num;
+ ena->ena_rx_max_sq_num_descs = feat_mq->efmq_max_sq_depth;
+ ena->ena_rx_max_cq_num = feat_mq->efmq_max_cq_num;
+ ena->ena_rx_max_cq_num_descs = feat_mq->efmq_max_cq_depth;
+ ena->ena_rx_max_desc_per_pkt =
+ feat_mq->efmq_max_per_packet_rx_descs;
+
+ ena_set_max_io_queues(ena);
+ }
+
+ ena->ena_mtu = ena->ena_max_mtu;
+ ena_update_buf_sizes(ena);
+ /*
+ * We could use ENAHW_FEAT_HW_HINTS to determine actual SGL
+ * sizes, but for now we just force everything to use one
+ * segment.
+ */
+ ena->ena_tx_sgl_max_sz = 1;
+ ena->ena_rx_sgl_max_sz = 1;
+
+ if (!ena_init_host_info(ena)) {
+ return (B_FALSE);
+ }
+
+ if (!ena_setup_aenq(ena)) {
+ return (B_FALSE);
+ }
+
+ ena_get_link_config(ena);
+
+ if (!ena_get_offloads(ena)) {
+ return (B_FALSE);
+ }
+
+ if (!ena_stat_device_basic_init(ena)) {
+ return (B_FALSE);
+ }
+
+ if (!ena_stat_device_extended_init(ena)) {
+ return (B_FALSE);
+ }
+
+ if (!ena_stat_aenq_init(ena)) {
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+}
+
+static void
+ena_cleanup_intr_alloc(ena_t *ena)
+{
+ for (int i = 0; i < ena->ena_num_intrs; i++) {
+ int ret = ddi_intr_free(ena->ena_intr_handles[i]);
+ if (ret != DDI_SUCCESS) {
+ ena_err(ena, "failed to free interrupt %d: %d", i, ret);
+ }
+ }
+
+ if (ena->ena_intr_handles != NULL) {
+ kmem_free(ena->ena_intr_handles, ena->ena_intr_handles_sz);
+ ena->ena_intr_handles = NULL;
+ ena->ena_intr_handles_sz = 0;
+ }
+}
+
+/*
+ * The Linux driver supports only MSI-X interrupts. We do the same,
+ * with the assumption that it's the only type of interrupt the device
+ * can present.
+ */
+static boolean_t
+ena_attach_intr_alloc(ena_t *ena)
+{
+ int ret;
+ int types;
+ int min, req, ideal, avail, actual;
+
+ ret = ddi_intr_get_supported_types(ena->ena_dip, &types);
+ if (ret != DDI_SUCCESS) {
+ ena_err(ena, "failed to get interrupt types: %d", ret);
+ return (B_FALSE);
+ }
+
+ ena_dbg(ena, "supported interrupt types: 0x%x", types);
+ if ((types & DDI_INTR_TYPE_MSIX) == 0) {
+ ena_err(ena, "the ena driver only supports MSI-X interrupts");
+ return (B_FALSE);
+ }
+
+ /* One for I/O, one for adminq. */
+ min = 2;
+ ideal = ena->ena_max_io_queues + 1;
+ ret = ddi_intr_get_nintrs(ena->ena_dip, DDI_INTR_TYPE_MSIX, &avail);
+ if (ret != DDI_SUCCESS) {
+ ena_err(ena, "failed to get number of MSI-X interrupts: %d",
+ ret);
+ return (B_FALSE);
+ }
+
+ if (avail < min) {
+ ena_err(ena, "number of MSI-X interrupts is %d, but the driver "
+ "requires a minimum of %d", avail, min);
+ return (B_FALSE);
+ }
+
+ ena_dbg(ena, "%d MSI-X interrupts available", avail);
+
+ ret = ddi_intr_get_navail(ena->ena_dip, DDI_INTR_TYPE_MSIX, &avail);
+ if (ret != DDI_SUCCESS) {
+ ena_err(ena, "failed to get available interrupts: %d", ret);
+ return (B_FALSE);
+ }
+
+ if (avail < min) {
+ ena_err(ena, "number of available MSI-X interrupts is %d, "
+ "but the driver requires a minimum of %d", avail, min);
+ return (B_FALSE);
+ }
+
+ req = MIN(ideal, avail);
+ ena->ena_intr_handles_sz = req * sizeof (ddi_intr_handle_t);
+ ena->ena_intr_handles = kmem_zalloc(ena->ena_intr_handles_sz, KM_SLEEP);
+
+ ret = ddi_intr_alloc(ena->ena_dip, ena->ena_intr_handles,
+ DDI_INTR_TYPE_MSIX, 0, req, &actual, DDI_INTR_ALLOC_NORMAL);
+ if (ret != DDI_SUCCESS) {
+ ena_err(ena, "failed to allocate %d MSI-X interrupts: %d",
+ req, ret);
+ return (B_FALSE);
+ }
+
+ if (actual < min) {
+ ena_err(ena, "number of allocated interrupts is %d, but the "
+ "driver requires a minimum of %d", actual, min);
+ return (B_FALSE);
+ }
+
+ ena->ena_num_intrs = actual;
+
+ ret = ddi_intr_get_cap(ena->ena_intr_handles[0], &ena->ena_intr_caps);
+ if (ret != DDI_SUCCESS) {
+ ena_err(ena, "failed to get interrupt capability: %d", ret);
+ return (B_FALSE);
+ }
+
+ ret = ddi_intr_get_pri(ena->ena_intr_handles[0], &ena->ena_intr_pri);
+ if (ret != DDI_SUCCESS) {
+ ena_err(ena, "failed to get interrupt priority: %d", ret);
+ return (B_FALSE);
+ }
+
+ ena_dbg(ena, "MSI-X interrupts allocated: %d, cap: 0x%x, pri: %u",
+ actual, ena->ena_intr_caps, ena->ena_intr_pri);
+
+ /*
+ * The ena_lock should not be held in the datapath, but it is
+ * held as part of the AENQ handler, which runs in interrupt
+ * context. Therefore, we delay the initialization of this
+ * mutex until after the interrupts are allocated.
+ */
+ mutex_init(&ena->ena_lock, NULL, MUTEX_DRIVER,
+ DDI_INTR_PRI(ena->ena_intr_pri));
+
+ return (B_TRUE);
+}
+
+/*
+ * Allocate the parent Rx queue structures. More importantly, this is
+ * NOT allocating the queue descriptors or data buffers. Those are
+ * allocated on demand as queues are started.
+ */
+static boolean_t
+ena_attach_alloc_rxqs(ena_t *ena)
+{
+ /* We rely on the interrupt priority for initializing the mutexes. */
+ VERIFY3U(ena->ena_attach_seq, >=, ENA_ATTACH_INTR_ALLOC);
+ ena->ena_num_rxqs = ena->ena_num_intrs - 1;
+ ASSERT3U(ena->ena_num_rxqs, >, 0);
+ ena->ena_rxqs = kmem_zalloc(ena->ena_num_rxqs * sizeof (*ena->ena_rxqs),
+ KM_SLEEP);
+
+ for (uint_t i = 0; i < ena->ena_num_rxqs; i++) {
+ ena_rxq_t *rxq = &ena->ena_rxqs[i];
+
+ rxq->er_rxqs_idx = i;
+ /* The 0th vector is for Admin + AENQ. */
+ rxq->er_intr_vector = i + 1;
+ rxq->er_mrh = NULL;
+
+ mutex_init(&rxq->er_lock, NULL, MUTEX_DRIVER,
+ DDI_INTR_PRI(ena->ena_intr_pri));
+ mutex_init(&rxq->er_stat_lock, NULL, MUTEX_DRIVER,
+ DDI_INTR_PRI(ena->ena_intr_pri));
+
+ rxq->er_ena = ena;
+ rxq->er_sq_num_descs = ena->ena_rxq_num_descs;
+ rxq->er_cq_num_descs = ena->ena_rxq_num_descs;
+
+ if (!ena_stat_rxq_init(rxq)) {
+ return (B_FALSE);
+ }
+
+ if (!ena_alloc_rxq(rxq)) {
+ return (B_FALSE);
+ }
+ }
+
+ return (B_TRUE);
+}
+
+static void
+ena_cleanup_rxqs(ena_t *ena)
+{
+ for (uint_t i = 0; i < ena->ena_num_rxqs; i++) {
+ ena_rxq_t *rxq = &ena->ena_rxqs[i];
+
+ ena_cleanup_rxq(rxq);
+ mutex_destroy(&rxq->er_lock);
+ mutex_destroy(&rxq->er_stat_lock);
+ ena_stat_rxq_cleanup(rxq);
+ }
+
+ kmem_free(ena->ena_rxqs, ena->ena_num_rxqs * sizeof (*ena->ena_rxqs));
+}
+
+/*
+ * Allocate the parent Tx queue structures. Note that this does NOT
+ * allocate the queue descriptors or data buffers; those are allocated
+ * on demand as each queue is started.
+ */
+static boolean_t
+ena_attach_alloc_txqs(ena_t *ena)
+{
+ /* We rely on the interrupt priority for initializing the mutexes. */
+ VERIFY3U(ena->ena_attach_seq, >=, ENA_ATTACH_INTR_ALLOC);
+ ena->ena_num_txqs = ena->ena_num_intrs - 1;
+ ASSERT3U(ena->ena_num_txqs, >, 0);
+ ena->ena_txqs = kmem_zalloc(ena->ena_num_txqs * sizeof (*ena->ena_txqs),
+ KM_SLEEP);
+
+ for (uint_t i = 0; i < ena->ena_num_txqs; i++) {
+ ena_txq_t *txq = &ena->ena_txqs[i];
+
+ txq->et_txqs_idx = i;
+ /* The 0th vector is for Admin + AENQ. */
+ txq->et_intr_vector = i + 1;
+ txq->et_mrh = NULL;
+
+ mutex_init(&txq->et_lock, NULL, MUTEX_DRIVER,
+ DDI_INTR_PRI(ena->ena_intr_pri));
+ mutex_init(&txq->et_stat_lock, NULL, MUTEX_DRIVER,
+ DDI_INTR_PRI(ena->ena_intr_pri));
+
+ txq->et_ena = ena;
+ txq->et_sq_num_descs = ena->ena_txq_num_descs;
+ txq->et_cq_num_descs = ena->ena_txq_num_descs;
+
+ if (!ena_stat_txq_init(txq)) {
+ return (B_FALSE);
+ }
+
+ if (!ena_alloc_txq(txq)) {
+ return (B_FALSE);
+ }
+ }
+
+ return (B_TRUE);
+}
+
+static void
+ena_cleanup_txqs(ena_t *ena)
+{
+	for (uint_t i = 0; i < ena->ena_num_txqs; i++) {
+ ena_txq_t *txq = &ena->ena_txqs[i];
+
+ ena_cleanup_txq(txq);
+ mutex_destroy(&txq->et_lock);
+ mutex_destroy(&txq->et_stat_lock);
+ ena_stat_txq_cleanup(txq);
+ }
+
+ kmem_free(ena->ena_txqs, ena->ena_num_txqs * sizeof (*ena->ena_txqs));
+}
+
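+/*
+ * This table drives both attach and cleanup: ena_attach() walks it
+ * forward, invoking each ead_attach_fn in turn, while ena_cleanup()
+ * walks it backward, invoking the matching ead_cleanup_fn for each
+ * completed step.
+ */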
+ena_attach_desc_t ena_attach_tbl[ENA_ATTACH_NUM_ENTRIES] = {
+ {
+ .ead_seq = ENA_ATTACH_PCI,
+ .ead_name = "PCI config",
+ .ead_attach_fn = ena_attach_pci,
+ .ead_attach_hard_fail = B_TRUE,
+ .ead_cleanup_fn = ena_cleanup_pci,
+ },
+
+ {
+ .ead_seq = ENA_ATTACH_REGS,
+ .ead_name = "BAR mapping",
+ .ead_attach_fn = ena_attach_regs_map,
+ .ead_attach_hard_fail = B_TRUE,
+ .ead_cleanup_fn = ena_cleanup_regs_map,
+ },
+
+ {
+ .ead_seq = ENA_ATTACH_DEV_INIT,
+ .ead_name = "device initialization",
+ .ead_attach_fn = ena_attach_device_init,
+ .ead_attach_hard_fail = B_TRUE,
+ .ead_cleanup_fn = ena_cleanup_device_init,
+ },
+
+ {
+ .ead_seq = ENA_ATTACH_READ_CONF,
+ .ead_name = "ena.conf",
+ .ead_attach_fn = ena_attach_read_conf,
+ .ead_attach_hard_fail = B_TRUE,
+ .ead_cleanup_fn = ena_no_cleanup,
+ },
+
+ {
+ .ead_seq = ENA_ATTACH_DEV_CFG,
+ .ead_name = "device config",
+ .ead_attach_fn = ena_attach_dev_cfg,
+ .ead_attach_hard_fail = B_TRUE,
+ .ead_cleanup_fn = ena_no_cleanup,
+ },
+
+ {
+ .ead_seq = ENA_ATTACH_INTR_ALLOC,
+ .ead_name = "interrupt allocation",
+ .ead_attach_fn = ena_attach_intr_alloc,
+ .ead_attach_hard_fail = B_TRUE,
+ .ead_cleanup_fn = ena_cleanup_intr_alloc,
+ },
+
+ {
+ .ead_seq = ENA_ATTACH_INTR_HDLRS,
+ .ead_name = "interrupt handlers",
+ .ead_attach_fn = ena_intr_add_handlers,
+ .ead_attach_hard_fail = B_TRUE,
+ .ead_cleanup_fn = ena_intr_remove_handlers,
+ },
+
+ {
+ .ead_seq = ENA_ATTACH_TXQS_ALLOC,
+ .ead_name = "Tx queues",
+ .ead_attach_fn = ena_attach_alloc_txqs,
+ .ead_attach_hard_fail = B_TRUE,
+ .ead_cleanup_fn = ena_cleanup_txqs,
+ },
+
+ {
+ .ead_seq = ENA_ATTACH_RXQS_ALLOC,
+ .ead_name = "Rx queues",
+ .ead_attach_fn = ena_attach_alloc_rxqs,
+ .ead_attach_hard_fail = B_TRUE,
+ .ead_cleanup_fn = ena_cleanup_rxqs,
+ },
+
+ /*
+	 * The chance of mac_unregister() failure poses a problem for
+ * cleanup. We address interrupt disablement and mac
+ * unregistration explicitly in the attach/detach routines.
+ */
+ {
+ .ead_seq = ENA_ATTACH_MAC_REGISTER,
+ .ead_name = "mac registration",
+ .ead_attach_fn = ena_mac_register,
+ .ead_attach_hard_fail = B_TRUE,
+ .ead_cleanup_fn = ena_no_cleanup,
+ },
+
+ {
+ .ead_seq = ENA_ATTACH_INTRS_ENABLE,
+ .ead_name = "enable interrupts",
+ .ead_attach_fn = ena_intrs_enable,
+ .ead_attach_hard_fail = B_TRUE,
+ .ead_cleanup_fn = ena_no_cleanup,
+ }
+};
+
+/*
+ * This function undoes any work done by ena_attach(), either in
+ * response to a failed attach or a planned detach. At the end of this
+ * function ena_attach_seq should be zero; otherwise it means
+ * something has not been freed/uninitialized.
+ */
+static void
+ena_cleanup(ena_t *ena)
+{
+ if (ena == NULL || ena->ena_attach_seq == 0) {
+ return;
+ }
+
+ /*
+	 * We VERIFY this because if the seq were greater than the number
+	 * of table entries we would index past the end of the table and
+	 * execute god knows what.
+ */
+ VERIFY3U(ena->ena_attach_seq, <, ENA_ATTACH_NUM_ENTRIES);
+
+ while (ena->ena_attach_seq > 0) {
+ int idx = ena->ena_attach_seq - 1;
+ ena_attach_desc_t *desc = &ena_attach_tbl[idx];
+
+ ena_dbg(ena, "running cleanup sequence: %s (%d)",
+ desc->ead_name, idx);
+
+ desc->ead_cleanup_fn(ena);
+ ena->ena_attach_seq--;
+ }
+
+ ASSERT3U(ena->ena_attach_seq, ==, 0);
+ mutex_destroy(&ena->ena_lock);
+}
+
+static int
+ena_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ ena_t *ena;
+
+ if (cmd != DDI_ATTACH) {
+ return (DDI_FAILURE);
+ }
+
+ ena = kmem_zalloc(sizeof (ena_t), KM_SLEEP);
+	ena->ena_instance = ddi_get_instance(dip);
+	ena->ena_dip = dip;
+ ena->ena_page_sz = ddi_ptob(dip, 1);
+
+ for (int i = 0; i < ENA_ATTACH_NUM_ENTRIES; i++) {
+ boolean_t success;
+ ena_attach_desc_t *desc = &ena_attach_tbl[i];
+
+ ena_dbg(ena, "running attach sequence: %s (%d)", desc->ead_name,
+ i);
+
+ if (!(success = desc->ead_attach_fn(ena))) {
+ ena_err(ena, "attach sequence failed: %s (%d)",
+ desc->ead_name, i);
+
+ if (ena->ena_attach_seq == ENA_ATTACH_MAC_REGISTER) {
+ /*
+ * In this specific case
+ * ENA_ATTACH_INTRS_ENABLE has failed,
+ * and we may or may not be able to
+ * unregister the mac, depending on if
+ * something in userspace has created
+ * a client on top.
+ *
+ * NOTE: Something that would be nice
+ * to add to mac is the ability to
+ * register a provider separate from
+ * "publishing" it to the rest of the
+ * system. This would allow a driver
+ * to register its mac, do some
+ * additional work that might fail,
+ * and then unregister if that work
+ * fails without concern for any
+ * chance of failure when calling
+ * unregister. This would remove the
+ * complexity of the situation we are
+ * trying to address here, as we would
+ * know that until the mac has been
+ * "published", there is no chance for
+ * mac_unregister() to fail.
+ */
+ if (ena_mac_unregister(ena) != 0) {
+ return (DDI_FAILURE);
+ }
+
+ ena->ena_attach_seq--;
+ } else {
+ /*
+ * Since the ead_seq is predicated on
+ * successful ead_attach_fn we must
+ * run the specific cleanup handler
+ * before calling the global cleanup
+ * routine. This also means that all
+ * cleanup functions must be able to
+ * deal with partial success of the
+ * corresponding ead_attach_fn.
+ */
+ desc->ead_cleanup_fn(ena);
+ }
+
+ ena_cleanup(ena);
+ kmem_free(ena, sizeof (ena_t));
+ return (DDI_FAILURE);
+ }
+
+ if (success) {
+ ena_dbg(ena, "attach sequence completed: %s (%d)",
+ desc->ead_name, i);
+ }
+
+ ena->ena_attach_seq = desc->ead_seq;
+ }
+
+ /*
+	 * Now that interrupts are enabled, make sure to tell the
+ * device that all AENQ descriptors are ready for writing.
+ */
+ ena_hw_bar_write32(ena, ENAHW_REG_AENQ_HEAD_DB,
+ ena->ena_aenq.eaenq_num_descs);
+
+ ddi_set_driver_private(dip, ena);
+ return (DDI_SUCCESS);
+}
+
+static int
+ena_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ ena_t *ena = ddi_get_driver_private(dip);
+
+ if (ena == NULL) {
+ return (DDI_FAILURE);
+ }
+
+ /*
+ * Before we can proceed to cleanup we have to treat
+ * mac_unregister() explicitly -- if there are still
+ * outstanding clients, then we can't proceed with detach or
+ * cleanup.
+ */
+
+ /*
+ * Why this would fail I don't know, but if we proceed to mac
+ * unregister, then there is a good chance we will panic in
+	 * the Rx interrupt handler when calling mac_rx_ring().
+ */
+ if (!ena_intrs_disable(ena)) {
+ return (DDI_FAILURE);
+ }
+
+ /* We can't detach if clients are actively using the device. */
+ if (ena_mac_unregister(ena) != 0) {
+ (void) ena_intrs_enable(ena);
+ return (DDI_FAILURE);
+ }
+
+ /*
+ * At this point we can proceed with the rest of cleanup on a
+ * best-effort basis.
+ */
+ ena->ena_attach_seq = ENA_ATTACH_RXQS_ALLOC;
+ ena_cleanup(ena);
+ ddi_set_driver_private(dip, NULL);
+ kmem_free(ena, sizeof (ena_t));
+ return (DDI_SUCCESS);
+}
+
+static struct cb_ops ena_cb_ops = {
+ .cb_open = nodev,
+ .cb_close = nodev,
+ .cb_strategy = nodev,
+ .cb_print = nodev,
+ .cb_dump = nodev,
+ .cb_read = nodev,
+ .cb_write = nodev,
+ .cb_ioctl = nodev,
+ .cb_devmap = nodev,
+ .cb_mmap = nodev,
+ .cb_segmap = nodev,
+ .cb_chpoll = nochpoll,
+ .cb_prop_op = ddi_prop_op,
+ .cb_flag = D_MP,
+ .cb_rev = CB_REV,
+ .cb_aread = nodev,
+ .cb_awrite = nodev
+};
+
+static struct dev_ops ena_dev_ops = {
+ .devo_rev = DEVO_REV,
+ .devo_refcnt = 0,
+ .devo_getinfo = NULL,
+ .devo_identify = nulldev,
+ .devo_probe = nulldev,
+ .devo_attach = ena_attach,
+ .devo_detach = ena_detach,
+ .devo_reset = nodev,
+ .devo_quiesce = ddi_quiesce_not_supported,
+ .devo_cb_ops = &ena_cb_ops
+};
+
+static struct modldrv ena_modldrv = {
+ .drv_modops = &mod_driverops,
+ .drv_linkinfo = "AWS ENA Ethernet",
+ .drv_dev_ops = &ena_dev_ops
+};
+
+static struct modlinkage ena_modlinkage = {
+ .ml_rev = MODREV_1,
+ .ml_linkage = { &ena_modldrv, NULL }
+};
+
+int
+_init(void)
+{
+ int ret;
+
+ mac_init_ops(&ena_dev_ops, ENA_MODULE_NAME);
+
+ if ((ret = mod_install(&ena_modlinkage)) != 0) {
+ mac_fini_ops(&ena_dev_ops);
+ return (ret);
+ }
+
+ return (ret);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&ena_modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+ int ret;
+
+ if ((ret = mod_remove(&ena_modlinkage)) != 0) {
+ return (ret);
+ }
+
+ mac_fini_ops(&ena_dev_ops);
+ return (ret);
+}
diff --git a/usr/src/uts/common/io/ena/ena.conf b/usr/src/uts/common/io/ena/ena.conf
new file mode 100644
index 0000000000..64ee011d7c
--- /dev/null
+++ b/usr/src/uts/common/io/ena/ena.conf
@@ -0,0 +1,50 @@
+#
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2021 Oxide Computer Company
+#
+
+#
+# Driver .conf file for AWS Elastic Network Adapter. See ena(7D) for
+# valid options.
+#
+
+#
+# rx_queue_num_descs
+#
+# The number of descriptors provided by each Rx queue.
+#
+# Range: 64 - <device maximum>
+# Default: <device maximum>
+#
+# rx_queue_num_descs = 1024;
+
+#
+# rx_queue_intr_limit
+#
+# The number of frames that may be read by a single Rx interrupt.
+#
+# Range: 16 - 4096
+# Default: 256
+#
+# rx_queue_intr_limit = 256;
+
+#
+# tx_queue_num_descs
+#
+# The number of descriptors provided by each Tx queue.
+#
+# Range: 64 - <device maximum>
+# Default: <device maximum>
+#
+# tx_queue_num_descs = 1024; \ No newline at end of file
diff --git a/usr/src/uts/common/io/ena/ena.h b/usr/src/uts/common/io/ena/ena.h
new file mode 100644
index 0000000000..467da40f4b
--- /dev/null
+++ b/usr/src/uts/common/io/ena/ena.h
@@ -0,0 +1,848 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2021 Oxide Computer Company
+ */
+
+#ifndef _ENA_H
+#define _ENA_H
+
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/types.h>
+#include <sys/atomic.h>
+#include <sys/list.h>
+#include <sys/time.h>
+#include <sys/modctl.h>
+#include <sys/conf.h>
+#include <sys/cpuvar.h>
+#include <sys/pci.h>
+#include <sys/sysmacros.h>
+#include <sys/mac.h>
+#include <sys/mac_ether.h>
+#include <sys/mac_provider.h>
+#include <sys/pattr.h>
+#include <sys/strsun.h>
+#include <sys/ethernet.h>
+#include <sys/vlan.h>
+#include <sys/utsname.h>
+#include "ena_hw.h"
+
+/*
+ * AWS ENA Ethernet Driver
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ENA_MODULE_NAME "ena"
+
+/*
+ * The minimum supported ENA device controller version.
+ */
+#define ENA_CTRL_MAJOR_VSN_MIN 0
+#define ENA_CTRL_MINOR_VSN_MIN 0
+#define ENA_CTRL_SUBMINOR_VSN_MIN 1
+
+#define ENA_MODULE_VER_MAJOR 1
+#define ENA_MODULE_VER_MINOR 0
+#define ENA_MODULE_VER_SUBMINOR 0
+
+/*
+ * The Linux driver doesn't document what the specification version
+ * number controls or the contract around version changes. The best we
+ * can do is use the same version that they use and port version
+ * changes as they come (the last one was in 2018).
+ *
+ * common: ENA_COMMON_SPEC_VERSION_{MAJOR,MINOR}
+ */
+#define ENA_SPEC_VERSION_MAJOR 2
+#define ENA_SPEC_VERSION_MINOR 0
+
+
+/* This represents BAR 0. */
+#define ENA_REG_NUMBER 1
+
+/*
+ * A sentinel value passed as argument to ena_ring_rx() to indicate
+ * the Rx ring is being read in interrupt mode, not polling mode.
+ */
+#define ENA_INTERRUPT_MODE -1
+
+#define ENA_RX_BUF_IPHDR_ALIGNMENT 2
+#define ENA_ADMINQ_DEPTH 32
+#define ENA_AENQ_NUM_DESCS 32
+
+/* Convert milliseconds to nanoseconds. */
+#define ENA_MS_TO_NS(ms) ((ms) * 1000000ul)
+
+/*
+ * The default amount of time we will wait for an admin command to
+ * complete, specified in nanoseconds. In this case, 500 milliseconds.
+ */
+#define ENA_ADMIN_CMD_DEF_TIMEOUT MSEC2NSEC(500)
+
+/*
+ * Property macros.
+ */
+#define ENA_PROP_RXQ_NUM_DESCS "rx_queue_num_descs"
+#define ENA_PROP_RXQ_NUM_DESCS_MIN 64
+
+#define ENA_PROP_TXQ_NUM_DESCS "tx_queue_num_descs"
+#define ENA_PROP_TXQ_NUM_DESCS_MIN 64
+
+#define ENA_PROP_RXQ_INTR_LIMIT "rx_queue_intr_limit"
+#define ENA_PROP_RXQ_INTR_LIMIT_MIN 16
+#define ENA_PROP_RXQ_INTR_LIMIT_MAX 4096
+#define ENA_PROP_RXQ_INTR_LIMIT_DEF 256
+
+#define ENA_DMA_BIT_MASK(x) ((1ULL << (x)) - 1ULL)
+#define ENA_DMA_VERIFY_ADDR(ena, phys_addr) \
+ VERIFY3U(ENA_DMA_BIT_MASK((ena)->ena_dma_width) & (phys_addr), \
+ ==, (phys_addr))
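+/*
+ * For example, a 48-bit DMA width gives ENA_DMA_BIT_MASK(48) ==
+ * 0xffffffffffff, and ENA_DMA_VERIFY_ADDR() asserts that no address
+ * bits above bit 47 are set.
+ */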
+
+typedef struct ena_dma_conf {
+ size_t edc_size;
+ uint64_t edc_align;
+ int edc_sgl;
+ uchar_t edc_endian;
+ boolean_t edc_stream;
+} ena_dma_conf_t;
+
+typedef struct ena_dma_buf {
+ caddr_t edb_va;
+ size_t edb_len;
+ /*
+	 * The length given by the DMA engine, kept around for debugging
+ * purposes.
+ */
+ size_t edb_real_len;
+ size_t edb_used_len;
+ ddi_acc_handle_t edb_acc_hdl;
+ ddi_dma_handle_t edb_dma_hdl;
+ const ddi_dma_cookie_t *edb_cookie;
+} ena_dma_buf_t;
+
+/*
+ * We always sync the entire range, and therefore expect success.
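+ * (A length of zero passed to ddi_dma_sync(9F) requests a sync of the
+ * entire object.)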
+ */
+#ifdef DEBUG
+#define ENA_DMA_SYNC(buf, flag) \
+ ASSERT0(ddi_dma_sync((buf).edb_dma_hdl, 0, 0, (flag)))
+#else /* DEBUG */
+#define ENA_DMA_SYNC(buf, flag) \
+ ((void)ddi_dma_sync((buf).edb_dma_hdl, 0, 0, (flag)))
+#endif
+
+typedef struct ena_aenq_grpstr {
+ enahw_aenq_groups_t eag_type;
+ const char *eag_str;
+} ena_aenq_grpstr_t;
+
+typedef struct ena_aenq_synstr {
+ enahw_aenq_syndrome_t eas_type;
+ const char *eas_str;
+} ena_aenq_synstr_t;
+
+typedef void (*ena_aenq_hdlr_t)(void *data, enahw_aenq_desc_t *desc);
+
+typedef struct ena_aenq {
+ enahw_aenq_desc_t *eaenq_descs;
+ ena_dma_buf_t eaenq_dma;
+ ena_aenq_hdlr_t eaenq_hdlrs[ENAHW_AENQ_GROUPS_ARR_NUM];
+ uint16_t eaenq_num_descs;
+ uint16_t eaenq_head;
+ uint8_t eaenq_phase;
+} ena_aenq_t;
+
+typedef struct ena_admin_sq {
+ enahw_cmd_desc_t *eas_entries;
+ ena_dma_buf_t eas_dma;
+ uint32_t *eas_dbaddr;
+ uint16_t eas_tail;
+ uint8_t eas_phase;
+} ena_admin_sq_t;
+
+typedef struct ena_admin_cq {
+ enahw_resp_desc_t *eac_entries;
+ ena_dma_buf_t eac_dma;
+ uint16_t eac_head;
+ uint8_t eac_phase;
+} ena_admin_cq_t;
+
+/*
+ * The command context is used to track outstanding requests and match
+ * them to device responses.
+ */
+typedef struct ena_cmd_ctx {
+ list_node_t ectx_node;
+
+ /*
+ * The index into ea_cmd_ctxs where this ctx lives. Used as
+ * the command ID value in the command descriptor. This allows
+ * us to match a response to its associated context.
+ */
+ uint16_t ectx_id;
+
+ /* Is the command pending? */
+ boolean_t ectx_pending;
+
+ /* The type of command associated with this context. */
+ enahw_cmd_opcode_t ectx_cmd_opcode;
+
+ /*
+ * The location to copy the full response to. This is
+ * specified by the caller of the command during
+ * submission.
+ */
+ enahw_resp_desc_t *ectx_resp;
+} ena_cmd_ctx_t;
+
+/*
+ * The admin queue, the queue through which commands are sent to the
+ * device.
+ *
+ * WO: Write Once (at initialization)
+ *
+ * In general, only a single lock needs to be held in order to access
+ * the different parts of the admin queue:
+ *
+ * sq_lock: Any data dealing with submitting admin commands, which
+ * includes acquiring a command context.
+ *
+ * cq_lock: Any data dealing with reading command responses.
+ *
+ * stat_lock: For accessing statistics.
+ *
+ * In some cases, the ectx_lock/stat_lock may be held in tandem with
+ * either the SQ or CQ lock. In that case, the SQ/CQ lock is always
+ * entered first.
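+ * For example, ena_admin_submit_cmd() enters ea_sq_lock and then
+ * briefly ea_stat_lock to update the submission stats.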
+ */
+typedef struct ena_adminq {
+ kmutex_t ea_sq_lock; /* WO */
+ kmutex_t ea_cq_lock; /* WO */
+ kmutex_t ea_stat_lock; /* WO */
+
+ hrtime_t ea_cmd_timeout_ns; /* WO */
+
+ uint16_t ea_qlen; /* WO */
+ boolean_t ea_poll_mode; /* WO */
+
+ ena_cmd_ctx_t *ea_cmd_ctxs; /* WO */
+ list_t ea_cmd_ctxs_free; /* ea_sq_lock */
+ uint16_t ea_pending_cmds; /* ea_sq_lock */
+	ena_admin_sq_t		ea_sq;		/* ea_sq_lock */
+	ena_admin_cq_t		ea_cq;		/* ea_cq_lock */
+
+ /* ea_stat_lock */
+ struct ena_adminq_stats {
+ uint64_t cmds_fail;
+ uint64_t cmds_submitted;
+ uint64_t cmds_success;
+ uint64_t queue_full;
+ } ea_stats;
+} ena_adminq_t;
+
+typedef enum ena_attach_seq {
+ ENA_ATTACH_PCI = 1, /* PCI config space */
+ ENA_ATTACH_REGS, /* BAR mapping */
+ ENA_ATTACH_DEV_INIT, /* ENA device initialization */
+ ENA_ATTACH_READ_CONF, /* Read driver conf file */
+ ENA_ATTACH_DEV_CFG, /* Set any needed device config */
+ ENA_ATTACH_INTR_ALLOC, /* interrupt handles allocated */
+ ENA_ATTACH_INTR_HDLRS, /* intr handlers set */
+ ENA_ATTACH_TXQS_ALLOC, /* Tx Queues allocated */
+	ENA_ATTACH_RXQS_ALLOC,		/* Rx Queues allocated */
+ ENA_ATTACH_MAC_REGISTER, /* registered with mac */
+ ENA_ATTACH_INTRS_ENABLE, /* interrupts are enabled */
+ ENA_ATTACH_END
+} ena_attach_seq_t;
+
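+/*
+ * The attach sequence starts at 1, so a table indexed by it holds
+ * ENA_ATTACH_END - 1 entries.
+ */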
+#define ENA_ATTACH_SEQ_FIRST (ENA_ATTACH_PCI)
+#define ENA_ATTACH_NUM_ENTRIES (ENA_ATTACH_END - 1)
+
+struct ena;
+typedef boolean_t (*ena_attach_fn_t)(struct ena *);
+typedef void (*ena_cleanup_fn_t)(struct ena *);
+
+typedef struct ena_attach_desc {
+ ena_attach_seq_t ead_seq;
+ const char *ead_name;
+ ena_attach_fn_t ead_attach_fn;
+ boolean_t ead_attach_hard_fail;
+ ena_cleanup_fn_t ead_cleanup_fn;
+} ena_attach_desc_t;
+
+typedef enum {
+ ENA_TCB_NONE,
+ ENA_TCB_COPY
+} ena_tcb_type_t;
+
+/*
+ * The TCB is used to track information relating to the Tx of a
+ * packet. At the moment we support copy only.
+ */
+typedef struct ena_tx_control_block {
+ mblk_t *etcb_mp;
+ ena_tcb_type_t etcb_type;
+ ena_dma_buf_t etcb_dma;
+} ena_tx_control_block_t;
+
+typedef enum ena_txq_state {
+ ENA_TXQ_STATE_NONE = 0,
+ ENA_TXQ_STATE_HOST_ALLOC = 1 << 0,
+ ENA_TXQ_STATE_CQ_CREATED = 1 << 1,
+ ENA_TXQ_STATE_SQ_CREATED = 1 << 2,
+ ENA_TXQ_STATE_READY = 1 << 3, /* TxQ ready and waiting */
+ ENA_TXQ_STATE_RUNNING = 1 << 4, /* intrs enabled */
+} ena_txq_state_t;
+
+typedef struct ena_txq_stat {
+ /* Number of times mac_ether_offload_info() has failed. */
+ kstat_named_t ets_hck_meoifail;
+
+ /*
+ * Total number of times the ring was blocked due to
+ * insufficient descriptors, or unblocked due to recycling
+ * descriptors.
+ */
+ kstat_named_t ets_blocked;
+ kstat_named_t ets_unblocked;
+
+	/* The total number of descriptors that have been recycled. */
+ kstat_named_t ets_recycled;
+
+ /*
+ * Number of bytes and packets that have been _submitted_ to
+ * the device.
+ */
+ kstat_named_t ets_bytes;
+ kstat_named_t ets_packets;
+} ena_txq_stat_t;
+
+/*
+ * A transmit queue, made up of a Submission Queue (SQ) and Completion
+ * Queue (CQ) to form a logical descriptor ring for sending packets.
+ *
+ * Write Once (WO)
+ *
+ * This value is written once, before the datapath is activated, in
+ * a function which is controlled by mac(9E). Some values may be
+ * written earlier, during ena attach, like et_ena and
+ * et_sq_num_descs.
+ *
+ * Tx Mutex (TM) -- et_lock
+ *
+ * This value is protected by the Tx queue's mutex. Some values may
+ * be initialized in a WO path, but also continually updated as part
+ * of normal datapath operation, such as et_sq_avail_descs. These
+ * values need mutex protection.
+ */
+typedef struct ena_txq {
+ kmutex_t et_lock; /* WO */
+
+ struct ena *et_ena; /* WO */
+ uint_t et_txqs_idx; /* WO */
+ mac_ring_handle_t et_mrh; /* WO */
+ uint64_t et_m_gen_num; /* TM */
+ ena_txq_state_t et_state; /* WO */
+ uint16_t et_intr_vector; /* WO */
+
+ enahw_tx_desc_t *et_sq_descs; /* TM */
+ ena_dma_buf_t et_sq_dma; /* WO */
+
+ /* Is the Tx queue currently in a blocked state? */
+ boolean_t et_blocked; /* TM */
+
+ /*
+ * The number of descriptors owned by this ring. This value
+ * never changes after initialization.
+ */
+ uint16_t et_sq_num_descs; /* WO */
+
+ /*
+ * The number of descriptors currently available for Tx
+ * submission. When this value reaches zero the ring must
+	 * block until the device notifies us of freed descriptors.
+ */
+ uint16_t et_sq_avail_descs; /* TM */
+
+ /*
+ * The current tail index of the queue (the first free
+ * descriptor for host Tx submission). After initialization,
+ * this value only increments, relying on unsigned wrap
+ * around. The ENA device seems to expect this behavior,
+ * performing its own modulo on the value for the purposes of
+ * indexing, much like the driver code needs to do in order to
+ * access the proper TCB entry.
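+	 * For example, with 128 SQ descriptors a tail value of 0xffff
+	 * selects entry 0xffff & 0x7f == 0x7f, and the next increment
+	 * wraps the counter to 0, selecting entry 0 again.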
+ */
+ uint16_t et_sq_tail_idx; /* TM */
+
+ /*
+ * The phase is used to know which CQ descriptors may be
+ * reclaimed. This is explained further in ena.c.
+ */
+ uint16_t et_sq_phase; /* TM */
+ uint16_t et_sq_hw_idx; /* WO */
+
+ /*
+ * The "doorbell" address is how the host indicates to the
+ * device which descriptors are ready for Tx processing.
+ */
+ uint32_t *et_sq_db_addr; /* WO */
+
+ /*
+ * The TCBs track host Tx information, like a pointer to the
+ * mblk being submitted. Currently we maintain a 1:1 mapping
+ * of SQ descriptors to TCBs as Tx is copy only.
+ */
+ ena_tx_control_block_t *et_tcbs; /* TM */
+
+ enahw_tx_cdesc_t *et_cq_descs; /* TM */
+ ena_dma_buf_t et_cq_dma; /* WO */
+ uint16_t et_cq_num_descs; /* WO */
+ uint16_t et_cq_head_idx; /* TM */
+ uint16_t et_cq_phase; /* TM */
+ uint16_t et_cq_hw_idx; /* WO */
+
+ /*
+ * This address is used to control the CQ interrupts.
+ */
+ uint32_t *et_cq_unmask_addr; /* WO */
+ uint32_t *et_cq_head_db_addr; /* WO (currently unused) */
+ uint32_t *et_cq_numa_addr; /* WO (currently unused) */
+
+ /*
+ * This mutex protects the Tx queue stats. This mutex may be
+ * entered while et_lock is held, but et_lock is not required
+ * to access/modify the stats. However, if both locks are
+ * held, then et_lock must be entered first.
+ */
+ kmutex_t et_stat_lock;
+ ena_txq_stat_t et_stat;
+ kstat_t *et_kstat;
+} ena_txq_t;
+
+typedef enum ena_rxq_state {
+ ENA_RXQ_STATE_NONE = 0,
+ ENA_RXQ_STATE_HOST_ALLOC = 1 << 0,
+ ENA_RXQ_STATE_CQ_CREATED = 1 << 1,
+ ENA_RXQ_STATE_SQ_CREATED = 1 << 2,
+ ENA_RXQ_STATE_READY = 1 << 3, /* RxQ ready and waiting */
+ ENA_RXQ_STATE_RUNNING = 1 << 4, /* intrs enabled */
+} ena_rxq_state_t;
+
+typedef struct ena_rx_ctrl_block {
+ ena_dma_buf_t ercb_dma;
+ uint8_t ercb_offset;
+ uint16_t ercb_length;
+} ena_rx_ctrl_block_t;
+
+typedef enum {
+ ENA_RXQ_MODE_POLLING = 1,
+ ENA_RXQ_MODE_INTR = 2,
+} ena_rxq_mode_t;
+
+typedef struct ena_rxq_stat_t {
+ /* The total number of packets/bytes received on this queue. */
+ kstat_named_t ers_packets;
+ kstat_named_t ers_bytes;
+
+ /*
+ * At this time we expect all incoming frames to fit in a
+	 * single buffer/descriptor. In the rare event that the
+	 * device doesn't cooperate, this stat is incremented.
+ */
+ kstat_named_t ers_multi_desc;
+
+ /*
+ * The total number of times we failed to allocate a new mblk
+ * for an incoming frame.
+ */
+ kstat_named_t ers_allocb_fail;
+
+ /*
+ * The total number of times the Rx interrupt handler reached
+ * its maximum limit for number of packets to process in a
+ * single interrupt. If you see this number increase
+ * continuously at a steady rate, then it may be an indication
+ * the driver is not entering polling mode.
+ */
+ kstat_named_t ers_intr_limit;
+
+ /*
+ * The total number of times the device detected an incorrect
+ * IPv4 header checksum.
+ */
+ kstat_named_t ers_hck_ipv4_err;
+
+ /*
+ * The total number of times the device detected an incorrect
+ * L4/ULP checksum.
+ */
+ kstat_named_t ers_hck_l4_err;
+} ena_rxq_stat_t;
+
+/*
+ * A receive queue, made up of a Submission Queue (SQ) and Completion
+ * Queue (CQ) to form a logical descriptor ring for receiving packets.
+ *
+ * Write Once (WO)
+ *
+ * This value is written once, before the datapath is activated, in
+ * a function which is controlled by mac(9E).
+ *
+ * Rx Mutex (RM) -- er_lock
+ *
+ * This value is protected by the Rx queue's mutex. Some values may
+ * be initialized in a WO path, but also continually updated as part
+ * of normal datapath operation, such as er_sq_avail_descs. These
+ * values need mutex protection.
+ */
+typedef struct ena_rxq {
+ kmutex_t er_lock;
+
+ struct ena *er_ena; /* WO */
+ uint_t er_rxqs_idx; /* WO */
+ mac_ring_handle_t er_mrh; /* WO */
+ uint64_t er_m_gen_num; /* WO */
+ ena_rxq_state_t er_state; /* WO */
+ uint16_t er_intr_vector; /* WO */
+ ena_rxq_mode_t er_mode; /* RM */
+ uint16_t er_intr_limit; /* RM */
+
+ enahw_rx_desc_t *er_sq_descs; /* RM */
+ ena_dma_buf_t er_sq_dma; /* WO */
+ uint16_t er_sq_num_descs; /* WO */
+ uint16_t er_sq_avail_descs; /* RM */
+ uint16_t er_sq_tail_idx; /* RM */
+ uint16_t er_sq_phase; /* RM */
+ uint16_t er_sq_hw_idx; /* WO */
+ uint32_t *er_sq_db_addr; /* WO */
+
+ enahw_rx_cdesc_t *er_cq_descs; /* RM */
+ ena_dma_buf_t er_cq_dma; /* WO */
+ uint16_t er_cq_num_descs; /* WO */
+ uint16_t er_cq_head_idx; /* RM */
+ uint16_t er_cq_phase; /* RM */
+ uint16_t er_cq_hw_idx; /* WO */
+ uint32_t *er_cq_unmask_addr; /* WO */
+ uint32_t *er_cq_head_db_addr; /* WO (currently unused) */
+ uint32_t *er_cq_numa_addr; /* WO (currently unused) */
+
+ ena_rx_ctrl_block_t *er_rcbs; /* RM */
+
+ kmutex_t er_stat_lock;
+ ena_rxq_stat_t er_stat;
+ kstat_t *er_kstat;
+} ena_rxq_t;
+
+/* These are stats based off of enahw_resp_basic_stats_t. */
+typedef struct ena_basic_stat {
+ kstat_named_t ebs_tx_bytes;
+ kstat_named_t ebs_tx_pkts;
+ kstat_named_t ebs_tx_drops;
+
+ kstat_named_t ebs_rx_bytes;
+ kstat_named_t ebs_rx_pkts;
+ kstat_named_t ebs_rx_drops;
+} ena_basic_stat_t;
+
+/* These are stats based off of enahw_resp_eni_stats_t. */
+typedef struct ena_extended_stat {
+ kstat_named_t ees_bw_in_exceeded;
+ kstat_named_t ees_bw_out_exceeded;
+ kstat_named_t ees_pps_exceeded;
+ kstat_named_t ees_conns_exceeded;
+ kstat_named_t ees_linklocal_exceeded;
+} ena_extended_stat_t;
+
+/* These stats monitor which AENQ handlers have been called. */
+typedef struct ena_aenq_stat {
+ kstat_named_t eaes_default;
+ kstat_named_t eaes_link_change;
+} ena_aenq_stat_t;
+
+#define ENA_STATE_PRIMORDIAL 0x1u
+#define ENA_STATE_RUNNING 0x2u
+
+/*
+ * This structure contains the per-instance (PF or VF) state of the
+ * device.
+ */
+typedef struct ena {
+ dev_info_t *ena_dip;
+ int ena_instance;
+
+ /*
+ * Global lock, used to synchronize administration changes to
+ * the ena_t. This lock should not be held in the datapath.
+ */
+ kmutex_t ena_lock;
+ ena_attach_seq_t ena_attach_seq;
+
+ /*
+ * We use atomic ops for ena_state so that datapath consumers
+ * do not need to enter ena_lock.
+ */
+ uint32_t ena_state;
+
+ /*
+ * PCI config space and BAR handle.
+ */
+ ddi_acc_handle_t ena_pci_hdl;
+ off_t ena_reg_size;
+ caddr_t ena_reg_base;
+ ddi_device_acc_attr_t ena_reg_attr;
+ ddi_acc_handle_t ena_reg_hdl;
+
+ /*
+ * Vendor information.
+ */
+ uint16_t ena_pci_vid;
+ uint16_t ena_pci_did;
+ uint8_t ena_pci_rev;
+ uint16_t ena_pci_svid;
+ uint16_t ena_pci_sdid;
+
+ /*
+ * Device and controller versions.
+ */
+ uint32_t ena_dev_major_vsn;
+ uint32_t ena_dev_minor_vsn;
+ uint32_t ena_ctrl_major_vsn;
+ uint32_t ena_ctrl_minor_vsn;
+ uint32_t ena_ctrl_subminor_vsn;
+ uint32_t ena_ctrl_impl_id;
+
+ /*
+ * Interrupts
+ */
+ int ena_num_intrs;
+ ddi_intr_handle_t *ena_intr_handles;
+ size_t ena_intr_handles_sz;
+ int ena_intr_caps;
+ uint_t ena_intr_pri;
+
+ mac_handle_t ena_mh;
+
+ size_t ena_page_sz;
+
+ /*
+ * The MTU and data layer frame sizes.
+ */
+ uint32_t ena_mtu;
+ uint32_t ena_max_frame_hdr;
+ uint32_t ena_max_frame_total;
+
+ /* The size (in bytes) of the Rx/Tx data buffers. */
+ uint32_t ena_tx_buf_sz;
+ uint32_t ena_rx_buf_sz;
+
+ /*
+ * The maximum number of Scatter Gather List segments the
+ * device can address.
+ */
+ uint8_t ena_tx_sgl_max_sz;
+ uint8_t ena_rx_sgl_max_sz;
+
+ /* The number of descriptors per Rx/Tx queue. */
+ uint16_t ena_rxq_num_descs;
+ uint16_t ena_txq_num_descs;
+
+ /*
+ * The maximum number of frames which may be read per Rx
+ * interrupt.
+ */
+ uint16_t ena_rxq_intr_limit;
+
+ /* The Rx/Tx data queues (rings). */
+ ena_rxq_t *ena_rxqs;
+ uint16_t ena_num_rxqs;
+ ena_txq_t *ena_txqs;
+ uint16_t ena_num_txqs;
+
+ /* These statistics are device-wide. */
+ kstat_t *ena_device_basic_kstat;
+ kstat_t *ena_device_extended_kstat;
+
+ /*
+	 * This tracks AENQ-related stats; it is implicitly
+ * device-wide.
+ */
+ ena_aenq_stat_t ena_aenq_stat;
+ kstat_t *ena_aenq_kstat;
+
+ /*
+	 * The Admin Queue, through which all device commands are
+ * sent.
+ */
+ ena_adminq_t ena_aq;
+
+ ena_aenq_t ena_aenq;
+ ena_dma_buf_t ena_host_info;
+
+ /*
+ * Hardware info
+ */
+ uint32_t ena_supported_features;
+ uint8_t ena_dma_width;
+ boolean_t ena_link_up;
+ boolean_t ena_link_autoneg;
+ boolean_t ena_link_full_duplex;
+ link_duplex_t ena_link_duplex;
+ uint64_t ena_link_speed_mbits;
+ enahw_link_speeds_t ena_link_speeds;
+ link_state_t ena_link_state;
+ uint32_t ena_aenq_supported_groups;
+ uint32_t ena_aenq_enabled_groups;
+
+ uint32_t ena_tx_max_sq_num;
+ uint32_t ena_tx_max_sq_num_descs;
+ uint32_t ena_tx_max_cq_num;
+ uint32_t ena_tx_max_cq_num_descs;
+ uint16_t ena_tx_max_desc_per_pkt;
+ uint32_t ena_tx_max_hdr_len;
+
+ uint32_t ena_rx_max_sq_num;
+ uint32_t ena_rx_max_sq_num_descs;
+ uint32_t ena_rx_max_cq_num;
+ uint32_t ena_rx_max_cq_num_descs;
+ uint16_t ena_rx_max_desc_per_pkt;
+
+ /* This is calculated from the Rx/Tx queue nums. */
+ uint16_t ena_max_io_queues;
+
+ /* Hardware Offloads */
+ boolean_t ena_tx_l3_ipv4_csum;
+
+ boolean_t ena_tx_l4_ipv4_part_csum;
+ boolean_t ena_tx_l4_ipv4_full_csum;
+ boolean_t ena_tx_l4_ipv4_lso;
+
+ boolean_t ena_tx_l4_ipv6_part_csum;
+ boolean_t ena_tx_l4_ipv6_full_csum;
+ boolean_t ena_tx_l4_ipv6_lso;
+
+ boolean_t ena_rx_l3_ipv4_csum;
+ boolean_t ena_rx_l4_ipv4_csum;
+ boolean_t ena_rx_l4_ipv6_csum;
+ boolean_t ena_rx_hash;
+
+ uint32_t ena_max_mtu;
+ uint8_t ena_mac_addr[ETHERADDRL];
+} ena_t;
+
+/*
+ * Logging functions.
+ */
+/*PRINTFLIKE2*/
+extern void ena_err(const ena_t *, const char *, ...) __KPRINTFLIKE(2);
+/*PRINTFLIKE2*/
+extern void ena_dbg(const ena_t *, const char *, ...) __KPRINTFLIKE(2);
+
+extern uint32_t ena_hw_bar_read32(const ena_t *, const uint16_t);
+extern uint32_t ena_hw_abs_read32(const ena_t *, uint32_t *);
+extern void ena_hw_bar_write32(const ena_t *, const uint16_t, const uint32_t);
+extern void ena_hw_abs_write32(const ena_t *, uint32_t *, const uint32_t);
+
+/*
+ * Stats
+ */
+extern void ena_stat_device_basic_cleanup(ena_t *);
+extern boolean_t ena_stat_device_basic_init(ena_t *);
+
+extern void ena_stat_device_extended_cleanup(ena_t *);
+extern boolean_t ena_stat_device_extended_init(ena_t *);
+
+extern void ena_stat_aenq_cleanup(ena_t *);
+extern boolean_t ena_stat_aenq_init(ena_t *);
+
+extern void ena_stat_rxq_cleanup(ena_rxq_t *);
+extern boolean_t ena_stat_rxq_init(ena_rxq_t *);
+extern void ena_stat_txq_cleanup(ena_txq_t *);
+extern boolean_t ena_stat_txq_init(ena_txq_t *);
+
+/*
+ * DMA
+ */
+extern boolean_t ena_dma_alloc(ena_t *, ena_dma_buf_t *, ena_dma_conf_t *,
+ size_t);
+extern void ena_dma_free(ena_dma_buf_t *);
+extern void ena_set_dma_addr(const ena_t *, const uint64_t, enahw_addr_t *);
+extern void ena_set_dma_addr_values(const ena_t *, const uint64_t, uint32_t *,
+ uint16_t *);
+
+/*
+ * Interrupts
+ */
+extern boolean_t ena_intr_add_handlers(ena_t *);
+extern void ena_intr_remove_handlers(ena_t *);
+extern void ena_tx_intr_work(ena_txq_t *);
+extern void ena_rx_intr_work(ena_rxq_t *);
+extern void ena_aenq_work(ena_t *);
+extern boolean_t ena_intrs_disable(ena_t *);
+extern boolean_t ena_intrs_enable(ena_t *);
+
+/*
+ * MAC
+ */
+extern boolean_t ena_mac_register(ena_t *);
+extern int ena_mac_unregister(ena_t *);
+extern void ena_ring_tx_stop(mac_ring_driver_t);
+extern int ena_ring_tx_start(mac_ring_driver_t, uint64_t);
+extern mblk_t *ena_ring_tx(void *, mblk_t *);
+extern void ena_ring_rx_stop(mac_ring_driver_t);
+extern int ena_ring_rx_start(mac_ring_driver_t rh, uint64_t gen_num);
+extern int ena_m_stat(void *, uint_t, uint64_t *);
+extern mblk_t *ena_ring_rx_poll(void *, int);
+extern int ena_ring_rx_stat(mac_ring_driver_t, uint_t, uint64_t *);
+extern int ena_ring_tx_stat(mac_ring_driver_t, uint_t, uint64_t *);
+
+/*
+ * Admin API
+ */
+extern int ena_admin_submit_cmd(ena_t *, enahw_cmd_desc_t *,
+ enahw_resp_desc_t *, ena_cmd_ctx_t **);
+extern int ena_admin_poll_for_resp(ena_t *, ena_cmd_ctx_t *);
+extern void ena_free_host_info(ena_t *);
+extern boolean_t ena_init_host_info(ena_t *);
+extern int ena_create_cq(ena_t *, uint16_t, uint64_t, boolean_t, uint32_t,
+ uint16_t *, uint32_t **, uint32_t **, uint32_t **);
+extern int ena_destroy_cq(ena_t *, uint16_t);
+extern int ena_create_sq(ena_t *, uint16_t, uint64_t, boolean_t, uint16_t,
+ uint16_t *, uint32_t **);
+extern int ena_destroy_sq(ena_t *, uint16_t, boolean_t);
+extern int ena_set_feature(ena_t *, enahw_cmd_desc_t *,
+ enahw_resp_desc_t *, const enahw_feature_id_t, const uint8_t);
+extern int ena_get_feature(ena_t *, enahw_resp_desc_t *,
+ const enahw_feature_id_t, const uint8_t);
+extern int ena_admin_get_basic_stats(ena_t *, enahw_resp_desc_t *);
+extern int ena_admin_get_eni_stats(ena_t *, enahw_resp_desc_t *);
+extern int enahw_resp_status_to_errno(ena_t *, enahw_resp_status_t);
+
+/*
+ * Rx/Tx allocations
+ */
+extern boolean_t ena_alloc_rxq(ena_rxq_t *);
+extern void ena_cleanup_rxq(ena_rxq_t *);
+extern boolean_t ena_alloc_txq(ena_txq_t *);
+extern void ena_cleanup_txq(ena_txq_t *);
+
+extern ena_aenq_grpstr_t ena_groups_str[];
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ENA_H */
diff --git a/usr/src/uts/common/io/ena/ena_admin.c b/usr/src/uts/common/io/ena/ena_admin.c
new file mode 100644
index 0000000000..55e5b48901
--- /dev/null
+++ b/usr/src/uts/common/io/ena/ena_admin.c
@@ -0,0 +1,674 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2021 Oxide Computer Company
+ */
+
+/*
+ * This file contains everything having to do with communicating with
+ * the admin queue for sending commands to the device.
+ */
+
+#include "ena_hw.h"
+#include "ena.h"
+
+/*
+ * Mark the context as complete (a response has been received).
+ */
+static void
+ena_complete_cmd_ctx(ena_cmd_ctx_t *ctx, enahw_resp_desc_t *hwresp)
+{
+ bcopy(hwresp, ctx->ectx_resp, sizeof (*hwresp));
+ ctx->ectx_pending = B_FALSE;
+}
+
+/*
+ * Reset and release the context back to the free list.
+ */
+static void
+ena_release_cmd_ctx(ena_t *ena, ena_cmd_ctx_t *ctx)
+{
+ ASSERT(ctx->ectx_pending == B_FALSE);
+ ctx->ectx_resp = NULL;
+ ctx->ectx_cmd_opcode = ENAHW_CMD_NONE;
+
+ mutex_enter(&ena->ena_aq.ea_sq_lock);
+ list_insert_head(&ena->ena_aq.ea_cmd_ctxs_free, ctx);
+ ena->ena_aq.ea_pending_cmds--;
+ mutex_exit(&ena->ena_aq.ea_sq_lock);
+}
+
+/*
+ * Acquire the next available command context.
+ */
+static ena_cmd_ctx_t *
+ena_acquire_cmd_ctx(ena_adminq_t *aq)
+{
+ VERIFY(MUTEX_HELD(&aq->ea_sq_lock));
+ ASSERT3U(aq->ea_pending_cmds, <, aq->ea_qlen);
+ ena_cmd_ctx_t *ctx = list_remove_head(&aq->ea_cmd_ctxs_free);
+
+ ctx->ectx_pending = B_TRUE;
+ return (ctx);
+}
+
+/*
+ * Submit a command to the admin queue.
+ */
+int
+ena_admin_submit_cmd(ena_t *ena, enahw_cmd_desc_t *cmd, enahw_resp_desc_t *resp,
+ ena_cmd_ctx_t **ctx)
+{
+ VERIFY3U(cmd->ecd_opcode, !=, 0);
+ ena_adminq_t *aq = &ena->ena_aq;
+ ena_admin_sq_t *sq = &aq->ea_sq;
+ uint16_t modulo_mask = aq->ea_qlen - 1;
+ ena_cmd_ctx_t *lctx = NULL;
+
+ mutex_enter(&aq->ea_sq_lock);
+ uint16_t tail_mod = sq->eas_tail & modulo_mask;
+
+ if (aq->ea_pending_cmds >= aq->ea_qlen) {
+ mutex_enter(&aq->ea_stat_lock);
+ aq->ea_stats.queue_full++;
+ mutex_exit(&aq->ea_stat_lock);
+ mutex_exit(&aq->ea_sq_lock);
+ return (ENOSPC);
+ }
+
+ lctx = ena_acquire_cmd_ctx(aq);
+ lctx->ectx_cmd_opcode = cmd->ecd_opcode;
+ lctx->ectx_resp = resp;
+
+ cmd->ecd_flags = sq->eas_phase & ENAHW_CMD_PHASE_MASK;
+ ENAHW_CMD_ID(cmd, lctx->ectx_id);
+ bcopy(cmd, &sq->eas_entries[tail_mod], sizeof (*cmd));
+ ENA_DMA_SYNC(sq->eas_dma, DDI_DMA_SYNC_FORDEV);
+ sq->eas_tail++;
+ aq->ea_pending_cmds++;
+
+ mutex_enter(&aq->ea_stat_lock);
+ aq->ea_stats.cmds_submitted++;
+ mutex_exit(&aq->ea_stat_lock);
+
+ DTRACE_PROBE4(cmd__submit, enahw_cmd_desc_t *, cmd, ena_cmd_ctx_t *,
+ lctx, uint16_t, tail_mod, uint8_t, sq->eas_phase);
+
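+	/*
+	 * The phase bit written into each descriptor is inverted every
+	 * time the tail wraps back to the start of the ring, letting
+	 * the consumer distinguish freshly written entries from those
+	 * of the previous lap.
+	 */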
+ if ((sq->eas_tail & modulo_mask) == 0) {
+ sq->eas_phase = !sq->eas_phase;
+ }
+
+ ena_hw_abs_write32(ena, sq->eas_dbaddr, sq->eas_tail);
+ mutex_exit(&aq->ea_sq_lock);
+ *ctx = lctx;
+ return (0);
+}
+
+/*
+ * Read a single response from the admin queue.
+ */
+static void
+ena_admin_read_resp(ena_t *ena, enahw_resp_desc_t *hwresp)
+{
+ ena_adminq_t *aq = &ena->ena_aq;
+ ena_admin_cq_t *cq = &aq->ea_cq;
+ ena_cmd_ctx_t *ctx = NULL;
+ uint16_t modulo_mask = aq->ea_qlen - 1;
+ VERIFY(MUTEX_HELD(&aq->ea_cq_lock));
+
+ uint16_t head_mod = cq->eac_head & modulo_mask;
+ uint8_t phase = cq->eac_phase & ENAHW_RESP_PHASE_MASK;
+ uint16_t cmd_id = ENAHW_RESP_CMD_ID(hwresp);
+ ctx = &aq->ea_cmd_ctxs[cmd_id];
+ ASSERT3U(ctx->ectx_id, ==, cmd_id);
+ ena_complete_cmd_ctx(ctx, hwresp);
+
+ if (hwresp->erd_status != ENAHW_RESP_SUCCESS) {
+ mutex_enter(&aq->ea_stat_lock);
+ aq->ea_stats.cmds_fail++;
+ mutex_exit(&aq->ea_stat_lock);
+ DTRACE_PROBE4(cmd__fail, enahw_resp_desc_t *, hwresp,
+ ena_cmd_ctx_t *, ctx, uint16_t, head_mod, uint8_t, phase);
+ return;
+ }
+
+ DTRACE_PROBE4(cmd__success, enahw_resp_desc_t *, hwresp,
+ ena_cmd_ctx_t *, ctx, uint16_t, head_mod, uint8_t, phase);
+ mutex_enter(&aq->ea_stat_lock);
+ aq->ea_stats.cmds_success++;
+ mutex_exit(&aq->ea_stat_lock);
+}
+
+static void
+ena_admin_process_responses(ena_t *ena)
+{
+ ena_adminq_t *aq = &ena->ena_aq;
+ ena_admin_cq_t *cq = &aq->ea_cq;
+ uint16_t modulo_mask = aq->ea_qlen - 1;
+ enahw_resp_desc_t *hwresp;
+
+ mutex_enter(&aq->ea_cq_lock);
+ uint16_t head_mod = cq->eac_head & modulo_mask;
+ uint8_t phase = cq->eac_phase & ENAHW_RESP_PHASE_MASK;
+
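+	/*
+	 * A completion entry is ours to read only while its phase bit
+	 * matches the CQ's current phase; entries left over from the
+	 * previous lap carry the opposite phase and terminate the
+	 * loop.
+	 */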
+ ENA_DMA_SYNC(cq->eac_dma, DDI_DMA_SYNC_FORKERNEL);
+ hwresp = &cq->eac_entries[head_mod];
+ while ((hwresp->erd_flags & ENAHW_RESP_PHASE_MASK) == phase) {
+ ena_admin_read_resp(ena, hwresp);
+
+ cq->eac_head++;
+ head_mod = cq->eac_head & modulo_mask;
+
+ if (head_mod == 0) {
+ phase = !phase;
+ }
+
+ hwresp = &cq->eac_entries[head_mod];
+ }
+
+ cq->eac_phase = phase;
+ mutex_exit(&aq->ea_cq_lock);
+}
+
+/*
+ * Wait for the command described by ctx to complete by polling for
+ * status updates.
+ */
+int
+ena_admin_poll_for_resp(ena_t *ena, ena_cmd_ctx_t *ctx)
+{
+ int ret = 0;
+ hrtime_t expire = gethrtime() + ena->ena_aq.ea_cmd_timeout_ns;
+
+ while (1) {
+ ena_admin_process_responses(ena);
+
+ if (!ctx->ectx_pending) {
+ break;
+ }
+
+ /* Wait for 1 millisecond. */
+ delay(drv_usectohz(1000));
+
+ if (gethrtime() > expire) {
+ /*
+ * We have no visibility into the device to
+ * confirm it is making progress on this
+ * command. At this point the driver and
+ * device cannot agree on the state of the
+ * world: perhaps the device is still making
+ * progress but not fast enough, perhaps the
+ * device completed the command but there was
+ * a failure to deliver the reply, perhaps the
+ * command failed but once again the reply was
+ * not delivered. With this unknown state the
+ * best thing to do is to reset the device and
+ * start from scratch. But as we don't have
+ * that capability at the moment the next best
+ * thing to do is to spin or panic; we choose
+ * to panic.
+ */
+ panic("timed out waiting for admin response");
+ }
+ }
+
+ ret = enahw_resp_status_to_errno(ena, ctx->ectx_resp->erd_status);
+ ena_release_cmd_ctx(ena, ctx);
+ return (ret);
+}
+
+void
+ena_free_host_info(ena_t *ena)
+{
+ ena_dma_free(&ena->ena_host_info);
+}
+
+boolean_t
+ena_init_host_info(ena_t *ena)
+{
+ enahw_host_info_t *ehi;
+ int ret = 0;
+ int *regs;
+ uint_t nregs;
+ ena_dma_buf_t *hi_dma;
+ enahw_cmd_desc_t cmd;
+ enahw_feat_host_attr_t *ha_cmd =
+ &cmd.ecd_cmd.ecd_set_feat.ecsf_feat.ecsf_host_attr;
+ enahw_resp_desc_t resp;
+ ena_dma_conf_t conf = {
+ .edc_size = ENAHW_HOST_INFO_ALLOC_SZ,
+ .edc_align = ENAHW_HOST_INFO_ALIGNMENT,
+ .edc_sgl = 1,
+ .edc_endian = DDI_NEVERSWAP_ACC,
+ .edc_stream = B_FALSE,
+ };
+
+ hi_dma = &ena->ena_host_info;
+
+ if (!ena_dma_alloc(ena, hi_dma, &conf, 4096)) {
+ ena_err(ena, "failed to allocate DMA for host info");
+ return (B_FALSE);
+ }
+
+ ehi = (void *)hi_dma->edb_va;
+ ehi->ehi_ena_spec_version =
+ ((ENA_SPEC_VERSION_MAJOR << ENAHW_HOST_INFO_SPEC_MAJOR_SHIFT) |
+ (ENA_SPEC_VERSION_MINOR));
+
+ ehi->ehi_bdf = 0;
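+	/*
+	 * The bus/device/function is packed into ehi_bdf as bus[15:8],
+	 * device[7:3], function[2:0], matching the shifts below.
+	 */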
+ if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, ena->ena_dip,
+ DDI_PROP_DONTPASS, "reg", &regs, &nregs) == DDI_PROP_SUCCESS) {
+ if (nregs != 0) {
+ ehi->ehi_bdf |= PCI_REG_BUS_G(regs[0]) << 8;
+ ehi->ehi_bdf |= PCI_REG_DEV_G(regs[0]) << 3;
+ ehi->ehi_bdf |= PCI_REG_FUNC_G(regs[0]);
+ }
+
+ ddi_prop_free(regs);
+ }
+
+ /*
+	 * There is no illumos OS type; it would be nice to ping
+ * someone at Amazon and see if we can't get one added.
+ */
+ ehi->ehi_os_type = ENAHW_OS_FREEBSD;
+ ehi->ehi_kernel_ver = 511; /* If you know you know */
+ (void) strlcpy((char *)ehi->ehi_kernel_ver_str, utsname.version,
+ sizeof (ehi->ehi_kernel_ver_str));
+ ehi->ehi_os_dist = 0; /* What everyone else does. */
+ ehi->ehi_driver_ver =
+ (ENA_MODULE_VER_MAJOR) |
+ (ENA_MODULE_VER_MINOR << ENAHW_HOST_INFO_MINOR_SHIFT) |
+ (ENA_MODULE_VER_SUBMINOR << ENAHW_HOST_INFO_SUB_MINOR_SHIFT);
+ ehi->ehi_num_cpus = ncpus_online;
+
+ /*
+ * ENA devices are not created equal. Some will support
+ * features not found in others. This field tells the device
+ * which features the driver supports.
+ *
+ * ENAHW_HOST_INFO_RX_OFFSET
+ *
+ * Some ENA devices will write the frame data at an offset
+ * in the buffer, presumably for alignment purposes. We
+ * support this feature for the sole reason that the Linux
+ * driver does as well.
+ *
+ * ENAHW_HOST_INFO_INTERRUPT_MODERATION
+ *
+ * Based on the Linux history this flag indicates that the
+ * driver "supports interrupt moderation properly". What
+ * that means is anyone's guess. The Linux driver seems to
+ * have some "adaptive" interrupt moderation, so perhaps
+ * it's that? In any case, FreeBSD doesn't bother with
+ * setting this flag, so we'll leave it be for now as well.
+ *
+ * If you're curious to know if the device supports
+ * interrupt moderation: the FEAT_INTERRUPT_MODERATION flag
+ * will be set in ena_hw.eh_supported_features.
+ *
+ * ENAHW_HOST_INFO_RX_BUF_MIRRORING
+ *
+ * Support traffic mirroring by allowing the hypervisor to
+ * read the buffer memory directly. This probably has to do
+ * with AWS flow logs, allowing more efficient mirroring.
+ * But it's hard to say for sure given we only have the
+ * Linux commit log to go off of. In any case, the only
+ * requirement for this feature is that the Rx DMA buffers
+ * be read/write, which they are.
+ *
+ * ENAHW_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY
+ *
+ * The device supports the retrieving and updating of the
+ * RSS function and hash key. As we don't yet implement RSS
+ * this is disabled.
+ */
+ ehi->ehi_driver_supported_features =
+ ENAHW_HOST_INFO_RX_OFFSET_MASK |
+ ENAHW_HOST_INFO_RX_BUF_MIRRORING_MASK;
+
+ ENA_DMA_SYNC(*hi_dma, DDI_DMA_SYNC_FORDEV);
+ bzero(&cmd, sizeof (cmd));
+ ena_set_dma_addr(ena, hi_dma->edb_cookie->dmac_laddress,
+ &ha_cmd->efha_os_addr);
+
+ /*
+ * You might notice the "debug area" is not allocated or
+ * configured, that is on purpose.
+ *
+ * The "debug area" is a region of host memory that contains
+ * the String Set (SS) tables used to report statistics to
+ * tools like ethtool (on Linux). This table consists of one
+ * of more entries of a 32-byte string (the name of the
+	 * or more entries of a 32-byte string (the name of the
+ * stats reported here contain both the host-side stats as
+ * well as device-reported stats (ENAHW_GET_STATS_TYPE_ENI). I
+ * believe the reason for calling it the "debug area" is that
+ * it can be accessed from outside of the guest, allowing an
+ * AWS user (?) or Amazon employee to get basic information
+ * about the state of the device from the guest's point of
+ * view.
+ *
+ * In the fullness of time, our driver should probably support
+ * this aspect of ENA. For the time being, all testing
+ * indicates the driver and device function fine without it.
+ */
+
+ ret = ena_set_feature(ena, &cmd, &resp, ENAHW_FEAT_HOST_ATTR_CONFIG,
+ ENAHW_FEAT_HOST_ATTR_CONFIG_VER);
+ if (ret != 0) {
+ ena_err(ena, "failed to set host attributes: %d", ret);
+ ena_dma_free(hi_dma);
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+}
+
+int
+ena_create_cq(ena_t *ena, uint16_t num_descs, uint64_t phys_addr,
+ boolean_t is_tx, uint32_t vector, uint16_t *hw_index,
+ uint32_t **unmask_addr, uint32_t **headdb, uint32_t **numanode)
+{
+ int ret;
+ enahw_cmd_desc_t cmd;
+ enahw_cmd_create_cq_t *cmd_cq = &cmd.ecd_cmd.ecd_create_cq;
+ enahw_resp_desc_t resp;
+ enahw_resp_create_cq_t *resp_cq = &resp.erd_resp.erd_create_cq;
+ ena_cmd_ctx_t *ctx = NULL;
+ uint8_t desc_size = is_tx ? sizeof (enahw_tx_cdesc_t) :
+ sizeof (enahw_rx_cdesc_t);
+
+ bzero(&cmd, sizeof (cmd));
+ bzero(&resp, sizeof (resp));
+
+ cmd.ecd_opcode = ENAHW_CMD_CREATE_CQ;
+ ENAHW_CMD_CREATE_CQ_INTERRUPT_MODE_ENABLE(cmd_cq);
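+	/*
+	 * The completion descriptor size is passed to the device in
+	 * 32-bit words, hence the assertion and the division by four.
+	 */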
+ ASSERT3U(desc_size % 4, ==, 0);
+ ENAHW_CMD_CREATE_CQ_DESC_SIZE_WORDS(cmd_cq, desc_size / 4);
+ cmd_cq->ecq_num_descs = num_descs;
+ cmd_cq->ecq_msix_vector = vector;
+ ena_set_dma_addr(ena, phys_addr, &cmd_cq->ecq_addr);
+
+ if ((ret = ena_admin_submit_cmd(ena, &cmd, &resp, &ctx)) != 0) {
+ ena_err(ena, "failed to submit Create CQ command: %d", ret);
+ return (ret);
+ }
+
+ if ((ret = ena_admin_poll_for_resp(ena, ctx)) != 0) {
+ ena_err(ena, "failed to Create CQ: %d", ret);
+ return (ret);
+ }
+
+ *hw_index = resp_cq->ercq_idx;
+ *unmask_addr = (uint32_t *)(ena->ena_reg_base +
+ resp_cq->ercq_interrupt_mask_reg_offset);
+
+ if (resp_cq->ercq_head_db_reg_offset != 0) {
+ *headdb = (uint32_t *)(ena->ena_reg_base +
+ resp_cq->ercq_head_db_reg_offset);
+ } else {
+ *headdb = NULL;
+ }
+
+ if (resp_cq->ercq_numa_node_reg_offset != 0) {
+ *numanode = (uint32_t *)(ena->ena_reg_base +
+ resp_cq->ercq_numa_node_reg_offset);
+ } else {
+ *numanode = NULL;
+ }
+
+ return (0);
+}
+
+int
+ena_destroy_cq(ena_t *ena, uint16_t hw_idx)
+{
+ enahw_cmd_desc_t cmd;
+ enahw_resp_desc_t resp;
+ ena_cmd_ctx_t *ctx = NULL;
+ int ret;
+
+ bzero(&cmd, sizeof (cmd));
+ bzero(&resp, sizeof (resp));
+ cmd.ecd_opcode = ENAHW_CMD_DESTROY_CQ;
+ cmd.ecd_cmd.ecd_destroy_cq.edcq_idx = hw_idx;
+
+ if ((ret = ena_admin_submit_cmd(ena, &cmd, &resp, &ctx)) != 0) {
+ ena_err(ena, "failed to submit Destroy CQ command: %d", ret);
+ return (ret);
+ }
+
+ if ((ret = ena_admin_poll_for_resp(ena, ctx)) != 0) {
+ ena_err(ena, "failed to Destroy CQ: %d", ret);
+ return (ret);
+ }
+
+ return (0);
+}
+
+int
+ena_create_sq(ena_t *ena, uint16_t num_descs, uint64_t phys_addr,
+ boolean_t is_tx, uint16_t cq_index, uint16_t *hw_index, uint32_t **db_addr)
+{
+ int ret;
+ enahw_cmd_desc_t cmd;
+ enahw_cmd_create_sq_t *cmd_sq = &cmd.ecd_cmd.ecd_create_sq;
+ enahw_resp_desc_t resp;
+ enahw_resp_create_sq_t *resp_sq = &resp.erd_resp.erd_create_sq;
+ enahw_sq_direction_t dir =
+ is_tx ? ENAHW_SQ_DIRECTION_TX : ENAHW_SQ_DIRECTION_RX;
+ ena_cmd_ctx_t *ctx = NULL;
+
+	if (!ISP2(num_descs)) {
+		ena_err(ena, "the number of descs must be a power of 2, but "
+		    "is %d", num_descs);
+		return (EINVAL);
+	}
+
+ bzero(&cmd, sizeof (cmd));
+ bzero(&resp, sizeof (resp));
+ cmd.ecd_opcode = ENAHW_CMD_CREATE_SQ;
+ ENAHW_CMD_CREATE_SQ_DIR(cmd_sq, dir);
+ ENAHW_CMD_CREATE_SQ_PLACEMENT_POLICY(cmd_sq,
+ ENAHW_PLACEMENT_POLICY_HOST);
+ ENAHW_CMD_CREATE_SQ_COMPLETION_POLICY(cmd_sq,
+ ENAHW_COMPLETION_POLICY_DESC);
+ /*
+ * We limit all SQ descriptor rings to an SGL of 1, therefore
+ * they are always physically contiguous.
+ */
+ ENAHW_CMD_CREATE_SQ_PHYSMEM_CONTIG(cmd_sq);
+ cmd_sq->ecsq_cq_idx = cq_index;
+ cmd_sq->ecsq_num_descs = num_descs;
+
+ /*
+ * If we ever use a non-host placement policy, then guard this
+ * code against placement type (this value should not be set
+ * for device placement).
+ */
+ ena_set_dma_addr(ena, phys_addr, &cmd_sq->ecsq_base);
+
+ if ((ret = ena_admin_submit_cmd(ena, &cmd, &resp, &ctx)) != 0) {
+ ena_err(ena, "failed to submit Create SQ command: %d", ret);
+ return (ret);
+ }
+
+ if ((ret = ena_admin_poll_for_resp(ena, ctx)) != 0) {
+ ena_err(ena, "failed to Create SQ: %d", ret);
+ return (ret);
+ }
+
+ *hw_index = resp_sq->ersq_idx;
+ *db_addr = (uint32_t *)(ena->ena_reg_base +
+ resp_sq->ersq_db_reg_offset);
+ return (0);
+}
+
+int
+ena_destroy_sq(ena_t *ena, uint16_t hw_idx, boolean_t is_tx)
+{
+ enahw_cmd_desc_t cmd;
+ enahw_cmd_destroy_sq_t *cmd_sq = &cmd.ecd_cmd.ecd_destroy_sq;
+ enahw_resp_desc_t resp;
+ ena_cmd_ctx_t *ctx = NULL;
+ int ret;
+
+ bzero(&cmd, sizeof (cmd));
+ bzero(&resp, sizeof (resp));
+ cmd.ecd_opcode = ENAHW_CMD_DESTROY_SQ;
+ cmd_sq->edsq_idx = hw_idx;
+ ENAHW_CMD_DESTROY_SQ_DIR(cmd_sq, is_tx);
+
+ if ((ret = ena_admin_submit_cmd(ena, &cmd, &resp, &ctx)) != 0) {
+ ena_err(ena, "failed to submit Destroy SQ command: %d", ret);
+ return (ret);
+ }
+
+ if ((ret = ena_admin_poll_for_resp(ena, ctx)) != 0) {
+ ena_err(ena, "failed Destroy SQ: %d", ret);
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * Determine if a given feature is available on this device.
+ */
+static boolean_t
+ena_is_feature_avail(ena_t *ena, const enahw_feature_id_t feat_id)
+{
+ VERIFY3U(feat_id, <=, ENAHW_FEAT_NUM);
+ uint32_t mask = 1U << feat_id;
+
+ /*
+ * The device attributes feature is always supported, as
+ * indicated by the common code.
+ */
+ if (feat_id == ENAHW_FEAT_DEVICE_ATTRIBUTES) {
+ return (B_TRUE);
+ }
+
+ return ((ena->ena_supported_features & mask) != 0);
+}
+
+int
+ena_set_feature(ena_t *ena, enahw_cmd_desc_t *cmd, enahw_resp_desc_t *resp,
+ const enahw_feature_id_t feat_id, const uint8_t feat_ver)
+{
+ enahw_cmd_set_feat_t *cmd_sf = &cmd->ecd_cmd.ecd_set_feat;
+ ena_cmd_ctx_t *ctx = NULL;
+ int ret = 0;
+
+ if (!ena_is_feature_avail(ena, feat_id)) {
+ ena_err(ena, "attempted to set unsupported feature: 0x%x %d"
+ " (0x%x)", feat_id, feat_ver, ena->ena_supported_features);
+ return (ENOTSUP);
+ }
+
+ cmd->ecd_opcode = ENAHW_CMD_SET_FEATURE;
+ cmd_sf->ecsf_comm.efc_id = feat_id;
+ cmd_sf->ecsf_comm.efc_version = feat_ver;
+ cmd_sf->ecsf_comm.efc_flags = 0;
+
+ if ((ret = ena_admin_submit_cmd(ena, cmd, resp, &ctx)) != 0) {
+ ena_err(ena, "failed to submit Set Feature command: %d", ret);
+ return (ret);
+ }
+
+ return (ena_admin_poll_for_resp(ena, ctx));
+}
+
+int
+ena_get_feature(ena_t *ena, enahw_resp_desc_t *resp,
+ const enahw_feature_id_t feat_id, const uint8_t feat_ver)
+{
+ enahw_cmd_desc_t cmd;
+ enahw_cmd_get_feat_t *cmd_gf = &cmd.ecd_cmd.ecd_get_feat;
+ ena_cmd_ctx_t *ctx = NULL;
+ int ret = 0;
+
+ if (!ena_is_feature_avail(ena, feat_id)) {
+ return (ENOTSUP);
+ }
+
+ bzero(&cmd, sizeof (cmd));
+ cmd.ecd_opcode = ENAHW_CMD_GET_FEATURE;
+ cmd_gf->ecgf_comm.efc_id = feat_id;
+ cmd_gf->ecgf_comm.efc_version = feat_ver;
+ ENAHW_GET_FEAT_FLAGS_GET_CURR_VAL(cmd_gf);
+
+ if ((ret = ena_admin_submit_cmd(ena, &cmd, resp, &ctx)) != 0) {
+ ena_err(ena, "failed to submit Get Feature command: %d", ret);
+ return (ret);
+ }
+
+ return (ena_admin_poll_for_resp(ena, ctx));
+}
+
+int
+ena_admin_get_basic_stats(ena_t *ena, enahw_resp_desc_t *resp)
+{
+ int ret = 0;
+ enahw_cmd_desc_t cmd;
+ enahw_cmd_get_stats_t *cmd_stats = &cmd.ecd_cmd.ecd_get_stats;
+ ena_cmd_ctx_t *ctx = NULL;
+
+ bzero(&cmd, sizeof (cmd));
+ bzero(resp, sizeof (*resp));
+ cmd.ecd_opcode = ENAHW_CMD_GET_STATS;
+ cmd_stats->ecgs_type = ENAHW_GET_STATS_TYPE_BASIC;
+ cmd_stats->ecgs_scope = ENAHW_GET_STATS_SCOPE_ETH;
+ cmd_stats->ecgs_device_id = ENAHW_CMD_GET_STATS_MY_DEVICE_ID;
+
+ if ((ret = ena_admin_submit_cmd(ena, &cmd, resp, &ctx)) != 0) {
+ ena_err(ena, "failed to submit Get Basic Stats command: %d",
+ ret);
+ return (ret);
+ }
+
+ if ((ret = ena_admin_poll_for_resp(ena, ctx)) != 0) {
+ ena_err(ena, "failed to Get Basic Stats: %d", ret);
+ return (ret);
+ }
+
+ return (0);
+}
+
+int
+ena_admin_get_eni_stats(ena_t *ena, enahw_resp_desc_t *resp)
+{
+ int ret = 0;
+ enahw_cmd_desc_t cmd;
+ enahw_cmd_get_stats_t *cmd_stats = &cmd.ecd_cmd.ecd_get_stats;
+ ena_cmd_ctx_t *ctx = NULL;
+
+ bzero(&cmd, sizeof (cmd));
+ bzero(resp, sizeof (*resp));
+ cmd.ecd_opcode = ENAHW_CMD_GET_STATS;
+ cmd_stats->ecgs_type = ENAHW_GET_STATS_TYPE_ENI;
+ cmd_stats->ecgs_scope = ENAHW_GET_STATS_SCOPE_ETH;
+ cmd_stats->ecgs_device_id = ENAHW_CMD_GET_STATS_MY_DEVICE_ID;
+
+ if ((ret = ena_admin_submit_cmd(ena, &cmd, resp, &ctx)) != 0) {
+ ena_err(ena, "failed to submit Get ENI Stats command: %d", ret);
+ return (ret);
+ }
+
+ if ((ret = ena_admin_poll_for_resp(ena, ctx)) != 0) {
+ ena_err(ena, "failed to Get ENI Stats: %d", ret);
+ return (ret);
+ }
+
+ return (0);
+}
diff --git a/usr/src/uts/common/io/ena/ena_dma.c b/usr/src/uts/common/io/ena/ena_dma.c
new file mode 100644
index 0000000000..48f39b9dbb
--- /dev/null
+++ b/usr/src/uts/common/io/ena/ena_dma.c
@@ -0,0 +1,191 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2021 Oxide Computer Company
+ */
+
+#include "ena.h"
+
+/*
+ * Create DMA attributes based on the conf parameter.
+ */
+void
+ena_dma_attr(const ena_t *ena, ddi_dma_attr_t *attrp,
+ const ena_dma_conf_t *conf)
+{
+ bzero(attrp, sizeof (*attrp));
+
+ /*
+ * Round up maximums to next page. This is what the Linux and
+	 * Round up maximums to the next page. This is what the Linux and
+	 * FreeBSD drivers do, so we follow suit.
+ const size_t size_up =
+ P2ROUNDUP_TYPED(conf->edc_size, ena->ena_page_sz, size_t);
+
+ attrp->dma_attr_version = DMA_ATTR_V0;
+
+ /*
+ * The device tells us the window it supports in terms of
+ * number of bits, we convert that to the appropriate mask.
+ */
+ ASSERT3U(ena->ena_dma_width, >=, 32);
+ ASSERT3U(ena->ena_dma_width, <=, 48);
+ attrp->dma_attr_addr_lo = 0x0;
+ attrp->dma_attr_addr_hi = ENA_DMA_BIT_MASK(ena->ena_dma_width);
+
+ /*
+ * This indicates the amount of data that can fit in one
+ * cookie/segment. We allow the entire object to live in one
+ * segment, when possible.
+ *
+ * NOTE: This value must be _one less_ than the desired max
+ * (i.e. a value of 4095 indicates a max of 4096).
+ */
+ attrp->dma_attr_count_max = size_up - 1;
+
+ /*
+ * The alignment of the starting address.
+ */
+ attrp->dma_attr_align = conf->edc_align;
+
+ /*
+ * The segment boundary dictates the address which a segment
+ * cannot cross. In this case there is no boundary.
+ */
+ attrp->dma_attr_seg = UINT64_MAX;
+
+ /*
+ * Allow a burst size of the entire object.
+ */
+ attrp->dma_attr_burstsizes = size_up;
+
+ /*
+ * Minimum and maximum amount of data we can send. This isn't
+ * strictly limited by PCI in hardware, as it'll just make the
+	 * appropriate number of requests. Similarly, PCIe allows for
+ * an arbitrary granularity. We set this to one, as it's
+ * really a matter of what hardware is requesting from us.
+ */
+ attrp->dma_attr_minxfer = 0x1;
+ attrp->dma_attr_maxxfer = size_up;
+ attrp->dma_attr_granular = 0x1;
+
+ /*
+ * The maximum length of the Scatter Gather List, aka the
+ * maximum number of segments a device can address in a
+ * transfer.
+ */
+ attrp->dma_attr_sgllen = conf->edc_sgl;
+}
+
+void
+ena_dma_free(ena_dma_buf_t *edb)
+{
+ if (edb->edb_cookie != NULL) {
+ (void) ddi_dma_unbind_handle(edb->edb_dma_hdl);
+ edb->edb_cookie = NULL;
+ edb->edb_real_len = 0;
+ }
+
+ if (edb->edb_acc_hdl != NULL) {
+ ddi_dma_mem_free(&edb->edb_acc_hdl);
+ edb->edb_acc_hdl = NULL;
+ edb->edb_va = NULL;
+ }
+
+ if (edb->edb_dma_hdl != NULL) {
+ ddi_dma_free_handle(&edb->edb_dma_hdl);
+ edb->edb_dma_hdl = NULL;
+ }
+
+ edb->edb_len = 0;
+}
+
+boolean_t
+ena_dma_alloc(ena_t *ena, ena_dma_buf_t *edb, ena_dma_conf_t *conf, size_t size)
+{
+ int ret;
+ size_t size_allocated;
+ ddi_dma_attr_t attr;
+ ddi_device_acc_attr_t acc;
+ uint_t flags =
+ conf->edc_stream ? DDI_DMA_STREAMING : DDI_DMA_CONSISTENT;
+
+ ena_dma_attr(ena, &attr, conf);
+
+ acc.devacc_attr_version = DDI_DEVICE_ATTR_V1;
+ acc.devacc_attr_endian_flags = conf->edc_endian;
+ acc.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
+
+ ret = ddi_dma_alloc_handle(ena->ena_dip, &attr, DDI_DMA_DONTWAIT, NULL,
+ &edb->edb_dma_hdl);
+ if (ret != DDI_SUCCESS) {
+ ena_err(ena, "!failed to allocate DMA handle: %d", ret);
+ return (B_FALSE);
+ }
+
+ ret = ddi_dma_mem_alloc(edb->edb_dma_hdl, size, &acc, flags,
+ DDI_DMA_DONTWAIT, NULL, &edb->edb_va, &size_allocated,
+ &edb->edb_acc_hdl);
+ if (ret != DDI_SUCCESS) {
+ ena_err(ena, "!failed to allocate %lu bytes of DMA "
+ "memory: %d", size, ret);
+ ena_dma_free(edb);
+ return (B_FALSE);
+ }
+
+ bzero(edb->edb_va, size_allocated);
+
+ ret = ddi_dma_addr_bind_handle(edb->edb_dma_hdl, NULL, edb->edb_va,
+ size_allocated, DDI_DMA_RDWR | flags, DDI_DMA_DONTWAIT, NULL, NULL,
+ NULL);
+ if (ret != DDI_SUCCESS) {
+ ena_err(ena, "!failed to bind %lu bytes of DMA "
+ "memory: %d", size_allocated, ret);
+ ena_dma_free(edb);
+ return (B_FALSE);
+ }
+
+ edb->edb_len = size;
+ edb->edb_real_len = size_allocated;
+ edb->edb_cookie = ddi_dma_cookie_one(edb->edb_dma_hdl);
+ return (B_TRUE);
+}
+
+/*
+ * Write the physical DMA address to the ENA hardware address pointer.
+ * While the DMA engine should guarantee that the allocation is within
+ * the specified range, we double check here to catch programmer error
+ * and avoid hard-to-debug situations.
+ */
+void
+ena_set_dma_addr(const ena_t *ena, const uint64_t phys_addr,
+ enahw_addr_t *hwaddrp)
+{
+ ENA_DMA_VERIFY_ADDR(ena, phys_addr);
+ hwaddrp->ea_low = (uint32_t)phys_addr;
+ hwaddrp->ea_high = (uint16_t)(phys_addr >> 32);
+}
+
+/*
+ * The same as the above function, but writes the physical address to
+ * the supplied value pointers instead. Mostly used as a sanity check
+ * that the address fits in the reported DMA width.
+ */
+void
+ena_set_dma_addr_values(const ena_t *ena, const uint64_t phys_addr,
+ uint32_t *dst_low, uint16_t *dst_high)
+{
+ ENA_DMA_VERIFY_ADDR(ena, phys_addr);
+ *dst_low = (uint32_t)phys_addr;
+ *dst_high = (uint16_t)(phys_addr >> 32);
+}
diff --git a/usr/src/uts/common/io/ena/ena_gld.c b/usr/src/uts/common/io/ena/ena_gld.c
new file mode 100644
index 0000000000..2c27d0d31c
--- /dev/null
+++ b/usr/src/uts/common/io/ena/ena_gld.c
@@ -0,0 +1,465 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2021 Oxide Computer Company
+ */
+#include "ena.h"
+
+/*
+ * Group/Ring callbacks
+ */
+
+/*
+ * The ena driver supports only a single mac address: the one assigned
+ * to it by the hypervisor. If mac requests an address besides this
+ * one, then return ENOTSUP. This will prevent VNICs from being
+ * created, as it should.
+ */
+static int
+ena_group_add_mac(void *arg, const uint8_t *mac_addr)
+{
+ ena_t *ena = arg;
+
+ if (ETHER_IS_MULTICAST(mac_addr)) {
+ return (EINVAL);
+ }
+
+ if (bcmp(ena->ena_mac_addr, mac_addr, ETHERADDRL) == 0) {
+ return (0);
+ }
+
+ return (ENOTSUP);
+}
+
+static int
+ena_group_rem_mac(void *arg, const uint8_t *mac_addr)
+{
+ ena_t *ena = arg;
+
+ if (ETHER_IS_MULTICAST(mac_addr)) {
+ return (EINVAL);
+ }
+
+ if (bcmp(ena->ena_mac_addr, mac_addr, ETHERADDRL) == 0) {
+ return (0);
+ }
+
+ return (ENOTSUP);
+}
+
+static int
+ena_ring_rx_intr_disable(mac_intr_handle_t mih)
+{
+ ena_rxq_t *rxq = (ena_rxq_t *)mih;
+ uint32_t intr_ctrl;
+
+ mutex_enter(&rxq->er_lock);
+ intr_ctrl = ena_hw_abs_read32(rxq->er_ena, rxq->er_cq_unmask_addr);
+ ENAHW_REG_INTR_MASK(intr_ctrl);
+ ena_hw_abs_write32(rxq->er_ena, rxq->er_cq_unmask_addr, intr_ctrl);
+ rxq->er_mode = ENA_RXQ_MODE_POLLING;
+ mutex_exit(&rxq->er_lock);
+ return (0);
+}
+
+static int
+ena_ring_rx_intr_enable(mac_intr_handle_t mih)
+{
+ ena_rxq_t *rxq = (ena_rxq_t *)mih;
+ uint32_t intr_ctrl;
+
+ mutex_enter(&rxq->er_lock);
+ intr_ctrl = ena_hw_abs_read32(rxq->er_ena, rxq->er_cq_unmask_addr);
+ ENAHW_REG_INTR_UNMASK(intr_ctrl);
+ ena_hw_abs_write32(rxq->er_ena, rxq->er_cq_unmask_addr, intr_ctrl);
+ rxq->er_mode = ENA_RXQ_MODE_INTR;
+ mutex_exit(&rxq->er_lock);
+ return (0);
+}
+
+static void
+ena_fill_rx_group(void *arg, mac_ring_type_t rtype, const int index,
+ mac_group_info_t *infop, mac_group_handle_t gh)
+{
+ ena_t *ena = arg;
+
+ VERIFY3S(rtype, ==, MAC_RING_TYPE_RX);
+ /*
+ * Typically you pass an Rx group data structure as
+	 * mgi_driver, but since we only ever have one group we just
+	 * pass the top-level ena_t.
+ */
+ infop->mgi_driver = (mac_group_driver_t)ena;
+ infop->mgi_start = NULL;
+ infop->mgi_stop = NULL;
+ infop->mgi_addmac = ena_group_add_mac;
+ infop->mgi_remmac = ena_group_rem_mac;
+ infop->mgi_count = ena->ena_num_intrs - 1;
+}
+
+static void
+ena_fill_tx_ring(void *arg, mac_ring_type_t rtype, const int group_index,
+ const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh)
+{
+ ena_t *ena = arg;
+ ena_txq_t *txq = &(ena->ena_txqs[ring_index]);
+
+ VERIFY3S(rtype, ==, MAC_RING_TYPE_TX);
+ VERIFY3S(ring_index, <, ena->ena_num_txqs);
+ /* Link driver Tx queue to mac ring handle and vice versa. */
+ txq->et_mrh = rh;
+ infop->mri_driver = (mac_ring_driver_t)txq;
+ infop->mri_start = ena_ring_tx_start;
+ infop->mri_stop = ena_ring_tx_stop;
+ infop->mri_tx = ena_ring_tx;
+ infop->mri_stat = ena_ring_tx_stat;
+}
+
+static void
+ena_fill_rx_ring(void *arg, mac_ring_type_t rtype, const int group_index,
+ const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh)
+{
+ ena_t *ena = arg;
+ ena_rxq_t *rxq = &(ena->ena_rxqs[ring_index]);
+
+ VERIFY3S(rtype, ==, MAC_RING_TYPE_RX);
+ VERIFY3S(ring_index, <, ena->ena_num_rxqs);
+ rxq->er_mrh = rh;
+ infop->mri_driver = (mac_ring_driver_t)rxq;
+ infop->mri_start = ena_ring_rx_start;
+ infop->mri_stop = ena_ring_rx_stop;
+ infop->mri_poll = ena_ring_rx_poll;
+ infop->mri_stat = ena_ring_rx_stat;
+ infop->mri_intr.mi_handle = (mac_intr_handle_t)rxq;
+ infop->mri_intr.mi_enable = ena_ring_rx_intr_enable;
+ infop->mri_intr.mi_disable = ena_ring_rx_intr_disable;
+ infop->mri_intr.mi_ddi_handle =
+ ena->ena_intr_handles[rxq->er_intr_vector];
+}
+
+static int
+ena_m_start(void *arg)
+{
+ ena_t *ena = arg;
+
+ atomic_or_32(&ena->ena_state, ENA_STATE_RUNNING);
+ return (0);
+}
+
+static void
+ena_m_stop(void *arg)
+{
+ ena_t *ena = arg;
+ atomic_and_32(&ena->ena_state, ~ENA_STATE_RUNNING);
+}
+
+/*
+ * As discussed in ena_group_add_mac(), ENA only supports a single MAC
+ * address, and therefore we prevent VNICs from being created. That
+ * means there is no chance for promisc to be used as a means for
+ * implementing VNIC support on ENA, as we never allow them to be
+ * created in the first place.
+ *
+ * As for promisc itself, returning success is about the best we can
+ * do. There is no promisc API for an ENA device -- you get only the
+ * exact traffic AWS wants you to see.
+ */
+static int
+ena_m_setpromisc(void *arg, boolean_t on)
+{
+ return (0);
+}
+
+/*
+ * Similarly to promisc, there is no multicast API for an ENA
+ * device.
+ */
+static int
+ena_m_multicast(void *arg, boolean_t add, const uint8_t *multicast_address)
+{
+ return (0);
+}
+
+static boolean_t
+ena_m_getcapab(void *arg, mac_capab_t capab, void *cap_data)
+{
+ ena_t *ena = arg;
+ mac_capab_rings_t *cap_rings;
+
+ switch (capab) {
+ case MAC_CAPAB_RINGS:
+ cap_rings = cap_data;
+ cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
+ cap_rings->mr_gaddring = NULL;
+ cap_rings->mr_gremring = NULL;
+ ASSERT3U(ena->ena_num_intrs, >=, 2);
+
+ switch (cap_rings->mr_type) {
+ case MAC_RING_TYPE_TX:
+ /*
+ * We use pseudo Tx groups for now.
+ */
+ cap_rings->mr_gnum = 0;
+ cap_rings->mr_rnum = ena->ena_num_intrs - 1;
+ cap_rings->mr_rget = ena_fill_tx_ring;
+ break;
+ case MAC_RING_TYPE_RX:
+ cap_rings->mr_rnum = ena->ena_num_intrs - 1;
+ cap_rings->mr_rget = ena_fill_rx_ring;
+ /*
+ * The ENA device provides no means to add mac
+ * filters or set promisc mode; it's only
+ * meant to receive its pre-designated unicast
+ * address. However, we still want rings as
+ * the device does provide multiple queues and
+ * RSS.
+ */
+ cap_rings->mr_gnum = 1;
+ cap_rings->mr_gget = ena_fill_rx_group;
+ break;
+ }
+
+ break;
+
+ case MAC_CAPAB_HCKSUM:
+ case MAC_CAPAB_LSO:
+ return (B_FALSE);
+ default:
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+}
+
+static int
+ena_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
+ uint_t pr_valsize, const void *pr_val)
+{
+ return (ENOTSUP);
+}
+
+static int
+ena_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
+ uint_t pr_valsize, void *pr_val)
+{
+ ena_t *ena = arg;
+ int ret = 0;
+ uint64_t speed;
+ uint8_t *u8;
+
+ mutex_enter(&ena->ena_lock);
+
+ switch (pr_num) {
+ case MAC_PROP_DUPLEX:
+ if (pr_valsize < sizeof (link_duplex_t)) {
+ ret = EOVERFLOW;
+ break;
+ }
+
+ bcopy(&ena->ena_link_duplex, pr_val, sizeof (link_duplex_t));
+ break;
+
+ case MAC_PROP_SPEED:
+ if (pr_valsize < sizeof (uint64_t)) {
+ ret = EOVERFLOW;
+ break;
+ }
+
+ speed = ena->ena_link_speed_mbits * 1000000ULL;
+ bcopy(&speed, pr_val, sizeof (speed));
+ break;
+
+ case MAC_PROP_STATUS:
+ if (pr_valsize < sizeof (link_state_t)) {
+ ret = EOVERFLOW;
+ break;
+ }
+
+ bcopy(&ena->ena_link_state, pr_val, sizeof (link_state_t));
+ break;
+
+ case MAC_PROP_AUTONEG:
+ if (pr_valsize < sizeof (uint8_t)) {
+ ret = EOVERFLOW;
+ break;
+ }
+
+ u8 = pr_val;
+ *u8 = (ena->ena_link_autoneg ? 0 : 1);
+ break;
+
+ case MAC_PROP_MTU:
+ if (pr_valsize < sizeof (uint32_t)) {
+ ret = EOVERFLOW;
+ break;
+ }
+
+ bcopy(&ena->ena_mtu, pr_val, sizeof (uint32_t));
+ break;
+
+ case MAC_PROP_ADV_1000FDX_CAP:
+ case MAC_PROP_EN_1000FDX_CAP:
+ if (pr_valsize < sizeof (uint8_t)) {
+ ret = EOVERFLOW;
+ break;
+ }
+
+ u8 = pr_val;
+ *u8 = (ena->ena_link_speeds & ENAHW_LINK_SPEED_1G) != 0;
+ break;
+
+ case MAC_PROP_ADV_2500FDX_CAP:
+ case MAC_PROP_EN_2500FDX_CAP:
+ if (pr_valsize < sizeof (uint8_t)) {
+ ret = EOVERFLOW;
+ break;
+ }
+
+ u8 = pr_val;
+ *u8 = (ena->ena_link_speeds & ENAHW_LINK_SPEED_2_HALF_G) != 0;
+ break;
+
+ case MAC_PROP_ADV_5000FDX_CAP:
+ case MAC_PROP_EN_5000FDX_CAP:
+ if (pr_valsize < sizeof (uint8_t)) {
+ ret = EOVERFLOW;
+ break;
+ }
+
+ u8 = pr_val;
+ *u8 = (ena->ena_link_speeds & ENAHW_LINK_SPEED_5G) != 0;
+ break;
+
+ case MAC_PROP_ADV_10GFDX_CAP:
+ case MAC_PROP_EN_10GFDX_CAP:
+ if (pr_valsize < sizeof (uint8_t)) {
+ ret = EOVERFLOW;
+ break;
+ }
+
+ u8 = pr_val;
+ *u8 = (ena->ena_link_speeds & ENAHW_LINK_SPEED_10G) != 0;
+ break;
+
+ case MAC_PROP_ADV_25GFDX_CAP:
+ case MAC_PROP_EN_25GFDX_CAP:
+ if (pr_valsize < sizeof (uint8_t)) {
+ ret = EOVERFLOW;
+ break;
+ }
+
+ u8 = pr_val;
+ *u8 = (ena->ena_link_speeds & ENAHW_LINK_SPEED_25G) != 0;
+ break;
+
+ case MAC_PROP_ADV_40GFDX_CAP:
+ case MAC_PROP_EN_40GFDX_CAP:
+ if (pr_valsize < sizeof (uint8_t)) {
+ ret = EOVERFLOW;
+ break;
+ }
+
+ u8 = pr_val;
+ *u8 = (ena->ena_link_speeds & ENAHW_LINK_SPEED_40G) != 0;
+ break;
+
+ case MAC_PROP_ADV_100GFDX_CAP:
+ case MAC_PROP_EN_100GFDX_CAP:
+ if (pr_valsize < sizeof (uint8_t)) {
+ ret = EOVERFLOW;
+ break;
+ }
+
+ u8 = pr_val;
+ *u8 = (ena->ena_link_speeds & ENAHW_LINK_SPEED_100G) != 0;
+ break;
+
+ default:
+ ret = ENOTSUP;
+ break;
+ }
+
+ mutex_exit(&ena->ena_lock);
+ return (ret);
+}
+
+static void
+ena_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
+ mac_prop_info_handle_t prh)
+{
+}
+
+static mac_callbacks_t ena_m_callbacks = {
+ .mc_callbacks = MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO,
+ .mc_getstat = ena_m_stat,
+ .mc_start = ena_m_start,
+ .mc_stop = ena_m_stop,
+ .mc_setpromisc = ena_m_setpromisc,
+ .mc_multicst = ena_m_multicast,
+ .mc_getcapab = ena_m_getcapab,
+ .mc_setprop = ena_m_setprop,
+ .mc_getprop = ena_m_getprop,
+ .mc_propinfo = ena_m_propinfo,
+};
+
+int
+ena_mac_unregister(ena_t *ena)
+{
+ if (ena->ena_mh == NULL) {
+ return (0);
+ }
+
+ return (mac_unregister(ena->ena_mh));
+}
+
+boolean_t
+ena_mac_register(ena_t *ena)
+{
+ int ret;
+ mac_register_t *regp;
+
+ if ((regp = mac_alloc(MAC_VERSION)) == NULL) {
+ ena_err(ena, "failed to allocate MAC handle");
+ return (B_FALSE);
+ }
+
+ regp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
+ regp->m_driver = ena;
+ regp->m_dip = ena->ena_dip;
+ regp->m_instance = 0;
+ regp->m_src_addr = ena->ena_mac_addr;
+ regp->m_dst_addr = NULL;
+ regp->m_callbacks = &ena_m_callbacks;
+ regp->m_min_sdu = 0;
+ regp->m_max_sdu = ena->ena_mtu;
+ regp->m_pdata = NULL;
+ regp->m_pdata_size = 0;
+ regp->m_priv_props = NULL;
+ regp->m_margin = VLAN_TAGSZ;
+ regp->m_v12n = MAC_VIRT_LEVEL1;
+
+ if ((ret = mac_register(regp, &ena->ena_mh)) != 0) {
+ ena_err(ena, "failed to register ena with mac: %d", ret);
+ }
+
+ mac_free(regp);
+
+ if (ret == 0) {
+ /*
+ * Until we get the first AENQ link change event, we
+ * do not actually know the status of the link.
+ */
+ mac_link_update(ena->ena_mh, LINK_STATE_UNKNOWN);
+ }
+
+ return (ret == 0);
+}
diff --git a/usr/src/uts/common/io/ena/ena_hw.c b/usr/src/uts/common/io/ena/ena_hw.c
new file mode 100644
index 0000000000..f37b4100df
--- /dev/null
+++ b/usr/src/uts/common/io/ena/ena_hw.c
@@ -0,0 +1,93 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2021 Oxide Computer Company
+ */
+
+#include "ena_hw.h"
+#include "ena.h"
+
+uint32_t
+ena_hw_bar_read32(const ena_t *ena, const uint16_t offset)
+{
+ caddr_t addr = ena->ena_reg_base + offset;
+ return (ena_hw_abs_read32(ena, (uint32_t *)addr));
+}
+
+uint32_t
+ena_hw_abs_read32(const ena_t *ena, uint32_t *addr)
+{
+ VERIFY3U(addr, >=, ena->ena_reg_base);
+ VERIFY3U(addr, <, ena->ena_reg_base + (ena->ena_reg_size - 4));
+
+ return (ddi_get32(ena->ena_reg_hdl, addr));
+}
+
+void
+ena_hw_bar_write32(const ena_t *ena, const uint16_t offset, const uint32_t val)
+{
+ caddr_t addr = ena->ena_reg_base + offset;
+ ena_hw_abs_write32(ena, (uint32_t *)addr, val);
+}
+
+void
+ena_hw_abs_write32(const ena_t *ena, uint32_t *addr, const uint32_t val)
+{
+ VERIFY3P(ena, !=, NULL);
+ VERIFY3P(addr, !=, NULL);
+ VERIFY3U(addr, >=, ena->ena_reg_base);
+ VERIFY3U(addr, <, ena->ena_reg_base + (ena->ena_reg_size - 4));
+
+ ddi_put32(ena->ena_reg_hdl, addr, val);
+}
+
+int
+enahw_resp_status_to_errno(ena_t *ena, enahw_resp_status_t status)
+{
+ int ret = 0;
+
+ switch (status) {
+ case ENAHW_RESP_SUCCESS:
+ break;
+
+ case ENAHW_RESP_RESOURCE_ALLOCATION_FAILURE:
+ ret = ENOMEM;
+ break;
+
+ case ENAHW_RESP_UNSUPPORTED_OPCODE:
+ ret = ENOTSUP;
+ break;
+
+ case ENAHW_RESP_BAD_OPCODE:
+ case ENAHW_RESP_MALFORMED_REQUEST:
+ case ENAHW_RESP_ILLEGAL_PARAMETER:
+ ret = EINVAL;
+ break;
+
+ case ENAHW_RESP_RESOURCE_BUSY:
+ ret = EAGAIN;
+ break;
+
+ case ENAHW_RESP_UNKNOWN_ERROR:
+ default:
+ /*
+		 * If the device presents us with an "unknown error"
+ * code, or the status code is undefined, then we log
+ * an error and convert it to EIO.
+ */
+ ena_err(ena, "unexpected status code: %d", status);
+ ret = EIO;
+ break;
+ }
+
+ return (ret);
+}
diff --git a/usr/src/uts/common/io/ena/ena_hw.h b/usr/src/uts/common/io/ena/ena_hw.h
new file mode 100644
index 0000000000..fbd67851b4
--- /dev/null
+++ b/usr/src/uts/common/io/ena/ena_hw.h
@@ -0,0 +1,1930 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2021 Oxide Computer Company
+ */
+
+/*
+ * This file declares all constants and structures dealing with the
+ * physical ENA device. It is based on the ena_com code of the public
+ * Linux and FreeBSD drivers. While this file is based on the common
+ * code it doesn't share the same type names. Where it is useful, a
+ * "common" reference is added to include the name of the type as
+ * defined in the common code.
+ *
+ * The Linux driver defines enq_admin_aq_entry as the top-level type
+ * for admin command descriptors. From this type you can access the
+ * common bits shared by every descriptor (ena_admin_aq_common_desc)
+ * as well as the control buffer (ena_admin_ctrl_buff_info) which is
+ * present for _some_ commands. Other than that, this top-level type
+ * treats the rest of the data as an opaque array of unsigned 32-bit
+ * integers. Then, for each individual command, the Linux driver
+ * defines a dedicated type, each of which contains the following:
+ *
+ * 1. The common descriptor: ena_admin_aq_common_desc.
+ *
+ * 2. The optional control buffer desc: ena_admin_ctrl_buff_info.
+ *
+ * 3. The command-specific data.
+ *
+ * 4. Optional padding to make sure all commands are 64 bytes in size.
+ *
+ * Furthermore, there may be further common types for commands which
+ * are made up of several sub-commands, e.g. the get/set feature
+ * commands.
+ *
+ * Finally, when a command is passed to the common function for
+ * executing commands (ena_com_execute_admin_command()), it is cast as
+ * a pointer to the top-level type: ena_admin_aq_entry.
+ *
+ * This works for the Linux driver just fine, but it causes lots of
+ * repetition in the structure definitions and also means there is no
+ * easy way to determine all valid commands. This ENA driver has
+ * turned the Linux approach inside out -- the top-level type is a
+ * union of all possible commands: enahw_cmd_desc_t. Each command may
+ * then further sub-type via unions to represent its sub-commands.
+ * This same treatment was given to the response descriptor:
+ * enahw_resp_desc_t.
+ *
+ * What is the point of knowing all this? Well, when referencing the
+ * common type in the comment above the enahw_ type, you need to keep
+ * in mind that the Linux/common type will include all the common
+ * descriptor bits, whereas these types do not.
+ *
+ * The common code DOES NOT pack any of these structures, and thus
+ * neither do we. That means these structures all rely on natural
+ * compiler alignment, just as the common code does. In ena.c you will
+ * find CTASSERTs for many of these structures, to verify they are of
+ * the expected size.
+ */
+
+#ifndef _ENA_HW_H
+#define _ENA_HW_H
+
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/types.h>
+#include <sys/debug.h>
+#include <sys/ethernet.h>
+
+/*
+ * The common code sets the upper limit of I/O queues to 128. In this
+ * case a "queue" is a SQ+CQ pair that forms a logical queue or ring
+ * for sending or receiving packets. Thus, at maximum, we may expect
+ * 128 Tx rings, and 128 Rx rings; though, practically speaking, the
+ * number of rings will often be limited by number of CPUs or
+ * available interrupts.
+ *
+ * common: ENA_MAX_NUM_IO_QUEUES
+ */
+#define ENAHW_MAX_NUM_IO_QUEUES 128
+
+/*
+ * Generate a 32-bit bitmask where the bits between high (inclusive)
+ * and low (inclusive) are set to 1.
+ */
+#define GENMASK(h, l) (((~0U) - (1U << (l)) + 1) & (~0U >> (32 - 1 - (h))))
+
+/*
+ * Generate a 64-bit bitmask where bit b is set to 1.
+ */
+#define BIT(b) (1UL << (b))
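+
+/*
+ * For example, GENMASK(7, 4) evaluates to 0xf0 and GENMASK(15, 8) to
+ * 0xff00, while BIT(3) evaluates to 0x8.
+ */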
+
+#define ENAHW_DMA_ADMINQ_ALIGNMENT 8
+
+#define ENAHW_ADMIN_CQ_DESC_BUF_ALIGNMENT 8
+#define ENAHW_ADMIN_SQ_DESC_BUF_ALIGNMENT 8
+#define ENAHW_AENQ_DESC_BUF_ALIGNMENT 8
+#define ENAHW_HOST_INFO_ALIGNMENT 8
+#define ENAHW_HOST_INFO_ALLOC_SZ 4096
+#define ENAHW_IO_CQ_DESC_BUF_ALIGNMENT 4096
+#define ENAHW_IO_SQ_DESC_BUF_ALIGNMENT 8
+
+/*
+ * BAR0 register offsets.
+ *
+ * Any register not defined in the common code was marked as a gap,
+ * using the hex address of the register as suffix. The idea is to
+ * make it clear where the gaps are and allow the
+ * ena_hw_update_reg_cache() function to display any bits stored in
+ * these gaps in case they turn out to be interesting later.
+ */
+#define ENAHW_REG_VERSION 0x0
+#define ENAHW_REG_CONTROLLER_VERSION 0x4
+#define ENAHW_REG_CAPS 0x8
+#define ENAHW_REG_CAPS_EXT 0xc
+#define ENAHW_REG_ASQ_BASE_LO 0x10
+#define ENAHW_REG_ASQ_BASE_HI 0x14
+#define ENAHW_REG_ASQ_CAPS 0x18
+#define ENAHW_REG_GAP_1C 0x1c
+#define ENAHW_REG_ACQ_BASE_LO 0x20
+#define ENAHW_REG_ACQ_BASE_HI 0x24
+#define ENAHW_REG_ACQ_CAPS 0x28
+#define ENAHW_REG_ASQ_DB 0x2c
+#define ENAHW_REG_ACQ_TAIL 0x30
+#define ENAHW_REG_AENQ_CAPS 0x34
+#define ENAHW_REG_AENQ_BASE_LO 0x38
+#define ENAHW_REG_AENQ_BASE_HI 0x3c
+#define ENAHW_REG_AENQ_HEAD_DB 0x40
+#define ENAHW_REG_AENQ_TAIL 0x44
+#define ENAHW_REG_GAP_48 0x48
+#define ENAHW_REG_INTERRUPT_MASK 0x4c
+#define ENAHW_REG_GAP_50 0x50
+#define ENAHW_REG_DEV_CTL 0x54
+#define ENAHW_REG_DEV_STS 0x58
+#define ENAHW_REG_MMIO_REG_READ 0x5c
+#define ENAHW_REG_MMIO_RESP_LO 0x60
+#define ENAHW_REG_MMIO_RESP_HI 0x64
+#define ENAHW_REG_RSS_IND_ENTRY_UPDATE 0x68
+#define ENAHW_NUM_REGS ((ENAHW_REG_RSS_IND_ENTRY_UPDATE / 4) + 1)
+
+/*
+ * Device Version (Register 0x0)
+ */
+#define ENAHW_DEV_MINOR_VSN_MASK 0xff
+#define ENAHW_DEV_MAJOR_VSN_SHIFT 8
+#define ENAHW_DEV_MAJOR_VSN_MASK 0xff00
+
+#define ENAHW_DEV_MAJOR_VSN(vsn) \
+ (((vsn) & ENAHW_DEV_MAJOR_VSN_MASK) >> ENAHW_DEV_MAJOR_VSN_SHIFT)
+#define ENAHW_DEV_MINOR_VSN(vsn) \
+ ((vsn) & ENAHW_DEV_MINOR_VSN_MASK)
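+
+/*
+ * As an illustration, a version register value of 0x0203 decodes to a
+ * major version of 2 (ENAHW_DEV_MAJOR_VSN) and a minor version of 3
+ * (ENAHW_DEV_MINOR_VSN).
+ */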
+
+/*
+ * Controller Version (Register 0x4)
+ */
+#define ENAHW_CTRL_SUBMINOR_VSN_MASK 0xff
+#define ENAHW_CTRL_MINOR_VSN_SHIFT 8
+#define ENAHW_CTRL_MINOR_VSN_MASK 0xff00
+#define ENAHW_CTRL_MAJOR_VSN_SHIFT 16
+#define ENAHW_CTRL_MAJOR_VSN_MASK 0xff0000
+#define ENAHW_CTRL_IMPL_ID_SHIFT 24
+#define ENAHW_CTRL_IMPL_ID_MASK 0xff000000
+
+#define ENAHW_CTRL_MAJOR_VSN(vsn) \
+ (((vsn) & ENAHW_CTRL_MAJOR_VSN_MASK) >> ENAHW_CTRL_MAJOR_VSN_SHIFT)
+#define ENAHW_CTRL_MINOR_VSN(vsn) \
+ (((vsn) & ENAHW_CTRL_MINOR_VSN_MASK) >> ENAHW_CTRL_MINOR_VSN_SHIFT)
+#define ENAHW_CTRL_SUBMINOR_VSN(vsn) \
+ ((vsn) & ENAHW_CTRL_SUBMINOR_VSN_MASK)
+#define ENAHW_CTRL_IMPL_ID(vsn) \
+ (((vsn) & ENAHW_CTRL_IMPL_ID_MASK) >> ENAHW_CTRL_IMPL_ID_SHIFT)
+
+/*
+ * Device Caps (Register 0x8)
+ */
+#define ENAHW_CAPS_CONTIGUOUS_QUEUE_REQUIRED_MASK 0x1
+#define ENAHW_CAPS_RESET_TIMEOUT_SHIFT 1
+#define ENAHW_CAPS_RESET_TIMEOUT_MASK 0x3e
+#define ENAHW_CAPS_RESET_TIMEOUT(v) \
+ (((v) & ENAHW_CAPS_RESET_TIMEOUT_MASK) >> \
+ ENAHW_CAPS_RESET_TIMEOUT_SHIFT)
+#define ENAHW_CAPS_DMA_ADDR_WIDTH_SHIFT 8
+#define ENAHW_CAPS_DMA_ADDR_WIDTH_MASK 0xff00
+#define ENAHW_CAPS_DMA_ADDR_WIDTH(v) \
+ (((v) & ENAHW_CAPS_DMA_ADDR_WIDTH_MASK) >> \
+ ENAHW_CAPS_DMA_ADDR_WIDTH_SHIFT)
+#define ENAHW_CAPS_ADMIN_CMD_TIMEOUT_SHIFT 16
+#define ENAHW_CAPS_ADMIN_CMD_TIMEOUT_MASK 0xf0000
+#define ENAHW_CAPS_ADMIN_CMD_TIMEOUT(v) \
+ (((v) & ENAHW_CAPS_ADMIN_CMD_TIMEOUT_MASK) >> \
+ ENAHW_CAPS_ADMIN_CMD_TIMEOUT_SHIFT)
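+
+/*
+ * For example, ENAHW_CAPS_DMA_ADDR_WIDTH(0x3000) evaluates to 0x30,
+ * i.e. a device reporting a 48-bit DMA address width.
+ */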
+
+enum enahw_reset_reason_types {
+ ENAHW_RESET_NORMAL = 0,
+ ENAHW_RESET_KEEP_ALIVE_TO = 1,
+ ENAHW_RESET_ADMIN_TO = 2,
+ ENAHW_RESET_MISS_TX_CMPL = 3,
+ ENAHW_RESET_INV_RX_REQ_ID = 4,
+ ENAHW_RESET_INV_TX_REQ_ID = 5,
+ ENAHW_RESET_TOO_MANY_RX_DESCS = 6,
+ ENAHW_RESET_INIT_ERR = 7,
+ ENAHW_RESET_DRIVER_INVALID_STATE = 8,
+ ENAHW_RESET_OS_TRIGGER = 9,
+ ENAHW_RESET_OS_NETDEV_WD = 10,
+ ENAHW_RESET_SHUTDOWN = 11,
+ ENAHW_RESET_USER_TRIGGER = 12,
+ ENAHW_RESET_GENERIC = 13,
+ ENAHW_RESET_MISS_INTERRUPT = 14,
+ ENAHW_RESET_LAST,
+};
+
+/*
+ * Admin Submission Queue Caps (Register 0x18)
+ */
+#define ENAHW_ASQ_CAPS_DEPTH_MASK 0xffff
+#define ENAHW_ASQ_CAPS_ENTRY_SIZE_SHIFT 16
+#define ENAHW_ASQ_CAPS_ENTRY_SIZE_MASK 0xffff0000
+
+#define ENAHW_ASQ_CAPS_DEPTH(x) ((x) & ENAHW_ASQ_CAPS_DEPTH_MASK)
+
+#define ENAHW_ASQ_CAPS_ENTRY_SIZE(x) \
+ (((x) << ENAHW_ASQ_CAPS_ENTRY_SIZE_SHIFT) & \
+ ENAHW_ASQ_CAPS_ENTRY_SIZE_MASK)
+
+/*
+ * Admin Completion Queue Caps (Register 0x28)
+ */
+#define ENAHW_ACQ_CAPS_DEPTH_MASK 0xffff
+#define ENAHW_ACQ_CAPS_ENTRY_SIZE_SHIFT 16
+#define ENAHW_ACQ_CAPS_ENTRY_SIZE_MASK 0xffff0000
+
+#define ENAHW_ACQ_CAPS_DEPTH(x) ((x) & ENAHW_ACQ_CAPS_DEPTH_MASK)
+
+#define ENAHW_ACQ_CAPS_ENTRY_SIZE(x) \
+ (((x) << ENAHW_ACQ_CAPS_ENTRY_SIZE_SHIFT) & \
+ ENAHW_ACQ_CAPS_ENTRY_SIZE_MASK)
+
+/*
+ * Asynchronous Event Notification Queue Caps (Register 0x34)
+ */
+#define ENAHW_AENQ_CAPS_DEPTH_MASK 0xffff
+#define ENAHW_AENQ_CAPS_ENTRY_SIZE_SHIFT 16
+#define ENAHW_AENQ_CAPS_ENTRY_SIZE_MASK 0xffff0000
+
+#define ENAHW_AENQ_CAPS_DEPTH(x) ((x) & ENAHW_AENQ_CAPS_DEPTH_MASK)
+
+#define ENAHW_AENQ_CAPS_ENTRY_SIZE(x) \
+ (((x) << ENAHW_AENQ_CAPS_ENTRY_SIZE_SHIFT) & \
+ ENAHW_AENQ_CAPS_ENTRY_SIZE_MASK)
+
+/*
+ * Interrupt Mask (Register 0x4c)
+ */
+#define ENAHW_INTR_UNMASK 0x0
+#define ENAHW_INTR_MASK 0x1
+
+/*
+ * Device Control (Register 0x54)
+ */
+#define ENAHW_DEV_CTL_DEV_RESET_MASK 0x1
+#define ENAHW_DEV_CTL_AQ_RESTART_SHIFT 1
+#define ENAHW_DEV_CTL_AQ_RESTART_MASK 0x2
+#define ENAHW_DEV_CTL_QUIESCENT_SHIFT 2
+#define ENAHW_DEV_CTL_QUIESCENT_MASK 0x4
+#define ENAHW_DEV_CTL_IO_RESUME_SHIFT 3
+#define ENAHW_DEV_CTL_IO_RESUME_MASK 0x8
+#define ENAHW_DEV_CTL_RESET_REASON_SHIFT 28
+#define ENAHW_DEV_CTL_RESET_REASON_MASK 0xf0000000
+
+/*
+ * Device Status (Register 0x58)
+ */
+#define ENAHW_DEV_STS_READY_MASK 0x1
+#define ENAHW_DEV_STS_AQ_RESTART_IN_PROGRESS_SHIFT 1
+#define ENAHW_DEV_STS_AQ_RESTART_IN_PROGRESS_MASK 0x2
+#define ENAHW_DEV_STS_AQ_RESTART_FINISHED_SHIFT 2
+#define ENAHW_DEV_STS_AQ_RESTART_FINISHED_MASK 0x4
+#define ENAHW_DEV_STS_RESET_IN_PROGRESS_SHIFT 3
+#define ENAHW_DEV_STS_RESET_IN_PROGRESS_MASK 0x8
+#define ENAHW_DEV_STS_RESET_FINISHED_SHIFT 4
+#define ENAHW_DEV_STS_RESET_FINISHED_MASK 0x10
+#define ENAHW_DEV_STS_FATAL_ERROR_SHIFT 5
+#define ENAHW_DEV_STS_FATAL_ERROR_MASK 0x20
+#define ENAHW_DEV_STS_QUIESCENT_STATE_IN_PROGRESS_SHIFT 6
+#define ENAHW_DEV_STS_QUIESCENT_STATE_IN_PROGRESS_MASK 0x40
+#define ENAHW_DEV_STS_QUIESCENT_STATE_ACHIEVED_SHIFT 7
+#define ENAHW_DEV_STS_QUIESCENT_STATE_ACHIEVED_MASK 0x80
+
+/* common: ena_admin_aenq_common_desc */
+typedef struct enahw_aenq_desc {
+ uint16_t ead_group;
+ uint16_t ead_syndrome;
+ uint8_t ead_flags;
+ uint8_t ead_rsvd1[3];
+ uint32_t ead_ts_low;
+ uint32_t ead_ts_high;
+
+ union {
+ uint32_t raw[12];
+
+ struct {
+ uint32_t flags;
+ } link_change;
+
+ struct {
+ uint32_t rx_drops_low;
+ uint32_t rx_drops_high;
+ uint32_t tx_drops_low;
+ uint32_t tx_drops_high;
+ } keep_alive;
+ } ead_payload;
+} enahw_aenq_desc_t;
+
+#define ENAHW_AENQ_DESC_PHASE_MASK BIT(0)
+
+#define ENAHW_AENQ_DESC_PHASE(desc) \
+ ((desc)->ead_flags & ENAHW_AENQ_DESC_PHASE_MASK)
+
+#define ENAHW_AENQ_LINK_CHANGE_LINK_STATUS_MASK BIT(0)
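+
+/*
+ * For example, a link change handler would typically consider the
+ * link up when
+ * (desc->ead_payload.link_change.flags &
+ * ENAHW_AENQ_LINK_CHANGE_LINK_STATUS_MASK) != 0.
+ */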
+
+/*
+ * Asynchronous Event Notification Queue groups.
+ *
+ * Note: These values represent the bit position of each feature as
+ * returned by ENAHW_FEAT_AENQ_CONFIG. We encode them this way so that
+ * they can double as an index into the AENQ handlers array.
+ *
+ * common: ena_admin_aenq_group
+ */
+typedef enum enahw_aenq_groups {
+ ENAHW_AENQ_GROUP_LINK_CHANGE = 0,
+ ENAHW_AENQ_GROUP_FATAL_ERROR = 1,
+ ENAHW_AENQ_GROUP_WARNING = 2,
+ ENAHW_AENQ_GROUP_NOTIFICATION = 3,
+ ENAHW_AENQ_GROUP_KEEP_ALIVE = 4,
+ ENAHW_AENQ_GROUP_REFRESH_CAPABILITIES = 5,
+ ENAHW_AENQ_GROUPS_ARR_NUM = 6,
+} enahw_aenq_groups_t;
+
+/*
+ * The reason for ENAHW_AENQ_GROUP_NOTIFICATION.
+ *
+ * common: ena_admin_aenq_notification_syndrome
+ */
+typedef enum enahw_aenq_syndrome {
+ ENAHW_AENQ_SYNDROME_UPDATE_HINTS = 2,
+} enahw_aenq_syndrome_t;
+
+/*
+ * ENA devices use a 48-bit memory space.
+ *
+ * common: ena_common_mem_addr
+ */
+typedef struct enahw_addr {
+ uint32_t ea_low;
+ uint16_t ea_high;
+ uint16_t ea_rsvd; /* must be zero */
+} enahw_addr_t;
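+
+/*
+ * For example, the 48-bit physical address 0x123456789abc is stored
+ * as ea_low = 0x56789abc and ea_high = 0x1234 (see
+ * ena_set_dma_addr()).
+ */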
+
+/* common: ena_admin_ctrl_buff_info */
+struct enahw_ctrl_buff {
+ uint32_t ecb_length;
+ enahw_addr_t ecb_addr;
+};
+
+/* common: ena_admin_get_set_feature_common_desc */
+struct enahw_feat_common {
+ /*
+ * 1:0 Select which value you want.
+ *
+ * 0x1 = Current value.
+ * 0x3 = Default value.
+ *
+	 * Note: Linux seems to set this to 0 to get the value; it is
+	 * not clear whether that is a bug or just another way to get
+	 * the current value.
+ *
+ * 7:3 Reserved.
+ */
+ uint8_t efc_flags;
+
+ /* An id from enahw_feature_id_t. */
+ uint8_t efc_id;
+
+ /*
+ * Each feature is versioned, allowing upgrades to the feature
+ * set without breaking backwards compatibility. The driver
+ * uses this field to specify which version it supports
+ * (starting from zero). Linux doesn't document this very well
+ * and sets this value to 0 for most features. We define a set
+ * of macros, underneath the enahw_feature_id_t type, clearly
+ * documenting the version we support for each feature.
+ */
+ uint8_t efc_version;
+ uint8_t efc_rsvd;
+};
+
+/* common: ena_admin_get_feat_cmd */
+typedef struct enahw_cmd_get_feat {
+ struct enahw_ctrl_buff ecgf_ctrl_buf;
+ struct enahw_feat_common ecgf_comm;
+ uint32_t egcf_unused[11];
+} enahw_cmd_get_feat_t;
+
+/*
+ * N.B. Linux sets efc_flags to 0 (via memset) when reading the
+ * current value, but the comments say it should be 0x1. We follow the
+ * comments.
+ */
+#define ENAHW_GET_FEAT_FLAGS_GET_CURR_VAL(desc) \
+ ((desc)->ecgf_comm.efc_flags) |= 0x1
+#define ENAHW_GET_FEAT_FLAGS_GET_DEF_VAL(desc) \
+ ((desc)->ecgf_comm.efc_flags) |= 0x3
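+
+/*
+ * Note that these macros OR into efc_flags rather than assign it, so
+ * the command descriptor must be zeroed first, as ena_get_feature()
+ * does via bzero() before applying
+ * ENAHW_GET_FEAT_FLAGS_GET_CURR_VAL().
+ */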
+
+/*
+ * Set the MTU of the device. This value does not include the L2
+ * headers or trailers, only the payload.
+ *
+ * common: ena_admin_set_feature_mtu_desc
+ */
+typedef struct enahw_feat_mtu {
+ uint32_t efm_mtu;
+} enahw_feat_mtu_t;
+
+/* common: ena_admin_set_feature_host_attr_desc */
+typedef struct enahw_feat_host_attr {
+ enahw_addr_t efha_os_addr;
+ enahw_addr_t efha_debug_addr;
+ uint32_t efha_debug_sz;
+} enahw_feat_host_attr_t;
+
+/*
+ * ENAHW_FEAT_AENQ_CONFIG
+ *
+ * common: ena_admin_feature_aenq_desc
+ */
+typedef struct enahw_feat_aenq {
+ /* Bitmask of AENQ groups this device supports. */
+ uint32_t efa_supported_groups;
+
+ /* Bitmask of AENQ groups currently enabled. */
+ uint32_t efa_enabled_groups;
+} enahw_feat_aenq_t;
+
+/* common: ena_admin_set_feat_cmd */
+typedef struct enahw_cmd_set_feat {
+ struct enahw_ctrl_buff ecsf_ctrl_buf;
+ struct enahw_feat_common ecsf_comm;
+
+ union {
+ uint32_t ecsf_raw[11];
+ enahw_feat_host_attr_t ecsf_host_attr;
+ enahw_feat_mtu_t ecsf_mtu;
+ enahw_feat_aenq_t ecsf_aenq;
+ } ecsf_feat;
+} enahw_cmd_set_feat_t;
+
+/*
+ * Used to populate the host information buffer which the Nitro
+ * hypervisor supposedly uses for display, debugging, and possibly
+ * other purposes.
+ *
+ * common: ena_admin_host_info
+ */
+typedef struct enahw_host_info {
+ uint32_t ehi_os_type;
+ uint8_t ehi_os_dist_str[128];
+ uint32_t ehi_os_dist;
+ uint8_t ehi_kernel_ver_str[32];
+ uint32_t ehi_kernel_ver;
+ uint32_t ehi_driver_ver;
+ uint32_t ehi_supported_net_features[2];
+ uint16_t ehi_ena_spec_version;
+ uint16_t ehi_bdf;
+ uint16_t ehi_num_cpus;
+ uint16_t ehi_rsvd;
+ uint32_t ehi_driver_supported_features;
+} enahw_host_info_t;
+
+#define ENAHW_HOST_INFO_MAJOR_MASK GENMASK(7, 0)
+#define ENAHW_HOST_INFO_MINOR_SHIFT 8
+#define ENAHW_HOST_INFO_MINOR_MASK GENMASK(15, 8)
+#define ENAHW_HOST_INFO_SUB_MINOR_SHIFT 16
+#define ENAHW_HOST_INFO_SUB_MINOR_MASK GENMASK(23, 16)
+#define ENAHW_HOST_INFO_SPEC_MAJOR_SHIFT 8
+#define ENAHW_HOST_INFO_MODULE_TYPE_SHIFT 24
+#define ENAHW_HOST_INFO_MODULE_TYPE_MASK GENMASK(31, 24)
+#define ENAHW_HOST_INFO_FUNCTION_MASK GENMASK(2, 0)
+#define ENAHW_HOST_INFO_DEVICE_SHIFT 3
+#define ENAHW_HOST_INFO_DEVICE_MASK GENMASK(7, 3)
+#define ENAHW_HOST_INFO_BUS_SHIFT 8
+#define ENAHW_HOST_INFO_BUS_MASK GENMASK(15, 8)
+#define ENAHW_HOST_INFO_RX_OFFSET_SHIFT 1
+#define ENAHW_HOST_INFO_RX_OFFSET_MASK BIT(1)
+#define ENAHW_HOST_INFO_INTERRUPT_MODERATION_SHIFT 2
+#define ENAHW_HOST_INFO_INTERRUPT_MODERATION_MASK BIT(2)
+#define ENAHW_HOST_INFO_RX_BUF_MIRRORING_SHIFT 3
+#define ENAHW_HOST_INFO_RX_BUF_MIRRORING_MASK BIT(3)
+#define ENAHW_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_SHIFT 4
+#define ENAHW_HOST_INFO_RSS_CONFIGURABLE_FUNCTION_KEY_MASK BIT(4)
+
+/* common: ena_admin_os_type */
+enum enahw_os_type {
+ ENAHW_OS_LINUX = 1,
+ ENAHW_OS_WIN = 2,
+ ENAHW_OS_DPDK = 3,
+ ENAHW_OS_FREEBSD = 4,
+ ENAHW_OS_IPXE = 5,
+ ENAHW_OS_ESXI = 6,
+ ENAHW_OS_MACOS = 7,
+ ENAHW_OS_GROUPS_NUM = 7,
+};
+
+/*
+ * Create I/O Completion Queue
+ *
+ * A completion queue is where the device writes responses to I/O
+ * requests. The admin completion queue must be created before such a
+ * command can be issued, see ena_admin_cq_init().
+ *
+ * common: ena_admin_aq_create_cq_cmd
+ */
+typedef struct enahw_cmd_create_cq {
+ /*
+ * 7-6 reserved
+ *
+ * 5 interrupt mode: when set the device sends an interrupt
+ * for each completion, otherwise the driver must poll
+ * the queue.
+ *
+ * 4-0 reserved
+ */
+ uint8_t ecq_caps_1;
+
+ /*
+ * 7-5 reserved
+ *
+ * 4-0 CQ entry size (in words): the size of a single CQ entry
+ * in multiples of 32-bit words.
+ *
+ * NOTE: According to the common code the "valid" values
+ * are 4 or 8 -- this is incorrect. The valid values are
+ * 2 and 4. The common code does have an "extended" Rx
+ * completion descriptor, ena_eth_io_rx_cdesc_ext, that
+ * is 32 bytes and thus would use a value of 8, but it is
+ * not used by the Linux or FreeBSD drivers, so we do not
+ * bother with it.
+ *
+ * Type Bytes Value
+ * enahw_tx_cdesc_t 8 2
+ * enahw_rx_cdesc_t 16 4
+ */
+ uint8_t ecq_caps_2;
+
+ /* The number of CQ entries, must be a power of 2. */
+ uint16_t ecq_num_descs;
+
+ /* The MSI-X vector assigned to this CQ. */
+ uint32_t ecq_msix_vector;
+
+ /*
+ * The CQ's physical base address. The CQ memory must be
+ * physically contiguous.
+ */
+ enahw_addr_t ecq_addr;
+} enahw_cmd_create_cq_t;
+
+#define ENAHW_CMD_CREATE_CQ_INTERRUPT_MODE_ENABLED_SHIFT 5
+#define ENAHW_CMD_CREATE_CQ_INTERRUPT_MODE_ENABLED_MASK (BIT(5))
+#define ENAHW_CMD_CREATE_CQ_DESC_SIZE_WORDS_MASK (GENMASK(4, 0))
+
+#define ENAHW_CMD_CREATE_CQ_INTERRUPT_MODE_ENABLE(cmd) \
+ ((cmd)->ecq_caps_1 |= ENAHW_CMD_CREATE_CQ_INTERRUPT_MODE_ENABLED_MASK)
+
+#define ENAHW_CMD_CREATE_CQ_DESC_SIZE_WORDS(cmd, val) \
+ (((cmd)->ecq_caps_2) |= \
+ ((val) & ENAHW_CMD_CREATE_CQ_DESC_SIZE_WORDS_MASK))
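+
+/*
+ * For example, an Rx CQ whose entries are 16-byte enahw_rx_cdesc_t
+ * structures would be created with
+ * ENAHW_CMD_CREATE_CQ_DESC_SIZE_WORDS(cmd, 4), i.e. 16 bytes in
+ * 4-byte words.
+ */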
+
+/*
+ * Destroy Completion Queue
+ *
+ * common: ena_admin_aq_destroy_cq_cmd
+ */
+typedef struct enahw_cmd_destroy_cq {
+ uint16_t edcq_idx;
+ uint16_t edcq_rsvd;
+} enahw_cmd_destroy_cq_t;
+
+/*
+ * common: ena_admin_aq_create_sq_cmd
+ */
+typedef struct enahw_cmd_create_sq {
+ /*
+ * 7-5 direction: 0x1 = Tx, 0x2 = Rx
+ * 4-0 reserved
+ */
+ uint8_t ecsq_dir;
+ uint8_t ecsq_rsvd1;
+
+ /*
+ * 7 reserved
+ *
+ * 6-4 completion policy: How are completion events generated.
+ *
+ * See enahw_completion_policy_type_t for a description of
+ * the various values.
+ *
+ * 3-0 placement policy: Where the descriptor ring and
+ * headers reside.
+ *
+ * See enahw_placement_policy_t for a description of the
+ * various values.
+ */
+ uint8_t ecsq_caps_2;
+
+ /*
+ * 7-1 reserved
+ *
+ * 0 physically contiguous: When set indicates the descriptor
+ * ring memory is physically contiguous.
+ */
+ uint8_t ecsq_caps_3;
+
+ /*
+ * The index of the associated Completion Queue (CQ). The CQ
+ * must be created before the SQ.
+ */
+ uint16_t ecsq_cq_idx;
+
+ /* The number of descriptors in this SQ. */
+ uint16_t ecsq_num_descs;
+
+ /*
+ * The base physical address of the SQ. This should not be set
+ * for LLQ. Must be page aligned.
+ */
+ enahw_addr_t ecsq_base;
+
+ /*
+ * The physical address of the head write-back pointer. Valid
+ * only when the completion policy is set to one of the head
+ * write-back modes (0x2 or 0x3). Must be cacheline size
+ * aligned.
+ */
+ enahw_addr_t ecsq_head_wb;
+ uint32_t ecsq_rsvdw2;
+ uint32_t ecsq_rsvdw3;
+} enahw_cmd_create_sq_t;
+
+typedef enum enahw_sq_direction {
+ ENAHW_SQ_DIRECTION_TX = 1,
+ ENAHW_SQ_DIRECTION_RX = 2,
+} enahw_sq_direction_t;
+
+typedef enum enahw_placement_policy {
+ /* Descriptors and headers are in host memory. */
+ ENAHW_PLACEMENT_POLICY_HOST = 1,
+
+ /*
+ * Descriptors and headers are in device memory (a.k.a Low
+ * Latency Queue).
+ */
+ ENAHW_PLACEMENT_POLICY_DEV = 3,
+} enahw_placement_policy_t;
+
+/*
+ * DESC: Write a CQ entry for each SQ descriptor.
+ *
+ * DESC_ON_DEMAND: Write a CQ entry when requested by the SQ descriptor.
+ *
+ * HEAD_ON_DEMAND: Update head pointer when requested by the SQ
+ * descriptor.
+ *
+ * HEAD: Update head pointer for each SQ descriptor.
+ *
+ */
+typedef enum enahw_completion_policy_type {
+ ENAHW_COMPLETION_POLICY_DESC = 0,
+ ENAHW_COMPLETION_POLICY_DESC_ON_DEMAND = 1,
+ ENAHW_COMPLETION_POLICY_HEAD_ON_DEMAND = 2,
+ ENAHW_COMPLETION_POLICY_HEAD = 3,
+} enahw_completion_policy_type_t;
+
+#define ENAHW_CMD_CREATE_SQ_DIR_SHIFT 5
+#define ENAHW_CMD_CREATE_SQ_DIR_MASK GENMASK(7, 5)
+#define ENAHW_CMD_CREATE_SQ_PLACEMENT_POLICY_MASK GENMASK(3, 0)
+#define ENAHW_CMD_CREATE_SQ_COMPLETION_POLICY_SHIFT 4
+#define ENAHW_CMD_CREATE_SQ_COMPLETION_POLICY_MASK GENMASK(6, 4)
+#define ENAHW_CMD_CREATE_SQ_PHYSMEM_CONTIG_MASK BIT(0)
+
+#define ENAHW_CMD_CREATE_SQ_DIR(cmd, val) \
+ (((cmd)->ecsq_dir) |= (((val) << ENAHW_CMD_CREATE_SQ_DIR_SHIFT) & \
+ ENAHW_CMD_CREATE_SQ_DIR_MASK))
+
+#define ENAHW_CMD_CREATE_SQ_PLACEMENT_POLICY(cmd, val) \
+ (((cmd)->ecsq_caps_2) |= \
+ ((val) & ENAHW_CMD_CREATE_SQ_PLACEMENT_POLICY_MASK))
+
+#define ENAHW_CMD_CREATE_SQ_COMPLETION_POLICY(cmd, val) \
+ (((cmd)->ecsq_caps_2) |= \
+ (((val) << ENAHW_CMD_CREATE_SQ_COMPLETION_POLICY_SHIFT) & \
+ ENAHW_CMD_CREATE_SQ_COMPLETION_POLICY_MASK))
+
+#define ENAHW_CMD_CREATE_SQ_PHYSMEM_CONTIG(cmd) \
+ ((cmd)->ecsq_caps_3 |= ENAHW_CMD_CREATE_SQ_PHYSMEM_CONTIG_MASK)
+
+/* common: ena_admin_sq */
+typedef struct enahw_cmd_destroy_sq {
+ uint16_t edsq_idx;
+ uint8_t edsq_dir; /* Tx/Rx */
+ uint8_t edsq_rsvd;
+} enahw_cmd_destroy_sq_t;
+
+#define ENAHW_CMD_DESTROY_SQ_DIR_SHIFT 5
+#define ENAHW_CMD_DESTROY_SQ_DIR_MASK GENMASK(7, 5)
+
+#define ENAHW_CMD_DESTROY_SQ_DIR(cmd, val) \
+ (((cmd)->edsq_dir) |= (((val) << ENAHW_CMD_DESTROY_SQ_DIR_SHIFT) & \
+ ENAHW_CMD_DESTROY_SQ_DIR_MASK))
+
+/* common: ena_admin_aq_get_stats_cmd */
+typedef struct enahw_cmd_get_stats {
+ struct enahw_ctrl_buff ecgs_ctrl_buf;
+ uint8_t ecgs_type;
+ uint8_t ecgs_scope;
+ uint16_t ecgs_rsvd;
+ uint16_t ecgs_queue_idx;
+
+ /*
+	 * The device ID from which to query stats. The sentinel
+	 * value 0xFFFF indicates a query of the current device.
+	 * According to the common docs, a "privileged device" may
+	 * query stats for other ENA devices. However, the definition
+	 * of this "privileged device" is not expanded upon.
+ */
+ uint16_t ecgs_device_id;
+} enahw_cmd_get_stats_t;
+
+/* Query the stats for my device. */
+#define ENAHW_CMD_GET_STATS_MY_DEVICE_ID 0xFFFF
+
+/*
+ * BASIC: Returns enahw_resp_basic_stats.
+ *
+ * EXTENDED: According to the Linux documentation returns a buffer in
+ * "string format" with additional statistics per queue and per device ID.
+ *
+ * ENI: According to the Linux documentation it returns "extra HW
+ * stats for a specific network interfaces".
+ *
+ * common: ena_admin_get_stats_type
+ */
+typedef enum enahw_get_stats_type {
+ ENAHW_GET_STATS_TYPE_BASIC = 0,
+ ENAHW_GET_STATS_TYPE_EXTENDED = 1,
+ ENAHW_GET_STATS_TYPE_ENI = 2,
+} enahw_get_stats_type_t;
+
+/* common: ena_admin_get_stats_scope */
+typedef enum enahw_get_stats_scope {
+ ENAHW_GET_STATS_SCOPE_QUEUE = 0,
+ ENAHW_GET_STATS_SCOPE_ETH = 1,
+} enahw_get_stats_scope_t;
+
+/* common: ena_admin_aq_entry */
+typedef struct enahw_cmd_desc {
+ uint16_t ecd_cmd_id;
+ uint8_t ecd_opcode;
+ uint8_t ecd_flags;
+
+ union {
+ uint32_t ecd_raw[15];
+ enahw_cmd_get_feat_t ecd_get_feat;
+ enahw_cmd_set_feat_t ecd_set_feat;
+ enahw_cmd_create_cq_t ecd_create_cq;
+ enahw_cmd_destroy_cq_t ecd_destroy_cq;
+ enahw_cmd_create_sq_t ecd_create_sq;
+ enahw_cmd_destroy_sq_t ecd_destroy_sq;
+ enahw_cmd_get_stats_t ecd_get_stats;
+ } ecd_cmd;
+
+} enahw_cmd_desc_t;
+
+/*
+ * top level commands that may be sent to the Admin Queue.
+ *
+ * common: ena_admin_aq_opcode
+ */
+typedef enum ena_cmd_opcode {
+ ENAHW_CMD_NONE = 0,
+ ENAHW_CMD_CREATE_SQ = 1,
+ ENAHW_CMD_DESTROY_SQ = 2,
+ ENAHW_CMD_CREATE_CQ = 3,
+ ENAHW_CMD_DESTROY_CQ = 4,
+ ENAHW_CMD_GET_FEATURE = 8,
+ ENAHW_CMD_SET_FEATURE = 9,
+ ENAHW_CMD_GET_STATS = 11,
+} enahw_cmd_opcode_t;
+
+/* common: ENA_ADMIN_AQ_COMMON_DESC */
+#define ENAHW_CMD_ID_MASK GENMASK(11, 0)
+#define ENAHW_CMD_PHASE_MASK BIT(0)
+
+#define ENAHW_CMD_ID(desc, id) \
+ (((desc)->ecd_cmd_id) |= ((id) & ENAHW_CMD_ID_MASK))
+
+/*
+ * Subcommands for ENA_ADMIN_{GET,SET}_FEATURE.
+ *
+ * common: ena_admin_aq_feature_id
+ */
+typedef enum enahw_feature_id {
+ ENAHW_FEAT_DEVICE_ATTRIBUTES = 1,
+ ENAHW_FEAT_MAX_QUEUES_NUM = 2,
+ ENAHW_FEAT_HW_HINTS = 3,
+ ENAHW_FEAT_LLQ = 4,
+ ENAHW_FEAT_EXTRA_PROPERTIES_STRINGS = 5,
+ ENAHW_FEAT_EXTRA_PROPERTIES_FLAGS = 6,
+ ENAHW_FEAT_MAX_QUEUES_EXT = 7,
+ ENAHW_FEAT_RSS_HASH_FUNCTION = 10,
+ ENAHW_FEAT_STATELESS_OFFLOAD_CONFIG = 11,
+ ENAHW_FEAT_RSS_INDIRECTION_TABLE_CONFIG = 12,
+ ENAHW_FEAT_MTU = 14,
+ ENAHW_FEAT_RSS_HASH_INPUT = 18,
+ ENAHW_FEAT_INTERRUPT_MODERATION = 20,
+ ENAHW_FEAT_AENQ_CONFIG = 26,
+ ENAHW_FEAT_LINK_CONFIG = 27,
+ ENAHW_FEAT_HOST_ATTR_CONFIG = 28,
+ ENAHW_FEAT_NUM = 32,
+} enahw_feature_id_t;
+
+/*
+ * The following macros define the maximum version we support for each
+ * feature. These are the feature versions we use to communicate with
+ * the feature command. Linux has these values spread throughout the
+ * code at the various callsites of ena_com_get_feature(). We choose
+ * to centralize our feature versions to make it easier to audit.
+ */
+#define ENAHW_FEAT_DEVICE_ATTRIBUTES_VER 0
+#define ENAHW_FEAT_MAX_QUEUES_NUM_VER 0
+#define ENAHW_FEAT_HW_HINTS_VER 0
+#define ENAHW_FEAT_LLQ_VER 0
+#define ENAHW_FEAT_EXTRA_PROPERTIES_STRINGS_VER 0
+#define ENAHW_FEAT_EXTRA_PROPERTIES_FLAGS_VER 0
+#define ENAHW_FEAT_MAX_QUEUES_EXT_VER 1
+#define ENAHW_FEAT_RSS_HASH_FUNCTION_VER 0
+#define ENAHW_FEAT_STATELESS_OFFLOAD_CONFIG_VER 0
+#define ENAHW_FEAT_RSS_INDIRECTION_TABLE_CONFIG_VER 0
+#define ENAHW_FEAT_MTU_VER 0
+#define ENAHW_FEAT_RSS_HASH_INPUT_VER 0
+#define ENAHW_FEAT_INTERRUPT_MODERATION_VER 0
+#define ENAHW_FEAT_AENQ_CONFIG_VER 0
+#define ENAHW_FEAT_LINK_CONFIG_VER 0
+#define ENAHW_FEAT_HOST_ATTR_CONFIG_VER 0
+
+/* common: ena_admin_link_types */
+typedef enum enahw_link_speeds {
+ ENAHW_LINK_SPEED_1G = 0x1,
+ ENAHW_LINK_SPEED_2_HALF_G = 0x2,
+ ENAHW_LINK_SPEED_5G = 0x4,
+ ENAHW_LINK_SPEED_10G = 0x8,
+ ENAHW_LINK_SPEED_25G = 0x10,
+ ENAHW_LINK_SPEED_40G = 0x20,
+ ENAHW_LINK_SPEED_50G = 0x40,
+ ENAHW_LINK_SPEED_100G = 0x80,
+ ENAHW_LINK_SPEED_200G = 0x100,
+ ENAHW_LINK_SPEED_400G = 0x200,
+} enahw_link_speeds_t;
+
+/*
+ * Response to ENAHW_FEAT_HW_HINTS.
+ *
+ * Hints from the device to the driver about what values to use for
+ * various communications between the two. A value of 0 indicates
+ * there is no hint and the driver should provide its own default. All
+ * timeout values are in milliseconds.
+ *
+ * common: ena_admin_ena_hw_hints
+ */
+typedef struct enahw_device_hints {
+ /*
+ * The amount of time the driver should wait for an MMIO read
+ * reply before giving up and returning an error.
+ */
+ uint16_t edh_mmio_read_timeout;
+
+ /*
+ * If the driver has not seen an AENQ keep alive in this
+ * timeframe, then consider the device hung and perform a
+ * reset.
+ */
+ uint16_t edh_keep_alive_timeout;
+
+ /*
+	 * The time period in which we expect a Tx to report
+ * completion, otherwise it is considered "missed". Initiate a
+ * device reset when the number of missed completions is
+ * greater than the threshold.
+ */
+ uint16_t edh_tx_comp_timeout;
+ uint16_t edh_missed_tx_reset_threshold;
+
+ /*
+	 * The time period in which we expect an admin command to
+ * report completion.
+ */
+ uint16_t edh_admin_comp_timeout;
+
+ /*
+ * Used by Linux to set the netdevice 'watchdog_timeo' value.
+ * This value is used by the networking stack to determine
+ * when a pending transmission has stalled. This is similar to
+	 * the keep alive timeout, except it views progress from
+	 * the perspective of the network stack itself. This difference
+	 * is subtle but important: the device could be in a state
+	 * where it has a functioning keep alive heartbeat, but has a
+	 * stuck Tx queue impeding forward progress of the networking
+	 * stack (which in many cases results in a scenario
+	 * indistinguishable from a complete host hang).
+ *
+ * The mac layer does not currently provide such
+ * functionality, though it could and should be extended to
+ * support such a feature.
+ */
+ uint16_t edh_net_wd_timeout;
+
+ /*
+ * The maximum number of cookies/segments allowed in a DMA
+ * scatter-gather list.
+ */
+ uint16_t edh_max_tx_sgl;
+ uint16_t edh_max_rx_sgl;
+
+ uint16_t reserved[8];
+} enahw_device_hints_t;
+
+/*
+ * Response to ENAHW_FEAT_DEVICE_ATTRIBUTES.
+ *
+ * common: ena_admin_device_attr_feature_desc
+ */
+typedef struct enahw_feat_dev_attr {
+ uint32_t efda_impl_id;
+ uint32_t efda_device_version;
+
+ /*
+ * Bitmap representing supported get/set feature subcommands
+ * (enahw_feature_id).
+ */
+ uint32_t efda_supported_features;
+ uint32_t efda_rsvd1;
+
+	 * Number of bits used for physical/virtual address.
+ uint32_t efda_phys_addr_width;
+ uint32_t efda_virt_addr_with;
+
+ /* The unicast MAC address in network byte order. */
+ uint8_t efda_mac_addr[6];
+ uint8_t efda_rsvd2[2];
+ uint32_t efda_max_mtu;
+} enahw_feat_dev_attr_t;
+
+/*
+ * Response to ENAHW_FEAT_MAX_QUEUES_NUM.
+ *
+ * common: ena_admin_queue_feature_desc
+ */
+typedef struct enahw_feat_max_queue {
+ uint32_t efmq_max_sq_num;
+ uint32_t efmq_max_sq_depth;
+ uint32_t efmq_max_cq_num;
+ uint32_t efmq_max_cq_depth;
+ uint32_t efmq_max_legacy_llq_num;
+ uint32_t efmq_max_legacy_llq_depth;
+ uint32_t efmq_max_header_size;
+
+ /*
+ * The maximum number of descriptors a single Tx packet may
+ * span. This includes the meta descriptor.
+ */
+ uint16_t efmq_max_per_packet_tx_descs;
+
+ /*
+ * The maximum number of descriptors a single Rx packet may span.
+ */
+ uint16_t efmq_max_per_packet_rx_descs;
+} enahw_feat_max_queue_t;
+
+/*
+ * Response to ENAHW_FEAT_MAX_QUEUES_EXT.
+ *
+ * common: ena_admin_queue_ext_feature_desc
+ */
+typedef struct enahw_feat_max_queue_ext {
+ uint8_t efmqe_version;
+ uint8_t efmqe_rsvd[3];
+
+ uint32_t efmqe_max_tx_sq_num;
+ uint32_t efmqe_max_tx_cq_num;
+ uint32_t efmqe_max_rx_sq_num;
+ uint32_t efmqe_max_rx_cq_num;
+ uint32_t efmqe_max_tx_sq_depth;
+ uint32_t efmqe_max_tx_cq_depth;
+ uint32_t efmqe_max_rx_sq_depth;
+ uint32_t efmqe_max_rx_cq_depth;
+ uint32_t efmqe_max_tx_header_size;
+
+ /*
+ * The maximum number of descriptors a single Tx packet may
+ * span. This includes the meta descriptor.
+ */
+ uint16_t efmqe_max_per_packet_tx_descs;
+
+ /*
+ * The maximum number of descriptors a single Rx packet may span.
+ */
+ uint16_t efmqe_max_per_packet_rx_descs;
+} enahw_feat_max_queue_ext_t;
+
+/*
+ * Response to ENA_ADMIN_LINK_CONFIG.
+ *
+ * common: ena_admin_get_feature_link_desc
+ */
+typedef struct enahw_feat_link_conf {
+ /* Link speed in Mbit/s. */
+ uint32_t eflc_speed;
+
+ /* Bit field of enahw_link_speeds_t. */
+ uint32_t eflc_supported;
+
+ /*
+ * 31-2: reserved
+ * 1: duplex - Full Duplex
+ * 0: autoneg
+ */
+ uint32_t eflc_flags;
+} enahw_feat_link_conf_t;
+
+#define ENAHW_FEAT_LINK_CONF_AUTONEG_MASK BIT(0)
+#define ENAHW_FEAT_LINK_CONF_DUPLEX_SHIFT 1
+#define ENAHW_FEAT_LINK_CONF_DUPLEX_MASK BIT(1)
+
+#define ENAHW_FEAT_LINK_CONF_AUTONEG(f) \
+ ((f)->eflc_flags & ENAHW_FEAT_LINK_CONF_AUTONEG_MASK)
+
+#define ENAHW_FEAT_LINK_CONF_FULL_DUPLEX(f) \
+ ((((f)->eflc_flags & ENAHW_FEAT_LINK_CONF_DUPLEX_MASK) >> \
+ ENAHW_FEAT_LINK_CONF_DUPLEX_SHIFT) == 1)
+
+/*
+ * Response to ENAHW_FEAT_STATELESS_OFFLOAD_CONFIG.
+ *
+ * common: ena_admin_feature_offload_desc
+ */
+typedef struct enahw_feat_offload {
+ /*
+ * 0 : Tx IPv4 Header Checksum
+ * 1 : Tx L4/IPv4 Partial Checksum
+ *
+ * The L4 checksum field should be initialized with pseudo
+ * header checksum.
+ *
+ * 2 : Tx L4/IPv4 Checksum Full
+ * 3 : Tx L4/IPv6 Partial Checksum
+ *
+ * The L4 checksum field should be initialized with pseudo
+ * header checksum.
+ *
+ * 4 : Tx L4/IPv6 Checksum Full
+ * 5 : TCP/IPv4 LSO (aka TSO)
+ * 6 : TCP/IPv6 LSO (aka TSO)
+ * 7 : LSO ECN
+ */
+ uint32_t efo_tx;
+
+ /*
+ * Receive side supported stateless offload.
+ *
+ * 0 : Rx IPv4 Header Checksum
+ * 1 : Rx TCP/UDP + IPv4 Full Checksum
+ * 2 : Rx TCP/UDP + IPv6 Full Checksum
+ * 3 : Rx hash calculation
+ */
+ uint32_t efo_rx_supported;
+
+ /* Linux seems to only check rx_supported. */
+ uint32_t efo_rx_enabled;
+} enahw_feat_offload_t;
+
+/* Feature Offloads */
+#define ENAHW_FEAT_OFFLOAD_TX_L3_IPV4_CSUM_MASK BIT(0)
+#define ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_PART_SHIFT 1
+#define ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_PART_MASK BIT(1)
+#define ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_FULL_SHIFT 2
+#define ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_FULL_MASK BIT(2)
+#define ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_PART_SHIFT 3
+#define ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_PART_MASK BIT(3)
+#define ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_FULL_SHIFT 4
+#define ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_FULL_MASK BIT(4)
+#define ENAHW_FEAT_OFFLOAD_TSO_IPV4_SHIFT 5
+#define ENAHW_FEAT_OFFLOAD_TSO_IPV4_MASK BIT(5)
+#define ENAHW_FEAT_OFFLOAD_TSO_IPV6_SHIFT 6
+#define ENAHW_FEAT_OFFLOAD_TSO_IPV6_MASK BIT(6)
+#define ENAHW_FEAT_OFFLOAD_TSO_ECN_SHIFT 7
+#define ENAHW_FEAT_OFFLOAD_TSO_ECN_MASK BIT(7)
+#define ENAHW_FEAT_OFFLOAD_RX_L3_IPV4_CSUM_MASK BIT(0)
+#define ENAHW_FEAT_OFFLOAD_RX_L4_IPV4_CSUM_SHIFT 1
+#define ENAHW_FEAT_OFFLOAD_RX_L4_IPV4_CSUM_MASK BIT(1)
+#define ENAHW_FEAT_OFFLOAD_RX_L4_IPV6_CSUM_SHIFT 2
+#define ENAHW_FEAT_OFFLOAD_RX_L4_IPV6_CSUM_MASK BIT(2)
+#define ENAHW_FEAT_OFFLOAD_RX_HASH_SHIFT 3
+#define ENAHW_FEAT_OFFLOAD_RX_HASH_MASK BIT(3)
+
+#define ENAHW_FEAT_OFFLOAD_TX_L3_IPV4_CSUM(f) \
+ (((f)->efo_tx & ENAHW_FEAT_OFFLOAD_TX_L3_IPV4_CSUM_MASK) != 0)
+
+#define ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_PART(f) \
+ (((f)->efo_tx & ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_PART_MASK) != 0)
+
+#define ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_FULL(f) \
+ (((f)->efo_tx & ENAHW_FEAT_OFFLOAD_TX_L4_IPV4_CSUM_FULL_MASK) != 0)
+
+#define ENAHW_FEAT_OFFLOAD_TSO_IPV4(f) \
+ (((f)->efo_tx & ENAHW_FEAT_OFFLOAD_TSO_IPV4_MASK) != 0)
+
+#define ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_PART(f) \
+ (((f)->efo_tx & ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_PART_MASK) != 0)
+
+#define ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_FULL(f) \
+ (((f)->efo_tx & ENAHW_FEAT_OFFLOAD_TX_L4_IPV6_CSUM_FULL_MASK) != 0)
+
+#define ENAHW_FEAT_OFFLOAD_TSO_IPV6(f) \
+ (((f)->efo_tx & ENAHW_FEAT_OFFLOAD_TSO_IPV6_MASK) != 0)
+
+#define ENAHW_FEAT_OFFLOAD_RX_L3_IPV4_CSUM(f) \
+ (((f)->efo_rx_supported & ENAHW_FEAT_OFFLOAD_RX_L3_IPV4_CSUM_MASK) != 0)
+
+#define ENAHW_FEAT_OFFLOAD_RX_L4_IPV4_CSUM(f) \
+ (((f)->efo_rx_supported & ENAHW_FEAT_OFFLOAD_RX_L4_IPV4_CSUM_MASK) != 0)
+
+#define ENAHW_FEAT_OFFLOAD_RX_L4_IPV6_CSUM(f) \
+ (((f)->efo_rx_supported & ENAHW_FEAT_OFFLOAD_RX_L4_IPV6_CSUM_MASK) != 0)
+
+typedef union enahw_resp_get_feat {
+ uint32_t ergf_raw[14];
+ enahw_feat_dev_attr_t ergf_dev_attr;
+ enahw_feat_max_queue_t ergf_max_queue;
+ enahw_feat_max_queue_ext_t ergf_max_queue_ext;
+ enahw_feat_aenq_t ergf_aenq;
+ enahw_feat_link_conf_t ergf_link_conf;
+ enahw_feat_offload_t ergf_offload;
+} enahw_resp_get_feat_u;
+
+/*
+ * common: ena_admin_acq_create_cq_resp_desc
+ */
+typedef struct enahw_resp_create_cq {
+ /*
+ * The hardware's index for this queue.
+ */
+ uint16_t ercq_idx;
+
+ /*
+ * Apparently the number of descriptors granted may be
+	 * different from the number requested.
+ */
+ uint16_t ercq_actual_num_descs;
+ uint32_t ercq_numa_node_reg_offset;
+ uint32_t ercq_head_db_reg_offset; /* doorbell */
+ uint32_t ercq_interrupt_mask_reg_offset; /* stop intr */
+} enahw_resp_create_cq_t;
+
+/* common: ena_admin_acq_create_sq_resp_desc */
+typedef struct enahw_resp_create_sq {
+ uint16_t ersq_idx;
+ uint16_t ersq_rsvdw1;
+ uint32_t ersq_db_reg_offset;
+ uint32_t ersq_llq_descs_reg_offset;
+ uint32_t ersq_llq_headers_reg_offset;
+} enahw_resp_create_sq_t;
+
+/* common: ena_admin_basic_stats */
+typedef struct enahw_resp_basic_stats {
+ uint32_t erbs_tx_bytes_low;
+ uint32_t erbs_tx_bytes_high;
+ uint32_t erbs_tx_pkts_low;
+ uint32_t erbs_tx_pkts_high;
+ uint32_t erbs_rx_bytes_low;
+ uint32_t erbs_rx_bytes_high;
+ uint32_t erbs_rx_pkts_low;
+ uint32_t erbs_rx_pkts_high;
+ uint32_t erbs_rx_drops_low;
+ uint32_t erbs_rx_drops_high;
+ uint32_t erbs_tx_drops_low;
+ uint32_t erbs_tx_drops_high;
+} enahw_resp_basic_stats_t;
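+
+/*
+ * Each 64-bit statistic is split across a low/high pair of 32-bit
+ * fields; for example, the total Tx byte count can be reconstructed
+ * as ((uint64_t)erbs_tx_bytes_high << 32) | erbs_tx_bytes_low.
+ */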
+
+/* common: ena_admin_eni_stats */
+typedef struct enahw_resp_eni_stats {
+ /*
+ * The number of inbound packets dropped due to aggregate
+ * inbound bandwidth allowance being exceeded.
+ */
+ uint64_t eres_bw_in_exceeded;
+
+ /*
+ * The number of outbound packets dropped due to aggregated outbound
+ * bandwidth allowance being exceeded.
+ */
+ uint64_t eres_bw_out_exceeded;
+
+ /*
+ * The number of packets dropped due to the Packets Per Second
+ * allowance being exceeded.
+ */
+ uint64_t eres_pps_exceeded;
+
+ /*
+ * The number of packets dropped due to connection tracking
+ * allowance being exceeded and leading to failure in
+ * establishment of new connections.
+ */
+ uint64_t eres_conns_exceeded;
+
+ /*
+ * The number of packets dropped due to linklocal packet rate
+ * allowance being exceeded.
+ */
+ uint64_t eres_linklocal_exceeded;
+} enahw_resp_eni_stats_t;
+
+/*
+ * common: ena_admin_acq_entry
+ */
+typedef struct enahw_resp_desc {
+ /* The index of the completed command. */
+ uint16_t erd_cmd_id;
+
+ /* The status of the command (enahw_resp_status_t). */
+ uint8_t erd_status;
+
+ /*
+ * 7-1 Reserved
+ * 0 Phase
+ */
+ uint8_t erd_flags;
+
+ /* Extended status. */
+ uint16_t erd_ext_status;
+
+ /*
+ * The AQ entry (enahw_cmd_desc) index which has been consumed
+ * by the device and can be reused. However, this field is not
+ * used in the other drivers, and it seems to be redundant
+ * with the erd_idx field.
+ */
+ uint16_t erd_sq_head_idx;
+
+ union {
+ uint32_t raw[14];
+ enahw_resp_get_feat_u erd_get_feat;
+ enahw_resp_create_cq_t erd_create_cq;
+ /* destroy_cq: No command-specific response. */
+ enahw_resp_create_sq_t erd_create_sq;
+ /* destroy_sq: No command-specific response. */
+ enahw_resp_basic_stats_t erd_basic_stats;
+ enahw_resp_eni_stats_t erd_eni_stats;
+ } erd_resp;
+} enahw_resp_desc_t;
+
+/* common: ENA_ADMIN_ACQ_COMMON_DESC */
+#define ENAHW_RESP_CMD_ID_MASK GENMASK(11, 0)
+#define ENAHW_RESP_PHASE_MASK 0x1
+
+#define ENAHW_RESP_CMD_ID(desc) \
+ (((desc)->erd_cmd_id) & ENAHW_RESP_CMD_ID_MASK)
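+
+/*
+ * A sketch of how an ACQ consumer might use these definitions: check
+ * the phase bit before trusting a descriptor, then extract the
+ * command ID and status (the driver's real completion handling also
+ * deals with timeouts and extended status):
+ *
+ *	enahw_resp_desc_t *resp = &acq_descs[head & (num_descs - 1)];
+ *
+ *	if ((resp->erd_flags & ENAHW_RESP_PHASE_MASK) == phase) {
+ *		uint16_t cmd_id = ENAHW_RESP_CMD_ID(resp);
+ *		enahw_resp_status_t st = resp->erd_status;
+ *		... match cmd_id back to the issued command, check st ...
+ *	}
+ */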
+
+/*
+ * The response status of an Admin Queue command.
+ *
+ * common: ena_admin_aq_completion_status
+ */
+typedef enum enahw_resp_status {
+ ENAHW_RESP_SUCCESS = 0,
+ ENAHW_RESP_RESOURCE_ALLOCATION_FAILURE = 1,
+ ENAHW_RESP_BAD_OPCODE = 2,
+ ENAHW_RESP_UNSUPPORTED_OPCODE = 3,
+ ENAHW_RESP_MALFORMED_REQUEST = 4,
+ /*
+	 * At this point the common code mentions that there is
+	 * "additional status" in the response descriptor's
+ * erd_ext_status field. As the common code never actually
+ * uses this field it's hard to know the exact meaning of the
+ * comment. My best guess is the illegal parameter error
+ * stores additional context in the erd_ext_status field. But
+ * how to interpret that additional context is anyone's guess.
+ */
+ ENAHW_RESP_ILLEGAL_PARAMETER = 5,
+ ENAHW_RESP_UNKNOWN_ERROR = 6,
+ ENAHW_RESP_RESOURCE_BUSY = 7,
+} enahw_resp_status_t;
+
+/*
+ * Not really a device structure, more of a helper to debug register values.
+ */
+typedef struct enahw_reg_nv {
+ char *ern_name;
+ uint32_t ern_offset;
+ uint32_t ern_value;
+} enahw_reg_nv_t;
+
+/*
+ * I/O macros and structures.
+ * -------------------------
+ */
+
+/*
+ * The device's L3 and L4 protocol numbers. These are specific to the
+ * ENA device and not to be confused with IANA protocol numbers.
+ *
+ * common: ena_eth_io_l3_proto_index
+ */
+typedef enum enahw_io_l3_proto {
+ ENAHW_IO_L3_PROTO_UNKNOWN = 0,
+ ENAHW_IO_L3_PROTO_IPV4 = 8,
+ ENAHW_IO_L3_PROTO_IPV6 = 11,
+ ENAHW_IO_L3_PROTO_FCOE = 21,
+ ENAHW_IO_L3_PROTO_ROCE = 22,
+} enahw_io_l3_proto_t;
+
+/* common: ena_eth_io_l4_proto_index */
+typedef enum enahw_io_l4_proto {
+ ENAHW_IO_L4_PROTO_UNKNOWN = 0,
+ ENAHW_IO_L4_PROTO_TCP = 12,
+ ENAHW_IO_L4_PROTO_UDP = 13,
+ ENAHW_IO_L4_PROTO_ROUTEABLE_ROCE = 23,
+} enahw_io_l4_proto_t;
+
+/* common: ena_eth_io_tx_desc */
+typedef struct enahw_tx_data_desc {
+ /*
+ * 15-0 Buffer Length (LENGTH)
+ *
+ * The buffer length in bytes. This should NOT include the
+ * Ethernet FCS bytes.
+ *
+ * 21-16 Request ID High Bits [15-10] (REQ_ID_HI)
+ * 22 Reserved Zero
+ * 23 Metadata Flag always zero (META_DESC)
+ *
+ * This flag indicates if the descriptor is a metadata
+ * descriptor or not. In this case we are defining the Tx
+ * descriptor, so it's always zero.
+ *
+ * 24 Phase bit (PHASE)
+ * 25 Reserved Zero
+ * 26 First Descriptor Bit (FIRST)
+ *
+ * Indicates this is the first descriptor for the frame.
+ *
+ * 27 Last Descriptor Bit (LAST)
+ *
+ * Indicates this is the last descriptor for the frame.
+ *
+ * 28 Completion Request Bit (COMP_REQ)
+ *
+ * Indicates if completion should be posted after the
+ * frame is transmitted. This bit is only valid on the
+ * first descriptor.
+ *
+ * 31-29 Reserved Zero
+ */
+ uint32_t etd_len_ctrl;
+
+ /*
+ * 3-0 L3 Protocol Number (L3_PROTO_IDX)
+ *
+ * The L3 protocol type, one of enahw_io_l3_proto_t. This
+ * field is required when L3_CSUM_EN or TSO_EN is set.
+ *
+ * 4 Don't Fragment Bit (DF)
+ *
+ * The value of IPv4 DF. This value must copy the value
+ * found in the packet's IPv4 header.
+ *
+ * 6-5 Reserved Zero
+ * 7 TSO Bit (TSO_EN)
+ *
+ * Enable TCP Segment Offload.
+ *
+ * 12-8 L4 Protocol Number (L4_PROTO_IDX)
+ *
+ * The L4 protocol type, one of enahw_io_l4_proto_t. This
+ * field is required when L4_CSUM_EN or TSO_EN are
+ * set.
+ *
+ * 13 L3 Checksum Offload (L3_CSUM_EN)
+ *
+ * Enable IPv4 header checksum offload.
+ *
+ * 14 L4 Checksum Offload (L4_CSUM_EN)
+ *
+ * Enable TCP/UDP checksum offload.
+ *
+ * 15 Ethernet FCS Disable (ETHERNET_FCS_DIS)
+ *
+ * Disable the device's Ethernet Frame Check sequence.
+ *
+ * 16 Reserved Zero
+ * 17 L4 Partial Checksum Present (L4_CSUM_PARTIAL)
+ *
+ * When set it indicates the host has already provided
+ * the pseudo-header checksum. Otherwise, it is up to the
+ * device to calculate it.
+ *
+ * When set and using TSO the host stack must remember
+ * not to include the TCP segment length in the supplied
+ * pseudo-header.
+ *
+ * The host stack should provide the pseudo-header
+ * checksum when using IPv6 with Routing Headers.
+ *
+ * 21-18 Reserved Zero
+ * 31-22 Request ID Low [9-0] (REQ_ID_LO)
+ */
+ uint32_t etd_meta_ctrl;
+
+ /* The low 32 bits of the buffer address. */
+ uint32_t etd_buff_addr_lo;
+
+ /*
+ * address high and header size
+ *
+ * 15-0 Buffer Address High [47-32] (ADDR_HI)
+ *
+	 *	The upper 16 bits (bits 47-32) of the buffer address.
+ *
+ * 23-16 Reserved Zero
+ * 31-24 Header Length (HEADER_LENGTH)
+ *
+ * This field has dubious documentation in the
+ * common/Linux driver code, even contradicting itself in
+ * the same sentence. Here's what it says, verbatim:
+ *
+ * > Header length. For Low Latency Queues, this fields
+ * > indicates the number of bytes written to the
+ * > headers' memory. For normal queues, if packet is TCP
+ * > or UDP, and longer than max_header_size, then this
+ * > field should be set to the sum of L4 header offset
+ * > and L4 header size(without options), otherwise, this
+ * > field should be set to 0. For both modes, this field
+ * > must not exceed the max_header_size. max_header_size
+ * > value is reported by the Max Queues Feature
+ * > descriptor
+ *
+ * Here's what one _might_ ascertain from the above.
+ *
+ * 1. This field should always be set in the case of
+ * LLQs/device placement.
+ *
+ * 2. This field must _never_ exceed the max header size
+ * as reported by feature detection. In our code this
+ * would be efmq_max_header_size for older ENA devices
+ * and efmqe_max_tx_header_size for newer ones. One
+ * empirical data point from a t3.small (with newer
+ * device) is a max Tx header size of 128 bytes.
+ *
+ * 3. If the packet is TCP or UDP, and the packet (or the
+ * headers?) is longer than the max header size, then
+ * this field should be set to the total header size
+ * with the exception of TCP header options.
+ * Otherwise, if the packet is not TCP or UDP, or if
+ * the packet (or header length?) _does not_ exceed
+ * the max header size, then set this value to 0.
+ *
+ * One might think, based on (3), that when the header
+ * size exceeds the max this field needs to be set, but
+ * that contradicts (2), which dictates that the total
+ * header size can never exceed the max. Sure enough, the
+ * Linux code drops all packets with headers that exceed
+ * the max. So in that case it would mean that "and
+ * longer than max_header_size" is referring to the total
+ * packet length. So for most workloads, the TCP/UDP
+ * packets should have this field set, to indicate their
+ * header length. This matches with Linux, which seems to
+ * set header length regardless of IP protocol.
+ *
+ * However, the FreeBSD code tells a different story. In
+	 *	its non-LLQ Tx path it has the following comment,
+ * verbatim:
+ *
+ * > header_len is just a hint for the device. Because
+ * > FreeBSD is not giving us information about packet
+ * > header length and it is not guaranteed that all
+ * > packet headers will be in the 1st mbuf, setting
+ * > header_len to 0 is making the device ignore this
+ * > value and resolve header on it's own.
+ *
+ * According to this we can just set the value to zero
+ * and let the device figure it out. This maps better to
+ * illumos, where we also allow the header to potentially
+ * span multiple mblks (though we do have access to the
+ * header sizes via mac_ether_offload_info_t).
+ *
+ * The upshot: for now we take advantage of the device's
+ * ability to determine the header length on its own, at
+ * the potential cost of some performance (not measured).
+ */
+ uint32_t etd_buff_addr_hi_hdr_sz;
+} enahw_tx_data_desc_t;
+
+#define ENAHW_TX_DESC_LENGTH_MASK GENMASK(15, 0)
+#define ENAHW_TX_DESC_REQ_ID_HI_SHIFT 16
+#define ENAHW_TX_DESC_REQ_ID_HI_MASK GENMASK(21, 16)
+#define ENAHW_TX_DESC_META_DESC_SHIFT 23
+#define ENAHW_TX_DESC_META_DESC_MASK BIT(23)
+#define ENAHW_TX_DESC_PHASE_SHIFT 24
+#define ENAHW_TX_DESC_PHASE_MASK BIT(24)
+#define ENAHW_TX_DESC_FIRST_SHIFT 26
+#define ENAHW_TX_DESC_FIRST_MASK BIT(26)
+#define ENAHW_TX_DESC_LAST_SHIFT 27
+#define ENAHW_TX_DESC_LAST_MASK BIT(27)
+#define ENAHW_TX_DESC_COMP_REQ_SHIFT 28
+#define ENAHW_TX_DESC_COMP_REQ_MASK BIT(28)
+#define ENAHW_TX_DESC_L3_PROTO_IDX_MASK GENMASK(3, 0)
+#define ENAHW_TX_DESC_DF_SHIFT 4
+#define ENAHW_TX_DESC_DF_MASK BIT(4)
+#define ENAHW_TX_DESC_TSO_EN_SHIFT 7
+#define ENAHW_TX_DESC_TSO_EN_MASK BIT(7)
+#define ENAHW_TX_DESC_L4_PROTO_IDX_SHIFT 8
+#define ENAHW_TX_DESC_L4_PROTO_IDX_MASK GENMASK(12, 8)
+#define ENAHW_TX_DESC_L3_CSUM_EN_SHIFT 13
+#define ENAHW_TX_DESC_L3_CSUM_EN_MASK BIT(13)
+#define ENAHW_TX_DESC_L4_CSUM_EN_SHIFT 14
+#define ENAHW_TX_DESC_L4_CSUM_EN_MASK BIT(14)
+#define ENAHW_TX_DESC_ETHERNET_FCS_DIS_SHIFT 15
+#define ENAHW_TX_DESC_ETHERNET_FCS_DIS_MASK BIT(15)
+#define ENAHW_TX_DESC_L4_CSUM_PARTIAL_SHIFT 17
+#define ENAHW_TX_DESC_L4_CSUM_PARTIAL_MASK BIT(17)
+#define ENAHW_TX_DESC_REQ_ID_LO_SHIFT 22
+#define ENAHW_TX_DESC_REQ_ID_LO_MASK GENMASK(31, 22)
+#define ENAHW_TX_DESC_ADDR_HI_MASK GENMASK(15, 0)
+#define ENAHW_TX_DESC_HEADER_LENGTH_SHIFT 24
+#define ENAHW_TX_DESC_HEADER_LENGTH_MASK GENMASK(31, 24)
+
+#define ENAHW_TX_DESC_LENGTH(desc, len) \
+ (((desc)->etd_len_ctrl) |= ((len) & ENAHW_TX_DESC_LENGTH_MASK))
+
+#define ENAHW_TX_DESC_FIRST_ON(desc) \
+ (((desc)->etd_len_ctrl) |= ENAHW_TX_DESC_FIRST_MASK)
+
+#define ENAHW_TX_DESC_FIRST_OFF(desc) \
+ (((desc)->etd_len_ctrl) &= ~ENAHW_TX_DESC_FIRST_MASK)
+
+#define ENAHW_TX_DESC_REQID_HI(desc, reqid) \
+ (((desc)->etd_len_ctrl) |= \
+ ((((reqid) >> 10) << ENAHW_TX_DESC_REQ_ID_HI_SHIFT) & \
+ ENAHW_TX_DESC_REQ_ID_HI_MASK))
+
+#define ENAHW_TX_DESC_REQID_LO(desc, reqid) \
+ (((desc)->etd_meta_ctrl) |= \
+ (((reqid) << ENAHW_TX_DESC_REQ_ID_LO_SHIFT) & \
+ ENAHW_TX_DESC_REQ_ID_LO_MASK))
+
+#define ENAHW_TX_DESC_PHASE(desc, phase) \
+ (((desc)->etd_len_ctrl) |= (((phase) << ENAHW_TX_DESC_PHASE_SHIFT) & \
+ ENAHW_TX_DESC_PHASE_MASK))
+
+#define ENAHW_TX_DESC_LAST_ON(desc) \
+ (((desc)->etd_len_ctrl) |= ENAHW_TX_DESC_LAST_MASK)
+
+#define ENAHW_TX_DESC_LAST_OFF(desc) \
+ (((desc)->etd_len_ctrl) &= ~ENAHW_TX_DESC_LAST_MASK)
+
+#define ENAHW_TX_DESC_COMP_REQ_ON(desc) \
+ (((desc)->etd_len_ctrl) |= ENAHW_TX_DESC_COMP_REQ_MASK)
+
+#define ENAHW_TX_DESC_COMP_REQ_OFF(desc) \
+ (((desc)->etd_len_ctrl) &= ~ENAHW_TX_DESC_COMP_REQ_MASK)
+
+#define ENAHW_TX_DESC_META_DESC_ON(desc) \
+ (((desc)->etd_len_ctrl) |= ENAHW_TX_DESC_META_DESC_MASK)
+
+#define ENAHW_TX_DESC_META_DESC_OFF(desc) \
+ (((desc)->etd_len_ctrl) &= ~ENAHW_TX_DESC_META_DESC_MASK)
+
+#define ENAHW_TX_DESC_ADDR_LO(desc, addr) \
+ (((desc)->etd_buff_addr_lo) = (addr))
+
+#define ENAHW_TX_DESC_ADDR_HI(desc, addr) \
+ (((desc)->etd_buff_addr_hi_hdr_sz) |= \
+ (((addr) >> 32) & ENAHW_TX_DESC_ADDR_HI_MASK))
+
+#define ENAHW_TX_DESC_HEADER_LENGTH(desc, len) \
+ (((desc)->etd_buff_addr_hi_hdr_sz) |= \
+ (((len) << ENAHW_TX_DESC_HEADER_LENGTH_SHIFT) & \
+ ENAHW_TX_DESC_HEADER_LENGTH_MASK))
+
+#define ENAHW_TX_DESC_DF_ON(desc) \
+ ((desc)->etd_meta_ctrl |= ENAHW_TX_DESC_DF_MASK)
+
+#define ENAHW_TX_DESC_TSO_OFF(desc) \
+ (((desc)->etd_meta_ctrl) &= ~ENAHW_TX_DESC_TSO_EN_MASK)
+
+#define ENAHW_TX_DESC_L3_CSUM_OFF(desc) \
+ (((desc)->etd_meta_ctrl) &= ~ENAHW_TX_DESC_L3_CSUM_EN_MASK)
+
+#define ENAHW_TX_DESC_L4_CSUM_OFF(desc) \
+ (((desc)->etd_meta_ctrl) &= ~ENAHW_TX_DESC_L4_CSUM_EN_MASK)
+
+#define ENAHW_TX_DESC_L4_CSUM_PARTIAL_ON(desc) \
+	(((desc)->etd_meta_ctrl) |= ENAHW_TX_DESC_L4_CSUM_PARTIAL_MASK)
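+
+/*
+ * Taken together, filling a single-fragment Tx data descriptor looks
+ * roughly like the following sketch; the driver's real version is
+ * ena_fill_tx_data_desc() in ena_tx.c.
+ *
+ *	bzero(desc, sizeof (*desc));
+ *	ENAHW_TX_DESC_LENGTH(desc, len);
+ *	ENAHW_TX_DESC_REQID_HI(desc, req_id);
+ *	ENAHW_TX_DESC_REQID_LO(desc, req_id);
+ *	ENAHW_TX_DESC_PHASE(desc, phase);
+ *	ENAHW_TX_DESC_FIRST_ON(desc);
+ *	ENAHW_TX_DESC_LAST_ON(desc);
+ *	ENAHW_TX_DESC_COMP_REQ_ON(desc);
+ *	ENAHW_TX_DESC_ADDR_LO(desc, addr & 0xFFFFFFFFu);
+ *	ENAHW_TX_DESC_ADDR_HI(desc, addr);
+ */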
+
+/* common: ena_eth_io_tx_meta_desc */
+typedef struct enahw_tx_meta_desc {
+ /*
+ * 9-0 Request ID Low [9-0] (REQ_ID_LO)
+ * 13-10 Reserved Zero
+ * 14 Extended Metadata Valid (EXT_VALID)
+ *
+ * When set this descriptor contains valid extended
+ * metadata. The extended metadata includes the L3/L4
+ * length and offset fields as well as the MSS bits. This
+ * is needed for TSO.
+ *
+ * 15 Reserved Zero
+ * 19-16 MSS High Bits (MSS_HI)
+ * 20 Meta Type (ETH_META_TYPE)
+ *
+ * If enabled this is an extended metadata descriptor.
+ * This seems redundant with EXT_VALID.
+ *
+ * 21 Meta Store (META_STORE)
+ *
+ * Store the extended metadata in the queue cache.
+ *
+ * 22 Reserved Zero
+ * 23 Metadata Flag (META_DESC) -- always one
+ * 24 Phase (PHASE)
+ * 25 Reserved Zero
+ * 26 First Descriptor Bit (FIRST)
+ * 27 Last Descriptor Bit (LAST)
+ * 28 Completion Request Bit (COMP_REQ)
+ * 31-29 Reserved Zero
+ */
+ uint32_t etmd_len_ctrl;
+
+ /*
+ * 5-0 Request ID High Bits [15-10] (REQ_ID_HI)
+ * 31-6 Reserved Zero
+ */
+ uint32_t etmd_word1;
+
+ /*
+ * 7-0 L3 Header Length (L3_HDR_LEN)
+	 * 15-8	L3 Header Offset (L3_HDR_OFF)
+	 * 21-16 L4 Header Length in Words (L4_HDR_LEN_IN_WORDS)
+ *
+ * Specifies the L4 header length in words. The device
+ * assumes the L4 header follows directly after the L3
+ * header and that the L4 offset is equal to L3_HDR_OFF +
+ * L3_HDR_LEN.
+ *
+ * 31-22 MSS Low Bits (MSS_LO)
+ */
+ uint32_t etmd_word2;
+ uint32_t etmd_reserved;
+} enahw_tx_meta_desc_t;
+
+/* common: N/A */
+typedef union enahw_tx_desc {
+ enahw_tx_data_desc_t etd_data;
+ enahw_tx_meta_desc_t etd_meta;
+} enahw_tx_desc_t;
+
+/* common: ena_eth_io_tx_cdesc */
+typedef struct enahw_tx_cdesc {
+ /*
+ * 15-0 Request ID Bits
+ * 16 Reserved Zero
+ */
+ uint16_t etc_req_id;
+
+ /*
+ * Presumably the status of the Tx, though the Linux driver
+ * never checks this field.
+ */
+ uint8_t etc_status;
+
+ /*
+ * 0 Phase
+ * 7-1 Reserved Zero
+ */
+ uint8_t etc_flags;
+
+ /*
+ * This isn't documented or used in the Linux driver, but
+ * these probably store the submission queue ID and the
+ * submission queue head index.
+ */
+ uint16_t etc_sub_qid;
+ uint16_t etc_sq_head_idx;
+} enahw_tx_cdesc_t;
+
+#define ENAHW_TX_CDESC_PHASE_SHIFT 0
+#define ENAHW_TX_CDESC_PHASE_MASK BIT(0)
+
+#define ENAHW_TX_CDESC_GET_PHASE(cdesc) \
+ ((cdesc)->etc_flags & ENAHW_TX_CDESC_PHASE_MASK)
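+
+/*
+ * As with the Rx completion ring, Tx completions are consumed by
+ * walking the CQ until the phase bit stops matching the expected
+ * value, flipping the expected phase each time the ring wraps. A
+ * sketch, where recycle() stands in for the caller's per-request
+ * cleanup:
+ *
+ *	while (ENAHW_TX_CDESC_GET_PHASE(cdesc) == phase) {
+ *		recycle(cdesc->etc_req_id);
+ *		head++;
+ *		if ((head & (num_descs - 1)) == 0)
+ *			phase = !phase;
+ *		cdesc = &cq_descs[head & (num_descs - 1)];
+ *	}
+ */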
+
+/* common: ena_eth_io_rx_desc */
+typedef struct enahw_rx_desc {
+ /*
+ * The length of the buffer provided by the host, in bytes.
+ * Use the value of 0 to indicate 64K.
+ */
+ uint16_t erd_length;
+ uint8_t erd_reserved1;
+
+ /*
+ * 0 Phase (PHASE)
+ * 1 Reserved Zero
+ * 2 First (FIRST)
+ *
+ * Indicates this is the first descriptor for the frame.
+ *
+ * 3 Last (LAST)
+ *
+ * Indicates this is the last descriptor for the frame.
+ *
+ * 4 Completion Request (COMP_REQ)
+ *
+ * Indicates that a completion request should be generated
+ * for this descriptor.
+ *
+ * 7-5 Reserved Zero
+ */
+ uint8_t erd_ctrl;
+
+ /*
+ * 15-0 Request ID
+ * 16 Reserved 0
+ */
+ uint16_t erd_req_id;
+ uint16_t erd_reserved2;
+
+ /* The physical address of the buffer provided by the host. */
+ uint32_t erd_buff_addr_lo;
+ uint16_t erd_buff_addr_hi;
+ uint16_t erd_reserved3;
+} enahw_rx_desc_t;
+
+#define ENAHW_RX_DESC_PHASE_MASK BIT(0)
+#define ENAHW_RX_DESC_FIRST_SHIFT 2
+#define ENAHW_RX_DESC_FIRST_MASK BIT(2)
+#define ENAHW_RX_DESC_LAST_SHIFT 3
+#define ENAHW_RX_DESC_LAST_MASK BIT(3)
+#define ENAHW_RX_DESC_COMP_REQ_SHIFT 4
+#define ENAHW_RX_DESC_COMP_REQ_MASK BIT(4)
+
+#define ENAHW_RX_DESC_SET_PHASE(desc, val) \
+ ((desc)->erd_ctrl |= ((val) & ENAHW_RX_DESC_PHASE_MASK))
+
+#define ENAHW_RX_DESC_SET_FIRST(desc) \
+ ((desc)->erd_ctrl |= ENAHW_RX_DESC_FIRST_MASK)
+
+#define ENAHW_RX_DESC_SET_LAST(desc) \
+ ((desc)->erd_ctrl |= ENAHW_RX_DESC_LAST_MASK)
+
+#define ENAHW_RX_DESC_SET_COMP_REQ(desc) \
+ ((desc)->erd_ctrl |= ENAHW_RX_DESC_COMP_REQ_MASK)
+
+/*
+ * Ethernet parsing information is only valid when last == 1.
+ *
+ * common: ena_eth_io_rx_cdesc_base
+ */
+typedef struct enahw_rx_cdesc {
+ /*
+ * 4-0 L3 Protocol Number (L3_PROTO)
+ *
+ * The L3 protocol type, one of enahw_io_l3_proto_t.
+ *
+ * 6-5 (SRC_VLAN_CNT)
+ * 7 Reserved Zero
+ * 12-8 L4 Protocol Number (L4_PROTO)
+ * 13 L3 Checksum Error (L3_CSUM_ERR)
+ *
+ * When set either the L3 checksum failed to match or the
+ * controller didn't attempt to validate the checksum.
+ * This bit is valid only when L3_PROTO indicates an IPv4
+ * packet.
+ *
+ * 14 L4 Checksum Error (L4_CSUM_ERR)
+ *
+ * When set either the L4 checksum failed to match or the
+ * controller didn't attempt to validate the checksum.
+ * This bit is valid only when L4_PROTO indicates a
+ * TCP/UDP packet, IPV4_FRAG is not set, and
+ * L4_CSUM_CHECKED is set.
+ *
+ * 15 IPv4 Fragmented (IPV4_FRAG)
+ * 16 L4 Checksum Validated (L4_CSUM_CHECKED)
+ *
+ * When set it indicates the device attempted to validate
+ * the L4 checksum.
+ *
+ * 23-17 Reserved Zero
+ * 24 Phase (PHASE)
+ * 25 (L3_CSUM2)
+ *
+ * According to the Linux source this is the "second
+ * checksum engine result". It's never checked.
+ *
+ * 26 First Descriptor Bit (FIRST)
+ *
+ * Indicates the first descriptor for the frame.
+ *
+ * 27 Last Descriptor Bit (LAST)
+ *
+ * Indicates the last descriptor for the frame.
+ *
+ * 29-28 Reserved Zero
+ * 30 Buffer Type (BUFFER)
+ *
+	 *	When set, this indicates a data descriptor.
+	 *	Otherwise, it is a metadata descriptor.
+	 *
+	 * 31	Reserved
+ */
+ uint32_t erc_status;
+ uint16_t erc_length;
+ uint16_t erc_req_id;
+
+ /* 32-bit hash result */
+ uint32_t erc_hash;
+ uint16_t erc_sub_qid;
+
+ /*
+ * The device may choose to offset the start of the header
+ * data (which implies this value only applies to the first
+ * descriptor). When and why the device does this is not
+ * documented in the common code. The most likely case would
+ * be for IP header alignment.
+ */
+ uint8_t erc_offset;
+ uint8_t erc_reserved;
+} enahw_rx_cdesc_t;
+
+#define ENAHW_RX_CDESC_L3_PROTO_MASK GENMASK(4, 0)
+#define ENAHW_RX_CDESC_SRC_VLAN_CNT_SHIFT 5
+#define ENAHW_RX_CDESC_SRC_VLAN_CNT_MASK GENMASK(6, 5)
+#define ENAHW_RX_CDESC_L4_PROTO_SHIFT 8
+#define ENAHW_RX_CDESC_L4_PROTO_MASK GENMASK(12, 8)
+#define ENAHW_RX_CDESC_L3_CSUM_ERR_SHIFT 13
+#define ENAHW_RX_CDESC_L3_CSUM_ERR_MASK BIT(13)
+#define ENAHW_RX_CDESC_L4_CSUM_ERR_SHIFT 14
+#define ENAHW_RX_CDESC_L4_CSUM_ERR_MASK BIT(14)
+#define ENAHW_RX_CDESC_IPV4_FRAG_SHIFT 15
+#define ENAHW_RX_CDESC_IPV4_FRAG_MASK BIT(15)
+#define ENAHW_RX_CDESC_L4_CSUM_CHECKED_SHIFT 16
+#define ENAHW_RX_CDESC_L4_CSUM_CHECKED_MASK BIT(16)
+#define ENAHW_RX_CDESC_PHASE_SHIFT 24
+#define ENAHW_RX_CDESC_PHASE_MASK BIT(24)
+#define ENAHW_RX_CDESC_L3_CSUM2_SHIFT 25
+#define ENAHW_RX_CDESC_L3_CSUM2_MASK BIT(25)
+#define ENAHW_RX_CDESC_FIRST_SHIFT 26
+#define ENAHW_RX_CDESC_FIRST_MASK BIT(26)
+#define ENAHW_RX_CDESC_LAST_SHIFT 27
+#define ENAHW_RX_CDESC_LAST_MASK BIT(27)
+#define ENAHW_RX_CDESC_BUFFER_SHIFT 30
+#define ENAHW_RX_CDESC_BUFFER_MASK BIT(30)
+
+#define ENAHW_RX_CDESC_L3_PROTO(desc) \
+ ((desc)->erc_status & ENAHW_RX_CDESC_L3_PROTO_MASK)
+
+#define ENAHW_RX_CDESC_L3_CSUM_ERR(desc) \
+ ((((desc)->erc_status & ENAHW_RX_CDESC_L3_CSUM_ERR_MASK) >> \
+ ENAHW_RX_CDESC_L3_CSUM_ERR_SHIFT) != 0)
+
+#define ENAHW_RX_CDESC_L4_PROTO(desc) \
+ (((desc)->erc_status & ENAHW_RX_CDESC_L4_PROTO_MASK) >> \
+ ENAHW_RX_CDESC_L4_PROTO_SHIFT)
+
+#define ENAHW_RX_CDESC_L4_CSUM_CHECKED(desc) \
+ ((((desc)->erc_status & ENAHW_RX_CDESC_L4_CSUM_CHECKED_MASK) >> \
+ ENAHW_RX_CDESC_L4_CSUM_CHECKED_SHIFT) != 0)
+
+#define ENAHW_RX_CDESC_L4_CSUM_ERR(desc) \
+ ((((desc)->erc_status & ENAHW_RX_CDESC_L4_CSUM_ERR_MASK) >> \
+ ENAHW_RX_CDESC_L4_CSUM_ERR_SHIFT) != 0)
+
+#define ENAHW_RX_CDESC_PHASE(desc) \
+ (((desc)->erc_status & ENAHW_RX_CDESC_PHASE_MASK) >> \
+ ENAHW_RX_CDESC_PHASE_SHIFT)
+
+#define ENAHW_RX_CDESC_FIRST(desc) \
+ ((((desc)->erc_status & ENAHW_RX_CDESC_FIRST_MASK) >> \
+ ENAHW_RX_CDESC_FIRST_SHIFT) == 1)
+
+#define ENAHW_RX_CDESC_LAST(desc) \
+ ((((desc)->erc_status & ENAHW_RX_CDESC_LAST_MASK) >> \
+ ENAHW_RX_CDESC_LAST_SHIFT) == 1)
+
+/*
+ * Controls for the interrupt register mapped to each Rx/Tx CQ.
+ */
+#define ENAHW_REG_INTR_RX_DELAY_MASK GENMASK(14, 0)
+#define ENAHW_REG_INTR_TX_DELAY_SHIFT 15
+#define ENAHW_REG_INTR_TX_DELAY_MASK GENMASK(29, 15)
+#define ENAHW_REG_INTR_UNMASK_SHIFT 30
+#define ENAHW_REG_INTR_UNMASK_MASK BIT(30)
+
+#define ENAHW_REG_INTR_UNMASK(val) \
+ ((val) |= ENAHW_REG_INTR_UNMASK_MASK)
+
+#define ENAHW_REG_INTR_MASK(val) \
+ ((val) &= ~ENAHW_REG_INTR_UNMASK_MASK)
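+
+/*
+ * The interrupt paths in this driver only toggle the unmask bit,
+ * leaving the delay (moderation) fields at zero. A caller wanting
+ * hardware interrupt moderation would also encode the delays, e.g.:
+ *
+ *	uint32_t ctrl = (rx_delay & ENAHW_REG_INTR_RX_DELAY_MASK) |
+ *	    (((uint32_t)tx_delay << ENAHW_REG_INTR_TX_DELAY_SHIFT) &
+ *	    ENAHW_REG_INTR_TX_DELAY_MASK);
+ *
+ *	ENAHW_REG_INTR_UNMASK(ctrl);
+ *	ena_hw_abs_write32(ena, cq_unmask_addr, ctrl);
+ */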
+
+#endif /* _ENA_HW_H */
diff --git a/usr/src/uts/common/io/ena/ena_intr.c b/usr/src/uts/common/io/ena/ena_intr.c
new file mode 100644
index 0000000000..2650609cfa
--- /dev/null
+++ b/usr/src/uts/common/io/ena/ena_intr.c
@@ -0,0 +1,175 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2021 Oxide Computer Company
+ */
+#include "ena.h"
+
+/*
+ * We currently limit the number of Tx/Rx queues to the number of
+ * available interrupts (minus one for the admin queue).
+ */
+static uint_t
+ena_io_intr(caddr_t arg1, caddr_t arg2)
+{
+ ena_t *ena = (ena_t *)arg1;
+ uint16_t vector = (uintptr_t)(void *)arg2;
+ ASSERT3U(vector, >, 0);
+ ASSERT3U(vector, <, ena->ena_num_intrs);
+ ena_txq_t *txq = &ena->ena_txqs[vector - 1];
+ ena_rxq_t *rxq = &ena->ena_rxqs[vector - 1];
+ uint32_t intr_ctrl;
+
+ ASSERT3P(txq, !=, NULL);
+ ASSERT3P(rxq, !=, NULL);
+ ena_tx_intr_work(txq);
+ ena_rx_intr_work(rxq);
+
+ /*
+	 * The Rx and Tx queues share the same interrupt; we only
+	 * need to unmask it via one of them.
+ */
+ intr_ctrl = ena_hw_abs_read32(ena, txq->et_cq_unmask_addr);
+ ENAHW_REG_INTR_UNMASK(intr_ctrl);
+ ena_hw_abs_write32(ena, txq->et_cq_unmask_addr, intr_ctrl);
+ return (DDI_INTR_CLAIMED);
+}
+
+static uint_t
+ena_admin_intr(caddr_t arg1, caddr_t arg2)
+{
+ ena_t *ena = (ena_t *)arg1;
+
+ ena_aenq_work(ena);
+ return (DDI_INTR_CLAIMED);
+}
+
+void
+ena_intr_remove_handlers(ena_t *ena)
+{
+ for (int i = 0; i < ena->ena_num_intrs; i++) {
+ int ret = ddi_intr_remove_handler(ena->ena_intr_handles[i]);
+
+ /* Nothing we can really do except log. */
+ if (ret != DDI_SUCCESS) {
+ ena_err(ena, "failed to remove interrupt handler for "
+ "vector %d: %d", i, ret);
+ }
+ }
+}
+
+/*
+ * The ena driver uses separate interrupt handlers for the admin queue
+ * and I/O queues.
+ */
+boolean_t
+ena_intr_add_handlers(ena_t *ena)
+{
+ ASSERT3S(ena->ena_num_intrs, >=, 2);
+ if (ddi_intr_add_handler(ena->ena_intr_handles[0], ena_admin_intr, ena,
+ (void *)(uintptr_t)0) != DDI_SUCCESS) {
+ ena_err(ena, "failed to add admin interrupt handler");
+ return (B_FALSE);
+ }
+
+ for (int i = 1; i < ena->ena_num_intrs; i++) {
+ caddr_t vector = (void *)(uintptr_t)(i);
+ int ret = ddi_intr_add_handler(ena->ena_intr_handles[i],
+ ena_io_intr, ena, vector);
+
+ if (ret != DDI_SUCCESS) {
+ ena_err(ena, "failed to add I/O interrupt handler "
+ "for vector %u", i);
+
+ /*
+ * If we fail to add any I/O handler, then all
+ * successfully added handlers are removed,
+ * including the admin handler. For example,
+ * when i=2 we remove handler 1 (the first I/O
+ * handler), and when i=1 we remove handler 0
+ * (the admin handler).
+ */
+ while (i >= 1) {
+ i--;
+ (void) ddi_intr_remove_handler(
+ ena->ena_intr_handles[i]);
+ }
+
+ return (B_FALSE);
+ }
+ }
+
+ return (B_TRUE);
+}
+
+boolean_t
+ena_intrs_disable(ena_t *ena)
+{
+ int ret;
+
+ if (ena->ena_intr_caps & DDI_INTR_FLAG_BLOCK) {
+ if ((ret = ddi_intr_block_disable(ena->ena_intr_handles,
+ ena->ena_num_intrs)) != DDI_SUCCESS) {
+ ena_err(ena, "failed to block disable interrupts: %d",
+ ret);
+ return (B_FALSE);
+ }
+ } else {
+ for (int i = 0; i < ena->ena_num_intrs; i++) {
+ ret = ddi_intr_disable(ena->ena_intr_handles[i]);
+ if (ret != DDI_SUCCESS) {
+ ena_err(ena, "failed to disable interrupt "
+ "%d: %d", i, ret);
+ return (B_FALSE);
+ }
+ }
+ }
+
+ return (B_TRUE);
+}
+
+boolean_t
+ena_intrs_enable(ena_t *ena)
+{
+ int ret;
+
+ if (ena->ena_intr_caps & DDI_INTR_FLAG_BLOCK) {
+ if ((ret = ddi_intr_block_enable(ena->ena_intr_handles,
+ ena->ena_num_intrs)) != DDI_SUCCESS) {
+ ena_err(ena, "failed to block enable interrupts: %d",
+ ret);
+ return (B_FALSE);
+ }
+ } else {
+ for (int i = 0; i < ena->ena_num_intrs; i++) {
+ if ((ret = ddi_intr_enable(ena->ena_intr_handles[i])) !=
+ DDI_SUCCESS) {
+ ena_err(ena, "failed to enable interrupt "
+ "%d: %d", i, ret);
+
+ /*
+ * If we fail to enable any interrupt,
+ * then all interrupts are disabled.
+ */
+ while (i >= 1) {
+ i--;
+ (void) ddi_intr_disable(
+ ena->ena_intr_handles[i]);
+ }
+
+ return (B_FALSE);
+ }
+ }
+ }
+
+ return (B_TRUE);
+}
diff --git a/usr/src/uts/common/io/ena/ena_rx.c b/usr/src/uts/common/io/ena/ena_rx.c
new file mode 100644
index 0000000000..7f0b7db94a
--- /dev/null
+++ b/usr/src/uts/common/io/ena/ena_rx.c
@@ -0,0 +1,531 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2021 Oxide Computer Company
+ */
+#include "ena.h"
+
+static void
+ena_refill_rx(ena_rxq_t *rxq, uint16_t num)
+{
+ VERIFY3P(rxq, !=, NULL);
+ ASSERT(MUTEX_HELD(&rxq->er_lock));
+ ASSERT3U(num, <=, rxq->er_sq_num_descs);
+ uint16_t tail_mod = rxq->er_sq_tail_idx & (rxq->er_sq_num_descs - 1);
+
+ while (num != 0) {
+ enahw_rx_desc_t *desc = &rxq->er_sq_descs[tail_mod];
+ ena_rx_ctrl_block_t *rcb = &rxq->er_rcbs[tail_mod];
+ uint16_t phase = rxq->er_sq_phase;
+
+ VERIFY3U(tail_mod, <, rxq->er_sq_num_descs);
+ VERIFY3P(desc, !=, NULL);
+ VERIFY3P(rcb, !=, NULL);
+ VERIFY3P(desc, >=, rxq->er_sq_descs);
+ VERIFY3P(desc, <=,
+ (rxq->er_sq_descs + rxq->er_sq_num_descs - 1));
+
+ desc->erd_length = rcb->ercb_dma.edb_len;
+ desc->erd_req_id = tail_mod;
+ VERIFY3P(rcb->ercb_dma.edb_cookie, !=, NULL);
+ ena_set_dma_addr_values(rxq->er_ena,
+ rcb->ercb_dma.edb_cookie->dmac_laddress,
+ &desc->erd_buff_addr_lo, &desc->erd_buff_addr_hi);
+ ENAHW_RX_DESC_SET_PHASE(desc, phase);
+ ENAHW_RX_DESC_SET_FIRST(desc);
+ ENAHW_RX_DESC_SET_LAST(desc);
+ ENAHW_RX_DESC_SET_COMP_REQ(desc);
+ DTRACE_PROBE1(ena__refill__rx, enahw_rx_desc_t *, desc);
+ rxq->er_sq_tail_idx++;
+ tail_mod = rxq->er_sq_tail_idx & (rxq->er_sq_num_descs - 1);
+
+ if (tail_mod == 0) {
+ rxq->er_sq_phase = !rxq->er_sq_phase;
+ }
+
+ num--;
+ }
+
+ ENA_DMA_SYNC(rxq->er_sq_dma, DDI_DMA_SYNC_FORDEV);
+ ena_hw_abs_write32(rxq->er_ena, rxq->er_sq_db_addr,
+ rxq->er_sq_tail_idx);
+}
+
+void
+ena_free_rx_dma(ena_rxq_t *rxq)
+{
+ if (rxq->er_rcbs != NULL) {
+ for (uint_t i = 0; i < rxq->er_sq_num_descs; i++) {
+ ena_rx_ctrl_block_t *rcb = &rxq->er_rcbs[i];
+ ena_dma_free(&rcb->ercb_dma);
+ }
+
+ kmem_free(rxq->er_rcbs,
+ sizeof (*rxq->er_rcbs) * rxq->er_sq_num_descs);
+
+ rxq->er_rcbs = NULL;
+ }
+
+ ena_dma_free(&rxq->er_cq_dma);
+ rxq->er_cq_descs = NULL;
+ rxq->er_cq_num_descs = 0;
+
+ ena_dma_free(&rxq->er_sq_dma);
+ rxq->er_sq_descs = NULL;
+ rxq->er_sq_num_descs = 0;
+
+ rxq->er_state &= ~ENA_RXQ_STATE_HOST_ALLOC;
+}
+
+static int
+ena_alloc_rx_dma(ena_rxq_t *rxq)
+{
+ ena_t *ena = rxq->er_ena;
+ size_t cq_descs_sz;
+ size_t sq_descs_sz;
+ ena_dma_conf_t conf;
+ int err = 0;
+
+ cq_descs_sz = rxq->er_cq_num_descs * sizeof (*rxq->er_cq_descs);
+ sq_descs_sz = rxq->er_sq_num_descs * sizeof (*rxq->er_sq_descs);
+ conf = (ena_dma_conf_t) {
+ .edc_size = sq_descs_sz,
+ .edc_align = ENAHW_IO_SQ_DESC_BUF_ALIGNMENT,
+ .edc_sgl = 1,
+ .edc_endian = DDI_NEVERSWAP_ACC,
+ .edc_stream = B_FALSE,
+ };
+
+ if (!ena_dma_alloc(ena, &rxq->er_sq_dma, &conf, sq_descs_sz)) {
+ return (ENOMEM);
+ }
+
+ rxq->er_sq_descs = (void *)rxq->er_sq_dma.edb_va;
+ rxq->er_rcbs = kmem_zalloc(sizeof (*rxq->er_rcbs) *
+ rxq->er_sq_num_descs, KM_SLEEP);
+
+ for (uint_t i = 0; i < rxq->er_sq_num_descs; i++) {
+ ena_rx_ctrl_block_t *rcb = &rxq->er_rcbs[i];
+ ena_dma_conf_t buf_conf = {
+ .edc_size = ena->ena_rx_buf_sz,
+ .edc_align = 1,
+ .edc_sgl = ena->ena_rx_sgl_max_sz,
+ .edc_endian = DDI_NEVERSWAP_ACC,
+ .edc_stream = B_TRUE,
+ };
+
+ if (!ena_dma_alloc(ena, &rcb->ercb_dma, &buf_conf,
+ ena->ena_rx_buf_sz)) {
+ err = ENOMEM;
+ goto error;
+ }
+ }
+
+ conf = (ena_dma_conf_t) {
+ .edc_size = cq_descs_sz,
+ .edc_align = ENAHW_IO_CQ_DESC_BUF_ALIGNMENT,
+ .edc_sgl = 1,
+ .edc_endian = DDI_NEVERSWAP_ACC,
+ .edc_stream = B_FALSE,
+ };
+
+ if (!ena_dma_alloc(ena, &rxq->er_cq_dma, &conf, cq_descs_sz)) {
+ err = ENOMEM;
+ goto error;
+ }
+
+ rxq->er_cq_descs = (void *)rxq->er_cq_dma.edb_va;
+ rxq->er_state |= ENA_RXQ_STATE_HOST_ALLOC;
+ return (0);
+
+error:
+ ena_free_rx_dma(rxq);
+ return (err);
+}
+
+boolean_t
+ena_alloc_rxq(ena_rxq_t *rxq)
+{
+ int ret = 0;
+ ena_t *ena = rxq->er_ena;
+ uint16_t cq_hw_idx, sq_hw_idx;
+ uint32_t *cq_unmask_addr, *cq_headdb, *cq_numanode;
+ uint32_t *sq_db_addr;
+
+ /*
+ * First, allocate the Rx data buffers.
+ */
+ if ((ret = ena_alloc_rx_dma(rxq)) != 0) {
+ ena_err(ena, "failed to allocate Rx queue %u data buffers: %d",
+ rxq->er_rxqs_idx, ret);
+ return (B_FALSE);
+ }
+
+ ASSERT(rxq->er_state & ENA_RXQ_STATE_HOST_ALLOC);
+
+ /*
+ * Second, create the Completion Queue.
+ */
+ ret = ena_create_cq(ena, rxq->er_cq_num_descs,
+ rxq->er_cq_dma.edb_cookie->dmac_laddress, B_FALSE,
+ rxq->er_intr_vector, &cq_hw_idx, &cq_unmask_addr, &cq_headdb,
+ &cq_numanode);
+
+ if (ret != 0) {
+ ena_err(ena, "failed to create Rx CQ %u: %d", rxq->er_rxqs_idx,
+ ret);
+ return (B_FALSE);
+ }
+
+ /* The phase must always start on 1. */
+ rxq->er_cq_phase = 1;
+ rxq->er_cq_head_idx = 0;
+ rxq->er_cq_hw_idx = cq_hw_idx;
+ rxq->er_cq_unmask_addr = cq_unmask_addr;
+ rxq->er_cq_head_db_addr = cq_headdb;
+ rxq->er_cq_numa_addr = cq_numanode;
+ rxq->er_state |= ENA_RXQ_STATE_CQ_CREATED;
+
+ /*
+ * Third, create the Submission Queue to match with the above
+ * CQ. At this time we force the SQ and CQ to have the same
+ * number of descriptors as we only use a 1:1 completion
+ * policy. However, in the future, we could loosen this and
+ * use an on-demand completion policy and the two could have a
+ * different number of descriptors.
+ */
+ ASSERT3U(rxq->er_sq_num_descs, ==, rxq->er_cq_num_descs);
+ ret = ena_create_sq(ena, rxq->er_sq_num_descs,
+ rxq->er_sq_dma.edb_cookie->dmac_laddress, B_FALSE, cq_hw_idx,
+ &sq_hw_idx, &sq_db_addr);
+
+ if (ret != 0) {
+ ena_err(ena, "failed to create Rx SQ %u: %d", rxq->er_rxqs_idx,
+ ret);
+ return (B_FALSE);
+ }
+
+ ASSERT3P(sq_db_addr, !=, NULL);
+ rxq->er_sq_hw_idx = sq_hw_idx;
+ rxq->er_sq_db_addr = sq_db_addr;
+ /* The phase must always start on 1. */
+ rxq->er_sq_phase = 1;
+ rxq->er_sq_tail_idx = 0;
+ rxq->er_sq_avail_descs = rxq->er_sq_num_descs;
+ rxq->er_mode = ENA_RXQ_MODE_INTR;
+ rxq->er_state |= ENA_RXQ_STATE_SQ_CREATED;
+
+ return (B_TRUE);
+}
+
+void
+ena_cleanup_rxq(ena_rxq_t *rxq)
+{
+ int ret = 0;
+ ena_t *ena = rxq->er_ena;
+
+ if ((rxq->er_state & ENA_RXQ_STATE_SQ_CREATED) != 0) {
+ ret = ena_destroy_sq(ena, rxq->er_sq_hw_idx, B_FALSE);
+
+ if (ret != 0) {
+ ena_err(ena, "failed to destroy Rx SQ %u: %d",
+ rxq->er_rxqs_idx, ret);
+ }
+
+ rxq->er_sq_hw_idx = 0;
+ rxq->er_sq_db_addr = NULL;
+ rxq->er_sq_tail_idx = 0;
+ rxq->er_sq_phase = 0;
+ rxq->er_state &= ~ENA_RXQ_STATE_SQ_CREATED;
+ }
+
+ if ((rxq->er_state & ENA_RXQ_STATE_CQ_CREATED) != 0) {
+ ret = ena_destroy_cq(ena, rxq->er_cq_hw_idx);
+
+ if (ret != 0) {
+ ena_err(ena, "failed to destroy Rx CQ %u: %d",
+ rxq->er_rxqs_idx, ret);
+ }
+
+ rxq->er_cq_hw_idx = 0;
+ rxq->er_cq_head_idx = 0;
+ rxq->er_cq_phase = 0;
+ rxq->er_cq_head_db_addr = NULL;
+ rxq->er_cq_unmask_addr = NULL;
+ rxq->er_cq_numa_addr = NULL;
+ rxq->er_state &= ~ENA_RXQ_STATE_CQ_CREATED;
+ }
+
+ ena_free_rx_dma(rxq);
+ ASSERT3S(rxq->er_state, ==, ENA_RXQ_STATE_NONE);
+}
+
+void
+ena_ring_rx_stop(mac_ring_driver_t rh)
+{
+ ena_rxq_t *rxq = (ena_rxq_t *)rh;
+ uint32_t intr_ctrl;
+
+ intr_ctrl = ena_hw_abs_read32(rxq->er_ena, rxq->er_cq_unmask_addr);
+ ENAHW_REG_INTR_MASK(intr_ctrl);
+ ena_hw_abs_write32(rxq->er_ena, rxq->er_cq_unmask_addr, intr_ctrl);
+
+ rxq->er_state &= ~ENA_RXQ_STATE_RUNNING;
+ rxq->er_state &= ~ENA_RXQ_STATE_READY;
+}
+
+int
+ena_ring_rx_start(mac_ring_driver_t rh, uint64_t gen_num)
+{
+ ena_rxq_t *rxq = (ena_rxq_t *)rh;
+ ena_t *ena = rxq->er_ena;
+ uint32_t intr_ctrl;
+
+ mutex_enter(&rxq->er_lock);
+ ena_refill_rx(rxq, rxq->er_sq_num_descs);
+ rxq->er_m_gen_num = gen_num;
+ rxq->er_intr_limit = ena->ena_rxq_intr_limit;
+ mutex_exit(&rxq->er_lock);
+
+ rxq->er_state |= ENA_RXQ_STATE_READY;
+
+ intr_ctrl = ena_hw_abs_read32(ena, rxq->er_cq_unmask_addr);
+ ENAHW_REG_INTR_UNMASK(intr_ctrl);
+ ena_hw_abs_write32(ena, rxq->er_cq_unmask_addr, intr_ctrl);
+ rxq->er_state |= ENA_RXQ_STATE_RUNNING;
+ return (0);
+}
+
+mblk_t *
+ena_ring_rx(ena_rxq_t *rxq, int poll_bytes)
+{
+ ena_t *ena = rxq->er_ena;
+ uint16_t head_mod = rxq->er_cq_head_idx & (rxq->er_cq_num_descs - 1);
+ uint64_t total_bytes = 0;
+ uint64_t num_frames = 0;
+ enahw_rx_cdesc_t *cdesc;
+ boolean_t polling = B_TRUE;
+ mblk_t *head = NULL;
+ mblk_t *tail = NULL;
+
+ ASSERT(MUTEX_HELD(&rxq->er_lock));
+ ENA_DMA_SYNC(rxq->er_cq_dma, DDI_DMA_SYNC_FORKERNEL);
+
+ if (poll_bytes == ENA_INTERRUPT_MODE) {
+ polling = B_FALSE;
+ }
+
+ cdesc = &rxq->er_cq_descs[head_mod];
+ VERIFY3P(cdesc, >=, rxq->er_cq_descs);
+ VERIFY3P(cdesc, <=, (rxq->er_cq_descs + rxq->er_cq_num_descs - 1));
+
+ while (ENAHW_RX_CDESC_PHASE(cdesc) == rxq->er_cq_phase) {
+ boolean_t first, last;
+ ena_rx_ctrl_block_t *rcb;
+ uint16_t req_id;
+ mblk_t *mp;
+ enahw_io_l3_proto_t l3proto;
+ enahw_io_l4_proto_t l4proto;
+ boolean_t l4csum_checked;
+ uint32_t hflags = 0;
+
+ VERIFY3U(head_mod, <, rxq->er_cq_num_descs);
+ /*
+ * Currently, all incoming frames fit in a single Rx
+ * buffer (erd_length > total frame size). In the
+ * future, if we decide to loan buffers which are
+ * smaller, we will need to modify this code to read
+ * one or more descriptors (based on frame size).
+ *
+ * For this reason we do not expect any frame to span
+ * multiple descriptors. Therefore, we drop any data
+		 * not delivered as a single descriptor, i.e., where
+		 * 'first' and 'last' are not both set.
+ */
+ first = ENAHW_RX_CDESC_FIRST(cdesc);
+ last = ENAHW_RX_CDESC_LAST(cdesc);
+
+ if (!first || !last) {
+ mutex_enter(&rxq->er_stat_lock);
+ rxq->er_stat.ers_multi_desc.value.ui64++;
+ mutex_exit(&rxq->er_stat_lock);
+ goto next_desc;
+ }
+
+ req_id = cdesc->erc_req_id;
+ VERIFY3U(req_id, <, rxq->er_cq_num_descs);
+ rcb = &rxq->er_rcbs[req_id];
+ rcb->ercb_offset = cdesc->erc_offset;
+ rcb->ercb_length = cdesc->erc_length;
+ ASSERT3U(rcb->ercb_length, <=, ena->ena_max_frame_total);
+ mp = allocb(rcb->ercb_length + ENA_RX_BUF_IPHDR_ALIGNMENT, 0);
+
+ /*
+ * If we can't allocate an mblk, things are looking
+ * grim. Forget about this frame and move on.
+ */
+ if (mp == NULL) {
+ mutex_enter(&rxq->er_stat_lock);
+ rxq->er_stat.ers_allocb_fail.value.ui64++;
+ mutex_exit(&rxq->er_stat_lock);
+ goto next_desc;
+ }
+
+ /*
+ * As we pull frames we need to link them together as
+ * one chain to be delivered up to mac.
+ */
+ if (head == NULL) {
+ head = mp;
+ } else {
+ tail->b_next = mp;
+ }
+
+ tail = mp;
+
+ /*
+ * We need to make sure the bytes are copied to the
+ * correct offset to achieve 4-byte IP header
+ * alignment.
+ *
+ * If we start using desballoc on the buffers, then we
+ * will need to make sure to apply this offset to the
+ * DMA buffers as well. Though it may be the case the
+ * device does this implicitly and that's what
+ * cdesc->erc_offset is for; we don't know because
+ * it's not documented.
+ */
+ mp->b_wptr += ENA_RX_BUF_IPHDR_ALIGNMENT;
+ mp->b_rptr += ENA_RX_BUF_IPHDR_ALIGNMENT;
+ bcopy(rcb->ercb_dma.edb_va + rcb->ercb_offset, mp->b_wptr,
+ rcb->ercb_length);
+ mp->b_wptr += rcb->ercb_length;
+ total_bytes += rcb->ercb_length;
+ VERIFY3P(mp->b_wptr, >, mp->b_rptr);
+ VERIFY3P(mp->b_wptr, <=, mp->b_datap->db_lim);
+
+ l3proto = ENAHW_RX_CDESC_L3_PROTO(cdesc);
+ l4proto = ENAHW_RX_CDESC_L4_PROTO(cdesc);
+
+ /*
+ * When it comes to bad TCP/IP checksums we do not
+ * discard the packet at this level. Instead, we let
+ * it percolate up for further processing and tracking
+ * by the upstream TCP/IP stack.
+ */
+ if (ena->ena_rx_l3_ipv4_csum &&
+ l3proto == ENAHW_IO_L3_PROTO_IPV4) {
+ boolean_t l3_csum_err =
+ ENAHW_RX_CDESC_L3_CSUM_ERR(cdesc);
+
+ if (l3_csum_err) {
+ mutex_enter(&rxq->er_stat_lock);
+ rxq->er_stat.ers_hck_ipv4_err.value.ui64++;
+ mutex_exit(&rxq->er_stat_lock);
+ } else {
+ hflags |= HCK_IPV4_HDRCKSUM_OK;
+ }
+ }
+
+ l4csum_checked = ENAHW_RX_CDESC_L4_CSUM_CHECKED(cdesc);
+
+ if (ena->ena_rx_l4_ipv4_csum && l4csum_checked &&
+ l4proto == ENAHW_IO_L4_PROTO_TCP) {
+ boolean_t l4_csum_err =
+ ENAHW_RX_CDESC_L4_CSUM_ERR(cdesc);
+
+ if (l4_csum_err) {
+ mutex_enter(&rxq->er_stat_lock);
+ rxq->er_stat.ers_hck_l4_err.value.ui64++;
+ mutex_exit(&rxq->er_stat_lock);
+ } else {
+ hflags |= HCK_FULLCKSUM_OK;
+ }
+ }
+
+ if (hflags != 0) {
+ mac_hcksum_set(mp, 0, 0, 0, 0, hflags);
+ }
+
+next_desc:
+ /*
+ * Technically, if we arrived here due to a failure,
+ * then we did not read a new frame. However, we count
+		 * it anyway as progress toward the interrupt work
+		 * limit. The failure stats will allow us to
+		 * differentiate good frames from bad.
+ */
+ num_frames++;
+ rxq->er_cq_head_idx++;
+ head_mod = rxq->er_cq_head_idx & (rxq->er_cq_num_descs - 1);
+
+ if (head_mod == 0) {
+ rxq->er_cq_phase = !rxq->er_cq_phase;
+ }
+
+ if (polling && (total_bytes > poll_bytes)) {
+ break;
+ } else if (!polling && (num_frames >= rxq->er_intr_limit)) {
+ mutex_enter(&rxq->er_stat_lock);
+ rxq->er_stat.ers_intr_limit.value.ui64++;
+ mutex_exit(&rxq->er_stat_lock);
+ break;
+ }
+
+ cdesc = &rxq->er_cq_descs[head_mod];
+ VERIFY3P(cdesc, >=, rxq->er_cq_descs);
+ VERIFY3P(cdesc, <=,
+ (rxq->er_cq_descs + rxq->er_cq_num_descs - 1));
+ }
+
+ mutex_enter(&rxq->er_stat_lock);
+ rxq->er_stat.ers_packets.value.ui64 += num_frames;
+ rxq->er_stat.ers_bytes.value.ui64 += total_bytes;
+ mutex_exit(&rxq->er_stat_lock);
+
+ DTRACE_PROBE4(rx__frames, mblk_t *, head, boolean_t, polling, uint64_t,
+ num_frames, uint64_t, total_bytes);
+ ena_refill_rx(rxq, num_frames);
+ return (head);
+}
+
+void
+ena_rx_intr_work(ena_rxq_t *rxq)
+{
+ mblk_t *mp;
+
+ mutex_enter(&rxq->er_lock);
+ mp = ena_ring_rx(rxq, ENA_INTERRUPT_MODE);
+ mutex_exit(&rxq->er_lock);
+
+ if (mp == NULL) {
+ return;
+ }
+
+ mac_rx_ring(rxq->er_ena->ena_mh, rxq->er_mrh, mp, rxq->er_m_gen_num);
+}
+
+mblk_t *
+ena_ring_rx_poll(void *rh, int poll_bytes)
+{
+ ena_rxq_t *rxq = rh;
+ mblk_t *mp;
+
+ ASSERT3S(poll_bytes, >, 0);
+
+ mutex_enter(&rxq->er_lock);
+ mp = ena_ring_rx(rxq, poll_bytes);
+ mutex_exit(&rxq->er_lock);
+
+ return (mp);
+}
diff --git a/usr/src/uts/common/io/ena/ena_stats.c b/usr/src/uts/common/io/ena/ena_stats.c
new file mode 100644
index 0000000000..c8ef7ae260
--- /dev/null
+++ b/usr/src/uts/common/io/ena/ena_stats.c
@@ -0,0 +1,475 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2021 Oxide Computer Company
+ */
+#include "ena.h"
+
+/*
+ * The ENA device provides the following hardware stats. It appears
+ * that all stats are available at both the device level and the
+ * queue level. However, Linux and FreeBSD don't implement queue
+ * scope. It's not clear how one would implement queue scope because
+ * there is nothing in the common code describing how to determine the
+ * queue index number. Both the SQ and CQ have device index values,
+ * but for a given logical queue they don't always match, and so it's
+ * not clear what value to use for querying the stats. Therefore,
+ * device-wide basic and extended stats come from the device, while
+ * queue/ring stats come from the driver.
+ *
+ * From empirical testing, these statistics appear to be cumulative.
+ * However, this guarantee is not explicitly documented anywhere in
+ * the common code that the author could find.
+ *
+ * BASIC (ENAHW_GET_STATS_TYPE_BASIC)
+ *
+ * - Rx packets/bytes
+ * - Rx drops
+ * - Tx packets/bytes
+ * - Tx drops
+ *
+ * EXTENDED (ENAHW_GET_STATS_TYPE_EXTENDED)
+ *
+ * There is no structure defined for these stats in the Linux
+ * driver. Based on the FreeBSD driver, it looks like extended
+ * stats are simply a buffer of C strings? Come back to this
+ * later.
+ *
+ * ENI (ENAHW_GET_STATS_TYPE_ENI)
+ *
+ * - Rx Bandwidth Allowance Exceeded
+ * - Tx Bandwidth Allowance Exceeded
+ * - PPS Allowance Exceeded (presumably for combined Rx/Tx)
+ * - Connection Tracking PPS Allowance Exceeded
+ *   - Link-local PPS Allowance Exceeded
+ */
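+
+/*
+ * The device-level stats above are exposed as named kstats (see the
+ * init functions below), so they can be read from userland with, for
+ * example:
+ *
+ *	kstat -m ena -n device_basic
+ *	kstat -m ena -n device_ext
+ */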
+
+static int
+ena_stat_device_basic_update(kstat_t *ksp, int rw)
+{
+ ena_t *ena = ksp->ks_private;
+ ena_basic_stat_t *ebs = ksp->ks_data;
+ enahw_resp_desc_t resp;
+ enahw_resp_basic_stats_t *stats = &resp.erd_resp.erd_basic_stats;
+ int ret = 0;
+
+ if (rw == KSTAT_WRITE) {
+ return (EACCES);
+ }
+
+ if ((ret = ena_admin_get_basic_stats(ena, &resp)) != 0) {
+ return (ret);
+ }
+
+ mutex_enter(&ena->ena_lock);
+
+ ebs->ebs_tx_bytes.value.ui64 =
+ ((uint64_t)stats->erbs_tx_bytes_high << 32) |
+ (uint64_t)stats->erbs_tx_bytes_low;
+ ebs->ebs_tx_pkts.value.ui64 =
+ ((uint64_t)stats->erbs_tx_pkts_high << 32) |
+ (uint64_t)stats->erbs_tx_pkts_low;
+ ebs->ebs_tx_drops.value.ui64 =
+ ((uint64_t)stats->erbs_tx_drops_high << 32) |
+ (uint64_t)stats->erbs_tx_drops_low;
+
+ ebs->ebs_rx_bytes.value.ui64 =
+ ((uint64_t)stats->erbs_rx_bytes_high << 32) |
+ (uint64_t)stats->erbs_rx_bytes_low;
+ ebs->ebs_rx_pkts.value.ui64 =
+ ((uint64_t)stats->erbs_rx_pkts_high << 32) |
+ (uint64_t)stats->erbs_rx_pkts_low;
+ ebs->ebs_rx_drops.value.ui64 =
+ ((uint64_t)stats->erbs_rx_drops_high << 32) |
+ (uint64_t)stats->erbs_rx_drops_low;
+
+ mutex_exit(&ena->ena_lock);
+
+ return (0);
+}
+
+void
+ena_stat_device_basic_cleanup(ena_t *ena)
+{
+ if (ena->ena_device_basic_kstat != NULL) {
+ kstat_delete(ena->ena_device_basic_kstat);
+ ena->ena_device_basic_kstat = NULL;
+ }
+}
+
+boolean_t
+ena_stat_device_basic_init(ena_t *ena)
+{
+ kstat_t *ksp = kstat_create(ENA_MODULE_NAME,
+ ddi_get_instance(ena->ena_dip), "device_basic", "net",
+ KSTAT_TYPE_NAMED,
+ sizeof (ena_basic_stat_t) / sizeof (kstat_named_t), 0);
+ ena_basic_stat_t *ebs = NULL;
+
+ if (ksp == NULL) {
+ ena_err(ena, "!failed to create device_basic kstats");
+ return (B_FALSE);
+ }
+
+ ena->ena_device_basic_kstat = ksp;
+ ebs = ksp->ks_data;
+ ksp->ks_update = ena_stat_device_basic_update;
+ ksp->ks_private = ena;
+
+ kstat_named_init(&ebs->ebs_tx_bytes, "tx_bytes", KSTAT_DATA_UINT64);
+ ebs->ebs_tx_bytes.value.ui64 = 0;
+ kstat_named_init(&ebs->ebs_tx_pkts, "tx_packets", KSTAT_DATA_UINT64);
+ ebs->ebs_tx_pkts.value.ui64 = 0;
+ kstat_named_init(&ebs->ebs_tx_drops, "tx_drops", KSTAT_DATA_UINT64);
+ ebs->ebs_tx_drops.value.ui64 = 0;
+
+ kstat_named_init(&ebs->ebs_rx_bytes, "rx_bytes", KSTAT_DATA_UINT64);
+ ebs->ebs_rx_bytes.value.ui64 = 0;
+ kstat_named_init(&ebs->ebs_rx_pkts, "rx_packets", KSTAT_DATA_UINT64);
+ ebs->ebs_rx_pkts.value.ui64 = 0;
+ kstat_named_init(&ebs->ebs_rx_drops, "rx_drops", KSTAT_DATA_UINT64);
+ ebs->ebs_rx_drops.value.ui64 = 0;
+
+ kstat_install(ena->ena_device_basic_kstat);
+ return (B_TRUE);
+}
+
+int
+ena_stat_device_extended_update(kstat_t *ksp, int rw)
+{
+ ena_t *ena = ksp->ks_private;
+ ena_extended_stat_t *ees = ksp->ks_data;
+ enahw_resp_desc_t resp;
+ enahw_resp_eni_stats_t *stats = &resp.erd_resp.erd_eni_stats;
+ int ret = 0;
+
+ if (rw == KSTAT_WRITE) {
+ return (EACCES);
+ }
+
+ if ((ret = ena_admin_get_eni_stats(ena, &resp)) != 0) {
+ return (ret);
+ }
+
+ mutex_enter(&ena->ena_lock);
+
+ ees->ees_bw_in_exceeded.value.ui64 = stats->eres_bw_in_exceeded;
+ ees->ees_bw_out_exceeded.value.ui64 = stats->eres_bw_out_exceeded;
+ ees->ees_pps_exceeded.value.ui64 = stats->eres_pps_exceeded;
+ ees->ees_conns_exceeded.value.ui64 = stats->eres_conns_exceeded;
+ ees->ees_linklocal_exceeded.value.ui64 = stats->eres_linklocal_exceeded;
+
+ mutex_exit(&ena->ena_lock);
+
+ return (0);
+}
+
+void
+ena_stat_device_extended_cleanup(ena_t *ena)
+{
+ if (ena->ena_device_extended_kstat != NULL) {
+ kstat_delete(ena->ena_device_extended_kstat);
+ ena->ena_device_extended_kstat = NULL;
+ }
+}
+
+boolean_t
+ena_stat_device_extended_init(ena_t *ena)
+{
+ kstat_t *ksp = kstat_create(ENA_MODULE_NAME,
+ ddi_get_instance(ena->ena_dip), "device_ext", "net",
+ KSTAT_TYPE_NAMED,
+ sizeof (ena_extended_stat_t) / sizeof (kstat_named_t), 0);
+ ena_extended_stat_t *ees;
+
+ if (ksp == NULL) {
+ ena_err(ena, "!failed to create device_ext kstats");
+ return (B_FALSE);
+ }
+
+ ena->ena_device_extended_kstat = ksp;
+ ees = ksp->ks_data;
+ ksp->ks_update = ena_stat_device_extended_update;
+ ksp->ks_private = ena;
+
+ kstat_named_init(&ees->ees_bw_in_exceeded, "bw_in_exceeded",
+ KSTAT_DATA_UINT64);
+ ees->ees_bw_in_exceeded.value.ui64 = 0;
+
+ kstat_named_init(&ees->ees_bw_out_exceeded, "bw_out_exceeded",
+ KSTAT_DATA_UINT64);
+ ees->ees_bw_out_exceeded.value.ui64 = 0;
+
+ kstat_named_init(&ees->ees_pps_exceeded, "pps_exceeded",
+ KSTAT_DATA_UINT64);
+ ees->ees_pps_exceeded.value.ui64 = 0;
+
+ kstat_named_init(&ees->ees_conns_exceeded, "conns_exceeded",
+ KSTAT_DATA_UINT64);
+ ees->ees_conns_exceeded.value.ui64 = 0;
+
+ kstat_named_init(&ees->ees_linklocal_exceeded, "linklocal_exceeded",
+ KSTAT_DATA_UINT64);
+ ees->ees_linklocal_exceeded.value.ui64 = 0;
+
+ kstat_install(ena->ena_device_extended_kstat);
+ return (B_TRUE);
+}
+
+void
+ena_stat_aenq_cleanup(ena_t *ena)
+{
+ if (ena->ena_aenq_kstat != NULL) {
+ kstat_delete(ena->ena_aenq_kstat);
+ ena->ena_aenq_kstat = NULL;
+ }
+}
+
+boolean_t
+ena_stat_aenq_init(ena_t *ena)
+{
+ kstat_t *ksp = kstat_create(ENA_MODULE_NAME,
+ ddi_get_instance(ena->ena_dip), "aenq", "net", KSTAT_TYPE_NAMED,
+ sizeof (ena_aenq_stat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ ena_aenq_stat_t *eas = &ena->ena_aenq_stat;
+
+ if (ksp == NULL) {
+ ena_err(ena, "!failed to create aenq kstats");
+ return (B_FALSE);
+ }
+
+ ena->ena_aenq_kstat = ksp;
+ ksp->ks_data = eas;
+
+ kstat_named_init(&eas->eaes_default, "default", KSTAT_DATA_UINT64);
+ eas->eaes_default.value.ui64 = 0;
+
+ kstat_named_init(&eas->eaes_link_change, "link_change",
+ KSTAT_DATA_UINT64);
+ eas->eaes_link_change.value.ui64 = 0;
+
+ kstat_install(ena->ena_aenq_kstat);
+ return (B_TRUE);
+}
+
+void
+ena_stat_txq_cleanup(ena_txq_t *txq)
+{
+ if (txq->et_kstat != NULL) {
+ kstat_delete(txq->et_kstat);
+ txq->et_kstat = NULL;
+ }
+}
+
+boolean_t
+ena_stat_txq_init(ena_txq_t *txq)
+{
+ ena_t *ena = txq->et_ena;
+ kstat_t *ksp;
+ char buf[128];
+ ena_txq_stat_t *ets = &txq->et_stat;
+
+ (void) snprintf(buf, sizeof (buf), "txq_%d", txq->et_txqs_idx);
+
+ ksp = kstat_create(ENA_MODULE_NAME, ddi_get_instance(ena->ena_dip), buf,
+ "net", KSTAT_TYPE_NAMED,
+ sizeof (ena_txq_stat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (ksp == NULL) {
+ ena_err(ena, "!failed to create %s kstats", buf);
+ return (B_FALSE);
+ }
+
+ txq->et_kstat = ksp;
+ ksp->ks_data = ets;
+
+ kstat_named_init(&ets->ets_hck_meoifail, "meoi_fail",
+ KSTAT_DATA_UINT64);
+ ets->ets_hck_meoifail.value.ui64 = 0;
+
+ kstat_named_init(&ets->ets_blocked, "blocked", KSTAT_DATA_UINT64);
+ ets->ets_blocked.value.ui64 = 0;
+
+ kstat_named_init(&ets->ets_unblocked, "unblocked", KSTAT_DATA_UINT64);
+ ets->ets_unblocked.value.ui64 = 0;
+
+ kstat_named_init(&ets->ets_recycled, "recycled", KSTAT_DATA_UINT64);
+ ets->ets_recycled.value.ui64 = 0;
+
+ kstat_named_init(&ets->ets_bytes, "bytes", KSTAT_DATA_UINT64);
+ ets->ets_bytes.value.ui64 = 0;
+
+ kstat_named_init(&ets->ets_packets, "packets", KSTAT_DATA_UINT64);
+ ets->ets_packets.value.ui64 = 0;
+
+ kstat_install(txq->et_kstat);
+ return (B_TRUE);
+}
+
+void
+ena_stat_rxq_cleanup(ena_rxq_t *rxq)
+{
+ if (rxq->er_kstat != NULL) {
+ kstat_delete(rxq->er_kstat);
+ rxq->er_kstat = NULL;
+ }
+}
+
+boolean_t
+ena_stat_rxq_init(ena_rxq_t *rxq)
+{
+ ena_t *ena = rxq->er_ena;
+ kstat_t *ksp;
+ char buf[128];
+ ena_rxq_stat_t *ers = &rxq->er_stat;
+
+ (void) snprintf(buf, sizeof (buf), "rxq_%d", rxq->er_rxqs_idx);
+
+ ksp = kstat_create(ENA_MODULE_NAME, ddi_get_instance(ena->ena_dip), buf,
+ "net", KSTAT_TYPE_NAMED,
+ sizeof (ena_rxq_stat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (ksp == NULL) {
+ ena_err(ena, "!failed to create %s kstats", buf);
+ return (B_FALSE);
+ }
+
+ rxq->er_kstat = ksp;
+ ksp->ks_data = ers;
+
+ kstat_named_init(&ers->ers_packets, "packets", KSTAT_DATA_UINT64);
+ ers->ers_packets.value.ui64 = 0;
+
+ kstat_named_init(&ers->ers_bytes, "bytes", KSTAT_DATA_UINT64);
+ ers->ers_bytes.value.ui64 = 0;
+
+ kstat_named_init(&ers->ers_multi_desc, "multi_desc", KSTAT_DATA_UINT64);
+ ers->ers_multi_desc.value.ui64 = 0;
+
+ kstat_named_init(&ers->ers_allocb_fail, "allocb_fail",
+ KSTAT_DATA_UINT64);
+ ers->ers_allocb_fail.value.ui64 = 0;
+
+ kstat_named_init(&ers->ers_intr_limit, "intr_limit", KSTAT_DATA_UINT64);
+ ers->ers_intr_limit.value.ui64 = 0;
+
+ kstat_named_init(&ers->ers_hck_ipv4_err, "hck_ipv4_err",
+ KSTAT_DATA_UINT64);
+ ers->ers_hck_ipv4_err.value.ui64 = 0;
+
+ kstat_named_init(&ers->ers_hck_l4_err, "hck_l4_err", KSTAT_DATA_UINT64);
+ ers->ers_hck_l4_err.value.ui64 = 0;
+
+ kstat_install(rxq->er_kstat);
+ return (B_TRUE);
+}
+
+int
+ena_ring_rx_stat(mac_ring_driver_t rh, uint_t stat, uint64_t *val)
+{
+ int ret = 0;
+ ena_rxq_t *rxq = (ena_rxq_t *)rh;
+
+ mutex_enter(&rxq->er_stat_lock);
+
+ switch (stat) {
+ case MAC_STAT_RBYTES:
+ *val = rxq->er_stat.ers_bytes.value.ui64;
+ break;
+ case MAC_STAT_IPACKETS:
+ *val = rxq->er_stat.ers_packets.value.ui64;
+ break;
+ default:
+ *val = 0;
+ ret = ENOTSUP;
+ }
+
+ mutex_exit(&rxq->er_stat_lock);
+ return (ret);
+}
+
+int
+ena_ring_tx_stat(mac_ring_driver_t rh, uint_t stat, uint64_t *val)
+{
+ int ret = 0;
+ ena_txq_t *txq = (ena_txq_t *)rh;
+
+ mutex_enter(&txq->et_stat_lock);
+
+ switch (stat) {
+ case MAC_STAT_OBYTES:
+ *val = txq->et_stat.ets_bytes.value.ui64;
+ break;
+ case MAC_STAT_OPACKETS:
+ *val = txq->et_stat.ets_packets.value.ui64;
+ break;
+ default:
+ *val = 0;
+ ret = ENOTSUP;
+ }
+
+ mutex_exit(&txq->et_stat_lock);
+ return (ret);
+}
+
+int
+ena_m_stat(void *arg, uint_t stat, uint64_t *val)
+{
+ ena_t *ena = arg;
+ ena_basic_stat_t *ebs = ena->ena_device_basic_kstat->ks_data;
+ int ret = 0;
+
+ ret = ena_stat_device_basic_update(ena->ena_device_basic_kstat,
+ KSTAT_READ);
+
+ if (ret != 0) {
+ return (ret);
+ }
+
+ mutex_enter(&ena->ena_lock);
+
+ /*
+ * The ENA device does not provide a lot of the stats that a
+ * traditional NIC device would.
+ */
+ switch (stat) {
+ case MAC_STAT_NORCVBUF:
+ *val = ebs->ebs_rx_drops.value.ui64;
+ break;
+
+ case MAC_STAT_RBYTES:
+ *val = ebs->ebs_rx_bytes.value.ui64;
+ break;
+
+ case MAC_STAT_IPACKETS:
+ *val = ebs->ebs_rx_pkts.value.ui64;
+ break;
+
+ case MAC_STAT_OBYTES:
+ *val = ebs->ebs_tx_bytes.value.ui64;
+ break;
+
+ case MAC_STAT_OPACKETS:
+ *val = ebs->ebs_tx_pkts.value.ui64;
+ break;
+
+ default:
+ ret = ENOTSUP;
+ break;
+ }
+
+ mutex_exit(&ena->ena_lock);
+ return (ret);
+}
diff --git a/usr/src/uts/common/io/ena/ena_tx.c b/usr/src/uts/common/io/ena/ena_tx.c
new file mode 100644
index 0000000000..30773496b0
--- /dev/null
+++ b/usr/src/uts/common/io/ena/ena_tx.c
@@ -0,0 +1,534 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2021 Oxide Computer Company
+ */
+#include "ena.h"
+
+void
+ena_free_tx_dma(ena_txq_t *txq)
+{
+ if (txq->et_tcbs != NULL) {
+ for (uint_t i = 0; i < txq->et_sq_num_descs; i++) {
+ ena_tx_control_block_t *tcb = &txq->et_tcbs[i];
+ ena_dma_free(&tcb->etcb_dma);
+ }
+
+ kmem_free(txq->et_tcbs,
+ sizeof (*txq->et_tcbs) * txq->et_sq_num_descs);
+
+ txq->et_tcbs = NULL;
+
+ }
+
+ ena_dma_free(&txq->et_cq_dma);
+ txq->et_cq_descs = NULL;
+
+ ena_dma_free(&txq->et_sq_dma);
+ txq->et_sq_descs = NULL;
+
+ txq->et_state &= ~ENA_TXQ_STATE_HOST_ALLOC;
+}
+
+static int
+ena_alloc_tx_dma(ena_txq_t *txq)
+{
+ ena_t *ena = txq->et_ena;
+ size_t cq_descs_sz;
+ size_t sq_descs_sz;
+ int err = 0;
+ ena_dma_conf_t conf;
+
+ ASSERT0(txq->et_state & ENA_TXQ_STATE_HOST_ALLOC);
+ ASSERT3P(ena, !=, NULL);
+
+ cq_descs_sz = txq->et_cq_num_descs * sizeof (*txq->et_cq_descs);
+ sq_descs_sz = txq->et_sq_num_descs * sizeof (*txq->et_sq_descs);
+
+ conf = (ena_dma_conf_t) {
+ .edc_size = sq_descs_sz,
+ .edc_align = ENAHW_IO_SQ_DESC_BUF_ALIGNMENT,
+ .edc_sgl = 1,
+ .edc_endian = DDI_NEVERSWAP_ACC,
+ .edc_stream = B_FALSE,
+ };
+
+ if (!ena_dma_alloc(ena, &txq->et_sq_dma, &conf, sq_descs_sz)) {
+ return (ENOMEM);
+ }
+
+ bzero(txq->et_sq_dma.edb_va, sq_descs_sz);
+ txq->et_sq_descs = (void *)txq->et_sq_dma.edb_va;
+ txq->et_tcbs = kmem_zalloc(sizeof (*txq->et_tcbs) *
+ txq->et_sq_num_descs, KM_SLEEP);
+
+ for (uint_t i = 0; i < txq->et_sq_num_descs; i++) {
+ ena_tx_control_block_t *tcb = &txq->et_tcbs[i];
+ ena_dma_conf_t buf_conf = {
+ .edc_size = ena->ena_tx_buf_sz,
+ .edc_align = 1,
+ .edc_sgl = ena->ena_tx_sgl_max_sz,
+ .edc_endian = DDI_NEVERSWAP_ACC,
+ .edc_stream = B_TRUE,
+ };
+
+ if (!ena_dma_alloc(ena, &tcb->etcb_dma, &buf_conf,
+ ena->ena_tx_buf_sz)) {
+ err = ENOMEM;
+ goto error;
+ }
+ }
+
+ conf = (ena_dma_conf_t) {
+ .edc_size = cq_descs_sz,
+ .edc_align = ENAHW_IO_CQ_DESC_BUF_ALIGNMENT,
+ .edc_sgl = 1,
+ .edc_endian = DDI_NEVERSWAP_ACC,
+ .edc_stream = B_FALSE,
+ };
+
+ if (!ena_dma_alloc(ena, &txq->et_cq_dma, &conf, cq_descs_sz)) {
+ err = ENOMEM;
+ goto error;
+ }
+
+ bzero(txq->et_cq_dma.edb_va, cq_descs_sz);
+ txq->et_cq_descs = (void *)txq->et_cq_dma.edb_va;
+ txq->et_state |= ENA_TXQ_STATE_HOST_ALLOC;
+ return (0);
+
+error:
+ ena_free_tx_dma(txq);
+ return (err);
+}
+
+boolean_t
+ena_alloc_txq(ena_txq_t *txq)
+{
+ int ret = 0;
+ ena_t *ena = txq->et_ena;
+ uint16_t cq_hw_idx, sq_hw_idx;
+ uint32_t *cq_unmask_addr, *cq_headdb, *cq_numanode;
+ uint32_t *sq_db_addr;
+
+ ASSERT3U(txq->et_cq_num_descs, >, 0);
+
+ /*
+ * First, allocate the Tx data buffers.
+ */
+ if ((ret = ena_alloc_tx_dma(txq)) != 0) {
+ ena_err(ena, "failed to allocate Tx queue %u data buffers: %d",
+ txq->et_txqs_idx, ret);
+ return (B_FALSE);
+ }
+
+ ASSERT(txq->et_state & ENA_TXQ_STATE_HOST_ALLOC);
+
+ /*
+ * Second, create the Completion Queue.
+ */
+ ret = ena_create_cq(ena, txq->et_cq_num_descs,
+ txq->et_cq_dma.edb_cookie->dmac_laddress, B_TRUE,
+ txq->et_intr_vector, &cq_hw_idx, &cq_unmask_addr, &cq_headdb,
+ &cq_numanode);
+
+ if (ret != 0) {
+ ena_err(ena, "failed to create Tx CQ %u: %d", txq->et_txqs_idx,
+ ret);
+ return (B_FALSE);
+ }
+
+ txq->et_cq_hw_idx = cq_hw_idx;
+ txq->et_cq_phase = 1;
+ txq->et_cq_unmask_addr = cq_unmask_addr;
+ txq->et_cq_head_db_addr = cq_headdb;
+ txq->et_cq_numa_addr = cq_numanode;
+ txq->et_state |= ENA_TXQ_STATE_CQ_CREATED;
+
+ /*
+ * Third, create the Submission Queue to match with the above
+ * CQ. At this time we force the SQ and CQ to have the same
+ * number of descriptors as we only use a 1:1 completion
+ * policy. However, in the future, we could loosen this and
+ * use an on-demand completion policy and the two could have a
+ * different number of descriptors.
+ */
+ ASSERT3U(txq->et_sq_num_descs, ==, txq->et_cq_num_descs);
+
+ ret = ena_create_sq(ena, txq->et_sq_num_descs,
+ txq->et_sq_dma.edb_cookie->dmac_laddress, B_TRUE, cq_hw_idx,
+ &sq_hw_idx, &sq_db_addr);
+
+ if (ret != 0) {
+ ena_err(ena, "failed to create Tx SQ %u: %d", txq->et_txqs_idx,
+ ret);
+ return (B_FALSE);
+ }
+
+ txq->et_sq_hw_idx = sq_hw_idx;
+ txq->et_sq_db_addr = sq_db_addr;
+ /* The phase must always start on 1. */
+ txq->et_sq_phase = 1;
+ txq->et_sq_avail_descs = txq->et_sq_num_descs;
+ txq->et_blocked = B_FALSE;
+ txq->et_state |= ENA_TXQ_STATE_SQ_CREATED;
+
+ return (B_TRUE);
+}
+
+void
+ena_cleanup_txq(ena_txq_t *txq)
+{
+ int ret = 0;
+ ena_t *ena = txq->et_ena;
+
+ if ((txq->et_state & ENA_TXQ_STATE_SQ_CREATED) != 0) {
+ ret = ena_destroy_sq(ena, txq->et_sq_hw_idx, B_TRUE);
+
+ if (ret != 0) {
+ ena_err(ena, "failed to destroy Tx SQ %u: %d",
+ txq->et_txqs_idx, ret);
+ }
+
+ txq->et_sq_hw_idx = 0;
+ txq->et_sq_db_addr = NULL;
+ txq->et_sq_tail_idx = 0;
+ txq->et_sq_phase = 0;
+ txq->et_state &= ~ENA_TXQ_STATE_SQ_CREATED;
+ }
+
+ if ((txq->et_state & ENA_TXQ_STATE_CQ_CREATED) != 0) {
+ ret = ena_destroy_cq(ena, txq->et_cq_hw_idx);
+
+ if (ret != 0) {
+ ena_err(ena, "failed to destroy Tx CQ %u: %d",
+ txq->et_txqs_idx, ret);
+ }
+
+ txq->et_cq_hw_idx = 0;
+ txq->et_cq_head_idx = 0;
+ txq->et_cq_phase = 0;
+ txq->et_cq_head_db_addr = NULL;
+ txq->et_cq_unmask_addr = NULL;
+ txq->et_cq_numa_addr = NULL;
+ txq->et_state &= ~ENA_TXQ_STATE_CQ_CREATED;
+ }
+
+ ena_free_tx_dma(txq);
+ VERIFY3S(txq->et_state, ==, ENA_TXQ_STATE_NONE);
+}
+
+void
+ena_ring_tx_stop(mac_ring_driver_t rh)
+{
+ ena_txq_t *txq = (ena_txq_t *)rh;
+ uint32_t intr_ctrl;
+
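+	/*
+	 * Mask the queue's completion interrupt so that no further Tx
+	 * completion processing is triggered while the ring is stopped.
+	 */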
+ intr_ctrl = ena_hw_abs_read32(txq->et_ena, txq->et_cq_unmask_addr);
+	ENAHW_REG_INTR_MASK(intr_ctrl);
+ ena_hw_abs_write32(txq->et_ena, txq->et_cq_unmask_addr, intr_ctrl);
+
+ txq->et_state &= ~ENA_TXQ_STATE_RUNNING;
+ txq->et_state &= ~ENA_TXQ_STATE_READY;
+}
+
+int
+ena_ring_tx_start(mac_ring_driver_t rh, uint64_t gen_num)
+{
+ ena_txq_t *txq = (ena_txq_t *)rh;
+ ena_t *ena = txq->et_ena;
+ uint32_t intr_ctrl;
+
+ mutex_enter(&txq->et_lock);
+ txq->et_m_gen_num = gen_num;
+ mutex_exit(&txq->et_lock);
+
+ txq->et_state |= ENA_TXQ_STATE_READY;
+
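+	/*
+	 * Unmask the queue's completion interrupt so the device can
+	 * notify us as Tx descriptors complete.
+	 */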
+ intr_ctrl = ena_hw_abs_read32(ena, txq->et_cq_unmask_addr);
+ ENAHW_REG_INTR_UNMASK(intr_ctrl);
+ ena_hw_abs_write32(ena, txq->et_cq_unmask_addr, intr_ctrl);
+ txq->et_state |= ENA_TXQ_STATE_RUNNING;
+ return (0);
+}
+
+static void
+ena_tx_copy_fragment(ena_tx_control_block_t *tcb, const mblk_t *mp,
+ const size_t off, const size_t len)
+{
+ const void *soff = mp->b_rptr + off;
+ void *doff =
+ (void *)(tcb->etcb_dma.edb_va + tcb->etcb_dma.edb_used_len);
+
+ VERIFY3U(len, >, 0);
+ VERIFY3P(soff, >=, mp->b_rptr);
+ VERIFY3P(soff, <=, mp->b_wptr);
+ VERIFY3U(len, <=, MBLKL(mp));
+ VERIFY3U((uintptr_t)soff + len, <=, (uintptr_t)mp->b_wptr);
+ VERIFY3U(tcb->etcb_dma.edb_used_len + len, <, tcb->etcb_dma.edb_len);
+
+ bcopy(soff, doff, len);
+ tcb->etcb_type = ENA_TCB_COPY;
+ tcb->etcb_dma.edb_used_len += len;
+}
+
+ena_tx_control_block_t *
+ena_pull_tcb(const ena_txq_t *txq, mblk_t *mp)
+{
+ mblk_t *nmp = mp;
+ ena_t *ena = txq->et_ena;
+ ena_tx_control_block_t *tcb = NULL;
+ const uint16_t tail_mod =
+ txq->et_sq_tail_idx & (txq->et_sq_num_descs - 1);
+
+ ASSERT(MUTEX_HELD(&txq->et_lock));
+ VERIFY3U(msgsize(mp), <, ena->ena_tx_buf_sz);
+
+ while (nmp != NULL) {
+ const size_t nmp_len = MBLKL(nmp);
+
+ if (nmp_len == 0) {
+ nmp = nmp->b_cont;
+ continue;
+ }
+
+ /* For now TCB is bound to SQ desc. */
+ if (tcb == NULL) {
+ tcb = &txq->et_tcbs[tail_mod];
+ }
+
+ ena_tx_copy_fragment(tcb, nmp, 0, nmp_len);
+ nmp = nmp->b_cont;
+ }
+
+ ENA_DMA_SYNC(tcb->etcb_dma, DDI_DMA_SYNC_FORDEV);
+ VERIFY3P(nmp, ==, NULL);
+ VERIFY3P(tcb, !=, NULL);
+ return (tcb);
+}
+
+static void
+ena_fill_tx_data_desc(ena_txq_t *txq, ena_tx_control_block_t *tcb,
+ uint16_t tail, uint8_t phase, enahw_tx_data_desc_t *desc,
+ mac_ether_offload_info_t *meo, size_t mlen)
+{
+ VERIFY3U(mlen, <=, ENAHW_TX_DESC_LENGTH_MASK);
+
+#ifdef DEBUG
+	/*
+	 * If there is no header for a given layer its length will be
+	 * zero, so we elide the meoi_flags check here.
+	 */
+ size_t hdr_len = meo->meoi_l2hlen + meo->meoi_l3hlen + meo->meoi_l4hlen;
+ ASSERT3U(hdr_len, <=, txq->et_ena->ena_tx_max_hdr_len);
+#endif
+
+ bzero(desc, sizeof (*desc));
+ ENAHW_TX_DESC_FIRST_ON(desc);
+ ENAHW_TX_DESC_LENGTH(desc, mlen);
+ ENAHW_TX_DESC_REQID_HI(desc, tail);
+ ENAHW_TX_DESC_REQID_LO(desc, tail);
+ ENAHW_TX_DESC_PHASE(desc, phase);
+ ENAHW_TX_DESC_DF_ON(desc);
+ ENAHW_TX_DESC_LAST_ON(desc);
+ ENAHW_TX_DESC_COMP_REQ_ON(desc);
+ ENAHW_TX_DESC_META_DESC_OFF(desc);
+ ENAHW_TX_DESC_ADDR_LO(desc, tcb->etcb_dma.edb_cookie->dmac_laddress);
+ ENAHW_TX_DESC_ADDR_HI(desc, tcb->etcb_dma.edb_cookie->dmac_laddress);
+ /*
+ * NOTE: Please see the block comment above
+ * etd_buff_addr_hi_hdr_sz to see why this is set to 0.
+ */
+ ENAHW_TX_DESC_HEADER_LENGTH(desc, 0);
+ ENAHW_TX_DESC_TSO_OFF(desc);
+ ENAHW_TX_DESC_L3_CSUM_OFF(desc);
+ ENAHW_TX_DESC_L4_CSUM_OFF(desc);
+ /*
+ * Enabling this bit tells the device NOT to calculate the
+ * pseudo header checksum.
+ */
+ ENAHW_TX_DESC_L4_CSUM_PARTIAL_ON(desc);
+}
+
+static void
+ena_submit_tx(ena_txq_t *txq, uint16_t desc_idx)
+{
+ ena_hw_abs_write32(txq->et_ena, txq->et_sq_db_addr, desc_idx);
+}
+
+/*
+ * For now we do the simplest thing possible. All Tx uses bcopy to
+ * pre-allocated buffers, no checksum, no TSO, etc.
+ */
+mblk_t *
+ena_ring_tx(void *arg, mblk_t *mp)
+{
+ ena_txq_t *txq = arg;
+ ena_t *ena = txq->et_ena;
+ mac_ether_offload_info_t meo;
+ enahw_tx_data_desc_t *desc;
+ ena_tx_control_block_t *tcb;
+ const uint16_t tail_mod =
+ txq->et_sq_tail_idx & (txq->et_sq_num_descs - 1);
+
+ VERIFY3P(mp->b_next, ==, NULL);
+ VERIFY(txq->et_blocked == B_FALSE);
+
+ /*
+ * The ena_state value is written by atomic operations. The
+ * et_state value is currently Write Once, but if that changes
+ * it should also be written with atomics.
+ */
+ if (!(ena->ena_state & ENA_STATE_RUNNING) ||
+ !(txq->et_state & ENA_TXQ_STATE_RUNNING)) {
+ freemsg(mp);
+ return (NULL);
+ }
+
+ if (mac_ether_offload_info(mp, &meo) != 0) {
+ freemsg(mp);
+ mutex_enter(&txq->et_stat_lock);
+ txq->et_stat.ets_hck_meoifail.value.ui64++;
+ mutex_exit(&txq->et_stat_lock);
+ return (NULL);
+ }
+
+ mutex_enter(&txq->et_lock);
+
+ /*
+ * For the moment there is a 1:1 mapping between Tx descs and
+ * Tx contexts. Currently Tx is copy only, and each context
+ * buffer is guaranteed to be as large as MTU + frame header,
+ * see ena_update_buf_sizes().
+ */
+ if (txq->et_sq_avail_descs == 0) {
+ txq->et_blocked = B_TRUE;
+ mutex_enter(&txq->et_stat_lock);
+ txq->et_stat.ets_blocked.value.ui64++;
+ mutex_exit(&txq->et_stat_lock);
+ mutex_exit(&txq->et_lock);
+ return (mp);
+ }
+
+ ASSERT3U(meo.meoi_len, <=, ena->ena_max_frame_total);
+ tcb = ena_pull_tcb(txq, mp);
+ ASSERT3P(tcb, !=, NULL);
+ tcb->etcb_mp = mp;
+ txq->et_sq_avail_descs--;
+
+ /* Fill in the Tx descriptor. */
+ desc = &(txq->et_sq_descs[tail_mod].etd_data);
+ ena_fill_tx_data_desc(txq, tcb, tail_mod, txq->et_sq_phase, desc, &meo,
+ meo.meoi_len);
+ DTRACE_PROBE3(tx__submit, ena_tx_control_block_t *, tcb, uint16_t,
+ tail_mod, enahw_tx_data_desc_t *, desc);
+
+	/*
+	 * Remember, we submit the raw tail value to the device; the
+	 * hardware performs its own modulo (as we did above to get
+	 * tail_mod).
+	 */
+ txq->et_sq_tail_idx++;
+ ena_submit_tx(txq, txq->et_sq_tail_idx);
+
+ mutex_enter(&txq->et_stat_lock);
+ txq->et_stat.ets_packets.value.ui64++;
+ txq->et_stat.ets_bytes.value.ui64 += meo.meoi_len;
+ mutex_exit(&txq->et_stat_lock);
+
+ if ((txq->et_sq_tail_idx & (txq->et_sq_num_descs - 1)) == 0) {
+ txq->et_sq_phase = !txq->et_sq_phase;
+ }
+
+ mutex_exit(&txq->et_lock);
+ return (NULL);
+}
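+
+/*
+ * Illustrative sketch (not part of the driver): et_sq_tail_idx above
+ * and et_cq_head_idx in ena_tx_intr_work() are free-running counters
+ * over power-of-two rings.  The low bits select a slot and the phase
+ * bit flips on every wrap, which is how fresh entries are told apart
+ * from stale ones.  A hypothetical helper capturing that convention
+ * might look like:
+ *
+ *	static inline void
+ *	ena_ring_advance(uint16_t *idx, uint16_t num_descs,
+ *	    uint16_t *phase)
+ *	{
+ *		(*idx)++;
+ *		if ((*idx & (num_descs - 1)) == 0)
+ *			*phase = !*phase;
+ *	}
+ */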
+
+void
+ena_tx_intr_work(ena_txq_t *txq)
+{
+ uint16_t head_mod;
+ enahw_tx_cdesc_t *cdesc;
+ ena_tx_control_block_t *tcb;
+ uint16_t req_id;
+ uint64_t recycled = 0;
+ boolean_t unblocked = B_FALSE;
+
+ mutex_enter(&txq->et_lock);
+ head_mod = txq->et_cq_head_idx & (txq->et_cq_num_descs - 1);
+ ENA_DMA_SYNC(txq->et_cq_dma, DDI_DMA_SYNC_FORKERNEL);
+ cdesc = &txq->et_cq_descs[head_mod];
+
+ /* Recycle any completed descriptors. */
+ while (ENAHW_TX_CDESC_GET_PHASE(cdesc) == txq->et_cq_phase) {
+ mblk_t *mp;
+
+ /* Get the corresponding TCB. */
+ req_id = cdesc->etc_req_id;
+ /*
+ * It would be nice to make this a device reset
+ * instead.
+ */
+		VERIFY3U(req_id, <, txq->et_sq_num_descs);
+ tcb = &txq->et_tcbs[req_id];
+ DTRACE_PROBE2(tx__complete, uint16_t, req_id,
+ ena_tx_control_block_t *, tcb);
+
+ /* Free the associated mblk. */
+ tcb->etcb_dma.edb_used_len = 0;
+ mp = tcb->etcb_mp;
+ /* Make this a device reset instead. */
+ VERIFY3P(mp, !=, NULL);
+ freemsg(mp);
+ tcb->etcb_mp = NULL;
+
+ /* Add this descriptor back to the free list. */
+ txq->et_sq_avail_descs++;
+ txq->et_cq_head_idx++;
+
+ /* Check for phase rollover. */
+ head_mod = txq->et_cq_head_idx & (txq->et_cq_num_descs - 1);
+
+ if (head_mod == 0) {
+ txq->et_cq_phase = !txq->et_cq_phase;
+ }
+
+ if (txq->et_blocked) {
+ txq->et_blocked = B_FALSE;
+ unblocked = B_TRUE;
+ mac_tx_ring_update(txq->et_ena->ena_mh, txq->et_mrh);
+ }
+
+ recycled++;
+ cdesc = &txq->et_cq_descs[head_mod];
+ }
+
+ /*
+ * If the device provided a head doorbell register, then we
+ * need to update it to let the device know we are done
+ * reading these CQ entries.
+ */
+ if (txq->et_cq_head_db_addr != NULL) {
+ ena_hw_abs_write32(txq->et_ena, txq->et_cq_head_db_addr,
+ head_mod);
+ }
+
+ mutex_exit(&txq->et_lock);
+
+ /* Update stats. */
+ mutex_enter(&txq->et_stat_lock);
+ txq->et_stat.ets_recycled.value.ui64 += recycled;
+ if (unblocked) {
+ txq->et_stat.ets_unblocked.value.ui64++;
+ }
+ mutex_exit(&txq->et_stat_lock);
+}
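+
+/*
+ * Illustrative note (not part of the driver): the value stamped into
+ * each data descriptor via ENAHW_TX_DESC_REQID_HI/LO() is the SQ slot
+ * (tail_mod) the packet occupied, and the same slot indexes et_tcbs[]
+ * in ena_pull_tcb().  The device echoes that value back in etc_req_id,
+ * which is why the completion loop above can recover the matching TCB
+ * with a plain array lookup into et_tcbs[req_id].
+ */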
diff --git a/usr/src/uts/common/mapfiles/ddi.mapfile b/usr/src/uts/common/mapfiles/ddi.mapfile
index 798367c7e3..a9f4f2d730 100644
--- a/usr/src/uts/common/mapfiles/ddi.mapfile
+++ b/usr/src/uts/common/mapfiles/ddi.mapfile
@@ -12,6 +12,7 @@
#
# Copyright 2020 Joyent, Inc.
# Copyright 2020 RackTop Systems, Inc.
+# Copyright 2021 Oxide Computer Company
#
#
@@ -78,6 +79,7 @@ SYMBOL_SCOPE {
ddi_dma_addr_bind_handle { FLAGS = EXTERN };
ddi_dma_alloc_handle { FLAGS = EXTERN };
ddi_dma_cookie_iter { FLAGS = EXTERN };
+ ddi_dma_cookie_one { FLAGS = EXTERN };
ddi_dma_free_handle { FLAGS = EXTERN };
ddi_dma_mem_alloc { FLAGS = EXTERN };
ddi_dma_mem_free { FLAGS = EXTERN };
@@ -153,6 +155,7 @@ SYMBOL_SCOPE {
dev_err { FLAGS = EXTERN };
drv_usectohz { FLAGS = EXTERN };
drv_usecwait { FLAGS = EXTERN };
+ ffs { FLAGS = EXTERN };
fm_ena_generate { FLAGS = EXTERN };
freeb { FLAGS = EXTERN };
freemsg { FLAGS = EXTERN };
@@ -168,6 +171,7 @@ SYMBOL_SCOPE {
list_create { FLAGS = EXTERN };
list_destroy { FLAGS = EXTERN };
list_head { FLAGS = EXTERN };
+ list_insert_head { FLAGS = EXTERN };
list_insert_tail { FLAGS = EXTERN };
list_next { FLAGS = EXTERN };
list_remove { FLAGS = EXTERN };
@@ -219,9 +223,12 @@ SYMBOL_SCOPE {
strcat { FLAGS = EXTERN };
strcmp { FLAGS = EXTERN };
strcpy { FLAGS = EXTERN };
+ strlcpy { FLAGS = EXTERN };
strlen { FLAGS = EXTERN };
timeout { FLAGS = EXTERN };
untimeout { FLAGS = EXTERN };
+ vcmn_err { FLAGS = EXTERN };
+ vdev_err { FLAGS = EXTERN };
vsnprintf { FLAGS = EXTERN };
vsprintf { FLAGS = EXTERN };
};
diff --git a/usr/src/uts/common/mapfiles/kernel.mapfile b/usr/src/uts/common/mapfiles/kernel.mapfile
index 21a691dca2..6fcc1fa371 100644
--- a/usr/src/uts/common/mapfiles/kernel.mapfile
+++ b/usr/src/uts/common/mapfiles/kernel.mapfile
@@ -11,6 +11,7 @@
#
# Copyright 2016 Joyent, Inc.
+# Copyright 2021 Oxide Computer Company
#
#
@@ -40,4 +41,6 @@ SYMBOL_SCOPE {
servicing_interrupt { FLAGS = EXTERN };
fnvlist_alloc { FLAGS = EXTERN };
fnvlist_add_string { FLAGS = EXTERN };
+ ncpus_online { FLAGS = EXTERN };
+ utsname { FLAGS = EXTERN };
};
diff --git a/usr/src/uts/common/sys/ethernet.h b/usr/src/uts/common/sys/ethernet.h
index 5b9de2f2bf..4febb8915f 100644
--- a/usr/src/uts/common/sys/ethernet.h
+++ b/usr/src/uts/common/sys/ethernet.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright 2014 Garrett D'Amore <garrett@damore.org>
+ * Copyright 2021 Oxide Computer Company
*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
@@ -140,6 +141,8 @@ struct ether_vlan_extinfo {
#endif
#ifdef _KERNEL
+#define ETHER_IS_MULTICAST(addr) (((addr)[0] & 0x01) != 0)
+
extern int localetheraddr(struct ether_addr *, struct ether_addr *);
extern char *ether_sprintf(struct ether_addr *);
extern int ether_aton(char *, uchar_t *);
diff --git a/usr/src/uts/intel/Makefile.intel b/usr/src/uts/intel/Makefile.intel
index cd5eabf7c5..4d1d2664c3 100644
--- a/usr/src/uts/intel/Makefile.intel
+++ b/usr/src/uts/intel/Makefile.intel
@@ -27,6 +27,7 @@
# Copyright 2018 Nexenta Systems, Inc.
# Copyright 2019 RackTop Systems
# Copyright 2019 Peter Tribble.
+# Copyright 2021 Oxide Computer Company
#
#
@@ -385,6 +386,7 @@ DRV_KMODS += dmfe
DRV_KMODS += e1000g
DRV_KMODS += efe
DRV_KMODS += elxl
+DRV_KMODS += ena
DRV_KMODS += hme
DRV_KMODS += mxfe
DRV_KMODS += nge
diff --git a/usr/src/uts/intel/ena/Makefile b/usr/src/uts/intel/ena/Makefile
new file mode 100644
index 0000000000..bef9878cc0
--- /dev/null
+++ b/usr/src/uts/intel/ena/Makefile
@@ -0,0 +1,47 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2021 Oxide Computer Company
+#
+
+UTSBASE = ../..
+
+MODULE = ena
+OBJECTS = $(ENA_OBJS:%=$(OBJS_DIR)/%)
+ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE)
+CONF_SRCDIR = $(UTSBASE)/common/io/ena
+
+include $(UTSBASE)/intel/Makefile.intel
+
+CPPFLAGS += -I$(UTSBASE)/common/io/ena
+
+ALL_TARGET = $(BINARY) $(CONFMOD)
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)
+
+LDFLAGS += -dy -N misc/mac
+
+MAPFILES += ddi mac kernel
+
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+install: $(INSTALL_DEPS)
+
+include $(UTSBASE)/Makefile.mapfile
+include $(UTSBASE)/intel/Makefile.targ