diff options
| author | Dan McDonald <danmcd@mnx.io> | 2022-08-05 01:09:21 -0400 |
|---|---|---|
| committer | Dan McDonald <danmcd@mnx.io> | 2022-08-05 01:09:21 -0400 |
| commit | d491608f4e80ef223b7567c7914acbf733832708 (patch) | |
| tree | db3702c2a2ba0bfe6843d5fcaa0cc19f647e65d6 /usr/src | |
| parent | 7214807eae5f4cd01de736a5d1c72ef382fcb622 (diff) | |
| parent | 7105039931b43c4efeda411cd0527843723c90bd (diff) | |
| download | illumos-joyent-d491608f4e80ef223b7567c7914acbf733832708.tar.gz | |
[illumos-gate merge]
commit 7105039931b43c4efeda411cd0527843723c90bd
14887 audio_legacy_* prototypes can be removed
commit b75a8b718b0b6c50c43b47b15603947383771a04
14873 Retire ddi_getiminor()
commit 5b2c4190a831f52d91a5b92473ffb5a06e84511d
14812 pcie: properly set max packet size and tagging
commit bdb5139270356ff627abb9467f2b4fc8db3fb81d
14827 overlay_m_stop() blows verify in race with overlay_target_inject()
14853 overlay_setprop_vnetid bungles OVERLAY_F_MDDROP
commit 2c76d75129011c98e79463bb84917b828f922a11
13700 pollhead_delete trips over bad pointer
Conflicts:
manifest
usr/src/uts/common/sys/overlay_impl.h
usr/src/uts/common/syscall/poll.c
Diffstat (limited to 'usr/src')
26 files changed, 1043 insertions, 530 deletions
diff --git a/usr/src/man/man9f/Intro.9f b/usr/src/man/man9f/Intro.9f index 3e0627b878..bc90a70681 100644 --- a/usr/src/man/man9f/Intro.9f +++ b/usr/src/man/man9f/Intro.9f @@ -1,10 +1,10 @@ '\" te -.\" Copyright 2014 Garrett D'Amore <garrett@damore.org> .\" Copyright (c) 2005, Sun Microsystems, Inc., All Rights Reserved .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License. .\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH INTRO 9F "Feb 06, 2012" +.\" Copyright 2022 Garrett D'Amore +.TH INTRO 9F "July 30, 2022" .SH NAME Intro, intro \- introduction to DDI/DKI functions .SH DESCRIPTION @@ -399,7 +399,6 @@ _ \fBddi_get_driver_private\fR illumos DDI \fBddi_get_eventcookie\fR illumos DDI \fBddi_get_iblock_cookie\fR illumos DDI -\fBddi_get_iminor\fR illumos DDI \fBddi_get_instance\fR illumos DDI \fBddi_get_kt_did\fR illumos DDI \fBddi_get_lbolt\fR illumos DDI diff --git a/usr/src/man/man9f/Makefile b/usr/src/man/man9f/Makefile index 9bbb662981..f2a8ba5f80 100644 --- a/usr/src/man/man9f/Makefile +++ b/usr/src/man/man9f/Makefile @@ -11,12 +11,12 @@ # # Copyright 2017, Richard Lowe -# Copyright 2014 Garrett D'Amore <garrett@damore> # Copyright 2019 Joyent, Inc. # Copyright 2020-2021 Tintri by DDN, Inc. All rights reserved. # Copyright 2016 Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org> # Copyright 2022 RackTop Systems, Inc. 
# Copyright 2022 Oxide Computer Company +# Copyright 2022 Garrett D'Amore # include $(SRC)/Makefile.master @@ -191,7 +191,6 @@ MANFILES= ASSERT.9f \ ddi_get_parent.9f \ ddi_get_pid.9f \ ddi_get_time.9f \ - ddi_getiminor.9f \ ddi_in_panic.9f \ ddi_intr_add_handler.9f \ ddi_intr_add_softint.9f \ diff --git a/usr/src/man/man9f/ddi_getiminor.9f b/usr/src/man/man9f/ddi_getiminor.9f deleted file mode 100644 index 29ab0cf54a..0000000000 --- a/usr/src/man/man9f/ddi_getiminor.9f +++ /dev/null @@ -1,94 +0,0 @@ -'\" te -.\" Copyright (c) 2004, Sun Microsystems, Inc. -.\" All Rights Reserved -.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. -.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License. -.\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH DDI_GETIMINOR 9F "Nov 18, 2004" -.SH NAME -ddi_getiminor \- get kernel internal minor number from an external dev_t -.SH SYNOPSIS -.LP -.nf -#include <sys/types.h> -#include <sys/mkdev.h> -#include <sys/ddi.h> - - - -\fBminor_t\fR \fBddi_getiminor\fR(\fBdev_t\fR \fIdev\fR); -.fi - -.SH INTERFACE LEVEL -.sp -.LP -This interface is obsolete. \fBgetminor\fR(9F) should be used instead. -.SH PARAMETERS -.sp -.LP -The following parameters are supported: -.sp -.ne 2 -.na -\fB\fIdev\fR\fR -.ad -.RS 7n -Device number. -.RE - -.SH DESCRIPTION -.sp -.LP -\fBddi_getiminor()\fR extracts the minor number from a device number. 
This call -should be used only for device numbers that have been passed to the kernel from -the user space through opaque interfaces such as the contents of -\fBioctl\fR(9E) and \fBputmsg\fR(2). The device numbers passed in using -standard device entry points must continue to be interpreted using the -\fBgetminor\fR(9F) interface. This new interface is used to translate between -user visible device numbers and in kernel device numbers. The two numbers may -differ in a clustered system. -.sp -.LP -For certain bus types, you can call this \fBDDI\fR function from a -high-interrupt context. These types include \fBISA\fR and SBus buses. See -\fBsysbus\fR(5), \fBisa\fR(5), and \fBsbus\fR(5) for details. -.SH CONTEXT -.sp -.LP -\fBddi_getiminor()\fR can be called from user context only. -.SH RETURN VALUES -.sp -.LP -The minor number or \fBEMINOR_UNKNOWN\fR if the minor number of the device is -invalid. -.SH ATTRIBUTES -.sp -.LP -See \fBattributes\fR(7) for a description of the following attributes: -.sp - -.sp -.TS -box; -c | c -l | l . -ATTRIBUTE TYPE ATTRIBUTE VALUE -_ -Stability Level Obsolete -.TE - -.SH SEE ALSO -.sp -.LP -.BR attributes (7), -.BR getmajor (9F), -.BR getminor (9F), -.BR makedevice (9F) -.sp -.LP -\fIWriting Device Drivers\fR -.SH WARNINGS -.sp -.LP -Drivers are required to replace calls to \fBddi_getminor.9f\fR by -\fBgetminor\fR(9F)) in order to compile under Solaris 10 and later versions. diff --git a/usr/src/pkg/manifests/system-kernel.man9f.inc b/usr/src/pkg/manifests/system-kernel.man9f.inc index ae782936d3..62a8927bae 100644 --- a/usr/src/pkg/manifests/system-kernel.man9f.inc +++ b/usr/src/pkg/manifests/system-kernel.man9f.inc @@ -11,7 +11,7 @@ # # Copyright 2017, Richard Lowe -# Copyright 2014 Garrett D'Amore <garrett@damore.org> +# Copyright 2022 Garrett D'Amore # Copyright 2020-2021 Tintri by DDN, Inc. All rights reserved. # Copyright 2016 Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org> # Copyright 2019 Joyent, Inc. 
@@ -439,7 +439,6 @@ link path=usr/share/man/man9f/ddi_get_soft_iblock_cookie.9f \ target=ddi_add_softintr.9f link path=usr/share/man/man9f/ddi_get_soft_state.9f target=ddi_soft_state.9f file path=usr/share/man/man9f/ddi_get_time.9f -file path=usr/share/man/man9f/ddi_getiminor.9f link path=usr/share/man/man9f/ddi_getlongprop.9f target=ddi_prop_op.9f link path=usr/share/man/man9f/ddi_getlongprop_buf.9f target=ddi_prop_op.9f link path=usr/share/man/man9f/ddi_getprop.9f target=ddi_prop_op.9f diff --git a/usr/src/uts/common/fs/ctfs/ctfs_event.c b/usr/src/uts/common/fs/ctfs/ctfs_event.c index c9a99e85fb..a8902c735a 100644 --- a/usr/src/uts/common/fs/ctfs/ctfs_event.c +++ b/usr/src/uts/common/fs/ctfs/ctfs_event.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/param.h> #include <sys/time.h> @@ -88,6 +86,7 @@ ctfs_endpoint_inactive(ctfs_endpoint_t *endpt) endpt->ctfs_endpt_flags = 0; cte_remove_listener(&endpt->ctfs_endpt_listener); } + pollhead_clean(&endpt->ctfs_endpt_listener.ctl_pollhead); mutex_exit(&endpt->ctfs_endpt_lock); } diff --git a/usr/src/uts/common/fs/portfs/port_fd.c b/usr/src/uts/common/fs/portfs/port_fd.c index 511c15e979..59651add4f 100644 --- a/usr/src/uts/common/fs/portfs/port_fd.c +++ b/usr/src/uts/common/fs/portfs/port_fd.c @@ -22,6 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
+ * Copyright 2022 Oxide Computer Company */ @@ -116,10 +117,7 @@ port_fd_callback(void *arg, int *events, pid_t pid, int flag, void *evp) mutex_enter(&pcp->pc_lock); pdp->pd_fp = NULL; pdp->pd_events = 0; - if (pdp->pd_php != NULL) { - pollhead_delete(pdp->pd_php, pdp); - pdp->pd_php = NULL; - } + polldat_disassociate(pdp); port_pcache_remove_fd(pcp, pfd); mutex_exit(&pcp->pc_lock); error = 0; @@ -339,7 +337,7 @@ port_associate_fd(port_t *pp, int source, uintptr_t object, int events, /* * To keep synchronization between VOP_POLL above and - * pollhead_insert below, it is necessary to + * polldat_associate() below, it is necessary to * call VOP_POLL() again (see port_bind_pollhead()). */ if (error) { @@ -525,23 +523,21 @@ port_bind_pollhead(pollhead_t **php, polldat_t *pdp, short *revents) int error; file_t *fp; - /* polldat_t associated with another pollhead_t pointer */ - if (pdp->pd_php != NULL) - pollhead_delete(pdp->pd_php, pdp); + /* break any existing association with pollhead */ + polldat_disassociate(pdp); /* - * Before pollhead_insert() pollwakeup() will not detect a polldat + * Before polldat_associate(), pollwakeup() will not detect a polldat * entry in the ph_list and the event notification will disappear. * This happens because polldat_t is still not associated with * the pointer to the pollhead_t structure. */ - pollhead_insert(*php, pdp); + polldat_associate(pdp, *php); /* * From now on event notification can be detected in pollwakeup(), * Use VOP_POLL() again to check the current status of the event. 
*/ - pdp->pd_php = *php; fp = pdp->pd_fp; curthread->t_pollcache = (pollcache_t *)pdp->pd_pcache; error = VOP_POLL(fp->f_vnode, pdp->pd_events, 0, revents, php, NULL); diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c index 21a1684179..e14c3e73ee 100644 --- a/usr/src/uts/common/fs/proc/prvnops.c +++ b/usr/src/uts/common/fs/proc/prvnops.c @@ -5144,9 +5144,11 @@ prfreecommon(prcommon_t *pcp) mutex_exit(&pcp->prc_mutex); else { mutex_exit(&pcp->prc_mutex); - ASSERT(pcp->prc_pollhead.ph_list == NULL); + ASSERT(pcp->prc_refcnt == 0); ASSERT(pcp->prc_selfopens == 0 && pcp->prc_writers == 0); + + pollhead_clean(&pcp->prc_pollhead); mutex_destroy(&pcp->prc_mutex); cv_destroy(&pcp->prc_wait); kmem_free(pcp, sizeof (prcommon_t)); diff --git a/usr/src/uts/common/io/devpoll.c b/usr/src/uts/common/io/devpoll.c index 7368c9b43d..afc087132b 100644 --- a/usr/src/uts/common/io/devpoll.c +++ b/usr/src/uts/common/io/devpoll.c @@ -26,6 +26,7 @@ /* * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright 2019 Joyent, Inc. + * Copyright 2022 Oxide Computer Company */ #include <sys/types.h> @@ -334,11 +335,7 @@ repoll: pdp->pd_fp = NULL; pdp->pd_events = 0; - if (pdp->pd_php != NULL) { - pollhead_delete(pdp->pd_php, - pdp); - pdp->pd_php = NULL; - } + polldat_disassociate(pdp); BT_CLEAR(pcp->pc_bitmap, fd); } else if (pfdp != NULL) { @@ -379,10 +376,7 @@ repoll: * that a valid one will be provided as part of * the later VOP_POLL. */ - if (pdp->pd_php != NULL) { - pollhead_delete(pdp->pd_php, pdp); - pdp->pd_php = NULL; - } + polldat_disassociate(pdp); /* * Since epoll is expected to act on the @@ -467,9 +461,8 @@ repoll: */ if (php != NULL && pdp->pd_php != NULL && php != pdp->pd_php) { - pollhead_delete(pdp->pd_php, pdp); - pdp->pd_php = php; - pollhead_insert(php, pdp); + polldat_disassociate(pdp); + polldat_associate(pdp, php); /* * The bit should still be set. */ @@ -567,11 +560,7 @@ repoll: * later add/modify event rearms them. 
*/ pdp->pd_events = POLLONESHOT; - if (pdp->pd_php != NULL) { - pollhead_delete(pdp->pd_php, - pdp); - pdp->pd_php = NULL; - } + polldat_disassociate(pdp); BT_CLEAR(pcp->pc_bitmap, fd); } else if (pdp->pd_events & POLLET) { /* @@ -582,8 +571,7 @@ repoll: */ if (php != NULL && pdp->pd_php == NULL) { - pollhead_insert(php, pdp); - pdp->pd_php = php; + polldat_associate(pdp, php); } /* @@ -614,12 +602,12 @@ repoll: BT_CLEAR(pcp->pc_bitmap, fd); } if (pdp->pd_php == NULL) { - pollhead_insert(php, pdp); - pdp->pd_php = php; + polldat_associate(pdp, php); /* * An event of interest may have * arrived between the VOP_POLL() and - * the pollhead_insert(); check again. + * the polldat_associate(), so we + * must check again. */ goto repoll; } @@ -1043,7 +1031,7 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) * We always set the bit when this fd is cached; * this forces the first DP_POLL to poll this fd. * Real performance gain comes from subsequent - * DP_POLL. We also attempt a pollhead_insert(); + * DP_POLL. We also attempt a polldat_associate(); * if it's not possible, we'll do it in dpioctl(). 
*/ BT_SET(pcp->pc_bitmap, fd); @@ -1056,14 +1044,11 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) pdp->pd_events |= pfdp->events; if (php != NULL) { if (pdp->pd_php == NULL) { - pollhead_insert(php, pdp); - pdp->pd_php = php; + polldat_associate(pdp, php); } else { if (pdp->pd_php != php) { - pollhead_delete(pdp->pd_php, - pdp); - pollhead_insert(php, pdp); - pdp->pd_php = php; + polldat_disassociate(pdp); + polldat_associate(pdp, php); } } } @@ -1087,10 +1072,7 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) pdp->pd_fp = NULL; pdp->pd_events = 0; ASSERT(pdp->pd_thread == NULL); - if (pdp->pd_php != NULL) { - pollhead_delete(pdp->pd_php, pdp); - pdp->pd_php = NULL; - } + polldat_disassociate(pdp); BT_CLEAR(pcp->pc_bitmap, fd); } } @@ -1662,11 +1644,8 @@ dpclose(dev_t dev, int flag, int otyp, cred_t *credp) hashtbl = pcp->pc_hash; for (i = 0; i < pcp->pc_hashsize; i++) { for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) { - if (pdp->pd_php != NULL) { - pollhead_delete(pdp->pd_php, pdp); - pdp->pd_php = NULL; - pdp->pd_fp = NULL; - } + polldat_disassociate(pdp); + pdp->pd_fp = NULL; } } /* diff --git a/usr/src/uts/common/io/ib/clients/of/sol_uverbs/sol_uverbs_event.c b/usr/src/uts/common/io/ib/clients/of/sol_uverbs/sol_uverbs_event.c index ea7b551cee..01160d9386 100644 --- a/usr/src/uts/common/io/ib/clients/of/sol_uverbs/sol_uverbs_event.c +++ b/usr/src/uts/common/io/ib/clients/of/sol_uverbs/sol_uverbs_event.c @@ -226,7 +226,7 @@ sol_uverbs_event_file_read(uverbs_ufile_uobj_t *ufile, struct uio *uiop, * sol_uverbs_event_file_poll * Input: * ufile - user file for desired completion channel event file - * events - The events that may occur. + * events - The events that may occur. * anyyet - A flag that is non-zero if any files in the set * of descriptors has an event waiting. * ct - Pointer to the callers context. @@ -238,7 +238,7 @@ sol_uverbs_event_file_read(uverbs_ufile_uobj_t *ufile, struct uio *uiop, * Zero on success, else error code. 
* EINVAL - Vnode does not point to valid event file. * Description: - * Support for event channel polling interface, allows use of completion + * Support for event channel polling interface, allows use of completion * channel in asynchronous type environment. If events may be read * without blocking indicate a POLLIN | POLLRDNORM event; otherwise if * no other descriptors in the set have data waiting, set the pollhead @@ -408,6 +408,7 @@ uverbs_release_event_file(sol_ofs_uobj_t *uobj) mutex_exit(&ufile->lock); + pollhead_clean(&ufile->poll_head); cv_destroy(&ufile->poll_wait); mutex_destroy(&ufile->lock); sol_ofs_uobj_free(uobj); @@ -417,7 +418,7 @@ uverbs_release_event_file(sol_ofs_uobj_t *uobj) * Function: * uverbs_ibt_to_ofa_event_code * Input: - * code - The OFA event code. + * code - The OFA event code. * Output: * The OFED event code. * Returns: @@ -803,9 +804,9 @@ uverbs_async_event_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, * Function: * uverbs_async_event_common * Input: - * uctxt - Pointer to the user context associated with the + * uctxt - Pointer to the user context associated with the * affiliated event. - * element - The users handle to the associated object. + * element - The users handle to the associated object. * event - The event type. * uobj_list - The list to enqueue the asynchronous event. * counter - The counter to track the event delivery. @@ -879,7 +880,7 @@ uverbs_async_event_common(uverbs_uctxt_uobj_t *uctxt, uint64_t element, * Returns: * None * Description: - * Release any completion and asynchronous events that may + * Release any completion and asynchronous events that may * be queued to the specified completion channel/UCQ but not * yet reaped. 
*/ diff --git a/usr/src/uts/common/io/ksocket/ksocket.c b/usr/src/uts/common/io/ksocket/ksocket.c index 7a40dd1db0..311fae6719 100644 --- a/usr/src/uts/common/io/ksocket/ksocket.c +++ b/usr/src/uts/common/io/ksocket/ksocket.c @@ -802,8 +802,7 @@ ksocket_spoll(ksocket_t ks, int timo, short events, short *revents, pdp->pd_events = events; pdp->pd_pcache = pcp; pcache_insert_fd(pcp, pdp, 1); - pollhead_insert(php, pdp); - pdp->pd_php = php; + polldat_associate(pdp, php); mutex_enter(&pcp->pc_lock); while (!(so->so_state & SS_CLOSING)) { @@ -836,11 +835,8 @@ ksocket_spoll(ksocket_t ks, int timo, short events, short *revents, } mutex_exit(&pcp->pc_lock); - if (pdp->pd_php != NULL) { - pollhead_delete(pdp->pd_php, pdp); - pdp->pd_php = NULL; - pdp->pd_fd = 0; - } + polldat_disassociate(pdp); + pdp->pd_fd = 0; /* * pollwakeup() may still interact with this pollcache. Wait until diff --git a/usr/src/uts/common/io/pciex/pcie.c b/usr/src/uts/common/io/pciex/pcie.c index ce81339ea2..c40055809a 100644 --- a/usr/src/uts/common/io/pciex/pcie.c +++ b/usr/src/uts/common/io/pciex/pcie.c @@ -25,6 +25,454 @@ * Copyright 2022 Oxide Computer Company */ +/* + * PCIe Initialization + * ------------------- + * + * The PCIe subsystem is split about and initializes itself in a couple of + * different places. This is due to the platform-specific nature of initializing + * resources and the nature of the SPARC PROM and how that influenced the + * subsystem. Note that traditional PCI (mostly seen these days in Virtual + * Machines) follows most of the same basic path outlined here, but skips a + * large chunk of PCIe-specific initialization. + * + * First, there is an initial device discovery phase that is taken care of by + * the platform. This is where we discover the set of devices that are present + * at system power on. These devices may or may not be hot-pluggable. In + * particular, this happens in a platform-specific way right now. 
In general, we + * expect most discovery to be driven by scanning each bus, device, and + * function, and seeing what actually exists and responds to configuration space + * reads. This is driven via pci_boot.c on x86. This may be seeded by something + * like device tree, a PROM, supplemented with ACPI, or by knowledge that the + * underlying platform has. + * + * As a part of this discovery process, the platform identifies the full set of + * resources that exist in the system for PCIe: + * + * o PCI buses + * o Prefetchable Memory + * o Non-prefetchable memory + * o I/O ports + * + * This process is driven by a platform's PCI platform Resource Discovery (PRD) + * module. The PRD definitions can be found in <sys/plat/pci_prd.h> and are used + * to discover these resources, which will be converted into the initial set of + * the standard properties in the system: 'regs', 'available', 'ranges', etc. + * Currently it is up to platform-specific code (which should ideally be + * consolidated at some point) to set up all these properties. + * + * As a part of the discovery process, the platform code will create a device + * node (dev_info_t) for each discovered function and will create a PCIe nexus + * for each overall root complex that exists in the system. Most root complexes + * will have multiple root ports, each of which is the foundation of an + * independent PCIe bus due to the point-to-point nature of PCIe. When a root + * complex is found, a nexus driver such as npe (Nexus for PCI Express) is + * attached. In the case of a non-PCIe-capable system this is where the older + * pci nexus driver would be used instead. + * + * To track data about a given device on a bus, a 'pcie_bus_t' structure is + * created for and assigned to every PCIe-based dev_info_t. This can be used to + * find the root port and get basic information about the device, its faults, + * and related information. This contains pointers to the corresponding root + * port as well. 
+ * + * A root complex has its pcie_bus_t initialized as part of the device discovery + * process. That is, because we're trying to bootstrap the actual tree and most + * platforms don't have a representation for this that's explicitly + * discoverable, this is created manually. See callers of pcie_rc_init_bus(). + * + * For other devices, bridges, and switches, the process is split into two. + * There is an initial pcie_bus_t that is created which will exist before we go + * through the actual driver attachment process. For example, on x86 this is + * done as part of the device and function discovery. The second pass of + * initialization is done only after the nexus driver actually is attached and + * it goes through and finishes processing all of its children. + * + * Child Initialization + * -------------------- + * + * Generally speaking, the platform will first enumerate all PCIe devices that + * are in the system before it actually creates a device tree. This is part of + * the bus/device/function scanning that is performed and from that dev_info_t + * nodes are created for each discovered device and are inserted into the + * broader device tree. Later in boot, the actual device tree is walked and the + * nodes go through the standard dev_info_t initialization process (DS_PROTO, + * DS_LINKED, DS_BOUND, etc.). + * + * PCIe-specific initialization can roughly be broken into the following pieces: + * + * 1. Platform initial discovery and resource assignment + * 2. The pcie_bus_t initialization + * 3. Nexus driver child initialization + * 4. Fabric initialization + * 5. Device driver-specific initialization + * + * The first part of this (1) and (2) are discussed in the previous section. + * Part (1) in particular is a combination of the PRD (platform resource + * discovery) and general device initialization. After this, because we have a + * device tree, most of the standard nexus initialization happens. 
+ * + * (5) is somewhat simple, so let's get into it before we discuss (3) and (4). + * This is the last thing that is called and that happens after all of the + * others are done. This is the logic that occurs in a driver's attach(9E) entry + * point. This is always device-specific and generally speaking should not be + * manipulating standard PCIe registers directly on their own. For example, the + * MSI/MSI-X, AER, Serial Number, etc. capabilities will be automatically dealt + * with by the framework in (3) and (4) below. In many cases, particularly + * things that are part of (4), adjusting them in the individual driver is not + * safe. + * + * Finally, let's talk about (3) and (4) as these are related. The NDI provides + * for a standard hook for a nexus to initialize its children. In our platforms, + * there are basically two possible PCIe nexus drivers: there is the generic + * pcieb -- PCIe bridge -- driver which is used for standard root ports, + * switches, etc. Then there is the platform-specific primary nexus driver, + * which is being slowly consolidated into a single one where it makes sense. An + * example of this is npe. + * + * Each of these has a child initialization function which is called from their + * DDI_CTLOPS_INITCHILD operation on the bus_ctl function pointer. This goes + * through and initializes a large number of different pieces of PCIe-based + * settings through the common pcie_initchild() function. This takes care of + * things like: + * + * o Advanced Error Reporting + * o Alternative Routing + * o Capturing information around link speed, width, serial numbers, etc. + * o Setting common properties around aborts + * + * There are a few caveats with this that need to be kept in mind: + * + * o A dev_info_t indicates a specific function. This means that a + * multi-function device will not all be initialized at the same time and + * there is no guarantee that all children will be initialized before one of + * them is attached. 
+ * o A child is only initialized if we have found a driver that matches an + * alias in the dev_info_t's compatible array property. While a lot of + * multi-function devices are often multiple instances of the same thing + * (e.g. a multi-port NIC with a function / NIC), this is not always the + * case and one cannot make any assumptions here. + * + * This in turn leads to the next form of initialization that takes place in the + * case of (4). This is where we take care of things that need to be consistent + * across either entire devices or more generally across an entire root port and + * all of its children. There are a few different examples of this: + * + * o Setting the maximum packet size + * o Determining the tag width + * + * Note that features which are only based on function 0, such as ASPM (Active + * State Power Management), hardware autonomous width disable, etc. ultimately + * do not go through this path today. There are some implications here in that + * today several of these things are captured on functions which may not have + * any control here. This is an area of needed improvement. + * + * The settings in (4) are initialized in a common way, via + * pcie_fabric_setup(). This is called into from two different parts of + * the stack: + * + * 1. When we attach a root port, which is driven by pcieb. + * 2. When we have a hotplug event that adds a device. + * + * In general here we are going to use the term 'fabric' to refer to everything + * that is downstream of a root port. This corresponds to what the PCIe + * specification calls a 'hierarchy domain'. Strictly speaking, this is fine + * until peer-to-peer requests begin to happen that cause you to need to forward + * things across root ports. At that point the scope of the fabric increases and + * these settings become more complicated. 
We currently optimize for the much + * more common case, which is that each root port is effectively independent + * from a PCIe transaction routing perspective. + * + * Put differently, we use the term 'fabric' to refer to a set of PCIe devices + * that can route transactions to one another, which is generally constrained to + * everything under a root port and that root ports are independent. If this + * constraint changes, then all one needs to do is replace the discussion of the + * root port below with the broader root complex and system. + * + * A challenge with these settings is that once they're set and devices are + * actively making requests, we cannot really change them without resetting the + * links and cancelling all outstanding transactions via device resets. Because + * this is not something that we want to do, we instead look at how and when we + * set this to constrain what's going on. + * + * Because of this we basically say that if a given fabric has more than one + * hot-plug capable device that's encountered, then we have to use safe defaults + * (which we can allow an operator to tune eventually via pcieadm). If we have a + * mix of non-hotpluggable slots with downstream endpoints present and + * hot-pluggable slots, then we're in this case. If we don't have hot-pluggable + * slots, then we can have an arbitrarily complex setup. Let's look at a few of + * these visually: + * + * In the following diagrams, RP stands for Root Port, EP stands for Endpoint. + * If something is hot-pluggable, then we label it with (HP). 
+ * + * (1) RP --> EP + * (2) RP --> Switch --> EP + * +--> EP + * +--> EP + * + * (3) RP --> Switch --> EP + * +--> EP + * +--> Switch --> EP + * +--> EP + * +--> EP + * + * + * (4) RP (HP) --> EP + * (5) RP (HP) --> Switch --> EP + * +--> EP + * +--> EP + * + * (6) RP --> Switch (HP) --> EP + * (7) RP (HP) --> Switch (HP) --> EP + * + * If we look at all of these, these are all cases where it's safe for us to set + * things based on all devices. (1), (2), and (3) are straightforward because + * they have no hot-pluggable elements. This means that nothing should come/go + * on the system and we can set up fabric-wide properties as part of the root + * port. + * + * Case (4) is the most standard one that we encounter for hot-plug. Here you + * have a root port directly connected to an endpoint. The most common example + * would be an NVMe device plugged into a root port. Case (5) is interesting to + * highlight. While there is a switch and multiple endpoints there, they are + * showing up as a unit. This ends up being a weirder variant of (4), but it is + * safe for us to set advanced properties because we can figure out what the + * total set should be. + * + * Now, the more interesting bits here are (6) and (7). The reason that (6) + * works is that ultimately there is only a single down-stream port here that is + * hot-pluggable and all non-hotpluggable ports do not have a device present, + * which suggests that they will never have a device present. (7) also could be + * made to work by making the observation that if there's truly only one + * endpoint in a fabric, it doesn't matter how many switches there are that are + * hot-pluggable. This would only hold if we can assume for some reason that no + * other endpoints could be added. 
+ * + * In turn, let's look at several cases that we believe aren't safe: + * + * (8) RP --> Switch --> EP + * +--> EP + * (HP) +--> EP + * + * (9) RP --> Switch (HP) +--> EP + * (HP) +--> EP + * + * (10) RP (HP) --> Switch (HP) +--> EP + * (HP) +--> EP + * + * All of these are situations where it's much more explicitly unsafe. Let's + * take (8). The problem here is that the devices on the non-hotpluggable + * downstream switches are always there and we should assume all device drivers + * will be active and performing I/O when the hot-pluggable slot changes. If the + * hot-pluggable slot has a lower max payload size, then we're mostly out of + * luck. The case of (9) is very similar to (8), just that we have more hot-plug + * capable slots. + * + * Finally (10) is a case of multiple instances of hotplug. (9) and (10) are the + * more general case of (6) and (7). While we can try to detect (6) and (7) more + * generally or try to make it safe, we're going to start with a simpler form of + * detection for this, which roughly follows the following rules: + * + * o If there are no hot-pluggable slots in an entire fabric, then we can set + * all fabric properties based on device capabilities. + * o If we encounter a hot-pluggable slot, we can only set fabric properties + * based on device capabilities if: + * + * 1. The hotpluggable slot is a root port. + * 2. There are no other hotpluggable devices downstream of it. + * + * Otherwise, if neither of the above is true, then we must use the basic PCIe + * defaults for various fabric-wide properties (discussed below). Even in these + * more complicated cases, device-specific properties such as the configuration + * of AERs, ASPM, etc. are still handled in the general pcie_init_bus() and + * related discussed earlier here. + * + * Because the only fabrics that we'll change are those that correspond to root + * ports, we will only call into the actual fabric feature setup when one of + * those changes. 
This has the side effect of simplifying locking. When we make + * changes here we need to be able to hold the entire device tree under the root + * port (including the root port and its parent). This is much harder to do + * safely when starting in the middle of the tree. + * + * Handling of Specific Properties + * ------------------------------- + * + * This section goes into the rationale behind how we initialize and program + * various parts of the PCIe stack. + * + * 5-, 8-, 10- AND 14-BIT TAGS + * + * Tags are part of PCIe transactions and when combined with a device identifier + * are used to uniquely identify a transaction. In PCIe parlance, a Requester + * (someone who initiates a PCIe request) sets a unique tag in the request and + * the Completer (someone who processes and responds to a PCIe request) echoes + * the tag back. This means that a requester generally is responsible for + * ensuring that they don't reuse a tag between transactions. + * + * Thus the number of tags that a device has relates to the number of + * outstanding transactions that it can have, which are usually tied to the + * number of outstanding DMA transfers. The size of these transactions is also + * then scoped by the handling of the Maximum Packet Payload. + * + * In PCIe 1.0, devices default to a 5-bit tag. There was also an option to + * support an 8-bit tag. The 8-bit extended tag did not distinguish between a + * Requester or Completer. There was a bit to indicate device support of 8-bit + * tags in the Device Capabilities Register of the PCIe Capability and a + * separate bit to enable it in the Device Control Register of the PCIe + * Capability. + * + * In PCIe 4.0, support for a 10-bit tag was added. The specification broke + * apart the support bit into multiple pieces. In particular, in the Device + * Capabilities 2 register of the PCIe Capability there is a separate bit to + * indicate whether the device supports 10-bit completions and 10-bit requests. 
+ * All PCIe 4.0 compliant devices are required to support 10-bit tags if they + * operate at 16.0 GT/s speed (a PCIe Gen 4 compliant device does not have to + * operate at Gen 4 speeds). + * + * This allows a device to support 10-bit completions but not 10-bit requests. + * A device that supports 10-bit requests is required to support 10-bit + * completions. There is no ability to enable or disable 10-bit completion + * support in the Device Control 2 register. There is only a bit to enable + * 10-bit requests. This distinction makes our life easier as this means that as + * long as the entire fabric supports 10-bit completions, it doesn't matter if + * not all devices support 10-bit requests and we can enable them as required. + * More on this in a bit. + * + * In PCIe 6.0, another set of bits was added for 14-bit tags. These follow the + * same pattern as the 10-bit tags. The biggest difference is that the + * capabilities and control for these are found in the Device Capabilities 3 + * and Device Control 3 register of the Device 3 Extended Capability. Similar to + * what we see with 10-bit tags, requesters are required to support the + * completer capability. The only control bit is for whether or not they enable + * a 14-bit requester. + * + * PCIe switches sit between root ports and endpoints and show up to + * software as a set of bridges. Bridges generally don't have to know about tags + * as they are usually neither requesters nor completers (unless directly talking + * to the bridge instance). That is, they are generally required to forward + * packets without modifying them. This works until we deal with switch error + * handling. At that point, the switch may try to interpret the transaction and + * if it doesn't understand the tagging scheme in use, return the transaction + * with the wrong tag and also an incorrectly diagnosed error (usually a + * malformed TLP). 
+ * + * With all this, we construct a somewhat simple policy of how and when we + * enable extended tags: + * + * o If we have a complex hotplug-capable fabric (based on the discussion + * earlier in fabric-specific settings), then we cannot enable any of the + * 8-bit, 10-bit, and 14-bit tagging features. This is due to the issues + * with intermediate PCIe switches and related. + * + * o If every device supports 8-bit capable tags, then we will go through and + * enable those everywhere. + * + * o If every device supports 10-bit capable completions, then we will enable + * 10-bit requester on every device that supports it. + * + * o If every device supports 14-bit capable completions, then we will enable + * 14-bit requesters on every device that supports it. + * + * This is the simpler end of the policy and one that is relatively easy to + * implement. While we could attempt to relax the constraint that every device + * in the fabric implement these features by making assumptions about peer-to- + * peer requests (that is devices at the same layer in the tree won't talk to + * one another), that is a lot of complexity. For now, we leave such an + * implementation to those who need it in the future. + * + * MAX PAYLOAD SIZE + * + * When performing transactions on the PCIe bus, a given transaction has a + * maximum allowed size. This size is called the MPS or 'Maximum Payload Size'. + * A given device reports its maximum supported size in the Device Capabilities + * register of the PCIe Capability. It is then set in the Device Control + * register. + * + * One of the challenges with this value is that different functions of a device + * have independent values, but strictly speaking are required to actually have + * the same value programmed in all of them lest device behavior goes awry. When + * a device has the ARI (alternative routing ID) capability enabled, then only + * function 0 controls the actual payload size. 
+ * + * The settings for this need to be consistent throughout the fabric. A + * Transmitter is not allowed to create a TLP that exceeds its maximum packet + * size and a Receiver is not allowed to receive a packet that exceeds its + * maximum packet size. In all of these cases, this would result in something + * like a malformed TLP error. + * + * Effectively, this means that everything on a given fabric must have the same + * value programmed in its Device Control register for this value. While in the + * case of tags, switches generally weren't completers or requesters, here every + * device along the path is subject to this. This makes the actual value that we + * set throughout the fabric even more important and the constraints of hotplug + * even worse to deal with. + * + * Because a hotplug device can be inserted with any packet size, if we hit + * anything other than the simple hotplug cases discussed in the fabric-specific + * settings section, then we must use the smallest size of 128 byte payloads. + * This is because a device could be plugged in that supports something smaller + * than we had otherwise set. If there are other active devices, those could not + * be changed without quiescing the entire fabric. As such our algorithm is as + * follows: + * + * 1. Scan the entire fabric, keeping track of the smallest seen MPS in the + * Device Capabilities Register. + * 2. If we have a complex fabric, program each Device Control register with + * a 128 byte maximum payload size, otherwise, program it with the + * discovered value. + * + * + * MAX READ REQUEST SIZE + * + * The maximum read request size (mrrs) is a much more confusing thing when + * compared to the maximum payload size counterpart. The maximum payload size + * (MPS) above is what restricts the actual size of a TLP. The mrrs value + * is used to control part of the behavior of Memory Read Request, which is not + * strictly speaking subject to the MPS. 
A PCIe device is allowed to respond to + * a Memory Read Request with fewer bytes than were actually requested in a + * single completion. In general, the default size that a root complex and its + * root port will reply with is based around the length of a cache line. + * + * What this ultimately controls is the number of requests that the Requester + * has to make and trades off bandwidth, bus sharing, and related here. For + * example, if the maximum read request size is 4 KiB, then the requester would + * only issue a single read request asking for 4 KiB. It would still receive + * these as multiple packets in units of the MPS. If, however, the maximum read + * request was only say 512 B, then it would need to make 8 separate requests, + * potentially increasing latency. On the other hand, if systems are relying on + * total requests for QoS, then it's important to set it to something that's + * closer to the actual MPS. + * + * Traditionally, the OS has not been the most straightforward about this. It's + * important to remember that setting this up is also somewhat in the realm of + * system firmware. Due to the PCI Firmware specification, the firmware may have + * set up a value for not just the MRRS but also the MPS. As such, our logic + * basically left the MRRS alone and used whatever the device had there as long + * as we weren't shrinking the device's MPS. If we were, then we'd set it to the + * MPS. If the device was a root port, then it was just left at a system wide + * and PCIe default of 512 bytes. + * + * If we survey firmware (which isn't easy due to its nature), we have seen that + * in most cases the firmware just doesn't do anything and leaves it to the + * device's default, which is basically just the PCIe default, unless it has + * specific knowledge of something like say wanting to do something for an NVMe + * device. The same is generally true of other systems, leaving it at its + * default unless otherwise set by a device driver. 
+ * + * Because this value doesn't really have the same constraints as other fabric + * properties, this becomes much simpler and we instead opt to set it as part of + * the device node initialization. In addition, there are no real rules about + * different functions having different values here as it doesn't really impact + * the TLP processing the same way that the MPS does. + * + * While we should add a fuller way of setting this and allowing operator + * override of the MRRS based on things like device class, etc. that is driven + * by pcieadm, that is left to the future. For now we opt to ensure that all devices + * are kept at their default (512 bytes or whatever firmware left behind) and we + * ensure that root ports always have the mrrs set to 512. + */ + #include <sys/sysmacros.h> #include <sys/types.h> #include <sys/kmem.h> @@ -142,7 +590,6 @@ uint32_t pcie_aer_suce_severity = PCIE_AER_SUCE_SERR_ASSERT | \ PCIE_AER_SUCE_UC_ADDR_ERR | PCIE_AER_SUCE_UC_ATTR_ERR | \ PCIE_AER_SUCE_USC_MSG_DATA_ERR; -int pcie_max_mps = PCIE_DEVCTL_MAX_PAYLOAD_4096 >> 5; int pcie_disable_ari = 0; /* @@ -618,9 +1065,12 @@ pcie_determine_aspm(dev_info_t *dip) } /* - * PCI-Express child device initialization. - * This function enables generic pci-express interrupts and error - * handling. + * PCI-Express child device initialization. Note, this only will be called on a + * device or function if we actually attach a device driver to it. + * + * This function enables generic pci-express interrupts and error handling. + * Note, tagging, the max packet size, and related are all set up before this + * point and are performed in pcie_fabric_setup(). 
* * @param pdip root dip (root nexus's dip) * @param cdip child's dip (device's dip) @@ -774,11 +1224,6 @@ pcie_initchild(dev_info_t *cdip) bus_p->bus_ari = B_TRUE; } - if (pcie_initchild_mps(cdip) == DDI_FAILURE) { - pcie_fini_cfghdl(cdip); - return (DDI_FAILURE); - } - return (DDI_SUCCESS); } @@ -1519,11 +1964,15 @@ pcie_init_bus(dev_info_t *dip, pcie_req_id_t bdf, uint8_t flags) for (base = pci_cfgacc_get8(rcdip, bdf, base); base && num_cap; base = pci_cfgacc_get8(rcdip, bdf, base + PCI_CAP_NEXT_PTR)) { capid = pci_cfgacc_get8(rcdip, bdf, base); + uint16_t pcap; + switch (capid) { case PCI_CAP_ID_PCI_E: bus_p->bus_pcie_off = base; - bus_p->bus_dev_type = pci_cfgacc_get16(rcdip, bdf, - base + PCIE_PCIECAP) & PCIE_PCIECAP_DEV_TYPE_MASK; + pcap = pci_cfgacc_get16(rcdip, bdf, base + + PCIE_PCIECAP); + bus_p->bus_dev_type = pcap & PCIE_PCIECAP_DEV_TYPE_MASK; + bus_p->bus_pcie_vers = pcap & PCIE_PCIECAP_VER_MASK; /* Check and save PCIe hotplug capability information */ if ((PCIE_IS_RP(bus_p) || PCIE_IS_SWD(bus_p)) && @@ -1578,22 +2027,24 @@ pcie_init_bus(dev_info_t *dip, pcie_req_id_t bdf, uint8_t flags) capid = pci_cfgacc_get32(rcdip, bdf, base); if (capid == PCI_CAP_EINVAL32) break; - if (((capid >> PCIE_EXT_CAP_ID_SHIFT) & PCIE_EXT_CAP_ID_MASK) - == PCIE_EXT_CAP_ID_AER) { + switch ((capid >> PCIE_EXT_CAP_ID_SHIFT) & + PCIE_EXT_CAP_ID_MASK) { + case PCIE_EXT_CAP_ID_AER: bus_p->bus_aer_off = base; break; + case PCIE_EXT_CAP_ID_DEV3: + bus_p->bus_dev3_off = base; + break; } } - /* - * Save and record speed information about the device. 
- */ - caps_done: /* save RP dip and RP bdf */ if (PCIE_IS_RP(bus_p)) { bus_p->bus_rp_dip = dip; bus_p->bus_rp_bdf = bus_p->bus_bdf; + + bus_p->bus_fab = PCIE_ZALLOC(pcie_fabric_data_t); } else { for (pdip = ddi_get_parent(dip); pdip; pdip = ddi_get_parent(pdip)) { @@ -1626,7 +2077,6 @@ caps_done: bus_p->bus_soft_state = PCI_SOFT_STATE_CLOSED; (void) atomic_swap_uint(&bus_p->bus_fm_flags, 0); - bus_p->bus_mps = 0; ndi_set_bus_private(dip, B_TRUE, DEVI_PORT_TYPE_PCI, (void *)bus_p); @@ -1712,6 +2162,11 @@ pcie_fini_bus(dev_info_t *dip, uint8_t flags) pcie_fini_plat(dip); pcie_fini_pfd(dip); + if (PCIE_IS_RP(bus_p)) { + kmem_free(bus_p->bus_fab, sizeof (pcie_fabric_data_t)); + bus_p->bus_fab = NULL; + } + kmem_free(bus_p->bus_assigned_addr, (sizeof (pci_regspec_t) * bus_p->bus_assigned_entries)); kmem_free(bus_p->bus_addr_ranges, @@ -2245,233 +2700,6 @@ pcie_is_link_disabled(dev_info_t *dip) } /* - * Initialize the MPS for a root port. - * - * dip - dip of root port device. - */ -void -pcie_init_root_port_mps(dev_info_t *dip) -{ - pcie_bus_t *bus_p = PCIE_DIP2BUS(dip); - int rp_cap, max_supported = pcie_max_mps; - int circular_count; - - ndi_devi_enter(dip, &circular_count); - (void) pcie_get_fabric_mps(ddi_get_parent(dip), - ddi_get_child(dip), &max_supported); - ndi_devi_exit(dip, circular_count); - - rp_cap = PCI_CAP_GET16(bus_p->bus_cfg_hdl, 0, - bus_p->bus_pcie_off, PCIE_DEVCAP) & - PCIE_DEVCAP_MAX_PAYLOAD_MASK; - - if (rp_cap < max_supported) - max_supported = rp_cap; - - bus_p->bus_mps = max_supported; - (void) pcie_initchild_mps(dip); -} - -/* - * Initialize the Maximum Payload Size of a device. - * - * cdip - dip of device. 
- * - * returns - DDI_SUCCESS or DDI_FAILURE - */ -int -pcie_initchild_mps(dev_info_t *cdip) -{ - pcie_bus_t *bus_p; - dev_info_t *pdip = ddi_get_parent(cdip); - uint8_t dev_type; - - bus_p = PCIE_DIP2BUS(cdip); - if (bus_p == NULL) { - PCIE_DBG("%s: BUS not found.\n", - ddi_driver_name(cdip)); - return (DDI_FAILURE); - } - - dev_type = bus_p->bus_dev_type; - - /* - * For ARI Devices, only function zero's MPS needs to be set. - */ - if ((dev_type == PCIE_PCIECAP_DEV_TYPE_PCIE_DEV) && - (pcie_ari_is_enabled(pdip) == PCIE_ARI_FORW_ENABLED)) { - pcie_req_id_t child_bdf; - - if (pcie_get_bdf_from_dip(cdip, &child_bdf) == DDI_FAILURE) - return (DDI_FAILURE); - if ((child_bdf & PCIE_REQ_ID_ARI_FUNC_MASK) != 0) - return (DDI_SUCCESS); - } - - if (PCIE_IS_PCIE(bus_p)) { - int suggested_mrrs, fabric_mps; - uint16_t device_mps, device_mps_cap, device_mrrs, dev_ctrl; - - dev_ctrl = PCIE_CAP_GET(16, bus_p, PCIE_DEVCTL); - if ((fabric_mps = (PCIE_IS_RP(bus_p) ? bus_p : - PCIE_DIP2BUS(pdip))->bus_mps) < 0) { - dev_ctrl = (dev_ctrl & ~(PCIE_DEVCTL_MAX_READ_REQ_MASK | - PCIE_DEVCTL_MAX_PAYLOAD_MASK)) | - (pcie_devctl_default & - (PCIE_DEVCTL_MAX_READ_REQ_MASK | - PCIE_DEVCTL_MAX_PAYLOAD_MASK)); - - PCIE_CAP_PUT(16, bus_p, PCIE_DEVCTL, dev_ctrl); - return (DDI_SUCCESS); - } - - device_mps_cap = PCIE_CAP_GET(16, bus_p, PCIE_DEVCAP) & - PCIE_DEVCAP_MAX_PAYLOAD_MASK; - - device_mrrs = (dev_ctrl & PCIE_DEVCTL_MAX_READ_REQ_MASK) >> - PCIE_DEVCTL_MAX_READ_REQ_SHIFT; - - if (device_mps_cap < fabric_mps) - device_mrrs = device_mps = device_mps_cap; - else - device_mps = (uint16_t)fabric_mps; - - suggested_mrrs = (uint32_t)ddi_prop_get_int(DDI_DEV_T_ANY, - cdip, DDI_PROP_DONTPASS, "suggested-mrrs", device_mrrs); - - if ((device_mps == fabric_mps) || - (suggested_mrrs < device_mrrs)) - device_mrrs = (uint16_t)suggested_mrrs; - - /* - * Replace MPS and MRRS settings. 
- */ - dev_ctrl &= ~(PCIE_DEVCTL_MAX_READ_REQ_MASK | - PCIE_DEVCTL_MAX_PAYLOAD_MASK); - - dev_ctrl |= ((device_mrrs << PCIE_DEVCTL_MAX_READ_REQ_SHIFT) | - device_mps << PCIE_DEVCTL_MAX_PAYLOAD_SHIFT); - - PCIE_CAP_PUT(16, bus_p, PCIE_DEVCTL, dev_ctrl); - - bus_p->bus_mps = device_mps; - } - - return (DDI_SUCCESS); -} - -/* - * Scans a device tree/branch for a maximum payload size capabilities. - * - * rc_dip - dip of Root Complex. - * dip - dip of device where scan will begin. - * max_supported (IN) - maximum allowable MPS. - * max_supported (OUT) - maximum payload size capability of fabric. - */ -void -pcie_get_fabric_mps(dev_info_t *rc_dip, dev_info_t *dip, int *max_supported) -{ - if (dip == NULL) - return; - - /* - * Perform a fabric scan to obtain Maximum Payload Capabilities - */ - (void) pcie_scan_mps(rc_dip, dip, max_supported); - - PCIE_DBG("MPS: Highest Common MPS= %x\n", max_supported); -} - -/* - * Scans fabric and determines Maximum Payload Size based on - * highest common denominator alogorithm - */ -static void -pcie_scan_mps(dev_info_t *rc_dip, dev_info_t *dip, int *max_supported) -{ - int circular_count; - pcie_max_supported_t max_pay_load_supported; - - max_pay_load_supported.dip = rc_dip; - max_pay_load_supported.highest_common_mps = *max_supported; - - ndi_devi_enter(ddi_get_parent(dip), &circular_count); - ddi_walk_devs(dip, pcie_get_max_supported, - (void *)&max_pay_load_supported); - ndi_devi_exit(ddi_get_parent(dip), circular_count); - - *max_supported = max_pay_load_supported.highest_common_mps; -} - -/* - * Called as part of the Maximum Payload Size scan. 
- */ -static int -pcie_get_max_supported(dev_info_t *dip, void *arg) -{ - uint32_t max_supported; - uint16_t cap_ptr; - pcie_max_supported_t *current = (pcie_max_supported_t *)arg; - pci_regspec_t *reg; - int rlen; - caddr_t virt; - ddi_acc_handle_t config_handle; - - if (ddi_get_child(current->dip) == NULL) { - goto fail1; - } - - if (pcie_dev(dip) == DDI_FAILURE) { - PCIE_DBG("MPS: pcie_get_max_supported: %s: " - "Not a PCIe dev\n", ddi_driver_name(dip)); - goto fail1; - } - - /* - * If the suggested-mrrs property exists, then don't include this - * device in the MPS capabilities scan. - */ - if (ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, - "suggested-mrrs") != 0) - goto fail1; - - if (ddi_getlongprop(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "reg", - (caddr_t)®, &rlen) != DDI_PROP_SUCCESS) { - PCIE_DBG("MPS: pcie_get_max_supported: %s: " - "Can not read reg\n", ddi_driver_name(dip)); - goto fail1; - } - - if (pcie_map_phys(ddi_get_child(current->dip), reg, &virt, - &config_handle) != DDI_SUCCESS) { - PCIE_DBG("MPS: pcie_get_max_supported: %s: pcie_map_phys " - "failed\n", ddi_driver_name(dip)); - goto fail2; - } - - if ((PCI_CAP_LOCATE(config_handle, PCI_CAP_ID_PCI_E, &cap_ptr)) == - DDI_FAILURE) { - goto fail3; - } - - max_supported = PCI_CAP_GET16(config_handle, 0, cap_ptr, - PCIE_DEVCAP) & PCIE_DEVCAP_MAX_PAYLOAD_MASK; - - PCIE_DBG("PCIE MPS: %s: MPS Capabilities %x\n", ddi_driver_name(dip), - max_supported); - - if (max_supported < current->highest_common_mps) - current->highest_common_mps = max_supported; - -fail3: - pcie_unmap_phys(&config_handle, reg); -fail2: - kmem_free(reg, rlen); -fail1: - return (DDI_WALK_CONTINUE); -} - -/* * Determines if there are any root ports attached to a root complex. * * dip - dip of root complex @@ -3253,3 +3481,292 @@ pcie_link_retrain(dev_info_t *dip) return (0); } + +/* + * Here we're going through and grabbing information about a given PCIe device. + * Our situation is a little bit complicated at this point. 
This gets invoked + * both during early initialization and during hotplug events. We cannot rely on + * the device node having been fully set up, that is, while the pcie_bus_t + * normally contains a ddi_acc_handle_t for configuration space, that may not be + * valid yet as this can occur before child initialization or we may be dealing + * with a function that will never have a handle. + * + * However, we should always have a fully furnished pcie_bus_t, which means that + * we can get its bdf and use that to access the devices configuration space. + */ +static int +pcie_fabric_feature_scan(dev_info_t *dip, void *arg) +{ + pcie_bus_t *bus_p; + uint32_t devcap; + uint16_t mps; + dev_info_t *rcdip; + pcie_fabric_data_t *fab = arg; + + /* + * Skip over non-PCIe devices. If we encounter something here, we don't + * bother going through any of its children because we don't have reason + * to believe that a PCIe device that this will impact will exist below + * this. While it is possible that there's a PCIe fabric downstream an + * intermediate old PCI/PCI-X bus, at that point, we'll still trigger + * our complex fabric detection and use the minimums. + * + * The reason this doesn't trigger an immediate flagging as a complex + * case like the one below is because we could be scanning a device that + * is a nexus driver and has children already (albeit that would be + * somewhat surprising as we don't anticipate being called at this + * point). + */ + if (pcie_dev(dip) != DDI_SUCCESS) { + return (DDI_WALK_PRUNECHILD); + } + + /* + * If we fail to find a pcie_bus_t for some reason, that's somewhat + * surprising. We log this fact and set the complex flag and indicate it + * was because of this case. This immediately transitions us to a + * "complex" case which means use the minimal, safe, settings. 
+ */ + bus_p = PCIE_DIP2BUS(dip); + if (bus_p == NULL) { + dev_err(dip, CE_WARN, "failed to find associated pcie_bus_t " + "during fabric scan"); + fab->pfd_flags |= PCIE_FABRIC_F_COMPLEX; + return (DDI_WALK_TERMINATE); + } + rcdip = pcie_get_rc_dip(dip); + + /* + * First, start by determining what the device's tagging and max packet + * size is. All PCIe devices will always have the 8-bit tag information + * as this has existed since PCIe 1.0. 10-bit tagging requires a V2 + * PCIe capability. 14-bit requires the DEV3 cap. If we are missing a + * version or capability, then we always treat that as lacking the bits + * in the fabric. + */ + ASSERT3U(bus_p->bus_pcie_off, !=, 0); + devcap = pci_cfgacc_get32(rcdip, bus_p->bus_bdf, bus_p->bus_pcie_off + + PCIE_DEVCAP); + mps = devcap & PCIE_DEVCAP_MAX_PAYLOAD_MASK; + if (mps < fab->pfd_mps_found) { + fab->pfd_mps_found = mps; + } + + if ((devcap & PCIE_DEVCAP_EXT_TAG_8BIT) == 0) { + fab->pfd_tag_found &= ~PCIE_TAG_8B; + } + + if (bus_p->bus_pcie_vers == PCIE_PCIECAP_VER_2_0) { + uint32_t devcap2 = pci_cfgacc_get32(rcdip, bus_p->bus_bdf, + bus_p->bus_pcie_off + PCIE_DEVCAP2); + if ((devcap2 & PCIE_DEVCAP2_10B_TAG_COMP_SUP) == 0) { + fab->pfd_tag_found &= ~PCIE_TAG_10B_COMP; + } + } else { + fab->pfd_tag_found &= ~PCIE_TAG_10B_COMP; + } + + if (bus_p->bus_dev3_off != 0) { + uint32_t devcap3 = pci_cfgacc_get32(rcdip, bus_p->bus_bdf, + bus_p->bus_dev3_off + PCIE_DEVCAP3); + if ((devcap3 & PCIE_DEVCAP3_14B_TAG_COMP_SUP) == 0) { + fab->pfd_tag_found &= ~PCIE_TAG_14B_COMP; + } + } else { + fab->pfd_tag_found &= ~PCIE_TAG_14B_COMP; + } + + /* + * Now that we have captured device information, we must go and ask + * questions of the topology here. The big theory statement enumerates + * several types of cases. The big question we need to answer is have we + * encountered a hotpluggable bridge that means we need to mark this as + * complex. 
+ * + * The big theory statement notes several different kinds of hotplug + * topologies that exist that we can theoretically support. Right now we + * opt to keep our lives simple and focus solely on (4) and (5). These + * can both be summarized by a single, fairly straightforward rule: + * + * The only allowed hotpluggable entity is a root port. + * + * The reason that this can work and detect cases like (6), (7), and our + * other invalid ones is that the hotplug code will scan and find all + * children before we are called into here. + */ + if (bus_p->bus_hp_sup_modes != 0) { + /* + * We opt to terminate in this case because there's no value in + * scanning the rest of the tree at this point. + */ + if (!PCIE_IS_RP(bus_p)) { + fab->pfd_flags |= PCIE_FABRIC_F_COMPLEX; + return (DDI_WALK_TERMINATE); + } + + fab->pfd_flags |= PCIE_FABRIC_F_RP_HP; + } + + /* + * As our walk starts at a root port, we need to make sure that we don't + * pick up any of its siblings and their children as those would be + * different PCIe fabric domains for us to scan. In many hardware + * platforms multiple root ports are all at the same level in the tree. + */ + if (bus_p->bus_rp_dip == dip) { + return (DDI_WALK_PRUNESIB); + } + + return (DDI_WALK_CONTINUE); +} + +static int +pcie_fabric_feature_set(dev_info_t *dip, void *arg) +{ + pcie_bus_t *bus_p; + dev_info_t *rcdip; + pcie_fabric_data_t *fab = arg; + uint32_t devcap, devctl; + + if (pcie_dev(dip) != DDI_SUCCESS) { + return (DDI_WALK_PRUNECHILD); + } + + /* + * The missing bus_t sent us into the complex case previously. We still + * need to make sure all devices have values we expect here and thus + * don't terminate like the above. 
+ */ + bus_p = PCIE_DIP2BUS(dip); + if (bus_p == NULL) { + return (DDI_WALK_CONTINUE); + } + rcdip = pcie_get_rc_dip(dip); + + devcap = pci_cfgacc_get32(rcdip, bus_p->bus_bdf, bus_p->bus_pcie_off + + PCIE_DEVCAP); + devctl = pci_cfgacc_get16(rcdip, bus_p->bus_bdf, bus_p->bus_pcie_off + + PCIE_DEVCTL); + + if ((devcap & PCIE_DEVCAP_EXT_TAG_8BIT) != 0 && + (fab->pfd_tag_act & PCIE_TAG_8B) != 0) { + devctl |= PCIE_DEVCTL_EXT_TAG_FIELD_EN; + } + + devctl &= ~PCIE_DEVCTL_MAX_PAYLOAD_MASK; + ASSERT0(fab->pfd_mps_act & ~PCIE_DEVCAP_MAX_PAYLOAD_MASK); + devctl |= fab->pfd_mps_act << PCIE_DEVCTL_MAX_PAYLOAD_SHIFT; + + pci_cfgacc_put16(rcdip, bus_p->bus_bdf, bus_p->bus_pcie_off + + PCIE_DEVCTL, devctl); + + if (bus_p->bus_pcie_vers == PCIE_PCIECAP_VER_2_0 && + (fab->pfd_tag_act & PCIE_TAG_10B_COMP) != 0) { + uint32_t devcap2 = pci_cfgacc_get32(rcdip, bus_p->bus_bdf, + bus_p->bus_pcie_off + PCIE_DEVCAP2); + + if ((devcap2 & PCIE_DEVCAP2_10B_TAG_REQ_SUP) == 0) { + uint16_t devctl2 = pci_cfgacc_get16(rcdip, + bus_p->bus_bdf, bus_p->bus_pcie_off + PCIE_DEVCTL2); + devctl2 |= PCIE_DEVCTL2_10B_TAG_REQ_EN; + pci_cfgacc_put16(rcdip, bus_p->bus_bdf, + bus_p->bus_pcie_off + PCIE_DEVCTL2, devctl2); + } + } + + if (bus_p->bus_dev3_off != 0 && + (fab->pfd_tag_act & PCIE_TAG_14B_COMP) != 0) { + uint32_t devcap3 = pci_cfgacc_get32(rcdip, bus_p->bus_bdf, + bus_p->bus_dev3_off + PCIE_DEVCAP3); + + if ((devcap3 & PCIE_DEVCAP3_14B_TAG_REQ_SUP) == 0) { + uint16_t devctl3 = pci_cfgacc_get16(rcdip, + bus_p->bus_bdf, bus_p->bus_dev3_off + PCIE_DEVCTL3); + devctl3 |= PCIE_DEVCTL3_14B_TAG_REQ_EN; + pci_cfgacc_put16(rcdip, bus_p->bus_bdf, + bus_p->bus_pcie_off + PCIE_DEVCTL2, devctl3); + } + } + + /* + * As our walk starts at a root port, we need to make sure that we don't + * pick up any of its siblings and their children as those would be + * different PCIe fabric domains for us to scan. In many hardware + * platforms multiple root ports are all at the same level in the tree. 
+ */ + if (bus_p->bus_rp_dip == dip) { + return (DDI_WALK_PRUNESIB); + } + + return (DDI_WALK_CONTINUE); +} + +/* + * This is used to scan and determine the total set of PCIe fabric settings that + * we should have in the system for everything downstream of this specified root + * port. Note, it is only really safe to call this while working from the + * perspective of a root port as we will be walking down the entire device tree. + * + * However, our callers, particularly hotplug, don't have all the information + * we'd like. In particular, we need to check that: + * + * o This is actually a PCIe device. + * o That this is a root port (see the big theory statement to understand this + * constraint). + */ +void +pcie_fabric_setup(dev_info_t *dip) +{ + pcie_bus_t *bus_p; + pcie_fabric_data_t *fab; + dev_info_t *pdip; + int circular_count; + + bus_p = PCIE_DIP2BUS(dip); + if (bus_p == NULL || !PCIE_IS_RP(bus_p)) { + return; + } + + VERIFY3P(bus_p->bus_fab, !=, NULL); + fab = bus_p->bus_fab; + + /* + * For us to call ddi_walk_devs(), our parent needs to be held. + * ddi_walk_devs() will take care of grabbing our dip as part of its + * walk before we iterate over our children. + * + * A reasonable question to ask here is why is it safe to ask for our + * parent? In this case, because we have entered here through some + * thread that's operating on us whether as part of attach or a hotplug + * event, our dip somewhat by definition has to be valid. If we were + * looking at our dip's children and then asking them for a parent, then + * that would be a race condition. + */ + pdip = ddi_get_parent(dip); + VERIFY3P(pdip, !=, NULL); + ndi_devi_enter(pdip, &circular_count); + fab->pfd_flags |= PCIE_FABRIC_F_SCANNING; + + /* + * Reinitialize the tracking structure to basically set the maximum + * caps. These will be chipped away during the scan. 
+ */ + fab->pfd_mps_found = PCIE_DEVCAP_MAX_PAYLOAD_4096; + fab->pfd_tag_found = PCIE_TAG_ALL; + fab->pfd_flags &= ~PCIE_FABRIC_F_COMPLEX; + + ddi_walk_devs(dip, pcie_fabric_feature_scan, fab); + + if ((fab->pfd_flags & PCIE_FABRIC_F_COMPLEX) != 0) { + fab->pfd_tag_act = PCIE_TAG_5B; + fab->pfd_mps_act = PCIE_DEVCAP_MAX_PAYLOAD_128; + } else { + fab->pfd_tag_act = fab->pfd_tag_found; + fab->pfd_mps_act = fab->pfd_mps_found; + } + + ddi_walk_devs(dip, pcie_fabric_feature_set, fab); + + fab->pfd_flags &= ~PCIE_FABRIC_F_SCANNING; + ndi_devi_exit(pdip, circular_count); +} diff --git a/usr/src/uts/common/io/pciex/pcieb.c b/usr/src/uts/common/io/pciex/pcieb.c index d81a12b6c0..8ca85e6543 100644 --- a/usr/src/uts/common/io/pciex/pcieb.c +++ b/usr/src/uts/common/io/pciex/pcieb.c @@ -24,6 +24,7 @@ /* * Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved. * Copyright 2019 Joyent, Inc. + * Copyright 2022 Oxide Computer Company */ /* @@ -593,12 +594,16 @@ pcieb_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) pcieb_plat_attach_workaround(devi); /* - * If this is a root port, determine and set the max payload size. - * Since this will involve scanning the fabric, all error enabling - * and sw workarounds should be in place before doing this. + * If this is a root port, we need to go through and at this point in + * time set up and initialize all fabric-wide settings such as the max + * packet size, tagging, etc. Since this will involve scanning the + * fabric, all error enabling and sw workarounds should be in place + * before doing this. For hotplug-capable bridges, this will happen + * again when a hotplug event occurs. See the pcie theory statement in + * uts/common/io/pciex/pcie.c for more information. 
*/ if (PCIE_IS_RP(bus_p)) - pcie_init_root_port_mps(devi); + pcie_fabric_setup(devi); ddi_report_dev(devi); return (DDI_SUCCESS); diff --git a/usr/src/uts/common/os/port_subr.c b/usr/src/uts/common/os/port_subr.c index 4c1de94e20..eeb63f27fd 100644 --- a/usr/src/uts/common/os/port_subr.c +++ b/usr/src/uts/common/os/port_subr.c @@ -22,10 +22,9 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2022 Oxide Computer Company */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This file containts all the functions required for interactions of * event sources with the event port file system. @@ -623,10 +622,7 @@ port_remove_fd_object(portfd_t *pfd, port_t *pp, port_fdcache_t *pcp) int removed = 0; ASSERT(MUTEX_HELD(&pcp->pc_lock)); - if (pdp->pd_php != NULL) { - pollhead_delete(pdp->pd_php, pdp); - pdp->pd_php = NULL; - } + polldat_disassociate(pdp); pkevp = pdp->pd_portev; portq = &pp->port_queue; mutex_enter(&portq->portq_mutex); diff --git a/usr/src/uts/common/os/sunddi.c b/usr/src/uts/common/os/sunddi.c index ac48bf31b7..1874a2ef00 100644 --- a/usr/src/uts/common/os/sunddi.c +++ b/usr/src/uts/common/os/sunddi.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2014 Garrett D'Amore <garrett@damore.org> + * Copyright 2022 Garrett D'Amore */ #include <sys/note.h> @@ -5325,16 +5325,6 @@ ddi_append_minor_node(dev_info_t *ddip, struct ddi_minor_data *dmdp) ndi_devi_exit(ddip, circ); } -/* - * Part of the obsolete SunCluster DDI Hooks. 
- * Keep for binary compatibility - */ -minor_t -ddi_getiminor(dev_t dev) -{ - return (getminor(dev)); -} - static int i_log_devfs_minor_create(dev_info_t *dip, char *minor_name) { diff --git a/usr/src/uts/common/sys/audio/audio_driver.h b/usr/src/uts/common/sys/audio/audio_driver.h index 3b124b88c6..306d76e333 100644 --- a/usr/src/uts/common/sys/audio/audio_driver.h +++ b/usr/src/uts/common/sys/audio/audio_driver.h @@ -168,16 +168,6 @@ void audio_dump_dwords(const uint32_t *w, int dcount); #define ENGINE_NDELAY (1U << 21) /* non-blocking open */ /* - * entry points used by legacy SADA drivers - */ -int audio_legacy_open(queue_t *, dev_t *, int, int, cred_t *); -int audio_legacy_close(queue_t *, int, cred_t *); -int audio_legacy_wput(queue_t *, mblk_t *); -int audio_legacy_wsrv(queue_t *); - - - -/* * Audio device controls */ diff --git a/usr/src/uts/common/sys/ib/clients/of/sol_uverbs/sol_uverbs.h b/usr/src/uts/common/sys/ib/clients/of/sol_uverbs/sol_uverbs.h index 4da3a7a853..ad34be4b79 100644 --- a/usr/src/uts/common/sys/ib/clients/of/sol_uverbs/sol_uverbs.h +++ b/usr/src/uts/common/sys/ib/clients/of/sol_uverbs/sol_uverbs.h @@ -116,7 +116,7 @@ typedef struct uverbs_ufile_uobj { kmutex_t lock; int ref; kcondvar_t poll_wait; - struct pollhead poll_head; + pollhead_t poll_head; struct uverbs_uctxt_uobj *uctxt; int is_async; llist_head_t event_list; diff --git a/usr/src/uts/common/sys/overlay_impl.h b/usr/src/uts/common/sys/overlay_impl.h index a9d9e842ad..6ac8abc936 100644 --- a/usr/src/uts/common/sys/overlay_impl.h +++ b/usr/src/uts/common/sys/overlay_impl.h @@ -146,7 +146,7 @@ typedef struct overlay_target_entry { #define OVERLAY_CTL "overlay" -#define OVERLAY_FREEMSG(mp, reason) \ +#define OVERLAY_FREEMSG(mp, reason) \ DTRACE_PROBE2(overlay__freemsg, mblk_t *, mp, char *, reason) extern dev_info_t *overlay_dip; diff --git a/usr/src/uts/common/sys/pcie.h b/usr/src/uts/common/sys/pcie.h index 1901b200d8..751fb449e0 100644 --- a/usr/src/uts/common/sys/pcie.h +++ 
b/usr/src/uts/common/sys/pcie.h @@ -24,7 +24,7 @@ */ /* * Copyright 2019 Joyent, Inc. - * Copyright 2021 Oxide Computer Company + * Copyright 2022 Oxide Computer Company */ #ifndef _SYS_PCIE_H @@ -460,7 +460,7 @@ extern "C" { #define PCIE_DEVCTL2_IDO_COMPL_EN 0x200 #define PCIE_DEVCTL2_LTR_MECH_EN 0x400 #define PCIE_DEVCTL2_EPR_REQ 0x800 -#define PCIE_DEVCTL2_10BTAG_REQ_EN 0x1000 +#define PCIE_DEVCTL2_10B_TAG_REQ_EN 0x1000 #define PCIE_DEVCTL2_OBFF_MASK 0x6000 #define PCIE_DEVCTL2_OBFF_DISABLE 0x0000 #define PCIE_DEVCTL2_OBFF_EN_VARA 0x2000 @@ -776,6 +776,56 @@ extern "C" { #define PCIE_ARI_CTRL_FUNC_GRP_MASK 0x7 /* + * PCIe Device 3 Extended Capability Header (PCIE_EXT_CAP_ID_DEV3) + */ +#define PCIE_DEVCAP3 0x04 +#define PCIE_DEVCAP3_DMWR_REQ_ROUTE 0x01 +#define PCIE_DEVCAP3_14B_TAG_COMP_SUP 0x02 +#define PCIE_DEVCAP3_14B_TAG_REQ_SUP 0x04 +#define PCIE_DEVCAP3_PORT_L0P_SUP 0x08 +#define PCIE_DEVCAP3_PORT_L0P_EXIT_LAT_MASK 0x070 +#define PCIE_DEVCAP3_PORT_L0P_EXIT_LAT_MIN 0x0 /* < 1us */ +#define PCIE_DEVCAP3_PORT_L0P_EXIT_LAT_1us 0x1 /* [ 1us, 2us ) */ +#define PCIE_DEVCAP3_PORT_L0P_EXIT_LAT_2us 0x2 /* [ 2us, 4us ) */ +#define PCIE_DEVCAP3_PORT_L0P_EXIT_LAT_4us 0x3 /* [ 4us, 8us ) */ +#define PCIE_DEVCAP3_PORT_L0P_EXIT_LAT_8us 0x4 /* [ 8us, 16us ) */ +#define PCIE_DEVCAP3_PORT_L0P_EXIT_LAT_16us 0x5 /* [ 16us, 32us ) */ +#define PCIE_DEVCAP3_PORT_L0P_EXIT_LAT_32us 0x6 /* [ 32us, 64us ] */ +#define PCIE_DEVCAP3_PORT_L0P_EXIT_LAT_MAX 0x7 /* > 64us */ +#define PCIE_DEVCAP3_RTMR_L0P_EXIT_LAT_MASK 0x380 +#define PCIE_DEVCAP3_RTMR_L0P_EXIT_LAT_MIN 0x0 /* < 1us */ +#define PCIE_DEVCAP3_RTMR_L0P_EXIT_LAT_1us 0x1 /* [ 1us, 2us ) */ +#define PCIE_DEVCAP3_RTMR_L0P_EXIT_LAT_2us 0x2 /* [ 2us, 4us ) */ +#define PCIE_DEVCAP3_RTMR_L0P_EXIT_LAT_4us 0x3 /* [ 4us, 8us ) */ +#define PCIE_DEVCAP3_RTMR_L0P_EXIT_LAT_8us 0x4 /* [ 8us, 16us ) */ +#define PCIE_DEVCAP3_RTMR_L0P_EXIT_LAT_16us 0x5 /* [ 16us, 32us ) */ +#define PCIE_DEVCAP3_RTMR_L0P_EXIT_LAT_32us 0x6 /* [ 32us, 64us ] */ 
+#define PCIE_DEVCAP3_RTMR_L0P_EXIT_LAT_MAX 0x7 /* > 64us */ + +#define PCIE_DEVCTL3 0x08 +#define PCIE_DEVCTL3_DMWR_REQ_EN 0x01 +#define PCIE_DEVCTL3_DMWR_EG_BLOCK 0x02 +#define PCIE_DEVCTL3_14B_TAG_REQ_EN 0x04 +#define PCIE_DEVCTL3_L0P_EN 0x08 +#define PCIE_DEVCTL3_TARGET_WIDTH_MASK 0x70 +#define PCIE_DEVCTL3_TARGET_WIDTH_X1 0x00 +#define PCIE_DEVCTL3_TARGET_WIDTH_X2 0x10 +#define PCIE_DEVCTL3_TARGET_WIDTH_X4 0x20 +#define PCIE_DEVCTL3_TARGET_WIDTH_X8 0x30 +#define PCIE_DEVCTL3_TARGET_WIDTH_X16 0x40 +#define PCIE_DEVCTL3_TARGET_WIDTH_DYN 0x70 + +#define PCIE_DEVSTS3 0x0c +#define PCIE_DEVSTS3_INIT_WIDTH_MASK 0x07 +#define PCIE_DEVSTS3_INIT_WIDTH_X1 0x00 +#define PCIE_DEVSTS3_INIT_WIDTH_X2 0x01 +#define PCIE_DEVSTS3_INIT_WIDTH_X4 0x02 +#define PCIE_DEVSTS3_INIT_WIDTH_X8 0x03 +#define PCIE_DEVSTS3_INIT_WIDTH_X16 0x04 +#define PCIE_DEVSTS3_SEG_CAP 0x08 +#define PCIE_DEVSTS3_REM_L0P_SUP 0x10 + +/* * PCI-E Common TLP Header Fields */ #define PCIE_TLP_FMT_3DW 0x00 diff --git a/usr/src/uts/common/sys/pcie_impl.h b/usr/src/uts/common/sys/pcie_impl.h index 07eb6fee65..501d9839b0 100644 --- a/usr/src/uts/common/sys/pcie_impl.h +++ b/usr/src/uts/common/sys/pcie_impl.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2019 Joyent, Inc. + * Copyright 2022 Oxide Computer Company */ #ifndef _SYS_PCIE_IMPL_H @@ -330,6 +331,52 @@ typedef enum { } pcie_lbw_state_t; /* + * This structure is used to keep track of a given bus hierarchy and the set of + * PCIe tags that we have enabled on it. + */ +typedef enum { + PCIE_TAG_5B = 0, + PCIE_TAG_8B = 1 << 0, + PCIE_TAG_10B_COMP = 1 << 1, + PCIE_TAG_14B_COMP = 1 << 2 +} pcie_tag_t; + +#define PCIE_TAG_ALL (PCIE_TAG_8B | PCIE_TAG_10B_COMP | PCIE_TAG_14B_COMP) + +typedef enum { + /* + * This flag is kept around for debugging and noticing that we're in the + * process of trying to perform a scan. 
+ */ + PCIE_FABRIC_F_SCANNING = 1 << 0, + /* + * This is used to indicate that we have discovered a topology that is + * too complex for us to be able to set advanced settings on and + * therefore have to leave it at the bare minimum. + */ + PCIE_FABRIC_F_COMPLEX = 1 << 1, + /* + * Indicates that we found a hot-pluggable root port in the fabric. + */ + PCIE_FABRIC_F_RP_HP = 1 << 2 +} pcie_fabric_flags_t; + +/* + * This structure represents hierarchy wide settings that are used in a given + * PCIe fabric (what the spec calls a "hierarchy domain"). This keeps track of + * what we have found and enabled in the fabric as part of our initialization. + * For more information on this, please see the theory statement in + * uts/common/io/pciex/pcie.c. + */ +typedef struct pice_fabric_data { + pcie_fabric_flags_t pfd_flags; + uint16_t pfd_mps_found; + uint16_t pfd_mps_act; + pcie_tag_t pfd_tag_found; + pcie_tag_t pfd_tag_act; +} pcie_fabric_data_t; + +/* * For hot plugged device, these data are init'ed during probe * For non-hotplugged device, these data are init'ed in pci_autoconfig (on x86), * or in px_attach()(on sparc). 
@@ -355,8 +402,10 @@ typedef struct pcie_bus { uint8_t bus_hdr_type; /* pci header type, see pci.h */ uint16_t bus_dev_type; /* PCI-E dev type, see pcie.h */ uint8_t bus_bdg_secbus; /* Bridge secondary bus num */ + uint8_t bus_pcie_vers; /* Version of the PCIe cap */ uint16_t bus_pcie_off; /* PCIe Capability Offset */ uint16_t bus_aer_off; /* PCIe Advanced Error Offset */ + uint16_t bus_dev3_off; /* PCIe Device 3 Capability */ uint16_t bus_pcix_off; /* PCIx Capability Offset */ uint16_t bus_pci_hp_off; /* PCI HP (SHPC) Cap Offset */ uint16_t bus_ecc_ver; /* PCIX ecc version */ @@ -370,8 +419,6 @@ typedef struct pcie_bus { pf_data_t *bus_pfd; pcie_domain_t *bus_dom; - int bus_mps; /* Maximum Payload Size */ - void *bus_plat_private; /* Platform specific */ /* Hotplug specific fields */ pcie_hp_mode_t bus_hp_sup_modes; /* HP modes supported */ @@ -404,6 +451,14 @@ typedef struct pcie_bus { uint64_t bus_lbw_nevents; char *bus_lbw_pbuf; char *bus_lbw_cbuf; + + /* + * The following contains fabric wide settings and information that are + * used. This member is only valid on the root port. It is NULL on all + * other pcie_bus_t members who instead need to access this through the + * corresponding root port dip information. + */ + pcie_fabric_data_t *bus_fab; } pcie_bus_t; /* @@ -511,11 +566,6 @@ typedef struct pf_impl { #define PCIE_INVALID_BDF 0xFFFF #define PCIE_CHECK_VALID_BDF(x) (x != PCIE_INVALID_BDF) -typedef struct { - dev_info_t *dip; - int highest_common_mps; -} pcie_max_supported_t; - /* * Default interrupt priority for all PCI and PCIe nexus drivers including * hotplug interrupts. 
@@ -585,7 +635,7 @@ extern int pcie_ioctl(dev_info_t *dip, dev_t dev, int cmd, intptr_t arg, extern int pcie_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags, char *name, caddr_t valuep, int *lengthp); -extern void pcie_init_root_port_mps(dev_info_t *dip); +extern void pcie_fabric_setup(dev_info_t *dip); extern int pcie_initchild(dev_info_t *dip); extern void pcie_uninitchild(dev_info_t *dip); extern int pcie_init_cfghdl(dev_info_t *dip); @@ -612,10 +662,7 @@ extern int pcie_get_bdf_from_dip(dev_info_t *dip, pcie_req_id_t *bdf); extern dev_info_t *pcie_get_my_childs_dip(dev_info_t *dip, dev_info_t *rdip); extern uint32_t pcie_get_bdf_for_dma_xfer(dev_info_t *dip, dev_info_t *rdip); extern int pcie_dev(dev_info_t *dip); -extern void pcie_get_fabric_mps(dev_info_t *rc_dip, dev_info_t *dip, - int *max_supported); extern int pcie_root_port(dev_info_t *dip); -extern int pcie_initchild_mps(dev_info_t *dip); extern void pcie_set_rber_fatal(dev_info_t *dip, boolean_t val); extern boolean_t pcie_get_rber_fatal(dev_info_t *dip); diff --git a/usr/src/uts/common/sys/poll_impl.h b/usr/src/uts/common/sys/poll_impl.h index 3e0eb3b21f..cd0a571c47 100644 --- a/usr/src/uts/common/sys/poll_impl.h +++ b/usr/src/uts/common/sys/poll_impl.h @@ -26,6 +26,7 @@ /* * Copyright 2017 Joyent, Inc. 
+ * Copyright 2022 Oxide Computer Company */ #ifndef _SYS_POLL_IMPL_H @@ -276,11 +277,11 @@ extern void pollhead_clean(pollhead_t *); /* * private poll head interfaces: * - * pollhead_insert adds a polldat to a pollhead list - * pollhead_delete removes a polldat from a pollhead list + * polldat_associate adds a polldat to a pollhead list + * polldat_disassociate remove polldat from its assoc'd pollhead list */ -extern void pollhead_insert(pollhead_t *, polldat_t *); -extern void pollhead_delete(pollhead_t *, polldat_t *); +extern void polldat_associate(polldat_t *, pollhead_t *); +extern void polldat_disassociate(polldat_t *); /* * poll state interfaces: diff --git a/usr/src/uts/common/syscall/poll.c b/usr/src/uts/common/syscall/poll.c index b66f4f430f..373c86c474 100644 --- a/usr/src/uts/common/syscall/poll.c +++ b/usr/src/uts/common/syscall/poll.c @@ -25,11 +25,12 @@ */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ -/* All Rights Reserved */ +/* All Rights Reserved */ /* * Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright (c) 2017, Joyent, Inc. + * Copyright 2022 Oxide Computer Company */ /* @@ -198,18 +199,18 @@ static int plist_chkdupfd(file_t *, polldat_t *, pollstate_t *, pollfd_t *, int, * * When removing a polled fd from poll cache, the fd is always removed * from pollhead list first and then from fpollinfo list, i.e., - * pollhead_delete() is called before delfpollinfo(). + * polldat_disassociate() is called before delfpollinfo(). * * * Locking hierarchy: * pc_no_exit is a leaf level lock. * ps_lock is held when acquiring pc_lock (except when pollwakeup * acquires pc_lock). - * pc_lock might be held when acquiring PHLOCK (pollhead_insert/ - * pollhead_delete) + * pc_lock might be held when acquiring PHLOCK (polldat_associate/ + * polldat_disassociate) * pc_lock is always held (but this is not required) - * when acquiring PHLOCK (in polladd/pollhead_delete and pollwakeup called - * from pcache_clean_entry). 
+ * when acquiring PHLOCK (in polladd/polldat_disassociate and pollwakeup + * called from pcache_clean_entry). * pc_lock is held across addfpollinfo/delfpollinfo which acquire * uf_lock. * pc_lock is held across getf/releasef which acquire uf_lock. @@ -824,7 +825,7 @@ retry: if ((pdp->pd_events & events) || (events & (POLLHUP | POLLERR))) { - pollcache_t *pcp; + pollcache_t *pcp; if (pdp->pd_portev != NULL) { port_kevent_t *pkevp = pdp->pd_portev; @@ -977,53 +978,87 @@ pollnotify(pollcache_t *pcp, int fd) } /* - * add a polldat entry to pollhead ph_list. The polldat struct is used - * by pollwakeup to wake sleeping pollers when polled events has happened. + * Associate a polldat entry with a pollhead (add it to ph_list). + * + * The polldat struct is used by pollwakeup to wake sleeping pollers when polled + * events have happened. */ void -pollhead_insert(pollhead_t *php, polldat_t *pdp) +polldat_associate(polldat_t *pdp, pollhead_t *php) { + ASSERT3P(pdp->pd_php, ==, NULL); + ASSERT3P(pdp->pd_next, ==, NULL); + PH_ENTER(php); - ASSERT(pdp->pd_next == NULL); #ifdef DEBUG - { - /* - * the polldat should not be already on the list - */ - polldat_t *wp; - for (wp = php->ph_list; wp; wp = wp->pd_next) { - ASSERT(wp != pdp); - } + /* The polldat should not be already on the list */ + for (polldat_t *wp = php->ph_list; wp != NULL; wp = wp->pd_next) { + ASSERT3P(wp, !=, pdp); } #endif /* DEBUG */ + pdp->pd_next = php->ph_list; php->ph_list = pdp; + pdp->pd_php = php; PH_EXIT(php); } /* - * Delete the polldat entry from ph_list. + * Disassociate a polldat from its pollhead (if such an association exists). */ void -pollhead_delete(pollhead_t *php, polldat_t *pdp) +polldat_disassociate(polldat_t *pdp) { - polldat_t *wp; - polldat_t **wpp; + pollhead_t *php; - PH_ENTER(php); - for (wpp = &php->ph_list; (wp = *wpp) != NULL; wpp = &wp->pd_next) { + /* + * Acquire the lock for the pollhead which this polldat is associated + * with. 
This must be done with care, re-checking pd_php after entering + * the pollhead lock, since a racing pollhead_clean() could have already + * performed the disassociation. + */ + for (;;) { + php = pdp->pd_php; + if (php == NULL) { + /* polldat is not associated with a pollhead */ + return; + } + + /* + * The lock for a given pollhead is not stored in the pollhead + * itself, but is rather a global entry in an array (plocks) + * which the pollhead pointer hashes into (see: PHLOCK()). + */ + PH_ENTER(php); + if (pdp->pd_php == php) { + break; + } + PH_EXIT(php); + } + + polldat_t **wpp = &php->ph_list, *wp = php->ph_list; + while (wp != NULL) { if (wp == pdp) { + /* Unlink the polldat from the list */ *wpp = pdp->pd_next; pdp->pd_next = NULL; break; } + wpp = &wp->pd_next; + wp = wp->pd_next; } + #ifdef DEBUG - /* assert that pdp is no longer in the list */ + /* It would be unexpected if pdp was not in the pollhead list */ + ASSERT(wp != NULL); + + /* Assert that pdp is not duplicated somewhere later in the list */ for (wp = *wpp; wp; wp = wp->pd_next) { ASSERT(wp != pdp); } #endif /* DEBUG */ + + pdp->pd_php = NULL; PH_EXIT(php); } @@ -1193,7 +1228,7 @@ pcache_grow_hashtbl(pollcache_t *pcp, nfds_t nfds) void pcache_grow_map(pollcache_t *pcp, int fd) { - int newsize; + int newsize; ulong_t *newmap; /* @@ -1231,10 +1266,7 @@ pcache_clean(pollcache_t *pcp) hashtbl = pcp->pc_hash; for (i = 0; i < pcp->pc_hashsize; i++) { for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) { - if (pdp->pd_php != NULL) { - pollhead_delete(pdp->pd_php, pdp); - pdp->pd_php = NULL; - } + polldat_disassociate(pdp); if (pdp->pd_fp != NULL) { delfpollinfo(pdp->pd_fd); pdp->pd_fp = NULL; @@ -1246,7 +1278,7 @@ pcache_clean(pollcache_t *pcp) void pcacheset_invalidate(pollstate_t *ps, polldat_t *pdp) { - int i; + int i; int fd = pdp->pd_fd; /* @@ -1418,8 +1450,7 @@ pcache_insert(pollstate_t *ps, file_t *fp, pollfd_t *pollfdp, int *fdcntp, } if (memphp) { if (pdp->pd_php == NULL) { - 
pollhead_insert(memphp, pdp); - pdp->pd_php = memphp; + polldat_associate(pdp, memphp); } else { if (memphp != pdp->pd_php) { /* @@ -1427,9 +1458,8 @@ pcache_insert(pollstate_t *ps, file_t *fp, pollfd_t *pollfdp, int *fdcntp, * may change the vnode and thus the pollhead * pointer out from underneath us. */ - pollhead_delete(pdp->pd_php, pdp); - pollhead_insert(memphp, pdp); - pdp->pd_php = memphp; + polldat_disassociate(pdp); + polldat_associate(pdp, memphp); } } } @@ -1472,16 +1502,14 @@ pcache_delete_fd(pollstate_t *ps, int fd, size_t pos, int which, uint_t cevent) refp->xf_position = POLLPOSINVAL; ASSERT(refp->xf_refcnt == 1); refp->xf_refcnt = 0; - if (pdp->pd_php) { - /* - * It is possible for a wakeup thread to get ahead - * of the following pollhead_delete and set the bit in - * bitmap. It is OK because the bit will be cleared - * here anyway. - */ - pollhead_delete(pdp->pd_php, pdp); - pdp->pd_php = NULL; - } + + /* + * It is possible for a wakeup thread to get ahead of the + * following polldat_disassociate and set the bit in bitmap. + * That is OK because the bit will be cleared here anyway. 
+ */ + polldat_disassociate(pdp); + pdp->pd_count = 0; if (pdp->pd_fp != NULL) { pdp->pd_fp = NULL; @@ -1550,7 +1578,7 @@ pcache_update_xref(pollcache_t *pcp, int fd, ssize_t pos, int which) static int pollchecksanity(pollstate_t *ps, nfds_t nfds) { - int i; + int i; int fd; pollcache_t *pcp = ps->ps_pcache; polldat_t *pdp; @@ -1588,7 +1616,7 @@ pollchecksanity(pollstate_t *ps, nfds_t nfds) int pcacheset_resolve(pollstate_t *ps, nfds_t nfds, int *fdcntp, int which) { - int i; + int i; pollcache_t *pcp = ps->ps_pcache; pollfd_t *newlist = NULL; pollfd_t *current = ps->ps_pollfd; @@ -1912,8 +1940,8 @@ pcache_poll(pollfd_t *pollfdp, pollstate_t *ps, nfds_t nfds, int *fdcntp, { int i; pollcache_t *pcp; - int fd; - int begin, end, done; + int fd; + int begin, end, done; pollhead_t *php; int fdcnt; int error = 0; @@ -2056,9 +2084,8 @@ retry: if (php != NULL && pdp->pd_php != NULL && php != pdp->pd_php) { releasef(fd); - pollhead_delete(pdp->pd_php, pdp); - pdp->pd_php = php; - pollhead_insert(php, pdp); + polldat_disassociate(pdp); + polldat_associate(pdp, php); /* * We could have missed a wakeup on the new * target device. Make sure the new target @@ -2109,8 +2136,7 @@ retry: * do it now. */ if ((pdp->pd_php == NULL) && (php != NULL)) { - pdp->pd_php = php; - pollhead_insert(php, pdp); + polldat_associate(pdp, php); /* * We are inserting a polldat struct for * the first time. We may have missed a @@ -2291,9 +2317,14 @@ pcache_clean_entry(pollstate_t *ps, int fd) } } if (pdp->pd_php) { + /* + * Using pdp->pd_php is a bit risky here, as we lack any + * protection from a racing close operation which could free + * that pollhead prior to pollwakeup() acquiring the locks + * necessary to make it safe. 
+ */ pollwakeup(pdp->pd_php, POLLHUP); - pollhead_delete(pdp->pd_php, pdp); - pdp->pd_php = NULL; + polldat_disassociate(pdp); } } @@ -3086,9 +3117,8 @@ plist_chkdupfd(file_t *fp, polldat_t *pdp, pollstate_t *psp, pollfd_t *pollfdp, */ if (php != NULL && pdp->pd_php != NULL && php != pdp->pd_php) { - pollhead_delete(pdp->pd_php, pdp); - pdp->pd_php = php; - pollhead_insert(php, pdp); + polldat_disassociate(pdp); + polldat_associate(pdp, php); /* * We could have missed a wakeup on the * new target device. Make sure the new diff --git a/usr/src/uts/common/xen/io/evtchn_dev.c b/usr/src/uts/common/xen/io/evtchn_dev.c index 7a8d50eb33..3fbc48e271 100644 --- a/usr/src/uts/common/xen/io/evtchn_dev.c +++ b/usr/src/uts/common/xen/io/evtchn_dev.c @@ -94,7 +94,7 @@ struct evtsoftdata { kcondvar_t evtchn_wait; /* Processes wait on this when ring is empty. */ kmutex_t evtchn_lock; - struct pollhead evtchn_pollhead; + pollhead_t evtchn_pollhead; pid_t pid; /* last pid to bind to this event channel. */ processorid_t cpu; /* cpu thread/evtchn is bound to */ @@ -551,6 +551,7 @@ evtchndrv_close(dev_t dev, int flag, int otyp, struct cred *credp) mutex_exit(&port_user_lock); kmem_free(ep->ring, PAGESIZE); + pollhead_clean(&ep->evtchn_pollhead); ddi_soft_state_free(evtchndrv_statep, EVTCHNDRV_MINOR2INST(minor)); /* diff --git a/usr/src/uts/intel/io/hotplug/pcicfg/pcicfg.c b/usr/src/uts/intel/io/hotplug/pcicfg/pcicfg.c index 468da22085..3ced3c0077 100644 --- a/usr/src/uts/intel/io/hotplug/pcicfg/pcicfg.c +++ b/usr/src/uts/intel/io/hotplug/pcicfg/pcicfg.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2019 Joyent, Inc. 
+ * Copyright 2022 Oxide Computer Company */ /* @@ -42,7 +43,6 @@ #include <sys/hotplug/pci/pcicfg.h> #include <sys/ndi_impldefs.h> #include <sys/pci_cfgacc.h> -#include <sys/pcie_impl.h> /* * ************************************************************************ @@ -716,6 +716,18 @@ next: DEBUG1("Next Function - %x\n", func); } + /* + * At this point we have set up the various dev_info nodes that we + * expect to see in the tree and we must re-evaluate the general fabric + * settings such as the overall max payload size or the tagging that is + * enabled. However, as part of the big theory statement in pcie.c, this + * can only be performed on a root port; however, that determination + * will be made by the fabric scanning logic. + */ + if (visited > 0 && is_pcie) { + pcie_fabric_setup(devi); + } + ndi_devi_exit(devi, circ); if (visited == 0) diff --git a/usr/src/uts/intel/io/ipmi/ipmi.c b/usr/src/uts/intel/io/ipmi/ipmi.c index 7fb5b757f6..7213ccc281 100644 --- a/usr/src/uts/intel/io/ipmi/ipmi.c +++ b/usr/src/uts/intel/io/ipmi/ipmi.c @@ -120,7 +120,7 @@ ipmi_complete_request(struct ipmi_softc *sc, struct ipmi_request *req) } else { dev = req->ir_owner; TAILQ_INSERT_TAIL(&dev->ipmi_completed_requests, req, ir_link); - pollwakeup(dev->ipmi_pollhead, POLLIN | POLLRDNORM); + pollwakeup(&dev->ipmi_pollhead, POLLIN | POLLRDNORM); dev->ipmi_status &= ~IPMI_BUSY; if (dev->ipmi_status & IPMI_CLOSING) diff --git a/usr/src/uts/intel/io/ipmi/ipmi_main.c b/usr/src/uts/intel/io/ipmi/ipmi_main.c index e7671ce734..4c64a70a14 100644 --- a/usr/src/uts/intel/io/ipmi/ipmi_main.c +++ b/usr/src/uts/intel/io/ipmi/ipmi_main.c @@ -170,8 +170,6 @@ ipmi_open(dev_t *devp, int flag, int otyp, cred_t *cred) /* Initialize the per file descriptor data. 
*/ dev = kmem_zalloc(sizeof (ipmi_device_t), KM_SLEEP); - dev->ipmi_pollhead = kmem_zalloc(sizeof (pollhead_t), KM_SLEEP); - TAILQ_INIT(&dev->ipmi_completed_requests); dev->ipmi_address = IPMI_BMC_SLAVE_ADDR; dev->ipmi_lun = IPMI_BMC_SMS_LUN; @@ -225,7 +223,7 @@ ipmi_close(dev_t dev, int flag, int otyp, cred_t *cred) mutex_exit(&dev_list_lock); id_free(minor_ids, getminor(dev)); cv_destroy(&dp->ipmi_cv); - kmem_free(dp->ipmi_pollhead, sizeof (pollhead_t)); + pollhead_clean(&dp->ipmi_pollhead); kmem_free(dp, sizeof (ipmi_device_t)); return (0); @@ -463,7 +461,7 @@ ipmi_poll(dev_t dv, short events, int anyyet, short *reventsp, } if ((revent == 0 && !anyyet) || (events & POLLET)) { - *phpp = dev->ipmi_pollhead; + *phpp = &dev->ipmi_pollhead; } *reventsp = revent; diff --git a/usr/src/uts/intel/io/ipmi/ipmivars.h b/usr/src/uts/intel/io/ipmi/ipmivars.h index f547d6f043..cd73753438 100644 --- a/usr/src/uts/intel/io/ipmi/ipmivars.h +++ b/usr/src/uts/intel/io/ipmi/ipmivars.h @@ -85,7 +85,7 @@ struct ipmi_softc; /* Per file descriptor data. */ typedef struct ipmi_device { TAILQ_HEAD(, ipmi_request) ipmi_completed_requests; - pollhead_t *ipmi_pollhead; + pollhead_t ipmi_pollhead; int ipmi_requests; uchar_t ipmi_address; /* IPMB address. */ uchar_t ipmi_lun; |
