diff options
author | Alex Wilson <alex@uq.edu.au> | 2020-01-08 18:05:31 +1000 |
---|---|---|
committer | Robert Mustacchi <rm@fingolfin.org> | 2020-03-04 02:46:52 +0000 |
commit | 12eb87fbfbcd9e0abde89898daa0a87c695807e4 (patch) | |
tree | 954328dd9aa76b62eac3439d6e6d14deb91bb134 | |
parent | a23b3b1bb4e08abaac9fb78fea486e678ce6d6de (diff) | |
download | illumos-joyent-12eb87fbfbcd9e0abde89898daa0a87c695807e4.tar.gz |
12205 want generic NIC transceiver fault events
Reviewed by: Robert Mustacchi <rm@fingolfin.org>
Reviewed by: Paul Winder <paul@winders.demon.co.uk>
Reviewed by: Rob Johnston <rob.johnston@joyent.com>
Approved by: Garrett D'Amore <garrett@damore.org>
-rw-r--r-- | usr/src/cmd/fm/dicts/Makefile | 3 |
-rw-r--r-- | usr/src/cmd/fm/dicts/NIC.dict | 21 |
-rw-r--r-- | usr/src/cmd/fm/dicts/NIC.po | 98 |
-rw-r--r-- | usr/src/cmd/fm/eversholt/files/common/nic.esc | 127 |
-rw-r--r-- | usr/src/cmd/fm/eversholt/files/i386/Makefile | 3 |
-rw-r--r-- | usr/src/cmd/fm/eversholt/files/sparc/Makefile | 3 |
-rw-r--r-- | usr/src/pkg/manifests/consolidation-osnet-osnet-message-files.mf | 1 |
-rw-r--r-- | usr/src/pkg/manifests/service-fault-management.mf | 5 |
-rw-r--r-- | usr/src/uts/common/sys/fm/io/ddi.h | 11 |
9 files changed, 269 insertions, 3 deletions
diff --git a/usr/src/cmd/fm/dicts/Makefile b/usr/src/cmd/fm/dicts/Makefile index 22bebd3ae8..93e0303f83 100644 --- a/usr/src/cmd/fm/dicts/Makefile +++ b/usr/src/cmd/fm/dicts/Makefile @@ -38,7 +38,8 @@ common_DCNAMES = \ SCA1000 \ SENSOR \ STORAGE \ - TEST + TEST \ + NIC i386_DCNAMES = \ AMD \ diff --git a/usr/src/cmd/fm/dicts/NIC.dict b/usr/src/cmd/fm/dicts/NIC.dict new file mode 100644 index 0000000000..670dc53d46 --- /dev/null +++ b/usr/src/cmd/fm/dicts/NIC.dict @@ -0,0 +1,21 @@ +# +# Copyright 2020 the University of Queensland +# Use is subject to license terms. +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +FMDICT: name=NIC version=1 maxkey=4 + +fault.io.nic.transceiver.notsupp=0 +fault.io.nic.transceiver.whitelist=1 +fault.io.nic.transceiver.overtemp=2 +fault.io.nic.transceiver.hwfail=3 +fault.io.nic.transceiver.unknown=4 diff --git a/usr/src/cmd/fm/dicts/NIC.po b/usr/src/cmd/fm/dicts/NIC.po new file mode 100644 index 0000000000..46f1c859b9 --- /dev/null +++ b/usr/src/cmd/fm/dicts/NIC.po @@ -0,0 +1,98 @@ +# +# Copyright 2020 the University of Queensland +# Use is subject to license terms. +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. 
+# + +# +# code: NIC-8000-0Q +# keys: fault.io.nic.transceiver.notsupp +# +msgid "NIC-8000-0Q.type" +msgstr "Fault" +msgid "NIC-8000-0Q.severity" +msgstr "Critical" +msgid "NIC-8000-0Q.description" +msgstr "NIC transceiver module %<fault-list[0].resource.hc-specific.txr_index> (SFP/SFP+/QSFP+ etc.) in %<fault-list[0].resource.hc-specific.link-name> is of a type that is not supported. This may be due to an incompatible link type or speed. In some NICs, this may also be caused by enforcement of a vendor or part whitelist.\n\n NIC data link: %<fault-list[0].resource.hc-specific.link-name> (%<fault-list[0].resource.hc-specific.primary-mac-address>)\n Module vendor: %<fault-list[0].resource.hc-specific.vendor>\n Module part: %<fault-list[0].resource.part>\n Module serial: %<fault-list[0].resource.serial>\n\n Refer to %s for more information." +msgid "NIC-8000-0Q.response" +msgstr "The transceiver module has been disabled, and the network data link associated with it (%<fault-list[0].resource.hc-specific.link-name>) has been marked as down.\n" +msgid "NIC-8000-0Q.impact" +msgstr "No network traffic will pass through the data link or network interfaces associated with this transceiver slot.\n" +msgid "NIC-8000-0Q.action" +msgstr "Replace the transceiver module with one of a supported type.\n" + +# +# code: NIC-8000-1C +# keys: fault.io.nic.transceiver.whitelist +# +msgid "NIC-8000-1C.type" +msgstr "Fault" +msgid "NIC-8000-1C.severity" +msgstr "Critical" +msgid "NIC-8000-1C.description" +msgstr "NIC transceiver module %<fault-list[0].resource.hc-specific.txr_index> (SFP/SFP+/QSFP+ etc.) 
in %<fault-list[0].resource.hc-specific.link-name> is of a type that is not allowed to be used with this NIC (due to a hardware-enforced vendor or part whitelist).\n\n NIC data link: %<fault-list[0].resource.hc-specific.link-name> (%<fault-list[0].resource.hc-specific.primary-mac-address>)\n Module vendor: %<fault-list[0].resource.hc-specific.vendor>\n Module part: %<fault-list[0].resource.part>\n Module serial: %<fault-list[0].resource.serial>\n\n Refer to %s for more information." +msgid "NIC-8000-1C.response" +msgstr "The transceiver module has been disabled, and the network data link associated with it (%<fault-list[0].resource.hc-specific.link-name>) has been marked as down.\n" +msgid "NIC-8000-1C.impact" +msgstr "No network traffic will pass through the data link or network\ninterfaces associated with this transceiver slot.\n" +msgid "NIC-8000-1C.action" +msgstr "Replace the transceiver module with one of a supported type.\n" + +# +# code: NIC-8000-2R +# keys: fault.io.nic.transceiver.overtemp +# +msgid "NIC-8000-2R.type" +msgstr "Fault" +msgid "NIC-8000-2R.severity" +msgstr "Critical" +msgid "NIC-8000-2R.description" +msgstr "NIC transceiver module %<fault-list[0].resource.hc-specific.txr_index> (SFP/SFP+/QSFP+ etc.) in %<fault-list[0].resource.hc-specific.link-name> has overheated.\n\n NIC data link: %<fault-list[0].resource.hc-specific.link-name> (%<fault-list[0].resource.hc-specific.primary-mac-address>)\n Module vendor: %<fault-list[0].resource.hc-specific.vendor>\n Module part: %<fault-list[0].resource.part>\n Module serial: %<fault-list[0].resource.serial>\n\n Refer to %s for more information." 
+msgid "NIC-8000-2R.response" +msgstr "The transceiver module has been disabled, and the network data link associated with it (%<fault-list[0].resource.hc-specific.link-name>) has been marked as down.\n" +msgid "NIC-8000-2R.impact" +msgstr "No network traffic will pass through the data link or network interfaces associated with this transceiver slot.\n" +msgid "NIC-8000-2R.action" +msgstr "Remove the transceiver module and check for adequate ventilation\nand cooling. Re-inserting the module after it has cooled will restore service.\n" + +# +# code: NIC-8000-34 +# keys: fault.io.nic.transceiver.hwfail +# +msgid "NIC-8000-34.type" +msgstr "Fault" +msgid "NIC-8000-34.severity" +msgstr "Critical" +msgid "NIC-8000-34.description" +msgstr "NIC transceiver module %<fault-list[0].resource.hc-specific.txr_index> (SFP/SFP+/QSFP+ etc.) in %<fault-list[0].resource.hc-specific.link-name> has experienced a hardware failure.\n\n NIC data link: %<fault-list[0].resource.hc-specific.link-name> (%<fault-list[0].resource.hc-specific.primary-mac-address>)\n Module vendor: %<fault-list[0].resource.hc-specific.vendor>\n Module part: %<fault-list[0].resource.part>\n Module serial: %<fault-list[0].resource.serial>\n\n Refer to %s for more information." 
+msgid "NIC-8000-34.response" +msgstr "The transceiver module has been disabled, and the network data link associated with it (%<fault-list[0].resource.hc-specific.link-name>) has been marked as down.\n" +msgid "NIC-8000-34.impact" +msgstr "No network traffic will pass through the data link or network\ninterfaces associated with this transceiver slot.\n" +msgid "NIC-8000-34.action" +msgstr "Remove and check the transceiver module, and consider replacing it.\n" + +# +# code: NIC-8000-4X +# keys: fault.io.nic.transceiver.unknown +# +msgid "NIC-8000-4X.type" +msgstr "Fault" +msgid "NIC-8000-4X.severity" +msgstr "Critical" +msgid "NIC-8000-4X.description" +msgstr "The slot for NIC transceiver module %<fault-list[0].resource.hc-specific.txr_index> (SFP/SFP+/QSFP+ etc.) in %<fault-list[0].resource.hc-specific.link-name> is occupied, but hardware did not find a valid transceiver in it.\n Refer to %s for more information." +msgid "NIC-8000-4X.response" +msgstr "The transceiver module slot has been disabled, and the network data link associated with it (%<fault-list[0].resource.hc-specific.link-name>) has been marked as down.\n" +msgid "NIC-8000-4X.impact" +msgstr "No network traffic will pass through the data link or network\ninterfaces associated with this transceiver slot.\n" +msgid "NIC-8000-4X.action" +msgstr "Remove and check the transceiver module. It may be faulty,\ninserted incorrectly, or not of the correct type for the slot.\nIf problems persist, consider replacing the module.\n" diff --git a/usr/src/cmd/fm/eversholt/files/common/nic.esc b/usr/src/cmd/fm/eversholt/files/common/nic.esc new file mode 100644 index 0000000000..6dfaf5fa5b --- /dev/null +++ b/usr/src/cmd/fm/eversholt/files/common/nic.esc @@ -0,0 +1,127 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* + * Copyright 2020, the University of Queensland + */ + +#pragma dictionary "NIC" + +/* + * Rules for the generic NIC (non-driver-specific) fault events. + */ + +/* + * Transceiver events are emitted by drivers under ereport.io.nic.txr-err. + * + * These are emitted with detector = the PCI/PCIex function of the NIC. + * They must always have a string property "error", set to one of the + * generic transceiver fault type names (notsupp, whitelist, overtemp etc). + * + * As well as "error", they must have both the "port_index" and "txr_index" + * properties set in the event payload (both integer types). + * + * It is expected that drivers will call ddi_fm_service_impact() immediately + * after noticing a transceiver error, with an argument of DDI_SERVICE_LOST or + * DDI_SERVICE_DEGRADED (depending on the specific error -- at time of writing + * all the supported events expect DDI_SERVICE_LOST). 
+ */ + +asru pcifn; +fru pcifn/port/transceiver; + +asru pciexfn; +fru pciexfn/port/transceiver; + +#define EV_DECL_TXR_FAULT(TYPE) \ + event fault.io.nic.transceiver.TYPE@pcifn/port/transceiver \ + FRU=pcifn/port/transceiver, ASRU=pcifn; \ + event fault.io.nic.transceiver.TYPE@pciexfn/port/transceiver \ + FRU=pciexfn/port/transceiver, ASRU=pciexfn; + +EV_DECL_TXR_FAULT(notsupp) +EV_DECL_TXR_FAULT(whitelist) +EV_DECL_TXR_FAULT(overtemp) +EV_DECL_TXR_FAULT(hwfail) +EV_DECL_TXR_FAULT(unknown) + +event ereport.io.nic.txr-err@pcifn; +event ereport.io.service.lost@pcifn; + +event ereport.io.nic.txr-err@pciexfn; +event ereport.io.service.lost@pciexfn; + +#define EV_PROP_TXR_FAULT(TYPE) \ + prop fault.io.nic.transceiver.TYPE@pcifn/port[pn]/transceiver[tn] (2) -> \ + ereport.io.nic.txr-err@pcifn { \ + payloadprop("txr_index") == tn && \ + payloadprop("port_index") == pn && \ + payloadprop("error") == "TYPE" && \ + setpayloadprop("txr_index", tn) && \ + setpayloadprop("link-name", confprop(pcifn/port[pn], "link-name")) && \ + setpayloadprop("primary-mac-address", confprop(pcifn/port[pn], "primary-mac-address")) && \ + (!confprop_defined(pcifn/port[pn]/transceiver[tn], "vendor") || \ + setpayloadprop("vendor", confprop(pcifn/port[pn]/transceiver[tn], "vendor"))) \ + }, \ + ereport.io.service.lost@pcifn { within(1s) }; \ + prop fault.io.nic.transceiver.TYPE@pciexfn/port[pn]/transceiver[tn] (2) -> \ + ereport.io.nic.txr-err@pciexfn { \ + payloadprop("txr_index") == tn && \ + payloadprop("port_index") == pn && \ + payloadprop("error") == "TYPE" && \ + setpayloadprop("txr_index", tn) && \ + setpayloadprop("link-name", confprop(pciexfn/port[pn], "link-name")) && \ + setpayloadprop("primary-mac-address", confprop(pciexfn/port[pn], "primary-mac-address")) && \ + (!confprop_defined(pciexfn/port[pn]/transceiver[tn], "vendor") || \ + setpayloadprop("vendor", confprop(pciexfn/port[pn]/transceiver[tn], "vendor"))) \ + }, \ + ereport.io.service.lost@pciexfn { within(1s) }; + 
+EV_PROP_TXR_FAULT(notsupp) +EV_PROP_TXR_FAULT(whitelist) +EV_PROP_TXR_FAULT(overtemp) +EV_PROP_TXR_FAULT(hwfail) +EV_PROP_TXR_FAULT(unknown) + +/* + * Allow drivers (e.g. i40e) which can't tell the difference between the events + * notsupp/unknown/whitelist to generate a single ereport covering all 3. + * + * If transceiver information is available in topo, we will turn it into + * a "notsupp" fault. If it isn't, we'll turn it into an "unknown" fault + * instead. The text in "notsupp" explicitly notes that certain drivers might + * have difficulty telling the difference between it and "whitelist". + * + * If you want this for a pcifn driver rather than pciexfn, you'll have to + * make another copy. + */ +prop fault.io.nic.transceiver.notsupp@pciexfn/port[pn]/transceiver[tn] (2) -> + ereport.io.nic.txr-err@pciexfn { + payloadprop("txr_index") == tn && + payloadprop("port_index") == pn && + payloadprop("error") == "notsupp/unknown" && + confprop_defined(pciexfn/port[pn]/transceiver[tn], "vendor") && + setpayloadprop("txr_index", tn) && + setpayloadprop("link-name", confprop(pciexfn/port[pn], "link-name")) && + setpayloadprop("primary-mac-address", confprop(pciexfn/port[pn], "primary-mac-address")) && + setpayloadprop("vendor", confprop(pciexfn/port[pn]/transceiver[tn], "vendor")) + }, + ereport.io.service.lost@pciexfn { within(1s) }; +prop fault.io.nic.transceiver.unknown@pciexfn/port[pn]/transceiver[tn] (2) -> + ereport.io.nic.txr-err@pciexfn { + payloadprop("txr_index") == tn && + payloadprop("port_index") == pn && + payloadprop("error") == "notsupp/unknown" && + !confprop_defined(pciexfn/port[pn]/transceiver[tn], "vendor") && + setpayloadprop("txr_index", tn) && + setpayloadprop("link-name", confprop(pciexfn/port[pn], "link-name")) && + setpayloadprop("primary-mac-address", confprop(pciexfn/port[pn], "primary-mac-address")) + }, + ereport.io.service.lost@pciexfn { within(1s) }; diff --git a/usr/src/cmd/fm/eversholt/files/i386/Makefile 
b/usr/src/cmd/fm/eversholt/files/i386/Makefile index bb6cda3b38..67caa4468e 100644 --- a/usr/src/cmd/fm/eversholt/files/i386/Makefile +++ b/usr/src/cmd/fm/eversholt/files/i386/Makefile @@ -33,7 +33,8 @@ EFT_COMMON_FILES= \ sca500.eft \ sca1000.eft \ sensor.eft \ - storage.eft + storage.eft \ + nic.eft include ../../../Makefile.subdirs diff --git a/usr/src/cmd/fm/eversholt/files/sparc/Makefile b/usr/src/cmd/fm/eversholt/files/sparc/Makefile index 4e5655cbf7..0482b12b33 100644 --- a/usr/src/cmd/fm/eversholt/files/sparc/Makefile +++ b/usr/src/cmd/fm/eversholt/files/sparc/Makefile @@ -34,7 +34,8 @@ EFT_COMMON_FILES= \ sca500.eft \ sca1000.eft \ sensor.eft \ - storage.eft + storage.eft \ + nic.eft include ../../../Makefile.subdirs diff --git a/usr/src/pkg/manifests/consolidation-osnet-osnet-message-files.mf b/usr/src/pkg/manifests/consolidation-osnet-osnet-message-files.mf index 373721a966..a4800aa033 100644 --- a/usr/src/pkg/manifests/consolidation-osnet-osnet-message-files.mf +++ b/usr/src/pkg/manifests/consolidation-osnet-osnet-message-files.mf @@ -282,6 +282,7 @@ file path=usr/lib/locale/C/LC_MESSAGES/FMD.po file path=usr/lib/locale/C/LC_MESSAGES/FMNOTIFY.po file path=usr/lib/locale/C/LC_MESSAGES/GMCA.po file path=usr/lib/locale/C/LC_MESSAGES/INTEL.po +file path=usr/lib/locale/C/LC_MESSAGES/NIC.po file path=usr/lib/locale/C/LC_MESSAGES/NXGE.po file path=usr/lib/locale/C/LC_MESSAGES/PCI.po file path=usr/lib/locale/C/LC_MESSAGES/PCIEX.po diff --git a/usr/src/pkg/manifests/service-fault-management.mf b/usr/src/pkg/manifests/service-fault-management.mf index 5127927e5b..18e07dd5fc 100644 --- a/usr/src/pkg/manifests/service-fault-management.mf +++ b/usr/src/pkg/manifests/service-fault-management.mf @@ -335,6 +335,8 @@ $(i386_ONLY)file path=usr/lib/fm/dict/GMCA.dict mode=0444 \ variant.opensolaris.zone=__NODEFAULT $(i386_ONLY)file path=usr/lib/fm/dict/INTEL.dict mode=0444 \ variant.opensolaris.zone=__NODEFAULT +file path=usr/lib/fm/dict/NIC.dict mode=0444 \ + 
variant.opensolaris.zone=__NODEFAULT file path=usr/lib/fm/dict/NXGE.dict mode=0444 \ variant.opensolaris.zone=__NODEFAULT file path=usr/lib/fm/dict/PCI.dict mode=0444 \ @@ -366,6 +368,7 @@ file path=usr/lib/fm/eft/disk.eft mode=0444 \ variant.opensolaris.zone=__NODEFAULT file path=usr/lib/fm/eft/neptune_xaui.eft mode=0444 file path=usr/lib/fm/eft/neptune_xfp.eft mode=0444 +file path=usr/lib/fm/eft/nic.eft mode=0444 file path=usr/lib/fm/eft/pci.eft mode=0444 file path=usr/lib/fm/eft/pciex.eft mode=0444 file path=usr/lib/fm/eft/pciexrc.eft mode=0444 @@ -531,6 +534,8 @@ $(i386_ONLY)file path=usr/lib/locale/C/LC_MESSAGES/GMCA.mo mode=0444 \ variant.opensolaris.zone=__NODEFAULT $(i386_ONLY)file path=usr/lib/locale/C/LC_MESSAGES/INTEL.mo mode=0444 \ variant.opensolaris.zone=__NODEFAULT +file path=usr/lib/locale/C/LC_MESSAGES/NIC.mo mode=0444 \ + variant.opensolaris.zone=__NODEFAULT file path=usr/lib/locale/C/LC_MESSAGES/NXGE.mo mode=0444 \ variant.opensolaris.zone=__NODEFAULT file path=usr/lib/locale/C/LC_MESSAGES/PCI.mo mode=0444 \ diff --git a/usr/src/uts/common/sys/fm/io/ddi.h b/usr/src/uts/common/sys/fm/io/ddi.h index 75afff5c38..d8c772cdaf 100644 --- a/usr/src/uts/common/sys/fm/io/ddi.h +++ b/usr/src/uts/common/sys/fm/io/ddi.h @@ -66,6 +66,17 @@ extern "C" { #define DVR_STACK_DEPTH "dvr-stack-depth" #define DVR_ERR_SPECIFIC "dvr-error-specific" +/* Generic NIC driver ereports. */ +#define DDI_FM_NIC "nic" +#define DDI_FM_TXR_ERROR "txr-err" + +/* Valid values of the "error" field in txr-err ereports */ +#define DDI_FM_TXR_ERROR_WHITELIST "whitelist" +#define DDI_FM_TXR_ERROR_NOTSUPP "notsupp" +#define DDI_FM_TXR_ERROR_OVERTEMP "overtemp" +#define DDI_FM_TXR_ERROR_HWFAIL "hwfail" +#define DDI_FM_TXR_ERROR_UNKNOWN "unknown" + #ifdef __cplusplus } #endif |