summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Alex Wilson <alex@uq.edu.au> 2020-01-08 18:05:31 +1000
committer: Robert Mustacchi <rm@fingolfin.org> 2020-03-04 02:46:52 +0000
commit: 12eb87fbfbcd9e0abde89898daa0a87c695807e4 (patch)
tree: 954328dd9aa76b62eac3439d6e6d14deb91bb134
parent: a23b3b1bb4e08abaac9fb78fea486e678ce6d6de (diff)
download: illumos-joyent-12eb87fbfbcd9e0abde89898daa0a87c695807e4.tar.gz
12205 want generic NIC transceiver fault events
Reviewed by: Robert Mustacchi <rm@fingolfin.org>
Reviewed by: Paul Winder <paul@winders.demon.co.uk>
Reviewed by: Rob Johnston <rob.johnston@joyent.com>
Approved by: Garrett D'Amore <garrett@damore.org>
-rw-r--r--  usr/src/cmd/fm/dicts/Makefile  3
-rw-r--r--  usr/src/cmd/fm/dicts/NIC.dict  21
-rw-r--r--  usr/src/cmd/fm/dicts/NIC.po  98
-rw-r--r--  usr/src/cmd/fm/eversholt/files/common/nic.esc  127
-rw-r--r--  usr/src/cmd/fm/eversholt/files/i386/Makefile  3
-rw-r--r--  usr/src/cmd/fm/eversholt/files/sparc/Makefile  3
-rw-r--r--  usr/src/pkg/manifests/consolidation-osnet-osnet-message-files.mf  1
-rw-r--r--  usr/src/pkg/manifests/service-fault-management.mf  5
-rw-r--r--  usr/src/uts/common/sys/fm/io/ddi.h  11
9 files changed, 269 insertions, 3 deletions
diff --git a/usr/src/cmd/fm/dicts/Makefile b/usr/src/cmd/fm/dicts/Makefile
index 22bebd3ae8..93e0303f83 100644
--- a/usr/src/cmd/fm/dicts/Makefile
+++ b/usr/src/cmd/fm/dicts/Makefile
@@ -38,7 +38,8 @@ common_DCNAMES = \
SCA1000 \
SENSOR \
STORAGE \
- TEST
+ TEST \
+ NIC
i386_DCNAMES = \
AMD \
diff --git a/usr/src/cmd/fm/dicts/NIC.dict b/usr/src/cmd/fm/dicts/NIC.dict
new file mode 100644
index 0000000000..670dc53d46
--- /dev/null
+++ b/usr/src/cmd/fm/dicts/NIC.dict
@@ -0,0 +1,21 @@
+#
+# Copyright 2020 the University of Queensland
+# Use is subject to license terms.
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+FMDICT: name=NIC version=1 maxkey=4
+
+fault.io.nic.transceiver.notsupp=0
+fault.io.nic.transceiver.whitelist=1
+fault.io.nic.transceiver.overtemp=2
+fault.io.nic.transceiver.hwfail=3
+fault.io.nic.transceiver.unknown=4
diff --git a/usr/src/cmd/fm/dicts/NIC.po b/usr/src/cmd/fm/dicts/NIC.po
new file mode 100644
index 0000000000..46f1c859b9
--- /dev/null
+++ b/usr/src/cmd/fm/dicts/NIC.po
@@ -0,0 +1,98 @@
+#
+# Copyright 2020 the University of Queensland
+# Use is subject to license terms.
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# code: NIC-8000-0Q
+# keys: fault.io.nic.transceiver.notsupp
+#
+msgid "NIC-8000-0Q.type"
+msgstr "Fault"
+msgid "NIC-8000-0Q.severity"
+msgstr "Critical"
+msgid "NIC-8000-0Q.description"
+msgstr "NIC transceiver module %<fault-list[0].resource.hc-specific.txr_index> (SFP/SFP+/QSFP+ etc.) in %<fault-list[0].resource.hc-specific.link-name> is of a type that is not supported. This may be due to an incompatible link type or speed. In some NICs, this may also be caused by enforcement of a vendor or part whitelist.\n\n NIC data link: %<fault-list[0].resource.hc-specific.link-name> (%<fault-list[0].resource.hc-specific.primary-mac-address>)\n Module vendor: %<fault-list[0].resource.hc-specific.vendor>\n Module part: %<fault-list[0].resource.part>\n Module serial: %<fault-list[0].resource.serial>\n\n Refer to %s for more information."
+msgid "NIC-8000-0Q.response"
+msgstr "The transceiver module has been disabled, and the network data link associated with it (%<fault-list[0].resource.hc-specific.link-name>) has been marked as down.\n"
+msgid "NIC-8000-0Q.impact"
+msgstr "No network traffic will pass through the data link or network interfaces associated with this transceiver slot.\n"
+msgid "NIC-8000-0Q.action"
+msgstr "Replace the transceiver module with one of a supported type.\n"
+
+#
+# code: NIC-8000-1C
+# keys: fault.io.nic.transceiver.whitelist
+#
+msgid "NIC-8000-1C.type"
+msgstr "Fault"
+msgid "NIC-8000-1C.severity"
+msgstr "Critical"
+msgid "NIC-8000-1C.description"
+msgstr "NIC transceiver module %<fault-list[0].resource.hc-specific.txr_index> (SFP/SFP+/QSFP+ etc.) in %<fault-list[0].resource.hc-specific.link-name> is of a type that is not allowed to be used with this NIC (due to a hardware-enforced vendor or part whitelist).\n\n NIC data link: %<fault-list[0].resource.hc-specific.link-name> (%<fault-list[0].resource.hc-specific.primary-mac-address>)\n Module vendor: %<fault-list[0].resource.hc-specific.vendor>\n Module part: %<fault-list[0].resource.part>\n Module serial: %<fault-list[0].resource.serial>\n\n Refer to %s for more information."
+msgid "NIC-8000-1C.response"
+msgstr "The transceiver module has been disabled, and the network data link associated with it (%<fault-list[0].resource.hc-specific.link-name>) has been marked as down.\n"
+msgid "NIC-8000-1C.impact"
+msgstr "No network traffic will pass through the data link or network\ninterfaces associated with this transceiver slot.\n"
+msgid "NIC-8000-1C.action"
+msgstr "Replace the transceiver module with one of a supported type.\n"
+
+#
+# code: NIC-8000-2R
+# keys: fault.io.nic.transceiver.overtemp
+#
+msgid "NIC-8000-2R.type"
+msgstr "Fault"
+msgid "NIC-8000-2R.severity"
+msgstr "Critical"
+msgid "NIC-8000-2R.description"
+msgstr "NIC transceiver module %<fault-list[0].resource.hc-specific.txr_index> (SFP/SFP+/QSFP+ etc.) in %<fault-list[0].resource.hc-specific.link-name> has overheated.\n\n NIC data link: %<fault-list[0].resource.hc-specific.link-name> (%<fault-list[0].resource.hc-specific.primary-mac-address>)\n Module vendor: %<fault-list[0].resource.hc-specific.vendor>\n Module part: %<fault-list[0].resource.part>\n Module serial: %<fault-list[0].resource.serial>\n\n Refer to %s for more information."
+msgid "NIC-8000-2R.response"
+msgstr "The transceiver module has been disabled, and the network data link associated with it (%<fault-list[0].resource.hc-specific.link-name>) has been marked as down.\n"
+msgid "NIC-8000-2R.impact"
+msgstr "No network traffic will pass through the data link or network interfaces associated with this transceiver slot.\n"
+msgid "NIC-8000-2R.action"
+msgstr "Remove the transceiver module and check for adequate ventilation\nand cooling. Re-inserting the module after it has cooled will restore service.\n"
+
+#
+# code: NIC-8000-34
+# keys: fault.io.nic.transceiver.hwfail
+#
+msgid "NIC-8000-34.type"
+msgstr "Fault"
+msgid "NIC-8000-34.severity"
+msgstr "Critical"
+msgid "NIC-8000-34.description"
+msgstr "NIC transceiver module %<fault-list[0].resource.hc-specific.txr_index> (SFP/SFP+/QSFP+ etc.) in %<fault-list[0].resource.hc-specific.link-name> has experienced a hardware failure.\n\n NIC data link: %<fault-list[0].resource.hc-specific.link-name> (%<fault-list[0].resource.hc-specific.primary-mac-address>)\n Module vendor: %<fault-list[0].resource.hc-specific.vendor>\n Module part: %<fault-list[0].resource.part>\n Module serial: %<fault-list[0].resource.serial>\n\n Refer to %s for more information."
+msgid "NIC-8000-34.response"
+msgstr "The transceiver module has been disabled, and the network data link associated with it (%<fault-list[0].resource.hc-specific.link-name>) has been marked as down.\n"
+msgid "NIC-8000-34.impact"
+msgstr "No network traffic will pass through the data link or network\ninterfaces associated with this transceiver slot.\n"
+msgid "NIC-8000-34.action"
+msgstr "Remove and check the transceiver module, and consider replacing it.\n"
+
+#
+# code: NIC-8000-4X
+# keys: fault.io.nic.transceiver.unknown
+#
+msgid "NIC-8000-4X.type"
+msgstr "Fault"
+msgid "NIC-8000-4X.severity"
+msgstr "Critical"
+msgid "NIC-8000-4X.description"
+msgstr "The slot for NIC transceiver module %<fault-list[0].resource.hc-specific.txr_index> (SFP/SFP+/QSFP+ etc.) in %<fault-list[0].resource.hc-specific.link-name> is occupied, but hardware did not find a valid transceiver in it.\n Refer to %s for more information."
+msgid "NIC-8000-4X.response"
+msgstr "The transceiver module slot has been disabled, and the network data link associated with it (%<fault-list[0].resource.hc-specific.link-name>) has been marked as down.\n"
+msgid "NIC-8000-4X.impact"
+msgstr "No network traffic will pass through the data link or network\ninterfaces associated with this transceiver slot.\n"
+msgid "NIC-8000-4X.action"
+msgstr "Remove and check the transceiver module. It may be faulty,\ninserted incorrectly, or not of the correct type for the slot.\nIf problems persist, consider replacing the module.\n"
diff --git a/usr/src/cmd/fm/eversholt/files/common/nic.esc b/usr/src/cmd/fm/eversholt/files/common/nic.esc
new file mode 100644
index 0000000000..6dfaf5fa5b
--- /dev/null
+++ b/usr/src/cmd/fm/eversholt/files/common/nic.esc
@@ -0,0 +1,127 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+/*
+ * Copyright 2020, the University of Queensland
+ */
+
+#pragma dictionary "NIC"
+
+/*
+ * Rules for the generic NIC (non-driver-specific) fault events.
+ */
+
+/*
+ * Transceiver events are emitted by drivers under ereport.io.nic.txr-err.
+ *
+ * These are emitted with detector = the PCI/PCIex function of the NIC.
+ * They must always have a string property "error", set to one of the
+ * generic transceiver fault type names (notsupp, whitelist, overtemp,
+ * hwfail or unknown), or to the combined value "notsupp/unknown" which is
+ * handled by the separate pciexfn-only rules at the end of this file.
+ *
+ * As well as "error", they must have both the "port_index" and "txr_index"
+ * properties set in the event payload (both integer types).
+ *
+ * It is expected that drivers will call ddi_fm_service_impact() immediately
+ * after noticing a transceiver error, with an argument of DDI_SERVICE_LOST or
+ * DDI_SERVICE_DEGRADED (depending on the specific error -- at time of writing
+ * all the supported events expect DDI_SERVICE_LOST).
+ *
+ * Layout of the rules that follow:
+ *
+ * - The asru/fru statements name the PCI/PCIe function (pcifn, pciexfn) as
+ *   the ASRU and the transceiver node below it (<fn>/port/transceiver) as
+ *   the FRU.
+ *
+ * - EV_DECL_TXR_FAULT(TYPE) declares fault.io.nic.transceiver.TYPE once
+ *   under pcifn and once under pciexfn, each with FRU = the transceiver and
+ *   ASRU = the function.
+ *
+ * - EV_PROP_TXR_FAULT(TYPE) adds, for pcifn and for pciexfn, a propagation
+ *   from the fault on port[pn]/transceiver[tn] to two ereports on the
+ *   function:
+ *     1. ereport.io.nic.txr-err, constrained so that its payload has
+ *        "txr_index" == tn, "port_index" == pn and "error" == TYPE. When
+ *        the constraint holds it copies "txr_index", plus the port's
+ *        "link-name" and "primary-mac-address" confprops, into the fault
+ *        payload; "vendor" is copied from the transceiver node only if
+ *        that confprop is defined there.
+ *     2. ereport.io.service.lost, constrained by within(1s).
+ *   (These payload names are the ones the NIC.po message text reads under
+ *   resource.hc-specific.)
+ *
+ * NOTE(review): the "error" comparison is written as == "TYPE", i.e. the
+ * macro parameter appears inside a string literal. This presumably relies on
+ * the preprocessor used for .esc files substituting parameters inside
+ * quotes (an ISO C preprocessor would not) -- confirm before changing how
+ * this file is preprocessed.
+ */
+
+asru pcifn;
+fru pcifn/port/transceiver;
+
+asru pciexfn;
+fru pciexfn/port/transceiver;
+
+#define EV_DECL_TXR_FAULT(TYPE) \
+ event fault.io.nic.transceiver.TYPE@pcifn/port/transceiver \
+ FRU=pcifn/port/transceiver, ASRU=pcifn; \
+ event fault.io.nic.transceiver.TYPE@pciexfn/port/transceiver \
+ FRU=pciexfn/port/transceiver, ASRU=pciexfn;
+
+EV_DECL_TXR_FAULT(notsupp)
+EV_DECL_TXR_FAULT(whitelist)
+EV_DECL_TXR_FAULT(overtemp)
+EV_DECL_TXR_FAULT(hwfail)
+EV_DECL_TXR_FAULT(unknown)
+
+event ereport.io.nic.txr-err@pcifn;
+event ereport.io.service.lost@pcifn;
+
+event ereport.io.nic.txr-err@pciexfn;
+event ereport.io.service.lost@pciexfn;
+
+#define EV_PROP_TXR_FAULT(TYPE) \
+ prop fault.io.nic.transceiver.TYPE@pcifn/port[pn]/transceiver[tn] (2) -> \
+ ereport.io.nic.txr-err@pcifn { \
+ payloadprop("txr_index") == tn && \
+ payloadprop("port_index") == pn && \
+ payloadprop("error") == "TYPE" && \
+ setpayloadprop("txr_index", tn) && \
+ setpayloadprop("link-name", confprop(pcifn/port[pn], "link-name")) && \
+ setpayloadprop("primary-mac-address", confprop(pcifn/port[pn], "primary-mac-address")) && \
+ (!confprop_defined(pcifn/port[pn]/transceiver[tn], "vendor") || \
+ setpayloadprop("vendor", confprop(pcifn/port[pn]/transceiver[tn], "vendor"))) \
+ }, \
+ ereport.io.service.lost@pcifn { within(1s) }; \
+ prop fault.io.nic.transceiver.TYPE@pciexfn/port[pn]/transceiver[tn] (2) -> \
+ ereport.io.nic.txr-err@pciexfn { \
+ payloadprop("txr_index") == tn && \
+ payloadprop("port_index") == pn && \
+ payloadprop("error") == "TYPE" && \
+ setpayloadprop("txr_index", tn) && \
+ setpayloadprop("link-name", confprop(pciexfn/port[pn], "link-name")) && \
+ setpayloadprop("primary-mac-address", confprop(pciexfn/port[pn], "primary-mac-address")) && \
+ (!confprop_defined(pciexfn/port[pn]/transceiver[tn], "vendor") || \
+ setpayloadprop("vendor", confprop(pciexfn/port[pn]/transceiver[tn], "vendor"))) \
+ }, \
+ ereport.io.service.lost@pciexfn { within(1s) };
+
+EV_PROP_TXR_FAULT(notsupp)
+EV_PROP_TXR_FAULT(whitelist)
+EV_PROP_TXR_FAULT(overtemp)
+EV_PROP_TXR_FAULT(hwfail)
+EV_PROP_TXR_FAULT(unknown)
+
+/*
+ * Allow drivers (e.g. i40e) which can't tell the difference between the events
+ * notsupp/unknown/whitelist to generate a single ereport covering all 3.
+ * Such a driver posts ereport.io.nic.txr-err with the "error" payload
+ * property set to the literal string "notsupp/unknown".
+ *
+ * If transceiver information is available in topo, we will turn it into
+ * a "notsupp" fault. If it isn't, we'll turn it into an "unknown" fault
+ * instead. "Available" here means exactly that the "vendor" confprop is
+ * defined on the port[pn]/transceiver[tn] node: the first rule requires it
+ * and copies it into the fault payload, the second requires it to be absent
+ * and therefore sets no "vendor". The text in "notsupp" explicitly notes
+ * that certain drivers might have difficulty telling the difference between
+ * it and "whitelist".
+ *
+ * Both rules otherwise use the same "txr_index"/"port_index" matching, the
+ * same "link-name" and "primary-mac-address" payload properties, and the
+ * same ereport.io.service.lost within(1s) companion ereport.
+ *
+ * If you want this for a pcifn driver rather than pciexfn, you'll have to
+ * make another copy.
+ */
+prop fault.io.nic.transceiver.notsupp@pciexfn/port[pn]/transceiver[tn] (2) ->
+ ereport.io.nic.txr-err@pciexfn {
+ payloadprop("txr_index") == tn &&
+ payloadprop("port_index") == pn &&
+ payloadprop("error") == "notsupp/unknown" &&
+ confprop_defined(pciexfn/port[pn]/transceiver[tn], "vendor") &&
+ setpayloadprop("txr_index", tn) &&
+ setpayloadprop("link-name", confprop(pciexfn/port[pn], "link-name")) &&
+ setpayloadprop("primary-mac-address", confprop(pciexfn/port[pn], "primary-mac-address")) &&
+ setpayloadprop("vendor", confprop(pciexfn/port[pn]/transceiver[tn], "vendor"))
+ },
+ ereport.io.service.lost@pciexfn { within(1s) };
+prop fault.io.nic.transceiver.unknown@pciexfn/port[pn]/transceiver[tn] (2) ->
+ ereport.io.nic.txr-err@pciexfn {
+ payloadprop("txr_index") == tn &&
+ payloadprop("port_index") == pn &&
+ payloadprop("error") == "notsupp/unknown" &&
+ !confprop_defined(pciexfn/port[pn]/transceiver[tn], "vendor") &&
+ setpayloadprop("txr_index", tn) &&
+ setpayloadprop("link-name", confprop(pciexfn/port[pn], "link-name")) &&
+ setpayloadprop("primary-mac-address", confprop(pciexfn/port[pn], "primary-mac-address"))
+ },
+ ereport.io.service.lost@pciexfn { within(1s) };
diff --git a/usr/src/cmd/fm/eversholt/files/i386/Makefile b/usr/src/cmd/fm/eversholt/files/i386/Makefile
index bb6cda3b38..67caa4468e 100644
--- a/usr/src/cmd/fm/eversholt/files/i386/Makefile
+++ b/usr/src/cmd/fm/eversholt/files/i386/Makefile
@@ -33,7 +33,8 @@ EFT_COMMON_FILES= \
sca500.eft \
sca1000.eft \
sensor.eft \
- storage.eft
+ storage.eft \
+ nic.eft
include ../../../Makefile.subdirs
diff --git a/usr/src/cmd/fm/eversholt/files/sparc/Makefile b/usr/src/cmd/fm/eversholt/files/sparc/Makefile
index 4e5655cbf7..0482b12b33 100644
--- a/usr/src/cmd/fm/eversholt/files/sparc/Makefile
+++ b/usr/src/cmd/fm/eversholt/files/sparc/Makefile
@@ -34,7 +34,8 @@ EFT_COMMON_FILES= \
sca500.eft \
sca1000.eft \
sensor.eft \
- storage.eft
+ storage.eft \
+ nic.eft
include ../../../Makefile.subdirs
diff --git a/usr/src/pkg/manifests/consolidation-osnet-osnet-message-files.mf b/usr/src/pkg/manifests/consolidation-osnet-osnet-message-files.mf
index 373721a966..a4800aa033 100644
--- a/usr/src/pkg/manifests/consolidation-osnet-osnet-message-files.mf
+++ b/usr/src/pkg/manifests/consolidation-osnet-osnet-message-files.mf
@@ -282,6 +282,7 @@ file path=usr/lib/locale/C/LC_MESSAGES/FMD.po
file path=usr/lib/locale/C/LC_MESSAGES/FMNOTIFY.po
file path=usr/lib/locale/C/LC_MESSAGES/GMCA.po
file path=usr/lib/locale/C/LC_MESSAGES/INTEL.po
+file path=usr/lib/locale/C/LC_MESSAGES/NIC.po
file path=usr/lib/locale/C/LC_MESSAGES/NXGE.po
file path=usr/lib/locale/C/LC_MESSAGES/PCI.po
file path=usr/lib/locale/C/LC_MESSAGES/PCIEX.po
diff --git a/usr/src/pkg/manifests/service-fault-management.mf b/usr/src/pkg/manifests/service-fault-management.mf
index 5127927e5b..18e07dd5fc 100644
--- a/usr/src/pkg/manifests/service-fault-management.mf
+++ b/usr/src/pkg/manifests/service-fault-management.mf
@@ -335,6 +335,8 @@ $(i386_ONLY)file path=usr/lib/fm/dict/GMCA.dict mode=0444 \
variant.opensolaris.zone=__NODEFAULT
$(i386_ONLY)file path=usr/lib/fm/dict/INTEL.dict mode=0444 \
variant.opensolaris.zone=__NODEFAULT
+file path=usr/lib/fm/dict/NIC.dict mode=0444 \
+ variant.opensolaris.zone=__NODEFAULT
file path=usr/lib/fm/dict/NXGE.dict mode=0444 \
variant.opensolaris.zone=__NODEFAULT
file path=usr/lib/fm/dict/PCI.dict mode=0444 \
@@ -366,6 +368,7 @@ file path=usr/lib/fm/eft/disk.eft mode=0444 \
variant.opensolaris.zone=__NODEFAULT
file path=usr/lib/fm/eft/neptune_xaui.eft mode=0444
file path=usr/lib/fm/eft/neptune_xfp.eft mode=0444
+file path=usr/lib/fm/eft/nic.eft mode=0444
file path=usr/lib/fm/eft/pci.eft mode=0444
file path=usr/lib/fm/eft/pciex.eft mode=0444
file path=usr/lib/fm/eft/pciexrc.eft mode=0444
@@ -531,6 +534,8 @@ $(i386_ONLY)file path=usr/lib/locale/C/LC_MESSAGES/GMCA.mo mode=0444 \
variant.opensolaris.zone=__NODEFAULT
$(i386_ONLY)file path=usr/lib/locale/C/LC_MESSAGES/INTEL.mo mode=0444 \
variant.opensolaris.zone=__NODEFAULT
+file path=usr/lib/locale/C/LC_MESSAGES/NIC.mo mode=0444 \
+ variant.opensolaris.zone=__NODEFAULT
file path=usr/lib/locale/C/LC_MESSAGES/NXGE.mo mode=0444 \
variant.opensolaris.zone=__NODEFAULT
file path=usr/lib/locale/C/LC_MESSAGES/PCI.mo mode=0444 \
diff --git a/usr/src/uts/common/sys/fm/io/ddi.h b/usr/src/uts/common/sys/fm/io/ddi.h
index 75afff5c38..d8c772cdaf 100644
--- a/usr/src/uts/common/sys/fm/io/ddi.h
+++ b/usr/src/uts/common/sys/fm/io/ddi.h
@@ -66,6 +66,17 @@ extern "C" {
#define DVR_STACK_DEPTH "dvr-stack-depth"
#define DVR_ERR_SPECIFIC "dvr-error-specific"
+/*
+ * Generic NIC driver ereports.
+ *
+ * DDI_FM_NIC and DDI_FM_TXR_ERROR are the "nic" and "txr-err" name
+ * components; presumably they are joined to form the
+ * ereport.io.nic.txr-err class that the nic.esc diagnosis rules consume.
+ */
+#define DDI_FM_NIC "nic"
+#define DDI_FM_TXR_ERROR "txr-err"
+
+/*
+ * Valid values of the "error" field in txr-err ereports. Each string is
+ * also the final component of a fault.io.nic.transceiver.* event declared
+ * in nic.esc and listed in NIC.dict.
+ *
+ * NOTE(review): nic.esc additionally matches the combined value
+ * "notsupp/unknown", for which no macro is defined here -- confirm whether
+ * drivers are expected to spell that string out themselves.
+ */
+#define DDI_FM_TXR_ERROR_WHITELIST "whitelist"
+#define DDI_FM_TXR_ERROR_NOTSUPP "notsupp"
+#define DDI_FM_TXR_ERROR_OVERTEMP "overtemp"
+#define DDI_FM_TXR_ERROR_HWFAIL "hwfail"
+#define DDI_FM_TXR_ERROR_UNKNOWN "unknown"
+
#ifdef __cplusplus
}
#endif