diff options
Diffstat (limited to 'src/pmdas/infiniband')
-rw-r--r-- | src/pmdas/infiniband/GNUmakefile | 58 | ||||
-rwxr-xr-x | src/pmdas/infiniband/Install | 46 | ||||
-rwxr-xr-x | src/pmdas/infiniband/Remove | 24 | ||||
-rw-r--r-- | src/pmdas/infiniband/help | 190 | ||||
-rw-r--r-- | src/pmdas/infiniband/ib.c | 1050 | ||||
-rw-r--r-- | src/pmdas/infiniband/ibpmda.h | 125 | ||||
-rw-r--r-- | src/pmdas/infiniband/pmda.c | 418 | ||||
-rw-r--r-- | src/pmdas/infiniband/pmns | 100 | ||||
-rw-r--r-- | src/pmdas/infiniband/root | 9 |
9 files changed, 2020 insertions, 0 deletions
diff --git a/src/pmdas/infiniband/GNUmakefile b/src/pmdas/infiniband/GNUmakefile new file mode 100644 index 0000000..79991dc --- /dev/null +++ b/src/pmdas/infiniband/GNUmakefile @@ -0,0 +1,58 @@ +# +# Copyright (c) 2013 Red Hat. +# Copyright (c) 2007-2009 Silicon Graphics, Inc. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# + +TOPDIR = ../../.. +include $(TOPDIR)/src/include/builddefs + +CMDTARGET = pmdaib$(EXECSUFFIX) +CFILES = ib.c pmda.c +HFILES = ibpmda.h + +LSRCFILES = help root pmns Install Remove +LLDLIBS = $(IB_LIBS) $(PCP_LIBS) -lpcp_pmda -lpcp + +IAM = ib +DOMAIN = IB +PMDADIR = $(PCP_PMDAS_DIR)/infiniband +LDIRT = domain.h *.o $(IAM).log $(CMDTARGET) + +default: build-me + +include $(BUILDRULES) + +ifneq "$(PMDA_INFINIBAND)" "" +build-me: domain.h $(CMDTARGET) + +install: default + $(INSTALL) -m 755 -d $(PMDADIR) + $(INSTALL) -S $(PMDADIR) $(PCP_PMDAS_DIR)/$(IAM) + $(INSTALL) -m 755 Install Remove $(CMDTARGET) $(PMDADIR) + $(INSTALL) -m 644 pmns root help domain.h $(PMDADIR) +else +build-me: +install: +endif + +ib.o: domain.h + +.NOTPARALLEL: +.ORDER: domain.h $(OBJECTS) + +default_pcp : default + +install_pcp : install + +domain.h: ../../pmns/stdpmid + $(DOMAIN_MAKERULE) diff --git a/src/pmdas/infiniband/Install b/src/pmdas/infiniband/Install new file mode 100755 index 0000000..b674b88 --- /dev/null +++ b/src/pmdas/infiniband/Install @@ -0,0 +1,46 @@ +#! /bin/sh +# +# Copyright (C) 2013 Red Hat. +# Copyright (C) 2007,2008 Silicon Graphics, Inc. All Rights Reserved. 
+# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# + +. /etc/pcp.env +. $PCP_SHARE_DIR/lib/pmdaproc.sh + +iam=infiniband +pmda_interface=3 +daemon_opt=true +dso_opt=false +pipe_opt=true +socket_opt=false + +path=/sys/class/infiniband_mad +if ! test -d $path; then + echo "Kernel lacks Infiniband support - $path directory not found" + exit 1 +fi + +__choose_mode() +{ + do_pmda=true +} + +__choose_ipc() +{ + ipc_type=pipe + type="pipe binary $PCP_PMDAS_DIR/infiniband/pmdaib" +} + +pmdaSetup +pmdaInstall +exit 0 diff --git a/src/pmdas/infiniband/Remove b/src/pmdas/infiniband/Remove new file mode 100755 index 0000000..c35c41f --- /dev/null +++ b/src/pmdas/infiniband/Remove @@ -0,0 +1,24 @@ +#! /bin/sh +# +# Copyright (C) 2007,2008 Silicon Graphics, Inc. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# + +. /etc/pcp.env +. 
$PCP_SHARE_DIR/lib/pmdaproc.sh + +iam=infiniband + +pmdaSetup +pmdaRemove +exit 0 + diff --git a/src/pmdas/infiniband/help b/src/pmdas/infiniband/help new file mode 100644 index 0000000..93eaa7d --- /dev/null +++ b/src/pmdas/infiniband/help @@ -0,0 +1,190 @@ +# +# Copyright (c) 2007,2008 Silicon Graphics, Inc. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +@ infiniband.hca.type Node type +Node type: channel adapter (CA), switch, router etc +@ infiniband.hca.ca_type HCA type +HCA type, e.g. 
MT23108, +@ infiniband.hca.numports Number of ports on HCA +Number of ports on HCA +@ infiniband.hca.fw_ver Version of HCA firmware +Version of HCA firmware +@ infiniband.hca.hw_ver Version of HCA hardware +Version of HCA hardware +@ infiniband.hca.node_guid Node's Global Unique Identifier +Node's Global Unique Identifier - 64 bit integer to refer to the node +@ infiniband.hca.system_guid System's Global Unique Identifier +System's Global Unique Identifier - 64 bit integer to refer to the system +@ infiniband.port.guid Port's Global Unique Identifier +Port's Global Unique Identifier - 64 bit integer to refer to the port +@ infiniband.port.gid_prefix GID prefix +GID prefix, assigned by subnet manager +@ infiniband.port.lid Port's Local Identifier +Port's Local Identifier, assigned by subnet manager +@ infiniband.port.state Port's state +Port's state - can be Active, Down, NoChange, Armed or Initialize +@ infiniband.port.phystate Port's physical state +Port's physical state +@ infiniband.port.rate Port's Data Rate +Port's Data Rate: 2, 5, 10 or 20 Gbps +@ infiniband.port.capabilities Port's capabilities +Port's capabilities. +@ infiniband.port.linkspeed Base link speed of the port. +This is a string which represents the base link speed of the port. +Multiplying link speed by link width gives port's data rate. +@ infiniband.port.linkwidth Port's link width. +Number of bi-directional Infiniband links active on the port. +Also known as X-factor, as in 1X, 4X, 12X. +@ infiniband.port.in.bytes Bytes received +Counter of data octets received on all VLs at the port. This +includes all octets between (and not including) the start of +packet delimiter and the VCRC, and may include packets containing errors. +It excludes all link packets. + +This counter is implemented by sampling underlying saturating PortRcvData +counter. When a value of saturated counter reaches predefined threshold, +the counter is reset after its value is copied into internal state. 
+ +@ infiniband.port.in.packets Packets received +Counter of data packets received on all VLs at the port. This +may include packets containing errors but excludes all link packets. + +@ infiniband.port.in.errors.drop Packets dropped due to errors +Number of packets received on the port that were discarded because they +could not be forwarded by the switch relay due to DLID mapping, VL mapping +or looping. Implemented by sampling 16 bit PortRcvSwitchRelayErrors +counter. + +@ infiniband.port.in.errors.filter Packets filtered out +Number of packets received by the port that were discarded because +they were raw packets and FilterRawInbound is enabled, or because +PartitionEnforcementInbound is enabled and they failed the partition +key check or IP version check. Implemented by sampling 8 bit +PortRcvConstraintErrors counter. + +@ infiniband.port.in.errors.local Packets with errors +Counter of packets containing local physical errors, malformed data or +link packets or packets discarded due to buffer overrun. Implemented by +sampling 16 bit PortRcvErrors counter. + +@ infiniband.port.in.errors.remote Packets with EBP delimiter. +Number of packets marked with the End Bad Packet delimiter received by +the port. Implemented by sampling 16 bit PortRcvRemotePhysicalErrors +counter. + +@ infiniband.port.out.bytes Bytes transmitted +Counter of data octets transmitted on all VLs from the port. This +includes all octets between (and not including) the start of +packet delimiter and the VCRC, and may include packets containing errors. +It excludes all link packets. + +This counter is implemented by sampling the underlying saturating PortXmtData +counter. When the value of the saturating counter reaches a predefined threshold, +the counter is reset after its value is copied into internal state. + +@ infiniband.port.out.packets Packets transmitted +Counter of data packets transmitted on all VLs from the port. This +may include packets containing errors but excludes all link packets.
+ +@ infiniband.port.out.errors.drop Packets dropped without transmitting +Number of outbound packets which were dropped because the port is down +or congested. Implemented by sampling 16 bit PortXmtDiscard counter. + +@ infiniband.port.out.errors.filter Packets filtered out before transmitting +Number of packets not transmitted by the port because +they were raw packets and FilterRawOutbound is enabled, or because +PartitionEnforcementOutbound is enabled and they failed the partition +key check or IP version check. Implemented by sampling 8 bit +PortXmitConstraintErrors counter. + +@ infiniband.port.total.bytes Bytes transmitted and received +Cumulative value of infiniband.port.in.bytes and +infiniband.port.out.bytes, provided for convenience. + +@ infiniband.port.total.packets Packets transmitted and received +Cumulative value of infiniband.port.in.packets and +infiniband.port.out.packets, provided for convenience. + +@ infiniband.port.total.errors.drop Packet dropped +Cumulative counter of infiniband.port.in.errors.drop and +infiniband.port.out.errors.drop. + +@ infiniband.port.total.errors.filter Packet filtered out +Cumulative counter of infiniband.port.in.errors.filter and +infiniband.port.out.errors.filter. + +@ infiniband.port.total.errors.link Link downed +Number of times the Port Training state machine has failed to +complete the link recovery process and downed the link. Implemented by +sampling 8 bit LinkDownedCounter. + +@ infiniband.port.total.errors.recover Successful recoveries +Number of times the Port Training state machine has successfully +completed the link recovery process. Implemented by sampling 8 bit +LinkErrorRecoveryCounter. + +@ infiniband.port.total.errors.integrity Excessive local physical errors +Number of times the count of local physical errors exceeded the threshold. +Implemented by sampling 4 bit LocalLinkIntegrityErrors counter.
+ +@ infiniband.port.total.errors.vl15 Dropped packets to VL15 +Number of times packets to VL15 (management virtual lane) were dropped +due to resource limitations. Implemented by sampling 16 bit VL15Dropped +counter. + +@ infiniband.port.total.errors.overrun Excessive Buffer Overruns +The number of times buffer overrun errors have persisted over multiple +flow control update times. Implemented by sampling 4 bit +ExcessiveBufferOverrun counter. + +@ infiniband.port.total.errors.symbol Total number of minor link errors +Total number of minor link errors detected on one or more physical lanes. +Implemented by sampling 16 bit SymbolErrorCounter. + +@ infiniband.control.query_timeout Timeout for MAD queries +Timeout in milliseconds for MAD rpcs. Default value is 1000 milliseconds. +Timeout can be set per port. + +@ infiniband.control.hiwat Counter threshold values +Threshold values for each MAD performance counter. Due to the saturating +nature of the counters they are reset when the value of a particular counter +rises above its threshold. Setting the threshold to the maximum value disables +the reset mechanism. + +@ infiniband.port.switch.in.bytes Bytes received (using switch counter) +Counter for the bytes received by a port. This is calculated using the counter +of the switch the port is attached to. + +@ infiniband.port.switch.in.packets Packets received (using switch counter) +Counter for the packets received by a port. This is calculated using the +counter of the switch the port is attached to. + +@ infiniband.port.switch.out.bytes Bytes transmitted (using switch counter) +Counter for the bytes transmitted by a port. This is calculated using the +counter of the switch the port is attached to. + +@ infiniband.port.switch.out.packets Packets transmitted (using switch counter) +Counter for the packets transmitted by a port. This is calculated using the +counter of the switch the port is attached to.
+ +@ infiniband.port.switch.total.bytes Bytes transmitted and received (using switch counters) +Cumulative value of infiniband.port.switch.in.bytes and +infiniband.port.switch.out.bytes, provided for convenience. + +@ infiniband.port.switch.total.packets Packets transmitted and received (using switch counters) +Cumulative value of infiniband.port.switch.in.packets and +infiniband.port.switch.out.packets, provided for convenience. diff --git a/src/pmdas/infiniband/ib.c b/src/pmdas/infiniband/ib.c new file mode 100644 index 0000000..b3dc182 --- /dev/null +++ b/src/pmdas/infiniband/ib.c @@ -0,0 +1,1050 @@ +/* + * Copyright (C) 2008 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * IB part of the PMDA - initialization, fetching etc. + */ +#include "ibpmda.h" +#include <infiniband/umad.h> +#include <infiniband/mad.h> +#include <ctype.h> + +#define IBPMDA_MAX_HCAS (16) + +typedef struct local_port_s { + /* + * Cache the ca_name and portnum to avoid a bug in libibumad that + * leaks memory when umad_port_get() is called over and over. + * With ca_name and portnum we can safely do umad_port_release() + * first and then umad_port_get() without fear that some future + * version of release() will deallocate port->ca_name and + * port->portnum. 
+ */ + char ca_name[UMAD_CA_NAME_LEN]; + int portnum; + umad_port_t *ump; + void * hndl; + int needsupdate; +} local_port_t; + +/* umad_ca_t starts with a name which is good enough for us to use */ +typedef struct hca_state_s { + umad_ca_t ca; + local_port_t lports[UMAD_CA_MAX_PORTS]; +} hca_state_t; + +/* IB Architecture rel 1.2 demands that performance counters + * must plateau once they reach 2^32. This structure is used + * to track the counters and reset them when they get close to + * the magic boundary */ +typedef struct mad_counter_s { + uint64_t accum; /* Accumulated value */ + uint32_t prev; /* Previous value of the counter */ + uint32_t cur; /* Current value, only valid during iteration */ + uint32_t isvalid; /* Is current value valid? */ +} mad_counter_t; + +typedef struct mad_cnt_desc_s { + enum MAD_FIELDS madid; /* ID for the counter */ + char *name; /* Counter name, also used as the instance name */ + int resetmask; /* Reset mask for port_performance_reset */ + uint32_t hiwat; /* If current value is over hiwat mark, reset it */ + int multiplier; /* Accumulated value is scaled by this when reported */ +} mad_cnt_desc_t; + +/* Build the mad_cnt_descriptors[] entry at index IBPMDA_<id>: + * MAD field IB_PC_<id>_F, name "<id>", reset mask bit 'mask', + * high watermark 1U << shft and multiplier 'mul'. + * Uses the standard "[index] = value" designated initializer rather + * than the obsolete GNU "[index] value" spelling. */ +#define MADDESC_INIT(id, mask, shft, mul) \ + [ IBPMDA_##id ] = {IB_PC_##id##_F, #id, (1<<(mask)), (1U<<(shft)), (mul)} + +static mad_cnt_desc_t mad_cnt_descriptors[] = { + MADDESC_INIT(ERR_SYM, 0, 15, 1), + MADDESC_INIT(LINK_RECOVERS, 1, 7, 1), + MADDESC_INIT(LINK_DOWNED, 2, 7, 1), + MADDESC_INIT(ERR_RCV, 3, 15, 1), + MADDESC_INIT(ERR_PHYSRCV, 4, 15, 1), + MADDESC_INIT(ERR_SWITCH_REL, 5, 15, 1), + MADDESC_INIT(XMT_DISCARDS, 6, 15, 1), + MADDESC_INIT(ERR_XMTCONSTR, 7, 7, 1), + MADDESC_INIT(ERR_RCVCONSTR, 8, 7, 1), + MADDESC_INIT(ERR_LOCALINTEG, 9, 3, 1), + MADDESC_INIT(ERR_EXCESS_OVR, 10, 3, 1), + MADDESC_INIT(VL15_DROPPED, 11, 15, 1), + MADDESC_INIT(XMT_BYTES, 12, 31, 4), + MADDESC_INIT(RCV_BYTES, 13, 31, 4), + MADDESC_INIT(XMT_PKTS, 14, 31, 1), + MADDESC_INIT(RCV_PKTS, 15, 31, 1) +}; + +#undef MADDESC_INIT + +static char *node_types[] = {"Unknown", "CA", "Switch", "Router", "iWARP RNIC"}; + +static char *port_states[] = { + "Unknown", +
"Down", + "Initializing", + "Armed", + "Active" +}; + +static char *port_phystates[] = { + "No change", + "Sleep", + "Polling", + "Disabled", + "Port Configuration Training", + "Link Up", + "Error Recovery", + "PHY Test" +}; + +/* Size is arbitrary, currently need 285 bytes for all caps */ +#define IB_ALLPORTCAPSTRLEN 320 + +typedef struct port_state_s { + ib_portid_t portid; + local_port_t *lport; + int needupdate; + int validstate; + int resetmask; + int timeout; + uint64_t guid; + int remport; + unsigned char perfdata[IB_MAD_SIZE]; + unsigned char portinfo[IB_MAD_SIZE]; + uint8_t switchperfdata[1024]; + mad_counter_t madcnts[ARRAYSZ(mad_cnt_descriptors)]; + char pcap[IB_ALLPORTCAPSTRLEN]; +} port_state_t; + +static char confpath[MAXPATHLEN]; +static int portcount; +/* Line number while parsing the config file */ +static FILE *fconf; +static int lcnt; + +#define print_parse_err(loglevel, fmt, args...) \ + if (fconf) { \ + __pmNotifyErr(loglevel, "%s(%d): " fmt, confpath, lcnt, args); \ + } else { \ + __pmNotifyErr(loglevel, fmt, args); \ + } + +static void +monitor_guid(pmdaIndom *itab, char *name, long long guid, int rport, + char *local, int lport) +{ + int inst; + hca_state_t *hca = NULL; + port_state_t *ps; + + if (pmdaCacheLookupName(itab[IB_HCA_INDOM].it_indom, local, NULL, + (void**)&hca) != PMDA_CACHE_ACTIVE) { + print_parse_err(LOG_ERR, "unknown HCA '%s' in 'via' clause\n", local); + return; + } + + if ((lport >= UMAD_CA_MAX_PORTS) || (lport < 0)) { + print_parse_err(LOG_ERR, + "port number %d is out of bounds for HCA %s\n", + lport, local); + return; + } + + if (hca->lports[lport].hndl == NULL) { + print_parse_err(LOG_ERR, + "port %s:%d has failed initialization\n", + local, lport); + return; + } + + if ((ps = (port_state_t *)calloc(1, sizeof(port_state_t))) == NULL) { + __pmNotifyErr (LOG_ERR, "Out of memory to save state for %s\n", name); + return; + } + + ps->guid = guid; + ps->remport = rport; + ps->lport = hca->lports + lport; + ps->portid.lid = 
-1; + ps->timeout = 1000; + + if ((inst = pmdaCacheStore(itab[IB_PORT_INDOM].it_indom, + PMDA_CACHE_ADD, name, ps)) < 0) { + __pmNotifyErr(LOG_ERR, "Cannot add %s to the cache - %s\n", + name, pmErrStr(inst)); + free (ps); + return; + } + + portcount++; +} + + +static int +foreachport(hca_state_t *hst, void (*cb)(hca_state_t *, umad_port_t *, void *), + void *closure) +{ + int pcnt = hst->ca.numports; + int p; + int nports = 0; + + for (p=0; (pcnt >0) && (p < UMAD_CA_MAX_PORTS); p++) { + umad_port_t *port = hst->ca.ports[p]; + + if (port ) { + pcnt--; + nports++; + if (cb) { + cb (hst, port, closure); + } + } + } + return (nports); +} + +#ifdef HAVE_NETWORK_BYTEORDER +#define guid_htonll(a) do { } while (0) /* noop */ +#define guid_ntohll(a) do { } while (0) /* noop */ +#else +static void +guid_htonll(char *p) +{ + char c; + int i; + + for (i = 0; i < 4; i++) { + c = p[i]; + p[i] = p[7-i]; + p[7-i] = c; + } +} +#define guid_ntohll(v) guid_htonll(v) +#endif + +static void +printportconfig (hca_state_t *hst, umad_port_t *port, void *arg) +{ + uint64_t hguid = port->port_guid; + + guid_ntohll((char *)&hguid); + + fprintf (fconf, "%s:%d 0x%llx %d via %s:%d\n", + port->ca_name, port->portnum, (unsigned long long)hguid, + port->portnum, hst->ca.ca_name, port->portnum); +} + +static void +monitorport(hca_state_t *hst, umad_port_t *port, void *arg) +{ + pmdaIndom *itab = arg; + uint64_t hguid = port->port_guid; + char name[128]; + + guid_ntohll((char *)&hguid); + sprintf(name, "%s:%d", port->ca_name, port->portnum); + + monitor_guid(itab, name, hguid, port->portnum, port->ca_name, port->portnum); +} + + +static int mgmt_classes[] = {IB_SMI_CLASS, IB_SMI_DIRECT_CLASS, + IB_SA_CLASS, IB_PERFORMANCE_CLASS}; +static void +openumadport (hca_state_t *hst, umad_port_t *port, void *arg) +{ + void *hndl = arg; + local_port_t *lp; + + if ((hndl = mad_rpc_open_port(port->ca_name, port->portnum, mgmt_classes, + ARRAYSZ(mgmt_classes))) == NULL) { + __pmNotifyErr(LOG_ERR, "Cannot open 
port handle for %s:%d\n", + port->ca_name, port->portnum); + } + lp = &hst->lports[port->portnum]; + strcpy(lp->ca_name, port->ca_name); + lp->portnum = port->portnum; + lp->ump = port; + lp->hndl = hndl; +} + +static void +parse_config(pmdaIndom *itab) +{ + char buffer[2048]; + + while ((fgets(buffer, sizeof(buffer)-1, fconf)) != NULL) { + char *p; + + lcnt++; + + /* strip comments */ + if ((p = strchr(buffer,'#'))) + *p='\0'; + + for (p = buffer; *p; p++) { + if (!isspace (*p)) + break; + } + + if (*p != '\0') { + char name[128]; + long long guid; + int rport; + char local[128]; + int lport; + + if (sscanf(p, "%[^ \t]%llx%d via %[^:]:%d", + name, &guid, &rport, local, &lport) != 5) { + __pmNotifyErr (LOG_ERR, "%s(%d): cannot parse the line\n", + confpath, lcnt); + continue; + } + + monitor_guid(itab, name, guid, rport, local, lport); + } + } +} + +int +ib_load_config(const char *cp, int writeconf, pmdaIndom *itab, unsigned int nindoms) +{ + char hcas[IBPMDA_MAX_HCAS][UMAD_CA_NAME_LEN]; + hca_state_t *st = NULL; + int i, n; + int (*closef)(FILE *) = fclose; + + if (nindoms <= IB_CNT_INDOM) + return -EINVAL; + + if (umad_init()) { + __pmNotifyErr(LOG_ERR, + "umad_init() failed. No IB kernel support or incorrect ABI version\n"); + return -EIO; + } + + if ((n = umad_get_cas_names(hcas, ARRAYSZ(hcas)))) { + if ((st = calloc (n, sizeof(hca_state_t))) == NULL) + return -ENOMEM; + } else + /* No HCAs */ + return 0; + + /* Open config file - if the executable bit is set then assume that + * user wants it to be a script and run it, otherwise try loading it. + */ + strcpy(confpath, cp); + if (access(confpath, F_OK) == 0) { + if (writeconf) { + __pmNotifyErr(LOG_ERR, + "Config file exists and writeconf arg was given to pmdaib. 
Aborting."); + exit(1); + } + + if (access(confpath, X_OK)) { + /* Not an executable, just read it */ + fconf = fopen (confpath, "r"); + } else { + fconf = popen(confpath, "r"); + closef = pclose; + } + } else if (writeconf) { + fconf = fopen(confpath, "w"); + if (fconf == NULL) { + /* printportconfig() writes to fconf unconditionally, so a + * failed open must stop here instead of crashing later */ + __pmNotifyErr(LOG_ERR, + "Cannot create config file %s. Aborting.\n", confpath); + exit(1); + } + } + /* else no config file: Just monitor local ports */ + + for (i=0; i < n; i++) { + if (umad_get_ca(hcas[i], &st[i].ca) == 0) { + int e = pmdaCacheStore(itab[IB_HCA_INDOM].it_indom, PMDA_CACHE_ADD, + st[i].ca.ca_name, &st[i].ca); + + if (e < 0) { + __pmNotifyErr(LOG_ERR, + "Cannot add instance for %s to the cache - %s\n", + st[i].ca.ca_name, pmErrStr(e)); + continue; + } + + foreachport(st+i, openumadport, NULL); + if (fconf == NULL) + /* No config file - monitor local ports */ + foreachport(st+i, monitorport, itab); + if (writeconf) + foreachport(st+i, printportconfig, fconf); + } + } + + if (fconf) { + /* With writeconf the stream is write-only: nothing to parse */ + if (!writeconf) + parse_config(itab); + (*closef)(fconf); + /* Do not leave a stale stream pointer for print_parse_err() */ + fconf = NULL; + } + + if (writeconf) + /* Config file is now written. Exit. */ + exit(0); + + if (!portcount) { + __pmNotifyErr(LOG_INFO, "No IB ports found to monitor"); + } + + /* One instance per MAD counter descriptor, named after the counter */ + itab[IB_CNT_INDOM].it_set = (pmdaInstid *)calloc(ARRAYSZ(mad_cnt_descriptors), + sizeof(pmdaInstid)); + + if (itab[IB_CNT_INDOM].it_set == NULL) { + return -ENOMEM; + } + + itab[IB_CNT_INDOM].it_numinst = ARRAYSZ(mad_cnt_descriptors); + for (i=0; i < ARRAYSZ(mad_cnt_descriptors); i++) { + itab[IB_CNT_INDOM].it_set[i].i_inst = i; + itab[IB_CNT_INDOM].it_set[i].i_name = mad_cnt_descriptors[i].name; + + } + + return 0; +} + +static char * +ib_portcap_to_string(port_state_t *pst) +{ + static struct { + int bit; + const char *cap; + } capdest [] = { + {1, "SM"}, + {2, "Notice"}, + {3, "Trap"}, + {5, "AutomaticMigration"}, + {6, "SLMapping"}, + {7, "MKeyNVRAM"}, + {8, "PKeyNVRAM"}, + {9, "LedInfo"}, + {10, "SMdisabled"}, + {11, "SystemImageGUID"}, + {12, "PkeySwitchExternalPortTrap"}, + {16, "CommunicatonManagement"}, + {17, "SNMPTunneling"}, + {18, "Reinit"}, + {19, "DeviceManagement"}, + {20, "VendorClass"}, +
{21, "DRNotice"}, + {22, "CapabilityMaskNotice"}, + {23, "BootManagement"}, + {24, "IsLinkRoundTripLatency"}, + {25, "ClientRegistration"} + }; + char *comma = ""; + int commalen = 0; + int i; + char *ptr = pst->pcap; + uint32_t bsiz = sizeof(pst->pcap); + int pcap = mad_get_field(pst->portinfo, 0, IB_PORT_CAPMASK_F); + + *ptr ='\0'; + + for (i=0; i < ARRAYSZ(capdest); i++) { + if (pcap & (1<<capdest[i].bit)) { + int sl = strlen(capdest[i].cap) + commalen; + if (sl < bsiz) { + sprintf (ptr, "%s%s", comma, capdest[i].cap); + comma = ","; commalen=1; + bsiz -= sl; + ptr += sl; + } + } + } + + return (pst->pcap); +} + + +/* This function can be called multiple times during single + * fetch operation so take care to avoid side effects, for example, + * if the "previous" value of the counter is above the high + * watermark and must be reset, don't change the previous value here - + * it could lead to double counting on the second call */ +static uint64_t +ib_update_perfcnt (port_state_t *pst, int udata, int *rv ) +{ + mad_cnt_desc_t * md = mad_cnt_descriptors + udata; + mad_counter_t *mcnt = pst->madcnts + udata; + + if (!mcnt->isvalid) { + uint32_t delta; + + mcnt->cur = mad_get_field(pst->perfdata, 0, md->madid); + mcnt->isvalid = 1; + + /* If someone resets the counters, then don't update the the + * accumulated value because we don't know what was the value before it + * was reset. 
And if the difference between current and previous value + * is larger then the high watermark then don't update the accumulated + * value either - current value could've pegged because we didn't + * fetch often enough */ + delta = mcnt->cur - mcnt->prev; + if ((mcnt->cur < mcnt->prev) || (delta > md->hiwat)) { + mcnt->isvalid = PM_ERR_VALUE; + } else { + mcnt->accum += delta; + } + + if (mcnt->cur > md->hiwat) { + pst->resetmask |= md->resetmask; + } + } + + *rv = mcnt->isvalid; + return (mcnt->accum * md->multiplier); +} + +static int +ib_linkwidth (port_state_t *pst) +{ + int w = mad_get_field(pst->portinfo, 0, IB_PORT_LINK_WIDTH_ACTIVE_F); + + switch (w) { + case 1: + return (1); + case 2: + return (4); + case 4: + return (8); + case 8: + return (12); + } + return (0); +} + +int +ib_fetch_val(pmdaMetric *mdesc, unsigned int inst, pmAtomValue *atom) +{ + __pmInDom_int *ind = (__pmInDom_int *)&(mdesc->m_desc.indom); + __pmID_int *idp = (__pmID_int *)&(mdesc->m_desc.pmid); + int rv = 1; + port_state_t *pst = NULL; + hca_state_t *hca = NULL; + int umask = 1<<idp->cluster; + int udata = (int)((__psint_t)mdesc->m_user); + void *closure = NULL; + int st; + char *name = NULL; + + if (inst == PM_INDOM_NULL) { + return PM_ERR_INST; + } + + if (ind->serial != IB_CNT_INDOM) { + if ((st = pmdaCacheLookup (mdesc->m_desc.indom, inst, &name, + &closure)) != PMDA_CACHE_ACTIVE) { + if (st == PMDA_CACHE_INACTIVE) + st = PM_ERR_INST; + __pmNotifyErr (LOG_ERR, "Cannot find instance %d in indom %s: %s\n", + inst, pmInDomStr(mdesc->m_desc.indom), pmErrStr(st)); + return st; + } + } + + /* If fetching from HCA indom, then no refreshing is necessary for the + * lifetime of a pmda. 
Ports could change state, so some update could be + * necessary */ + switch (ind->serial) { + case IB_PORT_INDOM: + if (idp->cluster > 3) { + return PM_ERR_INST; + } + + pst = closure; + + if (pst->needupdate & umask) { + local_port_t *lp = pst->lport; + + /* A port state is considered up-to date regardless of any + * errors which could happen later - this is used to implement + * one shot updates */ + pst->needupdate ^= umask; + + /* The state of the local port used for queries is checked + * once per fetch request */ + if (lp->needsupdate) { + umad_release_port(lp->ump); + if (umad_get_port(lp->ca_name, lp->portnum, lp->ump) != 0) { + __pmNotifyErr (LOG_ERR, + "Cannot get state of the port %s:%d\n", + lp->ump->ca_name, lp->ump->portnum); + return 0; + } + lp->needsupdate = 0; + } + + /* If the port which we're supposed to use to query the data + * does not have a LID then we don't even try to query anything, + * is it going to fail anyway */ + if (lp->ump->base_lid == 0) { + return 0; /* No values available */ + } + + if (pst->portid.lid < 0) { + ib_portid_t sm = {0}; + sm.lid = lp->ump->sm_lid; + + memset (&pst->portid, 0, sizeof (pst->portid)); + if (ib_resolve_guid_via (&pst->portid, &pst->guid, &sm, + pst->timeout, lp->hndl) < 0) { + __pmNotifyErr (LOG_ERR, + "Cannot resolve GUID 0x%llx for %s " + "via %s:%d\n", + (unsigned long long)pst->guid, + name, lp->ump->ca_name, + lp->ump->portnum); + pst->portid.lid = -1; + return 0; + } + } + + switch (idp->cluster) { + case 0: /* port attributes */ + memset (pst->portinfo, 0, sizeof(pst->portinfo)); + if (!smp_query_via (pst->portinfo, &pst->portid, + IB_ATTR_PORT_INFO, 0, pst->timeout, + lp->hndl)) { + __pmNotifyErr (LOG_ERR, + "Cannot get port info for %s via %s:%d\n", + name, lp->ump->ca_name, lp->ump->portnum); + return 0; + } + break; + + case 1: /* performance counters */ + /* I thought about updating all accumulating counters + * in case port_performance_query() succeeds but + * decided not to do it right 
now - updating all counters + * could mean more resets even in case when nobody is + * actually looking at the particular counter and I'm + * trying to minimize resets. */ + memset (pst->perfdata, 0, sizeof (pst->perfdata)); + if (!port_perf_query(pst->perfdata, &pst->portid, + pst->remport, pst->timeout, lp->hndl)) { + __pmNotifyErr (LOG_ERR, + "Cannot get performance counters for %s " + "via %s:%d\n", + name, lp->ump->ca_name, lp->ump->portnum); + return 0; + } + break; + + case 3: { /* switch performance counters */ + +#ifdef HAVE_PMA_QUERY_VIA + + // To find the LID of the switch the HCA is connected to, + // send an SMP on the directed route 0,1 and ask the port + // to identify itself. + ib_portid_t sw_port_id = { + .drpath = { + .cnt = 1, + .p = { 0, 1, }, + }, + }; + + uint8_t sw_info[64]; + memset(sw_info, 0, sizeof(sw_info)); + if (!smp_query_via(sw_info, &sw_port_id, IB_ATTR_PORT_INFO, 0, + pst->timeout, lp->hndl)) { + __pmNotifyErr(LOG_ERR, + "Cannot get switch port info for %s via %s:%d.\n", + name, lp->ump->ca_name, lp->ump->portnum); + return 0; + } + + int sw_lid, sw_port; + mad_decode_field(sw_info, IB_PORT_LID_F, &sw_lid); + mad_decode_field(sw_info, IB_PORT_LOCAL_PORT_F, &sw_port); + + sw_port_id.lid = sw_lid; + + // Query for the switch's performance counters' values. 
+ memset(pst->switchperfdata, 0, sizeof(pst->switchperfdata)); + if (!pma_query_via(pst->switchperfdata, &sw_port_id, sw_port, + pst->timeout, IB_GSI_PORT_COUNTERS_EXT, lp->hndl)) { + __pmNotifyErr(LOG_ERR, + "Cannot query performance counters of switch LID %d, port %d.\n", + sw_lid, sw_port); + return 0; + } +#endif + break; + } + + } + pst->validstate ^= umask; + } else if (!(pst->validstate & umask)) { + /* We've hit an error on the previous update - continue + * reporting no data for this instance */ + return (0); + } + break; + + case IB_HCA_INDOM: + hca = closure; + break; + + case IB_CNT_INDOM: + break; + + default: + return (PM_ERR_INST); + } + + switch (idp->cluster) { + case 0: /* UMAD data - hca name, fw_version, number of ports etc */ + switch(idp->item) { + case METRIC_ib_hca_hw_ver: + atom->cp = hca->ca.hw_ver; + break; + + case METRIC_ib_hca_system_guid: + atom->ull = hca->ca.system_guid; + break; + + case METRIC_ib_hca_node_guid: + atom->ull = hca->ca.node_guid; + break; + + case METRIC_ib_hca_numports: + atom->l = hca->ca.numports; + break; + + case METRIC_ib_hca_type: + if (hca->ca.node_type < ARRAYSZ(node_types)) { + atom->cp = node_types[hca->ca.node_type]; + } else { + __pmNotifyErr (LOG_INFO, "Unknown node type %d for %s\n", + hca->ca.node_type, hca->ca.ca_name); + atom->cp = "Unknown"; + } + break; + + case METRIC_ib_hca_fw_ver: + atom->cp = hca->ca.fw_ver; + break; + + case METRIC_ib_port_gid_prefix: + atom->ull = mad_get_field64(pst->portinfo, 0, IB_PORT_GID_PREFIX_F); + break; + + case METRIC_ib_port_rate: + atom->l = ib_linkwidth(pst) * + (5 * mad_get_field (pst->portinfo, 0, + IB_PORT_LINK_SPEED_ACTIVE_F))/2; + break; + + case METRIC_ib_port_lid: + atom->l = pst->portid.lid; + break; + + case METRIC_ib_port_capabilities: + atom->cp = ib_portcap_to_string(pst); + break; + + case METRIC_ib_port_phystate: + st = mad_get_field (pst->portinfo, 0, IB_PORT_PHYS_STATE_F); + if (st < ARRAYSZ(port_phystates)) { + atom->cp = port_phystates[st]; + } 
else { + __pmNotifyErr (LOG_INFO, "Unknown port PHY state %d on %s\n", + st, name); + atom->cp = "Unknown"; + } + break; + + case METRIC_ib_port_guid: + atom->ull = pst->guid; + break; + + case METRIC_ib_hca_ca_type: + atom->cp = hca->ca.ca_type; + break; + + case METRIC_ib_port_state: + st = mad_get_field (pst->portinfo, 0, IB_PORT_STATE_F); + if (st < ARRAYSZ(port_states)) { + atom->cp = port_states[st]; + } else { + __pmNotifyErr (LOG_INFO, "Unknown port state %d on %s\n", + st, name); + atom->cp = "Unknown"; + } + break; + + case METRIC_ib_port_linkspeed: + switch ((st = mad_get_field(pst->portinfo, 0, + IB_PORT_LINK_SPEED_ACTIVE_F))) { + case 1: + atom->cp = "2.5 Gpbs"; + break; + case 2: + atom->cp = "5.0 Gbps"; + break; + case 4: + atom->cp = "10.0 Gbps"; + break; + default: + __pmNotifyErr (LOG_INFO, "Unknown link speed %d on %s\n", + st, name); + atom->cp = "Unknown"; + break; + } + break; + + case METRIC_ib_port_linkwidth: + atom->l = ib_linkwidth(pst); + break; + + default: + rv = PM_ERR_PMID; + break; + } + break; + + case 1: /* Fetch values from mad rpc response */ + if ((udata >= 0) && (udata < ARRAYSZ(mad_cnt_descriptors))) { + /* If a metric has udata set then it's one of the "direct" + * metrics - just update the accumulated counter + * and stuff its value into pmAtomValue */ + switch (mdesc->m_desc.type) { + case PM_TYPE_32: + atom->l = (int32_t)ib_update_perfcnt (pst, udata, &rv); + break; + case PM_TYPE_64: + atom->ll = ib_update_perfcnt (pst, udata, &rv); + break; + default: + rv = PM_ERR_INST; + break; + } + } else { + int rv1=0, rv2=0; + /* Synthetic metrics */ + switch (idp->item) { + case METRIC_ib_port_total_bytes: + atom->ll = ib_update_perfcnt (pst, IBPMDA_XMT_BYTES, &rv1) + + ib_update_perfcnt (pst, IBPMDA_RCV_BYTES, &rv2); + break; + + case METRIC_ib_port_total_packets: + atom->ll = ib_update_perfcnt (pst, IBPMDA_XMT_PKTS, &rv1) + + ib_update_perfcnt (pst, IBPMDA_RCV_PKTS, &rv2); + break; + + case METRIC_ib_port_total_errors_drop: + 
atom->l = (int)(ib_update_perfcnt (pst, IBPMDA_ERR_SWITCH_REL, &rv1) + + ib_update_perfcnt (pst, IBPMDA_XMT_DISCARDS, &rv2)); + break; + + case METRIC_ib_port_total_errors_filter: + atom->l = (int)(ib_update_perfcnt (pst, IBPMDA_ERR_XMTCONSTR, &rv1) + + ib_update_perfcnt (pst, IBPMDA_ERR_RCVCONSTR, &rv2)); + break; + + default: + rv = PM_ERR_PMID; + break; + } + + if ((rv1 < 0) || (rv2 < 0)) { + rv = (rv1 < 0) ? rv1 : rv2; + } + } + break; + + case 2: /* Control structures */ + switch (idp->item) { + case METRIC_ib_control_query_timeout: + atom->l = pst->timeout; + break; + + case METRIC_ib_control_hiwat: + if (inst < ARRAYSZ(mad_cnt_descriptors)) { + atom->ul = mad_cnt_descriptors[inst].hiwat; + } else { + rv = PM_ERR_INST; + } + break; + + default: + rv = PM_ERR_PMID; + break; + } + break; + + case 3: /* Fetch values from switch response */ + +#ifdef HAVE_PMA_QUERY_VIA + + // (The values are "swapped" because what the port receives is what the + // switch sends, and vice versa.) + switch (idp->item) { + case METRIC_ib_port_switch_in_bytes: { + mad_decode_field(pst->switchperfdata, + IB_PC_EXT_XMT_BYTES_F, &atom->ull); + atom->ull *= 4; // TODO: programmatically determine link width + break; + } + case METRIC_ib_port_switch_in_packets: { + mad_decode_field(pst->switchperfdata, + IB_PC_EXT_XMT_PKTS_F, &atom->ull); + break; + } + case METRIC_ib_port_switch_out_bytes: { + mad_decode_field(pst->switchperfdata, + IB_PC_EXT_RCV_BYTES_F, &atom->ull); + atom->ull *= 4; // TODO: programmatically determine link width + break; + } + case METRIC_ib_port_switch_out_packets: { + mad_decode_field(pst->switchperfdata, + IB_PC_EXT_RCV_PKTS_F, &atom->ull); + break; + } + case METRIC_ib_port_switch_total_bytes: { + uint64_t sw_rx_bytes, sw_tx_bytes; + int ib_lw; + mad_decode_field(pst->switchperfdata, + IB_PC_EXT_RCV_BYTES_F, &sw_rx_bytes); + mad_decode_field(pst->switchperfdata, + IB_PC_EXT_XMT_BYTES_F, &sw_tx_bytes); + ib_lw = 4; // TODO: programmatically determine link width + 
atom->ull = (sw_rx_bytes * ib_lw) + (sw_tx_bytes * ib_lw); + break; + } + case METRIC_ib_port_switch_total_packets: { + uint64_t sw_rx_packets, sw_tx_packets; + mad_decode_field(pst->switchperfdata, + IB_PC_EXT_RCV_PKTS_F, &sw_rx_packets); + mad_decode_field(pst->switchperfdata, + IB_PC_EXT_XMT_PKTS_F, &sw_tx_packets); + atom->ull = sw_rx_packets + sw_tx_packets; + break; + } + default: { + rv = PM_ERR_PMID; + break; + } + } +#else + + return PM_ERR_VALUE; + +#endif + break; + + default: + rv = PM_ERR_PMID; + break; + } + + return rv; +} + +/* Per-instance walk callback: arm the needsupdate/needupdate flags in the instance's + * state and mark only the timeout as valid. The actual updating is done in the fetch function */ +void +ib_rearm_for_update(void *state) +{ + port_state_t *pst = state; + + pst->lport->needsupdate = 1; + + pst->needupdate = IB_PORTINFO_UPDATE | IB_HCA_PERF_UPDATE | IB_SWITCH_PERF_UPDATE; + pst->validstate = 4; /* 0x4 for timeout which is always valid */ +} +/* Per-instance walk callback run after a fetch. If counters are flagged in resetmask and the port has a non-zero LID, issue port_perf_reset(); when that call returns non-zero, forget prev/isvalid for every flagged counter. Then clear resetmask and, for each counter still marked valid, remember cur as prev and clear its valid flag. */ +void +ib_reset_perfcounters (void *state) +{ + int m; + port_state_t *pst = state; + + if (pst->resetmask && (pst->portid.lid != 0)) { + memset (pst->perfdata, 0, sizeof (pst->perfdata)); + + if (port_perf_reset(pst->perfdata, &pst->portid, pst->remport, + pst->resetmask, pst->timeout, pst->lport->hndl)) { + int j; + + for (j=0; j < ARRAYSZ(mad_cnt_descriptors); j++) { + if (pst->resetmask & (1<<j)) { + pst->madcnts[j].prev = 0; + pst->madcnts[j].isvalid = 0; + } + } + } + } + pst->resetmask = 0; + + for (m=0; m < ARRAYSZ(mad_cnt_descriptors); m++) { + if (pst->madcnts[m].isvalid) { + pst->madcnts[m].prev = pst->madcnts[m].cur; + pst->madcnts[m].isvalid = 0; + } + } +} +/* Store callback. Only cluster 2 (control) metrics with in-situ values can be written: any other cluster or item returns -EACCES, a non in-situ value set returns -EINVAL. query_timeout is stored in the state of the addressed port instance (PM_ERR_INST if that instance is not active in the port indom cache); hiwat is stored in mad_cnt_descriptors[] indexed by instance id (PM_ERR_INST if the id is out of range). Returns 0 when every value was stored. */ +int +ib_store(pmResult *result, pmdaExt *pmda) +{ + int i; + + for (i = 0; i < result->numpmid ; i++) { + pmValueSet *vs = result->vset[i]; + __pmID_int *pmidp = (__pmID_int *)&vs->pmid; + int inst; + + if (pmidp->cluster != 2) { + return (-EACCES); + } + + if (vs->valfmt != PM_VAL_INSITU) { + return (-EINVAL); + } + + for (inst=0; inst < vs->numval; inst++) { + int id = 
vs->vlist[inst].inst; + void *closure = NULL; + + switch (pmidp->item) { + case METRIC_ib_control_query_timeout: + if (pmdaCacheLookup (pmda->e_indoms[IB_PORT_INDOM].it_indom, + id, NULL, &closure) == PMDA_CACHE_ACTIVE) { + port_state_t *pst = closure; + pst->timeout = vs->vlist[inst].value.lval; + } else { + return (PM_ERR_INST); + } + break; + + case METRIC_ib_control_hiwat: /* id is used as an array index below, so id == it_numinst must be rejected as well */ + if ((id < 0) || + (id >= pmda->e_indoms[IB_CNT_INDOM].it_numinst)) { + return (PM_ERR_INST); + } + + mad_cnt_descriptors[id].hiwat = (uint32_t)vs->vlist[inst].value.lval; + break; + + default: + return (-EACCES); + } + } + } + return 0; +} + diff --git a/src/pmdas/infiniband/ibpmda.h b/src/pmdas/infiniband/ibpmda.h new file mode 100644 index 0000000..a9bbafa --- /dev/null +++ b/src/pmdas/infiniband/ibpmda.h @@ -0,0 +1,125 @@ +/* + * Copyright (C) 2013 Red Hat. + * Copyright (C) 2007,2008 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
+ */ + +#ifndef _IBPMDA_H +#define _IBPMDA_H + +#include <pcp/pmapi.h> +#include <pcp/impl.h> +#include <pcp/pmda.h> +#include "domain.h" +/* port_perf_query/port_perf_reset: map onto the port_performance_*_via calls when HAVE_PORT_PERFORMANCE_QUERY_VIA is defined, otherwise onto pma_query_via/performance_reset_via with IB_GSI_PORT_COUNTERS */ +#ifdef HAVE_PORT_PERFORMANCE_QUERY_VIA +#define port_perf_query(data, dst, port, timeout, srcport) \ + port_performance_query_via(data, dst, port, timeout, srcport) +#define port_perf_reset(data, dst, port, mask, timeout, srcport) \ + port_performance_reset_via(data, dst, port, mask, timeout, srcport) +#else +#define port_perf_query(data, dst, port, timeout, srcport) \ + pma_query_via(data, dst, port, timeout, IB_GSI_PORT_COUNTERS, srcport) +#define port_perf_reset(data, dst, port, mask, timeout, srcport) \ + performance_reset_via(data, dst, port, mask, timeout, IB_GSI_PORT_COUNTERS, srcport) +#endif + + +void ibpmda_init (const char *configpath, int, pmdaInterface *); + +int ib_fetch_val(pmdaMetric *, unsigned int, pmAtomValue *); +int ib_load_config(const char *, int, pmdaIndom *, unsigned int); +void ib_rearm_for_update(void *); +void ib_reset_perfcounters (void *); +int ib_store(pmResult *, pmdaExt *); +/* Instance domain serial numbers; also used as indexes into the PMDA's indom table */ +#define IB_HCA_INDOM 0 +#define IB_PORT_INDOM 1 +#define IB_CNT_INDOM 2 +/* Update-request bits OR-ed into a port's needupdate mask */ +#define IB_PORTINFO_UPDATE 0x1 +#define IB_HCA_PERF_UPDATE 0x2 +#define IB_SWITCH_PERF_UPDATE 0x8 +/* Element count of a true array (not a pointer); the argument is parenthesised so any array expression is safe */ +#define ARRAYSZ(a) (sizeof(a)/sizeof((a)[0])) +/* HCA and port description items, cluster #0 */ +#define METRIC_ib_hca_type 0 +#define METRIC_ib_hca_ca_type 1 +#define METRIC_ib_hca_numports 2 +#define METRIC_ib_hca_fw_ver 3 +#define METRIC_ib_hca_hw_ver 4 +#define METRIC_ib_hca_node_guid 5 +#define METRIC_ib_hca_system_guid 6 +#define METRIC_ib_port_guid 7 +#define METRIC_ib_port_gid_prefix 8 +#define METRIC_ib_port_lid 9 +#define METRIC_ib_port_state 10 +#define METRIC_ib_port_phystate 11 +#define METRIC_ib_port_rate 12 +#define METRIC_ib_port_capabilities 13 +#define METRIC_ib_port_linkspeed 14 +#define METRIC_ib_port_linkwidth 15 + +/* Per-port performance counters, cluster #1 */ +#define METRIC_ib_port_in_bytes 0 +#define METRIC_ib_port_in_packets 1 +#define METRIC_ib_port_out_bytes 2 +#define 
METRIC_ib_port_out_packets 3 +#define METRIC_ib_port_in_errors_drop 4 +#define METRIC_ib_port_out_errors_drop 5 +#define METRIC_ib_port_total_bytes 6 +#define METRIC_ib_port_total_packets 7 +#define METRIC_ib_port_total_errors_drop 8 +#define METRIC_ib_port_in_errors_filter 9 +#define METRIC_ib_port_in_errors_local 10 +#define METRIC_ib_port_in_errors_remote 11 +#define METRIC_ib_port_out_errors_filter 12 +#define METRIC_ib_port_total_errors_filter 13 +#define METRIC_ib_port_total_errors_link 14 +#define METRIC_ib_port_total_errors_recover 15 +#define METRIC_ib_port_total_errors_integrity 16 +#define METRIC_ib_port_total_errors_vl15 17 +#define METRIC_ib_port_total_errors_overrun 18 +#define METRIC_ib_port_total_errors_symbol 19 + +/* Control metrics, cluster #2 */ +#define METRIC_ib_control_query_timeout 0 +#define METRIC_ib_control_hiwat 1 + +/* Per-port switch performance counters, cluster #3 */ +#define METRIC_ib_port_switch_in_bytes 0 +#define METRIC_ib_port_switch_in_packets 1 +#define METRIC_ib_port_switch_out_bytes 2 +#define METRIC_ib_port_switch_out_packets 3 +#define METRIC_ib_port_switch_total_bytes 4 +#define METRIC_ib_port_switch_total_packets 5 +/* Counter ids: stored as user data in the metric table and passed to ib_update_perfcnt() */ +enum ibpmd_cndid { + IBPMDA_ERR_SYM = 0, + IBPMDA_LINK_RECOVERS, + IBPMDA_LINK_DOWNED, + IBPMDA_ERR_RCV, + IBPMDA_ERR_PHYSRCV, + IBPMDA_ERR_SWITCH_REL, + IBPMDA_XMT_DISCARDS, + IBPMDA_ERR_XMTCONSTR, + IBPMDA_ERR_RCVCONSTR, + IBPMDA_ERR_LOCALINTEG, + IBPMDA_ERR_EXCESS_OVR, + IBPMDA_VL15_DROPPED, + IBPMDA_XMT_BYTES, + IBPMDA_RCV_BYTES, + IBPMDA_XMT_PKTS, + IBPMDA_RCV_PKTS +}; + +#endif /* _IBPMDA_H */ diff --git a/src/pmdas/infiniband/pmda.c b/src/pmdas/infiniband/pmda.c new file mode 100644 index 0000000..20c5ba1 --- /dev/null +++ b/src/pmdas/infiniband/pmda.c @@ -0,0 +1,418 @@ +/* + * Copyright (C) 2013 Red Hat. + * Copyright (C) 2007,2008 Silicon Graphics, Inc. All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + */ + +#include "ibpmda.h" + +/* + * Metric Table: the first member of each entry is per-metric user data - a counter id from enum ibpmd_cndid for metrics backed by a single counter, (void *)-1 for the synthetic totals, NULL elsewhere + */ +pmdaMetric metrictab[] = { + /* infiniband.hca.type */ + { NULL, + {PMDA_PMID(0,METRIC_ib_hca_type), + PM_TYPE_STRING, IB_HCA_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) }, }, + + /* infiniband.hca.ca_type */ + { NULL, + {PMDA_PMID(0,METRIC_ib_hca_ca_type), + PM_TYPE_STRING, IB_HCA_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) }, }, + + /* infiniband.hca.numports */ + { NULL, + {PMDA_PMID(0,METRIC_ib_hca_numports), + PM_TYPE_32, IB_HCA_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) }, }, + + /* infiniband.hca.fw_ver */ + { NULL, + {PMDA_PMID(0,METRIC_ib_hca_fw_ver), + PM_TYPE_STRING, IB_HCA_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) }, }, + + /* infiniband.hca.hw_ver */ + { NULL, + {PMDA_PMID(0,METRIC_ib_hca_hw_ver), + PM_TYPE_STRING, IB_HCA_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) }, }, + + /* infiniband.hca.node_guid */ + { NULL, + {PMDA_PMID(0,METRIC_ib_hca_node_guid), + PM_TYPE_U64, IB_HCA_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) }, }, + + /* infiniband.hca.system_guid */ + { NULL, + {PMDA_PMID(0,METRIC_ib_hca_system_guid), + PM_TYPE_U64, IB_HCA_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) }, }, + + /* infiniband.port.guid */ + { NULL, + {PMDA_PMID(0,METRIC_ib_port_guid), + PM_TYPE_U64, IB_PORT_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) }, }, + + /* infiniband.port.gid_prefix */ + { NULL, + {PMDA_PMID(0,METRIC_ib_port_gid_prefix), + 
PM_TYPE_U64, IB_PORT_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) }, }, + + /* infiniband.port.lid */ + { NULL, + {PMDA_PMID(0,METRIC_ib_port_lid), + PM_TYPE_32, IB_PORT_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) }, }, + + /* infiniband.port.state */ + { NULL, + {PMDA_PMID(0,METRIC_ib_port_state), + PM_TYPE_STRING, IB_PORT_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) }, }, + + /* infiniband.port.phystate */ + { NULL, + {PMDA_PMID(0,METRIC_ib_port_phystate), + PM_TYPE_STRING, IB_PORT_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) }, }, + + /* infiniband.port.rate */ + { NULL, + {PMDA_PMID(0,METRIC_ib_port_rate), + PM_TYPE_32, IB_PORT_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) }, }, + + /* infiniband.port.capabilities */ + { NULL, + {PMDA_PMID(0,METRIC_ib_port_capabilities), + PM_TYPE_STRING, IB_PORT_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) }, }, + + /* infiniband.port.linkspeed */ + { NULL, + {PMDA_PMID(0,METRIC_ib_port_linkspeed), + PM_TYPE_STRING, IB_PORT_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) }, }, + + /* infiniband.port.linkwidth */ + { NULL, + {PMDA_PMID(0,METRIC_ib_port_linkwidth), + PM_TYPE_32, IB_PORT_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) }, }, + + /* infiniband.port.in.bytes */ + { (void *)IBPMDA_RCV_BYTES, + {PMDA_PMID(1,METRIC_ib_port_in_bytes), + PM_TYPE_64, IB_PORT_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(1,0,0,PM_SPACE_BYTE,0,0) }, }, + + /* infiniband.port.in.packets */ + { (void *)IBPMDA_RCV_PKTS, + {PMDA_PMID(1,METRIC_ib_port_in_packets), + PM_TYPE_64, IB_PORT_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,0) }, }, + + /* infiniband.port.in.errors.drop */ + { (void *)IBPMDA_ERR_SWITCH_REL, + {PMDA_PMID(1,METRIC_ib_port_in_errors_drop), + PM_TYPE_32, IB_PORT_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,0) } }, + + /* infiniband.port.in.errors.filter */ + { (void *)IBPMDA_ERR_RCVCONSTR, + {PMDA_PMID(1,METRIC_ib_port_in_errors_filter), + PM_TYPE_32, IB_PORT_INDOM, 
PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,0) } }, + + /* infiniband.port.in.errors.local */ + { (void *)IBPMDA_ERR_RCV, + {PMDA_PMID(1,METRIC_ib_port_in_errors_local), + PM_TYPE_32, IB_PORT_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,0) } }, + + /* infiniband.port.in.errors.remote */ + { (void *)IBPMDA_ERR_PHYSRCV, + {PMDA_PMID(1,METRIC_ib_port_in_errors_remote), + PM_TYPE_32, IB_PORT_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,0) } }, + + /* infiniband.port.out.bytes */ + { (void *)IBPMDA_XMT_BYTES, + {PMDA_PMID(1,METRIC_ib_port_out_bytes), + PM_TYPE_64, IB_PORT_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(1,0,0,PM_SPACE_BYTE,0,0) }, }, + + /* infiniband.port.out.packets */ + {(void *)IBPMDA_XMT_PKTS, + {PMDA_PMID(1,METRIC_ib_port_out_packets), + PM_TYPE_64, IB_PORT_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,0) }, }, + + /* infiniband.port.out.errors.drop */ + { (void *)IBPMDA_XMT_DISCARDS, + {PMDA_PMID(1,METRIC_ib_port_out_errors_drop), + PM_TYPE_32, IB_PORT_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,0) } }, + + /* infiniband.port.out.errors.filter */ + { (void *)IBPMDA_ERR_XMTCONSTR, + {PMDA_PMID(1,METRIC_ib_port_out_errors_filter), + PM_TYPE_32, IB_PORT_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,0) } }, + + /* infiniband.port.total.bytes */ + { (void *)-1, + {PMDA_PMID(1,METRIC_ib_port_total_bytes), + PM_TYPE_64, IB_PORT_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(1,0,0,PM_SPACE_BYTE,0,0) } }, + + /* infiniband.port.total.packets */ + { (void *)-1, + {PMDA_PMID(1,METRIC_ib_port_total_packets), + PM_TYPE_64, IB_PORT_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,0) } }, + + /* infiniband.port.total.errors.drop */ + { (void *)-1, + {PMDA_PMID(1,METRIC_ib_port_total_errors_drop), + PM_TYPE_32, IB_PORT_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,0) } }, + + /* infiniband.port.total.errors.filter */ + { (void *)-1, + {PMDA_PMID(1,METRIC_ib_port_total_errors_filter), + PM_TYPE_32, IB_PORT_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,0) } }, + + /* 
infiniband.port.total.errors.link */ + { (void *)IBPMDA_LINK_DOWNED, + {PMDA_PMID(1,METRIC_ib_port_total_errors_link), + PM_TYPE_32, IB_PORT_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,0) } }, + + /* infiniband.port.total.errors.recover */ + { (void *)IBPMDA_LINK_RECOVERS, + {PMDA_PMID(1,METRIC_ib_port_total_errors_recover), + PM_TYPE_32, IB_PORT_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,0) } }, + + /* infiniband.port.total.errors.integrity */ + { (void *)IBPMDA_ERR_LOCALINTEG, + {PMDA_PMID(1,METRIC_ib_port_total_errors_integrity), + PM_TYPE_32, IB_PORT_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,0) } }, + + /* infiniband.port.total.errors.vl15 */ + { (void *)IBPMDA_VL15_DROPPED, + {PMDA_PMID(1,METRIC_ib_port_total_errors_vl15), + PM_TYPE_32, IB_PORT_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,0) } }, + + /* infiniband.port.total.errors.overrun */ + { (void *)IBPMDA_ERR_EXCESS_OVR, + {PMDA_PMID(1,METRIC_ib_port_total_errors_overrun), + PM_TYPE_32, IB_PORT_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,0) } }, + + /* infiniband.port.total.errors.symbol */ + { (void *)IBPMDA_ERR_SYM, + {PMDA_PMID(1,METRIC_ib_port_total_errors_symbol), + PM_TYPE_32, IB_PORT_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,0) } }, + + /* infiniband.control.query_timeout */ + { NULL, + {PMDA_PMID(2,METRIC_ib_control_query_timeout), + PM_TYPE_32, IB_PORT_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) } }, + + /* infiniband.control.hiwat */ + { NULL, + {PMDA_PMID(2,METRIC_ib_control_hiwat), + PM_TYPE_U32, IB_CNT_INDOM, PM_SEM_DISCRETE, + PMDA_PMUNITS(0,0,0,0,0,0) } }, + + /* infiniband.port.switch.in.bytes */ + { NULL, + {PMDA_PMID(3,METRIC_ib_port_switch_in_bytes), + PM_TYPE_U64, IB_PORT_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(1,0,0,PM_SPACE_BYTE,0,0) } }, + + /* infiniband.port.switch.in.packets */ + { NULL, + {PMDA_PMID(3,METRIC_ib_port_switch_in_packets), + PM_TYPE_U64, IB_PORT_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,0) } }, + + /* 
infiniband.port.switch.out.bytes */ + { NULL, + {PMDA_PMID(3,METRIC_ib_port_switch_out_bytes), + PM_TYPE_U64, IB_PORT_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(1,0,0,PM_SPACE_BYTE,0,0) } }, + + /* infiniband.port.switch.out.packets */ + { NULL, + {PMDA_PMID(3,METRIC_ib_port_switch_out_packets), + PM_TYPE_U64, IB_PORT_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,0) } }, + + /* infiniband.port.switch.total.bytes */ + { NULL, + {PMDA_PMID(3,METRIC_ib_port_switch_total_bytes), + PM_TYPE_U64, IB_PORT_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(1,0,0,PM_SPACE_BYTE,0,0) } }, + + /* infiniband.port.switch.total.packets */ + { NULL, + {PMDA_PMID(3,METRIC_ib_port_switch_total_packets), + PM_TYPE_U64, IB_PORT_INDOM, PM_SEM_COUNTER, + PMDA_PMUNITS(0,0,1,0,0,0) } }, +}; + /* Instance domain table, one entry per IB_*_INDOM serial */ +pmdaIndom indomtab[] = { + { IB_HCA_INDOM, 0, NULL }, + { IB_PORT_INDOM, 0, NULL }, + { IB_CNT_INDOM, 0, NULL }, +}; + /* Walk the cache of the given indom and call cb with the private state of each instance returned by the walk; abort() if a walked instance does not look up as an active cache entry */ +static void +foreach_inst(pmInDom indom, void (*cb)(void *state)) +{ + int i; + pmdaCacheOp (indom, PMDA_CACHE_WALK_REWIND); + + while ((i = pmdaCacheOp (indom, PMDA_CACHE_WALK_NEXT)) >= 0) { + void *state = NULL; + + if (pmdaCacheLookup (indom, i, NULL, &state) != PMDA_CACHE_ACTIVE) { + abort(); + } + cb (state); + } +} + /* Fetch entry point: re-arm every port instance for update, run the generic pmdaFetch(), then run ib_reset_perfcounters on every port instance; returns the pmdaFetch() result */ +int +ib_fetch(int numpmid, pmID pmidlist[], pmResult **resp, pmdaExt *pmda) +{ + int rv; + + foreach_inst (indomtab[IB_PORT_INDOM].it_indom, ib_rearm_for_update); + rv = pmdaFetch (numpmid, pmidlist, resp, pmda); + foreach_inst (indomtab[IB_PORT_INDOM].it_indom, ib_reset_perfcounters); + + return (rv); +} + /* PMDA initialisation. Returns at once if dp->status is already non-zero. With confpath == NULL the config file defaults to <PCP_PMDAS_DIR>/infiniband/config. The indom caches (all except IB_CNT_INDOM) are loaded before and saved after ib_load_config(); a non-zero ib_load_config() result is left in dp->status and aborts the setup. Finally the fetch/store callbacks and the metric and indom tables are registered. */ +void +ibpmda_init(const char *confpath, int writeconf, pmdaInterface *dp) +{ + char defconf[MAXPATHLEN]; + int sep = __pmPathSeparator(); + int i; + + if (dp->status != 0) + return; + + if (confpath == NULL) { + snprintf(defconf, sizeof(defconf), "%s%c" "infiniband" "%c" "config", + pmGetConfig("PCP_PMDAS_DIR"), sep, sep); + confpath = defconf; + } + + for (i=0; i < ARRAYSZ(indomtab); i++) { + __pmindom_int(&indomtab[i].it_indom)->domain = dp->domain; + if (IB_CNT_INDOM != 
__pmindom_int(&indomtab[i].it_indom)->serial) { + pmdaCacheOp (indomtab[i].it_indom, PMDA_CACHE_LOAD); + } + } + + + if ((dp->status = ib_load_config(confpath, writeconf, indomtab, ARRAYSZ(indomtab)))) + return; + + for (i=0; i < ARRAYSZ(indomtab); i++) { + if (IB_CNT_INDOM != __pmindom_int(&indomtab[i].it_indom)->serial) { + pmdaCacheOp (indomtab[i].it_indom, PMDA_CACHE_SAVE); + } + } + + dp->version.two.fetch = ib_fetch; + dp->version.two.store = ib_store; + pmdaSetFetchCallBack(dp, ib_fetch_val); + + pmdaInit(dp, indomtab, ARRAYSZ(indomtab), metrictab, ARRAYSZ(metrictab)); +} + /* Print the option summary to stderr and exit with status 1 */ +static void +usage(void) +{ + fprintf(stderr, "Usage: %s [options]\n\n", pmProgname); + fputs("Options:\n" + " -d domain use domain (numeric) for metrics domain of PMDA\n" + " -l logfile write log into logfile rather than using default log name\n" + " -c path to configuration file\n" + " -w write the basic configuration file\n", + stderr); + exit(1); +} + /* Daemon entry point: set up the dispatch table with the help file under <PCP_PMDAS_DIR>/infiniband, parse options (-c config path, -w write config, anything else not consumed by pmdaGetOpt is an error), open the log unless -w was given, then initialise, connect and run the PMDA main loop */ +int +main(int argc, char **argv) +{ + int err = 0; + int sep = __pmPathSeparator(); + pmdaInterface dispatch; + char helppath[MAXPATHLEN]; + char *confpath = NULL; + int opt; + int writeconf = 0; + + __pmSetProgname(argv[0]); + snprintf(helppath, sizeof(helppath), "%s%c" "infiniband" "%c" "help", + pmGetConfig("PCP_PMDAS_DIR"), sep, sep); + pmdaDaemon(&dispatch, PMDA_INTERFACE_3, pmProgname, IB, "infiniband.log", helppath); + + while ((opt = pmdaGetOpt(argc, argv, "D:c:d:l:w?", &dispatch, &err)) != EOF) { + switch (opt) { + case 'c': + confpath = optarg; + break; + case 'w': + writeconf = 1; + break; + default: + err++; + } + } + + if (err) { + usage(); + } + + if (!writeconf) { + /* If writeconf is specified, then errors should go to stdout + * since the PMDA daemon will exit immediately after writing + * out the default config file + */ + pmdaOpenLog(&dispatch); + } + ibpmda_init(confpath, writeconf, &dispatch); + pmdaConnect(&dispatch); + pmdaMain(&dispatch); + exit(0); +} diff --git a/src/pmdas/infiniband/pmns 
b/src/pmdas/infiniband/pmns new file mode 100644 index 0000000..b42919d --- /dev/null +++ b/src/pmdas/infiniband/pmns @@ -0,0 +1,100 @@ + +infiniband { + hca + port + control +} + +infiniband.hca { + type IB:0:0 + ca_type IB:0:1 + numports IB:0:2 + fw_ver IB:0:3 + hw_ver IB:0:4 + node_guid IB:0:5 + system_guid IB:0:6 +} + +infiniband.port { + guid IB:0:7 + gid_prefix IB:0:8 + lid IB:0:9 + state IB:0:10 + phystate IB:0:11 + rate IB:0:12 + capabilities IB:0:13 + linkspeed IB:0:14 + linkwidth IB:0:15 + + in + out + total + switch +} + +infiniband.port.in { + bytes IB:1:0 + packets IB:1:1 + errors +} + +infiniband.port.in.errors { + drop IB:1:4 + filter IB:1:9 + local IB:1:10 + remote IB:1:11 +} + +infiniband.port.out { + bytes IB:1:2 + packets IB:1:3 + errors +} + +infiniband.port.out.errors { + drop IB:1:5 + filter IB:1:12 +} + +infiniband.port.total { + bytes IB:1:6 + packets IB:1:7 + errors +} + +infiniband.port.total.errors { + drop IB:1:8 + filter IB:1:13 + link IB:1:14 + recover IB:1:15 + integrity IB:1:16 + vl15 IB:1:17 + overrun IB:1:18 + symbol IB:1:19 +} + +infiniband.port.switch { + in + out + total +} + +infiniband.port.switch.in { + bytes IB:3:0 + packets IB:3:1 +} + +infiniband.port.switch.out { + bytes IB:3:2 + packets IB:3:3 +} + +infiniband.port.switch.total { + bytes IB:3:4 + packets IB:3:5 +} + +infiniband.control { + query_timeout IB:2:0 + hiwat IB:2:1 +} diff --git a/src/pmdas/infiniband/root b/src/pmdas/infiniband/root new file mode 100644 index 0000000..460798c --- /dev/null +++ b/src/pmdas/infiniband/root @@ -0,0 +1,9 @@ +/* + * fake "root" for validating the local PMNS subtree + */ + +#include <stdpmid> + +root { infiniband } + +#include "pmns" |