/* usr/src/cmd/fm/eversholt/files/i386/i86pc/gcpu_amd.esc */

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Eversholt rules for generic AMD with on-chip memory-controller(s), as seen
 * in AMD family 0xf and 0x10.
 *
 * In the absence of any model-specific support, any memory errors that
 * are observed via MCA (typically through an on-chip memory-controller)
 * will surface as ereport.cpu.generic-x86.bus_interconnect_memory[_uc]
 * ereports and are diagnosed via generic rules in gcpu.esc.
 *
 * If full model-specific support is available, including full NorthBridge
 * support, then memory ereports will surface in a more-specific subclass
 * such as ereport.cpu.amd.mem_ce; these are diagnosed in amd64.esc.
 *
 * In the case where some "vendor generic" support is present, memory errors
 * are reported as ereport.cpu.generic-x86.mem_{ce,ue} and include a
 * syndrome and syndrome-type, and usually also a resource FMRI to identify
 * the affected resource.  In the AMD case a resource FMRI is included for
 * those chip versions that include an Online Spare Control register; this
 * register provides counts of ECC errors seen per channel and chip-select
 * on a NorthBridge node.  The resource FMRI has form
 * 	hc:///motherboard/chip/memory-controller/dram-channel/chip-select
 * in these cases.
 */

/*
 * Name the dictionary ("GMCA") associated with this rules file.
 * NOTE(review): presumably the message dictionary from which diagnosis
 * codes for the faults declared below are drawn - confirm.
 */
#pragma dictionary "GMCA"

/*
 * The number of pages that must be faulted on a chip-select for repeated
 * correctable errors before we will consider one of the component dimms
 * faulty.  Compared against count(stat.cepgflt@CSPATH) by
 * CS_DIMMSB_THRESH_REACHED, and also used to derive CS_PAGEFLT_MAX.
 */
#define	CS_DIMMSB_THRESH	64

/*
 * The maximum number of pages we will diagnose as faulty on any one
 * chip-select (must be at least CS_DIMMSB_THRESH, otherwise page faults
 * would stop being diagnosed before the dimm threshold could be reached).
 * If a chip-select has a fault that will affect zillions of pages this
 * limit stops us diagnosing excessive numbers of page faults.
 */
#define	CS_PAGEFLT_MAX		(2 * CS_DIMMSB_THRESH)

/*
 * SERD parameters for individual page faults.  When more than PAGE_SB_COUNT
 * correctable ereports are experienced on a single chip-select within
 * PAGE_SB_TIME the engine will fire and we will fault the most recent
 * page.  The same N and T values are also given to the per-chip-select
 * dimm_ce SERD engine.
 */
#define	PAGE_SB_COUNT		3
#define	PAGE_SB_TIME		24h

/*
 * Topology path of a chip-select.  The memory fault events, SERD engines
 * and stat counter declared in this file all use this path.
 */
#define	CSPATH	chip/memory-controller/dram-channel/chip-select

/*
 * ADDR_VALID is true if the ereport payload includes IA32_MCi_ADDR.
 */
#define	ADDR_VALID (payloadprop_defined("IA32_MCi_ADDR"))

/*
 * CONTAINS_CS is true if the resource nvlist array exists and one of its
 * members matches the chip-select path.  This is used to constrain
 * propagations to those for which a resource element matches the
 * chip-select path of the propagation.  This is necessary because the
 * detector element of memory ereports is a cpu and not the chip-select itself.
 */
#define	CONTAINS_CS (payloadprop_contains("resource", asru(CSPATH)))

/*
 * SET_ADDR calls setpayloadprop() to set "asru-physaddr" from the ereport's
 * IA32_MCi_ADDR payload member.  It is used as the final term of the page
 * fault constraints, after ADDR_VALID has already been tested.
 */
#define	SET_ADDR (setpayloadprop("asru-physaddr", payloadprop("IA32_MCi_ADDR")))
/*
 * Generic memory ereports: mem_ce for correctable and mem_ue for
 * uncorrectable memory errors.  Both are declared at the chip/core/strand
 * path (the detector of these ereports is a cpu, not a chip-select) and
 * both carry a within(1s) constraint.
 */
event ereport.cpu.generic-x86.mem_ce@chip/core/strand { within(1s) };
event ereport.cpu.generic-x86.mem_ue@chip/core/strand { within(1s) };

/*
 *	 ========= Propagations for correctable memory faults ==========
 *	|								|	
 *	| Discard mem_ce with no resource in the ereport payload.	|
 *	| Discard mem_ce with no address info - we can't fault the	|
 *	| corresponding page without it.				|
 *	|								|
 *	| For a mem_ce ereport detected by a given chip/cpu (as per	|
 *	| the payload detector info) whose resource payload member	|
 *	| includes a chip/memory-controller/dram-channel/chip-select	|
 *	| (CSPATH) for the same chip number, diagnose to a fault event	|
 *	| associated with a per-CSPATH SERD engine as long as we are	|
 *	| below the page fault limit for this CSPATH (defined below);	|
 *	| if we are over that limit then discard the event since we	|
 *	| will already have faulted a dimm and there is no point in	|
 *	| continuing to diagnose endless page faults from a dimm with	|
 *	| something like a pin failure.					|
 *	|								|
 *	| When the per-CSPATH SERD engine fires we fault the page	|
 *	| containing the address included in the ereport that caused	|
 *	| the trip, and increment a per-CSPATH counter to count page	|
 *	| faults on that chip-select from repeated correctable errors.	|
 *	|								|
 *	| A dimm_ce fault is diagnosed when we have faulted an		|
 *	| excessive number of page_ce faults on a chip-select - at	|
 *	| least CS_DIMMSB_THRESH.					|
 *	|===============================================================| 
 */

/*
 * Per-chip-select limits, both evaluated against the number of page faults
 * counted so far in stat.cepgflt@CSPATH.
 *
 * CS_PGFLT_LIMIT_REACHED is true once the counter has reached
 * CS_PAGEFLT_MAX, so no further page_ce faults are diagnosed beyond that
 * maximum.  (A strict ">" comparison would permit one more page fault to be
 * diagnosed when the counter already equals the maximum.)
 *
 * CS_DIMMSB_THRESH_REACHED is true once at least CS_DIMMSB_THRESH page
 * faults have been counted.  Since CS_PAGEFLT_MAX >= CS_DIMMSB_THRESH,
 * whenever CS_PGFLT_LIMIT_REACHED is true this is true as well, so a
 * mem_ce ereport for the chip-select can still be explained by dimm_ce.
 */
#define	CS_PGFLT_LIMIT_REACHED (count(stat.cepgflt@CSPATH) >= CS_PAGEFLT_MAX)
#define	CS_DIMMSB_THRESH_REACHED \
	(count(stat.cepgflt@CSPATH) >= CS_DIMMSB_THRESH)

/*
 * stat.cepgflt counts, per chip-select, the page faults diagnosed by the
 * fault events that name it in their count= property.
 */
engine stat.cepgflt@CSPATH;

/*
 * Page fault for repeated correctable errors on a chip-select.  Diagnosis
 * is gated by a per-chip-select SERD engine using PAGE_SB_COUNT and
 * PAGE_SB_TIME.
 */
engine serd.memory.generic-x86.page_ce@CSPATH, N=PAGE_SB_COUNT, T=PAGE_SB_TIME;
event fault.memory.generic-x86.page_ce@CSPATH,
    message=0, response=0,		/* do not message individual pageflts */
    count=stat.cepgflt@CSPATH,		/* increment on pageflt diagnosis */
    engine=serd.memory.generic-x86.page_ce@CSPATH;

/*
 * Dimm fault for a chip-select that has accumulated many page faults.  It
 * has its own per-chip-select SERD engine, configured with the same N and T
 * as the page_ce engine.
 */
engine serd.memory.generic-x86.dimm_ce@CSPATH, N=PAGE_SB_COUNT, T=PAGE_SB_TIME;
event fault.memory.generic-x86.dimm_ce@CSPATH,
    engine=serd.memory.generic-x86.dimm_ce@CSPATH;

/*
 * A mem_ce ereport can be explained by a page_ce fault on a chip-select
 * when the ereport carries an address, its resource array names that
 * chip-select, and the page fault limit for the chip-select has not been
 * reached.  SET_ADDR is the last term so the address is only recorded
 * once the preceding tests have passed.
 */
prop fault.memory.generic-x86.page_ce@CSPATH
    { ADDR_VALID && CONTAINS_CS && !CS_PGFLT_LIMIT_REACHED && SET_ADDR } (1)->
    ereport.cpu.generic-x86.mem_ce@chip/core<>/strand<>;

/*
 * A mem_ce ereport can be explained by a dimm_ce fault on a chip-select
 * when the ereport carries an address, its resource array names that
 * chip-select, and the page fault count for the chip-select has reached
 * CS_DIMMSB_THRESH.
 */
prop fault.memory.generic-x86.dimm_ce@CSPATH
    { ADDR_VALID && CONTAINS_CS && CS_DIMMSB_THRESH_REACHED } (1)->
    ereport.cpu.generic-x86.mem_ce@chip/core<>/strand<>;

/*
 * Discard mem_ce ereports that cannot be used for page diagnosis: those
 * with no resource member in the payload, or with no address.  The upset
 * is declared at the same chip/core/strand path as the ereport.
 */
event upset.memory.generic-x86.discard@chip/core/strand;
prop upset.memory.generic-x86.discard@chip/core/strand
    { !payloadprop_defined("resource") || !ADDR_VALID } (1)->
    ereport.cpu.generic-x86.mem_ce@chip/core/strand;

/*
 *	 ========= Propagations for uncorrectable page faults ==========
 *	|								|
 *	| A UE produces an immediate page fault.			|
 *	|===============================================================|
 */

/*
 * Fault events for uncorrectable errors on a chip-select.  Neither names a
 * SERD engine.
 * NOTE(review): page_ue increments stat.cepgflt, the same counter that
 * CS_PGFLT_LIMIT_REACHED and CS_DIMMSB_THRESH_REACHED test for correctable
 * errors, so UE page faults count toward those limits - confirm intended.
 */
event fault.memory.generic-x86.page_ue@CSPATH,
    message=0, response=0,		/* do not message individual pageflts */
    count=stat.cepgflt@CSPATH;		/* increment on pageflt diagnosis */
event fault.memory.generic-x86.dimm_ue@CSPATH;

/*
 * A mem_ue ereport that carries an address and whose resource array names
 * a chip-select can be explained by a page_ue fault on that chip-select;
 * SET_ADDR records the address once the other tests have passed.
 */
prop fault.memory.generic-x86.page_ue@CSPATH
    { ADDR_VALID && CONTAINS_CS && SET_ADDR } (1)->
    ereport.cpu.generic-x86.mem_ue@chip/core<>/strand<>;

/*
 * The same mem_ue ereport can also be explained by a dimm_ue fault on the
 * chip-select, under the same address and resource tests.
 */
prop fault.memory.generic-x86.dimm_ue@CSPATH
    { ADDR_VALID && CONTAINS_CS } (1)->
    ereport.cpu.generic-x86.mem_ue@chip/core<>/strand<>;

/*
 * Discard mem_ue ereports with no resource member in the payload, or with
 * no address.
 * NOTE(review): unlike the mem_ce discard upset, which is declared at
 * chip/core/strand, this upset is declared at CSPATH even though it covers
 * ereports that name no chip-select resource - confirm this is intended.
 */
event upset.memory.generic-x86.discard3@CSPATH;
prop upset.memory.generic-x86.discard3@CSPATH
    { !payloadprop_defined("resource") || !ADDR_VALID } (1)->
    ereport.cpu.generic-x86.mem_ue@chip/core<>/strand<>;

/*
 *	 ========= Propagations for GART Table Walk Errors =============
 *	|								|
 *	| These are usually due to software mis-programming of the GART	|
 *	| TLB rather than from hardware errors.  It would be incorrect	|
 *	| to fault and potentially offline a cpu in response to these	|
 *	| so they have their own fault class to facilitate us ignoring	|
 *	| them.								|
 *	|===============================================================| 
 */

/*
 * GART table walk ereport and the upset it is diagnosed to.  Both are
 * declared at chip/core/strand; the ereport carries a within(1s)
 * constraint.
 */
event ereport.cpu.generic-x86.gart_tbl_walk@chip/core/strand { within(1s) };
event upset.cpu.generic-x86.gart_tbl_walk@chip/core/strand;

/*
 * Unconstrained propagation: every gart_tbl_walk ereport is explained by
 * the upset on the same chip/core/strand, so no cpu fault is produced.
 */
prop upset.cpu.generic-x86.gart_tbl_walk@chip/core/strand (1)->
    ereport.cpu.generic-x86.gart_tbl_walk@chip/core/strand;