1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
|
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_ASYNC_H
#define _SYS_ASYNC_H
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/privregs.h>
#ifdef __cplusplus
extern "C" {
#endif
#ifndef _ASM
#include <sys/errorq.h>
/*
* The async_flt structure is used to record all pertinent information about
* an asynchronous CPU or bus-related memory error. Typically, the structure
* is initialized by a high-level interrupt or trap handler, and then enqueued
* for later processing. Separate queues are maintained for correctable and
* uncorrectable errors. The current CPU module determines the size of the
* queue elements, so that it may declare a CPU-specific fault structure
* which contains a struct async_flt as its first member. Each async_flt also
* contains a callback function (flt_func) that is invoked by the processing
* code in order to actually log messages when the event is dequeued. This
* function may be called from a softint, from trap() as part of AST handling
* before the victim thread returns to userland, or as part of panic(). As
* such, the flt_func should basically only be calling cmn_err (but NOT with
* the CE_PANIC flag). It must not call panic(), acquire locks, or block.
* The owner of the event is responsible for determining whether the event is
* fatal; if so, the owner should set flt_panic and panic() after enqueuing
* the event. The event will then be dequeued and logged as part of panic
* processing. If flt_panic is not set, the queue function will schedule a
* soft interrupt to process the event.
*/
struct async_flt;
typedef void (*async_func_t)(struct async_flt *, char *);
struct async_flt {
uint64_t flt_id; /* gethrtime() at time of fault */
uint64_t flt_stat; /* async fault status register */
uint64_t flt_addr; /* async fault address register */
caddr_t flt_pc; /* program counter from error trap */
async_func_t flt_func; /* logging function */
uint_t flt_bus_id; /* hardware bus id# of cpu/sbus/pci */
uint_t flt_inst; /* software instance of cpu/sbus/pci */
ushort_t flt_status; /* error information */
ushort_t flt_synd; /* ECC syndrome */
uchar_t flt_in_memory; /* fault occurred in memory if != 0 */
uchar_t flt_class; /* fault class (cpu or bus) */
uchar_t flt_prot; /* type of fault protection (if any) */
uchar_t flt_priv; /* fault occurred in kernel if != 0 */
uchar_t flt_panic; /* fault caused owner to panic() */
uchar_t flt_tl; /* fault occurred at TL > 0 */
uchar_t flt_core; /* fault occurred during core() dump */
uchar_t flt_pad; /* reserved for future use */
uint64_t flt_disp; /* error disposition information */
uint64_t flt_payload; /* ereport payload information */
char *flt_erpt_class; /* ereport class string */
};
/*
* Bus nexus drivers can use the bus_func_register() interface to register
* callback functions for error handling and panic handling. The handler
* functions should be registered and unregistered from driver attach and
* detach context, where it is safe to perform a sleeping allocation. The
* callbacks themselves can be invoked from panic, or from the CPU module's
* asynchronous trap handler at high PIL. As such, these routines may only
* test for errors and enqueue async_flt events. They may not grab adaptive
* locks, call panic(), or invoke bus_func_register() or bus_func_unregister().
* Each callback function should return one of the BF_* return status values
* below. The bus_func_invoke() function calls all the registered handlers of
* the specified type, and returns the maximum of their return values (e.g.
* BF_FATAL if any callback returned BF_FATAL). If any callback returns
* BF_FATAL, the system will panic at the end of callback processing.
*/
typedef uint_t (*busfunc_t)(void *);
#define BF_TYPE_UE 1 /* check for uncorrectable errors */
#define BF_TYPE_ERRDIS 2 /* disable error detection */
#define BF_TYPE_RESINTR 3 /* reset interrupts */
#define BF_NONE 0 /* no errors were detected */
#define BF_NONFATAL 1 /* one or more non-fatal errors found */
#define BF_FATAL 2 /* one or more fatal errors found */
typedef struct bus_func_desc {
int bf_type; /* type of function (see above) */
busfunc_t bf_func; /* function to call */
void *bf_arg; /* function argument */
struct bus_func_desc *bf_next; /* pointer to next registered desc */
} bus_func_desc_t;
extern void bus_func_register(int, busfunc_t, void *);
extern void bus_func_unregister(int, busfunc_t, void *);
extern void bus_async_log_err(struct async_flt *);
extern uint_t bus_func_invoke(int);
extern void ecc_cpu_call(struct async_flt *, char *, int);
extern void ce_scrub(struct async_flt *);
extern void ecc_page_zero(void *);
extern void error_init(void);
extern int ce_verbose_memory;
extern int ce_verbose_other;
extern int ce_show_data;
extern int ce_debug;
extern int ue_debug;
extern int aft_verbose;
extern int aft_panic;
extern int aft_testfatal;
extern struct async_flt panic_aflt;
extern errorq_t *ce_queue;
extern errorq_t *ue_queue;
#endif /* !_ASM */
/*
* ECC or parity error status for async_flt.flt_status.
*/
#define ECC_C_TRAP 0x0001 /* Trap 0x63 Corrected ECC Error */
#define ECC_I_TRAP 0x0002 /* Trap 0x0A Instr Access Error */
#define ECC_ECACHE 0x0004 /* Ecache ECC Error */
#define ECC_IOBUS 0x0008 /* Pci or sysio ECC Error */
#define ECC_INTERMITTENT 0x0010 /* Intermittent ECC Error */
#define ECC_PERSISTENT 0x0020 /* Persistent ECC Error */
#define ECC_STICKY 0x0040 /* Sticky ECC Error */
#define ECC_D_TRAP 0x0080 /* Trap 0x32 Data Access Error */
#define ECC_F_TRAP 0x0100 /* Cheetah Trap 0x70 Fast ECC Error */
#define ECC_DP_TRAP 0x0200 /* Cheetah+ Trap 0x71 D$ Parity Error */
#define ECC_IP_TRAP 0x0400 /* Cheetah+ Trap 0x72 I$ Parity Error */
#define ECC_ITLB_TRAP 0x0800 /* Panther ITLB Parity Error */
#define ECC_DTLB_TRAP 0x1000 /* Panther DTLB Parity Error */
#define ECC_IO_CE 0x2000 /* Pci or sysio CE */
#define ECC_IO_UE 0x4000 /* Pci or sysio UE */
/*
* Trap type numbers corresponding to the fault types defined above.
*/
#define TRAP_TYPE_ECC_I 0x0A
#define TRAP_TYPE_ECC_D 0x32
#define TRAP_TYPE_ECC_F 0x70
#define TRAP_TYPE_ECC_C 0x63
#define TRAP_TYPE_ECC_DP 0x71
#define TRAP_TYPE_ECC_IP 0x72
#define TRAP_TYPE_ECC_ITLB 0x08
#define TRAP_TYPE_ECC_DTLB 0x30
#define TRAP_TYPE_UNKNOWN 0
/*
* Fault classes for async_flt.flt_class.
*/
#define BUS_FAULT 0 /* originating from bus drivers */
#define CPU_FAULT 1 /* originating from CPUs */
#define RECIRC_BUS_FAULT 2 /* scheduled diagnostic */
#define RECIRC_CPU_FAULT 3 /* scheduled diagnostic */
/*
* Invalid or unknown physical address for async_flt.flt_addr.
*/
#define AFLT_INV_ADDR (-1ULL)
/*
* Fault protection values for async_flt.flt_prot. The async error handling
* code may be able to recover from errors when kernel code has explicitly
* protected itself using one of the mechanisms specified here.
*/
#define AFLT_PROT_NONE 0 /* no protection active */
#define AFLT_PROT_ACCESS 1 /* on_trap OT_DATA_ACCESS protection */
#define AFLT_PROT_EC 2 /* on_trap OT_DATA_EC protection */
#define AFLT_PROT_COPY 3 /* t_lofault protection (ucopy, etc.) */
/*
* These flags are used to indicate the validity of certain data based on
* the various overwrite priority features of the AFSR/AFAR:
* AFAR, ESYND and MSYND, each of which have different overwrite priorities.
*
* Given a specific afsr error bit and the entire afsr, there are three cases:
* INVALID: The specified bit is lower overwrite priority than some other
* error bit which is on in the afsr (or IVU/IVC).
* VALID: The specified bit is higher priority than all other error bits
* which are on in the afsr.
* AMBIGUOUS: Another error bit (or bits) of equal priority to the specified
* bit is on in the afsr.
*
* NB: The domain-to-SC communications depend on these values. If they are
* changed, plat_ecc_unum.[ch] must be updated to match.
*/
#define AFLT_STAT_INVALID 0 /* higher priority afsr bit is on */
#define AFLT_STAT_VALID 1 /* this is highest priority afsr bit */
#define AFLT_STAT_AMBIGUOUS 2 /* two afsr bits of equal priority */
/*
* Maximum length of unum string.
*/
#define UNUM_NAMLEN 60
/*
* Maximum length of a DIMM serial id string + null
*/
#define DIMM_SERIAL_ID_LEN 16
#ifdef __cplusplus
}
#endif
#endif /* _SYS_ASYNC_H */
|