path: root/usr/src/uts/sun4u/sys/cheetahasm.h
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#ifndef	_CHEETAHASM_H
#define	_CHEETAHASM_H

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#ifdef	__cplusplus
extern "C" {
#endif

#ifdef _ASM
/* BEGIN CSTYLED */

#define	ASM_LD(reg, symbol)						\
	sethi	%hi(symbol), reg;					\
	ld	[reg + %lo(symbol)], reg;

#define	ASM_LDX(reg, symbol)						\
	sethi	%hi(symbol), reg;					\
	ldx	[reg + %lo(symbol)], reg;

#define	ASM_JMP(reg, symbol)						\
	sethi	%hi(symbol), reg;					\
	jmp	reg + %lo(symbol);					\
	nop
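
/*
 * Illustrative expansion (not part of the interface): a hypothetical
 * invocation such as
 *	ASM_LDX(%g1, ch_err_tl1_paddrs)
 * expands to
 *	sethi	%hi(ch_err_tl1_paddrs), %g1
 *	ldx	[%g1 + %lo(ch_err_tl1_paddrs)], %g1
 * so reg serves as both scratch and result; ASM_LD is the 32-bit
 * analogue, and ASM_JMP ends with a nop in the delay slot.
 */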

/*
 * Macro for computing an address at a given offset from the 'cpu_private'
 * ptr.  The 'cpu_private' ptr is in the machcpu structure.
 *  off_reg: Register or constant offset from the 'cpu_private' ptr.
 *  scr1:    Scratch; the resulting ptr is returned in this register.
 *  scr2:    Scratch.
 *  label:   Label to branch to if the cpu_private ptr is null/zero.
 */
#define	GET_CPU_PRIVATE_PTR(off_reg, scr1, scr2, label)			\
	CPU_ADDR(scr1, scr2);						\
	ldn	[scr1 + CPU_PRIVATE], scr1;				\
	cmp	scr1, 0;						\
	be	label;							\
	  nop;								\
	add	scr1, off_reg, scr1
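
/*
 * Usage sketch (illustration only): DO_CPU_LOGOUT below invokes this as
 *	GET_CPU_PRIVATE_PTR(r_or_s, scr1, scr3, 7f)
 * which leaves scr1 = cpu_private + r_or_s on success, or branches to
 * the local label 7f if this CPU's cpu_private ptr is still NULL.
 */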

/*
 * Macro version of get_dcache_dtag.  We use this macro in the
 * CPU logout code.  Since the Dcache is virtually indexed but the AFAR
 * is a physical address, only bits [12:5] of the index (the part covered
 * by the 8K page offset) are known; index bit 13 must be tried both
 * ways, so we search through 8 indexes (4 ways x 2 settings of bit 13)
 * in order to find the tag we want.
 *   afar:  input AFAR, not modified.
 *   datap: input ptr to ch_dc_data_t, at end pts to end of ch_dc_data_t.
 *   scr1:  scratch.
 *   scr2:  scratch, will hold tag to look for.
 *   scr3:  used for Dcache index, loops through all 8 indexes.
 */
#define	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3)			\
	set	CH_DCACHE_IDX_MASK, scr3;				\
	and	afar, scr3, scr3;					\
	srlx	afar, CH_DCTAG_PA_SHIFT, scr2;				\
	b	1f;							\
	  or	scr2, CH_DCTAG_VALID_BIT, scr2; /* tag we want */	\
	.align	128;							\
1:									\
	ldxa	[scr3]ASI_DC_TAG, scr1;		/* read tag */		\
	cmp	scr1, scr2;						\
	bne	4f;				/* not found? */	\
	  nop;								\
	stxa	scr3, [datap + CH_DC_IDX]%asi;	/* store index */	\
	stxa	scr1, [datap + CH_DC_TAG]%asi;	/* store tag */		\
	membar	#Sync;			/* Cheetah PRM 10.6.3 */	\
	ldxa	[scr3]ASI_DC_UTAG, scr1;	/* read utag */		\
	membar	#Sync;			/* Cheetah PRM 10.6.3 */	\
	stxa	scr1, [datap + CH_DC_UTAG]%asi;				\
	ldxa	[scr3]ASI_DC_SNP_TAG, scr1;	/* read snoop tag */	\
	stxa	scr1, [datap + CH_DC_SNTAG]%asi;			\
	add	datap, CH_DC_DATA, datap;				\
	clr	scr2;							\
2:									\
	membar	#Sync;			/* Cheetah PRM 10.6.1 */	\
	ldxa	[scr3 + scr2]ASI_DC_DATA, scr1;	/* read data */		\
	membar	#Sync;			/* Cheetah PRM 10.6.1 */	\
	stxa	scr1, [datap]%asi;					\
	add	datap, 8, datap;					\
	cmp	scr2, CH_DC_DATA_REG_SIZE - 8;				\
	blt	2b;							\
	  add	scr2, 8, scr2;						\
									\
	GET_CPU_IMPL(scr2);	/* Parity bits are elsewhere for */	\
	cmp	scr2, PANTHER_IMPL;	/* panther processors. */	\
	bne,a	5f;			/* Done if not panther. */	\
	  add	datap, 8, datap; /* Skip to the end of the struct. */	\
	clr	scr2;							\
	add	datap, 7, datap; /* offset of the last parity byte */	\
	mov	1, scr1;						\
	sll	scr1, PN_DC_DATA_PARITY_BIT_SHIFT, scr1;		\
	or	scr3, scr1, scr3; /* add DC_data_parity bit to index */	\
3:									\
	membar	#Sync;			/* Cheetah PRM 10.6.1 */	\
	ldxa	[scr3 + scr2]ASI_DC_DATA, scr1;	/* read parity bits */	\
	membar	#Sync;			/* Cheetah PRM 10.6.1 */	\
	stba	scr1, [datap]%asi;					\
	dec	datap;							\
	cmp	scr2, CH_DC_DATA_REG_SIZE - 8;				\
	blt	3b;							\
	  add	scr2, 8, scr2;						\
	b	5f;							\
	  add	datap, 5, datap; /* set pointer to end of our struct */	\
4:									\
	set	CH_DCACHE_IDX_INCR, scr1;	/* incr. idx (scr3) */	\
	add	scr3, scr1, scr3;					\
	set	CH_DCACHE_IDX_LIMIT, scr1;	/* done? */		\
	cmp	scr3, scr1;						\
	blt	1b;							\
	  nop;								\
	add	datap, CH_DC_DATA_SIZE, datap;				\
5:

/*
 * Macro version of get_icache_dtag.  We use this macro in the CPU
 * logout code. If the Icache is on, we don't want to capture the data.
 *   afar:  input AFAR, not modified.
 *   datap: input ptr to ch_ic_data_t, at end pts to end of ch_ic_data_t.
 *   scr1:  scratch.
 *   scr2:  scratch, will hold tag to look for.
 *   scr3:  used for Icache index, loops through 4 ways.
 * Note: For Panther, the Icache is virtually indexed and increases in
 * size to 64KB (instead of 32KB) with a line size of 64 bytes (instead
 * of 32). This means the IC_addr index bits[14:7] for Panther now
 * correspond to VA bits[13:6]. But since it is virtually indexed, we
 * still mask out only bits[12:5] from the AFAR (we have to manually
 * check bit 13). In order to make this code work for all processors,
 * we end up checking twice as many indexes (8 instead of 4) as required
 * for non-Panther CPUs and saving off twice as much data (16 instructions
 * instead of just 8).
 */
#define	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3)			\
	ldxa	[%g0]ASI_DCU, scr1;					\
	btst	DCU_IC, scr1;		/* is Icache enabled? */	\
	bne,a	6f;			/* yes, don't capture */	\
	  add	datap, CH_IC_DATA_SIZE, datap;	/* annulled if no branch */ \
	GET_CPU_IMPL(scr2);	/* Panther only uses VA[13:6] */	\
	cmp	scr2, PANTHER_IMPL;	/* and we also want to mask */	\
	be	1f;			/* out bit 13 since the */	\
	  nop;				/* Panther I$ is VIPT. */	\
	set	CH_ICACHE_IDX_MASK, scr3;				\
	b	2f;							\
	  nop;								\
1:									\
	set	PN_ICACHE_VA_IDX_MASK, scr3;				\
2:									\
	and	afar, scr3, scr3;					\
	sllx	scr3, CH_ICACHE_IDX_SHIFT, scr3;			\
	srlx	afar, CH_ICPATAG_SHIFT, scr2;	/* pa tag we want */	\
	andn	scr2, CH_ICPATAG_LBITS, scr2;	/* mask off lower */	\
	b	3f;							\
	  nop;								\
	.align	128;							\
3:									\
	ldxa	[scr3]ASI_IC_TAG, scr1;		/* read pa tag */	\
	andn	scr1, CH_ICPATAG_LBITS, scr1;	/* mask off lower */	\
	cmp	scr1, scr2;						\
	bne	5f;				/* not found? */	\
	  nop;								\
	stxa	scr3, [datap + CH_IC_IDX]%asi;	/* store index */	\
	stxa	scr1, [datap + CH_IC_PATAG]%asi; /* store pa tag */	\
	add	scr3, CH_ICTAG_UTAG, scr3;	/* read utag */		\
	ldxa	[scr3]ASI_IC_TAG, scr1;					\
	add	scr3, (CH_ICTAG_UPPER - CH_ICTAG_UTAG), scr3;		\
	stxa	scr1, [datap + CH_IC_UTAG]%asi;				\
	ldxa	[scr3]ASI_IC_TAG, scr1;		/* read upper tag */	\
	add	scr3, (CH_ICTAG_LOWER - CH_ICTAG_UPPER), scr3;		\
	stxa	scr1, [datap + CH_IC_UPPER]%asi;			\
	ldxa	[scr3]ASI_IC_TAG, scr1;		/* read lower tag */	\
	andn	scr3, CH_ICTAG_TMASK, scr3;				\
	stxa	scr1, [datap + CH_IC_LOWER]%asi;			\
	ldxa	[scr3]ASI_IC_SNP_TAG, scr1;	/* read snoop tag */	\
	stxa	scr1, [datap + CH_IC_SNTAG]%asi;			\
	add	datap, CH_IC_DATA, datap;				\
	clr	scr2;							\
4:									\
	ldxa	[scr3 + scr2]ASI_IC_DATA, scr1;	/* read ins. data */	\
	stxa	scr1, [datap]%asi;					\
	add	datap, 8, datap;					\
	cmp	scr2, PN_IC_DATA_REG_SIZE - 8;				\
	blt	4b;							\
	  add	scr2, 8, scr2;						\
	b	6f;							\
	  nop;								\
5:									\
	set	CH_ICACHE_IDX_INCR, scr1;	/* incr. idx (scr3) */	\
	add	scr3, scr1, scr3;					\
	set	PN_ICACHE_IDX_LIMIT, scr1;	/* done? */		\
	cmp	scr3, scr1;						\
	blt	3b;							\
	  nop;								\
	add	datap, CH_IC_DATA_SIZE, datap;				\
6:

#if defined(JALAPENO) || defined(SERRANO)
/*
 * Macro version of get_ecache_dtag.  We use this macro in the
 * CPU logout code.
 *   afar:	input AFAR, not modified.
 *   datap:	Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
 *   ec_way:	Constant value (way number).
 *   scr1:	Scratch.
 *   scr2:	Scratch.
 *   scr3:	Scratch.
 */
#define	GET_ECACHE_DTAG(afar, datap, ec_way, scr1, scr2, scr3)		\
	mov	ec_way, scr1;						\
	and	scr1, JP_ECACHE_NWAY - 1, scr1;	/* mask E$ way bits */	\
	sllx	scr1, JP_EC_TAG_DATA_WAY_SHIFT, scr1;			\
	set	((JP_ECACHE_MAX_SIZE / JP_ECACHE_NWAY) - 1), scr2;	\
	and	afar, scr2, scr3;		/* get set offset */	\
	andn	scr3, (JP_ECACHE_MAX_LSIZE - 1), scr3; /* VA<5:0>=0 */	\
	or	scr3, scr1, scr3;		/* or WAY bits */	\
	b	1f;							\
	  stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store E$ index */	\
	.align	64;							\
1:									\
	JP_EC_DIAG_ACCESS_MEMBAR;					\
	ldxa    [scr3]ASI_EC_DIAG, scr1;	/* get E$ tag */	\
	JP_EC_DIAG_ACCESS_MEMBAR;					\
	stxa	scr1, [datap + CH_EC_TAG]%asi;				\
	add	datap, CH_EC_DATA, datap;				\
2:									\
	ldxa	[scr3]ASI_EC_R, %g0;		/* ld E$ stging regs */	\
	clr	scr1;							\
3:						/* loop thru 5 regs */	\
	ldxa	[scr1]ASI_EC_DATA, scr2;				\
	stxa	scr2, [datap]%asi;					\
	add	datap, 8, datap;					\
	cmp	scr1, CH_ECACHE_STGREG_TOTALSIZE - 8;			\
	bne	3b;							\
	   add	scr1, 8, scr1;						\
	btst	CH_ECACHE_STGREG_SIZE, scr3;	/* done? */		\
	beq	2b;							\
	   add	scr3, CH_ECACHE_STGREG_SIZE, scr3
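
/*
 * Note on the loops above (restating the code): the ldxa from ASI_EC_R
 * at 2: fills the five 8-byte E$ staging registers with one
 * CH_ECACHE_STGREG_SIZE subblock, the loop at 3: copies those registers
 * out 8 bytes at a time, and the closing btst/beq repeats the whole
 * sequence for the line's second subblock.
 */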

#define	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3)			\
	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3);		\
	GET_ECACHE_DTAG(afar, datap, 1, scr1, scr2, scr3);		\
	GET_ECACHE_DTAG(afar, datap, 2, scr1, scr2, scr3);		\
	GET_ECACHE_DTAG(afar, datap, 3, scr1, scr2, scr3);		\
	add	datap, (CHD_EC_DATA_SETS-4)*CH_EC_DATA_SIZE, datap;	\
	add	datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap;

/*
 * Jalapeno has no sibling core to park, so these macros are null.
 */
#define	PARK_SIBLING_CORE(dcucr_reg, scr1, scr2)
#define	UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2)

#if defined(JALAPENO)
/*
 * Jalapeno gets primary AFSR and AFAR.  All bits in the AFSR except
 * the fatal error bits are cleared.
 *	datap:		pointer to cpu logout structure.
 *	afar:		returned primary AFAR value.
 *	scr1:		scratch
 *	scr2:		scratch
 */
#define	GET_AFSR_AFAR(datap, afar, scr1, scr2)				\
	ldxa	[%g0]ASI_AFAR, afar;					\
	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi;	\
	ldxa	[%g0]ASI_AFSR, scr2;					\
	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi;	\
	sethi	%hh(C_AFSR_FATAL_ERRS), scr1;				\
	sllx	scr1, 32, scr1;						\
	bclr	scr1, scr2;	/* Clear fatal error bits here, so */	\
	stxa	scr2, [%g0]ASI_AFSR; /* they're left as is in AFSR */	\
	membar	#Sync

/*
 * Jalapeno has no shadow AFAR, null operation.
 */
#define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)

#elif defined(SERRANO)
/*
 * Serrano gets primary AFSR and AFAR.  All bits in the AFSR except
 * the fatal error bits are cleared.  For Serrano, we also save the
 * AFAR2 register. 
 *	datap:	pointer to cpu logout structure.
 *	afar:	returned primary AFAR value.
 *	scr1:	scratch
 *	scr2:	scratch
 */
#define	GET_AFSR_AFAR(datap, afar, scr1, scr2)				\
	set	ASI_MCU_AFAR2_VA, scr1;					\
	ldxa	[scr1]ASI_MCU_CTRL, afar;				\
	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR2)]%asi;	\
	ldxa	[%g0]ASI_AFAR, afar;					\
	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi;	\
	ldxa	[%g0]ASI_AFSR, scr2;					\
	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi;	\
	sethi	%hh(C_AFSR_FATAL_ERRS), scr1;				\
	sllx	scr1, 32, scr1;						\
	bclr	scr1, scr2;	/* Clear fatal error bits here, so */	\
	stxa	scr2, [%g0]ASI_AFSR; /* they're left as is in AFSR */ 	\
	membar	#Sync

/*
 * Serrano needs to capture E$, D$ and I$ lines associated with afar2.
 *      afar:   scratch, holds afar2.
 *      datap:  pointer to cpu logout structure
 *      scr1:   scratch
 *      scr2:   scratch
 *      scr3:   scratch
 */
#define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)		\
	ldxa	[datap + (CH_CLO_DATA + CH_CHD_AFAR2)]%asi, afar;	\
	add	datap, CH_CLO_SDW_DATA + CH_CHD_EC_DATA, datap;		\
	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3);		\
	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
	sub	datap, CH_CPU_LOGOUT_SIZE, datap
#endif /* SERRANO */

#elif defined(CHEETAH_PLUS)
/*
 * Macro version of get_ecache_dtag.  We use this macro in the
 * CPU logout code.
 *   afar:	input AFAR, not modified.
 *   datap:	Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
 *   pn_way:	ecache way for panther (value = 0-3). For non-panther
 *		cpus, this macro will be called with pn_way = 0.
 *   scr1:	Scratch.
 *   scr2:	Scratch.
 *   scr3:	Scratch.
 */
#define	GET_ECACHE_DTAG(afar, datap, pn_way, scr1, scr2, scr3)		\
	mov	afar, scr3;						\
	andn	scr3, (CH_ECACHE_SUBBLK_SIZE - 1), scr3; /* VA<5:0>=0 */\
	set	(CH_ECACHE_8M_SIZE - 1), scr2;				\
	and	scr3, scr2, scr3;		/* VA<63:23>=0 */	\
	mov	pn_way, scr1;	/* panther L3$ is 4-way so we ...    */	\
	sllx	scr1, PN_L3_WAY_SHIFT, scr1;	/* need to mask...   */	\
	or	scr3, scr1, scr3;	/* in the way bits <24:23>.  */	\
	b	1f;							\
	   stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store E$ index */	\
	.align	64;							\
1:									\
	ldxa    [scr3]ASI_EC_DIAG, scr1;	/* get E$ tag */	\
	stxa	scr1, [datap + CH_EC_TAG]%asi;				\
	set	CHP_ECACHE_IDX_TAG_ECC, scr1;				\
	or	scr3, scr1, scr1;					\
	ldxa    [scr1]ASI_EC_DIAG, scr1;	/* get E$ tag ECC */	\
	stxa	scr1, [datap + CH_EC_TAG_ECC]%asi;			\
	add	datap, CH_EC_DATA, datap;				\
2:									\
	ldxa	[scr3]ASI_EC_R, %g0;		/* ld E$ stging regs */	\
	clr	scr1;							\
3:						/* loop thru 5 regs */	\
	ldxa	[scr1]ASI_EC_DATA, scr2;				\
	stxa	scr2, [datap]%asi;					\
	add	datap, 8, datap;					\
	cmp	scr1, CH_ECACHE_STGREG_TOTALSIZE - 8;			\
	bne	3b;							\
	   add	scr1, 8, scr1;						\
	btst	CH_ECACHE_STGREG_SIZE, scr3;	/* done? */		\
	beq	2b;							\
	   add	scr3, CH_ECACHE_STGREG_SIZE, scr3

/*
 * If this is a panther, we need to make sure the sibling core is
 * parked so that we avoid any race conditions during diagnostic
 * accesses to the shared L2 and L3 caches.
 * dcucr_reg:	This register will be used to keep track of whether
 *		or not we need to unpark the core later.
 *		It just so happens that we also use this same register
 *		to keep track of our saved DCUCR value so we only touch
 *		bit 4 of the register (which is a "reserved" bit in the
 *		DCUCR) for keeping track of core parking.
 * scr1:	Scratch register.
 * scr2:	Scratch register.
 */
#define	PARK_SIBLING_CORE(dcucr_reg, scr1, scr2)			\
	GET_CPU_IMPL(scr1);						\
	cmp	scr1, PANTHER_IMPL;	/* only park for panthers */	\
	bne,a	%xcc, 2f;						\
	  andn	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg;		\
	set	ASI_CORE_RUNNING_STATUS, scr1;	/* check other core */	\
	ldxa	[scr1]ASI_CMP_SHARED, scr2;	/* is it running?   */	\
	cmp	scr2, PN_BOTH_CORES_RUNNING;				\
	bne,a	%xcc, 2f;	/* if not running, we are done */	\
	  andn	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg;		\
	or	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg;		\
	set	ASI_CORE_ID, scr1;					\
	ldxa	[scr1]ASI_CMP_PER_CORE, scr2;				\
	and	scr2, COREID_MASK, scr2;				\
	or	%g0, 1, scr1;		/* find out which core... */	\
	sll	scr1, scr2, scr2;	/* ... we need to park... */	\
1:									\
	set	ASI_CORE_RUNNING_RW, scr1;				\
	stxa	scr2, [scr1]ASI_CMP_SHARED;	/* ... and park it. */	\
	membar	#Sync;							\
	set	ASI_CORE_RUNNING_STATUS, scr1;	/* spin until... */	\
	ldxa	[scr1]ASI_CMP_SHARED, scr1;	/* ... the other...  */	\
	cmp	scr1, scr2;	/* ...core is parked according to... */	\
	bne,a	%xcc, 1b;	/* ...the core running status reg.  */	\
	  nop;								\
2:

/*
 * The core running this code will unpark its sibling core if the
 * sibling core had been parked by the current core earlier in this
 * trap handler.
 * dcucr_reg:	This register is used to keep track of whether or not
 *		we need to unpark our sibling core.
 *		It just so happens that we also use this same register
 *		to keep track of our saved DCUCR value so we only touch
 *		bit 4 of the register (which is a "reserved" bit in the
 *		DCUCR) for keeping track of core parking.
 * scr1:	Scratch register.
 * scr2:	Scratch register.
 */
#define	UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2)			\
	btst	PN_PARKED_OTHER_CORE, dcucr_reg;			\
	bz,pt	%xcc, 1f;	/* if nothing to unpark, we are done */	\
	  andn	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg;		\
	set	ASI_CORE_RUNNING_RW, scr1;				\
	set	PN_BOTH_CORES_RUNNING, scr2;	/* we want both...   */	\
	stxa	scr2, [scr1]ASI_CMP_SHARED;	/* ...cores running. */	\
	membar	#Sync;							\
1:

/*
 * Cheetah+ and Jaguar get both primary and secondary AFSR/AFAR.  All bits
 * in the primary AFSR are cleared except the fatal error bits.  For Panther,
 * we also have to read and clear the AFSR_EXT, again leaving the fatal
 * error bits alone.
 *	datap:		pointer to cpu logout structure.
 *	afar:		returned primary AFAR value.
 *	scr1:		scratch
 *	scr2:		scratch
 */
#define	GET_AFSR_AFAR(datap, afar, scr1, scr2)				\
	set	ASI_SHADOW_REG_VA, scr1;				\
	ldxa	[scr1]ASI_AFAR, scr2;					\
	stxa	scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFAR)]%asi;	\
	ldxa	[scr1]ASI_AFSR, scr2;					\
	stxa	scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFSR)]%asi;	\
	ldxa	[%g0]ASI_AFAR, afar;					\
	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi;	\
	ldxa	[%g0]ASI_AFSR, scr2;					\
	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi;	\
	sethi	%hh(C_AFSR_FATAL_ERRS), scr1;				\
	sllx	scr1, 32, scr1;						\
	bclr	scr1, scr2;	/* Clear fatal error bits here, so */ 	\
	stxa	scr2, [%g0]ASI_AFSR; /* they're left as is in AFSR */	\
	membar	#Sync;							\
	GET_CPU_IMPL(scr1);						\
	cmp	scr1, PANTHER_IMPL;					\
	bne	%xcc, 1f;						\
	   nop;								\
	set	ASI_SHADOW_AFSR_EXT_VA, scr1;	/* shadow AFSR_EXT */	\
	ldxa	[scr1]ASI_AFSR, scr2;					\
	stxa	scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFSR_EXT)]%asi; \
	set	ASI_AFSR_EXT_VA, scr1;		/* primary AFSR_EXT */	\
	ldxa	[scr1]ASI_AFSR, scr2;					\
	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR_EXT)]%asi;	\
	set	C_AFSR_EXT_FATAL_ERRS, scr1;				\
	bclr	scr1, scr2;	/* Clear fatal error bits here, */	\
	set	ASI_AFSR_EXT_VA, scr1;	/* so they're left */		\
	stxa	scr2, [scr1]ASI_AFSR;	/* as is in AFSR_EXT */		\
	membar	#Sync;							\
1:

/*
 * This macro is used in the CPU logout code to capture diagnostic
 * information from the L2 cache on panther processors.
 *   afar:	input AFAR, not modified.
 *   datap:	Ptr to pn_l2_data_t, at end pts just past pn_l2_data_t.
 *   scr1:	Scratch.
 *   scr2:	Scratch.
 *   scr3:	Scratch.
 */
#define	GET_PN_L2_CACHE_DTAGS(afar, datap, scr1, scr2, scr3)		\
	mov	afar, scr3;						\
	set	PN_L2_INDEX_MASK, scr1;					\
	and	scr3, scr1, scr3;					\
	b	1f;	/* code to read tags and data should be ...  */	\
	   nop;		/* ...on the same cache line if possible.    */	\
	.align	128;	/* update this line if you add lines below. */	\
1:									\
	stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store L2$ index  */	\
	ldxa	[scr3]ASI_L2_TAG, scr1;		/* read the L2$ tag */	\
	stxa	scr1, [datap + CH_EC_TAG]%asi;				\
	add	datap, CH_EC_DATA, datap;				\
	clr	scr1;							\
2:									\
	ldxa	[scr3 + scr1]ASI_L2_DATA, scr2;	/* loop through     */	\
	stxa	scr2, [datap]%asi;		/* <511:256> of L2  */	\
	add	datap, 8, datap;		/* data and record  */	\
	cmp	scr1, (PN_L2_LINESIZE / 2) - 8;	/* it in the cpu    */	\
	bne	2b;				/* logout struct.   */	\
	  add	scr1, 8, scr1;						\
	set	PN_L2_DATA_ECC_SEL, scr2;	/* ECC_sel bit.     */	\
	ldxa	[scr3 + scr2]ASI_L2_DATA, scr2;	/* Read and record  */	\
	stxa	scr2, [datap]%asi;		/* ecc of <511:256> */	\
	add	datap, 8, datap;					\
3:									\
	ldxa	[scr3 + scr1]ASI_L2_DATA, scr2;	/* loop through     */	\
	stxa	scr2, [datap]%asi;		/* <255:0> of L2    */	\
	add	datap, 8, datap;		/* data and record  */	\
	cmp	scr1, PN_L2_LINESIZE - 8;	/* it in the cpu    */	\
	bne	3b;				/* logout struct.   */	\
	  add	scr1, 8, scr1;						\
	set	PN_L2_DATA_ECC_SEL, scr2;	/* ECC_sel bit.     */	\
	add	scr2, PN_L2_ECC_LO_REG, scr2;				\
	ldxa	[scr3 + scr2]ASI_L2_DATA, scr2;	/* Read and record  */	\
	stxa	scr2, [datap]%asi;		/* ecc of <255:0>.  */	\
	add	datap, 8, datap;		/* Advance pointer  */	\
	set	PN_L2_SET_SIZE, scr2;					\
	set	PN_L2_MAX_SET, scr1;					\
	cmp	scr1, scr3;	/* more ways to try for this line? */	\
	bg,a	%xcc, 1b;	/* if so, start over with next way */	\
	  add	scr3, scr2, scr3

/*
 * Cheetah+ assumes E$ is 2-way and grabs both E$ lines associated with afar.
 *	afar:	AFAR from access.
 *	datap:	pointer to cpu logout structure.
 *	scr1:	scratch
 *	scr2:	scratch
 *	scr3:	scratch
 */
#define	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3)			\
	GET_CPU_IMPL(scr1);						\
	cmp	scr1, PANTHER_IMPL;					\
	bne	%xcc, 4f;						\
	  nop;								\
	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3);		\
	GET_ECACHE_DTAG(afar, datap, 1, scr1, scr2, scr3);		\
	GET_ECACHE_DTAG(afar, datap, 2, scr1, scr2, scr3);		\
	GET_ECACHE_DTAG(afar, datap, 3, scr1, scr2, scr3);		\
	add	datap, (CHD_EC_DATA_SETS-4)*CH_EC_DATA_SIZE, datap;	\
	GET_PN_L2_CACHE_DTAGS(afar, datap, scr1, scr2, scr3);		\
	b	5f;							\
	  nop;								\
4:									\
	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3);		\
	GET_ECACHE_WAY_BIT(scr1, scr2);					\
	xor	afar, scr1, afar;					\
	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3);		\
	GET_ECACHE_WAY_BIT(scr1, scr2);		/* restore AFAR */	\
	xor	afar, scr1, afar;					\
	add	datap, (CHD_EC_DATA_SETS-2)*CH_EC_DATA_SIZE, datap;	\
	add	datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap;		\
5:

/*
 * Cheetah+ needs to capture E$, D$ and I$ lines associated with
 * shadow afar.
 *	afar:	scratch, holds shadow afar.
 *	datap:	pointer to cpu logout structure
 *	scr1:	scratch
 *	scr2:	scratch
 *	scr3:	scratch
 */
#define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)		\
	ldxa	[datap + (CH_CLO_SDW_DATA + CH_CHD_AFAR)]%asi, afar;	\
	add	datap, CH_CLO_SDW_DATA + CH_CHD_EC_DATA, datap;	\
	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3);		\
	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
	sub	datap, CH_CPU_LOGOUT_SIZE, datap

/*
 * Compute the "Way" bit for 2-way Ecache for Cheetah+.
 */
#define	GET_ECACHE_WAY_BIT(scr1, scr2)					\
	CPU_INDEX(scr1, scr2);						\
	mulx	scr1, CPU_NODE_SIZE, scr1;				\
	add	scr1, ECACHE_SIZE, scr1;				\
	set	cpunodes, scr2;						\
	ld	[scr1 + scr2], scr1;					\
	srlx	scr1, 1, scr1
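
/*
 * Illustration (hypothetical size): for an 8MB 2-way E$, ecache_size / 2
 * is 0x400000, i.e. PA bit 22; GET_ECACHE_DTAGS above xors the AFAR with
 * this value to reach the other way's line, then xors again to restore
 * the original AFAR.
 */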

#else /* CHEETAH_PLUS */
/*
 * Macro version of get_ecache_dtag.  We use this macro in the
 * CPU logout code.
 *   afar:	input AFAR, not modified.
 *   datap:	Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
 *   scr1:      Scratch.
 *   scr2:	Scratch.
 *   scr3:	Scratch.
 */
#define	GET_ECACHE_DTAG(afar, datap, scr1, scr2, scr3)			\
	mov	afar, scr3;						\
	andn	scr3, (CH_ECACHE_SUBBLK_SIZE - 1), scr3; /* VA<5:0>=0 */\
	set	(CH_ECACHE_8M_SIZE - 1), scr2;				\
	and	scr3, scr2, scr3;		/* VA<63:23>=0 */	\
	b	1f;							\
	   stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store E$ index */	\
	.align	64;							\
1:									\
	ldxa    [scr3]ASI_EC_DIAG, scr1;	/* get E$ tag */	\
	stxa	scr1, [datap + CH_EC_TAG]%asi;				\
	add	datap, CH_EC_DATA, datap;				\
2:									\
	ldxa	[scr3]ASI_EC_R, %g0;		/* ld E$ stging regs */	\
	clr	scr1;							\
3:						/* loop thru 5 regs */	\
	ldxa	[scr1]ASI_EC_DATA, scr2;				\
	stxa	scr2, [datap]%asi;					\
	add	datap, 8, datap;					\
	cmp	scr1, CH_ECACHE_STGREG_TOTALSIZE - 8;			\
	bne	3b;							\
	   add	scr1, 8, scr1;						\
	btst	CH_ECACHE_STGREG_SIZE, scr3;	/* done? */		\
	beq	2b;							\
	   add	scr3, CH_ECACHE_STGREG_SIZE, scr3

/*
 * Cheetah has no sibling core to park, so these macros are null.
 */
#define	PARK_SIBLING_CORE(dcucr_reg, scr1, scr2)
#define	UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2)

/*
 * Cheetah gets primary AFSR and AFAR and clears the AFSR, except for the
 * fatal error bits.
 *	datap:		pointer to cpu logout structure.
 *	afar:		returned primary AFAR value.
 *	scr1:		scratch
 *	scr2:		scratch
 */
#define	GET_AFSR_AFAR(datap, afar, scr1, scr2)	\
	ldxa	[%g0]ASI_AFAR, afar;					\
	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi;	\
	ldxa	[%g0]ASI_AFSR, scr2;					\
	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi;	\
	sethi	%hh(C_AFSR_FATAL_ERRS), scr1;				\
	sllx	scr1, 32, scr1;						\
	bclr	scr1, scr2;	/* Clear fatal error bits here, so */	\
	stxa	scr2, [%g0]ASI_AFSR; /* they're left as is in AFSR */	\
	membar	#Sync

/*
 * Cheetah E$ is direct-mapped, so we grab the one matching line's data
 * and skip past the logout structure's slot for a second line.
 *	afar:	AFAR from access.
 *	datap:	pointer to cpu logout structure.
 *	scr1:	scratch
 *	scr2:	scratch
 *	scr3:	scratch
 */
#define	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3)			\
	GET_ECACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
	add	datap, (CHD_EC_DATA_SETS-1)*CH_EC_DATA_SIZE, datap;	\
	add	datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap;

/*
 * Cheetah has no shadow AFAR, null operation.
 */
#define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)

#endif	/* CHEETAH_PLUS */

/*
 * Cheetah/(Cheetah+ Jaguar Panther)/Jalapeno Macro for capturing CPU
 * logout data at TL>0. r_val is a register that returns the "failure count"
 * to the caller, and may be used as a scratch register until the end of
 * the macro.  afar is used to return the primary AFAR value to the caller
 * and it too can be used as a scratch register until the end.  datap is
 * a register that points to the logout data area within the "cpu_private"
 * data area.  t_flags is a register that has the
 * trap-type/trap-level/CEEN info. This t_flags register may be used after
 * the GET_AFSR_AFAR macro.
 *
 * The CPU logout operation will fail (r_val > 0) if the logout
 * structure in question is already being used. Otherwise, the CPU
 * logout operation will succeed (r_val = 0). For failures, r_val
 * returns the busy count (# of times we tried using this CPU logout
 * structure when it was busy.)
 *
 *   Register usage:
 *	%asi:   Must be set to either ASI_MEM if the address in datap
 *		is a physical address or to ASI_N if the address in
 *		datap is a virtual address.
 *	r_val:	This register is the return value which tells the
 *		caller whether or not the LOGOUT operation was successful.
 *		For failures, r_val returns the fail count (i.e. number of
 *		times we have tried to use this logout structure when it was
 *		already being used).
 *	afar:	output: contains AFAR on exit
 *	t_flags: input trap type info, may be used as scratch after stored
 *		to cpu log out structure.
 *	datap:	Points to log out data area.
 *	scr1:	Scratch
 *	scr2:	Scratch (may be r_val)
 *	scr3:   Scratch (may be t_flags)
 */
#define	DO_TL1_CPU_LOGOUT(r_val, afar, t_flags, datap, scr1, scr2, scr3) \
	setx	LOGOUT_INVALID, scr2, scr1;				\
	ldxa	[datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi, scr2;	\
	cmp	scr2, scr1;						\
	bne	8f;							\
	  nop;								\
	stxa	t_flags, [datap + CH_CLO_FLAGS]%asi;			\
	GET_AFSR_AFAR(datap, afar, scr1, scr2);				\
	add	datap, CH_CLO_DATA + CH_CHD_EC_DATA, datap;		\
	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3);		\
	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
	sub	datap, CH_CLO_DATA + CH_DIAG_DATA_SIZE, datap;		\
	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3);			\
	ldxa	[datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi, afar;	\
	set	0, r_val;	/* return value for success */		\
	ba	9f;							\
	  nop;								\
8:									\
	ldxa	[%g0]ASI_AFAR, afar;					\
	ldxa	[datap + CH_CLO_NEST_CNT]%asi, r_val;			\
	inc	r_val;		/* return value for failure */		\
	stxa	r_val, [datap + CH_CLO_NEST_CNT]%asi;			\
	membar	#Sync;							\
9:
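
/*
 * Hypothetical invocation sketch (illustration only; the register
 * choices are not mandated by this file): a TL>0 caller with
 * %asi = ASI_MEM and a physical datap in %g1 might use
 *	DO_TL1_CPU_LOGOUT(%g4, %g2, %g3, %g1, %g5, %g4, %g3)
 * exploiting the documented overlaps (scr2 may be r_val, scr3 may be
 * t_flags); on return %g4 is 0 on success or the busy count, and %g2
 * holds the primary AFAR.
 */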

/*
 * Cheetah/(Cheetah+ Jaguar Panther)/Jalapeno Macro for capturing CPU
 * logout data.  Uses DO_TL1_CPU_LOGOUT macro defined above, and sets
 * up the expected data pointer in the scr1 register and sets the %asi
 * register to ASI_N for kernel virtual addresses instead of ASI_MEM as
 * is used at TL>0.
 *
 * The CPU logout operation will fail (r_val > 0) if the logout
 * structure in question is already being used. Otherwise, the CPU
 * logout operation will succeed (r_val = 0). For failures, r_val
 * returns the busy count (# of times we tried using this CPU logout
 * structure when it was busy.)
 *
 *   Register usage:
 *	r_val:	This register is the return value which tells the
 *		caller whether or not the LOGOUT operation was successful.
 *		For failures, r_val returns the fail count (i.e. number of
 *		times we have tried to use this logout structure when it was
 *		already being used).
 *	afar:	returns AFAR, used internally as afar value.
 *		output: if the cpu_private struct has not been initialized,
 *		        then we return the t_flags value listed below.
 *	r_or_s:	input offset, either register or constant (symbol).  It's
 *		OK for r_or_s to be a register as long as it's not scr1 or
 *		scr3.
 *	t_flags: input trap type info, may be used as scratch after stored
 *		to cpu log out structure.
 *	scr1:	Scratch, points to log out data area.
 *	scr2:	Scratch (may be r_or_s)
 *	scr3:	Scratch (may be r_val)
 *	scr4:   Scratch (may be t_flags)
 */
#define	DO_CPU_LOGOUT(r_val, afar, r_or_s, t_flags, scr1, scr2, scr3, scr4) \
	GET_CPU_PRIVATE_PTR(r_or_s, scr1, scr3, 7f); /* can't use scr2/4 */ \
	wr	%g0, ASI_N, %asi;					\
	DO_TL1_CPU_LOGOUT(r_val, afar, t_flags, scr1, scr2, scr3, scr4)	\
	ba	6f;							\
	  nop;								\
7:									\
	mov	t_flags, afar;		/* depends on afar = %g2  */	\
	set	0, r_val;		/* success in this case.  */	\
6:

/*
 * The P$ is flushed as a side effect of writing to the Primary
 * or Secondary Context Register. After writing to a context
 * register, every line of the P$ in the Valid state is invalidated,
 * regardless of which context it belongs to.
 * This routine simply touches the Primary context register by
 * reading the current value and writing it back. The Primary
 * context is not changed.
 */
#define	PCACHE_FLUSHALL(tmp1, tmp2, tmp3)				\
	sethi	%hi(FLUSH_ADDR), tmp1					;\
	set	MMU_PCONTEXT, tmp2					;\
	ldxa	[tmp2]ASI_DMMU, tmp3					;\
	stxa	tmp3, [tmp2]ASI_DMMU					;\
	flush	tmp1	/* See Cheetah PRM 8.10.2 */

/*
 * Macro that flushes the entire Dcache.
 *
 * arg1 = dcache size
 * arg2 = dcache linesize
 */
#define	CH_DCACHE_FLUSHALL(arg1, arg2, tmp1)				\
	sub	arg1, arg2, tmp1;					\
1:									\
	stxa	%g0, [tmp1]ASI_DC_TAG;					\
	membar	#Sync;							\
	cmp	%g0, tmp1;						\
	bne,pt	%icc, 1b;						\
	  sub	tmp1, arg2, tmp1;
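
/*
 * Illustration (hypothetical geometry): with arg1 = 64K and arg2 = 32,
 * the loop above stores a zeroed tag for each 32-byte line from offset
 * 64K - 32 down to 0 inclusive, i.e. 2048 ASI_DC_TAG stores, each
 * followed by a membar #Sync.
 */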

/*
 * Macro that flushes the entire Icache.
 *
 * Note that we cannot access ASI 0x67 (ASI_IC_TAG) with the Icache on,
 * because accesses to ASI 0x67 interfere with Icache coherency.  We
 * must make sure the Icache is off, then turn it back on after the entire
 * cache has been invalidated.  If the Icache is originally off, we'll just
 * clear the tags but not turn the Icache on.
 *
 * arg1 = icache size
 * arg2 = icache linesize
 */
#define	CH_ICACHE_FLUSHALL(arg1, arg2, tmp1, tmp2)			\
	ldxa	[%g0]ASI_DCU, tmp2;					\
	andn	tmp2, DCU_IC, tmp1;					\
	stxa	tmp1, [%g0]ASI_DCU;					\
	flush	%g0;	/* flush required after changing the IC bit */	\
	sllx	arg2, 1, arg2;		/* arg2 = linesize * 2 */	\
	sllx	arg1, 1, arg1;		/* arg1 = size * 2 */		\
	sub	arg1, arg2, arg1;					\
	or	arg1, CH_ICTAG_LOWER, arg1;	/* "write" tag */	\
1:									\
	stxa	%g0, [arg1]ASI_IC_TAG;					\
	membar	#Sync;				/* Cheetah PRM 8.9.3 */	\
	cmp	arg1, CH_ICTAG_LOWER;					\
	bne,pt	%icc, 1b;						\
	  sub	arg1, arg2, arg1;					\
	stxa	tmp2, [%g0]ASI_DCU;					\
	flush	%g0;	/* flush required after changing the IC bit */
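
/*
 * Illustration (hypothetical geometry): with arg1 = 32K and arg2 = 32,
 * the doubled size and linesize give 1024 ASI_IC_TAG stores, from
 * ((32K - 32) * 2 | CH_ICTAG_LOWER) down to CH_ICTAG_LOWER, after which
 * the saved DCU value is restored so the Icache is re-enabled only if
 * it was enabled on entry.
 */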


#if defined(JALAPENO) || defined(SERRANO)

/*
 * ASI access to the L2 tag or L2 flush can hang the cpu when interacting 
 * with combinations of L2 snoops, victims and stores.
 *
 * A possible workaround is to surround each L2 ASI access with membars
 * and make sure that the code is hitting in the Icache.  This requires
 * aligning code sequence at E$ boundary and forcing I$ fetch by
 * jumping to selected offsets so that we don't take any I$ misses
 * during ASI access to the L2 tag or L2 flush.  This also requires
 * making sure that we don't take any interrupts or traps (such as
 * fast ECC trap, I$/D$ tag parity error) which can result in eviction
 * of this code sequence from I$, thus causing a miss.
 *
 * Because of the complexity/risk, we have decided to do a partial fix
 * of adding membar around each ASI access to the L2 tag or L2 flush.
 */

#define	JP_EC_DIAG_ACCESS_MEMBAR	\
	membar	#Sync

/*
 * Jalapeno version of macro that flushes the entire Ecache.
 *
 * Uses Jalapeno displacement flush feature of ASI_EC_DIAG.
 *
 * arg1 = ecache size
 * arg2 = ecache linesize - not modified; can be an immediate constant.
 */
#define	ECACHE_FLUSHALL(arg1, arg2, tmp1, tmp2)	\
	CPU_INDEX(tmp1, tmp2);						\
	set	JP_ECACHE_IDX_DISP_FLUSH, tmp2;				\
	sllx	tmp1, JP_ECFLUSH_PORTID_SHIFT, tmp1;			\
	or	tmp1, tmp2, tmp1;					\
	srlx	arg1, JP_EC_TO_SET_SIZE_SHIFT, tmp2;			\
1:									\
	subcc	tmp2, arg2, tmp2;					\
	JP_EC_DIAG_ACCESS_MEMBAR;					\
	ldxa	[tmp1 + tmp2]ASI_EC_DIAG, %g0;				\
	JP_EC_DIAG_ACCESS_MEMBAR;					\
	bg,pt	%xcc, 1b;						\
	  nop;								\
	mov	1, tmp2;						\
	sllx	tmp2, JP_ECFLUSH_EC_WAY_SHIFT, tmp2;			\
	add	tmp1, tmp2, tmp1;					\
	mov	(JP_ECACHE_NWAY-1), tmp2;				\
	sllx	tmp2, JP_ECFLUSH_EC_WAY_SHIFT, tmp2;			\
	andcc	tmp1, tmp2, tmp2;					\
	bnz,pt	%xcc, 1b;						\
	  srlx	arg1, JP_EC_TO_SET_SIZE_SHIFT, tmp2
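
/*
 * Sketch of the flow above (restating the code): tmp1 carries the
 * port-id and way fields while tmp2 counts one set-sized region down
 * to zero in linesize steps; once a way has been displacement-loaded
 * through ASI_EC_DIAG, the way field is incremented and the set loop
 * reruns until the way field wraps, covering all JP_ECACHE_NWAY ways.
 */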

#else	/* JALAPENO || SERRANO */

/*
 * Cheetah version of macro that flushes the entire Ecache.
 *
 *  Need to displacement-flush 2x the ecache size from the Ecache flush area.
 *
 * arg1 = ecache size
 * arg2 = ecache linesize
 * arg3 = ecache flush address - for cheetah only
 */
#define	CH_ECACHE_FLUSHALL(arg1, arg2, arg3)				\
	sllx	arg1, 1, arg1;						\
1:									\
	subcc	arg1, arg2, arg1;					\
	bg,pt	%xcc, 1b;						\
	  ldxa	[arg1 + arg3]ASI_MEM, %g0;

/*
 * Cheetah+ version of macro that flushes the entire Ecache.
 *
 * Uses the displacement flush feature.
 *
 * arg1 = ecache size
 * arg2 = ecache linesize
 * impl = CPU implementation as returned from GET_CPU_IMPL()
 *        The value in this register is destroyed during execution
 *        of the macro.
 */
#if defined(CHEETAH_PLUS)
#define	CHP_ECACHE_FLUSHALL(arg1, arg2, impl)				\
	cmp	impl, PANTHER_IMPL;					\
	bne	%xcc, 1f;						\
	  nop;								\
	set	PN_L3_IDX_DISP_FLUSH, impl;				\
	b	2f;							\
	  nop;								\
1:									\
	set	CHP_ECACHE_IDX_DISP_FLUSH, impl;			\
2:									\
	subcc	arg1, arg2, arg1;					\
	bg,pt	%xcc, 2b;						\
	  ldxa	[arg1 + impl]ASI_EC_DIAG, %g0;
#else	/* CHEETAH_PLUS */
#define	CHP_ECACHE_FLUSHALL(arg1, arg2, impl)
#endif	/* CHEETAH_PLUS */

/*
 * Macro that flushes the entire Ecache.
 *
 * arg1 = ecache size
 * arg2 = ecache linesize
 * arg3 = ecache flush address - for cheetah only
 */
#define	ECACHE_FLUSHALL(arg1, arg2, arg3, tmp1)				\
	GET_CPU_IMPL(tmp1);						\
	cmp	tmp1, CHEETAH_IMPL;					\
	bne	%xcc, 2f;						\
	  nop;								\
	CH_ECACHE_FLUSHALL(arg1, arg2, arg3);				\
	ba	3f;							\
	  nop;								\
2:									\
	CHP_ECACHE_FLUSHALL(arg1, arg2, tmp1);				\
3:

#endif	/* JALAPENO || SERRANO */

/*
 * Macro that flushes the Panther L2 cache.
 */
#if defined(CHEETAH_PLUS)
#define	PN_L2_FLUSHALL(scr1, scr2, scr3)				\
	GET_CPU_IMPL(scr3);						\
	cmp	scr3, PANTHER_IMPL;					\
	bne	%xcc, 2f;						\
	  nop;								\
	set	PN_L2_SIZE, scr1;					\
	set	PN_L2_LINESIZE, scr2;					\
	set	PN_L2_IDX_DISP_FLUSH, scr3;				\
1:									\
	subcc	scr1, scr2, scr1;					\
	bg,pt	%xcc, 1b;						\
	  ldxa	[scr1 + scr3]ASI_L2_TAG, %g0;				\
2:
#else	/* CHEETAH_PLUS */
#define	PN_L2_FLUSHALL(scr1, scr2, scr3)
#endif	/* CHEETAH_PLUS */

/*
 * Given a VA and page size (page size as encoded in ASI_MMU_TAG_ACCESS_EXT),
 * this macro returns the TLB index for that mapping based on a 512 entry
 * (2-way set associative) TLB.  Aside from the 16 entry fully associative
 * TLBs, all TLBs in Panther are 512 entry, 2-way set associative.
 *  
 * To find the index, we shift the VA right by 13 + (3 * pg_sz) and then
 * mask out all but the lower 8 bits because:
 *
 *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 0 for   8K
 *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 1 for  64K
 *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 2 for 512K
 *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 3 for   4M
 *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 4 for  32M
 *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 5 for 256M
 *
 * and
 *
 *    array index for   8K pages = VA[20:13]
 *    array index for  64K pages = VA[23:16]
 *    array index for 512K pages = VA[26:19]
 *    array index for   4M pages = VA[29:22]
 *    array index for  32M pages = VA[32:25]
 *    array index for 256M pages = VA[35:28]
 *
 * Inputs:
 *
 *    va	- Register.
 *		  Input: Virtual address in which we are interested.
 *		  Output: TLB index value.
 *    pg_sz	- Register. Page Size of the TLB in question as encoded
 *		  in the ASI_[D|I]MMU_TAG_ACCESS_EXT register.
 */
#if defined(CHEETAH_PLUS)
#define	PN_GET_TLB_INDEX(va, pg_sz)					\
	srlx	va, 13, va;	/* first shift by 13 bits and then */	\
	srlx	va, pg_sz, va;	/* shift by pg_sz three times. */	\
	srlx	va, pg_sz, va;						\
	srlx	va, pg_sz, va;						\
	and	va, 0xff, va;	/* mask out all but the lower 8 bits */
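
/*
 * Worked example (illustration only): for a 512K mapping, pg_sz = 2, so
 * the initial shift plus the three pg_sz shifts move va right by
 * 13 + 3*2 = 19 bits, and the final mask keeps 8 bits, VA[26:19],
 * matching the 512K row of the table above.
 */
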
#endif	/* CHEETAH_PLUS */

/*
 * The following macros are for error traps at TL>0.
 * The issue with error traps at TL>0 is that there are no safely
 * available global registers.  So we use the trick of generating a
 * software trap, then using the %tpc, %tnpc and %tstate registers to
 * temporarily save the values of %g1 and %g2.
 */

/*
 * Macro to generate 8-instruction trap table entry for TL>0 trap handlers.
 * Does the following steps:
 *	1. membar #Sync - required for USIII family errors.
 *	2. Specified software trap.
 * NB: Must be 8 instructions or less to fit in trap table and code must
 *     be relocatable.
 */
#define	CH_ERR_TL1_TRAPENTRY(trapno)		\
	membar	#Sync;				\
	ta	trapno;				\
	nop; nop; nop; nop; nop; nop

/*
 * Macro to generate 8-instruction trap table entry for TL>0 software trap.
 * We save the values of %g1 and %g2 in %tpc, %tnpc and %tstate (since
 * the low-order two bits of %tpc/%tnpc are reserved and read as zero,
 * we need to put the low-order two bits of %g1 and %g2 in %tstate).
 * Note that %tstate has a reserved hole from bits 3-7, so we put the
 * low-order two bits of %g1 in bits 0-1 and the low-order two bits of
 * %g2 in bits 10-11 (ensuring bits 8-9 are zero for use by the D$/I$
 * state bits).  Note that we must do a jmp instruction, since this
 * is moved into the trap table entry.
 * NB: Must be 8 instructions or less to fit in trap table and code must
 *     be relocatable.
 */
#define	CH_ERR_TL1_SWTRAPENTRY(label)		\
	wrpr	%g1, %tpc;			\
	and	%g1, 3, %g1;			\
	wrpr	%g2, %tnpc;			\
	sllx	%g2, CH_ERR_G2_TO_TSTATE_SHFT, %g2; \
	or	%g1, %g2, %g2;			\
	sethi	%hi(label), %g1;		\
	jmp	%g1+%lo(label);			\
	  wrpr	%g2, %tstate
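
/*
 * Resulting %tstate layout (restating the comment above): bits 1:0 hold
 * %g1<1:0>, bits 9:8 are left zero for the D$/I$ state bits, and bits
 * 11:10 hold %g2<1:0> via the CH_ERR_G2_TO_TSTATE_SHFT shift.
 */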

/*
 * Macro to get ptr to ch_err_tl1_data.
 * reg1 will either point to a physaddr with ASI_MEM in %asi OR it
 * will point to a kernel nucleus virtual address with ASI_N in %asi.
 * This allows us to:
 *   1. Avoid getting MMU misses.  We may have gotten the original
 *	Fast ECC error in an MMU handler and if we get an MMU trap
 *	in the TL>0 handlers, we'll scribble on the MMU regs.
 *   2. Allows us to use the same code in the TL>0 handlers whether
 *	we're accessing kernel nucleus virtual addresses or physical
 *	addresses.
 * pseudo-code:
 *	reg1 <- ch_err_tl1_paddrs[CPUID];
 *	if (reg1 == NULL) {
 *		reg1 <- &ch_err_tl1_data
 *		%asi <- ASI_N
 *	} else {
 *		reg1 <- reg1 + offset +
 *		    sizeof (ch_err_tl1_data) * (%tl - 3)
 *		%asi <- ASI_MEM
 *	}
 */
#define	GET_CH_ERR_TL1_PTR(reg1, reg2, offset)	\
	CPU_INDEX(reg1, reg2);			\
	sllx	reg1, 3, reg1;			\
	set	ch_err_tl1_paddrs, reg2;	\
	ldx	[reg1+reg2], reg1;		\
	brnz	reg1, 1f;			\
	add	reg1, offset, reg1;		\
	set	ch_err_tl1_data, reg1;		\
	ba	2f;				\
	wr	%g0, ASI_N, %asi;		\
1:	rdpr	%tl, reg2;			\
	sub	reg2, 3, reg2;			\
	mulx	reg2, CH_ERR_TL1_DATA_SIZE, reg2;	\
	add	reg1, reg2, reg1;		\
	wr	%g0, ASI_MEM, %asi;		\
2:

/*
 * Macro to generate entry code for TL>0 error handlers.
 * At the end of this macro, %g1 will point to the ch_err_tl1_data
 * structure and %g2 will have the original flags in the ch_err_tl1_data
 * structure and %g5 will have the value of %tstate in which the Fast ECC
 * routines save the state of the D$ in the CH_ERR_TSTATE_DC_ON bit.
 * All %g registers except for %g1, %g2 and %g5 will be available after
 * this macro.
 * Does the following steps:
 *   1. Compute physical address of per-cpu/per-tl save area using
 *	only %g1+%g2 (which we've saved in %tpc, %tnpc, %tstate)
 *	leaving address in %g1 and updating the %asi register.
 *	If there is no data area available, we branch to label.
 *   2. Save %g3-%g7 in save area.
 *   3. Save %tpc->%g3, %tnpc->%g4, %tstate->%g5, which contain
 *	original %g1+%g2 values (because we're going to change %tl).
 *   4. set %tl <- %tl - 1.  We do this ASAP to make the window of
 *	running at %tl+1 as small as possible.
 *   5. Reconstitute %g1+%g2 from %tpc (%g3), %tnpc (%g4),
 *	%tstate (%g5) and save in save area, carefully preserving %g5
 *	because it has the CH_ERR_TSTATE_DC_ON value.
 *   6. Load existing ch_err_tl1_data flags in %g2
 *   7. Compute the new flags
 *   8. If %g2 is non-zero (the structure was busy), shift the new
 *	flags by CH_ERR_ME_SHIFT and or them with the old flags.
 *   9. Store the updated flags into ch_err_tl1_data flags.
 *   10. If %g2 is non-zero, read the %tpc and store it in
 *	ch_err_tl1_data.
 */
#define	CH_ERR_TL1_ENTER(flags)			\
	GET_CH_ERR_TL1_PTR(%g1, %g2, CHPR_TL1_ERR_DATA);	\
	stxa	%g3, [%g1 + CH_ERR_TL1_G3]%asi;	\
	stxa	%g4, [%g1 + CH_ERR_TL1_G4]%asi;	\
	stxa	%g5, [%g1 + CH_ERR_TL1_G5]%asi;	\
	stxa	%g6, [%g1 + CH_ERR_TL1_G6]%asi;	\
	stxa	%g7, [%g1 + CH_ERR_TL1_G7]%asi;	\
	rdpr	%tpc, %g3;			\
	rdpr	%tnpc, %g4;			\
	rdpr	%tstate, %g5;			\
	rdpr	%tl, %g6;			\
	sub	%g6, 1, %g6;			\
	wrpr	%g6, %tl;			\
	and	%g5, 3, %g6;			\
	andn	%g3, 3, %g3;			\
	or	%g3, %g6, %g3;			\
	stxa	%g3, [%g1 + CH_ERR_TL1_G1]%asi;	\
	srlx	%g5, CH_ERR_G2_TO_TSTATE_SHFT, %g6;	\
	and	%g6, 3, %g6;			\
	andn	%g4, 3, %g4;			\
	or	%g6, %g4, %g4;			\
	stxa	%g4, [%g1 + CH_ERR_TL1_G2]%asi;	\
	ldxa	[%g1 + CH_ERR_TL1_FLAGS]%asi, %g2;	\
	set	flags | CH_ERR_TL, %g3;		\
	brz	%g2, 9f;			\
	sllx	%g3, CH_ERR_ME_SHIFT, %g4;	\
	or	%g2, %g4, %g3;			\
9:	stxa	%g3, [%g1 + CH_ERR_TL1_FLAGS]%asi;	\
	brnz	%g2, 8f;			\
	rdpr	%tpc, %g4;			\
	stxa	%g4, [%g1 + CH_ERR_TL1_TPC]%asi;	\
8:

/*
 * Turns off D$/I$ and saves the state of DCU_DC+DCU_IC in %tstate Bits 8+9
 * (CH_ERR_TSTATE_DC_ON/CH_ERR_TSTATE_IC_ON).  This is invoked on Fast ECC
 * at TL>0 handlers because the D$ may have corrupted data and we need to
 * turn off the I$ to allow for diagnostic accesses.  We then invoke
 * the normal entry macro and after it is done we save the values of
 * the original D$/I$ state, which is in %g5 bits CH_ERR_TSTATE_DC_ON/
 * CH_ERR_TSTATE_IC_ON in ch_err_tl1_tmp.
 */
#define	CH_ERR_TL1_FECC_ENTER			\
	ldxa	[%g0]ASI_DCU, %g1;		\
	andn	%g1, DCU_DC + DCU_IC, %g2;	\
	stxa	%g2, [%g0]ASI_DCU;		\
	flush	%g0;	/* DCU_IC need flush */	\
	rdpr	%tstate, %g2;			\
	and	%g1, DCU_DC + DCU_IC, %g1;	\
	sllx	%g1, CH_ERR_DCU_TO_TSTATE_SHFT, %g1;	\
	or	%g1, %g2, %g2;			\
	wrpr	%g2, %tstate;			\
	CH_ERR_TL1_ENTER(CH_ERR_FECC);		\
	and	%g5, CH_ERR_TSTATE_DC_ON + CH_ERR_TSTATE_IC_ON, %g5;	\
	stxa	%g5, [%g1 + CH_ERR_TL1_TMP]%asi

/*
 * Macro to generate exit code for TL>0 error handlers.
 * We fall into this macro if we've successfully logged the error in
 * the ch_err_tl1_data structure and want the PIL15 softint to pick
 * it up and log it.
 * Does the following steps:
 *   1.	Set pending flag for this cpu in ch_err_tl1_pending.
 *   2.	Write %set_softint with (1 << PIL_15) to post a level-15 soft interrupt
 *   3.	Restore registers from ch_err_tl1_data, which is pointed to
 *	by %g1, last register to restore is %g1 since it's pointing
 *	to the save area.
 *   4. Execute retry
 */
#define	CH_ERR_TL1_EXIT				\
	CPU_INDEX(%g2, %g3);			\
	set	ch_err_tl1_pending, %g3;	\
	set	-1, %g4;			\
	stb	%g4, [%g2 + %g3];		\
	mov	1, %g2;				\
	sll	%g2, PIL_15, %g2;		\
	wr	%g2, SET_SOFTINT;		\
	ldxa	[%g1 + CH_ERR_TL1_G7]%asi, %g7;	\
	ldxa	[%g1 + CH_ERR_TL1_G6]%asi, %g6;	\
	ldxa	[%g1 + CH_ERR_TL1_G5]%asi, %g5;	\
	ldxa	[%g1 + CH_ERR_TL1_G4]%asi, %g4;	\
	ldxa	[%g1 + CH_ERR_TL1_G3]%asi, %g3;	\
	ldxa	[%g1 + CH_ERR_TL1_G2]%asi, %g2;	\
	ldxa	[%g1 + CH_ERR_TL1_G1]%asi, %g1;	\
	retry

/*
 * Generates unrecoverable error label for TL>0 handlers.
 * At label (Unrecoverable error routine)
 *   1. Sets flags in ch_err_tl1_data and leaves in %g2 (first
 *	argument to cpu_tl1_err_panic).
 *   2.	Call cpu_tl1_err_panic via systrap at PIL 15
 */
#define	CH_ERR_TL1_PANIC_EXIT(label)		\
label:	ldxa	[%g1 + CH_ERR_TL1_FLAGS]%asi, %g2;	\
	or	%g2, CH_ERR_TL | CH_ERR_PANIC, %g2;	\
	stxa	%g2, [%g1 + CH_ERR_TL1_FLAGS]%asi;	\
	set	cpu_tl1_err_panic, %g1;		\
	ba	sys_trap;			\
	  mov	PIL_15, %g4



/* END CSTYLED */
#endif	/* _ASM */

#ifdef	__cplusplus
}
#endif

#endif /* _CHEETAHASM_H */