/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
* Support for determining capacity and utilization of performance relevant
* hardware components in a computer
*
* THEORY
* ------
* The capacity and utilization of the performance relevant hardware components
* is needed to be able to optimize performance while minimizing the amount of
* power used on a system. The idea is to use hardware performance counters
* and potentially other means to determine the capacity and utilization of
* performance relevant hardware components (eg. execution pipeline, cache,
* memory, etc.) and attribute the utilization to the responsible CPU and the
* thread running there.
*
* This will help characterize the utilization of performance relevant
* components and how much is used by each CPU and each thread. With
* that data, the utilization can be aggregated to all the CPUs sharing each
* performance relevant hardware component to calculate the total utilization
* of each component and compare that with the component's capacity to
* essentially determine the actual hardware load of the component. The
* hardware utilization attributed to each running thread can also be
* aggregated to determine the total hardware utilization of each component to
* a workload.
*
* Once that is done, one can determine how much of each performance relevant
* hardware component is needed by a given thread or set of threads (eg. a
* workload) and size up exactly what hardware is needed by the threads and how
* much. With this info, we can better place threads among CPUs to match their
* exact hardware resource needs and potentially lower or raise the power based
* on their utilization or pack threads onto the fewest hardware components
* needed and power off any remaining unused components to minimize power
* without sacrificing performance.
*
* IMPLEMENTATION
* --------------
* The code has been designed and implemented to make (un)programming and
* reading the counters for a given CPU as lightweight and fast as possible.
* This is very important because we need to read and potentially (un)program
* the counters very often and in performance sensitive code. Specifically,
* the counters may need to be (un)programmed during context switch and/or a
* cyclic handler when there are more counter events to count than existing
* counters.
*
* Consequently, the code has been split up to allow allocating and
* initializing everything needed to program and read the counters on a given
* CPU once and make (un)programming and reading the counters for a given CPU
* not have to allocate/free memory or grab any locks. To do this, all the
* state needed to (un)program and read the counters on a CPU is kept per CPU
* and is made lock free by forcing any code that reads or manipulates the
* counters or the state needed to (un)program or read the counters to run on
* the target CPU and disable preemption while running on the target CPU to
* protect any critical sections. All counter manipulation on the target CPU is
* happening either from a cross-call to the target CPU or at the same PIL as
* used by the cross-call subsystem. This guarantees that counter manipulation
* is not interrupted by cross-calls from other CPUs.
*
* The synchronization has been made lock free or as simple as possible for
* performance and to avoid getting the locking all tangled up when we interpose
* on the CPC routines that (un)program the counters to manage the counters
* between the kernel and user on each CPU. When the user starts using the
* counters on a given CPU, the kernel will unprogram the counters that it is
* using on that CPU just before they are programmed for the user. Then the
* kernel will program the counters on a given CPU for its own use when the user
* stops using them.
*
* There is a special interaction with DTrace cpc provider (dcpc). Before dcpc
* enables any probe, it requests to disable and unprogram all counters used for
 * capacity and utilization. These counters are never re-programmed until
* dcpc completes. When all DTrace cpc probes are removed, dcpc notifies CU
* framework and it re-programs the counters.
*
* When a CPU is going offline, its CU counters are unprogrammed and disabled,
* so that they would not be re-programmed again by some other activity on the
* CPU that is going offline.
*
* The counters are programmed during boot. However, a flag is available to
* disable this if necessary (see cu_flag below). A handler is provided to
* (un)program the counters during CPU on/offline. Basic routines are provided
* to initialize and tear down this module, initialize and tear down any state
* needed for a given CPU, and (un)program the counters for a given CPU.
* Lastly, a handler is provided to read the counters and attribute the
* utilization to the responsible CPU.
*/
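/*
 * ILLUSTRATIVE CALL FLOW (a sketch only, derived from the routines below; it
 * is not a normative interface contract)
 * ----------------------------------------------------------------------------
 * The typical life cycle looks roughly like this, assuming a CPU whose PCBE
 * supports the requested counter events:
 *
 *	cu_init()				boot, cpu_lock held
 *	    cu_cpu_init(cp, cu_cpc_reqs)	per-CPU state, kstats, CPC ctx
 *	    cu_cpu_run(cp, cu_cpc_program_xcall, B_FALSE)
 *		cu_cpc_program(cp, &err)	counters start counting on cp
 *
 *	kstat read of pg_hw_perf_cpu:<cpu_id>:<sharing>
 *	    cu_cpu_kstat_update()
 *		cu_cpu_update(cp, B_TRUE)	cross-call to cp
 *		    kcpc_read(cu_cpu_update_stats)
 *
 *	CU counters handed over (e.g. when dcpc needs the hardware)
 *	    cu_disable() -> cu_cpu_disable(cp) -> cu_cpc_trigger(.., B_FALSE)
 *		cu_cpc_unprogram(cp, &err)	counters stop, times accounted
 */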
#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/ddi.h>
#include <sys/systm.h>
#include <sys/disp.h>
#include <sys/sdt.h>
#include <sys/sunddi.h>
#include <sys/thread.h>
#include <sys/pghw.h>
#include <sys/cmt.h>
#include <sys/policy.h>
#include <sys/x_call.h>
#include <sys/cap_util.h>
#include <sys/archsystm.h>
#include <sys/promif.h>
#if defined(__x86)
#include <sys/xc_levels.h>
#endif
/*
* Default CPU hardware performance counter flags to use for measuring capacity
* and utilization
*/
#define CU_CPC_FLAGS_DEFAULT \
(CPC_COUNT_USER|CPC_COUNT_SYSTEM|CPC_OVF_NOTIFY_EMT)
/*
* Possible Flags for controlling this module.
*/
#define CU_FLAG_ENABLE 1 /* Enable module */
#define CU_FLAG_READY 2 /* Ready to setup module */
#define CU_FLAG_ON 4 /* Module is on */
/*
* pg_cpu kstats calculate utilization rate and maximum utilization rate for
* some CPUs. The rate is calculated based on data from two subsequent
* snapshots. When the time between such two snapshots is too small, the
* resulting rate may have low accuracy, so we only consider snapshots which
 * are separated by at least CU_SAMPLE_INTERVAL_MIN nanoseconds from one
 * another. We do not update the rate if the interval is smaller than that.
*
* Use one tenth of a second as the minimum interval for utilization rate
* calculation.
*
* NOTE: The CU_SAMPLE_INTERVAL_MIN should be higher than the scaling factor in
* the CU_RATE() macro below to guarantee that we never divide by zero.
*
* Rate is the number of events per second. The rate is the number of events
* divided by time and multiplied by the number of nanoseconds in a second. We
* do not want time to be too small since it will cause large errors in
* division.
*
* We do not want to multiply two large numbers (the instruction count and
* NANOSEC) either since it may cause integer overflow. So we divide both the
* numerator and the denominator by the same value.
*
* NOTE: The scaling factor below should be less than CU_SAMPLE_INTERVAL_MIN
* above to guarantee that time divided by this value is always non-zero.
*/
#define CU_RATE(val, time) \
(((val) * (NANOSEC / CU_SCALE)) / ((time) / CU_SCALE))
#define CU_SAMPLE_INTERVAL_MIN (NANOSEC / 10)
#define CU_SCALE (CU_SAMPLE_INTERVAL_MIN / 10000)
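/*
 * Worked example (for illustration only): with NANOSEC == 10^9,
 * CU_SAMPLE_INTERVAL_MIN is 10^8 (0.1s) and CU_SCALE is 10^4.  A counter
 * delta of 5,000,000 events over a 250,000,000ns (0.25s) interval gives
 *
 *	CU_RATE(5000000, 250000000)
 *	    == (5000000 * (10^9 / 10^4)) / (250000000 / 10^4)
 *	    == (5000000 * 100000) / 25000
 *	    == 20,000,000 events/second
 *
 * which matches 5M events over a quarter second.  Dividing both numerator and
 * denominator by CU_SCALE keeps the intermediate product (5 * 10^11 here)
 * well within a 64-bit value and keeps the time divisor non-zero for any
 * interval of at least CU_SAMPLE_INTERVAL_MIN.
 */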
/*
* When the time between two kstat reads for the same CPU is less than
* CU_UPDATE_THRESHOLD use the old counter data and skip updating counter values
* for the CPU. This helps reduce cross-calls when kstat consumers read data
* very often or when they read PG utilization data and then CPU utilization
* data quickly after that.
*/
#define CU_UPDATE_THRESHOLD (NANOSEC / 10)
/*
* The IS_HIPIL() macro verifies that the code is executed either from a
* cross-call or from high-PIL interrupt
*/
#ifdef DEBUG
#define IS_HIPIL() (getpil() >= XCALL_PIL)
#else
#define IS_HIPIL()
#endif /* DEBUG */
typedef void (*cu_cpu_func_t)(uintptr_t, int *);
/*
* Flags to use for programming CPU hardware performance counters to measure
* capacity and utilization
*/
int cu_cpc_flags = CU_CPC_FLAGS_DEFAULT;
/*
* Initial value used for programming hardware counters
*/
uint64_t cu_cpc_preset_value = 0;
/*
* List of CPC event requests for capacity and utilization.
*/
static kcpc_request_list_t *cu_cpc_reqs = NULL;
/*
* When a CPU is a member of PG with a sharing relationship that is supported
* by the capacity/utilization framework, a kstat is created for that CPU and
* sharing relationship.
*
* These kstats are updated one at a time, so we can have a single scratch
* space to fill the data.
*
* CPU counter kstats fields:
*
* cu_cpu_id CPU ID for this kstat
*
* cu_pg_id PG ID for this kstat
*
* cu_generation Generation value that increases whenever any CPU goes
* offline or online. Two kstat snapshots for the same
* CPU may only be compared if they have the same
* generation.
*
* cu_cpu_util Running value of CPU utilization for the sharing
* relationship
*
* cu_cpu_time_running Total time spent collecting CU data. The time may be
* less than wall time if CU counters were stopped for
* some time.
*
* cu_cpu_time_stopped Total time the CU counters were stopped.
*
* cu_cpu_rate Utilization rate, expressed in operations per second.
*
* cu_cpu_rate_max Maximum observed value of utilization rate.
*
* cu_cpu_relationship Name of sharing relationship for the PG in this kstat
*/
struct cu_cpu_kstat {
kstat_named_t cu_cpu_id;
kstat_named_t cu_pg_id;
kstat_named_t cu_generation;
kstat_named_t cu_cpu_util;
kstat_named_t cu_cpu_time_running;
kstat_named_t cu_cpu_time_stopped;
kstat_named_t cu_cpu_rate;
kstat_named_t cu_cpu_rate_max;
kstat_named_t cu_cpu_relationship;
} cu_cpu_kstat = {
{ "cpu_id", KSTAT_DATA_UINT32 },
{ "pg_id", KSTAT_DATA_INT32 },
{ "generation", KSTAT_DATA_UINT32 },
{ "hw_util", KSTAT_DATA_UINT64 },
{ "hw_util_time_running", KSTAT_DATA_UINT64 },
{ "hw_util_time_stopped", KSTAT_DATA_UINT64 },
{ "hw_util_rate", KSTAT_DATA_UINT64 },
{ "hw_util_rate_max", KSTAT_DATA_UINT64 },
{ "relationship", KSTAT_DATA_STRING },
};
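/*
 * For illustration only: given the layout above, a userland consumer could
 * read one of these kstats with libkstat roughly as follows (error handling
 * omitted; "Integer_Pipeline" is only an example of a canonified sharing
 * relationship name, the actual names come from pghw_type_string()):
 *
 *	kstat_ctl_t	*kc = kstat_open();
 *	kstat_t		*ksp = kstat_lookup(kc, "pg_hw_perf_cpu", 0,
 *	    "Integer_Pipeline");
 *	if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
 *		kstat_named_t *kn = kstat_data_lookup(ksp, "hw_util_rate");
 *		(void) printf("%llu\n", (u_longlong_t)kn->value.ui64);
 *	}
 *	(void) kstat_close(kc);
 *
 * or simply via kstat(8), e.g. "kstat -m pg_hw_perf_cpu".
 */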
/*
* Flags for controlling this module
*/
uint_t cu_flags = CU_FLAG_ENABLE;
/*
* Error return value for cu_init() since it can't return anything to be called
* from mp_init_tbl[] (:-(
*/
static int cu_init_error = 0;
hrtime_t cu_sample_interval_min = CU_SAMPLE_INTERVAL_MIN;
hrtime_t cu_update_threshold = CU_UPDATE_THRESHOLD;
static kmutex_t pg_cpu_kstat_lock;
/*
* Forward declaration of interface routines
*/
void cu_disable(void);
void cu_enable(void);
void cu_init(void);
void cu_cpc_program(cpu_t *cp, int *err);
void cu_cpc_unprogram(cpu_t *cp, int *err);
int cu_cpu_update(struct cpu *cp, boolean_t move_to);
void cu_pg_update(pghw_t *pg);
/*
* Forward declaration of private routines
*/
static int cu_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs);
static void cu_cpc_program_xcall(uintptr_t arg, int *err);
static int cu_cpc_req_add(char *event, kcpc_request_list_t *reqs,
int nreqs, cu_cntr_stats_t *stats, int kmem_flags, int *nevents);
static int cu_cpu_callback(cpu_setup_t what, int id, void *arg);
static void cu_cpu_disable(cpu_t *cp);
static void cu_cpu_enable(cpu_t *cp);
static int cu_cpu_init(cpu_t *cp, kcpc_request_list_t *reqs);
static int cu_cpu_fini(cpu_t *cp);
static void cu_cpu_kstat_create(pghw_t *pg, cu_cntr_info_t *cntr_info);
static int cu_cpu_kstat_update(kstat_t *ksp, int rw);
static int cu_cpu_run(cpu_t *cp, cu_cpu_func_t func, uintptr_t arg);
static int cu_cpu_update_stats(cu_cntr_stats_t *stats,
uint64_t cntr_value);
static void cu_cpu_info_detach_xcall(void);
/*
* Disable or enable Capacity Utilization counters on all CPUs.
*/
void
cu_disable(void)
{
cpu_t *cp;
ASSERT(MUTEX_HELD(&cpu_lock));
cp = cpu_active;
do {
if (!(cp->cpu_flags & CPU_OFFLINE))
cu_cpu_disable(cp);
} while ((cp = cp->cpu_next_onln) != cpu_active);
}
void
cu_enable(void)
{
cpu_t *cp;
ASSERT(MUTEX_HELD(&cpu_lock));
cp = cpu_active;
do {
if (!(cp->cpu_flags & CPU_OFFLINE))
cu_cpu_enable(cp);
} while ((cp = cp->cpu_next_onln) != cpu_active);
}
/*
* Setup capacity and utilization support
*/
void
cu_init(void)
{
cpu_t *cp;
cu_init_error = 0;
if (!(cu_flags & CU_FLAG_ENABLE) || (cu_flags & CU_FLAG_ON)) {
cu_init_error = -1;
return;
}
if (kcpc_init() != 0) {
cu_init_error = -2;
return;
}
/*
* Can't measure hardware capacity and utilization without CPU
* hardware performance counters
*/
if (cpc_ncounters <= 0) {
cu_init_error = -3;
return;
}
/*
* Setup CPC event request queue
*/
cu_cpc_reqs = kcpc_reqs_init(cpc_ncounters, KM_SLEEP);
mutex_enter(&cpu_lock);
/*
* Mark flags to say that module is ready to be setup
*/
cu_flags |= CU_FLAG_READY;
cp = cpu_active;
do {
/*
* Allocate and setup state needed to measure capacity and
* utilization
*/
if (cu_cpu_init(cp, cu_cpc_reqs) != 0)
cu_init_error = -5;
/*
* Reset list of counter event requests so its space can be
* reused for a different set of requests for next CPU
*/
(void) kcpc_reqs_reset(cu_cpc_reqs);
cp = cp->cpu_next_onln;
} while (cp != cpu_active);
/*
* Mark flags to say that module is on now and counters are ready to be
* programmed on all active CPUs
*/
cu_flags |= CU_FLAG_ON;
/*
* Program counters on currently active CPUs
*/
cp = cpu_active;
do {
if (cu_cpu_run(cp, cu_cpc_program_xcall,
(uintptr_t)B_FALSE) != 0)
cu_init_error = -6;
cp = cp->cpu_next_onln;
} while (cp != cpu_active);
/*
* Register callback for CPU state changes to enable and disable
* CPC counters as CPUs come on and offline
*/
register_cpu_setup_func(cu_cpu_callback, NULL);
mutex_exit(&cpu_lock);
}
/*
 * Return the number of counter events needed to measure capacity and
 * utilization for the specified CPU and, if a list to add CPC requests to is
 * given, fill it in with each counter event needed
*
* NOTE: Use KM_NOSLEEP for kmem_{,z}alloc() since cpu_lock is held and free
* everything that has been successfully allocated if any memory
* allocation fails
*/
static int
cu_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs)
{
group_t *cmt_pgs;
cu_cntr_info_t **cntr_info_array;
cpu_pg_t *cpu_pgs;
cu_cpu_info_t *cu_cpu_info;
pg_cmt_t *pg_cmt;
pghw_t *pg_hw;
cu_cntr_stats_t *stats;
int nevents;
pghw_type_t pg_hw_type;
group_iter_t iter;
ASSERT(MUTEX_HELD(&cpu_lock));
/*
* There has to be a target CPU for this
*/
if (cp == NULL)
return (-1);
/*
* Return 0 when CPU doesn't belong to any group
*/
cpu_pgs = cp->cpu_pg;
if (cpu_pgs == NULL || GROUP_SIZE(&cpu_pgs->cmt_pgs) < 1)
return (0);
cmt_pgs = &cpu_pgs->cmt_pgs;
cu_cpu_info = cp->cpu_cu_info;
/*
* Grab counter statistics and info
*/
if (reqs == NULL) {
stats = NULL;
cntr_info_array = NULL;
} else {
if (cu_cpu_info == NULL || cu_cpu_info->cu_cntr_stats == NULL)
return (-2);
stats = cu_cpu_info->cu_cntr_stats;
cntr_info_array = cu_cpu_info->cu_cntr_info;
}
/*
* See whether platform (or processor) specific code knows which CPC
* events to request, etc. are needed to measure hardware capacity and
* utilization on this machine
*/
nevents = cu_plat_cpc_init(cp, reqs, nreqs);
if (nevents >= 0)
return (nevents);
/*
* Let common code decide which CPC events to request, etc. to measure
* capacity and utilization since platform (or processor) specific does
* not know....
*
* Walk CPU's PG lineage and do following:
*
* - Setup CPC request, counter info, and stats needed for each counter
 * event to measure capacity and utilization for each of CPU's PG
* hardware sharing relationships
*
* - Create PG CPU kstats to export capacity and utilization for each PG
*/
nevents = 0;
group_iter_init(&iter);
while ((pg_cmt = group_iterate(cmt_pgs, &iter)) != NULL) {
cu_cntr_info_t *cntr_info;
int nevents_save;
int nstats;
pg_hw = (pghw_t *)pg_cmt;
pg_hw_type = pg_hw->pghw_hw;
nevents_save = nevents;
nstats = 0;
switch (pg_hw_type) {
case PGHW_IPIPE:
if (cu_cpc_req_add("PAPI_tot_ins", reqs, nreqs, stats,
KM_NOSLEEP, &nevents) != 0)
continue;
nstats = 1;
break;
case PGHW_FPU:
if (cu_cpc_req_add("PAPI_fp_ins", reqs, nreqs, stats,
KM_NOSLEEP, &nevents) != 0)
continue;
nstats = 1;
break;
default:
/*
* Don't measure capacity and utilization for this kind
* of PG hardware relationship so skip to next PG in
* CPU's PG lineage
*/
continue;
}
cntr_info = cntr_info_array[pg_hw_type];
/*
* Nothing to measure for this hardware sharing relationship
*/
if (nevents - nevents_save == 0) {
if (cntr_info != NULL) {
kmem_free(cntr_info, sizeof (cu_cntr_info_t));
cntr_info_array[pg_hw_type] = NULL;
}
continue;
}
/*
* Fill in counter info for this PG hardware relationship
*/
if (cntr_info == NULL) {
cntr_info = kmem_zalloc(sizeof (cu_cntr_info_t),
KM_NOSLEEP);
if (cntr_info == NULL)
continue;
cntr_info_array[pg_hw_type] = cntr_info;
}
cntr_info->ci_cpu = cp;
cntr_info->ci_pg = pg_hw;
cntr_info->ci_stats = &stats[nevents_save];
cntr_info->ci_nstats = nstats;
/*
* Create PG CPU kstats for this hardware relationship
*/
cu_cpu_kstat_create(pg_hw, cntr_info);
}
return (nevents);
}
/*
* Program counters for capacity and utilization on given CPU
*
* If any of the following conditions is true, the counters are not programmed:
*
* - CU framework is disabled
* - The cpu_cu_info field of the cpu structure is NULL
 * - DTrace CPC is active
* - Counters are programmed already
* - Counters are disabled (by calls to cu_cpu_disable())
*/
void
cu_cpc_program(cpu_t *cp, int *err)
{
cu_cpc_ctx_t *cpu_ctx;
kcpc_ctx_t *ctx;
cu_cpu_info_t *cu_cpu_info;
ASSERT(IS_HIPIL());
/*
* Should be running on given CPU. We disable preemption to keep CPU
* from disappearing and make sure flags and CPC context don't change
* from underneath us
*/
kpreempt_disable();
ASSERT(cp == CPU);
/*
* Module not ready to program counters
*/
if (!(cu_flags & CU_FLAG_ON)) {
*err = -1;
kpreempt_enable();
return;
}
if (cp == NULL) {
*err = -2;
kpreempt_enable();
return;
}
cu_cpu_info = cp->cpu_cu_info;
if (cu_cpu_info == NULL) {
*err = -3;
kpreempt_enable();
return;
}
/*
* If DTrace CPC is active or counters turned on already or are
* disabled, just return.
*/
if (dtrace_cpc_in_use || (cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON) ||
cu_cpu_info->cu_disabled) {
*err = 1;
kpreempt_enable();
return;
}
if ((CPU->cpu_cpc_ctx != NULL) &&
!(CPU->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) {
*err = -4;
kpreempt_enable();
return;
}
/*
* Get CPU's CPC context needed for capacity and utilization
*/
cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
ASSERT(cpu_ctx != NULL);
ASSERT(cpu_ctx->nctx >= 0);
ASSERT(cpu_ctx->ctx_ptr_array == NULL || cpu_ctx->ctx_ptr_array_sz > 0);
ASSERT(cpu_ctx->nctx <= cpu_ctx->ctx_ptr_array_sz);
if (cpu_ctx->nctx <= 0 || cpu_ctx->ctx_ptr_array == NULL ||
cpu_ctx->ctx_ptr_array_sz <= 0) {
*err = -5;
kpreempt_enable();
return;
}
/*
* Increment index in CPU's CPC context info to point at next context
* to program
*
* NOTE: Do this now instead of after programming counters to ensure
* that index will always point at *current* context so we will
* always be able to unprogram *current* context if necessary
*/
cpu_ctx->cur_index = (cpu_ctx->cur_index + 1) % cpu_ctx->nctx;
ctx = cpu_ctx->ctx_ptr_array[cpu_ctx->cur_index];
/*
* Clear KCPC_CTX_INVALID and KCPC_CTX_INVALID_STOPPED from CPU's CPC
* context before programming counters
*
* Context is marked with KCPC_CTX_INVALID_STOPPED when context is
* unprogrammed and may be marked with KCPC_CTX_INVALID when
* kcpc_invalidate_all() is called by cpustat(8) and dtrace CPC to
* invalidate all CPC contexts before they take over all the counters.
*
* This isn't necessary since these flags are only used for thread bound
* CPC contexts not CPU bound CPC contexts like ones used for capacity
* and utilization.
*
* There is no need to protect the flag update since no one is using
* this context now.
*/
ctx->kc_flags &= ~(KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
/*
* Program counters on this CPU
*/
kcpc_program(ctx, B_FALSE, B_FALSE);
cp->cpu_cpc_ctx = ctx;
/*
* Set state in CPU structure to say that CPU's counters are programmed
* for capacity and utilization now and that they are transitioning from
* off to on state. This will cause cu_cpu_update to update stop times
* for all programmed counters.
*/
cu_cpu_info->cu_flag |= CU_CPU_CNTRS_ON | CU_CPU_CNTRS_OFF_ON;
/*
* Update counter statistics
*/
(void) cu_cpu_update(cp, B_FALSE);
cu_cpu_info->cu_flag &= ~CU_CPU_CNTRS_OFF_ON;
*err = 0;
kpreempt_enable();
}
/*
* Cross call wrapper routine for cu_cpc_program()
*
* Checks to make sure that counters on CPU aren't being used by someone else
* before calling cu_cpc_program() since cu_cpc_program() needs to assert that
* nobody else is using the counters to catch and prevent any broken code.
* Also, this check needs to happen on the target CPU since the CPU's CPC
* context can only be changed while running on the CPU.
*
* If the first argument is TRUE, cu_cpc_program_xcall also checks that there is
* no valid thread bound cpc context. This is important to check to prevent
* re-programming thread counters with CU counters when CPU is coming on-line.
*/
static void
cu_cpc_program_xcall(uintptr_t arg, int *err)
{
boolean_t avoid_thread_context = (boolean_t)arg;
kpreempt_disable();
if (CPU->cpu_cpc_ctx != NULL &&
!(CPU->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) {
*err = -100;
kpreempt_enable();
return;
}
if (avoid_thread_context && (curthread->t_cpc_ctx != NULL) &&
!(curthread->t_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) {
*err = -200;
kpreempt_enable();
return;
}
cu_cpc_program(CPU, err);
kpreempt_enable();
}
/*
* Unprogram counters for capacity and utilization on given CPU
 * This function should always be executed on the target CPU at high PIL
*/
void
cu_cpc_unprogram(cpu_t *cp, int *err)
{
cu_cpc_ctx_t *cpu_ctx;
kcpc_ctx_t *ctx;
cu_cpu_info_t *cu_cpu_info;
ASSERT(IS_HIPIL());
/*
* Should be running on given CPU with preemption disabled to keep CPU
* from disappearing and make sure flags and CPC context don't change
* from underneath us
*/
kpreempt_disable();
ASSERT(cp == CPU);
/*
* Module not on
*/
if (!(cu_flags & CU_FLAG_ON)) {
*err = -1;
kpreempt_enable();
return;
}
cu_cpu_info = cp->cpu_cu_info;
if (cu_cpu_info == NULL) {
*err = -3;
kpreempt_enable();
return;
}
/*
* Counters turned off already
*/
if (!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON)) {
*err = 1;
kpreempt_enable();
return;
}
/*
* Update counter statistics
*/
(void) cu_cpu_update(cp, B_FALSE);
/*
* Get CPU's CPC context needed for capacity and utilization
*/
cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
if (cpu_ctx->nctx <= 0 || cpu_ctx->ctx_ptr_array == NULL ||
cpu_ctx->ctx_ptr_array_sz <= 0) {
*err = -5;
kpreempt_enable();
return;
}
ctx = cpu_ctx->ctx_ptr_array[cpu_ctx->cur_index];
/*
* CPU's CPC context should be current capacity and utilization CPC
* context
*/
ASSERT(cp->cpu_cpc_ctx == ctx);
if (cp->cpu_cpc_ctx != ctx) {
*err = -6;
kpreempt_enable();
return;
}
/*
* Unprogram counters on CPU.
*/
kcpc_unprogram(ctx, B_FALSE);
ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED);
/*
* Unset state in CPU structure saying that CPU's counters are
* programmed
*/
cp->cpu_cpc_ctx = NULL;
cu_cpu_info->cu_flag &= ~CU_CPU_CNTRS_ON;
*err = 0;
kpreempt_enable();
}
/*
* Add given counter event to list of CPC requests
*/
static int
cu_cpc_req_add(char *event, kcpc_request_list_t *reqs, int nreqs,
cu_cntr_stats_t *stats, int kmem_flags, int *nevents)
{
int n;
int retval;
uint_t flags;
/*
* Return error when no counter event specified, counter event not
* supported by CPC's PCBE, or number of events not given
*/
if (event == NULL || kcpc_event_supported(event) == B_FALSE ||
nevents == NULL)
return (-1);
n = *nevents;
/*
* Only count number of counter events needed if list
* where to add CPC requests not given
*/
if (reqs == NULL) {
n++;
*nevents = n;
return (-3);
}
/*
* Return error when stats not given or not enough room on list of CPC
* requests for more counter events
*/
	if (stats == NULL || nreqs <= 0 || n >= nreqs)
return (-4);
/*
* Use flags in cu_cpc_flags to program counters and enable overflow
* interrupts/traps (unless PCBE can't handle overflow interrupts) so
* PCBE can catch counters before they wrap to hopefully give us an
* accurate (64-bit) virtualized counter
*/
flags = cu_cpc_flags;
if ((kcpc_pcbe_capabilities() & CPC_CAP_OVERFLOW_INTERRUPT) == 0)
flags &= ~CPC_OVF_NOTIFY_EMT;
/*
* Add CPC request to list
*/
retval = kcpc_reqs_add(reqs, event, cu_cpc_preset_value,
flags, 0, NULL, &stats[n], kmem_flags);
if (retval != 0)
return (-5);
n++;
*nevents = n;
return (0);
}
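/*
 * Illustrative two-pass usage of cu_cpc_req_add() (a sketch of what
 * cu_cpc_init() does, assuming the PCBE supports PAPI_tot_ins):
 *
 *	int nevents = 0;
 *
 *	Counting pass, no request list given (returns -3, bumps nevents to 1):
 *	(void) cu_cpc_req_add("PAPI_tot_ins", NULL, 0, NULL, KM_NOSLEEP,
 *	    &nevents);
 *
 *	Filling pass, after allocating stats[nevents] and a request list with
 *	room for nevents requests (reset nevents to 0 and re-count):
 *	nevents = 0;
 *	(void) cu_cpc_req_add("PAPI_tot_ins", reqs, 1, stats, KM_NOSLEEP,
 *	    &nevents);
 */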
static void
cu_cpu_info_detach_xcall(void)
{
ASSERT(IS_HIPIL());
CPU->cpu_cu_info = NULL;
}
/*
* Enable or disable collection of capacity/utilization data for a current CPU.
* Counters are enabled if 'on' argument is True and disabled if it is False.
 * This function should always be executed at high PIL
*/
static void
cu_cpc_trigger(uintptr_t arg1, uintptr_t arg2)
{
cpu_t *cp = (cpu_t *)arg1;
boolean_t on = (boolean_t)arg2;
int error;
cu_cpu_info_t *cu_cpu_info;
ASSERT(IS_HIPIL());
kpreempt_disable();
ASSERT(cp == CPU);
if (!(cu_flags & CU_FLAG_ON)) {
kpreempt_enable();
return;
}
cu_cpu_info = cp->cpu_cu_info;
if (cu_cpu_info == NULL) {
kpreempt_enable();
return;
}
ASSERT(!cu_cpu_info->cu_disabled ||
!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON));
if (on) {
/*
* Decrement the cu_disabled counter.
* Once it drops to zero, call cu_cpc_program.
*/
if (cu_cpu_info->cu_disabled > 0)
cu_cpu_info->cu_disabled--;
if (cu_cpu_info->cu_disabled == 0)
cu_cpc_program(CPU, &error);
} else if (cu_cpu_info->cu_disabled++ == 0) {
/*
* This is the first attempt to disable CU, so turn it off
*/
cu_cpc_unprogram(cp, &error);
ASSERT(!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON));
}
kpreempt_enable();
}
/*
* Callback for changes in CPU states
* Used to enable or disable hardware performance counters on CPUs that are
* turned on or off
*
* NOTE: cpc should be programmed/unprogrammed while running on the target CPU.
* We have to use thread_affinity_set to hop to the right CPU because these
* routines expect cpu_lock held, so we can't cross-call other CPUs while
* holding CPU lock.
*/
static int
/* LINTED E_FUNC_ARG_UNUSED */
cu_cpu_callback(cpu_setup_t what, int id, void *arg)
{
cpu_t *cp;
int retval = 0;
ASSERT(MUTEX_HELD(&cpu_lock));
if (!(cu_flags & CU_FLAG_ON))
return (-1);
cp = cpu_get(id);
if (cp == NULL)
return (-2);
switch (what) {
case CPU_ON:
/*
* Setup counters on CPU being turned on
*/
retval = cu_cpu_init(cp, cu_cpc_reqs);
/*
* Reset list of counter event requests so its space can be
* reused for a different set of requests for next CPU
*/
(void) kcpc_reqs_reset(cu_cpc_reqs);
break;
case CPU_INTR_ON:
/*
* Setup counters on CPU being turned on.
*/
retval = cu_cpu_run(cp, cu_cpc_program_xcall,
(uintptr_t)B_TRUE);
break;
case CPU_OFF:
/*
* Disable counters on CPU being turned off. Counters will not
* be re-enabled on this CPU until it comes back online.
*/
cu_cpu_disable(cp);
ASSERT(!CU_CPC_ON(cp));
retval = cu_cpu_fini(cp);
break;
default:
break;
}
return (retval);
}
/*
* Disable or enable Capacity Utilization counters on a given CPU. This function
* can be called from any CPU to disable counters on the given CPU.
*/
static void
cu_cpu_disable(cpu_t *cp)
{
cpu_call(cp, cu_cpc_trigger, (uintptr_t)cp, (uintptr_t)B_FALSE);
}
static void
cu_cpu_enable(cpu_t *cp)
{
cpu_call(cp, cu_cpc_trigger, (uintptr_t)cp, (uintptr_t)B_TRUE);
}
/*
* Setup capacity and utilization support for given CPU
*
* NOTE: Use KM_NOSLEEP for kmem_{,z}alloc() since cpu_lock is held and free
* everything that has been successfully allocated including cpu_cu_info
* if any memory allocation fails
*/
static int
cu_cpu_init(cpu_t *cp, kcpc_request_list_t *reqs)
{
kcpc_ctx_t **ctx_ptr_array;
size_t ctx_ptr_array_sz;
cu_cpc_ctx_t *cpu_ctx;
cu_cpu_info_t *cu_cpu_info;
int n;
/*
* cpu_lock should be held and protect against CPU going away and races
* with cu_{init,fini,cpu_fini}()
*/
ASSERT(MUTEX_HELD(&cpu_lock));
/*
* Return if not ready to setup counters yet
*/
if (!(cu_flags & CU_FLAG_READY))
return (-1);
if (cp->cpu_cu_info == NULL) {
cp->cpu_cu_info = kmem_zalloc(sizeof (cu_cpu_info_t),
KM_NOSLEEP);
if (cp->cpu_cu_info == NULL)
return (-2);
}
/*
* Get capacity and utilization CPC context for CPU and check to see
* whether it has been setup already
*/
cu_cpu_info = cp->cpu_cu_info;
cu_cpu_info->cu_cpu = cp;
cu_cpu_info->cu_disabled = dtrace_cpc_in_use ? 1 : 0;
cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
if (cpu_ctx->nctx > 0 && cpu_ctx->ctx_ptr_array != NULL &&
cpu_ctx->ctx_ptr_array_sz > 0) {
return (1);
}
/*
* Should have no contexts since it hasn't been setup already
*/
ASSERT(cpu_ctx->nctx == 0 && cpu_ctx->ctx_ptr_array == NULL &&
cpu_ctx->ctx_ptr_array_sz == 0);
/*
* Determine how many CPC events needed to measure capacity and
* utilization for this CPU, allocate space for counter statistics for
* each event, and fill in list of CPC event requests with corresponding
* counter stats for each request to make attributing counter data
* easier later....
*/
n = cu_cpc_init(cp, NULL, 0);
if (n <= 0) {
(void) cu_cpu_fini(cp);
return (-3);
}
cu_cpu_info->cu_cntr_stats = kmem_zalloc(n * sizeof (cu_cntr_stats_t),
KM_NOSLEEP);
if (cu_cpu_info->cu_cntr_stats == NULL) {
(void) cu_cpu_fini(cp);
return (-4);
}
cu_cpu_info->cu_ncntr_stats = n;
n = cu_cpc_init(cp, reqs, n);
if (n <= 0) {
(void) cu_cpu_fini(cp);
return (-5);
}
/*
* Create CPC context with given requests
*/
ctx_ptr_array = NULL;
ctx_ptr_array_sz = 0;
n = kcpc_cpu_ctx_create(cp, reqs, KM_NOSLEEP, &ctx_ptr_array,
&ctx_ptr_array_sz);
if (n <= 0) {
(void) cu_cpu_fini(cp);
return (-6);
}
/*
* Should have contexts
*/
ASSERT(n > 0 && ctx_ptr_array != NULL && ctx_ptr_array_sz > 0);
if (ctx_ptr_array == NULL || ctx_ptr_array_sz <= 0) {
(void) cu_cpu_fini(cp);
return (-7);
}
/*
* Fill in CPC context info for CPU needed for capacity and utilization
*/
cpu_ctx->cur_index = 0;
cpu_ctx->nctx = n;
cpu_ctx->ctx_ptr_array = ctx_ptr_array;
cpu_ctx->ctx_ptr_array_sz = ctx_ptr_array_sz;
return (0);
}
/*
* Tear down capacity and utilization support for given CPU
*/
static int
cu_cpu_fini(cpu_t *cp)
{
kcpc_ctx_t *ctx;
cu_cpc_ctx_t *cpu_ctx;
cu_cpu_info_t *cu_cpu_info;
int i;
pghw_type_t pg_hw_type;
/*
* cpu_lock should be held and protect against CPU going away and races
* with cu_{init,fini,cpu_init}()
*/
ASSERT(MUTEX_HELD(&cpu_lock));
/*
* Have to at least be ready to setup counters to have allocated
* anything that needs to be deallocated now
*/
if (!(cu_flags & CU_FLAG_READY))
return (-1);
/*
* Nothing to do if CPU's capacity and utilization info doesn't exist
*/
cu_cpu_info = cp->cpu_cu_info;
if (cu_cpu_info == NULL)
return (1);
/*
* Tear down any existing kstats and counter info for each hardware
* sharing relationship
*/
for (pg_hw_type = PGHW_START; pg_hw_type < PGHW_NUM_COMPONENTS;
pg_hw_type++) {
cu_cntr_info_t *cntr_info;
cntr_info = cu_cpu_info->cu_cntr_info[pg_hw_type];
if (cntr_info == NULL)
continue;
if (cntr_info->ci_kstat != NULL) {
kstat_delete(cntr_info->ci_kstat);
cntr_info->ci_kstat = NULL;
}
kmem_free(cntr_info, sizeof (cu_cntr_info_t));
}
/*
* Free counter statistics for CPU
*/
ASSERT(cu_cpu_info->cu_cntr_stats == NULL ||
cu_cpu_info->cu_ncntr_stats > 0);
if (cu_cpu_info->cu_cntr_stats != NULL &&
cu_cpu_info->cu_ncntr_stats > 0) {
kmem_free(cu_cpu_info->cu_cntr_stats,
cu_cpu_info->cu_ncntr_stats * sizeof (cu_cntr_stats_t));
cu_cpu_info->cu_cntr_stats = NULL;
cu_cpu_info->cu_ncntr_stats = 0;
}
/*
* Get capacity and utilization CPC contexts for given CPU and check to
* see whether they have been freed already
*/
cpu_ctx = &cu_cpu_info->cu_cpc_ctx;
if (cpu_ctx != NULL && cpu_ctx->ctx_ptr_array != NULL &&
cpu_ctx->ctx_ptr_array_sz > 0) {
/*
* Free CPC contexts for given CPU
*/
for (i = 0; i < cpu_ctx->nctx; i++) {
ctx = cpu_ctx->ctx_ptr_array[i];
if (ctx == NULL)
continue;
kcpc_free_cpu(ctx);
}
/*
* Free CPC context pointer array
*/
kmem_free(cpu_ctx->ctx_ptr_array, cpu_ctx->ctx_ptr_array_sz);
/*
* Zero CPC info for CPU
*/
bzero(cpu_ctx, sizeof (cu_cpc_ctx_t));
}
/*
* Set cp->cpu_cu_info pointer to NULL. Go through cross-call to ensure
 * that no one is going to access the cpu_cu_info which we are going to
* free.
*/
if (cpu_is_online(cp))
cpu_call(cp, (cpu_call_func_t)cu_cpu_info_detach_xcall, 0, 0);
else
cp->cpu_cu_info = NULL;
/*
* Free CPU's capacity and utilization info
*/
kmem_free(cu_cpu_info, sizeof (cu_cpu_info_t));
return (0);
}
/*
* Create capacity & utilization kstats for given PG CPU hardware sharing
* relationship
*/
static void
cu_cpu_kstat_create(pghw_t *pg, cu_cntr_info_t *cntr_info)
{
kstat_t *ks;
char *sharing = pghw_type_string(pg->pghw_hw);
char name[KSTAT_STRLEN + 1];
/*
* Just return when no counter info or CPU
*/
if (cntr_info == NULL || cntr_info->ci_cpu == NULL)
return;
/*
* Canonify PG name to conform to kstat name rules
*/
(void) strncpy(name, pghw_type_string(pg->pghw_hw), KSTAT_STRLEN + 1);
strident_canon(name, TASKQ_NAMELEN + 1);
if ((ks = kstat_create_zone("pg_hw_perf_cpu",
cntr_info->ci_cpu->cpu_id,
name, "processor_group", KSTAT_TYPE_NAMED,
sizeof (cu_cpu_kstat) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID)) == NULL)
return;
ks->ks_lock = &pg_cpu_kstat_lock;
ks->ks_data = &cu_cpu_kstat;
ks->ks_update = cu_cpu_kstat_update;
ks->ks_data_size += strlen(sharing) + 1;
ks->ks_private = cntr_info;
cntr_info->ci_kstat = ks;
kstat_install(cntr_info->ci_kstat);
}
/*
* Propagate values from CPU capacity & utilization stats to kstats
*/
static int
cu_cpu_kstat_update(kstat_t *ksp, int rw)
{
cpu_t *cp;
cu_cntr_info_t *cntr_info = ksp->ks_private;
struct cu_cpu_kstat *kstat = &cu_cpu_kstat;
pghw_t *pg;
cu_cntr_stats_t *stats;
if (rw == KSTAT_WRITE)
return (EACCES);
cp = cntr_info->ci_cpu;
pg = cntr_info->ci_pg;
kstat->cu_cpu_id.value.ui32 = cp->cpu_id;
kstat->cu_pg_id.value.i32 = ((pg_t *)pg)->pg_id;
/*
* The caller should have priv_cpc_cpu privilege to get utilization
* data. Callers who do not have the privilege will see zeroes as the
* values.
*/
if (secpolicy_cpc_cpu(crgetcred()) != 0) {
kstat->cu_generation.value.ui32 = cp->cpu_generation;
kstat_named_setstr(&kstat->cu_cpu_relationship,
pghw_type_string(pg->pghw_hw));
kstat->cu_cpu_util.value.ui64 = 0;
kstat->cu_cpu_rate.value.ui64 = 0;
kstat->cu_cpu_rate_max.value.ui64 = 0;
kstat->cu_cpu_time_running.value.ui64 = 0;
kstat->cu_cpu_time_stopped.value.ui64 = 0;
return (0);
}
kpreempt_disable();
/*
* Update capacity and utilization statistics needed for CPU's PG (CPU)
* kstats
*/
(void) cu_cpu_update(cp, B_TRUE);
stats = cntr_info->ci_stats;
kstat->cu_generation.value.ui32 = cp->cpu_generation;
kstat_named_setstr(&kstat->cu_cpu_relationship,
pghw_type_string(pg->pghw_hw));
kstat->cu_cpu_util.value.ui64 = stats->cs_value_total;
kstat->cu_cpu_rate.value.ui64 = stats->cs_rate;
kstat->cu_cpu_rate_max.value.ui64 = stats->cs_rate_max;
kstat->cu_cpu_time_running.value.ui64 = stats->cs_time_running;
kstat->cu_cpu_time_stopped.value.ui64 = stats->cs_time_stopped;
/*
 * If counters are stopped now, the cs_time_stopped was last
* updated at cs_time_start time. Add the time passed since then
* to the stopped time.
*/
if (!(cp->cpu_cu_info->cu_flag & CU_CPU_CNTRS_ON))
kstat->cu_cpu_time_stopped.value.ui64 +=
gethrtime() - stats->cs_time_start;
kpreempt_enable();
return (0);
}
/*
* Run specified function with specified argument on a given CPU and return
* whatever the function returns
*/
static int
cu_cpu_run(cpu_t *cp, cu_cpu_func_t func, uintptr_t arg)
{
int error = 0;
/*
* cpu_call() will call func on the CPU specified with given argument
* and return func's return value in last argument
*/
cpu_call(cp, (cpu_call_func_t)(uintptr_t)func, arg, (uintptr_t)&error);
return (error);
}
/*
* Update counter statistics on a given CPU.
*
* If move_to argument is True, execute the function on the CPU specified
 * Otherwise, assume that it is already running on the right CPU
*
* If move_to is specified, the caller should hold cpu_lock or have preemption
* disabled. Otherwise it is up to the caller to guarantee that things do not
* change in the process.
*/
int
cu_cpu_update(struct cpu *cp, boolean_t move_to)
{
int retval;
cu_cpu_info_t *cu_cpu_info = cp->cpu_cu_info;
hrtime_t time_snap;
ASSERT(!move_to || MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0);
/*
* Nothing to do if counters are not programmed
*/
if (!(cu_flags & CU_FLAG_ON) ||
(cu_cpu_info == NULL) ||
!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON))
return (0);
/*
* Don't update CPU statistics if it was updated recently
* and provide old results instead
*/
time_snap = gethrtime();
if ((time_snap - cu_cpu_info->cu_sample_time) < cu_update_threshold) {
DTRACE_PROBE1(cu__drop__sample, cpu_t *, cp);
return (0);
}
cu_cpu_info->cu_sample_time = time_snap;
/*
* CPC counter should be read on the CPU that is running the counter. We
 * either have to move ourselves to the target CPU or ensure that we
* already run there.
*
* We use cross-call to the target CPU to execute kcpc_read() and
* cu_cpu_update_stats() there.
*/
retval = 0;
if (move_to)
(void) cu_cpu_run(cp, (cu_cpu_func_t)(uintptr_t)kcpc_read,
(uintptr_t)cu_cpu_update_stats);
else {
retval = kcpc_read((kcpc_update_func_t)cu_cpu_update_stats);
/*
* Offset negative return value by -10 so we can distinguish it
* from error return values of this routine vs kcpc_read()
*/
if (retval < 0)
retval -= 10;
}
return (retval);
}
/*
* Update CPU counter statistics for current CPU.
* This function may be called from a cross-call
*/
static int
cu_cpu_update_stats(cu_cntr_stats_t *stats, uint64_t cntr_value)
{
cu_cpu_info_t *cu_cpu_info = CPU->cpu_cu_info;
uint_t flags;
uint64_t delta;
hrtime_t time_delta;
hrtime_t time_snap;
if (stats == NULL)
return (-1);
/*
* Nothing to do if counters are not programmed. This should not happen,
* but we check just in case.
*/
ASSERT(cu_flags & CU_FLAG_ON);
ASSERT(cu_cpu_info != NULL);
if (!(cu_flags & CU_FLAG_ON) ||
(cu_cpu_info == NULL))
return (-2);
flags = cu_cpu_info->cu_flag;
ASSERT(flags & CU_CPU_CNTRS_ON);
if (!(flags & CU_CPU_CNTRS_ON))
return (-2);
/*
* Take snapshot of high resolution timer
*/
time_snap = gethrtime();
/*
* CU counters have just been programmed. We cannot assume that the new
* cntr_value continues from where we left off, so use the cntr_value as
* the new initial value.
*/
if (flags & CU_CPU_CNTRS_OFF_ON)
stats->cs_value_start = cntr_value;
/*
* Calculate delta in counter values between start of sampling period
* and now
*/
delta = cntr_value - stats->cs_value_start;
/*
* Calculate time between start of sampling period and now
*/
time_delta = stats->cs_time_start ?
time_snap - stats->cs_time_start :
0;
stats->cs_time_start = time_snap;
stats->cs_value_start = cntr_value;
if (time_delta > 0) { /* wrap shouldn't happen */
/*
* Update either running or stopped time based on the transition
* state
*/
if (flags & CU_CPU_CNTRS_OFF_ON)
stats->cs_time_stopped += time_delta;
else
stats->cs_time_running += time_delta;
}
/*
* Update rest of counter statistics if counter value didn't wrap
*/
if (delta > 0) {
/*
* Update utilization rate if the interval between samples is
* sufficient.
*/
ASSERT(cu_sample_interval_min > CU_SCALE);
if (time_delta > cu_sample_interval_min)
stats->cs_rate = CU_RATE(delta, time_delta);
if (stats->cs_rate_max < stats->cs_rate)
stats->cs_rate_max = stats->cs_rate;
stats->cs_value_last = delta;
stats->cs_value_total += delta;
}
return (0);
}
/*
* Update CMT PG utilization data.
*
* This routine computes the running total utilization and times for the
* specified PG by adding up the total utilization and counter running and
* stopped times of all CPUs in the PG and calculates the utilization rate and
* maximum rate for all CPUs in the PG.
*/
void
cu_pg_update(pghw_t *pg)
{
pg_cpu_itr_t cpu_iter;
pghw_type_t pg_hwtype;
cpu_t *cpu;
pghw_util_t *hw_util = &pg->pghw_stats;
uint64_t old_utilization = hw_util->pghw_util;
hrtime_t now;
hrtime_t time_delta;
uint64_t utilization_delta;
ASSERT(MUTEX_HELD(&cpu_lock));
now = gethrtime();
pg_hwtype = pg->pghw_hw;
/*
* Initialize running total utilization and times for PG to 0
*/
hw_util->pghw_util = 0;
hw_util->pghw_time_running = 0;
hw_util->pghw_time_stopped = 0;
/*
* Iterate over all CPUs in the PG and aggregate utilization, running
* time and stopped time.
*/
PG_CPU_ITR_INIT(pg, cpu_iter);
while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
cu_cpu_info_t *cu_cpu_info = cpu->cpu_cu_info;
cu_cntr_info_t *cntr_info;
cu_cntr_stats_t *stats;
if (cu_cpu_info == NULL)
continue;
/*
* Update utilization data for the CPU and then
* aggregate per CPU running totals for PG
*/
(void) cu_cpu_update(cpu, B_TRUE);
cntr_info = cu_cpu_info->cu_cntr_info[pg_hwtype];
if (cntr_info == NULL || (stats = cntr_info->ci_stats) == NULL)
continue;
hw_util->pghw_util += stats->cs_value_total;
hw_util->pghw_time_running += stats->cs_time_running;
hw_util->pghw_time_stopped += stats->cs_time_stopped;
/*
* If counters are stopped now, the pg_time_stopped was last
* updated at cs_time_start time. Add the time passed since then
* to the stopped time.
*/
if (!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON))
hw_util->pghw_time_stopped +=
now - stats->cs_time_start;
}
/*
* Compute per PG instruction rate and maximum rate
*/
time_delta = now - hw_util->pghw_time_stamp;
hw_util->pghw_time_stamp = now;
if (old_utilization == 0)
return;
/*
* Calculate change in utilization over sampling period and set this to
* 0 if the delta would be 0 or negative which may happen if any CPUs go
* offline during the sampling period
*/
if (hw_util->pghw_util > old_utilization)
utilization_delta = hw_util->pghw_util - old_utilization;
else
utilization_delta = 0;
/*
* Update utilization rate if the interval between samples is
* sufficient.
*/
ASSERT(cu_sample_interval_min > CU_SCALE);
if (time_delta > CU_SAMPLE_INTERVAL_MIN)
hw_util->pghw_rate = CU_RATE(utilization_delta, time_delta);
/*
* Update the maximum observed rate
*/
if (hw_util->pghw_rate_max < hw_util->pghw_rate)
hw_util->pghw_rate_max = hw_util->pghw_rate;
}