/*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*/
/*
* Copyright 2016 Joyent, Inc.
*/
#include <sys/asm_linkage.h>
#include <sys/segments.h>
#include <sys/time_impl.h>
#include <sys/tsc.h>
#include <cp_offsets.h>
#define GETCPU_GDT_OFFSET SEL_GDT(GDT_CPUID, SEL_UPL)
.file "cp_subr.s"
/*
* These are cloned from the TSC and time-related code in the kernel. They
* must be kept in sync if the source values change.
* See: uts/i86pc/os/timestamp.c
*/
#define NSEC_SHIFT 5
#define ADJ_SHIFT 4
#define NANOSEC 0x3b9aca00
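/* 0x3b9aca00 == 1,000,000,000: the number of nanoseconds in a second. */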
/*
* hrtime_t
* __cp_tsc_read(comm_page_t *cp)
*
* Stack usage: 0 bytes
*/
ENTRY_NP(__cp_tsc_read)
movl CP_TSC_TYPE(%rdi), %esi
movl CP_TSC_NCPU(%rdi), %r8d
leaq CP_TSC_SYNC_TICK_DELTA(%rdi), %r9
cmpl $TSC_TSCP, %esi
jne 2f
rdtscp
/*
* When the TSC is read, the low 32 bits are placed in %eax while the
* high 32 bits are placed in %edx. They are shifted and ORed together
* to obtain the full 64-bit value.
*/
shlq $0x20, %rdx
orq %rdx, %rax
cmpl $0, %r8d
jne 1f
ret
1:
/*
* When cp_tsc_ncpu is non-zero, it indicates the length of the
* cp_tsc_sync_tick_delta array, which contains per-CPU offsets for the
* TSC. The CPU ID furnished by the IA32_TSC_AUX register via rdtscp
* is used to look up an offset value in that array and apply it to the
* TSC reading.
*/
movq (%r9, %rcx, 8), %rdx
addq %rdx, %rax
ret
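/*
* In C terms, the rdtscp path above is roughly the following sketch,
* with rdtscp() standing in for the instruction, which yields the TSC
* and stores the contents of IA32_TSC_AUX -- the current CPU ID --
* through its argument:
*
*	uint32_t cpu;
*	uint64_t tsc = rdtscp(&cpu);
*	if (cp->cp_tsc_ncpu != 0)
*		tsc += cp->cp_tsc_sync_tick_delta[cpu];
*	return (tsc);
*/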
2:
/*
* Without rdtscp, there is no way to perform a TSC reading and
* simultaneously query the current CPU. If tsc_ncpu indicates that
* per-CPU TSC offsets are present, the ID of the current CPU is
* queried before performing a TSC reading. It will later be compared
* to a second CPU ID lookup to catch CPU migrations.
*
* This method will catch all but the most pathological scheduling.
*/
cmpl $0, %r8d
je 3f
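/*
* The lsl-based lookup relies on the kernel maintaining, for each CPU,
* a GDT entry (GDT_CPUID) whose segment limit holds that CPU's ID.
* The unprivileged lsl (load segment limit) instruction can then read
* it from userspace, doubling as a getcpu primitive.
*/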
movl $GETCPU_GDT_OFFSET, %edx
lsl %dx, %edx
3:
/* Save the most recently queried CPU ID for later comparison. */
movl %edx, %r10d
cmpl $TSC_RDTSC_MFENCE, %esi
jne 4f
mfence
rdtsc
jmp 7f
4:
cmpl $TSC_RDTSC_LFENCE, %esi
jne 5f
lfence
rdtsc
jmp 7f
5:
cmpl $TSC_RDTSC_CPUID, %esi
jne 6f
/*
* Since the amd64 ABI dictates that %rbx is callee-saved, it must be
* preserved here. Its contents will be overwritten when cpuid is used
* as a serializing instruction.
*/
movq %rbx, %r11
xorl %eax, %eax
cpuid
rdtsc
movq %r11, %rbx
jmp 7f
6:
/*
* Other protections should have prevented this function from being
* called in the first place. The only sane action is to abort.
* The easiest means in this context is via SIGILL.
*/
ud2a
7:
shlq $0x20, %rdx
orq %rdx, %rax
/*
* Query the current CPU again if a per-CPU offset is being applied to
* the TSC reading. If the result differs from the earlier reading,
* then a migration has occurred and the TSC must be read again.
*/
cmpl $0, %r8d
je 8f
movl $GETCPU_GDT_OFFSET, %edx
lsl %dx, %edx
cmpl %edx, %r10d
jne 3b
movq (%r9, %rdx, 8), %rdx
addq %rdx, %rax
8:
ret
SET_SIZE(__cp_tsc_read)
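/*
* A rough C sketch of the non-rdtscp fallback above, with getcpu()
* standing in for the lsl-based lookup and rdtsc_fenced() for
* whichever fence+rdtsc sequence was selected:
*
*	if (cp->cp_tsc_ncpu != 0)
*		cpu = getcpu();
*	for (;;) {
*		tsc = rdtsc_fenced();
*		if (cp->cp_tsc_ncpu == 0)
*			return (tsc);
*		next = getcpu();
*		if (next == cpu)
*			return (tsc + cp->cp_tsc_sync_tick_delta[cpu]);
*		cpu = next;
*	}
*/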
/*
* uint_t
* __cp_getcpu(comm_page_t *)
*
* Stack usage: 0 bytes
*/
ENTRY_NP(__cp_getcpu)
movl CP_TSC_TYPE(%rdi), %edi
/*
* If RDTSCP is available, it is a quick way to grab the cpu_id which
* is stored in the TSC_AUX MSR by the kernel.
*/
cmpl $TSC_TSCP, %edi
jne 1f
rdtscp
movl %ecx, %eax
ret
1:
mov $GETCPU_GDT_OFFSET, %eax
lsl %ax, %eax
ret
SET_SIZE(__cp_getcpu)
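/*
* Equivalent C for the rdtscp path, using the __rdtscp() compiler
* intrinsic as a stand-in (the non-rdtscp case falls back to the
* lsl-based lookup described earlier):
*
*	uint32_t cpu;
*	(void) __rdtscp(&cpu);
*	return (cpu);
*/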
/*
* hrtime_t
* __cp_gethrtime(comm_page_t *cp)
*
* Stack usage: 0x20 local + 0x8 call = 0x28 bytes
*
* %rsp+0x00 - hrtime_t tsc_last
* %rsp+0x08 - hrtime_t hrtime_base
* %rsp+0x10 - comm_page_t *cp
* %rsp+0x18 - int hres_lock
*/
ENTRY_NP(__cp_gethrtime)
subq $0x20, %rsp
movq %rdi, 0x10(%rsp)
1:
movl CP_HRES_LOCK(%rdi), %r9d
movl %r9d, 0x18(%rsp)
movq CP_TSC_LAST(%rdi), %rax
movq CP_TSC_HRTIME_BASE(%rdi), %rdx
movq %rax, (%rsp)
movq %rdx, 0x8(%rsp)
call __cp_tsc_read
movq 0x10(%rsp), %rdi
movl 0x18(%rsp), %r9d
movl CP_HRES_LOCK(%rdi), %edx
andl $0xfffffffe, %r9d
cmpl %r9d, %edx
jne 1b
/*
* The in-kernel logic for calculating hrtime performs several checks
* to protect against edge cases. That logic is summarized as:
* if (tsc >= tsc_last) {
* delta -= tsc_last;
* } else if (tsc >= tsc_last - 2*tsc_max_delta) {
* delta = 0;
* } else {
* delta = MIN(tsc, tsc_resume_cap);
* }
*
* The below implementation achieves the same result, although it is
* structured for speed and optimized for the fast path:
*
* delta = tsc - tsc_last;
* if (delta < 0) {
* delta += (tsc_max_delta << 1);
* if (delta >= 0) {
* delta = 0;
* } else {
* delta = MIN(tsc, tsc_resume_cap);
* }
* }
*/
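/*
* The sign tests below are carried out with unsigned flag arithmetic:
* the subq sets the carry flag precisely when tsc < tsc_last, and the
* later addq carries precisely when delta + (tsc_max_delta << 1) wraps
* back up to a non-negative value.
*/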
movq (%rsp), %rdx
subq %rdx, %rax /* delta = tsc - tsc_last */
jb 3f /* if (delta < 0) */
2:
/*
* Optimized TSC_CONVERT_AND_ADD:
* hrtime_base += (tsc_delta * nsec_scale) >> (32 - NSEC_SHIFT)
*
* Since the multiply and shift are done in 128-bit, there is no need
* to worry about overflow.
*/
movl CP_NSEC_SCALE(%rdi), %ecx
mulq %rcx
shrdq $_CONST(32 - NSEC_SHIFT), %rdx, %rax
movq 0x8(%rsp), %r8
addq %r8, %rax
addq $0x20, %rsp
ret
3:
movq %rax, %r9 /* save (tsc - tsc_last) in r9 */
movl CP_TSC_MAX_DELTA(%rdi), %ecx
sall $1, %ecx
addq %rcx, %rax /* delta += (tsc_max_delta << 1) */
jae 4f /* delta < 0 */
xorq %rax, %rax
jmp 2b
4:
/*
* Repopulate %rax with the TSC reading by adding tsc_last to %r9
* (which holds tsc - tsc_last)
*/
movq (%rsp), %rax
addq %r9, %rax
/* delta = MIN(tsc, resume_cap) */
movq CP_TSC_RESUME_CAP(%rdi), %rcx
cmpq %rcx, %rax
jbe 5f
movq %rcx, %rax
5:
jmp 2b
SET_SIZE(__cp_gethrtime)
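/*
* The scaling at label 2 above, sketched in C with the compiler
* extension __int128 playing the role of the 128-bit mulq result:
*
*	uint64_t delta;		/* TSC ticks since cp_tsc_last */
*
*	return (cp->cp_tsc_hrtime_base +
*	    (uint64_t)(((unsigned __int128)delta * cp->cp_nsec_scale) >>
*	    (32 - NSEC_SHIFT)));
*/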
/*
* int
* __cp_clock_gettime_monotonic(comm_page_t *cp, timespec_t *tsp)
*
* Stack usage: 0x8 local + 0x8 call + 0x28 called func. = 0x38 bytes
*
* %rsp+0x00 - timespec_t *tsp
*/
ENTRY_NP(__cp_clock_gettime_monotonic)
subq $0x8, %rsp
movq %rsi, (%rsp)
call __cp_gethrtime
/*
* Convert from hrtime_t (int64_t in nanoseconds) to timespec_t.
* This uses the same approach as hrt2ts, although it has been updated
* to utilize 64-bit math.
* 1 / 1,000,000,000 =
* 1000100101110000010111110100000100110110101101001010010110011B-26
* = 0x112e0be826d694b3 * 2^-26
*
* secs = (nsecs * 0x112e0be826d694b3) >> 26
*
* Note that imulq leaves the upper 64 bits of the 128-bit product in
* %rdx, so the explicit shift by 26 below is an effective shift by 90.
*
* To account for the two's-complement representation of negative
* inputs, a final operation completes the process:
*
* secs -= (nsecs >> 63)
*/
movq %rax, %r11
movq $0x112e0be826d694b3, %rdx
imulq %rdx
sarq $0x1a, %rdx
movq %r11, %rax
sarq $0x3f, %rax
subq %rax, %rdx
movq (%rsp), %rsi
movq %rdx, (%rsi)
/*
* Populating tv_nsec is easier:
* tv_nsec = nsecs - (secs * NANOSEC)
*/
imulq $NANOSEC, %rdx, %rdx
subq %rdx, %r11
movq %r11, 0x8(%rsi)
xorl %eax, %eax
addq $0x8, %rsp
ret
SET_SIZE(__cp_clock_gettime_monotonic)
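/*
* The reciprocal-multiply division above, sketched in C (again with
* __int128 standing in for the 128-bit imulq result; the effective
* shift is 64 + 26 = 90):
*
*	int64_t secs = (int64_t)
*	    (((__int128)nsecs * 0x112e0be826d694b3LL) >> 90);
*	secs -= (nsecs >> 63);
*	tsp->tv_sec = secs;
*	tsp->tv_nsec = nsecs - secs * NANOSEC;
*/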
/*
* int
* __cp_clock_gettime_realtime(comm_page_t *cp, timespec_t *tsp)
*
* Stack usage: 0x18 local + 0x8 call + 0x28 called func. = 0x48 bytes
*
* %rsp+0x00 - comm_page_t *cp
* %rsp+0x08 - timespec_t *tsp
* %rsp+0x10 - int hres_lock
*/
ENTRY_NP(__cp_clock_gettime_realtime)
subq $0x18, %rsp
movq %rdi, (%rsp)
movq %rsi, 0x8(%rsp)
1:
movl CP_HRES_LOCK(%rdi), %eax
movl %eax, 0x10(%rsp)
call __cp_gethrtime
movq (%rsp), %rdi
movq CP_HRES_LAST_TICK(%rdi), %rdx
subq %rdx, %rax /* nslt = hrtime - last_tick */
jb 1b
movq CP_HRESTIME(%rdi), %r9
movq _CONST(CP_HRESTIME + CP_HRESTIME_INCR)(%rdi), %r10
movl CP_HRESTIME_ADJ(%rdi), %r11d
addq %rax, %r10 /* now.tv_nsec += nslt */
cmpl $0, %r11d
jg 4f /* hres_adj > 0 */
jl 6f /* hres_adj < 0 */
2:
cmpq $NANOSEC, %r10
jae 8f /* tv_nsec >= NANOSEC */
3:
movl 0x10(%rsp), %eax
movl CP_HRES_LOCK(%rdi), %edx
andl $0xfffffffe, %eax
cmpl %eax, %edx
jne 1b
movq 0x8(%rsp), %rsi
movq %r9, (%rsi)
movq %r10, 0x8(%rsi)
xorl %eax, %eax
addq $0x18, %rsp
ret
4: /* hres_adj > 0 */
sarq $ADJ_SHIFT, %rax
cmpl %r11d, %eax
jbe 5f
movl %r11d, %eax
5:
addq %rax, %r10
jmp 2b
6: /* hres_adj < 0 */
sarq $ADJ_SHIFT, %rax
negl %r11d
cmpl %r11d, %eax
jbe 7f
movl %r11d, %eax
7:
subq %rax, %r10
jmp 2b
8: /* tv_nsec >= NANOSEC */
subq $NANOSEC, %r10
incq %r9
cmpq $NANOSEC, %r10
jae 8b
jmp 3b
SET_SIZE(__cp_clock_gettime_realtime)
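/*
* The hrestime-adjustment logic above mirrors the kernel, roughly:
*
*	nslt = gethrtime() - cp->cp_hres_last_tick;
*	now = cp->cp_hrestime;
*	now.tv_nsec += nslt;
*	if (cp->cp_hrestime_adj > 0)
*		now.tv_nsec += MIN(nslt >> ADJ_SHIFT, cp->cp_hrestime_adj);
*	else if (cp->cp_hrestime_adj < 0)
*		now.tv_nsec -= MIN(nslt >> ADJ_SHIFT, -cp->cp_hrestime_adj);
*	while (now.tv_nsec >= NANOSEC) {
*		now.tv_nsec -= NANOSEC;
*		now.tv_sec++;
*	}
*/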