summaryrefslogtreecommitdiff
path: root/usr/src/uts/sun4v/os/suspend.c
blob: d4cd260d7d3015e3e544ba988fb6c4afd1178a2e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/mutex.h>
#include <sys/cpuvar.h>
#include <sys/cyclic.h>
#include <sys/disp.h>
#include <sys/ddi.h>
#include <sys/wdt.h>
#include <sys/callb.h>
#include <sys/cmn_err.h>
#include <sys/hypervisor_api.h>
#include <sys/membar.h>
#include <sys/x_call.h>
#include <sys/promif.h>
#include <sys/systm.h>
#include <sys/mach_descrip.h>
#include <sys/cpu_module.h>
#include <sys/pg.h>
#include <sys/lgrp.h>
#include <sys/sysmacros.h>
#include <sys/sunddi.h>
#include <sys/cpupart.h>
#include <sys/hsvc.h>
#include <sys/mpo.h>
#include <vm/hat_sfmmu.h>

/*
 * Sun4v OS Suspend
 *
 * Provides a means to suspend a sun4v guest domain by pausing CPUs and then
 * calling into the HV to initiate a suspension. Suspension is sequenced
 * externally by calling suspend_pre, suspend_start, and suspend_post.
 * suspend_pre and suspend_post are meant to perform any special operations
 * that should be done before or after a suspend/resume operation. e.g.,
 * callbacks to cluster software to disable heartbeat monitoring before the
 * system is suspended. suspend_start prepares kernel services to be suspended
 * and then suspends the domain by calling hv_guest_suspend.
 *
 * Special Handling for %tick and %stick Registers
 *
 * After a suspend/resume operation, the %tick and %stick registers may have
 * jumped forwards or backwards. The delta is assumed to be consistent across
 * all CPUs, within the negligible level of %tick and %stick variation
 * acceptable on a cold boot. In order to maintain increasing %tick and %stick
 * counter values without exposing large positive or negative jumps to kernel
 * or user code, a %tick and %stick offset is used. Kernel reads of these
 * counters return the sum of the hardware register counter and offset
 * variable. After a suspend/resume operation, user reads of %tick or %stick
 * are emulated. Suspend code enables emulation by setting the
 * %{tick,stick}.NPT fields which trigger a privileged instruction access
 * trap whenever the registers are read from user mode. If emulation has been
 * enabled, the trap handler emulates the instruction. Emulation is only
 * enabled during a successful suspend/resume operation. When emulation is
 * enabled, CPUs that are DR'd into the system will have their
 * %{tick,stick}.NPT bits set to 1 as well.
 */

extern u_longlong_t gettick(void);	/* returns %stick */
extern uint64_t gettick_counter(void);	/* returns %tick */
extern uint64_t gettick_npt(void);
extern uint64_t getstick_npt(void);
extern int mach_descrip_update(void);
extern cpuset_t cpu_ready_set;
extern uint64_t native_tick_offset;
extern uint64_t native_stick_offset;

/*
 * Global Sun Cluster pre/post callbacks.
 */
const char *(*cl_suspend_error_decode)(int);
int (*cl_suspend_pre_callback)(void);
int (*cl_suspend_post_callback)(void);
#define	SC_PRE_FAIL_STR_FMT	"Sun Cluster pre-suspend failure: %d"
#define	SC_POST_FAIL_STR_FMT	"Sun Cluster post-suspend failure: %d"
#define	SC_FAIL_STR_MAX		256

/*
 * The minimum major and minor version of the HSVC_GROUP_CORE API group
 * required in order to use OS suspend.
 */
#define	SUSPEND_CORE_MAJOR	1
#define	SUSPEND_CORE_MINOR	2

/*
 * By default, sun4v OS suspend is supported if the required HV version
 * is present. suspend_disabled should be set on platforms that do not
 * allow OS suspend regardless of whether or not the HV supports it.
 * It can also be set in /etc/system.
 */
static int suspend_disabled = 0;

/*
 * Controls whether or not user-land tick and stick register emulation
 * will be enabled following a successful suspend operation.
 */
static int enable_user_tick_stick_emulation = 1;

/*
 * Indicates whether or not tick and stick emulation is currently active.
 * After a successful suspend operation, if emulation is enabled, this
 * variable is set to B_TRUE. Global scope to allow emulation code to
 * check if emulation is active.
 */
boolean_t tick_stick_emulation_active = B_FALSE;

/*
 * When non-zero, after a successful suspend and resume, cpunodes, CPU HW
 * sharing data structures, and processor groups will be updated using
 * information from the updated MD.
 */
static int suspend_update_cpu_mappings = 1;

/*
 * DBG and DBG_PROM() macro.
 */
#ifdef	DEBUG

static int suspend_debug_flag = 0;

#define	DBG_PROM		\
if (suspend_debug_flag)		\
	prom_printf

#define	DBG			\
if (suspend_debug_flag)		\
	suspend_debug

static void
suspend_debug(const char *fmt, ...)
{
	char	buf[512];
	va_list	ap;

	va_start(ap, fmt);
	(void) vsprintf(buf, fmt, ap);
	va_end(ap);

	cmn_err(CE_NOTE, "%s", buf);
}

#else /* DEBUG */

#define	DBG_PROM
#define	DBG

#endif /* DEBUG */

/*
 * Return true if the HV supports OS suspend and if suspend has not been
 * disabled on this platform.
 */
boolean_t
suspend_supported(void)
{
	uint64_t major, minor;

	if (suspend_disabled)
		return (B_FALSE);

	if (hsvc_version(HSVC_GROUP_CORE, &major, &minor) != 0)
		return (B_FALSE);

	return ((major == SUSPEND_CORE_MAJOR && minor >= SUSPEND_CORE_MINOR) ||
	    (major > SUSPEND_CORE_MAJOR));
}

/*
 * Given a source tick and stick value, set the tick and stick offsets such
 * that the (current physical register value + offset == source value).
 */
static void
set_tick_offsets(uint64_t source_tick, uint64_t source_stick)
{
	uint64_t target_tick;
	uint64_t target_stick;

	native_tick_offset = 0;
	native_stick_offset = 0;

	target_tick = gettick_counter();	/* returns %tick */
	target_stick = gettick();		/* returns %stick */

	native_tick_offset = source_tick - target_tick;
	native_stick_offset = source_stick - target_stick;
}

/*
 * Set the {tick,stick}.NPT field to 1 on this CPU.
 */
static void
enable_tick_stick_npt(void)
{
	(void) hv_stick_set_npt(1);
	(void) hv_tick_set_npt(1);
}

/*
 * Synchronize a CPU's {tick,stick}.NPT fields with the current state
 * of the system. This is used when a CPU is DR'd into the system.
 */
void
suspend_sync_tick_stick_npt(void)
{
	if (tick_stick_emulation_active) {
		DBG("enabling {%%tick/%%stick}.NPT on CPU 0x%x", CPU->cpu_id);
		(void) hv_stick_set_npt(1);
		(void) hv_tick_set_npt(1);
	} else {
		ASSERT(gettick_npt() == 0);
		ASSERT(getstick_npt() == 0);
	}
}

/*
 * Obtain an updated MD from the hypervisor and update cpunodes, CPU HW
 * sharing data structures, and processor groups.
 */
static void
update_cpu_mappings(void)
{
	md_t		*mdp;
	processorid_t	id;
	cpu_t		*cp;
	cpu_pg_t	*pgps[NCPU];

	if ((mdp = md_get_handle()) == NULL) {
		DBG("suspend: md_get_handle failed");
		return;
	}

	DBG("suspend: updating CPU mappings");

	mutex_enter(&cpu_lock);

	setup_chip_mappings(mdp);
	setup_exec_unit_mappings(mdp);
	for (id = 0; id < NCPU; id++) {
		if ((cp = cpu_get(id)) == NULL)
			continue;
		cpu_map_exec_units(cp);
	}

	/*
	 * Re-calculate processor groups.
	 *
	 * First tear down all PG information before adding any new PG
	 * information derived from the MD we just downloaded. We must
	 * call pg_cpu_inactive and pg_cpu_active with CPUs paused and
	 * we want to minimize the number of times pause_cpus is called.
	 * Inactivating all CPUs would leave PGs without any active CPUs,
	 * so while CPUs are paused, call pg_cpu_inactive and swap in the
	 * bootstrap PG structure saving the original PG structure to be
	 * fini'd afterwards. This prevents the dispatcher from encountering
	 * PGs in which all CPUs are inactive.
	 */
	pause_cpus(NULL);
	for (id = 0; id < NCPU; id++) {
		if ((cp = cpu_get(id)) == NULL)
			continue;
		pg_cpu_inactive(cp);
		pgps[id] = cp->cpu_pg;
		pg_cpu_bootstrap(cp);
	}
	start_cpus();

	/*
	 * pg_cpu_fini* and pg_cpu_init* must be called while CPUs are
	 * not paused. Use two separate loops here so that we do not
	 * initialize PG data for CPUs until all the old PG data structures
	 * are torn down.
	 */
	for (id = 0; id < NCPU; id++) {
		if ((cp = cpu_get(id)) == NULL)
			continue;
		pg_cpu_fini(cp, pgps[id]);
		mpo_cpu_remove(id);
	}

	/*
	 * Initialize PG data for each CPU, but leave the bootstrapped
	 * PG structure in place to avoid running with any PGs containing
	 * nothing but inactive CPUs.
	 */
	for (id = 0; id < NCPU; id++) {
		if ((cp = cpu_get(id)) == NULL)
			continue;
		mpo_cpu_add(mdp, id);
		pgps[id] = pg_cpu_init(cp, B_TRUE);
	}

	/*
	 * Now that PG data has been initialized for all CPUs in the
	 * system, replace the bootstrapped PG structure with the
	 * initialized PG structure and call pg_cpu_active for each CPU.
	 */
	pause_cpus(NULL);
	for (id = 0; id < NCPU; id++) {
		if ((cp = cpu_get(id)) == NULL)
			continue;
		cp->cpu_pg = pgps[id];
		pg_cpu_active(cp);
	}
	start_cpus();

	mutex_exit(&cpu_lock);

	(void) md_fini_handle(mdp);
}

/*
 * Wrapper for the Sun Cluster error decoding function.
 */
static int
cluster_error_decode(int error, char *error_reason, size_t max_reason_len)
{
	const char	*decoded;
	size_t		decoded_len;

	ASSERT(error_reason != NULL);
	ASSERT(max_reason_len > 0);

	max_reason_len = MIN(max_reason_len, SC_FAIL_STR_MAX);

	if (cl_suspend_error_decode == NULL)
		return (-1);

	if ((decoded = (*cl_suspend_error_decode)(error)) == NULL)
		return (-1);

	/* Get number of non-NULL bytes */
	if ((decoded_len = strnlen(decoded, max_reason_len - 1)) == 0)
		return (-1);

	bcopy(decoded, error_reason, decoded_len);

	/*
	 * The error string returned from cl_suspend_error_decode
	 * should be NULL-terminated, but set the terminator here
	 * because we only copied non-NULL bytes. If the decoded
	 * string was not NULL-terminated, this guarantees that
	 * error_reason will be.
	 */
	error_reason[decoded_len] = '\0';

	return (0);
}

/*
 * Wrapper for the Sun Cluster pre-suspend callback.
 */
static int
cluster_pre_wrapper(char *error_reason, size_t max_reason_len)
{
	int rv = 0;

	if (cl_suspend_pre_callback != NULL) {
		rv = (*cl_suspend_pre_callback)();
		DBG("suspend: cl_suspend_pre_callback returned %d", rv);
		if (rv != 0 && error_reason != NULL && max_reason_len > 0) {
			if (cluster_error_decode(rv, error_reason,
			    max_reason_len)) {
				(void) snprintf(error_reason, max_reason_len,
				    SC_PRE_FAIL_STR_FMT, rv);
			}
		}
	}

	return (rv);
}

/*
 * Wrapper for the Sun Cluster post-suspend callback.
 */
static int
cluster_post_wrapper(char *error_reason, size_t max_reason_len)
{
	int rv = 0;

	if (cl_suspend_post_callback != NULL) {
		rv = (*cl_suspend_post_callback)();
		DBG("suspend: cl_suspend_post_callback returned %d", rv);
		if (rv != 0 && error_reason != NULL && max_reason_len > 0) {
			if (cluster_error_decode(rv, error_reason,
			    max_reason_len)) {
				(void) snprintf(error_reason,
				    max_reason_len, SC_POST_FAIL_STR_FMT, rv);
			}
		}
	}

	return (rv);
}

/*
 * Execute pre-suspend callbacks preparing the system for a suspend operation.
 * Returns zero on success, non-zero on failure. Sets the recovered argument
 * to indicate whether or not callbacks could be undone in the event of a
 * failure--if callbacks were successfully undone, *recovered is set to B_TRUE,
 * otherwise *recovered is set to B_FALSE. Must be called successfully before
 * suspend_start can be called. Callers should first call suspend_support to
 * determine if OS suspend is supported.
 */
int
suspend_pre(char *error_reason, size_t max_reason_len, boolean_t *recovered)
{
	int rv;

	ASSERT(recovered != NULL);

	/*
	 * Return an error if suspend_pre is erreoneously called
	 * when OS suspend is not supported.
	 */
	ASSERT(suspend_supported());
	if (!suspend_supported()) {
		DBG("suspend: suspend_pre called without suspend support");
		*recovered = B_TRUE;
		return (ENOTSUP);
	}
	DBG("suspend: %s", __func__);

	rv = cluster_pre_wrapper(error_reason, max_reason_len);

	/*
	 * At present, only one pre-suspend operation exists.
	 * If it fails, no recovery needs to be done.
	 */
	if (rv != 0 && recovered != NULL)
		*recovered = B_TRUE;

	return (rv);
}

/*
 * Execute post-suspend callbacks. Returns zero on success, non-zero on
 * failure. Must be called after suspend_start is called, regardless of
 * whether or not suspend_start is successful.
 */
int
suspend_post(char *error_reason, size_t max_reason_len)
{
	ASSERT(suspend_supported());
	DBG("suspend: %s", __func__);
	return (cluster_post_wrapper(error_reason, max_reason_len));
}

/*
 * Suspends the OS by pausing CPUs and calling into the HV to initiate
 * the suspend. When the HV routine hv_guest_suspend returns, the system
 * will be resumed. Must be called after a successful call to suspend_pre.
 * suspend_post must be called after suspend_start, whether or not
 * suspend_start returns an error.
 */
/*ARGSUSED*/
int
suspend_start(char *error_reason, size_t max_reason_len)
{
	uint64_t	source_tick;
	uint64_t	source_stick;
	uint64_t	rv;
	timestruc_t	source_tod;
	int		spl;

	ASSERT(suspend_supported());
	DBG("suspend: %s", __func__);

	sfmmu_ctxdoms_lock();

	mutex_enter(&cpu_lock);

	/* Suspend the watchdog */
	watchdog_suspend();

	/* Record the TOD */
	mutex_enter(&tod_lock);
	source_tod = tod_get();
	mutex_exit(&tod_lock);

	/* Pause all other CPUs */
	pause_cpus(NULL);
	DBG_PROM("suspend: CPUs paused\n");

	/* Suspend cyclics and disable interrupts */
	cyclic_suspend();
	DBG_PROM("suspend: cyclics suspended\n");
	spl = spl8();

	source_tick = gettick_counter();
	source_stick = gettick();
	DBG_PROM("suspend: source_tick: 0x%lx\n", source_tick);
	DBG_PROM("suspend: source_stick: 0x%lx\n", source_stick);

	/*
	 * Call into the HV to initiate the suspend.
	 * hv_guest_suspend() returns after the guest has been
	 * resumed or if the suspend operation failed or was
	 * cancelled. After a successful suspend, the %tick and
	 * %stick registers may have changed by an amount that is
	 * not proportional to the amount of time that has passed.
	 * They may have jumped forwards or backwards. This jump
	 * must be uniform across all CPUs and we operate under
	 * the assumption that it is (maintaining two global offset
	 * variables--one for %tick and one for %stick.)
	 */
	DBG_PROM("suspend: suspending... \n");
	rv = hv_guest_suspend();
	if (rv != 0) {
		splx(spl);
		cyclic_resume();
		start_cpus();
		watchdog_resume();
		mutex_exit(&cpu_lock);
		sfmmu_ctxdoms_unlock();
		DBG("suspend: failed, rv: %ld\n", rv);
		return (rv);
	}

	/* Update the global tick and stick offsets */
	set_tick_offsets(source_tick, source_stick);

	/* Ensure new offsets are globally visible before resuming CPUs */
	membar_sync();

	/* Enable interrupts */
	splx(spl);

	/* Set the {%tick,%stick}.NPT bits on all CPUs */
	if (enable_user_tick_stick_emulation) {
		xc_all((xcfunc_t *)enable_tick_stick_npt, NULL, NULL);
		xt_sync(cpu_ready_set);
		ASSERT(gettick_npt() != 0);
		ASSERT(getstick_npt() != 0);
	}

	/* If emulation is enabled, but not currently active, enable it */
	if (enable_user_tick_stick_emulation && !tick_stick_emulation_active) {
		tick_stick_emulation_active = B_TRUE;
	}

	sfmmu_ctxdoms_remove();

	/* Resume cyclics, unpause CPUs */
	cyclic_resume();
	start_cpus();

	/* Set the TOD */
	mutex_enter(&tod_lock);
	tod_set(source_tod);
	mutex_exit(&tod_lock);

	/* Re-enable the watchdog */
	watchdog_resume();

	mutex_exit(&cpu_lock);

	/* Download the latest MD */
	if ((rv = mach_descrip_update()) != 0)
		cmn_err(CE_PANIC, "suspend: mach_descrip_update failed: %ld",
		    rv);

	sfmmu_ctxdoms_update();
	sfmmu_ctxdoms_unlock();

	/* Get new MD, update CPU mappings/relationships */
	if (suspend_update_cpu_mappings)
		update_cpu_mappings();

	DBG("suspend: target tick: 0x%lx", gettick_counter());
	DBG("suspend: target stick: 0x%llx", gettick());
	DBG("suspend: user %%tick/%%stick emulation is %d",
	    tick_stick_emulation_active);
	DBG("suspend: finished");

	return (0);
}