diff options
Diffstat (limited to 'usr/src/uts/sun4v/os/suspend.c')
-rw-r--r-- | usr/src/uts/sun4v/os/suspend.c | 589 |
1 files changed, 589 insertions, 0 deletions
diff --git a/usr/src/uts/sun4v/os/suspend.c b/usr/src/uts/sun4v/os/suspend.c new file mode 100644 index 0000000000..2de4331424 --- /dev/null +++ b/usr/src/uts/sun4v/os/suspend.c @@ -0,0 +1,589 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/mutex.h> +#include <sys/cpuvar.h> +#include <sys/cyclic.h> +#include <sys/disp.h> +#include <sys/ddi.h> +#include <sys/wdt.h> +#include <sys/callb.h> +#include <sys/cmn_err.h> +#include <sys/hypervisor_api.h> +#include <sys/membar.h> +#include <sys/x_call.h> +#include <sys/promif.h> +#include <sys/systm.h> +#include <sys/mach_descrip.h> +#include <sys/cpu_module.h> +#include <sys/pg.h> +#include <sys/lgrp.h> +#include <sys/sysmacros.h> +#include <sys/sunddi.h> +#include <sys/cpupart.h> +#include <sys/hsvc.h> + +/* + * Sun4v OS Suspend + * + * Provides a means to suspend a sun4v guest domain by pausing CPUs and then + * calling into the HV to initiate a suspension. Suspension is sequenced + * externally by calling suspend_pre, suspend_start, and suspend_post. + * suspend_pre and suspend_post are meant to perform any special operations + * that should be done before or after a suspend/resume operation. e.g., + * callbacks to cluster software to disable heartbeat monitoring before the + * system is suspended. suspend_start prepares kernel services to be suspended + * and then suspends the domain by calling hv_guest_suspend. + * + * Special Handling for %tick and %stick Registers + * + * After a suspend/resume operation, the %tick and %stick registers may have + * jumped forwards or backwards. The delta is assumed to be consistent across + * all CPUs, within the negligible level of %tick and %stick variation + * acceptable on a cold boot. In order to maintain increasing %tick and %stick + * counter values without exposing large positive or negative jumps to kernel + * or user code, a %tick and %stick offset is used. Kernel reads of these + * counters return the sum of the hardware register counter and offset + * variable. After a suspend/resume operation, user reads of %tick or %stick + * are emulated. Suspend code enables emulation by setting the + * %{tick,stick}.NPT fields which trigger a privileged instruction access + * trap whenever the registers are read from user mode. If emulation has been + * enabled, the trap handler emulates the instruction. Emulation is only + * enabled during a successful suspend/resume operation. When emulation is + * enabled, CPUs that are DR'd into the system will have their + * %{tick,stick}.NPT bits set to 1 as well. + */ + +extern u_longlong_t gettick(void); /* returns %stick */ +extern uint64_t gettick_counter(void); /* returns %tick */ +extern uint64_t gettick_npt(void); +extern uint64_t getstick_npt(void); +extern int mach_descrip_update(void); +extern cpuset_t cpu_ready_set; +extern uint64_t native_tick_offset; +extern uint64_t native_stick_offset; + +/* + * Global Sun Cluster pre/post callbacks. + */ +const char *(*cl_suspend_error_decode)(int); +int (*cl_suspend_pre_callback)(void); +int (*cl_suspend_post_callback)(void); +#define SC_PRE_FAIL_STR_FMT "Sun Cluster pre-suspend failure: %d" +#define SC_POST_FAIL_STR_FMT "Sun Cluster post-suspend failure: %d" +#define SC_FAIL_STR_MAX 256 + +/* + * The minimum major and minor version of the HSVC_GROUP_CORE API group + * required in order to use OS suspend. + */ +#define SUSPEND_CORE_MAJOR 1 +#define SUSPEND_CORE_MINOR 2 + +/* + * By default, sun4v OS suspend is supported if the required HV version + * is present. suspend_disabled should be set on platforms that do not + * allow OS suspend regardless of whether or not the HV supports it. + * It can also be set in /etc/system. + */ +static int suspend_disabled = 0; + +/* + * Controls whether or not user-land tick and stick register emulation + * will be enabled following a successful suspend operation. + */ +static int enable_user_tick_stick_emulation = 1; + +/* + * Indicates whether or not tick and stick emulation is currently active. + * After a successful suspend operation, if emulation is enabled, this + * variable is set to B_TRUE. Global scope to allow emulation code to + * check if emulation is active. + */ +boolean_t tick_stick_emulation_active = B_FALSE; + +/* + * Controls whether or not MD information is refreshed after a + * successful suspend and resume. When non-zero, after a successful + * suspend and resume, the MD will be downloaded, cpunodes updated, + * and processor grouping information recalculated. + */ +static int suspend_update_cpu_mappings = 1; + +/* + * DBG and DBG_PROM() macro. + */ +#ifdef DEBUG + +static int suspend_debug_flag = 0; + +#define DBG_PROM \ +if (suspend_debug_flag) \ + prom_printf + +#define DBG \ +if (suspend_debug_flag) \ + suspend_debug + +static void +suspend_debug(const char *fmt, ...) +{ + char buf[512]; + va_list ap; + + va_start(ap, fmt); + (void) vsprintf(buf, fmt, ap); + va_end(ap); + + cmn_err(CE_NOTE, "%s", buf); +} + +#else /* DEBUG */ + +#define DBG_PROM +#define DBG + +#endif /* DEBUG */ + +/* + * Return true if the HV supports OS suspend and if suspend has not been + * disabled on this platform. + */ +boolean_t +suspend_supported(void) +{ + uint64_t major, minor; + + if (suspend_disabled) + return (B_FALSE); + + if (hsvc_version(HSVC_GROUP_CORE, &major, &minor) != 0) + return (B_FALSE); + + return ((major == SUSPEND_CORE_MAJOR && minor >= SUSPEND_CORE_MINOR) || + (major > SUSPEND_CORE_MAJOR)); +} + +/* + * Given a source tick and stick value, set the tick and stick offsets such + * that the (current physical register value + offset == source value). + */ +static void +set_tick_offsets(uint64_t source_tick, uint64_t source_stick) +{ + uint64_t target_tick; + uint64_t target_stick; + + native_tick_offset = 0; + native_stick_offset = 0; + + target_tick = gettick_counter(); /* returns %tick */ + target_stick = gettick(); /* returns %stick */ + + native_tick_offset = source_tick - target_tick; + native_stick_offset = source_stick - target_stick; +} + +/* + * Set the {tick,stick}.NPT field to 1 on this CPU. + */ +static void +enable_tick_stick_npt(void) +{ + hv_stick_set_npt(1); + hv_tick_set_npt(1); +} + +/* + * Synchronize a CPU's {tick,stick}.NPT fields with the current state + * of the system. This is used when a CPU is DR'd into the system. + */ +void +suspend_sync_tick_stick_npt(void) +{ + if (tick_stick_emulation_active) { + DBG("enabling {%%tick/%%stick}.NPT on CPU 0x%x", CPU->cpu_id); + hv_stick_set_npt(1); + hv_tick_set_npt(1); + } else { + ASSERT(gettick_npt() == 0); + ASSERT(getstick_npt() == 0); + } +} + +/* + * Obtain an updated MD from the hypervisor and update cpunodes, CPU HW + * sharing data structures, and processor groups. + */ +static void +update_cpu_mappings(void) +{ + md_t *mdp; + processorid_t id; + cpu_t *cp; + int rv; + cpu_pg_t *pgps[NCPU]; + + /* Download the latest MD */ + if ((rv = mach_descrip_update()) != 0) { + DBG("suspend: mach_descrip_update error: %d", rv); + return; + } + + if ((mdp = md_get_handle()) == NULL) { + DBG("suspend: md_get_handle failed"); + return; + } + + DBG("suspend: updating CPU mappings"); + + mutex_enter(&cpu_lock); + + setup_chip_mappings(mdp); + setup_exec_unit_mappings(mdp); + for (id = 0; id < NCPU; id++) { + if ((cp = cpu_get(id)) == NULL) + continue; + cpu_map_exec_units(cp); + } + + /* + * Re-calculate processor groups. + * + * First tear down all PG information before adding any new PG + * information derived from the MD we just downloaded. We must + * call pg_cpu_inactive and pg_cpu_active with CPUs paused and + * we want to minimize the number of times pause_cpus is called. + * Inactivating all CPUs would leave PGs without any active CPUs, + * so while CPUs are paused, call pg_cpu_inactive and swap in the + * bootstrap PG structure saving the original PG structure to be + * fini'd afterwards. This prevents the dispatcher from encountering + * PGs in which all CPUs are inactive. + */ + pause_cpus(NULL); + for (id = 0; id < NCPU; id++) { + if ((cp = cpu_get(id)) == NULL) + continue; + pg_cpu_inactive(cp); + pgps[id] = cp->cpu_pg; + pg_cpu_bootstrap(cp); + } + start_cpus(); + + /* + * pg_cpu_fini* and pg_cpu_init* must be called while CPUs are + * not paused. Use two separate loops here so that we do not + * initialize PG data for CPUs until all the old PG data structures + * are torn down. + */ + for (id = 0; id < NCPU; id++) { + if ((cp = cpu_get(id)) == NULL) + continue; + pg_cpu_fini(cp, pgps[id]); + } + + /* + * Initialize PG data for each CPU, but leave the bootstrapped + * PG structure in place to avoid running with any PGs containing + * nothing but inactive CPUs. + */ + for (id = 0; id < NCPU; id++) { + if ((cp = cpu_get(id)) == NULL) + continue; + pgps[id] = pg_cpu_init(cp, B_TRUE); + } + + /* + * Now that PG data has been initialized for all CPUs in the + * system, replace the bootstrapped PG structure with the + * initialized PG structure and call pg_cpu_active for each CPU. + */ + pause_cpus(NULL); + for (id = 0; id < NCPU; id++) { + if ((cp = cpu_get(id)) == NULL) + continue; + cp->cpu_pg = pgps[id]; + pg_cpu_active(cp); + } + start_cpus(); + + mutex_exit(&cpu_lock); + + (void) md_fini_handle(mdp); +} + +/* + * Wrapper for the Sun Cluster error decoding function. + */ +static int +cluster_error_decode(int error, char *error_reason, size_t max_reason_len) +{ + const char *decoded; + size_t decoded_len; + + ASSERT(error_reason != NULL); + ASSERT(max_reason_len > 0); + + max_reason_len = MIN(max_reason_len, SC_FAIL_STR_MAX); + + if (cl_suspend_error_decode == NULL) + return (-1); + + if ((decoded = (*cl_suspend_error_decode)(error)) == NULL) + return (-1); + + /* Get number of non-NULL bytes */ + if ((decoded_len = strnlen(decoded, max_reason_len - 1)) == 0) + return (-1); + + bcopy(decoded, error_reason, decoded_len); + + /* + * The error string returned from cl_suspend_error_decode + * should be NULL-terminated, but set the terminator here + * because we only copied non-NULL bytes. If the decoded + * string was not NULL-terminated, this guarantees that + * error_reason will be. + */ + error_reason[decoded_len] = '\0'; + + return (0); +} + +/* + * Wrapper for the Sun Cluster pre-suspend callback. + */ +static int +cluster_pre_wrapper(char *error_reason, size_t max_reason_len) +{ + int rv = 0; + + if (cl_suspend_pre_callback != NULL) { + rv = (*cl_suspend_pre_callback)(); + DBG("suspend: cl_suspend_pre_callback returned %d", rv); + if (rv != 0 && error_reason != NULL && max_reason_len > 0) { + if (cluster_error_decode(rv, error_reason, + max_reason_len)) { + (void) snprintf(error_reason, max_reason_len, + SC_PRE_FAIL_STR_FMT, rv); + } + } + } + + return (rv); +} + +/* + * Wrapper for the Sun Cluster post-suspend callback. + */ +static int +cluster_post_wrapper(char *error_reason, size_t max_reason_len) +{ + int rv = 0; + + if (cl_suspend_post_callback != NULL) { + rv = (*cl_suspend_post_callback)(); + DBG("suspend: cl_suspend_post_callback returned %d", rv); + if (rv != 0 && error_reason != NULL && max_reason_len > 0) { + if (cluster_error_decode(rv, error_reason, + max_reason_len)) { + (void) snprintf(error_reason, + max_reason_len, SC_POST_FAIL_STR_FMT, rv); + } + } + } + + return (rv); +} + +/* + * Execute pre-suspend callbacks preparing the system for a suspend operation. + * Returns zero on success, non-zero on failure. Sets the recovered argument + * to indicate whether or not callbacks could be undone in the event of a + * failure--if callbacks were successfully undone, *recovered is set to B_TRUE, + * otherwise *recovered is set to B_FALSE. Must be called successfully before + * suspend_start can be called. Callers should first call suspend_support to + * determine if OS suspend is supported. + */ +int +suspend_pre(char *error_reason, size_t max_reason_len, boolean_t *recovered) +{ + int rv; + + ASSERT(recovered != NULL); + + /* + * Return an error if suspend_pre is erreoneously called + * when OS suspend is not supported. + */ + ASSERT(suspend_supported()); + if (!suspend_supported()) { + DBG("suspend: suspend_pre called without suspend support"); + *recovered = B_TRUE; + return (ENOTSUP); + } + DBG("suspend: %s", __func__); + + rv = cluster_pre_wrapper(error_reason, max_reason_len); + + /* + * At present, only one pre-suspend operation exists. + * If it fails, no recovery needs to be done. + */ + if (rv != 0 && recovered != NULL) + *recovered = B_TRUE; + + return (rv); +} + +/* + * Execute post-suspend callbacks. Returns zero on success, non-zero on + * failure. Must be called after suspend_start is called, regardless of + * whether or not suspend_start is successful. + */ +int +suspend_post(char *error_reason, size_t max_reason_len) +{ + ASSERT(suspend_supported()); + DBG("suspend: %s", __func__); + return (cluster_post_wrapper(error_reason, max_reason_len)); +} + +/* + * Suspends the OS by pausing CPUs and calling into the HV to initiate + * the suspend. When the HV routine hv_guest_suspend returns, the system + * will be resumed. Must be called after a successful call to suspend_pre. + * suspend_post must be called after suspend_start, whether or not + * suspend_start returns an error. + */ +/*ARGSUSED*/ +int +suspend_start(char *error_reason, size_t max_reason_len) +{ + uint64_t source_tick; + uint64_t source_stick; + uint64_t rv; + timestruc_t source_tod; + int spl; + + ASSERT(suspend_supported()); + DBG("suspend: %s", __func__); + + mutex_enter(&cpu_lock); + + /* Suspend the watchdog */ + watchdog_suspend(); + + /* Record the TOD */ + mutex_enter(&tod_lock); + source_tod = tod_get(); + mutex_exit(&tod_lock); + + /* Pause all other CPUs */ + pause_cpus(NULL); + DBG_PROM("suspend: CPUs paused\n"); + + /* Suspend cyclics and disable interrupts */ + cyclic_suspend(); + DBG_PROM("suspend: cyclics suspended\n"); + spl = spl8(); + + source_tick = gettick_counter(); + source_stick = gettick(); + DBG_PROM("suspend: source_tick: 0x%lx\n", source_tick); + DBG_PROM("suspend: source_stick: 0x%lx\n", source_stick); + + /* + * Call into the HV to initiate the suspend. + * hv_guest_suspend() returns after the guest has been + * resumed or if the suspend operation failed or was + * cancelled. After a successful suspend, the %tick and + * %stick registers may have changed by an amount that is + * not proportional to the amount of time that has passed. + * They may have jumped forwards or backwards. This jump + * must be uniform across all CPUs and we operate under + * the assumption that it is (maintaining two global offset + * variables--one for %tick and one for %stick.) + */ + DBG_PROM("suspend: suspending... \n"); + rv = hv_guest_suspend(); + if (rv != 0) { + splx(spl); + cyclic_resume(); + start_cpus(); + watchdog_resume(); + mutex_exit(&cpu_lock); + DBG("suspend: failed, rv: %ld\n", rv); + return (rv); + } + + /* Update the global tick and stick offsets */ + set_tick_offsets(source_tick, source_stick); + + /* Ensure new offsets are globally visible before resuming CPUs */ + membar_sync(); + + /* Enable interrupts */ + splx(spl); + + /* Set the {%tick,%stick}.NPT bits on all CPUs */ + if (enable_user_tick_stick_emulation) { + xc_all((xcfunc_t *)enable_tick_stick_npt, NULL, NULL); + xt_sync(cpu_ready_set); + ASSERT(gettick_npt() != 0); + ASSERT(getstick_npt() != 0); + } + + /* If emulation is enabled, but not currently active, enable it */ + if (enable_user_tick_stick_emulation && !tick_stick_emulation_active) { + tick_stick_emulation_active = B_TRUE; + } + + /* Resume cyclics, unpause CPUs */ + cyclic_resume(); + start_cpus(); + + /* Set the TOD */ + mutex_enter(&tod_lock); + tod_set(source_tod); + mutex_exit(&tod_lock); + + /* Re-enable the watchdog */ + watchdog_resume(); + + mutex_exit(&cpu_lock); + + /* Get new MD, update CPU mappings/relationships */ + if (suspend_update_cpu_mappings) + update_cpu_mappings(); + + DBG("suspend: target tick: 0x%lx", gettick_counter()); + DBG("suspend: target stick: 0x%llx", gettick()); + DBG("suspend: user %%tick/%%stick emulation is %d", + tick_stick_emulation_active); + DBG("suspend: finished"); + + return (0); +} |