diff options
author | smaybe <none@none> | 2008-04-14 10:25:34 -0700 |
---|---|---|
committer | smaybe <none@none> | 2008-04-14 10:25:34 -0700 |
commit | ea8190a2a3b5243d911d4bd8185a4161b02d09c0 (patch) | |
tree | 56697e4ca176c8a1b06d92aed281c9567e085226 | |
parent | db061fdf6dcfbac81c13366f1819b7fc0e984748 (diff) | |
download | illumos-gate-ea8190a2a3b5243d911d4bd8185a4161b02d09c0.tar.gz |
6639790 need Solaris PV-on-HVM migration support
-rw-r--r-- | usr/src/uts/common/xen/io/xenbus_comms.c | 8 | ||||
-rw-r--r-- | usr/src/uts/common/xen/io/xnf.c | 3 | ||||
-rw-r--r-- | usr/src/uts/i86pc/io/xpv/evtchn.c | 15 | ||||
-rw-r--r-- | usr/src/uts/i86pc/io/xpv/xpv_support.c | 403 |
4 files changed, 426 insertions, 3 deletions
diff --git a/usr/src/uts/common/xen/io/xenbus_comms.c b/usr/src/uts/common/xen/io/xenbus_comms.c index e7eb20f166..b250feaa1d 100644 --- a/usr/src/uts/common/xen/io/xenbus_comms.c +++ b/usr/src/uts/common/xen/io/xenbus_comms.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -81,7 +81,9 @@ #include <xen/sys/xenbus_comms.h> #include <xen/public/io/xs_wire.h> +#ifndef XPV_HVM_DRIVER static int xenbus_irq; +#endif static ddi_umem_cookie_t xb_cookie; /* cookie for xenbus comm page */ extern caddr_t xb_addr; /* va of xenbus comm page */ @@ -240,7 +242,11 @@ xb_read(void *data, unsigned len) void xb_suspend(void) { +#ifdef XPV_HVM_DRIVER + ec_unbind_evtchn(xen_info->store_evtchn); +#else rem_avintr(NULL, IPL_XENBUS, (avfunc)xenbus_intr, xenbus_irq); +#endif } void diff --git a/usr/src/uts/common/xen/io/xnf.c b/usr/src/uts/common/xen/io/xnf.c index 597e4cf344..5e9f2c519b 100644 --- a/usr/src/uts/common/xen/io/xnf.c +++ b/usr/src/uts/common/xen/io/xnf.c @@ -778,6 +778,7 @@ failure_2: xvdi_remove_event_handler(devinfo, XS_OE_STATE); #ifdef XPV_HVM_DRIVER ec_unbind_evtchn(xnfp->xnf_evtchn); + xvdi_free_evtchn(devinfo); #else ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie); #endif @@ -816,6 +817,7 @@ xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd) case DDI_SUSPEND: #ifdef XPV_HVM_DRIVER ec_unbind_evtchn(xnfp->xnf_evtchn); + xvdi_free_evtchn(devinfo); #else ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie); #endif @@ -874,6 +876,7 @@ xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd) /* Remove the interrupt */ #ifdef XPV_HVM_DRIVER ec_unbind_evtchn(xnfp->xnf_evtchn); + xvdi_free_evtchn(devinfo); #else ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie); #endif diff --git a/usr/src/uts/i86pc/io/xpv/evtchn.c b/usr/src/uts/i86pc/io/xpv/evtchn.c index 6242becf80..14d5bcc4b9 100644 --- a/usr/src/uts/i86pc/io/xpv/evtchn.c +++ 
b/usr/src/uts/i86pc/io/xpv/evtchn.c @@ -43,7 +43,7 @@ extern dev_info_t *xpv_dip; static ddi_intr_handle_t *evtchn_ihp = NULL; static ddi_softint_handle_t evtchn_to_handle[NR_EVENT_CHANNELS]; -static kmutex_t ec_lock; +kmutex_t ec_lock; static int evtchn_callback_irq = -1; @@ -374,3 +374,16 @@ ec_init(dev_info_t *dip) } return (0); } + +void +ec_resume(void) +{ + int i; + + /* New event-channel space is not 'live' yet. */ + for (i = 0; i < NR_EVENT_CHANNELS; i++) + (void) hypervisor_mask_event(i); + if (set_hvm_callback(evtchn_callback_irq) != 0) + cmn_err(CE_WARN, "Couldn't register evtchn callback"); + +} diff --git a/usr/src/uts/i86pc/io/xpv/xpv_support.c b/usr/src/uts/i86pc/io/xpv/xpv_support.c index dc61e1e25a..f400729e59 100644 --- a/usr/src/uts/i86pc/io/xpv/xpv_support.c +++ b/usr/src/uts/i86pc/io/xpv/xpv_support.c @@ -47,12 +47,15 @@ #include <sys/devops.h> #include <sys/pc_mmu.h> #include <sys/cmn_err.h> +#include <sys/cpr.h> +#include <sys/ddi.h> #include <vm/seg_kmem.h> #include <vm/as.h> #include <vm/hat_pte.h> #include <vm/hat_i86.h> #define XPV_MINOR 0 +#define XPV_BUFSIZE 128 /* * This structure is ordinarily constructed by Xen. In the HVM world, we @@ -76,6 +79,18 @@ int xen_is_64bit = -1; caddr_t xb_addr; dev_info_t *xpv_dip; +static dev_info_t *xpvd_dip; + +/* saved pfn of the shared info page */ +static pfn_t shared_info_frame; + +#ifdef DEBUG +int xen_suspend_debug; + +#define SUSPEND_DEBUG if (xen_suspend_debug) xen_printf +#else +#define SUSPEND_DEBUG(...) 
+#endif /* * Forward declarations @@ -300,6 +315,382 @@ hvm_get_param(int param_id) return (xhp.value); } +static struct xenbus_watch shutdown_watch; +taskq_t *xen_shutdown_tq; + +#define SHUTDOWN_INVALID -1 +#define SHUTDOWN_POWEROFF 0 +#define SHUTDOWN_REBOOT 1 +#define SHUTDOWN_SUSPEND 2 +#define SHUTDOWN_HALT 3 +#define SHUTDOWN_MAX 4 + +#define SHUTDOWN_TIMEOUT_SECS (60 * 5) + +static const char *cmd_strings[SHUTDOWN_MAX] = { + "poweroff", + "reboot", + "suspend", + "halt" +}; + +int +xen_suspend_devices(dev_info_t *dip) +{ + int error; + char buf[XPV_BUFSIZE]; + + SUSPEND_DEBUG("xen_suspend_devices\n"); + + for (; dip != NULL; dip = ddi_get_next_sibling(dip)) { + if (xen_suspend_devices(ddi_get_child(dip))) + return (ENXIO); + if (ddi_get_driver(dip) == NULL) + continue; + SUSPEND_DEBUG("Suspending device %s\n", ddi_deviname(dip, buf)); + ASSERT((DEVI(dip)->devi_cpr_flags & DCF_CPR_SUSPENDED) == 0); + + + if (!i_ddi_devi_attached(dip)) { + error = DDI_FAILURE; + } else { + error = devi_detach(dip, DDI_SUSPEND); + } + + if (error == DDI_SUCCESS) { + DEVI(dip)->devi_cpr_flags |= DCF_CPR_SUSPENDED; + } else { + SUSPEND_DEBUG("WARNING: Unable to suspend device %s\n", + ddi_deviname(dip, buf)); + cmn_err(CE_WARN, "Unable to suspend device %s.", + ddi_deviname(dip, buf)); + cmn_err(CE_WARN, "Device is busy or does not " + "support suspend/resume."); + return (ENXIO); + } + } + return (0); +} + +int +xen_resume_devices(dev_info_t *start, int resume_failed) +{ + dev_info_t *dip, *next, *last = NULL; + int did_suspend; + int error = resume_failed; + char buf[XPV_BUFSIZE]; + + SUSPEND_DEBUG("xen_resume_devices\n"); + + while (last != start) { + dip = start; + next = ddi_get_next_sibling(dip); + while (next != last) { + dip = next; + next = ddi_get_next_sibling(dip); + } + + /* + * cpr is the only one that uses this field and the device + * itself hasn't resumed yet, there is no need to use a + * lock, even though kernel threads are active by now. 
+ */ + did_suspend = DEVI(dip)->devi_cpr_flags & DCF_CPR_SUSPENDED; + if (did_suspend) + DEVI(dip)->devi_cpr_flags &= ~DCF_CPR_SUSPENDED; + + /* + * There may be background attaches happening on devices + * that were not originally suspended by cpr, so resume + * only devices that were suspended by cpr. Also, stop + * resuming after the first resume failure, but traverse + * the entire tree to clear the suspend flag. + */ + if (did_suspend && !error) { + SUSPEND_DEBUG("Resuming device %s\n", + ddi_deviname(dip, buf)); + /* + * If a device suspended by cpr gets detached during + * the resume process (for example, due to hotplugging) + * before cpr gets around to issuing it a DDI_RESUME, + * we'll have problems. + */ + if (!i_ddi_devi_attached(dip)) { + cmn_err(CE_WARN, "Skipping %s, device " + "not ready for resume", + ddi_deviname(dip, buf)); + } else { + if (devi_attach(dip, DDI_RESUME) != + DDI_SUCCESS) { + error = ENXIO; + } + } + } + + if (error == ENXIO) { + cmn_err(CE_WARN, "Unable to resume device %s", + ddi_deviname(dip, buf)); + } + + error = xen_resume_devices(ddi_get_child(dip), error); + last = dip; + } + + return (error); +} + +/*ARGSUSED*/ +static int +check_xpvd(dev_info_t *dip, void *arg) +{ + char *name; + + name = ddi_node_name(dip); + if (name == NULL || strcmp(name, "xpvd")) { + return (DDI_WALK_CONTINUE); + } else { + xpvd_dip = dip; + return (DDI_WALK_TERMINATE); + } +} + +/* + * Top level routine to direct suspend/resume of a domain. + */ +void +xen_suspend_domain(void) +{ + extern void rtcsync(void); + extern void ec_resume(void); + extern kmutex_t ec_lock; + struct xen_add_to_physmap xatp; + ulong_t flags; + int err; + + cmn_err(CE_NOTE, "Domain suspending for save/migrate"); + + SUSPEND_DEBUG("xen_suspend_domain\n"); + + /* + * We only want to suspend the PV devices, since the emulated devices + * are suspended by saving the emulated device state. The PV devices + * are all children of the xpvd nexus device. 
So we search the + * device tree for the xpvd node to use as the root of the tree to + * be suspended. + */ + if (xpvd_dip == NULL) + ddi_walk_devs(ddi_root_node(), check_xpvd, NULL); + + /* + * suspend interrupts and devices + */ + if (xpvd_dip != NULL) + (void) xen_suspend_devices(ddi_get_child(xpvd_dip)); + else + cmn_err(CE_WARN, "No PV devices found to suspend"); + SUSPEND_DEBUG("xenbus_suspend\n"); + xenbus_suspend(); + + mutex_enter(&cpu_lock); + + /* + * Suspend on vcpu 0 + */ + thread_affinity_set(curthread, 0); + kpreempt_disable(); + + if (ncpus > 1) + pause_cpus(NULL); + /* + * We can grab the ec_lock as it's a spinlock with a high SPL. Hence + * any holder would have dropped it to get through pause_cpus(). + */ + mutex_enter(&ec_lock); + + /* + * From here on in, we can't take locks. + */ + + flags = intr_clear(); + + SUSPEND_DEBUG("HYPERVISOR_suspend\n"); + /* + * At this point we suspend and sometime later resume. + * Note that this call may return with an indication of a cancelled + * suspend; for now, no matter what the return, we do a full resume + * of all suspended drivers, etc. + */ + (void) HYPERVISOR_shutdown(SHUTDOWN_suspend); + + /* + * Point HYPERVISOR_shared_info to the proper place. + */ + xatp.domid = DOMID_SELF; + xatp.idx = 0; + xatp.space = XENMAPSPACE_shared_info; + xatp.gpfn = shared_info_frame; + if ((err = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) != 0) + panic("Could not set shared_info page. error: %d", err); + + SUSPEND_DEBUG("gnttab_resume\n"); + gnttab_resume(); + + SUSPEND_DEBUG("ec_resume\n"); + ec_resume(); + + intr_restore(flags); + + if (ncpus > 1) + start_cpus(); + + mutex_exit(&ec_lock); + mutex_exit(&cpu_lock); + + /* + * Now we can take locks again. 
+ */ + + rtcsync(); + + SUSPEND_DEBUG("xenbus_resume\n"); + xenbus_resume(); + SUSPEND_DEBUG("xen_resume_devices\n"); + if (xpvd_dip != NULL) + (void) xen_resume_devices(ddi_get_child(xpvd_dip), 0); + + thread_affinity_clear(curthread); + kpreempt_enable(); + + SUSPEND_DEBUG("finished xen_suspend_domain\n"); + + cmn_err(CE_NOTE, "domain restore/migrate completed"); +} + +static void +xen_dirty_shutdown(void *arg) +{ + int cmd = (uintptr_t)arg; + + cmn_err(CE_WARN, "Externally requested shutdown failed or " + "timed out.\nShutting down.\n"); + + switch (cmd) { + case SHUTDOWN_HALT: + case SHUTDOWN_POWEROFF: + (void) kadmin(A_SHUTDOWN, AD_POWEROFF, NULL, kcred); + break; + case SHUTDOWN_REBOOT: + (void) kadmin(A_REBOOT, AD_BOOT, NULL, kcred); + break; + } +} + +static void +xen_shutdown(void *arg) +{ + nvlist_t *attr_list = NULL; + sysevent_t *event = NULL; + sysevent_id_t eid; + int cmd = (uintptr_t)arg; + int err; + + ASSERT(cmd > SHUTDOWN_INVALID && cmd < SHUTDOWN_MAX); + + if (cmd == SHUTDOWN_SUSPEND) { + xen_suspend_domain(); + return; + } + + err = nvlist_alloc(&attr_list, NV_UNIQUE_NAME, KM_SLEEP); + if (err != DDI_SUCCESS) + goto failure; + + err = nvlist_add_string(attr_list, "shutdown", cmd_strings[cmd]); + if (err != DDI_SUCCESS) + goto failure; + + if ((event = sysevent_alloc("EC_xpvsys", "control", "SUNW:kern:xpv", + SE_SLEEP)) == NULL) + goto failure; + (void) sysevent_attach_attributes(event, + (sysevent_attr_list_t *)attr_list); + + err = log_sysevent(event, SE_SLEEP, &eid); + + sysevent_detach_attributes(event); + sysevent_free(event); + + if (err != 0) + goto failure; + + (void) timeout(xen_dirty_shutdown, arg, + SHUTDOWN_TIMEOUT_SECS * drv_usectohz(MICROSEC)); + + nvlist_free(attr_list); + return; + +failure: + if (attr_list != NULL) + nvlist_free(attr_list); + xen_dirty_shutdown(arg); +} + +/*ARGSUSED*/ +static void +xen_shutdown_handler(struct xenbus_watch *watch, const char **vec, + unsigned int len) +{ + char *str; + xenbus_transaction_t xbt; + 
int err, shutdown_code = SHUTDOWN_INVALID; + unsigned int slen; + +again: + err = xenbus_transaction_start(&xbt); + if (err) + return; + if (xenbus_read(xbt, "control", "shutdown", (void *)&str, &slen)) { + (void) xenbus_transaction_end(xbt, 1); + return; + } + + SUSPEND_DEBUG("%d: xen_shutdown_handler: \"%s\"\n", CPU->cpu_id, str); + + /* + * If this is a watch fired from our write below, check out early to + * avoid an infinite loop. + */ + if (strcmp(str, "") == 0) { + (void) xenbus_transaction_end(xbt, 0); + kmem_free(str, slen); + return; + } else if (strcmp(str, "poweroff") == 0) { + shutdown_code = SHUTDOWN_POWEROFF; + } else if (strcmp(str, "reboot") == 0) { + shutdown_code = SHUTDOWN_REBOOT; + } else if (strcmp(str, "suspend") == 0) { + shutdown_code = SHUTDOWN_SUSPEND; + } else if (strcmp(str, "halt") == 0) { + shutdown_code = SHUTDOWN_HALT; + } else { + printf("Ignoring shutdown request: %s\n", str); + } + + (void) xenbus_write(xbt, "control", "shutdown", ""); + err = xenbus_transaction_end(xbt, 0); + if (err == EAGAIN) { + SUSPEND_DEBUG("%d: trying again\n", CPU->cpu_id); + kmem_free(str, slen); + goto again; + } + + kmem_free(str, slen); + if (shutdown_code != SHUTDOWN_INVALID) { + (void) taskq_dispatch(xen_shutdown_tq, xen_shutdown, + (void *)(intptr_t)shutdown_code, 0); + } +} + static int xen_pv_init(dev_info_t *xpv_dip) { @@ -401,10 +792,12 @@ xen_pv_init(dev_info_t *xpv_dip) * is. */ HYPERVISOR_shared_info = xen_alloc_pages(1); + shared_info_frame = hat_getpfnum(kas.a_hat, + (caddr_t)HYPERVISOR_shared_info); xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; - xatp.gpfn = hat_getpfnum(kas.a_hat, (caddr_t)HYPERVISOR_shared_info); + xatp.gpfn = shared_info_frame; if ((err = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) != 0) { cmn_err(CE_WARN, "Could not get shared_info page from Xen." 
" error: %d", err); @@ -423,6 +816,14 @@ xen_pv_init(dev_info_t *xpv_dip) xs_early_init(); xs_domu_init(); + /* Set up for suspend/resume/migrate */ + xen_shutdown_tq = taskq_create("shutdown_taskq", 1, + maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE); + shutdown_watch.node = "control/shutdown"; + shutdown_watch.callback = xen_shutdown_handler; + if (register_xenbus_watch(&shutdown_watch)) + cmn_err(CE_WARN, "Failed to set shutdown watcher"); + return (0); } |