Diffstat (limited to 'usr/src/uts/i86xpv/os/balloon.c')
-rw-r--r-- | usr/src/uts/i86xpv/os/balloon.c | 1065
1 file changed, 1065 insertions, 0 deletions
diff --git a/usr/src/uts/i86xpv/os/balloon.c b/usr/src/uts/i86xpv/os/balloon.c
new file mode 100644
index 0000000000..3866acf364
--- /dev/null
+++ b/usr/src/uts/i86xpv/os/balloon.c
@@ -0,0 +1,1065 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/balloon_impl.h>
+#include <sys/hypervisor.h>
+#include <xen/sys/xenbus_impl.h>
+#include <sys/atomic.h>
+#include <sys/cmn_err.h>
+#include <sys/disp.h>
+#include <sys/callb.h>
+#include <xen/public/memory.h>
+#include <vm/hat.h>
+#include <sys/promif.h>
+#include <vm/seg_kmem.h>
+#include <sys/memnode.h>
+#include <sys/param.h>
+#include <vm/vm_dep.h>
+#include <sys/mman.h>
+#include <sys/memlist.h>
+#include <sys/sysmacros.h>
+#include <sys/machsystm.h>
+#include <sys/sdt.h>
+
+/*
+ * This file implements a balloon thread, which controls a domain's memory
+ * reservation, or the amount of memory a domain is currently allocated.
+ * The hypervisor provides the current memory reservation through xenbus,
+ * so we register a watch on this.  We will then be signalled when the
+ * reservation changes.  If it goes up, we map the new mfn's to our pfn's
+ * (allocating page_t's if necessary), and release them into the system.
+ * If the reservation goes down, we grab pages and release them back to
+ * the hypervisor, saving the page_t's for later use.
+ */
+
+/*
+ * Various structures needed by the balloon thread
+ */
+static bln_stats_t bln_stats;
+static kthread_t *bln_thread;
+static kmutex_t bln_mutex;
+static kcondvar_t bln_cv;
+static struct xenbus_watch bln_watch;
+static mfn_t new_high_mfn;
+
+/*
+ * For holding spare page_t structures - keep a singly-linked list.
+ * The list may hold both valid (pagenum < mfn_count) and invalid
+ * (pagenum >= mfn_count) page_t's.  Valid page_t's should be inserted
+ * at the front, and invalid page_t's at the back.  Removal should
+ * always be from the front.  This is a singly-linked list using
+ * p_next, so p_prev is always NULL.
+ */
+static page_t *bln_spare_list_front, *bln_spare_list_back;
+
+int balloon_zero_memory = 1;
+size_t balloon_minkmem = (8 * 1024 * 1024);
+static caddr_t balloon_kva;
+static kmutex_t balloon_kva_mutex;
+static void balloon_zero_page(pfn_t pfn);
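The front/back discipline described in the comment above means a single look at the list head is enough to decide whether a usable page_t is available, which is what balloon_page_sub() below relies on. For illustration only (this sketch is not part of the patch; bln_list_ordered() is a hypothetical debug helper, and mfn_count is the same global the surrounding code uses):

	/*
	 * Verify the spare-list invariant: once an invalid page_t
	 * (p_pagenum >= mfn_count) is seen, no valid one may follow.
	 * Caller must hold bln_mutex.  Debug use only.
	 */
	static int
	bln_list_ordered(void)
	{
		page_t *pp;
		int seen_invalid = 0;

		ASSERT(MUTEX_HELD(&bln_mutex));
		for (pp = bln_spare_list_front; pp != NULL; pp = pp->p_next) {
			if (pp->p_pagenum >= mfn_count)
				seen_invalid = 1;
			else if (seen_invalid)
				return (0);	/* valid after invalid */
		}
		return (1);
	}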
+
+/*
+ * reassign_pfn() calls update_contig_pfnlist(), which can cause a large
+ * slowdown when calling multiple times.  If we're reassigning less than the
+ * quota defined here, we just accept the slowdown.  If the count is greater
+ * than the quota, we tell the contig alloc code to stop its accounting until
+ * we're done.  Setting the quota to less than 2 is not supported.
+ *
+ * Note that we define our own wrapper around the external
+ * clear_and_lock_contig_pfnlist(), but we just use the version of
+ * unlock_contig_pfnlist() in vm_machdep.c.
+ */
+uint_t bln_contig_list_quota = 50;
+
+extern void clear_and_lock_contig_pfnlist(void);
+extern void unlock_contig_pfnlist(void);
+
+/*
+ * Lock the pfnlist if necessary (see above), and return whether we locked it.
+ */
+static int
+balloon_lock_contig_pfnlist(int count) {
+	if (count > bln_contig_list_quota) {
+		clear_and_lock_contig_pfnlist();
+		return (1);
+	} else {
+		return (0);
+	}
+}
+
+/*
+ * The page represented by pp is being given back to the hypervisor.
+ * Add the page_t structure to our spare list.
+ */
+static void
+balloon_page_add(page_t *pp)
+{
+	/*
+	 * We need to keep the page exclusively locked
+	 * to prevent swrand from grabbing it.
+	 */
+	ASSERT(PAGE_EXCL(pp));
+	ASSERT(MUTEX_HELD(&bln_mutex));
+
+	pp->p_prev = NULL;
+	if (bln_spare_list_front == NULL) {
+		bln_spare_list_front = bln_spare_list_back = pp;
+		pp->p_next = NULL;
+	} else if (pp->p_pagenum >= mfn_count) {
+		/*
+		 * The pfn is invalid, so add at the end of list.  Since these
+		 * adds should *only* be done by balloon_init_new_pages(), and
+		 * that does adds in order, the following ASSERT should
+		 * never trigger.
+		 */
+		ASSERT(pp->p_pagenum > bln_spare_list_back->p_pagenum);
+		bln_spare_list_back->p_next = pp;
+		pp->p_next = NULL;
+		bln_spare_list_back = pp;
+	} else {
+		/* Add at beginning of list */
+		pp->p_next = bln_spare_list_front;
+		bln_spare_list_front = pp;
+	}
+}
+
+/*
+ * Return a page_t structure from our spare list, or NULL if none are
+ * available.
+ */
+static page_t *
+balloon_page_sub(void)
+{
+	page_t *pp;
+
+	ASSERT(MUTEX_HELD(&bln_mutex));
+	if (bln_spare_list_front == NULL) {
+		return (NULL);
+	}
+
+	pp = bln_spare_list_front;
+	ASSERT(PAGE_EXCL(pp));
+	ASSERT(pp->p_pagenum <= mfn_count);
+	if (pp->p_pagenum == mfn_count) {
+		return (NULL);
+	}
+
+	bln_spare_list_front = pp->p_next;
+	if (bln_spare_list_front == NULL)
+		bln_spare_list_back = NULL;
+	pp->p_next = NULL;
+	return (pp);
+}
+
+/*
+ * NOTE: We currently do not support growing beyond the boot memory size,
+ * so the following function will not be called.  It is left in here with
+ * the hope that someday this restriction can be lifted, and this code can
+ * be used.
+ */
+
+/*
+ * This structure is placed at the start of every block of new pages
+ */
+typedef struct {
+	struct memseg	memseg;
+	struct memlist	memlist;
+	page_t		pages[1];
+} mem_structs_t;
+
+/*
+ * To make the math below slightly less confusing, we calculate the first
+ * two parts here.  page_t's are handled separately, so they are not
+ * included.
+ */
+#define	MEM_STRUCT_SIZE	(sizeof (struct memseg) + sizeof (struct memlist))
+
+/*
+ * We want to add memory, but have no spare page_t structures.  Use some of
+ * our new memory for the page_t structures.
+ *
+ * Somewhat similar to kphysm_add_memory_dynamic(), but simpler.
+ */
+static int
+balloon_init_new_pages(mfn_t framelist[], pgcnt_t count)
+{
+	pgcnt_t	metapgs, totalpgs, num_pages;
+	paddr_t	metasz;
+	pfn_t	meta_start;
+	page_t	*page_array;
+	caddr_t	va;
+	int	i, rv, locked;
+	mem_structs_t	*mem;
+	struct memseg	*segp;
+
+	/* Calculate the number of pages we're going to add */
+	totalpgs = bln_stats.bln_new_target - bln_stats.bln_current_pages;
+
+	/*
+	 * The following calculates the number of "meta" pages -- the pages
+	 * that will be required to hold page_t structures for all new pages.
+	 * Proof of this calculation is left up to the reader.
+	 */
+	metapgs = totalpgs - (((uint64_t)(totalpgs) << PAGESHIFT) /
+	    (PAGESIZE + sizeof (page_t)));
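The "proof" that the comment leaves to the reader is a short balance argument. Writing t for totalpgs, m for metapgs, P for PAGESIZE, and s for sizeof (page_t): the m meta pages supply m * P bytes, which must hold page_t structures for the remaining t - m pages, so m * P >= (t - m) * s, which solves to m >= t * s / (P + s) = t - t * P / (P + s). The code computes that final form in integer arithmetic, with the shift by PAGESHIFT standing in for the multiplication by P. A worked example (the sizeof (page_t) value here is assumed for illustration):

	/*
	 * PAGESIZE = 4096, sizeof (page_t) = 120, totalpgs = 10000:
	 *	metapgs = 10000 - (10000 << 12) / (4096 + 120)
	 *		= 10000 - 9715 = 285
	 * and 285 meta pages hold 285 * 4096 / 120 = 9728 page_t's,
	 * covering the 10000 - 285 = 9715 non-meta pages.
	 */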
+
+	/*
+	 * Given the number of page_t structures we need, is there also
+	 * room in our meta pages for a memseg and memlist struct?
+	 * If not, we'll need one more meta page.
+	 */
+	if ((metapgs << PAGESHIFT) < (totalpgs * sizeof (page_t) +
+	    MEM_STRUCT_SIZE))
+		metapgs++;
+
+	/*
+	 * metapgs is calculated from totalpgs, which may be much larger than
+	 * count.  If we don't have enough pages, all of the pages in this
+	 * batch will be made meta pages, and a future trip through
+	 * balloon_inc_reservation() will add the rest of the meta pages.
+	 */
+	if (metapgs > count)
+		metapgs = count;
+
+	/*
+	 * Figure out the number of page_t structures that can fit in metapgs
+	 *
+	 * This will cause us to initialize more page_t structures than we
+	 * need - these may be used in future memory increases.
+	 */
+	metasz = pfn_to_pa(metapgs);
+	num_pages = (metasz - MEM_STRUCT_SIZE) / sizeof (page_t);
+
+	DTRACE_PROBE3(balloon__alloc__stats, pgcnt_t, totalpgs, pgcnt_t,
+	    num_pages, pgcnt_t, metapgs);
+
+	/*
+	 * We only increment mfn_count by count, not num_pages, to keep the
+	 * space of all valid pfns contiguous.  This means we create page_t
+	 * structures with invalid pagenums -- we deal with this situation
+	 * in balloon_page_sub.
+	 */
+	mfn_count += count;
+
+	/*
+	 * Get a VA for the pages that will hold page_t and other structures.
+	 * The memseg and memlist structures will go at the beginning, with
+	 * the page_t structures following.
+	 */
+	va = (caddr_t)vmem_alloc(heap_arena, metasz, VM_SLEEP);
+	/* LINTED: improper alignment */
+	mem = (mem_structs_t *)va;
+	page_array = mem->pages;
+
+	meta_start = bln_stats.bln_max_pages;
+
+	/*
+	 * Set the mfn to pfn mapping for the meta pages.
+	 */
+	locked = balloon_lock_contig_pfnlist(metapgs);
+	for (i = 0; i < metapgs; i++) {
+		reassign_pfn(bln_stats.bln_max_pages + i, framelist[i]);
+	}
+	if (locked)
+		unlock_contig_pfnlist();
+
+	/*
+	 * For our meta pages, map them in and zero the page.
+	 * This will be the first time touching the new pages.
+	 */
+	hat_devload(kas.a_hat, va, metasz, bln_stats.bln_max_pages,
+	    PROT_READ | PROT_WRITE,
+	    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
+	bzero(va, metasz);
+
+	/*
+	 * Initialize the page array for the new pages.
+	 */
+	for (i = 0; i < metapgs; i++) {
+		page_array[i].p_pagenum = bln_stats.bln_max_pages++;
+		page_array[i].p_offset = (u_offset_t)-1;
+		page_iolock_init(&page_array[i]);
+		rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
+		ASSERT(rv == 1);
+	}
+
+	/*
+	 * For the rest of the pages, initialize the page_t struct and
+	 * add them to the free list
+	 */
+	for (i = metapgs; i < num_pages; i++) {
+		page_array[i].p_pagenum = bln_stats.bln_max_pages++;
+		page_array[i].p_offset = (u_offset_t)-1;
+		page_iolock_init(&page_array[i]);
+		rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
+		ASSERT(rv == 1);
+		balloon_page_add(&page_array[i]);
+	}
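The lock-around-the-loop shape used for the meta pages above recurs throughout this file whenever many pfns are retargeted at once. Schematically (an illustrative condensation, not an addition to the patch; npages, pfns, and mfns here are hypothetical):

	int i, locked;

	/* clear and lock the contig pfnlist only when over the quota */
	locked = balloon_lock_contig_pfnlist(npages);
	for (i = 0; i < npages; i++)
		reassign_pfn(pfns[i], mfns[i]);
	if (locked)
		unlock_contig_pfnlist();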
+
+	/*
+	 * Remember where I said that we don't call this function?  The
+	 * missing code right here is why.  We need to set up kpm mappings
+	 * for any new pages coming in.  However, if someone starts up a
+	 * domain with small memory, then greatly increases it, we could get
+	 * in some horrible deadlock situations as we steal page tables for
+	 * kpm use, and userland applications take them right back before we
+	 * can use them to set up our new memory.  Once a way around that is
+	 * found, and a few other changes are made, we'll be able to enable
+	 * this code.
+	 */
+
+	/*
+	 * Update kernel structures, part 1: memsegs list
+	 */
+	mem->memseg.pages_base = meta_start;
+	mem->memseg.pages_end = bln_stats.bln_max_pages - 1;
+	mem->memseg.pages = &page_array[0];
+	mem->memseg.epages = &page_array[num_pages - 1];
+	mem->memseg.next = NULL;
+	memsegs_lock(1);
+	for (segp = memsegs; segp->next != NULL; segp = segp->next)
+		;
+	segp->next = &mem->memseg;
+	memsegs_unlock(1);
+
+	/*
+	 * Update kernel structures, part 2: mem_node array
+	 */
+	mem_node_add_slice(meta_start, bln_stats.bln_max_pages);
+
+	/*
+	 * Update kernel structures, part 3: phys_install array
+	 * (*sigh* how many of these things do we need?)
+	 */
+	memlist_write_lock();
+	memlist_add(pfn_to_pa(meta_start), num_pages, &mem->memlist,
+	    &phys_install);
+	memlist_write_unlock();
+
+	build_pfn_hash();
+
+	return (metapgs);
+}
+
+/* How many ulong_t's can we fit on a page? */
+#define	FRAME_ARRAY_SIZE	(PAGESIZE / sizeof (ulong_t))
+
+/*
+ * These are too large to declare on the stack, so we make them static
+ * instead
+ */
+static ulong_t	mfn_frames[FRAME_ARRAY_SIZE];
+static pfn_t	pfn_frames[FRAME_ARRAY_SIZE];
+
+/*
+ * This function is called when our reservation is increasing.  Make a
+ * hypervisor call to get our new pages, then integrate them into the system.
+ */
+static spgcnt_t
+balloon_inc_reservation(ulong_t credit)
+{
+	int	i, cnt, locked;
+	int	meta_pg_start, meta_pg_end;
+	long	rv;
+	page_t	*pp;
+	page_t	*new_list_front, *new_list_back;
+
+	rv = 0;
+	new_list_front = new_list_back = NULL;
+	meta_pg_start = meta_pg_end = 0;
+	bzero(mfn_frames, PAGESIZE);
+
+	if (credit > FRAME_ARRAY_SIZE)
+		credit = FRAME_ARRAY_SIZE;
+
+	xen_block_migrate();
+	rv = balloon_alloc_pages(credit, mfn_frames);
+
+	if (rv < 0) {
+		xen_allow_migrate();
+		return (0);
+	}
+	for (i = 0; i < rv; i++) {
+		if (mfn_frames[i] > new_high_mfn)
+			new_high_mfn = mfn_frames[i];
+
+		pp = balloon_page_sub();
+		if (pp == NULL) {
+			/*
+			 * We pass the index into the current mfn array,
+			 * then move the counter past the mfns we used
+			 */
+			meta_pg_start = i;
+			cnt = balloon_init_new_pages(&mfn_frames[i], rv - i);
+			i += cnt;
+			meta_pg_end = i;
+			if (i < rv) {
+				pp = balloon_page_sub();
+			} else {
+				ASSERT(i == rv);
+			}
+		}
+		if (pp == NULL) {
+			break;
+		}
+
+		if (new_list_back == NULL) {
+			new_list_front = new_list_back = pp;
+		} else {
+			new_list_back->p_next = pp;
+			new_list_back = pp;
+		}
+		pp->p_next = NULL;
+	}
+	cnt = i;
+	locked = balloon_lock_contig_pfnlist(cnt);
+	for (i = 0, pp = new_list_front; (i < meta_pg_start) && (pp != NULL);
+	    i++, pp = pp->p_next) {
+		reassign_pfn(pp->p_pagenum, mfn_frames[i]);
+	}
+	for (i = meta_pg_end; (i < cnt) && (pp != NULL); i++, pp = pp->p_next) {
+		reassign_pfn(pp->p_pagenum, mfn_frames[i]);
+	}
+	if (locked)
+		unlock_contig_pfnlist();
+	while (new_list_front != NULL) {
+		pp = new_list_front;
+		new_list_front = pp->p_next;
+		page_free(pp, 1);
+	}
+	page_unresv(cnt - (meta_pg_end - meta_pg_start));
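A note on batch sizes: the frame arrays are one page each, so a single pass through balloon_inc_reservation() is bounded by FRAME_ARRAY_SIZE frames; larger targets are simply reached over repeated passes of the worker loop. The arithmetic, with values assumed for a 64-bit, 4 KB-page configuration:

	/*
	 * PAGESIZE = 4096, sizeof (ulong_t) = 8:
	 *	FRAME_ARRAY_SIZE = 4096 / 8 = 512 frames per call,
	 *	or 512 * 4 KB = 2 MB of reservation change per batch.
	 */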
+
+	if (cnt < rv) {
+		/*
+		 * We couldn't get page structures.
+		 *
+		 * This shouldn't happen, but causes no real harm if it does.
+		 * On debug kernels, we'll flag it.  On all kernels, we'll
+		 * give back the pages we couldn't assign.
+		 */
+#ifdef DEBUG
+		cmn_err(CE_WARN, "Could only assign %d of %ld pages", i, rv);
+#endif	/* DEBUG */
+
+		(void) balloon_free_pages(rv - i, &mfn_frames[i], NULL, NULL);
+
+		rv = i;
+	}
+
+	xen_allow_migrate();
+	return (rv);
+}
+
+/*
+ * This function is called when we want to decrease the memory reservation
+ * of our domain.  Allocate the memory and make a hypervisor call to give
+ * it back.
+ */
+static spgcnt_t
+balloon_dec_reservation(ulong_t debit)
+{
+	int	i, locked;
+	long	rv;
+	page_t	*pp;
+
+	bzero(mfn_frames, sizeof (mfn_frames));
+	bzero(pfn_frames, sizeof (pfn_frames));
+
+	if (debit > FRAME_ARRAY_SIZE) {
+		debit = FRAME_ARRAY_SIZE;
+	}
+
+	/*
+	 * Don't bother if there isn't a safe amount of kmem left.
+	 */
+	if (kmem_avail() < balloon_minkmem) {
+		kmem_reap();
+		if (kmem_avail() < balloon_minkmem)
+			return (0);
+	}
+
+	if (page_resv(debit, KM_NOSLEEP) == 0) {
+		return (0);
+	}
+	xen_block_migrate();
+	for (i = 0; i < debit; i++) {
+		pp = page_get_high_mfn(new_high_mfn);
+		new_high_mfn = 0;
+		if (pp == NULL) {
+			/*
+			 * Call kmem_reap(), then try once more,
+			 * but only if there is a safe amount of
+			 * kmem left.
+			 */
+			kmem_reap();
+			if (kmem_avail() < balloon_minkmem ||
+			    (pp = page_get_high_mfn(0)) == NULL) {
+				debit = i;
+				break;
+			}
+		}
+		ASSERT(PAGE_EXCL(pp));
+		ASSERT(!hat_page_is_mapped(pp));
+
+		balloon_page_add(pp);
+		pfn_frames[i] = pp->p_pagenum;
+		mfn_frames[i] = pfn_to_mfn(pp->p_pagenum);
+	}
+	if (debit == 0) {
+		xen_allow_migrate();
+		return (0);
+	}
+
+	/*
+	 * Remove all mappings for the pfns from the system
+	 */
+	locked = balloon_lock_contig_pfnlist(debit);
+	for (i = 0; i < debit; i++) {
+		reassign_pfn(pfn_frames[i], MFN_INVALID);
+	}
+	if (locked)
+		unlock_contig_pfnlist();
+
+	rv = balloon_free_pages(debit, mfn_frames, NULL, NULL);
+
+	if (rv < 0) {
+		cmn_err(CE_WARN, "Attempt to return pages to the hypervisor "
+		    "failed - up to %lu pages lost (error = %ld)", debit, rv);
+		rv = 0;
+	} else if (rv != debit) {
+		panic("Unexpected return value (%ld) from decrease "
+		    "reservation hypervisor call", rv);
+	}
+
+	xen_allow_migrate();
+	return (rv);
+}
+
+/*
+ * This function is the callback which is called when the memory/target
+ * node is changed.  When it is fired, we will read a new reservation
+ * target for our domain and signal the worker thread to make the change.
+ *
+ * If the reservation is larger than we can handle, we issue a warning.
+ * dom0 does this automatically every boot, so we skip the first warning
+ * on dom0.
+ */
+/*ARGSUSED*/
+static void
+balloon_handler(struct xenbus_watch *watch, const char **vec, uint_t len)
+{
+	ulong_t new_target_kb;
+	pgcnt_t new_target_pages;
+	int rv;
+	static uchar_t warning_cnt = 0;
+
+	rv = xenbus_scanf(NULL, "memory", "target", "%lu", &new_target_kb);
+	if (rv != 0) {
+		return;
+	}
+
+	/* new_target is in kB - change this to pages */
+	new_target_pages = kbtop(new_target_kb);
+
+	DTRACE_PROBE1(balloon__new__target, pgcnt_t, new_target_pages);
+
+	/*
+	 * Unfortunately, dom0 may give us a target that is larger than
+	 * our max limit.  Re-check the limit, and, if the new target is
+	 * too large, adjust it downwards.
+	 */
+	mutex_enter(&bln_mutex);
+	if (new_target_pages > bln_stats.bln_max_pages) {
+		DTRACE_PROBE2(balloon__target__too__large, pgcnt_t,
+		    new_target_pages, pgcnt_t, bln_stats.bln_max_pages);
+		if (!DOMAIN_IS_INITDOMAIN(xen_info) || warning_cnt != 0) {
+			cmn_err(CE_WARN, "New balloon target (0x%lx pages) is "
+			    "larger than original memory size (0x%lx pages). "
+			    "Ballooning beyond original memory size is not "
+			    "allowed.",
+			    new_target_pages, bln_stats.bln_max_pages);
+		}
+		warning_cnt = 1;
+		bln_stats.bln_new_target = bln_stats.bln_max_pages;
+	} else {
+		bln_stats.bln_new_target = new_target_pages;
+	}
+
+	mutex_exit(&bln_mutex);
+	cv_signal(&bln_cv);
+}
+
+/*
+ * bln_wait_sec can be used to throttle the hv calls, but by default it's
+ * turned off.  If a balloon attempt fails, the wait time is forced on,
+ * and then is exponentially increased as further attempts fail.
+ */
+uint_t bln_wait_sec = 0;
+uint_t bln_wait_shift = 1;
+
+/*
+ * This is the main balloon thread.  Wait on the cv.  When woken, if our
+ * reservation has changed, call the appropriate function to adjust the
+ * reservation.
+ */
+static void
+balloon_worker_thread(void)
+{
+	uint_t		bln_wait;
+	callb_cpr_t	cprinfo;
+	spgcnt_t	rv;
+
+	bln_wait = bln_wait_sec;
+
+	CALLB_CPR_INIT(&cprinfo, &bln_mutex, callb_generic_cpr, "balloon");
+	for (;;) {
+		rv = 0;
+
+		mutex_enter(&bln_mutex);
+		CALLB_CPR_SAFE_BEGIN(&cprinfo);
+		if (bln_stats.bln_new_target != bln_stats.bln_current_pages) {
+			/*
+			 * We weren't able to fully complete the request
+			 * last time through, so try again.
+			 */
+			(void) cv_timedwait(&bln_cv, &bln_mutex,
+			    lbolt + (bln_wait * hz));
+		} else {
+			cv_wait(&bln_cv, &bln_mutex);
+		}
+		CALLB_CPR_SAFE_END(&cprinfo, &bln_mutex);
+
+		if (bln_stats.bln_new_target != bln_stats.bln_current_pages) {
+			if (bln_stats.bln_new_target <
+			    bln_stats.bln_current_pages) {
+				/* reservation shrunk */
+				rv = -balloon_dec_reservation(
+				    bln_stats.bln_current_pages -
+				    bln_stats.bln_new_target);
+			} else if (bln_stats.bln_new_target >
+			    bln_stats.bln_current_pages) {
+				/* reservation grew */
+				rv = balloon_inc_reservation(
+				    bln_stats.bln_new_target -
+				    bln_stats.bln_current_pages);
+			}
+		}
+		if (rv == 0) {
+			if (bln_wait == 0) {
+				bln_wait = 1;
+			} else {
+				bln_wait <<= bln_wait_shift;
+			}
+		} else {
+			bln_stats.bln_current_pages += rv;
+			bln_wait = bln_wait_sec;
+		}
+		if (bln_stats.bln_current_pages < bln_stats.bln_low)
+			bln_stats.bln_low = bln_stats.bln_current_pages;
+		else if (bln_stats.bln_current_pages > bln_stats.bln_high)
+			bln_stats.bln_high = bln_stats.bln_current_pages;
+		mutex_exit(&bln_mutex);
+	}
+}
+
+/*
+ * Called after balloon_init(), which is below.  The xenbus thread is up
+ * and running, so we can register our watch and create the balloon thread.
+ */
+static void
+balloon_config_watch(int state)
+{
+	if (state != XENSTORE_UP)
+		return;
+
+	bln_watch.node = "memory/target";
+	bln_watch.callback = balloon_handler;
+	if (register_xenbus_watch(&bln_watch)) {
+		cmn_err(CE_WARN, "Failed to register balloon watcher; balloon "
+		    "thread will be disabled");
+		return;
+	}
+
+	if (bln_thread == NULL)
+		bln_thread = thread_create(NULL, 0, balloon_worker_thread,
+		    NULL, 0, &p0, TS_RUN, minclsyspri);
+}
+
+/*
+ * Basic initialization of the balloon thread.  Set all of our variables,
+ * and register a callback for later when we can register a xenbus watch.
+ */
+void
+balloon_init(pgcnt_t nr_pages)
+{
+	domid_t domid = DOMID_SELF;
+
+	bln_stats.bln_current_pages = bln_stats.bln_low = nr_pages;
+	bln_stats.bln_new_target = bln_stats.bln_high = nr_pages;
+	bln_stats.bln_max_pages = nr_pages;
+	cv_init(&bln_cv, NULL, CV_DEFAULT, NULL);
+
+	/* init balloon zero logic */
+	balloon_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
+	mutex_init(&balloon_kva_mutex, NULL, MUTEX_DRIVER, NULL);
+
+	bln_stats.bln_hard_limit = (spgcnt_t)HYPERVISOR_memory_op(
+	    XENMEM_maximum_reservation, &domid);
+
+	(void) xs_register_xenbus_callback(balloon_config_watch);
+}
+
+/*
+ * These functions are called from the network drivers when they gain a page
+ * or give one away.  We simply update our count.  Note that the counter
+ * tracks the number of pages we give away, so we need to subtract any
+ * amount passed to balloon_drv_added.
+ */
+void
+balloon_drv_added(int64_t delta)
+{
+	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -delta);
+}
+
+void
+balloon_drv_subtracted(int64_t delta)
+{
+	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, delta);
+}
+
+/*
+ * balloon_alloc_pages()
+ *	Allocate page_cnt mfns.  mfns storage provided by the caller.
+ *	Returns the number of pages allocated, which could be less than
+ *	page_cnt, or a negative number if an error occurred.
+ */
+long
+balloon_alloc_pages(uint_t page_cnt, mfn_t *mfns)
+{
+	xen_memory_reservation_t memres;
+	long rv;
+
+	bzero(&memres, sizeof (memres));
+	/*LINTED: constant in conditional context*/
+	set_xen_guest_handle(memres.extent_start, mfns);
+	memres.domid = DOMID_SELF;
+	memres.nr_extents = page_cnt;
+
+	rv = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
+	if (rv > 0)
+		atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -rv);
+	return (rv);
+}
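A hypothetical caller (not in this patch) asking the hypervisor for a small batch of frames might look like the following; note that the return value may be any count from zero up to the request, or negative on error:

	mfn_t frames[16];
	long got;

	got = balloon_alloc_pages(16, frames);
	if (got < 0) {
		cmn_err(CE_WARN, "increase_reservation failed: %ld", got);
	} else if (got < 16) {
		/* partial success: only frames[0 .. got-1] are valid */
	}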
+
+/*
+ * balloon_free_pages()
+ *	free page_cnt pages, using any combination of mfns, pfns, and kva
+ *	as long as they refer to the same mapping.  We need to zero the
+ *	pages before giving them back to the hypervisor.  kva space is
+ *	not free'd up in case the caller wants to re-use it.
+ */
+long
+balloon_free_pages(uint_t page_cnt, mfn_t *mfns, caddr_t kva, pfn_t *pfns)
+{
+	xen_memory_reservation_t memdec;
+	mfn_t mfn;
+	pfn_t pfn;
+	uint_t i;
+	long e;
+
+
+#if DEBUG
+	/* make sure kva is page aligned and maps to first pfn */
+	if (kva != NULL) {
+		ASSERT(((uintptr_t)kva & PAGEOFFSET) == 0);
+		if (pfns != NULL) {
+			ASSERT(hat_getpfnum(kas.a_hat, kva) == pfns[0]);
+		}
+	}
+#endif
+
+	/* if we have a kva, we can clean all pages with just one bzero */
+	if ((kva != NULL) && balloon_zero_memory) {
+		bzero(kva, (page_cnt * PAGESIZE));
+	}
+
+	/* if we were given a kva and/or a pfn */
+	if ((kva != NULL) || (pfns != NULL)) {
+
+		/*
+		 * All the current callers only pass 1 page when using kva or
+		 * pfns, and use mfns when passing multiple pages.  If that
+		 * assumption is changed, the following code will need some
+		 * work.  The following ASSERT() guarantees we're respecting
+		 * the io locking quota.
+		 */
+		ASSERT(page_cnt < bln_contig_list_quota);
+
+		/* go through all the pages */
+		for (i = 0; i < page_cnt; i++) {
+
+			/* get the next pfn */
+			if (pfns == NULL) {
+				pfn = hat_getpfnum(kas.a_hat,
+				    (kva + (PAGESIZE * i)));
+			} else {
+				pfn = pfns[i];
+			}
+
+			/*
+			 * if we didn't already zero this page, do it now.
+			 * we need to do this *before* we give back the MFN
+			 */
+			if ((kva == NULL) && (balloon_zero_memory)) {
+				balloon_zero_page(pfn);
+			}
+
+			/*
+			 * unmap the pfn.  We don't free up the kva vmem
+			 * space so the caller can re-use it.  The page
+			 * must be unmapped before it is given back to
+			 * the hypervisor.
+			 */
+			if (kva != NULL) {
+				hat_unload(kas.a_hat, (kva + (PAGESIZE * i)),
+				    PAGESIZE, HAT_UNLOAD_UNMAP);
+			}
+
+			/* grab the mfn before the pfn is marked as invalid */
+			mfn = pfn_to_mfn(pfn);
+
+			/* mark the pfn as invalid */
+			reassign_pfn(pfn, MFN_INVALID);
+
+			/*
+			 * if we weren't given an array of MFNs, we need to
+			 * free them up one at a time.  Otherwise, we'll wait
+			 * until later and do it in one hypercall
+			 */
+			if (mfns == NULL) {
+				bzero(&memdec, sizeof (memdec));
+				/*LINTED: constant in conditional context*/
+				set_xen_guest_handle(memdec.extent_start,
+				    &mfn);
+				memdec.domid = DOMID_SELF;
+				memdec.nr_extents = 1;
+				e = HYPERVISOR_memory_op(
+				    XENMEM_decrease_reservation, &memdec);
+				if (e != 1) {
+					cmn_err(CE_PANIC, "balloon: unable to "
+					    "give a page back to the "
+					    "hypervisor.\n");
+				}
+			}
+		}
+
+	/*
+	 * if all we were given was an array of MFN's, we only need to zero
+	 * out each page.  The MFNs will be free'd up below.
+	 */
+	} else if (balloon_zero_memory) {
+		ASSERT(mfns != NULL);
+		for (i = 0; i < page_cnt; i++) {
+			pfn = xen_assign_pfn(mfns[i]);
+			balloon_zero_page(pfn);
+			xen_release_pfn(pfn);
+		}
+	}
+
+	/*
+	 * if we were passed in MFNs, we haven't free'd them up yet.  We
+	 * can do it with one call.
+	 */
+	if (mfns != NULL) {
+		bzero(&memdec, sizeof (memdec));
+		/*LINTED: constant in conditional context*/
+		set_xen_guest_handle(memdec.extent_start, mfns);
+		memdec.domid = DOMID_SELF;
+		memdec.nr_extents = page_cnt;
+		e = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &memdec);
+		if (e != page_cnt) {
+			cmn_err(CE_PANIC, "balloon: unable to give pages "
+			    "back to the hypervisor.\n");
+		}
+	}
+
+	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, page_cnt);
+	return (page_cnt);
+}
+
+
+/*
+ * balloon_replace_pages()
+ *	Try to replace nextents blocks of 2^order pages.  addr_bits
+ *	specifies how many bits of address the pages must be within
+ *	(i.e. 16 would mean that the pages cannot have an address > 64k).
+ *	The constraints are on what the hypervisor gives us -- we are
+ *	free to give any pages in exchange.  The array pp is the pages
+ *	we are giving away.  The caller provides storage space for mfns,
+ *	which hold the new physical pages.
+ */
+long
+balloon_replace_pages(uint_t nextents, page_t **pp, uint_t addr_bits,
+    uint_t order, mfn_t *mfns)
+{
+	xen_memory_reservation_t memres;
+	long fallback_cnt;
+	long cnt;
+	uint_t i, j, page_cnt, extlen;
+	long e;
+	int locked;
+
+
+	/*
+	 * we shouldn't be allocating constrained pages on a guest.  It
+	 * doesn't make any sense.  They won't be constrained after a
+	 * migration.
+	 */
+	ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
+
+	extlen = 1 << order;
+	page_cnt = nextents * extlen;
+	/* Give back the current pages to the hypervisor */
+	for (i = 0; i < page_cnt; i++) {
+		cnt = balloon_free_pages(1, NULL, NULL, &pp[i]->p_pagenum);
+		if (cnt != 1) {
+			cmn_err(CE_PANIC, "balloon: unable to give a page "
+			    "back to the hypervisor.\n");
+		}
+	}
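To make the extent arithmetic concrete (values assumed for illustration, not from the patch): a request with nextents = 4 and order = 2 exchanges 16 pages, and addr_bits = 32 would constrain the replacement frames to lie below 4 GB, as a driver with 32-bit DMA limits would need:

	/*
	 * nextents = 4, order = 2, addr_bits = 32:
	 *	extlen   = 1 << 2 = 4 pages per extent
	 *	page_cnt = 4 * 4 = 16 pages exchanged
	 *	constraint: every new mfn < (1UL << (32 - PAGESHIFT))
	 */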
+
+	/*
+	 * try to allocate the new pages using addr_bits and order.  If we
+	 * can't get all of the pages, try to get the remaining pages with
+	 * no constraints and, if that was successful, return the number of
+	 * constrained pages we did allocate.
+	 */
+	bzero(&memres, sizeof (memres));
+	/*LINTED: constant in conditional context*/
+	set_xen_guest_handle(memres.extent_start, mfns);
+	memres.domid = DOMID_SELF;
+	memres.nr_extents = nextents;
+	memres.address_bits = addr_bits;
+	memres.extent_order = order;
+	cnt = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
+	/* assign the new MFNs to the current PFNs */
+	locked = balloon_lock_contig_pfnlist(cnt * extlen);
+	for (i = 0; i < cnt; i++) {
+		for (j = 0; j < extlen; j++) {
+			reassign_pfn(pp[i * extlen + j]->p_pagenum,
+			    mfns[i] + j);
+		}
+	}
+	if (locked)
+		unlock_contig_pfnlist();
+	if (cnt != nextents) {
+		if (cnt < 0) {
+			cnt = 0;
+		}
+
+		/*
+		 * We couldn't get enough memory to satisfy our requirements.
+		 * The above loop will assign the parts of the request that
+		 * were successful (this part may be 0).  We need to fill
+		 * in the rest.  The bzero below clears out extent_order and
+		 * address_bits, so we'll take anything from the hypervisor
+		 * to replace the pages we gave away.
+		 */
+		fallback_cnt = page_cnt - cnt * extlen;
+		bzero(&memres, sizeof (memres));
+		/*LINTED: constant in conditional context*/
+		set_xen_guest_handle(memres.extent_start, mfns);
+		memres.domid = DOMID_SELF;
+		memres.nr_extents = fallback_cnt;
+		e = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
+		if (e != fallback_cnt) {
+			cmn_err(CE_PANIC, "balloon: unable to recover from "
+			    "failed increase_reservation.\n");
+		}
+		locked = balloon_lock_contig_pfnlist(fallback_cnt);
+		for (i = 0; i < fallback_cnt; i++) {
+			uint_t offset = page_cnt - fallback_cnt;
+
+			/*
+			 * We already used pp[0...(cnt * extlen)] before,
+			 * so start at the next entry in the pp array.
+			 */
+			reassign_pfn(pp[i + offset]->p_pagenum, mfns[i]);
+		}
+		if (locked)
+			unlock_contig_pfnlist();
+	}
+
+	/*
+	 * balloon_free_pages increments our counter.  Decrement it here.
+	 */
+	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -(long)page_cnt);
+
+	/*
+	 * return the number of extents we were able to replace.  If we got
+	 * this far, we know all the pp's are valid.
+	 */
+	return (cnt);
+}
+
+
+/*
+ * balloon_zero_page()
+ *	zero out the page.
+ */
+static void
+balloon_zero_page(pfn_t pfn)
+{
+	/* balloon_init() should have been called first */
+	ASSERT(balloon_kva != NULL);
+
+	mutex_enter(&balloon_kva_mutex);
+
+	/* map the pfn into kva, zero the page, then unmap the pfn */
+	hat_devload(kas.a_hat, balloon_kva, PAGESIZE, pfn,
+	    HAT_STORECACHING_OK | PROT_READ | PROT_WRITE | HAT_NOSYNC,
+	    HAT_LOAD_LOCK);
+	bzero(balloon_kva, PAGESIZE);
+	hat_unload(kas.a_hat, balloon_kva, PAGESIZE, HAT_UNLOAD);
+
+	mutex_exit(&balloon_kva_mutex);
+}
+
+/*
+ * Called from the driver - return the requested stat.
+ */
+size_t
+balloon_values(int cmd)
+{
+	switch (cmd) {
+	case BLN_IOCTL_CURRENT:
+		return (ptokb(bln_stats.bln_current_pages));
+	case BLN_IOCTL_TARGET:
+		return (ptokb(bln_stats.bln_new_target));
+	case BLN_IOCTL_LOW:
+		return (ptokb(bln_stats.bln_low));
+	case BLN_IOCTL_HIGH:
+		return (ptokb(bln_stats.bln_high));
+	case BLN_IOCTL_LIMIT:
+		return (ptokb(bln_stats.bln_hard_limit));
+	default:
+		panic("Unexpected cmd %d in balloon_values()\n", cmd);
+	}
+	/*NOTREACHED*/
+}
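Finally, a sketch of how a consumer above this layer might read these stats (hypothetical caller, not part of the patch; all values are in kilobytes, per the ptokb() conversions above):

	size_t cur_kb, tgt_kb;

	cur_kb = balloon_values(BLN_IOCTL_CURRENT);
	tgt_kb = balloon_values(BLN_IOCTL_TARGET);
	if (cur_kb != tgt_kb) {
		/* the balloon worker is still adjusting the reservation */
	}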