Diffstat (limited to 'usr/src/uts/i86xpv/os/balloon.c')
 usr/src/uts/i86xpv/os/balloon.c | 1065 ++++++++++++++++++++++++++++++++++++
 1 file changed, 1065 insertions(+), 0 deletions(-)
diff --git a/usr/src/uts/i86xpv/os/balloon.c b/usr/src/uts/i86xpv/os/balloon.c
new file mode 100644
index 0000000000..3866acf364
--- /dev/null
+++ b/usr/src/uts/i86xpv/os/balloon.c
@@ -0,0 +1,1065 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/balloon_impl.h>
+#include <sys/hypervisor.h>
+#include <xen/sys/xenbus_impl.h>
+#include <sys/atomic.h>
+#include <sys/cmn_err.h>
+#include <sys/disp.h>
+#include <sys/callb.h>
+#include <xen/public/memory.h>
+#include <vm/hat.h>
+#include <sys/promif.h>
+#include <vm/seg_kmem.h>
+#include <sys/memnode.h>
+#include <sys/param.h>
+#include <vm/vm_dep.h>
+#include <sys/mman.h>
+#include <sys/memlist.h>
+#include <sys/sysmacros.h>
+#include <sys/machsystm.h>
+#include <sys/sdt.h>
+
+/*
+ * This file implements a balloon thread, which controls a domain's memory
+ * reservation, or the amount of memory a domain is currently allocated.
+ * The domain's target reservation is published through the xenbus
+ * "memory/target" node, so we register a watch on it. We will then be
+ * signalled when the
+ * reservation changes. If it goes up, we map the new mfn's to our pfn's
+ * (allocating page_t's if necessary), and release them into the system.
+ * If the reservation goes down, we grab pages and release them back to
+ * the hypervisor, saving the page_t's for later use.
+ */
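+
+/*
+ * In outline, the flow implemented by the routines below is:
+ *
+ *   xenstore node "memory/target" changes (value in kB)
+ *     -> balloon_handler() converts kB to pages and signals bln_cv
+ *     -> balloon_worker_thread() wakes and calls balloon_inc_reservation()
+ *        or balloon_dec_reservation() until bln_current_pages reaches
+ *        bln_new_target
+ */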
+
+/*
+ * Various structures needed by the balloon thread
+ */
+static bln_stats_t bln_stats;
+static kthread_t *bln_thread;
+static kmutex_t bln_mutex;
+static kcondvar_t bln_cv;
+static struct xenbus_watch bln_watch;
+static mfn_t new_high_mfn;
+
+/*
+ * For holding spare page_t structures - keep a singly-linked list.
+ * The list may hold both valid (pagenum < mfn_count) and invalid
+ * (pagenum >= mfn_count) page_t's. Valid page_t's should be inserted
+ * at the front, and invalid page_t's at the back. Removal should
+ * always be from the front. This is a singly-linked list using
+ * p_next, so p_prev is always NULL.
+ */
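+/*
+ * For example (hypothetical pagenums, with mfn_count == 100):
+ *
+ *   front -> [97] -> [42] -> [100] -> [101] -> NULL
+ *           (valid) (valid) (invalid) (invalid = back)
+ */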
+static page_t *bln_spare_list_front, *bln_spare_list_back;
+
+int balloon_zero_memory = 1;
+size_t balloon_minkmem = (8 * 1024 * 1024);
+static caddr_t balloon_kva;
+static kmutex_t balloon_kva_mutex;
+static void balloon_zero_page(pfn_t pfn);
+
+/*
+ * reassign_pfn() calls update_contig_pfnlist(), which can cause a large
+ * slowdown when calling multiple times. If we're reassigning less than the
+ * quota defined here, we just accept the slowdown. If the count is greater
+ * than the quota, we tell the contig alloc code to stop its accounting until
+ * we're done. Setting the quota to less than 2 is not supported.
+ *
+ * Note that we define our own wrapper around the external
+ * clear_and_lock_contig_pfnlist(), but we just use the version of
+ * unlock_contig_pfnlist() in vm_machdep.c.
+ */
+uint_t bln_contig_list_quota = 50;
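+
+/*
+ * With the default quota of 50, for example, reassigning 200 pfns locks
+ * the pfnlist once and suspends the contig accounting for the duration,
+ * while reassigning 10 pfns simply takes the per-call slowdown.
+ */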
+
+extern void clear_and_lock_contig_pfnlist(void);
+extern void unlock_contig_pfnlist(void);
+
+/*
+ * Lock the pfnlist if necessary (see above), and return whether we locked it.
+ */
+static int
+balloon_lock_contig_pfnlist(int count)
+{
+ if (count > bln_contig_list_quota) {
+ clear_and_lock_contig_pfnlist();
+ return (1);
+ } else {
+ return (0);
+ }
+}
+
+/*
+ * The page represented by pp is being given back to the hypervisor.
+ * Add the page_t structure to our spare list.
+ */
+static void
+balloon_page_add(page_t *pp)
+{
+ /*
+ * We need to keep the page exclusively locked
+ * to prevent swrand from grabbing it.
+ */
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(MUTEX_HELD(&bln_mutex));
+
+ pp->p_prev = NULL;
+ if (bln_spare_list_front == NULL) {
+ bln_spare_list_front = bln_spare_list_back = pp;
+ pp->p_next = NULL;
+ } else if (pp->p_pagenum >= mfn_count) {
+ /*
+ * The pfn is invalid, so add at the end of list. Since these
+ * adds should *only* be done by balloon_init_new_pages(), and
+ * that does adds in order, the following ASSERT should
+ * never trigger.
+ */
+ ASSERT(pp->p_pagenum > bln_spare_list_back->p_pagenum);
+ bln_spare_list_back->p_next = pp;
+ pp->p_next = NULL;
+ bln_spare_list_back = pp;
+ } else {
+ /* Add at beginning of list */
+ pp->p_next = bln_spare_list_front;
+ bln_spare_list_front = pp;
+ }
+}
+
+/*
+ * Return a page_t structure from our spare list, or NULL if none are available.
+ */
+static page_t *
+balloon_page_sub(void)
+{
+ page_t *pp;
+
+ ASSERT(MUTEX_HELD(&bln_mutex));
+ if (bln_spare_list_front == NULL) {
+ return (NULL);
+ }
+
+ pp = bln_spare_list_front;
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(pp->p_pagenum <= mfn_count);
+ if (pp->p_pagenum == mfn_count) {
+ return (NULL);
+ }
+
+ bln_spare_list_front = pp->p_next;
+ if (bln_spare_list_front == NULL)
+ bln_spare_list_back = NULL;
+ pp->p_next = NULL;
+ return (pp);
+}
+
+/*
+ * NOTE: We currently do not support growing beyond the boot memory size,
+ * so the following function will not be called. It is left in here with
+ * the hope that someday this restriction can be lifted, and this code can
+ * be used.
+ */
+
+/*
+ * This structure is placed at the start of every block of new pages
+ */
+typedef struct {
+ struct memseg memseg;
+ struct memlist memlist;
+ page_t pages[1];
+} mem_structs_t;
+
+/*
+ * To make the math below slightly less confusing, we calculate the first
+ * two parts here. page_t's are handled separately, so they are not included.
+ */
+#define MEM_STRUCT_SIZE (sizeof (struct memseg) + sizeof (struct memlist))
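+
+/*
+ * The meta pages are thus laid out as follows (see mem_structs_t above
+ * and the hat_devload()/bzero() in balloon_init_new_pages() below):
+ *
+ *   | struct memseg | struct memlist | page_t[0] ... page_t[n-1] |
+ *   |<------- MEM_STRUCT_SIZE ------>|
+ */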
+
+/*
+ * We want to add memory, but have no spare page_t structures. Use some of
+ * our new memory for the page_t structures.
+ *
+ * Somewhat similar to kphysm_add_memory_dynamic(), but simpler.
+ */
+static int
+balloon_init_new_pages(mfn_t framelist[], pgcnt_t count)
+{
+ pgcnt_t metapgs, totalpgs, num_pages;
+ paddr_t metasz;
+ pfn_t meta_start;
+ page_t *page_array;
+ caddr_t va;
+ int i, rv, locked;
+ mem_structs_t *mem;
+ struct memseg *segp;
+
+ /* Calculate the number of pages we're going to add */
+ totalpgs = bln_stats.bln_new_target - bln_stats.bln_current_pages;
+
+ /*
+ * The following calculates the number of "meta" pages -- the pages
+ * that will be required to hold page_t structures for all new pages.
+ * A sketch of the derivation is given just below the expression.
+ */
+ metapgs = totalpgs - (((uint64_t)(totalpgs) << PAGESHIFT) /
+ (PAGESIZE + sizeof (page_t)));
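+
+ /*
+ * Sketch of the derivation: split totalpgs into datapgs + metapgs.
+ * Each data page consumes its own PAGESIZE bytes plus sizeof (page_t)
+ * bytes of meta page, so
+ * datapgs ~= (totalpgs * PAGESIZE) / (PAGESIZE + sizeof (page_t)),
+ * and metapgs = totalpgs - datapgs, which is the expression above.
+ */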
+
+ /*
+ * Given the number of page_t structures we need, is there also
+ * room in our meta pages for a memseg and memlist struct?
+ * If not, we'll need one more meta page.
+ */
+ if ((metapgs << PAGESHIFT) < (totalpgs * sizeof (page_t) +
+ MEM_STRUCT_SIZE))
+ metapgs++;
+
+ /*
+ * metapgs is calculated from totalpgs, which may be much larger than
+ * count. If we don't have enough pages, all of the pages in this
+ * batch will be made meta pages, and a future trip through
+ * balloon_inc_reservation() will add the rest of the meta pages.
+ */
+ if (metapgs > count)
+ metapgs = count;
+
+ /*
+ * Figure out the number of page_t structures that can fit in metapgs
+ *
+ * This will cause us to initialize more page_t structures than we
+ * need - these may be used in future memory increases.
+ */
+ metasz = pfn_to_pa(metapgs);
+ num_pages = (metasz - MEM_STRUCT_SIZE) / sizeof (page_t);
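+ /*
+ * Illustrative numbers only (actual sizes vary by build): with 4K
+ * pages and a page_t of roughly 100 bytes, one meta page holds
+ * about 40 page_t's, i.e. about one meta page per 40 pages added.
+ */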
+
+ DTRACE_PROBE3(balloon__alloc__stats, pgcnt_t, totalpgs, pgcnt_t,
+ num_pages, pgcnt_t, metapgs);
+
+ /*
+ * We only increment mfn_count by count, not num_pages, to keep the
+ * space of all valid pfns contiguous. This means we create page_t
+ * structures with invalid pagenums -- we deal with this situation
+ * in balloon_page_sub.
+ */
+ mfn_count += count;
+
+ /*
+ * Get a VA for the pages that will hold page_t and other structures.
+ * The memseg and memlist structures will go at the beginning, with
+ * the page_t structures following.
+ */
+ va = (caddr_t)vmem_alloc(heap_arena, metasz, VM_SLEEP);
+ /* LINTED: improper alignment */
+ mem = (mem_structs_t *)va;
+ page_array = mem->pages;
+
+ meta_start = bln_stats.bln_max_pages;
+
+ /*
+ * Set the mfn to pfn mapping for the meta pages.
+ */
+ locked = balloon_lock_contig_pfnlist(metapgs);
+ for (i = 0; i < metapgs; i++) {
+ reassign_pfn(bln_stats.bln_max_pages + i, framelist[i]);
+ }
+ if (locked)
+ unlock_contig_pfnlist();
+
+ /*
+ * For our meta pages, map them in and zero the page.
+ * This will be the first time touching the new pages.
+ */
+ hat_devload(kas.a_hat, va, metasz, bln_stats.bln_max_pages,
+ PROT_READ | PROT_WRITE,
+ HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
+ bzero(va, metasz);
+
+ /*
+ * Initialize the page array for the new pages.
+ */
+ for (i = 0; i < metapgs; i++) {
+ page_array[i].p_pagenum = bln_stats.bln_max_pages++;
+ page_array[i].p_offset = (u_offset_t)-1;
+ page_iolock_init(&page_array[i]);
+ rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
+ ASSERT(rv == 1);
+ }
+
+ /*
+ * For the rest of the pages, initialize the page_t struct and
+ * add them to the free list
+ */
+ for (i = metapgs; i < num_pages; i++) {
+ page_array[i].p_pagenum = bln_stats.bln_max_pages++;
+ page_array[i].p_offset = (u_offset_t)-1;
+ page_iolock_init(&page_array[i]);
+ rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
+ ASSERT(rv == 1);
+ balloon_page_add(&page_array[i]);
+ }
+
+ /*
+ * Remember where I said that we don't call this function? The missing
+ * code right here is why. We need to set up kpm mappings for any new
+ * pages coming in. However, if someone starts up a domain with small
+ * memory, then greatly increases it, we could get in some horrible
+ * deadlock situations as we steal page tables for kpm use, and
+ * userland applications take them right back before we can use them
+ * to set up our new memory. Once a way around that is found, and a
+ * few other changes are made, we'll be able to enable this code.
+ */
+
+ /*
+ * Update kernel structures, part 1: memsegs list
+ */
+ mem->memseg.pages_base = meta_start;
+ mem->memseg.pages_end = bln_stats.bln_max_pages - 1;
+ mem->memseg.pages = &page_array[0];
+ mem->memseg.epages = &page_array[num_pages - 1];
+ mem->memseg.next = NULL;
+ memsegs_lock(1);
+ for (segp = memsegs; segp->next != NULL; segp = segp->next)
+ ;
+ segp->next = &mem->memseg;
+ memsegs_unlock(1);
+
+ /*
+ * Update kernel structures, part 2: mem_node array
+ */
+ mem_node_add_slice(meta_start, bln_stats.bln_max_pages);
+
+ /*
+ * Update kernel structures, part 3: phys_install array
+ * (*sigh* how many of these things do we need?)
+ */
+ memlist_write_lock();
+ memlist_add(pfn_to_pa(meta_start), num_pages, &mem->memlist,
+ &phys_install);
+ memlist_write_unlock();
+
+ build_pfn_hash();
+
+ return (metapgs);
+}
+
+/* How many ulong_t's can we fit on a page? */
+#define FRAME_ARRAY_SIZE (PAGESIZE / sizeof (ulong_t))
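+/* With 4K pages and an 8-byte ulong_t (64-bit kernel), this is 512. */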
+
+/*
+ * These are too large to declare on the stack, so we make them static instead
+ */
+static ulong_t mfn_frames[FRAME_ARRAY_SIZE];
+static pfn_t pfn_frames[FRAME_ARRAY_SIZE];
+
+/*
+ * This function is called when our reservation is increasing. Make a
+ * hypervisor call to get our new pages, then integrate them into the system.
+ */
+static spgcnt_t
+balloon_inc_reservation(ulong_t credit)
+{
+ int i, cnt, locked;
+ int meta_pg_start, meta_pg_end;
+ long rv;
+ page_t *pp;
+ page_t *new_list_front, *new_list_back;
+
+ rv = 0;
+ new_list_front = new_list_back = NULL;
+ meta_pg_start = meta_pg_end = 0;
+ bzero(mfn_frames, sizeof (mfn_frames));
+
+ if (credit > FRAME_ARRAY_SIZE)
+ credit = FRAME_ARRAY_SIZE;
+
+ xen_block_migrate();
+ rv = balloon_alloc_pages(credit, mfn_frames);
+
+ if (rv < 0) {
+ xen_allow_migrate();
+ return (0);
+ }
+ for (i = 0; i < rv; i++) {
+ if (mfn_frames[i] > new_high_mfn)
+ new_high_mfn = mfn_frames[i];
+
+ pp = balloon_page_sub();
+ if (pp == NULL) {
+ /*
+ * We pass the index into the current mfn array,
+ * then move the counter past the mfns we used
+ */
+ meta_pg_start = i;
+ cnt = balloon_init_new_pages(&mfn_frames[i], rv - i);
+ i += cnt;
+ meta_pg_end = i;
+ if (i < rv) {
+ pp = balloon_page_sub();
+ } else {
+ ASSERT(i == rv);
+ }
+ }
+ if (pp == NULL) {
+ break;
+ }
+
+ if (new_list_back == NULL) {
+ new_list_front = new_list_back = pp;
+ } else {
+ new_list_back->p_next = pp;
+ new_list_back = pp;
+ }
+ pp->p_next = NULL;
+ }
+ cnt = i;
+ locked = balloon_lock_contig_pfnlist(cnt);
+ for (i = 0, pp = new_list_front; (i < meta_pg_start) && (pp != NULL);
+ i++, pp = pp->p_next) {
+ reassign_pfn(pp->p_pagenum, mfn_frames[i]);
+ }
+ for (i = meta_pg_end; (i < cnt) && (pp != NULL); i++, pp = pp->p_next) {
+ reassign_pfn(pp->p_pagenum, mfn_frames[i]);
+ }
+ if (locked)
+ unlock_contig_pfnlist();
+ while (new_list_front != NULL) {
+ pp = new_list_front;
+ new_list_front = pp->p_next;
+ page_free(pp, 1);
+ }
+ page_unresv(cnt - (meta_pg_end - meta_pg_start));
+
+ if (cnt < rv) {
+ /*
+ * We couldn't get page structures.
+ *
+ * This shouldn't happen, but causes no real harm if it does.
+ * On debug kernels, we'll flag it. On all kernels, we'll
+ * give back the pages we couldn't assign.
+ */
+#ifdef DEBUG
+ cmn_err(CE_WARN, "Could only assign %d of %ld pages", i, rv);
+#endif /* DEBUG */
+
+ (void) balloon_free_pages(rv - i, &mfn_frames[i], NULL, NULL);
+
+ rv = i;
+ }
+
+ xen_allow_migrate();
+ return (rv);
+}
+
+/*
+ * This function is called when we want to decrease the memory reservation
+ * of our domain. Allocate the memory and make a hypervisor call to give
+ * it back.
+ */
+static spgcnt_t
+balloon_dec_reservation(ulong_t debit)
+{
+ int i, locked;
+ long rv;
+ page_t *pp;
+
+ bzero(mfn_frames, sizeof (mfn_frames));
+ bzero(pfn_frames, sizeof (pfn_frames));
+
+ if (debit > FRAME_ARRAY_SIZE) {
+ debit = FRAME_ARRAY_SIZE;
+ }
+
+ /*
+ * Don't bother if there isn't a safe amount of kmem left.
+ */
+ if (kmem_avail() < balloon_minkmem) {
+ kmem_reap();
+ if (kmem_avail() < balloon_minkmem)
+ return (0);
+ }
+
+ if (page_resv(debit, KM_NOSLEEP) == 0) {
+ return (0);
+ }
+ xen_block_migrate();
+ for (i = 0; i < debit; i++) {
+ pp = page_get_high_mfn(new_high_mfn);
+ new_high_mfn = 0;
+ if (pp == NULL) {
+ /*
+ * Call kmem_reap(), then try once more,
+ * but only if there is a safe amount of
+ * kmem left.
+ */
+ kmem_reap();
+ if (kmem_avail() < balloon_minkmem ||
+ (pp = page_get_high_mfn(0)) == NULL) {
+ debit = i;
+ break;
+ }
+ }
+ ASSERT(PAGE_EXCL(pp));
+ ASSERT(!hat_page_is_mapped(pp));
+
+ balloon_page_add(pp);
+ pfn_frames[i] = pp->p_pagenum;
+ mfn_frames[i] = pfn_to_mfn(pp->p_pagenum);
+ }
+ if (debit == 0) {
+ xen_allow_migrate();
+ return (0);
+ }
+
+ /*
+ * Remove all mappings for the pfns from the system
+ */
+ locked = balloon_lock_contig_pfnlist(debit);
+ for (i = 0; i < debit; i++) {
+ reassign_pfn(pfn_frames[i], MFN_INVALID);
+ }
+ if (locked)
+ unlock_contig_pfnlist();
+
+ rv = balloon_free_pages(debit, mfn_frames, NULL, NULL);
+
+ if (rv < 0) {
+ cmn_err(CE_WARN, "Attempt to return pages to the hypervisor "
+ "failed - up to %lu pages lost (error = %ld)", debit, rv);
+ rv = 0;
+ } else if (rv != debit) {
+ panic("Unexpected return value (%ld) from decrease reservation "
+ "hypervisor call", rv);
+ }
+
+ xen_allow_migrate();
+ return (rv);
+}
+
+/*
+ * This function is the callback which is called when the memory/target
+ * node is changed. When it is fired, we will read a new reservation
+ * target for our domain and signal the worker thread to make the change.
+ *
+ * If the reservation is larger than we can handle, we issue a warning. dom0
+ * does this automatically every boot, so we skip the first warning on dom0.
+ */
+/*ARGSUSED*/
+static void
+balloon_handler(struct xenbus_watch *watch, const char **vec, uint_t len)
+{
+ ulong_t new_target_kb;
+ pgcnt_t new_target_pages;
+ int rv;
+ static uchar_t warning_cnt = 0;
+
+ rv = xenbus_scanf(NULL, "memory", "target", "%lu", &new_target_kb);
+ if (rv != 0) {
+ return;
+ }
+
+ /* new_target is in kB - change this to pages */
+ new_target_pages = kbtop(new_target_kb);
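+ /* e.g. a 1048576 kB (1 GB) target becomes 262144 4K pages */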
+
+ DTRACE_PROBE1(balloon__new__target, pgcnt_t, new_target_pages);
+
+ /*
+ * Unfortunately, dom0 may give us a target that is larger than
+ * our max limit. Re-check the limit, and, if the new target is
+ * too large, adjust it downwards.
+ */
+ mutex_enter(&bln_mutex);
+ if (new_target_pages > bln_stats.bln_max_pages) {
+ DTRACE_PROBE2(balloon__target__too__large, pgcnt_t,
+ new_target_pages, pgcnt_t, bln_stats.bln_max_pages);
+ if (!DOMAIN_IS_INITDOMAIN(xen_info) || warning_cnt != 0) {
+ cmn_err(CE_WARN, "New balloon target (0x%lx pages) is "
+ "larger than original memory size (0x%lx pages). "
+ "Ballooning beyond original memory size is not "
+ "allowed.",
+ new_target_pages, bln_stats.bln_max_pages);
+ }
+ warning_cnt = 1;
+ bln_stats.bln_new_target = bln_stats.bln_max_pages;
+ } else {
+ bln_stats.bln_new_target = new_target_pages;
+ }
+
+ mutex_exit(&bln_mutex);
+ cv_signal(&bln_cv);
+}
+
+/*
+ * bln_wait_sec can be used to throttle the hv calls, but by default it's
+ * turned off. If a balloon attempt fails, the wait time is forced on, and
+ * then is exponentially increased as further attempts fail.
+ */
+uint_t bln_wait_sec = 0;
+uint_t bln_wait_shift = 1;
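+
+/*
+ * With the defaults above, a failed balloon attempt forces a 1 second
+ * wait before the retry, then 2, 4, 8, ... seconds as failures continue;
+ * any progress resets the wait to bln_wait_sec (see the worker below).
+ */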
+
+/*
+ * This is the main balloon thread. Wait on the cv. When woken, if our
+ * reservation has changed, call the appropriate function to adjust the
+ * reservation.
+ */
+static void
+balloon_worker_thread(void)
+{
+ uint_t bln_wait;
+ callb_cpr_t cprinfo;
+ spgcnt_t rv;
+
+ bln_wait = bln_wait_sec;
+
+ CALLB_CPR_INIT(&cprinfo, &bln_mutex, callb_generic_cpr, "balloon");
+ for (;;) {
+ rv = 0;
+
+ mutex_enter(&bln_mutex);
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ if (bln_stats.bln_new_target != bln_stats.bln_current_pages) {
+ /*
+ * We weren't able to fully complete the request
+ * last time through, so try again.
+ */
+ (void) cv_timedwait(&bln_cv, &bln_mutex,
+ lbolt + (bln_wait * hz));
+ } else {
+ cv_wait(&bln_cv, &bln_mutex);
+ }
+ CALLB_CPR_SAFE_END(&cprinfo, &bln_mutex);
+
+ if (bln_stats.bln_new_target != bln_stats.bln_current_pages) {
+ if (bln_stats.bln_new_target <
+ bln_stats.bln_current_pages) {
+ /* reservation shrunk */
+ rv = -balloon_dec_reservation(
+ bln_stats.bln_current_pages -
+ bln_stats.bln_new_target);
+ } else if (bln_stats.bln_new_target >
+ bln_stats.bln_current_pages) {
+ /* reservation grew */
+ rv = balloon_inc_reservation(
+ bln_stats.bln_new_target -
+ bln_stats.bln_current_pages);
+ }
+ }
+ if (rv == 0) {
+ if (bln_wait == 0) {
+ bln_wait = 1;
+ } else {
+ bln_wait <<= bln_wait_shift;
+ }
+ } else {
+ bln_stats.bln_current_pages += rv;
+ bln_wait = bln_wait_sec;
+ }
+ if (bln_stats.bln_current_pages < bln_stats.bln_low)
+ bln_stats.bln_low = bln_stats.bln_current_pages;
+ else if (bln_stats.bln_current_pages > bln_stats.bln_high)
+ bln_stats.bln_high = bln_stats.bln_current_pages;
+ mutex_exit(&bln_mutex);
+ }
+}
+
+/*
+ * Called after balloon_init(), which is below. The xenbus thread is up
+ * and running, so we can register our watch and create the balloon thread.
+ */
+static void
+balloon_config_watch(int state)
+{
+ if (state != XENSTORE_UP)
+ return;
+
+ bln_watch.node = "memory/target";
+ bln_watch.callback = balloon_handler;
+ if (register_xenbus_watch(&bln_watch)) {
+ cmn_err(CE_WARN, "Failed to register balloon watcher; balloon "
+ "thread will be disabled");
+ return;
+ }
+
+ if (bln_thread == NULL)
+ bln_thread = thread_create(NULL, 0, balloon_worker_thread,
+ NULL, 0, &p0, TS_RUN, minclsyspri);
+}
+
+/*
+ * Basic initialization of the balloon thread. Set all of our variables,
+ * and register a callback for later when we can register a xenbus watch.
+ */
+void
+balloon_init(pgcnt_t nr_pages)
+{
+ domid_t domid = DOMID_SELF;
+
+ bln_stats.bln_current_pages = bln_stats.bln_low = nr_pages;
+ bln_stats.bln_new_target = bln_stats.bln_high = nr_pages;
+ bln_stats.bln_max_pages = nr_pages;
+ cv_init(&bln_cv, NULL, CV_DEFAULT, NULL);
+
+ /* init balloon zero logic */
+ balloon_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
+ mutex_init(&balloon_kva_mutex, NULL, MUTEX_DRIVER, NULL);
+
+ bln_stats.bln_hard_limit = (spgcnt_t)HYPERVISOR_memory_op(
+ XENMEM_maximum_reservation, &domid);
+
+ (void) xs_register_xenbus_callback(balloon_config_watch);
+}
+
+/*
+ * These functions are called from the network drivers when they gain a page
+ * or give one away. We simply update our count. Note that the counter
+ * tracks the number of pages we give away, so we need to subtract any
+ * amount passed to balloon_drv_added.
+ */
+void
+balloon_drv_added(int64_t delta)
+{
+ atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -delta);
+}
+
+void
+balloon_drv_subtracted(int64_t delta)
+{
+ atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, delta);
+}
+
+/*
+ * balloon_alloc_pages()
+ * Allocate page_cnt mfns. mfns storage provided by the caller. Returns
+ * the number of pages allocated, which could be less than page_cnt, or
+ * a negative number if an error occurred.
+ */
+long
+balloon_alloc_pages(uint_t page_cnt, mfn_t *mfns)
+{
+ xen_memory_reservation_t memres;
+ long rv;
+
+ bzero(&memres, sizeof (memres));
+ /*LINTED: constant in conditional context*/
+ set_xen_guest_handle(memres.extent_start, mfns);
+ memres.domid = DOMID_SELF;
+ memres.nr_extents = page_cnt;
+
+ rv = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
+ if (rv > 0)
+ atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -rv);
+ return (rv);
+}
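+
+/*
+ * Illustrative call sequence (cf. balloon_inc_reservation() above);
+ * "want" and "pfns" here are hypothetical caller-supplied values:
+ *
+ *     got = balloon_alloc_pages(want, mfn_frames);
+ *     for (i = 0; i < got; i++)
+ *             reassign_pfn(pfns[i], mfn_frames[i]);
+ */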
+
+/*
+ * balloon_free_pages()
+ * free page_cnt pages, using any combination of mfns, pfns, and kva as long
+ * as they refer to the same mapping. We need to zero the pages before
+ * giving them back to the hypervisor. kva space is not free'd up in case
+ * the caller wants to re-use it.
+ */
+long
+balloon_free_pages(uint_t page_cnt, mfn_t *mfns, caddr_t kva, pfn_t *pfns)
+{
+ xen_memory_reservation_t memdec;
+ mfn_t mfn;
+ pfn_t pfn;
+ uint_t i;
+ long e;
+
+
+#ifdef DEBUG
+ /* make sure kva is page aligned and maps to first pfn */
+ if (kva != NULL) {
+ ASSERT(((uintptr_t)kva & PAGEOFFSET) == 0);
+ if (pfns != NULL) {
+ ASSERT(hat_getpfnum(kas.a_hat, kva) == pfns[0]);
+ }
+ }
+#endif /* DEBUG */
+
+ /* if we have a kva, we can clean all pages with just one bzero */
+ if ((kva != NULL) && balloon_zero_memory) {
+ bzero(kva, (page_cnt * PAGESIZE));
+ }
+
+ /* if we were given a kva and/or a pfn */
+ if ((kva != NULL) || (pfns != NULL)) {
+
+ /*
+ * All the current callers only pass 1 page when using kva or
+ * pfns, and use mfns when passing multiple pages. If that
+ * assumption is changed, the following code will need some
+ * work. The following ASSERT() guarantees we stay under the
+ * pfnlist locking quota.
+ */
+ ASSERT(page_cnt < bln_contig_list_quota);
+
+ /* go through all the pages */
+ for (i = 0; i < page_cnt; i++) {
+
+ /* get the next pfn */
+ if (pfns == NULL) {
+ pfn = hat_getpfnum(kas.a_hat,
+ (kva + (PAGESIZE * i)));
+ } else {
+ pfn = pfns[i];
+ }
+
+ /*
+ * if we didn't already zero this page, do it now. we
+ * need to do this *before* we give back the MFN
+ */
+ if ((kva == NULL) && (balloon_zero_memory)) {
+ balloon_zero_page(pfn);
+ }
+
+ /*
+ * unmap the pfn. We don't free up the kva vmem space
+ * so the caller can re-use it. The page must be
+ * unmapped before it is given back to the hypervisor.
+ */
+ if (kva != NULL) {
+ hat_unload(kas.a_hat, (kva + (PAGESIZE * i)),
+ PAGESIZE, HAT_UNLOAD_UNMAP);
+ }
+
+ /* grab the mfn before the pfn is marked as invalid */
+ mfn = pfn_to_mfn(pfn);
+
+ /* mark the pfn as invalid */
+ reassign_pfn(pfn, MFN_INVALID);
+
+ /*
+ * if we weren't given an array of MFNs, we need to
+ * free them up one at a time. Otherwise, we'll wait
+ * until later and do it in one hypercall
+ */
+ if (mfns == NULL) {
+ bzero(&memdec, sizeof (memdec));
+ /*LINTED: constant in conditional context*/
+ set_xen_guest_handle(memdec.extent_start, &mfn);
+ memdec.domid = DOMID_SELF;
+ memdec.nr_extents = 1;
+ e = HYPERVISOR_memory_op(
+ XENMEM_decrease_reservation, &memdec);
+ if (e != 1) {
+ cmn_err(CE_PANIC, "balloon: unable to "
+ "give a page back to the "
+ "hypervisor.\n");
+ }
+ }
+ }
+
+ /*
+ * if all we were given was an array of MFN's, we only need to zero out
+ * each page. The MFNs will be free'd up below.
+ */
+ } else if (balloon_zero_memory) {
+ ASSERT(mfns != NULL);
+ for (i = 0; i < page_cnt; i++) {
+ pfn = xen_assign_pfn(mfns[i]);
+ balloon_zero_page(pfn);
+ xen_release_pfn(pfn);
+ }
+ }
+
+ /*
+ * if we were passed MFNs, we haven't free'd them up yet. We can
+ * do it with one call.
+ */
+ if (mfns != NULL) {
+ bzero(&memdec, sizeof (memdec));
+ /*LINTED: constant in conditional context*/
+ set_xen_guest_handle(memdec.extent_start, mfns);
+ memdec.domid = DOMID_SELF;
+ memdec.nr_extents = page_cnt;
+ e = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &memdec);
+ if (e != page_cnt) {
+ cmn_err(CE_PANIC, "balloon: unable to give pages back "
+ "to the hypervisor.\n");
+ }
+ }
+
+ atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, page_cnt);
+ return (page_cnt);
+}
+
+
+/*
+ * balloon_replace_pages()
+ * Try to replace nextents blocks of 2^order pages. addr_bits specifies
+ * how many bits of address the pages must be within (i.e. 16 would mean
+ * that the pages cannot have an address > 64k). The constraints are on
+ * what the hypervisor gives us -- we are free to give any pages in
+ * exchange. The array pp is the pages we are giving away. The caller
+ * provides storage space for mfns, which hold the new physical pages.
+ */
+long
+balloon_replace_pages(uint_t nextents, page_t **pp, uint_t addr_bits,
+ uint_t order, mfn_t *mfns)
+{
+ xen_memory_reservation_t memres;
+ long fallback_cnt;
+ long cnt;
+ uint_t i, j, page_cnt, extlen;
+ long e;
+ int locked;
+
+
+ /*
+ * we shouldn't be allocating constrained pages on a guest. It doesn't
+ * make any sense. They won't be constrained after a migration.
+ */
+ ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
+
+ extlen = 1 << order;
+ page_cnt = nextents * extlen;
+ /* Give back the current pages to the hypervisor */
+ for (i = 0; i < page_cnt; i++) {
+ cnt = balloon_free_pages(1, NULL, NULL, &pp[i]->p_pagenum);
+ if (cnt != 1) {
+ cmn_err(CE_PANIC, "balloon: unable to give a page back "
+ "to the hypervisor.\n");
+ }
+ }
+
+ /*
+ * try to allocate the new pages using addr_bits and order. If we can't
+ * get all of the pages, try to get the remaining pages with no
+ * constraints and, if that was successful, return the number of
+ * constrained pages we did allocate.
+ */
+ bzero(&memres, sizeof (memres));
+ /*LINTED: constant in conditional context*/
+ set_xen_guest_handle(memres.extent_start, mfns);
+ memres.domid = DOMID_SELF;
+ memres.nr_extents = nextents;
+ memres.address_bits = addr_bits;
+ memres.extent_order = order;
+ cnt = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
+ /* assign the new MFNs to the current PFNs */
+ locked = balloon_lock_contig_pfnlist(cnt * extlen);
+ for (i = 0; i < cnt; i++) {
+ for (j = 0; j < extlen; j++) {
+ reassign_pfn(pp[i * extlen + j]->p_pagenum,
+ mfns[i] + j);
+ }
+ }
+ if (locked)
+ unlock_contig_pfnlist();
+ if (cnt != nextents) {
+ if (cnt < 0) {
+ cnt = 0;
+ }
+
+ /*
+ * We couldn't get enough memory to satisfy our requirements.
+ * The above loop will assign the parts of the request that
+ * were successful (this part may be 0). We need to fill
+ * in the rest. The bzero below clears out extent_order and
+ * address_bits, so we'll take anything from the hypervisor
+ * to replace the pages we gave away.
+ */
+ fallback_cnt = page_cnt - cnt * extlen;
+ bzero(&memres, sizeof (memres));
+ /*LINTED: constant in conditional context*/
+ set_xen_guest_handle(memres.extent_start, mfns);
+ memres.domid = DOMID_SELF;
+ memres.nr_extents = fallback_cnt;
+ e = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
+ if (e != fallback_cnt) {
+ cmn_err(CE_PANIC, "balloon: unable to recover from "
+ "failed increase_reservation.\n");
+ }
+ locked = balloon_lock_contig_pfnlist(fallback_cnt);
+ for (i = 0; i < fallback_cnt; i++) {
+ uint_t offset = page_cnt - fallback_cnt;
+
+ /*
+ * We already used pp[0...(cnt * extlen)] before,
+ * so start at the next entry in the pp array.
+ */
+ reassign_pfn(pp[i + offset]->p_pagenum, mfns[i]);
+ }
+ if (locked)
+ unlock_contig_pfnlist();
+ }
+
+ /*
+ * balloon_free_pages increments our counter. Decrement it here.
+ */
+ atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -(long)page_cnt);
+
+ /*
+ * return the number of extents we were able to replace. If we got
+ * this far, we know all the pp's are valid.
+ */
+ return (cnt);
+}
+
+
+/*
+ * balloon_zero_page()
+ * zero out the page.
+ */
+static void
+balloon_zero_page(pfn_t pfn)
+{
+ /* balloon_init() should have been called first */
+ ASSERT(balloon_kva != NULL);
+
+ mutex_enter(&balloon_kva_mutex);
+
+ /* map the pfn into kva, zero the page, then unmap the pfn */
+ hat_devload(kas.a_hat, balloon_kva, PAGESIZE, pfn,
+ HAT_STORECACHING_OK | PROT_READ | PROT_WRITE | HAT_NOSYNC,
+ HAT_LOAD_LOCK);
+ bzero(balloon_kva, PAGESIZE);
+ hat_unload(kas.a_hat, balloon_kva, PAGESIZE, HAT_UNLOAD);
+
+ mutex_exit(&balloon_kva_mutex);
+}
+
+/*
+ * Called from the driver - return the requested stat.
+ */
+size_t
+balloon_values(int cmd)
+{
+ switch (cmd) {
+ case BLN_IOCTL_CURRENT:
+ return (ptokb(bln_stats.bln_current_pages));
+ case BLN_IOCTL_TARGET:
+ return (ptokb(bln_stats.bln_new_target));
+ case BLN_IOCTL_LOW:
+ return (ptokb(bln_stats.bln_low));
+ case BLN_IOCTL_HIGH:
+ return (ptokb(bln_stats.bln_high));
+ case BLN_IOCTL_LIMIT:
+ return (ptokb(bln_stats.bln_hard_limit));
+ default:
+ panic("Unexpected cmd %d in balloon_values()\n", cmd);
+ }
+ /*NOTREACHED*/
+}