/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright 2016 PALO, Richard.
*/
#include <sys/balloon_impl.h>
#include <sys/hypervisor.h>
#include <xen/sys/xenbus_impl.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#include <sys/disp.h>
#include <sys/callb.h>
#include <xen/public/memory.h>
#include <vm/hat.h>
#include <sys/promif.h>
#include <vm/seg_kmem.h>
#include <sys/memnode.h>
#include <sys/param.h>
#include <vm/vm_dep.h>
#include <sys/mman.h>
#include <sys/memlist.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/sdt.h>
/*
* This file implements a balloon thread, which controls a domain's memory
* reservation, or the amount of memory a domain is currently allocated.
* The domain's memory target is provided through the xenbus memory/target
* node, so we register a watch on it. We will then be signalled when the
* target changes. If it goes up, we map the new mfn's to our pfn's
* (allocating page_t's if necessary), and release them into the system.
* If the reservation goes down, we grab pages and release them back to
* the hypervisor, saving the page_t's for later use.
*/
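/*
 * For reference, the target is normally written by the dom0 toolstack
 * into the domain's xenstore memory/target node, in kilobytes (e.g. via
 * "xl mem-set"); that write fires the watch registered in
 * balloon_config_watch() and is parsed by balloon_handler() below.
 */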
/*
* Various structures needed by the balloon thread
*/
static bln_stats_t bln_stats;
static kthread_t *bln_thread;
static kmutex_t bln_mutex;
static kcondvar_t bln_cv;
static struct xenbus_watch bln_watch;
static mfn_t new_high_mfn;
/*
* For holding spare page_t structures - keep a singly-linked list.
* The list may hold both valid (pagenum < mfn_count) and invalid
* (pagenum >= mfn_count) page_t's. Valid page_t's should be inserted
* at the front, and invalid page_t's at the back. Removal should
* always be from the front. This is a singly-linked list using
* p_next, so p_prev is always NULL.
*/
static page_t *bln_spare_list_front, *bln_spare_list_back;
int balloon_zero_memory = 1;
size_t balloon_minkmem = (8 * 1024 * 1024);
/*
* reassign_pfn() calls update_contig_pfnlist(), which can cause a large
* slowdown when called many times in a row. If we're reassigning fewer than the
* quota defined here, we just accept the slowdown. If the count is greater
* than the quota, we tell the contig alloc code to stop its accounting until
* we're done. Setting the quota to less than 2 is not supported.
*
* Note that we define our own wrapper around the external
* clear_and_lock_contig_pfnlist(), but we just use the version of
* unlock_contig_pfnlist() in vm_machdep.c.
*/
uint_t bln_contig_list_quota = 50;
extern void clear_and_lock_contig_pfnlist(void);
extern void unlock_contig_pfnlist(void);
/*
* Lock the pfnlist if necessary (see above), and return whether we locked it.
*/
static int
balloon_lock_contig_pfnlist(int count)
{
if (count > bln_contig_list_quota) {
clear_and_lock_contig_pfnlist();
return (1);
} else {
return (0);
}
}
/*
* The page represented by pp is being given back to the hypervisor.
* Add the page_t structure to our spare list.
*/
static void
balloon_page_add(page_t *pp)
{
/*
* We need to keep the page exclusively locked
* to prevent swrand from grabbing it.
*/
ASSERT(PAGE_EXCL(pp));
ASSERT(MUTEX_HELD(&bln_mutex));
pp->p_prev = NULL;
if (bln_spare_list_front == NULL) {
bln_spare_list_front = bln_spare_list_back = pp;
pp->p_next = NULL;
} else if (pp->p_pagenum >= mfn_count) {
/*
* The pfn is invalid, so add at the end of the list. Since these
* adds should *only* be done by balloon_init_new_pages(), which
* adds pages in increasing pfn order, the following ASSERT should
* never trigger.
*/
ASSERT(pp->p_pagenum > bln_spare_list_back->p_pagenum);
bln_spare_list_back->p_next = pp;
pp->p_next = NULL;
bln_spare_list_back = pp;
} else {
/* Add at beginning of list */
pp->p_next = bln_spare_list_front;
bln_spare_list_front = pp;
}
}
/*
* Return a valid page_t structure from our spare list, or NULL if none is available.
*/
static page_t *
balloon_page_sub(void)
{
page_t *pp;
ASSERT(MUTEX_HELD(&bln_mutex));
if (bln_spare_list_front == NULL) {
return (NULL);
}
pp = bln_spare_list_front;
ASSERT(PAGE_EXCL(pp));
ASSERT(pp->p_pagenum <= mfn_count);
if (pp->p_pagenum == mfn_count) {
return (NULL);
}
bln_spare_list_front = pp->p_next;
if (bln_spare_list_front == NULL)
bln_spare_list_back = NULL;
pp->p_next = NULL;
return (pp);
}
/*
* NOTE: We currently do not support growing beyond the boot memory size,
* so the following function will not be called. It is left in here with
* the hope that someday this restriction can be lifted, and this code can
* be used.
*/
/*
* This structure is placed at the start of every block of new pages
*/
typedef struct {
struct memseg memseg;
struct memlist memlist;
page_t pages[1];
} mem_structs_t;
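/*
 * Note: pages[1] is the traditional pre-C99 idiom for a trailing flexible
 * array; the page_t array actually extends past the declared struct into
 * the rest of the meta pages (num_pages entries, as computed in
 * balloon_init_new_pages() below).
 */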
/*
* To make the math below slightly less confusing, we calculate the first
* two parts here. page_t's are handled separately, so they are not included.
*/
#define MEM_STRUCT_SIZE (sizeof (struct memseg) + sizeof (struct memlist))
/*
* We want to add memory, but have no spare page_t structures. Use some of
* our new memory for the page_t structures.
*
* Somewhat similar to kphysm_add_memory_dynamic(), but simpler.
*/
static int
balloon_init_new_pages(mfn_t framelist[], pgcnt_t count)
{
pgcnt_t metapgs, totalpgs, num_pages;
paddr_t metasz;
pfn_t meta_start;
page_t *page_array;
caddr_t va;
int i, rv, locked;
mem_structs_t *mem;
struct memseg *segp;
/* Calculate the number of pages we're going to add */
totalpgs = bln_stats.bln_new_target - bln_stats.bln_current_pages;
/*
* The following calculates the number of "meta" pages -- the pages
* that will be required to hold page_t structures for all new pages.
* Proof of this calculation is left up to the reader.
*/
metapgs = totalpgs - (((uint64_t)(totalpgs) << PAGESHIFT) /
(PAGESIZE + sizeof (page_t)));
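/*
 * A sketch of the derivation: if D of the totalpgs new pages are to
 * remain usable, the D page_t structures describing them must fit in
 * the remaining metapgs = totalpgs - D pages, i.e.
 * D * sizeof (page_t) <= (totalpgs - D) * PAGESIZE, which rearranges to
 * D * (PAGESIZE + sizeof (page_t)) <= totalpgs * PAGESIZE. Taking the
 * largest such D gives D = (totalpgs << PAGESHIFT) /
 * (PAGESIZE + sizeof (page_t)), and metapgs = totalpgs - D, which is the
 * expression above. The check below adds one more meta page if the
 * memseg and memlist structures don't also fit.
 */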
/*
* Given the number of page_t structures we need, is there also
* room in our meta pages for a memseg and memlist struct?
* If not, we'll need one more meta page.
*/
if ((metapgs << PAGESHIFT) < (totalpgs * sizeof (page_t) +
MEM_STRUCT_SIZE))
metapgs++;
/*
* metapgs is calculated from totalpgs, which may be much larger than
* count. If we don't have enough pages, all of the pages in this
* batch will be made meta pages, and a future trip through
* balloon_inc_reservation() will add the rest of the meta pages.
*/
if (metapgs > count)
metapgs = count;
/*
* Figure out the number of page_t structures that can fit in metapgs
*
* This will cause us to initialize more page_t structures than we
* need - these may be used in future memory increases.
*/
metasz = pfn_to_pa(metapgs);
num_pages = (metasz - MEM_STRUCT_SIZE) / sizeof (page_t);
DTRACE_PROBE3(balloon__alloc__stats, pgcnt_t, totalpgs, pgcnt_t,
num_pages, pgcnt_t, metapgs);
/*
* We only increment mfn_count by count, not num_pages, to keep the
* space of all valid pfns contiguous. This means we create page_t
* structures with invalid pagenums -- we deal with this situation
* in balloon_page_sub.
*/
mfn_count += count;
/*
* Get a VA for the pages that will hold page_t and other structures.
* The memseg and memlist structures will go at the beginning, with
* the page_t structures following.
*/
va = (caddr_t)vmem_alloc(heap_arena, metasz, VM_SLEEP);
/* LINTED: improper alignment */
mem = (mem_structs_t *)va;
page_array = mem->pages;
meta_start = bln_stats.bln_max_pages;
/*
* Set the mfn to pfn mapping for the meta pages.
*/
locked = balloon_lock_contig_pfnlist(metapgs);
for (i = 0; i < metapgs; i++) {
reassign_pfn(bln_stats.bln_max_pages + i, framelist[i]);
}
if (locked)
unlock_contig_pfnlist();
/*
* For our meta pages, map them in and zero the page.
* This will be the first time touching the new pages.
*/
hat_devload(kas.a_hat, va, metasz, bln_stats.bln_max_pages,
PROT_READ | PROT_WRITE,
HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
bzero(va, metasz);
/*
* Initialize the page array for the new pages.
*/
for (i = 0; i < metapgs; i++) {
page_array[i].p_pagenum = bln_stats.bln_max_pages++;
page_array[i].p_offset = (u_offset_t)-1;
page_iolock_init(&page_array[i]);
rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
ASSERT(rv == 1);
}
/*
* For the rest of the pages, initialize the page_t struct and
* add them to the balloon spare list; the caller (balloon_inc_reservation())
* pulls them off the list and releases them into the system.
*/
for (i = metapgs; i < num_pages; i++) {
page_array[i].p_pagenum = bln_stats.bln_max_pages++;
page_array[i].p_offset = (u_offset_t)-1;
page_iolock_init(&page_array[i]);
rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
ASSERT(rv == 1);
balloon_page_add(&page_array[i]);
}
/*
* Remember where I said that we don't call this function? The missing
* code right here is why. We need to set up kpm mappings for any new
* pages coming in. However, if someone starts up a domain with small
* memory, then greatly increases it, we could get in some horrible
* deadlock situations as we steal page tables for kpm use, and
* userland applications take them right back before we can use them
* to set up our new memory. Once a way around that is found, and a
* few other changes are made, we'll be able to enable this code.
*/
/*
* Update kernel structures, part 1: memsegs list
*/
mem->memseg.pages_base = meta_start;
mem->memseg.pages_end = bln_stats.bln_max_pages - 1;
mem->memseg.pages = &page_array[0];
mem->memseg.epages = &page_array[num_pages - 1];
mem->memseg.next = NULL;
memsegs_lock(1);
for (segp = memsegs; segp->next != NULL; segp = segp->next)
;
segp->next = &mem->memseg;
memsegs_unlock(1);
/*
* Update kernel structures, part 2: mem_node array
*/
mem_node_add_slice(meta_start, bln_stats.bln_max_pages);
/*
* Update kernel structures, part 3: phys_install array
* (*sigh* how many of these things do we need?)
*/
memlist_write_lock();
memlist_add(pfn_to_pa(meta_start), num_pages, &mem->memlist,
&phys_install);
memlist_write_unlock();
build_pfn_hash();
return (metapgs);
}
/* How many ulong_t's can we fit on a page? */
#define FRAME_ARRAY_SIZE (PAGESIZE / sizeof (ulong_t))
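/* For example, 4096 / 8 == 512 entries with 4 KB pages on a 64-bit kernel. */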
/*
* These are too large to declare on the stack, so we make them static instead
*/
static ulong_t mfn_frames[FRAME_ARRAY_SIZE];
static pfn_t pfn_frames[FRAME_ARRAY_SIZE];
/*
* This function is called when our reservation is increasing. Make a
* hypervisor call to get our new pages, then integrate them into the system.
*/
static spgcnt_t
balloon_inc_reservation(ulong_t credit)
{
int i, cnt, locked;
int meta_pg_start, meta_pg_end;
long rv;
page_t *pp;
page_t *new_list_front, *new_list_back;
/* Make sure we're single-threaded. */
ASSERT(MUTEX_HELD(&bln_mutex));
rv = 0;
new_list_front = new_list_back = NULL;
meta_pg_start = meta_pg_end = 0;
bzero(mfn_frames, PAGESIZE);
if (credit > FRAME_ARRAY_SIZE)
credit = FRAME_ARRAY_SIZE;
xen_block_migrate();
rv = balloon_alloc_pages(credit, mfn_frames);
if (rv < 0) {
xen_allow_migrate();
return (0);
}
for (i = 0; i < rv; i++) {
if (mfn_frames[i] > new_high_mfn)
new_high_mfn = mfn_frames[i];
pp = balloon_page_sub();
if (pp == NULL) {
/*
* We pass the index into the current mfn array,
* then move the counter past the mfns we used
*/
meta_pg_start = i;
cnt = balloon_init_new_pages(&mfn_frames[i], rv - i);
i += cnt;
meta_pg_end = i;
if (i < rv) {
pp = balloon_page_sub();
} else {
ASSERT(i == rv);
}
}
if (pp == NULL) {
break;
}
if (new_list_back == NULL) {
new_list_front = new_list_back = pp;
} else {
new_list_back->p_next = pp;
new_list_back = pp;
}
pp->p_next = NULL;
}
cnt = i;
locked = balloon_lock_contig_pfnlist(cnt);
for (i = 0, pp = new_list_front; i < meta_pg_start;
i++, pp = pp->p_next) {
reassign_pfn(pp->p_pagenum, mfn_frames[i]);
}
for (i = meta_pg_end; i < cnt; i++, pp = pp->p_next) {
reassign_pfn(pp->p_pagenum, mfn_frames[i]);
}
if (locked)
unlock_contig_pfnlist();
/*
* Make sure we don't allow pages without pfn->mfn mappings
* into the system.
*/
ASSERT(pp == NULL);
while (new_list_front != NULL) {
pp = new_list_front;
new_list_front = pp->p_next;
page_free(pp, 1);
}
/*
* Variable review: at this point, rv contains the number of pages
* the hypervisor gave us. cnt contains the number of pages for which
* we had page_t structures. i contains the number of pages
* where we set up pfn <-> mfn mappings. If this ASSERT trips, that
* means we somehow lost page_t's from our local list.
*/
ASSERT(cnt == i);
if (cnt < rv) {
/*
* We couldn't get page structures.
*
* This shouldn't happen, but causes no real harm if it does.
* On debug kernels, we'll flag it. On all kernels, we'll
* give back the pages we couldn't assign.
*
* Since these pages are new to the system and haven't been
* used, we don't bother zeroing them.
*/
#ifdef DEBUG
cmn_err(CE_WARN, "Could only assign %d of %ld pages", cnt, rv);
#endif /* DEBUG */
(void) balloon_free_pages(rv - cnt, &mfn_frames[i], NULL, NULL);
rv = cnt;
}
xen_allow_migrate();
page_unresv(rv - (meta_pg_end - meta_pg_start));
return (rv);
}
/*
* This function is called when we want to decrease the memory reservation
* of our domain. Allocate the memory and make a hypervisor call to give
* it back.
*/
static spgcnt_t
balloon_dec_reservation(ulong_t debit)
{
int i, locked;
long rv;
ulong_t request;
page_t *pp;
bzero(mfn_frames, sizeof (mfn_frames));
bzero(pfn_frames, sizeof (pfn_frames));
if (debit > FRAME_ARRAY_SIZE) {
debit = FRAME_ARRAY_SIZE;
}
request = debit;
/*
* Don't bother if there isn't a safe amount of kmem left.
*/
if (kmem_avail() < balloon_minkmem) {
kmem_reap();
if (kmem_avail() < balloon_minkmem)
return (0);
}
if (page_resv(request, KM_NOSLEEP) == 0) {
return (0);
}
xen_block_migrate();
for (i = 0; i < debit; i++) {
pp = page_get_high_mfn(new_high_mfn);
new_high_mfn = 0;
if (pp == NULL) {
/*
* Call kmem_reap(), then try once more,
* but only if there is a safe amount of
* kmem left.
*/
kmem_reap();
if (kmem_avail() < balloon_minkmem ||
(pp = page_get_high_mfn(0)) == NULL) {
debit = i;
break;
}
}
ASSERT(PAGE_EXCL(pp));
ASSERT(!hat_page_is_mapped(pp));
balloon_page_add(pp);
pfn_frames[i] = pp->p_pagenum;
mfn_frames[i] = pfn_to_mfn(pp->p_pagenum);
}
if (debit == 0) {
xen_allow_migrate();
page_unresv(request);
return (0);
}
/*
* We zero all the pages before we start reassigning them in order to
* minimize the time spent holding the lock on the contig pfn list.
*/
if (balloon_zero_memory) {
for (i = 0; i < debit; i++) {
pfnzero(pfn_frames[i], 0, PAGESIZE);
}
}
/*
* Remove all mappings for the pfns from the system
*/
locked = balloon_lock_contig_pfnlist(debit);
for (i = 0; i < debit; i++) {
reassign_pfn(pfn_frames[i], MFN_INVALID);
}
if (locked)
unlock_contig_pfnlist();
rv = balloon_free_pages(debit, mfn_frames, NULL, NULL);
if (rv < 0) {
cmn_err(CE_WARN, "Attempt to return pages to the hypervisor "
"failed - up to %lu pages lost (error = %ld)", debit, rv);
rv = 0;
} else if (rv != debit) {
panic("Unexpected return value (%ld) from decrease reservation "
"hypervisor call", rv);
}
xen_allow_migrate();
if (debit != request)
page_unresv(request - debit);
return (rv);
}
/*
* This function is the callback which is called when the memory/target
* node is changed. When it is fired, we will read a new reservation
* target for our domain and signal the worker thread to make the change.
*
* If the reservation is larger than we can handle, we issue a warning. dom0
* does this automatically every boot, so we skip the first warning on dom0.
*/
/*ARGSUSED*/
static void
balloon_handler(struct xenbus_watch *watch, const char **vec, uint_t len)
{
ulong_t new_target_kb;
pgcnt_t new_target_pages;
int rv;
static uchar_t warning_cnt = 0;
rv = xenbus_scanf(0, "memory", "target", "%lu", &new_target_kb);
if (rv != 0) {
return;
}
/* new_target_kb is in kB - convert it to pages */
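/* For example, with 4 KB pages a target of 524288 kB becomes 131072 pages. */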
new_target_pages = kbtop(new_target_kb);
DTRACE_PROBE1(balloon__new__target, pgcnt_t, new_target_pages);
/*
* Unfortunately, dom0 may give us a target that is larger than
* our max limit. Re-check the limit, and, if the new target is
* too large, adjust it downwards.
*/
mutex_enter(&bln_mutex);
if (new_target_pages > bln_stats.bln_max_pages) {
DTRACE_PROBE2(balloon__target__too__large, pgcnt_t,
new_target_pages, pgcnt_t, bln_stats.bln_max_pages);
if (!DOMAIN_IS_INITDOMAIN(xen_info) || warning_cnt != 0) {
cmn_err(CE_WARN, "New balloon target (0x%lx pages) is "
"larger than original memory size (0x%lx pages). "
"Ballooning beyond original memory size is not "
"allowed.",
new_target_pages, bln_stats.bln_max_pages);
}
warning_cnt = 1;
bln_stats.bln_new_target = bln_stats.bln_max_pages;
} else {
bln_stats.bln_new_target = new_target_pages;
}
mutex_exit(&bln_mutex);
cv_signal(&bln_cv);
}
/*
* bln_wait_sec can be used to throttle the hv calls, but by default it's
* turned off. If a balloon attempt fails, the wait time is forced on, and
* then is exponentially increased as further attempts fail.
*/
uint_t bln_wait_sec = 0;
uint_t bln_wait_shift = 1;
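/*
 * For example, with the defaults above a failed attempt forces the wait
 * to one second, and each further failure doubles it (1, 2, 4, 8, ...)
 * until an attempt makes progress and bln_wait is reset to bln_wait_sec.
 */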
/*
* This is the main balloon thread. Wait on the cv. When woken, if our
* reservation has changed, call the appropriate function to adjust the
* reservation.
*/
static void
balloon_worker_thread(void)
{
uint_t bln_wait;
callb_cpr_t cprinfo;
spgcnt_t rv;
bln_wait = bln_wait_sec;
CALLB_CPR_INIT(&cprinfo, &bln_mutex, callb_generic_cpr, "balloon");
for (;;) {
rv = 0;
mutex_enter(&bln_mutex);
CALLB_CPR_SAFE_BEGIN(&cprinfo);
if (bln_stats.bln_new_target != bln_stats.bln_current_pages) {
/*
* We weren't able to fully complete the request
* last time through, so try again.
*/
(void) cv_reltimedwait(&bln_cv, &bln_mutex,
(bln_wait * hz), TR_CLOCK_TICK);
} else {
cv_wait(&bln_cv, &bln_mutex);
}
CALLB_CPR_SAFE_END(&cprinfo, &bln_mutex);
if (bln_stats.bln_new_target != bln_stats.bln_current_pages) {
if (bln_stats.bln_new_target <
bln_stats.bln_current_pages) {
/* reservation shrunk */
rv = -balloon_dec_reservation(
bln_stats.bln_current_pages -
bln_stats.bln_new_target);
} else if (bln_stats.bln_new_target >
bln_stats.bln_current_pages) {
/* reservation grew */
rv = balloon_inc_reservation(
bln_stats.bln_new_target -
bln_stats.bln_current_pages);
}
}
if (rv == 0) {
if (bln_wait == 0) {
bln_wait = 1;
} else {
bln_wait <<= bln_wait_shift;
}
} else {
bln_stats.bln_current_pages += rv;
bln_wait = bln_wait_sec;
}
if (bln_stats.bln_current_pages < bln_stats.bln_low)
bln_stats.bln_low = bln_stats.bln_current_pages;
else if (bln_stats.bln_current_pages > bln_stats.bln_high)
bln_stats.bln_high = bln_stats.bln_current_pages;
mutex_exit(&bln_mutex);
}
}
/*
* Called after balloon_init(), which is below. The xenbus thread is up
* and running, so we can register our watch and create the balloon thread.
*/
static void
balloon_config_watch(int state)
{
if (state != XENSTORE_UP)
return;
bln_watch.node = "memory/target";
bln_watch.callback = balloon_handler;
if (register_xenbus_watch(&bln_watch)) {
cmn_err(CE_WARN, "Failed to register balloon watcher; balloon "
"thread will be disabled");
return;
}
if (bln_thread == NULL)
bln_thread = thread_create(NULL, 0, balloon_worker_thread,
NULL, 0, &p0, TS_RUN, minclsyspri);
}
/*
* Basic initialization of the balloon thread. Set all of our variables,
* and register a callback for later when we can register a xenbus watch.
*/
void
balloon_init(pgcnt_t nr_pages)
{
domid_t domid = DOMID_SELF;
bln_stats.bln_current_pages = bln_stats.bln_low = nr_pages;
bln_stats.bln_new_target = bln_stats.bln_high = nr_pages;
bln_stats.bln_max_pages = nr_pages;
cv_init(&bln_cv, NULL, CV_DEFAULT, NULL);
bln_stats.bln_hard_limit = (spgcnt_t)HYPERVISOR_memory_op(
XENMEM_maximum_reservation, &domid);
(void) xs_register_xenbus_callback(balloon_config_watch);
}
/*
* These functions are called from the network drivers when they gain a page
* or give one away. We simply update our count. Note that the counter
* tracks the number of pages we give away, so we need to subtract any
* amount passed to balloon_drv_added.
*/
void
balloon_drv_added(int64_t delta)
{
atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -delta);
}
void
balloon_drv_subtracted(int64_t delta)
{
atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, delta);
}
/*
* balloon_alloc_pages()
* Allocate page_cnt mfns. mfns storage provided by the caller. Returns
* the number of pages allocated, which could be less than page_cnt, or
* a negative number if an error occurred.
*/
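/*
 * Typical call pattern (see balloon_inc_reservation() above): the caller
 * passes an mfn array with room for page_cnt entries, checks for a
 * negative return, and then maps each returned mfn to a pfn with
 * reassign_pfn().
 */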
long
balloon_alloc_pages(uint_t page_cnt, mfn_t *mfns)
{
xen_memory_reservation_t memres;
long rv;
bzero(&memres, sizeof (memres));
/*LINTED: constant in conditional context*/
set_xen_guest_handle(memres.extent_start, mfns);
memres.domid = DOMID_SELF;
memres.nr_extents = page_cnt;
rv = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
if (rv > 0)
atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -rv);
return (rv);
}
/*
* balloon_free_pages()
* free page_cnt pages, using any combination of mfns, pfns, and kva as long
* as they refer to the same mapping. If an array of mfns is passed in, we
* assume they were already cleared. Otherwise, we need to zero the pages
* before giving them back to the hypervisor. kva space is not free'd up in
* case the caller wants to re-use it.
*/
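/*
 * Example invocations from this file: balloon_dec_reservation() frees a
 * batch by mfn array with balloon_free_pages(debit, mfn_frames, NULL,
 * NULL), while balloon_replace_pages() frees one page at a time by pfn
 * with balloon_free_pages(1, NULL, NULL, &pp[i]->p_pagenum).
 */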
long
balloon_free_pages(uint_t page_cnt, mfn_t *mfns, caddr_t kva, pfn_t *pfns)
{
xen_memory_reservation_t memdec;
mfn_t mfn;
pfn_t pfn;
uint_t i;
long e;
#if DEBUG
/* make sure kva is page aligned and maps to first pfn */
if (kva != NULL) {
ASSERT(((uintptr_t)kva & PAGEOFFSET) == 0);
if (pfns != NULL) {
ASSERT(hat_getpfnum(kas.a_hat, kva) == pfns[0]);
}
}
#endif
/* if we have a kva, we can clean all pages with just one bzero */
if ((kva != NULL) && balloon_zero_memory) {
bzero(kva, (page_cnt * PAGESIZE));
}
/* if we were given a kva and/or a pfn */
if ((kva != NULL) || (pfns != NULL)) {
/*
* All the current callers only pass 1 page when using kva or
* pfns, and use mfns when passing multiple pages. If that
* assumption is changed, the following code will need some
* work. The following ASSERT() guarantees we're respecting
* the contig pfnlist locking quota (bln_contig_list_quota).
*/
ASSERT(page_cnt < bln_contig_list_quota);
/* go through all the pages */
for (i = 0; i < page_cnt; i++) {
/* get the next pfn */
if (pfns == NULL) {
pfn = hat_getpfnum(kas.a_hat,
(kva + (PAGESIZE * i)));
} else {
pfn = pfns[i];
}
/*
* if we didn't already zero this page, do it now. we
* need to do this *before* we give back the MFN
*/
if ((kva == NULL) && (balloon_zero_memory)) {
pfnzero(pfn, 0, PAGESIZE);
}
/*
* unmap the pfn. We don't free up the kva vmem space
* so the caller can re-use it. The page must be
* unmapped before it is given back to the hypervisor.
*/
if (kva != NULL) {
hat_unload(kas.a_hat, (kva + (PAGESIZE * i)),
PAGESIZE, HAT_UNLOAD_UNMAP);
}
/* grab the mfn before the pfn is marked as invalid */
mfn = pfn_to_mfn(pfn);
/* mark the pfn as invalid */
reassign_pfn(pfn, MFN_INVALID);
/*
* if we weren't given an array of MFNs, we need to
* free them up one at a time. Otherwise, we'll wait
* until later and do it in one hypercall
*/
if (mfns == NULL) {
bzero(&memdec, sizeof (memdec));
/*LINTED: constant in conditional context*/
set_xen_guest_handle(memdec.extent_start, &mfn);
memdec.domid = DOMID_SELF;
memdec.nr_extents = 1;
e = HYPERVISOR_memory_op(
XENMEM_decrease_reservation, &memdec);
if (e != 1) {
cmn_err(CE_PANIC, "balloon: unable to "
"give a page back to the "
"hypervisor.\n");
}
}
}
}
/*
* if we were passed in MFNs, we haven't free'd them up yet. We can
* do it with one call.
*/
if (mfns != NULL) {
bzero(&memdec, sizeof (memdec));
/*LINTED: constant in conditional context*/
set_xen_guest_handle(memdec.extent_start, mfns);
memdec.domid = DOMID_SELF;
memdec.nr_extents = page_cnt;
e = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &memdec);
if (e != page_cnt) {
cmn_err(CE_PANIC, "balloon: unable to give pages back "
"to the hypervisor.\n");
}
}
atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, page_cnt);
return (page_cnt);
}
/*
* balloon_replace_pages()
* Try to replace nextents blocks of 2^order pages. addr_bits specifies
* how many bits of address the pages must be within (i.e. 16 would mean
* that the pages cannot have an address > 64k). The constraints are on
* what the hypervisor gives us -- we are free to give any pages in
* exchange. The array pp is the pages we are giving away. The caller
* provides storage space for mfns, which hold the new physical pages.
*/
long
balloon_replace_pages(uint_t nextents, page_t **pp, uint_t addr_bits,
uint_t order, mfn_t *mfns)
{
xen_memory_reservation_t memres;
long fallback_cnt;
long cnt;
uint_t i, j, page_cnt, extlen;
long e;
int locked;
/*
* we shouldn't be allocating constrained pages on a guest. It doesn't
* make any sense. They won't be constrained after a migration.
*/
ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
extlen = 1 << order;
page_cnt = nextents * extlen;
/* Give back the current pages to the hypervisor */
for (i = 0; i < page_cnt; i++) {
cnt = balloon_free_pages(1, NULL, NULL, &pp[i]->p_pagenum);
if (cnt != 1) {
cmn_err(CE_PANIC, "balloon: unable to give a page back "
"to the hypervisor.\n");
}
}
/*
* try to allocate the new pages using addr_bits and order. If we can't
* get all of the pages, try to get the remaining pages with no
* constraints and, if that was successful, return the number of
* constrained pages we did allocate.
*/
bzero(&memres, sizeof (memres));
/*LINTED: constant in conditional context*/
set_xen_guest_handle(memres.extent_start, mfns);
memres.domid = DOMID_SELF;
memres.nr_extents = nextents;
memres.mem_flags = XENMEMF_address_bits(addr_bits);
memres.extent_order = order;
cnt = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
/* assign the new MFNs to the current PFNs */
locked = balloon_lock_contig_pfnlist(cnt * extlen);
for (i = 0; i < cnt; i++) {
for (j = 0; j < extlen; j++) {
reassign_pfn(pp[i * extlen + j]->p_pagenum,
mfns[i] + j);
}
}
if (locked)
unlock_contig_pfnlist();
if (cnt != nextents) {
if (cnt < 0) {
cnt = 0;
}
/*
* We couldn't get enough memory to satisfy our requirements.
* The above loop will assign the parts of the request that
* were successful (this part may be 0). We need to fill
* in the rest. The bzero below clears out extent_order and
* address_bits, so we'll take anything from the hypervisor
* to replace the pages we gave away.
*/
fallback_cnt = page_cnt - cnt * extlen;
bzero(&memres, sizeof (memres));
/*LINTED: constant in conditional context*/
set_xen_guest_handle(memres.extent_start, mfns);
memres.domid = DOMID_SELF;
memres.nr_extents = fallback_cnt;
e = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
if (e != fallback_cnt) {
cmn_err(CE_PANIC, "balloon: unable to recover from "
"failed increase_reservation.\n");
}
locked = balloon_lock_contig_pfnlist(fallback_cnt);
for (i = 0; i < fallback_cnt; i++) {
uint_t offset = page_cnt - fallback_cnt;
/*
* We already used pp[0...(cnt * extlen)] before,
* so start at the next entry in the pp array.
*/
reassign_pfn(pp[i + offset]->p_pagenum, mfns[i]);
}
if (locked)
unlock_contig_pfnlist();
}
/*
* balloon_free_pages increments our counter. Decrement it here.
*/
atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -(long)page_cnt);
/*
* return the number of extents we were able to replace. If we got
* this far, we know all the pp's are valid.
*/
return (cnt);
}
/*
* Called from the driver - return the requested stat.
*/
size_t
balloon_values(int cmd)
{
switch (cmd) {
case BLN_IOCTL_CURRENT:
return (ptokb(bln_stats.bln_current_pages));
case BLN_IOCTL_TARGET:
return (ptokb(bln_stats.bln_new_target));
case BLN_IOCTL_LOW:
return (ptokb(bln_stats.bln_low));
case BLN_IOCTL_HIGH:
return (ptokb(bln_stats.bln_high));
case BLN_IOCTL_LIMIT:
return (ptokb(bln_stats.bln_hard_limit));
default:
panic("Unexpected cmd %d in balloon_values()\n", cmd);
}
/*NOTREACHED*/
}