Diffstat (limited to 'src/pkg/runtime/malloc.goc')
 src/pkg/runtime/malloc.goc | 604
 1 file changed, 371 insertions(+), 233 deletions(-)
diff --git a/src/pkg/runtime/malloc.goc b/src/pkg/runtime/malloc.goc
index c3ede4abd..7b7e350d8 100644
--- a/src/pkg/runtime/malloc.goc
+++ b/src/pkg/runtime/malloc.goc
@@ -19,6 +19,8 @@ package runtime
// Mark mheap as 'no pointers', it does not contain interesting pointers but occupies ~45K.
#pragma dataflag NOPTR
MHeap runtime·mheap;
+#pragma dataflag NOPTR
+MStats mstats;
int32 runtime·checking;
@@ -26,6 +28,10 @@ extern MStats mstats; // defined in zruntime_def_$GOOS_$GOARCH.go
extern volatile intgo runtime·MemProfileRate;
+static MSpan* largealloc(uint32, uintptr*);
+static void profilealloc(void *v, uintptr size);
+static void settype(MSpan *s, void *v, uintptr typ);
+
// Allocate an object of at least size bytes.
// Small objects are allocated from the per-thread cache's free lists.
// Large objects (> 32 kB) are allocated straight from the heap.
@@ -34,12 +40,12 @@ void*
runtime·mallocgc(uintptr size, uintptr typ, uint32 flag)
{
int32 sizeclass;
+ uintptr tinysize, size1;
intgo rate;
MCache *c;
- MCacheList *l;
- uintptr npages;
MSpan *s;
- MLink *v;
+ MLink *v, *next;
+ byte *tiny;
if(size == 0) {
// All 0-length allocations use this pointer.
@@ -49,8 +55,8 @@ runtime·mallocgc(uintptr size, uintptr typ, uint32 flag)
}
if(m->mallocing)
runtime·throw("malloc/free - deadlock");
- // Disable preemption during settype_flush.
- // We can not use m->mallocing for this, because settype_flush calls mallocgc.
+ // Disable preemption during settype.
+ // We can not use m->mallocing for this, because settype calls mallocgc.
m->locks++;
m->mallocing = 1;
@@ -58,7 +64,82 @@ runtime·mallocgc(uintptr size, uintptr typ, uint32 flag)
size += sizeof(uintptr);
c = m->mcache;
- if(size <= MaxSmallSize) {
+ if(!runtime·debug.efence && size <= MaxSmallSize) {
+ if((flag&(FlagNoScan|FlagNoGC)) == FlagNoScan && size < TinySize) {
+ // Tiny allocator.
+ //
+ // Tiny allocator combines several tiny allocation requests
+ // into a single memory block. The resulting memory block
+ // is freed when all subobjects are unreachable. The subobjects
+ // must be FlagNoScan (have no pointers); this ensures that
+ // the amount of potentially wasted memory is bounded.
+ //
+ // Size of the memory block used for combining (TinySize) is tunable.
+ // The current setting is 16 bytes, which gives at most 2x worst-case memory
+ // wastage (when all but one of the subobjects are unreachable).
+ // 8 bytes would result in no wastage at all, but provides fewer
+ // opportunities for combining.
+ // 32 bytes provides more opportunities for combining,
+ // but can lead to 4x worst case wastage.
+ // The best-case saving is 8x regardless of block size.
+ //
+ // Objects obtained from the tiny allocator must not be freed explicitly.
+ // So when an object is going to be freed explicitly, we ensure that
+ // its size >= TinySize.
+ //
+ // SetFinalizer has a special case for objects potentially coming
+ // from the tiny allocator; in that case it allows setting finalizers
+ // for an inner byte of a memory block.
+ //
+ // The main targets of the tiny allocator are small strings and
+ // standalone escaping variables. On a json benchmark
+ // the allocator reduces the number of allocations by ~12% and
+ // reduces the heap size by ~20%.
+
+ tinysize = c->tinysize;
+ if(size <= tinysize) {
+ tiny = c->tiny;
+ // Align tiny pointer for required (conservative) alignment.
+ if((size&7) == 0)
+ tiny = (byte*)ROUND((uintptr)tiny, 8);
+ else if((size&3) == 0)
+ tiny = (byte*)ROUND((uintptr)tiny, 4);
+ else if((size&1) == 0)
+ tiny = (byte*)ROUND((uintptr)tiny, 2);
+ size1 = size + (tiny - c->tiny);
+ if(size1 <= tinysize) {
+ // The object fits into existing tiny block.
+ v = (MLink*)tiny;
+ c->tiny += size1;
+ c->tinysize -= size1;
+ m->mallocing = 0;
+ m->locks--;
+ if(m->locks == 0 && g->preempt) // restore the preemption request in case we've cleared it in newstack
+ g->stackguard0 = StackPreempt;
+ return v;
+ }
+ }
+ // Allocate a new TinySize block.
+ s = c->alloc[TinySizeClass];
+ if(s->freelist == nil)
+ s = runtime·MCache_Refill(c, TinySizeClass);
+ v = s->freelist;
+ next = v->next;
+ s->freelist = next;
+ s->ref++;
+ if(next != nil) // prefetching nil leads to a DTLB miss
+ PREFETCH(next);
+ ((uint64*)v)[0] = 0;
+ ((uint64*)v)[1] = 0;
+ // See if we need to replace the existing tiny block with the new one
+ // based on the amount of remaining free space.
+ if(TinySize-size > tinysize) {
+ c->tiny = (byte*)v + size;
+ c->tinysize = TinySize - size;
+ }
+ size = TinySize;
+ goto done;
+ }
// Allocate from mcache free lists.
// Inlined version of SizeToClass().
if(size <= 1024-8)
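The tiny-allocator comment in the hunk above describes how small FlagNoScan requests are packed into one 16-byte block that is freed only when every subobject becomes unreachable. Below is a minimal, self-contained C sketch of the bump-and-align fast path, not the patch itself: tiny_cursor/tiny_avail stand in for c->tiny/c->tinysize, and ROUND is assumed to round up to a power-of-two multiple as in the runtime.

#include <stdint.h>
#include <stddef.h>

#define TINY_SIZE 16
/* Round x up to a multiple of n (n must be a power of two). */
#define ROUND(x, n) (((x) + (n) - 1) & ~(uintptr_t)((n) - 1))

static unsigned char *tiny_cursor;  /* unused tail of the current tiny block */
static uintptr_t      tiny_avail;   /* bytes left in that block */

/* Try to carve `size` bytes (size < TINY_SIZE, pointer-free data only)
 * out of the current tiny block; return NULL if a fresh block is needed. */
static void *tiny_alloc(uintptr_t size)
{
	unsigned char *p = tiny_cursor;

	/* Conservative alignment: align to the largest power of two dividing size. */
	if((size & 7) == 0)
		p = (unsigned char*)ROUND((uintptr_t)p, 8);
	else if((size & 3) == 0)
		p = (unsigned char*)ROUND((uintptr_t)p, 4);
	else if((size & 1) == 0)
		p = (unsigned char*)ROUND((uintptr_t)p, 2);

	uintptr_t used = size + (uintptr_t)(p - tiny_cursor);  /* padding counts as used */
	if(used > tiny_avail)
		return NULL;  /* does not fit; the caller grabs a new TINY_SIZE block */
	tiny_cursor += used;
	tiny_avail -= used;
	return p;
}

The real code additionally decides whether a freshly allocated 16-byte block should replace the current one, based on which of the two has more free space remaining.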
@@ -66,87 +147,117 @@ runtime·mallocgc(uintptr size, uintptr typ, uint32 flag)
else
sizeclass = runtime·size_to_class128[(size-1024+127) >> 7];
size = runtime·class_to_size[sizeclass];
- l = &c->list[sizeclass];
- if(l->list == nil)
- runtime·MCache_Refill(c, sizeclass);
- v = l->list;
- l->list = v->next;
- l->nlist--;
+ s = c->alloc[sizeclass];
+ if(s->freelist == nil)
+ s = runtime·MCache_Refill(c, sizeclass);
+ v = s->freelist;
+ next = v->next;
+ s->freelist = next;
+ s->ref++;
+ if(next != nil) // prefetching nil leads to a DTLB miss
+ PREFETCH(next);
if(!(flag & FlagNoZero)) {
v->next = nil;
// block is zeroed iff second word is zero ...
- if(size > sizeof(uintptr) && ((uintptr*)v)[1] != 0)
+ if(size > 2*sizeof(uintptr) && ((uintptr*)v)[1] != 0)
runtime·memclr((byte*)v, size);
}
+ done:
c->local_cachealloc += size;
} else {
- // TODO(rsc): Report tracebacks for very large allocations.
-
// Allocate directly from heap.
- npages = size >> PageShift;
- if((size & PageMask) != 0)
- npages++;
- s = runtime·MHeap_Alloc(&runtime·mheap, npages, 0, 1, !(flag & FlagNoZero));
- if(s == nil)
- runtime·throw("out of memory");
- s->limit = (byte*)(s->start<<PageShift) + size;
- size = npages<<PageShift;
+ s = largealloc(flag, &size);
v = (void*)(s->start << PageShift);
-
- // setup for mark sweep
- runtime·markspan(v, 0, 0, true);
}
- if(!(flag & FlagNoGC))
- runtime·markallocated(v, size, (flag&FlagNoScan) != 0);
+ if(flag & FlagNoGC)
+ runtime·marknogc(v);
+ else if(!(flag & FlagNoScan))
+ runtime·markscan(v);
if(DebugTypeAtBlockEnd)
*(uintptr*)((uintptr)v+size-sizeof(uintptr)) = typ;
+ m->mallocing = 0;
// TODO: save type even if FlagNoScan? Potentially expensive but might help
// heap profiling/tracing.
- if(UseSpanType && !(flag & FlagNoScan) && typ != 0) {
- uintptr *buf, i;
-
- buf = m->settype_buf;
- i = m->settype_bufsize;
- buf[i++] = (uintptr)v;
- buf[i++] = typ;
- m->settype_bufsize = i;
+ if(UseSpanType && !(flag & FlagNoScan) && typ != 0)
+ settype(s, v, typ);
+
+ if(raceenabled)
+ runtime·racemalloc(v, size);
+
+ if(runtime·debug.allocfreetrace)
+ runtime·tracealloc(v, size, typ);
+
+ if(!(flag & FlagNoProfiling) && (rate = runtime·MemProfileRate) > 0) {
+ if(size < rate && size < c->next_sample)
+ c->next_sample -= size;
+ else
+ profilealloc(v, size);
}
- m->mallocing = 0;
- if(UseSpanType && !(flag & FlagNoScan) && typ != 0 && m->settype_bufsize == nelem(m->settype_buf))
- runtime·settype_flush(m);
m->locks--;
if(m->locks == 0 && g->preempt) // restore the preemption request in case we've cleared it in newstack
g->stackguard0 = StackPreempt;
- if(!(flag & FlagNoProfiling) && (rate = runtime·MemProfileRate) > 0) {
- if(size >= rate)
- goto profile;
- if(m->mcache->next_sample > size)
- m->mcache->next_sample -= size;
- else {
- // pick next profile time
- // If you change this, also change allocmcache.
- if(rate > 0x3fffffff) // make 2*rate not overflow
- rate = 0x3fffffff;
- m->mcache->next_sample = runtime·fastrand1() % (2*rate);
- profile:
- runtime·setblockspecial(v, true);
- runtime·MProf_Malloc(v, size);
- }
- }
-
if(!(flag & FlagNoInvokeGC) && mstats.heap_alloc >= mstats.next_gc)
runtime·gc(0);
- if(raceenabled)
- runtime·racemalloc(v, size);
return v;
}
+static MSpan*
+largealloc(uint32 flag, uintptr *sizep)
+{
+ uintptr npages, size;
+ MSpan *s;
+ void *v;
+
+ // Allocate directly from heap.
+ size = *sizep;
+ if(size + PageSize < size)
+ runtime·throw("out of memory");
+ npages = size >> PageShift;
+ if((size & PageMask) != 0)
+ npages++;
+ s = runtime·MHeap_Alloc(&runtime·mheap, npages, 0, 1, !(flag & FlagNoZero));
+ if(s == nil)
+ runtime·throw("out of memory");
+ s->limit = (byte*)(s->start<<PageShift) + size;
+ *sizep = npages<<PageShift;
+ v = (void*)(s->start << PageShift);
+ // setup for mark sweep
+ runtime·markspan(v, 0, 0, true);
+ return s;
+}
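largealloc above guards against the page rounding overflowing the address space: with unsigned arithmetic, size + PageSize wraps around exactly when the true sum does not fit in a uintptr, so the comparison size + PageSize < size detects it. A small stand-alone illustration of the idiom (hypothetical helper, not part of the patch):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE ((uintptr_t)4096)

/* Return 1 if rounding `size` up to whole pages would overflow uintptr_t. */
static int round_would_overflow(uintptr_t size)
{
	/* Unsigned addition wraps modulo 2^N, so the sum is smaller than an
	 * operand exactly when the mathematical sum does not fit. */
	return size + PAGE_SIZE < size;
}

int main(void)
{
	printf("%d\n", round_would_overflow((uintptr_t)1 << 20));  /* 0: fits */
	printf("%d\n", round_would_overflow(UINTPTR_MAX - 1));     /* 1: would wrap */
	return 0;
}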
+
+static void
+profilealloc(void *v, uintptr size)
+{
+ uintptr rate;
+ int32 next;
+ MCache *c;
+
+ c = m->mcache;
+ rate = runtime·MemProfileRate;
+ if(size < rate) {
+ // pick next profile time
+ // If you change this, also change allocmcache.
+ if(rate > 0x3fffffff) // make 2*rate not overflow
+ rate = 0x3fffffff;
+ next = runtime·fastrand1() % (2*rate);
+ // Subtract the "remainder" of the current allocation.
+ // Otherwise objects that are close in size to the sampling rate
+ // will be under-sampled, because we consistently discard this remainder.
+ next -= (size - c->next_sample);
+ if(next < 0)
+ next = 0;
+ c->next_sample = next;
+ }
+ runtime·MProf_Malloc(v, size);
+}
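profilealloc draws the distance to the next sampled allocation uniformly from [0, 2*rate), so the average gap between samples is about rate bytes, and it subtracts the amount by which the current allocation overshot the previous trigger so that objects close to the sampling rate are not systematically under-sampled. A hedged stand-alone sketch of that policy follows; the sampler struct, the should_sample name, and the use of rand() are illustrative, not the runtime's API.

#include <stdint.h>
#include <stdlib.h>

/* Illustrative sampler state; in the runtime this lives in the MCache. */
struct sampler {
	uintptr_t next_sample;  /* bytes of allocation left until the next sample */
};

/* Called for every allocation of `size` bytes; returns 1 when this
 * allocation should be recorded in the heap profile. */
static int should_sample(struct sampler *s, uintptr_t size, uintptr_t rate)
{
	if(size < rate && size < s->next_sample) {
		s->next_sample -= size;  /* fast path: trigger not reached yet */
		return 0;
	}
	if(size < rate) {
		if(rate > 0x3fffffff)  /* keep 2*rate from overflowing */
			rate = 0x3fffffff;
		/* Pick the next trigger uniformly in [0, 2*rate); the mean gap is `rate`. */
		intptr_t next = rand() % (2*(intptr_t)rate);
		/* Subtract how far the current allocation ran past the old trigger;
		 * otherwise near-rate-sized objects would be under-sampled. */
		next -= (intptr_t)(size - s->next_sample);
		if(next < 0)
			next = 0;
		s->next_sample = (uintptr_t)next;
	}
	return 1;  /* allocations of at least `rate` bytes are always sampled */
}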
+
void*
runtime·malloc(uintptr size)
{
@@ -160,7 +271,6 @@ runtime·free(void *v)
int32 sizeclass;
MSpan *s;
MCache *c;
- uint32 prof;
uintptr size;
if(v == nil)
@@ -177,39 +287,73 @@ runtime·free(void *v)
runtime·printf("free %p: not an allocated block\n", v);
runtime·throw("free runtime·mlookup");
}
- prof = runtime·blockspecial(v);
+ size = s->elemsize;
+ sizeclass = s->sizeclass;
+ // Objects that are smaller than TinySize can be allocated using the tiny allocator;
+ // if such an object were combined with an object that has a finalizer, we would crash.
+ if(size < TinySize)
+ runtime·throw("freeing too small block");
- if(raceenabled)
- runtime·racefree(v);
+ if(runtime·debug.allocfreetrace)
+ runtime·tracefree(v, size);
+
+ // Ensure that the span is swept.
+ // If we free into an unswept span, we will corrupt GC bitmaps.
+ runtime·MSpan_EnsureSwept(s);
+
+ if(s->specials != nil)
+ runtime·freeallspecials(s, v, size);
- // Find size class for v.
- sizeclass = s->sizeclass;
c = m->mcache;
if(sizeclass == 0) {
// Large object.
- size = s->npages<<PageShift;
- *(uintptr*)(s->start<<PageShift) = (uintptr)0xfeedfeedfeedfeedll; // mark as "needs to be zeroed"
+ s->needzero = 1;
// Must mark v freed before calling unmarkspan and MHeap_Free:
// they might coalesce v into other spans and change the bitmap further.
- runtime·markfreed(v, size);
+ runtime·markfreed(v);
runtime·unmarkspan(v, 1<<PageShift);
- runtime·MHeap_Free(&runtime·mheap, s, 1);
+ // NOTE(rsc,dvyukov): The original implementation of efence
+ // in CL 22060046 used SysFree instead of SysFault, so that
+ // the operating system would eventually give the memory
+ // back to us again, so that an efence program could run
+ // longer without running out of memory. Unfortunately,
+ // calling SysFree here without any kind of adjustment of the
+ // heap data structures means that when the memory does
+ // come back to us, we have the wrong metadata for it, either in
+ // the MSpan structures or in the garbage collection bitmap.
+ // Using SysFault here means that the program will run out of
+ // memory fairly quickly in efence mode, but at least it won't
+ // have mysterious crashes due to confused memory reuse.
+ // It should be possible to switch back to SysFree if we also
+ // implement and then call some kind of MHeap_DeleteSpan.
+ if(runtime·debug.efence)
+ runtime·SysFault((void*)(s->start<<PageShift), size);
+ else
+ runtime·MHeap_Free(&runtime·mheap, s, 1);
c->local_nlargefree++;
c->local_largefree += size;
} else {
// Small object.
- size = runtime·class_to_size[sizeclass];
- if(size > sizeof(uintptr))
+ if(size > 2*sizeof(uintptr))
((uintptr*)v)[1] = (uintptr)0xfeedfeedfeedfeedll; // mark as "needs to be zeroed"
+ else if(size > sizeof(uintptr))
+ ((uintptr*)v)[1] = 0;
// Must mark v freed before calling MCache_Free:
// it might coalesce v and other blocks into a bigger span
// and change the bitmap further.
- runtime·markfreed(v, size);
c->local_nsmallfree[sizeclass]++;
- runtime·MCache_Free(c, v, sizeclass, size);
+ c->local_cachealloc -= size;
+ if(c->alloc[sizeclass] == s) {
+ // We own the span, so we can just add v to the freelist
+ runtime·markfreed(v);
+ ((MLink*)v)->next = s->freelist;
+ s->freelist = v;
+ s->ref--;
+ } else {
+ // Someone else owns this span. Add to free queue.
+ runtime·MCache_Free(c, v, sizeclass, size);
+ }
}
- if(prof)
- runtime·MProf_Free(v, size);
m->mallocing = 0;
}
@@ -261,37 +405,6 @@ runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **sp)
return 1;
}
-MCache*
-runtime·allocmcache(void)
-{
- intgo rate;
- MCache *c;
-
- runtime·lock(&runtime·mheap);
- c = runtime·FixAlloc_Alloc(&runtime·mheap.cachealloc);
- runtime·unlock(&runtime·mheap);
- runtime·memclr((byte*)c, sizeof(*c));
-
- // Set first allocation sample size.
- rate = runtime·MemProfileRate;
- if(rate > 0x3fffffff) // make 2*rate not overflow
- rate = 0x3fffffff;
- if(rate != 0)
- c->next_sample = runtime·fastrand1() % (2*rate);
-
- return c;
-}
-
-void
-runtime·freemcache(MCache *c)
-{
- runtime·MCache_ReleaseAll(c);
- runtime·lock(&runtime·mheap);
- runtime·purgecachedstats(c);
- runtime·FixAlloc_Free(&runtime·mheap.cachealloc, c);
- runtime·unlock(&runtime·mheap);
-}
-
void
runtime·purgecachedstats(MCache *c)
{
@@ -314,33 +427,42 @@ runtime·purgecachedstats(MCache *c)
}
}
-uintptr runtime·sizeof_C_MStats = sizeof(MStats);
+// The size of the trailing by_size array differs between Go and C:
+// NumSizeClasses was changed, but we cannot change the Go struct because of backward compatibility.
+// sizeof_C_MStats is the size of the Go struct as seen by C.
+uintptr runtime·sizeof_C_MStats = sizeof(MStats) - (NumSizeClasses - 61) * sizeof(mstats.by_size[0]);
#define MaxArena32 (2U<<30)
void
runtime·mallocinit(void)
{
- byte *p;
- uintptr arena_size, bitmap_size, spans_size;
+ byte *p, *p1;
+ uintptr arena_size, bitmap_size, spans_size, p_size;
extern byte end[];
- byte *want;
uintptr limit;
uint64 i;
+ bool reserved;
p = nil;
+ p_size = 0;
arena_size = 0;
bitmap_size = 0;
spans_size = 0;
+ reserved = false;
// for 64-bit build
USED(p);
+ USED(p_size);
USED(arena_size);
USED(bitmap_size);
USED(spans_size);
runtime·InitSizes();
+ if(runtime·class_to_size[TinySizeClass] != TinySize)
+ runtime·throw("bad TinySizeClass");
+
// limit = runtime·memlimit();
// See https://code.google.com/p/go/issues/detail?id=5049
// TODO(rsc): Fix after 1.1.
@@ -380,7 +502,8 @@ runtime·mallocinit(void)
spans_size = ROUND(spans_size, PageSize);
for(i = 0; i <= 0x7f; i++) {
p = (void*)(i<<40 | 0x00c0ULL<<32);
- p = runtime·SysReserve(p, bitmap_size + spans_size + arena_size);
+ p_size = bitmap_size + spans_size + arena_size + PageSize;
+ p = runtime·SysReserve(p, p_size, &reserved);
if(p != nil)
break;
}
@@ -422,91 +545,119 @@ runtime·mallocinit(void)
// So adjust it upward a little bit ourselves: 1/4 MB to get
// away from the running binary image and then round up
// to a MB boundary.
- want = (byte*)ROUND((uintptr)end + (1<<18), 1<<20);
- p = runtime·SysReserve(want, bitmap_size + spans_size + arena_size);
+ p = (byte*)ROUND((uintptr)end + (1<<18), 1<<20);
+ p_size = bitmap_size + spans_size + arena_size + PageSize;
+ p = runtime·SysReserve(p, p_size, &reserved);
if(p == nil)
runtime·throw("runtime: cannot reserve arena virtual address space");
- if((uintptr)p & (((uintptr)1<<PageShift)-1))
- runtime·printf("runtime: SysReserve returned unaligned address %p; asked for %p", p,
- bitmap_size+spans_size+arena_size);
}
- if((uintptr)p & (((uintptr)1<<PageShift)-1))
- runtime·throw("runtime: SysReserve returned unaligned address");
- runtime·mheap.spans = (MSpan**)p;
- runtime·mheap.bitmap = p + spans_size;
- runtime·mheap.arena_start = p + spans_size + bitmap_size;
+ // PageSize can be larger than the OS definition of page size,
+ // so SysReserve can give us a PageSize-unaligned pointer.
+ // To overcome this we ask for PageSize extra bytes and round the pointer up.
+ p1 = (byte*)ROUND((uintptr)p, PageSize);
+
+ runtime·mheap.spans = (MSpan**)p1;
+ runtime·mheap.bitmap = p1 + spans_size;
+ runtime·mheap.arena_start = p1 + spans_size + bitmap_size;
runtime·mheap.arena_used = runtime·mheap.arena_start;
- runtime·mheap.arena_end = runtime·mheap.arena_start + arena_size;
+ runtime·mheap.arena_end = p + p_size;
+ runtime·mheap.arena_reserved = reserved;
+
+ if(((uintptr)runtime·mheap.arena_start & (PageSize-1)) != 0)
+ runtime·throw("misrounded allocation in mallocinit");
// Initialize the rest of the allocator.
runtime·MHeap_Init(&runtime·mheap);
m->mcache = runtime·allocmcache();
// See if it works.
- runtime·free(runtime·malloc(1));
+ runtime·free(runtime·malloc(TinySize));
}
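The PageSize comment in mallocinit above is the reason every SysReserve call now asks for PageSize extra bytes: the runtime's PageSize can exceed the OS page granularity, so the returned pointer may not be PageSize-aligned, and rounding the base up inside the oversized reservation restores alignment without running past the end. A minimal sketch of that adjustment, assuming a POSIX mmap as the stand-in for SysReserve (reserve_aligned and RUNTIME_PAGE are illustrative names):

#include <stdint.h>
#include <stddef.h>
#include <sys/mman.h>

#define RUNTIME_PAGE ((uintptr_t)8192)  /* runtime page, possibly > OS page */
#define ROUND_UP(x, n) (((x) + (n) - 1) & ~((n) - 1))

/* Stand-in for SysReserve: the OS only guarantees its own page alignment. */
static void *reserve(size_t len)
{
	void *p = mmap(NULL, len, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
	return p == MAP_FAILED ? NULL : p;
}

/* Reserve `want` bytes aligned to RUNTIME_PAGE by asking for one extra page. */
static void *reserve_aligned(size_t want, void **raw, size_t *raw_len)
{
	*raw_len = want + RUNTIME_PAGE;
	*raw = reserve(*raw_len);
	if(*raw == NULL)
		return NULL;
	/* At most RUNTIME_PAGE-1 bytes at the front go unused, and `want`
	 * aligned bytes still fit before the end of the reservation. */
	return (void*)ROUND_UP((uintptr_t)*raw, RUNTIME_PAGE);
}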
void*
runtime·MHeap_SysAlloc(MHeap *h, uintptr n)
{
- byte *p;
+ byte *p, *p_end;
+ uintptr p_size;
+ bool reserved;
if(n > h->arena_end - h->arena_used) {
// We are in 32-bit mode, maybe we didn't use all possible address space yet.
// Reserve some more space.
byte *new_end;
- uintptr needed;
- needed = (uintptr)h->arena_used + n - (uintptr)h->arena_end;
- needed = ROUND(needed, 256<<20);
- new_end = h->arena_end + needed;
+ p_size = ROUND(n + PageSize, 256<<20);
+ new_end = h->arena_end + p_size;
if(new_end <= h->arena_start + MaxArena32) {
- p = runtime·SysReserve(h->arena_end, new_end - h->arena_end);
- if(p == h->arena_end)
+ // TODO: It would be bad if part of the arena
+ // is reserved and part is not.
+ p = runtime·SysReserve(h->arena_end, p_size, &reserved);
+ if(p == h->arena_end) {
h->arena_end = new_end;
+ h->arena_reserved = reserved;
+ }
+ else if(p+p_size <= h->arena_start + MaxArena32) {
+ // Keep everything page-aligned.
+ // Our pages are bigger than hardware pages.
+ h->arena_end = p+p_size;
+ h->arena_used = p + (-(uintptr)p&(PageSize-1));
+ h->arena_reserved = reserved;
+ } else {
+ uint64 stat;
+ stat = 0;
+ runtime·SysFree(p, p_size, &stat);
+ }
}
}
if(n <= h->arena_end - h->arena_used) {
// Keep taking from our reservation.
p = h->arena_used;
- runtime·SysMap(p, n, &mstats.heap_sys);
+ runtime·SysMap(p, n, h->arena_reserved, &mstats.heap_sys);
h->arena_used += n;
runtime·MHeap_MapBits(h);
runtime·MHeap_MapSpans(h);
if(raceenabled)
runtime·racemapshadow(p, n);
+
+ if(((uintptr)p & (PageSize-1)) != 0)
+ runtime·throw("misrounded allocation in MHeap_SysAlloc");
return p;
}
// If using 64-bit, our reservation is all we have.
- if(sizeof(void*) == 8 && (uintptr)h->bitmap >= 0xffffffffU)
+ if(h->arena_end - h->arena_start >= MaxArena32)
return nil;
// On 32-bit, once the reservation is gone we can
// try to get memory at a location chosen by the OS
// and hope that it is in the range we allocated bitmap for.
- p = runtime·SysAlloc(n, &mstats.heap_sys);
+ p_size = ROUND(n, PageSize) + PageSize;
+ p = runtime·SysAlloc(p_size, &mstats.heap_sys);
if(p == nil)
return nil;
- if(p < h->arena_start || p+n - h->arena_start >= MaxArena32) {
+ if(p < h->arena_start || p+p_size - h->arena_start >= MaxArena32) {
runtime·printf("runtime: memory allocated by OS (%p) not in usable range [%p,%p)\n",
p, h->arena_start, h->arena_start+MaxArena32);
- runtime·SysFree(p, n, &mstats.heap_sys);
+ runtime·SysFree(p, p_size, &mstats.heap_sys);
return nil;
}
-
+
+ p_end = p + p_size;
+ p += -(uintptr)p & (PageSize-1);
if(p+n > h->arena_used) {
h->arena_used = p+n;
- if(h->arena_used > h->arena_end)
- h->arena_end = h->arena_used;
+ if(p_end > h->arena_end)
+ h->arena_end = p_end;
runtime·MHeap_MapBits(h);
runtime·MHeap_MapSpans(h);
if(raceenabled)
runtime·racemapshadow(p, n);
}
+ if(((uintptr)p & (PageSize-1)) != 0)
+ runtime·throw("misrounded allocation in MHeap_SysAlloc");
return p;
}
@@ -534,7 +685,7 @@ runtime·persistentalloc(uintptr size, uintptr align, uint64 *stat)
if(align != 0) {
if(align&(align-1))
- runtime·throw("persistentalloc: align is now a power of 2");
+ runtime·throw("persistentalloc: align is not a power of 2");
if(align > PageSize)
runtime·throw("persistentalloc: align is too large");
} else
@@ -562,95 +713,67 @@ runtime·persistentalloc(uintptr size, uintptr align, uint64 *stat)
return p;
}
-static Lock settype_lock;
-
-void
-runtime·settype_flush(M *mp)
+static void
+settype(MSpan *s, void *v, uintptr typ)
{
- uintptr *buf, *endbuf;
uintptr size, ofs, j, t;
uintptr ntypes, nbytes2, nbytes3;
uintptr *data2;
byte *data3;
- void *v;
- uintptr typ, p;
- MSpan *s;
- buf = mp->settype_buf;
- endbuf = buf + mp->settype_bufsize;
-
- runtime·lock(&settype_lock);
- while(buf < endbuf) {
- v = (void*)*buf;
- *buf = 0;
- buf++;
- typ = *buf;
- buf++;
-
- // (Manually inlined copy of runtime·MHeap_Lookup)
- p = (uintptr)v>>PageShift;
- if(sizeof(void*) == 8)
- p -= (uintptr)runtime·mheap.arena_start >> PageShift;
- s = runtime·mheap.spans[p];
-
- if(s->sizeclass == 0) {
- s->types.compression = MTypes_Single;
- s->types.data = typ;
- continue;
+ if(s->sizeclass == 0) {
+ s->types.compression = MTypes_Single;
+ s->types.data = typ;
+ return;
+ }
+ size = s->elemsize;
+ ofs = ((uintptr)v - (s->start<<PageShift)) / size;
+
+ switch(s->types.compression) {
+ case MTypes_Empty:
+ ntypes = (s->npages << PageShift) / size;
+ nbytes3 = 8*sizeof(uintptr) + 1*ntypes;
+ data3 = runtime·mallocgc(nbytes3, 0, FlagNoProfiling|FlagNoScan|FlagNoInvokeGC);
+ s->types.compression = MTypes_Bytes;
+ s->types.data = (uintptr)data3;
+ ((uintptr*)data3)[1] = typ;
+ data3[8*sizeof(uintptr) + ofs] = 1;
+ break;
+
+ case MTypes_Words:
+ ((uintptr*)s->types.data)[ofs] = typ;
+ break;
+
+ case MTypes_Bytes:
+ data3 = (byte*)s->types.data;
+ for(j=1; j<8; j++) {
+ if(((uintptr*)data3)[j] == typ) {
+ break;
+ }
+ if(((uintptr*)data3)[j] == 0) {
+ ((uintptr*)data3)[j] = typ;
+ break;
+ }
}
-
- size = s->elemsize;
- ofs = ((uintptr)v - (s->start<<PageShift)) / size;
-
- switch(s->types.compression) {
- case MTypes_Empty:
+ if(j < 8) {
+ data3[8*sizeof(uintptr) + ofs] = j;
+ } else {
ntypes = (s->npages << PageShift) / size;
- nbytes3 = 8*sizeof(uintptr) + 1*ntypes;
- data3 = runtime·mallocgc(nbytes3, 0, FlagNoProfiling|FlagNoScan|FlagNoInvokeGC);
- s->types.compression = MTypes_Bytes;
- s->types.data = (uintptr)data3;
- ((uintptr*)data3)[1] = typ;
- data3[8*sizeof(uintptr) + ofs] = 1;
- break;
-
- case MTypes_Words:
- ((uintptr*)s->types.data)[ofs] = typ;
- break;
-
- case MTypes_Bytes:
- data3 = (byte*)s->types.data;
- for(j=1; j<8; j++) {
- if(((uintptr*)data3)[j] == typ) {
- break;
- }
- if(((uintptr*)data3)[j] == 0) {
- ((uintptr*)data3)[j] = typ;
- break;
- }
+ nbytes2 = ntypes * sizeof(uintptr);
+ data2 = runtime·mallocgc(nbytes2, 0, FlagNoProfiling|FlagNoScan|FlagNoInvokeGC);
+ s->types.compression = MTypes_Words;
+ s->types.data = (uintptr)data2;
+
+ // Move the contents of data3 to data2. Then deallocate data3.
+ for(j=0; j<ntypes; j++) {
+ t = data3[8*sizeof(uintptr) + j];
+ t = ((uintptr*)data3)[t];
+ data2[j] = t;
}
- if(j < 8) {
- data3[8*sizeof(uintptr) + ofs] = j;
- } else {
- ntypes = (s->npages << PageShift) / size;
- nbytes2 = ntypes * sizeof(uintptr);
- data2 = runtime·mallocgc(nbytes2, 0, FlagNoProfiling|FlagNoScan|FlagNoInvokeGC);
- s->types.compression = MTypes_Words;
- s->types.data = (uintptr)data2;
-
- // Move the contents of data3 to data2. Then deallocate data3.
- for(j=0; j<ntypes; j++) {
- t = data3[8*sizeof(uintptr) + j];
- t = ((uintptr*)data3)[t];
- data2[j] = t;
- }
- data2[ofs] = typ;
- }
- break;
+ data2[ofs] = typ;
}
+ break;
}
- runtime·unlock(&settype_lock);
-
- mp->settype_bufsize = 0;
}
uintptr
@@ -683,9 +806,7 @@ runtime·gettype(void *v)
runtime·throw("runtime·gettype: invalid compression kind");
}
if(0) {
- runtime·lock(&settype_lock);
runtime·printf("%p -> %d,%X\n", v, (int32)s->types.compression, (int64)t);
- runtime·unlock(&settype_lock);
}
return t;
}
@@ -701,11 +822,8 @@ runtime·mal(uintptr n)
}
#pragma textflag NOSPLIT
-void
-runtime·new(Type *typ, uint8 *ret)
-{
+func new(typ *Type) (ret *uint8) {
ret = runtime·mallocgc(typ->size, (uintptr)typ | TypeInfo_SingleObject, typ->kind&KindNoPointers ? FlagNoScan : 0);
- FLUSH(&ret);
}
static void*
@@ -732,7 +850,7 @@ runtime·cnewarray(Type *typ, intgo n)
}
func GC() {
- runtime·gc(1);
+ runtime·gc(2); // force GC and do eager sweep
}
func SetFinalizer(obj Eface, finalizer Eface) {
@@ -754,14 +872,30 @@ func SetFinalizer(obj Eface, finalizer Eface) {
runtime·printf("runtime.SetFinalizer: first argument is %S, not pointer\n", *obj.type->string);
goto throw;
}
+ ot = (PtrType*)obj.type;
+ // As an implementation detail we do not run finalizers for zero-sized objects,
+ // because we use &runtime·zerobase for all such allocations.
+ if(ot->elem != nil && ot->elem->size == 0)
+ return;
+ // The following check is required for cases when a user passes a pointer to a composite literal,
+ // but the compiler makes it a pointer to a global. For example:
+ // var Foo = &Object{}
+ // func main() {
+ // runtime.SetFinalizer(Foo, nil)
+ // }
+ // See issue 7656.
+ if((byte*)obj.data < runtime·mheap.arena_start || runtime·mheap.arena_used <= (byte*)obj.data)
+ return;
if(!runtime·mlookup(obj.data, &base, &size, nil) || obj.data != base) {
- runtime·printf("runtime.SetFinalizer: pointer not at beginning of allocated block\n");
- goto throw;
+ // As an implementation detail we allow setting finalizers for an inner byte
+ // of an object if it could have come from the tiny allocator (see mallocgc for details).
+ if(ot->elem == nil || (ot->elem->kind&KindNoPointers) == 0 || ot->elem->size >= TinySize) {
+ runtime·printf("runtime.SetFinalizer: pointer not at beginning of allocated block (%p)\n", obj.data);
+ goto throw;
+ }
}
- nret = 0;
- ot = (PtrType*)obj.type;
- fint = nil;
if(finalizer.type != nil) {
+ runtime·createfing();
if(finalizer.type->kind != KindFunc)
goto badfunc;
ft = (FuncType*)finalizer.type;
@@ -781,16 +915,20 @@ func SetFinalizer(obj Eface, finalizer Eface) {
goto badfunc;
// compute size needed for return parameters
+ nret = 0;
for(i=0; i<ft->out.len; i++) {
t = ((Type**)ft->out.array)[i];
nret = ROUND(nret, t->align) + t->size;
}
nret = ROUND(nret, sizeof(void*));
- }
-
- if(!runtime·addfinalizer(obj.data, finalizer.data, nret, fint, ot)) {
- runtime·printf("runtime.SetFinalizer: finalizer already set\n");
- goto throw;
+ ot = (PtrType*)obj.type;
+ if(!runtime·addfinalizer(obj.data, finalizer.data, nret, fint, ot)) {
+ runtime·printf("runtime.SetFinalizer: finalizer already set\n");
+ goto throw;
+ }
+ } else {
+ // NOTE: asking to remove a finalizer when there currently isn't one set is OK.
+ runtime·removefinalizer(obj.data);
}
return;