runtime: fix finalization and profiling of tiny allocations
Handling of special records for tiny allocations has two problems:

1. Once we queue a finalizer, we mark the object. As a result, any
   subsequent finalizers for the same object are not queued during this
   GC cycle. If an object has 16 finalizers set up (the worst case),
   finalization takes 16 GC cycles. This is what caused the misbehavior
   of tinyfin.go; the actual flakiness was caused by the fact that fing
   is asynchronous and does not always run before the check.
2. If a tiny block has both finalizer and profile specials, it is
   possible that we queue the finalizer and preserve the object live,
   yet free the profile record. As a result, the heap profile can be
   skewed.

Fix both issues by analyzing all special records for a single object at once.

Also make the tinyfin test stricter and remove its reliance on real time.

Also add a test for problem 2. Currently the heap profile misses about
half of live memory.

Fixes #13100

Change-Id: I9ae4dc1c44893724138a4565ca5cae29f2e97544
Reviewed-on: https://go-review.googlesource.com/16591
Reviewed-by: Austin Clements <austin@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
Run-TryBot: Dmitry Vyukov <dvyukov@google.com>
This commit is bf606094ee (parent 95333aea53).
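Note: the scenario behind problem 1 can be reproduced with plain library calls. The sketch below is illustrative only and not part of the change; whether the four allocations actually share one 16-byte tiny block depends on the allocator, and finalizers for objects in the still-active tiny block may not run at all.

	package main

	import (
		"fmt"
		"runtime"
		"time"
	)

	func main() {
		const n = 4 // four 4-byte objects can share one 16-byte tiny block
		done := make(chan int32, n)
		for i := 0; i < n; i++ {
			x := new(int32) // subject to tiny allocation
			*x = int32(i)
			runtime.SetFinalizer(x, func(p *int32) {
				done <- *p
			})
		}
		// With the fix, one GC cycle queues every finalizer of an unmarked
		// tiny block at once; previously each cycle queued only one.
		runtime.GC()
		for i := 0; i < n; i++ {
			select {
			case v := <-done:
				fmt.Println("finalized", v)
			case <-time.After(time.Second):
				// Possible if the block is still held by the tiny cache.
				fmt.Println("gave up waiting")
				return
			}
		}
	}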
src/runtime/mgcsweep.go
@@ -197,6 +197,13 @@ func mSpan_Sweep(s *mspan, preserve bool) bool {
 	}
 
 	// Unlink & free special records for any objects we're about to free.
+	// Two complications here:
+	// 1. An object can have both finalizer and profile special records.
+	//    In such case we need to queue finalizer for execution,
+	//    mark the object as live and preserve the profile special.
+	// 2. A tiny object can have several finalizers setup for different offsets.
+	//    If such object is not marked, we need to queue all finalizers at once.
+	// Both 1 and 2 are possible at the same time.
 	specialp := &s.specials
 	special := *specialp
 	for special != nil {
@@ -204,16 +211,35 @@ func mSpan_Sweep(s *mspan, preserve bool) bool {
 		p := uintptr(s.start<<_PageShift) + uintptr(special.offset)/size*size
 		hbits := heapBitsForAddr(p)
 		if !hbits.isMarked() {
-			// Find the exact byte for which the special was setup
-			// (as opposed to object beginning).
-			p := uintptr(s.start<<_PageShift) + uintptr(special.offset)
-			// about to free object: splice out special record
-			y := special
-			special = special.next
-			*specialp = special
-			if !freespecial(y, unsafe.Pointer(p), size, false) {
-				// stop freeing of object if it has a finalizer
-				hbits.setMarkedNonAtomic()
-			}
+			// This object is not marked and has at least one special record.
+			// Pass 1: see if it has at least one finalizer.
+			hasFin := false
+			endOffset := p - uintptr(s.start<<_PageShift) + size
+			for tmp := special; tmp != nil && uintptr(tmp.offset) < endOffset; tmp = tmp.next {
+				if tmp.kind == _KindSpecialFinalizer {
+					// Stop freeing of object if it has a finalizer.
+					hbits.setMarkedNonAtomic()
+					hasFin = true
+					break
+				}
+			}
+			// Pass 2: queue all finalizers _or_ handle profile record.
+			for special != nil && uintptr(special.offset) < endOffset {
+				// Find the exact byte for which the special was setup
+				// (as opposed to object beginning).
+				p := uintptr(s.start<<_PageShift) + uintptr(special.offset)
+				if special.kind == _KindSpecialFinalizer || !hasFin {
+					// Splice out special record.
+					y := special
+					special = special.next
+					*specialp = special
+					freespecial(y, unsafe.Pointer(p), size, false)
+				} else {
+					// This is profile record, but the object has finalizers (so kept alive).
+					// Keep special record.
+					specialp = &special.next
+					special = *specialp
+				}
+			}
 		} else {
 			// object is still live: keep special record
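The shape of the new loop is easier to see outside the runtime. The following standalone sketch mirrors the two passes with simplified stand-in types; special, endOffset, and the kind constants here are hypothetical simplifications, not the runtime's definitions.

	// Sketch of the two-pass scan over an unmarked object's specials.
	package sketch

	type kind int

	const (
		kindFinalizer kind = iota
		kindProfile
	)

	type special struct {
		offset uintptr
		kind   kind
		next   *special
	}

	// sweepSpecials mirrors the fixed logic for one unmarked object whose
	// records occupy offsets in [0, endOffset). Pass 1 decides whether any
	// finalizer resurrects the object; pass 2 then frees every record,
	// except profile records of a resurrected object, which stay linked.
	func sweepSpecials(specialp **special, endOffset uintptr, free func(*special)) (hasFin bool) {
		// Pass 1: see if the object has at least one finalizer.
		for tmp := *specialp; tmp != nil && tmp.offset < endOffset; tmp = tmp.next {
			if tmp.kind == kindFinalizer {
				hasFin = true // the runtime also marks the object live here
				break
			}
		}
		// Pass 2: queue all finalizers _or_ handle profile record.
		for s := *specialp; s != nil && s.offset < endOffset; s = *specialp {
			if s.kind == kindFinalizer || !hasFin {
				*specialp = s.next // splice out and release the record
				free(s)
			} else {
				// Profile record of an object kept alive by its finalizer:
				// keep it so the eventual free is still profiled.
				specialp = &s.next
			}
		}
		return hasFin
	}

This makes the invariant visible: a profile special now outlives the finalizer that resurrected its object, which is exactly what the new test below checks.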
src/runtime/mheap.go
@@ -1143,8 +1143,7 @@ func setprofilebucket(p unsafe.Pointer, b *bucket) {
 
 // Do whatever cleanup needs to be done to deallocate s. It has
 // already been unlinked from the MSpan specials list.
-// Returns true if we should keep working on deallocating p.
-func freespecial(s *special, p unsafe.Pointer, size uintptr, freed bool) bool {
+func freespecial(s *special, p unsafe.Pointer, size uintptr, freed bool) {
 	switch s.kind {
 	case _KindSpecialFinalizer:
 		sf := (*specialfinalizer)(unsafe.Pointer(s))
@@ -1152,14 +1151,12 @@ func freespecial(s *special, p unsafe.Pointer, size uintptr, freed bool) bool {
 		lock(&mheap_.speciallock)
 		fixAlloc_Free(&mheap_.specialfinalizeralloc, unsafe.Pointer(sf))
 		unlock(&mheap_.speciallock)
-		return false // don't free p until finalizer is done
 	case _KindSpecialProfile:
 		sp := (*specialprofile)(unsafe.Pointer(s))
 		mProf_Free(sp.b, size, freed)
 		lock(&mheap_.speciallock)
 		fixAlloc_Free(&mheap_.specialprofilealloc, unsafe.Pointer(sp))
 		unlock(&mheap_.speciallock)
-		return true
 	default:
 		throw("bad special kind")
 		panic("not reached")
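The signature change above shifts responsibility: previously the sweeper learned from freespecial's boolean result that a finalizer keeps the object alive, whereas now pass 1 of the sweep marks such objects itself, so the call becomes unconditional. Side by side (lines taken from the hunks above, not standalone code):

	// Before: the return value signaled "object resurrected, don't free".
	if !freespecial(y, unsafe.Pointer(p), size, false) {
		hbits.setMarkedNonAtomic()
	}

	// After: liveness was already decided in pass 1; just release the record.
	freespecial(y, unsafe.Pointer(p), size, false)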
test/finprofiled.go (new file, 74 lines)
@@ -0,0 +1,74 @@
+// run
+
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test that tiny allocations with finalizers are correctly profiled.
+// Previously profile special records could have been processed prematurely
+// (while the object is still live).
+
+package main
+
+import (
+	"runtime"
+	"time"
+	"unsafe"
+)
+
+func main() {
+	runtime.MemProfileRate = 1
+	// Allocate 1M 4-byte objects and set a finalizer for every third object.
+	// Assuming that tiny block size is 16, some objects get finalizers setup
+	// only for middle bytes. The finalizer resurrects that object.
+	// As a result, all allocated memory must stay alive.
+	const (
+		N             = 1 << 20
+		tinyBlockSize = 16 // runtime._TinySize
+	)
+	hold := make([]*int32, 0, N)
+	for i := 0; i < N; i++ {
+		x := new(int32)
+		if i%3 == 0 {
+			runtime.SetFinalizer(x, func(p *int32) {
+				hold = append(hold, p)
+			})
+		}
+	}
+	// Finalize as much as possible.
+	// Note: the sleep only increases the probability of bug detection,
+	// it cannot lead to false failure.
+	for i := 0; i < 5; i++ {
+		runtime.GC()
+		time.Sleep(10 * time.Millisecond)
+	}
+	// Read memory profile.
+	var prof []runtime.MemProfileRecord
+	for {
+		if n, ok := runtime.MemProfile(prof, false); ok {
+			prof = prof[:n]
+			break
+		} else {
+			prof = make([]runtime.MemProfileRecord, n+10)
+		}
+	}
+	// See how much memory in tiny objects is profiled.
+	var totalBytes int64
+	for _, p := range prof {
+		bytes := p.AllocBytes - p.FreeBytes
+		nobj := p.AllocObjects - p.FreeObjects
+		size := bytes / nobj
+		if size == tinyBlockSize {
+			totalBytes += bytes
+		}
+	}
+	// 2*tinyBlockSize slack is for any boundary effects.
+	if want := N*int64(unsafe.Sizeof(int32(0))) - 2*tinyBlockSize; totalBytes < want {
+		println("got", totalBytes, "want >=", want)
+		panic("some of the tiny objects are not profiled")
+	}
+	// Just to keep hold alive.
+	if len(hold) != 0 && hold[0] == nil {
+		panic("bad")
+	}
+}
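The profile-reading loop in the new test is the grow-and-retry idiom from the runtime.MemProfile documentation: when the slice is too short, MemProfile reports the required record count with ok == false, so the caller grows the slice with some slack and retries. A standalone sketch of just that idiom; the 64 KiB allocation and the sink variable are arbitrary choices for demonstration.

	package main

	import (
		"fmt"
		"runtime"
	)

	var sink []byte // keeps the allocation below reachable

	func main() {
		runtime.MemProfileRate = 1 // sample every allocation (expensive; tests only)
		sink = make([]byte, 64<<10)
		runtime.GC() // let recent allocations show up in the profile

		var prof []runtime.MemProfileRecord
		for {
			n, ok := runtime.MemProfile(prof, false)
			if ok {
				prof = prof[:n]
				break
			}
			prof = make([]runtime.MemProfileRecord, n+10) // slack for concurrent allocs
		}
		for _, r := range prof {
			if r.InUseBytes() > 0 {
				fmt.Println(r.InUseBytes(), "bytes live at one allocation site")
			}
		}
	}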
test/tinyfin.go
@@ -10,7 +10,6 @@ package main
 
 import (
 	"runtime"
-	"sync/atomic"
 	"time"
 )
 
@@ -20,39 +19,46 @@ func main() {
 	if runtime.Compiler == "gccgo" {
 		return
 	}
-	N := int32(100)
-	count := N
-	done := make([]bool, N)
-	for i := int32(0); i < N; i++ {
+	const N = 100
+	finalized := make(chan int32, N)
+	for i := 0; i < N; i++ {
 		x := new(int32) // subject to tiny alloc
-		*x = i
+		*x = int32(i)
 		// the closure must be big enough to be combined
 		runtime.SetFinalizer(x, func(p *int32) {
+			finalized <- *p
+		})
+	}
+	runtime.GC()
+	count := 0
+	done := make([]bool, N)
+	timeout := time.After(5 * time.Second)
+	for {
+		select {
+		case <-timeout:
+			println("timeout,", count, "finalized so far")
+			panic("not all finalizers are called")
+		case x := <-finalized:
 			// Check that p points to the correct subobject of the tiny allocation.
 			// It's a bit tricky, because we can't capture another variable
 			// with the expected value (it would be combined as well).
-			if *p < 0 || *p >= N {
-				println("got", *p)
+			if x < 0 || x >= N {
+				println("got", x)
 				panic("corrupted")
 			}
-			if done[*p] {
-				println("got", *p)
+			if done[x] {
+				println("got", x)
 				panic("already finalized")
 			}
-			done[*p] = true
-			atomic.AddInt32(&count, -1)
-		})
-	}
-	for i := 0; i < 4; i++ {
-		runtime.GC()
-		time.Sleep(10 * time.Millisecond)
-	}
-	// Some of the finalizers may not be executed,
-	// if the outermost allocations are combined with something persistent.
-	// Currently 4 int32's are combined into a 16-byte block,
-	// ensure that most of them are finalized.
-	if atomic.LoadInt32(&count) >= N/4 {
-		println(count, "out of", N, "finalizer are not called")
-		panic("not all finalizers are called")
+			done[x] = true
+			count++
+			if count > N/10*9 {
+				// Some of the finalizers may not be executed,
+				// if the outermost allocations are combined with something persistent.
+				// Currently 4 int32's are combined into a 16-byte block,
+				// ensure that most of them are finalized.
+				return
+			}
+		}
 	}
 }