runtime: use bit-parallel operations to compute heap bit summaries

The new implementation is much faster in all cases. name old time/op new time/op delta PallocBitsSummarize/Unpacked00-16 142ns ± 1% 7ns ± 2% -94.75% (p=0.000 n=10+9) PallocBitsSummarize/UnpackedFFFFFFFFFFFFFFFF-16 172ns ± 0% 24ns ± 0% -86.02% (p=0.000 n=9+9) PallocBitsSummarize/UnpackedAA-16 145ns ± 0% 32ns ± 0% -78.16% (p=0.000 n=8+10) PallocBitsSummarize/UnpackedAAAAAAAAAAAAAAAA-16 172ns ± 0% 33ns ± 0% -80.95% (p=0.000 n=9+9) PallocBitsSummarize/Unpacked80000000AAAAAAAA-16 162ns ± 1% 60ns ± 0% -62.69% (p=0.000 n=10+9) PallocBitsSummarize/UnpackedAAAAAAAA00000001-16 163ns ± 0% 68ns ± 1% -58.47% (p=0.000 n=8+10) PallocBitsSummarize/UnpackedBBBBBBBBBBBBBBBB-16 172ns ± 0% 35ns ± 0% -79.70% (p=0.000 n=9+9) PallocBitsSummarize/Unpacked80000000BBBBBBBB-16 161ns ± 0% 63ns ± 0% -60.61% (p=0.000 n=8+10) PallocBitsSummarize/UnpackedBBBBBBBB00000001-16 163ns ± 0% 60ns ± 0% -63.14% (p=0.000 n=9+10) PallocBitsSummarize/UnpackedCCCCCCCCCCCCCCCC-16 172ns ± 0% 39ns ± 0% -77.41% (p=0.000 n=7+10) PallocBitsSummarize/Unpacked4444444444444444-16 172ns ± 0% 39ns ± 0% -77.42% (p=0.000 n=7+10) PallocBitsSummarize/Unpacked4040404040404040-16 173ns ± 2% 51ns ± 1% -70.55% (p=0.000 n=10+10) PallocBitsSummarize/Unpacked4000400040004000-16 160ns ± 1% 53ns ± 0% -66.78% (p=0.000 n=10+10) PallocBitsSummarize/Unpacked1000404044CCAAFF-16 169ns ± 1% 59ns ± 1% -65.28% (p=0.000 n=10+10) Change-Id: I94daa645b76a9cf9c93edeb2058d7132216fcb72 Reviewed-on: https://go-review.googlesource.com/c/go/+/240900 Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Michael Knyszek <mknyszek@google.com>
2025-05-30 19:52:53 +00:00 · 2020-07-03 11:28:50 -07:00 · 2020-07-03 11:28:50 -07:00 · 4e5ed83e8d
commit 4e5ed83e8d
parent 88c094c96a
2 changed files with 107 additions and 74 deletions
--- a/src/runtime/mpallocbits.go
+++ b/src/runtime/mpallocbits.go
@ -120,78 +120,99 @@ func (b *pageBits) popcntRange(i, n uint) (s uint) {
 // sake of documentation, 0s are free pages and 1s are allocated pages.
 type pallocBits pageBits

-// consec8tab is a table containing the number of consecutive
-// zero bits for any uint8 value.
-//
-// The table is generated by calling consec8(i) for each
-// possible uint8 value, which is defined as:
-//
-// // consec8 counts the maximum number of consecutive 0 bits
-// // in a uint8.
-// func consec8(n uint8) int {
-// 	n = ^n
-// 	i := 0
-// 	for n != 0 {
-// 		n &= (n << 1)
-// 		i++
-// 	}
-// 	return i
-// }
-var consec8tab = [256]uint{
-	8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
-	4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-	5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2,
-	4, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2,
-	6, 5, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2,
-	4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
-	5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1,
-	4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
-	7, 6, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3,
-	4, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2,
-	5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1,
-	4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
-	6, 5, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2,
-	4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
-	5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1,
-	4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 0,
-}
-
 // summarize returns a packed summary of the bitmap in pallocBits.
 func (b *pallocBits) summarize() pallocSum {
-	// TODO(mknyszek): There may be something more clever to be done
-	// here to make the summarize operation more efficient. For example,
-	// we can compute start and end with 64-bit wide operations easily,
-	// but max is a bit more complex. Perhaps there exists some way to
-	// leverage the 64-bit start and end to our advantage?
-	var start, max, end uint
+	var start, max, cur uint
+	const notSetYet = ^uint(0) // sentinel for start value
+	start = notSetYet
 	for i := 0; i < len(b); i++ {
-		a := b[i]
-		for j := 0; j < 64; j += 8 {
-			k := uint8(a >> j)
+		x := b[i]
+		if x == 0 {
+			cur += 64
+			continue
+		}
+		t := uint(sys.TrailingZeros64(x))
+		l := uint(sys.LeadingZeros64(x))

-			// Compute start.
-			si := uint(sys.TrailingZeros8(k))
-			if start == uint(i*64+j) {
-				start += si
+		// Finish any region spanning the uint64s
+		cur += t
+		if start == notSetYet {
+			start = cur
+		}
+		if cur > max {
+			max = cur
+		}
+		// Final region that might span to next uint64
+		cur = l
+	}
+	if start == notSetYet {
+		// Made it all the way through without finding a single 1 bit.
+		const n = uint(64 * len(b))
+		return packPallocSum(n, n, n)
+	}
+	if cur > max {
+		max = cur
+	}
+	if max >= 64-2 {
+		// There is no way an internal run of zeros could beat max.
+		return packPallocSum(start, max, cur)
+	}
+	// Now look inside each uint64 for runs of zeros.
+	// All uint64s must be nonzero, or we would have aborted above.
+outer:
+	for i := 0; i < len(b); i++ {
+		x := b[i]
+
+		// Look inside this uint64. We have a pattern like
+		// 000000 1xxxxx1 000000
+		// We need to look inside the 1xxxxx1 for any contiguous
+		// region of zeros.
+
+		// We already know the trailing zeros are no larger than max. Remove them.
+		x >>= sys.TrailingZeros64(x) & 63
+		if x&(x+1) == 0 { // no more zeros (except at the top).
+			continue
+		}
+
+		// Strategy: shrink all runs of zeros by max. If any runs of zero
+		// remain, then we've identified a larger maxiumum zero run.
+		p := max     // number of zeros we still need to shrink by.
+		k := uint(1) // current minimum length of runs of ones in x.
+		for {
+			// Shrink all runs of zeros by p places (except the top zeros).
+			for p > 0 {
+				if p <= k {
+					// Shift p ones down into the top of each run of zeros.
+					x |= x >> (p & 63)
+					if x&(x+1) == 0 { // no more zeros (except at the top).
+						continue outer
+					}
+					break
+				}
+				// Shift k ones down into the top of each run of zeros.
+				x |= x >> (k & 63)
+				if x&(x+1) == 0 { // no more zeros (except at the top).
+					continue outer
+				}
+				p -= k
+				// We've just doubled the minimum length of 1-runs.
+				// This allows us to shift farther in the next iteration.
+				k *= 2
 			}

-			// Compute max.
-			if end+si > max {
-				max = end + si
-			}
-			if mi := consec8tab[k]; mi > max {
-				max = mi
-			}
-
-			// Compute end.
-			if k == 0 {
-				end += 8
-			} else {
-				end = uint(sys.LeadingZeros8(k))
+			// The length of the lowest-order zero run is an increment to our maximum.
+			j := uint(sys.TrailingZeros64(^x)) // count contiguous trailing ones
+			x >>= j & 63                       // remove trailing ones
+			j = uint(sys.TrailingZeros64(x))   // count contiguous trailing zeros
+			x >>= j & 63                       // remove zeros
+			max += j                           // we have a new maximum!
+			if x&(x+1) == 0 {                  // no more zeros (except at the top).
+				continue outer
 			}
+			p = j // remove j more zeros from each zero run.
 		}
 	}
-	return packPallocSum(start, max, end)
+	return packPallocSum(start, max, cur)
 }

 // find searches for npages contiguous free pages in pallocBits and returns
--- a/src/runtime/mpallocbits_test.go
+++ b/src/runtime/mpallocbits_test.go
@ -101,7 +101,7 @@ func invertPallocBits(b *PallocBits) {

 // Ensures two packed summaries are identical, and reports a detailed description
 // of the difference if they're not.
-func checkPallocSum(t *testing.T, got, want PallocSum) {
+func checkPallocSum(t testing.TB, got, want PallocSum) {
 	if got.Start() != want.Start() {
 		t.Errorf("inconsistent start: got %d, want %d", got.Start(), want.Start())
 	}
@ -297,17 +297,29 @@ func TestPallocBitsSummarize(t *testing.T) {

 // Benchmarks how quickly we can summarize a PallocBits.
 func BenchmarkPallocBitsSummarize(b *testing.B) {
-	buf0 := new(PallocBits)
-	buf1 := new(PallocBits)
-	for i := 0; i < len(buf1); i++ {
-		buf1[i] = ^uint64(0)
+	patterns := []uint64{
+		0,
+		^uint64(0),
+		0xaa,
+		0xaaaaaaaaaaaaaaaa,
+		0x80000000aaaaaaaa,
+		0xaaaaaaaa00000001,
+		0xbbbbbbbbbbbbbbbb,
+		0x80000000bbbbbbbb,
+		0xbbbbbbbb00000001,
+		0xcccccccccccccccc,
+		0x4444444444444444,
+		0x4040404040404040,
+		0x4000400040004000,
+		0x1000404044ccaaff,
 	}
-	bufa := new(PallocBits)
-	for i := 0; i < len(bufa); i++ {
-		bufa[i] = 0xaa
-	}
-	for _, buf := range []*PallocBits{buf0, buf1, bufa} {
-		b.Run(fmt.Sprintf("Unpacked%02X", buf[0]), func(b *testing.B) {
+	for _, p := range patterns {
+		buf := new(PallocBits)
+		for i := 0; i < len(buf); i++ {
+			buf[i] = p
+		}
+		b.Run(fmt.Sprintf("Unpacked%02X", p), func(b *testing.B) {
+			checkPallocSum(b, buf.Summarize(), SummarizeSlow(buf))
 			for i := 0; i < b.N; i++ {
 				buf.Summarize()
 			}