runtime: avoid MADV_HUGEPAGE for heap memory
Currently the runtime marks all new memory as MADV_HUGEPAGE on Linux and manages its hugepage eligibility status. Unfortunately, the default THP behavior on most Linux distros is that MADV_HUGEPAGE blocks while the kernel eagerly reclaims and compacts memory to allocate a hugepage.

This direct reclaim and compaction is unbounded, and may result in significant application thread stalls. In really bad cases, this can exceed 100s of ms or even seconds.

Really all we want is to undo MADV_NOHUGEPAGE marks and let the default Linux paging behavior take over, but the only way to unmark a region as MADV_NOHUGEPAGE is to also mark it MADV_HUGEPAGE.

The overall strategy of trying to keep hugepages for the heap unbroken however is sound. So instead let's use the new shiny MADV_COLLAPSE if it exists. MADV_COLLAPSE makes a best-effort synchronous attempt at collapsing the physical memory backing a memory region into a hugepage. We'll use MADV_COLLAPSE where we would've used MADV_HUGEPAGE, and stop using MADV_NOHUGEPAGE altogether.

Because MADV_COLLAPSE is synchronous, it's also important to not re-collapse huge pages if the huge pages are likely part of some large allocation. Although in many cases it's advantageous to back these allocations with hugepages because they're contiguous, eagerly collapsing every hugepage means having to page in at least part of the large allocation.

However, because we won't use MADV_NOHUGEPAGE anymore, we'll no longer handle the fact that khugepaged might come in and back some memory we returned to the OS with a hugepage. I've come to the conclusion that this is basically unavoidable without a new madvise flag and that it's just not a good default. If this change lands, advice about Linux huge page settings will be added to the GC guide.

Verified that this change doesn't regress Sweet, at least not on my machine with:

/sys/kernel/mm/transparent_hugepage/enabled [always or madvise]
/sys/kernel/mm/transparent_hugepage/defrag [madvise]
/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none [0 or 511]

Unfortunately, this workaround means that we only get forced hugepages on Linux 6.1+.

Fixes #61718.

Change-Id: I7f4a7ba397847de29f800a99f9cb66cb2720a533
Reviewed-on: https://go-review.googlesource.com/c/go/+/516795
Reviewed-by: Austin Clements <austin@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
Auto-Submit: Michael Knyszek <mknyszek@google.com>
commit 9f9bb26880
parent 5dda490372
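The verification note in the commit message lists three transparent hugepage sysfs knobs. The paths below come straight from that message; the small reader program is only an illustrative sketch for checking them on your own machine (for enabled and defrag, the active setting is the bracketed word).

package main

import (
	"fmt"
	"os"
	"strings"
)

func main() {
	// These paths are the ones named in the commit message.
	knobs := []string{
		"/sys/kernel/mm/transparent_hugepage/enabled",
		"/sys/kernel/mm/transparent_hugepage/defrag",
		"/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none",
	}
	for _, path := range knobs {
		data, err := os.ReadFile(path)
		if err != nil {
			fmt.Printf("%s: unreadable (%v)\n", path, err)
			continue
		}
		fmt.Printf("%s: %s\n", path, strings.TrimSpace(string(data)))
	}
}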
@@ -23,6 +23,7 @@ const (
 	_MADV_FREE       = 0x8
 	_MADV_HUGEPAGE   = 0xe
 	_MADV_NOHUGEPAGE = 0xf
+	_MADV_COLLAPSE   = 0x19

 	_SA_RESTART  = 0x10000000
 	_SA_ONSTACK  = 0x8000000

@@ -23,6 +23,7 @@ const (
 	_MADV_FREE       = 0x8
 	_MADV_HUGEPAGE   = 0xe
 	_MADV_NOHUGEPAGE = 0xf
+	_MADV_COLLAPSE   = 0x19

 	_SA_RESTART  = 0x10000000
 	_SA_ONSTACK  = 0x8000000

@@ -25,6 +25,7 @@ const (
 	_MADV_FREE       = 0x8
 	_MADV_HUGEPAGE   = 0xe
 	_MADV_NOHUGEPAGE = 0xf
+	_MADV_COLLAPSE   = 0x19

 	_SA_RESTART  = 0x10000000
 	_SA_ONSTACK  = 0x8000000

@@ -23,6 +23,7 @@ const (
 	_MADV_FREE       = 0x8
 	_MADV_HUGEPAGE   = 0xe
 	_MADV_NOHUGEPAGE = 0xf
+	_MADV_COLLAPSE   = 0x19

 	_SA_RESTART  = 0x10000000
 	_SA_ONSTACK  = 0x8000000

@@ -24,6 +24,7 @@ const (
 	_MADV_FREE       = 0x8
 	_MADV_HUGEPAGE   = 0xe
 	_MADV_NOHUGEPAGE = 0xf
+	_MADV_COLLAPSE   = 0x19

 	_SA_RESTART  = 0x10000000
 	_SA_ONSTACK  = 0x8000000

@@ -26,6 +26,7 @@ const (
 	_MADV_FREE       = 0x8
 	_MADV_HUGEPAGE   = 0xe
 	_MADV_NOHUGEPAGE = 0xf
+	_MADV_COLLAPSE   = 0x19

 	_SA_RESTART  = 0x10000000
 	_SA_ONSTACK  = 0x8000000

@@ -26,6 +26,7 @@ const (
 	_MADV_FREE       = 0x8
 	_MADV_HUGEPAGE   = 0xe
 	_MADV_NOHUGEPAGE = 0xf
+	_MADV_COLLAPSE   = 0x19

 	_SA_RESTART  = 0x10000000
 	_SA_ONSTACK  = 0x8000000

@@ -23,6 +23,7 @@ const (
 	_MADV_FREE       = 0x8
 	_MADV_HUGEPAGE   = 0xe
 	_MADV_NOHUGEPAGE = 0xf
+	_MADV_COLLAPSE   = 0x19

 	_SA_RESTART  = 0x10000000
 	_SA_ONSTACK  = 0x8000000

@@ -23,6 +23,7 @@ const (
 	_MADV_FREE       = 0x8
 	_MADV_HUGEPAGE   = 0xe
 	_MADV_NOHUGEPAGE = 0xf
+	_MADV_COLLAPSE   = 0x19

 	_SA_RESTART  = 0x10000000
 	_SA_ONSTACK  = 0x8000000

@@ -24,6 +24,7 @@ const (
 	_MADV_FREE       = 0x8
 	_MADV_HUGEPAGE   = 0xe
 	_MADV_NOHUGEPAGE = 0xf
+	_MADV_COLLAPSE   = 0x19

 	_SA_RESTART  = 0x10000000
 	_SA_ONSTACK  = 0x8000000

@@ -24,6 +24,7 @@ const (
 	_MADV_FREE       = 0x8
 	_MADV_HUGEPAGE   = 0xe
 	_MADV_NOHUGEPAGE = 0xf
+	_MADV_COLLAPSE   = 0x19

 	_SA_RESTART  = 0x10000000
 	_SA_ONSTACK  = 0x8000000
@@ -1820,8 +1820,8 @@ func (s *ScavengeIndex) SetEmpty(ci ChunkIdx) {
 	s.i.setEmpty(chunkIdx(ci))
 }

-func (s *ScavengeIndex) SetNoHugePage(ci ChunkIdx) bool {
-	return s.i.setNoHugePage(chunkIdx(ci))
+func (s *ScavengeIndex) SetNoHugePage(ci ChunkIdx) {
+	s.i.setNoHugePage(chunkIdx(ci))
 }

 func CheckPackScavChunkData(gen uint32, inUse, lastInUse uint16, flags uint8) bool {
@@ -91,6 +91,12 @@ func sysNoHugePage(v unsafe.Pointer, n uintptr) {
 	sysNoHugePageOS(v, n)
 }

+// sysHugePageCollapse attempts to immediately back the provided memory region
+// with huge pages. It is best-effort and may fail silently.
+func sysHugePageCollapse(v unsafe.Pointer, n uintptr) {
+	sysHugePageCollapseOS(v, n)
+}
+
 // sysFree transitions a memory region from any state to None. Therefore, it
 // returns memory unconditionally. It is used if an out-of-memory error has been
 // detected midway through an allocation or to carve out an aligned section of
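The hunk above adds a portable sysHugePageCollapse that simply forwards to an OS-specific sysHugePageCollapseOS, which is a no-op everywhere except Linux. The following is a rough, self-contained sketch of that wrapper-plus-stub shape; the names are illustrative, and a GOOS check stands in for the runtime's per-file OS selection.

package main

import (
	"fmt"
	"runtime"
	"unsafe"
)

// hugePageCollapse is the portable entry point, mirroring sysHugePageCollapse.
func hugePageCollapse(v unsafe.Pointer, n uintptr) {
	hugePageCollapseOS(v, n)
}

// hugePageCollapseOS stands in for the per-OS stubs. In the real runtime the
// implementation is chosen by file (mem_linux.go vs. the other mem_*.go
// files), not by a GOOS check; the non-Linux versions are empty.
func hugePageCollapseOS(v unsafe.Pointer, n uintptr) {
	if runtime.GOOS != "linux" {
		return // best-effort: silently do nothing where MADV_COLLAPSE is unavailable
	}
	// On Linux this is where madvise(v, n, MADV_COLLAPSE) would be issued.
	fmt.Printf("would request collapse of %d bytes at %p\n", n, v)
}

func main() {
	buf := make([]byte, 1<<20)
	hugePageCollapse(unsafe.Pointer(&buf[0]), uintptr(len(buf)))
}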
@@ -41,6 +41,9 @@ func sysHugePageOS(v unsafe.Pointer, n uintptr) {
 func sysNoHugePageOS(v unsafe.Pointer, n uintptr) {
 }

+func sysHugePageCollapseOS(v unsafe.Pointer, n uintptr) {
+}
+
 // Don't split the stack as this function may be invoked without a valid G,
 // which prevents us from allocating more stack.
 //
@@ -39,6 +39,9 @@ func sysHugePageOS(v unsafe.Pointer, n uintptr) {
 func sysNoHugePageOS(v unsafe.Pointer, n uintptr) {
 }

+func sysHugePageCollapseOS(v unsafe.Pointer, n uintptr) {
+}
+
 // Don't split the stack as this function may be invoked without a valid G,
 // which prevents us from allocating more stack.
 //
@@ -39,6 +39,9 @@ func sysHugePageOS(v unsafe.Pointer, n uintptr) {
 func sysNoHugePageOS(v unsafe.Pointer, n uintptr) {
 }

+func sysHugePageCollapseOS(v unsafe.Pointer, n uintptr) {
+}
+
 // Don't split the stack as this function may be invoked without a valid G,
 // which prevents us from allocating more stack.
 //
@@ -116,6 +116,31 @@ func sysNoHugePageOS(v unsafe.Pointer, n uintptr) {
 	madvise(v, n, _MADV_NOHUGEPAGE)
 }

+func sysHugePageCollapseOS(v unsafe.Pointer, n uintptr) {
+	if uintptr(v)&(physPageSize-1) != 0 {
+		// The Linux implementation requires that the address
+		// addr be page-aligned, and allows length to be zero.
+		throw("unaligned sysHugePageCollapseOS")
+	}
+	if physHugePageSize == 0 {
+		return
+	}
+	// N.B. If you find yourself debugging this code, note that
+	// this call can fail with EAGAIN because it's best-effort.
+	// Also, when it returns an error, it's only for the last
+	// huge page in the region requested.
+	//
+	// It can also sometimes return EINVAL if the corresponding
+	// region hasn't been backed by physical memory. This is
+	// difficult to guarantee in general, and it also means
+	// there's no way to distinguish whether this syscall is
+	// actually available. Oops.
+	//
+	// Anyway, that's why this call just doesn't bother checking
+	// any errors.
+	madvise(v, n, _MADV_COLLAPSE)
+}
+
 // Don't split the stack as this function may be invoked without a valid G,
 // which prevents us from allocating more stack.
 //
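For readers who want to see MADV_COLLAPSE in action outside the runtime, here is a hedged, Linux-only sketch using golang.org/x/sys/unix. The constant value 0x19 mirrors the one added in the defs hunks above; newer x/sys/unix releases may also export it as unix.MADV_COLLAPSE. Like the runtime code above, the sketch treats failure (EAGAIN, or EINVAL on kernels older than 6.1) as non-fatal.

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

// madvCollapse mirrors the _MADV_COLLAPSE value added in the hunks above.
const madvCollapse = 0x19

func main() {
	const size = 4 << 20 // two 2 MiB huge pages' worth of address space

	// Mmap returns page-aligned memory, which satisfies the alignment
	// requirement checked by sysHugePageCollapseOS above.
	mem, err := unix.Mmap(-1, 0, size,
		unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_ANON|unix.MAP_PRIVATE)
	if err != nil {
		panic(err)
	}
	defer unix.Munmap(mem)

	// Touch the region first: MADV_COLLAPSE can return EINVAL if no part
	// of the range is backed by physical memory yet.
	for i := 0; i < len(mem); i += 4096 {
		mem[i] = 1
	}

	// Best-effort, just like the runtime: errors are reported, not fatal.
	if err := unix.Madvise(mem, madvCollapse); err != nil {
		fmt.Println("collapse not performed:", err)
		return
	}
	fmt.Println("collapse requested for", len(mem), "bytes")
}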
@@ -163,6 +163,9 @@ func sysHugePageOS(v unsafe.Pointer, n uintptr) {
 func sysNoHugePageOS(v unsafe.Pointer, n uintptr) {
 }

+func sysHugePageCollapseOS(v unsafe.Pointer, n uintptr) {
+}
+
 func sysMapOS(v unsafe.Pointer, n uintptr) {
 }

@@ -97,6 +97,9 @@ func sysHugePageOS(v unsafe.Pointer, n uintptr) {
 func sysNoHugePageOS(v unsafe.Pointer, n uintptr) {
 }

+func sysHugePageCollapseOS(v unsafe.Pointer, n uintptr) {
+}
+
 // Don't split the stack as this function may be invoked without a valid G,
 // which prevents us from allocating more stack.
 //
@@ -771,7 +771,7 @@ func (p *pageAlloc) scavengeOne(ci chunkIdx, searchIdx uint, max uintptr) uintpt

 		// Grab whether the chunk is hugepage backed and if it is,
 		// clear it. We're about to break up this huge page.
-		shouldNoHugePage := p.scav.index.setNoHugePage(ci)
+		p.scav.index.setNoHugePage(ci)

 		// With that done, it's safe to unlock.
 		unlock(p.mheapLock)

@@ -781,9 +781,6 @@ func (p *pageAlloc) scavengeOne(ci chunkIdx, searchIdx uint, max uintptr) uintpt

 		// Only perform sys* operations if we're not in a test.
 		// It's dangerous to do so otherwise.
-		if shouldNoHugePage {
-			sysNoHugePage(unsafe.Pointer(chunkBase(ci)), pallocChunkBytes)
-		}
 		sysUnused(unsafe.Pointer(addr), uintptr(npages)*pageSize)

 		// Update global accounting only when not in test, otherwise
@@ -1134,17 +1131,34 @@ func (s *scavengeIndex) find(force bool) (chunkIdx, uint) {
 }

 // alloc updates metadata for chunk at index ci with the fact that
-// an allocation of npages occurred.
+// an allocation of npages occurred. It also eagerly attempts to collapse
+// the chunk's memory into hugepage if the chunk has become sufficiently
+// dense and we're not allocating the whole chunk at once (which suggests
+// the allocation is part of a bigger one and it's probably not worth
+// eagerly collapsing).
 //
 // alloc may only run concurrently with find.
 func (s *scavengeIndex) alloc(ci chunkIdx, npages uint) {
 	sc := s.chunks[ci].load()
 	sc.alloc(npages, s.gen)
 	if !sc.isHugePage() && sc.inUse > scavChunkHiOccPages {
-		// Mark dense chunks as specifically backed by huge pages.
+		// Mark that we're considering this chunk as backed by huge pages.
 		sc.setHugePage()
-		if !s.test {
-			sysHugePage(unsafe.Pointer(chunkBase(ci)), pallocChunkBytes)
+
+		// Collapse dense chunks into huge pages and mark that
+		// we did that, but only if we're not allocating to
+		// use the entire chunk. If we're allocating an entire chunk,
+		// this is likely part of a much bigger allocation. For
+		// instance, if the caller is allocating a 1 GiB slice of bytes, we
+		// don't want to go and manually collapse all those pages; we want
+		// them to be demand-paged. If the caller is actually going to use
+		// all that memory, it'll naturally get backed by huge pages later.
+		//
+		// This also avoids having sysHugePageCollapse fail. On Linux,
+		// the call requires that some part of the huge page being collapsed
+		// is already paged in.
+		if !s.test && npages < pallocChunkPages {
+			sysHugePageCollapse(unsafe.Pointer(chunkBase(ci)), pallocChunkBytes)
 		}
 	}
 	s.chunks[ci].store(sc)
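The comment in the hunk above encodes a simple policy: collapse a chunk only once it has become densely occupied, and skip the collapse when a single allocation consumes the whole chunk. Below is a toy sketch of that decision. The constants are illustrative assumptions standing in for pallocChunkPages and scavChunkHiOccPages, not the runtime's actual values.

package main

import "fmt"

const (
	chunkPages = 512 // assumed pages per chunk
	hiOccPages = 496 // assumed density threshold, roughly 97% of the chunk
)

// shouldCollapse reports whether allocating npages into a chunk that will then
// have inUse pages allocated should trigger an eager huge page collapse.
func shouldCollapse(alreadyMarked bool, inUse, npages int) bool {
	dense := inUse > hiOccPages        // chunk has become sufficiently dense
	wholeChunk := npages >= chunkPages // likely part of a much bigger allocation
	return !alreadyMarked && dense && !wholeChunk
}

func main() {
	fmt.Println(shouldCollapse(false, 500, 8))          // true: dense, small allocation
	fmt.Println(shouldCollapse(false, 512, chunkPages)) // false: whole-chunk allocation
	fmt.Println(shouldCollapse(true, 510, 8))           // false: already collapsed
	fmt.Println(shouldCollapse(false, 100, 8))          // false: not dense enough yet
}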
@@ -1204,14 +1218,13 @@ func (s *scavengeIndex) setEmpty(ci chunkIdx) {
 // Returns true if the set was successful (not already backed by huge pages).
 //
 // setNoHugePage may only run concurrently with find.
-func (s *scavengeIndex) setNoHugePage(ci chunkIdx) bool {
+func (s *scavengeIndex) setNoHugePage(ci chunkIdx) {
 	val := s.chunks[ci].load()
 	if !val.isHugePage() {
-		return false
+		return
 	}
 	val.setNoHugePage()
 	s.chunks[ci].store(val)
-	return true
 }

 // atomicScavChunkData is an atomic wrapper around a scavChunkData
@@ -1282,8 +1295,9 @@ const (
 	// file. The reason we say "HasFree" here is so the zero value is
 	// correct for a newly-grown chunk. (New memory is scavenged.)
 	scavChunkHasFree scavChunkFlags = 1 << iota
-	// scavChunkNoHugePage indicates whether this chunk has been marked
-	// sysNoHugePage. If not set, it means the chunk is marked sysHugePage.
+	// scavChunkNoHugePage indicates whether this chunk has had any huge
+	// pages broken by the scavenger.
+	//.
 	// The negative here is unfortunate, but necessary to make it so that
 	// the zero value of scavChunkData accurately represents the state of
 	// a newly-grown chunk. (New memory is marked as backed by huge pages.)
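The comment above explains why the flag is phrased negatively: the zero value must describe a newly-grown chunk, which starts out treated as huge-page backed. Here is a minimal sketch of that inverted-flag idea; the names are illustrative and not the runtime's scavChunkData API.

package main

import "fmt"

type chunkFlags uint8

// chunkNoHugePage records that the scavenger has broken huge pages in this
// chunk; the zero value therefore reads as "still huge-page backed".
const chunkNoHugePage chunkFlags = 1 << 0

func (f chunkFlags) isHugePage() bool { return f&chunkNoHugePage == 0 }

func (f *chunkFlags) setNoHugePage() { *f |= chunkNoHugePage }

func main() {
	var f chunkFlags            // zero value: a newly-grown chunk
	fmt.Println(f.isHugePage()) // true without any explicit initialization
	f.setNoHugePage()           // scavenger broke up this chunk's huge pages
	fmt.Println(f.isHugePage()) // false
}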
@@ -426,11 +426,6 @@ func (p *pageAlloc) grow(base, size uintptr) {
 	// we need to ensure this newly-free memory is visible in the
 	// summaries.
 	p.update(base, size/pageSize, true, false)
-
-	// Mark all new memory as huge page eligible.
-	if !p.test {
-		sysHugePage(unsafe.Pointer(base), size)
-	}
 }

 // enableChunkHugePages enables huge pages for the chunk bitmap mappings (disabled by default).