From 3047c0c3f8af347076716ecba27870a994252083 Mon Sep 17 00:00:00 2001 From: Cole Leavitt Date: Thu, 26 Feb 2026 17:48:26 -0700 Subject: [PATCH 1/2] bmalloc: add 1ms backoff and retry cap to SYSCALL/PAS_SYSCALL EAGAIN loops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SYSCALL and PAS_SYSCALL macros retry syscalls on EAGAIN in a zero-delay tight loop. When madvise(MADV_DONTDUMP) returns EAGAIN due to kernel mmap_write_lock contention (VMA split/merge allocation failure under memory pressure), this causes 100% CPU usage across all GC threads — effectively freezing the application. Add usleep(1000) backoff (1ms) and cap retries at 100 (100ms total). madvise failures here are advisory, not fatal, so breaking after max retries is safe. This matches the existing Windows precedent in libpas/pas_page_malloc.c virtual_alloc_with_retry() which uses Sleep(50ms) with 10 max retries. Upstream Apple WebKit has the same zero-delay loop and has not yet addressed this. tcmalloc uses bounded retries (3 attempts) for expensive madvise operations. sched_yield() was considered but is explicitly not recommended for this use case (Red Hat RHEL-RT guide). Related: oven-sh/bun#17723, oven-sh/bun#27371, oven-sh/bun#27196, google/tcmalloc#247, golang/go#61718 --- Source/bmalloc/bmalloc/BSyscall.h | 13 ++++++++++++- Source/bmalloc/libpas/src/libpas/pas_utils.h | 15 ++++++++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/Source/bmalloc/bmalloc/BSyscall.h b/Source/bmalloc/bmalloc/BSyscall.h index e5ecfb643b42..9b26d518f940 100644 --- a/Source/bmalloc/bmalloc/BSyscall.h +++ b/Source/bmalloc/bmalloc/BSyscall.h @@ -26,7 +26,18 @@ #pragma once #include <errno.h> +#include <unistd.h> +/* Retry syscalls on EAGAIN with 1ms backoff to avoid busy-spinning. + madvise(MADV_DONTDUMP) can return EAGAIN under kernel mmap_write_lock + contention, causing 100% CPU usage with concurrent GC threads. 
+ Cap retries at 100 (100ms total) as a safety net — madvise failures + here are advisory, not fatal. See also: virtual_alloc_with_retry() + in libpas/pas_page_malloc.c for the Windows equivalent. */ #define SYSCALL(x) do { \ - while ((x) == -1 && errno == EAGAIN) { } \ + int _syscall_tries = 0; \ + while ((x) == -1 && errno == EAGAIN) { \ + if (++_syscall_tries > 100) break; \ + usleep(1000); \ + } \ } while (0); diff --git a/Source/bmalloc/libpas/src/libpas/pas_utils.h b/Source/bmalloc/libpas/src/libpas/pas_utils.h index 7036db695b81..6f6eae0b6d75 100644 --- a/Source/bmalloc/libpas/src/libpas/pas_utils.h +++ b/Source/bmalloc/libpas/src/libpas/pas_utils.h @@ -46,6 +46,10 @@ #include #include +#if !PAS_OS(WINDOWS) +#include <unistd.h> +#endif + #if PAS_OS(WINDOWS) #include #endif @@ -1263,8 +1267,17 @@ static inline bool pas_is_divisible_by(unsigned value, uint64_t magic_constant) enum cpp_initialization_t { cpp_initialization }; #endif +/* Retry syscalls on EAGAIN with 1ms backoff to avoid busy-spinning. + madvise(MADV_DONTDUMP) can return EAGAIN under kernel mmap_write_lock + contention, causing 100% CPU usage with concurrent GC threads. + Cap retries at 100 (100ms total) as a safety net — madvise failures + here are advisory, not fatal. */ #define PAS_SYSCALL(x) do { \ - while ((x) == -1 && errno == EAGAIN) { } \ + int _pas_syscall_tries = 0; \ + while ((x) == -1 && errno == EAGAIN) { \ + if (++_pas_syscall_tries > 100) break; \ + usleep(1000); \ + } \ } while (0) PAS_END_EXTERN_C; From 455fa9f3174322f563d2b9bc8f7b5495105f58b6 Mon Sep 17 00:00:00 2001 From: Cole Leavitt Date: Thu, 26 Feb 2026 17:52:29 -0700 Subject: [PATCH 2/2] bmalloc: remove MADV_DONTDUMP/MADV_DODUMP to eliminate mmap_write_lock contention MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MADV_DONTDUMP is the sole cause of the mmap_write_lock contention that triggers the EAGAIN spin loop fixed in the previous commit. 
Unlike MADV_DONTNEED which only acquires the kernel's mmap_read_lock (no contention), MADV_DONTDUMP requires mmap_write_lock — a single process-wide exclusive lock. With concurrent GC threads all calling vmDeallocatePhysicalPages(), MADV_DONTDUMP creates a serialization point in the kernel. Under memory pressure, VMA split/merge allocation fails and the kernel returns EAGAIN, which (before the previous fix) caused 100% CPU spin. MADV_DONTDUMP only affects core dump size — it has zero impact on memory reclamation or allocation correctness. MADV_DODUMP (its symmetric counterpart in vmAllocatePhysicalPages/commit_impl) is also removed. This is the root cause elimination (vs the previous commit which is the defensive mitigation). Together they fully resolve the issue. Removed 4 madvise calls: - VMAllocate.h vmDeallocatePhysicalPages: MADV_DONTDUMP - VMAllocate.h vmAllocatePhysicalPages: MADV_DODUMP - pas_page_malloc.c decommit_impl: MADV_DONTDUMP - pas_page_malloc.c commit_impl: MADV_DODUMP --- Source/bmalloc/bmalloc/VMAllocate.h | 10 ++++------ Source/bmalloc/libpas/src/libpas/pas_page_malloc.c | 9 ++++++--- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/Source/bmalloc/bmalloc/VMAllocate.h b/Source/bmalloc/bmalloc/VMAllocate.h index 06894519b5d1..07441912205f 100644 --- a/Source/bmalloc/bmalloc/VMAllocate.h +++ b/Source/bmalloc/bmalloc/VMAllocate.h @@ -302,9 +302,9 @@ inline void vmDeallocatePhysicalPages(void* p, size_t vmSize) SYSCALL(madvise(p, vmSize, MADV_FREE)); #else SYSCALL(madvise(p, vmSize, MADV_DONTNEED)); -#if BOS(LINUX) - SYSCALL(madvise(p, vmSize, MADV_DONTDUMP)); -#endif +/* MADV_DONTDUMP removed: it only reduces core dump size but requires + kernel mmap_write_lock, causing severe contention with concurrent + GC threads. MADV_DONTNEED (above) only needs mmap_read_lock. */ #endif } @@ -319,9 +319,7 @@ inline void vmAllocatePhysicalPages(void* p, size_t vmSize) // Instead the kernel will commit pages as they are touched. 
#else SYSCALL(madvise(p, vmSize, MADV_NORMAL)); -#if BOS(LINUX) - SYSCALL(madvise(p, vmSize, MADV_DODUMP)); -#endif +/* MADV_DODUMP removed: symmetric with MADV_DONTDUMP removal above. */ #endif } #else diff --git a/Source/bmalloc/libpas/src/libpas/pas_page_malloc.c b/Source/bmalloc/libpas/src/libpas/pas_page_malloc.c index 3f21987adef5..23fab5ff3de6 100644 --- a/Source/bmalloc/libpas/src/libpas/pas_page_malloc.c +++ b/Source/bmalloc/libpas/src/libpas/pas_page_malloc.c @@ -350,8 +350,10 @@ static void commit_impl(void* ptr, size_t size, bool do_mprotect, pas_mmap_capab #endif } #if PAS_OS(LINUX) - PAS_SYSCALL(madvise(ptr, size, MADV_DODUMP)); +/* MADV_DODUMP removed: symmetric with MADV_DONTDUMP removal in decommit_impl. + MADV_DONTDUMP only reduces core dump size but requires kernel mmap_write_lock, + causing severe contention with concurrent GC threads. */ #elif PAS_OS(WINDOWS) /* Sometimes the returned memInfo.RegionSize < size, and VirtualAlloc can't span regions We loop to make sure we get the full requested range. */ @@ -415,7 +417,9 @@ static void decommit_impl(void* ptr, size_t size, PAS_SYSCALL(madvise(ptr, size, MADV_FREE)); #elif PAS_OS(LINUX) PAS_SYSCALL(madvise(ptr, size, MADV_DONTNEED)); - PAS_SYSCALL(madvise(ptr, size, MADV_DONTDUMP)); + /* MADV_DONTDUMP removed: it only reduces core dump size but requires + kernel mmap_write_lock, causing severe contention with concurrent + GC threads. MADV_DONTNEED (above) only needs mmap_read_lock. */ #elif PAS_OS(WINDOWS) // DiscardVirtualMemory returns memory to the OS faster, but fails sometimes on Windows 10 // Fall back to VirtualAlloc in those cases