stack growth direction: how hard can it be?
This is another horror story with lots of technical details on how
glibc
was unhappy (again!) on ia64
.
A week ago I decided to give ia64 more love in gentoo by
cleaning up the backlog of bugs related to the ia64 team:
hotlist
The backlog was not too large: about 150 bugs.
ruby garbage collection bug
Many bugs were blocked on SIGSEGV
ing
ruby
.
The curious fact is that ruby-2.1.9 used to work on ia64 and
broke at some point. These kinds of bugs (as opposed to bringing up
software support for the first time) are usually easy to fix. Something
got slightly off and exposed a SIGSEGV. We need to find that something
and tweak it a tiny bit.
Let’s look at the SIGSEGV
(it happens as you compile ruby
on
ia64
):
Program received signal SIGSEGV, Segmentation fault.
mark_locations_array (objspace=0x6000000000045db0, x=0x0, n=864692227966763116) at gc.c:3297
3297 v = *x;
(gdb) bt
#0 mark_locations_array (objspace=0x6000000000045db0, x=0x0, n=864692227966763116) at gc.c:3297
#1 0x400000000014a040 in gc_mark_locations (objspace=0x6000000000045db0, start=0x0, end=0x6000080000000368) at gc.c:3310
#2 0x400000000014b3a0 in mark_current_machine_context (objspace=0x6000000000045db0, th=0x60000000000455b0) at gc.c:3500
#3 0x400000000014dfe0 in gc_mark_roots (objspace=0x6000000000045db0, full_mark=0, categoryp=0x0) at gc.c:4105
#4 0x400000000014e6b0 in gc_marks_body (objspace=0x6000000000045db0, full_mark=0) at gc.c:4164
#5 0x400000000014f260 in gc_marks (objspace=0x6000000000045db0, full_mark=0) at gc.c:4526
#6 0x40000000001525c0 in garbage_collect_body (objspace=0x6000000000045db0, full_mark=0, immediate_sweep=0, reason=256) at gc.c:5024
#7 0x400000000013c010 in heap_prepare_freepage (objspace=0x6000000000045db0, heap=0x6000000000045dc0) at gc.c:1219
#8 0x400000000013c140 in heap_get_freeobj_from_next_freepage (objspace=0x6000000000045db0, heap=0x6000000000045dc0) at gc.c:1237
#9 0x400000000013c360 in heap_get_freeobj (objspace=0x6000000000045db0, heap=0x6000000000045dc0) at gc.c:1259
#10 0x400000000013c950 in newobj_of (klass=0, flags=40, v1=0, v2=0, v3=0) at gc.c:1303
#11 0x400000000013ccc0 in rb_newobj_of (klass=0, flags=40) at gc.c:1356
#12 0x4000000000163740 in hash_alloc (klass=0) at hash.c:289
#13 0x4000000000163860 in rb_hash_new () at hash.c:309
#14 0x400000000050e420 in Init_BareVM () at vm.c:2822
#15 0x40000000000f6b60 in ruby_setup () at eval.c:54
#16 0x40000000000f6f50 in ruby_init () at eval.c:75
#17 0x400000000001b010 in main (argc=9, argv=0x60000fffffffb1d8) at main.c:35
I added a bunch of printf calls to the ruby runtime to figure
out where the NULL pointer dereference came from. It became immediately
obvious: one of the ia64-specific pointers was not initialized and kept
its default NULL value.
While I was adding the printf calls I noticed a lot of #ifdef __ia64
blocks in every place that deals with threads: thread creation,
thread switching, garbage collection. I had almost no idea what that
code was supposed to do, but a very basic understanding of how garbage
collection works suggested that the variable
native_main_thread.register_stack_start should never be NULL on ia64.
Cooking up the fix was trivial (proposed pull request
upstream):
diff --git a/thread_pthread.c b/thread_pthread.c
--- a/thread_pthread.c
+++ b/thread_pthread.c
@@ -740,100 +740,100 @@ ruby_init_stack(volatile VALUE *addr
void
ruby_init_stack(volatile VALUE *addr
#ifdef __ia64
, void *bsp
#endif
)
{
native_main_thread.id = pthread_self();+#ifdef __ia64
+ if (!native_main_thread.register_stack_start ||
+ (VALUE*)bsp < native_main_thread.register_stack_start) {
+ native_main_thread.register_stack_start = (VALUE*)bsp;
+ }
+#endif
#if MAINSTACKADDR_AVAILABLE
if (native_main_thread.stack_maxsize) return;
{
void* stackaddr;
size_t size;
if (get_main_stack(&stackaddr, &size) == 0) {
native_main_thread.stack_maxsize = size;
native_main_thread.stack_start = stackaddr;
reserve_stack(stackaddr, size);
goto bound_check;
}
}
#endif
#ifdef STACK_END_ADDRESS
native_main_thread.stack_start = STACK_END_ADDRESS;
#else
if (!native_main_thread.stack_start ||
STACK_UPPER((VALUE *)(void *)&addr,
native_main_thread.stack_start > addr,
native_main_thread.stack_start < addr)) {
native_main_thread.stack_start = (VALUE *)addr;
}
#endif-#ifdef __ia64
- if (!native_main_thread.register_stack_start ||
- (VALUE*)bsp < native_main_thread.register_stack_start) {
- native_main_thread.register_stack_start = (VALUE*)bsp;
- }
-#endif
{
#if defined(HAVE_GETRLIMIT)
#if defined(PTHREAD_STACK_DEFAULT)
# if PTHREAD_STACK_DEFAULT < RUBY_STACK_SPACE*5
# error "PTHREAD_STACK_DEFAULT is too small"
# endif
size_t size = PTHREAD_STACK_DEFAULT;
#else
size_t size = RUBY_VM_THREAD_VM_STACK_SIZE;
#endif
size_t space;
int pagesize = getpagesize();
struct rlimit rlim;
STACK_GROW_DIR_DETECTION;
if (getrlimit(RLIMIT_STACK, &rlim) == 0) {
size = (size_t)rlim.rlim_cur;
}
addr = native_main_thread.stack_start;
if (IS_STACK_DIR_UPPER()) {
space = ((size_t)((char *)addr + size) / pagesize) * pagesize - (size_t)addr;
}
else {
space = (size_t)addr - ((size_t)((char *)addr - size) / pagesize + 1) * pagesize;
}
native_main_thread.stack_maxsize = space;
#endif
}
#if MAINSTACKADDR_AVAILABLE
bound_check:
#endif
/* If addr is out of range of main-thread stack range estimation, */
/* it should be on co-routine (alternative stack). [Feature #2294] */
{
void *start, *end;
STACK_GROW_DIR_DETECTION;
if (IS_STACK_DIR_UPPER()) {
start = native_main_thread.stack_start;
end = (char *)native_main_thread.stack_start + native_main_thread.stack_maxsize;
}
else {
start = (char *)native_main_thread.stack_start - native_main_thread.stack_maxsize;
end = native_main_thread.stack_start;
}
if ((void *)addr < start || (void *)addr > end) {
/* out of range */
native_main_thread.stack_start = (VALUE *)addr;
native_main_thread.stack_maxsize = 0; /* unknown */
}
} }
The fix is to move the register_stack_start initialization up, so that
it runs before any early exit from the function (the return and the
goto bound_check in the MAINSTACKADDR_AVAILABLE branch).
I think ruby_init_stack used to work because MAINSTACKADDR_AVAILABLE
was not defined when building against older glibc. Perhaps
due to missing HAVE_PTHREAD_GETATTR_NP support or something similar.
See how complicated the detection of STACKADDR_AVAILABLE is:
#if defined HAVE_PTHREAD_GETATTR_NP || defined HAVE_PTHREAD_ATTR_GET_NP
# define STACKADDR_AVAILABLE 1
#elif defined HAVE_PTHREAD_GET_STACKADDR_NP && defined HAVE_PTHREAD_GET_STACKSIZE_NP
# define STACKADDR_AVAILABLE 1
# undef MAINSTACKADDR_AVAILABLE
# define MAINSTACKADDR_AVAILABLE 1
void *pthread_get_stackaddr_np(pthread_t);
size_t pthread_get_stacksize_np(pthread_t);
#elif defined HAVE_THR_STKSEGMENT || defined HAVE_PTHREAD_STACKSEG_NP
# define STACKADDR_AVAILABLE 1
#elif defined HAVE_PTHREAD_GETTHRDS_NP
# define STACKADDR_AVAILABLE 1
#elif defined __HAIKU__
# define STACKADDR_AVAILABLE 1
#elif defined __ia64 && defined _HPUX_SOURCE
# include <sys/dyntune.h>
...
But now STACKADDR_AVAILABLE
is defined and goto bound_check
skips native_main_thread.register_stack_start
initialization
completely.
My patch worked and I was happy. But still it was slightly confusing to
see all that ia64-specific code for stack handling. What is so
special about its stack?
Let’s look at a code example in cont.c
file that scans stack for
heap pointers:
static void
(void *ptr)
cont_mark{
*cont = ptr;
rb_context_t
("cont");
RUBY_MARK_ENTER(cont->value);
rb_gc_mark
(&cont->saved_thread);
rb_thread_mark(cont->saved_thread.self);
rb_gc_mark
if (cont->vm_stack) {
#ifdef CAPTURE_JUST_VALID_VM_STACK
(cont->vm_stack,
rb_gc_mark_locations->vm_stack + cont->vm_stack_slen + cont->vm_stack_clen);
cont#else
(cont->vm_stack,
rb_gc_mark_locations->vm_stack, cont->saved_thread.stack_size);
cont#endif
}
if (cont->machine.stack) {
if (cont->type == CONTINUATION_CONTEXT) {
/* cont */
(cont->machine.stack,
rb_gc_mark_locations->machine.stack + cont->machine.stack_size);
cont} else {
/* fiber */
*th;
rb_thread_t *fib = (rb_fiber_t*)cont;
rb_fiber_t (cont->saved_thread.self, th);
GetThreadPtrif ((th->fiber != fib) && fib->status == RUNNING) {
(cont->machine.stack,
rb_gc_mark_locations->machine.stack + cont->machine.stack_size);
cont}
}
}
#ifdef __ia64
if (cont->machine.register_stack) {
(cont->machine.register_stack,
rb_gc_mark_locations->machine.register_stack + cont->machine.register_stack_size);
cont}
#endif
("cont");
RUBY_MARK_LEAVE}
Additional code under #ifdef __ia64
looked unusual but it didn’t
seem to harm any ruby
tests and I moved on.
binutils
out-of-bounds bug
The next bug was lurking in the binutils-2.28 package, which occasionally
crashed the strip program. In my case the crash happened only when I was
building gcc. The gcc build system happens to call the strip binary when
it compares stage2 and stage3, as one of the stages is built with
debugging sections (using the -gtoggle switch).
The strip
SIGSEGV
fix was also surprisingly trivial and not
directly related to stack (or even ia64
) specifics (upstream
commit):
diff --git a/bfd/elf.c b/bfd/elf.c
index 5f37e7f79c..76c6a5c6a7 100644--- a/bfd/elf.c
+++ b/bfd/elf.c
@@ -1283,7 +1283,8 @@ section_match (const Elf_Internal_Shdr * a,
static unsigned int
find_link (const bfd * obfd, const Elf_Internal_Shdr * iheader, const unsigned int hint)
{
Elf_Internal_Shdr ** oheaders = elf_elfsections (obfd);
unsigned int i;
BFD_ASSERT (iheader != NULL);
/* See PR 20922 for a reproducer of the NULL test. */- if (oheaders[hint] != NULL
+ if (hint < elf_numsections (obfd)
+ && oheaders[hint] != NULL
&& section_match (oheaders[hint], iheader)) return hint;
Here a mysterious hint was used to refer to an out-of-bounds area and
that caused the SIGSEGV. My guess is that it triggered mostly on ia64
because ia64 has many more ELF sections than other
architectures (160 versus 32).
glibc
pthread_create
bug
The next failure happened in the glibc-2.24 package (glibc-2.23
worked fine). It looked like every threaded program crashed around
program shutdown (upstream bug).
This small reproducer was enough to make a program crash:
// how to crash: gcc -O0 -ggdb3 -o r bug.c -pthread && ./r
#include <pthread.h>
static void * f (void * p)
{
return NULL;
}
int main (int argc, const char ** argv)
{
;
pthread_t t(&t, NULL, &f, NULL);
pthread_create
(t, NULL);
pthread_join return 0;
}
Here we create a no-op thread and wait for its shutdown. The
SIGSEGV happened at address 0x8 (another NULL-pointer
dereference). The backtrace was not very informative:
$ gcc -O0 -ggdb3 -o r bug.c -pthread && ./r
Segmentation fault (core dumped)
$ gdb r core
...
Program terminated with signal SIGSEGV, Segmentation fault.
#0 0x2000000000077da0 in start_thread (arg=0x0) at pthread_create.c:432
432 __madvise (pd->stackblock, freesize - PTHREAD_STACK_MIN, MADV_DONTNEED);
[Current thread is 1 (Thread 0x2000000000b6b1f0 (LWP 20912))]
(gdb) list
427 #ifdef _STACK_GROWS_DOWN
428 char *sp = CURRENT_STACK_FRAME;
429 size_t freesize = (sp - (char *) pd->stackblock) & ~pagesize_m1;
430 assert (freesize < pd->stackblock_size);
431 if (freesize > PTHREAD_STACK_MIN)
432 __madvise (pd->stackblock, freesize - PTHREAD_STACK_MIN, MADV_DONTNEED);
433 #else
434 /* Page aligned start of memory to free (higher than or equal
435 to current sp plus the minimum stack size). */
436 void *freeblock = (void*)((size_t)(CURRENT_STACK_FRAME
#0 0x2000000000077da0 in start_thread (arg=0x0) at pthread_create.c:432
pd = 0x0
now = <optimized out>
unwind_buf = <error reading variable unwind_buf (Cannot access memory at address 0xfffffffffffffd90)>
not_first_call = <optimized out>
pagesize_m1 = <optimized out>
sp = 0x2000000000b6a870 ""
freesize = <optimized out>
__PRETTY_FUNCTION__ = "start_thread"
#1 0x0000000000000000 in ?? ()
This crash did not make much sense. At first I thought it was caused by
the pd->stackblock access, where pd had somehow turned into NULL.
But if we look a few lines above (source link), pd is used all over the
place in that function. An strace run also suggested that the crash
happened after the madvise syscall had returned successfully (so
pd->stackblock had a sane value).
The whole start_thread function
is quite large but very straightforward. It does three main things:
1. set up the environment for the current thread: locale data,
   futex robust_lists (efficient mutex runtime support), signal masks
2. run the user’s code with this one line:
THREAD_SETMEM (pd, result, pd->start_routine (pd->arg));
3. tear down the environment: call thread-local destructors, process
   futex robust_lists and free the thread’s stack
Our crash happens in the third phase (3. teardown environment), right at
the place where the thread’s stack is torn down.
The glibc stack teardown is interesting: it does not free the whole stack
because the code responsible for stack cleanup uses that very same stack.
Let’s look at the code in detail:
/* Mark the memory of the stack as usable to the kernel. We free
everything except for the space used for the TCB itself. */
size_t pagesize_m1 = __getpagesize () - 1;
#ifdef _STACK_GROWS_DOWN
char *sp = CURRENT_STACK_FRAME;
size_t freesize = (sp - (char *) pd->stackblock) & ~pagesize_m1;
(freesize < pd->stackblock_size);
assert if (freesize > PTHREAD_STACK_MIN)
(pd->stackblock, freesize - PTHREAD_STACK_MIN, MADV_DONTNEED);
__madvise #else
/* Page aligned start of memory to free (higher than or equal
to current sp plus the minimum stack size). */
void *freeblock = (void*)((size_t)(CURRENT_STACK_FRAME
+ PTHREAD_STACK_MIN
+ pagesize_m1)
& ~pagesize_m1);
char *free_end = (char *) (((uintptr_t) pd - pd->guardsize) & ~pagesize_m1);
/* Is there any space to free? */
if (free_end > (char *)freeblock)
{
size_t freesize = (size_t)(free_end - (char *)freeblock);
(freesize < pd->stackblock_size);
assert (freeblock, freesize, MADV_DONTNEED);
__madvise }
#endif
Here we see two major branches: the _STACK_GROWS_DOWN one
and the #else one (not used on ia64).
pd knows precisely where the thread’s stack
resides: it’s in the [pd->stackblock, pd->stackblock + pd->stackblock_size)
range. _STACK_GROWS_DOWN means that the stack
starts at an address around pd->stackblock + pd->stackblock_size
and grows backwards (to clarify: the stack pointer decreases when
a value is pushed to the stack).
So far so good, no magic here. x86_64
does the same.
First clue
But why does madvise() affect anything? Isn’t it just a hint to the
kernel’s memory manager that can’t go wrong even if you mess up the
arguments?
Does page deallocation happen at all? It’s just a madvise after
all. From man 2 madvise:
Conventional advice values
The advice values listed below allow an application to tell the kernel how it expects to use
some mapped or shared memory areas, so that the kernel can choose appropriate read-ahead and
caching techniques. These advice values do not influence the semantics of the application
(except in the case of MADV_DONTNEED), but may influence its performance. All of the advice
values listed here have analogs in the POSIX-specified posix_madvise(3) function, and the val‐
ues have the same meanings, with the exception of MADV_DONTNEED.
MADV_DONTNEED
Do not expect access in the near future. (For the time being, the application is fin‐
ished with the given range, so the kernel can free resources associated with it.)
After a successful MADV_DONTNEED operation, the semantics of memory access in the speci‐
fied region are changed: subsequent accesses of pages in the range will succeed, but
will result in either repopulating the memory contents from the up-to-date contents of
the underlying mapped file (for shared file mappings, shared anonymous mappings, and
shmem-based techniques such as System V shared memory segments) or zero-fill-on-demand
pages for anonymous private mappings.
Note that, when applied to shared mappings, MADV_DONTNEED might not lead to immediate
freeing of the pages in the range. The kernel is free to delay freeing the pages until
an appropriate moment. The resident set size (RSS) of the calling process will be imme‐
diately reduced however.
MADV_DONTNEED cannot be applied to locked pages, Huge TLB pages, or VM_PFNMAP pages.
(Pages marked with the kernel-internal VM_PFNMAP flag are special memory areas that are
not managed by the virtual memory subsystem. Such pages are typically created by device
drivers that map the pages into user space.)
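TL;DR variant: for anonymous private mappings (such as a thread’s stack)
madvise(p, size, MADV_DONTNEED) works as memset(p, 0, size): the next
access sees zero-filled pages. And it’s the only conventional advice
value that changes program semantics.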
That was the first clue: perhaps we are zeroing out some crucial data
structure? I commented out the __madvise call and the SIGSEGV
disappeared!
I decided to check whether ia64 is indeed a _STACK_GROWS_DOWN
platform:
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>
static void g(int a, int b, int c, int d, int e, int f)
{
int v;
("sp = %p\n", &v);
printf }
static void * f (void * p)
{
int v;
("sp = %p\n", &v);
printf (1,2,3,4,5,6);
g
return NULL;
}
int main (int argc, const char ** argv)
{
("page = %u\n", getpagesize());
printf
;
pthread_t t(&t, NULL, &f, NULL);
pthread_create
(t, NULL);
pthread_join return 0;
}
$ ia64-unknown-linux-gnu-gcc -O0 -ggdb3 -o stack stack.c -pthread && ./stack
page = 65536
sp = 0x2000000000b7e860
sp = 0x2000000000b7e830
...
madvise(start=0x20000000003b0000, len=0x790000, flags=0x4)
We see a few facts here:
- the stack indeed grows by decreasing sp (aka _STACK_GROWS_DOWN)
- ia64 pages are 64K
- madvise does not touch the page with sp itself and stops 3 pages away
  from it (the PTHREAD_STACK_MIN value):
  0x20000000003b0000 + 0x790000 = 0x2000000000b40000
So why do things fail? I tried adding more debug statements to the
kernel’s sys_madvise implementation and ran it under the ski emulator.
The SIGSEGV was still reproducible.
Then I recalled the strange bsp business and the additional stack area
tracked by the ruby garbage collector. I wondered where that additional
memory region resides:
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>
static void g(int a, int b, int c, int d, int e, int f)
{
int v;
("sp = %p\n", &v);
printf ("bsp = %p\n", __builtin_ia64_bsp());
printf }
static void * f (void * p)
{
int v;
("sp = %p\n", &v);
printf ("bsp = %p\n", __builtin_ia64_bsp());
printf (1,2,3,4,5,6);
g
return NULL;
}
int main (int argc, const char ** argv)
{
;
pthread_t t(&t, NULL, &f, NULL);
pthread_create
(t, NULL);
pthread_join return 0;
}
$ ia64-unknown-linux-gnu-gcc -O0 -ggdb3 -o stack2 stack2.c -pthread && ./stack2
$ ./stack2
sp = 0x2000000000b7e860
bsp = 0x2000000000380090
sp = 0x2000000000b7e830
bsp = 0x20000000003800b8
madvise(start=0x20000000003b0000, len=0x790000, flags=0x4)
See what happens here? sp and bsp grow from opposite ends
of the stack block towards one another, both sharing the same stack area:
+--------------------------+
| bsp_start: 0x2...3b0000 |
| ... |
| bsp: 0x2...3800b8 |
+--------------------------+
| .... |
| guard page: 0x2...770000 |
| .... |
+--------------------------+
| sp: 0x2...b7e830 |
| ... |
| sp_start: 0x2...b80000 |
+--------------------------+
BSP means Backing Store Pointer. That memory area is used by the CPU to
back up and restore CPU register values (but not other local variables)
for caller-save registers on each procedure call/return. Usually C
programs don’t need to care about the bsp value or the area’s contents.
It means that we should try hard not to lose the bsp area when we are
tearing down the stack, because register spilling/loading happens at
unusual times: the CPU can defer or avoid spilling/reloading registers
to improve performance.
Thus the fix could look like that (proposed
upstream):
diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
index 7a970ffc5b..6e3f6db5b1 100644--- a/nptl/pthread_create.c
+++ b/nptl/pthread_create.c
@@ -555,10 +555,24 @@ START_THREAD_DEFN
size_t pagesize_m1 = __getpagesize () - 1;
#ifdef _STACK_GROWS_DOWN
char *sp = CURRENT_STACK_FRAME;- size_t freesize = (sp - (char *) pd->stackblock) & ~pagesize_m1;
+ char *freeblock = (char *) pd->stackblock;
+ size_t freesize = (sp - freeblock) & ~pagesize_m1;
assert (freesize < pd->stackblock_size);+# ifdef __ia64__
if (freesize > PTHREAD_STACK_MIN)- __madvise (pd->stackblock, freesize - PTHREAD_STACK_MIN, MADV_DONTNEED);
+ {
+ /* On ia64 stack grows both ways!
+ - normal "sp" stack (stack for local variables) grows down
+ - register stack "bsp" grows up from the opposite end of stack block
+
+ Thus we leave PTHREAD_STACK_MIN bytes from stack block top
+ and leave same PTHREAD_STACK_MIN at stack block bottom. */
+ freeblock += PTHREAD_STACK_MIN;
+ freesize -= PTHREAD_STACK_MIN;
+ }
+# endif
+ if (freesize > PTHREAD_STACK_MIN)
+ __madvise (freeblock, freesize - PTHREAD_STACK_MIN, MADV_DONTNEED);
#else
/* Page aligned start of memory to free (higher than or equal to current sp plus the minimum stack size). */
Here we skip PTHREAD_STACK_MIN bytes at both the beginning and the end of
pd->stackblock. This fixed the pthread_create SIGSEGV. Why did it
not show up before? I have no idea! My guess would be that older glibc
used less stack space and didn’t need to reload registers from the bsp
area after the madvise call.
Random facts about ia64
- the default page size on linux is 64K
- it has a rare (unique?) setup of two stacks growing in opposite directions
- many garbage collector implementations have to special-case the
  bsp area as additional pointers can lurk there
- thread shutdown is delicate in glibc
Have fun!