[ruby-dev:49105] [Ruby trunk - Bug #11265] deadlock on Solaris 10 since r50900
From:
ngotogenome@...
Date:
2015-06-18 05:15:44 UTC
List:
ruby-dev #49105
Issue #11265 has been updated by Naohisa Goto.
IO.popen内で、retry_fork_async_signal_safeから呼んでいるvforkから帰ってこないようです。
なお、スタックトレースを取るためにデバッグ付き最適化無しでコンパイルしたら、再現確率が20-30回に1回と大幅に下がりました。タイミング依存が激しいようです。
IO.popen呼び出し側。vfork内で止まっているようです。
~~~
$ /opt/solarisstudio12.3/bin/dbx - 1582
For information about new features see `help changes'
To remove this message, put `dbxenv suppress_startup_message 7.9' in your .dbxrc
Reading ruby
Reading ld.so.1
Reading libpthread.so.1
Reading librt.so.1
Reading libgmp.so.10.1.3
Reading libsocket.so.1
Reading libdl.so.1
Reading libcrypt_d.so.1
Reading libm.so.2
Reading libc.so.1
Reading libaio.so.1
Reading libmd.so.1
Reading libnsl.so.1
Reading libgen.so.1
Reading libc_psr.so.1
Reading encdb.so
Reading libmp.so.2
Reading libscf.so.1
Reading libdoor.so.1
Reading libuutil.so.1
Reading transdb.so
Reading thread.so
Attached to process 1582 with 3 LWPs
t@1 (l@1) stopped in _vfork at 0x7fffffff7e8dd440
0x7fffffff7e8dd440: _vfork+0x0024: bcc,a,pt %icc,_vfork+0x4c ! 0x7fffffff7e8dd468
Current function is retry_fork_async_signal_safe
3529 pid = vfork();
(dbx) lwps
>l@1 running in _vfork()
l@2 LWP suspended in __pollsys()
l@5 LWP suspended in elf_find_sym()
(dbx) where
current thread: t@1
[1] _vfork(0x630, 0x0, 0xffff, 0x100577aa8, 0x50, 0x15), at 0x7fffffff7e8dd440
=>[2] retry_fork_async_signal_safe(status = 0xffffffff7fffc594, ep = 0xffffffff7fffc3fc, chfunc = 0x1001518c0 = &`ruby`io.c`popen_exec(void *pp, char *errmsg, size_t errmsg_len), charg = 0xffffffff7fffc518, errmsg = 0xffffffff7fffc544 "", errmsg_buflen = 80U), line 3529 in "process.c"
[3] rb_fork_async_signal_safe(status = 0xffffffff7fffc594, chfunc = 0x1001518c0 = &`ruby`io.c`popen_exec(void *pp, char *errmsg, size_t errmsg_len), charg = 0xffffffff7fffc518, fds = 4301789120U, errmsg = 0xffffffff7fffc544 "", errmsg_buflen = 80U), line 3571 in "process.c"
[4] pipe_open(execarg_obj = 4301789560U, modestr = 0x10044db18 "r+", fmode = 7, convconfig = 0xffffffff7fffc6e8), line 6006 in "io.c"
[5] rb_io_s_popen(argc = 2, argv = 0x10057ca28, klass = 4302377040U), line 6299 in "io.c"
[6] call_cfunc_m1(func = 0x1001523c0 = &`ruby`io.c`rb_io_s_popen(int argc, VALUE *argv, VALUE klass), recv = 4302377040U, argc = 2, argv = 0x10057ca28), line 1421 in "vm_insnhelper.c"
[7] vm_call_cfunc_with_frame(th = 0x100578240, reg_cfp = 0x10067c208, ci = 0x100868f88), line 1595 in "vm_insnhelper.c"
[8] vm_call_cfunc(th = 0x100578240, reg_cfp = 0x10067c208, ci = 0x100868f88), line 1690 in "vm_insnhelper.c"
[9] vm_call_method(th = 0x100578240, cfp = 0x10067c208, ci = 0x100868f88), line 1921 in "vm_insnhelper.c"
[10] vm_call_general(th = 0x100578240, reg_cfp = 0x10067c208, ci = 0x100868f88), line 2085 in "vm_insnhelper.c"
[11] vm_exec_core(th = 0x100578240, initial = 0), line 977 in "insns.def"
[12] vm_exec(th = 0x100578240), line 1486 in "vm.c"
[13] invoke_block_from_c(th = 0x100578240, block = 0x10067c348, self = 4305731800U, argc = 1, argv = 0xffffffff7fffd6d8, blockptr = (nil), cref = (nil), defined_class = 4305737000U, splattable = 1), line 857 in "vm.c"
[14] vm_yield(th = 0x100578240, argc = 1, argv = 0xffffffff7fffd6d8), line 898 in "vm.c"
[15] rb_yield_0(argc = 1, argv = 0xffffffff7fffd6d8), line 994 in "vm_eval.c"
[16] rb_yield(val = 1U), line 1004 in "vm_eval.c"
[17] int_dotimes(num = 3U), line 3951 in "numeric.c"
[18] call_cfunc_0(func = 0x1001a1440 = &`ruby`numeric.c`int_dotimes(VALUE num), recv = 3U, argc = 0, argv = 0x10057c9a8), line 1427 in "vm_insnhelper.c"
[19] vm_call_cfunc_with_frame(th = 0x100578240, reg_cfp = 0x10067c328, ci = 0x10089c378), line 1595 in "vm_insnhelper.c"
[20] vm_call_cfunc(th = 0x100578240, reg_cfp = 0x10067c328, ci = 0x10089c378), line 1690 in "vm_insnhelper.c"
[21] vm_call_method(th = 0x100578240, cfp = 0x10067c328, ci = 0x10089c378), line 1921 in "vm_insnhelper.c"
[22] vm_call_general(th = 0x100578240, reg_cfp = 0x10067c328, ci = 0x10089c378), line 2085 in "vm_insnhelper.c"
[23] vm_exec_core(th = 0x100578240, initial = 0), line 947 in "insns.def"
[24] vm_exec(th = 0x100578240), line 1486 in "vm.c"
[25] invoke_block_from_c(th = 0x100578240, block = 0x10067c540, self = 4305731800U, argc = 1, argv = 0xffffffff7fffe718, blockptr = (nil), cref = (nil), defined_class = 4305636240U, splattable = 1), line 857 in "vm.c"
[26] vm_yield(th = 0x100578240, argc = 1, argv = 0xffffffff7fffe718), line 898 in "vm.c"
[27] rb_yield_0(argc = 1, argv = 0xffffffff7fffe718), line 994 in "vm_eval.c"
[28] rb_yield(val = 4305221440U), line 1004 in "vm_eval.c"
[29] rb_ary_each(array = 4305221080U), line 1821 in "array.c"
[30] call_cfunc_0(func = 0x100042900 = &rb_ary_each(VALUE array), recv = 4305221080U, argc = 0, argv = 0x10057c858), line 1427 in "vm_insnhelper.c"
[31] vm_call_cfunc_with_frame(th = 0x100578240, reg_cfp = 0x10067c520, ci = 0x10092d448), line 1595 in "vm_insnhelper.c"
[32] vm_call_cfunc(th = 0x100578240, reg_cfp = 0x10067c520, ci = 0x10092d448), line 1690 in "vm_insnhelper.c"
[33] vm_call_method(th = 0x100578240, cfp = 0x10067c520, ci = 0x10092d448), line 1921 in "vm_insnhelper.c"
[34] vm_call_general(th = 0x100578240, reg_cfp = 0x10067c520, ci = 0x10092d448), line 2085 in "vm_insnhelper.c"
[35] vm_exec_core(th = 0x100578240, initial = 0), line 947 in "insns.def"
[36] vm_exec(th = 0x100578240), line 1486 in "vm.c"
[37] rb_iseq_eval_main(iseqval = 4302120600U), line 1733 in "vm.c"
[38] ruby_exec_internal(n = 0x1006d2698), line 257 in "eval.c"
[39] ruby_exec_node(n = 0x1006d2698), line 322 in "eval.c"
[40] ruby_run_node(n = 0x1006d2698), line 314 in "eval.c"
[41] main(argc = 7, argv = 0xffffffff7ffff848), line 36 in "main.c"
(dbx) list
3529 pid = vfork();
3530 else
3531 pid = fork();
3532 #else
3533 pid = fork();
3534 #endif
3535 if (pid == 0) {/* fork succeed, child process */
3536 int ret;
3537 close(ep[0]);
3538 ret = disable_child_handler_fork_child(&old, errmsg, errmsg_buflen); /* async-signal-safe */
(dbx) exit
detaching from process 1582
~~~
IO.popenから起動した新プロセス側。こちらは fcntl内で止まっている?
~~~
$ /opt/solarisstudio12.3/bin/dbx - 1584
For information about new features see `help changes'
To remove this message, put `dbxenv suppress_startup_message 7.9' in your .dbxrc
Reading ruby
Reading ld.so.1
Reading libpthread.so.1
Reading librt.so.1
Reading libgmp.so.10.1.3
Reading libsocket.so.1
Reading libdl.so.1
Reading libcrypt_d.so.1
Reading libm.so.2
Reading libc.so.1
Reading libaio.so.1
Reading libmd.so.1
Reading libnsl.so.1
Reading libgen.so.1
Reading libc_psr.so.1
Reading encdb.so
Reading libmp.so.2
Reading libscf.so.1
Reading libdoor.so.1
Reading libuutil.so.1
Reading transdb.so
Reading thread.so
Attached to process 1584
t@1 (l@1) stopped in ___lwp_mutex_timedlock at 0x7fffffff7fe33f24
0x7fffffff7fe33f24: ___lwp_mutex_timedlock+0x0008: ta %icc,0x0000000000000040
Current function is rb_close_before_exec
5870 ret = fcntl(fd, F_GETFD); /* async-signal-safe */
(dbx) lwps
>l@1 running in ___lwp_mutex_timedlock()
(dbx) where
current thread: t@1
[1] ___lwp_mutex_timedlock(0x7fffffff7ff416d8, 0x0, 0x0, 0x7fffffff7e8d6e6c, 0x1, 0x1), at 0x7fffffff7fe33f24
[2] enter(0x0, 0x7fffffff7ff3c210, 0x1000, 0x7fffffff7e82f294, 0x7fffffff7ff3cdf8, 0x7fffffff7ff3c638), at 0x7fffffff7fe1bf20
[3] elf_bndr(0x7fffffff7fd00848, 0x1580, 0x1001eb110, 0x14, 0x0, 0x7fffffff7ff3cdf8), at 0x7fffffff7fe28168
[4] elf_rtbndr(0x560000, 0x100567438, 0xfffffffffffffff7, 0x1001eb110, 0x1001f1080, 0xfffffffffffffff7), at 0x7fffffff7fe08394
=>[5] rb_close_before_exec(lowfd = 3, maxhint = 10, noclose_fds = 4301789120U), line 5870 in "io.c"
[6] proc_exec_cmd(prog = 0x100868980 "/XXXXX-trunk/bin/ruby", argv_str = 4301789280U, envp_str = 0), line 1269 in "process.c"
[7] rb_exec_async_signal_safe(eargp = 0x1009d3ea0, errmsg = 0xffffffff7fffc544 "", errmsg_buflen = 80U), line 3068 in "process.c"
[8] popen_exec(pp = 0xffffffff7fffc518, errmsg = 0xffffffff7fffc544 "", errmsg_len = 80U), line 5888 in "io.c"
[9] retry_fork_async_signal_safe(status = 0xffffffff7fffc594, ep = 0xffffffff7fffc3fc, chfunc = 0x1001518c0 = &`ruby`io.c`popen_exec(void *pp, char *errmsg, size_t errmsg_len), charg = 0xffffffff7fffc518, errmsg = 0xffffffff7fffc544 "", errmsg_buflen = 80U), line 3540 in "process.c"
[10] rb_fork_async_signal_safe(status = 0xffffffff7fffc594, chfunc = 0x1001518c0 = &`ruby`io.c`popen_exec(void *pp, char *errmsg, size_t errmsg_len), charg = 0xffffffff7fffc518, fds = 4301789120U, errmsg = 0xffffffff7fffc544 "", errmsg_buflen = 80U), line 3571 in "process.c"
[11] pipe_open(execarg_obj = 4301789560U, modestr = 0x10044db18 "r+", fmode = 7, convconfig = 0xffffffff7fffc6e8), line 6006 in "io.c"
[12] rb_io_s_popen(argc = 2, argv = 0x10057ca28, klass = 4302377040U), line 6299 in "io.c"
[13] call_cfunc_m1(func = 0x1001523c0 = &`ruby`io.c`rb_io_s_popen(int argc, VALUE *argv, VALUE klass), recv = 4302377040U, argc = 2, argv = 0x10057ca28), line 1421 in "vm_insnhelper.c"
[14] vm_call_cfunc_with_frame(th = 0x100578240, reg_cfp = 0x10067c208, ci = 0x100868f88), line 1595 in "vm_insnhelper.c"
[15] vm_call_cfunc(th = 0x100578240, reg_cfp = 0x10067c208, ci = 0x100868f88), line 1690 in "vm_insnhelper.c"
[16] vm_call_method(th = 0x100578240, cfp = 0x10067c208, ci = 0x100868f88), line 1921 in "vm_insnhelper.c"
[17] vm_call_general(th = 0x100578240, reg_cfp = 0x10067c208, ci = 0x100868f88), line 2085 in "vm_insnhelper.c"
[18] vm_exec_core(th = 0x100578240, initial = 0), line 977 in "insns.def"
[19] vm_exec(th = 0x100578240), line 1486 in "vm.c"
[20] invoke_block_from_c(th = 0x100578240, block = 0x10067c348, self = 4305731800U, argc = 1, argv = 0xffffffff7fffd6d8, blockptr = (nil), cref = (nil), defined_class = 4305737000U, splattable = 1), line 857 in "vm.c"
[21] vm_yield(th = 0x100578240, argc = 1, argv = 0xffffffff7fffd6d8), line 898 in "vm.c"
[22] rb_yield_0(argc = 1, argv = 0xffffffff7fffd6d8), line 994 in "vm_eval.c"
[23] rb_yield(val = 1U), line 1004 in "vm_eval.c"
[24] int_dotimes(num = 3U), line 3951 in "numeric.c"
[25] call_cfunc_0(func = 0x1001a1440 = &`ruby`numeric.c`int_dotimes(VALUE num), recv = 3U, argc = 0, argv = 0x10057c9a8), line 1427 in "vm_insnhelper.c"
[26] vm_call_cfunc_with_frame(th = 0x100578240, reg_cfp = 0x10067c328, ci = 0x10089c378), line 1595 in "vm_insnhelper.c"
[27] vm_call_cfunc(th = 0x100578240, reg_cfp = 0x10067c328, ci = 0x10089c378), line 1690 in "vm_insnhelper.c"
[28] vm_call_method(th = 0x100578240, cfp = 0x10067c328, ci = 0x10089c378), line 1921 in "vm_insnhelper.c"
[29] vm_call_general(th = 0x100578240, reg_cfp = 0x10067c328, ci = 0x10089c378), line 2085 in "vm_insnhelper.c"
[30] vm_exec_core(th = 0x100578240, initial = 0), line 947 in "insns.def"
[31] vm_exec(th = 0x100578240), line 1486 in "vm.c"
[32] invoke_block_from_c(th = 0x100578240, block = 0x10067c540, self = 4305731800U, argc = 1, argv = 0xffffffff7fffe718, blockptr = (nil), cref = (nil), defined_class = 4305636240U, splattable = 1), line 857 in "vm.c"
[33] vm_yield(th = 0x100578240, argc = 1, argv = 0xffffffff7fffe718), line 898 in "vm.c"
[34] rb_yield_0(argc = 1, argv = 0xffffffff7fffe718), line 994 in "vm_eval.c"
[35] rb_yield(val = 4305221440U), line 1004 in "vm_eval.c"
[36] rb_ary_each(array = 4305221080U), line 1821 in "array.c"
[37] call_cfunc_0(func = 0x100042900 = &rb_ary_each(VALUE array), recv = 4305221080U, argc = 0, argv = 0x10057c858), line 1427 in "vm_insnhelper.c"
[38] vm_call_cfunc_with_frame(th = 0x100578240, reg_cfp = 0x10067c520, ci = 0x10092d448), line 1595 in "vm_insnhelper.c"
[39] vm_call_cfunc(th = 0x100578240, reg_cfp = 0x10067c520, ci = 0x10092d448), line 1690 in "vm_insnhelper.c"
[40] vm_call_method(th = 0x100578240, cfp = 0x10067c520, ci = 0x10092d448), line 1921 in "vm_insnhelper.c"
[41] vm_call_general(th = 0x100578240, reg_cfp = 0x10067c520, ci = 0x10092d448), line 2085 in "vm_insnhelper.c"
[42] vm_exec_core(th = 0x100578240, initial = 0), line 947 in "insns.def"
[43] vm_exec(th = 0x100578240), line 1486 in "vm.c"
[44] rb_iseq_eval_main(iseqval = 4302120600U), line 1733 in "vm.c"
[45] ruby_exec_internal(n = 0x1006d2698), line 257 in "eval.c"
[46] ruby_exec_node(n = 0x1006d2698), line 322 in "eval.c"
[47] ruby_run_node(n = 0x1006d2698), line 314 in "eval.c"
[48] main(argc = 7, argv = 0xffffffff7ffff848), line 36 in "main.c"
(dbx) list
5870 ret = fcntl(fd, F_GETFD); /* async-signal-safe */
5871 if (ret != -1 && !(ret & FD_CLOEXEC)) {
5872 fcntl(fd, F_SETFD, ret|FD_CLOEXEC); /* async-signal-safe */
5873 }
5874 # define CONTIGUOUS_CLOSED_FDS 20
5875 if (ret != -1) {
5876 if (max < fd + CONTIGUOUS_CLOSED_FDS)
5877 max = fd + CONTIGUOUS_CLOSED_FDS;
5878 }
5879 }
(dbx)
(dbx) print fd
fd = 33
(dbx) print max
max = 32
(dbx) print lowfd
lowfd = 3
~~~
5870: ret = fcntl(fd, F_GETFD);
は、
5866: for (fd = lowfd; fd <= max; fd++) { ... }
の中でループしているから、fd が max を越えるはずはないはずなのに、
max = 32 にもかかわらず fd = 33 で fcntl(fd, F_GETFD) が呼ばれている?
----------------------------------------
Bug #11265: deadlock on Solaris 10 since r50900
https://bugs.ruby-lang.org/issues/11265#change-53000
* Author: Naohisa Goto
* Status: Open
* Priority: Normal
* Assignee:
* ruby -v: -
* Backport: 2.0.0: UNKNOWN, 2.1: UNKNOWN, 2.2: UNKNOWN
----------------------------------------
Solaris10にて、r50900 以降、
TestParallel::TestParallel#test_jobs_status
TestParallel::TestParallel#test_separate
のいずれか、または両方で、テストが停止状態となり次に進まなくなります。
スレッドのデッドロックが発生しているような感じです。
ところで、こういう場合の上手いデバッグ方法って何があるでしょうか?
--
https://bugs.ruby-lang.org/