From: nobu@... Date: 2017-12-15T07:40:47+00:00 Subject: [ruby-core:84280] [Ruby trunk Bug#14181] hangs or deadlocks from waitpid, threads, and trapping SIGCHLD Issue #14181 has been updated by nobu (Nobuyoshi Nakada). normalperson (Eric Wong) wrote: > I guess the sleep_wait_for_interrupt path when !forever has the > same problem and might sleep too long.. How about this patch? ```diff diff --git i/thread.c w/thread.c index cc62ea3905..138c26cb09 100644 --- i/thread.c +++ w/thread.c @@ -90,9 +90,13 @@ static VALUE sym_on_blocking; static VALUE sym_never; static ID id_locals; -static void sleep_timeval(rb_thread_t *th, struct timeval time, int spurious_check); -static void sleep_wait_for_interrupt(rb_thread_t *th, double sleepsec, int spurious_check); -static void sleep_forever(rb_thread_t *th, int nodeadlock, int spurious_check); +typedef int (*wakeup_cond_func)(void *); +static void sleep_timeval(rb_thread_t *th, struct timeval time, int spurious_check, + wakeup_cond_func wake, void *warg); +static void sleep_wait_for_interrupt(rb_thread_t *th, double sleepsec, int spurious_check, + wakeup_cond_func wake, void *warg); +static void sleep_forever(rb_thread_t *th, int nodeadlock, int spurious_check, + wakeup_cond_func wake, void *warg); static void rb_thread_sleep_deadly_allow_spurious_wakeup(void); static double timeofday(void); static int rb_threadptr_dead(rb_thread_t *th); @@ -873,6 +877,13 @@ remove_from_join_list(VALUE arg) return Qnil; } +static int +wake_join(void *arg) +{ + rb_thread_t *target_th = arg; + return target_th->status == THREAD_KILLED; +} + static VALUE thread_join_sleep(VALUE arg) { @@ -883,13 +894,7 @@ thread_join_sleep(VALUE arg) while (target_th->status != THREAD_KILLED) { if (forever) { - th->status = THREAD_STOPPED_FOREVER; - th->vm->sleeper++; - rb_check_deadlock(th->vm); - native_sleep(th, 0); - th->vm->sleeper--; - RUBY_VM_CHECK_INTS_BLOCKING(th->ec); - th->status = THREAD_RUNNABLE; + sleep_forever(th, TRUE, FALSE, wake_join, target_th); } else { double now = timeofday(); @@ -898,7 +903,7 @@ thread_join_sleep(VALUE arg) thread_id_str(target_th)); return Qfalse; } - sleep_wait_for_interrupt(th, limit - now, 0); + sleep_wait_for_interrupt(th, limit - now, 0, wake_join, target_th); } thread_debug("thread_join: interrupted (thid: %"PRI_THREAD_ID", status: %s)\n", thread_id_str(target_th), thread_status_name(target_th, TRUE)); @@ -1094,14 +1099,15 @@ double2timeval(double d) } static void -sleep_forever(rb_thread_t *th, int deadlockable, int spurious_check) +sleep_forever(rb_thread_t *th, int deadlockable, int spurious_check, + wakeup_cond_func wake, void *warg) { enum rb_thread_status prev_status = th->status; enum rb_thread_status status = deadlockable ? THREAD_STOPPED_FOREVER : THREAD_STOPPED; th->status = status; RUBY_VM_CHECK_INTS_BLOCKING(th->ec); - while (th->status == status) { + while (!(wake && wake(warg)) && th->status == status) { if (deadlockable) { th->vm->sleeper++; rb_check_deadlock(th->vm); @@ -1135,7 +1141,8 @@ getclockofday(struct timeval *tp) } static void -sleep_timeval(rb_thread_t *th, struct timeval tv, int spurious_check) +sleep_timeval(rb_thread_t *th, struct timeval tv, int spurious_check, + wakeup_cond_func wake, void *warg) { struct timeval to, tvn; enum rb_thread_status prev_status = th->status; @@ -1156,7 +1163,7 @@ sleep_timeval(rb_thread_t *th, struct timeval tv, int spurious_check) th->status = THREAD_STOPPED; RUBY_VM_CHECK_INTS_BLOCKING(th->ec); - while (th->status == THREAD_STOPPED) { + while (!(wake && wake(warg)) && th->status == THREAD_STOPPED) { native_sleep(th, &tv); RUBY_VM_CHECK_INTS_BLOCKING(th->ec); getclockofday(&tvn); @@ -1180,21 +1187,21 @@ void rb_thread_sleep_forever(void) { thread_debug("rb_thread_sleep_forever\n"); - sleep_forever(GET_THREAD(), FALSE, TRUE); + sleep_forever(GET_THREAD(), FALSE, TRUE, NULL, NULL); } void rb_thread_sleep_deadly(void) { thread_debug("rb_thread_sleep_deadly\n"); - sleep_forever(GET_THREAD(), TRUE, TRUE); + sleep_forever(GET_THREAD(), TRUE, TRUE, NULL, NULL); } static void rb_thread_sleep_deadly_allow_spurious_wakeup(void) { thread_debug("rb_thread_sleep_deadly_allow_spurious_wakeup\n"); - sleep_forever(GET_THREAD(), TRUE, FALSE); + sleep_forever(GET_THREAD(), TRUE, FALSE, NULL, NULL); } static double @@ -1216,16 +1223,17 @@ timeofday(void) } static void -sleep_wait_for_interrupt(rb_thread_t *th, double sleepsec, int spurious_check) +sleep_wait_for_interrupt(rb_thread_t *th, double sleepsec, int spurious_check, + wakeup_cond_func wake, void *warg) { - sleep_timeval(th, double2timeval(sleepsec), spurious_check); + sleep_timeval(th, double2timeval(sleepsec), spurious_check, wake, warg); } void rb_thread_wait_for(struct timeval time) { rb_thread_t *th = GET_THREAD(); - sleep_timeval(th, time, 1); + sleep_timeval(th, time, 1, NULL, NULL); } /* diff --git i/thread_sync.c w/thread_sync.c index 6eff5e759c..ca74fe5866 100644 --- i/thread_sync.c +++ w/thread_sync.c @@ -428,7 +428,7 @@ static VALUE rb_mutex_wait_for(VALUE time) { struct timeval *t = (struct timeval *)time; - sleep_timeval(GET_THREAD(), *t, 0); /* permit spurious check */ + sleep_timeval(GET_THREAD(), *t, 0, NULL, NULL); /* permit spurious check */ return Qnil; } ``` ---------------------------------------- Bug #14181: hangs or deadlocks from waitpid, threads, and trapping SIGCHLD https://bugs.ruby-lang.org/issues/14181#change-68435 * Author: ccutrer (Cody Cutrer) * Status: Closed * Priority: Normal * Assignee: * Target version: * ruby -v: ruby 2.4.2p198 (2017-09-14 revision 59899) [x86_64-linux-gnu] * Backport: 2.3: UNKNOWN, 2.4: UNKNOWN ---------------------------------------- I'm not exactly sure what's going on here, but the end result is basically a thread is getting killed unexpectedly during a waitpid call, when SIGCHLD is being handled. In a more complex scenario, we end up hanging because Thread#join is ends up waiting on a thread that's already dead (presumably because it died in a non-standard way), or in a simpler scenario, the output is: ``` loop 250 loop 251 /usr/lib/ruby/2.4.0/timeout.rb:97:in `join': No live threads left. Deadlock? (fatal) 1 threads, 1 sleeps current:0x00000000019205e0 main thread:0x00000000019205e0 * # rb_thread_t:0x00000000019205e0 native:0x00007f900a082700 int:0 /usr/lib/ruby/2.4.0/timeout.rb:97:in `join' /usr/lib/ruby/2.4.0/timeout.rb:97:in `ensure in block in timeout' /usr/lib/ruby/2.4.0/timeout.rb:97:in `block in timeout' /usr/lib/ruby/2.4.0/timeout.rb:33:in `block in catch' /usr/lib/ruby/2.4.0/timeout.rb:33:in `catch' /usr/lib/ruby/2.4.0/timeout.rb:33:in `catch' /usr/lib/ruby/2.4.0/timeout.rb:108:in `timeout' ./test.rb:11:in `
' from /usr/lib/ruby/2.4.0/timeout.rb:97:in `ensure in block in timeout' from /usr/lib/ruby/2.4.0/timeout.rb:97:in `block in timeout' from /usr/lib/ruby/2.4.0/timeout.rb:33:in `block in catch' from /usr/lib/ruby/2.4.0/timeout.rb:33:in `catch' from /usr/lib/ruby/2.4.0/timeout.rb:33:in `catch' from /usr/lib/ruby/2.4.0/timeout.rb:108:in `timeout' from ./test.rb:11:in `
' ``` The simpler repro, where I'm obviously not doing anything I shouldn't be doing in the signal handler: ``` #!/usr/bin/env ruby require 'timeout' trap(:CHLD) { } x = 0 while true puts "loop #{x += 1}" pid = Process.spawn('sleep 1') Timeout.timeout(30) do Process.waitpid(pid) end end ``` A slightly more complex repro that I'm still pretty sure what I'm doing in the signal handler is okay, but ends up hanging: ``` #!/usr/bin/env ruby require 'timeout' self_pipe = IO.pipe signal_queue = [] def wake_up(self_pipe) self_pipe[1].write_nonblock('.', exception: false) end trap(:CHLD) { signal_queue << :CHLD; wake_up(self_pipe) } signal_processor = Thread.new do loop do self_pipe[0].read(1) signal_queue.pop end end x = 0 while true puts "loop #{x += 1}" pid = Process.spawn('sleep 1') Timeout.timeout(30) do Process.waitpid(pid) end end ``` In either case, it can take many loops before it fails, up to a few hundred. I've reproed on both Ubuntu Xenial, and macOS 10.12.6 (the former with ruby 2.4.2, the latter with ruby 2.4.1). -- https://bugs.ruby-lang.org/ Unsubscribe: