[ruby-core:84280] [Ruby trunk Bug#14181] hangs or deadlocks from waitpid, threads, and trapping SIGCHLD

From: nobu@...
Date: 2017-12-15 07:40:47 UTC
List: ruby-core #84280
Issue #14181 has been updated by nobu (Nobuyoshi Nakada).


normalperson (Eric Wong) wrote:
>  I guess the sleep_wait_for_interrupt path when !forever has the
>  same problem and might sleep too long..

How about this patch?

```diff
diff --git i/thread.c w/thread.c
index cc62ea3905..138c26cb09 100644
--- i/thread.c
+++ w/thread.c
@@ -90,9 +90,13 @@ static VALUE sym_on_blocking;
 static VALUE sym_never;
 static ID id_locals;
 
-static void sleep_timeval(rb_thread_t *th, struct timeval time, int spurious_check);
-static void sleep_wait_for_interrupt(rb_thread_t *th, double sleepsec, int spurious_check);
-static void sleep_forever(rb_thread_t *th, int nodeadlock, int spurious_check);
+typedef int (*wakeup_cond_func)(void *);
+static void sleep_timeval(rb_thread_t *th, struct timeval time, int spurious_check,
+			  wakeup_cond_func wake, void *warg);
+static void sleep_wait_for_interrupt(rb_thread_t *th, double sleepsec, int spurious_check,
+				     wakeup_cond_func wake, void *warg);
+static void sleep_forever(rb_thread_t *th, int nodeadlock, int spurious_check,
+			  wakeup_cond_func wake, void *warg);
 static void rb_thread_sleep_deadly_allow_spurious_wakeup(void);
 static double timeofday(void);
 static int rb_threadptr_dead(rb_thread_t *th);
@@ -873,6 +877,13 @@ remove_from_join_list(VALUE arg)
     return Qnil;
 }
 
+static int
+wake_join(void *arg)
+{
+    rb_thread_t *target_th = arg;
+    return target_th->status == THREAD_KILLED;
+}
+
 static VALUE
 thread_join_sleep(VALUE arg)
 {
@@ -883,13 +894,7 @@ thread_join_sleep(VALUE arg)
 
     while (target_th->status != THREAD_KILLED) {
 	if (forever) {
-	    th->status = THREAD_STOPPED_FOREVER;
-	    th->vm->sleeper++;
-	    rb_check_deadlock(th->vm);
-	    native_sleep(th, 0);
-	    th->vm->sleeper--;
-	    RUBY_VM_CHECK_INTS_BLOCKING(th->ec);
-	    th->status = THREAD_RUNNABLE;
+	    sleep_forever(th, TRUE, FALSE, wake_join, target_th);
 	}
 	else {
 	    double now = timeofday();
@@ -898,7 +903,7 @@ thread_join_sleep(VALUE arg)
 			     thread_id_str(target_th));
 		return Qfalse;
 	    }
-	    sleep_wait_for_interrupt(th, limit - now, 0);
+	    sleep_wait_for_interrupt(th, limit - now, 0, wake_join, target_th);
 	}
 	thread_debug("thread_join: interrupted (thid: %"PRI_THREAD_ID", status: %s)\n",
 		     thread_id_str(target_th), thread_status_name(target_th, TRUE));
@@ -1094,14 +1099,15 @@ double2timeval(double d)
 }
 
 static void
-sleep_forever(rb_thread_t *th, int deadlockable, int spurious_check)
+sleep_forever(rb_thread_t *th, int deadlockable, int spurious_check,
+	      wakeup_cond_func wake, void *warg)
 {
     enum rb_thread_status prev_status = th->status;
     enum rb_thread_status status = deadlockable ? THREAD_STOPPED_FOREVER : THREAD_STOPPED;
 
     th->status = status;
     RUBY_VM_CHECK_INTS_BLOCKING(th->ec);
-    while (th->status == status) {
+    while (!(wake && wake(warg)) && th->status == status) {
 	if (deadlockable) {
 	    th->vm->sleeper++;
 	    rb_check_deadlock(th->vm);
@@ -1135,7 +1141,8 @@ getclockofday(struct timeval *tp)
 }
 
 static void
-sleep_timeval(rb_thread_t *th, struct timeval tv, int spurious_check)
+sleep_timeval(rb_thread_t *th, struct timeval tv, int spurious_check,
+	      wakeup_cond_func wake, void *warg)
 {
     struct timeval to, tvn;
     enum rb_thread_status prev_status = th->status;
@@ -1156,7 +1163,7 @@ sleep_timeval(rb_thread_t *th, struct timeval tv, int spurious_check)
 
     th->status = THREAD_STOPPED;
     RUBY_VM_CHECK_INTS_BLOCKING(th->ec);
-    while (th->status == THREAD_STOPPED) {
+    while (!(wake && wake(warg)) && th->status == THREAD_STOPPED) {
 	native_sleep(th, &tv);
 	RUBY_VM_CHECK_INTS_BLOCKING(th->ec);
 	getclockofday(&tvn);
@@ -1180,21 +1187,21 @@ void
 rb_thread_sleep_forever(void)
 {
     thread_debug("rb_thread_sleep_forever\n");
-    sleep_forever(GET_THREAD(), FALSE, TRUE);
+    sleep_forever(GET_THREAD(), FALSE, TRUE, NULL, NULL);
 }
 
 void
 rb_thread_sleep_deadly(void)
 {
     thread_debug("rb_thread_sleep_deadly\n");
-    sleep_forever(GET_THREAD(), TRUE, TRUE);
+    sleep_forever(GET_THREAD(), TRUE, TRUE, NULL, NULL);
 }
 
 static void
 rb_thread_sleep_deadly_allow_spurious_wakeup(void)
 {
     thread_debug("rb_thread_sleep_deadly_allow_spurious_wakeup\n");
-    sleep_forever(GET_THREAD(), TRUE, FALSE);
+    sleep_forever(GET_THREAD(), TRUE, FALSE, NULL, NULL);
 }
 
 static double
@@ -1216,16 +1223,17 @@ timeofday(void)
 }
 
 static void
-sleep_wait_for_interrupt(rb_thread_t *th, double sleepsec, int spurious_check)
+sleep_wait_for_interrupt(rb_thread_t *th, double sleepsec, int spurious_check,
+			 wakeup_cond_func wake, void *warg)
 {
-    sleep_timeval(th, double2timeval(sleepsec), spurious_check);
+    sleep_timeval(th, double2timeval(sleepsec), spurious_check, wake, warg);
 }
 
 void
 rb_thread_wait_for(struct timeval time)
 {
     rb_thread_t *th = GET_THREAD();
-    sleep_timeval(th, time, 1);
+    sleep_timeval(th, time, 1, NULL, NULL);
 }
 
 /*
diff --git i/thread_sync.c w/thread_sync.c
index 6eff5e759c..ca74fe5866 100644
--- i/thread_sync.c
+++ w/thread_sync.c
@@ -428,7 +428,7 @@ static VALUE
 rb_mutex_wait_for(VALUE time)
 {
     struct timeval *t = (struct timeval *)time;
-    sleep_timeval(GET_THREAD(), *t, 0); /* permit spurious check */
+    sleep_timeval(GET_THREAD(), *t, 0, NULL, NULL); /* permit spurious check */
     return Qnil;
 }
 
```

----------------------------------------
Bug #14181: hangs or deadlocks from waitpid, threads, and trapping SIGCHLD
https://bugs.ruby-lang.org/issues/14181#change-68435

* Author: ccutrer (Cody Cutrer)
* Status: Closed
* Priority: Normal
* Assignee: 
* Target version: 
* ruby -v: ruby 2.4.2p198 (2017-09-14 revision 59899) [x86_64-linux-gnu]
* Backport: 2.3: UNKNOWN, 2.4: UNKNOWN
----------------------------------------
I'm not exactly sure what's going on here, but the end result is basically a thread is getting killed unexpectedly during a waitpid call, when SIGCHLD is being handled. In a more complex scenario, we end up hanging because Thread#join is ends up waiting on a thread that's already dead (presumably because it died in a non-standard way), or in a simpler scenario, the output is:

```
loop 250
loop 251
/usr/lib/ruby/2.4.0/timeout.rb:97:in `join': No live threads left. Deadlock? (fatal)
1 threads, 1 sleeps current:0x00000000019205e0 main thread:0x00000000019205e0
* #<Thread:0x0000000001955e38 sleep_forever>
   rb_thread_t:0x00000000019205e0 native:0x00007f900a082700 int:0
   /usr/lib/ruby/2.4.0/timeout.rb:97:in `join'
   /usr/lib/ruby/2.4.0/timeout.rb:97:in `ensure in block in timeout'
   /usr/lib/ruby/2.4.0/timeout.rb:97:in `block in timeout'
   /usr/lib/ruby/2.4.0/timeout.rb:33:in `block in catch'
   /usr/lib/ruby/2.4.0/timeout.rb:33:in `catch'
   /usr/lib/ruby/2.4.0/timeout.rb:33:in `catch'
   /usr/lib/ruby/2.4.0/timeout.rb:108:in `timeout'
   ./test.rb:11:in `<main>'
	from /usr/lib/ruby/2.4.0/timeout.rb:97:in `ensure in block in timeout'
	from /usr/lib/ruby/2.4.0/timeout.rb:97:in `block in timeout'
	from /usr/lib/ruby/2.4.0/timeout.rb:33:in `block in catch'
	from /usr/lib/ruby/2.4.0/timeout.rb:33:in `catch'
	from /usr/lib/ruby/2.4.0/timeout.rb:33:in `catch'
	from /usr/lib/ruby/2.4.0/timeout.rb:108:in `timeout'
	from ./test.rb:11:in `<main>'
```

The simpler repro, where I'm obviously not doing anything I shouldn't be doing in the signal handler:

```
#!/usr/bin/env ruby

require 'timeout'

trap(:CHLD) { }

x = 0
while true
  puts "loop #{x += 1}"
  pid = Process.spawn('sleep 1')
  Timeout.timeout(30) do
    Process.waitpid(pid)
  end
end
```

A slightly more complex repro that I'm still pretty sure what I'm doing in the signal handler is okay, but ends up hanging:

```
#!/usr/bin/env ruby

require 'timeout'

self_pipe = IO.pipe
signal_queue = []

def wake_up(self_pipe)
  self_pipe[1].write_nonblock('.', exception: false)
end

trap(:CHLD) { signal_queue << :CHLD; wake_up(self_pipe)  }

signal_processor = Thread.new do
  loop do
    self_pipe[0].read(1)
    signal_queue.pop
  end
end

x = 0
while true
  puts "loop #{x += 1}"
  pid = Process.spawn('sleep 1')
  Timeout.timeout(30) do
    Process.waitpid(pid)
  end
end
```

In either case, it can take many loops before it fails, up to a few hundred. I've reproed on both Ubuntu Xenial, and macOS 10.12.6 (the former with ruby 2.4.2, the latter with ruby 2.4.1).



-- 
https://bugs.ruby-lang.org/

Unsubscribe: <mailto:ruby-core-request@ruby-lang.org?subject=unsubscribe>
<http://lists.ruby-lang.org/cgi-bin/mailman/options/ruby-core>

In This Thread

Prev Next