From: "ivoanjo (Ivo Anjo) via ruby-core" Date: 2024-07-12T08:02:54+00:00 Subject: [ruby-core:118582] [Ruby master Bug#20629] Ruby infinite loop when raising+rescuing inside a RUBY_INTERNAL_EVENT_NEWOBJ Issue #20629 has been updated by ivoanjo (Ivo Anjo). Thanks for pointing me in the right direction! I had completely forgotten that bit of info ����. I guess I had to spend a few hours learning it the hard way. ---------------------------------------- Bug #20629: Ruby infinite loop when raising+rescuing inside a RUBY_INTERNAL_EVENT_NEWOBJ https://bugs.ruby-lang.org/issues/20629#change-109105 * Author: ivoanjo (Ivo Anjo) * Status: Closed * ruby -v: ruby 3.3.4 (2024-07-09 revision be1089c8ec) [x86_64-linux] * Backport: 3.1: UNKNOWN, 3.2: UNKNOWN, 3.3: UNKNOWN ---------------------------------------- ### Background Hey! While working on the [Datadog Ruby Profiler](https://github.com/datadog/dd-trace-rb) I ran into a weird situation where the Ruby VM locks up as it gets stuck in an infinite loop in vm_sync.c: ```c while (recorded_lock_rec < current_lock_rec) { RB_VM_LOCK_LEAVE_LEV(¤t_lock_rec); } ``` ...because leave is this: ```c static inline void rb_vm_lock_leave(unsigned int *lev, const char *file, int line) { if (rb_multi_ractor_p()) { rb_vm_lock_leave_body(lev APPEND_LOCATION_PARAMS); } } ``` ...and `rb_multi_ractor_p()` is always `false` as I'm not using Ractors. ### How to reproduce (I include a zip with all the files needed to reproduce this, no need to copy-paste from here!) Inside a native extension, I'm creating a `RUBY_INTERNAL_EVENT_NEWOBJ`. Inside it, I'm raising and catching an exception, to simulate a (handled) bug in our code: ```c #include #include #include #include static VALUE raise_exception(VALUE unused) { fprintf(stderr, "Raising!\n"); rb_raise(rb_eRuntimeError, "This is a test!"); } static VALUE ignore_failure(VALUE self_instance, VALUE exception) { fprintf(stderr, "Caught!\n"); return Qnil; } static void on_newobj_event(VALUE tracepoint_data, void *unused) { fprintf(stderr, "Inside on_newobj_event\n"); // VALUE klass = rb_class_of(rb_tracearg_object(rb_tracearg_from_tracepoint(tracepoint_data))); // rb_p(klass); rb_rescue2( raise_exception, Qnil, ignore_failure, Qnil, rb_eException, // rb_eException is the base class of all Ruby exceptions 0 // Required by API to be the last argument ); fprintf(stderr, "Finished on_newobj_event\n"); } static VALUE install_tracepoint(VALUE unused) { VALUE tp = rb_tracepoint_new(Qnil, RUBY_INTERNAL_EVENT_NEWOBJ, on_newobj_event, NULL /* unused */); rb_tracepoint_enable(tp); fprintf(stderr, "Tracepoint installed!\n"); return Qnil; } void Init_exception_bug_repro(void) { rb_define_singleton_method(rb_define_module("Repro"), "install_tracepoint", install_tracepoint, 0); } ``` And my test Ruby script is: ```ruby require_relative './exception_bug_repro' Repro.install_tracepoint while true begin raise('hello') rescue print '.' end end ``` Here's how running it looks: ``` $ bundle exec rake clean compile $ bundle exec ruby testcase.rb Tracepoint installed! Inside on_newobj_event Raising! Caught! Finished on_newobj_event Inside on_newobj_event Raising! Caught! Finished on_newobj_event Inside on_newobj_event Raising! Caught! Finished on_newobj_event Inside on_newobj_event Raising! Caught! Finished on_newobj_event Inside on_newobj_event Raising! # <--- Infinite loop ``` That is, we should always see the raise -> caught, but sometimes the raise doesn't go very well, and from then Ruby eventually locks up. Here's the full backtrace from gdb: ``` #0 rb_multi_ractor_p () at ruby-3_3/vm_sync.h:39 #1 0x00005fdde2534665 in rb_vm_lock_leave (lev=0x7ffc2f1b5f70, file=0x5fdde2783bb5 "vm_sync.c", line=231) at ruby-3_3/vm_sync.h:92 #2 0x00005fdde2534b28 in rb_ec_vm_lock_rec_release (ec=0x5fdde3e9b570, recorded_lock_rec=0, current_lock_rec=1) at vm_sync.c:231 #3 0x00005fdde23088f2 in rb_ec_vm_lock_rec_check (ec=0x5fdde3e9b570, recorded_lock_rec=0) at ruby-3_3/eval_intern.h:144 #4 0x00005fdde2308937 in rb_ec_tag_state (ec=0x5fdde3e9b570) at ruby-3_3/eval_intern.h:155 #5 0x00005fdde230c71e in setup_exception (ec=0x5fdde3e9b570, tag=6, mesg=126196681963080, cause=4) at eval.c:564 #6 0x00005fdde230cdc5 in rb_longjmp (ec=0x5fdde3e9b570, tag=6, mesg=126196681963080, cause=4) at eval.c:660 #7 0x00005fdde230d398 in rb_raise_jump (mesg=126196681963080, cause=36) at eval.c:853 #8 0x00005fdde230d0cc in rb_f_raise (argc=1, argv=0x72c690294048) at eval.c:752 #9 0x00005fdde230d0f4 in f_raise (c=1, v=0x72c690294048, _=126197147684320) at eval.c:789 #10 0x00005fdde2502d24 in ractor_safe_call_cfunc_m1 (recv=126197147684320, argc=1, argv=0x72c690294048, func=0x5fdde230d0cc ) at ruby-3_3/vm_insnhelper.c:3311 #11 0x00005fdde250395b in vm_call_cfunc_with_frame_ (ec=0x5fdde3e9b570, reg_cfp=0x72c690393fa0, calling=0x7ffc2f1b66a0, argc=1, argv=0x72c690294048, stack_bottom=0x72c690294040) at ruby-3_3/vm_insnhelper.c:3502 #12 0x00005fdde2503b30 in vm_call_cfunc_with_frame (ec=0x5fdde3e9b570, reg_cfp=0x72c690393fa0, calling=0x7ffc2f1b66a0) at ruby-3_3/vm_insnhelper.c:3530 #13 0x00005fdde2503c48 in vm_call_cfunc_other (ec=0x5fdde3e9b570, reg_cfp=0x72c690393fa0, calling=0x7ffc2f1b66a0) at ruby-3_3/vm_insnhelper.c:3556 #14 0x00005fdde2504070 in vm_call_cfunc (ec=0x5fdde3e9b570, reg_cfp=0x72c690393fa0, calling=0x7ffc2f1b66a0) at ruby-3_3/vm_insnhelper.c:3638 #15 0x00005fdde2506b9f in vm_call_method_each_type (ec=0x5fdde3e9b570, cfp=0x72c690393fa0, calling=0x7ffc2f1b66a0) at ruby-3_3/vm_insnhelper.c:4429 #16 0x00005fdde25076e0 in vm_call_method (ec=0x5fdde3e9b570, cfp=0x72c690393fa0, calling=0x7ffc2f1b66a0) at ruby-3_3/vm_insnhelper.c:4566 #17 0x00005fdde250784d in vm_call_general (ec=0x5fdde3e9b570, reg_cfp=0x72c690393fa0, calling=0x7ffc2f1b66a0) at ruby-3_3/vm_insnhelper.c:4599 #18 0x00005fdde2509ed2 in vm_sendish (ec=0x5fdde3e9b570, reg_cfp=0x72c690393fa0, cd=0x5fdde3f4f7e0, block_handler=0, method_explorer=mexp_search_method) at ruby-3_3/vm_insnhelper.c:5593 #19 0x00005fdde25114b9 in vm_exec_core (ec=0x5fdde3e9b570) at ruby-3_3/insns.def:834 #20 0x00005fdde252829b in rb_vm_exec (ec=0x5fdde3e9b570) at vm.c:2486 #21 0x00005fdde25290fa in rb_iseq_eval_main (iseq=0x72c674667ad0) at vm.c:2752 #22 0x00005fdde230be59 in rb_ec_exec_node (ec=0x5fdde3e9b570, n=0x72c674667ad0) at eval.c:287 #23 0x00005fdde230bfd6 in ruby_run_node (n=0x72c674667ad0) at eval.c:328 #24 0x00005fdde2307245 in rb_main (argc=2, argv=0x7ffc2f1b7618) at ./main.c:39 #25 0x00005fdde23072bd in main (argc=2, argv=0x7ffc2f1b7618) at ./main.c:58 ``` In this minimal reproducer, usually it's when allocating a `Thread::Backtrace` object that triggers the issue, but on the original codebase where I was experimenting I saw this issue even with other objects (I think??). To be honest, maybe the problem is on the extension side -- this code assumes it's ok to raise/rescue in the `RUBY_INTERNAL_EVENT_NEWOBJ` tracepoint as it usually works fine, but maybe it's not? ---Files-------------------------------- exception-bug-repo.zip (3.79 KB) -- https://bugs.ruby-lang.org/ ______________________________________________ ruby-core mailing list -- ruby-core@ml.ruby-lang.org To unsubscribe send an email to ruby-core-leave@ml.ruby-lang.org ruby-core info -- https://ml.ruby-lang.org/mailman3/lists/ruby-core.ml.ruby-lang.org/