From: "naruse (Yui NARUSE)" Date: 2013-04-04T02:55:41+09:00 Subject: [ruby-core:53968] [ruby-trunk - Feature #8206] Should Ruby core implement String#blank? Issue #8206 has been updated by naruse (Yui NARUSE). I came up with an idea, String#include? with regexp without backref. Could you try and comment this? % ruby -e'p [$&," foo".include?(/[[:space:]]/),$&]' [nil, true, nil] diff --git a/re.c b/re.c index 16d7e34..8c7d9de 100644 --- a/re.c +++ b/re.c @@ -1352,18 +1352,19 @@ rb_reg_adjust_startpos(VALUE re, VALUE str, long pos, int reverse) } /* returns byte offset */ -long -rb_reg_search(VALUE re, VALUE str, long pos, int reverse) +static long +rb_reg_search0(VALUE re, VALUE str, long pos, int reverse, int backref) { long result; VALUE match; - struct re_registers regi, *regs = ®i; + struct re_registers regi; + struct re_registers *regs = NULL; char *range = RSTRING_PTR(str); - regex_t *reg; + regex_t *reg = NULL; int tmpreg; if (pos > RSTRING_LEN(str) || pos < 0) { - rb_backref_set(Qnil); + if (backref) rb_backref_set(Qnil); return -1; } @@ -1371,18 +1372,21 @@ rb_reg_search(VALUE re, VALUE str, long pos, int reverse) tmpreg = reg != RREGEXP(re)->ptr; if (!tmpreg) RREGEXP(re)->usecnt++; - match = rb_backref_get(); - if (!NIL_P(match)) { - if (FL_TEST(match, MATCH_BUSY)) { - match = Qnil; + if (backref) { + regs = ®i; + match = rb_backref_get(); + if (!NIL_P(match)) { + if (FL_TEST(match, MATCH_BUSY)) { + match = Qnil; + } + else { + regs = RMATCH_REGS(match); + } } - else { - regs = RMATCH_REGS(match); + if (NIL_P(match)) { + MEMZERO(regs, struct re_registers, 1); } } - if (NIL_P(match)) { - MEMZERO(regs, struct re_registers, 1); - } if (!reverse) { range += RSTRING_LEN(str); } @@ -1416,29 +1420,44 @@ rb_reg_search(VALUE re, VALUE str, long pos, int reverse) } } - if (NIL_P(match)) { - match = match_alloc(rb_cMatch); - onig_region_copy(RMATCH_REGS(match), regs); - onig_region_free(regs, 0); - } - else { - if (rb_safe_level() >= 3) - OBJ_TAINT(match); - else - FL_UNSET(match, FL_TAINT); - } + if (backref) { + if (NIL_P(match)) { + match = match_alloc(rb_cMatch); + onig_region_copy(RMATCH_REGS(match), regs); + onig_region_free(regs, 0); + } + else { + if (rb_safe_level() >= 3) + OBJ_TAINT(match); + else + FL_UNSET(match, FL_TAINT); + } - RMATCH(match)->str = rb_str_new4(str); - RMATCH(match)->regexp = re; - RMATCH(match)->rmatch->char_offset_updated = 0; - rb_backref_set(match); + RMATCH(match)->str = rb_str_new4(str); + RMATCH(match)->regexp = re; + RMATCH(match)->rmatch->char_offset_updated = 0; + rb_backref_set(match); - OBJ_INFECT(match, re); - OBJ_INFECT(match, str); + OBJ_INFECT(match, re); + OBJ_INFECT(match, str); + } return result; } +/* returns byte offset */ +long +rb_reg_search(VALUE re, VALUE str, long pos, int reverse) +{ + return rb_reg_search0(re, str, pos, reverse, TRUE); +} + +long +rb_reg_search_without_backref(VALUE re, VALUE str, long pos, int reverse) +{ + return rb_reg_search0(re, str, pos, reverse, FALSE); +} + VALUE rb_reg_nth_defined(int nth, VALUE match) { diff --git a/string.c b/string.c index 8bbd8a4..64d53be 100644 --- a/string.c +++ b/string.c @@ -4335,6 +4335,7 @@ rb_str_reverse_bang(VALUE str) return str; } +long rb_reg_search_without_backref(VALUE re, VALUE str, long pos, int reverse); /* * call-seq: @@ -4353,8 +4354,13 @@ rb_str_include(VALUE str, VALUE arg) { long i; - StringValue(arg); - i = rb_str_index(str, arg, 0); + if (RB_TYPE_P(arg, T_REGEXP)) { + i = rb_reg_search_without_backref(arg, str, 0, FALSE); + } + else { + StringValue(arg); + i = rb_str_index(str, arg, 0); + } if (i == -1) return Qfalse; return Qtrue; ---------------------------------------- Feature #8206: Should Ruby core implement String#blank? https://bugs.ruby-lang.org/issues/8206#change-38187 Author: sam.saffron (Sam Saffron) Status: Open Priority: Normal Assignee: Category: core Target version: There has been some discussion about porting the #blank? protocol over to Ruby in the past that has been rejected by Matz. This proposal is only about String however. At the moment to figure out if you have a blank string you would " ".strip.length == 0 The disadvantage is that this forces unneeded allocations and does too much work: An optimal implementation would be: static VALUE rb_str_blank(VALUE str) { rb_encoding *enc; char *s, *e; enc = STR_ENC_GET(str); s = RSTRING_PTR(str); if (!s || RSTRING_LEN(str) == 0) return Qtrue; e = RSTRING_END(str); while (s < e) { int n; unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc); if (!rb_isspace(cc) && cc != 0) return Qfalse; s += n; } return Qtrue; } This in turn is about 5-8x than the regex solution to the problem and way faster than allocating one massive string with strip when length is large. Should Ruby take on this method, to accompany #strip following its practice. --- A slight caveat though is that active support has a somewhat different definition of blank? const unsigned int as_blank[26] = {9, 0xa, 0xb, 0xc, 0xd, 0x20, 0x85, 0xa0, 0x1680, 0x180e, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200a, 0x2028, 0x2029, 0x202f, 0x205f, 0x3000 }; static VALUE rb_str_blank_as(VALUE str) { rb_encoding *enc; char *s, *e; int i; int found; enc = STR_ENC_GET(str); s = RSTRING_PTR(str); if (!s || RSTRING_LEN(str) == 0) return Qtrue; e = RSTRING_END(str); while (s < e) { int n; unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc); found = 0; for(i=0;i<26;i++){ unsigned int current = as_blank[i]; if(current == cc) { found = 1; break; } if(cc < current){ break; } } if (!found) return Qfalse; s += n; } return Qtrue; } Clearly it makes no sense to have such a method. If Ruby took over implementing String#blank? it would clash with Active Support. But imho would enforce better API consistency. Thoughts? -- http://bugs.ruby-lang.org/