[ruby-dev:31806] rb_str_substr is much slower than rb_str_subpat

From: Nobuyoshi Nakada <nobu@...>
Date: 2007-09-20 20:37:59 UTC
List: ruby-dev #31806
なかだです。

1.9では、長い文字列に対するString#[]が、Fixnumを使ったときのほう
がRegexpを使ったときよりも、格段に遅くなっています。

#! /usr/bin/ruby -Ke
require "benchmark"
# Benchmark helpers added to String: two ways of fetching the first and last
# character, so that Regexp-based and integer-based String#[] can be compared.
class String
  # First character via a Regexp anchored at the start of the string
  # (/m lets "." match a newline as well).
  def first
    self[/\A./m]
  end

  # Last character via a Regexp anchored at the very end of the string.
  def last
    self[/.\z/m]
  end

  # First character via an integer index.
  def first1
    self[0]
  end

  # Last character via a negative integer index.
  def last1
    self[-1]
  end
end

# Two workloads, each a [string, iteration count] pair: a long non-ASCII
# string with fewer iterations, and a short ASCII string with more.
workloads = [
  ["日本語" * 1000, 10000],
  ["x" * 10, 100000]
]

workloads.each do |text, count|
  # Print what each accessor returns before timing it.
  p [text.first, text.last, text.first1, text.last1]

  # Time the Regexp-based accessors against the integer-index ones.
  Benchmark.bm(6) do |bench|
    bench.report("first") { count.times { text.first } }
    bench.report("last ") { count.times { text.last } }
    bench.report("first1") { count.times { text.first1 } }
    bench.report("last1 ") { count.times { text.last1 } }
  end
end

$ ruby19 -v /tmp/nobu/str.rb
ruby 1.9.0 (2007-09-21 revision 13475) [i686-linux]
["日", "語", "日", "語"]
            user     system      total        real
first   0.020000   0.000000   0.020000 (  0.014829)
last    0.160000   0.000000   0.160000 (  0.164511)
first1  0.580000   0.000000   0.580000 (  0.579861)
last1   1.160000   0.010000   1.170000 (  1.152732)
["x", "x", "x", "x"]
            user     system      total        real
first   0.150000   0.000000   0.150000 (  0.154660)
last    0.210000   0.000000   0.210000 (  0.212844)
first1  0.060000   0.000000   0.060000 (  0.059491)
last1   0.060000   0.000000   0.060000 (  0.058233)

どうもstr_strlen()がボトルネックっぽいです。
下のパッチを当てると、次のようになります。

$ ./ruby -v /tmp/nobu/str.rb
ruby 1.9.0 (2007-09-21 revision 13478) [i686-linux]
["日", "語", "日", "語"]
            user     system      total        real
first   0.030000   0.000000   0.030000 (  0.023918)
last    0.200000   0.000000   0.200000 (  0.199748)
first1  0.010000   0.000000   0.010000 (  0.006900)
last1   0.080000   0.000000   0.080000 (  0.079601)
["x", "x", "x", "x"]
            user     system      total        real
first   0.160000   0.000000   0.160000 (  0.164098)
last    0.190000   0.000000   0.190000 (  0.184262)
first1  0.050000   0.000000   0.050000 (  0.055136)
last1   0.060000   0.000000   0.060000 (  0.059857)


Index: string.c
===================================================================
--- string.c	(revision 13478)
+++ string.c	(working copy)
@@ -737,25 +737,39 @@ rb_str_substr(VALUE str, long beg, long 
     rb_encoding *enc = rb_enc_get(str);
     VALUE str2;
-    int slen = str_strlen(str, enc);
+    char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
 
     if (len < 0) return Qnil;
-    if (beg > slen) return Qnil;
-    if (beg < 0) {
-	beg += slen;
-	if (beg < 0) return Qnil;
+    if (!RSTRING_LEN(str)) {
+	len = 0;
     }
-    if (beg + len > slen) {
-	len = slen - beg;
+    if (beg < 0) {
+	if (len > -beg) len = -beg;
+	if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
+	    beg = -beg;
+	    while (len++ < beg && (e = rb_enc_prev_char(s, e, enc)) != 0);
+	    p = e;
+	    if (!p) return Qnil;
+	    while (beg-- > 0 && (p = rb_enc_prev_char(s, p, enc)) != 0);
+	    if (!p) return Qnil;
+	    len = e - p;
+	    goto sub;
+	}
+	else {
+	    beg += str_strlen(str, enc);
+	    if (beg < 0) return Qnil;
+	}
     }
-    if (len < 0) {
-	len = 0;
+    else if (beg > 0 && beg > str_strlen(str, enc)) {
+	return Qnil;
     }
     if (len == 0) {
-	str2 = rb_str_new5(str,0,0);
+	p = 0;
     }
     else {
-	char *p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc);
-	str2 = rb_str_new5(str, p, str_offset(p, RSTRING_END(str), len, enc));
+	p = str_nth(s, e, beg, enc);
+	len = str_offset(p, e, len, enc);
     }
+  sub:
+    str2 = rb_str_new5(str, p, len);
     rb_enc_copy(str2, str);
     OBJ_INFECT(str2, str);


-- 
--- 僕の前にBugはない。
--- 僕の後ろにBugはできる。
    中田 伸悦

In This Thread

Prev Next