[#15625] rb_hash_initialize — Takaaki Tateishi <ttate@...>

立石です.

22 messages 2002/01/04
[#15627] Re: rb_hash_initialize — matz@... (Yukihiro Matsumoto) 2002/01/04

まつもと ゆきひろです

[#15628] Re: rb_hash_initialize — Takaaki Tateishi <ttate@...> 2002/01/04

立石です.

[#15685] undefined method `inherited' for false (NameError) — WATANABE Hirofumi <eban@...>

わたなべです。

13 messages 2002/01/15
[#15686] Re: undefined method `inherited' for false (NameError) — nobu.nakada@... 2002/01/15

なかだです。

[#15757] 文字列→整数変換 — nobu.nakada@...

なかだです。

30 messages 2002/01/25

[#15830] [ 提案 ] puts, print 等を IO から分離 — UENO Katsuhiro <unnie@...>

うえのです。

14 messages 2002/01/31

[ruby-dev:15795] [PATCH] improve on \G

From: nobu.nakada@...
Date: 2002-01-29 10:43:48 UTC
List: ruby-dev #15795
なかだです。

[ruby-talk:30523]で、-Kを指定すると\Gを使ったマッチが遅いという
話が出てます。調べてみると、regex.cのre_adjust_startpos()でマル
チバイトの境界を文字列の最初から探しているせいのようです。逆方
向に探すことで-Kがないときとほぼ同じ速度になります。

#!./ruby -Ke
require 'rubyunit'

class TestKanji < TestCase
  def test_kanji
    s = "あいうえお"
    assert_equal(0, /\Gあ/ =~ s)
    assert_equal(0, /\Aあ/ =~ s)
    assert_equal(2, /い/ =~ s)
    assert_equal(4, s.index(/\Gう/, 4))
    assert_equal(4, s.index(/\Gう/, 5))
    assert_nil(s.index(/\Gう/, 6))
  end
end

それから、SJISの一バイト目は0xfcまでだったようなのと、EUCのSSL2
も追加してあります。


Index: regex.c
===================================================================
RCS file: /cvs/ruby/src/ruby/regex.c,v
retrieving revision 1.58
diff -u -2 -p -r1.58 regex.c
--- regex.c	2002/01/23 07:30:39	1.58
+++ regex.c	2002/01/29 10:39:58
@@ -479,4 +479,6 @@ re_set_syntax(syntax)
  ((current_mbctype != MBCTYPE_UTF8) ? ((c<0x100) ? (c) : (((c)>>8)&0xff)) : utf8_firstbyte(c))
 
+int mbc_backward_char _((const char *start, int pos));
+
 static unsigned int
 utf8_firstbyte(c)
@@ -3077,26 +3079,8 @@ re_adjust_startpos(bufp, string, size, s
   /* Adjust startpos for mbc string */
   if (current_mbctype && startpos>0 && !(bufp->options&RE_OPTIMIZE_BMATCH)) {
-    int i = 0;
+    int i = mbc_backward_char(string, startpos);
 
-    if (range > 0) {
-      while (i<size) {
-	i += mbclen(string[i]);
-	if (startpos <= i) {
-	  startpos = i;
-	  break;
-	}
-      }
-    }
-    else {
-      int w;
-
-      while (i<size) {
-	w = mbclen(string[i]);
-	if (startpos < i + w) {
-	  startpos = i;
-	  break;
-	}
-	i += w;
-      }
+    if (i < startpos && range > 0) {
+      startpos = i + mbclen(string[i]);
     }
   }
@@ -4392,4 +4376,16 @@ re_free_registers(regs)
    Last change: Jul. 9, 1993 by t^2  */
 static const unsigned char mbctab_ascii[] = {
+  /* forward scan */
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -4397,4 +4393,11 @@ static const unsigned char mbctab_ascii[
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+  /* reverse scan */
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -4411,4 +4414,24 @@ static const unsigned char mbctab_ascii[
 
 static const unsigned char mbctab_euc[] = { /* 0xA1-0xFE */
+  /* forward scan */
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
+
+  /* reverse scan */,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -4419,5 +4442,4 @@ static const unsigned char mbctab_euc[] 
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -4430,4 +4452,5 @@ static const unsigned char mbctab_euc[] 
 
 static const unsigned char mbctab_sjis[] = { /* 0x80-0x9f,0xE0-0xFF */
+  /* forward scan */
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -4445,8 +4468,27 @@ static const unsigned char mbctab_sjis[]
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
+
+  /* reverse scan */,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
 };
 
 static const unsigned char mbctab_utf8[] = {
+  /* forward scan */
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -4464,5 +4506,23 @@ static const unsigned char mbctab_utf8[]
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 0, 0
+  3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 0, 0,
+
+  /* reverse scan */,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 
@@ -4490,4 +4550,29 @@ re_mbcinit(mbctype)
     current_mbctype = MBCTYPE_UTF8;
     break;
+  }
+}
+
+int
+mbc_backward_char(string, pos)
+     const char *string;
+     int pos;
+{
+  const char *ptr = string + pos;
+  while (ptr > string) {
+    unsigned char c = *ptr;
+    if (!re_mbctab[c+256]) return ptr - string;
+    --ptr;
+  }
+
+  switch (current_mbctype) {
+  case MBCTYPE_EUC:
+  case MBCTYPE_SJIS:
+    /* double byte char only */
+    return pos / 2;
+  case MBCTYPE_UTF8:
+    /* illegal sequence */
+    /* fall through */
+  default:
+    return pos;
   }
 }


-- 
--- 僕の前にBugはない。
--- 僕の後ろにBugはできる。
    中田 伸悦

In This Thread

Prev Next