ruby-dev

うえのです。

正規表現中の文字クラスのマッチで、マルチバイト文字とシングルバイト
文字の区別が不完全なために、UTF-8な正規表現の中の文字クラスに
U+0080 から U+00FF の範囲の文字が含まれていた場合、期待した結果が
得られないことがあります。

EUC や SJIS だとマルチバイト文字の文字コードが 255 以下になることが
ないので問題無いのですが、UTF-8 では文字コードが 255 以下のマルチ
バイト文字があるので問題が発生し得るようです。

以下、テストケースと修正パッチです。


実験:

% ruby -v
ruby 1.8.0 (2003-01-17) [i686-linux]


re = Regexp.new("[\xc2\x80-\xed\x9f\xbf]+", nil, 'U')
s = "\xe3\x81\x82\xe3\x81\x81\xf0\x90\x80\x85\xe3\x81\x8a\xe3\x81\x85"
p s.scan(re)
  # => ["\xe3\x81\x82\xe3\x81\x81\xf0\x90\x80\x85\xe3\x81\x8a\xe3\x81\x85"]
  # expected: ["\xe3\x81\x82\xe3\x81\x81", "\xe3\x81\x8a\xe3\x81\x85"]

    \xe3\x81\x82, \xe3\x81\x81 はマルチバイト文字としてマッチ。
    \xf0\x90\x80\x85 はマルチバイト文字としてはマッチしないが、
    先頭バイト \xf0 がシングルバイト文字として単独でマッチして
    しまう。後続の \x90, \x80, \x85 も同様。
    該当箇所は regex.c:3827 辺り。


re = Regexp.new("[\xc2\x80-\xed\x9f\xbf]", nil, 'U')
p "\xf0\x90\x80\x85\xe3\x81\x82".scan(re)
  # => ["\xe3\x81\x82"]
  # expected: ["\xe3\x81\x82"]

    これは正しく動く。\xf0 が fastmap に含まれていないので
    re_search で \xf0\x90\x80\x85 が読み飛ばされるため。
    該当箇所は regex.c:3212 辺り。


re = Regexp.new("[\xc2\x80-\xed\x9f\xbe]", nil, 'U')
p "\xed\x9f\xbf".scan(re)
  # => ["\xed"]
  # expected: []

    \xed は fastmap に含まれるため re_search は読み飛ばせない。
    \xed\x9f\xbf はマッチ失敗だが \xed はマッチ成功、\x9f と \xbf は
    2回目以降のマッチで re_search により読み飛ばされる。


re = Regexp.new("[\xc3\xad\xed\x9f\xbe]", nil, 'U')
p "\xed\x9f\xbf".scan(re)
  # => ["\xed"]
  # expected: []

    範囲でなくても同じ。(\xc3\xad は U+00ED)


re = Regexp.new("[\xc4\x80-\xed\x9f\xbe]", nil, 'U')
p "\xed\x9f\xbf".scan(re)
  # => []
  # expected: []

    これなら良い。マルチバイト文字の文字コードが 256 以上になっている
    ため。


re = Regexp.new("[^\xc2\x80-\xed\x9f\xbe]", nil, 'U')
s = "\xed\x9f\xbf\xf0\x90\x80\x85\xed\x9f\xbf"
p s.scan(re)
  # => []
  # expected: ["\xed\x9f\xbf", "\xf0\x90\x80\x85", "\xed\x9f\xbf"]

    \xed\x9f\xbf は範囲に含まれないが、先頭バイト \xed 単独では範囲に
    含まれると判定される。否定文字クラスなので、これによりマッチ失敗
    となる。マルチバイト文字 \xed\x9f\xbf を飛ばして \xf0 から再試行。
    以下同様。


re = Regexp.new("[^\xc3\xad\xed\x9f\xbe]", nil, 'U')
s = "\xed\x9f\xbf"
p s.scan(re)
  # => []
  # expected: ["\xed\x9f\xbf"]

    範囲でなくても同じ。(\xc3\xad は U+00ED)


re = Regexp.new("[^\xc4\x80-\xed\x9f\xbe]", nil, 'U')
s = "\xed\x9f\xbf\xf0\x90\x80\x85\xed\x9f\xbf"
p s.scan(re)
  # => ["\xed\x9f\xbf","\xf0\x90\x80\x85","\xed\x9f\xbf"]
  # expected: ["\xed\x9f\xbf","\xf0\x90\x80\x85","\xed\x9f\xbf"]

    これなら良い。マルチバイト文字の文字コードが 256 以上になっている
    ため。


re = Regexp.new("[\xfe\xff\xc3\x80]", nil, 'U')
p "\xc3\xbe\xc3\xbf".scan(re)
  # => [ "\xc3\xbe", "\xc3\xbf" ]
  # expected: []

    マルチバイト文字がシングルバイト文字としてマッチしている。
    (\xc3\xbe は U+00FE、\xc3\xbf は U+00FF)
    \xc3\x80 を入れているのは \xc3 で始まるマルチバイト文字を
    fastmap によってスキップできないようにするため。



パッチ:

Index: regex.c
===================================================================
RCS file: /src/ruby/regex.c,v
retrieving revision 1.77
diff -u -p -r1.77 regex.c
--- regex.c	2 Jan 2003 16:56:16 -0000	1.77
+++ regex.c	19 Jan 2003 05:01:12 -0000
@@ -698,7 +698,18 @@ set_list_bits(c1, c2, b)
 }
 
 static int
-is_in_list(c, b)
+is_in_list_sbc(c, b)
+    unsigned long c;
+    const unsigned char *b;
+{
+  unsigned short size;
+
+  size = *b++;
+  return ((int)c / BYTEWIDTH < (int)size && b[c / BYTEWIDTH] & 1 << c % BYTEWIDTH);
+}
+  
+static int
+is_in_list_mbc(c, b)
     unsigned long c;
     const unsigned char *b;
 {
@@ -706,9 +717,6 @@ is_in_list(c, b)
   unsigned short i, j;
 
   size = *b++;
-  if ((int)c / BYTEWIDTH < (int)size && b[c / BYTEWIDTH] & 1 << c % BYTEWIDTH) {
-    return 1;
-  }
   b += size + 2;
   size = EXTRACT_UNSIGNED(&b[-2]);
   if (size == 0) return 0;
@@ -727,6 +735,14 @@ is_in_list(c, b)
   return 0;
 }
 
+static int
+is_in_list(c, b)
+    unsigned long c;
+    const unsigned char *b;
+{
+  return is_in_list_sbc(c, b) || is_in_list_mbc(c, b);
+}
+
 static void
 print_partial_compiled_pattern(start, end)
     unsigned char *start;
@@ -3815,19 +3832,25 @@ re_match_exec(bufp, string_arg, size, po
 	  int cc, c;
 
 	  PREFETCH;
-	  cc = c = (unsigned char)*d++;
+	  c = (unsigned char)*d++;
 	  if (ismbchar(c)) {
 	    if (d + mbclen(c) - 1 <= dend) {
+	      cc = c;
 	      MBC2WC(c, d);
+	      not = is_in_list_mbc(c, p);
+	      if (!not) {
+		part = not = is_in_list_sbc(cc, p);
+	      }
+	    } else {
+	      not = is_in_list_sbc(c, p);
 	    }
 	  }
-	  else if (TRANSLATE_P())
-	    cc = c = (unsigned char)translate[c];
-
-	  not = is_in_list(c, p);
-	  if (!not && cc != c) {
-	      part = not = is_in_list(cc, p);
+	  else {
+	    if (TRANSLATE_P())
+	      c = (unsigned char)translate[c];
+	    not = is_in_list_sbc(c, p);
 	  }
+
 	  if (*(p - 1) == (unsigned char)charset_not) {
 	    not = !not;
 	  }



 --  ----  -     - - -- -
うえの かつひろ <unnie@blue.sky.or.jp>

# regex.c、初めてまともに読みましたけど、かなりツラかったです… ;p

Thread

Prev Next

In This Thread

Prev Next