From: "naruse (Yui NARUSE)" <naruse@...> Date: 2013-04-09T04:38:13+09:00 Subject: [ruby-dev:47240] [ruby-trunk - Feature #6752] Replacing ill-formed subsequencce Issue #6752 has been updated by naruse (Yui NARUSE). duerst (Martin Dürst) wrote: > I have thought about this a bit. Yui's patch to string treats this as a problem separate from transcoding. I think it is preferable to use the transcoding logic to implement this. The advantage is that exactly the same options and fallbacks can be used, and if we add a new option or fallback to transcode, it will be usable, too. This method doesn't need the same options and fallbacks. It needs only the invalid-related ones; it doesn't need the undef-related ones. Moreover, a transcoder is usable only if Ruby has a transcoder for the target encoding. But Ruby has some encodings which don't have a transcoder, for example emacs-mule. Therefore this can't be built on the transcoder. > Some more notes: The checks for converting from one encoding to the same encoding are in str_transcode0. Anywhere else? We need some data to drive the conversion, but this should be easy to generate, and will be the same for many 8-bit encodings. Yeah, I came to str_transcode0 and it is the correct place. The data we need is the problem. transcode doesn't have all the data, though tool/transcode-tblgen.rb has some base data. The only place which has all the data we need is enc/*. > It will be easy to catch invalid byte sequences, but I'm not sure it's worth checking unassigned codepoints, at least not in Unicode. If we need to check unassigned codepoints, we must define encodings more strictly. Even for Unicode, it would need versions. I don't think it's worth checking. 
---------------------------------------- Feature #6752: Replacing ill-formed subsequencce https://bugs.ruby-lang.org/issues/6752#change-38370 Author: naruse (Yui NARUSE) Status: Assigned Priority: Normal Assignee: matz (Yukihiro Matsumoto) Category: core Target version: next minor =begin == 概要 Stringになんらかの理由で不正なバイト列が含まれている時に、それを置換文字で置き換えたい。 == ユースケース 実際に確認されているユースケースは以下の通りです。 * twitterのtitle * IRCのログ * ニコニコ動画の API * Webクローリング これらの不正なバイト列の生成過程は、おそらく、バイト単位で文字列を切り詰めた時に末尾が切れて、 末尾がおかしい不正な文字列が作られます。(前二者) これをコンテナに入れたり結合することによって、途中にも混ざった文字列が作られます。(後二者) * https://twitter.com/takahashim/status/18974040397 * https://twitter.com/n0kada/status/215674740705210368 * https://twitter.com/n0kada/status/215686490070585346 * https://twitter.com/hajimehoshi/status/215671146769682432 * http://po-ru.com/diary/fixing-invalid-utf-8-in-ruby-revisited/ * http://stackoverflow.com/questions/2982677/ruby-1-9-invalid-byte-sequence-in-utf-8 == 必要な引数: 置換文字 省略可能、String。 デフォルトは、Unicode系ならU+FFFD、それ以外では「?」。 デフォルトが空文字でない理由は、削除してしまうことで、従来は存在しなかったトークンを作れてしまい、 上位のレイヤーの脆弱性に繋がるからです。 http://unicode.org/reports/tr36/#UTF-8_Exploit == API --- str.encode(str.encoding, invalid: replace, [replace: "〓"]) * CSI的じゃなくて気持ち悪い * iconv でできるのは glibc iconv か GNU libiconv に //IGNORE つけた時で他はできない * 実装上のメリットは後述の通り、直感に反してあまりない(と思う) == 別メソッド * 新しいメソッドである * fix/repair invalid/illegal bytes/sequence あたりの名前か == 実装 === 鬼車ベース int ret = rb_enc_precise_mbclen(p, e, enc); して、 MBCLEN_INVALID_P(ret) が真な時、何バイト目が不正なのかわからないのが微妙。 ONIGENC_CONSTRUCT_MBCLEN_INVALID() がバイト数を取らないのが原因なので、 鬼車のエンコーディングモジュール全てに影響してしまうため、修正困難。 不正なバイトはほとんど存在しないと仮定して、効率を犠牲にすれば回避は可能。 === transcodeベース UCS正規化なglibc iconv, GNU libiconv, Perl Encodeなどと違って、 CSIなtranscodeでは、自分自身に変換する場合、 エンコーディングごとに「何もしない」変換モジュールを用意しないといけない。 とりあえず鬼車ベースのコンセプト実装とテストを添付しておきます。 diff --git a/string.c b/string.c index d038835..4808f15 100644 --- a/string.c +++ b/string.c @@ -7426,6 +7426,199 @@ rb_str_ellipsize(VALUE str, long len) return ret; } +/* + * call-seq: + * str.fix_invalid -> new_str + * + * 
If the string is well-formed, it returns self. + * If the string has invalid byte sequence, repair it with given replacement + * character. + */ +VALUE +rb_str_fix_invalid(VALUE str) +{ + int cr = ENC_CODERANGE(str); + rb_encoding *enc; + if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) + return rb_str_dup(str); + + enc = STR_ENC_GET(str); + if (rb_enc_asciicompat(enc)) { + const char *p = RSTRING_PTR(str); + const char *e = RSTRING_END(str); + const char *p1 = p; + /* 10 should be enough for the usual use case, + * fixing a wrongly chopped character at the end of the string + */ + long room = 10; + VALUE buf = rb_str_buf_new(RSTRING_LEN(str) + room); + const char *rep; + if (enc == rb_utf8_encoding()) + rep = "\xEF\xBF\xBD"; + else + rep = "?"; + cr = ENC_CODERANGE_7BIT; + + p = search_nonascii(p, e); + if (!p) { + p = e; + } + while (p < e) { + int ret = rb_enc_precise_mbclen(p, e, enc); + if (MBCLEN_CHARFOUND_P(ret)) { + if ((unsigned char)*p > 127) cr = ENC_CODERANGE_VALID; + p += MBCLEN_CHARFOUND_LEN(ret); + } + else if (MBCLEN_INVALID_P(ret)) { + const char *q; + long clen = rb_enc_mbmaxlen(enc); + if (p > p1) rb_str_buf_cat(buf, p1, p - p1); + q = RSTRING_END(buf); + + if (e - p < clen) clen = e - p; + if (clen < 3) { + clen = 1; + } + else { + long len = RSTRING_LEN(buf); + clen--; + rb_str_buf_cat(buf, p, clen); + for (; clen > 1; clen--) { + ret = rb_enc_precise_mbclen(q, q + clen, enc); + if (MBCLEN_NEEDMORE_P(ret)) { + break; + } + else if (MBCLEN_INVALID_P(ret)) { + continue; + } + else { + rb_bug("shouldn't reach here '%s'", q); + } + } + rb_str_set_len(buf, len); + } + p += clen; + p1 = p; + rb_str_buf_cat2(buf, rep); + p = search_nonascii(p, e); + if (!p) { + p = e; + break; + } + } + else if (MBCLEN_NEEDMORE_P(ret)) { + break; + } + else { + rb_bug("shouldn't reach here"); + } + } + if (p1 < p) { + rb_str_buf_cat(buf, p1, p - p1); + } + if (p < e) { + rb_str_buf_cat2(buf, rep); + cr = ENC_CODERANGE_VALID; + } + ENCODING_CODERANGE_SET(buf, 
rb_enc_to_index(enc), cr); + return buf; + } + else if (rb_enc_dummy_p(enc)) { + return rb_str_dup(str); + } + else { + /* ASCII incompatible */ + const char *p = RSTRING_PTR(str); + const char *e = RSTRING_END(str); + const char *p1 = p; + /* 10 should be enough for the usual use case, + * fixing a wrongly chopped character at the end of the string + */ + long room = 10; + VALUE buf = rb_str_buf_new(RSTRING_LEN(str) + room); + const char *rep; + long mbminlen = rb_enc_mbminlen(enc); + static rb_encoding *utf16be; + static rb_encoding *utf16le; + static rb_encoding *utf32be; + static rb_encoding *utf32le; + if (!utf16be) { + utf16be = rb_enc_find("UTF-16BE"); + utf16le = rb_enc_find("UTF-16LE"); + utf32be = rb_enc_find("UTF-32BE"); + utf32le = rb_enc_find("UTF-32LE"); + } + if (enc == utf16be) { + rep = "\xFF\xFD"; + } + else if (enc == utf16le) { + rep = "\xFD\xFF"; + } + else if (enc == utf32be) { + rep = "\x00\x00\xFF\xFD"; + } + else if (enc == utf32le) { + rep = "\xFD\xFF\x00\x00"; + } + else { + rep = "?"; + } + + while (p < e) { + int ret = rb_enc_precise_mbclen(p, e, enc); + if (MBCLEN_CHARFOUND_P(ret)) { + p += MBCLEN_CHARFOUND_LEN(ret); + } + else if (MBCLEN_INVALID_P(ret)) { + const char *q; + long clen = rb_enc_mbmaxlen(enc); + if (p > p1) rb_str_buf_cat(buf, p1, p - p1); + q = RSTRING_END(buf); + + if (e - p < clen) clen = e - p; + if (clen < mbminlen * 3) { + clen = mbminlen; + } + else { + long len = RSTRING_LEN(buf); + clen -= mbminlen; + rb_str_buf_cat(buf, p, clen); + for (; clen > mbminlen; clen-=mbminlen) { + ret = rb_enc_precise_mbclen(q, q + clen, enc); + if (MBCLEN_NEEDMORE_P(ret)) { + break; + } + else if (MBCLEN_INVALID_P(ret)) { + continue; + } + else { + rb_bug("shouldn't reach here '%s'", q); + } + } + rb_str_set_len(buf, len); + } + p += clen; + p1 = p; + rb_str_buf_cat2(buf, rep); + } + else if (MBCLEN_NEEDMORE_P(ret)) { + break; + } + else { + rb_bug("shouldn't reach here"); + } + } + if (p1 < p) { + rb_str_buf_cat(buf, p1, p - p1); + 
} + if (p < e) { + rb_str_buf_cat2(buf, rep); + } + ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), ENC_CODERANGE_VALID); + return buf; + } +} + /********************************************************************** * Document-class: Symbol * @@ -7882,6 +8075,7 @@ Init_String(void) rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1); rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2); rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1); + rb_define_method(rb_cString, "fix_invalid", rb_str_fix_invalid, 0); rb_define_method(rb_cString, "to_i", rb_str_to_i, -1); rb_define_method(rb_cString, "to_f", rb_str_to_f, 0); diff --git a/test/ruby/test_string.rb b/test/ruby/test_string.rb index 47f349c..2b0cfeb 100644 --- a/test/ruby/test_string.rb +++ b/test/ruby/test_string.rb @@ -2031,6 +2031,29 @@ class TestString < Test::Unit::TestCase assert_equal(u("\x82")+("\u3042"*9), ("\u3042"*10).byteslice(2, 28)) end + + def test_fix_invalid + assert_equal("\uFFFD\uFFFD\uFFFD", "\x80\x80\x80".fix_invalid) + assert_equal("\uFFFDA", "\xF4\x80\x80A".fix_invalid) + + # exapmles in Unicode 6.1.0 D93b + assert_equal("\x41\uFFFD\uFFFD\x41\uFFFD\x41", + "\x41\xC0\xAF\x41\xF4\x80\x80\x41".fix_invalid) + assert_equal("\x41\uFFFD\uFFFD\uFFFD\x41", + "\x41\xE0\x9F\x80\x41".fix_invalid) + assert_equal("\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064", + "\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64".fix_invalid) + + assert_equal("abcdefghijklmnopqrstuvwxyz\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064", + "abcdefghijklmnopqrstuvwxyz\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64".fix_invalid) + + assert_equal("\uFFFD\u3042".encode("UTF-16BE"), + "\xD8\x00\x30\x42".force_encoding(Encoding::UTF_16BE). + fix_invalid) + assert_equal("\uFFFD\u3042".encode("UTF-16LE"), + "\x00\xD8\x42\x30".force_encoding(Encoding::UTF_16LE). + fix_invalid) + end end class TestString2 < TestString =end -- http://bugs.ruby-lang.org/