From: "phasis68 (Heesob Park)" Date: 2013-07-18T19:51:43+09:00 Subject: [ruby-core:56075] [ruby-trunk - Bug #8653] Unexpected result of String#succ with utf-16 and utf-32 string. Issue #8653 has been updated by phasis68 (Heesob Park). I understand that String#succ is not easy to implement for a UTF-16LE encoded string. In the case of a UTF-16 or UTF-32 string, it is possible to convert it to a UTF-8 string, get the succ value, and convert the result back to the original encoding. Here is a draft patch for rb_str_succ: diff --git a/string.c b/string.c.new index f7a12e0..f933052 100644 --- a/string.c +++ b/string.c.new @@ -3032,6 +3032,7 @@ enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry) VALUE rb_str_succ(VALUE orig) { + int idx; rb_encoding *enc; VALUE str; char *sbeg, *s, *e, *last_alnum = 0; @@ -3041,12 +3042,26 @@ rb_str_succ(VALUE orig) long carry_pos = 0, carry_len = 1; enum neighbor_char neighbor = NEIGHBOR_FOUND; - str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig)); - rb_enc_cr_str_copy_for_substr(str, orig); + idx = ENCODING_GET(orig); + switch(idx) { + case ENCINDEX_UTF_16BE: + case ENCINDEX_UTF_16LE: + case ENCINDEX_UTF_32BE: + case ENCINDEX_UTF_32LE: + case ENCINDEX_UTF_16: + case ENCINDEX_UTF_32: + str = rb_str_encode(orig, rb_enc_from_encoding(rb_utf8_encoding()), 0, Qnil); + break; + default: + str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig)); + rb_enc_cr_str_copy_for_substr(str, orig); + } + OBJ_INFECT(str, orig); if (RSTRING_LEN(str) == 0) return str; - enc = STR_ENC_GET(orig); + + enc = STR_ENC_GET(str); sbeg = RSTRING_PTR(str); s = e = sbeg + RSTRING_LEN(str); @@ -3066,6 +3081,15 @@ rb_str_succ(VALUE orig) case NEIGHBOR_NOT_CHAR: continue; case NEIGHBOR_FOUND: + switch(idx) { + case ENCINDEX_UTF_16BE: + case ENCINDEX_UTF_16LE: + case ENCINDEX_UTF_32BE: + case ENCINDEX_UTF_32LE: + case ENCINDEX_UTF_16: + case ENCINDEX_UTF_32: + str = rb_str_encode(str, rb_enc_from_encoding(rb_enc_from_index(idx)), 0, Qnil); + } return str; case NEIGHBOR_WRAPPED: 
last_alnum = s; @@ -3103,6 +3127,17 @@ rb_str_succ(VALUE orig) STR_SET_LEN(str, RSTRING_LEN(str) + carry_len); RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; rb_enc_str_coderange(str); + + switch(idx) { + case ENCINDEX_UTF_16BE: + case ENCINDEX_UTF_16LE: + case ENCINDEX_UTF_32BE: + case ENCINDEX_UTF_32LE: + case ENCINDEX_UTF_16: + case ENCINDEX_UTF_32: + str = rb_str_encode(str, rb_enc_from_encoding(rb_enc_from_index(idx)), 0, Qnil); + } + return str; } ---------------------------------------- Bug #8653: Unexpected result of String#succ with utf-16 and utf-32 string. https://bugs.ruby-lang.org/issues/8653#change-40569 Author: phasis68 (Heesob Park) Status: Open Priority: Normal Assignee: Category: Target version: ruby -v: ruby 2.1.0dev (2013-07-17 trunk 42011) [i386-mingw32] Backport: 1.9.3: UNKNOWN, 2.0.0: UNKNOWN I found that the result of String#succ on a UTF-16LE encoded string is incorrect. As a result, a Range of UTF-16LE encoded strings shows some unexpected behavior. C:\work>irb irb(main):001:0> a = 'A'.encode('UTF-16LE') => "A" irb(main):002:0> b = 'B'.encode('UTF-16LE') => "B" irb(main):003:0> a.succ => "\u0141" irb(main):004:0> r = a..b => "A".."B" irb(main):005:0> r.to_s => "A\u2E2EB" irb(main):006:0> r.count => 3 irb(main):007:0> r.to_a => ["A", "\u0141", "\u0241"] irb(main):008:0> r.include?(b) => false irb(main):009:0> a = 'A'.encode('UTF-32LE') => "A" irb(main):010:0> b = 'B'.encode('UTF-32LE') => "B" irb(main):011:0> a.succ => "\u{1000041}" irb(main):012:0> r = a..b => "A".."B" irb(main):013:0> r.to_s => "A\u{422E2E}\x00\x00" irb(main):014:0> r.count => 16777217 irb(main):015:0> r.to_a [FATAL] failed to allocate memory C:\work> -- http://bugs.ruby-lang.org/