[ruby-dev:31734] [m17n] String#chop & String#succ

From: Nobuyoshi Nakada <nobu@...>
Date: 2007-09-05 07:25:46 UTC
List: ruby-dev #31734
なかだです。

http://dontstopmusic.no-ip.org/diary/20070827.html#p02 で指摘さ
れているchopとsuccに関するパッチです。

succに関しては、英数字以外は単純に文字コード順にずらしていますが、
本来ならひらがなとかカタカナとかの中で繰り上げをしたほうがいいの
かもしれません。たぶん以下のような機能が必要になるかと思います。

1 scriptを調べる
2 順序付可能か調べる
3 あるscript中での次の文字を得る
4 あるscript中での最初の文字を得る


Index: string.c
===================================================================
--- string.c	(revision 13339)
+++ string.c	(working copy)
@@ -1585,4 +1585,45 @@ succ_char(char *s)
 }
 
+static int
+enc_succ_char(unsigned int c, char *s, rb_encoding *enc)
+{ /* Store at s the successor of code point c, staying inside c's digit/lowercase/uppercase class. */
+    unsigned int cs; /* always one below c; used to walk back to the lowest code point of the run */
+    /* Returns 0 if no wrap happened, the carry code point after a wrap, or -1 if c is in none of the classes. */
+    /* numerics: on wrap, write the lowest digit of the run and carry the code point after it */
+    if (rb_enc_isdigit(c, enc)) {
+	cs = c++; /* cs keeps the original code point; c is now the candidate successor */
+	if (rb_enc_isdigit(c, enc)) {
+	    rb_enc_mbcput(c, s, enc); /* the result of rb_enc_mbcput is not checked in this function */
+	    return 0;
+	}
+	do c = cs--; while (rb_enc_isdigit(cs, enc)); /* step down until the code point below c is not a digit */
+	rb_enc_mbcput(c, s, enc);
+	return ++c; /* carry is one past the lowest digit just written */
+    }
+    /* small alphabets: same stepping, but the carry is the lowest lowercase code point itself */
+    if (rb_enc_islower(c, enc)) {
+	cs = c++; /* cs keeps the original code point; c is now the candidate successor */
+	if (rb_enc_islower(c, enc)) {
+	    rb_enc_mbcput(c, s, enc);
+	    return 0;
+	}
+	do c = cs--; while (rb_enc_islower(cs, enc)); /* step down until the code point below c is not lowercase */
+	rb_enc_mbcput(c, s, enc);
+	return c; /* carry is the lowest lowercase code point just written */
+    }
+    /* capital alphabets: same stepping, but the carry is the lowest uppercase code point itself */
+    if (rb_enc_isupper(c, enc)) {
+	cs = c++; /* cs keeps the original code point; c is now the candidate successor */
+	if (rb_enc_isupper(c, enc)) {
+	    rb_enc_mbcput(c, s, enc);
+	    return 0;
+	}
+	do c = cs--; while (rb_enc_isupper(cs, enc)); /* step down until the code point below c is not uppercase */
+	rb_enc_mbcput(c, s, enc);
+	return c; /* carry is the lowest uppercase code point just written */
+    }
+    return -1; /* c failed the rb_enc_isdigit, rb_enc_islower and rb_enc_isupper checks */
+}
+
 
 /*
@@ -1618,36 +1659,49 @@ rb_str_succ(VALUE orig)
     char *sbeg, *s, *e;
     int c = -1;
-    long n = 0;
+    long n = 0, o = 0, l;
+    char carry[ONIGENC_CODE_TO_MBC_MAXLEN];
 
     str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
+    rb_enc_copy(str, orig);
     OBJ_INFECT(str, orig);
     if (RSTRING_LEN(str) == 0) return str;
 
     enc = rb_enc_get(orig);
-    sbeg = RSTRING_PTR(str); s = sbeg + RSTRING_LEN(str) - 1;
-    e = RSTRING_END(str);
+    sbeg = RSTRING_PTR(str);
+    s = e = sbeg + RSTRING_LEN(str);
 
-    while (sbeg <= s) {
+    while (sbeg <= (s = rb_enc_prev_char(sbeg, s, enc))) {
 	unsigned int cc = rb_enc_codepoint(s, e, enc);
 	if (rb_enc_isalnum(cc, enc)) {
-	    if ((c = succ_char(s)) == 0) break;
+	    if (isascii(cc)) {
+		if ((c = succ_char(s)) == 0) break;
+	    }
+	    else {
+		if ((c = enc_succ_char(cc, s, enc)) == 0) break;
+	    }
 	    n = s - sbeg;
 	}
-	s--;
     }
     if (c == -1) {		/* str contains no alnum */
-	sbeg = RSTRING_PTR(str); s = sbeg + RSTRING_LEN(str) - 1;
 	c = '\001';
-	while (sbeg <= s) {
-	    if ((*s += 1) != 0) break;
-	    s--;
+	s = e;
+	while (sbeg <= (s = rb_enc_prev_char(sbeg, e, enc))) {
+	    unsigned int cc = rb_enc_codepoint(s, e, enc) + 1;
+	    l = rb_enc_mbcput(cc, carry, enc);
+	    if (l > 0) {
+		if (l == (o = e - s)) goto overlay;
+		n = s - sbeg;
+		goto insert;
+	    }
 	}
     }
-    if (s < sbeg) {
-	RESIZE_CAPA(str, RSTRING_LEN(str) + 1);
+    if (s < sbeg && (l = rb_enc_mbcput(c, carry, enc)) > 0) {
+      insert:
+	RESIZE_CAPA(str, RSTRING_LEN(str) + l - o);
 	s = RSTRING_PTR(str) + n;
-	memmove(s+1, s, RSTRING_LEN(str) - n);
-	*s = c;
-	STR_SET_LEN(str, RSTRING_LEN(str) + 1);
+	memmove(s + l, s + o, RSTRING_LEN(str) - n - o);
+      overlay:
+	memmove(s, carry, l);
+	STR_SET_LEN(str, RSTRING_LEN(str) + l - o);
 	RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
     }
@@ -4041,4 +4095,22 @@ rb_str_each_char(VALUE str)
 }
 
+static long
+chopped_length(VALUE str)
+{ /* Compute the byte length that the chop callers in this patch give to the shortened string. */
+    rb_encoding *enc = rb_enc_get(str);
+    const char *p, *p2, *beg, *end;
+    /* Returns the offset from beg of the last character, or of a '\r' directly before a final '\n'; 0 if none. */
+    beg = RSTRING_PTR(str);
+    end = beg + RSTRING_LEN(str);
+    if (beg > end) return 0; /* NOTE(review): beg == end passes this test; presumably the !p check below covers it -- confirm */
+    p = rb_enc_prev_char(beg, end, enc); /* start of the character preceding end */
+    if (!p) return 0; /* no previous character was found */
+    if (p > beg && rb_enc_codepoint(p, end, enc) == '\n') { /* only look further back when something precedes the '\n' */
+	p2 = rb_enc_prev_char(beg, p, enc);
+	if (p2 && rb_enc_codepoint(p2, end, enc) == '\r') p = p2; /* NOTE(review): limit passed is end, not p -- confirm intended */
+    }
+    return p - beg;
+}
+
 /*
  *  call-seq:
@@ -4054,13 +4126,9 @@ rb_str_chop_bang(VALUE str)
 {
     if (RSTRING_LEN(str) > 0) {
+	long len;
 	rb_str_modify(str);
-	STR_DEC_LEN(str);
-	if (RSTRING_PTR(str)[RSTRING_LEN(str)] == '\n') {
-	    if (RSTRING_LEN(str) > 0 &&
-		RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
-		STR_DEC_LEN(str);
-	    }
-	}
-	RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
+	len = chopped_length(str);
+	STR_SET_LEN(str, len);
+	RSTRING_PTR(str)[len] = '\0';
 	return str;
     }
@@ -4089,7 +4157,8 @@ static VALUE
 rb_str_chop(VALUE str)
 {
-    str = rb_str_dup(str);
-    rb_str_chop_bang(str);
-    return str;
+    VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str));
+    rb_enc_copy(str2, str);
+    OBJ_INFECT(str2, str);
+    return str2;
 }
 
Index: include/ruby/encoding.h
===================================================================
--- include/ruby/encoding.h	(revision 13339)
+++ include/ruby/encoding.h	(working copy)
@@ -63,5 +63,5 @@ int rb_enc_codelen(int, rb_encoding*);
 
 /* ptr, ptr, encoding -> prev_char */
-#define rb_enc_prev_char(s,p,enc) onigenc_get_prev_char_head(enc,(UChar*)s,(UChar*)p)
+#define rb_enc_prev_char(s,p,enc) ((char *)onigenc_get_prev_char_head((enc),(UChar*)(s),(UChar*)(p))) /* parenthesized so the cast applies to the whole call */
 
 #define rb_enc_isascii(c,enc) ONIGENC_IS_CODE_ASCII(c)


-- 
--- 僕の前にBugはない。
--- 僕の後ろにBugはできる。
    中田 伸悦

In This Thread

Prev Next