ruby-core

I have made some changes to the faulty patch to string.c that I posted  
last week, and the attached version seems to pass the tests (there are  
still a couple of failures, but they are the same ones that occur with the  
vanilla 1.9 I checked out).

I had to make one change to a test of String#split because it now raises a  
different kind of error. Hope that's OK.

I know this is a big patch, but I am running out of time to split it into  
smaller chunks to make it easier to merge. I can try.

The main optimizations are:

- The CODERANGE flags were being cleared unnecessarily in several places.  
This can cause the string to be re-scanned, which is inefficient. I think  
this was the main problem with the "count_words" benchmark.

- Optimized String#split when the parameter is a string. It is now about  
twice as fast.

- Several other minor optimizations, mainly for single-byte character  
strings

Cheers
Mike

On Fri, 12 Sep 2008 02:16:51 +1000, NARUSE, Yui <naruse@airemix.jp> wrote:

> P.S.
> If you split your patch into small atomic patches,
> your patch will be merged rapidly.
>

Attachments (1)

string.patch (24.7 KB, text/x-diff)

Index: string.c
===================================================================
--- string.c	(revision 19073)
+++ string.c	(working copy)
@@ -111,7 +111,7 @@
 
 #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
 
-static int
+static inline int
 single_byte_optimizable(VALUE str)
 {
     rb_encoding *enc = STR_ENC_GET(str);
@@ -1057,6 +1057,17 @@
     ENC_CODERANGE_CLEAR(str);
 }
 
+/* As rb_str_modify(), but don't clear coderange */
+static void
+str_modify(VALUE str)
+{
+    if (!str_independent(str))
+	str_make_independent(str);
+    if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
+	/* Force re-scan later */
+	ENC_CODERANGE_CLEAR(str);
+}
+
 void
 rb_str_associate(VALUE str, VALUE add)
 {
@@ -1281,12 +1292,27 @@
     rb_encoding *enc = STR_ENC_GET(str);
     VALUE str2;
     char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
-    int singlebyte;
 
     if (len < 0) return Qnil;
     if (!RSTRING_LEN(str)) {
 	len = 0;
     }
+    if (single_byte_optimizable(str)) {
+	if (beg > RSTRING_LEN(str)) return Qnil;
+	if (beg < 0) {
+	    beg += RSTRING_LEN(str);
+	    if (beg < 0) return Qnil;
+	}
+	if (beg + len > RSTRING_LEN(str))
+	    len = RSTRING_LEN(str) - beg;
+	if (len <= 0) {
+	    len = 0;
+	    p = 0;
+	}
+	else
+	    p = s + beg;
+	goto sub;
+    }
     if (beg < 0) {
 	if (len > -beg) len = -beg;
 	if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
@@ -1307,7 +1333,6 @@
     else if (beg > 0 && beg > str_strlen(str, enc)) {
 	return Qnil;
     }
-    singlebyte = single_byte_optimizable(str);
     if (len == 0) {
 	p = 0;
     }
@@ -1318,17 +1343,24 @@
         len = str_utf8_offset(p, e, len);
     }
 #endif
-    else if ((p = str_nth(s, e, beg, enc, singlebyte)) == e) {
-	len = 0;
-    }
     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
-        if (len * rb_enc_mbmaxlen(enc) > e - p)
+	int char_sz = rb_enc_mbmaxlen(enc);
+
+	p = s + beg * char_sz;
+	if (p > e) {
+	    p = e;
+	    len = 0;
+	}
+        else if (len * char_sz > e - p)
             len = e - p;
         else
-	    len *= rb_enc_mbmaxlen(enc);
+	    len *= char_sz;
     }
+    else if ((p = str_nth(s, e, beg, enc, 0)) == e) {
+	len = 0;
+    }
     else {
-	len = str_offset(p, e, len, enc, singlebyte);
+	len = str_offset(p, e, len, enc, 0);
     }
   sub:
     if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {
@@ -2065,19 +2097,33 @@
 
     p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
     p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
-    while (p1 < p1end && p2 < p2end) {
-	int c1 = rb_enc_codepoint(p1, p1end, enc);
-	int c2 = rb_enc_codepoint(p2, p2end, enc);
+    if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
+	while (p1 < p1end && p2 < p2end) {
+	    if (*p1 != *p2) {
+		int c1 = rb_enc_toupper(*p1 & 0xff, enc);
+		int c2 = rb_enc_toupper(*p2 & 0xff, enc);
+		if (c1 > c2) return INT2FIX(1);
+		if (c1 < c2) return INT2FIX(-1);
+	    }
+	    p1++;
+	    p2++;
+	}
+    }
+    else {
+	while (p1 < p1end && p2 < p2end) {
+	    int c1 = rb_enc_codepoint(p1, p1end, enc);
+	    int c2 = rb_enc_codepoint(p2, p2end, enc);
 
-	if (c1 != c2) {
-	    c1 = rb_enc_toupper(c1, enc);
-	    c2 = rb_enc_toupper(c2, enc);
-	    if (c1 > c2) return INT2FIX(1);
-	    if (c1 < c2) return INT2FIX(-1);
+	    if (c1 != c2) {
+		c1 = rb_enc_toupper(c1, enc);
+		c2 = rb_enc_toupper(c2, enc);
+		if (c1 > c2) return INT2FIX(1);
+		if (c1 < c2) return INT2FIX(-1);
+	    }
+	    len = rb_enc_codelen(c1, enc);
+	    p1 += len;
+	    p2 += len;
 	}
-	len = rb_enc_codelen(c1, enc);
-	p1 += len;
-	p2 += len;
     }
     if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
     if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
@@ -2850,6 +2896,7 @@
 	RSTRING(str)->as.heap.len = nlen;
     }
     ptr[nlen] = 0;
+    /* Is this necessary? Isn't len always a whole number of chars? */
     ENC_CODERANGE_CLEAR(str);
     return str;
 }
@@ -2857,13 +2904,15 @@
 static void
 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
 {
+    int cr;
+
     if (beg == 0 && RSTRING_LEN(val) == 0) {
 	rb_str_drop_bytes(str, len);
 	OBJ_INFECT(str, val);
 	return;
     }
 
-    rb_str_modify(str);
+    str_modify(str);
     if (len < RSTRING_LEN(val)) {
 	/* expand string */
 	RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
@@ -2885,6 +2934,9 @@
 	RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
     }
     OBJ_INFECT(str, val);
+    cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
+    if (cr != ENC_CODERANGE_BROKEN)
+	ENC_CODERANGE_SET(str, cr);
 }
 
 static void
@@ -2898,7 +2950,6 @@
     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
 
     StringValue(val);
-    rb_str_modify(str);
     enc = rb_enc_check(str, val);
     slen = str_strlen(str, enc);
 
@@ -2915,6 +2966,7 @@
     if (slen < len || slen < beg + len) {
 	len = slen - beg;
     }
+    str_modify(str);
     p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
     if (!p) p = RSTRING_END(str);
     e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
@@ -3114,7 +3166,7 @@
     for (i=0; i<argc; i++) {
 	buf[i] = argv[i];
     }
-    rb_str_modify(str);
+    str_modify(str);
     buf[i] = rb_str_new(0,0);
     result = rb_str_aref_m(argc, buf, str);
     if (!NIL_P(result)) {
@@ -3524,7 +3576,10 @@
     STR_SET_EMBED(str);
     STR_SET_EMBED_LEN(str, 0);
     RSTRING_PTR(str)[0] = 0;
-    ENC_CODERANGE_CLEAR(str);
+    if (rb_enc_asciicompat(STR_ENC_GET(str)))
+	ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
+    else
+	ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
     return str;
 }
 
@@ -3656,22 +3711,15 @@
     if (RSTRING_LEN(str) > 1) {
 	if (single_byte_optimizable(str)) {
 	    char *s, *e, c;
-	    int cr = ENC_CODERANGE(str);
-	    int single = 1;
 
-	    rb_str_modify(str);
+	    str_modify(str);
 	    s = RSTRING_PTR(str);
 	    e = RSTRING_END(str) - 1;
 	    while (s < e) {
 		c = *s;
-		if (*s & 0x80) single = 0;
 		*s++ = *e;
  		*e-- = c;
 	    }
-	    if (cr == ENC_CODERANGE_UNKNOWN && single) {
-		cr = ENC_CODERANGE_7BIT;
-	    }
-	    ENC_CODERANGE_SET(str, cr);
 	}
 	else {
 	    rb_str_shared_replace(str, rb_str_reverse(str));
@@ -4034,23 +4082,34 @@
     rb_encoding *enc;
     char *s, *send;
     int modify = 0;
-    int cr = ENC_CODERANGE(str);
 
-    rb_str_modify(str);
+    str_modify(str);
     enc = STR_ENC_GET(str);
     s = RSTRING_PTR(str); send = RSTRING_END(str);
-    while (s < send) {
-	int c = rb_enc_codepoint(s, send, enc);
+    if (single_byte_optimizable(str)) {
+	while (s < send) {
+	    int c = (*s & 0xff);
 
-	if (rb_enc_islower(c, enc)) {
-	    /* assuming toupper returns codepoint with same size */
-	    rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
-	    modify = 1;
+	    if (rb_enc_islower(c, enc)) {
+		*s = rb_enc_toupper(c , enc);
+		modify = 1;
+	    }
+	    s++;
 	}
-	s += rb_enc_codelen(c, enc);
     }
+    else {
+	while (s < send) {
+	    int c = rb_enc_codepoint(s, send, enc);
 
-    ENC_CODERANGE_SET(str, cr);
+	    if (rb_enc_islower(c, enc)) {
+		/* assuming toupper returns codepoint with same size */
+		rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
+		modify = 1;
+	    }
+	    s += rb_enc_codelen(c, enc);
+	}
+    }
+
     if (modify) return str;
     return Qnil;
 }
@@ -4092,23 +4151,34 @@
     rb_encoding *enc;
     char *s, *send;
     int modify = 0;
-    int cr = ENC_CODERANGE(str);
 
-    rb_str_modify(str);
+    str_modify(str);
     enc = STR_ENC_GET(str);
     s = RSTRING_PTR(str); send = RSTRING_END(str);
-    while (s < send) {
-	int c = rb_enc_codepoint(s, send, enc);
+    if (single_byte_optimizable(str)) {
+	while (s < send) {
+	    int c = (*s & 0xff);
 
-	if (rb_enc_isupper(c, enc)) {
-	    /* assuming toupper returns codepoint with same size */
-	    rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
-	    modify = 1;
+	    if (rb_enc_isupper(c, enc)) {
+		*s = rb_enc_tolower(c , enc);
+		modify = 1;
+	    }
+	    s++;
 	}
-	s += rb_enc_codelen(c, enc);
     }
+    else {
+	while (s < send) {
+	    int c = rb_enc_codepoint(s, send, enc);
 
-    ENC_CODERANGE_SET(str, cr);
+	    if (rb_enc_isupper(c, enc)) {
+		/* assuming tolower returns codepoint with same size */
+		rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
+		modify = 1;
+	    }
+	    s += rb_enc_codelen(c, enc);
+	}
+    }
+
     if (modify) return str;
     return Qnil;
 }
@@ -4156,9 +4226,8 @@
     char *s, *send;
     int modify = 0;
     int c;
-    int cr = ENC_CODERANGE(str);
 
-    rb_str_modify(str);
+    str_modify(str);
     enc = STR_ENC_GET(str);
     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
     s = RSTRING_PTR(str); send = RSTRING_END(str);
@@ -4178,7 +4247,6 @@
 	s += rb_enc_codelen(c, enc);
     }
 
-    ENC_CODERANGE_SET(str, cr);
     if (modify) return str;
     return Qnil;
 }
@@ -4221,28 +4289,43 @@
     rb_encoding *enc;
     char *s, *send;
     int modify = 0;
-    int cr = ENC_CODERANGE(str);
 
-    rb_str_modify(str);
+    str_modify(str);
     enc = STR_ENC_GET(str);
     s = RSTRING_PTR(str); send = RSTRING_END(str);
-    while (s < send) {
-	int c = rb_enc_codepoint(s, send, enc);
+    if (single_byte_optimizable(str)) {
+	while (s < send) {
+	    int c = (*s & 0xff);
 
-	if (rb_enc_isupper(c, enc)) {
-	    /* assuming toupper returns codepoint with same size */
-	    rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
-	    modify = 1;
+	    if (rb_enc_isupper(c, enc)) {
+		*s = rb_enc_tolower(c , enc);
+		modify = 1;
+	    }
+	    else if (rb_enc_islower(c, enc)) {
+		*s = rb_enc_toupper(c , enc);
+		modify = 1;
+	    }
+	    s++;
 	}
-	else if (rb_enc_islower(c, enc)) {
-	    /* assuming toupper returns codepoint with same size */
-	    rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
-	    modify = 1;
+    }
+    else {
+	while (s < send) {
+	    int c = rb_enc_codepoint(s, send, enc);
+
+	    if (rb_enc_isupper(c, enc)) {
+		/* assuming tolower returns codepoint with same size */
+		rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
+		modify = 1;
+	    }
+	    else if (rb_enc_islower(c, enc)) {
+		/* assuming toupper returns codepoint with same size */
+		rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
+		modify = 1;
+	    }
+	    s += rb_enc_mbclen(s, send, enc);
 	}
-	s += rb_enc_codelen(c, enc);
     }
 
-    ENC_CODERANGE_SET(str, cr);
     if (modify) return str;
     return Qnil;
 }
@@ -4321,6 +4404,7 @@
     char *s, *send;
     VALUE hash = 0;
     int singlebyte = single_byte_optimizable(str);
+    int cr;
 
     StringValue(src);
     StringValue(repl);
@@ -4392,7 +4476,8 @@
 	}
     }
 
-    rb_str_modify(str);
+    cr = ENC_CODERANGE(str);
+    str_modify(str);
     s = RSTRING_PTR(str); send = RSTRING_END(str);
     if (sflag) {
 	int clen, tlen, max = RSTRING_LEN(str);
@@ -4515,6 +4600,9 @@
     
     if (modify) {
 	rb_enc_associate(str, enc);
+	cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(repl));
+	if (cr != ENC_CODERANGE_BROKEN)
+	    ENC_CODERANGE_SET(str, cr);
 	return str;
     }
     return Qnil;
@@ -4651,11 +4739,10 @@
     char *s, *send, *t;
     VALUE del = 0, nodel = 0;
     int modify = 0;
-    int i;
-    int cr;
+    int i, singlebyte;
 
     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
-    cr = ENC_CODERANGE(str);
+    singlebyte = single_byte_optimizable(str);
     if (argc < 1) {
 	rb_raise(rb_eArgError, "wrong number of arguments");
     }
@@ -4664,30 +4751,41 @@
 
 	StringValue(s);
 	enc = rb_enc_check(str, s);
+	if (singlebyte && !single_byte_optimizable(s))
+	    singlebyte = 0;
 	tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
     }
 
-    rb_str_modify(str);
+    str_modify(str);
     s = t = RSTRING_PTR(str);
-    if (!s || RSTRING_LEN(str) == 0) return Qnil;
     send = RSTRING_END(str);
-    while (s < send) {
-	int c = rb_enc_codepoint(s, send, enc);
-	int clen = rb_enc_codelen(c, enc);
+    if (singlebyte) {
+	while (s < send) {
+	    if (squeez[*s & 0xff])
+		modify = 1;
+	    else
+		*t++ = *s;
+	    s++;
+	}
+    }
+    else {
+	while (s < send) {
+	    int c = rb_enc_codepoint(s, send, enc);
+	    int clen = rb_enc_codelen(c, enc);
 
-	if (tr_find(c, squeez, del, nodel)) {
-	    modify = 1;
+	    if (tr_find(c, squeez, del, nodel)) {
+		modify = 1;
+	    }
+	    else {
+		if (t != s) rb_enc_mbcput(c, t, enc);
+		t += clen;
+	    }
+	    s += clen;
 	}
-	else {
-	    if (t != s) rb_enc_mbcput(c, t, enc);
-	    t += clen;
-	}
-	s += clen;
     }
     *t = '\0';
     STR_SET_LEN(str, t - RSTRING_PTR(str));
 
-    ENC_CODERANGE_SET(str, cr);
     if (modify) return str;
     return Qnil;
 }
@@ -4732,8 +4830,9 @@
     VALUE del = 0, nodel = 0;
     char *s, *send, *t;
     int save, modify = 0;
-    int i;
+    int i, singlebyte;
 
+    singlebyte = single_byte_optimizable(str);
     if (argc == 0) {
 	enc = STR_ENC_GET(str);
     }
@@ -4743,26 +4842,39 @@
 
 	    StringValue(s);
 	    enc = rb_enc_check(str, s);
+	    if (singlebyte && !single_byte_optimizable(s))
+		singlebyte = 0;
 	    tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
 	}
     }
 
-    rb_str_modify(str);
+    str_modify(str);
     s = t = RSTRING_PTR(str);
     if (!s || RSTRING_LEN(str) == 0) return Qnil;
     send = RSTRING_END(str);
     save = -1;
-    while (s < send) {
-	int c = rb_enc_codepoint(s, send, enc);
-	int clen = rb_enc_codelen(c, enc);
+    if (singlebyte) {
+	while (s < send) {
+	    int c = *s++ & 0xff;
 
-	if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
-	    if (t != s) rb_enc_mbcput(c, t, enc);
-	    save = c;
-	    t += clen;
+	    if (c != save || !squeez[c]) {
+		*t++ = save = c;
+	    }
 	}
-	s += clen;
     }
+    else {
+	while (s < send) {
+	    int c = rb_enc_codepoint(s, send, enc);
+	    int clen = rb_enc_codelen(c, enc);
+
+	    if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
+		if (t != s) rb_enc_mbcput(c, t, enc);
+		save = c;
+		t += clen;
+	    }
+	    s += clen;
+	}
+    }
     *t = '\0';
     if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
 	STR_SET_LEN(str, t - RSTRING_PTR(str));
@@ -4857,33 +4969,68 @@
     char table[256];
     rb_encoding *enc = 0;
     VALUE del = 0, nodel = 0;
+    VALUE arg;
     char *s, *send;
-    int i;
+    int i, singlebyte;
 
     if (argc < 1) {
 	rb_raise(rb_eArgError, "wrong number of arguments");
     }
-    for (i=0; i<argc; i++) {
-	VALUE s = argv[i];
+    arg = argv[0];
 
-	StringValue(s);
-	enc = rb_enc_check(str, s);
-	tr_setup_table(s, table,i==0, &del, &nodel, enc);
-    }
+    StringValue(arg);
+    enc = rb_enc_check(str, arg);
 
     s = RSTRING_PTR(str);
-    if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
     send = RSTRING_END(str);
-    i = 0;
-    while (s < send) {
-	int c = rb_enc_codepoint(s, send, enc);
-	int clen = rb_enc_codelen(c, enc);
+    singlebyte = single_byte_optimizable(str) && single_byte_optimizable(arg);
 
-	if (tr_find(c, table, del, nodel)) {
+    if (argc == 1 && RSTRING_LEN(arg) == 1 && singlebyte) {
+	/* Single byte search */
+	char *argp = RSTRING_PTR(arg);
+	int pos;
+
+	if (!s) return INT2FIX(0); /* Shouldn't happen? */
+	i = 0;
+	while (s < send &&
+	       (pos = rb_memsearch(argp, 1, s, send - s, enc)) >= 0) {
+	    s += pos + 1;
 	    i++;
 	}
-	s += clen;
+	return INT2NUM(i);
     }
+
+    tr_setup_table(arg, table, 1, &del, &nodel, enc);
+    for (i=1; i<argc; i++) {
+	arg = argv[i];
+
+	StringValue(arg);
+	enc = rb_enc_check(str, arg);
+	if (singlebyte && !single_byte_optimizable(arg))
+	   singlebyte = 0;
+	tr_setup_table(arg, table, 0, &del, &nodel, enc);
+    }
+
+    if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
+    i = 0;
+    if (singlebyte) {
+	while (s < send) {
+	    if (table[*s++ & 0xff]) {
+		i++;
+	    }
+	}
+    }
+    else {
+	while (s < send) {
+	    int c = rb_enc_codepoint(s, send, enc);
+	    int clen = rb_enc_codelen(c, enc);
+
+	    if (tr_find(c, table, del, nodel)) {
+        	i++;
+	    }
+	    s += clen;
+	}
+    }
     return INT2NUM(i);
 }
 
@@ -4936,7 +5083,7 @@
     rb_encoding *enc;
     VALUE spat;
     VALUE limit;
-    int awk_split = Qfalse;
+    enum {awk, string, regexp} split_type;
     long beg, end, i = 0;
     int lim = 0;
     VALUE result, tmp;
@@ -4958,37 +5105,41 @@
 	    spat = rb_fs;
 	    goto fs_set;
 	}
-	awk_split = Qtrue;
+	split_type = awk;
     }
     else {
       fs_set:
 	if (TYPE(spat) == T_STRING) {
 	    rb_encoding *enc2 = STR_ENC_GET(spat);
 
-	    if (rb_enc_mbminlen(enc2) == 1) {
+	    split_type = string;
+	    if (RSTRING_LEN(spat) == 0) {
+		/* Special case - split into chars */
+		spat = rb_reg_regcomp(spat);
+		split_type = regexp;
+	    }
+	    else if (rb_enc_mbminlen(enc2) == 1) {
 		if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
-		    awk_split = Qtrue;
+		    split_type = awk;
 		}
 	    }
 	    else {
 		int l;
 		if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
 		    RSTRING_LEN(spat) == l) {
-		    awk_split = Qtrue;
+		    split_type = awk;
 		}
 	    }
-	    if (!awk_split) {
-		spat = rb_reg_regcomp(rb_reg_quote(spat));
-	    }
 	}
 	else {
 	    spat = get_pat(spat, 1);
+	    split_type = regexp;
 	}
     }
 
     result = rb_ary_new();
     beg = 0;
-    if (awk_split) {
+    if (split_type == awk) {
 	char *ptr = RSTRING_PTR(str);
 	char *eptr = RSTRING_END(str);
 	char *bptr = ptr;
@@ -5022,6 +5173,33 @@
 	    }
 	}
     }
+    else if (split_type == string) {
+	char *ptr = RSTRING_PTR(str);
+	char *eptr = RSTRING_END(str);
+	char *sptr = RSTRING_PTR(spat);
+	int slen = RSTRING_LEN(spat);
+
+	if (is_broken_string(str)) {
+	    rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
+	}
+	if (is_broken_string(spat)) {
+	    rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
+	}
+	enc = rb_enc_check(str, spat);
+	while (ptr < eptr &&
+	       (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
+	    /* Check we are at the start of a char */
+	    char *t = rb_enc_right_char_head(ptr, ptr + end, enc);
+	    if (t != ptr + end) {
+		ptr = t;
+		continue;
+	    }
+	    rb_ary_push(result, rb_str_substr(str, ptr - RSTRING_PTR(str), end));
+	    ptr += end + slen;
+	    if (!NIL_P(limit) && lim <= ++i) break;
+	}
+	beg = ptr - RSTRING_PTR(str);
+    }
     else {
 	long start = beg;
 	long idx;
@@ -5323,6 +5501,56 @@
     return str;
 }
 
+/*
+ *  Document-method: codepoints
+ *  call-seq:
+ *     str.codepoints                   => anEnumerator
+ *     str.codepoints {|fixnum| block } => str
+ *  
+ *  Returns an enumerator that gives the <code>Integer</code> ordinal
+ *  of each character in the string, also known as a <i>codepoint</i>
+ *  when applied to Unicode strings. If a block is given, it iterates
+ *  over each character in the string.
+ *     
+ *     "foo\u0635".codepoints.to_a   #=> [102, 111, 111, 1589]
+ */
+
+/*
+ *  Document-method: each_codepoint
+ *  call-seq:
+ *     str.each_codepoint {|fixnum| block }    => str
+ *  
+ *  Passes the <code>Integer</code> ordinal of each character in <i>str</i>,
+ *  also known as a <i>codepoint</i> when applied to Unicode strings to the
+ *  given block.
+ *     
+ *     "hello\u0639".each_codepoint {|c| print c, ' ' }
+ *     
+ *  <em>produces:</em>
+ *     
+ *     104 101 108 108 111 1593
+ */
+
+static VALUE
+rb_str_each_codepoint(VALUE str)
+{
+    int i, len, n;
+    const char *ptr, *end;
+    rb_encoding *enc;
+
+    if (single_byte_optimizable(str)) return rb_str_each_byte(str);
+    RETURN_ENUMERATOR(str, 0, 0);
+    ptr = RSTRING_PTR(str);
+    len = RSTRING_LEN(str);
+    end = RSTRING_END(str);
+    enc = rb_enc_get(str);
+    for (i = 0; i < len; i += n) {
+	n = rb_enc_mbclen(ptr + i, ptr + len, enc);
+	rb_yield(INT2FIX(rb_enc_codepoint(ptr + i, end, enc)));
+    }
+    return str;
+}
+
 static long
 chopped_length(VALUE str)
 {
@@ -5355,7 +5583,8 @@
 {
     if (RSTRING_LEN(str) > 0) {
 	long len;
-	rb_str_modify(str);
+
+	str_modify(str);
 	len = chopped_length(str);
 	STR_SET_LEN(str, len);
 	RSTRING_PTR(str)[len] = '\0';
@@ -5417,7 +5646,7 @@
 	rs = rb_rs;
 	if (rs == rb_default_rs) {
 	  smart_chomp:
-	    rb_str_modify(str);
+	    str_modify(str);
 	    enc = rb_enc_get(str);
 	    if (rb_enc_mbminlen(enc) > 1) {
 		pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), enc);
@@ -5469,7 +5698,7 @@
 		len--;
 	}
 	if (len < RSTRING_LEN(str)) {
-	    rb_str_modify(str);
+	    str_modify(str);
 	    STR_SET_LEN(str, len);
 	    RSTRING_PTR(str)[len] = '\0';
 	    return str;
@@ -5491,7 +5720,7 @@
 	 memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
 	if (rb_enc_left_char_head(p, pp, enc) != pp)
 	    return Qnil;
-	rb_str_modify(str);
+	str_modify(str);
 	STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
 	RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
 	return str;
@@ -5545,21 +5774,28 @@
     rb_encoding *enc;
     char *s, *t, *e;
 
-    rb_str_modify(str);
-    enc = STR_ENC_GET(str);
     s = RSTRING_PTR(str);
     if (!s || RSTRING_LEN(str) == 0) return Qnil;
     e = t = RSTRING_END(str);
+    enc = STR_ENC_GET(str);
     /* remove spaces at head */
-    while (s < e) {
-	int cc = rb_enc_codepoint(s, e, enc);
+    if (single_byte_optimizable(str)) {
+	while (s < e) {
+	    if (!rb_enc_isspace(*(unsigned char *)s, enc)) break;
+	    s++;
+	}
+    }
+    else {
+	while (s < e) {
+	    int cc = rb_enc_codepoint(s, e, enc);
 	
-	if (!rb_enc_isspace(cc, enc)) break;
-	s += rb_enc_codelen(cc, enc);
+	    if (!rb_enc_isspace(cc, enc)) break;
+	    s += rb_enc_codelen(cc, enc);
+	}
     }
 
     if (s > RSTRING_PTR(str)) {
-	rb_str_modify(str);
+	str_modify(str);
 	STR_SET_LEN(str, t-s);
 	memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
 	RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
@@ -5606,30 +5842,31 @@
 {
     rb_encoding *enc;
     char *s, *t, *e;
-    int space_seen = Qfalse;
 
-    rb_str_modify(str);
     enc = STR_ENC_GET(str);
     s = RSTRING_PTR(str);
     if (!s || RSTRING_LEN(str) == 0) return Qnil;
     t = e = RSTRING_END(str);
-    while (s < e) {
-	int cc = rb_enc_codepoint(s, e, enc);
+    if (single_byte_optimizable(str)) {
+        while (t > s) {
+	    if (!rb_enc_isspace(*(unsigned char *)(t-1), enc)) break;
+	    t--;
+	}
+    }
+    else {
+	char *tp;
 
-	if (!cc || rb_enc_isspace(cc, enc)) {
-	    if (!space_seen) t = s;
-	    space_seen = Qtrue;
+        while ((tp = rb_enc_prev_char(s, t, enc)) != NULL) {
+	    if (!rb_enc_isspace(rb_enc_codepoint(tp, e, enc), enc)) break;
+	    t = tp;
 	}
-	else {
-	    space_seen = Qfalse;
-	}
-	s += rb_enc_codelen(cc, enc);
     }
-    if (!space_seen) t = s;
     if (t < e) {
-	rb_str_modify(str);
-	STR_SET_LEN(str, t-RSTRING_PTR(str));
-	RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
+	int len = t-RSTRING_PTR(str);
+
+	str_modify(str);
+	STR_SET_LEN(str, len);
+	RSTRING_PTR(str)[len] = '\0';
 	return str;
     }
     return Qnil;
@@ -6011,7 +6248,7 @@
     const char *f = " ";
     long n, llen, rlen;
     volatile VALUE pad;
-    int singlebyte = 1;
+    int singlebyte = 1, cr;
 
     rb_scan_args(argc, argv, "11", &w, &pad);
     enc = STR_ENC_GET(str);
@@ -6032,6 +6269,7 @@
     n = width - len;
     llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
     rlen = n - llen;
+    cr = ENC_CODERANGE(str);
     res = rb_str_new5(str, 0, RSTRING_LEN(str)+n*flen/fclen+2);
     p = RSTRING_PTR(res);
     while (llen) {
@@ -6077,6 +6315,10 @@
     OBJ_INFECT(res, str);
     if (!NIL_P(pad)) OBJ_INFECT(res, pad);
     rb_enc_associate(res, enc);
+    if (argc == 2)
+	cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
+    if (cr != ENC_CODERANGE_BROKEN)
+	ENC_CODERANGE_SET(res, cr);
     return res;
 }
 
@@ -6677,6 +6919,7 @@
     rb_define_method(rb_cString, "lines", rb_str_each_line, -1);
     rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0);
     rb_define_method(rb_cString, "chars", rb_str_each_char, 0);
+    rb_define_method(rb_cString, "codepoints", rb_str_each_codepoint, 0);
     rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
     rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
     rb_define_method(rb_cString, "concat", rb_str_concat, 1);
@@ -6726,6 +6969,7 @@
     rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
     rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
     rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
+    rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
 
     rb_define_method(rb_cString, "sum", rb_str_sum, -1);
 
Index: test/ruby/test_m17n_comb.rb
===================================================================
--- test/ruby/test_m17n_comb.rb	(revision 19073)
+++ test/ruby/test_m17n_comb.rb	(working copy)
@@ -1158,11 +1158,11 @@
   def test_str_split
     combination(STRINGS, STRINGS) {|s1, s2|
       if !s2.valid_encoding?
-        assert_raise(RegexpError) { s1.split(s2) }
+        assert_raise(ArgumentError, RegexpError) { s1.split(s2) }
         next
       end
       if !s1.ascii_only? && !s2.ascii_only? && s1.encoding != s2.encoding
-        assert_raise(ArgumentError) { s1.split(s2) }
+        assert_raise(ArgumentError, EncodingCompatibilityError) { s1.split(s2) }
         next
       end
       if !s1.valid_encoding?

Thread

Prev Next

In This Thread

Prev Next