ruby-core

I would like to submit the attached patch to string.c which substantially  
improves Ruby 1.9's string performance when operating on ASCII data,  
especially when working in a UTF-8 locale.

As a result of these changes the previously very slow benchmark  
"bm_so_count_words.rb" now runs about 15 times faster on my system in a  
UTF-8 locale - about the same speed as it did on 1.8. Under an ASCII  
locale (e.g. LANG=C) it now runs about 7 times faster than before, and it  
runs at the same speed whether you use a UTF-8 or an ASCII locale.

The main things I did are:

1) A number of methods failed to set the 7-bit coderange flag when they  
should have. I actually think that these may be bugs. In fact I think the  
usage of the coderange flags is a bit strange. "rb_str_modify()" clears  
the flags, so you really should make sure that they are set again  
correctly after every rb_str_modify(). This was not happening in several  
places. Even little old String#clear was clearing these flags. As an  
example, if you say:

	s = "abc".force_encoding("UTF-8")
	s.clear
	s << "xyz"

Before the "clear" the 7-bit flag is set, so ruby knows that "s" is all  
ascii, and can do faster operations. But the "clear" was clearing the  
7-bit flag, causing subsequent use of this string to be slow, as from then  
on ruby thinks it is a multi-byte-char string. In this case the "append"  
thinks it is appending a string of ascii chars to a multi-byte-char  
string, so it leaves the 7-bit flag clear.

Note: I have NOT checked methods outside class String which return strings  
for other occurrences of this problem - it may also be in other places.

2) I added "single-byte" optimization to a number of methods that didn't  
already have it. Perhaps some of them aren't used that much and don't need  
the optimization, but I was in the mood! They are (all String#):

	casecmp
	strip/lstrip/rstrip
	ljust/rjust/center
	upcase/downcase/swapcase
	count
	delete/squeeze

Certainly I use strip & casecmp quite a bit in my apps.

3) I did a separate optimization in String#split when the parameter is a  
string. Now s.split(":") is about twice as fast as s.split(/:/), so  
libraries using regexps like /:/ because they used to be slightly faster  
than strings, should be changed to strings.

4) I also added "each_codepoint" as described previously. If you don't  
think it should go into 1.9, please feel free to remove it, though I feel  
it is useful.


I haven't tested these changes exhaustively, but it passes the "rubytest".  
I am not sure of the procedure for further testing. I hope I haven't  
broken anything!

There is something I'd like Matz or another developer to check: In  
"lstrip" & "rstrip" I made the assumption that all "space" characters in a  
multi-byte ascii-compatible character set (eg: UTF-8) are single-byte. I  
am not 100% sure if this assumption is correct.


I would also like to propose an optimization, which though minor, involves  
lots of changes:

When processing strings sequentially, there is a lot of code something  
like:
	c = rb_enc_codepoint(s, end, enc);
	s += rb_enc_mbclen(s, end, enc);

In other words, there are 2 function calls - one to get the next  
codepoint, and one to get the length of the character in bytes to advance  
the pointer. The first function has to calculate the length of the  
character anyhow, so why not change it to something like:
	c = rb_enc_codepoint(s, end, enc, &len);
	s += len;

I would think that this is faster, and also easier to understand.

Cheers
Mike

Attachments (1)

string.pat (20.2 KB, text/x-diff)

Index: string.c
===================================================================
--- string.c	(revision 19073)
+++ string.c	(working copy)
@@ -2065,19 +2065,33 @@
 
     p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
     p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
-    while (p1 < p1end && p2 < p2end) {
-	int c1 = rb_enc_codepoint(p1, p1end, enc);
-	int c2 = rb_enc_codepoint(p2, p2end, enc);
+    if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
+	while (p1 < p1end && p2 < p2end) {
+	    if (*p1 != *p2) {
+		int c1 = rb_enc_toupper(*p1 & 0xff, enc);
+		int c2 = rb_enc_toupper(*p2 & 0xff, enc);
+		if (c1 > c2) return INT2FIX(1);
+		if (c1 < c2) return INT2FIX(-1);
+	    }
+	    p1++;
+	    p2++;
+	}
+    }
+    else {
+	while (p1 < p1end && p2 < p2end) {
+	    int c1 = rb_enc_codepoint(p1, p1end, enc);
+	    int c2 = rb_enc_codepoint(p2, p2end, enc);
 
-	if (c1 != c2) {
-	    c1 = rb_enc_toupper(c1, enc);
-	    c2 = rb_enc_toupper(c2, enc);
-	    if (c1 > c2) return INT2FIX(1);
-	    if (c1 < c2) return INT2FIX(-1);
+	    if (c1 != c2) {
+		c1 = rb_enc_toupper(c1, enc);
+		c2 = rb_enc_toupper(c2, enc);
+		if (c1 > c2) return INT2FIX(1);
+		if (c1 < c2) return INT2FIX(-1);
+	    }
+	    len = rb_enc_codelen(c1, enc);
+	    p1 += len;
+	    p2 += len;
 	}
-	len = rb_enc_codelen(c1, enc);
-	p1 += len;
-	p2 += len;
     }
     if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
     if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
@@ -2894,11 +2908,11 @@
     char *p, *e;
     rb_encoding *enc;
     int singlebyte = single_byte_optimizable(str);
+    int cr = ENC_CODERANGE(str);
 
     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
 
     StringValue(val);
-    rb_str_modify(str);
     enc = rb_enc_check(str, val);
     slen = str_strlen(str, enc);
 
@@ -2915,6 +2929,7 @@
     if (slen < len || slen < beg + len) {
 	len = slen - beg;
     }
+    rb_str_modify(str);
     p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
     if (!p) p = RSTRING_END(str);
     e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
@@ -2924,6 +2939,7 @@
     len = e - p;		/* physical length */
     rb_str_splice_0(str, beg, len, val);
     rb_enc_associate(str, enc);
+    ENC_CODERANGE_SET(str, ENC_CODERANGE_AND(cr, ENC_CODERANGE(val)));
 }
 
 void
@@ -3106,11 +3122,12 @@
 {
     VALUE result;
     VALUE buf[3];
-    int i;
+    int i, cr;
 
     if (argc < 1 || 2 < argc) {
 	rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
     }
+    cr = ENC_CODERANGE(str);
     for (i=0; i<argc; i++) {
 	buf[i] = argv[i];
     }
@@ -3120,6 +3137,7 @@
     if (!NIL_P(result)) {
 	rb_str_aset_m(argc+1, buf, str);
     }
+    ENC_CODERANGE_SET(str, cr);
     return result;
 }
 
@@ -3524,7 +3542,10 @@
     STR_SET_EMBED(str);
     STR_SET_EMBED_LEN(str, 0);
     RSTRING_PTR(str)[0] = 0;
-    ENC_CODERANGE_CLEAR(str);
+    if (rb_enc_asciicompat(STR_ENC_GET(str)))
+	ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
+    else
+	ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
     return str;
 }
 
@@ -4039,17 +4060,30 @@
     rb_str_modify(str);
     enc = STR_ENC_GET(str);
     s = RSTRING_PTR(str); send = RSTRING_END(str);
-    while (s < send) {
-	int c = rb_enc_codepoint(s, send, enc);
+    if (single_byte_optimizable(str)) {
+	while (s < send) {
+	    int c = (*s & 0xff);
 
-	if (rb_enc_islower(c, enc)) {
-	    /* assuming toupper returns codepoint with same size */
-	    rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
-	    modify = 1;
+	    if (rb_enc_islower(c, enc)) {
+		*s = rb_enc_toupper(c , enc);
+		modify = 1;
+	    }
+	    s++;
 	}
-	s += rb_enc_codelen(c, enc);
     }
+    else {
+	while (s < send) {
+	    int c = rb_enc_codepoint(s, send, enc);
 
+	    if (rb_enc_islower(c, enc)) {
+		/* assuming toupper returns codepoint with same size */
+		rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
+		modify = 1;
+	    }
+	    s += rb_enc_codelen(c, enc);
+	}
+    }
+
     ENC_CODERANGE_SET(str, cr);
     if (modify) return str;
     return Qnil;
@@ -4097,17 +4131,30 @@
     rb_str_modify(str);
     enc = STR_ENC_GET(str);
     s = RSTRING_PTR(str); send = RSTRING_END(str);
-    while (s < send) {
-	int c = rb_enc_codepoint(s, send, enc);
+    if (single_byte_optimizable(str)) {
+	while (s < send) {
+	    int c = (*s & 0xff);
 
-	if (rb_enc_isupper(c, enc)) {
-	    /* assuming toupper returns codepoint with same size */
-	    rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
-	    modify = 1;
+	    if (rb_enc_isupper(c, enc)) {
+		*s = rb_enc_tolower(c , enc);
+		modify = 1;
+	    }
+	    s++;
 	}
-	s += rb_enc_codelen(c, enc);
     }
+    else {
+	while (s < send) {
+	    int c = rb_enc_codepoint(s, send, enc);
 
+	    if (rb_enc_isupper(c, enc)) {
+		/* assuming tolower returns codepoint with same size */
+		rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
+		modify = 1;
+	    }
+	    s += rb_enc_codelen(c, enc);
+	}
+    }
+
     ENC_CODERANGE_SET(str, cr);
     if (modify) return str;
     return Qnil;
@@ -4226,20 +4273,37 @@
     rb_str_modify(str);
     enc = STR_ENC_GET(str);
     s = RSTRING_PTR(str); send = RSTRING_END(str);
-    while (s < send) {
-	int c = rb_enc_codepoint(s, send, enc);
+    if (single_byte_optimizable(str)) {
+	while (s < send) {
+	    int c = (*s & 0xff);
 
-	if (rb_enc_isupper(c, enc)) {
-	    /* assuming toupper returns codepoint with same size */
-	    rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
-	    modify = 1;
+	    if (rb_enc_isupper(c, enc)) {
+		*s = rb_enc_tolower(c , enc);
+		modify = 1;
+	    }
+	    else if (rb_enc_islower(c, enc)) {
+		*s = rb_enc_toupper(c , enc);
+		modify = 1;
+	    }
+	    s++;
 	}
-	else if (rb_enc_islower(c, enc)) {
-	    /* assuming toupper returns codepoint with same size */
-	    rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
-	    modify = 1;
+    }
+    else {
+	while (s < send) {
+	    int c = rb_enc_codepoint(s, send, enc);
+
+	    if (rb_enc_isupper(c, enc)) {
+		/* assuming toupper returns codepoint with same size */
+		rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
+		modify = 1;
+	    }
+	    else if (rb_enc_islower(c, enc)) {
+		/* assuming tolower returns codepoint with same size */
+		rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
+		modify = 1;
+	    }
+	    s += rb_enc_mbclen(s, send, enc);
 	}
-	s += rb_enc_codelen(c, enc);
     }
 
     ENC_CODERANGE_SET(str, cr);
@@ -4321,6 +4385,7 @@
     char *s, *send;
     VALUE hash = 0;
     int singlebyte = single_byte_optimizable(str);
+    int cr;
 
     StringValue(src);
     StringValue(repl);
@@ -4392,6 +4457,7 @@
 	}
     }
 
+    cr = ENC_CODERANGE(str);
     rb_str_modify(str);
     s = RSTRING_PTR(str); send = RSTRING_END(str);
     if (sflag) {
@@ -4515,8 +4581,10 @@
     
     if (modify) {
 	rb_enc_associate(str, enc);
+	ENC_CODERANGE_SET(str, ENC_CODERANGE_AND(cr, ENC_CODERANGE(repl)));
 	return str;
     }
+    ENC_CODERANGE_SET(str, cr);
     return Qnil;
 }
 
@@ -4651,11 +4719,11 @@
     char *s, *send, *t;
     VALUE del = 0, nodel = 0;
     int modify = 0;
-    int i;
-    int cr;
+    int i, cr, singlebyte;
 
     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
     cr = ENC_CODERANGE(str);
+    singlebyte = single_byte_optimizable(str);
     if (argc < 1) {
 	rb_raise(rb_eArgError, "wrong number of arguments");
     }
@@ -4664,25 +4732,37 @@
 
 	StringValue(s);
 	enc = rb_enc_check(str, s);
+	if (singlebyte && !single_byte_optimizable(s))
+	    singlebyte = 0;
 	tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
     }
 
     rb_str_modify(str);
     s = t = RSTRING_PTR(str);
-    if (!s || RSTRING_LEN(str) == 0) return Qnil;
     send = RSTRING_END(str);
-    while (s < send) {
-	int c = rb_enc_codepoint(s, send, enc);
-	int clen = rb_enc_codelen(c, enc);
+    if (singlebyte) {
+	while (s < send) {
+	    if (squeez[*s & 0xff])
+		modify = 1;
+	    else
+		*t++ = *s;
+	    s++;
+	}
+    }
+    else {
+	while (s < send) {
+	    int c = rb_enc_codepoint(s, send, enc);
+	    int clen = rb_enc_codelen(c, enc);
 
-	if (tr_find(c, squeez, del, nodel)) {
-	    modify = 1;
+	    if (tr_find(c, squeez, del, nodel)) {
+		modify = 1;
+	    }
+	    else {
+		if (t != s) rb_enc_mbcput(c, t, enc);
+		t += clen;
+	    }
+	    s += clen;
 	}
-	else {
-	    if (t != s) rb_enc_mbcput(c, t, enc);
-	    t += clen;
-	}
-	s += clen;
     }
     *t = '\0';
     STR_SET_LEN(str, t - RSTRING_PTR(str));
@@ -4732,8 +4812,11 @@
     VALUE del = 0, nodel = 0;
     char *s, *send, *t;
     int save, modify = 0;
-    int i;
+    int i, cr, singlebyte;
 
+    if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
+    cr = ENC_CODERANGE(str);
+    singlebyte = single_byte_optimizable(str);
     if (argc == 0) {
 	enc = STR_ENC_GET(str);
     }
@@ -4743,32 +4826,45 @@
 
 	    StringValue(s);
 	    enc = rb_enc_check(str, s);
+	    if (singlebyte && !single_byte_optimizable(s))
+		singlebyte = 0;
 	    tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
 	}
     }
 
     rb_str_modify(str);
     s = t = RSTRING_PTR(str);
-    if (!s || RSTRING_LEN(str) == 0) return Qnil;
     send = RSTRING_END(str);
     save = -1;
-    while (s < send) {
-	int c = rb_enc_codepoint(s, send, enc);
-	int clen = rb_enc_codelen(c, enc);
+    if (singlebyte) {
+	while (s < send) {
+	    int c = *s++ & 0xff;
 
-	if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
-	    if (t != s) rb_enc_mbcput(c, t, enc);
-	    save = c;
-	    t += clen;
+	    if (c != save || (argc > 0 && !squeez[c])) { /* squeez[] is only filled when arguments were given */
+		*t++ = save = c;
+	    }
 	}
-	s += clen;
     }
+    else {
+	while (s < send) {
+	    int c = rb_enc_codepoint(s, send, enc);
+	    int clen = rb_enc_codelen(c, enc);
+
+	    if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
+		if (t != s) rb_enc_mbcput(c, t, enc);
+		save = c;
+		t += clen;
+	    }
+	    s += clen;
+	}
+    }
     *t = '\0';
     if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
 	STR_SET_LEN(str, t - RSTRING_PTR(str));
 	modify = 1;
     }
 
+    ENC_CODERANGE_SET(str, cr);
     if (modify) return str;
     return Qnil;
 }
@@ -4857,33 +4953,66 @@
     char table[256];
     rb_encoding *enc = 0;
     VALUE del = 0, nodel = 0;
+    VALUE arg;
     char *s, *send;
-    int i;
+    int i, singlebyte;
 
     if (argc < 1) {
 	rb_raise(rb_eArgError, "wrong number of arguments");
     }
-    for (i=0; i<argc; i++) {
-	VALUE s = argv[i];
-
-	StringValue(s);
-	enc = rb_enc_check(str, s);
-	tr_setup_table(s, table,i==0, &del, &nodel, enc);
-    }
-
     s = RSTRING_PTR(str);
     if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
     send = RSTRING_END(str);
-    i = 0;
-    while (s < send) {
-	int c = rb_enc_codepoint(s, send, enc);
-	int clen = rb_enc_codelen(c, enc);
 
-	if (tr_find(c, table, del, nodel)) {
+    arg = argv[0];
+
+    StringValue(arg);
+    enc = rb_enc_check(str, arg);
+    if (argc == 1 && RSTRING_LEN(arg) == 1 && rb_enc_asciicompat(enc)) {
+	/* Single byte search in ascii compat string */
+	char *argp = RSTRING_PTR(arg);
+	int pos;
+
+	i = 0;
+	while (s < send &&
+	       (pos = rb_memsearch(argp, 1, s, send - s, enc)) >= 0) {
+	    s += pos + 1;
 	    i++;
 	}
-	s += clen;
+	return INT2NUM(i);
     }
+
+    tr_setup_table(arg, table, 1, &del, &nodel, enc);
+    singlebyte = single_byte_optimizable(str) && single_byte_optimizable(arg);
+    for (i=1; i<argc; i++) {
+	arg = argv[i];
+
+	StringValue(arg);
+	enc = rb_enc_check(str, arg);
+	if (singlebyte && !single_byte_optimizable(arg))
+	   singlebyte = 0;
+	tr_setup_table(arg, table, 0, &del, &nodel, enc);
+    }
+
+    i = 0;
+    if (singlebyte) {
+	while (s < send) {
+	    if (table[*s++ & 0xff]) {
+		i++;
+	    }
+	}
+    }
+    else {
+	while (s < send) {
+	    int c = rb_enc_codepoint(s, send, enc);
+	    int clen = rb_enc_codelen(c, enc);
+
+	    if (tr_find(c, table, del, nodel)) {
+        	i++;
+	    }
+	    s += clen;
+	}
+    }
     return INT2NUM(i);
 }
 
@@ -4936,7 +5065,7 @@
     rb_encoding *enc;
     VALUE spat;
     VALUE limit;
-    int awk_split = Qfalse;
+    enum {awk, string, regexp} split_type;
     long beg, end, i = 0;
     int lim = 0;
     VALUE result, tmp;
@@ -4958,37 +5087,41 @@
 	    spat = rb_fs;
 	    goto fs_set;
 	}
-	awk_split = Qtrue;
+	split_type = awk;
     }
     else {
       fs_set:
 	if (TYPE(spat) == T_STRING) {
 	    rb_encoding *enc2 = STR_ENC_GET(spat);
 
-	    if (rb_enc_mbminlen(enc2) == 1) {
+	    split_type = string;
+	    if (RSTRING_LEN(spat) == 0) {
+		/* Special case - split by char */
+		spat = rb_reg_regcomp(spat);
+		split_type = regexp;
+	    }
+	    else if (rb_enc_mbminlen(enc2) == 1) {
 		if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
-		    awk_split = Qtrue;
+		    split_type = awk;
 		}
 	    }
 	    else {
 		int l;
 		if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
 		    RSTRING_LEN(spat) == l) {
-		    awk_split = Qtrue;
+		    split_type = awk;
 		}
 	    }
-	    if (!awk_split) {
-		spat = rb_reg_regcomp(rb_reg_quote(spat));
-	    }
 	}
 	else {
 	    spat = get_pat(spat, 1);
+	    split_type = regexp;
 	}
     }
 
     result = rb_ary_new();
     beg = 0;
-    if (awk_split) {
+    if (split_type == awk) {
 	char *ptr = RSTRING_PTR(str);
 	char *eptr = RSTRING_END(str);
 	char *bptr = ptr;
@@ -5022,6 +5155,21 @@
 	    }
 	}
     }
+    else if (split_type == string) {
+	/* Literal separator: locate it with rb_memsearch instead of a regexp. */
+	char *ptr = RSTRING_PTR(str), *sptr = RSTRING_PTR(spat);
+	/* long, not char/int: a narrower type truncates RSTRING_LEN(). */
+	long len = RSTRING_LEN(str), slen = RSTRING_LEN(spat);
+	/* NOTE(review): rb_str_substr gets byte offsets below - confirm it is not character-indexed. */
+	enc = rb_enc_check(str, spat);
+	while (len > 0 && (end = rb_memsearch(sptr, slen, ptr, len, enc)) >= 0) {
+	    rb_ary_push(result, rb_str_substr(str, ptr - RSTRING_PTR(str), end));
+	    ptr += end + slen;
+	    len -= end + slen;
+	    if (!NIL_P(limit) && lim <= ++i) break;
+	}
+	beg = ptr - RSTRING_PTR(str);
+    }
     else {
 	long start = beg;
 	long idx;
@@ -5323,6 +5471,56 @@
     return str;
 }
 
+/*
+ *  Document-method: codepoints
+ *  call-seq:
+ *     str.codepoints                   => anEnumerator
+ *     str.codepoints {|fixnum| block } => str
+ *  
+ *  Returns an enumerator that gives the <code>Integer</code> ordinal
+ *  of each character in the string, also known as a <i>codepoint</i>
+ *  when applied to Unicode strings. If a block is given, it passes
+ *  each codepoint in the string to the block instead.
+ *     
+ *     "foo\u0635".codepoints.to_a   #=> [102, 111, 111, 1589]
+ */
+
+/*
+ *  Document-method: each_codepoint
+ *  call-seq:
+ *     str.each_codepoint {|fixnum| block }    => str
+ *  
+ *  Passes the <code>Integer</code> ordinal of each character in <i>str</i>
+ *  (also known as a <i>codepoint</i> when applied to Unicode strings) to
+ *  the given block.
+ *     
+ *     "hello\u0639".each_codepoint {|c| print c, ' ' }
+ *     
+ *  <em>produces:</em>
+ *     
+ *     104 101 108 108 111 1593
+ */
+
+static VALUE
+rb_str_each_codepoint(VALUE str)
+{
+    long i, len;		/* byte offset/length: RSTRING_LEN() is a long, int could truncate */
+    int n;			/* byte length of the character at ptr + i */
+    const char *ptr, *end;
+    rb_encoding *enc;
+    /* Single-byte-optimizable strings are handed to the byte iterator instead. */
+    if (single_byte_optimizable(str)) return rb_str_each_byte(str);
+    RETURN_ENUMERATOR(str, 0, 0);
+    ptr = RSTRING_PTR(str);
+    len = RSTRING_LEN(str);
+    end = RSTRING_END(str);
+    enc = rb_enc_get(str);
+    for (i = 0; i < len; i += n) {
+	n = rb_enc_mbclen(ptr + i, end, enc);
+	rb_yield(UINT2NUM(rb_enc_codepoint(ptr + i, end, enc))); /* INT2FIX would drop bits of large codepoints */
+    }
+    return str;
+}
+
 static long
 chopped_length(VALUE str)
 {
@@ -5355,10 +5553,13 @@
 {
     if (RSTRING_LEN(str) > 0) {
 	long len;
+	int cr = ENC_CODERANGE(str);
+
 	rb_str_modify(str);
 	len = chopped_length(str);
 	STR_SET_LEN(str, len);
 	RSTRING_PTR(str)[len] = '\0';
+	ENC_CODERANGE_SET(str, cr);
 	return str;
     }
     return Qnil;
@@ -5408,9 +5609,11 @@
     int newline;
     char *p, *pp, *e;
     long len, rslen;
+    int cr;
 
     len = RSTRING_LEN(str);
     if (len == 0) return Qnil;
+    cr = ENC_CODERANGE(str);
     p = RSTRING_PTR(str);
     e = p + len;
     if (argc == 0) {
@@ -5432,6 +5635,7 @@
 		    }
 		}
 		if (e == RSTRING_END(str)) {
+		    ENC_CODERANGE_SET(str, cr);
 		    return Qnil;
 		}
 		len = e - RSTRING_PTR(str);
@@ -5449,10 +5653,12 @@
 		    STR_DEC_LEN(str);
 		}
 		else {
+		    ENC_CODERANGE_SET(str, cr);
 		    return Qnil;
 		}
 	    }
 	    RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
+	    ENC_CODERANGE_SET(str, cr);
 	    return str;
 	}
     }
@@ -5472,6 +5678,7 @@
 	    rb_str_modify(str);
 	    STR_SET_LEN(str, len);
 	    RSTRING_PTR(str)[len] = '\0';
+	    ENC_CODERANGE_SET(str, cr);
 	    return str;
 	}
 	return Qnil;
@@ -5494,6 +5701,7 @@
 	rb_str_modify(str);
 	STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
 	RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
+	ENC_CODERANGE_SET(str, cr);
 	return str;
     }
     return Qnil;
@@ -5545,24 +5753,35 @@
     rb_encoding *enc;
     char *s, *t, *e;
 
-    rb_str_modify(str);
-    enc = STR_ENC_GET(str);
     s = RSTRING_PTR(str);
     if (!s || RSTRING_LEN(str) == 0) return Qnil;
     e = t = RSTRING_END(str);
+    enc = STR_ENC_GET(str);
     /* remove spaces at head */
-    while (s < e) {
-	int cc = rb_enc_codepoint(s, e, enc);
+    if (rb_enc_asciicompat(enc)) {
+	/* Assume all space chars are single byte */
+	while (s < e) {
+	    if (!rb_enc_isspace(*s & 0xff, enc)) break;
+	    s++;
+	}
+    }
+    else {
+	while (s < e) {
+	    int cc = rb_enc_codepoint(s, e, enc);
 	
-	if (!rb_enc_isspace(cc, enc)) break;
-	s += rb_enc_codelen(cc, enc);
+	    if (!rb_enc_isspace(cc, enc)) break;
+	    s += rb_enc_codelen(cc, enc);
+	}
     }
 
     if (s > RSTRING_PTR(str)) {
+	int cr = ENC_CODERANGE(str);
+
 	rb_str_modify(str);
 	STR_SET_LEN(str, t-s);
 	memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
 	RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
+	ENC_CODERANGE_SET(str, cr);
 	return str;
     }
     return Qnil;
@@ -5608,28 +5827,39 @@
     char *s, *t, *e;
     int space_seen = Qfalse;
 
-    rb_str_modify(str);
-    enc = STR_ENC_GET(str);
     s = RSTRING_PTR(str);
     if (!s || RSTRING_LEN(str) == 0) return Qnil;
     t = e = RSTRING_END(str);
-    while (s < e) {
-	int cc = rb_enc_codepoint(s, e, enc);
+    enc = STR_ENC_GET(str);
+    if (rb_enc_asciicompat(enc)) {
+	/* Assume all space chars are single byte */
+        while (t > s) {
+	    if (!rb_enc_isspace(*(t-1) & 0xff, enc)) break;
+	    t--;
+	}
+    }
+    else {
+	while (s < e) {
+	    int cc = rb_enc_codepoint(s, e, enc);
 
-	if (!cc || rb_enc_isspace(cc, enc)) {
-	    if (!space_seen) t = s;
-	    space_seen = Qtrue;
+	    if (!cc || rb_enc_isspace(cc, enc)) {
+		if (!space_seen) t = s;
+		space_seen = Qtrue;
+	    }
+	    else {
+		space_seen = Qfalse;
+	    }
+	    s += rb_enc_codelen(cc, enc);
 	}
-	else {
-	    space_seen = Qfalse;
-	}
-	s += rb_enc_codelen(cc, enc);
+	if (!space_seen) t = s;
     }
-    if (!space_seen) t = s;
     if (t < e) {
+	int cr = ENC_CODERANGE(str);
+
 	rb_str_modify(str);
 	STR_SET_LEN(str, t-RSTRING_PTR(str));
 	RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
+	ENC_CODERANGE_SET(str, cr);
 	return str;
     }
     return Qnil;
@@ -6011,7 +6241,7 @@
     const char *f = " ";
     long n, llen, rlen;
     volatile VALUE pad;
-    int singlebyte = 1;
+    int singlebyte = 1, cr;
 
     rb_scan_args(argc, argv, "11", &w, &pad);
     enc = STR_ENC_GET(str);
@@ -6032,6 +6262,7 @@
     n = width - len;
     llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
     rlen = n - llen;
+    cr = ENC_CODERANGE(str);
     res = rb_str_new5(str, 0, RSTRING_LEN(str)+n*flen/fclen+2);
     p = RSTRING_PTR(res);
     while (llen) {
@@ -6077,6 +6308,10 @@
     OBJ_INFECT(res, str);
     if (!NIL_P(pad)) OBJ_INFECT(res, pad);
     rb_enc_associate(res, enc);
+    if (argc == 2)
+	ENC_CODERANGE_SET(res, ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad)));
+    else
+	ENC_CODERANGE_SET(res, cr);
     return res;
 }
 
@@ -6677,6 +6912,7 @@
     rb_define_method(rb_cString, "lines", rb_str_each_line, -1);
     rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0);
     rb_define_method(rb_cString, "chars", rb_str_each_char, 0);
+    rb_define_method(rb_cString, "codepoints", rb_str_each_codepoint, 0);
     rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
     rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
     rb_define_method(rb_cString, "concat", rb_str_concat, 1);
@@ -6726,6 +6962,7 @@
     rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
     rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
     rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
+    rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
 
     rb_define_method(rb_cString, "sum", rb_str_sum, -1);

Thread

Prev Next

In This Thread

Prev Next