ruby-core

On Fri, 12 Sep 2008 02:16:51 +1000, NARUSE, Yui <naruse@airemix.jp> wrote:

>
> If you split your patch into small atomic patches,
> your patch will be merged rapidly.
>

Unfortunately this is very time consuming because string.c is changing  
every day lately!
Some of the things in my previous patch have already been done by Matz.

As a start I have attached 2 "atomic" patches to string.c. These are:

substr.pat:
Improves the speed of rb_str_substr(), mainly for single byte encodings,  
and also (less so) for fixed-width char multi-byte encodings.
rb_str_substr is used frequently so even minor improvement in speed is  
good.

split.pat:
Improves the speed of String#split when the parameter is a string (at  
least twice as fast).
Note: this patch breaks the test suite but only because of exceptions  
raised. The new implementation will NOT raise an error when running  
s.split(s2) when s is a broken string, This is COMPATIBLE with the other  
string methods, but INCOMPATIBLE with the previous split implementation  
which converted s2 to a regexp.

I'll try to get the other things I have done to you later.

Cheers
Mike

Attachments (2)

substr.pat (2.36 KB, text/x-diff)

Index: string.c
===================================================================
--- string.c	(revision 19374)
+++ string.c	(working copy)
@@ -112,7 +112,7 @@
 
 #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
 
-static int
+static inline int
 single_byte_optimizable(VALUE str)
 {
     rb_encoding *enc;
@@ -1194,7 +1194,7 @@
     return (char*)p;
 }
 
-static char*
+static inline char*
 str_nth(const char *p, const char *e, int nth, rb_encoding *enc, int singlebyte)
 {
     if (singlebyte)
@@ -1208,7 +1208,7 @@
 }
 
 /* char offset to byte offset */
-static int
+static inline int
 str_offset(const char *p, const char *e, int nth, rb_encoding *enc, int singlebyte)
 {
     const char *pp = str_nth(p, e, nth, enc, singlebyte);
@@ -1283,12 +1283,27 @@
     rb_encoding *enc = STR_ENC_GET(str);
     VALUE str2;
     char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
-    int singlebyte;
 
     if (len < 0) return Qnil;
     if (!RSTRING_LEN(str)) {
 	len = 0;
     }
+    if (single_byte_optimizable(str)) {
+	if (beg > RSTRING_LEN(str)) return Qnil;
+	if (beg < 0) {
+	    beg += RSTRING_LEN(str);
+	    if (beg < 0) return Qnil;
+	}
+	if (beg + len > RSTRING_LEN(str))
+	    len = RSTRING_LEN(str) - beg;
+	if (len <= 0) {
+	    len = 0;
+	    p = 0;
+	}
+	else
+	    p = s + beg;
+	goto sub;
+    }
     if (beg < 0) {
 	if (len > -beg) len = -beg;
 	if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
@@ -1309,7 +1324,6 @@
     else if (beg > 0 && beg > str_strlen(str, enc)) {
 	return Qnil;
     }
-    singlebyte = single_byte_optimizable(str);
     if (len == 0) {
 	p = 0;
     }
@@ -1320,17 +1334,24 @@
         len = str_utf8_offset(p, e, len);
     }
 #endif
-    else if ((p = str_nth(s, e, beg, enc, singlebyte)) == e) {
-	len = 0;
-    }
     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
-        if (len * rb_enc_mbmaxlen(enc) > e - p)
+	int char_sz = rb_enc_mbmaxlen(enc);
+
+	p = s + beg * char_sz;
+	if (p > e) {
+	    p = e;
+	    len = 0;
+	}
+        else if (len * char_sz > e - p)
             len = e - p;
         else
-	    len *= rb_enc_mbmaxlen(enc);
+	    len *= char_sz;
     }
+    else if ((p = str_nth(s, e, beg, enc, 0)) == e) {
+	len = 0;
+    }
     else {
-	len = str_offset(p, e, len, enc, singlebyte);
+	len = str_offset(p, e, len, enc, 0);
     }
   sub:
     if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {

split.pat (2.33 KB, text/x-diff)

Index: string.c
===================================================================
--- string.c	(revision 19374)
+++ string.c	(working copy)
@@ -4991,7 +4991,7 @@
     rb_encoding *enc;
     VALUE spat;
     VALUE limit;
-    int awk_split = Qfalse;
+    enum {awk, string, regexp} split_type;
     long beg, end, i = 0;
     int lim = 0;
     VALUE result, tmp;
@@ -5013,37 +5013,41 @@
 	    spat = rb_fs;
 	    goto fs_set;
 	}
-	awk_split = Qtrue;
+	split_type = awk;
     }
     else {
       fs_set:
 	if (TYPE(spat) == T_STRING) {
 	    rb_encoding *enc2 = STR_ENC_GET(spat);
 
-	    if (rb_enc_mbminlen(enc2) == 1) {
+	    split_type = string;
+	    if (RSTRING_LEN(spat) == 0) {
+		/* Special case - split into chars */
+		spat = rb_reg_regcomp(spat);
+		split_type = regexp;
+	    }
+	    else if (rb_enc_mbminlen(enc2) == 1) {
 		if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
-		    awk_split = Qtrue;
+		    split_type = awk;
 		}
 	    }
 	    else {
 		int l;
 		if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
 		    RSTRING_LEN(spat) == l) {
-		    awk_split = Qtrue;
+		    split_type = awk;
 		}
 	    }
-	    if (!awk_split) {
-		spat = rb_reg_regcomp(rb_reg_quote(spat));
-	    }
 	}
 	else {
 	    spat = get_pat(spat, 1);
+	    split_type = regexp;
 	}
     }
 
     result = rb_ary_new();
     beg = 0;
-    if (awk_split) {
+    if (split_type == awk) {
 	char *ptr = RSTRING_PTR(str);
 	char *eptr = RSTRING_END(str);
 	char *bptr = ptr;
@@ -5077,6 +5081,30 @@
 	    }
 	}
     }
+    else if (split_type == string) {
+	char *ptr = RSTRING_PTR(str);
+	char *eptr = RSTRING_END(str);
+	char *sptr = RSTRING_PTR(spat);
+	int slen = RSTRING_LEN(spat);
+
+	if (is_broken_string(spat)) {
+	    rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
+	}
+	enc = rb_enc_check(str, spat);
+	while (ptr < eptr &&
+	       (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
+	    /* Check we are at the start of a char */
+	    char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
+	    if (t != ptr + end) {
+		ptr = t;
+		continue;
+	    }
+	    rb_ary_push(result, rb_str_substr(str, ptr - RSTRING_PTR(str), end));
+	    ptr += end + slen;
+	    if (!NIL_P(limit) && lim <= ++i) break;
+	}
+	beg = ptr - RSTRING_PTR(str);
+    }
     else {
 	long start = beg;
 	long idx;

Thread

Prev Next

In This Thread

Prev Next