[#38392] Enumerable#gather_each — Tanaka Akira <akr@...>

ときに、複数行をまとめて扱いたいことがあります。

47 messages 2009/05/09
[#38394] Re: Enumerable#gather_each — ujihisa <ujihisa@...> 2009/05/09

ujihisaと申します。

[#38400] Re: Enumerable#gather_each — Yukihiro Matsumoto <matz@...> 2009/05/09

まつもと ゆきひろです

[#38399] Re: Enumerable#gather_each — "Akinori MUSHA" <knu@...> 2009/05/09

At Sat, 9 May 2009 15:30:20 +0900,

[#38405] Re: Enumerable#gather_each — Tanaka Akira <akr@...> 2009/05/10

In article <86r5yy2nrg.knu@iDaemons.org>,

[#38417] Re: Enumerable#gather_each — "Akinori MUSHA" <knu@...> 2009/05/10

At Sun, 10 May 2009 10:08:47 +0900,

[#38524] [Bug #1503] -Kuをつけた時、/[#{s}]/n と Regexp.new("[#{s}]",nil,"n") で実行結果が異なる — sinnichi eguchi <redmine@...>

Bug #1503: -Kuをつけた時、/[#{s}]/n と Regexp.new("[#{s}]",nil,"n") で実行結果が異なる

8 messages 2009/05/22

[ruby-dev:38449] Re: Enumerable#gather_each

From: Tanaka Akira <akr@...>
Date: 2009-05-14 14:35:33 UTC
List: ruby-dev #38449
In article <87d4acfn27.fsf@fsij.org>,
  Tanaka Akira <akr@fsij.org> writes:

> Python のように、each のブロックの第一引数に渡してもいいのか
> もしれません。
>
> enum.gather(init_state) {|elt,state| ... }.each {|category,ary| ... }

こうすると、これが group_by と似ていることがよくわかります。

% ./ruby -e '(1..10).gather {|e| e & 4 }.each {|x| p x }'
[0, [1, 2, 3]]
[4, [4, 5, 6, 7]]
[0, [8, 9, 10]]
% ./ruby -e '(1..10).group_by {|e| e & 4 }.each {|x| p x }'
[0, [1, 2, 3, 8, 9, 10]]
[4, [4, 5, 6, 7]]

gather は 1, 2, 3 と 8, 9, 10 がわかれていますが、
group_by は最後までみるのでそれらがまとまっています。

まぁ、返り値が Enumerator と Hash と違ったりもするので、全体
として似ているかというとまた別ですが。

というように実装してみたものをつけておきます。

% svn diff --diff-cmd diff -x '-u -p'
Index: enum.c
===================================================================
--- enum.c	(revision 23381)
+++ enum.c	(working copy)
@@ -1793,6 +1793,247 @@ enum_cycle(int argc, VALUE *argv, VALUE 
     return Qnil;		/* not reached */
 }
 
+struct chunk_by_arg {  /* state shared between chunk_by_i and its per-element callback chunk_by_ii */
+    VALUE categorize;  /* the block given to chunk_by; called with (element, state) */
+    VALUE state;       /* duplicate of the initial state, or nil when none was given */
+    VALUE prev_value;  /* category of the chunk being gathered; nil when no chunk is open */
+    VALUE prev_elts;   /* array of elements gathered for prev_value; nil when no chunk is open */
+    VALUE yielder;     /* receives each finished [category, elements] pair via "<<" */
+};
+
+static VALUE
+chunk_by_ii(VALUE i, VALUE _argp, int argc, VALUE *argv) /* per-element callback of chunk_by; _argp is a struct chunk_by_arg * cast to VALUE */
+{
+    struct chunk_by_arg *argp = (struct chunk_by_arg *)_argp;
+    VALUE v; /* category the block returns for this element */
+    VALUE singleton = ID2SYM(rb_intern("_singleton"));
+    VALUE separator = ID2SYM(rb_intern("_separator"));
+
+    ENUM_WANT_SVALUE(); /* macro defined outside this hunk; presumably normalizes i from argc/argv -- TODO confirm */
+
+    v = rb_funcall(argp->categorize, rb_intern("call"), 2, i, argp->state);
+
+    if (v == singleton) { /* :_singleton: flush any open chunk, then yield this element as a chunk of its own */
+        if (!NIL_P(argp->prev_value)) {
+            rb_funcall(argp->yielder, rb_intern("<<"), 1, rb_assoc_new(argp->prev_value, argp->prev_elts));
+            argp->prev_value = argp->prev_elts = Qnil;
+        }
+        rb_funcall(argp->yielder, rb_intern("<<"), 1, rb_assoc_new(v, rb_ary_new3(1, i)));
+    }
+    else if (!RTEST(v) || v == separator) { /* nil, false or :_separator: flush any open chunk and drop this element */
+        if (!NIL_P(argp->prev_value)) {
+            rb_funcall(argp->yielder, rb_intern("<<"), 1, rb_assoc_new(argp->prev_value, argp->prev_elts));
+            argp->prev_value = argp->prev_elts = Qnil;
+        }
+    }
+    else if (SYMBOL_P(v) && rb_id2name(SYM2ID(v))[0] == '_') { /* any other underscore-prefixed symbol is rejected */
+	rb_raise(rb_eRuntimeError, "symbol begins with an underscore is reserved");
+    }
+    else { /* ordinary category value */
+        if (NIL_P(argp->prev_value)) { /* no chunk is open: start one with this element */
+            argp->prev_value = v;
+            argp->prev_elts = rb_ary_new3(1, i);
+        }
+        else {
+            if (rb_equal(argp->prev_value, v)) { /* same category as the open chunk: extend it */
+                rb_ary_push(argp->prev_elts, i);
+            }
+            else { /* category changed: yield the open chunk and start a new one */
+                rb_funcall(argp->yielder, rb_intern("<<"), 1, rb_assoc_new(argp->prev_value, argp->prev_elts));
+                argp->prev_value = v;
+                argp->prev_elts = rb_ary_new3(1, i);
+            }
+        }
+    }
+    return Qnil;
+}
+
+static VALUE
+chunk_by_i(VALUE yielder, VALUE enumerator, int argc, VALUE *argv) /* block passed to the enumerator's "initialize" by enum_chunk_by */
+{
+    VALUE enumerable;
+    struct chunk_by_arg arg;
+
+    enumerable = rb_ivar_get(enumerator, rb_intern("chunk_by_enumerable")); /* values stored on the enumerator by enum_chunk_by */
+    arg.categorize = rb_ivar_get(enumerator, rb_intern("chunk_by_categorize"));
+    arg.state = rb_ivar_get(enumerator, rb_intern("chunk_by_initial_state"));
+    arg.prev_value = Qnil; /* no chunk is open at the start */
+    arg.prev_elts = Qnil;
+    arg.yielder = yielder;
+
+    if (!NIL_P(arg.state)) /* each run works on its own copy of a non-nil initial state */
+        arg.state = rb_obj_dup(arg.state);
+
+    rb_block_call(enumerable, id_each, 0, 0, chunk_by_ii, (VALUE)&arg); /* feed every element of enumerable to chunk_by_ii */
+    if (!NIL_P(arg.prev_elts)) /* flush the chunk still open after the last element */
+        rb_funcall(arg.yielder, rb_intern("<<"), 1, rb_assoc_new(arg.prev_value, arg.prev_elts));
+    return Qnil;
+}
+
+/*
+ *  call-seq:
+ *     enum.chunk_by(initial_state=nil) {|elt, state| ... } => enumerator
+ *
+ *  Creates an enumerator for iterating over chunked elements of _enum_.
+ *
+ *  This method gathers consecutive elements for which the block returns
+ *  the same value, and yields each chunk as a [value, elements] pair.
+ *
+ *  The following block return values have special meaning:
+ *  - nil, false and :_separator specify that the element is dropped and ends the current chunk.
+ *  - :_singleton specifies that the element forms a chunk by itself.
+ *
+ *  Other symbols beginning with an underscore are reserved; returning one raises RuntimeError.
+ *
+ *  If a non-nil value is given for _initial_state_,
+ *  it is duplicated for each "each" method invocation of the enumerator.
+ *  The duplicated object is passed as the second argument of the block for the "chunk_by" method.
+ *
+ *    (1..10).chunk_by {|n| n & 2 }.each {|a| p a }
+ *    #=> [0, [1]]      # 1 & 2 = 0
+ *    #   [2, [2, 3]]   # 2 & 2 = 3 & 2 = 2
+ *    #   [0, [4, 5]]   # 4 & 2 = 5 & 2 = 0
+ *    #   [2, [6, 7]]   # 6 & 2 = 7 & 2 = 2
+ *    #   [0, [8, 9]]   # 8 & 2 = 9 & 2 = 0
+ *    #   [2, [10]]     # 10 & 2 = 2
+ *
+ *    # gather indented blocks.
+ *    io.chunk_by {|line| /\A\s/ =~ line }.each {|lines| pp lines }
+ *
+ *    # iterate over svn log entries.
+ *    IO.popen([{"LANG"=>"C"}, *%w[svn log enum.c]]) {|f|
+ *      sep = "-"*72+"\n"
+ *      f.chunk_by {|line| line != sep }.each {|e| pp e }
+ *    }
+ */
+static VALUE
+enum_chunk_by(int argc, VALUE *argv, VALUE enumerable)
+{
+    VALUE initial_state;
+    VALUE enumerator;
+
+    rb_scan_args(argc, argv, "01", &initial_state); /* zero required, one optional argument */
+
+    enumerator = rb_obj_alloc(rb_cEnumerator);
+    rb_ivar_set(enumerator, rb_intern("chunk_by_enumerable"), enumerable); /* stashed here, read back by chunk_by_i */
+    rb_ivar_set(enumerator, rb_intern("chunk_by_categorize"), rb_block_proc());
+    rb_ivar_set(enumerator, rb_intern("chunk_by_initial_state"), initial_state);
+    rb_block_call(enumerator, rb_intern("initialize"), 0, 0, chunk_by_i, enumerator); /* call "initialize" with chunk_by_i as its block */
+    return enumerator;
+}
+
+struct slice_before_arg {  /* state shared between slice_before_i and its per-element callback slice_before_ii */
+    VALUE separator_p;     /* the block given to slice_before; called with (element, state) */
+    VALUE state;           /* duplicate of the initial state, or nil when none was given */
+    VALUE prev_elts;       /* array of elements in the slice being gathered; nil before the first element */
+    VALUE yielder;         /* receives each finished slice via "<<" */
+};
+
+static VALUE
+slice_before_ii(VALUE i, VALUE _argp, int argc, VALUE *argv) /* per-element callback of slice_before; _argp is a struct slice_before_arg * cast to VALUE */
+{
+    struct slice_before_arg *argp = (struct slice_before_arg *)_argp;
+    VALUE bool; /* result of the block for this element */
+
+    ENUM_WANT_SVALUE(); /* macro defined outside this hunk; presumably normalizes i from argc/argv -- TODO confirm */
+
+    bool = rb_funcall(argp->separator_p, rb_intern("call"), 2, i, argp->state);
+    if (RTEST(bool)) { /* this element starts a new slice */
+        if (!NIL_P(argp->prev_elts)) /* yield the slice gathered so far, if any */
+            rb_funcall(argp->yielder, rb_intern("<<"), 1, argp->prev_elts);
+        argp->prev_elts = rb_ary_new3(1, i);
+    }
+    else { /* this element continues the current slice, or opens the very first one */
+        if (NIL_P(argp->prev_elts))
+            argp->prev_elts = rb_ary_new3(1, i);
+        else
+            rb_ary_push(argp->prev_elts, i);
+    }
+
+    return Qnil;
+}
+
+static VALUE
+slice_before_i(VALUE yielder, VALUE enumerator, int argc, VALUE *argv) /* block passed to the enumerator's "initialize" by enum_slice_before */
+{
+    VALUE enumerable;
+    struct slice_before_arg arg;
+
+    enumerable = rb_ivar_get(enumerator, rb_intern("slice_before_enumerable")); /* values stored on the enumerator by enum_slice_before */
+    arg.separator_p = rb_ivar_get(enumerator, rb_intern("slice_before_separator_p"));
+    arg.state = rb_ivar_get(enumerator, rb_intern("slice_before_initial_state"));
+    arg.prev_elts = Qnil; /* no slice is open at the start */
+    arg.yielder = yielder;
+
+    if (!NIL_P(arg.state)) /* each run works on its own copy of a non-nil initial state */
+        arg.state = rb_obj_dup(arg.state);
+
+    rb_block_call(enumerable, id_each, 0, 0, slice_before_ii, (VALUE)&arg); /* feed every element of enumerable to slice_before_ii */
+    if (!NIL_P(arg.prev_elts)) /* flush the slice still open after the last element */
+        rb_funcall(arg.yielder, rb_intern("<<"), 1, arg.prev_elts);
+    return Qnil;
+}
+
+/*
+ *  call-seq:
+ *     enum.slice_before(initial_state=nil) {|elt, state| ... } => enumerator
+ *
+ *  Creates an enumerator for iterating over sliced elements of _enum_.
+ *
+ *  This method starts a new slice at each element for which
+ *  the block returns a true value; each slice is yielded as an array.
+ *
+ *  If a non-nil value is given for _initial_state_,
+ *  it is duplicated for each "each" method invocation of the enumerator.
+ *  The duplicated object is passed as the second argument of the block for the "slice_before" method.
+ *
+ *    # iterate over ChangeLog entries.
+ *    open("ChangeLog") {|f|
+ *      f.slice_before {|line| /\A\S/ =~ line }.each {|e| pp e}
+ *    }
+ *
+ *    # parse mbox
+ *    open("mbox") {|f|
+ *      f.slice_before {|line|
+ *        line.start_with? "From "
+ *      }.each {|mail|
+ *        unix_from = mail.shift
+ *        i = mail.index("\n")
+ *        header = mail[0...i]
+ *        body = mail[(i+1)..-1]
+ *        fields = header.slice_before {|line| !" \t".include?(line[0]) }.to_a
+ *        p unix_from
+ *        pp fields
+ *        pp body
+ *      }
+ *    }
+ *
+ *    # split mails in mbox (slice before Unix From line after an empty line)
+ *    open("mbox") {|f|
+ *      f.slice_before(emp: true) {|line,h|
+ *        prevemp = h[:emp]
+ *        h[:emp] = line == "\n"
+ *        prevemp && line.start_with?("From ")
+ *      }.each {|mail|
+ *        pp mail
+ *      }
+ *    }
+ */
+static VALUE
+enum_slice_before(int argc, VALUE *argv, VALUE enumerable)
+{
+    VALUE initial_state, enumerator;
+
+    rb_scan_args(argc, argv, "01", &initial_state); /* zero required, one optional argument */
+
+    enumerator = rb_obj_alloc(rb_cEnumerator);
+    rb_ivar_set(enumerator, rb_intern("slice_before_enumerable"), enumerable); /* stashed here, read back by slice_before_i */
+    rb_ivar_set(enumerator, rb_intern("slice_before_separator_p"), rb_block_proc());
+    rb_ivar_set(enumerator, rb_intern("slice_before_initial_state"), initial_state);
+    rb_block_call(enumerator, rb_intern("initialize"), 0, 0, slice_before_i, enumerator); /* call "initialize" with slice_before_i as its block */
+    return enumerator;
+}
+
 /*
  *  The <code>Enumerable</code> mixin provides collection classes with
  *  several traversal and searching methods, and with the ability to
@@ -1852,6 +2093,9 @@ Init_Enumerable(void)
     rb_define_method(rb_mEnumerable, "drop", enum_drop, 1);
     rb_define_method(rb_mEnumerable, "drop_while", enum_drop_while, 0);
     rb_define_method(rb_mEnumerable, "cycle", enum_cycle, -1);
+    rb_define_method(rb_mEnumerable, "gather", enum_chunk_by, -1);
+    rb_define_method(rb_mEnumerable, "chunk_by", enum_chunk_by, -1);
+    rb_define_method(rb_mEnumerable, "slice_before", enum_slice_before, -1);
 
     id_eqq  = rb_intern("===");
     id_each = rb_intern("each");
-- 
[田中 哲][たなか あきら][Tanaka Akira]

In This Thread