[ruby-dev:37784] Re: String#hash
From:
Nobuyoshi Nakada <nobu@...>
Date:
2009-01-18 20:28:18 UTC
List:
ruby-dev #37784
なかだです。
At Sat, 17 Jan 2009 16:33:08 +0900,
Yukihiro Matsumoto wrote in [ruby-dev:37779]:
> |例えば, Complexだと
> |
> | @real.hash ^ @imag.hash
> |
> |となっています. 実際には, これはあまり良くなくて, @real==@image のとき
> |つねに同じ値になってしまいます. そこで, 別のseedを与え
> |
> | @real.hash(seed1) ^ @image.hash(seed2)
> |
> |の様にすると, 上記の問題はなくなります. これは, Array等のhash関数でも
> |同様です. また, Fixnum#hashとObject#hashも別のseedを与えるのもよさそう
> |な気がします.
>
> 毎回変わる理由は[ruby-dev:37778]の通りなんですが、各オブジェ
> クトのhashの計算方式を更新した方が良いという提案はそれとは独
> 立に有効だと思います。どんな計算式が良いのか議論していただけ
> ませんか?
ComplexやRationalはrb_memhash()が使えると思いますが、可変長であ
るArrayやObject、Structなどでは多少工夫が必要でしょう。
しかし、MurmurHashって途中でたまたま0になるとそれ以降のデータは
無関係になるような気がするんですが、いいんでしょうか。
Index: array.c
===================================================================
--- array.c (revision 21650)
+++ array.c (working copy)
@@ -2722,10 +2722,10 @@ recursive_hash(VALUE ary, VALUE dummy, i
return LONG2FIX(0);
}
- h = RARRAY_LEN(ary);
+ h = rb_hash_start(RARRAY_LEN(ary));
for (i=0; i<RARRAY_LEN(ary); i++) {
- h = (h << 1) | (h<0 ? 1 : 0);
n = rb_hash(RARRAY_PTR(ary)[i]);
- h ^= NUM2LONG(n);
+ h = rb_hash_uint(h, NUM2LONG(n));
}
+ h = rb_hash_end(h);
return LONG2FIX(h);
}
Index: complex.c
===================================================================
--- complex.c (revision 21650)
+++ complex.c (working copy)
@@ -23,5 +23,5 @@ VALUE rb_cComplex;
static ID id_abs, id_abs2, id_arg, id_cmp, id_conj, id_convert,
- id_denominator, id_divmod, id_equal_p, id_expt, id_floor, id_hash,
+ id_denominator, id_divmod, id_equal_p, id_expt, id_floor,
id_idiv, id_inspect, id_negate, id_numerator, id_polar, id_quo,
id_real_p, id_to_f, id_to_i, id_to_r, id_to_s;
@@ -154,6 +154,4 @@ f_sub(VALUE x, VALUE y)
}
-binop(xor, '^')
-
fun1(abs)
fun1(abs2)
@@ -162,5 +160,4 @@ fun1(conj)
fun1(denominator)
fun1(floor)
-fun1(hash)
fun1(inspect)
fun1(negate)
@@ -858,6 +855,15 @@ static VALUE
nucomp_hash(VALUE self)
{
+ long v, h[3];
+ VALUE n;
+
get_dat1(self);
- return f_xor(f_hash(dat->real), f_hash(dat->imag));
+ h[0] = rb_hash(rb_obj_class(self));
+ n = rb_hash(dat->real);
+ h[1] = NUM2LONG(n);
+ n = rb_hash(dat->imag);
+ h[2] = NUM2LONG(n);
+ v = rb_memhash(h, sizeof(h));
+ return LONG2FIX(v);
}
@@ -1383,5 +1389,4 @@ Init_Complex(void)
id_expt = rb_intern("**");
id_floor = rb_intern("floor");
- id_hash = rb_intern("hash");
id_idiv = rb_intern("div");
id_inspect = rb_intern("inspect");
Index: hash.c
===================================================================
--- hash.c (revision 21650)
+++ hash.c (working copy)
@@ -70,5 +70,5 @@ rb_any_hash(VALUE a)
case T_FIXNUM:
case T_SYMBOL:
- hnum = (int)a;
+ hnum = rb_hash_end(rb_hash_start((unsigned int)a));
break;
@@ -1511,7 +1511,10 @@ static int
hash_i(VALUE key, VALUE val, int *hval)
{
+ VALUE n;
if (key == Qundef) return ST_CONTINUE;
- *hval ^= rb_hash(key);
- *hval ^= rb_hash(val);
+ n = rb_hash(key);
+ *hval = rb_hash_uint(*hval, NUM2LONG(n));
+ n = rb_hash(val);
+ *hval = rb_hash_uint(*hval, NUM2LONG(n));
return ST_CONTINUE;
}
@@ -1527,6 +1530,7 @@ recursive_hash(VALUE hash, VALUE dummy,
if (!RHASH(hash)->ntbl)
return LONG2FIX(0);
- hval = RHASH(hash)->ntbl->num_entries;
+ hval = rb_hash_start(RHASH(hash)->ntbl->num_entries);
rb_hash_foreach(hash, hash_i, (st_data_t)&hval);
+ hval = rb_hash_end(hval);
return INT2FIX(hval);
}
Index: gc.c
===================================================================
--- gc.c (revision 21650)
+++ gc.c (working copy)
@@ -2904,5 +2904,4 @@ Init_GC(void)
OBJ_FREEZE(nomem_error);
- rb_define_method(rb_mKernel, "hash", rb_obj_id, 0);
rb_define_method(rb_mKernel, "__id__", rb_obj_id, 0);
rb_define_method(rb_mKernel, "object_id", rb_obj_id, 0);
Index: object.c
===================================================================
--- object.c (revision 21650)
+++ object.c (working copy)
@@ -96,4 +96,12 @@ rb_obj_equal(VALUE obj1, VALUE obj2)
}
+VALUE
+rb_obj_hash(VALUE obj)
+{
+ VALUE oid = rb_obj_id(obj);
+ unsigned h = rb_hash_end(rb_hash_start(NUM2LONG(oid)));
+ return LONG2NUM(h);
+}
+
/*
* call-seq:
@@ -2517,4 +2525,5 @@ Init_Object(void)
rb_define_method(rb_mKernel, "!~", rb_obj_not_match, 1);
rb_define_method(rb_mKernel, "eql?", rb_obj_equal, 1);
+ rb_define_method(rb_mKernel, "hash", rb_obj_hash, 0);
rb_define_method(rb_mKernel, "class", rb_obj_class, 0);
Index: range.c
===================================================================
--- range.c (revision 21650)
+++ range.c (working copy)
@@ -214,12 +214,14 @@ static VALUE
range_hash(VALUE range)
{
- long hash = EXCL(range);
+ unsigned hash = EXCL(range);
VALUE v;
+ hash = rb_hash_start(hash);
v = rb_hash(RANGE_BEG(range));
- hash ^= v << 1;
+ hash = rb_hash_uint(hash, NUM2LONG(v));
v = rb_hash(RANGE_END(range));
- hash ^= v << 9;
- hash ^= EXCL(range) << 24;
+ hash = rb_hash_uint(hash, NUM2LONG(v));
+ hash = rb_hash_uint(hash, EXCL(range) << 24);
+ hash = rb_hash_end(hash);
return LONG2FIX(hash);
Index: rational.c
===================================================================
--- rational.c (revision 21650)
+++ rational.c (working copy)
@@ -28,5 +28,5 @@ VALUE rb_cRational;
static ID id_abs, id_cmp, id_convert, id_equal_p, id_expt, id_floor,
- id_hash, id_idiv, id_inspect, id_integer_p, id_negate, id_to_f,
+ id_idiv, id_inspect, id_integer_p, id_negate, id_to_f,
id_to_i, id_to_s, id_truncate;
@@ -136,9 +136,6 @@ f_sub(VALUE x, VALUE y)
}
-binop(xor, '^')
-
fun1(abs)
fun1(floor)
-fun1(hash)
fun1(inspect)
fun1(integer_p)
@@ -1162,6 +1159,15 @@ static VALUE
nurat_hash(VALUE self)
{
+ long v, h[3];
+ VALUE n;
+
get_dat1(self);
- return f_xor(f_hash(dat->num), f_hash(dat->den));
+ h[0] = rb_hash(rb_obj_class(self));
+ n = rb_hash(dat->num);
+ h[1] = NUM2LONG(n);
+ n = rb_hash(dat->den);
+ h[2] = NUM2LONG(n);
+ v = rb_memhash(h, sizeof(h));
+ return LONG2FIX(v);
}
@@ -1555,5 +1561,4 @@ Init_Rational(void)
id_expt = rb_intern("**");
id_floor = rb_intern("floor");
- id_hash = rb_intern("hash");
id_idiv = rb_intern("div");
id_inspect = rb_intern("inspect");
Index: string.c
===================================================================
--- string.c (revision 21650)
+++ string.c (working copy)
@@ -1883,10 +1883,17 @@ rb_str_concat(VALUE str1, VALUE str2)
/* MurmurHash described in http://murmurhash.googlepages.com/ */
-static unsigned int
-hash(const unsigned char * data, int len, unsigned int h)
+static inline uint32_t
+murmur(unsigned int h, const int r)
{
const unsigned int m = 0x7fd652ad;
- const int r = 16;
+ h *= m;
+ return h ^ (h >> r);
+}
+
+#define murmur16(h) murmur(h, 16)
+static unsigned int
+hash(const unsigned char * data, int len, unsigned int h)
+{
h += 0xdeadbeef;
@@ -1929,7 +1936,5 @@ hash(const unsigned char * data, int len
t = (t >> sr) | (d << sl);
#endif
- h += t;
- h *= m;
- h ^= h >> r;
+ h = murmur16(h + t);
t = d;
@@ -1954,6 +1959,5 @@ hash(const unsigned char * data, int len
h += (t >> sr) | (d << sl);
#endif
- h *= m;
- h ^= h >> r;
+ h = murmur16(h);
}
@@ -1965,8 +1969,5 @@ hash(const unsigned char * data, int len
{
do {
- h += *(uint32_t *)data;
- h *= m;
- h ^= h >> r;
-
+ h = murmur16(h + *(uint32_t *)data);
data += 4;
len -= 4;
@@ -1991,18 +1992,57 @@ hash(const unsigned char * data, int len
h += data[0];
#endif
- h *= m;
- h ^= h >> r;
+ h = murmur16(h);
}
- h *= m;
- h ^= h >> 10;
- h *= m;
- h ^= h >> 17;
+ return rb_hash_end(h);
+}
+
+unsigned int
+rb_hash_uint32(unsigned int h, unsigned int i)
+{
+ return murmur(h + i, 16);
+}
+
+unsigned int
+rb_hash_uint(unsigned int h, unsigned int i)
+{
+ unsigned int v = 0;
+ h += i;
+#ifdef WORDS_BIGENDIAN
+#if SIZEOF_INT*CHAR_BIT > 12*8
+ v = murmur16(v + (h >> 12*8));
+#endif
+#if SIZEOF_INT*CHAR_BIT > 8*8
+ v = murmur16(v + (h >> 8*8));
+#endif
+#if SIZEOF_INT*CHAR_BIT > 4*8
+ v = murmur16(v + (h >> 4*8));
+#endif
+#endif
+ v = murmur16(v + h);
+#ifndef WORDS_BIGENDIAN
+#if SIZEOF_INT*CHAR_BIT > 4*8
+ v = murmur16(v + (h >> 4*8));
+#endif
+#if SIZEOF_INT*CHAR_BIT > 8*8
+ v = murmur16(v + (h >> 8*8));
+#endif
+#if SIZEOF_INT*CHAR_BIT > 12*8
+ v = murmur16(v + (h >> 12*8));
+#endif
+#endif
+ return v;
+}
+unsigned int
+rb_hash_end(unsigned int h)
+{
+ h = murmur(h, 10);
+ h = murmur(h, 17);
return h;
}
-int
-rb_memhash(const void *ptr, long len)
+unsigned int
+rb_hash_start(unsigned int h)
{
static int hashseed_init = 0;
@@ -2013,6 +2053,11 @@ rb_memhash(const void *ptr, long len)
hashseed_init = 1;
}
+ return hashseed + h;
+}
- return hash(ptr, len, hashseed);
+int
+rb_memhash(const void *ptr, long len)
+{
+ return hash(ptr, len, rb_hash_start(0));
}
Index: struct.c
===================================================================
--- struct.c (revision 21650)
+++ struct.c (working copy)
@@ -805,14 +805,15 @@ static VALUE
rb_struct_hash(VALUE s)
{
- long i, h;
+ long i;
+ unsigned h;
VALUE n;
- h = rb_hash(rb_obj_class(s));
+ h = rb_hash_start(rb_hash(rb_obj_class(s)));
for (i = 0; i < RSTRUCT_LEN(s); i++) {
- h = (h << 1) | (h<0 ? 1 : 0);
n = rb_hash(RSTRUCT_PTR(s)[i]);
- h ^= NUM2LONG(n);
+ h = rb_hash_uint(h, NUM2LONG(n));
}
- return LONG2FIX(h);
+ h = rb_hash_end(h);
+ return INT2FIX(h);
}
Index: time.c
===================================================================
--- time.c (revision 21650)
+++ time.c (working copy)
@@ -1184,5 +1184,5 @@ time_hash(VALUE time)
GetTimeval(time, tobj);
- hash = tobj->ts.tv_sec ^ tobj->ts.tv_nsec;
+ hash = rb_hash_end(rb_hash_uint(rb_hash_start(tobj->ts.tv_sec), tobj->ts.tv_nsec));
return LONG2FIX(hash);
}
Index: include/ruby/intern.h
===================================================================
--- include/ruby/intern.h (revision 21650)
+++ include/ruby/intern.h (working copy)
@@ -607,4 +607,8 @@ VALUE rb_str_append(VALUE, VALUE);
VALUE rb_str_concat(VALUE, VALUE);
int rb_memhash(const void *ptr, long len);
+unsigned int rb_hash_start(unsigned int);
+unsigned int rb_hash_uint32(unsigned int, unsigned int);
+unsigned int rb_hash_uint(unsigned int, unsigned int);
+unsigned int rb_hash_end(unsigned int);
int rb_str_hash(VALUE);
int rb_str_hash_cmp(VALUE,VALUE);
--
--- 僕の前にBugはない。
--- 僕の後ろにBugはできる。
中田 伸悦