From: "naruse (Yui NARUSE)" <naruse@...> Date: 2013-03-18T15:22:37+09:00 Subject: [ruby-core:53493] [ruby-trunk - Bug #7267] Dir.glob on Mac OS X returns unexpected string encodings for unicode file names Issue #7267 has been updated by naruse (Yui NARUSE). Attaching my proof of concept code. diff --git a/dir.c b/dir.c index d4b3dd3..126c27e 100644 --- a/dir.c +++ b/dir.c @@ -81,6 +81,84 @@ char *strchr(char*,char); #define rb_sys_fail_path(path) rb_sys_fail_str(path) +#if defined(__APPLE__) + +#include <sys/param.h> +#include <sys/mount.h> + +static int +is_hfs(const char *path, size_t len) +{ + struct statfs buf; + char *p = ALLOCA_N(char, len+1); + memcpy(p, path, len); + p[len] = 0; + if (statfs(p, &buf) == 0) { + return buf.f_type == 17; /* HFS on darwin */ + } + return FALSE; +} + +/* + * vpath is UTF8-MAC string + */ +VALUE +compose_utf8_mac(VALUE vpath) +{ + const char *path0 = RSTRING_PTR(vpath); + const char *p = path0; + const char *subpath = p; + const char *pend = RSTRING_END(vpath); + int hfs_p; + VALUE result; + rb_encoding *utf8, *utf8mac; + static VALUE utf8enc; + + if (*p++ != '/') + return vpath; + + if (!utf8enc) { + utf8mac = rb_enc_find("UTF8-MAC"); + utf8 = rb_utf8_encoding(); + utf8enc = rb_enc_from_encoding(utf8); + } + + result = rb_str_buf_new(RSTRING_LEN(vpath)); + hfs_p = is_hfs("/", 1); + + for (; p < pend; p++) { + if (*p != '/') + continue; + if (hfs_p != is_hfs(path0, p-path0)) { + if (hfs_p) { + VALUE str = rb_enc_str_new(subpath, p - subpath, utf8mac); + rb_str_buf_append(result, rb_str_encode(str, utf8enc, 0, Qnil)); + } + else { + rb_str_buf_cat(result, subpath, p - subpath); + } + hfs_p = !hfs_p; + subpath = p; + } + } + if (hfs_p) { + VALUE str = rb_enc_str_new(subpath, p - subpath, utf8mac); + rb_str_buf_append(result, rb_str_encode(str, utf8enc, 0, Qnil)); + } + else { + rb_str_buf_cat(result, subpath, p - subpath); + } + + rb_enc_associate(result, utf8); + return result; +} +static VALUE +dir_s_compose_path(VALUE dir, VALUE 
path) +{ + return compose_utf8_mac(path); +} +#endif + #define FNM_NOESCAPE 0x01 #define FNM_PATHNAME 0x02 #define FNM_DOTMATCH 0x04 @@ -2120,6 +2198,7 @@ Init_Dir(void) rb_define_singleton_method(rb_cDir,"delete", dir_s_rmdir, 1); rb_define_singleton_method(rb_cDir,"unlink", dir_s_rmdir, 1); rb_define_singleton_method(rb_cDir,"home", dir_s_home, -1); + rb_define_singleton_method(rb_cDir,"compose", dir_s_compose_path, 1); rb_define_singleton_method(rb_cDir,"glob", dir_s_glob, -1); rb_define_singleton_method(rb_cDir,"[]", dir_s_aref, -1); ---------------------------------------- Bug #7267: Dir.glob on Mac OS X returns unexpected string encodings for unicode file names https://bugs.ruby-lang.org/issues/7267#change-37687 Author: kennygrant (Kenny Grant) Status: Assigned Priority: Normal Assignee: duerst (Martin Dürst) Category: Target version: next minor ruby -v: ruby 1.9.3p194 (2012-04-20 revision 35410) [x86_64-darwin11.4.0] Tested on Ruby 1.9.3-p194 and ruby-2.0.0-preview1 on Mac OS X 10.7.5 When calling file system methods with Ruby on Mac OS X, it is not possible to manipulate the resulting file name as a normal UTF-8 string, even though it reports the encoding as UTF-8. It seems to be a UTF-8-MAC string, even when the default encoding is set to UTF-8. This leads to confusion as the string can be manipulated normally except for any unicode characters, which seem to be decomposed. So a regexp using utf-8 characters won't work on the string, unless it is first converted from UTF-8-MAC. I'd expect the string encoding to be UTF-8, or at least to report that it is not a normal UTF-8 string if it has to be UTF-8-MAC for some reason. 
Example, run with a file called Testé.txt in the same folder: def transform_string s puts "Testing string #{s}" puts s.gsub(/é/,'TEST') end Dir.glob("./*.txt").each do |f| puts "Inline string works as expected" s = "./Testé.txt" puts transform_string s puts "File name from Dir.glob does not" puts transform_string f puts "Encoded file name works as expected, though it is reported as UTF-8, not UTF-8-MAC" f.encode!('UTF-8','UTF-8-MAC') puts transform_string f end -- http://bugs.ruby-lang.org/