From: "naruse (Yui NARUSE)" <naruse@...>
Date: 2013-03-18T15:22:37+09:00
Subject: [ruby-core:53493] [ruby-trunk - Bug #7267] Dir.glob on Mac OS X returns unexpected string encodings for unicode file names


Issue #7267 has been updated by naruse (Yui NARUSE).


Attaching my proof of concept code.

diff --git a/dir.c b/dir.c
index d4b3dd3..126c27e 100644
--- a/dir.c
+++ b/dir.c
@@ -81,6 +81,84 @@ char *strchr(char*,char);
 
 #define rb_sys_fail_path(path) rb_sys_fail_str(path)
 
+#if defined(__APPLE__)
+
+#include <sys/param.h>
+#include <sys/mount.h>
+
+static int
+is_hfs(const char *path, size_t len)
+{
+    struct statfs buf;
+    char *p = ALLOCA_N(char, len+1);
+    memcpy(p, path, len);
+    p[len] = 0;
+    if (statfs(p, &buf) == 0) {
+	return buf.f_type == 17; /* HFS on darwin */
+    }
+    return FALSE;
+}
+
+/*
+ * vpath is UTF8-MAC string
+ */
+VALUE
+compose_utf8_mac(VALUE vpath)
+{
+    const char *path0 = RSTRING_PTR(vpath);
+    const char *p = path0;
+    const char *subpath = p;
+    const char *pend = RSTRING_END(vpath);
+    int hfs_p;
+    VALUE result;
+    rb_encoding *utf8, *utf8mac;
+    static VALUE utf8enc;
+
+    if (*p++ != '/')
+	return vpath;
+
+    if (!utf8enc) {
+	utf8mac = rb_enc_find("UTF8-MAC");
+	utf8 = rb_utf8_encoding();
+	utf8enc = rb_enc_from_encoding(utf8);
+    }
+
+    result = rb_str_buf_new(RSTRING_LEN(vpath));
+    hfs_p = is_hfs("/", 1);
+
+    for (; p < pend; p++) {
+	if (*p != '/')
+	    continue;
+	if (hfs_p != is_hfs(path0, p-path0)) {
+	    if (hfs_p) {
+		VALUE str = rb_enc_str_new(subpath, p - subpath, utf8mac);
+		rb_str_buf_append(result, rb_str_encode(str, utf8enc, 0, Qnil));
+	    }
+	    else {
+                rb_str_buf_cat(result, subpath, p - subpath);
+	    }
+	    hfs_p = !hfs_p;
+	    subpath = p;
+	}
+    }
+    if (hfs_p) {
+	VALUE str = rb_enc_str_new(subpath, p - subpath, utf8mac);
+	rb_str_buf_append(result, rb_str_encode(str, utf8enc, 0, Qnil));
+    }
+    else {
+	rb_str_buf_cat(result, subpath, p - subpath);
+    }
+
+    rb_enc_associate(result, utf8);
+    return result;
+}
+static VALUE
+dir_s_compose_path(VALUE dir, VALUE path)
+{
+    return compose_utf8_mac(path);
+}
+#endif
+
 #define FNM_NOESCAPE	0x01
 #define FNM_PATHNAME	0x02
 #define FNM_DOTMATCH	0x04
@@ -2120,6 +2198,7 @@ Init_Dir(void)
     rb_define_singleton_method(rb_cDir,"delete", dir_s_rmdir, 1);
     rb_define_singleton_method(rb_cDir,"unlink", dir_s_rmdir, 1);
     rb_define_singleton_method(rb_cDir,"home", dir_s_home, -1);
+    rb_define_singleton_method(rb_cDir,"compose", dir_s_compose_path, 1);
 
     rb_define_singleton_method(rb_cDir,"glob", dir_s_glob, -1);
     rb_define_singleton_method(rb_cDir,"[]", dir_s_aref, -1);

----------------------------------------
Bug #7267: Dir.glob on Mac OS X returns unexpected string encodings for unicode file names
https://bugs.ruby-lang.org/issues/7267#change-37687

Author: kennygrant (Kenny Grant)
Status: Assigned
Priority: Normal
Assignee: duerst (Martin D��rst)
Category: 
Target version: next minor
ruby -v: ruby 1.9.3p194 (2012-04-20 revision 35410) [x86_64-darwin11.4.0]


Tested on Ruby 1.9.3-p194 and ruby-2.0.0-preview1 on Mac OS X 10. 7.5

When calling file system methods with Ruby on Mac OS X, it is not possible to manipulate the resulting file name as a normal UTF-8 string, even though it reports the encoding as UTF-8. It seems to be a UTF-8-MAC string, even when the default encoding is set to UTF-8. This leads to confusion as the string can be manipulated normally except for any unicode characters, which seem to be decomposed. So a regexp using utf-8 characters won't work on the string, unless it is first converted from UTF-8-MAC. I'd expect the string encoding to be UTF-8, or at least to report that it is not a normal UTF-8 string if it has to be UTF-8-MAC for some reason. 

Example, run with a file called Test��.txt in the same folder:

def transform_string s
   puts "Testing string #{s}"
   puts s.gsub(/��/,'TEST')
end

Dir.glob("./*.txt").each do |f|  
  puts "Inline string works as expected" 
   s = "./Test��.txt" 
   puts transform_string s

   puts "File name from Dir.glob does not" 
   puts transform_string f
   
   puts "Encoded file name works as expected, though it is reported as UTF-8, not UTF-8-MAC" 
   f.encode!('UTF-8','UTF-8-MAC')
   puts transform_string f
end


-- 
http://bugs.ruby-lang.org/