ruby-dev

前から考えていたのですが、ファイル先頭の magic comment や
BOM を調べて、開いたファイルの external encoding を自動的に
設定してくれるライブラリを標準添付するのはどうでしょうか。

% head -1 lib/rexml/rexml.rb
# -*- encoding: utf-8 -*-
% ./ruby -Ilib -rencdet -e '
EncDet.open("lib/rexml/rexml.rb") {|f|
  p f.external_encoding   
}' 
#<Encoding:UTF-8>

というように検出できます。

とりあえず、lib/rdoc/rdoc.rb の中で場当たり的に行っている
encoding の判定を、このようなライブラリを使う形に置き換えたい
と考えています。

Index: lib/encdet.rb
===================================================================
--- lib/encdet.rb	(revision 0)
+++ lib/encdet.rb	(revision 0)
@@ -0,0 +1,96 @@
+module EncDet
+  def EncDet.open(fname, mode='r')
+    if block_given?
+      File.open(fname, mode) {|f|
+        f.set_encoding detect_encoding(f)
+        yield f
+      }
+    else
+      f = File.open(fname, mode)
+      f.set_encoding detect_encoding(f)
+      f
+    end
+  end
+
+  PEEKSIZE = 4096
+
+  Detectors = {}
+
+  Detectors[:magic_comment] = lambda {|buf|
+    if /\A#!/ =~ buf
+      numlines = 2
+    else
+      numlines = 1
+    end
+    target = ''
+    buf.each_line {|line|
+      target << line
+      numlines -= 1
+      break if numlines == 0
+    }
+    if /coding[:=][ \t]*(?<encname>[\w.-]+)[^\w.-]/ =~ target
+      begin
+        Encoding.find(encname)
+      rescue ArgumentError
+        Encoding::ASCII_8BIT
+      end
+    else
+      if numlines == 0 && /\n\z/ =~ target 
+        :not_found
+      else
+        :more_bytes
+      end
+    end
+  }
+
+  Detectors[:utf_bom] = lambda {|buf|
+    h = {
+      "UTF-8" => /\A\xEF(?:\xBB(\xBF)?)?/n,
+      "UTF-16BE" => /\A\xFE(\xFF)?/n,
+      "UTF-16LE" => /\A\xFF(\xFE)?/n,
+    }
+    result = :not_found
+    h.each {|name, pat|
+      if pat =~ buf
+        if $1
+          result = Encoding.find(name)
+          break
+        end
+        result = :more_bytes
+      end
+    }
+    result
+  }
+
+  def EncDet.detect_encoding(f)
+    result = nil
+    detectors = Detectors.dup
+    buf = ''
+    while buf.bytesize < PEEKSIZE
+      begin
+        buf << f.readpartial(PEEKSIZE - buf.bytesize)
+      rescue EOFError
+        break
+      end
+      next_detectors = {}
+      detectors.each {|name, det|
+        r = det.call(buf)
+        case r
+        when :not_found
+          next
+        when :more_bytes
+          next_detectors[name] = det
+        when Encoding
+          result = r
+          break
+        else
+          raise TypeError, "unexpected detection result by #{name}: #{r.inspect}"
+        end
+      }
+      break if next_detectors.empty? || result
+      detectors = next_detectors
+    end
+    f.ungetc buf
+    result || f.external_encoding
+  end
+end
Index: test/test-encdet.rb
===================================================================
--- test/test-encdet.rb	(revision 0)
+++ test/test-encdet.rb	(revision 0)
@@ -0,0 +1,41 @@
+require 'test/unit'
+require 'encdet'
+
+class TestEncDet < Test::Unit::TestCase
+
+  def check_detect_encoding(content, enc)
+    r, w = IO.pipe
+    w.write content
+    w.close
+    result = EncDet.detect_encoding(r)
+    assert_equal(Encoding.find(enc), result)
+  ensure
+    r.close if r && !r.closed?
+    w.close if w && !w.closed?
+  end
+
+  def test_magic_comment
+    [
+      ["-*- coding: utf-8 -*-\n\u{3042}\n", "UTF-8"],
+      ["-*- coding: euc-jp -*-\n\xa4\xa2\n", "EUC-JP"],
+      ["#!/bin/sh\n# -*- coding: utf-8 -*-\n\u{3042}\n", "UTF-8"],
+      ["#!/bin/sh\n# -*- coding: euc-jp -*-\n\xa4\xa2\n", "EUC-JP"],
+      ["\n-*- coding: utf-8 -*-\n\u{3042}\n", Encoding.locale_charmap],
+      ["\n-*- coding: euc-jp -*-\n\xa4\xa2\n", Encoding.locale_charmap],
+    ].each {|content, enc|
+      check_detect_encoding(content, enc)
+    }
+  end
+
+  def test_bom
+    [
+      ["\xef\xbb\xbfab", "UTF-8"],
+      ["\xfe\xff\0a\0b", "UTF-16BE"],
+      ["\xff\xfea\0b\0", "UTF-16LE"],
+    ].each {|content, enc|
+      check_detect_encoding(content, enc)
+    }
+  end
+
+end
+
-- 
[田中 哲][たなか あきら][Tanaka Akira]

Thread

Prev Next

In This Thread

Prev Next