From: Tanaka Akira Date: 2008-02-07T15:09:11+09:00 Subject: [ruby-dev:33628] encdet.rb 前から考えていたのですが、ファイル先頭の magic comment や BOM を調べて encoding を設定してくれるライブラリを添付するの はどうでしょうか。 % head -1 lib/rexml/rexml.rb # -*- encoding: utf-8 -*- % ./ruby -Ilib -rencdet -e ' EncDet.open("lib/rexml/rexml.rb") {|f| p f.external_encoding }' # というように検出できます。 とりあえず、lib/rdoc/rdoc.rb で適当にやっているのをこういう のに変えたいんですが。 Index: lib/encdet.rb =================================================================== --- lib/encdet.rb (revision 0) +++ lib/encdet.rb (revision 0) @@ -0,0 +1,96 @@ +module EncDet + def EncDet.open(fname, mode='r') + if block_given? + File.open(fname, mode) {|f| + f.set_encoding detect_encoding(f) + yield f + } + else + f = File.open(fname, mode) + f.set_encoding detect_encoding(f) + f + end + end + + PEEKSIZE = 4096 + + Detectors = {} + + Detectors[:magic_comment] = lambda {|buf| + if /\A#!/ =~ buf + numlines = 2 + else + numlines = 1 + end + target = '' + buf.each_line {|line| + target << line + numlines -= 1 + break if numlines == 0 + } + if /coding[:=][ \t]*(?<encname>[\w.-]+)[^\w.-]/ =~ target + begin + Encoding.find(encname) + rescue ArgumentError + Encoding::ASCII_8BIT + end + else + if numlines == 0 && /\n\z/ =~ target + :not_found + else + :more_bytes + end + end + } + + Detectors[:utf_bom] = lambda {|buf| + h = { + "UTF-8" => /\A\xEF(?:\xBB(\xBF)?)?/n, + "UTF-16BE" => /\A\xFE(\xFF)?/n, + "UTF-16LE" => /\A\xFF(\xFE)?/n, + } + result = :not_found + h.each {|name, pat| + if pat =~ buf + if $1 + result = Encoding.find(name) + break + end + result = :more_bytes + end + } + result + } + + def EncDet.detect_encoding(f) + result = nil + detectors = Detectors.dup + buf = '' + while buf.bytesize < PEEKSIZE + begin + buf << f.readpartial(PEEKSIZE - buf.bytesize) + rescue EOFError + break + end + next_detectors = {} + detectors.each {|name, det| + r = det.call(buf) + case r + when :not_found + next + when :more_bytes + next_detectors[name] = det + when Encoding + result = r + break + else + 
raise TypeError, "unexpected detection result by #{name}: #{r.inspect}" + end + } + break if next_detectors.empty? || result + detectors = next_detectors + end + f.ungetc buf + result || f.external_encoding + end +end Index: test/test-encdet.rb =================================================================== --- test/test-encdet.rb (revision 0) +++ test/test-encdet.rb (revision 0) @@ -0,0 +1,41 @@ +require 'test/unit' +require 'encdet' + +class TestEncDet < Test::Unit::TestCase + + def check_detect_encoding(content, enc) + r, w = IO.pipe + w.write content + w.close + result = EncDet.detect_encoding(r) + assert_equal(Encoding.find(enc), result) + ensure + r.close if r && !r.closed? + w.close if w && !w.closed? + end + + def test_magic_comment + [ + ["-*- coding: utf-8 -*-\n\u{3042}\n", "UTF-8"], + ["-*- coding: euc-jp -*-\n\xa4\xa2\n", "EUC-JP"], + ["#!/bin/sh\n# -*- coding: utf-8 -*-\n\u{3042}\n", "UTF-8"], + ["#!/bin/sh\n# -*- coding: euc-jp -*-\n\xa4\xa2\n", "EUC-JP"], + ["\n-*- coding: utf-8 -*-\n\u{3042}\n", Encoding.locale_charmap], + ["\n-*- coding: euc-jp -*-\n\xa4\xa2\n", Encoding.locale_charmap], + ].each {|content, enc| + check_detect_encoding(content, enc) + } + end + + def test_bom + [ + ["\xef\xbb\xbfab", "UTF-8"], + ["\xfe\xff\0a\0b", "UTF-16BE"], + ["\xff\xfea\0b\0", "UTF-16LE"], + ].each {|content, enc| + check_detect_encoding(content, enc) + } + end + +end + -- [田中 哲][たなか あきら][Tanaka Akira]