Re: about REXML::Encoding
From:
nobu.nokada@...
Date:
2005-04-11 13:27:50 UTC
List:
ruby-core #4697
Hi,
At Mon, 11 Apr 2005 22:00:37 +0900,
Sean E. Russell wrote in [ruby-core:04695]:
> Carp. I also didn't notice that you didn't patch some of the encodings
> (CP-1252, ISO-8859-15) so I need to fix those, too. I don't have time this
> morning, but I'll do it this evening (UTC-5).
They were not there when I posted the patch.
Index: lib/rexml/encodings/CP-1252.rb
===================================================================
RCS file: /cvs/ruby/src/ruby/lib/rexml/encodings/CP-1252.rb,v
retrieving revision 1.1
diff -U2 -p -r1.1 CP-1252.rb
--- lib/rexml/encodings/CP-1252.rb 16 May 2004 15:19:22 -0000 1.1
+++ lib/rexml/encodings/CP-1252.rb 17 May 2004 02:55:48 -0000
@@ -4,95 +4,67 @@
module REXML
module Encoding
- @@__REXML_encoding_methods = %q~
+ UTF8_CP1252_TABLE = {
+ 0x20AC => 0x80, # 0xe2 0x82 0xac
+ 0x201A => 0x82, # 0xe2 0x80 0x9a
+ 0x0192 => 0x83, # 0xc6 0x92
+ 0x201E => 0x84, # 0xe2 0x80 0x9e
+ 0x2026 => 0x85, # 0xe2 0x80 0xa6
+ 0x2020 => 0x86, # 0xe2 0x80 0xa0
+ 0x2021 => 0x87, # 0xe2 0x80 0xa1
+ 0x02C6 => 0x88, # 0xcb 0x86
+ 0x2030 => 0x89, # 0xe2 0x80 0xb0
+ 0x0160 => 0x8A, # 0xc5 0xa0
+ 0x2039 => 0x8B, # 0xe2 0x80 0xb9
+ 0x0152 => 0x8C, # 0xc5 0x92
+ 0x017D => 0x8E, # 0xc5 0xbd
+ 0x2018 => 0x91, # 0xe2 0x80 0x98
+ 0x2019 => 0x92, # 0xe2 0x80 0x99
+ 0x201C => 0x93, # 0xe2 0x80 0x9c
+ 0x201D => 0x94, # 0xe2 0x80 0x9d
+ 0x2022 => 0x95, # 0xe2 0x80 0xa2
+ 0x2013 => 0x96, # 0xe2 0x80 0x93
+ 0x2014 => 0x97, # 0xe2 0x80 0x94
+ 0x02DC => 0x98, # 0xcb 0x9c
+ 0x2122 => 0x99, # 0xe2 0x84 0xa2
+ 0x0161 => 0x9A, # 0xc5 0xa1
+ 0x203A => 0x9B, # 0xe2 0x80 0xba
+ 0x0153 => 0x9C, # 0xc5 0x93
+ 0x017E => 0x9E, # 0xc5 0xbe
+ 0x0178 => 0x9F, # 0xc5 0xb8
+ }
+ CP1252_UTF8_TABLE = UTF8_CP1252_TABLE.invert
+
# Convert from UTF-8
- def encode content
- array_utf8 = content.unpack('U*')
- array_enc = []
- array_utf8.each do |num|
- case num
- # shortcut first bunch basic characters
- when 0..0xFF: array_enc << num
- # characters added compared to iso-8859-1
- when 0x20AC: array_enc << 0x80 # 0xe2 0x82 0xac
- when 0x201A: array_enc << 0x82 # 0xe2 0x82 0x9a
- when 0x0192: array_enc << 0x83 # 0xc6 0x92
- when 0x201E: array_enc << 0x84 # 0xe2 0x82 0x9e
- when 0x2026: array_enc << 0x85 # 0xe2 0x80 0xa6
- when 0x2020: array_enc << 0x86 # 0xe2 0x80 0xa0
- when 0x2021: array_enc << 0x87 # 0xe2 0x80 0xa1
- when 0x02C6: array_enc << 0x88 # 0xcb 0x86
- when 0x2030: array_enc << 0x89 # 0xe2 0x80 0xb0
- when 0x0160: array_enc << 0x8A # 0xc5 0xa0
- when 0x2039: array_enc << 0x8B # 0xe2 0x80 0xb9
- when 0x0152: array_enc << 0x8C # 0xc5 0x92
- when 0x017D: array_enc << 0x8E # 0xc5 0xbd
- when 0x2018: array_enc << 0x91 # 0xe2 0x80 0x98
- when 0x2019: array_enc << 0x92 # 0xe2 0x80 0x99
- when 0x201C: array_enc << 0x93 # 0xe2 0x80 0x9c
- when 0x201D: array_enc << 0x94 # 0xe2 0x80 0x9d
- when 0x2022: array_enc << 0x95 # 0xe2 0x80 0xa2
- when 0x2013: array_enc << 0x96 # 0xe2 0x80 0x93
- when 0x2014: array_enc << 0x97 # 0xe2 0x80 0x94
- when 0x02DC: array_enc << 0x98 # 0xcb 0x9c
- when 0x2122: array_enc << 0x99 # 0xe2 0x84 0xa2
- when 0x0161: array_enc << 0x9A # 0xc5 0xa1
- when 0x203A: array_enc << 0x9B # 0xe2 0x80 0xba
- when 0x0152: array_enc << 0x9C # 0xc5 0x93
- when 0x017E: array_enc << 0x9E # 0xc5 0xbe
- when 0x0178: array_enc << 0x9F # 0xc5 0xb8
- else
+ def encode_cp_1252 content
+ enc = ""
+ content.unpack('U*').each do |num|
+ enc << UTF8_CP1252_TABLE.fetch(num) {
# all remaining basic characters can be used directly
if num <= 0xFF
- array_enc << num
+ num
else
# Numeric entity (&#nnnn;); shard by Stefan Scholl
- array_enc.concat "&\##{num};".unpack('C*')
+ "&\##{num};"
end
- end
+ }
end
- array_enc.pack('C*')
+ enc
end
-
+
# Convert to UTF-8
- def decode(str)
- array_latin9 = str.unpack('C*')
- array_enc = []
- array_latin9.each do |num|
- case num
- # characters that added compared to iso-8859-1
- when 0x80: array_enc << 0x20AC # 0xe2 0x82 0xac
- when 0x82: array_enc << 0x201A # 0xe2 0x82 0x9a
- when 0x83: array_enc << 0x0192 # 0xc6 0x92
- when 0x84: array_enc << 0x201E # 0xe2 0x82 0x9e
- when 0x85: array_enc << 0x2026 # 0xe2 0x80 0xa6
- when 0x86: array_enc << 0x2020 # 0xe2 0x80 0xa0
- when 0x87: array_enc << 0x2021 # 0xe2 0x80 0xa1
- when 0x88: array_enc << 0x02C6 # 0xcb 0x86
- when 0x89: array_enc << 0x2030 # 0xe2 0x80 0xb0
- when 0x8A: array_enc << 0x0160 # 0xc5 0xa0
- when 0x8B: array_enc << 0x2039 # 0xe2 0x80 0xb9
- when 0x8C: array_enc << 0x0152 # 0xc5 0x92
- when 0x8E: array_enc << 0x017D # 0xc5 0xbd
- when 0x91: array_enc << 0x2018 # 0xe2 0x80 0x98
- when 0x92: array_enc << 0x2019 # 0xe2 0x80 0x99
- when 0x93: array_enc << 0x201C # 0xe2 0x80 0x9c
- when 0x94: array_enc << 0x201D # 0xe2 0x80 0x9d
- when 0x95: array_enc << 0x2022 # 0xe2 0x80 0xa2
- when 0x96: array_enc << 0x2013 # 0xe2 0x80 0x93
- when 0x97: array_enc << 0x2014 # 0xe2 0x80 0x94
- when 0x98: array_enc << 0x02DC # 0xcb 0x9c
- when 0x99: array_enc << 0x2122 # 0xe2 0x84 0xa2
- when 0x9A: array_enc << 0x0161 # 0xc5 0xa1
- when 0x9B: array_enc << 0x203A # 0xe2 0x80 0xba
- when 0x9C: array_enc << 0x0152 # 0xc5 0x93
- when 0x9E: array_enc << 0x017E # 0xc5 0xbe
- when 0x9F: array_enc << 0x0178 # 0xc5 0xb8
- else
- array_enc << num
- end
+ def decode_cp_1252(str)
+ enc = ""
+ str.gsub(/&\#(\d+);/){$1.to_i.chr}.each_byte do |num|
+ enc << CP1252_UTF8_TABLE.fetch(num, num)
+ end
+ enc
+ end
+
+ register("CP-1252") do |obj|
+ class << obj
+ alias decode decode_cp_1252
+ alias encode encode_cp_1252
end
- array_enc.pack('U*')
end
- ~
end
end
Index: lib/rexml/encodings/ISO-8859-15.rb
===================================================================
RCS file: /cvs/ruby/src/ruby/lib/rexml/encodings/ISO-8859-15.rb,v
retrieving revision 1.1
diff -U2 -p -r1.1 ISO-8859-15.rb
--- lib/rexml/encodings/ISO-8859-15.rb 16 May 2004 15:19:22 -0000 1.1
+++ lib/rexml/encodings/ISO-8859-15.rb 17 May 2004 02:55:19 -0000
@@ -4,66 +4,58 @@
module REXML
module Encoding
- @@__REXML_encoding_methods = %q~
+ ISO885915_UTF8_TABLE = {
+ 0xA4 => 0x20AC, # 0xe2 0x82 0xac
+ 0xA6 => 0x0160, # 0xc5 0xa0
+ 0xA8 => 0x0161, # 0xc5 0xa1
+ 0xB4 => 0x017D, # 0xc5 0xbd
+ 0xB8 => 0x017E, # 0xc5 0xbe
+ 0xBC => 0x0152, # 0xc5 0x92
+ 0xBD => 0x0153, # 0xc5 0x93
+ 0xBE => 0x0178, # 0xc5 0xb8
+ }
+ UTF8_ISO885915_TABLE = {
+ # characters removed compared to iso-8859-1
+ 0xA4 => '&#164;',
+ 0xA6 => '&#166;',
+ 0xA8 => '&#168;',
+ 0xB4 => '&#180;',
+ 0xB8 => '&#184;',
+ 0xBC => '&#188;',
+ 0xBD => '&#189;',
+ 0xBE => '&#190;',
+ }.update(ISO885915_UTF8_TABLE.invert)
+
# Convert from UTF-8
- def to_iso_8859_15 content
- array_utf8 = content.unpack('U*')
- array_enc = []
- array_utf8.each do |num|
- case num
- # shortcut first bunch basic characters
- when 0..0xA3: array_enc << num
- # characters removed compared to iso-8859-1
- when 0xA4: array_enc << '&#164;'
- when 0xA6: array_enc << '&#166;'
- when 0xA8: array_enc << '&#168;'
- when 0xB4: array_enc << '&#180;'
- when 0xB8: array_enc << '&#184;'
- when 0xBC: array_enc << '&#188;'
- when 0xBD: array_enc << '&#189;'
- when 0xBE: array_enc << '&#190;'
- # characters added compared to iso-8859-1
- when 0x20AC: array_enc << 0xA4 # 0xe2 0x82 0xac
- when 0x0160: array_enc << 0xA6 # 0xc5 0xa0
- when 0x0161: array_enc << 0xA8 # 0xc5 0xa1
- when 0x017D: array_enc << 0xB4 # 0xc5 0xbd
- when 0x017E: array_enc << 0xB8 # 0xc5 0xbe
- when 0x0152: array_enc << 0xBC # 0xc5 0x92
- when 0x0153: array_enc << 0xBD # 0xc5 0x93
- when 0x0178: array_enc << 0xBE # 0xc5 0xb8
- else
+ def encode_iso_8859_15 content
+ enc = ""
+ content.unpack('U*').each do |num|
+ enc << UTF8_ISO885915_TABLE.fetch(num) {
# all remaining basic characters can be used directly
if num <= 0xFF
- array_enc << num
+ num
else
# Numeric entity (&#nnnn;); shard by Stefan Scholl
- array_enc.concat "&\##{num};".unpack('C*')
+ "&\##{num};"
end
- end
+ }
end
- array_enc.pack('C*')
+ enc
end
-
+
# Convert to UTF-8
- def from_iso_8859_15(str)
- array_latin9 = str.unpack('C*')
- array_enc = []
- array_latin9.each do |num|
- case num
- # characters that differ compared to iso-8859-1
- when 0xA4: array_enc << 0x20AC
- when 0xA6: array_enc << 0x0160
- when 0xA8: array_enc << 0x0161
- when 0xB4: array_enc << 0x017D
- when 0xB8: array_enc << 0x017E
- when 0xBC: array_enc << 0x0152
- when 0xBD: array_enc << 0x0153
- when 0xBE: array_enc << 0x0178
- else
- array_enc << num
- end
+ def decode_iso_8859_15(str)
+ enc = ""
+ str.gsub(/&\#(\d+);/){$1.to_i.chr}.each_byte do |num|
+ enc << ISO885915_UTF8_TABLE.fetch(num, num)
+ end
+ enc
+ end
+
+ register("ISO-8859-15") do |obj|
+ class << obj
+ alias decode decode_iso_8859_15
+ alias encode encode_iso_8859_15
end
- array_enc.pack('U*')
end
- ~
end
end
Index: lib/rexml/encodings/SHIFT_JIS.rb
===================================================================
RCS file: /cvs/ruby/src/ruby/lib/rexml/encodings/SHIFT_JIS.rb,v
retrieving revision 1.7
diff -U2 -p -r1.7 SHIFT_JIS.rb
--- lib/rexml/encodings/SHIFT_JIS.rb 19 Dec 2004 15:19:43 -0000 1.7
+++ lib/rexml/encodings/SHIFT_JIS.rb 11 Apr 2005 13:26:24 -0000
@@ -1 +1 @@
-load 'rexml/encodings/SHIFT-JIS.rb'
+require 'rexml/encodings/SHIFT-JIS'
--
Nobu Nakada