From: "nobu (Nobuyoshi Nakada)" Date: 2022-09-18T16:01:59+00:00 Subject: [ruby-core:109950] [Ruby master Bug#19007] Unicode tables differences from Unicode.org 14.0 data and removed properties since 13.0 Issue #19007 has been updated by nobu (Nobuyoshi Nakada). https://github.com/nobu/ruby/tree/emoji ---------------------------------------- Bug #19007: Unicode tables differences from Unicode.org 14.0 data and removed properties since 13.0 https://bugs.ruby-lang.org/issues/19007#change-99198 * Author: nobu (Nobuyoshi Nakada) * Status: Open * Priority: Normal * Assignee: duerst (Martin Dürst) * Target version: 3.2 * ruby -v: 3.2.0 6898984f1cd * Backport: 2.7: DONTNEED, 3.0: DONTNEED, 3.1: DONTNEED ---------------------------------------- I found that the header in the Unicode Emoji 14.0 data files had changed slightly (and again at 15.0), but `enc/unicode/case-folding.rb` didn't follow it. After I fixed it and rebuilt the headers under `enc/unicode/14.0.0`, `name2ctype.h` had differences from the master, as below. The changes to `CR_Lower`, `CR_Cased` and `CR_Other_Lowercase` just seem to be omissions from the previous update, and are not a problem. But U+11720..U+11721 in `CR_Grapheme_Cluster_Break_SpacingMark` is absent from the original Unicode.org data. According to @naruse's investigation, it was removed in the commit [Update to Unicode 14.0.0], while U+11720 is still SpacingMark in the latest https://www.unicode.org/reports/tr29/. 
[Update to Unicode 14.0.0]: https://github.com/latex3/unicode-data/commit/5570040ac8a30e2c2ca4912d415ecaa0498fa23a#diff-1e957b94de10ea96d32a338c005b1f05788af458cf335fc92683bc297e53ed94L582 ```diff diff --git a/enc/unicode/14.0.0/name2ctype.h b/enc/unicode/14.0.0/name2ctype.h index 99a3eeca190..f49e5cd7273 100644 --- a/enc/unicode/14.0.0/name2ctype.h +++ b/enc/unicode/14.0.0/name2ctype.h @@ -1565,7 +1565,7 @@ static const OnigCodePoint CR_Graph[] = { /* 'Lower': [[:Lower:]] */ static const OnigCodePoint CR_Lower[] = { - 664, + 668, 0x0061, 0x007a, 0x00aa, 0x00aa, 0x00b5, 0x00b5, @@ -2196,6 +2196,10 @@ static const OnigCodePoint CR_Lower[] = { 0x105a3, 0x105b1, 0x105b3, 0x105b9, 0x105bb, 0x105bc, + 0x10780, 0x10780, + 0x10783, 0x10785, + 0x10787, 0x107b0, + 0x107b2, 0x107ba, 0x10cc0, 0x10cf2, 0x118c0, 0x118df, 0x16e60, 0x16e7f, @@ -12651,7 +12655,7 @@ static const OnigCodePoint CR_Math[] = { /* 'Cased': Derived Property */ static const OnigCodePoint CR_Cased[] = { - 151, + 155, 0x0041, 0x005a, 0x0061, 0x007a, 0x00aa, 0x00aa, @@ -12763,6 +12767,10 @@ static const OnigCodePoint CR_Cased[] = { 0x105a3, 0x105b1, 0x105b3, 0x105b9, 0x105bb, 0x105bc, + 0x10780, 0x10780, + 0x10783, 0x10785, + 0x10787, 0x107b0, + 0x107b2, 0x107ba, 0x10c80, 0x10cb2, 0x10cc0, 0x10cf2, 0x118a0, 0x118df, @@ -22615,7 +22623,7 @@ static const OnigCodePoint CR_Extender[] = { /* 'Other_Lowercase': Binary Property */ static const OnigCodePoint CR_Other_Lowercase[] = { - 20, + 24, 0x00aa, 0x00aa, 0x00ba, 0x00ba, 0x02b0, 0x02b8, @@ -22636,6 +22644,10 @@ static const OnigCodePoint CR_Other_Lowercase[] = { 0xa770, 0xa770, 0xa7f8, 0xa7f9, 0xab5c, 0xab5f, + 0x10780, 0x10780, + 0x10783, 0x10785, + 0x10787, 0x107b0, + 0x107b2, 0x107ba, }; /* CR_Other_Lowercase */ /* 'Other_Uppercase': Binary Property */ @@ -37049,7 +37061,7 @@ static const OnigCodePoint CR_Grapheme_Cluster_Break_Extend[] = { /* 'Grapheme_Cluster_Break_SpacingMark': Grapheme_Cluster_Break=SpacingMark */ static const OnigCodePoint 
CR_Grapheme_Cluster_Break_SpacingMark[] = { - 161, + 160, 0x0903, 0x0903, 0x093b, 0x093b, 0x093e, 0x0940, @@ -37183,7 +37195,6 @@ static const OnigCodePoint CR_Grapheme_Cluster_Break_SpacingMark[] = { 0x116ac, 0x116ac, 0x116ae, 0x116af, 0x116b6, 0x116b6, - 0x11720, 0x11721, 0x11726, 0x11726, 0x1182c, 0x1182e, 0x11838, 0x11838, ``` -- https://bugs.ruby-lang.org/ Unsubscribe: