From 99dc2a34ccda126161e1b70f3fe270716c771c95 Mon Sep 17 00:00:00 2001 From: tompng Date: Mon, 7 Jul 2025 20:44:50 +0900 Subject: [PATCH] Update grapheme cluster width calculation Width of NonspacingMark and EnclosingMark is 0 Width of char just after ZeroWidthJoiner is 0 Width of Hangul GraphemeClusterBreak=V,T are 0 because there should be preceding L or LV --- bin/generate_east_asian_width | 14 ++++++--- lib/reline/unicode.rb | 39 +++++++++++--------------- lib/reline/unicode/east_asian_width.rb | 15 +++++----- test/reline/test_unicode.rb | 29 +++++++++++++++++++ 4 files changed, 62 insertions(+), 35 deletions(-) diff --git a/bin/generate_east_asian_width b/bin/generate_east_asian_width index 1a7999f1f9..7d9018d096 100755 --- a/bin/generate_east_asian_width +++ b/bin/generate_east_asian_width @@ -5,8 +5,14 @@ if ARGV.empty? exit 1 end -def unicode_width(type, category) - return 0 if category == 'Mn' # Nonspacing Mark +def unicode_width(type, category, rest) + # Nonspacing Mark, Enclosing Mark + return 0 if category == 'Mn' || category == 'Me' + + # Grapheme_Cluster_Break=V, Grapheme_Cluster_Break=T. + # Width of L, LV, LVT are 2. Treat V and T as width=0 because there should be L or LV before V or T. 
+ return 0 if rest =~ /HANGUL JUNGSEONG|HANGUL JONGSEONG/ + case type when 'F', 'W' # Fullwidth, Wide 2 @@ -27,10 +33,10 @@ open(ARGV.first, 'rt') do |f| widths = [] f.each_line do |line| - next unless /^(?<first>\h+)(?:\.\.(?<last>\h+))?\s*;\s*(?<type>\w+)\s+# +(?<category>[^ ]+)/ =~ line + next unless /^(?<first>\h+)(?:\.\.(?<last>\h+))?\s*;\s*(?<type>\w+)\s+# +(?<category>[^ ]+)(?<rest>.*)/ =~ line range = first.to_i(16)..(last || first).to_i(16) - widths.fill(unicode_width(type, category), range) + widths.fill(unicode_width(type, category, rest), range) end # EscapedPairs diff --git a/lib/reline/unicode.rb b/lib/reline/unicode.rb index 26e7246a84..28d6b1b2b9 100644 --- a/lib/reline/unicode.rb +++ b/lib/reline/unicode.rb @@ -40,8 +40,6 @@ class Reline::Unicode CSI_REGEXP = /\e\[[\d;]*[ABCDEFGHJKSTfminsuhl]/ OSC_REGEXP = /\e\]\d+(?:;[^;\a\e]+)*(?:\a|\e\\)/ WIDTH_SCANNER = /\G(?:(#{NON_PRINTING_START})|(#{NON_PRINTING_END})|(#{CSI_REGEXP})|(#{OSC_REGEXP})|(\X))/o - HALFWIDTH_DAKUTEN = 0xFF9E - HALFWIDTH_HANDAKUTEN = 0xFF9F def self.escape_for_print(str) str.chars.map! { |gr| @@ -74,30 +72,32 @@ def self.safe_encode(str, encoding) require 'reline/unicode/east_asian_width' + def self.east_asian_width(ord) + chunk_index = EastAsianWidth::CHUNK_LAST.bsearch_index { |o| ord <= o } + size = EastAsianWidth::CHUNK_WIDTH[chunk_index] + size == -1 ? 
Reline.ambiguous_width : size + end + def self.get_mbchar_width(mbchar) ord = mbchar.ord if ord <= 0x1F # in EscapedPairs return 2 - elsif mbchar.length <= 1 && ord <= 0x7E # printable ASCII chars - # ~~~~~~~~~~~~~~~~~~ guard against the following grapheme combination character (e.g., dakuten/handakuten) + elsif mbchar.length == 1 && ord <= 0x7E # printable ASCII chars return 1 end utf8_mbchar = mbchar.encode(Encoding::UTF_8) - ord = utf8_mbchar.ord - - chunk_index = EastAsianWidth::CHUNK_LAST.bsearch_index { |o| ord <= o } - size = EastAsianWidth::CHUNK_WIDTH[chunk_index] - if size == -1 - Reline.ambiguous_width - elsif halfwidth_dakuten_or_handakuten_character?(utf8_mbchar[-1]) - if utf8_mbchar.length >= 2 # Whether this is a dakuten or handakuten combination character - utf8_mbchar.each_char.sum { |char| get_mbchar_width(char) } + zwj = false + utf8_mbchar.chars.sum do |c| + if zwj + zwj = false + 0 + elsif c.ord == 0x200D # Zero Width Joiner + zwj = true + 0 else - 1 + east_asian_width(c.ord) end - else - size end end @@ -418,11 +418,4 @@ def self.word_character?(s) def self.space_character?(s) s.match?(/\s/) if s end - - def self.halfwidth_dakuten_or_handakuten_character?(s) - return false if s.encoding != Encoding::UTF_8 || !s.valid_encoding? 
- - ord = s.ord - ord == HALFWIDTH_DAKUTEN || ord == HALFWIDTH_HANDAKUTEN - end end diff --git a/lib/reline/unicode/east_asian_width.rb b/lib/reline/unicode/east_asian_width.rb index 9c5e42e239..725d88bcea 100644 --- a/lib/reline/unicode/east_asian_width.rb +++ b/lib/reline/unicode/east_asian_width.rb @@ -129,7 +129,7 @@ class Reline::Unicode::EastAsianWidth [0x450, 1], [0x451, -1], [0x482, 1], - [0x487, 0], + [0x489, 0], [0x590, 1], [0x5bd, 0], [0x5be, 1], @@ -356,6 +356,7 @@ class Reline::Unicode::EastAsianWidth [0x109d, 0], [0x10ff, 1], [0x115f, 2], + [0x11ff, 0], [0x135c, 1], [0x135f, 0], [0x1711, 1], @@ -411,8 +412,6 @@ class Reline::Unicode::EastAsianWidth [0x1a7e, 1], [0x1a7f, 0], [0x1aaf, 1], - [0x1abd, 0], - [0x1abe, 1], [0x1ace, 0], [0x1aff, 1], [0x1b03, 0], @@ -491,10 +490,6 @@ class Reline::Unicode::EastAsianWidth [0x20ab, 1], [0x20ac, -1], [0x20cf, 1], - [0x20dc, 0], - [0x20e0, 1], - [0x20e1, 0], - [0x20e4, 1], [0x20f0, 0], [0x2102, 1], [0x2103, -1], @@ -767,7 +762,7 @@ class Reline::Unicode::EastAsianWidth [0xa48f, 1], [0xa4c6, 2], [0xa66e, 1], - [0xa66f, 0], + [0xa672, 0], [0xa673, 1], [0xa67d, 0], [0xa69d, 1], @@ -840,6 +835,10 @@ class Reline::Unicode::EastAsianWidth [0xabed, 0], [0xabff, 1], [0xd7a3, 2], + [0xd7af, 1], + [0xd7c6, 0], + [0xd7ca, 1], + [0xd7fb, 0], [0xdfff, 1], [0xf8ff, -1], [0xfaff, 2], diff --git a/test/reline/test_unicode.rb b/test/reline/test_unicode.rb index 9cfc53b57b..d86dc4d759 100644 --- a/test/reline/test_unicode.rb +++ b/test/reline/test_unicode.rb @@ -295,4 +295,33 @@ def test_halfwidth_dakuten_handakuten_combinations assert_equal 3, Reline::Unicode.get_mbchar_width("あ゙") assert_equal 3, Reline::Unicode.get_mbchar_width("紅゙") end + + def test_grapheme_cluster_width + # GB6, GB7, GB8: Hangul syllable + assert_equal 2, Reline::Unicode.get_mbchar_width('한'.unicode_normalize(:nfd)) + assert_equal 6, Reline::Unicode.get_mbchar_width('ᄀ' * 3) + + # GB9 + # Char + NonspacingMark + assert_equal 1, 
Reline::Unicode.get_mbchar_width('ç'.unicode_normalize(:nfd)) + assert_equal 2, Reline::Unicode.get_mbchar_width('ぱ'.unicode_normalize(:nfd)) + assert_equal 1, Reline::Unicode.get_mbchar_width("c\u{301}\u{327}") + # '1' + NonspacingMark + EnclosingMark + assert_equal 1, Reline::Unicode.get_mbchar_width('1️⃣') + # Char + SpacingMark + assert_equal 2, Reline::Unicode.get_mbchar_width('কা') + assert_equal 5, Reline::Unicode.get_mbchar_width('ガ゚゙゙') + # Emoji joined with ZeroWidthJoiner + assert_equal 2, Reline::Unicode.get_mbchar_width('👨‍👩‍👧') + assert_equal 7, Reline::Unicode.get_mbchar_width('👨‍👩‍👧゙゚゚゚゙') + + # GB9a: Char + GraphemeClusterBreak=SpacingMark + assert_equal 2, Reline::Unicode.get_mbchar_width('คำ') + + # GB9c: Consonant + Linker(NonspacingMark) + Consonant + assert_equal 2, Reline::Unicode.get_mbchar_width('क्त') + + # GB12, GB13: RegionalIndicator + assert_equal 2, Reline::Unicode.get_mbchar_width('🇯🇵') + end end