From 99dc2a34ccda126161e1b70f3fe270716c771c95 Mon Sep 17 00:00:00 2001
From: tompng <tomoyapenguin@gmail.com>
Date: Mon, 7 Jul 2025 20:44:50 +0900
Subject: [PATCH] Update grapheme cluster width calculation

The width of a NonspacingMark or EnclosingMark is 0.
The width of the character immediately after a ZeroWidthJoiner is 0.
The width of Hangul characters with GraphemeClusterBreak=V or T is 0, because they should be preceded by an L or LV.
---
 bin/generate_east_asian_width          | 14 ++++++---
 lib/reline/unicode.rb                  | 39 +++++++++++---------------
 lib/reline/unicode/east_asian_width.rb | 15 +++++-----
 test/reline/test_unicode.rb            | 29 +++++++++++++++++++
 4 files changed, 62 insertions(+), 35 deletions(-)
diff --git a/bin/generate_east_asian_width b/bin/generate_east_asian_width
index 1a7999f1f9..7d9018d096 100755
--- a/bin/generate_east_asian_width
+++ b/bin/generate_east_asian_width
@@ -5,8 +5,14 @@ if ARGV.empty?
   exit 1
 end
 
-def unicode_width(type, category)
-  return 0 if category == 'Mn' # Nonspacing Mark
+def unicode_width(type, category, rest)
+  # Nonspacing Mark, Enclosing Mark
+  return 0 if category == 'Mn' || category == 'Me'
+
+  # Grapheme_Cluster_Break=V, Grapheme_Cluster_Break=T.
+  # The widths of L, LV, and LVT are 2. Treat V and T as width 0 because an L or LV should precede a V or T.
+  return 0 if rest =~ /HANGUL JUNGSEONG|HANGUL JONGSEONG/
+
   case type
   when 'F', 'W' # Fullwidth, Wide
     2
@@ -27,10 +33,10 @@ open(ARGV.first, 'rt') do |f|
 
   widths = []
   f.each_line do |line|
-    next unless /^(?<first>\h+)(?:\.\.(?<last>\h+))?\s*;\s*(?<type>\w+)\s+# +(?<category>[^ ]+)/ =~ line
+    next unless /^(?<first>\h+)(?:\.\.(?<last>\h+))?\s*;\s*(?<type>\w+)\s+# +(?<category>[^ ]+)(?<rest>.*)/ =~ line
 
     range = first.to_i(16)..(last || first).to_i(16)
-    widths.fill(unicode_width(type, category), range)
+    widths.fill(unicode_width(type, category, rest), range)
   end
 
   # EscapedPairs
diff --git a/lib/reline/unicode.rb b/lib/reline/unicode.rb
index 26e7246a84..28d6b1b2b9 100644
--- a/lib/reline/unicode.rb
+++ b/lib/reline/unicode.rb
@@ -40,8 +40,6 @@ class Reline::Unicode
   CSI_REGEXP = /\e\[[\d;]*[ABCDEFGHJKSTfminsuhl]/
   OSC_REGEXP = /\e\]\d+(?:;[^;\a\e]+)*(?:\a|\e\\)/
   WIDTH_SCANNER = /\G(?:(#{NON_PRINTING_START})|(#{NON_PRINTING_END})|(#{CSI_REGEXP})|(#{OSC_REGEXP})|(\X))/o
-  HALFWIDTH_DAKUTEN = 0xFF9E
-  HALFWIDTH_HANDAKUTEN = 0xFF9F
 
   def self.escape_for_print(str)
     str.chars.map! { |gr|
@@ -74,30 +72,32 @@ def self.safe_encode(str, encoding)
 
   require 'reline/unicode/east_asian_width'
 
+  def self.east_asian_width(ord)
+    chunk_index = EastAsianWidth::CHUNK_LAST.bsearch_index { |o| ord <= o }
+    size = EastAsianWidth::CHUNK_WIDTH[chunk_index]
+    size == -1 ? Reline.ambiguous_width : size
+  end
+
   def self.get_mbchar_width(mbchar)
     ord = mbchar.ord
     if ord <= 0x1F # in EscapedPairs
       return 2
-    elsif mbchar.length <= 1 && ord <= 0x7E # printable ASCII chars
-      #   ~~~~~~~~~~~~~~~~~~ guard against the following grapheme combination character (e.g., dakuten/handakuten)
+    elsif mbchar.length == 1 && ord <= 0x7E # printable ASCII chars
       return 1
     end
 
     utf8_mbchar = mbchar.encode(Encoding::UTF_8)
-    ord = utf8_mbchar.ord
-
-    chunk_index = EastAsianWidth::CHUNK_LAST.bsearch_index { |o| ord <= o }
-    size = EastAsianWidth::CHUNK_WIDTH[chunk_index]
-    if size == -1
-      Reline.ambiguous_width
-    elsif halfwidth_dakuten_or_handakuten_character?(utf8_mbchar[-1])
-      if utf8_mbchar.length >= 2 # Whether this is a dakuten or handakuten combination character
-        utf8_mbchar.each_char.sum { |char| get_mbchar_width(char) }
+    zwj = false
+    utf8_mbchar.chars.sum do |c|
+      if zwj
+        zwj = false
+        0
+      elsif c.ord == 0x200D # Zero Width Joiner
+        zwj = true
+        0
       else
-        1
+        east_asian_width(c.ord)
       end
-    else
-      size
     end
   end
 
@@ -418,11 +418,4 @@ def self.word_character?(s)
   def self.space_character?(s)
     s.match?(/\s/) if s
   end
-
-  def self.halfwidth_dakuten_or_handakuten_character?(s)
-    return false if s.encoding != Encoding::UTF_8 || !s.valid_encoding?
-
-    ord = s.ord
-    ord == HALFWIDTH_DAKUTEN || ord == HALFWIDTH_HANDAKUTEN
-  end
 end
diff --git a/lib/reline/unicode/east_asian_width.rb b/lib/reline/unicode/east_asian_width.rb
index 9c5e42e239..725d88bcea 100644
--- a/lib/reline/unicode/east_asian_width.rb
+++ b/lib/reline/unicode/east_asian_width.rb
@@ -129,7 +129,7 @@ class Reline::Unicode::EastAsianWidth
     [0x450, 1],
     [0x451, -1],
     [0x482, 1],
-    [0x487, 0],
+    [0x489, 0],
     [0x590, 1],
     [0x5bd, 0],
     [0x5be, 1],
@@ -356,6 +356,7 @@ class Reline::Unicode::EastAsianWidth
     [0x109d, 0],
     [0x10ff, 1],
     [0x115f, 2],
+    [0x11ff, 0],
     [0x135c, 1],
     [0x135f, 0],
     [0x1711, 1],
@@ -411,8 +412,6 @@ class Reline::Unicode::EastAsianWidth
     [0x1a7e, 1],
     [0x1a7f, 0],
     [0x1aaf, 1],
-    [0x1abd, 0],
-    [0x1abe, 1],
     [0x1ace, 0],
     [0x1aff, 1],
     [0x1b03, 0],
@@ -491,10 +490,6 @@ class Reline::Unicode::EastAsianWidth
     [0x20ab, 1],
     [0x20ac, -1],
     [0x20cf, 1],
-    [0x20dc, 0],
-    [0x20e0, 1],
-    [0x20e1, 0],
-    [0x20e4, 1],
     [0x20f0, 0],
     [0x2102, 1],
     [0x2103, -1],
@@ -767,7 +762,7 @@ class Reline::Unicode::EastAsianWidth
     [0xa48f, 1],
     [0xa4c6, 2],
     [0xa66e, 1],
-    [0xa66f, 0],
+    [0xa672, 0],
     [0xa673, 1],
     [0xa67d, 0],
     [0xa69d, 1],
@@ -840,6 +835,10 @@ class Reline::Unicode::EastAsianWidth
     [0xabed, 0],
     [0xabff, 1],
     [0xd7a3, 2],
+    [0xd7af, 1],
+    [0xd7c6, 0],
+    [0xd7ca, 1],
+    [0xd7fb, 0],
     [0xdfff, 1],
     [0xf8ff, -1],
     [0xfaff, 2],
diff --git a/test/reline/test_unicode.rb b/test/reline/test_unicode.rb
index 9cfc53b57b..d86dc4d759 100644
--- a/test/reline/test_unicode.rb
+++ b/test/reline/test_unicode.rb
@@ -295,4 +295,33 @@ def test_halfwidth_dakuten_handakuten_combinations
     assert_equal 3, Reline::Unicode.get_mbchar_width("あﾞ")
     assert_equal 3, Reline::Unicode.get_mbchar_width("紅ﾞ")
   end
+
+  def test_grapheme_cluster_width
+    # GB6, GB7, GB8: Hangul syllable
+    assert_equal 2, Reline::Unicode.get_mbchar_width('한'.unicode_normalize(:nfd))
+    assert_equal 6, Reline::Unicode.get_mbchar_width('ᄀ' * 3)
+
+    # GB9
+    # Char + NonspacingMark
+    assert_equal 1, Reline::Unicode.get_mbchar_width('ç'.unicode_normalize(:nfd))
+    assert_equal 2, Reline::Unicode.get_mbchar_width('ぱ'.unicode_normalize(:nfd))
+    assert_equal 1, Reline::Unicode.get_mbchar_width("c\u{301}\u{327}")
+    # '1' + NonspacingMark + EnclosingMark
+    assert_equal 1, Reline::Unicode.get_mbchar_width('1️⃣')
+    # Char + SpacingMark
+    assert_equal 2, Reline::Unicode.get_mbchar_width('কা')
+    assert_equal 5, Reline::Unicode.get_mbchar_width('ｶﾞﾟﾞﾞ')
+    # Emoji joined with ZeroWidthJoiner
+    assert_equal 2, Reline::Unicode.get_mbchar_width('👨‍👩‍👧')
+    assert_equal 7, Reline::Unicode.get_mbchar_width('👨‍👩‍👧ﾞﾟﾟﾟﾞ')
+
+    # GB9a: Char + GraphemeClusterBreak=SpacingMark
+    assert_equal 2, Reline::Unicode.get_mbchar_width('คำ')
+
+    # GB9c: Consonant + Linker(NonspacingMark) + Consonant
+    assert_equal 2, Reline::Unicode.get_mbchar_width('क्त')
+
+    # GB12, GB13: RegionalIndicator
+    assert_equal 2, Reline::Unicode.get_mbchar_width('🇯🇵')
+  end
 end