From c3791b01418c7e711a6eb4cf64605dd326dd84aa Mon Sep 17 00:00:00 2001
From: moznion <moznion@mail.moznion.net>
Date: Wed, 2 Jul 2025 20:01:46 +0900
Subject: [PATCH 1/5] Fix cursor positioning for invalid halfwidth
 dakuten/handakuten combinations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When halfwidth dakuten (ﾞ) or handakuten (ﾟ) marks are combined with
non-katakana characters (e.g., 'aﾞ', '1ﾟ'), the cursor movement was
treating them as a single grapheme cluster. This caused incorrect cursor
positioning.

This change:
- Adds detection for invalid combining mark clusters
- Treats invalid combinations as separate characters for width calculation
- Ensures cursor moves character-by-character for invalid combinations
- Maintains proper grapheme cluster handling for valid katakana combinations

The fix applies to:
- get_mbchar_width: calculates width separately for invalid combinations
- get_next_mbchar_size: moves by single character for invalid clusters
- get_prev_mbchar_size: moves by single character for invalid clusters

Signed-off-by: moznion <moznion@mail.moznion.net>
---
 lib/reline/unicode.rb       | 64 ++++++++++++++++++++++++++++++++-----
 test/reline/test_unicode.rb | 44 +++++++++++++++++++++++++
 2 files changed, 100 insertions(+), 8 deletions(-)

diff --git a/lib/reline/unicode.rb b/lib/reline/unicode.rb
index 21e4ea240e..d6c85cb1b5 100644
--- a/lib/reline/unicode.rb
+++ b/lib/reline/unicode.rb
@@ -40,6 +40,8 @@ class Reline::Unicode
   CSI_REGEXP = /\e\[[\d;]*[ABCDEFGHJKSTfminsuhl]/
   OSC_REGEXP = /\e\]\d+(?:;[^;\a\e]+)*(?:\a|\e\\)/
   WIDTH_SCANNER = /\G(?:(#{NON_PRINTING_START})|(#{NON_PRINTING_END})|(#{CSI_REGEXP})|(#{OSC_REGEXP})|(\X))/o
+  HALFWIDTH_DAKUTEN = 0xFF9E
+  HALFWIDTH_HANDAKUTEN = 0xFF9F
 
   def self.escape_for_print(str)
     str.chars.map! { |gr|
@@ -73,6 +75,32 @@ def self.safe_encode(str, encoding)
   require 'reline/unicode/east_asian_width'
 
   def self.get_mbchar_width(mbchar)
+    # If mbchar contains multiple characters, check if it's a valid combination
+    if mbchar.length >= 2 && invalid_combining_mark_cluster?(mbchar)
+      # Calculate width for each character separately for invalid combinations
+      mbchar.each_char.sum { |char| get_single_char_width(char) }
+    else
+      get_single_char_width(mbchar)
+    end
+  end
+
+  def self.invalid_combining_mark_cluster?(mbchar)
+    begin
+      chars = mbchar.encode(Encoding::UTF_8).chars
+      return false if chars.length < 2
+
+      # Check if the last character is a combining mark (Halfwidth Dakuten/Handakuten)
+      return false unless halfwidth_dakuten_or_handakuten_character?(chars.last)
+
+      # Check if the base character can validly combine with Halfwidth dakuten/handakuten
+      ord = chars[-2].ord
+      ord < 0xFF66 || ord > 0xFF9D # out-of-range of Halfwidth Katakana
+    rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError, ArgumentError
+      false
+    end
+  end
+
+  def self.get_single_char_width(mbchar)
     ord = mbchar.ord
     if ord <= 0x1F # in EscapedPairs
       return 2
@@ -86,10 +114,9 @@ def self.get_mbchar_width(mbchar)
     if size == -1
       Reline.ambiguous_width
     elsif size == 1 && utf8_mbchar.size >= 2
-      second_char_ord = utf8_mbchar[1].ord
       # Halfwidth Dakuten Handakuten
       # Only these two character has Letter Modifier category and can be combined in a single grapheme cluster
-      (second_char_ord == 0xFF9E || second_char_ord == 0xFF9F) ? 2 : 1
+      halfwidth_dakuten_or_handakuten_character?(utf8_mbchar[1]) ? 2 : 1
     else
       size
     end
@@ -248,16 +275,30 @@ def self.take_mbchar_range(str, start_col, width, cover_begin: false, cover_end:
   end
 
   def self.get_next_mbchar_size(line, byte_pointer)
-    grapheme = line.byteslice(byte_pointer..-1).grapheme_clusters.first
-    grapheme ? grapheme.bytesize : 0
+    byteslice = line.byteslice(byte_pointer..-1)
+    grapheme = byteslice.grapheme_clusters.first
+    return 0 unless grapheme
+
+    # If it's an invalid combining mark cluster, move one character at a time
+    if invalid_combining_mark_cluster?(grapheme)
+      byteslice.chars.first.bytesize
+    else
+      grapheme.bytesize
+    end
   end
 
   def self.get_prev_mbchar_size(line, byte_pointer)
-    if byte_pointer.zero?
-      0
+    return 0 if byte_pointer <= 0
+
+    byteslice = line.byteslice(0..(byte_pointer - 1))
+    grapheme = byteslice.grapheme_clusters.last
+    return 0 unless grapheme
+
+    # If it's an invalid combining mark cluster, move one character at a time
+    if invalid_combining_mark_cluster?(grapheme)
+      byteslice.chars.last.bytesize
     else
-      grapheme = line.byteslice(0..(byte_pointer - 1)).grapheme_clusters.last
-      grapheme ? grapheme.bytesize : 0
+      grapheme.bytesize
     end
   end
 
@@ -412,4 +453,11 @@ def self.word_character?(s)
   def self.space_character?(s)
     s.match?(/\s/) if s
   end
+
+  def self.halfwidth_dakuten_or_handakuten_character?(s)
+    return false if s.encoding != Encoding::UTF_8 || !s.valid_encoding?
+
+    ord = s.ord
+    ord == HALFWIDTH_DAKUTEN || ord == HALFWIDTH_HANDAKUTEN
+  end
 end
diff --git a/test/reline/test_unicode.rb b/test/reline/test_unicode.rb
index 0778306c32..7615984d98 100644
--- a/test/reline/test_unicode.rb
+++ b/test/reline/test_unicode.rb
@@ -283,4 +283,48 @@ def test_character_type
     refute(Reline::Unicode.space_character?('-'))
     refute(Reline::Unicode.space_character?(nil))
   end
+
+  def test_halfwidth_dakuten_handakuten_combinations
+    assert_equal 2, Reline::Unicode.get_mbchar_width("ｶﾞ")
+    assert_equal 2, Reline::Unicode.get_mbchar_width("ﾊﾟ")
+    assert_equal 2, Reline::Unicode.get_mbchar_width("ｻﾞ")
+    assert_equal 2, Reline::Unicode.get_mbchar_width("aﾞ")
+    assert_equal 2, Reline::Unicode.get_mbchar_width("1ﾟ")
+    assert_equal 3, Reline::Unicode.get_mbchar_width("あﾞ")
+    assert_equal 3, Reline::Unicode.get_mbchar_width("紅ﾞ")
+  end
+
+  def test_get_next_mbchar_size_with_dakuten
+    line = "ｶﾞtest" # valid combination
+    assert_equal 6, Reline::Unicode.get_next_mbchar_size(line, 0)  # 'ｶ' (3 bytes) + 'ﾞ' (3 bytes)
+    assert_equal 1, Reline::Unicode.get_next_mbchar_size(line, 6)  # 't' only (1 byte)
+
+    line = "aﾞtest" # invalid combination
+    assert_equal 1, Reline::Unicode.get_next_mbchar_size(line, 0)  # 'a' only (1 byte)
+    assert_equal 3, Reline::Unicode.get_next_mbchar_size(line, 1)  # 'ﾞ' only (3 bytes)
+    assert_equal 1, Reline::Unicode.get_next_mbchar_size(line, 4)  # 't' only (1 bytes)
+  end
+
+  def test_get_prev_mbchar_size_with_dakuten
+    line = "testｶﾞ" # valid combination
+    assert_equal 0, Reline::Unicode.get_prev_mbchar_size(line, 0)
+    assert_equal 1, Reline::Unicode.get_prev_mbchar_size(line, 4) # 't' (1 byte)
+    assert_equal 3, Reline::Unicode.get_prev_mbchar_size(line, 7) # 'ｶ' (3 bytes)
+    assert_equal 6, Reline::Unicode.get_prev_mbchar_size(line, 10) # ｶﾞ (6 bytes)
+
+    line = "testaﾞ" # invalid combination
+    assert_equal 1, Reline::Unicode.get_prev_mbchar_size(line, 4)  # 't' (1 byte)
+    assert_equal 3, Reline::Unicode.get_prev_mbchar_size(line, 8)  # 'ﾞ' (3 byte)
+  end
+
+  def test_invalid_combining_mark_cluster
+    assert_equal false, Reline::Unicode.invalid_combining_mark_cluster?("ｶﾞ")
+    assert_equal false, Reline::Unicode.invalid_combining_mark_cluster?("ﾊﾟ")
+
+    assert_equal true, Reline::Unicode.invalid_combining_mark_cluster?("aﾞ")
+    assert_equal true, Reline::Unicode.invalid_combining_mark_cluster?("あﾟ")
+
+    assert_equal false, Reline::Unicode.invalid_combining_mark_cluster?("abc")
+    assert_equal false, Reline::Unicode.invalid_combining_mark_cluster?("ｶ")
+  end
 end

From c92a9ea75344606fed91b2ddb067f304fde6eaaf Mon Sep 17 00:00:00 2001
From: moznion <moznion@mail.moznion.net>
Date: Thu, 3 Jul 2025 11:16:08 +0900
Subject: [PATCH 2/5] Insert the fast path for performance

Signed-off-by: moznion <moznion@mail.moznion.net>
---
 lib/reline/unicode.rb | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/lib/reline/unicode.rb b/lib/reline/unicode.rb
index d6c85cb1b5..ccad032c14 100644
--- a/lib/reline/unicode.rb
+++ b/lib/reline/unicode.rb
@@ -75,6 +75,14 @@ def self.safe_encode(str, encoding)
   require 'reline/unicode/east_asian_width'
 
   def self.get_mbchar_width(mbchar)
+    ord = mbchar.ord
+    if ord <= 0x1F # in EscapedPairs
+      return 2
+    elsif mbchar.length <= 1 && ord <= 0x7E # printable ASCII chars
+      return 1
+    end
+    # ^ fast path to quickly calculate the width.
+
     # If mbchar contains multiple characters, check if it's a valid combination
     if mbchar.length >= 2 && invalid_combining_mark_cluster?(mbchar)
       # Calculate width for each character separately for invalid combinations
@@ -107,8 +115,10 @@ def self.get_single_char_width(mbchar)
     elsif ord <= 0x7E # printable ASCII chars
       return 1
     end
+
     utf8_mbchar = mbchar.encode(Encoding::UTF_8)
     ord = utf8_mbchar.ord
+
     chunk_index = EastAsianWidth::CHUNK_LAST.bsearch_index { |o| ord <= o }
     size = EastAsianWidth::CHUNK_WIDTH[chunk_index]
     if size == -1

From 911e7714871482311cada4ffb26afe573739dcd2 Mon Sep 17 00:00:00 2001
From: moznion <moznion@mail.moznion.net>
Date: Fri, 4 Jul 2025 00:41:32 +0900
Subject: [PATCH 3/5] Optimize by suppressing unnecessary combination check

Signed-off-by: moznion <moznion@mail.moznion.net>
---
 lib/reline/unicode.rb | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/reline/unicode.rb b/lib/reline/unicode.rb
index ccad032c14..1a92a3ea01 100644
--- a/lib/reline/unicode.rb
+++ b/lib/reline/unicode.rb
@@ -84,7 +84,7 @@ def self.get_mbchar_width(mbchar)
     # ^ fast path to quickly calculate the width.
 
     # If mbchar contains multiple characters, check if it's a valid combination
-    if mbchar.length >= 2 && invalid_combining_mark_cluster?(mbchar)
+    if invalid_combining_mark_cluster?(mbchar)
       # Calculate width for each character separately for invalid combinations
       mbchar.each_char.sum { |char| get_single_char_width(char) }
     else
@@ -93,6 +93,8 @@ def self.get_mbchar_width(mbchar)
   end
 
   def self.invalid_combining_mark_cluster?(mbchar)
+    return false if mbchar.length < 2
+
     begin
       chars = mbchar.encode(Encoding::UTF_8).chars
       return false if chars.length < 2

From 9282ae7bcae54826e17424c03f6e461f800f0bfb Mon Sep 17 00:00:00 2001
From: moznion <moznion@mail.moznion.net>
Date: Sat, 5 Jul 2025 20:26:54 +0900
Subject: [PATCH 4/5] Rollback `get_prev_mbchar_size` and
 `get_next_mbchar_size` to the original

Signed-off-by: moznion <moznion@mail.moznion.net>
---
 lib/reline/unicode.rb       | 26 ++++++--------------------
 test/reline/test_unicode.rb | 23 -----------------------
 2 files changed, 6 insertions(+), 43 deletions(-)

diff --git a/lib/reline/unicode.rb b/lib/reline/unicode.rb
index 1a92a3ea01..3f842e7180 100644
--- a/lib/reline/unicode.rb
+++ b/lib/reline/unicode.rb
@@ -287,30 +287,16 @@ def self.take_mbchar_range(str, start_col, width, cover_begin: false, cover_end:
   end
 
   def self.get_next_mbchar_size(line, byte_pointer)
-    byteslice = line.byteslice(byte_pointer..-1)
-    grapheme = byteslice.grapheme_clusters.first
-    return 0 unless grapheme
-
-    # If it's an invalid combining mark cluster, move one character at a time
-    if invalid_combining_mark_cluster?(grapheme)
-      byteslice.chars.first.bytesize
-    else
-      grapheme.bytesize
-    end
+    grapheme = line.byteslice(byte_pointer..-1).grapheme_clusters.first
+    grapheme ? grapheme.bytesize : 0
   end
 
   def self.get_prev_mbchar_size(line, byte_pointer)
-    return 0 if byte_pointer <= 0
-
-    byteslice = line.byteslice(0..(byte_pointer - 1))
-    grapheme = byteslice.grapheme_clusters.last
-    return 0 unless grapheme
-
-    # If it's an invalid combining mark cluster, move one character at a time
-    if invalid_combining_mark_cluster?(grapheme)
-      byteslice.chars.last.bytesize
+    if byte_pointer.zero?
+      0
     else
-      grapheme.bytesize
+      grapheme = line.byteslice(0..(byte_pointer - 1)).grapheme_clusters.last
+      grapheme ? grapheme.bytesize : 0
     end
   end
 
diff --git a/test/reline/test_unicode.rb b/test/reline/test_unicode.rb
index 7615984d98..743e35869f 100644
--- a/test/reline/test_unicode.rb
+++ b/test/reline/test_unicode.rb
@@ -294,29 +294,6 @@ def test_halfwidth_dakuten_handakuten_combinations
     assert_equal 3, Reline::Unicode.get_mbchar_width("紅ﾞ")
   end
 
-  def test_get_next_mbchar_size_with_dakuten
-    line = "ｶﾞtest" # valid combination
-    assert_equal 6, Reline::Unicode.get_next_mbchar_size(line, 0)  # 'ｶ' (3 bytes) + 'ﾞ' (3 bytes)
-    assert_equal 1, Reline::Unicode.get_next_mbchar_size(line, 6)  # 't' only (1 byte)
-
-    line = "aﾞtest" # invalid combination
-    assert_equal 1, Reline::Unicode.get_next_mbchar_size(line, 0)  # 'a' only (1 byte)
-    assert_equal 3, Reline::Unicode.get_next_mbchar_size(line, 1)  # 'ﾞ' only (3 bytes)
-    assert_equal 1, Reline::Unicode.get_next_mbchar_size(line, 4)  # 't' only (1 bytes)
-  end
-
-  def test_get_prev_mbchar_size_with_dakuten
-    line = "testｶﾞ" # valid combination
-    assert_equal 0, Reline::Unicode.get_prev_mbchar_size(line, 0)
-    assert_equal 1, Reline::Unicode.get_prev_mbchar_size(line, 4) # 't' (1 byte)
-    assert_equal 3, Reline::Unicode.get_prev_mbchar_size(line, 7) # 'ｶ' (3 bytes)
-    assert_equal 6, Reline::Unicode.get_prev_mbchar_size(line, 10) # ｶﾞ (6 bytes)
-
-    line = "testaﾞ" # invalid combination
-    assert_equal 1, Reline::Unicode.get_prev_mbchar_size(line, 4)  # 't' (1 byte)
-    assert_equal 3, Reline::Unicode.get_prev_mbchar_size(line, 8)  # 'ﾞ' (3 byte)
-  end
-
   def test_invalid_combining_mark_cluster
     assert_equal false, Reline::Unicode.invalid_combining_mark_cluster?("ｶﾞ")
     assert_equal false, Reline::Unicode.invalid_combining_mark_cluster?("ﾊﾟ")

From c666d065c2c913e7660939a7ba0c3f9e8da4c073 Mon Sep 17 00:00:00 2001
From: moznion <moznion@mail.moznion.net>
Date: Sat, 5 Jul 2025 22:35:27 +0900
Subject: [PATCH 5/5] Unify the halfwidth dakuten/handakuten handling in
 `get_mbchar_width`

This commit removes `get_single_char_width` and unifies the logic.

Ref:

- https://github.com/ruby/reline/pull/832/files#r2187234832

Signed-off-by: moznion <moznion@mail.moznion.net>
---
 lib/reline/unicode.rb       | 47 ++++++-------------------------------
 test/reline/test_unicode.rb | 13 ++--------
 2 files changed, 9 insertions(+), 51 deletions(-)

diff --git a/lib/reline/unicode.rb b/lib/reline/unicode.rb
index 3f842e7180..26e7246a84 100644
--- a/lib/reline/unicode.rb
+++ b/lib/reline/unicode.rb
@@ -79,42 +79,7 @@ def self.get_mbchar_width(mbchar)
     if ord <= 0x1F # in EscapedPairs
       return 2
     elsif mbchar.length <= 1 && ord <= 0x7E # printable ASCII chars
-      return 1
-    end
-    # ^ fast path to quickly calculate the width.
-
-    # If mbchar contains multiple characters, check if it's a valid combination
-    if invalid_combining_mark_cluster?(mbchar)
-      # Calculate width for each character separately for invalid combinations
-      mbchar.each_char.sum { |char| get_single_char_width(char) }
-    else
-      get_single_char_width(mbchar)
-    end
-  end
-
-  def self.invalid_combining_mark_cluster?(mbchar)
-    return false if mbchar.length < 2
-
-    begin
-      chars = mbchar.encode(Encoding::UTF_8).chars
-      return false if chars.length < 2
-
-      # Check if the last character is a combining mark (Halfwidth Dakuten/Handakuten)
-      return false unless halfwidth_dakuten_or_handakuten_character?(chars.last)
-
-      # Check if the base character can validly combine with Halfwidth dakuten/handakuten
-      ord = chars[-2].ord
-      ord < 0xFF66 || ord > 0xFF9D # out-of-range of Halfwidth Katakana
-    rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError, ArgumentError
-      false
-    end
-  end
-
-  def self.get_single_char_width(mbchar)
-    ord = mbchar.ord
-    if ord <= 0x1F # in EscapedPairs
-      return 2
-    elsif ord <= 0x7E # printable ASCII chars
+      #   ~~~~~~~~~~~~~~~~~~ guard against the following grapheme combination character (e.g., dakuten/handakuten)
       return 1
     end
 
@@ -125,10 +90,12 @@ def self.get_single_char_width(mbchar)
     size = EastAsianWidth::CHUNK_WIDTH[chunk_index]
     if size == -1
       Reline.ambiguous_width
-    elsif size == 1 && utf8_mbchar.size >= 2
-      # Halfwidth Dakuten Handakuten
-      # Only these two character has Letter Modifier category and can be combined in a single grapheme cluster
-      halfwidth_dakuten_or_handakuten_character?(utf8_mbchar[1]) ? 2 : 1
+    elsif halfwidth_dakuten_or_handakuten_character?(utf8_mbchar[-1])
+      if utf8_mbchar.length >= 2 # Whether this is a dakuten or handakuten combination character
+        utf8_mbchar.each_char.sum { |char| get_mbchar_width(char) }
+      else
+        1
+      end
     else
       size
     end
diff --git a/test/reline/test_unicode.rb b/test/reline/test_unicode.rb
index 743e35869f..9cfc53b57b 100644
--- a/test/reline/test_unicode.rb
+++ b/test/reline/test_unicode.rb
@@ -285,6 +285,8 @@ def test_character_type
   end
 
   def test_halfwidth_dakuten_handakuten_combinations
+    assert_equal 1, Reline::Unicode.get_mbchar_width("\uFF9E")
+    assert_equal 1, Reline::Unicode.get_mbchar_width("\uFF9F")
     assert_equal 2, Reline::Unicode.get_mbchar_width("ｶﾞ")
     assert_equal 2, Reline::Unicode.get_mbchar_width("ﾊﾟ")
     assert_equal 2, Reline::Unicode.get_mbchar_width("ｻﾞ")
@@ -293,15 +295,4 @@ def test_halfwidth_dakuten_handakuten_combinations
     assert_equal 3, Reline::Unicode.get_mbchar_width("あﾞ")
     assert_equal 3, Reline::Unicode.get_mbchar_width("紅ﾞ")
   end
-
-  def test_invalid_combining_mark_cluster
-    assert_equal false, Reline::Unicode.invalid_combining_mark_cluster?("ｶﾞ")
-    assert_equal false, Reline::Unicode.invalid_combining_mark_cluster?("ﾊﾟ")
-
-    assert_equal true, Reline::Unicode.invalid_combining_mark_cluster?("aﾞ")
-    assert_equal true, Reline::Unicode.invalid_combining_mark_cluster?("あﾟ")
-
-    assert_equal false, Reline::Unicode.invalid_combining_mark_cluster?("abc")
-    assert_equal false, Reline::Unicode.invalid_combining_mark_cluster?("ｶ")
-  end
 end