From c3791b01418c7e711a6eb4cf64605dd326dd84aa Mon Sep 17 00:00:00 2001 From: moznion Date: Wed, 2 Jul 2025 20:01:46 +0900 Subject: [PATCH 1/5] Fix cursor positioning for invalid halfwidth dakuten/handakuten combinations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When halfwidth dakuten (゙) or handakuten (゚) marks are combined with non-katakana characters (e.g., 'a゙', '1゚'), the cursor movement was treating them as a single grapheme cluster. This caused incorrect cursor positioning. This change: - Adds detection for invalid combining mark clusters - Treats invalid combinations as separate characters for width calculation - Ensures cursor moves character-by-character for invalid combinations - Maintains proper grapheme cluster handling for valid katakana combinations The fix applies to: - get_mbchar_width: calculates width separately for invalid combinations - get_next_mbchar_size: moves by single character for invalid clusters - get_prev_mbchar_size: moves by single character for invalid clusters Signed-off-by: moznion --- lib/reline/unicode.rb | 64 ++++++++++++++++++++++++++++++++----- test/reline/test_unicode.rb | 44 +++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 8 deletions(-) diff --git a/lib/reline/unicode.rb b/lib/reline/unicode.rb index 21e4ea240e..d6c85cb1b5 100644 --- a/lib/reline/unicode.rb +++ b/lib/reline/unicode.rb @@ -40,6 +40,8 @@ class Reline::Unicode CSI_REGEXP = /\e\[[\d;]*[ABCDEFGHJKSTfminsuhl]/ OSC_REGEXP = /\e\]\d+(?:;[^;\a\e]+)*(?:\a|\e\\)/ WIDTH_SCANNER = /\G(?:(#{NON_PRINTING_START})|(#{NON_PRINTING_END})|(#{CSI_REGEXP})|(#{OSC_REGEXP})|(\X))/o + HALFWIDTH_DAKUTEN = 0xFF9E + HALFWIDTH_HANDAKUTEN = 0xFF9F def self.escape_for_print(str) str.chars.map! { |gr| @@ -73,6 +75,32 @@ def self.safe_encode(str, encoding) require 'reline/unicode/east_asian_width' def self.get_mbchar_width(mbchar) + # If mbchar contains multiple characters, check if it's a valid combination + if mbchar.length >= 2 && invalid_combining_mark_cluster?(mbchar) + # Calculate width for each character separately for invalid combinations + mbchar.each_char.sum { |char| get_single_char_width(char) } + else + get_single_char_width(mbchar) + end + end + + def self.invalid_combining_mark_cluster?(mbchar) + begin + chars = mbchar.encode(Encoding::UTF_8).chars + return false if chars.length < 2 + + # Check if the last character is a combining mark (Halfwidth Dakuten/Handakuten) + return false unless halfwidth_dakuten_or_handakuten_character?(chars.last) + + # Check if the base character can validly combine with Halfwidth dakuten/handakuten + ord = chars[-2].ord + ord < 0xFF66 || ord > 0xFF9D # out-of-range of Halfwidth Katakana + rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError, ArgumentError + false + end + end + + def self.get_single_char_width(mbchar) ord = mbchar.ord if ord <= 0x1F # in EscapedPairs return 2 @@ -86,10 +114,9 @@ def self.get_mbchar_width(mbchar) if size == -1 Reline.ambiguous_width elsif size == 1 && utf8_mbchar.size >= 2 - second_char_ord = utf8_mbchar[1].ord # Halfwidth Dakuten Handakuten # Only these two character has Letter Modifier category and can be combined in a single grapheme cluster - (second_char_ord == 0xFF9E || second_char_ord == 0xFF9F) ? 2 : 1 + halfwidth_dakuten_or_handakuten_character?(utf8_mbchar[1]) ? 2 : 1 else size end @@ -248,16 +275,30 @@ def self.take_mbchar_range(str, start_col, width, cover_begin: false, cover_end: end def self.get_next_mbchar_size(line, byte_pointer) - grapheme = line.byteslice(byte_pointer..-1).grapheme_clusters.first - grapheme ? grapheme.bytesize : 0 + byteslice = line.byteslice(byte_pointer..-1) + grapheme = byteslice.grapheme_clusters.first + return 0 unless grapheme + + # If it's an invalid combining mark cluster, move one character at a time + if invalid_combining_mark_cluster?(grapheme) + byteslice.chars.first.bytesize + else + grapheme.bytesize + end end def self.get_prev_mbchar_size(line, byte_pointer) - if byte_pointer.zero? - 0 + return 0 if byte_pointer <= 0 + + byteslice = line.byteslice(0..(byte_pointer - 1)) + grapheme = byteslice.grapheme_clusters.last + return 0 unless grapheme + + # If it's an invalid combining mark cluster, move one character at a time + if invalid_combining_mark_cluster?(grapheme) + byteslice.chars.last.bytesize else - grapheme = line.byteslice(0..(byte_pointer - 1)).grapheme_clusters.last - grapheme ? grapheme.bytesize : 0 + grapheme.bytesize end end @@ -412,4 +453,11 @@ def self.word_character?(s) def self.space_character?(s) s.match?(/\s/) if s end + + def self.halfwidth_dakuten_or_handakuten_character?(s) + return false if s.encoding != Encoding::UTF_8 || !s.valid_encoding? + + ord = s.ord + ord == HALFWIDTH_DAKUTEN || ord == HALFWIDTH_HANDAKUTEN + end end diff --git a/test/reline/test_unicode.rb b/test/reline/test_unicode.rb index 0778306c32..7615984d98 100644 --- a/test/reline/test_unicode.rb +++ b/test/reline/test_unicode.rb @@ -283,4 +283,48 @@ def test_character_type refute(Reline::Unicode.space_character?('-')) refute(Reline::Unicode.space_character?(nil)) end + + def test_halfwidth_dakuten_handakuten_combinations + assert_equal 2, Reline::Unicode.get_mbchar_width("ガ") + assert_equal 2, Reline::Unicode.get_mbchar_width("パ") + assert_equal 2, Reline::Unicode.get_mbchar_width("ザ") + assert_equal 2, Reline::Unicode.get_mbchar_width("a゙") + assert_equal 2, Reline::Unicode.get_mbchar_width("1゚") + assert_equal 3, Reline::Unicode.get_mbchar_width("あ゙") + assert_equal 3, Reline::Unicode.get_mbchar_width("紅゙") + end + + def test_get_next_mbchar_size_with_dakuten + line = "ガtest" # valid combination + assert_equal 6, Reline::Unicode.get_next_mbchar_size(line, 0) # 'カ' (3 bytes) + '゙' (3 bytes) + assert_equal 1, Reline::Unicode.get_next_mbchar_size(line, 6) # 't' only (1 byte) + + line = "a゙test" # invalid combination + assert_equal 1, Reline::Unicode.get_next_mbchar_size(line, 0) # 'a' only (1 byte) + assert_equal 3, Reline::Unicode.get_next_mbchar_size(line, 1) # '゙' only (3 bytes) + assert_equal 1, Reline::Unicode.get_next_mbchar_size(line, 4) # 't' only (1 bytes) + end + + def test_get_prev_mbchar_size_with_dakuten + line = "testガ" # valid combination + assert_equal 0, Reline::Unicode.get_prev_mbchar_size(line, 0) + assert_equal 1, Reline::Unicode.get_prev_mbchar_size(line, 4) # 't' (1 byte) + assert_equal 3, Reline::Unicode.get_prev_mbchar_size(line, 7) # 'カ' (3 bytes) + assert_equal 6, Reline::Unicode.get_prev_mbchar_size(line, 10) # ガ (6 bytes) + + line = "testa゙" # invalid combination + assert_equal 1, Reline::Unicode.get_prev_mbchar_size(line, 4) # 't' (1 byte) + assert_equal 3, Reline::Unicode.get_prev_mbchar_size(line, 8) # '゙' (3 byte) + end + + def test_invalid_combining_mark_cluster + assert_equal false, Reline::Unicode.invalid_combining_mark_cluster?("ガ") + assert_equal false, Reline::Unicode.invalid_combining_mark_cluster?("パ") + + assert_equal true, Reline::Unicode.invalid_combining_mark_cluster?("a゙") + assert_equal true, Reline::Unicode.invalid_combining_mark_cluster?("あ゚") + + assert_equal false, Reline::Unicode.invalid_combining_mark_cluster?("abc") + assert_equal false, Reline::Unicode.invalid_combining_mark_cluster?("カ") + end end From c92a9ea75344606fed91b2ddb067f304fde6eaaf Mon Sep 17 00:00:00 2001 From: moznion Date: Thu, 3 Jul 2025 11:16:08 +0900 Subject: [PATCH 2/5] Insert the fast path for performance Signed-off-by: moznion --- lib/reline/unicode.rb | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/reline/unicode.rb b/lib/reline/unicode.rb index d6c85cb1b5..ccad032c14 100644 --- a/lib/reline/unicode.rb +++ b/lib/reline/unicode.rb @@ -75,6 +75,14 @@ def self.safe_encode(str, encoding) require 'reline/unicode/east_asian_width' def self.get_mbchar_width(mbchar) + ord = mbchar.ord + if ord <= 0x1F # in EscapedPairs + return 2 + elsif mbchar.length <= 1 && ord <= 0x7E # printable ASCII chars + return 1 + end + # ^ fast path to quickly calculate the width. + # If mbchar contains multiple characters, check if it's a valid combination if mbchar.length >= 2 && invalid_combining_mark_cluster?(mbchar) # Calculate width for each character separately for invalid combinations @@ -107,8 +115,10 @@ def self.get_single_char_width(mbchar) elsif ord <= 0x7E # printable ASCII chars return 1 end + utf8_mbchar = mbchar.encode(Encoding::UTF_8) ord = utf8_mbchar.ord + chunk_index = EastAsianWidth::CHUNK_LAST.bsearch_index { |o| ord <= o } size = EastAsianWidth::CHUNK_WIDTH[chunk_index] if size == -1 From 911e7714871482311cada4ffb26afe573739dcd2 Mon Sep 17 00:00:00 2001 From: moznion Date: Fri, 4 Jul 2025 00:41:32 +0900 Subject: [PATCH 3/5] Optimize by suppressing unnecessary combination check Signed-off-by: moznion --- lib/reline/unicode.rb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/reline/unicode.rb b/lib/reline/unicode.rb index ccad032c14..1a92a3ea01 100644 --- a/lib/reline/unicode.rb +++ b/lib/reline/unicode.rb @@ -84,7 +84,7 @@ def self.get_mbchar_width(mbchar) # ^ fast path to quickly calculate the width. # If mbchar contains multiple characters, check if it's a valid combination - if mbchar.length >= 2 && invalid_combining_mark_cluster?(mbchar) + if invalid_combining_mark_cluster?(mbchar) # Calculate width for each character separately for invalid combinations mbchar.each_char.sum { |char| get_single_char_width(char) } else @@ -93,6 +93,8 @@ def self.get_mbchar_width(mbchar) end def self.invalid_combining_mark_cluster?(mbchar) + return false if mbchar.length < 2 + begin chars = mbchar.encode(Encoding::UTF_8).chars return false if chars.length < 2 From 9282ae7bcae54826e17424c03f6e461f800f0bfb Mon Sep 17 00:00:00 2001 From: moznion Date: Sat, 5 Jul 2025 20:26:54 +0900 Subject: [PATCH 4/5] Rollback `get_prev_mbchar_size` and `get_next_mbchar_size` to the original Signed-off-by: moznion --- lib/reline/unicode.rb | 26 ++++++-------------------- test/reline/test_unicode.rb | 23 ----------------------- 2 files changed, 6 insertions(+), 43 deletions(-) diff --git a/lib/reline/unicode.rb b/lib/reline/unicode.rb index 1a92a3ea01..3f842e7180 100644 --- a/lib/reline/unicode.rb +++ b/lib/reline/unicode.rb @@ -287,30 +287,16 @@ def self.take_mbchar_range(str, start_col, width, cover_begin: false, cover_end: end def self.get_next_mbchar_size(line, byte_pointer) - byteslice = line.byteslice(byte_pointer..-1) - grapheme = byteslice.grapheme_clusters.first - return 0 unless grapheme - - # If it's an invalid combining mark cluster, move one character at a time - if invalid_combining_mark_cluster?(grapheme) - byteslice.chars.first.bytesize - else - grapheme.bytesize - end + grapheme = line.byteslice(byte_pointer..-1).grapheme_clusters.first + grapheme ? grapheme.bytesize : 0 end def self.get_prev_mbchar_size(line, byte_pointer) - return 0 if byte_pointer <= 0 - - byteslice = line.byteslice(0..(byte_pointer - 1)) - grapheme = byteslice.grapheme_clusters.last - return 0 unless grapheme - - # If it's an invalid combining mark cluster, move one character at a time - if invalid_combining_mark_cluster?(grapheme) - byteslice.chars.last.bytesize + if byte_pointer.zero? + 0 else - grapheme.bytesize + grapheme = line.byteslice(0..(byte_pointer - 1)).grapheme_clusters.last + grapheme ? grapheme.bytesize : 0 end end diff --git a/test/reline/test_unicode.rb b/test/reline/test_unicode.rb index 7615984d98..743e35869f 100644 --- a/test/reline/test_unicode.rb +++ b/test/reline/test_unicode.rb @@ -294,29 +294,6 @@ def test_halfwidth_dakuten_handakuten_combinations assert_equal 3, Reline::Unicode.get_mbchar_width("紅゙") end - def test_get_next_mbchar_size_with_dakuten - line = "ガtest" # valid combination - assert_equal 6, Reline::Unicode.get_next_mbchar_size(line, 0) # 'カ' (3 bytes) + '゙' (3 bytes) - assert_equal 1, Reline::Unicode.get_next_mbchar_size(line, 6) # 't' only (1 byte) - - line = "a゙test" # invalid combination - assert_equal 1, Reline::Unicode.get_next_mbchar_size(line, 0) # 'a' only (1 byte) - assert_equal 3, Reline::Unicode.get_next_mbchar_size(line, 1) # '゙' only (3 bytes) - assert_equal 1, Reline::Unicode.get_next_mbchar_size(line, 4) # 't' only (1 bytes) - end - - def test_get_prev_mbchar_size_with_dakuten - line = "testガ" # valid combination - assert_equal 0, Reline::Unicode.get_prev_mbchar_size(line, 0) - assert_equal 1, Reline::Unicode.get_prev_mbchar_size(line, 4) # 't' (1 byte) - assert_equal 3, Reline::Unicode.get_prev_mbchar_size(line, 7) # 'カ' (3 bytes) - assert_equal 6, Reline::Unicode.get_prev_mbchar_size(line, 10) # ガ (6 bytes) - - line = "testa゙" # invalid combination - assert_equal 1, Reline::Unicode.get_prev_mbchar_size(line, 4) # 't' (1 byte) - assert_equal 3, Reline::Unicode.get_prev_mbchar_size(line, 8) # '゙' (3 byte) - end - def test_invalid_combining_mark_cluster assert_equal false, Reline::Unicode.invalid_combining_mark_cluster?("ガ") assert_equal false, Reline::Unicode.invalid_combining_mark_cluster?("パ") From c666d065c2c913e7660939a7ba0c3f9e8da4c073 Mon Sep 17 00:00:00 2001 From: moznion Date: Sat, 5 Jul 2025 22:35:27 +0900 Subject: [PATCH 5/5] Unify the halfwidth dakuten/handakuten handling in `get_mbchar_width` This commit removes `get_single_char_width` and unifies the logic. Ref: - https://github.com/ruby/reline/pull/832/files#r2187234832 Signed-off-by: moznion --- lib/reline/unicode.rb | 47 ++++++------------------------------- test/reline/test_unicode.rb | 13 ++-------- 2 files changed, 9 insertions(+), 51 deletions(-) diff --git a/lib/reline/unicode.rb b/lib/reline/unicode.rb index 3f842e7180..26e7246a84 100644 --- a/lib/reline/unicode.rb +++ b/lib/reline/unicode.rb @@ -79,42 +79,7 @@ def self.get_mbchar_width(mbchar) if ord <= 0x1F # in EscapedPairs return 2 elsif mbchar.length <= 1 && ord <= 0x7E # printable ASCII chars - return 1 - end - # ^ fast path to quickly calculate the width. - - # If mbchar contains multiple characters, check if it's a valid combination - if invalid_combining_mark_cluster?(mbchar) - # Calculate width for each character separately for invalid combinations - mbchar.each_char.sum { |char| get_single_char_width(char) } - else - get_single_char_width(mbchar) - end - end - - def self.invalid_combining_mark_cluster?(mbchar) - return false if mbchar.length < 2 - - begin - chars = mbchar.encode(Encoding::UTF_8).chars - return false if chars.length < 2 - - # Check if the last character is a combining mark (Halfwidth Dakuten/Handakuten) - return false unless halfwidth_dakuten_or_handakuten_character?(chars.last) - - # Check if the base character can validly combine with Halfwidth dakuten/handakuten - ord = chars[-2].ord - ord < 0xFF66 || ord > 0xFF9D # out-of-range of Halfwidth Katakana - rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError, ArgumentError - false - end - end - - def self.get_single_char_width(mbchar) - ord = mbchar.ord - if ord <= 0x1F # in EscapedPairs - return 2 - elsif ord <= 0x7E # printable ASCII chars + # ~~~~~~~~~~~~~~~~~~ guard against the following grapheme combination character (e.g., dakuten/handakuten) return 1 end @@ -125,10 +90,12 @@ def self.get_single_char_width(mbchar) size = EastAsianWidth::CHUNK_WIDTH[chunk_index] if size == -1 Reline.ambiguous_width - elsif size == 1 && utf8_mbchar.size >= 2 - # Halfwidth Dakuten Handakuten - # Only these two character has Letter Modifier category and can be combined in a single grapheme cluster - halfwidth_dakuten_or_handakuten_character?(utf8_mbchar[1]) ? 2 : 1 + elsif halfwidth_dakuten_or_handakuten_character?(utf8_mbchar[-1]) + if utf8_mbchar.length >= 2 # Whether this is a dakuten or handakuten combination character + utf8_mbchar.each_char.sum { |char| get_mbchar_width(char) } + else + 1 + end else size end diff --git a/test/reline/test_unicode.rb b/test/reline/test_unicode.rb index 743e35869f..9cfc53b57b 100644 --- a/test/reline/test_unicode.rb +++ b/test/reline/test_unicode.rb @@ -285,6 +285,8 @@ def test_character_type end def test_halfwidth_dakuten_handakuten_combinations + assert_equal 1, Reline::Unicode.get_mbchar_width("\uFF9E") + assert_equal 1, Reline::Unicode.get_mbchar_width("\uFF9F") assert_equal 2, Reline::Unicode.get_mbchar_width("ガ") assert_equal 2, Reline::Unicode.get_mbchar_width("パ") assert_equal 2, Reline::Unicode.get_mbchar_width("ザ") @@ -293,15 +295,4 @@ def test_halfwidth_dakuten_handakuten_combinations assert_equal 3, Reline::Unicode.get_mbchar_width("あ゙") assert_equal 3, Reline::Unicode.get_mbchar_width("紅゙") end - - def test_invalid_combining_mark_cluster - assert_equal false, Reline::Unicode.invalid_combining_mark_cluster?("ガ") - assert_equal false, Reline::Unicode.invalid_combining_mark_cluster?("パ") - - assert_equal true, Reline::Unicode.invalid_combining_mark_cluster?("a゙") - assert_equal true, Reline::Unicode.invalid_combining_mark_cluster?("あ゚") - - assert_equal false, Reline::Unicode.invalid_combining_mark_cluster?("abc") - assert_equal false, Reline::Unicode.invalid_combining_mark_cluster?("カ") - end end