Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions bin/generate_east_asian_width
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,14 @@ if ARGV.empty?
exit 1
end

def unicode_width(type, category)
return 0 if category == 'Mn' # Nonspacing Mark
def unicode_width(type, category, rest)
# Nonspacing Mark, Enclosing Mark
return 0 if category == 'Mn' || category == 'Me'

# Grapheme_Cluster_Break=V, Grapheme_Cluster_Break=T.
# The width of L, LV, and LVT is 2. Treat V and T as width 0 because an L or LV is expected to come before a V or T.
return 0 if rest =~ /HANGUL JUNGSEONG|HANGUL JONGSEONG/

case type
when 'F', 'W' # Fullwidth, Wide
2
Expand All @@ -27,10 +33,10 @@ open(ARGV.first, 'rt') do |f|

widths = []
f.each_line do |line|
next unless /^(?<first>\h+)(?:\.\.(?<last>\h+))?\s*;\s*(?<type>\w+)\s+# +(?<category>[^ ]+)/ =~ line
next unless /^(?<first>\h+)(?:\.\.(?<last>\h+))?\s*;\s*(?<type>\w+)\s+# +(?<category>[^ ]+)(?<rest>.*)/ =~ line

range = first.to_i(16)..(last || first).to_i(16)
widths.fill(unicode_width(type, category), range)
widths.fill(unicode_width(type, category, rest), range)
end

# EscapedPairs
Expand Down
39 changes: 16 additions & 23 deletions lib/reline/unicode.rb
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,6 @@ class Reline::Unicode
# Matches a CSI escape sequence: ESC "[" followed by any run of digits and
# semicolons, ended by one of the listed final letters.
CSI_REGEXP = /\e\[[\d;]*[ABCDEFGHJKSTfminsuhl]/
# Matches an OSC escape sequence: ESC "]" and a numeric code, optional
# ";"-separated parameters, ended either by BEL or by ESC "\".
OSC_REGEXP = /\e\]\d+(?:;[^;\a\e]+)*(?:\a|\e\\)/
# Tokenizer anchored at the current scan position (\G). Each match fills exactly
# one capture group, in this order: NON_PRINTING_START, NON_PRINTING_END,
# a CSI sequence, an OSC sequence, or one extended grapheme cluster (\X).
# NON_PRINTING_START / NON_PRINTING_END are defined outside this excerpt.
# The /o flag interpolates the sub-patterns only once.
WIDTH_SCANNER = /\G(?:(#{NON_PRINTING_START})|(#{NON_PRINTING_END})|(#{CSI_REGEXP})|(#{OSC_REGEXP})|(\X))/o
# Code point U+FF9E, HALFWIDTH KATAKANA VOICED SOUND MARK.
HALFWIDTH_DAKUTEN = 0xFF9E
# Code point U+FF9F, HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK.
HALFWIDTH_HANDAKUTEN = 0xFF9F

def self.escape_for_print(str)
str.chars.map! { |gr|
Expand Down Expand Up @@ -74,30 +72,32 @@ def self.safe_encode(str, encoding)

require 'reline/unicode/east_asian_width'

# Looks up the display width of a single code point.
#
# ord - Integer code point.
#
# EastAsianWidth::CHUNK_LAST is binary-searched for the first chunk whose last
# code point is >= ord; the entry at the same index of
# EastAsianWidth::CHUNK_WIDTH is that chunk's width. A stored width of -1
# marks an ambiguous-width chunk and is replaced by Reline.ambiguous_width.
#
# Returns the width as an Integer.
def self.east_asian_width(ord)
  index = EastAsianWidth::CHUNK_LAST.bsearch_index { |chunk_end| chunk_end >= ord }
  width = EastAsianWidth::CHUNK_WIDTH[index]
  return Reline.ambiguous_width if width == -1

  width
end

# Returns the display width, in terminal columns, of one grapheme cluster.
#
# mbchar - String holding the cluster. It is transcoded to UTF-8 before the
#          per-code-point lookup, so String#encode may raise for text that
#          cannot be converted; an empty string raises from String#ord.
#
# Returns an Integer:
# * 2 when the first code point is <= 0x1F (marked "in EscapedPairs";
#   presumably such characters are shown in a two-column escaped form).
# * 1 for a lone code point up to 0x7E (printable ASCII).
# * otherwise the sum of east_asian_width over the cluster's code points,
#   where a Zero Width Joiner and the code point right after it count as 0.
def self.get_mbchar_width(mbchar)
  ord = mbchar.ord
  if ord <= 0x1F # in EscapedPairs
    return 2
  elsif mbchar.length == 1 && ord <= 0x7E # printable ASCII chars
    # The length check keeps ASCII followed by combining marks out of this shortcut.
    return 1
  end

  utf8_mbchar = mbchar.encode(Encoding::UTF_8)
  zwj = false # true while the previous code point was a Zero Width Joiner
  utf8_mbchar.chars.sum do |c|
    if zwj
      # Code point glued on by the preceding joiner: adds no columns.
      zwj = false
      0
    elsif c.ord == 0x200D # Zero Width Joiner
      zwj = true
      0
    else
      east_asian_width(c.ord)
    end
  end
end

Expand Down Expand Up @@ -418,11 +418,4 @@ def self.word_character?(s)
# Tells whether +s+ contains a whitespace character.
#
# s - String to inspect, or nil/false.
#
# Returns true or false for a string, and nil when +s+ is nil or false.
def self.space_character?(s)
  return unless s

  s.match?(/\s/)
end

# Tells whether the first code point of +s+ is HALFWIDTH_DAKUTEN or
# HALFWIDTH_HANDAKUTEN.
#
# s - String to inspect.
#
# Returns false outright when +s+ is not tagged as UTF-8 or is not validly
# encoded; otherwise true or false depending on the first code point.
def self.halfwidth_dakuten_or_handakuten_character?(s)
  return false unless s.encoding == Encoding::UTF_8 && s.valid_encoding?

  [HALFWIDTH_DAKUTEN, HALFWIDTH_HANDAKUTEN].include?(s.ord)
end
end
15 changes: 7 additions & 8 deletions lib/reline/unicode/east_asian_width.rb
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ class Reline::Unicode::EastAsianWidth
[0x450, 1],
[0x451, -1],
[0x482, 1],
[0x487, 0],
[0x489, 0],
[0x590, 1],
[0x5bd, 0],
[0x5be, 1],
Expand Down Expand Up @@ -356,6 +356,7 @@ class Reline::Unicode::EastAsianWidth
[0x109d, 0],
[0x10ff, 1],
[0x115f, 2],
[0x11ff, 0],
[0x135c, 1],
[0x135f, 0],
[0x1711, 1],
Expand Down Expand Up @@ -411,8 +412,6 @@ class Reline::Unicode::EastAsianWidth
[0x1a7e, 1],
[0x1a7f, 0],
[0x1aaf, 1],
[0x1abd, 0],
[0x1abe, 1],
[0x1ace, 0],
[0x1aff, 1],
[0x1b03, 0],
Expand Down Expand Up @@ -491,10 +490,6 @@ class Reline::Unicode::EastAsianWidth
[0x20ab, 1],
[0x20ac, -1],
[0x20cf, 1],
[0x20dc, 0],
[0x20e0, 1],
[0x20e1, 0],
[0x20e4, 1],
[0x20f0, 0],
[0x2102, 1],
[0x2103, -1],
Expand Down Expand Up @@ -767,7 +762,7 @@ class Reline::Unicode::EastAsianWidth
[0xa48f, 1],
[0xa4c6, 2],
[0xa66e, 1],
[0xa66f, 0],
[0xa672, 0],
[0xa673, 1],
[0xa67d, 0],
[0xa69d, 1],
Expand Down Expand Up @@ -840,6 +835,10 @@ class Reline::Unicode::EastAsianWidth
[0xabed, 0],
[0xabff, 1],
[0xd7a3, 2],
[0xd7af, 1],
[0xd7c6, 0],
[0xd7ca, 1],
[0xd7fb, 0],
[0xdfff, 1],
[0xf8ff, -1],
[0xfaff, 2],
Expand Down
29 changes: 29 additions & 0 deletions test/reline/test_unicode.rb
Original file line number Diff line number Diff line change
Expand Up @@ -295,4 +295,33 @@ def test_halfwidth_dakuten_handakuten_combinations
assert_equal 3, Reline::Unicode.get_mbchar_width("あ゙")
assert_equal 3, Reline::Unicode.get_mbchar_width("紅゙")
end

# Checks Reline::Unicode.get_mbchar_width on strings built from several code
# points. Each section comment names the grapheme cluster boundary rule (GBn)
# that its inputs are meant to exercise.
def test_grapheme_cluster_width
# GB6, GB7, GB8: Hangul syllable
# The syllable is passed in NFD (decomposed) form; the expected width is still 2.
assert_equal 2, Reline::Unicode.get_mbchar_width('한'.unicode_normalize(:nfd))
# Three copies of the same jamo; the expected total is 6.
assert_equal 6, Reline::Unicode.get_mbchar_width('ᄀ' * 3)

# GB9
# Char + NonspacingMark
# NFD form, so each input is a base character followed by its mark.
assert_equal 1, Reline::Unicode.get_mbchar_width('ç'.unicode_normalize(:nfd))
assert_equal 2, Reline::Unicode.get_mbchar_width('ぱ'.unicode_normalize(:nfd))
# Two marks stacked on one base character.
assert_equal 1, Reline::Unicode.get_mbchar_width("c\u{301}\u{327}")
# '1' + NonspacingMark + EnclosingMark
assert_equal 1, Reline::Unicode.get_mbchar_width('1️⃣')
# Char + SpacingMark
assert_equal 2, Reline::Unicode.get_mbchar_width('কা')
assert_equal 5, Reline::Unicode.get_mbchar_width('ガ゚゙゙')
# Emoji joined with ZeroWidthJoiner
assert_equal 2, Reline::Unicode.get_mbchar_width('👨‍👩‍👧')
assert_equal 7, Reline::Unicode.get_mbchar_width('👨‍👩‍👧゙゚゚゚゙')

# GB9a: Char + GraphemeClusterBreak=SpacingMark
assert_equal 2, Reline::Unicode.get_mbchar_width('คำ')

# GB9c: Consonant + Linker(NonspacingMark) + Consonant
assert_equal 2, Reline::Unicode.get_mbchar_width('क्त')

# GB12, GB13: RegionalIndicator
# A pair of regional indicator symbols; the expected width is 2.
assert_equal 2, Reline::Unicode.get_mbchar_width('🇯🇵')
end
end