Skip to content

Commit 169ad17

Browse files
committed
Use non-regex approach for VS16 adjustments
1 parent 82b17bd commit 169ad17

File tree

2 files changed

+99
-47
lines changed

2 files changed

+99
-47
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
# CHANGELOG
22

3+
## 3.1.1 (unreleased)
4+
5+
- Performance improvements
6+
37
## 3.1.0
48

59
**Improve Emoji support:**

lib/unicode/display_width.rb

Lines changed: 95 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,17 @@
88

99
module Unicode
1010
class DisplayWidth
11-
DEFAULT_AMBIGUOUS = 1
1211
INITIAL_DEPTH = 0x10000
12+
def self.width_in_index(codepoint, index)
13+
d = INITIAL_DEPTH
14+
w = index[codepoint / d]
15+
while w.instance_of? Array
16+
w = w[(codepoint %= d) / (d /= 16)]
17+
end
18+
w || 1
19+
end
20+
21+
DEFAULT_AMBIGUOUS = 1
1322
ASCII_NON_ZERO_REGEX = /[\0\x05\a\b\n\v\f\r\x0E\x0F]/
1423
ASCII_NON_ZERO_STRING = "\0\x05\a\b\n\v\f\r\x0E\x0F"
1524
ASCII_BACKSPACE = "\b"
@@ -25,11 +34,19 @@ class DisplayWidth
2534
WIDTH_ONE: decompress_index(INDEX[:WIDTH_ONE][0][0], 1),
2635
WIDTH_TWO: decompress_index(INDEX[:WIDTH_TWO][0][0], 1),
2736
}
37+
VS16_TEXT_CODEPOINTS = {
38+
WIDTH_ONE: Unicode::Emoji::TEXT_PRESENTATION - Unicode::Emoji::EMOJI_COMPONENT,
39+
WIDTH_TWO: (Unicode::Emoji::TEXT_PRESENTATION - Unicode::Emoji::EMOJI_COMPONENT).reject{ |codepoint|
40+
width_in_index(codepoint, INDEX[:WIDTH_TWO]) == 2
41+
},
42+
}
2843
EMOJI_SEQUENCES_REGEX_MAPPING = {
2944
rgi: :REGEX_INCLUDE_MQE_UQE,
3045
rgi_at: :REGEX_INCLUDE_MQE_UQE,
3146
possible: :REGEX_WELL_FORMED,
3247
}
48+
EMOJI_NON_VS16_OPTIONS = [:all_no_vs16, :rgi_at, :none, false]
49+
VS16 = 0xFE0F
3350
REGEX_EMOJI_BASIC_OR_KEYCAP = Regexp.union(Unicode::Emoji::REGEX_BASIC, Unicode::Emoji::REGEX_EMOJI_KEYCAP)
3451
REGEX_EMOJI_ALL_SEQUENCES = Regexp.union(/.[\u{1F3FB}-\u{1F3FF}\u{FE0F}]?(\u{200D}.[\u{1F3FB}-\u{1F3FF}\u{FE0F}]?)+/, Unicode::Emoji::REGEX_EMOJI_KEYCAP)
3552
REGEX_EMOJI_NOT_POSSIBLE = /\A[#*0-9]\z/
@@ -61,14 +78,25 @@ def self.of(string, ambiguous = nil, overwrite = nil, old_options = {}, **option
6178
# # #
6279

6380
if !options[:overwrite].empty?
64-
return width_frame(string, options) do |string, index_full, index_low, first_ambiguous|
65-
width_all_features(string, index_full, index_low, first_ambiguous, options[:overwrite])
81+
return width_frame(string, options) do |string, index_full, index_low, first_ambiguous, vs16_text_codepoints|
82+
width_all_features(
83+
string,
84+
index_full,
85+
index_low,
86+
first_ambiguous,
87+
options[:overwrite],
88+
EMOJI_NON_VS16_OPTIONS.include?(options[:emoji]) ? nil : vs16_text_codepoints
89+
)
6690
end
6791
end
6892

6993
if !string.ascii_only?
70-
return width_frame(string, options) do |string, index_full, index_low, first_ambiguous|
71-
width_no_overwrite(string, index_full, index_low, first_ambiguous)
94+
return width_frame(string, options) do |string, index_full, index_low, first_ambiguous, vs16_text_codepoints|
95+
if EMOJI_NON_VS16_OPTIONS.include?(options[:emoji])
96+
width_no_overwrite(string, index_full, index_low, first_ambiguous)
97+
else
98+
width_no_overwrite_with_vs16(string, index_full, index_low, first_ambiguous, vs16_text_codepoints)
99+
end
72100
end
73101
end
74102

@@ -102,7 +130,13 @@ def self.width_frame(string, options)
102130
ambiguous_index_name = AMBIGUOUS_MAP[options[:ambiguous]]
103131

104132
# Get general width
105-
res += yield(string, INDEX[ambiguous_index_name], FIRST_4096[ambiguous_index_name], FIRST_AMBIGUOUS[ambiguous_index_name])
133+
res += yield(
134+
string,
135+
INDEX[ambiguous_index_name],
136+
FIRST_4096[ambiguous_index_name],
137+
FIRST_AMBIGUOUS[ambiguous_index_name],
138+
VS16_TEXT_CODEPOINTS[ambiguous_index_name]
139+
)
106140

107141
# Return result + prevent negative lengths
108142
res < 0 ? 0 : res
@@ -139,26 +173,73 @@ def self.width_no_overwrite(string, index_full, index_low, first_ambiguous, _ =
139173
res
140174
end
141175

176+
def self.width_no_overwrite_with_vs16(string, index_full, index_low, first_ambiguous, vs16_text_codepoints)
177+
res = 0
178+
179+
# Make sure we have UTF-8
180+
string = string.encode(Encoding::UTF_8) unless string.encoding.name == "utf-8"
181+
182+
# Track last codepoint and apply VS16 adjustment if necessary
183+
last_codepoint = nil
184+
185+
string.scan(/.{,80}/m){ |batch|
186+
if batch.ascii_only?
187+
res += batch.size
188+
else
189+
batch.each_codepoint{ |codepoint|
190+
if codepoint > 15 && codepoint < first_ambiguous
191+
res += 1
192+
elsif codepoint < 0x1001
193+
res += index_low[codepoint] || 1
194+
elsif codepoint == VS16 && vs16_text_codepoints.include?(last_codepoint)
195+
res += 1
196+
else
197+
d = INITIAL_DEPTH
198+
c = codepoint
199+
w = index_full[c / d]
200+
while w.instance_of? Array
201+
w = w[(c %= d) / (d /= 16)]
202+
end
203+
204+
res += w || 1
205+
end
206+
207+
last_codepoint = codepoint
208+
}
209+
end
210+
}
211+
212+
res
213+
end
214+
142215
# Same as .width_no_overwrite - but with applying overwrites for each char
143-
def self.width_all_features(string, index_full, index_low, first_ambiguous, overwrite)
216+
def self.width_all_features(string, index_full, index_low, first_ambiguous, overwrite, vs16_text_codepoints)
144217
res = 0
145218

219+
# Track last codepoint and apply VS16 adjustment if necessary
220+
last_codepoint = nil
221+
146222
string.each_codepoint{ |codepoint|
147223
if overwrite[codepoint]
148224
res += overwrite[codepoint]
149225
elsif codepoint > 15 && codepoint < first_ambiguous
150226
res += 1
151227
elsif codepoint < 0x1001
152228
res += index_low[codepoint] || 1
229+
elsif codepoint == VS16 && vs16_text_codepoints && vs16_text_codepoints.include?(last_codepoint)
230+
res += 1
153231
else
154232
d = INITIAL_DEPTH
155-
w = index_full[codepoint / d]
233+
c = codepoint
234+
w = index_full[c / d]
156235
while w.instance_of? Array
157-
w = w[(codepoint %= d) / (d /= 16)]
236+
w = w[(c %= d) / (d /= 16)]
158237
end
159238

160239
res += w || 1
161240
end
241+
242+
last_codepoint = codepoint
162243
}
163244

164245
res
@@ -177,35 +258,13 @@ def self.emoji_width(string, mode = :all, ambiguous = DEFAULT_AMBIGUOUS)
177258
mode == :rgi_at,
178259
ambiguous,
179260
)
180-
elsif mode == :all_no_vs16
261+
elsif mode == :all_no_vs16 || mode == :all
181262
emoji_width_all(string)
182-
elsif mode == :vs16
183-
emoji_width_basic(string)
184-
elsif mode == :all
185-
res_all, string = emoji_width_all(string)
186-
res_basic, string = emoji_width_basic(string)
187-
[res_all + res_basic, string]
188263
else
189264
[0, string]
190265
end
191266
end
192267

193-
# Ensure all explicit VS16 sequences have width 2
194-
def self.emoji_width_basic(string)
195-
res = 0
196-
197-
no_emoji_string = string.gsub(REGEX_EMOJI_BASIC_OR_KEYCAP){ |basic_emoji|
198-
if basic_emoji.size >= 2 # VS16 present
199-
res += 2
200-
""
201-
else
202-
basic_emoji
203-
end
204-
}
205-
206-
[res, no_emoji_string]
207-
end
208-
209268
# Use simplistic ZWJ/modifier/keycap sequence matching
210269
def self.emoji_width_all(string)
211270
res = 0
@@ -226,31 +285,20 @@ def self.emoji_width_via_possible(string, emoji_set_regex, strict_eaw = false, a
226285
no_emoji_string = string.gsub(Unicode::Emoji::REGEX_POSSIBLE){ |emoji_candidate|
227286
# Skip notorious false positives
228287
if REGEX_EMOJI_NOT_POSSIBLE.match?(emoji_candidate)
229-
emoji_candidate
288+
res += 1
289+
""
230290

231291
# Check if we have a combined Emoji with width 2 (or with its East Asian Width on Apple Terminal)
232292
elsif emoji_candidate == emoji_candidate[emoji_set_regex]
233293
if strict_eaw
234-
res += self.of(emoji_candidate[0], ambiguous, emoji: false)
294+
res += self.width_in_index(emoji_candidate[0].ord, INDEX[AMBIGUOUS_MAP[ambiguous]])
235295
else
236296
res += 2
237297
end
238298
""
239299

240-
# We are dealing with a default text presentation emoji or a well-formed sequence not matching the above Emoji set
300+
# Use other counting mechanisms
241301
else
242-
if !strict_eaw
243-
# Ensure all explicit VS16 sequences have width 2
244-
emoji_candidate.gsub!(Unicode::Emoji::REGEX_BASIC){ |basic_emoji|
245-
if basic_emoji.size == 2 # VS16 present
246-
res += 2
247-
""
248-
else
249-
basic_emoji
250-
end
251-
}
252-
end
253-
254302
emoji_candidate
255303
end
256304
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy