From 169ad17b40f2279f139059e2b8e0764ad74ae577 Mon Sep 17 00:00:00 2001 From: Jan Lelis Date: Tue, 19 Nov 2024 00:26:42 +0100 Subject: [PATCH] Use non-regex approach for VS16 adjustments --- CHANGELOG.md | 4 + lib/unicode/display_width.rb | 142 +++++++++++++++++++++++------------ 2 files changed, 99 insertions(+), 47 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c67d671..c46f69a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # CHANGELOG +## 3.1.1 (unreleased) + +- Performance improvements + ## 3.1.0 **Improve Emoji support:** diff --git a/lib/unicode/display_width.rb b/lib/unicode/display_width.rb index d763659..717958a 100644 --- a/lib/unicode/display_width.rb +++ b/lib/unicode/display_width.rb @@ -8,8 +8,17 @@ module Unicode class DisplayWidth - DEFAULT_AMBIGUOUS = 1 INITIAL_DEPTH = 0x10000 + def self.width_in_index(codepoint, index) + d = INITIAL_DEPTH + w = index[codepoint / d] + while w.instance_of? Array + w = w[(codepoint %= d) / (d /= 16)] + end + w || 1 + end + + DEFAULT_AMBIGUOUS = 1 ASCII_NON_ZERO_REGEX = /[\0\x05\a\b\n\v\f\r\x0E\x0F]/ ASCII_NON_ZERO_STRING = "\0\x05\a\b\n\v\f\r\x0E\x0F" ASCII_BACKSPACE = "\b" @@ -25,11 +34,19 @@ class DisplayWidth WIDTH_ONE: decompress_index(INDEX[:WIDTH_ONE][0][0], 1), WIDTH_TWO: decompress_index(INDEX[:WIDTH_TWO][0][0], 1), } + VS16_TEXT_CODEPOINTS = { + WIDTH_ONE: Unicode::Emoji::TEXT_PRESENTATION - Unicode::Emoji::EMOJI_COMPONENT, + WIDTH_TWO: (Unicode::Emoji::TEXT_PRESENTATION - Unicode::Emoji::EMOJI_COMPONENT).reject{ |codepoint| + width_in_index(codepoint, INDEX[:WIDTH_TWO]) == 2 + }, + } EMOJI_SEQUENCES_REGEX_MAPPING = { rgi: :REGEX_INCLUDE_MQE_UQE, rgi_at: :REGEX_INCLUDE_MQE_UQE, possible: :REGEX_WELL_FORMED, } + EMOJI_NON_VS16_OPTIONS = [:all_no_vs16, :rgi_at, :none, false] + VS16 = 0xFE0F REGEX_EMOJI_BASIC_OR_KEYCAP = Regexp.union(Unicode::Emoji::REGEX_BASIC, Unicode::Emoji::REGEX_EMOJI_KEYCAP) REGEX_EMOJI_ALL_SEQUENCES = Regexp.union(/.[\u{1F3FB}-\u{1F3FF}\u{FE0F}]?(\u{200D}.[\u{1F3FB}-\u{1F3FF}\u{FE0F}]?)+/, Unicode::Emoji::REGEX_EMOJI_KEYCAP) REGEX_EMOJI_NOT_POSSIBLE = /\A[#*0-9]\z/ @@ -61,14 +78,25 @@ def self.of(string, ambiguous = nil, overwrite = nil, old_options = {}, **option # # # if !options[:overwrite].empty? - return width_frame(string, options) do |string, index_full, index_low, first_ambiguous| - width_all_features(string, index_full, index_low, first_ambiguous, options[:overwrite]) + return width_frame(string, options) do |string, index_full, index_low, first_ambiguous, vs16_text_codepoints| + width_all_features( + string, + index_full, + index_low, + first_ambiguous, + options[:overwrite], + EMOJI_NON_VS16_OPTIONS.include?(options[:emoji]) ? nil : vs16_text_codepoints + ) end end if !string.ascii_only? - return width_frame(string, options) do |string, index_full, index_low, first_ambiguous| - width_no_overwrite(string, index_full, index_low, first_ambiguous) + return width_frame(string, options) do |string, index_full, index_low, first_ambiguous, vs16_text_codepoints| + if EMOJI_NON_VS16_OPTIONS.include?(options[:emoji]) + width_no_overwrite(string, index_full, index_low, first_ambiguous) + else + width_no_overwrite_with_vs16(string, index_full, index_low, first_ambiguous, vs16_text_codepoints) + end end end @@ -102,7 +130,13 @@ def self.width_frame(string, options) ambiguous_index_name = AMBIGUOUS_MAP[options[:ambiguous]] # Get general width - res += yield(string, INDEX[ambiguous_index_name], FIRST_4096[ambiguous_index_name], FIRST_AMBIGUOUS[ambiguous_index_name]) + res += yield( + string, + INDEX[ambiguous_index_name], + FIRST_4096[ambiguous_index_name], + FIRST_AMBIGUOUS[ambiguous_index_name], + VS16_TEXT_CODEPOINTS[ambiguous_index_name] + ) # Return result + prevent negative lengths res < 0 ? 0 : res @@ -139,10 +173,52 @@ def self.width_no_overwrite(string, index_full, index_low, first_ambiguous, _ = res end + def self.width_no_overwrite_with_vs16(string, index_full, index_low, first_ambiguous, vs16_text_codepoints) + res = 0 + + # Make sure we have UTF-8 + string = string.encode(Encoding::UTF_8) unless string.encoding.name == "utf-8" + + # Track last codepoint and apply VS16 adjustment if necassary + last_codepoint = nil + + string.scan(/.{,80}/m){ |batch| + if batch.ascii_only? + res += batch.size + else + batch.each_codepoint{ |codepoint| + if codepoint > 15 && codepoint < first_ambiguous + res += 1 + elsif codepoint < 0x1001 + res += index_low[codepoint] || 1 + elsif codepoint == VS16 && vs16_text_codepoints.include?(last_codepoint) + res += 1 + else + d = INITIAL_DEPTH + c = codepoint + w = index_full[c / d] + while w.instance_of? Array + w = w[(c %= d) / (d /= 16)] + end + + res += w || 1 + end + + last_codepoint = codepoint + } + end + } + + res + end + # Same as .width_no_overwrite - but with applying overwrites for each char - def self.width_all_features(string, index_full, index_low, first_ambiguous, overwrite) + def self.width_all_features(string, index_full, index_low, first_ambiguous, overwrite, vs16_text_codepoints) res = 0 + # Track last codepoint and apply VS16 adjustment if necassary + last_codepoint = nil + string.each_codepoint{ |codepoint| if overwrite[codepoint] res += overwrite[codepoint] @@ -150,15 +226,20 @@ def self.width_all_features(string, index_full, index_low, first_ambiguous, over res += 1 elsif codepoint < 0x1001 res += index_low[codepoint] || 1 + elsif codepoint == VS16 && vs16_text_codepoints && vs16_text_codepoints.include?(last_codepoint) + res += 1 else d = INITIAL_DEPTH - w = index_full[codepoint / d] + c = codepoint + w = index_full[c / d] while w.instance_of? Array - w = w[(codepoint %= d) / (d /= 16)] + w = w[(c %= d) / (d /= 16)] end res += w || 1 end + + last_codepoint = codepoint } res @@ -177,35 +258,13 @@ def self.emoji_width(string, mode = :all, ambiguous = DEFAULT_AMBIGUOUS) mode == :rgi_at, ambiguous, ) - elsif mode == :all_no_vs16 + elsif mode == :all_no_vs16 || mode == :all emoji_width_all(string) - elsif mode == :vs16 - emoji_width_basic(string) - elsif mode == :all - res_all, string = emoji_width_all(string) - res_basic, string = emoji_width_basic(string) - [res_all + res_basic, string] else [0, string] end end - # Ensure all explicit VS16 sequences have width 2 - def self.emoji_width_basic(string) - res = 0 - - no_emoji_string = string.gsub(REGEX_EMOJI_BASIC_OR_KEYCAP){ |basic_emoji| - if basic_emoji.size >= 2 # VS16 present - res += 2 - "" - else - basic_emoji - end - } - - [res, no_emoji_string] - end - # Use simplistic ZWJ/modifier/kecap sequence matching def self.emoji_width_all(string) res = 0 @@ -226,31 +285,20 @@ def self.emoji_width_via_possible(string, emoji_set_regex, strict_eaw = false, a no_emoji_string = string.gsub(Unicode::Emoji::REGEX_POSSIBLE){ |emoji_candidate| # Skip notorious false positives if REGEX_EMOJI_NOT_POSSIBLE.match?(emoji_candidate) - emoji_candidate + res += 1 + "" # Check if we have a combined Emoji with width 2 (or EAW an Apple Terminal) elsif emoji_candidate == emoji_candidate[emoji_set_regex] if strict_eaw - res += self.of(emoji_candidate[0], ambiguous, emoji: false) + res += self.width_in_index(emoji_candidate[0].ord, INDEX[AMBIGUOUS_MAP[ambiguous]]) else res += 2 end "" - # We are dealing with a default text presentation emoji or a well-formed sequence not matching the above Emoji set + # Use other counting mechanisms else - if !strict_eaw - # Ensure all explicit VS16 sequences have width 2 - emoji_candidate.gsub!(Unicode::Emoji::REGEX_BASIC){ |basic_emoji| - if basic_emoji.size == 2 # VS16 present - res += 2 - "" - else - basic_emoji - end - } - end - emoji_candidate end } pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy