Skip to content

Commit d4decae

Browse files
Add API to correct defective combining character sequences
1 parent ac8fa20 commit d4decae

File tree

5 files changed

+413
-1
lines changed

5 files changed

+413
-1
lines changed

scripts/unicode.py

Lines changed: 114 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
# Since this should not require frequent updates, we just store this
2020
# out-of-line and check the tables.rs and normalization_tests.rs files into git.
2121
import collections
22+
import re
2223
import urllib.request
2324
from itertools import batched
2425

@@ -67,6 +68,8 @@
6768
class UnicodeData(object):
6869
def __init__(self):
6970
self._load_unicode_data()
71+
self._load_default_ignorable_marks()
72+
7073
self.norm_props = self._load_norm_props()
7174
self.norm_tests = self._load_norm_tests()
7275

@@ -101,6 +104,11 @@ def _load_unicode_data(self):
101104
self.general_category_mark = []
102105
self.general_category_public_assigned = []
103106

107+
# Characters that cannot be part of a combining character sequence:
108+
# control characters, format characters other than ZWJ and ZWNJ,
109+
# the line and paragraph separators, and noncharacters.
110+
self.not_in_ccs = []
111+
104112
assigned_start = 0;
105113
prev_char_int = -1;
106114
prev_name = "";
@@ -126,6 +134,9 @@ def _load_unicode_data(self):
126134
if category == 'M' or 'M' in expanded_categories.get(category, []):
127135
self.general_category_mark.append(char_int)
128136

137+
if category in ['Cc', 'Cf', 'Zl', 'Zp'] and char_int not in [0x200C, 0x200D]:
138+
self.not_in_ccs.append(char_int)
139+
129140
assert category != 'Cn', "Unexpected: Unassigned codepoint in UnicodeData.txt"
130141
if category not in ['Co', 'Cs']:
131142
if char_int != prev_char_int + 1 and not is_first_and_last(prev_name, name):
@@ -136,6 +147,44 @@ def _load_unicode_data(self):
136147

137148
self.general_category_public_assigned.append((assigned_start, prev_char_int))
138149

150+
# Noncharacters cannot be part of a combining character sequence either
151+
for i in range(0xFDD0, 0xFDF0):
152+
self.not_in_ccs.append(i)
153+
for prefix in range(0, 0x11):
154+
shifted = prefix << 16
155+
self.not_in_ccs.append(shifted | 0xFFFE)
156+
self.not_in_ccs.append(shifted | 0xFFFF)
157+
158+
self.not_in_ccs.sort()
159+
160+
def _load_default_ignorable_marks(self):
161+
default_ignorable_cps = set()
162+
163+
single = re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+")
164+
multiple = re.compile(
165+
r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+"
166+
)
167+
168+
for line in self._fetch("DerivedCoreProperties.txt").splitlines():
169+
raw_data = None # (low, high)
170+
if match := single.match(line):
171+
raw_data = (match.group(1), match.group(1))
172+
elif match := multiple.match(line):
173+
raw_data = (match.group(1), match.group(2))
174+
else:
175+
continue
176+
low = int(raw_data[0], 16)
177+
high = int(raw_data[1], 16)
178+
for cp in range(low, high + 1):
179+
default_ignorable_cps.add(cp)
180+
181+
self.default_ignorable_marks = []
182+
for cp in self.general_category_mark:
183+
if cp in default_ignorable_cps:
184+
self.default_ignorable_marks.append(cp)
185+
186+
self.default_ignorable_marks.sort()
187+
139188
def _load_cjk_compat_ideograph_variants(self):
140189
for line in self._fetch("StandardizedVariants.txt").splitlines():
141190
strip_comments = line.split('#', 1)[0].strip()
@@ -461,7 +510,7 @@ def gen_combining_mark(general_category_mark, out):
461510

462511
def gen_public_assigned(general_category_public_assigned, out):
463512
# This could be done as a hash but the table is somewhat small.
464-
out.write("#[inline]\n")
513+
out.write("\n#[inline]\n")
465514
out.write("pub fn is_public_assigned(c: char) -> bool {\n")
466515
out.write(" match c {\n")
467516

@@ -482,6 +531,66 @@ def gen_public_assigned(general_category_public_assigned, out):
482531
out.write(" }\n")
483532
out.write("}\n")
484533

534+
def gen_not_in_ccs(not_in_ccs, out):
535+
# Collapse the sorted list of codepoints into a list of inclusive (first, last) ranges
536+
range_list = []
537+
for cp in not_in_ccs:
538+
if len(range_list) != 0 and range_list[-1][1] == cp - 1:
539+
range_list[-1] = (range_list[-1][0], cp)
540+
else:
541+
range_list.append((cp, cp))
542+
543+
out.write("\n#[inline]\n")
544+
out.write("pub fn not_in_ccs(c: char) -> bool {\n")
545+
out.write(" match c {\n")
546+
547+
start = True
548+
for first, last in range_list:
549+
if start:
550+
out.write(" ")
551+
start = False
552+
else:
553+
out.write("\n | ")
554+
if first == last:
555+
out.write("'\\u{%s}'" % hexify(first))
556+
else:
557+
out.write("'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last)))
558+
out.write(" => true,\n")
559+
560+
out.write(" _ => false,\n")
561+
out.write(" }\n")
562+
out.write("}\n")
563+
564+
def gen_default_ignorable_mark(default_ignorable_marks, out):
565+
# Collapse the sorted list of codepoints into a list of inclusive (first, last) ranges
566+
range_list = []
567+
for cp in default_ignorable_marks:
568+
if len(range_list) != 0 and range_list[-1][1] == cp - 1:
569+
range_list[-1] = (range_list[-1][0], cp)
570+
else:
571+
range_list.append((cp, cp))
572+
573+
out.write("\n#[inline]\n")
574+
out.write("pub fn is_default_ignorable_mark(c: char) -> bool {\n")
575+
out.write(" match c {\n")
576+
577+
start = True
578+
for first, last in range_list:
579+
if start:
580+
out.write(" ")
581+
start = False
582+
else:
583+
out.write("\n | ")
584+
if first == last:
585+
out.write("'\\u{%s}'" % hexify(first))
586+
else:
587+
out.write("'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last)))
588+
out.write(" => true,\n")
589+
590+
out.write(" _ => false,\n")
591+
out.write(" }\n")
592+
out.write("}\n")
593+
485594
def gen_stream_safe(leading, trailing, out):
486595
# This could be done as a hash but the table is very small.
487596
out.write("#[inline]\n")
@@ -602,6 +711,10 @@ def minimal_perfect_hash(d):
602711

603712
gen_public_assigned(data.general_category_public_assigned, out)
604713

714+
gen_not_in_ccs(data.not_in_ccs, out)
715+
716+
gen_default_ignorable_mark(data.default_ignorable_marks, out)
717+
605718
gen_nfc_qc(data.norm_props, out)
606719

607720
gen_nfkc_qc(data.norm_props, out)

src/correct_ccs.rs

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
#[cfg(not(feature = "std"))]
2+
use alloc::collections::VecDeque;
3+
use core::iter::FusedIterator;
4+
#[cfg(feature = "std")]
5+
use std::collections::VecDeque;
6+
7+
use crate::{lookups, tables};
8+
9+
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
10+
enum CcsKind {
11+
/// A CCS base character (graphic character other than combining mark).
12+
Base,
13+
14+
/// A combining character other than a `Default_Ignorable_Code_Point`.
15+
NonIgnorableCombining,
16+
17+
/// A default-ignorable combining character, ZWJ, or ZWNJ.
18+
IgnorableCombining,
19+
}
20+
21+
impl CcsKind {
22+
fn of(c: char) -> Option<Self> {
23+
if c == '\u{200C}' || c == '\u{200D}' {
24+
// ZWNJ || ZWJ
25+
Some(CcsKind::IgnorableCombining)
26+
} else if lookups::is_combining_mark(c) {
27+
if tables::is_default_ignorable_mark(c) {
28+
Some(CcsKind::IgnorableCombining)
29+
} else {
30+
Some(CcsKind::NonIgnorableCombining)
31+
}
32+
} else if tables::not_in_ccs(c) {
33+
None
34+
} else {
35+
Some(CcsKind::Base)
36+
}
37+
}
38+
}
39+
40+
/// An iterator over the string that corrects
41+
/// [defective combining character sequences](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I6.1.36487)
42+
/// by inserting U+00A0 NO-BREAK SPACE in front of them.
43+
///
44+
/// For the purposes of this iterator, private use characters,
45+
/// as well as unassigned codepoints other than noncharacters,
46+
/// are considered valid base characters,
47+
/// so combining character sequences that start with such will not be modified.
48+
///
49+
/// In addition, combining character sequences that consist entirely of `Default_Ignorable_Code_Point`s
50+
/// will not be modified. (Because of this, this iterator may buffer up to the entire length of its input;
51+
/// it is *not* "stream-safe" *even if* used with [`StreamSafe`][crate::StreamSafe]).
52+
#[derive(Clone, Debug)]
53+
pub struct CorrectDefectiveCcs<I> {
54+
/// Whether the last character emitted was part of a CCS.
55+
in_ccs: bool,
56+
buffer: VecDeque<Option<char>>,
57+
/// Whether the last character in `buffer` is part of a CCS.
58+
/// (Updated only when `is_ccs` is set from false to true).
59+
end_of_buffer_in_ccs: bool,
60+
iter: I,
61+
}
62+
63+
impl<I: Iterator<Item = char>> Iterator for CorrectDefectiveCcs<I> {
64+
type Item = char;
65+
66+
fn next(&mut self) -> Option<Self::Item> {
67+
if self.in_ccs {
68+
if let Some(c) = self.buffer.pop_front() {
69+
// Drain the buffer before pulling from the inner iterator
70+
71+
if self.buffer.is_empty() {
72+
self.in_ccs = self.end_of_buffer_in_ccs;
73+
}
74+
c
75+
} else {
76+
// Forward from inner iterator
77+
78+
let c = self.iter.next();
79+
if c.map_or(true, tables::not_in_ccs) {
80+
self.in_ccs = false;
81+
}
82+
c
83+
}
84+
} else {
85+
if self.buffer.is_empty() {
86+
// We don't have a buffer of default ignorable combining characters built up
87+
88+
let c = self.iter.next()?;
89+
match CcsKind::of(c) {
90+
// Character not in CCS, just forward it
91+
None => return Some(c),
92+
93+
// Character starts non-defective CCS,
94+
// label ourselves as in CCS and forward it
95+
Some(CcsKind::Base) => {
96+
self.in_ccs = true;
97+
return Some(c);
98+
}
99+
100+
// Character starts defective CCS and is not default-ignorable.
101+
// Put it in the buffer to emit on next iteration,
102+
// mark ourselves as in CCS,
103+
// and emit NO-BREAK SPACE
104+
Some(CcsKind::NonIgnorableCombining) => {
105+
self.in_ccs = true;
106+
self.end_of_buffer_in_ccs = true;
107+
self.buffer.push_back(Some(c));
108+
return Some('\u{00A0}'); // NO-BREAK SPACE
109+
}
110+
111+
// Character starts defective CCS and is default-ignorable.
112+
// Put it in the buffer, and fall through to loop below
113+
// to find out whether we emit a NO-BREAK SPACE first.
114+
Some(CcsKind::IgnorableCombining) => {
115+
self.buffer.push_back(Some(c));
116+
}
117+
}
118+
}
119+
120+
loop {
121+
// We do have a buffer of default ignorable combining characters built up,
122+
// and we need to figure out whether to emit a NO-BREAK SPACE first.
123+
124+
let c = self.iter.next();
125+
match c.and_then(CcsKind::of) {
126+
// Inner iterator yielded character outside CCS (or `None`).
127+
// Emit the built-up buffer with no leading NO-BREAK SPACE.
128+
None => {
129+
self.in_ccs = true;
130+
self.end_of_buffer_in_ccs = false;
131+
let ret = self.buffer.pop_front().unwrap();
132+
self.buffer.push_back(c);
133+
return ret;
134+
}
135+
136+
// Inner iterator yielded character that starts a new CCS.
137+
// Emit the built-up buffer with no leading NO-BREAK SPACE.
138+
Some(CcsKind::Base) => {
139+
self.in_ccs = true;
140+
self.end_of_buffer_in_ccs = true;
141+
let ret = self.buffer.pop_front().unwrap();
142+
self.buffer.push_back(c);
143+
return ret;
144+
}
145+
146+
// Inner iterator yielded non-ignorable combining character.
147+
// Emit the built-up buffer with leading NO-BREAK SPACE.
148+
Some(CcsKind::NonIgnorableCombining) => {
149+
self.in_ccs = true;
150+
self.end_of_buffer_in_ccs = true;
151+
self.buffer.push_back(c);
152+
return Some('\u{00A0}'); // NO-BREAK SPACE
153+
}
154+
155+
// Inner iterator yielded ignorable combining character.
156+
// Add it to the buffer, don't emit anything.
157+
Some(CcsKind::IgnorableCombining) => {
158+
self.buffer.push_back(c);
159+
}
160+
}
161+
}
162+
}
163+
}
164+
}
165+
166+
impl<I: Iterator<Item = char> + FusedIterator> FusedIterator for CorrectDefectiveCcs<I> {}
167+
168+
impl<I> CorrectDefectiveCcs<I> {
169+
pub(crate) fn new(iter: I) -> Self {
170+
Self {
171+
in_ccs: false,
172+
buffer: VecDeque::new(),
173+
end_of_buffer_in_ccs: false,
174+
iter,
175+
}
176+
}
177+
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy