Skip to content

Commit b83714b

Browse files
Add API to correct defective combining character sequences
1 parent a6a221a commit b83714b

File tree

5 files changed

+413
-1
lines changed

5 files changed

+413
-1
lines changed

scripts/unicode.py

Lines changed: 114 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
# Since this should not require frequent updates, we just store this
2020
# out-of-line and check the tables.rs and normalization_tests.rs files into git.
2121
import collections
22+
import re
2223
import urllib.request
2324

2425
UNICODE_VERSION = "15.1.0"
@@ -66,6 +67,8 @@
6667
class UnicodeData(object):
6768
def __init__(self):
6869
self._load_unicode_data()
70+
self._load_default_ignorable_marks()
71+
6972
self.norm_props = self._load_norm_props()
7073
self.norm_tests = self._load_norm_tests()
7174

@@ -100,6 +103,11 @@ def _load_unicode_data(self):
100103
self.general_category_mark = []
101104
self.general_category_public_assigned = []
102105

106+
# Characters that cannot be part of a combining character sequence:
107+
# control characters, format characters other than ZWJ and ZWNJ,
108+
# the line and paragraph separators, and noncharacters.
109+
self.not_in_ccs = []
110+
103111
assigned_start = 0;
104112
prev_char_int = -1;
105113
prev_name = "";
@@ -125,6 +133,9 @@ def _load_unicode_data(self):
125133
if category == 'M' or 'M' in expanded_categories.get(category, []):
126134
self.general_category_mark.append(char_int)
127135

136+
if category in ['Cc', 'Cf', 'Zl', 'Zp'] and char_int not in [0x200C, 0x200D]:
137+
self.not_in_ccs.append(char_int)
138+
128139
assert category != 'Cn', "Unexpected: Unassigned codepoint in UnicodeData.txt"
129140
if category not in ['Co', 'Cs']:
130141
if char_int != prev_char_int + 1 and not is_first_and_last(prev_name, name):
@@ -135,6 +146,44 @@ def _load_unicode_data(self):
135146

136147
self.general_category_public_assigned.append((assigned_start, prev_char_int))
137148

149+
# Mark noncharacters as nongraphic
150+
for i in range(0xFDD0, 0xFDF0):
151+
self.not_in_ccs.append(i)
152+
for prefix in range(0, 0x11):
153+
shifted = prefix << 16
154+
self.not_in_ccs.append(shifted | 0xFFFE)
155+
self.not_in_ccs.append(shifted | 0xFFFF)
156+
157+
self.not_in_ccs.sort()
158+
159+
def _load_default_ignorable_marks(self):
160+
default_ignorable_cps = set()
161+
162+
single = re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+")
163+
multiple = re.compile(
164+
r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+"
165+
)
166+
167+
for line in self._fetch("DerivedCoreProperties.txt").splitlines():
168+
raw_data = None # (low, high)
169+
if match := single.match(line):
170+
raw_data = (match.group(1), match.group(1))
171+
elif match := multiple.match(line):
172+
raw_data = (match.group(1), match.group(2))
173+
else:
174+
continue
175+
low = int(raw_data[0], 16)
176+
high = int(raw_data[1], 16)
177+
for cp in range(low, high + 1):
178+
default_ignorable_cps.add(cp)
179+
180+
self.default_ignorable_marks = []
181+
for cp in self.general_category_mark:
182+
if cp in default_ignorable_cps:
183+
self.default_ignorable_marks.append(cp)
184+
185+
self.default_ignorable_marks.sort()
186+
138187
def _load_cjk_compat_ideograph_variants(self):
139188
for line in self._fetch("StandardizedVariants.txt").splitlines():
140189
strip_comments = line.split('#', 1)[0].strip()
@@ -454,7 +503,7 @@ def gen_combining_mark(general_category_mark, out):
454503

455504
def gen_public_assigned(general_category_public_assigned, out):
456505
# This could be done as a hash but the table is somewhat small.
457-
out.write("#[inline]\n")
506+
out.write("\n#[inline]\n")
458507
out.write("pub fn is_public_assigned(c: char) -> bool {\n")
459508
out.write(" match c {\n")
460509

@@ -476,6 +525,66 @@ def gen_public_assigned(general_category_public_assigned, out):
476525
out.write("}\n")
477526
out.write("\n")
478527

528+
def _gen_codepoint_predicate(fn_name, codepoints, out):
    # Shared emitter for the two table generators below.
    #
    # Writes a Rust function `pub fn <fn_name>(c: char) -> bool` to `out`
    # whose `match` has one arm alternative per run of consecutive code
    # points in `codepoints`. Runs are only detected between neighbouring
    # list entries, so callers pass the list in ascending order.
    ranges = []
    for cp in codepoints:
        if ranges and ranges[-1][1] == cp - 1:
            # Extends the current run by one code point.
            ranges[-1] = (ranges[-1][0], cp)
        else:
            ranges.append((cp, cp))

    out.write("\n#[inline]\n")
    out.write("pub fn %s(c: char) -> bool {\n" % fn_name)
    out.write(" match c {\n")

    alternatives = []
    for first, last in ranges:
        if first == last:
            alternatives.append("'\\u{%s}'" % hexify(first))
        else:
            alternatives.append(
                "'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last))
            )
    if alternatives:
        # The first alternative gets a plain prefix; later ones are joined
        # onto new lines with a leading `|`.
        out.write(" " + "\n | ".join(alternatives))
    out.write(" => true,\n")

    out.write(" _ => false,\n")
    out.write(" }\n")
    out.write("}\n")


def gen_not_in_ccs(not_in_ccs, out):
    # Emits `not_in_ccs(c)`: true for the code points collected in
    # `not_in_ccs` (those that cannot be part of a combining character
    # sequence).
    _gen_codepoint_predicate("not_in_ccs", not_in_ccs, out)


def gen_default_ignorable_mark(default_ignorable_marks, out):
    # Emits `is_default_ignorable_mark(c)`: true for the code points
    # collected in `default_ignorable_marks`.
    _gen_codepoint_predicate("is_default_ignorable_mark", default_ignorable_marks, out)
587+
479588
def gen_stream_safe(leading, trailing, out):
480589
# This could be done as a hash but the table is very small.
481590
out.write("#[inline]\n")
@@ -602,6 +711,10 @@ def minimal_perfect_hash(d):
602711
gen_public_assigned(data.general_category_public_assigned, out)
603712
out.write("\n")
604713

714+
gen_not_in_ccs(data.not_in_ccs, out)
715+
716+
gen_default_ignorable_mark(data.default_ignorable_marks, out)
717+
605718
gen_nfc_qc(data.norm_props, out)
606719
out.write("\n")
607720

src/correct_ccs.rs

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
#[cfg(not(feature = "std"))]
2+
use alloc::collections::VecDeque;
3+
use core::iter::FusedIterator;
4+
#[cfg(feature = "std")]
5+
use std::collections::VecDeque;
6+
7+
use crate::{lookups, tables};
8+
9+
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
10+
enum CcsKind {
11+
/// A CCS base character (graphic character other than combining mark).
12+
Base,
13+
14+
/// A combining character other than a `Default_Ignorable_Code_Point`.
15+
NonIgnorableCombining,
16+
17+
/// A default-ignorable combining character, ZWJ, or ZWNJ.
18+
IgnorableCombining,
19+
}
20+
21+
impl CcsKind {
22+
fn of(c: char) -> Option<Self> {
23+
if c == '\u{200C}' || c == '\u{200D}' {
24+
// ZWNJ || ZWJ
25+
Some(CcsKind::IgnorableCombining)
26+
} else if lookups::is_combining_mark(c) {
27+
if tables::is_default_ignorable_mark(c) {
28+
Some(CcsKind::IgnorableCombining)
29+
} else {
30+
Some(CcsKind::NonIgnorableCombining)
31+
}
32+
} else if tables::not_in_ccs(c) {
33+
None
34+
} else {
35+
Some(CcsKind::Base)
36+
}
37+
}
38+
}
39+
40+
/// An iterator over the string that corrects
/// [defective combining character sequences](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I6.1.36487)
/// by inserting U+00A0 NO-BREAK SPACE in front of them.
///
/// For the purposes of this iterator, private use characters,
/// as well as unassigned codepoints other than noncharacters,
/// are considered valid base characters,
/// so combining character sequences that start with such will not be modified.
///
/// In addition, combining character sequences that consist entirely of `Default_Ignorable_Code_Point`s
/// will not be modified. (Because of this, this iterator may buffer up to the entire length of its input;
/// it is *not* "stream-safe" *even if* used with [`StreamSafe`][crate::StreamSafe]).
#[derive(Clone, Debug)]
pub struct CorrectDefectiveCcs<I> {
    /// Whether the last character emitted was part of a CCS.
    in_ccs: bool,
    /// Items already pulled from `iter` but not yet emitted, oldest first.
    /// A `None` entry is the end-of-input result of the inner iterator,
    /// queued so that it is reported only after the characters before it.
    buffer: VecDeque<Option<char>>,
    /// Whether the last character in `buffer` is part of a CCS.
    /// (Updated only when `in_ccs` is set from false to true).
    end_of_buffer_in_ccs: bool,
    /// The wrapped character iterator.
    iter: I,
}
62+
63+
impl<I: Iterator<Item = char>> Iterator for CorrectDefectiveCcs<I> {
64+
type Item = char;
65+
66+
fn next(&mut self) -> Option<Self::Item> {
67+
if self.in_ccs {
68+
if let Some(c) = self.buffer.pop_front() {
69+
// Empty buffer
70+
71+
if self.buffer.is_empty() {
72+
self.in_ccs = self.end_of_buffer_in_ccs;
73+
}
74+
c
75+
} else {
76+
// Forward from inner iterator
77+
78+
let c = self.iter.next();
79+
if c.map_or(true, tables::not_in_ccs) {
80+
self.in_ccs = false;
81+
}
82+
c
83+
}
84+
} else {
85+
if self.buffer.is_empty() {
86+
// We don't have a buffer of default ignorable combining characters built up
87+
88+
let c = self.iter.next()?;
89+
match CcsKind::of(c) {
90+
// Character not in CCS, just forward it
91+
None => return Some(c),
92+
93+
// Character starts non-defective CCS,
94+
// label ourselves as in CCS and forward it
95+
Some(CcsKind::Base) => {
96+
self.in_ccs = true;
97+
return Some(c);
98+
}
99+
100+
// Character starts defective CCS and is not default-ignorable.
101+
// Put it in the buffer to emit on next iteration,
102+
// mark ourselves as in CCS,
103+
// and emit NO-BREAK SPACE
104+
Some(CcsKind::NonIgnorableCombining) => {
105+
self.in_ccs = true;
106+
self.end_of_buffer_in_ccs = true;
107+
self.buffer.push_back(Some(c));
108+
return Some('\u{00A0}'); // NO-BREAK SPACE
109+
}
110+
111+
// Character starts defective CCS and is default-ignorable.
112+
// Put it in the buffer, and fall through to loop below
113+
// to find out whether we emit a NO-BREAK SPACE first.
114+
Some(CcsKind::IgnorableCombining) => {
115+
self.buffer.push_back(Some(c));
116+
}
117+
}
118+
}
119+
120+
loop {
121+
// We do have a buffer of default ignorable combining characters built up,
122+
// and we need to figure out whether to emit a NO-BREAK SPACE first.
123+
124+
let c = self.iter.next();
125+
match c.and_then(CcsKind::of) {
126+
// Inner iterator yielded character outside CCS (or `None`).
127+
// Emit the built-up buffer with no leading NO-BREAK SPACE.
128+
None => {
129+
self.in_ccs = true;
130+
self.end_of_buffer_in_ccs = false;
131+
let ret = self.buffer.pop_front().unwrap();
132+
self.buffer.push_back(c);
133+
return ret;
134+
}
135+
136+
// Inner iterator yielded character that starts a new CCS.
137+
// Emit the built-up buffer with no leading NO-BREAK SPACE.
138+
Some(CcsKind::Base) => {
139+
self.in_ccs = true;
140+
self.end_of_buffer_in_ccs = true;
141+
let ret = self.buffer.pop_front().unwrap();
142+
self.buffer.push_back(c);
143+
return ret;
144+
}
145+
146+
// Inner iterator yielded non-ignorable combining character.
147+
// Emit the built-up buffer with leading NO-BREAK SPACE.
148+
Some(CcsKind::NonIgnorableCombining) => {
149+
self.in_ccs = true;
150+
self.end_of_buffer_in_ccs = true;
151+
self.buffer.push_back(c);
152+
return Some('\u{00A0}'); // NO-BREAK SPACE
153+
}
154+
155+
// Inner iterator yielded ignorable combining character.
156+
// Add it to the buffer, don't emit anything.
157+
Some(CcsKind::IgnorableCombining) => {
158+
self.buffer.push_back(c);
159+
}
160+
}
161+
}
162+
}
163+
}
164+
}
165+
166+
// Marker impl: the adapter is declared fused only when the wrapped
// iterator `I` is itself a `FusedIterator`.
impl<I: Iterator<Item = char> + FusedIterator> FusedIterator for CorrectDefectiveCcs<I> {}
167+
168+
impl<I> CorrectDefectiveCcs<I> {
169+
pub(crate) fn new(iter: I) -> Self {
170+
Self {
171+
in_ccs: false,
172+
buffer: VecDeque::new(),
173+
end_of_buffer_in_ccs: false,
174+
iter,
175+
}
176+
}
177+
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy