Add new normalization algorithms using Standardized Variants #70

Merged 7 commits on Jan 6, 2021

62 changes: 53 additions & 9 deletions scripts/unicode.py
@@ -14,9 +14,10 @@
# - DerivedNormalizationProps.txt
# - NormalizationTest.txt
# - UnicodeData.txt
# - StandardizedVariants.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the unicode.rs file into git.
# out-of-line and check the tables.rs and normalization_tests.rs files into git.
import collections
import urllib.request

@@ -57,6 +58,11 @@
'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}

# Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
# http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
S_COUNT = L_COUNT * V_COUNT * T_COUNT

class UnicodeData(object):
def __init__(self):
self._load_unicode_data()
@@ -66,6 +72,9 @@ def __init__(self):
self.canon_comp = self._compute_canonical_comp()
self.canon_fully_decomp, self.compat_fully_decomp = self._compute_fully_decomposed()

self.cjk_compat_variants_fully_decomp = {}
self._load_cjk_compat_ideograph_variants()

def stats(name, table):
count = sum(len(v) for v in table.values())
print("%s: %d chars => %d decomposed chars" % (name, len(table), count))
@@ -75,6 +84,7 @@ def stats(name, table):
stats("Compatible decomp", self.compat_decomp)
stats("Canonical fully decomp", self.canon_fully_decomp)
stats("Compatible fully decomp", self.compat_fully_decomp)
stats("CJK Compat Variants fully decomp", self.cjk_compat_variants_fully_decomp)

self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()

@@ -83,6 +93,7 @@ def _fetch(self, filename):
return resp.read().decode('utf-8')

def _load_unicode_data(self):
self.name_to_char_int = {}
self.combining_classes = {}
self.compat_decomp = {}
self.canon_decomp = {}
@@ -95,6 +106,9 @@ def _load_unicode_data(self):
char, category, cc, decomp = pieces[0], pieces[2], pieces[3], pieces[5]
char_int = int(char, 16)

name = pieces[1].strip()
self.name_to_char_int[name] = char_int

if cc != '0':
self.combining_classes[char_int] = cc

@@ -106,6 +120,41 @@ def _load_unicode_data(self):
if category == 'M' or 'M' in expanded_categories.get(category, []):
self.general_category_mark.append(char_int)

def _load_cjk_compat_ideograph_variants(self):
for line in self._fetch("StandardizedVariants.txt").splitlines():
strip_comments = line.split('#', 1)[0].strip()
if not strip_comments:
continue

variation_sequence, description, differences = strip_comments.split(';')
description = description.strip()

# Don't use variations that only apply in particular shaping environments.
if differences:
continue

# Look for entries where the description field is a codepoint name.
if description not in self.name_to_char_int:
continue

# Only consider the CJK Compatibility Ideographs.
if not description.startswith('CJK COMPATIBILITY IDEOGRAPH-'):
continue

char_int = self.name_to_char_int[description]

assert not char_int in self.combining_classes, "Unexpected: CJK compat variant with a combining class"
assert not char_int in self.compat_decomp, "Unexpected: CJK compat variant and compatibility decomposition"
assert len(self.canon_decomp[char_int]) == 1, "Unexpected: CJK compat variant and non-singleton canonical decomposition"
# If we ever need to handle Hangul here, we'll need to handle it separately.
assert not (S_BASE <= char_int < S_BASE + S_COUNT)

cjk_compat_variant_parts = [int(c, 16) for c in variation_sequence.split()]
for c in cjk_compat_variant_parts:
assert not c in self.canon_decomp, "Unexpected: CJK compat variant is unnormalized (canon)"
assert not c in self.compat_decomp, "Unexpected: CJK compat variant is unnormalized (compat)"
self.cjk_compat_variants_fully_decomp[char_int] = cjk_compat_variant_parts

def _load_norm_props(self):
props = collections.defaultdict(list)

@@ -178,11 +227,6 @@ def _compute_fully_decomposed(self):
The upshot is that decomposition code is very simple and easy to inline
at mild code size cost.
"""
# Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
# http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
S_COUNT = L_COUNT * V_COUNT * T_COUNT

def _decompose(char_int, compatible):
# 7-bit ASCII never decomposes
if char_int <= 0x7f:
@@ -320,8 +364,8 @@ def gen_composition_table(canon_comp, out):
out.write(" }\n")
out.write("}\n")

def gen_decomposition_tables(canon_decomp, compat_decomp, out):
tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility')]
def gen_decomposition_tables(canon_decomp, compat_decomp, cjk_compat_variants_decomp, out):
tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility'), (cjk_compat_variants_decomp, 'cjk_compat_variants')]
for table, name in tables:
gen_mph_data(name + '_decomposed', table, "(u32, &'static [char])",
lambda k: "(0x{:x}, &[{}])".format(k,
@@ -491,7 +535,7 @@ def minimal_perfect_hash(d):
gen_composition_table(data.canon_comp, out)
out.write("\n")

gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, out)
gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, data.cjk_compat_variants_fully_decomp, out)

gen_combining_mark(data.general_category_mark, out)
out.write("\n")
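Note: the entries this parser extracts map each CJK Compatibility Ideograph to its standardized variation sequence (a base ideograph followed by a variation selector). As a minimal sketch of the resulting table shape — assuming the illustrative StandardizedVariants.txt entry "4E0D FE00; CJK COMPATIBILITY IDEOGRAPH-F967;", which is not taken from this diff — gen_decomposition_tables emits (u32, &'static [char]) pairs conceptually like this:

    // Sketch only: the key is the compatibility ideograph, the value is its
    // standardized variation sequence <base ideograph, variation selector>.
    const EXAMPLE_ENTRY: (u32, &'static [char]) = (0xF967, &['\u{4E0D}', '\u{FE00}']);

    fn main() {
        let (key, seq) = EXAMPLE_ENTRY;
        println!("U+{:04X} -> {:?}", key, seq); // U+F967 -> ['不', '\u{fe00}']
    }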
28 changes: 27 additions & 1 deletion src/lib.rs
@@ -59,6 +59,7 @@ pub use crate::quick_check::{
IsNormalized,
};
pub use crate::recompose::Recompositions;
pub use crate::replace::Replacements;
pub use crate::stream_safe::StreamSafe;
pub use crate::tables::UNICODE_VERSION;
use core::str::Chars;
@@ -71,6 +72,7 @@ mod normalize;
mod perfect_hash;
mod quick_check;
mod recompose;
mod replace;
mod stream_safe;

#[rustfmt::skip]
@@ -83,7 +85,9 @@ mod test;

/// Methods for composing and decomposing characters.
pub mod char {
pub use crate::normalize::{compose, decompose_canonical, decompose_compatible};
pub use crate::normalize::{
compose, decompose_canonical, decompose_cjk_compat_variants, decompose_compatible,
};

pub use crate::lookups::{canonical_combining_class, is_combining_mark};
}
@@ -108,6 +112,18 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
/// (compatibility decomposition followed by canonical composition).
fn nfkc(self) -> Recompositions<I>;

/// A transformation which replaces CJK Compatibility Ideograph codepoints
/// with normal forms using Standardized Variation Sequences. This is not
/// part of the canonical or compatibility decomposition algorithms, but
/// performing it before those algorithms produces normalized output which
/// better preserves the intent of the original text.
///
/// Note that many systems today ignore variation selectors, so these
/// may not immediately help text display as intended, but they at
/// least preserve the information in a standardized form, giving
/// implementations the option to recognize them.
fn cjk_compat_variants(self) -> Replacements<I>;

/// An Iterator over the string with Conjoining Grapheme Joiner characters
/// inserted according to the Stream-Safe Text Process (UAX15-D4)
fn stream_safe(self) -> StreamSafe<I>;
@@ -134,6 +150,11 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
recompose::new_compatible(self.chars())
}

#[inline]
fn cjk_compat_variants(self) -> Replacements<Chars<'a>> {
replace::new_cjk_compat_variants(self.chars())
}

#[inline]
fn stream_safe(self) -> StreamSafe<Chars<'a>> {
StreamSafe::new(self.chars())
@@ -161,6 +182,11 @@ impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
recompose::new_compatible(self)
}

#[inline]
fn cjk_compat_variants(self) -> Replacements<I> {
replace::new_cjk_compat_variants(self)
}

#[inline]
fn stream_safe(self) -> StreamSafe<I> {
StreamSafe::new(self)
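A short usage sketch of the new iterator adapter, under the same illustrative assumption that StandardizedVariants.txt maps U+F967 to <U+4E0D, U+FE00> (variation sequences survive NFC unchanged):

    use unicode_normalization::UnicodeNormalization;

    fn main() {
        // Replace the compatibility ideograph before composing, so the
        // distinction is preserved through NFC instead of being folded away.
        let replaced: String = "\u{F967}".cjk_compat_variants().nfc().collect();
        assert_eq!(replaced, "\u{4E0D}\u{FE00}");

        // Characters without a variant entry pass through unchanged.
        let plain: String = "abc".cjk_compat_variants().collect();
        assert_eq!(plain, "abc");
    }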
11 changes: 11 additions & 0 deletions src/lookups.rs
@@ -64,6 +64,17 @@ pub(crate) fn compatibility_fully_decomposed(c: char) -> Option<&'static [char]>
)
}

pub(crate) fn cjk_compat_variants_fully_decomposed(c: char) -> Option<&'static [char]> {
mph_lookup(
c.into(),
CJK_COMPAT_VARIANTS_DECOMPOSED_SALT,
CJK_COMPAT_VARIANTS_DECOMPOSED_KV,
pair_lookup_fk,
pair_lookup_fv_opt,
None,
)
}

/// Return whether the given character is a combining mark (`General_Category=Mark`)
pub fn is_combining_mark(c: char) -> bool {
mph_lookup(
36 changes: 35 additions & 1 deletion src/normalize.rs
@@ -10,7 +10,8 @@

//! Functions for computing canonical and compatible decompositions for Unicode characters.
use crate::lookups::{
canonical_fully_decomposed, compatibility_fully_decomposed, composition_table,
canonical_fully_decomposed, cjk_compat_variants_fully_decomposed,
compatibility_fully_decomposed, composition_table,
};

use core::{char, ops::FnMut};
@@ -36,6 +37,39 @@ pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) {
decompose(c, decompose_char, emit_char)
}

/// Compute standard-variation decomposition for a character.
///
/// [Standardized Variation Sequences] are used instead of the standard canonical
/// decompositions, notably for CJK codepoints with singleton canonical decompositions,
/// to avoid losing information. See the
/// [Unicode Variation Sequence FAQ](http://unicode.org/faq/vs.html) and the
/// "Other Enhancements" section of the
/// [Unicode 6.3 Release Summary](https://www.unicode.org/versions/Unicode6.3.0/#Summary)
/// for more information.
#[inline]
pub fn decompose_cjk_compat_variants<F>(c: char, mut emit_char: F)
where
F: FnMut(char),
{
// 7-bit ASCII never decomposes
if c <= '\x7f' {
emit_char(c);
return;
}

// Don't perform decomposition for Hangul

if let Some(decomposed) = cjk_compat_variants_fully_decomposed(c) {
for &d in decomposed {
emit_char(d);
}
return;
}

// Finally bottom out.
emit_char(c);
}

#[inline]
fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F)
where
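The character-level entry point can be sketched the same way (same illustrative assumption about U+F967's entry):

    use unicode_normalization::char::decompose_cjk_compat_variants;

    fn main() {
        let mut out = String::new();
        decompose_cjk_compat_variants('\u{F967}', |c| out.push(c));
        assert_eq!(out, "\u{4E0D}\u{FE00}");

        // A character with no standardized variant bottoms out as itself.
        out.clear();
        decompose_cjk_compat_variants('a', |c| out.push(c));
        assert_eq!(out, "a");
    }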
61 changes: 61 additions & 0 deletions src/replace.rs
@@ -0,0 +1,61 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use core::fmt::{self, Write};
use tinyvec::ArrayVec;

/// External iterator for replacements for a string's characters.
#[derive(Clone)]
pub struct Replacements<I> {
iter: I,
// At this time, the longest replacement sequence has length 2, so we just
// need buffer space for 1 codepoint.
buffer: Option<char>,
}

#[inline]
pub fn new_cjk_compat_variants<I: Iterator<Item = char>>(iter: I) -> Replacements<I> {
Replacements { iter, buffer: None }
}

impl<I: Iterator<Item = char>> Iterator for Replacements<I> {
type Item = char;

#[inline]
fn next(&mut self) -> Option<char> {
if let Some(c) = self.buffer.take() {
return Some(c);
}

match self.iter.next() {
Some(ch) => {
// At this time, the longest replacement sequence has length 2.
[Review discussion on this line]

Contributor: can we assert this or codegen a constant from the python file?

Contributor (author): It effectively is asserted by the TinyVec::<[char; 2]>, which panics if too many elements are appended. I could codegen the constant if you want, but the code is simpler this way.

Contributor: I could be misreading it, but I thought TinyVec allocates if it exceeds the inline array size? https://docs.rs/tinyvec/1.1.0/tinyvec/enum.TinyVec.html — but yeah, just asserting within the python file should be fine, agreed that codegenerating a constant is probably overkill :)

Contributor: ah yeah, you can use ArrayVec which will have the panic on overflow behavior we want here.

Contributor (author): Oops, looks like I misread the TinyVec docs. Updated to use ArrayVec, and I added an assert to the python file.
let mut buffer = ArrayVec::<[char; 2]>::new();
super::char::decompose_cjk_compat_variants(ch, |d| buffer.push(d));
self.buffer = buffer.get(1).copied();
Some(buffer[0])
}
None => None,
}
}

fn size_hint(&self) -> (usize, Option<usize>) {
let (lower, _) = self.iter.size_hint();
(lower, None)
}
}

impl<I: Iterator<Item = char> + Clone> fmt::Display for Replacements<I> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
for c in self.clone() {
f.write_char(c)?;
}
Ok(())
}
}
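To illustrate the buffering contract settled in the review thread above: tinyvec's ArrayVec has a fixed inline capacity and panics on overflow (unlike TinyVec, which spills to the heap), so a replacement sequence longer than 2 would fail loudly rather than be silently truncated. A minimal sketch against tinyvec 1.x:

    use tinyvec::ArrayVec;

    fn main() {
        let mut buffer = ArrayVec::<[char; 2]>::new();
        buffer.push('a');
        buffer.push('b');
        // buffer.push('c'); // would panic: capacity is fixed at 2
        assert_eq!(buffer.get(1).copied(), Some('b'));
    }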