Skip to content

Commit e29c432

Browse files
committed
add comments with reference to the spec
1 parent 7599d62 commit e29c432

File tree

1 file changed

+50
-11
lines changed

1 file changed

+50
-11
lines changed

src/word.rs

Lines changed: 50 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -728,6 +728,31 @@ impl<'a> UWordBounds<'a> {
728728
}
729729
}
730730

731+
/// ASCII‑fast‑path word‑boundary iterator for strings that contain only ASCII characters.
732+
///
733+
/// Since we handle only ASCII characters, we can use a much simpler set of
734+
/// word break values than the full Unicode algorithm.
735+
/// https://www.unicode.org/reports/tr29/#Table_Word_Break_Property_Values
736+
///
737+
/// | Word_Break value | ASCII code points that belong to it |
738+
/// | -----------------| --------------------------------------------------------------- |
739+
/// | CR | U+000D (CR) |
740+
/// | LF | U+000A (LF) |
741+
/// | Newline | U+000B (VT), U+000C (FF) |
742+
/// | Single_Quote | U+0027 (') |
743+
/// | Double_Quote | U+0022 (") |
744+
/// | MidNumLet | U+002E (.) FULL STOP |
745+
/// | MidLetter | U+003A (:) COLON |
746+
/// | MidNum | U+002C (,), U+003B (;) |
747+
/// | Numeric | U+0030 – U+0039 (0 … 9) |
748+
/// | ALetter | U+0041 – U+005A (A … Z), U+0061 – U+007A (a … z) |
749+
/// | ExtendNumLet | U+005F (_) underscore |
750+
/// | WSegSpace | U+0020 (SPACE) |
751+
///
752+
/// The macro MidNumLetQ boils down to: U+002E (.) FULL STOP and U+0027 (')
753+
/// AHLetter is the same as ALetter, so we don't need to distinguish it.
754+
///
755+
/// Any other ASCII byte forms a segment of its own (the catch-all rule WB999).
731756
pub struct AsciiWordBoundIter<'a> {
732757
rest: &'a str,
733758
offset: usize,
@@ -746,12 +771,17 @@ impl<'a> AsciiWordBoundIter<'a> {
746771
#[inline]
747772
fn is_infix(b: u8, prev: u8, next: u8) -> bool {
748773
match b {
749-
// numeric separators
774+
// Numeric separators such as "1,000" or "3.14" (WB11/WB12)
775+
//
776+
// "Numeric (MidNum | MidNumLetQ) Numeric"
750777
b'.' | b',' | b';' | b'\'' if prev.is_ascii_digit() && next.is_ascii_digit() => true,
751-
// apostrophe in contractions
752-
b'\'' if prev.is_ascii_alphabetic() && next.is_ascii_alphabetic() => true,
753-
// dot/colon inside letters
754-
b'.' | b':' if prev.is_ascii_alphabetic() && next.is_ascii_alphabetic() => true,
778+
779+
// Apostrophe, dot or colon between two letters ("can't", "e.g", "a:b") (WB6/WB7)
780+
//
781+
// "AHLetter (MidLetter | MidNumLetQ) AHLetter"
782+
// MidLetter = b':'
783+
// MidNumLetQ = b'.' | b'\''
784+
b'\'' | b'.' | b':' if prev.is_ascii_alphabetic() && next.is_ascii_alphabetic() => true,
755785
_ => false,
756786
}
757787
}
@@ -769,7 +799,8 @@ impl<'a> Iterator for AsciiWordBoundIter<'a> {
769799
let bytes = self.rest.as_bytes();
770800
let len = bytes.len();
771801

772-
// 1) Group runs of spaces
802+
// 1) Keep horizontal whitespace together.
803+
// Spec: WB3d joins adjacent *WSegSpace* into a single segment.
773804
if bytes[0] == b' ' {
774805
let mut i = 1;
775806
while i < len && bytes[i] == b' ' {
@@ -783,6 +814,7 @@ impl<'a> Iterator for AsciiWordBoundIter<'a> {
783814
}
784815

785816
// 2) Core-run (letters/digits/underscore + infix)
817+
// Spec: ALetter × ALetter, Numeric × Numeric etc. (WB5–WB13b)
786818
if Self::is_core(bytes[0]) {
787819
let mut i = 1;
788820
while i < len {
@@ -802,14 +834,17 @@ impl<'a> Iterator for AsciiWordBoundIter<'a> {
802834
return Some((pos, word));
803835
}
804836

805-
// 3) Non-core: CR+LF as one token, otherwise single char
837+
// 3) Do not break within CRLF.
838+
// Spec: WB3 treats CR+LF as a single non‑breaking pair.
806839
if bytes[0] == b'\r' && len >= 2 && bytes[1] == b'\n' {
807840
let word = &self.rest[..2];
808841
let pos = self.offset;
809842
self.rest = &self.rest[2..];
810843
self.offset += 2;
811844
Some((pos, word))
812845
} else {
846+
// 4) Otherwise, break everywhere
847+
// Spec: the catch‑all rule WB999.
813848
let word = &self.rest[..1];
814849
let pos = self.offset;
815850
self.rest = &self.rest[1..];
@@ -828,7 +863,8 @@ impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> {
828863
let bytes = rest.as_bytes();
829864
let len = bytes.len();
830865

831-
// 1) Trailing spaces
866+
// 1) Group runs of spaces
867+
// Spec: WB3d joins adjacent *WSegSpace* into a single segment.
832868
if bytes[len - 1] == b' ' {
833869
// find start of this last run of spaces
834870
let mut start = len - 1;
@@ -841,7 +877,8 @@ impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> {
841877
return Some((pos, word));
842878
}
843879

844-
// 2) Trailing core-run (letters/digits/underscore + infix)
880+
// 2) Trailing Core-run (letters/digits/underscore + infix)
881+
// Spec: ALetter × ALetter, Numeric × Numeric etc. (WB5–WB13b)
845882
if Self::is_core(bytes[len - 1]) {
846883
// scan backwards as long as we see `is_core` or an `is_infix`
847884
let mut start = len - 1;
@@ -861,7 +898,8 @@ impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> {
861898
return Some((pos, word));
862899
}
863900

864-
// 3) CR+LF at end
901+
// 3) Do not break within a trailing CRLF.
902+
// Spec: WB3 treats CR+LF as a single non‑breaking pair.
865903
if len >= 2 && bytes[len - 2] == b'\r' && bytes[len - 1] == b'\n' {
866904
let start = len - 2;
867905
let word = &rest[start..];
@@ -870,7 +908,8 @@ impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> {
870908
return Some((pos, word));
871909
}
872910

873-
// 4) Single non-core byte
911+
// 4) Fallback – every other byte is its own segment
912+
// Spec: the catch‑all rule WB999.
874913
let start = len - 1;
875914
let word = &rest[start..];
876915
let pos = self.offset + start;

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy