@@ -728,6 +728,31 @@ impl<'a> UWordBounds<'a> {
728
728
}
729
729
}
730
730
731
+ /// ASCII fast-path word-boundary iterator for strings that contain only ASCII characters.
732
+ ///
733
+ /// Since we handle only ASCII characters, we can use a much simpler set of
734
+ /// word break values than the full Unicode algorithm.
735
+ /// <https://www.unicode.org/reports/tr29/#Table_Word_Break_Property_Values>
736
+ ///
737
+ /// | Word_Break value | ASCII code points that belong to it |
738
+ /// | -----------------| --------------------------------------------------------------- |
739
+ /// | CR | U+000D (CR) |
740
+ /// | LF | U+000A (LF) |
741
+ /// | Newline | U+000B (VT), U+000C (FF) |
742
+ /// | Single_Quote | U+0027 (') |
743
+ /// | Double_Quote | U+0022 (") |
744
+ /// | MidNumLet | U+002E (.) FULL STOP |
745
+ /// | MidLetter | U+003A (:) COLON |
746
+ /// | MidNum | U+002C (,), U+003B (;) |
747
+ /// | Numeric | U+0030 – U+0039 (0 … 9) |
748
+ /// | ALetter | U+0041 – U+005A (A … Z), U+0061 – U+007A (a … z) |
749
+ /// | ExtendNumLet | U+005F (_) underscore |
750
+ /// | WSegSpace | U+0020 (SPACE) |
751
+ ///
752
+ /// The macro MidNumLetQ boils down to: U+002E (.) FULL STOP and U+0027 (')
753
+ /// AHLetter is the same as ALetter, so we don't need to distinguish it.
754
+ ///
755
+ /// Any other single ASCII byte forms a segment of its own (the catch-all rule WB999).
731
756
pub struct AsciiWordBoundIter < ' a > {
732
757
rest : & ' a str ,
733
758
offset : usize ,
@@ -746,12 +771,17 @@ impl<'a> AsciiWordBoundIter<'a> {
746
771
#[ inline]
747
772
fn is_infix ( b : u8 , prev : u8 , next : u8 ) -> bool {
748
773
match b {
749
- // numeric separators
774
+ // Numeric separators such as "1,000" or "3.14" (WB11/WB12)
775
+ //
776
+ // "Numeric (MidNum | MidNumLetQ) Numeric"
750
777
b'.' | b',' | b';' | b'\'' if prev. is_ascii_digit ( ) && next. is_ascii_digit ( ) => true ,
751
- // apostrophe in contractions
752
- b'\'' if prev. is_ascii_alphabetic ( ) && next. is_ascii_alphabetic ( ) => true ,
753
- // dot/colon inside letters
754
- b'.' | b':' if prev. is_ascii_alphabetic ( ) && next. is_ascii_alphabetic ( ) => true ,
778
+
779
+ // Apostrophe, dot or colon directly between two ASCII letters ("can't", "e.g", "a:b") (WB6/WB7)
780
+ //
781
+ // "AHLetter (MidLetter | MidNumLetQ) AHLetter"
782
+ // MidLetter = b':'
783
+ // MidNumLetQ = b'.' | b'\''
784
+ b'\'' | b'.' | b':' if prev. is_ascii_alphabetic ( ) && next. is_ascii_alphabetic ( ) => true ,
755
785
_ => false ,
756
786
}
757
787
}
@@ -769,7 +799,8 @@ impl<'a> Iterator for AsciiWordBoundIter<'a> {
769
799
let bytes = self . rest . as_bytes ( ) ;
770
800
let len = bytes. len ( ) ;
771
801
772
- // 1) Group runs of spaces
802
+ // 1) Keep runs of ASCII spaces together (only b' ' is grouped; tabs are not).
803
+ // Spec: WB3d joins adjacent *WSegSpace* into a single segment.
773
804
if bytes[ 0 ] == b' ' {
774
805
let mut i = 1 ;
775
806
while i < len && bytes[ i] == b' ' {
@@ -783,6 +814,7 @@ impl<'a> Iterator for AsciiWordBoundIter<'a> {
783
814
}
784
815
785
816
// 2) Core-run (letters/digits/underscore + infix)
817
+ // Spec: ALetter × ALetter, Numeric × Numeric etc. (WB5–WB13b)
786
818
if Self :: is_core ( bytes[ 0 ] ) {
787
819
let mut i = 1 ;
788
820
while i < len {
@@ -802,14 +834,17 @@ impl<'a> Iterator for AsciiWordBoundIter<'a> {
802
834
return Some ( ( pos, word) ) ;
803
835
}
804
836
805
- // 3) Non-core: CR+LF as one token, otherwise single char
837
+ // 3) Do not break within CRLF.
838
+ // Spec: WB3 treats CR+LF as a single non‑breaking pair.
806
839
if bytes[ 0 ] == b'\r' && len >= 2 && bytes[ 1 ] == b'\n' {
807
840
let word = & self . rest [ ..2 ] ;
808
841
let pos = self . offset ;
809
842
self . rest = & self . rest [ 2 ..] ;
810
843
self . offset += 2 ;
811
844
Some ( ( pos, word) )
812
845
} else {
846
+ // 4) Otherwise, break everywhere
847
+ // Spec: the catch‑all rule WB999.
813
848
let word = & self . rest [ ..1 ] ;
814
849
let pos = self . offset ;
815
850
self . rest = & self . rest [ 1 ..] ;
@@ -828,7 +863,8 @@ impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> {
828
863
let bytes = rest. as_bytes ( ) ;
829
864
let len = bytes. len ( ) ;
830
865
831
- // 1) Trailing spaces
866
+ // 1) Group runs of spaces
867
+ // Spec: WB3d joins adjacent *WSegSpace* into a single segment.
832
868
if bytes[ len - 1 ] == b' ' {
833
869
// find start of this last run of spaces
834
870
let mut start = len - 1 ;
@@ -841,7 +877,8 @@ impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> {
841
877
return Some ( ( pos, word) ) ;
842
878
}
843
879
844
- // 2) Trailing core-run (letters/digits/underscore + infix)
880
+ // 2) Trailing core-run (letters/digits/underscore + infix)
881
+ // Spec: ALetter × ALetter, Numeric × Numeric etc. (WB5–WB13b)
845
882
if Self :: is_core ( bytes[ len - 1 ] ) {
846
883
// scan backwards as long as we see `is_core` or an `is_infix`
847
884
let mut start = len - 1 ;
@@ -861,7 +898,8 @@ impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> {
861
898
return Some ( ( pos, word) ) ;
862
899
}
863
900
864
- // 3) CR+LF at end
901
+ // 3) Do not break within a trailing CR+LF pair.
902
+ // Spec: WB3 treats CR+LF as a single non‑breaking pair.
865
903
if len >= 2 && bytes[ len - 2 ] == b'\r' && bytes[ len - 1 ] == b'\n' {
866
904
let start = len - 2 ;
867
905
let word = & rest[ start..] ;
@@ -870,7 +908,8 @@ impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> {
870
908
return Some ( ( pos, word) ) ;
871
909
}
872
910
873
- // 4) Single non-core byte
911
+ // 4) Fallback – every other byte is its own segment
912
+ // Spec: the catch‑all rule WB999.
874
913
let start = len - 1 ;
875
914
let word = & rest[ start..] ;
876
915
let pos = self . offset + start;
0 commit comments