Skip to content

Commit 9c7abf2

Browse files
committed
Documentation and code reorg
1 parent 50058a5 commit 9c7abf2

File tree

1 file changed

+31
-21
lines changed

1 file changed

+31
-21
lines changed

src/sentence.rs

Lines changed: 31 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ mod fwd {
1616
use tables::sentence::SentenceCat;
1717
use core::cmp;
1818

19+
// Describes a parsed part of the source string, as defined in this table:
20+
// https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
1921
#[derive(Clone, Copy, PartialEq, Eq)]
2022
enum StatePart {
2123
Sot,
@@ -49,6 +51,8 @@ mod fwd {
4951
}
5052

5153
impl SentenceBreaksState {
54+
// Attempt to advance the internal state by one part
55+
// Whitespace and some punctuation will be collapsed
5256
fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
5357
let &SentenceBreaksState(parts) = self;
5458
let parts = match (parts[3], cat) {
@@ -85,27 +89,28 @@ mod fwd {
8589
])
8690
}
8791

92+
// Helper function to check if state head matches a single `StatePart`
8893
fn match1(&self, part: StatePart) -> bool {
8994
let &SentenceBreaksState(parts) = self;
9095
part == parts[3]
9196
}
9297

98+
// Helper function to check if the two `StatePart`s at the head of the state
99+
// (`parts[2]` followed by `parts[3]`) match the given two, in order
93100
fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
94101
let &SentenceBreaksState(parts) = self;
95102
part1 == parts[2] && part2 == parts[3]
96103
}
97104
}
98105

106+
// https://unicode.org/reports/tr29/#SB8
107+
// TODO: cache this; it is currently quadratic
99108
fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
100-
let aterm_part = {
101-
// ATerm Close* Sp*
102-
let &SentenceBreaksState(parts) = state;
103-
let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
104-
if parts[idx] == StatePart::ClosePlus { idx -= 1 }
105-
parts[idx]
106-
};
109+
let &SentenceBreaksState(parts) = state;
110+
let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
111+
if parts[idx] == StatePart::ClosePlus { idx -= 1 }
107112

108-
if aterm_part == StatePart::ATerm {
113+
if parts[idx] == StatePart::ATerm {
109114
use tables::sentence as se;
110115

111116
for next_char in ahead.chars() {
@@ -124,6 +129,7 @@ mod fwd {
124129
false
125130
}
126131

132+
// https://unicode.org/reports/tr29/#SB8a
127133
fn match_sb8a(state: &SentenceBreaksState) -> bool {
128134
// SATerm Close* Sp*
129135
let &SentenceBreaksState(parts) = state;
@@ -132,13 +138,15 @@ mod fwd {
132138
parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
133139
}
134140

141+
// https://unicode.org/reports/tr29/#SB9
135142
fn match_sb9(state: &SentenceBreaksState) -> bool {
136143
// SATerm Close*
137144
let &SentenceBreaksState(parts) = state;
138145
let idx = if parts[3] == StatePart::ClosePlus { 2 } else { 3 };
139146
parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
140147
}
141148

149+
// https://unicode.org/reports/tr29/#SB11
142150
fn match_sb11(state: &SentenceBreaksState) -> bool {
143151
// SATerm Close* Sp* ParaSep?
144152
let &SentenceBreaksState(parts) = state;
@@ -180,67 +188,69 @@ mod fwd {
180188
self.state = self.state.next(next_cat);
181189

182190
match next_cat {
183-
// SB1
191+
// SB1 https://unicode.org/reports/tr29/#SB1
184192
_ if state_before.match1(StatePart::Sot) =>
185193
return Some(position_before),
186194

187-
// SB3
195+
// SB2 is handled when the inner iterator (chars) is finished
196+
197+
// SB3 https://unicode.org/reports/tr29/#SB3
188198
SentenceCat::SC_LF if state_before.match1(StatePart::CR) =>
189199
continue,
190200

191-
// SB4
201+
// SB4 https://unicode.org/reports/tr29/#SB4
192202
_ if state_before.match1(StatePart::Sep)
193203
|| state_before.match1(StatePart::CR)
194204
|| state_before.match1(StatePart::LF)
195205
=> return Some(position_before),
196206

197-
// SB5
207+
// SB5 https://unicode.org/reports/tr29/#SB5
198208
SentenceCat::SC_Extend |
199209
SentenceCat::SC_Format => self.state = state_before,
200210

201-
// SB6
211+
// SB6 https://unicode.org/reports/tr29/#SB6
202212
SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) =>
203213
continue,
204214

205-
// SB7
215+
// SB7 https://unicode.org/reports/tr29/#SB7
206216
SentenceCat::SC_Upper if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
207217
continue,
208218

209-
// SB8
219+
// SB8 https://unicode.org/reports/tr29/#SB8
210220
_ if match_sb8(&state_before, &self.string[position_before..]) =>
211221
continue,
212222

213-
// SB8a
223+
// SB8a https://unicode.org/reports/tr29/#SB8a
214224
SentenceCat::SC_SContinue |
215225
SentenceCat::SC_STerm |
216226
SentenceCat::SC_ATerm if match_sb8a(&state_before) =>
217227
continue,
218228

219-
// SB9
229+
// SB9 https://unicode.org/reports/tr29/#SB9
220230
SentenceCat::SC_Close |
221231
SentenceCat::SC_Sp |
222232
SentenceCat::SC_Sep |
223233
SentenceCat::SC_CR |
224234
SentenceCat::SC_LF if match_sb9(&state_before) =>
225235
continue,
226236

227-
// SB10
237+
// SB10 https://unicode.org/reports/tr29/#SB10
228238
SentenceCat::SC_Sp |
229239
SentenceCat::SC_Sep |
230240
SentenceCat::SC_CR |
231241
SentenceCat::SC_LF if match_sb8a(&state_before) =>
232242
continue,
233243

234-
// SB11
244+
// SB11 https://unicode.org/reports/tr29/#SB11
235245
_ if match_sb11(&state_before) =>
236246
return Some(position_before),
237247

238-
// SB998
248+
// SB998 https://unicode.org/reports/tr29/#SB998
239249
_ => continue
240250
}
241251
}
242252

243-
// SB2
253+
// SB2 https://unicode.org/reports/tr29/#SB2
244254
if self.state.match1(StatePart::Sot) {
245255
None
246256
} else if self.state.match1(StatePart::Eot) {

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy