icu_segmenter/
rule_segmenter.rs1use crate::complex::ComplexPayloadsBorrowed;
6use crate::indices::{Latin1Indices, Utf16Indices};
7use crate::options::WordType;
8use crate::provider::*;
9use core::str::CharIndices;
10use utf8_iter::Utf8CharIndices;
11
12pub trait RuleBreakType: crate::private::Sealed + Sized {
20 type IterAttr<'s>: Iterator<Item = (usize, Self::CharType)> + Clone + core::fmt::Debug;
22
23 type CharType: Copy + Into<u32> + core::fmt::Debug;
25
26 #[doc(hidden)]
27 fn char_len(ch: Self::CharType) -> usize;
28}
29
30#[derive(Debug)]
44pub struct RuleBreakIterator<'data, 's, Y: RuleBreakType> {
45 pub(crate) iter: Y::IterAttr<'s>,
46 pub(crate) len: usize,
47 pub(crate) current_pos_data: Option<(usize, Y::CharType)>,
48 pub(crate) result_cache: alloc::vec::Vec<usize>,
49 pub(crate) data: &'data RuleBreakData<'data>,
50 pub(crate) complex: Option<ComplexPayloadsBorrowed<'data>>,
51 pub(crate) boundary_property: u8,
52 pub(crate) locale_override: Option<&'data RuleBreakDataOverride<'data>>,
53 pub(crate) handle_complex_language:
55 fn(&mut RuleBreakIterator<'data, 's, Y>, Y::CharType) -> Option<usize>,
56}
57
58pub(crate) fn empty_handle_complex_language<Y: RuleBreakType>(
59 _i: &mut RuleBreakIterator<'_, '_, Y>,
60 _c: Y::CharType,
61) -> Option<usize> {
62 debug_assert!(
63 false,
64 "grapheme/sentence segmenters should never need complex language handling"
65 );
66 None
67}
68
69impl<Y: RuleBreakType> Iterator for RuleBreakIterator<'_, '_, Y> {
70 type Item = usize;
71
72 fn next(&mut self) -> Option<Self::Item> {
73 if let Some(&first_result) = self.result_cache.first() {
75 let mut i = 0;
76 loop {
77 if i == first_result {
78 self.result_cache = self.result_cache.iter().skip(1).map(|r| r - i).collect();
79 return self.get_current_position();
80 }
81 i += self.get_current_codepoint().map_or(0, Y::char_len);
82 self.advance_iter();
83 if self.is_eof() {
84 self.result_cache.clear();
85 self.boundary_property = self.data.complex_property;
86 return Some(self.len);
87 }
88 }
89 }
90
91 if self.is_eof() {
92 self.advance_iter();
93 if self.is_eof() && self.len == 0 {
94 self.len = 1;
98 return Some(0);
99 }
100 let Some(right_prop) = self.get_current_break_property() else {
101 self.boundary_property = 0;
103 return None;
104 };
105 if matches!(
107 self.get_break_state_from_table(self.data.sot_property, right_prop),
108 BreakState::Break | BreakState::NoMatch
109 ) {
110 self.boundary_property = 0; return self.get_current_position();
112 }
113 }
114
115 'a: loop {
116 debug_assert!(!self.is_eof());
117 let left_codepoint = self.get_current_codepoint()?;
118 let left_prop = self.get_break_property(left_codepoint);
119 self.advance_iter();
120
121 let Some(right_prop) = self.get_current_break_property() else {
122 self.boundary_property = left_prop;
123 return Some(self.len);
124 };
125
126 if right_prop == self.data.complex_property {
129 if left_prop != self.data.complex_property {
130 self.boundary_property = left_prop;
132 return self.get_current_position();
133 }
134 let break_offset = (self.handle_complex_language)(self, left_codepoint);
135 if break_offset.is_some() {
136 return break_offset;
137 }
138 }
139
140 match self.get_break_state_from_table(left_prop, right_prop) {
141 BreakState::Keep => continue,
142 BreakState::Break | BreakState::NoMatch => {
143 self.boundary_property = left_prop;
144 return self.get_current_position();
145 }
146 BreakState::Index(mut index) | BreakState::Intermediate(mut index) => {
147 let mut previous_iter = self.iter.clone();
149 let mut previous_pos_data = self.current_pos_data;
150 let mut previous_left_prop = left_prop;
151
152 loop {
153 self.advance_iter();
154
155 let Some(prop) = self.get_current_break_property() else {
156 self.boundary_property = index;
158 if self.get_break_state_from_table(index, self.data.eot_property)
159 == BreakState::NoMatch
160 {
161 self.boundary_property = previous_left_prop;
162 self.iter = previous_iter;
163 self.current_pos_data = previous_pos_data;
164 return self.get_current_position();
165 }
166 return Some(self.len);
168 };
169
170 let previous_break_state_is_cp_prop =
171 index <= self.data.last_codepoint_property;
172
173 match self.get_break_state_from_table(index, prop) {
174 BreakState::Keep => continue 'a,
175 BreakState::NoMatch => {
176 self.boundary_property = previous_left_prop;
177 self.iter = previous_iter;
178 self.current_pos_data = previous_pos_data;
179 return self.get_current_position();
180 }
181 BreakState::Break => return self.get_current_position(),
182 BreakState::Intermediate(i) => {
183 index = i;
184 if previous_break_state_is_cp_prop {
185 previous_left_prop = index;
187 }
188 previous_iter = self.iter.clone();
189 previous_pos_data = self.current_pos_data;
190 }
191 BreakState::Index(i) => {
192 index = i;
193 if previous_break_state_is_cp_prop {
194 previous_iter = self.iter.clone();
196 previous_pos_data = self.current_pos_data;
197 previous_left_prop = index;
198 }
199 }
200 }
201 }
202 }
203 }
204 }
205 }
206}
207
208impl<Y: RuleBreakType> RuleBreakIterator<'_, '_, Y> {
209 pub(crate) fn advance_iter(&mut self) {
210 self.current_pos_data = self.iter.next();
211 }
212
213 pub(crate) fn is_eof(&self) -> bool {
214 self.current_pos_data.is_none()
215 }
216
217 pub(crate) fn get_current_break_property(&self) -> Option<u8> {
218 self.get_current_codepoint()
219 .map(|c| self.get_break_property(c))
220 }
221
222 pub(crate) fn get_current_position(&self) -> Option<usize> {
223 self.current_pos_data.map(|(pos, _)| pos)
224 }
225
226 pub(crate) fn get_current_codepoint(&self) -> Option<Y::CharType> {
227 self.current_pos_data.map(|(_, codepoint)| codepoint)
228 }
229
230 fn get_break_property(&self, codepoint: Y::CharType) -> u8 {
231 if let Some(locale_override) = &self.locale_override {
233 let property = locale_override
234 .property_table_override
235 .get32(codepoint.into());
236 if property != 0 {
237 return property;
238 }
239 }
240 self.data.property_table.get32(codepoint.into())
241 }
242
243 fn get_break_state_from_table(&self, left: u8, right: u8) -> BreakState {
244 let idx = left as usize * self.data.property_count as usize + right as usize;
245 self.data
247 .break_state_table
248 .get(idx)
249 .unwrap_or(BreakState::Keep)
250 }
251
252 pub fn word_type(&self) -> WordType {
255 if !self.result_cache.is_empty() {
256 return WordType::Letter;
258 }
259 if self.boundary_property == 0 {
260 return WordType::None;
262 }
263 self.data
264 .word_type_table
265 .get((self.boundary_property - 1) as usize)
266 .unwrap_or(WordType::None)
267 }
268
269 pub fn is_word_like(&self) -> bool {
272 self.word_type().is_word_like()
273 }
274}
275
276#[derive(Debug)]
277#[non_exhaustive]
278pub struct Utf8;
280
281impl crate::private::Sealed for Utf8 {}
282
283impl RuleBreakType for Utf8 {
284 type IterAttr<'s> = CharIndices<'s>;
285 type CharType = char;
286
287 fn char_len(ch: Self::CharType) -> usize {
288 ch.len_utf8()
289 }
290}
291
292#[derive(Debug)]
293#[non_exhaustive]
294pub struct PotentiallyIllFormedUtf8;
296
297impl crate::private::Sealed for PotentiallyIllFormedUtf8 {}
298
299impl RuleBreakType for PotentiallyIllFormedUtf8 {
300 type IterAttr<'s> = Utf8CharIndices<'s>;
301 type CharType = char;
302
303 fn char_len(ch: Self::CharType) -> usize {
304 ch.len_utf8()
305 }
306}
307
308#[derive(Debug)]
309#[non_exhaustive]
310pub struct Latin1;
312
313impl crate::private::Sealed for Latin1 {}
314
315impl RuleBreakType for Latin1 {
316 type IterAttr<'s> = Latin1Indices<'s>;
317 type CharType = u8;
318
319 fn char_len(_ch: Self::CharType) -> usize {
320 unreachable!()
321 }
322}
323
324#[derive(Debug)]
325#[non_exhaustive]
326pub struct Utf16;
328
329impl crate::private::Sealed for Utf16 {}
330
331impl RuleBreakType for Utf16 {
332 type IterAttr<'s> = Utf16Indices<'s>;
333 type CharType = u32;
334
335 fn char_len(ch: Self::CharType) -> usize {
336 if ch >= 0x10000 {
337 2
338 } else {
339 1
340 }
341 }
342}