icu_segmenter/
rule_segmenter.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::complex::ComplexPayloadsBorrowed;
6use crate::indices::{Latin1Indices, Utf16Indices};
7use crate::options::WordType;
8use crate::provider::*;
9use core::str::CharIndices;
10use utf8_iter::Utf8CharIndices;
11
12/// A trait allowing for RuleBreakIterator to be generalized to multiple string
13/// encoding methods and granularity such as grapheme cluster, word, etc.
14///
15/// <div class="stab unstable">
16/// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
17/// trait, please consider using a type from the implementors listed below.
18/// </div>
19pub trait RuleBreakType: crate::private::Sealed + Sized {
20    /// The iterator over characters.
21    type IterAttr<'s>: Iterator<Item = (usize, Self::CharType)> + Clone + core::fmt::Debug;
22
23    /// The character type.
24    type CharType: Copy + Into<u32> + core::fmt::Debug;
25
26    #[doc(hidden)]
27    fn char_len(ch: Self::CharType) -> usize;
28}
29
/// Implements the [`Iterator`] trait over the segmenter boundaries of the given string.
///
/// Lifetimes:
///
/// - `'data` = lifetime of the segmenter data borrowed by this iterator
///   (this largely exists because segmenter data is invariant due to ZeroMap constraints)
/// - `'s` = lifetime of the string being segmented
///
/// The [`Iterator::Item`] is a [`usize`] representing index of a code unit
/// _after_ the boundary (for a boundary at the end of text, this index is the length
/// of the [`str`] or array of code units).
#[derive(Debug)]
pub struct RuleBreakIterator<'data, 's, Y: RuleBreakType> {
    // Source of `(index, character)` pairs for the text being segmented.
    pub(crate) iter: Y::IterAttr<'s>,
    // Returned as the final boundary once the end of the text is reached. For empty input
    // it is additionally overwritten with 1 to mark that the single boundary was returned.
    pub(crate) len: usize,
    // The pair most recently pulled from `iter`; `None` when nothing is available.
    pub(crate) current_pos_data: Option<(usize, Y::CharType)>,
    // Pending break offsets that `next` serves before running the rule tables. The offsets
    // are measured from the current position and are rebased each time one is consumed.
    // NOTE(review): presumably filled by `handle_complex_language`; confirm at the call sites.
    pub(crate) result_cache: alloc::vec::Vec<usize>,
    // Rule data: property table, break state table and the special property values.
    pub(crate) data: &'data RuleBreakData<'data>,
    // NOTE(review): not read in this file; presumably used by `handle_complex_language`.
    pub(crate) complex: Option<ComplexPayloadsBorrowed<'data>>,
    // Property recorded for the most recently returned boundary; read by `word_type`,
    // where 0 maps to `WordType::None`.
    pub(crate) boundary_property: u8,
    // Optional property table consulted before `data.property_table`; a value of 0 in the
    // override falls through to the base table.
    pub(crate) locale_override: Option<&'data RuleBreakDataOverride<'data>>,
    // Should return None if there is no complex language handling
    pub(crate) handle_complex_language:
        fn(&mut RuleBreakIterator<'data, 's, Y>, Y::CharType) -> Option<usize>,
}
57
58pub(crate) fn empty_handle_complex_language<Y: RuleBreakType>(
59    _i: &mut RuleBreakIterator<'_, '_, Y>,
60    _c: Y::CharType,
61) -> Option<usize> {
62    debug_assert!(
63        false,
64        "grapheme/sentence segmenters should never need complex language handling"
65    );
66    None
67}
68
69impl<Y: RuleBreakType> Iterator for RuleBreakIterator<'_, '_, Y> {
70    type Item = usize;
71
72    fn next(&mut self) -> Option<Self::Item> {
73        // If we have break point cache by previous run, return this result
74        if let Some(&first_result) = self.result_cache.first() {
75            let mut i = 0;
76            loop {
77                if i == first_result {
78                    self.result_cache = self.result_cache.iter().skip(1).map(|r| r - i).collect();
79                    return self.get_current_position();
80                }
81                i += self.get_current_codepoint().map_or(0, Y::char_len);
82                self.advance_iter();
83                if self.is_eof() {
84                    self.result_cache.clear();
85                    self.boundary_property = self.data.complex_property;
86                    return Some(self.len);
87                }
88            }
89        }
90
91        if self.is_eof() {
92            self.advance_iter();
93            if self.is_eof() && self.len == 0 {
94                // Empty string. Since `self.current_pos_data` is always going to be empty,
95                // we never read `self.len` except for here, so we can use it to mark that
96                // we have already returned the single empty-string breakpoint.
97                self.len = 1;
98                return Some(0);
99            }
100            let Some(right_prop) = self.get_current_break_property() else {
101                // iterator already reaches to EOT. Reset boundary property for word-like.
102                self.boundary_property = 0;
103                return None;
104            };
105            // SOT x anything
106            if matches!(
107                self.get_break_state_from_table(self.data.sot_property, right_prop),
108                BreakState::Break | BreakState::NoMatch
109            ) {
110                self.boundary_property = 0; // SOT is special type
111                return self.get_current_position();
112            }
113        }
114
115        'a: loop {
116            debug_assert!(!self.is_eof());
117            let left_codepoint = self.get_current_codepoint()?;
118            let left_prop = self.get_break_property(left_codepoint);
119            self.advance_iter();
120
121            let Some(right_prop) = self.get_current_break_property() else {
122                self.boundary_property = left_prop;
123                return Some(self.len);
124            };
125
126            // Some segmenter rules doesn't have language-specific rules, we have to use LSTM (or dictionary) segmenter.
127            // If property is marked as SA, use it
128            if right_prop == self.data.complex_property {
129                if left_prop != self.data.complex_property {
130                    // break before SA
131                    self.boundary_property = left_prop;
132                    return self.get_current_position();
133                }
134                let break_offset = (self.handle_complex_language)(self, left_codepoint);
135                if break_offset.is_some() {
136                    return break_offset;
137                }
138            }
139
140            match self.get_break_state_from_table(left_prop, right_prop) {
141                BreakState::Keep => continue,
142                BreakState::Break | BreakState::NoMatch => {
143                    self.boundary_property = left_prop;
144                    return self.get_current_position();
145                }
146                BreakState::Index(mut index) | BreakState::Intermediate(mut index) => {
147                    // This isn't simple rule set. We need marker to restore iterator to previous position.
148                    let mut previous_iter = self.iter.clone();
149                    let mut previous_pos_data = self.current_pos_data;
150                    let mut previous_left_prop = left_prop;
151
152                    loop {
153                        self.advance_iter();
154
155                        let Some(prop) = self.get_current_break_property() else {
156                            // Reached EOF. But we are analyzing multiple characters now, so next break may be previous point.
157                            self.boundary_property = index;
158                            if self.get_break_state_from_table(index, self.data.eot_property)
159                                == BreakState::NoMatch
160                            {
161                                self.boundary_property = previous_left_prop;
162                                self.iter = previous_iter;
163                                self.current_pos_data = previous_pos_data;
164                                return self.get_current_position();
165                            }
166                            // EOF
167                            return Some(self.len);
168                        };
169
170                        let previous_break_state_is_cp_prop =
171                            index <= self.data.last_codepoint_property;
172
173                        match self.get_break_state_from_table(index, prop) {
174                            BreakState::Keep => continue 'a,
175                            BreakState::NoMatch => {
176                                self.boundary_property = previous_left_prop;
177                                self.iter = previous_iter;
178                                self.current_pos_data = previous_pos_data;
179                                return self.get_current_position();
180                            }
181                            BreakState::Break => return self.get_current_position(),
182                            BreakState::Intermediate(i) => {
183                                index = i;
184                                if previous_break_state_is_cp_prop {
185                                    // Move marker
186                                    previous_left_prop = index;
187                                }
188                                previous_iter = self.iter.clone();
189                                previous_pos_data = self.current_pos_data;
190                            }
191                            BreakState::Index(i) => {
192                                index = i;
193                                if previous_break_state_is_cp_prop {
194                                    // Move marker
195                                    previous_iter = self.iter.clone();
196                                    previous_pos_data = self.current_pos_data;
197                                    previous_left_prop = index;
198                                }
199                            }
200                        }
201                    }
202                }
203            }
204        }
205    }
206}
207
208impl<Y: RuleBreakType> RuleBreakIterator<'_, '_, Y> {
209    pub(crate) fn advance_iter(&mut self) {
210        self.current_pos_data = self.iter.next();
211    }
212
213    pub(crate) fn is_eof(&self) -> bool {
214        self.current_pos_data.is_none()
215    }
216
217    pub(crate) fn get_current_break_property(&self) -> Option<u8> {
218        self.get_current_codepoint()
219            .map(|c| self.get_break_property(c))
220    }
221
222    pub(crate) fn get_current_position(&self) -> Option<usize> {
223        self.current_pos_data.map(|(pos, _)| pos)
224    }
225
226    pub(crate) fn get_current_codepoint(&self) -> Option<Y::CharType> {
227        self.current_pos_data.map(|(_, codepoint)| codepoint)
228    }
229
230    fn get_break_property(&self, codepoint: Y::CharType) -> u8 {
231        // Note: Default value is 0 == UNKNOWN
232        if let Some(locale_override) = &self.locale_override {
233            let property = locale_override
234                .property_table_override
235                .get32(codepoint.into());
236            if property != 0 {
237                return property;
238            }
239        }
240        self.data.property_table.get32(codepoint.into())
241    }
242
243    fn get_break_state_from_table(&self, left: u8, right: u8) -> BreakState {
244        let idx = left as usize * self.data.property_count as usize + right as usize;
245        // We use unwrap_or to fall back to the base case and prevent panics on bad data.
246        self.data
247            .break_state_table
248            .get(idx)
249            .unwrap_or(BreakState::Keep)
250    }
251
252    /// Return the status value of break boundary.
253    /// If segmenter isn't word, always return WordType::None
254    pub fn word_type(&self) -> WordType {
255        if !self.result_cache.is_empty() {
256            // Dictionary type (CJ and East Asian) is letter.
257            return WordType::Letter;
258        }
259        if self.boundary_property == 0 {
260            // break position is SOT / Any
261            return WordType::None;
262        }
263        self.data
264            .word_type_table
265            .get((self.boundary_property - 1) as usize)
266            .unwrap_or(WordType::None)
267    }
268
269    /// Return true when break boundary is word-like such as letter/number/CJK
270    /// If segmenter isn't word, return false
271    pub fn is_word_like(&self) -> bool {
272        self.word_type().is_word_like()
273    }
274}
275
276#[derive(Debug)]
277#[non_exhaustive]
278/// [`RuleBreakType`] for UTF-8 strings
279pub struct Utf8;
280
281impl crate::private::Sealed for Utf8 {}
282
283impl RuleBreakType for Utf8 {
284    type IterAttr<'s> = CharIndices<'s>;
285    type CharType = char;
286
287    fn char_len(ch: Self::CharType) -> usize {
288        ch.len_utf8()
289    }
290}
291
292#[derive(Debug)]
293#[non_exhaustive]
294/// [`RuleBreakType`] for potentially ill-formed UTF-8 strings
295pub struct PotentiallyIllFormedUtf8;
296
297impl crate::private::Sealed for PotentiallyIllFormedUtf8 {}
298
299impl RuleBreakType for PotentiallyIllFormedUtf8 {
300    type IterAttr<'s> = Utf8CharIndices<'s>;
301    type CharType = char;
302
303    fn char_len(ch: Self::CharType) -> usize {
304        ch.len_utf8()
305    }
306}
307
308#[derive(Debug)]
309#[non_exhaustive]
310/// [`RuleBreakType`] for Latin-1 strings
311pub struct Latin1;
312
313impl crate::private::Sealed for Latin1 {}
314
315impl RuleBreakType for Latin1 {
316    type IterAttr<'s> = Latin1Indices<'s>;
317    type CharType = u8;
318
319    fn char_len(_ch: Self::CharType) -> usize {
320        unreachable!()
321    }
322}
323
324#[derive(Debug)]
325#[non_exhaustive]
326/// [`RuleBreakType`] for UTF-16 strings
327pub struct Utf16;
328
329impl crate::private::Sealed for Utf16 {}
330
331impl RuleBreakType for Utf16 {
332    type IterAttr<'s> = Utf16Indices<'s>;
333    type CharType = u32;
334
335    fn char_len(ch: Self::CharType) -> usize {
336        if ch >= 0x10000 {
337            2
338        } else {
339            1
340        }
341    }
342}