icu_segmenter/
word.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::complex::*;
6use crate::indices::{Latin1Indices, Utf16Indices};
7use crate::iterator_helpers::derive_usize_iterator_with_type;
8use crate::provider::*;
9use crate::rule_segmenter::*;
10use alloc::string::String;
11use alloc::vec;
12use alloc::vec::Vec;
13use icu_locale_core::LanguageIdentifier;
14use icu_provider::prelude::*;
15use utf8_iter::Utf8CharIndices;
16
/// Options to tailor word breaking behavior.
///
/// Bundles an optional content locale together with the locale-independent
/// [`WordBreakInvariantOptions`]. This is the options type accepted by the
/// fallible `try_new_*` constructors of [`WordSegmenter`].
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub struct WordBreakOptions<'a> {
    /// Content locale for word segmenter.
    ///
    /// When set, the `try_new_*_unstable` constructors try to load
    /// locale-specific override data for it; `None` requests no override.
    pub content_locale: Option<&'a LanguageIdentifier>,
    /// Options independent of the locale.
    pub invariant_options: WordBreakInvariantOptions,
}
26
/// Locale-independent options to tailor word breaking behavior.
///
/// Currently empty but may grow in the future. Because the type is
/// `#[non_exhaustive]`, code outside this crate obtains a value through
/// [`Default::default()`].
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub struct WordBreakInvariantOptions {}
33
/// Implements the [`Iterator`] trait over the word boundaries of the given string.
///
/// Lifetimes:
///
/// - `'data` = lifetime of the segmenter data from which this iterator was created
/// - `'s` = lifetime of the string being segmented
///
/// The [`Iterator::Item`] is a [`usize`] representing the index of a code unit
/// _after_ the boundary (for a boundary at the end of text, this index is the length
/// of the [`str`] or array of code units).
///
/// For examples of use, see [`WordSegmenter`].
#[derive(Debug)]
pub struct WordBreakIterator<'data, 's, Y: RuleBreakType>(RuleBreakIterator<'data, 's, Y>);

// NOTE(review): presumably generates the `Iterator<Item = usize>` impl that forwards to the
// wrapped `RuleBreakIterator`; the macro lives in `crate::iterator_helpers` — confirm there.
derive_usize_iterator_with_type!(WordBreakIterator, 'data);
50
/// Hide ULE type
///
/// The module is `pub(crate)`, so the `WordTypeULE` type named in the
/// `make_ule` attribute stays crate-internal, while [`WordType`] itself is
/// re-exported from the parent module.
pub(crate) mod inner {
    /// The word type tag that is returned by [`WordBreakIterator::word_type()`].
    ///
    /// The enum is `#[repr(u8)]` with explicit discriminants, so each tag has a
    /// fixed one-byte value.
    ///
    /// [`WordBreakIterator::word_type()`]: super::WordBreakIterator::word_type
    #[non_exhaustive]
    #[derive(Copy, Clone, PartialEq, Debug)]
    #[repr(u8)]
    #[zerovec::make_ule(WordTypeULE)]
    pub enum WordType {
        /// No category tag.
        None = 0,
        /// Number category tag.
        Number = 1,
        /// Letter category tag, including CJK.
        Letter = 2,
    }
}
69
70pub use inner::WordType;
71
72impl WordType {
73    /// Whether the segment is word-like; word-like segments include numbers, as
74    /// well as segments made up of letters (including CJKV ideographs).
75    pub fn is_word_like(&self) -> bool {
76        self != &WordType::None
77    }
78}
79
80impl<'data, 's, Y: RuleBreakType> WordBreakIterator<'data, 's, Y> {
81    /// Returns the word type of the segment preceding the current boundary.
82    #[inline]
83    pub fn word_type(&self) -> WordType {
84        self.0.word_type()
85    }
86
87    /// Returns an iterator over pairs of boundary position and word type.
88    pub fn iter_with_word_type(self) -> WordBreakIteratorWithWordType<'data, 's, Y> {
89        WordBreakIteratorWithWordType(self)
90    }
91
92    /// Returns `true` when the segment preceding the current boundary is word-like,
93    /// such as letters, numbers, or CJKV ideographs.
94    #[inline]
95    pub fn is_word_like(&self) -> bool {
96        self.word_type().is_word_like()
97    }
98}
99
/// Word break iterator that also returns the word type.
///
/// Created by [`WordBreakIterator::iter_with_word_type()`]; wraps the plain
/// boundary iterator and yields `(boundary, WordType)` pairs instead of bare
/// boundaries.
// We can use impl Trait here once `use<..>` syntax is available, see https://github.com/rust-lang/rust/issues/61756
#[derive(Debug)]
pub struct WordBreakIteratorWithWordType<'data, 's, Y: RuleBreakType>(
    WordBreakIterator<'data, 's, Y>,
);
106
107impl<Y: RuleBreakType> Iterator for WordBreakIteratorWithWordType<'_, '_, Y> {
108    type Item = (usize, WordType);
109    fn next(&mut self) -> Option<Self::Item> {
110        let ret = self.0.next()?;
111        Some((ret, self.0 .0.word_type()))
112    }
113}
114
/// Supports loading word break data, and creating word break iterators for different string
/// encodings.
///
/// Most segmentation methods live on [`WordSegmenterBorrowed`], which can be obtained via
/// [`WordSegmenter::new_auto()`] (etc) or [`WordSegmenter::as_borrowed()`].
///
/// # Examples
///
/// Segment a string:
///
/// ```rust
/// use icu::segmenter::{options::WordBreakInvariantOptions, WordSegmenter};
/// let segmenter =
///     WordSegmenter::new_auto(WordBreakInvariantOptions::default());
///
/// let breakpoints: Vec<usize> =
///     segmenter.segment_str("Hello World").collect();
/// assert_eq!(&breakpoints, &[0, 5, 6, 11]);
/// ```
///
/// Segment a Latin1 byte string:
///
/// ```rust
/// use icu::segmenter::{options::WordBreakInvariantOptions, WordSegmenter};
/// let segmenter =
///     WordSegmenter::new_auto(WordBreakInvariantOptions::default());
///
/// let breakpoints: Vec<usize> =
///     segmenter.segment_latin1(b"Hello World").collect();
/// assert_eq!(&breakpoints, &[0, 5, 6, 11]);
/// ```
///
/// Successive boundaries can be used to retrieve the segments.
/// In particular, the first boundary is always 0, and the last one is the
/// length of the segmented text in code units.
///
/// ```rust
/// # use icu::segmenter::{WordSegmenter, options::WordBreakInvariantOptions};
/// # let segmenter = WordSegmenter::new_auto(WordBreakInvariantOptions::default());
/// use itertools::Itertools;
/// let text = "Mark’d ye his words?";
/// let segments: Vec<&str> = segmenter
///     .segment_str(text)
///     .tuple_windows()
///     .map(|(i, j)| &text[i..j])
///     .collect();
/// assert_eq!(
///     &segments,
///     &["Mark’d", " ", "ye", " ", "his", " ", "words", "?"]
/// );
/// ```
///
/// Not all segments delimited by word boundaries are words; some are interword
/// segments such as spaces and punctuation.
/// The [`WordBreakIterator::word_type()`] of a boundary can be used to
/// classify the preceding segment; [`WordBreakIterator::iter_with_word_type()`]
/// associates each boundary with its status.
///
/// ```rust
/// # use itertools::Itertools;
/// # use icu::segmenter::WordSegmenter;
/// # use icu::segmenter::options::{WordType, WordBreakInvariantOptions};
/// # let segmenter = WordSegmenter::new_auto(WordBreakInvariantOptions::default());
/// # let text = "Mark’d ye his words?";
/// let words: Vec<&str> = segmenter
///     .segment_str(text)
///     .iter_with_word_type()
///     .tuple_windows()
///     .filter(|(_, (_, segment_type))| segment_type.is_word_like())
///     .map(|((i, _), (j, _))| &text[i..j])
///     .collect();
/// assert_eq!(&words, &["Mark’d", "ye", "his", "words"]);
/// ```
#[derive(Debug)]
pub struct WordSegmenter {
    /// Word break rule data.
    payload: DataPayload<SegmenterBreakWordV1>,
    /// Data used for complex scripts (dictionary and/or LSTM, depending on the constructor).
    complex: ComplexPayloads,
    /// Locale-specific override data; only populated when a content locale was
    /// requested and the provider had data for it.
    payload_locale_override: Option<DataPayload<SegmenterBreakWordOverrideV1>>,
}
193
/// Segments a string into words (borrowed version).
///
/// This is a cheap, `Copy` view over the data owned by a [`WordSegmenter`]
/// (or over compiled data, for the `'static` variant).
///
/// See [`WordSegmenter`] for examples.
#[derive(Clone, Debug, Copy)]
pub struct WordSegmenterBorrowed<'data> {
    /// Word break rule data.
    data: &'data RuleBreakData<'data>,
    /// Borrowed data for complex scripts.
    complex: ComplexPayloadsBorrowed<'data>,
    /// Optional locale-specific override data.
    locale_override: Option<&'data RuleBreakDataOverride<'data>>,
}
203
204impl WordSegmenter {
205    /// Constructs a [`WordSegmenter`] with an invariant locale and the best available compiled data for
206    /// complex scripts (Chinese, Japanese, Khmer, Lao, Myanmar, and Thai).
207    ///
208    /// The current behavior, which is subject to change, is to use the LSTM model when available
209    /// and the dictionary model for Chinese and Japanese.
210    ///
211    /// ✨ *Enabled with the `compiled_data` and `auto` Cargo features.*
212    ///
213    /// [📚 Help choosing a constructor](icu_provider::constructors)
214    ///
215    /// # Examples
216    ///
217    /// Behavior with complex scripts:
218    ///
219    /// ```
220    /// use icu::segmenter::{options::WordBreakInvariantOptions, WordSegmenter};
221    ///
222    /// let th_str = "ทุกสองสัปดาห์";
223    /// let ja_str = "こんにちは世界";
224    ///
225    /// let segmenter =
226    ///     WordSegmenter::new_auto(WordBreakInvariantOptions::default());
227    ///
228    /// let th_bps = segmenter.segment_str(th_str).collect::<Vec<_>>();
229    /// let ja_bps = segmenter.segment_str(ja_str).collect::<Vec<_>>();
230    ///
231    /// assert_eq!(th_bps, [0, 9, 18, 39]);
232    /// assert_eq!(ja_bps, [0, 15, 21]);
233    /// ```
234    #[cfg(feature = "compiled_data")]
235    #[cfg(feature = "auto")]
236    pub fn new_auto(_options: WordBreakInvariantOptions) -> WordSegmenterBorrowed<'static> {
237        WordSegmenterBorrowed {
238            data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_WORD_V1,
239            complex: ComplexPayloadsBorrowed::new_auto(),
240            locale_override: None,
241        }
242    }
243
244    #[cfg(feature = "auto")]
245    icu_provider::gen_buffer_data_constructors!(
246        (options: WordBreakOptions) -> error: DataError,
247        functions: [
248            try_new_auto,
249            try_new_auto_with_buffer_provider,
250            try_new_auto_unstable,
251            Self
252        ]
253    );
254
255    #[cfg(feature = "auto")]
256    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_auto)]
257    pub fn try_new_auto_unstable<D>(
258        provider: &D,
259        options: WordBreakOptions,
260    ) -> Result<Self, DataError>
261    where
262        D: DataProvider<SegmenterBreakWordV1>
263            + DataProvider<SegmenterBreakWordOverrideV1>
264            + DataProvider<SegmenterDictionaryAutoV1>
265            + DataProvider<SegmenterLstmAutoV1>
266            + DataProvider<SegmenterBreakGraphemeClusterV1>
267            + ?Sized,
268    {
269        Ok(Self {
270            payload: provider.load(Default::default())?.payload,
271            complex: ComplexPayloads::try_new_auto(provider)?,
272            payload_locale_override: if let Some(locale) = options.content_locale {
273                let locale = DataLocale::from(locale);
274                let req = DataRequest {
275                    id: DataIdentifierBorrowed::for_locale(&locale),
276                    metadata: {
277                        let mut metadata = DataRequestMetadata::default();
278                        metadata.silent = true;
279                        metadata
280                    },
281                };
282                provider
283                    .load(req)
284                    .allow_identifier_not_found()?
285                    .map(|r| r.payload)
286            } else {
287                None
288            },
289        })
290    }
291
292    /// Constructs a [`WordSegmenter`] with an invariant locale and compiled LSTM data for
293    /// complex scripts (Burmese, Khmer, Lao, and Thai).
294    ///
295    /// The LSTM, or Long Term Short Memory, is a machine learning model. It is smaller than
296    /// the full dictionary but more expensive during segmentation (inference).
297    ///
298    /// Warning: there is not currently an LSTM model for Chinese or Japanese, so the [`WordSegmenter`]
299    /// created by this function will have unexpected behavior in spans of those scripts.
300    ///
301    /// ✨ *Enabled with the `compiled_data` and `lstm` Cargo features.*
302    ///
303    /// [📚 Help choosing a constructor](icu_provider::constructors)
304    ///
305    /// # Examples
306    ///
307    /// Behavior with complex scripts:
308    ///
309    /// ```
310    /// use icu::segmenter::{options::WordBreakInvariantOptions, WordSegmenter};
311    ///
312    /// let th_str = "ทุกสองสัปดาห์";
313    /// let ja_str = "こんにちは世界";
314    ///
315    /// let segmenter =
316    ///     WordSegmenter::new_lstm(WordBreakInvariantOptions::default());
317    ///
318    /// let th_bps = segmenter.segment_str(th_str).collect::<Vec<_>>();
319    /// let ja_bps = segmenter.segment_str(ja_str).collect::<Vec<_>>();
320    ///
321    /// assert_eq!(th_bps, [0, 9, 18, 39]);
322    ///
323    /// // Note: We aren't able to find a suitable breakpoint in Chinese/Japanese.
324    /// assert_eq!(ja_bps, [0, 21]);
325    /// ```
326    #[cfg(feature = "compiled_data")]
327    #[cfg(feature = "lstm")]
328    pub fn new_lstm(_options: WordBreakInvariantOptions) -> WordSegmenterBorrowed<'static> {
329        WordSegmenterBorrowed {
330            data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_WORD_V1,
331            complex: ComplexPayloadsBorrowed::new_lstm(),
332            locale_override: None,
333        }
334    }
335
336    #[cfg(feature = "lstm")]
337    icu_provider::gen_buffer_data_constructors!(
338        (options: WordBreakOptions) -> error: DataError,
339        functions: [
340            try_new_lstm,
341            try_new_lstm_with_buffer_provider,
342            try_new_lstm_unstable,
343            Self
344        ]
345    );
346
347    #[cfg(feature = "lstm")]
348    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_lstm)]
349    pub fn try_new_lstm_unstable<D>(
350        provider: &D,
351        options: WordBreakOptions,
352    ) -> Result<Self, DataError>
353    where
354        D: DataProvider<SegmenterBreakWordV1>
355            + DataProvider<SegmenterBreakWordOverrideV1>
356            + DataProvider<SegmenterLstmAutoV1>
357            + DataProvider<SegmenterBreakGraphemeClusterV1>
358            + ?Sized,
359    {
360        Ok(Self {
361            payload: provider.load(Default::default())?.payload,
362            complex: ComplexPayloads::try_new_lstm(provider)?,
363            payload_locale_override: if let Some(locale) = options.content_locale {
364                let locale = DataLocale::from(locale);
365                let req = DataRequest {
366                    id: DataIdentifierBorrowed::for_locale(&locale),
367                    metadata: {
368                        let mut metadata = DataRequestMetadata::default();
369                        metadata.silent = true;
370                        metadata
371                    },
372                };
373                provider
374                    .load(req)
375                    .allow_identifier_not_found()?
376                    .map(|r| r.payload)
377            } else {
378                None
379            },
380        })
381    }
382
383    /// Construct a [`WordSegmenter`] with an invariant locale and compiled dictionary data for
384    /// complex scripts (Chinese, Japanese, Khmer, Lao, Myanmar, and Thai).
385    ///
386    /// The dictionary model uses a list of words to determine appropriate breakpoints. It is
387    /// faster than the LSTM model but requires more data.
388    ///
389    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
390    ///
391    /// [📚 Help choosing a constructor](icu_provider::constructors)
392    ///
393    /// # Examples
394    ///
395    /// Behavior with complex scripts:
396    ///
397    /// ```
398    /// use icu::segmenter::{options::WordBreakInvariantOptions, WordSegmenter};
399    ///
400    /// let th_str = "ทุกสองสัปดาห์";
401    /// let ja_str = "こんにちは世界";
402    ///
403    /// let segmenter =
404    ///     WordSegmenter::new_dictionary(WordBreakInvariantOptions::default());
405    ///
406    /// let th_bps = segmenter.segment_str(th_str).collect::<Vec<_>>();
407    /// let ja_bps = segmenter.segment_str(ja_str).collect::<Vec<_>>();
408    ///
409    /// assert_eq!(th_bps, [0, 9, 18, 39]);
410    /// assert_eq!(ja_bps, [0, 15, 21]);
411    /// ```
412    #[cfg(feature = "compiled_data")]
413    pub fn new_dictionary(_options: WordBreakInvariantOptions) -> WordSegmenterBorrowed<'static> {
414        WordSegmenterBorrowed {
415            data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_WORD_V1,
416            complex: ComplexPayloadsBorrowed::new_dict(),
417            locale_override: None,
418        }
419    }
420
421    icu_provider::gen_buffer_data_constructors!(
422        (options: WordBreakOptions) -> error: DataError,
423        functions: [
424            try_new_dictionary,
425            try_new_dictionary_with_buffer_provider,
426            try_new_dictionary_unstable,
427            Self
428        ]
429    );
430
431    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary)]
432    pub fn try_new_dictionary_unstable<D>(
433        provider: &D,
434        options: WordBreakOptions,
435    ) -> Result<Self, DataError>
436    where
437        D: DataProvider<SegmenterBreakWordV1>
438            + DataProvider<SegmenterBreakWordOverrideV1>
439            + DataProvider<SegmenterDictionaryAutoV1>
440            + DataProvider<SegmenterDictionaryExtendedV1>
441            + DataProvider<SegmenterBreakGraphemeClusterV1>
442            + ?Sized,
443    {
444        Ok(Self {
445            payload: provider.load(Default::default())?.payload,
446            complex: ComplexPayloads::try_new_dict(provider)?,
447            payload_locale_override: if let Some(locale) = options.content_locale {
448                let locale = DataLocale::from(locale);
449                let req = DataRequest {
450                    id: DataIdentifierBorrowed::for_locale(&locale),
451                    metadata: {
452                        let mut metadata = DataRequestMetadata::default();
453                        metadata.silent = true;
454                        metadata
455                    },
456                };
457                provider
458                    .load(req)
459                    .allow_identifier_not_found()?
460                    .map(|r| r.payload)
461            } else {
462                None
463            },
464        })
465    }
466    /// Constructs a borrowed version of this type for more efficient querying.
467    ///
468    /// Most useful methods for segmentation are on this type.
469    pub fn as_borrowed(&self) -> WordSegmenterBorrowed<'_> {
470        WordSegmenterBorrowed {
471            data: self.payload.get(),
472            complex: self.complex.as_borrowed(),
473            locale_override: self.payload_locale_override.as_ref().map(|p| p.get()),
474        }
475    }
476}
477
478impl<'data> WordSegmenterBorrowed<'data> {
479    /// Creates a word break iterator for an `str` (a UTF-8 string).
480    ///
481    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
482    pub fn segment_str<'s>(self, input: &'s str) -> WordBreakIterator<'data, 's, Utf8> {
483        WordBreakIterator(RuleBreakIterator {
484            iter: input.char_indices(),
485            len: input.len(),
486            current_pos_data: None,
487            result_cache: Vec::new(),
488            data: self.data,
489            complex: Some(self.complex),
490            boundary_property: 0,
491            locale_override: self.locale_override,
492            handle_complex_language: Utf8::word_handle_complex_language,
493        })
494    }
495
496    /// Creates a word break iterator for a potentially ill-formed UTF8 string
497    ///
498    /// Invalid characters are treated as REPLACEMENT CHARACTER
499    ///
500    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
501    pub fn segment_utf8<'s>(
502        self,
503        input: &'s [u8],
504    ) -> WordBreakIterator<'data, 's, PotentiallyIllFormedUtf8> {
505        WordBreakIterator(RuleBreakIterator {
506            iter: Utf8CharIndices::new(input),
507            len: input.len(),
508            current_pos_data: None,
509            result_cache: Vec::new(),
510            data: self.data,
511            complex: Some(self.complex),
512            boundary_property: 0,
513            locale_override: self.locale_override,
514            handle_complex_language: PotentiallyIllFormedUtf8::word_handle_complex_language,
515        })
516    }
517
518    /// Creates a word break iterator for a Latin-1 (8-bit) string.
519    ///
520    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
521    pub fn segment_latin1<'s>(self, input: &'s [u8]) -> WordBreakIterator<'data, 's, Latin1> {
522        WordBreakIterator(RuleBreakIterator {
523            iter: Latin1Indices::new(input),
524            len: input.len(),
525            current_pos_data: None,
526            result_cache: Vec::new(),
527            data: self.data,
528            complex: Some(self.complex),
529            boundary_property: 0,
530            locale_override: self.locale_override,
531            handle_complex_language: Latin1::word_handle_complex_language,
532        })
533    }
534
535    /// Creates a word break iterator for a UTF-16 string.
536    ///
537    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
538    pub fn segment_utf16<'s>(self, input: &'s [u16]) -> WordBreakIterator<'data, 's, Utf16> {
539        WordBreakIterator(RuleBreakIterator {
540            iter: Utf16Indices::new(input),
541            len: input.len(),
542            current_pos_data: None,
543            result_cache: Vec::new(),
544            data: self.data,
545            complex: Some(self.complex),
546            boundary_property: 0,
547            locale_override: self.locale_override,
548            handle_complex_language: Utf16::word_handle_complex_language,
549        })
550    }
551}
552
553impl WordSegmenterBorrowed<'static> {
554    /// Cheaply converts a [`WordSegmenterBorrowed<'static>`] into a [`WordSegmenter`].
555    ///
556    /// Note: Due to branching and indirection, using [`WordSegmenter`] might inhibit some
557    /// compile-time optimizations that are possible with [`WordSegmenterBorrowed`].
558    pub fn static_to_owned(self) -> WordSegmenter {
559        let payload_locale_override = self.locale_override.map(DataPayload::from_static_ref);
560        WordSegmenter {
561            payload: DataPayload::from_static_ref(self.data),
562            complex: self.complex.static_to_owned(),
563            payload_locale_override,
564        }
565    }
566}
567
/// A trait allowing for [`WordBreakIterator`] to be generalized to multiple string iteration methods.
///
/// This is implemented by ICU4X for several common string types.
///
/// <div class="stab unstable">
/// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
/// trait, please consider using a type from the implementors listed below.
/// </div>
pub trait WordBreakType: crate::private::Sealed + Sized + RuleBreakType {
    // Encoding-specific hook installed as `handle_complex_language` on the
    // `RuleBreakIterator` built by the `segment_*` methods.
    // NOTE(review): presumably invoked when the iterator reaches a run of
    // complex-script text, with `left_codepoint` being the character already
    // consumed; the return value appears to be the next boundary — confirm in
    // `rule_segmenter`.
    #[doc(hidden)]
    fn word_handle_complex_language(
        iterator: &mut RuleBreakIterator<'_, '_, Self>,
        left_codepoint: Self::CharType,
    ) -> Option<usize>;
}
583
584impl WordBreakType for Utf8 {
585    fn word_handle_complex_language(
586        iter: &mut RuleBreakIterator<'_, '_, Self>,
587        left_codepoint: Self::CharType,
588    ) -> Option<usize> {
589        handle_complex_language_utf8(iter, left_codepoint)
590    }
591}
592
593impl WordBreakType for PotentiallyIllFormedUtf8 {
594    fn word_handle_complex_language(
595        iter: &mut RuleBreakIterator<'_, '_, Self>,
596        left_codepoint: Self::CharType,
597    ) -> Option<usize> {
598        handle_complex_language_utf8(iter, left_codepoint)
599    }
600}
601
602impl WordBreakType for Latin1 {
603    fn word_handle_complex_language(
604        _iter: &mut RuleBreakIterator<'_, '_, Self>,
605        _left_codepoint: Self::CharType,
606    ) -> Option<usize> {
607        debug_assert!(
608            false,
609            "latin-1 text should never need complex language handling"
610        );
611        None
612    }
613}
614
615/// handle_complex_language impl for UTF8 iterators
616fn handle_complex_language_utf8<T>(
617    iter: &mut RuleBreakIterator<'_, '_, T>,
618    left_codepoint: T::CharType,
619) -> Option<usize>
620where
621    T: RuleBreakType<CharType = char>,
622{
623    // word segmenter doesn't define break rules for some languages such as Thai.
624    let start_iter = iter.iter.clone();
625    let start_point = iter.current_pos_data;
626    let mut s = String::new();
627    s.push(left_codepoint);
628    loop {
629        debug_assert!(!iter.is_eof());
630        s.push(iter.get_current_codepoint()?);
631        iter.advance_iter();
632        if let Some(current_break_property) = iter.get_current_break_property() {
633            if current_break_property != iter.data.complex_property {
634                break;
635            }
636        } else {
637            // EOF
638            break;
639        }
640    }
641
642    // Restore iterator to move to head of complex string
643    iter.iter = start_iter;
644    iter.current_pos_data = start_point;
645    #[allow(clippy::unwrap_used)] // iter.complex present for word segmenter
646    let breaks = iter.complex.unwrap().complex_language_segment_str(&s);
647    iter.result_cache = breaks;
648    let first_pos = *iter.result_cache.first()?;
649    let mut i = left_codepoint.len_utf8();
650    loop {
651        if i == first_pos {
652            // Re-calculate breaking offset
653            iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
654            return iter.get_current_position();
655        }
656        debug_assert!(
657            i < first_pos,
658            "we should always arrive at first_pos: near index {:?}",
659            iter.get_current_position()
660        );
661        i += iter.get_current_codepoint().map_or(0, T::char_len);
662        iter.advance_iter();
663        if iter.is_eof() {
664            iter.result_cache.clear();
665            return Some(iter.len);
666        }
667    }
668}
669
670impl WordBreakType for Utf16 {
671    fn word_handle_complex_language(
672        iter: &mut RuleBreakIterator<Self>,
673        left_codepoint: Self::CharType,
674    ) -> Option<usize> {
675        // word segmenter doesn't define break rules for some languages such as Thai.
676        let start_iter = iter.iter.clone();
677        let start_point = iter.current_pos_data;
678        let mut s = vec![left_codepoint as u16];
679        loop {
680            debug_assert!(!iter.is_eof());
681            s.push(iter.get_current_codepoint()? as u16);
682            iter.advance_iter();
683            if let Some(current_break_property) = iter.get_current_break_property() {
684                if current_break_property != iter.data.complex_property {
685                    break;
686                }
687            } else {
688                // EOF
689                break;
690            }
691        }
692
693        // Restore iterator to move to head of complex string
694        iter.iter = start_iter;
695        iter.current_pos_data = start_point;
696        #[allow(clippy::unwrap_used)] // iter.complex present for word segmenter
697        let breaks = iter.complex.unwrap().complex_language_segment_utf16(&s);
698        iter.result_cache = breaks;
699        // result_cache vector is utf-16 index that is in BMP.
700        let first_pos = *iter.result_cache.first()?;
701        let mut i = 1;
702        loop {
703            if i == first_pos {
704                // Re-calculate breaking offset
705                iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
706                return iter.get_current_position();
707            }
708            debug_assert!(
709                i < first_pos,
710                "we should always arrive at first_pos: near index {:?}",
711                iter.get_current_position()
712            );
713            i += 1;
714            iter.advance_iter();
715            if iter.is_eof() {
716                iter.result_cache.clear();
717                return Some(iter.len);
718            }
719        }
720    }
721}
722
// `WordSegmenter::new_auto` only exists when both the `compiled_data` and `auto`
// features are enabled, so the test is gated on exactly those features.
#[cfg(all(test, feature = "compiled_data", feature = "auto"))]
#[test]
fn empty_string() {
    // The empty string has a single boundary, at offset 0.
    let segmenter = WordSegmenter::new_auto(WordBreakInvariantOptions::default());
    let breaks: Vec<usize> = segmenter.segment_str("").collect();
    assert_eq!(breaks, [0]);
}