icu_segmenter/
word.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::complex::*;
6use crate::indices::{Latin1Indices, Utf16Indices};
7use crate::iterator_helpers::derive_usize_iterator_with_type;
8use crate::provider::*;
9use crate::rule_segmenter::*;
10use alloc::string::String;
11use alloc::vec;
12use alloc::vec::Vec;
13use icu_locale_core::LanguageIdentifier;
14use icu_provider::prelude::*;
15use utf8_iter::Utf8CharIndices;
16
/// Options to tailor word breaking behavior.
///
/// Combines an optional content locale with the locale-independent
/// [`WordBreakInvariantOptions`].
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub struct WordBreakOptions<'a> {
    /// Content locale for the word segmenter.
    ///
    /// If you know the language of the text being segmented, provide it here in order to produce
    /// higher quality breakpoints.
    ///
    /// # Examples
    ///
    /// Normally, a colon character ':' is a word separator:
    ///
    /// ```rust
    /// use icu::segmenter::WordSegmenter;
    ///
    /// let segmenter = WordSegmenter::new_auto(Default::default());
    ///
    /// let breakpoints: Vec<usize> = segmenter.segment_str("EU:ssa").collect();
    /// assert_eq!(&breakpoints, &[0, 2, 3, 6]);
    /// ```
    ///
    /// But not in Finnish, where it is used for loanwords:
    ///
    /// ```rust
    /// use icu::locale::langid;
    /// use icu::segmenter::options::WordBreakOptions;
    /// use icu::segmenter::WordSegmenter;
    ///
    /// let mut options = WordBreakOptions::default();
    /// let langid = &langid!("fi");
    /// options.content_locale = Some(langid);
    /// let segmenter = WordSegmenter::try_new_auto(options).unwrap();
    ///
    /// let breakpoints: Vec<usize> =
    ///     segmenter.as_borrowed().segment_str("EU:ssa").collect();
    /// assert_eq!(&breakpoints, &[0, 6]);
    /// ```
    pub content_locale: Option<&'a LanguageIdentifier>,
    /// Options that are independent of the locale.
    pub invariant_options: WordBreakInvariantOptions,
}
59
/// Locale-independent options to tailor word breaking behavior.
///
/// Currently empty, but may grow in the future (the type is `#[non_exhaustive]`).
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub struct WordBreakInvariantOptions {}
66
/// Implements the [`Iterator`] trait over the word boundaries of the given string.
///
/// Lifetimes:
///
/// - `'data` = lifetime of the segmenter data from which this iterator was created
/// - `'s` = lifetime of the string being segmented
///
/// The [`Iterator::Item`] is a [`usize`] representing the index of a code unit
/// _after_ the boundary (for a boundary at the end of text, this index is the length
/// of the [`str`] or array of code units).
///
/// For examples of use, see [`WordSegmenter`].
#[derive(Debug)]
pub struct WordBreakIterator<'data, 's, Y: RuleBreakType>(RuleBreakIterator<'data, 's, Y>);

derive_usize_iterator_with_type!(WordBreakIterator, 'data);
83
/// Wrapper module that hides the ULE type (`WordTypeULE`, named in the `make_ule`
/// attribute below); only [`WordType`] itself is re-exported from it.
pub(crate) mod inner {
    /// The word type tag that is returned by [`WordBreakIterator::word_type()`].
    ///
    /// [`WordBreakIterator::word_type()`]: super::WordBreakIterator::word_type
    #[non_exhaustive]
    #[derive(Copy, Clone, PartialEq, Debug)]
    #[repr(u8)]
    #[zerovec::make_ule(WordTypeULE)]
    pub enum WordType {
        /// No category tag.
        None = 0,
        /// Number category tag.
        Number = 1,
        /// Letter category tag, including CJK.
        Letter = 2,
    }
}

// Re-export the enum so that it is reachable outside of `inner`.
pub use inner::WordType;
104
105impl WordType {
106    /// Whether the segment is word-like; word-like segments include numbers, as
107    /// well as segments made up of letters (including CJKV ideographs).
108    pub fn is_word_like(&self) -> bool {
109        self != &WordType::None
110    }
111}
112
impl<'data, 's, Y: RuleBreakType> WordBreakIterator<'data, 's, Y> {
    /// Returns the word type of the segment preceding the current boundary.
    ///
    /// This forwards to the wrapped rule break iterator.
    #[inline]
    pub fn word_type(&self) -> WordType {
        self.0.word_type()
    }

    /// Consumes this iterator and returns one that yields pairs of boundary
    /// position and word type.
    pub fn iter_with_word_type(self) -> WordBreakIteratorWithWordType<'data, 's, Y> {
        WordBreakIteratorWithWordType(self)
    }

    /// Returns `true` when the segment preceding the current boundary is word-like,
    /// such as letters, numbers, or CJKV ideographs.
    ///
    /// Equivalent to calling [`WordType::is_word_like()`] on [`Self::word_type()`].
    #[inline]
    pub fn is_word_like(&self) -> bool {
        self.word_type().is_word_like()
    }
}
132
/// Word break iterator that also returns the word type.
///
/// Wraps a [`WordBreakIterator`]; created by [`WordBreakIterator::iter_with_word_type()`].
// We can use impl Trait here once `use<..>` syntax is available, see https://github.com/rust-lang/rust/issues/61756
#[derive(Debug)]
pub struct WordBreakIteratorWithWordType<'data, 's, Y: RuleBreakType>(
    WordBreakIterator<'data, 's, Y>,
);
139
140impl<Y: RuleBreakType> Iterator for WordBreakIteratorWithWordType<'_, '_, Y> {
141    type Item = (usize, WordType);
142    fn next(&mut self) -> Option<Self::Item> {
143        let ret = self.0.next()?;
144        Some((ret, self.0 .0.word_type()))
145    }
146}
147
/// Supports loading word break data, and creating word break iterators for different string
/// encodings.
///
/// Most segmentation methods live on [`WordSegmenterBorrowed`], which can be obtained via
/// [`WordSegmenter::new_auto()`] (etc) or [`WordSegmenter::as_borrowed()`].
///
/// # Content Locale
///
/// You can optionally provide a _content locale_ to the [`WordSegmenter`] constructor. If you
/// have information on the language of the text being segmented, providing this hint can
/// produce higher-quality results.
///
/// If you have a content locale, use [`WordBreakOptions`] and a constructor beginning with
/// `try_new`. If you do not have a content locale, use [`WordBreakInvariantOptions`] and a
/// constructor beginning with `new`.
///
/// # Examples
///
/// Segment a string:
///
/// ```rust
/// use icu::segmenter::WordSegmenter;
///
/// let segmenter = WordSegmenter::new_auto(Default::default());
///
/// let breakpoints: Vec<usize> =
///     segmenter.segment_str("Hello World").collect();
/// assert_eq!(&breakpoints, &[0, 5, 6, 11]);
/// ```
///
/// Segment a Latin1 byte string with a content locale:
///
/// ```rust
/// use icu::locale::langid;
/// use icu::segmenter::options::WordBreakOptions;
/// use icu::segmenter::WordSegmenter;
///
/// let mut options = WordBreakOptions::default();
/// let langid = &langid!("en");
/// options.content_locale = Some(langid);
/// let segmenter = WordSegmenter::try_new_auto(options).unwrap();
///
/// let breakpoints: Vec<usize> = segmenter
///     .as_borrowed()
///     .segment_latin1(b"Hello World")
///     .collect();
/// assert_eq!(&breakpoints, &[0, 5, 6, 11]);
/// ```
///
/// Successive boundaries can be used to retrieve the segments.
/// In particular, the first boundary is always 0, and the last one is the
/// length of the segmented text in code units.
///
/// ```rust
/// # use icu::segmenter::{WordSegmenter, options::WordBreakInvariantOptions};
/// # let segmenter = WordSegmenter::new_auto(WordBreakInvariantOptions::default());
/// use itertools::Itertools;
/// let text = "Mark’d ye his words?";
/// let segments: Vec<&str> = segmenter
///     .segment_str(text)
///     .tuple_windows()
///     .map(|(i, j)| &text[i..j])
///     .collect();
/// assert_eq!(
///     &segments,
///     &["Mark’d", " ", "ye", " ", "his", " ", "words", "?"]
/// );
/// ```
///
/// Not all segments delimited by word boundaries are words; some are interword
/// segments such as spaces and punctuation.
/// The [`WordBreakIterator::word_type()`] of a boundary can be used to
/// classify the preceding segment; [`WordBreakIterator::iter_with_word_type()`]
/// associates each boundary with its status.
///
/// ```rust
/// # use itertools::Itertools;
/// # use icu::segmenter::WordSegmenter;
/// # use icu::segmenter::options::{WordType, WordBreakInvariantOptions};
/// # let segmenter = WordSegmenter::new_auto(WordBreakInvariantOptions::default());
/// # let text = "Mark’d ye his words?";
/// let words: Vec<&str> = segmenter
///     .segment_str(text)
///     .iter_with_word_type()
///     .tuple_windows()
///     .filter(|(_, (_, segment_type))| segment_type.is_word_like())
///     .map(|((i, _), (j, _))| &text[i..j])
///     .collect();
/// assert_eq!(&words, &["Mark’d", "ye", "his", "words"]);
/// ```
#[derive(Debug)]
pub struct WordSegmenter {
    /// Word break rule data.
    payload: DataPayload<SegmenterBreakWordV1>,
    /// Payloads used for complex scripts.
    complex: ComplexPayloads,
    /// Locale-specific override data; only populated by the `try_new_*` constructors
    /// when a content locale is given and the provider has data for it.
    payload_locale_override: Option<DataPayload<SegmenterBreakWordOverrideV1>>,
}
243
/// Segments a string into words (borrowed version).
///
/// This type is `Copy`; it only holds references into the data owned by a
/// [`WordSegmenter`] (or into static data).
///
/// See [`WordSegmenter`] for examples.
#[derive(Clone, Debug, Copy)]
pub struct WordSegmenterBorrowed<'data> {
    /// Word break rule data.
    data: &'data RuleBreakData<'data>,
    /// Borrowed payloads used for complex scripts.
    complex: ComplexPayloadsBorrowed<'data>,
    /// Locale-specific override data, if any.
    locale_override: Option<&'data RuleBreakDataOverride<'data>>,
}
253
254impl WordSegmenter {
255    /// Constructs a [`WordSegmenter`] with an invariant locale and the best available compiled data for
256    /// complex scripts (Chinese, Japanese, Khmer, Lao, Myanmar, and Thai).
257    ///
258    /// The current behavior, which is subject to change, is to use the LSTM model when available
259    /// and the dictionary model for Chinese and Japanese.
260    ///
261    /// ✨ *Enabled with the `compiled_data` and `auto` Cargo features.*
262    ///
263    /// [📚 Help choosing a constructor](icu_provider::constructors)
264    ///
265    /// # Examples
266    ///
267    /// Behavior with complex scripts:
268    ///
269    /// ```
270    /// use icu::segmenter::{options::WordBreakInvariantOptions, WordSegmenter};
271    ///
272    /// let th_str = "ทุกสองสัปดาห์";
273    /// let ja_str = "こんにちは世界";
274    ///
275    /// let segmenter =
276    ///     WordSegmenter::new_auto(WordBreakInvariantOptions::default());
277    ///
278    /// let th_bps = segmenter.segment_str(th_str).collect::<Vec<_>>();
279    /// let ja_bps = segmenter.segment_str(ja_str).collect::<Vec<_>>();
280    ///
281    /// assert_eq!(th_bps, [0, 9, 18, 39]);
282    /// assert_eq!(ja_bps, [0, 15, 21]);
283    /// ```
284    #[cfg(feature = "compiled_data")]
285    #[cfg(feature = "auto")]
286    pub fn new_auto(_options: WordBreakInvariantOptions) -> WordSegmenterBorrowed<'static> {
287        WordSegmenterBorrowed {
288            data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_WORD_V1,
289            complex: ComplexPayloadsBorrowed::new_auto(),
290            locale_override: None,
291        }
292    }
293
294    #[cfg(feature = "auto")]
295    icu_provider::gen_buffer_data_constructors!(
296        (options: WordBreakOptions) -> error: DataError,
297        functions: [
298            try_new_auto,
299            try_new_auto_with_buffer_provider,
300            try_new_auto_unstable,
301            Self
302        ]
303    );
304
305    #[cfg(feature = "auto")]
306    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_auto)]
307    pub fn try_new_auto_unstable<D>(
308        provider: &D,
309        options: WordBreakOptions,
310    ) -> Result<Self, DataError>
311    where
312        D: DataProvider<SegmenterBreakWordV1>
313            + DataProvider<SegmenterBreakWordOverrideV1>
314            + DataProvider<SegmenterDictionaryAutoV1>
315            + DataProvider<SegmenterLstmAutoV1>
316            + DataProvider<SegmenterBreakGraphemeClusterV1>
317            + ?Sized,
318    {
319        Ok(Self {
320            payload: provider.load(Default::default())?.payload,
321            complex: ComplexPayloads::try_new_auto(provider)?,
322            payload_locale_override: if let Some(locale) = options.content_locale {
323                let locale = DataLocale::from(locale);
324                let req = DataRequest {
325                    id: DataIdentifierBorrowed::for_locale(&locale),
326                    metadata: {
327                        let mut metadata = DataRequestMetadata::default();
328                        metadata.silent = true;
329                        metadata
330                    },
331                };
332                provider
333                    .load(req)
334                    .allow_identifier_not_found()?
335                    .map(|r| r.payload)
336            } else {
337                None
338            },
339        })
340    }
341
342    /// Constructs a [`WordSegmenter`] with an invariant locale and compiled LSTM data for
343    /// complex scripts (Burmese, Khmer, Lao, and Thai).
344    ///
345    /// The LSTM, or Long Term Short Memory, is a machine learning model. It is smaller than
346    /// the full dictionary but more expensive during segmentation (inference).
347    ///
348    /// Warning: there is not currently an LSTM model for Chinese or Japanese, so the [`WordSegmenter`]
349    /// created by this function will have unexpected behavior in spans of those scripts.
350    ///
351    /// ✨ *Enabled with the `compiled_data` and `lstm` Cargo features.*
352    ///
353    /// [📚 Help choosing a constructor](icu_provider::constructors)
354    ///
355    /// # Examples
356    ///
357    /// Behavior with complex scripts:
358    ///
359    /// ```
360    /// use icu::segmenter::{options::WordBreakInvariantOptions, WordSegmenter};
361    ///
362    /// let th_str = "ทุกสองสัปดาห์";
363    /// let ja_str = "こんにちは世界";
364    ///
365    /// let segmenter =
366    ///     WordSegmenter::new_lstm(WordBreakInvariantOptions::default());
367    ///
368    /// let th_bps = segmenter.segment_str(th_str).collect::<Vec<_>>();
369    /// let ja_bps = segmenter.segment_str(ja_str).collect::<Vec<_>>();
370    ///
371    /// assert_eq!(th_bps, [0, 9, 18, 39]);
372    ///
373    /// // Note: We aren't able to find a suitable breakpoint in Chinese/Japanese.
374    /// assert_eq!(ja_bps, [0, 21]);
375    /// ```
376    #[cfg(feature = "compiled_data")]
377    #[cfg(feature = "lstm")]
378    pub fn new_lstm(_options: WordBreakInvariantOptions) -> WordSegmenterBorrowed<'static> {
379        WordSegmenterBorrowed {
380            data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_WORD_V1,
381            complex: ComplexPayloadsBorrowed::new_lstm(),
382            locale_override: None,
383        }
384    }
385
386    #[cfg(feature = "lstm")]
387    icu_provider::gen_buffer_data_constructors!(
388        (options: WordBreakOptions) -> error: DataError,
389        functions: [
390            try_new_lstm,
391            try_new_lstm_with_buffer_provider,
392            try_new_lstm_unstable,
393            Self
394        ]
395    );
396
397    #[cfg(feature = "lstm")]
398    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_lstm)]
399    pub fn try_new_lstm_unstable<D>(
400        provider: &D,
401        options: WordBreakOptions,
402    ) -> Result<Self, DataError>
403    where
404        D: DataProvider<SegmenterBreakWordV1>
405            + DataProvider<SegmenterBreakWordOverrideV1>
406            + DataProvider<SegmenterLstmAutoV1>
407            + DataProvider<SegmenterBreakGraphemeClusterV1>
408            + ?Sized,
409    {
410        Ok(Self {
411            payload: provider.load(Default::default())?.payload,
412            complex: ComplexPayloads::try_new_lstm(provider)?,
413            payload_locale_override: if let Some(locale) = options.content_locale {
414                let locale = DataLocale::from(locale);
415                let req = DataRequest {
416                    id: DataIdentifierBorrowed::for_locale(&locale),
417                    metadata: {
418                        let mut metadata = DataRequestMetadata::default();
419                        metadata.silent = true;
420                        metadata
421                    },
422                };
423                provider
424                    .load(req)
425                    .allow_identifier_not_found()?
426                    .map(|r| r.payload)
427            } else {
428                None
429            },
430        })
431    }
432
433    /// Construct a [`WordSegmenter`] with an invariant locale and compiled dictionary data for
434    /// complex scripts (Chinese, Japanese, Khmer, Lao, Myanmar, and Thai).
435    ///
436    /// The dictionary model uses a list of words to determine appropriate breakpoints. It is
437    /// faster than the LSTM model but requires more data.
438    ///
439    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
440    ///
441    /// [📚 Help choosing a constructor](icu_provider::constructors)
442    ///
443    /// # Examples
444    ///
445    /// Behavior with complex scripts:
446    ///
447    /// ```
448    /// use icu::segmenter::{options::WordBreakInvariantOptions, WordSegmenter};
449    ///
450    /// let th_str = "ทุกสองสัปดาห์";
451    /// let ja_str = "こんにちは世界";
452    ///
453    /// let segmenter =
454    ///     WordSegmenter::new_dictionary(WordBreakInvariantOptions::default());
455    ///
456    /// let th_bps = segmenter.segment_str(th_str).collect::<Vec<_>>();
457    /// let ja_bps = segmenter.segment_str(ja_str).collect::<Vec<_>>();
458    ///
459    /// assert_eq!(th_bps, [0, 9, 18, 39]);
460    /// assert_eq!(ja_bps, [0, 15, 21]);
461    /// ```
462    #[cfg(feature = "compiled_data")]
463    pub fn new_dictionary(_options: WordBreakInvariantOptions) -> WordSegmenterBorrowed<'static> {
464        WordSegmenterBorrowed {
465            data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_WORD_V1,
466            complex: ComplexPayloadsBorrowed::new_dict(),
467            locale_override: None,
468        }
469    }
470
471    icu_provider::gen_buffer_data_constructors!(
472        (options: WordBreakOptions) -> error: DataError,
473        functions: [
474            try_new_dictionary,
475            try_new_dictionary_with_buffer_provider,
476            try_new_dictionary_unstable,
477            Self
478        ]
479    );
480
481    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary)]
482    pub fn try_new_dictionary_unstable<D>(
483        provider: &D,
484        options: WordBreakOptions,
485    ) -> Result<Self, DataError>
486    where
487        D: DataProvider<SegmenterBreakWordV1>
488            + DataProvider<SegmenterBreakWordOverrideV1>
489            + DataProvider<SegmenterDictionaryAutoV1>
490            + DataProvider<SegmenterDictionaryExtendedV1>
491            + DataProvider<SegmenterBreakGraphemeClusterV1>
492            + ?Sized,
493    {
494        Ok(Self {
495            payload: provider.load(Default::default())?.payload,
496            complex: ComplexPayloads::try_new_dict(provider)?,
497            payload_locale_override: if let Some(locale) = options.content_locale {
498                let locale = DataLocale::from(locale);
499                let req = DataRequest {
500                    id: DataIdentifierBorrowed::for_locale(&locale),
501                    metadata: {
502                        let mut metadata = DataRequestMetadata::default();
503                        metadata.silent = true;
504                        metadata
505                    },
506                };
507                provider
508                    .load(req)
509                    .allow_identifier_not_found()?
510                    .map(|r| r.payload)
511            } else {
512                None
513            },
514        })
515    }
516    /// Constructs a borrowed version of this type for more efficient querying.
517    ///
518    /// Most useful methods for segmentation are on this type.
519    pub fn as_borrowed(&self) -> WordSegmenterBorrowed<'_> {
520        WordSegmenterBorrowed {
521            data: self.payload.get(),
522            complex: self.complex.as_borrowed(),
523            locale_override: self.payload_locale_override.as_ref().map(|p| p.get()),
524        }
525    }
526}
527
528impl<'data> WordSegmenterBorrowed<'data> {
529    /// Creates a word break iterator for an `str` (a UTF-8 string).
530    ///
531    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
532    pub fn segment_str<'s>(self, input: &'s str) -> WordBreakIterator<'data, 's, Utf8> {
533        WordBreakIterator(RuleBreakIterator {
534            iter: input.char_indices(),
535            len: input.len(),
536            current_pos_data: None,
537            result_cache: Vec::new(),
538            data: self.data,
539            complex: Some(self.complex),
540            boundary_property: 0,
541            locale_override: self.locale_override,
542            handle_complex_language: Utf8::word_handle_complex_language,
543        })
544    }
545
546    /// Creates a word break iterator for a potentially ill-formed UTF8 string
547    ///
548    /// Invalid characters are treated as REPLACEMENT CHARACTER
549    ///
550    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
551    pub fn segment_utf8<'s>(
552        self,
553        input: &'s [u8],
554    ) -> WordBreakIterator<'data, 's, PotentiallyIllFormedUtf8> {
555        WordBreakIterator(RuleBreakIterator {
556            iter: Utf8CharIndices::new(input),
557            len: input.len(),
558            current_pos_data: None,
559            result_cache: Vec::new(),
560            data: self.data,
561            complex: Some(self.complex),
562            boundary_property: 0,
563            locale_override: self.locale_override,
564            handle_complex_language: PotentiallyIllFormedUtf8::word_handle_complex_language,
565        })
566    }
567
568    /// Creates a word break iterator for a Latin-1 (8-bit) string.
569    ///
570    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
571    pub fn segment_latin1<'s>(self, input: &'s [u8]) -> WordBreakIterator<'data, 's, Latin1> {
572        WordBreakIterator(RuleBreakIterator {
573            iter: Latin1Indices::new(input),
574            len: input.len(),
575            current_pos_data: None,
576            result_cache: Vec::new(),
577            data: self.data,
578            complex: Some(self.complex),
579            boundary_property: 0,
580            locale_override: self.locale_override,
581            handle_complex_language: Latin1::word_handle_complex_language,
582        })
583    }
584
585    /// Creates a word break iterator for a UTF-16 string.
586    ///
587    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
588    pub fn segment_utf16<'s>(self, input: &'s [u16]) -> WordBreakIterator<'data, 's, Utf16> {
589        WordBreakIterator(RuleBreakIterator {
590            iter: Utf16Indices::new(input),
591            len: input.len(),
592            current_pos_data: None,
593            result_cache: Vec::new(),
594            data: self.data,
595            complex: Some(self.complex),
596            boundary_property: 0,
597            locale_override: self.locale_override,
598            handle_complex_language: Utf16::word_handle_complex_language,
599        })
600    }
601}
602
603impl WordSegmenterBorrowed<'static> {
604    /// Cheaply converts a [`WordSegmenterBorrowed<'static>`] into a [`WordSegmenter`].
605    ///
606    /// Note: Due to branching and indirection, using [`WordSegmenter`] might inhibit some
607    /// compile-time optimizations that are possible with [`WordSegmenterBorrowed`].
608    pub fn static_to_owned(self) -> WordSegmenter {
609        let payload_locale_override = self.locale_override.map(DataPayload::from_static_ref);
610        WordSegmenter {
611            payload: DataPayload::from_static_ref(self.data),
612            complex: self.complex.static_to_owned(),
613            payload_locale_override,
614        }
615    }
616}
617
/// A trait allowing for [`WordBreakIterator`] to be generalized to multiple string iteration methods.
///
/// This is implemented by ICU4X for several common string types.
///
/// <div class="stab unstable">
/// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
/// trait, please consider using a type from the implementors listed below.
/// </div>
pub trait WordBreakType: crate::private::Sealed + Sized + RuleBreakType {
    // Hook that the `segment_*` methods store in the iterator's
    // `handle_complex_language` field. `left_codepoint` is the code point that the
    // implementations place ahead of the iterator's current code point when they
    // collect the run handed to the complex-script segmenter.
    #[doc(hidden)]
    fn word_handle_complex_language(
        iterator: &mut RuleBreakIterator<'_, '_, Self>,
        left_codepoint: Self::CharType,
    ) -> Option<usize>;
}
633
impl WordBreakType for Utf8 {
    // Delegates to the handler shared by the `char`-based iterators.
    fn word_handle_complex_language(
        iter: &mut RuleBreakIterator<'_, '_, Self>,
        left_codepoint: Self::CharType,
    ) -> Option<usize> {
        handle_complex_language_utf8(iter, left_codepoint)
    }
}
642
impl WordBreakType for PotentiallyIllFormedUtf8 {
    // Delegates to the handler shared by the `char`-based iterators.
    fn word_handle_complex_language(
        iter: &mut RuleBreakIterator<'_, '_, Self>,
        left_codepoint: Self::CharType,
    ) -> Option<usize> {
        handle_complex_language_utf8(iter, left_codepoint)
    }
}
651
impl WordBreakType for Latin1 {
    // This hook is not expected to be reached for Latin-1 input: debug builds assert,
    // while release builds simply report "no result" by returning `None`.
    fn word_handle_complex_language(
        _iter: &mut RuleBreakIterator<'_, '_, Self>,
        _left_codepoint: Self::CharType,
    ) -> Option<usize> {
        debug_assert!(
            false,
            "latin-1 text should never need complex language handling"
        );
        None
    }
}
664
/// `handle_complex_language` implementation shared by the `char`-based (UTF-8) iterators.
///
/// Collects `left_codepoint` plus the following code points whose break property equals
/// `iter.data.complex_property` into a string, hands that string to the complex-script
/// segmenter, and then walks the iterator forward to the first break it reported.
///
/// Returns the position of that first break, `Some(iter.len)` if the end of input is
/// reached first, or `None` if the current code point cannot be read or the complex
/// segmenter reports no breaks. Remaining breaks are stored in `iter.result_cache`,
/// re-based by subtracting the offset of the first break.
fn handle_complex_language_utf8<T>(
    iter: &mut RuleBreakIterator<'_, '_, T>,
    left_codepoint: T::CharType,
) -> Option<usize>
where
    T: RuleBreakType<CharType = char>,
{
    // word segmenter doesn't define break rules for some languages such as Thai.
    // Remember where we are so that the look-ahead below can be undone.
    let start_iter = iter.iter.clone();
    let start_point = iter.current_pos_data;
    let mut s = String::new();
    s.push(left_codepoint);
    // Look ahead, collecting code points until the break property changes or input ends.
    loop {
        debug_assert!(!iter.is_eof());
        s.push(iter.get_current_codepoint()?);
        iter.advance_iter();
        if let Some(current_break_property) = iter.get_current_break_property() {
            if current_break_property != iter.data.complex_property {
                break;
            }
        } else {
            // EOF
            break;
        }
    }

    // Restore iterator to move to head of complex string
    iter.iter = start_iter;
    iter.current_pos_data = start_point;
    #[expect(clippy::unwrap_used)] // iter.complex present for word segmenter
    let breaks = iter.complex.unwrap().complex_language_segment_str(&s);
    iter.result_cache = breaks;
    // No breaks reported: nothing to return.
    let first_pos = *iter.result_cache.first()?;
    // `i` counts UTF-8 bytes into `s`; `left_codepoint` is already behind the iterator.
    let mut i = left_codepoint.len_utf8();
    loop {
        if i == first_pos {
            // Re-calculate breaking offset
            iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
            return iter.get_current_position();
        }
        debug_assert!(
            i < first_pos,
            "we should always arrive at first_pos: near index {:?}",
            iter.get_current_position()
        );
        i += iter.get_current_codepoint().map_or(0, T::char_len);
        iter.advance_iter();
        if iter.is_eof() {
            // Ran out of input before reaching the first break: drop the cached
            // breaks and report the end of the text.
            iter.result_cache.clear();
            return Some(iter.len);
        }
    }
}
719
720impl WordBreakType for Utf16 {
721    fn word_handle_complex_language(
722        iter: &mut RuleBreakIterator<Self>,
723        left_codepoint: Self::CharType,
724    ) -> Option<usize> {
725        // word segmenter doesn't define break rules for some languages such as Thai.
726        let start_iter = iter.iter.clone();
727        let start_point = iter.current_pos_data;
728        let mut s = vec![left_codepoint as u16];
729        loop {
730            debug_assert!(!iter.is_eof());
731            s.push(iter.get_current_codepoint()? as u16);
732            iter.advance_iter();
733            if let Some(current_break_property) = iter.get_current_break_property() {
734                if current_break_property != iter.data.complex_property {
735                    break;
736                }
737            } else {
738                // EOF
739                break;
740            }
741        }
742
743        // Restore iterator to move to head of complex string
744        iter.iter = start_iter;
745        iter.current_pos_data = start_point;
746        #[expect(clippy::unwrap_used)] // iter.complex present for word segmenter
747        let breaks = iter.complex.unwrap().complex_language_segment_utf16(&s);
748        iter.result_cache = breaks;
749        // result_cache vector is utf-16 index that is in BMP.
750        let first_pos = *iter.result_cache.first()?;
751        let mut i = 1;
752        loop {
753            if i == first_pos {
754                // Re-calculate breaking offset
755                iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
756                return iter.get_current_position();
757            }
758            debug_assert!(
759                i < first_pos,
760                "we should always arrive at first_pos: near index {:?}",
761                iter.get_current_position()
762            );
763            i += 1;
764            iter.advance_iter();
765            if iter.is_eof() {
766                iter.result_cache.clear();
767                return Some(iter.len);
768            }
769        }
770    }
771}
772
// `WordSegmenter::new_auto` only exists when both the `compiled_data` and `auto`
// features are enabled, so the test is gated on exactly those features.
#[cfg(all(test, feature = "compiled_data", feature = "auto"))]
#[test]
fn empty_string() {
    let segmenter = WordSegmenter::new_auto(WordBreakInvariantOptions::default());
    // The empty string has a single boundary, at offset 0.
    let breaks: Vec<usize> = segmenter.segment_str("").collect();
    assert_eq!(breaks, [0]);
}