icu_segmenter/word.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::complex::*;
6use crate::indices::{Latin1Indices, Utf16Indices};
7use crate::iterator_helpers::derive_usize_iterator_with_type;
8use crate::provider::*;
9use crate::rule_segmenter::*;
10use alloc::string::String;
11use alloc::vec;
12use alloc::vec::Vec;
13use icu_locale_core::LanguageIdentifier;
14use icu_provider::prelude::*;
15use utf8_iter::Utf8CharIndices;
16
/// Options to tailor word breaking behavior.
///
/// Combines an optional content locale with the locale-independent
/// [`WordBreakInvariantOptions`].
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub struct WordBreakOptions<'a> {
    /// Content locale for word segmenter
    ///
    /// If you know the language of the text being segmented, provide it here in order to produce
    /// higher quality breakpoints.
    ///
    /// # Examples
    ///
    /// Normally, a colon character ':' is a word separator:
    ///
    /// ```rust
    /// use icu::segmenter::WordSegmenter;
    ///
    /// let segmenter = WordSegmenter::new_auto(Default::default());
    ///
    /// let breakpoints: Vec<usize> = segmenter.segment_str("EU:ssa").collect();
    /// assert_eq!(&breakpoints, &[0, 2, 3, 6]);
    /// ```
    ///
    /// But not in Finnish, where it is used for loanwords:
    ///
    /// ```rust
    /// use icu::locale::langid;
    /// use icu::segmenter::options::WordBreakOptions;
    /// use icu::segmenter::WordSegmenter;
    ///
    /// let mut options = WordBreakOptions::default();
    /// let langid = &langid!("fi");
    /// options.content_locale = Some(langid);
    /// let segmenter = WordSegmenter::try_new_auto(options).unwrap();
    ///
    /// let breakpoints: Vec<usize> =
    ///     segmenter.as_borrowed().segment_str("EU:ssa").collect();
    /// assert_eq!(&breakpoints, &[0, 6]);
    /// ```
    pub content_locale: Option<&'a LanguageIdentifier>,
    /// Options independent of the locale
    pub invariant_options: WordBreakInvariantOptions,
}
59
/// Locale-independent options to tailor word breaking behavior.
///
/// Currently empty but may grow in the future; the type is `#[non_exhaustive]`,
/// so obtain a value through [`Default`] rather than a struct literal.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub struct WordBreakInvariantOptions {}
66
/// Implements the [`Iterator`] trait over the word boundaries of the given string.
///
/// Lifetimes:
///
/// - `'data` = lifetime of the segmenter data from which this iterator was created
/// - `'s` = lifetime of the string being segmented
///
/// The [`Iterator::Item`] is a [`usize`] representing the index of a code unit
/// _after_ the boundary (for a boundary at the end of text, this index is the length
/// of the [`str`] or array of code units).
///
/// For examples of use, see [`WordSegmenter`].
#[derive(Debug)]
pub struct WordBreakIterator<'data, 's, Y: RuleBreakType>(RuleBreakIterator<'data, 's, Y>);

// NOTE(review): presumably generates the `Iterator<Item = usize>` impl for the
// wrapper by forwarding to the inner `RuleBreakIterator`; confirm against the
// macro definition in `iterator_helpers`.
derive_usize_iterator_with_type!(WordBreakIterator, 'data);
83
/// Hide ULE type
///
/// The enum lives in a crate-private module so that only [`WordType`] itself is
/// re-exported below; the `WordTypeULE` type named in the `make_ule` attribute
/// stays inside this module.
pub(crate) mod inner {
    /// The word type tag that is returned by [`WordBreakIterator::word_type()`].
    ///
    /// [`WordBreakIterator::word_type()`]: super::WordBreakIterator::word_type
    #[non_exhaustive]
    #[derive(Copy, Clone, PartialEq, Debug)]
    #[repr(u8)]
    #[zerovec::make_ule(WordTypeULE)]
    pub enum WordType {
        /// No category tag.
        None = 0,
        /// Number category tag.
        Number = 1,
        /// Letter category tag, including CJK.
        Letter = 2,
    }
}

// Public re-export of the tag enum defined in the private module above.
pub use inner::WordType;
104
105impl WordType {
106 /// Whether the segment is word-like; word-like segments include numbers, as
107 /// well as segments made up of letters (including CJKV ideographs).
108 pub fn is_word_like(&self) -> bool {
109 self != &WordType::None
110 }
111}
112
113impl<'data, 's, Y: RuleBreakType> WordBreakIterator<'data, 's, Y> {
114 /// Returns the word type of the segment preceding the current boundary.
115 #[inline]
116 pub fn word_type(&self) -> WordType {
117 self.0.word_type()
118 }
119
120 /// Returns an iterator over pairs of boundary position and word type.
121 pub fn iter_with_word_type(self) -> WordBreakIteratorWithWordType<'data, 's, Y> {
122 WordBreakIteratorWithWordType(self)
123 }
124
125 /// Returns `true` when the segment preceding the current boundary is word-like,
126 /// such as letters, numbers, or CJKV ideographs.
127 #[inline]
128 pub fn is_word_like(&self) -> bool {
129 self.word_type().is_word_like()
130 }
131}
132
/// Word break iterator that also returns the word type
///
/// Created by [`WordBreakIterator::iter_with_word_type()`]; each item pairs a
/// boundary index with the [`WordType`] of the segment preceding that boundary.
// We can use impl Trait here once `use<..>` syntax is available, see https://github.com/rust-lang/rust/issues/61756
#[derive(Debug)]
pub struct WordBreakIteratorWithWordType<'data, 's, Y: RuleBreakType>(
    WordBreakIterator<'data, 's, Y>,
);
139
140impl<Y: RuleBreakType> Iterator for WordBreakIteratorWithWordType<'_, '_, Y> {
141 type Item = (usize, WordType);
142 fn next(&mut self) -> Option<Self::Item> {
143 let ret = self.0.next()?;
144 Some((ret, self.0 .0.word_type()))
145 }
146}
147
/// Supports loading word break data, and creating word break iterators for different string
/// encodings.
///
/// Most segmentation methods live on [`WordSegmenterBorrowed`], which can be obtained via
/// [`WordSegmenter::new_auto()`] (etc) or [`WordSegmenter::as_borrowed()`].
///
/// # Content Locale
///
/// You can optionally provide a _content locale_ to the [`WordSegmenter`] constructor. If you
/// have information on the language of the text being segmented, providing this hint can
/// produce higher-quality results.
///
/// If you have a content locale, use [`WordBreakOptions`] and a constructor beginning with
/// `try_new`. If you do not have a content locale, use [`WordBreakInvariantOptions`] and a
/// constructor beginning with `new`.
///
/// # Examples
///
/// Segment a string:
///
/// ```rust
/// use icu::segmenter::WordSegmenter;
///
/// let segmenter = WordSegmenter::new_auto(Default::default());
///
/// let breakpoints: Vec<usize> =
///     segmenter.segment_str("Hello World").collect();
/// assert_eq!(&breakpoints, &[0, 5, 6, 11]);
/// ```
///
/// Segment a Latin1 byte string with a content locale:
///
/// ```rust
/// use icu::locale::langid;
/// use icu::segmenter::options::WordBreakOptions;
/// use icu::segmenter::WordSegmenter;
///
/// let mut options = WordBreakOptions::default();
/// let langid = &langid!("en");
/// options.content_locale = Some(langid);
/// let segmenter = WordSegmenter::try_new_auto(options).unwrap();
///
/// let breakpoints: Vec<usize> = segmenter
///     .as_borrowed()
///     .segment_latin1(b"Hello World")
///     .collect();
/// assert_eq!(&breakpoints, &[0, 5, 6, 11]);
/// ```
///
/// Successive boundaries can be used to retrieve the segments.
/// In particular, the first boundary is always 0, and the last one is the
/// length of the segmented text in code units.
///
/// ```rust
/// # use icu::segmenter::{WordSegmenter, options::WordBreakInvariantOptions};
/// # let segmenter = WordSegmenter::new_auto(WordBreakInvariantOptions::default());
/// use itertools::Itertools;
/// let text = "Mark’d ye his words?";
/// let segments: Vec<&str> = segmenter
///     .segment_str(text)
///     .tuple_windows()
///     .map(|(i, j)| &text[i..j])
///     .collect();
/// assert_eq!(
///     &segments,
///     &["Mark’d", " ", "ye", " ", "his", " ", "words", "?"]
/// );
/// ```
///
/// Not all segments delimited by word boundaries are words; some are interword
/// segments such as spaces and punctuation.
/// The [`WordBreakIterator::word_type()`] of a boundary can be used to
/// classify the preceding segment; [`WordBreakIterator::iter_with_word_type()`]
/// associates each boundary with its status.
///
/// ```rust
/// # use itertools::Itertools;
/// # use icu::segmenter::WordSegmenter;
/// # use icu::segmenter::options::{WordType, WordBreakInvariantOptions};
/// # let segmenter = WordSegmenter::new_auto(WordBreakInvariantOptions::default());
/// # let text = "Mark’d ye his words?";
/// let words: Vec<&str> = segmenter
///     .segment_str(text)
///     .iter_with_word_type()
///     .tuple_windows()
///     .filter(|(_, (_, segment_type))| segment_type.is_word_like())
///     .map(|((i, _), (j, _))| &text[i..j])
///     .collect();
/// assert_eq!(&words, &["Mark’d", "ye", "his", "words"]);
/// ```
#[derive(Debug)]
pub struct WordSegmenter {
    // Word break rule data, loaded with a default (locale-less) request.
    payload: DataPayload<SegmenterBreakWordV1>,
    // Payloads for complex-script segmentation (dictionary and/or LSTM).
    complex: ComplexPayloads,
    // Optional override data, present only when a content locale was given
    // and the provider had data for it.
    payload_locale_override: Option<DataPayload<SegmenterBreakWordOverrideV1>>,
}
243
/// Segments a string into words (borrowed version).
///
/// This is a cheap, `Copy` view over the data held by a [`WordSegmenter`]
/// (or over compiled `'static` data).
///
/// See [`WordSegmenter`] for examples.
#[derive(Clone, Debug, Copy)]
pub struct WordSegmenterBorrowed<'data> {
    // Word break rule data.
    data: &'data RuleBreakData<'data>,
    // Borrowed complex-script segmentation payloads.
    complex: ComplexPayloadsBorrowed<'data>,
    // Override data, if any was loaded for a content locale.
    locale_override: Option<&'data RuleBreakDataOverride<'data>>,
}
253
254impl WordSegmenter {
255 /// Constructs a [`WordSegmenter`] with an invariant locale and the best available compiled data for
256 /// complex scripts (Chinese, Japanese, Khmer, Lao, Myanmar, and Thai).
257 ///
258 /// The current behavior, which is subject to change, is to use the LSTM model when available
259 /// and the dictionary model for Chinese and Japanese.
260 ///
261 /// ✨ *Enabled with the `compiled_data` and `auto` Cargo features.*
262 ///
263 /// [📚 Help choosing a constructor](icu_provider::constructors)
264 ///
265 /// # Examples
266 ///
267 /// Behavior with complex scripts:
268 ///
269 /// ```
270 /// use icu::segmenter::{options::WordBreakInvariantOptions, WordSegmenter};
271 ///
272 /// let th_str = "ทุกสองสัปดาห์";
273 /// let ja_str = "こんにちは世界";
274 ///
275 /// let segmenter =
276 /// WordSegmenter::new_auto(WordBreakInvariantOptions::default());
277 ///
278 /// let th_bps = segmenter.segment_str(th_str).collect::<Vec<_>>();
279 /// let ja_bps = segmenter.segment_str(ja_str).collect::<Vec<_>>();
280 ///
281 /// assert_eq!(th_bps, [0, 9, 18, 39]);
282 /// assert_eq!(ja_bps, [0, 15, 21]);
283 /// ```
284 #[cfg(feature = "compiled_data")]
285 #[cfg(feature = "auto")]
286 pub fn new_auto(_options: WordBreakInvariantOptions) -> WordSegmenterBorrowed<'static> {
287 WordSegmenterBorrowed {
288 data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_WORD_V1,
289 complex: ComplexPayloadsBorrowed::new_auto(),
290 locale_override: None,
291 }
292 }
293
294 #[cfg(feature = "auto")]
295 icu_provider::gen_buffer_data_constructors!(
296 (options: WordBreakOptions) -> error: DataError,
297 functions: [
298 try_new_auto,
299 try_new_auto_with_buffer_provider,
300 try_new_auto_unstable,
301 Self
302 ]
303 );
304
305 #[cfg(feature = "auto")]
306 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_auto)]
307 pub fn try_new_auto_unstable<D>(
308 provider: &D,
309 options: WordBreakOptions,
310 ) -> Result<Self, DataError>
311 where
312 D: DataProvider<SegmenterBreakWordV1>
313 + DataProvider<SegmenterBreakWordOverrideV1>
314 + DataProvider<SegmenterDictionaryAutoV1>
315 + DataProvider<SegmenterLstmAutoV1>
316 + DataProvider<SegmenterBreakGraphemeClusterV1>
317 + ?Sized,
318 {
319 Ok(Self {
320 payload: provider.load(Default::default())?.payload,
321 complex: ComplexPayloads::try_new_auto(provider)?,
322 payload_locale_override: if let Some(locale) = options.content_locale {
323 let locale = DataLocale::from(locale);
324 let req = DataRequest {
325 id: DataIdentifierBorrowed::for_locale(&locale),
326 metadata: {
327 let mut metadata = DataRequestMetadata::default();
328 metadata.silent = true;
329 metadata
330 },
331 };
332 provider
333 .load(req)
334 .allow_identifier_not_found()?
335 .map(|r| r.payload)
336 } else {
337 None
338 },
339 })
340 }
341
342 /// Constructs a [`WordSegmenter`] with an invariant locale and compiled LSTM data for
343 /// complex scripts (Burmese, Khmer, Lao, and Thai).
344 ///
345 /// The LSTM, or Long Term Short Memory, is a machine learning model. It is smaller than
346 /// the full dictionary but more expensive during segmentation (inference).
347 ///
348 /// Warning: there is not currently an LSTM model for Chinese or Japanese, so the [`WordSegmenter`]
349 /// created by this function will have unexpected behavior in spans of those scripts.
350 ///
351 /// ✨ *Enabled with the `compiled_data` and `lstm` Cargo features.*
352 ///
353 /// [📚 Help choosing a constructor](icu_provider::constructors)
354 ///
355 /// # Examples
356 ///
357 /// Behavior with complex scripts:
358 ///
359 /// ```
360 /// use icu::segmenter::{options::WordBreakInvariantOptions, WordSegmenter};
361 ///
362 /// let th_str = "ทุกสองสัปดาห์";
363 /// let ja_str = "こんにちは世界";
364 ///
365 /// let segmenter =
366 /// WordSegmenter::new_lstm(WordBreakInvariantOptions::default());
367 ///
368 /// let th_bps = segmenter.segment_str(th_str).collect::<Vec<_>>();
369 /// let ja_bps = segmenter.segment_str(ja_str).collect::<Vec<_>>();
370 ///
371 /// assert_eq!(th_bps, [0, 9, 18, 39]);
372 ///
373 /// // Note: We aren't able to find a suitable breakpoint in Chinese/Japanese.
374 /// assert_eq!(ja_bps, [0, 21]);
375 /// ```
376 #[cfg(feature = "compiled_data")]
377 #[cfg(feature = "lstm")]
378 pub fn new_lstm(_options: WordBreakInvariantOptions) -> WordSegmenterBorrowed<'static> {
379 WordSegmenterBorrowed {
380 data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_WORD_V1,
381 complex: ComplexPayloadsBorrowed::new_lstm(),
382 locale_override: None,
383 }
384 }
385
386 #[cfg(feature = "lstm")]
387 icu_provider::gen_buffer_data_constructors!(
388 (options: WordBreakOptions) -> error: DataError,
389 functions: [
390 try_new_lstm,
391 try_new_lstm_with_buffer_provider,
392 try_new_lstm_unstable,
393 Self
394 ]
395 );
396
397 #[cfg(feature = "lstm")]
398 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_lstm)]
399 pub fn try_new_lstm_unstable<D>(
400 provider: &D,
401 options: WordBreakOptions,
402 ) -> Result<Self, DataError>
403 where
404 D: DataProvider<SegmenterBreakWordV1>
405 + DataProvider<SegmenterBreakWordOverrideV1>
406 + DataProvider<SegmenterLstmAutoV1>
407 + DataProvider<SegmenterBreakGraphemeClusterV1>
408 + ?Sized,
409 {
410 Ok(Self {
411 payload: provider.load(Default::default())?.payload,
412 complex: ComplexPayloads::try_new_lstm(provider)?,
413 payload_locale_override: if let Some(locale) = options.content_locale {
414 let locale = DataLocale::from(locale);
415 let req = DataRequest {
416 id: DataIdentifierBorrowed::for_locale(&locale),
417 metadata: {
418 let mut metadata = DataRequestMetadata::default();
419 metadata.silent = true;
420 metadata
421 },
422 };
423 provider
424 .load(req)
425 .allow_identifier_not_found()?
426 .map(|r| r.payload)
427 } else {
428 None
429 },
430 })
431 }
432
433 /// Construct a [`WordSegmenter`] with an invariant locale and compiled dictionary data for
434 /// complex scripts (Chinese, Japanese, Khmer, Lao, Myanmar, and Thai).
435 ///
436 /// The dictionary model uses a list of words to determine appropriate breakpoints. It is
437 /// faster than the LSTM model but requires more data.
438 ///
439 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
440 ///
441 /// [📚 Help choosing a constructor](icu_provider::constructors)
442 ///
443 /// # Examples
444 ///
445 /// Behavior with complex scripts:
446 ///
447 /// ```
448 /// use icu::segmenter::{options::WordBreakInvariantOptions, WordSegmenter};
449 ///
450 /// let th_str = "ทุกสองสัปดาห์";
451 /// let ja_str = "こんにちは世界";
452 ///
453 /// let segmenter =
454 /// WordSegmenter::new_dictionary(WordBreakInvariantOptions::default());
455 ///
456 /// let th_bps = segmenter.segment_str(th_str).collect::<Vec<_>>();
457 /// let ja_bps = segmenter.segment_str(ja_str).collect::<Vec<_>>();
458 ///
459 /// assert_eq!(th_bps, [0, 9, 18, 39]);
460 /// assert_eq!(ja_bps, [0, 15, 21]);
461 /// ```
462 #[cfg(feature = "compiled_data")]
463 pub fn new_dictionary(_options: WordBreakInvariantOptions) -> WordSegmenterBorrowed<'static> {
464 WordSegmenterBorrowed {
465 data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_WORD_V1,
466 complex: ComplexPayloadsBorrowed::new_dict(),
467 locale_override: None,
468 }
469 }
470
471 icu_provider::gen_buffer_data_constructors!(
472 (options: WordBreakOptions) -> error: DataError,
473 functions: [
474 try_new_dictionary,
475 try_new_dictionary_with_buffer_provider,
476 try_new_dictionary_unstable,
477 Self
478 ]
479 );
480
481 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary)]
482 pub fn try_new_dictionary_unstable<D>(
483 provider: &D,
484 options: WordBreakOptions,
485 ) -> Result<Self, DataError>
486 where
487 D: DataProvider<SegmenterBreakWordV1>
488 + DataProvider<SegmenterBreakWordOverrideV1>
489 + DataProvider<SegmenterDictionaryAutoV1>
490 + DataProvider<SegmenterDictionaryExtendedV1>
491 + DataProvider<SegmenterBreakGraphemeClusterV1>
492 + ?Sized,
493 {
494 Ok(Self {
495 payload: provider.load(Default::default())?.payload,
496 complex: ComplexPayloads::try_new_dict(provider)?,
497 payload_locale_override: if let Some(locale) = options.content_locale {
498 let locale = DataLocale::from(locale);
499 let req = DataRequest {
500 id: DataIdentifierBorrowed::for_locale(&locale),
501 metadata: {
502 let mut metadata = DataRequestMetadata::default();
503 metadata.silent = true;
504 metadata
505 },
506 };
507 provider
508 .load(req)
509 .allow_identifier_not_found()?
510 .map(|r| r.payload)
511 } else {
512 None
513 },
514 })
515 }
516 /// Constructs a borrowed version of this type for more efficient querying.
517 ///
518 /// Most useful methods for segmentation are on this type.
519 pub fn as_borrowed(&self) -> WordSegmenterBorrowed<'_> {
520 WordSegmenterBorrowed {
521 data: self.payload.get(),
522 complex: self.complex.as_borrowed(),
523 locale_override: self.payload_locale_override.as_ref().map(|p| p.get()),
524 }
525 }
526}
527
528impl<'data> WordSegmenterBorrowed<'data> {
529 /// Creates a word break iterator for an `str` (a UTF-8 string).
530 ///
531 /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
532 pub fn segment_str<'s>(self, input: &'s str) -> WordBreakIterator<'data, 's, Utf8> {
533 WordBreakIterator(RuleBreakIterator {
534 iter: input.char_indices(),
535 len: input.len(),
536 current_pos_data: None,
537 result_cache: Vec::new(),
538 data: self.data,
539 complex: Some(self.complex),
540 boundary_property: 0,
541 locale_override: self.locale_override,
542 handle_complex_language: Utf8::word_handle_complex_language,
543 })
544 }
545
546 /// Creates a word break iterator for a potentially ill-formed UTF8 string
547 ///
548 /// Invalid characters are treated as REPLACEMENT CHARACTER
549 ///
550 /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
551 pub fn segment_utf8<'s>(
552 self,
553 input: &'s [u8],
554 ) -> WordBreakIterator<'data, 's, PotentiallyIllFormedUtf8> {
555 WordBreakIterator(RuleBreakIterator {
556 iter: Utf8CharIndices::new(input),
557 len: input.len(),
558 current_pos_data: None,
559 result_cache: Vec::new(),
560 data: self.data,
561 complex: Some(self.complex),
562 boundary_property: 0,
563 locale_override: self.locale_override,
564 handle_complex_language: PotentiallyIllFormedUtf8::word_handle_complex_language,
565 })
566 }
567
568 /// Creates a word break iterator for a Latin-1 (8-bit) string.
569 ///
570 /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
571 pub fn segment_latin1<'s>(self, input: &'s [u8]) -> WordBreakIterator<'data, 's, Latin1> {
572 WordBreakIterator(RuleBreakIterator {
573 iter: Latin1Indices::new(input),
574 len: input.len(),
575 current_pos_data: None,
576 result_cache: Vec::new(),
577 data: self.data,
578 complex: Some(self.complex),
579 boundary_property: 0,
580 locale_override: self.locale_override,
581 handle_complex_language: Latin1::word_handle_complex_language,
582 })
583 }
584
585 /// Creates a word break iterator for a UTF-16 string.
586 ///
587 /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
588 pub fn segment_utf16<'s>(self, input: &'s [u16]) -> WordBreakIterator<'data, 's, Utf16> {
589 WordBreakIterator(RuleBreakIterator {
590 iter: Utf16Indices::new(input),
591 len: input.len(),
592 current_pos_data: None,
593 result_cache: Vec::new(),
594 data: self.data,
595 complex: Some(self.complex),
596 boundary_property: 0,
597 locale_override: self.locale_override,
598 handle_complex_language: Utf16::word_handle_complex_language,
599 })
600 }
601}
602
603impl WordSegmenterBorrowed<'static> {
604 /// Cheaply converts a [`WordSegmenterBorrowed<'static>`] into a [`WordSegmenter`].
605 ///
606 /// Note: Due to branching and indirection, using [`WordSegmenter`] might inhibit some
607 /// compile-time optimizations that are possible with [`WordSegmenterBorrowed`].
608 pub fn static_to_owned(self) -> WordSegmenter {
609 let payload_locale_override = self.locale_override.map(DataPayload::from_static_ref);
610 WordSegmenter {
611 payload: DataPayload::from_static_ref(self.data),
612 complex: self.complex.static_to_owned(),
613 payload_locale_override,
614 }
615 }
616}
617
/// A trait allowing for [`WordBreakIterator`] to be generalized to multiple string iteration methods.
///
/// This is implemented by ICU4X for several common string types.
///
/// <div class="stab unstable">
/// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
/// trait, please consider using a type from the implementors listed below.
/// </div>
pub trait WordBreakType: crate::private::Sealed + Sized + RuleBreakType {
    // Hook used as the `handle_complex_language` callback of a
    // `RuleBreakIterator` built by the word segmenter. `left_codepoint` is the
    // code point handed in by the caller alongside the iterator; the return
    // value is an optional boundary position.
    #[doc(hidden)]
    fn word_handle_complex_language(
        iterator: &mut RuleBreakIterator<'_, '_, Self>,
        left_codepoint: Self::CharType,
    ) -> Option<usize>;
}
633
impl WordBreakType for Utf8 {
    // Well-formed UTF-8 shares the `char`-based implementation below.
    fn word_handle_complex_language(
        iter: &mut RuleBreakIterator<'_, '_, Self>,
        left_codepoint: Self::CharType,
    ) -> Option<usize> {
        handle_complex_language_utf8(iter, left_codepoint)
    }
}
642
impl WordBreakType for PotentiallyIllFormedUtf8 {
    // Potentially ill-formed UTF-8 also iterates `char`s, so it reuses the
    // same shared implementation as `Utf8`.
    fn word_handle_complex_language(
        iter: &mut RuleBreakIterator<'_, '_, Self>,
        left_codepoint: Self::CharType,
    ) -> Option<usize> {
        handle_complex_language_utf8(iter, left_codepoint)
    }
}
651
impl WordBreakType for Latin1 {
    // Latin-1 input is not expected to reach complex-language handling at all:
    // debug builds assert if this is ever called, release builds simply
    // report no boundary.
    fn word_handle_complex_language(
        _iter: &mut RuleBreakIterator<'_, '_, Self>,
        _left_codepoint: Self::CharType,
    ) -> Option<usize> {
        debug_assert!(
            false,
            "latin-1 text should never need complex language handling"
        );
        None
    }
}
664
/// handle_complex_language impl for UTF8 iterators
///
/// Gathers a run of text made of `left_codepoint` followed by the code points
/// from the iterator's current position onward, for as long as their break
/// property equals `iter.data.complex_property`. The run is handed to the
/// complex-script segmenter, the returned breakpoints are cached in
/// `iter.result_cache`, and the iterator is moved to the first of them.
///
/// Returns:
/// - `None` if the iterator has no current code point while gathering the
///   run, or if the complex segmenter returns no breakpoints;
/// - `Some(iter.len)` if the end of input is reached before the first
///   breakpoint (the cache is cleared in that case);
/// - otherwise the iterator's position at the first breakpoint.
fn handle_complex_language_utf8<T>(
    iter: &mut RuleBreakIterator<'_, '_, T>,
    left_codepoint: T::CharType,
) -> Option<usize>
where
    T: RuleBreakType<CharType = char>,
{
    // word segmenter doesn't define break rules for some languages such as Thai.
    // Snapshot the iterator state so it can be rewound after the look-ahead.
    let start_iter = iter.iter.clone();
    let start_point = iter.current_pos_data;
    let mut s = String::new();
    s.push(left_codepoint);
    // The current code point is always taken first; the property check only
    // decides whether to continue with the one after it.
    loop {
        debug_assert!(!iter.is_eof());
        s.push(iter.get_current_codepoint()?);
        iter.advance_iter();
        if let Some(current_break_property) = iter.get_current_break_property() {
            if current_break_property != iter.data.complex_property {
                break;
            }
        } else {
            // EOF
            break;
        }
    }

    // Restore iterator to move to head of complex string
    iter.iter = start_iter;
    iter.current_pos_data = start_point;
    #[expect(clippy::unwrap_used)] // iter.complex present for word segmenter
    let breaks = iter.complex.unwrap().complex_language_segment_str(&s);
    iter.result_cache = breaks;
    let first_pos = *iter.result_cache.first()?;
    // `i` tracks the offset within `s` that corresponds to the iterator's
    // position; the iterator sits just after `left_codepoint`.
    // NOTE(review): this assumes the breakpoints are UTF-8 byte offsets into `s` — confirm.
    let mut i = left_codepoint.len_utf8();
    loop {
        if i == first_pos {
            // Re-calculate breaking offset
            // Drop the breakpoint just reached and rebase the remaining ones
            // so they are relative to the current position.
            iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
            return iter.get_current_position();
        }
        debug_assert!(
            i < first_pos,
            "we should always arrive at first_pos: near index {:?}",
            iter.get_current_position()
        );
        i += iter.get_current_codepoint().map_or(0, T::char_len);
        iter.advance_iter();
        if iter.is_eof() {
            // Ran off the end before reaching `first_pos`: discard the cache
            // and report the end of input as the boundary.
            iter.result_cache.clear();
            return Some(iter.len);
        }
    }
}
719
impl WordBreakType for Utf16 {
    // UTF-16 counterpart of `handle_complex_language_utf8`: gathers the run of
    // code points whose break property equals `iter.data.complex_property`
    // (starting with `left_codepoint`), segments it with the complex
    // segmenter, caches the breakpoints and moves the iterator to the first
    // one. Offsets here are counted one per code point, i.e. one UTF-16 code
    // unit each.
    fn word_handle_complex_language(
        iter: &mut RuleBreakIterator<Self>,
        left_codepoint: Self::CharType,
    ) -> Option<usize> {
        // word segmenter doesn't define break rules for some languages such as Thai.
        // Snapshot the iterator state so it can be rewound after the look-ahead.
        let start_iter = iter.iter.clone();
        let start_point = iter.current_pos_data;
        // NOTE(review): the `as u16` casts truncate; this relies on the code
        // points of the run being in the BMP (see the comment further down) — confirm.
        let mut s = vec![left_codepoint as u16];
        loop {
            debug_assert!(!iter.is_eof());
            s.push(iter.get_current_codepoint()? as u16);
            iter.advance_iter();
            if let Some(current_break_property) = iter.get_current_break_property() {
                if current_break_property != iter.data.complex_property {
                    break;
                }
            } else {
                // EOF
                break;
            }
        }

        // Restore iterator to move to head of complex string
        iter.iter = start_iter;
        iter.current_pos_data = start_point;
        #[expect(clippy::unwrap_used)] // iter.complex present for word segmenter
        let breaks = iter.complex.unwrap().complex_language_segment_utf16(&s);
        iter.result_cache = breaks;
        // result_cache vector is utf-16 index that is in BMP.
        let first_pos = *iter.result_cache.first()?;
        // `i` is the index within `s` matching the iterator's position; it
        // starts at 1 because `left_codepoint` occupies index 0.
        let mut i = 1;
        loop {
            if i == first_pos {
                // Re-calculate breaking offset
                // Drop the breakpoint just reached and rebase the remaining
                // ones so they are relative to the current position.
                iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
                return iter.get_current_position();
            }
            debug_assert!(
                i < first_pos,
                "we should always arrive at first_pos: near index {:?}",
                iter.get_current_position()
            );
            i += 1;
            iter.advance_iter();
            if iter.is_eof() {
                // Ran off the end before reaching `first_pos`: discard the
                // cache and report the end of input as the boundary.
                iter.result_cache.clear();
                return Some(iter.len);
            }
        }
    }
}
772
// Regression test: segmenting the empty string yields exactly one boundary, at 0.
//
// `WordSegmenter::new_auto` only exists when both the `compiled_data` and
// `auto` features are enabled, so the test is gated on exactly those features.
#[cfg(all(test, feature = "compiled_data", feature = "auto"))]
#[test]
fn empty_string() {
    let segmenter = WordSegmenter::new_auto(WordBreakInvariantOptions::default());
    let breaks: Vec<usize> = segmenter.segment_str("").collect();
    assert_eq!(breaks, [0]);
}