icu_segmenter/word.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::complex::*;
6use crate::indices::{Latin1Indices, Utf16Indices};
7use crate::iterator_helpers::derive_usize_iterator_with_type;
8use crate::provider::*;
9use crate::rule_segmenter::*;
10use alloc::string::String;
11use alloc::vec;
12use alloc::vec::Vec;
13use icu_locale_core::LanguageIdentifier;
14use icu_provider::prelude::*;
15use utf8_iter::Utf8CharIndices;
16
17/// Options to tailor word breaking behavior.
18#[non_exhaustive]
19#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
20pub struct WordBreakOptions<'a> {
21 /// Content locale for word segmenter
22 pub content_locale: Option<&'a LanguageIdentifier>,
23 /// Options independent of the locale
24 pub invariant_options: WordBreakInvariantOptions,
25}
26
/// Locale-independent options to tailor word breaking behavior.
///
/// This struct currently has no fields, but it is `#[non_exhaustive]` so that
/// options can be added in the future.
#[non_exhaustive]
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct WordBreakInvariantOptions {}
33
34/// Implements the [`Iterator`] trait over the word boundaries of the given string.
35///
36/// Lifetimes:
37///
38/// - `'l` = lifetime of the segmenter object from which this iterator was created
39/// - `'s` = lifetime of the string being segmented
40///
41/// The [`Iterator::Item`] is an [`usize`] representing index of a code unit
42/// _after_ the boundary (for a boundary at the end of text, this index is the length
43/// of the [`str`] or array of code units).
44///
45/// For examples of use, see [`WordSegmenter`].
46#[derive(Debug)]
47pub struct WordBreakIterator<'data, 's, Y: RuleBreakType>(RuleBreakIterator<'data, 's, Y>);
48
49derive_usize_iterator_with_type!(WordBreakIterator, 'data);
50
51/// Hide ULE type
52pub(crate) mod inner {
53 /// The word type tag that is returned by [`WordBreakIterator::word_type()`].
54 ///
55 /// [`WordBreakIterator::word_type()`]: super::WordBreakIterator::word_type
56 #[non_exhaustive]
57 #[derive(Copy, Clone, PartialEq, Debug)]
58 #[repr(u8)]
59 #[zerovec::make_ule(WordTypeULE)]
60 pub enum WordType {
61 /// No category tag.
62 None = 0,
63 /// Number category tag.
64 Number = 1,
65 /// Letter category tag, including CJK.
66 Letter = 2,
67 }
68}
69
70pub use inner::WordType;
71
72impl WordType {
73 /// Whether the segment is word-like; word-like segments include numbers, as
74 /// well as segments made up of letters (including CJKV ideographs).
75 pub fn is_word_like(&self) -> bool {
76 self != &WordType::None
77 }
78}
79
80impl<'data, 's, Y: RuleBreakType> WordBreakIterator<'data, 's, Y> {
81 /// Returns the word type of the segment preceding the current boundary.
82 #[inline]
83 pub fn word_type(&self) -> WordType {
84 self.0.word_type()
85 }
86
87 /// Returns an iterator over pairs of boundary position and word type.
88 pub fn iter_with_word_type(self) -> WordBreakIteratorWithWordType<'data, 's, Y> {
89 WordBreakIteratorWithWordType(self)
90 }
91
92 /// Returns `true` when the segment preceding the current boundary is word-like,
93 /// such as letters, numbers, or CJKV ideographs.
94 #[inline]
95 pub fn is_word_like(&self) -> bool {
96 self.word_type().is_word_like()
97 }
98}
99
100/// Word break iterator that also returns the word type
101// We can use impl Trait here once `use<..>` syntax is available, see https://github.com/rust-lang/rust/issues/61756
102#[derive(Debug)]
103pub struct WordBreakIteratorWithWordType<'data, 's, Y: RuleBreakType>(
104 WordBreakIterator<'data, 's, Y>,
105);
106
107impl<Y: RuleBreakType> Iterator for WordBreakIteratorWithWordType<'_, '_, Y> {
108 type Item = (usize, WordType);
109 fn next(&mut self) -> Option<Self::Item> {
110 let ret = self.0.next()?;
111 Some((ret, self.0 .0.word_type()))
112 }
113}
114
115/// Supports loading word break data, and creating word break iterators for different string
116/// encodings.
117///
118/// Most segmentation methods live on [`WordSegmenterBorrowed`], which can be obtained via
119/// [`WordSegmenter::new_auto()`] (etc) or [`WordSegmenter::as_borrowed()`].
120///
121/// # Examples
122///
123/// Segment a string:
124///
125/// ```rust
126/// use icu::segmenter::{options::WordBreakInvariantOptions, WordSegmenter};
127/// let segmenter =
128/// WordSegmenter::new_auto(WordBreakInvariantOptions::default());
129///
130/// let breakpoints: Vec<usize> =
131/// segmenter.segment_str("Hello World").collect();
132/// assert_eq!(&breakpoints, &[0, 5, 6, 11]);
133/// ```
134///
135/// Segment a Latin1 byte string:
136///
137/// ```rust
138/// use icu::segmenter::{options::WordBreakInvariantOptions, WordSegmenter};
139/// let segmenter =
140/// WordSegmenter::new_auto(WordBreakInvariantOptions::default());
141///
142/// let breakpoints: Vec<usize> =
143/// segmenter.segment_latin1(b"Hello World").collect();
144/// assert_eq!(&breakpoints, &[0, 5, 6, 11]);
145/// ```
146///
147/// Successive boundaries can be used to retrieve the segments.
148/// In particular, the first boundary is always 0, and the last one is the
149/// length of the segmented text in code units.
150///
151/// ```rust
152/// # use icu::segmenter::{WordSegmenter, options::WordBreakInvariantOptions};
153/// # let segmenter = WordSegmenter::new_auto(WordBreakInvariantOptions::default());
154/// use itertools::Itertools;
155/// let text = "Mark’d ye his words?";
156/// let segments: Vec<&str> = segmenter
157/// .segment_str(text)
158/// .tuple_windows()
159/// .map(|(i, j)| &text[i..j])
160/// .collect();
161/// assert_eq!(
162/// &segments,
163/// &["Mark’d", " ", "ye", " ", "his", " ", "words", "?"]
164/// );
165/// ```
166///
167/// Not all segments delimited by word boundaries are words; some are interword
168/// segments such as spaces and punctuation.
169/// The [`WordBreakIterator::word_type()`] of a boundary can be used to
170/// classify the preceding segment; [`WordBreakIterator::iter_with_word_type()`]
171/// associates each boundary with its status.
172/// ```rust
173/// # use itertools::Itertools;
174/// # use icu::segmenter::WordSegmenter;
175/// # use icu::segmenter::options::{WordType, WordBreakInvariantOptions};
176/// # let segmenter = WordSegmenter::new_auto(WordBreakInvariantOptions::default());
177/// # let text = "Mark’d ye his words?";
178/// let words: Vec<&str> = segmenter
179/// .segment_str(text)
180/// .iter_with_word_type()
181/// .tuple_windows()
182/// .filter(|(_, (_, segment_type))| segment_type.is_word_like())
183/// .map(|((i, _), (j, _))| &text[i..j])
184/// .collect();
185/// assert_eq!(&words, &["Mark’d", "ye", "his", "words"]);
186/// ```
187#[derive(Debug)]
188pub struct WordSegmenter {
189 payload: DataPayload<SegmenterBreakWordV1>,
190 complex: ComplexPayloads,
191 payload_locale_override: Option<DataPayload<SegmenterBreakWordOverrideV1>>,
192}
193
194/// Segments a string into words (borrowed version).
195///
196/// See [`WordSegmenter`] for examples.
197#[derive(Clone, Debug, Copy)]
198pub struct WordSegmenterBorrowed<'data> {
199 data: &'data RuleBreakData<'data>,
200 complex: ComplexPayloadsBorrowed<'data>,
201 locale_override: Option<&'data RuleBreakDataOverride<'data>>,
202}
203
204impl WordSegmenter {
205 /// Constructs a [`WordSegmenter`] with an invariant locale and the best available compiled data for
206 /// complex scripts (Chinese, Japanese, Khmer, Lao, Myanmar, and Thai).
207 ///
208 /// The current behavior, which is subject to change, is to use the LSTM model when available
209 /// and the dictionary model for Chinese and Japanese.
210 ///
211 /// ✨ *Enabled with the `compiled_data` and `auto` Cargo features.*
212 ///
213 /// [📚 Help choosing a constructor](icu_provider::constructors)
214 ///
215 /// # Examples
216 ///
217 /// Behavior with complex scripts:
218 ///
219 /// ```
220 /// use icu::segmenter::{options::WordBreakInvariantOptions, WordSegmenter};
221 ///
222 /// let th_str = "ทุกสองสัปดาห์";
223 /// let ja_str = "こんにちは世界";
224 ///
225 /// let segmenter =
226 /// WordSegmenter::new_auto(WordBreakInvariantOptions::default());
227 ///
228 /// let th_bps = segmenter.segment_str(th_str).collect::<Vec<_>>();
229 /// let ja_bps = segmenter.segment_str(ja_str).collect::<Vec<_>>();
230 ///
231 /// assert_eq!(th_bps, [0, 9, 18, 39]);
232 /// assert_eq!(ja_bps, [0, 15, 21]);
233 /// ```
234 #[cfg(feature = "compiled_data")]
235 #[cfg(feature = "auto")]
236 pub fn new_auto(_options: WordBreakInvariantOptions) -> WordSegmenterBorrowed<'static> {
237 WordSegmenterBorrowed {
238 data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_WORD_V1,
239 complex: ComplexPayloadsBorrowed::new_auto(),
240 locale_override: None,
241 }
242 }
243
244 #[cfg(feature = "auto")]
245 icu_provider::gen_buffer_data_constructors!(
246 (options: WordBreakOptions) -> error: DataError,
247 functions: [
248 try_new_auto,
249 try_new_auto_with_buffer_provider,
250 try_new_auto_unstable,
251 Self
252 ]
253 );
254
255 #[cfg(feature = "auto")]
256 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_auto)]
257 pub fn try_new_auto_unstable<D>(
258 provider: &D,
259 options: WordBreakOptions,
260 ) -> Result<Self, DataError>
261 where
262 D: DataProvider<SegmenterBreakWordV1>
263 + DataProvider<SegmenterBreakWordOverrideV1>
264 + DataProvider<SegmenterDictionaryAutoV1>
265 + DataProvider<SegmenterLstmAutoV1>
266 + DataProvider<SegmenterBreakGraphemeClusterV1>
267 + ?Sized,
268 {
269 Ok(Self {
270 payload: provider.load(Default::default())?.payload,
271 complex: ComplexPayloads::try_new_auto(provider)?,
272 payload_locale_override: if let Some(locale) = options.content_locale {
273 let locale = DataLocale::from(locale);
274 let req = DataRequest {
275 id: DataIdentifierBorrowed::for_locale(&locale),
276 metadata: {
277 let mut metadata = DataRequestMetadata::default();
278 metadata.silent = true;
279 metadata
280 },
281 };
282 provider
283 .load(req)
284 .allow_identifier_not_found()?
285 .map(|r| r.payload)
286 } else {
287 None
288 },
289 })
290 }
291
292 /// Constructs a [`WordSegmenter`] with an invariant locale and compiled LSTM data for
293 /// complex scripts (Burmese, Khmer, Lao, and Thai).
294 ///
295 /// The LSTM, or Long Term Short Memory, is a machine learning model. It is smaller than
296 /// the full dictionary but more expensive during segmentation (inference).
297 ///
298 /// Warning: there is not currently an LSTM model for Chinese or Japanese, so the [`WordSegmenter`]
299 /// created by this function will have unexpected behavior in spans of those scripts.
300 ///
301 /// ✨ *Enabled with the `compiled_data` and `lstm` Cargo features.*
302 ///
303 /// [📚 Help choosing a constructor](icu_provider::constructors)
304 ///
305 /// # Examples
306 ///
307 /// Behavior with complex scripts:
308 ///
309 /// ```
310 /// use icu::segmenter::{options::WordBreakInvariantOptions, WordSegmenter};
311 ///
312 /// let th_str = "ทุกสองสัปดาห์";
313 /// let ja_str = "こんにちは世界";
314 ///
315 /// let segmenter =
316 /// WordSegmenter::new_lstm(WordBreakInvariantOptions::default());
317 ///
318 /// let th_bps = segmenter.segment_str(th_str).collect::<Vec<_>>();
319 /// let ja_bps = segmenter.segment_str(ja_str).collect::<Vec<_>>();
320 ///
321 /// assert_eq!(th_bps, [0, 9, 18, 39]);
322 ///
323 /// // Note: We aren't able to find a suitable breakpoint in Chinese/Japanese.
324 /// assert_eq!(ja_bps, [0, 21]);
325 /// ```
326 #[cfg(feature = "compiled_data")]
327 #[cfg(feature = "lstm")]
328 pub fn new_lstm(_options: WordBreakInvariantOptions) -> WordSegmenterBorrowed<'static> {
329 WordSegmenterBorrowed {
330 data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_WORD_V1,
331 complex: ComplexPayloadsBorrowed::new_lstm(),
332 locale_override: None,
333 }
334 }
335
336 #[cfg(feature = "lstm")]
337 icu_provider::gen_buffer_data_constructors!(
338 (options: WordBreakOptions) -> error: DataError,
339 functions: [
340 try_new_lstm,
341 try_new_lstm_with_buffer_provider,
342 try_new_lstm_unstable,
343 Self
344 ]
345 );
346
347 #[cfg(feature = "lstm")]
348 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_lstm)]
349 pub fn try_new_lstm_unstable<D>(
350 provider: &D,
351 options: WordBreakOptions,
352 ) -> Result<Self, DataError>
353 where
354 D: DataProvider<SegmenterBreakWordV1>
355 + DataProvider<SegmenterBreakWordOverrideV1>
356 + DataProvider<SegmenterLstmAutoV1>
357 + DataProvider<SegmenterBreakGraphemeClusterV1>
358 + ?Sized,
359 {
360 Ok(Self {
361 payload: provider.load(Default::default())?.payload,
362 complex: ComplexPayloads::try_new_lstm(provider)?,
363 payload_locale_override: if let Some(locale) = options.content_locale {
364 let locale = DataLocale::from(locale);
365 let req = DataRequest {
366 id: DataIdentifierBorrowed::for_locale(&locale),
367 metadata: {
368 let mut metadata = DataRequestMetadata::default();
369 metadata.silent = true;
370 metadata
371 },
372 };
373 provider
374 .load(req)
375 .allow_identifier_not_found()?
376 .map(|r| r.payload)
377 } else {
378 None
379 },
380 })
381 }
382
383 /// Construct a [`WordSegmenter`] with an invariant locale and compiled dictionary data for
384 /// complex scripts (Chinese, Japanese, Khmer, Lao, Myanmar, and Thai).
385 ///
386 /// The dictionary model uses a list of words to determine appropriate breakpoints. It is
387 /// faster than the LSTM model but requires more data.
388 ///
389 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
390 ///
391 /// [📚 Help choosing a constructor](icu_provider::constructors)
392 ///
393 /// # Examples
394 ///
395 /// Behavior with complex scripts:
396 ///
397 /// ```
398 /// use icu::segmenter::{options::WordBreakInvariantOptions, WordSegmenter};
399 ///
400 /// let th_str = "ทุกสองสัปดาห์";
401 /// let ja_str = "こんにちは世界";
402 ///
403 /// let segmenter =
404 /// WordSegmenter::new_dictionary(WordBreakInvariantOptions::default());
405 ///
406 /// let th_bps = segmenter.segment_str(th_str).collect::<Vec<_>>();
407 /// let ja_bps = segmenter.segment_str(ja_str).collect::<Vec<_>>();
408 ///
409 /// assert_eq!(th_bps, [0, 9, 18, 39]);
410 /// assert_eq!(ja_bps, [0, 15, 21]);
411 /// ```
412 #[cfg(feature = "compiled_data")]
413 pub fn new_dictionary(_options: WordBreakInvariantOptions) -> WordSegmenterBorrowed<'static> {
414 WordSegmenterBorrowed {
415 data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_WORD_V1,
416 complex: ComplexPayloadsBorrowed::new_dict(),
417 locale_override: None,
418 }
419 }
420
421 icu_provider::gen_buffer_data_constructors!(
422 (options: WordBreakOptions) -> error: DataError,
423 functions: [
424 try_new_dictionary,
425 try_new_dictionary_with_buffer_provider,
426 try_new_dictionary_unstable,
427 Self
428 ]
429 );
430
431 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary)]
432 pub fn try_new_dictionary_unstable<D>(
433 provider: &D,
434 options: WordBreakOptions,
435 ) -> Result<Self, DataError>
436 where
437 D: DataProvider<SegmenterBreakWordV1>
438 + DataProvider<SegmenterBreakWordOverrideV1>
439 + DataProvider<SegmenterDictionaryAutoV1>
440 + DataProvider<SegmenterDictionaryExtendedV1>
441 + DataProvider<SegmenterBreakGraphemeClusterV1>
442 + ?Sized,
443 {
444 Ok(Self {
445 payload: provider.load(Default::default())?.payload,
446 complex: ComplexPayloads::try_new_dict(provider)?,
447 payload_locale_override: if let Some(locale) = options.content_locale {
448 let locale = DataLocale::from(locale);
449 let req = DataRequest {
450 id: DataIdentifierBorrowed::for_locale(&locale),
451 metadata: {
452 let mut metadata = DataRequestMetadata::default();
453 metadata.silent = true;
454 metadata
455 },
456 };
457 provider
458 .load(req)
459 .allow_identifier_not_found()?
460 .map(|r| r.payload)
461 } else {
462 None
463 },
464 })
465 }
466 /// Constructs a borrowed version of this type for more efficient querying.
467 ///
468 /// Most useful methods for segmentation are on this type.
469 pub fn as_borrowed(&self) -> WordSegmenterBorrowed<'_> {
470 WordSegmenterBorrowed {
471 data: self.payload.get(),
472 complex: self.complex.as_borrowed(),
473 locale_override: self.payload_locale_override.as_ref().map(|p| p.get()),
474 }
475 }
476}
477
478impl<'data> WordSegmenterBorrowed<'data> {
479 /// Creates a word break iterator for an `str` (a UTF-8 string).
480 ///
481 /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
482 pub fn segment_str<'s>(self, input: &'s str) -> WordBreakIterator<'data, 's, Utf8> {
483 WordBreakIterator(RuleBreakIterator {
484 iter: input.char_indices(),
485 len: input.len(),
486 current_pos_data: None,
487 result_cache: Vec::new(),
488 data: self.data,
489 complex: Some(self.complex),
490 boundary_property: 0,
491 locale_override: self.locale_override,
492 handle_complex_language: Utf8::word_handle_complex_language,
493 })
494 }
495
496 /// Creates a word break iterator for a potentially ill-formed UTF8 string
497 ///
498 /// Invalid characters are treated as REPLACEMENT CHARACTER
499 ///
500 /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
501 pub fn segment_utf8<'s>(
502 self,
503 input: &'s [u8],
504 ) -> WordBreakIterator<'data, 's, PotentiallyIllFormedUtf8> {
505 WordBreakIterator(RuleBreakIterator {
506 iter: Utf8CharIndices::new(input),
507 len: input.len(),
508 current_pos_data: None,
509 result_cache: Vec::new(),
510 data: self.data,
511 complex: Some(self.complex),
512 boundary_property: 0,
513 locale_override: self.locale_override,
514 handle_complex_language: PotentiallyIllFormedUtf8::word_handle_complex_language,
515 })
516 }
517
518 /// Creates a word break iterator for a Latin-1 (8-bit) string.
519 ///
520 /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
521 pub fn segment_latin1<'s>(self, input: &'s [u8]) -> WordBreakIterator<'data, 's, Latin1> {
522 WordBreakIterator(RuleBreakIterator {
523 iter: Latin1Indices::new(input),
524 len: input.len(),
525 current_pos_data: None,
526 result_cache: Vec::new(),
527 data: self.data,
528 complex: Some(self.complex),
529 boundary_property: 0,
530 locale_override: self.locale_override,
531 handle_complex_language: Latin1::word_handle_complex_language,
532 })
533 }
534
535 /// Creates a word break iterator for a UTF-16 string.
536 ///
537 /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
538 pub fn segment_utf16<'s>(self, input: &'s [u16]) -> WordBreakIterator<'data, 's, Utf16> {
539 WordBreakIterator(RuleBreakIterator {
540 iter: Utf16Indices::new(input),
541 len: input.len(),
542 current_pos_data: None,
543 result_cache: Vec::new(),
544 data: self.data,
545 complex: Some(self.complex),
546 boundary_property: 0,
547 locale_override: self.locale_override,
548 handle_complex_language: Utf16::word_handle_complex_language,
549 })
550 }
551}
552
553impl WordSegmenterBorrowed<'static> {
554 /// Cheaply converts a [`WordSegmenterBorrowed<'static>`] into a [`WordSegmenter`].
555 ///
556 /// Note: Due to branching and indirection, using [`WordSegmenter`] might inhibit some
557 /// compile-time optimizations that are possible with [`WordSegmenterBorrowed`].
558 pub fn static_to_owned(self) -> WordSegmenter {
559 let payload_locale_override = self.locale_override.map(DataPayload::from_static_ref);
560 WordSegmenter {
561 payload: DataPayload::from_static_ref(self.data),
562 complex: self.complex.static_to_owned(),
563 payload_locale_override,
564 }
565 }
566}
567
568/// A trait allowing for [`WordBreakIterator`] to be generalized to multiple string iteration methods.
569///
570/// This is implemented by ICU4X for several common string types.
571///
572/// <div class="stab unstable">
573/// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
574/// trait, please consider using a type from the implementors listed below.
575/// </div>
576pub trait WordBreakType: crate::private::Sealed + Sized + RuleBreakType {
577 #[doc(hidden)]
578 fn word_handle_complex_language(
579 iterator: &mut RuleBreakIterator<'_, '_, Self>,
580 left_codepoint: Self::CharType,
581 ) -> Option<usize>;
582}
583
584impl WordBreakType for Utf8 {
585 fn word_handle_complex_language(
586 iter: &mut RuleBreakIterator<'_, '_, Self>,
587 left_codepoint: Self::CharType,
588 ) -> Option<usize> {
589 handle_complex_language_utf8(iter, left_codepoint)
590 }
591}
592
593impl WordBreakType for PotentiallyIllFormedUtf8 {
594 fn word_handle_complex_language(
595 iter: &mut RuleBreakIterator<'_, '_, Self>,
596 left_codepoint: Self::CharType,
597 ) -> Option<usize> {
598 handle_complex_language_utf8(iter, left_codepoint)
599 }
600}
601
602impl WordBreakType for Latin1 {
603 fn word_handle_complex_language(
604 _iter: &mut RuleBreakIterator<'_, '_, Self>,
605 _left_codepoint: Self::CharType,
606 ) -> Option<usize> {
607 debug_assert!(
608 false,
609 "latin-1 text should never need complex language handling"
610 );
611 None
612 }
613}
614
615/// handle_complex_language impl for UTF8 iterators
616fn handle_complex_language_utf8<T>(
617 iter: &mut RuleBreakIterator<'_, '_, T>,
618 left_codepoint: T::CharType,
619) -> Option<usize>
620where
621 T: RuleBreakType<CharType = char>,
622{
623 // word segmenter doesn't define break rules for some languages such as Thai.
624 let start_iter = iter.iter.clone();
625 let start_point = iter.current_pos_data;
626 let mut s = String::new();
627 s.push(left_codepoint);
628 loop {
629 debug_assert!(!iter.is_eof());
630 s.push(iter.get_current_codepoint()?);
631 iter.advance_iter();
632 if let Some(current_break_property) = iter.get_current_break_property() {
633 if current_break_property != iter.data.complex_property {
634 break;
635 }
636 } else {
637 // EOF
638 break;
639 }
640 }
641
642 // Restore iterator to move to head of complex string
643 iter.iter = start_iter;
644 iter.current_pos_data = start_point;
645 #[allow(clippy::unwrap_used)] // iter.complex present for word segmenter
646 let breaks = iter.complex.unwrap().complex_language_segment_str(&s);
647 iter.result_cache = breaks;
648 let first_pos = *iter.result_cache.first()?;
649 let mut i = left_codepoint.len_utf8();
650 loop {
651 if i == first_pos {
652 // Re-calculate breaking offset
653 iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
654 return iter.get_current_position();
655 }
656 debug_assert!(
657 i < first_pos,
658 "we should always arrive at first_pos: near index {:?}",
659 iter.get_current_position()
660 );
661 i += iter.get_current_codepoint().map_or(0, T::char_len);
662 iter.advance_iter();
663 if iter.is_eof() {
664 iter.result_cache.clear();
665 return Some(iter.len);
666 }
667 }
668}
669
670impl WordBreakType for Utf16 {
671 fn word_handle_complex_language(
672 iter: &mut RuleBreakIterator<Self>,
673 left_codepoint: Self::CharType,
674 ) -> Option<usize> {
675 // word segmenter doesn't define break rules for some languages such as Thai.
676 let start_iter = iter.iter.clone();
677 let start_point = iter.current_pos_data;
678 let mut s = vec![left_codepoint as u16];
679 loop {
680 debug_assert!(!iter.is_eof());
681 s.push(iter.get_current_codepoint()? as u16);
682 iter.advance_iter();
683 if let Some(current_break_property) = iter.get_current_break_property() {
684 if current_break_property != iter.data.complex_property {
685 break;
686 }
687 } else {
688 // EOF
689 break;
690 }
691 }
692
693 // Restore iterator to move to head of complex string
694 iter.iter = start_iter;
695 iter.current_pos_data = start_point;
696 #[allow(clippy::unwrap_used)] // iter.complex present for word segmenter
697 let breaks = iter.complex.unwrap().complex_language_segment_utf16(&s);
698 iter.result_cache = breaks;
699 // result_cache vector is utf-16 index that is in BMP.
700 let first_pos = *iter.result_cache.first()?;
701 let mut i = 1;
702 loop {
703 if i == first_pos {
704 // Re-calculate breaking offset
705 iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
706 return iter.get_current_position();
707 }
708 debug_assert!(
709 i < first_pos,
710 "we should always arrive at first_pos: near index {:?}",
711 iter.get_current_position()
712 );
713 i += 1;
714 iter.advance_iter();
715 if iter.is_eof() {
716 iter.result_cache.clear();
717 return Some(iter.len);
718 }
719 }
720 }
721}
722
// `WordSegmenter::new_auto` only exists with the `compiled_data` and `auto`
// features, so the test is gated on exactly those features.
#[cfg(all(test, feature = "compiled_data", feature = "auto"))]
#[test]
fn empty_string() {
    // The empty string has a single boundary, at index 0.
    let segmenter = WordSegmenter::new_auto(WordBreakInvariantOptions::default());
    let breaks: Vec<usize> = segmenter.segment_str("").collect();
    assert_eq!(breaks, [0]);
}