icu_segmenter/word.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::complex::*;
6use crate::indices::{Latin1Indices, Utf16Indices};
7use crate::iterator_helpers::derive_usize_iterator_with_type;
8use crate::provider::*;
9use crate::rule_segmenter::*;
10use alloc::string::String;
11use alloc::vec;
12use alloc::vec::Vec;
13use icu_locale_core::LanguageIdentifier;
14use icu_provider::prelude::*;
15use utf8_iter::Utf8CharIndices;
16
17/// Options to tailor word breaking behavior.
18#[non_exhaustive]
19#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
20pub struct WordBreakOptions<'a> {
21 /// Content locale for word segmenter
22 ///
23 /// If you know the language of the text being segmented, provide it here in order to produce
24 /// higher quality breakpoints.
25 ///
26 /// # Examples
27 ///
28 /// Normally, a colon character ':' is a word separator:
29 ///
30 /// ```rust
31 /// use icu::segmenter::WordSegmenter;
32 ///
33 /// let segmenter = WordSegmenter::new_auto(Default::default());
34 ///
35 /// let breakpoints: Vec<usize> = segmenter.segment_str("EU:ssa").collect();
36 /// assert_eq!(&breakpoints, &[0, 2, 3, 6]);
37 /// ```
38 ///
39 /// But not in Finnish, where it is used for loanwords:
40 ///
41 /// ```rust
42 /// use icu::locale::langid;
43 /// use icu::segmenter::options::WordBreakOptions;
44 /// use icu::segmenter::WordSegmenter;
45 ///
46 /// let mut options = WordBreakOptions::default();
47 /// let langid = &langid!("fi");
48 /// options.content_locale = Some(langid);
49 /// let segmenter = WordSegmenter::try_new_auto(options).unwrap();
50 ///
51 /// let breakpoints: Vec<usize> =
52 /// segmenter.as_borrowed().segment_str("EU:ssa").collect();
53 /// assert_eq!(&breakpoints, &[0, 6]);
54 /// ```
55 pub content_locale: Option<&'a LanguageIdentifier>,
56 /// Options independent of the locale
57 pub invariant_options: WordBreakInvariantOptions,
58}
59
60impl WordBreakOptions<'_> {
61 /// `const` version of [`Default::default`]
62 pub const fn default() -> Self {
63 Self {
64 content_locale: None,
65 invariant_options: WordBreakInvariantOptions::default(),
66 }
67 }
68}
69
/// Options to tailor word breaking behavior that do not depend on a locale.
///
/// The struct has no fields at the moment, but it is `#[non_exhaustive]` so
/// that fields can be added in the future.
#[non_exhaustive]
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct WordBreakInvariantOptions {}
76
77impl WordBreakInvariantOptions {
78 /// `const` version of [`Default::default`]
79 pub const fn default() -> Self {
80 Self {}
81 }
82}
83
84/// Implements the [`Iterator`] trait over the word boundaries of the given string.
85///
86/// Lifetimes:
87///
88/// - `'l` = lifetime of the segmenter object from which this iterator was created
89/// - `'s` = lifetime of the string being segmented
90///
91/// The [`Iterator::Item`] is an [`usize`] representing index of a code unit
92/// _after_ the boundary (for a boundary at the end of text, this index is the length
93/// of the [`str`] or array of code units).
94///
95/// For examples of use, see [`WordSegmenter`].
96#[derive(Debug)]
97pub struct WordBreakIterator<'data, 's, Y: RuleBreakType>(RuleBreakIterator<'data, 's, Y>);
98
99derive_usize_iterator_with_type!(WordBreakIterator, 'data);
100
101/// Hide ULE type
102pub(crate) mod inner {
103 /// The word type tag that is returned by [`WordBreakIterator::word_type()`].
104 ///
105 /// [`WordBreakIterator::word_type()`]: super::WordBreakIterator::word_type
106 #[non_exhaustive]
107 #[derive(Copy, Clone, PartialEq, Debug)]
108 #[repr(u8)]
109 #[zerovec::make_ule(WordTypeULE)]
110 pub enum WordType {
111 /// No category tag.
112 None = 0,
113 /// Number category tag.
114 Number = 1,
115 /// Letter category tag, including CJK.
116 Letter = 2,
117 }
118}
119
120pub use inner::WordType;
121
122impl WordType {
123 /// Whether the segment is word-like; word-like segments include numbers, as
124 /// well as segments made up of letters (including CJKV ideographs).
125 pub fn is_word_like(&self) -> bool {
126 self != &WordType::None
127 }
128}
129
130impl<'data, 's, Y: RuleBreakType> WordBreakIterator<'data, 's, Y> {
131 /// Returns the word type of the segment preceding the current boundary.
132 #[inline]
133 pub fn word_type(&self) -> WordType {
134 self.0.word_type()
135 }
136
137 /// Returns an iterator over pairs of boundary position and word type.
138 pub fn iter_with_word_type(self) -> WordBreakIteratorWithWordType<'data, 's, Y> {
139 WordBreakIteratorWithWordType(self)
140 }
141
142 /// Returns `true` when the segment preceding the current boundary is word-like,
143 /// such as letters, numbers, or CJKV ideographs.
144 #[inline]
145 pub fn is_word_like(&self) -> bool {
146 self.word_type().is_word_like()
147 }
148}
149
150/// Word break iterator that also returns the word type
151// We can use impl Trait here once `use<..>` syntax is available, see https://github.com/rust-lang/rust/issues/61756
152#[derive(Debug)]
153pub struct WordBreakIteratorWithWordType<'data, 's, Y: RuleBreakType>(
154 WordBreakIterator<'data, 's, Y>,
155);
156
157impl<Y: RuleBreakType> Iterator for WordBreakIteratorWithWordType<'_, '_, Y> {
158 type Item = (usize, WordType);
159 fn next(&mut self) -> Option<Self::Item> {
160 let ret = self.0.next()?;
161 Some((ret, self.0 .0.word_type()))
162 }
163}
164
165/// Supports loading word break data, and creating word break iterators for different string
166/// encodings.
167///
168/// Most segmentation methods live on [`WordSegmenterBorrowed`], which can be obtained via
169/// [`WordSegmenter::new_auto()`] (etc) or [`WordSegmenter::as_borrowed()`].
170///
171/// # Content Locale
172///
173/// You can optionally provide a _content locale_ to the [`WordSegmenter`] constructor. If you
174/// have information on the language of the text being segmented, providing this hint can
175/// produce higher-quality results.
176///
177/// If you have a content locale, use [`WordBreakOptions`] and a constructor begining with `new`.
178/// If you do not have a content locale use [`WordBreakInvariantOptions`] and a constructor
179/// beginning with `try_new`.
180///
181/// # Examples
182///
183/// Segment a string:
184///
185/// ```rust
186/// use icu::segmenter::WordSegmenter;
187///
188/// let segmenter = WordSegmenter::new_auto(Default::default());
189///
190/// let breakpoints: Vec<usize> =
191/// segmenter.segment_str("Hello World").collect();
192/// assert_eq!(&breakpoints, &[0, 5, 6, 11]);
193/// ```
194///
195/// Segment a Latin1 byte string with a content locale:
196///
197/// ```rust
198/// use icu::locale::langid;
199/// use icu::segmenter::options::WordBreakOptions;
200/// use icu::segmenter::WordSegmenter;
201///
202/// let mut options = WordBreakOptions::default();
203/// let langid = &langid!("en");
204/// options.content_locale = Some(langid);
205/// let segmenter = WordSegmenter::try_new_auto(options).unwrap();
206///
207/// let breakpoints: Vec<usize> = segmenter
208/// .as_borrowed()
209/// .segment_latin1(b"Hello World")
210/// .collect();
211/// assert_eq!(&breakpoints, &[0, 5, 6, 11]);
212/// ```
213///
214/// Successive boundaries can be used to retrieve the segments.
215/// In particular, the first boundary is always 0, and the last one is the
216/// length of the segmented text in code units.
217///
218/// ```rust
219/// # use icu::segmenter::{WordSegmenter, options::WordBreakInvariantOptions};
220/// # let segmenter = WordSegmenter::new_auto(WordBreakInvariantOptions::default());
221/// use itertools::Itertools;
222/// let text = "Mark’d ye his words?";
223/// let segments: Vec<&str> = segmenter
224/// .segment_str(text)
225/// .tuple_windows()
226/// .map(|(i, j)| &text[i..j])
227/// .collect();
228/// assert_eq!(
229/// &segments,
230/// &["Mark’d", " ", "ye", " ", "his", " ", "words", "?"]
231/// );
232/// ```
233///
234/// Not all segments delimited by word boundaries are words; some are interword
235/// segments such as spaces and punctuation.
236/// The [`WordBreakIterator::word_type()`] of a boundary can be used to
237/// classify the preceding segment; [`WordBreakIterator::iter_with_word_type()`]
238/// associates each boundary with its status.
239/// ```rust
240/// # use itertools::Itertools;
241/// # use icu::segmenter::WordSegmenter;
242/// # use icu::segmenter::options::{WordType, WordBreakInvariantOptions};
243/// # let segmenter = WordSegmenter::new_auto(WordBreakInvariantOptions::default());
244/// # let text = "Mark’d ye his words?";
245/// let words: Vec<&str> = segmenter
246/// .segment_str(text)
247/// .iter_with_word_type()
248/// .tuple_windows()
249/// .filter(|(_, (_, segment_type))| segment_type.is_word_like())
250/// .map(|((i, _), (j, _))| &text[i..j])
251/// .collect();
252/// assert_eq!(&words, &["Mark’d", "ye", "his", "words"]);
253/// ```
254#[derive(Debug)]
255pub struct WordSegmenter {
256 payload: DataPayload<SegmenterBreakWordV1>,
257 complex: ComplexPayloads,
258 payload_locale_override: Option<DataPayload<SegmenterBreakWordOverrideV1>>,
259}
260
261/// Segments a string into words (borrowed version).
262///
263/// See [`WordSegmenter`] for examples.
264#[derive(Clone, Debug, Copy)]
265pub struct WordSegmenterBorrowed<'data> {
266 data: &'data RuleBreakData<'data>,
267 complex: ComplexPayloadsBorrowed<'data>,
268 locale_override: Option<&'data RuleBreakDataOverride<'data>>,
269}
270
271impl WordSegmenter {
272 /// Constructs a [`WordSegmenter`] with an invariant locale and the best available compiled data for
273 /// complex scripts (Chinese, Japanese, Khmer, Lao, Myanmar, and Thai).
274 ///
275 /// The current behavior, which is subject to change, is to use the LSTM model when available
276 /// and the dictionary model for Chinese and Japanese.
277 ///
278 /// ✨ *Enabled with the `compiled_data` and `auto` Cargo features.*
279 ///
280 /// [📚 Help choosing a constructor](icu_provider::constructors)
281 ///
282 /// # Examples
283 ///
284 /// Behavior with complex scripts:
285 ///
286 /// ```
287 /// use icu::segmenter::{options::WordBreakInvariantOptions, WordSegmenter};
288 ///
289 /// let th_str = "ทุกสองสัปดาห์";
290 /// let ja_str = "こんにちは世界";
291 ///
292 /// let segmenter =
293 /// WordSegmenter::new_auto(WordBreakInvariantOptions::default());
294 ///
295 /// let th_bps = segmenter.segment_str(th_str).collect::<Vec<_>>();
296 /// let ja_bps = segmenter.segment_str(ja_str).collect::<Vec<_>>();
297 ///
298 /// assert_eq!(th_bps, [0, 9, 18, 39]);
299 /// assert_eq!(ja_bps, [0, 15, 21]);
300 /// ```
301 #[cfg(feature = "compiled_data")]
302 #[cfg(feature = "auto")]
303 pub fn new_auto(_options: WordBreakInvariantOptions) -> WordSegmenterBorrowed<'static> {
304 WordSegmenterBorrowed {
305 data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_WORD_V1,
306 complex: ComplexPayloadsBorrowed::new_auto(),
307 locale_override: None,
308 }
309 }
310
311 #[cfg(feature = "auto")]
312 icu_provider::gen_buffer_data_constructors!(
313 (options: WordBreakOptions) -> error: DataError,
314 functions: [
315 try_new_auto,
316 try_new_auto_with_buffer_provider,
317 try_new_auto_unstable,
318 Self
319 ]
320 );
321
322 #[cfg(feature = "auto")]
323 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_auto)]
324 pub fn try_new_auto_unstable<D>(
325 provider: &D,
326 options: WordBreakOptions,
327 ) -> Result<Self, DataError>
328 where
329 D: DataProvider<SegmenterBreakWordV1>
330 + DataProvider<SegmenterBreakWordOverrideV1>
331 + DataProvider<SegmenterDictionaryAutoV1>
332 + DataProvider<SegmenterLstmAutoV1>
333 + DataProvider<SegmenterBreakGraphemeClusterV1>
334 + ?Sized,
335 {
336 Ok(Self {
337 payload: provider.load(Default::default())?.payload,
338 complex: ComplexPayloads::try_new_auto(provider)?,
339 payload_locale_override: if let Some(locale) = options.content_locale {
340 let locale = DataLocale::from(locale);
341 let req = DataRequest {
342 id: DataIdentifierBorrowed::for_locale(&locale),
343 metadata: {
344 let mut metadata = DataRequestMetadata::default();
345 metadata.silent = true;
346 metadata
347 },
348 };
349 provider
350 .load(req)
351 .allow_identifier_not_found()?
352 .map(|r| r.payload)
353 } else {
354 None
355 },
356 })
357 }
358
359 /// Constructs a [`WordSegmenter`] with an invariant locale and compiled LSTM data for
360 /// complex scripts (Burmese, Khmer, Lao, and Thai).
361 ///
362 /// The LSTM, or Long Term Short Memory, is a machine learning model. It is smaller than
363 /// the full dictionary but more expensive during segmentation (inference).
364 ///
365 /// Warning: there is not currently an LSTM model for Chinese or Japanese, so the [`WordSegmenter`]
366 /// created by this function will have unexpected behavior in spans of those scripts.
367 ///
368 /// ✨ *Enabled with the `compiled_data` and `lstm` Cargo features.*
369 ///
370 /// [📚 Help choosing a constructor](icu_provider::constructors)
371 ///
372 /// # Examples
373 ///
374 /// Behavior with complex scripts:
375 ///
376 /// ```
377 /// use icu::segmenter::{options::WordBreakInvariantOptions, WordSegmenter};
378 ///
379 /// let th_str = "ทุกสองสัปดาห์";
380 /// let ja_str = "こんにちは世界";
381 ///
382 /// let segmenter =
383 /// WordSegmenter::new_lstm(WordBreakInvariantOptions::default());
384 ///
385 /// let th_bps = segmenter.segment_str(th_str).collect::<Vec<_>>();
386 /// let ja_bps = segmenter.segment_str(ja_str).collect::<Vec<_>>();
387 ///
388 /// assert_eq!(th_bps, [0, 9, 18, 39]);
389 ///
390 /// // Note: We aren't able to find a suitable breakpoint in Chinese/Japanese.
391 /// assert_eq!(ja_bps, [0, 21]);
392 /// ```
393 #[cfg(feature = "compiled_data")]
394 #[cfg(feature = "lstm")]
395 pub fn new_lstm(_options: WordBreakInvariantOptions) -> WordSegmenterBorrowed<'static> {
396 WordSegmenterBorrowed {
397 data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_WORD_V1,
398 complex: ComplexPayloadsBorrowed::new_lstm(),
399 locale_override: None,
400 }
401 }
402
403 #[cfg(feature = "lstm")]
404 icu_provider::gen_buffer_data_constructors!(
405 (options: WordBreakOptions) -> error: DataError,
406 functions: [
407 try_new_lstm,
408 try_new_lstm_with_buffer_provider,
409 try_new_lstm_unstable,
410 Self
411 ]
412 );
413
414 #[cfg(feature = "lstm")]
415 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_lstm)]
416 pub fn try_new_lstm_unstable<D>(
417 provider: &D,
418 options: WordBreakOptions,
419 ) -> Result<Self, DataError>
420 where
421 D: DataProvider<SegmenterBreakWordV1>
422 + DataProvider<SegmenterBreakWordOverrideV1>
423 + DataProvider<SegmenterLstmAutoV1>
424 + DataProvider<SegmenterBreakGraphemeClusterV1>
425 + ?Sized,
426 {
427 Ok(Self {
428 payload: provider.load(Default::default())?.payload,
429 complex: ComplexPayloads::try_new_lstm(provider)?,
430 payload_locale_override: if let Some(locale) = options.content_locale {
431 let locale = DataLocale::from(locale);
432 let req = DataRequest {
433 id: DataIdentifierBorrowed::for_locale(&locale),
434 metadata: {
435 let mut metadata = DataRequestMetadata::default();
436 metadata.silent = true;
437 metadata
438 },
439 };
440 provider
441 .load(req)
442 .allow_identifier_not_found()?
443 .map(|r| r.payload)
444 } else {
445 None
446 },
447 })
448 }
449
450 /// Construct a [`WordSegmenter`] with an invariant locale and compiled dictionary data for
451 /// complex scripts (Chinese, Japanese, Khmer, Lao, Myanmar, and Thai).
452 ///
453 /// The dictionary model uses a list of words to determine appropriate breakpoints. It is
454 /// faster than the LSTM model but requires more data.
455 ///
456 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
457 ///
458 /// [📚 Help choosing a constructor](icu_provider::constructors)
459 ///
460 /// # Examples
461 ///
462 /// Behavior with complex scripts:
463 ///
464 /// ```
465 /// use icu::segmenter::{options::WordBreakInvariantOptions, WordSegmenter};
466 ///
467 /// let th_str = "ทุกสองสัปดาห์";
468 /// let ja_str = "こんにちは世界";
469 ///
470 /// let segmenter =
471 /// WordSegmenter::new_dictionary(WordBreakInvariantOptions::default());
472 ///
473 /// let th_bps = segmenter.segment_str(th_str).collect::<Vec<_>>();
474 /// let ja_bps = segmenter.segment_str(ja_str).collect::<Vec<_>>();
475 ///
476 /// assert_eq!(th_bps, [0, 9, 18, 39]);
477 /// assert_eq!(ja_bps, [0, 15, 21]);
478 /// ```
479 #[cfg(feature = "compiled_data")]
480 pub fn new_dictionary(_options: WordBreakInvariantOptions) -> WordSegmenterBorrowed<'static> {
481 WordSegmenterBorrowed {
482 data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_WORD_V1,
483 complex: ComplexPayloadsBorrowed::new_dict(),
484 locale_override: None,
485 }
486 }
487
488 icu_provider::gen_buffer_data_constructors!(
489 (options: WordBreakOptions) -> error: DataError,
490 functions: [
491 try_new_dictionary,
492 try_new_dictionary_with_buffer_provider,
493 try_new_dictionary_unstable,
494 Self
495 ]
496 );
497
498 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary)]
499 pub fn try_new_dictionary_unstable<D>(
500 provider: &D,
501 options: WordBreakOptions,
502 ) -> Result<Self, DataError>
503 where
504 D: DataProvider<SegmenterBreakWordV1>
505 + DataProvider<SegmenterBreakWordOverrideV1>
506 + DataProvider<SegmenterDictionaryAutoV1>
507 + DataProvider<SegmenterDictionaryExtendedV1>
508 + DataProvider<SegmenterBreakGraphemeClusterV1>
509 + ?Sized,
510 {
511 Ok(Self {
512 payload: provider.load(Default::default())?.payload,
513 complex: ComplexPayloads::try_new_dict(provider)?,
514 payload_locale_override: if let Some(locale) = options.content_locale {
515 let locale = DataLocale::from(locale);
516 let req = DataRequest {
517 id: DataIdentifierBorrowed::for_locale(&locale),
518 metadata: {
519 let mut metadata = DataRequestMetadata::default();
520 metadata.silent = true;
521 metadata
522 },
523 };
524 provider
525 .load(req)
526 .allow_identifier_not_found()?
527 .map(|r| r.payload)
528 } else {
529 None
530 },
531 })
532 }
533
534 /// Construct a [`WordSegmenter`] with an invariant locale and no support for
535 /// scripts requiring complex context dependent word breaks (Chinese, Japanese, Khmer, Lao, Myanmar, and Thai).
536 ///
537 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
538 ///
539 /// [📚 Help choosing a constructor](icu_provider::constructors)
540 #[cfg(feature = "compiled_data")]
541 pub const fn new_for_non_complex_scripts(
542 _options: WordBreakInvariantOptions,
543 ) -> WordSegmenterBorrowed<'static> {
544 WordSegmenterBorrowed {
545 data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_WORD_V1,
546 complex: ComplexPayloadsBorrowed::empty(),
547 locale_override: None,
548 }
549 }
550
551 icu_provider::gen_buffer_data_constructors!(
552 (options: WordBreakOptions) -> error: DataError,
553 functions: [
554 try_new_for_non_complex_scripts,
555 try_new_for_non_complex_scripts_with_buffer_provider,
556 try_new_for_non_complex_scripts_unstable,
557 Self
558 ]
559 );
560
561 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_for_non_complex_scripts)]
562 pub fn try_new_for_non_complex_scripts_unstable<D>(
563 provider: &D,
564 options: WordBreakOptions,
565 ) -> Result<Self, DataError>
566 where
567 D: DataProvider<SegmenterBreakWordV1>
568 + DataProvider<SegmenterBreakWordOverrideV1>
569 + DataProvider<SegmenterBreakGraphemeClusterV1>
570 + ?Sized,
571 {
572 Ok(Self {
573 payload: provider.load(Default::default())?.payload,
574 complex: ComplexPayloads::try_new_empty(provider)?,
575 payload_locale_override: if let Some(locale) = options.content_locale {
576 let locale = DataLocale::from(locale);
577 let req = DataRequest {
578 id: DataIdentifierBorrowed::for_locale(&locale),
579 metadata: {
580 let mut metadata = DataRequestMetadata::default();
581 metadata.silent = true;
582 metadata
583 },
584 };
585 provider
586 .load(req)
587 .allow_identifier_not_found()?
588 .map(|r| r.payload)
589 } else {
590 None
591 },
592 })
593 }
594
595 /// Constructs a borrowed version of this type for more efficient querying.
596 ///
597 /// Most useful methods for segmentation are on this type.
598 pub fn as_borrowed(&self) -> WordSegmenterBorrowed<'_> {
599 WordSegmenterBorrowed {
600 data: self.payload.get(),
601 complex: self.complex.as_borrowed(),
602 locale_override: self.payload_locale_override.as_ref().map(|p| p.get()),
603 }
604 }
605}
606
607impl<'data> WordSegmenterBorrowed<'data> {
608 /// Creates a word break iterator for an `str` (a UTF-8 string).
609 ///
610 /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
611 pub fn segment_str<'s>(self, input: &'s str) -> WordBreakIterator<'data, 's, Utf8> {
612 WordBreakIterator(RuleBreakIterator {
613 iter: input.char_indices(),
614 len: input.len(),
615 current_pos_data: None,
616 result_cache: Vec::new(),
617 data: self.data,
618 complex: Some(self.complex),
619 boundary_property: 0,
620 locale_override: self.locale_override,
621 handle_complex_language: Utf8::word_handle_complex_language,
622 })
623 }
624
625 /// Creates a word break iterator for a potentially ill-formed UTF8 string
626 ///
627 /// Invalid characters are treated as REPLACEMENT CHARACTER
628 ///
629 /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
630 pub fn segment_utf8<'s>(
631 self,
632 input: &'s [u8],
633 ) -> WordBreakIterator<'data, 's, PotentiallyIllFormedUtf8> {
634 WordBreakIterator(RuleBreakIterator {
635 iter: Utf8CharIndices::new(input),
636 len: input.len(),
637 current_pos_data: None,
638 result_cache: Vec::new(),
639 data: self.data,
640 complex: Some(self.complex),
641 boundary_property: 0,
642 locale_override: self.locale_override,
643 handle_complex_language: PotentiallyIllFormedUtf8::word_handle_complex_language,
644 })
645 }
646
647 /// Creates a word break iterator for a Latin-1 (8-bit) string.
648 ///
649 /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
650 pub fn segment_latin1<'s>(self, input: &'s [u8]) -> WordBreakIterator<'data, 's, Latin1> {
651 WordBreakIterator(RuleBreakIterator {
652 iter: Latin1Indices::new(input),
653 len: input.len(),
654 current_pos_data: None,
655 result_cache: Vec::new(),
656 data: self.data,
657 complex: Some(self.complex),
658 boundary_property: 0,
659 locale_override: self.locale_override,
660 handle_complex_language: Latin1::word_handle_complex_language,
661 })
662 }
663
664 /// Creates a word break iterator for a UTF-16 string.
665 ///
666 /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
667 pub fn segment_utf16<'s>(self, input: &'s [u16]) -> WordBreakIterator<'data, 's, Utf16> {
668 WordBreakIterator(RuleBreakIterator {
669 iter: Utf16Indices::new(input),
670 len: input.len(),
671 current_pos_data: None,
672 result_cache: Vec::new(),
673 data: self.data,
674 complex: Some(self.complex),
675 boundary_property: 0,
676 locale_override: self.locale_override,
677 handle_complex_language: Utf16::word_handle_complex_language,
678 })
679 }
680}
681
682impl WordSegmenterBorrowed<'static> {
683 /// Cheaply converts a [`WordSegmenterBorrowed<'static>`] into a [`WordSegmenter`].
684 ///
685 /// Note: Due to branching and indirection, using [`WordSegmenter`] might inhibit some
686 /// compile-time optimizations that are possible with [`WordSegmenterBorrowed`].
687 pub fn static_to_owned(self) -> WordSegmenter {
688 let payload_locale_override = self.locale_override.map(DataPayload::from_static_ref);
689 WordSegmenter {
690 payload: DataPayload::from_static_ref(self.data),
691 complex: self.complex.static_to_owned(),
692 payload_locale_override,
693 }
694 }
695}
696
697/// A trait allowing for [`WordBreakIterator`] to be generalized to multiple string iteration methods.
698///
699/// This is implemented by ICU4X for several common string types.
700///
701/// <div class="stab unstable">
702/// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
703/// trait, please consider using a type from the implementors listed below.
704/// </div>
705pub trait WordBreakType: crate::private::Sealed + Sized + RuleBreakType {
706 #[doc(hidden)]
707 fn word_handle_complex_language(
708 iterator: &mut RuleBreakIterator<'_, '_, Self>,
709 left_codepoint: Self::CharType,
710 ) -> Option<usize>;
711}
712
713impl WordBreakType for Utf8 {
714 fn word_handle_complex_language(
715 iter: &mut RuleBreakIterator<'_, '_, Self>,
716 left_codepoint: Self::CharType,
717 ) -> Option<usize> {
718 handle_complex_language_utf8(iter, left_codepoint)
719 }
720}
721
722impl WordBreakType for PotentiallyIllFormedUtf8 {
723 fn word_handle_complex_language(
724 iter: &mut RuleBreakIterator<'_, '_, Self>,
725 left_codepoint: Self::CharType,
726 ) -> Option<usize> {
727 handle_complex_language_utf8(iter, left_codepoint)
728 }
729}
730
731impl WordBreakType for Latin1 {
732 fn word_handle_complex_language(
733 _iter: &mut RuleBreakIterator<'_, '_, Self>,
734 _left_codepoint: Self::CharType,
735 ) -> Option<usize> {
736 debug_assert!(
737 false,
738 "latin-1 text should never need complex language handling"
739 );
740 None
741 }
742}
743
744/// handle_complex_language impl for UTF8 iterators
745fn handle_complex_language_utf8<T>(
746 iter: &mut RuleBreakIterator<'_, '_, T>,
747 left_codepoint: T::CharType,
748) -> Option<usize>
749where
750 T: RuleBreakType<CharType = char>,
751{
752 // word segmenter doesn't define break rules for some languages such as Thai.
753 let start_iter = iter.iter.clone();
754 let start_point = iter.current_pos_data;
755 let mut s = String::new();
756 s.push(left_codepoint);
757 loop {
758 debug_assert!(!iter.is_eof());
759 s.push(iter.get_current_codepoint()?);
760 iter.advance_iter();
761 if let Some(current_break_property) = iter.get_current_break_property() {
762 if current_break_property != iter.data.complex_property {
763 break;
764 }
765 } else {
766 // EOF
767 break;
768 }
769 }
770
771 // Restore iterator to move to head of complex string
772 iter.iter = start_iter;
773 iter.current_pos_data = start_point;
774 #[expect(clippy::unwrap_used)] // iter.complex present for word segmenter
775 let breaks = iter.complex.unwrap().complex_language_segment_str(&s);
776 iter.result_cache = breaks;
777 let first_pos = *iter.result_cache.first()?;
778 let mut i = left_codepoint.len_utf8();
779 loop {
780 if i == first_pos {
781 // Re-calculate breaking offset
782 iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
783 return iter.get_current_position();
784 }
785 debug_assert!(
786 i < first_pos,
787 "we should always arrive at first_pos: near index {:?}",
788 iter.get_current_position()
789 );
790 i += iter.get_current_codepoint().map_or(0, T::char_len);
791 iter.advance_iter();
792 if iter.is_eof() {
793 iter.result_cache.clear();
794 return Some(iter.len);
795 }
796 }
797}
798
799impl WordBreakType for Utf16 {
800 fn word_handle_complex_language(
801 iter: &mut RuleBreakIterator<Self>,
802 left_codepoint: Self::CharType,
803 ) -> Option<usize> {
804 // word segmenter doesn't define break rules for some languages such as Thai.
805 let start_iter = iter.iter.clone();
806 let start_point = iter.current_pos_data;
807 let mut s = vec![left_codepoint as u16];
808 loop {
809 debug_assert!(!iter.is_eof());
810 s.push(iter.get_current_codepoint()? as u16);
811 iter.advance_iter();
812 if let Some(current_break_property) = iter.get_current_break_property() {
813 if current_break_property != iter.data.complex_property {
814 break;
815 }
816 } else {
817 // EOF
818 break;
819 }
820 }
821
822 // Restore iterator to move to head of complex string
823 iter.iter = start_iter;
824 iter.current_pos_data = start_point;
825 #[expect(clippy::unwrap_used)] // iter.complex present for word segmenter
826 let breaks = iter.complex.unwrap().complex_language_segment_utf16(&s);
827 iter.result_cache = breaks;
828 // result_cache vector is utf-16 index that is in BMP.
829 let first_pos = *iter.result_cache.first()?;
830 let mut i = 1;
831 loop {
832 if i == first_pos {
833 // Re-calculate breaking offset
834 iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
835 return iter.get_current_position();
836 }
837 debug_assert!(
838 i < first_pos,
839 "we should always arrive at first_pos: near index {:?}",
840 iter.get_current_position()
841 );
842 i += 1;
843 iter.advance_iter();
844 if iter.is_eof() {
845 iter.result_cache.clear();
846 return Some(iter.len);
847 }
848 }
849 }
850}
851
// Segmenting the empty string must yield exactly one boundary, at offset 0.
//
// Gated on the features that `WordSegmenter::new_auto` itself requires
// (`compiled_data` and `auto`), so the test is built whenever that constructor exists.
#[cfg(all(test, feature = "compiled_data", feature = "auto"))]
#[test]
fn empty_string() {
    let segmenter = WordSegmenter::new_auto(WordBreakInvariantOptions::default());
    let breaks: Vec<usize> = segmenter.segment_str("").collect();
    assert_eq!(breaks, [0]);
}