Struct WordSegmenter

Source

pub struct WordSegmenter { /* private fields */ }

Expand description

Supports loading word break data, and creating word break iterators for different string encodings.

Most segmentation methods live on WordSegmenterBorrowed, which can be obtained via WordSegmenter::new_auto() (or one of the other new_* constructors) or WordSegmenter::as_borrowed().

§Content Locale

You can optionally provide a content locale to the WordSegmenter constructor. If you have information on the language of the text being segmented, providing this hint can produce higher-quality results.

If you have a content locale, use WordBreakOptions and a constructor beginning with try_new. If you do not have a content locale, use WordBreakInvariantOptions and a constructor beginning with new.

§Examples

Segment a string:

use icu::segmenter::WordSegmenter;

let segmenter = WordSegmenter::new_auto(Default::default());

let breakpoints: Vec<usize> =
    segmenter.segment_str("Hello World").collect();
assert_eq!(&breakpoints, &[0, 5, 6, 11]);

Segment a Latin1 byte string with a content locale:

use icu::locale::langid;
use icu::segmenter::options::WordBreakOptions;
use icu::segmenter::WordSegmenter;

let mut options = WordBreakOptions::default();
let langid = &langid!("en");
options.content_locale = Some(langid);
let segmenter = WordSegmenter::try_new_auto(options).unwrap();

let breakpoints: Vec<usize> = segmenter
    .as_borrowed()
    .segment_latin1(b"Hello World")
    .collect();
assert_eq!(&breakpoints, &[0, 5, 6, 11]);

Successive boundaries can be used to retrieve the segments. In particular, the first boundary is always 0, and the last one is the length of the segmented text in code units.

use itertools::Itertools;
let text = "Mark’d ye his words?";
let segments: Vec<&str> = segmenter
    .segment_str(text)
    .tuple_windows()
    .map(|(i, j)| &text[i..j])
    .collect();
assert_eq!(
    &segments,
    &["Mark’d", " ", "ye", " ", "his", " ", "words", "?"]
);

Not all segments delimited by word boundaries are words; some are interword segments such as spaces and punctuation. The WordBreakIterator::word_type() of a boundary can be used to classify the preceding segment; WordBreakIterator::iter_with_word_type() associates each boundary with its status.

let words: Vec<&str> = segmenter
    .segment_str(text)
    .iter_with_word_type()
    .tuple_windows()
    .filter(|(_, (_, segment_type))| segment_type.is_word_like())
    .map(|((i, _), (j, _))| &text[i..j])
    .collect();
assert_eq!(&words, &["Mark’d", "ye", "his", "words"]);

Struct WordSegmenter Copy item path

§Content Locale

§Examples

Implementations§

impl WordSegmenter

pub fn new_auto( _options: WordBreakInvariantOptions, ) -> WordSegmenterBorrowed<'static>

§Examples

pub fn try_new_auto(options: WordBreakOptions<'_>) -> Result<Self, DataError>

pub fn try_new_auto_with_buffer_provider( provider: &(impl BufferProvider + ?Sized), options: WordBreakOptions<'_>, ) -> Result<Self, DataError>

pub fn new_lstm( _options: WordBreakInvariantOptions, ) -> WordSegmenterBorrowed<'static>

§Examples

pub fn try_new_lstm(options: WordBreakOptions<'_>) -> Result<Self, DataError>

pub fn try_new_lstm_with_buffer_provider( provider: &(impl BufferProvider + ?Sized), options: WordBreakOptions<'_>, ) -> Result<Self, DataError>

pub fn try_new_lstm_unstable<D>( provider: &D, options: WordBreakOptions<'_>, ) -> Result<Self, DataError> where D: DataProvider<SegmenterBreakWordV1> + DataProvider<SegmenterBreakWordOverrideV1> + DataProvider<SegmenterLstmAutoV1> + DataProvider<SegmenterBreakGraphemeClusterV1> + ?Sized,

pub fn new_dictionary( _options: WordBreakInvariantOptions, ) -> WordSegmenterBorrowed<'static>

§Examples

pub fn try_new_dictionary( options: WordBreakOptions<'_>, ) -> Result<Self, DataError>

pub fn try_new_dictionary_with_buffer_provider( provider: &(impl BufferProvider + ?Sized), options: WordBreakOptions<'_>, ) -> Result<Self, DataError>

pub const fn new_for_non_complex_scripts( _options: WordBreakInvariantOptions, ) -> WordSegmenterBorrowed<'static>

pub fn try_new_for_non_complex_scripts( options: WordBreakOptions<'_>, ) -> Result<Self, DataError>

pub fn try_new_for_non_complex_scripts_with_buffer_provider( provider: &(impl BufferProvider + ?Sized), options: WordBreakOptions<'_>, ) -> Result<Self, DataError>

pub fn try_new_for_non_complex_scripts_unstable<D>( provider: &D, options: WordBreakOptions<'_>, ) -> Result<Self, DataError> where D: DataProvider<SegmenterBreakWordV1> + DataProvider<SegmenterBreakWordOverrideV1> + DataProvider<SegmenterBreakGraphemeClusterV1> + ?Sized,

pub fn as_borrowed(&self) -> WordSegmenterBorrowed<'_>

Trait Implementations§

impl Debug for WordSegmenter

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Auto Trait Implementations§

impl Freeze for WordSegmenter

impl RefUnwindSafe for WordSegmenter

impl !Send for WordSegmenter

impl !Sync for WordSegmenter

impl Unpin for WordSegmenter

impl UnwindSafe for WordSegmenter

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> From<T> for T

fn from(t: T) -> T

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self> where F: FnOnce(&Self) -> bool,

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

impl<T> ErasedDestructor for T where T: 'static,

Struct WordSegmenter

pub fn try_new_lstm_unstable<D>( provider: &D, options: WordBreakOptions<'_>, ) -> Result<Self, DataError>
where D: DataProvider<SegmenterBreakWordV1> + DataProvider<SegmenterBreakWordOverrideV1> + DataProvider<SegmenterLstmAutoV1> + DataProvider<SegmenterBreakGraphemeClusterV1> + ?Sized,

pub fn try_new_for_non_complex_scripts_unstable<D>( provider: &D, options: WordBreakOptions<'_>, ) -> Result<Self, DataError>
where D: DataProvider<SegmenterBreakWordV1> + DataProvider<SegmenterBreakWordOverrideV1> + DataProvider<SegmenterBreakGraphemeClusterV1> + ?Sized,

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T, U> Into<U> for T
where U: From<T>,

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

impl<T> ErasedDestructor for T
where T: 'static,