pub struct LineSegmenter { /* private fields */ }
Supports loading line break data, and creating line break iterators for different string encodings.
Most segmentation methods live on LineSegmenterBorrowed, which can be obtained via
LineSegmenter::new_auto() (etc) or LineSegmenter::as_borrowed().
The segmenter returns mandatory breaks (as defined by definition LD7 of Unicode Standard Annex #14, Unicode Line Breaking Algorithm) as well as line break opportunities (definition LD3). It does not distinguish them. Callers requiring that distinction can check the Line_Break property of the code point preceding the break against those listed in rules LB4 and LB5, special-casing the end of text according to LB3.
For consistency with the grapheme, word, and sentence segmenters, there is always a breakpoint returned at index 0, but this breakpoint is not a meaningful line break opportunity.
use icu::segmenter::LineSegmenter;
let segmenter = LineSegmenter::new_auto(Default::default());
let text = "Summary\r\nThis annex…";
let breakpoints: Vec<usize> = segmenter.segment_str(text).collect();
// 9 and 22 are mandatory breaks, 14 is a line break opportunity.
assert_eq!(&breakpoints, &[0, 9, 14, 22]);
// There is a break opportunity between emoji, but not within the ZWJ sequence 🏳️ + U+200D (ZERO WIDTH JOINER) + 🌈.
let flag_equation = "🏳️➕🌈🟰🏳️\u{200D}🌈";
let possible_first_lines: Vec<&str> =
segmenter.segment_str(flag_equation).skip(1).map(|i| &flag_equation[..i]).collect();
assert_eq!(
&possible_first_lines,
&[
"🏳️",
"🏳️➕",
"🏳️➕🌈",
"🏳️➕🌈🟰",
"🏳️➕🌈🟰🏳️\u{200D}🌈"
]
);
§Examples
Segment a string with default options:
use icu::segmenter::LineSegmenter;
let segmenter = LineSegmenter::new_auto(Default::default());
let breakpoints: Vec<usize> =
segmenter.segment_str("Hello World").collect();
assert_eq!(&breakpoints, &[0, 6, 11]);
Segment a string with CSS option overrides:
use icu::segmenter::options::{
LineBreakOptions, LineBreakStrictness, LineBreakWordOption,
};
use icu::segmenter::LineSegmenter;
let mut options = LineBreakOptions::default();
options.strictness = Some(LineBreakStrictness::Strict);
options.word_option = Some(LineBreakWordOption::BreakAll);
options.content_locale = None;
let segmenter = LineSegmenter::new_auto(options);
let breakpoints: Vec<usize> =
segmenter.segment_str("Hello World").collect();
assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11]);
Segment a Latin1 byte string:
use icu::segmenter::LineSegmenter;
let segmenter = LineSegmenter::new_auto(Default::default());
let breakpoints: Vec<usize> =
segmenter.segment_latin1(b"Hello World").collect();
assert_eq!(&breakpoints, &[0, 6, 11]);
Separate mandatory breaks from the break opportunities:
use icu::properties::{props::LineBreak, CodePointMapData};
use icu::segmenter::LineSegmenter;
let segmenter = LineSegmenter::new_auto(Default::default());
let text = "Summary\r\nThis annex…";
let mandatory_breaks: Vec<usize> = segmenter
.segment_str(text)
.into_iter()
.filter(|&i| {
text[..i].chars().next_back().map_or(false, |c| {
matches!(
CodePointMapData::<LineBreak>::new().get(c),
LineBreak::MandatoryBreak
| LineBreak::CarriageReturn
| LineBreak::LineFeed
| LineBreak::NextLine
) || i == text.len()
})
})
.collect();
assert_eq!(&mandatory_breaks, &[9, 22]);
Implementations§
Source§impl LineSegmenter
impl LineSegmenter
Sourcepub fn new_auto(options: LineBreakOptions<'_>) -> LineSegmenterBorrowed<'static>
pub fn new_auto(options: LineBreakOptions<'_>) -> LineSegmenterBorrowed<'static>
Constructs a LineSegmenter with an invariant locale, custom LineBreakOptions, and
the best available compiled data for complex scripts (Khmer, Lao, Myanmar, and Thai).
The current behavior, which is subject to change, is to use the LSTM model when available.
See also Self::new_lstm and Self::new_dictionary.
✨ Enabled with the compiled_data and auto Cargo features.
Sourcepub fn try_new_auto_with_buffer_provider(
provider: &(impl BufferProvider + ?Sized),
options: LineBreakOptions<'_>,
) -> Result<LineSegmenter, DataError>
pub fn try_new_auto_with_buffer_provider( provider: &(impl BufferProvider + ?Sized), options: LineBreakOptions<'_>, ) -> Result<LineSegmenter, DataError>
A version of Self::new_auto that uses custom data provided by a BufferProvider.
✨ Enabled with the serde feature.
Sourcepub fn try_new_auto_unstable<D>(
provider: &D,
options: LineBreakOptions<'_>,
) -> Result<LineSegmenter, DataError>
pub fn try_new_auto_unstable<D>( provider: &D, options: LineBreakOptions<'_>, ) -> Result<LineSegmenter, DataError>
A version of Self::new_auto that uses custom data provided by a DataProvider.
Sourcepub fn new_lstm(options: LineBreakOptions<'_>) -> LineSegmenterBorrowed<'static>
pub fn new_lstm(options: LineBreakOptions<'_>) -> LineSegmenterBorrowed<'static>
Constructs a LineSegmenter with an invariant locale, custom LineBreakOptions, and
compiled LSTM data for complex scripts (Khmer, Lao, Myanmar, and Thai).
The LSTM, or Long Short-Term Memory, is a machine learning model. It is smaller than the full dictionary but more expensive during segmentation (inference).
See also Self::new_auto and Self::new_dictionary.
✨ Enabled with the compiled_data and lstm Cargo features.
Sourcepub fn try_new_lstm_with_buffer_provider(
provider: &(impl BufferProvider + ?Sized),
options: LineBreakOptions<'_>,
) -> Result<LineSegmenter, DataError>
pub fn try_new_lstm_with_buffer_provider( provider: &(impl BufferProvider + ?Sized), options: LineBreakOptions<'_>, ) -> Result<LineSegmenter, DataError>
A version of Self::new_lstm that uses custom data provided by a BufferProvider.
✨ Enabled with the serde feature.
Sourcepub fn try_new_lstm_unstable<D>(
provider: &D,
options: LineBreakOptions<'_>,
) -> Result<LineSegmenter, DataError>
pub fn try_new_lstm_unstable<D>( provider: &D, options: LineBreakOptions<'_>, ) -> Result<LineSegmenter, DataError>
A version of Self::new_lstm that uses custom data provided by a DataProvider.
Sourcepub fn new_dictionary(
options: LineBreakOptions<'_>,
) -> LineSegmenterBorrowed<'static>
pub fn new_dictionary( options: LineBreakOptions<'_>, ) -> LineSegmenterBorrowed<'static>
Constructs a LineSegmenter with an invariant locale, custom LineBreakOptions, and
compiled dictionary data for complex scripts (Khmer, Lao, Myanmar, and Thai).
The dictionary model uses a list of words to determine appropriate breakpoints. It is faster than the LSTM model but requires more data.
See also Self::new_auto and Self::new_lstm.
✨ Enabled with the compiled_data Cargo feature.
Sourcepub fn try_new_dictionary_with_buffer_provider(
provider: &(impl BufferProvider + ?Sized),
options: LineBreakOptions<'_>,
) -> Result<LineSegmenter, DataError>
pub fn try_new_dictionary_with_buffer_provider( provider: &(impl BufferProvider + ?Sized), options: LineBreakOptions<'_>, ) -> Result<LineSegmenter, DataError>
A version of Self::new_dictionary that uses custom data provided by a BufferProvider.
✨ Enabled with the serde feature.
Sourcepub fn try_new_dictionary_unstable<D>(
provider: &D,
options: LineBreakOptions<'_>,
) -> Result<LineSegmenter, DataError>
pub fn try_new_dictionary_unstable<D>( provider: &D, options: LineBreakOptions<'_>, ) -> Result<LineSegmenter, DataError>
A version of Self::new_dictionary that uses custom data provided by a DataProvider.
Sourcepub fn as_borrowed(&self) -> LineSegmenterBorrowed<'_>
pub fn as_borrowed(&self) -> LineSegmenterBorrowed<'_>
Constructs a borrowed version of this type for more efficient querying.
Most useful methods for segmentation are on this type.
Trait Implementations§
Auto Trait Implementations§
impl Freeze for LineSegmenter
impl RefUnwindSafe for LineSegmenter
impl !Send for LineSegmenter
impl !Sync for LineSegmenter
impl Unpin for LineSegmenter
impl UnwindSafe for LineSegmenter
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left is true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more