pub struct LineSegmenter { /* private fields */ }
Expand description
Supports loading line break data, and creating line break iterators for different string encodings.
Most segmentation methods live on LineSegmenterBorrowed, which can be obtained via LineSegmenter::new_auto() (etc) or LineSegmenter::as_borrowed().
The segmenter returns mandatory breaks (as defined by definition LD7 of Unicode Standard Annex #14, Unicode Line Breaking Algorithm) as well as line break opportunities (definition LD3). It does not distinguish them. Callers requiring that distinction can check the Line_Break property of the code point preceding the break against those listed in rules LB4 and LB5, special-casing the end of text according to LB3.
For consistency with the grapheme, word, and sentence segmenters, there is always a breakpoint returned at index 0, but this breakpoint is not a meaningful line break opportunity.
let text = "Summary\r\nThis annex…";
let breakpoints: Vec<usize> = segmenter.segment_str(text).collect();
// 9 and 22 are mandatory breaks, 14 is a line break opportunity.
assert_eq!(&breakpoints, &[0, 9, 14, 22]);
// There is a break opportunity between emoji, but not within the ZWJ sequence 🏳️🌈.
let flag_equation = "🏳️➕🌈🟰🏳️\u{200D}🌈";
let possible_first_lines: Vec<&str> =
segmenter.segment_str(flag_equation).skip(1).map(|i| &flag_equation[..i]).collect();
assert_eq!(
&possible_first_lines,
&[
"🏳️",
"🏳️➕",
"🏳️➕🌈",
"🏳️➕🌈🟰",
"🏳️➕🌈🟰🏳️\u{200D}🌈"
]
);
§Examples
Segment a string with default options:
use icu::segmenter::LineSegmenter;
let segmenter = LineSegmenter::new_auto(Default::default());
let breakpoints: Vec<usize> =
segmenter.segment_str("Hello World").collect();
assert_eq!(&breakpoints, &[0, 6, 11]);
Segment a string with CSS option overrides:
use icu::segmenter::options::{
LineBreakOptions, LineBreakStrictness, LineBreakWordOption,
};
use icu::segmenter::LineSegmenter;
let mut options = LineBreakOptions::default();
options.strictness = Some(LineBreakStrictness::Strict);
options.word_option = Some(LineBreakWordOption::BreakAll);
options.content_locale = None;
let segmenter = LineSegmenter::new_auto(options);
let breakpoints: Vec<usize> =
segmenter.segment_str("Hello World").collect();
assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11]);
Segment a Latin1 byte string:
use icu::segmenter::LineSegmenter;
let segmenter = LineSegmenter::new_auto(Default::default());
let breakpoints: Vec<usize> =
segmenter.segment_latin1(b"Hello World").collect();
assert_eq!(&breakpoints, &[0, 6, 11]);
Separate mandatory breaks from the break opportunities:
use icu::properties::{props::LineBreak, CodePointMapData};
use icu::segmenter::LineSegmenter;
let text = "Summary\r\nThis annex…";
let mandatory_breaks: Vec<usize> = segmenter
.segment_str(text)
.into_iter()
.filter(|&i| {
text[..i].chars().next_back().map_or(false, |c| {
matches!(
CodePointMapData::<LineBreak>::new().get(c),
LineBreak::MandatoryBreak
| LineBreak::CarriageReturn
| LineBreak::LineFeed
| LineBreak::NextLine
) || i == text.len()
})
})
.collect();
assert_eq!(&mandatory_breaks, &[9, 22]);
Implementations§
Source§impl LineSegmenter
impl LineSegmenter
Sourcepub fn new_auto(options: LineBreakOptions<'_>) -> LineSegmenterBorrowed<'static>
pub fn new_auto(options: LineBreakOptions<'_>) -> LineSegmenterBorrowed<'static>
Constructs a LineSegmenter with an invariant locale, custom LineBreakOptions, and the best available compiled data for complex scripts (Khmer, Lao, Myanmar, and Thai).
The current behavior, which is subject to change, is to use the LSTM model when available.
See also Self::new_auto
.
✨ Enabled with the compiled_data
and auto
Cargo features.
Sourcepub fn try_new_auto_with_buffer_provider(
provider: &(impl BufferProvider + ?Sized),
options: LineBreakOptions<'_>,
) -> Result<LineSegmenter, DataError>
pub fn try_new_auto_with_buffer_provider( provider: &(impl BufferProvider + ?Sized), options: LineBreakOptions<'_>, ) -> Result<LineSegmenter, DataError>
A version of Self::new_auto that uses custom data provided by a BufferProvider.
✨ Enabled with the serde
feature.
Sourcepub fn try_new_auto_unstable<D>(
provider: &D,
options: LineBreakOptions<'_>,
) -> Result<LineSegmenter, DataError>
pub fn try_new_auto_unstable<D>( provider: &D, options: LineBreakOptions<'_>, ) -> Result<LineSegmenter, DataError>
A version of Self::new_auto
that uses custom data provided by a DataProvider
.
Sourcepub fn new_lstm(options: LineBreakOptions<'_>) -> LineSegmenterBorrowed<'static>
pub fn new_lstm(options: LineBreakOptions<'_>) -> LineSegmenterBorrowed<'static>
Constructs a LineSegmenter with an invariant locale, custom LineBreakOptions, and compiled LSTM data for complex scripts (Khmer, Lao, Myanmar, and Thai).
The LSTM, or Long Short-Term Memory, is a machine learning model. It is smaller than the full dictionary but more expensive during segmentation (inference).
See also Self::new_lstm
.
✨ Enabled with the compiled_data
and lstm
Cargo features.
Sourcepub fn try_new_lstm_with_buffer_provider(
provider: &(impl BufferProvider + ?Sized),
options: LineBreakOptions<'_>,
) -> Result<LineSegmenter, DataError>
pub fn try_new_lstm_with_buffer_provider( provider: &(impl BufferProvider + ?Sized), options: LineBreakOptions<'_>, ) -> Result<LineSegmenter, DataError>
A version of Self::new_lstm that uses custom data provided by a BufferProvider.
✨ Enabled with the serde
feature.
Sourcepub fn try_new_lstm_unstable<D>(
provider: &D,
options: LineBreakOptions<'_>,
) -> Result<LineSegmenter, DataError>
pub fn try_new_lstm_unstable<D>( provider: &D, options: LineBreakOptions<'_>, ) -> Result<LineSegmenter, DataError>
A version of Self::new_lstm
that uses custom data provided by a DataProvider
.
Sourcepub fn new_dictionary(
options: LineBreakOptions<'_>,
) -> LineSegmenterBorrowed<'static>
pub fn new_dictionary( options: LineBreakOptions<'_>, ) -> LineSegmenterBorrowed<'static>
Constructs a LineSegmenter with an invariant locale, custom LineBreakOptions, and compiled dictionary data for complex scripts (Khmer, Lao, Myanmar, and Thai).
The dictionary model uses a list of words to determine appropriate breakpoints. It is faster than the LSTM model but requires more data.
See also Self::new_dictionary
.
✨ Enabled with the compiled_data
Cargo feature.
Sourcepub fn try_new_dictionary_with_buffer_provider(
provider: &(impl BufferProvider + ?Sized),
options: LineBreakOptions<'_>,
) -> Result<LineSegmenter, DataError>
pub fn try_new_dictionary_with_buffer_provider( provider: &(impl BufferProvider + ?Sized), options: LineBreakOptions<'_>, ) -> Result<LineSegmenter, DataError>
A version of Self::new_dictionary that uses custom data provided by a BufferProvider.
✨ Enabled with the serde
feature.
Sourcepub fn try_new_dictionary_unstable<D>(
provider: &D,
options: LineBreakOptions<'_>,
) -> Result<LineSegmenter, DataError>
pub fn try_new_dictionary_unstable<D>( provider: &D, options: LineBreakOptions<'_>, ) -> Result<LineSegmenter, DataError>
A version of Self::new_dictionary
that uses custom data provided by a DataProvider
.
Sourcepub fn as_borrowed(&self) -> LineSegmenterBorrowed<'_>
pub fn as_borrowed(&self) -> LineSegmenterBorrowed<'_>
Constructs a borrowed version of this type for more efficient querying.
Most useful methods for segmentation are on this type.
Trait Implementations§
Auto Trait Implementations§
impl Freeze for LineSegmenter
impl RefUnwindSafe for LineSegmenter
impl !Send for LineSegmenter
impl !Sync for LineSegmenter
impl Unpin for LineSegmenter
impl UnwindSafe for LineSegmenter
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T
where
T: ?Sized,
impl<T> BorrowMut<T> for T
where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more