icu_segmenter/
sentence.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use alloc::vec::Vec;
6use icu_locale_core::LanguageIdentifier;
7use icu_provider::prelude::*;
8
9use crate::indices::{Latin1Indices, Utf16Indices};
10use crate::iterator_helpers::derive_usize_iterator_with_type;
11use crate::provider::*;
12use crate::rule_segmenter::*;
13use utf8_iter::Utf8CharIndices;
14
15/// Options to tailor sentence breaking behavior.
16#[non_exhaustive]
17#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
18pub struct SentenceBreakOptions<'a> {
19    /// Content locale for sentence segmenter.
20    ///
21    /// If you know the language of the text being segmented, provide it here in order to produce
22    /// higher quality breakpoints.
23    pub content_locale: Option<&'a LanguageIdentifier>,
24    /// Options independent of the locale
25    pub invariant_options: SentenceBreakInvariantOptions,
26}
27
/// Sentence breaking options that do not depend on the locale.
///
/// There are no such options yet; the type exists so that some can be added later.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub struct SentenceBreakInvariantOptions {}
34
35/// Implements the [`Iterator`] trait over the sentence boundaries of the given string.
36///
37/// Lifetimes:
38///
39/// - `'data` = lifetime of the segmenter object from which this iterator was created
40/// - `'s` = lifetime of the string being segmented
41///
42/// The [`Iterator::Item`] is an [`usize`] representing index of a code unit
43/// _after_ the boundary (for a boundary at the end of text, this index is the length
44/// of the [`str`] or array of code units).
45///
46/// For examples of use, see [`SentenceSegmenter`].
47#[derive(Debug)]
48pub struct SentenceBreakIterator<'data, 's, Y: RuleBreakType>(RuleBreakIterator<'data, 's, Y>);
49
50derive_usize_iterator_with_type!(SentenceBreakIterator, 'data);
51
52/// Supports loading sentence break data, and creating sentence break iterators for different string
53/// encodings.
54///
55/// Most segmentation methods live on [`SentenceSegmenterBorrowed`], which can be obtained via
56/// [`SentenceSegmenter::new()`] or [`SentenceSegmenter::as_borrowed()`].
57///
58/// # Content Locale
59///
60/// You can optionally provide a _content locale_ to the [`SentenceSegmenter`] constructor. If you
61/// have information on the language of the text being segmented, providing this hint can
62/// produce higher-quality results.
63///
64/// If you have a content locale, use [`SentenceBreakOptions`] and a constructor begining with `new`.
65/// If you do not have a content locale use [`SentenceBreakInvariantOptions`] and a constructor
66/// beginning with `try_new`.
67///
68/// # Examples
69///
70/// Segment a string:
71///
72/// ```rust
73/// use icu::segmenter::SentenceSegmenter;
74///
75/// let segmenter = SentenceSegmenter::new(Default::default());
76///
77/// let breakpoints: Vec<usize> =
78///     segmenter.segment_str("Hello World").collect();
79/// assert_eq!(&breakpoints, &[0, 11]);
80/// ```
81///
82/// Segment a Latin1 byte string with a content locale:
83///
84/// ```rust
85/// use icu::locale::langid;
86/// use icu::segmenter::options::SentenceBreakOptions;
87/// use icu::segmenter::SentenceSegmenter;
88///
89/// let mut options = SentenceBreakOptions::default();
90/// let langid = &langid!("en");
91/// options.content_locale = Some(langid);
92/// let segmenter = SentenceSegmenter::try_new(options).unwrap();
93///
94/// let breakpoints: Vec<usize> = segmenter
95///     .as_borrowed()
96///     .segment_latin1(b"Hello World")
97///     .collect();
98/// assert_eq!(&breakpoints, &[0, 11]);
99/// ```
100///
101/// Successive boundaries can be used to retrieve the sentences.
102/// In particular, the first boundary is always 0, and the last one is the
103/// length of the segmented text in code units.
104///
105/// ```rust
106/// # use icu::segmenter::{SentenceSegmenter, options::SentenceBreakInvariantOptions};
107/// # let segmenter = SentenceSegmenter::new(SentenceBreakInvariantOptions::default());
108/// use itertools::Itertools;
109/// let text = "Ceci tuera cela. Le livre tuera l’édifice.";
110/// let sentences: Vec<&str> = segmenter
111///     .segment_str(text)
112///     .tuple_windows()
113///     .map(|(i, j)| &text[i..j])
114///     .collect();
115/// assert_eq!(
116///     &sentences,
117///     &["Ceci tuera cela. ", "Le livre tuera l’édifice."]
118/// );
119/// ```
120#[derive(Debug)]
121pub struct SentenceSegmenter {
122    payload: DataPayload<SegmenterBreakSentenceV1>,
123    payload_locale_override: Option<DataPayload<SegmenterBreakSentenceOverrideV1>>,
124}
125
126/// Segments a string into sentences (borrowed version).
127///
128/// See [`SentenceSegmenter`] for examples.
129#[derive(Clone, Debug, Copy)]
130pub struct SentenceSegmenterBorrowed<'data> {
131    data: &'data RuleBreakData<'data>,
132    locale_override: Option<&'data RuleBreakDataOverride<'data>>,
133}
134
135impl SentenceSegmenter {
136    /// Constructs a [`SentenceSegmenterBorrowed`] with an invariant locale and compiled data.
137    ///
138    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
139    ///
140    /// [📚 Help choosing a constructor](icu_provider::constructors)
141    #[cfg(feature = "compiled_data")]
142    #[expect(clippy::new_ret_no_self)]
143    pub const fn new(
144        _options: SentenceBreakInvariantOptions,
145    ) -> SentenceSegmenterBorrowed<'static> {
146        SentenceSegmenterBorrowed {
147            data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_SENTENCE_V1,
148            locale_override: None,
149        }
150    }
151
152    icu_provider::gen_buffer_data_constructors!(
153        (options: SentenceBreakOptions) -> error: DataError,
154        /// Constructs a [`SentenceSegmenter`] for a given options and using compiled data.
155        functions: [
156            try_new,
157            try_new_with_buffer_provider,
158            try_new_unstable,
159            Self
160        ]
161    );
162
163    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::try_new)]
164    pub fn try_new_unstable<D>(
165        provider: &D,
166        options: SentenceBreakOptions,
167    ) -> Result<Self, DataError>
168    where
169        D: DataProvider<SegmenterBreakSentenceV1>
170            + DataProvider<SegmenterBreakSentenceOverrideV1>
171            + ?Sized,
172    {
173        let payload = provider.load(Default::default())?.payload;
174        let payload_locale_override = if let Some(locale) = options.content_locale {
175            let locale = DataLocale::from(locale);
176            let req = DataRequest {
177                id: DataIdentifierBorrowed::for_locale(&locale),
178                metadata: {
179                    let mut metadata = DataRequestMetadata::default();
180                    metadata.silent = true;
181                    metadata
182                },
183            };
184            provider
185                .load(req)
186                .allow_identifier_not_found()?
187                .map(|r| r.payload)
188        } else {
189            None
190        };
191
192        Ok(Self {
193            payload,
194            payload_locale_override,
195        })
196    }
197
198    /// Constructs a borrowed version of this type for more efficient querying.
199    ///
200    /// Most useful methods for segmentation are on this type.
201    pub fn as_borrowed(&self) -> SentenceSegmenterBorrowed<'_> {
202        SentenceSegmenterBorrowed {
203            data: self.payload.get(),
204            locale_override: self.payload_locale_override.as_ref().map(|p| p.get()),
205        }
206    }
207}
208
209impl<'data> SentenceSegmenterBorrowed<'data> {
210    /// Creates a sentence break iterator for an `str` (a UTF-8 string).
211    ///
212    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
213    pub fn segment_str<'s>(self, input: &'s str) -> SentenceBreakIterator<'data, 's, Utf8> {
214        SentenceBreakIterator(RuleBreakIterator {
215            iter: input.char_indices(),
216            len: input.len(),
217            current_pos_data: None,
218            result_cache: Vec::new(),
219            data: self.data,
220            complex: None,
221            boundary_property: 0,
222            locale_override: self.locale_override,
223            handle_complex_language: empty_handle_complex_language,
224        })
225    }
226    /// Creates a sentence break iterator for a potentially ill-formed UTF8 string
227    ///
228    /// Invalid characters are treated as REPLACEMENT CHARACTER
229    ///
230    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
231    pub fn segment_utf8<'s>(
232        self,
233        input: &'s [u8],
234    ) -> SentenceBreakIterator<'data, 's, PotentiallyIllFormedUtf8> {
235        SentenceBreakIterator(RuleBreakIterator {
236            iter: Utf8CharIndices::new(input),
237            len: input.len(),
238            current_pos_data: None,
239            result_cache: Vec::new(),
240            data: self.data,
241            complex: None,
242            boundary_property: 0,
243            locale_override: self.locale_override,
244            handle_complex_language: empty_handle_complex_language,
245        })
246    }
247    /// Creates a sentence break iterator for a Latin-1 (8-bit) string.
248    ///
249    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
250    pub fn segment_latin1<'s>(self, input: &'s [u8]) -> SentenceBreakIterator<'data, 's, Latin1> {
251        SentenceBreakIterator(RuleBreakIterator {
252            iter: Latin1Indices::new(input),
253            len: input.len(),
254            current_pos_data: None,
255            result_cache: Vec::new(),
256            data: self.data,
257            complex: None,
258            boundary_property: 0,
259            locale_override: self.locale_override,
260            handle_complex_language: empty_handle_complex_language,
261        })
262    }
263
264    /// Creates a sentence break iterator for a UTF-16 string.
265    ///
266    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
267    pub fn segment_utf16<'s>(self, input: &'s [u16]) -> SentenceBreakIterator<'data, 's, Utf16> {
268        SentenceBreakIterator(RuleBreakIterator {
269            iter: Utf16Indices::new(input),
270            len: input.len(),
271            current_pos_data: None,
272            result_cache: Vec::new(),
273            data: self.data,
274            complex: None,
275            boundary_property: 0,
276            locale_override: self.locale_override,
277            handle_complex_language: empty_handle_complex_language,
278        })
279    }
280}
281
282impl SentenceSegmenterBorrowed<'static> {
283    /// Cheaply converts a [`SentenceSegmenterBorrowed<'static>`] into a [`SentenceSegmenter`].
284    ///
285    /// Note: Due to branching and indirection, using [`SentenceSegmenter`] might inhibit some
286    /// compile-time optimizations that are possible with [`SentenceSegmenterBorrowed`].
287    pub const fn static_to_owned(self) -> SentenceSegmenter {
288        let payload_locale_override = if let Some(d) = self.locale_override {
289            Some(DataPayload::from_static_ref(d))
290        } else {
291            None
292        };
293        SentenceSegmenter {
294            payload: DataPayload::from_static_ref(self.data),
295            payload_locale_override,
296        }
297    }
298}
299
// Gated on `compiled_data` because `SentenceSegmenter::new` only exists when that
// feature is enabled; nothing in this test depends on `serde`.
#[cfg(all(test, feature = "compiled_data"))]
#[test]
fn empty_string() {
    // The empty string has exactly one boundary, at index 0.
    let segmenter = SentenceSegmenter::new(Default::default());
    let breaks: Vec<usize> = segmenter.segment_str("").collect();
    assert_eq!(breaks, [0]);
}