icu_segmenter/
sentence.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use alloc::vec::Vec;
6use icu_locale_core::LanguageIdentifier;
7use icu_provider::prelude::*;
8
9use crate::indices::{Latin1Indices, Utf16Indices};
10use crate::iterator_helpers::derive_usize_iterator_with_type;
11use crate::provider::*;
12use crate::rule_segmenter::*;
13use utf8_iter::Utf8CharIndices;
14
15/// Options to tailor sentence breaking behavior.
16#[non_exhaustive]
17#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
18pub struct SentenceBreakOptions<'a> {
19    /// Content locale for sentence segmenter.
20    pub content_locale: Option<&'a LanguageIdentifier>,
21    /// Options independent of the locale
22    pub invariant_options: SentenceBreakInvariantOptions,
23}
24
/// Sentence breaking options that do not depend on the locale.
///
/// The struct has no fields yet; it is `#[non_exhaustive]` so that options can be
/// added in the future.
#[non_exhaustive]
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct SentenceBreakInvariantOptions {}
31
32/// Implements the [`Iterator`] trait over the sentence boundaries of the given string.
33///
34/// Lifetimes:
35///
36/// - `'data` = lifetime of the segmenter object from which this iterator was created
37/// - `'s` = lifetime of the string being segmented
38///
39/// The [`Iterator::Item`] is an [`usize`] representing index of a code unit
40/// _after_ the boundary (for a boundary at the end of text, this index is the length
41/// of the [`str`] or array of code units).
42///
43/// For examples of use, see [`SentenceSegmenter`].
44#[derive(Debug)]
45pub struct SentenceBreakIterator<'data, 's, Y: RuleBreakType>(RuleBreakIterator<'data, 's, Y>);
46
47derive_usize_iterator_with_type!(SentenceBreakIterator, 'data);
48
/// Supports loading sentence break data, and creating sentence break iterators for different string
/// encodings.
///
/// Most segmentation methods live on [`SentenceSegmenterBorrowed`], which can be obtained via
/// [`SentenceSegmenter::new()`] or [`SentenceSegmenter::as_borrowed()`].
///
/// # Examples
///
/// Segment a string:
///
/// ```rust
/// use icu::segmenter::{
///     options::SentenceBreakInvariantOptions, SentenceSegmenter,
/// };
/// let segmenter =
///     SentenceSegmenter::new(SentenceBreakInvariantOptions::default());
///
/// let breakpoints: Vec<usize> =
///     segmenter.segment_str("Hello World").collect();
/// assert_eq!(&breakpoints, &[0, 11]);
/// ```
///
/// Segment a Latin1 byte string:
///
/// ```rust
/// use icu::segmenter::{
///     options::SentenceBreakInvariantOptions, SentenceSegmenter,
/// };
/// let segmenter =
///     SentenceSegmenter::new(SentenceBreakInvariantOptions::default());
///
/// let breakpoints: Vec<usize> =
///     segmenter.segment_latin1(b"Hello World").collect();
/// assert_eq!(&breakpoints, &[0, 11]);
/// ```
///
/// Successive boundaries can be used to retrieve the sentences.
/// In particular, the first boundary is always 0, and the last one is the
/// length of the segmented text in code units.
///
/// ```rust
/// # use icu::segmenter::{SentenceSegmenter, options::SentenceBreakInvariantOptions};
/// # let segmenter = SentenceSegmenter::new(SentenceBreakInvariantOptions::default());
/// use itertools::Itertools;
/// let text = "Ceci tuera cela. Le livre tuera l’édifice.";
/// let sentences: Vec<&str> = segmenter
///     .segment_str(text)
///     .tuple_windows()
///     .map(|(i, j)| &text[i..j])
///     .collect();
/// assert_eq!(
///     &sentences,
///     &["Ceci tuera cela. ", "Le livre tuera l’édifice."]
/// );
/// ```
#[derive(Debug)]
pub struct SentenceSegmenter {
    // Base sentence break data; always loaded by the constructors.
    payload: DataPayload<SegmenterBreakSentenceV1>,
    // Locale-specific override data. `None` when no content locale was requested or
    // when the provider had no override data for the requested locale.
    payload_locale_override: Option<DataPayload<SegmenterBreakSentenceOverrideV1>>,
}
109
110/// Segments a string into sentences (borrowed version).
111///
112/// See [`SentenceSegmenter`] for examples.
113#[derive(Clone, Debug, Copy)]
114pub struct SentenceSegmenterBorrowed<'data> {
115    data: &'data RuleBreakData<'data>,
116    locale_override: Option<&'data RuleBreakDataOverride<'data>>,
117}
118
119impl SentenceSegmenter {
120    /// Constructs a [`SentenceSegmenterBorrowed`] with an invariant locale and compiled data.
121    ///
122    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
123    ///
124    /// [📚 Help choosing a constructor](icu_provider::constructors)
125    #[cfg(feature = "compiled_data")]
126    #[allow(clippy::new_ret_no_self)]
127    pub const fn new(
128        _options: SentenceBreakInvariantOptions,
129    ) -> SentenceSegmenterBorrowed<'static> {
130        SentenceSegmenterBorrowed {
131            data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_SENTENCE_V1,
132            locale_override: None,
133        }
134    }
135
136    icu_provider::gen_buffer_data_constructors!(
137        (options: SentenceBreakOptions) -> error: DataError,
138        /// Constructs a [`SentenceSegmenter`] for a given options and using compiled data.
139        functions: [
140            try_new,
141            try_new_with_buffer_provider,
142            try_new_unstable,
143            Self
144        ]
145    );
146
147    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::try_new)]
148    pub fn try_new_unstable<D>(
149        provider: &D,
150        options: SentenceBreakOptions,
151    ) -> Result<Self, DataError>
152    where
153        D: DataProvider<SegmenterBreakSentenceV1>
154            + DataProvider<SegmenterBreakSentenceOverrideV1>
155            + ?Sized,
156    {
157        let payload = provider.load(Default::default())?.payload;
158        let payload_locale_override = if let Some(locale) = options.content_locale {
159            let locale = DataLocale::from(locale);
160            let req = DataRequest {
161                id: DataIdentifierBorrowed::for_locale(&locale),
162                metadata: {
163                    let mut metadata = DataRequestMetadata::default();
164                    metadata.silent = true;
165                    metadata
166                },
167            };
168            provider
169                .load(req)
170                .allow_identifier_not_found()?
171                .map(|r| r.payload)
172        } else {
173            None
174        };
175
176        Ok(Self {
177            payload,
178            payload_locale_override,
179        })
180    }
181
182    /// Constructs a borrowed version of this type for more efficient querying.
183    ///
184    /// Most useful methods for segmentation are on this type.
185    pub fn as_borrowed(&self) -> SentenceSegmenterBorrowed<'_> {
186        SentenceSegmenterBorrowed {
187            data: self.payload.get(),
188            locale_override: self.payload_locale_override.as_ref().map(|p| p.get()),
189        }
190    }
191}
192
193impl<'data> SentenceSegmenterBorrowed<'data> {
194    /// Creates a sentence break iterator for an `str` (a UTF-8 string).
195    ///
196    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
197    pub fn segment_str<'s>(self, input: &'s str) -> SentenceBreakIterator<'data, 's, Utf8> {
198        SentenceBreakIterator(RuleBreakIterator {
199            iter: input.char_indices(),
200            len: input.len(),
201            current_pos_data: None,
202            result_cache: Vec::new(),
203            data: self.data,
204            complex: None,
205            boundary_property: 0,
206            locale_override: self.locale_override,
207            handle_complex_language: empty_handle_complex_language,
208        })
209    }
210    /// Creates a sentence break iterator for a potentially ill-formed UTF8 string
211    ///
212    /// Invalid characters are treated as REPLACEMENT CHARACTER
213    ///
214    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
215    pub fn segment_utf8<'s>(
216        self,
217        input: &'s [u8],
218    ) -> SentenceBreakIterator<'data, 's, PotentiallyIllFormedUtf8> {
219        SentenceBreakIterator(RuleBreakIterator {
220            iter: Utf8CharIndices::new(input),
221            len: input.len(),
222            current_pos_data: None,
223            result_cache: Vec::new(),
224            data: self.data,
225            complex: None,
226            boundary_property: 0,
227            locale_override: self.locale_override,
228            handle_complex_language: empty_handle_complex_language,
229        })
230    }
231    /// Creates a sentence break iterator for a Latin-1 (8-bit) string.
232    ///
233    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
234    pub fn segment_latin1<'s>(self, input: &'s [u8]) -> SentenceBreakIterator<'data, 's, Latin1> {
235        SentenceBreakIterator(RuleBreakIterator {
236            iter: Latin1Indices::new(input),
237            len: input.len(),
238            current_pos_data: None,
239            result_cache: Vec::new(),
240            data: self.data,
241            complex: None,
242            boundary_property: 0,
243            locale_override: self.locale_override,
244            handle_complex_language: empty_handle_complex_language,
245        })
246    }
247
248    /// Creates a sentence break iterator for a UTF-16 string.
249    ///
250    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
251    pub fn segment_utf16<'s>(self, input: &'s [u16]) -> SentenceBreakIterator<'data, 's, Utf16> {
252        SentenceBreakIterator(RuleBreakIterator {
253            iter: Utf16Indices::new(input),
254            len: input.len(),
255            current_pos_data: None,
256            result_cache: Vec::new(),
257            data: self.data,
258            complex: None,
259            boundary_property: 0,
260            locale_override: self.locale_override,
261            handle_complex_language: empty_handle_complex_language,
262        })
263    }
264}
265
266impl SentenceSegmenterBorrowed<'static> {
267    /// Cheaply converts a [`SentenceSegmenterBorrowed<'static>`] into a [`SentenceSegmenter`].
268    ///
269    /// Note: Due to branching and indirection, using [`SentenceSegmenter`] might inhibit some
270    /// compile-time optimizations that are possible with [`SentenceSegmenterBorrowed`].
271    pub const fn static_to_owned(self) -> SentenceSegmenter {
272        let payload_locale_override = if let Some(d) = self.locale_override {
273            Some(DataPayload::from_static_ref(d))
274        } else {
275            None
276        };
277        SentenceSegmenter {
278            payload: DataPayload::from_static_ref(self.data),
279            payload_locale_override,
280        }
281    }
282}
283
// Gated on `compiled_data` (not `serde`): `SentenceSegmenter::new` only exists when
// that feature is enabled, so any other gate either breaks the test build or
// silently skips this test.
#[cfg(all(test, feature = "compiled_data"))]
#[test]
fn empty_string() {
    // The empty string yields exactly one boundary, at index 0.
    let segmenter = SentenceSegmenter::new(Default::default());
    let breaks: Vec<usize> = segmenter.segment_str("").collect();
    assert_eq!(breaks, [0]);
}