icu_segmenter/
grapheme.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use alloc::vec::Vec;
6use icu_provider::prelude::*;
7
8use crate::indices::{Latin1Indices, Utf16Indices};
9use crate::iterator_helpers::derive_usize_iterator_with_type;
10use crate::provider::*;
11use crate::rule_segmenter::*;
12use utf8_iter::Utf8CharIndices;
13
14/// Implements the [`Iterator`] trait over the grapheme cluster boundaries of the given string.
15///
16/// Lifetimes:
17///
18/// - `'data` = lifetime of the segmenter object from which this iterator was created
19/// - `'s` = lifetime of the string being segmented
20///
21/// The [`Iterator::Item`] is an [`usize`] representing index of a code unit
22/// _after_ the boundary (for a boundary at the end of text, this index is the length
23/// of the [`str`] or array of code units).
24///
25/// For examples of use, see [`GraphemeClusterSegmenter`].
26#[derive(Debug)]
27pub struct GraphemeClusterBreakIterator<'data, 's, Y: RuleBreakType>(
28    RuleBreakIterator<'data, 's, Y>,
29);
30
31derive_usize_iterator_with_type!(GraphemeClusterBreakIterator, 'data);
32
/// Segments a string into grapheme clusters.
///
/// Supports loading grapheme cluster break data, and creating grapheme cluster break iterators for
/// different string encodings.
///
/// Most segmentation methods live on [`GraphemeClusterSegmenterBorrowed`], which can be obtained via
/// [`GraphemeClusterSegmenter::new()`] or [`GraphemeClusterSegmenter::as_borrowed()`].
///
/// # Examples
///
/// Segment a string:
///
/// ```rust
/// use icu::segmenter::GraphemeClusterSegmenter;
/// let segmenter = GraphemeClusterSegmenter::new();
///
/// let breakpoints: Vec<usize> = segmenter.segment_str("Hello 🗺").collect();
/// // World Map (U+1F5FA) is encoded in four bytes in UTF-8.
/// assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 5, 6, 10]);
/// ```
///
/// Segment a Latin1 byte string:
///
/// ```rust
/// use icu::segmenter::GraphemeClusterSegmenter;
/// let segmenter = GraphemeClusterSegmenter::new();
///
/// let breakpoints: Vec<usize> =
///     segmenter.segment_latin1(b"Hello World").collect();
/// assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]);
/// ```
///
/// Successive boundaries can be used to retrieve the grapheme clusters.
/// In particular, the first boundary is always 0, and the last one is the
/// length of the segmented text in code units.
///
/// ```rust
/// # use icu::segmenter::GraphemeClusterSegmenter;
/// # let segmenter =
/// #     GraphemeClusterSegmenter::new();
/// use itertools::Itertools;
/// let text = "मांजर";
/// let grapheme_clusters: Vec<&str> = segmenter
///     .segment_str(text)
///     .tuple_windows()
///     .map(|(i, j)| &text[i..j])
///     .collect();
/// assert_eq!(&grapheme_clusters, &["मां", "ज", "र"]);
/// ```
///
/// This segmenter applies all rules provided to the constructor.
/// Thus, if the data supplied by the provider comprises all
/// [grapheme cluster boundary rules][Rules] from Unicode Standard Annex #29,
/// _Unicode Text Segmentation_, which is the case for the default data
/// (both test data and data produced by `icu_provider_source`), the `segment_*`
/// functions return extended grapheme cluster boundaries, as opposed to
/// legacy grapheme cluster boundaries. See [_Section 3, Grapheme Cluster
/// Boundaries_][GC], and [_Table 1a, Sample Grapheme Clusters_][Sample_GC],
/// in Unicode Standard Annex #29, _Unicode Text Segmentation_.
///
/// [Rules]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
/// [GC]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
/// [Sample_GC]: https://www.unicode.org/reports/tr29/#Table_Sample_Grapheme_Clusters
///
/// The following example shows an extended grapheme cluster that is not a
/// legacy grapheme cluster:
///
/// ```rust
/// use icu::segmenter::GraphemeClusterSegmenter;
/// let segmenter =
///     GraphemeClusterSegmenter::new();
///
/// // நி (TAMIL LETTER NA, TAMIL VOWEL SIGN I) is an extended grapheme cluster,
/// // but not a legacy grapheme cluster.
/// let ni = "நி";
/// let egc_boundaries: Vec<usize> = segmenter.segment_str(ni).collect();
/// assert_eq!(&egc_boundaries, &[0, ni.len()]);
/// ```
#[derive(Debug)]
pub struct GraphemeClusterSegmenter {
    // Grapheme cluster break data held by this segmenter; `as_borrowed()` hands out
    // a reference to it.
    payload: DataPayload<SegmenterBreakGraphemeClusterV1>,
}
112
113/// Segments a string into grapheme clusters (borrowed version).
114///
115/// See [`GraphemeClusterSegmenter`] for examples.
116#[derive(Clone, Debug, Copy)]
117pub struct GraphemeClusterSegmenterBorrowed<'data> {
118    data: &'data RuleBreakData<'data>,
119}
120
121impl GraphemeClusterSegmenter {
122    /// Constructs a [`GraphemeClusterSegmenterBorrowed`] with an invariant locale from compiled data.
123    ///
124    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
125    ///
126    /// [📚 Help choosing a constructor](icu_provider::constructors)
127    #[cfg(feature = "compiled_data")]
128    #[allow(clippy::new_ret_no_self)] // Deliberate choice, see #5554
129    pub const fn new() -> GraphemeClusterSegmenterBorrowed<'static> {
130        GraphemeClusterSegmenterBorrowed {
131            data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_GRAPHEME_CLUSTER_V1,
132        }
133    }
134
135    icu_provider::gen_buffer_data_constructors!(() -> error: DataError,
136        functions: [
137            new: skip,
138            try_new_with_buffer_provider,
139            try_new_unstable,
140            Self,
141    ]);
142
143    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
144    pub fn try_new_unstable<D>(provider: &D) -> Result<Self, DataError>
145    where
146        D: DataProvider<SegmenterBreakGraphemeClusterV1> + ?Sized,
147    {
148        let payload = provider.load(Default::default())?.payload;
149        Ok(Self { payload })
150    }
151
152    /// Constructs a borrowed version of this type for more efficient querying.
153    ///
154    /// Most useful methods for segmentation are on this type.
155    pub fn as_borrowed(&self) -> GraphemeClusterSegmenterBorrowed<'_> {
156        GraphemeClusterSegmenterBorrowed {
157            data: self.payload.get(),
158        }
159    }
160}
161
162impl<'data> GraphemeClusterSegmenterBorrowed<'data> {
163    /// Creates a grapheme cluster break iterator for an `str` (a UTF-8 string).
164    pub fn segment_str<'s>(self, input: &'s str) -> GraphemeClusterBreakIterator<'data, 's, Utf8> {
165        GraphemeClusterBreakIterator(RuleBreakIterator {
166            iter: input.char_indices(),
167            len: input.len(),
168            current_pos_data: None,
169            result_cache: Vec::new(),
170            data: self.data,
171            complex: None,
172            boundary_property: 0,
173            locale_override: None,
174            handle_complex_language: empty_handle_complex_language,
175        })
176    }
177    /// Creates a grapheme cluster break iterator for a potentially ill-formed UTF8 string
178    ///
179    /// Invalid characters are treated as REPLACEMENT CHARACTER
180    ///
181    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
182    pub fn segment_utf8<'s>(
183        self,
184        input: &'s [u8],
185    ) -> GraphemeClusterBreakIterator<'data, 's, PotentiallyIllFormedUtf8> {
186        GraphemeClusterBreakIterator(RuleBreakIterator {
187            iter: Utf8CharIndices::new(input),
188            len: input.len(),
189            current_pos_data: None,
190            result_cache: Vec::new(),
191            data: self.data,
192            complex: None,
193            boundary_property: 0,
194            locale_override: None,
195            handle_complex_language: empty_handle_complex_language,
196        })
197    }
198    /// Creates a grapheme cluster break iterator for a Latin-1 (8-bit) string.
199    ///
200    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
201    pub fn segment_latin1<'s>(
202        self,
203        input: &'s [u8],
204    ) -> GraphemeClusterBreakIterator<'data, 's, Latin1> {
205        GraphemeClusterBreakIterator(RuleBreakIterator {
206            iter: Latin1Indices::new(input),
207            len: input.len(),
208            current_pos_data: None,
209            result_cache: Vec::new(),
210            data: self.data,
211            complex: None,
212            boundary_property: 0,
213            locale_override: None,
214            handle_complex_language: empty_handle_complex_language,
215        })
216    }
217
218    /// Creates a grapheme cluster break iterator for a UTF-16 string.
219    ///
220    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
221    pub fn segment_utf16<'s>(
222        self,
223        input: &'s [u16],
224    ) -> GraphemeClusterBreakIterator<'data, 's, Utf16> {
225        GraphemeClusterBreakIterator(RuleBreakIterator {
226            iter: Utf16Indices::new(input),
227            len: input.len(),
228            current_pos_data: None,
229            result_cache: Vec::new(),
230            data: self.data,
231            complex: None,
232            boundary_property: 0,
233            locale_override: None,
234            handle_complex_language: empty_handle_complex_language,
235        })
236    }
237}
impl GraphemeClusterSegmenterBorrowed<'static> {
    /// Cheaply converts a [`GraphemeClusterSegmenterBorrowed<'static>`] into a [`GraphemeClusterSegmenter`].
    ///
    /// This is only available for the `'static` lifetime, because the resulting
    /// segmenter keeps the same data reference that `self` holds.
    ///
    /// Note: Due to branching and indirection, using [`GraphemeClusterSegmenter`] might inhibit some
    /// compile-time optimizations that are possible with [`GraphemeClusterSegmenterBorrowed`].
    pub const fn static_to_owned(self) -> GraphemeClusterSegmenter {
        GraphemeClusterSegmenter {
            // The payload is built from the existing `'static` reference; no provider is involved.
            payload: DataPayload::from_static_ref(self.data),
        }
    }
}
249
// An empty input must yield exactly one boundary, at offset 0.
//
// `GraphemeClusterSegmenter::new()` only exists when the `compiled_data` feature is
// enabled, so the test carries the same gate; otherwise the test build fails to
// compile when that feature is turned off.
#[cfg(feature = "compiled_data")]
#[test]
fn empty_string() {
    let segmenter = GraphemeClusterSegmenter::new();
    let breaks: Vec<usize> = segmenter.segment_str("").collect();
    assert_eq!(breaks, [0]);
}
256
// Regression test for https://github.com/unicode-org/icu4x/issues/4780:
// each flag must stay a single grapheme cluster.
//
// `GraphemeClusterSegmenter::new()` only exists when the `compiled_data` feature is
// enabled, so the test carries the same gate; otherwise the test build fails to
// compile when that feature is turned off.
#[cfg(feature = "compiled_data")]
#[test]
fn emoji_flags() {
    let segmenter = GraphemeClusterSegmenter::new();
    // The input is spelled with escapes because the tag characters are invisible and
    // easily lost when the source is copied or re-encoded:
    // - U+1F1FA U+1F1F8: a regional indicator pair (2 x 4 bytes in UTF-8 = 8 bytes)
    // - U+1F3F4 followed by the tag characters "gbeng" and CANCEL TAG (U+E007F):
    //   an emoji tag sequence (7 x 4 bytes = 28 bytes)
    let flags = "\u{1F1FA}\u{1F1F8}\u{1F3F4}\u{E0067}\u{E0062}\u{E0065}\u{E006E}\u{E0067}\u{E007F}";
    let breaks: Vec<usize> = segmenter.segment_str(flags).collect();
    assert_eq!(breaks, [0, 8, 36]);
}
263}