icu_segmenter/
line.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::complex::*;
6use crate::indices::*;
7use crate::provider::*;
8use crate::rule_segmenter::*;
9use alloc::string::String;
10use alloc::vec;
11use alloc::vec::Vec;
12use core::char;
13use icu_locale_core::subtags::language;
14use icu_locale_core::LanguageIdentifier;
15use icu_provider::prelude::*;
16use utf8_iter::Utf8CharIndices;
17
// TODO(#1637): These constants should be data driven.
//
// Numeric identifiers of the line break property values used by the line segmenter.
// The property table yields these values for a code point, and a pair of them
// (left, right) indexes the break state table.
//
// NOTE(review): the names look like UAX #14 Line_Break class abbreviations, with
// suffixed variants (`AL_DOTTED_CIRCLE`, `ID_CN`, `OP_EA`, `OP_OP30`, `QU_PF`,
// `QU_PI`, ...) presumably being refinements of the base class; the numbering
// presumably has to match the generated segmenter data -- confirm before changing.

// Value obtained from the property table when a code point has no entry.
#[allow(dead_code)]
const UNKNOWN: u8 = 0;
#[allow(dead_code)]
const AI: u8 = 1;
#[allow(dead_code)]
const AK: u8 = 2;
#[allow(dead_code)]
const AL: u8 = 3;
#[allow(dead_code)]
const AL_DOTTED_CIRCLE: u8 = 4;
#[allow(dead_code)]
const AP: u8 = 5;
#[allow(dead_code)]
const AS: u8 = 6;
#[allow(dead_code)]
const B2: u8 = 7;
#[allow(dead_code)]
const BA: u8 = 8;
#[allow(dead_code)]
const BB: u8 = 9;
#[allow(dead_code)]
const BK: u8 = 10;
#[allow(dead_code)]
const CB: u8 = 11;
// Resolved to ID under loose/normal strictness or break-all; otherwise left as is
// (treated as NS by the rules, yielding strict line breaking).
#[allow(dead_code)]
const CJ: u8 = 12;
#[allow(dead_code)]
const CL: u8 = 13;
#[allow(dead_code)]
const CM: u8 = 14;
#[allow(dead_code)]
const CP: u8 = 15;
#[allow(dead_code)]
const CR: u8 = 16;
#[allow(dead_code)]
const EB: u8 = 17;
#[allow(dead_code)]
const EM: u8 = 18;
#[allow(dead_code)]
const EX: u8 = 19;
#[allow(dead_code)]
const GL: u8 = 20;
#[allow(dead_code)]
const H2: u8 = 21;
#[allow(dead_code)]
const H3: u8 = 22;
#[allow(dead_code)]
const HL: u8 = 23;
#[allow(dead_code)]
const HY: u8 = 24;
#[allow(dead_code)]
const ID: u8 = 25;
#[allow(dead_code)]
const ID_CN: u8 = 26;
#[allow(dead_code)]
const IN: u8 = 27;
#[allow(dead_code)]
const IS: u8 = 28;
#[allow(dead_code)]
const JL: u8 = 29;
#[allow(dead_code)]
const JT: u8 = 30;
#[allow(dead_code)]
const JV: u8 = 31;
#[allow(dead_code)]
const LF: u8 = 32;
#[allow(dead_code)]
const NL: u8 = 33;
#[allow(dead_code)]
const NS: u8 = 34;
#[allow(dead_code)]
const NU: u8 = 35;
#[allow(dead_code)]
const OP_EA: u8 = 36;
#[allow(dead_code)]
const OP_OP30: u8 = 37;
#[allow(dead_code)]
const PO: u8 = 38;
// Line Break property PO combined with the East Asian Width property.
#[allow(dead_code)]
const PO_EAW: u8 = 39;
#[allow(dead_code)]
const PR: u8 = 40;
// Line Break property PR combined with the East Asian Width property.
#[allow(dead_code)]
const PR_EAW: u8 = 41;
#[allow(dead_code)]
const QU: u8 = 42;
#[allow(dead_code)]
const QU_PF: u8 = 43;
#[allow(dead_code)]
const QU_PI: u8 = 44;
#[allow(dead_code)]
const RI: u8 = 45;
// Code points with this value are handed to the complex-script breaker.
#[allow(dead_code)]
const SA: u8 = 46;
#[allow(dead_code)]
const SP: u8 = 47;
#[allow(dead_code)]
const SY: u8 = 48;
#[allow(dead_code)]
const VF: u8 = 49;
#[allow(dead_code)]
const VI: u8 = 50;
#[allow(dead_code)]
const WJ: u8 = 51;
#[allow(dead_code)]
const XX: u8 = 52;
#[allow(dead_code)]
const ZW: u8 = 53;
#[allow(dead_code)]
const ZWJ: u8 = 54;
129
/// Specifies the strictness of line-breaking rules. It can be passed as an
/// option when creating a line segmenter.
///
/// Each enum value has the same meaning with respect to the `line-break`
/// property values in the CSS Text spec. See the details in
/// <https://drafts.csswg.org/css-text-3/#line-break-property>.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub enum LineBreakStrictness {
    /// Breaks text using the least restrictive set of line-breaking rules.
    /// Typically used for short lines, such as in newspapers.
    /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-loose>
    Loose,

    /// Breaks text using the most common set of line-breaking rules.
    /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-normal>
    Normal,

    /// Breaks text using the most stringent set of line-breaking rules.
    /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-strict>
    ///
    /// This is the default behaviour of the Unicode Line Breaking Algorithm,
    /// resolving class [CJ](https://www.unicode.org/reports/tr14/#CJ) to
    /// [NS](https://www.unicode.org/reports/tr14/#NS);
    /// see rule [LB1](https://www.unicode.org/reports/tr14/#LB1).
    ///
    /// This is also the [`Default`] value of this enum.
    #[default]
    Strict,

    /// Breaks text assuming there is a soft wrap opportunity around every
    /// typographic character unit, disregarding any prohibition against line
    /// breaks. See more details in
    /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-anywhere>.
    Anywhere,
}
164
/// Specifies the line break opportunities between letters. It can be passed
/// as an option when creating a line segmenter.
///
/// Each enum value has the same meaning with respect to the `word-break`
/// property values in the CSS Text spec. See the details in
/// <https://drafts.csswg.org/css-text-3/#word-break-property>.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub enum LineBreakWordOption {
    /// Words break according to their customary rules. See the details in
    /// <https://drafts.csswg.org/css-text-3/#valdef-word-break-normal>.
    ///
    /// This is the [`Default`] value of this enum.
    #[default]
    Normal,

    /// Breaking is allowed within "words".
    /// <https://drafts.csswg.org/css-text-3/#valdef-word-break-break-all>
    BreakAll,

    /// Breaking is forbidden within "words".
    /// <https://drafts.csswg.org/css-text-3/#valdef-word-break-keep-all>
    KeepAll,
}
187
/// Options to tailor line-breaking behavior.
///
/// Every field is optional; a field left as `None` falls back to the default
/// documented on that field.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub struct LineBreakOptions<'a> {
    /// Strictness of line-breaking rules. See [`LineBreakStrictness`].
    ///
    /// Default is [`LineBreakStrictness::Strict`]
    pub strictness: Option<LineBreakStrictness>,

    /// Line break opportunities between letters. See [`LineBreakWordOption`].
    ///
    /// Default is [`LineBreakWordOption::Normal`]
    pub word_option: Option<LineBreakWordOption>,

    /// Content locale for line segmenter
    ///
    /// This allows more break opportunities when `LineBreakStrictness` is
    /// `Normal` or `Loose`. See
    /// <https://drafts.csswg.org/css-text-3/#line-break-property> for details.
    /// This option has no effect in Latin-1 mode.
    ///
    /// Only the language subtag is consulted: the tailoring is enabled when it
    /// is `ja` or `zh`, and disabled otherwise (including when this is `None`).
    pub content_locale: Option<&'a LanguageIdentifier>,
}
210
/// [`LineBreakOptions`] with every optional setting resolved to a concrete value.
///
/// This is the form stored by the segmenters and iterators.
#[derive(Debug, Clone, Copy)]
struct ResolvedLineBreakOptions {
    /// Strictness of the line-breaking rules.
    strictness: LineBreakStrictness,
    /// Line break opportunities between letters.
    word_option: LineBreakWordOption,
    /// Whether the content locale's language is Japanese (`ja`) or Chinese (`zh`).
    ja_zh: bool,
}
217
218impl From<LineBreakOptions<'_>> for ResolvedLineBreakOptions {
219    fn from(options: LineBreakOptions<'_>) -> Self {
220        let ja_zh = if let Some(content_locale) = options.content_locale.as_ref() {
221            content_locale.language == language!("ja") || content_locale.language == language!("zh")
222        } else {
223            false
224        };
225        Self {
226            strictness: options.strictness.unwrap_or_default(),
227            word_option: options.word_option.unwrap_or_default(),
228            ja_zh,
229        }
230    }
231}
232
233/// Supports loading line break data, and creating line break iterators for different string
234/// encodings.
235///
236/// Most segmentation methods live on [`LineSegmenterBorrowed`], which can be obtained via
237/// [`LineSegmenter::new_auto()`] (etc) or [`LineSegmenter::as_borrowed()`].
238///
239/// The segmenter returns mandatory breaks (as defined by [definition LD7][LD7] of
240/// Unicode Standard Annex #14, _Unicode Line Breaking Algorithm_) as well as
241/// line break opportunities ([definition LD3][LD3]).
242/// It does not distinguish them.  Callers requiring that distinction can check
243/// the Line_Break property of the code point preceding the break against those
244/// listed in rules [LB4][LB4] and [LB5][LB5], special-casing the end of text
245/// according to [LB3][LB3].
246///
247/// For consistency with the grapheme, word, and sentence segmenters, there is
248/// always a breakpoint returned at index 0, but this breakpoint is not a
249/// meaningful line break opportunity.
250///
251/// [LD3]: https://www.unicode.org/reports/tr14/#LD3
252/// [LD7]: https://www.unicode.org/reports/tr14/#LD7
253/// [LB3]: https://www.unicode.org/reports/tr14/#LB3
254/// [LB4]: https://www.unicode.org/reports/tr14/#LB4
255/// [LB5]: https://www.unicode.org/reports/tr14/#LB5
256///
257/// ```rust
258/// # use icu::segmenter::LineSegmenter;
259/// #
260/// # let segmenter = LineSegmenter::new_auto(Default::default());
261/// #
262/// let text = "Summary\r\nThis annex…";
263/// let breakpoints: Vec<usize> = segmenter.segment_str(text).collect();
264/// // 9 and 22 are mandatory breaks, 14 is a line break opportunity.
265/// assert_eq!(&breakpoints, &[0, 9, 14, 22]);
266///
267/// // There is a break opportunity between emoji, but not within the ZWJ sequence 🏳️‍🌈.
268/// let flag_equation = "🏳️➕🌈🟰🏳️\u{200D}🌈";
269/// let possible_first_lines: Vec<&str> =
270///     segmenter.segment_str(flag_equation).skip(1).map(|i| &flag_equation[..i]).collect();
271/// assert_eq!(
272///     &possible_first_lines,
273///     &[
274///         "🏳️",
275///         "🏳️➕",
276///         "🏳️➕🌈",
277///         "🏳️➕🌈🟰",
278///         "🏳️➕🌈🟰🏳️‍🌈"
279///     ]
280/// );
281/// ```
282///
283/// # Examples
284///
285/// Segment a string with default options:
286///
287/// ```rust
288/// use icu::segmenter::LineSegmenter;
289///
290/// let segmenter = LineSegmenter::new_auto(Default::default());
291///
292/// let breakpoints: Vec<usize> =
293///     segmenter.segment_str("Hello World").collect();
294/// assert_eq!(&breakpoints, &[0, 6, 11]);
295/// ```
296///
297/// Segment a string with CSS option overrides:
298///
299/// ```rust
300/// use icu::segmenter::options::{
301///     LineBreakOptions, LineBreakStrictness, LineBreakWordOption,
302/// };
303/// use icu::segmenter::LineSegmenter;
304///
305/// let mut options = LineBreakOptions::default();
306/// options.strictness = Some(LineBreakStrictness::Strict);
307/// options.word_option = Some(LineBreakWordOption::BreakAll);
308/// options.content_locale = None;
309/// let segmenter = LineSegmenter::new_auto(options);
310///
311/// let breakpoints: Vec<usize> =
312///     segmenter.segment_str("Hello World").collect();
313/// assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11]);
314/// ```
315///
316/// Segment a Latin1 byte string:
317///
318/// ```rust
319/// use icu::segmenter::LineSegmenter;
320///
321/// let segmenter = LineSegmenter::new_auto(Default::default());
322///
323/// let breakpoints: Vec<usize> =
324///     segmenter.segment_latin1(b"Hello World").collect();
325/// assert_eq!(&breakpoints, &[0, 6, 11]);
326/// ```
327///
328/// Separate mandatory breaks from the break opportunities:
329///
330/// ```rust
331/// use icu::properties::{props::LineBreak, CodePointMapData};
332/// use icu::segmenter::LineSegmenter;
333///
334/// # let segmenter = LineSegmenter::new_auto(Default::default());
335/// #
336/// let text = "Summary\r\nThis annex…";
337///
338/// let mandatory_breaks: Vec<usize> = segmenter
339///     .segment_str(text)
340///     .filter(|&i| {
341///         text[..i].chars().next_back().is_some_and(|c| {
342///             matches!(
343///                 CodePointMapData::<LineBreak>::new().get(c),
344///                 LineBreak::MandatoryBreak
345///                     | LineBreak::CarriageReturn
346///                     | LineBreak::LineFeed
347///                     | LineBreak::NextLine
348///             ) || i == text.len()
349///         })
350///     })
351///     .collect();
352/// assert_eq!(&mandatory_breaks, &[9, 22]);
353/// ```
#[derive(Debug)]
pub struct LineSegmenter {
    /// Line break options, resolved once at construction time.
    options: ResolvedLineBreakOptions,
    /// Owned payload holding the line break rule data.
    payload: DataPayload<SegmenterBreakLineV1>,
    /// Data for segmenting complex scripts (LSTM or dictionary, depending on
    /// the constructor used).
    complex: ComplexPayloads,
}
360
/// Segments a string into lines (borrowed version).
///
/// This type is `Copy`; the `segment_*` methods take it by value.
///
/// See [`LineSegmenter`] for examples.
#[derive(Clone, Debug, Copy)]
pub struct LineSegmenterBorrowed<'data> {
    /// Line break options, resolved once at construction time.
    options: ResolvedLineBreakOptions,
    /// Borrowed line break rule data.
    data: &'data RuleBreakData<'data>,
    /// Borrowed data for segmenting complex scripts.
    complex: ComplexPayloadsBorrowed<'data>,
}
370
371impl LineSegmenter {
372    /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and
373    /// the best available compiled data for complex scripts (Khmer, Lao, Myanmar, and Thai).
374    ///
375    /// The current behavior, which is subject to change, is to use the LSTM model when available.
376    ///
377    /// See also [`Self::new_auto`].
378    ///
379    /// ✨ *Enabled with the `compiled_data` and `auto` Cargo features.*
380    ///
381    /// [📚 Help choosing a constructor](icu_provider::constructors)
382    #[cfg(feature = "auto")]
383    #[cfg(feature = "compiled_data")]
384    pub fn new_auto(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
385        Self::new_lstm(options)
386    }
387
388    #[cfg(feature = "auto")]
389    icu_provider::gen_buffer_data_constructors!(
390        (options: LineBreakOptions) -> error: DataError,
391        functions: [
392            new_auto: skip,
393            try_new_auto_with_buffer_provider,
394            try_new_auto_unstable,
395            Self,
396        ]
397    );
398
399    #[cfg(feature = "auto")]
400    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_auto)]
401    pub fn try_new_auto_unstable<D>(
402        provider: &D,
403        options: LineBreakOptions,
404    ) -> Result<Self, DataError>
405    where
406        D: DataProvider<SegmenterBreakLineV1>
407            + DataProvider<SegmenterLstmAutoV1>
408            + DataProvider<SegmenterBreakGraphemeClusterV1>
409            + ?Sized,
410    {
411        Self::try_new_lstm_unstable(provider, options)
412    }
413
414    /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and
415    /// compiled LSTM data for complex scripts (Khmer, Lao, Myanmar, and Thai).
416    ///
417    /// The LSTM, or Long Term Short Memory, is a machine learning model. It is smaller than
418    /// the full dictionary but more expensive during segmentation (inference).
419    ///
420    /// See also [`Self::new_lstm`].
421    ///
422    /// ✨ *Enabled with the `compiled_data` and `lstm` Cargo features.*
423    ///
424    /// [📚 Help choosing a constructor](icu_provider::constructors)
425    #[cfg(feature = "lstm")]
426    #[cfg(feature = "compiled_data")]
427    pub fn new_lstm(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
428        LineSegmenterBorrowed {
429            options: options.into(),
430            data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_LINE_V1,
431            complex: ComplexPayloadsBorrowed::new_lstm(),
432        }
433    }
434
435    #[cfg(feature = "lstm")]
436    icu_provider::gen_buffer_data_constructors!(
437        (options: LineBreakOptions) -> error: DataError,
438        functions: [
439            try_new_lstm: skip,
440            try_new_lstm_with_buffer_provider,
441            try_new_lstm_unstable,
442            Self,
443        ]
444    );
445
446    #[cfg(feature = "lstm")]
447    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_lstm)]
448    pub fn try_new_lstm_unstable<D>(
449        provider: &D,
450        options: LineBreakOptions,
451    ) -> Result<Self, DataError>
452    where
453        D: DataProvider<SegmenterBreakLineV1>
454            + DataProvider<SegmenterLstmAutoV1>
455            + DataProvider<SegmenterBreakGraphemeClusterV1>
456            + ?Sized,
457    {
458        Ok(Self {
459            options: options.into(),
460            payload: provider.load(Default::default())?.payload,
461            complex: ComplexPayloads::try_new_lstm(provider)?,
462        })
463    }
464
465    /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and
466    /// compiled dictionary data for complex scripts (Khmer, Lao, Myanmar, and Thai).
467    ///
468    /// The dictionary model uses a list of words to determine appropriate breakpoints. It is
469    /// faster than the LSTM model but requires more data.
470    ///
471    /// See also [`Self::new_dictionary`].
472    ///
473    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
474    ///
475    /// [📚 Help choosing a constructor](icu_provider::constructors)
476    #[cfg(feature = "compiled_data")]
477    pub fn new_dictionary(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
478        LineSegmenterBorrowed {
479            options: options.into(),
480            data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_LINE_V1,
481            // Line segmenter doesn't need to load CJ dictionary because UAX 14 rules handles CJK
482            // characters [1]. Southeast Asian languages however require complex context analysis
483            // [2].
484            //
485            // [1]: https://www.unicode.org/reports/tr14/#ID
486            // [2]: https://www.unicode.org/reports/tr14/#SA
487            complex: ComplexPayloadsBorrowed::new_southeast_asian(),
488        }
489    }
490
491    icu_provider::gen_buffer_data_constructors!(
492        (options: LineBreakOptions) -> error: DataError,
493        functions: [
494            new_dictionary: skip,
495            try_new_dictionary_with_buffer_provider,
496            try_new_dictionary_unstable,
497            Self,
498        ]
499    );
500
501    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary)]
502    pub fn try_new_dictionary_unstable<D>(
503        provider: &D,
504        options: LineBreakOptions,
505    ) -> Result<Self, DataError>
506    where
507        D: DataProvider<SegmenterBreakLineV1>
508            + DataProvider<SegmenterDictionaryExtendedV1>
509            + DataProvider<SegmenterBreakGraphemeClusterV1>
510            + ?Sized,
511    {
512        Ok(Self {
513            options: options.into(),
514            payload: provider.load(Default::default())?.payload,
515            // Line segmenter doesn't need to load CJ dictionary because UAX 14 rules handles CJK
516            // characters [1]. Southeast Asian languages however require complex context analysis
517            // [2].
518            //
519            // [1]: https://www.unicode.org/reports/tr14/#ID
520            // [2]: https://www.unicode.org/reports/tr14/#SA
521            complex: ComplexPayloads::try_new_southeast_asian(provider)?,
522        })
523    }
524
525    /// Constructs a borrowed version of this type for more efficient querying.
526    ///
527    /// Most useful methods for segmentation are on this type.
528    pub fn as_borrowed(&self) -> LineSegmenterBorrowed<'_> {
529        LineSegmenterBorrowed {
530            options: self.options,
531            data: self.payload.get(),
532            complex: self.complex.as_borrowed(),
533        }
534    }
535}
536
537impl<'data> LineSegmenterBorrowed<'data> {
538    /// Creates a line break iterator for an `str` (a UTF-8 string).
539    ///
540    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
541    pub fn segment_str<'s>(self, input: &'s str) -> LineBreakIterator<'data, 's, Utf8> {
542        LineBreakIterator {
543            iter: input.char_indices(),
544            len: input.len(),
545            current_pos_data: None,
546            result_cache: Vec::new(),
547            data: self.data,
548            options: self.options,
549            complex: self.complex,
550        }
551    }
552    /// Creates a line break iterator for a potentially ill-formed UTF8 string
553    ///
554    /// Invalid characters are treated as REPLACEMENT CHARACTER
555    ///
556    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
557    pub fn segment_utf8<'s>(
558        self,
559        input: &'s [u8],
560    ) -> LineBreakIterator<'data, 's, PotentiallyIllFormedUtf8> {
561        LineBreakIterator {
562            iter: Utf8CharIndices::new(input),
563            len: input.len(),
564            current_pos_data: None,
565            result_cache: Vec::new(),
566            data: self.data,
567            options: self.options,
568            complex: self.complex,
569        }
570    }
571    /// Creates a line break iterator for a Latin-1 (8-bit) string.
572    ///
573    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
574    pub fn segment_latin1<'s>(self, input: &'s [u8]) -> LineBreakIterator<'data, 's, Latin1> {
575        LineBreakIterator {
576            iter: Latin1Indices::new(input),
577            len: input.len(),
578            current_pos_data: None,
579            result_cache: Vec::new(),
580            data: self.data,
581            options: self.options,
582            complex: self.complex,
583        }
584    }
585
586    /// Creates a line break iterator for a UTF-16 string.
587    ///
588    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
589    pub fn segment_utf16<'s>(self, input: &'s [u16]) -> LineBreakIterator<'data, 's, Utf16> {
590        LineBreakIterator {
591            iter: Utf16Indices::new(input),
592            len: input.len(),
593            current_pos_data: None,
594            result_cache: Vec::new(),
595            data: self.data,
596            options: self.options,
597            complex: self.complex,
598        }
599    }
600}
601
602impl LineSegmenterBorrowed<'static> {
603    /// Cheaply converts a [`LineSegmenterBorrowed<'static>`] into a [`LineSegmenter`].
604    ///
605    /// Note: Due to branching and indirection, using [`LineSegmenter`] might inhibit some
606    /// compile-time optimizations that are possible with [`LineSegmenterBorrowed`].
607    pub fn static_to_owned(self) -> LineSegmenter {
608        LineSegmenter {
609            payload: DataPayload::from_static_ref(self.data),
610            complex: self.complex.static_to_owned(),
611            options: self.options,
612        }
613    }
614}
615
616impl RuleBreakData<'_> {
617    fn get_linebreak_property_utf32_with_rule(
618        &self,
619        codepoint: u32,
620        strictness: LineBreakStrictness,
621        word_option: LineBreakWordOption,
622    ) -> u8 {
623        // Note: Default value is 0 == UNKNOWN
624        let prop = self.property_table.get32(codepoint);
625
626        if word_option == LineBreakWordOption::BreakAll
627            || strictness == LineBreakStrictness::Loose
628            || strictness == LineBreakStrictness::Normal
629        {
630            return match prop {
631                CJ => ID, // All CJ's General_Category is Other_Letter (Lo).
632                _ => prop,
633            };
634        }
635
636        // CJ is treated as NS by default, yielding strict line breaking.
637        // https://www.unicode.org/reports/tr14/#CJ
638        prop
639    }
640
641    #[inline]
642    fn get_break_state_from_table(&self, left: u8, right: u8) -> BreakState {
643        let idx = (left as usize) * (self.property_count as usize) + (right as usize);
644        // We use unwrap_or to fall back to the base case and prevent panics on bad data.
645        self.break_state_table.get(idx).unwrap_or(BreakState::Keep)
646    }
647
648    #[inline]
649    fn use_complex_breaking_utf32(&self, codepoint: u32) -> bool {
650        let line_break_property = self.get_linebreak_property_utf32_with_rule(
651            codepoint,
652            LineBreakStrictness::Strict,
653            LineBreakWordOption::Normal,
654        );
655
656        line_break_property == SA
657    }
658}
659
660#[inline]
661fn is_break_utf32_by_loose(
662    right_codepoint: u32,
663    left_prop: u8,
664    right_prop: u8,
665    ja_zh: bool,
666) -> Option<bool> {
667    // breaks before hyphens
668    if right_prop == BA {
669        if left_prop == ID && (right_codepoint == 0x2010 || right_codepoint == 0x2013) {
670            return Some(true);
671        }
672    } else if right_prop == NS {
673        // breaks before certain CJK hyphen-like characters
674        if right_codepoint == 0x301C || right_codepoint == 0x30A0 {
675            return Some(ja_zh);
676        }
677
678        // breaks before iteration marks
679        if right_codepoint == 0x3005
680            || right_codepoint == 0x303B
681            || right_codepoint == 0x309D
682            || right_codepoint == 0x309E
683            || right_codepoint == 0x30FD
684            || right_codepoint == 0x30FE
685        {
686            return Some(true);
687        }
688
689        // breaks before certain centered punctuation marks:
690        if right_codepoint == 0x30FB
691            || right_codepoint == 0xFF1A
692            || right_codepoint == 0xFF1B
693            || right_codepoint == 0xFF65
694            || right_codepoint == 0x203C
695            || (0x2047..=0x2049).contains(&right_codepoint)
696        {
697            return Some(ja_zh);
698        }
699    } else if right_prop == IN {
700        // breaks between inseparable characters such as U+2025, U+2026 i.e. characters with the Unicode Line Break property IN
701        return Some(true);
702    } else if right_prop == EX {
703        // breaks before certain centered punctuation marks:
704        if right_codepoint == 0xFF01 || right_codepoint == 0xFF1F {
705            return Some(ja_zh);
706        }
707    }
708
709    // breaks before suffixes:
710    // Characters with the Unicode Line Break property PO and the East Asian Width property
711    if right_prop == PO_EAW {
712        return Some(ja_zh);
713    }
714    // breaks after prefixes:
715    // Characters with the Unicode Line Break property PR and the East Asian Width property
716    if left_prop == PR_EAW {
717        return Some(ja_zh);
718    }
719    None
720}
721
/// A trait allowing for LineBreakIterator to be generalized to multiple string iteration methods.
///
/// This is implemented by ICU4X for several common string types.
///
/// <div class="stab unstable">
/// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
/// trait, please consider using a type from the implementors listed below.
/// </div>
pub trait LineBreakType: crate::private::Sealed + Sized + RuleBreakType {
    /// Reports whether `c` is subject to complex breaking.
    ///
    /// The iterator only hands a position to
    /// [`line_handle_complex_language`](Self::line_handle_complex_language) when this
    /// holds for the characters on both sides of it.
    #[doc(hidden)]
    fn use_complex_breaking(iterator: &LineBreakIterator<'_, '_, Self>, c: Self::CharType) -> bool;

    /// Returns the line break property value of `c` for the given iterator.
    // NOTE(review): presumably applies the iterator's strictness and word options;
    // the implementations are not visible here -- confirm.
    #[doc(hidden)]
    fn get_linebreak_property_with_rule(
        iterator: &LineBreakIterator<'_, '_, Self>,
        c: Self::CharType,
    ) -> u8;

    /// Handles a position where both neighbouring characters use complex breaking.
    ///
    /// When this returns `Some`, the iterator yields that value as its next
    /// breakpoint; on `None` it carries on with the rule table.
    #[doc(hidden)]
    fn line_handle_complex_language(
        iterator: &mut LineBreakIterator<'_, '_, Self>,
        left_codepoint: Self::CharType,
    ) -> Option<usize>;
}
746
/// Implements the [`Iterator`] trait over the line break opportunities of the given string.
///
/// Lifetimes:
///
/// - `'data` = lifetime of the segmenter data borrowed from the [`LineSegmenter`] object
///   from which this iterator was created
/// - `'s` = lifetime of the string being segmented
///
/// The [`Iterator::Item`] is an [`usize`] representing index of a code unit
/// _after_ the break (for a break at the end of text, this index is the length
/// of the [`str`] or array of code units).
///
/// For examples of use, see [`LineSegmenter`].
#[derive(Debug)]
pub struct LineBreakIterator<'data, 's, Y: LineBreakType> {
    /// Iterator over the characters of the input.
    iter: Y::IterAttr<'s>,
    /// Length of the input, in code units.
    len: usize,
    // NOTE(review): presumably the index and character at the iterator's current
    // position, `None` until iteration starts -- confirm against the helper methods.
    current_pos_data: Option<(usize, Y::CharType)>,
    /// Breakpoints cached by a previous run, returned before the rules are consulted again.
    result_cache: Vec<usize>,
    /// Line break rule data.
    data: &'data RuleBreakData<'data>,
    /// Resolved line break options.
    options: ResolvedLineBreakOptions,
    /// Data for segmenting complex scripts.
    complex: ComplexPayloadsBorrowed<'data>,
}
769
770impl<Y: LineBreakType> Iterator for LineBreakIterator<'_, '_, Y> {
771    type Item = usize;
772
773    fn next(&mut self) -> Option<Self::Item> {
774        match self.check_eof() {
775            StringBoundaryPosType::Start => return Some(0),
776            StringBoundaryPosType::End => return None,
777            _ => (),
778        }
779
780        // If we have break point cache by previous run, return this result
781        if let Some(&first_pos) = self.result_cache.first() {
782            let mut i = 0;
783            loop {
784                if i == first_pos {
785                    self.result_cache = self.result_cache.iter().skip(1).map(|r| r - i).collect();
786                    return self.get_current_position();
787                }
788                i += self.get_current_codepoint().map_or(0, Y::char_len);
789                self.advance_iter();
790                if self.is_eof() {
791                    self.result_cache.clear();
792                    return Some(self.len);
793                }
794            }
795        }
796
797        // The state prior to a sequence of CM and ZWJ affected by rule LB9.
798        let mut lb9_left: Option<u8> = None;
799        // Whether LB9 was applied to a ZWJ, so that breaks at the current
800        // position must be suppressed.
801        let mut lb8a_after_lb9 = false;
802
803        'a: loop {
804            debug_assert!(!self.is_eof());
805            let left_codepoint = self.get_current_codepoint()?;
806            let mut left_prop =
807                lb9_left.unwrap_or_else(|| self.get_linebreak_property(left_codepoint));
808            let after_zwj = lb8a_after_lb9 || (lb9_left.is_none() && left_prop == ZWJ);
809            self.advance_iter();
810
811            let Some(right_codepoint) = self.get_current_codepoint() else {
812                return Some(self.len);
813            };
814            let right_prop = self.get_linebreak_property(right_codepoint);
815            // NOTE(egg): The special-casing of `LineBreakStrictness::Anywhere` allows us to pass
816            // a test, but eventually that option should just be simplified to call the extended
817            // grapheme cluster segmenter.
818            if (right_prop == CM
819                || (right_prop == ZWJ && self.options.strictness != LineBreakStrictness::Anywhere))
820                && left_prop != BK
821                && left_prop != CR
822                && left_prop != LF
823                && left_prop != NL
824                && left_prop != SP
825                && left_prop != ZW
826            {
827                lb9_left = Some(left_prop);
828                lb8a_after_lb9 = right_prop == ZWJ;
829                continue;
830            } else {
831                lb9_left = None;
832                lb8a_after_lb9 = false;
833            }
834
835            // CSS word-break property handling
836            match (self.options.word_option, left_prop, right_prop) {
837                (LineBreakWordOption::BreakAll, AL | NU | SA, _) => {
838                    left_prop = ID;
839                }
840                //  typographic letter units shouldn't be break
841                (
842                    LineBreakWordOption::KeepAll,
843                    AI | AL | ID | NU | HY | H2 | H3 | JL | JV | JT | CJ,
844                    AI | AL | ID | NU | HY | H2 | H3 | JL | JV | JT | CJ,
845                ) => {
846                    continue;
847                }
848                _ => (),
849            }
850
851            // CSS line-break property handling
852            match self.options.strictness {
853                LineBreakStrictness::Normal => {
854                    if self.is_break_by_normal(right_codepoint) && !after_zwj {
855                        return self.get_current_position();
856                    }
857                }
858                LineBreakStrictness::Loose => {
859                    if let Some(breakable) = is_break_utf32_by_loose(
860                        right_codepoint.into(),
861                        left_prop,
862                        right_prop,
863                        self.options.ja_zh,
864                    ) {
865                        if breakable && !after_zwj {
866                            return self.get_current_position();
867                        }
868                        continue;
869                    }
870                }
871                LineBreakStrictness::Anywhere => {
872                    // TODO(egg): My reading of the CSS standard is that this
873                    // should break around extended grapheme clusters, not at
874                    // arbitrary code points, so this seems wrong.
875                    return self.get_current_position();
876                }
877                _ => (),
878            };
879
880            // UAX14 doesn't have Thai etc, so use another way.
881            if self.options.word_option != LineBreakWordOption::BreakAll
882                && Y::use_complex_breaking(self, left_codepoint)
883                && Y::use_complex_breaking(self, right_codepoint)
884            {
885                let result = Y::line_handle_complex_language(self, left_codepoint);
886                if result.is_some() {
887                    return result;
888                }
889                // I may have to fetch text until non-SA character?.
890            }
891
892            // If break_state is equals or grater than 0, it is alias of property.
893            match self.data.get_break_state_from_table(left_prop, right_prop) {
894                BreakState::Break | BreakState::NoMatch => {
895                    if after_zwj {
896                        continue;
897                    } else {
898                        return self.get_current_position();
899                    }
900                }
901                BreakState::Keep => continue,
902                BreakState::Index(mut index) | BreakState::Intermediate(mut index) => {
903                    let mut previous_iter = self.iter.clone();
904                    let mut previous_pos_data = self.current_pos_data;
905                    let mut previous_is_after_zwj = after_zwj;
906
907                    // Since we are building up a state in this inner loop, we do not
908                    // need an analogue of lb9_left; continuing the inner loop preserves
909                    // `index` which is the current state, and thus implements the
910                    // “treat as” rule.
911                    let mut left_prop_pre_lb9 = right_prop;
912
913                    // current state isn't resolved due to intermediating.
914                    // Example, [AK] [AS] is processing LB28a, but if not matched after fetching
915                    // data, we should break after [AK].
916                    let is_intermediate_rule_no_match = if lb8a_after_lb9 {
917                        // left was ZWJ so we don't break between ZWJ.
918                        true
919                    } else {
920                        index > self.data.last_codepoint_property
921                    };
922
923                    loop {
924                        self.advance_iter();
925                        let after_zwj = left_prop_pre_lb9 == ZWJ;
926
927                        let previous_break_state_is_cp_prop =
928                            index <= self.data.last_codepoint_property;
929
930                        let Some(prop) = self.get_current_linebreak_property() else {
931                            // Reached EOF. But we are analyzing multiple characters now, so next break may be previous point.
932                            let break_state = self
933                                .data
934                                .get_break_state_from_table(index, self.data.eot_property);
935                            if break_state == BreakState::NoMatch {
936                                self.iter = previous_iter;
937                                self.current_pos_data = previous_pos_data;
938                                if previous_is_after_zwj {
939                                    // Do not break [AK] [ZWJ] ÷ [AS] (eot).
940                                    continue 'a;
941                                } else {
942                                    return self.get_current_position();
943                                }
944                            }
945                            // EOF
946                            return Some(self.len);
947                        };
948
949                        if (prop == CM || prop == ZWJ)
950                            && left_prop_pre_lb9 != BK
951                            && left_prop_pre_lb9 != CR
952                            && left_prop_pre_lb9 != LF
953                            && left_prop_pre_lb9 != NL
954                            && left_prop_pre_lb9 != SP
955                            && left_prop_pre_lb9 != ZW
956                        {
957                            left_prop_pre_lb9 = prop;
958                            continue;
959                        }
960
961                        match self.data.get_break_state_from_table(index, prop) {
962                            BreakState::Keep => continue 'a,
963                            BreakState::NoMatch => {
964                                self.iter = previous_iter;
965                                self.current_pos_data = previous_pos_data;
966                                if after_zwj {
967                                    // Break [AK] ÷ [AS] [ZWJ] [XX],
968                                    // but not [AK] [ZWJ] ÷ [AS] [ZWJ] [XX].
969                                    if is_intermediate_rule_no_match && !previous_is_after_zwj {
970                                        return self.get_current_position();
971                                    }
972                                    continue 'a;
973                                } else if previous_is_after_zwj {
974                                    // Do not break [AK] [ZWJ] ÷ [AS] [XX].
975                                    continue 'a;
976                                } else {
977                                    return self.get_current_position();
978                                }
979                            }
980                            BreakState::Break => {
981                                if after_zwj {
982                                    continue 'a;
983                                } else {
984                                    return self.get_current_position();
985                                }
986                            }
987                            BreakState::Intermediate(i) => {
988                                index = i;
989                                previous_iter = self.iter.clone();
990                                previous_pos_data = self.current_pos_data;
991                                previous_is_after_zwj = after_zwj;
992                            }
993                            BreakState::Index(i) => {
994                                index = i;
995                                if previous_break_state_is_cp_prop {
996                                    previous_iter = self.iter.clone();
997                                    previous_pos_data = self.current_pos_data;
998                                    previous_is_after_zwj = after_zwj;
999                                }
1000                            }
1001                        }
1002                        left_prop_pre_lb9 = prop;
1003                    }
1004                }
1005            }
1006        }
1007    }
1008}
1009
/// Where the iterator currently stands relative to the input, as reported by
/// `LineBreakIterator::check_eof`.
enum StringBoundaryPosType {
    /// Iteration has not produced a boundary yet (this is also reported once
    /// for the empty string).
    Start,
    /// There is current position data, i.e. iteration is in progress.
    Middle,
    /// The underlying iterator yields nothing more for a non-empty input.
    End,
}
1015
1016impl<Y: LineBreakType> LineBreakIterator<'_, '_, Y> {
1017    fn advance_iter(&mut self) {
1018        self.current_pos_data = self.iter.next();
1019    }
1020
1021    fn is_eof(&self) -> bool {
1022        self.current_pos_data.is_none()
1023    }
1024
1025    #[inline]
1026    fn check_eof(&mut self) -> StringBoundaryPosType {
1027        if self.is_eof() {
1028            self.advance_iter();
1029            if self.is_eof() {
1030                if self.len == 0 {
1031                    // Empty string. Since `self.current_pos_data` is always going to be empty,
1032                    // we never read `self.len` except for here, so we can use it to mark that
1033                    // we have already returned the single empty-string breakpoint.
1034                    self.len = 1;
1035                    StringBoundaryPosType::Start
1036                } else {
1037                    StringBoundaryPosType::End
1038                }
1039            } else {
1040                StringBoundaryPosType::Start
1041            }
1042        } else {
1043            StringBoundaryPosType::Middle
1044        }
1045    }
1046
1047    fn get_current_position(&self) -> Option<usize> {
1048        self.current_pos_data.map(|(pos, _)| pos)
1049    }
1050
1051    fn get_current_codepoint(&self) -> Option<Y::CharType> {
1052        self.current_pos_data.map(|(_, codepoint)| codepoint)
1053    }
1054
1055    fn get_linebreak_property(&self, codepoint: Y::CharType) -> u8 {
1056        Y::get_linebreak_property_with_rule(self, codepoint)
1057    }
1058
1059    fn get_current_linebreak_property(&self) -> Option<u8> {
1060        self.get_current_codepoint()
1061            .map(|c| self.get_linebreak_property(c))
1062    }
1063
1064    fn is_break_by_normal(&self, codepoint: Y::CharType) -> bool {
1065        match codepoint.into() {
1066            0x301C | 0x30A0 => self.options.ja_zh,
1067            _ => false,
1068        }
1069    }
1070}
1071
1072impl LineBreakType for Utf8 {
1073    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 {
1074        iterator.data.get_linebreak_property_utf32_with_rule(
1075            c as u32,
1076            iterator.options.strictness,
1077            iterator.options.word_option,
1078        )
1079    }
1080
1081    #[inline]
1082    fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool {
1083        iterator.data.use_complex_breaking_utf32(c as u32)
1084    }
1085
1086    fn line_handle_complex_language(
1087        iter: &mut LineBreakIterator<'_, '_, Self>,
1088        left_codepoint: char,
1089    ) -> Option<usize> {
1090        line_handle_complex_language_utf8(iter, left_codepoint)
1091    }
1092}
1093
1094impl LineBreakType for PotentiallyIllFormedUtf8 {
1095    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 {
1096        iterator.data.get_linebreak_property_utf32_with_rule(
1097            c as u32,
1098            iterator.options.strictness,
1099            iterator.options.word_option,
1100        )
1101    }
1102
1103    #[inline]
1104    fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool {
1105        iterator.data.use_complex_breaking_utf32(c as u32)
1106    }
1107
1108    fn line_handle_complex_language(
1109        iter: &mut LineBreakIterator<'_, '_, Self>,
1110        left_codepoint: char,
1111    ) -> Option<usize> {
1112        line_handle_complex_language_utf8(iter, left_codepoint)
1113    }
1114}
1115/// line_handle_complex_language impl for UTF8 iterators
1116fn line_handle_complex_language_utf8<T>(
1117    iter: &mut LineBreakIterator<'_, '_, T>,
1118    left_codepoint: char,
1119) -> Option<usize>
1120where
1121    T: LineBreakType<CharType = char>,
1122{
1123    // word segmenter doesn't define break rules for some languages such as Thai.
1124    let start_iter = iter.iter.clone();
1125    let start_point = iter.current_pos_data;
1126    let mut s = String::new();
1127    s.push(left_codepoint);
1128    loop {
1129        debug_assert!(!iter.is_eof());
1130        s.push(iter.get_current_codepoint()?);
1131        iter.advance_iter();
1132        if let Some(current_codepoint) = iter.get_current_codepoint() {
1133            if !T::use_complex_breaking(iter, current_codepoint) {
1134                break;
1135            }
1136        } else {
1137            // EOF
1138            break;
1139        }
1140    }
1141
1142    // Restore iterator to move to head of complex string
1143    iter.iter = start_iter;
1144    iter.current_pos_data = start_point;
1145    let breaks = iter.complex.complex_language_segment_str(&s);
1146    iter.result_cache = breaks;
1147    let first_pos = *iter.result_cache.first()?;
1148    let mut i = left_codepoint.len_utf8();
1149    loop {
1150        if i == first_pos {
1151            // Re-calculate breaking offset
1152            iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
1153            return iter.get_current_position();
1154        }
1155        debug_assert!(
1156            i < first_pos,
1157            "we should always arrive at first_pos: near index {:?}",
1158            iter.get_current_position()
1159        );
1160        i += iter.get_current_codepoint().map_or(0, T::char_len);
1161        iter.advance_iter();
1162        if iter.is_eof() {
1163            iter.result_cache.clear();
1164            return Some(iter.len);
1165        }
1166    }
1167}
1168
1169impl LineBreakType for Latin1 {
1170    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u8) -> u8 {
1171        // No CJ on Latin1
1172        // Note: Default value is 0 == UNKNOWN
1173        iterator.data.property_table.get32(c as u32)
1174    }
1175
1176    #[inline]
1177    fn use_complex_breaking(_iterator: &LineBreakIterator<Self>, _c: u8) -> bool {
1178        false
1179    }
1180
1181    fn line_handle_complex_language(
1182        _: &mut LineBreakIterator<Self>,
1183        _: Self::CharType,
1184    ) -> Option<usize> {
1185        unreachable!()
1186    }
1187}
1188
1189impl LineBreakType for Utf16 {
1190    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u32) -> u8 {
1191        iterator.data.get_linebreak_property_utf32_with_rule(
1192            c,
1193            iterator.options.strictness,
1194            iterator.options.word_option,
1195        )
1196    }
1197
1198    #[inline]
1199    fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: u32) -> bool {
1200        iterator.data.use_complex_breaking_utf32(c)
1201    }
1202
1203    fn line_handle_complex_language(
1204        iterator: &mut LineBreakIterator<Self>,
1205        left_codepoint: Self::CharType,
1206    ) -> Option<usize> {
1207        // word segmenter doesn't define break rules for some languages such as Thai.
1208        let start_iter = iterator.iter.clone();
1209        let start_point = iterator.current_pos_data;
1210        let mut s = vec![left_codepoint as u16];
1211        loop {
1212            debug_assert!(!iterator.is_eof());
1213            s.push(iterator.get_current_codepoint()? as u16);
1214            iterator.advance_iter();
1215            if let Some(current_codepoint) = iterator.get_current_codepoint() {
1216                if !Self::use_complex_breaking(iterator, current_codepoint) {
1217                    break;
1218                }
1219            } else {
1220                // EOF
1221                break;
1222            }
1223        }
1224
1225        // Restore iterator to move to head of complex string
1226        iterator.iter = start_iter;
1227        iterator.current_pos_data = start_point;
1228        let breaks = iterator.complex.complex_language_segment_utf16(&s);
1229        iterator.result_cache = breaks;
1230        // result_cache vector is utf-16 index that is in BMP.
1231        let first_pos = *iterator.result_cache.first()?;
1232        let mut i = 1;
1233        loop {
1234            if i == first_pos {
1235                // Re-calculate breaking offset
1236                iterator.result_cache = iterator
1237                    .result_cache
1238                    .iter()
1239                    .skip(1)
1240                    .map(|r| r - i)
1241                    .collect();
1242                return iterator.get_current_position();
1243            }
1244            debug_assert!(
1245                i < first_pos,
1246                "we should always arrive at first_pos: near index {:?}",
1247                iterator.get_current_position()
1248            );
1249            i += 1;
1250            iterator.advance_iter();
1251            if iterator.is_eof() {
1252                iterator.result_cache.clear();
1253                return Some(iterator.len);
1254            }
1255        }
1256    }
1257}
1258
1259#[cfg(test)]
1260#[cfg(feature = "serde")]
1261mod tests {
1262    use super::*;
1263    use crate::LineSegmenter;
1264
1265    #[test]
1266    fn linebreak_property() {
1267        let payload =
1268            DataProvider::<SegmenterBreakLineV1>::load(&crate::provider::Baked, Default::default())
1269                .expect("Loading should succeed!")
1270                .payload;
1271
1272        let get_linebreak_property = |codepoint| {
1273            payload.get().get_linebreak_property_utf32_with_rule(
1274                codepoint as u32,
1275                LineBreakStrictness::Strict,
1276                LineBreakWordOption::Normal,
1277            )
1278        };
1279
1280        assert_eq!(get_linebreak_property('\u{0020}'), SP);
1281        assert_eq!(get_linebreak_property('\u{0022}'), QU);
1282        assert_eq!(get_linebreak_property('('), OP_OP30);
1283        assert_eq!(get_linebreak_property('\u{0030}'), NU);
1284        assert_eq!(get_linebreak_property('['), OP_OP30);
1285        assert_eq!(get_linebreak_property('\u{1f3fb}'), EM);
1286        assert_eq!(get_linebreak_property('\u{20000}'), ID);
1287        assert_eq!(get_linebreak_property('\u{e0020}'), CM);
1288        assert_eq!(get_linebreak_property('\u{3041}'), CJ);
1289        assert_eq!(get_linebreak_property('\u{0025}'), PO);
1290        assert_eq!(get_linebreak_property('\u{00A7}'), AI);
1291        assert_eq!(get_linebreak_property('\u{50005}'), XX);
1292        assert_eq!(get_linebreak_property('\u{17D6}'), NS);
1293        assert_eq!(get_linebreak_property('\u{2014}'), B2);
1294    }
1295
1296    #[test]
1297    #[expect(clippy::bool_assert_comparison)] // clearer when we're testing bools directly
1298    fn break_rule() {
1299        let payload =
1300            DataProvider::<SegmenterBreakLineV1>::load(&crate::provider::Baked, Default::default())
1301                .expect("Loading should succeed!")
1302                .payload;
1303        let lb_data: &RuleBreakData = payload.get();
1304
1305        let is_break = |left, right| {
1306            matches!(
1307                lb_data.get_break_state_from_table(left, right),
1308                BreakState::Break | BreakState::NoMatch
1309            )
1310        };
1311
1312        // LB4
1313        assert_eq!(is_break(BK, AL), true);
1314        // LB5
1315        assert_eq!(is_break(CR, LF), false);
1316        assert_eq!(is_break(CR, AL), true);
1317        assert_eq!(is_break(LF, AL), true);
1318        assert_eq!(is_break(NL, AL), true);
1319        // LB6
1320        assert_eq!(is_break(AL, BK), false);
1321        assert_eq!(is_break(AL, CR), false);
1322        assert_eq!(is_break(AL, LF), false);
1323        assert_eq!(is_break(AL, NL), false);
1324        // LB7
1325        assert_eq!(is_break(AL, SP), false);
1326        assert_eq!(is_break(AL, ZW), false);
1327        // LB8
1328        // LB8a and LB9 omitted: These are handled outside of the state table.
1329        // LB10
1330        assert_eq!(is_break(ZWJ, SP), false);
1331        assert_eq!(is_break(SP, CM), true);
1332        // LB11
1333        assert_eq!(is_break(AL, WJ), false);
1334        assert_eq!(is_break(WJ, AL), false);
1335        // LB12
1336        assert_eq!(is_break(GL, AL), false);
1337        // LB12a
1338        assert_eq!(is_break(AL, GL), false);
1339        assert_eq!(is_break(SP, GL), true);
1340        // LB13
1341        assert_eq!(is_break(AL, CL), false);
1342        assert_eq!(is_break(AL, CP), false);
1343        assert_eq!(is_break(AL, EX), false);
1344        assert_eq!(is_break(AL, IS), false);
1345        assert_eq!(is_break(AL, SY), false);
1346        // LB18
1347        assert_eq!(is_break(SP, AL), true);
1348        // LB19
1349        assert_eq!(is_break(AL, QU), false);
1350        assert_eq!(is_break(QU, AL), false);
1351        // LB20
1352        assert_eq!(is_break(AL, CB), true);
1353        assert_eq!(is_break(CB, AL), true);
1354        // LB20
1355        assert_eq!(is_break(AL, BA), false);
1356        assert_eq!(is_break(AL, HY), false);
1357        assert_eq!(is_break(AL, NS), false);
1358        // LB21
1359        assert_eq!(is_break(AL, BA), false);
1360        assert_eq!(is_break(BB, AL), false);
1361        assert_eq!(is_break(ID, BA), false);
1362        assert_eq!(is_break(ID, NS), false);
1363        // LB21a
1364        // LB21b
1365        assert_eq!(is_break(SY, HL), false);
1366        // LB22
1367        assert_eq!(is_break(AL, IN), false);
1368        // LB 23
1369        assert_eq!(is_break(AL, NU), false);
1370        assert_eq!(is_break(HL, NU), false);
1371        // LB 23a
1372        assert_eq!(is_break(PR, ID), false);
1373        assert_eq!(is_break(PR, EB), false);
1374        assert_eq!(is_break(PR, EM), false);
1375        assert_eq!(is_break(ID, PO), false);
1376        assert_eq!(is_break(EB, PO), false);
1377        assert_eq!(is_break(EM, PO), false);
1378        // LB26
1379        assert_eq!(is_break(JL, JL), false);
1380        assert_eq!(is_break(JL, JV), false);
1381        assert_eq!(is_break(JL, H2), false);
1382        // LB27
1383        assert_eq!(is_break(JL, IN), false);
1384        assert_eq!(is_break(JL, PO), false);
1385        assert_eq!(is_break(PR, JL), false);
1386        // LB28
1387        assert_eq!(is_break(AL, AL), false);
1388        assert_eq!(is_break(HL, AL), false);
1389        // LB29
1390        assert_eq!(is_break(IS, AL), false);
1391        assert_eq!(is_break(IS, HL), false);
1392        // LB30b
1393        assert_eq!(is_break(EB, EM), false);
1394        // LB31
1395        assert_eq!(is_break(ID, ID), true);
1396    }
1397
1398    #[test]
1399    fn linebreak() {
1400        let segmenter =
1401            LineSegmenter::try_new_dictionary_unstable(&crate::provider::Baked, Default::default())
1402                .expect("Data exists");
1403        let segmenter = segmenter.as_borrowed();
1404
1405        let mut iter = segmenter.segment_str("hello world");
1406        assert_eq!(Some(0), iter.next());
1407        assert_eq!(Some(6), iter.next());
1408        assert_eq!(Some(11), iter.next());
1409        assert_eq!(None, iter.next());
1410
1411        iter = segmenter.segment_str("$10 $10");
1412        assert_eq!(Some(0), iter.next());
1413        assert_eq!(Some(4), iter.next());
1414        assert_eq!(Some(7), iter.next());
1415        assert_eq!(None, iter.next());
1416
1417        // LB10
1418
1419        // LB14
1420        iter = segmenter.segment_str("[  abc def");
1421        assert_eq!(Some(0), iter.next());
1422        assert_eq!(Some(7), iter.next());
1423        assert_eq!(Some(10), iter.next());
1424        assert_eq!(None, iter.next());
1425
1426        let input: [u8; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66];
1427        let mut iter_u8 = segmenter.segment_latin1(&input);
1428        assert_eq!(Some(0), iter_u8.next());
1429        assert_eq!(Some(7), iter_u8.next());
1430        assert_eq!(Some(10), iter_u8.next());
1431        assert_eq!(None, iter_u8.next());
1432
1433        let input: [u16; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66];
1434        let mut iter_u16 = segmenter.segment_utf16(&input);
1435        assert_eq!(Some(0), iter_u16.next());
1436        assert_eq!(Some(7), iter_u16.next());
1437        assert_eq!(Some(10), iter_u16.next());
1438        assert_eq!(None, iter_u16.next());
1439
1440        // LB15 used to prevent the break at 6, but has been removed in Unicode 15.1.
1441        iter = segmenter.segment_str("abc\u{0022}  (def");
1442        assert_eq!(Some(0), iter.next());
1443        assert_eq!(Some(6), iter.next());
1444        assert_eq!(Some(10), iter.next());
1445        assert_eq!(None, iter.next());
1446
1447        let input: [u8; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66];
1448        let mut iter_u8 = segmenter.segment_latin1(&input);
1449        assert_eq!(Some(0), iter_u8.next());
1450        assert_eq!(Some(6), iter_u8.next());
1451        assert_eq!(Some(10), iter_u8.next());
1452        assert_eq!(None, iter_u8.next());
1453
1454        let input: [u16; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66];
1455        let mut iter_u16 = segmenter.segment_utf16(&input);
1456        assert_eq!(Some(0), iter_u16.next());
1457        assert_eq!(Some(6), iter_u16.next());
1458        assert_eq!(Some(10), iter_u16.next());
1459        assert_eq!(None, iter_u16.next());
1460
1461        // Instead, in Unicode 15.1, LB15a and LB15b prevent these breaks.
1462        iter = segmenter.segment_str("« miaou »");
1463        assert_eq!(Some(0), iter.next());
1464        assert_eq!(Some(11), iter.next());
1465        assert_eq!(None, iter.next());
1466
1467        let input: Vec<u8> = "« miaou »"
1468            .chars()
1469            .map(|c| u8::try_from(u32::from(c)).unwrap())
1470            .collect();
1471        let mut iter_u8 = segmenter.segment_latin1(&input);
1472        assert_eq!(Some(0), iter_u8.next());
1473        assert_eq!(Some(9), iter_u8.next());
1474        assert_eq!(None, iter_u8.next());
1475
1476        let input: Vec<u16> = "« miaou »".encode_utf16().collect();
1477        let mut iter_u16 = segmenter.segment_utf16(&input);
1478        assert_eq!(Some(0), iter_u16.next());
1479        assert_eq!(Some(9), iter_u16.next());
1480        assert_eq!(None, iter_u16.next());
1481
1482        // But not these:
1483        iter = segmenter.segment_str("Die Katze hat »miau« gesagt.");
1484        assert_eq!(Some(0), iter.next());
1485        assert_eq!(Some(4), iter.next());
1486        assert_eq!(Some(10), iter.next());
1487        assert_eq!(Some(14), iter.next());
1488        assert_eq!(Some(23), iter.next());
1489        assert_eq!(Some(30), iter.next());
1490        assert_eq!(None, iter.next());
1491
1492        let input: Vec<u8> = "Die Katze hat »miau« gesagt."
1493            .chars()
1494            .map(|c| u8::try_from(u32::from(c)).unwrap())
1495            .collect();
1496        let mut iter_u8 = segmenter.segment_latin1(&input);
1497        assert_eq!(Some(0), iter_u8.next());
1498        assert_eq!(Some(4), iter_u8.next());
1499        assert_eq!(Some(10), iter_u8.next());
1500        assert_eq!(Some(14), iter_u8.next());
1501        assert_eq!(Some(21), iter_u8.next());
1502        assert_eq!(Some(28), iter_u8.next());
1503        assert_eq!(None, iter_u8.next());
1504
1505        let input: Vec<u16> = "Die Katze hat »miau« gesagt.".encode_utf16().collect();
1506        let mut iter_u16 = segmenter.segment_utf16(&input);
1507        assert_eq!(Some(0), iter_u16.next());
1508        assert_eq!(Some(4), iter_u16.next());
1509        assert_eq!(Some(10), iter_u16.next());
1510        assert_eq!(Some(14), iter_u16.next());
1511        assert_eq!(Some(21), iter_u16.next());
1512        assert_eq!(Some(28), iter_u16.next());
1513        assert_eq!(None, iter_u16.next());
1514
1515        // LB16
1516        iter = segmenter.segment_str("\u{0029}\u{203C}");
1517        assert_eq!(Some(0), iter.next());
1518        assert_eq!(Some(4), iter.next());
1519        assert_eq!(None, iter.next());
1520        iter = segmenter.segment_str("\u{0029}  \u{203C}");
1521        assert_eq!(Some(0), iter.next());
1522        assert_eq!(Some(6), iter.next());
1523        assert_eq!(None, iter.next());
1524
1525        let input: [u16; 4] = [0x29, 0x20, 0x20, 0x203c];
1526        let mut iter_u16 = segmenter.segment_utf16(&input);
1527        assert_eq!(Some(0), iter_u16.next());
1528        assert_eq!(Some(4), iter_u16.next());
1529        assert_eq!(None, iter_u16.next());
1530
1531        // LB17
1532        iter = segmenter.segment_str("\u{2014}\u{2014}aa");
1533        assert_eq!(Some(0), iter.next());
1534        assert_eq!(Some(6), iter.next());
1535        assert_eq!(Some(8), iter.next());
1536        assert_eq!(None, iter.next());
1537        iter = segmenter.segment_str("\u{2014}  \u{2014}aa");
1538        assert_eq!(Some(0), iter.next());
1539        assert_eq!(Some(8), iter.next());
1540        assert_eq!(Some(10), iter.next());
1541        assert_eq!(None, iter.next());
1542
1543        iter = segmenter.segment_str("\u{2014}\u{2014}  \u{2014}\u{2014}123 abc");
1544        assert_eq!(Some(0), iter.next());
1545        assert_eq!(Some(14), iter.next());
1546        assert_eq!(Some(18), iter.next());
1547        assert_eq!(Some(21), iter.next());
1548        assert_eq!(None, iter.next());
1549
1550        // LB25
1551        let mut iter = segmenter.segment_str("(0,1)+(2,3)");
1552        assert_eq!(Some(0), iter.next());
1553        assert_eq!(Some(11), iter.next());
1554        assert_eq!(None, iter.next());
1555        let input: [u16; 11] = [
1556            0x28, 0x30, 0x2C, 0x31, 0x29, 0x2B, 0x28, 0x32, 0x2C, 0x33, 0x29,
1557        ];
1558        let mut iter_u16 = segmenter.segment_utf16(&input);
1559        assert_eq!(Some(0), iter_u16.next());
1560        assert_eq!(Some(11), iter_u16.next());
1561        assert_eq!(None, iter_u16.next());
1562
1563        let input: [u16; 13] = [
1564            0x2014, 0x2014, 0x20, 0x20, 0x2014, 0x2014, 0x31, 0x32, 0x33, 0x20, 0x61, 0x62, 0x63,
1565        ];
1566        let mut iter_u16 = segmenter.segment_utf16(&input);
1567        assert_eq!(Some(0), iter_u16.next());
1568        assert_eq!(Some(6), iter_u16.next());
1569        assert_eq!(Some(10), iter_u16.next());
1570        assert_eq!(Some(13), iter_u16.next());
1571        assert_eq!(None, iter_u16.next());
1572
1573        iter = segmenter.segment_str("\u{1F3FB} \u{1F3FB}");
1574        assert_eq!(Some(0), iter.next());
1575        assert_eq!(Some(5), iter.next());
1576        assert_eq!(Some(9), iter.next());
1577        assert_eq!(None, iter.next());
1578    }
1579
1580    #[test]
1581    #[cfg(feature = "lstm")]
1582    fn thai_line_break() {
1583        const TEST_STR: &str = "ภาษาไทยภาษาไทย";
1584
1585        let segmenter = LineSegmenter::new_lstm(Default::default());
1586        let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
1587        assert_eq!(breaks, [0, 12, 21, 33, TEST_STR.len()], "Thai test");
1588
1589        let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
1590        let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
1591        assert_eq!(breaks, [0, 4, 7, 11, utf16.len()], "Thai test");
1592
1593        let utf16: [u16; 4] = [0x0e20, 0x0e32, 0x0e29, 0x0e32];
1594        let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
1595        assert_eq!(breaks, [0, 4], "Thai test");
1596    }
1597
1598    #[test]
1599    #[cfg(feature = "lstm")]
1600    fn burmese_line_break() {
1601        // "Burmese Language" in Burmese
1602        const TEST_STR: &str = "မြန်မာဘာသာစကား";
1603
1604        let segmenter = LineSegmenter::new_lstm(Default::default());
1605        let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
1606        // LSTM model breaks more characters, but it is better to return [30].
1607        assert_eq!(breaks, [0, 12, 18, 30, TEST_STR.len()], "Burmese test");
1608
1609        let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
1610        let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
1611        // LSTM model breaks more characters, but it is better to return [10].
1612        assert_eq!(breaks, [0, 4, 6, 10, utf16.len()], "Burmese utf-16 test");
1613    }
1614
1615    #[test]
1616    #[cfg(feature = "lstm")]
1617    fn khmer_line_break() {
1618        const TEST_STR: &str = "សេចក្ដីប្រកាសជាសកលស្ដីពីសិទ្ធិមនុស្ស";
1619
1620        let segmenter = LineSegmenter::new_lstm(Default::default());
1621        let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
1622        // Note: This small sample matches the ICU dictionary segmenter
1623        assert_eq!(breaks, [0, 39, 48, 54, 72, TEST_STR.len()], "Khmer test");
1624
1625        let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
1626        let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
1627        assert_eq!(
1628            breaks,
1629            [0, 13, 16, 18, 24, utf16.len()],
1630            "Khmer utf-16 test"
1631        );
1632    }
1633
1634    #[test]
1635    #[cfg(feature = "lstm")]
1636    fn lao_line_break() {
1637        const TEST_STR: &str = "ກ່ຽວກັບສິດຂອງມະນຸດ";
1638
1639        let segmenter = LineSegmenter::new_lstm(Default::default());
1640        let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
1641        // Note: LSTM finds a break at '12' that the dictionary does not find
1642        assert_eq!(breaks, [0, 12, 21, 30, 39, TEST_STR.len()], "Lao test");
1643
1644        let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
1645        let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
1646        assert_eq!(breaks, [0, 4, 7, 10, 13, utf16.len()], "Lao utf-16 test");
1647    }
1648
1649    #[test]
1650    fn empty_string() {
1651        let segmenter = LineSegmenter::new_auto(Default::default());
1652        let breaks: Vec<usize> = segmenter.segment_str("").collect();
1653        assert_eq!(breaks, [0]);
1654    }
1655}