icu_segmenter/
line.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::complex::*;
6use crate::indices::*;
7use crate::provider::*;
8use crate::rule_segmenter::*;
9use alloc::string::String;
10use alloc::vec;
11use alloc::vec::Vec;
12use core::char;
13use icu_locale_core::subtags::language;
14use icu_locale_core::LanguageIdentifier;
15use icu_provider::prelude::*;
16use utf8_iter::Utf8CharIndices;
17
// TODO(#1637): These constants should be data driven.
//
// Numeric identifiers of the line break classes used throughout this module.
// The value read from the rule data's property table for a code point is
// compared against these constants, and a (left, right) pair of them selects
// an entry of the break state table.
//
// The plain two/three-letter names follow the Line_Break property value
// abbreviations of UAX #14 (https://www.unicode.org/reports/tr14/). Names with
// a suffix (e.g. `PO_EAW`, `PR_EAW`) look like refinements of the base class by
// an additional property -- NOTE(review): confirm against the data generator.
//
// Not every constant is referenced from code, hence the `dead_code` allowances.
#[allow(dead_code)]
const UNKNOWN: u8 = 0; // Default value of the property table lookup.
#[allow(dead_code)]
const AI: u8 = 1;
#[allow(dead_code)]
const AK: u8 = 2;
#[allow(dead_code)]
const AL: u8 = 3;
#[allow(dead_code)]
const AL_DOTTED_CIRCLE: u8 = 4;
#[allow(dead_code)]
const AP: u8 = 5;
#[allow(dead_code)]
const AS: u8 = 6;
#[allow(dead_code)]
const B2: u8 = 7;
#[allow(dead_code)]
const BA: u8 = 8;
#[allow(dead_code)]
const BB: u8 = 9;
#[allow(dead_code)]
const BK: u8 = 10;
#[allow(dead_code)]
const CB: u8 = 11;
#[allow(dead_code)]
const CJ: u8 = 12; // Resolved to ID or kept as-is depending on the CSS options.
#[allow(dead_code)]
const CL: u8 = 13;
#[allow(dead_code)]
const CM: u8 = 14;
#[allow(dead_code)]
const CP: u8 = 15;
#[allow(dead_code)]
const CR: u8 = 16;
#[allow(dead_code)]
const EB: u8 = 17;
#[allow(dead_code)]
const EM: u8 = 18;
#[allow(dead_code)]
const EX: u8 = 19;
#[allow(dead_code)]
const GL: u8 = 20;
#[allow(dead_code)]
const H2: u8 = 21;
#[allow(dead_code)]
const H3: u8 = 22;
#[allow(dead_code)]
const HL: u8 = 23;
#[allow(dead_code)]
const HY: u8 = 24;
#[allow(dead_code)]
const ID: u8 = 25;
#[allow(dead_code)]
const ID_CN: u8 = 26;
#[allow(dead_code)]
const IN: u8 = 27;
#[allow(dead_code)]
const IS: u8 = 28;
#[allow(dead_code)]
const JL: u8 = 29;
#[allow(dead_code)]
const JT: u8 = 30;
#[allow(dead_code)]
const JV: u8 = 31;
#[allow(dead_code)]
const LF: u8 = 32;
#[allow(dead_code)]
const NL: u8 = 33;
#[allow(dead_code)]
const NS: u8 = 34;
#[allow(dead_code)]
const NU: u8 = 35;
#[allow(dead_code)]
const OP_EA: u8 = 36;
#[allow(dead_code)]
const OP_OP30: u8 = 37;
#[allow(dead_code)]
const PO: u8 = 38;
#[allow(dead_code)]
const PO_EAW: u8 = 39; // PO combined with an East Asian Width condition.
#[allow(dead_code)]
const PR: u8 = 40;
#[allow(dead_code)]
const PR_EAW: u8 = 41; // PR combined with an East Asian Width condition.
#[allow(dead_code)]
const QU: u8 = 42;
#[allow(dead_code)]
const QU_PF: u8 = 43;
#[allow(dead_code)]
const QU_PI: u8 = 44;
#[allow(dead_code)]
const RI: u8 = 45;
#[allow(dead_code)]
const SA: u8 = 46; // Code points of this class are routed to complex breaking.
#[allow(dead_code)]
const SP: u8 = 47;
#[allow(dead_code)]
const SY: u8 = 48;
#[allow(dead_code)]
const VF: u8 = 49;
#[allow(dead_code)]
const VI: u8 = 50;
#[allow(dead_code)]
const WJ: u8 = 51;
#[allow(dead_code)]
const XX: u8 = 52;
#[allow(dead_code)]
const ZW: u8 = 53;
#[allow(dead_code)]
const ZWJ: u8 = 54;
129
/// Specifies the strictness of line-breaking rules. It can be passed as an
/// option when creating a line segmenter.
///
/// Each variant has the same meaning as the corresponding value of the
/// `line-break` property in the CSS Text spec. See the details in
/// <https://drafts.csswg.org/css-text-3/#line-break-property>.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub enum LineBreakStrictness {
    /// Breaks text using the least restrictive set of line-breaking rules.
    /// Typically used for short lines, such as in newspapers.
    /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-loose>
    Loose,

    /// Breaks text using the most common set of line-breaking rules.
    /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-normal>
    Normal,

    /// Breaks text using the most stringent set of line-breaking rules.
    /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-strict>
    ///
    /// This is the default behaviour of the Unicode Line Breaking Algorithm,
    /// resolving class [CJ](https://www.unicode.org/reports/tr14/#CJ) to
    /// [NS](https://www.unicode.org/reports/tr14/#NS);
    /// see rule [LB1](https://www.unicode.org/reports/tr14/#LB1).
    #[default]
    Strict,

    /// Breaks text assuming there is a soft wrap opportunity around every
    /// typographic character unit, disregarding any prohibition against line
    /// breaks. See more details in
    /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-anywhere>.
    Anywhere,
}
164
/// Specifies the line break opportunities between letters. It can be passed
/// as an option when creating a line segmenter.
///
/// Each variant has the same meaning as the corresponding value of the
/// `word-break` property in the CSS Text spec. See the details in
/// <https://drafts.csswg.org/css-text-3/#word-break-property>.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub enum LineBreakWordOption {
    /// Words break according to their customary rules. See the details in
    /// <https://drafts.csswg.org/css-text-3/#valdef-word-break-normal>.
    #[default]
    Normal,

    /// Breaking is allowed within "words".
    /// <https://drafts.csswg.org/css-text-3/#valdef-word-break-break-all>
    BreakAll,

    /// Breaking is forbidden within "words".
    /// <https://drafts.csswg.org/css-text-3/#valdef-word-break-keep-all>
    KeepAll,
}
187
/// Options to tailor line-breaking behavior.
///
/// Every field is optional; a field left as `None` falls back to the default
/// documented on that field.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub struct LineBreakOptions<'a> {
    /// Strictness of line-breaking rules. See [`LineBreakStrictness`].
    ///
    /// Default is [`LineBreakStrictness::Strict`]
    pub strictness: Option<LineBreakStrictness>,

    /// Line break opportunities between letters. See [`LineBreakWordOption`].
    ///
    /// Default is [`LineBreakWordOption::Normal`]
    pub word_option: Option<LineBreakWordOption>,

    /// Content locale for line segmenter
    ///
    /// This allows more break opportunities when `LineBreakStrictness` is
    /// `Normal` or `Loose`. See
    /// <https://drafts.csswg.org/css-text-3/#line-break-property> for details.
    /// Only the language subtag is inspected, and only `ja` and `zh` change
    /// the behavior.
    /// This option has no effect in Latin-1 mode.
    pub content_locale: Option<&'a LanguageIdentifier>,
}
210
/// [`LineBreakOptions`] with every default applied, in the form stored by the
/// segmenter and its iterators.
#[derive(Debug, Clone, Copy)]
struct ResolvedLineBreakOptions {
    // Strictness to apply; the enum default when the caller did not set one.
    strictness: LineBreakStrictness,
    // Word option to apply; the enum default when the caller did not set one.
    word_option: LineBreakWordOption,
    // Whether the content locale's language is `ja` or `zh`; `false` when no
    // content locale was given.
    ja_zh: bool,
}
217
218impl From<LineBreakOptions<'_>> for ResolvedLineBreakOptions {
219    fn from(options: LineBreakOptions<'_>) -> Self {
220        let ja_zh = if let Some(content_locale) = options.content_locale.as_ref() {
221            content_locale.language == language!("ja") || content_locale.language == language!("zh")
222        } else {
223            false
224        };
225        Self {
226            strictness: options.strictness.unwrap_or_default(),
227            word_option: options.word_option.unwrap_or_default(),
228            ja_zh,
229        }
230    }
231}
232
233/// Supports loading line break data, and creating line break iterators for different string
234/// encodings.
235///
236/// Most segmentation methods live on [`LineSegmenterBorrowed`], which can be obtained via
237/// [`LineSegmenter::new_auto()`] (etc) or [`LineSegmenter::as_borrowed()`].
238///
239/// The segmenter returns mandatory breaks (as defined by [definition LD7][LD7] of
240/// Unicode Standard Annex #14, _Unicode Line Breaking Algorithm_) as well as
241/// line break opportunities ([definition LD3][LD3]).
242/// It does not distinguish them.  Callers requiring that distinction can check
243/// the Line_Break property of the code point preceding the break against those
244/// listed in rules [LB4][LB4] and [LB5][LB5], special-casing the end of text
245/// according to [LB3][LB3].
246///
247/// For consistency with the grapheme, word, and sentence segmenters, there is
248/// always a breakpoint returned at index 0, but this breakpoint is not a
249/// meaningful line break opportunity.
250///
251/// [LD3]: https://www.unicode.org/reports/tr14/#LD3
252/// [LD7]: https://www.unicode.org/reports/tr14/#LD7
253/// [LB3]: https://www.unicode.org/reports/tr14/#LB3
254/// [LB4]: https://www.unicode.org/reports/tr14/#LB4
255/// [LB5]: https://www.unicode.org/reports/tr14/#LB5
256///
257/// ```rust
258/// # use icu::segmenter::LineSegmenter;
259/// #
260/// # let segmenter = LineSegmenter::new_auto(Default::default());
261/// #
262/// let text = "Summary\r\nThis annex…";
263/// let breakpoints: Vec<usize> = segmenter.segment_str(text).collect();
264/// // 9 and 22 are mandatory breaks, 14 is a line break opportunity.
265/// assert_eq!(&breakpoints, &[0, 9, 14, 22]);
266///
267/// // There is a break opportunity between emoji, but not within the ZWJ sequence 🏳️‍🌈.
268/// let flag_equation = "🏳️➕🌈🟰🏳️\u{200D}🌈";
269/// let possible_first_lines: Vec<&str> =
270///     segmenter.segment_str(flag_equation).skip(1).map(|i| &flag_equation[..i]).collect();
271/// assert_eq!(
272///     &possible_first_lines,
273///     &[
274///         "🏳️",
275///         "🏳️➕",
276///         "🏳️➕🌈",
277///         "🏳️➕🌈🟰",
278///         "🏳️➕🌈🟰🏳️‍🌈"
279///     ]
280/// );
281/// ```
282///
283/// # Examples
284///
285/// Segment a string with default options:
286///
287/// ```rust
288/// use icu::segmenter::LineSegmenter;
289///
290/// let segmenter = LineSegmenter::new_auto(Default::default());
291///
292/// let breakpoints: Vec<usize> =
293///     segmenter.segment_str("Hello World").collect();
294/// assert_eq!(&breakpoints, &[0, 6, 11]);
295/// ```
296///
297/// Segment a string with CSS option overrides:
298///
299/// ```rust
300/// use icu::segmenter::options::{
301///     LineBreakOptions, LineBreakStrictness, LineBreakWordOption,
302/// };
303/// use icu::segmenter::LineSegmenter;
304///
305/// let mut options = LineBreakOptions::default();
306/// options.strictness = Some(LineBreakStrictness::Strict);
307/// options.word_option = Some(LineBreakWordOption::BreakAll);
308/// options.content_locale = None;
309/// let segmenter = LineSegmenter::new_auto(options);
310///
311/// let breakpoints: Vec<usize> =
312///     segmenter.segment_str("Hello World").collect();
313/// assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11]);
314/// ```
315///
316/// Segment a Latin1 byte string:
317///
318/// ```rust
319/// use icu::segmenter::LineSegmenter;
320///
321/// let segmenter = LineSegmenter::new_auto(Default::default());
322///
323/// let breakpoints: Vec<usize> =
324///     segmenter.segment_latin1(b"Hello World").collect();
325/// assert_eq!(&breakpoints, &[0, 6, 11]);
326/// ```
327///
328/// Separate mandatory breaks from the break opportunities:
329///
330/// ```rust
331/// use icu::properties::{props::LineBreak, CodePointMapData};
332/// use icu::segmenter::LineSegmenter;
333///
334/// # let segmenter = LineSegmenter::new_auto(Default::default());
335/// #
336/// let text = "Summary\r\nThis annex…";
337///
338/// let mandatory_breaks: Vec<usize> = segmenter
339///     .segment_str(text)
340///     .into_iter()
341///     .filter(|&i| {
342///         text[..i].chars().next_back().map_or(false, |c| {
343///             matches!(
344///                 CodePointMapData::<LineBreak>::new().get(c),
345///                 LineBreak::MandatoryBreak
346///                     | LineBreak::CarriageReturn
347///                     | LineBreak::LineFeed
348///                     | LineBreak::NextLine
349///             ) || i == text.len()
350///         })
351///     })
352///     .collect();
353/// assert_eq!(&mandatory_breaks, &[9, 22]);
354/// ```
#[derive(Debug)]
pub struct LineSegmenter {
    // Options with all defaults already applied.
    options: ResolvedLineBreakOptions,
    // Rule data (property table and break state table) for line breaking.
    payload: DataPayload<SegmenterBreakLineV1>,
    // Payloads consulted for code points that need complex breaking
    // (LSTM or dictionary data, depending on the constructor used).
    complex: ComplexPayloads,
}
361
/// Segments a string into lines (borrowed version).
///
/// This type is `Copy`; the `segment_*` methods take it by value.
///
/// See [`LineSegmenter`] for examples.
#[derive(Clone, Debug, Copy)]
pub struct LineSegmenterBorrowed<'data> {
    // Options with all defaults already applied.
    options: ResolvedLineBreakOptions,
    // Borrowed rule data (property table and break state table).
    data: &'data RuleBreakData<'data>,
    // Borrowed payloads used for complex breaking.
    complex: ComplexPayloadsBorrowed<'data>,
}
371
372impl LineSegmenter {
373    /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and
374    /// the best available compiled data for complex scripts (Khmer, Lao, Myanmar, and Thai).
375    ///
376    /// The current behavior, which is subject to change, is to use the LSTM model when available.
377    ///
378    /// See also [`Self::new_auto`].
379    ///
380    /// ✨ *Enabled with the `compiled_data` and `auto` Cargo features.*
381    ///
382    /// [📚 Help choosing a constructor](icu_provider::constructors)
383    #[cfg(feature = "auto")]
384    #[cfg(feature = "compiled_data")]
385    pub fn new_auto(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
386        Self::new_lstm(options)
387    }
388
389    #[cfg(feature = "auto")]
390    icu_provider::gen_buffer_data_constructors!(
391        (options: LineBreakOptions) -> error: DataError,
392        functions: [
393            new_auto: skip,
394            try_new_auto_with_buffer_provider,
395            try_new_auto_unstable,
396            Self,
397        ]
398    );
399
400    #[cfg(feature = "auto")]
401    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_auto)]
402    pub fn try_new_auto_unstable<D>(
403        provider: &D,
404        options: LineBreakOptions,
405    ) -> Result<Self, DataError>
406    where
407        D: DataProvider<SegmenterBreakLineV1>
408            + DataProvider<SegmenterLstmAutoV1>
409            + DataProvider<SegmenterBreakGraphemeClusterV1>
410            + ?Sized,
411    {
412        Self::try_new_lstm_unstable(provider, options)
413    }
414
415    /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and
416    /// compiled LSTM data for complex scripts (Khmer, Lao, Myanmar, and Thai).
417    ///
418    /// The LSTM, or Long Term Short Memory, is a machine learning model. It is smaller than
419    /// the full dictionary but more expensive during segmentation (inference).
420    ///
421    /// See also [`Self::new_lstm`].
422    ///
423    /// ✨ *Enabled with the `compiled_data` and `lstm` Cargo features.*
424    ///
425    /// [📚 Help choosing a constructor](icu_provider::constructors)
426    #[cfg(feature = "lstm")]
427    #[cfg(feature = "compiled_data")]
428    pub fn new_lstm(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
429        LineSegmenterBorrowed {
430            options: options.into(),
431            data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_LINE_V1,
432            complex: ComplexPayloadsBorrowed::new_lstm(),
433        }
434    }
435
436    #[cfg(feature = "lstm")]
437    icu_provider::gen_buffer_data_constructors!(
438        (options: LineBreakOptions) -> error: DataError,
439        functions: [
440            try_new_lstm: skip,
441            try_new_lstm_with_buffer_provider,
442            try_new_lstm_unstable,
443            Self,
444        ]
445    );
446
447    #[cfg(feature = "lstm")]
448    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_lstm)]
449    pub fn try_new_lstm_unstable<D>(
450        provider: &D,
451        options: LineBreakOptions,
452    ) -> Result<Self, DataError>
453    where
454        D: DataProvider<SegmenterBreakLineV1>
455            + DataProvider<SegmenterLstmAutoV1>
456            + DataProvider<SegmenterBreakGraphemeClusterV1>
457            + ?Sized,
458    {
459        Ok(Self {
460            options: options.into(),
461            payload: provider.load(Default::default())?.payload,
462            complex: ComplexPayloads::try_new_lstm(provider)?,
463        })
464    }
465
466    /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and
467    /// compiled dictionary data for complex scripts (Khmer, Lao, Myanmar, and Thai).
468    ///
469    /// The dictionary model uses a list of words to determine appropriate breakpoints. It is
470    /// faster than the LSTM model but requires more data.
471    ///
472    /// See also [`Self::new_dictionary`].
473    ///
474    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
475    ///
476    /// [📚 Help choosing a constructor](icu_provider::constructors)
477    #[cfg(feature = "compiled_data")]
478    pub fn new_dictionary(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
479        LineSegmenterBorrowed {
480            options: options.into(),
481            data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_LINE_V1,
482            // Line segmenter doesn't need to load CJ dictionary because UAX 14 rules handles CJK
483            // characters [1]. Southeast Asian languages however require complex context analysis
484            // [2].
485            //
486            // [1]: https://www.unicode.org/reports/tr14/#ID
487            // [2]: https://www.unicode.org/reports/tr14/#SA
488            complex: ComplexPayloadsBorrowed::new_southeast_asian(),
489        }
490    }
491
492    icu_provider::gen_buffer_data_constructors!(
493        (options: LineBreakOptions) -> error: DataError,
494        functions: [
495            new_dictionary: skip,
496            try_new_dictionary_with_buffer_provider,
497            try_new_dictionary_unstable,
498            Self,
499        ]
500    );
501
502    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary)]
503    pub fn try_new_dictionary_unstable<D>(
504        provider: &D,
505        options: LineBreakOptions,
506    ) -> Result<Self, DataError>
507    where
508        D: DataProvider<SegmenterBreakLineV1>
509            + DataProvider<SegmenterDictionaryExtendedV1>
510            + DataProvider<SegmenterBreakGraphemeClusterV1>
511            + ?Sized,
512    {
513        Ok(Self {
514            options: options.into(),
515            payload: provider.load(Default::default())?.payload,
516            // Line segmenter doesn't need to load CJ dictionary because UAX 14 rules handles CJK
517            // characters [1]. Southeast Asian languages however require complex context analysis
518            // [2].
519            //
520            // [1]: https://www.unicode.org/reports/tr14/#ID
521            // [2]: https://www.unicode.org/reports/tr14/#SA
522            complex: ComplexPayloads::try_new_southeast_asian(provider)?,
523        })
524    }
525
526    /// Constructs a borrowed version of this type for more efficient querying.
527    ///
528    /// Most useful methods for segmentation are on this type.
529    pub fn as_borrowed(&self) -> LineSegmenterBorrowed<'_> {
530        LineSegmenterBorrowed {
531            options: self.options,
532            data: self.payload.get(),
533            complex: self.complex.as_borrowed(),
534        }
535    }
536}
537
538impl<'data> LineSegmenterBorrowed<'data> {
539    /// Creates a line break iterator for an `str` (a UTF-8 string).
540    ///
541    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
542    pub fn segment_str<'s>(self, input: &'s str) -> LineBreakIterator<'data, 's, Utf8> {
543        LineBreakIterator {
544            iter: input.char_indices(),
545            len: input.len(),
546            current_pos_data: None,
547            result_cache: Vec::new(),
548            data: self.data,
549            options: self.options,
550            complex: self.complex,
551        }
552    }
553    /// Creates a line break iterator for a potentially ill-formed UTF8 string
554    ///
555    /// Invalid characters are treated as REPLACEMENT CHARACTER
556    ///
557    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
558    pub fn segment_utf8<'s>(
559        self,
560        input: &'s [u8],
561    ) -> LineBreakIterator<'data, 's, PotentiallyIllFormedUtf8> {
562        LineBreakIterator {
563            iter: Utf8CharIndices::new(input),
564            len: input.len(),
565            current_pos_data: None,
566            result_cache: Vec::new(),
567            data: self.data,
568            options: self.options,
569            complex: self.complex,
570        }
571    }
572    /// Creates a line break iterator for a Latin-1 (8-bit) string.
573    ///
574    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
575    pub fn segment_latin1<'s>(self, input: &'s [u8]) -> LineBreakIterator<'data, 's, Latin1> {
576        LineBreakIterator {
577            iter: Latin1Indices::new(input),
578            len: input.len(),
579            current_pos_data: None,
580            result_cache: Vec::new(),
581            data: self.data,
582            options: self.options,
583            complex: self.complex,
584        }
585    }
586
587    /// Creates a line break iterator for a UTF-16 string.
588    ///
589    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
590    pub fn segment_utf16<'s>(self, input: &'s [u16]) -> LineBreakIterator<'data, 's, Utf16> {
591        LineBreakIterator {
592            iter: Utf16Indices::new(input),
593            len: input.len(),
594            current_pos_data: None,
595            result_cache: Vec::new(),
596            data: self.data,
597            options: self.options,
598            complex: self.complex,
599        }
600    }
601}
602
603impl LineSegmenterBorrowed<'static> {
604    /// Cheaply converts a [`LineSegmenterBorrowed<'static>`] into a [`LineSegmenter`].
605    ///
606    /// Note: Due to branching and indirection, using [`LineSegmenter`] might inhibit some
607    /// compile-time optimizations that are possible with [`LineSegmenterBorrowed`].
608    pub fn static_to_owned(self) -> LineSegmenter {
609        LineSegmenter {
610            payload: DataPayload::from_static_ref(self.data),
611            complex: self.complex.static_to_owned(),
612            options: self.options,
613        }
614    }
615}
616
617impl RuleBreakData<'_> {
618    fn get_linebreak_property_utf32_with_rule(
619        &self,
620        codepoint: u32,
621        strictness: LineBreakStrictness,
622        word_option: LineBreakWordOption,
623    ) -> u8 {
624        // Note: Default value is 0 == UNKNOWN
625        let prop = self.property_table.get32(codepoint);
626
627        if word_option == LineBreakWordOption::BreakAll
628            || strictness == LineBreakStrictness::Loose
629            || strictness == LineBreakStrictness::Normal
630        {
631            return match prop {
632                CJ => ID, // All CJ's General_Category is Other_Letter (Lo).
633                _ => prop,
634            };
635        }
636
637        // CJ is treated as NS by default, yielding strict line breaking.
638        // https://www.unicode.org/reports/tr14/#CJ
639        prop
640    }
641
642    #[inline]
643    fn get_break_state_from_table(&self, left: u8, right: u8) -> BreakState {
644        let idx = (left as usize) * (self.property_count as usize) + (right as usize);
645        // We use unwrap_or to fall back to the base case and prevent panics on bad data.
646        self.break_state_table.get(idx).unwrap_or(BreakState::Keep)
647    }
648
649    #[inline]
650    fn use_complex_breaking_utf32(&self, codepoint: u32) -> bool {
651        let line_break_property = self.get_linebreak_property_utf32_with_rule(
652            codepoint,
653            LineBreakStrictness::Strict,
654            LineBreakWordOption::Normal,
655        );
656
657        line_break_property == SA
658    }
659}
660
661#[inline]
662fn is_break_utf32_by_loose(
663    right_codepoint: u32,
664    left_prop: u8,
665    right_prop: u8,
666    ja_zh: bool,
667) -> Option<bool> {
668    // breaks before hyphens
669    if right_prop == BA {
670        if left_prop == ID && (right_codepoint == 0x2010 || right_codepoint == 0x2013) {
671            return Some(true);
672        }
673    } else if right_prop == NS {
674        // breaks before certain CJK hyphen-like characters
675        if right_codepoint == 0x301C || right_codepoint == 0x30A0 {
676            return Some(ja_zh);
677        }
678
679        // breaks before iteration marks
680        if right_codepoint == 0x3005
681            || right_codepoint == 0x303B
682            || right_codepoint == 0x309D
683            || right_codepoint == 0x309E
684            || right_codepoint == 0x30FD
685            || right_codepoint == 0x30FE
686        {
687            return Some(true);
688        }
689
690        // breaks before certain centered punctuation marks:
691        if right_codepoint == 0x30FB
692            || right_codepoint == 0xFF1A
693            || right_codepoint == 0xFF1B
694            || right_codepoint == 0xFF65
695            || right_codepoint == 0x203C
696            || (0x2047..=0x2049).contains(&right_codepoint)
697        {
698            return Some(ja_zh);
699        }
700    } else if right_prop == IN {
701        // breaks between inseparable characters such as U+2025, U+2026 i.e. characters with the Unicode Line Break property IN
702        return Some(true);
703    } else if right_prop == EX {
704        // breaks before certain centered punctuation marks:
705        if right_codepoint == 0xFF01 || right_codepoint == 0xFF1F {
706            return Some(ja_zh);
707        }
708    }
709
710    // breaks before suffixes:
711    // Characters with the Unicode Line Break property PO and the East Asian Width property
712    if right_prop == PO_EAW {
713        return Some(ja_zh);
714    }
715    // breaks after prefixes:
716    // Characters with the Unicode Line Break property PR and the East Asian Width property
717    if left_prop == PR_EAW {
718        return Some(ja_zh);
719    }
720    None
721}
722
/// A trait allowing for LineBreakIterator to be generalized to multiple string iteration methods.
///
/// This is implemented by ICU4X for several common string types.
///
/// <div class="stab unstable">
/// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
/// trait, please consider using a type from the implementors listed below.
/// </div>
pub trait LineBreakType: crate::private::Sealed + Sized + RuleBreakType {
    // Returns whether `c` must be handled by complex breaking instead of the
    // rule table. The iterator queries this for both sides of a candidate break.
    #[doc(hidden)]
    fn use_complex_breaking(iterator: &LineBreakIterator<'_, '_, Self>, c: Self::CharType) -> bool;

    // Returns the numeric line break class of `c` under the iterator's options.
    #[doc(hidden)]
    fn get_linebreak_property_with_rule(
        iterator: &LineBreakIterator<'_, '_, Self>,
        c: Self::CharType,
    ) -> u8;

    // Invoked when both sides of the current position need complex breaking.
    // A `Some` result is returned by the iterator as its next breakpoint; on
    // `None` the iterator continues with the rule table.
    #[doc(hidden)]
    fn line_handle_complex_language(
        iterator: &mut LineBreakIterator<'_, '_, Self>,
        left_codepoint: Self::CharType,
    ) -> Option<usize>;
}
747
/// Implements the [`Iterator`] trait over the line break opportunities of the given string.
///
/// Lifetimes:
///
/// - `'data` = lifetime of the segmenter data borrowed from the [`LineSegmenter`] object from
///   which this iterator was created
/// - `'s` = lifetime of the string being segmented
///
/// The [`Iterator::Item`] is an [`usize`] representing index of a code unit
/// _after_ the break (for a break at the end of text, this index is the length
/// of the [`str`] or array of code units).
///
/// For examples of use, see [`LineSegmenter`].
#[derive(Debug)]
pub struct LineBreakIterator<'data, 's, Y: LineBreakType> {
    // Underlying iterator over the input, in the encoding selected by `Y`.
    iter: Y::IterAttr<'s>,
    // Length of the input in code units; returned as the final breakpoint.
    len: usize,
    // Presumably the index and character the iterator currently points at;
    // starts out as `None` -- NOTE(review): confirm against `RuleBreakType`.
    current_pos_data: Option<(usize, Y::CharType)>,
    // Breakpoints cached by a previous run, consumed before new ones are computed.
    result_cache: Vec<usize>,
    // Rule data (property table and break state table).
    data: &'data RuleBreakData<'data>,
    // Options with all defaults already applied.
    options: ResolvedLineBreakOptions,
    // Payloads used for complex breaking.
    complex: ComplexPayloadsBorrowed<'data>,
}
770
771impl<Y: LineBreakType> Iterator for LineBreakIterator<'_, '_, Y> {
772    type Item = usize;
773
774    fn next(&mut self) -> Option<Self::Item> {
775        match self.check_eof() {
776            StringBoundaryPosType::Start => return Some(0),
777            StringBoundaryPosType::End => return None,
778            _ => (),
779        }
780
781        // If we have break point cache by previous run, return this result
782        if let Some(&first_pos) = self.result_cache.first() {
783            let mut i = 0;
784            loop {
785                if i == first_pos {
786                    self.result_cache = self.result_cache.iter().skip(1).map(|r| r - i).collect();
787                    return self.get_current_position();
788                }
789                i += self.get_current_codepoint().map_or(0, Y::char_len);
790                self.advance_iter();
791                if self.is_eof() {
792                    self.result_cache.clear();
793                    return Some(self.len);
794                }
795            }
796        }
797
798        // The state prior to a sequence of CM and ZWJ affected by rule LB9.
799        let mut lb9_left: Option<u8> = None;
800        // Whether LB9 was applied to a ZWJ, so that breaks at the current
801        // position must be suppressed.
802        let mut lb8a_after_lb9 = false;
803
804        'a: loop {
805            debug_assert!(!self.is_eof());
806            let left_codepoint = self.get_current_codepoint()?;
807            let mut left_prop =
808                lb9_left.unwrap_or_else(|| self.get_linebreak_property(left_codepoint));
809            let after_zwj = lb8a_after_lb9 || (lb9_left.is_none() && left_prop == ZWJ);
810            self.advance_iter();
811
812            let Some(right_codepoint) = self.get_current_codepoint() else {
813                return Some(self.len);
814            };
815            let right_prop = self.get_linebreak_property(right_codepoint);
816            // NOTE(egg): The special-casing of `LineBreakStrictness::Anywhere` allows us to pass
817            // a test, but eventually that option should just be simplified to call the extended
818            // grapheme cluster segmenter.
819            if (right_prop == CM
820                || (right_prop == ZWJ && self.options.strictness != LineBreakStrictness::Anywhere))
821                && left_prop != BK
822                && left_prop != CR
823                && left_prop != LF
824                && left_prop != NL
825                && left_prop != SP
826                && left_prop != ZW
827            {
828                lb9_left = Some(left_prop);
829                lb8a_after_lb9 = right_prop == ZWJ;
830                continue;
831            } else {
832                lb9_left = None;
833                lb8a_after_lb9 = false;
834            }
835
836            // CSS word-break property handling
837            match (self.options.word_option, left_prop, right_prop) {
838                (LineBreakWordOption::BreakAll, AL | NU | SA, _) => {
839                    left_prop = ID;
840                }
841                //  typographic letter units shouldn't be break
842                (
843                    LineBreakWordOption::KeepAll,
844                    AI | AL | ID | NU | HY | H2 | H3 | JL | JV | JT | CJ,
845                    AI | AL | ID | NU | HY | H2 | H3 | JL | JV | JT | CJ,
846                ) => {
847                    continue;
848                }
849                _ => (),
850            }
851
852            // CSS line-break property handling
853            match self.options.strictness {
854                LineBreakStrictness::Normal => {
855                    if self.is_break_by_normal(right_codepoint) && !after_zwj {
856                        return self.get_current_position();
857                    }
858                }
859                LineBreakStrictness::Loose => {
860                    if let Some(breakable) = is_break_utf32_by_loose(
861                        right_codepoint.into(),
862                        left_prop,
863                        right_prop,
864                        self.options.ja_zh,
865                    ) {
866                        if breakable && !after_zwj {
867                            return self.get_current_position();
868                        }
869                        continue;
870                    }
871                }
872                LineBreakStrictness::Anywhere => {
873                    // TODO(egg): My reading of the CSS standard is that this
874                    // should break around extended grapheme clusters, not at
875                    // arbitrary code points, so this seems wrong.
876                    return self.get_current_position();
877                }
878                _ => (),
879            };
880
881            // UAX14 doesn't have Thai etc, so use another way.
882            if self.options.word_option != LineBreakWordOption::BreakAll
883                && Y::use_complex_breaking(self, left_codepoint)
884                && Y::use_complex_breaking(self, right_codepoint)
885            {
886                let result = Y::line_handle_complex_language(self, left_codepoint);
887                if result.is_some() {
888                    return result;
889                }
890                // I may have to fetch text until non-SA character?.
891            }
892
893            // If break_state is equals or grater than 0, it is alias of property.
894            match self.data.get_break_state_from_table(left_prop, right_prop) {
895                BreakState::Break | BreakState::NoMatch => {
896                    if after_zwj {
897                        continue;
898                    } else {
899                        return self.get_current_position();
900                    }
901                }
902                BreakState::Keep => continue,
903                BreakState::Index(mut index) | BreakState::Intermediate(mut index) => {
904                    let mut previous_iter = self.iter.clone();
905                    let mut previous_pos_data = self.current_pos_data;
906                    let mut previous_is_after_zwj = after_zwj;
907
908                    // Since we are building up a state in this inner loop, we do not
909                    // need an analogue of lb9_left; continuing the inner loop preserves
910                    // `index` which is the current state, and thus implements the
911                    // “treat as” rule.
912                    let mut left_prop_pre_lb9 = right_prop;
913
914                    // current state isn't resolved due to intermediating.
915                    // Example, [AK] [AS] is processing LB28a, but if not matched after fetching
916                    // data, we should break after [AK].
917                    let is_intermediate_rule_no_match = if lb8a_after_lb9 {
918                        // left was ZWJ so we don't break between ZWJ.
919                        true
920                    } else {
921                        index > self.data.last_codepoint_property
922                    };
923
924                    loop {
925                        self.advance_iter();
926                        let after_zwj = left_prop_pre_lb9 == ZWJ;
927
928                        let previous_break_state_is_cp_prop =
929                            index <= self.data.last_codepoint_property;
930
931                        let Some(prop) = self.get_current_linebreak_property() else {
932                            // Reached EOF. But we are analyzing multiple characters now, so next break may be previous point.
933                            let break_state = self
934                                .data
935                                .get_break_state_from_table(index, self.data.eot_property);
936                            if break_state == BreakState::NoMatch {
937                                self.iter = previous_iter;
938                                self.current_pos_data = previous_pos_data;
939                                if previous_is_after_zwj {
940                                    // Do not break [AK] [ZWJ] ÷ [AS] (eot).
941                                    continue 'a;
942                                } else {
943                                    return self.get_current_position();
944                                }
945                            }
946                            // EOF
947                            return Some(self.len);
948                        };
949
950                        if (prop == CM || prop == ZWJ)
951                            && left_prop_pre_lb9 != BK
952                            && left_prop_pre_lb9 != CR
953                            && left_prop_pre_lb9 != LF
954                            && left_prop_pre_lb9 != NL
955                            && left_prop_pre_lb9 != SP
956                            && left_prop_pre_lb9 != ZW
957                        {
958                            left_prop_pre_lb9 = prop;
959                            continue;
960                        }
961
962                        match self.data.get_break_state_from_table(index, prop) {
963                            BreakState::Keep => continue 'a,
964                            BreakState::NoMatch => {
965                                self.iter = previous_iter;
966                                self.current_pos_data = previous_pos_data;
967                                if after_zwj {
968                                    // Break [AK] ÷ [AS] [ZWJ] [XX],
969                                    // but not [AK] [ZWJ] ÷ [AS] [ZWJ] [XX].
970                                    if is_intermediate_rule_no_match && !previous_is_after_zwj {
971                                        return self.get_current_position();
972                                    }
973                                    continue 'a;
974                                } else if previous_is_after_zwj {
975                                    // Do not break [AK] [ZWJ] ÷ [AS] [XX].
976                                    continue 'a;
977                                } else {
978                                    return self.get_current_position();
979                                }
980                            }
981                            BreakState::Break => {
982                                if after_zwj {
983                                    continue 'a;
984                                } else {
985                                    return self.get_current_position();
986                                }
987                            }
988                            BreakState::Intermediate(i) => {
989                                index = i;
990                                previous_iter = self.iter.clone();
991                                previous_pos_data = self.current_pos_data;
992                                previous_is_after_zwj = after_zwj;
993                            }
994                            BreakState::Index(i) => {
995                                index = i;
996                                if previous_break_state_is_cp_prop {
997                                    previous_iter = self.iter.clone();
998                                    previous_pos_data = self.current_pos_data;
999                                    previous_is_after_zwj = after_zwj;
1000                                }
1001                            }
1002                        }
1003                        left_prop_pre_lb9 = prop;
1004                    }
1005                }
1006            }
1007        }
1008    }
1009}
1010
/// Classification produced by `check_eof` describing where the iterator currently stands.
enum StringBoundaryPosType {
    /// No position data was held when the check started. Returned either when advancing
    /// once yields data, or the first time the check runs with `len == 0`.
    Start,
    /// Position data was already present when the check started.
    Middle,
    /// No position data was held and advancing yielded none, with `len != 0`.
    End,
}
1016
1017impl<Y: LineBreakType> LineBreakIterator<'_, '_, Y> {
1018    fn advance_iter(&mut self) {
1019        self.current_pos_data = self.iter.next();
1020    }
1021
1022    fn is_eof(&self) -> bool {
1023        self.current_pos_data.is_none()
1024    }
1025
1026    #[inline]
1027    fn check_eof(&mut self) -> StringBoundaryPosType {
1028        if self.is_eof() {
1029            self.advance_iter();
1030            if self.is_eof() {
1031                if self.len == 0 {
1032                    // Empty string. Since `self.current_pos_data` is always going to be empty,
1033                    // we never read `self.len` except for here, so we can use it to mark that
1034                    // we have already returned the single empty-string breakpoint.
1035                    self.len = 1;
1036                    StringBoundaryPosType::Start
1037                } else {
1038                    StringBoundaryPosType::End
1039                }
1040            } else {
1041                StringBoundaryPosType::Start
1042            }
1043        } else {
1044            StringBoundaryPosType::Middle
1045        }
1046    }
1047
1048    fn get_current_position(&self) -> Option<usize> {
1049        self.current_pos_data.map(|(pos, _)| pos)
1050    }
1051
1052    fn get_current_codepoint(&self) -> Option<Y::CharType> {
1053        self.current_pos_data.map(|(_, codepoint)| codepoint)
1054    }
1055
1056    fn get_linebreak_property(&self, codepoint: Y::CharType) -> u8 {
1057        Y::get_linebreak_property_with_rule(self, codepoint)
1058    }
1059
1060    fn get_current_linebreak_property(&self) -> Option<u8> {
1061        self.get_current_codepoint()
1062            .map(|c| self.get_linebreak_property(c))
1063    }
1064
1065    fn is_break_by_normal(&self, codepoint: Y::CharType) -> bool {
1066        match codepoint.into() {
1067            0x301C | 0x30A0 => self.options.ja_zh,
1068            _ => false,
1069        }
1070    }
1071}
1072
1073impl LineBreakType for Utf8 {
1074    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 {
1075        iterator.data.get_linebreak_property_utf32_with_rule(
1076            c as u32,
1077            iterator.options.strictness,
1078            iterator.options.word_option,
1079        )
1080    }
1081
1082    #[inline]
1083    fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool {
1084        iterator.data.use_complex_breaking_utf32(c as u32)
1085    }
1086
1087    fn line_handle_complex_language(
1088        iter: &mut LineBreakIterator<'_, '_, Self>,
1089        left_codepoint: char,
1090    ) -> Option<usize> {
1091        line_handle_complex_language_utf8(iter, left_codepoint)
1092    }
1093}
1094
1095impl LineBreakType for PotentiallyIllFormedUtf8 {
1096    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 {
1097        iterator.data.get_linebreak_property_utf32_with_rule(
1098            c as u32,
1099            iterator.options.strictness,
1100            iterator.options.word_option,
1101        )
1102    }
1103
1104    #[inline]
1105    fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool {
1106        iterator.data.use_complex_breaking_utf32(c as u32)
1107    }
1108
1109    fn line_handle_complex_language(
1110        iter: &mut LineBreakIterator<'_, '_, Self>,
1111        left_codepoint: char,
1112    ) -> Option<usize> {
1113        line_handle_complex_language_utf8(iter, left_codepoint)
1114    }
1115}
1116/// line_handle_complex_language impl for UTF8 iterators
1117fn line_handle_complex_language_utf8<T>(
1118    iter: &mut LineBreakIterator<'_, '_, T>,
1119    left_codepoint: char,
1120) -> Option<usize>
1121where
1122    T: LineBreakType<CharType = char>,
1123{
1124    // word segmenter doesn't define break rules for some languages such as Thai.
1125    let start_iter = iter.iter.clone();
1126    let start_point = iter.current_pos_data;
1127    let mut s = String::new();
1128    s.push(left_codepoint);
1129    loop {
1130        debug_assert!(!iter.is_eof());
1131        s.push(iter.get_current_codepoint()?);
1132        iter.advance_iter();
1133        if let Some(current_codepoint) = iter.get_current_codepoint() {
1134            if !T::use_complex_breaking(iter, current_codepoint) {
1135                break;
1136            }
1137        } else {
1138            // EOF
1139            break;
1140        }
1141    }
1142
1143    // Restore iterator to move to head of complex string
1144    iter.iter = start_iter;
1145    iter.current_pos_data = start_point;
1146    let breaks = iter.complex.complex_language_segment_str(&s);
1147    iter.result_cache = breaks;
1148    let first_pos = *iter.result_cache.first()?;
1149    let mut i = left_codepoint.len_utf8();
1150    loop {
1151        if i == first_pos {
1152            // Re-calculate breaking offset
1153            iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
1154            return iter.get_current_position();
1155        }
1156        debug_assert!(
1157            i < first_pos,
1158            "we should always arrive at first_pos: near index {:?}",
1159            iter.get_current_position()
1160        );
1161        i += iter.get_current_codepoint().map_or(0, T::char_len);
1162        iter.advance_iter();
1163        if iter.is_eof() {
1164            iter.result_cache.clear();
1165            return Some(iter.len);
1166        }
1167    }
1168}
1169
1170impl LineBreakType for Latin1 {
1171    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u8) -> u8 {
1172        // No CJ on Latin1
1173        // Note: Default value is 0 == UNKNOWN
1174        iterator.data.property_table.get32(c as u32)
1175    }
1176
1177    #[inline]
1178    fn use_complex_breaking(_iterator: &LineBreakIterator<Self>, _c: u8) -> bool {
1179        false
1180    }
1181
1182    fn line_handle_complex_language(
1183        _: &mut LineBreakIterator<Self>,
1184        _: Self::CharType,
1185    ) -> Option<usize> {
1186        unreachable!()
1187    }
1188}
1189
1190impl LineBreakType for Utf16 {
1191    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u32) -> u8 {
1192        iterator.data.get_linebreak_property_utf32_with_rule(
1193            c,
1194            iterator.options.strictness,
1195            iterator.options.word_option,
1196        )
1197    }
1198
1199    #[inline]
1200    fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: u32) -> bool {
1201        iterator.data.use_complex_breaking_utf32(c)
1202    }
1203
1204    fn line_handle_complex_language(
1205        iterator: &mut LineBreakIterator<Self>,
1206        left_codepoint: Self::CharType,
1207    ) -> Option<usize> {
1208        // word segmenter doesn't define break rules for some languages such as Thai.
1209        let start_iter = iterator.iter.clone();
1210        let start_point = iterator.current_pos_data;
1211        let mut s = vec![left_codepoint as u16];
1212        loop {
1213            debug_assert!(!iterator.is_eof());
1214            s.push(iterator.get_current_codepoint()? as u16);
1215            iterator.advance_iter();
1216            if let Some(current_codepoint) = iterator.get_current_codepoint() {
1217                if !Self::use_complex_breaking(iterator, current_codepoint) {
1218                    break;
1219                }
1220            } else {
1221                // EOF
1222                break;
1223            }
1224        }
1225
1226        // Restore iterator to move to head of complex string
1227        iterator.iter = start_iter;
1228        iterator.current_pos_data = start_point;
1229        let breaks = iterator.complex.complex_language_segment_utf16(&s);
1230        iterator.result_cache = breaks;
1231        // result_cache vector is utf-16 index that is in BMP.
1232        let first_pos = *iterator.result_cache.first()?;
1233        let mut i = 1;
1234        loop {
1235            if i == first_pos {
1236                // Re-calculate breaking offset
1237                iterator.result_cache = iterator
1238                    .result_cache
1239                    .iter()
1240                    .skip(1)
1241                    .map(|r| r - i)
1242                    .collect();
1243                return iterator.get_current_position();
1244            }
1245            debug_assert!(
1246                i < first_pos,
1247                "we should always arrive at first_pos: near index {:?}",
1248                iterator.get_current_position()
1249            );
1250            i += 1;
1251            iterator.advance_iter();
1252            if iterator.is_eof() {
1253                iterator.result_cache.clear();
1254                return Some(iterator.len);
1255            }
1256        }
1257    }
1258}
1259
1260#[cfg(test)]
1261#[cfg(feature = "serde")]
1262mod tests {
1263    use super::*;
1264    use crate::LineSegmenter;
1265
1266    #[test]
1267    fn linebreak_property() {
1268        let payload =
1269            DataProvider::<SegmenterBreakLineV1>::load(&crate::provider::Baked, Default::default())
1270                .expect("Loading should succeed!")
1271                .payload;
1272
1273        let get_linebreak_property = |codepoint| {
1274            payload.get().get_linebreak_property_utf32_with_rule(
1275                codepoint as u32,
1276                LineBreakStrictness::Strict,
1277                LineBreakWordOption::Normal,
1278            )
1279        };
1280
1281        assert_eq!(get_linebreak_property('\u{0020}'), SP);
1282        assert_eq!(get_linebreak_property('\u{0022}'), QU);
1283        assert_eq!(get_linebreak_property('('), OP_OP30);
1284        assert_eq!(get_linebreak_property('\u{0030}'), NU);
1285        assert_eq!(get_linebreak_property('['), OP_OP30);
1286        assert_eq!(get_linebreak_property('\u{1f3fb}'), EM);
1287        assert_eq!(get_linebreak_property('\u{20000}'), ID);
1288        assert_eq!(get_linebreak_property('\u{e0020}'), CM);
1289        assert_eq!(get_linebreak_property('\u{3041}'), CJ);
1290        assert_eq!(get_linebreak_property('\u{0025}'), PO);
1291        assert_eq!(get_linebreak_property('\u{00A7}'), AI);
1292        assert_eq!(get_linebreak_property('\u{50005}'), XX);
1293        assert_eq!(get_linebreak_property('\u{17D6}'), NS);
1294        assert_eq!(get_linebreak_property('\u{2014}'), B2);
1295    }
1296
1297    #[test]
1298    #[allow(clippy::bool_assert_comparison)] // clearer when we're testing bools directly
1299    fn break_rule() {
1300        let payload =
1301            DataProvider::<SegmenterBreakLineV1>::load(&crate::provider::Baked, Default::default())
1302                .expect("Loading should succeed!")
1303                .payload;
1304        let lb_data: &RuleBreakData = payload.get();
1305
1306        let is_break = |left, right| {
1307            matches!(
1308                lb_data.get_break_state_from_table(left, right),
1309                BreakState::Break | BreakState::NoMatch
1310            )
1311        };
1312
1313        // LB4
1314        assert_eq!(is_break(BK, AL), true);
1315        // LB5
1316        assert_eq!(is_break(CR, LF), false);
1317        assert_eq!(is_break(CR, AL), true);
1318        assert_eq!(is_break(LF, AL), true);
1319        assert_eq!(is_break(NL, AL), true);
1320        // LB6
1321        assert_eq!(is_break(AL, BK), false);
1322        assert_eq!(is_break(AL, CR), false);
1323        assert_eq!(is_break(AL, LF), false);
1324        assert_eq!(is_break(AL, NL), false);
1325        // LB7
1326        assert_eq!(is_break(AL, SP), false);
1327        assert_eq!(is_break(AL, ZW), false);
1328        // LB8
1329        // LB8a and LB9 omitted: These are handled outside of the state table.
1330        // LB10
1331        assert_eq!(is_break(ZWJ, SP), false);
1332        assert_eq!(is_break(SP, CM), true);
1333        // LB11
1334        assert_eq!(is_break(AL, WJ), false);
1335        assert_eq!(is_break(WJ, AL), false);
1336        // LB12
1337        assert_eq!(is_break(GL, AL), false);
1338        // LB12a
1339        assert_eq!(is_break(AL, GL), false);
1340        assert_eq!(is_break(SP, GL), true);
1341        // LB13
1342        assert_eq!(is_break(AL, CL), false);
1343        assert_eq!(is_break(AL, CP), false);
1344        assert_eq!(is_break(AL, EX), false);
1345        assert_eq!(is_break(AL, IS), false);
1346        assert_eq!(is_break(AL, SY), false);
1347        // LB18
1348        assert_eq!(is_break(SP, AL), true);
1349        // LB19
1350        assert_eq!(is_break(AL, QU), false);
1351        assert_eq!(is_break(QU, AL), false);
1352        // LB20
1353        assert_eq!(is_break(AL, CB), true);
1354        assert_eq!(is_break(CB, AL), true);
1355        // LB20
1356        assert_eq!(is_break(AL, BA), false);
1357        assert_eq!(is_break(AL, HY), false);
1358        assert_eq!(is_break(AL, NS), false);
1359        // LB21
1360        assert_eq!(is_break(AL, BA), false);
1361        assert_eq!(is_break(BB, AL), false);
1362        assert_eq!(is_break(ID, BA), false);
1363        assert_eq!(is_break(ID, NS), false);
1364        // LB21a
1365        // LB21b
1366        assert_eq!(is_break(SY, HL), false);
1367        // LB22
1368        assert_eq!(is_break(AL, IN), false);
1369        // LB 23
1370        assert_eq!(is_break(AL, NU), false);
1371        assert_eq!(is_break(HL, NU), false);
1372        // LB 23a
1373        assert_eq!(is_break(PR, ID), false);
1374        assert_eq!(is_break(PR, EB), false);
1375        assert_eq!(is_break(PR, EM), false);
1376        assert_eq!(is_break(ID, PO), false);
1377        assert_eq!(is_break(EB, PO), false);
1378        assert_eq!(is_break(EM, PO), false);
1379        // LB26
1380        assert_eq!(is_break(JL, JL), false);
1381        assert_eq!(is_break(JL, JV), false);
1382        assert_eq!(is_break(JL, H2), false);
1383        // LB27
1384        assert_eq!(is_break(JL, IN), false);
1385        assert_eq!(is_break(JL, PO), false);
1386        assert_eq!(is_break(PR, JL), false);
1387        // LB28
1388        assert_eq!(is_break(AL, AL), false);
1389        assert_eq!(is_break(HL, AL), false);
1390        // LB29
1391        assert_eq!(is_break(IS, AL), false);
1392        assert_eq!(is_break(IS, HL), false);
1393        // LB30b
1394        assert_eq!(is_break(EB, EM), false);
1395        // LB31
1396        assert_eq!(is_break(ID, ID), true);
1397    }
1398
1399    #[test]
1400    fn linebreak() {
1401        let segmenter =
1402            LineSegmenter::try_new_dictionary_unstable(&crate::provider::Baked, Default::default())
1403                .expect("Data exists");
1404        let segmenter = segmenter.as_borrowed();
1405
1406        let mut iter = segmenter.segment_str("hello world");
1407        assert_eq!(Some(0), iter.next());
1408        assert_eq!(Some(6), iter.next());
1409        assert_eq!(Some(11), iter.next());
1410        assert_eq!(None, iter.next());
1411
1412        iter = segmenter.segment_str("$10 $10");
1413        assert_eq!(Some(0), iter.next());
1414        assert_eq!(Some(4), iter.next());
1415        assert_eq!(Some(7), iter.next());
1416        assert_eq!(None, iter.next());
1417
1418        // LB10
1419
1420        // LB14
1421        iter = segmenter.segment_str("[  abc def");
1422        assert_eq!(Some(0), iter.next());
1423        assert_eq!(Some(7), iter.next());
1424        assert_eq!(Some(10), iter.next());
1425        assert_eq!(None, iter.next());
1426
1427        let input: [u8; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66];
1428        let mut iter_u8 = segmenter.segment_latin1(&input);
1429        assert_eq!(Some(0), iter_u8.next());
1430        assert_eq!(Some(7), iter_u8.next());
1431        assert_eq!(Some(10), iter_u8.next());
1432        assert_eq!(None, iter_u8.next());
1433
1434        let input: [u16; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66];
1435        let mut iter_u16 = segmenter.segment_utf16(&input);
1436        assert_eq!(Some(0), iter_u16.next());
1437        assert_eq!(Some(7), iter_u16.next());
1438        assert_eq!(Some(10), iter_u16.next());
1439        assert_eq!(None, iter_u16.next());
1440
1441        // LB15 used to prevent the break at 6, but has been removed in Unicode 15.1.
1442        iter = segmenter.segment_str("abc\u{0022}  (def");
1443        assert_eq!(Some(0), iter.next());
1444        assert_eq!(Some(6), iter.next());
1445        assert_eq!(Some(10), iter.next());
1446        assert_eq!(None, iter.next());
1447
1448        let input: [u8; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66];
1449        let mut iter_u8 = segmenter.segment_latin1(&input);
1450        assert_eq!(Some(0), iter_u8.next());
1451        assert_eq!(Some(6), iter_u8.next());
1452        assert_eq!(Some(10), iter_u8.next());
1453        assert_eq!(None, iter_u8.next());
1454
1455        let input: [u16; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66];
1456        let mut iter_u16 = segmenter.segment_utf16(&input);
1457        assert_eq!(Some(0), iter_u16.next());
1458        assert_eq!(Some(6), iter_u16.next());
1459        assert_eq!(Some(10), iter_u16.next());
1460        assert_eq!(None, iter_u16.next());
1461
1462        // Instead, in Unicode 15.1, LB15a and LB15b prevent these breaks.
1463        iter = segmenter.segment_str("« miaou »");
1464        assert_eq!(Some(0), iter.next());
1465        assert_eq!(Some(11), iter.next());
1466        assert_eq!(None, iter.next());
1467
1468        let input: Vec<u8> = "« miaou »"
1469            .chars()
1470            .map(|c| u8::try_from(u32::from(c)).unwrap())
1471            .collect();
1472        let mut iter_u8 = segmenter.segment_latin1(&input);
1473        assert_eq!(Some(0), iter_u8.next());
1474        assert_eq!(Some(9), iter_u8.next());
1475        assert_eq!(None, iter_u8.next());
1476
1477        let input: Vec<u16> = "« miaou »".encode_utf16().collect();
1478        let mut iter_u16 = segmenter.segment_utf16(&input);
1479        assert_eq!(Some(0), iter_u16.next());
1480        assert_eq!(Some(9), iter_u16.next());
1481        assert_eq!(None, iter_u16.next());
1482
1483        // But not these:
1484        iter = segmenter.segment_str("Die Katze hat »miau« gesagt.");
1485        assert_eq!(Some(0), iter.next());
1486        assert_eq!(Some(4), iter.next());
1487        assert_eq!(Some(10), iter.next());
1488        assert_eq!(Some(14), iter.next());
1489        assert_eq!(Some(23), iter.next());
1490        assert_eq!(Some(30), iter.next());
1491        assert_eq!(None, iter.next());
1492
1493        let input: Vec<u8> = "Die Katze hat »miau« gesagt."
1494            .chars()
1495            .map(|c| u8::try_from(u32::from(c)).unwrap())
1496            .collect();
1497        let mut iter_u8 = segmenter.segment_latin1(&input);
1498        assert_eq!(Some(0), iter_u8.next());
1499        assert_eq!(Some(4), iter_u8.next());
1500        assert_eq!(Some(10), iter_u8.next());
1501        assert_eq!(Some(14), iter_u8.next());
1502        assert_eq!(Some(21), iter_u8.next());
1503        assert_eq!(Some(28), iter_u8.next());
1504        assert_eq!(None, iter_u8.next());
1505
1506        let input: Vec<u16> = "Die Katze hat »miau« gesagt.".encode_utf16().collect();
1507        let mut iter_u16 = segmenter.segment_utf16(&input);
1508        assert_eq!(Some(0), iter_u16.next());
1509        assert_eq!(Some(4), iter_u16.next());
1510        assert_eq!(Some(10), iter_u16.next());
1511        assert_eq!(Some(14), iter_u16.next());
1512        assert_eq!(Some(21), iter_u16.next());
1513        assert_eq!(Some(28), iter_u16.next());
1514        assert_eq!(None, iter_u16.next());
1515
1516        // LB16
1517        iter = segmenter.segment_str("\u{0029}\u{203C}");
1518        assert_eq!(Some(0), iter.next());
1519        assert_eq!(Some(4), iter.next());
1520        assert_eq!(None, iter.next());
1521        iter = segmenter.segment_str("\u{0029}  \u{203C}");
1522        assert_eq!(Some(0), iter.next());
1523        assert_eq!(Some(6), iter.next());
1524        assert_eq!(None, iter.next());
1525
1526        let input: [u16; 4] = [0x29, 0x20, 0x20, 0x203c];
1527        let mut iter_u16 = segmenter.segment_utf16(&input);
1528        assert_eq!(Some(0), iter_u16.next());
1529        assert_eq!(Some(4), iter_u16.next());
1530        assert_eq!(None, iter_u16.next());
1531
1532        // LB17
1533        iter = segmenter.segment_str("\u{2014}\u{2014}aa");
1534        assert_eq!(Some(0), iter.next());
1535        assert_eq!(Some(6), iter.next());
1536        assert_eq!(Some(8), iter.next());
1537        assert_eq!(None, iter.next());
1538        iter = segmenter.segment_str("\u{2014}  \u{2014}aa");
1539        assert_eq!(Some(0), iter.next());
1540        assert_eq!(Some(8), iter.next());
1541        assert_eq!(Some(10), iter.next());
1542        assert_eq!(None, iter.next());
1543
1544        iter = segmenter.segment_str("\u{2014}\u{2014}  \u{2014}\u{2014}123 abc");
1545        assert_eq!(Some(0), iter.next());
1546        assert_eq!(Some(14), iter.next());
1547        assert_eq!(Some(18), iter.next());
1548        assert_eq!(Some(21), iter.next());
1549        assert_eq!(None, iter.next());
1550
1551        // LB25
1552        let mut iter = segmenter.segment_str("(0,1)+(2,3)");
1553        assert_eq!(Some(0), iter.next());
1554        assert_eq!(Some(11), iter.next());
1555        assert_eq!(None, iter.next());
1556        let input: [u16; 11] = [
1557            0x28, 0x30, 0x2C, 0x31, 0x29, 0x2B, 0x28, 0x32, 0x2C, 0x33, 0x29,
1558        ];
1559        let mut iter_u16 = segmenter.segment_utf16(&input);
1560        assert_eq!(Some(0), iter_u16.next());
1561        assert_eq!(Some(11), iter_u16.next());
1562        assert_eq!(None, iter_u16.next());
1563
1564        let input: [u16; 13] = [
1565            0x2014, 0x2014, 0x20, 0x20, 0x2014, 0x2014, 0x31, 0x32, 0x33, 0x20, 0x61, 0x62, 0x63,
1566        ];
1567        let mut iter_u16 = segmenter.segment_utf16(&input);
1568        assert_eq!(Some(0), iter_u16.next());
1569        assert_eq!(Some(6), iter_u16.next());
1570        assert_eq!(Some(10), iter_u16.next());
1571        assert_eq!(Some(13), iter_u16.next());
1572        assert_eq!(None, iter_u16.next());
1573
1574        iter = segmenter.segment_str("\u{1F3FB} \u{1F3FB}");
1575        assert_eq!(Some(0), iter.next());
1576        assert_eq!(Some(5), iter.next());
1577        assert_eq!(Some(9), iter.next());
1578        assert_eq!(None, iter.next());
1579    }
1580
1581    #[test]
1582    #[cfg(feature = "lstm")]
1583    fn thai_line_break() {
1584        const TEST_STR: &str = "ภาษาไทยภาษาไทย";
1585
1586        let segmenter = LineSegmenter::new_lstm(Default::default());
1587        let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
1588        assert_eq!(breaks, [0, 12, 21, 33, TEST_STR.len()], "Thai test");
1589
1590        let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
1591        let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
1592        assert_eq!(breaks, [0, 4, 7, 11, utf16.len()], "Thai test");
1593
1594        let utf16: [u16; 4] = [0x0e20, 0x0e32, 0x0e29, 0x0e32];
1595        let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
1596        assert_eq!(breaks, [0, 4], "Thai test");
1597    }
1598
/// Checks LSTM-based line breaking of Burmese text for UTF-8 and UTF-16 input.
#[test]
#[cfg(feature = "lstm")]
fn burmese_line_break() {
    // "Burmese Language" in Burmese
    const TEST_STR: &str = "မြန်မာဘာသာစကား";

    let segmenter = LineSegmenter::new_lstm(Default::default());

    // The LSTM model reports more breaks than desired here; per the original
    // note, returning only [30] would be the better result.
    let utf8_breaks = segmenter.segment_str(TEST_STR).collect::<Vec<usize>>();
    let expected_utf8 = [0, 12, 18, 30, TEST_STR.len()];
    assert_eq!(utf8_breaks, expected_utf8, "Burmese test");

    // Same caveat for the UTF-16 form: returning only [10] would be better.
    let code_units = TEST_STR.encode_utf16().collect::<Vec<u16>>();
    let utf16_breaks = segmenter
        .segment_utf16(&code_units)
        .collect::<Vec<usize>>();
    let expected_utf16 = [0, 4, 6, 10, code_units.len()];
    assert_eq!(utf16_breaks, expected_utf16, "Burmese utf-16 test");
}
1615
/// Checks LSTM-based line breaking of Khmer text for UTF-8 and UTF-16 input.
#[test]
#[cfg(feature = "lstm")]
fn khmer_line_break() {
    const TEST_STR: &str = "សេចក្ដីប្រកាសជាសកលស្ដីពីសិទ្ធិមនុស្ស";

    let segmenter = LineSegmenter::new_lstm(Default::default());

    // Note: This small sample matches the ICU dictionary segmenter
    let expected_utf8 = [0, 39, 48, 54, 72, TEST_STR.len()];
    let utf8_breaks = segmenter.segment_str(TEST_STR).collect::<Vec<usize>>();
    assert_eq!(utf8_breaks, expected_utf8, "Khmer test");

    // Repeat with the UTF-16 encoding; offsets are now in code units.
    let code_units = TEST_STR.encode_utf16().collect::<Vec<u16>>();
    let expected_utf16 = [0, 13, 16, 18, 24, code_units.len()];
    let utf16_breaks = segmenter
        .segment_utf16(&code_units)
        .collect::<Vec<usize>>();
    assert_eq!(utf16_breaks, expected_utf16, "Khmer utf-16 test");
}
1634
/// Checks LSTM-based line breaking of Lao text for UTF-8 and UTF-16 input.
#[test]
#[cfg(feature = "lstm")]
fn lao_line_break() {
    const TEST_STR: &str = "ກ່ຽວກັບສິດຂອງມະນຸດ";

    let segmenter = LineSegmenter::new_lstm(Default::default());

    // Note: LSTM finds a break at '12' that the dictionary does not find
    let expected_utf8 = [0, 12, 21, 30, 39, TEST_STR.len()];
    let utf8_breaks = segmenter.segment_str(TEST_STR).collect::<Vec<usize>>();
    assert_eq!(utf8_breaks, expected_utf8, "Lao test");

    // Repeat with the UTF-16 encoding; offsets are now in code units.
    let code_units = TEST_STR.encode_utf16().collect::<Vec<u16>>();
    let expected_utf16 = [0, 4, 7, 10, 13, code_units.len()];
    let utf16_breaks = segmenter
        .segment_utf16(&code_units)
        .collect::<Vec<usize>>();
    assert_eq!(utf16_breaks, expected_utf16, "Lao utf-16 test");
}
1649
/// Segmenting an empty string must yield exactly one boundary, at offset 0.
#[test]
fn empty_string() {
    let segmenter = LineSegmenter::new_auto(Default::default());
    let boundaries = segmenter.segment_str("").collect::<Vec<usize>>();
    assert_eq!(boundaries, [0]);
}
1656}