icu_segmenter/
line.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::complex::*;
6use crate::indices::*;
7use crate::provider::*;
8use crate::rule_segmenter::*;
9use alloc::string::String;
10use alloc::vec;
11use alloc::vec::Vec;
12use core::char;
13use icu_locale_core::subtags::{language, Language};
14use icu_locale_core::LanguageIdentifier;
15use icu_provider::prelude::*;
16use utf8_iter::Utf8CharIndices;
17
// TODO(#1637): These constants should be data driven.
//
// Numeric codes for the line break property values that the rules in this file
// compare against. `RuleBreakData::get_linebreak_property_utf32_with_rule`
// reads these codes from the property table, where a missing entry yields 0
// (`UNKNOWN`).
//
// NOTE(review): each constant carries `#[allow(dead_code)]`, presumably because
// not every code is referenced by name — confirm before removing any of them.
#[allow(dead_code)]
const UNKNOWN: u8 = 0;
#[allow(dead_code)]
const AI: u8 = 1;
#[allow(dead_code)]
const AK: u8 = 2;
#[allow(dead_code)]
const AL: u8 = 3;
#[allow(dead_code)]
const AL_DOTTED_CIRCLE: u8 = 4;
#[allow(dead_code)]
const AP: u8 = 5;
#[allow(dead_code)]
const AS: u8 = 6;
#[allow(dead_code)]
const B2: u8 = 7;
#[allow(dead_code)]
const BA: u8 = 8;
#[allow(dead_code)]
const BB: u8 = 9;
#[allow(dead_code)]
const BK: u8 = 10;
#[allow(dead_code)]
const CB: u8 = 11;
#[allow(dead_code)]
const CJ: u8 = 12;
#[allow(dead_code)]
const CL: u8 = 13;
#[allow(dead_code)]
const CM: u8 = 14;
#[allow(dead_code)]
const CP: u8 = 15;
#[allow(dead_code)]
const CR: u8 = 16;
#[allow(dead_code)]
const EB: u8 = 17;
#[allow(dead_code)]
const EM: u8 = 18;
#[allow(dead_code)]
const EX: u8 = 19;
#[allow(dead_code)]
const GL: u8 = 20;
#[allow(dead_code)]
const H2: u8 = 21;
#[allow(dead_code)]
const H3: u8 = 22;
#[allow(dead_code)]
const HL: u8 = 23;
#[allow(dead_code)]
const HY: u8 = 24;
#[allow(dead_code)]
const ID: u8 = 25;
#[allow(dead_code)]
const ID_CN: u8 = 26;
#[allow(dead_code)]
const IN: u8 = 27;
#[allow(dead_code)]
const IS: u8 = 28;
#[allow(dead_code)]
const JL: u8 = 29;
#[allow(dead_code)]
const JT: u8 = 30;
#[allow(dead_code)]
const JV: u8 = 31;
#[allow(dead_code)]
const LF: u8 = 32;
#[allow(dead_code)]
const NL: u8 = 33;
#[allow(dead_code)]
const NS: u8 = 34;
#[allow(dead_code)]
const NU: u8 = 35;
#[allow(dead_code)]
const OP_EA: u8 = 36;
#[allow(dead_code)]
const OP_OP30: u8 = 37;
#[allow(dead_code)]
const PO: u8 = 38;
#[allow(dead_code)]
const PO_EAW: u8 = 39;
#[allow(dead_code)]
const PR: u8 = 40;
#[allow(dead_code)]
const PR_EAW: u8 = 41;
#[allow(dead_code)]
const QU: u8 = 42;
#[allow(dead_code)]
const QU_PF: u8 = 43;
#[allow(dead_code)]
const QU_PI: u8 = 44;
#[allow(dead_code)]
const RI: u8 = 45;
#[allow(dead_code)]
const SA: u8 = 46;
#[allow(dead_code)]
const SP: u8 = 47;
#[allow(dead_code)]
const SY: u8 = 48;
#[allow(dead_code)]
const VF: u8 = 49;
#[allow(dead_code)]
const VI: u8 = 50;
#[allow(dead_code)]
const WJ: u8 = 51;
#[allow(dead_code)]
const XX: u8 = 52;
#[allow(dead_code)]
const ZW: u8 = 53;
#[allow(dead_code)]
const ZWJ: u8 = 54;
129
/// An enum that specifies the strictness of line-breaking rules. It can be
/// passed as an argument when creating a line segmenter.
///
/// Each enum value has the same meaning as the corresponding `line-break`
/// property value in the CSS Text spec. See the details in
/// <https://drafts.csswg.org/css-text-3/#line-break-property>.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub enum LineBreakStrictness {
    /// Breaks text using the least restrictive set of line-breaking rules.
    /// Typically used for short lines, such as in newspapers.
    /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-loose>
    Loose,

    /// Breaks text using the most common set of line-breaking rules.
    /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-normal>
    Normal,

    /// Breaks text using the most stringent set of line-breaking rules.
    /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-strict>
    ///
    /// This is the default behaviour of the Unicode Line Breaking Algorithm,
    /// resolving class [CJ](https://www.unicode.org/reports/tr14/#CJ) to
    /// [NS](https://www.unicode.org/reports/tr14/#NS);
    /// see rule [LB1](https://www.unicode.org/reports/tr14/#LB1).
    #[default]
    Strict,

    /// Breaks text assuming there is a soft wrap opportunity around every
    /// typographic character unit, disregarding any prohibition against line
    /// breaks. See more details in
    /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-anywhere>.
    Anywhere,
}
164
/// An enum that specifies the line break opportunities between letters. It can
/// be passed as an argument when creating a line segmenter.
///
/// Each enum value has the same meaning as the corresponding `word-break`
/// property value in the CSS Text spec. See the details in
/// <https://drafts.csswg.org/css-text-3/#word-break-property>.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub enum LineBreakWordOption {
    /// Words break according to their customary rules. See the details in
    /// <https://drafts.csswg.org/css-text-3/#valdef-word-break-normal>.
    #[default]
    Normal,

    /// Breaking is allowed within "words".
    /// <https://drafts.csswg.org/css-text-3/#valdef-word-break-break-all>
    BreakAll,

    /// Breaking is forbidden within "words".
    /// <https://drafts.csswg.org/css-text-3/#valdef-word-break-keep-all>
    KeepAll,
}
187
/// Options to tailor line-breaking behavior.
///
/// Every field is optional; a field left as `None` takes the default
/// documented on that field.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub struct LineBreakOptions<'a> {
    /// Strictness of line-breaking rules. See [`LineBreakStrictness`].
    ///
    /// Default is [`LineBreakStrictness::Strict`]
    pub strictness: Option<LineBreakStrictness>,

    /// Line break opportunities between letters. See [`LineBreakWordOption`].
    ///
    /// Default is [`LineBreakWordOption::Normal`]
    pub word_option: Option<LineBreakWordOption>,

    /// Content locale for line segmenter
    ///
    /// This allows more break opportunities when `LineBreakStrictness` is
    /// `Normal` or `Loose`. See
    /// <https://drafts.csswg.org/css-text-3/#line-break-property> for details.
    /// This option has no effect in Latin-1 mode.
    ///
    /// Only the language subtag is inspected, and only `ja` and `zh` are
    /// distinguished from other languages.
    pub content_locale: Option<&'a LanguageIdentifier>,
}
210
impl LineBreakOptions<'_> {
    /// `const` version of [`Default::default`]
    ///
    /// Every option is left as `None`, so each one takes its documented
    /// default when the options are used to build a segmenter.
    pub const fn default() -> Self {
        Self {
            strictness: None,
            word_option: None,
            content_locale: None,
        }
    }
}
221
/// [`LineBreakOptions`] with every default applied, as stored inside the
/// segmenter and its iterators.
#[derive(Debug, Clone, Copy)]
struct ResolvedLineBreakOptions {
    /// Effective strictness (`Strict` when the caller gave none).
    strictness: LineBreakStrictness,
    /// Effective word option (`Normal` when the caller gave none).
    word_option: LineBreakWordOption,
    /// `true` when a content locale was given and its language is `ja` or `zh`.
    ja_zh: bool,
}
228
229impl LineBreakOptions<'_> {
230    const fn resolve(self) -> ResolvedLineBreakOptions {
231        ResolvedLineBreakOptions {
232            strictness: match self.strictness {
233                Some(s) => s,
234                None => LineBreakStrictness::Strict,
235            },
236            word_option: match self.word_option {
237                Some(s) => s,
238                None => LineBreakWordOption::Normal,
239            },
240            ja_zh: if let Some(content_locale) = self.content_locale.as_ref() {
241                const JA: Language = language!("ja");
242                const ZH: Language = language!("zh");
243                matches!(content_locale.language, JA | ZH)
244            } else {
245                false
246            },
247        }
248    }
249}
250
251/// Supports loading line break data, and creating line break iterators for different string
252/// encodings.
253///
254/// Most segmentation methods live on [`LineSegmenterBorrowed`], which can be obtained via
255/// [`LineSegmenter::new_auto()`] (etc) or [`LineSegmenter::as_borrowed()`].
256///
257/// The segmenter returns mandatory breaks (as defined by [definition LD7][LD7] of
258/// Unicode Standard Annex #14, _Unicode Line Breaking Algorithm_) as well as
259/// line break opportunities ([definition LD3][LD3]).
260/// It does not distinguish them.  Callers requiring that distinction can check
261/// the Line_Break property of the code point preceding the break against those
262/// listed in rules [LB4][LB4] and [LB5][LB5], special-casing the end of text
263/// according to [LB3][LB3].
264///
265/// For consistency with the grapheme, word, and sentence segmenters, there is
266/// always a breakpoint returned at index 0, but this breakpoint is not a
267/// meaningful line break opportunity.
268///
269/// [LD3]: https://www.unicode.org/reports/tr14/#LD3
270/// [LD7]: https://www.unicode.org/reports/tr14/#LD7
271/// [LB3]: https://www.unicode.org/reports/tr14/#LB3
272/// [LB4]: https://www.unicode.org/reports/tr14/#LB4
273/// [LB5]: https://www.unicode.org/reports/tr14/#LB5
274///
275/// ```rust
276/// # use icu::segmenter::LineSegmenter;
277/// #
278/// # let segmenter = LineSegmenter::new_auto(Default::default());
279/// #
280/// let text = "Summary\r\nThis annex…";
281/// let breakpoints: Vec<usize> = segmenter.segment_str(text).collect();
282/// // 9 and 22 are mandatory breaks, 14 is a line break opportunity.
283/// assert_eq!(&breakpoints, &[0, 9, 14, 22]);
284///
285/// // There is a break opportunity between emoji, but not within the ZWJ sequence 🏳️‍🌈.
286/// let flag_equation = "🏳️➕🌈🟰🏳️\u{200D}🌈";
287/// let possible_first_lines: Vec<&str> =
288///     segmenter.segment_str(flag_equation).skip(1).map(|i| &flag_equation[..i]).collect();
289/// assert_eq!(
290///     &possible_first_lines,
291///     &[
292///         "🏳️",
293///         "🏳️➕",
294///         "🏳️➕🌈",
295///         "🏳️➕🌈🟰",
296///         "🏳️➕🌈🟰🏳️‍🌈"
297///     ]
298/// );
299/// ```
300///
301/// # Examples
302///
303/// Segment a string with default options:
304///
305/// ```rust
306/// use icu::segmenter::LineSegmenter;
307///
308/// let segmenter = LineSegmenter::new_auto(Default::default());
309///
310/// let breakpoints: Vec<usize> =
311///     segmenter.segment_str("Hello World").collect();
312/// assert_eq!(&breakpoints, &[0, 6, 11]);
313/// ```
314///
315/// Segment a string with CSS option overrides:
316///
317/// ```rust
318/// use icu::segmenter::options::{
319///     LineBreakOptions, LineBreakStrictness, LineBreakWordOption,
320/// };
321/// use icu::segmenter::LineSegmenter;
322///
323/// let mut options = LineBreakOptions::default();
324/// options.strictness = Some(LineBreakStrictness::Strict);
325/// options.word_option = Some(LineBreakWordOption::BreakAll);
326/// options.content_locale = None;
327/// let segmenter = LineSegmenter::new_auto(options);
328///
329/// let breakpoints: Vec<usize> =
330///     segmenter.segment_str("Hello World").collect();
331/// assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11]);
332/// ```
333///
334/// Segment a Latin1 byte string:
335///
336/// ```rust
337/// use icu::segmenter::LineSegmenter;
338///
339/// let segmenter = LineSegmenter::new_auto(Default::default());
340///
341/// let breakpoints: Vec<usize> =
342///     segmenter.segment_latin1(b"Hello World").collect();
343/// assert_eq!(&breakpoints, &[0, 6, 11]);
344/// ```
345///
346/// Separate mandatory breaks from the break opportunities:
347///
348/// ```rust
349/// use icu::properties::{props::LineBreak, CodePointMapData};
350/// use icu::segmenter::LineSegmenter;
351///
352/// # let segmenter = LineSegmenter::new_auto(Default::default());
353/// #
354/// let text = "Summary\r\nThis annex…";
355///
356/// let mandatory_breaks: Vec<usize> = segmenter
357///     .segment_str(text)
358///     .filter(|&i| {
359///         text[..i].chars().next_back().is_some_and(|c| {
360///             matches!(
361///                 CodePointMapData::<LineBreak>::new().get(c),
362///                 LineBreak::MandatoryBreak
363///                     | LineBreak::CarriageReturn
364///                     | LineBreak::LineFeed
365///                     | LineBreak::NextLine
366///             ) || i == text.len()
367///         })
368///     })
369///     .collect();
370/// assert_eq!(&mandatory_breaks, &[9, 22]);
371/// ```
#[derive(Debug)]
pub struct LineSegmenter {
    // Caller options with defaults already applied.
    options: ResolvedLineBreakOptions,
    // Owned rule-break data for line segmentation.
    payload: DataPayload<SegmenterBreakLineV1>,
    // Payloads for the complex scripts; which ones are populated depends on the
    // constructor used (LSTM, dictionary, or none).
    complex: ComplexPayloads,
}
378
/// Segments a string into lines (borrowed version).
///
/// See [`LineSegmenter`] for examples.
#[derive(Clone, Debug, Copy)]
pub struct LineSegmenterBorrowed<'data> {
    // Caller options with defaults already applied.
    options: ResolvedLineBreakOptions,
    // Borrowed rule-break data for line segmentation.
    data: &'data RuleBreakData<'data>,
    // Borrowed payloads for the complex scripts.
    complex: ComplexPayloadsBorrowed<'data>,
}
388
389impl LineSegmenter {
    /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and
    /// the best available compiled data for complex scripts (Khmer, Lao, Myanmar, and Thai).
    ///
    /// The current behavior, which is subject to change, is to use the LSTM model when available.
    ///
    /// See also [`Self::new_lstm`] and [`Self::new_dictionary`].
    ///
    /// ✨ *Enabled with the `compiled_data` and `auto` Cargo features.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "auto")]
    #[cfg(feature = "compiled_data")]
    pub fn new_auto(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
        // Currently a thin alias for the LSTM constructor.
        Self::new_lstm(options)
    }
405
    // NOTE(review): presumably emits `try_new_auto_with_buffer_provider` on top of
    // `try_new_auto_unstable`; `new_auto: skip` appears to tell the macro that the
    // compiled-data constructor is hand-written above — confirm against the macro docs.
    #[cfg(feature = "auto")]
    icu_provider::gen_buffer_data_constructors!(
        (options: LineBreakOptions) -> error: DataError,
        functions: [
            new_auto: skip,
            try_new_auto_with_buffer_provider,
            try_new_auto_unstable,
            Self,
        ]
    );
416
    #[cfg(feature = "auto")]
    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_auto)]
    pub fn try_new_auto_unstable<D>(
        provider: &D,
        options: LineBreakOptions,
    ) -> Result<Self, DataError>
    where
        D: DataProvider<SegmenterBreakLineV1>
            + DataProvider<SegmenterLstmAutoV1>
            + DataProvider<SegmenterBreakGraphemeClusterV1>
            + ?Sized,
    {
        // Delegates to the LSTM constructor, mirroring `new_auto`; any data
        // error from that constructor is returned unchanged.
        Self::try_new_lstm_unstable(provider, options)
    }
431
432    /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and
433    /// compiled LSTM data for complex scripts (Khmer, Lao, Myanmar, and Thai).
434    ///
435    /// The LSTM, or Long Term Short Memory, is a machine learning model. It is smaller than
436    /// the full dictionary but more expensive during segmentation (inference).
437    ///
438    /// See also [`Self::new_lstm`].
439    ///
440    /// ✨ *Enabled with the `compiled_data` and `lstm` Cargo features.*
441    ///
442    /// [📚 Help choosing a constructor](icu_provider::constructors)
443    #[cfg(feature = "lstm")]
444    #[cfg(feature = "compiled_data")]
445    pub fn new_lstm(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
446        LineSegmenterBorrowed {
447            options: options.resolve(),
448            data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_LINE_V1,
449            complex: ComplexPayloadsBorrowed::new_lstm(),
450        }
451    }
452
453    #[cfg(feature = "lstm")]
454    icu_provider::gen_buffer_data_constructors!(
455        (options: LineBreakOptions) -> error: DataError,
456        functions: [
457            try_new_lstm: skip,
458            try_new_lstm_with_buffer_provider,
459            try_new_lstm_unstable,
460            Self,
461        ]
462    );
463
464    #[cfg(feature = "lstm")]
465    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_lstm)]
466    pub fn try_new_lstm_unstable<D>(
467        provider: &D,
468        options: LineBreakOptions,
469    ) -> Result<Self, DataError>
470    where
471        D: DataProvider<SegmenterBreakLineV1>
472            + DataProvider<SegmenterLstmAutoV1>
473            + DataProvider<SegmenterBreakGraphemeClusterV1>
474            + ?Sized,
475    {
476        Ok(Self {
477            options: options.resolve(),
478            payload: provider.load(Default::default())?.payload,
479            complex: ComplexPayloads::try_new_lstm(provider)?,
480        })
481    }
482
483    /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and
484    /// compiled dictionary data for complex scripts (Khmer, Lao, Myanmar, and Thai).
485    ///
486    /// The dictionary model uses a list of words to determine appropriate breakpoints. It is
487    /// faster than the LSTM model but requires more data.
488    ///
489    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
490    ///
491    /// [📚 Help choosing a constructor](icu_provider::constructors)
492    #[cfg(feature = "compiled_data")]
493    pub fn new_dictionary(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
494        LineSegmenterBorrowed {
495            options: options.resolve(),
496            data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_LINE_V1,
497            // Line segmenter doesn't need to load CJ dictionary because UAX 14 rules handles CJK
498            // characters [1]. Southeast Asian languages however require complex context analysis
499            // [2].
500            //
501            // [1]: https://www.unicode.org/reports/tr14/#ID
502            // [2]: https://www.unicode.org/reports/tr14/#SA
503            complex: ComplexPayloadsBorrowed::new_southeast_asian(),
504        }
505    }
506
    // NOTE(review): presumably emits `try_new_dictionary_with_buffer_provider` on top of
    // `try_new_dictionary_unstable`; `new_dictionary: skip` appears to tell the macro that
    // the compiled-data constructor is hand-written above — confirm against the macro docs.
    icu_provider::gen_buffer_data_constructors!(
        (options: LineBreakOptions) -> error: DataError,
        functions: [
            new_dictionary: skip,
            try_new_dictionary_with_buffer_provider,
            try_new_dictionary_unstable,
            Self,
        ]
    );
516
517    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary)]
518    pub fn try_new_dictionary_unstable<D>(
519        provider: &D,
520        options: LineBreakOptions,
521    ) -> Result<Self, DataError>
522    where
523        D: DataProvider<SegmenterBreakLineV1>
524            + DataProvider<SegmenterDictionaryExtendedV1>
525            + DataProvider<SegmenterBreakGraphemeClusterV1>
526            + ?Sized,
527    {
528        Ok(Self {
529            options: options.resolve(),
530            payload: provider.load(Default::default())?.payload,
531            // Line segmenter doesn't need to load CJ dictionary because UAX 14 rules handles CJK
532            // characters [1]. Southeast Asian languages however require complex context analysis
533            // [2].
534            //
535            // [1]: https://www.unicode.org/reports/tr14/#ID
536            // [2]: https://www.unicode.org/reports/tr14/#SA
537            complex: ComplexPayloads::try_new_southeast_asian(provider)?,
538        })
539    }
540
541    /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and
542    /// no support for scripts requiring complex context dependent line breaks (Khmer, Lao, Myanmar, Thai).
543    ///
544    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
545    ///
546    /// [📚 Help choosing a constructor](icu_provider::constructors)
547    #[cfg(feature = "compiled_data")]
548    pub const fn new_for_non_complex_scripts(
549        options: LineBreakOptions,
550    ) -> LineSegmenterBorrowed<'static> {
551        LineSegmenterBorrowed {
552            options: options.resolve(),
553            data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_LINE_V1,
554            complex: ComplexPayloadsBorrowed::empty(),
555        }
556    }
557
    // NOTE(review): presumably emits `try_new_for_non_complex_scripts_with_buffer_provider`
    // on top of `try_new_for_non_complex_scripts_unstable`; the `skip` marker appears to tell
    // the macro that `new_for_non_complex_scripts` is hand-written above — confirm against
    // the macro docs.
    icu_provider::gen_buffer_data_constructors!(
        (options: LineBreakOptions) -> error: DataError,
        functions: [
            new_for_non_complex_scripts: skip,
            try_new_for_non_complex_scripts_with_buffer_provider,
            try_new_for_non_complex_scripts_unstable,
            Self,
        ]
    );
567
568    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_for_non_complex_scripts)]
569    pub fn try_new_for_non_complex_scripts_unstable<D>(
570        provider: &D,
571        options: LineBreakOptions,
572    ) -> Result<Self, DataError>
573    where
574        D: DataProvider<SegmenterBreakLineV1>
575            + DataProvider<SegmenterBreakGraphemeClusterV1>
576            + ?Sized,
577    {
578        Ok(Self {
579            options: options.resolve(),
580            payload: provider.load(Default::default())?.payload,
581            complex: ComplexPayloads::try_new_empty(provider)?,
582        })
583    }
584
585    /// Constructs a borrowed version of this type for more efficient querying.
586    ///
587    /// Most useful methods for segmentation are on this type.
588    pub fn as_borrowed(&self) -> LineSegmenterBorrowed<'_> {
589        LineSegmenterBorrowed {
590            options: self.options,
591            data: self.payload.get(),
592            complex: self.complex.as_borrowed(),
593        }
594    }
595}
596
597impl<'data> LineSegmenterBorrowed<'data> {
598    /// Creates a line break iterator for an `str` (a UTF-8 string).
599    ///
600    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
601    pub fn segment_str<'s>(self, input: &'s str) -> LineBreakIterator<'data, 's, Utf8> {
602        LineBreakIterator {
603            iter: input.char_indices(),
604            len: input.len(),
605            current_pos_data: None,
606            result_cache: Vec::new(),
607            data: self.data,
608            options: self.options,
609            complex: self.complex,
610        }
611    }
612    /// Creates a line break iterator for a potentially ill-formed UTF8 string
613    ///
614    /// Invalid characters are treated as REPLACEMENT CHARACTER
615    ///
616    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
617    pub fn segment_utf8<'s>(
618        self,
619        input: &'s [u8],
620    ) -> LineBreakIterator<'data, 's, PotentiallyIllFormedUtf8> {
621        LineBreakIterator {
622            iter: Utf8CharIndices::new(input),
623            len: input.len(),
624            current_pos_data: None,
625            result_cache: Vec::new(),
626            data: self.data,
627            options: self.options,
628            complex: self.complex,
629        }
630    }
631    /// Creates a line break iterator for a Latin-1 (8-bit) string.
632    ///
633    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
634    pub fn segment_latin1<'s>(self, input: &'s [u8]) -> LineBreakIterator<'data, 's, Latin1> {
635        LineBreakIterator {
636            iter: Latin1Indices::new(input),
637            len: input.len(),
638            current_pos_data: None,
639            result_cache: Vec::new(),
640            data: self.data,
641            options: self.options,
642            complex: self.complex,
643        }
644    }
645
646    /// Creates a line break iterator for a UTF-16 string.
647    ///
648    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
649    pub fn segment_utf16<'s>(self, input: &'s [u16]) -> LineBreakIterator<'data, 's, Utf16> {
650        LineBreakIterator {
651            iter: Utf16Indices::new(input),
652            len: input.len(),
653            current_pos_data: None,
654            result_cache: Vec::new(),
655            data: self.data,
656            options: self.options,
657            complex: self.complex,
658        }
659    }
660}
661
662impl LineSegmenterBorrowed<'static> {
663    /// Cheaply converts a [`LineSegmenterBorrowed<'static>`] into a [`LineSegmenter`].
664    ///
665    /// Note: Due to branching and indirection, using [`LineSegmenter`] might inhibit some
666    /// compile-time optimizations that are possible with [`LineSegmenterBorrowed`].
667    pub fn static_to_owned(self) -> LineSegmenter {
668        LineSegmenter {
669            payload: DataPayload::from_static_ref(self.data),
670            complex: self.complex.static_to_owned(),
671            options: self.options,
672        }
673    }
674}
675
676impl RuleBreakData<'_> {
677    fn get_linebreak_property_utf32_with_rule(
678        &self,
679        codepoint: u32,
680        strictness: LineBreakStrictness,
681        word_option: LineBreakWordOption,
682    ) -> u8 {
683        // Note: Default value is 0 == UNKNOWN
684        let prop = self.property_table.get32(codepoint);
685
686        if word_option == LineBreakWordOption::BreakAll
687            || strictness == LineBreakStrictness::Loose
688            || strictness == LineBreakStrictness::Normal
689        {
690            return match prop {
691                CJ => ID, // All CJ's General_Category is Other_Letter (Lo).
692                _ => prop,
693            };
694        }
695
696        // CJ is treated as NS by default, yielding strict line breaking.
697        // https://www.unicode.org/reports/tr14/#CJ
698        prop
699    }
700
701    #[inline]
702    fn get_break_state_from_table(&self, left: u8, right: u8) -> BreakState {
703        let idx = (left as usize) * (self.property_count as usize) + (right as usize);
704        // We use unwrap_or to fall back to the base case and prevent panics on bad data.
705        self.break_state_table.get(idx).unwrap_or(BreakState::Keep)
706    }
707
708    #[inline]
709    fn use_complex_breaking_utf32(&self, codepoint: u32) -> bool {
710        let line_break_property = self.get_linebreak_property_utf32_with_rule(
711            codepoint,
712            LineBreakStrictness::Strict,
713            LineBreakWordOption::Normal,
714        );
715
716        line_break_property == SA
717    }
718}
719
720#[inline]
721fn is_break_utf32_by_loose(
722    right_codepoint: u32,
723    left_prop: u8,
724    right_prop: u8,
725    ja_zh: bool,
726) -> Option<bool> {
727    // breaks before hyphens
728    if right_prop == BA {
729        if left_prop == ID && (right_codepoint == 0x2010 || right_codepoint == 0x2013) {
730            return Some(true);
731        }
732    } else if right_prop == NS {
733        // breaks before certain CJK hyphen-like characters
734        if right_codepoint == 0x301C || right_codepoint == 0x30A0 {
735            return Some(ja_zh);
736        }
737
738        // breaks before iteration marks
739        if right_codepoint == 0x3005
740            || right_codepoint == 0x303B
741            || right_codepoint == 0x309D
742            || right_codepoint == 0x309E
743            || right_codepoint == 0x30FD
744            || right_codepoint == 0x30FE
745        {
746            return Some(true);
747        }
748
749        // breaks before certain centered punctuation marks:
750        if right_codepoint == 0x30FB
751            || right_codepoint == 0xFF1A
752            || right_codepoint == 0xFF1B
753            || right_codepoint == 0xFF65
754            || right_codepoint == 0x203C
755            || (0x2047..=0x2049).contains(&right_codepoint)
756        {
757            return Some(ja_zh);
758        }
759    } else if right_prop == IN {
760        // breaks between inseparable characters such as U+2025, U+2026 i.e. characters with the Unicode Line Break property IN
761        return Some(true);
762    } else if right_prop == EX {
763        // breaks before certain centered punctuation marks:
764        if right_codepoint == 0xFF01 || right_codepoint == 0xFF1F {
765            return Some(ja_zh);
766        }
767    }
768
769    // breaks before suffixes:
770    // Characters with the Unicode Line Break property PO and the East Asian Width property
771    if right_prop == PO_EAW {
772        return Some(ja_zh);
773    }
774    // breaks after prefixes:
775    // Characters with the Unicode Line Break property PR and the East Asian Width property
776    if left_prop == PR_EAW {
777        return Some(ja_zh);
778    }
779    None
780}
781
/// A trait allowing for LineBreakIterator to be generalized to multiple string iteration methods.
///
/// This is implemented by ICU4X for several common string types.
///
/// <div class="stab unstable">
/// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
/// trait, please consider using a type from the implementors listed below.
/// </div>
pub trait LineBreakType: crate::private::Sealed + Sized + RuleBreakType {
    // NOTE(review): presumably reports whether `c` must be handled by the
    // complex-script breaker — implementations are outside this view; confirm.
    #[doc(hidden)]
    fn use_complex_breaking(iterator: &LineBreakIterator<'_, '_, Self>, c: Self::CharType) -> bool;

    // NOTE(review): presumably returns the line break property code of `c`
    // under the iterator's options — confirm against the implementations.
    #[doc(hidden)]
    fn get_linebreak_property_with_rule(
        iterator: &LineBreakIterator<'_, '_, Self>,
        c: Self::CharType,
    ) -> u8;

    // NOTE(review): presumably runs the complex-script breaker starting at
    // `left_codepoint` and yields the next break position, if any — confirm.
    #[doc(hidden)]
    fn line_handle_complex_language(
        iterator: &mut LineBreakIterator<'_, '_, Self>,
        left_codepoint: Self::CharType,
    ) -> Option<usize>;
}
806
/// Implements the [`Iterator`] trait over the line break opportunities of the given string.
///
/// Lifetimes:
///
/// - `'data` = lifetime of the segmenter data borrowed from the [`LineSegmenter`] object from
///   which this iterator was created
/// - `'s` = lifetime of the string being segmented
///
/// The [`Iterator::Item`] is an [`usize`] representing index of a code unit
/// _after_ the break (for a break at the end of text, this index is the length
/// of the [`str`] or array of code units).
///
/// For examples of use, see [`LineSegmenter`].
#[derive(Debug)]
pub struct LineBreakIterator<'data, 's, Y: LineBreakType> {
    // Character cursor over the input being segmented.
    iter: Y::IterAttr<'s>,
    // Length of the input in code units; returned as the final break position.
    len: usize,
    // NOTE(review): presumably the (index, character) pair at the cursor;
    // starts as `None` — confirm against the iterator helpers.
    current_pos_data: Option<(usize, Y::CharType)>,
    // Break positions cached by a previous run and not yet returned.
    result_cache: Vec<usize>,
    // Rule-break data borrowed from the segmenter.
    data: &'data RuleBreakData<'data>,
    // Options with defaults already applied.
    options: ResolvedLineBreakOptions,
    // Complex-script payloads borrowed from the segmenter.
    complex: ComplexPayloadsBorrowed<'data>,
}
829
830impl<Y: LineBreakType> Iterator for LineBreakIterator<'_, '_, Y> {
831    type Item = usize;
832
833    fn next(&mut self) -> Option<Self::Item> {
834        match self.check_eof() {
835            StringBoundaryPosType::Start => return Some(0),
836            StringBoundaryPosType::End => return None,
837            _ => (),
838        }
839
840        // If we have break point cache by previous run, return this result
841        if let Some(&first_pos) = self.result_cache.first() {
842            let mut i = 0;
843            loop {
844                if i == first_pos {
845                    self.result_cache = self.result_cache.iter().skip(1).map(|r| r - i).collect();
846                    return self.get_current_position();
847                }
848                i += self.get_current_codepoint().map_or(0, Y::char_len);
849                self.advance_iter();
850                if self.is_eof() {
851                    self.result_cache.clear();
852                    return Some(self.len);
853                }
854            }
855        }
856
857        // The state prior to a sequence of CM and ZWJ affected by rule LB9.
858        let mut lb9_left: Option<u8> = None;
859        // Whether LB9 was applied to a ZWJ, so that breaks at the current
860        // position must be suppressed.
861        let mut lb8a_after_lb9 = false;
862
863        'a: loop {
864            debug_assert!(!self.is_eof());
865            let left_codepoint = self.get_current_codepoint()?;
866            let mut left_prop =
867                lb9_left.unwrap_or_else(|| self.get_linebreak_property(left_codepoint));
868            let after_zwj = lb8a_after_lb9 || (lb9_left.is_none() && left_prop == ZWJ);
869            self.advance_iter();
870
871            let Some(right_codepoint) = self.get_current_codepoint() else {
872                return Some(self.len);
873            };
874            let right_prop = self.get_linebreak_property(right_codepoint);
875            // NOTE(egg): The special-casing of `LineBreakStrictness::Anywhere` allows us to pass
876            // a test, but eventually that option should just be simplified to call the extended
877            // grapheme cluster segmenter.
878            if (right_prop == CM
879                || (right_prop == ZWJ && self.options.strictness != LineBreakStrictness::Anywhere))
880                && left_prop != BK
881                && left_prop != CR
882                && left_prop != LF
883                && left_prop != NL
884                && left_prop != SP
885                && left_prop != ZW
886            {
887                lb9_left = Some(left_prop);
888                lb8a_after_lb9 = right_prop == ZWJ;
889                continue;
890            } else {
891                lb9_left = None;
892                lb8a_after_lb9 = false;
893            }
894
895            // CSS word-break property handling
896            match (self.options.word_option, left_prop, right_prop) {
897                (LineBreakWordOption::BreakAll, AL | NU | SA, _) => {
898                    left_prop = ID;
899                }
900                //  typographic letter units shouldn't be break
901                (
902                    LineBreakWordOption::KeepAll,
903                    AI | AL | ID | NU | HY | H2 | H3 | JL | JV | JT | CJ,
904                    AI | AL | ID | NU | HY | H2 | H3 | JL | JV | JT | CJ,
905                ) => {
906                    continue;
907                }
908                _ => (),
909            }
910
911            // CSS line-break property handling
912            match self.options.strictness {
913                LineBreakStrictness::Normal => {
914                    if self.is_break_by_normal(right_codepoint) && !after_zwj {
915                        return self.get_current_position();
916                    }
917                }
918                LineBreakStrictness::Loose => {
919                    if let Some(breakable) = is_break_utf32_by_loose(
920                        right_codepoint.into(),
921                        left_prop,
922                        right_prop,
923                        self.options.ja_zh,
924                    ) {
925                        if breakable && !after_zwj {
926                            return self.get_current_position();
927                        }
928                        continue;
929                    }
930                }
931                LineBreakStrictness::Anywhere => {
932                    // TODO(egg): My reading of the CSS standard is that this
933                    // should break around extended grapheme clusters, not at
934                    // arbitrary code points, so this seems wrong.
935                    return self.get_current_position();
936                }
937                _ => (),
938            };
939
940            // UAX14 doesn't have Thai etc, so use another way.
941            if self.options.word_option != LineBreakWordOption::BreakAll
942                && Y::use_complex_breaking(self, left_codepoint)
943                && Y::use_complex_breaking(self, right_codepoint)
944            {
945                let result = Y::line_handle_complex_language(self, left_codepoint);
946                if result.is_some() {
947                    return result;
948                }
949                // I may have to fetch text until non-SA character?.
950            }
951
952            // If break_state is equals or grater than 0, it is alias of property.
953            match self.data.get_break_state_from_table(left_prop, right_prop) {
954                BreakState::Break | BreakState::NoMatch => {
955                    if after_zwj {
956                        continue;
957                    } else {
958                        return self.get_current_position();
959                    }
960                }
961                BreakState::Keep => continue,
962                BreakState::Index(mut index) | BreakState::Intermediate(mut index) => {
963                    let mut previous_iter = self.iter.clone();
964                    let mut previous_pos_data = self.current_pos_data;
965                    let mut previous_is_after_zwj = after_zwj;
966
967                    // Since we are building up a state in this inner loop, we do not
968                    // need an analogue of lb9_left; continuing the inner loop preserves
969                    // `index` which is the current state, and thus implements the
970                    // “treat as” rule.
971                    let mut left_prop_pre_lb9 = right_prop;
972
973                    // current state isn't resolved due to intermediating.
974                    // Example, [AK] [AS] is processing LB28a, but if not matched after fetching
975                    // data, we should break after [AK].
976                    let is_intermediate_rule_no_match = if lb8a_after_lb9 {
977                        // left was ZWJ so we don't break between ZWJ.
978                        true
979                    } else {
980                        index > self.data.last_codepoint_property
981                    };
982
983                    loop {
984                        self.advance_iter();
985                        let after_zwj = left_prop_pre_lb9 == ZWJ;
986
987                        let previous_break_state_is_cp_prop =
988                            index <= self.data.last_codepoint_property;
989
990                        let Some(prop) = self.get_current_linebreak_property() else {
991                            // Reached EOF. But we are analyzing multiple characters now, so next break may be previous point.
992                            let break_state = self
993                                .data
994                                .get_break_state_from_table(index, self.data.eot_property);
995                            if break_state == BreakState::NoMatch {
996                                self.iter = previous_iter;
997                                self.current_pos_data = previous_pos_data;
998                                if previous_is_after_zwj {
999                                    // Do not break [AK] [ZWJ] ÷ [AS] (eot).
1000                                    continue 'a;
1001                                } else {
1002                                    return self.get_current_position();
1003                                }
1004                            }
1005                            // EOF
1006                            return Some(self.len);
1007                        };
1008
1009                        if (prop == CM || prop == ZWJ)
1010                            && left_prop_pre_lb9 != BK
1011                            && left_prop_pre_lb9 != CR
1012                            && left_prop_pre_lb9 != LF
1013                            && left_prop_pre_lb9 != NL
1014                            && left_prop_pre_lb9 != SP
1015                            && left_prop_pre_lb9 != ZW
1016                        {
1017                            left_prop_pre_lb9 = prop;
1018                            continue;
1019                        }
1020
1021                        match self.data.get_break_state_from_table(index, prop) {
1022                            BreakState::Keep => continue 'a,
1023                            BreakState::NoMatch => {
1024                                self.iter = previous_iter;
1025                                self.current_pos_data = previous_pos_data;
1026                                if after_zwj {
1027                                    // Break [AK] ÷ [AS] [ZWJ] [XX],
1028                                    // but not [AK] [ZWJ] ÷ [AS] [ZWJ] [XX].
1029                                    if is_intermediate_rule_no_match && !previous_is_after_zwj {
1030                                        return self.get_current_position();
1031                                    }
1032                                    continue 'a;
1033                                } else if previous_is_after_zwj {
1034                                    // Do not break [AK] [ZWJ] ÷ [AS] [XX].
1035                                    continue 'a;
1036                                } else {
1037                                    return self.get_current_position();
1038                                }
1039                            }
1040                            BreakState::Break => {
1041                                if after_zwj {
1042                                    continue 'a;
1043                                } else {
1044                                    return self.get_current_position();
1045                                }
1046                            }
1047                            BreakState::Intermediate(i) => {
1048                                index = i;
1049                                previous_iter = self.iter.clone();
1050                                previous_pos_data = self.current_pos_data;
1051                                previous_is_after_zwj = after_zwj;
1052                            }
1053                            BreakState::Index(i) => {
1054                                index = i;
1055                                if previous_break_state_is_cp_prop {
1056                                    previous_iter = self.iter.clone();
1057                                    previous_pos_data = self.current_pos_data;
1058                                    previous_is_after_zwj = after_zwj;
1059                                }
1060                            }
1061                        }
1062                        left_prop_pre_lb9 = prop;
1063                    }
1064                }
1065            }
1066        }
1067    }
1068}
1069
/// Classification of the iterator's position relative to the input, as
/// computed by `LineBreakIterator::check_eof`.
enum StringBoundaryPosType {
    /// No code point was loaded when the check began: either the first one has
    /// just been pulled, or the input is empty and its single break has not
    /// been reported yet.
    Start,
    /// A code point is currently loaded.
    Middle,
    /// The input is exhausted and nothing remains to be reported.
    End,
}
1075
1076impl<Y: LineBreakType> LineBreakIterator<'_, '_, Y> {
1077    fn advance_iter(&mut self) {
1078        self.current_pos_data = self.iter.next();
1079    }
1080
1081    fn is_eof(&self) -> bool {
1082        self.current_pos_data.is_none()
1083    }
1084
1085    #[inline]
1086    fn check_eof(&mut self) -> StringBoundaryPosType {
1087        if self.is_eof() {
1088            self.advance_iter();
1089            if self.is_eof() {
1090                if self.len == 0 {
1091                    // Empty string. Since `self.current_pos_data` is always going to be empty,
1092                    // we never read `self.len` except for here, so we can use it to mark that
1093                    // we have already returned the single empty-string breakpoint.
1094                    self.len = 1;
1095                    StringBoundaryPosType::Start
1096                } else {
1097                    StringBoundaryPosType::End
1098                }
1099            } else {
1100                StringBoundaryPosType::Start
1101            }
1102        } else {
1103            StringBoundaryPosType::Middle
1104        }
1105    }
1106
1107    fn get_current_position(&self) -> Option<usize> {
1108        self.current_pos_data.map(|(pos, _)| pos)
1109    }
1110
1111    fn get_current_codepoint(&self) -> Option<Y::CharType> {
1112        self.current_pos_data.map(|(_, codepoint)| codepoint)
1113    }
1114
1115    fn get_linebreak_property(&self, codepoint: Y::CharType) -> u8 {
1116        Y::get_linebreak_property_with_rule(self, codepoint)
1117    }
1118
1119    fn get_current_linebreak_property(&self) -> Option<u8> {
1120        self.get_current_codepoint()
1121            .map(|c| self.get_linebreak_property(c))
1122    }
1123
1124    fn is_break_by_normal(&self, codepoint: Y::CharType) -> bool {
1125        match codepoint.into() {
1126            0x301C | 0x30A0 => self.options.ja_zh,
1127            _ => false,
1128        }
1129    }
1130}
1131
1132impl LineBreakType for Utf8 {
1133    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 {
1134        iterator.data.get_linebreak_property_utf32_with_rule(
1135            c as u32,
1136            iterator.options.strictness,
1137            iterator.options.word_option,
1138        )
1139    }
1140
1141    #[inline]
1142    fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool {
1143        iterator.data.use_complex_breaking_utf32(c as u32)
1144    }
1145
1146    fn line_handle_complex_language(
1147        iter: &mut LineBreakIterator<'_, '_, Self>,
1148        left_codepoint: char,
1149    ) -> Option<usize> {
1150        line_handle_complex_language_utf8(iter, left_codepoint)
1151    }
1152}
1153
1154impl LineBreakType for PotentiallyIllFormedUtf8 {
1155    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 {
1156        iterator.data.get_linebreak_property_utf32_with_rule(
1157            c as u32,
1158            iterator.options.strictness,
1159            iterator.options.word_option,
1160        )
1161    }
1162
1163    #[inline]
1164    fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool {
1165        iterator.data.use_complex_breaking_utf32(c as u32)
1166    }
1167
1168    fn line_handle_complex_language(
1169        iter: &mut LineBreakIterator<'_, '_, Self>,
1170        left_codepoint: char,
1171    ) -> Option<usize> {
1172        line_handle_complex_language_utf8(iter, left_codepoint)
1173    }
1174}
1175/// line_handle_complex_language impl for UTF8 iterators
1176fn line_handle_complex_language_utf8<T>(
1177    iter: &mut LineBreakIterator<'_, '_, T>,
1178    left_codepoint: char,
1179) -> Option<usize>
1180where
1181    T: LineBreakType<CharType = char>,
1182{
1183    // word segmenter doesn't define break rules for some languages such as Thai.
1184    let start_iter = iter.iter.clone();
1185    let start_point = iter.current_pos_data;
1186    let mut s = String::new();
1187    s.push(left_codepoint);
1188    loop {
1189        debug_assert!(!iter.is_eof());
1190        s.push(iter.get_current_codepoint()?);
1191        iter.advance_iter();
1192        if let Some(current_codepoint) = iter.get_current_codepoint() {
1193            if !T::use_complex_breaking(iter, current_codepoint) {
1194                break;
1195            }
1196        } else {
1197            // EOF
1198            break;
1199        }
1200    }
1201
1202    // Restore iterator to move to head of complex string
1203    iter.iter = start_iter;
1204    iter.current_pos_data = start_point;
1205    let breaks = iter.complex.complex_language_segment_str(&s);
1206    iter.result_cache = breaks;
1207    let first_pos = *iter.result_cache.first()?;
1208    let mut i = left_codepoint.len_utf8();
1209    loop {
1210        if i == first_pos {
1211            // Re-calculate breaking offset
1212            iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
1213            return iter.get_current_position();
1214        }
1215        debug_assert!(
1216            i < first_pos,
1217            "we should always arrive at first_pos: near index {:?}",
1218            iter.get_current_position()
1219        );
1220        i += iter.get_current_codepoint().map_or(0, T::char_len);
1221        iter.advance_iter();
1222        if iter.is_eof() {
1223            iter.result_cache.clear();
1224            return Some(iter.len);
1225        }
1226    }
1227}
1228
1229impl LineBreakType for Latin1 {
1230    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u8) -> u8 {
1231        // No CJ on Latin1
1232        // Note: Default value is 0 == UNKNOWN
1233        iterator.data.property_table.get32(c as u32)
1234    }
1235
1236    #[inline]
1237    fn use_complex_breaking(_iterator: &LineBreakIterator<Self>, _c: u8) -> bool {
1238        false
1239    }
1240
1241    fn line_handle_complex_language(
1242        _: &mut LineBreakIterator<Self>,
1243        _: Self::CharType,
1244    ) -> Option<usize> {
1245        unreachable!()
1246    }
1247}
1248
1249impl LineBreakType for Utf16 {
1250    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u32) -> u8 {
1251        iterator.data.get_linebreak_property_utf32_with_rule(
1252            c,
1253            iterator.options.strictness,
1254            iterator.options.word_option,
1255        )
1256    }
1257
1258    #[inline]
1259    fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: u32) -> bool {
1260        iterator.data.use_complex_breaking_utf32(c)
1261    }
1262
1263    fn line_handle_complex_language(
1264        iterator: &mut LineBreakIterator<Self>,
1265        left_codepoint: Self::CharType,
1266    ) -> Option<usize> {
1267        // word segmenter doesn't define break rules for some languages such as Thai.
1268        let start_iter = iterator.iter.clone();
1269        let start_point = iterator.current_pos_data;
1270        let mut s = vec![left_codepoint as u16];
1271        loop {
1272            debug_assert!(!iterator.is_eof());
1273            s.push(iterator.get_current_codepoint()? as u16);
1274            iterator.advance_iter();
1275            if let Some(current_codepoint) = iterator.get_current_codepoint() {
1276                if !Self::use_complex_breaking(iterator, current_codepoint) {
1277                    break;
1278                }
1279            } else {
1280                // EOF
1281                break;
1282            }
1283        }
1284
1285        // Restore iterator to move to head of complex string
1286        iterator.iter = start_iter;
1287        iterator.current_pos_data = start_point;
1288        let breaks = iterator.complex.complex_language_segment_utf16(&s);
1289        iterator.result_cache = breaks;
1290        // result_cache vector is utf-16 index that is in BMP.
1291        let first_pos = *iterator.result_cache.first()?;
1292        let mut i = 1;
1293        loop {
1294            if i == first_pos {
1295                // Re-calculate breaking offset
1296                iterator.result_cache = iterator
1297                    .result_cache
1298                    .iter()
1299                    .skip(1)
1300                    .map(|r| r - i)
1301                    .collect();
1302                return iterator.get_current_position();
1303            }
1304            debug_assert!(
1305                i < first_pos,
1306                "we should always arrive at first_pos: near index {:?}",
1307                iterator.get_current_position()
1308            );
1309            i += 1;
1310            iterator.advance_iter();
1311            if iterator.is_eof() {
1312                iterator.result_cache.clear();
1313                return Some(iterator.len);
1314            }
1315        }
1316    }
1317}
1318
1319#[cfg(test)]
1320#[cfg(feature = "serde")]
1321mod tests {
1322    use super::*;
1323    use crate::LineSegmenter;
1324
1325    #[test]
1326    fn linebreak_property() {
1327        let payload =
1328            DataProvider::<SegmenterBreakLineV1>::load(&crate::provider::Baked, Default::default())
1329                .expect("Loading should succeed!")
1330                .payload;
1331
1332        let get_linebreak_property = |codepoint| {
1333            payload.get().get_linebreak_property_utf32_with_rule(
1334                codepoint as u32,
1335                LineBreakStrictness::Strict,
1336                LineBreakWordOption::Normal,
1337            )
1338        };
1339
1340        assert_eq!(get_linebreak_property('\u{0020}'), SP);
1341        assert_eq!(get_linebreak_property('\u{0022}'), QU);
1342        assert_eq!(get_linebreak_property('('), OP_OP30);
1343        assert_eq!(get_linebreak_property('\u{0030}'), NU);
1344        assert_eq!(get_linebreak_property('['), OP_OP30);
1345        assert_eq!(get_linebreak_property('\u{1f3fb}'), EM);
1346        assert_eq!(get_linebreak_property('\u{20000}'), ID);
1347        assert_eq!(get_linebreak_property('\u{e0020}'), CM);
1348        assert_eq!(get_linebreak_property('\u{3041}'), CJ);
1349        assert_eq!(get_linebreak_property('\u{0025}'), PO);
1350        assert_eq!(get_linebreak_property('\u{00A7}'), AI);
1351        assert_eq!(get_linebreak_property('\u{50005}'), XX);
1352        assert_eq!(get_linebreak_property('\u{17D6}'), NS);
1353        assert_eq!(get_linebreak_property('\u{2014}'), B2);
1354    }
1355
1356    #[test]
1357    #[expect(clippy::bool_assert_comparison)] // clearer when we're testing bools directly
1358    fn break_rule() {
1359        let payload =
1360            DataProvider::<SegmenterBreakLineV1>::load(&crate::provider::Baked, Default::default())
1361                .expect("Loading should succeed!")
1362                .payload;
1363        let lb_data: &RuleBreakData = payload.get();
1364
1365        let is_break = |left, right| {
1366            matches!(
1367                lb_data.get_break_state_from_table(left, right),
1368                BreakState::Break | BreakState::NoMatch
1369            )
1370        };
1371
1372        // LB4
1373        assert_eq!(is_break(BK, AL), true);
1374        // LB5
1375        assert_eq!(is_break(CR, LF), false);
1376        assert_eq!(is_break(CR, AL), true);
1377        assert_eq!(is_break(LF, AL), true);
1378        assert_eq!(is_break(NL, AL), true);
1379        // LB6
1380        assert_eq!(is_break(AL, BK), false);
1381        assert_eq!(is_break(AL, CR), false);
1382        assert_eq!(is_break(AL, LF), false);
1383        assert_eq!(is_break(AL, NL), false);
1384        // LB7
1385        assert_eq!(is_break(AL, SP), false);
1386        assert_eq!(is_break(AL, ZW), false);
1387        // LB8
1388        // LB8a and LB9 omitted: These are handled outside of the state table.
1389        // LB10
1390        assert_eq!(is_break(ZWJ, SP), false);
1391        assert_eq!(is_break(SP, CM), true);
1392        // LB11
1393        assert_eq!(is_break(AL, WJ), false);
1394        assert_eq!(is_break(WJ, AL), false);
1395        // LB12
1396        assert_eq!(is_break(GL, AL), false);
1397        // LB12a
1398        assert_eq!(is_break(AL, GL), false);
1399        assert_eq!(is_break(SP, GL), true);
1400        // LB13
1401        assert_eq!(is_break(AL, CL), false);
1402        assert_eq!(is_break(AL, CP), false);
1403        assert_eq!(is_break(AL, EX), false);
1404        assert_eq!(is_break(AL, IS), false);
1405        assert_eq!(is_break(AL, SY), false);
1406        // LB18
1407        assert_eq!(is_break(SP, AL), true);
1408        // LB19
1409        assert_eq!(is_break(AL, QU), false);
1410        assert_eq!(is_break(QU, AL), false);
1411        // LB20
1412        assert_eq!(is_break(AL, CB), true);
1413        assert_eq!(is_break(CB, AL), true);
1414        // LB20
1415        assert_eq!(is_break(AL, BA), false);
1416        assert_eq!(is_break(AL, HY), false);
1417        assert_eq!(is_break(AL, NS), false);
1418        // LB21
1419        assert_eq!(is_break(AL, BA), false);
1420        assert_eq!(is_break(BB, AL), false);
1421        assert_eq!(is_break(ID, BA), false);
1422        assert_eq!(is_break(ID, NS), false);
1423        // LB21a
1424        // LB21b
1425        assert_eq!(is_break(SY, HL), false);
1426        // LB22
1427        assert_eq!(is_break(AL, IN), false);
1428        // LB 23
1429        assert_eq!(is_break(AL, NU), false);
1430        assert_eq!(is_break(HL, NU), false);
1431        // LB 23a
1432        assert_eq!(is_break(PR, ID), false);
1433        assert_eq!(is_break(PR, EB), false);
1434        assert_eq!(is_break(PR, EM), false);
1435        assert_eq!(is_break(ID, PO), false);
1436        assert_eq!(is_break(EB, PO), false);
1437        assert_eq!(is_break(EM, PO), false);
1438        // LB26
1439        assert_eq!(is_break(JL, JL), false);
1440        assert_eq!(is_break(JL, JV), false);
1441        assert_eq!(is_break(JL, H2), false);
1442        // LB27
1443        assert_eq!(is_break(JL, IN), false);
1444        assert_eq!(is_break(JL, PO), false);
1445        assert_eq!(is_break(PR, JL), false);
1446        // LB28
1447        assert_eq!(is_break(AL, AL), false);
1448        assert_eq!(is_break(HL, AL), false);
1449        // LB29
1450        assert_eq!(is_break(IS, AL), false);
1451        assert_eq!(is_break(IS, HL), false);
1452        // LB30b
1453        assert_eq!(is_break(EB, EM), false);
1454        // LB31
1455        assert_eq!(is_break(ID, ID), true);
1456    }
1457
1458    #[test]
1459    fn linebreak() {
1460        let segmenter =
1461            LineSegmenter::try_new_dictionary_unstable(&crate::provider::Baked, Default::default())
1462                .expect("Data exists");
1463        let segmenter = segmenter.as_borrowed();
1464
1465        let mut iter = segmenter.segment_str("hello world");
1466        assert_eq!(Some(0), iter.next());
1467        assert_eq!(Some(6), iter.next());
1468        assert_eq!(Some(11), iter.next());
1469        assert_eq!(None, iter.next());
1470
1471        iter = segmenter.segment_str("$10 $10");
1472        assert_eq!(Some(0), iter.next());
1473        assert_eq!(Some(4), iter.next());
1474        assert_eq!(Some(7), iter.next());
1475        assert_eq!(None, iter.next());
1476
1477        // LB10
1478
1479        // LB14
1480        iter = segmenter.segment_str("[  abc def");
1481        assert_eq!(Some(0), iter.next());
1482        assert_eq!(Some(7), iter.next());
1483        assert_eq!(Some(10), iter.next());
1484        assert_eq!(None, iter.next());
1485
1486        let input: [u8; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66];
1487        let mut iter_u8 = segmenter.segment_latin1(&input);
1488        assert_eq!(Some(0), iter_u8.next());
1489        assert_eq!(Some(7), iter_u8.next());
1490        assert_eq!(Some(10), iter_u8.next());
1491        assert_eq!(None, iter_u8.next());
1492
1493        let input: [u16; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66];
1494        let mut iter_u16 = segmenter.segment_utf16(&input);
1495        assert_eq!(Some(0), iter_u16.next());
1496        assert_eq!(Some(7), iter_u16.next());
1497        assert_eq!(Some(10), iter_u16.next());
1498        assert_eq!(None, iter_u16.next());
1499
1500        // LB15 used to prevent the break at 6, but has been removed in Unicode 15.1.
1501        iter = segmenter.segment_str("abc\u{0022}  (def");
1502        assert_eq!(Some(0), iter.next());
1503        assert_eq!(Some(6), iter.next());
1504        assert_eq!(Some(10), iter.next());
1505        assert_eq!(None, iter.next());
1506
1507        let input: [u8; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66];
1508        let mut iter_u8 = segmenter.segment_latin1(&input);
1509        assert_eq!(Some(0), iter_u8.next());
1510        assert_eq!(Some(6), iter_u8.next());
1511        assert_eq!(Some(10), iter_u8.next());
1512        assert_eq!(None, iter_u8.next());
1513
1514        let input: [u16; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66];
1515        let mut iter_u16 = segmenter.segment_utf16(&input);
1516        assert_eq!(Some(0), iter_u16.next());
1517        assert_eq!(Some(6), iter_u16.next());
1518        assert_eq!(Some(10), iter_u16.next());
1519        assert_eq!(None, iter_u16.next());
1520
1521        // Instead, in Unicode 15.1, LB15a and LB15b prevent these breaks.
1522        iter = segmenter.segment_str("« miaou »");
1523        assert_eq!(Some(0), iter.next());
1524        assert_eq!(Some(11), iter.next());
1525        assert_eq!(None, iter.next());
1526
1527        let input: Vec<u8> = "« miaou »"
1528            .chars()
1529            .map(|c| u8::try_from(u32::from(c)).unwrap())
1530            .collect();
1531        let mut iter_u8 = segmenter.segment_latin1(&input);
1532        assert_eq!(Some(0), iter_u8.next());
1533        assert_eq!(Some(9), iter_u8.next());
1534        assert_eq!(None, iter_u8.next());
1535
1536        let input: Vec<u16> = "« miaou »".encode_utf16().collect();
1537        let mut iter_u16 = segmenter.segment_utf16(&input);
1538        assert_eq!(Some(0), iter_u16.next());
1539        assert_eq!(Some(9), iter_u16.next());
1540        assert_eq!(None, iter_u16.next());
1541
1542        // But not these:
1543        iter = segmenter.segment_str("Die Katze hat »miau« gesagt.");
1544        assert_eq!(Some(0), iter.next());
1545        assert_eq!(Some(4), iter.next());
1546        assert_eq!(Some(10), iter.next());
1547        assert_eq!(Some(14), iter.next());
1548        assert_eq!(Some(23), iter.next());
1549        assert_eq!(Some(30), iter.next());
1550        assert_eq!(None, iter.next());
1551
1552        let input: Vec<u8> = "Die Katze hat »miau« gesagt."
1553            .chars()
1554            .map(|c| u8::try_from(u32::from(c)).unwrap())
1555            .collect();
1556        let mut iter_u8 = segmenter.segment_latin1(&input);
1557        assert_eq!(Some(0), iter_u8.next());
1558        assert_eq!(Some(4), iter_u8.next());
1559        assert_eq!(Some(10), iter_u8.next());
1560        assert_eq!(Some(14), iter_u8.next());
1561        assert_eq!(Some(21), iter_u8.next());
1562        assert_eq!(Some(28), iter_u8.next());
1563        assert_eq!(None, iter_u8.next());
1564
1565        let input: Vec<u16> = "Die Katze hat »miau« gesagt.".encode_utf16().collect();
1566        let mut iter_u16 = segmenter.segment_utf16(&input);
1567        assert_eq!(Some(0), iter_u16.next());
1568        assert_eq!(Some(4), iter_u16.next());
1569        assert_eq!(Some(10), iter_u16.next());
1570        assert_eq!(Some(14), iter_u16.next());
1571        assert_eq!(Some(21), iter_u16.next());
1572        assert_eq!(Some(28), iter_u16.next());
1573        assert_eq!(None, iter_u16.next());
1574
1575        // LB16
1576        iter = segmenter.segment_str("\u{0029}\u{203C}");
1577        assert_eq!(Some(0), iter.next());
1578        assert_eq!(Some(4), iter.next());
1579        assert_eq!(None, iter.next());
1580        iter = segmenter.segment_str("\u{0029}  \u{203C}");
1581        assert_eq!(Some(0), iter.next());
1582        assert_eq!(Some(6), iter.next());
1583        assert_eq!(None, iter.next());
1584
1585        let input: [u16; 4] = [0x29, 0x20, 0x20, 0x203c];
1586        let mut iter_u16 = segmenter.segment_utf16(&input);
1587        assert_eq!(Some(0), iter_u16.next());
1588        assert_eq!(Some(4), iter_u16.next());
1589        assert_eq!(None, iter_u16.next());
1590
1591        // LB17
1592        iter = segmenter.segment_str("\u{2014}\u{2014}aa");
1593        assert_eq!(Some(0), iter.next());
1594        assert_eq!(Some(6), iter.next());
1595        assert_eq!(Some(8), iter.next());
1596        assert_eq!(None, iter.next());
1597        iter = segmenter.segment_str("\u{2014}  \u{2014}aa");
1598        assert_eq!(Some(0), iter.next());
1599        assert_eq!(Some(8), iter.next());
1600        assert_eq!(Some(10), iter.next());
1601        assert_eq!(None, iter.next());
1602
1603        iter = segmenter.segment_str("\u{2014}\u{2014}  \u{2014}\u{2014}123 abc");
1604        assert_eq!(Some(0), iter.next());
1605        assert_eq!(Some(14), iter.next());
1606        assert_eq!(Some(18), iter.next());
1607        assert_eq!(Some(21), iter.next());
1608        assert_eq!(None, iter.next());
1609
1610        // LB25
1611        let mut iter = segmenter.segment_str("(0,1)+(2,3)");
1612        assert_eq!(Some(0), iter.next());
1613        assert_eq!(Some(11), iter.next());
1614        assert_eq!(None, iter.next());
1615        let input: [u16; 11] = [
1616            0x28, 0x30, 0x2C, 0x31, 0x29, 0x2B, 0x28, 0x32, 0x2C, 0x33, 0x29,
1617        ];
1618        let mut iter_u16 = segmenter.segment_utf16(&input);
1619        assert_eq!(Some(0), iter_u16.next());
1620        assert_eq!(Some(11), iter_u16.next());
1621        assert_eq!(None, iter_u16.next());
1622
1623        let input: [u16; 13] = [
1624            0x2014, 0x2014, 0x20, 0x20, 0x2014, 0x2014, 0x31, 0x32, 0x33, 0x20, 0x61, 0x62, 0x63,
1625        ];
1626        let mut iter_u16 = segmenter.segment_utf16(&input);
1627        assert_eq!(Some(0), iter_u16.next());
1628        assert_eq!(Some(6), iter_u16.next());
1629        assert_eq!(Some(10), iter_u16.next());
1630        assert_eq!(Some(13), iter_u16.next());
1631        assert_eq!(None, iter_u16.next());
1632
1633        iter = segmenter.segment_str("\u{1F3FB} \u{1F3FB}");
1634        assert_eq!(Some(0), iter.next());
1635        assert_eq!(Some(5), iter.next());
1636        assert_eq!(Some(9), iter.next());
1637        assert_eq!(None, iter.next());
1638    }
1639
1640    #[test]
1641    #[cfg(feature = "lstm")]
1642    fn thai_line_break() {
1643        const TEST_STR: &str = "ภาษาไทยภาษาไทย";
1644
1645        let segmenter = LineSegmenter::new_lstm(Default::default());
1646        let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
1647        assert_eq!(breaks, [0, 12, 21, 33, TEST_STR.len()], "Thai test");
1648
1649        let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
1650        let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
1651        assert_eq!(breaks, [0, 4, 7, 11, utf16.len()], "Thai test");
1652
1653        let utf16: [u16; 4] = [0x0e20, 0x0e32, 0x0e29, 0x0e32];
1654        let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
1655        assert_eq!(breaks, [0, 4], "Thai test");
1656    }
1657
1658    #[test]
1659    #[cfg(feature = "lstm")]
1660    fn burmese_line_break() {
1661        // "Burmese Language" in Burmese
1662        const TEST_STR: &str = "မြန်မာဘာသာစကား";
1663
1664        let segmenter = LineSegmenter::new_lstm(Default::default());
1665        let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
1666        // LSTM model breaks more characters, but it is better to return [30].
1667        assert_eq!(breaks, [0, 12, 18, 30, TEST_STR.len()], "Burmese test");
1668
1669        let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
1670        let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
1671        // LSTM model breaks more characters, but it is better to return [10].
1672        assert_eq!(breaks, [0, 4, 6, 10, utf16.len()], "Burmese utf-16 test");
1673    }
1674
1675    #[test]
1676    #[cfg(feature = "lstm")]
1677    fn khmer_line_break() {
1678        const TEST_STR: &str = "សេចក្ដីប្រកាសជាសកលស្ដីពីសិទ្ធិមនុស្ស";
1679
1680        let segmenter = LineSegmenter::new_lstm(Default::default());
1681        let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
1682        // Note: This small sample matches the ICU dictionary segmenter
1683        assert_eq!(breaks, [0, 39, 48, 54, 72, TEST_STR.len()], "Khmer test");
1684
1685        let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
1686        let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
1687        assert_eq!(
1688            breaks,
1689            [0, 13, 16, 18, 24, utf16.len()],
1690            "Khmer utf-16 test"
1691        );
1692    }
1693
1694    #[test]
1695    #[cfg(feature = "lstm")]
1696    fn lao_line_break() {
1697        const TEST_STR: &str = "ກ່ຽວກັບສິດຂອງມະນຸດ";
1698
1699        let segmenter = LineSegmenter::new_lstm(Default::default());
1700        let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
1701        // Note: LSTM finds a break at '12' that the dictionary does not find
1702        assert_eq!(breaks, [0, 12, 21, 30, 39, TEST_STR.len()], "Lao test");
1703
1704        let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
1705        let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
1706        assert_eq!(breaks, [0, 4, 7, 10, 13, utf16.len()], "Lao utf-16 test");
1707    }
1708
1709    #[test]
1710    fn empty_string() {
1711        let segmenter = LineSegmenter::new_auto(Default::default());
1712        let breaks: Vec<usize> = segmenter.segment_str("").collect();
1713        assert_eq!(breaks, [0]);
1714    }
1715}