1use crate::complex::*;
6use crate::indices::*;
7use crate::provider::*;
8use crate::rule_segmenter::*;
9use alloc::string::String;
10use alloc::vec;
11use alloc::vec::Vec;
12use core::char;
13use icu_locale_core::subtags::{language, Language};
14use icu_locale_core::LanguageIdentifier;
15use icu_provider::prelude::*;
16use utf8_iter::Utf8CharIndices;
17
// Line break property values.
//
// These `u8` codes are what the rule data's property table yields for a code point, and they
// are the operands used to look up an entry in the break state table. The names appear to
// follow the UAX #14 line break class abbreviations, plus a few split-out variants
// (`AL_DOTTED_CIRCLE`, `ID_CN`, `OP_EA` / `OP_OP30`, `PO_EAW`, `PR_EAW`, `QU_PF` / `QU_PI`).
//
// NOTE(review): the numeric values presumably have to agree with the generated segmenter
// data — confirm against the data generator before renumbering or inserting entries.
//
// Several constants are not referenced by the code visible in this part of the file, which is
// why each one carries a `dead_code` allowance.
#[allow(dead_code)]
const UNKNOWN: u8 = 0;
#[allow(dead_code)]
const AI: u8 = 1;
#[allow(dead_code)]
const AK: u8 = 2;
#[allow(dead_code)]
const AL: u8 = 3;
#[allow(dead_code)]
const AL_DOTTED_CIRCLE: u8 = 4;
#[allow(dead_code)]
const AP: u8 = 5;
#[allow(dead_code)]
const AS: u8 = 6;
#[allow(dead_code)]
const B2: u8 = 7;
#[allow(dead_code)]
const BA: u8 = 8;
#[allow(dead_code)]
const BB: u8 = 9;
#[allow(dead_code)]
const BK: u8 = 10;
#[allow(dead_code)]
const CB: u8 = 11;
#[allow(dead_code)]
const CJ: u8 = 12;
#[allow(dead_code)]
const CL: u8 = 13;
#[allow(dead_code)]
const CM: u8 = 14;
#[allow(dead_code)]
const CP: u8 = 15;
#[allow(dead_code)]
const CR: u8 = 16;
#[allow(dead_code)]
const EB: u8 = 17;
#[allow(dead_code)]
const EM: u8 = 18;
#[allow(dead_code)]
const EX: u8 = 19;
#[allow(dead_code)]
const GL: u8 = 20;
#[allow(dead_code)]
const H2: u8 = 21;
#[allow(dead_code)]
const H3: u8 = 22;
#[allow(dead_code)]
const HL: u8 = 23;
#[allow(dead_code)]
const HY: u8 = 24;
#[allow(dead_code)]
const ID: u8 = 25;
#[allow(dead_code)]
const ID_CN: u8 = 26;
#[allow(dead_code)]
const IN: u8 = 27;
#[allow(dead_code)]
const IS: u8 = 28;
#[allow(dead_code)]
const JL: u8 = 29;
#[allow(dead_code)]
const JT: u8 = 30;
#[allow(dead_code)]
const JV: u8 = 31;
#[allow(dead_code)]
const LF: u8 = 32;
#[allow(dead_code)]
const NL: u8 = 33;
#[allow(dead_code)]
const NS: u8 = 34;
#[allow(dead_code)]
const NU: u8 = 35;
#[allow(dead_code)]
const OP_EA: u8 = 36;
#[allow(dead_code)]
const OP_OP30: u8 = 37;
#[allow(dead_code)]
const PO: u8 = 38;
#[allow(dead_code)]
const PO_EAW: u8 = 39;
#[allow(dead_code)]
const PR: u8 = 40;
#[allow(dead_code)]
const PR_EAW: u8 = 41;
#[allow(dead_code)]
const QU: u8 = 42;
#[allow(dead_code)]
const QU_PF: u8 = 43;
#[allow(dead_code)]
const QU_PI: u8 = 44;
#[allow(dead_code)]
const RI: u8 = 45;
#[allow(dead_code)]
const SA: u8 = 46;
#[allow(dead_code)]
const SP: u8 = 47;
#[allow(dead_code)]
const SY: u8 = 48;
#[allow(dead_code)]
const VF: u8 = 49;
#[allow(dead_code)]
const VI: u8 = 50;
#[allow(dead_code)]
const WJ: u8 = 51;
#[allow(dead_code)]
const XX: u8 = 52;
#[allow(dead_code)]
const ZW: u8 = 53;
#[allow(dead_code)]
const ZWJ: u8 = 54;
129
/// How strictly the line breaking rules are applied.
///
/// The variant names mirror the keywords of the CSS `line-break` property
/// (NOTE(review): presumably the intended model — confirm against the crate documentation).
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub enum LineBreakStrictness {
    /// The least restrictive rule set.
    ///
    /// In this file: code points with the `CJ` property are treated as `ID`, and the
    /// additional break opportunities evaluated by `is_break_utf32_by_loose` apply.
    Loose,

    /// The common rule set.
    ///
    /// In this file: code points with the `CJ` property are treated as `ID`, and for `ja`/`zh`
    /// content a break is additionally allowed before U+301C and U+30A0.
    Normal,

    /// The default, and the value used when no strictness is specified.
    ///
    /// No strictness-specific adjustment is made by the iterator for this variant.
    #[default]
    Strict,

    /// The iterator reports a break as soon as it reaches the strictness check, without
    /// consulting the break state table. Pairs absorbed earlier (a following `CM`, or the
    /// keep-all word option) are still skipped.
    Anywhere,
}
164
/// How break opportunities between letters are treated.
///
/// The variant names mirror the keywords of the CSS `word-break` property
/// (NOTE(review): presumably the intended model — confirm against the crate documentation).
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub enum LineBreakWordOption {
    /// The default, and the value used when no word option is specified. Words break
    /// according to the regular rules.
    #[default]
    Normal,

    /// A left-hand `AL`, `NU` or `SA` is treated as `ID`, `CJ` is treated as `ID`, and the
    /// complex-script (dictionary/LSTM) segmentation is bypassed.
    BreakAll,

    /// No break is reported between two units that both belong to the set
    /// `AI | AL | ID | NU | HY | H2 | H3 | JL | JV | JT | CJ`.
    KeepAll,
}
187
/// User-facing options for configuring a line segmenter.
///
/// Every field is optional; unset fields are replaced by defaults when the options are
/// resolved during segmenter construction.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub struct LineBreakOptions<'a> {
    /// Strictness of the line breaking rules.
    ///
    /// `None` resolves to [`LineBreakStrictness::Strict`].
    pub strictness: Option<LineBreakStrictness>,

    /// Treatment of break opportunities between letters.
    ///
    /// `None` resolves to [`LineBreakWordOption::Normal`].
    pub word_option: Option<LineBreakWordOption>,

    /// Locale of the content being segmented.
    ///
    /// Only the language subtag is inspected: `ja` and `zh` switch on the Japanese/Chinese
    /// specific handling used by the `Normal` and `Loose` strictness levels. `None` behaves
    /// like any other language.
    pub content_locale: Option<&'a LanguageIdentifier>,
}
210
impl LineBreakOptions<'_> {
    /// Returns options with every field unset (`None`).
    ///
    /// This is an inherent `const fn`, so unlike the derived `Default` implementation it can
    /// be used in constant contexts.
    pub const fn default() -> Self {
        Self {
            strictness: None,
            word_option: None,
            content_locale: None,
        }
    }
}
221
/// The concrete settings a segmenter works with, produced by `LineBreakOptions::resolve`.
///
/// Unlike [`LineBreakOptions`] nothing here is optional and no locale reference is retained.
#[derive(Debug, Clone, Copy)]
struct ResolvedLineBreakOptions {
    /// Effective strictness (`Strict` when the caller specified none).
    strictness: LineBreakStrictness,
    /// Effective word option (`Normal` when the caller specified none).
    word_option: LineBreakWordOption,
    /// `true` when the content locale's language is `ja` or `zh`.
    ja_zh: bool,
}
228
229impl LineBreakOptions<'_> {
230 const fn resolve(self) -> ResolvedLineBreakOptions {
231 ResolvedLineBreakOptions {
232 strictness: match self.strictness {
233 Some(s) => s,
234 None => LineBreakStrictness::Strict,
235 },
236 word_option: match self.word_option {
237 Some(s) => s,
238 None => LineBreakWordOption::Normal,
239 },
240 ja_zh: if let Some(content_locale) = self.content_locale.as_ref() {
241 const JA: Language = language!("ja");
242 const ZH: Language = language!("zh");
243 matches!(content_locale.language, JA | ZH)
244 } else {
245 false
246 },
247 }
248 }
249}
250
/// Owning line segmenter.
///
/// Holds the resolved options together with the line break rule data payload and the
/// payloads used for complex-script segmentation. Segmentation itself is performed through
/// the borrowed view returned by `as_borrowed`.
#[derive(Debug)]
pub struct LineSegmenter {
    /// Options with all defaults already applied.
    options: ResolvedLineBreakOptions,
    /// Rule data (property table and break state table) for line breaking.
    payload: DataPayload<SegmenterBreakLineV1>,
    /// Data for segmenting runs of complex-script (`SA`) text.
    complex: ComplexPayloads,
}
378
/// Borrowed, cheaply copyable view of a line segmenter.
///
/// This is the type that exposes the `segment_*` methods. It is either created directly from
/// compiled data (with a `'static` lifetime) or obtained from a [`LineSegmenter`].
#[derive(Clone, Debug, Copy)]
pub struct LineSegmenterBorrowed<'data> {
    /// Options with all defaults already applied.
    options: ResolvedLineBreakOptions,
    /// Borrowed rule data (property table and break state table) for line breaking.
    data: &'data RuleBreakData<'data>,
    /// Borrowed data for segmenting runs of complex-script (`SA`) text.
    complex: ComplexPayloadsBorrowed<'data>,
}
388
389impl LineSegmenter {
390 #[cfg(feature = "auto")]
401 #[cfg(feature = "compiled_data")]
402 pub fn new_auto(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
403 Self::new_lstm(options)
404 }
405
406 #[cfg(feature = "auto")]
407 icu_provider::gen_buffer_data_constructors!(
408 (options: LineBreakOptions) -> error: DataError,
409 functions: [
410 new_auto: skip,
411 try_new_auto_with_buffer_provider,
412 try_new_auto_unstable,
413 Self,
414 ]
415 );
416
417 #[cfg(feature = "auto")]
418 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_auto)]
419 pub fn try_new_auto_unstable<D>(
420 provider: &D,
421 options: LineBreakOptions,
422 ) -> Result<Self, DataError>
423 where
424 D: DataProvider<SegmenterBreakLineV1>
425 + DataProvider<SegmenterLstmAutoV1>
426 + DataProvider<SegmenterBreakGraphemeClusterV1>
427 + ?Sized,
428 {
429 Self::try_new_lstm_unstable(provider, options)
430 }
431
432 #[cfg(feature = "lstm")]
444 #[cfg(feature = "compiled_data")]
445 pub fn new_lstm(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
446 LineSegmenterBorrowed {
447 options: options.resolve(),
448 data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_LINE_V1,
449 complex: ComplexPayloadsBorrowed::new_lstm(),
450 }
451 }
452
453 #[cfg(feature = "lstm")]
454 icu_provider::gen_buffer_data_constructors!(
455 (options: LineBreakOptions) -> error: DataError,
456 functions: [
457 try_new_lstm: skip,
458 try_new_lstm_with_buffer_provider,
459 try_new_lstm_unstable,
460 Self,
461 ]
462 );
463
464 #[cfg(feature = "lstm")]
465 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_lstm)]
466 pub fn try_new_lstm_unstable<D>(
467 provider: &D,
468 options: LineBreakOptions,
469 ) -> Result<Self, DataError>
470 where
471 D: DataProvider<SegmenterBreakLineV1>
472 + DataProvider<SegmenterLstmAutoV1>
473 + DataProvider<SegmenterBreakGraphemeClusterV1>
474 + ?Sized,
475 {
476 Ok(Self {
477 options: options.resolve(),
478 payload: provider.load(Default::default())?.payload,
479 complex: ComplexPayloads::try_new_lstm(provider)?,
480 })
481 }
482
483 #[cfg(feature = "compiled_data")]
493 pub fn new_dictionary(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
494 LineSegmenterBorrowed {
495 options: options.resolve(),
496 data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_LINE_V1,
497 complex: ComplexPayloadsBorrowed::new_southeast_asian(),
504 }
505 }
506
507 icu_provider::gen_buffer_data_constructors!(
508 (options: LineBreakOptions) -> error: DataError,
509 functions: [
510 new_dictionary: skip,
511 try_new_dictionary_with_buffer_provider,
512 try_new_dictionary_unstable,
513 Self,
514 ]
515 );
516
517 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary)]
518 pub fn try_new_dictionary_unstable<D>(
519 provider: &D,
520 options: LineBreakOptions,
521 ) -> Result<Self, DataError>
522 where
523 D: DataProvider<SegmenterBreakLineV1>
524 + DataProvider<SegmenterDictionaryExtendedV1>
525 + DataProvider<SegmenterBreakGraphemeClusterV1>
526 + ?Sized,
527 {
528 Ok(Self {
529 options: options.resolve(),
530 payload: provider.load(Default::default())?.payload,
531 complex: ComplexPayloads::try_new_southeast_asian(provider)?,
538 })
539 }
540
541 #[cfg(feature = "compiled_data")]
548 pub const fn new_for_non_complex_scripts(
549 options: LineBreakOptions,
550 ) -> LineSegmenterBorrowed<'static> {
551 LineSegmenterBorrowed {
552 options: options.resolve(),
553 data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_LINE_V1,
554 complex: ComplexPayloadsBorrowed::empty(),
555 }
556 }
557
558 icu_provider::gen_buffer_data_constructors!(
559 (options: LineBreakOptions) -> error: DataError,
560 functions: [
561 new_for_non_complex_scripts: skip,
562 try_new_for_non_complex_scripts_with_buffer_provider,
563 try_new_for_non_complex_scripts_unstable,
564 Self,
565 ]
566 );
567
568 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_for_non_complex_scripts)]
569 pub fn try_new_for_non_complex_scripts_unstable<D>(
570 provider: &D,
571 options: LineBreakOptions,
572 ) -> Result<Self, DataError>
573 where
574 D: DataProvider<SegmenterBreakLineV1>
575 + DataProvider<SegmenterBreakGraphemeClusterV1>
576 + ?Sized,
577 {
578 Ok(Self {
579 options: options.resolve(),
580 payload: provider.load(Default::default())?.payload,
581 complex: ComplexPayloads::try_new_empty(provider)?,
582 })
583 }
584
585 pub fn as_borrowed(&self) -> LineSegmenterBorrowed<'_> {
589 LineSegmenterBorrowed {
590 options: self.options,
591 data: self.payload.get(),
592 complex: self.complex.as_borrowed(),
593 }
594 }
595}
596
597impl<'data> LineSegmenterBorrowed<'data> {
598 pub fn segment_str<'s>(self, input: &'s str) -> LineBreakIterator<'data, 's, Utf8> {
602 LineBreakIterator {
603 iter: input.char_indices(),
604 len: input.len(),
605 current_pos_data: None,
606 result_cache: Vec::new(),
607 data: self.data,
608 options: self.options,
609 complex: self.complex,
610 }
611 }
612 pub fn segment_utf8<'s>(
618 self,
619 input: &'s [u8],
620 ) -> LineBreakIterator<'data, 's, PotentiallyIllFormedUtf8> {
621 LineBreakIterator {
622 iter: Utf8CharIndices::new(input),
623 len: input.len(),
624 current_pos_data: None,
625 result_cache: Vec::new(),
626 data: self.data,
627 options: self.options,
628 complex: self.complex,
629 }
630 }
631 pub fn segment_latin1<'s>(self, input: &'s [u8]) -> LineBreakIterator<'data, 's, Latin1> {
635 LineBreakIterator {
636 iter: Latin1Indices::new(input),
637 len: input.len(),
638 current_pos_data: None,
639 result_cache: Vec::new(),
640 data: self.data,
641 options: self.options,
642 complex: self.complex,
643 }
644 }
645
646 pub fn segment_utf16<'s>(self, input: &'s [u16]) -> LineBreakIterator<'data, 's, Utf16> {
650 LineBreakIterator {
651 iter: Utf16Indices::new(input),
652 len: input.len(),
653 current_pos_data: None,
654 result_cache: Vec::new(),
655 data: self.data,
656 options: self.options,
657 complex: self.complex,
658 }
659 }
660}
661
662impl LineSegmenterBorrowed<'static> {
663 pub fn static_to_owned(self) -> LineSegmenter {
668 LineSegmenter {
669 payload: DataPayload::from_static_ref(self.data),
670 complex: self.complex.static_to_owned(),
671 options: self.options,
672 }
673 }
674}
675
676impl RuleBreakData<'_> {
677 fn get_linebreak_property_utf32_with_rule(
678 &self,
679 codepoint: u32,
680 strictness: LineBreakStrictness,
681 word_option: LineBreakWordOption,
682 ) -> u8 {
683 let prop = self.property_table.get32(codepoint);
685
686 if word_option == LineBreakWordOption::BreakAll
687 || strictness == LineBreakStrictness::Loose
688 || strictness == LineBreakStrictness::Normal
689 {
690 return match prop {
691 CJ => ID, _ => prop,
693 };
694 }
695
696 prop
699 }
700
701 #[inline]
702 fn get_break_state_from_table(&self, left: u8, right: u8) -> BreakState {
703 let idx = (left as usize) * (self.property_count as usize) + (right as usize);
704 self.break_state_table.get(idx).unwrap_or(BreakState::Keep)
706 }
707
708 #[inline]
709 fn use_complex_breaking_utf32(&self, codepoint: u32) -> bool {
710 let line_break_property = self.get_linebreak_property_utf32_with_rule(
711 codepoint,
712 LineBreakStrictness::Strict,
713 LineBreakWordOption::Normal,
714 );
715
716 line_break_property == SA
717 }
718}
719
720#[inline]
721fn is_break_utf32_by_loose(
722 right_codepoint: u32,
723 left_prop: u8,
724 right_prop: u8,
725 ja_zh: bool,
726) -> Option<bool> {
727 if right_prop == BA {
729 if left_prop == ID && (right_codepoint == 0x2010 || right_codepoint == 0x2013) {
730 return Some(true);
731 }
732 } else if right_prop == NS {
733 if right_codepoint == 0x301C || right_codepoint == 0x30A0 {
735 return Some(ja_zh);
736 }
737
738 if right_codepoint == 0x3005
740 || right_codepoint == 0x303B
741 || right_codepoint == 0x309D
742 || right_codepoint == 0x309E
743 || right_codepoint == 0x30FD
744 || right_codepoint == 0x30FE
745 {
746 return Some(true);
747 }
748
749 if right_codepoint == 0x30FB
751 || right_codepoint == 0xFF1A
752 || right_codepoint == 0xFF1B
753 || right_codepoint == 0xFF65
754 || right_codepoint == 0x203C
755 || (0x2047..=0x2049).contains(&right_codepoint)
756 {
757 return Some(ja_zh);
758 }
759 } else if right_prop == IN {
760 return Some(true);
762 } else if right_prop == EX {
763 if right_codepoint == 0xFF01 || right_codepoint == 0xFF1F {
765 return Some(ja_zh);
766 }
767 }
768
769 if right_prop == PO_EAW {
772 return Some(ja_zh);
773 }
774 if left_prop == PR_EAW {
777 return Some(ja_zh);
778 }
779 None
780}
781
/// Per-encoding hooks used by [`LineBreakIterator`].
///
/// The trait is sealed (it requires `crate::private::Sealed`), so it cannot be implemented
/// outside this crate. This file implements it for `Utf8`, `PotentiallyIllFormedUtf8`,
/// `Latin1` and `Utf16`.
pub trait LineBreakType: crate::private::Sealed + Sized + RuleBreakType {
    /// Returns whether `c` has to be segmented by the complex-script segmenter.
    #[doc(hidden)]
    fn use_complex_breaking(iterator: &LineBreakIterator<'_, '_, Self>, c: Self::CharType) -> bool;

    /// Returns the line break property of `c`, taking the iterator's options into account.
    #[doc(hidden)]
    fn get_linebreak_property_with_rule(
        iterator: &LineBreakIterator<'_, '_, Self>,
        c: Self::CharType,
    ) -> u8;

    /// Segments the run of complex-script text that starts at `left_codepoint` and returns
    /// the first break position found in it, if any.
    ///
    /// The iterator is expected to be positioned on the unit that follows `left_codepoint`.
    #[doc(hidden)]
    fn line_handle_complex_language(
        iterator: &mut LineBreakIterator<'_, '_, Self>,
        left_codepoint: Self::CharType,
    ) -> Option<usize>;
}
806
/// Iterator over the line break opportunities of a piece of text.
///
/// Each item is an offset into the input, measured in the code units of the encoding `Y`.
/// Offset `0` is always reported first, and the end of the input is reported last.
#[derive(Debug)]
pub struct LineBreakIterator<'data, 's, Y: LineBreakType> {
    /// Source of `(offset, code point)` pairs for the input.
    iter: Y::IterAttr<'s>,
    /// Length of the input in code units. For an empty input this is bumped to `1` after the
    /// initial boundary has been reported, so that the next call ends the iteration.
    len: usize,
    /// The unit the iterator currently stands on; `None` before the first call and at the
    /// end of the input.
    current_pos_data: Option<(usize, Y::CharType)>,
    /// Pending break offsets produced by the complex-script segmenter, relative to the
    /// current position.
    result_cache: Vec<usize>,
    /// Rule data: property table and break state table.
    data: &'data RuleBreakData<'data>,
    /// Options with all defaults already applied.
    options: ResolvedLineBreakOptions,
    /// Data for segmenting runs of complex-script (`SA`) text.
    complex: ComplexPayloadsBorrowed<'data>,
}
829
impl<Y: LineBreakType> Iterator for LineBreakIterator<'_, '_, Y> {
    type Item = usize;

    /// Returns the next line break opportunity.
    ///
    /// The first call yields `0`, the last successful call yields the input length, and after
    /// that the iterator returns `None`.
    fn next(&mut self) -> Option<Self::Item> {
        // Handle the string boundaries: the start of the text is always a boundary, and once
        // the input is exhausted there is nothing left to report.
        match self.check_eof() {
            StringBoundaryPosType::Start => return Some(0),
            StringBoundaryPosType::End => return None,
            _ => (),
        }

        // Breaks left over from a previous complex-script segmentation are served first.
        // The cached offsets are relative to the current position, so walk forward until the
        // consumed length reaches the first cached offset.
        if let Some(&first_pos) = self.result_cache.first() {
            let mut i = 0;
            loop {
                if i == first_pos {
                    // Drop the entry that is being returned and rebase the remaining ones.
                    self.result_cache = self.result_cache.iter().skip(1).map(|r| r - i).collect();
                    return self.get_current_position();
                }
                i += self.get_current_codepoint().map_or(0, Y::char_len);
                self.advance_iter();
                if self.is_eof() {
                    self.result_cache.clear();
                    return Some(self.len);
                }
            }
        }

        // State carried across absorbed combining marks / ZWJ (the names refer to UAX #14
        // rules LB9 and LB8a):
        // - `lb9_left`: property of the base unit that the absorbed units are attached to.
        // - `lb8a_after_lb9`: whether the most recently absorbed unit was a ZWJ.
        let mut lb9_left: Option<u8> = None;
        let mut lb8a_after_lb9 = false;

        'a: loop {
            debug_assert!(!self.is_eof());
            let left_codepoint = self.get_current_codepoint()?;
            // While absorbing marks, the left property stays that of the base unit.
            let mut left_prop =
                lb9_left.unwrap_or_else(|| self.get_linebreak_property(left_codepoint));
            // True when the unit directly to the left of the candidate position is a ZWJ.
            let after_zwj = lb8a_after_lb9 || (lb9_left.is_none() && left_prop == ZWJ);
            self.advance_iter();

            // Reaching the end of the input always produces the final boundary.
            let Some(right_codepoint) = self.get_current_codepoint() else {
                return Some(self.len);
            };
            let right_prop = self.get_linebreak_property(right_codepoint);
            // Absorb a combining mark (or a ZWJ, unless breaking anywhere) into the unit on
            // its left, except after BK, CR, LF, NL, SP and ZW.
            if (right_prop == CM
                || (right_prop == ZWJ && self.options.strictness != LineBreakStrictness::Anywhere))
                && left_prop != BK
                && left_prop != CR
                && left_prop != LF
                && left_prop != NL
                && left_prop != SP
                && left_prop != ZW
            {
                lb9_left = Some(left_prop);
                lb8a_after_lb9 = right_prop == ZWJ;
                continue;
            } else {
                lb9_left = None;
                lb8a_after_lb9 = false;
            }

            // Word option handling.
            match (self.options.word_option, left_prop, right_prop) {
                // break-all: letters, numbers and complex-script units on the left behave
                // like ideographs.
                (LineBreakWordOption::BreakAll, AL | NU | SA, _) => {
                    left_prop = ID;
                }
                // keep-all: never break between two units of these classes.
                (
                    LineBreakWordOption::KeepAll,
                    AI | AL | ID | NU | HY | H2 | H3 | JL | JV | JT | CJ,
                    AI | AL | ID | NU | HY | H2 | H3 | JL | JV | JT | CJ,
                ) => {
                    continue;
                }
                _ => (),
            }

            // Strictness handling.
            match self.options.strictness {
                LineBreakStrictness::Normal => {
                    if self.is_break_by_normal(right_codepoint) && !after_zwj {
                        return self.get_current_position();
                    }
                }
                LineBreakStrictness::Loose => {
                    // When the loose rules have an opinion, it is final for this position.
                    if let Some(breakable) = is_break_utf32_by_loose(
                        right_codepoint.into(),
                        left_prop,
                        right_prop,
                        self.options.ja_zh,
                    ) {
                        if breakable && !after_zwj {
                            return self.get_current_position();
                        }
                        continue;
                    }
                }
                LineBreakStrictness::Anywhere => {
                    // Every position that reaches this point is a break opportunity.
                    return self.get_current_position();
                }
                _ => (),
            };

            // A pair of complex-script units is delegated to the complex segmenter, unless
            // break-all is in effect. If the segmenter reports nothing, fall through to the
            // table lookup.
            if self.options.word_option != LineBreakWordOption::BreakAll
                && Y::use_complex_breaking(self, left_codepoint)
                && Y::use_complex_breaking(self, right_codepoint)
            {
                let result = Y::line_handle_complex_language(self, left_codepoint);
                if result.is_some() {
                    return result;
                }
            }

            // Regular, table-driven rules.
            match self.data.get_break_state_from_table(left_prop, right_prop) {
                BreakState::Break | BreakState::NoMatch => {
                    // A break directly after a ZWJ is suppressed.
                    if after_zwj {
                        continue;
                    } else {
                        return self.get_current_position();
                    }
                }
                BreakState::Keep => continue,
                // The decision depends on further context: look ahead, remembering where to
                // fall back to if the longer rule turns out not to match.
                BreakState::Index(mut index) | BreakState::Intermediate(mut index) => {
                    let mut previous_iter = self.iter.clone();
                    let mut previous_pos_data = self.current_pos_data;
                    let mut previous_is_after_zwj = after_zwj;

                    // Property of the unit just consumed, before any mark absorption.
                    let mut left_prop_pre_lb9 = right_prop;

                    // NOTE(review): `lb8a_after_lb9` was reset to `false` a few lines above,
                    // so the first branch looks unreachable here — confirm.
                    let is_intermediate_rule_no_match = if lb8a_after_lb9 {
                        true
                    } else {
                        index > self.data.last_codepoint_property
                    };

                    loop {
                        self.advance_iter();
                        // Shadows the outer flag: here it describes the look-ahead position.
                        let after_zwj = left_prop_pre_lb9 == ZWJ;

                        // Whether `index` still denotes a plain code point property rather
                        // than a state introduced by a multi-unit rule.
                        let previous_break_state_is_cp_prop =
                            index <= self.data.last_codepoint_property;

                        let Some(prop) = self.get_current_linebreak_property() else {
                            // End of input while looking ahead: resolve the pending state
                            // against the end-of-text property.
                            let break_state = self
                                .data
                                .get_break_state_from_table(index, self.data.eot_property);
                            if break_state == BreakState::NoMatch {
                                // The longer rule did not match: rewind to the saved point.
                                self.iter = previous_iter;
                                self.current_pos_data = previous_pos_data;
                                if previous_is_after_zwj {
                                    continue 'a;
                                } else {
                                    return self.get_current_position();
                                }
                            }
                            return Some(self.len);
                        };

                        // Same mark absorption as in the outer loop, applied while looking
                        // ahead.
                        if (prop == CM || prop == ZWJ)
                            && left_prop_pre_lb9 != BK
                            && left_prop_pre_lb9 != CR
                            && left_prop_pre_lb9 != LF
                            && left_prop_pre_lb9 != NL
                            && left_prop_pre_lb9 != SP
                            && left_prop_pre_lb9 != ZW
                        {
                            left_prop_pre_lb9 = prop;
                            continue;
                        }

                        match self.data.get_break_state_from_table(index, prop) {
                            BreakState::Keep => continue 'a,
                            BreakState::NoMatch => {
                                // Rewind to the saved point, then decide whether that point
                                // is a break, honouring the ZWJ suppression.
                                self.iter = previous_iter;
                                self.current_pos_data = previous_pos_data;
                                if after_zwj {
                                    if is_intermediate_rule_no_match && !previous_is_after_zwj {
                                        return self.get_current_position();
                                    }
                                    continue 'a;
                                } else if previous_is_after_zwj {
                                    continue 'a;
                                } else {
                                    return self.get_current_position();
                                }
                            }
                            BreakState::Break => {
                                if after_zwj {
                                    continue 'a;
                                } else {
                                    return self.get_current_position();
                                }
                            }
                            BreakState::Intermediate(i) => {
                                // Move to the next state and update the fallback point.
                                index = i;
                                previous_iter = self.iter.clone();
                                previous_pos_data = self.current_pos_data;
                                previous_is_after_zwj = after_zwj;
                            }
                            BreakState::Index(i) => {
                                // Move to the next state; the fallback point only moves
                                // when the previous state was a plain code point property.
                                index = i;
                                if previous_break_state_is_cp_prop {
                                    previous_iter = self.iter.clone();
                                    previous_pos_data = self.current_pos_data;
                                    previous_is_after_zwj = after_zwj;
                                }
                            }
                        }
                        left_prop_pre_lb9 = prop;
                    }
                }
            }
        }
    }
}
1069
/// Where the iterator stands relative to the input, as reported by `check_eof`.
enum StringBoundaryPosType {
    /// Iteration has just begun; the boundary at offset `0` must be reported.
    Start,
    /// The iterator stands on a unit of the input.
    Middle,
    /// The input is exhausted; nothing more is reported.
    End,
}
1075
1076impl<Y: LineBreakType> LineBreakIterator<'_, '_, Y> {
1077 fn advance_iter(&mut self) {
1078 self.current_pos_data = self.iter.next();
1079 }
1080
1081 fn is_eof(&self) -> bool {
1082 self.current_pos_data.is_none()
1083 }
1084
1085 #[inline]
1086 fn check_eof(&mut self) -> StringBoundaryPosType {
1087 if self.is_eof() {
1088 self.advance_iter();
1089 if self.is_eof() {
1090 if self.len == 0 {
1091 self.len = 1;
1095 StringBoundaryPosType::Start
1096 } else {
1097 StringBoundaryPosType::End
1098 }
1099 } else {
1100 StringBoundaryPosType::Start
1101 }
1102 } else {
1103 StringBoundaryPosType::Middle
1104 }
1105 }
1106
1107 fn get_current_position(&self) -> Option<usize> {
1108 self.current_pos_data.map(|(pos, _)| pos)
1109 }
1110
1111 fn get_current_codepoint(&self) -> Option<Y::CharType> {
1112 self.current_pos_data.map(|(_, codepoint)| codepoint)
1113 }
1114
1115 fn get_linebreak_property(&self, codepoint: Y::CharType) -> u8 {
1116 Y::get_linebreak_property_with_rule(self, codepoint)
1117 }
1118
1119 fn get_current_linebreak_property(&self) -> Option<u8> {
1120 self.get_current_codepoint()
1121 .map(|c| self.get_linebreak_property(c))
1122 }
1123
1124 fn is_break_by_normal(&self, codepoint: Y::CharType) -> bool {
1125 match codepoint.into() {
1126 0x301C | 0x30A0 => self.options.ja_zh,
1127 _ => false,
1128 }
1129 }
1130}
1131
1132impl LineBreakType for Utf8 {
1133 fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 {
1134 iterator.data.get_linebreak_property_utf32_with_rule(
1135 c as u32,
1136 iterator.options.strictness,
1137 iterator.options.word_option,
1138 )
1139 }
1140
1141 #[inline]
1142 fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool {
1143 iterator.data.use_complex_breaking_utf32(c as u32)
1144 }
1145
1146 fn line_handle_complex_language(
1147 iter: &mut LineBreakIterator<'_, '_, Self>,
1148 left_codepoint: char,
1149 ) -> Option<usize> {
1150 line_handle_complex_language_utf8(iter, left_codepoint)
1151 }
1152}
1153
1154impl LineBreakType for PotentiallyIllFormedUtf8 {
1155 fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 {
1156 iterator.data.get_linebreak_property_utf32_with_rule(
1157 c as u32,
1158 iterator.options.strictness,
1159 iterator.options.word_option,
1160 )
1161 }
1162
1163 #[inline]
1164 fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool {
1165 iterator.data.use_complex_breaking_utf32(c as u32)
1166 }
1167
1168 fn line_handle_complex_language(
1169 iter: &mut LineBreakIterator<'_, '_, Self>,
1170 left_codepoint: char,
1171 ) -> Option<usize> {
1172 line_handle_complex_language_utf8(iter, left_codepoint)
1173 }
1174}
1175fn line_handle_complex_language_utf8<T>(
1177 iter: &mut LineBreakIterator<'_, '_, T>,
1178 left_codepoint: char,
1179) -> Option<usize>
1180where
1181 T: LineBreakType<CharType = char>,
1182{
1183 let start_iter = iter.iter.clone();
1185 let start_point = iter.current_pos_data;
1186 let mut s = String::new();
1187 s.push(left_codepoint);
1188 loop {
1189 debug_assert!(!iter.is_eof());
1190 s.push(iter.get_current_codepoint()?);
1191 iter.advance_iter();
1192 if let Some(current_codepoint) = iter.get_current_codepoint() {
1193 if !T::use_complex_breaking(iter, current_codepoint) {
1194 break;
1195 }
1196 } else {
1197 break;
1199 }
1200 }
1201
1202 iter.iter = start_iter;
1204 iter.current_pos_data = start_point;
1205 let breaks = iter.complex.complex_language_segment_str(&s);
1206 iter.result_cache = breaks;
1207 let first_pos = *iter.result_cache.first()?;
1208 let mut i = left_codepoint.len_utf8();
1209 loop {
1210 if i == first_pos {
1211 iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
1213 return iter.get_current_position();
1214 }
1215 debug_assert!(
1216 i < first_pos,
1217 "we should always arrive at first_pos: near index {:?}",
1218 iter.get_current_position()
1219 );
1220 i += iter.get_current_codepoint().map_or(0, T::char_len);
1221 iter.advance_iter();
1222 if iter.is_eof() {
1223 iter.result_cache.clear();
1224 return Some(iter.len);
1225 }
1226 }
1227}
1228
1229impl LineBreakType for Latin1 {
1230 fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u8) -> u8 {
1231 iterator.data.property_table.get32(c as u32)
1234 }
1235
1236 #[inline]
1237 fn use_complex_breaking(_iterator: &LineBreakIterator<Self>, _c: u8) -> bool {
1238 false
1239 }
1240
1241 fn line_handle_complex_language(
1242 _: &mut LineBreakIterator<Self>,
1243 _: Self::CharType,
1244 ) -> Option<usize> {
1245 unreachable!()
1246 }
1247}
1248
1249impl LineBreakType for Utf16 {
1250 fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u32) -> u8 {
1251 iterator.data.get_linebreak_property_utf32_with_rule(
1252 c,
1253 iterator.options.strictness,
1254 iterator.options.word_option,
1255 )
1256 }
1257
1258 #[inline]
1259 fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: u32) -> bool {
1260 iterator.data.use_complex_breaking_utf32(c)
1261 }
1262
1263 fn line_handle_complex_language(
1264 iterator: &mut LineBreakIterator<Self>,
1265 left_codepoint: Self::CharType,
1266 ) -> Option<usize> {
1267 let start_iter = iterator.iter.clone();
1269 let start_point = iterator.current_pos_data;
1270 let mut s = vec![left_codepoint as u16];
1271 loop {
1272 debug_assert!(!iterator.is_eof());
1273 s.push(iterator.get_current_codepoint()? as u16);
1274 iterator.advance_iter();
1275 if let Some(current_codepoint) = iterator.get_current_codepoint() {
1276 if !Self::use_complex_breaking(iterator, current_codepoint) {
1277 break;
1278 }
1279 } else {
1280 break;
1282 }
1283 }
1284
1285 iterator.iter = start_iter;
1287 iterator.current_pos_data = start_point;
1288 let breaks = iterator.complex.complex_language_segment_utf16(&s);
1289 iterator.result_cache = breaks;
1290 let first_pos = *iterator.result_cache.first()?;
1292 let mut i = 1;
1293 loop {
1294 if i == first_pos {
1295 iterator.result_cache = iterator
1297 .result_cache
1298 .iter()
1299 .skip(1)
1300 .map(|r| r - i)
1301 .collect();
1302 return iterator.get_current_position();
1303 }
1304 debug_assert!(
1305 i < first_pos,
1306 "we should always arrive at first_pos: near index {:?}",
1307 iterator.get_current_position()
1308 );
1309 i += 1;
1310 iterator.advance_iter();
1311 if iterator.is_eof() {
1312 iterator.result_cache.clear();
1313 return Some(iterator.len);
1314 }
1315 }
1316 }
1317}
1318
1319#[cfg(test)]
1320#[cfg(feature = "serde")]
1321mod tests {
1322 use super::*;
1323 use crate::LineSegmenter;
1324
1325 #[test]
1326 fn linebreak_property() {
1327 let payload =
1328 DataProvider::<SegmenterBreakLineV1>::load(&crate::provider::Baked, Default::default())
1329 .expect("Loading should succeed!")
1330 .payload;
1331
1332 let get_linebreak_property = |codepoint| {
1333 payload.get().get_linebreak_property_utf32_with_rule(
1334 codepoint as u32,
1335 LineBreakStrictness::Strict,
1336 LineBreakWordOption::Normal,
1337 )
1338 };
1339
1340 assert_eq!(get_linebreak_property('\u{0020}'), SP);
1341 assert_eq!(get_linebreak_property('\u{0022}'), QU);
1342 assert_eq!(get_linebreak_property('('), OP_OP30);
1343 assert_eq!(get_linebreak_property('\u{0030}'), NU);
1344 assert_eq!(get_linebreak_property('['), OP_OP30);
1345 assert_eq!(get_linebreak_property('\u{1f3fb}'), EM);
1346 assert_eq!(get_linebreak_property('\u{20000}'), ID);
1347 assert_eq!(get_linebreak_property('\u{e0020}'), CM);
1348 assert_eq!(get_linebreak_property('\u{3041}'), CJ);
1349 assert_eq!(get_linebreak_property('\u{0025}'), PO);
1350 assert_eq!(get_linebreak_property('\u{00A7}'), AI);
1351 assert_eq!(get_linebreak_property('\u{50005}'), XX);
1352 assert_eq!(get_linebreak_property('\u{17D6}'), NS);
1353 assert_eq!(get_linebreak_property('\u{2014}'), B2);
1354 }
1355
    /// Checks individual `(left, right)` entries of the break state table of the compiled
    /// line break data.
    ///
    /// A pair counts as a break when the table yields `Break` or `NoMatch`.
    ///
    /// NOTE(review): the rule labels in the comments below are this reviewer's mapping of the
    /// pairs onto UAX #14 rules — confirm against the specification.
    #[test]
    // Comparing against literal booleans keeps the expected value visible on every line.
    #[expect(clippy::bool_assert_comparison)]
    fn break_rule() {
        let payload =
            DataProvider::<SegmenterBreakLineV1>::load(&crate::provider::Baked, Default::default())
                .expect("Loading should succeed!")
                .payload;
        let lb_data: &RuleBreakData = payload.get();

        let is_break = |left, right| {
            matches!(
                lb_data.get_break_state_from_table(left, right),
                BreakState::Break | BreakState::NoMatch
            )
        };

        // LB4: always break after a hard line break.
        assert_eq!(is_break(BK, AL), true);
        // LB5: CR LF stays together; otherwise break after CR, LF and NL.
        assert_eq!(is_break(CR, LF), false);
        assert_eq!(is_break(CR, AL), true);
        assert_eq!(is_break(LF, AL), true);
        assert_eq!(is_break(NL, AL), true);
        // LB6: no break before hard line breaks.
        assert_eq!(is_break(AL, BK), false);
        assert_eq!(is_break(AL, CR), false);
        assert_eq!(is_break(AL, LF), false);
        assert_eq!(is_break(AL, NL), false);
        // LB7: no break before spaces or zero width space.
        assert_eq!(is_break(AL, SP), false);
        assert_eq!(is_break(AL, ZW), false);
        // LB8a: no break after a zero width joiner.
        assert_eq!(is_break(ZWJ, SP), false);
        // LB9/LB10: a combining mark after a space is not attached to it.
        assert_eq!(is_break(SP, CM), true);
        // LB11: no break before or after word joiner.
        assert_eq!(is_break(AL, WJ), false);
        assert_eq!(is_break(WJ, AL), false);
        // LB12: no break after glue.
        assert_eq!(is_break(GL, AL), false);
        // LB12a: no break before glue, except after spaces (and hyphens).
        assert_eq!(is_break(AL, GL), false);
        assert_eq!(is_break(SP, GL), true);
        // LB13: no break before closing punctuation, exclamation, separators.
        assert_eq!(is_break(AL, CL), false);
        assert_eq!(is_break(AL, CP), false);
        assert_eq!(is_break(AL, EX), false);
        assert_eq!(is_break(AL, IS), false);
        assert_eq!(is_break(AL, SY), false);
        // LB18: break after spaces.
        assert_eq!(is_break(SP, AL), true);
        // LB19: no break before or after quotation marks.
        assert_eq!(is_break(AL, QU), false);
        assert_eq!(is_break(QU, AL), false);
        // LB20: break before and after contingent break opportunities.
        assert_eq!(is_break(AL, CB), true);
        assert_eq!(is_break(CB, AL), true);
        // LB21: no break before BA, HY, NS; no break after BB.
        assert_eq!(is_break(AL, BA), false);
        assert_eq!(is_break(AL, HY), false);
        assert_eq!(is_break(AL, NS), false);
        assert_eq!(is_break(AL, BA), false);
        assert_eq!(is_break(BB, AL), false);
        assert_eq!(is_break(ID, BA), false);
        assert_eq!(is_break(ID, NS), false);
        // LB21b: no break between solidus and Hebrew letters.
        assert_eq!(is_break(SY, HL), false);
        // LB22: no break before ellipsis.
        assert_eq!(is_break(AL, IN), false);
        // LB23: no break between letters and digits.
        assert_eq!(is_break(AL, NU), false);
        assert_eq!(is_break(HL, NU), false);
        // LB23a: no break between numeric prefixes/postfixes and ideographs or emoji.
        assert_eq!(is_break(PR, ID), false);
        assert_eq!(is_break(PR, EB), false);
        assert_eq!(is_break(PR, EM), false);
        assert_eq!(is_break(ID, PO), false);
        assert_eq!(is_break(EB, PO), false);
        assert_eq!(is_break(EM, PO), false);
        // LB26: no break inside a Korean syllable.
        assert_eq!(is_break(JL, JL), false);
        assert_eq!(is_break(JL, JV), false);
        assert_eq!(is_break(JL, H2), false);
        // LB27: Korean syllable blocks behave like ideographs around IN, PO and PR.
        assert_eq!(is_break(JL, IN), false);
        assert_eq!(is_break(JL, PO), false);
        assert_eq!(is_break(PR, JL), false);
        // LB28: no break between alphabetics.
        assert_eq!(is_break(AL, AL), false);
        assert_eq!(is_break(HL, AL), false);
        // LB29: no break between numeric punctuation and alphabetics.
        assert_eq!(is_break(IS, AL), false);
        assert_eq!(is_break(IS, HL), false);
        // LB30b: no break between an emoji base and an emoji modifier.
        assert_eq!(is_break(EB, EM), false);
        // LB31: break everywhere else.
        assert_eq!(is_break(ID, ID), true);
    }
1457
/// Checks a `LineSegmenter` built with `try_new_dictionary_unstable` on short
/// Latin-script and punctuation inputs, in each of the three input encodings
/// (`segment_str`, `segment_latin1`, `segment_utf16`).
///
/// Every expected list starts at 0 and ends at the input length, measured in
/// the code units of the encoding under test (bytes for `segment_str` and
/// `segment_latin1`, `u16` units for `segment_utf16`).
///
/// Several inputs contain a run of exactly two spaces. In the string literals
/// that run is spelled `\u{0020}\u{0020}` so that it stays in sync with the
/// parallel Latin-1 / UTF-16 arrays (which contain `0x20, 0x20`) and with the
/// expected offsets, and cannot be silently collapsed to a single space.
#[test]
fn linebreak() {
    let segmenter =
        LineSegmenter::try_new_dictionary_unstable(&crate::provider::Baked, Default::default())
            .expect("Data exists");
    let segmenter = segmenter.as_borrowed();

    // Collect every break offset reported for one input, per encoding. Comparing
    // whole lists reports the full actual sequence when an expectation fails.
    let utf8 = |text: &str| segmenter.segment_str(text).collect::<Vec<usize>>();
    let latin1 = |bytes: &[u8]| segmenter.segment_latin1(bytes).collect::<Vec<usize>>();
    let utf16 = |units: &[u16]| segmenter.segment_utf16(units).collect::<Vec<usize>>();

    // Plain words separated by a single space.
    assert_eq!(utf8("hello world"), [0, 6, 11]);
    assert_eq!(utf8("$10 $10"), [0, 4, 7]);

    // '[' followed by two spaces: no break is expected until after "abc ".
    assert_eq!(utf8("[\u{0020}\u{0020}abc def"), [0, 7, 10]);

    let input: [u8; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66];
    assert_eq!(latin1(&input), [0, 7, 10]);

    let input: [u16; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66];
    assert_eq!(utf16(&input), [0, 7, 10]);

    // QUOTATION MARK (U+0022), two spaces, then '(': a break is expected
    // right before the '('.
    assert_eq!(utf8("abc\u{0022}\u{0020}\u{0020}(def"), [0, 6, 10]);

    let input: [u8; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66];
    assert_eq!(latin1(&input), [0, 6, 10]);

    let input: [u16; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66];
    assert_eq!(utf16(&input), [0, 6, 10]);

    // Guillemets separated from the word by single spaces: the whole text is
    // expected to stay on one line. The UTF-8 length is 11 because each
    // guillemet takes two bytes; Latin-1 and UTF-16 both see 9 code units.
    let guillemets = "« miaou »";
    assert_eq!(utf8(guillemets), [0, 11]);

    let input: Vec<u8> = guillemets
        .chars()
        .map(|c| u8::try_from(u32::from(c)).unwrap())
        .collect();
    assert_eq!(latin1(&input), [0, 9]);

    let input: Vec<u16> = guillemets.encode_utf16().collect();
    assert_eq!(utf16(&input), [0, 9]);

    // Guillemets hugging the quoted word; the UTF-8 offsets after the quote
    // are two bytes larger than the Latin-1 / UTF-16 ones.
    let german = "Die Katze hat »miau« gesagt.";
    assert_eq!(utf8(german), [0, 4, 10, 14, 23, 30]);

    let input: Vec<u8> = german
        .chars()
        .map(|c| u8::try_from(u32::from(c)).unwrap())
        .collect();
    assert_eq!(latin1(&input), [0, 4, 10, 14, 21, 28]);

    let input: Vec<u16> = german.encode_utf16().collect();
    assert_eq!(utf16(&input), [0, 4, 10, 14, 21, 28]);

    // ')' followed by DOUBLE EXCLAMATION MARK (U+203C), directly and with two
    // spaces in between: no interior break is expected in either case.
    assert_eq!(utf8("\u{0029}\u{203C}"), [0, 4]);
    assert_eq!(utf8("\u{0029}\u{0020}\u{0020}\u{203C}"), [0, 6]);

    let input: [u16; 4] = [0x29, 0x20, 0x20, 0x203c];
    assert_eq!(utf16(&input), [0, 4]);

    // Pairs of EM DASH (U+2014), adjacent and separated by two spaces: the
    // dashes are expected to stay together, with a break only after them.
    assert_eq!(utf8("\u{2014}\u{2014}aa"), [0, 6, 8]);
    assert_eq!(utf8("\u{2014}\u{0020}\u{0020}\u{2014}aa"), [0, 8, 10]);
    assert_eq!(
        utf8("\u{2014}\u{2014}\u{0020}\u{0020}\u{2014}\u{2014}123 abc"),
        [0, 14, 18, 21]
    );

    let input: [u16; 13] = [
        0x2014, 0x2014, 0x20, 0x20, 0x2014, 0x2014, 0x31, 0x32, 0x33, 0x20, 0x61, 0x62, 0x63,
    ];
    assert_eq!(utf16(&input), [0, 6, 10, 13]);

    // Parenthesised numeric expression: expected to be unbreakable.
    assert_eq!(utf8("(0,1)+(2,3)"), [0, 11]);

    let input: [u16; 11] = [
        0x28, 0x30, 0x2C, 0x31, 0x29, 0x2B, 0x28, 0x32, 0x2C, 0x33, 0x29,
    ];
    assert_eq!(utf16(&input), [0, 11]);

    // Two emoji modifiers (U+1F3FB, four UTF-8 bytes each) separated by a
    // single space: a break is expected after the space.
    assert_eq!(utf8("\u{1F3FB} \u{1F3FB}"), [0, 5, 9]);
}
1639
/// Segments a Thai sample with the segmenter returned by
/// `LineSegmenter::new_lstm`, once as UTF-8 and twice as UTF-16.
///
/// Only compiled when the `lstm` feature is enabled. The last expected offset
/// of each full-text run is the input length in that encoding's code units.
#[test]
#[cfg(feature = "lstm")]
fn thai_line_break() {
    const THAI: &str = "ภาษาไทยภาษาไทย";

    let lstm = LineSegmenter::new_lstm(Default::default());

    // UTF-8 input: offsets are byte positions.
    let byte_offsets = lstm.segment_str(THAI).collect::<Vec<usize>>();
    assert_eq!(byte_offsets, [0, 12, 21, 33, THAI.len()], "Thai test");

    // The same text as UTF-16: offsets are `u16` positions.
    let code_units = THAI.encode_utf16().collect::<Vec<u16>>();
    let unit_offsets = lstm.segment_utf16(&code_units).collect::<Vec<usize>>();
    assert_eq!(unit_offsets, [0, 4, 7, 11, code_units.len()], "Thai test");

    // A four-unit UTF-16 input on its own: no interior break is expected.
    let short_input: [u16; 4] = [0x0e20, 0x0e32, 0x0e29, 0x0e32];
    let short_offsets = lstm.segment_utf16(&short_input).collect::<Vec<usize>>();
    assert_eq!(short_offsets, [0, 4], "Thai test");
}
1657
/// Segments a Burmese sample with the segmenter returned by
/// `LineSegmenter::new_lstm`, as UTF-8 and as UTF-16.
///
/// Only compiled when the `lstm` feature is enabled. The last expected offset
/// in each run is the input length in that encoding's code units.
#[test]
#[cfg(feature = "lstm")]
fn burmese_line_break() {
    const BURMESE: &str = "မြန်မာဘာသာစကား";

    let lstm = LineSegmenter::new_lstm(Default::default());

    // UTF-8 input: offsets are byte positions.
    let byte_offsets = lstm.segment_str(BURMESE).collect::<Vec<usize>>();
    assert_eq!(byte_offsets, [0, 12, 18, 30, BURMESE.len()], "Burmese test");

    // UTF-16 input: offsets are `u16` positions.
    let code_units = BURMESE.encode_utf16().collect::<Vec<u16>>();
    let unit_offsets = lstm.segment_utf16(&code_units).collect::<Vec<usize>>();
    assert_eq!(
        unit_offsets,
        [0, 4, 6, 10, code_units.len()],
        "Burmese utf-16 test"
    );
}
1674
/// Segments a Khmer sample with the segmenter returned by
/// `LineSegmenter::new_lstm`, as UTF-8 and as UTF-16.
///
/// Only compiled when the `lstm` feature is enabled. The last expected offset
/// in each run is the input length in that encoding's code units.
#[test]
#[cfg(feature = "lstm")]
fn khmer_line_break() {
    const KHMER: &str = "សេចក្ដីប្រកាសជាសកលស្ដីពីសិទ្ធិមនុស្ស";

    let lstm = LineSegmenter::new_lstm(Default::default());

    // UTF-8 input: offsets are byte positions.
    let byte_offsets = lstm.segment_str(KHMER).collect::<Vec<usize>>();
    assert_eq!(byte_offsets, [0, 39, 48, 54, 72, KHMER.len()], "Khmer test");

    // UTF-16 input: offsets are `u16` positions.
    let code_units = KHMER.encode_utf16().collect::<Vec<u16>>();
    let unit_offsets = lstm.segment_utf16(&code_units).collect::<Vec<usize>>();
    let expected_units = [0, 13, 16, 18, 24, code_units.len()];
    assert_eq!(unit_offsets, expected_units, "Khmer utf-16 test");
}
1693
/// Segments a Lao sample with the segmenter returned by
/// `LineSegmenter::new_lstm`, as UTF-8 and as UTF-16.
///
/// Only compiled when the `lstm` feature is enabled. The last expected offset
/// in each run is the input length in that encoding's code units.
#[test]
#[cfg(feature = "lstm")]
fn lao_line_break() {
    const LAO: &str = "ກ່ຽວກັບສິດຂອງມະນຸດ";

    let lstm = LineSegmenter::new_lstm(Default::default());

    // UTF-8 input: offsets are byte positions.
    let byte_offsets = lstm.segment_str(LAO).collect::<Vec<usize>>();
    assert_eq!(byte_offsets, [0, 12, 21, 30, 39, LAO.len()], "Lao test");

    // UTF-16 input: offsets are `u16` positions.
    let code_units = LAO.encode_utf16().collect::<Vec<u16>>();
    let unit_offsets = lstm.segment_utf16(&code_units).collect::<Vec<usize>>();
    assert_eq!(
        unit_offsets,
        [0, 4, 7, 10, 13, code_units.len()],
        "Lao utf-16 test"
    );
}
1708
/// Segmenting the empty string with the segmenter from
/// `LineSegmenter::new_auto` is expected to report a single boundary at 0.
#[test]
fn empty_string() {
    let auto = LineSegmenter::new_auto(Default::default());
    let boundaries = auto.segment_str("").collect::<Vec<usize>>();
    assert_eq!(boundaries, [0]);
}
1715}