1use crate::complex::*;
6use crate::indices::*;
7use crate::provider::*;
8use crate::rule_segmenter::*;
9use alloc::string::String;
10use alloc::vec;
11use alloc::vec::Vec;
12use core::char;
13use icu_locale_core::subtags::language;
14use icu_locale_core::LanguageIdentifier;
15use icu_provider::prelude::*;
16use utf8_iter::Utf8CharIndices;
17
// Line break property values.
//
// These are the `u8` codes that the property table lookup returns for a code point
// (see `get_linebreak_property_utf32_with_rule`) and that are combined into an index
// for the break state table (see `get_break_state_from_table`). They therefore have
// to agree with the values stored in the loaded segmentation data.
//
// NOTE(review): the names look like UAX #14 line breaking class abbreviations plus a
// few finer-grained splits (`AL_DOTTED_CIRCLE`, `ID_CN`, `OP_EA`, `OP_OP30`, `PO_EAW`,
// `PR_EAW`, `QU_PF`, `QU_PI`) — confirm against the data generator.
//
// Every constant carries `#[allow(dead_code)]`; presumably this is because only a
// subset is referenced by the rule code (and some only by tests), which would
// otherwise trigger unused-constant warnings.
#[allow(dead_code)]
const UNKNOWN: u8 = 0;
#[allow(dead_code)]
const AI: u8 = 1;
#[allow(dead_code)]
const AK: u8 = 2;
#[allow(dead_code)]
const AL: u8 = 3;
#[allow(dead_code)]
const AL_DOTTED_CIRCLE: u8 = 4;
#[allow(dead_code)]
const AP: u8 = 5;
#[allow(dead_code)]
const AS: u8 = 6;
#[allow(dead_code)]
const B2: u8 = 7;
#[allow(dead_code)]
const BA: u8 = 8;
#[allow(dead_code)]
const BB: u8 = 9;
#[allow(dead_code)]
const BK: u8 = 10;
#[allow(dead_code)]
const CB: u8 = 11;
#[allow(dead_code)]
const CJ: u8 = 12;
#[allow(dead_code)]
const CL: u8 = 13;
#[allow(dead_code)]
const CM: u8 = 14;
#[allow(dead_code)]
const CP: u8 = 15;
#[allow(dead_code)]
const CR: u8 = 16;
#[allow(dead_code)]
const EB: u8 = 17;
#[allow(dead_code)]
const EM: u8 = 18;
#[allow(dead_code)]
const EX: u8 = 19;
#[allow(dead_code)]
const GL: u8 = 20;
#[allow(dead_code)]
const H2: u8 = 21;
#[allow(dead_code)]
const H3: u8 = 22;
#[allow(dead_code)]
const HL: u8 = 23;
#[allow(dead_code)]
const HY: u8 = 24;
#[allow(dead_code)]
const ID: u8 = 25;
#[allow(dead_code)]
const ID_CN: u8 = 26;
#[allow(dead_code)]
const IN: u8 = 27;
#[allow(dead_code)]
const IS: u8 = 28;
#[allow(dead_code)]
const JL: u8 = 29;
#[allow(dead_code)]
const JT: u8 = 30;
#[allow(dead_code)]
const JV: u8 = 31;
#[allow(dead_code)]
const LF: u8 = 32;
#[allow(dead_code)]
const NL: u8 = 33;
#[allow(dead_code)]
const NS: u8 = 34;
#[allow(dead_code)]
const NU: u8 = 35;
#[allow(dead_code)]
const OP_EA: u8 = 36;
#[allow(dead_code)]
const OP_OP30: u8 = 37;
#[allow(dead_code)]
const PO: u8 = 38;
#[allow(dead_code)]
const PO_EAW: u8 = 39;
#[allow(dead_code)]
const PR: u8 = 40;
#[allow(dead_code)]
const PR_EAW: u8 = 41;
#[allow(dead_code)]
const QU: u8 = 42;
#[allow(dead_code)]
const QU_PF: u8 = 43;
#[allow(dead_code)]
const QU_PI: u8 = 44;
#[allow(dead_code)]
const RI: u8 = 45;
#[allow(dead_code)]
const SA: u8 = 46;
#[allow(dead_code)]
const SP: u8 = 47;
#[allow(dead_code)]
const SY: u8 = 48;
#[allow(dead_code)]
const VF: u8 = 49;
#[allow(dead_code)]
const VI: u8 = 50;
#[allow(dead_code)]
const WJ: u8 = 51;
#[allow(dead_code)]
const XX: u8 = 52;
#[allow(dead_code)]
const ZW: u8 = 53;
#[allow(dead_code)]
const ZWJ: u8 = 54;
129
/// Selects how strictly the line breaking rules are applied.
///
/// The default is [`LineBreakStrictness::Strict`].
///
/// NOTE(review): the variant names match the values of the CSS `line-break`
/// property — presumably intentional; confirm before relying on CSS semantics.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub enum LineBreakStrictness {
    /// Code points whose table property is `CJ` are looked up as `ID`, and the
    /// extra break opportunities computed by `is_break_utf32_by_loose` are applied
    /// before the break state table is consulted.
    Loose,

    /// Code points whose table property is `CJ` are looked up as `ID`. In addition,
    /// when the content locale is Japanese or Chinese, a break is allowed before
    /// U+301C and U+30A0.
    Normal,

    /// The property table and break state table are used without the adjustments
    /// made by the other variants. This is the default.
    #[default]
    Strict,

    /// A break is reported between code points without consulting the break state
    /// table. A `CM` code point still stays attached to the code point before it
    /// (unless that one is `BK`, `CR`, `LF`, `NL`, `SP` or `ZW`).
    Anywhere,
}
164
/// Selects how break opportunities between letters are treated.
///
/// The default is [`LineBreakWordOption::Normal`].
///
/// NOTE(review): the variant names match the values of the CSS `word-break`
/// property — presumably intentional; confirm before relying on CSS semantics.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub enum LineBreakWordOption {
    /// No word-level adjustment is made to the rules. This is the default.
    #[default]
    Normal,

    /// A left-hand code point with property `AL`, `NU` or `SA` is treated as `ID`,
    /// `CJ` is looked up as `ID`, and the complex (dictionary/LSTM) breaker is not
    /// invoked.
    BreakAll,

    /// No break is reported between two code points whose properties are both in
    /// the set `AI`, `AL`, `ID`, `NU`, `HY`, `H2`, `H3`, `JL`, `JV`, `JT`, `CJ`.
    KeepAll,
}
187
/// Caller-facing options for configuring a line segmenter.
///
/// Every field is optional; `Default::default()` leaves all of them unset.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub struct LineBreakOptions<'a> {
    /// Strictness of the line breaking rules. `None` resolves to the default
    /// variant of [`LineBreakStrictness`].
    pub strictness: Option<LineBreakStrictness>,

    /// Word-level break behavior. `None` resolves to the default variant of
    /// [`LineBreakWordOption`].
    pub word_option: Option<LineBreakWordOption>,

    /// Locale of the content being segmented. Only its language subtag is
    /// inspected, to detect Japanese (`ja`) or Chinese (`zh`) content; `None` is
    /// treated as neither.
    pub content_locale: Option<&'a LanguageIdentifier>,
}
210
/// [`LineBreakOptions`] with all defaults applied and the locale reduced to the
/// single fact the rules need.
#[derive(Debug, Clone, Copy)]
struct ResolvedLineBreakOptions {
    // Strictness after substituting the default for `None`.
    strictness: LineBreakStrictness,
    // Word option after substituting the default for `None`.
    word_option: LineBreakWordOption,
    // `true` when the content locale's language is `ja` or `zh`.
    ja_zh: bool,
}
217
218impl From<LineBreakOptions<'_>> for ResolvedLineBreakOptions {
219 fn from(options: LineBreakOptions<'_>) -> Self {
220 let ja_zh = if let Some(content_locale) = options.content_locale.as_ref() {
221 content_locale.language == language!("ja") || content_locale.language == language!("zh")
222 } else {
223 false
224 };
225 Self {
226 strictness: options.strictness.unwrap_or_default(),
227 word_option: options.word_option.unwrap_or_default(),
228 ja_zh,
229 }
230 }
231}
232
/// Owning line segmenter: holds the resolved options together with the data
/// payloads needed to compute line break opportunities.
///
/// Segmentation itself is performed through [`LineSegmenterBorrowed`], obtained via
/// [`LineSegmenter::as_borrowed`].
#[derive(Debug)]
pub struct LineSegmenter {
    // Options with defaults already applied.
    options: ResolvedLineBreakOptions,
    // Rule-based line break data (property table and break state table).
    payload: DataPayload<SegmenterBreakLineV1>,
    // Payloads used by the complex (dictionary/LSTM) breaker.
    complex: ComplexPayloads,
}
360
/// Borrowed, `Copy` view of a line segmenter. The `segment_*` methods on this type
/// create the actual break iterators.
#[derive(Clone, Debug, Copy)]
pub struct LineSegmenterBorrowed<'data> {
    // Options with defaults already applied.
    options: ResolvedLineBreakOptions,
    // Rule-based line break data borrowed from a payload or from compiled data.
    data: &'data RuleBreakData<'data>,
    // Borrowed payloads used by the complex (dictionary/LSTM) breaker.
    complex: ComplexPayloadsBorrowed<'data>,
}
370
371impl LineSegmenter {
372 #[cfg(feature = "auto")]
383 #[cfg(feature = "compiled_data")]
384 pub fn new_auto(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
385 Self::new_lstm(options)
386 }
387
388 #[cfg(feature = "auto")]
389 icu_provider::gen_buffer_data_constructors!(
390 (options: LineBreakOptions) -> error: DataError,
391 functions: [
392 new_auto: skip,
393 try_new_auto_with_buffer_provider,
394 try_new_auto_unstable,
395 Self,
396 ]
397 );
398
399 #[cfg(feature = "auto")]
400 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_auto)]
401 pub fn try_new_auto_unstable<D>(
402 provider: &D,
403 options: LineBreakOptions,
404 ) -> Result<Self, DataError>
405 where
406 D: DataProvider<SegmenterBreakLineV1>
407 + DataProvider<SegmenterLstmAutoV1>
408 + DataProvider<SegmenterBreakGraphemeClusterV1>
409 + ?Sized,
410 {
411 Self::try_new_lstm_unstable(provider, options)
412 }
413
414 #[cfg(feature = "lstm")]
426 #[cfg(feature = "compiled_data")]
427 pub fn new_lstm(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
428 LineSegmenterBorrowed {
429 options: options.into(),
430 data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_LINE_V1,
431 complex: ComplexPayloadsBorrowed::new_lstm(),
432 }
433 }
434
435 #[cfg(feature = "lstm")]
436 icu_provider::gen_buffer_data_constructors!(
437 (options: LineBreakOptions) -> error: DataError,
438 functions: [
439 try_new_lstm: skip,
440 try_new_lstm_with_buffer_provider,
441 try_new_lstm_unstable,
442 Self,
443 ]
444 );
445
446 #[cfg(feature = "lstm")]
447 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_lstm)]
448 pub fn try_new_lstm_unstable<D>(
449 provider: &D,
450 options: LineBreakOptions,
451 ) -> Result<Self, DataError>
452 where
453 D: DataProvider<SegmenterBreakLineV1>
454 + DataProvider<SegmenterLstmAutoV1>
455 + DataProvider<SegmenterBreakGraphemeClusterV1>
456 + ?Sized,
457 {
458 Ok(Self {
459 options: options.into(),
460 payload: provider.load(Default::default())?.payload,
461 complex: ComplexPayloads::try_new_lstm(provider)?,
462 })
463 }
464
465 #[cfg(feature = "compiled_data")]
477 pub fn new_dictionary(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
478 LineSegmenterBorrowed {
479 options: options.into(),
480 data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_LINE_V1,
481 complex: ComplexPayloadsBorrowed::new_southeast_asian(),
488 }
489 }
490
491 icu_provider::gen_buffer_data_constructors!(
492 (options: LineBreakOptions) -> error: DataError,
493 functions: [
494 new_dictionary: skip,
495 try_new_dictionary_with_buffer_provider,
496 try_new_dictionary_unstable,
497 Self,
498 ]
499 );
500
501 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary)]
502 pub fn try_new_dictionary_unstable<D>(
503 provider: &D,
504 options: LineBreakOptions,
505 ) -> Result<Self, DataError>
506 where
507 D: DataProvider<SegmenterBreakLineV1>
508 + DataProvider<SegmenterDictionaryExtendedV1>
509 + DataProvider<SegmenterBreakGraphemeClusterV1>
510 + ?Sized,
511 {
512 Ok(Self {
513 options: options.into(),
514 payload: provider.load(Default::default())?.payload,
515 complex: ComplexPayloads::try_new_southeast_asian(provider)?,
522 })
523 }
524
525 pub fn as_borrowed(&self) -> LineSegmenterBorrowed<'_> {
529 LineSegmenterBorrowed {
530 options: self.options,
531 data: self.payload.get(),
532 complex: self.complex.as_borrowed(),
533 }
534 }
535}
536
537impl<'data> LineSegmenterBorrowed<'data> {
538 pub fn segment_str<'s>(self, input: &'s str) -> LineBreakIterator<'data, 's, Utf8> {
542 LineBreakIterator {
543 iter: input.char_indices(),
544 len: input.len(),
545 current_pos_data: None,
546 result_cache: Vec::new(),
547 data: self.data,
548 options: self.options,
549 complex: self.complex,
550 }
551 }
552 pub fn segment_utf8<'s>(
558 self,
559 input: &'s [u8],
560 ) -> LineBreakIterator<'data, 's, PotentiallyIllFormedUtf8> {
561 LineBreakIterator {
562 iter: Utf8CharIndices::new(input),
563 len: input.len(),
564 current_pos_data: None,
565 result_cache: Vec::new(),
566 data: self.data,
567 options: self.options,
568 complex: self.complex,
569 }
570 }
571 pub fn segment_latin1<'s>(self, input: &'s [u8]) -> LineBreakIterator<'data, 's, Latin1> {
575 LineBreakIterator {
576 iter: Latin1Indices::new(input),
577 len: input.len(),
578 current_pos_data: None,
579 result_cache: Vec::new(),
580 data: self.data,
581 options: self.options,
582 complex: self.complex,
583 }
584 }
585
586 pub fn segment_utf16<'s>(self, input: &'s [u16]) -> LineBreakIterator<'data, 's, Utf16> {
590 LineBreakIterator {
591 iter: Utf16Indices::new(input),
592 len: input.len(),
593 current_pos_data: None,
594 result_cache: Vec::new(),
595 data: self.data,
596 options: self.options,
597 complex: self.complex,
598 }
599 }
600}
601
602impl LineSegmenterBorrowed<'static> {
603 pub fn static_to_owned(self) -> LineSegmenter {
608 LineSegmenter {
609 payload: DataPayload::from_static_ref(self.data),
610 complex: self.complex.static_to_owned(),
611 options: self.options,
612 }
613 }
614}
615
616impl RuleBreakData<'_> {
617 fn get_linebreak_property_utf32_with_rule(
618 &self,
619 codepoint: u32,
620 strictness: LineBreakStrictness,
621 word_option: LineBreakWordOption,
622 ) -> u8 {
623 let prop = self.property_table.get32(codepoint);
625
626 if word_option == LineBreakWordOption::BreakAll
627 || strictness == LineBreakStrictness::Loose
628 || strictness == LineBreakStrictness::Normal
629 {
630 return match prop {
631 CJ => ID, _ => prop,
633 };
634 }
635
636 prop
639 }
640
641 #[inline]
642 fn get_break_state_from_table(&self, left: u8, right: u8) -> BreakState {
643 let idx = (left as usize) * (self.property_count as usize) + (right as usize);
644 self.break_state_table.get(idx).unwrap_or(BreakState::Keep)
646 }
647
648 #[inline]
649 fn use_complex_breaking_utf32(&self, codepoint: u32) -> bool {
650 let line_break_property = self.get_linebreak_property_utf32_with_rule(
651 codepoint,
652 LineBreakStrictness::Strict,
653 LineBreakWordOption::Normal,
654 );
655
656 line_break_property == SA
657 }
658}
659
660#[inline]
661fn is_break_utf32_by_loose(
662 right_codepoint: u32,
663 left_prop: u8,
664 right_prop: u8,
665 ja_zh: bool,
666) -> Option<bool> {
667 if right_prop == BA {
669 if left_prop == ID && (right_codepoint == 0x2010 || right_codepoint == 0x2013) {
670 return Some(true);
671 }
672 } else if right_prop == NS {
673 if right_codepoint == 0x301C || right_codepoint == 0x30A0 {
675 return Some(ja_zh);
676 }
677
678 if right_codepoint == 0x3005
680 || right_codepoint == 0x303B
681 || right_codepoint == 0x309D
682 || right_codepoint == 0x309E
683 || right_codepoint == 0x30FD
684 || right_codepoint == 0x30FE
685 {
686 return Some(true);
687 }
688
689 if right_codepoint == 0x30FB
691 || right_codepoint == 0xFF1A
692 || right_codepoint == 0xFF1B
693 || right_codepoint == 0xFF65
694 || right_codepoint == 0x203C
695 || (0x2047..=0x2049).contains(&right_codepoint)
696 {
697 return Some(ja_zh);
698 }
699 } else if right_prop == IN {
700 return Some(true);
702 } else if right_prop == EX {
703 if right_codepoint == 0xFF01 || right_codepoint == 0xFF1F {
705 return Some(ja_zh);
706 }
707 }
708
709 if right_prop == PO_EAW {
712 return Some(ja_zh);
713 }
714 if left_prop == PR_EAW {
717 return Some(ja_zh);
718 }
719 None
720}
721
/// Encoding-specific hooks used by [`LineBreakIterator`].
///
/// The trait has `crate::private::Sealed` as a supertrait, so it can only be
/// implemented inside this crate. In this file it is implemented for `Utf8`,
/// `PotentiallyIllFormedUtf8`, `Latin1` and `Utf16`.
pub trait LineBreakType: crate::private::Sealed + Sized + RuleBreakType {
    /// Returns whether `c` must be handled by the complex (dictionary/LSTM) breaker.
    #[doc(hidden)]
    fn use_complex_breaking(iterator: &LineBreakIterator<'_, '_, Self>, c: Self::CharType) -> bool;

    /// Returns the line break property of `c`, with any tailoring implied by the
    /// iterator's options applied.
    #[doc(hidden)]
    fn get_linebreak_property_with_rule(
        iterator: &LineBreakIterator<'_, '_, Self>,
        c: Self::CharType,
    ) -> u8;

    /// Segments the run of complex code points that starts at `left_codepoint` and
    /// returns the first break position found, if any. Implementations in this file
    /// store the remaining breaks in the iterator's result cache.
    #[doc(hidden)]
    fn line_handle_complex_language(
        iterator: &mut LineBreakIterator<'_, '_, Self>,
        left_codepoint: Self::CharType,
    ) -> Option<usize>;
}
746
/// Iterator over line break opportunities, yielded as `usize` offsets into the
/// input. Created by the `segment_*` methods of [`LineSegmenterBorrowed`].
///
/// `'data` is the lifetime of the segmenter data, `'s` the lifetime of the input, and
/// `Y` selects the input encoding.
#[derive(Debug)]
pub struct LineBreakIterator<'data, 's, Y: LineBreakType> {
    // Underlying iterator producing `(offset, code point)` pairs.
    iter: Y::IterAttr<'s>,
    // Length of the input; reported as the final boundary. (Set to 1 for empty input
    // by `check_eof` so that iteration terminates.)
    len: usize,
    // The pair most recently pulled from `iter`; `None` before the first pull and at
    // end of input.
    current_pos_data: Option<(usize, Y::CharType)>,
    // Pending break offsets produced by the complex breaker, relative to the current
    // iterator position.
    result_cache: Vec<usize>,
    // Rule-based break data (property table and break state table).
    data: &'data RuleBreakData<'data>,
    // Resolved segmentation options.
    options: ResolvedLineBreakOptions,
    // Payloads for the complex (dictionary/LSTM) breaker.
    complex: ComplexPayloadsBorrowed<'data>,
}
769
770impl<Y: LineBreakType> Iterator for LineBreakIterator<'_, '_, Y> {
771 type Item = usize;
772
773 fn next(&mut self) -> Option<Self::Item> {
774 match self.check_eof() {
775 StringBoundaryPosType::Start => return Some(0),
776 StringBoundaryPosType::End => return None,
777 _ => (),
778 }
779
780 if let Some(&first_pos) = self.result_cache.first() {
782 let mut i = 0;
783 loop {
784 if i == first_pos {
785 self.result_cache = self.result_cache.iter().skip(1).map(|r| r - i).collect();
786 return self.get_current_position();
787 }
788 i += self.get_current_codepoint().map_or(0, Y::char_len);
789 self.advance_iter();
790 if self.is_eof() {
791 self.result_cache.clear();
792 return Some(self.len);
793 }
794 }
795 }
796
797 let mut lb9_left: Option<u8> = None;
799 let mut lb8a_after_lb9 = false;
802
803 'a: loop {
804 debug_assert!(!self.is_eof());
805 let left_codepoint = self.get_current_codepoint()?;
806 let mut left_prop =
807 lb9_left.unwrap_or_else(|| self.get_linebreak_property(left_codepoint));
808 let after_zwj = lb8a_after_lb9 || (lb9_left.is_none() && left_prop == ZWJ);
809 self.advance_iter();
810
811 let Some(right_codepoint) = self.get_current_codepoint() else {
812 return Some(self.len);
813 };
814 let right_prop = self.get_linebreak_property(right_codepoint);
815 if (right_prop == CM
819 || (right_prop == ZWJ && self.options.strictness != LineBreakStrictness::Anywhere))
820 && left_prop != BK
821 && left_prop != CR
822 && left_prop != LF
823 && left_prop != NL
824 && left_prop != SP
825 && left_prop != ZW
826 {
827 lb9_left = Some(left_prop);
828 lb8a_after_lb9 = right_prop == ZWJ;
829 continue;
830 } else {
831 lb9_left = None;
832 lb8a_after_lb9 = false;
833 }
834
835 match (self.options.word_option, left_prop, right_prop) {
837 (LineBreakWordOption::BreakAll, AL | NU | SA, _) => {
838 left_prop = ID;
839 }
840 (
842 LineBreakWordOption::KeepAll,
843 AI | AL | ID | NU | HY | H2 | H3 | JL | JV | JT | CJ,
844 AI | AL | ID | NU | HY | H2 | H3 | JL | JV | JT | CJ,
845 ) => {
846 continue;
847 }
848 _ => (),
849 }
850
851 match self.options.strictness {
853 LineBreakStrictness::Normal => {
854 if self.is_break_by_normal(right_codepoint) && !after_zwj {
855 return self.get_current_position();
856 }
857 }
858 LineBreakStrictness::Loose => {
859 if let Some(breakable) = is_break_utf32_by_loose(
860 right_codepoint.into(),
861 left_prop,
862 right_prop,
863 self.options.ja_zh,
864 ) {
865 if breakable && !after_zwj {
866 return self.get_current_position();
867 }
868 continue;
869 }
870 }
871 LineBreakStrictness::Anywhere => {
872 return self.get_current_position();
876 }
877 _ => (),
878 };
879
880 if self.options.word_option != LineBreakWordOption::BreakAll
882 && Y::use_complex_breaking(self, left_codepoint)
883 && Y::use_complex_breaking(self, right_codepoint)
884 {
885 let result = Y::line_handle_complex_language(self, left_codepoint);
886 if result.is_some() {
887 return result;
888 }
889 }
891
892 match self.data.get_break_state_from_table(left_prop, right_prop) {
894 BreakState::Break | BreakState::NoMatch => {
895 if after_zwj {
896 continue;
897 } else {
898 return self.get_current_position();
899 }
900 }
901 BreakState::Keep => continue,
902 BreakState::Index(mut index) | BreakState::Intermediate(mut index) => {
903 let mut previous_iter = self.iter.clone();
904 let mut previous_pos_data = self.current_pos_data;
905 let mut previous_is_after_zwj = after_zwj;
906
907 let mut left_prop_pre_lb9 = right_prop;
912
913 let is_intermediate_rule_no_match = if lb8a_after_lb9 {
917 true
919 } else {
920 index > self.data.last_codepoint_property
921 };
922
923 loop {
924 self.advance_iter();
925 let after_zwj = left_prop_pre_lb9 == ZWJ;
926
927 let previous_break_state_is_cp_prop =
928 index <= self.data.last_codepoint_property;
929
930 let Some(prop) = self.get_current_linebreak_property() else {
931 let break_state = self
933 .data
934 .get_break_state_from_table(index, self.data.eot_property);
935 if break_state == BreakState::NoMatch {
936 self.iter = previous_iter;
937 self.current_pos_data = previous_pos_data;
938 if previous_is_after_zwj {
939 continue 'a;
941 } else {
942 return self.get_current_position();
943 }
944 }
945 return Some(self.len);
947 };
948
949 if (prop == CM || prop == ZWJ)
950 && left_prop_pre_lb9 != BK
951 && left_prop_pre_lb9 != CR
952 && left_prop_pre_lb9 != LF
953 && left_prop_pre_lb9 != NL
954 && left_prop_pre_lb9 != SP
955 && left_prop_pre_lb9 != ZW
956 {
957 left_prop_pre_lb9 = prop;
958 continue;
959 }
960
961 match self.data.get_break_state_from_table(index, prop) {
962 BreakState::Keep => continue 'a,
963 BreakState::NoMatch => {
964 self.iter = previous_iter;
965 self.current_pos_data = previous_pos_data;
966 if after_zwj {
967 if is_intermediate_rule_no_match && !previous_is_after_zwj {
970 return self.get_current_position();
971 }
972 continue 'a;
973 } else if previous_is_after_zwj {
974 continue 'a;
976 } else {
977 return self.get_current_position();
978 }
979 }
980 BreakState::Break => {
981 if after_zwj {
982 continue 'a;
983 } else {
984 return self.get_current_position();
985 }
986 }
987 BreakState::Intermediate(i) => {
988 index = i;
989 previous_iter = self.iter.clone();
990 previous_pos_data = self.current_pos_data;
991 previous_is_after_zwj = after_zwj;
992 }
993 BreakState::Index(i) => {
994 index = i;
995 if previous_break_state_is_cp_prop {
996 previous_iter = self.iter.clone();
997 previous_pos_data = self.current_pos_data;
998 previous_is_after_zwj = after_zwj;
999 }
1000 }
1001 }
1002 left_prop_pre_lb9 = prop;
1003 }
1004 }
1005 }
1006 }
1007 }
1008}
1009
/// Where the iterator currently stands relative to the input, as reported by
/// `LineBreakIterator::check_eof`.
enum StringBoundaryPosType {
    // Nothing has been consumed yet; the caller should report offset 0.
    Start,
    // Somewhere inside the input.
    Middle,
    // The input is exhausted; the caller should stop iterating.
    End,
}
1015
1016impl<Y: LineBreakType> LineBreakIterator<'_, '_, Y> {
1017 fn advance_iter(&mut self) {
1018 self.current_pos_data = self.iter.next();
1019 }
1020
1021 fn is_eof(&self) -> bool {
1022 self.current_pos_data.is_none()
1023 }
1024
1025 #[inline]
1026 fn check_eof(&mut self) -> StringBoundaryPosType {
1027 if self.is_eof() {
1028 self.advance_iter();
1029 if self.is_eof() {
1030 if self.len == 0 {
1031 self.len = 1;
1035 StringBoundaryPosType::Start
1036 } else {
1037 StringBoundaryPosType::End
1038 }
1039 } else {
1040 StringBoundaryPosType::Start
1041 }
1042 } else {
1043 StringBoundaryPosType::Middle
1044 }
1045 }
1046
1047 fn get_current_position(&self) -> Option<usize> {
1048 self.current_pos_data.map(|(pos, _)| pos)
1049 }
1050
1051 fn get_current_codepoint(&self) -> Option<Y::CharType> {
1052 self.current_pos_data.map(|(_, codepoint)| codepoint)
1053 }
1054
1055 fn get_linebreak_property(&self, codepoint: Y::CharType) -> u8 {
1056 Y::get_linebreak_property_with_rule(self, codepoint)
1057 }
1058
1059 fn get_current_linebreak_property(&self) -> Option<u8> {
1060 self.get_current_codepoint()
1061 .map(|c| self.get_linebreak_property(c))
1062 }
1063
1064 fn is_break_by_normal(&self, codepoint: Y::CharType) -> bool {
1065 match codepoint.into() {
1066 0x301C | 0x30A0 => self.options.ja_zh,
1067 _ => false,
1068 }
1069 }
1070}
1071
1072impl LineBreakType for Utf8 {
1073 fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 {
1074 iterator.data.get_linebreak_property_utf32_with_rule(
1075 c as u32,
1076 iterator.options.strictness,
1077 iterator.options.word_option,
1078 )
1079 }
1080
1081 #[inline]
1082 fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool {
1083 iterator.data.use_complex_breaking_utf32(c as u32)
1084 }
1085
1086 fn line_handle_complex_language(
1087 iter: &mut LineBreakIterator<'_, '_, Self>,
1088 left_codepoint: char,
1089 ) -> Option<usize> {
1090 line_handle_complex_language_utf8(iter, left_codepoint)
1091 }
1092}
1093
1094impl LineBreakType for PotentiallyIllFormedUtf8 {
1095 fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 {
1096 iterator.data.get_linebreak_property_utf32_with_rule(
1097 c as u32,
1098 iterator.options.strictness,
1099 iterator.options.word_option,
1100 )
1101 }
1102
1103 #[inline]
1104 fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool {
1105 iterator.data.use_complex_breaking_utf32(c as u32)
1106 }
1107
1108 fn line_handle_complex_language(
1109 iter: &mut LineBreakIterator<'_, '_, Self>,
1110 left_codepoint: char,
1111 ) -> Option<usize> {
1112 line_handle_complex_language_utf8(iter, left_codepoint)
1113 }
1114}
1115fn line_handle_complex_language_utf8<T>(
1117 iter: &mut LineBreakIterator<'_, '_, T>,
1118 left_codepoint: char,
1119) -> Option<usize>
1120where
1121 T: LineBreakType<CharType = char>,
1122{
1123 let start_iter = iter.iter.clone();
1125 let start_point = iter.current_pos_data;
1126 let mut s = String::new();
1127 s.push(left_codepoint);
1128 loop {
1129 debug_assert!(!iter.is_eof());
1130 s.push(iter.get_current_codepoint()?);
1131 iter.advance_iter();
1132 if let Some(current_codepoint) = iter.get_current_codepoint() {
1133 if !T::use_complex_breaking(iter, current_codepoint) {
1134 break;
1135 }
1136 } else {
1137 break;
1139 }
1140 }
1141
1142 iter.iter = start_iter;
1144 iter.current_pos_data = start_point;
1145 let breaks = iter.complex.complex_language_segment_str(&s);
1146 iter.result_cache = breaks;
1147 let first_pos = *iter.result_cache.first()?;
1148 let mut i = left_codepoint.len_utf8();
1149 loop {
1150 if i == first_pos {
1151 iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
1153 return iter.get_current_position();
1154 }
1155 debug_assert!(
1156 i < first_pos,
1157 "we should always arrive at first_pos: near index {:?}",
1158 iter.get_current_position()
1159 );
1160 i += iter.get_current_codepoint().map_or(0, T::char_len);
1161 iter.advance_iter();
1162 if iter.is_eof() {
1163 iter.result_cache.clear();
1164 return Some(iter.len);
1165 }
1166 }
1167}
1168
1169impl LineBreakType for Latin1 {
1170 fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u8) -> u8 {
1171 iterator.data.property_table.get32(c as u32)
1174 }
1175
1176 #[inline]
1177 fn use_complex_breaking(_iterator: &LineBreakIterator<Self>, _c: u8) -> bool {
1178 false
1179 }
1180
1181 fn line_handle_complex_language(
1182 _: &mut LineBreakIterator<Self>,
1183 _: Self::CharType,
1184 ) -> Option<usize> {
1185 unreachable!()
1186 }
1187}
1188
1189impl LineBreakType for Utf16 {
1190 fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u32) -> u8 {
1191 iterator.data.get_linebreak_property_utf32_with_rule(
1192 c,
1193 iterator.options.strictness,
1194 iterator.options.word_option,
1195 )
1196 }
1197
1198 #[inline]
1199 fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: u32) -> bool {
1200 iterator.data.use_complex_breaking_utf32(c)
1201 }
1202
1203 fn line_handle_complex_language(
1204 iterator: &mut LineBreakIterator<Self>,
1205 left_codepoint: Self::CharType,
1206 ) -> Option<usize> {
1207 let start_iter = iterator.iter.clone();
1209 let start_point = iterator.current_pos_data;
1210 let mut s = vec![left_codepoint as u16];
1211 loop {
1212 debug_assert!(!iterator.is_eof());
1213 s.push(iterator.get_current_codepoint()? as u16);
1214 iterator.advance_iter();
1215 if let Some(current_codepoint) = iterator.get_current_codepoint() {
1216 if !Self::use_complex_breaking(iterator, current_codepoint) {
1217 break;
1218 }
1219 } else {
1220 break;
1222 }
1223 }
1224
1225 iterator.iter = start_iter;
1227 iterator.current_pos_data = start_point;
1228 let breaks = iterator.complex.complex_language_segment_utf16(&s);
1229 iterator.result_cache = breaks;
1230 let first_pos = *iterator.result_cache.first()?;
1232 let mut i = 1;
1233 loop {
1234 if i == first_pos {
1235 iterator.result_cache = iterator
1237 .result_cache
1238 .iter()
1239 .skip(1)
1240 .map(|r| r - i)
1241 .collect();
1242 return iterator.get_current_position();
1243 }
1244 debug_assert!(
1245 i < first_pos,
1246 "we should always arrive at first_pos: near index {:?}",
1247 iterator.get_current_position()
1248 );
1249 i += 1;
1250 iterator.advance_iter();
1251 if iterator.is_eof() {
1252 iterator.result_cache.clear();
1253 return Some(iterator.len);
1254 }
1255 }
1256 }
1257}
1258
1259#[cfg(test)]
1260#[cfg(feature = "serde")]
1261mod tests {
1262 use super::*;
1263 use crate::LineSegmenter;
1264
1265 #[test]
1266 fn linebreak_property() {
1267 let payload =
1268 DataProvider::<SegmenterBreakLineV1>::load(&crate::provider::Baked, Default::default())
1269 .expect("Loading should succeed!")
1270 .payload;
1271
1272 let get_linebreak_property = |codepoint| {
1273 payload.get().get_linebreak_property_utf32_with_rule(
1274 codepoint as u32,
1275 LineBreakStrictness::Strict,
1276 LineBreakWordOption::Normal,
1277 )
1278 };
1279
1280 assert_eq!(get_linebreak_property('\u{0020}'), SP);
1281 assert_eq!(get_linebreak_property('\u{0022}'), QU);
1282 assert_eq!(get_linebreak_property('('), OP_OP30);
1283 assert_eq!(get_linebreak_property('\u{0030}'), NU);
1284 assert_eq!(get_linebreak_property('['), OP_OP30);
1285 assert_eq!(get_linebreak_property('\u{1f3fb}'), EM);
1286 assert_eq!(get_linebreak_property('\u{20000}'), ID);
1287 assert_eq!(get_linebreak_property('\u{e0020}'), CM);
1288 assert_eq!(get_linebreak_property('\u{3041}'), CJ);
1289 assert_eq!(get_linebreak_property('\u{0025}'), PO);
1290 assert_eq!(get_linebreak_property('\u{00A7}'), AI);
1291 assert_eq!(get_linebreak_property('\u{50005}'), XX);
1292 assert_eq!(get_linebreak_property('\u{17D6}'), NS);
1293 assert_eq!(get_linebreak_property('\u{2014}'), B2);
1294 }
1295
1296 #[test]
1297 #[expect(clippy::bool_assert_comparison)] fn break_rule() {
1299 let payload =
1300 DataProvider::<SegmenterBreakLineV1>::load(&crate::provider::Baked, Default::default())
1301 .expect("Loading should succeed!")
1302 .payload;
1303 let lb_data: &RuleBreakData = payload.get();
1304
1305 let is_break = |left, right| {
1306 matches!(
1307 lb_data.get_break_state_from_table(left, right),
1308 BreakState::Break | BreakState::NoMatch
1309 )
1310 };
1311
1312 assert_eq!(is_break(BK, AL), true);
1314 assert_eq!(is_break(CR, LF), false);
1316 assert_eq!(is_break(CR, AL), true);
1317 assert_eq!(is_break(LF, AL), true);
1318 assert_eq!(is_break(NL, AL), true);
1319 assert_eq!(is_break(AL, BK), false);
1321 assert_eq!(is_break(AL, CR), false);
1322 assert_eq!(is_break(AL, LF), false);
1323 assert_eq!(is_break(AL, NL), false);
1324 assert_eq!(is_break(AL, SP), false);
1326 assert_eq!(is_break(AL, ZW), false);
1327 assert_eq!(is_break(ZWJ, SP), false);
1331 assert_eq!(is_break(SP, CM), true);
1332 assert_eq!(is_break(AL, WJ), false);
1334 assert_eq!(is_break(WJ, AL), false);
1335 assert_eq!(is_break(GL, AL), false);
1337 assert_eq!(is_break(AL, GL), false);
1339 assert_eq!(is_break(SP, GL), true);
1340 assert_eq!(is_break(AL, CL), false);
1342 assert_eq!(is_break(AL, CP), false);
1343 assert_eq!(is_break(AL, EX), false);
1344 assert_eq!(is_break(AL, IS), false);
1345 assert_eq!(is_break(AL, SY), false);
1346 assert_eq!(is_break(SP, AL), true);
1348 assert_eq!(is_break(AL, QU), false);
1350 assert_eq!(is_break(QU, AL), false);
1351 assert_eq!(is_break(AL, CB), true);
1353 assert_eq!(is_break(CB, AL), true);
1354 assert_eq!(is_break(AL, BA), false);
1356 assert_eq!(is_break(AL, HY), false);
1357 assert_eq!(is_break(AL, NS), false);
1358 assert_eq!(is_break(AL, BA), false);
1360 assert_eq!(is_break(BB, AL), false);
1361 assert_eq!(is_break(ID, BA), false);
1362 assert_eq!(is_break(ID, NS), false);
1363 assert_eq!(is_break(SY, HL), false);
1366 assert_eq!(is_break(AL, IN), false);
1368 assert_eq!(is_break(AL, NU), false);
1370 assert_eq!(is_break(HL, NU), false);
1371 assert_eq!(is_break(PR, ID), false);
1373 assert_eq!(is_break(PR, EB), false);
1374 assert_eq!(is_break(PR, EM), false);
1375 assert_eq!(is_break(ID, PO), false);
1376 assert_eq!(is_break(EB, PO), false);
1377 assert_eq!(is_break(EM, PO), false);
1378 assert_eq!(is_break(JL, JL), false);
1380 assert_eq!(is_break(JL, JV), false);
1381 assert_eq!(is_break(JL, H2), false);
1382 assert_eq!(is_break(JL, IN), false);
1384 assert_eq!(is_break(JL, PO), false);
1385 assert_eq!(is_break(PR, JL), false);
1386 assert_eq!(is_break(AL, AL), false);
1388 assert_eq!(is_break(HL, AL), false);
1389 assert_eq!(is_break(IS, AL), false);
1391 assert_eq!(is_break(IS, HL), false);
1392 assert_eq!(is_break(EB, EM), false);
1394 assert_eq!(is_break(ID, ID), true);
1396 }
1397
1398 #[test]
1399 fn linebreak() {
1400 let segmenter =
1401 LineSegmenter::try_new_dictionary_unstable(&crate::provider::Baked, Default::default())
1402 .expect("Data exists");
1403 let segmenter = segmenter.as_borrowed();
1404
1405 let mut iter = segmenter.segment_str("hello world");
1406 assert_eq!(Some(0), iter.next());
1407 assert_eq!(Some(6), iter.next());
1408 assert_eq!(Some(11), iter.next());
1409 assert_eq!(None, iter.next());
1410
1411 iter = segmenter.segment_str("$10 $10");
1412 assert_eq!(Some(0), iter.next());
1413 assert_eq!(Some(4), iter.next());
1414 assert_eq!(Some(7), iter.next());
1415 assert_eq!(None, iter.next());
1416
1417 iter = segmenter.segment_str("[ abc def");
1421 assert_eq!(Some(0), iter.next());
1422 assert_eq!(Some(7), iter.next());
1423 assert_eq!(Some(10), iter.next());
1424 assert_eq!(None, iter.next());
1425
1426 let input: [u8; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66];
1427 let mut iter_u8 = segmenter.segment_latin1(&input);
1428 assert_eq!(Some(0), iter_u8.next());
1429 assert_eq!(Some(7), iter_u8.next());
1430 assert_eq!(Some(10), iter_u8.next());
1431 assert_eq!(None, iter_u8.next());
1432
1433 let input: [u16; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66];
1434 let mut iter_u16 = segmenter.segment_utf16(&input);
1435 assert_eq!(Some(0), iter_u16.next());
1436 assert_eq!(Some(7), iter_u16.next());
1437 assert_eq!(Some(10), iter_u16.next());
1438 assert_eq!(None, iter_u16.next());
1439
1440 iter = segmenter.segment_str("abc\u{0022} (def");
1442 assert_eq!(Some(0), iter.next());
1443 assert_eq!(Some(6), iter.next());
1444 assert_eq!(Some(10), iter.next());
1445 assert_eq!(None, iter.next());
1446
1447 let input: [u8; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66];
1448 let mut iter_u8 = segmenter.segment_latin1(&input);
1449 assert_eq!(Some(0), iter_u8.next());
1450 assert_eq!(Some(6), iter_u8.next());
1451 assert_eq!(Some(10), iter_u8.next());
1452 assert_eq!(None, iter_u8.next());
1453
1454 let input: [u16; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66];
1455 let mut iter_u16 = segmenter.segment_utf16(&input);
1456 assert_eq!(Some(0), iter_u16.next());
1457 assert_eq!(Some(6), iter_u16.next());
1458 assert_eq!(Some(10), iter_u16.next());
1459 assert_eq!(None, iter_u16.next());
1460
1461 iter = segmenter.segment_str("« miaou »");
1463 assert_eq!(Some(0), iter.next());
1464 assert_eq!(Some(11), iter.next());
1465 assert_eq!(None, iter.next());
1466
1467 let input: Vec<u8> = "« miaou »"
1468 .chars()
1469 .map(|c| u8::try_from(u32::from(c)).unwrap())
1470 .collect();
1471 let mut iter_u8 = segmenter.segment_latin1(&input);
1472 assert_eq!(Some(0), iter_u8.next());
1473 assert_eq!(Some(9), iter_u8.next());
1474 assert_eq!(None, iter_u8.next());
1475
1476 let input: Vec<u16> = "« miaou »".encode_utf16().collect();
1477 let mut iter_u16 = segmenter.segment_utf16(&input);
1478 assert_eq!(Some(0), iter_u16.next());
1479 assert_eq!(Some(9), iter_u16.next());
1480 assert_eq!(None, iter_u16.next());
1481
1482 iter = segmenter.segment_str("Die Katze hat »miau« gesagt.");
1484 assert_eq!(Some(0), iter.next());
1485 assert_eq!(Some(4), iter.next());
1486 assert_eq!(Some(10), iter.next());
1487 assert_eq!(Some(14), iter.next());
1488 assert_eq!(Some(23), iter.next());
1489 assert_eq!(Some(30), iter.next());
1490 assert_eq!(None, iter.next());
1491
1492 let input: Vec<u8> = "Die Katze hat »miau« gesagt."
1493 .chars()
1494 .map(|c| u8::try_from(u32::from(c)).unwrap())
1495 .collect();
1496 let mut iter_u8 = segmenter.segment_latin1(&input);
1497 assert_eq!(Some(0), iter_u8.next());
1498 assert_eq!(Some(4), iter_u8.next());
1499 assert_eq!(Some(10), iter_u8.next());
1500 assert_eq!(Some(14), iter_u8.next());
1501 assert_eq!(Some(21), iter_u8.next());
1502 assert_eq!(Some(28), iter_u8.next());
1503 assert_eq!(None, iter_u8.next());
1504
1505 let input: Vec<u16> = "Die Katze hat »miau« gesagt.".encode_utf16().collect();
1506 let mut iter_u16 = segmenter.segment_utf16(&input);
1507 assert_eq!(Some(0), iter_u16.next());
1508 assert_eq!(Some(4), iter_u16.next());
1509 assert_eq!(Some(10), iter_u16.next());
1510 assert_eq!(Some(14), iter_u16.next());
1511 assert_eq!(Some(21), iter_u16.next());
1512 assert_eq!(Some(28), iter_u16.next());
1513 assert_eq!(None, iter_u16.next());
1514
1515 iter = segmenter.segment_str("\u{0029}\u{203C}");
1517 assert_eq!(Some(0), iter.next());
1518 assert_eq!(Some(4), iter.next());
1519 assert_eq!(None, iter.next());
1520 iter = segmenter.segment_str("\u{0029} \u{203C}");
1521 assert_eq!(Some(0), iter.next());
1522 assert_eq!(Some(6), iter.next());
1523 assert_eq!(None, iter.next());
1524
1525 let input: [u16; 4] = [0x29, 0x20, 0x20, 0x203c];
1526 let mut iter_u16 = segmenter.segment_utf16(&input);
1527 assert_eq!(Some(0), iter_u16.next());
1528 assert_eq!(Some(4), iter_u16.next());
1529 assert_eq!(None, iter_u16.next());
1530
1531 iter = segmenter.segment_str("\u{2014}\u{2014}aa");
1533 assert_eq!(Some(0), iter.next());
1534 assert_eq!(Some(6), iter.next());
1535 assert_eq!(Some(8), iter.next());
1536 assert_eq!(None, iter.next());
1537 iter = segmenter.segment_str("\u{2014} \u{2014}aa");
1538 assert_eq!(Some(0), iter.next());
1539 assert_eq!(Some(8), iter.next());
1540 assert_eq!(Some(10), iter.next());
1541 assert_eq!(None, iter.next());
1542
1543 iter = segmenter.segment_str("\u{2014}\u{2014} \u{2014}\u{2014}123 abc");
1544 assert_eq!(Some(0), iter.next());
1545 assert_eq!(Some(14), iter.next());
1546 assert_eq!(Some(18), iter.next());
1547 assert_eq!(Some(21), iter.next());
1548 assert_eq!(None, iter.next());
1549
1550 let mut iter = segmenter.segment_str("(0,1)+(2,3)");
1552 assert_eq!(Some(0), iter.next());
1553 assert_eq!(Some(11), iter.next());
1554 assert_eq!(None, iter.next());
1555 let input: [u16; 11] = [
1556 0x28, 0x30, 0x2C, 0x31, 0x29, 0x2B, 0x28, 0x32, 0x2C, 0x33, 0x29,
1557 ];
1558 let mut iter_u16 = segmenter.segment_utf16(&input);
1559 assert_eq!(Some(0), iter_u16.next());
1560 assert_eq!(Some(11), iter_u16.next());
1561 assert_eq!(None, iter_u16.next());
1562
1563 let input: [u16; 13] = [
1564 0x2014, 0x2014, 0x20, 0x20, 0x2014, 0x2014, 0x31, 0x32, 0x33, 0x20, 0x61, 0x62, 0x63,
1565 ];
1566 let mut iter_u16 = segmenter.segment_utf16(&input);
1567 assert_eq!(Some(0), iter_u16.next());
1568 assert_eq!(Some(6), iter_u16.next());
1569 assert_eq!(Some(10), iter_u16.next());
1570 assert_eq!(Some(13), iter_u16.next());
1571 assert_eq!(None, iter_u16.next());
1572
1573 iter = segmenter.segment_str("\u{1F3FB} \u{1F3FB}");
1574 assert_eq!(Some(0), iter.next());
1575 assert_eq!(Some(5), iter.next());
1576 assert_eq!(Some(9), iter.next());
1577 assert_eq!(None, iter.next());
1578 }
1579
#[test]
#[cfg(feature = "lstm")]
fn thai_line_break() {
    // Every character in the sample occupies three UTF-8 bytes but a single
    // UTF-16 code unit, so the UTF-16 expectations below are the UTF-8
    // expectations divided by three.
    const SAMPLE: &str = "ภาษาไทยภาษาไทย";

    let line_segmenter = LineSegmenter::new_lstm(Default::default());

    // UTF-8 input: offsets are byte positions within `SAMPLE`.
    let utf8_breaks = line_segmenter.segment_str(SAMPLE).collect::<Vec<usize>>();
    assert_eq!(utf8_breaks, [0, 12, 21, 33, SAMPLE.len()], "Thai test");

    // UTF-16 input: offsets are code-unit positions.
    let sample_utf16 = SAMPLE.encode_utf16().collect::<Vec<u16>>();
    let utf16_breaks = line_segmenter
        .segment_utf16(&sample_utf16)
        .collect::<Vec<usize>>();
    assert_eq!(utf16_breaks, [0, 4, 7, 11, sample_utf16.len()], "Thai test");

    // A short four-unit input is expected to produce only the start and end
    // boundaries.
    let short_utf16: [u16; 4] = [0x0e20, 0x0e32, 0x0e29, 0x0e32];
    let short_breaks = line_segmenter
        .segment_utf16(&short_utf16)
        .collect::<Vec<usize>>();
    assert_eq!(short_breaks, [0, 4], "Thai test");
}
1597
#[test]
#[cfg(feature = "lstm")]
fn burmese_line_break() {
    // Every character in the sample occupies three UTF-8 bytes but a single
    // UTF-16 code unit, so the two sets of expected offsets differ by a
    // factor of three.
    const SAMPLE: &str = "မြန်မာဘာသာစကား";

    let line_segmenter = LineSegmenter::new_lstm(Default::default());

    // UTF-8 input: offsets are byte positions within `SAMPLE`.
    let utf8_breaks = line_segmenter.segment_str(SAMPLE).collect::<Vec<usize>>();
    assert_eq!(utf8_breaks, [0, 12, 18, 30, SAMPLE.len()], "Burmese test");

    // UTF-16 input: offsets are code-unit positions.
    let sample_utf16 = SAMPLE.encode_utf16().collect::<Vec<u16>>();
    let utf16_breaks = line_segmenter
        .segment_utf16(&sample_utf16)
        .collect::<Vec<usize>>();
    assert_eq!(
        utf16_breaks,
        [0, 4, 6, 10, sample_utf16.len()],
        "Burmese utf-16 test"
    );
}
1614
#[test]
#[cfg(feature = "lstm")]
fn khmer_line_break() {
    // Every character in the sample occupies three UTF-8 bytes but a single
    // UTF-16 code unit, so the two sets of expected offsets differ by a
    // factor of three.
    const SAMPLE: &str = "សេចក្ដីប្រកាសជាសកលស្ដីពីសិទ្ធិមនុស្ស";

    let line_segmenter = LineSegmenter::new_lstm(Default::default());

    // UTF-8 input: offsets are byte positions within `SAMPLE`.
    let expected_utf8 = [0, 39, 48, 54, 72, SAMPLE.len()];
    let utf8_breaks = line_segmenter.segment_str(SAMPLE).collect::<Vec<usize>>();
    assert_eq!(utf8_breaks, expected_utf8, "Khmer test");

    // UTF-16 input: offsets are code-unit positions.
    let sample_utf16 = SAMPLE.encode_utf16().collect::<Vec<u16>>();
    let expected_utf16 = [0, 13, 16, 18, 24, sample_utf16.len()];
    let utf16_breaks = line_segmenter
        .segment_utf16(&sample_utf16)
        .collect::<Vec<usize>>();
    assert_eq!(utf16_breaks, expected_utf16, "Khmer utf-16 test");
}
1633
#[test]
#[cfg(feature = "lstm")]
fn lao_line_break() {
    // Every character in the sample occupies three UTF-8 bytes but a single
    // UTF-16 code unit, so the two sets of expected offsets differ by a
    // factor of three.
    const SAMPLE: &str = "ກ່ຽວກັບສິດຂອງມະນຸດ";

    let line_segmenter = LineSegmenter::new_lstm(Default::default());

    // UTF-8 input: offsets are byte positions within `SAMPLE`.
    let utf8_breaks = line_segmenter.segment_str(SAMPLE).collect::<Vec<usize>>();
    assert_eq!(utf8_breaks, [0, 12, 21, 30, 39, SAMPLE.len()], "Lao test");

    // UTF-16 input: offsets are code-unit positions.
    let sample_utf16 = SAMPLE.encode_utf16().collect::<Vec<u16>>();
    let utf16_breaks = line_segmenter
        .segment_utf16(&sample_utf16)
        .collect::<Vec<usize>>();
    assert_eq!(
        utf16_breaks,
        [0, 4, 7, 10, 13, sample_utf16.len()],
        "Lao utf-16 test"
    );
}
1648
#[test]
fn empty_string() {
    // Segmenting an empty `&str` is expected to report exactly one boundary,
    // at offset 0.
    let line_segmenter = LineSegmenter::new_auto(Default::default());
    let boundaries = line_segmenter.segment_str("").collect::<Vec<usize>>();
    assert_eq!(boundaries, [0]);
}
1655}