1use crate::complex::*;
6use crate::indices::*;
7use crate::provider::*;
8use crate::rule_segmenter::*;
9use alloc::string::String;
10use alloc::vec;
11use alloc::vec::Vec;
12use core::char;
13use icu_locale_core::subtags::language;
14use icu_locale_core::LanguageIdentifier;
15use icu_provider::prelude::*;
16use utf8_iter::Utf8CharIndices;
17
// Line-break property identifiers used throughout this file.
//
// Each constant is a `u8` value that the property-table lookup
// (`property_table.get32`) can yield for a code point; pairs of these values
// are also used as the (left, right) key into the break-state table.
//
// NOTE(review): the names presumably follow the UAX #14 line breaking classes,
// with a few split variants (`AL_DOTTED_CIRCLE`, `ID_CN`, `OP_EA`, `OP_OP30`,
// `PO_EAW`, `PR_EAW`, `QU_PF`, `QU_PI`). The numeric values must agree with
// the segmenter data that is loaded at runtime — confirm before changing any.
//
// `#[allow(dead_code)]` is applied to each constant because not every class is
// referenced by name from the code in this file.
#[allow(dead_code)]
const UNKNOWN: u8 = 0;
#[allow(dead_code)]
const AI: u8 = 1;
#[allow(dead_code)]
const AK: u8 = 2;
#[allow(dead_code)]
const AL: u8 = 3;
#[allow(dead_code)]
const AL_DOTTED_CIRCLE: u8 = 4;
#[allow(dead_code)]
const AP: u8 = 5;
#[allow(dead_code)]
const AS: u8 = 6;
#[allow(dead_code)]
const B2: u8 = 7;
#[allow(dead_code)]
const BA: u8 = 8;
#[allow(dead_code)]
const BB: u8 = 9;
#[allow(dead_code)]
const BK: u8 = 10;
#[allow(dead_code)]
const CB: u8 = 11;
#[allow(dead_code)]
const CJ: u8 = 12;
#[allow(dead_code)]
const CL: u8 = 13;
#[allow(dead_code)]
const CM: u8 = 14;
#[allow(dead_code)]
const CP: u8 = 15;
#[allow(dead_code)]
const CR: u8 = 16;
#[allow(dead_code)]
const EB: u8 = 17;
#[allow(dead_code)]
const EM: u8 = 18;
#[allow(dead_code)]
const EX: u8 = 19;
#[allow(dead_code)]
const GL: u8 = 20;
#[allow(dead_code)]
const H2: u8 = 21;
#[allow(dead_code)]
const H3: u8 = 22;
#[allow(dead_code)]
const HL: u8 = 23;
#[allow(dead_code)]
const HY: u8 = 24;
#[allow(dead_code)]
const ID: u8 = 25;
#[allow(dead_code)]
const ID_CN: u8 = 26;
#[allow(dead_code)]
const IN: u8 = 27;
#[allow(dead_code)]
const IS: u8 = 28;
#[allow(dead_code)]
const JL: u8 = 29;
#[allow(dead_code)]
const JT: u8 = 30;
#[allow(dead_code)]
const JV: u8 = 31;
#[allow(dead_code)]
const LF: u8 = 32;
#[allow(dead_code)]
const NL: u8 = 33;
#[allow(dead_code)]
const NS: u8 = 34;
#[allow(dead_code)]
const NU: u8 = 35;
#[allow(dead_code)]
const OP_EA: u8 = 36;
#[allow(dead_code)]
const OP_OP30: u8 = 37;
#[allow(dead_code)]
const PO: u8 = 38;
#[allow(dead_code)]
const PO_EAW: u8 = 39;
#[allow(dead_code)]
const PR: u8 = 40;
#[allow(dead_code)]
const PR_EAW: u8 = 41;
#[allow(dead_code)]
const QU: u8 = 42;
#[allow(dead_code)]
const QU_PF: u8 = 43;
#[allow(dead_code)]
const QU_PI: u8 = 44;
#[allow(dead_code)]
const RI: u8 = 45;
#[allow(dead_code)]
const SA: u8 = 46;
#[allow(dead_code)]
const SP: u8 = 47;
#[allow(dead_code)]
const SY: u8 = 48;
#[allow(dead_code)]
const VF: u8 = 49;
#[allow(dead_code)]
const VI: u8 = 50;
#[allow(dead_code)]
const WJ: u8 = 51;
#[allow(dead_code)]
const XX: u8 = 52;
#[allow(dead_code)]
const ZW: u8 = 53;
#[allow(dead_code)]
const ZWJ: u8 = 54;
129
/// Selects how strictly the line-break rules are applied by
/// [`LineBreakIterator`].
///
/// NOTE(review): the variant names look like the values of the CSS
/// `line-break` property — confirm against the crate-level documentation.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub enum LineBreakStrictness {
    /// Code points whose looked-up property is `CJ` are treated as `ID`, and
    /// the extra break decisions computed by `is_break_utf32_by_loose` are
    /// consulted before the break-state table.
    Loose,

    /// Code points whose looked-up property is `CJ` are treated as `ID`.
    /// When the content language is `ja` or `zh`, a break is additionally
    /// allowed before U+301C and U+30A0.
    Normal,

    /// The default. The looked-up property is used as-is (unless the word
    /// option is `BreakAll`, which also maps `CJ` to `ID`).
    #[default]
    Strict,

    /// A break is reported at every position the iterator examines, after
    /// `CM` code points have been attached to what precedes them and after
    /// the `KeepAll` word option has had a chance to suppress the break.
    /// Unlike the other modes, `ZWJ` is not attached to the preceding code
    /// point.
    Anywhere,
}
164
/// Selects how breaks inside words are treated by [`LineBreakIterator`].
///
/// NOTE(review): the variant names look like the values of the CSS
/// `word-break` property — confirm against the crate-level documentation.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub enum LineBreakWordOption {
    /// The default: no word-level adjustment is made to the rules.
    #[default]
    Normal,

    /// A left-hand `AL`, `NU` or `SA` property is treated as `ID`, `CJ` is
    /// mapped to `ID` on lookup, and the complex-language (dictionary/LSTM)
    /// path is skipped.
    BreakAll,

    /// No break is produced between two adjacent code points when both have
    /// one of the properties `AI`, `AL`, `ID`, `NU`, `HY`, `H2`, `H3`, `JL`,
    /// `JV`, `JT` or `CJ`.
    KeepAll,
}
187
/// User-facing options accepted by the [`LineSegmenter`] constructors.
///
/// Every field is optional. When the options are resolved internally, a
/// `None` strictness or word option falls back to that enum's `Default`.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
pub struct LineBreakOptions<'a> {
    /// Strictness of the line-break rules; `None` resolves to
    /// `LineBreakStrictness::default()`.
    pub strictness: Option<LineBreakStrictness>,

    /// Word-level breaking behavior; `None` resolves to
    /// `LineBreakWordOption::default()`.
    pub word_option: Option<LineBreakWordOption>,

    /// Language of the content being segmented. Only the language subtag is
    /// inspected: `ja` and `zh` enable the language-specific tailorings used
    /// by the `Normal` and `Loose` strictness levels; any other value, or
    /// `None`, leaves them disabled.
    pub content_locale: Option<&'a LanguageIdentifier>,
}
210
/// Internal, fully-resolved form of [`LineBreakOptions`]: the optional fields
/// have been replaced by concrete values and the locale has been reduced to
/// a single flag.
#[derive(Debug, Clone, Copy)]
struct ResolvedLineBreakOptions {
    /// Effective strictness (the enum default when the caller gave `None`).
    strictness: LineBreakStrictness,
    /// Effective word option (the enum default when the caller gave `None`).
    word_option: LineBreakWordOption,
    /// `true` when the content language is `ja` or `zh`.
    ja_zh: bool,
}
217
218impl From<LineBreakOptions<'_>> for ResolvedLineBreakOptions {
219 fn from(options: LineBreakOptions<'_>) -> Self {
220 let ja_zh = if let Some(content_locale) = options.content_locale.as_ref() {
221 content_locale.language == language!("ja") || content_locale.language == language!("zh")
222 } else {
223 false
224 };
225 Self {
226 strictness: options.strictness.unwrap_or_default(),
227 word_option: options.word_option.unwrap_or_default(),
228 ja_zh,
229 }
230 }
231}
232
/// Owning line segmenter: holds the resolved options together with the data
/// payloads it was constructed from.
///
/// Segmentation itself is performed through the borrowed view returned by
/// `as_borrowed`.
#[derive(Debug)]
pub struct LineSegmenter {
    /// Options resolved once at construction time.
    options: ResolvedLineBreakOptions,
    /// Rule-based line-break data (property table and break-state table).
    payload: DataPayload<SegmenterBreakLineV1>,
    /// Payloads used by the complex-language segmentation path.
    complex: ComplexPayloads,
}
361
/// Borrowed, `Copy` view of a line segmenter.
///
/// This is the type that exposes the `segment_*` methods; it is obtained
/// either from the compiled-data constructors on [`LineSegmenter`] or from
/// `LineSegmenter::as_borrowed`.
#[derive(Clone, Debug, Copy)]
pub struct LineSegmenterBorrowed<'data> {
    /// Options resolved at construction time.
    options: ResolvedLineBreakOptions,
    /// Rule-based line-break data.
    data: &'data RuleBreakData<'data>,
    /// Borrowed payloads used by the complex-language segmentation path.
    complex: ComplexPayloadsBorrowed<'data>,
}
371
372impl LineSegmenter {
373 #[cfg(feature = "auto")]
384 #[cfg(feature = "compiled_data")]
385 pub fn new_auto(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
386 Self::new_lstm(options)
387 }
388
389 #[cfg(feature = "auto")]
390 icu_provider::gen_buffer_data_constructors!(
391 (options: LineBreakOptions) -> error: DataError,
392 functions: [
393 new_auto: skip,
394 try_new_auto_with_buffer_provider,
395 try_new_auto_unstable,
396 Self,
397 ]
398 );
399
400 #[cfg(feature = "auto")]
401 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_auto)]
402 pub fn try_new_auto_unstable<D>(
403 provider: &D,
404 options: LineBreakOptions,
405 ) -> Result<Self, DataError>
406 where
407 D: DataProvider<SegmenterBreakLineV1>
408 + DataProvider<SegmenterLstmAutoV1>
409 + DataProvider<SegmenterBreakGraphemeClusterV1>
410 + ?Sized,
411 {
412 Self::try_new_lstm_unstable(provider, options)
413 }
414
415 #[cfg(feature = "lstm")]
427 #[cfg(feature = "compiled_data")]
428 pub fn new_lstm(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
429 LineSegmenterBorrowed {
430 options: options.into(),
431 data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_LINE_V1,
432 complex: ComplexPayloadsBorrowed::new_lstm(),
433 }
434 }
435
436 #[cfg(feature = "lstm")]
437 icu_provider::gen_buffer_data_constructors!(
438 (options: LineBreakOptions) -> error: DataError,
439 functions: [
440 try_new_lstm: skip,
441 try_new_lstm_with_buffer_provider,
442 try_new_lstm_unstable,
443 Self,
444 ]
445 );
446
447 #[cfg(feature = "lstm")]
448 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_lstm)]
449 pub fn try_new_lstm_unstable<D>(
450 provider: &D,
451 options: LineBreakOptions,
452 ) -> Result<Self, DataError>
453 where
454 D: DataProvider<SegmenterBreakLineV1>
455 + DataProvider<SegmenterLstmAutoV1>
456 + DataProvider<SegmenterBreakGraphemeClusterV1>
457 + ?Sized,
458 {
459 Ok(Self {
460 options: options.into(),
461 payload: provider.load(Default::default())?.payload,
462 complex: ComplexPayloads::try_new_lstm(provider)?,
463 })
464 }
465
466 #[cfg(feature = "compiled_data")]
478 pub fn new_dictionary(options: LineBreakOptions) -> LineSegmenterBorrowed<'static> {
479 LineSegmenterBorrowed {
480 options: options.into(),
481 data: crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_LINE_V1,
482 complex: ComplexPayloadsBorrowed::new_southeast_asian(),
489 }
490 }
491
492 icu_provider::gen_buffer_data_constructors!(
493 (options: LineBreakOptions) -> error: DataError,
494 functions: [
495 new_dictionary: skip,
496 try_new_dictionary_with_buffer_provider,
497 try_new_dictionary_unstable,
498 Self,
499 ]
500 );
501
502 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary)]
503 pub fn try_new_dictionary_unstable<D>(
504 provider: &D,
505 options: LineBreakOptions,
506 ) -> Result<Self, DataError>
507 where
508 D: DataProvider<SegmenterBreakLineV1>
509 + DataProvider<SegmenterDictionaryExtendedV1>
510 + DataProvider<SegmenterBreakGraphemeClusterV1>
511 + ?Sized,
512 {
513 Ok(Self {
514 options: options.into(),
515 payload: provider.load(Default::default())?.payload,
516 complex: ComplexPayloads::try_new_southeast_asian(provider)?,
523 })
524 }
525
526 pub fn as_borrowed(&self) -> LineSegmenterBorrowed<'_> {
530 LineSegmenterBorrowed {
531 options: self.options,
532 data: self.payload.get(),
533 complex: self.complex.as_borrowed(),
534 }
535 }
536}
537
538impl<'data> LineSegmenterBorrowed<'data> {
539 pub fn segment_str<'s>(self, input: &'s str) -> LineBreakIterator<'data, 's, Utf8> {
543 LineBreakIterator {
544 iter: input.char_indices(),
545 len: input.len(),
546 current_pos_data: None,
547 result_cache: Vec::new(),
548 data: self.data,
549 options: self.options,
550 complex: self.complex,
551 }
552 }
553 pub fn segment_utf8<'s>(
559 self,
560 input: &'s [u8],
561 ) -> LineBreakIterator<'data, 's, PotentiallyIllFormedUtf8> {
562 LineBreakIterator {
563 iter: Utf8CharIndices::new(input),
564 len: input.len(),
565 current_pos_data: None,
566 result_cache: Vec::new(),
567 data: self.data,
568 options: self.options,
569 complex: self.complex,
570 }
571 }
572 pub fn segment_latin1<'s>(self, input: &'s [u8]) -> LineBreakIterator<'data, 's, Latin1> {
576 LineBreakIterator {
577 iter: Latin1Indices::new(input),
578 len: input.len(),
579 current_pos_data: None,
580 result_cache: Vec::new(),
581 data: self.data,
582 options: self.options,
583 complex: self.complex,
584 }
585 }
586
587 pub fn segment_utf16<'s>(self, input: &'s [u16]) -> LineBreakIterator<'data, 's, Utf16> {
591 LineBreakIterator {
592 iter: Utf16Indices::new(input),
593 len: input.len(),
594 current_pos_data: None,
595 result_cache: Vec::new(),
596 data: self.data,
597 options: self.options,
598 complex: self.complex,
599 }
600 }
601}
602
603impl LineSegmenterBorrowed<'static> {
604 pub fn static_to_owned(self) -> LineSegmenter {
609 LineSegmenter {
610 payload: DataPayload::from_static_ref(self.data),
611 complex: self.complex.static_to_owned(),
612 options: self.options,
613 }
614 }
615}
616
617impl RuleBreakData<'_> {
618 fn get_linebreak_property_utf32_with_rule(
619 &self,
620 codepoint: u32,
621 strictness: LineBreakStrictness,
622 word_option: LineBreakWordOption,
623 ) -> u8 {
624 let prop = self.property_table.get32(codepoint);
626
627 if word_option == LineBreakWordOption::BreakAll
628 || strictness == LineBreakStrictness::Loose
629 || strictness == LineBreakStrictness::Normal
630 {
631 return match prop {
632 CJ => ID, _ => prop,
634 };
635 }
636
637 prop
640 }
641
642 #[inline]
643 fn get_break_state_from_table(&self, left: u8, right: u8) -> BreakState {
644 let idx = (left as usize) * (self.property_count as usize) + (right as usize);
645 self.break_state_table.get(idx).unwrap_or(BreakState::Keep)
647 }
648
649 #[inline]
650 fn use_complex_breaking_utf32(&self, codepoint: u32) -> bool {
651 let line_break_property = self.get_linebreak_property_utf32_with_rule(
652 codepoint,
653 LineBreakStrictness::Strict,
654 LineBreakWordOption::Normal,
655 );
656
657 line_break_property == SA
658 }
659}
660
661#[inline]
662fn is_break_utf32_by_loose(
663 right_codepoint: u32,
664 left_prop: u8,
665 right_prop: u8,
666 ja_zh: bool,
667) -> Option<bool> {
668 if right_prop == BA {
670 if left_prop == ID && (right_codepoint == 0x2010 || right_codepoint == 0x2013) {
671 return Some(true);
672 }
673 } else if right_prop == NS {
674 if right_codepoint == 0x301C || right_codepoint == 0x30A0 {
676 return Some(ja_zh);
677 }
678
679 if right_codepoint == 0x3005
681 || right_codepoint == 0x303B
682 || right_codepoint == 0x309D
683 || right_codepoint == 0x309E
684 || right_codepoint == 0x30FD
685 || right_codepoint == 0x30FE
686 {
687 return Some(true);
688 }
689
690 if right_codepoint == 0x30FB
692 || right_codepoint == 0xFF1A
693 || right_codepoint == 0xFF1B
694 || right_codepoint == 0xFF65
695 || right_codepoint == 0x203C
696 || (0x2047..=0x2049).contains(&right_codepoint)
697 {
698 return Some(ja_zh);
699 }
700 } else if right_prop == IN {
701 return Some(true);
703 } else if right_prop == EX {
704 if right_codepoint == 0xFF01 || right_codepoint == 0xFF1F {
706 return Some(ja_zh);
707 }
708 }
709
710 if right_prop == PO_EAW {
713 return Some(ja_zh);
714 }
715 if left_prop == PR_EAW {
718 return Some(ja_zh);
719 }
720 None
721}
722
/// Per-encoding hooks used by [`LineBreakIterator`].
///
/// The trait requires `crate::private::Sealed`, so it presumably cannot be
/// implemented outside this crate; all of its methods are hidden from the
/// generated documentation.
pub trait LineBreakType: crate::private::Sealed + Sized + RuleBreakType {
    /// Returns whether `c` must be handled by the complex-language
    /// segmentation path instead of the rule table.
    #[doc(hidden)]
    fn use_complex_breaking(iterator: &LineBreakIterator<'_, '_, Self>, c: Self::CharType) -> bool;

    /// Returns the line-break property of `c`, taking the iterator's options
    /// into account where the encoding requires it.
    #[doc(hidden)]
    fn get_linebreak_property_with_rule(
        iterator: &LineBreakIterator<'_, '_, Self>,
        c: Self::CharType,
    ) -> u8;

    /// Segments the run of complex-language characters that starts with
    /// `left_codepoint` (the iterator is already positioned on the character
    /// after it) and returns the first break position found, if any.
    #[doc(hidden)]
    fn line_handle_complex_language(
        iterator: &mut LineBreakIterator<'_, '_, Self>,
        left_codepoint: Self::CharType,
    ) -> Option<usize>;
}
747
/// Iterator over the line-break positions of one input text.
///
/// `'data` is the lifetime of the segmenter data, `'s` the lifetime of the
/// input, and `Y` selects the input encoding.
#[derive(Debug)]
pub struct LineBreakIterator<'data, 's, Y: LineBreakType> {
    /// Underlying iterator yielding `(position, character)` pairs.
    iter: Y::IterAttr<'s>,
    /// Length of the input, reported as the final break position. For an
    /// empty input it is also overwritten by `check_eof` to remember that the
    /// single break has already been returned.
    len: usize,
    /// The pair most recently pulled from `iter`; `None` before the first
    /// advance and after the input is exhausted.
    current_pos_data: Option<(usize, Y::CharType)>,
    /// Pending break offsets produced by the complex-language path, relative
    /// to the iterator's current position.
    result_cache: Vec<usize>,
    /// Rule-based line-break data.
    data: &'data RuleBreakData<'data>,
    /// Resolved options for this iteration.
    options: ResolvedLineBreakOptions,
    /// Payloads for the complex-language path.
    complex: ComplexPayloadsBorrowed<'data>,
}
770
impl<Y: LineBreakType> Iterator for LineBreakIterator<'_, '_, Y> {
    type Item = usize;

    /// Returns the next line-break position, or `None` once the end of the
    /// input has already been reported.
    ///
    /// The first call yields `0` and the last yielded value is the input
    /// length. In between, each pair of adjacent characters is examined in
    /// this order: combining-mark/ZWJ attachment, word-option handling,
    /// strictness handling, the complex-language path, and finally the
    /// break-state table (which may require looking ahead over several
    /// characters).
    fn next(&mut self) -> Option<Self::Item> {
        // First call (report position 0) and exhausted input (report `None`).
        match self.check_eof() {
            StringBoundaryPosType::Start => return Some(0),
            StringBoundaryPosType::End => return None,
            _ => (),
        }

        // Break offsets left over from a previous complex-language run are
        // served first: walk forward until the first cached offset is reached.
        if let Some(&first_pos) = self.result_cache.first() {
            let mut i = 0;
            loop {
                if i == first_pos {
                    // Drop the consumed offset and rebase the remaining ones
                    // onto the new current position.
                    self.result_cache = self.result_cache.iter().skip(1).map(|r| r - i).collect();
                    return self.get_current_position();
                }
                i += self.get_current_codepoint().map_or(0, Y::char_len);
                self.advance_iter();
                if self.is_eof() {
                    self.result_cache.clear();
                    return Some(self.len);
                }
            }
        }

        // Property of the character that a run of attached CM/ZWJ characters
        // is treated as (variable named after rule "LB9").
        let mut lb9_left: Option<u8> = None;
        // Whether the most recently attached character was a ZWJ, in which
        // case a break at the following position must be suppressed.
        let mut lb8a_after_lb9 = false;

        'a: loop {
            debug_assert!(!self.is_eof());
            let left_codepoint = self.get_current_codepoint()?;
            // While inside an attached run, keep using the property of the
            // character that started it.
            let mut left_prop =
                lb9_left.unwrap_or_else(|| self.get_linebreak_property(left_codepoint));
            let after_zwj = lb8a_after_lb9 || (lb9_left.is_none() && left_prop == ZWJ);
            self.advance_iter();

            // No right-hand character: the end of the input is the last break.
            let Some(right_codepoint) = self.get_current_codepoint() else {
                return Some(self.len);
            };
            let right_prop = self.get_linebreak_property(right_codepoint);
            // Attach CM (and ZWJ, except under `Anywhere`) to the left
            // character unless that character is a line terminator, a space
            // or ZW.
            if (right_prop == CM
                || (right_prop == ZWJ && self.options.strictness != LineBreakStrictness::Anywhere))
                && left_prop != BK
                && left_prop != CR
                && left_prop != LF
                && left_prop != NL
                && left_prop != SP
                && left_prop != ZW
            {
                lb9_left = Some(left_prop);
                lb8a_after_lb9 = right_prop == ZWJ;
                continue;
            } else {
                lb9_left = None;
                lb8a_after_lb9 = false;
            }

            // Word-option handling.
            match (self.options.word_option, left_prop, right_prop) {
                // BreakAll: letters, digits and SA characters on the left
                // behave like ideographs.
                (LineBreakWordOption::BreakAll, AL | NU | SA, _) => {
                    left_prop = ID;
                }
                // KeepAll: never break between two of the listed classes.
                (
                    LineBreakWordOption::KeepAll,
                    AI | AL | ID | NU | HY | H2 | H3 | JL | JV | JT | CJ,
                    AI | AL | ID | NU | HY | H2 | H3 | JL | JV | JT | CJ,
                ) => {
                    continue;
                }
                _ => (),
            }

            // Strictness handling.
            match self.options.strictness {
                LineBreakStrictness::Normal => {
                    if self.is_break_by_normal(right_codepoint) && !after_zwj {
                        return self.get_current_position();
                    }
                }
                LineBreakStrictness::Loose => {
                    // A `Some` answer from the loose rules is final for this
                    // position: either break here or move on without
                    // consulting the table.
                    if let Some(breakable) = is_break_utf32_by_loose(
                        right_codepoint.into(),
                        left_prop,
                        right_prop,
                        self.options.ja_zh,
                    ) {
                        if breakable && !after_zwj {
                            return self.get_current_position();
                        }
                        continue;
                    }
                }
                LineBreakStrictness::Anywhere => {
                    // Every position reached here is a break.
                    return self.get_current_position();
                }
                _ => (),
            };

            // Two adjacent complex-language characters: hand the run over to
            // the complex segmenter (skipped under BreakAll).
            if self.options.word_option != LineBreakWordOption::BreakAll
                && Y::use_complex_breaking(self, left_codepoint)
                && Y::use_complex_breaking(self, right_codepoint)
            {
                let result = Y::line_handle_complex_language(self, left_codepoint);
                if result.is_some() {
                    return result;
                }
                // Otherwise fall through to the table lookup below.
            }

            // Rule-table lookup for the (left, right) property pair.
            match self.data.get_break_state_from_table(left_prop, right_prop) {
                BreakState::Break | BreakState::NoMatch => {
                    // Never break directly after a ZWJ.
                    if after_zwj {
                        continue;
                    } else {
                        return self.get_current_position();
                    }
                }
                BreakState::Keep => continue,
                // The pair alone does not decide: `index` is a table state
                // that is advanced with each following character.
                BreakState::Index(mut index) | BreakState::Intermediate(mut index) => {
                    // Position to fall back to if the multi-character rule
                    // ends up not matching.
                    let mut previous_iter = self.iter.clone();
                    let mut previous_pos_data = self.current_pos_data;
                    let mut previous_is_after_zwj = after_zwj;

                    // Raw property of the previous character in the
                    // look-ahead, used for the CM/ZWJ attachment check; the
                    // "treated as" state itself lives in `index`.
                    let mut left_prop_pre_lb9 = right_prop;

                    // Whether the starting state is an intermediate one, i.e.
                    // not the property of an actual code point.
                    let is_intermediate_rule_no_match = if lb8a_after_lb9 {
                        true
                    } else {
                        index > self.data.last_codepoint_property
                    };

                    loop {
                        self.advance_iter();
                        // Shadows the outer flag: refers to the look-ahead's
                        // own previous character.
                        let after_zwj = left_prop_pre_lb9 == ZWJ;

                        let previous_break_state_is_cp_prop =
                            index <= self.data.last_codepoint_property;

                        let Some(prop) = self.get_current_linebreak_property() else {
                            // End of input reached during look-ahead: resolve
                            // the pending state against the end-of-text
                            // property.
                            let break_state = self
                                .data
                                .get_break_state_from_table(index, self.data.eot_property);
                            if break_state == BreakState::NoMatch {
                                // Rule did not match: rewind to the saved
                                // position.
                                self.iter = previous_iter;
                                self.current_pos_data = previous_pos_data;
                                if previous_is_after_zwj {
                                    // No break directly after a ZWJ.
                                    continue 'a;
                                } else {
                                    return self.get_current_position();
                                }
                            }
                            // Otherwise the end of the input is the break.
                            return Some(self.len);
                        };

                        // Attached CM/ZWJ characters do not change `index`.
                        if (prop == CM || prop == ZWJ)
                            && left_prop_pre_lb9 != BK
                            && left_prop_pre_lb9 != CR
                            && left_prop_pre_lb9 != LF
                            && left_prop_pre_lb9 != NL
                            && left_prop_pre_lb9 != SP
                            && left_prop_pre_lb9 != ZW
                        {
                            left_prop_pre_lb9 = prop;
                            continue;
                        }

                        match self.data.get_break_state_from_table(index, prop) {
                            BreakState::Keep => continue 'a,
                            BreakState::NoMatch => {
                                // Rewind to the saved position and decide
                                // there.
                                self.iter = previous_iter;
                                self.current_pos_data = previous_pos_data;
                                if after_zwj {
                                    // Break at the saved position only if the
                                    // rule started from an intermediate state
                                    // and that position is not itself right
                                    // after a ZWJ.
                                    if is_intermediate_rule_no_match && !previous_is_after_zwj {
                                        return self.get_current_position();
                                    }
                                    continue 'a;
                                } else if previous_is_after_zwj {
                                    // No break directly after a ZWJ.
                                    continue 'a;
                                } else {
                                    return self.get_current_position();
                                }
                            }
                            BreakState::Break => {
                                if after_zwj {
                                    continue 'a;
                                } else {
                                    return self.get_current_position();
                                }
                            }
                            BreakState::Intermediate(i) => {
                                // New intermediate state: the current
                                // position becomes the fallback.
                                index = i;
                                previous_iter = self.iter.clone();
                                previous_pos_data = self.current_pos_data;
                                previous_is_after_zwj = after_zwj;
                            }
                            BreakState::Index(i) => {
                                // Move the fallback only when the previous
                                // state was a plain code-point property.
                                index = i;
                                if previous_break_state_is_cp_prop {
                                    previous_iter = self.iter.clone();
                                    previous_pos_data = self.current_pos_data;
                                    previous_is_after_zwj = after_zwj;
                                }
                            }
                        }
                        left_prop_pre_lb9 = prop;
                    }
                }
            }
        }
    }
}
1010
/// Where the iterator stands relative to the input, as reported by
/// `LineBreakIterator::check_eof`.
enum StringBoundaryPosType {
    /// Iteration has not produced anything yet; the caller should report
    /// position 0.
    Start,
    /// A current character is available; regular processing applies.
    Middle,
    /// The input is exhausted and the final break has been reported.
    End,
}
1016
1017impl<Y: LineBreakType> LineBreakIterator<'_, '_, Y> {
1018 fn advance_iter(&mut self) {
1019 self.current_pos_data = self.iter.next();
1020 }
1021
1022 fn is_eof(&self) -> bool {
1023 self.current_pos_data.is_none()
1024 }
1025
1026 #[inline]
1027 fn check_eof(&mut self) -> StringBoundaryPosType {
1028 if self.is_eof() {
1029 self.advance_iter();
1030 if self.is_eof() {
1031 if self.len == 0 {
1032 self.len = 1;
1036 StringBoundaryPosType::Start
1037 } else {
1038 StringBoundaryPosType::End
1039 }
1040 } else {
1041 StringBoundaryPosType::Start
1042 }
1043 } else {
1044 StringBoundaryPosType::Middle
1045 }
1046 }
1047
1048 fn get_current_position(&self) -> Option<usize> {
1049 self.current_pos_data.map(|(pos, _)| pos)
1050 }
1051
1052 fn get_current_codepoint(&self) -> Option<Y::CharType> {
1053 self.current_pos_data.map(|(_, codepoint)| codepoint)
1054 }
1055
1056 fn get_linebreak_property(&self, codepoint: Y::CharType) -> u8 {
1057 Y::get_linebreak_property_with_rule(self, codepoint)
1058 }
1059
1060 fn get_current_linebreak_property(&self) -> Option<u8> {
1061 self.get_current_codepoint()
1062 .map(|c| self.get_linebreak_property(c))
1063 }
1064
1065 fn is_break_by_normal(&self, codepoint: Y::CharType) -> bool {
1066 match codepoint.into() {
1067 0x301C | 0x30A0 => self.options.ja_zh,
1068 _ => false,
1069 }
1070 }
1071}
1072
1073impl LineBreakType for Utf8 {
1074 fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 {
1075 iterator.data.get_linebreak_property_utf32_with_rule(
1076 c as u32,
1077 iterator.options.strictness,
1078 iterator.options.word_option,
1079 )
1080 }
1081
1082 #[inline]
1083 fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool {
1084 iterator.data.use_complex_breaking_utf32(c as u32)
1085 }
1086
1087 fn line_handle_complex_language(
1088 iter: &mut LineBreakIterator<'_, '_, Self>,
1089 left_codepoint: char,
1090 ) -> Option<usize> {
1091 line_handle_complex_language_utf8(iter, left_codepoint)
1092 }
1093}
1094
1095impl LineBreakType for PotentiallyIllFormedUtf8 {
1096 fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 {
1097 iterator.data.get_linebreak_property_utf32_with_rule(
1098 c as u32,
1099 iterator.options.strictness,
1100 iterator.options.word_option,
1101 )
1102 }
1103
1104 #[inline]
1105 fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool {
1106 iterator.data.use_complex_breaking_utf32(c as u32)
1107 }
1108
1109 fn line_handle_complex_language(
1110 iter: &mut LineBreakIterator<'_, '_, Self>,
1111 left_codepoint: char,
1112 ) -> Option<usize> {
1113 line_handle_complex_language_utf8(iter, left_codepoint)
1114 }
1115}
1116fn line_handle_complex_language_utf8<T>(
1118 iter: &mut LineBreakIterator<'_, '_, T>,
1119 left_codepoint: char,
1120) -> Option<usize>
1121where
1122 T: LineBreakType<CharType = char>,
1123{
1124 let start_iter = iter.iter.clone();
1126 let start_point = iter.current_pos_data;
1127 let mut s = String::new();
1128 s.push(left_codepoint);
1129 loop {
1130 debug_assert!(!iter.is_eof());
1131 s.push(iter.get_current_codepoint()?);
1132 iter.advance_iter();
1133 if let Some(current_codepoint) = iter.get_current_codepoint() {
1134 if !T::use_complex_breaking(iter, current_codepoint) {
1135 break;
1136 }
1137 } else {
1138 break;
1140 }
1141 }
1142
1143 iter.iter = start_iter;
1145 iter.current_pos_data = start_point;
1146 let breaks = iter.complex.complex_language_segment_str(&s);
1147 iter.result_cache = breaks;
1148 let first_pos = *iter.result_cache.first()?;
1149 let mut i = left_codepoint.len_utf8();
1150 loop {
1151 if i == first_pos {
1152 iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
1154 return iter.get_current_position();
1155 }
1156 debug_assert!(
1157 i < first_pos,
1158 "we should always arrive at first_pos: near index {:?}",
1159 iter.get_current_position()
1160 );
1161 i += iter.get_current_codepoint().map_or(0, T::char_len);
1162 iter.advance_iter();
1163 if iter.is_eof() {
1164 iter.result_cache.clear();
1165 return Some(iter.len);
1166 }
1167 }
1168}
1169
1170impl LineBreakType for Latin1 {
1171 fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u8) -> u8 {
1172 iterator.data.property_table.get32(c as u32)
1175 }
1176
1177 #[inline]
1178 fn use_complex_breaking(_iterator: &LineBreakIterator<Self>, _c: u8) -> bool {
1179 false
1180 }
1181
1182 fn line_handle_complex_language(
1183 _: &mut LineBreakIterator<Self>,
1184 _: Self::CharType,
1185 ) -> Option<usize> {
1186 unreachable!()
1187 }
1188}
1189
1190impl LineBreakType for Utf16 {
1191 fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u32) -> u8 {
1192 iterator.data.get_linebreak_property_utf32_with_rule(
1193 c,
1194 iterator.options.strictness,
1195 iterator.options.word_option,
1196 )
1197 }
1198
1199 #[inline]
1200 fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: u32) -> bool {
1201 iterator.data.use_complex_breaking_utf32(c)
1202 }
1203
1204 fn line_handle_complex_language(
1205 iterator: &mut LineBreakIterator<Self>,
1206 left_codepoint: Self::CharType,
1207 ) -> Option<usize> {
1208 let start_iter = iterator.iter.clone();
1210 let start_point = iterator.current_pos_data;
1211 let mut s = vec![left_codepoint as u16];
1212 loop {
1213 debug_assert!(!iterator.is_eof());
1214 s.push(iterator.get_current_codepoint()? as u16);
1215 iterator.advance_iter();
1216 if let Some(current_codepoint) = iterator.get_current_codepoint() {
1217 if !Self::use_complex_breaking(iterator, current_codepoint) {
1218 break;
1219 }
1220 } else {
1221 break;
1223 }
1224 }
1225
1226 iterator.iter = start_iter;
1228 iterator.current_pos_data = start_point;
1229 let breaks = iterator.complex.complex_language_segment_utf16(&s);
1230 iterator.result_cache = breaks;
1231 let first_pos = *iterator.result_cache.first()?;
1233 let mut i = 1;
1234 loop {
1235 if i == first_pos {
1236 iterator.result_cache = iterator
1238 .result_cache
1239 .iter()
1240 .skip(1)
1241 .map(|r| r - i)
1242 .collect();
1243 return iterator.get_current_position();
1244 }
1245 debug_assert!(
1246 i < first_pos,
1247 "we should always arrive at first_pos: near index {:?}",
1248 iterator.get_current_position()
1249 );
1250 i += 1;
1251 iterator.advance_iter();
1252 if iterator.is_eof() {
1253 iterator.result_cache.clear();
1254 return Some(iterator.len);
1255 }
1256 }
1257 }
1258}
1259
1260#[cfg(test)]
1261#[cfg(feature = "serde")]
1262mod tests {
1263 use super::*;
1264 use crate::LineSegmenter;
1265
1266 #[test]
1267 fn linebreak_property() {
1268 let payload =
1269 DataProvider::<SegmenterBreakLineV1>::load(&crate::provider::Baked, Default::default())
1270 .expect("Loading should succeed!")
1271 .payload;
1272
1273 let get_linebreak_property = |codepoint| {
1274 payload.get().get_linebreak_property_utf32_with_rule(
1275 codepoint as u32,
1276 LineBreakStrictness::Strict,
1277 LineBreakWordOption::Normal,
1278 )
1279 };
1280
1281 assert_eq!(get_linebreak_property('\u{0020}'), SP);
1282 assert_eq!(get_linebreak_property('\u{0022}'), QU);
1283 assert_eq!(get_linebreak_property('('), OP_OP30);
1284 assert_eq!(get_linebreak_property('\u{0030}'), NU);
1285 assert_eq!(get_linebreak_property('['), OP_OP30);
1286 assert_eq!(get_linebreak_property('\u{1f3fb}'), EM);
1287 assert_eq!(get_linebreak_property('\u{20000}'), ID);
1288 assert_eq!(get_linebreak_property('\u{e0020}'), CM);
1289 assert_eq!(get_linebreak_property('\u{3041}'), CJ);
1290 assert_eq!(get_linebreak_property('\u{0025}'), PO);
1291 assert_eq!(get_linebreak_property('\u{00A7}'), AI);
1292 assert_eq!(get_linebreak_property('\u{50005}'), XX);
1293 assert_eq!(get_linebreak_property('\u{17D6}'), NS);
1294 assert_eq!(get_linebreak_property('\u{2014}'), B2);
1295 }
1296
1297 #[test]
1298 #[allow(clippy::bool_assert_comparison)] fn break_rule() {
1300 let payload =
1301 DataProvider::<SegmenterBreakLineV1>::load(&crate::provider::Baked, Default::default())
1302 .expect("Loading should succeed!")
1303 .payload;
1304 let lb_data: &RuleBreakData = payload.get();
1305
1306 let is_break = |left, right| {
1307 matches!(
1308 lb_data.get_break_state_from_table(left, right),
1309 BreakState::Break | BreakState::NoMatch
1310 )
1311 };
1312
1313 assert_eq!(is_break(BK, AL), true);
1315 assert_eq!(is_break(CR, LF), false);
1317 assert_eq!(is_break(CR, AL), true);
1318 assert_eq!(is_break(LF, AL), true);
1319 assert_eq!(is_break(NL, AL), true);
1320 assert_eq!(is_break(AL, BK), false);
1322 assert_eq!(is_break(AL, CR), false);
1323 assert_eq!(is_break(AL, LF), false);
1324 assert_eq!(is_break(AL, NL), false);
1325 assert_eq!(is_break(AL, SP), false);
1327 assert_eq!(is_break(AL, ZW), false);
1328 assert_eq!(is_break(ZWJ, SP), false);
1332 assert_eq!(is_break(SP, CM), true);
1333 assert_eq!(is_break(AL, WJ), false);
1335 assert_eq!(is_break(WJ, AL), false);
1336 assert_eq!(is_break(GL, AL), false);
1338 assert_eq!(is_break(AL, GL), false);
1340 assert_eq!(is_break(SP, GL), true);
1341 assert_eq!(is_break(AL, CL), false);
1343 assert_eq!(is_break(AL, CP), false);
1344 assert_eq!(is_break(AL, EX), false);
1345 assert_eq!(is_break(AL, IS), false);
1346 assert_eq!(is_break(AL, SY), false);
1347 assert_eq!(is_break(SP, AL), true);
1349 assert_eq!(is_break(AL, QU), false);
1351 assert_eq!(is_break(QU, AL), false);
1352 assert_eq!(is_break(AL, CB), true);
1354 assert_eq!(is_break(CB, AL), true);
1355 assert_eq!(is_break(AL, BA), false);
1357 assert_eq!(is_break(AL, HY), false);
1358 assert_eq!(is_break(AL, NS), false);
1359 assert_eq!(is_break(AL, BA), false);
1361 assert_eq!(is_break(BB, AL), false);
1362 assert_eq!(is_break(ID, BA), false);
1363 assert_eq!(is_break(ID, NS), false);
1364 assert_eq!(is_break(SY, HL), false);
1367 assert_eq!(is_break(AL, IN), false);
1369 assert_eq!(is_break(AL, NU), false);
1371 assert_eq!(is_break(HL, NU), false);
1372 assert_eq!(is_break(PR, ID), false);
1374 assert_eq!(is_break(PR, EB), false);
1375 assert_eq!(is_break(PR, EM), false);
1376 assert_eq!(is_break(ID, PO), false);
1377 assert_eq!(is_break(EB, PO), false);
1378 assert_eq!(is_break(EM, PO), false);
1379 assert_eq!(is_break(JL, JL), false);
1381 assert_eq!(is_break(JL, JV), false);
1382 assert_eq!(is_break(JL, H2), false);
1383 assert_eq!(is_break(JL, IN), false);
1385 assert_eq!(is_break(JL, PO), false);
1386 assert_eq!(is_break(PR, JL), false);
1387 assert_eq!(is_break(AL, AL), false);
1389 assert_eq!(is_break(HL, AL), false);
1390 assert_eq!(is_break(IS, AL), false);
1392 assert_eq!(is_break(IS, HL), false);
1393 assert_eq!(is_break(EB, EM), false);
1395 assert_eq!(is_break(ID, ID), true);
1397 }
1398
1399 #[test]
1400 fn linebreak() {
1401 let segmenter =
1402 LineSegmenter::try_new_dictionary_unstable(&crate::provider::Baked, Default::default())
1403 .expect("Data exists");
1404 let segmenter = segmenter.as_borrowed();
1405
1406 let mut iter = segmenter.segment_str("hello world");
1407 assert_eq!(Some(0), iter.next());
1408 assert_eq!(Some(6), iter.next());
1409 assert_eq!(Some(11), iter.next());
1410 assert_eq!(None, iter.next());
1411
1412 iter = segmenter.segment_str("$10 $10");
1413 assert_eq!(Some(0), iter.next());
1414 assert_eq!(Some(4), iter.next());
1415 assert_eq!(Some(7), iter.next());
1416 assert_eq!(None, iter.next());
1417
1418 iter = segmenter.segment_str("[ abc def");
1422 assert_eq!(Some(0), iter.next());
1423 assert_eq!(Some(7), iter.next());
1424 assert_eq!(Some(10), iter.next());
1425 assert_eq!(None, iter.next());
1426
1427 let input: [u8; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66];
1428 let mut iter_u8 = segmenter.segment_latin1(&input);
1429 assert_eq!(Some(0), iter_u8.next());
1430 assert_eq!(Some(7), iter_u8.next());
1431 assert_eq!(Some(10), iter_u8.next());
1432 assert_eq!(None, iter_u8.next());
1433
1434 let input: [u16; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66];
1435 let mut iter_u16 = segmenter.segment_utf16(&input);
1436 assert_eq!(Some(0), iter_u16.next());
1437 assert_eq!(Some(7), iter_u16.next());
1438 assert_eq!(Some(10), iter_u16.next());
1439 assert_eq!(None, iter_u16.next());
1440
1441 iter = segmenter.segment_str("abc\u{0022} (def");
1443 assert_eq!(Some(0), iter.next());
1444 assert_eq!(Some(6), iter.next());
1445 assert_eq!(Some(10), iter.next());
1446 assert_eq!(None, iter.next());
1447
1448 let input: [u8; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66];
1449 let mut iter_u8 = segmenter.segment_latin1(&input);
1450 assert_eq!(Some(0), iter_u8.next());
1451 assert_eq!(Some(6), iter_u8.next());
1452 assert_eq!(Some(10), iter_u8.next());
1453 assert_eq!(None, iter_u8.next());
1454
1455 let input: [u16; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66];
1456 let mut iter_u16 = segmenter.segment_utf16(&input);
1457 assert_eq!(Some(0), iter_u16.next());
1458 assert_eq!(Some(6), iter_u16.next());
1459 assert_eq!(Some(10), iter_u16.next());
1460 assert_eq!(None, iter_u16.next());
1461
1462 iter = segmenter.segment_str("« miaou »");
1464 assert_eq!(Some(0), iter.next());
1465 assert_eq!(Some(11), iter.next());
1466 assert_eq!(None, iter.next());
1467
1468 let input: Vec<u8> = "« miaou »"
1469 .chars()
1470 .map(|c| u8::try_from(u32::from(c)).unwrap())
1471 .collect();
1472 let mut iter_u8 = segmenter.segment_latin1(&input);
1473 assert_eq!(Some(0), iter_u8.next());
1474 assert_eq!(Some(9), iter_u8.next());
1475 assert_eq!(None, iter_u8.next());
1476
1477 let input: Vec<u16> = "« miaou »".encode_utf16().collect();
1478 let mut iter_u16 = segmenter.segment_utf16(&input);
1479 assert_eq!(Some(0), iter_u16.next());
1480 assert_eq!(Some(9), iter_u16.next());
1481 assert_eq!(None, iter_u16.next());
1482
1483 iter = segmenter.segment_str("Die Katze hat »miau« gesagt.");
1485 assert_eq!(Some(0), iter.next());
1486 assert_eq!(Some(4), iter.next());
1487 assert_eq!(Some(10), iter.next());
1488 assert_eq!(Some(14), iter.next());
1489 assert_eq!(Some(23), iter.next());
1490 assert_eq!(Some(30), iter.next());
1491 assert_eq!(None, iter.next());
1492
1493 let input: Vec<u8> = "Die Katze hat »miau« gesagt."
1494 .chars()
1495 .map(|c| u8::try_from(u32::from(c)).unwrap())
1496 .collect();
1497 let mut iter_u8 = segmenter.segment_latin1(&input);
1498 assert_eq!(Some(0), iter_u8.next());
1499 assert_eq!(Some(4), iter_u8.next());
1500 assert_eq!(Some(10), iter_u8.next());
1501 assert_eq!(Some(14), iter_u8.next());
1502 assert_eq!(Some(21), iter_u8.next());
1503 assert_eq!(Some(28), iter_u8.next());
1504 assert_eq!(None, iter_u8.next());
1505
1506 let input: Vec<u16> = "Die Katze hat »miau« gesagt.".encode_utf16().collect();
1507 let mut iter_u16 = segmenter.segment_utf16(&input);
1508 assert_eq!(Some(0), iter_u16.next());
1509 assert_eq!(Some(4), iter_u16.next());
1510 assert_eq!(Some(10), iter_u16.next());
1511 assert_eq!(Some(14), iter_u16.next());
1512 assert_eq!(Some(21), iter_u16.next());
1513 assert_eq!(Some(28), iter_u16.next());
1514 assert_eq!(None, iter_u16.next());
1515
1516 iter = segmenter.segment_str("\u{0029}\u{203C}");
1518 assert_eq!(Some(0), iter.next());
1519 assert_eq!(Some(4), iter.next());
1520 assert_eq!(None, iter.next());
1521 iter = segmenter.segment_str("\u{0029} \u{203C}");
1522 assert_eq!(Some(0), iter.next());
1523 assert_eq!(Some(6), iter.next());
1524 assert_eq!(None, iter.next());
1525
1526 let input: [u16; 4] = [0x29, 0x20, 0x20, 0x203c];
1527 let mut iter_u16 = segmenter.segment_utf16(&input);
1528 assert_eq!(Some(0), iter_u16.next());
1529 assert_eq!(Some(4), iter_u16.next());
1530 assert_eq!(None, iter_u16.next());
1531
1532 iter = segmenter.segment_str("\u{2014}\u{2014}aa");
1534 assert_eq!(Some(0), iter.next());
1535 assert_eq!(Some(6), iter.next());
1536 assert_eq!(Some(8), iter.next());
1537 assert_eq!(None, iter.next());
1538 iter = segmenter.segment_str("\u{2014} \u{2014}aa");
1539 assert_eq!(Some(0), iter.next());
1540 assert_eq!(Some(8), iter.next());
1541 assert_eq!(Some(10), iter.next());
1542 assert_eq!(None, iter.next());
1543
1544 iter = segmenter.segment_str("\u{2014}\u{2014} \u{2014}\u{2014}123 abc");
1545 assert_eq!(Some(0), iter.next());
1546 assert_eq!(Some(14), iter.next());
1547 assert_eq!(Some(18), iter.next());
1548 assert_eq!(Some(21), iter.next());
1549 assert_eq!(None, iter.next());
1550
1551 let mut iter = segmenter.segment_str("(0,1)+(2,3)");
1553 assert_eq!(Some(0), iter.next());
1554 assert_eq!(Some(11), iter.next());
1555 assert_eq!(None, iter.next());
1556 let input: [u16; 11] = [
1557 0x28, 0x30, 0x2C, 0x31, 0x29, 0x2B, 0x28, 0x32, 0x2C, 0x33, 0x29,
1558 ];
1559 let mut iter_u16 = segmenter.segment_utf16(&input);
1560 assert_eq!(Some(0), iter_u16.next());
1561 assert_eq!(Some(11), iter_u16.next());
1562 assert_eq!(None, iter_u16.next());
1563
1564 let input: [u16; 13] = [
1565 0x2014, 0x2014, 0x20, 0x20, 0x2014, 0x2014, 0x31, 0x32, 0x33, 0x20, 0x61, 0x62, 0x63,
1566 ];
1567 let mut iter_u16 = segmenter.segment_utf16(&input);
1568 assert_eq!(Some(0), iter_u16.next());
1569 assert_eq!(Some(6), iter_u16.next());
1570 assert_eq!(Some(10), iter_u16.next());
1571 assert_eq!(Some(13), iter_u16.next());
1572 assert_eq!(None, iter_u16.next());
1573
1574 iter = segmenter.segment_str("\u{1F3FB} \u{1F3FB}");
1575 assert_eq!(Some(0), iter.next());
1576 assert_eq!(Some(5), iter.next());
1577 assert_eq!(Some(9), iter.next());
1578 assert_eq!(None, iter.next());
1579 }
1580
/// Checks the break offsets reported for Thai text by a segmenter built with
/// `LineSegmenter::new_lstm`, for both UTF-8 and UTF-16 input.
///
/// Each assertion carries its own message so a failure identifies which of
/// the three checks broke.
#[test]
#[cfg(feature = "lstm")]
fn thai_line_break() {
    const TEST_STR: &str = "ภาษาไทยภาษาไทย";

    let segmenter = LineSegmenter::new_lstm(Default::default());

    // UTF-8 input: the final offset is compared against the byte length.
    let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
    assert_eq!(breaks, [0, 12, 21, 33, TEST_STR.len()], "Thai test");

    // Same text as UTF-16: the final offset is the number of code units.
    let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
    let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
    assert_eq!(breaks, [0, 4, 7, 11, utf16.len()], "Thai utf-16 test");

    // Four code units (U+0E20 U+0E32 U+0E29 U+0E32): only the start and the
    // end offsets are expected.
    let utf16: [u16; 4] = [0x0e20, 0x0e32, 0x0e29, 0x0e32];
    let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
    assert_eq!(breaks, [0, 4], "Thai utf-16 short input test");
}
1598
/// Segments a Burmese sample with a segmenter built by
/// `LineSegmenter::new_lstm` and verifies the offsets for the UTF-8 and
/// UTF-16 encodings of the same text.
#[test]
#[cfg(feature = "lstm")]
fn burmese_line_break() {
    const TEST_STR: &str = "မြန်မာဘာသာစကား";

    let segmenter = LineSegmenter::new_lstm(Default::default());

    // UTF-8: the last expected offset is the byte length of the sample.
    let expected_utf8 = [0, 12, 18, 30, TEST_STR.len()];
    let utf8_breaks = segmenter.segment_str(TEST_STR).collect::<Vec<usize>>();
    assert_eq!(utf8_breaks, expected_utf8, "Burmese test");

    // UTF-16: the last expected offset is the number of code units.
    let code_units = TEST_STR.encode_utf16().collect::<Vec<u16>>();
    let expected_utf16 = [0, 4, 6, 10, code_units.len()];
    let utf16_breaks = segmenter
        .segment_utf16(&code_units)
        .collect::<Vec<usize>>();
    assert_eq!(utf16_breaks, expected_utf16, "Burmese utf-16 test");
}
1615
/// Segments a Khmer sample with a segmenter built by
/// `LineSegmenter::new_lstm` and compares the reported offsets with the
/// expected ones, once for UTF-8 input and once for UTF-16 input.
#[test]
#[cfg(feature = "lstm")]
fn khmer_line_break() {
    const TEST_STR: &str = "សេចក្ដីប្រកាសជាសកលស្ដីពីសិទ្ធិមនុស្ស";

    let segmenter = LineSegmenter::new_lstm(Default::default());

    // UTF-8 pass: the final offset equals the byte length of the sample.
    let expected_utf8 = [0, 39, 48, 54, 72, TEST_STR.len()];
    let utf8_breaks = segmenter.segment_str(TEST_STR).collect::<Vec<usize>>();
    assert_eq!(utf8_breaks, expected_utf8, "Khmer test");

    // UTF-16 pass: the final offset equals the number of code units.
    let code_units = TEST_STR.encode_utf16().collect::<Vec<u16>>();
    let expected_utf16 = [0, 13, 16, 18, 24, code_units.len()];
    let utf16_breaks = segmenter
        .segment_utf16(&code_units)
        .collect::<Vec<usize>>();
    assert_eq!(utf16_breaks, expected_utf16, "Khmer utf-16 test");
}
1634
/// Segments a Lao sample with a segmenter built by `LineSegmenter::new_lstm`
/// and verifies the offsets for both the UTF-8 and the UTF-16 form of the
/// text.
#[test]
#[cfg(feature = "lstm")]
fn lao_line_break() {
    const TEST_STR: &str = "ກ່ຽວກັບສິດຂອງມະນຸດ";

    let segmenter = LineSegmenter::new_lstm(Default::default());

    // UTF-8 form: offsets end at the byte length of the sample.
    let expected_utf8 = [0, 12, 21, 30, 39, TEST_STR.len()];
    let utf8_breaks = segmenter.segment_str(TEST_STR).collect::<Vec<usize>>();
    assert_eq!(utf8_breaks, expected_utf8, "Lao test");

    // UTF-16 form: offsets end at the number of code units.
    let code_units = TEST_STR.encode_utf16().collect::<Vec<u16>>();
    let expected_utf16 = [0, 4, 7, 10, 13, code_units.len()];
    let utf16_breaks = segmenter
        .segment_utf16(&code_units)
        .collect::<Vec<usize>>();
    assert_eq!(utf16_breaks, expected_utf16, "Lao utf-16 test");
}
1649
/// Segmenting an empty string with a segmenter built by
/// `LineSegmenter::new_auto` must report offset 0 and nothing else.
#[test]
fn empty_string() {
    let segmenter = LineSegmenter::new_auto(Default::default());

    let expected: [usize; 1] = [0];
    let found = segmenter.segment_str("").collect::<Vec<usize>>();
    assert_eq!(found, expected);
}
1656}