1#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)]
17
18mod lstm;
19pub use lstm::*;
20
21use crate::options::WordType;
22use icu_collections::codepointtrie::CodePointTrie;
23use icu_provider::prelude::*;
24use zerovec::ZeroVec;
25
/// Unit struct that the compiled-data provider implementations are attached to.
///
/// Only present when the `compiled_data` feature is enabled. The macro
/// invocations in the anonymous `const` block of this module all receive
/// `Baked` as their argument.
#[cfg(feature = "compiled_data")]
#[derive(Debug)]
pub struct Baked;
36
// Anonymous constant whose block hosts the macro-generated provider code for
// `Baked`. Items declared inside the block (the glob import and the local
// `icu` module) are scoped to it and are not nameable from the rest of the
// module.
#[cfg(feature = "compiled_data")]
#[allow(unused_imports)]
const _: () = {
    // Brings the `make_provider!` / `impl_*!` macros used below into scope.
    // NOTE(review): presumably they are all exported by `icu_segmenter_data` — confirm.
    use icu_segmenter_data::*;
    // Local alias module exposing this crate and two dependencies as
    // `icu::segmenter`, `icu::collections` and `icu::locale`.
    // NOTE(review): presumably the generated code refers to paths of this
    // shape — confirm against the macro definitions.
    pub mod icu {
        pub use crate as segmenter;
        pub use icu_collections as collections;
        pub use icu_locale as locale;
    }
    make_provider!(Baked);
    impl_segmenter_break_sentence_v1!(Baked);
    impl_segmenter_dictionary_auto_v1!(Baked);
    impl_segmenter_break_grapheme_cluster_v1!(Baked);
    impl_segmenter_dictionary_extended_v1!(Baked);
    impl_segmenter_break_line_v1!(Baked);
    // The LSTM data impl is the only one additionally gated on the `lstm` feature.
    #[cfg(feature = "lstm")]
    impl_segmenter_lstm_auto_v1!(Baked);
    impl_segmenter_break_word_v1!(Baked);
    impl_segmenter_break_word_override_v1!(Baked);
    impl_segmenter_break_sentence_override_v1!(Baked);
};
58
icu_provider::data_marker!(
    /// Data marker whose data struct is `LstmData<'static>`.
    ///
    /// When the `datagen` feature is enabled the marker is additionally
    /// declared with `attributes_domain = "segmenter"`.
    SegmenterLstmAutoV1,
    "segmenter/lstm/auto/v1",
    LstmData<'static>,
    #[cfg(feature = "datagen")]
    attributes_domain = "segmenter"
);
icu_provider::data_marker!(
    /// Data marker whose data struct is `UCharDictionaryBreakData<'static>`
    /// (the "auto" dictionary variant, per the marker name).
    ///
    /// When the `datagen` feature is enabled the marker is additionally
    /// declared with `attributes_domain = "segmenter"`.
    SegmenterDictionaryAutoV1,
    "segmenter/dictionary/auto/v1",
    UCharDictionaryBreakData<'static>,
    #[cfg(feature = "datagen")]
    attributes_domain = "segmenter"
);
icu_provider::data_marker!(
    /// Data marker whose data struct is `UCharDictionaryBreakData<'static>`
    /// (the "extended" dictionary variant, per the marker name).
    ///
    /// When the `datagen` feature is enabled the marker is additionally
    /// declared with `attributes_domain = "segmenter"`.
    SegmenterDictionaryExtendedV1,
    "segmenter/dictionary/extended/v1",
    UCharDictionaryBreakData<'static>,
    #[cfg(feature = "datagen")]
    attributes_domain = "segmenter"
);
icu_provider::data_marker!(
    /// Data marker whose data struct is `RuleBreakDataOverride<'static>`,
    /// for sentence-break overrides (per the marker name).
    ///
    /// Unlike the non-override break markers in this module, it is not
    /// declared with `is_singleton = true`.
    SegmenterBreakSentenceOverrideV1,
    "segmenter/break/sentence/override/v1",
    RuleBreakDataOverride<'static>,
);
icu_provider::data_marker!(
    /// Data marker whose data struct is `RuleBreakDataOverride<'static>`,
    /// for word-break overrides (per the marker name).
    ///
    /// Unlike the non-override break markers in this module, it is not
    /// declared with `is_singleton = true`.
    SegmenterBreakWordOverrideV1,
    "segmenter/break/word/override/v1",
    RuleBreakDataOverride<'static>,
);
icu_provider::data_marker!(
    /// Singleton data marker (`is_singleton = true`) whose data struct is
    /// `RuleBreakData<'static>`, for line breaking (per the marker name).
    SegmenterBreakLineV1,
    "segmenter/break/line/v1",
    RuleBreakData<'static>,
    is_singleton = true
);
icu_provider::data_marker!(
    /// Singleton data marker (`is_singleton = true`) whose data struct is
    /// `RuleBreakData<'static>`, for word breaking (per the marker name).
    SegmenterBreakWordV1,
    "segmenter/break/word/v1",
    RuleBreakData<'static>,
    is_singleton = true
);
icu_provider::data_marker!(
    /// Singleton data marker (`is_singleton = true`) whose data struct is
    /// `RuleBreakData<'static>`, for grapheme cluster breaking (per the
    /// marker name).
    SegmenterBreakGraphemeClusterV1,
    "segmenter/break/grapheme/cluster/v1",
    RuleBreakData<'static>,
    is_singleton = true
);
icu_provider::data_marker!(
    /// Singleton data marker (`is_singleton = true`) whose data struct is
    /// `RuleBreakData<'static>`, for sentence breaking (per the marker name).
    SegmenterBreakSentenceV1,
    "segmenter/break/sentence/v1",
    RuleBreakData<'static>,
    is_singleton = true
);
123
124pub use crate::word::inner::WordTypeULE;
125
/// The `DataMarkerInfo` of every data marker declared in this module.
///
/// Only compiled with the `datagen` feature. Entries are listed in
/// alphabetical order of the marker names.
#[cfg(feature = "datagen")]
pub const MARKERS: &[DataMarkerInfo] = &[
    SegmenterBreakGraphemeClusterV1::INFO,
    SegmenterBreakLineV1::INFO,
    SegmenterBreakSentenceOverrideV1::INFO,
    SegmenterBreakSentenceV1::INFO,
    SegmenterBreakWordOverrideV1::INFO,
    SegmenterBreakWordV1::INFO,
    SegmenterDictionaryAutoV1::INFO,
    SegmenterDictionaryExtendedV1::INFO,
    SegmenterLstmAutoV1::INFO,
];
139
/// Data struct shared by the line, word, grapheme cluster and sentence break
/// markers declared in this module.
///
/// Serialization and baking support is only derived under the `datagen`
/// feature; deserialization only under the `serde` feature.
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_segmenter::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct RuleBreakData<'data> {
    /// Code point trie yielding a `u8` property value per code point.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub property_table: CodePointTrie<'data, u8>,

    /// Table of [`BreakState`] entries.
    ///
    /// NOTE(review): presumably indexed by a pair of property values — confirm
    /// against the segmenter implementation.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub break_state_table: ZeroVec<'data, BreakState>,

    /// Table of `WordType` values.
    ///
    /// With serde this field is (de)serialized under the name
    /// `rule_status_table`, not `word_type_table`.
    #[cfg_attr(feature = "serde", serde(borrow, rename = "rule_status_table"))]
    pub word_type_table: ZeroVec<'data, WordType>,

    /// NOTE(review): presumably the number of distinct property values used by
    /// the tables above — confirm.
    pub property_count: u8,

    /// NOTE(review): presumably the highest property value that is assigned
    /// directly to code points — confirm.
    pub last_codepoint_property: u8,

    /// NOTE(review): presumably the property value standing for start-of-text — confirm.
    pub sot_property: u8,

    /// NOTE(review): presumably the property value standing for end-of-text — confirm.
    pub eot_property: u8,

    /// NOTE(review): presumably the property value marking text that needs
    /// dictionary/LSTM ("complex") handling — confirm.
    pub complex_property: u8,
}

// Invokes `icu_provider::data_struct!` for `RuleBreakData`; the trailing
// `#[cfg(feature = "datagen")]` is passed to the macro as an argument.
// NOTE(review): presumably it gates datagen-only generated code — confirm
// against the macro documentation.
icu_provider::data_struct!(
    RuleBreakData<'_>,
    #[cfg(feature = "datagen")]
);
186
/// Data struct of the dictionary markers (`SegmenterDictionaryAutoV1`,
/// `SegmenterDictionaryExtendedV1`) and of the crate-private
/// `UCharDictionaryBreakDataV1` dynamic marker.
///
/// Serialization and baking support is only derived under the `datagen`
/// feature; deserialization only under the `serde` feature.
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_segmenter::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct UCharDictionaryBreakData<'data> {
    /// Dictionary trie stored as a flat sequence of `u16` units.
    ///
    /// NOTE(review): the trie layout is not visible here — see the dictionary
    /// segmenter for how the units are interpreted.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub trie_data: ZeroVec<'data, u16>,
}

// Invokes `icu_provider::data_struct!` for `UCharDictionaryBreakData`; the
// trailing `#[cfg(feature = "datagen")]` is passed to the macro as an argument.
// NOTE(review): presumably it gates datagen-only generated code — confirm
// against the macro documentation.
icu_provider::data_struct!(
    UCharDictionaryBreakData<'_>,
    #[cfg(feature = "datagen")]
);
208
/// Crate-private dynamic data marker whose data struct is
/// `UCharDictionaryBreakData<'static>`.
///
/// NOTE(review): presumably lets payloads loaded through either dictionary
/// marker be handled under one marker type — confirm at the use sites.
pub(crate) struct UCharDictionaryBreakDataV1;

impl DynamicDataMarker for UCharDictionaryBreakDataV1 {
    type DataStruct = UCharDictionaryBreakData<'static>;
}
214
215#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
217#[cfg_attr(
218 feature = "datagen",
219 derive(serde::Serialize,databake::Bake),
220 databake(path = icu_segmenter::provider),
221)]
222#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
223pub struct RuleBreakDataOverride<'data> {
224 #[cfg_attr(feature = "serde", serde(borrow))]
226 pub property_table_override: CodePointTrie<'data, u8>,
227}
228
229icu_provider::data_struct!(
230 RuleBreakDataOverride<'_>,
231 #[cfg(feature = "datagen")]
232);
233
/// Element type of `RuleBreakData::break_state_table`.
///
/// Each value is stored as a single byte; the byte values quoted on the
/// variants come from the `zerovec::ule::AsULE` implementation in this module.
#[derive(Clone, Copy, PartialEq, Debug)]
#[cfg_attr(feature = "datagen", derive(databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_segmenter::provider))]
pub enum BreakState {
    /// Encoded as byte `253`. Presumably "break here" — confirm at the use sites.
    Break,
    /// Encoded as byte `255`. Presumably "do not break here" — confirm at the use sites.
    Keep,
    /// Encoded as byte `254`. Presumably "no rule matched" — confirm at the use sites.
    NoMatch,
    /// Encoded as payload + `120`; bytes `120..=252` decode to this variant.
    Intermediate(u8),
    /// Encoded as the payload itself; bytes `0..=119` decode to this variant.
    Index(u8),
}
256
/// Serializes a `BreakState` through its single-byte `AsULE` encoding.
///
/// Human-readable formats receive that byte reinterpreted as a signed `i8`;
/// every other format receives the raw `u8`.
#[cfg(feature = "datagen")]
impl serde::Serialize for BreakState {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        let encoded: u8 = zerovec::ule::AsULE::to_unaligned(*self);
        if !serializer.is_human_readable() {
            return encoded.serialize(serializer);
        }
        // Same bit pattern viewed as signed, so bytes above 127 come out negative.
        i8::from_le_bytes([encoded]).serialize(serializer)
    }
}
271
/// Deserializes a `BreakState` from its single-byte `AsULE` encoding.
///
/// Mirrors the `Serialize` impl: human-readable formats carry the byte as a
/// signed `i8`, every other format as a raw `u8`.
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for BreakState {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let encoded: u8 = if deserializer.is_human_readable() {
            // Reinterpret the signed value's bit pattern as the unsigned byte.
            i8::deserialize(deserializer)?.to_le_bytes()[0]
        } else {
            u8::deserialize(deserializer)?
        };
        Ok(<Self as zerovec::ule::AsULE>::from_unaligned(encoded))
    }
}
287
288impl zerovec::ule::AsULE for BreakState {
289 type ULE = u8;
290
291 fn to_unaligned(self) -> Self::ULE {
292 match self {
293 BreakState::Break => 253,
294 BreakState::Keep => 255,
295 BreakState::NoMatch => 254,
296 BreakState::Intermediate(i) => i + 120,
297 BreakState::Index(i) => i,
298 }
299 }
300
301 fn from_unaligned(unaligned: Self::ULE) -> Self {
302 match unaligned {
303 253 => BreakState::Break,
304 255 => BreakState::Keep,
305 254 => BreakState::NoMatch,
306 i if (120..253).contains(&i) => BreakState::Intermediate(i - 120),
307 i => BreakState::Index(i),
308 }
309 }
310}
311
/// Serializes a `WordType` as its `u8` cast for human-readable formats.
///
/// # Panics
///
/// Panics with "only used as ULE" if the serializer is not human-readable.
#[cfg(feature = "datagen")]
impl serde::Serialize for WordType {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        if !serializer.is_human_readable() {
            unreachable!("only used as ULE")
        }
        (*self as u8).serialize(serializer)
    }
}
325
// `Bake` is implemented for `WordType` only so the trait bound is satisfied;
// calling `bake` always panics with "only used as ULE".
// NOTE(review): presumably needed because `RuleBreakData` derives `Bake`
// while holding a `ZeroVec<WordType>` — confirm.
#[cfg(feature = "datagen")]
impl databake::Bake for WordType {
    fn bake(&self, _crate_env: &databake::CrateEnv) -> databake::TokenStream {
        unreachable!("only used as ULE")
    }
}
332
/// Deserializes a `WordType` from a `u8` in human-readable formats.
///
/// Accepts `0` (`None`), `1` (`Number`) and `2` (`Letter`); any other value
/// is rejected with the custom error "invalid value".
///
/// # Panics
///
/// Panics with "only used as ULE" if the deserializer is not human-readable.
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for WordType {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        use serde::de::Error;

        if !deserializer.is_human_readable() {
            unreachable!("only used as ULE")
        }
        match u8::deserialize(deserializer)? {
            0 => Ok(WordType::None),
            1 => Ok(WordType::Number),
            2 => Ok(WordType::Letter),
            _ => Err(D::Error::custom("invalid value")),
        }
    }
}