icu_segmenter/provider/
mod.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component.
6//!
7//! <div class="stab unstable">
8//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
9//! including in SemVer minor releases. While the serde representation of data structs is guaranteed
10//! to be stable, their Rust representation might not be. Use with caution.
11//! </div>
12//!
13//! Read more about data providers: [`icu_provider`]
14
15// Provider structs must be stable
16#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)]
17
18mod lstm;
19pub use lstm::*;
20
21use crate::options::WordType;
22use icu_collections::codepointtrie::CodePointTrie;
23use icu_provider::prelude::*;
24use zerovec::ZeroVec;
25
/// Unit type that the compiled-in ("baked") data provider implementations are attached to.
///
/// Only available with the `compiled_data` feature. The provider implementations themselves
/// are generated by macros from `icu_segmenter_data`.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. In particular, the `DataProvider` implementations are only
/// guaranteed to match with this version's `*_unstable` providers. Use with caution.
/// </div>
#[cfg(feature = "compiled_data")]
#[derive(Debug)]
pub struct Baked;
36
// Attaches the baked data to `Baked`. Everything is wrapped in an anonymous `const _` so that
// the glob import and the helper `icu` module stay local to this block.
#[cfg(feature = "compiled_data")]
#[allow(unused_imports)]
const _: () = {
    use icu_segmenter_data::*;
    // Crate aliases under a common `icu` root.
    // NOTE(review): presumably the generated macros refer to crates through `icu::…` paths —
    // confirm against `icu_segmenter_data`.
    pub mod icu {
        pub use crate as segmenter;
        pub use icu_collections as collections;
        pub use icu_locale as locale;
    }
    make_provider!(Baked);
    impl_segmenter_break_sentence_v1!(Baked);
    impl_segmenter_dictionary_auto_v1!(Baked);
    impl_segmenter_break_grapheme_cluster_v1!(Baked);
    impl_segmenter_dictionary_extended_v1!(Baked);
    impl_segmenter_break_line_v1!(Baked);
    // LSTM data is only wired in when the `lstm` feature is enabled.
    #[cfg(feature = "lstm")]
    impl_segmenter_lstm_auto_v1!(Baked);
    impl_segmenter_break_word_v1!(Baked);
    impl_segmenter_break_word_override_v1!(Baked);
    impl_segmenter_break_sentence_override_v1!(Baked);
};
58
icu_provider::data_marker!(
    /// `SegmenterLstmAutoV1`
    ///
    /// Marker for [`LstmData`] payloads, declared with the string `segmenter/lstm/auto/v1`.
    SegmenterLstmAutoV1,
    "segmenter/lstm/auto/v1",
    LstmData<'static>,
    #[cfg(feature = "datagen")]
    attributes_domain = "segmenter"
);
icu_provider::data_marker!(
    /// `SegmenterDictionaryAutoV1`
    ///
    /// Marker for [`UCharDictionaryBreakData`] payloads, declared with the string
    /// `segmenter/dictionary/auto/v1`.
    SegmenterDictionaryAutoV1,
    "segmenter/dictionary/auto/v1",
    UCharDictionaryBreakData<'static>,
    #[cfg(feature = "datagen")]
    attributes_domain = "segmenter"
);
icu_provider::data_marker!(
    /// `SegmenterDictionaryExtendedV1`
    ///
    /// Marker for [`UCharDictionaryBreakData`] payloads, declared with the string
    /// `segmenter/dictionary/extended/v1`.
    SegmenterDictionaryExtendedV1,
    "segmenter/dictionary/extended/v1",
    UCharDictionaryBreakData<'static>,
    #[cfg(feature = "datagen")]
    attributes_domain = "segmenter"
);
icu_provider::data_marker!(
    /// `SegmenterBreakSentenceOverrideV1`
    ///
    /// Marker for [`RuleBreakDataOverride`] payloads, declared with the string
    /// `segmenter/break/sentence/override/v1`.
    SegmenterBreakSentenceOverrideV1,
    "segmenter/break/sentence/override/v1",
    RuleBreakDataOverride<'static>,
);
icu_provider::data_marker!(
    /// `SegmenterBreakWordOverrideV1`
    ///
    /// Marker for [`RuleBreakDataOverride`] payloads, declared with the string
    /// `segmenter/break/word/override/v1`.
    SegmenterBreakWordOverrideV1,
    "segmenter/break/word/override/v1",
    RuleBreakDataOverride<'static>,
);
icu_provider::data_marker!(
    /// `SegmenterBreakLineV1`
    ///
    /// Singleton marker for [`RuleBreakData`] payloads, declared with the string
    /// `segmenter/break/line/v1`.
    SegmenterBreakLineV1,
    "segmenter/break/line/v1",
    RuleBreakData<'static>,
    is_singleton = true
);
icu_provider::data_marker!(
    /// `SegmenterBreakWordV1`
    ///
    /// Singleton marker for [`RuleBreakData`] payloads, declared with the string
    /// `segmenter/break/word/v1`.
    SegmenterBreakWordV1,
    "segmenter/break/word/v1",
    RuleBreakData<'static>,
    is_singleton = true
);
icu_provider::data_marker!(
    /// `SegmenterBreakGraphemeClusterV1`
    ///
    /// Singleton marker for [`RuleBreakData`] payloads, declared with the string
    /// `segmenter/break/grapheme/cluster/v1`.
    SegmenterBreakGraphemeClusterV1,
    "segmenter/break/grapheme/cluster/v1",
    RuleBreakData<'static>,
    is_singleton = true
);
icu_provider::data_marker!(
    /// `SegmenterBreakSentenceV1`
    ///
    /// Singleton marker for [`RuleBreakData`] payloads, declared with the string
    /// `segmenter/break/sentence/v1`.
    SegmenterBreakSentenceV1,
    "segmenter/break/sentence/v1",
    RuleBreakData<'static>,
    is_singleton = true
);
123
124pub use crate::word::inner::WordTypeULE;
125
#[cfg(feature = "datagen")]
/// The latest minimum set of markers required by this component.
///
/// Lists the `INFO` of every data marker declared in this module, in alphabetical order of the
/// marker name. Only available with the `datagen` feature.
pub const MARKERS: &[DataMarkerInfo] = &[
    SegmenterBreakGraphemeClusterV1::INFO,
    SegmenterBreakLineV1::INFO,
    SegmenterBreakSentenceOverrideV1::INFO,
    SegmenterBreakSentenceV1::INFO,
    SegmenterBreakWordOverrideV1::INFO,
    SegmenterBreakWordV1::INFO,
    SegmenterDictionaryAutoV1::INFO,
    SegmenterDictionaryExtendedV1::INFO,
    SegmenterLstmAutoV1::INFO,
];
139
/// Pre-processed Unicode data in the form of tables to be used for rule-based breaking.
///
/// This is the payload type of the line, word, grapheme cluster and sentence break markers
/// declared in this module.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_segmenter::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct RuleBreakData<'data> {
    /// Property table.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub property_table: CodePointTrie<'data, u8>,

    /// Break state table.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub break_state_table: ZeroVec<'data, BreakState>,

    /// Word type table. Only used for word segmenter.
    ///
    /// The serialized field name is `rule_status_table` (see the `serde(rename)` attribute).
    #[cfg_attr(feature = "serde", serde(borrow, rename = "rule_status_table"))]
    pub word_type_table: ZeroVec<'data, WordType>,

    /// Number of properties; should be the square root of the length of [`Self::break_state_table`].
    pub property_count: u8,

    /// The index of the last simple state for [`Self::break_state_table`]. (A simple state has
    /// neither `left` nor `right` in SegmenterProperty).
    pub last_codepoint_property: u8,

    /// The index of SOT (start of text) state for [`Self::break_state_table`].
    pub sot_property: u8,

    /// The index of EOT (end of text) state for [`Self::break_state_table`].
    pub eot_property: u8,

    /// The index of "SA" state (or 127 if the complex language isn't handled) for
    /// [`Self::break_state_table`].
    pub complex_property: u8,
}

// Passes `RuleBreakData` to `icu_provider`'s `data_struct!` macro; the trailing attribute is
// forwarded to the macro.
// NOTE(review): presumably it gates datagen-only impls — confirm against the macro definition.
icu_provider::data_struct!(
    RuleBreakData<'_>,
    #[cfg(feature = "datagen")]
);
186
/// Char16Trie data used for dictionary-based breaking.
///
/// This is the payload type of the `SegmenterDictionaryAutoV1` and
/// `SegmenterDictionaryExtendedV1` markers declared in this module.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_segmenter::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct UCharDictionaryBreakData<'data> {
    /// Dictionary data of the Char16Trie, stored as 16-bit units.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub trie_data: ZeroVec<'data, u16>,
}

// Passes `UCharDictionaryBreakData` to `icu_provider`'s `data_struct!` macro; the trailing
// attribute is forwarded to the macro.
// NOTE(review): presumably it gates datagen-only impls — confirm against the macro definition.
icu_provider::data_struct!(
    UCharDictionaryBreakData<'_>,
    #[cfg(feature = "datagen")]
);
208
/// Crate-internal dynamic data marker whose payload is [`UCharDictionaryBreakData`].
///
/// NOTE(review): presumably lets payloads loaded through either dictionary marker
/// (`SegmenterDictionaryAutoV1` / `SegmenterDictionaryExtendedV1`) be held under one marker
/// type — confirm against the callers.
pub(crate) struct UCharDictionaryBreakDataV1;

impl DynamicDataMarker for UCharDictionaryBreakDataV1 {
    // Same data struct as the two public dictionary markers.
    type DataStruct = UCharDictionaryBreakData<'static>;
}
214
215/// codepoint trie data that the difference by specific locale
216#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
217#[cfg_attr(
218    feature = "datagen",
219    derive(serde::Serialize,databake::Bake),
220    databake(path = icu_segmenter::provider),
221)]
222#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
223pub struct RuleBreakDataOverride<'data> {
224    /// The difference of property table for special locale.
225    #[cfg_attr(feature = "serde", serde(borrow))]
226    pub property_table_override: CodePointTrie<'data, u8>,
227}
228
229icu_provider::data_struct!(
230    RuleBreakDataOverride<'_>,
231    #[cfg(feature = "datagen")]
232);
233
/// One cell of a rule-break state table (see `RuleBreakData::break_state_table`).
///
/// Each value is stored as a single byte; the mapping lives in the `zerovec::ule::AsULE`
/// implementation for this type.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Clone, Copy, PartialEq, Debug)]
#[cfg_attr(feature = "datagen", derive(databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_segmenter::provider))]
pub enum BreakState {
    /// Break
    Break,
    /// Keep rule
    Keep,
    /// Non-matching rule
    NoMatch,
    /// One more character of look-ahead is needed before deciding.
    Intermediate(u8),
    /// Index of a state.
    Index(u8),
}
256
#[cfg(feature = "datagen")]
impl serde::Serialize for BreakState {
    /// Serializes the state as its one-byte ULE encoding.
    ///
    /// Binary formats get the raw `u8`. Human-readable formats get the same byte reinterpreted
    /// as an `i8`, so e.g. the byte `255` is written as `-1`.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        let raw: u8 = zerovec::ule::AsULE::to_unaligned(*self);
        if !serializer.is_human_readable() {
            return raw.serialize(serializer);
        }
        // A derived, self-describing form would be nicer for JSON, but the established
        // serialization must not be broken.
        i8::from_le_bytes([raw]).serialize(serializer)
    }
}
271
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for BreakState {
    /// Reads the one-byte ULE encoding and decodes it into a `BreakState`.
    ///
    /// Human-readable formats carry the byte as an `i8`; binary formats carry it as a `u8`.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let raw: u8 = if deserializer.is_human_readable() {
            // Reinterpret the signed value as the underlying byte.
            let [byte] = i8::deserialize(deserializer)?.to_le_bytes();
            byte
        } else {
            u8::deserialize(deserializer)?
        };
        Ok(zerovec::ule::AsULE::from_unaligned(raw))
    }
}
287
288impl zerovec::ule::AsULE for BreakState {
289    type ULE = u8;
290
291    fn to_unaligned(self) -> Self::ULE {
292        match self {
293            BreakState::Break => 253,
294            BreakState::Keep => 255,
295            BreakState::NoMatch => 254,
296            BreakState::Intermediate(i) => i + 120,
297            BreakState::Index(i) => i,
298        }
299    }
300
301    fn from_unaligned(unaligned: Self::ULE) -> Self {
302        match unaligned {
303            253 => BreakState::Break,
304            255 => BreakState::Keep,
305            254 => BreakState::NoMatch,
306            i if (120..253).contains(&i) => BreakState::Intermediate(i - 120),
307            i => BreakState::Index(i),
308        }
309    }
310}
311
#[cfg(feature = "datagen")]
impl serde::Serialize for WordType {
    /// Writes the word type as its `u8` discriminant for human-readable formats.
    ///
    /// # Panics
    ///
    /// Panics for non-human-readable serializers; this impl is not expected to be reached
    /// there.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        if !serializer.is_human_readable() {
            unreachable!("only used as ULE")
        }
        let discriminant = *self as u8;
        discriminant.serialize(serializer)
    }
}
325
#[cfg(feature = "datagen")]
impl databake::Bake for WordType {
    /// Never expected to be called: always panics with "only used as ULE".
    ///
    /// NOTE(review): presumably this impl only exists to satisfy a `Bake` bound on containers
    /// of `WordType` — confirm against the derive on `RuleBreakData`.
    fn bake(&self, _crate_env: &databake::CrateEnv) -> databake::TokenStream {
        unreachable!("only used as ULE")
    }
}
332
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for WordType {
    /// Reads a `u8` from a human-readable format and maps `0`, `1`, `2` to
    /// `None`, `Number`, `Letter` respectively.
    ///
    /// # Errors
    ///
    /// Propagates the deserializer's error, or returns a custom "invalid value" error for any
    /// other number.
    ///
    /// # Panics
    ///
    /// Panics for non-human-readable deserializers; this impl is not expected to be reached
    /// there.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        use serde::de::Error;

        if !deserializer.is_human_readable() {
            unreachable!("only used as ULE")
        }
        match u8::deserialize(deserializer)? {
            0 => Ok(WordType::None),
            1 => Ok(WordType::Number),
            2 => Ok(WordType::Letter),
            _ => Err(D::Error::custom("invalid value")),
        }
    }
}