icu_locale/
provider.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5// Provider structs must be stable
6#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)]
7
8//! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component.
9//!
10//! <div class="stab unstable">
11//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
12//! including in SemVer minor releases. While the serde representation of data structs is guaranteed
13//! to be stable, their Rust representation might not be. Use with caution.
14//! </div>
15//!
16//! Read more about data providers: [`icu_provider`]
17
18#[cfg(feature = "compiled_data")]
19#[derive(Debug)]
20/// Baked data
21///
/// A unit struct that only exists when the `compiled_data` feature is enabled. The baked
/// data implementations for this component's markers are attached to it by the `impl_*!`
/// macro invocations that follow in this file.
///
22/// <div class="stab unstable">
23/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
24/// including in SemVer minor releases. In particular, the `DataProvider` implementations are only
25/// guaranteed to match with this version's `*_unstable` providers. Use with caution.
26/// </div>
27pub struct Baked;
28
29#[cfg(feature = "compiled_data")]
30#[allow(unused_imports)]
// The anonymous `const _` gives the glob import and the helper `icu` module a private scope,
// so neither name leaks into the rest of this module.
31const _: () = {
32    use icu_locale_data::*;
    // Re-exports this crate as `icu::locale` and `icu_collections` as `icu::collections`.
    // NOTE(review): presumably these are the paths the baked-data macros refer to; confirm
    // against the macros exported by `icu_locale_data`.
33    pub mod icu {
34        pub use crate as locale;
35        pub use icu_collections as collections;
36    }
37    make_provider!(Baked);
    // Aliases, likely subtags, parents and script direction.
38    impl_locale_aliases_v1!(Baked);
39    impl_locale_likely_subtags_extended_v1!(Baked);
40    impl_locale_likely_subtags_language_v1!(Baked);
41    impl_locale_likely_subtags_script_region_v1!(Baked);
42    impl_locale_parents_v1!(Baked);
43    impl_locale_script_direction_v1!(Baked);
44
    // Exemplar characters, one macro per exemplar category.
45    impl_locale_exemplar_characters_auxiliary_v1!(Baked);
46    impl_locale_exemplar_characters_index_v1!(Baked);
47    impl_locale_exemplar_characters_main_v1!(Baked);
48    impl_locale_exemplar_characters_numbers_v1!(Baked);
49    impl_locale_exemplar_characters_punctuation_v1!(Baked);
50};
51
// Every marker in this group is declared with `is_singleton = true`.
// NOTE(review): presumably this means a single payload that is not keyed by locale; confirm
// against the `icu_provider::data_marker!` documentation.
52icu_provider::data_marker!(
53    /// Marker for locale alias data ([`Aliases`]).
54    LocaleAliasesV1,
55    "locale/aliases/v1",
56    Aliases<'static>,
57    is_singleton = true
58);
59icu_provider::data_marker!(
60    /// Marker for data for likely subtags for languages ([`LikelySubtagsForLanguage`]).
61    LocaleLikelySubtagsLanguageV1,
62    "locale/likely/subtags/language/v1",
63    LikelySubtagsForLanguage<'static>,
64    is_singleton = true
65);
66icu_provider::data_marker!(
67    /// Marker for data for likely subtags for scripts and regions
    /// ([`LikelySubtagsForScriptRegion`]).
68    LocaleLikelySubtagsScriptRegionV1,
69    "locale/likely/subtags/script/region/v1",
70    LikelySubtagsForScriptRegion<'static>,
71    is_singleton = true
72);
73icu_provider::data_marker!(
74    /// Marker for extended data for likely subtags ([`LikelySubtagsExtended`]).
75    LocaleLikelySubtagsExtendedV1,
76    "locale/likely/subtags/extended/v1",
77    LikelySubtagsExtended<'static>,
78    is_singleton = true
79);
80icu_provider::data_marker!(
81    /// Marker for locale fallback parents data ([`Parents`]).
82    LocaleParentsV1,
83    "locale/parents/v1",
84    Parents<'static>,
85    is_singleton = true
86);
87
88icu_provider::data_marker!(
89    /// Marker for script direction data ([`ScriptDirection`]).
90    LocaleScriptDirectionV1,
91    "locale/script/direction/v1",
92    ScriptDirection<'static>,
93    is_singleton = true
94);
95
// The five exemplar-character markers below all carry the same data struct,
// `ExemplarCharactersData`, and differ only in marker name and path. Unlike the markers
// declared earlier in this file, none of them sets `is_singleton`.
96icu_provider::data_marker!(
97    /// Marker for auxiliary exemplar characters data.
98    LocaleExemplarCharactersAuxiliaryV1,
99    "locale/exemplar/characters/auxiliary/v1",
100    ExemplarCharactersData<'static>,
101);
102icu_provider::data_marker!(
103    /// Marker for index exemplar characters data.
104    LocaleExemplarCharactersIndexV1,
105    "locale/exemplar/characters/index/v1",
106    ExemplarCharactersData<'static>,
107);
108icu_provider::data_marker!(
109    /// Marker for main exemplar characters data.
110    LocaleExemplarCharactersMainV1,
111    "locale/exemplar/characters/main/v1",
112    ExemplarCharactersData<'static>,
113);
114icu_provider::data_marker!(
115    /// Marker for numbers exemplar characters data.
116    LocaleExemplarCharactersNumbersV1,
117    "locale/exemplar/characters/numbers/v1",
118    ExemplarCharactersData<'static>,
119);
120icu_provider::data_marker!(
121    /// Marker for punctuation exemplar characters data.
122    LocaleExemplarCharactersPunctuationV1,
123    "locale/exemplar/characters/punctuation/v1",
124    ExemplarCharactersData<'static>,
125);
126
127#[cfg(feature = "datagen")]
128/// The latest minimum set of markers required by this component.
///
/// Only compiled with the `datagen` feature. Holds one `INFO` entry for each marker declared
/// with `data_marker!` in this file, listed in alphabetical order of the marker names.
129pub const MARKERS: &[DataMarkerInfo] = &[
130    LocaleAliasesV1::INFO,
131    LocaleExemplarCharactersAuxiliaryV1::INFO,
132    LocaleExemplarCharactersIndexV1::INFO,
133    LocaleExemplarCharactersMainV1::INFO,
134    LocaleExemplarCharactersNumbersV1::INFO,
135    LocaleExemplarCharactersPunctuationV1::INFO,
136    LocaleLikelySubtagsExtendedV1::INFO,
137    LocaleLikelySubtagsLanguageV1::INFO,
138    LocaleLikelySubtagsScriptRegionV1::INFO,
139    LocaleParentsV1::INFO,
140    LocaleScriptDirectionV1::INFO,
141];
142
143use alloc::borrow::Cow;
144use icu_collections::codepointinvliststringlist::CodePointInversionListAndStringList;
145use icu_locale_core::subtags::{Language, Region, Script, Variant};
146use icu_provider::prelude::*;
147use potential_utf::PotentialUtf8;
148use tinystr::{TinyAsciiStr, UnvalidatedTinyAsciiStr};
149use zerovec::{VarZeroVec, ZeroMap, ZeroSlice, ZeroVec};
150
151// We use raw TinyAsciiStrs for map keys, as we then don't have to
152// validate them as subtags on deserialization. Map lookup can be
153// done even if they are not valid tags (an invalid key will just
154// become inaccessible).
155type UnvalidatedLanguage = UnvalidatedTinyAsciiStr<3>;
156type UnvalidatedScript = UnvalidatedTinyAsciiStr<4>;
157type UnvalidatedRegion = UnvalidatedTinyAsciiStr<3>;
158type UnvalidatedVariant = UnvalidatedTinyAsciiStr<8>;
159type UnvalidatedSubdivision = UnvalidatedTinyAsciiStr<7>;
// Unlike the key types above, this is a plain `TinyAsciiStr`. It is the value type of the
// subdivision alias map in `Aliases`.
160type SemivalidatedSubdivision = TinyAsciiStr<7>;
161
162// LanguageIdentifier doesn't have an AsULE implementation, so we have
163// to store strs and parse when needed.
164type UnvalidatedLanguageIdentifier = str;
// The two aliases below name the VarULE types generated for `StrStrPair` and
// `LanguageStrStrPair` by their `zerovec::make_varule` attributes.
165type UnvalidatedLanguageIdentifierPair = StrStrPairVarULE;
166type UnvalidatedLanguageVariantsPair = LanguageStrStrPairVarULE;
167
168#[zerovec::make_varule(StrStrPairVarULE)]
169#[zerovec::derive(Debug)]
170#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug)]
171#[cfg_attr(
172    feature = "serde",
173    derive(serde::Deserialize),
174    zerovec::derive(Deserialize)
175)]
176#[cfg_attr(
177    feature = "datagen",
178    derive(serde::Serialize, databake::Bake),
179    zerovec::derive(Serialize),
180    databake(path = icu_locale::provider),
181)]
182/// A pair of strings with an `EncodeAsVarULE` implementation.
183///
/// The `zerovec::make_varule` attribute on this struct names the generated
/// variable-length ULE type `StrStrPairVarULE`.
///
184/// <div class="stab unstable">
185/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
186/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
187/// to be stable, their Rust representation might not be. Use with caution.
188/// </div>
189pub struct StrStrPair<'a>(
190    #[cfg_attr(feature = "serde", serde(borrow))] pub Cow<'a, str>,
191    #[cfg_attr(feature = "serde", serde(borrow))] pub Cow<'a, str>,
192);
193
194#[zerovec::make_varule(LanguageStrStrPairVarULE)]
195#[zerovec::derive(Debug)]
196#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug)]
197#[cfg_attr(
198    feature = "serde",
199    derive(serde::Deserialize),
200    zerovec::derive(Deserialize)
201)]
202#[cfg_attr(
203    feature = "datagen",
204    derive(serde::Serialize, databake::Bake),
205    zerovec::derive(Serialize),
206    databake(path = icu_locale::provider),
207)]
208/// A [`Language`] followed by a pair of strings, with an `EncodeAsVarULE` implementation.
///
/// The `zerovec::make_varule` attribute on this struct names the generated
/// variable-length ULE type `LanguageStrStrPairVarULE`, which is the element type of
/// `Aliases::language_variants`.
209pub struct LanguageStrStrPair<'a>(
210    pub Language,
211    #[cfg_attr(feature = "serde", serde(borrow))] pub Cow<'a, str>,
212    #[cfg_attr(feature = "serde", serde(borrow))] pub Cow<'a, str>,
213);
214
215#[derive(PartialEq, Clone, Default, yoke::Yokeable, zerofrom::ZeroFrom)]
216#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
217#[cfg_attr(feature = "datagen", databake(path = icu_locale::provider))]
218#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
219#[yoke(prove_covariance_manually)]
220/// This alias data is used for locale canonicalization.
221///
222/// Each field defines a
223/// mapping from an old identifier to a new identifier, based upon the rules
224/// from <http://unicode.org/reports/tr35/#LocaleId_Canonicalization>. The data
225/// is stored in sorted order, allowing for binary search to identify rules to
226/// apply. It is broken down into smaller vectors based upon some characteristic
227/// of the data, to help avoid unnecessary searches. For example, the `sgn_region`
228/// field contains aliases for sign language and region, so that it is not
229/// necessary to search the data unless the input is a sign language.
230///
231/// The algorithm in tr35 is not guaranteed to terminate on data other than what
232/// is currently in CLDR. For this reason, it is not a good idea to attempt to add
233/// or modify aliases for use in this structure.
234///
235/// <div class="stab unstable">
236/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
237/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
238/// to be stable, their Rust representation might not be. Use with caution.
239/// </div>
240// TODO: Use validated types as value types
241// Notice: the layout used here improves the alignment of `language_variants`, speeding up
242// canonicalization by up to 40%. See https://github.com/unicode-org/icu4x/pull/2935 for details.
// NOTE(review): the original note had lost its subject ("Notice:  improves ..."); confirm in
// the linked PR exactly which change is meant.
243#[derive(Debug)]
244pub struct Aliases<'data> {
245    /// `[language, variant(-variant)*] -> [langid]`
246    /// This is not a map as it's searched linearly according to the canonicalization rules.
247    #[cfg_attr(feature = "serde", serde(borrow))]
248    pub language_variants: VarZeroVec<'data, UnvalidatedLanguageVariantsPair>,
249    /// `sgn-[region] -> [language]`
250    #[cfg_attr(feature = "serde", serde(borrow))]
251    pub sgn_region: ZeroMap<'data, UnvalidatedRegion, Language>,
252    /// `[language{2}] -> [langid]`
253    #[cfg_attr(feature = "serde", serde(borrow))]
254    pub language_len2: ZeroMap<'data, UnvalidatedTinyAsciiStr<2>, UnvalidatedLanguageIdentifier>,
255    /// `[language{3}] -> [langid]`
256    #[cfg_attr(feature = "serde", serde(borrow))]
257    pub language_len3: ZeroMap<'data, UnvalidatedLanguage, UnvalidatedLanguageIdentifier>,
258    /// `[langid] -> [langid]`
259    /// This is not a map as it's searched linearly according to the canonicalization rules.
260    #[cfg_attr(feature = "serde", serde(borrow))]
261    pub language: VarZeroVec<'data, UnvalidatedLanguageIdentifierPair>,
262
263    /// `[script] -> [script]`
264    #[cfg_attr(feature = "serde", serde(borrow))]
265    pub script: ZeroMap<'data, UnvalidatedScript, Script>,
266
267    /// `[region{2}] -> [region]`
268    #[cfg_attr(feature = "serde", serde(borrow))]
269    pub region_alpha: ZeroMap<'data, UnvalidatedTinyAsciiStr<2>, Region>,
270    /// `[region{3}] -> [region]`
271    #[cfg_attr(feature = "serde", serde(borrow))]
272    pub region_num: ZeroMap<'data, UnvalidatedRegion, Region>,
273
274    /// `[region] -> [region]+`
275    #[cfg_attr(feature = "serde", serde(borrow))]
276    pub complex_region: ZeroMap<'data, UnvalidatedRegion, ZeroSlice<Region>>,
277
278    /// `[variant] -> [variant]`
279    #[cfg_attr(feature = "serde", serde(borrow))]
280    pub variant: ZeroMap<'data, UnvalidatedVariant, Variant>,
281
282    /// `[value{7}] -> [value{7}]`
    /// Subdivision aliases: both key and value are ASCII strings of at most 7 bytes.
283    #[cfg_attr(feature = "serde", serde(borrow))]
284    pub subdivision: ZeroMap<'data, UnvalidatedSubdivision, SemivalidatedSubdivision>,
285}
286
// Declares `Aliases` as a data struct. The `#[cfg(feature = "datagen")]` attribute is passed to
// the macro as an argument (presumably to gate datagen-only impls; confirm in the
// `icu_provider::data_struct!` documentation).
287icu_provider::data_struct!(
288    Aliases<'_>,
289    #[cfg(feature = "datagen")]
290);
291
292#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
293#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
294#[cfg_attr(feature = "datagen", databake(path = icu_locale::provider))]
295#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
296/// This likely subtags data is used for the minimize and maximize operations.
297///
298/// Each field defines a mapping from an old identifier to a new identifier,
299/// based upon the rules in
300/// <https://www.unicode.org/reports/tr35/#Likely_Subtags>.
301///
302/// The data is broken down into smaller vectors based upon the rules
303/// defined for the likely subtags maximize algorithm.
304///
305/// For efficiency, only the relevant part of the LanguageIdentifier is stored
306/// for searching and replacing. E.g., the `language_script` field is used to store
307/// rules for `LanguageIdentifier`s that contain a language and a script, but not a
308/// region.
309///
310/// This struct contains mappings when the input contains a language subtag.
311/// Also see [`LikelySubtagsForScriptRegion`].
312///
313/// <div class="stab unstable">
314/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
315/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
316/// to be stable, their Rust representation might not be. Use with caution.
317/// </div>
318#[yoke(prove_covariance_manually)]
319pub struct LikelySubtagsForLanguage<'data> {
320    /// Language and script: maps a (language, script) key to a region.
321    #[cfg_attr(feature = "serde", serde(borrow))]
322    pub language_script: ZeroMap<'data, (UnvalidatedLanguage, UnvalidatedScript), Region>,
323    /// Language and region: maps a (language, region) key to a script.
324    #[cfg_attr(feature = "serde", serde(borrow))]
325    pub language_region: ZeroMap<'data, (UnvalidatedLanguage, UnvalidatedRegion), Script>,
326    /// Just language: maps a language key to a (script, region) pair.
327    #[cfg_attr(feature = "serde", serde(borrow))]
328    pub language: ZeroMap<'data, UnvalidatedLanguage, (Script, Region)>,
329    /// Undefined: a full (language, script, region) triple.
    /// NOTE(review): presumably the result used for the `und` language; confirm against the
    /// code that consumes this field.
330    pub und: (Language, Script, Region),
331}
332
// Declares `LikelySubtagsForLanguage` as a data struct. The `#[cfg(feature = "datagen")]`
// attribute is passed to the macro as an argument (presumably to gate datagen-only impls;
// confirm in the `icu_provider::data_struct!` documentation).
333icu_provider::data_struct!(
334    LikelySubtagsForLanguage<'_>,
335    #[cfg(feature = "datagen")]
336);
337
338#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
339#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
340#[cfg_attr(feature = "datagen", databake(path = icu_locale::provider))]
341#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
342/// This likely subtags data is used for the minimize and maximize operations.
343///
344/// Each field defines a mapping from an old identifier to a new identifier,
345/// based upon the rules in
346/// <https://www.unicode.org/reports/tr35/#Likely_Subtags>.
347///
348/// The data is broken down into smaller vectors based upon the rules
349/// defined for the likely subtags maximize algorithm.
350///
351/// For efficiency, only the relevant part of the LanguageIdentifier is stored
352/// for searching and replacing. E.g., the `script_region` field is used to store
353/// rules for `LanguageIdentifier`s that contain a script and a region, but not a
354/// language.
355///
356/// This struct contains mappings when the input does not contain a language subtag.
357/// Also see [`LikelySubtagsForLanguage`].
358///
359/// <div class="stab unstable">
360/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
361/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
362/// to be stable, their Rust representation might not be. Use with caution.
363/// </div>
364#[yoke(prove_covariance_manually)]
365pub struct LikelySubtagsForScriptRegion<'data> {
366    /// Script and region: maps a (script, region) key to a language.
367    #[cfg_attr(feature = "serde", serde(borrow))]
368    pub script_region: ZeroMap<'data, (UnvalidatedScript, UnvalidatedRegion), Language>,
369    /// Just script: maps a script key to a (language, region) pair.
370    #[cfg_attr(feature = "serde", serde(borrow))]
371    pub script: ZeroMap<'data, UnvalidatedScript, (Language, Region)>,
372    /// Just region: maps a region key to a (language, script) pair.
373    #[cfg_attr(feature = "serde", serde(borrow))]
374    pub region: ZeroMap<'data, UnvalidatedRegion, (Language, Script)>,
375}
376
// Declares `LikelySubtagsForScriptRegion` as a data struct. The `#[cfg(feature = "datagen")]`
// attribute is passed to the macro as an argument (presumably to gate datagen-only impls;
// confirm in the `icu_provider::data_struct!` documentation).
377icu_provider::data_struct!(
378    LikelySubtagsForScriptRegion<'_>,
379    #[cfg(feature = "datagen")]
380);
381
382#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
383#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
384#[cfg_attr(feature = "datagen", databake(path = icu_locale::provider))]
385#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
386/// This likely subtags data is used for full coverage of locales, including ones that
387/// don't otherwise have data in the Common Locale Data Repository (CLDR).
388///
/// It has the same six map fields, with the same key and value types, as
/// [`LikelySubtagsForLanguage`] and [`LikelySubtagsForScriptRegion`] combined, but no
/// `und` entry.
///
389/// <div class="stab unstable">
390/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
391/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
392/// to be stable, their Rust representation might not be. Use with caution.
393/// </div>
394#[yoke(prove_covariance_manually)]
395pub struct LikelySubtagsExtended<'data> {
396    /// Language and script: maps a (language, script) key to a region.
397    #[cfg_attr(feature = "serde", serde(borrow))]
398    pub language_script: ZeroMap<'data, (UnvalidatedLanguage, UnvalidatedScript), Region>,
399    /// Language and region: maps a (language, region) key to a script.
400    #[cfg_attr(feature = "serde", serde(borrow))]
401    pub language_region: ZeroMap<'data, (UnvalidatedLanguage, UnvalidatedRegion), Script>,
402    /// Just language: maps a language key to a (script, region) pair.
403    #[cfg_attr(feature = "serde", serde(borrow))]
404    pub language: ZeroMap<'data, UnvalidatedLanguage, (Script, Region)>,
405    /// Script and region: maps a (script, region) key to a language.
406    #[cfg_attr(feature = "serde", serde(borrow))]
407    pub script_region: ZeroMap<'data, (UnvalidatedScript, UnvalidatedRegion), Language>,
408    /// Just script: maps a script key to a (language, region) pair.
409    #[cfg_attr(feature = "serde", serde(borrow))]
410    pub script: ZeroMap<'data, UnvalidatedScript, (Language, Region)>,
411    /// Just region: maps a region key to a (language, script) pair.
412    #[cfg_attr(feature = "serde", serde(borrow))]
413    pub region: ZeroMap<'data, UnvalidatedRegion, (Language, Script)>,
414}
415
// Declares `LikelySubtagsExtended` as a data struct. The `#[cfg(feature = "datagen")]`
// attribute is passed to the macro as an argument (presumably to gate datagen-only impls;
// confirm in the `icu_provider::data_struct!` documentation).
416icu_provider::data_struct!(
417    LikelySubtagsExtended<'_>,
418    #[cfg(feature = "datagen")]
419);
420
421/// Locale fallback rules derived from CLDR parent locales data.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
422#[derive(Default, Clone, PartialEq, Debug, yoke::Yokeable, zerofrom::ZeroFrom)]
423#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
424#[cfg_attr(feature = "datagen", databake(path = icu_locale::provider))]
425#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
426#[yoke(prove_covariance_manually)]
427pub struct Parents<'data> {
428    /// Map from language identifier to language identifier, indicating that the language on the
429    /// left should inherit from the language on the right.
    ///
    /// The key (left side) is stored as a `PotentialUtf8` string. The value (right side) is
    /// stored as a `(Language, Option<Script>, Option<Region>)` tuple.
430    #[cfg_attr(feature = "serde", serde(borrow))]
431    pub parents: ZeroMap<'data, PotentialUtf8, (Language, Option<Script>, Option<Region>)>,
432}
433
// Declares `Parents` as a data struct. The `#[cfg(feature = "datagen")]` attribute is passed to
// the macro as an argument (presumably to gate datagen-only impls; confirm in the
// `icu_provider::data_struct!` documentation).
434icu_provider::data_struct!(
435    Parents<'_>,
436    #[cfg(feature = "datagen")]
437);
438
439#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
440#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
441#[cfg_attr(feature = "datagen", databake(path = icu_locale::provider))]
442#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
443/// This directionality data is used to determine the script directionality of a locale.
444///
/// Both lists hold scripts as `UnvalidatedTinyAsciiStr<4>` values, i.e. entries are not
/// validated as script subtags when the data is deserialized.
///
445/// <div class="stab unstable">
446/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
447/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
448/// to be stable, their Rust representation might not be. Use with caution.
449/// </div>
450#[yoke(prove_covariance_manually)]
451pub struct ScriptDirection<'data> {
452    /// Scripts in right-to-left direction.
    // NOTE(review): whether consumers rely on this list being sorted is not visible here.
453    #[cfg_attr(feature = "serde", serde(borrow))]
454    pub rtl: ZeroVec<'data, UnvalidatedScript>,
455    /// Scripts in left-to-right direction.
456    #[cfg_attr(feature = "serde", serde(borrow))]
457    pub ltr: ZeroVec<'data, UnvalidatedScript>,
458}
459
// Declares `ScriptDirection` as a data struct. The `#[cfg(feature = "datagen")]` attribute is
// passed to the macro as an argument (presumably to gate datagen-only impls; confirm in the
// `icu_provider::data_struct!` documentation).
460icu_provider::data_struct!(
461    ScriptDirection<'_>,
462    #[cfg(feature = "datagen")]
463);
464
465/// A set of characters and strings which share a particular property value.
466///
/// This is the data struct behind the five `LocaleExemplarCharacters*V1` markers declared in
/// this file. It is a newtype around a [`CodePointInversionListAndStringList`].
///
467/// <div class="stab unstable">
468/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
469/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
470/// to be stable, their Rust representation might not be. Use with caution.
471/// </div>
472#[derive(Debug, Eq, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
473#[cfg_attr(
474    feature = "datagen", 
475    derive(serde::Serialize, databake::Bake),
476    databake(path = icu_locale::provider),
477)]
478#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
479pub struct ExemplarCharactersData<'data>(
480    #[cfg_attr(feature = "serde", serde(borrow))] pub CodePointInversionListAndStringList<'data>,
481);
482
// Declares `ExemplarCharactersData` as a data struct. The `#[cfg(feature = "datagen")]`
// attribute is passed to the macro as an argument (presumably to gate datagen-only impls;
// confirm in the `icu_provider::data_struct!` documentation).
483icu_provider::data_struct!(
484    ExemplarCharactersData<'_>,
485    #[cfg(feature = "datagen")]
486);