icu_collator/
provider.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5// The reordering algorithms in this file are adapted from ICU4C and,
6// therefore, are subject to the ICU license as described in LICENSE.
7
8//! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component.
9//!
10//! <div class="stab unstable">
11//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
12//! including in SemVer minor releases. While the serde representation of data structs is guaranteed
13//! to be stable, their Rust representation might not be. Use with caution.
14//! </div>
15//!
16//! Read more about data providers: [`icu_provider`]
17
18// Provider structs must be stable
19#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)]
20
21use icu_collections::char16trie::Char16TrieIterator;
22use icu_collections::codepointtrie::CodePointTrie;
23use icu_provider::prelude::*;
24use zerovec::ule::AsULE;
25use zerovec::ZeroVec;
26use zerovec::{zeroslice, ZeroSlice};
27
28use crate::elements::CollationElement;
29use crate::elements::CollationElement32;
30use crate::elements::Tag;
31use crate::elements::EMPTY_U16;
32use crate::elements::FFFD_CE;
33use crate::elements::FFFD_CE32;
34use crate::elements::FFFD_CE32_VALUE;
35use crate::elements::FFFD_CE_VALUE;
36use crate::elements::NO_CE_PRIMARY;
37use crate::preferences::CollationCaseFirst;
38
39use crate::options::MaxVariable;
40
/// Provider type for this component's baked data.
///
/// The type only exists when the `compiled_data` feature is enabled.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. In particular, the `DataProvider` implementations are only
/// guaranteed to match with this version's `*_unstable` providers. Use with caution.
/// </div>
#[cfg(feature = "compiled_data")]
#[derive(Debug)]
pub struct Baked;
51
// Anonymous const scope that wires `Baked` up to the macros exported by
// `icu_collator_data`. Nothing declared in here leaks into the module
// namespace; only the impls produced by the macros remain visible.
#[cfg(feature = "compiled_data")]
#[allow(unused_imports)]
const _: () = {
    // Brings `make_provider!` and the `impl_collation_*_v1!` macros into scope.
    use icu_collator_data::*;
    // Local `icu` facade with `collator`, `collections` and `locale` aliases.
    // NOTE(review): presumably the macro-generated code refers to paths such
    // as `icu::collator::...` — confirm against `icu_collator_data`.
    pub mod icu {
        pub use crate as collator;
        pub use icu_collections as collections;
        pub use icu_locale as locale;
    }
    make_provider!(Baked);
    // One macro invocation per data marker declared in this file.
    impl_collation_root_v1!(Baked);
    impl_collation_tailoring_v1!(Baked);
    impl_collation_diacritics_v1!(Baked);
    impl_collation_jamo_v1!(Baked);
    impl_collation_metadata_v1!(Baked);
    impl_collation_special_primaries_v1!(Baked);
    impl_collation_reordering_v1!(Baked);
};
70
71const SCRIPT_FALLBACK: icu_provider::fallback::LocaleFallbackConfig = {
72    let mut c = icu_provider::fallback::LocaleFallbackConfig::default();
73    c.priority = icu_provider::fallback::LocaleFallbackPriority::Script;
74    c
75};
76
// Data marker declarations for this component.
//
// Each invocation passes, in order: a doc comment, the marker type name, the
// marker path string, the data struct type, and then `key = value` options.
// Root, jamo and special-primaries data are declared with
// `is_singleton = true`; the remaining markers use `SCRIPT_FALLBACK` as their
// fallback configuration and, only when the `datagen` feature is enabled,
// declare `attributes_domain = "collator"`.
icu_provider::data_marker!(
    /// Data marker for singleton root collation data.
    CollationRootV1,
    "collation/root/v1",
    CollationData<'static>,
    is_singleton = true,
);
icu_provider::data_marker!(
    /// Data marker for collation tailorings.
    CollationTailoringV1,
    "collation/tailoring/v1",
    CollationData<'static>,
    fallback_config = SCRIPT_FALLBACK,
    #[cfg(feature = "datagen")]
    attributes_domain = "collator",
);
icu_provider::data_marker!(
    /// Data marker for collation diacritics data.
    CollationDiacriticsV1,
    "collation/diacritics/v1",
    CollationDiacritics<'static>,
    fallback_config = SCRIPT_FALLBACK,
    #[cfg(feature = "datagen")]
    attributes_domain = "collator",
);
icu_provider::data_marker!(
    /// Data marker for collation jamo data.
    CollationJamoV1,
    "collation/jamo/v1",
    CollationJamo<'static>,
    is_singleton = true,
);
icu_provider::data_marker!(
    /// Data marker for collation reordering data.
    CollationReorderingV1,
    "collation/reordering/v1",
    CollationReordering<'static>,
    fallback_config = SCRIPT_FALLBACK,
    #[cfg(feature = "datagen")]
    attributes_domain = "collator",
);
icu_provider::data_marker!(
    /// Data marker for collation metadata.
    CollationMetadataV1,
    "collation/metadata/v1",
    CollationMetadata,
    fallback_config = SCRIPT_FALLBACK,
    #[cfg(feature = "datagen")]
    attributes_domain = "collator",
);
icu_provider::data_marker!(
    /// Data marker for collation special primaries data.
    CollationSpecialPrimariesV1,
    "collation/special/primaries/v1",
    CollationSpecialPrimaries<'static>,
    is_singleton = true,
);
134
#[cfg(feature = "datagen")]
/// The latest minimum set of markers required by this component.
///
/// Lists the `INFO` constant of every data marker declared in this file.
/// Only available when the `datagen` feature is enabled.
pub const MARKERS: &[DataMarkerInfo] = &[
    CollationRootV1::INFO,
    CollationTailoringV1::INFO,
    CollationDiacriticsV1::INFO,
    CollationJamoV1::INFO,
    CollationMetadataV1::INFO,
    CollationReorderingV1::INFO,
    CollationSpecialPrimariesV1::INFO,
];
146
// One-element fallback slices returned by the GIGO (garbage in, garbage out)
// paths of `CollationData::get_ce32s` and `CollationData::get_ces`, so that
// those methods never hand out an empty slice for malformed data.

/// Single-element slice holding `FFFD_CE32_VALUE`.
const SINGLE_U32: &ZeroSlice<u32> =
    zeroslice!(u32; <u32 as AsULE>::ULE::from_unsigned; [FFFD_CE32_VALUE]);
/// Single-element slice holding `FFFD_CE_VALUE`.
const SINGLE_U64: &ZeroSlice<u64> =
    zeroslice!(u64; <u64 as AsULE>::ULE::from_unsigned; [FFFD_CE_VALUE]);
151
/// Computes the three-byte primary weight of `c` from an offset-range data CE.
///
/// This combines ICU4C's `Collation::getThreeBytePrimaryForOffsetData` and
/// `Collation::incThreeBytePrimaryByOffset`.
///
/// `data_ce` is laid out as follows:
///
/// * upper 32 bits: the base three-byte primary, `pppppp00`;
/// * lower 32 bits: `bbbbbbss`, i.e. the base code point in bits 8 and up,
///   the per-code-point step in bits 0..=6, and the "compressible" flag in
///   bit 7.
///
/// The distance of `c` from the base code point, multiplied by the step, is
/// added to the base primary byte by byte: the third byte has 254 usable
/// values starting at 2; the second byte has 254 values starting at 2, or 251
/// values starting at 4 when the primary is compressible.
///
/// The carry into the lead byte is added with wrapping arithmetic, matching
/// the unsigned arithmetic of ICU4C, so malformed data cannot trigger an
/// arithmetic-overflow panic in builds that enable overflow checks.
fn data_ce_to_primary(data_ce: u64, c: char) -> u32 {
    // Collation::getThreeBytePrimaryForOffsetData
    let p = (data_ce >> 32) as u32; // three-byte primary pppppp00
    let lower32 = data_ce as u32 as i32; // base code point b & step s: bbbbbbss (bit 7: isCompressible)
    // delta * increment; cannot overflow `i32`: |delta| < 2^24 and the step is at most 127.
    let mut offset = ((u32::from(c) as i32) - (lower32 >> 8)) * (lower32 & 0x7F);
    let is_compressible = (lower32 & 0x80) != 0;
    // Collation::incThreeBytePrimaryByOffset
    // Third byte: strip the minimum byte value, add the offset, reduce modulo
    // the number of usable values, and put the minimum back.
    offset += (((p >> 8) & 0xFF) as i32) - 2;
    let mut primary = (((offset % 254) + 2) as u32) << 8;
    offset /= 254;
    // Same with the second byte,
    // but reserve the PRIMARY_COMPRESSION_LOW_BYTE and high byte if necessary.
    if is_compressible {
        offset += (((p >> 16) & 0xFF) as i32) - 4;
        primary |= (((offset % 251) + 4) as u32) << 16;
        offset /= 251;
    } else {
        offset += (((p >> 16) & 0xFF) as i32) - 2;
        primary |= (((offset % 254) + 2) as u32) << 16;
        offset /= 254;
    }
    // Lead byte: well-formed data never carries out of it; wrap instead of
    // panicking when garbage data does.
    primary | (p & 0xFF00_0000).wrapping_add((offset as u32) << 24)
}
175
/// The main collation data either for the root or for a tailoring
///
/// The same struct is the payload of both `CollationRootV1` and
/// `CollationTailoringV1`.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_collator::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct CollationData<'data> {
    /// Mapping from `char` to `CollationElement32` (represented
    /// as its `u32` bits).
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub trie: CodePointTrie<'data, u32>,
    /// `CollationElement`s used in expansions and offset CE32s
    /// (represented as their `u64` bits)
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub ces: ZeroVec<'data, u64>,
    /// `CollationElement32`s used in expansions and as defaults
    /// for digits when the numeric mode is not in use
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub ce32s: ZeroVec<'data, u32>,
    /// Defaults and tries for prefix and contraction matching
    ///
    /// As read by the accessors in this file, an entry starting at a given
    /// index consists of two `u16` units holding a default
    /// `CollationElement32` (high half first, then low half), followed by
    /// the units handed to `Char16TrieIterator`.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub contexts: ZeroVec<'data, u16>,
}

// NOTE(review): the trailing `cfg` attribute presumably gates datagen-only
// impls generated by `data_struct!` — confirm against the macro documentation.
icu_provider::data_struct!(
    CollationData<'_>,
    #[cfg(feature = "datagen")]
);
209
210impl<'data> CollationData<'data> {
211    pub(crate) fn ce32_for_char(&self, c: char) -> CollationElement32 {
212        CollationElement32::new(self.trie.get32(c as u32))
213    }
214    pub(crate) fn get_ce32(&'data self, index: usize) -> CollationElement32 {
215        CollationElement32::new(if let Some(u) = self.ce32s.get(index) {
216            u
217        } else {
218            // GIGO case
219            debug_assert!(false);
220            FFFD_CE32_VALUE
221        })
222    }
223    pub(crate) fn get_ce32s(&'data self, index: usize, len: usize) -> &'data ZeroSlice<u32> {
224        if len > 0 {
225            if let Some(slice) = self.ce32s.get_subslice(index..index + len) {
226                return slice;
227            }
228        }
229        // GIGO case
230        debug_assert!(false);
231        SINGLE_U32
232    }
233    pub(crate) fn get_ces(&'data self, index: usize, len: usize) -> &'data ZeroSlice<u64> {
234        if len > 0 {
235            if let Some(slice) = self.ces.get_subslice(index..index + len) {
236                return slice;
237            }
238        }
239        // GIGO case
240        debug_assert!(false);
241        SINGLE_U64
242    }
243    fn get_default_and_trie_impl(
244        &'data self,
245        index: usize,
246    ) -> (CollationElement32, &'data ZeroSlice<u16>) {
247        if let Some(slice) = self.contexts.get_subslice(index..self.contexts.len()) {
248            #[allow(clippy::unwrap_used)]
249            if slice.len() >= 2 {
250                // `unwrap` must succeed due to the length check above.
251                let first = slice.get(0).unwrap();
252                let second = slice.get(1).unwrap();
253                let trie = slice.get_subslice(2..slice.len()).unwrap();
254                return (
255                    CollationElement32::new((u32::from(first) << 16) | u32::from(second)),
256                    trie,
257                );
258            }
259        }
260        // GIGO case
261        debug_assert!(false);
262        (FFFD_CE32, EMPTY_U16)
263    }
264    pub(crate) fn get_default_and_trie(
265        &'data self,
266        index: usize,
267    ) -> (CollationElement32, Char16TrieIterator<'data>) {
268        let (ce32, trie) = self.get_default_and_trie_impl(index);
269        (ce32, Char16TrieIterator::new(trie))
270    }
271    pub(crate) fn get_default(&'data self, index: usize) -> CollationElement32 {
272        let (ce32, _) = self.get_default_and_trie_impl(index);
273        ce32
274    }
275    pub(crate) fn ce_from_offset_ce32(
276        &self,
277        c: char,
278        ce32: CollationElement32,
279    ) -> CollationElement {
280        debug_assert!(ce32.tag() == Tag::Offset);
281        if let Some(data_ce) = self.ces.get(ce32.index()) {
282            CollationElement::new_from_primary(data_ce_to_primary(data_ce, c))
283        } else {
284            // GIGO case
285            debug_assert!(false);
286            FFFD_CE
287        }
288    }
289}
290
/// Secondary weights for the start of the Combining Diacritics block.
///
/// This is the payload of `CollationDiacriticsV1`.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_collator::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct CollationDiacritics<'data> {
    /// Secondary weights for characters starting from U+0300 up
    /// to but not including U+034F. May be shorter than that;
    /// zero-length when a tailoring opts out of using this
    /// feature altogether.
    // NOTE(review): presumably element `i` belongs to U+0300 + `i` — confirm
    // against the code that consumes this table.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub secondaries: ZeroVec<'data, u16>,
}

// NOTE(review): the trailing `cfg` attribute presumably gates datagen-only
// impls generated by `data_struct!` — confirm against the macro documentation.
icu_provider::data_struct!(
    CollationDiacritics<'_>,
    #[cfg(feature = "datagen")]
);
315
/// `CollationElement32`s for the Hangul Jamo Unicode Block
///
/// This is the payload of the singleton marker `CollationJamoV1`.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_collator::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct CollationJamo<'data> {
    /// `CollationElement32`s (as `u32`s) for the Hangul Jamo Unicode Block.
    /// The length must be equal to the size of the block (256).
    // NOTE(review): nothing in this file enforces the length requirement;
    // presumably it is validated where the data is loaded — confirm.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub ce32s: ZeroVec<'data, u32>,
}

// NOTE(review): the trailing `cfg` attribute presumably gates datagen-only
// impls generated by `data_struct!` — confirm against the macro documentation.
icu_provider::data_struct!(
    CollationJamo<'_>,
    #[cfg(feature = "datagen")]
);
338
/// Script reordering data
///
/// This is the payload of `CollationReorderingV1`. The field documentation
/// is quoted from ICU4C, so identifiers such as `reorderTable`,
/// `reorderRanges` and `minHighNoReorder` refer to ICU4C names.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_collator::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct CollationReordering<'data> {
    /// Limit of last reordered range. 0 if no reordering or no split bytes.
    ///
    /// Comment from ICU4C's `collationsettings.h`
    // `reorder_ex` returns primaries at or above this limit unchanged.
    pub min_high_no_reorder: u32,
    /// 256-byte table for reordering permutation of primary lead
    /// bytes; NULL if no reordering. A 0 entry at a non-zero index means
    /// that the primary lead byte is "split" (there are different offsets
    /// for primaries that share that lead byte) and the reordering offset
    /// must be determined via the reorderRanges.
    ///
    /// Comment from ICU4C's `collationsettings.h`
    // `reorder` indexes this table with the lead byte (`primary >> 24`) and
    // treats a missing entry as a data error.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub reorder_table: ZeroVec<'data, u8>, // len always 256
    /// Primary-weight ranges for script reordering, to be used by
    /// reorder(p) for split-reordered primary lead bytes.
    ///
    /// Each entry is a (limit, offset) pair. The upper 16 bits of the
    /// entry are the upper 16 bits of the exclusive primary limit of
    /// a range. Primaries between the previous limit and this one have
    /// their lead bytes modified by the signed offset (-0xff..+0xff)
    /// stored in the lower 16 bits.
    ///
    /// CollationData::makeReorderRanges() writes a full list where the
    /// first range (at least for terminators and separators) has a 0
    /// offset. The last range has a non-zero offset. minHighNoReorder
    /// is set to the limit of that last range.
    ///
    /// In the settings object, the initial ranges before the first
    /// split lead byte are omitted for efficiency; they are handled
    /// by reorder(p) via the reorderTable. If there are no
    /// split-reordered lead bytes, then no ranges are needed.
    ///
    /// Comment from ICU4C's `collationsettings.h`; names refer to
    /// ICU4C.
    // `reorder_ex` applies an entry `r` by adding `r << 24` to the primary
    // with wrapping arithmetic, so only the low 8 bits of the entry reach
    // the lead byte.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub reorder_ranges: ZeroVec<'data, u32>,
}

// NOTE(review): the trailing `cfg` attribute presumably gates datagen-only
// impls generated by `data_struct!` — confirm against the macro documentation.
icu_provider::data_struct!(
    CollationReordering<'_>,
    #[cfg(feature = "datagen")]
);
393
394impl CollationReordering<'_> {
395    pub(crate) fn reorder(&self, primary: u32) -> u32 {
396        if let Some(b) = self.reorder_table.get((primary >> 24) as usize) {
397            if b != 0 || primary <= NO_CE_PRIMARY {
398                (u32::from(b) << 24) | (primary & 0x00FFFFFF)
399            } else {
400                self.reorder_ex(primary)
401            }
402        } else {
403            // GIGO case
404            debug_assert!(false);
405            primary
406        }
407    }
408
409    fn reorder_ex(&self, primary: u32) -> u32 {
410        if primary >= self.min_high_no_reorder {
411            return primary;
412        }
413        let q = primary | 0xFFFF;
414        for &range in self.reorder_ranges.as_ule_slice().iter() {
415            let r = u32::from_unaligned(range);
416            if q < r {
417                return primary.wrapping_add(r << 24);
418            }
419        }
420        // GIGO case
421        debug_assert!(false);
422        primary
423    }
424}
425
/// Each non-alias collation that the data provider knows
/// about explicitly has a data entry at least for this
/// struct.
///
/// This is the payload of `CollationMetadataV1`.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, PartialEq, Clone, Copy, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_collator::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct CollationMetadata {
    /// See the mask constants in the `impl` block for the
    /// bit layout. The other bits are ignored: They could
    /// be from the future if their semantics are such that
    /// old code may ignore them.
    ///
    /// Note: At present, it's bogus for the bit for "upper
    /// first" to be set if "case first" isn't also set.
    /// However, the methods handle this case gracefully,
    /// so there is no need for invariant validation.
    pub bits: u32,
}

// NOTE(review): the trailing `cfg` attribute presumably gates datagen-only
// impls generated by `data_struct!` — confirm against the macro documentation.
icu_provider::data_struct!(
    CollationMetadata,
    #[cfg(feature = "datagen")]
);
456
457impl CollationMetadata {
458    const MAX_VARIABLE_MASK: u32 = 0b11;
459    const TAILORED_MASK: u32 = 1 << 3;
460    const TAILORED_DIACRITICS_MASK: u32 = 1 << 4;
461    const REORDERING_MASK: u32 = 1 << 5;
462    const LITHUANIAN_DOT_ABOVE_MASK: u32 = 1 << 6;
463    const BACWARD_SECOND_LEVEL_MASK: u32 = 1 << 7;
464    const ALTERNATE_SHIFTED_MASK: u32 = 1 << 8;
465    const CASE_FIRST_MASK: u32 = 1 << 9;
466    const UPPER_FIRST_MASK: u32 = 1 << 10;
467
468    #[inline(always)]
469    pub(crate) fn max_variable(self) -> MaxVariable {
470        // Safety: the possible numeric values for `MaxVariable` are from 0 to 3, inclusive,
471        // and it is repr(u8). MAX_VARIABLE_MASK here ensures our values have most 2 bits, which produces
472        // the same range.
473        unsafe { core::mem::transmute((self.bits & CollationMetadata::MAX_VARIABLE_MASK) as u8) }
474    }
475
476    #[inline(always)]
477    pub(crate) fn tailored(self) -> bool {
478        self.bits & CollationMetadata::TAILORED_MASK != 0
479    }
480
481    /// Vietnamese and Ewe
482    #[inline(always)]
483    pub(crate) fn tailored_diacritics(self) -> bool {
484        self.bits & CollationMetadata::TAILORED_DIACRITICS_MASK != 0
485    }
486
487    /// Lithuanian
488    #[inline(always)]
489    pub(crate) fn lithuanian_dot_above(self) -> bool {
490        self.bits & CollationMetadata::LITHUANIAN_DOT_ABOVE_MASK != 0
491    }
492
493    /// Canadian French
494    #[inline(always)]
495    pub(crate) fn backward_second_level(self) -> bool {
496        self.bits & CollationMetadata::BACWARD_SECOND_LEVEL_MASK != 0
497    }
498
499    #[inline(always)]
500    pub(crate) fn reordering(self) -> bool {
501        self.bits & CollationMetadata::REORDERING_MASK != 0
502    }
503
504    /// Thai
505    #[inline(always)]
506    pub(crate) fn alternate_shifted(self) -> bool {
507        self.bits & CollationMetadata::ALTERNATE_SHIFTED_MASK != 0
508    }
509
510    #[inline(always)]
511    pub(crate) fn case_first(self) -> CollationCaseFirst {
512        if self.bits & CollationMetadata::CASE_FIRST_MASK != 0 {
513            if self.bits & CollationMetadata::UPPER_FIRST_MASK != 0 {
514                CollationCaseFirst::Upper
515            } else {
516                CollationCaseFirst::Lower
517            }
518        } else {
519            CollationCaseFirst::False
520        }
521    }
522}
523
/// Special primaries associated with the root collation
///
/// This is the payload of the singleton marker `CollationSpecialPrimariesV1`.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_collator::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct CollationSpecialPrimaries<'data> {
    /// The primaries corresponding to `MaxVariable`
    /// character classes packed so that each fits in
    /// 16 bits. Length must match the number of enum
    /// variants in `MaxVariable`, currently 4.
    // `last_primary_for_group` indexes this vector with `max_variable as usize`,
    // shifts the entry into the high 16 bits and subtracts one.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub last_primaries: ZeroVec<'data, u16>,
    /// The high 8 bits of the numeric primary
    pub numeric_primary: u8,
}

// NOTE(review): the trailing `cfg` attribute presumably gates datagen-only
// impls generated by `data_struct!` — confirm against the macro documentation.
icu_provider::data_struct!(
    CollationSpecialPrimaries<'_>,
    #[cfg(feature = "datagen")]
);
550
551impl CollationSpecialPrimaries<'_> {
552    #[allow(clippy::unwrap_used)]
553    pub(crate) fn last_primary_for_group(&self, max_variable: MaxVariable) -> u32 {
554        // `unwrap` is OK, because `Collator::try_new` validates the length.
555        //
556        // Minus one to generate the right lower 16 bits from the high 16 bits.
557        // See parse.cpp in genrb and getLastPrimaryForGroup in ICU4C.
558        (u32::from(self.last_primaries.get(max_variable as usize).unwrap()) << 16) - 1
559    }
560}