icu_casemap/provider/
exception_helpers.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! This module contains various types for the header part of casemapping exception data
6//!
7//! This is both used in datagen to decode ICU4C's data, and natively in ICU4X's
8//! own data model.
9//!
10//! [`ExceptionBits`] is the bag of bits associated with exceptions, and [`SlotPresence`]
11//! marks the presence or absence of various "slots" in a given exception.
12//!
//! The `exceptions_builder` module of this crate handles decoding ICU4C data using the exception
//! header, and [`crate::provider::exceptions`] handles the exception data in ICU4X's own data
//! model.
15
16use crate::provider::data::{DotType, MappingKind};
17use zerovec::ule::{AsULE, ULE};
18
19/// A bunch of bits associated with each exception.
20///
21/// <div class="stab unstable">
22/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
23/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
24/// to be stable, their Rust representation might not be. Use with caution.
25/// </div>
26#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
27#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
28#[cfg_attr(feature = "datagen", derive(serde::Serialize))]
29pub struct ExceptionBits {
30    /// Whether or not the slots are double-width.
31    ///
32    /// Unused in ICU4X
33    pub double_width_slots: bool,
34    /// There is no simple casefolding, even if there is a simple lowercase mapping
35    pub no_simple_case_folding: bool,
36    /// The delta stored in the `Delta` slot is negative
37    pub negative_delta: bool,
38    /// If the character is case sensitive
39    pub is_sensitive: bool,
40    /// The dot type of the character
41    pub dot_type: DotType,
42    /// If the character has conditional special casing
43    pub has_conditional_special: bool,
44    /// If the character has conditional case folding
45    pub has_conditional_fold: bool,
46}
47
48impl ExceptionBits {
49    /// Extract from the upper half of an ICU4C-format u16
50    pub(crate) fn from_integer(int: u8) -> Self {
51        let ule = ExceptionBitsULE(int);
52        let double_width_slots = ule.double_width_slots();
53        let no_simple_case_folding = ule.no_simple_case_folding();
54        let negative_delta = ule.negative_delta();
55        let is_sensitive = ule.is_sensitive();
56        let has_conditional_special = ule.has_conditional_special();
57        let has_conditional_fold = ule.has_conditional_fold();
58        let dot_type = ule.dot_type();
59
60        Self {
61            double_width_slots,
62            no_simple_case_folding,
63            negative_delta,
64            is_sensitive,
65            dot_type,
66            has_conditional_special,
67            has_conditional_fold,
68        }
69    }
70
71    /// Convert to an ICU4C-format upper half of u16
72    pub(crate) fn to_integer(self) -> u8 {
73        let mut int = 0;
74        let dot_data = (self.dot_type as u8) << ExceptionBitsULE::DOT_SHIFT;
75        int |= dot_data;
76
77        if self.double_width_slots {
78            int |= ExceptionBitsULE::DOUBLE_SLOTS_FLAG
79        }
80        if self.no_simple_case_folding {
81            int |= ExceptionBitsULE::NO_SIMPLE_CASE_FOLDING_FLAG
82        }
83        if self.negative_delta {
84            int |= ExceptionBitsULE::NEGATIVE_DELTA_FLAG
85        }
86        if self.is_sensitive {
87            int |= ExceptionBitsULE::SENSITIVE_FLAG
88        }
89        if self.has_conditional_special {
90            int |= ExceptionBitsULE::CONDITIONAL_SPECIAL_FLAG
91        }
92        if self.has_conditional_fold {
93            int |= ExceptionBitsULE::CONDITIONAL_FOLD_FLAG
94        }
95        int
96    }
97}
98
99/// Packed slot presence marker
100///
101/// All bits are valid, though bit 4 is unused and reserved
102///
103/// Bits:
104///
105/// ```text
106///               0: Lowercase mapping (code point)
107///               1: Case folding (code point)
108///               2: Uppercase mapping (code point)
109///               3: Titlecase mapping (code point)
110///               4: Delta to simple case mapping (code point) (sign stored separately)
111///               5: RESERVED
112///               6: Closure mappings (string; see below)
113///               7: Full mappings (strings; see below)
114/// ```
115///
116/// <div class="stab unstable">
117/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
118/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
119/// to be stable, their Rust representation might not be. Use with caution.
120/// </div>
121#[derive(Copy, Clone, PartialEq, Eq, ULE, Debug, Default)]
122#[repr(transparent)]
123#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
124#[cfg_attr(feature = "datagen", derive(serde::Serialize))]
125pub struct SlotPresence(pub u8);
126
127impl SlotPresence {
128    pub(crate) fn add_slot(&mut self, slot: ExceptionSlot) {
129        self.0 |= 1 << slot as u8;
130    }
131    pub(crate) fn has_slot(self, slot: ExceptionSlot) -> bool {
132        let bit = 1 << (slot as u8);
133        self.0 & bit != 0
134    }
135}
136
137/// The bitflags on an exception header.
138///
139/// Format from icu4c, documented in casepropsbuilder.cpp, shifted 8 bits since ICU4C has this packed
140/// alongside a SlotPresence
141///
142/// ```text
143///            0  Double-width slots. If set, then each optional slot is stored as two
144///               elements of the array (high and low halves of 32-bit values) instead of
145///               a single element.
146///            1  Has no simple case folding, even if there is a simple lowercase mapping
147///           2  The value in the delta slot is negative
148///           3  Is case-sensitive (not exposed)
149///       4..5  Dot type
150///           6  Has conditional special casing
151///           7  Has conditional case folding
152/// ```
153///
154/// All bits are valid, though in ICU4X data bits 0 and 2 are not used
155///
156/// <div class="stab unstable">
157/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
158/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
159/// to be stable, their Rust representation might not be. Use with caution.
160/// </div>
161#[derive(Copy, Clone, PartialEq, Eq, ULE, Debug)]
162#[repr(transparent)]
163pub struct ExceptionBitsULE(pub u8);
164
165impl ExceptionBitsULE {
166    const DOUBLE_SLOTS_FLAG: u8 = 0x1;
167
168    const NO_SIMPLE_CASE_FOLDING_FLAG: u8 = 0x2;
169    const NEGATIVE_DELTA_FLAG: u8 = 0x4;
170    const SENSITIVE_FLAG: u8 = 0x8;
171
172    const DOT_SHIFT: u8 = 4;
173
174    const CONDITIONAL_SPECIAL_FLAG: u8 = 0x40;
175    const CONDITIONAL_FOLD_FLAG: u8 = 0x80;
176}
177
178impl ExceptionBitsULE {
179    /// Whether or not the slots are double-width.
180    ///
181    /// Unused in ICU4X
182    pub fn double_width_slots(self) -> bool {
183        self.0 & Self::DOUBLE_SLOTS_FLAG != 0
184    }
185
186    /// There is no simple casefolding, even if there is a simple lowercase mapping
187    pub fn no_simple_case_folding(self) -> bool {
188        self.0 & Self::NO_SIMPLE_CASE_FOLDING_FLAG != 0
189    }
190
191    /// The delta stored in the `Delta` slot is negative
192    pub fn negative_delta(self) -> bool {
193        self.0 & Self::NEGATIVE_DELTA_FLAG != 0
194    }
195
196    /// If the character is case sensitive
197    pub fn is_sensitive(self) -> bool {
198        self.0 & Self::SENSITIVE_FLAG != 0
199    }
200
201    /// If the character has conditional special casing
202    pub fn has_conditional_special(self) -> bool {
203        self.0 & Self::CONDITIONAL_SPECIAL_FLAG != 0
204    }
205
206    /// If the character has conditional case folding
207    pub fn has_conditional_fold(self) -> bool {
208        self.0 & Self::CONDITIONAL_FOLD_FLAG != 0
209    }
210
211    /// The dot type of the character
212    pub fn dot_type(self) -> DotType {
213        DotType::from_masked_bits((u16::from(self.0 >> Self::DOT_SHIFT)) & DotType::DOT_MASK)
214    }
215}
216
217impl AsULE for ExceptionBits {
218    type ULE = ExceptionBitsULE;
219    fn from_unaligned(u: ExceptionBitsULE) -> Self {
220        ExceptionBits::from_integer(u.0)
221    }
222
223    fn to_unaligned(self) -> ExceptionBitsULE {
224        ExceptionBitsULE(self.to_integer())
225    }
226}
227
228impl AsULE for SlotPresence {
229    type ULE = SlotPresence;
230    fn from_unaligned(u: Self) -> Self {
231        u
232    }
233
234    fn to_unaligned(self) -> Self {
235        self
236    }
237}
238
/// The kinds of slot that slot-based exception data may contain.
///
/// Each discriminant is the bit index used for that slot in `SlotPresence`.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
pub(crate) enum ExceptionSlot {
    /// Lowercase mapping (code point)
    Lower = 0,
    /// Case folding (code point)
    Fold = 1,
    /// Uppercase mapping (code point)
    Upper = 2,
    /// Titlecase mapping (code point)
    Title = 3,
    /// Delta to the simple case mapping; its sign is stored separately
    Delta = 4,
    // Discriminant 5 is reserved and has no variant.
    /// Closure mappings
    Closure = 6,
    /// The four full mappings
    FullMappings = 7,
}
264
265impl ExceptionSlot {
266    /// Where the string slots begin
267    pub(crate) const STRING_SLOTS_START: Self = Self::Closure;
268}
269
270impl From<MappingKind> for ExceptionSlot {
271    fn from(full: MappingKind) -> Self {
272        match full {
273            MappingKind::Lower => Self::Lower,
274            MappingKind::Fold => Self::Fold,
275            MappingKind::Upper => Self::Upper,
276            MappingKind::Title => Self::Title,
277        }
278    }
279}