icu_casemap/provider/exception_helpers.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! This module contains various types for the header part of casemapping exception data
6//!
7//! This is both used in datagen to decode ICU4C's data, and natively in ICU4X's
8//! own data model.
9//!
10//! [`ExceptionBits`] is the bag of bits associated with exceptions, and [`SlotPresence`]
11//! marks the presence or absence of various "slots" in a given exception.
12//!
//! The `exceptions_builder` module of this crate handles decoding ICU4C data using the exception
//! header, and [`crate::provider::exceptions`] handles ICU4X's own exception data model.
15
16use crate::provider::data::{DotType, MappingKind};
17use zerovec::ule::{AsULE, ULE};
18
19/// A bunch of bits associated with each exception.
20///
21/// <div class="stab unstable">
22/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
23/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
24/// to be stable, their Rust representation might not be. Use with caution.
25/// </div>
26#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
27#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
28#[cfg_attr(feature = "datagen", derive(serde::Serialize))]
29pub struct ExceptionBits {
30 /// Whether or not the slots are double-width.
31 ///
32 /// Unused in ICU4X
33 pub double_width_slots: bool,
34 /// There is no simple casefolding, even if there is a simple lowercase mapping
35 pub no_simple_case_folding: bool,
36 /// The delta stored in the `Delta` slot is negative
37 pub negative_delta: bool,
38 /// If the character is case sensitive
39 pub is_sensitive: bool,
40 /// The dot type of the character
41 pub dot_type: DotType,
42 /// If the character has conditional special casing
43 pub has_conditional_special: bool,
44 /// If the character has conditional case folding
45 pub has_conditional_fold: bool,
46}
47
48impl ExceptionBits {
49 /// Extract from the upper half of an ICU4C-format u16
50 pub(crate) fn from_integer(int: u8) -> Self {
51 let ule = ExceptionBitsULE(int);
52 let double_width_slots = ule.double_width_slots();
53 let no_simple_case_folding = ule.no_simple_case_folding();
54 let negative_delta = ule.negative_delta();
55 let is_sensitive = ule.is_sensitive();
56 let has_conditional_special = ule.has_conditional_special();
57 let has_conditional_fold = ule.has_conditional_fold();
58 let dot_type = ule.dot_type();
59
60 Self {
61 double_width_slots,
62 no_simple_case_folding,
63 negative_delta,
64 is_sensitive,
65 dot_type,
66 has_conditional_special,
67 has_conditional_fold,
68 }
69 }
70
71 /// Convert to an ICU4C-format upper half of u16
72 pub(crate) fn to_integer(self) -> u8 {
73 let mut int = 0;
74 let dot_data = (self.dot_type as u8) << ExceptionBitsULE::DOT_SHIFT;
75 int |= dot_data;
76
77 if self.double_width_slots {
78 int |= ExceptionBitsULE::DOUBLE_SLOTS_FLAG
79 }
80 if self.no_simple_case_folding {
81 int |= ExceptionBitsULE::NO_SIMPLE_CASE_FOLDING_FLAG
82 }
83 if self.negative_delta {
84 int |= ExceptionBitsULE::NEGATIVE_DELTA_FLAG
85 }
86 if self.is_sensitive {
87 int |= ExceptionBitsULE::SENSITIVE_FLAG
88 }
89 if self.has_conditional_special {
90 int |= ExceptionBitsULE::CONDITIONAL_SPECIAL_FLAG
91 }
92 if self.has_conditional_fold {
93 int |= ExceptionBitsULE::CONDITIONAL_FOLD_FLAG
94 }
95 int
96 }
97}
98
99/// Packed slot presence marker
100///
101/// All bits are valid, though bit 4 is unused and reserved
102///
103/// Bits:
104///
105/// ```text
106/// 0: Lowercase mapping (code point)
107/// 1: Case folding (code point)
108/// 2: Uppercase mapping (code point)
109/// 3: Titlecase mapping (code point)
110/// 4: Delta to simple case mapping (code point) (sign stored separately)
111/// 5: RESERVED
112/// 6: Closure mappings (string; see below)
113/// 7: Full mappings (strings; see below)
114/// ```
115///
116/// <div class="stab unstable">
117/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
118/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
119/// to be stable, their Rust representation might not be. Use with caution.
120/// </div>
121#[derive(Copy, Clone, PartialEq, Eq, ULE, Debug, Default)]
122#[repr(transparent)]
123#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
124#[cfg_attr(feature = "datagen", derive(serde::Serialize))]
125pub struct SlotPresence(pub u8);
126
127impl SlotPresence {
128 pub(crate) fn add_slot(&mut self, slot: ExceptionSlot) {
129 self.0 |= 1 << slot as u8;
130 }
131 pub(crate) fn has_slot(self, slot: ExceptionSlot) -> bool {
132 let bit = 1 << (slot as u8);
133 self.0 & bit != 0
134 }
135}
136
137/// The bitflags on an exception header.
138///
139/// Format from icu4c, documented in casepropsbuilder.cpp, shifted 8 bits since ICU4C has this packed
140/// alongside a SlotPresence
141///
142/// ```text
143/// 0 Double-width slots. If set, then each optional slot is stored as two
144/// elements of the array (high and low halves of 32-bit values) instead of
145/// a single element.
146/// 1 Has no simple case folding, even if there is a simple lowercase mapping
147/// 2 The value in the delta slot is negative
148/// 3 Is case-sensitive (not exposed)
149/// 4..5 Dot type
150/// 6 Has conditional special casing
151/// 7 Has conditional case folding
152/// ```
153///
154/// All bits are valid, though in ICU4X data bits 0 and 2 are not used
155///
156/// <div class="stab unstable">
157/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
158/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
159/// to be stable, their Rust representation might not be. Use with caution.
160/// </div>
161#[derive(Copy, Clone, PartialEq, Eq, ULE, Debug)]
162#[repr(transparent)]
163pub struct ExceptionBitsULE(pub u8);
164
165impl ExceptionBitsULE {
166 const DOUBLE_SLOTS_FLAG: u8 = 0x1;
167
168 const NO_SIMPLE_CASE_FOLDING_FLAG: u8 = 0x2;
169 const NEGATIVE_DELTA_FLAG: u8 = 0x4;
170 const SENSITIVE_FLAG: u8 = 0x8;
171
172 const DOT_SHIFT: u8 = 4;
173
174 const CONDITIONAL_SPECIAL_FLAG: u8 = 0x40;
175 const CONDITIONAL_FOLD_FLAG: u8 = 0x80;
176}
177
178impl ExceptionBitsULE {
179 /// Whether or not the slots are double-width.
180 ///
181 /// Unused in ICU4X
182 pub fn double_width_slots(self) -> bool {
183 self.0 & Self::DOUBLE_SLOTS_FLAG != 0
184 }
185
186 /// There is no simple casefolding, even if there is a simple lowercase mapping
187 pub fn no_simple_case_folding(self) -> bool {
188 self.0 & Self::NO_SIMPLE_CASE_FOLDING_FLAG != 0
189 }
190
191 /// The delta stored in the `Delta` slot is negative
192 pub fn negative_delta(self) -> bool {
193 self.0 & Self::NEGATIVE_DELTA_FLAG != 0
194 }
195
196 /// If the character is case sensitive
197 pub fn is_sensitive(self) -> bool {
198 self.0 & Self::SENSITIVE_FLAG != 0
199 }
200
201 /// If the character has conditional special casing
202 pub fn has_conditional_special(self) -> bool {
203 self.0 & Self::CONDITIONAL_SPECIAL_FLAG != 0
204 }
205
206 /// If the character has conditional case folding
207 pub fn has_conditional_fold(self) -> bool {
208 self.0 & Self::CONDITIONAL_FOLD_FLAG != 0
209 }
210
211 /// The dot type of the character
212 pub fn dot_type(self) -> DotType {
213 DotType::from_masked_bits((u16::from(self.0 >> Self::DOT_SHIFT)) & DotType::DOT_MASK)
214 }
215}
216
217impl AsULE for ExceptionBits {
218 type ULE = ExceptionBitsULE;
219 fn from_unaligned(u: ExceptionBitsULE) -> Self {
220 ExceptionBits::from_integer(u.0)
221 }
222
223 fn to_unaligned(self) -> ExceptionBitsULE {
224 ExceptionBitsULE(self.to_integer())
225 }
226}
227
228impl AsULE for SlotPresence {
229 type ULE = SlotPresence;
230 fn from_unaligned(u: Self) -> Self {
231 u
232 }
233
234 fn to_unaligned(self) -> Self {
235 self
236 }
237}
238
/// The slots that slot-based exception data may contain.
///
/// Each discriminant is the index of the bit that marks the slot in a `SlotPresence`.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Copy, Clone, Debug, PartialOrd, Ord, PartialEq, Eq)]
pub(crate) enum ExceptionSlot {
    /// Slot holding the lowercase mapping
    Lower = 0,
    /// Slot holding the case folding
    Fold = 1,
    /// Slot holding the uppercase mapping
    Upper = 2,
    /// Slot holding the titlecase mapping
    Title = 3,
    /// Slot holding the delta; its sign is tracked by `ExceptionBits::negative_delta`
    Delta = 4,
    // Index 5 is reserved and has no variant
    /// Slot holding the closure set
    Closure = 6,
    /// Slot holding the four full-mappings
    FullMappings = 7,
}
264
265impl ExceptionSlot {
266 /// Where the string slots begin
267 pub(crate) const STRING_SLOTS_START: Self = Self::Closure;
268}
269
270impl From<MappingKind> for ExceptionSlot {
271 fn from(full: MappingKind) -> Self {
272 match full {
273 MappingKind::Lower => Self::Lower,
274 MappingKind::Fold => Self::Fold,
275 MappingKind::Upper => Self::Upper,
276 MappingKind::Title => Self::Title,
277 }
278 }
279}