icu_casemap/provider/
exceptions.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! This is the main module pertaining to casemapping exceptions.
6//!
7//! A single exception is represented by the [`Exception`] type and its ULE equivalent.
8//!
9//! The storage format is complicated (and documented on [`Exception`]), but the data format is
10//! represented equally by [`DecodedException`], which is more human-readable.
11use icu_provider::prelude::*;
12
13use super::data::MappingKind;
14use super::exception_helpers::{ExceptionBits, ExceptionSlot, SlotPresence};
15use crate::set::ClosureSink;
16use alloc::borrow::Cow;
17use core::fmt;
18#[cfg(any(feature = "serde", feature = "datagen"))]
19use core::ops::Range;
20use core::ptr;
21use zerovec::ule::AsULE;
22use zerovec::VarZeroVec;
23
/// First code point of the UTF-16 surrogate block, which no `char` can hold.
const SURROGATES_START: u32 = 0xD800;
/// Number of code points in the surrogate block (0xD800 through 0xDFFF inclusive),
/// expressed as the distance from its start to 0xE000, the first code point after it.
const SURROGATES_LEN: u32 = 0xE000 - SURROGATES_START;
26
/// This represents case mapping exceptions that can't be represented as a delta applied to
/// the original code point. The code point trie in CaseMapper stores indices into this
/// VarZeroVec.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_casemap::provider::exceptions))]
#[derive(Debug, Eq, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
pub struct CaseMapExceptions<'data> {
    #[cfg_attr(feature = "serde", serde(borrow))]
    /// The list of exceptions, each stored in its encoded [`ExceptionULE`] form and
    /// addressed by position.
    pub exceptions: VarZeroVec<'data, ExceptionULE>,
}
45
46impl CaseMapExceptions<'_> {
47    /// Obtain the exception at index `idx`. Will
48    /// return a default value if not present (GIGO behavior),
49    /// as these indices should come from a paired CaseMapData object
50    ///
51    /// Will also panic in debug mode
52    pub fn get(&self, idx: u16) -> &ExceptionULE {
53        let exception = self.exceptions.get(idx.into());
54        debug_assert!(exception.is_some());
55
56        exception.unwrap_or(ExceptionULE::empty_exception())
57    }
58
59    #[cfg(any(feature = "serde", feature = "datagen"))]
60    pub(crate) fn validate(&self) -> Result<Range<u16>, &'static str> {
61        for exception in self.exceptions.iter() {
62            exception.validate()?;
63        }
64        u16::try_from(self.exceptions.len())
65            .map_err(|_| "Too many exceptions")
66            .map(|l| 0..l)
67    }
68}
/// A type representing the wire format of `Exception`. The data contained is
/// equivalently represented by [`DecodedException`].
///
/// This type is itself not used that much, most of its relevant methods live
/// on [`ExceptionULE`].
///
/// The `bits` contain supplementary data, whereas
/// `slot_presence` marks the presence of various extra data
/// in the `data` field.
///
/// The `data` field is not validated to contain all of this data,
/// this type will have GIGO behavior when constructed with invalid `data`.
///
/// The format of `data` is documented on the field
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[zerovec::make_varule(ExceptionULE)]
#[derive(PartialEq, Eq, Clone, Default, Debug)]
#[zerovec::skip_derive(Ord)]
#[cfg_attr(
    feature = "serde",
    derive(serde::Deserialize),
    zerovec::derive(Deserialize)
)]
#[cfg_attr(
    feature = "datagen",
    derive(serde::Serialize),
    zerovec::derive(Serialize)
)]
pub struct Exception<'a> {
    /// The various bit based exception data associated with this.
    ///
    /// Format: Just a u8 of bitflags, some flags unused. See [`ExceptionBits`] and its ULE type for more.
    pub bits: ExceptionBits,
    /// Which slots are present in `data`.
    ///
    /// Format: a u8 of bitflags
    pub slot_presence: SlotPresence,
    /// The packed slot contents.
    ///
    /// Format : `[char slots] [optional closure length] [ closure slot ] [ full mappings data ]`
    ///
    /// For each set SlotPresence bit, except for the two stringy slots (Closure/FullMapping),
    /// this will have one entry in the string, packed together.
    ///
    /// Note that the simple_case delta is stored as a u32 normalized to a `char`, where u32s
    /// which are from or beyond the surrogate range 0xD800-0xDFFF are stored as chars
    /// starting from 0xE000. The sign is stored in bits.negative_delta.
    ///
    /// If both Closure/FullMapping are present, the next char will be the length of the closure slot
    /// (its length in bytes, encoded as a `char`), bisecting the rest of the data.
    /// If only one is present, the rest of the data represents that slot.
    ///
    /// The closure slot simply represents one string. The full-mappings slot represents four strings,
    /// packed in a way similar to VarZeroVec, in the following format:
    /// `i1 i2 i3 [ str0 ] [ str1 ] [ str2 ] [ str3 ]`
    ///
    /// where `i1 i2 i3` are the byte offsets, each encoded as a `char` and counted from the start
    /// of `str0`, at which `str1`, `str2` and `str3` begin. The strings are stored in
    /// the order corresponding to the MappingKind enum.
    pub data: Cow<'a, str>,
}
132
133impl ExceptionULE {
134    #[inline]
135    fn empty_exception() -> &'static Self {
136        static EMPTY_BYTES: &[u8] = &[0, 0];
137        // Safety:
138        // ExceptionULE is a packed DST with `(u8, u8, unsized)` fields. All bit patterns are valid for the two u8s
139        //
140        // An "empty" one can be constructed from a slice of two u8s
141        unsafe {
142            let slice: *const [u8] = ptr::slice_from_raw_parts(EMPTY_BYTES.as_ptr(), 0);
143            &*(slice as *const Self)
144        }
145    }
    /// Returns whether `slot` is marked as present in this exception's
    /// `slot_presence` flags.
    pub(crate) fn has_slot(&self, slot: ExceptionSlot) -> bool {
        self.slot_presence.has_slot(slot)
    }
149    /// Obtain a `char` slot, if occupied. If `slot` represents a string slot,
150    /// will return `None`
151    pub(crate) fn get_char_slot(&self, slot: ExceptionSlot) -> Option<char> {
152        if slot >= ExceptionSlot::STRING_SLOTS_START {
153            return None;
154        }
155        let bit = 1 << (slot as u8);
156        // check if slot is occupied
157        if self.slot_presence.0 & bit == 0 {
158            return None;
159        }
160
161        let previous_slot_mask = bit - 1;
162        let previous_slots = self.slot_presence.0 & previous_slot_mask;
163        let slot_num = previous_slots.count_ones() as usize;
164        self.data.chars().nth(slot_num)
165    }
166
167    /// Get the `simple_case` delta (i.e. the `delta` slot), given the character
168    /// this data belongs to.
169    ///
170    /// Normalizes the delta from char-format to u32 format
171    ///
172    /// Does *not* handle the sign of the delta; see self.bits.negative_delta
173    fn get_simple_case_delta(&self) -> Option<u32> {
174        let delta_ch = self.get_char_slot(ExceptionSlot::Delta)?;
175        let mut delta = u32::from(delta_ch);
176        // We "fill in" the surrogates range by offsetting deltas greater than it
177        if delta >= SURROGATES_START {
178            delta -= SURROGATES_LEN;
179        }
180        Some(delta)
181    }
182
183    /// Get the `simple_case` value (i.e. the `delta` slot), given the character
184    /// this data belongs to.
185    ///
186    /// The data is stored as a delta so the character must be provided.
187    ///
188    /// The data cannot be stored directly as a character because the trie is more
189    /// compact with adjacent characters sharing deltas.
190    pub(crate) fn get_simple_case_slot_for(&self, ch: char) -> Option<char> {
191        let delta = self.get_simple_case_delta()?;
192        let mut delta = i32::try_from(delta).ok()?;
193        if self.bits.negative_delta() {
194            delta = -delta;
195        }
196
197        let new_ch = i32::try_from(u32::from(ch)).ok()? + delta;
198
199        char::try_from(u32::try_from(new_ch).ok()?).ok()
200    }
201
202    /// Returns *all* the data in the closure/full slots, including length metadata
203    fn get_stringy_data(&self) -> Option<&str> {
204        const CHAR_MASK: u8 = (1 << ExceptionSlot::STRING_SLOTS_START as u8) - 1;
205        let char_slot_count = (self.slot_presence.0 & CHAR_MASK).count_ones() as usize;
206        let mut chars = self.data.chars();
207        for _ in 0..char_slot_count {
208            let res = chars.next();
209            res?;
210        }
211        Some(chars.as_str())
212    }
213
214    /// Returns a single stringy slot, either ExceptionSlot::Closure
215    /// or ExceptionSlot::FullMappings.
216    fn get_stringy_slot(&self, slot: ExceptionSlot) -> Option<&str> {
217        debug_assert!(slot == ExceptionSlot::Closure || slot == ExceptionSlot::FullMappings);
218        let other_slot = if slot == ExceptionSlot::Closure {
219            ExceptionSlot::FullMappings
220        } else {
221            ExceptionSlot::Closure
222        };
223        if !self.slot_presence.has_slot(slot) {
224            return None;
225        }
226        let stringy_data = self.get_stringy_data()?;
227
228        if self.slot_presence.has_slot(other_slot) {
229            // both stringy slots are used, we need a length
230            let mut chars = stringy_data.chars();
231            // GIGO: to have two strings there must be a length, if not present return None
232            let length_char = chars.next()?;
233
234            let length = usize::try_from(u32::from(length_char)).unwrap_or(0);
235            // The length indexes into the string after the first char
236            let remaining_slice = chars.as_str();
237            // GIGO: will return none if there wasn't enough space in this slot
238            if slot == ExceptionSlot::Closure {
239                remaining_slice.get(0..length)
240            } else {
241                remaining_slice.get(length..)
242            }
243        } else {
244            // only a single stringy slot, there is no length stored
245            Some(stringy_data)
246        }
247    }
248
    /// Get the data behind the `closure` slot, or `None` if the slot is absent
    /// or cannot be extracted from `data`.
    pub(crate) fn get_closure_slot(&self) -> Option<&str> {
        self.get_stringy_slot(ExceptionSlot::Closure)
    }
253
    /// Get all the slot data for the FullMappings slot, i.e. the three index
    /// chars followed by the four packed strings.
    ///
    /// This needs to be further segmented into four based on length metadata
    fn get_fullmappings_slot_data(&self) -> Option<&str> {
        self.get_stringy_slot(ExceptionSlot::FullMappings)
    }
260
261    /// Get a specific FullMappings slot value
262    pub(crate) fn get_fullmappings_slot_for_kind(&self, kind: MappingKind) -> Option<&str> {
263        let data = self.get_fullmappings_slot_data()?;
264
265        let mut chars = data.chars();
266        // GIGO: must have three index strings, else return None
267        let i1 = usize::try_from(u32::from(chars.next()?)).ok()?;
268        let i2 = usize::try_from(u32::from(chars.next()?)).ok()?;
269        let i3 = usize::try_from(u32::from(chars.next()?)).ok()?;
270        let remaining_slice = chars.as_str();
271        // GIGO: if the indices are wrong, return None
272        match kind {
273            MappingKind::Lower => remaining_slice.get(..i1),
274            MappingKind::Fold => remaining_slice.get(i1..i2),
275            MappingKind::Upper => remaining_slice.get(i2..i3),
276            MappingKind::Title => remaining_slice.get(i3..),
277        }
278    }
279
280    // convenience function that lets us use the ? operator
281    fn get_all_fullmapping_slots(&self) -> Option<[Cow<'_, str>; 4]> {
282        Some([
283            self.get_fullmappings_slot_for_kind(MappingKind::Lower)?
284                .into(),
285            self.get_fullmappings_slot_for_kind(MappingKind::Fold)?
286                .into(),
287            self.get_fullmappings_slot_for_kind(MappingKind::Upper)?
288                .into(),
289            self.get_fullmappings_slot_for_kind(MappingKind::Title)?
290                .into(),
291        ])
292    }
293
294    // Given a mapping kind, returns the character for that kind, if it exists. Fold falls
295    // back to Lower; Title falls back to Upper.
296    #[inline]
297    pub(crate) fn slot_char_for_kind(&self, kind: MappingKind) -> Option<char> {
298        match kind {
299            MappingKind::Lower | MappingKind::Upper => self.get_char_slot(kind.into()),
300            MappingKind::Fold => self
301                .get_char_slot(ExceptionSlot::Fold)
302                .or_else(|| self.get_char_slot(ExceptionSlot::Lower)),
303            MappingKind::Title => self
304                .get_char_slot(ExceptionSlot::Title)
305                .or_else(|| self.get_char_slot(ExceptionSlot::Upper)),
306        }
307    }
308
309    pub(crate) fn add_full_and_closure_mappings<S: ClosureSink>(&self, set: &mut S) {
310        if let Some(full) = self.get_fullmappings_slot_for_kind(MappingKind::Fold) {
311            if !full.is_empty() {
312                set.add_string(full);
313            }
314        };
315        if let Some(closure) = self.get_closure_slot() {
316            for c in closure.chars() {
317                set.add_char(c);
318            }
319        };
320    }
321
322    /// Extract all the data out into a structured form
323    ///
324    /// Useful for serialization and debugging
325    pub fn decode(&self) -> DecodedException<'_> {
326        // Potential future optimization: This can
327        // directly access each bit one after the other and iterate the string
328        // which avoids recomputing slot offsets over and over again.
329        //
330        // If we're doing so we may wish to retain this older impl so that we can still roundtrip test
331        let bits = self.bits;
332        let lowercase = self.get_char_slot(ExceptionSlot::Lower);
333        let casefold = self.get_char_slot(ExceptionSlot::Fold);
334        let uppercase = self.get_char_slot(ExceptionSlot::Upper);
335        let titlecase = self.get_char_slot(ExceptionSlot::Title);
336        let simple_case_delta = self.get_simple_case_delta();
337        let closure = self.get_closure_slot().map(Into::into);
338        let full = self.get_all_fullmapping_slots();
339
340        DecodedException {
341            bits: ExceptionBits::from_unaligned(bits),
342            lowercase,
343            casefold,
344            uppercase,
345            titlecase,
346            simple_case_delta,
347            closure,
348            full,
349        }
350    }
351
352    #[cfg(any(feature = "serde", feature = "datagen"))]
353    pub(crate) fn validate(&self) -> Result<(), &'static str> {
354        // check that ICU4C specific fields are not set
355        // check that there is enough space for all the offsets
356        if self.bits.double_width_slots() {
357            return Err("double-width-slots should not be used in ICU4C");
358        }
359
360        // just run all of the slot getters at once and then check
361        let decoded = self.decode();
362
363        for (slot, decoded_slot) in [
364            (ExceptionSlot::Lower, &decoded.lowercase),
365            (ExceptionSlot::Fold, &decoded.casefold),
366            (ExceptionSlot::Upper, &decoded.uppercase),
367            (ExceptionSlot::Title, &decoded.titlecase),
368        ] {
369            if self.has_slot(slot) && decoded_slot.is_none() {
370                // decoding hit GIGO behavior, oops!
371                return Err("Slot decoding failed");
372            }
373        }
374        if self.has_slot(ExceptionSlot::Delta) && decoded.simple_case_delta.is_none() {
375            // decoding hit GIGO behavior, oops!
376            return Err("Slot decoding failed");
377        }
378
379        if self.has_slot(ExceptionSlot::Closure) && decoded.closure.is_none() {
380            return Err("Slot decoding failed");
381        }
382
383        if self.has_slot(ExceptionSlot::FullMappings) {
384            if decoded.full.is_some() {
385                let data = self
386                    .get_fullmappings_slot_data()
387                    .ok_or("fullmappings slot doesn't parse")?;
388                let mut chars = data.chars();
389                let i1 = u32::from(chars.next().ok_or("fullmappings string too small")?);
390                let i2 = u32::from(chars.next().ok_or("fullmappings string too small")?);
391                let i3 = u32::from(chars.next().ok_or("fullmappings string too small")?);
392
393                if i2 < i1 || i3 < i2 {
394                    return Err("fullmappings string contains non-sequential indices");
395                }
396                let rest = chars.as_str();
397                let len = u32::try_from(rest.len()).map_err(|_| "len too large for u32")?;
398
399                if i1 > len || i2 > len || i3 > len {
400                    return Err("fullmappings string contains out-of-bounds indices");
401                }
402            } else {
403                return Err("Slot decoding failed");
404            }
405        }
406
407        Ok(())
408    }
409}
410
411impl fmt::Debug for ExceptionULE {
412    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
413        self.decode().fmt(f)
414    }
415}
416
/// A decoded [`Exception`] type, with all of the data parsed out into
/// separate fields.
///
/// Each `Option` field is `None` when the corresponding slot is not present.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(serde::Serialize))]
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct DecodedException<'a> {
    /// The various bit-based data associated with this exception
    pub bits: ExceptionBits,
    /// Lowercase mapping
    pub lowercase: Option<char>,
    /// Case folding
    pub casefold: Option<char>,
    /// Uppercase mapping
    pub uppercase: Option<char>,
    /// Titlecase mapping
    pub titlecase: Option<char>,
    /// The magnitude of the simple_case delta. Its sign is stored in bits.negative_delta
    pub simple_case_delta: Option<u32>,
    /// Closure mappings
    pub closure: Option<Cow<'a, str>>,
    /// The four full-mappings strings, indexed by MappingKind u8 value
    /// (decoded in the order Lower, Fold, Upper, Title)
    pub full: Option<[Cow<'a, str>; 4]>,
}
446
447impl DecodedException<'_> {
448    /// Convert to a wire-format encodeable (VarULE-encodeable) [`Exception`]
449    pub fn encode(&self) -> Exception<'static> {
450        let bits = self.bits;
451        let mut slot_presence = SlotPresence(0);
452        let mut data = alloc::string::String::new();
453        if let Some(lowercase) = self.lowercase {
454            slot_presence.add_slot(ExceptionSlot::Lower);
455            data.push(lowercase)
456        }
457        if let Some(casefold) = self.casefold {
458            slot_presence.add_slot(ExceptionSlot::Fold);
459            data.push(casefold)
460        }
461        if let Some(uppercase) = self.uppercase {
462            slot_presence.add_slot(ExceptionSlot::Upper);
463            data.push(uppercase)
464        }
465        if let Some(titlecase) = self.titlecase {
466            slot_presence.add_slot(ExceptionSlot::Title);
467            data.push(titlecase)
468        }
469        if let Some(mut simple_case_delta) = self.simple_case_delta {
470            slot_presence.add_slot(ExceptionSlot::Delta);
471
472            if simple_case_delta >= SURROGATES_START {
473                simple_case_delta += SURROGATES_LEN;
474            }
475            let simple_case_delta = char::try_from(simple_case_delta).unwrap_or('\0');
476            data.push(simple_case_delta)
477        }
478
479        if let Some(ref closure) = self.closure {
480            slot_presence.add_slot(ExceptionSlot::Closure);
481            if self.full.is_some() {
482                // GIGO: if the closure length is more than 0xD800 this will error. Plenty of space.
483                debug_assert!(
484                    closure.len() < 0xD800,
485                    "Found overlarge closure value when encoding exception"
486                );
487                let len_char = u32::try_from(closure.len())
488                    .ok()
489                    .and_then(|c| char::try_from(c).ok())
490                    .unwrap_or('\0');
491                data.push(len_char);
492            }
493            data.push_str(closure);
494        }
495        if let Some(ref full) = self.full {
496            slot_presence.add_slot(ExceptionSlot::FullMappings);
497            let mut idx = 0;
498            // iterate all elements except the last, whose length we can calculate from context
499            for mapping in full.iter().take(3) {
500                idx += mapping.len();
501                data.push(char::try_from(u32::try_from(idx).unwrap_or(0)).unwrap_or('\0'));
502            }
503            for mapping in full {
504                data.push_str(mapping);
505            }
506        }
507        Exception {
508            bits,
509            slot_presence,
510            data: data.into(),
511        }
512    }
513
514    // Potential optimization: Write an `EncodeAsVarULE` that
515    // directly produces an ExceptionULE
516}
517
#[cfg(test)]
mod tests {
    use super::*;

    /// Encode `exception` to its boxed VarULE form, decode it again, and check
    /// that nothing was lost along the way.
    fn test_roundtrip_once(exception: DecodedException) {
        let wire_format = exception.encode();
        let boxed = zerovec::ule::encode_varule_to_box(&wire_format);
        assert_eq!(boxed.decode(), exception);
    }

    #[test]
    fn test_roundtrip() {
        let cases = [
            // a single char slot
            DecodedException {
                lowercase: Some('ø'),
                ..Default::default()
            },
            // two non-adjacent char slots
            DecodedException {
                titlecase: Some('X'),
                lowercase: Some('ø'),
                ..Default::default()
            },
            // only a later char slot
            DecodedException {
                titlecase: Some('X'),
                ..Default::default()
            },
            // delta past the surrogate range, closure only
            DecodedException {
                titlecase: Some('X'),
                simple_case_delta: Some(0xE999),
                closure: Some("hello world".into()),
                ..Default::default()
            },
            // small delta with both stringy slots
            DecodedException {
                simple_case_delta: Some(10),
                closure: Some("hello world".into()),
                full: Some(["你好世界".into(), "".into(), "hi".into(), "å".into()]),
                ..Default::default()
            },
            // both stringy slots, no char slots
            DecodedException {
                closure: Some("hello world".into()),
                full: Some(["aa".into(), "È›".into(), "".into(), "Ã¥".into()]),
                ..Default::default()
            },
            // full mappings only
            DecodedException {
                full: Some(["你好世界".into(), "".into(), "hi".into(), "å".into()]),
                ..Default::default()
            },
        ];

        for case in cases {
            test_roundtrip_once(case);
        }
    }
}