icu_datetime/provider/pattern/item/
ule.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use super::{GenericPatternItem, PatternItem};
6use crate::provider::fields;
7use core::convert::TryFrom;
8use zerovec::ule::{AsULE, UleError, ULE};
9
/// `PatternItemULE` is a type optimized for efficient storing and
/// deserialization of `FixedCalendarDateTimeFormatter` `PatternItem` elements using
/// the `ZeroVec` model.
///
/// The serialization model packages the pattern item in three bytes.
///
/// The first (most significant) bit of the first byte is used to discriminate
/// the item variant. If the bit is set, then the value is the
/// `PatternItem::Field` variant. Otherwise, the `PatternItem::Literal` is used.
///
/// In case the discriminant is set:
///
/// 1) The rest of the first byte remains unused (validation requires those bits to be zero).
/// 2) The second byte encodes `FieldSymbol` encoded as (Type: 4 bits, Symbol: 4 bits).
/// 3) The third byte encodes the field length.
///
/// If the discriminant is not set, the bottom five bits of the first byte,
/// together with the next two bytes, contain all 21 bits required to encode
/// any [`Unicode Code Point`]. By design, the representation of a code point
/// is the same between [`PatternItemULE`] and [`GenericPatternItemULE`].
///
/// # Diagram
///
/// ```text
/// ┌───────────────┬───────────────┬───────────────┐
/// │       u8      │       u8      │       u8      │
/// ├─┬─┬─┬─┬─┬─┬─┬─┼─┬─┬─┬─┬─┬─┬─┬─┼─┬─┬─┬─┬─┬─┬─┬─┤
/// ├─┴─┴─┼─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┤
/// │     │          Unicode Code Point             │ Literal
/// ├─┬───┴─────────┬───────────────┬───────────────┤
/// │X│             │  FieldSymbol  │  FieldLength  │ Field
/// └─┴─────────────┴───────────────┴───────────────┘
///  ▲
///  │
///  Variant Discriminant
/// ```
///
/// # Optimization
///
/// This model is optimized for efficient packaging of the `PatternItem` elements
/// and performant deserialization from the `PatternItemULE` to `PatternItem` type.
///
/// # Constraints
///
/// The model leaves at most 8 `PatternItem` variants, limits the number of possible
/// field types and symbols to 16 each and limits the number of length variants to 256.
///
/// [`Unicode Code Point`]: http://www.unicode.org/versions/latest/
#[derive(Copy, Clone, Debug, PartialEq)]
#[repr(transparent)]
// This field has many invariants about the data, but the primary safety invariant
// is that when the discriminant bit is 0, it represents a valid char, bytes interpreted as big-endian.
// `AsULE::from_unaligned` for `PatternItem` relies on this invariant when it builds
// the `char` without re-checking it.
pub struct PatternItemULE([u8; 3]);
63
64impl PatternItemULE {
65    /// Given the first byte of the three-byte array that `PatternItemULE` encodes,
66    /// the method determines whether the discriminant in
67    /// the byte indicates that the array encodes the `PatternItem::Field`
68    /// or `PatternItem::Literal` variant of the `PatternItem`.
69    ///
70    /// Returns true when it is a `PatternItem::Field`.
71    #[inline]
72    fn determine_field_from_u8(byte: u8) -> bool {
73        byte & 0b1000_0000 != 0
74    }
75
76    #[inline]
77    fn bytes_in_range(value: (&u8, &u8, &u8)) -> bool {
78        if Self::determine_field_from_u8(*value.0) {
79            // ensure that unused bytes are all zero
80            fields::FieldULE::validate_byte_pair((*value.1, *value.2)).is_ok()
81                && *value.0 == 0b1000_0000
82        } else {
83            // This upholds the safety invariant
84            char::try_from(u32::from_be_bytes([0x00, *value.0, *value.1, *value.2])).is_ok()
85        }
86    }
87}
88
89// Safety (based on the safety checklist on the ULE trait):
90//  1. PatternItemULE does not include any uninitialized or padding bytes.
91//     (achieved by `#[repr(transparent)]` on a ULE type)
92//  2. PatternItemULE is aligned to 1 byte.
93//     (achieved by `#[repr(transparent)]` on a ULE type)
94//  3. The impl of validate_bytes() returns an error if any byte is not valid.
95//  4. The impl of validate_bytes() returns an error if there are extra bytes.
96//  5. The other ULE methods use the default impl.
97//  6. PatternItemULE byte equality is semantic equality.
98unsafe impl ULE for PatternItemULE {
99    fn validate_bytes(bytes: &[u8]) -> Result<(), UleError> {
100        if bytes.len() % 3 != 0 {
101            return Err(UleError::length::<Self>(bytes.len()));
102        }
103
104        #[allow(clippy::indexing_slicing)] // chunks
105        if !bytes
106            .chunks(3)
107            // This upholds the safety invariant by checking all invariants
108            .all(|c| Self::bytes_in_range((&c[0], &c[1], &c[2])))
109        {
110            return Err(UleError::parse::<Self>());
111        }
112        Ok(())
113    }
114}
115
116impl AsULE for PatternItem {
117    type ULE = PatternItemULE;
118
119    #[inline]
120    fn to_unaligned(self) -> Self::ULE {
121        match self {
122            Self::Field(field) => {
123                PatternItemULE([0b1000_0000, field.symbol.idx(), field.length.idx()])
124            }
125            Self::Literal(ch) => {
126                let u = ch as u32;
127                let bytes = u.to_be_bytes();
128                PatternItemULE([bytes[1], bytes[2], bytes[3]])
129            }
130        }
131    }
132
133    #[inline]
134    fn from_unaligned(unaligned: Self::ULE) -> Self {
135        let value = unaligned.0;
136        #[allow(clippy::unwrap_used)] // validated
137        if PatternItemULE::determine_field_from_u8(value[0]) {
138            let symbol = fields::FieldSymbol::from_idx(value[1]).unwrap();
139            let length = fields::FieldLength::from_idx(value[2]).unwrap();
140            PatternItem::Field(fields::Field { symbol, length })
141        } else {
142            // Safety: From field safety invariant
143            PatternItem::Literal(unsafe {
144                char::from_u32_unchecked(u32::from_be_bytes([0x00, value[0], value[1], value[2]]))
145            })
146        }
147    }
148}
149
/// `GenericPatternItemULE` is a type optimized for efficient storing and
/// deserialization of `FixedCalendarDateTimeFormatter` `GenericPatternItem` elements using
/// the `ZeroVec` model.
///
/// The serialization model packages the pattern item in three bytes.
///
/// The first (most significant) bit of the first byte is used to discriminate
/// the item variant. If the bit is set, then the value is the
/// `GenericPatternItem::Placeholder` variant. Otherwise,
/// the `GenericPatternItem::Literal` is used.
///
/// In case the discriminant is set:
///
/// 1) The rest of the first byte remains unused (validation requires those bits to be zero).
/// 2) The second byte is unused (validation requires it to be zero).
/// 3) The third byte encodes the placeholder index.
///
/// If the discriminant is not set, the bottom five bits of the first byte,
/// together with the next two bytes, contain all 21 bits required to encode
/// any [`Unicode Code Point`]. By design, the representation of a code point
/// is the same between [`PatternItemULE`] and [`GenericPatternItemULE`].
///
/// # Diagram
///
/// ```text
/// ┌───────────────┬───────────────┬───────────────┐
/// │       u8      │       u8      │       u8      │
/// ├─┬─┬─┬─┬─┬─┬─┬─┼─┬─┬─┬─┬─┬─┬─┬─┼─┬─┬─┬─┬─┬─┬─┬─┤
/// ├─┴─┴─┼─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┤
/// │     │          Unicode Code Point             │ Literal
/// ├─┬───┴─────────────────────────┬───────────────┤
/// │X│                             │  Placeholder  │ Placeholder
/// └─┴─────────────────────────────┴───────────────┘
///  ▲
///  │
///  Variant Discriminant
/// ```
///
/// # Optimization
///
/// This model is optimized for efficient packaging of the `GenericPatternItem` elements
/// and performant deserialization from the `GenericPatternItemULE` to `GenericPatternItem` type.
///
/// # Constraints
///
/// The model leaves at most 8 `PatternItem` variants, and limits the placeholder
/// to a single u8. Byte validation is stricter than the storage format: it only
/// accepts placeholder indices below 10.
///
/// [`Unicode Code Point`]: http://www.unicode.org/versions/latest/
#[derive(Copy, Clone, Debug, PartialEq)]
#[repr(transparent)]
// Invariant established by validation: when the discriminant bit is 0, the three
// bytes, interpreted as big-endian, represent a valid char.
pub struct GenericPatternItemULE([u8; 3]);
201
202impl GenericPatternItemULE {
203    /// Given the first byte of the three-byte array that `GenericPatternItemULE` encodes,
204    /// the method determines whether the discriminant in
205    /// the byte indicates that the array encodes the `GenericPatternItem::Field`
206    /// or `GenericPatternItem::Literal` variant of the `GenericPatternItem`.
207    ///
208    /// Returns true when it is a `GenericPatternItem::Field`.
209    #[inline]
210    fn determine_field_from_u8(byte: u8) -> bool {
211        byte & 0b1000_0000 != 0
212    }
213
214    #[inline]
215    fn bytes_in_range(value: (&u8, &u8, &u8)) -> bool {
216        if Self::determine_field_from_u8(*value.0) {
217            // ensure that unused bytes are all zero
218            *value.0 == 0b1000_0000 && *value.1 == 0 && *value.2 < 10
219        } else {
220            let u = u32::from_be_bytes([0x00, *value.0, *value.1, *value.2]);
221            char::try_from(u).is_ok()
222        }
223    }
224
225    /// Converts this [`GenericPatternItemULE`] to a [`PatternItemULE`]
226    /// (if a Literal) or returns the placeholder value.
227    #[inline]
228    pub(crate) fn as_pattern_item_ule(&self) -> Result<&PatternItemULE, u8> {
229        if Self::determine_field_from_u8(self.0[0]) {
230            Err(self.0[2])
231        } else {
232            if cfg!(debug_assertions) {
233                let GenericPatternItem::Literal(c) = GenericPatternItem::from_unaligned(*self)
234                else {
235                    unreachable!("expected a literal!")
236                };
237                let pattern_item_ule = PatternItem::Literal(c).to_unaligned();
238                debug_assert_eq!(self.0, pattern_item_ule.0);
239            }
240            // Safety: The two types are repr(transparent) over [u8; 3].
241            // When a Literal, the two ULEs have the same repr,
242            // as shown in the above assertion (and the class docs).
243            Ok(unsafe { core::mem::transmute::<&GenericPatternItemULE, &PatternItemULE>(self) })
244        }
245    }
246}
247
248// Safety (based on the safety checklist on the ULE trait):
249//  1. GenericPatternItemULE does not include any uninitialized or padding bytes.
250//     (achieved by `#[repr(transparent)]` on a type that satisfies this invariant)
251//  2. GenericPatternItemULE is aligned to 1 byte.
252//     (achieved by `#[repr(transparent)]` on a type that satisfies this invariant)
253//  3. The impl of validate_bytes() returns an error if any byte is not valid.
254//  4. The impl of validate_bytes() returns an error if there are extra bytes.
255//  5. The other ULE methods use the default impl.
256//  6. GenericPatternItemULE byte equality is semantic equality.
257unsafe impl ULE for GenericPatternItemULE {
258    fn validate_bytes(bytes: &[u8]) -> Result<(), UleError> {
259        if bytes.len() % 3 != 0 {
260            return Err(UleError::length::<Self>(bytes.len()));
261        }
262        #[allow(clippy::indexing_slicing)] // chunks
263        if !bytes
264            .chunks_exact(3)
265            .all(|c| Self::bytes_in_range((&c[0], &c[1], &c[2])))
266        {
267            return Err(UleError::parse::<Self>());
268        }
269        Ok(())
270    }
271}
272
273impl GenericPatternItem {
274    #[inline]
275    pub(crate) const fn to_unaligned_const(self) -> <Self as AsULE>::ULE {
276        match self {
277            Self::Placeholder(idx) => GenericPatternItemULE([0b1000_0000, 0x00, idx]),
278            Self::Literal(ch) => {
279                let u = ch as u32;
280                let bytes = u.to_be_bytes();
281                GenericPatternItemULE([bytes[1], bytes[2], bytes[3]])
282            }
283        }
284    }
285}
286
287impl AsULE for GenericPatternItem {
288    type ULE = GenericPatternItemULE;
289
290    #[inline]
291    fn to_unaligned(self) -> Self::ULE {
292        self.to_unaligned_const()
293    }
294
295    #[inline]
296    fn from_unaligned(unaligned: Self::ULE) -> Self {
297        let value = unaligned.0;
298        if GenericPatternItemULE::determine_field_from_u8(value[0]) {
299            Self::Placeholder(value[2])
300        } else {
301            #[allow(clippy::unwrap_used)] // validated
302            Self::Literal(
303                char::try_from(u32::from_be_bytes([0x00, value[0], value[1], value[2]])).unwrap(),
304            )
305        }
306    }
307}
308
#[cfg(test)]
mod test {
    use super::*;
    use crate::provider::fields::{FieldLength, FieldSymbol, Second, Year};
    use zerovec::ule::{AsULE, ULE};

    /// Expected encoding of a field item: the discriminant byte followed by
    /// the symbol index and the length index.
    fn field_bytes(symbol: FieldSymbol, length: FieldLength) -> [u8; 3] {
        [0x80, symbol.idx(), length.idx()]
    }

    /// Each `PatternItem` must encode to the expected bytes and decode back
    /// to the same item.
    #[test]
    fn test_pattern_item_as_ule() {
        let samples = [
            (
                PatternItem::from((FieldSymbol::Minute, FieldLength::Two)),
                field_bytes(FieldSymbol::Minute, FieldLength::Two),
            ),
            (
                PatternItem::from((FieldSymbol::Year(Year::Calendar), FieldLength::Four)),
                field_bytes(FieldSymbol::Year(Year::Calendar), FieldLength::Four),
            ),
            (
                PatternItem::from((FieldSymbol::Year(Year::Cyclic), FieldLength::Four)),
                field_bytes(FieldSymbol::Year(Year::Cyclic), FieldLength::Four),
            ),
            (
                PatternItem::from((FieldSymbol::Second(Second::MillisInDay), FieldLength::One)),
                field_bytes(FieldSymbol::Second(Second::MillisInDay), FieldLength::One),
            ),
            (PatternItem::from('z'), [0x00, 0x00, 0x7a]),
        ];

        for (expected_item, expected_bytes) in samples {
            let ule = expected_item.to_unaligned();
            assert_eq!(ULE::slice_as_bytes(&[ule]), expected_bytes);
            assert_eq!(PatternItem::from_unaligned(ule), expected_item);
        }
    }

    /// A sequence of items must serialize to the concatenation of the
    /// expected per-item bytes, and that byte string must validate.
    #[test]
    fn test_pattern_item_ule() {
        let samples = [(
            [
                PatternItem::from((FieldSymbol::Year(Year::Calendar), FieldLength::Four)),
                PatternItem::from('z'),
                PatternItem::from((FieldSymbol::Second(Second::MillisInDay), FieldLength::One)),
            ],
            [
                field_bytes(FieldSymbol::Year(Year::Calendar), FieldLength::Four),
                [0x00, 0x00, 0x7a],
                field_bytes(FieldSymbol::Second(Second::MillisInDay), FieldLength::One),
            ],
        )];

        for (items, expected_chunks) in samples {
            let encoded: Vec<u8> = items
                .iter()
                .flat_map(|item| ULE::slice_as_bytes(&[item.to_unaligned()]).to_vec())
                .collect();
            let expected: Vec<u8> = expected_chunks.concat();

            assert!(PatternItemULE::validate_bytes(&encoded).is_ok());
            assert_eq!(encoded, expected);
        }
    }

    /// Each `GenericPatternItem` must encode to the expected bytes and decode
    /// back to the same item.
    #[test]
    fn test_generic_pattern_item_as_ule() {
        let samples = [
            (GenericPatternItem::Placeholder(4), [0x80, 0x00, 4]),
            (GenericPatternItem::Placeholder(0), [0x80, 0x00, 0]),
            (GenericPatternItem::from('z'), [0x00, 0x00, 0x7a]),
        ];

        for (expected_item, expected_bytes) in samples {
            let ule = expected_item.to_unaligned();
            assert_eq!(ULE::slice_as_bytes(&[ule]), expected_bytes);
            assert_eq!(GenericPatternItem::from_unaligned(ule), expected_item);
        }
    }
}
412}