icu_datetime/provider/pattern/item/ule.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use super::{GenericPatternItem, PatternItem};
6use crate::provider::fields;
7use core::convert::TryFrom;
8use zerovec::ule::{AsULE, UleError, ULE};
9
/// `PatternItemULE` is a type optimized for efficient storing and
/// deserialization of `FixedCalendarDateTimeFormatter` `PatternItem` elements using
/// the `ZeroVec` model.
///
/// The serialization model packages the pattern item in three bytes.
///
/// The first (most significant) bit of the first byte is used to discriminate
/// the item variant. If the bit is set, then the value is the
/// `PatternItem::Field` variant. Otherwise, the `PatternItem::Literal` is used.
///
/// In case the discriminant is set:
///
/// 1) The rest of the first byte remains unused (validation requires it to be zero).
/// 2) The second byte encodes `FieldSymbol` encoded as (Type: 4 bits, Symbol: 4 bits).
/// 3) The third byte encodes the field length.
///
/// If the discriminant is not set, the bottom five bits of the first byte,
/// together with the next two bytes, contain all 21 bits required to encode
/// any [`Unicode Code Point`]. By design, the representation of a code point
/// is the same between [`PatternItemULE`] and [`GenericPatternItemULE`].
///
/// # Diagram
///
/// ```text
/// ┌───────────────┬───────────────┬───────────────┐
/// │       u8      │       u8      │       u8      │
/// ├─┬─┬─┬─┬─┬─┬─┬─┼─┬─┬─┬─┬─┬─┬─┬─┼─┬─┬─┬─┬─┬─┬─┬─┤
/// ├─┴─┴─┼─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┤
/// │     │          Unicode Code Point             │ Literal
/// ├─┬───┴─────────┬───────────────┬───────────────┤
/// │X│             │  FieldSymbol  │  FieldLength  │ Field
/// └─┴─────────────┴───────────────┴───────────────┘
///  ▲
///  │
///  Variant Discriminant
/// ```
///
/// # Optimization
///
/// This model is optimized for efficient packaging of the `PatternItem` elements
/// and performant deserialization from the `PatternItemULE` to `PatternItem` type.
///
/// # Constraints
///
/// The model leaves at most 8 `PatternItem` variants, limits the number of possible
/// field types and symbols to 16 each and limits the number of length variants to 256.
///
/// [`Unicode Code Point`]: http://www.unicode.org/versions/latest/
#[derive(Copy, Clone, Debug, PartialEq)]
#[repr(transparent)]
// This field has many invariants about the data, but the primary safety invariant
// is that when the discriminant bit is 0, it represents a valid char, bytes interpreted
// as big-endian. `from_unaligned` relies on this when it builds the `char` without
// re-checking it.
pub struct PatternItemULE([u8; 3]);
63
64impl PatternItemULE {
65 /// Given the first byte of the three-byte array that `PatternItemULE` encodes,
66 /// the method determines whether the discriminant in
67 /// the byte indicates that the array encodes the `PatternItem::Field`
68 /// or `PatternItem::Literal` variant of the `PatternItem`.
69 ///
70 /// Returns true when it is a `PatternItem::Field`.
71 #[inline]
72 fn determine_field_from_u8(byte: u8) -> bool {
73 byte & 0b1000_0000 != 0
74 }
75
76 #[inline]
77 fn bytes_in_range(value: (&u8, &u8, &u8)) -> bool {
78 if Self::determine_field_from_u8(*value.0) {
79 // ensure that unused bytes are all zero
80 fields::FieldULE::validate_byte_pair((*value.1, *value.2)).is_ok()
81 && *value.0 == 0b1000_0000
82 } else {
83 // This upholds the safety invariant
84 char::try_from(u32::from_be_bytes([0x00, *value.0, *value.1, *value.2])).is_ok()
85 }
86 }
87}
88
89// Safety (based on the safety checklist on the ULE trait):
90// 1. PatternItemULE does not include any uninitialized or padding bytes.
91// (achieved by `#[repr(transparent)]` on a ULE type)
92// 2. PatternItemULE is aligned to 1 byte.
93// (achieved by `#[repr(transparent)]` on a ULE type)
94// 3. The impl of validate_bytes() returns an error if any byte is not valid.
95// 4. The impl of validate_bytes() returns an error if there are extra bytes.
96// 5. The other ULE methods use the default impl.
97// 6. PatternItemULE byte equality is semantic equality.
98unsafe impl ULE for PatternItemULE {
99 fn validate_bytes(bytes: &[u8]) -> Result<(), UleError> {
100 if bytes.len() % 3 != 0 {
101 return Err(UleError::length::<Self>(bytes.len()));
102 }
103
104 #[allow(clippy::indexing_slicing)] // chunks
105 if !bytes
106 .chunks(3)
107 // This upholds the safety invariant by checking all invariants
108 .all(|c| Self::bytes_in_range((&c[0], &c[1], &c[2])))
109 {
110 return Err(UleError::parse::<Self>());
111 }
112 Ok(())
113 }
114}
115
116impl AsULE for PatternItem {
117 type ULE = PatternItemULE;
118
119 #[inline]
120 fn to_unaligned(self) -> Self::ULE {
121 match self {
122 Self::Field(field) => {
123 PatternItemULE([0b1000_0000, field.symbol.idx(), field.length.idx()])
124 }
125 Self::Literal(ch) => {
126 let u = ch as u32;
127 let bytes = u.to_be_bytes();
128 PatternItemULE([bytes[1], bytes[2], bytes[3]])
129 }
130 }
131 }
132
133 #[inline]
134 fn from_unaligned(unaligned: Self::ULE) -> Self {
135 let value = unaligned.0;
136 #[allow(clippy::unwrap_used)] // validated
137 if PatternItemULE::determine_field_from_u8(value[0]) {
138 let symbol = fields::FieldSymbol::from_idx(value[1]).unwrap();
139 let length = fields::FieldLength::from_idx(value[2]).unwrap();
140 PatternItem::Field(fields::Field { symbol, length })
141 } else {
142 // Safety: From field safety invariant
143 PatternItem::Literal(unsafe {
144 char::from_u32_unchecked(u32::from_be_bytes([0x00, value[0], value[1], value[2]]))
145 })
146 }
147 }
148}
149
/// `GenericPatternItemULE` is a type optimized for efficient storing and
/// deserialization of `FixedCalendarDateTimeFormatter` `GenericPatternItem` elements using
/// the `ZeroVec` model.
///
/// The serialization model packages the pattern item in three bytes.
///
/// The first (most significant) bit of the first byte is used to discriminate
/// the item variant. If the bit is set, then the value is the
/// `GenericPatternItem::Placeholder` variant. Otherwise,
/// the `GenericPatternItem::Literal` is used.
///
/// In case the discriminant is set:
///
/// 1) The rest of the first byte remains unused (validation requires it to be zero).
/// 2) The second byte is unused (validation requires it to be zero).
/// 3) The third byte encodes the placeholder index.
///
/// If the discriminant is not set, the bottom five bits of the first byte,
/// together with the next two bytes, contain all 21 bits required to encode
/// any [`Unicode Code Point`]. By design, the representation of a code point
/// is the same between [`PatternItemULE`] and [`GenericPatternItemULE`].
///
/// # Diagram
///
/// ```text
/// ┌───────────────┬───────────────┬───────────────┐
/// │       u8      │       u8      │       u8      │
/// ├─┬─┬─┬─┬─┬─┬─┬─┼─┬─┬─┬─┬─┬─┬─┬─┼─┬─┬─┬─┬─┬─┬─┬─┤
/// ├─┴─┴─┼─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┤
/// │     │          Unicode Code Point             │ Literal
/// ├─┬───┴─────────────────────────┬───────────────┤
/// │X│                             │  Placeholder  │ Placeholder
/// └─┴─────────────────────────────┴───────────────┘
///  ▲
///  │
///  Variant Discriminant
/// ```
///
/// # Optimization
///
/// This model is optimized for efficient packaging of the `GenericPatternItem` elements
/// and performant deserialization from the `GenericPatternItemULE` to `GenericPatternItem` type.
///
/// # Constraints
///
/// The model leaves at most 8 `GenericPatternItem` variants, and limits the placeholder
/// to a single u8. Byte validation is stricter and only accepts placeholder
/// indices below 10.
///
/// [`Unicode Code Point`]: http://www.unicode.org/versions/latest/
#[derive(Copy, Clone, Debug, PartialEq)]
#[repr(transparent)]
// When the discriminant bit is 0, the bytes (interpreted as big-endian) must represent a
// valid char: `as_pattern_item_ule` reinterprets such a value as a `PatternItemULE`,
// whose safety invariant requires exactly that.
pub struct GenericPatternItemULE([u8; 3]);
201
202impl GenericPatternItemULE {
203 /// Given the first byte of the three-byte array that `GenericPatternItemULE` encodes,
204 /// the method determines whether the discriminant in
205 /// the byte indicates that the array encodes the `GenericPatternItem::Field`
206 /// or `GenericPatternItem::Literal` variant of the `GenericPatternItem`.
207 ///
208 /// Returns true when it is a `GenericPatternItem::Field`.
209 #[inline]
210 fn determine_field_from_u8(byte: u8) -> bool {
211 byte & 0b1000_0000 != 0
212 }
213
214 #[inline]
215 fn bytes_in_range(value: (&u8, &u8, &u8)) -> bool {
216 if Self::determine_field_from_u8(*value.0) {
217 // ensure that unused bytes are all zero
218 *value.0 == 0b1000_0000 && *value.1 == 0 && *value.2 < 10
219 } else {
220 let u = u32::from_be_bytes([0x00, *value.0, *value.1, *value.2]);
221 char::try_from(u).is_ok()
222 }
223 }
224
225 /// Converts this [`GenericPatternItemULE`] to a [`PatternItemULE`]
226 /// (if a Literal) or returns the placeholder value.
227 #[inline]
228 pub(crate) fn as_pattern_item_ule(&self) -> Result<&PatternItemULE, u8> {
229 if Self::determine_field_from_u8(self.0[0]) {
230 Err(self.0[2])
231 } else {
232 if cfg!(debug_assertions) {
233 let GenericPatternItem::Literal(c) = GenericPatternItem::from_unaligned(*self)
234 else {
235 unreachable!("expected a literal!")
236 };
237 let pattern_item_ule = PatternItem::Literal(c).to_unaligned();
238 debug_assert_eq!(self.0, pattern_item_ule.0);
239 }
240 // Safety: The two types are repr(transparent) over [u8; 3].
241 // When a Literal, the two ULEs have the same repr,
242 // as shown in the above assertion (and the class docs).
243 Ok(unsafe { core::mem::transmute::<&GenericPatternItemULE, &PatternItemULE>(self) })
244 }
245 }
246}
247
248// Safety (based on the safety checklist on the ULE trait):
249// 1. GenericPatternItemULE does not include any uninitialized or padding bytes.
250// (achieved by `#[repr(transparent)]` on a type that satisfies this invariant)
251// 2. GenericPatternItemULE is aligned to 1 byte.
252// (achieved by `#[repr(transparent)]` on a type that satisfies this invariant)
253// 3. The impl of validate_bytes() returns an error if any byte is not valid.
254// 4. The impl of validate_bytes() returns an error if there are extra bytes.
255// 5. The other ULE methods use the default impl.
256// 6. GenericPatternItemULE byte equality is semantic equality.
257unsafe impl ULE for GenericPatternItemULE {
258 fn validate_bytes(bytes: &[u8]) -> Result<(), UleError> {
259 if bytes.len() % 3 != 0 {
260 return Err(UleError::length::<Self>(bytes.len()));
261 }
262 #[allow(clippy::indexing_slicing)] // chunks
263 if !bytes
264 .chunks_exact(3)
265 .all(|c| Self::bytes_in_range((&c[0], &c[1], &c[2])))
266 {
267 return Err(UleError::parse::<Self>());
268 }
269 Ok(())
270 }
271}
272
273impl GenericPatternItem {
274 #[inline]
275 pub(crate) const fn to_unaligned_const(self) -> <Self as AsULE>::ULE {
276 match self {
277 Self::Placeholder(idx) => GenericPatternItemULE([0b1000_0000, 0x00, idx]),
278 Self::Literal(ch) => {
279 let u = ch as u32;
280 let bytes = u.to_be_bytes();
281 GenericPatternItemULE([bytes[1], bytes[2], bytes[3]])
282 }
283 }
284 }
285}
286
287impl AsULE for GenericPatternItem {
288 type ULE = GenericPatternItemULE;
289
290 #[inline]
291 fn to_unaligned(self) -> Self::ULE {
292 self.to_unaligned_const()
293 }
294
295 #[inline]
296 fn from_unaligned(unaligned: Self::ULE) -> Self {
297 let value = unaligned.0;
298 if GenericPatternItemULE::determine_field_from_u8(value[0]) {
299 Self::Placeholder(value[2])
300 } else {
301 #[allow(clippy::unwrap_used)] // validated
302 Self::Literal(
303 char::try_from(u32::from_be_bytes([0x00, value[0], value[1], value[2]])).unwrap(),
304 )
305 }
306 }
307}
308
#[cfg(test)]
mod test {
    use super::*;
    use crate::provider::fields::{FieldLength, FieldSymbol, Second, Year};
    use zerovec::ule::{AsULE, ULE};

    /// Pairs a field `PatternItem` with the three bytes it is expected to
    /// encode to: the discriminant byte, the symbol index and the length index.
    fn field_case(symbol: FieldSymbol, length: FieldLength) -> (PatternItem, [u8; 3]) {
        (
            PatternItem::from((symbol, length)),
            [0x80, symbol.idx(), length.idx()],
        )
    }

    /// Each item must encode to the expected bytes and decode back to itself.
    #[test]
    fn test_pattern_item_as_ule() {
        let cases = [
            field_case(FieldSymbol::Minute, FieldLength::Two),
            field_case(FieldSymbol::Year(Year::Calendar), FieldLength::Four),
            field_case(FieldSymbol::Year(Year::Cyclic), FieldLength::Four),
            field_case(FieldSymbol::Second(Second::MillisInDay), FieldLength::One),
            (PatternItem::from('z'), [0x00, 0x00, 0x7a]),
        ];

        for (expected_item, expected_bytes) in cases {
            let ule = expected_item.to_unaligned();
            assert_eq!(ULE::slice_as_bytes(&[ule]), expected_bytes);
            let decoded = PatternItem::from_unaligned(ule);
            assert_eq!(decoded, expected_item);
        }
    }

    /// A sequence of encoded items must validate and match the expected bytes.
    #[test]
    fn test_pattern_item_ule() {
        let (year_item, year_bytes) =
            field_case(FieldSymbol::Year(Year::Calendar), FieldLength::Four);
        let (millis_item, millis_bytes) =
            field_case(FieldSymbol::Second(Second::MillisInDay), FieldLength::One);

        let items = [year_item, PatternItem::from('z'), millis_item];
        let expected_chunks = [year_bytes, [0x00, 0x00, 0x7a], millis_bytes];

        let encoded: Vec<u8> = items
            .iter()
            .flat_map(|item| {
                let ule = item.to_unaligned();
                ULE::slice_as_bytes(&[ule]).to_vec()
            })
            .collect();
        let expected: Vec<u8> = expected_chunks.concat();

        assert!(PatternItemULE::validate_bytes(&encoded).is_ok());
        assert_eq!(encoded, expected);
    }

    /// Each generic item must encode to the expected bytes and decode back to
    /// itself.
    #[test]
    fn test_generic_pattern_item_as_ule() {
        let cases = [
            (GenericPatternItem::Placeholder(4), [0x80, 0x00, 4]),
            (GenericPatternItem::Placeholder(0), [0x80, 0x00, 0]),
            (GenericPatternItem::from('z'), [0x00, 0x00, 0x7a]),
        ];

        for (expected_item, expected_bytes) in cases {
            let ule = expected_item.to_unaligned();
            assert_eq!(ULE::slice_as_bytes(&[ule]), expected_bytes);
            let decoded = GenericPatternItem::from_unaligned(ule);
            assert_eq!(decoded, expected_item);
        }
    }
}
412}