potential_utf/
uchar.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use core::cmp::Ordering;
6use core::fmt;
7
/// A 24-bit numeric data type that is expected to be a Unicode scalar value, but is not
/// validated as such.
///
/// Use this type instead of `char` when you want to deal with data that is expected to be valid
/// Unicode scalar values, but you want control over when or if you validate that assumption.
///
/// The value is stored as three bytes in little-endian order (least significant byte first).
/// Equality and hashing operate on those bytes directly, while ordering compares the numeric
/// value they encode.
///
/// # Examples
///
/// ```
/// use potential_utf::PotentialCodePoint;
///
/// assert_eq!(PotentialCodePoint::from_u24(0x68).try_to_char(), Ok('h'));
/// assert_eq!(PotentialCodePoint::from_char('i').try_to_char(), Ok('i'));
/// assert_eq!(
///     PotentialCodePoint::from_u24(0x1F44B).try_to_char(),
///     Ok('👋')
/// );
///
/// assert!(PotentialCodePoint::from_u24(0xDE01).try_to_char().is_err());
/// assert_eq!(
///     PotentialCodePoint::from_u24(0xDE01).to_char_lossy(),
///     char::REPLACEMENT_CHARACTER
/// );
/// ```
#[repr(transparent)]
#[allow(clippy::exhaustive_structs)] // transparent newtype
#[derive(PartialEq, Eq, Clone, Copy, Hash)]
pub struct PotentialCodePoint([u8; 3]);
36
37impl PotentialCodePoint {
38    /// Create a [`PotentialCodePoint`] from a `char`.
39    ///
40    /// # Examples
41    ///
42    /// ```
43    /// use potential_utf::PotentialCodePoint;
44    ///
45    /// let a = PotentialCodePoint::from_char('a');
46    /// assert_eq!(a.try_to_char().unwrap(), 'a');
47    /// ```
48    #[inline]
49    pub const fn from_char(c: char) -> Self {
50        let [u0, u1, u2, _u3] = (c as u32).to_le_bytes();
51        Self([u0, u1, u2])
52    }
53
54    /// Create [`PotentialCodePoint`] from a u32 value, ignoring the most significant 8 bits.
55    #[inline]
56    pub const fn from_u24(c: u32) -> Self {
57        let [u0, u1, u2, _u3] = c.to_le_bytes();
58        Self([u0, u1, u2])
59    }
60
61    /// Attempt to convert a [`PotentialCodePoint`] to a `char`.
62    ///
63    /// # Examples
64    ///
65    /// ```
66    /// use potential_utf::PotentialCodePoint;
67    /// use zerovec::ule::AsULE;
68    ///
69    /// let a = PotentialCodePoint::from_char('a');
70    /// assert_eq!(a.try_to_char(), Ok('a'));
71    ///
72    /// let b = PotentialCodePoint::from_unaligned([0xFF, 0xFF, 0xFF].into());
73    /// assert!(matches!(b.try_to_char(), Err(_)));
74    /// ```
75    #[inline]
76    pub fn try_to_char(self) -> Result<char, core::char::CharTryFromError> {
77        char::try_from(u32::from(self))
78    }
79
80    /// Convert a [`PotentialCodePoint`] to a `char', returning [`char::REPLACEMENT_CHARACTER`]
81    /// if the `PotentialCodePoint` does not represent a valid Unicode scalar value.
82    ///
83    /// # Examples
84    ///
85    /// ```
86    /// use potential_utf::PotentialCodePoint;
87    /// use zerovec::ule::AsULE;
88    ///
89    /// let a = PotentialCodePoint::from_unaligned([0xFF, 0xFF, 0xFF].into());
90    /// assert_eq!(a.to_char_lossy(), char::REPLACEMENT_CHARACTER);
91    /// ```
92    #[inline]
93    pub fn to_char_lossy(self) -> char {
94        self.try_to_char().unwrap_or(char::REPLACEMENT_CHARACTER)
95    }
96
97    /// Convert a [`PotentialCodePoint`] to a `char` without checking that it is
98    /// a valid Unicode scalar value.
99    ///
100    /// # Safety
101    ///
102    /// The `PotentialCodePoint` must be a valid Unicode scalar value in little-endian order.
103    ///
104    /// # Examples
105    ///
106    /// ```
107    /// use potential_utf::PotentialCodePoint;
108    ///
109    /// let a = PotentialCodePoint::from_char('a');
110    /// assert_eq!(unsafe { a.to_char_unchecked() }, 'a');
111    /// ```
112    #[inline]
113    pub unsafe fn to_char_unchecked(self) -> char {
114        char::from_u32_unchecked(u32::from(self))
115    }
116
117    /// For converting to the ULE type in a const context
118    ///
119    /// Can be removed once const traits are a thing
120    #[inline]
121    #[cfg(feature = "zerovec")]
122    pub const fn to_unaligned(self) -> zerovec::ule::RawBytesULE<3> {
123        zerovec::ule::RawBytesULE(self.0)
124    }
125}
126
/// This impl requires enabling the optional `zerovec` Cargo feature
///
/// Both conversions hand the three stored bytes over unchanged.
#[cfg(feature = "zerovec")]
impl zerovec::ule::AsULE for PotentialCodePoint {
    type ULE = zerovec::ule::RawBytesULE<3>;

    #[inline]
    fn to_unaligned(self) -> Self::ULE {
        let Self(bytes) = self;
        zerovec::ule::RawBytesULE(bytes)
    }

    #[inline]
    fn from_unaligned(unaligned: Self::ULE) -> Self {
        let zerovec::ule::RawBytesULE(bytes) = unaligned;
        Self(bytes)
    }
}
142
// Safety: `PotentialCodePoint` is a `#[repr(transparent)]` wrapper around `[u8; 3]`, and its
// `AsULE` conversions copy those three bytes unchanged into and out of `RawBytesULE<3>`, so the
// type has the same byte representation as its `AsULE::ULE` type on every platform.
// Note that the bytes are NOT guaranteed to encode a valid `char`.
/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(feature = "zerovec")]
unsafe impl zerovec::ule::EqULE for PotentialCodePoint {}
148
149impl fmt::Debug for PotentialCodePoint {
150    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
151        // Debug as a char if possible
152        match self.try_to_char() {
153            Ok(c) => fmt::Debug::fmt(&c, f),
154            Err(_) => fmt::Debug::fmt(&self.0, f),
155        }
156    }
157}
158
159impl PartialOrd for PotentialCodePoint {
160    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
161        Some(self.cmp(other))
162    }
163}
164
165impl PartialEq<char> for PotentialCodePoint {
166    fn eq(&self, other: &char) -> bool {
167        self.eq(&Self::from_char(*other))
168    }
169}
170
171impl PartialOrd<char> for PotentialCodePoint {
172    fn partial_cmp(&self, other: &char) -> Option<Ordering> {
173        self.partial_cmp(&Self::from_char(*other))
174    }
175}
176
177impl PartialEq<PotentialCodePoint> for char {
178    fn eq(&self, other: &PotentialCodePoint) -> bool {
179        PotentialCodePoint::from_char(*self).eq(other)
180    }
181}
182
183impl PartialOrd<PotentialCodePoint> for char {
184    fn partial_cmp(&self, other: &PotentialCodePoint) -> Option<Ordering> {
185        PotentialCodePoint::from_char(*self).partial_cmp(other)
186    }
187}
188
189impl Ord for PotentialCodePoint {
190    // custom implementation, as derived Ord would compare lexicographically
191    fn cmp(&self, other: &Self) -> Ordering {
192        let a = u32::from(*self);
193        let b = u32::from(*other);
194        a.cmp(&b)
195    }
196}
197
198impl From<PotentialCodePoint> for u32 {
199    fn from(x: PotentialCodePoint) -> Self {
200        let [a0, a1, a2] = x.0;
201        u32::from_le_bytes([a0, a1, a2, 0])
202    }
203}
204
205impl TryFrom<u32> for PotentialCodePoint {
206    type Error = ();
207    fn try_from(x: u32) -> Result<Self, ()> {
208        let [u0, u1, u2, u3] = x.to_le_bytes();
209        if u3 != 0 {
210            return Err(());
211        }
212        Ok(Self([u0, u1, u2]))
213    }
214}
215
216impl From<char> for PotentialCodePoint {
217    #[inline]
218    fn from(value: char) -> Self {
219        Self::from_char(value)
220    }
221}
222
223impl TryFrom<PotentialCodePoint> for char {
224    type Error = core::char::CharTryFromError;
225
226    #[inline]
227    fn try_from(value: PotentialCodePoint) -> Result<char, Self::Error> {
228        value.try_to_char()
229    }
230}
231
/// This impl requires enabling the optional `serde` Cargo feature
///
/// Values that are not valid Unicode scalar values are rejected in every format.
/// Human-readable formats receive the `char`; other formats receive the three raw bytes.
#[cfg(feature = "serde")]
impl serde::Serialize for PotentialCodePoint {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::Error;
        match self.try_to_char() {
            Err(_) => Err(S::Error::custom(
                "invalid Unicode scalar value in PotentialCodePoint",
            )),
            Ok(scalar) if serializer.is_human_readable() => serializer.serialize_char(scalar),
            Ok(_) => self.0.serialize(serializer),
        }
    }
}
250
/// This impl requires enabling the optional `serde` Cargo feature
///
/// Human-readable formats are read as a `char`; other formats are read as three raw bytes,
/// which are stored as-is without validation.
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for PotentialCodePoint {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        if !deserializer.is_human_readable() {
            return <[u8; 3]>::deserialize(deserializer).map(PotentialCodePoint);
        }
        <char>::deserialize(deserializer).map(PotentialCodePoint::from_char)
    }
}
267
/// This impl requires enabling the optional `databake` Cargo feature
///
/// Valid scalar values are baked as a `from_char` call; anything else is baked as a
/// `from_u24` call on the numeric value.
#[cfg(feature = "databake")]
impl databake::Bake for PotentialCodePoint {
    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
        // Both output forms reference this crate, so register it once up front.
        env.insert("potential_utf");
        if let Ok(ch) = self.try_to_char() {
            let ch = ch.bake(env);
            return databake::quote! {
                potential_utf::PotentialCodePoint::from_char(#ch)
            };
        }
        let u24 = u32::from(*self);
        databake::quote! {
            potential_utf::PotentialCodePoint::from_u24(#u24)
        }
    }
}
290
#[cfg(test)]
mod test {
    use super::*;
    use zerovec::ZeroVec;

    /// Bytes that do not form a Unicode scalar value must fail to serialize in both a
    /// human-readable (JSON) and a binary (bincode) format.
    #[test]
    fn test_serde_fail() {
        let invalid = PotentialCodePoint([0xFF, 0xFF, 0xFF]);
        serde_json::to_string(&invalid).expect_err("serialize invalid char bytes");
        bincode::serialize(&invalid).expect_err("serialize invalid char bytes");
    }

    /// JSON round trip: the value is written as a one-character string.
    #[test]
    fn test_serde_json() {
        let original = PotentialCodePoint::from_char('🙃');

        let json = serde_json::to_string(&original).unwrap();
        assert_eq!(json, r#""🙃""#);

        let restored: PotentialCodePoint = serde_json::from_str(&json).unwrap();
        assert_eq!(original, restored);
    }

    /// Bincode round trip: the value is written as its three little-endian bytes.
    #[test]
    fn test_serde_bincode() {
        let original = PotentialCodePoint::from_char('🙃');

        let encoded = bincode::serialize(&original).unwrap();
        assert_eq!(encoded, [0x43, 0xF6, 0x01]);

        let restored: PotentialCodePoint = bincode::deserialize(&encoded).unwrap();
        assert_eq!(original, restored);
    }

    /// The in-memory bytes of a `[PotentialCodePoint]` must equal the ULE bytes.
    #[test]
    fn test_representation() {
        // backed by [PotentialCodePoint]
        let code_points: Vec<PotentialCodePoint> = ['w', 'ω', '文', '𑄃', '🙃']
            .iter()
            .copied()
            .map(PotentialCodePoint::from_char)
            .collect();
        // backed by [RawBytesULE<3>]
        let zvec: ZeroVec<_> = code_points.iter().copied().collect();
        let ule_bytes = zvec.as_bytes();

        // SAFETY: `PotentialCodePoint` is a `#[repr(transparent)]` wrapper around `[u8; 3]`,
        // so the vector's buffer is a run of initialized bytes, three per element.
        // `ule_bytes` is expected to hold three bytes per element as well, so the length
        // stays within that buffer, and `code_points` outlives the borrowed slice.
        let native_bytes = unsafe {
            core::slice::from_raw_parts(code_points.as_ptr().cast::<u8>(), ule_bytes.len())
        };

        // PotentialCodePoint is defined as little-endian, so this must be true on all platforms
        // also asserts that to_unaligned/from_unaligned are no-ops
        assert_eq!(native_bytes, ule_bytes);

        let expected: &[u8] = &[119, 0, 0, 201, 3, 0, 135, 101, 0, 3, 17, 1, 67, 246, 1];
        assert_eq!(expected, ule_bytes);
    }

    /// Baked output uses `from_char` for valid scalar values and `from_u24` otherwise.
    #[test]
    fn test_char_bake() {
        databake::test_bake!(
            PotentialCodePoint,
            const,
            crate::PotentialCodePoint::from_char('b'),
            potential_utf
        );
        // surrogate code point
        databake::test_bake!(
            PotentialCodePoint,
            const,
            crate::PotentialCodePoint::from_u24(55296u32),
            potential_utf
        );
    }
}
potential_utf/uchar.rs

potential_utf/
uchar.rs