icu_list/provider/
serde_dfa.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use icu_provider::prelude::*;
6use regex_automata::dfa::sparse::DFA;
7use zerovec::VarZeroCow;
8
/// A serde-compatible version of [`regex_automata::dfa::sparse::DFA`].
///
/// This does not implement
/// [`serde::Deserialize`] directly, as binary deserialization is not supported in big-endian
/// platforms. `Self::maybe_deserialize` can be used to deserialize to `Option<SerdeDFA>`.
///
/// Two values compare equal when their serialized DFA bytes are equal; the
/// optional source pattern does not take part in the comparison.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Clone, Debug, yoke::Yokeable, zerofrom::ZeroFrom)]
pub struct SerdeDFA<'data> {
    // Safety: These always represent a valid DFA (DFA::from_bytes(dfa_bytes).is_ok())
    dfa_bytes: VarZeroCow<'data, [u8]>,
    // The regex this DFA was compiled from. Only present with the `serde_human`
    // feature; `Some` when built through `new`, `None` when built from raw DFA
    // bytes. Human-readable serialization emits this string.
    #[cfg(feature = "serde_human")]
    pattern: Option<alloc::borrow::Cow<'data, str>>,
}
27
28impl PartialEq for SerdeDFA<'_> {
29    fn eq(&self, other: &Self) -> bool {
30        self.dfa_bytes == other.dfa_bytes
31    }
32}
33
#[cfg(feature = "datagen")]
impl databake::Bake for SerdeDFA<'_> {
    /// Emits an expression that reconstructs this value from serialized DFA bytes.
    ///
    /// The DFA is serialized in both byte orders, and the generated expression
    /// selects between the two literals with `cfg!(target_endian = "little")`.
    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
        env.insert("icu_list");

        let dfa = self.deref();

        let le_vec = dfa.to_bytes_little_endian();
        let le_bytes = databake::Bake::bake(&le_vec.as_slice(), env);

        let be_vec = dfa.to_bytes_big_endian();
        let be_bytes = databake::Bake::bake(&be_vec.as_slice(), env);

        // The emitted `unsafe` block is sound because `to_bytes_little_endian` and
        // `to_bytes_big_endian` produce valid DFA representations, and the generated
        // code consumes the one that matches the endianness of the target platform.
        databake::quote! {
            unsafe {
                icu_list::provider::SerdeDFA::from_dfa_bytes_unchecked(
                    if cfg!(target_endian = "little") {
                        #le_bytes
                    } else {
                        #be_bytes
                    }
                )
            }
        }
    }
}
55
#[cfg(feature = "datagen")]
impl databake::BakeSize for SerdeDFA<'_> {
    /// Reports the length given by the DFA's `write_to_len` as the borrowed size.
    fn borrows_size(&self) -> usize {
        let dfa = self.deref();
        dfa.write_to_len()
    }
}
62
#[cfg(feature = "datagen")]
impl serde::Serialize for SerdeDFA<'_> {
    /// Serializes the DFA.
    ///
    /// With the `serde_human` feature and a human-readable serializer, the source
    /// pattern is written; a value without a stored pattern cannot be serialized
    /// that way and yields a custom error. In every other case the little-endian
    /// DFA bytes are written.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::ser::Serializer,
    {
        #[cfg(feature = "serde_human")]
        if serializer.is_human_readable() {
            return match self.pattern.as_ref() {
                Some(pattern) => pattern.serialize(serializer),
                None => {
                    use serde::ser::Error;
                    Err(S::Error::custom(
                        "cannot serialize a binary-deserialized SerdeDFA to JSON",
                    ))
                }
            };
        }

        let le_bytes = self.deref().to_bytes_little_endian();
        serializer.serialize_bytes(&le_bytes)
    }
}
85
#[cfg(feature = "serde")]
impl<'data> SerdeDFA<'data> {
    /// Deserializes to `Option<Self>`. Will return `None` for non-human-readable serialization
    /// formats on big-endian systems, as `regex_automata` serialization is endian-sensitive.
    ///
    /// With the `serde_human` feature, human-readable formats are read as a regex
    /// pattern and compiled via [`SerdeDFA::new`]. Otherwise the input is read as
    /// bytes, which are validated as a DFA before being accepted.
    pub fn maybe_deserialize<'de: 'data, D>(deserializer: D) -> Result<Option<Self>, D::Error>
    where
        D: serde::de::Deserializer<'de>,
    {
        use serde::de::Error;
        use serde::Deserialize;

        #[cfg(feature = "serde_human")]
        if deserializer.is_human_readable() {
            use alloc::string::ToString;
            let pattern = alloc::borrow::Cow::<str>::deserialize(deserializer)?;
            return match SerdeDFA::new(pattern) {
                Ok(dfa) => Ok(Some(dfa)),
                Err(e) => Err(D::Error::custom(e.to_string())),
            };
        }

        // The bytes are read before the endianness check, so the deserializer is
        // consumed on big-endian targets as well.
        let dfa_bytes = VarZeroCow::<[u8]>::deserialize(deserializer)?;

        if cfg!(target_endian = "big") {
            return Ok(None);
        }

        // Verify safety invariant
        if DFA::from_bytes(&dfa_bytes).is_err() {
            return Err(D::Error::custom("Invalid DFA bytes"));
        }

        let validated = SerdeDFA {
            dfa_bytes,
            #[cfg(feature = "serde_human")]
            pattern: None,
        };
        Ok(Some(validated))
    }
}
124
125impl<'data> SerdeDFA<'data> {
126    /// Creates a `SerdeDFA` from raw bytes. Used internally by databake.
127    ///
128    /// # Safety
129    ///
130    /// `dfa_bytes` has to be a valid DFA (regex_automata::dfa::sparse::DFA::from_bytes(dfa_bytes).is_ok())
131    pub const unsafe fn from_dfa_bytes_unchecked(dfa_bytes: &'data [u8]) -> Self {
132        Self {
133            // SAFETY: safe for VarZeroCow<[u8]>
134            dfa_bytes: unsafe { VarZeroCow::from_bytes_unchecked(dfa_bytes) },
135            #[cfg(feature = "serde_human")]
136            pattern: None,
137        }
138    }
139
140    /// Creates a `SerdeDFA` from a regex.
141    #[cfg(any(feature = "datagen", feature = "serde_human",))]
142    pub fn new(pattern: alloc::borrow::Cow<'data, str>) -> Result<Self, icu_provider::DataError> {
143        use regex_automata::dfa::dense::{Builder, Config};
144
145        let Some(anchored_pattern) = pattern.strip_prefix('^') else {
146            return Err(
147                DataError::custom("Only anchored regexes (starting with ^) are supported")
148                    .with_display_context(&pattern),
149            );
150        };
151
152        let mut builder = Builder::new();
153        let dfa = builder
154            .configure(
155                Config::new()
156                    .start_kind(regex_automata::dfa::StartKind::Anchored)
157                    .minimize(true),
158            )
159            .build(anchored_pattern)
160            .map_err(|e| {
161                icu_provider::DataError::custom("Cannot build DFA")
162                    .with_display_context(anchored_pattern)
163                    .with_debug_context(&e)
164            })?
165            .to_sparse()
166            .map_err(|e| {
167                icu_provider::DataError::custom("Cannot sparsify DFA")
168                    .with_display_context(anchored_pattern)
169                    .with_debug_context(&e)
170            })?;
171
172        Ok(Self {
173            dfa_bytes: VarZeroCow::new_owned(dfa.to_bytes_native_endian().into_boxed_slice()),
174            pattern: Some(pattern),
175        })
176    }
177
178    /// Returns the represented [`DFA`]
179    #[allow(clippy::unwrap_used)] // by invariant
180    pub fn deref(&'data self) -> DFA<&'data [u8]> {
181        // Safe due to struct invariant.
182        unsafe { DFA::from_bytes_unchecked(&self.dfa_bytes).unwrap().0 }
183    }
184}
185
186#[cfg(all(test, feature = "datagen"))]
187mod test {
188    use super::*;
189    use regex_automata::Input;
190    use std::borrow::Cow;
191
192    #[test]
193    fn test_serde_dfa() {
194        use regex_automata::dfa::Automaton;
195
196        let matcher = SerdeDFA::new(Cow::Borrowed("^abc")).unwrap();
197
198        assert!(matcher
199            .deref()
200            .try_search_fwd(&Input::new("ab").anchored(regex_automata::Anchored::Yes))
201            .unwrap()
202            .is_none());
203        assert!(matcher
204            .deref()
205            .try_search_fwd(&Input::new("abc").anchored(regex_automata::Anchored::Yes))
206            .unwrap()
207            .is_some());
208        assert!(matcher
209            .deref()
210            .try_search_fwd(&Input::new("abcde").anchored(regex_automata::Anchored::Yes))
211            .unwrap()
212            .is_some());
213        assert!(matcher
214            .deref()
215            .try_search_fwd(&Input::new(" abcde").anchored(regex_automata::Anchored::Yes))
216            .unwrap()
217            .is_none());
218    }
219
    // Test-only wrapper: `SerdeDFA` is deserialized through
    // `SerdeDFA::maybe_deserialize`, which produces an `Option<SerdeDFA>`, so the
    // tests deserialize into this newtype and inspect field `.0`.
    #[derive(serde::Deserialize)]
    struct OptionSerdeDFA<'data>(
        #[serde(borrow, deserialize_with = "SerdeDFA::maybe_deserialize")] Option<SerdeDFA<'data>>,
    );
224
225    #[test]
226    #[cfg(target_endian = "little")]
227    fn test_postcard_serialization() {
228        let matcher = SerdeDFA::new(Cow::Borrowed("^abc*")).unwrap();
229
230        let mut bytes = postcard::to_stdvec(&matcher).unwrap();
231        assert_eq!(
232            postcard::from_bytes::<OptionSerdeDFA>(&bytes).unwrap().0,
233            Some(matcher)
234        );
235
236        // A corrupted byte leads to an error
237        bytes[17] ^= 255;
238        assert!(postcard::from_bytes::<OptionSerdeDFA>(&bytes).is_err());
239        bytes[17] ^= 255;
240
241        // An extra byte leads to an error
242        bytes.insert(123, 40);
243        assert!(postcard::from_bytes::<OptionSerdeDFA>(&bytes).is_err());
244        bytes.remove(123);
245
246        // Missing bytes lead to an error
247        assert!(postcard::from_bytes::<OptionSerdeDFA>(&bytes[0..bytes.len() - 5]).is_err());
248    }
249
250    #[test]
251    fn test_rmp_serialization() {
252        let matcher = SerdeDFA::new(Cow::Borrowed("^abc*")).unwrap();
253
254        let bytes = rmp_serde::to_vec(&matcher).unwrap();
255        assert_eq!(
256            rmp_serde::from_slice::<OptionSerdeDFA>(&bytes).unwrap().0,
257            Some(matcher)
258        );
259    }
260
261    #[test]
262    #[cfg(feature = "serde_human")]
263    fn test_json_serialization() {
264        let matcher = SerdeDFA::new(Cow::Borrowed("^abc*")).unwrap();
265
266        let json = serde_json::to_string(&matcher).unwrap();
267        assert_eq!(
268            serde_json::from_str::<OptionSerdeDFA>(&json).unwrap().0,
269            Some(matcher)
270        );
271        assert!(serde_json::from_str::<OptionSerdeDFA>(".*[").is_err());
272    }
273
274    #[test]
275    fn databake() {
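        // This is the DFA for ".*", written out once per byte order: the first
        // literal is used on little-endian targets, the second on all others.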
277        databake::test_bake!(
278            SerdeDFA,
279            const,
280            unsafe {
281                crate::provider::SerdeDFA::from_dfa_bytes_unchecked(
282                    if cfg!(target_endian = "little") {
283                        b"rust-regex-automata-dfa-sparse\0\0\xFF\xFE\0\0\x02\0\0\0\0\0\0\0\x02\0\0\0\x0E\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\x02\x02\x02\x03\x04\x04\x05\x06\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x08\t\t\t\n\x0B\x0B\x0C\r\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x12\x12\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x14\x15\x15\x15\x15\x15\x15\x15\x15\x15\x15\x15\x15\x16\x17\x17\x18\x19\x19\x19\x1A\x1B\x1B\x1B\x1B\x1B\x1B\x1B\x1B\x1B\x1B\x1B(\x01\0\0\x01\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x01\x80\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\x05\0\x05\x05\x06\x06\x0C\x0C\r\r\0\0S\0\0\0D\0\0\0S\0\0\0D\0\0\0\0\0\0\0\0\x02\0\0\x1B\0\0\x12\0\0\0\x12\0\0\0\0\x03\0\x06\x06\r\r\0\0h\0\0\0h\0\0\0\0\0\0\0\0\x0E\0\0\0\x02\x02\x04\x07\t\t\x0B\x0E\x13\x13\x14\x14\x15\x15\x16\x16\x17\x17\x18\x18\x19\x19\x1A\x1A\0\0D\0\0\0D\0\0\0D\0\0\0D\0\0\0D\0\0\0\xBF\0\0\0\xCE\0\0\0\xDD\0\0\0\xEC\0\0\0\xDD\0\0\0\xFB\0\0\0\n\x01\0\0\x19\x01\0\0\x12\0\0\0\0\x02\0\x0F\x11\0\0D\0\0\0\0\0\0\0\0\x02\0\x11\x11\0\0\xBF\0\0\0\0\0\0\0\0\x02\0\x0F\x11\0\0\xBF\0\0\0\0\0\0\0\0\x02\0\x0F\x10\0\0\xBF\0\0\0\0\0\0\0\0\x02\0\x10\x11\0\0\xDD\0\0\0\0\0\0\0\0\x02\0\x0F\x11\0\0\xDD\0\0\0\0\0\0\0\0\x02\0\x0F\x0F\0\0\xDD\0\0\0\0\0\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\0\0\0\0\0\0\0\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x
01\x01\x01\x01\x01\x01\0\0\0\0\x01\0\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x06\0\0\0\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF`\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0#\0\0\0#\0\0\0#\0\0\0#\0\0\0#\0\0\0#\0\0\0\x12\0\0\0\t\0\0\0\x12\0\0\0\x12\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
284                    } else {
285                        b"rust-regex-automata-dfa-sparse\0\0\0\0\xFE\xFF\0\0\0\x02\0\0\0\0\0\0\0\x02\0\0\0\x0E\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\x02\x02\x02\x03\x04\x04\x05\x06\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x08\t\t\t\n\x0B\x0B\x0C\r\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0E\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x0F\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x12\x12\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x13\x14\x15\x15\x15\x15\x15\x15\x15\x15\x15\x15\x15\x15\x16\x17\x17\x18\x19\x19\x19\x1A\x1B\x1B\x1B\x1B\x1B\x1B\x1B\x1B\x1B\x1B\x1B\0\0\x01(\0\x01\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\x05\x05\x05\x06\x06\x0C\x0C\r\r\0\0\0\0\0S\0\0\0D\0\0\0S\0\0\0D\0\0\0\0\0\0\x02\0\x1B\0\0\0\0\0\x12\0\0\0\x12\0\0\x03\x06\x06\r\r\0\0\0\0\0h\0\0\0h\0\0\0\0\0\0\x0E\0\0\x02\x02\x04\x07\t\t\x0B\x0E\x13\x13\x14\x14\x15\x15\x16\x16\x17\x17\x18\x18\x19\x19\x1A\x1A\0\0\0\0\0D\0\0\0D\0\0\0D\0\0\0D\0\0\0D\0\0\0\xBF\0\0\0\xCE\0\0\0\xDD\0\0\0\xEC\0\0\0\xDD\0\0\0\xFB\0\0\x01\n\0\0\x01\x19\0\0\0\x12\0\0\x02\x0F\x11\0\0\0\0\0D\0\0\0\0\0\0\x02\x11\x11\0\0\0\0\0\xBF\0\0\0\0\0\0\x02\x0F\x11\0\0\0\0\0\xBF\0\0\0\0\0\0\x02\x0F\x10\0\0\0\0\0\xBF\0\0\0\0\0\0\x02\x10\x11\0\0\0\0\0\xDD\0\0\0\0\0\0\x02\x0F\x11\0\0\0\0\0\xDD\0\0\0\0\0\0\x02\x0F\x0F\0\0\0\0\0\xDD\0\0\0\0\0\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\x03\0\0\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\0\0\0\0\0\0\0\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x
01\x01\x01\x01\x01\x01\0\0\0\0\x01\0\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x06\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\0\0\0`\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0#\0\0\0#\0\0\0#\0\0\0#\0\0\0#\0\0\0#\0\0\0\x12\0\0\0\t\0\0\0\x12\0\0\0\x12\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
286                    },
287                )
288            },
289            icu_list
290        );
291    }
292}