icu_provider_blob/blob_schema.rs

// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use core::fmt::Write;
use icu_provider::{marker::DataMarkerIdHash, prelude::*};
use serde::Deserialize;
use writeable::Writeable;
use zerotrie::ZeroTrieSimpleAscii;
use zerovec::vecs::{Index16, Index32, VarZeroSlice, VarZeroVecFormat, ZeroSlice};

/// A versioned Serde schema for ICU4X data blobs.
#[derive(serde::Deserialize, yoke::Yokeable)]
#[yoke(prove_covariance_manually)]
#[cfg_attr(feature = "export", derive(serde::Serialize))]
#[derive(Debug, Clone)]
pub(crate) enum BlobSchema<'data> {
    V001(NeverSchema),
    V002(NeverSchema),
    V002Bigger(NeverSchema),
    #[serde(borrow)]
    V003(BlobSchemaV1<'data, Index16>),
    #[serde(borrow)]
    V003Bigger(BlobSchemaV1<'data, Index32>),
}
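// Note: V001/V002/V002Bigger are uninhabited placeholders for retired blob formats.
// Because `NeverSchema` cannot be constructed, deserializing one of these variants
// always fails with the 1.0-format error produced by `NeverSchema::deserialize` below.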

// This is a valid separator as `DataLocale` will never produce it.
pub(crate) const REQUEST_SEPARATOR: char = '\x1E';
pub(crate) const CHECKSUM_KEY: &[u8] = b"\0c";
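// Keys in each per-marker locale trie are therefore ASCII strings of the form
// "{locale}" or "{locale}\x1E{marker attributes}" (see `BlobSchemaV1::load`), plus an
// optional CHECKSUM_KEY entry whose value is the index of the checksum buffer.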

impl<'data> BlobSchema<'data> {
    pub fn deserialize_and_check<D: serde::Deserializer<'data>>(
        de: D,
    ) -> Result<BlobSchema<'data>, D::Error> {
        let blob = Self::deserialize(de)?;
        #[cfg(debug_assertions)]
        blob.check_invariants();
        Ok(blob)
    }

    pub fn load(
        &self,
        marker: DataMarkerInfo,
        req: DataRequest,
    ) -> Result<(&'data [u8], Option<u64>), DataError> {
        match self {
            BlobSchema::V001(..) | BlobSchema::V002(..) | BlobSchema::V002Bigger(..) => {
                unreachable!("Unreachable blob schema")
            }
            BlobSchema::V003(s) => s.load(marker, req),
            BlobSchema::V003Bigger(s) => s.load(marker, req),
        }
    }

    #[cfg(feature = "alloc")]
    pub fn iter_ids(
        &self,
        marker: DataMarkerInfo,
    ) -> Result<alloc::collections::BTreeSet<DataIdentifierCow>, DataError> {
        match self {
            BlobSchema::V001(..) | BlobSchema::V002(..) | BlobSchema::V002Bigger(..) => {
                unreachable!("Unreachable blob schema")
            }
            BlobSchema::V003(s) => s.iter_ids(marker),
            BlobSchema::V003Bigger(s) => s.iter_ids(marker),
        }
    }

    #[cfg(debug_assertions)]
    fn check_invariants(&self) {
        match self {
            BlobSchema::V001(..) | BlobSchema::V002(..) | BlobSchema::V002Bigger(..) => (),
            BlobSchema::V003(s) => s.check_invariants(),
            BlobSchema::V003Bigger(s) => s.check_invariants(),
        }
    }
}
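// Minimal usage sketch (hypothetical names; assumes a deserialized `schema: BlobSchema`
// and a `req: DataRequest` for some marker type `M: DataMarker`):
//
//     let (bytes, checksum) = schema.load(M::INFO, req)?;
//
// `bytes` is the stored payload for the request; `checksum` is `Some` only when the
// marker declares `has_checksum`.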

#[cfg_attr(feature = "export", derive(serde::Serialize))]
#[derive(Debug, Clone, yoke::Yokeable)]
pub enum NeverSchema {}

impl<'de> serde::Deserialize<'de> for NeverSchema {
    fn deserialize<D>(_: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        use serde::de::Error;
        Err(D::Error::custom("Attempted to read 1.0 blob format from ICU4X 2.0: please run ICU4X 2.0 datagen to generate a new file."))
    }
}

/// Version 3 of the ICU4X data blob schema.
///
/// This schema has two modes, using [`Index16`] or [`Index32`] buffers for the locales array.
///
/// The exporter automatically upgrades to the larger index format as needed.
#[derive(Clone, Copy, Debug, serde::Deserialize, yoke::Yokeable)]
#[yoke(prove_covariance_manually)]
#[cfg_attr(feature = "export", derive(serde::Serialize))]
#[serde(bound = "")] // Override the autogenerated `LocaleVecFormat: Serialize/Deserialize` bound
pub(crate) struct BlobSchemaV1<'data, LocaleVecFormat: VarZeroVecFormat> {
    /// Map from marker hash to locale trie.
    /// Weak invariant: should be sorted.
    #[serde(borrow)]
    pub markers: &'data ZeroSlice<DataMarkerIdHash>,
    /// Map from locale to buffer index.
    /// Weak invariant: the `usize` values are valid indices into `self.buffers`.
    /// Weak invariant: there is at least one value for every integer in `0..self.buffers.len()`.
    /// Weak invariant: `markers` and `locales` are the same length.
    // TODO: Make ZeroTrieSimpleAscii<[u8]> work when in this position.
    #[serde(borrow)]
    pub locales: &'data VarZeroSlice<[u8], LocaleVecFormat>,
    /// Vector of data buffers.
    #[serde(borrow)]
    pub buffers: &'data VarZeroSlice<[u8], Index32>,
}
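// Illustrative layout (hypothetical values): each marker hash is paired with one trie,
// and trie values index into the shared `buffers` array.
//
//   markers: [ hash(marker A), hash(marker B) ]                  // sorted
//   locales: [ trie { "en": 0, "en\x1Ecase": 1, "\0c": 3 },      // trie for marker A
//              trie { "de": 2, "\0c": 3 } ]                      // trie for marker B
//   buffers: [ bytes, bytes, bytes, 8-byte little-endian checksum ]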

impl<LocaleVecFormat: VarZeroVecFormat> Default for BlobSchemaV1<'_, LocaleVecFormat> {
    fn default() -> Self {
        Self {
            markers: ZeroSlice::new_empty(),
            locales: VarZeroSlice::new_empty(),
            buffers: VarZeroSlice::new_empty(),
        }
    }
}

impl<'data, LocaleVecFormat: VarZeroVecFormat> BlobSchemaV1<'data, LocaleVecFormat> {
    pub fn load(
        &self,
        marker: DataMarkerInfo,
        req: DataRequest,
    ) -> Result<(&'data [u8], Option<u64>), DataError> {
        if marker.is_singleton && !req.id.locale.is_unknown() {
            return Err(DataErrorKind::InvalidRequest.with_req(marker, req));
        }
        // Locate the marker's locale trie via binary search over the sorted hash slice.
        let marker_index = self
            .markers
            .binary_search(&marker.id.hashed())
            .ok()
            .ok_or_else(|| DataErrorKind::MarkerNotFound.with_req(marker, req))?;
        let zerotrie = self
            .locales
            .get(marker_index)
            .ok_or_else(|| DataError::custom("Invalid blob bytes").with_req(marker, req))?;
        // Walk the trie with the request key: the locale, optionally followed by the
        // separator and the marker attributes.
        let mut cursor = ZeroTrieSimpleAscii::from_store(zerotrie).into_cursor();
        let _infallible_ascii = req.id.locale.write_to(&mut cursor);
        let blob_index = if !req.id.marker_attributes.is_empty() {
            let _infallible_ascii = cursor.write_char(REQUEST_SEPARATOR);
            req.id
                .marker_attributes
                .write_to(&mut cursor)
                .map_err(|_| DataErrorKind::IdentifierNotFound.with_req(marker, req))?;
            // If prefix matching is enabled, keep descending the trie until an entry whose
            // key extends the requested attributes is found (or the trie runs out).
            loop {
                if let Some(v) = cursor.take_value() {
                    break Some(v);
                }
                if !req.metadata.attributes_prefix_match || cursor.probe(0).is_none() {
                    break None;
                }
            }
        } else {
            cursor.take_value()
        }
        .ok_or_else(|| DataErrorKind::IdentifierNotFound.with_req(marker, req))?;
        // The trie value is an index into the buffers array.
        let buffer = self
            .buffers
            .get(blob_index)
            .ok_or_else(|| DataError::custom("Invalid blob bytes").with_req(marker, req))?;
        Ok((
            buffer,
            marker
                .has_checksum
                .then(|| self.get_checksum(zerotrie))
                .flatten(),
        ))
    }

    /// Looks up the `CHECKSUM_KEY` entry in the trie and decodes its buffer as a
    /// little-endian `u64`.
    fn get_checksum(&self, zerotrie: &[u8]) -> Option<u64> {
        ZeroTrieSimpleAscii::from_store(zerotrie)
            .get(CHECKSUM_KEY)
            .and_then(|cs| Some(u64::from_le_bytes(self.buffers.get(cs)?.try_into().ok()?)))
    }

    #[cfg(feature = "alloc")]
    pub fn iter_ids(
        &self,
        marker: DataMarkerInfo,
    ) -> Result<alloc::collections::BTreeSet<DataIdentifierCow>, DataError> {
        let marker_index = self
            .markers
            .binary_search(&marker.id.hashed())
            .ok()
            .ok_or_else(|| DataErrorKind::MarkerNotFound.with_marker(marker))?;
        let zerotrie = self
            .locales
            .get(marker_index)
            .ok_or_else(|| DataError::custom("Invalid blob bytes").with_marker(marker))?;
        // Every trie key is either "{locale}" or "{locale}\x1E{attributes}", except for
        // the special checksum entry, which is skipped here.
        Ok(ZeroTrieSimpleAscii::from_store(zerotrie)
            .iter()
            .filter_map(|(s, _)| {
                #[allow(unused_imports)]
                use alloc::borrow::ToOwned;
                if let Some((locale, attrs)) = s.split_once(REQUEST_SEPARATOR) {
                    Some(DataIdentifierCow::from_owned(
                        DataMarkerAttributes::try_from_str(attrs).ok()?.to_owned(),
                        locale.parse().ok()?,
                    ))
                } else if s.as_bytes() == CHECKSUM_KEY {
                    None
                } else {
                    Some(DataIdentifierCow::from_locale(s.parse().ok()?))
                }
            })
            .collect())
    }

    /// Verifies the weak invariants using debug assertions
    #[cfg(debug_assertions)]
    fn check_invariants(&self) {
        if self.markers.is_empty() && self.locales.is_empty() && self.buffers.is_empty() {
            return;
        }
        debug_assert_eq!(self.markers.len(), self.locales.len());
        // Note: We could check that every index occurs at least once, but that's a more expensive
        // operation, so we will just check for the min and max index.
        let mut seen_min = self.buffers.is_empty();
        let mut seen_max = self.buffers.is_empty();
        for zerotrie in self.locales.iter() {
            for (_locale, idx) in ZeroTrieSimpleAscii::from_store(zerotrie).iter() {
                debug_assert!(idx < self.buffers.len());
                if idx == 0 {
                    seen_min = true;
                }
                if idx + 1 == self.buffers.len() {
                    seen_max = true;
                }
            }
        }
        debug_assert!(seen_min);
        debug_assert!(seen_max);
    }
}