icu_collections/char16trie/
trie.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use zerofrom::ZeroFrom;
6use zerovec::{ZeroSlice, ZeroVec};
7
8// Match-node lead unit values, after masking off intermediate-value bits:
9
10// 00..0f: Branch node. If node!=0 then the length is node+1, otherwise
11// the length is one more than the next byte.
12
13// For a branch sub-node with at most this many entries, we drop down
14// to a linear search.
15const MAX_BRANCH_LINEAR_SUB_NODE_LENGTH: usize = 5;
16
17// 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
18const MIN_LINEAR_MATCH: u16 = 0x30;
19const MAX_LINEAR_MATCH_LENGTH: u16 = 0x10;
20
21// Match-node lead unit bits 14..6 for the optional intermediate value.
22// If these bits are 0, then there is no intermediate value.
23// Otherwise, see the *NodeValue* constants below.
24const MIN_VALUE_LEAD: u16 = MIN_LINEAR_MATCH + MAX_LINEAR_MATCH_LENGTH; // 0x40
25const NODE_TYPE_MASK: u16 = MIN_VALUE_LEAD - 1; // 0x003f
26
27// A final-value node has bit 15 set.
28const VALUE_IS_FINAL: u16 = 0x8000;
29
30// Compact value: After testing bit 0, shift right by 15 and then use the following thresholds.
31const MAX_ONE_UNIT_VALUE: u16 = 0x3fff;
32
33const MIN_TWO_UNIT_VALUE_LEAD: u16 = MAX_ONE_UNIT_VALUE + 1; // 0x4000
34
35const MAX_ONE_UNIT_NODE_VALUE: u16 = 0xff;
36
37const MIN_TWO_UNIT_NODE_VALUE_LEAD: u16 = MIN_VALUE_LEAD + ((MAX_ONE_UNIT_NODE_VALUE + 1) << 6); // 0x4040
38
39const THREE_UNIT_NODE_VALUE_LEAD: u16 = 0x7fc0;
40
41const THREE_UNIT_VALUE_LEAD: u16 = 0x7fff;
42
43// Compact delta integers.
44const MAX_ONE_UNIT_DELTA: u16 = 0xfbff;
45const MIN_TWO_UNIT_DELTA_LEAD: u16 = MAX_ONE_UNIT_DELTA + 1; // 0xfc00
46const THREE_UNIT_DELTA_LEAD: u16 = 0xffff;
47
48fn skip_value(pos: usize, lead: u16) -> usize {
49    if lead < MIN_TWO_UNIT_VALUE_LEAD {
50        pos
51    } else if lead < THREE_UNIT_VALUE_LEAD {
52        pos + 1
53    } else {
54        pos + 2
55    }
56}
57
58fn skip_node_value(pos: usize, lead: u16) -> usize {
59    if lead < MIN_TWO_UNIT_NODE_VALUE_LEAD {
60        pos
61    } else if lead < THREE_UNIT_NODE_VALUE_LEAD {
62        pos + 1
63    } else {
64        pos + 2
65    }
66}
67
68/// This struct represents a de-serialized `Char16Trie` that was exported from
69/// ICU binary data.
70///
71/// Light-weight, non-const reader class for a `CharsTrie`. Traverses a
72/// char-serialized data structure with minimal state, for mapping 16-bit-unit
73/// sequences to non-negative integer values.
74///
75/// For more information:
76/// - [ICU4C UCharsTrie](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1UCharsTrie.html)
77/// - [ICU4J CharsTrie](https://unicode-org.github.io/icu-docs/apidoc/released/icu4j/com/ibm/icu/util/CharsTrie.html) API.
78#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
79#[cfg_attr(feature = "databake", derive(databake::Bake))]
80#[cfg_attr(feature = "databake", databake(path = icu_collections::char16trie))]
81#[derive(Clone, Debug, PartialEq, Eq, ZeroFrom)]
82pub struct Char16Trie<'data> {
83    /// An array of u16 containing the trie data.
84    #[cfg_attr(feature = "serde", serde(borrow))]
85    #[doc(hidden)] // #2417
86    pub data: ZeroVec<'data, u16>,
87}
88
89impl<'data> Char16Trie<'data> {
90    /// Returns a new [`Char16Trie`] with ownership of the provided data.
91    pub fn new(data: ZeroVec<'data, u16>) -> Self {
92        Self { data }
93    }
94
95    /// Returns a new [`Char16TrieIterator`] backed by borrowed data from the `trie` data
96    pub fn iter(&self) -> Char16TrieIterator {
97        Char16TrieIterator::new(&self.data)
98    }
99}
100
101/// This struct represents an iterator over a [`Char16Trie`].
102#[derive(Clone)]
103pub struct Char16TrieIterator<'a> {
104    /// A reference to the Char16Trie data to iterate over.
105    trie: &'a ZeroSlice<u16>,
106    /// Index of next trie unit to read, or `None` if there are no more matches.
107    pos: Option<usize>,
108    /// Remaining length of a linear-match node, minus 1, or `None` if not in
109    /// such a node.
110    remaining_match_length: Option<usize>,
111}
112
113/// An enum representing the return value from a lookup in [`Char16Trie`].
114#[derive(Clone, Copy, Debug, PartialEq)]
115pub enum TrieResult {
116    /// The input unit(s) did not continue a matching string.
117    /// Once `next()` returns `TrieResult::NoMatch`, all further calls to `next()`
118    /// will also return `TrieResult::NoMatch`.
119    NoMatch,
120    /// The input unit(s) matched a string but there is no value for the string
121    /// so far.  (It is a prefix of a longer string.)
122    NoValue,
123    /// The input unit(s) continued a matching string and there is a value for
124    /// the string so far. No further input byte/unit can continue a matching
125    /// string.
126    FinalValue(i32),
127    /// The input unit(s) continued a matching string and there is a value for
128    /// the string so far.  Another input byte/unit can continue a matching
129    /// string.
130    Intermediate(i32),
131}
132
133// Get the lead surrogate (0xd800..0xdbff) for a
134// supplementary code point (0x10000..0x10ffff).
135// @param supplementary 32-bit code point (U+10000..U+10ffff)
136// @return lead surrogate (U+d800..U+dbff) for supplementary
137fn u16_lead(supplementary: i32) -> u16 {
138    (((supplementary) >> 10) + 0xd7c0) as u16
139}
140
141// Get the trail surrogate (0xdc00..0xdfff) for a
142// supplementary code point (0x10000..0x10ffff).
143// @param supplementary 32-bit code point (U+10000..U+10ffff)
144// @return trail surrogate (U+dc00..U+dfff) for supplementary
145fn u16_tail(supplementary: i32) -> u16 {
146    (((supplementary) & 0x3ff) | 0xdc00) as u16
147}
148
149/// A macro that takes an `Option` argument and either unwraps it if it has a value or
150/// causes the function to return `TrieResult::NoMatch` if there is no value.
151/// This could perhaps be done with `std::ops::Try` once stabilized.
152macro_rules! trie_unwrap {
153    ($option:expr) => {
154        match $option {
155            Some(x) => x,
156            None => {
157                // Unexpected
158                debug_assert!(false);
159                return TrieResult::NoMatch;
160            }
161        }
162    };
163}
164
165impl<'a> Char16TrieIterator<'a> {
166    /// Returns a new [`Char16TrieIterator`] backed by borrowed data for the `trie` array
167    pub fn new(trie: &'a ZeroSlice<u16>) -> Self {
168        Self {
169            trie,
170            pos: Some(0),
171            remaining_match_length: None,
172        }
173    }
174
175    /// Traverses the trie from the current state for this input char.
176    ///
177    /// # Examples
178    ///
179    /// ```
180    /// use icu::collections::char16trie::{Char16Trie, TrieResult};
181    /// use zerovec::ZeroVec;
182    ///
183    /// // A Char16Trie containing the ASCII characters 'a' and 'b'.
184    /// let trie_data = [48, 97, 176, 98, 32868];
185    /// let trie = Char16Trie::new(ZeroVec::from_slice_or_alloc(&trie_data));
186    ///
187    /// let mut iter = trie.iter();
188    /// let res = iter.next('a');
189    /// assert_eq!(res, TrieResult::Intermediate(1));
190    /// let res = iter.next('b');
191    /// assert_eq!(res, TrieResult::FinalValue(100));
192    /// let res = iter.next('c');
193    /// assert_eq!(res, TrieResult::NoMatch);
194    /// ```
195    pub fn next(&mut self, c: char) -> TrieResult {
196        if (c as u32) <= 0xffff {
197            self.next16(c as u16)
198        } else {
199            match self.next16(u16_lead(c as i32)) {
200                TrieResult::NoValue | TrieResult::Intermediate(_) => {
201                    self.next16(u16_tail(c as i32))
202                }
203                _ => TrieResult::NoMatch,
204            }
205        }
206    }
207
208    /// Traverses the trie from the current state for this input char.
209    ///
210    /// # Examples
211    ///
212    /// ```
213    /// use icu::collections::char16trie::{Char16Trie, TrieResult};
214    /// use zerovec::ZeroVec;
215    ///
216    /// // A Char16Trie containing the ASCII characters 'a' and 'b'.
217    /// let trie_data = [48, 97, 176, 98, 32868];
218    /// let trie = Char16Trie::new(ZeroVec::from_slice_or_alloc(&trie_data));
219    ///
220    /// let mut iter = trie.iter();
221    /// let res = iter.next('a');
222    /// assert_eq!(res, TrieResult::Intermediate(1));
223    /// let res = iter.next('b');
224    /// assert_eq!(res, TrieResult::FinalValue(100));
225    /// let res = iter.next('c');
226    /// assert_eq!(res, TrieResult::NoMatch);
227    /// ```
228    pub fn next32(&mut self, c: u32) -> TrieResult {
229        if c <= 0xffff {
230            self.next16(c as u16)
231        } else {
232            match self.next16(u16_lead(c as i32)) {
233                TrieResult::NoValue | TrieResult::Intermediate(_) => {
234                    self.next16(u16_tail(c as i32))
235                }
236                _ => TrieResult::NoMatch,
237            }
238        }
239    }
240
241    /// Traverses the trie from the current state for this input char.
242    ///
243    /// # Examples
244    ///
245    /// ```
246    /// use icu::collections::char16trie::{Char16Trie, TrieResult};
247    /// use zerovec::ZeroVec;
248    ///
249    /// // A Char16Trie containing the ASCII characters 'a' and 'b'.
250    /// let trie_data = [48, 97, 176, 98, 32868];
251    /// let trie = Char16Trie::new(ZeroVec::from_slice_or_alloc(&trie_data));
252    ///
253    /// let mut iter = trie.iter();
254    /// let res = iter.next16('a' as u16);
255    /// assert_eq!(res, TrieResult::Intermediate(1));
256    /// let res = iter.next16('b' as u16);
257    /// assert_eq!(res, TrieResult::FinalValue(100));
258    /// let res = iter.next16('c' as u16);
259    /// assert_eq!(res, TrieResult::NoMatch);
260    /// ```
261    pub fn next16(&mut self, c: u16) -> TrieResult {
262        let mut pos = match self.pos {
263            Some(p) => p,
264            None => return TrieResult::NoMatch,
265        };
266        if let Some(length) = self.remaining_match_length {
267            // Remaining part of a linear-match node
268            if c == trie_unwrap!(self.trie.get(pos)) {
269                pos += 1;
270                self.pos = Some(pos);
271                if length == 0 {
272                    self.remaining_match_length = None;
273                    let node = trie_unwrap!(self.trie.get(pos));
274                    if node >= MIN_VALUE_LEAD {
275                        return self.value_result(pos);
276                    }
277                } else {
278                    self.remaining_match_length = Some(length - 1);
279                }
280                return TrieResult::NoValue;
281            }
282            self.stop();
283            TrieResult::NoMatch
284        } else {
285            self.next_impl(pos, c)
286        }
287    }
288
289    fn branch_next(&mut self, pos: usize, length: usize, in_unit: u16) -> TrieResult {
290        let mut pos = pos;
291        let mut length = length;
292        if length == 0 {
293            length = trie_unwrap!(self.trie.get(pos)) as usize;
294            pos += 1;
295        }
296        length += 1;
297
298        // The length of the branch is the number of units to select from.
299        // The data structure encodes a binary search.
300        while length > MAX_BRANCH_LINEAR_SUB_NODE_LENGTH {
301            if in_unit < trie_unwrap!(self.trie.get(pos)) {
302                length >>= 1;
303                pos = trie_unwrap!(self.jump_by_delta(pos + 1));
304            } else {
305                length = length - (length >> 1);
306                pos = trie_unwrap!(self.skip_delta(pos + 1));
307            }
308        }
309        // Drop down to linear search for the last few bytes.
310        // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
311        // and divides length by 2.
312        loop {
313            if in_unit == trie_unwrap!(self.trie.get(pos)) {
314                pos += 1;
315                let mut node = trie_unwrap!(self.trie.get(pos));
316                if node & VALUE_IS_FINAL != 0 {
317                    self.pos = Some(pos);
318                    return self.value_result(pos);
319                }
320                // Use the non-final value as the jump delta.
321                pos += 1;
322
323                if node < MIN_TWO_UNIT_VALUE_LEAD {
324                    pos += node as usize;
325                } else if node < THREE_UNIT_VALUE_LEAD {
326                    pos += (((node - MIN_TWO_UNIT_VALUE_LEAD) as u32) << 16) as usize
327                        | trie_unwrap!(self.trie.get(pos)) as usize;
328                    pos += 1;
329                } else {
330                    pos += ((trie_unwrap!(self.trie.get(pos)) as usize) << 16)
331                        | trie_unwrap!(self.trie.get(pos + 1)) as usize;
332                    pos += 2;
333                }
334                node = trie_unwrap!(self.trie.get(pos));
335                self.pos = Some(pos);
336
337                if node >= MIN_VALUE_LEAD {
338                    return self.value_result(pos);
339                }
340                return TrieResult::NoValue;
341            }
342            length -= 1;
343            pos = trie_unwrap!(self.skip_value(pos + 1));
344            if length <= 1 {
345                break;
346            }
347        }
348
349        if in_unit == trie_unwrap!(self.trie.get(pos)) {
350            pos += 1;
351            self.pos = Some(pos);
352            let node = trie_unwrap!(self.trie.get(pos));
353            if node >= MIN_VALUE_LEAD {
354                return self.value_result(pos);
355            }
356            TrieResult::NoValue
357        } else {
358            self.stop();
359            TrieResult::NoMatch
360        }
361    }
362
363    fn next_impl(&mut self, pos: usize, in_unit: u16) -> TrieResult {
364        let mut node = trie_unwrap!(self.trie.get(pos));
365        let mut pos = pos + 1;
366        loop {
367            if node < MIN_LINEAR_MATCH {
368                return self.branch_next(pos, node as usize, in_unit);
369            } else if node < MIN_VALUE_LEAD {
370                // Match the first of length+1 units.
371                let length = node - MIN_LINEAR_MATCH;
372                if in_unit == trie_unwrap!(self.trie.get(pos)) {
373                    pos += 1;
374                    if length == 0 {
375                        self.remaining_match_length = None;
376                        self.pos = Some(pos);
377                        node = trie_unwrap!(self.trie.get(pos));
378                        if node >= MIN_VALUE_LEAD {
379                            return self.value_result(pos);
380                        }
381                        return TrieResult::NoValue;
382                    }
383                    self.remaining_match_length = Some(length as usize - 1);
384                    self.pos = Some(pos);
385                    return TrieResult::NoValue;
386                }
387                // No match
388                break;
389            } else if (node & VALUE_IS_FINAL) != 0 {
390                // No further matching units.
391                break;
392            } else {
393                // Skip intermediate value.
394                pos = skip_node_value(pos, node);
395                node &= NODE_TYPE_MASK;
396            }
397        }
398        self.stop();
399        TrieResult::NoMatch
400    }
401
402    fn stop(&mut self) {
403        self.pos = None;
404    }
405
406    #[inline(always)] // 1 call site and we want the Option to go away
407    fn jump_by_delta(&self, pos: usize) -> Option<usize> {
408        let delta = self.trie.get(pos)?;
409        let v = if delta < MIN_TWO_UNIT_DELTA_LEAD {
410            // nothing to do
411            pos + 1 + delta as usize
412        } else if delta == THREE_UNIT_DELTA_LEAD {
413            let delta =
414                ((self.trie.get(pos + 1)? as usize) << 16) | (self.trie.get(pos + 2)? as usize);
415            pos + delta + 3
416        } else {
417            let delta = (((delta - MIN_TWO_UNIT_DELTA_LEAD) as usize) << 16)
418                | (self.trie.get(pos + 1)? as usize);
419            pos + delta + 2
420        };
421        Some(v)
422    }
423
424    #[inline(always)] // 1 call site and we want the Option to go away
425    fn skip_value(&self, pos: usize) -> Option<usize> {
426        let lead_unit = self.trie.get(pos)?;
427        Some(skip_value(pos + 1, lead_unit & 0x7fff))
428    }
429
430    #[inline(always)] // 1 call site and we want the Option to go away
431    fn skip_delta(&self, pos: usize) -> Option<usize> {
432        let delta = self.trie.get(pos)?;
433        let v = if delta < MIN_TWO_UNIT_DELTA_LEAD {
434            pos + 1
435        } else if delta == THREE_UNIT_DELTA_LEAD {
436            pos + 3
437        } else {
438            pos + 2
439        };
440        Some(v)
441    }
442
443    fn value_result(&self, pos: usize) -> TrieResult {
444        match self.get_value(pos) {
445            Some(result) => result,
446            None => {
447                // Unexpected
448                debug_assert!(false);
449                TrieResult::NoMatch
450            }
451        }
452    }
453
454    #[inline(always)] // 1 call site and we want the Option to go away
455    fn get_value(&self, pos: usize) -> Option<TrieResult> {
456        let lead_unit = self.trie.get(pos)?;
457        if lead_unit & VALUE_IS_FINAL == VALUE_IS_FINAL {
458            self.read_value(pos + 1, lead_unit & 0x7fff)
459                .map(TrieResult::FinalValue)
460        } else {
461            self.read_node_value(pos + 1, lead_unit)
462                .map(TrieResult::Intermediate)
463        }
464    }
465
466    #[inline(always)] // 1 call site and we want the Option to go away
467    fn read_value(&self, pos: usize, lead_unit: u16) -> Option<i32> {
468        let v = if lead_unit < MIN_TWO_UNIT_VALUE_LEAD {
469            lead_unit.into()
470        } else if lead_unit < THREE_UNIT_VALUE_LEAD {
471            (((lead_unit - MIN_TWO_UNIT_VALUE_LEAD) as i32) << 16) | self.trie.get(pos)? as i32
472        } else {
473            ((self.trie.get(pos)? as i32) << 16) | self.trie.get(pos + 1)? as i32
474        };
475        Some(v)
476    }
477
478    #[inline(always)] // 1 call site and we want the Option to go away
479    fn read_node_value(&self, pos: usize, lead_unit: u16) -> Option<i32> {
480        let v = if lead_unit < (MIN_TWO_UNIT_NODE_VALUE_LEAD) {
481            ((lead_unit >> 6) - 1).into()
482        } else if lead_unit < THREE_UNIT_NODE_VALUE_LEAD {
483            ((((lead_unit & 0x7fc0) - MIN_TWO_UNIT_NODE_VALUE_LEAD) as i32) << 10)
484                | self.trie.get(pos)? as i32
485        } else {
486            ((self.trie.get(pos)? as i32) << 16) | self.trie.get(pos + 1)? as i32
487        };
488        Some(v)
489    }
490}