icu_pattern/parser/
mod.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5pub mod error;
6pub mod token;
7
8use alloc::{borrow::Cow, vec, vec::Vec};
9use core::{fmt::Debug, marker::PhantomData, str::FromStr};
10pub use error::ParserError;
11pub use token::ParsedPatternItem;
12
/// Internal state of the pattern tokenizer.
///
/// The parser starts in [`ParserState::Default`] and moves between states as it
/// encounters `{`, `}` and `'` bytes in the input.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
enum ParserState {
    /// Scanning unquoted literal text; this is the initial state.
    #[default]
    Default,
    /// Inside a `{...}` placeholder, waiting for the closing `}`.
    Placeholder,
    /// Inside a `'...'` quoted literal, waiting for the closing `'`.
    QuotedLiteral,
    /// Positioned on the second apostrophe of a `''` pair.
    ///
    /// `quoted` records whether the pair was found inside a quoted literal, so the
    /// parser knows which state to return to after emitting the apostrophe.
    Apostrophe { quoted: bool },
}
26
// Closes the literal that ends at `$self.idx` and switches the parser to `$next_state`.
//
// Must be invoked inside the scanning loop of a function returning
// `Result<Option<ParsedPatternItem<..>>, _>`:
// - if the closed span is empty, nothing is emitted and the loop `continue`s;
// - otherwise the function returns the span as a `Literal` whose `quoted` flag is `$quoted`.
macro_rules! handle_literal {
    ($self:ident, $quoted:expr, $next_state:expr) => {{
        let literal_span = $self.advance_state($self.idx, $next_state);
        if literal_span.is_empty() {
            continue;
        }
        return Ok(Some(ParsedPatternItem::Literal {
            content: Cow::Borrowed(&$self.input[literal_span]),
            quoted: $quoted,
        }));
    }};
}
40
/// Options passed to the constructor of [`Parser`].
///
/// The default options use [`QuoteMode::QuotesAreLiterals`]. A bare [`QuoteMode`]
/// can also be converted into `ParserOptions` via [`From`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[derive(Debug, Default)]
#[non_exhaustive]
pub struct ParserOptions {
    /// Controls how quotes (`'`) are interpreted.
    pub quote_mode: QuoteMode,
}
50
/// Controls how quotes (`'`) are interpreted.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
#[non_exhaustive]
pub enum QuoteMode {
    /// Quotes are interpreted as literals, e.g. `{0} o'clock` will interpolate to `5 o'clock`.
    #[default]
    QuotesAreLiterals,
    /// Quotes can be used to quote ASCII letters, e.g. both `{0} World` and `{0} 'World'` will
    /// interpolate to `Hello World`.
    ///
    /// A double quote can be used to create a quote literal, e.g. `{0} o''clock`.
    QuotingSupported,
    /// Quotes are required to quote ASCII letters, e.g. `{0} 'World'` will interpolate to
    /// `Hello World`, while `{0} World` is an error.
    ///
    /// A double quote can be used to create a quote literal, e.g. `{0} 'o''clock'`.
    QuotingRequired,
}
67
68impl From<QuoteMode> for ParserOptions {
69    fn from(quote_mode: QuoteMode) -> Self {
70        Self { quote_mode }
71    }
72}
73
74/// Placeholder pattern parser.
75///
/// The parser allows for handling a flexible range of generic patterns
/// with placeholders.
78///
79/// The [`Parser`] is generic over any placeholder which implements [`FromStr`]
80/// allowing the consumer to parse placeholder patterns such as "{0}, {1}",
81/// "{date}, {time}" or any other. A placeholder must be enclosed in `{` and `}`
82/// characters in the input pattern string.
83///
84/// At the moment the parser is written as a custom fallible iterator.
85///
86/// ✨ *Enabled with the `alloc` Cargo feature.*
87///
88/// # Examples
89///
90/// ```
91/// use icu_pattern::{ParsedPatternItem, Parser, ParserOptions};
92///
93/// let input = "{0}, {1}";
94///
95/// let mut parser = Parser::new(input, ParserOptions::default());
96///
97/// let mut result = vec![];
98///
99/// while let Some(element) =
100///     parser.try_next().expect("Failed to advance iterator")
101/// {
102///     result.push(element);
103/// }
104///
105/// assert_eq!(
106///     result,
107///     &[
108///         ParsedPatternItem::Placeholder(0),
109///         ParsedPatternItem::Literal {
110///             content: ", ".into(),
111///             quoted: false
112///         },
113///         ParsedPatternItem::Placeholder(1),
114///     ]
115/// );
116/// ```
117///
118/// # Named placeholders
119///
120/// The parser is also capable of parsing different placeholder types such as strings.
121///
122/// ## Examples
123/// ```
124/// use icu_pattern::{ParsedPatternItem, Parser, ParserOptions};
125///
126/// let input = "{start}, {end}";
127///
128/// let mut parser = Parser::new(input, ParserOptions::default());
129///
130/// let mut result = vec![];
131///
132/// while let Some(element) =
133///     parser.try_next().expect("Failed to advance iterator")
134/// {
135///     result.push(element);
136/// }
137///
138/// assert_eq!(
139///     result,
140///     &[
141///         ParsedPatternItem::Placeholder("start".to_owned()),
142///         ParsedPatternItem::Literal {
143///             content: ", ".into(),
144///             quoted: false
145///         },
146///         ParsedPatternItem::Placeholder("end".to_owned()),
147///     ]
148/// );
149/// ```
150///
151/// # Type parameters
152///
153/// - `P`: The type of the placeholder used as a key for the [`PlaceholderValueProvider`].
154///
155/// # Lifetimes
156///
/// - `'p`: The lifetime of the input string slice to be parsed.
158///
159/// # Design Decisions
160///
/// The parser is written in an intentionally generic way to enable use against a wide range
/// of potential placeholder pattern models and use cases.
///
/// Several design decisions have been made that the reader should be aware of when using the API.
165///
166/// ## Zero copy
167///
/// The parser is intended for runtime use and is optimized for performance and low memory overhead.
169///
170/// Zero copy parsing is a model which allows the parser to produce tokens that are de-facto
171/// slices of the input without ever having to modify the input or copy from it.
172///
/// In the case of ICU patterns, that decision brings a trade-off around the handling of quoted literals.
/// A parser that copies bytes from the input when generating the output can take a pattern literal
/// that contains a quoted portion and concatenate the parts, effectively generating a single
/// literal out of a series of syntactical quoted and unquoted literal nodes.
/// A zero copy parser sacrifices that convenience for marginal performance gains.
178///
179/// The rationale for the decision is that many placeholder patterns do not contain ASCII letters
180/// and therefore can benefit from this design decision.
181/// Secondly, even in scenarios where ASCII letters, or other quoted literals, are used, the
182/// zero-copy design still maintains high performance, only increasing the number of tokens
183/// returned by the parser, but without increase to allocations.
184///
185/// ### Examples
186/// ```
187/// use icu_pattern::{ParsedPatternItem, Parser, QuoteMode};
188///
189/// let input = "{0} 'and' {1}";
190///
191/// let mut parser = Parser::new(input, QuoteMode::QuotingSupported.into());
192///
193/// let mut result = vec![];
194///
195/// while let Some(element) =
196///     parser.try_next().expect("Failed to advance iterator")
197/// {
198///     result.push(element);
199/// }
200///
201/// assert_eq!(
202///     result,
203///     &[
204///         ParsedPatternItem::Placeholder(0),
205///         ParsedPatternItem::Literal {
206///             content: " ".into(),
207///             quoted: false
208///         },
209///         ParsedPatternItem::Literal {
210///             content: "and".into(),
211///             quoted: true
212///         },
213///         ParsedPatternItem::Literal {
214///             content: " ".into(),
215///             quoted: false
216///         },
217///         ParsedPatternItem::Placeholder(1),
218///     ]
219/// );
220/// ```
221///
222/// ## Fallible Iterator
223///
/// Rust provides strong support for iterators and iterator combinators, which
/// fits very well into the design of this parser/interpolator model.
///
/// Unfortunately, Rust iterators at the moment are infallible, while parsers are inherently
/// fallible. As such, the decision has been made to design the API in line with what
/// we hope will become a trait signature of a fallible iterator in the future, rather
/// than implementing a reversed infallible iterator (where the [`Item`] would be
/// `Option<Result<Item>>`).
232///
233/// That decision impacts the ergonomics of operating on the parser, on one hand making
234/// the fallible iteration more ergonomic, at a trade-off of losing access to the wide
235/// range of Rust iterator traits.
236///
237/// ## Generic Placeholder
238///
/// To handle generic placeholder design, the only constraint necessary in the parser
/// is that a placeholder must be parsed from a string slice.
/// At the moment of writing, Rust is [preparing to deprecate][`RFC 2924`] [`FromStr`] in favor of
/// [`TryFrom<&str>`][`TryFrom`].
/// Among the many benefits of such a transition would be the auto-trait behavior of [`From`] and
/// a [`TryFrom<&str>`][`TryFrom`] for [`&str`] allowing for placeholders to be [`&str`] themselves.
245///
246/// Unfortunately, at the moment [`TryFrom<&str>`][`TryFrom`] for [`usize`] is not implemented, which would
247/// impact the core use case of placeholder patterns.
248///
/// As a result, the decision has been made to use [`FromStr`] for the time being, until
/// [`TryFrom<&str>`][`TryFrom`] gets implemented on all types that support [`FromStr`].
251///
/// [`TR35 2.6.1`]: https://unicode.org/reports/tr35/tr35-dates.html#dateTimeFormat
253/// [`RFC 2924`]: https://github.com/rust-lang/rfcs/pull/2924
254/// [`Item`]: core::iter::Iterator::Item
255/// [`TryFrom`]: core::convert::TryFrom
256/// [`PlaceholderValueProvider`]: crate::PlaceholderValueProvider
#[derive(Debug)]
pub struct Parser<'p, P> {
    /// The pattern string being parsed; emitted literals borrow from it.
    input: &'p str,
    /// Byte length of `input`, cached at construction.
    len: usize,

    /// How apostrophes (`'`) in the pattern are interpreted.
    quote_mode: QuoteMode,

    /// Byte offset at which the token currently being scanned starts.
    start_idx: usize,
    /// Byte offset of the next byte to examine.
    idx: usize,

    /// Current state of the tokenizer state machine.
    state: ParserState,
    /// Ties the parser to the placeholder type `P` without storing a value of it.
    marker: PhantomData<P>,
}
270
271impl<'p, P> Parser<'p, P> {
272    /// Creates a new `Parser`.
273    ///
274    /// The `allow_raw_letters` controls whether the parser will support
275    /// ASCII letters without quotes.
276    ///
277    /// # Examples
278    /// ```
279    /// use icu_pattern::{Parser, ParserOptions};
280    /// let mut parser = Parser::<usize>::new("{0}, {1}", ParserOptions::default());
281    /// ```
282    pub fn new(input: &'p str, options: ParserOptions) -> Self {
283        Self {
284            input,
285            len: input.len(),
286
287            quote_mode: options.quote_mode,
288
289            start_idx: 0,
290            idx: 0,
291
292            state: ParserState::default(),
293            marker: PhantomData,
294        }
295    }
296
297    /// An iterator method that advances the iterator and returns the result of an attempt to parse
298    /// the next token.
299    ///
300    /// # Examples
301    /// ```
302    /// use icu_pattern::{ParsedPatternItem, Parser, ParserOptions};
303    ///
304    /// let mut parser = Parser::<usize>::new("{0}, {1}", ParserOptions::default());
305    ///
306    /// // A call to try_next() returns the next value…
307    /// assert_eq!(
308    ///     Ok(Some(ParsedPatternItem::Placeholder(0))),
309    ///     parser.try_next()
310    /// );
311    /// assert_eq!(
312    ///     Ok(Some(ParsedPatternItem::Literal {
313    ///         content: ", ".into(),
314    ///         quoted: false
315    ///     })),
316    ///     parser.try_next()
317    /// );
318    /// assert_eq!(
319    ///     Ok(Some(ParsedPatternItem::Placeholder(1))),
320    ///     parser.try_next()
321    /// );
322    ///
323    /// // … and then `None` once it's over.
324    /// assert_eq!(Ok(None), parser.try_next());
325    /// ```
326    pub fn try_next(
327        &mut self,
328    ) -> Result<Option<ParsedPatternItem<'p, P>>, ParserError<<P as FromStr>::Err>>
329    where
330        P: FromStr,
331        P::Err: Debug,
332    {
333        while let Some(b) = self.input.as_bytes().get(self.idx) {
334            match self.state {
335                ParserState::Placeholder if *b == b'}' => {
336                    let range = self.advance_state(self.idx, ParserState::Default);
337                    return self.input[range]
338                        .parse()
339                        .map(|ret| Some(ParsedPatternItem::Placeholder(ret)))
340                        .map_err(ParserError::InvalidPlaceholder);
341                }
342                ParserState::QuotedLiteral
343                    if *b == b'\'' && self.quote_mode != QuoteMode::QuotesAreLiterals =>
344                {
345                    if self.input.as_bytes().get(self.idx + 1) == Some(&b'\'') {
346                        handle_literal!(self, true, ParserState::Apostrophe { quoted: true })
347                    } else {
348                        handle_literal!(self, true, ParserState::Default)
349                    }
350                }
351                ParserState::Default if *b == b'{' => {
352                    handle_literal!(self, false, ParserState::Placeholder)
353                }
354                ParserState::Default
355                    if *b == b'\'' && self.quote_mode != QuoteMode::QuotesAreLiterals =>
356                {
357                    if self.input.as_bytes().get(self.idx + 1) == Some(&b'\'') {
358                        handle_literal!(self, false, ParserState::Apostrophe { quoted: false })
359                    } else {
360                        handle_literal!(self, false, ParserState::QuotedLiteral)
361                    }
362                }
363                ParserState::Default
364                    if self.quote_mode == QuoteMode::QuotingRequired && b.is_ascii_alphabetic() =>
365                {
366                    return Err(ParserError::IllegalCharacter(*b as char));
367                }
368                ParserState::Apostrophe { quoted } => {
369                    self.start_idx -= 1;
370                    if quoted {
371                        handle_literal!(self, true, ParserState::QuotedLiteral)
372                    } else {
373                        handle_literal!(self, false, ParserState::Default)
374                    }
375                }
376                _ => self.idx += 1,
377            }
378        }
379        match self.state {
380            ParserState::Placeholder => Err(ParserError::UnclosedPlaceholder),
381            ParserState::QuotedLiteral => Err(ParserError::UnclosedQuotedLiteral),
382            ParserState::Apostrophe { .. } => unreachable!(),
383            ParserState::Default => {
384                let range = self.start_idx..self.len;
385                if !range.is_empty() {
386                    self.start_idx = self.len;
387                    Ok(Some(ParsedPatternItem::Literal {
388                        content: Cow::Borrowed(&self.input[range]),
389                        quoted: false,
390                    }))
391                } else {
392                    Ok(None)
393                }
394            }
395        }
396    }
397
398    fn advance_state(&mut self, idx: usize, next_state: ParserState) -> core::ops::Range<usize> {
399        let range = self.start_idx..idx;
400        self.idx = idx + 1;
401        self.start_idx = self.idx;
402        self.state = next_state;
403        range
404    }
405
406    /// Mutates this parser and collects all [`ParsedPatternItem`]s into a vector.
407    pub fn try_collect_into_vec(
408        mut self,
409    ) -> Result<Vec<ParsedPatternItem<'p, P>>, ParserError<<P as FromStr>::Err>>
410    where
411        P: FromStr,
412        P::Err: Debug,
413    {
414        let mut result = vec![];
415        while let Some(token) = self.try_next()? {
416            result.push(token);
417        }
418        Ok(result)
419    }
420}
421
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for an expected [`ParsedPatternItem::Literal`].
    fn literal<P>(content: &'static str, quoted: bool) -> ParsedPatternItem<'static, P> {
        ParsedPatternItem::Literal {
            content: content.into(),
            quoted,
        }
    }

    /// Shorthand for an expected [`ParsedPatternItem::Placeholder`].
    fn placeholder<P>(value: P) -> ParsedPatternItem<'static, P> {
        ParsedPatternItem::Placeholder(value)
    }

    #[test]
    fn pattern_parse_placeholders() {
        // Well-formed patterns, parsed with quoting supported but not required.
        let samples = vec![
            ("{0}", vec![placeholder(0)]),
            ("{0}{1}", vec![placeholder(0), placeholder(1)]),
            (
                "{0} 'at' {1}",
                vec![
                    placeholder(0),
                    literal(" ", false),
                    literal("at", true),
                    literal(" ", false),
                    placeholder(1),
                ],
            ),
            (
                "{0}'at'{1}",
                vec![placeholder(0), literal("at", true), placeholder(1)],
            ),
            (
                "'{0}' 'at' '{1}'",
                vec![
                    literal("{0}", true),
                    literal(" ", false),
                    literal("at", true),
                    literal(" ", false),
                    literal("{1}", true),
                ],
            ),
            (
                "'PRE' {0} 'and' {1} 'POST'",
                vec![
                    literal("PRE", true),
                    literal(" ", false),
                    placeholder(0),
                    literal(" ", false),
                    literal("and", true),
                    literal(" ", false),
                    placeholder(1),
                    literal(" ", false),
                    literal("POST", true),
                ],
            ),
            (
                "{0} o''clock and 'o''clock'",
                vec![
                    placeholder(0),
                    literal(" o", false),
                    literal("'", false),
                    literal("clock and ", false),
                    literal("o", true),
                    literal("'", true),
                    literal("clock", true),
                ],
            ),
        ];

        for (input, expected) in samples {
            let parsed = Parser::new(input, ParserOptions::from(QuoteMode::QuotingSupported))
                .try_collect_into_vec()
                .expect("Failed to parse a pattern");
            assert_eq!(parsed.as_slice(), expected);
        }

        // Malformed patterns, parsed with quoting required. `None` means "any error".
        let broken: Vec<(_, Option<ParserError<core::num::ParseIntError>>)> = vec![
            ("{", Some(ParserError::UnclosedPlaceholder)),
            ("{0", Some(ParserError::UnclosedPlaceholder)),
            ("{01", Some(ParserError::UnclosedPlaceholder)),
            (
                "{date}",
                // This should be:
                // ```
                // ParserError::InvalidPlaceholder(
                //     ParseIntError {
                //         kind: core::num::IntErrorKind::InvalidDigit
                //     }
                // ),
                // ```
                // Pending: https://github.com/rust-lang/rust/issues/22639
                //
                // Once that is fixed, we can stop using an `Option` here.
                None,
            ),
            ("{date} 'days'", None),
            ("'{00}", Some(ParserError::UnclosedQuotedLiteral)),
            ("d", Some(ParserError::IllegalCharacter('d'))),
        ];

        for (input, expected_error) in broken {
            let outcome =
                Parser::<usize>::new(input, ParserOptions::from(QuoteMode::QuotingRequired))
                    .try_collect_into_vec();
            match expected_error {
                Some(error) => assert_eq!(outcome.expect_err("Should have failed."), error),
                None => assert!(outcome.is_err()),
            }
        }
    }
}