icu_pattern/parser/
mod.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5pub mod error;
6pub mod token;
7
8use alloc::{borrow::Cow, vec, vec::Vec};
9use core::{fmt::Debug, marker::PhantomData, str::FromStr};
10pub use error::ParserError;
11pub use token::ParsedPatternItem;
12
/// Internal state of the [`Parser`] state machine.
///
/// The initial state is [`ParserState::Default`], which is also what the
/// derived [`Default`] implementation returns.
#[derive(PartialEq, Debug, Default)]
enum ParserState {
    /// Scanning unquoted literal text, outside of any placeholder or quote.
    #[default]
    Default,
    /// Inside a `{...}` placeholder, waiting for the closing `}`.
    Placeholder,
    /// Inside a `'...'` quoted literal, waiting for the closing quote.
    QuotedLiteral,
    /// The first quote of a doubled quote (`''`) has just been consumed and
    /// the next byte is the quote to emit as a literal.
    ///
    /// `quoted` records whether the pair appeared inside a quoted literal,
    /// which decides the state the parser returns to afterwards.
    Apostrophe { quoted: bool },
}
26
/// Closes the literal run that ends at `$self.idx` and moves the parser to
/// `$next_state`.
///
/// If any text was accumulated before the current byte, the expansion returns
/// it from the enclosing function as a `ParsedPatternItem::Literal` flagged
/// with `$quoted`. Otherwise it `continue`s the enclosing loop, so the macro
/// may only be expanded inside a loop.
macro_rules! handle_literal {
    ($self:ident, $quoted:expr, $next_state:expr) => {{
        let span = $self.advance_state($self.idx, $next_state);
        if span.is_empty() {
            // Nothing precedes the delimiter: there is no token to emit yet.
            continue;
        }
        #[allow(clippy::indexing_slicing)]
        // TODO(#1668) Clippy exceptions need docs or fixing.
        return Ok(Some(ParsedPatternItem::Literal {
            content: Cow::Borrowed(&$self.input[span]),
            quoted: $quoted,
        }));
    }};
}
42
43/// Options passed to the constructor of [`Parser`].
44///
45/// ✨ *Enabled with the `alloc` Cargo feature.*
46#[derive(Debug, Default)]
47#[non_exhaustive]
48pub struct ParserOptions {
49    /// Controls how quotes (`'`) are interpreted.
50    pub quote_mode: QuoteMode,
51}
52
/// Selects how the parser treats the quote character (`'`).
#[non_exhaustive]
#[derive(Default, Debug, PartialEq)]
pub enum QuoteMode {
    /// A quote is ordinary literal text: `{0} o'clock` interpolates to `5 o'clock`.
    #[default]
    QuotesAreLiterals,
    /// Quoting ASCII characters is optional: `{0} World` and `{0} 'World'` both
    /// interpolate to `Hello World`.
    ///
    /// Two consecutive quotes produce a single literal quote, as in `{0} o''clock`.
    QuotingSupported,
    /// Quoting ASCII characters is mandatory: `{0} 'World'` interpolates to
    /// `Hello World`, whereas `{0} World` is rejected with an error.
    ///
    /// Two consecutive quotes produce a single literal quote, as in `{0} 'o''clock'`.
    QuotingRequired,
}
69
70impl From<QuoteMode> for ParserOptions {
71    fn from(quote_mode: QuoteMode) -> Self {
72        Self { quote_mode }
73    }
74}
75
76/// Placeholder pattern parser.
77///
/// The parser handles a flexible range of generic patterns
/// with placeholders.
80///
81/// The [`Parser`] is generic over any placeholder which implements [`FromStr`]
82/// allowing the consumer to parse placeholder patterns such as "{0}, {1}",
83/// "{date}, {time}" or any other. A placeholder must be enclosed in `{` and `}`
84/// characters in the input pattern string.
85///
86/// At the moment the parser is written as a custom fallible iterator.
87///
88/// ✨ *Enabled with the `alloc` Cargo feature.*
89///
90/// # Examples
91///
92/// ```
93/// use icu_pattern::{ParsedPatternItem, Parser, ParserOptions};
94///
95/// let input = "{0}, {1}";
96///
97/// let mut parser = Parser::new(input, ParserOptions::default());
98///
99/// let mut result = vec![];
100///
101/// while let Some(element) =
102///     parser.try_next().expect("Failed to advance iterator")
103/// {
104///     result.push(element);
105/// }
106///
107/// assert_eq!(
108///     result,
109///     &[
110///         ParsedPatternItem::Placeholder(0),
111///         ParsedPatternItem::Literal {
112///             content: ", ".into(),
113///             quoted: false
114///         },
115///         ParsedPatternItem::Placeholder(1),
116///     ]
117/// );
118/// ```
119///
120/// # Named placeholders
121///
122/// The parser is also capable of parsing different placeholder types such as strings.
123///
124/// ## Examples
125/// ```
126/// use icu_pattern::{ParsedPatternItem, Parser, ParserOptions};
127///
128/// let input = "{start}, {end}";
129///
130/// let mut parser = Parser::new(input, ParserOptions::default());
131///
132/// let mut result = vec![];
133///
134/// while let Some(element) =
135///     parser.try_next().expect("Failed to advance iterator")
136/// {
137///     result.push(element);
138/// }
139///
140/// assert_eq!(
141///     result,
142///     &[
143///         ParsedPatternItem::Placeholder("start".to_owned()),
144///         ParsedPatternItem::Literal {
145///             content: ", ".into(),
146///             quoted: false
147///         },
148///         ParsedPatternItem::Placeholder("end".to_owned()),
149///     ]
150/// );
151/// ```
152///
153/// # Type parameters
154///
155/// - `P`: The type of the placeholder used as a key for the [`PlaceholderValueProvider`].
156///
157/// # Lifetimes
158///
/// - `'p`: The lifetime of the input string slice to be parsed.
160///
161/// # Design Decisions
162///
/// The parser is written in an intentionally generic way to enable use against a wide range
/// of potential placeholder pattern models and use cases.
///
/// Several design decisions have been made that the reader should be aware of when using the API.
167///
168/// ## Zero copy
169///
/// The parser is intended for runtime use and is optimized for performance and low memory overhead.
171///
172/// Zero copy parsing is a model which allows the parser to produce tokens that are de-facto
173/// slices of the input without ever having to modify the input or copy from it.
174///
175/// In case of ICU patterns that decision brings a trade-off around handling of quoted literals.
176/// A parser that copies bytes from the input when generating the output can take a pattern literal
/// that contains a quoted portion and concatenate the parts, effectively generating a single
178/// literal out of a series of syntactical literal quoted and unquoted nodes.
179/// A zero copy parser sacrifices that convenience for marginal performance gains.
180///
181/// The rationale for the decision is that many placeholder patterns do not contain ASCII letters
182/// and therefore can benefit from this design decision.
183/// Secondly, even in scenarios where ASCII letters, or other quoted literals, are used, the
184/// zero-copy design still maintains high performance, only increasing the number of tokens
185/// returned by the parser, but without increase to allocations.
186///
187/// ### Examples
188/// ```
189/// use icu_pattern::{ParsedPatternItem, Parser, QuoteMode};
190///
191/// let input = "{0} 'and' {1}";
192///
193/// let mut parser = Parser::new(input, QuoteMode::QuotingSupported.into());
194///
195/// let mut result = vec![];
196///
197/// while let Some(element) =
198///     parser.try_next().expect("Failed to advance iterator")
199/// {
200///     result.push(element);
201/// }
202///
203/// assert_eq!(
204///     result,
205///     &[
206///         ParsedPatternItem::Placeholder(0),
207///         ParsedPatternItem::Literal {
208///             content: " ".into(),
209///             quoted: false
210///         },
211///         ParsedPatternItem::Literal {
212///             content: "and".into(),
213///             quoted: true
214///         },
215///         ParsedPatternItem::Literal {
216///             content: " ".into(),
217///             quoted: false
218///         },
219///         ParsedPatternItem::Placeholder(1),
220///     ]
221/// );
222/// ```
223///
224/// ## Fallible Iterator
225///
/// Rust provides strong support for iterators and iterator combinators, which
227/// fits very well into the design of this parser/interpolator model.
228///
/// Unfortunately, Rust iterators at the moment are infallible, while parsers are inherently
230/// fallible. As such, the decision has been made to design the API in line with what
231/// we hope will become a trait signature of a fallible iterator in the future, rather
232/// than implementing a reversed infallible iterator (where the [`Item`] would be
233/// `Option<Result<Item>>`).
234///
235/// That decision impacts the ergonomics of operating on the parser, on one hand making
236/// the fallible iteration more ergonomic, at a trade-off of losing access to the wide
237/// range of Rust iterator traits.
238///
239/// ## Generic Placeholder
240///
/// To handle generic placeholder design, the only constraint necessary in the parser
242/// is that a placeholder must be parsed from a string slice.
243/// At the moment of writing, Rust is [preparing to deprecate][`RFC 2924`] [`FromStr`] in favor of
244/// [`TryFrom<&str>`][`TryFrom`].
/// Among the many benefits of such a transition would be the auto-trait behavior of [`From`] and
246/// a [`TryFrom<&str>`][`TryFrom`] for [`&str`] allowing for placeholders to be [`&str`] themselves.
247///
248/// Unfortunately, at the moment [`TryFrom<&str>`][`TryFrom`] for [`usize`] is not implemented, which would
249/// impact the core use case of placeholder patterns.
250///
/// As a result, the decision has been made to use [`FromStr`] for the time being, until
252/// [`TryFrom<&str>`][`TryFrom`] gets implemented on all types that support [`FromStr`].
253///
/// [`TR35 2.6.1`]: https://unicode.org/reports/tr35/tr35-dates.html#dateTimeFormat
255/// [`RFC 2924`]: https://github.com/rust-lang/rfcs/pull/2924
256/// [`Item`]: core::iter::Iterator::Item
257/// [`TryFrom`]: core::convert::TryFrom
258/// [`PlaceholderValueProvider`]: crate::PlaceholderValueProvider
#[derive(Debug)]
pub struct Parser<'p, P> {
    /// The pattern string being parsed.
    input: &'p str,
    /// Byte length of `input`, captured once at construction.
    len: usize,

    /// How the quote character (`'`) is interpreted.
    quote_mode: QuoteMode,

    /// Byte offset at which the token currently being accumulated begins.
    start_idx: usize,
    /// Byte offset of the next byte to examine.
    idx: usize,

    /// Current state of the parsing state machine.
    state: ParserState,
    /// Ties the placeholder type `P` to the parser without storing a value of it.
    marker: PhantomData<P>,
}
272
273impl<'p, P> Parser<'p, P> {
274    /// Creates a new `Parser`.
275    ///
276    /// The `allow_raw_letters` controls whether the parser will support
277    /// ASCII letters without quotes.
278    ///
279    /// # Examples
280    /// ```
281    /// use icu_pattern::{Parser, ParserOptions};
282    /// let mut parser = Parser::<usize>::new("{0}, {1}", ParserOptions::default());
283    /// ```
284    pub fn new(input: &'p str, options: ParserOptions) -> Self {
285        Self {
286            input,
287            len: input.len(),
288
289            quote_mode: options.quote_mode,
290
291            start_idx: 0,
292            idx: 0,
293
294            state: ParserState::default(),
295            marker: PhantomData,
296        }
297    }
298
299    /// An iterator method that advances the iterator and returns the result of an attempt to parse
300    /// the next token.
301    ///
302    /// # Examples
303    /// ```
304    /// use icu_pattern::{ParsedPatternItem, Parser, ParserOptions};
305    ///
306    /// let mut parser = Parser::<usize>::new("{0}, {1}", ParserOptions::default());
307    ///
308    /// // A call to try_next() returns the next value…
309    /// assert_eq!(
310    ///     Ok(Some(ParsedPatternItem::Placeholder(0))),
311    ///     parser.try_next()
312    /// );
313    /// assert_eq!(
314    ///     Ok(Some(ParsedPatternItem::Literal {
315    ///         content: ", ".into(),
316    ///         quoted: false
317    ///     })),
318    ///     parser.try_next()
319    /// );
320    /// assert_eq!(
321    ///     Ok(Some(ParsedPatternItem::Placeholder(1))),
322    ///     parser.try_next()
323    /// );
324    ///
325    /// // … and then `None` once it's over.
326    /// assert_eq!(Ok(None), parser.try_next());
327    /// ```
328    pub fn try_next(
329        &mut self,
330    ) -> Result<Option<ParsedPatternItem<'p, P>>, ParserError<<P as FromStr>::Err>>
331    where
332        P: FromStr,
333        P::Err: Debug,
334    {
335        while let Some(b) = self.input.as_bytes().get(self.idx) {
336            match self.state {
337                ParserState::Placeholder if *b == b'}' => {
338                    let range = self.advance_state(self.idx, ParserState::Default);
339                    #[allow(clippy::indexing_slicing)]
340                    // TODO(#1668) Clippy exceptions need docs or fixing.
341                    return self.input[range]
342                        .parse()
343                        .map(|ret| Some(ParsedPatternItem::Placeholder(ret)))
344                        .map_err(ParserError::InvalidPlaceholder);
345                }
346                ParserState::QuotedLiteral
347                    if *b == b'\'' && self.quote_mode != QuoteMode::QuotesAreLiterals =>
348                {
349                    if self.input.as_bytes().get(self.idx + 1) == Some(&b'\'') {
350                        handle_literal!(self, true, ParserState::Apostrophe { quoted: true })
351                    } else {
352                        handle_literal!(self, true, ParserState::Default)
353                    }
354                }
355                ParserState::Default if *b == b'{' => {
356                    handle_literal!(self, false, ParserState::Placeholder)
357                }
358                ParserState::Default
359                    if *b == b'\'' && self.quote_mode != QuoteMode::QuotesAreLiterals =>
360                {
361                    if self.input.as_bytes().get(self.idx + 1) == Some(&b'\'') {
362                        handle_literal!(self, false, ParserState::Apostrophe { quoted: false })
363                    } else {
364                        handle_literal!(self, false, ParserState::QuotedLiteral)
365                    }
366                }
367                ParserState::Default
368                    if self.quote_mode == QuoteMode::QuotingRequired && b.is_ascii_alphabetic() =>
369                {
370                    return Err(ParserError::IllegalCharacter(*b as char));
371                }
372                ParserState::Apostrophe { quoted } => {
373                    self.start_idx -= 1;
374                    if quoted {
375                        handle_literal!(self, true, ParserState::QuotedLiteral)
376                    } else {
377                        handle_literal!(self, false, ParserState::Default)
378                    }
379                }
380                _ => self.idx += 1,
381            }
382        }
383        match self.state {
384            ParserState::Placeholder => Err(ParserError::UnclosedPlaceholder),
385            ParserState::QuotedLiteral => Err(ParserError::UnclosedQuotedLiteral),
386            ParserState::Apostrophe { .. } => unreachable!(),
387            ParserState::Default => {
388                let range = self.start_idx..self.len;
389                if !range.is_empty() {
390                    self.start_idx = self.len;
391                    #[allow(clippy::indexing_slicing)]
392                    // TODO(#1668) Clippy exceptions need docs or fixing.
393                    Ok(Some(ParsedPatternItem::Literal {
394                        content: Cow::Borrowed(&self.input[range]),
395                        quoted: false,
396                    }))
397                } else {
398                    Ok(None)
399                }
400            }
401        }
402    }
403
404    fn advance_state(&mut self, idx: usize, next_state: ParserState) -> core::ops::Range<usize> {
405        let range = self.start_idx..idx;
406        self.idx = idx + 1;
407        self.start_idx = self.idx;
408        self.state = next_state;
409        range
410    }
411
412    /// Mutates this parser and collects all [`ParsedPatternItem`]s into a vector.
413    pub fn try_collect_into_vec(
414        mut self,
415    ) -> Result<Vec<ParsedPatternItem<'p, P>>, ParserError<<P as FromStr>::Err>>
416    where
417        P: FromStr,
418        P::Err: Debug,
419    {
420        let mut result = vec![];
421        while let Some(token) = self.try_next()? {
422            result.push(token);
423        }
424        Ok(result)
425    }
426}
427
#[cfg(test)]
mod tests {
    use super::*;

    /// Token type produced when parsing numeric placeholders in these tests.
    type Item = ParsedPatternItem<'static, usize>;

    /// Shorthand for a placeholder token.
    fn ph(id: usize) -> Item {
        ParsedPatternItem::Placeholder(id)
    }

    /// Shorthand for a literal token.
    fn lit(content: &'static str, quoted: bool) -> Item {
        ParsedPatternItem::Literal {
            content: content.into(),
            quoted,
        }
    }

    #[test]
    fn pattern_parse_placeholders() {
        // Well-formed patterns, parsed with optional quoting.
        let samples: Vec<(&str, Vec<Item>)> = vec![
            ("{0}", vec![ph(0)]),
            ("{0}{1}", vec![ph(0), ph(1)]),
            (
                "{0} 'at' {1}",
                vec![
                    ph(0),
                    lit(" ", false),
                    lit("at", true),
                    lit(" ", false),
                    ph(1),
                ],
            ),
            ("{0}'at'{1}", vec![ph(0), lit("at", true), ph(1)]),
            (
                "'{0}' 'at' '{1}'",
                vec![
                    lit("{0}", true),
                    lit(" ", false),
                    lit("at", true),
                    lit(" ", false),
                    lit("{1}", true),
                ],
            ),
            (
                "'PRE' {0} 'and' {1} 'POST'",
                vec![
                    lit("PRE", true),
                    lit(" ", false),
                    ph(0),
                    lit(" ", false),
                    lit("and", true),
                    lit(" ", false),
                    ph(1),
                    lit(" ", false),
                    lit("POST", true),
                ],
            ),
            (
                "{0} o''clock and 'o''clock'",
                vec![
                    ph(0),
                    lit(" o", false),
                    lit("'", false),
                    lit("clock and ", false),
                    lit("o", true),
                    lit("'", true),
                    lit("clock", true),
                ],
            ),
        ];

        for (input, expected) in samples {
            let parsed = Parser::<usize>::new(input, QuoteMode::QuotingSupported.into())
                .try_collect_into_vec()
                .expect("Failed to parse a pattern");
            assert_eq!(parsed, expected);
        }

        // Malformed patterns, parsed with mandatory quoting. `None` means
        // "any error": the expected value cannot be constructed here.
        let broken: Vec<(_, Option<ParserError<core::num::ParseIntError>>)> = vec![
            ("{", Some(ParserError::UnclosedPlaceholder)),
            ("{0", Some(ParserError::UnclosedPlaceholder)),
            ("{01", Some(ParserError::UnclosedPlaceholder)),
            (
                "{date}",
                // This should be:
                // ```
                // ParserError::InvalidPlaceholder(
                //     ParseIntError {
                //         kind: core::num::IntErrorKind::InvalidDigit
                //     }
                // ),
                // ```
                // Pending: https://github.com/rust-lang/rust/issues/22639
                //
                // Once that is fixed, we can stop using an `Option` here.
                None,
            ),
            ("{date} 'days'", None),
            ("'{00}", Some(ParserError::UnclosedQuotedLiteral)),
            ("d", Some(ParserError::IllegalCharacter('d'))),
        ];

        for (input, expected_error) in broken {
            let outcome = Parser::<usize>::new(input, QuoteMode::QuotingRequired.into())
                .try_collect_into_vec();
            match expected_error {
                Some(expected) => {
                    assert_eq!(outcome.expect_err("Should have failed."), expected)
                }
                None => assert!(outcome.is_err()),
            }
        }
    }
}