icu_pattern/parser/mod.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5pub mod error;
6pub mod token;
7
8use alloc::{borrow::Cow, vec, vec::Vec};
9use core::{fmt::Debug, marker::PhantomData, str::FromStr};
10pub use error::ParserError;
11pub use token::ParsedPatternItem;
12
/// Internal state of the [`Parser`] state machine.
///
/// The initial state is [`ParserState::Default`]; it is selected through the
/// derived [`Default`] implementation.
#[derive(PartialEq, Debug, Default)]
enum ParserState {
    /// Scanning unquoted literal text, outside any placeholder or quoted span.
    #[default]
    Default,
    /// Between an opening `{` and its closing `}`.
    Placeholder,
    /// Between an opening `'` and its closing `'`.
    QuotedLiteral,
    /// The first `'` of a doubled `''` has just been consumed.
    ///
    /// `quoted` records whether the pair was found inside a quoted literal,
    /// which decides the state the parser resumes once the apostrophe literal
    /// has been emitted.
    Apostrophe { quoted: bool },
}
26
/// Closes the literal that ends at `$self.idx` and switches to `$next_state`.
///
/// This macro carries control flow and must be expanded inside a loop in a
/// function returning `Result<Option<ParsedPatternItem<..>>, _>`:
///
/// - if the closed literal is non-empty, it `return`s it from the enclosing
///   function, flagged with `$quoted`;
/// - otherwise it `continue`s the enclosing loop so that no empty literal is
///   ever emitted.
macro_rules! handle_literal {
    ($self:ident, $quoted:expr, $next_state:expr) => {{
        let span = $self.advance_state($self.idx, $next_state);
        if span.is_empty() {
            continue;
        }
        return Ok(Some(ParsedPatternItem::Literal {
            content: Cow::Borrowed(&$self.input[span]),
            quoted: $quoted,
        }));
    }};
}
40
41/// Options passed to the constructor of [`Parser`].
42///
43/// ✨ *Enabled with the `alloc` Cargo feature.*
44#[derive(Debug, Default)]
45#[non_exhaustive]
46pub struct ParserOptions {
47 /// Controls how quotes (`'`) are interpreted.
48 pub quote_mode: QuoteMode,
49}
50
/// Controls how quotes (`'`) are interpreted.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum QuoteMode {
    /// Quotes are interpreted as literals, e.g. `{0} o'clock` will interpolate to `5 o'clock`.
    #[default]
    QuotesAreLiterals,
    /// Quotes can be used to quote ASCII characters, e.g. both `{0} World` and `{0} 'World'` will
    /// interpolate to `Hello World`.
    ///
    /// A double quote can be used to create a quote literal, e.g. `{0} o''clock`.
    QuotingSupported,
    /// Quotes are required to quote ASCII letters, e.g. `{0} 'World'` will interpolate to
    /// `Hello World`, while `{0} World` is an error.
    ///
    /// A double quote can be used to create a quote literal, e.g. `{0} 'o''clock'`.
    QuotingRequired,
}
67
68impl From<QuoteMode> for ParserOptions {
69 fn from(quote_mode: QuoteMode) -> Self {
70 Self { quote_mode }
71 }
72}
73
74/// Placeholder pattern parser.
75///
76/// The parser allows for handling flexible range of generic patterns
77/// with placeholders.
78///
79/// The [`Parser`] is generic over any placeholder which implements [`FromStr`]
80/// allowing the consumer to parse placeholder patterns such as "{0}, {1}",
81/// "{date}, {time}" or any other. A placeholder must be enclosed in `{` and `}`
82/// characters in the input pattern string.
83///
84/// At the moment the parser is written as a custom fallible iterator.
85///
86/// ✨ *Enabled with the `alloc` Cargo feature.*
87///
88/// # Examples
89///
90/// ```
91/// use icu_pattern::{ParsedPatternItem, Parser, ParserOptions};
92///
93/// let input = "{0}, {1}";
94///
95/// let mut parser = Parser::new(input, ParserOptions::default());
96///
97/// let mut result = vec![];
98///
99/// while let Some(element) =
100/// parser.try_next().expect("Failed to advance iterator")
101/// {
102/// result.push(element);
103/// }
104///
105/// assert_eq!(
106/// result,
107/// &[
108/// ParsedPatternItem::Placeholder(0),
109/// ParsedPatternItem::Literal {
110/// content: ", ".into(),
111/// quoted: false
112/// },
113/// ParsedPatternItem::Placeholder(1),
114/// ]
115/// );
116/// ```
117///
118/// # Named placeholders
119///
120/// The parser is also capable of parsing different placeholder types such as strings.
121///
122/// ## Examples
123/// ```
124/// use icu_pattern::{ParsedPatternItem, Parser, ParserOptions};
125///
126/// let input = "{start}, {end}";
127///
128/// let mut parser = Parser::new(input, ParserOptions::default());
129///
130/// let mut result = vec![];
131///
132/// while let Some(element) =
133/// parser.try_next().expect("Failed to advance iterator")
134/// {
135/// result.push(element);
136/// }
137///
138/// assert_eq!(
139/// result,
140/// &[
141/// ParsedPatternItem::Placeholder("start".to_owned()),
142/// ParsedPatternItem::Literal {
143/// content: ", ".into(),
144/// quoted: false
145/// },
146/// ParsedPatternItem::Placeholder("end".to_owned()),
147/// ]
148/// );
149/// ```
150///
151/// # Type parameters
152///
153/// - `P`: The type of the placeholder used as a key for the [`PlaceholderValueProvider`].
154///
155/// # Lifetimes
156///
157/// - `p`: The life time of an input string slice to be parsed.
158///
159/// # Design Decisions
160///
161/// The parser is written in an intentionally generic way to enable use against wide range
162/// of potential placeholder pattern models and use cases.
163///
164/// Serveral design decisions have been made that the reader should be aware of when using the API.
165///
166/// ## Zero copy
167///
168/// The parser is intended for runtime use and is optimized for performance and low memory overhad.
169///
170/// Zero copy parsing is a model which allows the parser to produce tokens that are de-facto
171/// slices of the input without ever having to modify the input or copy from it.
172///
173/// In case of ICU patterns that decision brings a trade-off around handling of quoted literals.
174/// A parser that copies bytes from the input when generating the output can take a pattern literal
175/// that contains a quoted portion and concatenace the parts, effectively generating a single
176/// literal out of a series of syntactical literal quoted and unquoted nodes.
177/// A zero copy parser sacrifices that convenience for marginal performance gains.
178///
179/// The rationale for the decision is that many placeholder patterns do not contain ASCII letters
180/// and therefore can benefit from this design decision.
181/// Secondly, even in scenarios where ASCII letters, or other quoted literals, are used, the
182/// zero-copy design still maintains high performance, only increasing the number of tokens
183/// returned by the parser, but without increase to allocations.
184///
185/// ### Examples
186/// ```
187/// use icu_pattern::{ParsedPatternItem, Parser, QuoteMode};
188///
189/// let input = "{0} 'and' {1}";
190///
191/// let mut parser = Parser::new(input, QuoteMode::QuotingSupported.into());
192///
193/// let mut result = vec![];
194///
195/// while let Some(element) =
196/// parser.try_next().expect("Failed to advance iterator")
197/// {
198/// result.push(element);
199/// }
200///
201/// assert_eq!(
202/// result,
203/// &[
204/// ParsedPatternItem::Placeholder(0),
205/// ParsedPatternItem::Literal {
206/// content: " ".into(),
207/// quoted: false
208/// },
209/// ParsedPatternItem::Literal {
210/// content: "and".into(),
211/// quoted: true
212/// },
213/// ParsedPatternItem::Literal {
214/// content: " ".into(),
215/// quoted: false
216/// },
217/// ParsedPatternItem::Placeholder(1),
218/// ]
219/// );
220/// ```
221///
222/// ## Fallible Iterator
223///
224/// Rust providers a strong support for iterators and iterator combinators, which
225/// fits very well into the design of this parser/interpolator model.
226///
227/// Unfortunately, Rust iterators at the moment are infallible, while parsers are inhereantely
228/// fallible. As such, the decision has been made to design the API in line with what
229/// we hope will become a trait signature of a fallible iterator in the future, rather
230/// than implementing a reversed infallible iterator (where the [`Item`] would be
231/// `Option<Result<Item>>`).
232///
233/// That decision impacts the ergonomics of operating on the parser, on one hand making
234/// the fallible iteration more ergonomic, at a trade-off of losing access to the wide
235/// range of Rust iterator traits.
236///
237/// ## Generic Placeholder
238///
239/// To handle generic placeholder design, the only constrain necessary in the parser
240/// is that a placeholder must be parsed from a string slice.
241/// At the moment of writing, Rust is [preparing to deprecate][`RFC 2924`] [`FromStr`] in favor of
242/// [`TryFrom<&str>`][`TryFrom`].
243/// Among many benfits of such transition would be the auto-trait behavior of [`From`] and
244/// a [`TryFrom<&str>`][`TryFrom`] for [`&str`] allowing for placeholders to be [`&str`] themselves.
245///
246/// Unfortunately, at the moment [`TryFrom<&str>`][`TryFrom`] for [`usize`] is not implemented, which would
247/// impact the core use case of placeholder patterns.
248///
249/// In result, the decision has been made to use [`FromStr`] for the time being, until
250/// [`TryFrom<&str>`][`TryFrom`] gets implemented on all types that support [`FromStr`].
251///
252/// [`TR35 2.6.1]: https://unicode.org/reports/tr35/tr35-dates.html#dateTimeFormat
253/// [`RFC 2924`]: https://github.com/rust-lang/rfcs/pull/2924
254/// [`Item`]: core::iter::Iterator::Item
255/// [`TryFrom`]: core::convert::TryFrom
256/// [`PlaceholderValueProvider`]: crate::PlaceholderValueProvider
257#[derive(Debug)]
258pub struct Parser<'p, P> {
259 input: &'p str,
260 len: usize,
261
262 quote_mode: QuoteMode,
263
264 start_idx: usize,
265 idx: usize,
266
267 state: ParserState,
268 marker: PhantomData<P>,
269}
270
271impl<'p, P> Parser<'p, P> {
272 /// Creates a new `Parser`.
273 ///
274 /// The `allow_raw_letters` controls whether the parser will support
275 /// ASCII letters without quotes.
276 ///
277 /// # Examples
278 /// ```
279 /// use icu_pattern::{Parser, ParserOptions};
280 /// let mut parser = Parser::<usize>::new("{0}, {1}", ParserOptions::default());
281 /// ```
282 pub fn new(input: &'p str, options: ParserOptions) -> Self {
283 Self {
284 input,
285 len: input.len(),
286
287 quote_mode: options.quote_mode,
288
289 start_idx: 0,
290 idx: 0,
291
292 state: ParserState::default(),
293 marker: PhantomData,
294 }
295 }
296
297 /// An iterator method that advances the iterator and returns the result of an attempt to parse
298 /// the next token.
299 ///
300 /// # Examples
301 /// ```
302 /// use icu_pattern::{ParsedPatternItem, Parser, ParserOptions};
303 ///
304 /// let mut parser = Parser::<usize>::new("{0}, {1}", ParserOptions::default());
305 ///
306 /// // A call to try_next() returns the next value…
307 /// assert_eq!(
308 /// Ok(Some(ParsedPatternItem::Placeholder(0))),
309 /// parser.try_next()
310 /// );
311 /// assert_eq!(
312 /// Ok(Some(ParsedPatternItem::Literal {
313 /// content: ", ".into(),
314 /// quoted: false
315 /// })),
316 /// parser.try_next()
317 /// );
318 /// assert_eq!(
319 /// Ok(Some(ParsedPatternItem::Placeholder(1))),
320 /// parser.try_next()
321 /// );
322 ///
323 /// // … and then `None` once it's over.
324 /// assert_eq!(Ok(None), parser.try_next());
325 /// ```
326 pub fn try_next(
327 &mut self,
328 ) -> Result<Option<ParsedPatternItem<'p, P>>, ParserError<<P as FromStr>::Err>>
329 where
330 P: FromStr,
331 P::Err: Debug,
332 {
333 while let Some(b) = self.input.as_bytes().get(self.idx) {
334 match self.state {
335 ParserState::Placeholder if *b == b'}' => {
336 let range = self.advance_state(self.idx, ParserState::Default);
337 return self.input[range]
338 .parse()
339 .map(|ret| Some(ParsedPatternItem::Placeholder(ret)))
340 .map_err(ParserError::InvalidPlaceholder);
341 }
342 ParserState::QuotedLiteral
343 if *b == b'\'' && self.quote_mode != QuoteMode::QuotesAreLiterals =>
344 {
345 if self.input.as_bytes().get(self.idx + 1) == Some(&b'\'') {
346 handle_literal!(self, true, ParserState::Apostrophe { quoted: true })
347 } else {
348 handle_literal!(self, true, ParserState::Default)
349 }
350 }
351 ParserState::Default if *b == b'{' => {
352 handle_literal!(self, false, ParserState::Placeholder)
353 }
354 ParserState::Default
355 if *b == b'\'' && self.quote_mode != QuoteMode::QuotesAreLiterals =>
356 {
357 if self.input.as_bytes().get(self.idx + 1) == Some(&b'\'') {
358 handle_literal!(self, false, ParserState::Apostrophe { quoted: false })
359 } else {
360 handle_literal!(self, false, ParserState::QuotedLiteral)
361 }
362 }
363 ParserState::Default
364 if self.quote_mode == QuoteMode::QuotingRequired && b.is_ascii_alphabetic() =>
365 {
366 return Err(ParserError::IllegalCharacter(*b as char));
367 }
368 ParserState::Apostrophe { quoted } => {
369 self.start_idx -= 1;
370 if quoted {
371 handle_literal!(self, true, ParserState::QuotedLiteral)
372 } else {
373 handle_literal!(self, false, ParserState::Default)
374 }
375 }
376 _ => self.idx += 1,
377 }
378 }
379 match self.state {
380 ParserState::Placeholder => Err(ParserError::UnclosedPlaceholder),
381 ParserState::QuotedLiteral => Err(ParserError::UnclosedQuotedLiteral),
382 ParserState::Apostrophe { .. } => unreachable!(),
383 ParserState::Default => {
384 let range = self.start_idx..self.len;
385 if !range.is_empty() {
386 self.start_idx = self.len;
387 Ok(Some(ParsedPatternItem::Literal {
388 content: Cow::Borrowed(&self.input[range]),
389 quoted: false,
390 }))
391 } else {
392 Ok(None)
393 }
394 }
395 }
396 }
397
398 fn advance_state(&mut self, idx: usize, next_state: ParserState) -> core::ops::Range<usize> {
399 let range = self.start_idx..idx;
400 self.idx = idx + 1;
401 self.start_idx = self.idx;
402 self.state = next_state;
403 range
404 }
405
406 /// Mutates this parser and collects all [`ParsedPatternItem`]s into a vector.
407 pub fn try_collect_into_vec(
408 mut self,
409 ) -> Result<Vec<ParsedPatternItem<'p, P>>, ParserError<<P as FromStr>::Err>>
410 where
411 P: FromStr,
412 P::Err: Debug,
413 {
414 let mut result = vec![];
415 while let Some(token) = self.try_next()? {
416 result.push(token);
417 }
418 Ok(result)
419 }
420}
421
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for an unquoted literal token.
    fn plain<P>(text: &str) -> ParsedPatternItem<'_, P> {
        ParsedPatternItem::Literal {
            content: text.into(),
            quoted: false,
        }
    }

    /// Shorthand for a quoted literal token.
    fn quoted<P>(text: &str) -> ParsedPatternItem<'_, P> {
        ParsedPatternItem::Literal {
            content: text.into(),
            quoted: true,
        }
    }

    #[test]
    fn pattern_parse_placeholders() {
        // Well-formed patterns, parsed with quoting supported.
        let samples = vec![
            ("{0}", vec![ParsedPatternItem::Placeholder(0)]),
            (
                "{0}{1}",
                vec![
                    ParsedPatternItem::Placeholder(0),
                    ParsedPatternItem::Placeholder(1),
                ],
            ),
            (
                "{0} 'at' {1}",
                vec![
                    ParsedPatternItem::Placeholder(0),
                    plain(" "),
                    quoted("at"),
                    plain(" "),
                    ParsedPatternItem::Placeholder(1),
                ],
            ),
            (
                "{0}'at'{1}",
                vec![
                    ParsedPatternItem::Placeholder(0),
                    quoted("at"),
                    ParsedPatternItem::Placeholder(1),
                ],
            ),
            (
                "'{0}' 'at' '{1}'",
                vec![
                    quoted("{0}"),
                    plain(" "),
                    quoted("at"),
                    plain(" "),
                    quoted("{1}"),
                ],
            ),
            (
                "'PRE' {0} 'and' {1} 'POST'",
                vec![
                    quoted("PRE"),
                    plain(" "),
                    ParsedPatternItem::Placeholder(0),
                    plain(" "),
                    quoted("and"),
                    plain(" "),
                    ParsedPatternItem::Placeholder(1),
                    plain(" "),
                    quoted("POST"),
                ],
            ),
            (
                "{0} o''clock and 'o''clock'",
                vec![
                    ParsedPatternItem::Placeholder(0),
                    plain(" o"),
                    plain("'"),
                    plain("clock and "),
                    quoted("o"),
                    quoted("'"),
                    quoted("clock"),
                ],
            ),
        ];

        for (input, expected) in samples {
            let result = Parser::new(input, QuoteMode::QuotingSupported.into())
                .try_collect_into_vec()
                .expect("Failed to parse a pattern");
            assert_eq!(result, expected);
        }

        // Malformed patterns, parsed with quoting required.
        let broken: Vec<(_, Option<ParserError<core::num::ParseIntError>>)> = vec![
            ("{", Some(ParserError::UnclosedPlaceholder)),
            ("{0", Some(ParserError::UnclosedPlaceholder)),
            ("{01", Some(ParserError::UnclosedPlaceholder)),
            (
                "{date}",
                // This should be:
                // ```
                // ParserError::InvalidPlaceholder(
                //     ParseIntError {
                //         kind: core::num::IntErrorKind::InvalidDigit
                //     }
                // ),
                // ```
                // Pending: https://github.com/rust-lang/rust/issues/22639
                //
                // Once that is fixed, we can stop using an `Option` here.
                None,
            ),
            ("{date} 'days'", None),
            ("'{00}", Some(ParserError::UnclosedQuotedLiteral)),
            ("d", Some(ParserError::IllegalCharacter('d'))),
        ];

        for (input, expected_error) in broken {
            let outcome =
                Parser::<usize>::new(input, QuoteMode::QuotingRequired.into()).try_collect_into_vec();
            match expected_error {
                Some(error) => assert_eq!(outcome.expect_err("Should have failed."), error),
                // The exact error cannot be constructed here, so only check for failure.
                None => assert!(outcome.is_err()),
            }
        }
    }
}
602}