icu_pattern/parser/mod.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5pub mod error;
6pub mod token;
7
8use alloc::{borrow::Cow, vec, vec::Vec};
9use core::{fmt::Debug, marker::PhantomData, str::FromStr};
10pub use error::ParserError;
11pub use token::ParsedPatternItem;
12
/// State of the [`Parser`] state machine between two inspected bytes.
#[derive(PartialEq, Debug, Default)]
enum ParserState {
    /// Scanning unquoted literal text. This is the initial state.
    #[default]
    Default,
    /// Inside a `{…}` placeholder, waiting for the closing `}`.
    Placeholder,
    /// Inside a `'…'` quoted literal, waiting for the closing `'`.
    QuotedLiteral,
    /// The first `'` of a doubled `''` has been consumed; the second one is
    /// about to be emitted as a literal apostrophe.
    ///
    /// `quoted` records whether the pair was found inside a quoted literal,
    /// which decides the state the parser returns to afterwards.
    Apostrophe { quoted: bool },
}
26
/// Ends the literal run that stops at `$self.idx` and moves the parser into
/// `$next_state`.
///
/// Must be expanded inside the scanning loop of a function returning
/// `Result<Option<ParsedPatternItem<..>>, _>`: a non-empty run is returned
/// from the enclosing function as a `Literal` token flagged with `$quoted`,
/// while an empty run simply `continue`s the loop.
macro_rules! handle_literal {
    ($self:ident, $quoted:expr, $next_state:expr) => {{
        let literal_span = $self.advance_state($self.idx, $next_state);
        if literal_span.is_empty() {
            // Nothing was accumulated before the delimiter; keep scanning.
            continue;
        }
        #[allow(clippy::indexing_slicing)]
        // TODO(#1668) Clippy exceptions need docs or fixing.
        return Ok(Some(ParsedPatternItem::Literal {
            content: Cow::Borrowed(&$self.input[literal_span]),
            quoted: $quoted,
        }));
    }};
}
42
/// Options passed to the constructor of [`Parser`].
///
/// The value can be cloned and compared, so one set of options may be reused
/// for several parsers.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[derive(Clone, Debug, Default, PartialEq, Eq)]
#[non_exhaustive]
pub struct ParserOptions {
    /// Controls how quotes (`'`) are interpreted.
    pub quote_mode: QuoteMode,
}

/// Controls how quotes (`'`) are interpreted.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
#[non_exhaustive]
pub enum QuoteMode {
    /// Quotes are interpreted as literals, e.g. `{0} o'clock` will interpolate to `5 o'clock`.
    ///
    /// This is the default mode.
    #[default]
    QuotesAreLiterals,
    /// Quotes can be used to quote ASCII characters, e.g. both `{0} World` and `{0} 'World'` will interpolate to `Hello World`.
    ///
    /// A double quote can be used to create a quote literal, e.g. `{0} o''clock`.
    QuotingSupported,
    /// Quotes are required to quote ASCII characters, e.g. `{0} 'World'` will interpolate to `Hello World`, while `{0} World` is an error.
    ///
    /// A double quote can be used to create a quote literal, e.g. `{0} 'o''clock'`.
    QuotingRequired,
}

impl From<QuoteMode> for ParserOptions {
    /// Builds options that use `quote_mode`.
    fn from(quote_mode: QuoteMode) -> Self {
        Self { quote_mode }
    }
}
75
76/// Placeholder pattern parser.
77///
/// The parser allows for handling a flexible range of generic patterns
79/// with placeholders.
80///
81/// The [`Parser`] is generic over any placeholder which implements [`FromStr`]
82/// allowing the consumer to parse placeholder patterns such as "{0}, {1}",
83/// "{date}, {time}" or any other. A placeholder must be enclosed in `{` and `}`
84/// characters in the input pattern string.
85///
86/// At the moment the parser is written as a custom fallible iterator.
87///
88/// ✨ *Enabled with the `alloc` Cargo feature.*
89///
90/// # Examples
91///
92/// ```
93/// use icu_pattern::{ParsedPatternItem, Parser, ParserOptions};
94///
95/// let input = "{0}, {1}";
96///
97/// let mut parser = Parser::new(input, ParserOptions::default());
98///
99/// let mut result = vec![];
100///
101/// while let Some(element) =
102/// parser.try_next().expect("Failed to advance iterator")
103/// {
104/// result.push(element);
105/// }
106///
107/// assert_eq!(
108/// result,
109/// &[
110/// ParsedPatternItem::Placeholder(0),
111/// ParsedPatternItem::Literal {
112/// content: ", ".into(),
113/// quoted: false
114/// },
115/// ParsedPatternItem::Placeholder(1),
116/// ]
117/// );
118/// ```
119///
120/// # Named placeholders
121///
122/// The parser is also capable of parsing different placeholder types such as strings.
123///
124/// ## Examples
125/// ```
126/// use icu_pattern::{ParsedPatternItem, Parser, ParserOptions};
127///
128/// let input = "{start}, {end}";
129///
130/// let mut parser = Parser::new(input, ParserOptions::default());
131///
132/// let mut result = vec![];
133///
134/// while let Some(element) =
135/// parser.try_next().expect("Failed to advance iterator")
136/// {
137/// result.push(element);
138/// }
139///
140/// assert_eq!(
141/// result,
142/// &[
143/// ParsedPatternItem::Placeholder("start".to_owned()),
144/// ParsedPatternItem::Literal {
145/// content: ", ".into(),
146/// quoted: false
147/// },
148/// ParsedPatternItem::Placeholder("end".to_owned()),
149/// ]
150/// );
151/// ```
152///
153/// # Type parameters
154///
155/// - `P`: The type of the placeholder used as a key for the [`PlaceholderValueProvider`].
156///
157/// # Lifetimes
158///
/// - `'p`: The lifetime of the input string slice being parsed.
160///
161/// # Design Decisions
162///
/// The parser is written in an intentionally generic way to enable use against a wide range
164/// of potential placeholder pattern models and use cases.
165///
/// Several design decisions have been made that the reader should be aware of when using the API.
167///
168/// ## Zero copy
169///
/// The parser is intended for runtime use and is optimized for performance and low memory overhead.
171///
172/// Zero copy parsing is a model which allows the parser to produce tokens that are de-facto
173/// slices of the input without ever having to modify the input or copy from it.
174///
175/// In case of ICU patterns that decision brings a trade-off around handling of quoted literals.
176/// A parser that copies bytes from the input when generating the output can take a pattern literal
/// that contains a quoted portion and concatenate the parts, effectively generating a single
178/// literal out of a series of syntactical literal quoted and unquoted nodes.
179/// A zero copy parser sacrifices that convenience for marginal performance gains.
180///
181/// The rationale for the decision is that many placeholder patterns do not contain ASCII letters
182/// and therefore can benefit from this design decision.
183/// Secondly, even in scenarios where ASCII letters, or other quoted literals, are used, the
184/// zero-copy design still maintains high performance, only increasing the number of tokens
185/// returned by the parser, but without increase to allocations.
186///
187/// ### Examples
188/// ```
189/// use icu_pattern::{ParsedPatternItem, Parser, QuoteMode};
190///
191/// let input = "{0} 'and' {1}";
192///
193/// let mut parser = Parser::new(input, QuoteMode::QuotingSupported.into());
194///
195/// let mut result = vec![];
196///
197/// while let Some(element) =
198/// parser.try_next().expect("Failed to advance iterator")
199/// {
200/// result.push(element);
201/// }
202///
203/// assert_eq!(
204/// result,
205/// &[
206/// ParsedPatternItem::Placeholder(0),
207/// ParsedPatternItem::Literal {
208/// content: " ".into(),
209/// quoted: false
210/// },
211/// ParsedPatternItem::Literal {
212/// content: "and".into(),
213/// quoted: true
214/// },
215/// ParsedPatternItem::Literal {
216/// content: " ".into(),
217/// quoted: false
218/// },
219/// ParsedPatternItem::Placeholder(1),
220/// ]
221/// );
222/// ```
223///
224/// ## Fallible Iterator
225///
/// Rust provides strong support for iterators and iterator combinators, which
227/// fits very well into the design of this parser/interpolator model.
228///
/// Unfortunately, Rust iterators at the moment are infallible, while parsers are inherently
230/// fallible. As such, the decision has been made to design the API in line with what
231/// we hope will become a trait signature of a fallible iterator in the future, rather
232/// than implementing a reversed infallible iterator (where the [`Item`] would be
233/// `Option<Result<Item>>`).
234///
235/// That decision impacts the ergonomics of operating on the parser, on one hand making
236/// the fallible iteration more ergonomic, at a trade-off of losing access to the wide
237/// range of Rust iterator traits.
238///
239/// ## Generic Placeholder
240///
/// To handle generic placeholder design, the only constraint necessary in the parser
242/// is that a placeholder must be parsed from a string slice.
243/// At the moment of writing, Rust is [preparing to deprecate][`RFC 2924`] [`FromStr`] in favor of
244/// [`TryFrom<&str>`][`TryFrom`].
/// Among the many benefits of such a transition would be the auto-trait behavior of [`From`] and
246/// a [`TryFrom<&str>`][`TryFrom`] for [`&str`] allowing for placeholders to be [`&str`] themselves.
247///
248/// Unfortunately, at the moment [`TryFrom<&str>`][`TryFrom`] for [`usize`] is not implemented, which would
249/// impact the core use case of placeholder patterns.
250///
/// As a result, the decision has been made to use [`FromStr`] for the time being, until
252/// [`TryFrom<&str>`][`TryFrom`] gets implemented on all types that support [`FromStr`].
253///
/// [`TR35 2.6.1`]: https://unicode.org/reports/tr35/tr35-dates.html#dateTimeFormat
255/// [`RFC 2924`]: https://github.com/rust-lang/rfcs/pull/2924
256/// [`Item`]: core::iter::Iterator::Item
257/// [`TryFrom`]: core::convert::TryFrom
258/// [`PlaceholderValueProvider`]: crate::PlaceholderValueProvider
#[derive(Debug)]
pub struct Parser<'p, P> {
    /// The pattern string being parsed.
    input: &'p str,
    /// Byte length of `input`, cached by `new`.
    len: usize,

    /// How apostrophes (`'`) in `input` are interpreted.
    quote_mode: QuoteMode,

    /// Byte offset at which the token currently being scanned starts.
    start_idx: usize,
    /// Byte offset of the next byte to inspect.
    idx: usize,

    /// Current state of the parser's state machine.
    state: ParserState,
    /// Ties the placeholder type `P` to the parser without storing a value of it.
    marker: PhantomData<P>,
}
272
273impl<'p, P> Parser<'p, P> {
274 /// Creates a new `Parser`.
275 ///
276 /// The `allow_raw_letters` controls whether the parser will support
277 /// ASCII letters without quotes.
278 ///
279 /// # Examples
280 /// ```
281 /// use icu_pattern::{Parser, ParserOptions};
282 /// let mut parser = Parser::<usize>::new("{0}, {1}", ParserOptions::default());
283 /// ```
284 pub fn new(input: &'p str, options: ParserOptions) -> Self {
285 Self {
286 input,
287 len: input.len(),
288
289 quote_mode: options.quote_mode,
290
291 start_idx: 0,
292 idx: 0,
293
294 state: ParserState::default(),
295 marker: PhantomData,
296 }
297 }
298
299 /// An iterator method that advances the iterator and returns the result of an attempt to parse
300 /// the next token.
301 ///
302 /// # Examples
303 /// ```
304 /// use icu_pattern::{ParsedPatternItem, Parser, ParserOptions};
305 ///
306 /// let mut parser = Parser::<usize>::new("{0}, {1}", ParserOptions::default());
307 ///
308 /// // A call to try_next() returns the next value…
309 /// assert_eq!(
310 /// Ok(Some(ParsedPatternItem::Placeholder(0))),
311 /// parser.try_next()
312 /// );
313 /// assert_eq!(
314 /// Ok(Some(ParsedPatternItem::Literal {
315 /// content: ", ".into(),
316 /// quoted: false
317 /// })),
318 /// parser.try_next()
319 /// );
320 /// assert_eq!(
321 /// Ok(Some(ParsedPatternItem::Placeholder(1))),
322 /// parser.try_next()
323 /// );
324 ///
325 /// // … and then `None` once it's over.
326 /// assert_eq!(Ok(None), parser.try_next());
327 /// ```
328 pub fn try_next(
329 &mut self,
330 ) -> Result<Option<ParsedPatternItem<'p, P>>, ParserError<<P as FromStr>::Err>>
331 where
332 P: FromStr,
333 P::Err: Debug,
334 {
335 while let Some(b) = self.input.as_bytes().get(self.idx) {
336 match self.state {
337 ParserState::Placeholder if *b == b'}' => {
338 let range = self.advance_state(self.idx, ParserState::Default);
339 #[allow(clippy::indexing_slicing)]
340 // TODO(#1668) Clippy exceptions need docs or fixing.
341 return self.input[range]
342 .parse()
343 .map(|ret| Some(ParsedPatternItem::Placeholder(ret)))
344 .map_err(ParserError::InvalidPlaceholder);
345 }
346 ParserState::QuotedLiteral
347 if *b == b'\'' && self.quote_mode != QuoteMode::QuotesAreLiterals =>
348 {
349 if self.input.as_bytes().get(self.idx + 1) == Some(&b'\'') {
350 handle_literal!(self, true, ParserState::Apostrophe { quoted: true })
351 } else {
352 handle_literal!(self, true, ParserState::Default)
353 }
354 }
355 ParserState::Default if *b == b'{' => {
356 handle_literal!(self, false, ParserState::Placeholder)
357 }
358 ParserState::Default
359 if *b == b'\'' && self.quote_mode != QuoteMode::QuotesAreLiterals =>
360 {
361 if self.input.as_bytes().get(self.idx + 1) == Some(&b'\'') {
362 handle_literal!(self, false, ParserState::Apostrophe { quoted: false })
363 } else {
364 handle_literal!(self, false, ParserState::QuotedLiteral)
365 }
366 }
367 ParserState::Default
368 if self.quote_mode == QuoteMode::QuotingRequired && b.is_ascii_alphabetic() =>
369 {
370 return Err(ParserError::IllegalCharacter(*b as char));
371 }
372 ParserState::Apostrophe { quoted } => {
373 self.start_idx -= 1;
374 if quoted {
375 handle_literal!(self, true, ParserState::QuotedLiteral)
376 } else {
377 handle_literal!(self, false, ParserState::Default)
378 }
379 }
380 _ => self.idx += 1,
381 }
382 }
383 match self.state {
384 ParserState::Placeholder => Err(ParserError::UnclosedPlaceholder),
385 ParserState::QuotedLiteral => Err(ParserError::UnclosedQuotedLiteral),
386 ParserState::Apostrophe { .. } => unreachable!(),
387 ParserState::Default => {
388 let range = self.start_idx..self.len;
389 if !range.is_empty() {
390 self.start_idx = self.len;
391 #[allow(clippy::indexing_slicing)]
392 // TODO(#1668) Clippy exceptions need docs or fixing.
393 Ok(Some(ParsedPatternItem::Literal {
394 content: Cow::Borrowed(&self.input[range]),
395 quoted: false,
396 }))
397 } else {
398 Ok(None)
399 }
400 }
401 }
402 }
403
404 fn advance_state(&mut self, idx: usize, next_state: ParserState) -> core::ops::Range<usize> {
405 let range = self.start_idx..idx;
406 self.idx = idx + 1;
407 self.start_idx = self.idx;
408 self.state = next_state;
409 range
410 }
411
412 /// Mutates this parser and collects all [`ParsedPatternItem`]s into a vector.
413 pub fn try_collect_into_vec(
414 mut self,
415 ) -> Result<Vec<ParsedPatternItem<'p, P>>, ParserError<<P as FromStr>::Err>>
416 where
417 P: FromStr,
418 P::Err: Debug,
419 {
420 let mut result = vec![];
421 while let Some(token) = self.try_next()? {
422 result.push(token);
423 }
424 Ok(result)
425 }
426}
427
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the expected `Literal` token for `content`.
    fn literal<P>(content: &'static str, quoted: bool) -> ParsedPatternItem<'static, P> {
        ParsedPatternItem::Literal {
            content: content.into(),
            quoted,
        }
    }

    /// Well-formed patterns parsed with `QuoteMode::QuotingSupported`.
    #[test]
    fn pattern_parse_placeholders() {
        let samples = vec![
            ("{0}", vec![ParsedPatternItem::Placeholder(0)]),
            (
                "{0}{1}",
                vec![
                    ParsedPatternItem::Placeholder(0),
                    ParsedPatternItem::Placeholder(1),
                ],
            ),
            (
                "{0} 'at' {1}",
                vec![
                    ParsedPatternItem::Placeholder(0),
                    literal(" ", false),
                    literal("at", true),
                    literal(" ", false),
                    ParsedPatternItem::Placeholder(1),
                ],
            ),
            (
                "{0}'at'{1}",
                vec![
                    ParsedPatternItem::Placeholder(0),
                    literal("at", true),
                    ParsedPatternItem::Placeholder(1),
                ],
            ),
            (
                "'{0}' 'at' '{1}'",
                vec![
                    literal("{0}", true),
                    literal(" ", false),
                    literal("at", true),
                    literal(" ", false),
                    literal("{1}", true),
                ],
            ),
            (
                "'PRE' {0} 'and' {1} 'POST'",
                vec![
                    literal("PRE", true),
                    literal(" ", false),
                    ParsedPatternItem::Placeholder(0),
                    literal(" ", false),
                    literal("and", true),
                    literal(" ", false),
                    ParsedPatternItem::Placeholder(1),
                    literal(" ", false),
                    literal("POST", true),
                ],
            ),
            (
                "{0} o''clock and 'o''clock'",
                vec![
                    ParsedPatternItem::Placeholder(0),
                    literal(" o", false),
                    literal("'", false),
                    literal("clock and ", false),
                    literal("o", true),
                    literal("'", true),
                    literal("clock", true),
                ],
            ),
        ];

        for (input, expected) in samples {
            let parser = Parser::new(input, QuoteMode::QuotingSupported.into());
            let result = parser
                .try_collect_into_vec()
                .unwrap_or_else(|e| panic!("Failed to parse pattern {:?}: {:?}", input, e));
            assert_eq!(
                result.as_slice(),
                expected,
                "unexpected tokens for pattern {:?}",
                input
            );
        }
    }

    /// Malformed patterns parsed with `QuoteMode::QuotingRequired`.
    #[test]
    fn pattern_parse_broken_placeholders() {
        let broken: Vec<(_, Option<ParserError<core::num::ParseIntError>>)> = vec![
            ("{", Some(ParserError::UnclosedPlaceholder)),
            ("{0", Some(ParserError::UnclosedPlaceholder)),
            ("{01", Some(ParserError::UnclosedPlaceholder)),
            (
                "{date}",
                // This should be:
                // ```
                // ParserError::InvalidPlaceholder(
                //     ParseIntError {
                //         kind: core::num::IntErrorKind::InvalidDigit
                //     }
                // ),
                // ```
                // Pending: https://github.com/rust-lang/rust/issues/22639
                //
                // Once that is fixed, we can stop using an `Option` here.
                None,
            ),
            ("{date} 'days'", None),
            ("'{00}", Some(ParserError::UnclosedQuotedLiteral)),
            ("d", Some(ParserError::IllegalCharacter('d'))),
        ];

        for (input, error) in broken {
            let parser = Parser::<usize>::new(input, QuoteMode::QuotingRequired.into());
            let result = parser.try_collect_into_vec();
            if let Some(error) = error {
                assert_eq!(
                    result.expect_err("Should have failed."),
                    error,
                    "unexpected error for pattern {:?}",
                    input
                );
            } else {
                assert!(
                    result.is_err(),
                    "pattern {:?} should have failed to parse",
                    input
                );
            }
        }
    }
}