icu_locale_core/
locale.rs

// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::parser::*;
6use crate::subtags::Subtag;
7use crate::{extensions, subtags, LanguageIdentifier};
8#[cfg(feature = "alloc")]
9use alloc::borrow::Cow;
10use core::cmp::Ordering;
11#[cfg(feature = "alloc")]
12use core::str::FromStr;
13
/// A core struct representing a [`Unicode Locale Identifier`].
///
/// A locale is made of two parts:
///  * Unicode Language Identifier
///  * A set of Unicode Extensions
///
/// [`Locale`] exposes all of the same fields and methods as [`LanguageIdentifier`], and
/// on top of that is able to parse, manipulate and serialize unicode extension fields.
///
/// # Ordering
///
/// This type deliberately does not implement `Ord` or `PartialOrd` because there are
/// multiple possible orderings. Depending on your use case, two orderings are available:
///
/// 1. A string ordering, suitable for stable serialization: [`Locale::strict_cmp`]
/// 2. A struct ordering, suitable for use with a BTreeSet: [`Locale::total_cmp`]
///
/// See issue: <https://github.com/unicode-org/icu4x/issues/1215>
///
/// # Parsing
///
/// Unicode recognizes three levels of standard conformance for a locale:
///
///  * *well-formed* - syntactically correct
///  * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types...
///  * *canonical* - valid and no deprecated codes or structure.
///
/// Any syntactically invalid subtags will cause the parsing to fail with an error.
///
/// This operation normalizes syntax to be well-formed. No legacy subtag replacements are performed.
/// For validation and canonicalization, see `LocaleCanonicalizer`.
///
/// ICU4X's Locale parsing does not allow for non-BCP-47-compatible locales [allowed by UTS 35 for backwards compatibility][tr35-bcp].
/// Furthermore, it currently does not allow for language tags to have more than three characters.
///
/// # Examples
///
/// Simple example:
///
/// ```
/// use icu::locale::{
///     extensions::unicode::{key, value},
///     locale,
///     subtags::{language, region},
/// };
///
/// let loc = locale!("en-US-u-ca-buddhist");
///
/// assert_eq!(loc.id.language, language!("en"));
/// assert_eq!(loc.id.script, None);
/// assert_eq!(loc.id.region, Some(region!("US")));
/// assert_eq!(loc.id.variants.len(), 0);
/// assert_eq!(
///     loc.extensions.unicode.keywords.get(&key!("ca")),
///     Some(&value!("buddhist"))
/// );
/// ```
///
/// More complex example:
///
/// ```
/// use icu::locale::{subtags::*, Locale};
///
/// let loc: Locale = "eN-latn-Us-Valencia-u-hC-H12"
///     .parse()
///     .expect("Failed to parse.");
///
/// assert_eq!(loc.id.language, "en".parse::<Language>().unwrap());
/// assert_eq!(loc.id.script, "Latn".parse::<Script>().ok());
/// assert_eq!(loc.id.region, "US".parse::<Region>().ok());
/// assert_eq!(
///     loc.id.variants.get(0),
///     "valencia".parse::<Variant>().ok().as_ref()
/// );
/// ```
///
/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_locale_identifier
/// [tr35-bcp]: https://unicode.org/reports/tr35/#BCP_47_Conformance
#[derive(PartialEq, Eq, Clone, Hash)] // no Ord or PartialOrd: see docs
#[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro)
pub struct Locale {
    /// The basic language/script/region components in the locale identifier along with any variants.
    pub id: LanguageIdentifier,
    /// Any extensions present in the locale identifier.
    pub extensions: extensions::Extensions,
}
100
/// Pins the in-memory size of the locale building blocks so that accidental
/// layout regressions are caught.
#[test]
// Several of the measured types hold a `Vec` (and the expected values are multiples of
// 8 bytes), so the numbers below are only meaningful with 8-byte pointers. Skip the test
// on other pointer widths instead of failing spuriously.
#[cfg(target_pointer_width = "64")]
fn test_sizes() {
    use core::mem::size_of;

    // Language identifier and its subtags.
    assert_eq!(size_of::<subtags::Language>(), 3);
    assert_eq!(size_of::<subtags::Script>(), 4);
    assert_eq!(size_of::<subtags::Region>(), 3);
    assert_eq!(size_of::<subtags::Variant>(), 8);
    assert_eq!(size_of::<subtags::Variants>(), 16);
    assert_eq!(size_of::<LanguageIdentifier>(), 32);

    // Transform extension.
    assert_eq!(size_of::<extensions::transform::Transform>(), 56);
    assert_eq!(size_of::<Option<LanguageIdentifier>>(), 32);
    assert_eq!(size_of::<extensions::transform::Fields>(), 24);

    // Unicode, other and private-use extensions, and the aggregate.
    assert_eq!(size_of::<extensions::unicode::Attributes>(), 16);
    assert_eq!(size_of::<extensions::unicode::Keywords>(), 24);
    assert_eq!(size_of::<Vec<extensions::other::Other>>(), 24);
    assert_eq!(size_of::<extensions::private::Private>(), 16);
    assert_eq!(size_of::<extensions::Extensions>(), 136);

    // The full locale: identifier plus extensions.
    assert_eq!(size_of::<Locale>(), 168);
}
122
123impl Locale {
    /// The unknown locale "und".
    ///
    /// This is a constant built with the [`locale!`](crate::locale) macro; it
    /// serializes as the string `und`.
    pub const UNKNOWN: Self = crate::locale!("und");
126
127    /// A constructor which takes a utf8 slice, parses it and
128    /// produces a well-formed [`Locale`].
129    ///
130    /// # Examples
131    ///
132    /// ```
133    /// use icu::locale::Locale;
134    ///
135    /// Locale::try_from_str("en-US-u-hc-h12").unwrap();
136    /// ```
137    #[inline]
138    #[cfg(feature = "alloc")]
139    pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
140        Self::try_from_utf8(s.as_bytes())
141    }
142
143    /// See [`Self::try_from_str`]
144    #[cfg(feature = "alloc")]
145    pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
146        parse_locale(code_units)
147    }
148
149    /// Normalize the locale (operating on UTF-8 formatted byte slices)
150    ///
151    /// This operation will normalize casing and the separator.
152    ///
153    /// # Examples
154    ///
155    /// ```
156    /// use icu::locale::Locale;
157    ///
158    /// assert_eq!(
159    ///     Locale::normalize_utf8(b"pL-latn-pl-U-HC-H12").as_deref(),
160    ///     Ok("pl-Latn-PL-u-hc-h12")
161    /// );
162    /// ```
163    #[cfg(feature = "alloc")]
164    pub fn normalize_utf8(input: &[u8]) -> Result<Cow<str>, ParseError> {
165        let locale = Self::try_from_utf8(input)?;
166        Ok(writeable::to_string_or_borrow(&locale, input))
167    }
168
169    /// Normalize the locale (operating on strings)
170    ///
171    /// This operation will normalize casing and the separator.
172    ///
173    /// # Examples
174    ///
175    /// ```
176    /// use icu::locale::Locale;
177    ///
178    /// assert_eq!(
179    ///     Locale::normalize("pL-latn-pl-U-HC-H12").as_deref(),
180    ///     Ok("pl-Latn-PL-u-hc-h12")
181    /// );
182    /// ```
183    #[cfg(feature = "alloc")]
184    pub fn normalize(input: &str) -> Result<Cow<str>, ParseError> {
185        Self::normalize_utf8(input.as_bytes())
186    }
187
188    /// Compare this [`Locale`] with BCP-47 bytes.
189    ///
190    /// The return value is equivalent to what would happen if you first converted this
191    /// [`Locale`] to a BCP-47 string and then performed a byte comparison.
192    ///
193    /// This function is case-sensitive and results in a *total order*, so it is appropriate for
194    /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
195    ///
196    /// # Examples
197    ///
198    /// Sorting a list of locales with this method requires converting one of them to a string:
199    ///
200    /// ```
201    /// use icu::locale::Locale;
202    /// use std::cmp::Ordering;
203    /// use writeable::Writeable;
204    ///
205    /// // Random input order:
206    /// let bcp47_strings: &[&str] = &[
207    ///     "und-u-ca-hebrew",
208    ///     "ar-Latn",
209    ///     "zh-Hant-TW",
210    ///     "zh-TW",
211    ///     "und-fonipa",
212    ///     "zh-Hant",
213    ///     "ar-SA",
214    /// ];
215    ///
216    /// let mut locales = bcp47_strings
217    ///     .iter()
218    ///     .map(|s| s.parse().unwrap())
219    ///     .collect::<Vec<Locale>>();
220    /// locales.sort_by(|a, b| {
221    ///     let b = b.write_to_string();
222    ///     a.strict_cmp(b.as_bytes())
223    /// });
224    /// let strict_cmp_strings = locales
225    ///     .iter()
226    ///     .map(|l| l.to_string())
227    ///     .collect::<Vec<String>>();
228    ///
229    /// // Output ordering, sorted alphabetically
230    /// let expected_ordering: &[&str] = &[
231    ///     "ar-Latn",
232    ///     "ar-SA",
233    ///     "und-fonipa",
234    ///     "und-u-ca-hebrew",
235    ///     "zh-Hant",
236    ///     "zh-Hant-TW",
237    ///     "zh-TW",
238    /// ];
239    ///
240    /// assert_eq!(expected_ordering, strict_cmp_strings);
241    /// ```
242    pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
243        writeable::cmp_utf8(self, other)
244    }
245
246    #[allow(clippy::type_complexity)]
247    pub(crate) fn as_tuple(
248        &self,
249    ) -> (
250        (
251            subtags::Language,
252            Option<subtags::Script>,
253            Option<subtags::Region>,
254            &subtags::Variants,
255        ),
256        (
257            (
258                &extensions::unicode::Attributes,
259                &extensions::unicode::Keywords,
260            ),
261            (
262                Option<(
263                    subtags::Language,
264                    Option<subtags::Script>,
265                    Option<subtags::Region>,
266                    &subtags::Variants,
267                )>,
268                &extensions::transform::Fields,
269            ),
270            &extensions::private::Private,
271            &[extensions::other::Other],
272        ),
273    ) {
274        (self.id.as_tuple(), self.extensions.as_tuple())
275    }
276
277    /// Returns an ordering suitable for use in [`BTreeSet`].
278    ///
279    /// Unlike [`Locale::strict_cmp`], the ordering may or may not be equivalent
280    /// to string ordering, and it may or may not be stable across ICU4X releases.
281    ///
282    /// # Examples
283    ///
284    /// This method returns a nonsensical ordering derived from the fields of the struct:
285    ///
286    /// ```
287    /// use icu::locale::Locale;
288    /// use std::cmp::Ordering;
289    ///
290    /// // Input strings, sorted alphabetically
291    /// let bcp47_strings: &[&str] = &[
292    ///     "ar-Latn",
293    ///     "ar-SA",
294    ///     "und-fonipa",
295    ///     "und-u-ca-hebrew",
296    ///     "zh-Hant",
297    ///     "zh-Hant-TW",
298    ///     "zh-TW",
299    /// ];
300    /// assert!(bcp47_strings.windows(2).all(|w| w[0] < w[1]));
301    ///
302    /// let mut locales = bcp47_strings
303    ///     .iter()
304    ///     .map(|s| s.parse().unwrap())
305    ///     .collect::<Vec<Locale>>();
306    /// locales.sort_by(Locale::total_cmp);
307    /// let total_cmp_strings = locales
308    ///     .iter()
309    ///     .map(|l| l.to_string())
310    ///     .collect::<Vec<String>>();
311    ///
312    /// // Output ordering, sorted arbitrarily
313    /// let expected_ordering: &[&str] = &[
314    ///     "ar-SA",
315    ///     "ar-Latn",
316    ///     "und-u-ca-hebrew",
317    ///     "und-fonipa",
318    ///     "zh-TW",
319    ///     "zh-Hant",
320    ///     "zh-Hant-TW",
321    /// ];
322    ///
323    /// assert_eq!(expected_ordering, total_cmp_strings);
324    /// ```
325    ///
326    /// Use a wrapper to add a [`Locale`] to a [`BTreeSet`]:
327    ///
328    /// ```no_run
329    /// use icu::locale::Locale;
330    /// use std::cmp::Ordering;
331    /// use std::collections::BTreeSet;
332    ///
333    /// #[derive(PartialEq, Eq)]
334    /// struct LocaleTotalOrd(Locale);
335    ///
336    /// impl Ord for LocaleTotalOrd {
337    ///     fn cmp(&self, other: &Self) -> Ordering {
338    ///         self.0.total_cmp(&other.0)
339    ///     }
340    /// }
341    ///
342    /// impl PartialOrd for LocaleTotalOrd {
343    ///     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
344    ///         Some(self.cmp(other))
345    ///     }
346    /// }
347    ///
348    /// let _: BTreeSet<LocaleTotalOrd> = unimplemented!();
349    /// ```
350    ///
351    /// [`BTreeSet`]: alloc::collections::BTreeSet
352    pub fn total_cmp(&self, other: &Self) -> Ordering {
353        self.as_tuple().cmp(&other.as_tuple())
354    }
355
356    /// Compare this `Locale` with a potentially unnormalized BCP-47 string.
357    ///
358    /// The return value is equivalent to what would happen if you first parsed the
359    /// BCP-47 string to a `Locale` and then performed a structural comparison.
360    ///
361    /// # Examples
362    ///
363    /// ```
364    /// use icu::locale::Locale;
365    ///
366    /// let bcp47_strings: &[&str] = &[
367    ///     "pl-LaTn-pL",
368    ///     "uNd",
369    ///     "UND-FONIPA",
370    ///     "UnD-t-m0-TrUe",
371    ///     "uNd-u-CA-Japanese",
372    ///     "ZH",
373    /// ];
374    ///
375    /// for a in bcp47_strings {
376    ///     assert!(a.parse::<Locale>().unwrap().normalizing_eq(a));
377    /// }
378    /// ```
379    #[cfg(feature = "alloc")]
380    pub fn normalizing_eq(&self, other: &str) -> bool {
381        macro_rules! subtag_matches {
382            ($T:ty, $iter:ident, $expected:expr) => {
383                $iter
384                    .next()
385                    .map(|b| <$T>::try_from_utf8(b) == Ok($expected))
386                    .unwrap_or(false)
387            };
388        }
389
390        let mut iter = SubtagIterator::new(other.as_bytes());
391        if !subtag_matches!(subtags::Language, iter, self.id.language) {
392            return false;
393        }
394        if let Some(ref script) = self.id.script {
395            if !subtag_matches!(subtags::Script, iter, *script) {
396                return false;
397            }
398        }
399        if let Some(ref region) = self.id.region {
400            if !subtag_matches!(subtags::Region, iter, *region) {
401                return false;
402            }
403        }
404        for variant in self.id.variants.iter() {
405            if !subtag_matches!(subtags::Variant, iter, *variant) {
406                return false;
407            }
408        }
409        if !self.extensions.is_empty() {
410            match extensions::Extensions::try_from_iter(&mut iter) {
411                Ok(exts) => {
412                    if self.extensions != exts {
413                        return false;
414                    }
415                }
416                Err(_) => {
417                    return false;
418                }
419            }
420        }
421        iter.next().is_none()
422    }
423
424    #[doc(hidden)] // macro use
425    #[allow(clippy::type_complexity)]
426    pub const fn try_from_utf8_with_single_variant_single_keyword_unicode_extension(
427        code_units: &[u8],
428    ) -> Result<
429        (
430            subtags::Language,
431            Option<subtags::Script>,
432            Option<subtags::Region>,
433            Option<subtags::Variant>,
434            Option<(extensions::unicode::Key, Option<Subtag>)>,
435        ),
436        ParseError,
437    > {
438        parse_locale_with_single_variant_single_keyword_unicode_keyword_extension(
439            code_units,
440            ParserMode::Locale,
441        )
442    }
443
444    pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
445    where
446        F: FnMut(&str) -> Result<(), E>,
447    {
448        self.id.for_each_subtag_str(f)?;
449        self.extensions.for_each_subtag_str(f)?;
450        Ok(())
451    }
452}
453
/// Lets a [`Locale`] be obtained through `str::parse`; the string is parsed
/// from its UTF-8 bytes by [`Locale::try_from_utf8`].
#[cfg(feature = "alloc")]
impl FromStr for Locale {
    type Err = ParseError;

    #[inline]
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Self::try_from_utf8(s.as_bytes())
    }
}
463
464impl From<LanguageIdentifier> for Locale {
465    fn from(id: LanguageIdentifier) -> Self {
466        Self {
467            id,
468            extensions: extensions::Extensions::default(),
469        }
470    }
471}
472
473impl From<Locale> for LanguageIdentifier {
474    fn from(loc: Locale) -> Self {
475        loc.id
476    }
477}
478
479impl core::fmt::Debug for Locale {
480    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
481        writeable::Writeable::write_to(self, f)
482    }
483}
484
// NOTE(review): the macro is defined elsewhere in the crate. Presumably it generates the
// `Writeable`/`Display` impls for `Locale` on top of `for_each_subtag_str`, and the trailing
// `condition => expression` arm is a shortcut that writes only the language identifier
// when there are no extensions. Confirm against the macro definition.
impl_writeable_for_each_subtag_str_no_test!(Locale, selff, selff.extensions.is_empty() => selff.id.write_to_string());
486
/// Checks that locales write back out as the expected strings.
#[test]
fn test_writeable() {
    use writeable::assert_writeable_eq;

    // Already-normalized inputs: parsing and writing must give back the same text.
    const ROUND_TRIPS: &[&str] = &[
        "und-001",
        "und-Mymr",
        "my-Mymr-MM",
        "my-Mymr-MM-posix",
        "zh-macos-posix",
        "my-t-my-d0-zawgyi",
        "ar-SA-u-ca-islamic-civil",
        "en-001-x-foo-bar",
        "und-t-m0-true",
    ];

    assert_writeable_eq!(Locale::UNKNOWN, "und");
    for &expected in ROUND_TRIPS {
        assert_writeable_eq!(expected.parse::<Locale>().unwrap(), expected);
    }
}
516
517/// # Examples
518///
519/// ```
520/// use icu::locale::Locale;
521/// use icu::locale::{locale, subtags::language};
522///
523/// assert_eq!(Locale::from(language!("en")), locale!("en"));
524/// ```
525impl From<subtags::Language> for Locale {
526    fn from(language: subtags::Language) -> Self {
527        Self {
528            id: language.into(),
529            extensions: extensions::Extensions::new(),
530        }
531    }
532}
533
534/// # Examples
535///
536/// ```
537/// use icu::locale::Locale;
538/// use icu::locale::{locale, subtags::script};
539///
540/// assert_eq!(Locale::from(Some(script!("latn"))), locale!("und-Latn"));
541/// ```
542impl From<Option<subtags::Script>> for Locale {
543    fn from(script: Option<subtags::Script>) -> Self {
544        Self {
545            id: script.into(),
546            extensions: extensions::Extensions::new(),
547        }
548    }
549}
550
551/// # Examples
552///
553/// ```
554/// use icu::locale::Locale;
555/// use icu::locale::{locale, subtags::region};
556///
557/// assert_eq!(Locale::from(Some(region!("US"))), locale!("und-US"));
558/// ```
559impl From<Option<subtags::Region>> for Locale {
560    fn from(region: Option<subtags::Region>) -> Self {
561        Self {
562            id: region.into(),
563            extensions: extensions::Extensions::new(),
564        }
565    }
566}
567
568/// # Examples
569///
570/// ```
571/// use icu::locale::Locale;
572/// use icu::locale::{
573///     locale,
574///     subtags::{language, region, script},
575/// };
576///
577/// assert_eq!(
578///     Locale::from((
579///         language!("en"),
580///         Some(script!("Latn")),
581///         Some(region!("US"))
582///     )),
583///     locale!("en-Latn-US")
584/// );
585/// ```
586impl
587    From<(
588        subtags::Language,
589        Option<subtags::Script>,
590        Option<subtags::Region>,
591    )> for Locale
592{
593    fn from(
594        lsr: (
595            subtags::Language,
596            Option<subtags::Script>,
597            Option<subtags::Region>,
598        ),
599    ) -> Self {
600        Self {
601            id: lsr.into(),
602            extensions: extensions::Extensions::new(),
603        }
604    }
605}