icu_locale_core/
langid.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use core::cmp::Ordering;
6#[cfg(feature = "alloc")]
7use core::str::FromStr;
8
9use crate::parser;
10use crate::subtags;
11use crate::ParseError;
12#[cfg(feature = "alloc")]
13use alloc::borrow::Cow;
14
15/// A core struct representing a [`Unicode BCP47 Language Identifier`].
16///
17/// # Ordering
18///
19/// This type deliberately does not implement `Ord` or `PartialOrd` because there are
20/// multiple possible orderings. Depending on your use case, two orderings are available:
21///
22/// 1. A string ordering, suitable for stable serialization: [`LanguageIdentifier::strict_cmp`]
23/// 2. A struct ordering, suitable for use with a BTreeSet: [`LanguageIdentifier::total_cmp`]
24///
25/// See issue: <https://github.com/unicode-org/icu4x/issues/1215>
26///
27/// # Parsing
28///
29/// Unicode recognizes three levels of standard conformance for any language identifier:
30///
31///  * *well-formed* - syntactically correct
32///  * *valid* - well-formed and only uses registered language, region, script and variant subtags...
33///  * *canonical* - valid and no deprecated codes or structure.
34///
35/// At the moment parsing normalizes a well-formed language identifier converting
36/// `_` separators to `-` and adjusting casing to conform to the Unicode standard.
37///
38/// Any syntactically invalid subtags will cause the parsing to fail with an error.
39///
/// This operation normalizes syntax to be well-formed. No legacy subtag replacements are performed.
41/// For validation and canonicalization, see `LocaleCanonicalizer`.
42///
43/// # Examples
44///
45/// Simple example:
46///
47/// ```
48/// use icu::locale::{
49///     langid,
50///     subtags::{language, region},
51/// };
52///
53/// let li = langid!("en-US");
54///
55/// assert_eq!(li.language, language!("en"));
56/// assert_eq!(li.script, None);
57/// assert_eq!(li.region, Some(region!("US")));
58/// assert_eq!(li.variants.len(), 0);
59/// ```
60///
61/// More complex example:
62///
63/// ```
64/// use icu::locale::{
65///     langid,
66///     subtags::{language, region, script, variant},
67/// };
68///
69/// let li = langid!("eN-latn-Us-Valencia");
70///
71/// assert_eq!(li.language, language!("en"));
72/// assert_eq!(li.script, Some(script!("Latn")));
73/// assert_eq!(li.region, Some(region!("US")));
74/// assert_eq!(li.variants.get(0), Some(&variant!("valencia")));
75/// ```
76///
77/// [`Unicode BCP47 Language Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_language_identifier
78#[derive(PartialEq, Eq, Clone, Hash)] // no Ord or PartialOrd: see docs
79#[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro)
80pub struct LanguageIdentifier {
81    /// Language subtag of the language identifier.
82    pub language: subtags::Language,
83    /// Script subtag of the language identifier.
84    pub script: Option<subtags::Script>,
85    /// Region subtag of the language identifier.
86    pub region: Option<subtags::Region>,
87    /// Variant subtags of the language identifier.
88    pub variants: subtags::Variants,
89}
90
91impl LanguageIdentifier {
92    /// The unknown language identifier "und".
93    pub const UNKNOWN: Self = crate::langid!("und");
94
95    /// A constructor which takes a utf8 slice, parses it and
96    /// produces a well-formed [`LanguageIdentifier`].
97    ///
98    /// # Examples
99    ///
100    /// ```
101    /// use icu::locale::LanguageIdentifier;
102    ///
103    /// LanguageIdentifier::try_from_str("en-US").expect("Parsing failed");
104    /// ```
105    #[inline]
106    #[cfg(feature = "alloc")]
107    pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
108        Self::try_from_utf8(s.as_bytes())
109    }
110
111    /// See [`Self::try_from_str`]
112    #[cfg(feature = "alloc")]
113    pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
114        crate::parser::parse_language_identifier(code_units, parser::ParserMode::LanguageIdentifier)
115    }
116
117    #[doc(hidden)] // macro use
118    #[allow(clippy::type_complexity)]
119    // The return type should be `Result<Self, ParseError>` once the `const_precise_live_drops`
120    // is stabilized ([rust-lang#73255](https://github.com/rust-lang/rust/issues/73255)).
121    pub const fn try_from_utf8_with_single_variant(
122        code_units: &[u8],
123    ) -> Result<
124        (
125            subtags::Language,
126            Option<subtags::Script>,
127            Option<subtags::Region>,
128            Option<subtags::Variant>,
129        ),
130        ParseError,
131    > {
132        crate::parser::parse_language_identifier_with_single_variant(
133            code_units,
134            parser::ParserMode::LanguageIdentifier,
135        )
136    }
137
138    /// A constructor which takes a utf8 slice which may contain extension keys,
139    /// parses it and produces a well-formed [`LanguageIdentifier`].
140    ///
141    /// # Examples
142    ///
143    /// ```
144    /// use icu::locale::{langid, LanguageIdentifier};
145    ///
146    /// let li = LanguageIdentifier::try_from_locale_bytes(b"en-US-x-posix")
147    ///     .expect("Parsing failed.");
148    ///
149    /// assert_eq!(li, langid!("en-US"));
150    /// ```
151    ///
152    /// This method should be used for input that may be a locale identifier.
153    /// All extensions will be lost.
154    #[cfg(feature = "alloc")]
155    pub fn try_from_locale_bytes(v: &[u8]) -> Result<Self, ParseError> {
156        parser::parse_language_identifier(v, parser::ParserMode::Locale)
157    }
158
159    /// Whether this [`LanguageIdentifier`] equals [`LanguageIdentifier::UNKNOWN`].
160    pub const fn is_unknown(&self) -> bool {
161        self.language.is_unknown()
162            && self.script.is_none()
163            && self.region.is_none()
164            && self.variants.is_empty()
165    }
166
167    /// Normalize the language identifier (operating on UTF-8 formatted byte slices)
168    ///
169    /// This operation will normalize casing and the separator.
170    ///
171    /// # Examples
172    ///
173    /// ```
174    /// use icu::locale::LanguageIdentifier;
175    ///
176    /// assert_eq!(
177    ///     LanguageIdentifier::normalize("pL-latn-pl").as_deref(),
178    ///     Ok("pl-Latn-PL")
179    /// );
180    /// ```
181    #[cfg(feature = "alloc")]
182    pub fn normalize_utf8(input: &[u8]) -> Result<Cow<str>, ParseError> {
183        let lang_id = Self::try_from_utf8(input)?;
184        Ok(writeable::to_string_or_borrow(&lang_id, input))
185    }
186
187    /// Normalize the language identifier (operating on strings)
188    ///
189    /// This operation will normalize casing and the separator.
190    ///
191    /// # Examples
192    ///
193    /// ```
194    /// use icu::locale::LanguageIdentifier;
195    ///
196    /// assert_eq!(
197    ///     LanguageIdentifier::normalize("pL-latn-pl").as_deref(),
198    ///     Ok("pl-Latn-PL")
199    /// );
200    /// ```
201    #[cfg(feature = "alloc")]
202    pub fn normalize(input: &str) -> Result<Cow<str>, ParseError> {
203        Self::normalize_utf8(input.as_bytes())
204    }
205
206    /// Compare this [`LanguageIdentifier`] with BCP-47 bytes.
207    ///
208    /// The return value is equivalent to what would happen if you first converted this
209    /// [`LanguageIdentifier`] to a BCP-47 string and then performed a byte comparison.
210    ///
211    /// This function is case-sensitive and results in a *total order*, so it is appropriate for
212    /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
213    ///
214    /// # Examples
215    ///
216    /// Sorting a list of langids with this method requires converting one of them to a string:
217    ///
218    /// ```
219    /// use icu::locale::LanguageIdentifier;
220    /// use std::cmp::Ordering;
221    /// use writeable::Writeable;
222    ///
223    /// // Random input order:
224    /// let bcp47_strings: &[&str] = &[
225    ///     "ar-Latn",
226    ///     "zh-Hant-TW",
227    ///     "zh-TW",
228    ///     "und-fonipa",
229    ///     "zh-Hant",
230    ///     "ar-SA",
231    /// ];
232    ///
233    /// let mut langids = bcp47_strings
234    ///     .iter()
235    ///     .map(|s| s.parse().unwrap())
236    ///     .collect::<Vec<LanguageIdentifier>>();
237    /// langids.sort_by(|a, b| {
238    ///     let b = b.write_to_string();
239    ///     a.strict_cmp(b.as_bytes())
240    /// });
241    /// let strict_cmp_strings = langids
242    ///     .iter()
243    ///     .map(|l| l.to_string())
244    ///     .collect::<Vec<String>>();
245    ///
246    /// // Output ordering, sorted alphabetically
247    /// let expected_ordering: &[&str] = &[
248    ///     "ar-Latn",
249    ///     "ar-SA",
250    ///     "und-fonipa",
251    ///     "zh-Hant",
252    ///     "zh-Hant-TW",
253    ///     "zh-TW",
254    /// ];
255    ///
256    /// assert_eq!(expected_ordering, strict_cmp_strings);
257    /// ```
258    pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
259        writeable::cmp_utf8(self, other)
260    }
261
262    pub(crate) fn as_tuple(
263        &self,
264    ) -> (
265        subtags::Language,
266        Option<subtags::Script>,
267        Option<subtags::Region>,
268        &subtags::Variants,
269    ) {
270        (self.language, self.script, self.region, &self.variants)
271    }
272
273    /// Compare this [`LanguageIdentifier`] with another [`LanguageIdentifier`] field-by-field.
274    /// The result is a total ordering sufficient for use in a [`BTreeSet`].
275    ///
276    /// Unlike [`LanguageIdentifier::strict_cmp`], the ordering may or may not be equivalent
277    /// to string ordering, and it may or may not be stable across ICU4X releases.
278    ///
279    /// # Examples
280    ///
281    /// This method returns a nonsensical ordering derived from the fields of the struct:
282    ///
283    /// ```
284    /// use icu::locale::LanguageIdentifier;
285    /// use std::cmp::Ordering;
286    ///
287    /// // Input strings, sorted alphabetically
288    /// let bcp47_strings: &[&str] = &[
289    ///     "ar-Latn",
290    ///     "ar-SA",
291    ///     "und-fonipa",
292    ///     "zh-Hant",
293    ///     "zh-Hant-TW",
294    ///     "zh-TW",
295    /// ];
296    /// assert!(bcp47_strings.windows(2).all(|w| w[0] < w[1]));
297    ///
298    /// let mut langids = bcp47_strings
299    ///     .iter()
300    ///     .map(|s| s.parse().unwrap())
301    ///     .collect::<Vec<LanguageIdentifier>>();
302    /// langids.sort_by(LanguageIdentifier::total_cmp);
303    /// let total_cmp_strings = langids
304    ///     .iter()
305    ///     .map(|l| l.to_string())
306    ///     .collect::<Vec<String>>();
307    ///
308    /// // Output ordering, sorted arbitrarily
309    /// let expected_ordering: &[&str] = &[
310    ///     "ar-SA",
311    ///     "ar-Latn",
312    ///     "und-fonipa",
313    ///     "zh-TW",
314    ///     "zh-Hant",
315    ///     "zh-Hant-TW",
316    /// ];
317    ///
318    /// assert_eq!(expected_ordering, total_cmp_strings);
319    /// ```
320    ///
321    /// Use a wrapper to add a [`LanguageIdentifier`] to a [`BTreeSet`]:
322    ///
323    /// ```no_run
324    /// use icu::locale::LanguageIdentifier;
325    /// use std::cmp::Ordering;
326    /// use std::collections::BTreeSet;
327    ///
328    /// #[derive(PartialEq, Eq)]
329    /// struct LanguageIdentifierTotalOrd(LanguageIdentifier);
330    ///
331    /// impl Ord for LanguageIdentifierTotalOrd {
332    ///     fn cmp(&self, other: &Self) -> Ordering {
333    ///         self.0.total_cmp(&other.0)
334    ///     }
335    /// }
336    ///
337    /// impl PartialOrd for LanguageIdentifierTotalOrd {
338    ///     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
339    ///         Some(self.cmp(other))
340    ///     }
341    /// }
342    ///
343    /// let _: BTreeSet<LanguageIdentifierTotalOrd> = unimplemented!();
344    /// ```
345    ///
346    /// [`BTreeSet`]: alloc::collections::BTreeSet
347    pub fn total_cmp(&self, other: &Self) -> Ordering {
348        self.as_tuple().cmp(&other.as_tuple())
349    }
350
351    /// Compare this `LanguageIdentifier` with a potentially unnormalized BCP-47 string.
352    ///
353    /// The return value is equivalent to what would happen if you first parsed the
354    /// BCP-47 string to a `LanguageIdentifier` and then performed a structural comparison.
355    ///
356    /// # Examples
357    ///
358    /// ```
359    /// use icu::locale::LanguageIdentifier;
360    ///
361    /// let bcp47_strings: &[&str] = &[
362    ///     "pl-LaTn-pL",
363    ///     "uNd",
364    ///     "UnD-adlm",
365    ///     "uNd-GB",
366    ///     "UND-FONIPA",
367    ///     "ZH",
368    /// ];
369    ///
370    /// for a in bcp47_strings {
371    ///     assert!(a.parse::<LanguageIdentifier>().unwrap().normalizing_eq(a));
372    /// }
373    /// ```
374    pub fn normalizing_eq(&self, other: &str) -> bool {
375        macro_rules! subtag_matches {
376            ($T:ty, $iter:ident, $expected:expr) => {
377                $iter
378                    .next()
379                    .map(|b| <$T>::try_from_utf8(b) == Ok($expected))
380                    .unwrap_or(false)
381            };
382        }
383
384        let mut iter = parser::SubtagIterator::new(other.as_bytes());
385        if !subtag_matches!(subtags::Language, iter, self.language) {
386            return false;
387        }
388        if let Some(ref script) = self.script {
389            if !subtag_matches!(subtags::Script, iter, *script) {
390                return false;
391            }
392        }
393        if let Some(ref region) = self.region {
394            if !subtag_matches!(subtags::Region, iter, *region) {
395                return false;
396            }
397        }
398        for variant in self.variants.iter() {
399            if !subtag_matches!(subtags::Variant, iter, *variant) {
400                return false;
401            }
402        }
403        iter.next().is_none()
404    }
405
406    pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
407    where
408        F: FnMut(&str) -> Result<(), E>,
409    {
410        f(self.language.as_str())?;
411        if let Some(ref script) = self.script {
412            f(script.as_str())?;
413        }
414        if let Some(ref region) = self.region {
415            f(region.as_str())?;
416        }
417        for variant in self.variants.iter() {
418            f(variant.as_str())?;
419        }
420        Ok(())
421    }
422
423    /// Executes `f` on each subtag string of this `LanguageIdentifier`, with every string in
424    /// lowercase ascii form.
425    ///
426    /// The default normalization of language identifiers uses titlecase scripts and uppercase
427    /// regions. However, this differs from [RFC6497 (BCP 47 Extension T)], which specifies:
428    ///
429    /// > _The canonical form for all subtags in the extension is lowercase, with the fields
430    /// > ordered by the separators, alphabetically._
431    ///
432    /// Hence, this method is used inside [`Transform Extensions`] to be able to get the correct
433    /// normalization of the language identifier.
434    ///
435    /// As an example, the canonical form of locale **EN-LATN-CA-T-EN-LATN-CA** is
436    /// **en-Latn-CA-t-en-latn-ca**, with the script and region parts lowercased inside T extensions,
437    /// but titlecased and uppercased outside T extensions respectively.
438    ///
439    /// [RFC6497 (BCP 47 Extension T)]: https://www.ietf.org/rfc/rfc6497.txt
440    /// [`Transform extensions`]: crate::extensions::transform
441    pub(crate) fn for_each_subtag_str_lowercased<E, F>(&self, f: &mut F) -> Result<(), E>
442    where
443        F: FnMut(&str) -> Result<(), E>,
444    {
445        f(self.language.as_str())?;
446        if let Some(ref script) = self.script {
447            f(script.to_tinystr().to_ascii_lowercase().as_str())?;
448        }
449        if let Some(ref region) = self.region {
450            f(region.to_tinystr().to_ascii_lowercase().as_str())?;
451        }
452        for variant in self.variants.iter() {
453            f(variant.as_str())?;
454        }
455        Ok(())
456    }
457
458    /// Writes this `LanguageIdentifier` to a sink, replacing uppercase ascii chars with
459    /// lowercase ascii chars.
460    ///
461    /// The default normalization of language identifiers uses titlecase scripts and uppercase
462    /// regions. However, this differs from [RFC6497 (BCP 47 Extension T)], which specifies:
463    ///
464    /// > _The canonical form for all subtags in the extension is lowercase, with the fields
465    /// > ordered by the separators, alphabetically._
466    ///
467    /// Hence, this method is used inside [`Transform Extensions`] to be able to get the correct
468    /// normalization of the language identifier.
469    ///
470    /// As an example, the canonical form of locale **EN-LATN-CA-T-EN-LATN-CA** is
471    /// **en-Latn-CA-t-en-latn-ca**, with the script and region parts lowercased inside T extensions,
472    /// but titlecased and uppercased outside T extensions respectively.
473    ///
474    /// [RFC6497 (BCP 47 Extension T)]: https://www.ietf.org/rfc/rfc6497.txt
475    /// [`Transform extensions`]: crate::extensions::transform
476    pub(crate) fn write_lowercased_to<W: core::fmt::Write + ?Sized>(
477        &self,
478        sink: &mut W,
479    ) -> core::fmt::Result {
480        let mut initial = true;
481        self.for_each_subtag_str_lowercased(&mut |subtag| {
482            if initial {
483                initial = false;
484            } else {
485                sink.write_char('-')?;
486            }
487            sink.write_str(subtag)
488        })
489    }
490}
491
492impl core::fmt::Debug for LanguageIdentifier {
493    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
494        core::fmt::Display::fmt(&self, f)
495    }
496}
497
#[cfg(feature = "alloc")]
impl FromStr for LanguageIdentifier {
    type Err = ParseError;

    /// Parses `s` exactly like [`LanguageIdentifier::try_from_str`], enabling `str::parse`.
    #[inline]
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Self::try_from_utf8(s.as_bytes())
    }
}
507
508impl_writeable_for_each_subtag_str_no_test!(LanguageIdentifier, selff, selff.script.is_none() && selff.region.is_none() && selff.variants.is_empty() => selff.language.write_to_string());
509
#[test]
fn test_writeable() {
    use writeable::assert_writeable_eq;

    // Every tag below is already normalized, so writing the parsed identifier must
    // reproduce the input verbatim.
    let parse = |tag: &str| tag.parse::<LanguageIdentifier>().unwrap();

    assert_writeable_eq!(LanguageIdentifier::UNKNOWN, "und");
    assert_writeable_eq!(parse("und-001"), "und-001");
    assert_writeable_eq!(parse("und-Mymr"), "und-Mymr");
    assert_writeable_eq!(parse("my-Mymr-MM"), "my-Mymr-MM");
    assert_writeable_eq!(parse("my-Mymr-MM-posix"), "my-Mymr-MM-posix");
    assert_writeable_eq!(parse("zh-macos-posix"), "zh-macos-posix");
}
532
533/// # Examples
534///
535/// ```
536/// use icu::locale::{langid, subtags::language, LanguageIdentifier};
537///
538/// assert_eq!(LanguageIdentifier::from(language!("en")), langid!("en"));
539/// ```
540impl From<subtags::Language> for LanguageIdentifier {
541    fn from(language: subtags::Language) -> Self {
542        Self {
543            language,
544            script: None,
545            region: None,
546            variants: subtags::Variants::new(),
547        }
548    }
549}
550
551/// # Examples
552///
553/// ```
554/// use icu::locale::{langid, subtags::script, LanguageIdentifier};
555///
556/// assert_eq!(
557///     LanguageIdentifier::from(Some(script!("latn"))),
558///     langid!("und-Latn")
559/// );
560/// ```
561impl From<Option<subtags::Script>> for LanguageIdentifier {
562    fn from(script: Option<subtags::Script>) -> Self {
563        Self {
564            language: subtags::Language::UNKNOWN,
565            script,
566            region: None,
567            variants: subtags::Variants::new(),
568        }
569    }
570}
571
572/// # Examples
573///
574/// ```
575/// use icu::locale::{langid, subtags::region, LanguageIdentifier};
576///
577/// assert_eq!(
578///     LanguageIdentifier::from(Some(region!("US"))),
579///     langid!("und-US")
580/// );
581/// ```
582impl From<Option<subtags::Region>> for LanguageIdentifier {
583    fn from(region: Option<subtags::Region>) -> Self {
584        Self {
585            language: subtags::Language::UNKNOWN,
586            script: None,
587            region,
588            variants: subtags::Variants::new(),
589        }
590    }
591}
592
593/// Convert from an LSR tuple to a [`LanguageIdentifier`].
594///
595/// # Examples
596///
597/// ```
598/// use icu::locale::{
599///     langid,
600///     subtags::{language, region, script},
601///     LanguageIdentifier,
602/// };
603///
604/// let lang = language!("en");
605/// let script = script!("Latn");
606/// let region = region!("US");
607/// assert_eq!(
608///     LanguageIdentifier::from((lang, Some(script), Some(region))),
609///     langid!("en-Latn-US")
610/// );
611/// ```
612impl
613    From<(
614        subtags::Language,
615        Option<subtags::Script>,
616        Option<subtags::Region>,
617    )> for LanguageIdentifier
618{
619    fn from(
620        lsr: (
621            subtags::Language,
622            Option<subtags::Script>,
623            Option<subtags::Region>,
624        ),
625    ) -> Self {
626        Self {
627            language: lsr.0,
628            script: lsr.1,
629            region: lsr.2,
630            variants: subtags::Variants::new(),
631        }
632    }
633}
634
635/// Convert from a [`LanguageIdentifier`] to an LSR tuple.
636///
637/// # Examples
638///
639/// ```
640/// use icu::locale::{
641///     langid,
642///     subtags::{language, region, script},
643/// };
644///
645/// let lid = langid!("en-Latn-US");
646/// let (lang, script, region) = (&lid).into();
647///
648/// assert_eq!(lang, language!("en"));
649/// assert_eq!(script, Some(script!("Latn")));
650/// assert_eq!(region, Some(region!("US")));
651/// ```
652impl From<&LanguageIdentifier>
653    for (
654        subtags::Language,
655        Option<subtags::Script>,
656        Option<subtags::Region>,
657    )
658{
659    fn from(langid: &LanguageIdentifier) -> Self {
660        (langid.language, langid.script, langid.region)
661    }
662}