icu_locale_core/langid.rs
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use core::cmp::Ordering;
#[cfg(feature = "alloc")]
use core::str::FromStr;

use crate::parser;
use crate::subtags;
use crate::ParseError;
#[cfg(feature = "alloc")]
use alloc::borrow::Cow;

/// A core struct representing a [`Unicode BCP47 Language Identifier`].
///
/// # Ordering
///
/// This type deliberately does not implement `Ord` or `PartialOrd` because there are
/// multiple possible orderings. Depending on your use case, two orderings are available:
///
/// 1. A string ordering, suitable for stable serialization: [`LanguageIdentifier::strict_cmp`]
/// 2. A struct ordering, suitable for use with a BTreeSet: [`LanguageIdentifier::total_cmp`]
///
/// See issue: <https://github.com/unicode-org/icu4x/issues/1215>
///
/// # Parsing
///
/// Unicode recognizes three levels of standard conformance for any language identifier:
///
/// * *well-formed* - syntactically correct
/// * *valid* - well-formed and only uses registered language, region, script and variant subtags...
/// * *canonical* - valid and no deprecated codes or structure.
///
/// At the moment, parsing normalizes a well-formed language identifier by converting
/// `_` separators to `-` and adjusting casing to conform to the Unicode standard.
///
/// Any syntactically invalid subtags will cause parsing to fail with an error.
///
/// This operation normalizes syntax to be well-formed; no legacy subtag replacement is
/// performed. For validation and canonicalization, see `LocaleCanonicalizer`.
///
/// # Examples
///
/// Simple example:
///
/// ```
/// use icu::locale::{
///     langid,
///     subtags::{language, region},
/// };
///
/// let li = langid!("en-US");
///
/// assert_eq!(li.language, language!("en"));
/// assert_eq!(li.script, None);
/// assert_eq!(li.region, Some(region!("US")));
/// assert_eq!(li.variants.len(), 0);
/// ```
///
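/// Parsing also accepts `_` as the subtag separator, normalizing it to `-` as
/// described above (illustrative example of that normalization):
///
/// ```
/// use icu::locale::{langid, LanguageIdentifier};
///
/// // "en_us" is normalized to "en-US" during parsing.
/// let li: LanguageIdentifier = "en_us".parse().expect("Parsing failed");
/// assert_eq!(li, langid!("en-US"));
/// ```
///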
/// More complex example:
///
/// ```
/// use icu::locale::{
///     langid,
///     subtags::{language, region, script, variant},
/// };
///
/// let li = langid!("eN-latn-Us-Valencia");
///
/// assert_eq!(li.language, language!("en"));
/// assert_eq!(li.script, Some(script!("Latn")));
/// assert_eq!(li.region, Some(region!("US")));
/// assert_eq!(li.variants.get(0), Some(&variant!("valencia")));
/// ```
///
/// [`Unicode BCP47 Language Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_language_identifier
#[derive(PartialEq, Eq, Clone, Hash)] // no Ord or PartialOrd: see docs
#[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro)
pub struct LanguageIdentifier {
    /// Language subtag of the language identifier.
    pub language: subtags::Language,
    /// Script subtag of the language identifier.
    pub script: Option<subtags::Script>,
    /// Region subtag of the language identifier.
    pub region: Option<subtags::Region>,
    /// Variant subtags of the language identifier.
    pub variants: subtags::Variants,
}

impl LanguageIdentifier {
    /// The unknown language identifier "und".
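    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locale::{langid, LanguageIdentifier};
    ///
    /// // `UNKNOWN` is the same value produced by `langid!("und")`.
    /// assert_eq!(LanguageIdentifier::UNKNOWN, langid!("und"));
    /// ```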
    pub const UNKNOWN: Self = crate::langid!("und");

    /// A constructor which takes a utf8 slice, parses it and
    /// produces a well-formed [`LanguageIdentifier`].
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locale::LanguageIdentifier;
    ///
    /// LanguageIdentifier::try_from_str("en-US").expect("Parsing failed");
    /// ```
    #[inline]
    #[cfg(feature = "alloc")]
    pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
        Self::try_from_utf8(s.as_bytes())
    }

    /// See [`Self::try_from_str`]
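    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locale::LanguageIdentifier;
    ///
    /// // Same parse as `try_from_str` above, but starting from a byte slice.
    /// LanguageIdentifier::try_from_utf8(b"en-US").expect("Parsing failed");
    /// ```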
    #[cfg(feature = "alloc")]
    pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
        crate::parser::parse_language_identifier(code_units, parser::ParserMode::LanguageIdentifier)
    }

    #[doc(hidden)] // macro use
    #[allow(clippy::type_complexity)]
    // The return type should be `Result<Self, ParseError>` once `const_precise_live_drops`
    // is stabilized ([rust-lang#73255](https://github.com/rust-lang/rust/issues/73255)).
    pub const fn try_from_utf8_with_single_variant(
        code_units: &[u8],
    ) -> Result<
        (
            subtags::Language,
            Option<subtags::Script>,
            Option<subtags::Region>,
            Option<subtags::Variant>,
        ),
        ParseError,
    > {
        crate::parser::parse_language_identifier_with_single_variant(
            code_units,
            parser::ParserMode::LanguageIdentifier,
        )
    }

    /// A constructor which takes a utf8 slice that may contain extension keys,
    /// parses it, and produces a well-formed [`LanguageIdentifier`].
    ///
    /// This method should be used for input that may be a locale identifier;
    /// all extensions will be lost.
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locale::{langid, LanguageIdentifier};
    ///
    /// let li = LanguageIdentifier::try_from_locale_bytes(b"en-US-x-posix")
    ///     .expect("Parsing failed.");
    ///
    /// assert_eq!(li, langid!("en-US"));
    /// ```
    #[cfg(feature = "alloc")]
    pub fn try_from_locale_bytes(v: &[u8]) -> Result<Self, ParseError> {
        parser::parse_language_identifier(v, parser::ParserMode::Locale)
    }

    /// Whether this [`LanguageIdentifier`] equals [`LanguageIdentifier::UNKNOWN`].
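    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locale::LanguageIdentifier;
    ///
    /// // "und" with no other subtags is the unknown language identifier.
    /// assert!("und".parse::<LanguageIdentifier>().unwrap().is_unknown());
    /// assert!(!"en-US".parse::<LanguageIdentifier>().unwrap().is_unknown());
    /// ```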
    pub const fn is_unknown(&self) -> bool {
        self.language.is_unknown()
            && self.script.is_none()
            && self.region.is_none()
            && self.variants.is_empty()
    }

    /// Normalize the language identifier (operating on UTF-8 formatted byte slices).
    ///
    /// This operation will normalize casing and the separator.
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locale::LanguageIdentifier;
    ///
    /// assert_eq!(
    ///     LanguageIdentifier::normalize_utf8(b"pL-latn-pl").as_deref(),
    ///     Ok("pl-Latn-PL")
    /// );
    /// ```
    #[cfg(feature = "alloc")]
    pub fn normalize_utf8(input: &[u8]) -> Result<Cow<str>, ParseError> {
        let lang_id = Self::try_from_utf8(input)?;
        Ok(writeable::to_string_or_borrow(&lang_id, input))
    }

    /// Normalize the language identifier (operating on strings).
    ///
    /// This operation will normalize casing and the separator.
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locale::LanguageIdentifier;
    ///
    /// assert_eq!(
    ///     LanguageIdentifier::normalize("pL-latn-pl").as_deref(),
    ///     Ok("pl-Latn-PL")
    /// );
    /// ```
    #[cfg(feature = "alloc")]
    pub fn normalize(input: &str) -> Result<Cow<str>, ParseError> {
        Self::normalize_utf8(input.as_bytes())
    }

    /// Compare this [`LanguageIdentifier`] with BCP-47 bytes.
    ///
    /// The return value is equivalent to what would happen if you first converted this
    /// [`LanguageIdentifier`] to a BCP-47 string and then performed a byte comparison.
    ///
    /// This function is case-sensitive and results in a *total order*, so it is appropriate for
    /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
    ///
    /// # Examples
    ///
    /// Sorting a list of langids with this method requires converting one of them to a string:
    ///
    /// ```
    /// use icu::locale::LanguageIdentifier;
    /// use std::cmp::Ordering;
    /// use writeable::Writeable;
    ///
    /// // Random input order:
    /// let bcp47_strings: &[&str] = &[
    ///     "ar-Latn",
    ///     "zh-Hant-TW",
    ///     "zh-TW",
    ///     "und-fonipa",
    ///     "zh-Hant",
    ///     "ar-SA",
    /// ];
    ///
    /// let mut langids = bcp47_strings
    ///     .iter()
    ///     .map(|s| s.parse().unwrap())
    ///     .collect::<Vec<LanguageIdentifier>>();
    /// langids.sort_by(|a, b| {
    ///     let b = b.write_to_string();
    ///     a.strict_cmp(b.as_bytes())
    /// });
    /// let strict_cmp_strings = langids
    ///     .iter()
    ///     .map(|l| l.to_string())
    ///     .collect::<Vec<String>>();
    ///
    /// // Output ordering, sorted alphabetically
    /// let expected_ordering: &[&str] = &[
    ///     "ar-Latn",
    ///     "ar-SA",
    ///     "und-fonipa",
    ///     "zh-Hant",
    ///     "zh-Hant-TW",
    ///     "zh-TW",
    /// ];
    ///
    /// assert_eq!(expected_ordering, strict_cmp_strings);
    /// ```
    pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
        writeable::cmp_utf8(self, other)
    }

    pub(crate) fn as_tuple(
        &self,
    ) -> (
        subtags::Language,
        Option<subtags::Script>,
        Option<subtags::Region>,
        &subtags::Variants,
    ) {
        (self.language, self.script, self.region, &self.variants)
    }

    /// Compare this [`LanguageIdentifier`] with another [`LanguageIdentifier`] field-by-field.
    /// The result is a total ordering sufficient for use in a [`BTreeSet`].
    ///
    /// Unlike [`LanguageIdentifier::strict_cmp`], the ordering may or may not be equivalent
    /// to string ordering, and it may or may not be stable across ICU4X releases.
    ///
    /// # Examples
    ///
    /// This method returns an ordering derived from the fields of the struct, which need not
    /// match the string ordering:
    ///
    /// ```
    /// use icu::locale::LanguageIdentifier;
    /// use std::cmp::Ordering;
    ///
    /// // Input strings, sorted alphabetically
    /// let bcp47_strings: &[&str] = &[
    ///     "ar-Latn",
    ///     "ar-SA",
    ///     "und-fonipa",
    ///     "zh-Hant",
    ///     "zh-Hant-TW",
    ///     "zh-TW",
    /// ];
    /// assert!(bcp47_strings.windows(2).all(|w| w[0] < w[1]));
    ///
    /// let mut langids = bcp47_strings
    ///     .iter()
    ///     .map(|s| s.parse().unwrap())
    ///     .collect::<Vec<LanguageIdentifier>>();
    /// langids.sort_by(LanguageIdentifier::total_cmp);
    /// let total_cmp_strings = langids
    ///     .iter()
    ///     .map(|l| l.to_string())
    ///     .collect::<Vec<String>>();
    ///
    /// // Output ordering, sorted arbitrarily
    /// let expected_ordering: &[&str] = &[
    ///     "ar-SA",
    ///     "ar-Latn",
    ///     "und-fonipa",
    ///     "zh-TW",
    ///     "zh-Hant",
    ///     "zh-Hant-TW",
    /// ];
    ///
    /// assert_eq!(expected_ordering, total_cmp_strings);
    /// ```
    ///
    /// Use a wrapper to add a [`LanguageIdentifier`] to a [`BTreeSet`]:
    ///
    /// ```no_run
    /// use icu::locale::LanguageIdentifier;
    /// use std::cmp::Ordering;
    /// use std::collections::BTreeSet;
    ///
    /// #[derive(PartialEq, Eq)]
    /// struct LanguageIdentifierTotalOrd(LanguageIdentifier);
    ///
    /// impl Ord for LanguageIdentifierTotalOrd {
    ///     fn cmp(&self, other: &Self) -> Ordering {
    ///         self.0.total_cmp(&other.0)
    ///     }
    /// }
    ///
    /// impl PartialOrd for LanguageIdentifierTotalOrd {
    ///     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
    ///         Some(self.cmp(other))
    ///     }
    /// }
    ///
    /// let _: BTreeSet<LanguageIdentifierTotalOrd> = unimplemented!();
    /// ```
    ///
    /// [`BTreeSet`]: alloc::collections::BTreeSet
    pub fn total_cmp(&self, other: &Self) -> Ordering {
        self.as_tuple().cmp(&other.as_tuple())
    }

    /// Compare this `LanguageIdentifier` with a potentially unnormalized BCP-47 string.
    ///
    /// The return value is equivalent to what would happen if you first parsed the
    /// BCP-47 string to a `LanguageIdentifier` and then performed a structural comparison.
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::locale::LanguageIdentifier;
    ///
    /// let bcp47_strings: &[&str] = &[
    ///     "pl-LaTn-pL",
    ///     "uNd",
    ///     "UnD-adlm",
    ///     "uNd-GB",
    ///     "UND-FONIPA",
    ///     "ZH",
    /// ];
    ///
    /// for a in bcp47_strings {
    ///     assert!(a.parse::<LanguageIdentifier>().unwrap().normalizing_eq(a));
    /// }
    /// ```
    pub fn normalizing_eq(&self, other: &str) -> bool {
        macro_rules! subtag_matches {
            ($T:ty, $iter:ident, $expected:expr) => {
                $iter
                    .next()
                    .map(|b| <$T>::try_from_utf8(b) == Ok($expected))
                    .unwrap_or(false)
            };
        }

        let mut iter = parser::SubtagIterator::new(other.as_bytes());
        if !subtag_matches!(subtags::Language, iter, self.language) {
            return false;
        }
        if let Some(ref script) = self.script {
            if !subtag_matches!(subtags::Script, iter, *script) {
                return false;
            }
        }
        if let Some(ref region) = self.region {
            if !subtag_matches!(subtags::Region, iter, *region) {
                return false;
            }
        }
        for variant in self.variants.iter() {
            if !subtag_matches!(subtags::Variant, iter, *variant) {
                return false;
            }
        }
        iter.next().is_none()
    }

    /// Executes `f` on each subtag string of this `LanguageIdentifier`, in canonical
    /// (language, script, region, variants) order.
    pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
    where
        F: FnMut(&str) -> Result<(), E>,
    {
        f(self.language.as_str())?;
        if let Some(ref script) = self.script {
            f(script.as_str())?;
        }
        if let Some(ref region) = self.region {
            f(region.as_str())?;
        }
        for variant in self.variants.iter() {
            f(variant.as_str())?;
        }
        Ok(())
    }

    /// Executes `f` on each subtag string of this `LanguageIdentifier`, with every string in
    /// lowercase ASCII form.
    ///
    /// The default normalization of language identifiers uses titlecase scripts and uppercase
    /// regions. However, this differs from [RFC6497 (BCP 47 Extension T)], which specifies:
    ///
    /// > _The canonical form for all subtags in the extension is lowercase, with the fields
    /// > ordered by the separators, alphabetically._
    ///
    /// Hence, this method is used inside [`Transform extensions`] to be able to get the correct
    /// normalization of the language identifier.
    ///
    /// As an example, the canonical form of locale **EN-LATN-CA-T-EN-LATN-CA** is
    /// **en-Latn-CA-t-en-latn-ca**, with the script and region parts lowercased inside T
    /// extensions, but titlecased and uppercased, respectively, outside T extensions.
    ///
    /// [RFC6497 (BCP 47 Extension T)]: https://www.ietf.org/rfc/rfc6497.txt
    /// [`Transform extensions`]: crate::extensions::transform
    pub(crate) fn for_each_subtag_str_lowercased<E, F>(&self, f: &mut F) -> Result<(), E>
    where
        F: FnMut(&str) -> Result<(), E>,
    {
        f(self.language.as_str())?;
        if let Some(ref script) = self.script {
            f(script.to_tinystr().to_ascii_lowercase().as_str())?;
        }
        if let Some(ref region) = self.region {
            f(region.to_tinystr().to_ascii_lowercase().as_str())?;
        }
        for variant in self.variants.iter() {
            f(variant.as_str())?;
        }
        Ok(())
    }

    /// Writes this `LanguageIdentifier` to a sink, replacing uppercase ASCII chars with
    /// lowercase ASCII chars.
    ///
    /// The default normalization of language identifiers uses titlecase scripts and uppercase
    /// regions. However, this differs from [RFC6497 (BCP 47 Extension T)], which specifies:
    ///
    /// > _The canonical form for all subtags in the extension is lowercase, with the fields
    /// > ordered by the separators, alphabetically._
    ///
    /// Hence, this method is used inside [`Transform extensions`] to be able to get the correct
    /// normalization of the language identifier.
    ///
    /// As an example, the canonical form of locale **EN-LATN-CA-T-EN-LATN-CA** is
    /// **en-Latn-CA-t-en-latn-ca**, with the script and region parts lowercased inside T
    /// extensions, but titlecased and uppercased, respectively, outside T extensions.
    ///
    /// [RFC6497 (BCP 47 Extension T)]: https://www.ietf.org/rfc/rfc6497.txt
    /// [`Transform extensions`]: crate::extensions::transform
    pub(crate) fn write_lowercased_to<W: core::fmt::Write + ?Sized>(
        &self,
        sink: &mut W,
    ) -> core::fmt::Result {
        let mut initial = true;
        self.for_each_subtag_str_lowercased(&mut |subtag| {
            if initial {
                initial = false;
            } else {
                sink.write_char('-')?;
            }
            sink.write_str(subtag)
        })
    }
}

impl core::fmt::Debug for LanguageIdentifier {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        core::fmt::Display::fmt(&self, f)
    }
}

#[cfg(feature = "alloc")]
impl FromStr for LanguageIdentifier {
    type Err = ParseError;

    #[inline]
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Self::try_from_str(s)
    }
}

impl_writeable_for_each_subtag_str_no_test!(LanguageIdentifier, selff, selff.script.is_none() && selff.region.is_none() && selff.variants.is_empty() => selff.language.write_to_string());

#[test]
fn test_writeable() {
    use writeable::assert_writeable_eq;
    assert_writeable_eq!(LanguageIdentifier::UNKNOWN, "und");
    assert_writeable_eq!("und-001".parse::<LanguageIdentifier>().unwrap(), "und-001");
    assert_writeable_eq!(
        "und-Mymr".parse::<LanguageIdentifier>().unwrap(),
        "und-Mymr",
    );
    assert_writeable_eq!(
        "my-Mymr-MM".parse::<LanguageIdentifier>().unwrap(),
        "my-Mymr-MM",
    );
    assert_writeable_eq!(
        "my-Mymr-MM-posix".parse::<LanguageIdentifier>().unwrap(),
        "my-Mymr-MM-posix",
    );
    assert_writeable_eq!(
        "zh-macos-posix".parse::<LanguageIdentifier>().unwrap(),
        "zh-macos-posix",
    );
}
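
// A minimal sketch exercising `write_lowercased_to`, assuming tests are built with
// the `alloc` feature enabled (as `test_writeable` above already assumes): per the
// doc comment on that method, script and region subtags come out lowercased, the
// form used inside T (transform) extensions.
#[test]
fn test_write_lowercased_to_sketch() {
    let li: LanguageIdentifier = "en-Latn-CA".parse().unwrap();
    let mut sink = alloc::string::String::new();
    li.write_lowercased_to(&mut sink).unwrap();
    assert_eq!(sink, "en-latn-ca");
}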

/// # Examples
///
/// ```
/// use icu::locale::{langid, subtags::language, LanguageIdentifier};
///
/// assert_eq!(LanguageIdentifier::from(language!("en")), langid!("en"));
/// ```
impl From<subtags::Language> for LanguageIdentifier {
    fn from(language: subtags::Language) -> Self {
        Self {
            language,
            script: None,
            region: None,
            variants: subtags::Variants::new(),
        }
    }
}

/// # Examples
///
/// ```
/// use icu::locale::{langid, subtags::script, LanguageIdentifier};
///
/// assert_eq!(
///     LanguageIdentifier::from(Some(script!("latn"))),
///     langid!("und-Latn")
/// );
/// ```
impl From<Option<subtags::Script>> for LanguageIdentifier {
    fn from(script: Option<subtags::Script>) -> Self {
        Self {
            language: subtags::Language::UNKNOWN,
            script,
            region: None,
            variants: subtags::Variants::new(),
        }
    }
}

/// # Examples
///
/// ```
/// use icu::locale::{langid, subtags::region, LanguageIdentifier};
///
/// assert_eq!(
///     LanguageIdentifier::from(Some(region!("US"))),
///     langid!("und-US")
/// );
/// ```
impl From<Option<subtags::Region>> for LanguageIdentifier {
    fn from(region: Option<subtags::Region>) -> Self {
        Self {
            language: subtags::Language::UNKNOWN,
            script: None,
            region,
            variants: subtags::Variants::new(),
        }
    }
}

/// Convert from an LSR tuple to a [`LanguageIdentifier`].
///
/// # Examples
///
/// ```
/// use icu::locale::{
///     langid,
///     subtags::{language, region, script},
///     LanguageIdentifier,
/// };
///
/// let lang = language!("en");
/// let script = script!("Latn");
/// let region = region!("US");
/// assert_eq!(
///     LanguageIdentifier::from((lang, Some(script), Some(region))),
///     langid!("en-Latn-US")
/// );
/// ```
impl
    From<(
        subtags::Language,
        Option<subtags::Script>,
        Option<subtags::Region>,
    )> for LanguageIdentifier
{
    fn from(
        lsr: (
            subtags::Language,
            Option<subtags::Script>,
            Option<subtags::Region>,
        ),
    ) -> Self {
        Self {
            language: lsr.0,
            script: lsr.1,
            region: lsr.2,
            variants: subtags::Variants::new(),
        }
    }
}

/// Convert from a [`LanguageIdentifier`] to an LSR tuple.
///
/// # Examples
///
/// ```
/// use icu::locale::{
///     langid,
///     subtags::{language, region, script},
/// };
///
/// let lid = langid!("en-Latn-US");
/// let (lang, script, region) = (&lid).into();
///
/// assert_eq!(lang, language!("en"));
/// assert_eq!(script, Some(script!("Latn")));
/// assert_eq!(region, Some(region!("US")));
/// ```
impl From<&LanguageIdentifier>
    for (
        subtags::Language,
        Option<subtags::Script>,
        Option<subtags::Region>,
    )
{
    fn from(langid: &LanguageIdentifier) -> Self {
        (langid.language, langid.script, langid.region)
    }
}