icu_locale_core/locale.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::parser::*;
6use crate::subtags::Subtag;
7use crate::{extensions, subtags, LanguageIdentifier};
8#[cfg(feature = "alloc")]
9use alloc::borrow::Cow;
10use core::cmp::Ordering;
11#[cfg(feature = "alloc")]
12use core::str::FromStr;
13
14/// A core struct representing a [`Unicode Locale Identifier`].
15///
16/// A locale is made of two parts:
17/// * Unicode Language Identifier
18/// * A set of Unicode Extensions
19///
20/// [`Locale`] exposes all of the same fields and methods as [`LanguageIdentifier`], and
21/// on top of that is able to parse, manipulate and serialize unicode extension fields.
22///
23/// # Ordering
24///
25/// This type deliberately does not implement `Ord` or `PartialOrd` because there are
26/// multiple possible orderings. Depending on your use case, two orderings are available:
27///
28/// 1. A string ordering, suitable for stable serialization: [`Locale::strict_cmp`]
29/// 2. A struct ordering, suitable for use with a BTreeSet: [`Locale::total_cmp`]
30///
31/// See issue: <https://github.com/unicode-org/icu4x/issues/1215>
32///
33/// # Parsing
34///
35/// Unicode recognizes three levels of standard conformance for a locale:
36///
37/// * *well-formed* - syntactically correct
38/// * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types...
39/// * *canonical* - valid and no deprecated codes or structure.
40///
41/// Any syntactically invalid subtags will cause the parsing to fail with an error.
42///
43/// This operation normalizes syntax to be well-formed. No legacy subtag replacements is performed.
44/// For validation and canonicalization, see `LocaleCanonicalizer`.
45///
46/// ICU4X's Locale parsing does not allow for non-BCP-47-compatible locales [allowed by UTS 35 for backwards compatability][tr35-bcp].
47/// Furthermore, it currently does not allow for language tags to have more than three characters.
48///
49/// # Examples
50///
51/// Simple example:
52///
53/// ```
54/// use icu::locale::{
55/// extensions::unicode::{key, value},
56/// locale,
57/// subtags::{language, region},
58/// };
59///
60/// let loc = locale!("en-US-u-ca-buddhist");
61///
62/// assert_eq!(loc.id.language, language!("en"));
63/// assert_eq!(loc.id.script, None);
64/// assert_eq!(loc.id.region, Some(region!("US")));
65/// assert_eq!(loc.id.variants.len(), 0);
66/// assert_eq!(
67/// loc.extensions.unicode.keywords.get(&key!("ca")),
68/// Some(&value!("buddhist"))
69/// );
70/// ```
71///
72/// More complex example:
73///
74/// ```
75/// use icu::locale::{subtags::*, Locale};
76///
77/// let loc: Locale = "eN-latn-Us-Valencia-u-hC-H12"
78/// .parse()
79/// .expect("Failed to parse.");
80///
81/// assert_eq!(loc.id.language, "en".parse::<Language>().unwrap());
82/// assert_eq!(loc.id.script, "Latn".parse::<Script>().ok());
83/// assert_eq!(loc.id.region, "US".parse::<Region>().ok());
84/// assert_eq!(
85/// loc.id.variants.get(0),
86/// "valencia".parse::<Variant>().ok().as_ref()
87/// );
88/// ```
89///
90/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_locale_identifier
91/// [tr35-bcp]: https://unicode.org/reports/tr35/#BCP_47_Conformance
92#[derive(PartialEq, Eq, Clone, Hash)] // no Ord or PartialOrd: see docs
93#[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro)
94pub struct Locale {
95 /// The basic language/script/region components in the locale identifier along with any variants.
96 pub id: LanguageIdentifier,
97 /// Any extensions present in the locale identifier.
98 pub extensions: extensions::Extensions,
99}
100
/// Pins the in-memory size of the locale building blocks so that accidental
/// layout growth is caught by the test suite.
#[test]
fn test_sizes() {
    use core::mem::size_of;

    // Language identifier and its subtags.
    assert_eq!(size_of::<subtags::Language>(), 3);
    assert_eq!(size_of::<subtags::Script>(), 4);
    assert_eq!(size_of::<subtags::Region>(), 3);
    assert_eq!(size_of::<subtags::Variant>(), 8);
    assert_eq!(size_of::<subtags::Variants>(), 16);
    assert_eq!(size_of::<LanguageIdentifier>(), 32);

    // Transform extension.
    assert_eq!(size_of::<extensions::transform::Transform>(), 56);
    assert_eq!(size_of::<Option<LanguageIdentifier>>(), 32);
    assert_eq!(size_of::<extensions::transform::Fields>(), 24);

    // Unicode, other and private-use extensions, and their container.
    assert_eq!(size_of::<extensions::unicode::Attributes>(), 16);
    assert_eq!(size_of::<extensions::unicode::Keywords>(), 24);
    assert_eq!(size_of::<Vec<extensions::other::Other>>(), 24);
    assert_eq!(size_of::<extensions::private::Private>(), 16);
    assert_eq!(size_of::<extensions::Extensions>(), 136);

    // The full locale.
    assert_eq!(size_of::<Locale>(), 168);
}
122
123impl Locale {
124 /// The unknown locale "und".
125 pub const UNKNOWN: Self = crate::locale!("und");
126
127 /// A constructor which takes a utf8 slice, parses it and
128 /// produces a well-formed [`Locale`].
129 ///
130 /// # Examples
131 ///
132 /// ```
133 /// use icu::locale::Locale;
134 ///
135 /// Locale::try_from_str("en-US-u-hc-h12").unwrap();
136 /// ```
137 #[inline]
138 #[cfg(feature = "alloc")]
139 pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
140 Self::try_from_utf8(s.as_bytes())
141 }
142
143 /// See [`Self::try_from_str`]
144 #[cfg(feature = "alloc")]
145 pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
146 parse_locale(code_units)
147 }
148
149 /// Normalize the locale (operating on UTF-8 formatted byte slices)
150 ///
151 /// This operation will normalize casing and the separator.
152 ///
153 /// # Examples
154 ///
155 /// ```
156 /// use icu::locale::Locale;
157 ///
158 /// assert_eq!(
159 /// Locale::normalize_utf8(b"pL-latn-pl-U-HC-H12").as_deref(),
160 /// Ok("pl-Latn-PL-u-hc-h12")
161 /// );
162 /// ```
163 #[cfg(feature = "alloc")]
164 pub fn normalize_utf8(input: &[u8]) -> Result<Cow<str>, ParseError> {
165 let locale = Self::try_from_utf8(input)?;
166 Ok(writeable::to_string_or_borrow(&locale, input))
167 }
168
169 /// Normalize the locale (operating on strings)
170 ///
171 /// This operation will normalize casing and the separator.
172 ///
173 /// # Examples
174 ///
175 /// ```
176 /// use icu::locale::Locale;
177 ///
178 /// assert_eq!(
179 /// Locale::normalize("pL-latn-pl-U-HC-H12").as_deref(),
180 /// Ok("pl-Latn-PL-u-hc-h12")
181 /// );
182 /// ```
183 #[cfg(feature = "alloc")]
184 pub fn normalize(input: &str) -> Result<Cow<str>, ParseError> {
185 Self::normalize_utf8(input.as_bytes())
186 }
187
188 /// Compare this [`Locale`] with BCP-47 bytes.
189 ///
190 /// The return value is equivalent to what would happen if you first converted this
191 /// [`Locale`] to a BCP-47 string and then performed a byte comparison.
192 ///
193 /// This function is case-sensitive and results in a *total order*, so it is appropriate for
194 /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
195 ///
196 /// # Examples
197 ///
198 /// Sorting a list of locales with this method requires converting one of them to a string:
199 ///
200 /// ```
201 /// use icu::locale::Locale;
202 /// use std::cmp::Ordering;
203 /// use writeable::Writeable;
204 ///
205 /// // Random input order:
206 /// let bcp47_strings: &[&str] = &[
207 /// "und-u-ca-hebrew",
208 /// "ar-Latn",
209 /// "zh-Hant-TW",
210 /// "zh-TW",
211 /// "und-fonipa",
212 /// "zh-Hant",
213 /// "ar-SA",
214 /// ];
215 ///
216 /// let mut locales = bcp47_strings
217 /// .iter()
218 /// .map(|s| s.parse().unwrap())
219 /// .collect::<Vec<Locale>>();
220 /// locales.sort_by(|a, b| {
221 /// let b = b.write_to_string();
222 /// a.strict_cmp(b.as_bytes())
223 /// });
224 /// let strict_cmp_strings = locales
225 /// .iter()
226 /// .map(|l| l.to_string())
227 /// .collect::<Vec<String>>();
228 ///
229 /// // Output ordering, sorted alphabetically
230 /// let expected_ordering: &[&str] = &[
231 /// "ar-Latn",
232 /// "ar-SA",
233 /// "und-fonipa",
234 /// "und-u-ca-hebrew",
235 /// "zh-Hant",
236 /// "zh-Hant-TW",
237 /// "zh-TW",
238 /// ];
239 ///
240 /// assert_eq!(expected_ordering, strict_cmp_strings);
241 /// ```
242 pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
243 writeable::cmp_utf8(self, other)
244 }
245
246 #[allow(clippy::type_complexity)]
247 pub(crate) fn as_tuple(
248 &self,
249 ) -> (
250 (
251 subtags::Language,
252 Option<subtags::Script>,
253 Option<subtags::Region>,
254 &subtags::Variants,
255 ),
256 (
257 (
258 &extensions::unicode::Attributes,
259 &extensions::unicode::Keywords,
260 ),
261 (
262 Option<(
263 subtags::Language,
264 Option<subtags::Script>,
265 Option<subtags::Region>,
266 &subtags::Variants,
267 )>,
268 &extensions::transform::Fields,
269 ),
270 &extensions::private::Private,
271 &[extensions::other::Other],
272 ),
273 ) {
274 (self.id.as_tuple(), self.extensions.as_tuple())
275 }
276
277 /// Returns an ordering suitable for use in [`BTreeSet`].
278 ///
279 /// Unlike [`Locale::strict_cmp`], the ordering may or may not be equivalent
280 /// to string ordering, and it may or may not be stable across ICU4X releases.
281 ///
282 /// # Examples
283 ///
284 /// This method returns a nonsensical ordering derived from the fields of the struct:
285 ///
286 /// ```
287 /// use icu::locale::Locale;
288 /// use std::cmp::Ordering;
289 ///
290 /// // Input strings, sorted alphabetically
291 /// let bcp47_strings: &[&str] = &[
292 /// "ar-Latn",
293 /// "ar-SA",
294 /// "und-fonipa",
295 /// "und-u-ca-hebrew",
296 /// "zh-Hant",
297 /// "zh-Hant-TW",
298 /// "zh-TW",
299 /// ];
300 /// assert!(bcp47_strings.windows(2).all(|w| w[0] < w[1]));
301 ///
302 /// let mut locales = bcp47_strings
303 /// .iter()
304 /// .map(|s| s.parse().unwrap())
305 /// .collect::<Vec<Locale>>();
306 /// locales.sort_by(Locale::total_cmp);
307 /// let total_cmp_strings = locales
308 /// .iter()
309 /// .map(|l| l.to_string())
310 /// .collect::<Vec<String>>();
311 ///
312 /// // Output ordering, sorted arbitrarily
313 /// let expected_ordering: &[&str] = &[
314 /// "ar-SA",
315 /// "ar-Latn",
316 /// "und-u-ca-hebrew",
317 /// "und-fonipa",
318 /// "zh-TW",
319 /// "zh-Hant",
320 /// "zh-Hant-TW",
321 /// ];
322 ///
323 /// assert_eq!(expected_ordering, total_cmp_strings);
324 /// ```
325 ///
326 /// Use a wrapper to add a [`Locale`] to a [`BTreeSet`]:
327 ///
328 /// ```no_run
329 /// use icu::locale::Locale;
330 /// use std::cmp::Ordering;
331 /// use std::collections::BTreeSet;
332 ///
333 /// #[derive(PartialEq, Eq)]
334 /// struct LocaleTotalOrd(Locale);
335 ///
336 /// impl Ord for LocaleTotalOrd {
337 /// fn cmp(&self, other: &Self) -> Ordering {
338 /// self.0.total_cmp(&other.0)
339 /// }
340 /// }
341 ///
342 /// impl PartialOrd for LocaleTotalOrd {
343 /// fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
344 /// Some(self.cmp(other))
345 /// }
346 /// }
347 ///
348 /// let _: BTreeSet<LocaleTotalOrd> = unimplemented!();
349 /// ```
350 ///
351 /// [`BTreeSet`]: alloc::collections::BTreeSet
352 pub fn total_cmp(&self, other: &Self) -> Ordering {
353 self.as_tuple().cmp(&other.as_tuple())
354 }
355
356 /// Compare this `Locale` with a potentially unnormalized BCP-47 string.
357 ///
358 /// The return value is equivalent to what would happen if you first parsed the
359 /// BCP-47 string to a `Locale` and then performed a structural comparison.
360 ///
361 /// # Examples
362 ///
363 /// ```
364 /// use icu::locale::Locale;
365 ///
366 /// let bcp47_strings: &[&str] = &[
367 /// "pl-LaTn-pL",
368 /// "uNd",
369 /// "UND-FONIPA",
370 /// "UnD-t-m0-TrUe",
371 /// "uNd-u-CA-Japanese",
372 /// "ZH",
373 /// ];
374 ///
375 /// for a in bcp47_strings {
376 /// assert!(a.parse::<Locale>().unwrap().normalizing_eq(a));
377 /// }
378 /// ```
379 #[cfg(feature = "alloc")]
380 pub fn normalizing_eq(&self, other: &str) -> bool {
381 macro_rules! subtag_matches {
382 ($T:ty, $iter:ident, $expected:expr) => {
383 $iter
384 .next()
385 .map(|b| <$T>::try_from_utf8(b) == Ok($expected))
386 .unwrap_or(false)
387 };
388 }
389
390 let mut iter = SubtagIterator::new(other.as_bytes());
391 if !subtag_matches!(subtags::Language, iter, self.id.language) {
392 return false;
393 }
394 if let Some(ref script) = self.id.script {
395 if !subtag_matches!(subtags::Script, iter, *script) {
396 return false;
397 }
398 }
399 if let Some(ref region) = self.id.region {
400 if !subtag_matches!(subtags::Region, iter, *region) {
401 return false;
402 }
403 }
404 for variant in self.id.variants.iter() {
405 if !subtag_matches!(subtags::Variant, iter, *variant) {
406 return false;
407 }
408 }
409 if !self.extensions.is_empty() {
410 match extensions::Extensions::try_from_iter(&mut iter) {
411 Ok(exts) => {
412 if self.extensions != exts {
413 return false;
414 }
415 }
416 Err(_) => {
417 return false;
418 }
419 }
420 }
421 iter.next().is_none()
422 }
423
424 #[doc(hidden)] // macro use
425 #[allow(clippy::type_complexity)]
426 pub const fn try_from_utf8_with_single_variant_single_keyword_unicode_extension(
427 code_units: &[u8],
428 ) -> Result<
429 (
430 subtags::Language,
431 Option<subtags::Script>,
432 Option<subtags::Region>,
433 Option<subtags::Variant>,
434 Option<(extensions::unicode::Key, Option<Subtag>)>,
435 ),
436 ParseError,
437 > {
438 parse_locale_with_single_variant_single_keyword_unicode_keyword_extension(
439 code_units,
440 ParserMode::Locale,
441 )
442 }
443
444 pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
445 where
446 F: FnMut(&str) -> Result<(), E>,
447 {
448 self.id.for_each_subtag_str(f)?;
449 self.extensions.for_each_subtag_str(f)?;
450 Ok(())
451 }
452}
453
/// Lets a [`Locale`] be produced with `str::parse`.
#[cfg(feature = "alloc")]
impl FromStr for Locale {
    type Err = ParseError;

    /// Parses the string's UTF-8 bytes as a locale.
    #[inline]
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let code_units = s.as_bytes();
        Self::try_from_utf8(code_units)
    }
}
463
464impl From<LanguageIdentifier> for Locale {
465 fn from(id: LanguageIdentifier) -> Self {
466 Self {
467 id,
468 extensions: extensions::Extensions::default(),
469 }
470 }
471}
472
473impl From<Locale> for LanguageIdentifier {
474 fn from(loc: Locale) -> Self {
475 loc.id
476 }
477}
478
479impl core::fmt::Debug for Locale {
480 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
481 writeable::Writeable::write_to(self, f)
482 }
483}
484
485impl_writeable_for_each_subtag_str_no_test!(Locale, selff, selff.extensions.is_empty() => selff.id.write_to_string());
486
/// Checks that parsed locales are written back out as the expected strings.
#[test]
fn test_writeable() {
    use writeable::assert_writeable_eq;

    // Every tag below is well-formed, so parsing is expected to succeed.
    let parse = |tag: &str| -> Locale { tag.parse().unwrap() };

    assert_writeable_eq!(Locale::UNKNOWN, "und");

    // Language identifier only.
    assert_writeable_eq!(parse("und-001"), "und-001");
    assert_writeable_eq!(parse("und-Mymr"), "und-Mymr");
    assert_writeable_eq!(parse("my-Mymr-MM"), "my-Mymr-MM");
    assert_writeable_eq!(parse("my-Mymr-MM-posix"), "my-Mymr-MM-posix");
    assert_writeable_eq!(parse("zh-macos-posix"), "zh-macos-posix");

    // With extensions.
    assert_writeable_eq!(parse("my-t-my-d0-zawgyi"), "my-t-my-d0-zawgyi");
    assert_writeable_eq!(
        parse("ar-SA-u-ca-islamic-civil"),
        "ar-SA-u-ca-islamic-civil",
    );
    assert_writeable_eq!(parse("en-001-x-foo-bar"), "en-001-x-foo-bar");
    assert_writeable_eq!(parse("und-t-m0-true"), "und-t-m0-true");
}
516
517/// # Examples
518///
519/// ```
520/// use icu::locale::Locale;
521/// use icu::locale::{locale, subtags::language};
522///
523/// assert_eq!(Locale::from(language!("en")), locale!("en"));
524/// ```
525impl From<subtags::Language> for Locale {
526 fn from(language: subtags::Language) -> Self {
527 Self {
528 id: language.into(),
529 extensions: extensions::Extensions::new(),
530 }
531 }
532}
533
534/// # Examples
535///
536/// ```
537/// use icu::locale::Locale;
538/// use icu::locale::{locale, subtags::script};
539///
540/// assert_eq!(Locale::from(Some(script!("latn"))), locale!("und-Latn"));
541/// ```
542impl From<Option<subtags::Script>> for Locale {
543 fn from(script: Option<subtags::Script>) -> Self {
544 Self {
545 id: script.into(),
546 extensions: extensions::Extensions::new(),
547 }
548 }
549}
550
551/// # Examples
552///
553/// ```
554/// use icu::locale::Locale;
555/// use icu::locale::{locale, subtags::region};
556///
557/// assert_eq!(Locale::from(Some(region!("US"))), locale!("und-US"));
558/// ```
559impl From<Option<subtags::Region>> for Locale {
560 fn from(region: Option<subtags::Region>) -> Self {
561 Self {
562 id: region.into(),
563 extensions: extensions::Extensions::new(),
564 }
565 }
566}
567
568/// # Examples
569///
570/// ```
571/// use icu::locale::Locale;
572/// use icu::locale::{
573/// locale,
574/// subtags::{language, region, script},
575/// };
576///
577/// assert_eq!(
578/// Locale::from((
579/// language!("en"),
580/// Some(script!("Latn")),
581/// Some(region!("US"))
582/// )),
583/// locale!("en-Latn-US")
584/// );
585/// ```
586impl
587 From<(
588 subtags::Language,
589 Option<subtags::Script>,
590 Option<subtags::Region>,
591 )> for Locale
592{
593 fn from(
594 lsr: (
595 subtags::Language,
596 Option<subtags::Script>,
597 Option<subtags::Region>,
598 ),
599 ) -> Self {
600 Self {
601 id: lsr.into(),
602 extensions: extensions::Extensions::new(),
603 }
604 }
605}