icu_locale/
canonicalizer.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! The collection of code for locale canonicalization.
6
7use crate::provider::*;
8use alloc::vec::Vec;
9use core::cmp::Ordering;
10
11use crate::LocaleExpander;
12use crate::TransformResult;
13use icu_locale_core::extensions::Extensions;
14use icu_locale_core::subtags::{Language, Region, Script};
15use icu_locale_core::{
16    extensions::unicode::key,
17    subtags::{language, Variant, Variants},
18    LanguageIdentifier, Locale,
19};
20use icu_provider::prelude::*;
21use tinystr::TinyAsciiStr;
22
23/// Implements the algorithm defined in *[UTS #35: Annex C, LocaleId Canonicalization]*.
24///
25/// # Examples
26///
27/// ```
28/// use icu::locale::Locale;
29/// use icu::locale::{LocaleCanonicalizer, TransformResult};
30///
31/// let lc = LocaleCanonicalizer::new_extended();
32///
33/// let mut locale: Locale = "ja-Latn-fonipa-hepburn-heploc".parse().unwrap();
34/// assert_eq!(lc.canonicalize(&mut locale), TransformResult::Modified);
35/// assert_eq!(locale, "ja-Latn-alalc97-fonipa".parse().unwrap());
36/// ```
37///
38/// [UTS #35: Annex C, LocaleId Canonicalization]: http://unicode.org/reports/tr35/#LocaleId_Canonicalization
39#[derive(Debug)]
40pub struct LocaleCanonicalizer<Expander = LocaleExpander> {
41    /// Data to support canonicalization.
42    aliases: DataPayload<LocaleAliasesV1>,
43    /// Likely subtags implementation for delegation.
44    expander: Expander,
45}
46
47fn uts35_rule_matches<'a, I>(
48    source: &LanguageIdentifier,
49    language: Language,
50    script: Option<Script>,
51    region: Option<Region>,
52    raw_variants: I,
53) -> bool
54where
55    I: Iterator<Item = &'a str>,
56{
57    (language.is_unknown() || language == source.language)
58        && (script.is_none() || script == source.script)
59        && (region.is_none() || region == source.region)
60        && {
61            // Checks if variants are a subset of source variants.
62            // As both iterators are sorted, this can be done linearly.
63            let mut source_variants = source.variants.iter();
64            'outer: for raw_variant in raw_variants {
65                for source_variant in source_variants.by_ref() {
66                    match source_variant.as_str().cmp(raw_variant) {
67                        Ordering::Equal => {
68                            // The source_variant is equal, move to next raw_variant
69                            continue 'outer;
70                        }
71                        Ordering::Less => {
72                            // The source_variant is smaller, take the next source_variant
73                        }
74                        Ordering::Greater => {
75                            // The source_variant is greater,
76                            // raw_variants is not a subset of source_variants
77                            return false;
78                        }
79                    }
80                }
81                // There are raw_variants left after we exhausted source_variants
82                return false;
83            }
84            true
85        }
86}
87
88fn uts35_replacement<'a, I>(
89    source: &mut LanguageIdentifier,
90    ruletype_has_language: bool,
91    ruletype_has_script: bool,
92    ruletype_has_region: bool,
93    ruletype_variants: Option<I>,
94    replacement: &LanguageIdentifier,
95) where
96    I: Iterator<Item = &'a str>,
97{
98    if ruletype_has_language || (source.language.is_unknown() && !replacement.language.is_unknown())
99    {
100        source.language = replacement.language;
101    }
102    if ruletype_has_script || (source.script.is_none() && replacement.script.is_some()) {
103        source.script = replacement.script;
104    }
105    if ruletype_has_region || (source.region.is_none() && replacement.region.is_some()) {
106        source.region = replacement.region;
107    }
108    if let Some(skips) = ruletype_variants {
109        // The rule matches if the ruletype variants are a subset of the source variants.
110        // This means ja-Latn-fonipa-hepburn-heploc matches against the rule for
111        // hepburn-heploc and is canonicalized to ja-Latn-alalc97-fonipa
112
113        // We're merging three sorted deduped iterators into a new sequence:
114        // sources - skips + replacements
115
116        let mut sources = source.variants.iter().peekable();
117        let mut replacements = replacement.variants.iter().peekable();
118        let mut skips = skips.peekable();
119
120        let mut variants: Vec<Variant> = Vec::new();
121
122        loop {
123            match (sources.peek(), skips.peek(), replacements.peek()) {
124                (Some(&source), Some(skip), _)
125                    if source.as_str().cmp(skip) == Ordering::Greater =>
126                {
127                    skips.next();
128                }
129                (Some(&source), Some(skip), _) if source.as_str().cmp(skip) == Ordering::Equal => {
130                    skips.next();
131                    sources.next();
132                }
133                (Some(&source), _, Some(&replacement))
134                    if replacement.cmp(source) == Ordering::Less =>
135                {
136                    variants.push(*replacement);
137                    replacements.next();
138                }
139                (Some(&source), _, Some(&replacement))
140                    if replacement.cmp(source) == Ordering::Equal =>
141                {
142                    variants.push(*source);
143                    sources.next();
144                    replacements.next();
145                }
146                (Some(&source), _, _) => {
147                    variants.push(*source);
148                    sources.next();
149                }
150                (None, _, Some(&replacement)) => {
151                    variants.push(*replacement);
152                    replacements.next();
153                }
154                (None, _, None) => {
155                    break;
156                }
157            }
158        }
159        source.variants = Variants::from_vec_unchecked(variants);
160    }
161}
162
163#[inline]
164fn uts35_check_language_rules(
165    langid: &mut LanguageIdentifier,
166    alias_data: &DataPayload<LocaleAliasesV1>,
167) -> TransformResult {
168    if !langid.language.is_unknown() {
169        let lang: TinyAsciiStr<3> = langid.language.into();
170        let replacement = if lang.len() == 2 {
171            alias_data
172                .get()
173                .language_len2
174                .get(&lang.resize().to_unvalidated())
175        } else {
176            alias_data.get().language_len3.get(&lang.to_unvalidated())
177        };
178
179        if let Some(replacement) = replacement {
180            if let Ok(new_langid) = replacement.parse() {
181                uts35_replacement::<core::iter::Empty<&str>>(
182                    langid,
183                    true,
184                    false,
185                    false,
186                    None,
187                    &new_langid,
188                );
189                return TransformResult::Modified;
190            }
191        }
192    }
193
194    TransformResult::Unmodified
195}
196
197impl LocaleCanonicalizer<LocaleExpander> {
198    /// A constructor which creates a [`LocaleCanonicalizer`] from compiled data,
199    /// using a [`LocaleExpander`] for common locales.
200    ///
201    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
202    ///
203    /// [📚 Help choosing a constructor](icu_provider::constructors)
204    #[cfg(feature = "compiled_data")]
205    pub const fn new_common() -> Self {
206        Self::new_with_expander(LocaleExpander::new_common())
207    }
208
209    icu_provider::gen_buffer_data_constructors!(() -> error: DataError,
210        functions: [
211            new_common: skip,
212            try_new_common_with_buffer_provider,
213            try_new_common_unstable,
214            Self,
215        ]
216    );
217
218    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_common)]
219    pub fn try_new_common_unstable<P>(provider: &P) -> Result<Self, DataError>
220    where
221        P: DataProvider<LocaleAliasesV1>
222            + DataProvider<LocaleLikelySubtagsLanguageV1>
223            + DataProvider<LocaleLikelySubtagsScriptRegionV1>
224            + ?Sized,
225    {
226        let expander = LocaleExpander::try_new_common_unstable(provider)?;
227        Self::try_new_with_expander_unstable(provider, expander)
228    }
229
230    /// A constructor which creates a [`LocaleCanonicalizer`] from compiled data,
231    /// using a [`LocaleExpander`] for all locales.
232    ///
233    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
234    ///
235    /// [📚 Help choosing a constructor](icu_provider::constructors)
236    #[cfg(feature = "compiled_data")]
237    pub const fn new_extended() -> Self {
238        Self::new_with_expander(LocaleExpander::new_extended())
239    }
240
241    icu_provider::gen_buffer_data_constructors!(() -> error: DataError,
242        functions: [
243            new_extended: skip,
244            try_new_extended_with_buffer_provider,
245            try_new_extended_unstable,
246            Self,
247        ]
248    );
249
250    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_extended)]
251    pub fn try_new_extended_unstable<P>(provider: &P) -> Result<Self, DataError>
252    where
253        P: DataProvider<LocaleAliasesV1>
254            + DataProvider<LocaleLikelySubtagsLanguageV1>
255            + DataProvider<LocaleLikelySubtagsScriptRegionV1>
256            + DataProvider<LocaleLikelySubtagsExtendedV1>
257            + ?Sized,
258    {
259        let expander = LocaleExpander::try_new_extended_unstable(provider)?;
260        Self::try_new_with_expander_unstable(provider, expander)
261    }
262}
263
264impl<Expander: AsRef<LocaleExpander>> LocaleCanonicalizer<Expander> {
265    /// Creates a [`LocaleCanonicalizer`] with a custom [`LocaleExpander`] and compiled data.
266    ///
267    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
268    ///
269    /// [📚 Help choosing a constructor](icu_provider::constructors)
270    #[cfg(feature = "compiled_data")]
271    pub const fn new_with_expander(expander: Expander) -> Self {
272        Self {
273            aliases: DataPayload::from_static_ref(
274                crate::provider::Baked::SINGLETON_LOCALE_ALIASES_V1,
275            ),
276            expander,
277        }
278    }
279
280    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_with_expander)]
281    pub fn try_new_with_expander_unstable<P>(
282        provider: &P,
283        expander: Expander,
284    ) -> Result<Self, DataError>
285    where
286        P: DataProvider<LocaleAliasesV1> + ?Sized,
287    {
288        let aliases: DataPayload<LocaleAliasesV1> = provider.load(Default::default())?.payload;
289
290        Ok(Self { aliases, expander })
291    }
292
293    icu_provider::gen_buffer_data_constructors!((options: Expander) -> error: DataError,
294        functions: [
295            new_with_expander: skip,
296            try_new_with_expander_with_buffer_provider,
297            try_new_with_expander_unstable,
298            Self,
299        ]
300    );
301
302    /// The canonicalize method potentially updates a passed in locale in place
303    /// depending up the results of running the canonicalization algorithm
304    /// from <http://unicode.org/reports/tr35/#LocaleId_Canonicalization>.
305    ///
306    /// Some BCP47 canonicalization data is not part of the CLDR json package. Because
307    /// of this, some canonicalizations are not performed, e.g. the canonicalization of
308    /// `und-u-ca-islamicc` to `und-u-ca-islamic-civil`. This will be fixed in a future
309    /// release once the missing data has been added to the CLDR json data. See:
310    /// <https://github.com/unicode-org/icu4x/issues/746>
311    ///
312    /// # Examples
313    ///
314    /// ```
315    /// use icu::locale::{Locale, LocaleCanonicalizer, TransformResult};
316    ///
317    /// let lc = LocaleCanonicalizer::new_extended();
318    ///
319    /// let mut locale: Locale = "ja-Latn-fonipa-hepburn-heploc".parse().unwrap();
320    /// assert_eq!(lc.canonicalize(&mut locale), TransformResult::Modified);
321    /// assert_eq!(locale, "ja-Latn-alalc97-fonipa".parse().unwrap());
322    /// ```
323    pub fn canonicalize(&self, locale: &mut Locale) -> TransformResult {
324        let mut result = TransformResult::Unmodified;
325
326        // This loops until we get a 'fixed point', where applying the rules do not
327        // result in any more changes.
328        loop {
329            // These are linear searches due to the ordering imposed by the canonicalization
330            // rules, where rules with more variants should be considered first. With the
331            // current data in CLDR, we will only do this for locales which have variants,
332            // or new rules which we haven't special-cased yet (of which there are fewer
333            // than 20).
334            let modified = if locale.id.variants.is_empty() {
335                self.canonicalize_absolute_language_fallbacks(&mut locale.id)
336            } else {
337                self.canonicalize_language_variant_fallbacks(&mut locale.id)
338            };
339            if modified {
340                result = TransformResult::Modified;
341                continue;
342            }
343
344            if !locale.id.language.is_unknown() {
345                // If the region is specified, check sgn-region rules first
346                if let Some(region) = locale.id.region {
347                    if locale.id.language == language!("sgn") {
348                        if let Some(&sgn_lang) = self
349                            .aliases
350                            .get()
351                            .sgn_region
352                            .get(&region.to_tinystr().to_unvalidated())
353                        {
354                            uts35_replacement::<core::iter::Empty<&str>>(
355                                &mut locale.id,
356                                true,
357                                false,
358                                true,
359                                None,
360                                &sgn_lang.into(),
361                            );
362                            result = TransformResult::Modified;
363                            continue;
364                        }
365                    }
366                }
367
368                if uts35_check_language_rules(&mut locale.id, &self.aliases)
369                    == TransformResult::Modified
370                {
371                    result = TransformResult::Modified;
372                    continue;
373                }
374            }
375
376            if let Some(script) = locale.id.script {
377                if let Some(&replacement) = self
378                    .aliases
379                    .get()
380                    .script
381                    .get(&script.to_tinystr().to_unvalidated())
382                {
383                    locale.id.script = Some(replacement);
384                    result = TransformResult::Modified;
385                    continue;
386                }
387            }
388
389            if let Some(region) = locale.id.region {
390                let replacement = if region.is_alphabetic() {
391                    self.aliases
392                        .get()
393                        .region_alpha
394                        .get(&region.to_tinystr().resize().to_unvalidated())
395                } else {
396                    self.aliases
397                        .get()
398                        .region_num
399                        .get(&region.to_tinystr().to_unvalidated())
400                };
401                if let Some(&replacement) = replacement {
402                    locale.id.region = Some(replacement);
403                    result = TransformResult::Modified;
404                    continue;
405                }
406
407                if let Some(regions) = self
408                    .aliases
409                    .get()
410                    .complex_region
411                    .get(&region.to_tinystr().to_unvalidated())
412                {
413                    // Skip if regions are empty
414                    if let Some(default_region) = regions.get(0) {
415                        let mut maximized = LanguageIdentifier {
416                            language: locale.id.language,
417                            script: locale.id.script,
418                            region: None,
419                            variants: Variants::default(),
420                        };
421
422                        locale.id.region = Some(
423                            match (
424                                self.expander.as_ref().maximize(&mut maximized),
425                                maximized.region,
426                            ) {
427                                (TransformResult::Modified, Some(candidate))
428                                    if regions.iter().any(|x| x == candidate) =>
429                                {
430                                    candidate
431                                }
432                                _ => default_region,
433                            },
434                        );
435                        result = TransformResult::Modified;
436                        continue;
437                    }
438                }
439            }
440
441            if !locale.id.variants.is_empty() {
442                let mut modified = Vec::with_capacity(0);
443                for (idx, &variant) in locale.id.variants.iter().enumerate() {
444                    if let Some(&updated) = self
445                        .aliases
446                        .get()
447                        .variant
448                        .get(&variant.to_tinystr().to_unvalidated())
449                    {
450                        if modified.is_empty() {
451                            modified = locale.id.variants.to_vec();
452                        }
453                        #[allow(clippy::indexing_slicing)]
454                        let _ = core::mem::replace(&mut modified[idx], updated);
455                    }
456                }
457
458                if !modified.is_empty() {
459                    modified.sort();
460                    modified.dedup();
461                    locale.id.variants = Variants::from_vec_unchecked(modified);
462                    result = TransformResult::Modified;
463                    continue;
464                }
465            }
466
467            // Nothing matched in this iteration, we're done.
468            break;
469        }
470
471        if !locale.extensions.transform.is_empty() || !locale.extensions.unicode.is_empty() {
472            self.canonicalize_extensions(&mut locale.extensions, &mut result);
473        }
474        result
475    }
476
477    fn canonicalize_extensions(&self, extensions: &mut Extensions, result: &mut TransformResult) {
478        // Handle Locale extensions in their own loops, because these rules do not interact
479        // with each other.
480        if let Some(ref mut lang) = extensions.transform.lang {
481            while uts35_check_language_rules(lang, &self.aliases) == TransformResult::Modified {
482                *result = TransformResult::Modified;
483            }
484        }
485
486        if !extensions.unicode.keywords.is_empty() {
487            for key in [key!("rg"), key!("sd")] {
488                if let Some(value) = extensions.unicode.keywords.get_mut(&key) {
489                    if let Some(only_value) = value.as_single_subtag() {
490                        if let Some(modified_value) = self
491                            .aliases
492                            .get()
493                            .subdivision
494                            .get(&only_value.to_tinystr().resize().to_unvalidated())
495                        {
496                            if let Ok(modified_value) = modified_value.parse() {
497                                *value = modified_value;
498                                *result = TransformResult::Modified;
499                            }
500                        }
501                    }
502                }
503            }
504        }
505    }
506
507    fn canonicalize_language_variant_fallbacks(&self, lid: &mut LanguageIdentifier) -> bool {
508        // These language/variant comibnations have around 20 rules
509        for LanguageStrStrPair(lang, raw_variants, raw_to) in self
510            .aliases
511            .get()
512            .language_variants
513            .iter()
514            .map(zerofrom::ZeroFrom::zero_from)
515        {
516            let raw_variants = raw_variants.split('-');
517            // if is_iter_sorted(raw_variants.clone()) { // can we sort at construction?
518            if uts35_rule_matches(lid, lang, None, None, raw_variants.clone()) {
519                if let Ok(to) = raw_to.parse() {
520                    uts35_replacement(
521                        lid,
522                        !lang.is_unknown(),
523                        false,
524                        false,
525                        Some(raw_variants),
526                        &to,
527                    );
528                    return true;
529                }
530            }
531        }
532        false
533    }
534
535    fn canonicalize_absolute_language_fallbacks(&self, lid: &mut LanguageIdentifier) -> bool {
536        for StrStrPair(raw_from, raw_to) in self
537            .aliases
538            .get()
539            .language
540            .iter()
541            .map(zerofrom::ZeroFrom::zero_from)
542        {
543            if let Ok(from) = raw_from.parse::<LanguageIdentifier>() {
544                if uts35_rule_matches(
545                    lid,
546                    from.language,
547                    from.script,
548                    from.region,
549                    from.variants.iter().map(Variant::as_str),
550                ) {
551                    if let Ok(to) = raw_to.parse() {
552                        uts35_replacement(
553                            lid,
554                            !from.language.is_unknown(),
555                            from.script.is_some(),
556                            from.region.is_some(),
557                            Some(from.variants.iter().map(Variant::as_str)),
558                            &to,
559                        );
560                        return true;
561                    }
562                }
563            }
564        }
565        false
566    }
567}
568
#[cfg(test)]
mod test {
    use super::*;

    /// Checks rule matching against a table of `(source, rule, expected)` cases.
    #[test]
    fn test_uts35_rule_matches() {
        let cases = [
            ("ja", "und", true),
            ("und-heploc-hepburn", "und-hepburn", true),
            ("ja-heploc-hepburn", "und-hepburn", true),
            ("ja-hepburn", "und-hepburn-heploc", false),
        ];

        for (source, rule, expected) in cases {
            let source: LanguageIdentifier = source.parse().unwrap();
            let rule: LanguageIdentifier = rule.parse().unwrap();

            let matched = uts35_rule_matches(
                &source,
                rule.language,
                rule.script,
                rule.region,
                rule.variants.iter().map(Variant::as_str),
            );

            assert_eq!(matched, expected, "{}", source);
        }
    }

    /// Checks rule application against a table of
    /// `(locale, rule source, rule replacement, expected locale)` cases.
    #[test]
    fn test_uts35_replacement() {
        let cases = [
            (
                "ja-Latn-fonipa-hepburn-heploc",
                "und-hepburn-heploc",
                "und-alalc97",
                "ja-Latn-alalc97-fonipa",
            ),
            ("sgn-DD", "und-DD", "und-DE", "sgn-DE"),
            ("sgn-DE", "sgn-DE", "gsg", "gsg"),
        ];

        for (input, rule_from, rule_to, expected) in cases {
            let mut locale: Locale = input.parse().unwrap();
            let rule_from: LanguageIdentifier = rule_from.parse().unwrap();
            let rule_to: LanguageIdentifier = rule_to.parse().unwrap();
            let expected: Locale = expected.parse().unwrap();

            uts35_replacement(
                &mut locale.id,
                !rule_from.language.is_unknown(),
                rule_from.script.is_some(),
                rule_from.region.is_some(),
                Some(rule_from.variants.iter().map(Variant::as_str)),
                &rule_to,
            );

            assert_eq!(expected, locale);
        }
    }
}