icu_locale/
expander.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::provider::*;
6
7use icu_locale_core::subtags::{Language, Region, Script};
8use icu_locale_core::LanguageIdentifier;
9use icu_provider::prelude::*;
10
11use crate::TransformResult;
12
13/// Implements the *Add Likely Subtags* and *Remove Likely Subtags*
14/// algorithms as defined in *[UTS #35: Likely Subtags]*.
15///
16/// # Examples
17///
18/// Add likely subtags:
19///
20/// ```
21/// use icu::locale::locale;
22/// use icu::locale::{LocaleExpander, TransformResult};
23///
24/// let lc = LocaleExpander::new_common();
25///
26/// let mut locale = locale!("zh-CN");
27/// assert_eq!(lc.maximize(&mut locale.id), TransformResult::Modified);
28/// assert_eq!(locale, locale!("zh-Hans-CN"));
29///
30/// let mut locale = locale!("zh-Hant-TW");
31/// assert_eq!(lc.maximize(&mut locale.id), TransformResult::Unmodified);
32/// assert_eq!(locale, locale!("zh-Hant-TW"));
33/// ```
34///
35/// Remove likely subtags:
36///
37/// ```
38/// use icu::locale::{locale, LocaleExpander, TransformResult};
39///
40/// let lc = LocaleExpander::new_common();
41///
42/// let mut locale = locale!("zh-Hans-CN");
43/// assert_eq!(lc.minimize(&mut locale.id), TransformResult::Modified);
44/// assert_eq!(locale, locale!("zh"));
45///
46/// let mut locale = locale!("zh");
47/// assert_eq!(lc.minimize(&mut locale.id), TransformResult::Unmodified);
48/// assert_eq!(locale, locale!("zh"));
49/// ```
50///
51/// Normally, only CLDR locales with Basic or higher coverage are included. To include more
52/// locales for maximization, use [`try_new_extended`](Self::try_new_extended_unstable):
53///
54/// ```
55/// use icu::locale::{locale, LocaleExpander, TransformResult};
56///
57/// let lc = LocaleExpander::new_extended();
58///
59/// let mut locale = locale!("atj");
60/// assert_eq!(lc.maximize(&mut locale.id), TransformResult::Modified);
61/// assert_eq!(locale, locale!("atj-Latn-CA"));
62/// ```
63///
64/// [UTS #35: Likely Subtags]: https://www.unicode.org/reports/tr35/#Likely_Subtags
65#[derive(Debug, Clone)]
66pub struct LocaleExpander {
67    likely_subtags_l: DataPayload<LocaleLikelySubtagsLanguageV1>,
68    likely_subtags_sr: DataPayload<LocaleLikelySubtagsScriptRegionV1>,
69    likely_subtags_ext: Option<DataPayload<LocaleLikelySubtagsExtendedV1>>,
70}
71
72struct LocaleExpanderBorrowed<'a> {
73    likely_subtags_l: &'a LikelySubtagsForLanguage<'a>,
74    likely_subtags_sr: &'a LikelySubtagsForScriptRegion<'a>,
75    likely_subtags_ext: Option<&'a LikelySubtagsExtended<'a>>,
76}
77
78impl LocaleExpanderBorrowed<'_> {
79    fn get_l(&self, l: Language) -> Option<(Script, Region)> {
80        let key = &l.to_tinystr().to_unvalidated();
81        self.likely_subtags_l.language.get_copied(key).or_else(|| {
82            self.likely_subtags_ext
83                .and_then(|ext| ext.language.get_copied(key))
84        })
85    }
86
87    fn get_ls(&self, l: Language, s: Script) -> Option<Region> {
88        let key = &(
89            l.to_tinystr().to_unvalidated(),
90            s.to_tinystr().to_unvalidated(),
91        );
92        self.likely_subtags_l
93            .language_script
94            .get_copied(key)
95            .or_else(|| {
96                self.likely_subtags_ext
97                    .and_then(|ext| ext.language_script.get_copied(key))
98            })
99    }
100
101    fn get_lr(&self, l: Language, r: Region) -> Option<Script> {
102        let key = &(
103            l.to_tinystr().to_unvalidated(),
104            r.to_tinystr().to_unvalidated(),
105        );
106        self.likely_subtags_l
107            .language_region
108            .get_copied(key)
109            .or_else(|| {
110                self.likely_subtags_ext
111                    .and_then(|ext| ext.language_region.get_copied(key))
112            })
113    }
114
115    fn get_s(&self, s: Script) -> Option<(Language, Region)> {
116        let key = &s.to_tinystr().to_unvalidated();
117        self.likely_subtags_sr.script.get_copied(key).or_else(|| {
118            self.likely_subtags_ext
119                .and_then(|ext| ext.script.get_copied(key))
120        })
121    }
122
123    fn get_sr(&self, s: Script, r: Region) -> Option<Language> {
124        let key = &(
125            s.to_tinystr().to_unvalidated(),
126            r.to_tinystr().to_unvalidated(),
127        );
128        self.likely_subtags_sr
129            .script_region
130            .get_copied(key)
131            .or_else(|| {
132                self.likely_subtags_ext
133                    .and_then(|ext| ext.script_region.get_copied(key))
134            })
135    }
136
137    fn get_r(&self, r: Region) -> Option<(Language, Script)> {
138        let key = &r.to_tinystr().to_unvalidated();
139        self.likely_subtags_sr.region.get_copied(key).or_else(|| {
140            self.likely_subtags_ext
141                .and_then(|ext| ext.region.get_copied(key))
142        })
143    }
144
145    fn get_und(&self) -> (Language, Script, Region) {
146        self.likely_subtags_l.und
147    }
148}
149
150#[inline]
151fn update_langid(
152    language: Language,
153    script: Option<Script>,
154    region: Option<Region>,
155    langid: &mut LanguageIdentifier,
156) -> TransformResult {
157    let mut modified = false;
158
159    if langid.language.is_unknown() && !language.is_unknown() {
160        langid.language = language;
161        modified = true;
162    }
163
164    if langid.script.is_none() && script.is_some() {
165        langid.script = script;
166        modified = true;
167    }
168
169    if langid.region.is_none() && region.is_some() {
170        langid.region = region;
171        modified = true;
172    }
173
174    if modified {
175        TransformResult::Modified
176    } else {
177        TransformResult::Unmodified
178    }
179}
180
181#[inline]
182fn update_langid_minimize(
183    language: Language,
184    script: Option<Script>,
185    region: Option<Region>,
186    langid: &mut LanguageIdentifier,
187) -> TransformResult {
188    let mut modified = false;
189
190    if langid.language != language {
191        langid.language = language;
192        modified = true;
193    }
194
195    if langid.script != script {
196        langid.script = script;
197        modified = true;
198    }
199
200    if langid.region != region {
201        langid.region = region;
202        modified = true;
203    }
204
205    if modified {
206        TransformResult::Modified
207    } else {
208        TransformResult::Unmodified
209    }
210}
211
212impl LocaleExpander {
213    /// Creates a [`LocaleExpander`] with compiled data for commonly-used locales
214    /// (locales with *Basic* or higher [CLDR coverage]).
215    ///
216    /// Use this constructor if you want limited likely subtags for data-oriented use cases.
217    ///
218    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
219    ///
220    /// [📚 Help choosing a constructor](icu_provider::constructors)
221    ///
222    /// [CLDR coverage]: https://www.unicode.org/reports/tr35/tr35-info.html#Coverage_Levels
223    #[cfg(feature = "compiled_data")]
224    pub const fn new_common() -> Self {
225        LocaleExpander {
226            likely_subtags_l: DataPayload::from_static_ref(
227                crate::provider::Baked::SINGLETON_LOCALE_LIKELY_SUBTAGS_LANGUAGE_V1,
228            ),
229            likely_subtags_sr: DataPayload::from_static_ref(
230                crate::provider::Baked::SINGLETON_LOCALE_LIKELY_SUBTAGS_SCRIPT_REGION_V1,
231            ),
232            likely_subtags_ext: None,
233        }
234    }
235
236    icu_provider::gen_buffer_data_constructors!(() -> error: DataError,
237        functions: [
238        new_common: skip,
239        try_new_common_with_buffer_provider,
240        try_new_common_unstable,
241        Self
242    ]);
243
244    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_common)]
245    pub fn try_new_common_unstable<P>(provider: &P) -> Result<LocaleExpander, DataError>
246    where
247        P: DataProvider<LocaleLikelySubtagsLanguageV1>
248            + DataProvider<LocaleLikelySubtagsScriptRegionV1>
249            + ?Sized,
250    {
251        let likely_subtags_l = provider.load(Default::default())?.payload;
252        let likely_subtags_sr = provider.load(Default::default())?.payload;
253
254        Ok(LocaleExpander {
255            likely_subtags_l,
256            likely_subtags_sr,
257            likely_subtags_ext: None,
258        })
259    }
260
261    /// Creates a [`LocaleExpander`] with compiled data for all locales.
262    ///
263    /// Use this constructor if you want to include data for all locales, including ones
264    /// that may not have data for other services (i.e. [CLDR coverage] below *Basic*).
265    ///
266    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
267    ///
268    /// [📚 Help choosing a constructor](icu_provider::constructors)
269    ///
270    /// [CLDR coverage]: https://www.unicode.org/reports/tr35/tr35-info.html#Coverage_Levels
271    #[cfg(feature = "compiled_data")]
272    pub const fn new_extended() -> Self {
273        LocaleExpander {
274            likely_subtags_l: DataPayload::from_static_ref(
275                crate::provider::Baked::SINGLETON_LOCALE_LIKELY_SUBTAGS_LANGUAGE_V1,
276            ),
277            likely_subtags_sr: DataPayload::from_static_ref(
278                crate::provider::Baked::SINGLETON_LOCALE_LIKELY_SUBTAGS_SCRIPT_REGION_V1,
279            ),
280            likely_subtags_ext: Some(DataPayload::from_static_ref(
281                crate::provider::Baked::SINGLETON_LOCALE_LIKELY_SUBTAGS_EXTENDED_V1,
282            )),
283        }
284    }
285
286    icu_provider::gen_buffer_data_constructors!(() -> error: DataError,
287        functions: [
288        new_extended: skip,
289        try_new_extended_with_buffer_provider,
290        try_new_extended_unstable,
291        Self
292    ]);
293
294    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_extended)]
295    pub fn try_new_extended_unstable<P>(provider: &P) -> Result<LocaleExpander, DataError>
296    where
297        P: DataProvider<LocaleLikelySubtagsLanguageV1>
298            + DataProvider<LocaleLikelySubtagsScriptRegionV1>
299            + DataProvider<LocaleLikelySubtagsExtendedV1>
300            + ?Sized,
301    {
302        let likely_subtags_l = provider.load(Default::default())?.payload;
303        let likely_subtags_sr = provider.load(Default::default())?.payload;
304        let likely_subtags_ext = Some(provider.load(Default::default())?.payload);
305
306        Ok(LocaleExpander {
307            likely_subtags_l,
308            likely_subtags_sr,
309            likely_subtags_ext,
310        })
311    }
312
313    fn as_borrowed(&self) -> LocaleExpanderBorrowed {
314        LocaleExpanderBorrowed {
315            likely_subtags_l: self.likely_subtags_l.get(),
316            likely_subtags_sr: self.likely_subtags_sr.get(),
317            likely_subtags_ext: self.likely_subtags_ext.as_ref().map(|p| p.get()),
318        }
319    }
320
321    /// The maximize method potentially updates a passed in locale in place
322    /// depending up the results of running the 'Add Likely Subtags' algorithm
323    /// from <https://www.unicode.org/reports/tr35/#Likely_Subtags>.
324    ///
325    /// If the result of running the algorithm would result in a new locale, the
326    /// locale argument is updated in place to match the result, and the method
327    /// returns [`TransformResult::Modified`]. Otherwise, the method
328    /// returns [`TransformResult::Unmodified`] and the locale argument is
329    /// unchanged.
330    ///
331    /// This function does not guarantee that any particular set of subtags
332    /// will be present in the resulting locale.
333    ///
334    /// # Examples
335    ///
336    /// ```
337    /// use icu::locale::{locale, LocaleExpander, TransformResult};
338    ///
339    /// let lc = LocaleExpander::new_common();
340    ///
341    /// let mut locale = locale!("zh-CN");
342    /// assert_eq!(lc.maximize(&mut locale.id), TransformResult::Modified);
343    /// assert_eq!(locale, locale!("zh-Hans-CN"));
344    ///
345    /// let mut locale = locale!("zh-Hant-TW");
346    /// assert_eq!(lc.maximize(&mut locale.id), TransformResult::Unmodified);
347    /// assert_eq!(locale, locale!("zh-Hant-TW"));
348    /// ```
349    ///
350    /// If there is no data for a particular language, the result is not
351    /// modified. Note that [`LocaleExpander::new_extended`] supports
352    /// more languages.
353    ///
354    /// ```
355    /// use icu::locale::{locale, LocaleExpander, TransformResult};
356    ///
357    /// let lc = LocaleExpander::new_common();
358    ///
359    /// // No subtags data for ccp in the default set:
360    /// let mut locale = locale!("ccp");
361    /// assert_eq!(lc.maximize(&mut locale.id), TransformResult::Unmodified);
362    /// assert_eq!(locale, locale!("ccp"));
363    ///
364    /// // The extended set supports it:
365    /// let lc = LocaleExpander::new_extended();
366    /// let mut locale = locale!("ccp");
367    /// assert_eq!(lc.maximize(&mut locale.id), TransformResult::Modified);
368    /// assert_eq!(locale, locale!("ccp-Cakm-BD"));
369    ///
370    /// // But even the extended set does not support all language subtags:
371    /// let mut locale = locale!("mul");
372    /// assert_eq!(lc.maximize(&mut locale.id), TransformResult::Unmodified);
373    /// assert_eq!(locale, locale!("mul"));
374    /// ```
375    pub fn maximize(&self, langid: &mut LanguageIdentifier) -> TransformResult {
376        let data = self.as_borrowed();
377
378        if !langid.language.is_unknown() && langid.script.is_some() && langid.region.is_some() {
379            return TransformResult::Unmodified;
380        }
381
382        if !langid.language.is_unknown() {
383            if let Some(region) = langid.region {
384                if let Some(script) = data.get_lr(langid.language, region) {
385                    return update_langid(Language::UNKNOWN, Some(script), None, langid);
386                }
387            }
388            if let Some(script) = langid.script {
389                if let Some(region) = data.get_ls(langid.language, script) {
390                    return update_langid(Language::UNKNOWN, None, Some(region), langid);
391                }
392            }
393            if let Some((script, region)) = data.get_l(langid.language) {
394                return update_langid(Language::UNKNOWN, Some(script), Some(region), langid);
395            }
396            // Language not found: return unmodified.
397            return TransformResult::Unmodified;
398        }
399        if let Some(script) = langid.script {
400            if let Some(region) = langid.region {
401                if let Some(language) = data.get_sr(script, region) {
402                    return update_langid(language, None, None, langid);
403                }
404            }
405            if let Some((language, region)) = data.get_s(script) {
406                return update_langid(language, None, Some(region), langid);
407            }
408        }
409        if let Some(region) = langid.region {
410            if let Some((language, script)) = data.get_r(region) {
411                return update_langid(language, Some(script), None, langid);
412            }
413        }
414
415        // We failed to find anything in the und-SR, und-S, or und-R tables,
416        // to fall back to bare "und"
417        debug_assert!(langid.language.is_unknown());
418        update_langid(
419            data.get_und().0,
420            Some(data.get_und().1),
421            Some(data.get_und().2),
422            langid,
423        )
424    }
425
426    /// This returns a new Locale that is the result of running the
427    /// 'Remove Likely Subtags' algorithm from
428    /// <https://www.unicode.org/reports/tr35/#Likely_Subtags>.
429    ///
430    /// If the result of running the algorithm would result in a new locale, the
431    /// locale argument is updated in place to match the result, and the method
432    /// returns [`TransformResult::Modified`]. Otherwise, the method
433    /// returns [`TransformResult::Unmodified`] and the locale argument is
434    /// unchanged.
435    ///
436    /// # Examples
437    ///
438    /// ```
439    /// use icu::locale::{locale, LocaleExpander, TransformResult};
440    ///
441    /// let lc = LocaleExpander::new_common();
442    ///
443    /// let mut locale = locale!("zh-Hans-CN");
444    /// assert_eq!(lc.minimize(&mut locale.id), TransformResult::Modified);
445    /// assert_eq!(locale, locale!("zh"));
446    ///
447    /// let mut locale = locale!("zh");
448    /// assert_eq!(lc.minimize(&mut locale.id), TransformResult::Unmodified);
449    /// assert_eq!(locale, locale!("zh"));
450    /// ```
451    pub fn minimize(&self, langid: &mut LanguageIdentifier) -> TransformResult {
452        self.minimize_impl(langid, true)
453    }
454
455    /// This returns a new Locale that is the result of running the
456    /// 'Remove Likely Subtags, favoring script' algorithm from
457    /// <https://www.unicode.org/reports/tr35/#Likely_Subtags>.
458    ///
459    /// If the result of running the algorithm would result in a new locale, the
460    /// locale argument is updated in place to match the result, and the method
461    /// returns [`TransformResult::Modified`]. Otherwise, the method
462    /// returns [`TransformResult::Unmodified`] and the locale argument is
463    /// unchanged.
464    ///
465    /// # Examples
466    ///
467    /// ```
468    /// use icu::locale::{locale, LocaleExpander, TransformResult};
469    ///
470    /// let lc = LocaleExpander::new_common();
471    ///
472    /// let mut locale = locale!("zh-TW");
473    /// assert_eq!(
474    ///     lc.minimize_favor_script(&mut locale.id),
475    ///     TransformResult::Modified
476    /// );
477    /// assert_eq!(locale, locale!("zh-Hant"));
478    /// ```
479    pub fn minimize_favor_script(&self, langid: &mut LanguageIdentifier) -> TransformResult {
480        self.minimize_impl(langid, false)
481    }
482
483    fn minimize_impl(
484        &self,
485        langid: &mut LanguageIdentifier,
486        favor_region: bool,
487    ) -> TransformResult {
488        let mut max = langid.clone();
489        self.maximize(&mut max);
490
491        let mut trial = max.clone();
492
493        trial.script = None;
494        trial.region = None;
495        self.maximize(&mut trial);
496        if trial == max {
497            return update_langid_minimize(max.language, None, None, langid);
498        }
499
500        if favor_region {
501            trial.script = None;
502            trial.region = max.region;
503            self.maximize(&mut trial);
504
505            if trial == max {
506                return update_langid_minimize(max.language, None, max.region, langid);
507            }
508
509            trial.script = max.script;
510            trial.region = None;
511            self.maximize(&mut trial);
512            if trial == max {
513                return update_langid_minimize(max.language, max.script, None, langid);
514            }
515        } else {
516            trial.script = max.script;
517            trial.region = None;
518            self.maximize(&mut trial);
519            if trial == max {
520                return update_langid_minimize(max.language, max.script, None, langid);
521            }
522
523            trial.script = None;
524            trial.region = max.region;
525            self.maximize(&mut trial);
526
527            if trial == max {
528                return update_langid_minimize(max.language, None, max.region, langid);
529            }
530        }
531
532        update_langid_minimize(max.language, max.script, max.region, langid)
533    }
534
535    // TODO(3492): consider turning this and a future get_likely_region/get_likely_language public
536    #[inline]
537    pub(crate) fn get_likely_script(&self, langid: &LanguageIdentifier) -> Option<Script> {
538        langid
539            .script
540            .or_else(|| self.infer_likely_script(langid.language, langid.region))
541    }
542
543    fn infer_likely_script(&self, language: Language, region: Option<Region>) -> Option<Script> {
544        let data = self.as_borrowed();
545
546        // proceed through _all possible cases_ in order of specificity
547        // (borrowed from LocaleExpander::maximize):
548        // 1. language + region
549        // 2. language
550        // 3. region
551        // we need to check all cases, because e.g. for "en-US" the default script is associated
552        // with "en" but not "en-US"
553        if !language.is_unknown() {
554            if let Some(region) = region {
555                // 1. we know both language and region
556                if let Some(script) = data.get_lr(language, region) {
557                    return Some(script);
558                }
559            }
560            // 2. we know language, but we either do not know region or knowing region did not help
561            if let Some((script, _)) = data.get_l(language) {
562                return Some(script);
563            }
564        }
565        if let Some(region) = region {
566            // 3. we know region, but we either do not know language or knowing language did not help
567            if let Some((_, script)) = data.get_r(region) {
568                return Some(script);
569            }
570        }
571        // we could not figure out the script from the given locale
572        None
573    }
574}
575
576impl AsRef<LocaleExpander> for LocaleExpander {
577    fn as_ref(&self) -> &LocaleExpander {
578        self
579    }
580}
581
// These tests build the expander with `LocaleExpander::new_common()`, which
// only exists when the `compiled_data` feature is enabled, so the module is
// gated on that feature (nothing in here depends on `serde`).
#[cfg(all(test, feature = "compiled_data"))]
mod tests {
    use super::*;
    use icu_locale_core::locale;

    /// Favoring script: "yue-Hans" is already minimal, so nothing changes.
    #[test]
    fn test_minimize_favor_script() {
        let lc = LocaleExpander::new_common();
        let mut locale = locale!("yue-Hans");
        assert_eq!(
            lc.minimize_favor_script(&mut locale.id),
            TransformResult::Unmodified
        );
        assert_eq!(locale, locale!("yue-Hans"));
    }

    /// Favoring region (the default): "yue-Hans" is rewritten to "yue-CN".
    #[test]
    fn test_minimize_favor_region() {
        let lc = LocaleExpander::new_common();
        let mut locale = locale!("yue-Hans");
        assert_eq!(lc.minimize(&mut locale.id), TransformResult::Modified);
        assert_eq!(locale, locale!("yue-CN"));
    }
}