1use crate::provider::*;
8use alloc::vec::Vec;
9use core::cmp::Ordering;
10
11use crate::LocaleExpander;
12use crate::TransformResult;
13use icu_locale_core::extensions::Extensions;
14use icu_locale_core::subtags::{Language, Region, Script};
15use icu_locale_core::{
16 extensions::unicode::key,
17 subtags::{language, Variant, Variants},
18 LanguageIdentifier, Locale,
19};
20use icu_provider::prelude::*;
21use tinystr::TinyAsciiStr;
22
/// Canonicalizes locale identifiers in place using alias data.
///
/// The `Expander` type parameter defaults to [`LocaleExpander`]; the methods in this file
/// only require that it can be borrowed as a `LocaleExpander` via `AsRef`.
#[derive(Debug)]
pub struct LocaleCanonicalizer<Expander = LocaleExpander> {
    /// Alias tables (languages, scripts, regions, variants, subdivisions, ...) consulted
    /// while canonicalizing.
    aliases: DataPayload<LocaleAliasesV1>,
    /// Used to maximize a language identifier when a region alias maps to several
    /// candidate regions and one of them has to be picked.
    expander: Expander,
}
46
47fn uts35_rule_matches<'a, I>(
48 source: &LanguageIdentifier,
49 language: Language,
50 script: Option<Script>,
51 region: Option<Region>,
52 raw_variants: I,
53) -> bool
54where
55 I: Iterator<Item = &'a str>,
56{
57 (language.is_unknown() || language == source.language)
58 && (script.is_none() || script == source.script)
59 && (region.is_none() || region == source.region)
60 && {
61 let mut source_variants = source.variants.iter();
64 'outer: for raw_variant in raw_variants {
65 for source_variant in source_variants.by_ref() {
66 match source_variant.as_str().cmp(raw_variant) {
67 Ordering::Equal => {
68 continue 'outer;
70 }
71 Ordering::Less => {
72 }
74 Ordering::Greater => {
75 return false;
78 }
79 }
80 }
81 return false;
83 }
84 true
85 }
86}
87
88fn uts35_replacement<'a, I>(
89 source: &mut LanguageIdentifier,
90 ruletype_has_language: bool,
91 ruletype_has_script: bool,
92 ruletype_has_region: bool,
93 ruletype_variants: Option<I>,
94 replacement: &LanguageIdentifier,
95) where
96 I: Iterator<Item = &'a str>,
97{
98 if ruletype_has_language || (source.language.is_unknown() && !replacement.language.is_unknown())
99 {
100 source.language = replacement.language;
101 }
102 if ruletype_has_script || (source.script.is_none() && replacement.script.is_some()) {
103 source.script = replacement.script;
104 }
105 if ruletype_has_region || (source.region.is_none() && replacement.region.is_some()) {
106 source.region = replacement.region;
107 }
108 if let Some(skips) = ruletype_variants {
109 let mut sources = source.variants.iter().peekable();
117 let mut replacements = replacement.variants.iter().peekable();
118 let mut skips = skips.peekable();
119
120 let mut variants: Vec<Variant> = Vec::new();
121
122 loop {
123 match (sources.peek(), skips.peek(), replacements.peek()) {
124 (Some(&source), Some(skip), _)
125 if source.as_str().cmp(skip) == Ordering::Greater =>
126 {
127 skips.next();
128 }
129 (Some(&source), Some(skip), _) if source.as_str().cmp(skip) == Ordering::Equal => {
130 skips.next();
131 sources.next();
132 }
133 (Some(&source), _, Some(&replacement))
134 if replacement.cmp(source) == Ordering::Less =>
135 {
136 variants.push(*replacement);
137 replacements.next();
138 }
139 (Some(&source), _, Some(&replacement))
140 if replacement.cmp(source) == Ordering::Equal =>
141 {
142 variants.push(*source);
143 sources.next();
144 replacements.next();
145 }
146 (Some(&source), _, _) => {
147 variants.push(*source);
148 sources.next();
149 }
150 (None, _, Some(&replacement)) => {
151 variants.push(*replacement);
152 replacements.next();
153 }
154 (None, _, None) => {
155 break;
156 }
157 }
158 }
159 source.variants = Variants::from_vec_unchecked(variants);
160 }
161}
162
163#[inline]
164fn uts35_check_language_rules(
165 langid: &mut LanguageIdentifier,
166 alias_data: &DataPayload<LocaleAliasesV1>,
167) -> TransformResult {
168 if !langid.language.is_unknown() {
169 let lang: TinyAsciiStr<3> = langid.language.into();
170 let replacement = if lang.len() == 2 {
171 alias_data
172 .get()
173 .language_len2
174 .get(&lang.resize().to_unvalidated())
175 } else {
176 alias_data.get().language_len3.get(&lang.to_unvalidated())
177 };
178
179 if let Some(replacement) = replacement {
180 if let Ok(new_langid) = replacement.parse() {
181 uts35_replacement::<core::iter::Empty<&str>>(
182 langid,
183 true,
184 false,
185 false,
186 None,
187 &new_langid,
188 );
189 return TransformResult::Modified;
190 }
191 }
192 }
193
194 TransformResult::Unmodified
195}
196
/// Constructors that build the default [`LocaleExpander`] along with the canonicalizer.
impl LocaleCanonicalizer<LocaleExpander> {
    /// Creates a canonicalizer from compiled data, paired with the expander returned by
    /// `LocaleExpander::new_common()`.
    ///
    /// Only available when the `compiled_data` Cargo feature is enabled.
    #[cfg(feature = "compiled_data")]
    pub const fn new_common() -> Self {
        Self::new_with_expander(LocaleExpander::new_common())
    }

    // Macro-generated constructor(s) for `new_common`. NOTE(review): judging by the names
    // passed in, this emits `try_new_common_with_buffer_provider` on top of
    // `try_new_common_unstable`, and skips generating `new_common` because it is written
    // by hand above — confirm against the macro's documentation.
    icu_provider::gen_buffer_data_constructors!(() -> error: DataError,
        functions: [
            new_common: skip,
            try_new_common_with_buffer_provider,
            try_new_common_unstable,
            Self,
        ]
    );

    // Provider-based counterpart of `new_common`: builds the expander with
    // `LocaleExpander::try_new_common_unstable` and then loads the alias data from the same
    // provider. Any `DataError` from either step is returned to the caller.
    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_common)]
    pub fn try_new_common_unstable<P>(provider: &P) -> Result<Self, DataError>
    where
        P: DataProvider<LocaleAliasesV1>
            + DataProvider<LocaleLikelySubtagsLanguageV1>
            + DataProvider<LocaleLikelySubtagsScriptRegionV1>
            + ?Sized,
    {
        let expander = LocaleExpander::try_new_common_unstable(provider)?;
        Self::try_new_with_expander_unstable(provider, expander)
    }

    /// Creates a canonicalizer from compiled data, paired with the expander returned by
    /// `LocaleExpander::new_extended()`.
    ///
    /// Only available when the `compiled_data` Cargo feature is enabled.
    #[cfg(feature = "compiled_data")]
    pub const fn new_extended() -> Self {
        Self::new_with_expander(LocaleExpander::new_extended())
    }

    // Macro-generated constructor(s) for `new_extended`. NOTE(review): judging by the
    // names passed in, this emits `try_new_extended_with_buffer_provider` on top of
    // `try_new_extended_unstable` — confirm against the macro's documentation.
    icu_provider::gen_buffer_data_constructors!(() -> error: DataError,
        functions: [
            new_extended: skip,
            try_new_extended_with_buffer_provider,
            try_new_extended_unstable,
            Self,
        ]
    );

    // Provider-based counterpart of `new_extended`. Compared with
    // `try_new_common_unstable`, the provider must additionally serve
    // `LocaleLikelySubtagsExtendedV1`, which the extended expander constructor is given
    // access to.
    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_extended)]
    pub fn try_new_extended_unstable<P>(provider: &P) -> Result<Self, DataError>
    where
        P: DataProvider<LocaleAliasesV1>
            + DataProvider<LocaleLikelySubtagsLanguageV1>
            + DataProvider<LocaleLikelySubtagsScriptRegionV1>
            + DataProvider<LocaleLikelySubtagsExtendedV1>
            + ?Sized,
    {
        let expander = LocaleExpander::try_new_extended_unstable(provider)?;
        Self::try_new_with_expander_unstable(provider, expander)
    }
}
263
264impl<Expander: AsRef<LocaleExpander>> LocaleCanonicalizer<Expander> {
    /// Creates a canonicalizer from compiled alias data and a caller-supplied expander.
    ///
    /// The alias data is the baked `SINGLETON_LOCALE_ALIASES_V1` static, so no provider
    /// is involved and construction cannot fail. Only available when the `compiled_data`
    /// Cargo feature is enabled.
    #[cfg(feature = "compiled_data")]
    pub const fn new_with_expander(expander: Expander) -> Self {
        Self {
            aliases: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_LOCALE_ALIASES_V1,
            ),
            expander,
        }
    }
279
280 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_with_expander)]
281 pub fn try_new_with_expander_unstable<P>(
282 provider: &P,
283 expander: Expander,
284 ) -> Result<Self, DataError>
285 where
286 P: DataProvider<LocaleAliasesV1> + ?Sized,
287 {
288 let aliases: DataPayload<LocaleAliasesV1> = provider.load(Default::default())?.payload;
289
290 Ok(Self { aliases, expander })
291 }
292
293 icu_provider::gen_buffer_data_constructors!((options: Expander) -> error: DataError,
294 functions: [
295 new_with_expander: skip,
296 try_new_with_expander_with_buffer_provider,
297 try_new_with_expander_unstable,
298 Self,
299 ]
300 );
301
302 pub fn canonicalize(&self, locale: &mut Locale) -> TransformResult {
324 let mut result = TransformResult::Unmodified;
325
326 loop {
329 let modified = if locale.id.variants.is_empty() {
335 self.canonicalize_absolute_language_fallbacks(&mut locale.id)
336 } else {
337 self.canonicalize_language_variant_fallbacks(&mut locale.id)
338 };
339 if modified {
340 result = TransformResult::Modified;
341 continue;
342 }
343
344 if !locale.id.language.is_unknown() {
345 if let Some(region) = locale.id.region {
347 if locale.id.language == language!("sgn") {
348 if let Some(&sgn_lang) = self
349 .aliases
350 .get()
351 .sgn_region
352 .get(®ion.to_tinystr().to_unvalidated())
353 {
354 uts35_replacement::<core::iter::Empty<&str>>(
355 &mut locale.id,
356 true,
357 false,
358 true,
359 None,
360 &sgn_lang.into(),
361 );
362 result = TransformResult::Modified;
363 continue;
364 }
365 }
366 }
367
368 if uts35_check_language_rules(&mut locale.id, &self.aliases)
369 == TransformResult::Modified
370 {
371 result = TransformResult::Modified;
372 continue;
373 }
374 }
375
376 if let Some(script) = locale.id.script {
377 if let Some(&replacement) = self
378 .aliases
379 .get()
380 .script
381 .get(&script.to_tinystr().to_unvalidated())
382 {
383 locale.id.script = Some(replacement);
384 result = TransformResult::Modified;
385 continue;
386 }
387 }
388
389 if let Some(region) = locale.id.region {
390 let replacement = if region.is_alphabetic() {
391 self.aliases
392 .get()
393 .region_alpha
394 .get(®ion.to_tinystr().resize().to_unvalidated())
395 } else {
396 self.aliases
397 .get()
398 .region_num
399 .get(®ion.to_tinystr().to_unvalidated())
400 };
401 if let Some(&replacement) = replacement {
402 locale.id.region = Some(replacement);
403 result = TransformResult::Modified;
404 continue;
405 }
406
407 if let Some(regions) = self
408 .aliases
409 .get()
410 .complex_region
411 .get(®ion.to_tinystr().to_unvalidated())
412 {
413 if let Some(default_region) = regions.get(0) {
415 let mut maximized = LanguageIdentifier {
416 language: locale.id.language,
417 script: locale.id.script,
418 region: None,
419 variants: Variants::default(),
420 };
421
422 locale.id.region = Some(
423 match (
424 self.expander.as_ref().maximize(&mut maximized),
425 maximized.region,
426 ) {
427 (TransformResult::Modified, Some(candidate))
428 if regions.iter().any(|x| x == candidate) =>
429 {
430 candidate
431 }
432 _ => default_region,
433 },
434 );
435 result = TransformResult::Modified;
436 continue;
437 }
438 }
439 }
440
441 if !locale.id.variants.is_empty() {
442 let mut modified = Vec::with_capacity(0);
443 for (idx, &variant) in locale.id.variants.iter().enumerate() {
444 if let Some(&updated) = self
445 .aliases
446 .get()
447 .variant
448 .get(&variant.to_tinystr().to_unvalidated())
449 {
450 if modified.is_empty() {
451 modified = locale.id.variants.to_vec();
452 }
453 #[allow(clippy::indexing_slicing)]
454 let _ = core::mem::replace(&mut modified[idx], updated);
455 }
456 }
457
458 if !modified.is_empty() {
459 modified.sort();
460 modified.dedup();
461 locale.id.variants = Variants::from_vec_unchecked(modified);
462 result = TransformResult::Modified;
463 continue;
464 }
465 }
466
467 break;
469 }
470
471 if !locale.extensions.transform.is_empty() || !locale.extensions.unicode.is_empty() {
472 self.canonicalize_extensions(&mut locale.extensions, &mut result);
473 }
474 result
475 }
476
477 fn canonicalize_extensions(&self, extensions: &mut Extensions, result: &mut TransformResult) {
478 if let Some(ref mut lang) = extensions.transform.lang {
481 while uts35_check_language_rules(lang, &self.aliases) == TransformResult::Modified {
482 *result = TransformResult::Modified;
483 }
484 }
485
486 if !extensions.unicode.keywords.is_empty() {
487 for key in [key!("rg"), key!("sd")] {
488 if let Some(value) = extensions.unicode.keywords.get_mut(&key) {
489 if let Some(only_value) = value.as_single_subtag() {
490 if let Some(modified_value) = self
491 .aliases
492 .get()
493 .subdivision
494 .get(&only_value.to_tinystr().resize().to_unvalidated())
495 {
496 if let Ok(modified_value) = modified_value.parse() {
497 *value = modified_value;
498 *result = TransformResult::Modified;
499 }
500 }
501 }
502 }
503 }
504 }
505 }
506
507 fn canonicalize_language_variant_fallbacks(&self, lid: &mut LanguageIdentifier) -> bool {
508 for LanguageStrStrPair(lang, raw_variants, raw_to) in self
510 .aliases
511 .get()
512 .language_variants
513 .iter()
514 .map(zerofrom::ZeroFrom::zero_from)
515 {
516 let raw_variants = raw_variants.split('-');
517 if uts35_rule_matches(lid, lang, None, None, raw_variants.clone()) {
519 if let Ok(to) = raw_to.parse() {
520 uts35_replacement(
521 lid,
522 !lang.is_unknown(),
523 false,
524 false,
525 Some(raw_variants),
526 &to,
527 );
528 return true;
529 }
530 }
531 }
532 false
533 }
534
535 fn canonicalize_absolute_language_fallbacks(&self, lid: &mut LanguageIdentifier) -> bool {
536 for StrStrPair(raw_from, raw_to) in self
537 .aliases
538 .get()
539 .language
540 .iter()
541 .map(zerofrom::ZeroFrom::zero_from)
542 {
543 if let Ok(from) = raw_from.parse::<LanguageIdentifier>() {
544 if uts35_rule_matches(
545 lid,
546 from.language,
547 from.script,
548 from.region,
549 from.variants.iter().map(Variant::as_str),
550 ) {
551 if let Ok(to) = raw_to.parse() {
552 uts35_replacement(
553 lid,
554 !from.language.is_unknown(),
555 from.script.is_some(),
556 from.region.is_some(),
557 Some(from.variants.iter().map(Variant::as_str)),
558 &to,
559 );
560 return true;
561 }
562 }
563 }
564 }
565 false
566 }
567}
568
#[cfg(test)]
mod test {
    use super::*;

    /// Checks rule matching for a table of `(source, rule, expected outcome)` cases.
    #[test]
    fn test_uts35_rule_matches() {
        let cases = [
            ("ja", "und", true),
            ("und-heploc-hepburn", "und-hepburn", true),
            ("ja-heploc-hepburn", "und-hepburn", true),
            ("ja-hepburn", "und-hepburn-heploc", false),
        ];

        for (source_str, rule_str, expected) in cases {
            let source: LanguageIdentifier = source_str.parse().unwrap();
            let rule: LanguageIdentifier = rule_str.parse().unwrap();

            let matched = uts35_rule_matches(
                &source,
                rule.language,
                rule.script,
                rule.region,
                rule.variants.iter().map(Variant::as_str),
            );

            assert_eq!(matched, expected, "{}", source);
        }
    }

    /// Checks rule application for a table of
    /// `(input locale, rule source, rule replacement, expected locale)` cases.
    #[test]
    fn test_uts35_replacement() {
        let cases = [
            (
                "ja-Latn-fonipa-hepburn-heploc",
                "und-hepburn-heploc",
                "und-alalc97",
                "ja-Latn-alalc97-fonipa",
            ),
            ("sgn-DD", "und-DD", "und-DE", "sgn-DE"),
            ("sgn-DE", "sgn-DE", "gsg", "gsg"),
        ];

        for (input, rule_from, rule_to, expected) in cases {
            let mut actual: Locale = input.parse().unwrap();
            let from: LanguageIdentifier = rule_from.parse().unwrap();
            let to: LanguageIdentifier = rule_to.parse().unwrap();
            let expected: Locale = expected.parse().unwrap();

            uts35_replacement(
                &mut actual.id,
                !from.language.is_unknown(),
                from.script.is_some(),
                from.region.is_some(),
                Some(from.variants.iter().map(Variant::as_str)),
                &to,
            );

            assert_eq!(expected, actual);
        }
    }
}