icu_collator/comparison.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5// Various collation-related algorithms and constants in this file are
6// adapted from ICU4C and, therefore, are subject to the ICU license as
7// described in LICENSE.
8
9//! This module holds the `Collator` struct whose `compare_impl()` contains
10//! the comparison of collation element sequences.
11
12use crate::elements::CharacterAndClassAndTrieValue;
13use crate::elements::CollationElement32;
14use crate::elements::Tag;
15use crate::elements::BACKWARD_COMBINING_MARKER;
16use crate::elements::CE_BUFFER_SIZE;
17use crate::elements::FALLBACK_CE32;
18use crate::elements::NON_ROUND_TRIP_MARKER;
19use crate::elements::{
20 char_from_u32, CollationElement, CollationElements, NonPrimary, FFFD_CE32,
21 HANGUL_SYLLABLE_MARKER, HIGH_ZEROS_MASK, JAMO_COUNT, LOW_ZEROS_MASK, NO_CE, NO_CE_PRIMARY,
22 NO_CE_SECONDARY, NO_CE_TERTIARY, OPTIMIZED_DIACRITICS_MAX_COUNT, QUATERNARY_MASK,
23};
24use crate::options::CollatorOptionsBitField;
25use crate::options::{
26 AlternateHandling, CollatorOptions, MaxVariable, ResolvedCollatorOptions, Strength,
27};
28use crate::preferences::{CollationCaseFirst, CollationNumericOrdering, CollationType};
29use crate::provider::CollationData;
30use crate::provider::CollationDiacritics;
31use crate::provider::CollationDiacriticsV1;
32use crate::provider::CollationJamo;
33use crate::provider::CollationJamoV1;
34use crate::provider::CollationMetadataV1;
35use crate::provider::CollationReordering;
36use crate::provider::CollationReorderingV1;
37use crate::provider::CollationRootV1;
38use crate::provider::CollationSpecialPrimaries;
39use crate::provider::CollationSpecialPrimariesV1;
40use crate::provider::CollationTailoringV1;
41use core::cmp::Ordering;
42use core::convert::TryFrom;
43use icu_normalizer::provider::DecompositionData;
44use icu_normalizer::provider::DecompositionTables;
45use icu_normalizer::provider::NormalizerNfdDataV1;
46use icu_normalizer::provider::NormalizerNfdTablesV1;
47use icu_normalizer::Decomposition;
48use icu_provider::prelude::*;
49use smallvec::SmallVec;
50use utf16_iter::Utf16CharsEx;
51use utf8_iter::Utf8CharsEx;
52use zerovec::ule::AsULE;
53
54const MERGE_SEPARATOR_PRIMARY: u32 = 0x02000000; // for U+FFFE
55
56struct AnyQuaternaryAccumulator(u32);
57
58impl AnyQuaternaryAccumulator {
59 #[inline(always)]
60 pub fn new() -> Self {
61 AnyQuaternaryAccumulator(0)
62 }
63 #[inline(always)]
64 pub fn accumulate(&mut self, non_primary: NonPrimary) {
65 self.0 |= non_primary.bits()
66 }
67 #[inline(always)]
68 pub fn has_quaternary(&self) -> bool {
69 self.0 & u32::from(QUATERNARY_MASK) != 0
70 }
71}
72
73/// `true` iff `i` is greater or equal to `start` and less or equal
74/// to `end`.
75#[inline(always)]
76fn in_inclusive_range16(i: u16, start: u16, end: u16) -> bool {
77 i.wrapping_sub(start) <= (end - start)
78}
79
80/// Finds the identical prefix of `left` and `right` containing
81/// potentially ill-formed UTF-16, while avoiding splitting a
82/// well-formed surrogate pair. In case of ill-formed
83/// UTF-16, the prefix is not guaranteed to be maximal.
84///
85/// Returns the identical prefix, the part of `left` after the
86/// prefix, and the part of `right` after the prefix.
87fn split_prefix_u16<'a, 'b>(
88 left: &'a [u16],
89 right: &'b [u16],
90) -> (&'a [u16], &'a [u16], &'b [u16]) {
91 let mut i = left
92 .iter()
93 .zip(right.iter())
94 .take_while(|(l, r)| l == r)
95 .count();
96 if i != 0 {
97 if let Some(&last) = left.get(i.wrapping_sub(1)) {
98 if in_inclusive_range16(last, 0xD800, 0xDBFF) {
99 i -= 1;
100 }
101 if let Some((head, left_tail)) = left.split_at_checked(i) {
102 if let Some(right_tail) = right.get(i..) {
103 return (head, left_tail, right_tail);
104 }
105 }
106 }
107 }
108 (&[], left, right)
109}
110
111/// Finds the identical prefix of `left` and `right` containing
112/// potentially ill-formed UTF-8, while avoiding splitting a UTF-8
113/// byte sequence. In case of ill-formed UTF-8, the prefix is
114/// not guaranteed to be maximal.
115///
116/// Returns the identical prefix, the part of `left` after the
117/// prefix, and the part of `right` after the prefix.
118fn split_prefix_u8<'a, 'b>(left: &'a [u8], right: &'b [u8]) -> (&'a [u8], &'a [u8], &'b [u8]) {
119 let mut i = left
120 .iter()
121 .zip(right.iter())
122 .take_while(|(l, r)| l == r)
123 .count();
124 if i != 0 {
125 // Tails must not start with a UTF-8 continuation
126 // byte unless it's the first byte of the original
127 // slice.
128
129 // First, left and right differ, but since they
130 // are the same afterwards, one of them needs checking
131 // only once.
132 if let Some(right_first) = right.get(i) {
133 if (right_first & 0b1100_0000) == 0b1000_0000 {
134 i -= 1;
135 }
136 }
137 while i != 0 {
138 if let Some(left_first) = left.get(i) {
139 if (left_first & 0b1100_0000) == 0b1000_0000 {
140 i -= 1;
141 continue;
142 }
143 }
144 break;
145 }
146 if let Some((head, left_tail)) = left.split_at_checked(i) {
147 if let Some(right_tail) = right.get(i..) {
148 return (head, left_tail, right_tail);
149 }
150 }
151 }
152 (&[], left, right)
153}
154
155/// Finds the identical prefix of `left` and `right` containing
156/// guaranteed well-format UTF-8.
157///
158/// Returns the identical prefix, the part of `left` after the
159/// prefix, and the part of `right` after the prefix.
160fn split_prefix<'a, 'b>(left: &'a str, right: &'b str) -> (&'a str, &'a str, &'b str) {
161 let left_bytes = left.as_bytes();
162 let right_bytes = right.as_bytes();
163 let mut i = left_bytes
164 .iter()
165 .zip(right_bytes.iter())
166 .take_while(|(l, r)| l == r)
167 .count();
168 if i != 0 {
169 // Tails must not start with a UTF-8 continuation
170 // byte.
171
172 // Since the inputs are valid UTF-8, the first byte
173 // of either input slice cannot be a contination slice,
174 // so we may rely on finding a lead byte when walking
175 // backwards.
176
177 // Since the inputs are valid UTF-8, if a tail starts
178 // with a continuation, both tails must start with a
179 // continuation, since the most recent lead byte must
180 // be equal, so the difference is within valid UTF-8
181 // sequences of equal length.
182
183 // Therefore, it's sufficient to examine only one of
184 // the sides.
185 loop {
186 if let Some(left_first) = left_bytes.get(i) {
187 if (left_first & 0b1100_0000) == 0b1000_0000 {
188 i -= 1;
189 continue;
190 }
191 }
192 break;
193 }
194 // The methods below perform useless UTF-8 boundary checks,
195 // since we just checked. However, avoiding `unsafe` to
196 // make this code easier to audit.
197 if let Some((head, left_tail)) = left.split_at_checked(i) {
198 if let Some(right_tail) = right.get(i..) {
199 return (head, left_tail, right_tail);
200 }
201 }
202 }
203 ("", left, right)
204}
205
206/// Holder struct for payloads that are locale-dependent. (For code
207/// reuse between owned and borrowed cases.)
208#[derive(Debug)]
209struct LocaleSpecificDataHolder {
210 tailoring: Option<DataPayload<CollationTailoringV1>>,
211 diacritics: DataPayload<CollationDiacriticsV1>,
212 reordering: Option<DataPayload<CollationReorderingV1>>,
213 merged_options: CollatorOptionsBitField,
214 lithuanian_dot_above: bool,
215}
216
217icu_locale_core::preferences::define_preferences!(
218 /// The preferences for collation.
219 ///
220 /// # Preferences
221 ///
222 /// Examples for using the different preferences below can be found in the [crate-level docs](crate).
223 ///
224 /// ## Case First
225 ///
226 /// See the [spec](https://www.unicode.org/reports/tr35/tr35-collation.html#Case_Parameters).
227 /// This is the BCP47 key `kf`. Three possibilities: [`CollationCaseFirst::False`] (default,
228 /// except for Danish and Maltese), [`CollationCaseFirst::Lower`], and [`CollationCaseFirst::Upper`]
229 /// (default for Danish and Maltese).
230 ///
231 /// ## Numeric
232 ///
233 /// This is the BCP47 key `kn`. When set to [`CollationNumericOrdering::True`], any sequence of decimal
234 /// digits (General_Category = Nd) is sorted at the primary level according to the
235 /// numeric value. The default is [`CollationNumericOrdering::False`].
236 [Copy]
237 CollatorPreferences,
238 {
239 /// The collation type. This corresponds to the `-u-co` BCP-47 tag.
240 collation_type: CollationType,
241 /// Treatment of case. (Large and small kana differences are treated as case differences.)
242 /// This corresponds to the `-u-kf` BCP-47 tag.
243 case_first: CollationCaseFirst,
244 /// When set to `True`, any sequence of decimal digits is sorted at a primary level according
245 /// to the numeric value.
246 /// This corresponds to the `-u-kn` BPC-47 tag.
247 numeric_ordering: CollationNumericOrdering
248 }
249);
250
251impl LocaleSpecificDataHolder {
252 /// The constructor code reused between owned and borrowed cases.
253 fn try_new_unstable_internal<D>(
254 provider: &D,
255 prefs: CollatorPreferences,
256 options: CollatorOptions,
257 ) -> Result<Self, DataError>
258 where
259 D: DataProvider<CollationTailoringV1>
260 + DataProvider<CollationDiacriticsV1>
261 + DataProvider<CollationMetadataV1>
262 + DataProvider<CollationReorderingV1>
263 + ?Sized,
264 {
265 let marker_attributes = prefs
266 .collation_type
267 .as_ref()
268 // all collation types are valid marker attributes
269 .map(|c| DataMarkerAttributes::from_str_or_panic(c.as_str()))
270 .unwrap_or_default();
271
272 let data_locale = CollationTailoringV1::make_locale(prefs.locale_preferences);
273 let req = DataRequest {
274 id: DataIdentifierBorrowed::for_marker_attributes_and_locale(
275 marker_attributes,
276 &data_locale,
277 ),
278 metadata: {
279 let mut metadata = DataRequestMetadata::default();
280 metadata.silent = true;
281 metadata
282 },
283 };
284
285 let fallback_req = DataRequest {
286 id: DataIdentifierBorrowed::for_marker_attributes_and_locale(
287 Default::default(),
288 &data_locale,
289 ),
290 ..Default::default()
291 };
292
293 let metadata_payload: DataPayload<crate::provider::CollationMetadataV1> = provider
294 .load(req)
295 .or_else(|_| provider.load(fallback_req))?
296 .payload;
297
298 let metadata = metadata_payload.get();
299
300 let tailoring: Option<DataPayload<crate::provider::CollationTailoringV1>> =
301 if metadata.tailored() {
302 Some(
303 provider
304 .load(req)
305 .or_else(|_| provider.load(fallback_req))?
306 .payload,
307 )
308 } else {
309 None
310 };
311
312 let reordering: Option<DataPayload<crate::provider::CollationReorderingV1>> =
313 if metadata.reordering() {
314 Some(
315 provider
316 .load(req)
317 .or_else(|_| provider.load(fallback_req))?
318 .payload,
319 )
320 } else {
321 None
322 };
323
324 if let Some(reordering) = &reordering {
325 if reordering.get().reorder_table.len() != 256 {
326 return Err(DataError::custom("invalid").with_marker(CollationReorderingV1::INFO));
327 }
328 }
329
330 let tailored_diacritics = metadata.tailored_diacritics();
331 let diacritics: DataPayload<CollationDiacriticsV1> = provider
332 .load(if tailored_diacritics {
333 req
334 } else {
335 Default::default()
336 })?
337 .payload;
338
339 if tailored_diacritics {
340 // In the tailored case we accept a shorter table in which case the tailoring is
341 // responsible for supplying the missing values in the trie.
342 // As of June 2022, none of the collations actually use a shortened table.
343 // Vietnamese and Ewe load a full-length alternative table and the rest use
344 // the default one.
345 if diacritics.get().secondaries.len() > OPTIMIZED_DIACRITICS_MAX_COUNT {
346 return Err(DataError::custom("invalid").with_marker(CollationDiacriticsV1::INFO));
347 }
348 } else if diacritics.get().secondaries.len() != OPTIMIZED_DIACRITICS_MAX_COUNT {
349 return Err(DataError::custom("invalid").with_marker(CollationDiacriticsV1::INFO));
350 }
351
352 let mut altered_defaults = CollatorOptionsBitField::default();
353
354 if metadata.alternate_shifted() {
355 altered_defaults.set_alternate_handling(Some(AlternateHandling::Shifted));
356 }
357 if metadata.backward_second_level() {
358 altered_defaults.set_backward_second_level(Some(true));
359 }
360
361 altered_defaults.set_case_first(Some(metadata.case_first()));
362 altered_defaults.set_max_variable(Some(metadata.max_variable()));
363
364 let mut merged_options = CollatorOptionsBitField::from(options);
365 merged_options.set_case_first(prefs.case_first);
366 merged_options.set_numeric_from_enum(prefs.numeric_ordering);
367 merged_options.set_defaults(altered_defaults);
368
369 Ok(LocaleSpecificDataHolder {
370 tailoring,
371 diacritics,
372 merged_options,
373 reordering,
374 lithuanian_dot_above: metadata.lithuanian_dot_above(),
375 })
376 }
377}
378
379/// Compares strings according to culturally-relevant ordering.
380#[derive(Debug)]
381pub struct Collator {
382 special_primaries: Option<DataPayload<CollationSpecialPrimariesV1>>,
383 root: DataPayload<CollationRootV1>,
384 tailoring: Option<DataPayload<CollationTailoringV1>>,
385 jamo: DataPayload<CollationJamoV1>,
386 diacritics: DataPayload<CollationDiacriticsV1>,
387 options: CollatorOptionsBitField,
388 reordering: Option<DataPayload<CollationReorderingV1>>,
389 decompositions: DataPayload<NormalizerNfdDataV1>,
390 tables: DataPayload<NormalizerNfdTablesV1>,
391 lithuanian_dot_above: bool,
392}
393
394impl Collator {
395 /// Constructs a borrowed version of this type for more efficient querying.
396 pub fn as_borrowed(&self) -> CollatorBorrowed {
397 CollatorBorrowed {
398 special_primaries: self.special_primaries.as_ref().map(|s| s.get()),
399 root: self.root.get(),
400 tailoring: self.tailoring.as_ref().map(|s| s.get()),
401 jamo: self.jamo.get(),
402 diacritics: self.diacritics.get(),
403 options: self.options,
404 reordering: self.reordering.as_ref().map(|s| s.get()),
405 decompositions: self.decompositions.get(),
406 tables: self.tables.get(),
407 lithuanian_dot_above: self.lithuanian_dot_above,
408 }
409 }
410
411 /// Creates `CollatorBorrowed` for the given locale and options from compiled data.
412 #[cfg(feature = "compiled_data")]
413 pub fn try_new(
414 prefs: CollatorPreferences,
415 options: CollatorOptions,
416 ) -> Result<CollatorBorrowed<'static>, DataError> {
417 CollatorBorrowed::try_new(prefs, options)
418 }
419
420 icu_provider::gen_buffer_data_constructors!(
421 (prefs: CollatorPreferences, options: CollatorOptions) -> error: DataError,
422 functions: [
423 try_new: skip,
424 try_new_with_buffer_provider,
425 try_new_unstable,
426 Self
427 ]
428 );
429
430 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::try_new)]
431 pub fn try_new_unstable<D>(
432 provider: &D,
433 prefs: CollatorPreferences,
434 options: CollatorOptions,
435 ) -> Result<Self, DataError>
436 where
437 D: DataProvider<CollationSpecialPrimariesV1>
438 + DataProvider<CollationRootV1>
439 + DataProvider<CollationTailoringV1>
440 + DataProvider<CollationDiacriticsV1>
441 + DataProvider<CollationJamoV1>
442 + DataProvider<CollationMetadataV1>
443 + DataProvider<CollationReorderingV1>
444 + DataProvider<NormalizerNfdDataV1>
445 + DataProvider<NormalizerNfdTablesV1>
446 + ?Sized,
447 {
448 Self::try_new_unstable_internal(
449 provider,
450 provider.load(Default::default())?.payload,
451 provider.load(Default::default())?.payload,
452 provider.load(Default::default())?.payload,
453 provider.load(Default::default())?.payload,
454 || provider.load(Default::default()).map(|r| r.payload),
455 prefs,
456 options,
457 )
458 }
459
460 #[allow(clippy::too_many_arguments)]
461 fn try_new_unstable_internal<D>(
462 provider: &D,
463 root: DataPayload<CollationRootV1>,
464 decompositions: DataPayload<NormalizerNfdDataV1>,
465 tables: DataPayload<NormalizerNfdTablesV1>,
466 jamo: DataPayload<CollationJamoV1>,
467 special_primaries: impl FnOnce() -> Result<DataPayload<CollationSpecialPrimariesV1>, DataError>,
468 prefs: CollatorPreferences,
469 options: CollatorOptions,
470 ) -> Result<Self, DataError>
471 where
472 D: DataProvider<CollationRootV1>
473 + DataProvider<CollationTailoringV1>
474 + DataProvider<CollationDiacriticsV1>
475 + DataProvider<CollationMetadataV1>
476 + DataProvider<CollationReorderingV1>
477 + ?Sized,
478 {
479 let locale_dependent =
480 LocaleSpecificDataHolder::try_new_unstable_internal(provider, prefs, options)?;
481
482 // TODO: redesign Korean search collation handling
483 if jamo.get().ce32s.len() != JAMO_COUNT {
484 return Err(DataError::custom("invalid").with_marker(CollationJamoV1::INFO));
485 }
486
487 let special_primaries = if locale_dependent.merged_options.alternate_handling()
488 == AlternateHandling::Shifted
489 || locale_dependent.merged_options.numeric()
490 {
491 let special_primaries = special_primaries()?;
492 // `variant_count` isn't stable yet:
493 // https://github.com/rust-lang/rust/issues/73662
494 if special_primaries.get().last_primaries.len() <= (MaxVariable::Currency as usize) {
495 return Err(
496 DataError::custom("invalid").with_marker(CollationSpecialPrimariesV1::INFO)
497 );
498 }
499 Some(special_primaries)
500 } else {
501 None
502 };
503
504 Ok(Collator {
505 special_primaries,
506 root,
507 tailoring: locale_dependent.tailoring,
508 jamo,
509 diacritics: locale_dependent.diacritics,
510 options: locale_dependent.merged_options,
511 reordering: locale_dependent.reordering,
512 decompositions,
513 tables,
514 lithuanian_dot_above: locale_dependent.lithuanian_dot_above,
515 })
516 }
517}
518
519macro_rules! compare {
520 ($(#[$meta:meta])*,
521 $compare:ident,
522 $slice:ty,
523 $split_prefix:ident,
524 ) => {
525 $(#[$meta])*
526 pub fn $compare(&self, left: &$slice, right: &$slice) -> Ordering {
527 let (head, left_tail, right_tail) = $split_prefix(left, right);
528 if left_tail.is_empty() && right_tail.is_empty() {
529 return Ordering::Equal;
530 }
531 let ret = self.compare_impl(left_tail.chars(), right_tail.chars(), head.chars());
532 if self.options.strength() == Strength::Identical && ret == Ordering::Equal {
533 return Decomposition::new(left_tail.chars(), self.decompositions, self.tables).cmp(
534 Decomposition::new(right_tail.chars(), self.decompositions, self.tables),
535 );
536 }
537 ret
538 }
539 }
540}
541
542/// Compares strings according to culturally-relevant ordering,
543/// borrowed version.
544#[derive(Debug)]
545pub struct CollatorBorrowed<'a> {
546 special_primaries: Option<&'a CollationSpecialPrimaries<'a>>,
547 root: &'a CollationData<'a>,
548 tailoring: Option<&'a CollationData<'a>>,
549 jamo: &'a CollationJamo<'a>,
550 diacritics: &'a CollationDiacritics<'a>,
551 options: CollatorOptionsBitField,
552 reordering: Option<&'a CollationReordering<'a>>,
553 decompositions: &'a DecompositionData<'a>,
554 tables: &'a DecompositionTables<'a>,
555 lithuanian_dot_above: bool,
556}
557
558impl CollatorBorrowed<'static> {
559 /// Creates a collator for the given locale and options from compiled data.
560 #[cfg(feature = "compiled_data")]
561 pub fn try_new(
562 prefs: CollatorPreferences,
563 options: CollatorOptions,
564 ) -> Result<Self, DataError> {
565 // These are assigned to locals in order to keep the code after these assignments
566 // copypaste-compatible with `Collator::try_new_unstable_internal`.
567
568 let provider = &crate::provider::Baked;
569 let decompositions = icu_normalizer::provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1;
570 let tables = icu_normalizer::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1;
571 let root = crate::provider::Baked::SINGLETON_COLLATION_ROOT_V1;
572 let jamo = crate::provider::Baked::SINGLETON_COLLATION_JAMO_V1;
573
574 let locale_dependent =
575 LocaleSpecificDataHolder::try_new_unstable_internal(provider, prefs, options)?;
576
577 // TODO: redesign Korean search collation handling
578 if jamo.ce32s.len() != JAMO_COUNT {
579 return Err(DataError::custom("invalid").with_marker(CollationJamoV1::INFO));
580 }
581
582 let special_primaries = if locale_dependent.merged_options.alternate_handling()
583 == AlternateHandling::Shifted
584 || locale_dependent.merged_options.numeric()
585 {
586 let special_primaries =
587 crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1;
588 // `variant_count` isn't stable yet:
589 // https://github.com/rust-lang/rust/issues/73662
590 if special_primaries.last_primaries.len() <= (MaxVariable::Currency as usize) {
591 return Err(
592 DataError::custom("invalid").with_marker(CollationSpecialPrimariesV1::INFO)
593 );
594 }
595 Some(special_primaries)
596 } else {
597 None
598 };
599
600 // Attribute belongs closer to `unwrap`, but
601 // https://github.com/rust-lang/rust/issues/15701
602 #[allow(clippy::unwrap_used)]
603 Ok(CollatorBorrowed {
604 special_primaries,
605 root,
606 // Unwrap is OK, because we know we have the baked provider.
607 tailoring: locale_dependent.tailoring.map(|s| s.get_static().unwrap()),
608 jamo,
609 // Unwrap is OK, because we know we have the baked provider.
610 diacritics: locale_dependent.diacritics.get_static().unwrap(),
611 options: locale_dependent.merged_options,
612 // Unwrap is OK, because we know we have the baked provider.
613 reordering: locale_dependent.reordering.map(|s| s.get_static().unwrap()),
614 decompositions,
615 tables,
616 lithuanian_dot_above: locale_dependent.lithuanian_dot_above,
617 })
618 }
619
620 /// Cheaply converts a [`CollatorBorrowed<'static>`] into a [`Collator`].
621 ///
622 /// Note: Due to branching and indirection, using [`Collator`] might inhibit some
623 /// compile-time optimizations that are possible with [`CollatorBorrowed`].
624 pub const fn static_to_owned(self) -> Collator {
625 Collator {
626 special_primaries: if let Some(s) = self.special_primaries {
627 // `map` not available in const context
628 Some(DataPayload::from_static_ref(s))
629 } else {
630 None
631 },
632 root: DataPayload::from_static_ref(self.root),
633 tailoring: if let Some(s) = self.tailoring {
634 // `map` not available in const context
635 Some(DataPayload::from_static_ref(s))
636 } else {
637 None
638 },
639 jamo: DataPayload::from_static_ref(self.jamo),
640 diacritics: DataPayload::from_static_ref(self.diacritics),
641 options: self.options,
642 reordering: if let Some(s) = self.reordering {
643 // `map` not available in const context
644 Some(DataPayload::from_static_ref(s))
645 } else {
646 None
647 },
648 decompositions: DataPayload::from_static_ref(self.decompositions),
649 tables: DataPayload::from_static_ref(self.tables),
650 lithuanian_dot_above: self.lithuanian_dot_above,
651 }
652 }
653}
654
655impl CollatorBorrowed<'_> {
656 /// The resolved options showing how the default options, the requested options,
657 /// and the options from locale data were combined.
658 pub fn resolved_options(&self) -> ResolvedCollatorOptions {
659 self.options.into()
660 }
661
662 compare!(
663 /// Compare guaranteed well-formed UTF-8 slices.
664 ,
665 compare,
666 str,
667 split_prefix,
668 );
669
670 compare!(
671 /// Compare potentially ill-formed UTF-8 slices. Ill-formed input is compared
672 /// as if errors had been replaced with REPLACEMENT CHARACTERs according
673 /// to the WHATWG Encoding Standard.
674 ,
675 compare_utf8,
676 [u8],
677 split_prefix_u8,
678 );
679
680 compare!(
681 /// Compare potentially ill-formed UTF-16 slices. Unpaired surrogates
682 /// are compared as if each one was a REPLACEMENT CHARACTER.
683 ,
684 compare_utf16,
685 [u16],
686 split_prefix_u16,
687 );
688
689 /// The implementation of the comparison operation.
690 ///
691 /// `head_chars` is an iterator over the identical prefix and
692 /// `left_chars` and `right_chars` are iterators over the parts
693 /// after the identical prefix.
694 ///
695 /// Currently, all three have the same concrete type, so the
696 /// trait bound is given as `DoubleEndedIterator`.
697 /// `head_chars` is iterated backwards and `left_chars` and
698 /// `right_chars` forward. If this were a public API, this
699 /// should have three generic types, one for each argument,
700 /// for maximum flexibility.
701 fn compare_impl<I: DoubleEndedIterator<Item = char>>(
702 &self,
703 left_chars: I,
704 right_chars: I,
705 mut head_chars: I,
706 ) -> Ordering {
707 let tailoring: &CollationData = if let Some(tailoring) = &self.tailoring {
708 tailoring
709 } else {
710 // If the root collation is valid for the locale,
711 // use the root as the tailoring so that reads from the
712 // tailoring always succeed.
713 //
714 // TODO(#2011): Do we instead want to have an untailored
715 // copypaste of the iterator that omits the tailoring
716 // branches for performance at the expense of code size
717 // and having to maintain both a tailoring-capable and
718 // a tailoring-incapable version of the iterator?
719 // Or, in order not to flip the branch prediction around,
720 // should we have a no-op tailoring that contains a
721 // specially-crafted CodePointTrie that always returns
722 // a FALLBACK_CE32 after a single branch?
723 self.root
724 };
725
726 // Sadly, it looks like variable CEs and backward second level
727 // require us to store the full 64-bit CEs instead of storing only
728 // the NonPrimary part.
729 //
730 // TODO(#2008): Consider having two monomorphizations of this method:
731 // one that can deal with variables shifted to quaternary and
732 // backward second level and another that doesn't support that
733 // and only stores `NonPrimary` in `left_ces` and `right_ces`
734 // with double the number of stack allocated elements.
735
736 // Note: These are used only after the identical prefix skipping,
737 // but initializing these up here improves performance at the time
738 // of writing. Presumably the source order affects the stack frame
739 // layout.
740 let mut left_ces: SmallVec<[CollationElement; CE_BUFFER_SIZE]> = SmallVec::new();
741 let mut right_ces: SmallVec<[CollationElement; CE_BUFFER_SIZE]> = SmallVec::new();
742
743 // The algorithm comes from CollationCompare::compareUpToQuaternary in ICU4C.
744
745 let mut any_variable = false;
746 // Attribute belongs closer to `unwrap`, but
747 // https://github.com/rust-lang/rust/issues/15701
748 #[allow(clippy::unwrap_used)]
749 let variable_top = if self.options.alternate_handling() == AlternateHandling::NonIgnorable {
750 0
751 } else {
752 // +1 so that we can use "<" and primary ignorables test out early.
753 self.special_primaries
754 .as_ref()
755 // `unwrap()` is OK, because we've ensured in the constructor that value
756 // is `Some` if we have alternate handling.
757 .unwrap()
758 .last_primary_for_group(self.options.max_variable())
759 + 1
760 };
761
762 // Attribute belongs on inner expression, but
763 // https://github.com/rust-lang/rust/issues/15701
764 #[allow(clippy::unwrap_used)]
765 let numeric_primary = if self.options.numeric() {
766 Some(
767 self.special_primaries
768 .as_ref()
769 // `unwrap` is OK, because we've ensured `Some` in the constructor
770 .unwrap()
771 .numeric_primary,
772 )
773 } else {
774 None
775 };
776
777 // Attribute belongs on inner expression, but
778 // https://github.com/rust-lang/rust/issues/15701
779 #[allow(clippy::unwrap_used)]
780 let mut left = CollationElements::new(
781 left_chars,
782 self.root,
783 tailoring,
784 <&[<u32 as AsULE>::ULE; JAMO_COUNT]>::try_from(self.jamo.ce32s.as_ule_slice()).unwrap(), // `unwrap` OK, because length already validated
785 &self.diacritics.secondaries,
786 self.decompositions,
787 self.tables,
788 numeric_primary,
789 self.lithuanian_dot_above,
790 );
791 // Attribute belongs on inner expression, but
792 // https://github.com/rust-lang/rust/issues/15701
793 #[allow(clippy::unwrap_used)]
794 let mut right = CollationElements::new(
795 right_chars,
796 self.root,
797 tailoring,
798 <&[<u32 as AsULE>::ULE; JAMO_COUNT]>::try_from(self.jamo.ce32s.as_ule_slice()).unwrap(), // `unwrap` OK, because length already validated
799 &self.diacritics.secondaries,
800 self.decompositions,
801 self.tables,
802 numeric_primary,
803 self.lithuanian_dot_above,
804 );
805
806 // Start identical prefix
807
808 // The logic here to check whether the boundary found by skipping
809 // the identical prefix is safe is complicated compared to the ICU4C
810 // approach of having a set of characters that are unsafe as the character
811 // immediately following the identical prefix. However, the approach here
812 // avoids extra data, and working on the main data avoids the bug
813 // possibility of data structures not being mutually consistent.
814
815 // This code intentionally does not keep around the `CollationElement32`s
816 // that have been read from the collation data tries, because keeping
817 // them around turned out to be a pessimization: There would be added
818 // branches on the hot path of the algorithm that maps characters to
819 // collation elements, and the element size of the upcoming buffer
820 // would grow.
821 //
822 // However, the values read from the normalization trie _are_ kept around,
823 // since there is already a place where to put them.
824
825 // This loop is only broken out of as goto forward.
826 #[allow(clippy::never_loop)]
827 'prefix: loop {
828 if let Some(mut head_last_c) = head_chars.next_back() {
829 let norm_trie = &self.decompositions.trie;
830 let mut head_last = CharacterAndClassAndTrieValue::new_with_trie_val(
831 head_last_c,
832 norm_trie.get(head_last_c),
833 );
834 let mut head_last_ce32 = CollationElement32::default();
835 let mut head_last_ok = false;
836 if let Some(left_different) = left.iter_next_before_init() {
837 left.prepend_upcoming_before_init(left_different.clone());
838 if let Some(right_different) = right.iter_next_before_init() {
839 // Note: left_different and right_different may both be U+FFFD.
840 right.prepend_upcoming_before_init(right_different.clone());
841
842 // The base logic is that a boundary between two starters
843 // that decompose to selves is safe iff the starter
844 // before the boundary can't contract a starter, the
845 // starter after the boundary doesn't have a prefix
846 // condition, and, with the numeric mode enabled,
847 // they aren't both numeric.
848 //
849 // This base logic is then extended with Hangul
850 // syllables and characters that decompose to a
851 // BMP starter followed by a BMP non-starter.
852 // The logic could be extended further, in
853 // particular to cover singleton decompositions
854 // to a BMP starter, but such characters can be
855 // expected to be rare enough in real-world input
856 // that it's not worthwhile to make this code more
857 // branchy.
858 //
859 // A Hangul syllable is safe on either side of the
860 // boundary, because Hangul syllables can't participate
861 // in contraction or have prefix conditions. They are
862 // also known not to be numeric.
863 //
864 // Hangul jamo is safe to look up from the main trie
865 // instead of the jamo table, because they aren't
866 // allowed to participate in contractions or prefix
867 // conditions, either, and are known not to be numeric.
868 //
869 // After a boundary, a decomposition to a BMP starter
870 // and a BMP non-starter can obviously be analyzed by
871 // considering the starter as if it was a starter
872 // that decomposes to self.
873 //
874 // Before a boundary the contraction condition considers
875 // whether the contraction can contract a starter.
876 // For the case of contracting a non-starter, it's
877 // fine for the BMP starter of the decomposition to
878 // contract the non-starter from the same decomposition:
879 // Since that would happen as part of the prefix that
880 // is identical, it wouldn't affect anything after.
881 //
882 // The case of contracting a non-starter other than
883 // the one that came from the decomposition itself
884 // is irrelevant, because we don't allow a non-starter
885 // right after the boundary regardless of the contraction
886 // status of what's before the boundary.
887 //
888 // Finally, decompositions to starter and non-starter
889 // are known not to be numeric.
890
891 // The checks below are repetitive, but an attempt to factor
892 // repetitive code into an inlined function regressed,
893 // performance, so it seems that having the control flow
894 // right here without an intermediate enum from a
895 // function return to branch on is important.
896
897 // This loop is only broken out of as goto forward. The control flow
898 // is much more readable this way.
899 #[allow(clippy::never_loop)]
900 loop {
901 // The two highest bits are about NFC, which we don't
902 // care about here.
903 let decomposition = head_last.trie_val;
904 if (decomposition
905 & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER))
906 == 0
907 {
908 // Intentionally empty block to keep
909 // the same structure as in the cases
910 // where something happens here.
911 } else if ((decomposition & HIGH_ZEROS_MASK) != 0)
912 && ((decomposition & LOW_ZEROS_MASK) != 0)
913 {
914 // Decomposition into two BMP characters: starter and non-starter
915 // Let's take the starter
916 head_last_c = char_from_u32(decomposition & 0x7FFF);
917 } else if decomposition == HANGUL_SYLLABLE_MARKER {
918 head_last_ce32 = FFFD_CE32;
919 } else {
920 break;
921 }
922 head_last_ok = true;
923
924 let mut left_ce32 = CollationElement32::default();
925 let mut right_ce32 = CollationElement32::default();
926 let left_c;
927 let right_c;
928
929 let decomposition = left_different.trie_val;
930 if (decomposition
931 & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER))
932 == 0
933 {
934 left_c = left_different.character();
935 } else if ((decomposition & HIGH_ZEROS_MASK) != 0)
936 && ((decomposition & LOW_ZEROS_MASK) != 0)
937 {
938 // Decomposition into two BMP characters: starter and non-starter
939 // Let's take the starter
940 left_c = char_from_u32(decomposition & 0x7FFF);
941 } else if decomposition == HANGUL_SYLLABLE_MARKER {
942 left_c = '\u{0}';
943 left_ce32 = FFFD_CE32;
944 } else {
945 break;
946 }
947
948 let decomposition = right_different.trie_val;
949 if (decomposition
950 & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER))
951 == 0
952 {
953 right_c = right_different.character();
954 } else if ((decomposition & HIGH_ZEROS_MASK) != 0)
955 && ((decomposition & LOW_ZEROS_MASK) != 0)
956 {
957 // Decomposition into two BMP characters: starter and non-starter
958 // Let's take the starter
959 right_c = char_from_u32(decomposition & 0x7FFF);
960 } else if decomposition == HANGUL_SYLLABLE_MARKER {
961 right_c = '\u{0}';
962 right_ce32 = FFFD_CE32;
963 } else {
964 break;
965 }
966
967 // The last character of the prefix is OK on the normalization
968 // level. Now let's check its ce32 unless it's a Hangul syllable,
969 // in which case `head_last_ce32` already is a non-default placeholder.
970 if head_last_ce32 == CollationElement32::default() {
971 head_last_ce32 = tailoring.ce32_for_char(head_last_c);
972 if head_last_ce32 == FALLBACK_CE32 {
973 head_last_ce32 = self.root.ce32_for_char(head_last_c);
974 }
975 if head_last_ce32.tag_checked() == Some(Tag::Contraction)
976 && head_last_ce32.at_least_one_suffix_contains_starter()
977 {
978 break;
979 }
980 }
981 // The first character of each suffix is OK on the normalization
982 // level. Now let's check their ce32s unless they are Hangul syllables.
983 if left_ce32 == CollationElement32::default() {
984 left_ce32 = tailoring.ce32_for_char(left_c);
985 if left_ce32 == FALLBACK_CE32 {
986 left_ce32 = self.root.ce32_for_char(left_c);
987 }
988 if left_ce32.tag_checked() == Some(Tag::Prefix) {
989 break;
990 }
991 }
992 if right_ce32 == CollationElement32::default() {
993 right_ce32 = tailoring.ce32_for_char(right_c);
994 if right_ce32 == FALLBACK_CE32 {
995 right_ce32 = self.root.ce32_for_char(right_c);
996 }
997 if right_ce32.tag_checked() == Some(Tag::Prefix) {
998 break;
999 }
1000 }
1001 if self.options.numeric()
1002 && head_last_ce32.tag_checked() == Some(Tag::Digit)
1003 && (left_ce32.tag_checked() == Some(Tag::Digit)
1004 || right_ce32.tag_checked() == Some(Tag::Digit))
1005 {
1006 break;
1007 }
1008 // We are at a good boundary!
1009 break 'prefix;
1010 }
1011 }
1012 }
1013 let mut tail_first_c;
1014 let mut tail_first_ce32;
1015 let mut tail_first_ok;
1016 loop {
1017 // Take a step back.
1018 left.prepend_upcoming_before_init(head_last.clone());
1019 right.prepend_upcoming_before_init(head_last.clone());
1020
1021 tail_first_c = head_last_c;
1022 tail_first_ce32 = head_last_ce32;
1023 tail_first_ok = head_last_ok;
1024
1025 head_last_c = if let Some(head_last_c) = head_chars.next_back() {
1026 head_last_c
1027 } else {
1028 // We need to step back beyond the start of the prefix.
1029 // Treat as good boundary.
1030 break 'prefix;
1031 };
1032 let decomposition = norm_trie.get(head_last_c);
1033 head_last = CharacterAndClassAndTrieValue::new_with_trie_val(
1034 head_last_c,
1035 decomposition,
1036 );
1037 head_last_ce32 = CollationElement32::default();
1038 head_last_ok = false;
1039
1040 if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 {
1041 // Intentionally empty block to keep
1042 // the same structure as in the cases
1043 // where something happens here.
1044 } else if ((decomposition & HIGH_ZEROS_MASK) != 0)
1045 && ((decomposition & LOW_ZEROS_MASK) != 0)
1046 {
1047 // Decomposition into two BMP characters: starter and non-starter
1048 // Let's take the starter
1049 head_last_c = char_from_u32(decomposition & 0x7FFF);
1050 } else if decomposition == HANGUL_SYLLABLE_MARKER {
1051 head_last_ce32 = FFFD_CE32;
1052 } else {
1053 continue;
1054 }
1055 head_last_ok = true;
1056 if !tail_first_ok {
1057 continue;
1058 }
1059 // The last character of the prefix is OK on the normalization
1060 // level. Now let's check its ce32 unless it's a Hangul syllable.
1061 if head_last_ce32 == CollationElement32::default() {
1062 head_last_ce32 = tailoring.ce32_for_char(head_last_c);
1063 if head_last_ce32 == FALLBACK_CE32 {
1064 head_last_ce32 = self.root.ce32_for_char(head_last_c);
1065 }
1066 if head_last_ce32.tag_checked() == Some(Tag::Contraction)
1067 && head_last_ce32.at_least_one_suffix_contains_starter()
1068 {
1069 continue;
1070 }
1071 }
1072 // Check this _after_ `head_last_ce32` to make sure
1073 // `head_last_ce32` is initialized for the next loop round
1074 // trip if applicable.
1075 if tail_first_ce32 == CollationElement32::default() {
1076 tail_first_ce32 = tailoring.ce32_for_char(tail_first_c);
1077 if tail_first_ce32 == FALLBACK_CE32 {
1078 tail_first_ce32 = self.root.ce32_for_char(tail_first_c);
1079 }
1080 } // else we already have a trie value from the previous loop iteration or we have Hangul syllable
1081 if tail_first_ce32.tag_checked() == Some(Tag::Prefix) {
1082 continue;
1083 }
1084 if self.options.numeric()
1085 && head_last_ce32.tag_checked() == Some(Tag::Digit)
1086 && tail_first_ce32.tag_checked() == Some(Tag::Digit)
1087 {
1088 continue;
1089 }
1090 // We are at a good boundary!
1091 break 'prefix;
1092 }
1093 } else {
1094 // The prefix is empty
1095 break 'prefix;
1096 }
1097 // Unreachable line
1098 }
1099
1100 // End identical prefix
1101
1102 left.init();
1103 right.init();
1104
1105 loop {
1106 let mut left_primary;
1107 'left_primary_loop: loop {
1108 let ce = left.next();
1109 left_primary = ce.primary();
1110 // TODO(#2008): Consider compiling out the variable handling when we know we aren't
1111 // shifting variable CEs.
1112 if !(left_primary < variable_top && left_primary > MERGE_SEPARATOR_PRIMARY) {
1113 left_ces.push(ce);
1114 } else {
1115 // Variable CE, shift it to quaternary level.
1116 // Ignore all following primary ignorables, and shift further variable CEs.
1117 any_variable = true;
1118 // Relative to ICU4C, the next line is hoisted out of the following loop
1119 // in order to keep the variables called `ce` immutable to make it easier
1120 // to reason about each assignment into `ce` resulting in exactly a single
1121 // push into `left_ces`.
1122 left_ces.push(ce.clone_with_non_primary_zeroed());
1123 loop {
1124 // This loop is simpler than in ICU4C; unlike in C++, we get to break by label.
1125 let ce = left.next();
1126 left_primary = ce.primary();
1127 if left_primary != 0
1128 && !(left_primary < variable_top
1129 && left_primary > MERGE_SEPARATOR_PRIMARY)
1130 {
1131 // Neither a primary ignorable nor a variable CE.
1132 left_ces.push(ce);
1133 break 'left_primary_loop;
1134 }
1135 // If `left_primary == 0`, the following line ignores a primary-ignorable.
1136 // Otherwise, it shifts a variable CE.
1137 left_ces.push(ce.clone_with_non_primary_zeroed());
1138 }
1139 }
1140 if left_primary != 0 {
1141 break;
1142 }
1143 }
1144 let mut right_primary;
1145 'right_primary_loop: loop {
1146 let ce = right.next();
1147 right_primary = ce.primary();
1148 // TODO(#2008): Consider compiling out the variable handling when we know we aren't
1149 // shifting variable CEs.
1150 if !(right_primary < variable_top && right_primary > MERGE_SEPARATOR_PRIMARY) {
1151 right_ces.push(ce);
1152 } else {
1153 // Variable CE, shift it to quaternary level.
1154 // Ignore all following primary ignorables, and shift further variable CEs.
1155 any_variable = true;
1156 // Relative to ICU4C, the next line is hoisted out of the following loop
1157 // in order to keep the variables called `ce` immutable to make it easier
1158 // to reason about each assignment into `ce` resulting in exactly a single
1159 // push into `right_ces`.
1160 right_ces.push(ce.clone_with_non_primary_zeroed());
1161 loop {
1162 // This loop is simpler than in ICU4C; unlike in C++, we get to break by label.
1163 let ce = right.next();
1164 right_primary = ce.primary();
1165 if right_primary != 0
1166 && !(right_primary < variable_top
1167 && right_primary > MERGE_SEPARATOR_PRIMARY)
1168 {
1169 // Neither a primary ignorable nor a variable CE.
1170 right_ces.push(ce);
1171 break 'right_primary_loop;
1172 }
1173 // If `right_primary == 0`, the following line ignores a primary-ignorable.
1174 // Otherwise, it shifts a variable CE.
1175 right_ces.push(ce.clone_with_non_primary_zeroed());
1176 }
1177 }
1178 if right_primary != 0 {
1179 break;
1180 }
1181 }
1182 if left_primary != right_primary {
1183 if let Some(reordering) = &self.reordering {
1184 left_primary = reordering.reorder(left_primary);
1185 right_primary = reordering.reorder(right_primary);
1186 }
1187 if left_primary < right_primary {
1188 return Ordering::Less;
1189 }
1190 return Ordering::Greater;
1191 }
1192 if left_primary == NO_CE_PRIMARY {
1193 break;
1194 }
1195 }
1196
1197 // Sadly, we end up pushing the sentinel value, which means these
1198 // `SmallVec`s allocate more often than if we didn't actually
1199 // store the sentinel.
1200 debug_assert_eq!(left_ces.last(), Some(&NO_CE));
1201 debug_assert_eq!(right_ces.last(), Some(&NO_CE));
1202
1203 // Note: `unwrap_or_default` in the iterations below should never
1204 // actually end up using the "_or_default" part, because the sentinel
1205 // is in the `SmallVec`s. These could be changed to `unwrap()` if we
1206 // preferred panic in case of a bug.
1207 // TODO(#2009): Should we save one slot by not putting the sentinel in
1208 // the `SmallVec`s? So far, the answer seems "no", as it would complicate
1209 // the primary comparison above.
1210
1211 // Compare the buffered secondary & tertiary weights.
1212 // We might skip the secondary level but continue with the case level
1213 // which is turned on separately.
1214 if self.options.strength() >= Strength::Secondary {
1215 if !self.options.backward_second_level() {
1216 let mut left_iter = left_ces.iter();
1217 let mut right_iter = right_ces.iter();
1218 let mut left_secondary;
1219 let mut right_secondary;
1220 loop {
1221 loop {
1222 left_secondary = left_iter.next().unwrap_or_default().secondary();
1223 if left_secondary != 0 {
1224 break;
1225 }
1226 }
1227 loop {
1228 right_secondary = right_iter.next().unwrap_or_default().secondary();
1229 if right_secondary != 0 {
1230 break;
1231 }
1232 }
1233 if left_secondary != right_secondary {
1234 if left_secondary < right_secondary {
1235 return Ordering::Less;
1236 }
1237 return Ordering::Greater;
1238 }
1239 if left_secondary == NO_CE_SECONDARY {
1240 break;
1241 }
1242 }
1243 } else {
1244 let mut left_remaining = &left_ces[..];
1245 let mut right_remaining = &right_ces[..];
1246 loop {
1247 if left_remaining.is_empty() {
1248 debug_assert!(right_remaining.is_empty());
1249 break;
1250 }
1251 let (left_prefix, right_prefix) = {
1252 let mut left_iter = left_remaining.iter();
1253 loop {
1254 let left_primary = left_iter.next().unwrap_or_default().primary();
1255 if left_primary != 0 && left_primary <= MERGE_SEPARATOR_PRIMARY {
1256 break;
1257 }
1258 debug_assert_ne!(left_primary, NO_CE_PRIMARY);
1259 }
1260 let left_new_remaining = left_iter.as_slice();
1261 // Index in range by construction
1262 #[allow(clippy::indexing_slicing)]
1263 let left_prefix =
1264 &left_remaining[..left_remaining.len() - 1 - left_new_remaining.len()];
1265 left_remaining = left_new_remaining;
1266
1267 let mut right_iter = right_remaining.iter();
1268 loop {
1269 let right_primary = right_iter.next().unwrap_or_default().primary();
1270 if right_primary != 0 && right_primary <= MERGE_SEPARATOR_PRIMARY {
1271 break;
1272 }
1273 debug_assert_ne!(right_primary, NO_CE_PRIMARY);
1274 }
1275 let right_new_remaining = right_iter.as_slice();
1276 // Index in range by construction
1277 #[allow(clippy::indexing_slicing)]
1278 let right_prefix = &right_remaining
1279 [..right_remaining.len() - 1 - right_new_remaining.len()];
1280 right_remaining = right_new_remaining;
1281
1282 (left_prefix, right_prefix)
1283 };
1284 let mut left_iter = left_prefix.iter();
1285 let mut right_iter = right_prefix.iter();
1286
1287 let mut left_secondary;
1288 let mut right_secondary;
1289 loop {
1290 loop {
1291 left_secondary = left_iter.next_back().unwrap_or_default().secondary();
1292 if left_secondary != 0 {
1293 break;
1294 }
1295 }
1296 loop {
1297 right_secondary =
1298 right_iter.next_back().unwrap_or_default().secondary();
1299 if right_secondary != 0 {
1300 break;
1301 }
1302 }
1303 if left_secondary != right_secondary {
1304 if left_secondary < right_secondary {
1305 return Ordering::Less;
1306 }
1307 return Ordering::Greater;
1308 }
1309 if left_secondary == NO_CE_SECONDARY {
1310 break;
1311 }
1312 }
1313 }
1314 }
1315 }
1316
1317 if self.options.case_level() {
1318 if self.options.strength() == Strength::Primary {
1319 // Primary+caseLevel: Ignore case level weights of primary ignorables.
1320 // Otherwise we would get a-umlaut > a
1321 // which is not desirable for accent-insensitive sorting.
1322 // Check for (lower 32 bits) == 0 as well because variable CEs are stored
1323 // with only primary weights.
1324 let mut left_non_primary;
1325 let mut right_non_primary;
1326 let mut left_case;
1327 let mut right_case;
1328 let mut left_iter = left_ces.iter();
1329 let mut right_iter = right_ces.iter();
1330 loop {
1331 loop {
1332 let ce = left_iter.next().unwrap_or_default();
1333 left_non_primary = ce.non_primary();
1334 if !ce.either_half_zero() {
1335 break;
1336 }
1337 }
1338 left_case = left_non_primary.case();
1339 loop {
1340 let ce = right_iter.next().unwrap_or_default();
1341 right_non_primary = ce.non_primary();
1342 if !ce.either_half_zero() {
1343 break;
1344 }
1345 }
1346 right_case = right_non_primary.case();
1347 // No need to handle NO_CE and MERGE_SEPARATOR specially:
1348 // There is one case weight for each previous-level weight,
1349 // so level length differences were handled there.
1350 if left_case != right_case {
1351 if !self.options.upper_first() {
1352 if left_case < right_case {
1353 return Ordering::Less;
1354 }
1355 return Ordering::Greater;
1356 }
1357 if left_case < right_case {
1358 return Ordering::Greater;
1359 }
1360 return Ordering::Less;
1361 }
1362 if left_non_primary.secondary() == NO_CE_SECONDARY {
1363 break;
1364 }
1365 }
1366 } else {
1367 // Secondary+caseLevel: By analogy with the above,
1368 // ignore case level weights of secondary ignorables.
1369 //
1370 // Note: A tertiary CE has uppercase case bits (0.0.ut)
1371 // to keep tertiary+caseFirst well-formed.
1372 //
1373 // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
1374 // Otherwise a tertiary CE's uppercase would be no greater than
1375 // a primary/secondary CE's uppercase.
1376 // (See UCA well-formedness condition 2.)
1377 // We could construct a special case weight higher than uppercase,
1378 // but it's simpler to always ignore case weights of secondary ignorables,
1379 // turning 0.0.ut into 0.0.0.t.
1380 // (See LDML Collation, Case Parameters.)
1381 let mut left_non_primary;
1382 let mut right_non_primary;
1383 let mut left_case;
1384 let mut right_case;
1385 let mut left_iter = left_ces.iter();
1386 let mut right_iter = right_ces.iter();
1387 loop {
1388 loop {
1389 left_non_primary = left_iter.next().unwrap_or_default().non_primary();
1390 if left_non_primary.secondary() != 0 {
1391 break;
1392 }
1393 }
1394 left_case = left_non_primary.case();
1395 loop {
1396 right_non_primary = right_iter.next().unwrap_or_default().non_primary();
1397 if right_non_primary.secondary() != 0 {
1398 break;
1399 }
1400 }
1401 right_case = right_non_primary.case();
1402 // No need to handle NO_CE and MERGE_SEPARATOR specially:
1403 // There is one case weight for each previous-level weight,
1404 // so level length differences were handled there.
1405 if left_case != right_case {
1406 if !self.options.upper_first() {
1407 if left_case < right_case {
1408 return Ordering::Less;
1409 }
1410 return Ordering::Greater;
1411 }
1412 if left_case < right_case {
1413 return Ordering::Greater;
1414 }
1415 return Ordering::Less;
1416 }
1417 if left_non_primary.secondary() == NO_CE_SECONDARY {
1418 break;
1419 }
1420 }
1421 }
1422 }
1423
1424 if let Some(tertiary_mask) = self.options.tertiary_mask() {
1425 let mut any_quaternaries = AnyQuaternaryAccumulator::new();
1426 let mut left_iter = left_ces.iter();
1427 let mut right_iter = right_ces.iter();
1428 loop {
1429 let mut left_non_primary;
1430 let mut left_tertiary;
1431 loop {
1432 left_non_primary = left_iter.next().unwrap_or_default().non_primary();
1433 any_quaternaries.accumulate(left_non_primary);
1434 debug_assert!(
1435 left_non_primary.tertiary() != 0 || left_non_primary.case_quaternary() == 0
1436 );
1437 left_tertiary = left_non_primary.tertiary_case_quarternary(tertiary_mask);
1438 if left_tertiary != 0 {
1439 break;
1440 }
1441 }
1442
1443 let mut right_non_primary;
1444 let mut right_tertiary;
1445 loop {
1446 right_non_primary = right_iter.next().unwrap_or_default().non_primary();
1447 any_quaternaries.accumulate(right_non_primary);
1448 debug_assert!(
1449 right_non_primary.tertiary() != 0
1450 || right_non_primary.case_quaternary() == 0
1451 );
1452 right_tertiary = right_non_primary.tertiary_case_quarternary(tertiary_mask);
1453 if right_tertiary != 0 {
1454 break;
1455 }
1456 }
1457
1458 if left_tertiary != right_tertiary {
1459 if self.options.upper_first() {
1460 // Pass through NO_CE and keep real tertiary weights larger than that.
1461 // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
1462 // to keep tertiary CEs well-formed.
1463 // Their case+tertiary weights must be greater than those of
1464 // primary and secondary CEs.
1465 // Magic numbers from ICU4C.
1466 if left_tertiary > NO_CE_TERTIARY {
1467 if left_non_primary.secondary() != 0 {
1468 left_tertiary ^= 0xC000;
1469 } else {
1470 left_tertiary += 0x4000;
1471 }
1472 }
1473 if right_tertiary > NO_CE_TERTIARY {
1474 if right_non_primary.secondary() != 0 {
1475 right_tertiary ^= 0xC000;
1476 } else {
1477 right_tertiary += 0x4000;
1478 }
1479 }
1480 }
1481 if left_tertiary < right_tertiary {
1482 return Ordering::Less;
1483 }
1484 return Ordering::Greater;
1485 }
1486
1487 if left_tertiary == NO_CE_TERTIARY {
1488 break;
1489 }
1490 }
1491 if !any_variable && !any_quaternaries.has_quaternary() {
1492 return Ordering::Equal;
1493 }
1494 } else {
1495 return Ordering::Equal;
1496 }
1497
1498 if self.options.strength() <= Strength::Tertiary {
1499 return Ordering::Equal;
1500 }
1501
1502 let mut left_iter = left_ces.iter();
1503 let mut right_iter = right_ces.iter();
1504 loop {
1505 let mut left_quaternary;
1506 loop {
1507 let ce = left_iter.next().unwrap_or_default();
1508 if ce.tertiary_ignorable() {
1509 left_quaternary = ce.primary();
1510 } else {
1511 left_quaternary = ce.quaternary();
1512 }
1513 if left_quaternary != 0 {
1514 break;
1515 }
1516 }
1517 let mut right_quaternary;
1518 loop {
1519 let ce = right_iter.next().unwrap_or_default();
1520 if ce.tertiary_ignorable() {
1521 right_quaternary = ce.primary();
1522 } else {
1523 right_quaternary = ce.quaternary();
1524 }
1525 if right_quaternary != 0 {
1526 break;
1527 }
1528 }
1529 if left_quaternary != right_quaternary {
1530 if let Some(reordering) = &self.reordering {
1531 left_quaternary = reordering.reorder(left_quaternary);
1532 right_quaternary = reordering.reorder(right_quaternary);
1533 }
1534 if left_quaternary < right_quaternary {
1535 return Ordering::Less;
1536 }
1537 return Ordering::Greater;
1538 }
1539 if left_quaternary == NO_CE_PRIMARY {
1540 break;
1541 }
1542 }
1543
1544 Ordering::Equal
1545 }
1546}