icu_collator/
comparison.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5// Various collation-related algorithms and constants in this file are
6// adapted from ICU4C and, therefore, are subject to the ICU license as
7// described in LICENSE.
8
9//! This module holds the `Collator` struct whose `compare_impl()` contains
10//! the comparison of collation element sequences.
11
12use crate::elements::CharacterAndClassAndTrieValue;
13use crate::elements::CollationElement32;
14use crate::elements::Tag;
15use crate::elements::BACKWARD_COMBINING_MARKER;
16use crate::elements::CE_BUFFER_SIZE;
17use crate::elements::FALLBACK_CE32;
18use crate::elements::NON_ROUND_TRIP_MARKER;
19use crate::elements::{
20    char_from_u32, CollationElement, CollationElements, NonPrimary, FFFD_CE32,
21    HANGUL_SYLLABLE_MARKER, HIGH_ZEROS_MASK, JAMO_COUNT, LOW_ZEROS_MASK, NO_CE, NO_CE_PRIMARY,
22    NO_CE_SECONDARY, NO_CE_TERTIARY, OPTIMIZED_DIACRITICS_MAX_COUNT, QUATERNARY_MASK,
23};
24use crate::options::CollatorOptionsBitField;
25use crate::options::{
26    AlternateHandling, CollatorOptions, MaxVariable, ResolvedCollatorOptions, Strength,
27};
28use crate::preferences::{CollationCaseFirst, CollationNumericOrdering, CollationType};
29use crate::provider::CollationData;
30use crate::provider::CollationDiacritics;
31use crate::provider::CollationDiacriticsV1;
32use crate::provider::CollationJamo;
33use crate::provider::CollationJamoV1;
34use crate::provider::CollationMetadataV1;
35use crate::provider::CollationReordering;
36use crate::provider::CollationReorderingV1;
37use crate::provider::CollationRootV1;
38use crate::provider::CollationSpecialPrimaries;
39use crate::provider::CollationSpecialPrimariesV1;
40use crate::provider::CollationTailoringV1;
41use core::cmp::Ordering;
42use core::convert::TryFrom;
43use icu_normalizer::provider::DecompositionData;
44use icu_normalizer::provider::DecompositionTables;
45use icu_normalizer::provider::NormalizerNfdDataV1;
46use icu_normalizer::provider::NormalizerNfdTablesV1;
47use icu_normalizer::Decomposition;
48use icu_provider::prelude::*;
49use smallvec::SmallVec;
50use utf16_iter::Utf16CharsEx;
51use utf8_iter::Utf8CharsEx;
52use zerovec::ule::AsULE;
53
54const MERGE_SEPARATOR_PRIMARY: u32 = 0x02000000; // for U+FFFE
55
56struct AnyQuaternaryAccumulator(u32);
57
58impl AnyQuaternaryAccumulator {
59    #[inline(always)]
60    pub fn new() -> Self {
61        AnyQuaternaryAccumulator(0)
62    }
63    #[inline(always)]
64    pub fn accumulate(&mut self, non_primary: NonPrimary) {
65        self.0 |= non_primary.bits()
66    }
67    #[inline(always)]
68    pub fn has_quaternary(&self) -> bool {
69        self.0 & u32::from(QUATERNARY_MASK) != 0
70    }
71}
72
73/// `true` iff `i` is greater or equal to `start` and less or equal
74/// to `end`.
75#[inline(always)]
76fn in_inclusive_range16(i: u16, start: u16, end: u16) -> bool {
77    i.wrapping_sub(start) <= (end - start)
78}
79
80/// Finds the identical prefix of `left` and `right` containing
81/// potentially ill-formed UTF-16, while avoiding splitting a
82/// well-formed surrogate pair. In case of ill-formed
83/// UTF-16, the prefix is not guaranteed to be maximal.
84///
85/// Returns the identical prefix, the part of `left` after the
86/// prefix, and the part of `right` after the prefix.
87fn split_prefix_u16<'a, 'b>(
88    left: &'a [u16],
89    right: &'b [u16],
90) -> (&'a [u16], &'a [u16], &'b [u16]) {
91    let mut i = left
92        .iter()
93        .zip(right.iter())
94        .take_while(|(l, r)| l == r)
95        .count();
96    if i != 0 {
97        if let Some(&last) = left.get(i.wrapping_sub(1)) {
98            if in_inclusive_range16(last, 0xD800, 0xDBFF) {
99                i -= 1;
100            }
101            if let Some((head, left_tail)) = left.split_at_checked(i) {
102                if let Some(right_tail) = right.get(i..) {
103                    return (head, left_tail, right_tail);
104                }
105            }
106        }
107    }
108    (&[], left, right)
109}
110
111/// Finds the identical prefix of `left` and `right` containing
112/// potentially ill-formed UTF-8, while avoiding splitting a UTF-8
113/// byte sequence. In case of ill-formed UTF-8, the prefix is
114/// not guaranteed to be maximal.
115///
116/// Returns the identical prefix, the part of `left` after the
117/// prefix, and the part of `right` after the prefix.
118fn split_prefix_u8<'a, 'b>(left: &'a [u8], right: &'b [u8]) -> (&'a [u8], &'a [u8], &'b [u8]) {
119    let mut i = left
120        .iter()
121        .zip(right.iter())
122        .take_while(|(l, r)| l == r)
123        .count();
124    if i != 0 {
125        // Tails must not start with a UTF-8 continuation
126        // byte unless it's the first byte of the original
127        // slice.
128
129        // First, left and right differ, but since they
130        // are the same afterwards, one of them needs checking
131        // only once.
132        if let Some(right_first) = right.get(i) {
133            if (right_first & 0b1100_0000) == 0b1000_0000 {
134                i -= 1;
135            }
136        }
137        while i != 0 {
138            if let Some(left_first) = left.get(i) {
139                if (left_first & 0b1100_0000) == 0b1000_0000 {
140                    i -= 1;
141                    continue;
142                }
143            }
144            break;
145        }
146        if let Some((head, left_tail)) = left.split_at_checked(i) {
147            if let Some(right_tail) = right.get(i..) {
148                return (head, left_tail, right_tail);
149            }
150        }
151    }
152    (&[], left, right)
153}
154
155/// Finds the identical prefix of `left` and `right` containing
156/// guaranteed well-format UTF-8.
157///
158/// Returns the identical prefix, the part of `left` after the
159/// prefix, and the part of `right` after the prefix.
160fn split_prefix<'a, 'b>(left: &'a str, right: &'b str) -> (&'a str, &'a str, &'b str) {
161    let left_bytes = left.as_bytes();
162    let right_bytes = right.as_bytes();
163    let mut i = left_bytes
164        .iter()
165        .zip(right_bytes.iter())
166        .take_while(|(l, r)| l == r)
167        .count();
168    if i != 0 {
169        // Tails must not start with a UTF-8 continuation
170        // byte.
171
172        // Since the inputs are valid UTF-8, the first byte
173        // of either input slice cannot be a contination slice,
174        // so we may rely on finding a lead byte when walking
175        // backwards.
176
177        // Since the inputs are valid UTF-8, if a tail starts
178        // with a continuation, both tails must start with a
179        // continuation, since the most recent lead byte must
180        // be equal, so the difference is within valid UTF-8
181        // sequences of equal length.
182
183        // Therefore, it's sufficient to examine only one of
184        // the sides.
185        loop {
186            if let Some(left_first) = left_bytes.get(i) {
187                if (left_first & 0b1100_0000) == 0b1000_0000 {
188                    i -= 1;
189                    continue;
190                }
191            }
192            break;
193        }
194        // The methods below perform useless UTF-8 boundary checks,
195        // since we just checked. However, avoiding `unsafe` to
196        // make this code easier to audit.
197        if let Some((head, left_tail)) = left.split_at_checked(i) {
198            if let Some(right_tail) = right.get(i..) {
199                return (head, left_tail, right_tail);
200            }
201        }
202    }
203    ("", left, right)
204}
205
206/// Holder struct for payloads that are locale-dependent. (For code
207/// reuse between owned and borrowed cases.)
208#[derive(Debug)]
209struct LocaleSpecificDataHolder {
210    tailoring: Option<DataPayload<CollationTailoringV1>>,
211    diacritics: DataPayload<CollationDiacriticsV1>,
212    reordering: Option<DataPayload<CollationReorderingV1>>,
213    merged_options: CollatorOptionsBitField,
214    lithuanian_dot_above: bool,
215}
216
217icu_locale_core::preferences::define_preferences!(
218    /// The preferences for collation.
219    ///
220    /// # Preferences
221    ///
222    /// Examples for using the different preferences below can be found in the [crate-level docs](crate).
223    ///
224    /// ## Case First
225    ///
226    /// See the [spec](https://www.unicode.org/reports/tr35/tr35-collation.html#Case_Parameters).
227    /// This is the BCP47 key `kf`. Three possibilities: [`CollationCaseFirst::False`] (default,
228    /// except for Danish and Maltese), [`CollationCaseFirst::Lower`], and [`CollationCaseFirst::Upper`]
229    /// (default for Danish and Maltese).
230    ///
231    /// ## Numeric
232    ///
233    /// This is the BCP47 key `kn`. When set to [`CollationNumericOrdering::True`], any sequence of decimal
234    /// digits (General_Category = Nd) is sorted at the primary level according to the
235    /// numeric value. The default is [`CollationNumericOrdering::False`].
236    [Copy]
237    CollatorPreferences,
238    {
239        /// The collation type. This corresponds to the `-u-co` BCP-47 tag.
240        collation_type: CollationType,
241        /// Treatment of case. (Large and small kana differences are treated as case differences.)
242        /// This corresponds to the `-u-kf` BCP-47 tag.
243        case_first: CollationCaseFirst,
244        /// When set to `True`, any sequence of decimal digits is sorted at a primary level according
245        /// to the numeric value.
246        /// This corresponds to the `-u-kn` BPC-47 tag.
247        numeric_ordering: CollationNumericOrdering
248    }
249);
250
251impl LocaleSpecificDataHolder {
252    /// The constructor code reused between owned and borrowed cases.
253    fn try_new_unstable_internal<D>(
254        provider: &D,
255        prefs: CollatorPreferences,
256        options: CollatorOptions,
257    ) -> Result<Self, DataError>
258    where
259        D: DataProvider<CollationTailoringV1>
260            + DataProvider<CollationDiacriticsV1>
261            + DataProvider<CollationMetadataV1>
262            + DataProvider<CollationReorderingV1>
263            + ?Sized,
264    {
265        let marker_attributes = prefs
266            .collation_type
267            .as_ref()
268            // all collation types are valid marker attributes
269            .map(|c| DataMarkerAttributes::from_str_or_panic(c.as_str()))
270            .unwrap_or_default();
271
272        let data_locale = CollationTailoringV1::make_locale(prefs.locale_preferences);
273        let req = DataRequest {
274            id: DataIdentifierBorrowed::for_marker_attributes_and_locale(
275                marker_attributes,
276                &data_locale,
277            ),
278            metadata: {
279                let mut metadata = DataRequestMetadata::default();
280                metadata.silent = true;
281                metadata
282            },
283        };
284
285        let fallback_req = DataRequest {
286            id: DataIdentifierBorrowed::for_marker_attributes_and_locale(
287                Default::default(),
288                &data_locale,
289            ),
290            ..Default::default()
291        };
292
293        let metadata_payload: DataPayload<crate::provider::CollationMetadataV1> = provider
294            .load(req)
295            .or_else(|_| provider.load(fallback_req))?
296            .payload;
297
298        let metadata = metadata_payload.get();
299
300        let tailoring: Option<DataPayload<crate::provider::CollationTailoringV1>> =
301            if metadata.tailored() {
302                Some(
303                    provider
304                        .load(req)
305                        .or_else(|_| provider.load(fallback_req))?
306                        .payload,
307                )
308            } else {
309                None
310            };
311
312        let reordering: Option<DataPayload<crate::provider::CollationReorderingV1>> =
313            if metadata.reordering() {
314                Some(
315                    provider
316                        .load(req)
317                        .or_else(|_| provider.load(fallback_req))?
318                        .payload,
319                )
320            } else {
321                None
322            };
323
324        if let Some(reordering) = &reordering {
325            if reordering.get().reorder_table.len() != 256 {
326                return Err(DataError::custom("invalid").with_marker(CollationReorderingV1::INFO));
327            }
328        }
329
330        let tailored_diacritics = metadata.tailored_diacritics();
331        let diacritics: DataPayload<CollationDiacriticsV1> = provider
332            .load(if tailored_diacritics {
333                req
334            } else {
335                Default::default()
336            })?
337            .payload;
338
339        if tailored_diacritics {
340            // In the tailored case we accept a shorter table in which case the tailoring is
341            // responsible for supplying the missing values in the trie.
342            // As of June 2022, none of the collations actually use a shortened table.
343            // Vietnamese and Ewe load a full-length alternative table and the rest use
344            // the default one.
345            if diacritics.get().secondaries.len() > OPTIMIZED_DIACRITICS_MAX_COUNT {
346                return Err(DataError::custom("invalid").with_marker(CollationDiacriticsV1::INFO));
347            }
348        } else if diacritics.get().secondaries.len() != OPTIMIZED_DIACRITICS_MAX_COUNT {
349            return Err(DataError::custom("invalid").with_marker(CollationDiacriticsV1::INFO));
350        }
351
352        let mut altered_defaults = CollatorOptionsBitField::default();
353
354        if metadata.alternate_shifted() {
355            altered_defaults.set_alternate_handling(Some(AlternateHandling::Shifted));
356        }
357        if metadata.backward_second_level() {
358            altered_defaults.set_backward_second_level(Some(true));
359        }
360
361        altered_defaults.set_case_first(Some(metadata.case_first()));
362        altered_defaults.set_max_variable(Some(metadata.max_variable()));
363
364        let mut merged_options = CollatorOptionsBitField::from(options);
365        merged_options.set_case_first(prefs.case_first);
366        merged_options.set_numeric_from_enum(prefs.numeric_ordering);
367        merged_options.set_defaults(altered_defaults);
368
369        Ok(LocaleSpecificDataHolder {
370            tailoring,
371            diacritics,
372            merged_options,
373            reordering,
374            lithuanian_dot_above: metadata.lithuanian_dot_above(),
375        })
376    }
377}
378
379/// Compares strings according to culturally-relevant ordering.
380#[derive(Debug)]
381pub struct Collator {
382    special_primaries: Option<DataPayload<CollationSpecialPrimariesV1>>,
383    root: DataPayload<CollationRootV1>,
384    tailoring: Option<DataPayload<CollationTailoringV1>>,
385    jamo: DataPayload<CollationJamoV1>,
386    diacritics: DataPayload<CollationDiacriticsV1>,
387    options: CollatorOptionsBitField,
388    reordering: Option<DataPayload<CollationReorderingV1>>,
389    decompositions: DataPayload<NormalizerNfdDataV1>,
390    tables: DataPayload<NormalizerNfdTablesV1>,
391    lithuanian_dot_above: bool,
392}
393
394impl Collator {
395    /// Constructs a borrowed version of this type for more efficient querying.
396    pub fn as_borrowed(&self) -> CollatorBorrowed {
397        CollatorBorrowed {
398            special_primaries: self.special_primaries.as_ref().map(|s| s.get()),
399            root: self.root.get(),
400            tailoring: self.tailoring.as_ref().map(|s| s.get()),
401            jamo: self.jamo.get(),
402            diacritics: self.diacritics.get(),
403            options: self.options,
404            reordering: self.reordering.as_ref().map(|s| s.get()),
405            decompositions: self.decompositions.get(),
406            tables: self.tables.get(),
407            lithuanian_dot_above: self.lithuanian_dot_above,
408        }
409    }
410
411    /// Creates `CollatorBorrowed` for the given locale and options from compiled data.
412    #[cfg(feature = "compiled_data")]
413    pub fn try_new(
414        prefs: CollatorPreferences,
415        options: CollatorOptions,
416    ) -> Result<CollatorBorrowed<'static>, DataError> {
417        CollatorBorrowed::try_new(prefs, options)
418    }
419
420    icu_provider::gen_buffer_data_constructors!(
421        (prefs: CollatorPreferences, options: CollatorOptions) -> error: DataError,
422        functions: [
423            try_new: skip,
424            try_new_with_buffer_provider,
425            try_new_unstable,
426            Self
427        ]
428    );
429
430    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::try_new)]
431    pub fn try_new_unstable<D>(
432        provider: &D,
433        prefs: CollatorPreferences,
434        options: CollatorOptions,
435    ) -> Result<Self, DataError>
436    where
437        D: DataProvider<CollationSpecialPrimariesV1>
438            + DataProvider<CollationRootV1>
439            + DataProvider<CollationTailoringV1>
440            + DataProvider<CollationDiacriticsV1>
441            + DataProvider<CollationJamoV1>
442            + DataProvider<CollationMetadataV1>
443            + DataProvider<CollationReorderingV1>
444            + DataProvider<NormalizerNfdDataV1>
445            + DataProvider<NormalizerNfdTablesV1>
446            + ?Sized,
447    {
448        Self::try_new_unstable_internal(
449            provider,
450            provider.load(Default::default())?.payload,
451            provider.load(Default::default())?.payload,
452            provider.load(Default::default())?.payload,
453            provider.load(Default::default())?.payload,
454            || provider.load(Default::default()).map(|r| r.payload),
455            prefs,
456            options,
457        )
458    }
459
460    #[allow(clippy::too_many_arguments)]
461    fn try_new_unstable_internal<D>(
462        provider: &D,
463        root: DataPayload<CollationRootV1>,
464        decompositions: DataPayload<NormalizerNfdDataV1>,
465        tables: DataPayload<NormalizerNfdTablesV1>,
466        jamo: DataPayload<CollationJamoV1>,
467        special_primaries: impl FnOnce() -> Result<DataPayload<CollationSpecialPrimariesV1>, DataError>,
468        prefs: CollatorPreferences,
469        options: CollatorOptions,
470    ) -> Result<Self, DataError>
471    where
472        D: DataProvider<CollationRootV1>
473            + DataProvider<CollationTailoringV1>
474            + DataProvider<CollationDiacriticsV1>
475            + DataProvider<CollationMetadataV1>
476            + DataProvider<CollationReorderingV1>
477            + ?Sized,
478    {
479        let locale_dependent =
480            LocaleSpecificDataHolder::try_new_unstable_internal(provider, prefs, options)?;
481
482        // TODO: redesign Korean search collation handling
483        if jamo.get().ce32s.len() != JAMO_COUNT {
484            return Err(DataError::custom("invalid").with_marker(CollationJamoV1::INFO));
485        }
486
487        let special_primaries = if locale_dependent.merged_options.alternate_handling()
488            == AlternateHandling::Shifted
489            || locale_dependent.merged_options.numeric()
490        {
491            let special_primaries = special_primaries()?;
492            // `variant_count` isn't stable yet:
493            // https://github.com/rust-lang/rust/issues/73662
494            if special_primaries.get().last_primaries.len() <= (MaxVariable::Currency as usize) {
495                return Err(
496                    DataError::custom("invalid").with_marker(CollationSpecialPrimariesV1::INFO)
497                );
498            }
499            Some(special_primaries)
500        } else {
501            None
502        };
503
504        Ok(Collator {
505            special_primaries,
506            root,
507            tailoring: locale_dependent.tailoring,
508            jamo,
509            diacritics: locale_dependent.diacritics,
510            options: locale_dependent.merged_options,
511            reordering: locale_dependent.reordering,
512            decompositions,
513            tables,
514            lithuanian_dot_above: locale_dependent.lithuanian_dot_above,
515        })
516    }
517}
518
519macro_rules! compare {
520    ($(#[$meta:meta])*,
521     $compare:ident,
522     $slice:ty,
523     $split_prefix:ident,
524    ) => {
525        $(#[$meta])*
526        pub fn $compare(&self, left: &$slice, right: &$slice) -> Ordering {
527            let (head, left_tail, right_tail) = $split_prefix(left, right);
528            if left_tail.is_empty() && right_tail.is_empty() {
529                return Ordering::Equal;
530            }
531            let ret = self.compare_impl(left_tail.chars(), right_tail.chars(), head.chars());
532            if self.options.strength() == Strength::Identical && ret == Ordering::Equal {
533                return Decomposition::new(left_tail.chars(), self.decompositions, self.tables).cmp(
534                    Decomposition::new(right_tail.chars(), self.decompositions, self.tables),
535                );
536            }
537            ret
538        }
539    }
540}
541
542/// Compares strings according to culturally-relevant ordering,
543/// borrowed version.
544#[derive(Debug)]
545pub struct CollatorBorrowed<'a> {
546    special_primaries: Option<&'a CollationSpecialPrimaries<'a>>,
547    root: &'a CollationData<'a>,
548    tailoring: Option<&'a CollationData<'a>>,
549    jamo: &'a CollationJamo<'a>,
550    diacritics: &'a CollationDiacritics<'a>,
551    options: CollatorOptionsBitField,
552    reordering: Option<&'a CollationReordering<'a>>,
553    decompositions: &'a DecompositionData<'a>,
554    tables: &'a DecompositionTables<'a>,
555    lithuanian_dot_above: bool,
556}
557
558impl CollatorBorrowed<'static> {
559    /// Creates a collator for the given locale and options from compiled data.
560    #[cfg(feature = "compiled_data")]
561    pub fn try_new(
562        prefs: CollatorPreferences,
563        options: CollatorOptions,
564    ) -> Result<Self, DataError> {
565        // These are assigned to locals in order to keep the code after these assignments
566        // copypaste-compatible with `Collator::try_new_unstable_internal`.
567
568        let provider = &crate::provider::Baked;
569        let decompositions = icu_normalizer::provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1;
570        let tables = icu_normalizer::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1;
571        let root = crate::provider::Baked::SINGLETON_COLLATION_ROOT_V1;
572        let jamo = crate::provider::Baked::SINGLETON_COLLATION_JAMO_V1;
573
574        let locale_dependent =
575            LocaleSpecificDataHolder::try_new_unstable_internal(provider, prefs, options)?;
576
577        // TODO: redesign Korean search collation handling
578        if jamo.ce32s.len() != JAMO_COUNT {
579            return Err(DataError::custom("invalid").with_marker(CollationJamoV1::INFO));
580        }
581
582        let special_primaries = if locale_dependent.merged_options.alternate_handling()
583            == AlternateHandling::Shifted
584            || locale_dependent.merged_options.numeric()
585        {
586            let special_primaries =
587                crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1;
588            // `variant_count` isn't stable yet:
589            // https://github.com/rust-lang/rust/issues/73662
590            if special_primaries.last_primaries.len() <= (MaxVariable::Currency as usize) {
591                return Err(
592                    DataError::custom("invalid").with_marker(CollationSpecialPrimariesV1::INFO)
593                );
594            }
595            Some(special_primaries)
596        } else {
597            None
598        };
599
600        // Attribute belongs closer to `unwrap`, but
601        // https://github.com/rust-lang/rust/issues/15701
602        #[allow(clippy::unwrap_used)]
603        Ok(CollatorBorrowed {
604            special_primaries,
605            root,
606            // Unwrap is OK, because we know we have the baked provider.
607            tailoring: locale_dependent.tailoring.map(|s| s.get_static().unwrap()),
608            jamo,
609            // Unwrap is OK, because we know we have the baked provider.
610            diacritics: locale_dependent.diacritics.get_static().unwrap(),
611            options: locale_dependent.merged_options,
612            // Unwrap is OK, because we know we have the baked provider.
613            reordering: locale_dependent.reordering.map(|s| s.get_static().unwrap()),
614            decompositions,
615            tables,
616            lithuanian_dot_above: locale_dependent.lithuanian_dot_above,
617        })
618    }
619
620    /// Cheaply converts a [`CollatorBorrowed<'static>`] into a [`Collator`].
621    ///
622    /// Note: Due to branching and indirection, using [`Collator`] might inhibit some
623    /// compile-time optimizations that are possible with [`CollatorBorrowed`].
624    pub const fn static_to_owned(self) -> Collator {
625        Collator {
626            special_primaries: if let Some(s) = self.special_primaries {
627                // `map` not available in const context
628                Some(DataPayload::from_static_ref(s))
629            } else {
630                None
631            },
632            root: DataPayload::from_static_ref(self.root),
633            tailoring: if let Some(s) = self.tailoring {
634                // `map` not available in const context
635                Some(DataPayload::from_static_ref(s))
636            } else {
637                None
638            },
639            jamo: DataPayload::from_static_ref(self.jamo),
640            diacritics: DataPayload::from_static_ref(self.diacritics),
641            options: self.options,
642            reordering: if let Some(s) = self.reordering {
643                // `map` not available in const context
644                Some(DataPayload::from_static_ref(s))
645            } else {
646                None
647            },
648            decompositions: DataPayload::from_static_ref(self.decompositions),
649            tables: DataPayload::from_static_ref(self.tables),
650            lithuanian_dot_above: self.lithuanian_dot_above,
651        }
652    }
653}
654
655impl CollatorBorrowed<'_> {
656    /// The resolved options showing how the default options, the requested options,
657    /// and the options from locale data were combined.
658    pub fn resolved_options(&self) -> ResolvedCollatorOptions {
659        self.options.into()
660    }
661
662    compare!(
663        /// Compare guaranteed well-formed UTF-8 slices.
664        ,
665        compare,
666        str,
667        split_prefix,
668    );
669
670    compare!(
671        /// Compare potentially ill-formed UTF-8 slices. Ill-formed input is compared
672        /// as if errors had been replaced with REPLACEMENT CHARACTERs according
673        /// to the WHATWG Encoding Standard.
674        ,
675        compare_utf8,
676        [u8],
677        split_prefix_u8,
678    );
679
680    compare!(
681        /// Compare potentially ill-formed UTF-16 slices. Unpaired surrogates
682        /// are compared as if each one was a REPLACEMENT CHARACTER.
683        ,
684        compare_utf16,
685        [u16],
686        split_prefix_u16,
687    );
688
689    /// The implementation of the comparison operation.
690    ///
691    /// `head_chars` is an iterator over the identical prefix and
692    /// `left_chars` and `right_chars` are iterators over the parts
693    /// after the identical prefix.
694    ///
695    /// Currently, all three have the same concrete type, so the
696    /// trait bound is given as `DoubleEndedIterator`.
697    /// `head_chars` is iterated backwards and `left_chars` and
698    /// `right_chars` forward. If this were a public API, this
699    /// should have three generic types, one for each argument,
700    /// for maximum flexibility.
701    fn compare_impl<I: DoubleEndedIterator<Item = char>>(
702        &self,
703        left_chars: I,
704        right_chars: I,
705        mut head_chars: I,
706    ) -> Ordering {
707        let tailoring: &CollationData = if let Some(tailoring) = &self.tailoring {
708            tailoring
709        } else {
710            // If the root collation is valid for the locale,
711            // use the root as the tailoring so that reads from the
712            // tailoring always succeed.
713            //
714            // TODO(#2011): Do we instead want to have an untailored
715            // copypaste of the iterator that omits the tailoring
716            // branches for performance at the expense of code size
717            // and having to maintain both a tailoring-capable and
718            // a tailoring-incapable version of the iterator?
719            // Or, in order not to flip the branch prediction around,
720            // should we have a no-op tailoring that contains a
721            // specially-crafted CodePointTrie that always returns
722            // a FALLBACK_CE32 after a single branch?
723            self.root
724        };
725
726        // Sadly, it looks like variable CEs and backward second level
727        // require us to store the full 64-bit CEs instead of storing only
728        // the NonPrimary part.
729        //
730        // TODO(#2008): Consider having two monomorphizations of this method:
731        // one that can deal with variables shifted to quaternary and
732        // backward second level and another that doesn't support that
733        // and only stores `NonPrimary` in `left_ces` and `right_ces`
734        // with double the number of stack allocated elements.
735
736        // Note: These are used only after the identical prefix skipping,
737        // but initializing these up here improves performance at the time
738        // of writing. Presumably the source order affects the stack frame
739        // layout.
740        let mut left_ces: SmallVec<[CollationElement; CE_BUFFER_SIZE]> = SmallVec::new();
741        let mut right_ces: SmallVec<[CollationElement; CE_BUFFER_SIZE]> = SmallVec::new();
742
743        // The algorithm comes from CollationCompare::compareUpToQuaternary in ICU4C.
744
745        let mut any_variable = false;
746        // Attribute belongs closer to `unwrap`, but
747        // https://github.com/rust-lang/rust/issues/15701
748        #[allow(clippy::unwrap_used)]
749        let variable_top = if self.options.alternate_handling() == AlternateHandling::NonIgnorable {
750            0
751        } else {
752            // +1 so that we can use "<" and primary ignorables test out early.
753            self.special_primaries
754                .as_ref()
755                // `unwrap()` is OK, because we've ensured in the constructor that value
756                // is `Some` if we have alternate handling.
757                .unwrap()
758                .last_primary_for_group(self.options.max_variable())
759                + 1
760        };
761
762        // Attribute belongs on inner expression, but
763        // https://github.com/rust-lang/rust/issues/15701
764        #[allow(clippy::unwrap_used)]
765        let numeric_primary = if self.options.numeric() {
766            Some(
767                self.special_primaries
768                    .as_ref()
769                    // `unwrap` is OK, because we've ensured `Some` in the constructor
770                    .unwrap()
771                    .numeric_primary,
772            )
773        } else {
774            None
775        };
776
777        // Attribute belongs on inner expression, but
778        // https://github.com/rust-lang/rust/issues/15701
779        #[allow(clippy::unwrap_used)]
780        let mut left = CollationElements::new(
781            left_chars,
782            self.root,
783            tailoring,
784            <&[<u32 as AsULE>::ULE; JAMO_COUNT]>::try_from(self.jamo.ce32s.as_ule_slice()).unwrap(), // `unwrap` OK, because length already validated
785            &self.diacritics.secondaries,
786            self.decompositions,
787            self.tables,
788            numeric_primary,
789            self.lithuanian_dot_above,
790        );
791        // Attribute belongs on inner expression, but
792        // https://github.com/rust-lang/rust/issues/15701
793        #[allow(clippy::unwrap_used)]
794        let mut right = CollationElements::new(
795            right_chars,
796            self.root,
797            tailoring,
798            <&[<u32 as AsULE>::ULE; JAMO_COUNT]>::try_from(self.jamo.ce32s.as_ule_slice()).unwrap(), // `unwrap` OK, because length already validated
799            &self.diacritics.secondaries,
800            self.decompositions,
801            self.tables,
802            numeric_primary,
803            self.lithuanian_dot_above,
804        );
805
806        // Start identical prefix
807
808        // The logic here to check whether the boundary found by skipping
809        // the identical prefix is safe is complicated compared to the ICU4C
810        // approach of having a set of characters that are unsafe as the character
811        // immediately following the identical prefix. However, the approach here
812        // avoids extra data, and working on the main data avoids the bug
813        // possibility of data structures not being mutually consistent.
814
815        // This code intentionally does not keep around the `CollationElement32`s
816        // that have been read from the collation data tries, because keeping
817        // them around turned out to be a pessimization: There would be added
818        // branches on the hot path of the algorithm that maps characters to
819        // collation elements, and the element size of the upcoming buffer
820        // would grow.
821        //
822        // However, the values read from the normalization trie _are_ kept around,
823        // since there is already a place where to put them.
824
825        // This loop is only broken out of as goto forward.
826        #[allow(clippy::never_loop)]
827        'prefix: loop {
828            if let Some(mut head_last_c) = head_chars.next_back() {
829                let norm_trie = &self.decompositions.trie;
830                let mut head_last = CharacterAndClassAndTrieValue::new_with_trie_val(
831                    head_last_c,
832                    norm_trie.get(head_last_c),
833                );
834                let mut head_last_ce32 = CollationElement32::default();
835                let mut head_last_ok = false;
836                if let Some(left_different) = left.iter_next_before_init() {
837                    left.prepend_upcoming_before_init(left_different.clone());
838                    if let Some(right_different) = right.iter_next_before_init() {
839                        // Note: left_different and right_different may both be U+FFFD.
840                        right.prepend_upcoming_before_init(right_different.clone());
841
842                        // The base logic is that a boundary between two starters
843                        // that decompose to selves is safe iff the starter
844                        // before the boundary can't contract a starter, the
845                        // starter after the boundary doesn't have a prefix
846                        // condition, and, with the numeric mode enabled,
847                        // they aren't both numeric.
848                        //
849                        // This base logic is then extended with Hangul
850                        // syllables and characters that decompose to a
851                        // BMP starter followed by a BMP non-starter.
852                        // The logic could be extended further, in
853                        // particular to cover singleton decompositions
854                        // to a BMP starter, but such characters can be
855                        // expected to be rare enough in real-world input
856                        // that it's not worthwhile to make this code more
857                        // branchy.
858                        //
859                        // A Hangul syllable is safe on either side of the
860                        // boundary, because Hangul syllables can't participate
861                        // in contraction or have prefix conditions. They are
862                        // also known not to be numeric.
863                        //
864                        // Hangul jamo is safe to look up from the main trie
865                        // instead of the jamo table, because they aren't
866                        // allowed to participate in contractions or prefix
867                        // conditions, either, and are known not to be numeric.
868                        //
869                        // After a boundary, a decomposition to a BMP starter
870                        // and a BMP non-starter can obviously be analyzed by
871                        // considering the starter as if it was a starter
872                        // that decomposes to self.
873                        //
874                        // Before a boundary the contraction condition considers
875                        // whether the contraction can contract a starter.
876                        // For the case of contracting a non-starter, it's
877                        // fine for the BMP starter of the decomposition to
878                        // contract the non-starter from the same decomposition:
879                        // Since that would happen as part of the prefix that
880                        // is identical, it wouldn't affect anything after.
881                        //
882                        // The case of contracting a non-starter other than
883                        // the one that came from the decomposition itself
884                        // is irrelevant, because we don't allow a non-starter
885                        // right after the boundary regardless of the contraction
886                        // status of what's before the boundary.
887                        //
888                        // Finally, decompositions to starter and non-starter
889                        // are known not to be numeric.
890
891                        // The checks below are repetitive, but an attempt to factor
892                        // repetitive code into an inlined function regressed,
893                        // performance, so it seems that having the control flow
894                        // right here without an intermediate enum from a
895                        // function return to branch on is important.
896
897                        // This loop is only broken out of as goto forward. The control flow
898                        // is much more readable this way.
899                        #[allow(clippy::never_loop)]
900                        loop {
901                            // The two highest bits are about NFC, which we don't
902                            // care about here.
903                            let decomposition = head_last.trie_val;
904                            if (decomposition
905                                & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER))
906                                == 0
907                            {
908                                // Intentionally empty block to keep
909                                // the same structure as in the cases
910                                // where something happens here.
911                            } else if ((decomposition & HIGH_ZEROS_MASK) != 0)
912                                && ((decomposition & LOW_ZEROS_MASK) != 0)
913                            {
914                                // Decomposition into two BMP characters: starter and non-starter
915                                // Let's take the starter
916                                head_last_c = char_from_u32(decomposition & 0x7FFF);
917                            } else if decomposition == HANGUL_SYLLABLE_MARKER {
918                                head_last_ce32 = FFFD_CE32;
919                            } else {
920                                break;
921                            }
922                            head_last_ok = true;
923
924                            let mut left_ce32 = CollationElement32::default();
925                            let mut right_ce32 = CollationElement32::default();
926                            let left_c;
927                            let right_c;
928
929                            let decomposition = left_different.trie_val;
930                            if (decomposition
931                                & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER))
932                                == 0
933                            {
934                                left_c = left_different.character();
935                            } else if ((decomposition & HIGH_ZEROS_MASK) != 0)
936                                && ((decomposition & LOW_ZEROS_MASK) != 0)
937                            {
938                                // Decomposition into two BMP characters: starter and non-starter
939                                // Let's take the starter
940                                left_c = char_from_u32(decomposition & 0x7FFF);
941                            } else if decomposition == HANGUL_SYLLABLE_MARKER {
942                                left_c = '\u{0}';
943                                left_ce32 = FFFD_CE32;
944                            } else {
945                                break;
946                            }
947
948                            let decomposition = right_different.trie_val;
949                            if (decomposition
950                                & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER))
951                                == 0
952                            {
953                                right_c = right_different.character();
954                            } else if ((decomposition & HIGH_ZEROS_MASK) != 0)
955                                && ((decomposition & LOW_ZEROS_MASK) != 0)
956                            {
957                                // Decomposition into two BMP characters: starter and non-starter
958                                // Let's take the starter
959                                right_c = char_from_u32(decomposition & 0x7FFF);
960                            } else if decomposition == HANGUL_SYLLABLE_MARKER {
961                                right_c = '\u{0}';
962                                right_ce32 = FFFD_CE32;
963                            } else {
964                                break;
965                            }
966
967                            // The last character of the prefix is OK on the normalization
968                            // level. Now let's check its ce32 unless it's a Hangul syllable,
969                            // in which case `head_last_ce32` already is a non-default placeholder.
970                            if head_last_ce32 == CollationElement32::default() {
971                                head_last_ce32 = tailoring.ce32_for_char(head_last_c);
972                                if head_last_ce32 == FALLBACK_CE32 {
973                                    head_last_ce32 = self.root.ce32_for_char(head_last_c);
974                                }
975                                if head_last_ce32.tag_checked() == Some(Tag::Contraction)
976                                    && head_last_ce32.at_least_one_suffix_contains_starter()
977                                {
978                                    break;
979                                }
980                            }
981                            // The first character of each suffix is OK on the normalization
982                            // level. Now let's check their ce32s unless they are Hangul syllables.
983                            if left_ce32 == CollationElement32::default() {
984                                left_ce32 = tailoring.ce32_for_char(left_c);
985                                if left_ce32 == FALLBACK_CE32 {
986                                    left_ce32 = self.root.ce32_for_char(left_c);
987                                }
988                                if left_ce32.tag_checked() == Some(Tag::Prefix) {
989                                    break;
990                                }
991                            }
992                            if right_ce32 == CollationElement32::default() {
993                                right_ce32 = tailoring.ce32_for_char(right_c);
994                                if right_ce32 == FALLBACK_CE32 {
995                                    right_ce32 = self.root.ce32_for_char(right_c);
996                                }
997                                if right_ce32.tag_checked() == Some(Tag::Prefix) {
998                                    break;
999                                }
1000                            }
1001                            if self.options.numeric()
1002                                && head_last_ce32.tag_checked() == Some(Tag::Digit)
1003                                && (left_ce32.tag_checked() == Some(Tag::Digit)
1004                                    || right_ce32.tag_checked() == Some(Tag::Digit))
1005                            {
1006                                break;
1007                            }
1008                            // We are at a good boundary!
1009                            break 'prefix;
1010                        }
1011                    }
1012                }
1013                let mut tail_first_c;
1014                let mut tail_first_ce32;
1015                let mut tail_first_ok;
1016                loop {
1017                    // Take a step back.
1018                    left.prepend_upcoming_before_init(head_last.clone());
1019                    right.prepend_upcoming_before_init(head_last.clone());
1020
1021                    tail_first_c = head_last_c;
1022                    tail_first_ce32 = head_last_ce32;
1023                    tail_first_ok = head_last_ok;
1024
1025                    head_last_c = if let Some(head_last_c) = head_chars.next_back() {
1026                        head_last_c
1027                    } else {
1028                        // We need to step back beyond the start of the prefix.
1029                        // Treat as good boundary.
1030                        break 'prefix;
1031                    };
1032                    let decomposition = norm_trie.get(head_last_c);
1033                    head_last = CharacterAndClassAndTrieValue::new_with_trie_val(
1034                        head_last_c,
1035                        decomposition,
1036                    );
1037                    head_last_ce32 = CollationElement32::default();
1038                    head_last_ok = false;
1039
1040                    if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 {
1041                        // Intentionally empty block to keep
1042                        // the same structure as in the cases
1043                        // where something happens here.
1044                    } else if ((decomposition & HIGH_ZEROS_MASK) != 0)
1045                        && ((decomposition & LOW_ZEROS_MASK) != 0)
1046                    {
1047                        // Decomposition into two BMP characters: starter and non-starter
1048                        // Let's take the starter
1049                        head_last_c = char_from_u32(decomposition & 0x7FFF);
1050                    } else if decomposition == HANGUL_SYLLABLE_MARKER {
1051                        head_last_ce32 = FFFD_CE32;
1052                    } else {
1053                        continue;
1054                    }
1055                    head_last_ok = true;
1056                    if !tail_first_ok {
1057                        continue;
1058                    }
1059                    // The last character of the prefix is OK on the normalization
1060                    // level. Now let's check its ce32 unless it's a Hangul syllable.
1061                    if head_last_ce32 == CollationElement32::default() {
1062                        head_last_ce32 = tailoring.ce32_for_char(head_last_c);
1063                        if head_last_ce32 == FALLBACK_CE32 {
1064                            head_last_ce32 = self.root.ce32_for_char(head_last_c);
1065                        }
1066                        if head_last_ce32.tag_checked() == Some(Tag::Contraction)
1067                            && head_last_ce32.at_least_one_suffix_contains_starter()
1068                        {
1069                            continue;
1070                        }
1071                    }
1072                    // Check this _after_ `head_last_ce32` to make sure
1073                    // `head_last_ce32` is initialized for the next loop round
1074                    // trip if applicable.
1075                    if tail_first_ce32 == CollationElement32::default() {
1076                        tail_first_ce32 = tailoring.ce32_for_char(tail_first_c);
1077                        if tail_first_ce32 == FALLBACK_CE32 {
1078                            tail_first_ce32 = self.root.ce32_for_char(tail_first_c);
1079                        }
1080                    } // else we already have a trie value from the previous loop iteration or we have Hangul syllable
1081                    if tail_first_ce32.tag_checked() == Some(Tag::Prefix) {
1082                        continue;
1083                    }
1084                    if self.options.numeric()
1085                        && head_last_ce32.tag_checked() == Some(Tag::Digit)
1086                        && tail_first_ce32.tag_checked() == Some(Tag::Digit)
1087                    {
1088                        continue;
1089                    }
1090                    // We are at a good boundary!
1091                    break 'prefix;
1092                }
1093            } else {
1094                // The prefix is empty
1095                break 'prefix;
1096            }
1097            // Unreachable line
1098        }
1099
1100        // End identical prefix
1101
1102        left.init();
1103        right.init();
1104
1105        loop {
1106            let mut left_primary;
1107            'left_primary_loop: loop {
1108                let ce = left.next();
1109                left_primary = ce.primary();
1110                // TODO(#2008): Consider compiling out the variable handling when we know we aren't
1111                // shifting variable CEs.
1112                if !(left_primary < variable_top && left_primary > MERGE_SEPARATOR_PRIMARY) {
1113                    left_ces.push(ce);
1114                } else {
1115                    // Variable CE, shift it to quaternary level.
1116                    // Ignore all following primary ignorables, and shift further variable CEs.
1117                    any_variable = true;
1118                    // Relative to ICU4C, the next line is hoisted out of the following loop
1119                    // in order to keep the variables called `ce` immutable to make it easier
1120                    // to reason about each assignment into `ce` resulting in exactly a single
1121                    // push into `left_ces`.
1122                    left_ces.push(ce.clone_with_non_primary_zeroed());
1123                    loop {
1124                        // This loop is simpler than in ICU4C; unlike in C++, we get to break by label.
1125                        let ce = left.next();
1126                        left_primary = ce.primary();
1127                        if left_primary != 0
1128                            && !(left_primary < variable_top
1129                                && left_primary > MERGE_SEPARATOR_PRIMARY)
1130                        {
1131                            // Neither a primary ignorable nor a variable CE.
1132                            left_ces.push(ce);
1133                            break 'left_primary_loop;
1134                        }
1135                        // If `left_primary == 0`, the following line ignores a primary-ignorable.
1136                        // Otherwise, it shifts a variable CE.
1137                        left_ces.push(ce.clone_with_non_primary_zeroed());
1138                    }
1139                }
1140                if left_primary != 0 {
1141                    break;
1142                }
1143            }
1144            let mut right_primary;
1145            'right_primary_loop: loop {
1146                let ce = right.next();
1147                right_primary = ce.primary();
1148                // TODO(#2008): Consider compiling out the variable handling when we know we aren't
1149                // shifting variable CEs.
1150                if !(right_primary < variable_top && right_primary > MERGE_SEPARATOR_PRIMARY) {
1151                    right_ces.push(ce);
1152                } else {
1153                    // Variable CE, shift it to quaternary level.
1154                    // Ignore all following primary ignorables, and shift further variable CEs.
1155                    any_variable = true;
1156                    // Relative to ICU4C, the next line is hoisted out of the following loop
1157                    // in order to keep the variables called `ce` immutable to make it easier
1158                    // to reason about each assignment into `ce` resulting in exactly a single
1159                    // push into `right_ces`.
1160                    right_ces.push(ce.clone_with_non_primary_zeroed());
1161                    loop {
1162                        // This loop is simpler than in ICU4C; unlike in C++, we get to break by label.
1163                        let ce = right.next();
1164                        right_primary = ce.primary();
1165                        if right_primary != 0
1166                            && !(right_primary < variable_top
1167                                && right_primary > MERGE_SEPARATOR_PRIMARY)
1168                        {
1169                            // Neither a primary ignorable nor a variable CE.
1170                            right_ces.push(ce);
1171                            break 'right_primary_loop;
1172                        }
1173                        // If `right_primary == 0`, the following line ignores a primary-ignorable.
1174                        // Otherwise, it shifts a variable CE.
1175                        right_ces.push(ce.clone_with_non_primary_zeroed());
1176                    }
1177                }
1178                if right_primary != 0 {
1179                    break;
1180                }
1181            }
1182            if left_primary != right_primary {
1183                if let Some(reordering) = &self.reordering {
1184                    left_primary = reordering.reorder(left_primary);
1185                    right_primary = reordering.reorder(right_primary);
1186                }
1187                if left_primary < right_primary {
1188                    return Ordering::Less;
1189                }
1190                return Ordering::Greater;
1191            }
1192            if left_primary == NO_CE_PRIMARY {
1193                break;
1194            }
1195        }
1196
1197        // Sadly, we end up pushing the sentinel value, which means these
1198        // `SmallVec`s allocate more often than if we didn't actually
1199        // store the sentinel.
1200        debug_assert_eq!(left_ces.last(), Some(&NO_CE));
1201        debug_assert_eq!(right_ces.last(), Some(&NO_CE));
1202
1203        // Note: `unwrap_or_default` in the iterations below should never
1204        // actually end up using the "_or_default" part, because the sentinel
1205        // is in the `SmallVec`s. These could be changed to `unwrap()` if we
1206        // preferred panic in case of a bug.
1207        // TODO(#2009): Should we save one slot by not putting the sentinel in
1208        // the `SmallVec`s? So far, the answer seems "no", as it would complicate
1209        // the primary comparison above.
1210
1211        // Compare the buffered secondary & tertiary weights.
1212        // We might skip the secondary level but continue with the case level
1213        // which is turned on separately.
1214        if self.options.strength() >= Strength::Secondary {
1215            if !self.options.backward_second_level() {
1216                let mut left_iter = left_ces.iter();
1217                let mut right_iter = right_ces.iter();
1218                let mut left_secondary;
1219                let mut right_secondary;
1220                loop {
1221                    loop {
1222                        left_secondary = left_iter.next().unwrap_or_default().secondary();
1223                        if left_secondary != 0 {
1224                            break;
1225                        }
1226                    }
1227                    loop {
1228                        right_secondary = right_iter.next().unwrap_or_default().secondary();
1229                        if right_secondary != 0 {
1230                            break;
1231                        }
1232                    }
1233                    if left_secondary != right_secondary {
1234                        if left_secondary < right_secondary {
1235                            return Ordering::Less;
1236                        }
1237                        return Ordering::Greater;
1238                    }
1239                    if left_secondary == NO_CE_SECONDARY {
1240                        break;
1241                    }
1242                }
1243            } else {
1244                let mut left_remaining = &left_ces[..];
1245                let mut right_remaining = &right_ces[..];
1246                loop {
1247                    if left_remaining.is_empty() {
1248                        debug_assert!(right_remaining.is_empty());
1249                        break;
1250                    }
1251                    let (left_prefix, right_prefix) = {
1252                        let mut left_iter = left_remaining.iter();
1253                        loop {
1254                            let left_primary = left_iter.next().unwrap_or_default().primary();
1255                            if left_primary != 0 && left_primary <= MERGE_SEPARATOR_PRIMARY {
1256                                break;
1257                            }
1258                            debug_assert_ne!(left_primary, NO_CE_PRIMARY);
1259                        }
1260                        let left_new_remaining = left_iter.as_slice();
1261                        // Index in range by construction
1262                        #[allow(clippy::indexing_slicing)]
1263                        let left_prefix =
1264                            &left_remaining[..left_remaining.len() - 1 - left_new_remaining.len()];
1265                        left_remaining = left_new_remaining;
1266
1267                        let mut right_iter = right_remaining.iter();
1268                        loop {
1269                            let right_primary = right_iter.next().unwrap_or_default().primary();
1270                            if right_primary != 0 && right_primary <= MERGE_SEPARATOR_PRIMARY {
1271                                break;
1272                            }
1273                            debug_assert_ne!(right_primary, NO_CE_PRIMARY);
1274                        }
1275                        let right_new_remaining = right_iter.as_slice();
1276                        // Index in range by construction
1277                        #[allow(clippy::indexing_slicing)]
1278                        let right_prefix = &right_remaining
1279                            [..right_remaining.len() - 1 - right_new_remaining.len()];
1280                        right_remaining = right_new_remaining;
1281
1282                        (left_prefix, right_prefix)
1283                    };
1284                    let mut left_iter = left_prefix.iter();
1285                    let mut right_iter = right_prefix.iter();
1286
1287                    let mut left_secondary;
1288                    let mut right_secondary;
1289                    loop {
1290                        loop {
1291                            left_secondary = left_iter.next_back().unwrap_or_default().secondary();
1292                            if left_secondary != 0 {
1293                                break;
1294                            }
1295                        }
1296                        loop {
1297                            right_secondary =
1298                                right_iter.next_back().unwrap_or_default().secondary();
1299                            if right_secondary != 0 {
1300                                break;
1301                            }
1302                        }
1303                        if left_secondary != right_secondary {
1304                            if left_secondary < right_secondary {
1305                                return Ordering::Less;
1306                            }
1307                            return Ordering::Greater;
1308                        }
1309                        if left_secondary == NO_CE_SECONDARY {
1310                            break;
1311                        }
1312                    }
1313                }
1314            }
1315        }
1316
1317        if self.options.case_level() {
1318            if self.options.strength() == Strength::Primary {
1319                // Primary+caseLevel: Ignore case level weights of primary ignorables.
1320                // Otherwise we would get a-umlaut > a
1321                // which is not desirable for accent-insensitive sorting.
1322                // Check for (lower 32 bits) == 0 as well because variable CEs are stored
1323                // with only primary weights.
1324                let mut left_non_primary;
1325                let mut right_non_primary;
1326                let mut left_case;
1327                let mut right_case;
1328                let mut left_iter = left_ces.iter();
1329                let mut right_iter = right_ces.iter();
1330                loop {
1331                    loop {
1332                        let ce = left_iter.next().unwrap_or_default();
1333                        left_non_primary = ce.non_primary();
1334                        if !ce.either_half_zero() {
1335                            break;
1336                        }
1337                    }
1338                    left_case = left_non_primary.case();
1339                    loop {
1340                        let ce = right_iter.next().unwrap_or_default();
1341                        right_non_primary = ce.non_primary();
1342                        if !ce.either_half_zero() {
1343                            break;
1344                        }
1345                    }
1346                    right_case = right_non_primary.case();
1347                    // No need to handle NO_CE and MERGE_SEPARATOR specially:
1348                    // There is one case weight for each previous-level weight,
1349                    // so level length differences were handled there.
1350                    if left_case != right_case {
1351                        if !self.options.upper_first() {
1352                            if left_case < right_case {
1353                                return Ordering::Less;
1354                            }
1355                            return Ordering::Greater;
1356                        }
1357                        if left_case < right_case {
1358                            return Ordering::Greater;
1359                        }
1360                        return Ordering::Less;
1361                    }
1362                    if left_non_primary.secondary() == NO_CE_SECONDARY {
1363                        break;
1364                    }
1365                }
1366            } else {
1367                // Secondary+caseLevel: By analogy with the above,
1368                // ignore case level weights of secondary ignorables.
1369                //
1370                // Note: A tertiary CE has uppercase case bits (0.0.ut)
1371                // to keep tertiary+caseFirst well-formed.
1372                //
1373                // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
1374                // Otherwise a tertiary CE's uppercase would be no greater than
1375                // a primary/secondary CE's uppercase.
1376                // (See UCA well-formedness condition 2.)
1377                // We could construct a special case weight higher than uppercase,
1378                // but it's simpler to always ignore case weights of secondary ignorables,
1379                // turning 0.0.ut into 0.0.0.t.
1380                // (See LDML Collation, Case Parameters.)
1381                let mut left_non_primary;
1382                let mut right_non_primary;
1383                let mut left_case;
1384                let mut right_case;
1385                let mut left_iter = left_ces.iter();
1386                let mut right_iter = right_ces.iter();
1387                loop {
1388                    loop {
1389                        left_non_primary = left_iter.next().unwrap_or_default().non_primary();
1390                        if left_non_primary.secondary() != 0 {
1391                            break;
1392                        }
1393                    }
1394                    left_case = left_non_primary.case();
1395                    loop {
1396                        right_non_primary = right_iter.next().unwrap_or_default().non_primary();
1397                        if right_non_primary.secondary() != 0 {
1398                            break;
1399                        }
1400                    }
1401                    right_case = right_non_primary.case();
1402                    // No need to handle NO_CE and MERGE_SEPARATOR specially:
1403                    // There is one case weight for each previous-level weight,
1404                    // so level length differences were handled there.
1405                    if left_case != right_case {
1406                        if !self.options.upper_first() {
1407                            if left_case < right_case {
1408                                return Ordering::Less;
1409                            }
1410                            return Ordering::Greater;
1411                        }
1412                        if left_case < right_case {
1413                            return Ordering::Greater;
1414                        }
1415                        return Ordering::Less;
1416                    }
1417                    if left_non_primary.secondary() == NO_CE_SECONDARY {
1418                        break;
1419                    }
1420                }
1421            }
1422        }
1423
1424        if let Some(tertiary_mask) = self.options.tertiary_mask() {
1425            let mut any_quaternaries = AnyQuaternaryAccumulator::new();
1426            let mut left_iter = left_ces.iter();
1427            let mut right_iter = right_ces.iter();
1428            loop {
1429                let mut left_non_primary;
1430                let mut left_tertiary;
1431                loop {
1432                    left_non_primary = left_iter.next().unwrap_or_default().non_primary();
1433                    any_quaternaries.accumulate(left_non_primary);
1434                    debug_assert!(
1435                        left_non_primary.tertiary() != 0 || left_non_primary.case_quaternary() == 0
1436                    );
1437                    left_tertiary = left_non_primary.tertiary_case_quarternary(tertiary_mask);
1438                    if left_tertiary != 0 {
1439                        break;
1440                    }
1441                }
1442
1443                let mut right_non_primary;
1444                let mut right_tertiary;
1445                loop {
1446                    right_non_primary = right_iter.next().unwrap_or_default().non_primary();
1447                    any_quaternaries.accumulate(right_non_primary);
1448                    debug_assert!(
1449                        right_non_primary.tertiary() != 0
1450                            || right_non_primary.case_quaternary() == 0
1451                    );
1452                    right_tertiary = right_non_primary.tertiary_case_quarternary(tertiary_mask);
1453                    if right_tertiary != 0 {
1454                        break;
1455                    }
1456                }
1457
1458                if left_tertiary != right_tertiary {
1459                    if self.options.upper_first() {
1460                        // Pass through NO_CE and keep real tertiary weights larger than that.
1461                        // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
1462                        // to keep tertiary CEs well-formed.
1463                        // Their case+tertiary weights must be greater than those of
1464                        // primary and secondary CEs.
1465                        // Magic numbers from ICU4C.
1466                        if left_tertiary > NO_CE_TERTIARY {
1467                            if left_non_primary.secondary() != 0 {
1468                                left_tertiary ^= 0xC000;
1469                            } else {
1470                                left_tertiary += 0x4000;
1471                            }
1472                        }
1473                        if right_tertiary > NO_CE_TERTIARY {
1474                            if right_non_primary.secondary() != 0 {
1475                                right_tertiary ^= 0xC000;
1476                            } else {
1477                                right_tertiary += 0x4000;
1478                            }
1479                        }
1480                    }
1481                    if left_tertiary < right_tertiary {
1482                        return Ordering::Less;
1483                    }
1484                    return Ordering::Greater;
1485                }
1486
1487                if left_tertiary == NO_CE_TERTIARY {
1488                    break;
1489                }
1490            }
1491            if !any_variable && !any_quaternaries.has_quaternary() {
1492                return Ordering::Equal;
1493            }
1494        } else {
1495            return Ordering::Equal;
1496        }
1497
1498        if self.options.strength() <= Strength::Tertiary {
1499            return Ordering::Equal;
1500        }
1501
1502        let mut left_iter = left_ces.iter();
1503        let mut right_iter = right_ces.iter();
1504        loop {
1505            let mut left_quaternary;
1506            loop {
1507                let ce = left_iter.next().unwrap_or_default();
1508                if ce.tertiary_ignorable() {
1509                    left_quaternary = ce.primary();
1510                } else {
1511                    left_quaternary = ce.quaternary();
1512                }
1513                if left_quaternary != 0 {
1514                    break;
1515                }
1516            }
1517            let mut right_quaternary;
1518            loop {
1519                let ce = right_iter.next().unwrap_or_default();
1520                if ce.tertiary_ignorable() {
1521                    right_quaternary = ce.primary();
1522                } else {
1523                    right_quaternary = ce.quaternary();
1524                }
1525                if right_quaternary != 0 {
1526                    break;
1527                }
1528            }
1529            if left_quaternary != right_quaternary {
1530                if let Some(reordering) = &self.reordering {
1531                    left_quaternary = reordering.reorder(left_quaternary);
1532                    right_quaternary = reordering.reorder(right_quaternary);
1533                }
1534                if left_quaternary < right_quaternary {
1535                    return Ordering::Less;
1536                }
1537                return Ordering::Greater;
1538            }
1539            if left_quaternary == NO_CE_PRIMARY {
1540                break;
1541            }
1542        }
1543
1544        Ordering::Equal
1545    }
1546}