icu_collator/
lib.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5// Various collation-related algorithms and constants in this file are
6// adapted from ICU4C and, therefore, are subject to the ICU license as
7// described in LICENSE.
8
9// https://github.com/unicode-org/icu4x/blob/main/documents/process/boilerplate.md#library-annotations
10#![cfg_attr(not(any(test, doc)), no_std)]
11#![cfg_attr(
12    not(test),
13    deny(
14        clippy::indexing_slicing,
15        clippy::unwrap_used,
16        clippy::expect_used,
17        clippy::panic,
18        clippy::exhaustive_structs,
19        clippy::exhaustive_enums,
20        clippy::trivially_copy_pass_by_ref,
21        missing_debug_implementations,
22    )
23)]
24#![warn(missing_docs)]
25
26//! Comparing strings according to language-dependent conventions.
27//!
28//! This module is published as its own crate ([`icu_collator`](https://docs.rs/icu_collator/latest/icu_collator/))
29//! and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
30//! `Collator` is the main structure of the component. It accepts a set of arguments
31//! which allow it to collect necessary data from the data provider, and once
32//! instantiated, can be used to compare strings.
33//!
34//! Refer to the ICU User Guide sections for Collation that give an
35//! [introduction](https://unicode-org.github.io/icu/userguide/collation/) and explain
36//! [basic concepts](https://unicode-org.github.io/icu/userguide/collation/concepts.html).
37//!
38//! # Examples
39//!
40//! As its most basic purpose, `Collator` offers locale-aware ordering:
41//!
42//! ```
43//! use core::cmp::Ordering;
44//! use icu::collator::{options::*, *};
45//! use icu::locale::locale;
46//!
47//! let mut options = CollatorOptions::default();
48//! options.strength = Some(Strength::Primary);
49//! let collator_es =
50//!     Collator::try_new(locale!("es-u-co-trad").into(), options).unwrap();
51//!
52//! // "pollo" > "polvo" in traditional Spanish
53//! assert_eq!(collator_es.compare("pollo", "polvo"), Ordering::Greater);
54//!
55//! let mut options = CollatorOptions::default();
56//! options.strength = Some(Strength::Primary);
57//! let collator_en = Collator::try_new(locale!("en").into(), options).unwrap();
58//!
59//! // "pollo" < "polvo" according to English rules
60//! assert_eq!(collator_en.compare("pollo", "polvo"), Ordering::Less);
61//! ```
62//!
63//! ## Examples of `CollatorOptions`
64//!
65//! The [`CollatorOptions`] struct configures specific custom behavior for the `Collator`.  See docs
66//! for [`CollatorOptions`] for more details.  Some basic descriptions and examples are below.
67//!
68//! ## Strength
69//!
70//! The degree of sensitivity in how to determine that strings are distinct.
71//!
72//! ```
73//! use core::cmp::Ordering;
74//! use icu::collator::{options::*, *};
75//!
76//! // Primary Level
77//!
78//! let mut options_l1 = CollatorOptions::default();
79//! options_l1.strength = Some(Strength::Primary);
80//! let collator_l1 =
81//!     Collator::try_new(Default::default(), options_l1).unwrap();
82//!
83//! assert_eq!(collator_l1.compare("a", "b"), Ordering::Less); // primary
84//! assert_eq!(collator_l1.compare("as", "às"), Ordering::Equal); // secondary
85//! assert_eq!(collator_l1.compare("às", "at"), Ordering::Less);
86//! assert_eq!(collator_l1.compare("ao", "Ao"), Ordering::Equal); // tertiary
87//! assert_eq!(collator_l1.compare("Ao", "aò"), Ordering::Equal);
88//! assert_eq!(collator_l1.compare("A", "Ⓐ"), Ordering::Equal);
89//!
90//! // Secondary Level
91//!
92//! let mut options_l2 = CollatorOptions::default();
93//! options_l2.strength = Some(Strength::Secondary);
94//! let collator_l2 =
95//!     Collator::try_new(Default::default(), options_l2).unwrap();
96//!
97//! assert_eq!(collator_l2.compare("a", "b"), Ordering::Less); // primary
98//! assert_eq!(collator_l2.compare("as", "às"), Ordering::Less); // secondary
99//! assert_eq!(collator_l2.compare("às", "at"), Ordering::Less);
100//! assert_eq!(collator_l2.compare("ao", "Ao"), Ordering::Equal); // tertiary
101//! assert_eq!(collator_l2.compare("Ao", "aò"), Ordering::Less);
102//! assert_eq!(collator_l2.compare("A", "Ⓐ"), Ordering::Equal);
103//!
104//! // Tertiary Level
105//!
106//! let mut options_l3 = CollatorOptions::default();
107//! options_l3.strength = Some(Strength::Tertiary);
108//! let collator_l3 =
109//!     Collator::try_new(Default::default(), options_l3).unwrap();
110//!
111//! assert_eq!(collator_l3.compare("a", "b"), Ordering::Less); // primary
112//! assert_eq!(collator_l3.compare("as", "às"), Ordering::Less); // secondary
113//! assert_eq!(collator_l3.compare("às", "at"), Ordering::Less);
114//! assert_eq!(collator_l3.compare("ao", "Ao"), Ordering::Less); // tertiary
115//! assert_eq!(collator_l3.compare("Ao", "aò"), Ordering::Less);
116//! assert_eq!(collator_l3.compare("A", "Ⓐ"), Ordering::Less);
117//! ```
118//!
119//! ## Alternate Handling
120//!
121//! Allows alternate handling for certain customized collation orderings, including the option to
122//! ignore the special handling for the strings of such customizations.  Specifically,
123//! alternate handling is used to control the handling of the so-called **variable** characters in the
124//! Unicode Collation Algorithm: whitespace, punctuation and symbols.
125//!
126//! Note that `AlternateHandling::ShiftTrimmed` and `AlternateHandling::Blanked` are
127//! unimplemented. The default is `AlternateHandling::NonIgnorable`, except
128//! for Thai, whose default is `AlternateHandling::Shifted`.
129//!
130//! ```
131//! use core::cmp::Ordering;
132//! use icu::collator::{*, options::*};
133//!
134//! // If alternate handling is set to `NonIgnorable`, then differences among
135//! // these characters are of the same importance as differences among letters.
136//!
137//! let mut options_3n = CollatorOptions::default();
138//! options_3n.strength = Some(Strength::Tertiary);
139//! options_3n.alternate_handling = Some(AlternateHandling::NonIgnorable);
140//! let collator_3n =
141//!     Collator::try_new(Default::default(), options_3n).unwrap();
142//!
143//! assert_eq!(collator_3n.compare("di Silva", "Di Silva"), Ordering::Less);
144//! assert_eq!(collator_3n.compare("Di Silva", "diSilva"), Ordering::Less);
145//! assert_eq!(collator_3n.compare("diSilva", "U.S.A."), Ordering::Less);
146//! assert_eq!(collator_3n.compare("U.S.A.", "USA"), Ordering::Less);
147//!
148//! // If alternate handling is set to `Shifted`, then these characters are of only minor
149//! // importance. The Shifted value is often used in combination with Strength
150//! // set to Quaternary.
151//!
152//! let mut options_3s = CollatorOptions::default();
153//! options_3s.strength = Some(Strength::Tertiary);
154//! options_3s.alternate_handling = Some(AlternateHandling::Shifted);
155//! let collator_3s =
156//!     Collator::try_new(Default::default(), options_3s).unwrap();
157//!
158//! assert_eq!(collator_3s.compare("di Silva", "diSilva"), Ordering::Equal);
159//! assert_eq!(collator_3s.compare("diSilva", "Di Silva"), Ordering::Less);
160//! assert_eq!(collator_3s.compare("Di Silva", "U.S.A."), Ordering::Less);
161//! assert_eq!(collator_3s.compare("U.S.A.", "USA"), Ordering::Equal);
162//!
163//! let mut options_4s = CollatorOptions::default();
164//! options_4s.strength = Some(Strength::Quaternary);
165//! options_4s.alternate_handling = Some(AlternateHandling::Shifted);
166//! let collator_4s =
167//!     Collator::try_new(Default::default(), options_4s).unwrap();
168//!
169//! assert_eq!(collator_4s.compare("di Silva", "diSilva"), Ordering::Less);
170//! assert_eq!(collator_4s.compare("diSilva", "Di Silva"), Ordering::Less);
171//! assert_eq!(collator_4s.compare("Di Silva", "U.S.A."), Ordering::Less);
172//! assert_eq!(collator_4s.compare("U.S.A.", "USA"), Ordering::Less);
173//! ```
174//!
175//! ## Case Level
176//!
177//! Whether to distinguish case in sorting even at a strength of primary or secondary,
178//! without having to raise the strength to tertiary just to enable case level differences.
179//!
180//! ```
181//! use core::cmp::Ordering;
182//! use icu::collator::{*, options::*};
183//!
184//! // Primary
185//!
186//! let mut options = CollatorOptions::default();
187//! options.strength = Some(Strength::Primary);
188//! options.case_level = Some(CaseLevel::Off);
189//! let primary =
190//!   Collator::try_new(Default::default(),
191//!                     options).unwrap();
192//!
193//! assert_eq!(primary.compare("ⓓⓔⓐⓛ", "DEAL"), Ordering::Equal);
194//! assert_eq!(primary.compare("dejavu", "dejAvu"), Ordering::Equal);
195//! assert_eq!(primary.compare("dejavu", "déjavu"), Ordering::Equal);
196//!
197//! // Primary with case level on
198//!
199//! options.strength = Some(Strength::Primary);
200//! options.case_level = Some(CaseLevel::On);
201//! let primary_and_case =
202//!   Collator::try_new(Default::default(),
203//!                     options).unwrap();
204//!
205//! assert_eq!(primary_and_case.compare("ⓓⓔⓐⓛ", "DEAL"), Ordering::Less);
206//! assert_eq!(primary_and_case.compare("dejavu", "dejAvu"), Ordering::Less);
207//! assert_eq!(primary_and_case.compare("dejavu", "déjavu"), Ordering::Equal);
208//!
209//! // Secondary with case level on
210//!
211//! options.strength = Some(Strength::Secondary);
212//! options.case_level = Some(CaseLevel::On);
213//! let secondary_and_case =
214//!   Collator::try_new(Default::default(),
215//!                     options).unwrap();
216//!
217//! assert_eq!(secondary_and_case.compare("ⓓⓔⓐⓛ", "DEAL"), Ordering::Less);
218//! assert_eq!(secondary_and_case.compare("dejavu", "dejAvu"), Ordering::Less);
219//! assert_eq!(secondary_and_case.compare("dejavu", "déjavu"), Ordering::Less);  // secondary difference
220//!
221//! // Tertiary
222//!
223//! options.strength = Some(Strength::Tertiary);
224//! options.case_level = Some(CaseLevel::Off);
225//! let tertiary =
226//!   Collator::try_new(Default::default(),
227//!                     options).unwrap();
228//!
229//! assert_eq!(tertiary.compare("ⓓⓔⓐⓛ", "DEAL"), Ordering::Less);
230//! assert_eq!(tertiary.compare("dejavu", "dejAvu"), Ordering::Less);
231//! assert_eq!(tertiary.compare("dejavu", "déjavu"), Ordering::Less);
232//! ```
233//!
234//!
235//! ## Backward second level
236//!
237//! Compare the second level in backward order. The default is `false` (off), except for Canadian
238//! French.
239//!
240//! ## Examples of `CollatorPreferences`
241//!
242//! The [`CollatorPreferences`] struct configures specific custom behavior for the `Collator`, like
243//! [`CollatorOptions`]. However, unlike `CollatorOptions`, this set of preferences can also be set
244//! implicitly by the locale. See docs for [`CollatorPreferences`] for more details.
245//! Some basic descriptions and examples are below.
246//!
247//! ## Case First
248//!
249//! Whether to swap the ordering of uppercase and lowercase.
250//!
251//! ```
252//! use core::cmp::Ordering;
253//! use icu::collator::preferences::*;
254//! use icu::collator::{options::*, *};
255//!
256//! // Case-first set to `False`: no special upper/lower ordering is requested.
257//!
258//! let mut prefs_no_case = CollatorPreferences::default();
259//! prefs_no_case.case_first = Some(CollationCaseFirst::False);
260//! let collator_no_case =
261//!     Collator::try_new(prefs_no_case, Default::default()).unwrap();
262//! assert_eq!(collator_no_case.compare("ab", "AB"), Ordering::Less);
263//!
264//! // Lowercase is less
265//!
266//! let mut prefs_lower_less = CollatorPreferences::default();
267//! prefs_lower_less.case_first = Some(CollationCaseFirst::Lower);
268//! let collator_lower_less =
269//!     Collator::try_new(prefs_lower_less, Default::default()).unwrap();
270//! assert_eq!(collator_lower_less.compare("ab", "AB"), Ordering::Less);
271//!
272//! // Uppercase is less
273//!
274//! let mut prefs_upper_less = CollatorPreferences::default();
275//! prefs_upper_less.case_first = Some(CollationCaseFirst::Upper);
276//! let collator_upper_less =
277//!     Collator::try_new(prefs_upper_less, Default::default()).unwrap();
278//! assert_eq!(collator_upper_less.compare("AB", "ab"), Ordering::Less);
279//! ```
280//!
281//! ## Numeric
282//!
283//! When set to `true` (on), any sequence of decimal
284//! digits is sorted at a primary level according to the
285//! numeric value.
286//!
287//! ```
288//! use core::cmp::Ordering;
289//! use icu::collator::preferences::*;
290//! use icu::collator::{options::*, *};
291//!
292//! // Numerical sorting off
293//!
294//! let mut prefs_num_off = CollatorPreferences::default();
295//! prefs_num_off.numeric_ordering = Some(CollationNumericOrdering::False);
296//! let collator_num_off =
297//!     Collator::try_new(prefs_num_off, Default::default()).unwrap();
298//! assert_eq!(collator_num_off.compare("a10b", "a2b"), Ordering::Less);
299//!
300//! // Numerical sorting on
301//!
302//! let mut prefs_num_on = CollatorPreferences::default();
303//! prefs_num_on.numeric_ordering = Some(CollationNumericOrdering::True);
304//! let collator_num_on =
305//!     Collator::try_new(prefs_num_on, Default::default()).unwrap();
306//! assert_eq!(collator_num_on.compare("a10b", "a2b"), Ordering::Greater);
307//! ```
308//!
309//! [`CollatorOptions`]: options::CollatorOptions
310
// Private module: it is not reachable from outside the crate by path, so the
// items it contributes to the public API are the `pub use comparison::...`
// re-exports further down.
311mod comparison;
// `docs` is only compiled when `cfg(doc)` is set, so it is absent from
// ordinary (non-documentation) builds.
312#[cfg(doc)]
313pub mod docs;
314
315// NOTE: The Pernosco debugger has special knowledge
316// of the `CharacterAndClass` struct inside the `elements`
317// module. Please do not change the crate-module-qualified
318// name of that struct without coordination.
319mod elements;
320
// Public submodules. `options` is where `CollatorOptions` lives (the crate
// docs above link it as `options::CollatorOptions`).
321pub mod options;
322pub mod provider;
323
// Surface the main types at the crate root; because `comparison` itself is
// private, these re-exports are the public paths for the three types.
324pub use comparison::Collator;
325pub use comparison::CollatorBorrowed;
326pub use comparison::CollatorPreferences;
327
328/// Locale preferences used by this crate
///
/// Every item in this module is a re-export of a Unicode extension keyword
/// type from `icu_locale_core`; nothing is defined here. The crate-level
/// examples assign `CollationCaseFirst` and `CollationNumericOrdering` values
/// to the `case_first` and `numeric_ordering` fields of `CollatorPreferences`.
329pub mod preferences {
    // Each re-export below carries its own doc comment followed by a
    // `#[doc = "\n"]` attribute; per the trailing note on those lines, the
    // attribute is there to prevent autoformatting, so keep the three
    // `pub use` statements separate rather than merging them.
330    /// **This is a reexport of a type in [`icu::locale`](icu_locale_core::preferences::extensions::unicode::keywords)**.
331    #[doc = "\n"] // prevent autoformatting
332    pub use icu_locale_core::preferences::extensions::unicode::keywords::CollationCaseFirst;
333    /// **This is a reexport of a type in [`icu::locale`](icu_locale_core::preferences::extensions::unicode::keywords)**.
334    #[doc = "\n"] // prevent autoformatting
335    pub use icu_locale_core::preferences::extensions::unicode::keywords::CollationNumericOrdering;
336    /// **This is a reexport of a type in [`icu::locale`](icu_locale_core::preferences::extensions::unicode::keywords)**.
337    #[doc = "\n"] // prevent autoformatting
338    pub use icu_locale_core::preferences::extensions::unicode::keywords::CollationType;
339}