isolang/lib.rs
1//! ISO 639 language codes
2//!
3//! When dealing with different language inputs and APIs, different standards are used to identify
4//! a language. Converting between these in an automated way can be tedious. This crate provides an
5//! enum which supports conversion from 639-1 and 639-3 and also into these formats, as well as
6//! into their names. The English name can be retrieved using
7//! [`Language::to_name()`](enum.Language.html#method.to_name) if compiled with the `english_names`
8//! feature.
9//! The autonyms (local names) can be retrieved using
10//! [`to_autonym()`](enum.Language.html#method.to_autonym) if compiled with the `local_names`
11//! feature.
12//!
//! The language table is compiled into the library. While this increases the binary size, it means
//! that no additional time is wasted on program startup or on table access for allocating or
//! filling the map. It is hence suitable for retrieval of codes in constrained environments.
16//!
17//! # Examples
18//!
19//! ```
20//! use isolang::Language;
21//! #[cfg(feature = "english_names")]
22//! assert_eq!(Language::from_639_1("de").unwrap().to_name(), "German");
23//! #[cfg(feature = "local_names")]
24//! assert_eq!(Language::from_639_1("de").unwrap().to_autonym(), Some("Deutsch"));
25//!
26//! assert_eq!(Language::from_639_3("spa").unwrap().to_639_1(), Some("es"));
27//!
28//! #[cfg(feature = "list_languages")]
29//! {
30//! // Filter languages with a ISO 639-1 code
31//! let languages = isolang::languages();
32//! let languages_with_iso_639_1 = languages.filter(|language| language.to_639_1().is_some());
33//! for language in languages_with_iso_639_1 {
34//! assert_eq!(language.to_639_1().is_some(), true);
35//! }
36//! }
37//! ```
38
39#[cfg(feature = "serde")]
40mod serde_impl;
41
42extern crate phf;
43
44use std::{
45 error::Error,
46 fmt::{Debug, Display, Formatter},
47 str::{self, FromStr},
48};
49
/// One row of the compiled-in language table.
///
/// The values originate from `iso-639-3.tab` and `iso639-autonyms.tsv`. The rows themselves live
/// in `src/isotable.rs`, which is produced by the `generated_code_is_fresh()` integration test.
struct LanguageData {
    /// Three-letter ISO 639-3 code, taken from the `Id` column of `iso-639-3.tab`.
    code_3: [u8; 3],
    /// Two-letter ISO 639-1 code from the `Part1` column of `iso-639-3.tab`; `None` when the
    /// language has no such code.
    code_1: Option<[u8; 2]>,
    /// English name from the `Ref_Name` column of `iso-639-3.tab`.
    ///
    /// Any parenthesized suffix is stripped by the code generator.
    #[cfg(feature = "english_names")]
    name_en: &'static str,
    /// Name of the language in the language itself, from the `autonym` column of
    /// `iso639-autonyms.tsv`; `None` when no autonym is recorded.
    #[cfg(feature = "local_names")]
    autonym: Option<&'static str>,
}
68
69#[rustfmt::skip]
70mod isotable;
71pub use isotable::Language;
72use isotable::{OVERVIEW, THREE_TO_THREE, TWO_TO_THREE};
73
/// Iterate over every known language.
///
/// The iterator walks the compiled-in table in table order and yields one
/// [`Language`](enum.Language.html) per row. The function is only compiled with the
/// `list_languages` feature (or in test builds).
///
/// # Examples
///
/// ```
/// let languages = isolang::languages();
///
/// // Display ISO 639-3 code of every language
/// for language in languages {
///     println!("{}", language.to_639_3());
/// }
///
/// // Filter languages with an ISO 639-1 code
/// let languages = isolang::languages();
/// let languages_with_iso_639_1 = languages.filter(|language| language.to_639_1().is_some());
/// for language in languages_with_iso_639_1 {
///     assert_eq!(language.to_639_1().is_some(), true);
/// }
/// ```
#[cfg(any(feature = "list_languages", test))]
pub fn languages() -> impl Iterator<Item = Language> {
    // Only the row index matters here, so iterate over the index range directly.
    (0..OVERVIEW.len()).filter_map(Language::from_usize)
}
100
101impl Language {
102 /// Create string representation of this Language as a ISO 639-3 code.
103 ///
104 /// This method will return the ISO 639-3 code, which consists of three letters.
105 ///
106 /// # Example
107 ///
108 /// ```
109 /// use isolang::Language;
110 ///
111 /// assert_eq!(Language::Deu.to_639_3(), "deu");
112 /// ```
113 pub fn to_639_3(&self) -> &'static str {
114 // SAFETY: The ISO 639 table has been written to the binary with UTF-8 encoding, hence reading it without checks is safe.
115 unsafe { str::from_utf8_unchecked(&OVERVIEW[*self as usize].code_3) }
116 }
117
118 /// Create two-letter ISO 639-1 representation of the language.
119 ///
120 /// This will return a two-letter ISO 639-1 code, if it exists and None otherwise.
121 /// ISO 639-1 codes are only used for the most common languages.
122 ///
123 /// # Example
124 ///
125 /// ```
126 /// use isolang::Language;
127 ///
128 /// assert!(Language::Gha.to_639_1().is_none());
129 /// ```
130 pub fn to_639_1(&self) -> Option<&'static str> {
131 // SAFETY: The global state is initialised at load time and filled at compile-time. The
132 // access happens read-only.
133 unsafe {
134 OVERVIEW[*self as usize]
135 .code_1
136 .as_ref()
137 .map(|s| str::from_utf8_unchecked(s))
138 }
139 }
140
141 /// Get the English name of this language.
142 ///
143 /// This returns the English name of the language, as defined in the ISO 639 standard. It does
144 /// not include additional comments, e.g. classification of a macrolanguage, etc. It is
145 /// available if compiled with the `english_names` feature.
146 ///
147 /// # Examples
148 ///
149 /// ```rust
150 /// use isolang::Language;
151 ///
152 /// assert_eq!(Language::Spa.to_name(), "Spanish");
153 /// // macro language
154 /// assert_eq!(Language::Swa.to_name(), "Swahili");
155 /// // individual language
156 /// assert_eq!(Language::Swh.to_name(), "Swahili");
157 /// ```
158 #[cfg(feature = "english_names")]
159 pub fn to_name(&self) -> &'static str {
160 OVERVIEW[*self as usize].name_en
161 }
162
163 /// Get the ISO code by its English name.
164 ///
165 /// This returns the ISO code by the given English name of the language string, as defined in
166 /// the ISO 639 standard. It does not include additional comments, e.g. classification of a
167 /// macrolanguage, etc. Only available if compiled with the `english_names` feature.
168 ///
169 /// # Examples
170 ///
171 /// ```rust
172 /// use isolang::Language;
173 ///
174 /// assert_eq!(Language::from_name("Spanish"), Some(Language::Spa));
175 /// ```
176 #[cfg(feature = "english_names")]
177 pub fn from_name(engl_name: &str) -> Option<Self> {
178 OVERVIEW
179 .iter()
180 .enumerate()
181 .find(|(_, it)| it.name_en == engl_name)
182 .and_then(|(idx, _)| Language::from_usize(idx))
183 }
184
185 /// Get the ISO code by its lowercase English name.
186 ///
187 /// This returns the ISO code by the given lowercase English name of the language string, as defined in
188 /// the ISO 639 standard. It does not include additional comments, e.g. classification of a
189 /// macrolanguage, etc. Only available if compiled with the `lowercase_names` feature.
190 ///
191 /// # Examples
192 ///
193 /// ```rust
194 /// use isolang::Language;
195 /// let some_input_name = "spanish"; // maybe "Spanish"
196 /// assert_eq!(Language::from_name_lowercase(&some_input_name.to_ascii_lowercase()), Some(Language::Spa));
197 /// ```
198 #[cfg(all(feature = "english_names", feature = "lowercase_names"))]
199 pub fn from_name_lowercase(engl_name: &str) -> Option<Self> {
200 OVERVIEW
201 .iter()
202 .enumerate()
203 .find(|(_, it)| {
204 it.name_en.to_ascii_lowercase().as_str() == engl_name
205 })
206 .and_then(|(idx, _)| Language::from_usize(idx))
207 }
208
209 /// Get all matching ISO codes by a provided English name pattern.
210 ///
211 /// This returns the matching ISO codes for the provided matcher. The matcher matches all known
212 /// English language names.
213 ///
214 /// # Examples
215 ///
216 /// ```rust
217 /// use isolang::Language;
218 ///
219 /// assert!(Language::match_names(|lang| lang.contains("Engl")).count() > 1);
220 /// ```
221 #[cfg(feature = "english_names")]
222 pub fn match_names<F>(matcher: F) -> impl Iterator<Item = Self>
223 where
224 F: Fn(&str) -> bool + 'static,
225 {
226 OVERVIEW.iter().enumerate().filter_map(move |(idx, it)| {
227 match matcher(it.name_en) {
228 true => Language::from_usize(idx),
229 false => None,
230 }
231 })
232 }
233
234 /// Get the autonym of this language
235 ///
236 /// This returns the native language name (if there is one available). This method is available
237 /// if compiled with the `local_names` feature.
238 /// The database for those names is found here https://github.com/bbqsrc/iso639-autonyms
239 /// and it itself is a collection of several different datasets
240 ///
241 /// # Examples
242 ///
243 /// ```rust
244 /// use isolang::Language;
245 ///
246 /// assert_eq!(Language::Bul.to_autonym(), Some("български"));
247 /// assert_eq!(Language::Fra.to_autonym(), Some("français"));
248 /// ```
249 #[cfg(feature = "local_names")]
250 pub fn to_autonym(&self) -> Option<&'static str> {
251 OVERVIEW[*self as usize].autonym
252 }
253
254 /// Get the ISO code by its autonym (local language name).
255 ///
256 /// The result is `None` is the autonym wasn't found.
257 ///
258 /// # Examples
259 ///
260 /// ```rust
261 /// use isolang::Language;
262 ///
263 /// assert_eq!(Language::from_autonym("Deutsch"), Some(Language::Deu));
264 /// ```
265 #[cfg(feature = "local_names")]
266 pub fn from_autonym(autonym: &str) -> Option<Self> {
267 OVERVIEW
268 .iter()
269 .enumerate()
270 .find(|(_, it)| it.autonym == Some(autonym))
271 .and_then(|(idx, _)| Language::from_usize(idx))
272 }
273
274 /// Get all matching ISO codes by a provided autonym pattern.
275 ///
276 /// This returns the matching ISO codes for the provided matcher. It is evaluated against all
277 /// known autonyms (local language names).
278 ///
279 /// # Examples
280 ///
281 /// ```rust
282 /// use isolang::Language;
283 ///
284 /// assert_eq!(Language::match_autonyms(|lang| lang.contains("Deutsch")).count(), 1);
285 /// ```
286 #[cfg(feature = "local_names")]
287 pub fn match_autonyms<F>(matcher: F) -> impl Iterator<Item = Self>
288 where
289 F: Fn(&str) -> bool + 'static,
290 {
291 OVERVIEW.iter().enumerate().filter_map(move |(idx, it)| {
292 it.autonym.and_then(|autonym| match matcher(autonym) {
293 true => Language::from_usize(idx),
294 false => None,
295 })
296 })
297 }
298
299 /// Create a Language instance rom a ISO 639-1 code.
300 ///
301 /// This will return a Language instance if the given string is a valid two-letter language
302 /// code. For invalid inputs, None is returned.
303 ///
304 /// # Example
305 ///
306 /// ```
307 /// use isolang::Language;
308 ///
309 /// assert!(Language::from_639_1("de").is_some());
310 /// assert!(Language::from_639_1("…").is_none());
311 /// ```
312 pub fn from_639_1(code: &str) -> Option<Language> {
313 if code.len() != 2 {
314 return None;
315 }
316
317 TWO_TO_THREE
318 .get(code)
319 .copied()
320 .and_then(|raw_lang| Language::from_usize(raw_lang as usize))
321 }
322
323 /// Create a Language instance rom a ISO 639-3 code.
324 ///
325 /// This will return a Language instance if the given string is a valid three-letter language
326 /// code. For invalid inputs, None is returned.
327 ///
328 /// # Example
329 ///
330 /// ```
331 /// use isolang::Language;
332 ///
333 /// assert!(Language::from_639_3("dan").is_some());
334 /// assert!(Language::from_639_1("…").is_none());
335 /// ```
336 pub fn from_639_3(code: &str) -> Option<Language> {
337 if code.len() != 3 {
338 return None;
339 }
340
341 THREE_TO_THREE
342 .get(code)
343 .copied()
344 .and_then(|raw_lang| Language::from_usize(raw_lang as usize))
345 }
346
347 /// Parse language from given locale
348 ///
349 /// This parses a language from a given locale string, as used by UNIX-alike and other systems.
350 ///
351 /// # Example
352 ///
353 /// ```
354 /// use isolang::Language;
355 ///
356 /// assert!(Language::from_locale("de_DE.UTF-8") == Some(Language::Deu));
357 /// ```
358 pub fn from_locale(locale: &str) -> Option<Language> {
359 if locale.len() < 3 {
360 return None;
361 }
362 // use first bit of locale (before the _) to detect the language
363 locale.split('_').next().and_then(Language::from_639_1)
364 }
365}
366
367#[allow(clippy::derivable_impls)]
368impl Default for Language {
369 fn default() -> Self {
370 Language::Und
371 }
372}
373
374impl Debug for Language {
375 fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
376 write!(f, "{}", self.to_639_3())
377 }
378}
379
380impl Display for Language {
381 #[cfg(all(feature = "local_names", feature = "english_names"))]
382 fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
383 write!(
384 f,
385 "{} ({})",
386 self.to_name(),
387 self.to_autonym().unwrap_or("missing autonym")
388 )
389 }
390
391 #[cfg(all(feature = "local_names", not(feature = "english_names")))]
392 fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
393 write!(f, "{}", self.to_autonym().unwrap_or("missing autonym"))
394 }
395
396 #[cfg(all(not(feature = "local_names"), feature = "english_names"))]
397 fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
398 write!(f, "{}", self.to_name())
399 }
400
401 #[cfg(all(not(feature = "local_names"), not(feature = "english_names")))]
402 fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
403 write!(f, "{}", self.to_639_3())
404 }
405}
406
/// Error produced when a string cannot be parsed into a `Language`.
///
/// The wrapped `String` is the rejected input; it is echoed in the `Display` message.
#[derive(Debug)]
pub struct ParseLanguageError(String);

impl Display for ParseLanguageError {
    /// Writes a message that quotes the rejected input.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let Self(rejected) = self;
        write!(f, "'{rejected}' is not a valid ISO 639-1 or 639-3 code.")
    }
}

// No underlying cause is carried, so the default `Error` methods are sufficient.
impl Error for ParseLanguageError {}
417
418impl FromStr for Language {
419 type Err = ParseLanguageError;
420
421 #[cfg(any(
422 not(feature = "english_names"),
423 not(feature = "lowercase_names")
424 ))]
425 fn from_str(s: &str) -> Result<Self, ParseLanguageError> {
426 match Language::from_639_3(s).or_else(|| Language::from_639_1(s)) {
427 Some(l) => Ok(l),
428 None => Err(ParseLanguageError(s.to_owned())),
429 }
430 }
431
432 #[cfg(all(
433 feature = "english_names",
434 feature = "lowercase_names",
435 not(feature = "local_names")
436 ))]
437 fn from_str(s: &str) -> Result<Self, ParseLanguageError> {
438 match Language::from_639_3(s)
439 .or_else(|| Language::from_639_1(s))
440 .or_else(|| Language::from_name_lowercase(s))
441 {
442 Some(l) => Ok(l),
443 None => Err(ParseLanguageError(s.to_owned())),
444 }
445 }
446
447 #[cfg(all(
448 feature = "english_names",
449 feature = "lowercase_names",
450 feature = "local_names"
451 ))]
452 fn from_str(s: &str) -> Result<Self, ParseLanguageError> {
453 match Language::from_639_3(s)
454 .or_else(|| Language::from_639_1(s))
455 .or_else(|| Language::from_name_lowercase(s))
456 .or_else(|| Language::from_autonym(s))
457 {
458 Some(l) => Ok(l),
459 None => Err(ParseLanguageError(s.to_owned())),
460 }
461 }
462}
463
#[cfg(test)]
mod tests {
    use super::*;
    #[cfg(feature = "serde")]
    extern crate serde_json;
    use std::fmt::Write;

    /// Locales whose language part is not a known two-letter code must be rejected.
    #[test]
    fn invalid_locale_gives_none() {
        for &locale in &["foo", "deu_DEU.UTF-8", "___", "ää_öö.UTF-8"] {
            assert!(Language::from_locale(locale).is_none());
        }
    }

    /// Well-formed locales resolve to the language of their two-letter prefix.
    #[test]
    fn test_valid_locales_are_correctly_decoded() {
        let cases = [
            ("de_DE.UTF-8", Language::Deu),
            ("en_GB.UTF-8", Language::Eng),
        ];
        for &(locale, expected) in &cases {
            assert_eq!(Language::from_locale(locale), Some(expected));
        }
    }

    /// `Display` depends on the enabled name features; `Debug` is always the ISO 639-3 code.
    #[test]
    fn test_std_fmt() {
        let with_local = cfg!(feature = "local_names");
        let with_english = cfg!(feature = "english_names");
        let expected = match (with_local, with_english) {
            (true, true) => "German (Deutsch)",
            (true, false) => "Deutsch",
            (false, true) => "German",
            (false, false) => "deu",
        };

        let mut displayed = String::new();
        write!(displayed, "{}", Language::Deu).unwrap();
        assert_eq!(displayed, expected);

        let mut debugged = String::new();
        write!(debugged, "{:?}", Language::Deu).unwrap();
        assert_eq!(debugged, "deu");
    }

    /// Autonyms are reachable through a lookup by ISO 639-3 code.
    #[test]
    #[cfg(feature = "local_names")]
    fn test_iso639_3_to_autonym() {
        for &(code, autonym) in &[("bul", "български"), ("fra", "français")] {
            assert_eq!(
                Language::from_639_3(code).unwrap().to_autonym(),
                Some(autonym)
            );
        }
    }

    /// The default language is `Und`.
    #[test]
    fn test_default() {
        assert_eq!(Language::default(), Language::Und);
    }

    /// Every language serializes to its ISO 639-3 code and deserializes from both code forms.
    #[test]
    #[cfg(feature = "serde")]
    fn test_serde() {
        // Wrap a code in double quotes so it becomes a JSON string literal.
        fn to_json(code: &str) -> String {
            format!("\"{}\"", code)
        }

        // Deserialize `code` both from JSON text and from a JSON value.
        fn test_deserialize(language: Language, code: &str) {
            let from_text: Language =
                serde_json::from_str(&to_json(code)).unwrap();
            assert_eq!(from_text, language);

            let from_value: Language =
                serde_json::from_value(serde_json::json!(code)).unwrap();
            assert_eq!(from_value, language);
        }

        for language in languages() {
            let serialized = serde_json::to_string(&language).unwrap();
            assert_eq!(serialized, to_json(language.to_639_3()));

            test_deserialize(language, language.to_639_3());
            if let Some(code) = language.to_639_1() {
                test_deserialize(language, code)
            }

            let round_tripped: Language =
                serde_json::from_str(&serialized).unwrap();
            assert_eq!(round_tripped, language);
        }

        let unknown_code = serde_json::from_str::<Language>(&to_json("foo"))
            .map_err(|e| e.to_string());
        assert_eq!(
            unknown_code,
            Err("unknown variant `foo`, expected `any valid ISO 639-1 or 639-3 code` at line 1 column 5".to_string())
        );

        let wrong_type = serde_json::from_str::<Language>("123")
            .map_err(|e| e.to_string());
        assert_eq!(
            wrong_type,
            Err("invalid type: integer `123`, expected borrowed str or bytes at line 1 column 3".to_string())
        );
    }

    /// Languages are ordered; `Deu` sorts before `Fra`.
    #[test]
    fn test_ordering() {
        assert!(Language::Deu < Language::Fra);
        let french = Language::Fra;
        assert!(french <= Language::Fra);
    }

    /// Filtering for languages with an ISO 639-1 code keeps only those.
    #[test]
    #[cfg(feature = "list_languages")]
    fn test_good_language_filtering() {
        let mut with_code =
            languages().filter(|language| language.to_639_1().is_some());
        assert!(with_code.all(|language| language.to_639_1().is_some()));
    }

    /// Filtering for languages without an ISO 639-1 code keeps only those.
    #[test]
    #[cfg(feature = "list_languages")]
    fn test_wrong_language_filtering() {
        let mut without_code =
            languages().filter(|language| language.to_639_1().is_none());
        assert!(without_code.all(|language| language.to_639_1().is_none()));
    }

    /// Both code forms parse; unknown input is an error.
    #[test]
    fn test_from_str() {
        assert_eq!("deu".parse::<Language>().unwrap(), Language::Deu);
        assert_eq!("fr".parse::<Language>().unwrap(), Language::Fra);
        assert!("foo".parse::<Language>().is_err());
    }

    /// Name-based parsing is only active with the matching features and is case-sensitive.
    #[test]
    #[cfg(feature = "english_names")]
    fn test_from_str_full_features() {
        for &code in &["es", "spa"] {
            assert_eq!(code.parse::<Language>().unwrap().to_name(), "Spanish");
        }

        let lowercase_names = cfg!(feature = "lowercase_names");
        if lowercase_names {
            assert_eq!(
                "spanish".parse::<Language>().unwrap().to_name(),
                "Spanish"
            );
        }
        if lowercase_names && cfg!(feature = "local_names") {
            assert_eq!(
                "español".parse::<Language>().unwrap().to_name(),
                "Spanish"
            );
        }
        assert!("Spanish".parse::<Language>().is_err());
    }
}