isolang/
lib.rs

1//! ISO 639 language codes
2//!
3//! When dealing with different language inputs and APIs, different standards are used to identify
4//! a language. Converting between these in an automated way can be tedious. This crate provides an
5//! enum which supports conversion from 639-1 and 639-3 and also into these formats, as well as
6//! into their names. The English name can be retrieved using
7//! [`Language::to_name()`](enum.Language.html#method.to_name) if compiled with the `english_names`
8//! feature.
9//! The autonyms (local names) can be retrieved using
10//! [`to_autonym()`](enum.Language.html#method.to_autonym) if compiled with the `local_names`
11//! feature.
12//!
//! The language table is compiled into the library. While this increases the binary size, it means
//! that no additional time is wasted on program startup or on table access for allocating or
//! filling the map. It is hence suitable for retrieval of codes in constrained environments.
16//!
17//! # Examples
18//!
19//! ```
20//! use isolang::Language;
21//! #[cfg(feature = "english_names")]
22//! assert_eq!(Language::from_639_1("de").unwrap().to_name(), "German");
23//! #[cfg(feature = "local_names")]
24//! assert_eq!(Language::from_639_1("de").unwrap().to_autonym(), Some("Deutsch"));
25//!
26//! assert_eq!(Language::from_639_3("spa").unwrap().to_639_1(), Some("es"));
27//!
28//! #[cfg(feature = "list_languages")]
29//! {
30//!     // Filter languages with a ISO 639-1 code
31//!     let languages = isolang::languages();
32//!     let languages_with_iso_639_1 = languages.filter(|language| language.to_639_1().is_some());
33//!     for language in languages_with_iso_639_1 {
34//!         assert_eq!(language.to_639_1().is_some(), true);
35//!     }
36//! }
37//! ```
38
39#[cfg(feature = "serde")]
40mod serde_impl;
41
42extern crate phf;
43
44use std::{
45    error::Error,
46    fmt::{Debug, Display, Formatter},
47    str::{self, FromStr},
48};
49
/// Language data extracted from `iso-639-3.tab` and `iso639-autonyms.tsv`
///
/// Instances of this are generated in the `generated_code_is_fresh()` integration test,
/// which generates the code in `src/isotable.rs`.
///
/// One value describes one language. The methods on `Language` look a value up in the
/// `OVERVIEW` table, using the enum variant (cast to `usize`) as the index.
struct LanguageData {
    /// The ISO-639-3 3-letter language code (column `Id` in `iso-639-3.tab`)
    ///
    /// Stored as raw bytes. `Language::to_639_3()` reinterprets them as a `str` without
    /// validation, so these bytes must always be valid UTF-8.
    code_3: [u8; 3],
    /// The ISO-639-1 2-letter language code, if available (column `Part1` in `iso-639-3.tab`)
    ///
    /// Stored as raw bytes. `Language::to_639_1()` reinterprets them as a `str` without
    /// validation, so these bytes must always be valid UTF-8.
    code_1: Option<[u8; 2]>,
    /// The language's name in English (column `Ref_Name` in `iso-639-3.tab`)
    ///
    /// The code generator removes any parenthesized suffix from the name.
    ///
    /// Only present when the crate is compiled with the `english_names` feature.
    #[cfg(feature = "english_names")]
    name_en: &'static str,
    /// The language's name in its own language (column `autonym` in `iso639-autonyms.tsv`)
    ///
    /// `None` when no autonym is known for the language. Only present when the crate is
    /// compiled with the `local_names` feature.
    #[cfg(feature = "local_names")]
    autonym: Option<&'static str>,
}
68
69#[rustfmt::skip]
70mod isotable;
71pub use isotable::Language;
72use isotable::{OVERVIEW, THREE_TO_THREE, TWO_TO_THREE};
73
/// Iterate over every known language.
///
/// The returned iterator yields each variant of the [`Language`](enum.Language.html) enum, in the
/// order of the compiled-in language table. This function is available if compiled with the
/// `list_languages` feature.
///
/// # Examples
///
/// ```
/// // Display the ISO 639-3 code of every language
/// for language in isolang::languages() {
///     println!("{}", language.to_639_3());
/// }
///
/// // Keep only the languages that also have an ISO 639-1 code
/// let with_iso_639_1 = isolang::languages().filter(|language| language.to_639_1().is_some());
/// for language in with_iso_639_1 {
///     assert!(language.to_639_1().is_some());
/// }
/// ```
#[cfg(any(feature = "list_languages", test))]
pub fn languages() -> impl Iterator<Item = Language> {
    // Every index into the table maps to an enum variant; walking the index range avoids
    // touching the table entries themselves.
    (0..OVERVIEW.len()).filter_map(Language::from_usize)
}
100
101impl Language {
102    /// Create string representation of this Language as a ISO 639-3 code.
103    ///
104    /// This method will return the ISO 639-3 code, which consists of three letters.
105    ///
106    /// # Example
107    ///
108    /// ```
109    /// use isolang::Language;
110    ///
111    /// assert_eq!(Language::Deu.to_639_3(), "deu");
112    /// ```
113    pub fn to_639_3(&self) -> &'static str {
114        // SAFETY: The ISO 639 table has been written to the binary with UTF-8 encoding, hence reading it without checks is safe.
115        unsafe { str::from_utf8_unchecked(&OVERVIEW[*self as usize].code_3) }
116    }
117
118    /// Create two-letter ISO 639-1 representation of the language.
119    ///
120    /// This will return a two-letter ISO 639-1 code, if it exists and None otherwise.
121    /// ISO 639-1 codes are only used for the most common languages.
122    ///
123    /// # Example
124    ///
125    /// ```
126    /// use isolang::Language;
127    ///
128    /// assert!(Language::Gha.to_639_1().is_none());
129    /// ```
130    pub fn to_639_1(&self) -> Option<&'static str> {
131        // SAFETY: The global state is initialised at load time and filled at compile-time. The
132        // access happens read-only.
133        unsafe {
134            OVERVIEW[*self as usize]
135                .code_1
136                .as_ref()
137                .map(|s| str::from_utf8_unchecked(s))
138        }
139    }
140
141    /// Get the English name of this language.
142    ///
143    /// This returns the English name of the language, as defined in the ISO 639 standard. It does
144    /// not include additional comments, e.g. classification of a macrolanguage, etc. It is
145    /// available if compiled with the `english_names` feature.
146    ///
147    /// # Examples
148    ///
149    /// ```rust
150    /// use isolang::Language;
151    ///
152    /// assert_eq!(Language::Spa.to_name(), "Spanish");
153    /// // macro language
154    /// assert_eq!(Language::Swa.to_name(), "Swahili");
155    /// // individual language
156    /// assert_eq!(Language::Swh.to_name(), "Swahili");
157    /// ```
158    #[cfg(feature = "english_names")]
159    pub fn to_name(&self) -> &'static str {
160        OVERVIEW[*self as usize].name_en
161    }
162
163    /// Get the ISO code by its English name.
164    ///
165    /// This returns the ISO code by the given English name of the language string, as defined in
166    /// the ISO 639 standard. It does not include additional comments, e.g. classification of a
167    /// macrolanguage, etc. Only available if compiled with the `english_names` feature.
168    ///
169    /// # Examples
170    ///
171    /// ```rust
172    /// use isolang::Language;
173    ///
174    /// assert_eq!(Language::from_name("Spanish"), Some(Language::Spa));
175    /// ```
176    #[cfg(feature = "english_names")]
177    pub fn from_name(engl_name: &str) -> Option<Self> {
178        OVERVIEW
179            .iter()
180            .enumerate()
181            .find(|(_, it)| it.name_en == engl_name)
182            .and_then(|(idx, _)| Language::from_usize(idx))
183    }
184
185    /// Get the ISO code by its lowercase English name.
186    ///
187    /// This returns the ISO code by the given lowercase English name of the language string, as defined in
188    /// the ISO 639 standard. It does not include additional comments, e.g. classification of a
189    /// macrolanguage, etc. Only available if compiled with the `lowercase_names` feature.
190    ///
191    /// # Examples
192    ///
193    /// ```rust
194    /// use isolang::Language;
195    /// let some_input_name = "spanish"; // maybe "Spanish"
196    /// assert_eq!(Language::from_name_lowercase(&some_input_name.to_ascii_lowercase()), Some(Language::Spa));
197    /// ```
198    #[cfg(all(feature = "english_names", feature = "lowercase_names"))]
199    pub fn from_name_lowercase(engl_name: &str) -> Option<Self> {
200        OVERVIEW
201            .iter()
202            .enumerate()
203            .find(|(_, it)| {
204                it.name_en.to_ascii_lowercase().as_str() == engl_name
205            })
206            .and_then(|(idx, _)| Language::from_usize(idx))
207    }
208
209    /// Get all matching ISO codes by a provided English name pattern.
210    ///
211    /// This returns the matching ISO codes for the provided matcher. The matcher matches all known
212    /// English language names.
213    ///
214    /// # Examples
215    ///
216    /// ```rust
217    /// use isolang::Language;
218    ///
219    /// assert!(Language::match_names(|lang| lang.contains("Engl")).count() > 1);
220    /// ```
221    #[cfg(feature = "english_names")]
222    pub fn match_names<F>(matcher: F) -> impl Iterator<Item = Self>
223    where
224        F: Fn(&str) -> bool + 'static,
225    {
226        OVERVIEW.iter().enumerate().filter_map(move |(idx, it)| {
227            match matcher(it.name_en) {
228                true => Language::from_usize(idx),
229                false => None,
230            }
231        })
232    }
233
234    /// Get the autonym of this language
235    ///
236    /// This returns the native language name (if there is one available). This method is available
237    /// if compiled with the `local_names` feature.
238    /// The database for those names is found here https://github.com/bbqsrc/iso639-autonyms
239    /// and it itself is a collection of several different datasets
240    ///
241    /// # Examples
242    ///
243    /// ```rust
244    /// use isolang::Language;
245    ///
246    /// assert_eq!(Language::Bul.to_autonym(), Some("български"));
247    /// assert_eq!(Language::Fra.to_autonym(), Some("français"));
248    /// ```
249    #[cfg(feature = "local_names")]
250    pub fn to_autonym(&self) -> Option<&'static str> {
251        OVERVIEW[*self as usize].autonym
252    }
253
254    /// Get the ISO code by its autonym (local language name).
255    ///
256    /// The result is `None` is the autonym wasn't found.
257    ///
258    /// # Examples
259    ///
260    /// ```rust
261    /// use isolang::Language;
262    ///
263    /// assert_eq!(Language::from_autonym("Deutsch"), Some(Language::Deu));
264    /// ```
265    #[cfg(feature = "local_names")]
266    pub fn from_autonym(autonym: &str) -> Option<Self> {
267        OVERVIEW
268            .iter()
269            .enumerate()
270            .find(|(_, it)| it.autonym == Some(autonym))
271            .and_then(|(idx, _)| Language::from_usize(idx))
272    }
273
274    /// Get all matching ISO codes by a provided autonym pattern.
275    ///
276    /// This returns the matching ISO codes for the provided matcher. It is evaluated against all
277    /// known autonyms (local language names).
278    ///
279    /// # Examples
280    ///
281    /// ```rust
282    /// use isolang::Language;
283    ///
284    /// assert_eq!(Language::match_autonyms(|lang| lang.contains("Deutsch")).count(), 1);
285    /// ```
286    #[cfg(feature = "local_names")]
287    pub fn match_autonyms<F>(matcher: F) -> impl Iterator<Item = Self>
288    where
289        F: Fn(&str) -> bool + 'static,
290    {
291        OVERVIEW.iter().enumerate().filter_map(move |(idx, it)| {
292            it.autonym.and_then(|autonym| match matcher(autonym) {
293                true => Language::from_usize(idx),
294                false => None,
295            })
296        })
297    }
298
299    /// Create a Language instance rom a ISO 639-1 code.
300    ///
301    /// This will return a Language instance if the given string is a valid two-letter language
302    /// code. For invalid inputs, None is returned.
303    ///
304    /// # Example
305    ///
306    /// ```
307    /// use isolang::Language;
308    ///
309    /// assert!(Language::from_639_1("de").is_some());
310    /// assert!(Language::from_639_1("…").is_none());
311    /// ```
312    pub fn from_639_1(code: &str) -> Option<Language> {
313        if code.len() != 2 {
314            return None;
315        }
316
317        TWO_TO_THREE
318            .get(code)
319            .copied()
320            .and_then(|raw_lang| Language::from_usize(raw_lang as usize))
321    }
322
323    /// Create a Language instance rom a ISO 639-3 code.
324    ///
325    /// This will return a Language instance if the given string is a valid three-letter language
326    /// code. For invalid inputs, None is returned.
327    ///
328    /// # Example
329    ///
330    /// ```
331    /// use isolang::Language;
332    ///
333    /// assert!(Language::from_639_3("dan").is_some());
334    /// assert!(Language::from_639_1("…").is_none());
335    /// ```
336    pub fn from_639_3(code: &str) -> Option<Language> {
337        if code.len() != 3 {
338            return None;
339        }
340
341        THREE_TO_THREE
342            .get(code)
343            .copied()
344            .and_then(|raw_lang| Language::from_usize(raw_lang as usize))
345    }
346
347    /// Parse language from given locale
348    ///
349    /// This parses a language from a given locale string, as used by UNIX-alike and other systems.
350    ///
351    /// # Example
352    ///
353    /// ```
354    /// use isolang::Language;
355    ///
356    /// assert!(Language::from_locale("de_DE.UTF-8") == Some(Language::Deu));
357    /// ```
358    pub fn from_locale(locale: &str) -> Option<Language> {
359        if locale.len() < 3 {
360            return None;
361        }
362        // use first bit of locale (before the _) to detect the language
363        locale.split('_').next().and_then(Language::from_639_1)
364    }
365}
366
367#[allow(clippy::derivable_impls)]
368impl Default for Language {
369    fn default() -> Self {
370        Language::Und
371    }
372}
373
374impl Debug for Language {
375    fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
376        write!(f, "{}", self.to_639_3())
377    }
378}
379
380impl Display for Language {
381    #[cfg(all(feature = "local_names", feature = "english_names"))]
382    fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
383        write!(
384            f,
385            "{} ({})",
386            self.to_name(),
387            self.to_autonym().unwrap_or("missing autonym")
388        )
389    }
390
391    #[cfg(all(feature = "local_names", not(feature = "english_names")))]
392    fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
393        write!(f, "{}", self.to_autonym().unwrap_or("missing autonym"))
394    }
395
396    #[cfg(all(not(feature = "local_names"), feature = "english_names"))]
397    fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
398        write!(f, "{}", self.to_name())
399    }
400
401    #[cfg(all(not(feature = "local_names"), not(feature = "english_names")))]
402    fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
403        write!(f, "{}", self.to_639_3())
404    }
405}
406
/// Error returned when a string cannot be parsed into a `Language`.
///
/// The error carries the rejected input and mentions it in its `Display` message. It can be
/// cloned and compared, which makes it convenient to use in `assert_eq!` on parse results.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseLanguageError(String);

impl Display for ParseLanguageError {
    /// Writes a message naming the input that could not be parsed.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        write!(f, "'{}' is not a valid ISO 639-1 or 639-3 code.", self.0)
    }
}

impl Error for ParseLanguageError {}
417
418impl FromStr for Language {
419    type Err = ParseLanguageError;
420
421    #[cfg(any(
422        not(feature = "english_names"),
423        not(feature = "lowercase_names")
424    ))]
425    fn from_str(s: &str) -> Result<Self, ParseLanguageError> {
426        match Language::from_639_3(s).or_else(|| Language::from_639_1(s)) {
427            Some(l) => Ok(l),
428            None => Err(ParseLanguageError(s.to_owned())),
429        }
430    }
431
432    #[cfg(all(
433        feature = "english_names",
434        feature = "lowercase_names",
435        not(feature = "local_names")
436    ))]
437    fn from_str(s: &str) -> Result<Self, ParseLanguageError> {
438        match Language::from_639_3(s)
439            .or_else(|| Language::from_639_1(s))
440            .or_else(|| Language::from_name_lowercase(s))
441        {
442            Some(l) => Ok(l),
443            None => Err(ParseLanguageError(s.to_owned())),
444        }
445    }
446
447    #[cfg(all(
448        feature = "english_names",
449        feature = "lowercase_names",
450        feature = "local_names"
451    ))]
452    fn from_str(s: &str) -> Result<Self, ParseLanguageError> {
453        match Language::from_639_3(s)
454            .or_else(|| Language::from_639_1(s))
455            .or_else(|| Language::from_name_lowercase(s))
456            .or_else(|| Language::from_autonym(s))
457        {
458            Some(l) => Ok(l),
459            None => Err(ParseLanguageError(s.to_owned())),
460        }
461    }
462}
463
// Unit tests for the public `Language` API. Several tests are feature-gated or branch on
// `cfg!(feature = ...)` so that they pass for every combination of crate features.
#[cfg(test)]
mod tests {
    use super::*;
    #[cfg(feature = "serde")]
    extern crate serde_json;
    use std::fmt::Write;

    // Locales whose language part is not a two-letter ISO 639-1 code must be rejected.
    #[test]
    fn invalid_locale_gives_none() {
        assert!(Language::from_locale("foo").is_none());
        // three-letter (ISO 639-3) language parts are not accepted in locales
        assert!(Language::from_locale("deu_DEU.UTF-8").is_none());
        assert!(Language::from_locale("___").is_none());
        // two characters, but more than two bytes
        assert!(Language::from_locale("ää_öö.UTF-8").is_none());
    }

    // Well-formed `ll_CC.codeset` locales resolve to the language of the `ll` part.
    #[test]
    fn test_valid_locales_are_correctly_decoded() {
        assert_eq!(Language::from_locale("de_DE.UTF-8"), Some(Language::Deu));
        assert_eq!(Language::from_locale("en_GB.UTF-8"), Some(Language::Eng));
    }

    // `Display` output depends on the enabled name features; `Debug` is always the 639-3 code.
    #[test]
    fn test_std_fmt() {
        let mut t = String::new();
        write!(t, "{}", Language::Deu).unwrap();
        if cfg!(feature = "local_names") && cfg!(feature = "english_names") {
            assert_eq!(t, "German (Deutsch)");
        } else if cfg!(feature = "local_names") {
            assert_eq!(t, "Deutsch");
        } else if cfg!(feature = "english_names") {
            assert_eq!(t, "German");
        } else {
            assert_eq!(t, "deu");
        }

        let mut t = String::new();
        write!(t, "{:?}", Language::Deu).unwrap();
        assert_eq!(t, "deu");
    }

    // Autonyms, including non-ASCII ones, are returned for languages looked up by 639-3 code.
    #[test]
    #[cfg(feature = "local_names")]
    fn test_iso639_3_to_autonym() {
        assert_eq!(
            Language::from_639_3("bul").unwrap().to_autonym(),
            Some("български")
        );
        assert_eq!(
            Language::from_639_3("fra").unwrap().to_autonym(),
            Some("français")
        );
    }

    // The default language is `Und`.
    #[test]
    fn test_default() {
        assert_eq!(Language::default(), Language::Und);
    }

    // Serialization writes the 639-3 code; deserialization accepts 639-3 and 639-1 codes and
    // reports descriptive errors for unknown codes and non-string input.
    #[test]
    #[cfg(feature = "serde")]
    fn test_serde() {
        // Wrap a code in double quotes, i.e. build its JSON string representation.
        fn to_json(code: &str) -> String {
            format!(r#""{code}""#)
        }

        // Check that `code` deserializes to `language`, both from JSON text and from a
        // `serde_json::Value`.
        fn test_deserialize(language: Language, code: &str) {
            assert_eq!(
                serde_json::from_str::<Language>(&to_json(code)).unwrap(),
                language
            );
            assert_eq!(
                serde_json::from_value::<Language>(serde_json::json!(code))
                    .unwrap(),
                language
            );
        }

        for language in languages() {
            assert_eq!(
                serde_json::to_string(&language).unwrap(),
                to_json(language.to_639_3())
            );

            test_deserialize(language, language.to_639_3());
            if let Some(code) = language.to_639_1() {
                test_deserialize(language, code)
            }

            // serialize -> deserialize round trip
            assert_eq!(
                serde_json::from_str::<Language>(
                    &serde_json::to_string(&language).unwrap()
                )
                .unwrap(),
                language
            );
        }

        assert_eq!(
            serde_json::from_str::<Language>(&to_json("foo")).map_err(|e| e.to_string()),
            Err("unknown variant `foo`, expected `any valid ISO 639-1 or 639-3 code` at line 1 column 5".to_string())
        );
        assert_eq!(
            serde_json::from_str::<Language>("123").map_err(|e| e.to_string()),
            Err("invalid type: integer `123`, expected borrowed str or bytes at line 1 column 3".to_string())
        );
    }

    // `Language` values can be compared with the ordering operators.
    #[test]
    fn test_ordering() {
        assert!(Language::Deu < Language::Fra);
        let fra = Language::Fra;
        assert!(fra <= Language::Fra);
    }

    // Filtering `languages()` for entries with a 639-1 code only yields such entries.
    #[test]
    #[cfg(feature = "list_languages")]
    fn test_good_language_filtering() {
        let languages = languages();
        let languages_with_iso_639_1 =
            languages.filter(|language| language.to_639_1().is_some());
        for language in languages_with_iso_639_1 {
            assert!(language.to_639_1().is_some());
        }
    }

    // Filtering `languages()` for entries without a 639-1 code only yields such entries.
    // NOTE(review): despite its name, the local variable below holds the languages WITHOUT an
    // ISO 639-1 code.
    #[test]
    #[cfg(feature = "list_languages")]
    fn test_wrong_language_filtering() {
        let languages = languages();
        let languages_with_iso_639_1 =
            languages.filter(|language| language.to_639_1().is_none());
        for language in languages_with_iso_639_1 {
            assert!(language.to_639_1().is_none());
        }
    }

    // `FromStr` accepts both 639-3 and 639-1 codes and rejects unknown input.
    #[test]
    fn test_from_str() {
        assert_eq!(Language::from_str("deu").unwrap(), Language::Deu);
        assert_eq!(Language::from_str("fr").unwrap(), Language::Fra);
        assert!(Language::from_str("foo").is_err());
    }

    // With the name features enabled, `FromStr` additionally accepts lowercase English names
    // and autonyms, but never the capitalized English name.
    #[test]
    #[cfg(feature = "english_names")]
    fn test_from_str_full_features() {
        assert_eq!(Language::from_str("es").unwrap().to_name(), "Spanish");
        assert_eq!(Language::from_str("spa").unwrap().to_name(), "Spanish");
        if cfg!(feature = "lowercase_names") {
            assert_eq!(
                Language::from_str("spanish").unwrap().to_name(),
                "Spanish"
            );
        }
        if cfg!(feature = "lowercase_names") && cfg!(feature = "local_names") {
            assert_eq!(
                Language::from_str("español").unwrap().to_name(),
                "Spanish"
            );
        }
        assert!(Language::from_str("Spanish").is_err());
    }
}