summaryrefslogtreecommitdiff
path: root/crates/typst-library/src/text/lang.rs
diff options
context:
space:
mode:
Diffstat (limited to 'crates/typst-library/src/text/lang.rs')
-rw-r--r--crates/typst-library/src/text/lang.rs317
1 files changed, 317 insertions, 0 deletions
diff --git a/crates/typst-library/src/text/lang.rs b/crates/typst-library/src/text/lang.rs
new file mode 100644
index 00000000..64ab1a7c
--- /dev/null
+++ b/crates/typst-library/src/text/lang.rs
@@ -0,0 +1,317 @@
+use std::collections::HashMap;
+use std::str::FromStr;
+
+use ecow::{eco_format, EcoString};
+
+use crate::diag::Hint;
+use crate::foundations::{cast, StyleChain};
+use crate::layout::Dir;
+use crate::text::TextElem;
+
+macro_rules! translation {
+ ($lang:literal) => {
+ ($lang, include_str!(concat!("../../translations/", $lang, ".txt")))
+ };
+}
+
+const TRANSLATIONS: [(&str, &str); 36] = [
+ translation!("ar"),
+ translation!("ca"),
+ translation!("cs"),
+ translation!("da"),
+ translation!("de"),
+ translation!("en"),
+ translation!("es"),
+ translation!("et"),
+ translation!("fi"),
+ translation!("fr"),
+ translation!("gl"),
+ translation!("gr"),
+ translation!("he"),
+ translation!("hu"),
+ translation!("is"),
+ translation!("it"),
+ translation!("ja"),
+ translation!("la"),
+ translation!("nb"),
+ translation!("nl"),
+ translation!("nn"),
+ translation!("pl"),
+ translation!("pt-PT"),
+ translation!("pt"),
+ translation!("ro"),
+ translation!("ru"),
+ translation!("sl"),
+ translation!("sq"),
+ translation!("sr"),
+ translation!("sv"),
+ translation!("tl"),
+ translation!("tr"),
+ translation!("ua"),
+ translation!("vi"),
+ translation!("zh-TW"),
+ translation!("zh"),
+];
+
+/// An identifier for a natural language.
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
+pub struct Lang([u8; 3], u8);
+
+impl Lang {
+ pub const ALBANIAN: Self = Self(*b"sq ", 2);
+ pub const ARABIC: Self = Self(*b"ar ", 2);
+ pub const BOKMÃ…L: Self = Self(*b"nb ", 2);
+ pub const CATALAN: Self = Self(*b"ca ", 2);
+ pub const CHINESE: Self = Self(*b"zh ", 2);
+ pub const CROATIAN: Self = Self(*b"hr ", 2);
+ pub const CZECH: Self = Self(*b"cs ", 2);
+ pub const DANISH: Self = Self(*b"da ", 2);
+ pub const DUTCH: Self = Self(*b"nl ", 2);
+ pub const ENGLISH: Self = Self(*b"en ", 2);
+ pub const ESTONIAN: Self = Self(*b"et ", 2);
+ pub const FILIPINO: Self = Self(*b"tl ", 2);
+ pub const FINNISH: Self = Self(*b"fi ", 2);
+ pub const FRENCH: Self = Self(*b"fr ", 2);
+ pub const GALICIAN: Self = Self(*b"gl ", 2);
+ pub const GERMAN: Self = Self(*b"de ", 2);
+ pub const GREEK: Self = Self(*b"gr ", 2);
+ pub const HEBREW: Self = Self(*b"he ", 2);
+ pub const HUNGARIAN: Self = Self(*b"hu ", 2);
+ pub const ICELANDIC: Self = Self(*b"is ", 2);
+ pub const ITALIAN: Self = Self(*b"it ", 2);
+ pub const JAPANESE: Self = Self(*b"ja ", 2);
+ pub const LATIN: Self = Self(*b"la ", 2);
+ pub const LOWER_SORBIAN: Self = Self(*b"dsb", 3);
+ pub const NYNORSK: Self = Self(*b"nn ", 2);
+ pub const POLISH: Self = Self(*b"pl ", 2);
+ pub const PORTUGUESE: Self = Self(*b"pt ", 2);
+ pub const ROMANIAN: Self = Self(*b"ro ", 2);
+ pub const RUSSIAN: Self = Self(*b"ru ", 2);
+ pub const SERBIAN: Self = Self(*b"sr ", 2);
+ pub const SLOVAK: Self = Self(*b"sk ", 2);
+ pub const SLOVENIAN: Self = Self(*b"sl ", 2);
+ pub const SPANISH: Self = Self(*b"es ", 2);
+ pub const SWEDISH: Self = Self(*b"sv ", 2);
+ pub const TURKISH: Self = Self(*b"tr ", 2);
+ pub const UKRAINIAN: Self = Self(*b"ua ", 2);
+ pub const VIETNAMESE: Self = Self(*b"vi ", 2);
+
+ /// Return the language code as an all lowercase string slice.
+ pub fn as_str(&self) -> &str {
+ std::str::from_utf8(&self.0[..usize::from(self.1)]).unwrap_or_default()
+ }
+
+ /// The default direction for the language.
+ pub fn dir(self) -> Dir {
+ match self.as_str() {
+ "ar" | "dv" | "fa" | "he" | "ks" | "pa" | "ps" | "sd" | "ug" | "ur"
+ | "yi" => Dir::RTL,
+ _ => Dir::LTR,
+ }
+ }
+}
+
+impl FromStr for Lang {
+ type Err = &'static str;
+
+ /// Construct a language from a two- or three-byte ISO 639-1/2/3 code.
+ fn from_str(iso: &str) -> Result<Self, Self::Err> {
+ let len = iso.len();
+ if matches!(len, 2..=3) && iso.is_ascii() {
+ let mut bytes = [b' '; 3];
+ bytes[..len].copy_from_slice(iso.as_bytes());
+ bytes.make_ascii_lowercase();
+ Ok(Self(bytes, len as u8))
+ } else {
+ Err("expected two or three letter language code (ISO 639-1/2/3)")
+ }
+ }
+}
+
+cast! {
+ Lang,
+ self => self.as_str().into_value(),
+ string: EcoString => {
+ let result = Self::from_str(&string);
+ if result.is_err() {
+ if let Some((lang, region)) = string.split_once('-') {
+ if Lang::from_str(lang).is_ok() && Region::from_str(region).is_ok() {
+ return result
+ .hint(eco_format!(
+ "you should leave only \"{}\" in the `lang` parameter and specify \"{}\" in the `region` parameter",
+ lang, region,
+ ));
+ }
+ }
+ }
+
+ result?
+ }
+}
+
+/// An identifier for a region somewhere in the world.
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
+pub struct Region([u8; 2]);
+
+impl Region {
+ /// Return the region code as an all uppercase string slice.
+ pub fn as_str(&self) -> &str {
+ std::str::from_utf8(&self.0).unwrap_or_default()
+ }
+}
+
+impl PartialEq<&str> for Region {
+ fn eq(&self, other: &&str) -> bool {
+ self.as_str() == *other
+ }
+}
+
+impl FromStr for Region {
+ type Err = &'static str;
+
+ /// Construct a region from its two-byte ISO 3166-1 alpha-2 code.
+ fn from_str(iso: &str) -> Result<Self, Self::Err> {
+ if iso.len() == 2 && iso.is_ascii() {
+ let mut bytes: [u8; 2] = iso.as_bytes().try_into().unwrap();
+ bytes.make_ascii_uppercase();
+ Ok(Self(bytes))
+ } else {
+ Err("expected two letter region code (ISO 3166-1 alpha-2)")
+ }
+ }
+}
+
+cast! {
+ Region,
+ self => self.as_str().into_value(),
+ string: EcoString => Self::from_str(&string)?,
+}
+
+/// An ISO 15924-type script identifier.
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
+pub struct WritingScript([u8; 4], u8);
+
+impl WritingScript {
+ /// Return the script as an all lowercase string slice.
+ pub fn as_str(&self) -> &str {
+ std::str::from_utf8(&self.0[..usize::from(self.1)]).unwrap_or_default()
+ }
+
+ /// Return the description of the script as raw bytes.
+ pub fn as_bytes(&self) -> &[u8; 4] {
+ &self.0
+ }
+}
+
+impl FromStr for WritingScript {
+ type Err = &'static str;
+
+ /// Construct a region from its ISO 15924 code.
+ fn from_str(iso: &str) -> Result<Self, Self::Err> {
+ let len = iso.len();
+ if matches!(len, 3..=4) && iso.is_ascii() {
+ let mut bytes = [b' '; 4];
+ bytes[..len].copy_from_slice(iso.as_bytes());
+ bytes.make_ascii_lowercase();
+ Ok(Self(bytes, len as u8))
+ } else {
+ Err("expected three or four letter script code (ISO 15924 or 'math')")
+ }
+ }
+}
+
+cast! {
+ WritingScript,
+ self => self.as_str().into_value(),
+ string: EcoString => Self::from_str(&string)?,
+}
+
+/// The name with which an element is referenced.
+pub trait LocalName {
+ /// The key of an element in order to get its localized name.
+ const KEY: &'static str;
+
+ /// Get the name in the given language and (optionally) region.
+ fn local_name(lang: Lang, region: Option<Region>) -> &'static str {
+ localized_str(lang, region, Self::KEY)
+ }
+
+ /// Gets the local name from the style chain.
+ fn local_name_in(styles: StyleChain) -> &'static str
+ where
+ Self: Sized,
+ {
+ Self::local_name(TextElem::lang_in(styles), TextElem::region_in(styles))
+ }
+}
+
+/// Retrieves the localized string for a given language and region.
+/// Silently falls back to English if no fitting string exists for
+/// the given language + region. Panics if no fitting string exists
+/// in both given language + region and English.
+#[comemo::memoize]
+pub fn localized_str(lang: Lang, region: Option<Region>, key: &str) -> &'static str {
+ let lang_region_bundle = parse_language_bundle(lang, region).unwrap();
+ if let Some(str) = lang_region_bundle.get(key) {
+ return str;
+ }
+ let lang_bundle = parse_language_bundle(lang, None).unwrap();
+ if let Some(str) = lang_bundle.get(key) {
+ return str;
+ }
+ let english_bundle = parse_language_bundle(Lang::ENGLISH, None).unwrap();
+ english_bundle.get(key).unwrap()
+}
+
+/// Parses the translation file for a given language and region.
+/// Only returns an error if the language file is malformed.
+#[comemo::memoize]
+fn parse_language_bundle(
+ lang: Lang,
+ region: Option<Region>,
+) -> Result<HashMap<&'static str, &'static str>, &'static str> {
+ let language_tuple = TRANSLATIONS.iter().find(|it| it.0 == lang_str(lang, region));
+ let Some((_lang_name, language_file)) = language_tuple else {
+ return Ok(HashMap::new());
+ };
+
+ let mut bundle = HashMap::new();
+ let lines = language_file.trim().lines();
+ for line in lines {
+ if line.trim().starts_with('#') {
+ continue;
+ }
+ let (key, val) = line
+ .split_once('=')
+ .ok_or("malformed translation file: line without \"=\"")?;
+ let (key, val) = (key.trim(), val.trim());
+ if val.is_empty() {
+ return Err("malformed translation file: empty translation value");
+ }
+ let duplicate = bundle.insert(key.trim(), val.trim());
+ if duplicate.is_some() {
+ return Err("malformed translation file: duplicate key");
+ }
+ }
+ Ok(bundle)
+}
+
+/// Convert language + region to a string to be able to get a file name.
+fn lang_str(lang: Lang, region: Option<Region>) -> EcoString {
+ EcoString::from(lang.as_str())
+ + region.map_or_else(EcoString::new, |r| EcoString::from("-") + r.as_str())
+}
+
+#[cfg(test)]
+mod tests {
+ use typst_utils::option_eq;
+
+ use super::*;
+
+ #[test]
+ fn test_region_option_eq() {
+ let region = Some(Region([b'U', b'S']));
+ assert!(option_eq(region, "US"));
+ assert!(!option_eq(region, "AB"));
+ }
+}