Add `#str.normalize(form)` (#5631)

Co-authored-by: +merlan #flirora <uruwi@protonmail.com>
Co-authored-by: Laurenz <laurmaedje@gmail.com>
author: evie <50974538+mi2ebi@users.noreply.github.com> 2025-02-25 06:01:01 -0800
committer: GitHub <noreply@github.com> 2025-02-25 14:01:01 +0000
commit: d11ad80dee669c5e2285ca8df8ebc99abc031ccd (patch)
tree: ca18c5b71671990314a5c9a6275ccb5fe995ebe1 /crates/typst-library/src/foundations
parent: bad343748b834cdc155c5fe76cd944e74f4665cf (diff)
1 files changed, 45 insertions, 1 deletions
diff --git a/crates/typst-library/src/foundations/str.rs b/crates/typst-library/src/foundations/str.rs
index 551ac04f..23a1bd4c 100644
--- a/crates/typst-library/src/foundations/str.rs
+++ b/crates/typst-library/src/foundations/str.rs
@@ -7,12 +7,13 @@ use comemo::Tracked;
 use ecow::EcoString;
 use serde::{Deserialize, Serialize};
 use typst_syntax::{Span, Spanned};
+use unicode_normalization::UnicodeNormalization;
 use unicode_segmentation::UnicodeSegmentation;
 
 use crate::diag::{bail, At, SourceResult, StrResult};
 use crate::engine::Engine;
 use crate::foundations::{
-    cast, dict, func, repr, scope, ty, Array, Bytes, Context, Decimal, Dict, Func,
+    cast, dict, func, repr, scope, ty, Array, Bytes, Cast, Context, Decimal, Dict, Func,
     IntoValue, Label, Repr, Type, Value, Version,
 };
 use crate::layout::Alignment;
@@ -286,6 +287,30 @@ impl Str {
         Ok(c.into())
     }
 
+    /// Normalizes the string to the given Unicode normal form.
+    ///
+    /// This is useful when manipulating strings containing Unicode combining
+    /// characters. If no `form` is given, the string is normalized to NFC.
+    ///
+    /// ```typ
+    /// #assert.eq("é".normalize(form: "nfd"), "e\u{0301}")
+    /// #assert.eq("ſ́".normalize(form: "nfkc"), "ś")
+    /// ```
+    #[func]
+    pub fn normalize(
+        &self,
+        #[named]
+        #[default(UnicodeNormalForm::Nfc)]
+        form: UnicodeNormalForm,
+    ) -> Str {
+        match form {
+            UnicodeNormalForm::Nfc => self.nfc().collect(),
+            UnicodeNormalForm::Nfd => self.nfd().collect(),
+            UnicodeNormalForm::Nfkc => self.nfkc().collect(),
+            UnicodeNormalForm::Nfkd => self.nfkd().collect(),
+        }
+    }
+
     /// Whether the string contains the specified pattern.
     ///
     /// This method also has dedicated syntax: You can write `{"bc" in "abcd"}`
@@ -788,6 +813,25 @@ cast! {
     v: Str => Self::Str(v),
 }
 
+/// A Unicode normalization form, selected by its lowercase string name.
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash, Cast)]
+pub enum UnicodeNormalForm {
+    /// Canonical composition (NFC) where e.g. an accented letter is turned
+    /// into a single Unicode codepoint.
+    #[string("nfc")]
+    Nfc,
+    /// Canonical decomposition (NFD) where e.g. an accented letter is split
+    /// into a separate base letter and diacritic.
+    #[string("nfd")]
+    Nfd,
+    /// Like NFC, but using the Unicode compatibility decompositions (NFKC).
+    #[string("nfkc")]
+    Nfkc,
+    /// Like NFD, but using the Unicode compatibility decompositions (NFKD).
+    #[string("nfkd")]
+    Nfkd,
+}
+
 /// Convert an item of std's `match_indices` to a dictionary.
 fn match_to_dict((start, text): (usize, &str)) -> Dict {
     dict! {
author	evie <50974538+mi2ebi@users.noreply.github.com>	2025-02-25 06:01:01 -0800
committer	GitHub <noreply@github.com>	2025-02-25 14:01:01 +0000
commit	d11ad80dee669c5e2285ca8df8ebc99abc031ccd (patch)
tree	ca18c5b71671990314a5c9a6275ccb5fe995ebe1 /crates/typst-library/src/foundations
parent	bad343748b834cdc155c5fe76cd944e74f4665cf (diff)