summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- Cargo.lock 1
-rw-r--r-- Cargo.toml 1
-rw-r--r-- crates/typst-library/Cargo.toml 1
-rw-r--r-- crates/typst-library/src/foundations/str.rs 46
-rw-r--r-- tests/suite/foundations/str.typ 7
5 files changed, 55 insertions, 1 deletion
diff --git a/Cargo.lock b/Cargo.lock
index 1851134a..86f04ee5 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2995,6 +2995,7 @@ dependencies = [
"typst-timing",
"typst-utils",
"unicode-math-class",
+ "unicode-normalization",
"unicode-segmentation",
"unscanny",
"usvg",
diff --git a/Cargo.toml b/Cargo.toml
index 36195230..f643856e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -129,6 +129,7 @@ unicode-bidi = "0.3.18"
unicode-ident = "1.0"
unicode-math-class = "0.1"
unicode-script = "0.5"
+unicode-normalization = "0.1.24"
unicode-segmentation = "1"
unscanny = "0.1"
ureq = { version = "2", default-features = false, features = ["native-tls", "gzip", "json"] }
diff --git a/crates/typst-library/Cargo.toml b/crates/typst-library/Cargo.toml
index fb45ec86..71729b63 100644
--- a/crates/typst-library/Cargo.toml
+++ b/crates/typst-library/Cargo.toml
@@ -61,6 +61,7 @@ ttf-parser = { workspace = true }
two-face = { workspace = true }
typed-arena = { workspace = true }
unicode-math-class = { workspace = true }
+unicode-normalization = { workspace = true }
unicode-segmentation = { workspace = true }
unscanny = { workspace = true }
usvg = { workspace = true }
diff --git a/crates/typst-library/src/foundations/str.rs b/crates/typst-library/src/foundations/str.rs
index 551ac04f..23a1bd4c 100644
--- a/crates/typst-library/src/foundations/str.rs
+++ b/crates/typst-library/src/foundations/str.rs
@@ -7,12 +7,13 @@ use comemo::Tracked;
use ecow::EcoString;
use serde::{Deserialize, Serialize};
use typst_syntax::{Span, Spanned};
+use unicode_normalization::UnicodeNormalization;
use unicode_segmentation::UnicodeSegmentation;
use crate::diag::{bail, At, SourceResult, StrResult};
use crate::engine::Engine;
use crate::foundations::{
- cast, dict, func, repr, scope, ty, Array, Bytes, Context, Decimal, Dict, Func,
+ cast, dict, func, repr, scope, ty, Array, Bytes, Cast, Context, Decimal, Dict, Func,
IntoValue, Label, Repr, Type, Value, Version,
};
use crate::layout::Alignment;
@@ -286,6 +287,30 @@ impl Str {
Ok(c.into())
}
+ /// Normalizes the string to the given Unicode normal form.
+ ///
+ /// This is useful when manipulating strings containing Unicode combining
+ /// characters: the same visible text can be encoded either as a single
+ /// precomposed codepoint or as a base character followed by combining
+ /// marks, and normalizing both sides makes such strings comparable.
+ ///
+ /// The `form` argument is named and defaults to `{"nfc"}` when omitted.
+ ///
+ /// ```typ
+ /// #assert.eq("é".normalize(form: "nfd"), "e\u{0301}")
+ /// #assert.eq("ſ́".normalize(form: "nfkc"), "ś")
+ /// ```
+ #[func]
+ pub fn normalize(
+ &self,
+ // `form` is a named argument; when omitted it falls back to NFC.
+ #[named]
+ #[default(UnicodeNormalForm::Nfc)]
+ form: UnicodeNormalForm,
+ ) -> Str {
+ // The normalization methods presumably come from the imported
+ // `UnicodeNormalization` trait; the resulting iterator is collected
+ // into a new `Str`, leaving `self` untouched.
+ match form {
+ UnicodeNormalForm::Nfc => self.nfc().collect(),
+ UnicodeNormalForm::Nfd => self.nfd().collect(),
+ UnicodeNormalForm::Nfkc => self.nfkc().collect(),
+ UnicodeNormalForm::Nfkd => self.nfkd().collect(),
+ }
+ }
+
/// Whether the string contains the specified pattern.
///
/// This method also has dedicated syntax: You can write `{"bc" in "abcd"}`
@@ -788,6 +813,25 @@ cast! {
v: Str => Self::Str(v),
}
+/// A Unicode normalization form.
+///
+/// Each variant is cast from the lowercase string given in its `#[string]`
+/// attribute (`"nfc"`, `"nfd"`, `"nfkc"`, `"nfkd"`); `Str::normalize` takes
+/// one of these as its `form` argument.
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash, Cast)]
+pub enum UnicodeNormalForm {
+ /// Canonical composition where e.g. accented letters are turned into a
+ /// single Unicode codepoint (`e` followed by U+0301 becomes `é`).
+ #[string("nfc")]
+ Nfc,
+ /// Canonical decomposition where e.g. accented letters are split into a
+ /// separate base and diacritic (`é` becomes `e` followed by U+0301).
+ #[string("nfd")]
+ Nfd,
+ /// Like NFC, but using the Unicode compatibility decompositions, so that
+ /// compatibility characters are replaced first (e.g. `ſ` followed by
+ /// U+0301 becomes `ś`).
+ #[string("nfkc")]
+ Nfkc,
+ /// Like NFD, but using the Unicode compatibility decompositions (e.g. `ſ`
+ /// followed by U+0301 becomes `s` followed by U+0301).
+ #[string("nfkd")]
+ Nfkd,
+}
+
/// Convert an item of std's `match_indices` to a dictionary.
fn match_to_dict((start, text): (usize, &str)) -> Dict {
dict! {
diff --git a/tests/suite/foundations/str.typ b/tests/suite/foundations/str.typ
index 56756416..66fb912c 100644
--- a/tests/suite/foundations/str.typ
+++ b/tests/suite/foundations/str.typ
@@ -86,6 +86,13 @@
// Error: 2-28 0x110000 is not a valid codepoint
#str.from-unicode(0x110000) // 0x10ffff is the highest valid code point
+--- str-normalize ---
+// Test the `normalize` method.
+// Each case passes `form` explicitly, covering all four normal forms.
+// NFC: base letter + combining acute composes into the precomposed letter.
+#test("e\u{0301}".normalize(form: "nfc"), "é")
+// NFD: the precomposed letter decomposes into base letter + combining acute.
+#test("é".normalize(form: "nfd"), "e\u{0301}")
+// NFKC: the long s is mapped to a plain `s` and then composed with the accent.
+#test("ſ\u{0301}".normalize(form: "nfkc"), "ś")
+// NFKD: the long s is mapped to a plain `s`; the accent stays separate.
+#test("ſ\u{0301}".normalize(form: "nfkd"), "s\u{0301}")
+
--- string-len ---
// Test the `len` method.
#test("Hello World!".len(), 12)