summary refs log tree commit diff
diff options
context:
space:
mode:
author Peng Guanwen <pg999w@outlook.com> 2023-11-15 22:01:15 +0800
committer GitHub <noreply@github.com> 2023-11-15 15:01:15 +0100
commit f4a81091f72299296fb2be7511b08c3cb0e8fa57 (patch)
tree 7935e1eebf9e4ff1f361e71435becd5801f8eb51
parent 50ea3b4f1651ca6558c118edebbd68d7ef7a93d7 (diff)
Lexer change: Allow emphasis in CJK text without spaces (#2648)
-rw-r--r-- Cargo.lock 1
-rw-r--r-- assets/fonts/NotoSerifCJKsc-Bold.otf bin 0 -> 1385940 bytes
-rw-r--r-- assets/fonts/NotoSerifCJKtc-Bold.otf bin 0 -> 1617780 bytes
-rw-r--r-- crates/typst-syntax/Cargo.toml 1
-rw-r--r-- crates/typst-syntax/src/lexer.rs 13
-rw-r--r-- tests/ref/text/emphasis.png bin 8575 -> 16982 bytes
-rw-r--r-- tests/ref/text/lang-with-region.png bin 1389 -> 1392 bytes
-rw-r--r-- tests/typ/text/emphasis.typ 7
8 files changed, 20 insertions, 2 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 299e3a58..a046e53c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3117,6 +3117,7 @@ dependencies = [
"tracing",
"unicode-ident",
"unicode-math-class",
+ "unicode-script",
"unicode-segmentation",
"unscanny",
]
diff --git a/assets/fonts/NotoSerifCJKsc-Bold.otf b/assets/fonts/NotoSerifCJKsc-Bold.otf
new file mode 100644
index 00000000..c291490e
--- /dev/null
+++ b/assets/fonts/NotoSerifCJKsc-Bold.otf
Binary files differ
diff --git a/assets/fonts/NotoSerifCJKtc-Bold.otf b/assets/fonts/NotoSerifCJKtc-Bold.otf
new file mode 100644
index 00000000..73a2135e
--- /dev/null
+++ b/assets/fonts/NotoSerifCJKtc-Bold.otf
Binary files differ
diff --git a/crates/typst-syntax/Cargo.toml b/crates/typst-syntax/Cargo.toml
index 1254e663..681189bd 100644
--- a/crates/typst-syntax/Cargo.toml
+++ b/crates/typst-syntax/Cargo.toml
@@ -23,5 +23,6 @@ serde = { workspace = true }
tracing = { workspace = true }
unicode-ident = { workspace = true }
unicode-math-class = { workspace = true }
+unicode-script = { workspace = true }
unicode-segmentation = { workspace = true }
unscanny = { workspace = true }
diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs
index c702551c..ffe53145 100644
--- a/crates/typst-syntax/src/lexer.rs
+++ b/crates/typst-syntax/src/lexer.rs
@@ -1,5 +1,6 @@
use ecow::{eco_format, EcoString};
use unicode_ident::{is_xid_continue, is_xid_start};
+use unicode_script::{Script, UnicodeScript};
use unicode_segmentation::UnicodeSegmentation;
use unscanny::Scanner;
@@ -343,10 +344,18 @@ impl Lexer<'_> {
}
fn in_word(&self) -> bool {
- let alphanum = |c: Option<char>| c.map_or(false, |c| c.is_alphanumeric());
+ let wordy = |c: Option<char>| {
+ c.map_or(false, |c| {
+ c.is_alphanumeric()
+ && !matches!(
+ c.script(),
+ Script::Han | Script::Hiragana | Script::Katakana
+ )
+ })
+ };
let prev = self.s.scout(-2);
let next = self.s.peek();
- alphanum(prev) && alphanum(next)
+ wordy(prev) && wordy(next)
}
fn space_or_end(&self) -> bool {
diff --git a/tests/ref/text/emphasis.png b/tests/ref/text/emphasis.png
index 333f7006..c19f6ebb 100644
--- a/tests/ref/text/emphasis.png
+++ b/tests/ref/text/emphasis.png
Binary files differ
diff --git a/tests/ref/text/lang-with-region.png b/tests/ref/text/lang-with-region.png
index 097f105d..c7753104 100644
--- a/tests/ref/text/lang-with-region.png
+++ b/tests/ref/text/lang-with-region.png
Binary files differ
diff --git a/tests/typ/text/emphasis.typ b/tests/typ/text/emphasis.typ
index fd04c8e7..93913dcf 100644
--- a/tests/typ/text/emphasis.typ
+++ b/tests/typ/text/emphasis.typ
@@ -7,6 +7,13 @@ _Emphasized and *strong* words!_
// Inside of a word it's a normal underscore or star.
hello_world Nutzer*innen
+// CJK characters will not need spaces.
+中文一般使用*粗体*或者_楷体_来表示强调。
+
+日本語では、*太字*や_斜体_を使って強調します。
+
+中文中混有*Strong*和_Empasis_。
+
// Can contain paragraph in nested content block.
_Still #[