summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Peng Guanwen <pg999w@outlook.com> 2023-10-17 17:14:09 +0800
committer GitHub <noreply@github.com> 2023-10-17 11:14:09 +0200
commit e4d9db83ea4b7b9a3003c754c80b30b87d6cd119 (patch)
tree a65e58179aa33adc4bf06732697a22da3dfb643b
parent a59666369b946c3a8b62db363659cbfca35f0a26 (diff)
Automatically add spacing between CJK and Latin characters (#2334)
-rw-r--r-- crates/typst-library/src/layout/par.rs 137
-rw-r--r-- crates/typst-library/src/text/mod.rs 12
-rw-r--r-- crates/typst-library/src/text/shaping.rs 17
-rw-r--r-- tests/ref/layout/cjk-latin-spacing.png bin 0 -> 6039 bytes
-rw-r--r-- tests/ref/layout/par-justify-cjk.png bin 70197 -> 69916 bytes
-rw-r--r-- tests/ref/text/chinese.png bin 20759 -> 20722 bytes
-rw-r--r-- tests/typ/layout/cjk-latin-spacing.typ 16
7 files changed, 156 insertions, 26 deletions
diff --git a/crates/typst-library/src/layout/par.rs b/crates/typst-library/src/layout/par.rs
index f126df76..b18d6229 100644
--- a/crates/typst-library/src/layout/par.rs
+++ b/crates/typst-library/src/layout/par.rs
@@ -16,8 +16,8 @@ use crate::layout::AlignElem;
use crate::math::EquationElem;
use crate::prelude::*;
use crate::text::{
- is_gb_style, shape, LinebreakElem, Quoter, Quotes, ShapedText, SmartquoteElem,
- SpaceElem, TextElem,
+ char_is_cjk_script, is_gb_style, shape, LinebreakElem, Quoter, Quotes, ShapedGlyph,
+ ShapedText, SmartquoteElem, SpaceElem, TextElem,
};
/// Arranges text, spacing and inline-level elements into a paragraph.
@@ -723,6 +723,10 @@ fn prepare<'a>(
cursor = end;
}
+ if TextElem::cjk_latin_spacing_in(styles).is_auto() {
+ add_cjk_latin_spacing(&mut items);
+ }
+
Ok(Preparation {
bidi,
items,
@@ -736,6 +740,52 @@ fn prepare<'a>(
})
}
+/// Add spacing between CJK-script glyphs and adjacent western letters or numbers.
+/// See Requirements for Chinese Text Layout, Section 3.2.2 Mixed Text Composition in Horizontal
+/// Written Mode
+fn add_cjk_latin_spacing(items: &mut [Item]) {
+ let mut items = items.iter_mut().peekable();
+ let mut prev: Option<&ShapedGlyph> = None;
+ while let Some(item) = items.next() {
+ let Some(text) = item.text_mut() else {
+ prev = None; // a non-text item separates the glyphs on either side of it
+ continue;
+ };
+
+ // Since this function is only called from [`prepare`], we can assume
+ // that the Cow is owned, so `to_mut` can be called without overhead.
+ debug_assert!(matches!(text.glyphs, std::borrow::Cow::Owned(_)));
+ let mut glyphs = text.glyphs.to_mut().iter_mut().peekable();
+
+ while let Some(glyph) = glyphs.next() {
+ let next = glyphs.peek().map(|n| n as _).or_else(|| { // else: first glyph of the next item, if it is text
+ items
+ .peek()
+ .and_then(|i| i.text())
+ .and_then(|shaped| shaped.glyphs.first())
+ });
+
+ // Case 1: CJK character followed by a western letter or number
+ if glyph.is_cjk_script() && next.map_or(false, |g| g.is_letter_or_number()) {
+ // The added spacing defaults to 1/4 em, of which up to 1/8 em may be shrunk away.
+ glyph.x_advance += Em::new(0.25);
+ glyph.adjustability.shrinkability.1 += Em::new(0.125);
+ text.width += Em::new(0.25).at(text.size); // keep the item's total width in sync
+ }
+
+ // Case 2: CJK character preceded by a western letter or number
+ if glyph.is_cjk_script() && prev.map_or(false, |g| g.is_letter_or_number()) {
+ glyph.x_advance += Em::new(0.25);
+ glyph.x_offset += Em::new(0.25); // offset grows with the advance here, unlike case 1
+ glyph.adjustability.shrinkability.0 += Em::new(0.125);
+ text.width += Em::new(0.25).at(text.size); // keep the item's total width in sync
+ }
+
+ prev = Some(glyph); // carried across text items; only reset by a non-text item
+ }
+ }
+}
+
/// Group a range of text by BiDi level and script, shape the runs and generate
/// items for them.
fn shape_range<'a>(
@@ -839,10 +889,11 @@ fn linebreak_simple<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<Line
let mut lines = vec![];
let mut start = 0;
let mut last = None;
+ let cjk_latin_spacing = TextElem::cjk_latin_spacing_in(p.styles).is_auto();
for (end, mandatory, hyphen) in breakpoints(p) {
// Compute the line and its size.
- let mut attempt = line(vt, p, start..end, mandatory, hyphen);
+ let mut attempt = line(vt, p, start..end, mandatory, hyphen, cjk_latin_spacing);
// If the line doesn't fit anymore, we push the last fitting attempt
// into the stack and rebuild the line from the attempt's end. The
@@ -851,7 +902,7 @@ fn linebreak_simple<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<Line
if let Some((last_attempt, last_end)) = last.take() {
lines.push(last_attempt);
start = last_end;
- attempt = line(vt, p, start..end, mandatory, hyphen);
+ attempt = line(vt, p, start..end, mandatory, hyphen, cjk_latin_spacing);
}
}
@@ -914,10 +965,11 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
let mut table = vec![Entry {
pred: 0,
total: 0.0,
- line: line(vt, p, 0..0, false, false),
+ line: line(vt, p, 0..0, false, false, false),
}];
let em = TextElem::size_in(p.styles);
+ let cjk_latin_spacing = TextElem::cjk_latin_spacing_in(p.styles).is_auto();
for (end, mandatory, hyphen) in breakpoints(p) {
let k = table.len();
@@ -929,7 +981,7 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
// Layout the line.
let start = pred.line.end;
- let attempt = line(vt, p, start..end, mandatory, hyphen);
+ let attempt = line(vt, p, start..end, mandatory, hyphen, cjk_latin_spacing);
// Determine how much the line's spaces would need to be stretched
// to make it the desired width.
@@ -1213,10 +1265,16 @@ fn line<'a>(
mut range: Range,
mandatory: bool,
hyphen: bool,
+ cjk_latin_spacing: bool,
) -> Line<'a> {
let end = range.end;
let mut justify = p.justify && end < p.bidi.text.len() && !mandatory;
+ // The CJK punctuation that can appear at the beginning or end of a line.
+ const BEGIN_PUNCT_PAT: &[char] = &['“', '‘', '《', '(', '『', '「'];
+ const END_PUNCT_PAT: &[char] =
+ &['”', '’', ',', '。', '、', ':', ';', '》', ')', '』', '」'];
+
if range.is_empty() {
return Line {
bidi: &p.bidi,
@@ -1256,8 +1314,8 @@ fn line<'a>(
// Deal with CJK punctuation at line ends.
let gb_style = is_gb_style(shaped.lang, shaped.region);
- let end_cjk_punct = trimmed
- .ends_with(['”', '’', ',', '。', '、', ':', ';', '》', ')', '』', '」']);
+ let maybe_adjust_last_glyph = trimmed.ends_with(END_PUNCT_PAT)
+ || (cjk_latin_spacing && trimmed.ends_with(char_is_cjk_script));
// Usually, we don't want to shape an empty string because:
// - We don't want the height of trimmed whitespace in a different
@@ -1268,21 +1326,37 @@ fn line<'a>(
// need the shaped empty string to make the line the appropriate
// height. That is the case exactly if the string is empty and there
// are no other items in the line.
- if hyphen || start + shaped.text.len() > range.end || end_cjk_punct {
+ if hyphen || start + shaped.text.len() > range.end || maybe_adjust_last_glyph {
if hyphen || start < range.end || before.is_empty() {
let mut reshaped = shaped.reshape(vt, &p.spans, start..range.end);
if hyphen || shy {
reshaped.push_hyphen(vt, TextElem::fallback_in(p.styles));
}
- let punct = reshaped.glyphs.last();
- if let Some(punct) = punct {
- if punct.is_cjk_left_aligned_punctuation(gb_style) {
- let shrink_amount = punct.shrinkability().1;
+
+ if let Some(last_glyph) = reshaped.glyphs.last() {
+ if last_glyph.is_cjk_left_aligned_punctuation(gb_style) {
+ // If the last glyph is a CJK punctuation, we want to shrink it.
+ // See Requirements for Chinese Text Layout, Section 3.1.6.3
+ // Compression of punctuation marks at line start or line end
+ let shrink_amount = last_glyph.shrinkability().1;
let punct = reshaped.glyphs.to_mut().last_mut().unwrap();
punct.shrink_right(shrink_amount);
reshaped.width -= shrink_amount.at(reshaped.size);
+ } else if cjk_latin_spacing
+ && last_glyph.is_cjk_script()
+ && (last_glyph.x_advance - last_glyph.x_offset) > Em::one()
+ {
+ // If the last glyph is a CJK character adjusted by [`add_cjk_latin_spacing`],
+ // restore the original width.
+ let shrink_amount =
+ last_glyph.x_advance - last_glyph.x_offset - Em::one();
+ let glyph = reshaped.glyphs.to_mut().last_mut().unwrap();
+ glyph.x_advance -= shrink_amount;
+ glyph.adjustability.shrinkability.1 = Em::zero();
+ reshaped.width -= shrink_amount.at(reshaped.size);
}
}
+
width += reshaped.width;
last = Some(Item::Text(reshaped));
}
@@ -1291,9 +1365,10 @@ fn line<'a>(
}
}
- // Deal with CJK punctuation at line starts.
+ // Deal with CJK characters at line starts.
let text = &p.bidi.text[range.start..end];
- let start_cjk_punct = text.starts_with(['“', '‘', '《', '(', '『', '「']);
+ let maybe_adjust_first_glyph = text.starts_with(BEGIN_PUNCT_PAT)
+ || (cjk_latin_spacing && text.starts_with(char_is_cjk_script));
// Reshape the start item if it's split in half.
let mut first = None;
@@ -1303,8 +1378,9 @@ fn line<'a>(
let end = range.end.min(base + shaped.text.len());
// Reshape if necessary.
- if range.start + shaped.text.len() > end || start_cjk_punct {
- if range.start < end || start_cjk_punct {
+ if range.start + shaped.text.len() > end || maybe_adjust_first_glyph {
+ // If the range is empty, we don't want to push an empty text item.
+ if range.start < end {
let reshaped = shaped.reshape(vt, &p.spans, range.start..end);
width += reshaped.width;
first = Some(Item::Text(reshaped));
@@ -1314,14 +1390,29 @@ fn line<'a>(
}
}
- if start_cjk_punct {
+ if maybe_adjust_first_glyph {
let reshaped = first.as_mut().or(last.as_mut()).and_then(Item::text_mut);
if let Some(reshaped) = reshaped {
- if let Some(punct) = reshaped.glyphs.first() {
- if punct.is_cjk_right_aligned_punctuation() {
- let shrink_amount = punct.shrinkability().0;
- let punct = reshaped.glyphs.to_mut().first_mut().unwrap();
- punct.shrink_left(shrink_amount);
+ if let Some(first_glyph) = reshaped.glyphs.first() {
+ if first_glyph.is_cjk_right_aligned_punctuation() {
+ // If the first glyph is a CJK punctuation, we want to shrink it.
+ let shrink_amount = first_glyph.shrinkability().0;
+ let glyph = reshaped.glyphs.to_mut().first_mut().unwrap();
+ glyph.shrink_left(shrink_amount);
+ let amount_abs = shrink_amount.at(reshaped.size);
+ reshaped.width -= amount_abs;
+ width -= amount_abs;
+ } else if cjk_latin_spacing
+ && first_glyph.is_cjk_script()
+ && first_glyph.x_offset > Em::zero()
+ {
+ // If the first glyph is a CJK character adjusted by [`add_cjk_latin_spacing`],
+ // restore the original width.
+ let shrink_amount = first_glyph.x_offset;
+ let glyph = reshaped.glyphs.to_mut().first_mut().unwrap();
+ glyph.x_advance -= shrink_amount;
+ glyph.x_offset = Em::zero();
+ glyph.adjustability.shrinkability.0 = Em::zero();
let amount_abs = shrink_amount.at(reshaped.size);
reshaped.width -= amount_abs;
width -= amount_abs;
diff --git a/crates/typst-library/src/text/mod.rs b/crates/typst-library/src/text/mod.rs
index 748d71ad..4d2f5c63 100644
--- a/crates/typst-library/src/text/mod.rs
+++ b/crates/typst-library/src/text/mod.rs
@@ -19,6 +19,7 @@ pub use self::shift::*;
use rustybuzz::Tag;
use ttf_parser::Rect;
use typst::diag::{bail, error, SourceResult};
+use typst::eval::Never;
use typst::font::{Font, FontStretch, FontStyle, FontWeight, VerticalFontMetric};
use crate::layout::ParElem;
@@ -224,6 +225,17 @@ pub struct TextElem {
#[default(Rel::one())]
pub spacing: Rel<Length>,
+ /// Whether to automatically insert spacing between CJK and Latin characters.
+ ///
+ /// ```example
+ /// #set text(cjk-latin-spacing: auto)
+ /// 第4章介绍了基本的API。
+ ///
+ /// #set text(cjk-latin-spacing: none)
+ /// 第4章介绍了基本的API。
+ /// ```
+ pub cjk_latin_spacing: Smart<Option<Never>>,
+
/// An amount to shift the text baseline by.
///
/// ```example
diff --git a/crates/typst-library/src/text/shaping.rs b/crates/typst-library/src/text/shaping.rs
index df30a9c8..1b7d9e09 100644
--- a/crates/typst-library/src/text/shaping.rs
+++ b/crates/typst-library/src/text/shaping.rs
@@ -100,9 +100,7 @@ impl ShapedGlyph {
}
pub fn is_cjk_script(&self) -> bool {
- use Script::*;
- // U+30FC: Katakana-Hiragana Prolonged Sound Mark
- matches!(self.c.script(), Hiragana | Katakana | Han) || self.c == '\u{30FC}'
+ char_is_cjk_script(self.c)
}
pub fn is_cjk_punctuation(&self) -> bool {
@@ -151,6 +149,13 @@ impl ShapedGlyph {
matches!(self.c, '\u{30FB}')
}
+ /// Whether the glyph is a western letter or number.
+ pub fn is_letter_or_number(&self) -> bool {
+ matches!(self.c.script(), Script::Latin | Script::Greek | Script::Cyrillic) // Latin, Greek or Cyrillic script
+ || matches!(self.c, '#' | '$' | '%' | '&') // these four symbols are also accepted
+ || self.c.is_ascii_digit() // ASCII digits only
+ }
+
pub fn base_adjustability(&self, gb_style: bool) -> Adjustability {
let width = self.x_advance;
if self.is_space() {
@@ -958,6 +963,12 @@ fn language(styles: StyleChain) -> rustybuzz::Language {
rustybuzz::Language::from_str(&bcp).unwrap()
}
+pub fn char_is_cjk_script(c: char) -> bool {
+ use Script::*;
+ // True for the Hiragana, Katakana and Han scripts, plus U+30FC (Katakana-Hiragana Prolonged Sound Mark), matched explicitly.
+ matches!(c.script(), Hiragana | Katakana | Han) || c == '\u{30FC}'
+}
+
/// Returns true if all glyphs in `glyphs` have ranges within the range `range`.
#[cfg(debug_assertions)]
fn assert_all_glyphs_in_range(glyphs: &[ShapedGlyph], text: &str, range: Range<usize>) {
diff --git a/tests/ref/layout/cjk-latin-spacing.png b/tests/ref/layout/cjk-latin-spacing.png
new file mode 100644
index 00000000..bd4eed9b
--- /dev/null
+++ b/tests/ref/layout/cjk-latin-spacing.png
Binary files differ
diff --git a/tests/ref/layout/par-justify-cjk.png b/tests/ref/layout/par-justify-cjk.png
index 5a147ca4..a9baf14c 100644
--- a/tests/ref/layout/par-justify-cjk.png
+++ b/tests/ref/layout/par-justify-cjk.png
Binary files differ
diff --git a/tests/ref/text/chinese.png b/tests/ref/text/chinese.png
index 4762558d..0c3ddd00 100644
--- a/tests/ref/text/chinese.png
+++ b/tests/ref/text/chinese.png
Binary files differ
diff --git a/tests/typ/layout/cjk-latin-spacing.typ b/tests/typ/layout/cjk-latin-spacing.typ
new file mode 100644
index 00000000..9cc94fd2
--- /dev/null
+++ b/tests/typ/layout/cjk-latin-spacing.typ
@@ -0,0 +1,16 @@
+// Test CJK-Latin spacing.
+
+#set page(width: 50pt + 10pt, margin: (x: 5pt))
+#set text(lang: "zh", font: "Noto Serif CJK SC", cjk-latin-spacing: auto)
+#set par(justify: true)
+
+中文,中12文1中,文12中文
+
+中文,中ab文a中,文ab中文
+
+#set text(cjk-latin-spacing: none)
+
+中文,中12文1中,文12中文
+
+中文,中ab文a中,文ab中文
+