summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Peng Guanwen <pg999w@outlook.com> 2023-05-11 21:02:52 +0800
committer: GitHub <noreply@github.com> 2023-05-11 15:02:52 +0200
commit: d19a4124de60b043e36e76dfe20fca193deb6a41 (patch)
tree: e71f1906acf454a97f3b8083ca03c60b8b32b7cd
parent: e472b0347f84f39edf4655d39f8b5484870d0a76 (diff)
Implement sophisticated CJK punctuation adjustment (#954)
-rw-r--r--assets/fonts/NotoSerifCJKjp-Regular.otfbin0 -> 3080156 bytes
-rw-r--r--assets/fonts/NotoSerifCJKsc-Regular.otfbin23613224 -> 1350760 bytes
-rw-r--r--assets/fonts/NotoSerifCJKtc-Regular.otfbin0 -> 1569152 bytes
-rw-r--r--library/src/layout/par.rs73
-rw-r--r--library/src/text/shaping.rs135
-rw-r--r--tests/ref/layout/par-justify-cjk.pngbin120972 -> 69448 bytes
-rw-r--r--tests/ref/meta/numbering.pngbin36985 -> 35430 bytes
-rw-r--r--tests/ref/text/chinese.pngbin21925 -> 20759 bytes
-rw-r--r--tests/typ/layout/par-justify-cjk.typ17
9 files changed, 195 insertions, 30 deletions
diff --git a/assets/fonts/NotoSerifCJKjp-Regular.otf b/assets/fonts/NotoSerifCJKjp-Regular.otf
new file mode 100644
index 00000000..6daac206
--- /dev/null
+++ b/assets/fonts/NotoSerifCJKjp-Regular.otf
Binary files differ
diff --git a/assets/fonts/NotoSerifCJKsc-Regular.otf b/assets/fonts/NotoSerifCJKsc-Regular.otf
index 4c5f715b..4f8e5404 100644
--- a/assets/fonts/NotoSerifCJKsc-Regular.otf
+++ b/assets/fonts/NotoSerifCJKsc-Regular.otf
Binary files differ
diff --git a/assets/fonts/NotoSerifCJKtc-Regular.otf b/assets/fonts/NotoSerifCJKtc-Regular.otf
new file mode 100644
index 00000000..7f6da6f7
--- /dev/null
+++ b/assets/fonts/NotoSerifCJKtc-Regular.otf
Binary files differ
diff --git a/library/src/layout/par.rs b/library/src/layout/par.rs
index 2edbd93d..96d45f2f 100644
--- a/library/src/layout/par.rs
+++ b/library/src/layout/par.rs
@@ -8,7 +8,8 @@ use crate::layout::AlignElem;
use crate::math::EquationElem;
use crate::prelude::*;
use crate::text::{
- shape, LinebreakElem, Quoter, Quotes, ShapedText, SmartQuoteElem, SpaceElem, TextElem,
+ is_gb_style, shape, LinebreakElem, Quoter, Quotes, ShapedText, SmartQuoteElem,
+ SpaceElem, TextElem,
};
/// Arrange text, spacing and inline-level elements into a paragraph.
@@ -354,6 +355,13 @@ impl<'a> Item<'a> {
}
}
+    /// Mutable access to the shaped text, available only when this item is
+    /// the `Text` variant; every other variant yields `None`.
+    fn text_mut(&mut self) -> Option<&mut ShapedText<'a>> {
+        if let Self::Text(shaped) = self {
+            Some(shaped)
+        } else {
+            None
+        }
+    }
+
/// The text length of the item.
fn len(&self) -> usize {
match self {
@@ -715,9 +723,12 @@ fn shape_range<'a>(
spans: &SpanMapper,
styles: StyleChain<'a>,
) {
+ let lang = TextElem::lang_in(styles);
+ let region = TextElem::region_in(styles);
let mut process = |range: Range, level: BidiLevel| {
let dir = if level.is_ltr() { Dir::LTR } else { Dir::RTL };
- let shaped = shape(vt, range.start, &bidi.text[range], spans, styles, dir);
+ let shaped =
+ shape(vt, range.start, &bidi.text[range], spans, styles, dir, lang, region);
items.push(Item::Text(shaped));
};
@@ -905,15 +916,11 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
// This often happens with monospace fonts and CJK texts.
ratio = 0.0;
}
- if ratio.is_infinite() {
- // The line's not stretchable, we calculate the ratio in another way...
- ratio = delta / (em / 2.0);
- // ...and because it is underfull/overfull, make sure the ratio is at least 1.0.
- if ratio > 0.0 {
- ratio += 1.0;
- } else {
- ratio -= 1.0;
- }
+ if ratio > 1.0 {
+ // We should stretch the line above its stretchability. Now calculate the extra amount.
+ let extra_stretch = (delta - adjust) / attempt.justifiables() as f64;
+ // Normalize the amount by half Em size.
+ ratio = 1.0 + extra_stretch / (em / 2.0);
}
// Determine the cost of the line.
@@ -1124,7 +1131,9 @@ fn line<'a>(
let base = expanded.end - shaped.text.len();
let start = range.start.max(base);
let text = &p.bidi.text[start..range.end];
- let trimmed = text.trim_end();
+ // U+200B ZERO WIDTH SPACE is used to provide a line break opportunity,
+ // we want to trim it too.
+ let trimmed = text.trim_end().trim_end_matches('\u{200B}');
range.end = start + trimmed.len();
// Deal with hyphens, dashes and justification.
@@ -1132,6 +1141,11 @@ fn line<'a>(
dash = hyphen || shy || trimmed.ends_with(['-', '–', '—']);
justify |= text.ends_with('\u{2028}');
+ // Deal with CJK punctuation at line ends.
+ let gb_style = is_gb_style(shaped.lang, shaped.region);
+ let end_cjk_punct = trimmed
+ .ends_with(['”', '’', ',', '。', '、', ':', ';', '》', ')', '』', '」']);
+
// Usually, we don't want to shape an empty string because:
// - We don't want the height of trimmed whitespace in a different
// font to be considered for the line height.
@@ -1141,12 +1155,21 @@ fn line<'a>(
// need the shaped empty string to make the line the appropriate
// height. That is the case exactly if the string is empty and there
// are no other items in the line.
- if hyphen || start + shaped.text.len() > range.end {
+ if hyphen || start + shaped.text.len() > range.end || end_cjk_punct {
if hyphen || start < range.end || before.is_empty() {
let mut reshaped = shaped.reshape(vt, &p.spans, start..range.end);
if hyphen || shy {
reshaped.push_hyphen(vt);
}
+ let punct = reshaped.glyphs.last();
+ if let Some(punct) = punct {
+ if punct.is_cjk_left_aligned_punctuation(gb_style) {
+ let shrink_amount = punct.shrinkability().1;
+ let punct = reshaped.glyphs.to_mut().last_mut().unwrap();
+ punct.shrink_right(shrink_amount);
+ reshaped.width -= shrink_amount.at(reshaped.size);
+ }
+ }
width += reshaped.width;
last = Some(Item::Text(reshaped));
}
@@ -1155,6 +1178,10 @@ fn line<'a>(
}
}
+ // Deal with CJK punctuation at line starts.
+ let text = &p.bidi.text[range.start..end];
+ let start_cjk_punct = text.starts_with(['“', '‘', '《', '(', '『', '「']);
+
// Reshape the start item if it's split in half.
let mut first = None;
if let Some((Item::Text(shaped), after)) = inner.split_first() {
@@ -1163,8 +1190,8 @@ fn line<'a>(
let end = range.end.min(base + shaped.text.len());
// Reshape if necessary.
- if range.start + shaped.text.len() > end {
- if range.start < end {
+ if range.start + shaped.text.len() > end || start_cjk_punct {
+ if range.start < end || start_cjk_punct {
let reshaped = shaped.reshape(vt, &p.spans, range.start..end);
width += reshaped.width;
first = Some(Item::Text(reshaped));
@@ -1174,6 +1201,22 @@ fn line<'a>(
}
}
+ if start_cjk_punct {
+ let reshaped = first.as_mut().or(last.as_mut()).and_then(Item::text_mut);
+ if let Some(reshaped) = reshaped {
+ if let Some(punct) = reshaped.glyphs.first() {
+ if punct.is_cjk_right_aligned_punctuation() {
+ let shrink_amount = punct.shrinkability().0;
+ let punct = reshaped.glyphs.to_mut().first_mut().unwrap();
+ punct.shrink_left(shrink_amount);
+ let amount_abs = shrink_amount.at(reshaped.size);
+ reshaped.width -= amount_abs;
+ width -= amount_abs;
+ }
+ }
+ }
+ }
+
// Measure the inner items.
for item in inner {
width += item.width();
diff --git a/library/src/text/shaping.rs b/library/src/text/shaping.rs
index b0be8bf6..d0c879fe 100644
--- a/library/src/text/shaping.rs
+++ b/library/src/text/shaping.rs
@@ -23,6 +23,10 @@ pub struct ShapedText<'a> {
pub text: &'a str,
/// The text direction.
pub dir: Dir,
+ /// The text language.
+ pub lang: Lang,
+ /// The text region.
+ pub region: Option<Region>,
/// The text's style properties.
pub styles: StyleChain<'a>,
/// The font variant.
@@ -48,6 +52,8 @@ pub struct ShapedGlyph {
pub x_offset: Em,
/// The vertical offset of the glyph.
pub y_offset: Em,
+ /// The adjustability of the glyph.
+ pub adjustability: Adjustability,
/// The byte range of this glyph's cluster in the full paragraph. A cluster
/// is a sequence of one or multiple glyphs that cannot be separated and
/// must always be treated as a union.
@@ -78,41 +84,67 @@ impl ShapedGlyph {
/// Whether the glyph is justifiable.
pub fn is_justifiable(&self) -> bool {
+ // GB style is not relevant here.
self.is_space()
- || self.is_cjk()
- || self.is_cjk_left_aligned_punctuation()
+ || self.is_cjk_script()
+ || self.is_cjk_left_aligned_punctuation(true)
|| self.is_cjk_right_aligned_punctuation()
+ || self.is_cjk_center_aligned_punctuation(true)
}
- pub fn is_cjk(&self) -> bool {
+ pub fn is_cjk_script(&self) -> bool {
use Script::*;
// U+30FC: Katakana-Hiragana Prolonged Sound Mark
matches!(self.c.script(), Hiragana | Katakana | Han) || self.c == '\u{30FC}'
}
+    /// Whether the glyph is CJK punctuation of any alignment class
+    /// (left-, right- or center-aligned).
+    ///
+    /// The alignment predicates are queried with `gb_style = true`; the
+    /// punctuation marks that switch between the left- and center-aligned
+    /// class depending on the style are matched by one of the two checks
+    /// either way.
+    pub fn is_cjk_adjustable(&self) -> bool {
+        if self.is_cjk_left_aligned_punctuation(true) {
+            return true;
+        }
+        if self.is_cjk_right_aligned_punctuation() {
+            return true;
+        }
+        self.is_cjk_center_aligned_punctuation(true)
+    }
+
/// See <https://www.w3.org/TR/clreq/#punctuation_width_adjustment>
- pub fn is_cjk_left_aligned_punctuation(&self) -> bool {
+ pub fn is_cjk_left_aligned_punctuation(&self, gb_style: bool) -> bool {
// CJK quotation marks shares codepoints with latin quotation marks.
// But only the CJK ones have full width.
- if matches!(self.c, '”' | '’') && self.x_advance == Em::one() {
+ if matches!(self.c, '”' | '’')
+ && self.x_advance + self.stretchability().1 == Em::one()
+ {
return true;
}
- matches!(self.c, ',' | '。' | '、' | ':' | ';' | '》' | ')' | '』' | '」')
+ if gb_style && matches!(self.c, ',' | '。' | '、' | ':' | ';') {
+ return true;
+ }
+
+ matches!(self.c, '》' | ')' | '』' | '」')
}
/// See <https://www.w3.org/TR/clreq/#punctuation_width_adjustment>
pub fn is_cjk_right_aligned_punctuation(&self) -> bool {
// CJK quotation marks shares codepoints with latin quotation marks.
// But only the CJK ones have full width.
- if matches!(self.c, '“' | '‘') && self.x_advance == Em::one() {
+ if matches!(self.c, '“' | '‘')
+ && self.x_advance + self.stretchability().0 == Em::one()
+ {
return true;
}
matches!(self.c, '《' | '(' | '『' | '「')
}
- pub fn adjustability(&self) -> Adjustability {
+    /// Whether the glyph is center-aligned CJK punctuation.
+    ///
+    /// The marks `,` `。` `、` `:` `;` count as center-aligned only when
+    /// `gb_style` is `false`. U+30FB KATAKANA MIDDLE DOT is center-aligned in
+    /// either style.
+    ///
+    /// See <https://www.w3.org/TR/clreq/#punctuation_width_adjustment>
+    pub fn is_cjk_center_aligned_punctuation(&self, gb_style: bool) -> bool {
+        let style_dependent = matches!(self.c, ',' | '。' | '、' | ':' | ';');
+        let middle_dot = self.c == '\u{30FB}';
+        (!gb_style && style_dependent) || middle_dot
+    }
+
+ pub fn base_adjustability(&self, gb_style: bool) -> Adjustability {
let width = self.x_advance;
if self.is_space() {
Adjustability {
@@ -120,7 +152,7 @@ impl ShapedGlyph {
stretchability: (Em::zero(), width / 2.0),
shrinkability: (Em::zero(), width / 3.0),
}
- } else if self.is_cjk_left_aligned_punctuation() {
+ } else if self.is_cjk_left_aligned_punctuation(gb_style) {
Adjustability {
stretchability: (Em::zero(), Em::zero()),
shrinkability: (Em::zero(), width / 2.0),
@@ -130,6 +162,11 @@ impl ShapedGlyph {
stretchability: (Em::zero(), Em::zero()),
shrinkability: (width / 2.0, Em::zero()),
}
+ } else if self.is_cjk_center_aligned_punctuation(gb_style) {
+ Adjustability {
+ stretchability: (Em::zero(), Em::zero()),
+ shrinkability: (width / 4.0, width / 4.0),
+ }
} else {
Adjustability::default()
}
@@ -137,12 +174,27 @@ impl ShapedGlyph {
/// The stretchability of the character.
pub fn stretchability(&self) -> (Em, Em) {
- self.adjustability().stretchability
+ self.adjustability.stretchability
}
/// The shrinkability of the character.
pub fn shrinkability(&self) -> (Em, Em) {
- self.adjustability().shrinkability
+ self.adjustability.shrinkability
+ }
+
+    /// Narrow the glyph by `amount` on its left side.
+    ///
+    /// Both `x_advance` and `x_offset` decrease by `amount`. The left
+    /// shrinkability is reduced by the same amount and the left
+    /// stretchability grows by it.
+    pub fn shrink_left(&mut self, amount: Em) {
+        let Adjustability { stretchability, shrinkability } = &mut self.adjustability;
+        shrinkability.0 -= amount;
+        stretchability.0 += amount;
+        self.x_advance -= amount;
+        self.x_offset -= amount;
+    }
+
+    /// Narrow the glyph by `amount` on its right side.
+    ///
+    /// Only `x_advance` decreases; `x_offset` is left untouched. The right
+    /// shrinkability is reduced by `amount` and the right stretchability
+    /// grows by it.
+    pub fn shrink_right(&mut self, amount: Em) {
+        let Adjustability { stretchability, shrinkability } = &mut self.adjustability;
+        shrinkability.1 -= amount;
+        stretchability.1 += amount;
+        self.x_advance -= amount;
     }
}
@@ -301,7 +353,7 @@ impl<'a> ShapedText<'a> {
pub fn cjk_justifiable_at_last(&self) -> bool {
self.glyphs
.last()
- .map(|g| g.is_cjk() || g.is_cjk_left_aligned_punctuation())
+ .map(|g| g.is_cjk_script() || g.is_cjk_adjustable())
.unwrap_or(false)
}
@@ -339,6 +391,8 @@ impl<'a> ShapedText<'a> {
base: text_range.start,
text,
dir: self.dir,
+ lang: self.lang,
+ region: self.region,
styles: self.styles,
size: self.size,
variant: self.variant,
@@ -346,7 +400,16 @@ impl<'a> ShapedText<'a> {
glyphs: Cow::Borrowed(glyphs),
}
} else {
- shape(vt, text_range.start, text, spans, self.styles, self.dir)
+ shape(
+ vt,
+ text_range.start,
+ text,
+ spans,
+ self.styles,
+ self.dir,
+ self.lang,
+ self.region,
+ )
}
}
@@ -373,6 +436,7 @@ impl<'a> ShapedText<'a> {
x_advance,
x_offset: Em::zero(),
y_offset: Em::zero(),
+ adjustability: Adjustability::default(),
range,
safe_to_break: true,
c: '-',
@@ -462,6 +526,7 @@ struct ShapingContext<'a, 'v> {
}
/// Shape text into [`ShapedText`].
+#[allow(clippy::too_many_arguments)]
pub fn shape<'a>(
vt: &Vt,
base: usize,
@@ -469,6 +534,8 @@ pub fn shape<'a>(
spans: &SpanMapper,
styles: StyleChain<'a>,
dir: Dir,
+ lang: Lang,
+ region: Option<Region>,
) -> ShapedText<'a> {
let size = TextElem::size_in(styles);
let mut ctx = ShapingContext {
@@ -489,11 +556,14 @@ pub fn shape<'a>(
}
track_and_space(&mut ctx);
+ calculate_adjustability(&mut ctx, lang, region);
ShapedText {
base,
text,
dir,
+ lang,
+ region,
styles,
variant: ctx.variant,
size,
@@ -581,6 +651,7 @@ fn shape_segment(
x_advance: font.to_em(pos[i].x_advance),
x_offset: font.to_em(pos[i].x_offset),
y_offset: font.to_em(pos[i].y_offset),
+ adjustability: Adjustability::default(),
range: start..end,
safe_to_break: !info.unsafe_to_break(),
c: text[cluster..].chars().next().unwrap(),
@@ -645,6 +716,7 @@ fn shape_tofus(ctx: &mut ShapingContext, base: usize, text: &str, font: Font) {
x_advance,
x_offset: Em::zero(),
y_offset: Em::zero(),
+ adjustability: Adjustability::default(),
range: start..end,
safe_to_break: true,
c,
@@ -679,6 +751,43 @@ fn track_and_space(ctx: &mut ShapingContext) {
}
}
+/// Whether text in the given language and region uses GB-style punctuation.
+///
+/// Returns `false` only for Chinese text whose region is `"TW"` or `"HK"`;
+/// every other combination, including a missing region, is GB style.
+pub fn is_gb_style(lang: Lang, region: Option<Region>) -> bool {
+    // Most CJK variants, including zh-CN, ja-JP, zh-SG, zh-MY use GB-style
+    // punctuation, while zh-HK and zh-TW use an alternative style. GB style is
+    // the default.
+    let alternative_region =
+        matches!(region.as_ref().map(Region::as_str), Some("TW" | "HK"));
+    lang != Lang::CHINESE || !alternative_region
+}
+
+/// Calculate stretchability and shrinkability of each glyph,
+/// and CJK punctuation adjustments according to Chinese Layout Requirements.
+///
+/// Works in two passes over `ctx.glyphs`:
+/// 1. Every glyph receives its base adjustability for the punctuation style
+///    derived from `lang` and `region`.
+/// 2. For GB style only, each pair of consecutive CJK punctuation glyphs is
+///    compressed by half the advance of the first glyph, provided the right
+///    shrinkability of the first plus the left shrinkability of the second
+///    covers that amount. The first glyph gives up as much as it can on its
+///    right side; the remainder is taken from the left side of the second.
+fn calculate_adjustability(ctx: &mut ShapingContext, lang: Lang, region: Option<Region>) {
+    let gb_style = is_gb_style(lang, region);
+
+    // Assign all base adjustabilities before looking at pairs. The pair check
+    // below reads the shrinkability of the *following* glyph, which would
+    // still be the default value if it were assigned lazily, and a lazy
+    // assignment would later overwrite whatever `shrink_left` recorded on it.
+    for glyph in ctx.glyphs.iter_mut() {
+        glyph.adjustability = glyph.base_adjustability(gb_style);
+    }
+
+    // Only GB style needs further adjustment.
+    if !gb_style {
+        return;
+    }
+
+    // Now we apply consecutive punctuation adjustment, specified in Chinese Layout
+    // Requirements, section 3.1.6.1 Punctuation Adjustment Space, and Japanese Layout
+    // Requirements, section 3.1 Line Composition Rules for Punctuation Marks
+    let mut glyphs = ctx.glyphs.iter_mut().peekable();
+    while let Some(glyph) = glyphs.next() {
+        let Some(next) = glyphs.peek_mut() else { continue };
+
+        // Both neighbours must be CJK punctuation. Without the check on
+        // `glyph`, a narrow non-punctuation glyph followed by an opening
+        // bracket would wrongly eat into the bracket's left side.
+        if !glyph.is_cjk_adjustable() || !next.is_cjk_adjustable() {
+            continue;
+        }
+
+        let delta = glyph.x_advance / 2.0;
+        if glyph.shrinkability().1 + next.shrinkability().0 >= delta {
+            let left_delta = glyph.shrinkability().1.min(delta);
+            glyph.shrink_right(left_delta);
+            next.shrink_left(delta - left_delta);
+        }
+    }
+}
+
/// Difference between non-breaking and normal space.
fn nbsp_delta(font: &Font) -> Option<Em> {
let space = font.ttf().glyph_index(' ')?.0;
diff --git a/tests/ref/layout/par-justify-cjk.png b/tests/ref/layout/par-justify-cjk.png
index 5efcc1e1..89a9af7d 100644
--- a/tests/ref/layout/par-justify-cjk.png
+++ b/tests/ref/layout/par-justify-cjk.png
Binary files differ
diff --git a/tests/ref/meta/numbering.png b/tests/ref/meta/numbering.png
index 984cf04c..8ddf3324 100644
--- a/tests/ref/meta/numbering.png
+++ b/tests/ref/meta/numbering.png
Binary files differ
diff --git a/tests/ref/text/chinese.png b/tests/ref/text/chinese.png
index 89ee357b..4762558d 100644
--- a/tests/ref/text/chinese.png
+++ b/tests/ref/text/chinese.png
Binary files differ
diff --git a/tests/typ/layout/par-justify-cjk.typ b/tests/typ/layout/par-justify-cjk.typ
index d82e72cd..8ceab65f 100644
--- a/tests/typ/layout/par-justify-cjk.typ
+++ b/tests/typ/layout/par-justify-cjk.typ
@@ -5,7 +5,7 @@
// Most Chinese publications do not use hanging punctuation at line end.
#set page(width: auto)
#set par(justify: true)
-#set text(overhang: false, lang: "zh")
+#set text(font: "Noto Serif CJK SC", lang: "zh", overhang: false)
#rect(inset: 0pt, width: 80pt, fill: rgb("eee"))[
中文维基百科使用汉字书写,汉字是汉族或华人的共同文字,是中国大陆、新加坡、马来西亚、台湾、香港、澳门的唯一官方文字或官方文字之一。25.9%,而美国和荷兰则分別占13.7%及8.2%。近年來,中国大陆地区的维基百科编辑者正在迅速增加;
@@ -33,4 +33,17 @@
《书名》《测试》下一行
《书名》《测试》。
-] \ No newline at end of file
+]
+
+---
+// Test Variants of Mainland China, Hong Kong, and Japan.
+
+// 17 characters a line.
+#set page(width: 170pt + 10pt, margin: (x: 5pt))
+#set text(font: "Noto Serif CJK SC", lang: "zh", overhang: false)
+#set par(justify: true)
+
+孔雀最早见于《山海经》中的《海内经》:\u{200b}“有孔雀。”东汉杨孚著《异物志》记载,岭南:“孔雀,其大如大雁而足高,毛皆有斑纹彩,捕而蓄之,拍手即舞。”
+
+#set text(font: "Noto Serif CJK TC", lang: "zh", region: "hk")
+孔雀最早见于《山海经》中的《海内经》:「有孔雀。」东汉杨孚著《异物志》记载,岭南:「孔雀,其大如大雁而足高,毛皆有斑纹彩,捕而蓄之,拍手即舞。」