summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeng Guanwen <pg999w@outlook.com>2023-04-13 16:44:39 +0800
committerGitHub <noreply@github.com>2023-04-13 10:44:39 +0200
commitff16f3fb370a1688e59fbbaf8824302d2b2f1a7b (patch)
tree6b62a1a755252b52f6eedf88cd2d5741337d5923
parent03d2ec9f813cb18c350de78614fcbc269b2dfc96 (diff)
Refine linebreak algorithm for better Chinese justification (#701)
-rw-r--r--library/src/layout/par.rs108
-rw-r--r--library/src/text/shaping.rs113
-rw-r--r--tests/ref/layout/par-justify-cjk.pngbin0 -> 43928 bytes
-rw-r--r--tests/ref/layout/par-justify.pngbin29600 -> 12441 bytes
-rw-r--r--tests/ref/layout/par-knuth.pngbin69406 -> 68723 bytes
-rw-r--r--tests/ref/text/linebreak.pngbin19142 -> 18973 bytes
-rw-r--r--tests/typ/layout/par-justify-cjk.typ23
-rw-r--r--tests/typ/layout/par-justify.typ6
8 files changed, 187 insertions, 63 deletions
diff --git a/library/src/layout/par.rs b/library/src/layout/par.rs
index 0ad9e171..17e07cd0 100644
--- a/library/src/layout/par.rs
+++ b/library/src/layout/par.rs
@@ -457,22 +457,35 @@ impl<'a> Line<'a> {
self.items().skip(start).take(end - start)
}
- /// How many justifiable glyphs the line contains.
+ /// How many glyphs are in the text where we can insert additional
+ /// space when encountering underfull lines.
fn justifiables(&self) -> usize {
let mut count = 0;
for shaped in self.items().filter_map(Item::text) {
count += shaped.justifiables();
}
+ // CJK character at line end should not be adjusted.
+ if self
+ .items()
+ .last()
+ .and_then(Item::text)
+ .map(|s| s.cjk_justifiable_at_last())
+ .unwrap_or(false)
+ {
+ count -= 1;
+ }
+
count
}
- /// How much of the line is stretchable spaces.
- fn stretch(&self) -> Abs {
- let mut stretch = Abs::zero();
- for shaped in self.items().filter_map(Item::text) {
- stretch += shaped.stretch();
- }
- stretch
+ /// How much can the line stretch
+ fn stretchability(&self) -> Abs {
+ self.items().filter_map(Item::text).map(|s| s.stretchability()).sum()
+ }
+
+ /// How much can the line shrink
+ fn shrinkability(&self) -> Abs {
+ self.items().filter_map(Item::text).map(|s| s.shrinkability()).sum()
}
/// The sum of fractions in the line.
@@ -835,10 +848,9 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
// Cost parameters.
const HYPH_COST: Cost = 0.5;
- const CONSECUTIVE_DASH_COST: Cost = 30.0;
+ const CONSECUTIVE_DASH_COST: Cost = 300.0;
const MAX_COST: Cost = 1_000_000.0;
- const MIN_COST: Cost = -MAX_COST;
- const MIN_RATIO: f64 = -0.15;
+ const MIN_RATIO: f64 = -1.0;
// Dynamic programming table.
let mut active = 0;
@@ -864,14 +876,31 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
// Determine how much the line's spaces would need to be stretched
// to make it the desired width.
let delta = width - attempt.width;
- let mut ratio = delta / attempt.stretch();
+ // Determine how much stretch are permitted.
+ let adjust = if delta >= Abs::zero() {
+ attempt.stretchability()
+ } else {
+ attempt.shrinkability()
+ };
+ // Ideally, the ratio should between -1.0 and 1.0, but sometimes a value above 1.0
+ // is possible, in which case the line is underfull.
+ let mut ratio = delta / adjust;
+ if ratio.is_nan() {
+ // The line is not stretchable, but it just fits.
+ // This often happens with monospace fonts and CJK texts.
+ ratio = 0.0;
+ }
if ratio.is_infinite() {
+ // The line's not stretchable, we calculate the ratio in another way...
ratio = delta / (em / 2.0);
+ // ...and because it is underfull/overfull, make sure the ratio is at least 1.0.
+ if ratio > 0.0 {
+ ratio += 1.0;
+ } else {
+ ratio -= 1.0;
+ }
}
- // At some point, it doesn't matter any more.
- ratio = ratio.min(10.0);
-
// Determine the cost of the line.
let min_ratio = if attempt.justify { MIN_RATIO } else { 0.0 };
let mut cost = if ratio < min_ratio {
@@ -883,11 +912,15 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
active = i + 1;
MAX_COST
} else if mandatory || eof {
- // This is a mandatory break and the line is not overfull, so it
- // has minimum cost. All breakpoints before this one become
- // inactive since no line can span above the mandatory break.
+ // This is a mandatory break and the line is not overfull, so
+ // all breakpoints before this one become inactive since no line
+ // can span above the mandatory break.
active = k;
- MIN_COST + if attempt.justify { ratio.powi(3).abs() } else { 0.0 }
+ if attempt.justify {
+ ratio.powi(3).abs()
+ } else {
+ 0.0
+ }
} else {
// Normal line with cost of |ratio^3|.
ratio.powi(3).abs()
@@ -898,6 +931,12 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
cost += HYPH_COST;
}
+ // In Knuth paper, cost = (1 + 100|r|^3 + p)^2 + a,
+ // where r is the ratio, p=50 is penaty, and a=3000 is consecutive penaty.
+ // We divide the whole formula by 10, resulting (0.01 + |r|^3 + p)^2 + a,
+ // where p=0.5 and a=300
+ cost = (0.01 + cost).powi(2);
+
// Penalize two consecutive dashes (not necessarily hyphens) extra.
if attempt.dash && pred.line.dash {
cost += CONSECUTIVE_DASH_COST;
@@ -1233,13 +1272,32 @@ fn commit(
}
}
- // Determine how much to justify each space.
+ // Determine how much addtional space is needed.
+ // The justicication_ratio is for the first step justification,
+ // extra_justification is for the last step.
+ // For more info on multi-step justification, see Procedures for Inter-
+ // Character Space Expansion in W3C document Chinese Layout Requirements.
let fr = line.fr();
- let mut justification = Abs::zero();
- if remaining < Abs::zero() || (line.justify && fr.is_zero()) {
+ let mut justification_ratio = 0.0;
+ let mut extra_justification = Abs::zero();
+
+ let shrink = line.shrinkability();
+ let stretch = line.stretchability();
+ if remaining < Abs::zero() && shrink > Abs::zero() {
+ // Attempt to reduce the length of the line, using shrinkability.
+ justification_ratio = (remaining / shrink).max(-1.0);
+ remaining = (remaining + shrink).min(Abs::zero());
+ } else if line.justify && fr.is_zero() {
+ // Attempt to increase the length of the line, using stretchability.
+ if stretch > Abs::zero() {
+ justification_ratio = (remaining / stretch).min(1.0);
+ remaining = (remaining - stretch).max(Abs::zero());
+ }
+
let justifiables = line.justifiables();
- if justifiables > 0 {
- justification = remaining / justifiables as f64;
+ if justifiables > 0 && remaining > Abs::zero() {
+ // Underfull line, distribute the extra space.
+ extra_justification = remaining / justifiables as f64;
remaining = Abs::zero();
}
}
@@ -1275,7 +1333,7 @@ fn commit(
}
}
Item::Text(shaped) => {
- let frame = shaped.build(vt, justification);
+ let frame = shaped.build(vt, justification_ratio, extra_justification);
push(&mut offset, frame);
}
Item::Frame(frame) => {
diff --git a/library/src/text/shaping.rs b/library/src/text/shaping.rs
index 2dd0cd6d..0e5e0a73 100644
--- a/library/src/text/shaping.rs
+++ b/library/src/text/shaping.rs
@@ -70,22 +70,42 @@ impl ShapedGlyph {
}
/// Whether the glyph is justifiable.
- ///
- /// Typst's basic justification strategy is to stretch all the spaces
- /// in a line until the line fills the available width. However, some
- /// scripts (notably Chinese and Japanese) don't use spaces.
- ///
- /// In Japanese typography, the convention is to insert space evenly
- /// between all glyphs. I assume it's the same in Chinese.
pub fn is_justifiable(&self) -> bool {
- self.is_space() || is_spaceless(self.c.script())
+ self.is_space() || self.is_cjk() || self.is_cjk_punctuation()
+ }
+
+ pub fn is_cjk(&self) -> bool {
+ use Script::*;
+ matches!(self.c.script(), Hiragana | Katakana | Han)
+ }
+
+ pub fn is_cjk_punctuation(&self) -> bool {
+ matches!(self.c, ',' | '。' | '、' | ':' | ';')
+ }
+
+ /// The stretchability of the character.
+ pub fn stretchability(&self) -> Em {
+ let width = self.x_advance;
+ if self.is_space() {
+ // The number for spaces is from Knuth-Plass' paper
+ width / 2.0
+ } else {
+ Em::zero()
+ }
}
-}
-/// Does this script separate its words using spaces?
-fn is_spaceless(script: Script) -> bool {
- use Script::*;
- matches!(script, Hiragana | Katakana | Han)
+ /// The shrinkability of the character.
+ pub fn shrinkability(&self) -> Em {
+ let width = self.x_advance;
+ if self.is_space() {
+ // The number for spaces is from Knuth-Plass' paper
+ width / 3.0
+ } else if self.is_cjk_punctuation() {
+ width / 2.0
+ } else {
+ Em::zero()
+ }
+ }
}
/// A side you can go toward.
@@ -101,7 +121,12 @@ impl<'a> ShapedText<'a> {
///
/// The `justification` defines how much extra advance width each
/// [justifiable glyph](ShapedGlyph::is_justifiable) will get.
- pub fn build(&self, vt: &Vt, justification: Abs) -> Frame {
+ pub fn build(
+ &self,
+ vt: &Vt,
+ justification_ratio: f64,
+ extra_justification: Abs,
+ ) -> Frame {
let (top, bottom) = self.measure(vt);
let size = Size::new(self.width, top + bottom);
@@ -120,19 +145,25 @@ impl<'a> ShapedText<'a> {
let pos = Point::new(offset, top + shift - y_offset.at(self.size));
let glyphs = group
.iter()
- .map(|glyph| Glyph {
- id: glyph.glyph_id,
- x_advance: glyph.x_advance
- + if glyph.is_justifiable() {
- frame.size_mut().x += justification;
- Em::from_length(justification, self.size)
- } else {
- Em::zero()
- },
- x_offset: glyph.x_offset,
- c: glyph.c,
- span: glyph.span,
- offset: glyph.offset,
+ .map(|glyph| {
+ let mut justification = Em::zero();
+ if justification_ratio < 0.0 {
+ justification += glyph.shrinkability() * justification_ratio
+ } else {
+ justification += glyph.stretchability() * justification_ratio
+ }
+ if glyph.is_justifiable() {
+ justification += Em::from_length(extra_justification, self.size)
+ }
+ frame.size_mut().x += justification.at(self.size);
+ Glyph {
+ id: glyph.glyph_id,
+ x_advance: glyph.x_advance + justification,
+ x_offset: glyph.x_offset,
+ c: glyph.c,
+ span: glyph.span,
+ offset: glyph.offset,
+ }
})
.collect();
@@ -200,17 +231,35 @@ impl<'a> ShapedText<'a> {
(top, bottom)
}
- /// How many justifiable glyphs the text contains.
+ /// How many glyphs are in the text where we can insert additional
+ /// space when encountering underfull lines.
pub fn justifiables(&self) -> usize {
self.glyphs.iter().filter(|g| g.is_justifiable()).count()
}
- /// The width of the spaces in the text.
- pub fn stretch(&self) -> Abs {
+ /// Whether the last glyph is a CJK character which should not be justified
+ /// on line end.
+ pub fn cjk_justifiable_at_last(&self) -> bool {
+ self.glyphs
+ .last()
+ .map(|g| g.is_cjk() || g.is_cjk_punctuation())
+ .unwrap_or(false)
+ }
+
+ /// The stretchability of the text.
+ pub fn stretchability(&self) -> Abs {
+ self.glyphs
+ .iter()
+ .map(|g| g.stretchability())
+ .sum::<Em>()
+ .at(self.size)
+ }
+
+ /// The shrinkability of the text
+ pub fn shrinkability(&self) -> Abs {
self.glyphs
.iter()
- .filter(|g| g.is_justifiable())
- .map(|g| g.x_advance)
+ .map(|g| g.shrinkability())
.sum::<Em>()
.at(self.size)
}
diff --git a/tests/ref/layout/par-justify-cjk.png b/tests/ref/layout/par-justify-cjk.png
new file mode 100644
index 00000000..ee58318a
--- /dev/null
+++ b/tests/ref/layout/par-justify-cjk.png
Binary files differ
diff --git a/tests/ref/layout/par-justify.png b/tests/ref/layout/par-justify.png
index 8a1578fe..a16c3f7b 100644
--- a/tests/ref/layout/par-justify.png
+++ b/tests/ref/layout/par-justify.png
Binary files differ
diff --git a/tests/ref/layout/par-knuth.png b/tests/ref/layout/par-knuth.png
index afe590b0..f3da1753 100644
--- a/tests/ref/layout/par-knuth.png
+++ b/tests/ref/layout/par-knuth.png
Binary files differ
diff --git a/tests/ref/text/linebreak.png b/tests/ref/text/linebreak.png
index 64d0d403..c127182c 100644
--- a/tests/ref/text/linebreak.png
+++ b/tests/ref/text/linebreak.png
Binary files differ
diff --git a/tests/typ/layout/par-justify-cjk.typ b/tests/typ/layout/par-justify-cjk.typ
new file mode 100644
index 00000000..1016b282
--- /dev/null
+++ b/tests/typ/layout/par-justify-cjk.typ
@@ -0,0 +1,23 @@
+// Test Chinese text in narrow lines.
+
+// In Chinese typography, line length should be multiples of the character size
+// and the line ends should be aligned with each other.
+// Most Chinese publications do not use hanging punctuation at line end.
+#set page(width: auto)
+#set par(justify: true)
+#set text(overhang: false, lang: "zh")
+
+#rect(inset: 0pt, width: 80pt, fill: rgb("eee"))[
+ 中文维基百科使用汉字书写,汉字是汉族或华人的共同文字,是中国大陆、新加坡、马来西亚、台湾、香港、澳门的唯一官方文字或官方文字之一。25.9%,而美国和荷兰则分別占13.7%及8.2%。近年來,中国大陆地区的维基百科编辑者正在迅速增加;
+]
+
+---
+// Japanese typography is more complex, make sure it is at least a bit sensible.
+#set page(width: auto)
+#set par(justify: true)
+#set text(lang: "jp")
+#rect(inset: 0pt, width: 80pt, fill: rgb("eee"))[
+ ウィキペディア(英: Wikipedia)は、世界中のボランティアの共同作業によって執筆及び作成されるフリーの多言語インターネット百科事典である。主に寄付に依って活動している非営利団体「ウィキメディア財団」が所有・運営している。
+
+ 専門家によるオンライン百科事典プロジェクトNupedia(ヌーペディア)を前身として、2001年1月、ラリー・サンガーとジミー・ウェールズ(英: Jimmy Donal "Jimbo" Wales)により英語でプロジェクトが開始された。
+] \ No newline at end of file
diff --git a/tests/typ/layout/par-justify.typ b/tests/typ/layout/par-justify.typ
index 24d3ab38..7034a42b 100644
--- a/tests/typ/layout/par-justify.typ
+++ b/tests/typ/layout/par-justify.typ
@@ -21,12 +21,6 @@ A B C #linebreak(justify: true)
D E F #linebreak(justify: true)
---
-// Test that justificating chinese text is at least a bit sensible.
-#set page(width: 200pt)
-#set par(justify: true)
-中文维基百科使用汉字书写,汉字是汉族或华人的共同文字,是中国大陆、新加坡、马来西亚、台湾、香港、澳门的唯一官方文字或官方文字之一。25.9%,而美国和荷兰则分別占13.7%及8.2%。近年來,中国大陆地区的维基百科编辑者正在迅速增加;
-
----
// Test that there are no hick-ups with justification enabled and
// basically empty paragraph.
#set par(justify: true)