summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLaurenz <laurmaedje@gmail.com>2024-07-19 13:47:51 +0200
committerGitHub <noreply@github.com>2024-07-19 11:47:51 +0000
commit3ef0991fbb688c22c1d6d0a78c52a5bae3f56f7d (patch)
tree64b676a71214cc7c49cceb6f4c519c663e03cd93
parent42754477886f6a12afbabfd2a64d8c787a57bc03 (diff)
Tune hyphenation (#4584)
-rw-r--r--crates/typst/src/layout/inline/line.rs2
-rw-r--r--crates/typst/src/layout/inline/linebreak.rs135
-rw-r--r--crates/typst/src/text/mod.rs5
-rw-r--r--tests/ref/justify-avoid-runts.pngbin1879 -> 1885 bytes
-rw-r--r--tests/ref/justify-chinese.pngbin6678 -> 6583 bytes
5 files changed, 74 insertions, 68 deletions
diff --git a/crates/typst/src/layout/inline/line.rs b/crates/typst/src/layout/inline/line.rs
index 12162ab1..bf1662ce 100644
--- a/crates/typst/src/layout/inline/line.rs
+++ b/crates/typst/src/layout/inline/line.rs
@@ -133,7 +133,7 @@ pub fn line<'a>(
|| (p.justify && breakpoint != Breakpoint::Mandatory);
// Process dashes.
- let dash = if breakpoint == Breakpoint::Hyphen || full.ends_with(SHY) {
+ let dash = if breakpoint.is_hyphen() || full.ends_with(SHY) {
Some(Dash::Soft)
} else if full.ends_with(HYPHEN) {
Some(Dash::Hard)
diff --git a/crates/typst/src/layout/inline/linebreak.rs b/crates/typst/src/layout/inline/linebreak.rs
index 1f30bb73..defb5f81 100644
--- a/crates/typst/src/layout/inline/linebreak.rs
+++ b/crates/typst/src/layout/inline/linebreak.rs
@@ -1,5 +1,6 @@
use std::ops::{Add, Sub};
+use az::SaturatingAs;
use icu_properties::maps::{CodePointMapData, CodePointMapDataBorrowed};
use icu_properties::sets::CodePointSetData;
use icu_properties::LineBreak;
@@ -21,10 +22,15 @@ use crate::text::{Lang, TextElem};
type Cost = f64;
// Cost parameters.
-const DEFAULT_HYPH_COST: Cost = 0.5;
-const DEFAULT_RUNT_COST: Cost = 0.5;
-const CONSECUTIVE_DASH_COST: Cost = 0.3;
-const MAX_COST: Cost = 1_000_000.0;
+//
+// We choose higher costs than the Knuth-Plass paper (which would be 50) because
+// it hyphenates way to eagerly in Typst otherwise. Could be related to the
+// ratios coming out differently since Typst doesn't have the concept of glue,
+// so things work a bit differently.
+const DEFAULT_HYPH_COST: Cost = 135.0;
+const DEFAULT_RUNT_COST: Cost = 100.0;
+
+// Other parameters.
const MIN_RATIO: f64 = -1.0;
const MIN_APPROX_RATIO: f64 = -0.5;
const BOUND_EPS: f64 = 1e-3;
@@ -65,8 +71,9 @@ pub enum Breakpoint {
Normal,
/// A mandatory breakpoint (after '\n' or at the end of the text).
Mandatory,
- /// An opportunity for hyphenating.
- Hyphen,
+ /// An opportunity for hyphenating and how many chars are before/after it
+ /// in the word.
+ Hyphen(u8, u8),
}
impl Breakpoint {
@@ -95,9 +102,14 @@ impl Breakpoint {
}
// Trim nothing further.
- Self::Hyphen => line,
+ Self::Hyphen(..) => line,
}
}
+
+ /// Whether this is a hyphen breakpoint.
+ pub fn is_hyphen(self) -> bool {
+ matches!(self, Self::Hyphen(..))
+ }
}
/// Breaks the paragraph into lines.
@@ -254,7 +266,6 @@ fn linebreak_optimized_bounded<'a>(
width,
&pred.line,
&attempt,
- end,
breakpoint,
unbreakable,
);
@@ -374,8 +385,6 @@ fn linebreak_optimized_approximate(
let mut prev_end = 0;
breakpoints(p, |end, breakpoint| {
- let at_end = end == p.text.len();
-
// Find the optimal predecessor.
let mut best: Option<Entry> = None;
for (pred_index, pred) in table.iter().enumerate().skip(active) {
@@ -384,13 +393,12 @@ fn linebreak_optimized_approximate(
// Whether the line is justified. This is not 100% accurate w.r.t
// to line()'s behaviour, but good enough.
- let justify = p.justify && !at_end && breakpoint != Breakpoint::Mandatory;
+ let justify = p.justify && breakpoint != Breakpoint::Mandatory;
// We don't really know whether the line naturally ends with a dash
// here, so we can miss that case, but it's ok, since all of this
// just an estimate.
- let consecutive_dash =
- pred.breakpoint == Breakpoint::Hyphen && breakpoint == Breakpoint::Hyphen;
+ let consecutive_dash = pred.breakpoint.is_hyphen() && breakpoint.is_hyphen();
// Estimate how much the line's spaces would need to be stretched to
// make it the desired width. We trim at the end to not take into
@@ -401,7 +409,7 @@ fn linebreak_optimized_approximate(
p,
width,
estimates.widths.estimate(start..trimmed_end)
- + if breakpoint == Breakpoint::Hyphen {
+ + if breakpoint.is_hyphen() {
metrics.approx_hyphen_width
} else {
Abs::zero()
@@ -416,7 +424,6 @@ fn linebreak_optimized_approximate(
metrics,
breakpoint,
line_ratio,
- at_end,
justify,
unbreakable,
consecutive_dash,
@@ -474,17 +481,8 @@ fn linebreak_optimized_approximate(
let Entry { end, breakpoint, unbreakable, .. } = table[idx];
let attempt = line(engine, p, start..end, breakpoint, Some(&pred));
-
- let (_, line_cost) = ratio_and_cost(
- p,
- metrics,
- width,
- &pred,
- &attempt,
- end,
- breakpoint,
- unbreakable,
- );
+ let (_, line_cost) =
+ ratio_and_cost(p, metrics, width, &pred, &attempt, breakpoint, unbreakable);
pred = attempt;
start = end;
@@ -502,7 +500,6 @@ fn ratio_and_cost(
available_width: Abs,
pred: &Line,
attempt: &Line,
- end: usize,
breakpoint: Breakpoint,
unbreakable: bool,
) -> (f64, Cost) {
@@ -519,7 +516,6 @@ fn ratio_and_cost(
metrics,
breakpoint,
ratio,
- end == p.text.len(),
attempt.justify,
unbreakable,
pred.dash.is_some() && attempt.dash.is_some(),
@@ -569,57 +565,64 @@ fn raw_ratio(
}
/// Compute the cost of a line given raw metrics.
-#[allow(clippy::too_many_arguments)]
+///
+/// This mostly follows the formula in the Knuth-Plass paper, but there are some
+/// adjustments.
fn raw_cost(
metrics: &CostMetrics,
breakpoint: Breakpoint,
ratio: f64,
- at_end: bool,
justify: bool,
unbreakable: bool,
consecutive_dash: bool,
approx: bool,
) -> Cost {
- // Determine the cost of the line.
- let mut cost = if ratio < metrics.min_ratio(approx) {
+ // Determine the stretch/shrink cost of the line.
+ let badness = if ratio < metrics.min_ratio(approx) {
// Overfull line always has maximum cost.
- MAX_COST
- } else if breakpoint == Breakpoint::Mandatory || at_end {
- // - If ratio < 0, we always need to shrink the line (even the last one).
- // - If ratio > 0, we need to stretch the line only when it is justified
- // (last line is not justified by default even if `p.justify` is true).
- if ratio < 0.0 || (ratio > 0.0 && justify) {
- ratio.powi(3).abs()
- } else {
- 0.0
- }
+ 1_000_000.0
+ } else if justify || ratio < 0.0 {
+ // If the line shall be justified or needs shrinking, it has normal
+ // badness with cost 100|ratio|^3. We limit the ratio to 10 as to not
+ // get to close to our maximum cost.
+ 100.0 * ratio.abs().min(10.0).powi(3)
} else {
- // Normal line with cost of |ratio^3|.
- ratio.powi(3).abs()
+ // If the line shouldn't be justified and doesn't need shrink, we don't
+ // pay any cost.
+ 0.0
};
- // Penalize runts (lone words in the last line).
- if unbreakable && at_end {
- cost += metrics.runt_cost;
+ // Compute penalties.
+ let mut penalty = 0.0;
+
+ // Penalize runts (lone words before a mandatory break / at the end).
+ if unbreakable && breakpoint == Breakpoint::Mandatory {
+ penalty += metrics.runt_cost;
}
// Penalize hyphenation.
- if breakpoint == Breakpoint::Hyphen {
- cost += metrics.hyph_cost;
+ if let Breakpoint::Hyphen(l, r) = breakpoint {
+ // We penalize hyphenations close to the edges of the word (< LIMIT
+ // chars) extra. For each step of distance from the limit, we add 15%
+ // to the cost.
+ const LIMIT: u8 = 5;
+ let steps = LIMIT.saturating_sub(l) + LIMIT.saturating_sub(r);
+ let extra = 0.15 * steps as f64;
+ penalty += (1.0 + extra) * metrics.hyph_cost;
}
- // In the Knuth paper, cost = (1 + 100|r|^3 + p)^2 + a,
- // where r is the ratio, p=50 is the penalty, and a=3000 is
- // consecutive the penalty. We divide the whole formula by 10,
- // resulting (0.01 + |r|^3 + p)^2 + a, where p=0.5 and a=0.3
- let mut cost = (0.01 + cost).powi(2);
-
- // Penalize two consecutive dashes (not necessarily hyphens) extra.
+ // Penalize two consecutive dashes extra (not necessarily hyphens).
+ // Knuth-Plass does this separately after the squaring, with a higher cost,
+ // but I couldn't find any explanation as to why.
if consecutive_dash {
- cost += CONSECUTIVE_DASH_COST;
+ penalty += metrics.hyph_cost;
}
- cost
+ // From the Knuth-Plass Paper: $ (1 + beta_j + pi_j)^2 $.
+ //
+ // We add one to minimize the number of lines when everything else is more
+ // or less equal.
+ (1.0 + badness + penalty).powi(2)
}
/// Calls `f` for all possible points in the text where lines can broken.
@@ -711,10 +714,13 @@ fn hyphenations(
mut f: impl FnMut(usize, Breakpoint),
) {
let Some(lang) = lang_at(p, offset) else { return };
+ let count = word.chars().count();
let end = offset + word.len();
+ let mut chars = 0;
for syllable in hypher::hyphenate(word, lang) {
offset += syllable.len();
+ chars += syllable.chars().count();
// Don't hyphenate after the final syllable.
if offset == end {
@@ -735,8 +741,12 @@ fn hyphenations(
continue;
}
+ // Determine the number of codepoints before and after the hyphenation.
+ let l = chars.saturating_as::<u8>();
+ let r = (count - chars).saturating_as::<u8>();
+
// Call `f` for the word-internal hyphenation opportunity.
- f(offset, Breakpoint::Hyphen);
+ f(offset, Breakpoint::Hyphen(l, r));
}
}
@@ -825,9 +835,9 @@ fn lang_at(p: &Preparation, offset: usize) -> Option<hypher::Lang> {
struct CostMetrics {
min_ratio: f64,
min_approx_ratio: f64,
+ approx_hyphen_width: Abs,
hyph_cost: Cost,
runt_cost: Cost,
- approx_hyphen_width: Abs,
}
impl CostMetrics {
@@ -837,10 +847,11 @@ impl CostMetrics {
// When justifying, we may stretch spaces below their natural width.
min_ratio: if p.justify { MIN_RATIO } else { 0.0 },
min_approx_ratio: if p.justify { MIN_APPROX_RATIO } else { 0.0 },
- hyph_cost: DEFAULT_HYPH_COST * p.costs.hyphenation().get(),
- runt_cost: DEFAULT_RUNT_COST * p.costs.runt().get(),
// Approximate hyphen width for estimates.
approx_hyphen_width: Em::new(0.33).at(p.size),
+ // Costs.
+ hyph_cost: DEFAULT_HYPH_COST * p.costs.hyphenation().get(),
+ runt_cost: DEFAULT_RUNT_COST * p.costs.runt().get(),
}
}
diff --git a/crates/typst/src/text/mod.rs b/crates/typst/src/text/mod.rs
index d42e4df8..76ea26c1 100644
--- a/crates/typst/src/text/mod.rs
+++ b/crates/typst/src/text/mod.rs
@@ -512,11 +512,6 @@ pub struct TextElem {
/// default of `auto`, prevents them. More nuanced cost specification for
/// these modifications is planned for the future.)
///
- /// The default costs are an acceptable balance, but some may find that it
- /// hyphenates or avoids runs too eagerly, breaking the flow of dense prose.
- /// A cost of 600% (six times the normal cost) may work better for such
- /// contexts.
- ///
/// ```example
/// #set text(hyphenate: true, size: 11.4pt)
/// #set par(justify: true)
diff --git a/tests/ref/justify-avoid-runts.png b/tests/ref/justify-avoid-runts.png
index 70513939..a0c84eec 100644
--- a/tests/ref/justify-avoid-runts.png
+++ b/tests/ref/justify-avoid-runts.png
Binary files differ
diff --git a/tests/ref/justify-chinese.png b/tests/ref/justify-chinese.png
index 0284e8b9..ed9a86f3 100644
--- a/tests/ref/justify-chinese.png
+++ b/tests/ref/justify-chinese.png
Binary files differ