Optimize optimized paragraph layout (#4483)

author: Laurenz <laurmaedje@gmail.com> 2024-07-01 15:04:58 +0200
committer: GitHub <noreply@github.com> 2024-07-01 13:04:58 +0000
commit: 6d835ecb9278490f645ad0a1b86ec84e752d3220 (patch)
tree: 9d8b3ecc0c374a53214d4c1fb73699a52ce80322
parent: 45366c0112ac7a6197cee35f1e180c6a00923e05 (diff)
3 files changed, 590 insertions, 152 deletions
diff --git a/crates/typst/src/layout/inline/line.rs b/crates/typst/src/layout/inline/line.rs
index 2473f958..232a1c6b 100644
--- a/crates/typst/src/layout/inline/line.rs
+++ b/crates/typst/src/layout/inline/line.rs
@@ -3,7 +3,7 @@ use unicode_bidi::BidiInfo;
 use super::*;
 use crate::engine::Engine;
 use crate::layout::{Abs, Em, Fr, Frame, FrameItem, Point};
-use crate::text::TextElem;
+use crate::text::{Lang, TextElem};
 use crate::utils::Numeric;
 
 /// A layouted line, consisting of a sequence of layouted paragraph items that
@@ -99,6 +99,15 @@ impl<'a> Line<'a> {
         self.items().filter_map(Item::text).map(|s| s.shrinkability()).sum()
     }
 
+    /// Whether the line has items with negative width.
+    pub fn has_negative_width_items(&self) -> bool {
+        self.items().any(|item| match item {
+            Item::Absolute(amount, _) => *amount < Abs::zero(),
+            Item::Frame(frame, _) => frame.width() < Abs::zero(),
+            _ => false,
+        })
+    }
+
     /// The sum of fractions in the line.
     pub fn fr(&self) -> Fr {
         self.items()
@@ -129,7 +138,7 @@ pub fn line<'a>(
     p: &'a Preparation,
     mut range: Range,
     breakpoint: Breakpoint,
-    prepend_hyphen: bool,
+    pred: Option<&Line>,
 ) -> Line<'a> {
     let end = range.end;
     let mut justify =
@@ -149,6 +158,8 @@ pub fn line<'a>(
         };
     }
 
+    let prepend_hyphen = pred.map_or(false, should_insert_hyphen);
+
     // Slice out the relevant items.
     let (mut expanded, mut inner) = p.slice(range.clone());
     let mut width = Abs::zero();
@@ -528,6 +539,54 @@ fn reorder<'a>(line: &'a Line<'a>) -> (Vec<&Item<'a>>, bool) {
     (reordered, starts_rtl)
 }
 
+/// Whether a hyphen should be inserted at the start of the next line.
+fn should_insert_hyphen(pred_line: &Line) -> bool {
+    // If the predecessor line does not end with a Dash::HardHyphen, we shall
+    // not place a hyphen at the start of the next line.
+    if pred_line.dash != Some(Dash::HardHyphen) {
+        return false;
+    }
+
+    // If there's a trimmed out space, we needn't repeat the hyphen. That's the
+    // case of a text like "...kebab é a -melhor- comida que existe", where the
+    // hyphens are a kind of emphasis marker.
+    if pred_line.trimmed.end != pred_line.end {
+        return false;
+    }
+
+    // The hyphen should repeat only in the languages that require that feature.
+    // For more information see the discussion at https://github.com/typst/typst/issues/3235
+    let Some(Item::Text(shape)) = pred_line.last.as_ref() else { return false };
+
+    match shape.lang {
+        // - Lower Sorbian: see https://dolnoserbski.de/ortografija/psawidla/K3
+        // - Czech: see https://prirucka.ujc.cas.cz/?id=164
+        // - Croatian: see http://pravopis.hr/pravilo/spojnica/68/
+        // - Polish: see https://www.ortograf.pl/zasady-pisowni/lacznik-zasady-pisowni
+        // - Portuguese: see https://www2.senado.leg.br/bdsf/bitstream/handle/id/508145/000997415.pdf (Base XX)
+        // - Slovak: see https://www.zones.sk/studentske-prace/gramatika/10620-pravopis-rozdelovanie-slov/
+        Lang::LOWER_SORBIAN
+        | Lang::CZECH
+        | Lang::CROATIAN
+        | Lang::POLISH
+        | Lang::PORTUGUESE
+        | Lang::SLOVAK => true,
+
+        // In Spanish the hyphen is required only if the word next to hyphen is
+        // not capitalized. Otherwise, the hyphen must not be repeated.
+        //
+        // See § 4.1.1.1.2.e on the "Ortografía de la lengua española"
+        // https://www.rae.es/ortografía/como-signo-de-división-de-palabras-a-final-de-línea
+        Lang::SPANISH => pred_line.bidi.text[pred_line.end..]
+            .chars()
+            .next()
+            .map(|c| !c.is_uppercase())
+            .unwrap_or(false),
+
+        _ => false,
+    }
+}
+
 /// How much a character should hang into the end margin.
 ///
 /// For more discussion, see:
diff --git a/crates/typst/src/layout/inline/linebreak.rs b/crates/typst/src/layout/inline/linebreak.rs
index ddf7937b..0555c189 100644
--- a/crates/typst/src/layout/inline/linebreak.rs
+++ b/crates/typst/src/layout/inline/linebreak.rs
@@ -1,3 +1,5 @@
+use std::ops::{Add, Sub};
+
 use icu_properties::maps::CodePointMapData;
 use icu_properties::LineBreak;
 use icu_provider::AsDeserializingBufferProvider;
@@ -8,11 +10,23 @@ use once_cell::sync::Lazy;
 
 use super::*;
 use crate::engine::Engine;
-use crate::layout::Abs;
+use crate::layout::{Abs, Em};
 use crate::model::Linebreaks;
 use crate::syntax::link_prefix;
 use crate::text::{Lang, TextElem};
 
+/// The cost of a line or paragraph layout.
+type Cost = f64;
+
+// Cost parameters.
+const DEFAULT_HYPH_COST: Cost = 0.5;
+const DEFAULT_RUNT_COST: Cost = 0.5;
+const CONSECUTIVE_DASH_COST: Cost = 0.3;
+const MAX_COST: Cost = 1_000_000.0;
+const MIN_RATIO: f64 = -1.0;
+const MIN_APPROX_RATIO: f64 = -0.5;
+const BOUND_EPS: f64 = 1e-3;
+
 /// The general line break segmenter.
 static SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| {
     let provider =
@@ -84,10 +98,8 @@ fn linebreak_simple<'a>(
     let mut last = None;
 
     breakpoints(p, |end, breakpoint| {
-        let prepend_hyphen = lines.last().map(should_repeat_hyphen).unwrap_or(false);
-
         // Compute the line and its size.
-        let mut attempt = line(engine, p, start..end, breakpoint, prepend_hyphen);
+        let mut attempt = line(engine, p, start..end, breakpoint, lines.last());
 
         // If the line doesn't fit anymore, we push the last fitting attempt
         // into the stack and rebuild the line from the attempt's end. The
@@ -96,7 +108,7 @@ fn linebreak_simple<'a>(
             if let Some((last_attempt, last_end)) = last.take() {
                 lines.push(last_attempt);
                 start = last_end;
-                attempt = line(engine, p, start..end, breakpoint, prepend_hyphen);
+                attempt = line(engine, p, start..end, breakpoint, lines.last());
             }
         }
 
@@ -142,144 +154,142 @@ fn linebreak_optimized<'a>(
     p: &'a Preparation<'a>,
     width: Abs,
 ) -> Vec<Line<'a>> {
-    /// The cost of a line or paragraph layout.
-    type Cost = f64;
+    let metrics = CostMetrics::compute(p);
+
+    // Determines the exact costs of a likely good layout through Knuth-Plass
+    // with approximate metrics. We can use this cost as an upper bound to prune
+    // the search space in our proper optimization pass below.
+    let upper_bound = linebreak_optimized_approximate(engine, p, width, &metrics);
 
-    /// An entry in the dynamic programming table.
+    // Using the upper bound, perform exact optimized linebreaking.
+    linebreak_optimized_bounded(engine, p, width, &metrics, upper_bound)
+}
+
+/// Performs line breaking in optimized Knuth-Plass style, but with an upper
+/// bound on the cost. This allows us to skip many parts of the search space.
+#[typst_macros::time]
+fn linebreak_optimized_bounded<'a>(
+    engine: &Engine,
+    p: &'a Preparation<'a>,
+    width: Abs,
+    metrics: &CostMetrics,
+    upper_bound: Cost,
+) -> Vec<Line<'a>> {
+    /// An entry in the dynamic programming table for paragraph optimization.
     struct Entry<'a> {
         pred: usize,
         total: Cost,
         line: Line<'a>,
     }
 
-    // Cost parameters.
-    const DEFAULT_HYPH_COST: Cost = 0.5;
-    const DEFAULT_RUNT_COST: Cost = 0.5;
-    const CONSECUTIVE_DASH_COST: Cost = 0.3;
-    const MAX_COST: Cost = 1_000_000.0;
-    const MIN_RATIO: f64 = -1.0;
-
-    let hyph_cost = DEFAULT_HYPH_COST * p.costs.hyphenation().get();
-    let runt_cost = DEFAULT_RUNT_COST * p.costs.runt().get();
-
     // Dynamic programming table.
-    let mut active = 0;
     let mut table = vec![Entry {
         pred: 0,
         total: 0.0,
-        line: line(engine, p, 0..0, Breakpoint::Mandatory, false),
+        line: line(engine, p, 0..0, Breakpoint::Mandatory, None),
     }];
 
-    let em = p.size;
-    let mut lines = Vec::with_capacity(16);
-    breakpoints(p, |end, breakpoint| {
-        let k = table.len();
-        let is_end = end == p.bidi.text.len();
-        let mut best: Option<Entry> = None;
+    let mut active = 0;
+    let mut prev_end = 0;
 
+    breakpoints(p, |end, breakpoint| {
         // Find the optimal predecessor.
-        for (i, pred) in table.iter().enumerate().skip(active) {
-            // Layout the line.
-            let start = pred.line.end;
-            let prepend_hyphen = should_repeat_hyphen(&pred.line);
+        let mut best: Option<Entry> = None;
 
-            let attempt = line(engine, p, start..end, breakpoint, prepend_hyphen);
+        // A lower bound for the cost of all following line attempts.
+        let mut line_lower_bound = None;
 
-            // Determine how much the line's spaces would need to be stretched
-            // to make it the desired width.
-            let delta = width - attempt.width;
-            // Determine how much stretch are permitted.
-            let adjust = if delta >= Abs::zero() {
-                attempt.stretchability()
-            } else {
-                attempt.shrinkability()
-            };
-            // Ideally, the ratio should between -1.0 and 1.0, but sometimes a
-            // value above 1.0 is possible, in which case the line is underfull.
-            let mut ratio = delta / adjust;
-            if ratio.is_nan() {
-                // The line is not stretchable, but it just fits. This often
-                // happens with monospace fonts and CJK texts.
-                ratio = 0.0;
-            }
-            if ratio > 1.0 {
-                // We should stretch the line above its stretchability. Now
-                // calculate the extra amount. Also, don't divide by zero.
-                let extra_stretch =
-                    (delta - adjust) / attempt.justifiables().max(1) as f64;
-                // Normalize the amount by half Em size.
-                ratio = 1.0 + extra_stretch / (em / 2.0);
+        for (pred_index, pred) in table.iter().enumerate().skip(active) {
+            let start = pred.line.end;
+            let unbreakable = prev_end == start;
+
+            // If the minimum cost we've established for the line is already
+            // too much, skip this attempt.
+            if line_lower_bound
+                .is_some_and(|lower| pred.total + lower > upper_bound + BOUND_EPS)
+            {
+                continue;
             }
 
-            // Determine the cost of the line.
-            let min_ratio = if p.justify { MIN_RATIO } else { 0.0 };
-            let mut cost = if ratio < min_ratio {
-                // The line is overfull. This is the case if
-                // - justification is on, but we'd need to shrink too much
-                // - justification is off and the line just doesn't fit
-                //
-                // If this is the earliest breakpoint in the active set
-                // (active == i), remove it from the active set. If there is an
-                // earlier one (active < i), then the logically shorter line was
-                // in fact longer (can happen with negative spacing) and we
-                // can't trim the active set just yet.
-                if active == i {
-                    active += 1;
-                }
-                MAX_COST
-            } else if breakpoint == Breakpoint::Mandatory || is_end {
-                // This is a mandatory break and the line is not overfull, so
-                // all breakpoints before this one become inactive since no line
-                // can span above the mandatory break.
-                active = k;
-                // - If ratio > 0, we need to stretch the line only when justify
-                //   is needed.
-                // - If ratio < 0, we always need to shrink the line.
-                if (ratio > 0.0 && attempt.justify) || ratio < 0.0 {
-                    ratio.powi(3).abs()
-                } else {
-                    0.0
-                }
-            } else {
-                // Normal line with cost of |ratio^3|.
-                ratio.powi(3).abs()
-            };
-
-            // Penalize runts.
-            if k == i + 1 && is_end {
-                cost += runt_cost;
+            // Build the line.
+            let attempt = line(engine, p, start..end, breakpoint, Some(&pred.line));
+
+            // Determine the cost of the line and its stretch ratio.
+            let (line_ratio, line_cost) = ratio_and_cost(
+                p,
+                metrics,
+                width,
+                &pred.line,
+                &attempt,
+                breakpoint,
+                unbreakable,
+            );
+
+            // If the line is overfull, we adjust the set of active candidate
+            // line starts. This is the case if
+            // - justification is on, but we'd need to shrink too much
+            // - justification is off and the line just doesn't fit
+            //
+            // If this is the earliest breakpoint in the active set
+            // (active == i), remove it from the active set. If there is an
+            // earlier one (active < i), then the logically shorter line was
+            // in fact longer (can happen with negative spacing) and we
+            // can't trim the active set just yet.
+            if line_ratio < metrics.min_ratio && active == pred_index {
+                active += 1;
             }
 
-            // Penalize hyphens.
-            if breakpoint == Breakpoint::Hyphen {
-                cost += hyph_cost;
+            // The total cost of this line and its chain of predecessors.
+            let total = pred.total + line_cost;
+
+            // If the line is already underfull (`line_ratio > 0`), any shorter
+            // slice of the line will be even more underfull. So it'll only get
+            // worse from here and further attempts would also have a cost
+            // exceeding `bound`. There is one exception: When the line has
+            // negative spacing, we can't know for sure, so we don't assign the
+            // lower bound in that case.
+            if line_ratio > 0.0
+                && line_lower_bound.is_none()
+                && !attempt.has_negative_width_items()
+            {
+                line_lower_bound = Some(line_cost);
             }
 
-            // In Knuth paper, cost = (1 + 100|r|^3 + p)^2 + a,
-            // where r is the ratio, p=50 is the penalty, and a=3000 is
-            // consecutive the penalty. We divide the whole formula by 10,
-            // resulting (0.01 + |r|^3 + p)^2 + a, where p=0.5 and a=0.3
-            cost = (0.01 + cost).powi(2);
-
-            // Penalize two consecutive dashes (not necessarily hyphens) extra.
-            if attempt.dash.is_some() && pred.line.dash.is_some() {
-                cost += CONSECUTIVE_DASH_COST;
+            // If the cost already exceeds the upper bound, we don't need to
+            // integrate this result into the table.
+            if total > upper_bound + BOUND_EPS {
+                continue;
             }
 
-            // The total cost of this line and its chain of predecessors.
-            let total = pred.total + cost;
-
             // If this attempt is better than what we had before, take it!
             if best.as_ref().map_or(true, |best| best.total >= total) {
-                best = Some(Entry { pred: i, total, line: attempt });
+                best = Some(Entry { pred: pred_index, total, line: attempt });
             }
         }
 
-        table.push(best.unwrap());
+        // If this is a mandatory break, all breakpoints before this one become
+        // inactive since no line can span over the mandatory break.
+        if breakpoint == Breakpoint::Mandatory {
+            active = table.len();
+        }
+
+        table.extend(best);
+        prev_end = end;
     });
 
     // Retrace the best path.
+    let mut lines = Vec::with_capacity(16);
     let mut idx = table.len() - 1;
+
+    // This should only happen if our bound was faulty. Which shouldn't happen!
+    if table[idx].line.end != p.bidi.text.len() {
+        #[cfg(debug_assertions)]
+        panic!("bounded paragraph layout is incomplete");
+
+        #[cfg(not(debug_assertions))]
+        return linebreak_optimized_bounded(engine, p, width, metrics, Cost::INFINITY);
+    }
+
     while idx != 0 {
         table.truncate(idx + 1);
         let entry = table.pop().unwrap();
@@ -291,6 +301,282 @@ fn linebreak_optimized<'a>(
     lines
 }
 
+/// Runs the normal Knuth-Plass algorithm, but instead of building proper lines
+/// (which is costly) to determine costs, it determines approximate costs using
+/// cummulative arrays.
+///
+/// This results in a likely good paragraph layouts, for which we then compute
+/// the exact cost. This cost is an upper bound for proper optimized
+/// linebreaking. We can use it to heavily prune the search space.
+#[typst_macros::time]
+fn linebreak_optimized_approximate(
+    engine: &Engine,
+    p: &Preparation,
+    width: Abs,
+    metrics: &CostMetrics,
+) -> Cost {
+    // Determine the cummulative estimation metrics.
+    let estimates = Estimates::compute(p);
+
+    /// An entry in the dynamic programming table for paragraph optimization.
+    struct Entry {
+        pred: usize,
+        total: Cost,
+        end: usize,
+        unbreakable: bool,
+        breakpoint: Breakpoint,
+    }
+
+    // Dynamic programming table.
+    let mut table = vec![Entry {
+        pred: 0,
+        total: 0.0,
+        end: 0,
+        unbreakable: false,
+        breakpoint: Breakpoint::Mandatory,
+    }];
+
+    let mut active = 0;
+    let mut prev_end = 0;
+
+    breakpoints(p, |end, breakpoint| {
+        let at_end = end == p.bidi.text.len();
+
+        // Find the optimal predecessor.
+        let mut best: Option<Entry> = None;
+        for (pred_index, pred) in table.iter().enumerate().skip(active) {
+            let start = pred.end;
+            let unbreakable = prev_end == start;
+
+            // Whether the line is justified. This is not 100% accurate w.r.t
+            // to line()'s behaviour, but good enough.
+            let justify = p.justify && !at_end && breakpoint != Breakpoint::Mandatory;
+
+            // We don't really know whether the line naturally ends with a dash
+            // here, so we can miss that case, but it's ok, since all of this
+            // just an estimate.
+            let consecutive_dash =
+                pred.breakpoint == Breakpoint::Hyphen && breakpoint == Breakpoint::Hyphen;
+
+            // Estimate how much the line's spaces would need to be stretched to
+            // make it the desired width. We trim at the end to not take into
+            // account trailing spaces. This is, again, only an approximation of
+            // the real behaviour of `line`.
+            let trimmed_end = start + p.bidi.text[start..end].trim_end().len();
+            let line_ratio = raw_ratio(
+                p,
+                width,
+                estimates.widths.estimate(start..trimmed_end)
+                    + if breakpoint == Breakpoint::Hyphen {
+                        metrics.approx_hyphen_width
+                    } else {
+                        Abs::zero()
+                    },
+                estimates.stretchability.estimate(start..trimmed_end),
+                estimates.shrinkability.estimate(start..trimmed_end),
+                estimates.justifiables.estimate(start..trimmed_end),
+            );
+
+            // Determine the line's cost.
+            let line_cost = raw_cost(
+                metrics,
+                breakpoint,
+                line_ratio,
+                at_end,
+                justify,
+                unbreakable,
+                consecutive_dash,
+                true,
+            );
+
+            // Adjust the set of active breakpoints.
+            // See `linebreak_optimized` for details.
+            if line_ratio < metrics.min_ratio && active == pred_index {
+                active += 1;
+            }
+
+            // The total cost of this line and its chain of predecessors.
+            let total = pred.total + line_cost;
+
+            // If this attempt is better than what we had before, take it!
+            if best.as_ref().map_or(true, |best| best.total >= total) {
+                best = Some(Entry {
+                    pred: pred_index,
+                    total,
+                    end,
+                    unbreakable,
+                    breakpoint,
+                });
+            }
+        }
+
+        // If this is a mandatory break, all breakpoints before this one become
+        // inactive.
+        if breakpoint == Breakpoint::Mandatory {
+            active = table.len();
+        }
+
+        table.extend(best);
+        prev_end = end;
+    });
+
+    // Retrace the best path.
+    let mut indices = Vec::with_capacity(16);
+    let mut idx = table.len() - 1;
+    while idx != 0 {
+        indices.push(idx);
+        idx = table[idx].pred;
+    }
+
+    let mut exact = 0.0;
+    let mut pred = line(engine, p, 0..0, Breakpoint::Mandatory, None);
+
+    // The cost that we optimized was only an approximate cost, so the layout we
+    // got here is only likely to be good, not guaranteed to be the best. We now
+    // computes its exact cost as that gives us a sound upper bound for the
+    // proper optimization pass.
+    for idx in indices.into_iter().rev() {
+        let Entry { end, breakpoint, unbreakable, .. } = table[idx];
+
+        let start = pred.end;
+        let attempt = line(engine, p, start..end, breakpoint, Some(&pred));
+
+        let (_, line_cost) =
+            ratio_and_cost(p, metrics, width, &pred, &attempt, breakpoint, unbreakable);
+
+        exact += line_cost;
+        pred = attempt;
+    }
+
+    exact
+}
+
+/// Compute the stretch ratio and cost of a line.
+fn ratio_and_cost(
+    p: &Preparation,
+    metrics: &CostMetrics,
+    available_width: Abs,
+    pred: &Line,
+    attempt: &Line,
+    breakpoint: Breakpoint,
+    unbreakable: bool,
+) -> (f64, Cost) {
+    let ratio = raw_ratio(
+        p,
+        available_width,
+        attempt.width,
+        attempt.stretchability(),
+        attempt.shrinkability(),
+        attempt.justifiables(),
+    );
+
+    let cost = raw_cost(
+        metrics,
+        breakpoint,
+        ratio,
+        attempt.end == p.bidi.text.len(),
+        attempt.justify,
+        unbreakable,
+        pred.dash.is_some() && attempt.dash.is_some(),
+        false,
+    );
+
+    (ratio, cost)
+}
+
+/// Determine the stretch ratio for a line given raw metrics.
+fn raw_ratio(
+    p: &Preparation,
+    available_width: Abs,
+    line_width: Abs,
+    stretchability: Abs,
+    shrinkability: Abs,
+    justifiables: usize,
+) -> f64 {
+    // Determine how much the line's spaces would need to be stretched
+    // to make it the desired width.
+    let delta = available_width - line_width;
+
+    // Determine how much stretch is permitted.
+    let adjust = if delta >= Abs::zero() { stretchability } else { shrinkability };
+
+    // Ideally, the ratio should between -1.0 and 1.0.
+    //
+    // A ratio above 1.0 is possible for an underfull line, but a ratio below
+    // -1.0 is forbidden because the line would overflow.
+    let mut ratio = delta / adjust;
+
+    // The line is not stretchable, but it just fits. This often happens with
+    // monospace fonts and CJK texts.
+    if ratio.is_nan() {
+        ratio = 0.0;
+    }
+
+    if ratio > 1.0 {
+        // We should stretch the line above its stretchability. Now
+        // calculate the extra amount. Also, don't divide by zero.
+        let extra_stretch = (delta - adjust) / justifiables.max(1) as f64;
+        // Normalize the amount by half the em size.
+        ratio = 1.0 + extra_stretch / (p.size / 2.0);
+    }
+
+    ratio
+}
+
+/// Compute the cost of a line given raw metrics.
+#[allow(clippy::too_many_arguments)]
+fn raw_cost(
+    metrics: &CostMetrics,
+    breakpoint: Breakpoint,
+    ratio: f64,
+    at_end: bool,
+    justify: bool,
+    unbreakable: bool,
+    consecutive_dash: bool,
+    approx: bool,
+) -> Cost {
+    // Determine the cost of the line.
+    let mut cost = if ratio < metrics.min_ratio(approx) {
+        // Overfull line always has maximum cost.
+        MAX_COST
+    } else if breakpoint == Breakpoint::Mandatory || at_end {
+        // - If ratio < 0, we always need to shrink the line (even the last one).
+        // - If ratio > 0, we need to stretch the line only when it is justified
+        //   (last line is not justified by default even if `p.justify` is true).
+        if ratio < 0.0 || (ratio > 0.0 && justify) {
+            ratio.powi(3).abs()
+        } else {
+            0.0
+        }
+    } else {
+        // Normal line with cost of |ratio^3|.
+        ratio.powi(3).abs()
+    };
+
+    // Penalize runts (lone words in the last line).
+    if unbreakable && at_end {
+        cost += metrics.runt_cost;
+    }
+
+    // Penalize hyphenation.
+    if breakpoint == Breakpoint::Hyphen {
+        cost += metrics.hyph_cost;
+    }
+
+    // In the Knuth paper, cost = (1 + 100|r|^3 + p)^2 + a,
+    // where r is the ratio, p=50 is the penalty, and a=3000 is
+    // consecutive the penalty. We divide the whole formula by 10,
+    // resulting (0.01 + |r|^3 + p)^2 + a, where p=0.5 and a=0.3
+    let mut cost = (0.01 + cost).powi(2);
+
+    // Penalize two consecutive dashes (not necessarily hyphens) extra.
+    if consecutive_dash {
+        cost += CONSECUTIVE_DASH_COST;
+    }
+
+    cost
+}
+
 /// Calls `f` for all possible points in the text where lines can broken.
 ///
 /// Yields for each breakpoint the text index, whether the break is mandatory
@@ -433,7 +719,7 @@ fn linebreak_link(link: &str, mut f: impl FnMut(usize)) {
         // - other -> other
         // - alphabetic -> numeric
         // - numeric -> alphabetic
-        // Never before after opening delimiters.
+        // Never before/after opening delimiters.
         if end > 0
             && prev != Class::Open
             && if class == Class::Other { prev == Class::Other } else { class != prev }
@@ -478,48 +764,141 @@ fn lang_at(p: &Preparation, offset: usize) -> Option<hypher::Lang> {
     hypher::Lang::from_iso(bytes)
 }
 
-/// Whether the hyphen should repeat at the start of the next line.
-fn should_repeat_hyphen(pred_line: &Line) -> bool {
-    // If the predecessor line does not end with a Dash::HardHyphen, we shall
-    // not place a hyphen at the start of the next line.
-    if pred_line.dash != Some(Dash::HardHyphen) {
-        return false;
+/// Resolved metrics relevant for cost computation.
+struct CostMetrics {
+    min_ratio: f64,
+    min_approx_ratio: f64,
+    hyph_cost: Cost,
+    runt_cost: Cost,
+    approx_hyphen_width: Abs,
+}
+
+impl CostMetrics {
+    /// Compute shared metrics for paragraph optimization.
+    fn compute(p: &Preparation) -> Self {
+        Self {
+            // When justifying, we may stretch spaces below their natural width.
+            min_ratio: if p.justify { MIN_RATIO } else { 0.0 },
+            min_approx_ratio: if p.justify { MIN_APPROX_RATIO } else { 0.0 },
+            hyph_cost: DEFAULT_HYPH_COST * p.costs.hyphenation().get(),
+            runt_cost: DEFAULT_RUNT_COST * p.costs.runt().get(),
+            // Approximate hyphen width for estimates.
+            approx_hyphen_width: Em::new(0.33).at(p.size),
+        }
     }
 
-    // If there's a trimmed out space, we needn't repeat the hyphen. That's the
-    // case of a text like "...kebab é a -melhor- comida que existe", where the
-    // hyphens are a kind of emphasis marker.
-    if pred_line.trimmed.end != pred_line.end {
-        return false;
+    /// The minimum line ratio we allow for shrinking. For approximate layout,
+    /// we allow less because otherwise we get an invalid layout fairly often,
+    /// which makes our bound useless.
+    fn min_ratio(&self, approx: bool) -> f64 {
+        if approx {
+            self.min_approx_ratio
+        } else {
+            self.min_ratio
+        }
     }
+}
+
+/// Estimated line metrics.
+///
+/// Allows to get a quick estimate of a metric for a line between two byte
+/// positions.
+struct Estimates {
+    widths: CummulativeVec<Abs>,
+    stretchability: CummulativeVec<Abs>,
+    shrinkability: CummulativeVec<Abs>,
+    justifiables: CummulativeVec<usize>,
+}
 
-    // The hyphen should repeat only in the languages that require that feature.
-    // For more information see the discussion at https://github.com/typst/typst/issues/3235
-    let Some(Item::Text(shape)) = pred_line.last.as_ref() else { return false };
-
-    match shape.lang {
-        // - Lower Sorbian: see https://dolnoserbski.de/ortografija/psawidla/K3
-        // - Czech: see https://prirucka.ujc.cas.cz/?id=164
-        // - Croatian: see http://pravopis.hr/pravilo/spojnica/68/
-        // - Polish: see https://www.ortograf.pl/zasady-pisowni/lacznik-zasady-pisowni
-        // - Portuguese: see https://www2.senado.leg.br/bdsf/bitstream/handle/id/508145/000997415.pdf (Base XX)
-        // - Slovak: see https://www.zones.sk/studentske-prace/gramatika/10620-pravopis-rozdelovanie-slov/
-        Lang::LOWER_SORBIAN
-        | Lang::CZECH
-        | Lang::CROATIAN
-        | Lang::POLISH
-        | Lang::PORTUGUESE
-        | Lang::SLOVAK => true,
-        // In Spanish the hyphen is required only if the word next to hyphen is
-        // not capitalized. Otherwise, the hyphen must not be repeated.
-        //
-        // See § 4.1.1.1.2.e on the "Ortografía de la lengua española"
-        // https://www.rae.es/ortografía/como-signo-de-división-de-palabras-a-final-de-línea
-        Lang::SPANISH => pred_line.bidi.text[pred_line.end..]
-            .chars()
-            .next()
-            .map(|c| !c.is_uppercase())
-            .unwrap_or(false),
-        _ => false,
+impl Estimates {
+    /// Compute estimations for approximate Knuth-Plass layout.
+    fn compute(p: &Preparation) -> Self {
+        let cap = p.bidi.text.len();
+
+        let mut widths = CummulativeVec::with_capacity(cap);
+        let mut stretchability = CummulativeVec::with_capacity(cap);
+        let mut shrinkability = CummulativeVec::with_capacity(cap);
+        let mut justifiables = CummulativeVec::with_capacity(cap);
+
+        for item in &p.items {
+            let textual_len = item.textual_len();
+            let after = widths.len() + textual_len;
+
+            if let Item::Text(shaped) = item {
+                for g in shaped.glyphs.iter() {
+                    let byte_len = g.range.len();
+                    let stretch = g.stretchability().0 + g.stretchability().1;
+                    let shrink = g.shrinkability().0 + g.shrinkability().1;
+                    widths.push(byte_len, g.x_advance.at(shaped.size));
+                    stretchability.push(byte_len, stretch.at(shaped.size));
+                    shrinkability.push(byte_len, shrink.at(shaped.size));
+                    justifiables.push(byte_len, g.is_justifiable() as usize);
+                }
+            } else {
+                widths.push(textual_len, item.width());
+            }
+
+            widths.adjust(after);
+            stretchability.adjust(after);
+            shrinkability.adjust(after);
+            justifiables.adjust(after);
+        }
+
+        Self {
+            widths,
+            stretchability,
+            shrinkability,
+            justifiables,
+        }
+    }
+}
+
+/// An accumulative array of a metric.
+struct CummulativeVec<T> {
+    total: T,
+    summed: Vec<T>,
+}
+
+impl<T> CummulativeVec<T>
+where
+    T: Default + Copy + Add<Output = T> + Sub<Output = T>,
+{
+    /// Create a new instance with the given capacity.
+    fn with_capacity(capacity: usize) -> Self {
+        let total = T::default();
+        let mut summed = Vec::with_capacity(capacity);
+        summed.push(total);
+        Self { total, summed }
+    }
+
+    /// Get the covered byte length.
+    fn len(&self) -> usize {
+        self.summed.len()
+    }
+
+    /// Adjust to cover the given byte length.
+    fn adjust(&mut self, len: usize) {
+        self.summed.resize(len, self.total);
+    }
+
+    /// Adds a new segment with the given byte length and metric.
+    fn push(&mut self, byte_len: usize, metric: T) {
+        self.total = self.total + metric;
+        for _ in 0..byte_len {
+            self.summed.push(self.total);
+        }
+    }
+
+    /// Estimates the metrics for the line spanned by the range.
+    fn estimate(&self, range: Range) -> T {
+        self.get(range.end) - self.get(range.start)
+    }
+
+    /// Get the metric at the given byte position.
+    fn get(&self, index: usize) -> T {
+        match index.checked_sub(1) {
+            None => T::default(),
+            Some(i) => self.summed[i],
+        }
     }
 }
diff --git a/crates/typst/src/layout/inline/mod.rs b/crates/typst/src/layout/inline/mod.rs
index 94ac89f0..f89de169 100644
--- a/crates/typst/src/layout/inline/mod.rs
+++ b/crates/typst/src/layout/inline/mod.rs
@@ -9,7 +9,7 @@ use comemo::{Track, Tracked, TrackedMut};
 
 use self::collect::{collect, Item, Segment, SpanMapper};
 use self::finalize::finalize;
-use self::line::{commit, line, Dash, Line};
+use self::line::{commit, line, Line};
 use self::linebreak::{linebreak, Breakpoint};
 use self::prepare::{prepare, Preparation};
 use self::shaping::{
author	Laurenz <laurmaedje@gmail.com>	2024-07-01 15:04:58 +0200
committer	GitHub <noreply@github.com>	2024-07-01 13:04:58 +0000
commit	6d835ecb9278490f645ad0a1b86ec84e752d3220 (patch)
tree	9d8b3ecc0c374a53214d4c1fb73699a52ce80322
parent	45366c0112ac7a6197cee35f1e180c6a00923e05 (diff)