summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGabriel Araújo <gabriel140492@gmail.com>2024-05-15 10:42:13 -0300
committerGitHub <noreply@github.com>2024-05-15 13:42:13 +0000
commit017f2f45666abdc4a1c6a253f0e04a271b5a82c1 (patch)
treee652e90aefc67124afbfd072e45188a9d9dfe4f7
parent484a0e60d8087fac38b98f697b68f2dd4e829d59 (diff)
Fix hyphen duplication rule for some languages (#4058)
-rw-r--r--crates/typst/src/layout/inline/mod.rs111
-rw-r--r--crates/typst/src/layout/inline/shaping.rs35
-rw-r--r--crates/typst/src/text/lang.rs3
-rw-r--r--tests/ref/hyphenate-es-captalized-names.pngbin0 -> 4238 bytes
-rw-r--r--tests/ref/hyphenate-es-repeat-hyphen.pngbin0 -> 3224 bytes
-rw-r--r--tests/ref/hyphenate-pt-dash-emphasis.pngbin0 -> 983 bytes
-rw-r--r--tests/ref/hyphenate-pt-no-repeat-hyphen.pngbin0 -> 1533 bytes
-rw-r--r--tests/ref/hyphenate-pt-repeat-hyphen-hyphenate-true-with-emphasis.pngbin0 -> 1350 bytes
-rw-r--r--tests/ref/hyphenate-pt-repeat-hyphen-hyphenate-true.pngbin0 -> 1341 bytes
-rw-r--r--tests/ref/hyphenate-pt-repeat-hyphen-natural-word-breaking.pngbin0 -> 1341 bytes
-rw-r--r--tests/suite/layout/inline/hyphenate.typ52
11 files changed, 179 insertions, 22 deletions
diff --git a/crates/typst/src/layout/inline/mod.rs b/crates/typst/src/layout/inline/mod.rs
index 0b73eef6..f8b17f46 100644
--- a/crates/typst/src/layout/inline/mod.rs
+++ b/crates/typst/src/layout/inline/mod.rs
@@ -298,6 +298,19 @@ impl SpanMapper {
}
}
+/// The kind of dash a line ends with.
+#[derive(Debug, Copy, Clone, Eq, PartialEq)]
+pub(super) enum Dash {
+ /// A hyphen inserted by hyphenation or produced by a soft hyphen (U+00AD).
+ SoftHyphen,
+ /// A literal `-` in the text, as found in a compound word, e.g. beija-flor.
+ HardHyphen,
+ /// An em dash (—).
+ Long,
+ /// An en dash (–).
+ Short,
+}
+
/// A layouted line, consisting of a sequence of layouted paragraph items that
/// are mostly borrowed from the preparation phase. This type enables you to
/// measure the size of a line in a range before committing to building the
@@ -327,7 +340,7 @@ struct Line<'a> {
justify: bool,
/// Whether the line ends with a hyphen or dash, either naturally or through
/// hyphenation.
- dash: bool,
+ dash: Option<Dash>,
}
impl<'a> Line<'a> {
@@ -814,8 +827,10 @@ fn linebreak_simple<'a>(
let mut last = None;
breakpoints(p, |end, breakpoint| {
+ let prepend_hyphen = lines.last().map(should_repeat_hyphen).unwrap_or(false);
+
// Compute the line and its size.
- let mut attempt = line(engine, p, start..end, breakpoint);
+ let mut attempt = line(engine, p, start..end, breakpoint, prepend_hyphen);
// If the line doesn't fit anymore, we push the last fitting attempt
// into the stack and rebuild the line from the attempt's end. The
@@ -824,7 +839,7 @@ fn linebreak_simple<'a>(
if let Some((last_attempt, last_end)) = last.take() {
lines.push(last_attempt);
start = last_end;
- attempt = line(engine, p, start..end, breakpoint);
+ attempt = line(engine, p, start..end, breakpoint, prepend_hyphen);
}
}
@@ -894,7 +909,7 @@ fn linebreak_optimized<'a>(
let mut table = vec![Entry {
pred: 0,
total: 0.0,
- line: line(engine, p, 0..0, Breakpoint::Mandatory),
+ line: line(engine, p, 0..0, Breakpoint::Mandatory, false),
}];
let em = p.size;
@@ -908,8 +923,9 @@ fn linebreak_optimized<'a>(
for (i, pred) in table.iter().enumerate().skip(active) {
// Layout the line.
let start = pred.line.end;
+ let prepend_hyphen = should_repeat_hyphen(&pred.line);
- let attempt = line(engine, p, start..end, breakpoint);
+ let attempt = line(engine, p, start..end, breakpoint, prepend_hyphen);
// Determine how much the line's spaces would need to be stretched
// to make it the desired width.
@@ -987,7 +1003,7 @@ fn linebreak_optimized<'a>(
cost = (0.01 + cost).powi(2);
// Penalize two consecutive dashes (not necessarily hyphens) extra.
- if attempt.dash && pred.line.dash {
+ if attempt.dash.is_some() && pred.line.dash.is_some() {
cost += CONSECUTIVE_DASH_COST;
}
@@ -1022,6 +1038,7 @@ fn line<'a>(
p: &'a Preparation,
mut range: Range,
breakpoint: Breakpoint,
+ prepend_hyphen: bool,
) -> Line<'a> {
let end = range.end;
let mut justify =
@@ -1037,7 +1054,7 @@ fn line<'a>(
last: None,
width: Abs::zero(),
justify,
- dash: false,
+ dash: None,
};
}
@@ -1047,7 +1064,7 @@ fn line<'a>(
// Reshape the last item if it's split in half or hyphenated.
let mut last = None;
- let mut dash = false;
+ let mut dash = None;
if let Some((Item::Text(shaped), before)) = inner.split_last() {
// Compute the range we want to shape, trimming whitespace at the
// end of the line.
@@ -1062,7 +1079,17 @@ fn line<'a>(
// Deal with hyphens, dashes and justification.
let shy = trimmed.ends_with('\u{ad}');
let hyphen = breakpoint == Breakpoint::Hyphen;
- dash = hyphen || shy || trimmed.ends_with(['-', '–', '—']);
+ dash = if hyphen || shy {
+ Some(Dash::SoftHyphen)
+ } else if trimmed.ends_with('-') {
+ Some(Dash::HardHyphen)
+ } else if trimmed.ends_with('–') {
+ Some(Dash::Short)
+ } else if trimmed.ends_with('—') {
+ Some(Dash::Long)
+ } else {
+ None
+ };
justify |= text.ends_with('\u{2028}');
// Deal with CJK punctuation at line ends.
@@ -1079,7 +1106,11 @@ fn line<'a>(
// need the shaped empty string to make the line the appropriate
// height. That is the case exactly if the string is empty and there
// are no other items in the line.
- if hyphen || start + shaped.text.len() > range.end || maybe_adjust_last_glyph {
+ if hyphen
+ || start + shaped.text.len() > range.end
+ || maybe_adjust_last_glyph
+ || prepend_hyphen
+ {
if hyphen || start < range.end || before.is_empty() {
let mut reshaped = shaped.reshape(engine, &p.spans, start..range.end);
if hyphen || shy {
@@ -1131,7 +1162,10 @@ fn line<'a>(
let end = range.end.min(base + shaped.text.len());
// Reshape if necessary.
- if range.start + shaped.text.len() > end || maybe_adjust_first_glyph {
+ if range.start + shaped.text.len() > end
+ || maybe_adjust_first_glyph
+ || prepend_hyphen
+ {
// If the range is empty, we don't want to push an empty text item.
if range.start < end {
let reshaped = shaped.reshape(engine, &p.spans, range.start..end);
@@ -1143,6 +1177,15 @@ fn line<'a>(
}
}
+ if prepend_hyphen {
+ let reshaped = first.as_mut().or(last.as_mut()).and_then(Item::text_mut);
+ if let Some(reshaped) = reshaped {
+ let width_before = reshaped.width;
+ reshaped.prepend_hyphen(engine, p.fallback);
+ width += reshaped.width - width_before;
+ }
+ }
+
if maybe_adjust_first_glyph {
let reshaped = first.as_mut().or(last.as_mut()).and_then(Item::text_mut);
if let Some(reshaped) = reshaped {
@@ -1446,3 +1489,49 @@ fn overhang(c: char) -> f64 {
_ => 0.0,
}
}
+
+/// Whether the hyphen ending `pred_line` must repeat at the start of the next line.
+fn should_repeat_hyphen(pred_line: &Line) -> bool {
+ // Only a hard hyphen (a literal `-` in the text) is ever repeated. For any
+ // other line ending, no hyphen is placed at the start of the next line.
+ if pred_line.dash != Some(Dash::HardHyphen) {
+ return false;
+ }
+
+ // If there's a trimmed out space, we needn't repeat the hyphen. That's the
+ // case in a text like "...kebab é a -melhor- comida que existe", where the
+ // hyphens act as a kind of emphasis marker.
+ if pred_line.trimmed.end != pred_line.end {
+ return false;
+ }
+
+ // The hyphen repeats only in languages that require it, judged by the language
+ // of the line's last text item. See https://github.com/typst/typst/issues/3235
+ let Some(Item::Text(shape)) = pred_line.last.as_ref() else { return false };
+
+ match shape.lang {
+ // - Lower Sorbian: see https://dolnoserbski.de/ortografija/psawidla/K3
+ // - Czech: see https://prirucka.ujc.cas.cz/?id=164
+ // - Croatian: see http://pravopis.hr/pravilo/spojnica/68/
+ // - Polish: see https://www.ortograf.pl/zasady-pisowni/lacznik-zasady-pisowni
+ // - Portuguese: see https://www2.senado.leg.br/bdsf/bitstream/handle/id/508145/000997415.pdf (Base XX)
+ // - Slovak: see https://www.zones.sk/studentske-prace/gramatika/10620-pravopis-rozdelovanie-slov/
+ Lang::LOWER_SORBIAN
+ | Lang::CZECH
+ | Lang::CROATIAN
+ | Lang::POLISH
+ | Lang::PORTUGUESE
+ | Lang::SLOVAK => true,
+ // In Spanish the hyphen is repeated only if the text right after the
+ // hyphen does not start with an uppercase letter; otherwise it is not.
+ //
+ // See § 4.1.1.1.2.e in the "Ortografía de la lengua española"
+ // https://www.rae.es/ortografía/como-signo-de-división-de-palabras-a-final-de-línea
+ Lang::SPANISH => pred_line.bidi.text[pred_line.end..]
+ .chars()
+ .next()
+ .map(|c| !c.is_uppercase())
+ .unwrap_or(false),
+ _ => false,
+ }
+}
diff --git a/crates/typst/src/layout/inline/shaping.rs b/crates/typst/src/layout/inline/shaping.rs
index ff13f776..57b94230 100644
--- a/crates/typst/src/layout/inline/shaping.rs
+++ b/crates/typst/src/layout/inline/shaping.rs
@@ -447,6 +447,15 @@ impl<'a> ShapedText<'a> {
/// Push a hyphen to end of the text.
pub fn push_hyphen(&mut self, engine: &Engine, fallback: bool) {
+ self.insert_hyphen(engine, fallback, Side::Right)
+ }
+
+ /// Prepend a hyphen to the start of the text.
+ pub fn prepend_hyphen(&mut self, engine: &Engine, fallback: bool) {
+ self.insert_hyphen(engine, fallback, Side::Left)
+ }
+
+ fn insert_hyphen(&mut self, engine: &Engine, fallback: bool, side: Side) {
let world = engine.world;
let book = world.book();
let fallback_func = if fallback {
@@ -464,17 +473,17 @@ impl<'a> ShapedText<'a> {
let ttf = font.ttf();
let glyph_id = ttf.glyph_index('-')?;
let x_advance = font.to_em(ttf.glyph_hor_advance(glyph_id)?);
- let range = self
- .glyphs
- .last()
- .map(|g| g.range.end..g.range.end)
- // In the unlikely chance that we hyphenate after an empty line,
- // ensure that the glyph range still falls after self.base so
- // that subtracting either of the endpoints by self.base doesn't
- // underflow. See <https://github.com/typst/typst/issues/2283>.
- .unwrap_or_else(|| self.base..self.base);
+ let range = match side {
+ Side::Left => self.glyphs.first().map(|g| g.range.start..g.range.start),
+ Side::Right => self.glyphs.last().map(|g| g.range.end..g.range.end),
+ }
+ // In the unlikely chance that we hyphenate after an empty line,
+ // ensure that the glyph range still falls after self.base so
+ // that subtracting either of the endpoints by self.base doesn't
+ // underflow. See <https://github.com/typst/typst/issues/2283>.
+ .unwrap_or_else(|| self.base..self.base);
self.width += x_advance.at(self.size);
- self.glyphs.to_mut().push(ShapedGlyph {
+ let glyph = ShapedGlyph {
font,
glyph_id: glyph_id.0,
x_advance,
@@ -487,7 +496,11 @@ impl<'a> ShapedText<'a> {
span: (Span::detached(), 0),
is_justifiable: false,
script: Script::Common,
- });
+ };
+ match side {
+ Side::Left => self.glyphs.to_mut().insert(0, glyph),
+ Side::Right => self.glyphs.to_mut().push(glyph),
+ }
Some(())
});
}
diff --git a/crates/typst/src/text/lang.rs b/crates/typst/src/text/lang.rs
index 67df0c6e..6809238a 100644
--- a/crates/typst/src/text/lang.rs
+++ b/crates/typst/src/text/lang.rs
@@ -57,6 +57,7 @@ impl Lang {
pub const BOKMÅL: Self = Self(*b"nb ", 2);
pub const CATALAN: Self = Self(*b"ca ", 2);
pub const CHINESE: Self = Self(*b"zh ", 2);
+ pub const CROATIAN: Self = Self(*b"hr ", 2);
pub const CZECH: Self = Self(*b"cs ", 2);
pub const DANISH: Self = Self(*b"da ", 2);
pub const DUTCH: Self = Self(*b"nl ", 2);
@@ -70,12 +71,14 @@ impl Lang {
pub const HUNGARIAN: Self = Self(*b"hu ", 2);
pub const ITALIAN: Self = Self(*b"it ", 2);
pub const JAPANESE: Self = Self(*b"ja ", 2);
+ pub const LOWER_SORBIAN: Self = Self(*b"dsb", 3);
pub const NYNORSK: Self = Self(*b"nn ", 2);
pub const POLISH: Self = Self(*b"pl ", 2);
pub const PORTUGUESE: Self = Self(*b"pt ", 2);
pub const ROMANIAN: Self = Self(*b"ro ", 2);
pub const RUSSIAN: Self = Self(*b"ru ", 2);
pub const SERBIAN: Self = Self(*b"sr ", 2);
+ pub const SLOVAK: Self = Self(*b"sk ", 2);
pub const SLOVENIAN: Self = Self(*b"sl ", 2);
pub const SPANISH: Self = Self(*b"es ", 2);
pub const SWEDISH: Self = Self(*b"sv ", 2);
diff --git a/tests/ref/hyphenate-es-captalized-names.png b/tests/ref/hyphenate-es-captalized-names.png
new file mode 100644
index 00000000..803d6795
--- /dev/null
+++ b/tests/ref/hyphenate-es-captalized-names.png
Binary files differ
diff --git a/tests/ref/hyphenate-es-repeat-hyphen.png b/tests/ref/hyphenate-es-repeat-hyphen.png
new file mode 100644
index 00000000..a4c5a060
--- /dev/null
+++ b/tests/ref/hyphenate-es-repeat-hyphen.png
Binary files differ
diff --git a/tests/ref/hyphenate-pt-dash-emphasis.png b/tests/ref/hyphenate-pt-dash-emphasis.png
new file mode 100644
index 00000000..cab13ea4
--- /dev/null
+++ b/tests/ref/hyphenate-pt-dash-emphasis.png
Binary files differ
diff --git a/tests/ref/hyphenate-pt-no-repeat-hyphen.png b/tests/ref/hyphenate-pt-no-repeat-hyphen.png
new file mode 100644
index 00000000..d0e34c9b
--- /dev/null
+++ b/tests/ref/hyphenate-pt-no-repeat-hyphen.png
Binary files differ
diff --git a/tests/ref/hyphenate-pt-repeat-hyphen-hyphenate-true-with-emphasis.png b/tests/ref/hyphenate-pt-repeat-hyphen-hyphenate-true-with-emphasis.png
new file mode 100644
index 00000000..0bb23ab1
--- /dev/null
+++ b/tests/ref/hyphenate-pt-repeat-hyphen-hyphenate-true-with-emphasis.png
Binary files differ
diff --git a/tests/ref/hyphenate-pt-repeat-hyphen-hyphenate-true.png b/tests/ref/hyphenate-pt-repeat-hyphen-hyphenate-true.png
new file mode 100644
index 00000000..d08859fb
--- /dev/null
+++ b/tests/ref/hyphenate-pt-repeat-hyphen-hyphenate-true.png
Binary files differ
diff --git a/tests/ref/hyphenate-pt-repeat-hyphen-natural-word-breaking.png b/tests/ref/hyphenate-pt-repeat-hyphen-natural-word-breaking.png
new file mode 100644
index 00000000..d08859fb
--- /dev/null
+++ b/tests/ref/hyphenate-pt-repeat-hyphen-natural-word-breaking.png
Binary files differ
diff --git a/tests/suite/layout/inline/hyphenate.typ b/tests/suite/layout/inline/hyphenate.typ
index bcad4d93..c366b38f 100644
--- a/tests/suite/layout/inline/hyphenate.typ
+++ b/tests/suite/layout/inline/hyphenate.typ
@@ -50,6 +50,58 @@ It's a #emph[Tree]beard.
#set text(hyphenate: true)
#h(6pt) networks, the rest.
+--- hyphenate-pt-repeat-hyphen-natural-word-breaking ---
+// The word breaker naturally breaks arco-da-velha at arco-/-da-velha,
+// so we shall repeat the hyphen, even though hyphenate is set to false.
+#set page(width: 4cm)
+#set text(lang: "pt")
+
+Alguma coisa no arco-da-velha é algo que está muito longe.
+
+--- hyphenate-pt-repeat-hyphen-hyphenate-true ---
+#set page(width: 4cm)
+#set text(lang: "pt", hyphenate: true)
+
+Alguma coisa no arco-da-velha é algo que está muito longe.
+
+--- hyphenate-pt-repeat-hyphen-hyphenate-true-with-emphasis ---
+#set page(width: 4cm)
+#set text(lang: "pt", hyphenate: true)
+
+Alguma coisa no _arco-da-velha_ é algo que está muito longe.
+
+--- hyphenate-pt-no-repeat-hyphen ---
+#set page(width: 4cm)
+#set text(lang: "pt", hyphenate: true)
+
+Um médico otorrinolaringologista cuida da garganta do paciente.
+
+--- hyphenate-pt-dash-emphasis ---
+// If the hyphen is followed by a space we shall not repeat the hyphen
+// at the next line
+#set page(width: 4cm)
+#set text(lang: "pt", hyphenate: true)
+
+Quebabe é a -melhor- comida que existe.
+
+--- hyphenate-es-repeat-hyphen ---
+#set page(width: 6cm)
+#set text(lang: "es", hyphenate: true)
+
+Lo que entendemos por nivel léxico-semántico, en cuanto su sentido más
+gramatical: es aquel que estudia el origen y forma de las palabras de
+un idioma.
+
+--- hyphenate-es-captalized-names ---
+// If the hyphen is followed by a capitalized word we shall not repeat
+// the hyphen at the next line
+#set page(width: 6.2cm)
+#set text(lang: "es", hyphenate: true)
+
+Tras el estallido de la contienda Ruiz-Giménez fue detenido junto a sus
+dos hermanos y puesto bajo custodia por las autoridades republicanas, con
+el objetivo de protegerle de las patrullas de milicianos.
+
--- costs-widow-orphan ---
#set page(height: 60pt)