From 4c75adbb047cba73b052c2fafa9155e2e4026610 Mon Sep 17 00:00:00 2001 From: Laurenz Date: Sat, 28 Oct 2023 22:02:22 +0200 Subject: Simplify linebreaking Switches from an iterator to callback style, which significantly increases the clarity of the whole thing. --- crates/typst-library/src/text/linebreak.rs | 188 +++++++++++++++++++++++++++++ crates/typst-library/src/text/mod.rs | 2 + 2 files changed, 190 insertions(+) create mode 100644 crates/typst-library/src/text/linebreak.rs (limited to 'crates/typst-library/src/text') diff --git a/crates/typst-library/src/text/linebreak.rs b/crates/typst-library/src/text/linebreak.rs new file mode 100644 index 00000000..a026df5d --- /dev/null +++ b/crates/typst-library/src/text/linebreak.rs @@ -0,0 +1,188 @@ +use icu_properties::{maps::CodePointMapData, LineBreak}; +use icu_provider::AsDeserializingBufferProvider; +use icu_provider_adapters::fork::ForkByKeyProvider; +use icu_provider_blob::BlobDataProvider; +use icu_segmenter::LineSegmenter; +use once_cell::sync::Lazy; +use typst::doc::Lang; + +use super::TextElem; +use crate::layout::Preparation; + +/// Generated by the following command: +/// +/// ```sh +/// icu4x-datagen --locales full \ +/// --format blob \ +/// --keys-for-bin target/debug/typst \ +/// --out crates/typst-library/assets/icudata.postcard \ +/// --overwrite +/// ``` +/// +/// Install icu_datagen with `cargo install icu_datagen`. 
+static ICU_DATA: &[u8] = include_bytes!("../../assets/icudata.postcard");
+
+/// Generated by the following command:
+///
+/// ```sh
+/// icu4x-datagen --locales zh ja \
+///               --format blob \
+///               --keys segmenter/line@1 \
+///               --out crates/typst-library/assets/cj_linebreak_data.postcard \
+///               --overwrite
+/// ```
+///
+/// The icu_datagen used for this should be patched with
+/// https://github.com/peng1999/icu4x/commit/b9beb6cbf633d61fc3d7983e5baf7f4449fbfae5
+static CJ_LINEBREAK_DATA: &[u8] =
+    include_bytes!("../../assets/cj_linebreak_data.postcard");
+
+/// The general line break segmenter.
+///
+/// Constructed on first access from the embedded `ICU_DATA` blob. Panics if
+/// the segmenter cannot be constructed from that data.
+static SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| {
+    let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
+    LineSegmenter::try_new_lstm_with_buffer_provider(&provider).unwrap()
+});
+
+/// The line break segmenter for Chinese/Japanese text.
+///
+/// Constructed on first access from a provider that combines the dedicated
+/// `CJ_LINEBREAK_DATA` blob (listed first) with the general `ICU_DATA` blob.
+/// Panics if either blob or the segmenter cannot be loaded.
+static CJ_SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| {
+    let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
+    let cj_blob = BlobDataProvider::try_new_from_static_blob(CJ_LINEBREAK_DATA).unwrap();
+    let cj_provider = ForkByKeyProvider::new(cj_blob, provider);
+    LineSegmenter::try_new_lstm_with_buffer_provider(&cj_provider).unwrap()
+});
+
+/// The Unicode line break properties for each code point.
+///
+/// Loaded on first access from the embedded `ICU_DATA` blob. Panics if the
+/// property map cannot be loaded from that data.
+static LINEBREAK_DATA: Lazy<CodePointMapData<LineBreak>> = Lazy::new(|| {
+    let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
+    let deser_provider = provider.as_deserializing();
+    icu_properties::maps::load_line_break(&deser_provider).unwrap()
+});
+
+/// A line break opportunity.
+#[derive(Debug, Copy, Clone, Eq, PartialEq)]
+pub(crate) enum Breakpoint {
+    /// Just a normal opportunity (e.g. after a space).
+    Normal,
+    /// A mandatory breakpoint (after '\n' or at the end of the text).
+    Mandatory,
+    /// An opportunity for hyphenating.
+    Hyphen,
+}
+
+/// Calls `f` for all possible points in the text where lines can be broken.
+///
+/// For each opportunity, `f` receives the byte index into `p.bidi.text` at
+/// which a line may end, together with the kind of opportunity:
+/// `Breakpoint::Normal` or `Breakpoint::Mandatory` for the opportunities
+/// reported by the line segmenter and `Breakpoint::Hyphen` for hyphenation
+/// opportunities inside of a word. The hyphenation opportunities of a segment
+/// are passed to `f` before the break that ends the segment.
+///
+/// This is an internal instead of an external iterator because it makes the
+/// code much simpler and the consumers of this function don't need the
+/// composability and flexibility of external iteration anyway.
+pub(crate) fn breakpoints<'a>(
+    p: &'a Preparation<'a>,
+    mut f: impl FnMut(usize, Breakpoint),
+) {
+    let lb = LINEBREAK_DATA.as_borrowed();
+    let segmenter = match p.lang {
+        Some(Lang::CHINESE | Lang::JAPANESE) => &CJ_SEGMENTER,
+        _ => &SEGMENTER,
+    };
+
+    // Hyphenation is attempted unless it is explicitly disabled for the whole
+    // text. If the setting is `None`, `hyphenate_at` decides per offset.
+    let hyphenate = p.hyphenate != Some(false);
+
+    // End of the last segment for which a break was reported. Segments whose
+    // break is skipped below are merged into the following segment.
+    let mut last = 0;
+
+    // Walk over all UAX #14 linebreak opportunities.
+    for point in segmenter.segment_str(p.bidi.text) {
+        // Skip breakpoint if there is no char before it. icu4x generates one
+        // at offset 0, but we don't want it.
+        let Some(c) = p.bidi.text[..point].chars().next_back() else { continue };
+
+        // Find out whether the last break was mandatory by checking against
+        // rules LB4 and LB5, special-casing the end of text according to LB3.
+        // See also: https://docs.rs/icu_segmenter/latest/icu_segmenter/struct.LineSegmenter.html
+        let breakpoint = if point == p.bidi.text.len() {
+            Breakpoint::Mandatory
+        } else {
+            match lb.get(c) {
+                // Fix for: https://github.com/unicode-org/icu4x/issues/4146
+                LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ => continue,
+                LineBreak::MandatoryBreak
+                | LineBreak::CarriageReturn
+                | LineBreak::LineFeed
+                | LineBreak::NextLine => Breakpoint::Mandatory,
+                _ => Breakpoint::Normal,
+            }
+        };
+
+        // Hyphenate between the last and current breakpoint.
+        'hyphenate: {
+            if !hyphenate {
+                break 'hyphenate;
+            }
+
+            // Extract a hyphenatable "word" by dropping trailing characters
+            // that are not alphabetic (e.g. spaces or punctuation).
+            let word = p.bidi.text[last..point]
+                .trim_end_matches(|c: char| !c.is_alphabetic());
+            if word.is_empty() {
+                break 'hyphenate;
+            }
+
+            let end = last + word.len();
+            let mut offset = last;
+
+            // Determine the language to hyphenate this word in.
+            let Some(lang) = lang_at(p, last) else { break 'hyphenate };
+
+            for syllable in hypher::hyphenate(word, lang) {
+                // Don't hyphenate after the final syllable.
+                offset += syllable.len();
+                if offset == end {
+                    continue;
+                }
+
+                // Filter out hyphenation opportunities where hyphenation was
+                // actually disabled.
+                if !hyphenate_at(p, offset) {
+                    continue;
+                }
+
+                // Filter out forbidden hyphenation opportunities.
+                if matches!(
+                    syllable.chars().next_back().map(|c| lb.get(c)),
+                    Some(LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ)
+                ) {
+                    continue;
+                }
+
+                // Call `f` for the word-internal hyphenation opportunity.
+                f(offset, Breakpoint::Hyphen);
+            }
+        }
+
+        // Call `f` for the UAX #14 break opportunity.
+        f(point, breakpoint);
+
+        last = point;
+    }
+}
+
+/// Whether hyphenation is enabled at the given offset.
+///
+/// Uses the setting stored in `p.hyphenate` if there is one. Otherwise, the
+/// setting is read from the styles of the text item found at `offset`. If no
+/// text item is found there, hyphenation counts as disabled.
+fn hyphenate_at(p: &Preparation, offset: usize) -> bool {
+    p.hyphenate
+        .or_else(|| {
+            let shaped = p.find(offset)?.text()?;
+            Some(TextElem::hyphenate_in(shaped.styles))
+        })
+        .unwrap_or(false)
+}
+
+/// The text language at the given offset.
+///
+/// Uses the language stored in `p.lang` if there is one. Otherwise, the
+/// language is read from the styles of the text item found at `offset`.
+///
+/// Returns `None` if no language can be determined, if its code does not fit
+/// the fixed-size byte array expected by `hypher::Lang::from_iso`, or if
+/// `hypher` does not know the code.
+fn lang_at(p: &Preparation, offset: usize) -> Option<hypher::Lang> {
+    let lang = p.lang.or_else(|| {
+        let shaped = p.find(offset)?.text()?;
+        Some(TextElem::lang_in(shaped.styles))
+    })?;
+
+    let bytes = lang.as_str().as_bytes().try_into().ok()?;
+    hypher::Lang::from_iso(bytes)
+}
diff --git a/crates/typst-library/src/text/mod.rs b/crates/typst-library/src/text/mod.rs
index 4d2f5c63..bfa77586 100644
--- a/crates/typst-library/src/text/mod.rs
+++ b/crates/typst-library/src/text/mod.rs
@@ -1,6 +1,7 @@
 //! Text handling.
mod deco; +mod linebreak; mod misc; mod quote; mod quotes; @@ -9,6 +10,7 @@ mod shaping; mod shift; pub use self::deco::*; +pub(crate) use self::linebreak::*; pub use self::misc::*; pub use self::quote::*; pub use self::quotes::*; -- cgit v1.2.3