2 files changed, 190 insertions, 0 deletions
diff --git a/crates/typst-library/src/text/linebreak.rs b/crates/typst-library/src/text/linebreak.rs
new file mode 100644
index 00000000..a026df5d
--- /dev/null
+++ b/crates/typst-library/src/text/linebreak.rs
@@ -0,0 +1,188 @@
+use icu_properties::{maps::CodePointMapData, LineBreak};
+use icu_provider::AsDeserializingBufferProvider;
+use icu_provider_adapters::fork::ForkByKeyProvider;
+use icu_provider_blob::BlobDataProvider;
+use icu_segmenter::LineSegmenter;
+use once_cell::sync::Lazy;
+use typst::doc::Lang;
+
+use super::TextElem;
+use crate::layout::Preparation;
+
+/// ICU data blob embedded at compile time; generated by the following command:
+///
+/// ```sh
+/// icu4x-datagen --locales full \
+///               --format blob \
+///               --keys-for-bin target/debug/typst \
+///               --out crates/typst-library/assets/icudata.postcard \
+///               --overwrite
+/// ```
+///
+/// Install icu_datagen with `cargo install icu_datagen`.
+static ICU_DATA: &[u8] = include_bytes!("../../assets/icudata.postcard");
+
+/// Line break data for Chinese/Japanese, embedded at compile time; generated by:
+///
+/// ```sh
+/// icu4x-datagen --locales zh ja \
+///               --format blob \
+///               --keys segmenter/line@1 \
+///               --out crates/typst-library/assets/cj_linebreak_data.postcard \
+///               --overwrite
+/// ```
+///
+/// The icu_datagen used for this must be patched with
+/// https://github.com/peng1999/icu4x/commit/b9beb6cbf633d61fc3d7983e5baf7f4449fbfae5
+static CJ_LINEBREAK_DATA: &[u8] =
+    include_bytes!("../../assets/cj_linebreak_data.postcard");
+
+/// Lazily initialized segmenter for text that is not Chinese/Japanese.
+static SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| {
+    let blob = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
+    LineSegmenter::try_new_lstm_with_buffer_provider(&blob).unwrap()
+});
+
+/// Lazily initialized segmenter used when the language is Chinese or Japanese.
+static CJ_SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| {
+    let base = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
+    let cj_data = BlobDataProvider::try_new_from_static_blob(CJ_LINEBREAK_DATA).unwrap();
+    let forked = ForkByKeyProvider::new(cj_data, base);
+    LineSegmenter::try_new_lstm_with_buffer_provider(&forked).unwrap()
+});
+
+/// Lazily loaded map from each code point to its Unicode line break property.
+static LINEBREAK_DATA: Lazy<CodePointMapData<LineBreak>> = Lazy::new(|| {
+    let blob = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
+    let deserializer = blob.as_deserializing();
+    icu_properties::maps::load_line_break(&deserializer).unwrap()
+});
+
+/// The kind of a line break opportunity reported by [`breakpoints`].
+#[derive(Debug, Copy, Clone, Eq, PartialEq)]
+pub(crate) enum Breakpoint {
+    /// An ordinary, optional opportunity (e.g. after a space).
+    Normal,
+    /// A mandatory break (e.g. after '\n'; also reported at the end of the text).
+    Mandatory,
+    /// A word-internal opportunity that was found by hyphenation.
+    Hyphen,
+}
+
+/// Calls `f` for all possible points in the text where lines can be broken.
+///
+/// Yields for each breakpoint the text index and its [`Breakpoint`] kind:
+/// mandatory (after a hard line break or at the end of the text), a
+/// hyphenation opportunity inside of a word, or a normal opportunity.
+///
+/// This is an internal instead of an external iterator because it makes the
+/// code much simpler and the consumers of this function don't need the
+/// composability and flexibility of external iteration anyway.
+pub(crate) fn breakpoints<'a>(
+    p: &'a Preparation<'a>,
+    mut f: impl FnMut(usize, Breakpoint),
+) {
+    let lb = LINEBREAK_DATA.as_borrowed();
+    let segmenter = match p.lang {
+        Some(Lang::CHINESE | Lang::JAPANESE) => &CJ_SEGMENTER,
+        _ => &SEGMENTER,
+    };
+
+    let hyphenate = p.hyphenate != Some(false);
+    let mut last = 0;
+
+    // Walk over all UAX #14 linebreak opportunities.
+    for point in segmenter.segment_str(p.bidi.text) {
+        // Skip breakpoint if there is no char before it. icu4x generates one
+        // at offset 0, but we don't want it.
+        let Some(c) = p.bidi.text[..point].chars().next_back() else { continue };
+
+        // Find out whether this break is mandatory by checking the preceding
+        // char against rules LB4 and LB5, special-casing the end of text (LB3).
+        // See also: https://docs.rs/icu_segmenter/latest/icu_segmenter/struct.LineSegmenter.html
+        let breakpoint = if point == p.bidi.text.len() {
+            Breakpoint::Mandatory
+        } else {
+            match lb.get(c) {
+                // Fix for: https://github.com/unicode-org/icu4x/issues/4146
+                LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ => continue,
+                LineBreak::MandatoryBreak
+                | LineBreak::CarriageReturn
+                | LineBreak::LineFeed
+                | LineBreak::NextLine => Breakpoint::Mandatory,
+                _ => Breakpoint::Normal,
+            }
+        };
+
+        // Hyphenate between the last and current breakpoint.
+        'hyphenate: {
+            if !hyphenate {
+                break 'hyphenate;
+            }
+
+            // Extract a hyphenatable "word" by trimming trailing non-letters.
+            let word =
+                &p.bidi.text[last..point].trim_end_matches(|c: char| !c.is_alphabetic());
+            if word.is_empty() {
+                break 'hyphenate;
+            }
+
+            let end = last + word.len();
+            let mut offset = last;
+
+            // Determine the language to hyphenate this word in.
+            let Some(lang) = lang_at(p, last) else { break 'hyphenate };
+
+            for syllable in hypher::hyphenate(word, lang) {
+                // Advance to the syllable's end; never hyphenate after the last one.
+                offset += syllable.len();
+                if offset == end {
+                    continue;
+                }
+
+                // Filter out hyphenation opportunities where hyphenation was
+                // actually disabled.
+                if !hyphenate_at(p, offset) {
+                    continue;
+                }
+
+                // Filter out forbidden hyphenation opportunities.
+                if matches!(
+                    syllable.chars().next_back().map(|c| lb.get(c)),
+                    Some(LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ)
+                ) {
+                    continue;
+                }
+
+                // Call `f` for the word-internal hyphenation opportunity.
+                f(offset, Breakpoint::Hyphen);
+            }
+        }
+
+        // Call `f` for the UAX #14 break opportunity.
+        f(point, breakpoint);
+
+        last = point;
+    }
+}
+
+/// Tells whether hyphenation is permitted at the given text offset.
+fn hyphenate_at(p: &Preparation, offset: usize) -> bool {
+    // An explicit setting on the preparation wins over the item's styles.
+    if let Some(enabled) = p.hyphenate {
+        return enabled;
+    }
+    let shaped = p.find(offset).and_then(|item| item.text());
+    shaped.map_or(false, |text| TextElem::hyphenate_in(text.styles))
+}
+
+/// Resolves the hyphenation language for the text at the given offset.
+fn lang_at(p: &Preparation, offset: usize) -> Option<hypher::Lang> {
+    // An explicit language on the preparation wins over the item's styles.
+    let lang = match p.lang {
+        Some(lang) => lang,
+        None => TextElem::lang_in(p.find(offset)?.text()?.styles),
+    };
+    let iso = lang.as_str().as_bytes().try_into().ok()?;
+    hypher::Lang::from_iso(iso)
+}
diff --git a/crates/typst-library/src/text/mod.rs b/crates/typst-library/src/text/mod.rs
index 4d2f5c63..bfa77586 100644
--- a/crates/typst-library/src/text/mod.rs
+++ b/crates/typst-library/src/text/mod.rs
@@ -1,6 +1,7 @@
 //! Text handling.
 
 mod deco;
+mod linebreak;
 mod misc;
 mod quote;
 mod quotes;
@@ -9,6 +10,7 @@ mod shaping;
 mod shift;
 
 pub use self::deco::*;
+pub(crate) use self::linebreak::*;
 pub use self::misc::*;
 pub use self::quote::*;
 pub use self::quotes::*;