Use icu4x for linebreaking algorithm (#1355)

author: Peng Guanwen <pg999w@outlook.com> 2023-05-30 23:53:10 +0800
committer: GitHub <noreply@github.com> 2023-05-30 17:53:10 +0200
commit: e2bf2327b53a6b963f7adaeb655fefc1d5fac745 (patch)
tree: 10c3ce9393b256670e987ea5d02d2f88dd602823 /library/src
parent: 11714609b8eb1e2939dfd86923e7b7203879228b (diff)
1 files changed, 71 insertions, 4 deletions
diff --git a/library/src/layout/par.rs b/library/src/layout/par.rs
index e056b1c9..79b36813 100644
--- a/library/src/layout/par.rs
+++ b/library/src/layout/par.rs
@@ -1,7 +1,12 @@
+use icu_properties::{maps::CodePointMapData, LineBreak};
+use icu_provider::AsDeserializingBufferProvider;
+use icu_provider_adapters::fork::ForkByKeyProvider;
+use icu_provider_blob::BlobDataProvider;
+use icu_segmenter::{LineBreakIteratorUtf8, LineSegmenter};
+use once_cell::sync::Lazy;
 use typst::eval::Tracer;
 use unicode_bidi::{BidiInfo, Level as BidiLevel};
 use unicode_script::{Script, UnicodeScript};
-use xi_unicode::LineBreakIterator;
 
 use super::{BoxElem, HElem, Sizing, Spacing};
 use crate::layout::AlignElem;
@@ -998,15 +1003,65 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
     lines
 }
 
+/// Generated by the following command:
+///
+/// ```sh
+/// icu4x-datagen  --locales full --keys-for-bin target/debug/typst \
+///                --format blob --out assets/data/icudata.postcard --overwrite
+/// ```
+///
+/// Install icu4x-datagen with `cargo install icu4x-datagen`.
+static ICU_DATA: &[u8] = include_bytes!("../../../assets/data/icudata.postcard");
+
+/// Generated by the following command:
+///
+/// ```sh
+/// icu4x-datagen --locales zh ja --keys segmenter/line@1 --format blob \
+///               --out assets/data/cj_linebreak_data.postcard --overwrite
+/// ```
+///
+/// The icu4x-datagen binary used here should be patched with this commit:
+/// https://github.com/peng1999/icu4x/commit/b9beb6cbf633d61fc3d7983e5baf7f4449fbfae5
+static CJ_LINEBREAK_DATA: &[u8] =
+    include_bytes!("../../../assets/data/cj_linebreak_data.postcard");
+
+/// The general line break segmenter.
+static SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| {
+    let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
+    LineSegmenter::try_new_lstm_with_buffer_provider(&provider).unwrap()
+});
+
+/// The line break segmenter for Chinese/Japanese text.
+static CJ_SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| {
+    let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
+    let cj_blob = BlobDataProvider::try_new_from_static_blob(CJ_LINEBREAK_DATA).unwrap();
+    let cj_provider = ForkByKeyProvider::new(cj_blob, provider);
+    LineSegmenter::try_new_lstm_with_buffer_provider(&cj_provider).unwrap()
+});
+
+/// The Unicode line break properties for each code point.
+static LINEBREAK_DATA: Lazy<CodePointMapData<LineBreak>> = Lazy::new(|| {
+    let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
+    let deser_provider = provider.as_deserializing();
+    icu_properties::maps::load_line_break(&deser_provider).unwrap()
+});
+
 /// Determine all possible points in the text where lines can broken.
 ///
 /// Returns for each breakpoint the text index, whether the break is mandatory
 /// (after `\n`) and whether a hyphen is required (when breaking inside of a
 /// word).
 fn breakpoints<'a>(p: &'a Preparation<'a>) -> Breakpoints<'a> {
+    let mut linebreaks = if matches!(p.lang, Some(Lang::CHINESE | Lang::JAPANESE)) {
+        CJ_SEGMENTER.segment_str(p.bidi.text)
+    } else {
+        SEGMENTER.segment_str(p.bidi.text)
+    };
+    // The iterator always yields a breakpoint at index 0; we want to ignore it.
+    linebreaks.next();
     Breakpoints {
         p,
-        linebreaks: LineBreakIterator::new(p.bidi.text),
+        linebreaks,
         syllables: None,
         offset: 0,
         suffix: 0,
@@ -1020,7 +1075,7 @@ struct Breakpoints<'a> {
     /// The paragraph's items.
     p: &'a Preparation<'a>,
     /// The inner iterator over the unicode line break opportunities.
-    linebreaks: LineBreakIterator<'a>,
+    linebreaks: LineBreakIteratorUtf8<'a, 'a>,
     /// Iterator over syllables of the current word.
     syllables: Option<hypher::Syllables<'a>>,
     /// The current text offset.
@@ -1054,8 +1109,20 @@ impl Iterator for Breakpoints<'_> {
             return Some((self.offset, self.mandatory && !hyphen, hyphen));
         }
 
+        let lb = LINEBREAK_DATA.as_borrowed();
+
         // Get the next "word".
-        (self.end, self.mandatory) = self.linebreaks.next()?;
+        self.end = self.linebreaks.next()?;
+        self.mandatory =
+            self.p.bidi.text[..self.end].chars().next_back().map_or(false, |c| {
+                matches!(
+                    lb.get(c),
+                    LineBreak::MandatoryBreak
+                        | LineBreak::CarriageReturn
+                        | LineBreak::LineFeed
+                        | LineBreak::NextLine
+                ) || self.end == self.p.bidi.text.len()
+            });
 
         // Hyphenate the next word.
         if self.p.hyphenate != Some(false) {
author	Peng Guanwen <pg999w@outlook.com>	2023-05-30 23:53:10 +0800
committer	GitHub <noreply@github.com>	2023-05-30 17:53:10 +0200
commit	e2bf2327b53a6b963f7adaeb655fefc1d5fac745 (patch)
tree	10c3ce9393b256670e987ea5d02d2f88dd602823 /library/src
parent	11714609b8eb1e2939dfd86923e7b7203879228b (diff)