From e2bf2327b53a6b963f7adaeb655fefc1d5fac745 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Tue, 30 May 2023 23:53:10 +0800 Subject: Use icu4x for linebreaking algorithm (#1355) --- library/src/layout/par.rs | 75 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 71 insertions(+), 4 deletions(-) (limited to 'library/src') diff --git a/library/src/layout/par.rs b/library/src/layout/par.rs index e056b1c9..79b36813 100644 --- a/library/src/layout/par.rs +++ b/library/src/layout/par.rs @@ -1,7 +1,12 @@ +use icu_properties::{maps::CodePointMapData, LineBreak}; +use icu_provider::AsDeserializingBufferProvider; +use icu_provider_adapters::fork::ForkByKeyProvider; +use icu_provider_blob::BlobDataProvider; +use icu_segmenter::{LineBreakIteratorUtf8, LineSegmenter}; +use once_cell::sync::Lazy; use typst::eval::Tracer; use unicode_bidi::{BidiInfo, Level as BidiLevel}; use unicode_script::{Script, UnicodeScript}; -use xi_unicode::LineBreakIterator; use super::{BoxElem, HElem, Sizing, Spacing}; use crate::layout::AlignElem; @@ -998,15 +1003,65 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec = Lazy::new(|| { + let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap(); + LineSegmenter::try_new_lstm_with_buffer_provider(&provider).unwrap() +}); + +/// The line break segmenter for Chinese/Japanese text. +static CJ_SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| { + let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap(); + let cj_blob = BlobDataProvider::try_new_from_static_blob(CJ_LINEBREAK_DATA).unwrap(); + let cj_provider = ForkByKeyProvider::new(cj_blob, provider); + LineSegmenter::try_new_lstm_with_buffer_provider(&cj_provider).unwrap() +}); + +/// The Unicode line break properties for each code point. 
+static LINEBREAK_DATA: Lazy<CodePointMapData<LineBreak>> = Lazy::new(|| { + let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap(); + let deser_provider = provider.as_deserializing(); + icu_properties::maps::load_line_break(&deser_provider).unwrap() +}); + /// Determine all possible points in the text where lines can be broken. /// /// Returns for each breakpoint the text index, whether the break is mandatory /// (after `\n`) and whether a hyphen is required (when breaking inside of a /// word). fn breakpoints<'a>(p: &'a Preparation<'a>) -> Breakpoints<'a> { + let mut linebreaks = if matches!(p.lang, Some(Lang::CHINESE | Lang::JAPANESE)) { + CJ_SEGMENTER.segment_str(p.bidi.text) + } else { + SEGMENTER.segment_str(p.bidi.text) + }; + // The iterator always yields a breakpoint at index 0, we want to ignore it + linebreaks.next(); Breakpoints { p, - linebreaks: LineBreakIterator::new(p.bidi.text), + linebreaks, syllables: None, offset: 0, suffix: 0, @@ -1020,7 +1075,7 @@ struct Breakpoints<'a> { /// The paragraph's items. p: &'a Preparation<'a>, /// The inner iterator over the unicode line break opportunities. - linebreaks: LineBreakIterator<'a>, + linebreaks: LineBreakIteratorUtf8<'a, 'a>, /// Iterator over syllables of the current word. syllables: Option>, /// The current text offset. @@ -1054,8 +1109,20 @@ impl Iterator for Breakpoints<'_> { return Some((self.offset, self.mandatory && !hyphen, hyphen)); } + let lb = LINEBREAK_DATA.as_borrowed(); + // Get the next "word". - (self.end, self.mandatory) = self.linebreaks.next()?; + self.end = self.linebreaks.next()?; + self.mandatory = + self.p.bidi.text[..self.end].chars().next_back().map_or(false, |c| { + matches!( + lb.get(c), + LineBreak::MandatoryBreak + | LineBreak::CarriageReturn + | LineBreak::LineFeed + | LineBreak::NextLine + ) || self.end == self.p.bidi.text.len() + }); // Hyphenate the next word. if self.p.hyphenate != Some(false) { -- cgit v1.2.3