diff options
| author | Peng Guanwen <pg999w@outlook.com> | 2023-05-30 23:53:10 +0800 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-05-30 17:53:10 +0200 |
| commit | e2bf2327b53a6b963f7adaeb655fefc1d5fac745 (patch) | |
| tree | 10c3ce9393b256670e987ea5d02d2f88dd602823 /library/src | |
| parent | 11714609b8eb1e2939dfd86923e7b7203879228b (diff) | |
Use icu4x for linebreaking algorithm (#1355)
Diffstat (limited to 'library/src')
| -rw-r--r-- | library/src/layout/par.rs | 75 |
1 files changed, 71 insertions, 4 deletions
diff --git a/library/src/layout/par.rs b/library/src/layout/par.rs index e056b1c9..79b36813 100644 --- a/library/src/layout/par.rs +++ b/library/src/layout/par.rs @@ -1,7 +1,12 @@ +use icu_properties::{maps::CodePointMapData, LineBreak}; +use icu_provider::AsDeserializingBufferProvider; +use icu_provider_adapters::fork::ForkByKeyProvider; +use icu_provider_blob::BlobDataProvider; +use icu_segmenter::{LineBreakIteratorUtf8, LineSegmenter}; +use once_cell::sync::Lazy; use typst::eval::Tracer; use unicode_bidi::{BidiInfo, Level as BidiLevel}; use unicode_script::{Script, UnicodeScript}; -use xi_unicode::LineBreakIterator; use super::{BoxElem, HElem, Sizing, Spacing}; use crate::layout::AlignElem; @@ -998,15 +1003,65 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L lines } +/// Generated by the following command: +/// +/// ```sh +/// icu4x-datagen --locales full --keys-for-bin target/debug/typst \ +/// --format blob --out assets/data/icudata.postcard --overwrite +/// ``` +/// +/// Install icu4x-datagen with `cargo install icu4x-datagen`. +static ICU_DATA: &[u8] = include_bytes!("../../../assets/data/icudata.postcard"); + +/// Generated by the following command: +/// +/// ```sh +/// icu4x-datagen --locales zh ja --keys segmenter/line@1 --format blob \ +/// --out assets/data/cj_linebreak_data.postcard --overwrite +/// ``` +/// +/// The used icu4x-datagen should be patched by +/// https://github.com/peng1999/icu4x/commit/b9beb6cbf633d61fc3d7983e5baf7f4449fbfae5 +static CJ_LINEBREAK_DATA: &[u8] = + include_bytes!("../../../assets/data/cj_linebreak_data.postcard"); + +/// The general line break segmenter. +static SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| { + let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap(); + LineSegmenter::try_new_lstm_with_buffer_provider(&provider).unwrap() +}); + +/// The line break segmenter for Chinese/Japanese text. 
+static CJ_SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| { + let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap(); + let cj_blob = BlobDataProvider::try_new_from_static_blob(CJ_LINEBREAK_DATA).unwrap(); + let cj_provider = ForkByKeyProvider::new(cj_blob, provider); + LineSegmenter::try_new_lstm_with_buffer_provider(&cj_provider).unwrap() +}); + +/// The Unicode line break properties for each code point. +static LINEBREAK_DATA: Lazy<CodePointMapData<LineBreak>> = Lazy::new(|| { + let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap(); + let deser_provider = provider.as_deserializing(); + icu_properties::maps::load_line_break(&deser_provider).unwrap() +}); + /// Determine all possible points in the text where lines can broken. /// /// Returns for each breakpoint the text index, whether the break is mandatory /// (after `\n`) and whether a hyphen is required (when breaking inside of a /// word). fn breakpoints<'a>(p: &'a Preparation<'a>) -> Breakpoints<'a> { + let mut linebreaks = if matches!(p.lang, Some(Lang::CHINESE | Lang::JAPANESE)) { + CJ_SEGMENTER.segment_str(p.bidi.text) + } else { + SEGMENTER.segment_str(p.bidi.text) + }; + // The iterator always yields a breakpoint at index 0, we want to ignore it + linebreaks.next(); Breakpoints { p, - linebreaks: LineBreakIterator::new(p.bidi.text), + linebreaks, syllables: None, offset: 0, suffix: 0, @@ -1020,7 +1075,7 @@ struct Breakpoints<'a> { /// The paragraph's items. p: &'a Preparation<'a>, /// The inner iterator over the unicode line break opportunities. - linebreaks: LineBreakIterator<'a>, + linebreaks: LineBreakIteratorUtf8<'a, 'a>, /// Iterator over syllables of the current word. syllables: Option<hypher::Syllables<'a>>, /// The current text offset. @@ -1054,8 +1109,20 @@ impl Iterator for Breakpoints<'_> { return Some((self.offset, self.mandatory && !hyphen, hyphen)); } + let lb = LINEBREAK_DATA.as_borrowed(); + // Get the next "word". 
- (self.end, self.mandatory) = self.linebreaks.next()?; + self.end = self.linebreaks.next()?; + self.mandatory = + self.p.bidi.text[..self.end].chars().next_back().map_or(false, |c| { + matches!( + lb.get(c), + LineBreak::MandatoryBreak + | LineBreak::CarriageReturn + | LineBreak::LineFeed + | LineBreak::NextLine + ) || self.end == self.p.bidi.text.len() + }); // Hyphenate the next word. if self.p.hyphenate != Some(false) { |
