summaryrefslogtreecommitdiff
path: root/library/src
diff options
context:
space:
mode:
authorPeng Guanwen <pg999w@outlook.com>2023-05-30 23:53:10 +0800
committerGitHub <noreply@github.com>2023-05-30 17:53:10 +0200
commite2bf2327b53a6b963f7adaeb655fefc1d5fac745 (patch)
tree10c3ce9393b256670e987ea5d02d2f88dd602823 /library/src
parent11714609b8eb1e2939dfd86923e7b7203879228b (diff)
Use icu4x for linebreaking algorithm (#1355)
Diffstat (limited to 'library/src')
-rw-r--r--library/src/layout/par.rs75
1 files changed, 71 insertions, 4 deletions
diff --git a/library/src/layout/par.rs b/library/src/layout/par.rs
index e056b1c9..79b36813 100644
--- a/library/src/layout/par.rs
+++ b/library/src/layout/par.rs
@@ -1,7 +1,12 @@
+use icu_properties::{maps::CodePointMapData, LineBreak};
+use icu_provider::AsDeserializingBufferProvider;
+use icu_provider_adapters::fork::ForkByKeyProvider;
+use icu_provider_blob::BlobDataProvider;
+use icu_segmenter::{LineBreakIteratorUtf8, LineSegmenter};
+use once_cell::sync::Lazy;
use typst::eval::Tracer;
use unicode_bidi::{BidiInfo, Level as BidiLevel};
use unicode_script::{Script, UnicodeScript};
-use xi_unicode::LineBreakIterator;
use super::{BoxElem, HElem, Sizing, Spacing};
use crate::layout::AlignElem;
@@ -998,15 +1003,65 @@ fn linebreak_optimized<'a>(vt: &Vt, p: &'a Preparation<'a>, width: Abs) -> Vec<L
lines
}
+/// Generated by the following command:
+///
+/// ```sh
+/// icu4x-datagen --locales full --keys-for-bin target/debug/typst \
+/// --format blob --out assets/data/icudata.postcard --overwrite
+/// ```
+///
+/// Install icu4x-datagen with `cargo install icu4x-datagen`.
+static ICU_DATA: &[u8] = include_bytes!("../../../assets/data/icudata.postcard");
+
+/// Generated by the following command:
+///
+/// ```sh
+/// icu4x-datagen --locales zh ja --keys segmenter/line@1 --format blob \
+/// --out assets/data/cj_linebreak_data.postcard --overwrite
+/// ```
+///
+/// The icu4x-datagen used should be patched with the following commit:
+/// https://github.com/peng1999/icu4x/commit/b9beb6cbf633d61fc3d7983e5baf7f4449fbfae5
+static CJ_LINEBREAK_DATA: &[u8] =
+ include_bytes!("../../../assets/data/cj_linebreak_data.postcard");
+
+/// The general line break segmenter.
+static SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| {
+ let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
+ LineSegmenter::try_new_lstm_with_buffer_provider(&provider).unwrap()
+});
+
+/// The line break segmenter for Chinese/Japanese text.
+static CJ_SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| {
+ let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
+ let cj_blob = BlobDataProvider::try_new_from_static_blob(CJ_LINEBREAK_DATA).unwrap();
+ let cj_provider = ForkByKeyProvider::new(cj_blob, provider);
+ LineSegmenter::try_new_lstm_with_buffer_provider(&cj_provider).unwrap()
+});
+
+/// The Unicode line break properties for each code point.
+static LINEBREAK_DATA: Lazy<CodePointMapData<LineBreak>> = Lazy::new(|| {
+ let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
+ let deser_provider = provider.as_deserializing();
+ icu_properties::maps::load_line_break(&deser_provider).unwrap()
+});
+
/// Determine all possible points in the text where lines can be broken.
///
/// Returns for each breakpoint the text index, whether the break is mandatory
/// (after `\n`) and whether a hyphen is required (when breaking inside of a
/// word).
fn breakpoints<'a>(p: &'a Preparation<'a>) -> Breakpoints<'a> {
+ let mut linebreaks = if matches!(p.lang, Some(Lang::CHINESE | Lang::JAPANESE)) {
+ CJ_SEGMENTER.segment_str(p.bidi.text)
+ } else {
+ SEGMENTER.segment_str(p.bidi.text)
+ };
+ // The iterator always yields a breakpoint at index 0; we want to ignore it.
+ linebreaks.next();
Breakpoints {
p,
- linebreaks: LineBreakIterator::new(p.bidi.text),
+ linebreaks,
syllables: None,
offset: 0,
suffix: 0,
@@ -1020,7 +1075,7 @@ struct Breakpoints<'a> {
/// The paragraph's items.
p: &'a Preparation<'a>,
/// The inner iterator over the unicode line break opportunities.
- linebreaks: LineBreakIterator<'a>,
+ linebreaks: LineBreakIteratorUtf8<'a, 'a>,
/// Iterator over syllables of the current word.
syllables: Option<hypher::Syllables<'a>>,
/// The current text offset.
@@ -1054,8 +1109,20 @@ impl Iterator for Breakpoints<'_> {
return Some((self.offset, self.mandatory && !hyphen, hyphen));
}
+ let lb = LINEBREAK_DATA.as_borrowed();
+
// Get the next "word".
- (self.end, self.mandatory) = self.linebreaks.next()?;
+ self.end = self.linebreaks.next()?;
+ self.mandatory =
+ self.p.bidi.text[..self.end].chars().next_back().map_or(false, |c| {
+ matches!(
+ lb.get(c),
+ LineBreak::MandatoryBreak
+ | LineBreak::CarriageReturn
+ | LineBreak::LineFeed
+ | LineBreak::NextLine
+ ) || self.end == self.p.bidi.text.len()
+ });
// Hyphenate the next word.
if self.p.hyphenate != Some(false) {