summaryrefslogtreecommitdiff
path: root/crates/typst-library/src/text
diff options
context:
space:
mode:
Diffstat (limited to 'crates/typst-library/src/text')
-rw-r--r--crates/typst-library/src/text/linebreak.rs188
-rw-r--r--crates/typst-library/src/text/mod.rs2
2 files changed, 190 insertions, 0 deletions
diff --git a/crates/typst-library/src/text/linebreak.rs b/crates/typst-library/src/text/linebreak.rs
new file mode 100644
index 00000000..a026df5d
--- /dev/null
+++ b/crates/typst-library/src/text/linebreak.rs
@@ -0,0 +1,188 @@
+use icu_properties::{maps::CodePointMapData, LineBreak};
+use icu_provider::AsDeserializingBufferProvider;
+use icu_provider_adapters::fork::ForkByKeyProvider;
+use icu_provider_blob::BlobDataProvider;
+use icu_segmenter::LineSegmenter;
+use once_cell::sync::Lazy;
+use typst::doc::Lang;
+
+use super::TextElem;
+use crate::layout::Preparation;
+
+/// The general ICU4X data blob, embedded into the binary at compile time.
+///
+/// Every lazily initialized static in this module builds its
+/// `BlobDataProvider` from these bytes.
+///
+/// Generated by the following command:
+///
+/// ```sh
+/// icu4x-datagen --locales full \
+///     --format blob \
+///     --keys-for-bin target/debug/typst \
+///     --out crates/typst-library/assets/icudata.postcard \
+///     --overwrite
+/// ```
+///
+/// Install icu_datagen with `cargo install icu_datagen`.
+static ICU_DATA: &[u8] = include_bytes!("../../assets/icudata.postcard");
+
+/// An ICU4X data blob with line segmentation data for the Chinese and
+/// Japanese locales, embedded into the binary at compile time.
+///
+/// Generated by the following command:
+///
+/// ```sh
+/// icu4x-datagen --locales zh ja \
+///     --format blob \
+///     --keys segmenter/line@1 \
+///     --out crates/typst-library/assets/cj_linebreak_data.postcard \
+///     --overwrite
+/// ```
+///
+/// The `icu_datagen` used to generate this blob must be patched with
+/// <https://github.com/peng1999/icu4x/commit/b9beb6cbf633d61fc3d7983e5baf7f4449fbfae5>.
+static CJ_LINEBREAK_DATA: &[u8] =
+ include_bytes!("../../assets/cj_linebreak_data.postcard");
+
+/// The general line break segmenter.
+///
+/// Built lazily on first access from the embedded `ICU_DATA` blob. It is the
+/// segmenter for all text that does not have a dedicated one.
+///
+/// # Panics
+///
+/// Panics on first access if the embedded blob cannot be read or the
+/// segmenter cannot be constructed from it. As the blob is compiled into the
+/// binary, this signals a broken build rather than a recoverable condition.
+static SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| {
+    let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA)
+        .expect("embedded ICU data blob should be well-formed");
+    LineSegmenter::try_new_lstm_with_buffer_provider(&provider)
+        .expect("embedded ICU data should be sufficient for the line segmenter")
+});
+
+/// The line break segmenter for Chinese/Japanese text.
+///
+/// Built lazily on first access from two embedded blobs that are forked by
+/// key: `CJ_LINEBREAK_DATA` is handed to the fork first and `ICU_DATA`
+/// second, so the general blob serves the keys the CJ blob does not contain.
+///
+/// # Panics
+///
+/// Panics on first access if either embedded blob cannot be read or the
+/// segmenter cannot be constructed from them. As the blobs are compiled into
+/// the binary, this signals a broken build rather than a recoverable
+/// condition.
+static CJ_SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| {
+    let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA)
+        .expect("embedded ICU data blob should be well-formed");
+    let cj_blob = BlobDataProvider::try_new_from_static_blob(CJ_LINEBREAK_DATA)
+        .expect("embedded CJ line break data blob should be well-formed");
+    let cj_provider = ForkByKeyProvider::new(cj_blob, provider);
+    LineSegmenter::try_new_lstm_with_buffer_provider(&cj_provider)
+        .expect("embedded ICU data should be sufficient for the CJ line segmenter")
+});
+
+/// The Unicode line break properties for each code point.
+///
+/// Loaded lazily on first access from the embedded `ICU_DATA` blob.
+///
+/// # Panics
+///
+/// Panics on first access if the embedded blob cannot be read or the line
+/// break property map cannot be loaded from it. As the blob is compiled into
+/// the binary, this signals a broken build rather than a recoverable
+/// condition.
+static LINEBREAK_DATA: Lazy<CodePointMapData<LineBreak>> = Lazy::new(|| {
+    let provider = BlobDataProvider::try_new_from_static_blob(ICU_DATA)
+        .expect("embedded ICU data blob should be well-formed");
+    let deser_provider = provider.as_deserializing();
+    icu_properties::maps::load_line_break(&deser_provider)
+        .expect("embedded ICU data should contain the line break property map")
+});
+
+/// A line break opportunity.
+///
+/// This is the kind of opportunity that `breakpoints` reports to its callback
+/// together with the text offset at which the line may end.
+#[derive(Debug, Copy, Clone, Eq, PartialEq)]
+pub(crate) enum Breakpoint {
+ /// Just a normal opportunity (e.g. after a space). Every UAX #14
+ /// opportunity that is not mandatory is reported as this.
+ Normal,
+ /// A mandatory breakpoint: at the end of the text, or after a character
+ /// whose line break class is mandatory break, carriage return, line feed
+ /// or next line (e.g. after '\n').
+ Mandatory,
+ /// An opportunity for hyphenating, located at a syllable boundary inside
+ /// of a word.
+ Hyphen,
+}
+
+/// Calls `f` for all possible points in the text where lines can be broken.
+///
+/// For each opportunity, `f` receives the byte offset into `p.bidi.text` at
+/// which the line may end and the kind of the opportunity as a
+/// [`Breakpoint`]: mandatory (after a hard line break or at the end of the
+/// text), hyphenating (when breaking inside of a word), or normal.
+/// Hyphenation opportunities inside a segment are reported before the
+/// UAX #14 opportunity that ends the segment.
+///
+/// This is an internal instead of an external iterator because it makes the
+/// code much simpler and the consumers of this function don't need the
+/// composability and flexibility of external iteration anyway.
+pub(crate) fn breakpoints<'a>(
+    p: &'a Preparation<'a>,
+    mut f: impl FnMut(usize, Breakpoint),
+) {
+    let lb = LINEBREAK_DATA.as_borrowed();
+    let segmenter = match p.lang {
+        Some(Lang::CHINESE | Lang::JAPANESE) => &CJ_SEGMENTER,
+        _ => &SEGMENTER,
+    };
+
+    // Only an explicit `Some(false)` rules out hyphenation up front. With
+    // `None`, each candidate is checked individually through `hyphenate_at`.
+    let hyphenate = p.hyphenate != Some(false);
+    let mut last = 0;
+
+    // Walk over all UAX #14 linebreak opportunities.
+    for point in segmenter.segment_str(p.bidi.text) {
+        // Skip breakpoint if there is no char before it. icu4x generates one
+        // at offset 0, but we don't want it.
+        let Some(c) = p.bidi.text[..point].chars().next_back() else { continue };
+
+        // Find out whether the break is mandatory by checking the preceding
+        // character against rules LB4 and LB5, special-casing the end of
+        // text according to LB3.
+        // See also: https://docs.rs/icu_segmenter/latest/icu_segmenter/struct.LineSegmenter.html
+        let breakpoint = if point == p.bidi.text.len() {
+            Breakpoint::Mandatory
+        } else {
+            match lb.get(c) {
+                // Fix for: https://github.com/unicode-org/icu4x/issues/4146
+                //
+                // `last` is deliberately left untouched here, so the skipped
+                // segment becomes part of the next one.
+                LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ => continue,
+                LineBreak::MandatoryBreak
+                | LineBreak::CarriageReturn
+                | LineBreak::LineFeed
+                | LineBreak::NextLine => Breakpoint::Mandatory,
+                _ => Breakpoint::Normal,
+            }
+        };
+
+        // Hyphenate between the last and current breakpoint.
+        if hyphenate {
+            hyphenations(p, &lb, last, &p.bidi.text[last..point], &mut f);
+        }
+
+        // Call `f` for the UAX #14 break opportunity.
+        f(point, breakpoint);
+
+        last = point;
+    }
+}
+
+/// Calls `f` for all hyphenation opportunities in `segment`, a slice of the
+/// text that starts at byte offset `start`.
+fn hyphenations(
+    p: &Preparation,
+    lb: &icu_properties::maps::CodePointMapDataBorrowed<'_, LineBreak>,
+    start: usize,
+    segment: &str,
+    f: &mut impl FnMut(usize, Breakpoint),
+) {
+    // Extract a hyphenatable "word" by dropping trailing non-letters such as
+    // whitespace and punctuation.
+    let word = segment.trim_end_matches(|c: char| !c.is_alphabetic());
+    if word.is_empty() {
+        return;
+    }
+
+    // Determine the language to hyphenate this word in.
+    let Some(lang) = lang_at(p, start) else { return };
+
+    let end = start + word.len();
+    let mut offset = start;
+
+    for syllable in hypher::hyphenate(word, lang) {
+        // Don't hyphenate after the final syllable.
+        offset += syllable.len();
+        if offset == end {
+            continue;
+        }
+
+        // Filter out hyphenation opportunities where hyphenation was
+        // actually disabled.
+        if !hyphenate_at(p, offset) {
+            continue;
+        }
+
+        // Filter out forbidden hyphenation opportunities.
+        if matches!(
+            syllable.chars().next_back().map(|c| lb.get(c)),
+            Some(LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ)
+        ) {
+            continue;
+        }
+
+        // Call `f` for the word-internal hyphenation opportunity.
+        f(offset, Breakpoint::Hyphen);
+    }
+}
+
+/// Determines whether hyphenation is active at the given text offset.
+///
+/// A setting stored directly in `p.hyphenate` wins. Without one, the styles
+/// of the text item found at `offset` decide. If there is no text item at
+/// that offset, hyphenation counts as disabled.
+fn hyphenate_at(p: &Preparation, offset: usize) -> bool {
+    if let Some(uniform) = p.hyphenate {
+        return uniform;
+    }
+
+    p.find(offset)
+        .and_then(|item| item.text())
+        .map_or(false, |shaped| TextElem::hyphenate_in(shaped.styles))
+}
+
+/// Resolves the hyphenation language for the text at the given offset.
+///
+/// A language stored directly in `p.lang` wins. Without one, it is read from
+/// the styles of the text item found at `offset`. Returns `None` if there is
+/// no text item at that offset, if the language code does not convert into
+/// the byte array `hypher` expects, or if `hypher` does not know the
+/// language.
+fn lang_at(p: &Preparation, offset: usize) -> Option<hypher::Lang> {
+    let lang = match p.lang {
+        Some(uniform) => uniform,
+        None => {
+            let shaped = p.find(offset)?.text()?;
+            TextElem::lang_in(shaped.styles)
+        }
+    };
+
+    let iso = lang.as_str().as_bytes().try_into().ok()?;
+    hypher::Lang::from_iso(iso)
+}
diff --git a/crates/typst-library/src/text/mod.rs b/crates/typst-library/src/text/mod.rs
index 4d2f5c63..bfa77586 100644
--- a/crates/typst-library/src/text/mod.rs
+++ b/crates/typst-library/src/text/mod.rs
@@ -1,6 +1,7 @@
//! Text handling.
mod deco;
+mod linebreak;
mod misc;
mod quote;
mod quotes;
@@ -9,6 +10,7 @@ mod shaping;
mod shift;
pub use self::deco::*;
+pub(crate) use self::linebreak::*;
pub use self::misc::*;
pub use self::quote::*;
pub use self::quotes::*;