1 files changed, 1175 insertions, 0 deletions
diff --git a/crates/typst-layout/src/inline/shaping.rs b/crates/typst-layout/src/inline/shaping.rs
new file mode 100644
index 00000000..bd803b52
--- /dev/null
+++ b/crates/typst-layout/src/inline/shaping.rs
@@ -0,0 +1,1175 @@
+use std::borrow::Cow;
+use std::fmt::{self, Debug, Formatter};
+use std::str::FromStr;
+use std::sync::Arc;
+
+use az::SaturatingAs;
+use ecow::EcoString;
+use rustybuzz::{BufferFlags, ShapePlan, UnicodeBuffer};
+use ttf_parser::Tag;
+use typst_library::engine::Engine;
+use typst_library::foundations::{Smart, StyleChain};
+use typst_library::layout::{Abs, Dir, Em, Frame, FrameItem, Point, Size};
+use typst_library::text::{
+    families, features, is_default_ignorable, variant, Font, FontVariant, Glyph, Lang,
+    Region, TextEdgeBounds, TextElem, TextItem,
+};
+use typst_library::World;
+use typst_utils::SliceExt;
+use unicode_bidi::{BidiInfo, Level as BidiLevel};
+use unicode_script::{Script, UnicodeScript};
+
+use super::{decorate, Item, Range, SpanMapper};
+
+/// The result of shaping text.
+///
+/// This type contains owned or borrowed shaped text runs, which can be
+/// measured, used to reshape substrings more quickly and converted into a
+/// frame.
+#[derive(Clone)]
+pub struct ShapedText<'a> {
+    /// The start of the text in the full paragraph.
+    pub base: usize,
+    /// The text that was shaped.
+    pub text: &'a str,
+    /// The text direction.
+    pub dir: Dir,
+    /// The text language.
+    pub lang: Lang,
+    /// The text region.
+    pub region: Option<Region>,
+    /// The text's style properties.
+    pub styles: StyleChain<'a>,
+    /// The font variant.
+    pub variant: FontVariant,
+    /// The font size.
+    pub size: Abs,
+    /// The width of the text's bounding box.
+    pub width: Abs,
+    /// The shaped glyphs.
+    pub glyphs: Cow<'a, [ShapedGlyph]>,
+}
+
+/// A single glyph resulting from shaping.
+#[derive(Debug, Clone)]
+pub struct ShapedGlyph {
+    /// The font the glyph is contained in.
+    pub font: Font,
+    /// The glyph's index in the font.
+    pub glyph_id: u16,
+    /// The advance width of the glyph.
+    pub x_advance: Em,
+    /// The horizontal offset of the glyph.
+    pub x_offset: Em,
+    /// The vertical offset of the glyph.
+    pub y_offset: Em,
+    /// The adjustability of the glyph.
+    pub adjustability: Adjustability,
+    /// The byte range of this glyph's cluster in the full paragraph. A cluster
+    /// is a sequence of one or multiple glyphs that cannot be separated and
+    /// must always be treated as a union.
+    ///
+    /// The range values of the glyphs in a [`ShapedText`] should not overlap
+    /// with each other, and they should be monotonically increasing (for
+    /// left-to-right or top-to-bottom text) or monotonically decreasing (for
+    /// right-to-left or bottom-to-top text).
+    pub range: Range,
+    /// Whether splitting the shaping result before this glyph would yield the
+    /// same results as shaping the parts to both sides of `text_index`
+    /// separately.
+    pub safe_to_break: bool,
+    /// The first char in this glyph's cluster.
+    pub c: char,
+    /// Whether this glyph is justifiable for CJK scripts.
+    pub is_justifiable: bool,
+    /// The script of the glyph.
+    pub script: Script,
+}
+
+#[derive(Debug, Clone, Default)]
+pub struct Adjustability {
+    /// The left and right stretchability
+    pub stretchability: (Em, Em),
+    /// The left and right shrinkability
+    pub shrinkability: (Em, Em),
+}
+
+impl ShapedGlyph {
+    /// Whether the glyph is a space.
+    pub fn is_space(&self) -> bool {
+        is_space(self.c)
+    }
+
+    /// Whether the glyph is justifiable.
+    pub fn is_justifiable(&self) -> bool {
+        // GB style is not relevant here.
+        self.is_justifiable
+    }
+
+    /// Whether the glyph is part of Chinese or Japanese script (i.e. CJ, not CJK).
+    pub fn is_cj_script(&self) -> bool {
+        is_cj_script(self.c, self.script)
+    }
+
+    pub fn is_cjk_punctuation(&self) -> bool {
+        self.is_cjk_left_aligned_punctuation(CjkPunctStyle::Gb)
+            || self.is_cjk_right_aligned_punctuation()
+            || self.is_cjk_center_aligned_punctuation(CjkPunctStyle::Gb)
+    }
+
+    /// See <https://www.w3.org/TR/clreq/#punctuation_width_adjustment>
+    pub fn is_cjk_left_aligned_punctuation(&self, style: CjkPunctStyle) -> bool {
+        is_cjk_left_aligned_punctuation(
+            self.c,
+            self.x_advance,
+            self.stretchability(),
+            style,
+        )
+    }
+
+    /// See <https://www.w3.org/TR/clreq/#punctuation_width_adjustment>
+    pub fn is_cjk_right_aligned_punctuation(&self) -> bool {
+        is_cjk_right_aligned_punctuation(self.c, self.x_advance, self.stretchability())
+    }
+
+    /// See <https://www.w3.org/TR/clreq/#punctuation_width_adjustment>
+    pub fn is_cjk_center_aligned_punctuation(&self, style: CjkPunctStyle) -> bool {
+        is_cjk_center_aligned_punctuation(self.c, style)
+    }
+
+    /// Whether the glyph is a western letter or number.
+    pub fn is_letter_or_number(&self) -> bool {
+        matches!(self.c.script(), Script::Latin | Script::Greek | Script::Cyrillic)
+            || matches!(self.c, '#' | '$' | '%' | '&')
+            || self.c.is_ascii_digit()
+    }
+
+    pub fn base_adjustability(&self, style: CjkPunctStyle) -> Adjustability {
+        let width = self.x_advance;
+        if self.is_space() {
+            Adjustability {
+                // The number for spaces is from Knuth-Plass' paper
+                stretchability: (Em::zero(), width / 2.0),
+                shrinkability: (Em::zero(), width / 3.0),
+            }
+        } else if self.is_cjk_left_aligned_punctuation(style) {
+            Adjustability {
+                stretchability: (Em::zero(), Em::zero()),
+                shrinkability: (Em::zero(), width / 2.0),
+            }
+        } else if self.is_cjk_right_aligned_punctuation() {
+            Adjustability {
+                stretchability: (Em::zero(), Em::zero()),
+                shrinkability: (width / 2.0, Em::zero()),
+            }
+        } else if self.is_cjk_center_aligned_punctuation(style) {
+            Adjustability {
+                stretchability: (Em::zero(), Em::zero()),
+                shrinkability: (width / 4.0, width / 4.0),
+            }
+        } else {
+            Adjustability::default()
+        }
+    }
+
+    /// The stretchability of the character.
+    pub fn stretchability(&self) -> (Em, Em) {
+        self.adjustability.stretchability
+    }
+
+    /// The shrinkability of the character.
+    pub fn shrinkability(&self) -> (Em, Em) {
+        self.adjustability.shrinkability
+    }
+
+    /// Shrink the width of glyph on the left side.
+    pub fn shrink_left(&mut self, amount: Em) {
+        self.x_offset -= amount;
+        self.x_advance -= amount;
+        self.adjustability.shrinkability.0 -= amount;
+    }
+
+    /// Shrink the width of glyph on the right side.
+    pub fn shrink_right(&mut self, amount: Em) {
+        self.x_advance -= amount;
+        self.adjustability.shrinkability.1 -= amount;
+    }
+}
+
+/// A side you can go toward.
+enum Side {
+    /// To the left-hand side.
+    Left,
+    /// To the right-hand side.
+    Right,
+}
+
+impl<'a> ShapedText<'a> {
+    /// Build the shaped text's frame.
+    ///
+    /// The `justification` defines how much extra advance width each
+    /// [justifiable glyph](ShapedGlyph::is_justifiable) will get.
+    pub fn build(
+        &self,
+        engine: &Engine,
+        spans: &SpanMapper,
+        justification_ratio: f64,
+        extra_justification: Abs,
+    ) -> Frame {
+        let (top, bottom) = self.measure(engine);
+        let size = Size::new(self.width, top + bottom);
+
+        let mut offset = Abs::zero();
+        let mut frame = Frame::soft(size);
+        frame.set_baseline(top);
+
+        let shift = TextElem::baseline_in(self.styles);
+        let decos = TextElem::deco_in(self.styles);
+        let fill = TextElem::fill_in(self.styles);
+        let stroke = TextElem::stroke_in(self.styles);
+        let span_offset = TextElem::span_offset_in(self.styles);
+
+        for ((font, y_offset), group) in
+            self.glyphs.as_ref().group_by_key(|g| (g.font.clone(), g.y_offset))
+        {
+            let mut range = group[0].range.clone();
+            for glyph in group {
+                range.start = range.start.min(glyph.range.start);
+                range.end = range.end.max(glyph.range.end);
+            }
+
+            let pos = Point::new(offset, top + shift - y_offset.at(self.size));
+            let glyphs: Vec<Glyph> = group
+                .iter()
+                .map(|shaped: &ShapedGlyph| {
+                    let adjustability_left = if justification_ratio < 0.0 {
+                        shaped.shrinkability().0
+                    } else {
+                        shaped.stretchability().0
+                    };
+                    let adjustability_right = if justification_ratio < 0.0 {
+                        shaped.shrinkability().1
+                    } else {
+                        shaped.stretchability().1
+                    };
+
+                    let justification_left = adjustability_left * justification_ratio;
+                    let mut justification_right =
+                        adjustability_right * justification_ratio;
+                    if shaped.is_justifiable() {
+                        justification_right +=
+                            Em::from_length(extra_justification, self.size)
+                    }
+
+                    frame.size_mut().x += justification_left.at(self.size)
+                        + justification_right.at(self.size);
+
+                    // We may not be able to reach the offset completely if
+                    // it exceeds u16, but better to have a roughly correct
+                    // span offset than nothing.
+                    let mut span = spans.span_at(shaped.range.start);
+                    span.1 = span.1.saturating_add(span_offset.saturating_as());
+
+                    // |<---- a Glyph ---->|
+                    //  -->|ShapedGlyph|<--
+                    // +---+-----------+---+
+                    // |   |  *********|   |
+                    // |   |  *        |   |
+                    // |   |  *    ****|   |
+                    // |   |  *       *|   |
+                    // |   |  *********|   |
+                    // +---+--+--------+---+
+                    //   A   B     C     D
+                    // Note A, B, D could be positive, zero, or negative.
+                    // A: justification_left
+                    // B: ShapedGlyph's x_offset
+                    //    (though a small part of the glyph may go inside B)
+                    // B+C: ShapedGlyph's x_advance
+                    // D: justification_right
+                    // A+B: Glyph's x_offset
+                    // A+B+C+D: Glyph's x_advance
+                    Glyph {
+                        id: shaped.glyph_id,
+                        x_advance: shaped.x_advance
+                            + justification_left
+                            + justification_right,
+                        x_offset: shaped.x_offset + justification_left,
+                        range: (shaped.range.start - range.start).saturating_as()
+                            ..(shaped.range.end - range.start).saturating_as(),
+                        span,
+                    }
+                })
+                .collect();
+
+            let item = TextItem {
+                font,
+                size: self.size,
+                lang: self.lang,
+                region: self.region,
+                fill: fill.clone(),
+                stroke: stroke.clone().map(|s| s.unwrap_or_default()),
+                text: self.text[range.start - self.base..range.end - self.base].into(),
+                glyphs,
+            };
+
+            let width = item.width();
+            if decos.is_empty() {
+                frame.push(pos, FrameItem::Text(item));
+            } else {
+                // Apply line decorations.
+                frame.push(pos, FrameItem::Text(item.clone()));
+                for deco in &decos {
+                    decorate(&mut frame, deco, &item, width, shift, pos);
+                }
+            }
+
+            offset += width;
+        }
+
+        frame
+    }
+
+    /// Measure the top and bottom extent of this text.
+    pub fn measure(&self, engine: &Engine) -> (Abs, Abs) {
+        let mut top = Abs::zero();
+        let mut bottom = Abs::zero();
+
+        let top_edge = TextElem::top_edge_in(self.styles);
+        let bottom_edge = TextElem::bottom_edge_in(self.styles);
+
+        // Expand top and bottom by reading the font's vertical metrics.
+        let mut expand = |font: &Font, bounds: TextEdgeBounds| {
+            let (t, b) = font.edges(top_edge, bottom_edge, self.size, bounds);
+            top.set_max(t);
+            bottom.set_max(b);
+        };
+
+        if self.glyphs.is_empty() {
+            // When there are no glyphs, we just use the vertical metrics of the
+            // first available font.
+            let world = engine.world;
+            for family in families(self.styles) {
+                if let Some(font) = world
+                    .book()
+                    .select(family, self.variant)
+                    .and_then(|id| world.font(id))
+                {
+                    expand(&font, TextEdgeBounds::Zero);
+                    break;
+                }
+            }
+        } else {
+            for g in self.glyphs.iter() {
+                expand(&g.font, TextEdgeBounds::Glyph(g.glyph_id));
+            }
+        }
+
+        (top, bottom)
+    }
+
+    /// How many glyphs are in the text where we can insert additional
+    /// space when encountering underfull lines.
+    pub fn justifiables(&self) -> usize {
+        self.glyphs.iter().filter(|g| g.is_justifiable()).count()
+    }
+
+    /// Whether the last glyph is a CJK character which should not be justified
+    /// on line end.
+    pub fn cjk_justifiable_at_last(&self) -> bool {
+        self.glyphs
+            .last()
+            .map(|g| g.is_cj_script() || g.is_cjk_punctuation())
+            .unwrap_or(false)
+    }
+
+    /// The stretchability of the text.
+    pub fn stretchability(&self) -> Abs {
+        self.glyphs
+            .iter()
+            .map(|g| g.stretchability().0 + g.stretchability().1)
+            .sum::<Em>()
+            .at(self.size)
+    }
+
+    /// The shrinkability of the text
+    pub fn shrinkability(&self) -> Abs {
+        self.glyphs
+            .iter()
+            .map(|g| g.shrinkability().0 + g.shrinkability().1)
+            .sum::<Em>()
+            .at(self.size)
+    }
+
+    /// Reshape a range of the shaped text, reusing information from this
+    /// shaping process if possible.
+    ///
+    /// The text `range` is relative to the whole paragraph.
+    pub fn reshape(&'a self, engine: &Engine, text_range: Range) -> ShapedText<'a> {
+        let text = &self.text[text_range.start - self.base..text_range.end - self.base];
+        if let Some(glyphs) = self.slice_safe_to_break(text_range.clone()) {
+            #[cfg(debug_assertions)]
+            assert_all_glyphs_in_range(glyphs, text, text_range.clone());
+            Self {
+                base: text_range.start,
+                text,
+                dir: self.dir,
+                lang: self.lang,
+                region: self.region,
+                styles: self.styles,
+                size: self.size,
+                variant: self.variant,
+                width: glyphs.iter().map(|g| g.x_advance).sum::<Em>().at(self.size),
+                glyphs: Cow::Borrowed(glyphs),
+            }
+        } else {
+            shape(
+                engine,
+                text_range.start,
+                text,
+                self.styles,
+                self.dir,
+                self.lang,
+                self.region,
+            )
+        }
+    }
+
+    /// Derive an empty text run with the same properties as this one.
+    pub fn empty(&self) -> Self {
+        Self {
+            text: "",
+            width: Abs::zero(),
+            glyphs: Cow::Borrowed(&[]),
+            ..*self
+        }
+    }
+
+    /// Push a hyphen to end of the text.
+    pub fn push_hyphen(&mut self, engine: &Engine, fallback: bool) {
+        self.insert_hyphen(engine, fallback, Side::Right)
+    }
+
+    /// Prepend a hyphen to start of the text.
+    pub fn prepend_hyphen(&mut self, engine: &Engine, fallback: bool) {
+        self.insert_hyphen(engine, fallback, Side::Left)
+    }
+
+    fn insert_hyphen(&mut self, engine: &Engine, fallback: bool, side: Side) {
+        let world = engine.world;
+        let book = world.book();
+        let fallback_func = if fallback {
+            Some(|| book.select_fallback(None, self.variant, "-"))
+        } else {
+            None
+        };
+        let mut chain = families(self.styles)
+            .map(|family| book.select(family, self.variant))
+            .chain(fallback_func.iter().map(|f| f()))
+            .flatten();
+
+        chain.find_map(|id| {
+            let font = world.font(id)?;
+            let ttf = font.ttf();
+            let glyph_id = ttf.glyph_index('-')?;
+            let x_advance = font.to_em(ttf.glyph_hor_advance(glyph_id)?);
+            let range = match side {
+                Side::Left => self.glyphs.first().map(|g| g.range.start..g.range.start),
+                Side::Right => self.glyphs.last().map(|g| g.range.end..g.range.end),
+            }
+            // In the unlikely chance that we hyphenate after an empty line,
+            // ensure that the glyph range still falls after self.base so
+            // that subtracting either of the endpoints by self.base doesn't
+            // underflow. See <https://github.com/typst/typst/issues/2283>.
+            .unwrap_or_else(|| self.base..self.base);
+            self.width += x_advance.at(self.size);
+            let glyph = ShapedGlyph {
+                font,
+                glyph_id: glyph_id.0,
+                x_advance,
+                x_offset: Em::zero(),
+                y_offset: Em::zero(),
+                adjustability: Adjustability::default(),
+                range,
+                safe_to_break: true,
+                c: '-',
+                is_justifiable: false,
+                script: Script::Common,
+            };
+            match side {
+                Side::Left => self.glyphs.to_mut().insert(0, glyph),
+                Side::Right => self.glyphs.to_mut().push(glyph),
+            }
+            Some(())
+        });
+    }
+
+    /// Find the subslice of glyphs that represent the given text range if both
+    /// sides are safe to break.
+    fn slice_safe_to_break(&self, text_range: Range) -> Option<&[ShapedGlyph]> {
+        let Range { mut start, mut end } = text_range;
+        if !self.dir.is_positive() {
+            std::mem::swap(&mut start, &mut end);
+        }
+
+        let left = self.find_safe_to_break(start)?;
+        let right = self.find_safe_to_break(end)?;
+        Some(&self.glyphs[left..right])
+    }
+
+    /// Find the glyph offset matching the text index that is most towards the
+    /// start of the text and safe-to-break.
+    fn find_safe_to_break(&self, text_index: usize) -> Option<usize> {
+        let ltr = self.dir.is_positive();
+
+        // Handle edge cases.
+        let len = self.glyphs.len();
+        if text_index == self.base {
+            return Some(if ltr { 0 } else { len });
+        } else if text_index == self.base + self.text.len() {
+            return Some(if ltr { len } else { 0 });
+        }
+
+        // Find any glyph with the text index.
+        let found = self.glyphs.binary_search_by(|g: &ShapedGlyph| {
+            let ordering = g.range.start.cmp(&text_index);
+            if ltr {
+                ordering
+            } else {
+                ordering.reverse()
+            }
+        });
+
+        let mut idx = match found {
+            Ok(idx) => idx,
+            Err(idx) => {
+                // Handle the special case where we break before a '\n'
+                //
+                // For example: (assume `a` is a CJK character with three bytes)
+                // text:  " a     \n b  "
+                // index:   0 1 2 3  4 5
+                // text_index:    ^
+                // glyphs:  0     .  1
+                //
+                // We will get found = Err(1), because '\n' does not have a
+                // glyph. But it's safe to break here. Thus the following
+                // condition:
+                // - glyphs[0].end == text_index == 3
+                // - text[3] == '\n'
+                return (idx > 0
+                    && self.glyphs[idx - 1].range.end == text_index
+                    && self.text[text_index - self.base..].starts_with('\n'))
+                .then_some(idx);
+            }
+        };
+
+        // Search for the start-most glyph with the text index. This means
+        // we take empty range glyphs at the start and leave those at the end
+        // for the next line.
+        let dec = if ltr { usize::checked_sub } else { usize::checked_add };
+        while let Some(next) = dec(idx, 1) {
+            if self.glyphs.get(next).map_or(true, |g| g.range.start != text_index) {
+                break;
+            }
+            idx = next;
+        }
+
+        // RTL needs offset one because the left side of the range should be
+        // exclusive and the right side inclusive, contrary to the normal
+        // behaviour of ranges.
+        self.glyphs[idx].safe_to_break.then_some(idx + usize::from(!ltr))
+    }
+}
+
+impl Debug for ShapedText<'_> {
+    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+        self.text.fmt(f)
+    }
+}
+
+/// Group a range of text by BiDi level and script, shape the runs and generate
+/// items for them.
+pub fn shape_range<'a>(
+    items: &mut Vec<(Range, Item<'a>)>,
+    engine: &Engine,
+    text: &'a str,
+    bidi: &BidiInfo<'a>,
+    range: Range,
+    styles: StyleChain<'a>,
+) {
+    let script = TextElem::script_in(styles);
+    let lang = TextElem::lang_in(styles);
+    let region = TextElem::region_in(styles);
+    let mut process = |range: Range, level: BidiLevel| {
+        let dir = if level.is_ltr() { Dir::LTR } else { Dir::RTL };
+        let shaped =
+            shape(engine, range.start, &text[range.clone()], styles, dir, lang, region);
+        items.push((range, Item::Text(shaped)));
+    };
+
+    let mut prev_level = BidiLevel::ltr();
+    let mut prev_script = Script::Unknown;
+    let mut cursor = range.start;
+
+    // Group by embedding level and script.  If the text's script is explicitly
+    // set (rather than inferred from the glyphs), we keep the script at an
+    // unchanging `Script::Unknown` so that only level changes cause breaks.
+    for i in range.clone() {
+        if !text.is_char_boundary(i) {
+            continue;
+        }
+
+        let level = bidi.levels[i];
+        let curr_script = match script {
+            Smart::Auto => {
+                text[i..].chars().next().map_or(Script::Unknown, |c| c.script())
+            }
+            Smart::Custom(_) => Script::Unknown,
+        };
+
+        if level != prev_level || !is_compatible(curr_script, prev_script) {
+            if cursor < i {
+                process(cursor..i, prev_level);
+            }
+            cursor = i;
+            prev_level = level;
+            prev_script = curr_script;
+        } else if is_generic_script(prev_script) {
+            prev_script = curr_script;
+        }
+    }
+
+    process(cursor..range.end, prev_level);
+}
+
+/// Whether this is not a specific script.
+fn is_generic_script(script: Script) -> bool {
+    matches!(script, Script::Unknown | Script::Common | Script::Inherited)
+}
+
+/// Whether these script can be part of the same shape run.
+fn is_compatible(a: Script, b: Script) -> bool {
+    is_generic_script(a) || is_generic_script(b) || a == b
+}
+
+/// Shape text into [`ShapedText`].
+#[allow(clippy::too_many_arguments)]
+fn shape<'a>(
+    engine: &Engine,
+    base: usize,
+    text: &'a str,
+    styles: StyleChain<'a>,
+    dir: Dir,
+    lang: Lang,
+    region: Option<Region>,
+) -> ShapedText<'a> {
+    let size = TextElem::size_in(styles);
+    let mut ctx = ShapingContext {
+        engine,
+        size,
+        glyphs: vec![],
+        used: vec![],
+        styles,
+        variant: variant(styles),
+        features: features(styles),
+        fallback: TextElem::fallback_in(styles),
+        dir,
+    };
+
+    if !text.is_empty() {
+        shape_segment(&mut ctx, base, text, families(styles));
+    }
+
+    track_and_space(&mut ctx);
+    calculate_adjustability(&mut ctx, lang, region);
+
+    #[cfg(debug_assertions)]
+    assert_all_glyphs_in_range(&ctx.glyphs, text, base..(base + text.len()));
+    #[cfg(debug_assertions)]
+    assert_glyph_ranges_in_order(&ctx.glyphs, dir);
+
+    ShapedText {
+        base,
+        text,
+        dir,
+        lang,
+        region,
+        styles,
+        variant: ctx.variant,
+        size,
+        width: ctx.glyphs.iter().map(|g| g.x_advance).sum::<Em>().at(size),
+        glyphs: Cow::Owned(ctx.glyphs),
+    }
+}
+
+/// Holds shaping results and metadata common to all shaped segments.
+struct ShapingContext<'a, 'v> {
+    engine: &'a Engine<'v>,
+    glyphs: Vec<ShapedGlyph>,
+    used: Vec<Font>,
+    styles: StyleChain<'a>,
+    size: Abs,
+    variant: FontVariant,
+    features: Vec<rustybuzz::Feature>,
+    fallback: bool,
+    dir: Dir,
+}
+
+/// Shape text with font fallback using the `families` iterator.
+fn shape_segment<'a>(
+    ctx: &mut ShapingContext,
+    base: usize,
+    text: &str,
+    mut families: impl Iterator<Item = &'a str> + Clone,
+) {
+    // Don't try shaping newlines, tabs, or default ignorables.
+    if text
+        .chars()
+        .all(|c| c == '\n' || c == '\t' || is_default_ignorable(c))
+    {
+        return;
+    }
+
+    // Find the next available family.
+    let world = ctx.engine.world;
+    let book = world.book();
+    let mut selection = families.find_map(|family| {
+        book.select(family, ctx.variant)
+            .and_then(|id| world.font(id))
+            .filter(|font| !ctx.used.contains(font))
+    });
+
+    // Do font fallback if the families are exhausted and fallback is enabled.
+    if selection.is_none() && ctx.fallback {
+        let first = ctx.used.first().map(Font::info);
+        selection = book
+            .select_fallback(first, ctx.variant, text)
+            .and_then(|id| world.font(id))
+            .filter(|font| !ctx.used.contains(font));
+    }
+
+    // Extract the font id or shape notdef glyphs if we couldn't find any font.
+    let Some(font) = selection else {
+        if let Some(font) = ctx.used.first().cloned() {
+            shape_tofus(ctx, base, text, font);
+        }
+        return;
+    };
+
+    ctx.used.push(font.clone());
+
+    // Fill the buffer with our text.
+    let mut buffer = UnicodeBuffer::new();
+    buffer.push_str(text);
+    buffer.set_language(language(ctx.styles));
+    if let Some(script) = TextElem::script_in(ctx.styles).custom().and_then(|script| {
+        rustybuzz::Script::from_iso15924_tag(Tag::from_bytes(script.as_bytes()))
+    }) {
+        buffer.set_script(script)
+    }
+    buffer.set_direction(match ctx.dir {
+        Dir::LTR => rustybuzz::Direction::LeftToRight,
+        Dir::RTL => rustybuzz::Direction::RightToLeft,
+        _ => unimplemented!("vertical text layout"),
+    });
+    buffer.guess_segment_properties();
+
+    // By default, Harfbuzz will create zero-width space glyphs for default
+    // ignorables. This is probably useful for GUI apps that want noticable
+    // effects on the cursor for those, but for us it's not useful and hurts
+    // text extraction.
+    buffer.set_flags(BufferFlags::REMOVE_DEFAULT_IGNORABLES);
+
+    // Prepare the shape plan. This plan depends on direction, script, language,
+    // and features, but is independent from the text and can thus be memoized.
+    let plan = create_shape_plan(
+        &font,
+        buffer.direction(),
+        buffer.script(),
+        buffer.language().as_ref(),
+        &ctx.features,
+    );
+
+    // Shape!
+    let buffer = rustybuzz::shape_with_plan(font.rusty(), &plan, buffer);
+    let infos = buffer.glyph_infos();
+    let pos = buffer.glyph_positions();
+    let ltr = ctx.dir.is_positive();
+
+    // Collect the shaped glyphs, doing fallback and shaping parts again with
+    // the next font if necessary.
+    let mut i = 0;
+    while i < infos.len() {
+        let info = &infos[i];
+        let cluster = info.cluster as usize;
+
+        // Add the glyph to the shaped output.
+        if info.glyph_id != 0 {
+            // Determine the text range of the glyph.
+            let start = base + cluster;
+            let end = base
+                + if ltr { i.checked_add(1) } else { i.checked_sub(1) }
+                    .and_then(|last| infos.get(last))
+                    .map_or(text.len(), |info| info.cluster as usize);
+
+            let c = text[cluster..].chars().next().unwrap();
+            let script = c.script();
+            let x_advance = font.to_em(pos[i].x_advance);
+            ctx.glyphs.push(ShapedGlyph {
+                font: font.clone(),
+                glyph_id: info.glyph_id as u16,
+                // TODO: Don't ignore y_advance.
+                x_advance,
+                x_offset: font.to_em(pos[i].x_offset),
+                y_offset: font.to_em(pos[i].y_offset),
+                adjustability: Adjustability::default(),
+                range: start..end,
+                safe_to_break: !info.unsafe_to_break(),
+                c,
+                is_justifiable: is_justifiable(
+                    c,
+                    script,
+                    x_advance,
+                    Adjustability::default().stretchability,
+                ),
+                script,
+            });
+        } else {
+            // First, search for the end of the tofu sequence.
+            let k = i;
+            while infos.get(i + 1).is_some_and(|info| info.glyph_id == 0) {
+                i += 1;
+            }
+
+            // Then, determine the start and end text index for the tofu
+            // sequence.
+            //
+            // Examples:
+            // Everything is shown in visual order. Tofus are written as "_".
+            // We want to find out that the tofus span the text `2..6`.
+            // Note that the clusters are longer than 1 char.
+            //
+            // Left-to-right:
+            // Text:     h a l i h a l l o
+            // Glyphs:   A   _   _   C   E
+            // Clusters: 0   2   4   6   8
+            //              k=1 i=2
+            //
+            // Right-to-left:
+            // Text:     O L L A H I L A H
+            // Glyphs:   E   C   _   _   A
+            // Clusters: 8   6   4   2   0
+            //                  k=2 i=3
+            let start = infos[if ltr { k } else { i }].cluster as usize;
+            let end = if ltr { i.checked_add(1) } else { k.checked_sub(1) }
+                .and_then(|last| infos.get(last))
+                .map_or(text.len(), |info| info.cluster as usize);
+
+            // Trim half-baked cluster.
+            let remove = base + start..base + end;
+            while ctx.glyphs.last().is_some_and(|g| remove.contains(&g.range.start)) {
+                ctx.glyphs.pop();
+            }
+
+            // Recursively shape the tofu sequence with the next family.
+            shape_segment(ctx, base + start, &text[start..end], families.clone());
+        }
+
+        i += 1;
+    }
+
+    ctx.used.pop();
+}
+
+/// Create a shape plan.
+#[comemo::memoize]
+fn create_shape_plan(
+    font: &Font,
+    direction: rustybuzz::Direction,
+    script: rustybuzz::Script,
+    language: Option<&rustybuzz::Language>,
+    features: &[rustybuzz::Feature],
+) -> Arc<ShapePlan> {
+    Arc::new(rustybuzz::ShapePlan::new(
+        font.rusty(),
+        direction,
+        Some(script),
+        language,
+        features,
+    ))
+}
+
+/// Shape the text with tofus from the given font.
+fn shape_tofus(ctx: &mut ShapingContext, base: usize, text: &str, font: Font) {
+    let x_advance = font.advance(0).unwrap_or_default();
+    let add_glyph = |(cluster, c): (usize, char)| {
+        let start = base + cluster;
+        let end = start + c.len_utf8();
+        let script = c.script();
+        ctx.glyphs.push(ShapedGlyph {
+            font: font.clone(),
+            glyph_id: 0,
+            x_advance,
+            x_offset: Em::zero(),
+            y_offset: Em::zero(),
+            adjustability: Adjustability::default(),
+            range: start..end,
+            safe_to_break: true,
+            c,
+            is_justifiable: is_justifiable(
+                c,
+                script,
+                x_advance,
+                Adjustability::default().stretchability,
+            ),
+            script,
+        });
+    };
+    if ctx.dir.is_positive() {
+        text.char_indices().for_each(add_glyph);
+    } else {
+        text.char_indices().rev().for_each(add_glyph);
+    }
+}
+
+/// Apply tracking and spacing to the shaped glyphs.
+fn track_and_space(ctx: &mut ShapingContext) {
+    let tracking = Em::from_length(TextElem::tracking_in(ctx.styles), ctx.size);
+    let spacing =
+        TextElem::spacing_in(ctx.styles).map(|abs| Em::from_length(abs, ctx.size));
+
+    let mut glyphs = ctx.glyphs.iter_mut().peekable();
+    while let Some(glyph) = glyphs.next() {
+        // Make non-breaking space same width as normal space.
+        if glyph.c == '\u{00A0}' {
+            glyph.x_advance -= nbsp_delta(&glyph.font).unwrap_or_default();
+        }
+
+        if glyph.is_space() {
+            glyph.x_advance = spacing.relative_to(glyph.x_advance);
+        }
+
+        if glyphs
+            .peek()
+            .is_some_and(|next| glyph.range.start != next.range.start)
+        {
+            glyph.x_advance += tracking;
+        }
+    }
+}
+
+/// Calculate stretchability and shrinkability of each glyph,
+/// and CJK punctuation adjustments according to Chinese Layout Requirements.
+fn calculate_adjustability(ctx: &mut ShapingContext, lang: Lang, region: Option<Region>) {
+    let style = cjk_punct_style(lang, region);
+
+    for glyph in &mut ctx.glyphs {
+        glyph.adjustability = glyph.base_adjustability(style);
+    }
+
+    let mut glyphs = ctx.glyphs.iter_mut().peekable();
+    while let Some(glyph) = glyphs.next() {
+        // CNS style needs not further adjustment.
+        if glyph.is_cjk_punctuation() && matches!(style, CjkPunctStyle::Cns) {
+            continue;
+        }
+
+        // Now we apply consecutive punctuation adjustment, specified in Chinese Layout.
+        // Requirements, section 3.1.6.1 Punctuation Adjustment Space, and Japanese Layout
+        // Requirements, section 3.1 Line Composition Rules for Punctuation Marks
+        let Some(next) = glyphs.peek_mut() else { continue };
+        let width = glyph.x_advance;
+        let delta = width / 2.0;
+        if glyph.is_cjk_punctuation()
+            && next.is_cjk_punctuation()
+            && (glyph.shrinkability().1 + next.shrinkability().0) >= delta
+        {
+            let left_delta = glyph.shrinkability().1.min(delta);
+            glyph.shrink_right(left_delta);
+            next.shrink_left(delta - left_delta);
+        }
+    }
+}
+
+/// Difference between non-breaking and normal space.
+fn nbsp_delta(font: &Font) -> Option<Em> {
+    let space = font.ttf().glyph_index(' ')?.0;
+    let nbsp = font.ttf().glyph_index('\u{00A0}')?.0;
+    Some(font.advance(nbsp)? - font.advance(space)?)
+}
+
+/// Process the language and region of a style chain into a
+/// rustybuzz-compatible BCP 47 language.
+fn language(styles: StyleChain) -> rustybuzz::Language {
+    let mut bcp: EcoString = TextElem::lang_in(styles).as_str().into();
+    if let Some(region) = TextElem::region_in(styles) {
+        bcp.push('-');
+        bcp.push_str(region.as_str());
+    }
+    rustybuzz::Language::from_str(&bcp).unwrap()
+}
+
+/// Returns true if all glyphs in `glyphs` have ranges within the range `range`.
+#[cfg(debug_assertions)]
+fn assert_all_glyphs_in_range(glyphs: &[ShapedGlyph], text: &str, range: Range) {
+    if glyphs
+        .iter()
+        .any(|g| g.range.start < range.start || g.range.end > range.end)
+    {
+        panic!("one or more glyphs in {text:?} fell out of range");
+    }
+}
+
+/// Asserts that the ranges of `glyphs` is in the proper order according to
+/// `dir`.
+///
+/// This asserts instead of returning a bool in order to provide a more
+/// informative message when the invariant is violated.
+#[cfg(debug_assertions)]
+fn assert_glyph_ranges_in_order(glyphs: &[ShapedGlyph], dir: Dir) {
+    if glyphs.is_empty() {
+        return;
+    }
+
+    // Iterator::is_sorted and friends are unstable as of Rust 1.70.0
+    for i in 0..(glyphs.len() - 1) {
+        let a = &glyphs[i];
+        let b = &glyphs[i + 1];
+        let ord = a.range.start.cmp(&b.range.start);
+        let ord = if dir.is_positive() { ord } else { ord.reverse() };
+        if ord == std::cmp::Ordering::Greater {
+            panic!(
+                "glyph ranges should be monotonically {}, \
+                 but found glyphs out of order:\n\n\
+                 first: {a:#?}\nsecond: {b:#?}",
+                if dir.is_positive() { "increasing" } else { "decreasing" },
+            );
+        }
+    }
+}
+
+// The CJK punctuation that can appear at the beginning or end of a line.
+pub const BEGIN_PUNCT_PAT: &[char] =
+    &['“', '‘', '《', '〈', '（', '『', '「', '【', '〖', '〔', '［', '｛'];
+pub const END_PUNCT_PAT: &[char] = &[
+    '”', '’', '，', '．', '。', '、', '：', '；', '》', '〉', '）', '』', '」', '】',
+    '〗', '〕', '］', '｝', '？', '！',
+];
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum CjkPunctStyle {
+    /// Standard GB/T 15834-2011, used mostly in mainland China.
+    Gb,
+    /// Standard by Taiwan Ministry of Education, used in Taiwan and Hong Kong.
+    Cns,
+    /// Standard JIS X 4051, used in Japan.
+    Jis,
+}
+
+pub fn cjk_punct_style(lang: Lang, region: Option<Region>) -> CjkPunctStyle {
+    match (lang, region.as_ref().map(Region::as_str)) {
+        (Lang::CHINESE, Some("TW" | "HK")) => CjkPunctStyle::Cns,
+        (Lang::JAPANESE, _) => CjkPunctStyle::Jis,
+        // zh-CN, zh-SG, zh-MY use GB-style punctuation,
+        _ => CjkPunctStyle::Gb,
+    }
+}
+
+/// Whether the glyph is a space.
+fn is_space(c: char) -> bool {
+    matches!(c, ' ' | '\u{00A0}' | '　')
+}
+
+/// Whether the glyph is part of Chinese or Japanese script (i.e. CJ, not CJK).
+pub fn is_of_cj_script(c: char) -> bool {
+    is_cj_script(c, c.script())
+}
+
+/// Whether the glyph is part of Chinese or Japanese script (i.e. CJ, not CJK).
+/// The function is dedicated to typesetting Chinese or Japanese, which do not
+/// have spaces between words, so K is not checked here.
+fn is_cj_script(c: char, script: Script) -> bool {
+    use Script::*;
+    // U+30FC: Katakana-Hiragana Prolonged Sound Mark
+    matches!(script, Hiragana | Katakana | Han) || c == '\u{30FC}'
+}
+
+/// See <https://www.w3.org/TR/clreq/#punctuation_width_adjustment>
+fn is_cjk_left_aligned_punctuation(
+    c: char,
+    x_advance: Em,
+    stretchability: (Em, Em),
+    style: CjkPunctStyle,
+) -> bool {
+    use CjkPunctStyle::*;
+
+    // CJK quotation marks shares codepoints with latin quotation marks.
+    // But only the CJK ones have full width.
+    if matches!(c, '”' | '’') && x_advance + stretchability.1 == Em::one() {
+        return true;
+    }
+
+    if matches!(style, Gb | Jis) && matches!(c, '，' | '。' | '．' | '、' | '：' | '；')
+    {
+        return true;
+    }
+
+    if matches!(style, Gb) && matches!(c, '？' | '！') {
+        // In GB style, exclamations and question marks are also left aligned
+        // and can be adjusted. Note that they are not adjustable in other
+        // styles.
+        return true;
+    }
+
+    // See appendix A.3 https://www.w3.org/TR/clreq/#tables_of_chinese_punctuation_marks
+    matches!(c, '》' | '）' | '』' | '」' | '】' | '〗' | '〕' | '〉' | '］' | '｝')
+}
+
+/// See <https://www.w3.org/TR/clreq/#punctuation_width_adjustment>
+fn is_cjk_right_aligned_punctuation(
+    c: char,
+    x_advance: Em,
+    stretchability: (Em, Em),
+) -> bool {
+    // CJK quotation marks shares codepoints with latin quotation marks.
+    // But only the CJK ones have full width.
+    if matches!(c, '“' | '‘') && x_advance + stretchability.0 == Em::one() {
+        return true;
+    }
+    // See appendix A.3 https://www.w3.org/TR/clreq/#tables_of_chinese_punctuation_marks
+    matches!(c, '《' | '（' | '『' | '「' | '【' | '〖' | '〔' | '〈' | '［' | '｛')
+}
+
+/// See <https://www.w3.org/TR/clreq/#punctuation_width_adjustment>
+fn is_cjk_center_aligned_punctuation(c: char, style: CjkPunctStyle) -> bool {
+    if matches!(style, CjkPunctStyle::Cns)
+        && matches!(c, '，' | '。' | '．' | '、' | '：' | '；')
+    {
+        return true;
+    }
+
+    // U+30FB: Katakana Middle Dot
+    // U+00B7: Middle Dot
+    matches!(c, '\u{30FB}' | '\u{00B7}')
+}
+
+/// Whether the glyph is justifiable.
+///
+/// Quotations in latin script and CJK are unfortunately the same codepoint
+/// (U+2018, U+2019, U+201C, U+201D), but quotations in Chinese must be
+/// fullwidth. This heuristics can therefore fail for monospace latin fonts.
+/// However, since monospace fonts are usually not justified this edge case
+/// should be rare enough.
+fn is_justifiable(
+    c: char,
+    script: Script,
+    x_advance: Em,
+    stretchability: (Em, Em),
+) -> bool {
+    // punctuation style is not relevant here.
+    let style = CjkPunctStyle::Gb;
+    is_space(c)
+        || is_cj_script(c, script)
+        || is_cjk_left_aligned_punctuation(c, x_advance, stretchability, style)
+        || is_cjk_right_aligned_punctuation(c, x_advance, stretchability)
+        || is_cjk_center_aligned_punctuation(c, style)
+}