Make ligatures copyable and searchable

Fixes #479 Fixes #1040
author: Laurenz <laurmaedje@gmail.com> 2023-05-03 10:33:18 +0200
committer: Laurenz <laurmaedje@gmail.com> 2023-05-03 10:33:18 +0200
commit: ad347632ab95e29eb5180b27142f5c264dfc611a (patch)
tree: 2742a33f4c3d800a86e977de04fa2cec7104c43f
parent: bcc014c4e177cc4e8cf5ca8c24990908b507c0f8 (diff)
17 files changed, 221 insertions, 179 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 14a297fd..3fd1e3b8 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -117,6 +117,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
 
 [[package]]
+name = "az"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7b7e4c2464d97fe331d41de9d5db0def0a96f4d823b8b32a2efd503578988973"
+
+[[package]]
 name = "base64"
 version = "0.13.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1385,9 +1391,9 @@ checksum = "8835116a5c179084a830efb3adc117ab007512b535bc1a21c991d3b32a6b44dd"
 
 [[package]]
 name = "pdf-writer"
-version = "0.7.0"
+version = "0.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "63f45f7c7538e67c58cb4977e4f97bbd75fbd3990d827d28d597ec746291f644"
+checksum = "30900f178ea696fc5d9637171f98aaa93d5aae54f0726726df68fc3e32810db6"
 dependencies = [
  "bitflags 1.3.2",
  "itoa",
@@ -2306,6 +2312,7 @@ dependencies = [
  "tracing",
  "ttf-parser",
  "typst-macros",
+ "unicode-general-category",
  "unicode-math-class",
  "unicode-segmentation",
  "unicode-xid",
@@ -2366,6 +2373,7 @@ dependencies = [
 name = "typst-library"
 version = "0.3.0"
 dependencies = [
+ "az",
  "chinese-number",
  "comemo",
  "csv",
diff --git a/Cargo.toml b/Cargo.toml
index a0e51002..1c404061 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -33,7 +33,7 @@ indexmap = "1.9.3"
 log = "0.4"
 miniz_oxide = "0.7"
 once_cell = "1"
-pdf-writer = "0.7"
+pdf-writer = "0.7.1"
 pixglyph = "0.1"
 regex = "1"
 resvg = { version = "0.32", default-features = false }
@@ -46,6 +46,7 @@ svg2pdf = { git = "https://github.com/typst/svg2pdf" }
 tiny-skia = "0.9.0"
 tracing = "0.1.37"
 ttf-parser = "0.18.1"
+unicode-general-category = "0.6"
 unicode-math-class = "0.1"
 unicode-segmentation = "1"
 unicode-xid = "0.2"
diff --git a/assets/fonts/IBMPlexSansDevanagari-Regular.ttf b/assets/fonts/IBMPlexSansDevanagari-Regular.ttf
new file mode 100644
index 00000000..5d7c8f0f
--- /dev/null
+++ b/assets/fonts/IBMPlexSansDevanagari-Regular.ttf
diff --git a/docs/src/reference/details.yml b/docs/src/reference/details.yml
index 1926fb77..22b67963 100644
--- a/docs/src/reference/details.yml
+++ b/docs/src/reference/details.yml
@@ -159,7 +159,7 @@ construct: |
 data-loading: |
   Data loading from external files.
 
-  These functions help you with embedding data from experiments and APIs in your
+  These functions help you with embedding data from experiments in your
   documents.
 
 utility: |
diff --git a/library/Cargo.toml b/library/Cargo.toml
index 033058f3..499170cb 100644
--- a/library/Cargo.toml
+++ b/library/Cargo.toml
@@ -16,6 +16,7 @@ bench = false
 
 [dependencies]
 typst = { path = ".." }
+az = "1.2"
 chinese-number = { version = "0.7.2", default-features = false, features = ["number-to-chinese"] }
 comemo = "0.2.2"
 csv = "1"
diff --git a/library/src/layout/par.rs b/library/src/layout/par.rs
index a6ad647b..0c3a9a3c 100644
--- a/library/src/layout/par.rs
+++ b/library/src/layout/par.rs
@@ -1139,8 +1139,7 @@ fn line<'a>(
         // are no other items in the line.
         if hyphen || start + shaped.text.len() > range.end {
             if hyphen || start < range.end || before.is_empty() {
-                let shifted = start - base..range.end - base;
-                let mut reshaped = shaped.reshape(vt, &p.spans, shifted);
+                let mut reshaped = shaped.reshape(vt, &p.spans, start..range.end);
                 if hyphen || shy {
                     reshaped.push_hyphen(vt);
                 }
@@ -1162,8 +1161,7 @@ fn line<'a>(
         // Reshape if necessary.
         if range.start + shaped.text.len() > end {
             if range.start < end {
-                let shifted = range.start - base..end - base;
-                let reshaped = shaped.reshape(vt, &p.spans, shifted);
+                let reshaped = shaped.reshape(vt, &p.spans, range.start..end);
                 width += reshaped.width;
                 first = Some(Item::Text(reshaped));
             }
diff --git a/library/src/math/fragment.rs b/library/src/math/fragment.rs
index b0991630..40dca347 100644
--- a/library/src/math/fragment.rs
+++ b/library/src/math/fragment.rs
@@ -222,13 +222,13 @@ impl GlyphFragment {
             size: self.font_size,
             fill: self.fill,
             lang: self.lang,
+            text: self.c.into(),
             glyphs: vec![Glyph {
                 id: self.id.0,
-                c: self.c,
                 x_advance: Em::from_length(self.width, self.font_size),
                 x_offset: Em::zero(),
-                span: self.span,
-                offset: 0,
+                range: 0..self.c.len_utf8() as u16,
+                span: (self.span, 0),
             }],
         };
         let size = Size::new(self.width, self.ascent + self.descent);
diff --git a/library/src/text/shaping.rs b/library/src/text/shaping.rs
index 1e1ccc99..7d5703bc 100644
--- a/library/src/text/shaping.rs
+++ b/library/src/text/shaping.rs
@@ -1,6 +1,7 @@
 use std::ops::Range;
 use std::str::FromStr;
 
+use az::SaturatingAs;
 use rustybuzz::{Feature, Tag, UnicodeBuffer};
 use typst::font::{Font, FontVariant};
 use typst::util::SliceExt;
@@ -47,20 +48,18 @@ pub struct ShapedGlyph {
     pub x_offset: Em,
     /// The vertical offset of the glyph.
     pub y_offset: Em,
-    /// The byte index in the source text where this glyph's cluster starts. A
-    /// cluster is a sequence of one or multiple glyphs that cannot be
-    /// separated and must always be treated as a union.
-    pub cluster: usize,
+    /// The byte range of this glyph's cluster in the full paragraph. A cluster
+    /// is a sequence of one or multiple glyphs that cannot be separated and
+    /// must always be treated as a union.
+    pub range: Range<usize>,
     /// Whether splitting the shaping result before this glyph would yield the
     /// same results as shaping the parts to both sides of `text_index`
     /// separately.
     pub safe_to_break: bool,
     /// The first char in this glyph's cluster.
     pub c: char,
-    /// The source code location of the text.
-    pub span: Span,
-    /// The offset within the spanned text.
-    pub offset: u16,
+    /// The source code location of the glyph and its byte offset within it.
+    pub span: (Span, u16),
 }
 
 #[derive(Debug, Clone, Default)]
@@ -181,6 +180,12 @@ impl<'a> ShapedText<'a> {
         for ((font, y_offset), group) in
             self.glyphs.as_ref().group_by_key(|g| (g.font.clone(), g.y_offset))
         {
+            let mut range = group[0].range.clone();
+            for glyph in group {
+                range.start = range.start.min(glyph.range.start);
+                range.end = range.end.max(glyph.range.end);
+            }
+
             let pos = Point::new(offset, top + shift - y_offset.at(self.size));
             let glyphs = group
                 .iter()
@@ -195,8 +200,8 @@ impl<'a> ShapedText<'a> {
                     } else {
                         glyph.stretchability().1
                     };
-                    let justification_left = adjustability_left * justification_ratio;
 
+                    let justification_left = adjustability_left * justification_ratio;
                     let mut justification_right =
                         adjustability_right * justification_ratio;
                     if glyph.is_justifiable() {
@@ -206,15 +211,16 @@ impl<'a> ShapedText<'a> {
 
                     frame.size_mut().x += justification_left.at(self.size)
                         + justification_right.at(self.size);
+
                     Glyph {
                         id: glyph.glyph_id,
                         x_advance: glyph.x_advance
                             + justification_left
                             + justification_right,
                         x_offset: glyph.x_offset + justification_left,
-                        c: glyph.c,
+                        range: (glyph.range.start - range.start).saturating_as()
+                            ..(glyph.range.end - range.start).saturating_as(),
                         span: glyph.span,
-                        offset: glyph.offset,
                     }
                 })
                 .collect();
@@ -224,6 +230,7 @@ impl<'a> ShapedText<'a> {
                 size: self.size,
                 lang,
                 fill: fill.clone(),
+                text: self.text[range.start - self.base..range.end - self.base].into(),
                 glyphs,
             };
 
@@ -318,16 +325,19 @@ impl<'a> ShapedText<'a> {
 
     /// Reshape a range of the shaped text, reusing information from this
     /// shaping process if possible.
+    ///
+    /// The `text_range` is relative to the whole paragraph.
     pub fn reshape(
         &'a self,
         vt: &Vt,
         spans: &SpanMapper,
         text_range: Range<usize>,
     ) -> ShapedText<'a> {
+        let text = &self.text[text_range.start - self.base..text_range.end - self.base];
         if let Some(glyphs) = self.slice_safe_to_break(text_range.clone()) {
             Self {
-                base: self.base + text_range.start,
-                text: &self.text[text_range],
+                base: text_range.start,
+                text,
                 dir: self.dir,
                 styles: self.styles,
                 size: self.size,
@@ -336,14 +346,7 @@ impl<'a> ShapedText<'a> {
                 glyphs: Cow::Borrowed(glyphs),
             }
         } else {
-            shape(
-                vt,
-                self.base + text_range.start,
-                &self.text[text_range],
-                spans,
-                self.styles,
-                self.dir,
-            )
+            shape(vt, text_range.start, text, spans, self.styles, self.dir)
         }
     }
 
@@ -358,7 +361,11 @@ impl<'a> ShapedText<'a> {
             let ttf = font.ttf();
             let glyph_id = ttf.glyph_index('-')?;
             let x_advance = font.to_em(ttf.glyph_hor_advance(glyph_id)?);
-            let cluster = self.glyphs.last().map(|g| g.cluster).unwrap_or_default();
+            let range = self
+                .glyphs
+                .last()
+                .map(|g| g.range.end..g.range.end)
+                .unwrap_or_default();
             self.width += x_advance.at(self.size);
             self.glyphs.to_mut().push(ShapedGlyph {
                 font,
@@ -366,11 +373,10 @@ impl<'a> ShapedText<'a> {
                 x_advance,
                 x_offset: Em::zero(),
                 y_offset: Em::zero(),
-                cluster,
+                range,
                 safe_to_break: true,
                 c: '-',
-                span: Span::detached(),
-                offset: 0,
+                span: (Span::detached(), 0),
             });
             Some(())
         });
@@ -396,9 +402,9 @@ impl<'a> ShapedText<'a> {
 
         // Handle edge cases.
         let len = self.glyphs.len();
-        if text_index == 0 {
+        if text_index == self.base {
             return Some(if ltr { 0 } else { len });
-        } else if text_index == self.text.len() {
+        } else if text_index == self.base + self.text.len() {
             return Some(if ltr { len } else { 0 });
         }
 
@@ -406,7 +412,7 @@ impl<'a> ShapedText<'a> {
         let mut idx = self
             .glyphs
             .binary_search_by(|g| {
-                let ordering = g.cluster.cmp(&text_index);
+                let ordering = g.range.start.cmp(&text_index);
                 if ltr {
                     ordering
                 } else {
@@ -422,7 +428,7 @@ impl<'a> ShapedText<'a> {
 
         // Search for the outermost glyph with the text index.
         while let Some(next) = next(idx, 1) {
-            if self.glyphs.get(next).map_or(true, |g| g.cluster != text_index) {
+            if self.glyphs.get(next).map_or(true, |g| g.range.start != text_index) {
                 break;
             }
             idx = next;
@@ -444,7 +450,6 @@ impl Debug for ShapedText<'_> {
 /// Holds shaping results and metadata common to all shaped segments.
 struct ShapingContext<'a> {
     vt: &'a Vt<'a>,
-    base: usize,
     spans: &'a SpanMapper,
     glyphs: Vec<ShapedGlyph>,
     used: Vec<Font>,
@@ -468,7 +473,6 @@ pub fn shape<'a>(
     let size = TextElem::size_in(styles);
     let mut ctx = ShapingContext {
         vt,
-        base,
         spans,
         size,
         glyphs: vec![],
@@ -481,7 +485,7 @@ pub fn shape<'a>(
     };
 
     if !text.is_empty() {
-        shape_segment(&mut ctx, 0, text, families(styles));
+        shape_segment(&mut ctx, base, text, families(styles));
     }
 
     track_and_space(&mut ctx);
@@ -552,6 +556,7 @@ fn shape_segment(
     let buffer = rustybuzz::shape(font.rusty(), &ctx.tags, buffer);
     let infos = buffer.glyph_infos();
     let pos = buffer.glyph_positions();
+    let ltr = ctx.dir.is_positive();
 
     // Collect the shaped glyphs, doing fallback and shaping parts again with
     // the next font if necessary.
@@ -560,68 +565,66 @@ fn shape_segment(
         let info = &infos[i];
         let cluster = info.cluster as usize;
 
+        // Add the glyph to the shaped output.
         if info.glyph_id != 0 {
-            // Add the glyph to the shaped output.
-            // TODO: Don't ignore y_advance.
-            let (span, offset) = ctx.spans.span_at(ctx.base + cluster);
+            // Determine the text range of the glyph.
+            let start = base + cluster;
+            let end = base
+                + if ltr { i.checked_add(1) } else { i.checked_sub(1) }
+                    .and_then(|last| infos.get(last))
+                    .map_or(text.len(), |info| info.cluster as usize);
+
             ctx.glyphs.push(ShapedGlyph {
                 font: font.clone(),
                 glyph_id: info.glyph_id as u16,
+                // TODO: Don't ignore y_advance.
                 x_advance: font.to_em(pos[i].x_advance),
                 x_offset: font.to_em(pos[i].x_offset),
                 y_offset: font.to_em(pos[i].y_offset),
-                cluster: base + cluster,
+                range: start..end,
                 safe_to_break: !info.unsafe_to_break(),
                 c: text[cluster..].chars().next().unwrap(),
-                span,
-                offset,
+                span: ctx.spans.span_at(start),
             });
         } else {
-            // Determine the source text range for the tofu sequence.
-            let range = {
-                // First, search for the end of the tofu sequence.
-                let k = i;
-                while infos.get(i + 1).map_or(false, |info| info.glyph_id == 0) {
-                    i += 1;
-                }
-
-                // Then, determine the start and end text index.
-                //
-                // Examples:
-                // Everything is shown in visual order. Tofus are written as "_".
-                // We want to find out that the tofus span the text `2..6`.
-                // Note that the clusters are longer than 1 char.
-                //
-                // Left-to-right:
-                // Text:     h a l i h a l l o
-                // Glyphs:   A   _   _   C   E
-                // Clusters: 0   2   4   6   8
-                //              k=1 i=2
-                //
-                // Right-to-left:
-                // Text:     O L L A H I L A H
-                // Glyphs:   E   C   _   _   A
-                // Clusters: 8   6   4   2   0
-                //                  k=2 i=3
-                let ltr = ctx.dir.is_positive();
-                let first = if ltr { k } else { i };
-                let start = infos[first].cluster as usize;
-                let last = if ltr { i.checked_add(1) } else { k.checked_sub(1) };
-                let end = last
-                    .and_then(|last| infos.get(last))
-                    .map_or(text.len(), |info| info.cluster as usize);
+            // First, search for the end of the tofu sequence.
+            let k = i;
+            while infos.get(i + 1).map_or(false, |info| info.glyph_id == 0) {
+                i += 1;
+            }
 
-                start..end
-            };
+            // Then, determine the start and end text index for the tofu
+            // sequence.
+            //
+            // Examples:
+            // Everything is shown in visual order. Tofus are written as "_".
+            // We want to find out that the tofus span the text `2..6`.
+            // Note that the clusters are longer than 1 char.
+            //
+            // Left-to-right:
+            // Text:     h a l i h a l l o
+            // Glyphs:   A   _   _   C   E
+            // Clusters: 0   2   4   6   8
+            //              k=1 i=2
+            //
+            // Right-to-left:
+            // Text:     O L L A H I L A H
+            // Glyphs:   E   C   _   _   A
+            // Clusters: 8   6   4   2   0
+            //                  k=2 i=3
+            let start = infos[if ltr { k } else { i }].cluster as usize;
+            let end = if ltr { i.checked_add(1) } else { k.checked_sub(1) }
+                .and_then(|last| infos.get(last))
+                .map_or(text.len(), |info| info.cluster as usize);
 
             // Trim half-baked cluster.
-            let remove = base + range.start..base + range.end;
-            while ctx.glyphs.last().map_or(false, |g| remove.contains(&g.cluster)) {
+            let remove = base + start..base + end;
+            while ctx.glyphs.last().map_or(false, |g| remove.contains(&g.range.start)) {
                 ctx.glyphs.pop();
             }
 
             // Recursively shape the tofu sequence with the next family.
-            shape_segment(ctx, base + range.start, &text[range], families.clone());
+            shape_segment(ctx, base + start, &text[start..end], families.clone());
         }
 
         i += 1;
@@ -634,19 +637,18 @@ fn shape_segment(
 fn shape_tofus(ctx: &mut ShapingContext, base: usize, text: &str, font: Font) {
     let x_advance = font.advance(0).unwrap_or_default();
     for (cluster, c) in text.char_indices() {
-        let cluster = base + cluster;
-        let (span, offset) = ctx.spans.span_at(ctx.base + cluster);
+        let start = base + cluster;
+        let end = start + c.len_utf8();
         ctx.glyphs.push(ShapedGlyph {
             font: font.clone(),
             glyph_id: 0,
             x_advance,
             x_offset: Em::zero(),
             y_offset: Em::zero(),
-            cluster,
+            range: start..end,
             safe_to_break: true,
             c,
-            span,
-            offset,
+            span: ctx.spans.span_at(start),
         });
     }
 }
@@ -668,7 +670,10 @@ fn track_and_space(ctx: &mut ShapingContext) {
             glyph.x_advance = spacing.relative_to(glyph.x_advance);
         }
 
-        if glyphs.peek().map_or(false, |next| glyph.cluster != next.cluster) {
+        if glyphs
+            .peek()
+            .map_or(false, |next| glyph.range.start != next.range.start)
+        {
             glyph.x_advance += tracking;
         }
     }
diff --git a/src/doc.rs b/src/doc.rs
index 0a744ffc..0a9b9a8d 100644
--- a/src/doc.rs
+++ b/src/doc.rs
@@ -1,7 +1,8 @@
 //! Finished documents.
 
-use std::fmt::{self, Debug, Formatter, Write};
+use std::fmt::{self, Debug, Formatter};
 use std::num::NonZeroUsize;
+use std::ops::Range;
 use std::str::FromStr;
 use std::sync::Arc;
 
@@ -114,23 +115,6 @@ impl Frame {
     pub fn items(&self) -> std::slice::Iter<'_, (Point, FrameItem)> {
         self.items.iter()
     }
-
-    /// Approximately recover the text inside of the frame and its children.
-    pub fn text(&self) -> EcoString {
-        let mut text = EcoString::new();
-        for (_, item) in self.items() {
-            match item {
-                FrameItem::Text(item) => {
-                    for glyph in &item.glyphs {
-                        text.push(glyph.c);
-                    }
-                }
-                FrameItem::Group(group) => text.push_str(&group.frame.text()),
-                _ => {}
-            }
-        }
-        text
-    }
 }
 
 /// Insert items and subframes.
@@ -476,6 +460,8 @@ pub struct TextItem {
     pub fill: Paint,
     /// The natural language of the text.
     pub lang: Lang,
+    /// The item's plain text.
+    pub text: EcoString,
     /// The glyphs.
     pub glyphs: Vec<Glyph>,
 }
@@ -489,19 +475,14 @@ impl TextItem {
 
 impl Debug for TextItem {
     fn fmt(&self, f: &mut Formatter) -> fmt::Result {
-        // This is only a rough approximation of the source text.
-        f.write_str("Text(\"")?;
-        for glyph in &self.glyphs {
-            for c in glyph.c.escape_debug() {
-                f.write_char(c)?;
-            }
-        }
-        f.write_str("\")")
+        f.write_str("Text(")?;
+        self.text.fmt(f)?;
+        f.write_str(")")
     }
 }
 
 /// A glyph in a run of shaped text.
-#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
+#[derive(Debug, Clone, Eq, PartialEq, Hash)]
 pub struct Glyph {
     /// The glyph's index in the font.
     pub id: u16,
@@ -509,12 +490,17 @@ pub struct Glyph {
     pub x_advance: Em,
     /// The horizontal offset of the glyph.
     pub x_offset: Em,
-    /// The first character of the glyph's cluster.
-    pub c: char,
+    /// The range of the glyph in its item's text.
+    pub range: Range<u16>,
     /// The source code location of the text.
-    pub span: Span,
-    /// The offset within the spanned text.
-    pub offset: u16,
+    pub span: (Span, u16),
+}
+
+impl Glyph {
+    /// The range of the glyph in its item's text.
+    pub fn range(&self) -> Range<usize> {
+        usize::from(self.range.start)..usize::from(self.range.end)
+    }
 }
 
 /// An identifier for a natural language.
diff --git a/src/export/pdf/font.rs b/src/export/pdf/font.rs
index de79976a..1e2f9c93 100644
--- a/src/export/pdf/font.rs
+++ b/src/export/pdf/font.rs
@@ -1,13 +1,21 @@
 use std::collections::BTreeMap;
 
-use ecow::eco_format;
+use ecow::{eco_format, EcoString};
 use pdf_writer::types::{CidFontType, FontFlags, SystemInfo, UnicodeCmap};
 use pdf_writer::{Filter, Finish, Name, Rect, Str};
 use ttf_parser::{name_id, GlyphId, Tag};
+use unicode_general_category::GeneralCategory;
 
 use super::{deflate, EmExt, PdfContext, RefExt};
 use crate::util::SliceExt;
 
+const CMAP_NAME: Name = Name(b"Custom");
+const SYSTEM_INFO: SystemInfo = SystemInfo {
+    registry: Str(b"Adobe"),
+    ordering: Str(b"Identity"),
+    supplement: 0,
+};
+
 /// Embed all used fonts into the PDF.
 #[tracing::instrument(skip_all)]
 pub fn write_fonts(ctx: &mut PdfContext) {
@@ -19,7 +27,7 @@ pub fn write_fonts(ctx: &mut PdfContext) {
         let data_ref = ctx.alloc.bump();
         ctx.font_refs.push(type0_ref);
 
-        let glyphs = &ctx.glyph_sets[font];
+        let glyph_set = ctx.glyph_sets.get_mut(font).unwrap();
         let metrics = font.metrics();
         let ttf = font.ttf();
 
@@ -29,12 +37,6 @@ pub fn write_fonts(ctx: &mut PdfContext) {
 
         let base_font = eco_format!("ABCDEF+{}", postscript_name);
         let base_font = Name(base_font.as_bytes());
-        let cmap_name = Name(b"Custom");
-        let system_info = SystemInfo {
-            registry: Str(b"Adobe"),
-            ordering: Str(b"Identity"),
-            supplement: 0,
-        };
 
         // Write the base font object referencing the CID font.
         ctx.writer
@@ -59,7 +61,7 @@ pub fn write_fonts(ctx: &mut PdfContext) {
         let mut cid = ctx.writer.cid_font(cid_ref);
         cid.subtype(subtype);
         cid.base_font(base_font);
-        cid.system_info(system_info);
+        cid.system_info(SYSTEM_INFO);
         cid.font_descriptor(descriptor_ref);
         cid.default_width(0.0);
 
@@ -70,7 +72,7 @@ pub fn write_fonts(ctx: &mut PdfContext) {
         // Extract the widths of all glyphs.
         let num_glyphs = ttf.number_of_glyphs();
         let mut widths = vec![0.0; num_glyphs as usize];
-        for &g in glyphs {
+        for &g in glyph_set.keys() {
             let x = ttf.glyph_hor_advance(GlyphId(g)).unwrap_or(0);
             widths[g as usize] = font.to_em(x).to_font_units();
         }
@@ -130,42 +132,15 @@ pub fn write_fonts(ctx: &mut PdfContext) {
 
         font_descriptor.finish();
 
-        // Compute a reverse mapping from glyphs to unicode.
-        let cmap = {
-            let mut mapping = BTreeMap::new();
-            for subtable in
-                ttf.tables().cmap.into_iter().flat_map(|table| table.subtables)
-            {
-                if subtable.is_unicode() {
-                    subtable.codepoints(|n| {
-                        if let Some(c) = std::char::from_u32(n) {
-                            if let Some(GlyphId(g)) = ttf.glyph_index(c) {
-                                if glyphs.contains(&g) {
-                                    mapping.insert(g, c);
-                                }
-                            }
-                        }
-                    });
-                }
-            }
-
-            let mut cmap = UnicodeCmap::new(cmap_name, system_info);
-            for (g, c) in mapping {
-                cmap.pair(g, c);
-            }
-            cmap
-        };
-
         // Write the /ToUnicode character map, which maps glyph ids back to
         // unicode codepoints to enable copying out of the PDF.
-        ctx.writer
-            .cmap(cmap_ref, &deflate(&cmap.finish()))
-            .filter(Filter::FlateDecode);
+        let cmap = create_cmap(ttf, glyph_set);
+        ctx.writer.cmap(cmap_ref, &cmap.finish());
 
         // Subset and write the font's bytes.
         let data = font.data();
         let subsetted = {
-            let glyphs: Vec<_> = glyphs.iter().copied().collect();
+            let glyphs: Vec<_> = glyph_set.keys().copied().collect();
             let profile = subsetter::Profile::pdf(&glyphs);
             subsetter::subset(data, font.index(), profile)
         };
@@ -183,3 +158,44 @@ pub fn write_fonts(ctx: &mut PdfContext) {
         stream.finish();
     }
 }
+
+/// Create a /ToUnicode CMap.
+fn create_cmap(
+    ttf: &ttf_parser::Face,
+    glyph_set: &mut BTreeMap<u16, EcoString>,
+) -> UnicodeCmap {
+    // For glyphs that have codepoints mapping to them in the font's cmap table,
+    // we prefer those over pre-existing text mappings from the document. Only
+    // things that don't have a corresponding codepoint (or only a private-use
+    // one) like the "Th" in Linux Libertine get the text of their first
+    // occurrence in the document instead.
+    for subtable in ttf.tables().cmap.into_iter().flat_map(|table| table.subtables) {
+        if !subtable.is_unicode() {
+            continue;
+        }
+
+        subtable.codepoints(|n| {
+            let Some(c) = std::char::from_u32(n) else { return };
+            if unicode_general_category::get_general_category(c)
+                == GeneralCategory::PrivateUse
+            {
+                return;
+            }
+
+            let Some(GlyphId(g)) = ttf.glyph_index(c) else { return };
+            if glyph_set.contains_key(&g) {
+                glyph_set.insert(g, c.into());
+            }
+        });
+    }
+
+    // Produce a reverse mapping from glyphs to unicode strings.
+    let mut cmap = UnicodeCmap::new(CMAP_NAME, SYSTEM_INFO);
+    for (&g, text) in glyph_set.iter() {
+        if !text.is_empty() {
+            cmap.pair_with_multiple(g, text.chars());
+        }
+    }
+
+    cmap
+}
diff --git a/src/export/pdf/mod.rs b/src/export/pdf/mod.rs
index ffbf67a3..48485862 100644
--- a/src/export/pdf/mod.rs
+++ b/src/export/pdf/mod.rs
@@ -6,9 +6,10 @@ mod outline;
 mod page;
 
 use std::cmp::Eq;
-use std::collections::{HashMap, HashSet};
+use std::collections::{BTreeMap, HashMap};
 use std::hash::Hash;
 
+use ecow::EcoString;
 use pdf_writer::types::Direction;
 use pdf_writer::{Finish, Name, PdfWriter, Ref, TextStr};
 use xmp_writer::{LangId, RenditionClass, XmpWriter};
@@ -52,7 +53,13 @@ pub struct PdfContext<'a> {
     page_refs: Vec<Ref>,
     font_map: Remapper<Font>,
     image_map: Remapper<Image>,
-    glyph_sets: HashMap<Font, HashSet<u16>>,
+    /// For each font, a mapping from used glyphs to their text representation.
+    /// May contain multiple chars in case of ligatures or similar things. If the
+    /// same glyph has different text representations within one document, we
+    /// just save the first one. The resulting strings are used for the PDF's
+    /// /ToUnicode map for glyphs that don't have an entry in the font's cmap.
+    /// This is important for copy-paste and searching.
+    glyph_sets: HashMap<Font, BTreeMap<u16, EcoString>>,
     languages: HashMap<Lang, usize>,
 }
 
diff --git a/src/export/pdf/page.rs b/src/export/pdf/page.rs
index 35a4f5dc..22e590d5 100644
--- a/src/export/pdf/page.rs
+++ b/src/export/pdf/page.rs
@@ -364,11 +364,12 @@ fn write_group(ctx: &mut PageContext, pos: Point, group: &GroupItem) {
 /// Encode a text run into the content stream.
 fn write_text(ctx: &mut PageContext, x: f32, y: f32, text: &TextItem) {
     *ctx.parent.languages.entry(text.lang).or_insert(0) += text.glyphs.len();
-    ctx.parent
-        .glyph_sets
-        .entry(text.font.clone())
-        .or_default()
-        .extend(text.glyphs.iter().map(|g| g.id));
+
+    let glyph_set = ctx.parent.glyph_sets.entry(text.font.clone()).or_default();
+    for g in &text.glyphs {
+        let segment = &text.text[g.range()];
+        glyph_set.entry(g.id).or_insert_with(|| segment.into());
+    }
 
     ctx.set_fill(&text.fill);
     ctx.set_font(&text.font, text.size);
diff --git a/src/ide/jump.rs b/src/ide/jump.rs
index fc98747c..42ed2ab5 100644
--- a/src/ide/jump.rs
+++ b/src/ide/jump.rs
@@ -67,7 +67,8 @@ pub fn jump_from_click(
 
             FrameItem::Text(text) => {
                 for glyph in &text.glyphs {
-                    if glyph.span.is_detached() {
+                    let (span, span_offset) = glyph.span;
+                    if span.is_detached() {
                         continue;
                     }
 
@@ -77,13 +78,13 @@ pub fn jump_from_click(
                         Size::new(width, text.size),
                         click,
                     ) {
-                        let source = world.source(glyph.span.source());
-                        let node = source.find(glyph.span)?;
+                        let source = world.source(span.source());
+                        let node = source.find(span)?;
                         let pos = if node.kind() == SyntaxKind::Text {
                             let range = node.range();
-                            let mut offset = range.start + usize::from(glyph.offset);
+                            let mut offset = range.start + usize::from(span_offset);
                             if (click.x - pos.x) > width / 2.0 {
-                                offset += glyph.c.len_utf8();
+                                offset += glyph.range().len();
                             }
                             offset.min(range.end)
                         } else {
@@ -150,7 +151,7 @@ fn find_in_frame(frame: &Frame, span: Span) -> Option<Point> {
 
         if let FrameItem::Text(text) = item {
             for glyph in &text.glyphs {
-                if glyph.span == span {
+                if glyph.span.0 == span {
                     return Some(pos);
                 }
                 pos.x += glyph.x_advance.at(text.size);
diff --git a/tests/ref/text/copy-paste.png b/tests/ref/text/copy-paste.png
new file mode 100644
index 00000000..cbbad940
--- /dev/null
+++ b/tests/ref/text/copy-paste.png
diff --git a/tests/ref/text/shaping.png b/tests/ref/text/shaping.png
index 7b33074f..278fe8ee 100644
--- a/tests/ref/text/shaping.png
+++ b/tests/ref/text/shaping.png
diff --git a/tests/src/tests.rs b/tests/src/tests.rs
index 0e22084c..2a0b74ea 100644
--- a/tests/src/tests.rs
+++ b/tests/src/tests.rs
@@ -353,9 +353,18 @@ fn test(
     pdf_path: Option<&Path>,
     args: &Args,
 ) -> bool {
-    let name = src_path.strip_prefix(TYP_DIR).unwrap_or(src_path);
+    struct PanicGuard<'a>(&'a Path);
+    impl Drop for PanicGuard<'_> {
+        fn drop(&mut self) {
+            if std::thread::panicking() {
+                println!("Panicked in {}", self.0.display());
+            }
+        }
+    }
 
+    let name = src_path.strip_prefix(TYP_DIR).unwrap_or(src_path);
     let text = fs::read_to_string(src_path).unwrap();
+    let _guard = PanicGuard(name);
 
     let mut output = String::new();
     let mut ok = true;
@@ -401,6 +410,7 @@ fn test(
                 line,
                 &mut rng,
             );
+
             ok &= part_ok;
             compare_ever |= compare_here;
             frames.extend(part_frames);
diff --git a/tests/typ/text/copy-paste.typ b/tests/typ/text/copy-paste.typ
new file mode 100644
index 00000000..5d826482
--- /dev/null
+++ b/tests/typ/text/copy-paste.typ
@@ -0,0 +1,8 @@
+// Test copy-paste and search in PDF with ligatures
+// and Arabic text. Must be tested manually!
+
+---
+The after fira 🏳️‍🌈!
+
+#set text(lang: "ar", font: "Noto Sans Arabic")
+مرحبًا
author	Laurenz <laurmaedje@gmail.com>	2023-05-03 10:33:18 +0200
committer	Laurenz <laurmaedje@gmail.com>	2023-05-03 10:33:18 +0200
commit	ad347632ab95e29eb5180b27142f5c264dfc611a (patch)
tree	2742a33f4c3d800a86e977de04fa2cec7104c43f
parent	bcc014c4e177cc4e8cf5ca8c24990908b507c0f8 (diff)