summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLaurenz <laurmaedje@gmail.com>2023-05-03 10:33:18 +0200
committerLaurenz <laurmaedje@gmail.com>2023-05-03 10:33:18 +0200
commitad347632ab95e29eb5180b27142f5c264dfc611a (patch)
tree2742a33f4c3d800a86e977de04fa2cec7104c43f
parentbcc014c4e177cc4e8cf5ca8c24990908b507c0f8 (diff)
Make ligatures copyable and searchable
Fixes #479 Fixes #1040
-rw-r--r--Cargo.lock12
-rw-r--r--Cargo.toml3
-rw-r--r--assets/fonts/IBMPlexSansDevanagari-Regular.ttfbin0 -> 350804 bytes
-rw-r--r--docs/src/reference/details.yml2
-rw-r--r--library/Cargo.toml1
-rw-r--r--library/src/layout/par.rs6
-rw-r--r--library/src/math/fragment.rs6
-rw-r--r--library/src/text/shaping.rs169
-rw-r--r--src/doc.rs50
-rw-r--r--src/export/pdf/font.rs96
-rw-r--r--src/export/pdf/mod.rs11
-rw-r--r--src/export/pdf/page.rs11
-rw-r--r--src/ide/jump.rs13
-rw-r--r--tests/ref/text/copy-paste.pngbin0 -> 3196 bytes
-rw-r--r--tests/ref/text/shaping.pngbin1467 -> 2606 bytes
-rw-r--r--tests/src/tests.rs12
-rw-r--r--tests/typ/text/copy-paste.typ8
17 files changed, 221 insertions, 179 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 14a297fd..3fd1e3b8 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -117,6 +117,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
+name = "az"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7b7e4c2464d97fe331d41de9d5db0def0a96f4d823b8b32a2efd503578988973"
+
+[[package]]
name = "base64"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1385,9 +1391,9 @@ checksum = "8835116a5c179084a830efb3adc117ab007512b535bc1a21c991d3b32a6b44dd"
[[package]]
name = "pdf-writer"
-version = "0.7.0"
+version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "63f45f7c7538e67c58cb4977e4f97bbd75fbd3990d827d28d597ec746291f644"
+checksum = "30900f178ea696fc5d9637171f98aaa93d5aae54f0726726df68fc3e32810db6"
dependencies = [
"bitflags 1.3.2",
"itoa",
@@ -2306,6 +2312,7 @@ dependencies = [
"tracing",
"ttf-parser",
"typst-macros",
+ "unicode-general-category",
"unicode-math-class",
"unicode-segmentation",
"unicode-xid",
@@ -2366,6 +2373,7 @@ dependencies = [
name = "typst-library"
version = "0.3.0"
dependencies = [
+ "az",
"chinese-number",
"comemo",
"csv",
diff --git a/Cargo.toml b/Cargo.toml
index a0e51002..1c404061 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -33,7 +33,7 @@ indexmap = "1.9.3"
log = "0.4"
miniz_oxide = "0.7"
once_cell = "1"
-pdf-writer = "0.7"
+pdf-writer = "0.7.1"
pixglyph = "0.1"
regex = "1"
resvg = { version = "0.32", default-features = false }
@@ -46,6 +46,7 @@ svg2pdf = { git = "https://github.com/typst/svg2pdf" }
tiny-skia = "0.9.0"
tracing = "0.1.37"
ttf-parser = "0.18.1"
+unicode-general-category = "0.6"
unicode-math-class = "0.1"
unicode-segmentation = "1"
unicode-xid = "0.2"
diff --git a/assets/fonts/IBMPlexSansDevanagari-Regular.ttf b/assets/fonts/IBMPlexSansDevanagari-Regular.ttf
new file mode 100644
index 00000000..5d7c8f0f
--- /dev/null
+++ b/assets/fonts/IBMPlexSansDevanagari-Regular.ttf
Binary files differ
diff --git a/docs/src/reference/details.yml b/docs/src/reference/details.yml
index 1926fb77..22b67963 100644
--- a/docs/src/reference/details.yml
+++ b/docs/src/reference/details.yml
@@ -159,7 +159,7 @@ construct: |
data-loading: |
Data loading from external files.
- These functions help you with embedding data from experiments and APIs in your
+ These functions help you with embedding data from experiments in your
documents.
utility: |
diff --git a/library/Cargo.toml b/library/Cargo.toml
index 033058f3..499170cb 100644
--- a/library/Cargo.toml
+++ b/library/Cargo.toml
@@ -16,6 +16,7 @@ bench = false
[dependencies]
typst = { path = ".." }
+az = "1.2"
chinese-number = { version = "0.7.2", default-features = false, features = ["number-to-chinese"] }
comemo = "0.2.2"
csv = "1"
diff --git a/library/src/layout/par.rs b/library/src/layout/par.rs
index a6ad647b..0c3a9a3c 100644
--- a/library/src/layout/par.rs
+++ b/library/src/layout/par.rs
@@ -1139,8 +1139,7 @@ fn line<'a>(
// are no other items in the line.
if hyphen || start + shaped.text.len() > range.end {
if hyphen || start < range.end || before.is_empty() {
- let shifted = start - base..range.end - base;
- let mut reshaped = shaped.reshape(vt, &p.spans, shifted);
+ let mut reshaped = shaped.reshape(vt, &p.spans, start..range.end);
if hyphen || shy {
reshaped.push_hyphen(vt);
}
@@ -1162,8 +1161,7 @@ fn line<'a>(
// Reshape if necessary.
if range.start + shaped.text.len() > end {
if range.start < end {
- let shifted = range.start - base..end - base;
- let reshaped = shaped.reshape(vt, &p.spans, shifted);
+ let reshaped = shaped.reshape(vt, &p.spans, range.start..end);
width += reshaped.width;
first = Some(Item::Text(reshaped));
}
diff --git a/library/src/math/fragment.rs b/library/src/math/fragment.rs
index b0991630..40dca347 100644
--- a/library/src/math/fragment.rs
+++ b/library/src/math/fragment.rs
@@ -222,13 +222,13 @@ impl GlyphFragment {
size: self.font_size,
fill: self.fill,
lang: self.lang,
+ text: self.c.into(),
glyphs: vec![Glyph {
id: self.id.0,
- c: self.c,
x_advance: Em::from_length(self.width, self.font_size),
x_offset: Em::zero(),
- span: self.span,
- offset: 0,
+ range: 0..self.c.len_utf8() as u16,
+ span: (self.span, 0),
}],
};
let size = Size::new(self.width, self.ascent + self.descent);
diff --git a/library/src/text/shaping.rs b/library/src/text/shaping.rs
index 1e1ccc99..7d5703bc 100644
--- a/library/src/text/shaping.rs
+++ b/library/src/text/shaping.rs
@@ -1,6 +1,7 @@
use std::ops::Range;
use std::str::FromStr;
+use az::SaturatingAs;
use rustybuzz::{Feature, Tag, UnicodeBuffer};
use typst::font::{Font, FontVariant};
use typst::util::SliceExt;
@@ -47,20 +48,18 @@ pub struct ShapedGlyph {
pub x_offset: Em,
/// The vertical offset of the glyph.
pub y_offset: Em,
- /// The byte index in the source text where this glyph's cluster starts. A
- /// cluster is a sequence of one or multiple glyphs that cannot be
- /// separated and must always be treated as a union.
- pub cluster: usize,
+ /// The byte range of this glyph's cluster in the full paragraph. A cluster
+ /// is a sequence of one or multiple glyphs that cannot be separated and
+ /// must always be treated as a union.
+ pub range: Range<usize>,
/// Whether splitting the shaping result before this glyph would yield the
/// same results as shaping the parts to both sides of `text_index`
/// separately.
pub safe_to_break: bool,
/// The first char in this glyph's cluster.
pub c: char,
- /// The source code location of the text.
- pub span: Span,
- /// The offset within the spanned text.
- pub offset: u16,
+ /// The source code location of the glyph and its byte offset within it.
+ pub span: (Span, u16),
}
#[derive(Debug, Clone, Default)]
@@ -181,6 +180,12 @@ impl<'a> ShapedText<'a> {
for ((font, y_offset), group) in
self.glyphs.as_ref().group_by_key(|g| (g.font.clone(), g.y_offset))
{
+ let mut range = group[0].range.clone();
+ for glyph in group {
+ range.start = range.start.min(glyph.range.start);
+ range.end = range.end.max(glyph.range.end);
+ }
+
let pos = Point::new(offset, top + shift - y_offset.at(self.size));
let glyphs = group
.iter()
@@ -195,8 +200,8 @@ impl<'a> ShapedText<'a> {
} else {
glyph.stretchability().1
};
- let justification_left = adjustability_left * justification_ratio;
+ let justification_left = adjustability_left * justification_ratio;
let mut justification_right =
adjustability_right * justification_ratio;
if glyph.is_justifiable() {
@@ -206,15 +211,16 @@ impl<'a> ShapedText<'a> {
frame.size_mut().x += justification_left.at(self.size)
+ justification_right.at(self.size);
+
Glyph {
id: glyph.glyph_id,
x_advance: glyph.x_advance
+ justification_left
+ justification_right,
x_offset: glyph.x_offset + justification_left,
- c: glyph.c,
+ range: (glyph.range.start - range.start).saturating_as()
+ ..(glyph.range.end - range.start).saturating_as(),
span: glyph.span,
- offset: glyph.offset,
}
})
.collect();
@@ -224,6 +230,7 @@ impl<'a> ShapedText<'a> {
size: self.size,
lang,
fill: fill.clone(),
+ text: self.text[range.start - self.base..range.end - self.base].into(),
glyphs,
};
@@ -318,16 +325,19 @@ impl<'a> ShapedText<'a> {
/// Reshape a range of the shaped text, reusing information from this
/// shaping process if possible.
+ ///
+ /// The `text_range` is relative to the whole paragraph.
pub fn reshape(
&'a self,
vt: &Vt,
spans: &SpanMapper,
text_range: Range<usize>,
) -> ShapedText<'a> {
+ let text = &self.text[text_range.start - self.base..text_range.end - self.base];
if let Some(glyphs) = self.slice_safe_to_break(text_range.clone()) {
Self {
- base: self.base + text_range.start,
- text: &self.text[text_range],
+ base: text_range.start,
+ text,
dir: self.dir,
styles: self.styles,
size: self.size,
@@ -336,14 +346,7 @@ impl<'a> ShapedText<'a> {
glyphs: Cow::Borrowed(glyphs),
}
} else {
- shape(
- vt,
- self.base + text_range.start,
- &self.text[text_range],
- spans,
- self.styles,
- self.dir,
- )
+ shape(vt, text_range.start, text, spans, self.styles, self.dir)
}
}
@@ -358,7 +361,11 @@ impl<'a> ShapedText<'a> {
let ttf = font.ttf();
let glyph_id = ttf.glyph_index('-')?;
let x_advance = font.to_em(ttf.glyph_hor_advance(glyph_id)?);
- let cluster = self.glyphs.last().map(|g| g.cluster).unwrap_or_default();
+ let range = self
+ .glyphs
+ .last()
+ .map(|g| g.range.end..g.range.end)
+ .unwrap_or_default();
self.width += x_advance.at(self.size);
self.glyphs.to_mut().push(ShapedGlyph {
font,
@@ -366,11 +373,10 @@ impl<'a> ShapedText<'a> {
x_advance,
x_offset: Em::zero(),
y_offset: Em::zero(),
- cluster,
+ range,
safe_to_break: true,
c: '-',
- span: Span::detached(),
- offset: 0,
+ span: (Span::detached(), 0),
});
Some(())
});
@@ -396,9 +402,9 @@ impl<'a> ShapedText<'a> {
// Handle edge cases.
let len = self.glyphs.len();
- if text_index == 0 {
+ if text_index == self.base {
return Some(if ltr { 0 } else { len });
- } else if text_index == self.text.len() {
+ } else if text_index == self.base + self.text.len() {
return Some(if ltr { len } else { 0 });
}
@@ -406,7 +412,7 @@ impl<'a> ShapedText<'a> {
let mut idx = self
.glyphs
.binary_search_by(|g| {
- let ordering = g.cluster.cmp(&text_index);
+ let ordering = g.range.start.cmp(&text_index);
if ltr {
ordering
} else {
@@ -422,7 +428,7 @@ impl<'a> ShapedText<'a> {
// Search for the outermost glyph with the text index.
while let Some(next) = next(idx, 1) {
- if self.glyphs.get(next).map_or(true, |g| g.cluster != text_index) {
+ if self.glyphs.get(next).map_or(true, |g| g.range.start != text_index) {
break;
}
idx = next;
@@ -444,7 +450,6 @@ impl Debug for ShapedText<'_> {
/// Holds shaping results and metadata common to all shaped segments.
struct ShapingContext<'a> {
vt: &'a Vt<'a>,
- base: usize,
spans: &'a SpanMapper,
glyphs: Vec<ShapedGlyph>,
used: Vec<Font>,
@@ -468,7 +473,6 @@ pub fn shape<'a>(
let size = TextElem::size_in(styles);
let mut ctx = ShapingContext {
vt,
- base,
spans,
size,
glyphs: vec![],
@@ -481,7 +485,7 @@ pub fn shape<'a>(
};
if !text.is_empty() {
- shape_segment(&mut ctx, 0, text, families(styles));
+ shape_segment(&mut ctx, base, text, families(styles));
}
track_and_space(&mut ctx);
@@ -552,6 +556,7 @@ fn shape_segment(
let buffer = rustybuzz::shape(font.rusty(), &ctx.tags, buffer);
let infos = buffer.glyph_infos();
let pos = buffer.glyph_positions();
+ let ltr = ctx.dir.is_positive();
// Collect the shaped glyphs, doing fallback and shaping parts again with
// the next font if necessary.
@@ -560,68 +565,66 @@ fn shape_segment(
let info = &infos[i];
let cluster = info.cluster as usize;
+ // Add the glyph to the shaped output.
if info.glyph_id != 0 {
- // Add the glyph to the shaped output.
- // TODO: Don't ignore y_advance.
- let (span, offset) = ctx.spans.span_at(ctx.base + cluster);
+ // Determine the text range of the glyph.
+ let start = base + cluster;
+ let end = base
+ + if ltr { i.checked_add(1) } else { i.checked_sub(1) }
+ .and_then(|last| infos.get(last))
+ .map_or(text.len(), |info| info.cluster as usize);
+
ctx.glyphs.push(ShapedGlyph {
font: font.clone(),
glyph_id: info.glyph_id as u16,
+ // TODO: Don't ignore y_advance.
x_advance: font.to_em(pos[i].x_advance),
x_offset: font.to_em(pos[i].x_offset),
y_offset: font.to_em(pos[i].y_offset),
- cluster: base + cluster,
+ range: start..end,
safe_to_break: !info.unsafe_to_break(),
c: text[cluster..].chars().next().unwrap(),
- span,
- offset,
+ span: ctx.spans.span_at(start),
});
} else {
- // Determine the source text range for the tofu sequence.
- let range = {
- // First, search for the end of the tofu sequence.
- let k = i;
- while infos.get(i + 1).map_or(false, |info| info.glyph_id == 0) {
- i += 1;
- }
-
- // Then, determine the start and end text index.
- //
- // Examples:
- // Everything is shown in visual order. Tofus are written as "_".
- // We want to find out that the tofus span the text `2..6`.
- // Note that the clusters are longer than 1 char.
- //
- // Left-to-right:
- // Text: h a l i h a l l o
- // Glyphs: A _ _ C E
- // Clusters: 0 2 4 6 8
- // k=1 i=2
- //
- // Right-to-left:
- // Text: O L L A H I L A H
- // Glyphs: E C _ _ A
- // Clusters: 8 6 4 2 0
- // k=2 i=3
- let ltr = ctx.dir.is_positive();
- let first = if ltr { k } else { i };
- let start = infos[first].cluster as usize;
- let last = if ltr { i.checked_add(1) } else { k.checked_sub(1) };
- let end = last
- .and_then(|last| infos.get(last))
- .map_or(text.len(), |info| info.cluster as usize);
+ // First, search for the end of the tofu sequence.
+ let k = i;
+ while infos.get(i + 1).map_or(false, |info| info.glyph_id == 0) {
+ i += 1;
+ }
- start..end
- };
+ // Then, determine the start and end text index for the tofu
+ // sequence.
+ //
+ // Examples:
+ // Everything is shown in visual order. Tofus are written as "_".
+ // We want to find out that the tofus span the text `2..6`.
+ // Note that the clusters are longer than 1 char.
+ //
+ // Left-to-right:
+ // Text: h a l i h a l l o
+ // Glyphs: A _ _ C E
+ // Clusters: 0 2 4 6 8
+ // k=1 i=2
+ //
+ // Right-to-left:
+ // Text: O L L A H I L A H
+ // Glyphs: E C _ _ A
+ // Clusters: 8 6 4 2 0
+ // k=2 i=3
+ let start = infos[if ltr { k } else { i }].cluster as usize;
+ let end = if ltr { i.checked_add(1) } else { k.checked_sub(1) }
+ .and_then(|last| infos.get(last))
+ .map_or(text.len(), |info| info.cluster as usize);
// Trim half-baked cluster.
- let remove = base + range.start..base + range.end;
- while ctx.glyphs.last().map_or(false, |g| remove.contains(&g.cluster)) {
+ let remove = base + start..base + end;
+ while ctx.glyphs.last().map_or(false, |g| remove.contains(&g.range.start)) {
ctx.glyphs.pop();
}
// Recursively shape the tofu sequence with the next family.
- shape_segment(ctx, base + range.start, &text[range], families.clone());
+ shape_segment(ctx, base + start, &text[start..end], families.clone());
}
i += 1;
@@ -634,19 +637,18 @@ fn shape_segment(
fn shape_tofus(ctx: &mut ShapingContext, base: usize, text: &str, font: Font) {
let x_advance = font.advance(0).unwrap_or_default();
for (cluster, c) in text.char_indices() {
- let cluster = base + cluster;
- let (span, offset) = ctx.spans.span_at(ctx.base + cluster);
+ let start = base + cluster;
+ let end = start + c.len_utf8();
ctx.glyphs.push(ShapedGlyph {
font: font.clone(),
glyph_id: 0,
x_advance,
x_offset: Em::zero(),
y_offset: Em::zero(),
- cluster,
+ range: start..end,
safe_to_break: true,
c,
- span,
- offset,
+ span: ctx.spans.span_at(start),
});
}
}
@@ -668,7 +670,10 @@ fn track_and_space(ctx: &mut ShapingContext) {
glyph.x_advance = spacing.relative_to(glyph.x_advance);
}
- if glyphs.peek().map_or(false, |next| glyph.cluster != next.cluster) {
+ if glyphs
+ .peek()
+ .map_or(false, |next| glyph.range.start != next.range.start)
+ {
glyph.x_advance += tracking;
}
}
diff --git a/src/doc.rs b/src/doc.rs
index 0a744ffc..0a9b9a8d 100644
--- a/src/doc.rs
+++ b/src/doc.rs
@@ -1,7 +1,8 @@
//! Finished documents.
-use std::fmt::{self, Debug, Formatter, Write};
+use std::fmt::{self, Debug, Formatter};
use std::num::NonZeroUsize;
+use std::ops::Range;
use std::str::FromStr;
use std::sync::Arc;
@@ -114,23 +115,6 @@ impl Frame {
pub fn items(&self) -> std::slice::Iter<'_, (Point, FrameItem)> {
self.items.iter()
}
-
- /// Approximately recover the text inside of the frame and its children.
- pub fn text(&self) -> EcoString {
- let mut text = EcoString::new();
- for (_, item) in self.items() {
- match item {
- FrameItem::Text(item) => {
- for glyph in &item.glyphs {
- text.push(glyph.c);
- }
- }
- FrameItem::Group(group) => text.push_str(&group.frame.text()),
- _ => {}
- }
- }
- text
- }
}
/// Insert items and subframes.
@@ -476,6 +460,8 @@ pub struct TextItem {
pub fill: Paint,
/// The natural language of the text.
pub lang: Lang,
+ /// The item's plain text.
+ pub text: EcoString,
/// The glyphs.
pub glyphs: Vec<Glyph>,
}
@@ -489,19 +475,14 @@ impl TextItem {
impl Debug for TextItem {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
- // This is only a rough approximation of the source text.
- f.write_str("Text(\"")?;
- for glyph in &self.glyphs {
- for c in glyph.c.escape_debug() {
- f.write_char(c)?;
- }
- }
- f.write_str("\")")
+ f.write_str("Text(")?;
+ self.text.fmt(f)?;
+ f.write_str(")")
}
}
/// A glyph in a run of shaped text.
-#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
+#[derive(Debug, Clone, Eq, PartialEq, Hash)]
pub struct Glyph {
/// The glyph's index in the font.
pub id: u16,
@@ -509,12 +490,17 @@ pub struct Glyph {
pub x_advance: Em,
/// The horizontal offset of the glyph.
pub x_offset: Em,
- /// The first character of the glyph's cluster.
- pub c: char,
+ /// The range of the glyph in its item's text.
+ pub range: Range<u16>,
/// The source code location of the text.
- pub span: Span,
- /// The offset within the spanned text.
- pub offset: u16,
+ pub span: (Span, u16),
+}
+
+impl Glyph {
+ /// The range of the glyph in its item's text.
+ pub fn range(&self) -> Range<usize> {
+ usize::from(self.range.start)..usize::from(self.range.end)
+ }
}
/// An identifier for a natural language.
diff --git a/src/export/pdf/font.rs b/src/export/pdf/font.rs
index de79976a..1e2f9c93 100644
--- a/src/export/pdf/font.rs
+++ b/src/export/pdf/font.rs
@@ -1,13 +1,21 @@
use std::collections::BTreeMap;
-use ecow::eco_format;
+use ecow::{eco_format, EcoString};
use pdf_writer::types::{CidFontType, FontFlags, SystemInfo, UnicodeCmap};
use pdf_writer::{Filter, Finish, Name, Rect, Str};
use ttf_parser::{name_id, GlyphId, Tag};
+use unicode_general_category::GeneralCategory;
use super::{deflate, EmExt, PdfContext, RefExt};
use crate::util::SliceExt;
+const CMAP_NAME: Name = Name(b"Custom");
+const SYSTEM_INFO: SystemInfo = SystemInfo {
+ registry: Str(b"Adobe"),
+ ordering: Str(b"Identity"),
+ supplement: 0,
+};
+
/// Embed all used fonts into the PDF.
#[tracing::instrument(skip_all)]
pub fn write_fonts(ctx: &mut PdfContext) {
@@ -19,7 +27,7 @@ pub fn write_fonts(ctx: &mut PdfContext) {
let data_ref = ctx.alloc.bump();
ctx.font_refs.push(type0_ref);
- let glyphs = &ctx.glyph_sets[font];
+ let glyph_set = ctx.glyph_sets.get_mut(font).unwrap();
let metrics = font.metrics();
let ttf = font.ttf();
@@ -29,12 +37,6 @@ pub fn write_fonts(ctx: &mut PdfContext) {
let base_font = eco_format!("ABCDEF+{}", postscript_name);
let base_font = Name(base_font.as_bytes());
- let cmap_name = Name(b"Custom");
- let system_info = SystemInfo {
- registry: Str(b"Adobe"),
- ordering: Str(b"Identity"),
- supplement: 0,
- };
// Write the base font object referencing the CID font.
ctx.writer
@@ -59,7 +61,7 @@ pub fn write_fonts(ctx: &mut PdfContext) {
let mut cid = ctx.writer.cid_font(cid_ref);
cid.subtype(subtype);
cid.base_font(base_font);
- cid.system_info(system_info);
+ cid.system_info(SYSTEM_INFO);
cid.font_descriptor(descriptor_ref);
cid.default_width(0.0);
@@ -70,7 +72,7 @@ pub fn write_fonts(ctx: &mut PdfContext) {
// Extract the widths of all glyphs.
let num_glyphs = ttf.number_of_glyphs();
let mut widths = vec![0.0; num_glyphs as usize];
- for &g in glyphs {
+ for &g in glyph_set.keys() {
let x = ttf.glyph_hor_advance(GlyphId(g)).unwrap_or(0);
widths[g as usize] = font.to_em(x).to_font_units();
}
@@ -130,42 +132,15 @@ pub fn write_fonts(ctx: &mut PdfContext) {
font_descriptor.finish();
- // Compute a reverse mapping from glyphs to unicode.
- let cmap = {
- let mut mapping = BTreeMap::new();
- for subtable in
- ttf.tables().cmap.into_iter().flat_map(|table| table.subtables)
- {
- if subtable.is_unicode() {
- subtable.codepoints(|n| {
- if let Some(c) = std::char::from_u32(n) {
- if let Some(GlyphId(g)) = ttf.glyph_index(c) {
- if glyphs.contains(&g) {
- mapping.insert(g, c);
- }
- }
- }
- });
- }
- }
-
- let mut cmap = UnicodeCmap::new(cmap_name, system_info);
- for (g, c) in mapping {
- cmap.pair(g, c);
- }
- cmap
- };
-
// Write the /ToUnicode character map, which maps glyph ids back to
// unicode codepoints to enable copying out of the PDF.
- ctx.writer
- .cmap(cmap_ref, &deflate(&cmap.finish()))
- .filter(Filter::FlateDecode);
+ let cmap = create_cmap(ttf, glyph_set);
+ ctx.writer.cmap(cmap_ref, &cmap.finish());
// Subset and write the font's bytes.
let data = font.data();
let subsetted = {
- let glyphs: Vec<_> = glyphs.iter().copied().collect();
+ let glyphs: Vec<_> = glyph_set.keys().copied().collect();
let profile = subsetter::Profile::pdf(&glyphs);
subsetter::subset(data, font.index(), profile)
};
@@ -183,3 +158,44 @@ pub fn write_fonts(ctx: &mut PdfContext) {
stream.finish();
}
}
+
+/// Create a /ToUnicode CMap.
+fn create_cmap(
+ ttf: &ttf_parser::Face,
+ glyph_set: &mut BTreeMap<u16, EcoString>,
+) -> UnicodeCmap {
+ // For glyphs that have codepoints mapping to them in the font's cmap table,
+ // we prefer those codepoints over pre-existing text mappings from the
+ // document. Only glyphs that don't have a corresponding codepoint (or only a
+ // private-use one), like the "Th" in Linux Libertine, get the text of their
+ // first occurrence in the document instead.
+ for subtable in ttf.tables().cmap.into_iter().flat_map(|table| table.subtables) {
+ if !subtable.is_unicode() {
+ continue;
+ }
+
+ subtable.codepoints(|n| {
+ let Some(c) = std::char::from_u32(n) else { return };
+ if unicode_general_category::get_general_category(c)
+ == GeneralCategory::PrivateUse
+ {
+ return;
+ }
+
+ let Some(GlyphId(g)) = ttf.glyph_index(c) else { return };
+ if glyph_set.contains_key(&g) {
+ glyph_set.insert(g, c.into());
+ }
+ });
+ }
+
+ // Produce a reverse mapping from glyphs to unicode strings.
+ let mut cmap = UnicodeCmap::new(CMAP_NAME, SYSTEM_INFO);
+ for (&g, text) in glyph_set.iter() {
+ if !text.is_empty() {
+ cmap.pair_with_multiple(g, text.chars());
+ }
+ }
+
+ cmap
+}
diff --git a/src/export/pdf/mod.rs b/src/export/pdf/mod.rs
index ffbf67a3..48485862 100644
--- a/src/export/pdf/mod.rs
+++ b/src/export/pdf/mod.rs
@@ -6,9 +6,10 @@ mod outline;
mod page;
use std::cmp::Eq;
-use std::collections::{HashMap, HashSet};
+use std::collections::{BTreeMap, HashMap};
use std::hash::Hash;
+use ecow::EcoString;
use pdf_writer::types::Direction;
use pdf_writer::{Finish, Name, PdfWriter, Ref, TextStr};
use xmp_writer::{LangId, RenditionClass, XmpWriter};
@@ -52,7 +53,13 @@ pub struct PdfContext<'a> {
page_refs: Vec<Ref>,
font_map: Remapper<Font>,
image_map: Remapper<Image>,
- glyph_sets: HashMap<Font, HashSet<u16>>,
+ /// For each font a mapping from used glyphs to their text representation.
+ /// May contain multiple chars in case of ligatures or similar things. The
+ /// same glyph can have a different text representation within one document,
+ /// then we just save the first one. The resulting strings are used for the
+ /// PDF's /ToUnicode map for glyphs that don't have an entry in the font's
+ /// cmap. This is important for copy-paste and searching.
+ glyph_sets: HashMap<Font, BTreeMap<u16, EcoString>>,
languages: HashMap<Lang, usize>,
}
diff --git a/src/export/pdf/page.rs b/src/export/pdf/page.rs
index 35a4f5dc..22e590d5 100644
--- a/src/export/pdf/page.rs
+++ b/src/export/pdf/page.rs
@@ -364,11 +364,12 @@ fn write_group(ctx: &mut PageContext, pos: Point, group: &GroupItem) {
/// Encode a text run into the content stream.
fn write_text(ctx: &mut PageContext, x: f32, y: f32, text: &TextItem) {
*ctx.parent.languages.entry(text.lang).or_insert(0) += text.glyphs.len();
- ctx.parent
- .glyph_sets
- .entry(text.font.clone())
- .or_default()
- .extend(text.glyphs.iter().map(|g| g.id));
+
+ let glyph_set = ctx.parent.glyph_sets.entry(text.font.clone()).or_default();
+ for g in &text.glyphs {
+ let segment = &text.text[g.range()];
+ glyph_set.entry(g.id).or_insert_with(|| segment.into());
+ }
ctx.set_fill(&text.fill);
ctx.set_font(&text.font, text.size);
diff --git a/src/ide/jump.rs b/src/ide/jump.rs
index fc98747c..42ed2ab5 100644
--- a/src/ide/jump.rs
+++ b/src/ide/jump.rs
@@ -67,7 +67,8 @@ pub fn jump_from_click(
FrameItem::Text(text) => {
for glyph in &text.glyphs {
- if glyph.span.is_detached() {
+ let (span, span_offset) = glyph.span;
+ if span.is_detached() {
continue;
}
@@ -77,13 +78,13 @@ pub fn jump_from_click(
Size::new(width, text.size),
click,
) {
- let source = world.source(glyph.span.source());
- let node = source.find(glyph.span)?;
+ let source = world.source(span.source());
+ let node = source.find(span)?;
let pos = if node.kind() == SyntaxKind::Text {
let range = node.range();
- let mut offset = range.start + usize::from(glyph.offset);
+ let mut offset = range.start + usize::from(span_offset);
if (click.x - pos.x) > width / 2.0 {
- offset += glyph.c.len_utf8();
+ offset += glyph.range().len();
}
offset.min(range.end)
} else {
@@ -150,7 +151,7 @@ fn find_in_frame(frame: &Frame, span: Span) -> Option<Point> {
if let FrameItem::Text(text) = item {
for glyph in &text.glyphs {
- if glyph.span == span {
+ if glyph.span.0 == span {
return Some(pos);
}
pos.x += glyph.x_advance.at(text.size);
diff --git a/tests/ref/text/copy-paste.png b/tests/ref/text/copy-paste.png
new file mode 100644
index 00000000..cbbad940
--- /dev/null
+++ b/tests/ref/text/copy-paste.png
Binary files differ
diff --git a/tests/ref/text/shaping.png b/tests/ref/text/shaping.png
index 7b33074f..278fe8ee 100644
--- a/tests/ref/text/shaping.png
+++ b/tests/ref/text/shaping.png
Binary files differ
diff --git a/tests/src/tests.rs b/tests/src/tests.rs
index 0e22084c..2a0b74ea 100644
--- a/tests/src/tests.rs
+++ b/tests/src/tests.rs
@@ -353,9 +353,18 @@ fn test(
pdf_path: Option<&Path>,
args: &Args,
) -> bool {
- let name = src_path.strip_prefix(TYP_DIR).unwrap_or(src_path);
+ struct PanicGuard<'a>(&'a Path);
+ impl Drop for PanicGuard<'_> {
+ fn drop(&mut self) {
+ if std::thread::panicking() {
+ println!("Panicked in {}", self.0.display());
+ }
+ }
+ }
+ let name = src_path.strip_prefix(TYP_DIR).unwrap_or(src_path);
let text = fs::read_to_string(src_path).unwrap();
+ let _guard = PanicGuard(name);
let mut output = String::new();
let mut ok = true;
@@ -401,6 +410,7 @@ fn test(
line,
&mut rng,
);
+
ok &= part_ok;
compare_ever |= compare_here;
frames.extend(part_frames);
diff --git a/tests/typ/text/copy-paste.typ b/tests/typ/text/copy-paste.typ
new file mode 100644
index 00000000..5d826482
--- /dev/null
+++ b/tests/typ/text/copy-paste.typ
@@ -0,0 +1,8 @@
+// Test copy-paste and search in PDF with ligatures
+// and Arabic text. Must be tested manually!
+
+---
+The after fira 🏳️‍🌈!
+
+#set text(lang: "ar", font: "Noto Sans Arabic")
+مرحبًا