summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorLaurenz <laurmaedje@gmail.com>2023-05-03 10:33:18 +0200
committerLaurenz <laurmaedje@gmail.com>2023-05-03 10:33:18 +0200
commitad347632ab95e29eb5180b27142f5c264dfc611a (patch)
tree2742a33f4c3d800a86e977de04fa2cec7104c43f /src
parentbcc014c4e177cc4e8cf5ca8c24990908b507c0f8 (diff)
Make ligatures copyable and searchable
Fixes #479 Fixes #1040
Diffstat (limited to 'src')
-rw-r--r--src/doc.rs50
-rw-r--r--src/export/pdf/font.rs96
-rw-r--r--src/export/pdf/mod.rs11
-rw-r--r--src/export/pdf/page.rs11
-rw-r--r--src/ide/jump.rs13
5 files changed, 96 insertions, 85 deletions
diff --git a/src/doc.rs b/src/doc.rs
index 0a744ffc..0a9b9a8d 100644
--- a/src/doc.rs
+++ b/src/doc.rs
@@ -1,7 +1,8 @@
//! Finished documents.
-use std::fmt::{self, Debug, Formatter, Write};
+use std::fmt::{self, Debug, Formatter};
use std::num::NonZeroUsize;
+use std::ops::Range;
use std::str::FromStr;
use std::sync::Arc;
@@ -114,23 +115,6 @@ impl Frame {
pub fn items(&self) -> std::slice::Iter<'_, (Point, FrameItem)> {
self.items.iter()
}
-
- /// Approximately recover the text inside of the frame and its children.
- pub fn text(&self) -> EcoString {
- let mut text = EcoString::new();
- for (_, item) in self.items() {
- match item {
- FrameItem::Text(item) => {
- for glyph in &item.glyphs {
- text.push(glyph.c);
- }
- }
- FrameItem::Group(group) => text.push_str(&group.frame.text()),
- _ => {}
- }
- }
- text
- }
}
/// Insert items and subframes.
@@ -476,6 +460,8 @@ pub struct TextItem {
pub fill: Paint,
/// The natural language of the text.
pub lang: Lang,
+ /// The item's plain text.
+ pub text: EcoString,
/// The glyphs.
pub glyphs: Vec<Glyph>,
}
@@ -489,19 +475,14 @@ impl TextItem {
impl Debug for TextItem {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
- // This is only a rough approximation of the source text.
- f.write_str("Text(\"")?;
- for glyph in &self.glyphs {
- for c in glyph.c.escape_debug() {
- f.write_char(c)?;
- }
- }
- f.write_str("\")")
+ f.write_str("Text(")?;
+ self.text.fmt(f)?;
+ f.write_str(")")
}
}
/// A glyph in a run of shaped text.
-#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
+#[derive(Debug, Clone, Eq, PartialEq, Hash)]
pub struct Glyph {
/// The glyph's index in the font.
pub id: u16,
@@ -509,12 +490,17 @@ pub struct Glyph {
pub x_advance: Em,
/// The horizontal offset of the glyph.
pub x_offset: Em,
- /// The first character of the glyph's cluster.
- pub c: char,
+ /// The range of the glyph in its item's text.
+ pub range: Range<u16>,
/// The source code location of the text.
- pub span: Span,
- /// The offset within the spanned text.
- pub offset: u16,
+ pub span: (Span, u16),
+}
+
+impl Glyph {
+ /// The range of the glyph in its item's text, as `usize` for slicing.
+ pub fn range(&self) -> Range<usize> {
+ usize::from(self.range.start)..usize::from(self.range.end)
+ }
}
/// An identifier for a natural language.
diff --git a/src/export/pdf/font.rs b/src/export/pdf/font.rs
index de79976a..1e2f9c93 100644
--- a/src/export/pdf/font.rs
+++ b/src/export/pdf/font.rs
@@ -1,13 +1,21 @@
use std::collections::BTreeMap;
-use ecow::eco_format;
+use ecow::{eco_format, EcoString};
use pdf_writer::types::{CidFontType, FontFlags, SystemInfo, UnicodeCmap};
use pdf_writer::{Filter, Finish, Name, Rect, Str};
use ttf_parser::{name_id, GlyphId, Tag};
+use unicode_general_category::GeneralCategory;
use super::{deflate, EmExt, PdfContext, RefExt};
use crate::util::SliceExt;
+const CMAP_NAME: Name = Name(b"Custom");
+const SYSTEM_INFO: SystemInfo = SystemInfo {
+ registry: Str(b"Adobe"),
+ ordering: Str(b"Identity"),
+ supplement: 0,
+};
+
/// Embed all used fonts into the PDF.
#[tracing::instrument(skip_all)]
pub fn write_fonts(ctx: &mut PdfContext) {
@@ -19,7 +27,7 @@ pub fn write_fonts(ctx: &mut PdfContext) {
let data_ref = ctx.alloc.bump();
ctx.font_refs.push(type0_ref);
- let glyphs = &ctx.glyph_sets[font];
+ let glyph_set = ctx.glyph_sets.get_mut(font).unwrap();
let metrics = font.metrics();
let ttf = font.ttf();
@@ -29,12 +37,6 @@ pub fn write_fonts(ctx: &mut PdfContext) {
let base_font = eco_format!("ABCDEF+{}", postscript_name);
let base_font = Name(base_font.as_bytes());
- let cmap_name = Name(b"Custom");
- let system_info = SystemInfo {
- registry: Str(b"Adobe"),
- ordering: Str(b"Identity"),
- supplement: 0,
- };
// Write the base font object referencing the CID font.
ctx.writer
@@ -59,7 +61,7 @@ pub fn write_fonts(ctx: &mut PdfContext) {
let mut cid = ctx.writer.cid_font(cid_ref);
cid.subtype(subtype);
cid.base_font(base_font);
- cid.system_info(system_info);
+ cid.system_info(SYSTEM_INFO);
cid.font_descriptor(descriptor_ref);
cid.default_width(0.0);
@@ -70,7 +72,7 @@ pub fn write_fonts(ctx: &mut PdfContext) {
// Extract the widths of all glyphs.
let num_glyphs = ttf.number_of_glyphs();
let mut widths = vec![0.0; num_glyphs as usize];
- for &g in glyphs {
+ for &g in glyph_set.keys() {
let x = ttf.glyph_hor_advance(GlyphId(g)).unwrap_or(0);
widths[g as usize] = font.to_em(x).to_font_units();
}
@@ -130,42 +132,15 @@ pub fn write_fonts(ctx: &mut PdfContext) {
font_descriptor.finish();
- // Compute a reverse mapping from glyphs to unicode.
- let cmap = {
- let mut mapping = BTreeMap::new();
- for subtable in
- ttf.tables().cmap.into_iter().flat_map(|table| table.subtables)
- {
- if subtable.is_unicode() {
- subtable.codepoints(|n| {
- if let Some(c) = std::char::from_u32(n) {
- if let Some(GlyphId(g)) = ttf.glyph_index(c) {
- if glyphs.contains(&g) {
- mapping.insert(g, c);
- }
- }
- }
- });
- }
- }
-
- let mut cmap = UnicodeCmap::new(cmap_name, system_info);
- for (g, c) in mapping {
- cmap.pair(g, c);
- }
- cmap
- };
-
// Write the /ToUnicode character map, which maps glyph ids back to
// unicode codepoints to enable copying out of the PDF.
- ctx.writer
- .cmap(cmap_ref, &deflate(&cmap.finish()))
- .filter(Filter::FlateDecode);
+ let cmap = create_cmap(ttf, glyph_set);
+ ctx.writer.cmap(cmap_ref, &cmap.finish());
// Subset and write the font's bytes.
let data = font.data();
let subsetted = {
- let glyphs: Vec<_> = glyphs.iter().copied().collect();
+ let glyphs: Vec<_> = glyph_set.keys().copied().collect();
let profile = subsetter::Profile::pdf(&glyphs);
subsetter::subset(data, font.index(), profile)
};
@@ -183,3 +158,44 @@ pub fn write_fonts(ctx: &mut PdfContext) {
stream.finish();
}
}
+
+/// Create a /ToUnicode CMap.
+fn create_cmap(
+ ttf: &ttf_parser::Face,
+ glyph_set: &mut BTreeMap<u16, EcoString>,
+) -> UnicodeCmap {
+ // For glyphs that have codepoints mapping to them in the font's cmap table,
+ // we prefer those codepoints over pre-existing text mappings from the
+ // document. Only things that don't have a corresponding codepoint (or only
+ // a private-use one) like the "Th" in Linux Libertine get the text of their
+ // first occurrences in the document instead.
+ for subtable in ttf.tables().cmap.into_iter().flat_map(|table| table.subtables) {
+ if !subtable.is_unicode() {
+ continue;
+ }
+
+ subtable.codepoints(|n| {
+ let Some(c) = std::char::from_u32(n) else { return };
+ if unicode_general_category::get_general_category(c)
+ == GeneralCategory::PrivateUse
+ {
+ return;
+ }
+
+ let Some(GlyphId(g)) = ttf.glyph_index(c) else { return };
+ if glyph_set.contains_key(&g) {
+ glyph_set.insert(g, c.into());
+ }
+ });
+ }
+
+ // Produce a reverse mapping from glyphs to unicode strings.
+ let mut cmap = UnicodeCmap::new(CMAP_NAME, SYSTEM_INFO);
+ for (&g, text) in glyph_set.iter() {
+ if !text.is_empty() {
+ cmap.pair_with_multiple(g, text.chars());
+ }
+ }
+
+ cmap
+}
diff --git a/src/export/pdf/mod.rs b/src/export/pdf/mod.rs
index ffbf67a3..48485862 100644
--- a/src/export/pdf/mod.rs
+++ b/src/export/pdf/mod.rs
@@ -6,9 +6,10 @@ mod outline;
mod page;
use std::cmp::Eq;
-use std::collections::{HashMap, HashSet};
+use std::collections::{BTreeMap, HashMap};
use std::hash::Hash;
+use ecow::EcoString;
use pdf_writer::types::Direction;
use pdf_writer::{Finish, Name, PdfWriter, Ref, TextStr};
use xmp_writer::{LangId, RenditionClass, XmpWriter};
@@ -52,7 +53,13 @@ pub struct PdfContext<'a> {
page_refs: Vec<Ref>,
font_map: Remapper<Font>,
image_map: Remapper<Image>,
- glyph_sets: HashMap<Font, HashSet<u16>>,
+ /// For each font, a mapping from used glyphs to their text representation.
+ /// May contain multiple chars in case of ligatures or similar things. The
+ /// same glyph can have different text representations within one document;
+ /// in that case we just save the first one. The resulting strings are used
+ /// for the PDF's /ToUnicode map for glyphs that don't have an entry in the
+ /// font's cmap. This is important for copy-paste and searching.
+ glyph_sets: HashMap<Font, BTreeMap<u16, EcoString>>,
languages: HashMap<Lang, usize>,
}
diff --git a/src/export/pdf/page.rs b/src/export/pdf/page.rs
index 35a4f5dc..22e590d5 100644
--- a/src/export/pdf/page.rs
+++ b/src/export/pdf/page.rs
@@ -364,11 +364,12 @@ fn write_group(ctx: &mut PageContext, pos: Point, group: &GroupItem) {
/// Encode a text run into the content stream.
fn write_text(ctx: &mut PageContext, x: f32, y: f32, text: &TextItem) {
*ctx.parent.languages.entry(text.lang).or_insert(0) += text.glyphs.len();
- ctx.parent
- .glyph_sets
- .entry(text.font.clone())
- .or_default()
- .extend(text.glyphs.iter().map(|g| g.id));
+
+ let glyph_set = ctx.parent.glyph_sets.entry(text.font.clone()).or_default();
+ for g in &text.glyphs {
+ let segment = &text.text[g.range()];
+ glyph_set.entry(g.id).or_insert_with(|| segment.into());
+ }
ctx.set_fill(&text.fill);
ctx.set_font(&text.font, text.size);
diff --git a/src/ide/jump.rs b/src/ide/jump.rs
index fc98747c..42ed2ab5 100644
--- a/src/ide/jump.rs
+++ b/src/ide/jump.rs
@@ -67,7 +67,8 @@ pub fn jump_from_click(
FrameItem::Text(text) => {
for glyph in &text.glyphs {
- if glyph.span.is_detached() {
+ let (span, span_offset) = glyph.span;
+ if span.is_detached() {
continue;
}
@@ -77,13 +78,13 @@ pub fn jump_from_click(
Size::new(width, text.size),
click,
) {
- let source = world.source(glyph.span.source());
- let node = source.find(glyph.span)?;
+ let source = world.source(span.source());
+ let node = source.find(span)?;
let pos = if node.kind() == SyntaxKind::Text {
let range = node.range();
- let mut offset = range.start + usize::from(glyph.offset);
+ let mut offset = range.start + usize::from(span_offset);
if (click.x - pos.x) > width / 2.0 {
- offset += glyph.c.len_utf8();
+ offset += glyph.range().len();
}
offset.min(range.end)
} else {
@@ -150,7 +151,7 @@ fn find_in_frame(frame: &Frame, span: Span) -> Option<Point> {
if let FrameItem::Text(text) = item {
for glyph in &text.glyphs {
- if glyph.span == span {
+ if glyph.span.0 == span {
return Some(pos);
}
pos.x += glyph.x_advance.at(text.size);