summaryrefslogtreecommitdiff
path: root/src/export
diff options
context:
space:
mode:
authorLaurenz <laurmaedje@gmail.com>2023-05-03 10:33:18 +0200
committerLaurenz <laurmaedje@gmail.com>2023-05-03 10:33:18 +0200
commitad347632ab95e29eb5180b27142f5c264dfc611a (patch)
tree2742a33f4c3d800a86e977de04fa2cec7104c43f /src/export
parentbcc014c4e177cc4e8cf5ca8c24990908b507c0f8 (diff)
Make ligatures copyable and searchable
Fixes #479 Fixes #1040
Diffstat (limited to 'src/export')
-rw-r--r--src/export/pdf/font.rs96
-rw-r--r--src/export/pdf/mod.rs11
-rw-r--r--src/export/pdf/page.rs11
3 files changed, 71 insertions, 47 deletions
diff --git a/src/export/pdf/font.rs b/src/export/pdf/font.rs
index de79976a..1e2f9c93 100644
--- a/src/export/pdf/font.rs
+++ b/src/export/pdf/font.rs
@@ -1,13 +1,21 @@
use std::collections::BTreeMap;
-use ecow::eco_format;
+use ecow::{eco_format, EcoString};
use pdf_writer::types::{CidFontType, FontFlags, SystemInfo, UnicodeCmap};
use pdf_writer::{Filter, Finish, Name, Rect, Str};
use ttf_parser::{name_id, GlyphId, Tag};
+use unicode_general_category::GeneralCategory;
use super::{deflate, EmExt, PdfContext, RefExt};
use crate::util::SliceExt;
+const CMAP_NAME: Name = Name(b"Custom");
+const SYSTEM_INFO: SystemInfo = SystemInfo {
+ registry: Str(b"Adobe"),
+ ordering: Str(b"Identity"),
+ supplement: 0,
+};
+
/// Embed all used fonts into the PDF.
#[tracing::instrument(skip_all)]
pub fn write_fonts(ctx: &mut PdfContext) {
@@ -19,7 +27,7 @@ pub fn write_fonts(ctx: &mut PdfContext) {
let data_ref = ctx.alloc.bump();
ctx.font_refs.push(type0_ref);
- let glyphs = &ctx.glyph_sets[font];
+ let glyph_set = ctx.glyph_sets.get_mut(font).unwrap();
let metrics = font.metrics();
let ttf = font.ttf();
@@ -29,12 +37,6 @@ pub fn write_fonts(ctx: &mut PdfContext) {
let base_font = eco_format!("ABCDEF+{}", postscript_name);
let base_font = Name(base_font.as_bytes());
- let cmap_name = Name(b"Custom");
- let system_info = SystemInfo {
- registry: Str(b"Adobe"),
- ordering: Str(b"Identity"),
- supplement: 0,
- };
// Write the base font object referencing the CID font.
ctx.writer
@@ -59,7 +61,7 @@ pub fn write_fonts(ctx: &mut PdfContext) {
let mut cid = ctx.writer.cid_font(cid_ref);
cid.subtype(subtype);
cid.base_font(base_font);
- cid.system_info(system_info);
+ cid.system_info(SYSTEM_INFO);
cid.font_descriptor(descriptor_ref);
cid.default_width(0.0);
@@ -70,7 +72,7 @@ pub fn write_fonts(ctx: &mut PdfContext) {
// Extract the widths of all glyphs.
let num_glyphs = ttf.number_of_glyphs();
let mut widths = vec![0.0; num_glyphs as usize];
- for &g in glyphs {
+ for &g in glyph_set.keys() {
let x = ttf.glyph_hor_advance(GlyphId(g)).unwrap_or(0);
widths[g as usize] = font.to_em(x).to_font_units();
}
@@ -130,42 +132,15 @@ pub fn write_fonts(ctx: &mut PdfContext) {
font_descriptor.finish();
- // Compute a reverse mapping from glyphs to unicode.
- let cmap = {
- let mut mapping = BTreeMap::new();
- for subtable in
- ttf.tables().cmap.into_iter().flat_map(|table| table.subtables)
- {
- if subtable.is_unicode() {
- subtable.codepoints(|n| {
- if let Some(c) = std::char::from_u32(n) {
- if let Some(GlyphId(g)) = ttf.glyph_index(c) {
- if glyphs.contains(&g) {
- mapping.insert(g, c);
- }
- }
- }
- });
- }
- }
-
- let mut cmap = UnicodeCmap::new(cmap_name, system_info);
- for (g, c) in mapping {
- cmap.pair(g, c);
- }
- cmap
- };
-
// Write the /ToUnicode character map, which maps glyph ids back to
// unicode codepoints to enable copying out of the PDF.
- ctx.writer
- .cmap(cmap_ref, &deflate(&cmap.finish()))
- .filter(Filter::FlateDecode);
+ let cmap = create_cmap(ttf, glyph_set);
+ ctx.writer.cmap(cmap_ref, &cmap.finish());
// Subset and write the font's bytes.
let data = font.data();
let subsetted = {
- let glyphs: Vec<_> = glyphs.iter().copied().collect();
+ let glyphs: Vec<_> = glyph_set.keys().copied().collect();
let profile = subsetter::Profile::pdf(&glyphs);
subsetter::subset(data, font.index(), profile)
};
@@ -183,3 +158,44 @@ pub fn write_fonts(ctx: &mut PdfContext) {
stream.finish();
}
}
+
+/// Create a /ToUnicode CMap.
+fn create_cmap(
+ ttf: &ttf_parser::Face,
+ glyph_set: &mut BTreeMap<u16, EcoString>,
+) -> UnicodeCmap {
+ // For glyphs that have codepoints mapping to them in the font's cmap table,
+ // we prefer those codepoints over pre-existing text mappings from the
+ // document. Only glyphs that don't have a corresponding codepoint (or only a
+ // private-use one), like the "Th" in Linux Libertine, get the text of their
+ // first occurrence in the document instead.
+ for subtable in ttf.tables().cmap.into_iter().flat_map(|table| table.subtables) {
+ if !subtable.is_unicode() {
+ continue;
+ }
+
+ subtable.codepoints(|n| {
+ let Some(c) = std::char::from_u32(n) else { return };
+ if unicode_general_category::get_general_category(c)
+ == GeneralCategory::PrivateUse
+ {
+ return;
+ }
+
+ let Some(GlyphId(g)) = ttf.glyph_index(c) else { return };
+ if glyph_set.contains_key(&g) {
+ glyph_set.insert(g, c.into());
+ }
+ });
+ }
+
+ // Produce a reverse mapping from glyphs to unicode strings.
+ let mut cmap = UnicodeCmap::new(CMAP_NAME, SYSTEM_INFO);
+ for (&g, text) in glyph_set.iter() {
+ if !text.is_empty() {
+ cmap.pair_with_multiple(g, text.chars());
+ }
+ }
+
+ cmap
+}
diff --git a/src/export/pdf/mod.rs b/src/export/pdf/mod.rs
index ffbf67a3..48485862 100644
--- a/src/export/pdf/mod.rs
+++ b/src/export/pdf/mod.rs
@@ -6,9 +6,10 @@ mod outline;
mod page;
use std::cmp::Eq;
-use std::collections::{HashMap, HashSet};
+use std::collections::{BTreeMap, HashMap};
use std::hash::Hash;
+use ecow::EcoString;
use pdf_writer::types::Direction;
use pdf_writer::{Finish, Name, PdfWriter, Ref, TextStr};
use xmp_writer::{LangId, RenditionClass, XmpWriter};
@@ -52,7 +53,13 @@ pub struct PdfContext<'a> {
page_refs: Vec<Ref>,
font_map: Remapper<Font>,
image_map: Remapper<Image>,
- glyph_sets: HashMap<Font, HashSet<u16>>,
+ /// For each font a mapping from used glyphs to their text representation.
+ /// May contain multiple chars in case of ligatures or similar things. If the
+ /// same glyph has different text representations within one document, we
+ /// just save the first one. The resulting strings are used for the
+ /// PDF's /ToUnicode map for glyphs that don't have an entry in the font's
+ /// cmap. This is important for copy-paste and searching.
+ glyph_sets: HashMap<Font, BTreeMap<u16, EcoString>>,
languages: HashMap<Lang, usize>,
}
diff --git a/src/export/pdf/page.rs b/src/export/pdf/page.rs
index 35a4f5dc..22e590d5 100644
--- a/src/export/pdf/page.rs
+++ b/src/export/pdf/page.rs
@@ -364,11 +364,12 @@ fn write_group(ctx: &mut PageContext, pos: Point, group: &GroupItem) {
/// Encode a text run into the content stream.
fn write_text(ctx: &mut PageContext, x: f32, y: f32, text: &TextItem) {
*ctx.parent.languages.entry(text.lang).or_insert(0) += text.glyphs.len();
- ctx.parent
- .glyph_sets
- .entry(text.font.clone())
- .or_default()
- .extend(text.glyphs.iter().map(|g| g.id));
+
+ let glyph_set = ctx.parent.glyph_sets.entry(text.font.clone()).or_default();
+ for g in &text.glyphs {
+ let segment = &text.text[g.range()];
+ glyph_set.entry(g.id).or_insert_with(|| segment.into());
+ }
ctx.set_fill(&text.fill);
ctx.set_font(&text.font, text.size);