diff options
| author | Laurenz <laurmaedje@gmail.com> | 2023-05-03 10:33:18 +0200 |
|---|---|---|
| committer | Laurenz <laurmaedje@gmail.com> | 2023-05-03 10:33:18 +0200 |
| commit | ad347632ab95e29eb5180b27142f5c264dfc611a (patch) | |
| tree | 2742a33f4c3d800a86e977de04fa2cec7104c43f /src/export | |
| parent | bcc014c4e177cc4e8cf5ca8c24990908b507c0f8 (diff) | |
Make ligatures copyable and searchable
Fixes #479
Fixes #1040
Diffstat (limited to 'src/export')
| -rw-r--r-- | src/export/pdf/font.rs | 96 | ||||
| -rw-r--r-- | src/export/pdf/mod.rs | 11 | ||||
| -rw-r--r-- | src/export/pdf/page.rs | 11 |
3 files changed, 71 insertions, 47 deletions
diff --git a/src/export/pdf/font.rs b/src/export/pdf/font.rs index de79976a..1e2f9c93 100644 --- a/src/export/pdf/font.rs +++ b/src/export/pdf/font.rs @@ -1,13 +1,21 @@ use std::collections::BTreeMap; -use ecow::eco_format; +use ecow::{eco_format, EcoString}; use pdf_writer::types::{CidFontType, FontFlags, SystemInfo, UnicodeCmap}; use pdf_writer::{Filter, Finish, Name, Rect, Str}; use ttf_parser::{name_id, GlyphId, Tag}; +use unicode_general_category::GeneralCategory; use super::{deflate, EmExt, PdfContext, RefExt}; use crate::util::SliceExt; +const CMAP_NAME: Name = Name(b"Custom"); +const SYSTEM_INFO: SystemInfo = SystemInfo { + registry: Str(b"Adobe"), + ordering: Str(b"Identity"), + supplement: 0, +}; + /// Embed all used fonts into the PDF. #[tracing::instrument(skip_all)] pub fn write_fonts(ctx: &mut PdfContext) { @@ -19,7 +27,7 @@ pub fn write_fonts(ctx: &mut PdfContext) { let data_ref = ctx.alloc.bump(); ctx.font_refs.push(type0_ref); - let glyphs = &ctx.glyph_sets[font]; + let glyph_set = ctx.glyph_sets.get_mut(font).unwrap(); let metrics = font.metrics(); let ttf = font.ttf(); @@ -29,12 +37,6 @@ pub fn write_fonts(ctx: &mut PdfContext) { let base_font = eco_format!("ABCDEF+{}", postscript_name); let base_font = Name(base_font.as_bytes()); - let cmap_name = Name(b"Custom"); - let system_info = SystemInfo { - registry: Str(b"Adobe"), - ordering: Str(b"Identity"), - supplement: 0, - }; // Write the base font object referencing the CID font. ctx.writer @@ -59,7 +61,7 @@ pub fn write_fonts(ctx: &mut PdfContext) { let mut cid = ctx.writer.cid_font(cid_ref); cid.subtype(subtype); cid.base_font(base_font); - cid.system_info(system_info); + cid.system_info(SYSTEM_INFO); cid.font_descriptor(descriptor_ref); cid.default_width(0.0); @@ -70,7 +72,7 @@ pub fn write_fonts(ctx: &mut PdfContext) { // Extract the widths of all glyphs. let num_glyphs = ttf.number_of_glyphs(); let mut widths = vec![0.0; num_glyphs as usize]; - for &g in glyphs { + for &g in glyph_set.keys() { let x = ttf.glyph_hor_advance(GlyphId(g)).unwrap_or(0); widths[g as usize] = font.to_em(x).to_font_units(); } @@ -130,42 +132,15 @@ pub fn write_fonts(ctx: &mut PdfContext) { font_descriptor.finish(); - // Compute a reverse mapping from glyphs to unicode. - let cmap = { - let mut mapping = BTreeMap::new(); - for subtable in - ttf.tables().cmap.into_iter().flat_map(|table| table.subtables) - { - if subtable.is_unicode() { - subtable.codepoints(|n| { - if let Some(c) = std::char::from_u32(n) { - if let Some(GlyphId(g)) = ttf.glyph_index(c) { - if glyphs.contains(&g) { - mapping.insert(g, c); - } - } - } - }); - } - } - - let mut cmap = UnicodeCmap::new(cmap_name, system_info); - for (g, c) in mapping { - cmap.pair(g, c); - } - cmap - }; - // Write the /ToUnicode character map, which maps glyph ids back to // unicode codepoints to enable copying out of the PDF. - ctx.writer - .cmap(cmap_ref, &deflate(&cmap.finish())) - .filter(Filter::FlateDecode); + let cmap = create_cmap(ttf, glyph_set); + ctx.writer.cmap(cmap_ref, &cmap.finish()); // Subset and write the font's bytes. let data = font.data(); let subsetted = { - let glyphs: Vec<_> = glyphs.iter().copied().collect(); + let glyphs: Vec<_> = glyph_set.keys().copied().collect(); let profile = subsetter::Profile::pdf(&glyphs); subsetter::subset(data, font.index(), profile) }; @@ -183,3 +158,44 @@ pub fn write_fonts(ctx: &mut PdfContext) { stream.finish(); } } + +/// Create a /ToUnicode CMap. +fn create_cmap( + ttf: &ttf_parser::Face, + glyph_set: &mut BTreeMap<u16, EcoString>, +) -> UnicodeCmap { + // For glyphs that have codepoints mapping to in the font's cmap table, we + // prefer them over pre-existing text mappings from the document. Only + // things that don't have a corresponding codepoint (or only a private-use + // one) like the "Th" in Linux Libertine get the text of their first + // occurances in the document instead. + for subtable in ttf.tables().cmap.into_iter().flat_map(|table| table.subtables) { + if !subtable.is_unicode() { + continue; + } + + subtable.codepoints(|n| { + let Some(c) = std::char::from_u32(n) else { return }; + if unicode_general_category::get_general_category(c) + == GeneralCategory::PrivateUse + { + return; + } + + let Some(GlyphId(g)) = ttf.glyph_index(c) else { return }; + if glyph_set.contains_key(&g) { + glyph_set.insert(g, c.into()); + } + }); + } + + // Produce a reverse mapping from glyphs to unicode strings. + let mut cmap = UnicodeCmap::new(CMAP_NAME, SYSTEM_INFO); + for (&g, text) in glyph_set.iter() { + if !text.is_empty() { + cmap.pair_with_multiple(g, text.chars()); + } + } + + cmap +} diff --git a/src/export/pdf/mod.rs b/src/export/pdf/mod.rs index ffbf67a3..48485862 100644 --- a/src/export/pdf/mod.rs +++ b/src/export/pdf/mod.rs @@ -6,9 +6,10 @@ mod outline; mod page; use std::cmp::Eq; -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap}; use std::hash::Hash; +use ecow::EcoString; use pdf_writer::types::Direction; use pdf_writer::{Finish, Name, PdfWriter, Ref, TextStr}; use xmp_writer::{LangId, RenditionClass, XmpWriter}; @@ -52,7 +53,13 @@ pub struct PdfContext<'a> { page_refs: Vec<Ref>, font_map: Remapper<Font>, image_map: Remapper<Image>, - glyph_sets: HashMap<Font, HashSet<u16>>, + /// For each font a mapping from used glyphs to their text representation. + /// May contain multiple chars in case of ligatures or similar things. The + /// same glyph can have a different text representation within one document, + /// then we just save the first one. The resulting strings are used for the + /// PDF's /ToUnicode map for glyphs that don't have an entry in the font's + /// cmap. This is important for copy-paste and searching. + glyph_sets: HashMap<Font, BTreeMap<u16, EcoString>>, languages: HashMap<Lang, usize>, } diff --git a/src/export/pdf/page.rs b/src/export/pdf/page.rs index 35a4f5dc..22e590d5 100644 --- a/src/export/pdf/page.rs +++ b/src/export/pdf/page.rs @@ -364,11 +364,12 @@ fn write_group(ctx: &mut PageContext, pos: Point, group: &GroupItem) { /// Encode a text run into the content stream. fn write_text(ctx: &mut PageContext, x: f32, y: f32, text: &TextItem) { *ctx.parent.languages.entry(text.lang).or_insert(0) += text.glyphs.len(); - ctx.parent - .glyph_sets - .entry(text.font.clone()) - .or_default() - .extend(text.glyphs.iter().map(|g| g.id)); + + let glyph_set = ctx.parent.glyph_sets.entry(text.font.clone()).or_default(); + for g in &text.glyphs { + let segment = &text.text[g.range()]; + glyph_set.entry(g.id).or_insert_with(|| segment.into()); + } ctx.set_fill(&text.fill); ctx.set_font(&text.font, text.size); |
