diff options
| author | Laurenz <laurmaedje@gmail.com> | 2023-05-03 10:33:18 +0200 |
|---|---|---|
| committer | Laurenz <laurmaedje@gmail.com> | 2023-05-03 10:33:18 +0200 |
| commit | ad347632ab95e29eb5180b27142f5c264dfc611a (patch) | |
| tree | 2742a33f4c3d800a86e977de04fa2cec7104c43f /src | |
| parent | bcc014c4e177cc4e8cf5ca8c24990908b507c0f8 (diff) | |
Make ligatures copyable and searchable
Fixes #479
Fixes #1040
Diffstat (limited to 'src')
| -rw-r--r-- | src/doc.rs | 50 | ||||
| -rw-r--r-- | src/export/pdf/font.rs | 96 | ||||
| -rw-r--r-- | src/export/pdf/mod.rs | 11 | ||||
| -rw-r--r-- | src/export/pdf/page.rs | 11 | ||||
| -rw-r--r-- | src/ide/jump.rs | 13 |
5 files changed, 96 insertions, 85 deletions
@@ -1,7 +1,8 @@ //! Finished documents. -use std::fmt::{self, Debug, Formatter, Write}; +use std::fmt::{self, Debug, Formatter}; use std::num::NonZeroUsize; +use std::ops::Range; use std::str::FromStr; use std::sync::Arc; @@ -114,23 +115,6 @@ impl Frame { pub fn items(&self) -> std::slice::Iter<'_, (Point, FrameItem)> { self.items.iter() } - - /// Approximately recover the text inside of the frame and its children. - pub fn text(&self) -> EcoString { - let mut text = EcoString::new(); - for (_, item) in self.items() { - match item { - FrameItem::Text(item) => { - for glyph in &item.glyphs { - text.push(glyph.c); - } - } - FrameItem::Group(group) => text.push_str(&group.frame.text()), - _ => {} - } - } - text - } } /// Insert items and subframes. @@ -476,6 +460,8 @@ pub struct TextItem { pub fill: Paint, /// The natural language of the text. pub lang: Lang, + /// The item's plain text. + pub text: EcoString, /// The glyphs. pub glyphs: Vec<Glyph>, } @@ -489,19 +475,14 @@ impl TextItem { impl Debug for TextItem { fn fmt(&self, f: &mut Formatter) -> fmt::Result { - // This is only a rough approximation of the source text. - f.write_str("Text(\"")?; - for glyph in &self.glyphs { - for c in glyph.c.escape_debug() { - f.write_char(c)?; - } - } - f.write_str("\")") + f.write_str("Text(")?; + self.text.fmt(f)?; + f.write_str(")") } } /// A glyph in a run of shaped text. -#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)] +#[derive(Debug, Clone, Eq, PartialEq, Hash)] pub struct Glyph { /// The glyph's index in the font. pub id: u16, @@ -509,12 +490,17 @@ pub struct Glyph { pub x_advance: Em, /// The horizontal offset of the glyph. pub x_offset: Em, - /// The first character of the glyph's cluster. - pub c: char, + /// The range of the glyph in its item's text. + pub range: Range<u16>, /// The source code location of the text. - pub span: Span, - /// The offset within the spanned text. 
- pub offset: u16, + pub span: (Span, u16), +} + +impl Glyph { + /// The range of the glyph in its item's text. + pub fn range(&self) -> Range<usize> { + usize::from(self.range.start)..usize::from(self.range.end) + } } /// An identifier for a natural language. diff --git a/src/export/pdf/font.rs b/src/export/pdf/font.rs index de79976a..1e2f9c93 100644 --- a/src/export/pdf/font.rs +++ b/src/export/pdf/font.rs @@ -1,13 +1,21 @@ use std::collections::BTreeMap; -use ecow::eco_format; +use ecow::{eco_format, EcoString}; use pdf_writer::types::{CidFontType, FontFlags, SystemInfo, UnicodeCmap}; use pdf_writer::{Filter, Finish, Name, Rect, Str}; use ttf_parser::{name_id, GlyphId, Tag}; +use unicode_general_category::GeneralCategory; use super::{deflate, EmExt, PdfContext, RefExt}; use crate::util::SliceExt; +const CMAP_NAME: Name = Name(b"Custom"); +const SYSTEM_INFO: SystemInfo = SystemInfo { + registry: Str(b"Adobe"), + ordering: Str(b"Identity"), + supplement: 0, +}; + /// Embed all used fonts into the PDF. #[tracing::instrument(skip_all)] pub fn write_fonts(ctx: &mut PdfContext) { @@ -19,7 +27,7 @@ pub fn write_fonts(ctx: &mut PdfContext) { let data_ref = ctx.alloc.bump(); ctx.font_refs.push(type0_ref); - let glyphs = &ctx.glyph_sets[font]; + let glyph_set = ctx.glyph_sets.get_mut(font).unwrap(); let metrics = font.metrics(); let ttf = font.ttf(); @@ -29,12 +37,6 @@ pub fn write_fonts(ctx: &mut PdfContext) { let base_font = eco_format!("ABCDEF+{}", postscript_name); let base_font = Name(base_font.as_bytes()); - let cmap_name = Name(b"Custom"); - let system_info = SystemInfo { - registry: Str(b"Adobe"), - ordering: Str(b"Identity"), - supplement: 0, - }; // Write the base font object referencing the CID font. 
ctx.writer @@ -59,7 +61,7 @@ pub fn write_fonts(ctx: &mut PdfContext) { let mut cid = ctx.writer.cid_font(cid_ref); cid.subtype(subtype); cid.base_font(base_font); - cid.system_info(system_info); + cid.system_info(SYSTEM_INFO); cid.font_descriptor(descriptor_ref); cid.default_width(0.0); @@ -70,7 +72,7 @@ pub fn write_fonts(ctx: &mut PdfContext) { // Extract the widths of all glyphs. let num_glyphs = ttf.number_of_glyphs(); let mut widths = vec![0.0; num_glyphs as usize]; - for &g in glyphs { + for &g in glyph_set.keys() { let x = ttf.glyph_hor_advance(GlyphId(g)).unwrap_or(0); widths[g as usize] = font.to_em(x).to_font_units(); } @@ -130,42 +132,15 @@ pub fn write_fonts(ctx: &mut PdfContext) { font_descriptor.finish(); - // Compute a reverse mapping from glyphs to unicode. - let cmap = { - let mut mapping = BTreeMap::new(); - for subtable in - ttf.tables().cmap.into_iter().flat_map(|table| table.subtables) - { - if subtable.is_unicode() { - subtable.codepoints(|n| { - if let Some(c) = std::char::from_u32(n) { - if let Some(GlyphId(g)) = ttf.glyph_index(c) { - if glyphs.contains(&g) { - mapping.insert(g, c); - } - } - } - }); - } - } - - let mut cmap = UnicodeCmap::new(cmap_name, system_info); - for (g, c) in mapping { - cmap.pair(g, c); - } - cmap - }; - // Write the /ToUnicode character map, which maps glyph ids back to // unicode codepoints to enable copying out of the PDF. - ctx.writer - .cmap(cmap_ref, &deflate(&cmap.finish())) - .filter(Filter::FlateDecode); + let cmap = create_cmap(ttf, glyph_set); + ctx.writer.cmap(cmap_ref, &cmap.finish()); // Subset and write the font's bytes. let data = font.data(); let subsetted = { - let glyphs: Vec<_> = glyphs.iter().copied().collect(); + let glyphs: Vec<_> = glyph_set.keys().copied().collect(); let profile = subsetter::Profile::pdf(&glyphs); subsetter::subset(data, font.index(), profile) }; @@ -183,3 +158,44 @@ pub fn write_fonts(ctx: &mut PdfContext) { stream.finish(); } } + +/// Create a /ToUnicode CMap. 
+fn create_cmap( + ttf: &ttf_parser::Face, + glyph_set: &mut BTreeMap<u16, EcoString>, +) -> UnicodeCmap { + // For glyphs that have codepoints mapping to them in the font's cmap table, we + // prefer them over pre-existing text mappings from the document. Only + // things that don't have a corresponding codepoint (or only a private-use + // one) like the "Th" in Linux Libertine get the text of their first + // occurrences in the document instead. + for subtable in ttf.tables().cmap.into_iter().flat_map(|table| table.subtables) { + if !subtable.is_unicode() { + continue; + } + + subtable.codepoints(|n| { + let Some(c) = std::char::from_u32(n) else { return }; + if unicode_general_category::get_general_category(c) + == GeneralCategory::PrivateUse + { + return; + } + + let Some(GlyphId(g)) = ttf.glyph_index(c) else { return }; + if glyph_set.contains_key(&g) { + glyph_set.insert(g, c.into()); + } + }); + } + + // Produce a reverse mapping from glyphs to unicode strings. + let mut cmap = UnicodeCmap::new(CMAP_NAME, SYSTEM_INFO); + for (&g, text) in glyph_set.iter() { + if !text.is_empty() { + cmap.pair_with_multiple(g, text.chars()); + } + } + + cmap +} diff --git a/src/export/pdf/mod.rs b/src/export/pdf/mod.rs index ffbf67a3..48485862 100644 --- a/src/export/pdf/mod.rs +++ b/src/export/pdf/mod.rs @@ -6,9 +6,10 @@ mod outline; mod page; use std::cmp::Eq; -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap}; use std::hash::Hash; +use ecow::EcoString; use pdf_writer::types::Direction; use pdf_writer::{Finish, Name, PdfWriter, Ref, TextStr}; use xmp_writer::{LangId, RenditionClass, XmpWriter}; @@ -52,7 +53,13 @@ pub struct PdfContext<'a> { page_refs: Vec<Ref>, font_map: Remapper<Font>, image_map: Remapper<Image>, - glyph_sets: HashMap<Font, HashSet<u16>>, + /// For each font a mapping from used glyphs to their text representation. + /// May contain multiple chars in case of ligatures or similar things. 
The + /// same glyph can have a different text representation within one document, + /// then we just save the first one. The resulting strings are used for the + /// PDF's /ToUnicode map for glyphs that don't have an entry in the font's + /// cmap. This is important for copy-paste and searching. + glyph_sets: HashMap<Font, BTreeMap<u16, EcoString>>, languages: HashMap<Lang, usize>, } diff --git a/src/export/pdf/page.rs b/src/export/pdf/page.rs index 35a4f5dc..22e590d5 100644 --- a/src/export/pdf/page.rs +++ b/src/export/pdf/page.rs @@ -364,11 +364,12 @@ fn write_group(ctx: &mut PageContext, pos: Point, group: &GroupItem) { /// Encode a text run into the content stream. fn write_text(ctx: &mut PageContext, x: f32, y: f32, text: &TextItem) { *ctx.parent.languages.entry(text.lang).or_insert(0) += text.glyphs.len(); - ctx.parent - .glyph_sets - .entry(text.font.clone()) - .or_default() - .extend(text.glyphs.iter().map(|g| g.id)); + + let glyph_set = ctx.parent.glyph_sets.entry(text.font.clone()).or_default(); + for g in &text.glyphs { + let segment = &text.text[g.range()]; + glyph_set.entry(g.id).or_insert_with(|| segment.into()); + } ctx.set_fill(&text.fill); ctx.set_font(&text.font, text.size); diff --git a/src/ide/jump.rs b/src/ide/jump.rs index fc98747c..42ed2ab5 100644 --- a/src/ide/jump.rs +++ b/src/ide/jump.rs @@ -67,7 +67,8 @@ pub fn jump_from_click( FrameItem::Text(text) => { for glyph in &text.glyphs { - if glyph.span.is_detached() { + let (span, span_offset) = glyph.span; + if span.is_detached() { continue; } @@ -77,13 +78,13 @@ pub fn jump_from_click( Size::new(width, text.size), click, ) { - let source = world.source(glyph.span.source()); - let node = source.find(glyph.span)?; + let source = world.source(span.source()); + let node = source.find(span)?; let pos = if node.kind() == SyntaxKind::Text { let range = node.range(); - let mut offset = range.start + usize::from(glyph.offset); + let mut offset = range.start + usize::from(span_offset); if (click.x - 
pos.x) > width / 2.0 { - offset += glyph.c.len_utf8(); + offset += glyph.range().len(); } offset.min(range.end) } else { @@ -150,7 +151,7 @@ fn find_in_frame(frame: &Frame, span: Span) -> Option<Point> { if let FrameItem::Text(text) = item { for glyph in &text.glyphs { - if glyph.span == span { + if glyph.span.0 == span { return Some(pos); } pos.x += glyph.x_advance.at(text.size); |
