summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLaurenz Stampfl <47084093+LaurenzV@users.noreply.github.com>2024-06-16 11:38:33 +0200
committerGitHub <noreply@github.com>2024-06-16 09:38:33 +0000
commitfeedfe80cb86f880245f7b2361b83459c72ee36d (patch)
tree89832bffe311aee263e74c7c56fae3d3515243c2
parent34550220aee087271769fb9e5d94d1faebe243c1 (diff)
Improve subsetting (#4373)
Co-authored-by: Laurenz <laurmaedje@gmail.com>
-rw-r--r--Cargo.lock8
-rw-r--r--Cargo.toml4
-rw-r--r--crates/typst-pdf/src/color_font.rs2
-rw-r--r--crates/typst-pdf/src/content.rs33
-rw-r--r--crates/typst-pdf/src/font.rs87
-rw-r--r--crates/typst-pdf/src/image.rs3
-rw-r--r--crates/typst-pdf/src/page.rs1
-rw-r--r--crates/typst-pdf/src/resources.rs9
8 files changed, 79 insertions, 68 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 72f757cd..3e99ea81 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2255,15 +2255,13 @@ dependencies = [
[[package]]
name = "subsetter"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09eab8a83bff89ba2200bd4c59be45c7c787f988431b936099a5a266c957f2f9"
+version = "0.11.0"
+source = "git+https://github.com/typst/subsetter?rev=4e0058b#4e0058b4b9a0948a5f79894111948d95e59ba350"
[[package]]
name = "svg2pdf"
version = "0.11.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e31565956eb1dc398c0d9776ee1d1bac4e34759af63dcbe0520df32313a5b53b"
+source = "git+https://github.com/typst/svg2pdf?rev=39f8ad3#39f8ad3b35e14cfcabf3d5d916899f7ac78790f7"
dependencies = [
"fontdb",
"image 0.25.1",
diff --git a/Cargo.toml b/Cargo.toml
index 367e835a..ee50b666 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -98,8 +98,8 @@ shell-escape = "0.1.5"
siphasher = "1"
smallvec = { version = "1.11.1", features = ["union", "const_generics", "const_new"] }
stacker = "0.1.15"
-subsetter = "0.1.1"
-svg2pdf = "0.11.0"
+subsetter = { git = "https://github.com/typst/subsetter", rev = "4e0058b" }
+svg2pdf = { git = "https://github.com/typst/svg2pdf", rev = "39f8ad3" }
syn = { version = "2", features = ["full", "extra-traits"] }
syntect = { version = "5", default-features = false, features = ["parsing", "regex-fancy", "plist-load", "yaml-load"] }
tar = "0.4"
diff --git a/crates/typst-pdf/src/color_font.rs b/crates/typst-pdf/src/color_font.rs
index f4621ca0..201915b1 100644
--- a/crates/typst-pdf/src/color_font.rs
+++ b/crates/typst-pdf/src/color_font.rs
@@ -115,7 +115,7 @@ pub fn write_color_fonts(
pdf_font.finish();
// Encode a CMAP to make it possible to search or copy glyphs.
- let glyph_set = resources.glyph_sets.get(&font_slice.font).unwrap();
+ let glyph_set = resources.color_glyph_sets.get(&font_slice.font).unwrap();
let mut cmap = UnicodeCmap::new(CMAP_NAME, SYSTEM_INFO);
for (index, glyph) in subset.iter().enumerate() {
let Some(text) = glyph_set.get(&glyph.gid) else {
diff --git a/crates/typst-pdf/src/content.rs b/crates/typst-pdf/src/content.rs
index c5327c18..8ae2c424 100644
--- a/crates/typst-pdf/src/content.rs
+++ b/crates/typst-pdf/src/content.rs
@@ -476,6 +476,12 @@ fn write_normal_text(ctx: &mut Builder, pos: Point, text: TextItemView) {
let mut adjustment = Em::zero();
let mut encoded = vec![];
+ let glyph_remapper = ctx
+ .resources
+ .glyph_remappers
+ .entry(text.item.font.clone())
+ .or_default();
+
// Write the glyphs with kerning adjustments.
for glyph in text.glyphs() {
adjustment += glyph.x_offset;
@@ -490,7 +496,26 @@ fn write_normal_text(ctx: &mut Builder, pos: Point, text: TextItemView) {
adjustment = Em::zero();
}
- let cid = crate::font::glyph_cid(&text.item.font, glyph.id);
+ // In PDF, we use CIDs to index the glyphs in a font, not GIDs. What a
+ // CID actually refers to depends on the type of font we are embedding:
+ //
+ // - For TrueType fonts, the CIDs are defined by an external mapping.
+ // - For SID-keyed CFF fonts, the CID is the same as the GID in the font.
+ // - For CID-keyed CFF fonts, the CID refers to the CID in the font.
+ //
+ // (See the PDF spec for more details on this.)
+ //
+ // However, in our case:
+ // - We use the identity-mapping for TrueType fonts.
+ // - SID-keyed fonts will get converted into CID-keyed fonts by the
+ // subsetter.
+ // - CID-keyed fonts will be rewritten in a way so that the mapping
+ // between CID and GID is always the identity mapping, regardless of
+ // the mapping before.
+ //
+ // Because of this, we can always use the remapped GID as the CID,
+ // regardless of which type of font we are actually embedding.
+ let cid = glyph_remapper.remap(glyph.id);
encoded.push((cid >> 8) as u8);
encoded.push((cid & 0xff) as u8);
@@ -523,7 +548,11 @@ fn write_color_glyphs(ctx: &mut Builder, pos: Point, text: TextItemView) {
// displays regular glyphs and not color glyphs.
ctx.state.font = None;
- let glyph_set = ctx.resources.glyph_sets.entry(text.item.font.clone()).or_default();
+ let glyph_set = ctx
+ .resources
+ .color_glyph_sets
+ .entry(text.item.font.clone())
+ .or_default();
for glyph in text.glyphs() {
// Retrieve the Type3 font reference and the glyph index in the font.
diff --git a/crates/typst-pdf/src/font.rs b/crates/typst-pdf/src/font.rs
index 22c3d22f..6c6e7682 100644
--- a/crates/typst-pdf/src/font.rs
+++ b/crates/typst-pdf/src/font.rs
@@ -8,6 +8,7 @@ use pdf_writer::{
writers::FontDescriptor,
Chunk, Filter, Finish, Name, Rect, Ref, Str,
};
+use subsetter::GlyphRemapper;
use ttf_parser::{name_id, GlyphId, Tag};
use typst::text::Font;
use typst::utils::SliceExt;
@@ -43,6 +44,7 @@ pub fn write_fonts(context: &WithGlobalRefs) -> (PdfChunk, HashMap<Font, Ref>) {
out.insert(font.clone(), type0_ref);
let glyph_set = resources.glyph_sets.get(font).unwrap();
+ let glyph_remapper = resources.glyph_remappers.get(font).unwrap();
let ttf = font.ttf();
// Do we have a TrueType or CFF font?
@@ -87,16 +89,15 @@ pub fn write_fonts(context: &WithGlobalRefs) -> (PdfChunk, HashMap<Font, Ref>) {
}
// Extract the widths of all glyphs.
- let mut widths = vec![];
- for gid in std::iter::once(0).chain(glyph_set.keys().copied()) {
- let width = ttf.glyph_hor_advance(GlyphId(gid)).unwrap_or(0);
- let units = font.to_em(width).to_font_units();
- let cid = glyph_cid(font, gid);
- if usize::from(cid) >= widths.len() {
- widths.resize(usize::from(cid) + 1, 0.0);
- widths[usize::from(cid)] = units;
- }
- }
+ // `remapped_gids` returns an iterator over the old GIDs in their new sorted
+ // order, so we can append the widths as is.
+ let widths = glyph_remapper
+ .remapped_gids()
+ .map(|gid| {
+ let width = ttf.glyph_hor_advance(GlyphId(gid)).unwrap_or(0);
+ font.to_em(width).to_font_units()
+ })
+ .collect::<Vec<_>>();
// Write all non-zero glyph widths.
let mut first = 0;
@@ -115,19 +116,15 @@ pub fn write_fonts(context: &WithGlobalRefs) -> (PdfChunk, HashMap<Font, Ref>) {
// Write the /ToUnicode character map, which maps glyph ids back to
// unicode codepoints to enable copying out of the PDF.
- let cmap = create_cmap(font, glyph_set);
+ let cmap = create_cmap(glyph_set, glyph_remapper);
chunk.cmap(cmap_ref, &cmap.finish());
- // Subset and write the font's bytes.
- let glyphs: Vec<_> = glyph_set.keys().copied().collect();
- let data = subset_font(font, &glyphs);
-
- let mut stream = chunk.stream(data_ref, &data);
+ let subset = subset_font(font, glyph_remapper);
+ let mut stream = chunk.stream(data_ref, &subset);
stream.filter(Filter::FlateDecode);
if is_cff {
stream.pair(Name(b"Subtype"), Name(b"CIDFontType0C"));
}
-
stream.finish();
let mut font_descriptor =
@@ -194,15 +191,18 @@ pub fn write_font_descriptor<'a>(
/// Subset a font to the given glyphs.
///
-/// - For a font with TrueType outlines, this returns the whole OpenType font.
-/// - For a font with CFF outlines, this returns just the CFF font program.
+/// - For a font with TrueType outlines, this produces the whole OpenType font.
+/// - For a font with CFF outlines, this produces just the CFF font program.
+///
+/// In both cases, this returns the already compressed data.
#[comemo::memoize]
#[typst_macros::time(name = "subset font")]
-fn subset_font(font: &Font, glyphs: &[u16]) -> Arc<Vec<u8>> {
+fn subset_font(font: &Font, glyph_remapper: &GlyphRemapper) -> Arc<Vec<u8>> {
let data = font.data();
- let profile = subsetter::Profile::pdf(glyphs);
- let subsetted = subsetter::subset(data, font.index(), profile);
- let mut data = subsetted.as_deref().unwrap_or(data);
+ // TODO: Fail export instead of unwrapping once export diagnostics exist.
+ let subsetted = subsetter::subset(data, font.index(), glyph_remapper).unwrap();
+
+ let mut data = subsetted.as_ref();
// Extract the standalone CFF font program if applicable.
let raw = ttf_parser::RawFace::parse(data, 0).unwrap();
@@ -259,46 +259,19 @@ pub fn improve_glyph_sets(glyph_sets: &mut HashMap<Font, BTreeMap<u16, EcoString
}
/// Create a /ToUnicode CMap.
-fn create_cmap(font: &Font, glyph_set: &BTreeMap<u16, EcoString>) -> UnicodeCmap {
+fn create_cmap(
+ glyph_set: &BTreeMap<u16, EcoString>,
+ glyph_remapper: &GlyphRemapper,
+) -> UnicodeCmap {
// Produce a reverse mapping from glyphs' CIDs to unicode strings.
let mut cmap = UnicodeCmap::new(CMAP_NAME, SYSTEM_INFO);
for (&g, text) in glyph_set.iter() {
+ // See comment in `write_normal_text` for why we can choose the CID this way.
+ let cid = glyph_remapper.get(g).unwrap();
if !text.is_empty() {
- cmap.pair_with_multiple(glyph_cid(font, g), text.chars());
+ cmap.pair_with_multiple(cid, text.chars());
}
}
cmap
}
-
-/// Get the CID for a glyph id.
-///
-/// When writing text into a PDF, we have to specify CIDs (character ids) not
-/// GIDs (glyph IDs).
-///
-/// Most of the time, the mapping between these two is an identity mapping. In
-/// particular, for TrueType fonts, the mapping is an identity mapping because
-/// of this line above:
-/// ```ignore
-/// cid.cid_to_gid_map_predefined(Name(b"Identity"));
-/// ```
-///
-/// However, CID-keyed CFF fonts may have a non-identity mapping defined in
-/// their charset. For those, we must map the glyph IDs in a `TextItem` to CIDs.
-/// The font defines the map through its charset. The charset usually maps
-/// glyphs to SIDs (string ids) specifying the glyph's name. Not for CID-keyed
-/// fonts though! For these, the SIDs are CIDs in disguise. Relevant quote from
-/// the CFF spec:
-///
-/// > The charset data, although in the same format as non-CIDFonts, will
-/// > represent CIDs rather than SIDs, [...]
-///
-/// This function performs the mapping from glyph ID to CID. It also works for
-/// non CID-keyed fonts. Then, it will simply return the glyph ID.
-pub(super) fn glyph_cid(font: &Font, glyph_id: u16) -> u16 {
- font.ttf()
- .tables()
- .cff
- .and_then(|cff| cff.glyph_cid(ttf_parser::GlyphId(glyph_id)))
- .unwrap_or(glyph_id)
-}
diff --git a/crates/typst-pdf/src/image.rs b/crates/typst-pdf/src/image.rs
index 1d43a43b..0df67c61 100644
--- a/crates/typst-pdf/src/image.rs
+++ b/crates/typst-pdf/src/image.rs
@@ -183,7 +183,8 @@ fn encode_alpha(raster: &RasterImage) -> (Vec<u8>, Filter) {
/// Encode an SVG into a chunk of PDF objects.
#[typst_macros::time(name = "encode svg")]
fn encode_svg(svg: &SvgImage) -> (Chunk, Ref) {
- svg2pdf::to_chunk(svg.tree(), svg2pdf::ConversionOptions::default())
+ // TODO: Don't unwrap once we have export diagnostics.
+ svg2pdf::to_chunk(svg.tree(), svg2pdf::ConversionOptions::default()).unwrap()
}
/// A pre-encoded image.
diff --git a/crates/typst-pdf/src/page.rs b/crates/typst-pdf/src/page.rs
index c6881eb6..2983f504 100644
--- a/crates/typst-pdf/src/page.rs
+++ b/crates/typst-pdf/src/page.rs
@@ -53,6 +53,7 @@ pub fn traverse_pages(
}
improve_glyph_sets(&mut resources.glyph_sets);
+ improve_glyph_sets(&mut resources.color_glyph_sets);
(PdfChunk::new(), (pages, resources))
}
diff --git a/crates/typst-pdf/src/resources.rs b/crates/typst-pdf/src/resources.rs
index a0a7c71d..a2cf5687 100644
--- a/crates/typst-pdf/src/resources.rs
+++ b/crates/typst-pdf/src/resources.rs
@@ -11,6 +11,7 @@ use std::hash::Hash;
use ecow::{eco_format, EcoString};
use pdf_writer::{Dict, Finish, Name, Ref};
+use subsetter::GlyphRemapper;
use typst::text::Lang;
use typst::{text::Font, utils::Deferred, visualize::Image};
@@ -82,6 +83,10 @@ pub struct Resources<R = Ref> {
/// PDF's /ToUnicode map for glyphs that don't have an entry in the font's
/// cmap. This is important for copy-paste and searching.
pub glyph_sets: HashMap<Font, BTreeMap<u16, EcoString>>,
+ /// Same as `glyph_sets`, but for color fonts.
+ pub color_glyph_sets: HashMap<Font, BTreeMap<u16, EcoString>>,
+ /// Stores the glyph remapper for each font for the subsetter.
+ pub glyph_remappers: HashMap<Font, GlyphRemapper>,
}
impl<R: Renumber> Renumber for Resources<R> {
@@ -112,6 +117,8 @@ impl Default for Resources<()> {
color_fonts: None,
languages: BTreeMap::new(),
glyph_sets: HashMap::new(),
+ color_glyph_sets: HashMap::new(),
+ glyph_remappers: HashMap::new(),
}
}
}
@@ -138,6 +145,8 @@ impl Resources<()> {
.map(|(c, r)| Box::new(c.with_refs(r))),
languages: self.languages,
glyph_sets: self.glyph_sets,
+ color_glyph_sets: self.color_glyph_sets,
+ glyph_remappers: self.glyph_remappers,
}
}
}