diff options
| author | Niklas Eicker <git@nikl.me> | 2025-01-08 10:38:34 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-01-08 09:38:34 +0000 |
| commit | 0a374d238016c0101d11cbc3f4bc621f3895ad36 (patch) | |
| tree | 5799e3c279e70a371fe7d737ac3ff37655827910 /crates/typst-pdf | |
| parent | 265df6c29f4d142a372917dd708bfba780f7cfbc (diff) | |
Embed files associated with the document as a whole (#5221)
Co-authored-by: Laurenz <laurmaedje@gmail.com>
Diffstat (limited to 'crates/typst-pdf')
| -rw-r--r-- | crates/typst-pdf/src/catalog.rs | 84 | ||||
| -rw-r--r-- | crates/typst-pdf/src/embed.rs | 122 | ||||
| -rw-r--r-- | crates/typst-pdf/src/lib.rs | 46 |
3 files changed, 218 insertions, 34 deletions
diff --git a/crates/typst-pdf/src/catalog.rs b/crates/typst-pdf/src/catalog.rs index c4b0e2e8..709b0155 100644 --- a/crates/typst-pdf/src/catalog.rs +++ b/crates/typst-pdf/src/catalog.rs @@ -12,7 +12,7 @@ use typst_syntax::Span; use xmp_writer::{DateTime, LangId, RenditionClass, XmpWriter}; use crate::page::PdfPageLabel; -use crate::{hash_base64, outline, TextStrExt, Timezone, WithEverything}; +use crate::{hash_base64, outline, TextStrExt, Timestamp, Timezone, WithEverything}; /// Write the document catalog. pub fn write_catalog( @@ -86,23 +86,10 @@ pub fn write_catalog( info.keywords(TextStr::trimmed(&joined)); xmp.pdf_keywords(&joined); } - - // (1) If the `document.date` is set to specific `datetime` or `none`, use it. - // (2) If the `document.date` is set to `auto` or not set, try to use the - // date from the options. - // (3) Otherwise, we don't write date metadata. - let (date, tz) = match (ctx.document.info.date, ctx.options.timestamp) { - (Smart::Custom(date), _) => (date, None), - (Smart::Auto, Some(timestamp)) => { - (Some(timestamp.datetime), Some(timestamp.timezone)) - } - _ => (None, None), - }; - if let Some(date) = date { - if let Some(pdf_date) = pdf_date(date, tz) { - info.creation_date(pdf_date); - info.modified_date(pdf_date); - } + let (date, tz) = document_date(ctx.document.info.date, ctx.options.timestamp); + if let Some(pdf_date) = date.and_then(|date| pdf_date(date, tz)) { + info.creation_date(pdf_date); + info.modified_date(pdf_date); } info.finish(); @@ -154,7 +141,7 @@ pub fn write_catalog( } // Assert dominance. - if ctx.options.standards.pdfa { + if let Some((part, conformance)) = ctx.options.standards.pdfa_part { let mut extension_schemas = xmp.extension_schemas(); extension_schemas .xmp_media_management() @@ -162,8 +149,8 @@ pub fn write_catalog( .describe_instance_id(); extension_schemas.pdf().properties().describe_all(); extension_schemas.finish(); - xmp.pdfa_part(2); - xmp.pdfa_conformance("B"); + xmp.pdfa_part(part); + xmp.pdfa_conformance(conformance); } let xmp_buf = xmp.finish(None); @@ -182,13 +169,35 @@ pub fn write_catalog( catalog.viewer_preferences().direction(dir); catalog.metadata(meta_ref); - // Write the named destination tree if there are any entries. - if !ctx.references.named_destinations.dests.is_empty() { + let has_dests = !ctx.references.named_destinations.dests.is_empty(); + let has_embeddings = !ctx.references.embedded_files.is_empty(); + + // Write the `/Names` dictionary. + if has_dests || has_embeddings { + // Write the named destination tree if there are any entries. let mut name_dict = catalog.names(); - let mut dests_name_tree = name_dict.destinations(); - let mut names = dests_name_tree.names(); - for &(name, dest_ref, ..) in &ctx.references.named_destinations.dests { - names.insert(Str(name.resolve().as_bytes()), dest_ref); + if has_dests { + let mut dests_name_tree = name_dict.destinations(); + let mut names = dests_name_tree.names(); + for &(name, dest_ref, ..) in &ctx.references.named_destinations.dests { + names.insert(Str(name.resolve().as_bytes()), dest_ref); + } + } + + if has_embeddings { + let mut embedded_files = name_dict.embedded_files(); + let mut names = embedded_files.names(); + for (name, file_ref) in &ctx.references.embedded_files { + names.insert(Str(name.as_bytes()), *file_ref); + } + } + } + + if has_embeddings && ctx.options.standards.pdfa { + // PDF 2.0, but ISO 19005-3 (PDF/A-3) Annex E allows it for PDF/A-3. + let mut associated_files = catalog.insert(Name(b"AF")).array().typed(); + for (_, file_ref) in ctx.references.embedded_files { + associated_files.item(file_ref).finish(); } } @@ -289,8 +298,27 @@ pub(crate) fn write_page_labels( result } +/// Resolve the document date. +/// +/// (1) If the `document.date` is set to specific `datetime` or `none`, use it. +/// (2) If the `document.date` is set to `auto` or not set, try to use the +/// date from the options. +/// (3) Otherwise, we don't write date metadata. +pub fn document_date( + document_date: Smart<Option<Datetime>>, + timestamp: Option<Timestamp>, +) -> (Option<Datetime>, Option<Timezone>) { + match (document_date, timestamp) { + (Smart::Custom(date), _) => (date, None), + (Smart::Auto, Some(timestamp)) => { + (Some(timestamp.datetime), Some(timestamp.timezone)) + } + _ => (None, None), + } +} + /// Converts a datetime to a pdf-writer date. -fn pdf_date(datetime: Datetime, tz: Option<Timezone>) -> Option<pdf_writer::Date> { +pub fn pdf_date(datetime: Datetime, tz: Option<Timezone>) -> Option<pdf_writer::Date> { let year = datetime.year().filter(|&y| y >= 0)? as u16; let mut pdf_date = pdf_writer::Date::new(year); diff --git a/crates/typst-pdf/src/embed.rs b/crates/typst-pdf/src/embed.rs new file mode 100644 index 00000000..b32f6e45 --- /dev/null +++ b/crates/typst-pdf/src/embed.rs @@ -0,0 +1,122 @@ +use std::collections::BTreeMap; + +use ecow::EcoString; +use pdf_writer::types::AssociationKind; +use pdf_writer::{Filter, Finish, Name, Ref, Str, TextStr}; +use typst_library::diag::{bail, SourceResult}; +use typst_library::foundations::{NativeElement, Packed, StyleChain}; +use typst_library::pdf::{EmbedElem, EmbeddedFileRelationship}; + +use crate::catalog::{document_date, pdf_date}; +use crate::{deflate, NameExt, PdfChunk, StrExt, WithGlobalRefs}; + +/// Query for all [`EmbedElem`] and write them and their file specifications. +/// +/// This returns a map of embedding names and references so that we can later +/// add them to the catalog's `/Names` dictionary. +pub fn write_embedded_files( + ctx: &WithGlobalRefs, +) -> SourceResult<(PdfChunk, BTreeMap<EcoString, Ref>)> { + let mut chunk = PdfChunk::new(); + let mut embedded_files = BTreeMap::default(); + + let elements = ctx.document.introspector.query(&EmbedElem::elem().select()); + for elem in &elements { + if !ctx.options.standards.embedded_files { + // PDF/A-2 requires embedded files to be PDF/A-1 or PDF/A-2, + // which we don't currently check. + bail!( + elem.span(), + "file embeddings are not currently supported for PDF/A-2"; + hint: "PDF/A-3 supports arbitrary embedded files" + ); + } + + let embed = elem.to_packed::<EmbedElem>().unwrap(); + if embed.resolved_path.len() > Str::PDFA_LIMIT { + bail!(embed.span(), "embedded file path is too long"); + } + + let id = embed_file(ctx, &mut chunk, embed)?; + if embedded_files.insert(embed.resolved_path.clone(), id).is_some() { + bail!( + elem.span(), + "duplicate embedded file for path `{}`", embed.resolved_path; + hint: "embedded file paths must be unique", + ); + } + } + + Ok((chunk, embedded_files)) +} + +/// Write the embedded file stream and its file specification. +fn embed_file( + ctx: &WithGlobalRefs, + chunk: &mut PdfChunk, + embed: &Packed<EmbedElem>, +) -> SourceResult<Ref> { + let embedded_file_stream_ref = chunk.alloc.bump(); + let file_spec_dict_ref = chunk.alloc.bump(); + + let data = embed.data().as_slice(); + let compressed = deflate(data); + + let mut embedded_file = chunk.embedded_file(embedded_file_stream_ref, &compressed); + embedded_file.filter(Filter::FlateDecode); + + if let Some(mime_type) = embed.mime_type(StyleChain::default()) { + if mime_type.len() > Name::PDFA_LIMIT { + bail!(embed.span(), "embedded file MIME type is too long"); + } + embedded_file.subtype(Name(mime_type.as_bytes())); + } else if ctx.options.standards.pdfa { + bail!(embed.span(), "embedded files must have a MIME type in PDF/A-3"); + } + + let mut params = embedded_file.params(); + params.size(data.len() as i32); + + let (date, tz) = document_date(ctx.document.info.date, ctx.options.timestamp); + if let Some(pdf_date) = date.and_then(|date| pdf_date(date, tz)) { + params.modification_date(pdf_date); + } else if ctx.options.standards.pdfa { + bail!( + embed.span(), + "the document must have a date when embedding files in PDF/A-3"; + hint: "`set document(date: none)` must not be used in this case" + ); + } + + params.finish(); + embedded_file.finish(); + + let mut file_spec = chunk.file_spec(file_spec_dict_ref); + file_spec.path(Str(embed.resolved_path.as_bytes())); + file_spec.unic_file(TextStr(&embed.resolved_path)); + file_spec + .insert(Name(b"EF")) + .dict() + .pair(Name(b"F"), embedded_file_stream_ref) + .pair(Name(b"UF"), embedded_file_stream_ref); + + if ctx.options.standards.pdfa { + // PDF 2.0, but ISO 19005-3 (PDF/A-3) Annex E allows it for PDF/A-3. + file_spec.association_kind(match embed.relationship(StyleChain::default()) { + Some(EmbeddedFileRelationship::Source) => AssociationKind::Source, + Some(EmbeddedFileRelationship::Data) => AssociationKind::Data, + Some(EmbeddedFileRelationship::Alternative) => AssociationKind::Alternative, + Some(EmbeddedFileRelationship::Supplement) => AssociationKind::Supplement, + None => AssociationKind::Unspecified, + }); + } + + if let Some(description) = embed.description(StyleChain::default()) { + if description.len() > Str::PDFA_LIMIT { + bail!(embed.span(), "embedded file description is too long"); + } + file_spec.description(TextStr(description)); + } + + Ok(file_spec_dict_ref) +} diff --git a/crates/typst-pdf/src/lib.rs b/crates/typst-pdf/src/lib.rs index f45c62bb..88e62389 100644 --- a/crates/typst-pdf/src/lib.rs +++ b/crates/typst-pdf/src/lib.rs @@ -4,6 +4,7 @@ mod catalog; mod color; mod color_font; mod content; +mod embed; mod extg; mod font; mod gradient; @@ -14,12 +15,13 @@ mod page; mod resources; mod tiling; -use std::collections::HashMap; +use std::collections::{BTreeMap, HashMap}; use std::fmt::{self, Debug, Formatter}; use std::hash::Hash; use std::ops::{Deref, DerefMut}; use base64::Engine; +use ecow::EcoString; use pdf_writer::{Chunk, Name, Pdf, Ref, Str, TextStr}; use serde::{Deserialize, Serialize}; use typst_library::diag::{bail, SourceResult, StrResult}; @@ -33,6 +35,7 @@ use typst_utils::Deferred; use crate::catalog::write_catalog; use crate::color::{alloc_color_functions_refs, ColorFunctionRefs}; use crate::color_font::{write_color_fonts, ColorFontSlice}; +use crate::embed::write_embedded_files; use crate::extg::{write_graphic_states, ExtGState}; use crate::font::write_fonts; use crate::gradient::{write_gradients, PdfGradient}; @@ -67,6 +70,7 @@ pub fn pdf(document: &PagedDocument, options: &PdfOptions) -> SourceResult<Vec<u gradients: builder.run(write_gradients)?, tilings: builder.run(write_tilings)?, ext_gs: builder.run(write_graphic_states)?, + embedded_files: builder.run(write_embedded_files)?, }) })? .phase(|builder| builder.run(write_page_tree))? @@ -147,16 +151,34 @@ pub enum Timezone { /// Encapsulates a list of compatible PDF standards. #[derive(Clone)] pub struct PdfStandards { - /// For now, we simplify to just PDF/A, since we only support PDF/A-2b. But - /// it can be more fine-grained in the future. + /// For now, we simplify to just PDF/A. But it can be more fine-grained in + /// the future. pub(crate) pdfa: bool, + /// Whether the standard allows for embedding any kind of file into the PDF. + /// We disallow this for PDF/A-2, since it only allows embedding + /// PDF/A-1 and PDF/A-2 documents. + pub(crate) embedded_files: bool, + /// Part of the PDF/A standard. + pub(crate) pdfa_part: Option<(i32, &'static str)>, } impl PdfStandards { /// Validates a list of PDF standards for compatibility and returns their /// encapsulated representation. pub fn new(list: &[PdfStandard]) -> StrResult<Self> { - Ok(Self { pdfa: list.contains(&PdfStandard::A_2b) }) + let a2b = list.contains(&PdfStandard::A_2b); + let a3b = list.contains(&PdfStandard::A_3b); + + if a2b && a3b { + bail!("PDF cannot conform to A-2B and A-3B at the same time") + } + + let pdfa = a2b || a3b; + Ok(Self { + pdfa, + embedded_files: !a2b, + pdfa_part: pdfa.then_some((if a2b { 2 } else { 3 }, "B")), + }) } } @@ -166,10 +188,9 @@ impl Debug for PdfStandards { } } -#[allow(clippy::derivable_impls)] impl Default for PdfStandards { fn default() -> Self { - Self { pdfa: false } + Self { pdfa: false, embedded_files: true, pdfa_part: None } } } @@ -186,6 +207,9 @@ pub enum PdfStandard { /// PDF/A-2b. #[serde(rename = "a-2b")] A_2b, + /// PDF/A-3b. + #[serde(rename = "a-3b")] + A_3b, } /// A struct to build a PDF following a fixed succession of phases. @@ -316,6 +340,8 @@ struct References { tilings: HashMap<PdfTiling, Ref>, /// The IDs of written external graphics states. ext_gs: HashMap<ExtGState, Ref>, + /// The names and references for embedded files. + embedded_files: BTreeMap<EcoString, Ref>, } /// At this point, the references have been assigned to all resources. The page @@ -481,6 +507,14 @@ impl<T: Eq + Hash, R: Renumber> Renumber for HashMap<T, R> { } } +impl<T: Ord, R: Renumber> Renumber for BTreeMap<T, R> { + fn renumber(&mut self, offset: i32) { + for v in self.values_mut() { + v.renumber(offset); + } + } +} + impl<R: Renumber> Renumber for Option<R> { fn renumber(&mut self, offset: i32) { if let Some(r) = self { |
