From cb77d520bfd163ee8a63e8158bccb6b237df85fe Mon Sep 17 00:00:00 2001 From: Marat Radchenko Date: Tue, 18 Feb 2020 20:46:41 +0300 Subject: resolves #47 stop requiring specific include file scheme (PR #302) support conversion of article documents to a single-chapter file resolves #205 support special chapters like bibliography resolves #190 `basedir` now points to spine document directory when processing chapter files resolves #178 fix image and listing numbers being reset in each chapter resolves #166 fix xref resolving between sub-includes of chapter files resolves #151 add support for contentless include files resolves #136 drop nonstandard `<>` xref syntax and instead support vanilla `<>` or `<>` syntax resolves #206 properly include bibliography generated by asciidoctor-bibtex require Asciidoctor 1.5.6+ --- lib/asciidoctor-epub3.rb | 1 - lib/asciidoctor-epub3/converter.rb | 892 +++++++++++++++++++++----- lib/asciidoctor-epub3/packager.rb | 735 --------------------- lib/asciidoctor-epub3/spine_item_processor.rb | 93 --- 4 files changed, 724 insertions(+), 997 deletions(-) delete mode 100644 lib/asciidoctor-epub3/packager.rb delete mode 100644 lib/asciidoctor-epub3/spine_item_processor.rb (limited to 'lib') diff --git a/lib/asciidoctor-epub3.rb b/lib/asciidoctor-epub3.rb index 81017df..1ff6859 100644 --- a/lib/asciidoctor-epub3.rb +++ b/lib/asciidoctor-epub3.rb @@ -5,7 +5,6 @@ require 'asciidoctor/extensions' require 'gepub' require_relative 'asciidoctor-epub3/ext' require_relative 'asciidoctor-epub3/converter' -require_relative 'asciidoctor-epub3/packager' # We need to be able to write files with unicode names. See https://github.com/asciidoctor/asciidoctor-epub3/issues/217 ::Zip.unicode_names = true diff --git a/lib/asciidoctor-epub3/converter.rb b/lib/asciidoctor-epub3/converter.rb index 67fa444..271a6ce 100644 --- a/lib/asciidoctor-epub3/converter.rb +++ b/lib/asciidoctor-epub3/converter.rb @@ -1,6 +1,6 @@ # frozen_string_literal: true -require_relative 'spine_item_processor' +require 'open3' require_relative 'font_icon_map' module Asciidoctor @@ -14,56 +14,45 @@ module Asciidoctor register_for 'epub3' - def initialize backend, opts - super - basebackend 'html' - outfilesuffix '.epub' # dummy outfilesuffix since it may be .mobi - htmlsyntax 'xml' - @validate = false - @extract = false - @kindlegen_path = nil - @epubcheck_path = nil - end - - def convert node, name = nil - if (name ||= node.node_name) == 'document' - @validate = node.attr? 'ebook-validate' - @extract = node.attr? 'ebook-extract' - @compress = node.attr 'ebook-compress' - @kindlegen_path = node.attr 'ebook-kindlegen-path' - @epubcheck_path = node.attr 'ebook-epubcheck-path' - spine_items = node.references[:spine_items] - if spine_items.nil? - logger.error %(#{::File.basename node.document.attr('docfile')}: failed to find spine items, produced file will be invalid) - spine_items = [] + def write output, target + epub_file = @format == :kf8 ? %(#{::Asciidoctor::Helpers.rootname target}-kf8.epub) : target + output.generate_epub epub_file + logger.debug %(Wrote #{@format.upcase} to #{epub_file}) + if @extract + extract_dir = epub_file.sub EpubExtensionRx, '' + ::FileUtils.remove_dir extract_dir if ::File.directory? extract_dir + ::Dir.mkdir extract_dir + ::Dir.chdir extract_dir do + ::Zip::File.open epub_file do |entries| + entries.each do |entry| + next unless entry.file? + unless (entry_dir = ::File.dirname entry.name) == '.' || (::File.directory? entry_dir) + ::FileUtils.mkdir_p entry_dir + end + entry.extract + end + end end - Packager.new node, spine_items, node.attributes['ebook-format'].to_sym - # converting an element from the spine document, such as an inline node in the doctitle - elsif name.start_with? 'inline_' - (@content_converter ||= ::Asciidoctor::Converter::Factory.default.create 'epub3-xhtml5').convert node, name - else - raise ::ArgumentError, %(Encountered unexpected node in epub3 package converter: #{name}) + logger.debug %(Extracted #{@format.upcase} to #{extract_dir}) end - end - # FIXME: we have to package in write because we don't have access to target before this point - def write packager, target - packager.package validate: @validate, extract: @extract, compress: @compress, kindlegen_path: @kindlegen_path, epubcheck_path: @epubcheck_path, target: target - nil + if @format == :kf8 + # QUESTION shouldn't we validate this epub file too? + distill_epub_to_mobi epub_file, target, @compress, @kindlegen_path + elsif @validate + validate_epub epub_file, @epubcheck_path + end end - end - # Public: The converter for the epub3 backend that converts the individual - # content documents in an EPUB3 publication. - class ContentConverter - include ::Asciidoctor::Converter - include ::Asciidoctor::Logging + CsvDelimiterRx = /\s*,\s*/ - register_for 'epub3-xhtml5' + DATA_DIR = ::File.expand_path ::File.join(__dir__, '..', '..', 'data') + ImageMacroRx = /^image::?(.*?)\[(.*?)\]$/ + ImgSrcScanRx = / 'xsd:string' + + # NOTE we must use :plain_text here since gepub reencodes + @book.add_title sanitize_doctitle_xml(node, :plain_text), id: 'pub-title' + + # FIXME: this logic needs some work + if node.attr? 'publisher' + @book.publisher publisher_name = (node.attr 'publisher') + # marc role: Book producer (see http://www.loc.gov/marc/relators/relaterm.html) + @book.creator (node.attr 'producer', publisher_name), role: 'bkp' + elsif node.attr? 'producer' + # NOTE Use producer as both publisher and producer if publisher isn't specified + producer_name = node.attr 'producer' + @book.publisher producer_name + # marc role: Book producer (see http://www.loc.gov/marc/relators/relaterm.html) + @book.creator producer_name, role: 'bkp' + elsif node.attr? 'author' + # NOTE Use author as creator if both publisher or producer are absent + # marc role: Author (see http://www.loc.gov/marc/relators/relaterm.html) + @book.creator node.attr('author'), role: 'aut' + end + + if node.attr? 'creator' + # marc role: Creator (see http://www.loc.gov/marc/relators/relaterm.html) + @book.creator node.attr('creator'), role: 'cre' + else + # marc role: Manufacturer (see http://www.loc.gov/marc/relators/relaterm.html) + # QUESTION should this be bkp? + @book.creator 'Asciidoctor', role: 'mfr' + end + + if node.attr? 'reproducible' + # We need to set lastmodified to some fixed value. Otherwise, gepub will set it to current date. + @book.lastmodified = (::Time.at 0).utc + # Is it correct that we do not populate dc:date when 'reproducible' is set? + else + if node.attr? 'revdate' + begin + @book.date = node.attr 'revdate' + rescue ArgumentError => e + logger.error %(#{::File.basename node.attr('docfile')}: failed to parse revdate: #{e}) + @book.date = node.attr 'docdatetime' + end + else + @book.date = node.attr 'docdatetime' + end + @book.lastmodified = node.attr 'localdatetime' + end + + @book.description = node.attr 'description' if node.attr? 'description' + @book.source = node.attr 'source' if node.attr? 'source' + @book.rights = node.attr 'copyright' if node.attr? 'copyright' + + (node.attr 'keywords', '').split(CsvDelimiterRx).each do |s| + @book.metadata.add_metadata 'subject', s + end + + add_cover_image node + add_front_matter_page node + + if node.doctype == 'book' + toc_items = [] + node.sections.each do |item| + next unless item.parent == node + # Mark top-level sections as separate chapter files + item.set_attr 'ebook-chapter', item.id + toc_items << item + end + # TODO: this loses content between doc header and first chapter + node.content + else + toc_items = [node] + node.set_attr 'ebook-chapter', node.attr('docname') + add_chapter node + end + + nav_xhtml = @book.add_item 'nav.xhtml', content: postprocess_xhtml(nav_doc(node, toc_items)), id: 'nav' + nav_xhtml.nav + + # NOTE gepub doesn't support building a ncx TOC with depth > 1, so do it ourselves + toc_ncx = ncx_doc node, toc_items + @book.add_item 'toc.ncx', content: toc_ncx.to_ios, id: 'ncx' + + docimagesdir = (node.attr 'imagesdir', '.').chomp '/' + docimagesdir = (docimagesdir == '.' ? nil : %(#{docimagesdir}/)) + + @images.each do |image| + if image[:name].start_with? %(#{docimagesdir}jacket/cover.) + logger.warn %(image path is reserved for cover artwork: #{image[:name]}; skipping image found in content) + elsif ::File.readable? image[:path] + @book.add_item image[:name], content: image[:path] + else + logger.error %(#{File.basename node.attr('docfile')}: image not found or not readable: #{image[:path]}) + end + end + + #add_metadata 'ibooks:specified-fonts', true + + add_theme_assets node + if node.doctype != 'book' + usernames = [node].map {|item| item.attr 'username' }.compact.uniq + add_profile_images node, usernames + end + + @book + end - if (doctitle = node.doctitle partition: true, use_fallback: true).subtitle? - title = %(#{doctitle.main} ) - subtitle = doctitle.subtitle + # FIXME: move to Asciidoctor::Helpers + def sanitize_doctitle_xml doc, content_spec + doctitle = doc.doctitle use_fallback: true + sanitize_xml doctitle, content_spec + end + + # FIXME: move to Asciidoctor::Helpers + def sanitize_xml content, content_spec + if content_spec != :pcdata && (content.include? '<') + if (content = (content.gsub XmlElementRx, '').strip).include? ' ' + content = content.tr_s ' ', ' ' + end + end + + case content_spec + when :attribute_cdata + content = content.gsub '"', '"' if content.include? '"' + when :cdata, :pcdata + # noop + when :plain_text + if content.include? ';' + content = content.gsub(CharEntityRx) { [$1.to_i].pack 'U*' } if content.include? '&#' + content = content.gsub FromHtmlSpecialCharsRx, FromHtmlSpecialCharsMap + end + else + raise ::ArgumentError, %(Unknown content spec: #{content_spec}) + end + content + end + + def add_chapter node + docid = node.attr 'ebook-chapter' + + if node.context == :document + if (doctitle = node.doctitle partition: true, use_fallback: true).subtitle? + title = %(#{doctitle.main} ) + subtitle = doctitle.subtitle + else + # HACK: until we get proper handling of title-only in CSS + title = '' + subtitle = doctitle.combined + end else - # HACK: until we get proper handling of title-only in CSS title = '' - subtitle = doctitle.combined + subtitle = node.title end - doctitle_sanitized = (node.doctitle sanitize: true, use_fallback: true).to_s + doctitle_sanitized = (node.document.doctitle sanitize: true, use_fallback: true).to_s # By default, Kindle does not allow the line height to be adjusted. # But if you float the elements, then the line height disappears and can be restored manually using margins. # See https://github.com/asciidoctor/asciidoctor-epub3/issues/123 subtitle_formatted = subtitle.split.map {|w| %(#{w}) } * ' ' - if pubtype == 'book' + if node.document.doctype == 'book' byline = '' else author = node.attr 'author' username = node.attr 'username', 'default' - imagesdir = (node.references[:spine].attr 'imagesdir', '.').chomp '/' + imagesdir = (node.document.attr 'imagesdir', '.').chomp '/' imagesdir = imagesdir == '.' ? '' : %(#{imagesdir}/) byline = %(#{LF}) end - mark_last_paragraph node unless pubtype == 'book' - content = node.content + mark_last_paragraph node unless node.document.doctype == 'book' + + begin + @in_chapter = true + @xrefs_seen.clear + content = node.content + ensure + @in_chapter = false + end # NOTE must run after content is resolved # TODO perhaps create dynamic CSS file? @@ -154,7 +316,7 @@ module Asciidoctor # NOTE kindlegen seems to mangle the
element, so we wrap its content in a div lines = [%( - + #{doctitle_sanitized} @@ -179,12 +341,14 @@ document.addEventListener('DOMContentLoaded', function(event, reader) {
#{content})] - if node.footnotes? + unless (fns = node.document.footnotes - @footnotes).empty? + @footnotes += fns + # NOTE kindlegen seems to mangle the