diff options
| author | Dan Allen <dan.j.allen@gmail.com> | 2019-01-04 14:17:21 -0700 |
|---|---|---|
| committer | Dan Allen <dan.j.allen@gmail.com> | 2019-01-06 16:42:53 -0700 |
| commit | 83619ac4e1b8207614c1500a0012d30af28db68b (patch) | |
| tree | 56c7c3435102c9d853f34221194d564b48314d8c | |
| parent | 555031d7dd583aad461546b0e6ee98dda02d1fa5 (diff) | |
read input File as string; rename Helpers.normalize_lines* methods
- read input into string instead of array
- rename Helpers.normalize_lines* methods to Helpers.prepare_source*
- update API docs
- add alias constant for UTF_8 encoding
- remove unused Helpers.normalize_lines method (it is dropped rather than renamed to Helpers.prepare_source)
- remove workaround for encoding a UTF-16LE array to UTF-8
| -rw-r--r-- | lib/asciidoctor.rb | 21 | ||||
| -rw-r--r-- | lib/asciidoctor/abstract_node.rb | 4 | ||||
| -rw-r--r-- | lib/asciidoctor/block.rb | 2 | ||||
| -rw-r--r-- | lib/asciidoctor/cli/options.rb | 2 | ||||
| -rw-r--r-- | lib/asciidoctor/helpers.rb | 67 | ||||
| -rw-r--r-- | lib/asciidoctor/reader.rb | 12 | ||||
| -rw-r--r-- | test/invoker_test.rb | 4 | ||||
| -rw-r--r-- | test/reader_test.rb | 19 |
8 files changed, 62 insertions, 69 deletions
diff --git a/lib/asciidoctor.rb b/lib/asciidoctor.rb index 44fabd83..53516809 100644 --- a/lib/asciidoctor.rb +++ b/lib/asciidoctor.rb @@ -193,6 +193,9 @@ module Asciidoctor # Maximum integer value for "boundless" operations; equal to MAX_SAFE_INTEGER in JavaScript MAX_INT = 9007199254740991 + # Alias UTF_8 encoding for convenience / speed + UTF_8 = ::Encoding::UTF_8 + # Byte arrays for UTF-* Byte Order Marks BOM_BYTES_UTF_8 = [0xef, 0xbb, 0xbf] BOM_BYTES_UTF_16LE = [0xff, 0xfe] @@ -1272,14 +1275,13 @@ module Asciidoctor raise ::ArgumentError, %(illegal type for attributes option: #{attrs.class.ancestors.join ' < '}) end - lines = nil if ::File === input # TODO cli checks if input path can be read and is file, but might want to add check to API input_path = ::File.expand_path input.path # See https://reproducible-builds.org/specs/source-date-epoch/ # NOTE Opal can't call key? on ENV input_mtime = ::ENV['SOURCE_DATE_EPOCH'] ? ::Time.at(Integer ::ENV['SOURCE_DATE_EPOCH']).utc : input.mtime - lines = input.readlines + source = input.read # hold off on setting infile and indir until we get a better sense of their purpose attrs['docfile'] = input_path attrs['docdir'] = ::File.dirname input_path @@ -1293,18 +1295,15 @@ module Asciidoctor # %Z is OS dependent and may contain characters that aren't UTF-8 encoded (see asciidoctor#2770 and asciidoctor.js#23) doctime = (attrs['doctime'] ||= input_mtime.strftime %(%T #{input_mtime.utc_offset == 0 ? 'UTC' : '%z'})) attrs['docdatetime'] = %(#{docdate} #{doctime}) - elsif input.respond_to? :readlines + elsif input.respond_to? 
:read # NOTE tty, pipes & sockets can't be rewound, but can't be sniffed easily either # just fail the rewind operation silently to handle all cases - begin - input.rewind - rescue - end - lines = input.readlines + input.rewind rescue nil + source = input.read elsif ::String === input - lines = input.lines + source = input elsif ::Array === input - lines = input.drop 0 + source = input.drop 0 else raise ::ArgumentError, %(unsupported input type: #{input.class}) end @@ -1315,7 +1314,7 @@ module Asciidoctor end options[:attributes] = attrs - doc = options[:parse] == false ? (Document.new lines, options) : (Document.new lines, options).parse + doc = options[:parse] == false ? (Document.new source, options) : (Document.new source, options).parse timings.record :parse if timings doc diff --git a/lib/asciidoctor/abstract_node.rb b/lib/asciidoctor/abstract_node.rb index 0bf6aaff..d63e3479 100644 --- a/lib/asciidoctor/abstract_node.rb +++ b/lib/asciidoctor/abstract_node.rb @@ -505,7 +505,7 @@ class AbstractNode opts = { :warn_on_failure => (opts != false) } unless ::Hash === opts if ::File.readable? path if opts[:normalize] - (Helpers.normalize_lines_array ::File.open(path, FILE_READ_MODE) {|f| f.each_line.to_a }).join LF + (Helpers.prepare_source_string ::File.open(path, FILE_READ_MODE) {|f| f.read }).join LF else # QUESTION should we chomp or rstrip content? ::IO.read path @@ -540,7 +540,7 @@ class AbstractNode Helpers.require_library 'open-uri/cached', 'open-uri-cached' if doc.attr? 
'cache-uri' begin if opts[:normalize] - (Helpers.normalize_lines_array ::OpenURI.open_uri(target, URI_READ_MODE) {|f| f.each_line.to_a }).join LF + (Helpers.prepare_source_string ::OpenURI.open_uri(target, URI_READ_MODE) {|f| f.read }).join LF else ::OpenURI.open_uri(target, URI_READ_MODE) {|f| f.read } end diff --git a/lib/asciidoctor/block.rb b/lib/asciidoctor/block.rb index b3057cbb..c915b0db 100644 --- a/lib/asciidoctor/block.rb +++ b/lib/asciidoctor/block.rb @@ -87,7 +87,7 @@ class Block < AbstractBlock if (raw_source = opts[:source]).nil_or_empty? @lines = [] elsif ::String === raw_source - @lines = Helpers.normalize_lines_from_string raw_source + @lines = Helpers.prepare_source_string raw_source else @lines = raw_source.drop 0 end diff --git a/lib/asciidoctor/cli/options.rb b/lib/asciidoctor/cli/options.rb index 2c7f3e66..019c0fd5 100644 --- a/lib/asciidoctor/cli/options.rb +++ b/lib/asciidoctor/cli/options.rb @@ -81,7 +81,7 @@ Example: asciidoctor -b html5 source.asciidoc opts.on('-a', '--attribute key[=value]', 'a document attribute to set in the form of key, key! or key=value pair', 'unless @ is appended to the value, this attributes takes precedence over attributes', 'defined in the source document') do |attr| - attr = attr.encode ::Encoding::UTF_8 unless attr.encoding == ::Encoding::UTF_8 + attr = attr.encode UTF_8 unless attr.encoding == UTF_8 key, val = attr.split '=', 2 self[:attributes][key] = val || '' end diff --git a/lib/asciidoctor/helpers.rb b/lib/asciidoctor/helpers.rb index e57f8843..2bd0c5af 100644 --- a/lib/asciidoctor/helpers.rb +++ b/lib/asciidoctor/helpers.rb @@ -42,65 +42,58 @@ module Helpers nil end - # Public: Normalize the data to prepare for parsing + # Public: Prepare the source data Array for parsing. # - # Delegates to Helpers#normalize_lines_from_string if data is a String. - # Delegates to Helpers#normalize_lines_array if data is a String Array. 
+ # Encodes the data to UTF-8, if necessary, and removes any trailing + # whitespace from every line. # - # returns a String Array of normalized lines - def self.normalize_lines data - ::String === data ? (normalize_lines_from_string data) : (normalize_lines_array data) - end - - # Public: Normalize the array of lines to prepare them for parsing - # - # Encodes the data to UTF-8 and removes trailing whitespace from each line. + # If a BOM is found at the beginning of the data, a best attempt is made to + # encode it to UTF-8 from the specified source encoding. # - # If a BOM is present at the beginning of the data, a best attempt - # is made to encode from the specified encoding to UTF-8. + # data - the source data Array to prepare (no nil entries allowed) # - # data - a String Array of lines to normalize - # - # returns a String Array of normalized lines - def self.normalize_lines_array data - return data if data.empty? - utf8 = ::Encoding::UTF_8 + # returns a String Array of prepared lines + def self.prepare_source_array data + return [] if data.empty? if (leading_2_bytes = (leading_bytes = (first = data[0]).unpack 'C3').slice 0, 2) == BOM_BYTES_UTF_16LE data[0] = first.byteslice 2, first.bytesize - # HACK Ruby messes up trailing whitespace on UTF-16LE, so encode whole document first; assume newlines are present - return (data.join.encode utf8, ::Encoding::UTF_16LE).lines.map {|line| line.rstrip } + # NOTE you can't split a UTF-16LE string using .lines when encoding is UTF-8; doing so will cause this line to fail + return data.map {|line| (line.encode UTF_8, ::Encoding::UTF_16LE).rstrip } elsif leading_2_bytes == BOM_BYTES_UTF_16BE data[0] = first.byteslice 2, first.bytesize - return data.map {|line| (line.encode utf8, ::Encoding::UTF_16BE).rstrip } + return data.map {|line| (line.encode UTF_8, ::Encoding::UTF_16BE).rstrip } elsif leading_bytes == BOM_BYTES_UTF_8 data[0] = first.byteslice 3, first.bytesize end - data.map {|line| (line.encoding == utf8 ? 
line : (line.encode utf8)).rstrip } + if first.encoding == UTF_8 + data.map {|line| line.rstrip } + else + data.map {|line| (line.encode UTF_8).rstrip } + end end - # Public: Normalize the String and split into lines to prepare them for parsing + # Public: Prepare the source data String for parsing. # - # Encodes the data to UTF-8, converts the data to a String Array, and removes - # trailing whitespace from each line. + # Encodes the data to UTF-8, if necessary, splits it into an array, and + # removes any trailing whitespace from every line. # - # If a BOM is present at the beginning of the data, a best attempt - # is made to encode from the specified encoding to UTF-8. + # If a BOM is found at the beginning of the data, a best attempt is made to + # encode it to UTF-8 from the specified source encoding. # - # data - a String of lines to normalize + # data - the source data String to prepare # - # returns a String Array of normalized lines - def self.normalize_lines_from_string data + # returns a String Array of prepared lines + def self.prepare_source_string data return [] if data.nil_or_empty? 
- utf8 = ::Encoding::UTF_8 if (leading_2_bytes = (leading_bytes = data.unpack 'C3').slice 0, 2) == BOM_BYTES_UTF_16LE - data = (data.byteslice 2, data.bytesize).encode utf8, ::Encoding::UTF_16LE + data = (data.byteslice 2, data.bytesize).encode UTF_8, ::Encoding::UTF_16LE elsif leading_2_bytes == BOM_BYTES_UTF_16BE - data = (data.byteslice 2, data.bytesize).encode utf8, ::Encoding::UTF_16BE + data = (data.byteslice 2, data.bytesize).encode UTF_8, ::Encoding::UTF_16BE elsif leading_bytes == BOM_BYTES_UTF_8 data = data.byteslice 3, data.bytesize - data = data.encode utf8 unless data.encoding == utf8 - elsif data.encoding != utf8 - data = data.encode utf8 + data = data.encode UTF_8 unless data.encoding == UTF_8 + elsif data.encoding != UTF_8 + data = data.encode UTF_8 end data.lines.map {|line| line.rstrip } end diff --git a/lib/asciidoctor/reader.rb b/lib/asciidoctor/reader.rb index 59fd2a6d..985ac51d 100644 --- a/lib/asciidoctor/reader.rb +++ b/lib/asciidoctor/reader.rb @@ -82,14 +82,10 @@ class Reader # # Returns The String lines extracted from the data def prepare_lines data, opts = {} - if ::String === data - if opts[:normalize] - Helpers.normalize_lines_from_string data - else - data.split LF, -1 - end - elsif opts[:normalize] - Helpers.normalize_lines_array data + if opts[:normalize] + ::String === data ? 
(Helpers.prepare_source_string data) : (Helpers.prepare_source_array data) + elsif ::String === data + data.split LF, -1 else data.drop 0 end diff --git a/test/invoker_test.rb b/test/invoker_test.rb index 1c85b3c3..1398ef9c 100644 --- a/test/invoker_test.rb +++ b/test/invoker_test.rb @@ -71,8 +71,8 @@ context 'Invoker' do test 'should not fail to rewind input if reading document from stdin' do io = STDIN.dup class << io - def readlines - ['paragraph'] + def read + 'paragraph' end end invoker = invoke_cli_to_buffer(%w(-s), '-') { io } diff --git a/test/reader_test.rb b/test/reader_test.rb index f31bdfb4..ed03084e 100644 --- a/test/reader_test.rb +++ b/test/reader_test.rb @@ -54,7 +54,10 @@ third line test 'should encode UTF-16LE string array to UTF-8 when BOM is found' do ['UTF-8', 'ASCII-8BIT'].each do |start_encoding| - data = "\ufeff#{SAMPLE_DATA.join ::Asciidoctor::LF}".encode('UTF-16LE').force_encoding(start_encoding).lines.to_a + # NOTE can't split a UTF-16LE string using .lines when encoding is set to UTF-8 + data = SAMPLE_DATA.dup + data.unshift %(\ufeff#{data.shift}) + data.each {|line| (line.encode 'UTF-16LE').force_encoding start_encoding } reader = Asciidoctor::Reader.new data, nil, :normalize => true assert_equal Encoding::UTF_8, reader.lines[0].encoding assert_equal 'f', reader.lines[0].chr @@ -74,7 +77,9 @@ third line test 'should encode UTF-16BE string array to UTF-8 when BOM is found' do ['UTF-8', 'ASCII-8BIT'].each do |start_encoding| - data = "\ufeff#{SAMPLE_DATA.join ::Asciidoctor::LF}".encode('UTF-16BE').force_encoding(start_encoding).lines.to_a + data = SAMPLE_DATA.dup + data.unshift %(\ufeff#{data.shift}) + data = data.map {|line| (line.encode 'UTF-16BE').force_encoding start_encoding } reader = Asciidoctor::Reader.new data, nil, :normalize => true assert_equal Encoding::UTF_8, reader.lines[0].encoding assert_equal 'f', reader.lines[0].chr @@ -381,7 +386,7 @@ This is a paragraph outside the block. 
end test 'read lines until terminator' do - lines = <<-EOS.each_line.to_a + lines = <<-EOS.lines **** captured @@ -402,7 +407,7 @@ not captured end test 'should flag reader as unterminated if reader reaches end of source without finding terminator' do - lines = <<-EOS.each_line.to_a + lines = <<-EOS.lines **** captured @@ -469,7 +474,7 @@ CRLF\r endlines\r EOS - [input, input.lines.to_a, input.split(::Asciidoctor::LF), input.split(::Asciidoctor::LF).join(::Asciidoctor::LF)].each do |lines| + [input, input.lines, input.split(::Asciidoctor::LF), input.split(::Asciidoctor::LF).join(::Asciidoctor::LF)].each do |lines| doc = Asciidoctor::Document.new lines reader = doc.reader reader.lines.each do |line| @@ -1504,7 +1509,7 @@ include::fixtures/parent-include-restricted.adoc[depth=3] end test 'read_lines_until should not process lines if process option is false' do - lines = <<-EOS.each_line.to_a + lines = <<-EOS.lines //// include::fixtures/no-such-file.adoc[] //// @@ -1518,7 +1523,7 @@ include::fixtures/no-such-file.adoc[] end test 'skip_comment_lines should not process lines read' do - lines = <<-EOS.each_line.to_a + lines = <<-EOS.lines //// include::fixtures/no-such-file.adoc[] //// |
