summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Dan Allen <dan.j.allen@gmail.com> 2019-01-04 14:17:21 -0700
committer: Dan Allen <dan.j.allen@gmail.com> 2019-01-06 16:42:53 -0700
commit: 83619ac4e1b8207614c1500a0012d30af28db68b (patch)
tree: 56c7c3435102c9d853f34221194d564b48314d8c
parent: 555031d7dd583aad461546b0e6ee98dda02d1fa5 (diff)
read input File as string; rename Helpers.normalize_lines* methods
- read input into string instead of array
- rename Helpers.normalize_lines* methods to Helpers.prepare_source*
- update API docs
- add alias constant for UTF_8 encoding
- remove unused Helpers.prepare_source method
- remove workaround for encoding array to UTF-16LE
-rw-r--r-- lib/asciidoctor.rb 21
-rw-r--r-- lib/asciidoctor/abstract_node.rb 4
-rw-r--r-- lib/asciidoctor/block.rb 2
-rw-r--r-- lib/asciidoctor/cli/options.rb 2
-rw-r--r-- lib/asciidoctor/helpers.rb 67
-rw-r--r-- lib/asciidoctor/reader.rb 12
-rw-r--r-- test/invoker_test.rb 4
-rw-r--r-- test/reader_test.rb 19
8 files changed, 62 insertions, 69 deletions
diff --git a/lib/asciidoctor.rb b/lib/asciidoctor.rb
index 44fabd83..53516809 100644
--- a/lib/asciidoctor.rb
+++ b/lib/asciidoctor.rb
@@ -193,6 +193,9 @@ module Asciidoctor
# Maximum integer value for "boundless" operations; equal to MAX_SAFE_INTEGER in JavaScript
MAX_INT = 9007199254740991
+ # Alias UTF_8 encoding for convenience / speed
+ UTF_8 = ::Encoding::UTF_8
+
# Byte arrays for UTF-* Byte Order Marks
BOM_BYTES_UTF_8 = [0xef, 0xbb, 0xbf]
BOM_BYTES_UTF_16LE = [0xff, 0xfe]
@@ -1272,14 +1275,13 @@ module Asciidoctor
raise ::ArgumentError, %(illegal type for attributes option: #{attrs.class.ancestors.join ' < '})
end
- lines = nil
if ::File === input
# TODO cli checks if input path can be read and is file, but might want to add check to API
input_path = ::File.expand_path input.path
# See https://reproducible-builds.org/specs/source-date-epoch/
# NOTE Opal can't call key? on ENV
input_mtime = ::ENV['SOURCE_DATE_EPOCH'] ? ::Time.at(Integer ::ENV['SOURCE_DATE_EPOCH']).utc : input.mtime
- lines = input.readlines
+ source = input.read
# hold off on setting infile and indir until we get a better sense of their purpose
attrs['docfile'] = input_path
attrs['docdir'] = ::File.dirname input_path
@@ -1293,18 +1295,15 @@ module Asciidoctor
# %Z is OS dependent and may contain characters that aren't UTF-8 encoded (see asciidoctor#2770 and asciidoctor.js#23)
doctime = (attrs['doctime'] ||= input_mtime.strftime %(%T #{input_mtime.utc_offset == 0 ? 'UTC' : '%z'}))
attrs['docdatetime'] = %(#{docdate} #{doctime})
- elsif input.respond_to? :readlines
+ elsif input.respond_to? :read
# NOTE tty, pipes & sockets can't be rewound, but can't be sniffed easily either
# just fail the rewind operation silently to handle all cases
- begin
- input.rewind
- rescue
- end
- lines = input.readlines
+ input.rewind rescue nil
+ source = input.read
elsif ::String === input
- lines = input.lines
+ source = input
elsif ::Array === input
- lines = input.drop 0
+ source = input.drop 0
else
raise ::ArgumentError, %(unsupported input type: #{input.class})
end
@@ -1315,7 +1314,7 @@ module Asciidoctor
end
options[:attributes] = attrs
- doc = options[:parse] == false ? (Document.new lines, options) : (Document.new lines, options).parse
+ doc = options[:parse] == false ? (Document.new source, options) : (Document.new source, options).parse
timings.record :parse if timings
doc
diff --git a/lib/asciidoctor/abstract_node.rb b/lib/asciidoctor/abstract_node.rb
index 0bf6aaff..d63e3479 100644
--- a/lib/asciidoctor/abstract_node.rb
+++ b/lib/asciidoctor/abstract_node.rb
@@ -505,7 +505,7 @@ class AbstractNode
opts = { :warn_on_failure => (opts != false) } unless ::Hash === opts
if ::File.readable? path
if opts[:normalize]
- (Helpers.normalize_lines_array ::File.open(path, FILE_READ_MODE) {|f| f.each_line.to_a }).join LF
+ (Helpers.prepare_source_string ::File.open(path, FILE_READ_MODE) {|f| f.read }).join LF
else
# QUESTION should we chomp or rstrip content?
::IO.read path
@@ -540,7 +540,7 @@ class AbstractNode
Helpers.require_library 'open-uri/cached', 'open-uri-cached' if doc.attr? 'cache-uri'
begin
if opts[:normalize]
- (Helpers.normalize_lines_array ::OpenURI.open_uri(target, URI_READ_MODE) {|f| f.each_line.to_a }).join LF
+ (Helpers.prepare_source_string ::OpenURI.open_uri(target, URI_READ_MODE) {|f| f.read }).join LF
else
::OpenURI.open_uri(target, URI_READ_MODE) {|f| f.read }
end
diff --git a/lib/asciidoctor/block.rb b/lib/asciidoctor/block.rb
index b3057cbb..c915b0db 100644
--- a/lib/asciidoctor/block.rb
+++ b/lib/asciidoctor/block.rb
@@ -87,7 +87,7 @@ class Block < AbstractBlock
if (raw_source = opts[:source]).nil_or_empty?
@lines = []
elsif ::String === raw_source
- @lines = Helpers.normalize_lines_from_string raw_source
+ @lines = Helpers.prepare_source_string raw_source
else
@lines = raw_source.drop 0
end
diff --git a/lib/asciidoctor/cli/options.rb b/lib/asciidoctor/cli/options.rb
index 2c7f3e66..019c0fd5 100644
--- a/lib/asciidoctor/cli/options.rb
+++ b/lib/asciidoctor/cli/options.rb
@@ -81,7 +81,7 @@ Example: asciidoctor -b html5 source.asciidoc
opts.on('-a', '--attribute key[=value]', 'a document attribute to set in the form of key, key! or key=value pair',
'unless @ is appended to the value, this attributes takes precedence over attributes',
'defined in the source document') do |attr|
- attr = attr.encode ::Encoding::UTF_8 unless attr.encoding == ::Encoding::UTF_8
+ attr = attr.encode UTF_8 unless attr.encoding == UTF_8
key, val = attr.split '=', 2
self[:attributes][key] = val || ''
end
diff --git a/lib/asciidoctor/helpers.rb b/lib/asciidoctor/helpers.rb
index e57f8843..2bd0c5af 100644
--- a/lib/asciidoctor/helpers.rb
+++ b/lib/asciidoctor/helpers.rb
@@ -42,65 +42,58 @@ module Helpers
nil
end
- # Public: Normalize the data to prepare for parsing
+ # Public: Prepare the source data Array for parsing.
#
- # Delegates to Helpers#normalize_lines_from_string if data is a String.
- # Delegates to Helpers#normalize_lines_array if data is a String Array.
+ # Encodes the data to UTF-8, if necessary, and removes any trailing
+ # whitespace from every line.
#
- # returns a String Array of normalized lines
- def self.normalize_lines data
- ::String === data ? (normalize_lines_from_string data) : (normalize_lines_array data)
- end
-
- # Public: Normalize the array of lines to prepare them for parsing
- #
- # Encodes the data to UTF-8 and removes trailing whitespace from each line.
+ # If a BOM is found at the beginning of the data, a best attempt is made to
+ # encode it to UTF-8 from the specified source encoding.
#
- # If a BOM is present at the beginning of the data, a best attempt
- # is made to encode from the specified encoding to UTF-8.
+ # data - the source data Array to prepare (no nil entries allowed)
#
- # data - a String Array of lines to normalize
- #
- # returns a String Array of normalized lines
- def self.normalize_lines_array data
- return data if data.empty?
- utf8 = ::Encoding::UTF_8
+ # returns a String Array of prepared lines
+ def self.prepare_source_array data
+ return [] if data.empty?
if (leading_2_bytes = (leading_bytes = (first = data[0]).unpack 'C3').slice 0, 2) == BOM_BYTES_UTF_16LE
data[0] = first.byteslice 2, first.bytesize
- # HACK Ruby messes up trailing whitespace on UTF-16LE, so encode whole document first; assume newlines are present
- return (data.join.encode utf8, ::Encoding::UTF_16LE).lines.map {|line| line.rstrip }
+ # NOTE you can't split a UTF-16LE string using .lines when encoding is UTF-8; doing so will cause this line to fail
+ return data.map {|line| (line.encode UTF_8, ::Encoding::UTF_16LE).rstrip }
elsif leading_2_bytes == BOM_BYTES_UTF_16BE
data[0] = first.byteslice 2, first.bytesize
- return data.map {|line| (line.encode utf8, ::Encoding::UTF_16BE).rstrip }
+ return data.map {|line| (line.encode UTF_8, ::Encoding::UTF_16BE).rstrip }
elsif leading_bytes == BOM_BYTES_UTF_8
data[0] = first.byteslice 3, first.bytesize
end
- data.map {|line| (line.encoding == utf8 ? line : (line.encode utf8)).rstrip }
+ if first.encoding == UTF_8
+ data.map {|line| line.rstrip }
+ else
+ data.map {|line| (line.encode UTF_8).rstrip }
+ end
end
- # Public: Normalize the String and split into lines to prepare them for parsing
+ # Public: Prepare the source data String for parsing.
#
- # Encodes the data to UTF-8, converts the data to a String Array, and removes
- # trailing whitespace from each line.
+ # Encodes the data to UTF-8, if necessary, splits it into an array, and
+ # removes any trailing whitespace from every line.
#
- # If a BOM is present at the beginning of the data, a best attempt
- # is made to encode from the specified encoding to UTF-8.
+ # If a BOM is found at the beginning of the data, a best attempt is made to
+ # encode it to UTF-8 from the specified source encoding.
#
- # data - a String of lines to normalize
+ # data - the source data String to prepare
#
- # returns a String Array of normalized lines
- def self.normalize_lines_from_string data
+ # returns a String Array of prepared lines
+ def self.prepare_source_string data
return [] if data.nil_or_empty?
- utf8 = ::Encoding::UTF_8
if (leading_2_bytes = (leading_bytes = data.unpack 'C3').slice 0, 2) == BOM_BYTES_UTF_16LE
- data = (data.byteslice 2, data.bytesize).encode utf8, ::Encoding::UTF_16LE
+ data = (data.byteslice 2, data.bytesize).encode UTF_8, ::Encoding::UTF_16LE
elsif leading_2_bytes == BOM_BYTES_UTF_16BE
- data = (data.byteslice 2, data.bytesize).encode utf8, ::Encoding::UTF_16BE
+ data = (data.byteslice 2, data.bytesize).encode UTF_8, ::Encoding::UTF_16BE
elsif leading_bytes == BOM_BYTES_UTF_8
data = data.byteslice 3, data.bytesize
- data = data.encode utf8 unless data.encoding == utf8
- elsif data.encoding != utf8
- data = data.encode utf8
+ data = data.encode UTF_8 unless data.encoding == UTF_8
+ elsif data.encoding != UTF_8
+ data = data.encode UTF_8
end
data.lines.map {|line| line.rstrip }
end
diff --git a/lib/asciidoctor/reader.rb b/lib/asciidoctor/reader.rb
index 59fd2a6d..985ac51d 100644
--- a/lib/asciidoctor/reader.rb
+++ b/lib/asciidoctor/reader.rb
@@ -82,14 +82,10 @@ class Reader
#
# Returns The String lines extracted from the data
def prepare_lines data, opts = {}
- if ::String === data
- if opts[:normalize]
- Helpers.normalize_lines_from_string data
- else
- data.split LF, -1
- end
- elsif opts[:normalize]
- Helpers.normalize_lines_array data
+ if opts[:normalize]
+ ::String === data ? (Helpers.prepare_source_string data) : (Helpers.prepare_source_array data)
+ elsif ::String === data
+ data.split LF, -1
else
data.drop 0
end
diff --git a/test/invoker_test.rb b/test/invoker_test.rb
index 1c85b3c3..1398ef9c 100644
--- a/test/invoker_test.rb
+++ b/test/invoker_test.rb
@@ -71,8 +71,8 @@ context 'Invoker' do
test 'should not fail to rewind input if reading document from stdin' do
io = STDIN.dup
class << io
- def readlines
- ['paragraph']
+ def read
+ 'paragraph'
end
end
invoker = invoke_cli_to_buffer(%w(-s), '-') { io }
diff --git a/test/reader_test.rb b/test/reader_test.rb
index f31bdfb4..ed03084e 100644
--- a/test/reader_test.rb
+++ b/test/reader_test.rb
@@ -54,7 +54,10 @@ third line
test 'should encode UTF-16LE string array to UTF-8 when BOM is found' do
['UTF-8', 'ASCII-8BIT'].each do |start_encoding|
- data = "\ufeff#{SAMPLE_DATA.join ::Asciidoctor::LF}".encode('UTF-16LE').force_encoding(start_encoding).lines.to_a
+ # NOTE can't split a UTF-16LE string using .lines when encoding is set to UTF-8
+ data = SAMPLE_DATA.dup
+ data.unshift %(\ufeff#{data.shift})
+ data.each {|line| (line.encode 'UTF-16LE').force_encoding start_encoding }
reader = Asciidoctor::Reader.new data, nil, :normalize => true
assert_equal Encoding::UTF_8, reader.lines[0].encoding
assert_equal 'f', reader.lines[0].chr
@@ -74,7 +77,9 @@ third line
test 'should encode UTF-16BE string array to UTF-8 when BOM is found' do
['UTF-8', 'ASCII-8BIT'].each do |start_encoding|
- data = "\ufeff#{SAMPLE_DATA.join ::Asciidoctor::LF}".encode('UTF-16BE').force_encoding(start_encoding).lines.to_a
+ data = SAMPLE_DATA.dup
+ data.unshift %(\ufeff#{data.shift})
+ data = data.map {|line| (line.encode 'UTF-16BE').force_encoding start_encoding }
reader = Asciidoctor::Reader.new data, nil, :normalize => true
assert_equal Encoding::UTF_8, reader.lines[0].encoding
assert_equal 'f', reader.lines[0].chr
@@ -381,7 +386,7 @@ This is a paragraph outside the block.
end
test 'read lines until terminator' do
- lines = <<-EOS.each_line.to_a
+ lines = <<-EOS.lines
****
captured
@@ -402,7 +407,7 @@ not captured
end
test 'should flag reader as unterminated if reader reaches end of source without finding terminator' do
- lines = <<-EOS.each_line.to_a
+ lines = <<-EOS.lines
****
captured
@@ -469,7 +474,7 @@ CRLF\r
endlines\r
EOS
- [input, input.lines.to_a, input.split(::Asciidoctor::LF), input.split(::Asciidoctor::LF).join(::Asciidoctor::LF)].each do |lines|
+ [input, input.lines, input.split(::Asciidoctor::LF), input.split(::Asciidoctor::LF).join(::Asciidoctor::LF)].each do |lines|
doc = Asciidoctor::Document.new lines
reader = doc.reader
reader.lines.each do |line|
@@ -1504,7 +1509,7 @@ include::fixtures/parent-include-restricted.adoc[depth=3]
end
test 'read_lines_until should not process lines if process option is false' do
- lines = <<-EOS.each_line.to_a
+ lines = <<-EOS.lines
////
include::fixtures/no-such-file.adoc[]
////
@@ -1518,7 +1523,7 @@ include::fixtures/no-such-file.adoc[]
end
test 'skip_comment_lines should not process lines read' do
- lines = <<-EOS.each_line.to_a
+ lines = <<-EOS.lines
////
include::fixtures/no-such-file.adoc[]
////