read input File as string; rename Helpers.normalize_lines* methods

- read input into a string instead of an array
- rename Helpers.normalize_lines* methods to Helpers.prepare_source*
- update API docs
- add alias constant for UTF_8 encoding
- remove unused Helpers.normalize_lines method (the String/Array dispatcher)
- remove workaround for encoding a UTF-16LE array to UTF-8
author: Dan Allen <dan.j.allen@gmail.com> 2019-01-04 14:17:21 -0700
committer: Dan Allen <dan.j.allen@gmail.com> 2019-01-06 16:42:53 -0700
commit: 83619ac4e1b8207614c1500a0012d30af28db68b (patch)
tree: 56c7c3435102c9d853f34221194d564b48314d8c
parent: 555031d7dd583aad461546b0e6ee98dda02d1fa5 (diff)
8 files changed, 62 insertions, 69 deletions
diff --git a/lib/asciidoctor.rb b/lib/asciidoctor.rb
index 44fabd83..53516809 100644
--- a/lib/asciidoctor.rb
+++ b/lib/asciidoctor.rb
@@ -193,6 +193,9 @@ module Asciidoctor
   # Maximum integer value for "boundless" operations; equal to MAX_SAFE_INTEGER in JavaScript
   MAX_INT = 9007199254740991
 
+  # Alias UTF_8 encoding for convenience / speed
+  UTF_8 = ::Encoding::UTF_8
+
   # Byte arrays for UTF-* Byte Order Marks
   BOM_BYTES_UTF_8 = [0xef, 0xbb, 0xbf]
   BOM_BYTES_UTF_16LE = [0xff, 0xfe]
@@ -1272,14 +1275,13 @@ module Asciidoctor
       raise ::ArgumentError, %(illegal type for attributes option: #{attrs.class.ancestors.join ' < '})
     end
 
-    lines = nil
     if ::File === input
       # TODO cli checks if input path can be read and is file, but might want to add check to API
       input_path = ::File.expand_path input.path
       # See https://reproducible-builds.org/specs/source-date-epoch/
       # NOTE Opal can't call key? on ENV
       input_mtime = ::ENV['SOURCE_DATE_EPOCH'] ? ::Time.at(Integer ::ENV['SOURCE_DATE_EPOCH']).utc : input.mtime
-      lines = input.readlines
+      source = input.read
       # hold off on setting infile and indir until we get a better sense of their purpose
       attrs['docfile'] = input_path
       attrs['docdir'] = ::File.dirname input_path
@@ -1293,18 +1295,15 @@ module Asciidoctor
       # %Z is OS dependent and may contain characters that aren't UTF-8 encoded (see asciidoctor#2770 and asciidoctor.js#23)
       doctime = (attrs['doctime'] ||= input_mtime.strftime %(%T #{input_mtime.utc_offset == 0 ? 'UTC' : '%z'}))
       attrs['docdatetime'] = %(#{docdate} #{doctime})
-    elsif input.respond_to? :readlines
+    elsif input.respond_to? :read
       # NOTE tty, pipes & sockets can't be rewound, but can't be sniffed easily either
       # just fail the rewind operation silently to handle all cases
-      begin
-        input.rewind
-      rescue
-      end
-      lines = input.readlines
+      input.rewind rescue nil
+      source = input.read
     elsif ::String === input
-      lines = input.lines
+      source = input
     elsif ::Array === input
-      lines = input.drop 0
+      source = input.drop 0
     else
       raise ::ArgumentError, %(unsupported input type: #{input.class})
     end
@@ -1315,7 +1314,7 @@ module Asciidoctor
     end
 
     options[:attributes] = attrs
-    doc = options[:parse] == false ? (Document.new lines, options) : (Document.new lines, options).parse
+    doc = options[:parse] == false ? (Document.new source, options) : (Document.new source, options).parse
 
     timings.record :parse if timings
     doc
diff --git a/lib/asciidoctor/abstract_node.rb b/lib/asciidoctor/abstract_node.rb
index 0bf6aaff..d63e3479 100644
--- a/lib/asciidoctor/abstract_node.rb
+++ b/lib/asciidoctor/abstract_node.rb
@@ -505,7 +505,7 @@ class AbstractNode
     opts = { :warn_on_failure => (opts != false) } unless ::Hash === opts
     if ::File.readable? path
       if opts[:normalize]
-        (Helpers.normalize_lines_array ::File.open(path, FILE_READ_MODE) {|f| f.each_line.to_a }).join LF
+        (Helpers.prepare_source_string ::File.open(path, FILE_READ_MODE) {|f| f.read }).join LF
       else
         # QUESTION should we chomp or rstrip content?
         ::IO.read path
@@ -540,7 +540,7 @@ class AbstractNode
         Helpers.require_library 'open-uri/cached', 'open-uri-cached' if doc.attr? 'cache-uri'
         begin
           if opts[:normalize]
-            (Helpers.normalize_lines_array ::OpenURI.open_uri(target, URI_READ_MODE) {|f| f.each_line.to_a }).join LF
+            (Helpers.prepare_source_string ::OpenURI.open_uri(target, URI_READ_MODE) {|f| f.read }).join LF
           else
             ::OpenURI.open_uri(target, URI_READ_MODE) {|f| f.read }
           end
diff --git a/lib/asciidoctor/block.rb b/lib/asciidoctor/block.rb
index b3057cbb..c915b0db 100644
--- a/lib/asciidoctor/block.rb
+++ b/lib/asciidoctor/block.rb
@@ -87,7 +87,7 @@ class Block < AbstractBlock
     if (raw_source = opts[:source]).nil_or_empty?
       @lines = []
     elsif ::String === raw_source
-      @lines = Helpers.normalize_lines_from_string raw_source
+      @lines = Helpers.prepare_source_string raw_source
     else
       @lines = raw_source.drop 0
     end
diff --git a/lib/asciidoctor/cli/options.rb b/lib/asciidoctor/cli/options.rb
index 2c7f3e66..019c0fd5 100644
--- a/lib/asciidoctor/cli/options.rb
+++ b/lib/asciidoctor/cli/options.rb
@@ -81,7 +81,7 @@ Example: asciidoctor -b html5 source.asciidoc
           opts.on('-a', '--attribute key[=value]', 'a document attribute to set in the form of key, key! or key=value pair',
                   'unless @ is appended to the value, this attributes takes precedence over attributes',
                   'defined in the source document') do |attr|
-            attr = attr.encode ::Encoding::UTF_8 unless attr.encoding == ::Encoding::UTF_8
+            attr = attr.encode UTF_8 unless attr.encoding == UTF_8
             key, val = attr.split '=', 2
             self[:attributes][key] = val || ''
           end
diff --git a/lib/asciidoctor/helpers.rb b/lib/asciidoctor/helpers.rb
index e57f8843..2bd0c5af 100644
--- a/lib/asciidoctor/helpers.rb
+++ b/lib/asciidoctor/helpers.rb
@@ -42,65 +42,58 @@ module Helpers
     nil
   end
 
-  # Public: Normalize the data to prepare for parsing
+  # Public: Prepare the source data Array for parsing.
   #
-  # Delegates to Helpers#normalize_lines_from_string if data is a String.
-  # Delegates to Helpers#normalize_lines_array if data is a String Array.
+  # Encodes the data to UTF-8, if necessary, and removes any trailing
+  # whitespace from every line.
   #
-  # returns a String Array of normalized lines
-  def self.normalize_lines data
-    ::String === data ? (normalize_lines_from_string data) : (normalize_lines_array data)
-  end
-
-  # Public: Normalize the array of lines to prepare them for parsing
-  #
-  # Encodes the data to UTF-8 and removes trailing whitespace from each line.
+  # If a BOM is found at the beginning of the data, a best attempt is made to
+  # encode it to UTF-8 from the specified source encoding.
   #
-  # If a BOM is present at the beginning of the data, a best attempt
-  # is made to encode from the specified encoding to UTF-8.
+  # data - the source data Array to prepare (no nil entries allowed)
   #
-  # data - a String Array of lines to normalize
-  #
-  # returns a String Array of normalized lines
-  def self.normalize_lines_array data
-    return data if data.empty?
-    utf8 = ::Encoding::UTF_8
+  # returns a String Array of prepared lines
+  def self.prepare_source_array data
+    return [] if data.empty?
     if (leading_2_bytes = (leading_bytes = (first = data[0]).unpack 'C3').slice 0, 2) == BOM_BYTES_UTF_16LE
       data[0] = first.byteslice 2, first.bytesize
-      # HACK Ruby messes up trailing whitespace on UTF-16LE, so encode whole document first; assume newlines are present
-      return (data.join.encode utf8, ::Encoding::UTF_16LE).lines.map {|line| line.rstrip }
+      # NOTE you can't split a UTF-16LE string using .lines when encoding is UTF-8; doing so will cause this line to fail
+      return data.map {|line| (line.encode UTF_8, ::Encoding::UTF_16LE).rstrip }
     elsif leading_2_bytes == BOM_BYTES_UTF_16BE
       data[0] = first.byteslice 2, first.bytesize
-      return data.map {|line| (line.encode utf8, ::Encoding::UTF_16BE).rstrip }
+      return data.map {|line| (line.encode UTF_8, ::Encoding::UTF_16BE).rstrip }
     elsif leading_bytes == BOM_BYTES_UTF_8
       data[0] = first.byteslice 3, first.bytesize
     end
-    data.map {|line| (line.encoding == utf8 ? line : (line.encode utf8)).rstrip }
+    if first.encoding == UTF_8
+      data.map {|line| line.rstrip }
+    else
+      data.map {|line| (line.encode UTF_8).rstrip }
+    end
   end
 
-  # Public: Normalize the String and split into lines to prepare them for parsing
+  # Public: Prepare the source data String for parsing.
   #
-  # Encodes the data to UTF-8, converts the data to a String Array, and removes
-  # trailing whitespace from each line.
+  # Encodes the data to UTF-8, if necessary, splits it into an array, and
+  # removes any trailing whitespace from every line.
   #
-  # If a BOM is present at the beginning of the data, a best attempt
-  # is made to encode from the specified encoding to UTF-8.
+  # If a BOM is found at the beginning of the data, a best attempt is made to
+  # encode it to UTF-8 from the specified source encoding.
   #
-  # data - a String of lines to normalize
+  # data - the source data String to prepare
   #
-  # returns a String Array of normalized lines
-  def self.normalize_lines_from_string data
+  # returns a String Array of prepared lines
+  def self.prepare_source_string data
     return [] if data.nil_or_empty?
-    utf8 = ::Encoding::UTF_8
     if (leading_2_bytes = (leading_bytes = data.unpack 'C3').slice 0, 2) == BOM_BYTES_UTF_16LE
-      data = (data.byteslice 2, data.bytesize).encode utf8, ::Encoding::UTF_16LE
+      data = (data.byteslice 2, data.bytesize).encode UTF_8, ::Encoding::UTF_16LE
     elsif leading_2_bytes == BOM_BYTES_UTF_16BE
-      data = (data.byteslice 2, data.bytesize).encode utf8, ::Encoding::UTF_16BE
+      data = (data.byteslice 2, data.bytesize).encode UTF_8, ::Encoding::UTF_16BE
     elsif leading_bytes == BOM_BYTES_UTF_8
       data = data.byteslice 3, data.bytesize
-      data = data.encode utf8 unless data.encoding == utf8
-    elsif data.encoding != utf8
-      data = data.encode utf8
+      data = data.encode UTF_8 unless data.encoding == UTF_8
+    elsif data.encoding != UTF_8
+      data = data.encode UTF_8
     end
     data.lines.map {|line| line.rstrip }
   end
diff --git a/lib/asciidoctor/reader.rb b/lib/asciidoctor/reader.rb
index 59fd2a6d..985ac51d 100644
--- a/lib/asciidoctor/reader.rb
+++ b/lib/asciidoctor/reader.rb
@@ -82,14 +82,10 @@ class Reader
   #
   # Returns The String lines extracted from the data
   def prepare_lines data, opts = {}
-    if ::String === data
-      if opts[:normalize]
-        Helpers.normalize_lines_from_string data
-      else
-        data.split LF, -1
-      end
-    elsif opts[:normalize]
-      Helpers.normalize_lines_array data
+    if opts[:normalize]
+      ::String === data ? (Helpers.prepare_source_string data) : (Helpers.prepare_source_array data)
+    elsif ::String === data
+      data.split LF, -1
     else
       data.drop 0
     end
diff --git a/test/invoker_test.rb b/test/invoker_test.rb
index 1c85b3c3..1398ef9c 100644
--- a/test/invoker_test.rb
+++ b/test/invoker_test.rb
@@ -71,8 +71,8 @@ context 'Invoker' do
   test 'should not fail to rewind input if reading document from stdin' do
     io = STDIN.dup
     class << io
-      def readlines
-        ['paragraph']
+      def read
+        'paragraph'
       end
     end
     invoker = invoke_cli_to_buffer(%w(-s), '-') { io }
diff --git a/test/reader_test.rb b/test/reader_test.rb
index f31bdfb4..ed03084e 100644
--- a/test/reader_test.rb
+++ b/test/reader_test.rb
@@ -54,7 +54,10 @@ third line
 
       test 'should encode UTF-16LE string array to UTF-8 when BOM is found' do
         ['UTF-8', 'ASCII-8BIT'].each do |start_encoding|
-          data = "\ufeff#{SAMPLE_DATA.join ::Asciidoctor::LF}".encode('UTF-16LE').force_encoding(start_encoding).lines.to_a
+          # NOTE can't split a UTF-16LE string using .lines when encoding is set to UTF-8
+          data = SAMPLE_DATA.dup
+          data.unshift %(\ufeff#{data.shift})
+          data.map! {|line| (line.encode 'UTF-16LE').force_encoding start_encoding } # map! (not each) so the transcoded lines replace the UTF-8 ones
           reader = Asciidoctor::Reader.new data, nil, :normalize => true
           assert_equal Encoding::UTF_8, reader.lines[0].encoding
           assert_equal 'f', reader.lines[0].chr
@@ -74,7 +77,9 @@ third line
 
       test 'should encode UTF-16BE string array to UTF-8 when BOM is found' do
         ['UTF-8', 'ASCII-8BIT'].each do |start_encoding|
-          data = "\ufeff#{SAMPLE_DATA.join ::Asciidoctor::LF}".encode('UTF-16BE').force_encoding(start_encoding).lines.to_a
+          data = SAMPLE_DATA.dup
+          data.unshift %(\ufeff#{data.shift})
+          data = data.map {|line| (line.encode 'UTF-16BE').force_encoding start_encoding }
           reader = Asciidoctor::Reader.new data, nil, :normalize => true
           assert_equal Encoding::UTF_8, reader.lines[0].encoding
           assert_equal 'f', reader.lines[0].chr
@@ -381,7 +386,7 @@ This is a paragraph outside the block.
       end
 
       test 'read lines until terminator' do
-        lines = <<-EOS.each_line.to_a
+        lines = <<-EOS.lines
 ****
 captured
 
@@ -402,7 +407,7 @@ not captured
       end
 
       test 'should flag reader as unterminated if reader reaches end of source without finding terminator' do
-        lines = <<-EOS.each_line.to_a
+        lines = <<-EOS.lines
 ****
 captured
 
@@ -469,7 +474,7 @@ CRLF\r
 endlines\r
       EOS
 
-        [input, input.lines.to_a, input.split(::Asciidoctor::LF), input.split(::Asciidoctor::LF).join(::Asciidoctor::LF)].each do |lines|
+        [input, input.lines, input.split(::Asciidoctor::LF), input.split(::Asciidoctor::LF).join(::Asciidoctor::LF)].each do |lines|
           doc = Asciidoctor::Document.new lines
           reader = doc.reader
           reader.lines.each do |line|
@@ -1504,7 +1509,7 @@ include::fixtures/parent-include-restricted.adoc[depth=3]
       end
 
       test 'read_lines_until should not process lines if process option is false' do
-        lines = <<-EOS.each_line.to_a
+        lines = <<-EOS.lines
 ////
 include::fixtures/no-such-file.adoc[]
 ////
@@ -1518,7 +1523,7 @@ include::fixtures/no-such-file.adoc[]
       end
 
       test 'skip_comment_lines should not process lines read' do
-        lines = <<-EOS.each_line.to_a
+        lines = <<-EOS.lines
 ////
 include::fixtures/no-such-file.adoc[]
 ////
author	Dan Allen <dan.j.allen@gmail.com>	2019-01-04 14:17:21 -0700
committer	Dan Allen <dan.j.allen@gmail.com>	2019-01-06 16:42:53 -0700
commit	83619ac4e1b8207614c1500a0012d30af28db68b (patch)
tree	56c7c3435102c9d853f34221194d564b48314d8c
parent	555031d7dd583aad461546b0e6ee98dda02d1fa5 (diff)