resolves #794 drop XML tags, character refs, and non-word chars when generating ID for section

- drop character refs and non-word chars (except for hyphen and space) instead of replacing with ID separator
- drop XML tags (but not the contents of the tag)
- replace spaces and hyphens with ID separator
- optimize logic in Section.generate_id method
author: Dan Allen <dan.j.allen@gmail.com> 2018-04-17 00:41:39 -0600
committer: Dan Allen <dan.j.allen@gmail.com> 2018-04-17 01:34:53 -0600
commit: d55e2b936cdd4e1792828fb3c5c5d8cf0c2c63ca (patch)
tree: 2e66098ed2c04e4e7697ca0fb7d882ce344c3fb0
parent: ce927bdf045f6d176e0c173e67654786e7abbce9 (diff)
5 files changed, 59 insertions, 28 deletions
diff --git a/CHANGELOG.adoc b/CHANGELOG.adoc
index d8e1ca20..15cd4022 100644
--- a/CHANGELOG.adoc
+++ b/CHANGELOG.adoc
@@ -17,6 +17,7 @@ For a detailed view of what has changed, refer to the {uri-repo}/commits/master[
 
 Enhancements::
 
+  * BREAKING: drop XML tags, character refs, and non-word characters (except hyphen and space) when generating ID for section (#794)
   * route messages through a logger instead of using Kernel#warn (#44, PR #2660)
   * add MemoryLogger for capturing messages sent to logger into memory (#44, PR #2660)
   * add NullLogger to prevent messages from being logged (#44, PR #2660)
diff --git a/features/xref.feature b/features/xref.feature
index 1809668a..9a5974ee 100644
--- a/features/xref.feature
+++ b/features/xref.feature
@@ -669,14 +669,14 @@ Feature: Cross References
     Then the result should match the HTML structure
       """
       .sect1
-        h2#_section_strong_one_strong
+        h2#_section_one
           |Section <strong>One</strong>
         .sectionbody: .paragraph: p content
       .sect1
         h2#_section_two Section Two
         .sectionbody: .paragraph: p
           |refer to
-          a< href='#_section_strong_one_strong' Section <strong>One</strong>
+          a< href='#_section_one' Section <strong>One</strong>
       """
 
     Scenario: Does not process a natural cross reference in compat mode
diff --git a/lib/asciidoctor.rb b/lib/asciidoctor.rb
index 5f92723d..3d7a3608 100644
--- a/lib/asciidoctor.rb
+++ b/lib/asciidoctor.rb
@@ -661,10 +661,10 @@ module Asciidoctor
     #
     InlineSectionAnchorRx = / (\\)?\[\[([#{CC_ALPHA}_:][#{CC_WORD}:.-]*)(?:, *(.+))?\]\]$/
 
-    # Matches invalid characters in a section id.
+    # Matches invalid ID characters in a section title.
     #
-    # NOTE uppercase chars are not included since the expression is used on a lowercased string
-    InvalidSectionIdCharsRx = /&(?:[a-z][a-z]+\d{0,2}|#\d\d\d{0,4}|#x[\da-f][\da-f][\da-f]{0,3});|[^#{CC_WORD}]+?/
+    # NOTE uppercase chars not included since expression is only run on a lowercase string
+    InvalidSectionIdCharsRx = /<[^>]+>|&(?:[a-z][a-z]+\d{0,2}|#\d\d\d{0,4}|#x[\da-f][\da-f][\da-f]{0,3});|[^ #{CC_WORD}\-]+?/
 
     # Matches the block style used to designate a discrete (aka free-floating) heading.
     #
diff --git a/lib/asciidoctor/section.rb b/lib/asciidoctor/section.rb
index 766deb40..7ce38209 100644
--- a/lib/asciidoctor/section.rb
+++ b/lib/asciidoctor/section.rb
@@ -169,31 +169,41 @@ class Section < AbstractBlock
   # Public: Generate a String ID from the given section title.
   #
   # The generated ID is prefixed with value of the 'idprefix' attribute, which
-  # is an underscore by default. Invalid characters are replaced with the
-  # value of the 'idseparator' attribute, which is an underscore by default.
+  # is an underscore (_) by default. Invalid characters are then removed and
+  # spaces are replaced with the value of the 'idseparator' attribute, which is
+  # an underscore (_) by default.
   #
-  # If the generated ID is already in use in the document, a count is appended
-  # until a unique id is found.
+  # If the generated ID is already in use in the document, a count is appended,
+  # offset by the separator, until a unique ID is found.
   #
-  # Section ID generation can be disabled by undefining the 'sectids' attribute.
+  # Section ID generation can be disabled by unsetting the 'sectids' document attribute.
   #
   # Examples
   #
   #   Section.generate_id 'Foo', document
   #   => "_foo"
   #
+  # Returns the generated [String] ID.
   def self.generate_id title, document
     attrs = document.attributes
+    pre = attrs['idprefix'] || '_'
     if (sep = attrs['idseparator'])
-      sep, sep_len = (attrs['idseparator'] = sep.chr), sep.length > 0 ? 1 : 0
+      if sep.length == 1
+        sep_sub = sep == '-' ? ' -' : %( #{sep}-)
+      elsif sep.empty?
+        no_sep = true
+      else
+        sep_sub = (sep = attrs['idseparator'] = sep.chr) == '-' ? ' -' : %( #{sep}-)
+      end
     else
-      sep, sep_len = '_', 1
+      sep, sep_sub = '_', ' _-'
     end
-    pre = attrs['idprefix'] || '_'
-    gen_id = %(#{pre}#{title.downcase.gsub InvalidSectionIdCharsRx, sep})
-    if sep_len > 0
-      # remove repeat and trailing separator characters
-      gen_id = gen_id.tr_s sep, sep
+    gen_id = %(#{pre}#{title.downcase.gsub InvalidSectionIdCharsRx, ''})
+    if no_sep
+      gen_id = gen_id.delete ' '
+    else
+      # replace space with separator and remove repeating and trailing separator characters
+      gen_id = gen_id.tr_s sep_sub, sep
       gen_id = gen_id.chop if gen_id.end_with? sep
       # ensure id doesn't begin with idseparator if idprefix is empty (assuming idseparator is not empty)
       gen_id = gen_id.slice 1, gen_id.length if pre.empty? && (gen_id.start_with? sep)
diff --git a/test/sections_test.rb b/test/sections_test.rb
index 05120d69..3e60f5d9 100644
--- a/test/sections_test.rb
+++ b/test/sections_test.rb
@@ -11,12 +11,12 @@ context 'Sections' do
       assert_equal '_section_one', sec.id
     end
 
-    test 'synthetic id replaces non-word characters with underscores' do
+    test 'synthetic id removes non-word characters' do
       sec = block_from_string("== We're back!")
-      assert_equal '_we_re_back', sec.id
+      assert_equal '_were_back', sec.id
     end
 
-    test 'synthetic id removes repeating underscores' do
+    test 'synthetic id removes repeating separators' do
       sec = block_from_string('== Section $ One')
       assert_equal '_section_one', sec.id
     end
@@ -31,6 +31,21 @@ context 'Sections' do
       assert_equal '_a_b', sec.id
     end
 
+    test 'synthetic id removes XML tags' do
+      sec = block_from_string('== Use the `run` command to make it icon:gear[]')
+      assert_equal '_use_the_run_command_to_make_it_gear', sec.id
+    end
+
+    test 'synthetic id collapses repeating spaces' do
+      sec = block_from_string('== Go    Far')
+      assert_equal '_go_far', sec.id
+    end
+
+    test 'synthetic id replaces hyphens with separator' do
+      sec = block_from_string('== State-of-the-art design')
+      assert_equal '_state_of_the_art_design', sec.id
+    end
+
     test 'synthetic id prefix can be customized' do
       sec = block_from_string(":idprefix: id_\n\n== Section One")
       assert_equal 'id_section_one', sec.id
@@ -51,6 +66,11 @@ context 'Sections' do
       assert_equal '_section-one', sec.id
     end
 
+    test 'synthetic id separator can be hyphen and hyphens are preserved' do
+      sec = block_from_string(":idseparator: -\n\n== State-of-the-art design")
+      assert_equal '_state-of-the-art-design', sec.id
+    end
+
     test 'synthetic id separator can only be one character' do
       input = <<-EOS
 :idseparator: -=-
@@ -481,11 +501,11 @@ endif::[]
     end
 
     test "with XML entity" do
-      assert_xpath "//h2[@id='_where_s_the_love'][text() = \"Where#{decode_char 8217}s the love?\"]", render_string("== Where's the love?")
+      assert_xpath "//h2[@id='_whats_new'][text() = \"What#{decode_char 8217}s new?\"]", render_string("== What's new?")
     end
 
     test "with non-word character" do
-      assert_xpath "//h2[@id='_where_s_the_love'][text() = \"Where’s the love?\"]", render_string("== Where’s the love?")
+      assert_xpath "//h2[@id='_whats_new'][text() = \"What’s new?\"]", render_string("== What’s new?")
     end
 
     test "with sequential non-word characters" do
@@ -2773,9 +2793,9 @@ content
       assert_xpath '/*[@id="toc"]', output, 1
       toc_links = xmlnodes_at_xpath '/*[@id="toc"]//li', output
       assert_equal 3, toc_links.size
-      toc_links.each do |toc_link|
-        assert_equal 1, toc_link.inner_html.scan('<a').size
-      end
+      assert_equal '<a href="#_section_one">Section One</a>', toc_links[0].inner_html
+      assert_equal '<a href="#_section_two">Section Two</a>', toc_links[1].inner_html
+      assert_equal '<a href="#_plant_trees_by_searching">Plant Trees by Searching</a>', toc_links[2].inner_html
     end
 
     test 'should not remove non-anchor tags from contents of entries in table of contents' do
@@ -2801,9 +2821,9 @@ content
       assert_xpath '/*[@id="toc"]', output, 1
       toc_links = xmlnodes_at_xpath '/*[@id="toc"]//li', output
       assert_equal 3, toc_links.size
-      assert_match(/^<a[^>]+><code>run<\/code> command<\/a>$/, toc_links[0].inner_html)
-      assert_match(/^<a[^>]+><span class="icon"><i class="fa fa-bug"><\/i><\/span> Issues<\/a>$/, toc_links[1].inner_html)
-      assert_match(/^<a[^>]+><em>Sustainable<\/em> Searches<\/a>/, toc_links[2].inner_html)
+      assert_equal '<a href="#_run_command"><code>run</code> command</a>', toc_links[0].inner_html
+      assert_equal '<a href="#_issues"><span class="icon"><i class="fa fa-bug"></i></span> Issues</a>', toc_links[1].inner_html
+      assert_equal '<a href="#_sustainable_searches"><em>Sustainable</em> Searches</a>', toc_links[2].inner_html
     end
   end
author	Dan Allen <dan.j.allen@gmail.com>	2018-04-17 00:41:39 -0600
committer	Dan Allen <dan.j.allen@gmail.com>	2018-04-17 01:34:53 -0600
commit	d55e2b936cdd4e1792828fb3c5c5d8cf0c2c63ca (patch)
tree	2e66098ed2c04e4e7697ca0fb7d882ce344c3fb0
parent	ce927bdf045f6d176e0c173e67654786e7abbce9 (diff)