diff --git a/lib/rdoc/markup/to_html.rb b/lib/rdoc/markup/to_html.rb index 2924b89b94..b04e915c87 100644 --- a/lib/rdoc/markup/to_html.rb +++ b/lib/rdoc/markup/to_html.rb @@ -40,6 +40,82 @@ class RDoc::Markup::ToHtml < RDoc::Markup::Formatter # :section: + # Maps an encoding to a Hash of characters properly transcoded for that + # encoding. + # + # See also encode_fallback. + + TO_HTML_CHARACTERS = Hash.new do |h, encoding| + h[encoding] = { + :close_dquote => encode_fallback('”', encoding, '"'), + :close_squote => encode_fallback('’', encoding, '\''), + :copyright => encode_fallback('©', encoding, '(c)'), + :ellipsis => encode_fallback('…', encoding, '...'), + :dot_ellipsis => encode_fallback('.…', encoding, '....'), + :em_dash => encode_fallback('—', encoding, '---'), + :en_dash => encode_fallback('–', encoding, '--'), + :open_dquote => encode_fallback('“', encoding, '"'), + :open_squote => encode_fallback('‘', encoding, '\''), + :trademark => encode_fallback('®', encoding, '(r)'), + } + end + + HTML_CHARACTER_ALIASES = { + '(c)' => :copyright, + '(C)' => :copyright, + '(r)' => :trademark, + '(R)' => :trademark, + '---' => :em_dash, + '--' => :en_dash, + '....' => :dot_ellipsis, + '...' => :ellipsis, + '``' => :open_dquote, + "''" => :close_dquote, + } + + # Transcodes +character+ to +encoding+ with a +fallback+ character. + + def self.encode_fallback(character, encoding, fallback) + character.encode(encoding, :fallback => { character => fallback }, + :undef => :replace, :replace => fallback) + end + + # Converts ASCII quote characters to typographic quotes, remembering between calls whether a double or single quote pair is currently open + class QuoteConverter + + def initialize + @in_dquote = false + @in_squote = false + end + + def convert(quote, after_word:) + case quote + when '"' + type = @in_dquote ? 
:close_dquote : :open_dquote + @in_dquote = !@in_dquote + when "'" + if @in_squote + type = :close_squote + @in_squote = false + elsif after_word + # Mary's dog, my parents' house: do not start paired quotes + type = :close_squote + else + type = :open_squote + @in_squote = true + end + when '`' + # Opening quote of `quoted sentence'. + # This will conflict with code blocks `puts('hello')` in the future. + if !@in_squote && !after_word + type = :open_squote + @in_squote = true + end + end + TO_HTML_CHARACTERS[quote.encoding][type] if type + end + end + ## # Creates a new formatter that will output HTML @@ -51,6 +127,7 @@ def initialize(options, markup = nil) @in_list_entry = nil @list = nil @th = nil + @quote_converter = nil @in_tidylink_label = false @hard_break = "
\n" @@ -75,6 +152,11 @@ def init_regexp_handlings # suppress crossref: \#method \::method \ClassName \method_with_underscores @markup.add_regexp_handling(/\\(?:[#:A-Z]|[a-z]+_[a-z0-9])/, :SUPPRESSED_CROSSREF) + @markup.add_regexp_handling(Regexp.union(HTML_CHARACTER_ALIASES.keys), :HTML_CHARACTERS) + + @markup.add_regexp_handling(/\b['"`]/, :QUOTE_AFTER_WORD) + @markup.add_regexp_handling(/\B['"`]/, :QUOTE_NOT_AFTER_WORD) + init_link_notation_regexp_handlings end @@ -227,12 +309,28 @@ def handle_TIDYLINK(label_part, url) def handle_inline(text) # :nodoc: @inline_output = +'' + @quote_converter = QuoteConverter.new super out = @inline_output @inline_output = nil + @quote_converter = nil out end + # Converts (c), (r), (C), (R), --, ---, ..., ...., `` and '' to HTML characters. + def handle_regexp_HTML_CHARACTERS(text) + name = HTML_CHARACTER_ALIASES[text] + TO_HTML_CHARACTERS[text.encoding][name] if name + end + + def handle_regexp_QUOTE_NOT_AFTER_WORD(text) + @quote_converter.convert(text, after_word: false) || convert_string(text) + end + + def handle_regexp_QUOTE_AFTER_WORD(text) + @quote_converter.convert(text, after_word: true) || convert_string(text) + end + # Converts suppressed cross-reference +text+ to HTML by removing the leading backslash. def handle_regexp_SUPPRESSED_CROSSREF(text) @@ -565,10 +663,7 @@ def parseable?(text) # Converts +item+ to HTML using RDoc::Text#to_html def to_html(item) - # Ideally, we should convert html characters at handle_PLAIN_TEXT or somewhere else, - # but we need to convert it here for now because to_html_characters converts pair of backticks to ’‘ and pair of double backticks to ”“. - # Known bugs: `...` in `def f(...); end` and `(c) in `` will be wrongly converted. 
- to_html_characters(handle_inline(item)) + handle_inline(item) end end diff --git a/lib/rdoc/markup/to_html_snippet.rb b/lib/rdoc/markup/to_html_snippet.rb index 52cc4543f3..0f485cae8e 100644 --- a/lib/rdoc/markup/to_html_snippet.rb +++ b/lib/rdoc/markup/to_html_snippet.rb @@ -109,7 +109,7 @@ def accept_verbatim(verbatim) input = verbatim.text.rstrip text = truncate(input, @character_limit - @characters) @characters += input.length - text << ' ...' unless text == input + text << " #{TO_HTML_CHARACTERS[text.encoding][:ellipsis]}" unless text == input super RDoc::Markup::Verbatim.new text @@ -262,14 +262,14 @@ def handle_inline(text) return ['', 0] if limit <= 0 @inline_character_limit = limit res = super - res << ' ...' if @inline_character_limit <= 0 + res << " #{TO_HTML_CHARACTERS[text.encoding][:ellipsis]}" if @inline_character_limit <= 0 @characters += limit - @inline_character_limit res end def to_html(item) throw :done if @characters >= @character_limit - to_html_characters(handle_inline(item)) + handle_inline(item) end ## diff --git a/lib/rdoc/text.rb b/lib/rdoc/text.rb index 94c84037c8..6d28a196ef 100644 --- a/lib/rdoc/text.rb +++ b/lib/rdoc/text.rb @@ -29,34 +29,6 @@ module RDoc::Text MARKUP_FORMAT.default = RDoc::Markup - ## - # Maps an encoding to a Hash of characters properly transcoded for that - # encoding. - # - # See also encode_fallback. 
- - TO_HTML_CHARACTERS = Hash.new do |h, encoding| - h[encoding] = { - :close_dquote => encode_fallback('”', encoding, '"'), - :close_squote => encode_fallback('’', encoding, '\''), - :copyright => encode_fallback('©', encoding, '(c)'), - :ellipsis => encode_fallback('…', encoding, '...'), - :em_dash => encode_fallback('—', encoding, '---'), - :en_dash => encode_fallback('–', encoding, '--'), - :open_dquote => encode_fallback('“', encoding, '"'), - :open_squote => encode_fallback('‘', encoding, '\''), - :trademark => encode_fallback('®', encoding, '(r)'), - } - end - - ## - # Transcodes +character+ to +encoding+ with a +fallback+ character. - - def self.encode_fallback(character, encoding, fallback) - character.encode(encoding, :fallback => { character => fallback }, - :undef => :replace, :replace => fallback) - end - ## # Expands tab characters in +text+ to eight spaces @@ -193,95 +165,6 @@ def strip_stars(text) text.gsub(/^\s+$/, empty) end - def to_html(text) - to_html_characters(text) - end - - ## - # Converts ampersand, dashes, ellipsis, quotes, copyright and registered - # trademark symbols in +text+ to properly encoded characters. - - def to_html_characters(text) - html = (''.encode text.encoding).dup - - encoded = RDoc::Text::TO_HTML_CHARACTERS[text.encoding] - - s = StringScanner.new text - insquotes = false - indquotes = false - after_word = nil - - until s.eos? 
do - case - when s.scan(/<(tt|code)>.*?<\/\1>/) then # skip contents of tt - html << s.matched - when s.scan(/<(tt|code)>.*?/) then - warn "mismatched <#{s[1]}> tag" # TODO signal file/line - html << s.matched - when s.scan(/<[^>]+\/?s*>/) then # skip HTML tags - html << s.matched - when s.scan(/\.\.\.(\.?)/) then - html << s[1] << encoded[:ellipsis] - after_word = nil - when s.scan(/\(c\)/i) then - html << encoded[:copyright] - after_word = nil - when s.scan(/\(r\)/i) then - html << encoded[:trademark] - after_word = nil - when s.scan(/---/) then - html << encoded[:em_dash] - after_word = nil - when s.scan(/--/) then - html << encoded[:en_dash] - after_word = nil - when s.scan(/"|"/) then - html << encoded[indquotes ? :close_dquote : :open_dquote] - indquotes = !indquotes - after_word = nil - when s.scan(/``/) then # backtick double quote - html << encoded[:open_dquote] - after_word = nil - when s.scan(/(?:'|'){2}/) then # tick double quote - html << encoded[:close_dquote] - after_word = nil - when s.scan(/`/) then # backtick - if insquotes or after_word - html << '`' - after_word = false - else - html << encoded[:open_squote] - insquotes = true - end - when s.scan(/'|'/) then # single quote - if insquotes - html << encoded[:close_squote] - insquotes = false - elsif after_word - # Mary's dog, my parents' house: do not start paired quotes - html << encoded[:close_squote] - else - html << encoded[:open_squote] - insquotes = true - end - - after_word = nil - else # advance to the next potentially significant character - match = s.scan(/.+?(?=[<\\.("'`&-])/) #" - - if match then - html << match - after_word = match =~ /\w$/ - else - html << s.rest - break - end - end - end - - html - end - ## # Wraps +txt+ to +line_len+ diff --git a/test/rdoc/markup/to_html_snippet_test.rb b/test/rdoc/markup/to_html_snippet_test.rb index ce9118a18c..f9bf2fb890 100644 --- a/test/rdoc/markup/to_html_snippet_test.rb +++ b/test/rdoc/markup/to_html_snippet_test.rb @@ -543,7 +543,7 @@ def 
test_convert_limit_verbatim

Hello There

This is some text, it will be cut off after 100 characters -

This one is cut off in this verbatim ...
+
This one is cut off in this verbatim …
EXPECTED actual = @to.convert rdoc diff --git a/test/rdoc/markup/to_html_test.rb b/test/rdoc/markup/to_html_test.rb index 459bcb140e..05a42c95d6 100644 --- a/test/rdoc/markup/to_html_test.rb +++ b/test/rdoc/markup/to_html_test.rb @@ -669,6 +669,51 @@ def test_convert_string assert_equal '<>', @to.convert_string('<>') end + def test_self_converter_encode_fallback + assert_equal '…', + RDoc::Markup::ToHtml::encode_fallback('…', Encoding::UTF_8, '...') + assert_equal '...', + RDoc::Markup::ToHtml::encode_fallback('…', Encoding::US_ASCII, '...') + end + + def test_convert_HTML_CHARACTER + result = @to.convert "(c)(r)(C)(R)...--....---``''" + assert_equal "\n

©®©®…–.…—“”

\n", result + + result = @to.convert "(c)(r)(C)(R)...--....---``''" + assert_equal "\n

(c)(r)(C)(R)...--....---``''

\n", result + + result = @to.convert "{(c)(r)(C)(R)...--....---``''}[url]" + assert_equal "\n

©®©®…–.…—“”

\n", result + + result = @to.convert "{link}[http://example.com/?q=(c)(r)(C)(R)...--....---``'']" + assert_equal "\n

link

\n", result + end + + def test_convert_HTML_CHARACTER_encoding + s = '...(c)'.encode Encoding::Shift_JIS + result = @to.convert s + assert_equal Encoding::Shift_JIS, result.encoding + + expected = '…(c)'.encode Encoding::Shift_JIS + assert_equal "\n

#{expected}

\n", result + end + + def test_convert_QUOTE_dquote + result = @to.convert '"This is a +quoted+ string." and "another"' + assert_equal "\n

“This is a quoted string.” and “another”

\n", result + end + + def test_convert_QUOTE_squote + result = @to.convert "'quote' '1+2'. I'm 'RDoc'" + assert_equal "\n

‘quote’ ‘1+2’. I’m ‘RDoc’

\n", result + end + + def test_convert_QUOTE_backtick + result = @to.convert "This is `quote' and this is `code`" + assert_equal "\n

This is ‘quote’ and this is code

\n", result + end + def test_convert_HYPERLINK_irc result = @to.convert 'irc://irc.freenode.net/#ruby-lang' diff --git a/test/rdoc/rdoc_text_test.rb b/test/rdoc/rdoc_text_test.rb index 03c4167ac5..48686b7a25 100644 --- a/test/rdoc/rdoc_text_test.rb +++ b/test/rdoc/rdoc_text_test.rb @@ -16,13 +16,6 @@ def setup @language = nil end - def test_self_encode_fallback - assert_equal '…', - RDoc::Text::encode_fallback('…', Encoding::UTF_8, '...') - assert_equal '...', - RDoc::Text::encode_fallback('…', Encoding::US_ASCII, '...') - end - def test_expand_tabs assert_equal("hello\n dave", expand_tabs("hello\n dave"), 'spaces') @@ -478,104 +471,6 @@ def test_strip_stars_no_stars assert_equal expected, strip_stars(text) end - def test_to_html_apostrophe - assert_equal '‘a', to_html("'a") - assert_equal 'a’', to_html("a'") - - assert_equal '‘a’ ‘', to_html("'a' '") - end - - def test_to_html_apostrophe_entity - assert_equal '‘a', to_html("'a") - assert_equal 'a’', to_html("a'") - - assert_equal '‘a’ ‘', to_html("'a' '") - end - - def test_to_html_backslash - # Don't handle unescaped crossref. It should be handled in RDoc::Markup::ToHtml, not in RDoc::Text - assert_equal '\\S', to_html('\\S') - end - - def test_to_html_br - assert_equal '
', to_html('
') - end - - def test_to_html_copyright - assert_equal '©', to_html('(c)') - assert_equal '©', to_html('(C)') - end - - def test_to_html_dash - assert_equal '-', to_html('-') - assert_equal '–', to_html('--') - assert_equal '—', to_html('---') - assert_equal '—-', to_html('----') - end - - def test_to_html_double_backtick - assert_equal '“a', to_html('``a') - assert_equal '“a“', to_html('``a``') - assert_equal '“a”', to_html("``a''") - end - - def test_to_html_double_quote - assert_equal '“a', to_html('"a') - assert_equal '“a”', to_html('"a"') - end - - def test_to_html_double_quote_quot - assert_equal '“a', to_html('"a') - assert_equal '“a”', to_html('"a"') - end - - def test_to_html_double_tick - assert_equal '”a', to_html("''a") - assert_equal '”a”', to_html("''a''") - end - - def test_to_html_ellipsis - assert_equal '..', to_html('..') - assert_equal '…', to_html('...') - assert_equal '.…', to_html('....') - end - - def test_to_html_encoding - s = '...(c)'.encode Encoding::Shift_JIS - - html = to_html s - - assert_equal Encoding::Shift_JIS, html.encoding - - expected = '…(c)'.encode Encoding::Shift_JIS - - assert_equal expected, html - end - - def test_to_html_html_tag - assert_equal 'hi’s', - to_html('hi\'s') - end - - def test_to_html_registered_trademark - assert_equal '®', to_html('(r)') - assert_equal '®', to_html('(R)') - end - - def test_to_html_tt_tag - # tt tag content is already escaped - assert_equal 'hi\'s', to_html('hi\'s') - assert_equal 'hi\\\\\'s', to_html('hi\\\\\'s') - end - - def test_to_html_tt_tag_mismatch - _, err = verbose_capture_output do - assert_equal 'hi', to_html('hi') - end - - assert_include err, "mismatched tag\n" - end - def formatter RDoc::Markup::ToHtml.new @options end