From 2b6f7fa3f90869bb360ff51c2f7e2652b4b88a5d Mon Sep 17 00:00:00 2001 From: tompng Date: Tue, 6 Jan 2026 01:56:14 +0900 Subject: [PATCH] Implement escapes in Markdown to RDoc conversion Plain text part of parsed markdown may contain special characters (example: `+_*<`). URL in tidy link may contain `[]`. These characters need escape. --- lib/rdoc/markdown.kpeg | 28 ++++++++++---- lib/rdoc/markdown.rb | 56 +++++++++++++++++---------- lib/rdoc/markup/inline_parser.rb | 3 +- test/rdoc/markup/to_html_test.rb | 16 ++++++++ test/rdoc/rdoc_markdown_test.rb | 57 ++++++++++++++++++++++++++-- test/rdoc/rdoc_markdown_test_test.rb | 18 ++++----- 6 files changed, 136 insertions(+), 42 deletions(-) diff --git a/lib/rdoc/markdown.kpeg b/lib/rdoc/markdown.kpeg index 91d05c57a9..cde3e927f4 100644 --- a/lib/rdoc/markdown.kpeg +++ b/lib/rdoc/markdown.kpeg @@ -303,6 +303,20 @@ end end + # Escape character that has special meaning in RDoc format. + # To allow rdoc-styled link used in markdown format for now, bracket and brace are not escaped. + + def rdoc_escape(text) + text.gsub(/[*+<\\_]/) {|s| "\\#{s}" } + end + + # Escape link url that contains brackets. + # Brackets needs escape because link url will be surrounded by `[]` in RDoc format. 
+ + def rdoc_link_url_escape(text) + text.gsub(/[\[\]\\]/) {|s| "\\#{s}" } + end + ## # :category: Extensions # @@ -969,11 +983,11 @@ Space = @Spacechar+ { " " } Str = @StartList:a < @NormalChar+ > { a = text } - ( StrChunk:c { a << c } )* { a } + ( StrChunk:c { a << c } )* { rdoc_escape(a) } StrChunk = < (@NormalChar | /_+/ &Alphanumeric)+ > { text } -EscapedChar = "\\" !@Newline < /[:\\`|*_{}\[\]()#+.!><-]/ > { text } +EscapedChar = "\\" !@Newline < /[:\\`|*_{}\[\]()#+.!><-]/ > { rdoc_escape(text) } Entity = ( HexEntity | DecEntity | CharEntity ):a { a } @@ -988,7 +1002,7 @@ TerminalEndline = @Sp @Newline @Eof LineBreak = " " @NormalEndline { RDoc::Markup::HardBreak.new } Symbol = < @SpecialChar > - { text } + { rdoc_escape(text) } # This keeps the parser from getting bogged down on long strings of '*' or '_', # or strings of '*' or '_' with space on each side: @@ -1053,7 +1067,7 @@ ReferenceLinkSingle = Label:content < (Spnl "[]")? > { link_to content, content, text } ExplicitLink = ExplicitLinkWithLabel:a - { "{#{a[:label]}}[#{a[:link]}]" } + { "{#{a[:label]}}[#{rdoc_link_url_escape(a[:link])}]" } ExplicitLinkWithLabel = Label:label "(" @Sp Source:link Spnl Title @Sp ")" { { label: label, link: link } } @@ -1163,12 +1177,12 @@ Newline = %literals.Newline Spacechar = %literals.Spacechar HexEntity = /&#x/i < /[0-9a-fA-F]+/ > ";" - { [text.to_i(16)].pack 'U' } + { rdoc_escape([text.to_i(16)].pack('U')) } DecEntity = "&#" < /[0-9]+/ > ";" - { [text.to_i].pack 'U' } + { rdoc_escape([text.to_i].pack('U')) } CharEntity = "&" ";" { if entity = HTML_ENTITIES[text] then - entity.pack 'U*' + rdoc_escape(entity.pack('U*')) else "&#{text};" end diff --git a/lib/rdoc/markdown.rb b/lib/rdoc/markdown.rb index 811c065ec1..7e4adcefc3 100644 --- a/lib/rdoc/markdown.rb +++ b/lib/rdoc/markdown.rb @@ -688,6 +688,20 @@ def emphasis text end end + # Escape character that has special meaning in RDoc format. 
+ # To allow rdoc-styled link used in markdown format for now, bracket and brace are not escaped. + + def rdoc_escape(text) + text.gsub(/[*+<\\_]/) {|s| "\\#{s}" } + end + + # Escape link url that contains brackets. + # Brackets needs escape because link url will be surrounded by `[]` in RDoc format. + + def rdoc_link_url_escape(text) + text.gsub(/[\[\]\\]/) {|s| "\\#{s}" } + end + ## # :category: Extensions # @@ -9731,7 +9745,7 @@ def _Space return _tmp end - # Str = @StartList:a < @NormalChar+ > { a = text } (StrChunk:c { a << c })* { a } + # Str = @StartList:a < @NormalChar+ > { a = text } (StrChunk:c { a << c })* { rdoc_escape(a) } def _Str _save = self.pos @@ -9792,7 +9806,7 @@ def _Str self.pos = _save break end - @result = begin; a ; end + @result = begin; rdoc_escape(a) ; end _tmp = true unless _tmp self.pos = _save @@ -9894,7 +9908,7 @@ def _StrChunk return _tmp end - # EscapedChar = "\\" !@Newline < /[:\\`|*_{}\[\]()#+.!><-]/ > { text } + # EscapedChar = "\\" !@Newline < /[:\\`|*_{}\[\]()#+.!><-]/ > { rdoc_escape(text) } def _EscapedChar _save = self.pos @@ -9921,7 +9935,7 @@ def _EscapedChar self.pos = _save break end - @result = begin; text ; end + @result = begin; rdoc_escape(text) ; end _tmp = true unless _tmp self.pos = _save @@ -10122,7 +10136,7 @@ def _LineBreak return _tmp end - # Symbol = < @SpecialChar > { text } + # Symbol = < @SpecialChar > { rdoc_escape(text) } def _Symbol _save = self.pos @@ -10136,7 +10150,7 @@ def _Symbol self.pos = _save break end - @result = begin; text ; end + @result = begin; rdoc_escape(text) ; end _tmp = true unless _tmp self.pos = _save @@ -11189,7 +11203,7 @@ def _ReferenceLinkSingle return _tmp end - # ExplicitLink = ExplicitLinkWithLabel:a { "{#{a[:label]}}[#{a[:link]}]" } + # ExplicitLink = ExplicitLinkWithLabel:a { "{#{a[:label]}}[#{rdoc_link_url_escape(a[:link])}]" } def _ExplicitLink _save = self.pos @@ -11200,7 +11214,7 @@ def _ExplicitLink self.pos = _save break end - @result = begin; 
"{#{a[:label]}}[#{a[:link]}]" ; end + @result = begin; "{#{a[:label]}}[#{rdoc_link_url_escape(a[:link])}]" ; end _tmp = true unless _tmp self.pos = _save @@ -14615,7 +14629,7 @@ def _Spacechar return _tmp end - # HexEntity = /&#x/i < /[0-9a-fA-F]+/ > ";" { [text.to_i(16)].pack 'U' } + # HexEntity = /&#x/i < /[0-9a-fA-F]+/ > ";" { rdoc_escape([text.to_i(16)].pack('U')) } def _HexEntity _save = self.pos @@ -14639,7 +14653,7 @@ def _HexEntity self.pos = _save break end - @result = begin; [text.to_i(16)].pack 'U' ; end + @result = begin; rdoc_escape([text.to_i(16)].pack('U')) ; end _tmp = true unless _tmp self.pos = _save @@ -14651,7 +14665,7 @@ def _HexEntity return _tmp end - # DecEntity = "&#" < /[0-9]+/ > ";" { [text.to_i].pack 'U' } + # DecEntity = "&#" < /[0-9]+/ > ";" { rdoc_escape([text.to_i].pack('U')) } def _DecEntity _save = self.pos @@ -14675,7 +14689,7 @@ def _DecEntity self.pos = _save break end - @result = begin; [text.to_i].pack 'U' ; end + @result = begin; rdoc_escape([text.to_i].pack('U')) ; end _tmp = true unless _tmp self.pos = _save @@ -14687,7 +14701,7 @@ def _DecEntity return _tmp end - # CharEntity = "&" < /[A-Za-z0-9]+/ > ";" { if entity = HTML_ENTITIES[text] then entity.pack 'U*' else "&#{text};" end } + # CharEntity = "&" < /[A-Za-z0-9]+/ > ";" { if entity = HTML_ENTITIES[text] then rdoc_escape(entity.pack('U*')) else "&#{text};" end } def _CharEntity _save = self.pos @@ -14712,7 +14726,7 @@ def _CharEntity break end @result = begin; if entity = HTML_ENTITIES[text] then - entity.pack 'U*' + rdoc_escape(entity.pack('U*')) else "&#{text};" end @@ -16563,15 +16577,15 @@ def _DefinitionListDefinition Rules[:_Inlines] = rule_info("Inlines", "(!@Endline Inline:i { i } | @Endline:c !(&{ github? } Ticks3 /[^`\\n]*$/) &Inline { c })+:chunks @Endline? 
{ chunks }") Rules[:_Inline] = rule_info("Inline", "(Str | @Endline | UlOrStarLine | @Space | Strong | Emph | Strike | Image | Link | NoteReference | InlineNote | Code | RawHtml | Entity | EscapedChar | Symbol)") Rules[:_Space] = rule_info("Space", "@Spacechar+ { \" \" }") - Rules[:_Str] = rule_info("Str", "@StartList:a < @NormalChar+ > { a = text } (StrChunk:c { a << c })* { a }") + Rules[:_Str] = rule_info("Str", "@StartList:a < @NormalChar+ > { a = text } (StrChunk:c { a << c })* { rdoc_escape(a) }") Rules[:_StrChunk] = rule_info("StrChunk", "< (@NormalChar | /_+/ &Alphanumeric)+ > { text }") - Rules[:_EscapedChar] = rule_info("EscapedChar", "\"\\\\\" !@Newline < /[:\\\\`|*_{}\\[\\]()\#+.!><-]/ > { text }") + Rules[:_EscapedChar] = rule_info("EscapedChar", "\"\\\\\" !@Newline < /[:\\\\`|*_{}\\[\\]()\#+.!><-]/ > { rdoc_escape(text) }") Rules[:_Entity] = rule_info("Entity", "(HexEntity | DecEntity | CharEntity):a { a }") Rules[:_Endline] = rule_info("Endline", "(@LineBreak | @TerminalEndline | @NormalEndline)") Rules[:_NormalEndline] = rule_info("NormalEndline", "@Sp @Newline !@BlankLine !\">\" !AtxStart !(Line /={1,}|-{1,}/ @Newline) { \"\\n\" }") Rules[:_TerminalEndline] = rule_info("TerminalEndline", "@Sp @Newline @Eof") Rules[:_LineBreak] = rule_info("LineBreak", "\" \" @NormalEndline { RDoc::Markup::HardBreak.new }") - Rules[:_Symbol] = rule_info("Symbol", "< @SpecialChar > { text }") + Rules[:_Symbol] = rule_info("Symbol", "< @SpecialChar > { rdoc_escape(text) }") Rules[:_UlOrStarLine] = rule_info("UlOrStarLine", "(UlLine | StarLine):a { a }") Rules[:_StarLine] = rule_info("StarLine", "(< /\\*{4,}/ > { text } | < @Spacechar /\\*+/ &@Spacechar > { text })") Rules[:_UlLine] = rule_info("UlLine", "(< /_{4,}/ > { text } | < @Spacechar /_+/ &@Spacechar > { text })") @@ -16588,7 +16602,7 @@ def _DefinitionListDefinition Rules[:_ReferenceLink] = rule_info("ReferenceLink", "(ReferenceLinkDouble | ReferenceLinkSingle)") Rules[:_ReferenceLinkDouble] = 
rule_info("ReferenceLinkDouble", "Label:content < Spnl > !\"[]\" Label:label { link_to content, label, text }") Rules[:_ReferenceLinkSingle] = rule_info("ReferenceLinkSingle", "Label:content < (Spnl \"[]\")? > { link_to content, content, text }") - Rules[:_ExplicitLink] = rule_info("ExplicitLink", "ExplicitLinkWithLabel:a { \"{\#{a[:label]}}[\#{a[:link]}]\" }") + Rules[:_ExplicitLink] = rule_info("ExplicitLink", "ExplicitLinkWithLabel:a { \"{\#{a[:label]}}[\#{rdoc_link_url_escape(a[:link])}]\" }") Rules[:_ExplicitLinkWithLabel] = rule_info("ExplicitLinkWithLabel", "Label:label \"(\" @Sp Source:link Spnl Title @Sp \")\" { { label: label, link: link } }") Rules[:_Source] = rule_info("Source", "(\"<\" < SourceContents > \">\" | < SourceContents >) { text }") Rules[:_SourceContents] = rule_info("SourceContents", "((!\"(\" !\")\" !\">\" Nonspacechar)+ | \"(\" SourceContents \")\")*") @@ -16631,9 +16645,9 @@ def _DefinitionListDefinition Rules[:_BOM] = rule_info("BOM", "%literals.BOM") Rules[:_Newline] = rule_info("Newline", "%literals.Newline") Rules[:_Spacechar] = rule_info("Spacechar", "%literals.Spacechar") - Rules[:_HexEntity] = rule_info("HexEntity", "/&\#x/i < /[0-9a-fA-F]+/ > \";\" { [text.to_i(16)].pack 'U' }") - Rules[:_DecEntity] = rule_info("DecEntity", "\"&\#\" < /[0-9]+/ > \";\" { [text.to_i].pack 'U' }") - Rules[:_CharEntity] = rule_info("CharEntity", "\"&\" < /[A-Za-z0-9]+/ > \";\" { if entity = HTML_ENTITIES[text] then entity.pack 'U*' else \"&\#{text};\" end }") + Rules[:_HexEntity] = rule_info("HexEntity", "/&\#x/i < /[0-9a-fA-F]+/ > \";\" { rdoc_escape([text.to_i(16)].pack('U')) }") + Rules[:_DecEntity] = rule_info("DecEntity", "\"&\#\" < /[0-9]+/ > \";\" { rdoc_escape([text.to_i].pack('U')) }") + Rules[:_CharEntity] = rule_info("CharEntity", "\"&\" < /[A-Za-z0-9]+/ > \";\" { if entity = HTML_ENTITIES[text] then rdoc_escape(entity.pack('U*')) else \"&\#{text};\" end }") Rules[:_NonindentSpace] = rule_info("NonindentSpace", "/ {0,3}/") Rules[:_Indent] 
= rule_info("Indent", "/\\t| /") Rules[:_IndentedLine] = rule_info("IndentedLine", "Indent Line") diff --git a/lib/rdoc/markup/inline_parser.rb b/lib/rdoc/markup/inline_parser.rb index 6bbd15e7e2..4e2b86c630 100644 --- a/lib/rdoc/markup/inline_parser.rb +++ b/lib/rdoc/markup/inline_parser.rb @@ -303,9 +303,10 @@ def scan_token # Returns nil if no valid URL part is found. # URL part is enclosed in square brackets and may contain escaped brackets. # Example: [http://example.com/?q=\[\]] represents http://example.com/?q=[]. + # If we're accepting rdoc-style links in markdown, url may include *+<_ with backslash escape. def read_tidylink_url - bracketed_url = strscan(/\[([^\s\[\]\\]|\\[\[\]\\])+\]/) + bracketed_url = strscan(/\[([^\s\[\]\\]|\\[\[\]\\*+<_])+\]/) bracketed_url[1...-1].gsub(/\\(.)/, '\1') if bracketed_url end end diff --git a/test/rdoc/markup/to_html_test.rb b/test/rdoc/markup/to_html_test.rb index 459bcb140e..bb57e78e86 100644 --- a/test/rdoc/markup/to_html_test.rb +++ b/test/rdoc/markup/to_html_test.rb @@ -736,6 +736,22 @@ def test_convert_TIDYLINK_multiple assert_equal expected, result end + def test_convert_TIDYLINK_url_unescape + # markdown: [{label}](http://example.com/foo?q=bar+baz[]) + result = @to.convert '{\{label\}}[http://example.com/_foo?q=bar+baz\[\]]' + expected = "\n

{label}

\n" + assert_equal expected, result + end + + def test_convert_TIDYLINK_rdoc_in_markdown_url_unescape + # markdown: {label}[http://example.com/?q=<+_*] + # The above text is plain text in markdown, so <+_* are escaped in HTML. + # If we're accepting rdoc-style links in markdown, these escapes should be allowed in the [url] part. + result = @to.convert '{label}[http://example.com/?q=\<\+\_\*]' + expected = "\n

label

\n" + assert_equal expected, result + end + def test_convert_TIDYLINK_with_code_label result = @to.convert '{Link to +Foo+}[https://example.com]' diff --git a/test/rdoc/rdoc_markdown_test.rb b/test/rdoc/rdoc_markdown_test.rb index e851466f72..1b3e4e96ce 100644 --- a/test/rdoc/rdoc_markdown_test.rb +++ b/test/rdoc/rdoc_markdown_test.rb @@ -480,11 +480,11 @@ def test_parse_emphasis_underscore end def test_parse_emphasis_underscore_embedded - doc = parse "foo_bar bar_baz\n" + doc = parse "foo_bar bar_baz _em1_ *em2*\n" expected = doc( - para("foo_bar bar_baz")) + para("foo\\_bar bar\\_baz _em1_ _em2_")) assert_equal expected, doc end @@ -494,15 +494,64 @@ def test_parse_emphasis_underscore_in_word expected = doc( - para("it foo_bar_baz")) + para("it foo\\_bar\\_baz")) assert_equal expected, doc end + def test_rdoc_code_escaped_in_normal_text + doc = parse "+notcode+ \\+notcode+ \\\\+notcode+" + expected = doc(para("\\+notcode\\+ \\+notcode\\+ \\\\\\+notcode\\+")) + assert_equal expected, doc + end + + def test_escape_character_entities + doc = parse "<tt>*\\ <tt>+\\ <tt>_\\" + expected = doc(para("\\\\*\\ \\\\+\\ \\\\_\\")) + assert_equal expected, doc + end + + def test_rdoc_escape_in_markdown_styling + doc = parse "_a \\_b\\_ c_ **+d+** `_1+2*3`" + expected = doc(para("a \\_b\\_ c \\+d\\+ _1+2*3")) + assert_equal expected, doc + end + + def test_rdoc_heading_escaped_inside_markdown + doc = parse "= notheading\n" + expected = doc(para("= notheading")) + assert_equal expected, doc + end + + def test_rdoc_code_escaped_inside_markdown + doc = parse "~~+notcode+~~" + expected = doc(para("\\+notcode\\+")) + assert_equal expected, doc + end + + def test_no_rdoc_escape_inside_markdown_code + doc = parse "`+foo+`" + expected = doc(para("+foo+")) + assert_equal expected, doc + end + + def test_rdoc_format_escaped_inside_markdown_link + doc = parse "[Link +to+ `tap{ +1+ }`](http://example.com/?q=[])" + expected = doc(para("{Link \\+to\\+ tap{ +1+ 
}}[http://example.com/?q=\\[\\]]")) + assert_equal expected, doc + end + + def test_lt_escape + doc = parse "\\`a`\\ \\`b`" + expected = doc(para("\\a\\ \\b")) + assert_equal expected, doc + end + def test_parse_escape assert_equal doc(para("Backtick: `")), parse("Backtick: \\`") - assert_equal doc(para("Backslash: \\")), parse("Backslash: \\\\") + # Unescaped as markdown and then escaped as RDoc + assert_equal doc(para("Backslash: \\\\")), parse("Backslash: \\\\") assert_equal doc(para("Colon: :")), parse("Colon: \\:") end diff --git a/test/rdoc/rdoc_markdown_test_test.rb b/test/rdoc/rdoc_markdown_test_test.rb index c28fe6bebd..ce9481e7ed 100644 --- a/test/rdoc/rdoc_markdown_test_test.rb +++ b/test/rdoc/rdoc_markdown_test_test.rb @@ -25,7 +25,7 @@ def test_amps_and_angle_encoding para("AT&T has an ampersand in their name."), para("AT&T is another way to write it."), para("This & that."), - para("4 < 5."), + para("4 \\< 5."), para("6 > 5."), para("Here's a {link}[http://example.com/?foo=1&bar=2] with " + "an ampersand in the URL."), @@ -69,10 +69,10 @@ def test_backslash_escapes doc( para("These should all get escaped:"), - para("Backslash: \\"), + para("Backslash: \\\\"), para("Backtick: `"), - para("Asterisk: *"), - para("Underscore: _"), + para("Asterisk: \\*"), + para("Underscore: \\_"), para("Left brace: {"), para("Right brace: }"), para("Left bracket: ["), @@ -83,7 +83,7 @@ def test_backslash_escapes para("Hash: #"), para("Period: ."), para("Bang: !"), - para("Plus: +"), + para("Plus: \\+"), para("Minus: -"), para("These should not, because they occur within a code block:"), @@ -142,8 +142,8 @@ def test_backslash_escapes para("These should get escaped, even though they're matching pairs for\n" + "other Markdown constructs:"), - para("\*asterisks\*"), - para("\_underscores\_"), + para("\\*asterisks\\*"), + para("\\_underscores\\_"), para("`backticks`"), para("This is a code span with a literal backslash-backtick " + @@ -227,7 +227,7 @@ def 
test_hard_wrapped_paragraphs_with_list_like_lines "middle of a paragraph looked like a\n" + "list item."), para("Here's one with a bullet.\n" + - "* criminey.")) + "\\* criminey.")) assert_equal expected, doc end @@ -866,7 +866,7 @@ def test_markdown_documentation_syntax para("To this end, Markdown's syntax is comprised entirely of punctuation\n" + "characters, which punctuation characters have been carefully chosen so\n" + "as to look like what they mean. E.g., asterisks around a word actually\n" + - "look like \*emphasis\*. Markdown lists look like, well, lists. Even\n" + + "look like \\*emphasis\\*. Markdown lists look like, well, lists. Even\n" + "blockquotes look like quoted passages of text, assuming you've ever\n" + "used email."),