From 025781d2c6f975f68590689f87bca55618ad9119 Mon Sep 17 00:00:00 2001 From: Anna Petrasova Date: Fri, 7 Mar 2025 13:18:14 -0500 Subject: [PATCH 01/11] doc: markdown to man pages --- utils/markdown2man.py | 296 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 296 insertions(+) create mode 100644 utils/markdown2man.py diff --git a/utils/markdown2man.py b/utils/markdown2man.py new file mode 100644 index 00000000000..717e9c4fe38 --- /dev/null +++ b/utils/markdown2man.py @@ -0,0 +1,296 @@ +import argparse +import sys +import re + +def strip_yaml_from_markdown(content): + # Remove YAML front matter + return re.sub(r'^---\n.*?\n---\n', '', content, flags=re.DOTALL) + + +def parse_markdown(content): + lines = content.splitlines() + processing_block = [] + processed_content = [] + + buffer = "" + state = "default" + + + for line in lines: + if line.strip().startswith("```"): + # end of code block + if state == "code": + processing_block.append(line) + processed_content.append({"markdown": "\n".join(processing_block), "type": state}) + processing_block = [] + state = "default" + # start of code block + else: + if buffer: + processing_block.append(buffer) + buffer = "" + processed_content.append({"markdown": "\n".join(processing_block), "type": state}) + processing_block = [] + else: + processing_block.append(line) + state = "code" + continue + + if state == "code": + processing_block.append(line) + continue + + if line.endswith(" ") : + buffer += line # Keep trailing spaces for markdown line breaks + processing_block.append(buffer) + buffer = "" + + if line.strip().startswith("- ") or line.startswith("* "): + + if buffer: + processing_block.append(buffer) + buffer = "" + + # start of list + if state != "list": + processed_content.append({"markdown": "\n".join(processing_block), "type": state}) + processing_block = [] + state = "list" + + processing_block.append(line) + continue + + if re.match(r"^\d+\.", line.strip()): + if buffer: + processing_block.append(buffer) + buffer = "" + # start of ordered list + if state != "olist": + processed_content.append({"markdown": "\n".join(processing_block), "type": state}) + processing_block = [] + state = "olist" + + processing_block.append(line) + continue + + # empty line at the start and end of code, list blocks + if line == "": + if buffer: + processing_block.append(buffer) + buffer = "" + if state != "default": + processed_content.append({"markdown": "\n".join(processing_block), "type": state}) + processing_block = [] + state = "default" + processing_block.append(line) + continue + + buffer += line + " " + + if buffer: + processing_block.append(buffer) + if processing_block: + processed_content.append({"markdown": "\n".join(processing_block), "type": state}) + + return processed_content + +def process_multiline_links(markdown): + # Regular expression to match links that may span multiple lines + link_pattern = re.compile(r'\[([^\]]*)\]\(([^)]+)\)', re.DOTALL) + image_link_pattern = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)', re.DOTALL) + + def replace_link(match): + # Strip leading and trailing whitespace or newlines from the link URL and text + alt_text = match.group(1).replace("\n", " ").strip() + url = match.group(2).replace("\n", "").strip() + return f'[{alt_text}]({url})' + + def replace_image_link(match): + # Strip leading and trailing whitespace or newlines from the link URL and text + alt_text = match.group(1).replace("\n", " ").strip() + url = match.group(2).replace("\n", "").strip() + return f'![{alt_text}]({url})' + + # Replace all matched links with the single-line version + markdown = re.sub(link_pattern, replace_link, markdown) + return re.sub(image_link_pattern, replace_image_link, markdown) + +def process_markdown_formatting(md_text): + # Regular expression to find multi-line formatting for bold, emphasis, and combined bold and emphasis + pattern = r'(\*\*\*([^\*]+)\*\*\*|\*\*([^\*]+)\*\*|\*([^\*]+)\*)' + + def replace_match(match): + # Match for combined bold and emphasis (***text***) + if match.group(1).startswith('***'): + content = match.group(2).replace('\n', ' ').strip() + return f"***{content}***" + + # Match for bold (**text**) + elif match.group(1).startswith('**'): + content = match.group(3).replace('\n', ' ').strip() + return f"**{content}**" + + # Match for emphasis (*text*) + elif match.group(1).startswith('*'): + content = match.group(4).replace('\n', ' ').strip() + return f"*{content}*" + + return match.group(0) # Return the original text if no match + + # Apply the regex pattern to replace formatting spans + processed_text = re.sub(pattern, replace_match, md_text) + + return processed_text + +def process_links(line): + """Replace Markdown links with only their display text.""" + line = re.sub(r"!\[.*?\]\(.*?\)", "", line) + return re.sub(r"\[(.*?)\]\((.*?)\)", r"\1", line) + +def process_parameters(line): + return re.sub(r"^\*\*([a-z0-9_]*)\*\*=\*([a-z]*)\*( \*\*\[required\]\*\*)?", r'.IP "**\1**=*\2*\3" 4m', line, flags=re.MULTILINE) + +def process_flags(line): + return re.sub(r"^\*\*-(.*?)\*\*", r'.IP "**-\1**" 4m', line, flags=re.MULTILINE) + +def process_formatting(line): + """Apply inline formatting for bold, italic, and bold+italic.""" + line = re.sub(r"\*\*\*(.+?)\*\*\*", r"\\fB\\fI\1\\fR", line) # Bold+Italic + line = re.sub(r"\*\*(.+?)\*\*", r"\\fB\1\\fR", line) # Bold + line = re.sub(r"\*(.+?)\*", r"\\fI\1\\fR", line) # Italic + line = line.replace("\u00A0", " ") # Replace non-breaking spaces with indent + + return line + +def process_br(line): + return re.sub(r"([^\n\s]) $", r"\1\n.br", line) + +def process_headings(markdown): + def convert_sh(match): + return f".SH {match.group(1).upper()}\n" + + def convert_ss(match): + return f".SS {match.group(1)}\n" + + markdown = re.sub(r"^#{1,2} (.*)", convert_sh, markdown, flags=re.MULTILINE) + markdown = re.sub(r"^#{3,} (.*)", convert_ss, markdown, flags=re.MULTILINE) + return markdown + +def process_code(markdown): + markdown = re.sub(r"\\", "\(rs", markdown) + return markdown + +def process_lists(markdown): + markdown = re.sub(r"^(\d+)\.\s+(.*)", r'.IP \\fB\1\\fR\n\2\n', markdown, flags=re.MULTILINE) + return re.sub(r"^(\d+)\.\s+(.*)", r'.IP \\fB\1\\fR\n\2\n', markdown, flags=re.MULTILINE) + +def process_non_code(markdown): + markdown_text = process_parameters(markdown) + markdown_text = process_flags(markdown_text) + markdown_text = markdown_text.replace("    ", "") + + markdown_text = re.sub(r"\\#", "#", markdown_text) + markdown_text = re.sub(r"\\>", ">", markdown_text) + markdown_text = re.sub(r"\\<", "<", markdown_text) + markdown_text = re.sub(r"\\", "\(rs", markdown_text) + markdown_text = re.sub(r"(?<=\S) {2,}(?=\S)", " ", markdown_text) + markdown_text = re.sub(r'(?<=\S) {2,}(?=\S)', ' ', markdown_text) + markdown_text = process_formatting(markdown_text) + markdown_text = process_links(markdown_text) + markdown_text = process_headings(markdown_text) + + return markdown_text + +def convert_line(line, in_paragraph, in_code_block): + """Convert a single line of Markdown to man page format, handling paragraph continuity.""" + if line.startswith("```"): + if in_code_block: + return "\\fR\n.fi\n", False, False # End code block + else: + return ".nf\n\\fC\n", False, True # Start code block with proper indent + + if in_code_block: + return f"{line}\n", False, True # Keep all whitespace in code blocks + + if not line.strip(): + return "", False, False # Empty line resets paragraph state + + #line = process_headings(line) + line = process_br(line) + + line = line.replace("`", "") + + if re.match(r"^[-*] (.+)", line): + return f".IP \"{re.sub(r'^[-*] ', '', line)}\" 4m\n.br\n", False, False + #if re.match(r"^\d+\. (.+)", line): + #return re.sub(r"^(\d+)\.\s+(.*)", r'.IP \\fB\1\\fR\n\2\n', line, flags=re.MULTILINE), False, False + if in_paragraph: + return line + "\n", True, False + else: + return f".PP\n{line}\n", True, False + + +def markdown_to_man(markdown_text): + """Convert a Markdown text to a Unix man page format""" + markdown_text = strip_yaml_from_markdown(markdown_text) + blocks = parse_markdown(markdown_text) + result = [] + for block in blocks: + if block["type"] == "code": + result.append(process_code(block["markdown"])) + elif block["type"] == "list": + result.append(process_lists(block["markdown"])) + elif block["type"] == "olist": + result.append(process_lists(block["markdown"])) + else: + result.append(process_non_code(block["markdown"])) + markdown_text = "\n".join(result) + #markdown_text = process_parameters(markdown_text) + #markdown_text = process_flags(markdown_text) + #markdown_text = markdown_text.replace("    ", "") + + #markdown_text = re.sub(r"\\#", "#", markdown_text) + #markdown_text = re.sub(r"\\>", ">", markdown_text) + #markdown_text = re.sub(r"\\<", "<", markdown_text) + #markdown_text = re.sub(r"\\", "\(rs", markdown_text) + #markdown_text = process_formatting(markdown_text) + #markdown_text = process_links(markdown_text) + + lines = markdown_text.splitlines() + man_page = [".TH MAN 1 \"Manual\"\n"] + in_paragraph = False + in_code_block = False + + print(markdown_text) + + for line in lines: + converted_line, in_paragraph, in_code_block = convert_line(line, in_paragraph, in_code_block) + man_page.append(converted_line) + + if in_code_block: + man_page.append(".fi\n") # Ensure proper closure of code block + + return "".join(man_page) + +def convert_markdown_to_man(input_file, output_file): + """Read Markdown file and convert to man page.""" + with open(input_file, "r") as f: + markdown_text = f.read() + + man_text = markdown_to_man(markdown_text) + + with open(output_file, "w") as f: + f.write(man_text) + + print(f"Man page generated: {output_file}") + +def main(): + parser = argparse.ArgumentParser(description="Convert Markdown to Unix man page.") + parser.add_argument("input_file", help="Path to the input Markdown file.") + parser.add_argument("output_file", help="Path to the output man page file.") + args = parser.parse_args() + + convert_markdown_to_man(args.input_file, args.output_file) + +if __name__ == "__main__": + main() From 9db21fefd67f5f97ece036197bde9eabfc5454bc Mon Sep 17 00:00:00 2001 From: Anna Petrasova Date: Fri, 7 Mar 2025 13:18:14 -0500 Subject: [PATCH 02/11] doc: markdown to man pages --- utils/markdown2man.py | 296 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 296 insertions(+) create mode 100644 utils/markdown2man.py diff --git a/utils/markdown2man.py b/utils/markdown2man.py new file mode 100644 index 00000000000..717e9c4fe38 --- /dev/null +++ b/utils/markdown2man.py @@ -0,0 +1,296 @@ +import argparse +import sys +import re + +def strip_yaml_from_markdown(content): + # Remove YAML front matter + return re.sub(r'^---\n.*?\n---\n', '', content, flags=re.DOTALL) + + +def parse_markdown(content): + lines = content.splitlines() + processing_block = [] + processed_content = [] + + buffer = "" + state = "default" + + + for line in lines: + if line.strip().startswith("```"): + # end of code block + if state == "code": + processing_block.append(line) + processed_content.append({"markdown": "\n".join(processing_block), "type": state}) + processing_block = [] + state = "default" + # start of code block + else: + if buffer: + processing_block.append(buffer) + buffer = "" + processed_content.append({"markdown": "\n".join(processing_block), "type": state}) + processing_block = [] + else: + processing_block.append(line) + state = "code" + continue + + if state == "code": + processing_block.append(line) + continue + + if line.endswith(" ") : + buffer += line # Keep trailing spaces for markdown line breaks + processing_block.append(buffer) + buffer = "" + + if line.strip().startswith("- ") or line.startswith("* "): + + if buffer: + processing_block.append(buffer) + buffer = "" + + # start of list + if state != "list": + processed_content.append({"markdown": "\n".join(processing_block), "type": state}) + processing_block = [] + state = "list" + + processing_block.append(line) + continue + + if re.match(r"^\d+\.", line.strip()): + if buffer: + processing_block.append(buffer) + buffer = "" + # start of ordered list + if state != "olist": + processed_content.append({"markdown": "\n".join(processing_block), "type": state}) + processing_block = [] + state = "olist" + + processing_block.append(line) + continue + + # empty line at the start and end of code, list blocks + if line == "": + if buffer: + processing_block.append(buffer) + buffer = "" + if state != "default": + processed_content.append({"markdown": "\n".join(processing_block), "type": state}) + processing_block = [] + state = "default" + processing_block.append(line) + continue + + buffer += line + " " + + if buffer: + processing_block.append(buffer) + if processing_block: + processed_content.append({"markdown": "\n".join(processing_block), "type": state}) + + return processed_content + +def process_multiline_links(markdown): + # Regular expression to match links that may span multiple lines + link_pattern = re.compile(r'\[([^\]]*)\]\(([^)]+)\)', re.DOTALL) + image_link_pattern = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)', re.DOTALL) + + def replace_link(match): + # Strip leading and trailing whitespace or newlines from the link URL and text + alt_text = match.group(1).replace("\n", " ").strip() + url = match.group(2).replace("\n", "").strip() + return f'[{alt_text}]({url})' + + def replace_image_link(match): + # Strip leading and trailing whitespace or newlines from the link URL and text + alt_text = match.group(1).replace("\n", " ").strip() + url = match.group(2).replace("\n", "").strip() + return f'![{alt_text}]({url})' + + # Replace all matched links with the single-line version + markdown = re.sub(link_pattern, replace_link, markdown) + return re.sub(image_link_pattern, replace_image_link, markdown) + +def process_markdown_formatting(md_text): + # Regular expression to find multi-line formatting for bold, emphasis, and combined bold and emphasis + pattern = r'(\*\*\*([^\*]+)\*\*\*|\*\*([^\*]+)\*\*|\*([^\*]+)\*)' + + def replace_match(match): + # Match for combined bold and emphasis (***text***) + if match.group(1).startswith('***'): + content = match.group(2).replace('\n', ' ').strip() + return f"***{content}***" + + # Match for bold (**text**) + elif match.group(1).startswith('**'): + content = match.group(3).replace('\n', ' ').strip() + return f"**{content}**" + + # Match for emphasis (*text*) + elif match.group(1).startswith('*'): + content = match.group(4).replace('\n', ' ').strip() + return f"*{content}*" + + return match.group(0) # Return the original text if no match + + # Apply the regex pattern to replace formatting spans + processed_text = re.sub(pattern, replace_match, md_text) + + return processed_text + +def process_links(line): + """Replace Markdown links with only their display text.""" + line = re.sub(r"!\[.*?\]\(.*?\)", "", line) + return re.sub(r"\[(.*?)\]\((.*?)\)", r"\1", line) + +def process_parameters(line): + return re.sub(r"^\*\*([a-z0-9_]*)\*\*=\*([a-z]*)\*( \*\*\[required\]\*\*)?", r'.IP "**\1**=*\2*\3" 4m', line, flags=re.MULTILINE) + +def process_flags(line): + return re.sub(r"^\*\*-(.*?)\*\*", r'.IP "**-\1**" 4m', line, flags=re.MULTILINE) + +def process_formatting(line): + """Apply inline formatting for bold, italic, and bold+italic.""" + line = re.sub(r"\*\*\*(.+?)\*\*\*", r"\\fB\\fI\1\\fR", line) # Bold+Italic + line = re.sub(r"\*\*(.+?)\*\*", r"\\fB\1\\fR", line) # Bold + line = re.sub(r"\*(.+?)\*", r"\\fI\1\\fR", line) # Italic + line = line.replace("\u00A0", " ") # Replace non-breaking spaces with indent + + return line + +def process_br(line): + return re.sub(r"([^\n\s]) $", r"\1\n.br", line) + +def process_headings(markdown): + def convert_sh(match): + return f".SH {match.group(1).upper()}\n" + + def convert_ss(match): + return f".SS {match.group(1)}\n" + + markdown = re.sub(r"^#{1,2} (.*)", convert_sh, markdown, flags=re.MULTILINE) + markdown = re.sub(r"^#{3,} (.*)", convert_ss, markdown, flags=re.MULTILINE) + return markdown + +def process_code(markdown): + markdown = re.sub(r"\\", "\(rs", markdown) + return markdown + +def process_lists(markdown): + markdown = re.sub(r"^(\d+)\.\s+(.*)", r'.IP \\fB\1\\fR\n\2\n', markdown, flags=re.MULTILINE) + return re.sub(r"^(\d+)\.\s+(.*)", r'.IP \\fB\1\\fR\n\2\n', markdown, flags=re.MULTILINE) + +def process_non_code(markdown): + markdown_text = process_parameters(markdown) + markdown_text = process_flags(markdown_text) + markdown_text = markdown_text.replace("    ", "") + + markdown_text = re.sub(r"\\#", "#", markdown_text) + markdown_text = re.sub(r"\\>", ">", markdown_text) + markdown_text = re.sub(r"\\<", "<", markdown_text) + markdown_text = re.sub(r"\\", "\(rs", markdown_text) + markdown_text = re.sub(r"(?<=\S) {2,}(?=\S)", " ", markdown_text) + markdown_text = re.sub(r'(?<=\S) {2,}(?=\S)', ' ', markdown_text) + markdown_text = process_formatting(markdown_text) + markdown_text = process_links(markdown_text) + markdown_text = process_headings(markdown_text) + + return markdown_text + +def convert_line(line, in_paragraph, in_code_block): + """Convert a single line of Markdown to man page format, handling paragraph continuity.""" + if line.startswith("```"): + if in_code_block: + return "\\fR\n.fi\n", False, False # End code block + else: + return ".nf\n\\fC\n", False, True # Start code block with proper indent + + if in_code_block: + return f"{line}\n", False, True # Keep all whitespace in code blocks + + if not line.strip(): + return "", False, False # Empty line resets paragraph state + + #line = process_headings(line) + line = process_br(line) + + line = line.replace("`", "") + + if re.match(r"^[-*] (.+)", line): + return f".IP \"{re.sub(r'^[-*] ', '', line)}\" 4m\n.br\n", False, False + #if re.match(r"^\d+\. (.+)", line): + #return re.sub(r"^(\d+)\.\s+(.*)", r'.IP \\fB\1\\fR\n\2\n', line, flags=re.MULTILINE), False, False + if in_paragraph: + return line + "\n", True, False + else: + return f".PP\n{line}\n", True, False + + +def markdown_to_man(markdown_text): + """Convert a Markdown text to a Unix man page format""" + markdown_text = strip_yaml_from_markdown(markdown_text) + blocks = parse_markdown(markdown_text) + result = [] + for block in blocks: + if block["type"] == "code": + result.append(process_code(block["markdown"])) + elif block["type"] == "list": + result.append(process_lists(block["markdown"])) + elif block["type"] == "olist": + result.append(process_lists(block["markdown"])) + else: + result.append(process_non_code(block["markdown"])) + markdown_text = "\n".join(result) + #markdown_text = process_parameters(markdown_text) + #markdown_text = process_flags(markdown_text) + #markdown_text = markdown_text.replace("    ", "") + + #markdown_text = re.sub(r"\\#", "#", markdown_text) + #markdown_text = re.sub(r"\\>", ">", markdown_text) + #markdown_text = re.sub(r"\\<", "<", markdown_text) + #markdown_text = re.sub(r"\\", "\(rs", markdown_text) + #markdown_text = process_formatting(markdown_text) + #markdown_text = process_links(markdown_text) + + lines = markdown_text.splitlines() + man_page = [".TH MAN 1 \"Manual\"\n"] + in_paragraph = False + in_code_block = False + + print(markdown_text) + + for line in lines: + converted_line, in_paragraph, in_code_block = convert_line(line, in_paragraph, in_code_block) + man_page.append(converted_line) + + if in_code_block: + man_page.append(".fi\n") # Ensure proper closure of code block + + return "".join(man_page) + +def convert_markdown_to_man(input_file, output_file): + """Read Markdown file and convert to man page.""" + with open(input_file, "r") as f: + markdown_text = f.read() + + man_text = markdown_to_man(markdown_text) + + with open(output_file, "w") as f: + f.write(man_text) + + print(f"Man page generated: {output_file}") + +def main(): + parser = argparse.ArgumentParser(description="Convert Markdown to Unix man page.") + parser.add_argument("input_file", help="Path to the input Markdown file.") + parser.add_argument("output_file", help="Path to the output man page file.") + args = parser.parse_args() + + convert_markdown_to_man(args.input_file, args.output_file) + +if __name__ == "__main__": + main() From a64b969ba8498a228a20d62bb98c3cb541555e6d Mon Sep 17 00:00:00 2001 From: Anna Petrasova Date: Tue, 11 Mar 2025 14:28:13 -0400 Subject: [PATCH 03/11] update converter --- utils/markdown2man.py | 384 +++++++++++++++++++----------------------- 1 file changed, 171 insertions(+), 213 deletions(-) diff --git a/utils/markdown2man.py b/utils/markdown2man.py index 717e9c4fe38..a17e93025e5 100644 --- a/utils/markdown2man.py +++ b/utils/markdown2man.py @@ -1,77 +1,69 @@ import argparse -import sys import re +from pathlib import Path + def strip_yaml_from_markdown(content): # Remove YAML front matter - return re.sub(r'^---\n.*?\n---\n', '', content, flags=re.DOTALL) + return re.sub(r"^---\n.*?\n---\n", "", content, flags=re.DOTALL) def parse_markdown(content): lines = content.splitlines() processing_block = [] processed_content = [] - + buffer = "" state = "default" - - + for line in lines: if line.strip().startswith("```"): # end of code block if state == "code": processing_block.append(line) - processed_content.append({"markdown": "\n".join(processing_block), "type": state}) + processed_content.append( + {"markdown": "\n".join(processing_block), "type": state} + ) processing_block = [] state = "default" # start of code block else: - if buffer: - processing_block.append(buffer) - buffer = "" - processed_content.append({"markdown": "\n".join(processing_block), "type": state}) - processing_block = [] - else: - processing_block.append(line) + processed_content.append( + {"markdown": "\n".join(processing_block), "type": state} + ) + processing_block = [] + processing_block.append(line) state = "code" continue if state == "code": processing_block.append(line) continue - - if line.endswith(" ") : - buffer += line # Keep trailing spaces for markdown line breaks - processing_block.append(buffer) - buffer = "" - if line.strip().startswith("- ") or line.startswith("* "): - + # if line.strip().startswith("- ") or line.strip().startswith("* "): + # if buffer: + # processing_block.append(buffer) + # buffer = "" + + # # start of list + # if state != "list": + # processed_content.append( + # {"markdown": "\n".join(processing_block), "type": state} + # ) + # processing_block = [] + # state = "list" + + if re.match(r"^(\s*)([-*]|\d+\.)\s+(.*)", line.strip()): if buffer: processing_block.append(buffer) buffer = "" - - # start of list + # start of ordered list if state != "list": - processed_content.append({"markdown": "\n".join(processing_block), "type": state}) + processed_content.append( + {"markdown": "\n".join(processing_block), "type": state} + ) processing_block = [] state = "list" - - processing_block.append(line) - continue - - if re.match(r"^\d+\.", line.strip()): - if buffer: - processing_block.append(buffer) - buffer = "" - # start of ordered list - if state != "olist": - processed_content.append({"markdown": "\n".join(processing_block), "type": state}) - processing_block = [] - state = "olist" - - processing_block.append(line) - continue # empty line at the start and end of code, list blocks if line == "": @@ -79,218 +71,184 @@ def parse_markdown(content): processing_block.append(buffer) buffer = "" if state != "default": - processed_content.append({"markdown": "\n".join(processing_block), "type": state}) + processed_content.append( + {"markdown": "\n".join(processing_block), "type": state} + ) processing_block = [] state = "default" processing_block.append(line) continue - buffer += line + " " - + if buffer: + buffer += " " + line + else: + buffer += line + + if line.endswith(" "): + processing_block.append(buffer) + buffer = "" + if buffer: processing_block.append(buffer) if processing_block: - processed_content.append({"markdown": "\n".join(processing_block), "type": state}) - - return processed_content - -def process_multiline_links(markdown): - # Regular expression to match links that may span multiple lines - link_pattern = re.compile(r'\[([^\]]*)\]\(([^)]+)\)', re.DOTALL) - image_link_pattern = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)', re.DOTALL) - - def replace_link(match): - # Strip leading and trailing whitespace or newlines from the link URL and text - alt_text = match.group(1).replace("\n", " ").strip() - url = match.group(2).replace("\n", "").strip() - return f'[{alt_text}]({url})' - - def replace_image_link(match): - # Strip leading and trailing whitespace or newlines from the link URL and text - alt_text = match.group(1).replace("\n", " ").strip() - url = match.group(2).replace("\n", "").strip() - return f'![{alt_text}]({url})' - - # Replace all matched links with the single-line version - markdown = re.sub(link_pattern, replace_link, markdown) - return re.sub(image_link_pattern, replace_image_link, markdown) - -def process_markdown_formatting(md_text): - # Regular expression to find multi-line formatting for bold, emphasis, and combined bold and emphasis - pattern = r'(\*\*\*([^\*]+)\*\*\*|\*\*([^\*]+)\*\*|\*([^\*]+)\*)' - - def replace_match(match): - # Match for combined bold and emphasis (***text***) - if match.group(1).startswith('***'): - content = match.group(2).replace('\n', ' ').strip() - return f"***{content}***" - - # Match for bold (**text**) - elif match.group(1).startswith('**'): - content = match.group(3).replace('\n', ' ').strip() - return f"**{content}**" - - # Match for emphasis (*text*) - elif match.group(1).startswith('*'): - content = match.group(4).replace('\n', ' ').strip() - return f"*{content}*" - - return match.group(0) # Return the original text if no match - - # Apply the regex pattern to replace formatting spans - processed_text = re.sub(pattern, replace_match, md_text) - - return processed_text - -def process_links(line): + processed_content.append( + {"markdown": "\n".join(processing_block), "type": state} + ) + + merged_content = [] + for item in processed_content: + if not item["markdown"]: + continue + if merged_content and merged_content[-1]["type"] == item["type"]: + merged_content[-1]["markdown"] += "\n" + item["markdown"] + else: + merged_content.append(item) + + return merged_content + + +def process_links(markdown): """Replace Markdown links with only their display text.""" - line = re.sub(r"!\[.*?\]\(.*?\)", "", line) - return re.sub(r"\[(.*?)\]\((.*?)\)", r"\1", line) + markdown = re.sub(r"!\[.*?\]\(.*?\)", "", markdown) + return re.sub(r"\[(.*?)\]\((.*?)\)", r"\1", markdown) + -def process_parameters(line): - return re.sub(r"^\*\*([a-z0-9_]*)\*\*=\*([a-z]*)\*( \*\*\[required\]\*\*)?", r'.IP "**\1**=*\2*\3" 4m', line, flags=re.MULTILINE) +def process_parameters(markdown): + return re.sub( + r"^\*\*([a-z0-9_]*)\*\*=\*([a-z]*)\*( \*\*\[required\]\*\*)?", + r'.IP "**\1**=*\2*\3" 4m', + markdown, + flags=re.MULTILINE, + ) -def process_flags(line): - return re.sub(r"^\*\*-(.*?)\*\*", r'.IP "**-\1**" 4m', line, flags=re.MULTILINE) - -def process_formatting(line): + +def process_flags(markdown): + return re.sub(r"^\*\*-(.*?)\*\*", r'.IP "**-\1**" 4m', markdown, flags=re.MULTILINE) + + +def process_formatting(markdown): """Apply inline formatting for bold, italic, and bold+italic.""" - line = re.sub(r"\*\*\*(.+?)\*\*\*", r"\\fB\\fI\1\\fR", line) # Bold+Italic - line = re.sub(r"\*\*(.+?)\*\*", r"\\fB\1\\fR", line) # Bold - line = re.sub(r"\*(.+?)\*", r"\\fI\1\\fR", line) # Italic - line = line.replace("\u00A0", " ") # Replace non-breaking spaces with indent - - return line - -def process_br(line): - return re.sub(r"([^\n\s]) $", r"\1\n.br", line) + markdown = re.sub(r"\*\*\*(.+?)\*\*\*", r"\\fB\\fI\1\\fR", markdown) + markdown = re.sub(r"\*\*(.+?)\*\*", r"\\fB\1\\fR", markdown) + return re.sub(r"\*(.+?)\*", r"\\fI\1\\fR", markdown) + + +def process_br(markdown): + return re.sub(r"([^\n\s]) $", r"\1\n.br", markdown, flags=re.MULTILINE) + def process_headings(markdown): def convert_sh(match): - return f".SH {match.group(1).upper()}\n" + return f".SH {match.group(1).upper()}" def convert_ss(match): - return f".SS {match.group(1)}\n" + return f".SS {match.group(1)}" markdown = re.sub(r"^#{1,2} (.*)", convert_sh, markdown, flags=re.MULTILINE) - markdown = re.sub(r"^#{3,} (.*)", convert_ss, markdown, flags=re.MULTILINE) - return markdown + return re.sub(r"^#{3,} (.*)", convert_ss, markdown, flags=re.MULTILINE) + def process_code(markdown): - markdown = re.sub(r"\\", "\(rs", markdown) - return markdown + in_code_block = False + output = [] + for line in markdown.splitlines(): + if line.lstrip().startswith("```"): + if in_code_block: + output.append("\\fR\n.fi\n") # End code block + else: + output.append(".nf\n\\fC\n") # Start code block + in_code_block = not in_code_block + else: + output.append(re.sub(r"\\", r"\(rs", line)) + + return "\n".join(output) + def process_lists(markdown): - markdown = re.sub(r"^(\d+)\.\s+(.*)", r'.IP \\fB\1\\fR\n\2\n', markdown, flags=re.MULTILINE) - return re.sub(r"^(\d+)\.\s+(.*)", r'.IP \\fB\1\\fR\n\2\n', markdown, flags=re.MULTILINE) - -def process_non_code(markdown): - markdown_text = process_parameters(markdown) - markdown_text = process_flags(markdown_text) - markdown_text = markdown_text.replace("    ", "") - - markdown_text = re.sub(r"\\#", "#", markdown_text) - markdown_text = re.sub(r"\\>", ">", markdown_text) - markdown_text = re.sub(r"\\<", "<", markdown_text) - markdown_text = re.sub(r"\\", "\(rs", markdown_text) - markdown_text = re.sub(r"(?<=\S) {2,}(?=\S)", " ", markdown_text) - markdown_text = re.sub(r'(?<=\S) {2,}(?=\S)', ' ', markdown_text) - markdown_text = process_formatting(markdown_text) - markdown_text = process_links(markdown_text) - markdown_text = process_headings(markdown_text) - - return markdown_text - -def convert_line(line, in_paragraph, in_code_block): - """Convert a single line of Markdown to man page format, handling paragraph continuity.""" - if line.startswith("```"): - if in_code_block: - return "\\fR\n.fi\n", False, False # End code block - else: - return ".nf\n\\fC\n", False, True # Start code block with proper indent - - if in_code_block: - return f"{line}\n", False, True # Keep all whitespace in code blocks - - if not line.strip(): - return "", False, False # Empty line resets paragraph state - - #line = process_headings(line) - line = process_br(line) - - line = line.replace("`", "") - - if re.match(r"^[-*] (.+)", line): - return f".IP \"{re.sub(r'^[-*] ', '', line)}\" 4m\n.br\n", False, False - #if re.match(r"^\d+\. (.+)", line): - #return re.sub(r"^(\d+)\.\s+(.*)", r'.IP \\fB\1\\fR\n\2\n', line, flags=re.MULTILINE), False, False - if in_paragraph: - return line + "\n", True, False - else: - return f".PP\n{line}\n", True, False - - -def markdown_to_man(markdown_text): - """Convert a Markdown text to a Unix man page format""" - markdown_text = strip_yaml_from_markdown(markdown_text) - blocks = parse_markdown(markdown_text) - result = [] + markdown = process_special_characters(markdown) + markdown = process_formatting(markdown) + markdown = process_links(markdown) + + output = [] + indent_levels = [] + + for line in markdown.splitlines(): + match = re.match(r"^(\s*)([-*]|\d+\.)\s+(.*)", line) # Match bullets or numbers + if not match: + continue # Skip non-list lines (shouldn't happen if input is all lists) + + spaces, bullet, item_text = match.groups() + level = len(spaces) # Determine indentation level + + while indent_levels and indent_levels[-1] > level: + output.append(".RE") # Close previous indentation level + indent_levels.pop() + + if not indent_levels or indent_levels[-1] < level: + output.append(".RS 4n") # Open new indentation level + indent_levels.append(level) + + if re.match(r"^\d+\.$", bullet): # Numbered list + output.append(f'.IP "{bullet}" 4n\n{item_text}') + else: # Bullet list + output.append(".IP \\(bu 4n\n" + item_text) + + # Close any remaining indentation levels + while indent_levels: + output.append(".RE") + indent_levels.pop() + + return "\n".join(output) + + +def process_special_characters(markdown): + markdown = markdown.replace(r"\[", "[") + markdown = markdown.replace(r"\]", "]") + markdown = markdown.replace(r"\#", "#") + markdown = markdown.replace(r"\>", ">") + markdown = markdown.replace(r"\<", "<") + markdown = markdown.replace("`", "") + # eliminate extra spaces between words + markdown = re.sub(r"(?<=\S) {2,}(?=\S)", " ", markdown) + return re.sub(r"\\", r"\(rs", markdown) + + +def process_default(markdown): + markdown = process_br(markdown) + markdown = process_parameters(markdown) + markdown = process_flags(markdown) + markdown = markdown.replace("    ", "") + markdown = process_special_characters(markdown) + markdown = process_formatting(markdown) + markdown = process_links(markdown) + return process_headings(markdown) + + +def convert_markdown_to_man(input_file, output_file): + """Read Markdown file and convert to man page.""" + markdown = Path(input_file).read_text() + markdown = strip_yaml_from_markdown(markdown) + blocks = parse_markdown(markdown) + result = ['.TH MAN 1 "Manual"\n'] for block in blocks: if block["type"] == "code": result.append(process_code(block["markdown"])) elif block["type"] == "list": result.append(process_lists(block["markdown"])) - elif block["type"] == "olist": - result.append(process_lists(block["markdown"])) else: - result.append(process_non_code(block["markdown"])) - markdown_text = "\n".join(result) - #markdown_text = process_parameters(markdown_text) - #markdown_text = process_flags(markdown_text) - #markdown_text = markdown_text.replace("    ", "") - - #markdown_text = re.sub(r"\\#", "#", markdown_text) - #markdown_text = re.sub(r"\\>", ">", markdown_text) - #markdown_text = re.sub(r"\\<", "<", markdown_text) - #markdown_text = re.sub(r"\\", "\(rs", markdown_text) - #markdown_text = process_formatting(markdown_text) - #markdown_text = process_links(markdown_text) - - lines = markdown_text.splitlines() - man_page = [".TH MAN 1 \"Manual\"\n"] - in_paragraph = False - in_code_block = False + result.append(process_default(block["markdown"])) - print(markdown_text) - - for line in lines: - converted_line, in_paragraph, in_code_block = convert_line(line, in_paragraph, in_code_block) - man_page.append(converted_line) - - if in_code_block: - man_page.append(".fi\n") # Ensure proper closure of code block - - return "".join(man_page) + Path(output_file).write_text("\n".join(result)) -def convert_markdown_to_man(input_file, output_file): - """Read Markdown file and convert to man page.""" - with open(input_file, "r") as f: - markdown_text = f.read() - - man_text = markdown_to_man(markdown_text) - - with open(output_file, "w") as f: - f.write(man_text) - - print(f"Man page generated: {output_file}") def main(): parser = argparse.ArgumentParser(description="Convert Markdown to Unix man page.") parser.add_argument("input_file", help="Path to the input Markdown file.") parser.add_argument("output_file", help="Path to the output man page file.") args = parser.parse_args() - + convert_markdown_to_man(args.input_file, args.output_file) + if __name__ == "__main__": main() From e3b0d899eb795f34f6c4444732b4c6cac70df4a8 Mon Sep 17 00:00:00 2001 From: Anna Petrasova Date: Tue, 11 Mar 2025 15:30:37 -0400 Subject: [PATCH 04/11] switch to generating man pages from markdown --- CMakeLists.txt | 4 ++-- cmake/generate_man_pages.cmake | 8 ++++---- include/Make/Grass.make | 2 +- include/Make/Html.make | 4 ++-- utils/Makefile | 4 ++++ utils/markdown2man.py | 14 +------------- 6 files changed, 14 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e8ca94e277c..a57c7770b30 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -214,8 +214,8 @@ include(generate_html) set(MKHTML_PY ${CMAKE_BINARY_DIR}/utils/mkhtml.py) set(THUMBNAILS_PY ${CMAKE_BINARY_DIR}/utils/thumbnails.py) -set(HTML2MAN VERSION_NUMBER=${GRASS_VERSION_NUMBER} ${PYTHON_EXECUTABLE} - ${OUTDIR}/${GRASS_INSTALL_UTILSDIR}/g.html2man.py) +set(MD2MAN VERSION_NUMBER=${GRASS_VERSION_NUMBER} ${PYTHON_EXECUTABLE} + ${OUTDIR}/${GRASS_INSTALL_UTILSDIR}/markdown2man.py) set(env_path "$ENV{PATH}") diff --git a/cmake/generate_man_pages.cmake b/cmake/generate_man_pages.cmake index 85553ef8629..9ff22f2ad33 100644 --- a/cmake/generate_man_pages.cmake +++ b/cmake/generate_man_pages.cmake @@ -1,14 +1,14 @@ # work in progress... -file(GLOB doc_HTMLFILES "${OUTDIR}/${GRASS_INSTALL_DOCDIR}/*.html") +file(GLOB doc_MDFILES "${OUTDIR}/${GRASS_INSTALL_MKDOCSDIR}/source/*.md") -foreach(html_file ${doc_HTMLFILES}) - get_filename_component(PGM_NAME ${html_file} NAME) +foreach(md_file ${doc_MDFILES}) + get_filename_component(PGM_NAME ${md_file} NAME) add_custom_command( TARGET create_man_pages PRE_BUILD - COMMAND ${HTML2MAN} ${OUTDIR}/${GRASS_INSTALL_DOCDIR}/${PGM_NAME}.html + COMMAND ${MD2MAN} ${OUTDIR}/${GRASS_INSTALL_MKDOCSDIR}/${PGM_NAME}.html ${OUTDIR}/${GRASS_INSTALL_MANDIR}/${PGM_NAME}.1 ) endforeach() diff --git a/include/Make/Grass.make b/include/Make/Grass.make index dfcbd974b6a..ef8e09b66d4 100644 --- a/include/Make/Grass.make +++ b/include/Make/Grass.make @@ -98,7 +98,7 @@ YFLAGS = -d -v MANSECT = 1 MANBASEDIR = $(ARCH_DISTDIR)/docs/man MANDIR = $(MANBASEDIR)/man$(MANSECT) -HTML2MAN = VERSION_NUMBER=$(GRASS_VERSION_NUMBER) $(GISBASE)/utils/g.html2man.py +MD2MAN = VERSION_NUMBER=$(GRASS_VERSION_NUMBER) $(GISBASE)/utils/markdown2man.py GDAL_LINK = $(USE_GDAL) diff --git a/include/Make/Html.make b/include/Make/Html.make index 42d5284ecd9..626328c59fa 100644 --- a/include/Make/Html.make +++ b/include/Make/Html.make @@ -11,8 +11,8 @@ $(MDDIR)/source/%.md: %.md %.tmp.md $(HTMLSRC) $(IMGDST_MD) | $(MDDIR) VERSION_NUMBER=$(GRASS_VERSION_NUMBER) VERSION_DATE=$(GRASS_VERSION_DATE) MODULE_TOPDIR=$(MODULE_TOPDIR) \ $(PYTHON) $(GISBASE)/utils/mkmarkdown.py $* > $@ -$(MANDIR)/%.$(MANSECT): $(HTMLDIR)/%.html - $(HTML2MAN) "$<" "$@" +$(MANDIR)/%.$(MANSECT): $(MDDIR)/source/%.md + $(MD2MAN) "$<" "$@" %.tmp.html: $(HTMLSRC) if [ "$(HTMLSRC)" != "" ] ; then $(call htmldesc,$<,$@) ; fi diff --git a/utils/Makefile b/utils/Makefile index 8137e1b7e63..0e38eab95f3 100644 --- a/utils/Makefile +++ b/utils/Makefile @@ -7,6 +7,7 @@ include $(MODULE_TOPDIR)/include/Make/Compile.make default: parsubdirs $(UTILSDIR)/mkdocs.py $(UTILSDIR)/mkhtml.py $(UTILSDIR)/mkmarkdown.py \ $(UTILSDIR)/generate_last_commit_file.py \ + $(UTILSDIR)/markdown2man.py \ $(UTILSDIR)/g.echo$(EXE) $(UTILSDIR)/mkdocs.py: mkdocs.py @@ -21,5 +22,8 @@ $(UTILSDIR)/mkmarkdown.py: mkmarkdown.py $(UTILSDIR)/generate_last_commit_file.py: generate_last_commit_file.py $(INSTALL) $< $@ +$(UTILSDIR)/markdown2man.py: markdown2man.py + $(INSTALL) $< $@ + $(UTILSDIR)/g.echo$(EXE): $(OBJDIR)/g.echo.o $(call linker_base,$(LINK),$(LDFLAGS) $(EXTRA_LDFLAGS),$(MANIFEST_OBJ)) diff --git a/utils/markdown2man.py b/utils/markdown2man.py index a17e93025e5..4d3426ca190 100644 --- a/utils/markdown2man.py +++ b/utils/markdown2man.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import re from pathlib import Path @@ -40,19 +41,6 @@ def parse_markdown(content): processing_block.append(line) continue - # if line.strip().startswith("- ") or line.strip().startswith("* "): - # if buffer: - # processing_block.append(buffer) - # buffer = "" - - # # start of list - # if state != "list": - # processed_content.append( - # {"markdown": "\n".join(processing_block), "type": state} - # ) - # processing_block = [] - # state = "list" - if re.match(r"^(\s*)([-*]|\d+\.)\s+(.*)", line.strip()): if buffer: processing_block.append(buffer) From f50b13669dc813dd10be56ab99b89fde1b67302b Mon Sep 17 00:00:00 2001 From: Anna Petrasova Date: Tue, 11 Mar 2025 15:33:50 -0400 Subject: [PATCH 05/11] fix whitespace --- include/Make/Html.make | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/Make/Html.make b/include/Make/Html.make index 626328c59fa..7d7e843ab1b 100644 --- a/include/Make/Html.make +++ b/include/Make/Html.make @@ -12,7 +12,7 @@ $(MDDIR)/source/%.md: %.md %.tmp.md $(HTMLSRC) $(IMGDST_MD) | $(MDDIR) $(PYTHON) $(GISBASE)/utils/mkmarkdown.py $* > $@ $(MANDIR)/%.$(MANSECT): $(MDDIR)/source/%.md - $(MD2MAN) "$<" "$@" + $(MD2MAN) "$<" "$@" %.tmp.html: $(HTMLSRC) if [ "$(HTMLSRC)" != "" ] ; then $(call htmldesc,$<,$@) ; fi From 50f532d42a14b04cc306b7bf9411903c35ac6413 Mon Sep 17 00:00:00 2001 From: Anna Petrasova Date: Tue, 11 Mar 2025 16:08:31 -0400 Subject: [PATCH 06/11] fix build --- man/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/Makefile b/man/Makefile index 4bbe145b49a..c1c3a0d845a 100644 --- a/man/Makefile +++ b/man/Makefile @@ -66,8 +66,8 @@ default: $(DSTFILES) @echo "Generating manual pages index (help system)..." $(MAKE) $(INDICES) $(call build,check) - $(MAKE) manpages $(MAKE) $(INDICES_MD) + $(MAKE) manpages # $(MAKE) build-mkdocs # This must be a separate target so that evaluation of $(MANPAGES) From de650052f0d3f5f509cfb3d9312f89dd56aabbd9 Mon Sep 17 00:00:00 2001 From: Anna Petrasova Date: Wed, 12 Mar 2025 12:09:14 -0400 Subject: [PATCH 07/11] Update utils/markdown2man.py Co-authored-by: Markus Neteler --- utils/markdown2man.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/utils/markdown2man.py b/utils/markdown2man.py index 4d3426ca190..eb4d4cbc42d 100644 --- a/utils/markdown2man.py +++ b/utils/markdown2man.py @@ -1,4 +1,18 @@ #!/usr/bin/env python3 + +############################################################################### +# Convert manual pages from markdown to MAN format +# +# Author(s): Anna Petrasova +# +# COPYRIGHT: (C) 2025 by the GRASS Development Team +# +# This program is free software under the GNU General Public +# License (>=v2). Read the file COPYING that comes with GRASS +# for details. +# +############################################################################### + import argparse import re from pathlib import Path From 75a4b4310bacd6239079243378297f4ab32f8cb8 Mon Sep 17 00:00:00 2001 From: Anna Petrasova Date: Fri, 25 Jul 2025 12:40:34 -0400 Subject: [PATCH 08/11] fixes --- utils/markdown2man.py | 242 +++++++++++++++++++++++++----------------- 1 file changed, 142 insertions(+), 100 deletions(-) diff --git a/utils/markdown2man.py b/utils/markdown2man.py index 717e9c4fe38..4fba94b81de 100644 --- a/utils/markdown2man.py +++ b/utils/markdown2man.py @@ -1,75 +1,100 @@ import argparse -import sys import re +from pathlib import Path + def strip_yaml_from_markdown(content): # Remove YAML front matter - return re.sub(r'^---\n.*?\n---\n', '', content, flags=re.DOTALL) + return re.sub(r"^---\n.*?\n---\n", "", content, flags=re.DOTALL).strip() + + +def simplify_synopsis(content): + before, after = re.split( + r"(?=^## DESCRIPTION\b)", content, maxsplit=1, flags=re.MULTILINE + ) + + # Remove Python sections (both command and parameters) + text = re.sub( + r'^=== "Python .*?"\n(?:\n| {4}.*\n)*', "", before, flags=re.MULTILINE + ) + text = re.sub(r'^=== "Command line"\n', "", text, flags=re.MULTILINE) + # Unindent remaining lines by 4 spaces (if they start with at least 4 spaces) + text = re.sub(r"^ {4}", "", text, flags=re.MULTILINE) + return text + after def parse_markdown(content): lines = content.splitlines() processing_block = [] processed_content = [] - + buffer = "" state = "default" - - + code_indent = 0 + for line in lines: if line.strip().startswith("```"): # end of code block if state == "code": processing_block.append(line) - processed_content.append({"markdown": "\n".join(processing_block), "type": state}) + processed_content.append( + {"markdown": "\n".join(processing_block), "type": state} + ) processing_block = [] state = "default" # start of code block else: + code_indent = len(re.match(r" *", line).group()) if buffer: processing_block.append(buffer) buffer = "" - processed_content.append({"markdown": "\n".join(processing_block), "type": state}) - processing_block = [] - else: - processing_block.append(line) + processed_content.append( + {"markdown": "\n".join(processing_block), "type": state} + ) + processing_block = [line] state = "code" continue if state == "code": - processing_block.append(line) + processing_block.append( + line[code_indent:] if line.startswith(" " * code_indent) else line + ) continue - - if line.endswith(" ") : + + if line.endswith(" "): buffer += line # Keep trailing spaces for markdown line breaks processing_block.append(buffer) buffer = "" + continue if line.strip().startswith("- ") or line.startswith("* "): - if buffer: processing_block.append(buffer) buffer = "" - + # start of list if state != "list": - processed_content.append({"markdown": "\n".join(processing_block), "type": state}) + processed_content.append( + {"markdown": "\n".join(processing_block), "type": state} + ) processing_block = [] state = "list" - - processing_block.append(line) - continue - + + processing_block.append(line) + continue + if re.match(r"^\d+\.", line.strip()): if buffer: processing_block.append(buffer) buffer = "" # start of ordered list if state != "olist": - processed_content.append({"markdown": "\n".join(processing_block), "type": state}) + processed_content.append( + {"markdown": "\n".join(processing_block), "type": state} + ) processing_block = [] state = "olist" - + processing_block.append(line) continue @@ -79,92 +104,105 @@ def parse_markdown(content): processing_block.append(buffer) buffer = "" if state != "default": - processed_content.append({"markdown": "\n".join(processing_block), "type": state}) + processed_content.append( + {"markdown": "\n".join(processing_block), "type": state} + ) processing_block = [] state = "default" processing_block.append(line) continue buffer += line + " " - + if buffer: processing_block.append(buffer) if processing_block: - processed_content.append({"markdown": "\n".join(processing_block), "type": state}) - + processed_content.append( + {"markdown": "\n".join(processing_block), "type": state} + ) + return processed_content + def process_multiline_links(markdown): # Regular expression to match links that may span multiple lines - link_pattern = re.compile(r'\[([^\]]*)\]\(([^)]+)\)', re.DOTALL) - image_link_pattern = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)', re.DOTALL) + link_pattern = re.compile(r"\[([^\]]*)\]\(([^)]+)\)", re.DOTALL) + image_link_pattern = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)", re.DOTALL) def replace_link(match): # Strip leading and trailing whitespace or newlines from the link URL and text alt_text = match.group(1).replace("\n", " ").strip() url = match.group(2).replace("\n", "").strip() - return f'[{alt_text}]({url})' + return f"[{alt_text}]({url})" def replace_image_link(match): # Strip leading and trailing whitespace or newlines from the link URL and text alt_text = match.group(1).replace("\n", " ").strip() url = match.group(2).replace("\n", "").strip() - return f'![{alt_text}]({url})' + return f"![{alt_text}]({url})" # Replace all matched links with the single-line version markdown = re.sub(link_pattern, replace_link, markdown) return re.sub(image_link_pattern, replace_image_link, markdown) + def process_markdown_formatting(md_text): # Regular expression to find multi-line formatting for bold, emphasis, and combined bold and emphasis - pattern = r'(\*\*\*([^\*]+)\*\*\*|\*\*([^\*]+)\*\*|\*([^\*]+)\*)' + pattern = r"(\*\*\*([^\*]+)\*\*\*|\*\*([^\*]+)\*\*|\*([^\*]+)\*)" def replace_match(match): # Match for combined bold and emphasis (***text***) - if match.group(1).startswith('***'): - content = match.group(2).replace('\n', ' ').strip() + if match.group(1).startswith("***"): + content = match.group(2).replace("\n", " ").strip() return f"***{content}***" - + # Match for bold (**text**) - elif match.group(1).startswith('**'): - content = match.group(3).replace('\n', ' ').strip() + if match.group(1).startswith("**"): + content = match.group(3).replace("\n", " ").strip() return f"**{content}**" - + # Match for emphasis (*text*) - elif match.group(1).startswith('*'): - content = match.group(4).replace('\n', ' ').strip() + if match.group(1).startswith("*"): + content = match.group(4).replace("\n", " ").strip() return f"*{content}*" - + return match.group(0) # Return the original text if no match # Apply the regex pattern to replace formatting spans - processed_text = re.sub(pattern, replace_match, md_text) + return re.sub(pattern, replace_match, md_text) + - return processed_text - def process_links(line): """Replace Markdown links with only their display text.""" line = re.sub(r"!\[.*?\]\(.*?\)", "", line) return re.sub(r"\[(.*?)\]\((.*?)\)", r"\1", line) + def process_parameters(line): - return re.sub(r"^\*\*([a-z0-9_]*)\*\*=\*([a-z]*)\*( \*\*\[required\]\*\*)?", r'.IP "**\1**=*\2*\3" 4m', line, flags=re.MULTILINE) + return re.sub( + r"^\*\*([a-z0-9_]*)\*\*=\*([a-z]*)\*( \*\*\[required\]\*\*)?", + r'.IP "**\1**=*\2*\3" 4m', + line, + flags=re.MULTILINE, + ) + def process_flags(line): return re.sub(r"^\*\*-(.*?)\*\*", r'.IP "**-\1**" 4m', line, flags=re.MULTILINE) - + + def process_formatting(line): """Apply inline formatting for bold, italic, and bold+italic.""" line = re.sub(r"\*\*\*(.+?)\*\*\*", r"\\fB\\fI\1\\fR", line) # Bold+Italic line = re.sub(r"\*\*(.+?)\*\*", r"\\fB\1\\fR", line) # Bold line = re.sub(r"\*(.+?)\*", r"\\fI\1\\fR", line) # Italic - line = line.replace("\u00A0", " ") # Replace non-breaking spaces with indent - - return line - + return line.replace("\u00a0", " ") # Replace non-breaking spaces with indent + + def process_br(line): return re.sub(r"([^\n\s]) $", r"\1\n.br", line) + def process_headings(markdown): def convert_sh(match): return f".SH {match.group(1).upper()}\n" @@ -173,124 +211,128 @@ def convert_ss(match): return f".SS {match.group(1)}\n" markdown = re.sub(r"^#{1,2} (.*)", convert_sh, markdown, flags=re.MULTILINE) - markdown = re.sub(r"^#{3,} (.*)", convert_ss, markdown, flags=re.MULTILINE) - return markdown + return re.sub(r"^#{3,} (.*)", convert_ss, markdown, flags=re.MULTILINE) + def process_code(markdown): - markdown = re.sub(r"\\", "\(rs", markdown) - return markdown + return re.sub(r"\\", r"\(rs", markdown) + def process_lists(markdown): - markdown = re.sub(r"^(\d+)\.\s+(.*)", r'.IP \\fB\1\\fR\n\2\n', markdown, flags=re.MULTILINE) - return re.sub(r"^(\d+)\.\s+(.*)", r'.IP \\fB\1\\fR\n\2\n', markdown, flags=re.MULTILINE) + markdown = re.sub( + r"^(\d+)\.\s+(.*)", r".IP \\fB\1\\fR\n\2\n", markdown, flags=re.MULTILINE + ) + return re.sub( + r"^(\d+)\.\s+(.*)", r".IP \\fB\1\\fR\n\2\n", markdown, flags=re.MULTILINE + ) + def process_non_code(markdown): markdown_text = process_parameters(markdown) markdown_text = process_flags(markdown_text) markdown_text = markdown_text.replace("    ", "") - + markdown_text = re.sub(r"\\#", "#", markdown_text) markdown_text = re.sub(r"\\>", ">", markdown_text) markdown_text = re.sub(r"\\<", "<", markdown_text) - markdown_text = re.sub(r"\\", "\(rs", markdown_text) + markdown_text = re.sub(r"\\", r"\(rs", markdown_text) + markdown_text = re.sub(r"(?<=\S) {2,}(?=\S)", " ", markdown_text) markdown_text = re.sub(r"(?<=\S) {2,}(?=\S)", " ", markdown_text) - markdown_text = re.sub(r'(?<=\S) {2,}(?=\S)', ' ', markdown_text) markdown_text = process_formatting(markdown_text) markdown_text = process_links(markdown_text) - markdown_text = process_headings(markdown_text) - - return markdown_text - + return process_headings(markdown_text) + + def convert_line(line, in_paragraph, in_code_block): """Convert a single line of Markdown to man page format, handling paragraph continuity.""" if line.startswith("```"): if in_code_block: return "\\fR\n.fi\n", False, False # End code block - else: - return ".nf\n\\fC\n", False, True # Start code block with proper indent - + return ".nf\n\\fC\n", False, True # Start code block with proper indent + if in_code_block: return f"{line}\n", False, True # Keep all whitespace in code blocks - + if not line.strip(): return "", False, False # Empty line resets paragraph state - #line = process_headings(line) + # line = process_headings(line) line = process_br(line) - + line = line.replace("`", "") if re.match(r"^[-*] (.+)", line): - return f".IP \"{re.sub(r'^[-*] ', '', line)}\" 4m\n.br\n", False, False - #if re.match(r"^\d+\. (.+)", line): - #return re.sub(r"^(\d+)\.\s+(.*)", r'.IP \\fB\1\\fR\n\2\n', line, flags=re.MULTILINE), False, False + return f'.IP "{re.sub(r"^[-*] ", "", line)}" 4m\n.br\n', False, False + # if re.match(r"^\d+\. (.+)", line): + # return re.sub(r"^(\d+)\.\s+(.*)", r'.IP \\fB\1\\fR\n\2\n', line, flags=re.MULTILINE), False, False if in_paragraph: return line + "\n", True, False - else: - return f".PP\n{line}\n", True, False + return f".PP\n{line}\n", True, False def markdown_to_man(markdown_text): """Convert a Markdown text to a Unix man page format""" markdown_text = strip_yaml_from_markdown(markdown_text) + markdown_text = simplify_synopsis(markdown_text) blocks = parse_markdown(markdown_text) result = [] for block in blocks: if block["type"] == "code": result.append(process_code(block["markdown"])) - elif block["type"] == "list": - result.append(process_lists(block["markdown"])) - elif block["type"] == "olist": + elif block["type"] == "list" or block["type"] == "olist": result.append(process_lists(block["markdown"])) else: result.append(process_non_code(block["markdown"])) markdown_text = "\n".join(result) - #markdown_text = process_parameters(markdown_text) - #markdown_text = process_flags(markdown_text) - #markdown_text = markdown_text.replace("    ", "") - - #markdown_text = re.sub(r"\\#", "#", markdown_text) - #markdown_text = re.sub(r"\\>", ">", markdown_text) - #markdown_text = re.sub(r"\\<", "<", markdown_text) - #markdown_text = re.sub(r"\\", "\(rs", markdown_text) - #markdown_text = process_formatting(markdown_text) - #markdown_text = process_links(markdown_text) - + # markdown_text = process_parameters(markdown_text) + # markdown_text = process_flags(markdown_text) + # markdown_text = markdown_text.replace("    ", "") + + # markdown_text = re.sub(r"\\#", "#", markdown_text) + # markdown_text = re.sub(r"\\>", ">", markdown_text) + # markdown_text = re.sub(r"\\<", "<", markdown_text) + # markdown_text = re.sub(r"\\", "\(rs", markdown_text) + # markdown_text = process_formatting(markdown_text) + # markdown_text = process_links(markdown_text) + lines = markdown_text.splitlines() - man_page = [".TH MAN 1 \"Manual\"\n"] + man_page = ['.TH MAN 1 "Manual"\n'] in_paragraph = False in_code_block = False print(markdown_text) - + for line in lines: - converted_line, in_paragraph, in_code_block = convert_line(line, in_paragraph, in_code_block) + converted_line, in_paragraph, in_code_block = convert_line( + line, in_paragraph, in_code_block + ) man_page.append(converted_line) - + if in_code_block: man_page.append(".fi\n") # Ensure proper closure of code block - + return "".join(man_page) + def convert_markdown_to_man(input_file, output_file): """Read Markdown file and convert to man page.""" - with open(input_file, "r") as f: - markdown_text = f.read() - + markdown_text = Path(input_file).read_text() + man_text = markdown_to_man(markdown_text) - - with open(output_file, "w") as f: - f.write(man_text) - + + Path(output_file).write_text(man_text) + print(f"Man page generated: {output_file}") + def main(): parser = argparse.ArgumentParser(description="Convert Markdown to Unix man page.") parser.add_argument("input_file", help="Path to the input Markdown file.") parser.add_argument("output_file", help="Path to the output man page file.") args = parser.parse_args() - + convert_markdown_to_man(args.input_file, args.output_file) + if __name__ == "__main__": main() From 5d4840f4a90f476fb45b9fb86192752954773958 Mon Sep 17 00:00:00 2001 From: Anna Petrasova Date: Mon, 28 Jul 2025 06:26:03 -0400 Subject: [PATCH 09/11] fix tables and lists --- utils/markdown2man.py | 104 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 90 insertions(+), 14 deletions(-) diff --git a/utils/markdown2man.py b/utils/markdown2man.py index 4fba94b81de..a5779ba2d10 100644 --- a/utils/markdown2man.py +++ b/utils/markdown2man.py @@ -61,12 +61,6 @@ def parse_markdown(content): ) continue - if line.endswith(" "): - buffer += line # Keep trailing spaces for markdown line breaks - processing_block.append(buffer) - buffer = "" - continue - if line.strip().startswith("- ") or line.startswith("* "): if buffer: processing_block.append(buffer) @@ -98,6 +92,19 @@ def parse_markdown(content): processing_block.append(line) continue + if line.strip().startswith("|") and line.strip().endswith("|"): + if buffer: + processing_block.append(buffer) + buffer = "" + processing_block.append(line) + continue + + if line.endswith(" "): + buffer += line # Keep trailing spaces for markdown line breaks + processing_block.append(buffer) + buffer = "" + continue + # empty line at the start and end of code, list blocks if line == "": if buffer: @@ -112,7 +119,7 @@ def parse_markdown(content): processing_block.append(line) continue - buffer += line + " " + buffer += line.lstrip() + " " if buffer: processing_block.append(buffer) @@ -120,7 +127,9 @@ def parse_markdown(content): processed_content.append( {"markdown": "\n".join(processing_block), "type": state} ) - + for each in processed_content: + print(each["type"], "\n") + print(each["markdown"]) return processed_content @@ -180,7 +189,7 @@ def process_links(line): def process_parameters(line): return re.sub( - r"^\*\*([a-z0-9_]*)\*\*=\*([a-z]*)\*( \*\*\[required\]\*\*)?", + r"^\*\*([a-z0-9_]*)\*\*=\*([a-z,]*)\*( \*\*\[required\]\*\*)?", r'.IP "**\1**=*\2*\3" 4m', line, flags=re.MULTILINE, @@ -219,9 +228,33 @@ def process_code(markdown): def process_lists(markdown): - markdown = re.sub( - r"^(\d+)\.\s+(.*)", r".IP \\fB\1\\fR\n\2\n", markdown, flags=re.MULTILINE + # Match list items and their continuation lines + pattern = re.compile( + r""" + ^[ \t]*[-*+] # Bullet list marker at start of line + [ \t]+ # Space after bullet + .+ # First line of text + (?:\n(?![ \t]*[-*+] ) # Continuation lines NOT starting with another bullet + [^\n]+)* # Non-blank continuation lines + """, + re.MULTILINE | re.VERBOSE, ) + + matches = pattern.findall(markdown) + if not matches: + return markdown # nothing to convert + + # Build roff output + lines = [".RS 4n"] + for item in matches: + # Clean bullet + item_text = re.sub(r"^[ \t]*[-*][ \t]+", "", item, count=1) + lines.extend((".IP \\(bu 4n", item_text.strip())) + lines.append(".RE") + return "\n".join(lines) + + +def process_olists(markdown): return re.sub( r"^(\d+)\.\s+(.*)", r".IP \\fB\1\\fR\n\2\n", markdown, flags=re.MULTILINE ) @@ -231,7 +264,8 @@ def process_non_code(markdown): markdown_text = process_parameters(markdown) markdown_text = process_flags(markdown_text) markdown_text = markdown_text.replace("    ", "") - + # print(markdown_text) + markdown_text = process_table(markdown_text) markdown_text = re.sub(r"\\#", "#", markdown_text) markdown_text = re.sub(r"\\>", ">", markdown_text) markdown_text = re.sub(r"\\<", "<", markdown_text) @@ -243,6 +277,46 @@ def process_non_code(markdown): return process_headings(markdown_text) +def process_table(markdown: str) -> str: + def markdown_to_roff_table(md_table: str) -> str: + lines = md_table.strip().splitlines() + if len(lines) < 2: + return md_table # not a valid table + + # Remove divider line (2nd line) + header = lines[0].strip("|").split("|") + rows = [line.strip("|").split("|") for line in lines[2:]] + + # Trim spaces in cells + header = [cell.strip() for cell in header] + header = [f"**{cell.strip()}**" for cell in header] + rows = [[cell.strip() for cell in row] for row in rows] + + # Generate column format line (left aligned) + format_line = " ".join(["l"] * len(header)) + "." + # Build the roff table + lines = [".TS", "tab(|);", format_line, "|".join(header)] + for row in rows: + lines.append("|".join(row)) + lines.extend((".TE", ".PP")) + + return "\n".join(lines) + + markdown_table_pattern = re.compile( + r""" + ( # full table match + ^\|.*\|\s*\n # header row: starts and ends with | + ^\|[:\-| ]+\|\s*\n # divider row: like | --- | :--: | + (?:^\|.*\|\s*\n?)+ # one or more data rows + ) + """, + re.MULTILINE | re.VERBOSE, + ) + return markdown_table_pattern.sub( + lambda match: markdown_to_roff_table(match.group(0)), markdown + ) + + def convert_line(line, in_paragraph, in_code_block): """Convert a single line of Markdown to man page format, handling paragraph continuity.""" if line.startswith("```"): @@ -279,8 +353,10 @@ def markdown_to_man(markdown_text): for block in blocks: if block["type"] == "code": result.append(process_code(block["markdown"])) - elif block["type"] == "list" or block["type"] == "olist": + elif block["type"] == "list": result.append(process_lists(block["markdown"])) + elif block["type"] == "olist": + result.append(process_olists(block["markdown"])) else: result.append(process_non_code(block["markdown"])) markdown_text = "\n".join(result) @@ -300,7 +376,7 @@ def markdown_to_man(markdown_text): in_paragraph = False in_code_block = False - print(markdown_text) + # print(markdown_text) for line in lines: converted_line, in_paragraph, in_code_block = convert_line( From 1b7b12db7e055ced70b8bd03f70679615d90af81 Mon Sep 17 00:00:00 2001 From: Anna Petrasova Date: Wed, 30 Jul 2025 05:21:50 -0400 Subject: [PATCH 10/11] continued fixes --- utils/markdown2man.py | 280 ++++++++++++++++++++++++++---------------- 1 file changed, 174 insertions(+), 106 deletions(-) diff --git a/utils/markdown2man.py b/utils/markdown2man.py index a5779ba2d10..c2b24fc3d78 100644 --- a/utils/markdown2man.py +++ b/utils/markdown2man.py @@ -1,26 +1,49 @@ import argparse -import re +import os from pathlib import Path +import re +import textwrap -def strip_yaml_from_markdown(content): +def strip_yaml_from_markdown_and_reformat(content): # Remove YAML front matter - return re.sub(r"^---\n.*?\n---\n", "", content, flags=re.DOTALL).strip() + match = re.match(r"^---\n(.*?)\n---\n", content, re.DOTALL) + if not match: + return {}, content.strip() + + yaml_block = match.group(1) + markdown = content[match.end() :].strip() + + yaml = {} + for line in yaml_block.splitlines(): + key, value = line.strip().split(":") + key = key.strip() + value = value.strip() + if value.startswith("[") and value.endswith("]"): + yaml[key] = [v.strip() for v in value[1:-1].split(",")] + else: + yaml[key] = value + split_string = '=== "Command line"' + before, after = markdown.split(split_string, 1) -def simplify_synopsis(content): - before, after = re.split( - r"(?=^## DESCRIPTION\b)", content, maxsplit=1, flags=re.MULTILINE - ) + before = f""" - # Remove Python sections (both command and parameters) - text = re.sub( - r'^=== "Python .*?"\n(?:\n| {4}.*\n)*', "", before, flags=re.MULTILINE - ) - text = re.sub(r'^=== "Command line"\n', "", text, flags=re.MULTILINE) - # Unindent remaining lines by 4 spaces (if they start with at least 4 spaces) - text = re.sub(r"^ {4}", "", text, flags=re.MULTILINE) - return text + after + # NAME + + {yaml["name"]} - {yaml["description"]} + + # KEYWORDS + + {", ".join(yaml["keywords"])} + + # SYNOPSIS + + """ + + markdown = before + after + markdown = markdown.replace("## Parameters", "### Parameters") + return yaml, markdown def parse_markdown(content): @@ -187,17 +210,30 @@ def process_links(line): return re.sub(r"\[(.*?)\]\((.*?)\)", r"\1", line) -def process_parameters(line): - return re.sub( +def process_parameters(markdown): + parts = markdown.split("## DESCRIPTION", 1) + if len(parts) == 1: + return markdown + before, after = parts + before_processed = re.sub( r"^\*\*([a-z0-9_]*)\*\*=\*([a-z,]*)\*( \*\*\[required\]\*\*)?", r'.IP "**\1**=*\2*\3" 4m', - line, + before, flags=re.MULTILINE, ) + return before_processed + "## DESCRIPTION" + after -def process_flags(line): - return re.sub(r"^\*\*-(.*?)\*\*", r'.IP "**-\1**" 4m', line, flags=re.MULTILINE) +def process_flags(markdown): + parts = markdown.split("## DESCRIPTION", 1) + if len(parts) == 1: + return markdown + + before, after = parts + before_processed = re.sub( + r"^\*\*-(.*?)\*\*", r'.IP "**-\1**" 4m', before, flags=re.MULTILINE + ) + return before_processed + "## DESCRIPTION" + after def process_formatting(line): @@ -223,11 +259,56 @@ def convert_ss(match): return re.sub(r"^#{3,} (.*)", convert_ss, markdown, flags=re.MULTILINE) +def remove_python_content_blocks(markdown): + pattern = re.compile( + r""" + ^===\s*"Python[^\n]*"\n # Match the Python block title + (?: + (?![!-~]) # As long as the line does NOT start with ascii + .*\n # Match that whole line + )* + """, + re.MULTILINE | re.VERBOSE, + ) + return pattern.sub("", markdown) + + +def unindent_command_content_blocks(markdown): + pattern = re.compile( + r""" + ^===\s*"Command[^\n]*\n # Match the Python block title + ( # Capture group for the content block + (?:\n?(?![!-~]).*)* # Match all lines NOT starting with visible ASCII (indented) + ) + """, + re.MULTILINE | re.VERBOSE, + ) + + def unindent_block(match): + return textwrap.dedent(match.group(1)) + + markdown = pattern.sub(unindent_block, markdown) + return re.sub(r'^=== "Command line"\n', "", markdown, flags=re.MULTILINE) + + def process_code(markdown): - return re.sub(r"\\", r"\(rs", markdown) + markdown = re.sub(r"\\", r"\(rs", markdown) + + pattern = re.compile(r"(?m)^( *)```(?:\w+)?\n(.*?)(?<=\n)\1```", re.DOTALL) + + def repl(match): + code = match.group(2) + dedented = textwrap.dedent(code).rstrip() + return f".PP\n.nf\n\\fC\n{dedented}\n\\fR\n.fi" + + return pattern.sub(repl, markdown) -def process_lists(markdown): +def remove_comments(markdown): + return re.sub(r"", "", markdown, flags=re.DOTALL) + + +def process_list(markdown): # Match list items and their continuation lines pattern = re.compile( r""" @@ -254,27 +335,52 @@ def process_lists(markdown): return "\n".join(lines) -def process_olists(markdown): - return re.sub( - r"^(\d+)\.\s+(.*)", r".IP \\fB\1\\fR\n\2\n", markdown, flags=re.MULTILINE +def process_olist(markdown): + # Match list items and their continuation lines + pattern = re.compile( + r""" + ^[ \t]* # Optional leading whitespace + (\d+)[.)] # Ordered list number (capture this) + [ \t]+ # At least one space + .+ # First line of list item (non-greedy) + (?:\n # Followed by continuation lines + (?![ \t]*\d+[.)][ \t]) # Not starting with another list number + [^\n]+ # Continuation line content + )* # More content lines + """, + re.MULTILINE | re.VERBOSE, ) + matches = list(pattern.finditer(markdown)) + if not matches: + return markdown # nothing to convert + + lines = [".RS 4n"] + for match in matches: + number = match.group(1) + full_item = match.group(0) + # Remove the number and punctuation from the beginning + item_text = re.sub(r"^[ \t]*\d+[.)][ \t]+", "", full_item, count=1) + lines.append(f'.IP "{number}." 4n\n{item_text.strip()}') + lines.append(".RE") + return "\n".join(lines) + def process_non_code(markdown): - markdown_text = process_parameters(markdown) - markdown_text = process_flags(markdown_text) - markdown_text = markdown_text.replace("    ", "") - # print(markdown_text) - markdown_text = process_table(markdown_text) - markdown_text = re.sub(r"\\#", "#", markdown_text) - markdown_text = re.sub(r"\\>", ">", markdown_text) - markdown_text = re.sub(r"\\<", "<", markdown_text) - markdown_text = re.sub(r"\\", r"\(rs", markdown_text) - markdown_text = re.sub(r"(?<=\S) {2,}(?=\S)", " ", markdown_text) - markdown_text = re.sub(r"(?<=\S) {2,}(?=\S)", " ", markdown_text) - markdown_text = process_formatting(markdown_text) - markdown_text = process_links(markdown_text) - return process_headings(markdown_text) + markdown = process_table(markdown) + markdown = markdown.replace(r"`", "") + markdown = re.sub(r"\\#", "#", markdown) + markdown = re.sub(r"\\\[", "[", markdown) + markdown = re.sub(r"\\\]", "]", markdown) + markdown = re.sub(r"\\>", ">", markdown) + markdown = re.sub(r"\\<", "<", markdown) + markdown = re.sub(r"\\", r"\(rs", markdown) + markdown = re.sub(r"(?<=\S) {2,}(?=\S)", " ", markdown) + markdown = re.sub(r"(?<=\S) {2,}(?=\S)", " ", markdown) + markdown = process_formatting(markdown) + markdown = process_links(markdown) + markdown = process_headings(markdown) + return re.sub(r"([^\n\s]) $", r"\1\n.br", markdown, flags=re.MULTILINE) def process_table(markdown: str) -> str: @@ -317,88 +423,48 @@ def markdown_to_roff_table(md_table: str) -> str: ) -def convert_line(line, in_paragraph, in_code_block): - """Convert a single line of Markdown to man page format, handling paragraph continuity.""" - if line.startswith("```"): - if in_code_block: - return "\\fR\n.fi\n", False, False # End code block - return ".nf\n\\fC\n", False, True # Start code block with proper indent - - if in_code_block: - return f"{line}\n", False, True # Keep all whitespace in code blocks - - if not line.strip(): - return "", False, False # Empty line resets paragraph state - - # line = process_headings(line) - line = process_br(line) - - line = line.replace("`", "") - - if re.match(r"^[-*] (.+)", line): - return f'.IP "{re.sub(r"^[-*] ", "", line)}" 4m\n.br\n', False, False - # if re.match(r"^\d+\. (.+)", line): - # return re.sub(r"^(\d+)\.\s+(.*)", r'.IP \\fB\1\\fR\n\2\n', line, flags=re.MULTILINE), False, False - if in_paragraph: - return line + "\n", True, False - return f".PP\n{line}\n", True, False +def add_paragraphs(markdown): + return re.sub( + r"(?m)(?:^|\n)([^\n\S]*[^\n]+(?:\n[^\n\S]*[^\n]+)*)", + lambda m: f"\n.PP\n{m.group(1)}", + markdown, + ).strip() def markdown_to_man(markdown_text): """Convert a Markdown text to a Unix man page format""" - markdown_text = strip_yaml_from_markdown(markdown_text) - markdown_text = simplify_synopsis(markdown_text) + yaml, markdown_text = strip_yaml_from_markdown_and_reformat(markdown_text) + markdown_text = remove_python_content_blocks(markdown_text) + markdown_text = unindent_command_content_blocks(markdown_text) + markdown_text = remove_comments(markdown_text) + # process synopsis + markdown_text = process_parameters(markdown_text) + markdown_text = process_flags(markdown_text) + markdown_text = markdown_text.replace("    ", "") + blocks = parse_markdown(markdown_text) result = [] for block in blocks: if block["type"] == "code": result.append(process_code(block["markdown"])) elif block["type"] == "list": - result.append(process_lists(block["markdown"])) + text = process_non_code(block["markdown"]) + result.append(process_list(text)) elif block["type"] == "olist": - result.append(process_olists(block["markdown"])) + text = process_non_code(block["markdown"]) + result.append(process_olist(text)) else: result.append(process_non_code(block["markdown"])) markdown_text = "\n".join(result) - # markdown_text = process_parameters(markdown_text) - # markdown_text = process_flags(markdown_text) - # markdown_text = markdown_text.replace("    ", "") - - # markdown_text = re.sub(r"\\#", "#", markdown_text) - # markdown_text = re.sub(r"\\>", ">", markdown_text) - # markdown_text = re.sub(r"\\<", "<", markdown_text) - # markdown_text = re.sub(r"\\", "\(rs", markdown_text) - # markdown_text = process_formatting(markdown_text) - # markdown_text = process_links(markdown_text) - - lines = markdown_text.splitlines() - man_page = ['.TH MAN 1 "Manual"\n'] - in_paragraph = False - in_code_block = False - - # print(markdown_text) - - for line in lines: - converted_line, in_paragraph, in_code_block = convert_line( - line, in_paragraph, in_code_block - ) - man_page.append(converted_line) + markdown_text = add_paragraphs(markdown_text) - if in_code_block: - man_page.append(".fi\n") # Ensure proper closure of code block - - return "".join(man_page) - - -def convert_markdown_to_man(input_file, output_file): - """Read Markdown file and convert to man page.""" - markdown_text = Path(input_file).read_text() - - man_text = markdown_to_man(markdown_text) - - Path(output_file).write_text(man_text) + version = os.environ.get("VERSION_NUMBER", "") + man_page = ( + f'.TH {yaml.get("name", "MAN")} 1 "" "GRASS {version}" "GRASS User\'s Manual"\n' + ) + man_page += markdown_text - print(f"Man page generated: {output_file}") + return man_page def main(): @@ -407,7 +473,9 @@ def main(): parser.add_argument("output_file", help="Path to the output man page file.") args = parser.parse_args() - convert_markdown_to_man(args.input_file, args.output_file) + markdown_text = Path(args.input_file).read_text() + man_text = markdown_to_man(markdown_text) + Path(args.output_file).write_text(man_text) if __name__ == "__main__": From 8c03213a8696c41a36c7b7a538a1ca541ce5604b Mon Sep 17 00:00:00 2001 From: Anna Petrasova Date: Thu, 31 Jul 2025 09:14:58 -0400 Subject: [PATCH 11/11] fix --- utils/markdown2man.py | 270 +++++++++++++++--------------------------- 1 file changed, 97 insertions(+), 173 deletions(-) diff --git a/utils/markdown2man.py b/utils/markdown2man.py index c2b24fc3d78..b3a82b6498c 100644 --- a/utils/markdown2man.py +++ b/utils/markdown2man.py @@ -1,3 +1,18 @@ +#!/usr/bin/env python3 + +############################################################################### +# Convert manual pages from markdown to MAN format +# +# Author(s): Anna Petrasova +# +# COPYRIGHT: (C) 2025 by the GRASS Development Team +# +# This program is free software under the GNU General Public +# License (>=v2). Read the file COPYING that comes with GRASS +# for details. +# +############################################################################### + import argparse import os from pathlib import Path @@ -29,19 +44,19 @@ def strip_yaml_from_markdown_and_reformat(content): before = f""" - # NAME +# NAME - {yaml["name"]} - {yaml["description"]} +{yaml["name"]} - {yaml["description"]} - # KEYWORDS +# KEYWORDS - {", ".join(yaml["keywords"])} +{", ".join(yaml["keywords"])} - # SYNOPSIS +# SYNOPSIS """ - markdown = before + after + markdown = before + after.strip() markdown = markdown.replace("## Parameters", "### Parameters") return yaml, markdown @@ -53,7 +68,6 @@ def parse_markdown(content): buffer = "" state = "default" - code_indent = 0 for line in lines: if line.strip().startswith("```"): @@ -67,53 +81,29 @@ def parse_markdown(content): state = "default" # start of code block else: - code_indent = len(re.match(r" *", line).group()) - if buffer: - processing_block.append(buffer) - buffer = "" processed_content.append( {"markdown": "\n".join(processing_block), "type": state} ) - processing_block = [line] + processing_block = [] + processing_block.append(line) state = "code" continue if state == "code": - processing_block.append( - line[code_indent:] if line.startswith(" " * code_indent) else line - ) - continue - - if line.strip().startswith("- ") or line.startswith("* "): - if buffer: - processing_block.append(buffer) - buffer = "" - - # start of list - if state != "list": - processed_content.append( - {"markdown": "\n".join(processing_block), "type": state} - ) - processing_block = [] - state = "list" - processing_block.append(line) continue - if re.match(r"^\d+\.", line.strip()): + if re.match(r"^(\s*)([-*]|\d+\.)\s+(.*)", line.strip()): if buffer: processing_block.append(buffer) buffer = "" # start of ordered list - if state != "olist": + if state != "list": processed_content.append( {"markdown": "\n".join(processing_block), "type": state} ) processing_block = [] - state = "olist" - - processing_block.append(line) - continue + state = "list" if line.strip().startswith("|") and line.strip().endswith("|"): if buffer: @@ -122,12 +112,6 @@ def parse_markdown(content): processing_block.append(line) continue - if line.endswith(" "): - buffer += line # Keep trailing spaces for markdown line breaks - processing_block.append(buffer) - buffer = "" - continue - # empty line at the start and end of code, list blocks if line == "": if buffer: @@ -142,7 +126,16 @@ def parse_markdown(content): processing_block.append(line) continue - buffer += line.lstrip() + " " + if buffer: + buffer += " " + line + elif state == "list": + buffer += line + else: + buffer += line.lstrip() + + if line.endswith(" "): + processing_block.append(buffer) + buffer = "" if buffer: processing_block.append(buffer) @@ -150,64 +143,14 @@ def parse_markdown(content): processed_content.append( {"markdown": "\n".join(processing_block), "type": state} ) - for each in processed_content: - print(each["type"], "\n") - print(each["markdown"]) - return processed_content - - -def process_multiline_links(markdown): - # Regular expression to match links that may span multiple lines - link_pattern = re.compile(r"\[([^\]]*)\]\(([^)]+)\)", re.DOTALL) - image_link_pattern = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)", re.DOTALL) - def replace_link(match): - # Strip leading and trailing whitespace or newlines from the link URL and text - alt_text = match.group(1).replace("\n", " ").strip() - url = match.group(2).replace("\n", "").strip() - return f"[{alt_text}]({url})" - - def replace_image_link(match): - # Strip leading and trailing whitespace or newlines from the link URL and text - alt_text = match.group(1).replace("\n", " ").strip() - url = match.group(2).replace("\n", "").strip() - return f"![{alt_text}]({url})" - - # Replace all matched links with the single-line version - markdown = re.sub(link_pattern, replace_link, markdown) - return re.sub(image_link_pattern, replace_image_link, markdown) - - -def process_markdown_formatting(md_text): - # Regular expression to find multi-line formatting for bold, emphasis, and combined bold and emphasis - pattern = r"(\*\*\*([^\*]+)\*\*\*|\*\*([^\*]+)\*\*|\*([^\*]+)\*)" - - def replace_match(match): - # Match for combined bold and emphasis (***text***) - if match.group(1).startswith("***"): - content = match.group(2).replace("\n", " ").strip() - return f"***{content}***" - - # Match for bold (**text**) - if match.group(1).startswith("**"): - content = match.group(3).replace("\n", " ").strip() - return f"**{content}**" - - # Match for emphasis (*text*) - if match.group(1).startswith("*"): - content = match.group(4).replace("\n", " ").strip() - return f"*{content}*" - - return match.group(0) # Return the original text if no match - - # Apply the regex pattern to replace formatting spans - return re.sub(pattern, replace_match, md_text) + return processed_content -def process_links(line): +def process_links(markdown): """Replace Markdown links with only their display text.""" - line = re.sub(r"!\[.*?\]\(.*?\)", "", line) - return re.sub(r"\[(.*?)\]\((.*?)\)", r"\1", line) + markdown = re.sub(r"!\[.*?\]\(.*?\)", "", markdown) + return re.sub(r"\[(.*?)\]\((.*?)\)", r"\1", markdown) def process_parameters(markdown): @@ -216,7 +159,7 @@ def process_parameters(markdown): return markdown before, after = parts before_processed = re.sub( - r"^\*\*([a-z0-9_]*)\*\*=\*([a-z,]*)\*( \*\*\[required\]\*\*)?", + r"^\*\*([a-z0-9_]*)\*\*=\*([a-zA-Z,_ ]*)\*( \*\*\[required\]\*\*)?", r'.IP "**\1**=*\2*\3" 4m', before, flags=re.MULTILINE, @@ -236,16 +179,12 @@ def process_flags(markdown): return before_processed + "## DESCRIPTION" + after -def process_formatting(line): +def process_formatting(markdown): """Apply inline formatting for bold, italic, and bold+italic.""" - line = re.sub(r"\*\*\*(.+?)\*\*\*", r"\\fB\\fI\1\\fR", line) # Bold+Italic - line = re.sub(r"\*\*(.+?)\*\*", r"\\fB\1\\fR", line) # Bold - line = re.sub(r"\*(.+?)\*", r"\\fI\1\\fR", line) # Italic - return line.replace("\u00a0", " ") # Replace non-breaking spaces with indent - - -def process_br(line): - return re.sub(r"([^\n\s]) $", r"\1\n.br", line) + markdown = re.sub(r"\*\*\*(.+?)\*\*\*", r"\\fB\\fI\1\\fR", markdown) + markdown = re.sub(r"\*\*(.+?)\*\*", r"\\fB\1\\fR", markdown) + # avoid detecting \* + return re.sub(r"(? level: + output.append(".RE") # Close previous indentation level + indent_levels.pop() + + if not indent_levels or indent_levels[-1] < level: + output.append(".RS 4n") # Open new indentation level + indent_levels.append(level) - matches = list(pattern.finditer(markdown)) - if not matches: - return markdown # nothing to convert + if re.match(r"^\d+\.$", bullet): # Numbered list + output.append(f'.IP "{bullet}" 4n\n{item_text}') + else: # Bullet list + output.append(".IP \\(bu 4n\n" + item_text) - lines = [".RS 4n"] - for match in matches: - number = match.group(1) - full_item = match.group(0) - # Remove the number and punctuation from the beginning - item_text = re.sub(r"^[ \t]*\d+[.)][ \t]+", "", full_item, count=1) - lines.append(f'.IP "{number}." 4n\n{item_text.strip()}') - lines.append(".RE") - return "\n".join(lines) + # Close any remaining indentation levels + while indent_levels: + output.append(".RE") + indent_levels.pop() + return "\n".join(output) -def process_non_code(markdown): + +def process_special_characters(markdown): + markdown = markdown.replace(r"\[", "[") + markdown = markdown.replace(r"\]", "]") + markdown = markdown.replace(r"\#", "#") + markdown = markdown.replace(r"\>", ">") + markdown = markdown.replace(r"\<", "<") + markdown = markdown.replace(r"\*", "*") + markdown = markdown.replace("`", "") + # eliminate extra spaces between words + return re.sub(r"(?<=\S) {2,}(?=\S)", " ", markdown) + + +def process_br(markdown): + return re.sub(r"([^\n\s]) $", r"\1\n.br", markdown, flags=re.MULTILINE) + + +def process_default(markdown): markdown = process_table(markdown) - markdown = markdown.replace(r"`", "") - markdown = re.sub(r"\\#", "#", markdown) - markdown = re.sub(r"\\\[", "[", markdown) - markdown = re.sub(r"\\\]", "]", markdown) - markdown = re.sub(r"\\>", ">", markdown) - markdown = re.sub(r"\\<", "<", markdown) - markdown = re.sub(r"\\", r"\(rs", markdown) - markdown = re.sub(r"(?<=\S) {2,}(?=\S)", " ", markdown) - markdown = re.sub(r"(?<=\S) {2,}(?=\S)", " ", markdown) markdown = process_formatting(markdown) + markdown = process_special_characters(markdown) markdown = process_links(markdown) markdown = process_headings(markdown) - return re.sub(r"([^\n\s]) $", r"\1\n.br", markdown, flags=re.MULTILINE) + return process_br(markdown) def process_table(markdown: str) -> str: @@ -448,13 +376,9 @@ def markdown_to_man(markdown_text): if block["type"] == "code": result.append(process_code(block["markdown"])) elif block["type"] == "list": - text = process_non_code(block["markdown"]) - result.append(process_list(text)) - elif block["type"] == "olist": - text = process_non_code(block["markdown"]) - result.append(process_olist(text)) + result.append(process_list(block["markdown"])) else: - result.append(process_non_code(block["markdown"])) + result.append(process_default(block["markdown"])) markdown_text = "\n".join(result) markdown_text = add_paragraphs(markdown_text)