增加集体通信文档命令audit#8
Conversation
There was a problem hiding this comment.
Code Review
This pull request introduces a new Python script tools/doc_command_audit.py and its corresponding unit tests to audit shell script references in README.md against the repository's files. The reviewer provided valuable feedback, suggesting to filter out external SDK scripts and deduplicate references to avoid false positives and redundant I/O, to handle missing README files gracefully and return a non-zero exit code on audit failures for CI integration, and to update the unit tests to cover these new scenarios.
Important
The consumer version of Gemini Code Assist on GitHub is being sunset. Starting June 18, 2026, new organization installations will be blocked, and all code review activity will officially cease on July 17, 2026.
For more details on the timeline and next steps, please review the Help Documentation.
| SCRIPT_RE = re.compile(r"(?:bash\s+)?([A-Za-z0-9_./-]+\.sh)") | ||
|
|
||
|
|
||
| def audit(readme: Path, root: Path) -> dict[str, object]: | ||
| references = [] | ||
| for match in SCRIPT_RE.finditer(readme.read_text(encoding="utf-8", errors="replace")): | ||
| script = match.group(1) | ||
| references.append({"script": script, "exists": (root / script).exists()}) | ||
| missing = [entry for entry in references if not entry["exists"]] | ||
| return {"reference_count": len(references), "missing_count": len(missing), "references": references} |
There was a problem hiding this comment.
建议:过滤外部脚本并进行去重处理
- 过滤外部脚本:
README.md中包含mxmaca-sdk-install.sh等外部 SDK 安装脚本,它们不属于本仓库,会导致审计工具始终报missing,从而无法在 CI 中通过。建议增加IGNORE_LIST过滤这些外部脚本。 - 去重处理:同一个脚本在文档中可能会被多次提及(例如
mccl.sh),如果不去重,输出的 JSON 中会包含大量重复项,且会重复进行磁盘 I/O 检查。建议使用seen集合进行去重。
| SCRIPT_RE = re.compile(r"(?:bash\s+)?([A-Za-z0-9_./-]+\.sh)") | |
| def audit(readme: Path, root: Path) -> dict[str, object]: | |
| references = [] | |
| for match in SCRIPT_RE.finditer(readme.read_text(encoding="utf-8", errors="replace")): | |
| script = match.group(1) | |
| references.append({"script": script, "exists": (root / script).exists()}) | |
| missing = [entry for entry in references if not entry["exists"]] | |
| return {"reference_count": len(references), "missing_count": len(missing), "references": references} | |
| SCRIPT_RE = re.compile(r"(?:bash\s+)?([A-Za-z0-9_./-]+\.sh)") | |
| IGNORE_LIST = {"mxmaca-sdk-install.sh"} | |
| def audit(readme: Path, root: Path) -> dict[str, object]: | |
| references = [] | |
| seen = set() | |
| for match in SCRIPT_RE.finditer(readme.read_text(encoding="utf-8", errors="replace")): | |
| script = match.group(1) | |
| if script in IGNORE_LIST: | |
| continue | |
| if script not in seen: | |
| seen.add(script) | |
| references.append({"script": script, "exists": (root / script).exists()}) | |
| missing = [entry for entry in references if not entry["exists"]] | |
| return {"reference_count": len(references), "missing_count": len(missing), "references": references} |
| def main() -> int: | ||
| parser = argparse.ArgumentParser(description=__doc__) | ||
| parser.add_argument("--readme", type=Path, default=Path("README.md")) | ||
| parser.add_argument("--root", type=Path, default=Path.cwd()) | ||
| parser.add_argument("--output", type=Path) | ||
| args = parser.parse_args() | ||
|
|
||
| text = json.dumps(audit(args.readme, args.root), indent=2) | ||
| if args.output: | ||
| args.output.parent.mkdir(parents=True, exist_ok=True) | ||
| args.output.write_text(text + "\n", encoding="utf-8") | ||
| else: | ||
| print(text) | ||
| return 0 |
There was a problem hiding this comment.
建议:增加文件存在性检查并返回非零状态码
- 文件存在性检查:如果指定的
--readme文件不存在,程序会直接抛出FileNotFoundError堆栈信息。建议提前检查并优雅地报错。 - 非零状态码返回:作为审计(Audit)工具,如果发现有缺失的脚本(
missing_count > 0),应当返回非零状态码(例如1),以便于在 CI/CD 流程中自动拦截并报错。
| def main() -> int: | |
| parser = argparse.ArgumentParser(description=__doc__) | |
| parser.add_argument("--readme", type=Path, default=Path("README.md")) | |
| parser.add_argument("--root", type=Path, default=Path.cwd()) | |
| parser.add_argument("--output", type=Path) | |
| args = parser.parse_args() | |
| text = json.dumps(audit(args.readme, args.root), indent=2) | |
| if args.output: | |
| args.output.parent.mkdir(parents=True, exist_ok=True) | |
| args.output.write_text(text + "\n", encoding="utf-8") | |
| else: | |
| print(text) | |
| return 0 | |
| def main() -> int: | |
| import sys | |
| parser = argparse.ArgumentParser(description=__doc__) | |
| parser.add_argument("--readme", type=Path, default=Path("README.md")) | |
| parser.add_argument("--root", type=Path, default=Path.cwd()) | |
| parser.add_argument("--output", type=Path) | |
| args = parser.parse_args() | |
| if not args.readme.is_file(): | |
| print(f"Error: README file not found at {args.readme}", file=sys.stderr) | |
| return 1 | |
| result = audit(args.readme, args.root) | |
| text = json.dumps(result, indent=2) | |
| if args.output: | |
| args.output.parent.mkdir(parents=True, exist_ok=True) | |
| args.output.write_text(text + "\n", encoding="utf-8") | |
| else: | |
| print(text) | |
| return 1 if result["missing_count"] > 0 else 0 |
| class DocCommandAuditTest(unittest.TestCase): | ||
| def test_marks_existing_and_missing_scripts(self): | ||
| with tempfile.TemporaryDirectory() as tmpdir: | ||
| root = Path(tmpdir) | ||
| (root / "function").mkdir() | ||
| (root / "mccl.sh").write_text("", encoding="utf-8") | ||
| (root / "README.md").write_text( | ||
| "Run `bash mccl.sh` then `bash function/cluster.sh`.\n", | ||
| encoding="utf-8", | ||
| ) | ||
|
|
||
| result = audit(root / "README.md", root) | ||
|
|
||
| self.assertEqual(result["reference_count"], 2) | ||
| self.assertEqual(result["missing_count"], 1) | ||
| self.assertFalse(result["references"][1]["exists"]) |
There was a problem hiding this comment.
建议:更新单元测试以覆盖去重和忽略列表逻辑
为了配合对 audit 函数的改进(去重和忽略列表),建议更新单元测试,确保这些新逻辑得到充分测试。
| class DocCommandAuditTest(unittest.TestCase): | |
| def test_marks_existing_and_missing_scripts(self): | |
| with tempfile.TemporaryDirectory() as tmpdir: | |
| root = Path(tmpdir) | |
| (root / "function").mkdir() | |
| (root / "mccl.sh").write_text("", encoding="utf-8") | |
| (root / "README.md").write_text( | |
| "Run `bash mccl.sh` then `bash function/cluster.sh`.\n", | |
| encoding="utf-8", | |
| ) | |
| result = audit(root / "README.md", root) | |
| self.assertEqual(result["reference_count"], 2) | |
| self.assertEqual(result["missing_count"], 1) | |
| self.assertFalse(result["references"][1]["exists"]) | |
| class DocCommandAuditTest(unittest.TestCase): | |
| def test_marks_existing_and_missing_scripts(self): | |
| with tempfile.TemporaryDirectory() as tmpdir: | |
| root = Path(tmpdir) | |
| (root / "function").mkdir() | |
| (root / "mccl.sh").write_text("", encoding="utf-8") | |
| (root / "README.md").write_text( | |
| "Run bash mccl.sh then bash function/cluster.sh.\n" | |
| "Also run bash mccl.sh again and bash mxmaca-sdk-install.sh.\n", | |
| encoding="utf-8", | |
| ) | |
| result = audit(root / "README.md", root) | |
| self.assertEqual(result["reference_count"], 2) | |
| self.assertEqual(result["missing_count"], 1) | |
| self.assertFalse(result["references"][1]["exists"]) |
这次改动补上了集体通信文档命令audit,主要是为了解决集体通信测试与结果整理流程里相关信息不够集中、人工整理成本较高的问题,让日常排查、验证和结果归档更直接。
实现上补充了对应工具或脚本逻辑,补上了对应测试,同时尽量保持现有用法不变,避免影响已有流程。
这一分支已经在沐曦算力环境完成实际验证,相关检查均已通过,现提交合入。