增加 mxcc 编译超时控制#23
Conversation
There was a problem hiding this comment.
Code Review
This pull request introduces a timeout mechanism for MACA compilation in TVM by reading the TVM_MXCC_TIMEOUT environment variable and applying it to the subprocess communication. It also adds corresponding unit tests. The reviewer feedback suggests wrapping proc.kill() in a try-except block to handle potential OSError exceptions during timeout handling, and simplifying the unit tests by directly importing the helper function instead of dynamically parsing and executing it with AST.
Important
The consumer version of Gemini Code Assist on GitHub is being sunset. Starting June 18, 2026, new organization installations will be blocked, and all code review activity will officially cease on July 17, 2026.
For more details on the timeline and next steps, please review the Help Documentation.
| except subprocess.TimeoutExpired as err: | ||
| proc.kill() | ||
| out, _ = proc.communicate() |
There was a problem hiding this comment.
在捕获 subprocess.TimeoutExpired 异常后调用 proc.kill() 时,如果子进程在超时瞬间或之后刚好自行结束,可能会抛出 ProcessLookupError(或在 Windows 上抛出其他 OSError)。为了提高代码的健壮性,建议使用 try...except OSError: 块包裹 proc.kill(),以防止因竞态条件导致编译流程异常崩溃。
| except subprocess.TimeoutExpired as err: | |
| proc.kill() | |
| out, _ = proc.communicate() | |
| except subprocess.TimeoutExpired as err: | |
| try: | |
| proc.kill() | |
| except OSError: | |
| pass | |
| out, _ = proc.communicate() |
| import ast | ||
| import os | ||
| from pathlib import Path | ||
| from unittest.mock import patch | ||
|
|
||
|
|
||
| def _load_timeout_helper(): | ||
| source_path = ( | ||
| Path(__file__).resolve().parents[3] / "python" / "tvm" / "contrib" / "mxcc.py" | ||
| ) | ||
| tree = ast.parse(source_path.read_text(encoding="utf-8")) | ||
| nodes = [ | ||
| node | ||
| for node in tree.body | ||
| if ( | ||
| isinstance(node, ast.Import) | ||
| and all(alias.name == "os" for alias in node.names) | ||
| ) | ||
| or (isinstance(node, ast.FunctionDef) and node.name == "_get_mxcc_timeout") | ||
| ] | ||
| module = ast.Module(body=nodes, type_ignores=[]) | ||
| ast.fix_missing_locations(module) | ||
| namespace = {} | ||
| exec(compile(module, str(source_path), "exec"), namespace) | ||
| return namespace["_get_mxcc_timeout"] | ||
|
|
||
|
|
||
| def test_mxcc_timeout_accepts_positive_float(): | ||
| get_timeout = _load_timeout_helper() | ||
| with patch.dict(os.environ, {"TVM_MXCC_TIMEOUT": "12.5"}, clear=True): | ||
| assert get_timeout() == 12.5 | ||
|
|
||
|
|
||
| def test_mxcc_timeout_rejects_invalid_value(): | ||
| get_timeout = _load_timeout_helper() | ||
| with patch.dict(os.environ, {"TVM_MXCC_TIMEOUT": "0"}, clear=True): | ||
| try: | ||
| get_timeout() | ||
| except ValueError as err: | ||
| assert "positive number" in str(err) | ||
| else: | ||
| raise AssertionError("expected ValueError") | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| test_mxcc_timeout_accepts_positive_float() | ||
| test_mxcc_timeout_rejects_invalid_value() |
There was a problem hiding this comment.
使用 ast.parse 和 exec 动态加载并执行 _get_mxcc_timeout 函数过于复杂且非常脆弱(例如,如果未来该函数依赖了其他未被提取的导入或辅助函数,测试将会失败)。
既然这是 TVM 的测试套件,可以直接从 tvm.contrib.mxcc 导入该函数进行测试。此外,使用 pytest.raises 来断言异常比手动 try-except-else 更符合 Python 测试的最佳实践。
| import ast | |
| import os | |
| from pathlib import Path | |
| from unittest.mock import patch | |
| def _load_timeout_helper(): | |
| source_path = ( | |
| Path(__file__).resolve().parents[3] / "python" / "tvm" / "contrib" / "mxcc.py" | |
| ) | |
| tree = ast.parse(source_path.read_text(encoding="utf-8")) | |
| nodes = [ | |
| node | |
| for node in tree.body | |
| if ( | |
| isinstance(node, ast.Import) | |
| and all(alias.name == "os" for alias in node.names) | |
| ) | |
| or (isinstance(node, ast.FunctionDef) and node.name == "_get_mxcc_timeout") | |
| ] | |
| module = ast.Module(body=nodes, type_ignores=[]) | |
| ast.fix_missing_locations(module) | |
| namespace = {} | |
| exec(compile(module, str(source_path), "exec"), namespace) | |
| return namespace["_get_mxcc_timeout"] | |
| def test_mxcc_timeout_accepts_positive_float(): | |
| get_timeout = _load_timeout_helper() | |
| with patch.dict(os.environ, {"TVM_MXCC_TIMEOUT": "12.5"}, clear=True): | |
| assert get_timeout() == 12.5 | |
| def test_mxcc_timeout_rejects_invalid_value(): | |
| get_timeout = _load_timeout_helper() | |
| with patch.dict(os.environ, {"TVM_MXCC_TIMEOUT": "0"}, clear=True): | |
| try: | |
| get_timeout() | |
| except ValueError as err: | |
| assert "positive number" in str(err) | |
| else: | |
| raise AssertionError("expected ValueError") | |
| if __name__ == "__main__": | |
| test_mxcc_timeout_accepts_positive_float() | |
| test_mxcc_timeout_rejects_invalid_value() | |
| import os | |
| from unittest.mock import patch | |
| import pytest | |
| from tvm.contrib.mxcc import _get_mxcc_timeout | |
| def test_mxcc_timeout_accepts_positive_float(): | |
| with patch.dict(os.environ, {"TVM_MXCC_TIMEOUT": "12.5"}, clear=True): | |
| assert _get_mxcc_timeout() == 12.5 | |
| def test_mxcc_timeout_rejects_invalid_value(): | |
| with patch.dict(os.environ, {"TVM_MXCC_TIMEOUT": "0"}, clear=True): | |
| with pytest.raises(ValueError, match="positive number"): | |
| _get_mxcc_timeout() | |
| if __name__ == "__main__": | |
| test_mxcc_timeout_accepts_positive_float() | |
| test_mxcc_timeout_rejects_invalid_value() |
该 PR 为 mxcc 编译流程增加超时控制,避免异常工具链或错误参数导致测试任务长期卡住。
这个修改面向沐曦 GPU 适配场景中比较容易影响开发、构建或验证稳定性的环节,把原来需要人工排查的问题前移到工具链、运行前检查或基准脚本中处理。实现上保持对现有默认行为的兼容,只在检测到明确配置、输入或环境异常时给出更直接的诊断,避免引入额外运行依赖,也方便维护者独立审阅该分支。
已在沐曦算力环境中完成对应分支验证,验证记录包含真实运行日志、命令输出和失败路径检查,本地归档目录为:E:/Documents/muxi/测试报告/mcTVM_new_toolchain_validation_20260608。提交分支:
mengz/mxcc-compile-timeout,目标仓库:MetaX-MACA/mcTVM。