From 94239d6d978023664e6606e9d0beba08339afba5 Mon Sep 17 00:00:00 2001 From: stasnamco <2972373@gmail.com> Date: Sun, 5 Jul 2026 05:07:30 +0000 Subject: [PATCH] fix(extract): route extensionless shebang scripts to their AST extractor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit detect.classify_file already labels extensionless files with a bash/python/ node/... shebang as CODE via _shebang_interpreter, but _get_extractor dispatched purely on path.suffix — so a CLI entry point like `devctl` or `manage` was detected as code and then silently contributed zero nodes to the graph (its doc-referenced symbols stayed dangling stubs). Resolve extensionless files through the same _shebang_interpreter and a new _SHEBANG_DISPATCH map. Only interpreters with a real extractor are mapped (python/bash-family/node/ruby/lua/php/julia); detect's wider set (perl, fish, tcsh, Rscript) stays unmapped and skipped rather than being mis-parsed by a wrong grammar. Co-Authored-By: Claude Fable 5 --- graphify/extract.py | 32 +++++++++++++++++++++++++++ tests/test_extract.py | 50 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/graphify/extract.py b/graphify/extract.py index f14f4638d..13c6cafb5 100644 --- a/graphify/extract.py +++ b/graphify/extract.py @@ -15821,6 +15821,30 @@ def _body_of(block): } +# Extensionless executables (CLI entry points like `devctl` or `manage`) carry +# their language in the shebang, not the suffix. detect.classify_file already +# routes them to the CODE path via _shebang_interpreter; _get_extractor must +# honor the same signal or these files are classified as code and then silently +# dropped by extraction. Only interpreters with a real extractor are mapped — +# detect's wider set (perl, fish, tcsh, Rscript) stays unmapped and skipped. +_SHEBANG_DISPATCH: dict[str, Any] = { + "python": extract_python, + "python2": extract_python, + "python3": extract_python, + "bash": extract_bash, + "sh": extract_bash, + "dash": extract_bash, + "zsh": extract_bash, + "ksh": extract_bash, + "node": extract_js, + "nodejs": extract_js, + "ruby": extract_ruby, + "lua": extract_lua, + "php": extract_php, + "julia": extract_julia, +} + + # ObjC-only directives. They are illegal in C and C++, so finding one in a `.h` # file is a near-zero-false-positive signal that the header is Objective-C (and so # belongs to extract_objc, not extract_c). `@property` is deliberately excluded: it @@ -15908,6 +15932,14 @@ def _get_extractor(path: Path) -> Any | None: # grammar has no class_specifier). Reroute to extract_cpp (#1547). if _is_cpp_header(path): return extract_cpp + # Extensionless files: resolve by shebang, mirroring detect.classify_file. + # Without this, detect labels e.g. `#!/usr/bin/env bash` CLIs as code but + # extraction returns no extractor and the file silently contributes nothing. + if not suffix: + from graphify.detect import _shebang_interpreter + interp = _shebang_interpreter(path) + if interp is not None: + return _SHEBANG_DISPATCH.get(interp) return _DISPATCH.get(suffix) diff --git a/tests/test_extract.py b/tests/test_extract.py index ae88e364b..4e76ad120 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -1514,6 +1514,56 @@ def test_extract_json_via_dispatch(): assert _get_extractor(Path("foo.json")) is extract_json +def test_extensionless_shebang_via_dispatch(tmp_path): + """Extensionless CLIs resolve their extractor from the shebang, mirroring + detect.classify_file — otherwise detect labels them code and extraction + silently drops them.""" + from graphify.extract import _get_extractor + + cli = tmp_path / "devctl" + cli.write_text("#!/usr/bin/env bash\necho hi\n") + assert _get_extractor(cli) is extract_bash + + pytool = tmp_path / "manage" + pytool.write_text("#!/usr/bin/env python3\nprint('hi')\n") + assert _get_extractor(pytool) is extract_python + + # env -S split-args form is handled by the shared shebang parser + split = tmp_path / "runner" + split.write_text("#!/usr/bin/env -S bash -eu\necho hi\n") + assert _get_extractor(split) is extract_bash + + +def test_extensionless_without_usable_shebang_stays_unsupported(tmp_path): + from graphify.extract import _get_extractor + + plain = tmp_path / "LICENSE-COPY" + plain.write_text("plain text, no shebang\n") + assert _get_extractor(plain) is None + + # Interpreter known to detect but with no AST extractor: stays skipped + # rather than being mis-parsed by a wrong grammar. + perl = tmp_path / "legacy" + perl.write_text("#!/usr/bin/env perl\nprint 1;\n") + assert _get_extractor(perl) is None + + +def test_extract_extensionless_bash_cli_end_to_end(tmp_path): + """A shebang-only bash CLI must contribute nodes with the same ID scheme + as a .sh file (path stem + entity), so doc-created stub IDs merge.""" + cli = tmp_path / "devctl" + cli.write_text( + "#!/usr/bin/env bash\n" + "helper() { echo hi; }\n" + "main() { helper; }\n" + 'main "$@"\n' + ) + result = extract([cli], cache_root=tmp_path) + ids = {n["id"] for n in result["nodes"]} + assert "devctl_helper" in ids + assert "devctl_main" in ids + + def test_extract_bash_node_metadata_is_sanitized(): """Bash extractor must route node metadata through sanitize_metadata so HTML-sensitive characters cannot reach downstream graph viewers raw."""