Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions internal/cbm/cbm.h
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@ typedef struct {
int loop_depth; // enclosing loop nesting at the call site
int branch_depth; // enclosing branch nesting at the call site
int start_line; // 1-based source line of the call (for def range-match)
bool is_method; // Perl-only: arrow/method call ($obj->m). Default false.
} CBMCall;

typedef struct {
Expand Down
61 changes: 60 additions & 1 deletion internal/cbm/extract_calls.c
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,40 @@ static char *extract_swift_callee(CBMArena *a, TSNode node, const char *source,
return NULL;
}

// A Perl sub/method name is an identifier: it starts with a letter or '_',
// contains only [A-Za-z0-9_] plus the '::' package separator, and is never a
// string/config literal. tree-sitter-perl mis-parses config lines in .cgi /
// heredoc-heavy files into call-shaped nodes whose "callee" is a dotted config
// token (e.g. "log4perl.appender.File.utf8"); rejecting non-identifier text
// here stops those from becoming bogus CALLS edges. Any '.', whitespace, quote,
// or '/' disqualifies the token.
static bool perl_is_identifier_callee(const char *name) {
if (!name || !name[0]) {
return false;
}
unsigned char c0 = (unsigned char)name[0];
if (!(isalpha(c0) || c0 == '_')) {
return false;
}
for (const char *p = name; *p; p++) {
unsigned char c = (unsigned char)*p;
if (isalnum(c) || c == '_') {
continue;
}
if (c == ':') {
// Only the '::' package separator is allowed: require an adjacent
// pair, and reject a lone ':', ':::', or a trailing '::'.
if (p[1] != ':' || p[2] == ':' || p[2] == '\0') {
return false;
}
p++; // consume the second ':'; the loop's p++ moves past the pair
continue;
}
return false; // '.', space, quote, '/', etc. → not a sub/method name
}
return true;
}

// Callee extraction for scripting languages (Elixir, Perl, PHP, Kotlin, MATLAB).
static char *extract_scripting_callee(CBMArena *a, TSNode node, const char *source,
CBMLanguage lang, const char *nk) {
Expand All @@ -367,7 +401,24 @@ static char *extract_scripting_callee(CBMArena *a, TSNode node, const char *sour
return NULL;
}
if (lang == CBM_LANG_PERL && ts_node_child_count(node) > 0) {
return cbm_node_text(a, ts_node_child(node, 0), source);
// Pull the actual sub/method name token rather than blindly taking
// child(0). Grammar (verified against the vendored parser):
// method_call_expression : field `method` ($obj->m / Class->m)
// function_call_expression : field `function` (foo(); name with '.'
// from a config-string misparse lands here)
// ambiguous_function_call_expression : field `function`
// func1op_call_expression : builtin keyword as child(0) (no field)
TSNode name_node = ts_node_child_by_field_name(node, TS_FIELD("method"));
if (ts_node_is_null(name_node)) {
name_node = ts_node_child_by_field_name(node, TS_FIELD("function"));
}
if (ts_node_is_null(name_node)) {
name_node = ts_node_child(node, 0);
}
char *pn = cbm_node_text(a, name_node, source);
// Reject anything that is not a bare Perl sub/method identifier (config
// strings, quoted literals, paths) so no spurious CALLS edge is emitted.
return perl_is_identifier_callee(pn) ? pn : NULL;
}
if (lang == CBM_LANG_PHP) {
TSNode func_node = ts_node_child_by_field_name(node, TS_FIELD("function"));
Expand Down Expand Up @@ -1134,6 +1185,14 @@ void handle_calls(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec *spec, Walk
call.loop_depth = state->loop_depth; // enclosing loop nesting at this call
call.branch_depth = state->branch_depth; // enclosing branch nesting at this call
call.start_line = (int)ts_node_start_point(node).row + TS_LINE_OFFSET;
// Perl-only: flag arrow/method calls ($obj->m / Class->m). The
// generic short-name resolver cannot place a method without a known
// receiver type, so the call-resolution pass suppresses those edges.
// Default false for every other language (struct is zero-init).
if (ctx->language == CBM_LANG_PERL &&
strcmp(ts_node_type(node), "method_call_expression") == 0) {
call.is_method = true;
}

TSNode args = ts_node_child_by_field_name(node, TS_FIELD("arguments"));
if (!ts_node_is_null(args)) {
Expand Down
2 changes: 1 addition & 1 deletion internal/cbm/lang_specs.c
Original file line number Diff line number Diff line change
Expand Up @@ -585,7 +585,7 @@ static const char *perl_func_types[] = {"subroutine_declaration_statement", NULL
static const char *perl_module_types[] = {"source_file", NULL};
static const char *perl_call_types[] = {"ambiguous_function_call_expression",
"function_call_expression", "func1op_call_expression",
NULL};
"method_call_expression", NULL};
static const char *perl_import_types[] = {"use_statement", "require_statement", "require", NULL};
static const char *perl_branch_types[] = {"if_statement", "unless_statement", "for_statement",
"foreach_statement", "while_statement", NULL};
Expand Down
17 changes: 15 additions & 2 deletions src/pipeline/pass_calls.c
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,7 @@ static const cbm_gbuf_node_t *calls_find_source(cbm_pipeline_ctx_t *ctx, const c
static int resolve_single_call(cbm_pipeline_ctx_t *ctx, CBMCall *call,
const CBMResolvedCallArray *lsp_calls, const char *rel,
const char *module_qn, const char **imp_keys, const char **imp_vals,
int imp_count) {
int imp_count, CBMLanguage lang) {
const cbm_gbuf_node_t *source_node = calls_find_source(ctx, rel, call->enclosing_func_qn);
if (!source_node) {
return 0;
Expand Down Expand Up @@ -366,6 +366,19 @@ static int resolve_single_call(cbm_pipeline_ctx_t *ctx, CBMCall *call,
if (!res.qualified_name || res.qualified_name[0] == '\0') {
return 0;
}

/* Perl call-graph noise guard (#476). Perl has no LSP resolver, so the
* generic registry chain is the only resolver; for builtins (push/shift/
* keys/...) and method calls ($obj->m with an unresolved receiver), a *weak*
* cross-file short-name match to a project sub sharing the name is almost
* always a false positive. Suppress only those weak matches; KEEP the
* high-confidence same_module / import_map strategies so a genuine
* same-file or imported call to a builtin-named sub still resolves. Gated
* to Perl — other languages are unaffected. */
if (cbm_perl_suppress_generic_match(lang == CBM_LANG_PERL, call->is_method, call->callee_name,
res.strategy)) {
return 0;
}
const cbm_gbuf_node_t *target_node = cbm_gbuf_find_by_qn(ctx->gbuf, res.qualified_name);
if (!target_node || source_node->id == target_node->id) {
return 0;
Expand Down Expand Up @@ -440,7 +453,7 @@ int cbm_pipeline_pass_calls(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t *file
}
total_calls++;
if (resolve_single_call(ctx, call, &result->resolved_calls, rel, module_qn, imp_keys,
imp_vals, imp_count)) {
imp_vals, imp_count, files[i].language)) {
resolved++;
} else {
unresolved++;
Expand Down
17 changes: 15 additions & 2 deletions src/pipeline/pass_parallel.c
Original file line number Diff line number Diff line change
Expand Up @@ -1693,7 +1693,7 @@ static void lsp_idx_free_key(const char *key, void *value, void *ud) {
/* Resolve calls for one file and emit CALLS/HTTP_CALLS/ASYNC_CALLS edges. */
static void resolve_file_calls(resolve_ctx_t *rc, resolve_worker_state_t *ws, CBMFileResult *result,
const char *rel, const char *module_qn, const char **imp_keys,
const char **imp_vals, int imp_count) {
const char **imp_vals, int imp_count, CBMLanguage lang) {
/* Build a per-file hash index of resolved_calls keyed by
* "caller_qn|callee_short" for O(1) lookup. cbm_pipeline_find_lsp_
* resolution would otherwise do an O(N) linear scan over
Expand Down Expand Up @@ -1803,6 +1803,19 @@ static void resolve_file_calls(resolve_ctx_t *rc, resolve_worker_state_t *ws, CB
atomic_fetch_add_explicit(&rc->time_ns_rc_hint, extract_now_ns() - _rc_t0,
memory_order_relaxed);

/* Perl call-graph noise guard (#476), mirroring the sequential pass
* (pass_calls.c). Perl has no LSP resolver; for builtins (push/shift/
* keys/...) and method calls ($obj->m, unresolved receiver), suppress
* only WEAK cross-file short-name matches and keep the high-confidence
* same_module / import_map strategies so a genuine same-file or
* imported call to a builtin-named sub still resolves. Placed after the
* field-type hint so a hint cannot re-introduce a suppressed edge.
* Gated to Perl — other languages are unaffected. */
if (cbm_perl_suppress_generic_match(lang == CBM_LANG_PERL, call->is_method,
call->callee_name, res.strategy)) {
continue;
}

if (!res.qualified_name || res.qualified_name[0] == '\0') {
if (cbm_service_pattern_route_method(call->callee_name) != NULL) {
cbm_resolution_t fake_res = {.qualified_name = call->callee_name,
Expand Down Expand Up @@ -2328,7 +2341,7 @@ static void resolve_worker(int worker_id, void *ctx_ptr) {

/* ── CALLS resolution ──────────────────────────────────── */
_ph_t0 = extract_now_ns();
resolve_file_calls(rc, ws, result, rel, module_qn, imp_keys, imp_vals, imp_count);
resolve_file_calls(rc, ws, result, rel, module_qn, imp_keys, imp_vals, imp_count, lang);
atomic_fetch_add_explicit(&rc->time_ns_calls, extract_now_ns() - _ph_t0,
memory_order_relaxed);

Expand Down
13 changes: 13 additions & 0 deletions src/pipeline/pipeline.h
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,19 @@ void cbm_registry_resolve_cache_end(void);
/* Check if a qualified name exists in the registry. */
bool cbm_registry_exists(const cbm_registry_t *r, const char *qn);

/* True if `name` is one of the curated Perl core builtins (perlfunc). Used by
* the call-resolution passes to suppress generic-resolver CALLS edges from Perl
* builtin invocations (push/shift/keys/...) to project subs that merely share
* the name. Perl-scoped: callers gate on the file language. */
bool cbm_perl_is_builtin(const char *name);

/* Decide whether a resolved Perl call edge is generic-resolver noise to drop
* (#476): true only for Perl, only for a builtin/method call, and only when the
* match used a weak short-name strategy — high-confidence same_module/import_map
* matches are kept. Pure; unit-tested in test_registry.c. */
bool cbm_perl_suppress_generic_match(bool is_perl, bool is_method, const char *callee_name,
const char *strategy);

/* Get the label of a qualified name, or NULL if not found. */
const char *cbm_registry_label_of(const cbm_registry_t *r, const char *qn);

Expand Down
67 changes: 67 additions & 0 deletions src/pipeline/registry.c
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,73 @@ static cbm_resolution_t empty_result(void) {
return r;
}

/* ── Perl builtin guard (#459 follow-up: call-graph noise) ──────────
* Curated subset of perlfunc core builtins. When a Perl CALL resolves
* only by the generic short-name matcher (no LSP, no import, after the
* same-module/name-lookup chain), a builtin name like `push`/`shift`/
* `keys` must NOT be wired to a project sub that merely shares the name
* — that is virtually always a false positive. A genuine intra-project
* call is resolved by earlier (LSP/textual) stages before this guard.
* MUST stay sorted ASCII-ascending for bsearch. */
static const char *const PERL_BUILTINS[] = {
"abs", "atan2", "binmode", "bless", "caller", "chdir", "chmod", "chomp",
"chop", "chown", "chr", "chroot", "close", "closedir", "cos", "defined",
"delete", "die", "do", "each", "eof", "eval", "exec", "exists",
"exit", "fork", "gmtime", "goto", "grep", "hex", "index", "int",
"join", "keys", "last", "lc", "lcfirst", "length", "local", "localtime",
"log", "lstat", "map", "mkdir", "my", "next", "oct", "open",
"opendir", "ord", "our", "pop", "pos", "print", "printf", "push",
"quotemeta", "rand", "read", "readdir", "readline", "redo", "ref", "rename",
"require", "return", "reverse", "rindex", "rmdir", "say", "scalar", "seek",
"shift", "sin", "sleep", "sort", "splice", "split", "sprintf", "sqrt",
"srand", "stat", "substr", "system", "time", "uc", "ucfirst", "undef",
"unlink", "unshift", "values", "wantarray", "warn", "write",
};

static int perl_builtin_cmp(const void *key, const void *elem) {
return strcmp((const char *)key, *(const char *const *)elem);
}

/* True if `name` is one of the curated Perl core builtins. Used to suppress
* generic-resolver CALLS edges from Perl builtin invocations to project subs
* that happen to share the builtin's name. Perl-scoped: callers gate on the
* file language so no other language's resolution is affected. */
bool cbm_perl_is_builtin(const char *name) {
if (!name || !name[0]) {
return false;
}
return bsearch(name, PERL_BUILTINS, sizeof(PERL_BUILTINS) / sizeof(PERL_BUILTINS[0]),
sizeof(PERL_BUILTINS[0]), perl_builtin_cmp) != NULL;
}

/* Decide whether a *resolved* Perl call edge is generic-resolver noise that
* should be suppressed (#476). Returns true only for Perl, only for a builtin
* invocation or a method call, and only when the registry landed the match via
* a WEAK short-name strategy. High-confidence import/same-module strategies
* (same_module, import_map, import_map_suffix) are KEPT so a genuine same-file
* or imported call to a builtin-named sub still resolves — only the weak
* short-name guesses (suffix_match, unique_name) are dropped. `strategy` is the
* cbm_resolution_t.strategy of a non-empty match;
* NULL/empty (no match) returns false. Pure + side-effect-free so the
* suppression contract is unit-testable without a full pipeline. */
bool cbm_perl_suppress_generic_match(bool is_perl, bool is_method, const char *callee_name,
const char *strategy) {
if (!is_perl) {
return false;
}
if (!(is_method || cbm_perl_is_builtin(callee_name))) {
return false;
}
if (!strategy || !strategy[0]) {
return false;
}
if (strcmp(strategy, "same_module") == 0 || strcmp(strategy, "import_map") == 0 ||
strcmp(strategy, "import_map_suffix") == 0) {
return false; /* high-confidence import/same-module match — keep the genuine edge */
}
return true; /* weak short-name match (suffix_match / unique_name / …) → drop */
}

/* ── Lifecycle ──────────────────────────────────────────────────── */

cbm_registry_t *cbm_registry_new(void) {
Expand Down
Loading
Loading