Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 73 additions & 2 deletions .codefang.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,36 @@
# ballast_size: "0" # e.g. "128MiB"
# memory_limit: "" # e.g. "8GiB" — sets debug.SetMemoryLimit
# worker_timeout: "" # e.g. "60s" — stall detection timeout per worker request
#
# # Advanced pipeline tuning (0 = use defaults).
# uast_spill_threshold: 32 # file changes per commit before spilling UAST to disk
# intra_commit_parallel_threshold: 4 # min file changes for intra-commit parallel parsing
# max_intra_commit_workers: 4 # max goroutines for intra-commit UAST parsing
# max_uast_blob_size: 262144 # max blob size (bytes) for UAST parsing (256 KiB)
# uast_parse_timeout: "10s" # per-file UAST parse timeout
# max_changes_per_commit: 10000 # commits exceeding this are skipped
# max_diff_batch_size: 1000 # max diff requests per batch
# memory_budget_ratio: 50 # % of system RAM for auto memory budget
# memory_budget_cap: "2GiB" # max auto memory budget
# memory_limit_ratio: 75 # % of system RAM for Go soft memory limit
#
# # Extended pipeline tuning.
# uast_spill_trim_interval: 16 # MallocTrim frequency during UAST spill-mode parsing
# native_trim_interval: 10 # malloc_trim frequency within a chunk
# max_streaming_buffering: 3 # max buffering factor for streaming (triple-buffering)
# drain_prefetch_timeout: "30s" # timeout for abandoned prefetch goroutines
# sampler_interval: "2s" # pipeline sampler polling interval
# worker_ratio: 100 # % of CPU cores for pipeline workers
# uast_worker_ratio: 40 # % of CPU cores for UAST pipeline workers
# leaf_worker_divisor: 3 # leaf workers = NumCPU / divisor
# min_leaf_workers: 4 # minimum leaf workers
# buffer_size_multiplier: 2 # buffer size = workers * multiplier
# budget_limit_ratio: 95 # budget-to-memory-limit conversion (%)
# system_ram_limit_ratio: 90 # memory limit cap as % of system RAM
# diff_job_buffer_multiplier: 10 # scales diff job queue buffer
# static_max_workers: 8 # max concurrent workers for static analysis
# malloc_trim_interval: 50 # files between malloc_trim calls in static analysis
# static_memory_limit_ratio: 90 # % of budget for static phase memory limit

# History analyzer settings.
# history:
Expand All @@ -37,17 +67,58 @@
# debug: false
# goroutines: 0 # 0 = auto
#
# couples:
# coupling_threshold_high: 10 # min co-change count for "high" coupling
# ownership_few_threshold: 3 # max contributors for "few owners" bucket
# ownership_moderate_threshold: 5 # max contributors for "moderate owners" bucket
# batch_coupling_threshold: 100 # max file pairs per commit for coupling
# hll_precision: 10 # HyperLogLog precision for contributor sketches
# top_k_per_file: 100 # max coupling pairs per file in store output
# min_edge_weight: 2 # min co-changes for a coupling edge
#
# devs:
# consider_empty_commits: false
# anonymize: false
# bus_factor_threshold: 0.5 # cumulative ownership fraction for bus factor
# risk_threshold_critical: 90.0 # % ownership for critical risk
# risk_threshold_high: 80.0 # % ownership for high risk
# risk_threshold_medium: 60.0 # % ownership for medium risk
# active_threshold_ratio: 0.7 # fraction of max commits for "active" developer
# default_active_days: 90 # lookback window (days) for active developer check
# hll_precision: 14 # HyperLogLog precision for developer sketches
#
# file_history:
# hotspot_threshold_critical: 50 # commits for critical hotspot
# hotspot_threshold_high: 30 # commits for high hotspot
# hotspot_threshold_medium: 15 # commits for medium hotspot
#
# imports:
# goroutines: 4
# max_file_size: 1048576 # 1 MiB
# max_file_size: 1048576 # 1 MiB
# max_dependency_risk_rows: 30 # max rows in dependency risk table
#
# sentiment:
# min_comment_length: 20
# gap: 0.5 # 0.0 - 1.0
# gap: 0.5 # 0.0 - 1.0
# neutralizer_weight: 0.8 # SE-domain adjustment weight (0-1)
# max_weight_ratio: 3.0 # max comment length weight ratio
# positive_threshold: 0.6 # sentiment >= this is "positive"
# negative_threshold: 0.4 # sentiment <= this is "negative"
# trend_threshold: 0.1 # change needed to classify trend direction
# low_sentiment_risk_thresh: 0.2 # sentiment <= this is HIGH risk
#
# clones:
# max_clone_pairs: 1000 # max clone pairs in aggregated report
# num_hashes: 128 # MinHash signature size
# num_bands: 16 # LSH band count
# num_rows: 8 # rows per LSH band
# shingle_size: 5 # token shingle window size
# similarity_type2: 0.8 # Type-2 clone similarity threshold
# similarity_type3: 0.5 # Type-3 clone similarity threshold
# threshold_ratio_yellow: 0.1 # clone ratio for yellow warning
# threshold_ratio_red: 0.3 # clone ratio for red warning
# threshold_pairs_yellow: 5 # clone pairs for yellow warning
# threshold_pairs_red: 20 # clone pairs for red warning
#
# shotness:
# dsl_struct: 'filter(.roles has "Function")'
Expand Down
96 changes: 77 additions & 19 deletions cmd/codefang/commands/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -749,6 +749,20 @@ func parseBoolFlag(cmd *cobra.Command, name string) *bool {
return &v
}

// applyIfZero copies src into *dst only when *dst still holds the zero value
// and src itself is non-zero. In every other case *dst is left untouched, so a
// value that was already set takes precedence over src.
func applyIfZero(dst *int, src int) {
	if src == 0 || *dst != 0 {
		return
	}

	*dst = src
}

// applyIfZeroStr copies src into *dst only when *dst is the empty string and
// src is non-empty. An already populated *dst is never overwritten.
func applyIfZeroStr(dst *string, src string) {
	if src == "" || *dst != "" {
		return
	}

	*dst = src
}

// collectAnalyzerFlags reads CLI flag overrides for all registered analyzer configuration options.
func collectAnalyzerFlags(cmd *cobra.Command) map[string]any {
flags := make(map[string]any)
Expand Down Expand Up @@ -953,7 +967,7 @@ func runHistoryAnalyzers(
return executeHistoryPipeline(
ctx, result.pipeline, path, result.selectedLeaves,
result.commits, result.commitIter, result.commitCount,
result.analyzerKeys, pipelineFormat, result.opts, result.repository, writer,
result.analyzerKeys, pipelineFormat, result.opts, result.fileCfg, result.repository, writer,
)
}

Expand All @@ -968,6 +982,7 @@ type initResult struct {
analyzerKeys []string
format string
opts HistoryRunOptions
fileCfg *cfgpkg.Config // Loaded config file (may be nil).
}

// initHistoryPipeline performs the initialization phase: builds the pipeline,
Expand Down Expand Up @@ -1035,7 +1050,7 @@ func initHeadOnly(
return initResult{}, loadErr
}

selectedLeaves, configErr := configureAndSelect(pl, analyzerKeys, opts)
selectedLeaves, fileCfg, configErr := configureAndSelect(pl, analyzerKeys, opts)
if configErr != nil {
repository.Free()

Expand All @@ -1055,6 +1070,7 @@ func initHeadOnly(
analyzerKeys: analyzerKeys,
format: normalizedFormat,
opts: opts,
fileCfg: fileCfg,
}, nil
}

Expand Down Expand Up @@ -1104,7 +1120,7 @@ func initStreamingIterator(
return initResult{}, fmt.Errorf("failed to create commit iterator: %w", err)
}

selectedLeaves, configErr := configureAndSelect(pl, analyzerKeys, opts)
selectedLeaves, fileCfg, configErr := configureAndSelect(pl, analyzerKeys, opts)
if configErr != nil {
iter.Close()
repository.Free()
Expand All @@ -1127,13 +1143,16 @@ func initStreamingIterator(
analyzerKeys: analyzerKeys,
format: normalizedFormat,
opts: opts,
fileCfg: fileCfg,
}, nil
}

// configureAndSelect configures core analyzers with facts and selects leaf analyzers.
// It loads settings via cfgpkg.LoadConfig(opts.ConfigFile) and applies them to facts
// before configuring analyzers, then returns the loaded config with the selected leaves.
func configureAndSelect(pl *historyPipeline, analyzerKeys []string, opts HistoryRunOptions) ([]analyze.HistoryAnalyzer, error) {
func configureAndSelect(
pl *historyPipeline, analyzerKeys []string, opts HistoryRunOptions,
) ([]analyze.HistoryAnalyzer, *cfgpkg.Config, error) {
facts := buildFacts(pl, opts)

if opts.TmpDir != "" {
Expand All @@ -1143,7 +1162,7 @@ func configureAndSelect(pl *historyPipeline, analyzerKeys []string, opts History
// Apply file-based configuration if provided.
cfg, cfgErr := cfgpkg.LoadConfig(opts.ConfigFile)
if cfgErr != nil {
return nil, fmt.Errorf("load config: %w", cfgErr)
return nil, nil, fmt.Errorf("load config: %w", cfgErr)
}

cfg.ApplyToFacts(facts)
Expand All @@ -1152,15 +1171,61 @@ func configureAndSelect(pl *historyPipeline, analyzerKeys []string, opts History
// (e.g. TicksSinceStart publishes FactCommitsByTick) that leaves depend on.
err := configureAnalyzers(pl.Core, facts)
if err != nil {
return nil, err
return nil, nil, err
}

selectedLeaves, err := selectLeaves(pl.Leaves, analyzerKeys, facts)
if err != nil {
return nil, err
return nil, nil, err
}

return selectedLeaves, cfg, nil
}

// buildConfigParams assembles the framework.ConfigParams for a history run.
// The fields in the literal below are copied one-to-one from opts. When fileCfg
// is non-nil, applyPipelineConfigParams is then called with fileCfg.Pipeline to
// populate further fields of params; a nil fileCfg leaves params as built from
// opts alone.
func buildConfigParams(opts HistoryRunOptions, fileCfg *cfgpkg.Config) framework.ConfigParams {
params := framework.ConfigParams{
Workers: opts.Workers,
BufferSize: opts.BufferSize,
CommitBatchSize: opts.CommitBatchSize,
BlobCacheSize: opts.BlobCacheSize,
DiffCacheSize: opts.DiffCacheSize,
BlobArenaSize: opts.BlobArenaSize,
MemoryBudget: opts.MemoryBudget,
GCPercent: opts.GCPercent,
BallastSize: opts.BallastSize,
}

// NOTE(review): the next return looks like a removed line of configureAndSelect
// carried over by the diff view rather than part of this function — confirm.
return selectedLeaves, nil
// The config file section is optional; skip it when no config was loaded.
if fileCfg != nil {
applyPipelineConfigParams(&params, fileCfg.Pipeline)
}

return params
}

// applyPipelineConfigParams transfers the pipeline tuning values of p into
// params. Every field goes through applyIfZero (integers) or applyIfZeroStr
// (strings), so a field of params is written only while it still holds its zero
// value and p provides a non-zero one. The fields are independent of each
// other, so the integer and string groups are processed in two separate passes.
func applyPipelineConfigParams(params *framework.ConfigParams, p cfgpkg.PipelineConfig) {
	intFields := []struct {
		dst *int
		src int
	}{
		{&params.UASTSpillThreshold, p.UASTSpillThreshold},
		{&params.IntraCommitParallelThreshold, p.IntraCommitParallelThreshold},
		{&params.MaxIntraCommitWorkers, p.MaxIntraCommitWorkers},
		{&params.MaxUASTBlobSize, p.MaxUASTBlobSize},
		{&params.MaxChangesPerCommit, p.MaxChangesPerCommit},
		{&params.MaxDiffBatchSize, p.MaxDiffBatchSize},
		{&params.MemoryBudgetRatio, p.MemoryBudgetRatio},
		{&params.MemoryLimitRatio, p.MemoryLimitRatio},
		{&params.UASTSpillTrimInterval, p.UASTSpillTrimInterval},
		{&params.NativeTrimInterval, p.NativeTrimInterval},
		{&params.MaxStreamingBuffering, p.MaxStreamingBuffering},
		{&params.WorkerRatio, p.WorkerRatio},
		{&params.UASTWorkerRatio, p.UASTWorkerRatio},
		{&params.LeafWorkerDivisor, p.LeafWorkerDivisor},
		{&params.MinLeafWorkers, p.MinLeafWorkers},
		{&params.BufferSizeMultiplier, p.BufferSizeMultiplier},
		{&params.BudgetLimitRatio, p.BudgetLimitRatio},
		{&params.SystemRAMLimitRatio, p.SystemRAMLimitRatio},
		{&params.DiffJobBufferMultiplier, p.DiffJobBufferMultiplier},
	}
	for _, field := range intFields {
		applyIfZero(field.dst, field.src)
	}

	strFields := []struct {
		dst *string
		src string
	}{
		{&params.UASTParseTimeout, p.UASTParseTimeout},
		{&params.MemoryBudgetCap, p.MemoryBudgetCap},
		{&params.DrainPrefetchTimeout, p.DrainPrefetchTimeout},
		{&params.SamplerInterval, p.SamplerInterval},
	}
	for _, field := range strFields {
		applyIfZeroStr(field.dst, field.src)
	}
}

func executeHistoryPipeline(
Expand All @@ -1174,6 +1239,7 @@ func executeHistoryPipeline(
analyzerKeys []string,
normalizedFormat string,
opts HistoryRunOptions,
fileCfg *cfgpkg.Config,
repository *gitlib.Repository,
writer io.Writer,
) error {
Expand All @@ -1183,17 +1249,9 @@ func executeHistoryPipeline(
allAnalyzers = append(allAnalyzers, pl.Core...)
allAnalyzers = append(allAnalyzers, selectedLeaves...)

coordConfig, memBudget, err := framework.BuildConfigFromParams(framework.ConfigParams{
Workers: opts.Workers,
BufferSize: opts.BufferSize,
CommitBatchSize: opts.CommitBatchSize,
BlobCacheSize: opts.BlobCacheSize,
DiffCacheSize: opts.DiffCacheSize,
BlobArenaSize: opts.BlobArenaSize,
MemoryBudget: opts.MemoryBudget,
GCPercent: opts.GCPercent,
BallastSize: opts.BallastSize,
}, budget.SolveForBudget)
params := buildConfigParams(opts, fileCfg)

coordConfig, memBudget, err := framework.BuildConfigFromParams(params, budget.SolveForBudget)
if err != nil {
return err
}
Expand Down
14 changes: 10 additions & 4 deletions internal/analyzers/clones/aggregator.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,19 @@ type Aggregator struct {
totalFunctions int
// MaxClonePairs limits the number of clone pairs stored in the report detail.
// The total_clone_pairs count remains exact. Zero means unlimited.
MaxClonePairs int
MaxClonePairs int
NumBands int
NumRows int
SimilarityType3 float64
}

// NewAggregator creates a new clone detection aggregator.
// MaxClonePairs is initialised from DefaultMaxClonePairs; NumBands, NumRows and
// SimilarityType3 are initialised from the package-level numBands, numRows and
// similarityType3 identifiers. Callers may overwrite these exported fields on
// the returned value.
func NewAggregator() *Aggregator {
return &Aggregator{
// NOTE(review): the two MaxClonePairs lines look like the old and new rendering
// of the same line in the diff view (alignment-only change) — confirm.
MaxClonePairs: DefaultMaxClonePairs,
MaxClonePairs: DefaultMaxClonePairs,
NumBands: numBands,
NumRows: numRows,
SimilarityType3: similarityType3,
}
}

Expand Down Expand Up @@ -126,7 +132,7 @@ func (a *Aggregator) detectGlobalClones() (pairs []ClonePair, totalCount int) {
return nil, 0
}

idx, err := lsh.New(numBands, numRows)
idx, err := lsh.New(a.NumBands, a.NumRows)
if err != nil {
return nil, 0
}
Expand All @@ -138,7 +144,7 @@ func (a *Aggregator) detectGlobalClones() (pairs []ClonePair, totalCount int) {
}
}

return findClonePairs(a.entries, idx, a.MaxClonePairs)
return findClonePairs(a.entries, idx, a.MaxClonePairs, a.SimilarityType3)
}

// qualifyFuncName returns "sourceFile::name" if sourceFile is non-empty,
Expand Down
Loading
Loading