diff --git a/.codefang.yaml b/.codefang.yaml index 3a1fd43..24bf8a9 100644 --- a/.codefang.yaml +++ b/.codefang.yaml @@ -23,6 +23,36 @@ # ballast_size: "0" # e.g. "128MiB" # memory_limit: "" # e.g. "8GiB" — sets debug.SetMemoryLimit # worker_timeout: "" # e.g. "60s" — stall detection timeout per worker request +# +# # Advanced pipeline tuning (0 = use defaults). +# uast_spill_threshold: 32 # file changes per commit before spilling UAST to disk +# intra_commit_parallel_threshold: 4 # min file changes for intra-commit parallel parsing +# max_intra_commit_workers: 4 # max goroutines for intra-commit UAST parsing +# max_uast_blob_size: 262144 # max blob size (bytes) for UAST parsing (256 KiB) +# uast_parse_timeout: "10s" # per-file UAST parse timeout +# max_changes_per_commit: 10000 # commits exceeding this are skipped +# max_diff_batch_size: 1000 # max diff requests per batch +# memory_budget_ratio: 50 # % of system RAM for auto memory budget +# memory_budget_cap: "2GiB" # max auto memory budget +# memory_limit_ratio: 75 # % of system RAM for Go soft memory limit +# +# # Extended pipeline tuning. 
+# uast_spill_trim_interval: 16 # MallocTrim frequency during UAST spill-mode parsing +# native_trim_interval: 10 # malloc_trim frequency within a chunk +# max_streaming_buffering: 3 # max buffering factor for streaming (triple-buffering) +# drain_prefetch_timeout: "30s" # timeout for abandoned prefetch goroutines +# sampler_interval: "2s" # pipeline sampler polling interval +# worker_ratio: 100 # % of CPU cores for pipeline workers +# uast_worker_ratio: 40 # % of CPU cores for UAST pipeline workers +# leaf_worker_divisor: 3 # leaf workers = NumCPU / divisor +# min_leaf_workers: 4 # minimum leaf workers +# buffer_size_multiplier: 2 # buffer size = workers * multiplier +# budget_limit_ratio: 95 # budget-to-memory-limit conversion (%) +# system_ram_limit_ratio: 90 # memory limit cap as % of system RAM +# diff_job_buffer_multiplier: 10 # scales diff job queue buffer +# static_max_workers: 8 # max concurrent workers for static analysis +# malloc_trim_interval: 50 # files between malloc_trim calls in static analysis +# static_memory_limit_ratio: 90 # % of budget for static phase memory limit # History analyzer settings. 
# history: @@ -37,17 +67,58 @@ # debug: false # goroutines: 0 # 0 = auto # +# couples: +# coupling_threshold_high: 10 # min co-change count for "high" coupling +# ownership_few_threshold: 3 # max contributors for "few owners" bucket +# ownership_moderate_threshold: 5 # max contributors for "moderate owners" bucket +# batch_coupling_threshold: 100 # max file pairs per commit for coupling +# hll_precision: 10 # HyperLogLog precision for contributor sketches +# top_k_per_file: 100 # max coupling pairs per file in store output +# min_edge_weight: 2 # min co-changes for a coupling edge +# # devs: # consider_empty_commits: false # anonymize: false +# bus_factor_threshold: 0.5 # cumulative ownership fraction for bus factor +# risk_threshold_critical: 90.0 # % ownership for critical risk +# risk_threshold_high: 80.0 # % ownership for high risk +# risk_threshold_medium: 60.0 # % ownership for medium risk +# active_threshold_ratio: 0.7 # fraction of max commits for "active" developer +# default_active_days: 90 # lookback window (days) for active developer check +# hll_precision: 14 # HyperLogLog precision for developer sketches +# +# file_history: +# hotspot_threshold_critical: 50 # commits for critical hotspot +# hotspot_threshold_high: 30 # commits for high hotspot +# hotspot_threshold_medium: 15 # commits for medium hotspot # # imports: # goroutines: 4 -# max_file_size: 1048576 # 1 MiB +# max_file_size: 1048576 # 1 MiB +# max_dependency_risk_rows: 30 # max rows in dependency risk table # # sentiment: # min_comment_length: 20 -# gap: 0.5 # 0.0 - 1.0 +# gap: 0.5 # 0.0 - 1.0 +# neutralizer_weight: 0.8 # SE-domain adjustment weight (0-1) +# max_weight_ratio: 3.0 # max comment length weight ratio +# positive_threshold: 0.6 # sentiment >= this is "positive" +# negative_threshold: 0.4 # sentiment <= this is "negative" +# trend_threshold: 0.1 # change needed to classify trend direction +# low_sentiment_risk_thresh: 0.2 # sentiment <= this is HIGH risk +# +# clones: +# 
max_clone_pairs: 1000 # max clone pairs in aggregated report +# num_hashes: 128 # MinHash signature size +# num_bands: 16 # LSH band count +# num_rows: 8 # rows per LSH band +# shingle_size: 5 # token shingle window size +# similarity_type2: 0.8 # Type-2 clone similarity threshold +# similarity_type3: 0.5 # Type-3 clone similarity threshold +# threshold_ratio_yellow: 0.1 # clone ratio for yellow warning +# threshold_ratio_red: 0.3 # clone ratio for red warning +# threshold_pairs_yellow: 5 # clone pairs for yellow warning +# threshold_pairs_red: 20 # clone pairs for red warning # # shotness: # dsl_struct: 'filter(.roles has "Function")' diff --git a/cmd/codefang/commands/run.go b/cmd/codefang/commands/run.go index c05b820..18c3875 100644 --- a/cmd/codefang/commands/run.go +++ b/cmd/codefang/commands/run.go @@ -749,6 +749,20 @@ func parseBoolFlag(cmd *cobra.Command, name string) *bool { return &v } +// applyIfZero sets *dst to src if *dst is zero (not explicitly set by CLI flags). +func applyIfZero(dst *int, src int) { + if *dst == 0 && src != 0 { + *dst = src + } +} + +// applyIfZeroStr sets *dst to src if *dst is empty. +func applyIfZeroStr(dst *string, src string) { + if *dst == "" && src != "" { + *dst = src + } +} + // collectAnalyzerFlags reads CLI flag overrides for all registered analyzer configuration options. func collectAnalyzerFlags(cmd *cobra.Command) map[string]any { flags := make(map[string]any) @@ -953,7 +967,7 @@ func runHistoryAnalyzers( return executeHistoryPipeline( ctx, result.pipeline, path, result.selectedLeaves, result.commits, result.commitIter, result.commitCount, - result.analyzerKeys, pipelineFormat, result.opts, result.repository, writer, + result.analyzerKeys, pipelineFormat, result.opts, result.fileCfg, result.repository, writer, ) } @@ -968,6 +982,7 @@ type initResult struct { analyzerKeys []string format string opts HistoryRunOptions + fileCfg *cfgpkg.Config // Loaded config file (may be nil). 
} // initHistoryPipeline performs the initialization phase: builds the pipeline, @@ -1035,7 +1050,7 @@ func initHeadOnly( return initResult{}, loadErr } - selectedLeaves, configErr := configureAndSelect(pl, analyzerKeys, opts) + selectedLeaves, fileCfg, configErr := configureAndSelect(pl, analyzerKeys, opts) if configErr != nil { repository.Free() @@ -1055,6 +1070,7 @@ func initHeadOnly( analyzerKeys: analyzerKeys, format: normalizedFormat, opts: opts, + fileCfg: fileCfg, }, nil } @@ -1104,7 +1120,7 @@ func initStreamingIterator( return initResult{}, fmt.Errorf("failed to create commit iterator: %w", err) } - selectedLeaves, configErr := configureAndSelect(pl, analyzerKeys, opts) + selectedLeaves, fileCfg, configErr := configureAndSelect(pl, analyzerKeys, opts) if configErr != nil { iter.Close() repository.Free() @@ -1127,13 +1143,16 @@ func initStreamingIterator( analyzerKeys: analyzerKeys, format: normalizedFormat, opts: opts, + fileCfg: fileCfg, }, nil } // configureAndSelect configures core analyzers with facts and selects leaf analyzers. // When configFile is non-empty, it loads analyzer settings from the given config file // and applies them to facts before configuring analyzers. -func configureAndSelect(pl *historyPipeline, analyzerKeys []string, opts HistoryRunOptions) ([]analyze.HistoryAnalyzer, error) { +func configureAndSelect( + pl *historyPipeline, analyzerKeys []string, opts HistoryRunOptions, +) ([]analyze.HistoryAnalyzer, *cfgpkg.Config, error) { facts := buildFacts(pl, opts) if opts.TmpDir != "" { @@ -1143,7 +1162,7 @@ func configureAndSelect(pl *historyPipeline, analyzerKeys []string, opts History // Apply file-based configuration if provided. 
cfg, cfgErr := cfgpkg.LoadConfig(opts.ConfigFile) if cfgErr != nil { - return nil, fmt.Errorf("load config: %w", cfgErr) + return nil, nil, fmt.Errorf("load config: %w", cfgErr) } cfg.ApplyToFacts(facts) @@ -1152,15 +1171,61 @@ func configureAndSelect(pl *historyPipeline, analyzerKeys []string, opts History // (e.g. TicksSinceStart publishes FactCommitsByTick) that leaves depend on. err := configureAnalyzers(pl.Core, facts) if err != nil { - return nil, err + return nil, nil, err } selectedLeaves, err := selectLeaves(pl.Leaves, analyzerKeys, facts) if err != nil { - return nil, err + return nil, nil, err + } + + return selectedLeaves, cfg, nil +} + +func buildConfigParams(opts HistoryRunOptions, fileCfg *cfgpkg.Config) framework.ConfigParams { + params := framework.ConfigParams{ + Workers: opts.Workers, + BufferSize: opts.BufferSize, + CommitBatchSize: opts.CommitBatchSize, + BlobCacheSize: opts.BlobCacheSize, + DiffCacheSize: opts.DiffCacheSize, + BlobArenaSize: opts.BlobArenaSize, + MemoryBudget: opts.MemoryBudget, + GCPercent: opts.GCPercent, + BallastSize: opts.BallastSize, } - return selectedLeaves, nil + if fileCfg != nil { + applyPipelineConfigParams(&params, fileCfg.Pipeline) + } + + return params +} + +func applyPipelineConfigParams(params *framework.ConfigParams, p cfgpkg.PipelineConfig) { + applyIfZero(&params.UASTSpillThreshold, p.UASTSpillThreshold) + applyIfZero(&params.IntraCommitParallelThreshold, p.IntraCommitParallelThreshold) + applyIfZero(&params.MaxIntraCommitWorkers, p.MaxIntraCommitWorkers) + applyIfZero(&params.MaxUASTBlobSize, p.MaxUASTBlobSize) + applyIfZeroStr(&params.UASTParseTimeout, p.UASTParseTimeout) + applyIfZero(&params.MaxChangesPerCommit, p.MaxChangesPerCommit) + applyIfZero(&params.MaxDiffBatchSize, p.MaxDiffBatchSize) + applyIfZero(&params.MemoryBudgetRatio, p.MemoryBudgetRatio) + applyIfZeroStr(&params.MemoryBudgetCap, p.MemoryBudgetCap) + applyIfZero(&params.MemoryLimitRatio, p.MemoryLimitRatio) + applyIfZero(&params.UASTSpillTrimInterval, p.UASTSpillTrimInterval) + 
applyIfZero(&params.NativeTrimInterval, p.NativeTrimInterval) + applyIfZero(&params.MaxStreamingBuffering, p.MaxStreamingBuffering) + applyIfZeroStr(&params.DrainPrefetchTimeout, p.DrainPrefetchTimeout) + applyIfZeroStr(&params.SamplerInterval, p.SamplerInterval) + applyIfZero(&params.WorkerRatio, p.WorkerRatio) + applyIfZero(&params.UASTWorkerRatio, p.UASTWorkerRatio) + applyIfZero(&params.LeafWorkerDivisor, p.LeafWorkerDivisor) + applyIfZero(&params.MinLeafWorkers, p.MinLeafWorkers) + applyIfZero(&params.BufferSizeMultiplier, p.BufferSizeMultiplier) + applyIfZero(&params.BudgetLimitRatio, p.BudgetLimitRatio) + applyIfZero(&params.SystemRAMLimitRatio, p.SystemRAMLimitRatio) + applyIfZero(&params.DiffJobBufferMultiplier, p.DiffJobBufferMultiplier) } func executeHistoryPipeline( @@ -1174,6 +1239,7 @@ func executeHistoryPipeline( analyzerKeys []string, normalizedFormat string, opts HistoryRunOptions, + fileCfg *cfgpkg.Config, repository *gitlib.Repository, writer io.Writer, ) error { @@ -1183,17 +1249,9 @@ func executeHistoryPipeline( allAnalyzers = append(allAnalyzers, pl.Core...) allAnalyzers = append(allAnalyzers, selectedLeaves...) 
- coordConfig, memBudget, err := framework.BuildConfigFromParams(framework.ConfigParams{ - Workers: opts.Workers, - BufferSize: opts.BufferSize, - CommitBatchSize: opts.CommitBatchSize, - BlobCacheSize: opts.BlobCacheSize, - DiffCacheSize: opts.DiffCacheSize, - BlobArenaSize: opts.BlobArenaSize, - MemoryBudget: opts.MemoryBudget, - GCPercent: opts.GCPercent, - BallastSize: opts.BallastSize, - }, budget.SolveForBudget) + params := buildConfigParams(opts, fileCfg) + + coordConfig, memBudget, err := framework.BuildConfigFromParams(params, budget.SolveForBudget) if err != nil { return err } diff --git a/internal/analyzers/clones/aggregator.go b/internal/analyzers/clones/aggregator.go index 2ce2eb0..0a95680 100644 --- a/internal/analyzers/clones/aggregator.go +++ b/internal/analyzers/clones/aggregator.go @@ -15,13 +15,19 @@ type Aggregator struct { totalFunctions int // MaxClonePairs limits the number of clone pairs stored in the report detail. // The total_clone_pairs count remains exact. Zero means unlimited. - MaxClonePairs int + MaxClonePairs int + NumBands int + NumRows int + SimilarityType3 float64 } // NewAggregator creates a new clone detection aggregator. 
func NewAggregator() *Aggregator { return &Aggregator{ - MaxClonePairs: DefaultMaxClonePairs, + MaxClonePairs: DefaultMaxClonePairs, + NumBands: numBands, + NumRows: numRows, + SimilarityType3: similarityType3, } } @@ -126,7 +132,7 @@ func (a *Aggregator) detectGlobalClones() (pairs []ClonePair, totalCount int) { return nil, 0 } - idx, err := lsh.New(numBands, numRows) + idx, err := lsh.New(a.NumBands, a.NumRows) if err != nil { return nil, 0 } @@ -138,7 +144,7 @@ func (a *Aggregator) detectGlobalClones() (pairs []ClonePair, totalCount int) { } } - return findClonePairs(a.entries, idx, a.MaxClonePairs) + return findClonePairs(a.entries, idx, a.MaxClonePairs, a.SimilarityType3) } // qualifyFuncName returns "sourceFile::name" if sourceFile is non-empty, diff --git a/internal/analyzers/clones/analyzer.go b/internal/analyzers/clones/analyzer.go index 09cc935..3cf10f3 100644 --- a/internal/analyzers/clones/analyzer.go +++ b/internal/analyzers/clones/analyzer.go @@ -64,10 +64,36 @@ const ( maxTraversalVal = 10 ) +// Configuration option keys for the clones analyzer. +const ( + ConfigClonesMaxClonePairs = "Clones.MaxClonePairs" + ConfigClonesNumHashes = "Clones.NumHashes" + ConfigClonesNumBands = "Clones.NumBands" + ConfigClonesNumRows = "Clones.NumRows" + ConfigClonesShingleSize = "Clones.ShingleSize" + ConfigClonesSimilarityType2 = "Clones.SimilarityType2" + ConfigClonesSimilarityType3 = "Clones.SimilarityType3" + ConfigClonesThresholdRatioYellow = "Clones.ThresholdRatioYellow" + ConfigClonesThresholdRatioRed = "Clones.ThresholdRatioRed" + ConfigClonesThresholdPairsYellow = "Clones.ThresholdPairsYellow" + ConfigClonesThresholdPairsRed = "Clones.ThresholdPairsRed" +) + // Analyzer provides clone detection analysis using MinHash and LSH. 
type Analyzer struct { traverser *common.UASTTraverser shingler *Shingler + + cfgMaxClonePairs int + cfgNumHashes int + cfgNumBands int + cfgNumRows int + cfgSimilarityType2 float64 + cfgSimilarityType3 float64 + cfgThresholdRatioYellow float64 + cfgThresholdRatioRed float64 + cfgThresholdPairsYellow int + cfgThresholdPairsRed int } // NewAnalyzer creates a new clone detection Analyzer. @@ -77,7 +103,16 @@ func NewAnalyzer() *Analyzer { MaxDepth: maxTraversalVal, IncludeRoot: true, }), - shingler: NewShingler(defaultShingleSize), + shingler: NewShingler(defaultShingleSize), + cfgNumHashes: numHashes, + cfgNumBands: numBands, + cfgNumRows: numRows, + cfgSimilarityType2: similarityType2, + cfgSimilarityType3: similarityType3, + cfgThresholdRatioYellow: thresholdCloneRatioYellow, + cfgThresholdRatioRed: thresholdCloneRatioRed, + cfgThresholdPairsYellow: thresholdClonePairsYellow, + cfgThresholdPairsRed: thresholdClonePairsRed, } } @@ -106,7 +141,51 @@ func (a *Analyzer) ListConfigurationOptions() []pipeline.ConfigurationOption { } // Configure configures the analyzer. 
-func (a *Analyzer) Configure(_ map[string]any) error { +func (a *Analyzer) Configure(facts map[string]any) error { + if val, ok := facts[ConfigClonesMaxClonePairs].(int); ok { + a.cfgMaxClonePairs = val + } + + if val, ok := facts[ConfigClonesNumHashes].(int); ok { + a.cfgNumHashes = val + } + + if val, ok := facts[ConfigClonesNumBands].(int); ok { + a.cfgNumBands = val + } + + if val, ok := facts[ConfigClonesNumRows].(int); ok { + a.cfgNumRows = val + } + + if val, ok := facts[ConfigClonesShingleSize].(int); ok { + a.shingler = NewShingler(val) + } + + if val, ok := facts[ConfigClonesSimilarityType2].(float64); ok { + a.cfgSimilarityType2 = val + } + + if val, ok := facts[ConfigClonesSimilarityType3].(float64); ok { + a.cfgSimilarityType3 = val + } + + if val, ok := facts[ConfigClonesThresholdRatioYellow].(float64); ok { + a.cfgThresholdRatioYellow = val + } + + if val, ok := facts[ConfigClonesThresholdRatioRed].(float64); ok { + a.cfgThresholdRatioRed = val + } + + if val, ok := facts[ConfigClonesThresholdPairsYellow].(int); ok { + a.cfgThresholdPairsYellow = val + } + + if val, ok := facts[ConfigClonesThresholdPairsRed].(int); ok { + a.cfgThresholdPairsRed = val + } + return nil } @@ -115,25 +194,40 @@ func (a *Analyzer) Thresholds() analyze.Thresholds { return analyze.Thresholds{ "clone_ratio": { "green": 0.0, - "yellow": thresholdCloneRatioYellow, - "red": thresholdCloneRatioRed, + "yellow": a.cfgThresholdRatioYellow, + "red": a.cfgThresholdRatioRed, }, "total_clone_pairs": { "green": 0, - "yellow": thresholdClonePairsYellow, - "red": thresholdClonePairsRed, + "yellow": a.cfgThresholdPairsYellow, + "red": a.cfgThresholdPairsRed, }, } } // CreateAggregator returns a new aggregator for clone analysis. 
func (a *Analyzer) CreateAggregator() analyze.ResultAggregator { - return NewAggregator() + agg := NewAggregator() + if a.cfgMaxClonePairs > 0 { + agg.MaxClonePairs = a.cfgMaxClonePairs + } + + agg.NumBands = a.cfgNumBands + agg.NumRows = a.cfgNumRows + agg.SimilarityType3 = a.cfgSimilarityType3 + + return agg } // CreateVisitor creates a new visitor for single-pass traversal optimization. func (a *Analyzer) CreateVisitor() analyze.AnalysisVisitor { - return NewVisitor() + v := NewVisitor() + v.numHashes = a.cfgNumHashes + v.shingler = a.shingler + v.similarityType2 = a.cfgSimilarityType2 + v.similarityType3 = a.cfgSimilarityType3 + + return v } // CreateReportSection creates a ReportSection from report data. @@ -208,7 +302,7 @@ func (a *Analyzer) detectClones(functions []*node.Node) []ClonePair { return nil } - idx, err := lsh.New(numBands, numRows) + idx, err := lsh.New(a.cfgNumBands, a.cfgNumRows) if err != nil { return nil } @@ -221,7 +315,7 @@ func (a *Analyzer) detectClones(functions []*node.Node) []ClonePair { } // Per-file detection: no cap (single-file scope, bounded by function count). - pairs, _ := findClonePairs(entries, idx, 0) + pairs, _ := findClonePairs(entries, idx, 0, a.cfgSimilarityType3) return pairs } @@ -236,7 +330,7 @@ func (a *Analyzer) buildSignatures(functions []*node.Node) []funcEntry { continue } - sig, err := minhash.New(numHashes) + sig, err := minhash.New(a.cfgNumHashes) if err != nil { continue } diff --git a/internal/analyzers/clones/visitor.go b/internal/analyzers/clones/visitor.go index 99db269..9292c01 100644 --- a/internal/analyzers/clones/visitor.go +++ b/internal/analyzers/clones/visitor.go @@ -13,14 +13,20 @@ import ( // It collects function nodes during traversal and exports MinHash signatures // for cross-file clone detection by the aggregator. 
type Visitor struct { - functions []*node.Node - shingler *Shingler + functions []*node.Node + shingler *Shingler + numHashes int + similarityType2 float64 + similarityType3 float64 } // NewVisitor creates a new clone detection Visitor. func NewVisitor() *Visitor { return &Visitor{ - shingler: NewShingler(defaultShingleSize), + shingler: NewShingler(defaultShingleSize), + numHashes: numHashes, + similarityType2: similarityType2, + similarityType3: similarityType3, } } @@ -58,7 +64,7 @@ func (v *Visitor) buildSignatures() []funcEntry { continue } - sig, err := minhash.New(numHashes) + sig, err := minhash.New(v.numHashes) if err != nil { continue } @@ -104,17 +110,17 @@ func buildSignatureReport(totalFunctions int, entries []funcEntry) analyze.Repor // findClonePairs queries the LSH index and collects unique clone pairs. // pairCap limits the stored pairs slice (0 = unlimited). The returned totalCount // reflects ALL unique pairs found, regardless of the cap. -func findClonePairs(entries []funcEntry, idx *lsh.Index, pairCap int) (pairs []ClonePair, totalCount int) { +func findClonePairs(entries []funcEntry, idx *lsh.Index, pairCap int, minSimilarity float64) (pairs []ClonePair, totalCount int) { seen := make(map[PairKey]bool) sigMap := buildSignatureMap(entries) for _, entry := range entries { - candidates, err := idx.QueryThreshold(entry.sig, similarityType3) + candidates, err := idx.QueryThreshold(entry.sig, minSimilarity) if err != nil { continue } - pairs, totalCount = matchCandidates(entry, candidates, sigMap, seen, pairs, totalCount, pairCap) + pairs, totalCount = matchCandidates(entry, candidates, sigMap, seen, pairs, totalCount, pairCap, minSimilarity) } sort.Slice(pairs, func(i, j int) bool { @@ -145,6 +151,7 @@ func matchCandidates( pairs []ClonePair, totalCount int, pairCap int, + minSimilarity float64, ) (updatedPairs []ClonePair, updatedCount int) { for _, candidateID := range candidates { if candidateID == entry.name { @@ -158,7 +165,7 @@ func 
matchCandidates( seen[key] = true - pair, ok := computeClonePair(entry, candidateID, sigMap) + pair, ok := computeClonePair(entry, candidateID, sigMap, minSimilarity) if ok { totalCount++ @@ -172,7 +179,7 @@ func matchCandidates( } // computeClonePair computes a clone pair between an entry and a candidate. -func computeClonePair(entry funcEntry, candidateID string, sigMap map[string]*minhash.Signature) (ClonePair, bool) { +func computeClonePair(entry funcEntry, candidateID string, sigMap map[string]*minhash.Signature, minSimilarity float64) (ClonePair, bool) { candidateSig := sigMap[candidateID] if candidateSig == nil { return ClonePair{}, false @@ -183,7 +190,7 @@ func computeClonePair(entry funcEntry, candidateID string, sigMap map[string]*mi return ClonePair{}, false } - if similarity < similarityType3 { + if similarity < minSimilarity { return ClonePair{}, false } diff --git a/internal/analyzers/common/aggregator.go b/internal/analyzers/common/aggregator.go index 689288b..df5a21e 100644 --- a/internal/analyzers/common/aggregator.go +++ b/internal/analyzers/common/aggregator.go @@ -2,6 +2,8 @@ package common import ( + "sync" + "github.com/Sumatoshi-tech/codefang/internal/analyzers/analyze" ) @@ -13,6 +15,7 @@ var ( // Aggregator provides generic aggregation capabilities for analyzers. type Aggregator struct { + mu sync.Mutex metricsProcessor *MetricsProcessor dataCollector *SpillableDataCollector resultBuilder *ResultBuilder @@ -52,6 +55,9 @@ func NewAggregator( // Aggregate combines multiple analysis results. func (a *Aggregator) Aggregate(results map[string]analyze.Report) { + a.mu.Lock() + defer a.mu.Unlock() + for _, report := range results { if report == nil { continue @@ -135,6 +141,9 @@ func (a *Aggregator) GetResultBuilder() *ResultBuilder { // EstimatedStateSize returns the estimated in-memory state size in bytes. // Sums MetricsProcessor and SpillableDataCollector estimates. 
func (a *Aggregator) EstimatedStateSize() int64 { + a.mu.Lock() + defer a.mu.Unlock() + return a.metricsProcessor.EstimatedStateBytes() + a.dataCollector.EstimatedBufferBytes() } diff --git a/internal/analyzers/couples/aggregator.go b/internal/analyzers/couples/aggregator.go index 04e6388..0cb893a 100644 --- a/internal/analyzers/couples/aggregator.go +++ b/internal/analyzers/couples/aggregator.go @@ -51,6 +51,10 @@ type Aggregator struct { reversedNames []string lastCommit analyze.CommitLike closed bool + + // cfgBatchCouplingThreshold overrides the default batchCouplingThreshold. + // Zero uses the package-level default. + cfgBatchCouplingThreshold int } func newAggregator( @@ -58,6 +62,7 @@ func newAggregator( peopleNumber int, reversedNames []string, lastCommit analyze.CommitLike, + batchThreshold int, ) *Aggregator { people := make([]map[string]int, peopleNumber+1) for i := range people { @@ -65,15 +70,16 @@ func newAggregator( } return &Aggregator{ - files: spillstore.New[map[string]int](opts.SpillDir), - people: people, - peopleCommits: make([]int, peopleNumber+1), - commitStats: make(map[string]*CommitSummary), - commitsByTick: make(map[int][]gitlib.Hash), - opts: opts, - peopleNumber: peopleNumber, - reversedNames: reversedNames, - lastCommit: lastCommit, + files: spillstore.New[map[string]int](opts.SpillDir), + people: people, + peopleCommits: make([]int, peopleNumber+1), + commitStats: make(map[string]*CommitSummary), + commitsByTick: make(map[int][]gitlib.Hash), + opts: opts, + peopleNumber: peopleNumber, + reversedNames: reversedNames, + lastCommit: lastCommit, + cfgBatchCouplingThreshold: batchThreshold, } } @@ -125,6 +131,15 @@ func (a *Aggregator) addAuthorFiles(authorFiles map[string]int, author int) { // lane maps with the known coupling set size to reduce map growth overhead. const batchCouplingThreshold = 100 +// effectiveBatchCouplingThreshold returns the configured or default batch coupling threshold. 
+func (a *Aggregator) effectiveBatchCouplingThreshold() int { + if a.cfgBatchCouplingThreshold > 0 { + return a.cfgBatchCouplingThreshold + } + + return batchCouplingThreshold +} + // addFileCouplings updates the file co-occurrence matrix. // // For large commits (>= batchCouplingThreshold files), pre-allocates lane @@ -132,13 +147,14 @@ const batchCouplingThreshold = 100 // insertions. func (a *Aggregator) addFileCouplings(couplingFiles []string) { n := len(couplingFiles) + threshold := a.effectiveBatchCouplingThreshold() for _, file := range couplingFiles { lane, ok := a.files.Get(file) if !ok { // Pre-allocate to the known size for large commits. initCap := 0 - if n >= batchCouplingThreshold { + if n >= threshold { initCap = n } diff --git a/internal/analyzers/couples/aggregator_test.go b/internal/analyzers/couples/aggregator_test.go index 4ecdc5a..af27b71 100644 --- a/internal/analyzers/couples/aggregator_test.go +++ b/internal/analyzers/couples/aggregator_test.go @@ -13,7 +13,7 @@ import ( func TestAggregator_Add_StoresData(t *testing.T) { t.Parallel() - agg := newAggregator(analyze.AggregatorOptions{}, 2, []string{"alice", "bob"}, nil) + agg := newAggregator(analyze.AggregatorOptions{}, 2, []string{"alice", "bob"}, nil, 0) tc := analyze.TC{ AuthorID: 0, @@ -47,7 +47,7 @@ func TestAggregator_Add_StoresData(t *testing.T) { func TestAggregator_Add_NilData(t *testing.T) { t.Parallel() - agg := newAggregator(analyze.AggregatorOptions{}, 1, nil, nil) + agg := newAggregator(analyze.AggregatorOptions{}, 1, nil, nil, 0) require.NoError(t, agg.Add(analyze.TC{Data: nil})) @@ -57,7 +57,7 @@ func TestAggregator_Add_NilData(t *testing.T) { func TestAggregator_Add_WrongType(t *testing.T) { t.Parallel() - agg := newAggregator(analyze.AggregatorOptions{}, 1, nil, nil) + agg := newAggregator(analyze.AggregatorOptions{}, 1, nil, nil, 0) require.NoError(t, agg.Add(analyze.TC{Data: "wrong"})) @@ -67,7 +67,7 @@ func TestAggregator_Add_WrongType(t *testing.T) { func 
TestAggregator_Add_EmptyCouplingFiles(t *testing.T) { t.Parallel() - agg := newAggregator(analyze.AggregatorOptions{}, 1, nil, nil) + agg := newAggregator(analyze.AggregatorOptions{}, 1, nil, nil, 0) tc := analyze.TC{ AuthorID: 0, @@ -91,7 +91,7 @@ func TestAggregator_Add_EmptyCouplingFiles(t *testing.T) { func TestAggregator_Add_PeopleGrowth(t *testing.T) { t.Parallel() - agg := newAggregator(analyze.AggregatorOptions{}, 1, nil, nil) + agg := newAggregator(analyze.AggregatorOptions{}, 1, nil, nil, 0) tc := analyze.TC{ AuthorID: 5, // Exceeds initial capacity of 2 (PeopleNumber+1). @@ -110,7 +110,7 @@ func TestAggregator_Add_PeopleGrowth(t *testing.T) { func TestAggregator_Add_CommitNotCounted(t *testing.T) { t.Parallel() - agg := newAggregator(analyze.AggregatorOptions{}, 1, nil, nil) + agg := newAggregator(analyze.AggregatorOptions{}, 1, nil, nil, 0) tc := analyze.TC{ AuthorID: 0, @@ -130,7 +130,7 @@ func TestAggregator_Add_CommitNotCounted(t *testing.T) { func TestAggregator_FlushTick_Empty(t *testing.T) { t.Parallel() - agg := newAggregator(analyze.AggregatorOptions{}, 1, nil, nil) + agg := newAggregator(analyze.AggregatorOptions{}, 1, nil, nil, 0) tick, err := agg.FlushTick(0) require.NoError(t, err) @@ -142,7 +142,7 @@ func TestAggregator_FlushTick_Empty(t *testing.T) { func TestAggregator_FlushTick_WithData(t *testing.T) { t.Parallel() - agg := newAggregator(analyze.AggregatorOptions{}, 1, nil, nil) + agg := newAggregator(analyze.AggregatorOptions{}, 1, nil, nil, 0) require.NoError(t, agg.Add(analyze.TC{ AuthorID: 0, @@ -168,7 +168,7 @@ func TestAggregator_FlushTick_WithData(t *testing.T) { func TestAggregator_SpillCollect(t *testing.T) { t.Parallel() - agg := newAggregator(analyze.AggregatorOptions{}, 1, nil, nil) + agg := newAggregator(analyze.AggregatorOptions{}, 1, nil, nil, 0) require.NoError(t, agg.Add(analyze.TC{ AuthorID: 0, @@ -209,7 +209,7 @@ func TestAggregator_SpillCollect(t *testing.T) { func TestAggregator_AutoSpill(t *testing.T) { t.Parallel() 
- agg := newAggregator(analyze.AggregatorOptions{SpillBudget: 1}, 1, nil, nil) + agg := newAggregator(analyze.AggregatorOptions{SpillBudget: 1}, 1, nil, nil, 0) require.NoError(t, agg.Add(analyze.TC{ AuthorID: 0, @@ -228,7 +228,7 @@ func TestAggregator_AutoSpill(t *testing.T) { func TestAggregator_Close_Idempotent(t *testing.T) { t.Parallel() - agg := newAggregator(analyze.AggregatorOptions{}, 1, nil, nil) + agg := newAggregator(analyze.AggregatorOptions{}, 1, nil, nil, 0) require.NoError(t, agg.Close()) require.NoError(t, agg.Close()) @@ -237,7 +237,7 @@ func TestAggregator_Close_Idempotent(t *testing.T) { func TestAggregator_EstimatedStateSize(t *testing.T) { t.Parallel() - agg := newAggregator(analyze.AggregatorOptions{}, 1, nil, nil) + agg := newAggregator(analyze.AggregatorOptions{}, 1, nil, nil, 0) // Initial size accounts for the pre-allocated peopleCommits slice. initialSize := agg.EstimatedStateSize() @@ -336,7 +336,7 @@ func TestMergeFileCouplings(t *testing.T) { func TestAggregator_MultipleCommits(t *testing.T) { t.Parallel() - agg := newAggregator(analyze.AggregatorOptions{}, 2, []string{"alice", "bob"}, nil) + agg := newAggregator(analyze.AggregatorOptions{}, 2, []string{"alice", "bob"}, nil, 0) // Commit 1: alice touches a.go, b.go. 
require.NoError(t, agg.Add(analyze.TC{ @@ -372,7 +372,7 @@ func TestAggregator_MultipleCommits(t *testing.T) { func TestAggregator_Renames(t *testing.T) { t.Parallel() - agg := newAggregator(analyze.AggregatorOptions{}, 1, nil, nil) + agg := newAggregator(analyze.AggregatorOptions{}, 1, nil, nil, 0) require.NoError(t, agg.Add(analyze.TC{ AuthorID: 0, @@ -397,7 +397,7 @@ func TestAggregator_Renames(t *testing.T) { func TestAggregator_Spill_Empty(t *testing.T) { t.Parallel() - agg := newAggregator(analyze.AggregatorOptions{}, 1, nil, nil) + agg := newAggregator(analyze.AggregatorOptions{}, 1, nil, nil, 0) freed, err := agg.Spill() require.NoError(t, err) @@ -672,7 +672,7 @@ func TestEndToEnd_Consume_Aggregator_TicksToReport(t *testing.T) { } // Step 2: Feed through Aggregator (PeopleNumber=0 — no --people-dict). - agg := newAggregator(analyze.AggregatorOptions{}, 0, nil, nil) + agg := newAggregator(analyze.AggregatorOptions{}, 0, nil, nil, 0) defer agg.Close() for _, tc := range commits { diff --git a/internal/analyzers/couples/history.go b/internal/analyzers/couples/history.go index 5958d3e..305d813 100644 --- a/internal/analyzers/couples/history.go +++ b/internal/analyzers/couples/history.go @@ -34,7 +34,16 @@ const ( // ErrInvalidReversedPeopleDict indicates a type assertion failure for reversedPeopleDict. var ErrInvalidReversedPeopleDict = errors.New("expected []string for reversedPeopleDict") -// +// Configuration option keys for the couples analyzer. 
+const ( + ConfigCouplesCouplingThresholdHigh = "Couples.CouplingThresholdHigh" + ConfigCouplesOwnershipFewThreshold = "Couples.OwnershipFewThreshold" + ConfigCouplesOwnershipModerateThreshold = "Couples.OwnershipModerateThreshold" + ConfigCouplesBatchCouplingThreshold = "Couples.BatchCouplingThreshold" + ConfigCouplesHLLPrecision = "Couples.HLLPrecision" + ConfigCouplesTopKPerFile = "Couples.TopKPerFile" + ConfigCouplesMinEdgeWeight = "Couples.MinEdgeWeight" +) // HistoryAnalyzer identifies co-change coupling between files and developers. type HistoryAnalyzer struct { @@ -48,16 +57,24 @@ type HistoryAnalyzer struct { seenFiles *bloom.Filter // TopKPerFile limits the number of file coupling pairs emitted by WriteToStoreFromAggregator. - // Zero uses DefaultTopKPerFile. TopKPerFile int // MinEdgeWeight is the minimum co-change count for an edge to be emitted. - // Zero uses DefaultMinEdgeWeight. MinEdgeWeight int64 + + // Configurable thresholds (zero = use package-level defaults). + cfgCouplingThresholdHigh int + cfgOwnershipFewThreshold int + cfgOwnershipModerateThreshold int + cfgBatchCouplingThreshold int + cfgHLLPrecision int } // NewHistoryAnalyzer creates a new HistoryAnalyzer. 
func NewHistoryAnalyzer() *HistoryAnalyzer { - a := &HistoryAnalyzer{} + a := &HistoryAnalyzer{ + TopKPerFile: DefaultTopKPerFile, + MinEdgeWeight: DefaultMinEdgeWeight, + } a.BaseHistoryAnalyzer = &analyze.BaseHistoryAnalyzer[*ComputedMetrics]{ Desc: analyze.Descriptor{ @@ -74,10 +91,10 @@ func NewHistoryAnalyzer() *HistoryAnalyzer { return &ComputedMetrics{}, nil } - return ComputeAllMetrics(report) + return ComputeAllMetricsWithOptions(report, a.metricOptions()) }, AggregatorFn: func(opts analyze.AggregatorOptions) analyze.Aggregator { - return newAggregator(opts, a.PeopleNumber, a.GetReversedPeopleDict(), a.lastCommit) + return newAggregator(opts, a.PeopleNumber, a.GetReversedPeopleDict(), a.lastCommit, a.cfgBatchCouplingThreshold) }, TicksToReportFn: func(ctx context.Context, ticks []analyze.TICK) analyze.Report { return ticksToReport(ctx, ticks, a.GetReversedPeopleDict(), a.PeopleNumber, a.lastCommit) @@ -146,9 +163,48 @@ func (c *HistoryAnalyzer) Configure(facts map[string]any) error { c.ReversedPeopleDict = rpd } + if val, ok := facts[ConfigCouplesCouplingThresholdHigh].(int); ok { + c.cfgCouplingThresholdHigh = val + } + + if val, ok := facts[ConfigCouplesOwnershipFewThreshold].(int); ok { + c.cfgOwnershipFewThreshold = val + } + + if val, ok := facts[ConfigCouplesOwnershipModerateThreshold].(int); ok { + c.cfgOwnershipModerateThreshold = val + } + + if val, ok := facts[ConfigCouplesBatchCouplingThreshold].(int); ok { + c.cfgBatchCouplingThreshold = val + } + + if val, ok := facts[ConfigCouplesHLLPrecision].(int); ok { + c.cfgHLLPrecision = val + } + + if val, ok := facts[ConfigCouplesTopKPerFile].(int); ok { + c.TopKPerFile = val + } + + if val, ok := facts[ConfigCouplesMinEdgeWeight].(int); ok { + c.MinEdgeWeight = int64(val) + } + return nil } +// metricOptions returns the metric computation options from configured values. 
+func (c *HistoryAnalyzer) metricOptions() MetricOptions { + return MetricOptions{ + CouplingThresholdHigh: c.cfgCouplingThresholdHigh, + OwnershipFewThreshold: c.cfgOwnershipFewThreshold, + OwnershipModerateThreshold: c.cfgOwnershipModerateThreshold, + BatchCouplingThreshold: c.cfgBatchCouplingThreshold, + HLLPrecision: c.cfgHLLPrecision, + } +} + // MapDependencies returns the required plumbing analyzers. func (c *HistoryAnalyzer) MapDependencies() []string { return []string{} @@ -492,7 +548,7 @@ func (c *HistoryAnalyzer) Merge(branches []analyze.HistoryAnalyzer) { // NewAggregator creates a new aggregator for this analyzer. func (c *HistoryAnalyzer) NewAggregator(opts analyze.AggregatorOptions) analyze.Aggregator { - return newAggregator(opts, c.PeopleNumber, c.GetReversedPeopleDict(), c.lastCommit) + return newAggregator(opts, c.PeopleNumber, c.GetReversedPeopleDict(), c.lastCommit, c.cfgBatchCouplingThreshold) } // ExtractCommitTimeSeries implements analyze.CommitTimeSeriesProvider. diff --git a/internal/analyzers/couples/metrics.go b/internal/analyzers/couples/metrics.go index c8ce945..cd0b419 100644 --- a/internal/analyzers/couples/metrics.go +++ b/internal/analyzers/couples/metrics.go @@ -2,6 +2,7 @@ package couples import ( "encoding/binary" + "fmt" "sort" "github.com/Sumatoshi-tech/codefang/internal/analyzers/analyze" @@ -277,8 +278,14 @@ func NewFileOwnershipMetric() *FileOwnershipMetric { // Uses HyperLogLog sketches per file to estimate contributor cardinality // instead of maintaining a map[int]bool per file. This reduces memory from // O(F × D) to O(F × 2^p) where p is the HLL precision. +// Compute calculates file ownership data using default options. 
func (m *FileOwnershipMetric) Compute(input *ReportData) []FileOwnershipData { - fileSketches := buildFileContributorSketches(len(input.Files), input.PeopleFiles) + return m.ComputeWithOptions(input, DefaultMetricOptions()) +} + +// ComputeWithOptions calculates file ownership data with configurable HLL precision. +func (m *FileOwnershipMetric) ComputeWithOptions(input *ReportData, opts MetricOptions) []FileOwnershipData { + fileSketches := buildFileContributorSketchesWithPrecision(len(input.Files), input.PeopleFiles, uint8(opts.HLLPrecision)) result := make([]FileOwnershipData, 0, len(input.Files)) for i, file := range input.Files { @@ -305,9 +312,14 @@ func (m *FileOwnershipMetric) Compute(input *ReportData) []FileOwnershipData { // buildFileContributorSketches creates per-file HLL sketches and populates them // from the people-files mapping. func buildFileContributorSketches(numFiles int, peopleFiles [][]int) []*hll.Sketch { + return buildFileContributorSketchesWithPrecision(numFiles, peopleFiles, fileContribHLLPrecision) +} + +// buildFileContributorSketchesWithPrecision creates per-file HLL sketches with custom precision. 
+func buildFileContributorSketchesWithPrecision(numFiles int, peopleFiles [][]int, precision uint8) []*hll.Sketch { sketches := make([]*hll.Sketch, numFiles) for i := range sketches { - sketch, err := hll.New(fileContribHLLPrecision) + sketch, err := hll.New(precision) if err != nil { continue } @@ -356,10 +368,14 @@ type aggregateAccum struct { } func (a *aggregateAccum) addPair(coChanges, selfI, selfJ int64) { + a.addPairWithThreshold(coChanges, selfI, selfJ, CouplingThresholdHigh) +} + +func (a *aggregateAccum) addPairWithThreshold(coChanges, selfI, selfJ, threshold int64) { a.totalCoChanges += coChanges a.pairCount++ - if coChanges >= CouplingThresholdHigh { + if coChanges >= threshold { a.highlyCoupled++ } @@ -369,13 +385,15 @@ func (a *aggregateAccum) addPair(coChanges, selfI, selfJ int64) { } } -// Compute calculates aggregate statistics. -func (m *AggregateMetric) Compute(input *ReportData) AggregateData { +// ComputeWithOptions calculates aggregate statistics with configurable thresholds. +func (m *AggregateMetric) ComputeWithOptions(input *ReportData, opts MetricOptions) AggregateData { agg := AggregateData{ TotalFiles: len(input.Files), TotalDevelopers: len(input.ReversedPeopleDict), } + threshold := int64(opts.CouplingThresholdHigh) + var acc aggregateAccum for i, row := range input.FilesMatrix { @@ -384,7 +402,7 @@ func (m *AggregateMetric) Compute(input *ReportData) AggregateData { continue } - acc.addPair(coChanges, row[i], input.FilesMatrix[j][j]) + acc.addPairWithThreshold(coChanges, row[i], input.FilesMatrix[j][j], threshold) } } @@ -414,15 +432,20 @@ const ( // BucketOwnership groups file ownership data into contributor count categories. func BucketOwnership(ownership []FileOwnershipData) []OwnershipBucket { + return BucketOwnershipWithThresholds(ownership, ownershipFewThreshold, ownershipModerateThreshold) +} + +// BucketOwnershipWithThresholds groups file ownership data with configurable thresholds. 
+func BucketOwnershipWithThresholds(ownership []FileOwnershipData, fewThreshold, moderateThreshold int) []OwnershipBucket { single, few, moderate, many := 0, 0, 0, 0 for _, fo := range ownership { switch { case fo.Contributors <= 1: single++ - case fo.Contributors <= ownershipFewThreshold: + case fo.Contributors <= fewThreshold: few++ - case fo.Contributors <= ownershipModerateThreshold: + case fo.Contributors <= moderateThreshold: moderate++ default: many++ @@ -431,9 +454,9 @@ func BucketOwnership(ownership []FileOwnershipData) []OwnershipBucket { return []OwnershipBucket{ {Label: "Single owner", Count: single}, - {Label: "2-3 owners", Count: few}, - {Label: "4-5 owners", Count: moderate}, - {Label: "6+ owners", Count: many}, + {Label: fmt.Sprintf("2-%d owners", fewThreshold), Count: few}, + {Label: fmt.Sprintf("%d-%d owners", fewThreshold+1, moderateThreshold), Count: moderate}, + {Label: fmt.Sprintf("%d+ owners", moderateThreshold+1), Count: many}, } } @@ -504,6 +527,25 @@ func FilterTopDevs(matrix []map[int]int64, names []string, limit int) (filtered return newMatrix, newNames } +// MetricOptions holds configurable thresholds for couples metric computation. +type MetricOptions struct { + CouplingThresholdHigh int + OwnershipFewThreshold int + OwnershipModerateThreshold int + BatchCouplingThreshold int + HLLPrecision int +} + +// DefaultMetricOptions returns MetricOptions populated with package-level defaults. +func DefaultMetricOptions() MetricOptions { + return MetricOptions{ + CouplingThresholdHigh: int(CouplingThresholdHigh), + OwnershipFewThreshold: ownershipFewThreshold, + OwnershipModerateThreshold: ownershipModerateThreshold, + HLLPrecision: int(fileContribHLLPrecision), + } +} + // --- Computed Metrics ---. // ComputedMetrics holds all computed metric results for the couples analyzer. @@ -533,6 +575,11 @@ func (m *ComputedMetrics) ToYAML() any { // ComputeAllMetrics runs all couples metrics and returns the results. 
func ComputeAllMetrics(report analyze.Report) (*ComputedMetrics, error) { + return ComputeAllMetricsWithOptions(report, DefaultMetricOptions()) +} + +// ComputeAllMetricsWithOptions runs all couples metrics with configurable thresholds. +func ComputeAllMetricsWithOptions(report analyze.Report, opts MetricOptions) (*ComputedMetrics, error) { input, err := ParseReportData(report) if err != nil { return nil, err @@ -545,10 +592,10 @@ func ComputeAllMetrics(report analyze.Report) (*ComputedMetrics, error) { devCoupling := devMetric.Compute(input) ownerMetric := NewFileOwnershipMetric() - fileOwnership := ownerMetric.Compute(input) + fileOwnership := ownerMetric.ComputeWithOptions(input, opts) aggMetric := NewAggregateMetric() - aggregate := aggMetric.Compute(input) + aggregate := aggMetric.ComputeWithOptions(input, opts) return &ComputedMetrics{ FileCoupling: fileCoupling, diff --git a/internal/analyzers/couples/metrics_test.go b/internal/analyzers/couples/metrics_test.go index 6178a66..a13f53e 100644 --- a/internal/analyzers/couples/metrics_test.go +++ b/internal/analyzers/couples/metrics_test.go @@ -294,7 +294,7 @@ func TestFileOwnershipMetric_Empty(t *testing.T) { m := NewFileOwnershipMetric() input := &ReportData{} - result := m.Compute(input) + result := m.ComputeWithOptions(input, DefaultMetricOptions()) assert.Empty(t, result) } @@ -309,7 +309,7 @@ func TestFileOwnershipMetric_SingleFile(t *testing.T) { PeopleFiles: [][]int{{0}}, // dev0 touched file0. } - result := m.Compute(input) + result := m.ComputeWithOptions(input, DefaultMetricOptions()) require.Len(t, result, 1) assert.Equal(t, testFile1, result[0].File) @@ -331,7 +331,7 @@ func TestFileOwnershipMetric_MultipleContributors(t *testing.T) { }, } - result := m.Compute(input) + result := m.ComputeWithOptions(input, DefaultMetricOptions()) require.Len(t, result, 2) // file0 - touched by dev0 and dev1. 
@@ -353,7 +353,7 @@ func TestFileOwnershipMetric_MissingFilesLines(t *testing.T) { FilesLines: []int{100}, // Only 1 entry for 2 files. } - result := m.Compute(input) + result := m.ComputeWithOptions(input, DefaultMetricOptions()) require.Len(t, result, 2) assert.Equal(t, 100, result[0].Lines) @@ -370,7 +370,7 @@ func TestFileOwnershipMetric_OutOfBoundsFileIndex(t *testing.T) { PeopleFiles: [][]int{{0, 5}}, // Index 5 is out of bounds. } - result := m.Compute(input) + result := m.ComputeWithOptions(input, DefaultMetricOptions()) require.Len(t, result, 1) assert.Equal(t, 1, result[0].Contributors) // Only valid index counted. @@ -395,7 +395,7 @@ func TestCouplesAggregateMetric_Empty(t *testing.T) { m := NewAggregateMetric() input := &ReportData{} - result := m.Compute(input) + result := m.ComputeWithOptions(input, DefaultMetricOptions()) assert.Equal(t, 0, result.TotalFiles) assert.Equal(t, 0, result.TotalDevelopers) @@ -418,7 +418,7 @@ func TestCouplesAggregateMetric_WithData(t *testing.T) { }, } - result := m.Compute(input) + result := m.ComputeWithOptions(input, DefaultMetricOptions()) assert.Equal(t, 3, result.TotalFiles) assert.Equal(t, 2, result.TotalDevelopers) @@ -446,7 +446,7 @@ func TestCouplesAggregateMetric_HighlyCoupledThreshold(t *testing.T) { }, } - result := m.Compute(input) + result := m.ComputeWithOptions(input, DefaultMetricOptions()) // 10 is exactly at threshold CouplingThresholdHigh. 
assert.Equal(t, 1, result.HighlyCoupledPairs) diff --git a/internal/analyzers/couples/store_writer.go b/internal/analyzers/couples/store_writer.go index d971ccc..cbb1147 100644 --- a/internal/analyzers/couples/store_writer.go +++ b/internal/analyzers/couples/store_writer.go @@ -53,7 +53,7 @@ func (c *HistoryAnalyzer) WriteToStoreFromAggregator( return ErrUnexpectedAggregator } - minWeight := c.minEdgeWeight() + minWeight := c.MinEdgeWeight reducedFiles, reducedPeople, err := c.collectAndReduce(ctx, ca, minWeight) if err != nil { @@ -62,7 +62,7 @@ func (c *HistoryAnalyzer) WriteToStoreFromAggregator( filesSequence, filesIndex := buildFilesIndex(reducedFiles) - fcErr := writeFileCoupling(w, reducedFiles, filesSequence, filesIndex, c.topKPerFile(), minWeight) + fcErr := writeFileCoupling(w, reducedFiles, filesSequence, filesIndex, c.TopKPerFile, minWeight) if fcErr != nil { return fmt.Errorf("write file_coupling: %w", fcErr) } @@ -159,24 +159,6 @@ func (c *HistoryAnalyzer) collectUnfiltered( return reducedFiles, reducedPeople, nil } -// topKPerFile returns the configured TopK or the default. -func (c *HistoryAnalyzer) topKPerFile() int { - if c.TopKPerFile > 0 { - return c.TopKPerFile - } - - return DefaultTopKPerFile -} - -// minEdgeWeight returns the configured MinEdgeWeight or the default. -func (c *HistoryAnalyzer) minEdgeWeight() int64 { - if c.MinEdgeWeight > 0 { - return c.MinEdgeWeight - } - - return DefaultMinEdgeWeight -} - // writeFileCoupling computes top-K file coupling pairs from the sparse map // and writes them as individual "file_coupling" records. func writeFileCoupling( diff --git a/internal/analyzers/couples/store_writer_test.go b/internal/analyzers/couples/store_writer_test.go index 694aac8..6f006bf 100644 --- a/internal/analyzers/couples/store_writer_test.go +++ b/internal/analyzers/couples/store_writer_test.go @@ -29,6 +29,7 @@ func buildTestAggregator(tb testing.TB) *Aggregator { testPeopleCount, testNames, nil, // no lastCommit for unit tests. 
+ 0, // use default batch coupling threshold. ) // Commit 1: alice touches a.go + b.go. @@ -210,7 +211,7 @@ func TestWriteToStoreFromAggregator_WrongAggregatorType(t *testing.T) { func TestWriteToStoreFromAggregator_EmptyAggregator(t *testing.T) { t.Parallel() - agg := newAggregator(analyze.AggregatorOptions{}, 0, nil, nil) + agg := newAggregator(analyze.AggregatorOptions{}, 0, nil, nil, 0) defer agg.Close() storeDir := t.TempDir() @@ -458,7 +459,7 @@ func TestComputeSparseAggregate_MatchesDenseAggregate(t *testing.T) { ReversedPeopleDict: names, } aggMetric := NewAggregateMetric() - denseAgg := aggMetric.Compute(input) + denseAgg := aggMetric.ComputeWithOptions(input, DefaultMetricOptions()) assert.Equal(t, denseAgg.TotalFiles, sparseAgg.TotalFiles) assert.Equal(t, denseAgg.TotalDevelopers, sparseAgg.TotalDevelopers) diff --git a/internal/analyzers/devs/analyzer.go b/internal/analyzers/devs/analyzer.go index 19fe274..9fff651 100644 --- a/internal/analyzers/devs/analyzer.go +++ b/internal/analyzers/devs/analyzer.go @@ -43,8 +43,15 @@ type TickDevData struct { // Configuration option keys for the devs analyzer. const ( - ConfigDevsConsiderEmptyCommits = "Devs.ConsiderEmptyCommits" - ConfigDevsAnonymize = "Devs.Anonymize" + ConfigDevsConsiderEmptyCommits = "Devs.ConsiderEmptyCommits" + ConfigDevsAnonymize = "Devs.Anonymize" + ConfigDevsBusFactorThreshold = "Devs.BusFactorThreshold" + ConfigDevsRiskThresholdCritical = "Devs.RiskThresholdCritical" + ConfigDevsRiskThresholdHigh = "Devs.RiskThresholdHigh" + ConfigDevsRiskThresholdMedium = "Devs.RiskThresholdMedium" + ConfigDevsActiveThresholdRatio = "Devs.ActiveThresholdRatio" + ConfigDevsDefaultActiveDays = "Devs.DefaultActiveDays" + ConfigDevsHLLPrecision = "Devs.HLLPrecision" defaultHoursPerDay = 24 ) @@ -63,6 +70,15 @@ type Analyzer struct { tickSize time.Duration ConsiderEmptyCommits bool Anonymize bool + + // Configurable thresholds (zero = use package-level defaults). 
+ cfgBusFactorThreshold float64 + cfgRiskThresholdCritical float64 + cfgRiskThresholdHigh float64 + cfgRiskThresholdMedium float64 + cfgActiveThresholdRatio float64 + cfgDefaultActiveDays int + cfgHLLPrecision int } // NewAnalyzer creates a new devs analyzer. @@ -93,8 +109,11 @@ func NewAnalyzer() *Analyzer { Default: false, }, }, - ComputeMetricsFn: analyze.SafeMetricComputer(ComputeAllMetrics, &ComputedMetrics{}), - AggregatorFn: newAggregator, + ComputeMetricsFn: analyze.SafeMetricComputer( + func(report analyze.Report) (*ComputedMetrics, error) { + return ComputeAllMetricsWithOptions(report, a.metricOptions()) + }, &ComputedMetrics{}), + AggregatorFn: newAggregator, } a.TicksToReportFn = func(ctx context.Context, ticks []analyze.TICK) analyze.Report { @@ -119,6 +138,34 @@ func (a *Analyzer) Configure(facts map[string]any) error { a.Anonymize = val } + if val, ok := facts[ConfigDevsBusFactorThreshold].(float64); ok { + a.cfgBusFactorThreshold = val + } + + if val, ok := facts[ConfigDevsRiskThresholdCritical].(float64); ok { + a.cfgRiskThresholdCritical = val + } + + if val, ok := facts[ConfigDevsRiskThresholdHigh].(float64); ok { + a.cfgRiskThresholdHigh = val + } + + if val, ok := facts[ConfigDevsRiskThresholdMedium].(float64); ok { + a.cfgRiskThresholdMedium = val + } + + if val, ok := facts[ConfigDevsActiveThresholdRatio].(float64); ok { + a.cfgActiveThresholdRatio = val + } + + if val, ok := facts[ConfigDevsDefaultActiveDays].(int); ok { + a.cfgDefaultActiveDays = val + } + + if val, ok := facts[ConfigDevsHLLPrecision].(int); ok { + a.cfgHLLPrecision = val + } + if val, ok := pkgplumbing.GetReversedPeopleDict(facts); ok { a.ReversedPeopleDict = val } @@ -134,6 +181,19 @@ func (a *Analyzer) Configure(facts map[string]any) error { return nil } +// metricOptions returns the metric computation options from configured values. 
+func (a *Analyzer) metricOptions() MetricOptions { + return MetricOptions{ + BusFactorThreshold: a.cfgBusFactorThreshold, + RiskThresholdCritical: a.cfgRiskThresholdCritical, + RiskThresholdHigh: a.cfgRiskThresholdHigh, + RiskThresholdMedium: a.cfgRiskThresholdMedium, + ActiveThresholdRatio: a.cfgActiveThresholdRatio, + DefaultActiveDays: a.cfgDefaultActiveDays, + HLLPrecision: a.cfgHLLPrecision, + } +} + // Initialize prepares the analyzer for processing commits. func (a *Analyzer) Initialize(_ *gitlib.Repository) error { RegisterDevPlotSections() diff --git a/internal/analyzers/devs/hll_test.go b/internal/analyzers/devs/hll_test.go index c9ee084..20737d8 100644 --- a/internal/analyzers/devs/hll_test.go +++ b/internal/analyzers/devs/hll_test.go @@ -90,7 +90,7 @@ func TestParseTickData_PopulatesDevSketch(t *testing.T) { "TickSize": testTickSize, } - data, err := ParseTickData(report) + data, err := ParseTickDataWithPrecision(report, hllPrecision) require.NoError(t, err) require.NotNil(t, data) @@ -109,7 +109,7 @@ func TestParseTickData_EmptyTicks_NilSketch(t *testing.T) { "TickSize": testTickSize, } - data, err := ParseTickData(report) + data, err := ParseTickDataWithPrecision(report, hllPrecision) require.NoError(t, err) require.NotNil(t, data) diff --git a/internal/analyzers/devs/metrics.go b/internal/analyzers/devs/metrics.go index cfc3eef..0e48062 100644 --- a/internal/analyzers/devs/metrics.go +++ b/internal/analyzers/devs/metrics.go @@ -100,8 +100,8 @@ func aggregateDevTickFromCommits(hashes []gitlib.Hash, commitDevData map[string] return devTicks } -// ParseTickData extracts TickData from an analyzer report. -func ParseTickData(report analyze.Report) (*TickData, error) { +// ParseTickDataWithPrecision extracts TickData from an analyzer report using a custom HLL precision. 
+func ParseTickDataWithPrecision(report analyze.Report, precision int) (*TickData, error) { names, err := parseReversedPeopleDict(report) if err != nil { return nil, err @@ -127,19 +127,18 @@ func ParseTickData(report analyze.Report) (*TickData, error) { TickSize: tickSize, } - td.DevSketch = buildDevSketch(ticks) + td.DevSketch = buildDevSketchWithPrecision(ticks, precision) return td, nil } -// buildDevSketch creates an HLL sketch from all unique developer IDs across ticks. -// Returns nil if no ticks contain developer data. -func buildDevSketch(ticks map[int]map[int]*DevTick) *hll.Sketch { +// buildDevSketchWithPrecision creates an HLL sketch with a custom precision from all unique developer IDs across ticks. +func buildDevSketchWithPrecision(ticks map[int]map[int]*DevTick, precision int) *hll.Sketch { if len(ticks) == 0 { return nil } - sketch, err := hll.New(hllPrecision) + sketch, err := hll.New(uint8(precision)) if err != nil { return nil } @@ -568,8 +567,18 @@ const busFactorThreshold = 0.5 // Compute calculates bus factor risk from language data. // Contributors map values represent total contribution (Added+Removed). func (m *BusFactorMetric) Compute(input BusFactorInput) []BusFactorData { + return m.ComputeWithOptions(input, DefaultMetricOptions()) +} + +// ComputeWithOptions calculates bus factor risk with configurable thresholds. 
+func (m *BusFactorMetric) ComputeWithOptions(input BusFactorInput, opts MetricOptions) []BusFactorData { result := make([]BusFactorData, 0, len(input.Languages)) + bfThreshold := opts.BusFactorThreshold + critThreshold := opts.RiskThresholdCritical + highThreshold := opts.RiskThresholdHigh + medThreshold := opts.RiskThresholdMedium + for _, ld := range input.Languages { if ld.TotalContribution == 0 { continue @@ -598,7 +607,7 @@ func (m *BusFactorMetric) Compute(input BusFactorInput) []BusFactorData { bf := BusFactorData{ Language: ld.Name, TotalContributors: len(contribs), - BusFactor: computeBusFactorFromSorted(sortedAmounts, ld.TotalContribution), + BusFactor: computeBusFactorFromSortedWithThreshold(sortedAmounts, ld.TotalContribution, bfThreshold), } if len(contribs) > 0 { @@ -614,11 +623,11 @@ func (m *BusFactorMetric) Compute(input BusFactorInput) []BusFactorData { } switch { - case bf.PrimaryPct >= ThresholdCritical: + case bf.PrimaryPct >= critThreshold: bf.RiskLevel = string(metrics.RiskCritical) - case bf.PrimaryPct >= ThresholdHigh: + case bf.PrimaryPct >= highThreshold: bf.RiskLevel = string(metrics.RiskHigh) - case bf.PrimaryPct >= ThresholdMedium: + case bf.PrimaryPct >= medThreshold: bf.RiskLevel = string(metrics.RiskMedium) default: bf.RiskLevel = string(metrics.RiskLow) @@ -637,22 +646,21 @@ func (m *BusFactorMetric) Compute(input BusFactorInput) []BusFactorData { return result } -// computeBusFactorFromSorted returns the smallest number of contributors -// who together account for at least 50% of total contributions. -// This follows the CHAOSS Contributor Absence Factor methodology. +// computeBusFactorFromSortedWithThreshold returns the smallest number of contributors +// who together account for at least the given threshold fraction of total contributions. // sortedContribs must be sorted descending by contribution amount. 
-func computeBusFactorFromSorted(sortedContribs []int, total int) int { +func computeBusFactorFromSortedWithThreshold(sortedContribs []int, total int, threshold float64) int { if total == 0 || len(sortedContribs) == 0 { return 0 } - threshold := float64(total) * busFactorThreshold + target := float64(total) * threshold cumulative := 0 for i, amount := range sortedContribs { cumulative += amount - if float64(cumulative) >= threshold { + if float64(cumulative) >= target { return i + 1 } } @@ -777,12 +785,18 @@ const DefaultActiveDays = 90 // Compute calculates aggregate statistics. func (m *AggregateMetric) Compute(input AggregateInput) AggregateData { + return m.ComputeWithOptions(input, DefaultMetricOptions()) +} + +// ComputeWithOptions calculates aggregate statistics with configurable thresholds. +func (m *AggregateMetric) ComputeWithOptions(input AggregateInput, opts MetricOptions) AggregateData { agg := AggregateData{ TotalDevelopers: len(input.Developers), TotalLanguages: len(input.Languages), } - totalSketch := buildTotalDevSketch(input.Developers) + precision := opts.HLLPrecision + totalSketch := buildTotalDevSketchWithPrecision(input.Developers, precision) for _, d := range input.Developers { agg.TotalCommits += d.Commits @@ -799,9 +813,9 @@ func (m *AggregateMetric) Compute(input AggregateInput) AggregateData { maxTick := tickKeys[len(tickKeys)-1] agg.AnalysisPeriodTicks = maxTick - recentThreshold := computeActiveThreshold(maxTick, input.TickSize) + recentThreshold := computeActiveThresholdWithOptions(maxTick, input.TickSize, opts) activeDevs := make(map[int]bool) - activeSketch := buildActiveDevSketch(input.Ticks, recentThreshold) + activeSketch := buildActiveDevSketchWithPrecision(input.Ticks, recentThreshold, precision) for tick, devTicks := range input.Ticks { if tick >= recentThreshold { @@ -818,18 +832,18 @@ func (m *AggregateMetric) Compute(input AggregateInput) AggregateData { } } - agg.ProjectBusFactor = computeProjectBusFactor(input.Developers) 
+ agg.ProjectBusFactor = computeProjectBusFactorWithThreshold(input.Developers, opts.BusFactorThreshold) return agg } -// buildTotalDevSketch creates an HLL sketch from all developer IDs in the input. -func buildTotalDevSketch(developers []DeveloperData) *hll.Sketch { +// buildTotalDevSketchWithPrecision creates an HLL sketch with custom precision from all developer IDs. +func buildTotalDevSketchWithPrecision(developers []DeveloperData, precision int) *hll.Sketch { if len(developers) == 0 { return nil } - sketch, err := hll.New(hllPrecision) + sketch, err := hll.New(uint8(precision)) if err != nil { return nil } @@ -841,9 +855,9 @@ func buildTotalDevSketch(developers []DeveloperData) *hll.Sketch { return sketch } -// buildActiveDevSketch creates an HLL sketch from developer IDs in ticks at or above the threshold. -func buildActiveDevSketch(ticks map[int]map[int]*DevTick, threshold int) *hll.Sketch { - sketch, err := hll.New(hllPrecision) +// buildActiveDevSketchWithPrecision creates an HLL sketch with custom precision from active developer IDs. +func buildActiveDevSketchWithPrecision(ticks map[int]map[int]*DevTick, threshold, precision int) *hll.Sketch { + sketch, err := hll.New(uint8(precision)) if err != nil { return nil } @@ -859,12 +873,11 @@ func buildActiveDevSketch(ticks map[int]map[int]*DevTick, threshold int) *hll.Sk return sketch } -// computeActiveThreshold returns the tick index threshold for "active" developers. -// When TickSize is known, uses time-based calculation (last 90 days). -// Otherwise falls back to ratio-based (last 30% of analysis period). -func computeActiveThreshold(maxTick int, tickSize time.Duration) int { +// computeActiveThresholdWithOptions returns the active threshold using configurable parameters. 
+func computeActiveThresholdWithOptions(maxTick int, tickSize time.Duration, opts MetricOptions) int { if tickSize > 0 { - activeDuration := time.Duration(DefaultActiveDays) * defaultTickHours * time.Hour + activeDays := opts.DefaultActiveDays + activeDuration := time.Duration(activeDays) * defaultTickHours * time.Hour ticksForActive := int(activeDuration / tickSize) threshold := maxTick - ticksForActive @@ -875,13 +888,11 @@ func computeActiveThreshold(maxTick int, tickSize time.Duration) int { return threshold } - return int(float64(maxTick) * ActiveThresholdRatio) + return int(float64(maxTick) * opts.ActiveThresholdRatio) } -// computeProjectBusFactor computes the CHAOSS Contributor Absence Factor -// across the entire project: the smallest number of developers responsible -// for 50% of all contributions (Added+Removed). -func computeProjectBusFactor(developers []DeveloperData) int { +// computeProjectBusFactorWithThreshold computes project bus factor with a configurable threshold. +func computeProjectBusFactorWithThreshold(developers []DeveloperData, threshold float64) int { if len(developers) == 0 { return 0 } @@ -908,7 +919,7 @@ func computeProjectBusFactor(developers []DeveloperData) int { sortedAmounts[i] = c.contribution } - return computeBusFactorFromSorted(sortedAmounts, total) + return computeBusFactorFromSortedWithThreshold(sortedAmounts, total, threshold) } // ComputedMetrics holds all computed metric results for the devs analyzer. @@ -925,9 +936,38 @@ type ComputedMetrics struct { metricNames []string `json:"-" yaml:"-"` } +// MetricOptions holds configurable thresholds for devs metric computation. +type MetricOptions struct { + BusFactorThreshold float64 + RiskThresholdCritical float64 + RiskThresholdHigh float64 + RiskThresholdMedium float64 + ActiveThresholdRatio float64 + DefaultActiveDays int + HLLPrecision int +} + +// DefaultMetricOptions returns MetricOptions populated with package-level defaults. 
+func DefaultMetricOptions() MetricOptions { + return MetricOptions{ + BusFactorThreshold: busFactorThreshold, + RiskThresholdCritical: ThresholdCritical, + RiskThresholdHigh: ThresholdHigh, + RiskThresholdMedium: ThresholdMedium, + ActiveThresholdRatio: ActiveThresholdRatio, + DefaultActiveDays: DefaultActiveDays, + HLLPrecision: hllPrecision, + } +} + // ComputeAllMetrics runs all devs metrics and returns the results. func ComputeAllMetrics(report analyze.Report) (*ComputedMetrics, error) { - input, err := ParseTickData(report) + return ComputeAllMetricsWithOptions(report, DefaultMetricOptions()) +} + +// ComputeAllMetricsWithOptions runs all devs metrics with configurable thresholds. +func ComputeAllMetricsWithOptions(report analyze.Report, opts MetricOptions) (*ComputedMetrics, error) { + input, err := ParseTickDataWithPrecision(report, opts.HLLPrecision) if err != nil { return nil, err } @@ -940,7 +980,7 @@ func ComputeAllMetrics(report analyze.Report) (*ComputedMetrics, error) { languages := langMetric.Compute(developers) busMetric := NewBusFactorMetric() - busFactor := busMetric.Compute(BusFactorInput{Languages: languages, Names: input.Names}) + busFactor := busMetric.ComputeWithOptions(BusFactorInput{Languages: languages, Names: input.Names}, opts) actMetric := NewActivityMetric() activity := actMetric.Compute(input) @@ -949,12 +989,12 @@ func ComputeAllMetrics(report analyze.Report) (*ComputedMetrics, error) { churn := churnMetric.Compute(input) aggMetric := NewAggregateMetric() - aggregate := aggMetric.Compute(AggregateInput{ + aggregate := aggMetric.ComputeWithOptions(AggregateInput{ Developers: developers, Languages: languages, Ticks: input.Ticks, TickSize: input.TickSize, - }) + }, opts) return &ComputedMetrics{ Ticks: input.Ticks, diff --git a/internal/analyzers/devs/metrics_test.go b/internal/analyzers/devs/metrics_test.go index 1dd842a..4d18301 100644 --- a/internal/analyzers/devs/metrics_test.go +++ b/internal/analyzers/devs/metrics_test.go @@ 
-50,7 +50,7 @@ func TestParseTickData_Valid(t *testing.T) { "TickSize": testTickSize, } - data, err := ParseTickData(report) + data, err := ParseTickDataWithPrecision(report, hllPrecision) require.NoError(t, err) require.NotNil(t, data) @@ -69,7 +69,7 @@ func TestParseTickData_EmptyCanonical(t *testing.T) { "TickSize": testTickSize, } - data, err := ParseTickData(report) + data, err := ParseTickDataWithPrecision(report, hllPrecision) require.NoError(t, err) require.NotNil(t, data) @@ -86,7 +86,7 @@ func TestParseTickData_MissingPeopleDict(t *testing.T) { "TickSize": testTickSize, } - data, err := ParseTickData(report) + data, err := ParseTickDataWithPrecision(report, hllPrecision) require.Error(t, err) assert.Equal(t, ErrInvalidPeopleDict, err) @@ -104,7 +104,7 @@ func TestParseTickData_MissingTickSize_DefaultsTo24Hours(t *testing.T) { "ReversedPeopleDict": names, } - data, err := ParseTickData(report) + data, err := ParseTickDataWithPrecision(report, hllPrecision) require.NoError(t, err) require.NotNil(t, data) diff --git a/internal/analyzers/file_history/history.go b/internal/analyzers/file_history/history.go index 7cad429..dc07200 100644 --- a/internal/analyzers/file_history/history.go +++ b/internal/analyzers/file_history/history.go @@ -14,6 +14,13 @@ import ( "github.com/Sumatoshi-tech/codefang/pkg/pipeline" ) +// Configuration option keys for the file history analyzer. +const ( + ConfigFileHistoryHotspotThresholdCritical = "FileHistory.HotspotThresholdCritical" + ConfigFileHistoryHotspotThresholdHigh = "FileHistory.HotspotThresholdHigh" + ConfigFileHistoryHotspotThresholdMedium = "FileHistory.HotspotThresholdMedium" +) + // HistoryAnalyzer tracks file-level change history across commits. type HistoryAnalyzer struct { *analyze.BaseHistoryAnalyzer[*ComputedMetrics] @@ -31,6 +38,11 @@ type HistoryAnalyzer struct { repo *gitlib.Repository merges *analyze.MergeTracker classifier *Classifier + + // Configurable thresholds (zero = use package-level defaults). 
+ cfgHotspotCritical int + cfgHotspotHigh int + cfgHotspotMedium int } // FileHistory holds the change history for a single file. @@ -53,7 +65,9 @@ func NewAnalyzer() *HistoryAnalyzer { ha.BaseHistoryAnalyzer = &analyze.BaseHistoryAnalyzer[*ComputedMetrics]{ EstimatedStateSize: workingStateSize, EstimatedTCSize: avgTCSize, - ComputeMetricsFn: ComputeAllMetrics, + ComputeMetricsFn: func(report analyze.Report) (*ComputedMetrics, error) { + return ComputeAllMetricsWithOptions(report, ha.metricOptions()) + }, TicksToReportFn: func(ctx context.Context, t []analyze.TICK) analyze.Report { return TicksToReport(ctx, t, ha.repo) }, @@ -102,10 +116,31 @@ func (h *HistoryAnalyzer) ListConfigurationOptions() []pipeline.ConfigurationOpt } // Configure sets up the analyzer with the provided facts. -func (h *HistoryAnalyzer) Configure(_ map[string]any) error { +func (h *HistoryAnalyzer) Configure(facts map[string]any) error { + if val, ok := facts[ConfigFileHistoryHotspotThresholdCritical].(int); ok { + h.cfgHotspotCritical = val + } + + if val, ok := facts[ConfigFileHistoryHotspotThresholdHigh].(int); ok { + h.cfgHotspotHigh = val + } + + if val, ok := facts[ConfigFileHistoryHotspotThresholdMedium].(int); ok { + h.cfgHotspotMedium = val + } + return nil } +// metricOptions returns the metric computation options from configured values. +func (h *HistoryAnalyzer) metricOptions() MetricOptions { + return MetricOptions{ + HotspotThresholdCritical: h.cfgHotspotCritical, + HotspotThresholdHigh: h.cfgHotspotHigh, + HotspotThresholdMedium: h.cfgHotspotMedium, + } +} + // Initialize prepares the analyzer for processing commits. 
func (h *HistoryAnalyzer) Initialize(repo *gitlib.Repository) error { h.files = map[string]*FileHistory{} diff --git a/internal/analyzers/file_history/metrics.go b/internal/analyzers/file_history/metrics.go index 32527a8..2e1b6e9 100644 --- a/internal/analyzers/file_history/metrics.go +++ b/internal/analyzers/file_history/metrics.go @@ -117,8 +117,29 @@ func (m *ComputedMetrics) ToYAML() any { return m } +// MetricOptions holds configurable thresholds for file history metrics. +type MetricOptions struct { + HotspotThresholdCritical int + HotspotThresholdHigh int + HotspotThresholdMedium int +} + +// DefaultMetricOptions returns MetricOptions populated with package-level defaults. +func DefaultMetricOptions() MetricOptions { + return MetricOptions{ + HotspotThresholdCritical: HotspotThresholdCritical, + HotspotThresholdHigh: HotspotThresholdHigh, + HotspotThresholdMedium: HotspotThresholdMedium, + } +} + // ComputeAllMetrics runs all file history metrics and returns the results. func ComputeAllMetrics(report analyze.Report) (*ComputedMetrics, error) { + return ComputeAllMetricsWithOptions(report, DefaultMetricOptions()) +} + +// ComputeAllMetricsWithOptions runs all file history metrics with configurable thresholds. 
+func ComputeAllMetricsWithOptions(report analyze.Report, opts MetricOptions) (*ComputedMetrics, error) { input, err := ParseReportData(report) if err != nil { return nil, err @@ -134,8 +155,8 @@ func ComputeAllMetrics(report analyze.Report) (*ComputedMetrics, error) { return &ComputedMetrics{ FileChurn: computeFileChurn(input), FileContributors: computeFileContributors(input), - Hotspots: computeHotspots(input), - Aggregate: computeAggregate(input), + Hotspots: computeHotspotsWithOptions(input, opts), + Aggregate: computeAggregateWithOptions(input, opts), Composition: composition, CompositionTS: compositionTS, }, nil @@ -202,7 +223,11 @@ func computeFileContributors(input *ReportData) []FileContributorData { return result } -func computeHotspots(input *ReportData) []HotspotData { +func computeHotspotsWithOptions(input *ReportData, opts MetricOptions) []HotspotData { + critical := opts.HotspotThresholdCritical + high := opts.HotspotThresholdHigh + medium := opts.HotspotThresholdMedium + result := make([]HotspotData, 0, len(input.Files)) for path, fh := range input.Files { @@ -220,11 +245,11 @@ func computeHotspots(input *ReportData) []HotspotData { var riskLevel string switch { - case commitCount >= HotspotThresholdCritical: + case commitCount >= critical: riskLevel = string(metrics.RiskCritical) - case commitCount >= HotspotThresholdHigh: + case commitCount >= high: riskLevel = string(metrics.RiskHigh) - case commitCount >= HotspotThresholdMedium: + case commitCount >= medium: riskLevel = string(metrics.RiskMedium) default: continue // Skip low-risk files. 
@@ -312,7 +337,7 @@ func computeComposition(tickComp map[int]*CategoryCounts) (CompositionData, []Co const percentMultiplier = 100.0 -func computeAggregate(input *ReportData) AggregateData { +func computeAggregateWithOptions(input *ReportData, opts MetricOptions) AggregateData { agg := AggregateData{ TotalFiles: len(input.Files), } @@ -321,6 +346,7 @@ func computeAggregate(input *ReportData) AggregateData { return agg } + medium := opts.HotspotThresholdMedium allContributors := make(map[int]bool) var totalCommits, highChurnCount int @@ -332,7 +358,7 @@ func computeAggregate(input *ReportData) AggregateData { allContributors[devID] = true } - if len(fh.Hashes) >= HotspotThresholdMedium { + if len(fh.Hashes) >= medium { highChurnCount++ } } diff --git a/internal/analyzers/file_history/metrics_test.go b/internal/analyzers/file_history/metrics_test.go index 174d0b5..7379142 100644 --- a/internal/analyzers/file_history/metrics_test.go +++ b/internal/analyzers/file_history/metrics_test.go @@ -223,7 +223,7 @@ func TestFileContributorMetric_NoContributors(t *testing.T) { // --- HotspotMetric Tests ---. // func TestHotspotMetric_Metadata(_ *testing.T) { -// result := computeHotspots(nil) +// result := computeHotspotsWithOptions(nil, DefaultMetricOptions()) // assert.Equal(t, "Hotspots", result.Name) // assert.Equal(t, "Identifies high-risk files based on commit frequency", result.Description) // }. 
@@ -233,7 +233,7 @@ func TestHotspotMetric_Empty(t *testing.T) { input := &ReportData{Files: make(map[string]FileHistory)} - result := computeHotspots(input) + result := computeHotspotsWithOptions(input, DefaultMetricOptions()) assert.Empty(t, result) } @@ -250,7 +250,7 @@ func TestHotspotMetric_BelowThreshold(t *testing.T) { }, } - result := computeHotspots(input) + result := computeHotspotsWithOptions(input, DefaultMetricOptions()) assert.Empty(t, result) } @@ -284,7 +284,7 @@ func TestHotspotMetric_RiskLevels(t *testing.T) { }, } - result := computeHotspots(input) + result := computeHotspotsWithOptions(input, DefaultMetricOptions()) require.Len(t, result, 1) assert.Equal(t, tt.expected, result[0].RiskLevel) @@ -313,7 +313,7 @@ func TestHotspotMetric_SortedByRiskThenCommitCount(t *testing.T) { }, } - result := computeHotspots(input) + result := computeHotspotsWithOptions(input, DefaultMetricOptions()) require.Len(t, result, 3) // Sorted by risk first (critical > high > medium). @@ -325,7 +325,7 @@ func TestHotspotMetric_SortedByRiskThenCommitCount(t *testing.T) { // --- FileHistoryAggregateMetric Tests ---. // func TestAggregateMetric_Metadata(_ *testing.T) { -// result := computeAggregate(nil) +// result := computeAggregateWithOptions(nil, DefaultMetricOptions()) // assert.Equal(t, "File History Summary", result.Name) // assert.Equal(t, "Aggregates overall file history statistics", result.Description) // }. 
@@ -335,7 +335,7 @@ func TestFileHistoryAggregateMetric_Empty(t *testing.T) { input := &ReportData{Files: make(map[string]FileHistory)} - result := computeAggregate(input) + result := computeAggregateWithOptions(input, DefaultMetricOptions()) assert.Equal(t, 0, result.TotalFiles) assert.Equal(t, 0, result.TotalCommits) @@ -367,7 +367,7 @@ func TestFileHistoryAggregateMetric_WithData(t *testing.T) { }, } - result := computeAggregate(input) + result := computeAggregateWithOptions(input, DefaultMetricOptions()) assert.Equal(t, 2, result.TotalFiles) assert.Equal(t, 30, result.TotalCommits) // 20 + 10 diff --git a/internal/analyzers/imports/analyzer.go b/internal/analyzers/imports/analyzer.go index 0db5427..f52de87 100644 --- a/internal/analyzers/imports/analyzer.go +++ b/internal/analyzers/imports/analyzer.go @@ -23,8 +23,13 @@ const ( magic2_2 = 2 ) +// ConfigImportsMaxDependencyRiskRows is the configuration key for max dependency risk rows. +const ConfigImportsMaxDependencyRiskRows = "Imports.MaxDependencyRiskRows" + // Analyzer analyzes import statements in source code. type Analyzer struct { + // cfgMaxDependencyRiskRows overrides the default maxDependencyRiskRows. Zero = use default. + cfgMaxDependencyRiskRows int } // NewAnalyzer creates a new Analyzer. @@ -62,7 +67,11 @@ func (a *Analyzer) ListConfigurationOptions() []pipeline.ConfigurationOption { } // Configure sets up the analyzer with the provided facts. 
-func (a *Analyzer) Configure(_ map[string]any) error { +func (a *Analyzer) Configure(facts map[string]any) error { + if val, ok := facts[ConfigImportsMaxDependencyRiskRows].(int); ok { + a.cfgMaxDependencyRiskRows = val + } + return nil } diff --git a/internal/analyzers/imports/static_plot.go b/internal/analyzers/imports/static_plot.go index 1dd99d4..ee431e9 100644 --- a/internal/analyzers/imports/static_plot.go +++ b/internal/analyzers/imports/static_plot.go @@ -40,6 +40,11 @@ func (a *Analyzer) generateStaticSections(report analyze.Report) []plotpage.Sect metrics = &ComputedMetrics{} } + riskRowLimit := maxDependencyRiskRows + if a.cfgMaxDependencyRiskRows > 0 { + riskRowLimit = a.cfgMaxDependencyRiskRows + } + return []plotpage.Section{ { Title: "Top Imports Usage", @@ -70,7 +75,7 @@ func (a *Analyzer) generateStaticSections(report analyze.Report) []plotpage.Sect { Title: "Dependency Risk Overview", Subtitle: "Potentially risky import patterns extracted from static metrics.", - Chart: buildDependencyRiskTable(metrics), + Chart: buildDependencyRiskTableWithLimit(metrics, riskRowLimit), Hint: plotpage.Hint{ Title: "How to interpret:", Items: []string{ @@ -169,7 +174,7 @@ func createEmptyImportCategoriesPie() *charts.Pie { const maxDependencyRiskRows = 30 -func buildDependencyRiskTable(metrics *ComputedMetrics) *plotpage.Table { +func buildDependencyRiskTableWithLimit(metrics *ComputedMetrics, rowLimit int) *plotpage.Table { table := plotpage.NewTable([]string{"Import", "Risk", "Reason"}) if len(metrics.Dependencies) == 0 { @@ -188,17 +193,17 @@ func buildDependencyRiskTable(metrics *ComputedMetrics) *plotpage.Table { return deps[i].Path < deps[j].Path }) - limit := min(len(deps), maxDependencyRiskRows) + limit := min(len(deps), rowLimit) for _, dep := range deps[:limit] { table.AddRow(dep.Path, dep.RiskLevel, dep.Reason) } - if len(deps) > maxDependencyRiskRows { + if len(deps) > rowLimit { table.AddRow( - fmt.Sprintf("... 
and %d more", len(deps)-maxDependencyRiskRows), + fmt.Sprintf("... and %d more", len(deps)-rowLimit), "INFO", - fmt.Sprintf("Showing top %d of %d total risks", maxDependencyRiskRows, len(deps)), + fmt.Sprintf("Showing top %d of %d total risks", rowLimit, len(deps)), ) } diff --git a/internal/analyzers/plumbing/uast.go b/internal/analyzers/plumbing/uast.go index 3f37d38..f7f7155 100644 --- a/internal/analyzers/plumbing/uast.go +++ b/internal/analyzers/plumbing/uast.go @@ -22,14 +22,16 @@ import ( // UASTChangesAnalyzer extracts UAST-level changes between commits. // It uses lazy parsing - changes are only parsed when Changes() is called. type UASTChangesAnalyzer struct { - TreeDiff *TreeDiffAnalyzer - BlobCache *BlobCacheAnalyzer - Goroutines int - parser *uast.Parser - pathFilter *pathfilter.Filter - changes []uast.Change - parsed bool // tracks whether parsing was done for current commit. - spillPath string // path to spill file from current commit (for cleanup on next Consume). + TreeDiff *TreeDiffAnalyzer + BlobCache *BlobCacheAnalyzer + Goroutines int + MaxBlobSize int // Maximum blob size for parsing. Zero uses default. + ParseTimeout time.Duration // Timeout per file. Zero uses default. + parser *uast.Parser + pathFilter *pathfilter.Filter + changes []uast.Change + parsed bool // tracks whether parsing was done for current commit. + spillPath string // path to spill file from current commit (for cleanup on next Consume). 
} const ( @@ -317,7 +319,12 @@ func (c *UASTChangesAnalyzer) parseBlob( return nil } - if len(blob.Data) > maxUASTBlobSize { + blobLimit := c.MaxBlobSize + if blobLimit <= 0 { + blobLimit = maxUASTBlobSize + } + + if len(blob.Data) > blobLimit { return nil } @@ -326,7 +333,12 @@ func (c *UASTChangesAnalyzer) parseBlob( return nil } - parseCtx, cancel := context.WithTimeout(ctx, uastParseTimeout) + timeout := c.ParseTimeout + if timeout <= 0 { + timeout = uastParseTimeout + } + + parseCtx, cancel := context.WithTimeout(ctx, timeout) defer cancel() parsed, err := c.parser.Parse(parseCtx, filename, blob.Data) diff --git a/internal/analyzers/sentiment/analyzer.go b/internal/analyzers/sentiment/analyzer.go index 68016fb..2741aea 100644 --- a/internal/analyzers/sentiment/analyzer.go +++ b/internal/analyzers/sentiment/analyzer.go @@ -47,6 +47,18 @@ const ( ConfigCommentSentimentMinLength = "CommentSentiment.MinLength" // ConfigCommentSentimentGap is the configuration key for the sentiment gap threshold. ConfigCommentSentimentGap = "CommentSentiment.Gap" + // ConfigCommentSentimentNeutralizerWeight is the configuration key for the SE domain neutralizer weight. + ConfigCommentSentimentNeutralizerWeight = "CommentSentiment.NeutralizerWeight" + // ConfigCommentSentimentMaxWeightRatio is the configuration key for the max comment weight ratio. + ConfigCommentSentimentMaxWeightRatio = "CommentSentiment.MaxWeightRatio" + // ConfigCommentSentimentPositiveThreshold is the config key for the positive sentiment classification threshold. + ConfigCommentSentimentPositiveThreshold = "CommentSentiment.PositiveThreshold" + // ConfigCommentSentimentNegativeThreshold is the config key for the negative sentiment classification threshold. + ConfigCommentSentimentNegativeThreshold = "CommentSentiment.NegativeThreshold" + // ConfigCommentSentimentTrendThreshold is the config key for the trend direction threshold. 
+ ConfigCommentSentimentTrendThreshold = "CommentSentiment.TrendThreshold" + // ConfigCommentSentimentLowRiskThreshold is the config key for the low sentiment risk threshold. + ConfigCommentSentimentLowRiskThreshold = "CommentSentiment.LowSentimentRiskThreshold" // DefaultCommentSentimentCommentMinLength is the default minimum comment length for sentiment analysis. DefaultCommentSentimentCommentMinLength = 20 @@ -79,11 +91,26 @@ type Analyzer struct { commitsByTick map[int][]gitlib.Hash MinCommentLength int Gap float32 + + // Configurable scoring parameters (zero = use package-level defaults). + cfgNeutralizerWeight float64 + cfgMaxWeightRatio float64 + + // Configurable metrics thresholds. + cfgPositiveThreshold float64 + cfgNegativeThreshold float64 + cfgTrendThreshold float64 + cfgLowSentimentRiskThresh float64 } // NewAnalyzer creates a new sentiment analyzer. func NewAnalyzer() *Analyzer { - a := &Analyzer{} + a := &Analyzer{ + cfgPositiveThreshold: SentimentPositiveThreshold, + cfgNegativeThreshold: SentimentNegativeThreshold, + cfgTrendThreshold: trendThreshold, + cfgLowSentimentRiskThresh: lowSentimentRiskThreshold, + } a.BaseHistoryAnalyzer = &analyze.BaseHistoryAnalyzer[*ComputedMetrics]{ Desc: analyze.Descriptor{ ID: "history/sentiment", @@ -108,8 +135,10 @@ func NewAnalyzer() *Analyzer { Default: DefaultCommentSentimentGap, }, }, - ComputeMetricsFn: analyze.SafeMetricComputer(ComputeAllMetrics, &ComputedMetrics{}), - AggregatorFn: newAggregator, + ComputeMetricsFn: func(report analyze.Report) (*ComputedMetrics, error) { + return ComputeAllMetricsWithOptions(report, a.metricOptions()) + }, + AggregatorFn: newAggregator, } a.TicksToReportFn = func(ctx context.Context, ticks []analyze.TICK) analyze.Report { @@ -164,6 +193,30 @@ func (s *Analyzer) Configure(facts map[string]any) error { s.MinCommentLength = val } + if val, ok := facts[ConfigCommentSentimentNeutralizerWeight].(float64); ok { + s.cfgNeutralizerWeight = val + } + + if val, ok := 
facts[ConfigCommentSentimentMaxWeightRatio].(float64); ok { + s.cfgMaxWeightRatio = val + } + + if val, ok := facts[ConfigCommentSentimentPositiveThreshold].(float64); ok { + s.cfgPositiveThreshold = val + } + + if val, ok := facts[ConfigCommentSentimentNegativeThreshold].(float64); ok { + s.cfgNegativeThreshold = val + } + + if val, ok := facts[ConfigCommentSentimentTrendThreshold].(float64); ok { + s.cfgTrendThreshold = val + } + + if val, ok := facts[ConfigCommentSentimentLowRiskThreshold].(float64); ok { + s.cfgLowSentimentRiskThresh = val + } + if val, ok := pkgplumbing.GetCommitsByTick(facts); ok { s.commitsByTick = val } @@ -173,6 +226,16 @@ func (s *Analyzer) Configure(facts map[string]any) error { return nil } +// metricOptions returns the configured sentiment metric options. +func (s *Analyzer) metricOptions() MetricOptions { + return MetricOptions{ + PositiveThreshold: s.cfgPositiveThreshold, + NegativeThreshold: s.cfgNegativeThreshold, + TrendThreshold: s.cfgTrendThreshold, + LowSentimentRiskThresh: s.cfgLowSentimentRiskThresh, + } +} + func (s *Analyzer) validate() { if s.Gap < 0 || s.Gap >= 1 { s.Gap = DefaultCommentSentimentGap diff --git a/internal/analyzers/sentiment/metrics.go b/internal/analyzers/sentiment/metrics.go index 1317634..f3b0b01 100644 --- a/internal/analyzers/sentiment/metrics.go +++ b/internal/analyzers/sentiment/metrics.go @@ -170,18 +170,23 @@ func (m *ComputedMetrics) ToYAML() any { return m } -// ComputeAllMetrics runs all sentiment metrics and returns the results. +// ComputeAllMetrics runs all sentiment metrics with default options. func ComputeAllMetrics(report analyze.Report) (*ComputedMetrics, error) { + return ComputeAllMetricsWithOptions(report, DefaultMetricOptions()) +} + +// ComputeAllMetricsWithOptions runs all sentiment metrics with configurable thresholds. 
+func ComputeAllMetricsWithOptions(report analyze.Report, opts MetricOptions) (*ComputedMetrics, error) { input, err := ParseReportData(report) if err != nil { return nil, err } return &ComputedMetrics{ - TimeSeries: computeTimeSeries(input), - Trend: computeTrend(input), - LowSentimentPeriods: computeLowSentimentPeriods(input), - Aggregate: computeAggregate(input), + TimeSeries: computeTimeSeriesWithOpts(input, opts), + Trend: computeTrendWithOpts(input, opts), + LowSentimentPeriods: computeLowSentimentPeriodsWithOpts(input, opts), + Aggregate: computeAggregateWithOpts(input, opts), }, nil } @@ -199,18 +204,38 @@ const ( lowSentimentRiskThreshold = 0.2 ) -func classifyTrendDirection(startSentiment, endSentiment float32) string { +// MetricOptions holds configurable thresholds for sentiment metrics computation. +type MetricOptions struct { + PositiveThreshold float64 + NegativeThreshold float64 + TrendThreshold float64 + LowSentimentRiskThresh float64 +} + +// DefaultMetricOptions returns default sentiment metric options. 
+func DefaultMetricOptions() MetricOptions { + return MetricOptions{ + PositiveThreshold: SentimentPositiveThreshold, + NegativeThreshold: SentimentNegativeThreshold, + TrendThreshold: trendThreshold, + LowSentimentRiskThresh: lowSentimentRiskThreshold, + } +} + +func classifyTrendDirectionWithOpts(startSentiment, endSentiment float32, opts MetricOptions) string { + thresh := float32(opts.TrendThreshold) + switch { - case endSentiment > startSentiment+trendThreshold: + case endSentiment > startSentiment+thresh: return "improving" - case endSentiment < startSentiment-trendThreshold: + case endSentiment < startSentiment-thresh: return "declining" default: return "stable" } } -func computeTimeSeries(input *ReportData) []TimeSeriesData { +func computeTimeSeriesWithOpts(input *ReportData, opts MetricOptions) []TimeSeriesData { ticks := make([]int, 0, len(input.EmotionsByTick)) for tick := range input.EmotionsByTick { ticks = append(ticks, tick) @@ -233,7 +258,7 @@ func computeTimeSeries(input *ReportData) []TimeSeriesData { commitCount = len(commits) } - classification := classifySentiment(sentiment) + classification := classifySentimentWithOpts(sentiment, opts) result = append(result, TimeSeriesData{ Tick: tick, @@ -247,18 +272,18 @@ func computeTimeSeries(input *ReportData) []TimeSeriesData { return result } -func classifySentiment(sentiment float32) string { +func classifySentimentWithOpts(sentiment float32, opts MetricOptions) string { switch { - case sentiment >= SentimentPositiveThreshold: + case sentiment >= float32(opts.PositiveThreshold): return "positive" - case sentiment <= SentimentNegativeThreshold: + case sentiment <= float32(opts.NegativeThreshold): return "negative" default: return "neutral" } } -func computeTrend(input *ReportData) TrendData { +func computeTrendWithOpts(input *ReportData, opts MetricOptions) TrendData { if len(input.EmotionsByTick) == 0 { return TrendData{} } @@ -282,7 +307,7 @@ func computeTrend(input *ReportData) TrendData { 
changePercent = stats.ToPercent(float64(regressionEnd-regressionStart) / float64(regressionStart)) } - direction := classifyTrendDirection(regressionStart, regressionEnd) + direction := classifyTrendDirectionWithOpts(regressionStart, regressionEnd, opts) return TrendData{ StartTick: startTick, @@ -336,16 +361,16 @@ func linearRegressionEndpoints(ticks []int, emotions map[int]float32) (start, en return startVal, endVal } -func computeLowSentimentPeriods(input *ReportData) []LowSentimentPeriodData { +func computeLowSentimentPeriodsWithOpts(input *ReportData, opts MetricOptions) []LowSentimentPeriodData { var result []LowSentimentPeriodData for tick, sentiment := range input.EmotionsByTick { - if sentiment > SentimentNegativeThreshold { + if sentiment > float32(opts.NegativeThreshold) { continue } var riskLevel string - if sentiment <= lowSentimentRiskThreshold { + if sentiment <= float32(opts.LowSentimentRiskThresh) { riskLevel = "HIGH" } else { riskLevel = "MEDIUM" @@ -369,7 +394,7 @@ func computeLowSentimentPeriods(input *ReportData) []LowSentimentPeriodData { return result } -func computeAggregate(input *ReportData) AggregateData { +func computeAggregateWithOpts(input *ReportData, opts MetricOptions) AggregateData { agg := AggregateData{ TotalTicks: len(input.EmotionsByTick), } @@ -384,9 +409,9 @@ func computeAggregate(input *ReportData) AggregateData { totalSentiment += sentiment switch { - case sentiment >= SentimentPositiveThreshold: + case sentiment >= float32(opts.PositiveThreshold): agg.PositiveTicks++ - case sentiment <= SentimentNegativeThreshold: + case sentiment <= float32(opts.NegativeThreshold): agg.NegativeTicks++ default: agg.NeutralTicks++ diff --git a/internal/analyzers/sentiment/metrics_test.go b/internal/analyzers/sentiment/metrics_test.go index 35db103..2b7c7dc 100644 --- a/internal/analyzers/sentiment/metrics_test.go +++ b/internal/analyzers/sentiment/metrics_test.go @@ -105,7 +105,7 @@ func TestClassifySentiment(t *testing.T) { t.Run(tt.name, 
func(t *testing.T) { t.Parallel() - result := classifySentiment(tt.sentiment) + result := classifySentimentWithOpts(tt.sentiment, DefaultMetricOptions()) assert.Equal(t, tt.expected, result) }) } @@ -118,7 +118,7 @@ func TestSentimentTimeSeriesMetric_Empty(t *testing.T) { input := &ReportData{} - result := computeTimeSeries(input) + result := computeTimeSeriesWithOpts(input, DefaultMetricOptions()) assert.Empty(t, result) } @@ -132,7 +132,7 @@ func TestSentimentTimeSeriesMetric_SingleTick(t *testing.T) { CommitsByTick: map[int][]gitlib.Hash{0: {testHash("abc"), testHash("def")}}, } - result := computeTimeSeries(input) + result := computeTimeSeriesWithOpts(input, DefaultMetricOptions()) require.Len(t, result, 1) assert.Equal(t, 0, result[0].Tick) @@ -153,7 +153,7 @@ func TestSentimentTimeSeriesMetric_MultipleTicks_SortedByTick(t *testing.T) { }, } - result := computeTimeSeries(input) + result := computeTimeSeriesWithOpts(input, DefaultMetricOptions()) require.Len(t, result, 3) // Sorted by tick. @@ -175,7 +175,7 @@ func TestSentimentTimeSeriesMetric_MissingCommmentsAndCommits(t *testing.T) { // No comments or commits for tick 0. 
} - result := computeTimeSeries(input) + result := computeTimeSeriesWithOpts(input, DefaultMetricOptions()) require.Len(t, result, 1) assert.Equal(t, 0, result[0].CommentCount) @@ -189,7 +189,7 @@ func TestSentimentTrendMetric_Empty(t *testing.T) { input := &ReportData{} - result := computeTrend(input) + result := computeTrendWithOpts(input, DefaultMetricOptions()) assert.Equal(t, 0, result.StartTick) assert.Equal(t, 0, result.EndTick) @@ -203,7 +203,7 @@ func TestSentimentTrendMetric_SingleTick(t *testing.T) { EmotionsByTick: map[int]float32{0: testSentimentNeutral}, } - result := computeTrend(input) + result := computeTrendWithOpts(input, DefaultMetricOptions()) assert.Equal(t, 0, result.StartTick) assert.Equal(t, 0, result.EndTick) @@ -236,7 +236,7 @@ func TestSentimentTrendMetric_TrendDirections(t *testing.T) { }, } - result := computeTrend(input) + result := computeTrendWithOpts(input, DefaultMetricOptions()) assert.Equal(t, tt.expectedTrend, result.TrendDirection) assert.InDelta(t, tt.startSentiment, result.StartSentiment, floatDelta) @@ -255,7 +255,7 @@ func TestSentimentTrendMetric_ChangePercent(t *testing.T) { }, } - result := computeTrend(input) + result := computeTrendWithOpts(input, DefaultMetricOptions()) // Change = (0.75 - 0.5) / 0.5 * 100 = 50%. assert.InDelta(t, 50.0, result.ChangePercent, floatDelta) @@ -271,7 +271,7 @@ func TestSentimentTrendMetric_ZeroStartSentiment(t *testing.T) { }, } - result := computeTrend(input) + result := computeTrendWithOpts(input, DefaultMetricOptions()) // Change percent should be 0 when start is 0 (avoid division by zero). 
assert.InDelta(t, 0.0, result.ChangePercent, floatDelta) @@ -330,7 +330,7 @@ func TestSentimentTrendMetric_RegressionBased(t *testing.T) { }, } - result := computeTrend(input) + result := computeTrendWithOpts(input, DefaultMetricOptions()) assert.Equal(t, 0, result.StartTick) assert.Equal(t, 4, result.EndTick) @@ -344,7 +344,7 @@ func TestLowSentimentPeriodMetric_Empty(t *testing.T) { input := &ReportData{} - result := computeLowSentimentPeriods(input) + result := computeLowSentimentPeriodsWithOpts(input, DefaultMetricOptions()) assert.Empty(t, result) } @@ -359,7 +359,7 @@ func TestLowSentimentPeriodMetric_NoLowSentiment(t *testing.T) { }, } - result := computeLowSentimentPeriods(input) + result := computeLowSentimentPeriodsWithOpts(input, DefaultMetricOptions()) assert.Empty(t, result) } @@ -387,7 +387,7 @@ func TestLowSentimentPeriodMetric_RiskLevels(t *testing.T) { CommentsByTick: map[int][]string{0: {testComment2}}, } - result := computeLowSentimentPeriods(input) + result := computeLowSentimentPeriodsWithOpts(input, DefaultMetricOptions()) require.Len(t, result, 1) assert.InDelta(t, tt.sentiment, result[0].Sentiment, floatDelta) @@ -408,7 +408,7 @@ func TestLowSentimentPeriodMetric_SortedBySentiment(t *testing.T) { }, } - result := computeLowSentimentPeriods(input) + result := computeLowSentimentPeriodsWithOpts(input, DefaultMetricOptions()) require.Len(t, result, 3) // Sorted by sentiment ascending (worst first). 
@@ -424,7 +424,7 @@ func TestSentimentAggregateMetric_Empty(t *testing.T) { input := &ReportData{} - result := computeAggregate(input) + result := computeAggregateWithOpts(input, DefaultMetricOptions()) assert.Equal(t, 0, result.TotalTicks) assert.Equal(t, 0, result.TotalComments) @@ -454,7 +454,7 @@ func TestSentimentAggregateMetric_AllClassifications(t *testing.T) { }, } - result := computeAggregate(input) + result := computeAggregateWithOpts(input, DefaultMetricOptions()) assert.Equal(t, 3, result.TotalTicks) assert.Equal(t, 3, result.TotalComments) // 2 + 1 + 0 diff --git a/internal/analyzers/sentiment/scorer.go b/internal/analyzers/sentiment/scorer.go index 21fd6d8..88aef75 100644 --- a/internal/analyzers/sentiment/scorer.go +++ b/internal/analyzers/sentiment/scorer.go @@ -134,16 +134,28 @@ const neutralizerWeight = 0.8 // maxWeightRatio caps comment length weight to prevent single long comments from dominating. const maxWeightRatio = 3.0 -// applySEDomainAdjustment adjusts VADER compound score for SE-domain terms. -// Returns adjusted compound score in [-1, 1]. -func applySEDomainAdjustment(text string, compound float64) float64 { +// ScorerOptions holds configurable parameters for sentiment scoring. +type ScorerOptions struct { + NeutralizerWeight float64 + MaxWeightRatio float64 +} + +// DefaultScorerOptions returns ScorerOptions populated with package-level defaults. 
+func DefaultScorerOptions() ScorerOptions { + return ScorerOptions{ + NeutralizerWeight: neutralizerWeight, + MaxWeightRatio: maxWeightRatio, + } +} + +func applySEDomainAdjustmentWithWeight(text string, compound, nWeight float64) float64 { lower := strings.ToLower(text) adjustment := 0.0 count := 0 for term, shift := range seDomainNeutralizers { if strings.Contains(lower, term) { - adjustment += (shift - compound) * neutralizerWeight + adjustment += (shift - compound) * nWeight count++ } } @@ -170,11 +182,18 @@ func applySEDomainAdjustment(text string, compound float64) float64 { // Empty comments yield 0 (no comment implies no sentiment signal). // Comments are weighted by length (longer comments carry more signal). func ComputeSentiment(comments []string) float32 { + return ComputeSentimentWithOptions(comments, DefaultScorerOptions()) +} + +// ComputeSentimentWithOptions returns a sentiment score with configurable parameters. +func ComputeSentimentWithOptions(comments []string, opts ScorerOptions) float32 { if len(comments) == 0 { return 0 } analyzer := getVaderAnalyzer() + nWeight := opts.NeutralizerWeight + maxWR := opts.MaxWeightRatio var weightedSum float64 @@ -189,9 +208,9 @@ func ComputeSentiment(comments []string) float32 { } scores := analyzer.PolarityScores(c) - adjusted := applySEDomainAdjustment(c, scores.Compound) + adjusted := applySEDomainAdjustmentWithWeight(c, scores.Compound, nWeight) - weight := commentWeight(len(c), avgLen) + weight := commentWeightWithMax(len(c), avgLen, maxWR) weightedSum += float64(vaderCompoundToScore(adjusted)) * weight totalWeight += weight } @@ -225,14 +244,12 @@ func averageCommentLength(comments []string) float64 { return float64(total) / float64(count) } -// commentWeight returns the weight for a comment based on its length relative to the average. -// Longer comments get more weight, capped at maxWeightRatio to prevent dominance. 
-func commentWeight(length int, avgLength float64) float64 { +func commentWeightWithMax(length int, avgLength, maxRatio float64) float64 { if avgLength <= 0 { return 1 } ratio := float64(length) / avgLength - return math.Min(ratio, maxWeightRatio) + return math.Min(ratio, maxRatio) } diff --git a/internal/analyzers/sentiment/scorer_test.go b/internal/analyzers/sentiment/scorer_test.go index 426d046..dc337a1 100644 --- a/internal/analyzers/sentiment/scorer_test.go +++ b/internal/analyzers/sentiment/scorer_test.go @@ -134,28 +134,28 @@ func TestComputeSentiment_LengthWeighting(t *testing.T) { func TestApplySEDomainAdjustment_NoTerms(t *testing.T) { t.Parallel() - result := applySEDomainAdjustment("simple regular comment", 0.5) + result := applySEDomainAdjustmentWithWeight("simple regular comment", 0.5, neutralizerWeight) assert.InDelta(t, 0.5, result, floatDelta, "no SE terms should leave compound unchanged") } func TestApplySEDomainAdjustment_WithNeutralizer(t *testing.T) { t.Parallel() - result := applySEDomainAdjustment("kill the process", -0.6) + result := applySEDomainAdjustmentWithWeight("kill the process", -0.6, neutralizerWeight) assert.Greater(t, result, -0.6, "neutralizer should push negative compound toward neutral") } func TestApplySEDomainAdjustment_WithNegativeTerm(t *testing.T) { t.Parallel() - result := applySEDomainAdjustment("this is a terrible hack", 0.0) + result := applySEDomainAdjustmentWithWeight("this is a terrible hack", 0.0, neutralizerWeight) assert.Less(t, result, 0.0, "SE negative term should push compound toward negative") } func TestApplySEDomainAdjustment_ClampsBounds(t *testing.T) { t.Parallel() - result := applySEDomainAdjustment("nightmare spaghetti awful terrible hack kludge", 0.9) + result := applySEDomainAdjustmentWithWeight("nightmare spaghetti awful terrible hack kludge", 0.9, neutralizerWeight) assert.GreaterOrEqual(t, result, -1.0, "result should be >= -1") assert.LessOrEqual(t, result, 1.0, "result should be <= 1") } @@ 
-204,7 +204,7 @@ func TestCommentWeight(t *testing.T) { t.Run(tt.name, func(t *testing.T) { t.Parallel() - result := commentWeight(tt.length, tt.avgLength) + result := commentWeightWithMax(tt.length, tt.avgLength, DefaultScorerOptions().MaxWeightRatio) assert.InDelta(t, tt.expected, result, floatDelta) }) } diff --git a/internal/config/config.go b/internal/config/config.go index 3ccc259..26784ab 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -23,17 +23,48 @@ type PipelineConfig struct { BallastSize string `mapstructure:"ballast_size"` MemoryLimit string `mapstructure:"memory_limit"` WorkerTimeout string `mapstructure:"worker_timeout"` + + // Advanced pipeline tuning. + UASTSpillThreshold int `mapstructure:"uast_spill_threshold"` + IntraCommitParallelThreshold int `mapstructure:"intra_commit_parallel_threshold"` + MaxIntraCommitWorkers int `mapstructure:"max_intra_commit_workers"` + MaxUASTBlobSize int `mapstructure:"max_uast_blob_size"` + UASTParseTimeout string `mapstructure:"uast_parse_timeout"` + MaxChangesPerCommit int `mapstructure:"max_changes_per_commit"` + MaxDiffBatchSize int `mapstructure:"max_diff_batch_size"` + MemoryBudgetRatio int `mapstructure:"memory_budget_ratio"` + MemoryBudgetCap string `mapstructure:"memory_budget_cap"` + MemoryLimitRatio int `mapstructure:"memory_limit_ratio"` + UASTSpillTrimInterval int `mapstructure:"uast_spill_trim_interval"` + NativeTrimInterval int `mapstructure:"native_trim_interval"` + MaxStreamingBuffering int `mapstructure:"max_streaming_buffering"` + DrainPrefetchTimeout string `mapstructure:"drain_prefetch_timeout"` + SamplerInterval string `mapstructure:"sampler_interval"` + WorkerRatio int `mapstructure:"worker_ratio"` + UASTWorkerRatio int `mapstructure:"uast_worker_ratio"` + LeafWorkerDivisor int `mapstructure:"leaf_worker_divisor"` + MinLeafWorkers int `mapstructure:"min_leaf_workers"` + BufferSizeMultiplier int `mapstructure:"buffer_size_multiplier"` + BudgetLimitRatio int 
`mapstructure:"budget_limit_ratio"` + SystemRAMLimitRatio int `mapstructure:"system_ram_limit_ratio"` + StaticMaxWorkers int `mapstructure:"static_max_workers"` + MallocTrimInterval int `mapstructure:"malloc_trim_interval"` + StaticMemoryLimitRatio int `mapstructure:"static_memory_limit_ratio"` + DiffJobBufferMultiplier int `mapstructure:"diff_job_buffer_multiplier"` } // HistoryConfig holds per-analyzer configuration for history analyzers. type HistoryConfig struct { - Burndown BurndownConfig `mapstructure:"burndown"` - Devs DevsConfig `mapstructure:"devs"` - Imports ImportsConfig `mapstructure:"imports"` - Sentiment SentimentConfig `mapstructure:"sentiment"` - Shotness ShotnessConfig `mapstructure:"shotness"` - Typos TyposConfig `mapstructure:"typos"` - Anomaly AnomalyConfig `mapstructure:"anomaly"` + Burndown BurndownConfig `mapstructure:"burndown"` + Couples CouplesConfig `mapstructure:"couples"` + Devs DevsConfig `mapstructure:"devs"` + FileHistory FileHistoryConfig `mapstructure:"file_history"` + Imports ImportsConfig `mapstructure:"imports"` + Sentiment SentimentConfig `mapstructure:"sentiment"` + Shotness ShotnessConfig `mapstructure:"shotness"` + Typos TyposConfig `mapstructure:"typos"` + Anomaly AnomalyConfig `mapstructure:"anomaly"` + Clones ClonesConfig `mapstructure:"clones"` } // AnomalyConfig holds temporal anomaly detection analyzer settings. @@ -55,22 +86,54 @@ type BurndownConfig struct { Goroutines int `mapstructure:"goroutines"` } +// CouplesConfig holds couples analyzer settings. 
+type CouplesConfig struct { + CouplingThresholdHigh int `mapstructure:"coupling_threshold_high"` + OwnershipFewThreshold int `mapstructure:"ownership_few_threshold"` + OwnershipModerateThreshold int `mapstructure:"ownership_moderate_threshold"` + BatchCouplingThreshold int `mapstructure:"batch_coupling_threshold"` + HLLPrecision int `mapstructure:"hll_precision"` + TopKPerFile int `mapstructure:"top_k_per_file"` + MinEdgeWeight int `mapstructure:"min_edge_weight"` +} + // DevsConfig holds devs analyzer settings. type DevsConfig struct { - ConsiderEmptyCommits bool `mapstructure:"consider_empty_commits"` - Anonymize bool `mapstructure:"anonymize"` + ConsiderEmptyCommits bool `mapstructure:"consider_empty_commits"` + Anonymize bool `mapstructure:"anonymize"` + BusFactorThreshold float64 `mapstructure:"bus_factor_threshold"` + RiskThresholdCritical float64 `mapstructure:"risk_threshold_critical"` + RiskThresholdHigh float64 `mapstructure:"risk_threshold_high"` + RiskThresholdMedium float64 `mapstructure:"risk_threshold_medium"` + ActiveThresholdRatio float64 `mapstructure:"active_threshold_ratio"` + DefaultActiveDays int `mapstructure:"default_active_days"` + HLLPrecision int `mapstructure:"hll_precision"` +} + +// FileHistoryConfig holds file history analyzer settings. +type FileHistoryConfig struct { + HotspotThresholdCritical int `mapstructure:"hotspot_threshold_critical"` + HotspotThresholdHigh int `mapstructure:"hotspot_threshold_high"` + HotspotThresholdMedium int `mapstructure:"hotspot_threshold_medium"` } // ImportsConfig holds imports history analyzer settings. type ImportsConfig struct { - Goroutines int `mapstructure:"goroutines"` - MaxFileSize int `mapstructure:"max_file_size"` + Goroutines int `mapstructure:"goroutines"` + MaxFileSize int `mapstructure:"max_file_size"` + MaxDependencyRiskRows int `mapstructure:"max_dependency_risk_rows"` } // SentimentConfig holds sentiment analyzer settings. 
type SentimentConfig struct { - MinCommentLength int `mapstructure:"min_comment_length"` - Gap float64 `mapstructure:"gap"` + MinCommentLength int `mapstructure:"min_comment_length"` + Gap float64 `mapstructure:"gap"` + NeutralizerWeight float64 `mapstructure:"neutralizer_weight"` + MaxWeightRatio float64 `mapstructure:"max_weight_ratio"` + PositiveThreshold float64 `mapstructure:"positive_threshold"` + NegativeThreshold float64 `mapstructure:"negative_threshold"` + TrendThreshold float64 `mapstructure:"trend_threshold"` + LowSentimentRiskThresh float64 `mapstructure:"low_sentiment_risk_threshold"` } // ShotnessConfig holds shotness analyzer settings. @@ -84,6 +147,21 @@ type TyposConfig struct { MaxDistance int `mapstructure:"max_distance"` } +// ClonesConfig holds clones analyzer settings. +type ClonesConfig struct { + MaxClonePairs int `mapstructure:"max_clone_pairs"` + NumHashes int `mapstructure:"num_hashes"` + NumBands int `mapstructure:"num_bands"` + NumRows int `mapstructure:"num_rows"` + ShingleSize int `mapstructure:"shingle_size"` + SimilarityType2 float64 `mapstructure:"similarity_type2"` + SimilarityType3 float64 `mapstructure:"similarity_type3"` + ThresholdRatioYellow float64 `mapstructure:"threshold_ratio_yellow"` + ThresholdRatioRed float64 `mapstructure:"threshold_ratio_red"` + ThresholdPairsYellow int `mapstructure:"threshold_pairs_yellow"` + ThresholdPairsRed int `mapstructure:"threshold_pairs_red"` +} + // CheckpointConfig holds checkpoint settings. type CheckpointConfig struct { Enabled bool `mapstructure:"enabled"` @@ -95,6 +173,18 @@ type CheckpointConfig struct { // sentimentGapMax is the upper bound for the sentiment gap value. const sentimentGapMax = 1.0 +// ratioMax is the upper bound for ratio values (0.0 to 1.0). +const ratioMax = 1.0 + +// percentMax is the upper bound for percentage values (0 to 100). +const percentMax = 100.0 + +// HLL precision bounds (algorithm constraint). 
+const ( + minHLLPrecision = 4 + maxHLLPrecision = 18 +) + // Sentinel errors for configuration validation. var ( // ErrInvalidWorkers indicates the workers value is negative. @@ -105,24 +195,76 @@ var ( ErrInvalidCommitBatchSize = errors.New("pipeline.commit_batch_size must be non-negative") // ErrInvalidGOGC indicates the GOGC value is negative. ErrInvalidGOGC = errors.New("pipeline.gogc must be non-negative") + // ErrInvalidUASTSpillThreshold indicates the UAST spill threshold is negative. + ErrInvalidUASTSpillThreshold = errors.New("pipeline.uast_spill_threshold must be non-negative") + // ErrInvalidIntraCommitParallelThreshold indicates the intra-commit parallel threshold is negative. + ErrInvalidIntraCommitParallelThreshold = errors.New("pipeline.intra_commit_parallel_threshold must be non-negative") + // ErrInvalidMaxIntraCommitWorkers indicates the max intra-commit workers is negative. + ErrInvalidMaxIntraCommitWorkers = errors.New("pipeline.max_intra_commit_workers must be non-negative") + // ErrInvalidMaxUASTBlobSize indicates the max UAST blob size is negative. + ErrInvalidMaxUASTBlobSize = errors.New("pipeline.max_uast_blob_size must be non-negative") + // ErrInvalidMaxChangesPerCommit indicates the max changes per commit is negative. + ErrInvalidMaxChangesPerCommit = errors.New("pipeline.max_changes_per_commit must be non-negative") + // ErrInvalidMaxDiffBatchSize indicates the max diff batch size is negative. + ErrInvalidMaxDiffBatchSize = errors.New("pipeline.max_diff_batch_size must be non-negative") + // ErrInvalidMemoryBudgetRatio indicates the memory budget ratio is out of range. + ErrInvalidMemoryBudgetRatio = errors.New("pipeline.memory_budget_ratio must be between 0 and 100") + // ErrInvalidMemoryLimitRatio indicates the memory limit ratio is out of range. + ErrInvalidMemoryLimitRatio = errors.New("pipeline.memory_limit_ratio must be between 0 and 100") // ErrInvalidBurndownGranularity indicates the granularity is not positive. 
ErrInvalidBurndownGranularity = errors.New("history.burndown.granularity must be positive") // ErrInvalidBurndownSampling indicates the sampling is not positive. ErrInvalidBurndownSampling = errors.New("history.burndown.sampling must be positive") + // ErrInvalidCouplingThreshold indicates the coupling threshold is negative. + ErrInvalidCouplingThreshold = errors.New("history.couples.coupling_threshold_high must be non-negative") + // ErrInvalidOwnershipFewThreshold indicates the ownership few threshold is negative. + ErrInvalidOwnershipFewThreshold = errors.New("history.couples.ownership_few_threshold must be non-negative") + // ErrInvalidOwnershipModerateThreshold indicates the ownership moderate threshold is negative. + ErrInvalidOwnershipModerateThreshold = errors.New("history.couples.ownership_moderate_threshold must be non-negative") + // ErrInvalidCouplesHLLPrecision indicates the HLL precision is out of range. + ErrInvalidCouplesHLLPrecision = errors.New("history.couples.hll_precision must be between 4 and 18") + // ErrInvalidBusFactorThreshold indicates the bus factor threshold is out of range. + ErrInvalidBusFactorThreshold = errors.New("history.devs.bus_factor_threshold must be between 0 and 1") + // ErrInvalidDevsRiskThresholdCritical indicates the critical risk threshold is out of range. + ErrInvalidDevsRiskThresholdCritical = errors.New("history.devs.risk_threshold_critical must be between 0 and 100") + // ErrInvalidDevsRiskThresholdHigh indicates the high risk threshold is out of range. + ErrInvalidDevsRiskThresholdHigh = errors.New("history.devs.risk_threshold_high must be between 0 and 100") + // ErrInvalidDevsRiskThresholdMedium indicates the medium risk threshold is out of range. + ErrInvalidDevsRiskThresholdMedium = errors.New("history.devs.risk_threshold_medium must be between 0 and 100") + // ErrInvalidDevsActiveThresholdRatio indicates the active threshold ratio is out of range. 
+ ErrInvalidDevsActiveThresholdRatio = errors.New("history.devs.active_threshold_ratio must be between 0 and 1") + // ErrInvalidDevsDefaultActiveDays indicates the default active days is negative. + ErrInvalidDevsDefaultActiveDays = errors.New("history.devs.default_active_days must be non-negative") + // ErrInvalidDevsHLLPrecision indicates the HLL precision is out of range. + ErrInvalidDevsHLLPrecision = errors.New("history.devs.hll_precision must be between 4 and 18") + // ErrInvalidHotspotThresholdCritical indicates the critical hotspot threshold is negative. + ErrInvalidHotspotThresholdCritical = errors.New("history.file_history.hotspot_threshold_critical must be non-negative") + // ErrInvalidHotspotThresholdHigh indicates the high hotspot threshold is negative. + ErrInvalidHotspotThresholdHigh = errors.New("history.file_history.hotspot_threshold_high must be non-negative") + // ErrInvalidHotspotThresholdMedium indicates the medium hotspot threshold is negative. + ErrInvalidHotspotThresholdMedium = errors.New("history.file_history.hotspot_threshold_medium must be non-negative") // ErrInvalidSentimentMinLength indicates the min comment length is not positive. ErrInvalidSentimentMinLength = errors.New("history.sentiment.min_comment_length must be positive") // ErrInvalidSentimentGap indicates the sentiment gap is out of range. ErrInvalidSentimentGap = errors.New("history.sentiment.gap must be between 0 and 1") + // ErrInvalidNeutralizerWeight indicates the neutralizer weight is out of range. + ErrInvalidNeutralizerWeight = errors.New("history.sentiment.neutralizer_weight must be between 0 and 1") + // ErrInvalidMaxWeightRatio indicates the max weight ratio is negative. + ErrInvalidMaxWeightRatio = errors.New("history.sentiment.max_weight_ratio must be non-negative") // ErrInvalidTyposMaxDistance indicates the max distance is not positive. 
ErrInvalidTyposMaxDistance = errors.New("history.typos.max_distance must be positive") // ErrInvalidImportsGoroutines indicates the goroutines value is not positive. ErrInvalidImportsGoroutines = errors.New("history.imports.goroutines must be positive") // ErrInvalidImportsMaxFileSize indicates the max file size is not positive. ErrInvalidImportsMaxFileSize = errors.New("history.imports.max_file_size must be positive") + // ErrInvalidImportsMaxDependencyRiskRows indicates the max dependency risk rows is negative. + ErrInvalidImportsMaxDependencyRiskRows = errors.New("history.imports.max_dependency_risk_rows must be non-negative") // ErrInvalidAnomalyThreshold indicates the threshold is not positive. ErrInvalidAnomalyThreshold = errors.New("history.anomaly.threshold must be positive") // ErrInvalidAnomalyWindowSize indicates the window size is less than 2. ErrInvalidAnomalyWindowSize = errors.New("history.anomaly.window_size must be at least 2") + // ErrInvalidClonesMaxClonePairs indicates the max clone pairs is negative. + ErrInvalidClonesMaxClonePairs = errors.New("history.clones.max_clone_pairs must be non-negative") ) // Validate checks Config invariants and returns the first error found. 
@@ -152,6 +294,38 @@ func (c *Config) validatePipeline() error { return ErrInvalidGOGC } + if c.Pipeline.UASTSpillThreshold < 0 { + return ErrInvalidUASTSpillThreshold + } + + if c.Pipeline.IntraCommitParallelThreshold < 0 { + return ErrInvalidIntraCommitParallelThreshold + } + + if c.Pipeline.MaxIntraCommitWorkers < 0 { + return ErrInvalidMaxIntraCommitWorkers + } + + if c.Pipeline.MaxUASTBlobSize < 0 { + return ErrInvalidMaxUASTBlobSize + } + + if c.Pipeline.MaxChangesPerCommit < 0 { + return ErrInvalidMaxChangesPerCommit + } + + if c.Pipeline.MaxDiffBatchSize < 0 { + return ErrInvalidMaxDiffBatchSize + } + + if c.Pipeline.MemoryBudgetRatio < 0 || c.Pipeline.MemoryBudgetRatio > int(percentMax) { + return ErrInvalidMemoryBudgetRatio + } + + if c.Pipeline.MemoryLimitRatio < 0 || c.Pipeline.MemoryLimitRatio > int(percentMax) { + return ErrInvalidMemoryLimitRatio + } + return nil } @@ -164,12 +338,24 @@ func (c *Config) validateHistory() error { return ErrInvalidBurndownSampling } - if c.History.Sentiment.MinCommentLength < 0 { - return ErrInvalidSentimentMinLength + err := c.validateCouples() + if err != nil { + return err } - if c.History.Sentiment.Gap < 0 || c.History.Sentiment.Gap > sentimentGapMax { - return ErrInvalidSentimentGap + err = c.validateDevs() + if err != nil { + return err + } + + err = c.validateFileHistory() + if err != nil { + return err + } + + err = c.validateSentiment() + if err != nil { + return err } if c.History.Typos.MaxDistance < 0 { @@ -184,6 +370,10 @@ func (c *Config) validateHistory() error { return ErrInvalidImportsMaxFileSize } + if c.History.Imports.MaxDependencyRiskRows < 0 { + return ErrInvalidImportsMaxDependencyRiskRows + } + if c.History.Anomaly.Threshold < 0 { return ErrInvalidAnomalyThreshold } @@ -192,6 +382,106 @@ func (c *Config) validateHistory() error { return ErrInvalidAnomalyWindowSize } + if c.History.Clones.MaxClonePairs < 0 { + return ErrInvalidClonesMaxClonePairs + } + + return nil +} + +func (c *Config) 
validateCouples() error { + cp := c.History.Couples + + if cp.CouplingThresholdHigh < 0 { + return ErrInvalidCouplingThreshold + } + + if cp.OwnershipFewThreshold < 0 { + return ErrInvalidOwnershipFewThreshold + } + + if cp.OwnershipModerateThreshold < 0 { + return ErrInvalidOwnershipModerateThreshold + } + + if cp.HLLPrecision != 0 && (cp.HLLPrecision < minHLLPrecision || cp.HLLPrecision > maxHLLPrecision) { + return ErrInvalidCouplesHLLPrecision + } + + return nil +} + +func (c *Config) validateDevs() error { + dv := c.History.Devs + + if dv.BusFactorThreshold < 0 || dv.BusFactorThreshold > ratioMax { + return ErrInvalidBusFactorThreshold + } + + if dv.RiskThresholdCritical < 0 || dv.RiskThresholdCritical > percentMax { + return ErrInvalidDevsRiskThresholdCritical + } + + if dv.RiskThresholdHigh < 0 || dv.RiskThresholdHigh > percentMax { + return ErrInvalidDevsRiskThresholdHigh + } + + if dv.RiskThresholdMedium < 0 || dv.RiskThresholdMedium > percentMax { + return ErrInvalidDevsRiskThresholdMedium + } + + if dv.ActiveThresholdRatio < 0 || dv.ActiveThresholdRatio > ratioMax { + return ErrInvalidDevsActiveThresholdRatio + } + + if dv.DefaultActiveDays < 0 { + return ErrInvalidDevsDefaultActiveDays + } + + if dv.HLLPrecision != 0 && (dv.HLLPrecision < minHLLPrecision || dv.HLLPrecision > maxHLLPrecision) { + return ErrInvalidDevsHLLPrecision + } + + return nil +} + +func (c *Config) validateFileHistory() error { + fh := c.History.FileHistory + + if fh.HotspotThresholdCritical < 0 { + return ErrInvalidHotspotThresholdCritical + } + + if fh.HotspotThresholdHigh < 0 { + return ErrInvalidHotspotThresholdHigh + } + + if fh.HotspotThresholdMedium < 0 { + return ErrInvalidHotspotThresholdMedium + } + + return nil +} + +func (c *Config) validateSentiment() error { + se := c.History.Sentiment + + if se.MinCommentLength < 0 { + return ErrInvalidSentimentMinLength + } + + if se.Gap < 0 || se.Gap > sentimentGapMax { + return ErrInvalidSentimentGap + } + + if 
se.NeutralizerWeight < 0 || se.NeutralizerWeight > ratioMax { + return ErrInvalidNeutralizerWeight + } + + if se.MaxWeightRatio < 0 { + return ErrInvalidMaxWeightRatio + } + return nil } diff --git a/internal/config/defaults.go b/internal/config/defaults.go index 9cccde8..d6ffaf3 100644 --- a/internal/config/defaults.go +++ b/internal/config/defaults.go @@ -13,6 +13,36 @@ const ( DefaultPipelineBallastSize = "0" ) +// Pipeline advanced tuning defaults. +const ( + DefaultPipelineUASTSpillThreshold = 32 + DefaultPipelineIntraCommitParallelThreshold = 4 + DefaultPipelineMaxIntraCommitWorkers = 4 + DefaultPipelineMaxUASTBlobSize = 256 * 1024 // 256 KiB. + DefaultPipelineUASTParseTimeout = "10s" + DefaultPipelineMaxChangesPerCommit = 10000 + DefaultPipelineMaxDiffBatchSize = 1000 + DefaultPipelineMemoryBudgetRatio = 50 + DefaultPipelineMemoryBudgetCap = "2GiB" + DefaultPipelineMemoryLimitRatio = 75 + DefaultPipelineUASTSpillTrimInterval = 16 + DefaultPipelineNativeTrimInterval = 10 + DefaultPipelineMaxStreamingBuffering = 3 + DefaultPipelineDrainPrefetchTimeout = "30s" + DefaultPipelineSamplerInterval = "2s" + DefaultPipelineWorkerRatio = 100 + DefaultPipelineUASTWorkerRatio = 40 + DefaultPipelineLeafWorkerDivisor = 3 + DefaultPipelineMinLeafWorkers = 4 + DefaultPipelineBufferSizeMultiplier = 2 + DefaultPipelineBudgetLimitRatio = 95 + DefaultPipelineSystemRAMLimitRatio = 90 + DefaultPipelineStaticMaxWorkers = 8 + DefaultPipelineMallocTrimInterval = 50 + DefaultPipelineStaticMemoryLimitRatio = 90 + DefaultPipelineDiffJobBufferMultiplier = 10 +) + // Burndown analyzer defaults. const ( DefaultBurndownGranularity = 30 @@ -26,22 +56,54 @@ const ( DefaultBurndownGoroutines = 0 ) +// Couples analyzer defaults. 
+const ( + DefaultCouplesCouplingThresholdHigh = 10 + DefaultCouplesOwnershipFewThreshold = 3 + DefaultCouplesOwnershipModerateThreshold = 5 + DefaultCouplesBatchCouplingThreshold = 100 + DefaultCouplesHLLPrecision = 10 + DefaultCouplesTopKPerFile = 100 + DefaultCouplesMinEdgeWeight = 2 +) + // Devs analyzer defaults. const ( - DefaultDevsConsiderEmptyCommits = false - DefaultDevsAnonymize = false + DefaultDevsConsiderEmptyCommits = false + DefaultDevsAnonymize = false + DefaultDevsBusFactorThreshold = 0.5 + DefaultDevsRiskThresholdCritical = 90.0 + DefaultDevsRiskThresholdHigh = 80.0 + DefaultDevsRiskThresholdMedium = 60.0 + DefaultDevsActiveThresholdRatio = 0.7 + DefaultDevsDefaultActiveDays = 90 + DefaultDevsHLLPrecision = 14 +) + +// File history analyzer defaults. +const ( + DefaultFileHistoryHotspotCritical = 50 + DefaultFileHistoryHotspotHigh = 30 + DefaultFileHistoryHotspotMedium = 15 ) // Imports analyzer defaults. const ( - DefaultImportsGoroutines = 4 - DefaultImportsMaxFileSize = 1 << 20 // 1 MiB. + DefaultImportsGoroutines = 4 + DefaultImportsMaxFileSize = 1 << 20 // 1 MiB. + DefaultImportsMaxDependencyRiskRows = 30 ) // Sentiment analyzer defaults. const ( - DefaultSentimentMinCommentLength = 20 - DefaultSentimentGap = 0.5 + DefaultSentimentMinCommentLength = 20 + DefaultSentimentGap = 0.5 + DefaultSentimentNeutralizerWeight = 0.8 + DefaultSentimentMaxWeightRatio = 3.0 + DefaultSentimentPositiveThreshold = 0.6 + DefaultSentimentNegativeThreshold = 0.4 + DefaultSentimentTrendThreshold = 0.1 + DefaultSentimentLowSentimentRiskThresh = 0.2 ) // Shotness analyzer defaults. @@ -61,6 +123,21 @@ const ( DefaultAnomalyWindowSize = 20 ) +// Clones analyzer defaults. 
+const ( + DefaultClonesMaxClonePairs = 1000 + DefaultClonesNumHashes = 128 + DefaultClonesNumBands = 16 + DefaultClonesNumRows = 8 + DefaultClonesShingleSize = 5 + DefaultClonesSimilarityType2 = 0.8 + DefaultClonesSimilarityType3 = 0.5 + DefaultClonesThresholdRatioYellow = 0.1 + DefaultClonesThresholdRatioRed = 0.3 + DefaultClonesThresholdPairsYellow = 5 + DefaultClonesThresholdPairsRed = 20 +) + // Checkpoint defaults. const ( DefaultCheckpointEnabled = true diff --git a/internal/config/facts.go b/internal/config/facts.go index e79fd49..47a8c21 100644 --- a/internal/config/facts.go +++ b/internal/config/facts.go @@ -2,7 +2,7 @@ package config // positive constrains types eligible for skip-on-zero fact application. type positive interface { - ~int | ~float32 + ~int | ~float32 | ~float64 } // applyPositive sets facts[key] = value when value is positive. @@ -31,6 +31,19 @@ func applyBool(facts map[string]any, key string, value bool) { // indicate "use analyzer default" and are skipped. // Boolean fields are always applied because false is a meaningful value. 
func (c *Config) ApplyToFacts(facts map[string]any) { + c.applyBurndownFacts(facts) + c.applyCouplesFacts(facts) + c.applyDevsFacts(facts) + c.applyFileHistoryFacts(facts) + c.applyImportsFacts(facts) + c.applySentimentFacts(facts) + c.applyShotnessFacts(facts) + c.applyTyposFacts(facts) + c.applyAnomalyFacts(facts) + c.applyClonesFacts(facts) +} + +func (c *Config) applyBurndownFacts(facts map[string]any) { bd := c.History.Burndown applyPositive(facts, "Burndown.Granularity", bd.Granularity) @@ -42,31 +55,93 @@ func (c *Config) ApplyToFacts(facts map[string]any) { applyNonEmpty(facts, "Burndown.HibernationDirectory", bd.HibernationDirectory) applyBool(facts, "Burndown.Debug", bd.Debug) applyPositive(facts, "Burndown.Goroutines", bd.Goroutines) +} + +func (c *Config) applyCouplesFacts(facts map[string]any) { + cp := c.History.Couples + applyPositive(facts, "Couples.CouplingThresholdHigh", cp.CouplingThresholdHigh) + applyPositive(facts, "Couples.OwnershipFewThreshold", cp.OwnershipFewThreshold) + applyPositive(facts, "Couples.OwnershipModerateThreshold", cp.OwnershipModerateThreshold) + applyPositive(facts, "Couples.BatchCouplingThreshold", cp.BatchCouplingThreshold) + applyPositive(facts, "Couples.HLLPrecision", cp.HLLPrecision) + applyPositive(facts, "Couples.TopKPerFile", cp.TopKPerFile) + applyPositive(facts, "Couples.MinEdgeWeight", cp.MinEdgeWeight) +} + +func (c *Config) applyDevsFacts(facts map[string]any) { dv := c.History.Devs applyBool(facts, "Devs.ConsiderEmptyCommits", dv.ConsiderEmptyCommits) applyBool(facts, "Devs.Anonymize", dv.Anonymize) + applyPositive(facts, "Devs.BusFactorThreshold", dv.BusFactorThreshold) + applyPositive(facts, "Devs.RiskThresholdCritical", dv.RiskThresholdCritical) + applyPositive(facts, "Devs.RiskThresholdHigh", dv.RiskThresholdHigh) + applyPositive(facts, "Devs.RiskThresholdMedium", dv.RiskThresholdMedium) + applyPositive(facts, "Devs.ActiveThresholdRatio", dv.ActiveThresholdRatio) + applyPositive(facts, 
"Devs.DefaultActiveDays", dv.DefaultActiveDays) + applyPositive(facts, "Devs.HLLPrecision", dv.HLLPrecision) +} + +func (c *Config) applyFileHistoryFacts(facts map[string]any) { + fh := c.History.FileHistory + applyPositive(facts, "FileHistory.HotspotThresholdCritical", fh.HotspotThresholdCritical) + applyPositive(facts, "FileHistory.HotspotThresholdHigh", fh.HotspotThresholdHigh) + applyPositive(facts, "FileHistory.HotspotThresholdMedium", fh.HotspotThresholdMedium) +} + +func (c *Config) applyImportsFacts(facts map[string]any) { im := c.History.Imports applyPositive(facts, "Imports.Goroutines", im.Goroutines) applyPositive(facts, "Imports.MaxFileSize", im.MaxFileSize) + applyPositive(facts, "Imports.MaxDependencyRiskRows", im.MaxDependencyRiskRows) +} +func (c *Config) applySentimentFacts(facts map[string]any) { se := c.History.Sentiment applyPositive(facts, "CommentSentiment.MinLength", se.MinCommentLength) - applyPositive(facts, "CommentSentiment.Gap", float32(se.Gap)) + applyPositive(facts, "CommentSentiment.Gap", se.Gap) + applyPositive(facts, "CommentSentiment.NeutralizerWeight", se.NeutralizerWeight) + applyPositive(facts, "CommentSentiment.MaxWeightRatio", se.MaxWeightRatio) + applyPositive(facts, "CommentSentiment.PositiveThreshold", se.PositiveThreshold) + applyPositive(facts, "CommentSentiment.NegativeThreshold", se.NegativeThreshold) + applyPositive(facts, "CommentSentiment.TrendThreshold", se.TrendThreshold) + applyPositive(facts, "CommentSentiment.LowSentimentRiskThreshold", se.LowSentimentRiskThresh) +} +func (c *Config) applyShotnessFacts(facts map[string]any) { sh := c.History.Shotness applyNonEmpty(facts, "Shotness.DSLStruct", sh.DSLStruct) applyNonEmpty(facts, "Shotness.DSLName", sh.DSLName) +} +func (c *Config) applyTyposFacts(facts map[string]any) { applyPositive(facts, "TyposDatasetBuilder.MaximumAllowedDistance", c.History.Typos.MaxDistance) +} +func (c *Config) applyAnomalyFacts(facts map[string]any) { an := c.History.Anomaly - 
applyPositive(facts, "TemporalAnomaly.Threshold", float32(an.Threshold)) + applyPositive(facts, "TemporalAnomaly.Threshold", an.Threshold) applyPositive(facts, "TemporalAnomaly.WindowSize", an.WindowSize) } + +func (c *Config) applyClonesFacts(facts map[string]any) { + cl := c.History.Clones + + applyPositive(facts, "Clones.MaxClonePairs", cl.MaxClonePairs) + applyPositive(facts, "Clones.NumHashes", cl.NumHashes) + applyPositive(facts, "Clones.NumBands", cl.NumBands) + applyPositive(facts, "Clones.NumRows", cl.NumRows) + applyPositive(facts, "Clones.ShingleSize", cl.ShingleSize) + applyPositive(facts, "Clones.SimilarityType2", cl.SimilarityType2) + applyPositive(facts, "Clones.SimilarityType3", cl.SimilarityType3) + applyPositive(facts, "Clones.ThresholdRatioYellow", cl.ThresholdRatioYellow) + applyPositive(facts, "Clones.ThresholdRatioRed", cl.ThresholdRatioRed) + applyPositive(facts, "Clones.ThresholdPairsYellow", cl.ThresholdPairsYellow) + applyPositive(facts, "Clones.ThresholdPairsRed", cl.ThresholdPairsRed) +} diff --git a/internal/config/loader.go b/internal/config/loader.go index e543a00..69ed71b 100644 --- a/internal/config/loader.go +++ b/internal/config/loader.go @@ -73,6 +73,12 @@ func LoadConfig(configPath string) (*Config, error) { func applyDefaults(viperCfg *viper.Viper) { viperCfg.SetDefault("analyzers", []string{}) + applyPipelineDefaults(viperCfg) + applyHistoryDefaults(viperCfg) + applyCheckpointDefaults(viperCfg) +} + +func applyPipelineDefaults(viperCfg *viper.Viper) { viperCfg.SetDefault("pipeline.workers", DefaultPipelineWorkers) viperCfg.SetDefault("pipeline.memory_budget", DefaultPipelineMemoryBudget) viperCfg.SetDefault("pipeline.blob_cache_size", DefaultPipelineBlobCacheSize) @@ -82,6 +88,42 @@ func applyDefaults(viperCfg *viper.Viper) { viperCfg.SetDefault("pipeline.gogc", DefaultPipelineGOGC) viperCfg.SetDefault("pipeline.ballast_size", DefaultPipelineBallastSize) + viperCfg.SetDefault("pipeline.uast_spill_threshold", 
DefaultPipelineUASTSpillThreshold) + viperCfg.SetDefault("pipeline.intra_commit_parallel_threshold", DefaultPipelineIntraCommitParallelThreshold) + viperCfg.SetDefault("pipeline.max_intra_commit_workers", DefaultPipelineMaxIntraCommitWorkers) + viperCfg.SetDefault("pipeline.max_uast_blob_size", DefaultPipelineMaxUASTBlobSize) + viperCfg.SetDefault("pipeline.uast_parse_timeout", DefaultPipelineUASTParseTimeout) + viperCfg.SetDefault("pipeline.max_changes_per_commit", DefaultPipelineMaxChangesPerCommit) + viperCfg.SetDefault("pipeline.max_diff_batch_size", DefaultPipelineMaxDiffBatchSize) + viperCfg.SetDefault("pipeline.memory_budget_ratio", DefaultPipelineMemoryBudgetRatio) + viperCfg.SetDefault("pipeline.memory_budget_cap", DefaultPipelineMemoryBudgetCap) + viperCfg.SetDefault("pipeline.memory_limit_ratio", DefaultPipelineMemoryLimitRatio) + viperCfg.SetDefault("pipeline.uast_spill_trim_interval", DefaultPipelineUASTSpillTrimInterval) + viperCfg.SetDefault("pipeline.native_trim_interval", DefaultPipelineNativeTrimInterval) + viperCfg.SetDefault("pipeline.max_streaming_buffering", DefaultPipelineMaxStreamingBuffering) + viperCfg.SetDefault("pipeline.drain_prefetch_timeout", DefaultPipelineDrainPrefetchTimeout) + viperCfg.SetDefault("pipeline.sampler_interval", DefaultPipelineSamplerInterval) + viperCfg.SetDefault("pipeline.worker_ratio", DefaultPipelineWorkerRatio) + viperCfg.SetDefault("pipeline.uast_worker_ratio", DefaultPipelineUASTWorkerRatio) + viperCfg.SetDefault("pipeline.leaf_worker_divisor", DefaultPipelineLeafWorkerDivisor) + viperCfg.SetDefault("pipeline.min_leaf_workers", DefaultPipelineMinLeafWorkers) + viperCfg.SetDefault("pipeline.buffer_size_multiplier", DefaultPipelineBufferSizeMultiplier) + viperCfg.SetDefault("pipeline.budget_limit_ratio", DefaultPipelineBudgetLimitRatio) + viperCfg.SetDefault("pipeline.system_ram_limit_ratio", DefaultPipelineSystemRAMLimitRatio) + viperCfg.SetDefault("pipeline.static_max_workers", DefaultPipelineStaticMaxWorkers) 
+ viperCfg.SetDefault("pipeline.malloc_trim_interval", DefaultPipelineMallocTrimInterval) + viperCfg.SetDefault("pipeline.static_memory_limit_ratio", DefaultPipelineStaticMemoryLimitRatio) + viperCfg.SetDefault("pipeline.diff_job_buffer_multiplier", DefaultPipelineDiffJobBufferMultiplier) +} + +func applyHistoryDefaults(viperCfg *viper.Viper) { + applyBurndownDefaults(viperCfg) + applyCouplesDefaults(viperCfg) + applyDevsDefaults(viperCfg) + applyOtherHistoryDefaults(viperCfg) +} + +func applyBurndownDefaults(viperCfg *viper.Viper) { viperCfg.SetDefault("history.burndown.granularity", DefaultBurndownGranularity) viperCfg.SetDefault("history.burndown.sampling", DefaultBurndownSampling) viperCfg.SetDefault("history.burndown.track_files", DefaultBurndownTrackFiles) @@ -91,15 +133,47 @@ func applyDefaults(viperCfg *viper.Viper) { viperCfg.SetDefault("history.burndown.hibernation_directory", DefaultBurndownHibernationDirectory) viperCfg.SetDefault("history.burndown.debug", DefaultBurndownDebug) viperCfg.SetDefault("history.burndown.goroutines", DefaultBurndownGoroutines) +} +func applyCouplesDefaults(viperCfg *viper.Viper) { + viperCfg.SetDefault("history.couples.coupling_threshold_high", DefaultCouplesCouplingThresholdHigh) + viperCfg.SetDefault("history.couples.ownership_few_threshold", DefaultCouplesOwnershipFewThreshold) + viperCfg.SetDefault("history.couples.ownership_moderate_threshold", DefaultCouplesOwnershipModerateThreshold) + viperCfg.SetDefault("history.couples.batch_coupling_threshold", DefaultCouplesBatchCouplingThreshold) + viperCfg.SetDefault("history.couples.hll_precision", DefaultCouplesHLLPrecision) + viperCfg.SetDefault("history.couples.top_k_per_file", DefaultCouplesTopKPerFile) + viperCfg.SetDefault("history.couples.min_edge_weight", DefaultCouplesMinEdgeWeight) +} + +func applyDevsDefaults(viperCfg *viper.Viper) { viperCfg.SetDefault("history.devs.consider_empty_commits", DefaultDevsConsiderEmptyCommits) 
viperCfg.SetDefault("history.devs.anonymize", DefaultDevsAnonymize) + viperCfg.SetDefault("history.devs.bus_factor_threshold", DefaultDevsBusFactorThreshold) + viperCfg.SetDefault("history.devs.risk_threshold_critical", DefaultDevsRiskThresholdCritical) + viperCfg.SetDefault("history.devs.risk_threshold_high", DefaultDevsRiskThresholdHigh) + viperCfg.SetDefault("history.devs.risk_threshold_medium", DefaultDevsRiskThresholdMedium) + viperCfg.SetDefault("history.devs.active_threshold_ratio", DefaultDevsActiveThresholdRatio) + viperCfg.SetDefault("history.devs.default_active_days", DefaultDevsDefaultActiveDays) + viperCfg.SetDefault("history.devs.hll_precision", DefaultDevsHLLPrecision) +} + +func applyOtherHistoryDefaults(viperCfg *viper.Viper) { + viperCfg.SetDefault("history.file_history.hotspot_threshold_critical", DefaultFileHistoryHotspotCritical) + viperCfg.SetDefault("history.file_history.hotspot_threshold_high", DefaultFileHistoryHotspotHigh) + viperCfg.SetDefault("history.file_history.hotspot_threshold_medium", DefaultFileHistoryHotspotMedium) viperCfg.SetDefault("history.imports.goroutines", DefaultImportsGoroutines) viperCfg.SetDefault("history.imports.max_file_size", DefaultImportsMaxFileSize) + viperCfg.SetDefault("history.imports.max_dependency_risk_rows", DefaultImportsMaxDependencyRiskRows) viperCfg.SetDefault("history.sentiment.min_comment_length", DefaultSentimentMinCommentLength) viperCfg.SetDefault("history.sentiment.gap", DefaultSentimentGap) + viperCfg.SetDefault("history.sentiment.neutralizer_weight", DefaultSentimentNeutralizerWeight) + viperCfg.SetDefault("history.sentiment.max_weight_ratio", DefaultSentimentMaxWeightRatio) + viperCfg.SetDefault("history.sentiment.positive_threshold", DefaultSentimentPositiveThreshold) + viperCfg.SetDefault("history.sentiment.negative_threshold", DefaultSentimentNegativeThreshold) + viperCfg.SetDefault("history.sentiment.trend_threshold", DefaultSentimentTrendThreshold) + 
viperCfg.SetDefault("history.sentiment.low_sentiment_risk_threshold", DefaultSentimentLowSentimentRiskThresh) viperCfg.SetDefault("history.shotness.dsl_struct", DefaultShotnessDSLStruct) viperCfg.SetDefault("history.shotness.dsl_name", DefaultShotnessDSLName) @@ -109,6 +183,20 @@ func applyDefaults(viperCfg *viper.Viper) { viperCfg.SetDefault("history.anomaly.threshold", DefaultAnomalyThreshold) viperCfg.SetDefault("history.anomaly.window_size", DefaultAnomalyWindowSize) + viperCfg.SetDefault("history.clones.max_clone_pairs", DefaultClonesMaxClonePairs) + viperCfg.SetDefault("history.clones.num_hashes", DefaultClonesNumHashes) + viperCfg.SetDefault("history.clones.num_bands", DefaultClonesNumBands) + viperCfg.SetDefault("history.clones.num_rows", DefaultClonesNumRows) + viperCfg.SetDefault("history.clones.shingle_size", DefaultClonesShingleSize) + viperCfg.SetDefault("history.clones.similarity_type2", DefaultClonesSimilarityType2) + viperCfg.SetDefault("history.clones.similarity_type3", DefaultClonesSimilarityType3) + viperCfg.SetDefault("history.clones.threshold_ratio_yellow", DefaultClonesThresholdRatioYellow) + viperCfg.SetDefault("history.clones.threshold_ratio_red", DefaultClonesThresholdRatioRed) + viperCfg.SetDefault("history.clones.threshold_pairs_yellow", DefaultClonesThresholdPairsYellow) + viperCfg.SetDefault("history.clones.threshold_pairs_red", DefaultClonesThresholdPairsRed) +} + +func applyCheckpointDefaults(viperCfg *viper.Viper) { viperCfg.SetDefault("checkpoint.enabled", DefaultCheckpointEnabled) viperCfg.SetDefault("checkpoint.dir", DefaultCheckpointDir) viperCfg.SetDefault("checkpoint.resume", DefaultCheckpointResume) diff --git a/internal/framework/blob_pipeline.go b/internal/framework/blob_pipeline.go index 5ed17c3..84170a8 100644 --- a/internal/framework/blob_pipeline.go +++ b/internal/framework/blob_pipeline.go @@ -53,6 +53,9 @@ type BlobPipeline struct { BlobCache *cache.LRUBlobCache ArenaSize int + // MaxChanges caps the number of file 
changes per commit. NewBlobPipelineWithCache sets the default; processBatch uses the value as-is, so zero is not defaulted. + MaxChanges int + // Metrics provides per-stage counters for memory triage. Nil-safe. Metrics *StageMetrics @@ -93,6 +96,7 @@ func NewBlobPipelineWithCache( p := &BlobPipeline{ SeqWorkerChan: seqChan, PoolWorkerChan: poolChan, + MaxChanges: maxChangesPerCommit, BufferSize: bufferSize, WorkerCount: workerCount, BlobCache: blobCache, @@ -245,9 +249,10 @@ func (p *BlobPipeline) processBatch( // Skip monster commits (vendor moves, mass renames) by setting // ErrCommitTooLarge. The runner detects this and skips the commit // instead of aborting the pipeline. - if len(resp.Changes) > maxChangesPerCommit { + changeCap := p.MaxChanges + if len(resp.Changes) > changeCap { log.Printf("blob pipeline: skipping commit %s (%d changes > %d cap)", - job.commit.Hash(), len(resp.Changes), maxChangesPerCommit) + job.commit.Hash(), len(resp.Changes), changeCap) bJob.data.Changes = nil bJob.data.Error = fmt.Errorf("%w: %s has %d changes", ErrCommitTooLarge, job.commit.Hash(), len(resp.Changes)) diff --git a/internal/framework/config.go b/internal/framework/config.go index 6fe4063..d2110f6 100644 --- a/internal/framework/config.go +++ b/internal/framework/config.go @@ -4,6 +4,7 @@ import ( "errors" "fmt" "strings" + "time" "github.com/dustin/go-humanize" @@ -28,6 +29,33 @@ type ConfigParams struct { MemoryBudget string GCPercent int BallastSize string + + // Advanced pipeline tuning (zero = use defaults). + UASTSpillThreshold int + IntraCommitParallelThreshold int + MaxIntraCommitWorkers int + MaxUASTBlobSize int + UASTParseTimeout string + MaxChangesPerCommit int + MaxDiffBatchSize int + MemoryBudgetRatio int + MemoryBudgetCap string + MemoryLimitRatio int + + // Extended pipeline tuning.
+ UASTSpillTrimInterval int + NativeTrimInterval int + MaxStreamingBuffering int + DrainPrefetchTimeout string + SamplerInterval string + WorkerRatio int + UASTWorkerRatio int + LeafWorkerDivisor int + MinLeafWorkers int + BufferSizeMultiplier int + BudgetLimitRatio int + SystemRAMLimitRatio int + DiffJobBufferMultiplier int } // CheckpointParams holds checkpoint-related configuration. @@ -54,16 +82,31 @@ const percentDenominator = 100 const defaultMemoryBudgetCap = int64(2 * 1024 * 1024 * 1024) // DefaultMemoryBudget returns a sensible memory budget based on available system memory. -// Returns min(50% of total RAM, 4 GiB), or 0 if detection fails. +// Returns min(50% of total RAM, 2 GiB), or 0 if detection fails. func DefaultMemoryBudget() int64 { + return DefaultMemoryBudgetWithParams(defaultMemoryBudgetRatio, "") +} + +// DefaultMemoryBudgetWithParams returns a memory budget with configurable ratio and cap. +// Empty cap string uses defaultMemoryBudgetCap. +func DefaultMemoryBudgetWithParams(ratio int, capStr string) int64 { total := detectTotalMemoryBytes() if total == 0 { return 0 } - budget := safeconv.SafeInt64(total * defaultMemoryBudgetRatio / percentDenominator) + budgetCap := defaultMemoryBudgetCap + + if capStr != "" { + parsed, err := humanize.ParseBytes(capStr) + if err == nil && parsed > 0 { + budgetCap = safeconv.SafeInt64(parsed) + } + } + + budget := safeconv.SafeInt64(total * uint64(ratio) / percentDenominator) - return min(budget, defaultMemoryBudgetCap) + return min(budget, budgetCap) } // BuildConfigFromParams builds a CoordinatorConfig from raw parameters. 
@@ -99,13 +142,23 @@ func BuildConfigFromParams(params ConfigParams, budgetSolver BudgetSolver) (Coor return config, 0, sizeErr } + advErr := applyAdvancedParams(&config, params) + if advErr != nil { + return config, 0, advErr + } + tuningErr := applyRuntimeTuningParams(&config, params.GCPercent, params.BallastSize) if tuningErr != nil { return config, 0, tuningErr } // Auto-detect memory budget from system memory when not explicitly set. - memBudget := DefaultMemoryBudget() + budgetRatio := params.MemoryBudgetRatio + if budgetRatio == 0 { + budgetRatio = defaultMemoryBudgetRatio + } + + memBudget := DefaultMemoryBudgetWithParams(budgetRatio, params.MemoryBudgetCap) return config, memBudget, nil } @@ -140,6 +193,113 @@ func applyIntParams(config *CoordinatorConfig, params ConfigParams) { if params.DiffCacheSize > 0 { config.DiffCacheSize = params.DiffCacheSize } + + if params.UASTSpillThreshold > 0 { + config.UASTSpillThreshold = params.UASTSpillThreshold + } + + if params.IntraCommitParallelThreshold > 0 { + config.IntraCommitParallelThreshold = params.IntraCommitParallelThreshold + } + + if params.MaxIntraCommitWorkers > 0 { + config.MaxIntraCommitWorkers = params.MaxIntraCommitWorkers + } + + if params.MaxUASTBlobSize > 0 { + config.MaxUASTBlobSize = params.MaxUASTBlobSize + } + + if params.MaxChangesPerCommit > 0 { + config.MaxChangesPerCommit = params.MaxChangesPerCommit + } + + if params.MaxDiffBatchSize > 0 { + config.MaxDiffBatchSize = params.MaxDiffBatchSize + } + + if params.MemoryLimitRatio > 0 { + config.MemoryLimitRatio = params.MemoryLimitRatio + } + + applyExtendedIntParams(config, params) +} + +func applyExtendedIntParams(config *CoordinatorConfig, params ConfigParams) { + if params.UASTSpillTrimInterval > 0 { + config.UASTSpillTrimInterval = params.UASTSpillTrimInterval + } + + if params.NativeTrimInterval > 0 { + config.NativeTrimInterval = params.NativeTrimInterval + } + + if params.MaxStreamingBuffering > 0 { + config.MaxStreamingBuffering = 
params.MaxStreamingBuffering + } + + if params.WorkerRatio > 0 { + config.WorkerRatio = params.WorkerRatio + } + + if params.UASTWorkerRatio > 0 { + config.UASTWorkerRatio = params.UASTWorkerRatio + } + + if params.LeafWorkerDivisor > 0 { + config.LeafWorkerDivisor = params.LeafWorkerDivisor + } + + if params.MinLeafWorkers > 0 { + config.MinLeafWorkers = params.MinLeafWorkers + } + + if params.BufferSizeMultiplier > 0 { + config.BufferSizeMultiplier = params.BufferSizeMultiplier + } + + if params.BudgetLimitRatio > 0 { + config.BudgetLimitRatio = params.BudgetLimitRatio + } + + if params.SystemRAMLimitRatio > 0 { + config.SystemRAMLimitRatio = params.SystemRAMLimitRatio + } + + if params.DiffJobBufferMultiplier > 0 { + config.DiffJobBufferMultiplier = params.DiffJobBufferMultiplier + } +} + +func applyAdvancedParams(config *CoordinatorConfig, params ConfigParams) error { + if params.UASTParseTimeout != "" { + d, err := time.ParseDuration(params.UASTParseTimeout) + if err != nil { + return fmt.Errorf("%w for uast-parse-timeout: %s", ErrInvalidSizeFormat, params.UASTParseTimeout) + } + + config.UASTParseTimeout = d + } + + if params.DrainPrefetchTimeout != "" { + d, err := time.ParseDuration(params.DrainPrefetchTimeout) + if err != nil { + return fmt.Errorf("%w for drain-prefetch-timeout: %s", ErrInvalidSizeFormat, params.DrainPrefetchTimeout) + } + + config.DrainPrefetchTimeout = d + } + + if params.SamplerInterval != "" { + d, err := time.ParseDuration(params.SamplerInterval) + if err != nil { + return fmt.Errorf("%w for sampler-interval: %s", ErrInvalidSizeFormat, params.SamplerInterval) + } + + config.SamplerInterval = d + } + + return nil } func applySizeParams(config *CoordinatorConfig, params ConfigParams) error { diff --git a/internal/framework/coordinator.go b/internal/framework/coordinator.go index e290fb1..1469445 100644 --- a/internal/framework/coordinator.go +++ b/internal/framework/coordinator.go @@ -138,6 +138,72 @@ type CoordinatorConfig struct { // 
WorkerTimeout is the maximum time to wait for a worker response before // considering it stalled. Set to 0 to disable the watchdog. WorkerTimeout time.Duration + + // Advanced pipeline tuning. DefaultCoordinatorConfig fills in the package-level defaults; not every consumer falls back when a field is zero. + + // UASTSpillThreshold is the number of file changes above which the UAST pipeline + // spills parsed trees to disk to cap memory. + UASTSpillThreshold int + + // IntraCommitParallelThreshold is the minimum number of file changes for intra-commit parallelism. + IntraCommitParallelThreshold int + + // MaxIntraCommitWorkers caps the goroutine count for parsing files within a single commit. + MaxIntraCommitWorkers int + + // MaxUASTBlobSize is the maximum blob size (in bytes) for UAST parsing. + MaxUASTBlobSize int + + // UASTParseTimeout is the per-file UAST parse timeout. + UASTParseTimeout time.Duration + + // MaxChangesPerCommit caps the number of file changes per commit for blob loading. + MaxChangesPerCommit int + + // MaxDiffBatchSize is the maximum number of diff requests per batch. + MaxDiffBatchSize int + + // MemoryLimitRatio is the fraction of system memory to use as the soft limit (percentage). + MemoryLimitRatio int + + // UASTSpillTrimInterval controls MallocTrim frequency during UAST spill-mode parsing. + UASTSpillTrimInterval int + + // NativeTrimInterval controls malloc_trim frequency within a chunk. + NativeTrimInterval int + + // MaxStreamingBuffering is the maximum buffering factor for RunStreaming (triple-buffering). + MaxStreamingBuffering int + + // DrainPrefetchTimeout is the timeout for abandoning prefetch goroutines. + DrainPrefetchTimeout time.Duration + + // SamplerInterval is the polling interval for the pipeline sampler. + SamplerInterval time.Duration + + // WorkerRatio is the fraction of CPU cores to use for workers (percentage). + WorkerRatio int + + // UASTWorkerRatio is the fraction of CPU cores to use for UAST pipeline workers (percentage).
+ UASTWorkerRatio int + + // LeafWorkerDivisor controls default leaf workers: NumCPU / divisor. + LeafWorkerDivisor int + + // MinLeafWorkers is the minimum number of leaf workers when enabled. + MinLeafWorkers int + + // BufferSizeMultiplier scales buffer size with worker count. + BufferSizeMultiplier int + + // BudgetLimitRatio is the budget-to-memory-limit conversion ratio (percentage). + BudgetLimitRatio int + + // SystemRAMLimitRatio caps the memory limit at this fraction of system RAM (percentage). + SystemRAMLimitRatio int + + // DiffJobBufferMultiplier scales the diff job queue buffer. + DiffJobBufferMultiplier int } // DefaultCoordinatorConfig returns the default coordinator configuration. @@ -162,6 +228,29 @@ func DefaultCoordinatorConfig() CoordinatorConfig { BlobArenaSize: defaultBlobArenaBytes, GCPercent: 0, BallastSize: 0, + + // Advanced pipeline tuning — actual defaults, not zero sentinels. + UASTSpillThreshold: uastSpillThreshold, + IntraCommitParallelThreshold: intraCommitParallelThreshold, + MaxIntraCommitWorkers: defaultMaxIntraCommitWorkers, + MaxUASTBlobSize: maxUASTBlobSize, + UASTParseTimeout: defaultParseTimeout, + MaxChangesPerCommit: maxChangesPerCommit, + MaxDiffBatchSize: defaultMaxDiffBatchSize, + MemoryLimitRatio: memoryLimitRatio, + UASTSpillTrimInterval: uastSpillTrimInterval, + NativeTrimInterval: nativeTrimInterval, + MaxStreamingBuffering: maxStreamingBuffering, + DrainPrefetchTimeout: drainPrefetchTimeout, + SamplerInterval: samplerInterval, + WorkerRatio: optimalWorkerRatio, + UASTWorkerRatio: uastPipelineWorkerRatio, + LeafWorkerDivisor: leafWorkerDivisor, + MinLeafWorkers: minLeafWorkers, + BufferSizeMultiplier: bufferSizeMultiplier, + BudgetLimitRatio: budgetLimitRatio, + SystemRAMLimitRatio: systemRAMLimitRatio, + DiffJobBufferMultiplier: diffJobBufferMultiplier, } } @@ -264,22 +353,9 @@ func NewCoordinator(repo *gitlib.Repository, config CoordinatorConfig) *Coordina diffCache = NewDiffCache(config.DiffCacheSize) } - 
blobPipeline := NewBlobPipelineWithCache(seqChan, poolChan, config.BufferSize, config.Workers, blobCache) - if config.BlobArenaSize > 0 { - blobPipeline.ArenaSize = config.BlobArenaSize - } - - diffPipeline := NewDiffPipelineWithCache(poolChan, config.BufferSize, diffCache) - - // Create UAST pipeline if workers are configured. - var uastPipeline *UASTPipeline - - if config.UASTPipelineWorkers > 0 { - parser, err := uast.NewParser() - if err == nil { - uastPipeline = NewUASTPipeline(parser, config.UASTPipelineWorkers, config.BufferSize) - } - } + blobPipeline := newBlobPipelineFromConfig(seqChan, poolChan, config, blobCache) + diffPipeline := newDiffPipelineFromConfig(poolChan, config, diffCache) + uastPipeline := newUASTPipelineFromConfig(config) return &Coordinator{ repo: repo, @@ -302,6 +378,78 @@ func NewCoordinator(repo *gitlib.Repository, config CoordinatorConfig) *Coordina } } +func newBlobPipelineFromConfig( + seqChan, poolChan chan gitlib.WorkerRequest, + config CoordinatorConfig, blobCache *cache.LRUBlobCache, +) *BlobPipeline { + p := NewBlobPipelineWithCache(seqChan, poolChan, config.BufferSize, config.Workers, blobCache) + + if config.BlobArenaSize > 0 { + p.ArenaSize = config.BlobArenaSize + } + + if config.MaxChangesPerCommit > 0 { + p.MaxChanges = config.MaxChangesPerCommit + } + + return p +} + +func newDiffPipelineFromConfig( + poolChan chan gitlib.WorkerRequest, config CoordinatorConfig, diffCache *DiffCache, +) *DiffPipeline { + p := NewDiffPipelineWithCache(poolChan, config.BufferSize, diffCache) + + if config.MaxDiffBatchSize > 0 { + p.MaxBatchSize = config.MaxDiffBatchSize + } + + if config.DiffJobBufferMultiplier > 0 { + p.JobBufferMultiplier = config.DiffJobBufferMultiplier + } + + return p +} + +func newUASTPipelineFromConfig(config CoordinatorConfig) *UASTPipeline { + if config.UASTPipelineWorkers <= 0 { + return nil + } + + parser, err := uast.NewParser() + if err != nil { + return nil + } + + p := NewUASTPipeline(parser, 
config.UASTPipelineWorkers, config.BufferSize) + + if config.UASTSpillThreshold > 0 { + p.SpillThreshold = config.UASTSpillThreshold + } + + if config.IntraCommitParallelThreshold > 0 { + p.IntraCommitParallelThresh = config.IntraCommitParallelThreshold + } + + if config.MaxIntraCommitWorkers > 0 { + p.MaxIntraCommitWorkers = config.MaxIntraCommitWorkers + } + + if config.MaxUASTBlobSize > 0 { + p.MaxBlobSize = config.MaxUASTBlobSize + } + + if config.UASTParseTimeout > 0 { + p.ParseTimeout = config.UASTParseTimeout + } + + if config.UASTSpillTrimInterval > 0 { + p.SpillTrimInterval = config.UASTSpillTrimInterval + } + + return p +} + // Stats returns the pipeline stats collected during Process(). // Only valid after the channel returned by Process() is fully drained. func (c *Coordinator) Stats() PipelineStats { @@ -312,9 +460,9 @@ func applyRuntimeTuning(config CoordinatorConfig, memBudgetOverride int64) []byt applyGCPercent(config.GCPercent) if memBudgetOverride > 0 { - applyMemoryLimitFromBudget(memBudgetOverride) + applyMemoryLimitFromBudget(memBudgetOverride, config.BudgetLimitRatio, config.SystemRAMLimitRatio) } else { - applyMemoryLimit() + applyMemoryLimitWithRatio(config.MemoryLimitRatio) } return applyBallast(config.BallastSize) @@ -331,16 +479,16 @@ const systemRAMLimitRatio = 90 // applyMemoryLimitFromBudget sets Go's soft memory limit to a fraction of the // user's memory budget. Capped at 90% of system RAM to prevent GC thrashing // when the budget exceeds available memory. 
-func applyMemoryLimitFromBudget(budget int64) { - limit := resolveMemoryLimitFromBudget(budget, detectTotalMemoryBytes()) +func applyMemoryLimitFromBudget(budget int64, budgetRatio, systemRatio int) { + limit := resolveMemoryLimitFromBudget(budget, detectTotalMemoryBytes(), budgetRatio, systemRatio) debug.SetMemoryLimit(safeconv.SafeInt64(limit)) } -func resolveMemoryLimitFromBudget(budget int64, totalMemoryBytes uint64) uint64 { - budgetBased := uint64(budget) * budgetLimitRatio / percentDivisor +func resolveMemoryLimitFromBudget(budget int64, totalMemoryBytes uint64, budgetRatioVal, systemRatioVal int) uint64 { + budgetBased := uint64(budget) * uint64(budgetRatioVal) / percentDivisor if totalMemoryBytes > 0 { - systemCap := totalMemoryBytes * systemRAMLimitRatio / percentDivisor + systemCap := totalMemoryBytes * uint64(systemRatioVal) / percentDivisor return min(budgetBased, systemCap) } @@ -348,21 +496,20 @@ func resolveMemoryLimitFromBudget(budget int64, totalMemoryBytes uint64) uint64 return budgetBased } -// applyMemoryLimit sets Go's soft memory limit based on available system memory. -// Uses 75% of system memory (capped at 4 GiB) to trigger aggressive GC before OOM. -// Go's GC uses this as a target: when heap approaches the limit, GC runs more -// frequently regardless of GOGC. This prevents OOM on large analysis workloads. -func applyMemoryLimit() { - limit := resolveMemoryLimit(detectTotalMemoryBytes()) +// applyMemoryLimitWithRatio sets Go's soft memory limit based on available system memory. +// Uses the given ratio (percent) of system memory (capped at defaultMemoryLimitBytes). +// The ratio is not defaulted here: zero resolves to a zero limit when system memory is detected, so pass a positive percentage (default 75).
+func applyMemoryLimitWithRatio(ratio int) { + limit := resolveMemoryLimitWithRatio(detectTotalMemoryBytes(), ratio) debug.SetMemoryLimit(safeconv.SafeInt64(limit)) } -func resolveMemoryLimit(totalMemoryBytes uint64) uint64 { +func resolveMemoryLimitWithRatio(totalMemoryBytes uint64, ratio int) uint64 { if totalMemoryBytes == 0 { return defaultMemoryLimitBytes } - systemBased := totalMemoryBytes * memoryLimitRatio / percentDivisor + systemBased := totalMemoryBytes * uint64(ratio) / percentDivisor return min(systemBased, defaultMemoryLimitBytes) } diff --git a/internal/framework/diff_pipeline.go b/internal/framework/diff_pipeline.go index e8da5b1..9112ef4 100644 --- a/internal/framework/diff_pipeline.go +++ b/internal/framework/diff_pipeline.go @@ -35,6 +35,12 @@ type DiffPipeline struct { BufferSize int DiffCache *DiffCache + // MaxBatchSize is the maximum number of diff requests per batch. + MaxBatchSize int + + // JobBufferMultiplier scales the job buffer relative to pipeline buffer size. + JobBufferMultiplier int + // NoBatch disables cross-commit batching. Each diff request fires immediately. // Useful for debugging or single-commit analysis. NoBatch bool @@ -61,9 +67,11 @@ func NewDiffPipelineWithCache(workerChan chan<- gitlib.WorkerRequest, bufferSize } p := &DiffPipeline{ - PoolWorkerChan: workerChan, - BufferSize: bufferSize, - DiffCache: cache, + PoolWorkerChan: workerChan, + BufferSize: bufferSize, + MaxBatchSize: defaultMaxDiffBatchSize, + JobBufferMultiplier: diffJobBufferMultiplier, + DiffCache: cache, } p.dispatch = pipeline.DispatchFunc[gitlib.WorkerRequest](func(ctx context.Context, req gitlib.WorkerRequest) error { @@ -113,12 +121,8 @@ type diffJob struct { // Process receives blob data and outputs commit data with computed diffs. func (p *DiffPipeline) Process(ctx context.Context, blobs <-chan BlobData) <-chan CommitData { - // diffJobBufferMultiplier scales the job buffer relative to pipeline buffer size. 
- // A larger buffer allows accumulating more diff jobs for cross-commit batching. - const diffJobBufferMultiplier = 10 - pc := pipeline.RunPC[<-chan BlobData, CommitData, diffJob]{ - Buffer: p.BufferSize * diffJobBufferMultiplier, + Buffer: p.BufferSize * p.JobBufferMultiplier, Produce: p.runDiffProducer, Consume: p.runDiffConsumer, } @@ -133,13 +137,11 @@ func (p *DiffPipeline) runDiffProducer(ctx context.Context, blobs <-chan BlobDat // or until input channel is dry. // Since BlobPipeline emits BlobData which already contains multiple diffs per commit, // we are effectively re-batching across commits. - const maxBatchSize = 1000 - var batcher pipeline.Batcher[gitlib.DiffRequest, []gitlib.DiffRequest] if p.NoBatch { batcher = &pipeline.PassthroughBatcher[gitlib.DiffRequest]{} } else { - batcher = pipeline.NewThresholdBatcher[gitlib.DiffRequest](maxBatchSize) + batcher = pipeline.NewThresholdBatcher[gitlib.DiffRequest](p.MaxBatchSize) } var pendingJobs []*diffJob @@ -455,3 +457,9 @@ func (p *DiffPipeline) fileDiffFromGoDiff(oldBlob, newBlob *gitlib.CachedBlob, o Diffs: diffs, } } + +// defaultMaxDiffBatchSize is the default maximum number of diff requests per batch. +const defaultMaxDiffBatchSize = 1000 + +// diffJobBufferMultiplier scales the job buffer relative to pipeline buffer size. +const diffJobBufferMultiplier = 10 diff --git a/internal/framework/export_test.go b/internal/framework/export_test.go index 6b65f9e..1ffe898 100644 --- a/internal/framework/export_test.go +++ b/internal/framework/export_test.go @@ -261,12 +261,12 @@ func RunnerBallastSizeForTest(runner *Runner) int { // ResolveMemoryLimitForTest exposes memory limit resolution logic. func ResolveMemoryLimitForTest(totalMemoryBytes uint64) uint64 { - return resolveMemoryLimit(totalMemoryBytes) + return resolveMemoryLimitWithRatio(totalMemoryBytes, memoryLimitRatio) } // ResolveMemoryLimitFromBudgetForTest exposes budget-aligned memory limit logic. 
-func ResolveMemoryLimitFromBudgetForTest(budget int64, totalMemoryBytes uint64) uint64 { - return resolveMemoryLimitFromBudget(budget, totalMemoryBytes) +func ResolveMemoryLimitFromBudgetForTest(budget int64, totalMemoryBytes uint64, budgetRatio, systemRatio int) uint64 { + return resolveMemoryLimitFromBudget(budget, totalMemoryBytes, budgetRatio, systemRatio) } // SplitLeavesForTest exposes the three-group leaf split for testing. diff --git a/internal/framework/runner.go b/internal/framework/runner.go index 7ee4f2c..0a8f3da 100644 --- a/internal/framework/runner.go +++ b/internal/framework/runner.go @@ -970,7 +970,7 @@ func (runner *Runner) processCommitsSerial( // Periodically release native (C malloc) memory back to the OS to prevent // tree-sitter/libgit2 fragmentation from accumulating within a chunk. - if commitIdx%nativeTrimInterval == 0 { + if runner.Config.NativeTrimInterval > 0 && commitIdx%runner.Config.NativeTrimInterval == 0 { gitlib.ReleaseNativeMemory() } } @@ -1374,7 +1374,7 @@ func (runner *Runner) hybridCommitLoop( // Periodically release native (C malloc) memory back to the OS to prevent // tree-sitter/libgit2 fragmentation from accumulating within a chunk. - if commitIdx%nativeTrimInterval == 0 { + if runner.Config.NativeTrimInterval > 0 && commitIdx%runner.Config.NativeTrimInterval == 0 { gitlib.ReleaseNativeMemory() } } diff --git a/internal/framework/runner_test.go b/internal/framework/runner_test.go index 0c15309..3336dbe 100644 --- a/internal/framework/runner_test.go +++ b/internal/framework/runner_test.go @@ -286,7 +286,7 @@ func TestResolveMemoryLimitFromBudget_SetsBudgetBased(t *testing.T) { totalRAM = uint64(32 * 1024 * 1024 * 1024) // 32 GiB system. ) - got := framework.ResolveMemoryLimitFromBudgetForTest(budget, totalRAM) + got := framework.ResolveMemoryLimitFromBudgetForTest(budget, totalRAM, 95, 90) // 95% of 4 GiB = 3.8 GiB. System cap = 90% of 32 GiB = 28.8 GiB. Min = 3.8 GiB. 
want := uint64(budget) * 95 / 100 @@ -303,7 +303,7 @@ func TestResolveMemoryLimitFromBudget_CappedAtSystemRAM(t *testing.T) { totalRAM = uint64(8 * 1024 * 1024 * 1024) // 8 GiB system (budget > system). ) - got := framework.ResolveMemoryLimitFromBudgetForTest(budget, totalRAM) + got := framework.ResolveMemoryLimitFromBudgetForTest(budget, totalRAM, 95, 90) // 95% of 16 GiB = 15.2 GiB, but capped at 90% of 8 GiB = 7.2 GiB. want := totalRAM * 90 / 100 @@ -315,7 +315,7 @@ func TestResolveMemoryLimitFromBudget_CappedAtSystemRAM(t *testing.T) { func TestResolveMemoryLimitFromBudget_ZeroBudget(t *testing.T) { t.Parallel() - got := framework.ResolveMemoryLimitFromBudgetForTest(0, 32*1024*1024*1024) + got := framework.ResolveMemoryLimitFromBudgetForTest(0, 32*1024*1024*1024, 95, 90) if got != 0 { t.Fatalf("memory limit = %d, want 0 for zero budget", got) } diff --git a/internal/framework/sampler.go b/internal/framework/sampler.go index da2a84b..bb96008 100644 --- a/internal/framework/sampler.go +++ b/internal/framework/sampler.go @@ -40,12 +40,16 @@ type SamplerConfig struct { DumpDir string ChunkIndex int MemBudget int64 - ProfileAtRSS int64 // RSS in bytes at which to capture t1 profile. 0 = disabled. + ProfileAtRSS int64 // RSS in bytes at which to capture t1 profile. 0 = disabled. + Interval time.Duration // Polling interval. Zero uses default. } // NewPipelineSampler creates a sampler. Call Start to begin periodic logging. 
func NewPipelineSampler(cfg SamplerConfig) *PipelineSampler { - interval := samplerInterval + interval := cfg.Interval + if interval <= 0 { + interval = samplerInterval + } return &PipelineSampler{ logger: cfg.Logger, diff --git a/internal/framework/streaming.go b/internal/framework/streaming.go index 33aa05b..a0e3db2 100644 --- a/internal/framework/streaming.go +++ b/internal/framework/streaming.go @@ -149,7 +149,7 @@ func RunStreaming( PipelineOverhead: pipelineOverhead, WorkStatePerCommit: workStatePerCommit, AvgTCSize: avgTCSize, - MaxBuffering: maxStreamingBuffering, + MaxBuffering: runner.Config.MaxStreamingBuffering, }) chunks := schedule.Chunks @@ -736,7 +736,7 @@ func processChunksWithCheckpoint( } samplerCtx, samplerCancel := context.WithCancel(ctx) - sampler := startChunkSampler(samplerCtx, logger, runner.StageMetrics, i, memBudget) + sampler := startChunkSampler(samplerCtx, logger, runner.StageMetrics, i, memBudget, runner.Config.SamplerInterval) before := observability.TakeHeapSnapshot() @@ -850,7 +850,7 @@ func processChunksFromIterator( } samplerCtx, samplerCancel := context.WithCancel(ctx) - sampler := startChunkSampler(samplerCtx, logger, runner.StageMetrics, i, memBudget) + sampler := startChunkSampler(samplerCtx, logger, runner.StageMetrics, i, memBudget, runner.Config.SamplerInterval) before := observability.TakeHeapSnapshot() @@ -962,7 +962,10 @@ func loadCommitsFromIterator(iter *gitlib.CommitIter, n int) ([]*gitlib.Commit, // startChunkSampler creates and starts a PipelineSampler for a chunk. // Returns the sampler (caller must call CaptureT1 + cancel the context when done). 
-func startChunkSampler(ctx context.Context, logger *slog.Logger, metrics *StageMetrics, chunkIdx int, memBudget int64) *PipelineSampler { +func startChunkSampler( + ctx context.Context, logger *slog.Logger, metrics *StageMetrics, + chunkIdx int, memBudget int64, interval time.Duration, +) *PipelineSampler { sampler := NewPipelineSampler(SamplerConfig{ Logger: logger, Metrics: metrics, @@ -970,6 +973,7 @@ func startChunkSampler(ctx context.Context, logger *slog.Logger, metrics *StageM ChunkIndex: chunkIdx, MemBudget: memBudget, ProfileAtRSS: memBudget * profileRSSPercent / percentDivisor, // Capture t1 at 90% of budget. + Interval: interval, }) sampler.Start(ctx) @@ -1111,7 +1115,7 @@ func processChunksDoubleBuffered( dur, pStats, err := st.processCurrentChunk(ctx, idx, startChunk) if err != nil { samplerCancel() - drainPrefetch(prefetch) + drainPrefetch(prefetch, st.runner.Config.DrainPrefetchTimeout) return stats, err } @@ -1128,7 +1132,7 @@ func processChunksDoubleBuffered( cbErr := st.invokeOnChunkComplete(idx + 1) if cbErr != nil { - drainPrefetch(prefetch) + drainPrefetch(prefetch, st.runner.Config.DrainPrefetchTimeout) return stats, cbErr } @@ -1152,7 +1156,7 @@ func (st *doubleBufferState) startSampler(_, samplerCtx context.Context, idx int st.runner.StageMetrics.Reset() } - return startChunkSampler(samplerCtx, st.logger, st.runner.StageMetrics, idx, st.memBudget) + return startChunkSampler(samplerCtx, st.logger, st.runner.StageMetrics, idx, st.memBudget, st.runner.Config.SamplerInterval) } // stopSampler captures the T1 snapshot, cancels the sampler context, and @@ -1231,7 +1235,7 @@ func (st *doubleBufferState) replanAndDrainStale( // If next chunk boundaries changed, drain stale prefetch. 
newNext := safeChunkAt(newChunks, idx+1) if prefetch != nil && !chunksEqual(prefetchedNext, newNext) { - drainPrefetch(prefetch) + drainPrefetch(prefetch, st.runner.Config.DrainPrefetchTimeout) prefetch = nil } } @@ -1359,15 +1363,19 @@ const drainPrefetchTimeout = 30 * time.Second // drainPrefetch waits for a pending prefetch to complete (if any) to prevent // goroutine leaks. The result is discarded. If the prefetch does not complete -// within drainPrefetchTimeout, it is abandoned. -func drainPrefetch(ch <-chan prefetchedChunk) { +// within the given timeout, it is abandoned. +func drainPrefetch(ch <-chan prefetchedChunk, timeout time.Duration) { if ch == nil { return } + if timeout <= 0 { + timeout = drainPrefetchTimeout + } + select { case <-ch: - case <-time.After(drainPrefetchTimeout): + case <-time.After(timeout): } } diff --git a/internal/framework/uast_pipeline.go b/internal/framework/uast_pipeline.go index 114ace4..8343078 100644 --- a/internal/framework/uast_pipeline.go +++ b/internal/framework/uast_pipeline.go @@ -25,6 +25,13 @@ type UASTPipeline struct { Workers int BufferSize int PathFilter *pathfilter.Filter + + SpillThreshold int + IntraCommitParallelThresh int + MaxIntraCommitWorkers int + MaxBlobSize int + ParseTimeout time.Duration + SpillTrimInterval int } // NewUASTPipeline creates a new UAST pipeline stage. 
@@ -38,10 +45,16 @@ func NewUASTPipeline(parser *uast.Parser, workers, bufferSize int) *UASTPipeline } return &UASTPipeline{ - Parser: parser, - Workers: workers, - BufferSize: bufferSize, - PathFilter: pathfilter.New(), + Parser: parser, + Workers: workers, + BufferSize: bufferSize, + PathFilter: pathfilter.New(), + SpillThreshold: uastSpillThreshold, + IntraCommitParallelThresh: intraCommitParallelThreshold, + MaxIntraCommitWorkers: defaultMaxIntraCommitWorkers, + MaxBlobSize: maxUASTBlobSize, + ParseTimeout: defaultParseTimeout, + SpillTrimInterval: uastSpillTrimInterval, } } @@ -113,8 +126,9 @@ func (p *UASTPipeline) startWorkers(ctx context.Context, jobs <-chan *uastSlot) go func() { defer wg.Done() + spillThresh := p.SpillThreshold for slot := range jobs { - if len(slot.data.Changes) > uastSpillThreshold { + if len(slot.data.Changes) > spillThresh { path, err := p.parseCommitAndSpill(ctx, slot.data.Changes, slot.data.BlobCache) if err != nil { log.Printf("UAST spill error: %v", err) @@ -167,6 +181,10 @@ const uastSpillThreshold = 32 // Every N files, reclaim C arena pages to prevent fragmentation buildup. const uastSpillTrimInterval = 16 +// defaultMaxIntraCommitWorkers caps the goroutine count for parsing files within +// a single commit. Keeping this small avoids excessive concurrency. +const defaultMaxIntraCommitWorkers = 4 + // intraCommitParallelThreshold is the minimum number of file changes in a commit // before intra-commit parallelism is used. Below this, sequential parsing is faster. 
const intraCommitParallelThreshold = 4 @@ -189,7 +207,7 @@ func (p *UASTPipeline) parseCommitChanges( return nil } - if len(changes) <= intraCommitParallelThreshold { + if len(changes) <= p.IntraCommitParallelThresh { return p.parseCommitSequential(ctx, changes, cache) } @@ -237,11 +255,7 @@ func (p *UASTPipeline) parseCommitParallel( jobs := make(chan *gitlib.Change, len(changes)) results := make(chan uastFileResult, len(changes)) - // maxIntraCommitWorkers caps the goroutine count for parsing files within - // a single commit. Keeping this small avoids excessive concurrency. - const maxIntraCommitWorkers = 4 - - numWorkers := min(maxIntraCommitWorkers, len(changes)) + numWorkers := min(p.MaxIntraCommitWorkers, len(changes)) var wg sync.WaitGroup @@ -335,7 +349,7 @@ func (p *UASTPipeline) parseCommitAndSpill( wrote++ // Periodically reclaim C arena pages during long sequential parsing. - if wrote%uastSpillTrimInterval == 0 { + if wrote%p.SpillTrimInterval == 0 { uast.MallocTrim() } } @@ -398,7 +412,7 @@ func (p *UASTPipeline) parseBlob( return nil } - if len(blob.Data) > maxUASTBlobSize { + if len(blob.Data) > p.MaxBlobSize { return nil } @@ -410,9 +424,7 @@ func (p *UASTPipeline) parseBlob( // Tree-sitter can exhibit pathological behavior on some files (e.g., deeply // nested JSON, certain generated code patterns) causing unbounded native // memory growth. The timeout triggers the cancellation flag to stop the parse. - const parseTimeout = 10 * time.Second - - parseCtx, cancel := context.WithTimeout(ctx, parseTimeout) + parseCtx, cancel := context.WithTimeout(ctx, p.ParseTimeout) defer cancel() parsed, err := p.Parser.Parse(parseCtx, filename, blob.Data) @@ -422,3 +434,6 @@ func (p *UASTPipeline) parseBlob( return parsed } + +// defaultParseTimeout is the per-file UAST parse timeout. 
+const defaultParseTimeout = 10 * time.Second diff --git a/site/guide/configuration.md b/site/guide/configuration.md index 4283846..a3b9df4 100644 --- a/site/guide/configuration.md +++ b/site/guide/configuration.md @@ -84,8 +84,44 @@ pipeline: ballast_size: "0" # "0" = disabled memory_limit: "" # e.g. "8GiB" worker_timeout: "" # e.g. "60s" + # Advanced pipeline tuning. + uast_spill_threshold: 32 + intra_commit_parallel_threshold: 4 + max_intra_commit_workers: 4 + max_uast_blob_size: 262144 # 256 KiB + uast_parse_timeout: "10s" + max_changes_per_commit: 10000 + max_diff_batch_size: 1000 + memory_budget_ratio: 50 + memory_budget_cap: "2GiB" + memory_limit_ratio: 75 + # Extended pipeline tuning. + uast_spill_trim_interval: 16 + native_trim_interval: 10 + max_streaming_buffering: 3 + drain_prefetch_timeout: "30s" + sampler_interval: "2s" + worker_ratio: 100 + uast_worker_ratio: 40 + leaf_worker_divisor: 3 + min_leaf_workers: 4 + buffer_size_multiplier: 2 + budget_limit_ratio: 95 + system_ram_limit_ratio: 90 + diff_job_buffer_multiplier: 10 + static_max_workers: 8 + malloc_trim_interval: 50 + static_memory_limit_ratio: 90 history: + couples: + coupling_threshold_high: 10 + ownership_few_threshold: 3 + ownership_moderate_threshold: 5 + batch_coupling_threshold: 100 + hll_precision: 10 + top_k_per_file: 100 + min_edge_weight: 2 burndown: granularity: 30 sampling: 30 @@ -99,12 +135,42 @@ history: devs: consider_empty_commits: false anonymize: false + bus_factor_threshold: 0.5 + risk_threshold_critical: 90.0 + risk_threshold_high: 80.0 + risk_threshold_medium: 60.0 + active_threshold_ratio: 0.7 + default_active_days: 90 + hll_precision: 14 + file_history: + hotspot_threshold_critical: 50 + hotspot_threshold_high: 30 + hotspot_threshold_medium: 15 imports: goroutines: 4 max_file_size: 1048576 + max_dependency_risk_rows: 30 sentiment: min_comment_length: 20 gap: 0.5 + neutralizer_weight: 0.8 + max_weight_ratio: 3.0 + positive_threshold: 0.6 + negative_threshold: 0.4 + 
trend_threshold: 0.1 + low_sentiment_risk_thresh: 0.2 + clones: + max_clone_pairs: 1000 + num_hashes: 128 + num_bands: 16 + num_rows: 8 + shingle_size: 5 + similarity_type2: 0.8 + similarity_type3: 0.5 + threshold_ratio_yellow: 0.1 + threshold_ratio_red: 0.3 + threshold_pairs_yellow: 5 + threshold_pairs_red: 20 shotness: dsl_struct: 'filter(.roles has "Function")' dsl_name: ".props.name" @@ -156,6 +222,32 @@ Resource and tuning knobs for the analysis pipeline. | `ballast_size` | `string` | `"0"` | GC ballast allocation size. `"0"` disables ballast. Useful for reducing GC pauses in memory-rich environments. | Valid byte-size string | | `memory_limit` | `string` | `""` | Hard memory limit passed to the Go runtime (`GOMEMLIMIT`). Empty means no limit. | Valid byte-size string or empty | | `worker_timeout` | `string` | `""` | Maximum duration a single worker may run before being terminated (e.g. `"60s"`, `"5m"`). Empty means no timeout. | Valid Go duration string or empty | +| `uast_spill_threshold` | `int` | `32` | File changes per commit before UAST trees are spilled to disk to cap memory. | Must be >= 0 | +| `intra_commit_parallel_threshold` | `int` | `4` | Minimum file changes in a commit before intra-commit parallel UAST parsing is used. | Must be >= 0 | +| `max_intra_commit_workers` | `int` | `4` | Maximum goroutines for parallel UAST parsing within a single commit. | Must be >= 0 | +| `max_uast_blob_size` | `int` | `262144` | Maximum blob size in bytes for UAST parsing (256 KiB). Larger files are skipped. | Must be >= 0 | +| `uast_parse_timeout` | `string` | `"10s"` | Per-file timeout for UAST parsing. Prevents pathological tree-sitter behavior. | Valid Go duration string or empty | +| `max_changes_per_commit` | `int` | `10000` | Commits with more file changes than this are skipped entirely. | Must be >= 0 | +| `max_diff_batch_size` | `int` | `1000` | Maximum number of diff requests batched together for efficiency. 
| Must be >= 0 | +| `memory_budget_ratio` | `int` | `50` | Percentage of system RAM to use as the auto-detected memory budget. | Must be 0–100 | +| `memory_budget_cap` | `string` | `"2GiB"` | Maximum auto-detected memory budget. | Valid byte-size string or empty | +| `memory_limit_ratio` | `int` | `75` | Percentage of system RAM to use as Go's soft memory limit. | Must be 0–100 | +| `uast_spill_trim_interval` | `int` | `16` | How often to call `MallocTrim` during UAST spill-mode parsing (every N files). | Must be >= 0 | +| `native_trim_interval` | `int` | `10` | How often to call `malloc_trim` within a chunk (every N commits). | Must be >= 0 | +| `max_streaming_buffering` | `int` | `3` | Maximum buffering factor for streaming (1=single, 2=double, 3=triple). | Must be >= 1 | +| `drain_prefetch_timeout` | `string` | `"30s"` | Maximum time to wait for a pending prefetch to complete before it is abandoned. | Valid Go duration string | +| `sampler_interval` | `string` | `"2s"` | Pipeline sampler polling interval for memory triage logging. | Valid Go duration string | +| `worker_ratio` | `int` | `100` | Percentage of CPU cores to use for pipeline workers. | Must be 0–100 | +| `uast_worker_ratio` | `int` | `40` | Percentage of CPU cores to use for UAST pipeline workers. | Must be 0–100 | +| `leaf_worker_divisor` | `int` | `3` | Leaf worker count = `NumCPU / divisor`. | Must be >= 1 | +| `min_leaf_workers` | `int` | `4` | Minimum number of leaf workers when enabled. | Must be >= 1 | +| `buffer_size_multiplier` | `int` | `2` | Buffer size = `workers * multiplier`. | Must be >= 1 | +| `budget_limit_ratio` | `int` | `95` | Percentage of memory budget used as Go's soft memory limit. | Must be 0–100 | +| `system_ram_limit_ratio` | `int` | `90` | Memory limit cap as percentage of system RAM. | Must be 0–100 | +| `diff_job_buffer_multiplier` | `int` | `10` | Scales the diff job queue buffer relative to pipeline buffer size. 
| Must be >= 1 | +| `static_max_workers` | `int` | `8` | Maximum concurrent workers for static analysis phase. | Must be >= 1 | +| `malloc_trim_interval` | `int` | `50` | Files between `malloc_trim` calls in static analysis. `-1` disables. | Must be >= -1 | +| `static_memory_limit_ratio` | `int` | `90` | Percentage of budget applied as Go's memory limit during static phase. | Must be 0–100 | !!! tip "Memory Budget Auto-Tuning" @@ -165,6 +257,22 @@ Resource and tuning knobs for the analysis pipeline. --- +### `history.couples` + +Controls the file coupling and ownership analyzer. + +| Field | Type | Default | Description | Validation | +|-------|------|---------|-------------|------------| +| `coupling_threshold_high` | `int` | `10` | Minimum co-change count for a file pair to be classified as "high" coupling. | Must be >= 0 | +| `ownership_few_threshold` | `int` | `3` | Maximum number of contributors for a file to be in the "few owners" bucket. | Must be >= 0 | +| `ownership_moderate_threshold` | `int` | `5` | Maximum number of contributors for a file to be in the "moderate owners" bucket. | Must be >= 0 | +| `batch_coupling_threshold` | `int` | `100` | Maximum number of file pairs per commit considered for coupling. Limits quadratic growth on large commits. | Must be >= 0 | +| `hll_precision` | `int` | `10` | HyperLogLog precision for contributor count sketches. Higher = more accurate, more memory. Valid range: 4–18. | Must be 4–18 when set | +| `top_k_per_file` | `int` | `100` | Maximum coupling pairs per file in store output. | Must be >= 0 | +| `min_edge_weight` | `int` | `2` | Minimum co-change count for a coupling edge to be included. | Must be >= 0 | + +--- + ### `history.burndown` Controls the burndown (code ownership aging) analyzer. @@ -191,6 +299,25 @@ Controls the developer activity analyzer. |-------|------|---------|-------------|------------| | `consider_empty_commits` | `bool` | `false` | Include empty (no-diff) commits in developer statistics. 
| -- | | `anonymize` | `bool` | `false` | Replace developer names with anonymous identifiers in output. | -- | +| `bus_factor_threshold` | `float64` | `0.5` | Cumulative ownership fraction at which the bus factor count stops. | Must be 0.0–1.0 | +| `risk_threshold_critical` | `float64` | `90.0` | Ownership percentage above which a file is at critical risk (single-owner). | Must be 0–100 | +| `risk_threshold_high` | `float64` | `80.0` | Ownership percentage above which a file is at high risk. | Must be 0–100 | +| `risk_threshold_medium` | `float64` | `60.0` | Ownership percentage above which a file is at medium risk. | Must be 0–100 | +| `active_threshold_ratio` | `float64` | `0.7` | Fraction of the most-active developer's commits required to consider a developer "active". | Must be 0.0–1.0 | +| `default_active_days` | `int` | `90` | Lookback window in days for determining whether a developer is currently active. | Must be >= 0 | +| `hll_precision` | `int` | `14` | HyperLogLog precision for developer count sketches. Higher = more accurate, more memory. Valid range: 4–18. | Must be 4–18 when set | + +--- + +### `history.file_history` + +Controls the file history (churn and hotspot) analyzer. + +| Field | Type | Default | Description | Validation | +|-------|------|---------|-------------|------------| +| `hotspot_threshold_critical` | `int` | `50` | Commit count above which a file is classified as a critical hotspot. | Must be >= 0 | +| `hotspot_threshold_high` | `int` | `30` | Commit count above which a file is classified as a high hotspot. | Must be >= 0 | +| `hotspot_threshold_medium` | `int` | `15` | Commit count above which a file is classified as a medium hotspot. | Must be >= 0 | --- @@ -202,6 +329,7 @@ Controls the import/dependency history analyzer. |-------|------|---------|-------------|------------| | `goroutines` | `int` | `4` | Number of parallel goroutines for import extraction. 
| Must be > 0 | | `max_file_size` | `int` | `1048576` | Maximum file size in bytes to analyze for imports (1 MiB default). | Must be > 0 | +| `max_dependency_risk_rows` | `int` | `30` | Maximum number of rows in the dependency risk table in plot output. | Must be >= 0 | --- @@ -213,6 +341,32 @@ Controls the comment sentiment analyzer. |-------|------|---------|-------------|------------| | `min_comment_length` | `int` | `20` | Minimum comment character length to include in sentiment analysis. | Must be > 0 | | `gap` | `float64` | `0.5` | Sentiment classification gap threshold. Comments with scores within this gap of neutral are considered neutral. | Must be between 0.0 and 1.0 | +| `neutralizer_weight` | `float64` | `0.8` | How strongly SE-domain adjustments affect the final score. `0` = no effect, `1` = full adjustment. | Must be 0.0–1.0 | +| `max_weight_ratio` | `float64` | `3.0` | Maximum weight ratio for comment length weighting. Prevents single long comments from dominating. | Must be > 0 | +| `positive_threshold` | `float64` | `0.6` | Sentiment score at or above this is classified as "positive". | Must be 0.0–1.0 | +| `negative_threshold` | `float64` | `0.4` | Sentiment score at or below this is classified as "negative". | Must be 0.0–1.0 | +| `trend_threshold` | `float64` | `0.1` | Minimum change in sentiment needed to classify a trend as "improving" or "declining". | Must be >= 0 | +| `low_sentiment_risk_thresh` | `float64` | `0.2` | Sentiment at or below this is flagged as HIGH risk (vs MEDIUM). | Must be 0.0–1.0 | + +--- + +### `history.clones` + +Controls the clone detection analyzer. + +| Field | Type | Default | Description | Validation | +|-------|------|---------|-------------|------------| +| `max_clone_pairs` | `int` | `1000` | Maximum number of clone pairs reported in the aggregated result. | Must be >= 0 | +| `num_hashes` | `int` | `128` | MinHash signature size. More hashes = better accuracy, more memory. 
| Must be > 0 | +| `num_bands` | `int` | `16` | Number of LSH bands. `num_bands * num_rows` must equal `num_hashes`. | Must be > 0 | +| `num_rows` | `int` | `8` | Number of rows per LSH band. | Must be > 0 | +| `shingle_size` | `int` | `5` | Token shingle window size for MinHash input. | Must be > 0 | +| `similarity_type2` | `float64` | `0.8` | Minimum Jaccard similarity for Type-2 (renamed) clone detection. | Must be 0.0–1.0 | +| `similarity_type3` | `float64` | `0.5` | Minimum Jaccard similarity for Type-3 (near-miss) clone detection. | Must be 0.0–1.0 | +| `threshold_ratio_yellow` | `float64` | `0.1` | Clone ratio above which a yellow warning is issued. | Must be 0.0–1.0 | +| `threshold_ratio_red` | `float64` | `0.3` | Clone ratio above which a red warning is issued. | Must be 0.0–1.0 | +| `threshold_pairs_yellow` | `int` | `5` | Clone pair count above which a yellow warning is issued. | Must be >= 0 | +| `threshold_pairs_red` | `int` | `20` | Clone pair count above which a red warning is issued. | Must be >= 0 | ---