Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 107 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,30 @@ ARTIFACTS_DIR=artifacts
# Self-documenting entry point: lists every public command.
# NOTE: recipe tabs restored — they were lost when this file was pasted.
.PHONY: help
help:
	@echo "Architecture Governance Toolkit - Available Commands:"
	@echo ""
	@echo "Core Analysis:"
	@echo "  make artifacts-dir        - Create artifacts directory"
	@echo "  make build-analysis-image - Build Docker analysis image"
	@echo "  make run-analysis         - Run analysis in Docker"
	@echo "  make hotspots             - Generate code hotspots"
	@echo "  make ownership            - Analyze code ownership"
	@echo "  make drift                - Detect architecture drift"
	@echo "  make risk                 - Generate consolidated risk register"
	@echo "  make full-analysis        - Run complete analysis pipeline"
	@echo "  make adr-new TITLE='...'  - Create new ADR"
	@echo "  make clean                - Remove artifacts directory"
	@echo ""
	@echo "Research System (NEW):"
	@echo "  make research-check-deps  - Check research system dependencies"
	@echo "  make research-profile     - Create organization profile"
	@echo "  make research-discover    - Discover similar repositories"
	@echo "  make research-similarity  - Calculate similarity scores"
	@echo "  make research-report      - Generate research summary"
	@echo "  make research-full        - Run complete research cycle"
	@echo "  make research-clean       - Remove research artifacts"
	@echo ""
	@echo "Environment Variables:"
	@echo "  GITHUB_TOKEN              - GitHub API token (for research-discover)"
	@echo ""

# Create the artifact output directories (idempotent via mkdir -p).
.PHONY: artifacts-dir
artifacts-dir:
	mkdir -p $(ARTIFACTS_DIR)/sbom $(ARTIFACTS_DIR)/timeseries
Expand Down Expand Up @@ -67,3 +85,92 @@ risk: artifacts-dir

# Aggregate pipeline: runs all four core analyses, then points at the output.
.PHONY: full-analysis
full-analysis: hotspots ownership drift risk
	@echo "Full analysis complete. Check artifacts/ directory."

# ============================================================================
# Research System Targets
# ============================================================================

# Derived artifact paths. Simple expansion (:=) so each is resolved exactly
# once at parse time instead of on every reference.
RESEARCH_ARTIFACTS := $(ARTIFACTS_DIR)/research
ORG_PROFILE       := $(RESEARCH_ARTIFACTS)/profiles/org_profile.json
DISCOVERED_REPOS  := $(RESEARCH_ARTIFACTS)/discoveries/discovered_repos.json
SIMILARITY_SCORES := $(RESEARCH_ARTIFACTS)/discoveries/similarity_scores.json

.PHONY: research-dirs research-profile research-discover research-similarity

# Create the research artifact directory tree (idempotent).
research-dirs:
	@mkdir -p $(RESEARCH_ARTIFACTS)/profiles
	@mkdir -p $(RESEARCH_ARTIFACTS)/discoveries
	@mkdir -p $(RESEARCH_ARTIFACTS)/analysis
	@mkdir -p $(RESEARCH_ARTIFACTS)/patterns
	@mkdir -p $(RESEARCH_ARTIFACTS)/recommendations
	@mkdir -p $(RESEARCH_ARTIFACTS)/feedback

# Build the organization profile from the local repo and prior analysis output.
research-profile: research-dirs
	@echo "========================================="
	@echo "Creating Organization Profile..."
	@echo "========================================="
	python3 scripts/research/profile_org.py \
		--path . \
		--artifacts $(ARTIFACTS_DIR) \
		--out $(ORG_PROFILE)
	@echo ""
	@echo "✓ Organization profile created: $(ORG_PROFILE)"

# Query GitHub for repositories similar to the org profile.
# Works without GITHUB_TOKEN, but unauthenticated API rate limits apply.
research-discover: research-profile
	@echo "========================================="
	@echo "Discovering Similar Repositories..."
	@echo "========================================="
	@if [ -z "$(GITHUB_TOKEN)" ]; then \
		echo "WARNING: GITHUB_TOKEN not set. Rate limits will be restrictive."; \
		echo "Set GITHUB_TOKEN environment variable for better results."; \
		echo ""; \
	fi
	python3 scripts/research/discover_repos.py \
		--config config/research/discovery_config.yaml \
		--profile $(ORG_PROFILE) \
		--out $(DISCOVERED_REPOS)
	@echo ""
	@echo "✓ Repository discovery complete: $(DISCOVERED_REPOS)"

# Score discovered repositories against the org profile using weighted metrics.
research-similarity: research-discover
	@echo "========================================="
	@echo "Calculating Similarity Scores..."
	@echo "========================================="
	python3 scripts/research/similarity_scorer.py \
		--discovered $(DISCOVERED_REPOS) \
		--profile $(ORG_PROFILE) \
		--weights config/research/similarity_weights.yaml \
		--out $(SIMILARITY_SCORES)
	@echo ""
	@echo "✓ Similarity scoring complete: $(SIMILARITY_SCORES)"

# Print a human-readable summary of the profile and similarity results.
# NOTE(review): the inline python one-liners below are hard to maintain and
# fail opaquely if the JSON schema changes; consider extracting them into
# scripts/research/generate_report.py in a follow-up.
.PHONY: research-report
research-report: research-similarity
	@echo "========================================="
	@echo "Research System Summary"
	@echo "========================================="
	@echo ""
	@echo "Organization Profile:"
	@python3 -c "import json; p=json.load(open('$(ORG_PROFILE)')); print(f\" Fingerprint: {p.get('fingerprint', 'unknown')}\"); print(f\" Languages: {', '.join(list(p.get('metrics', {}).get('primary_languages', []))[:5])}\"); print(f\" Research Areas: {len(p.get('challenges', {}).get('research_areas', []))}\"); print(f\" High Priority Challenges: {len(p.get('challenges', {}).get('high_priority', []))}\")"
	@echo ""
	@echo "Discovery Results:"
	@python3 -c "import json; d=json.load(open('$(SIMILARITY_SCORES)')); meta=d.get('similarity_metadata', {}); print(f\" Total Scored: {meta.get('total_scored', 0)}\"); print(f\" Above Threshold: {meta.get('above_threshold', 0)}\"); print(f\" Threshold: {meta.get('threshold', 0)}\"); repos=d.get('repositories', []); print(f\" Top 5 Matches:\"); [print(f\" {i+1}. {r.get('full_name', 'unknown')} (score: {r.get('similarity_score', 0):.4f})\") for i, r in enumerate(repos[:5])]"
	@echo ""
	@echo "========================================="

.PHONY: research-clean research-full research-check-deps

# Remove every research artifact (profiles, discoveries, analysis, ...).
# RESEARCH_ARTIFACTS is derived from ARTIFACTS_DIR, so this never expands empty.
research-clean:
	rm -rf $(RESEARCH_ARTIFACTS)

# Convenience target for the whole cycle. The prerequisites already chain
# (report -> similarity -> discover -> profile); listing them all keeps the
# intent explicit for readers.
research-full: research-profile research-discover research-similarity research-report
	@echo ""
	@echo "✓ Full research cycle complete!"
	@echo ""
	@echo "Next steps:"
	@echo "  1. Review discovered repositories in: $(SIMILARITY_SCORES)"
	@echo "  2. Analyze top matches manually"
	@echo "  3. Run 'make research-analyze' to analyze selected repositories (coming soon)"
	@echo ""

# Helper target to check research system dependencies.
# Each probe import exits non-zero (failing the recipe) with install guidance.
research-check-deps:
	@echo "Checking research system dependencies..."
	@python3 -c "import github" 2>/dev/null || (echo "ERROR: PyGithub not installed. Run: pip install PyGithub" && exit 1)
	@python3 -c "import yaml" 2>/dev/null || (echo "ERROR: PyYAML not installed. Run: pip install PyYAML" && exit 1)
	@echo "✓ All dependencies installed"
252 changes: 252 additions & 0 deletions config/research/analysis_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
# Analysis Pipeline Configuration
# Controls how discovered repositories are cloned and analyzed

version: "1.0"
last_updated: "2025-11-18"

# Cloning configuration
cloning:
  # Clone depth (shallow clone)
  depth: 1

  # Maximum repository size (MB)
  max_size_mb: 500

  # Timeout per clone (seconds)
  timeout_seconds: 300

  # Concurrent clones
  max_parallel: 3

  # Clone location
  # NOTE(review): a fixed /tmp path risks collisions between parallel runs
  # and leaves untrusted clones on the shared host; consider a per-run
  # unique directory and Docker isolation before Phase 3 — TODO confirm.
  workspace_dir: "/tmp/research_clones"

  # Auto-cleanup
  auto_cleanup: true
  cleanup_after_hours: 24

  # Sandbox mode
  use_docker: false # Set to true for production
  docker_image: "research-analysis:latest"

# Analysis modules to run
modules:
  structural:
    enabled: true
    analyze_directory_structure: true
    detect_config_files: true
    measure_doc_coverage: true
    analyze_test_organization: true

  quality:
    enabled: true
    run_complexity_analysis: true
    detect_coverage_config: true
    extract_linting_config: true
    analyze_code_review_practices: true

  architecture:
    enabled: true
    extract_dependency_graph: true
    identify_service_boundaries: true
    detect_api_patterns: true
    analyze_data_flows: false # Expensive, disable by default

  devops:
    enabled: true
    parse_ci_cd_configs: true
    detect_iac_patterns: true
    extract_monitoring_setup: true
    identify_security_tools: true

  documentation:
    enabled: true
    analyze_readme: true
    extract_adrs: true
    find_runbooks: true
    extract_contribution_guide: true

# Structural Analysis
structural:
  # Directory patterns to analyze
  analyze_patterns:
    - "src/**"
    - "lib/**"
    - "app/**"
    - "pkg/**"
    - "tests/**"
    - "docs/**"

  # Config files to detect
  config_patterns:
    - "*.yml"
    - "*.yaml"
    - "*.json"
    - "*.toml"
    - "*.ini"
    - "Dockerfile*"
    - "docker-compose*.yml"
    - "Makefile"
    - ".github/**"
    - ".gitlab-ci.yml"

  # Documentation coverage
  doc_indicators:
    - "README.md"
    - "CONTRIBUTING.md"
    - "docs/"
    - "*.md"
    - "LICENSE"

# Quality Analysis
quality:
  # Complexity tools by language
  complexity_tools:
    python: "radon"
    javascript: "eslint"
    go: "gocyclo"
    java: "checkstyle"

  # Coverage tools
  coverage_tools:
    python: ["pytest-cov", "coverage.py"]
    javascript: ["jest", "nyc", "istanbul"]
    go: ["go test -cover"]
    java: ["jacoco"]

  # Linters
  linters:
    python: ["pylint", "flake8", "black", "mypy"]
    javascript: ["eslint", "prettier"]
    go: ["golint", "gofmt"]
    java: ["checkstyle", "pmd"]

# DevOps Analysis
devops:
  # CI/CD platforms to detect
  ci_platforms:
    - ".github/workflows"     # GitHub Actions
    - ".gitlab-ci.yml"        # GitLab CI
    - ".travis.yml"           # Travis CI
    - "Jenkinsfile"           # Jenkins
    - ".circleci"             # CircleCI
    - "azure-pipelines.yml"   # Azure Pipelines

  # Infrastructure as Code
  iac_tools:
    - "terraform"
    - "cloudformation"
    - "kubernetes"
    - "helm"
    - "ansible"
    - "pulumi"

  # Monitoring/Observability
  observability_tools:
    - "prometheus"
    - "grafana"
    - "datadog"
    - "newrelic"
    - "sentry"
    - "opentelemetry"

  # Security tools
  security_tools:
    - "trivy"
    - "semgrep"
    - "snyk"
    - "sonarqube"
    - "dependabot"

# Documentation Analysis
documentation:
  # README quality metrics
  readme_quality:
    min_length: 500 # characters
    required_sections:
      - "installation"
      - "usage"
      - "license"
    bonus_sections:
      - "contributing"
      - "testing"
      - "architecture"

  # ADR detection
  adr_patterns:
    - "docs/adr/**/*.md"
    - "docs/decisions/**/*.md"
    - "adr/**/*.md"
    - "decisions/**/*.md"

  # Runbook detection
  runbook_patterns:
    - "runbooks/**"
    - "playbooks/**"
    - "docs/operations/**"
    - "docs/runbooks/**"

# Comparison with baseline
baseline_comparison:
  enabled: true

  # Gap identification
  identify_gaps:
    - missing_tools
    - missing_practices
    - better_configurations
    - superior_patterns

  # Impact scoring (weights sum to 1.00)
  impact_factors:
    risk_reduction: 0.35
    velocity_improvement: 0.30
    quality_improvement: 0.20
    cost_reduction: 0.15

# Effort estimation
effort_estimation:
  # Map pattern types to T-shirt sizes
  simple_patterns: "S"    # < 8 hours
  moderate_patterns: "M"  # 8-24 hours
  complex_patterns: "L"   # 1-5 days
  major_changes: "XL"     # > 5 days

# Performance
performance:
  # Timeouts
  module_timeout_seconds: 180
  total_analysis_timeout_seconds: 600

  # Resource limits
  max_memory_mb: 2048
  max_cpu_percent: 80

  # Parallel processing
  parallel_modules: false # Run modules sequentially by default

# Output
output:
  # Format: "json", "yaml"
  format: "json"

  # Compression
  compress: true

  # Include raw data
  include_raw: true

  # Include diffs
  include_diff: true

# Error handling
error_handling:
  # Continue on module failure
  continue_on_error: true

  # Retry failed modules
  retry_failed: true
  max_retries: 2

  # Log level: "DEBUG", "INFO", "WARNING", "ERROR"
  log_level: "INFO"
Loading