Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions .env.example
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
# LLM Provider Configuration
# Options: "ollama" or "gemini"
LLM_PROVIDER=ollama
LLM_PROVIDER=gemini

# Default model to use
# For Ollama: "gemma3:4b", "qwen3:4b", "mistral:7b", etc.
# For Gemini: "gemini-2.5-pro", "gemini-2.5-flash", etc.
DEFAULT_MODEL=gemma3:4b
DEFAULT_MODEL=gemini-2.5-flash

# Google Gemini API Key (required if using Gemini provider)
GEMINI_API_KEY=your_gemini_api_key_here
# Example: GEMINI_API_KEY=your_gemini_api_key_here
GEMINI_API_KEY=

# Optional: Personal access token to increase GitHub API rate limits
# Example: GITHUB_TOKEN=ghp_xxx
GITHUB_TOKEN=
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ test_*.py
cache/
resume_evaluations.csv
greenhouse_resumes/*
*.pdf
.venv_win/
python-*.exe

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,12 @@ What happens:
2. If a GitHub profile is found in the resume, repositories are fetched and cached to `cache/githubcache_<basename>.json`.
3. The evaluator prints a report and, in development mode, appends a CSV row to `resume_evaluations.csv`.

### Flags

- `--force`: bypass caches and fully re-extract from the PDF.
- `--no-github`: skip GitHub enrichment (useful when rate-limited or offline).
- `--max-workers N`: set the number of parallel section-extraction workers (default: 3). Lower it if the LLM provider returns HTTP 429 (rate-limit) errors.

---

## Directory layout
Expand Down
40 changes: 36 additions & 4 deletions evaluator.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
from typing import Dict, List, Optional, Tuple, Any
import hashlib
from pydantic import BaseModel, Field, field_validator
from models import JSONResume, EvaluationData
from llm_utils import initialize_llm_provider, extract_json_from_response
from llm_utils import (
initialize_llm_provider,
extract_json_from_response,
ensure_valid_json,
)
import logging
import json
import re
Expand Down Expand Up @@ -78,12 +83,39 @@ def evaluate_resume(self, resume_text: str) -> EvaluationData:
response = self.provider.chat(**chat_params, **kwargs)

response_text = response["message"]["content"]
response_text = extract_json_from_response(response_text)
logger.error(f"🔤 Prompt response: {response_text}")
cleaned_text = extract_json_from_response(response_text)
repaired_json_str = ensure_valid_json(
cleaned_text,
provider=self.provider,
model=self.model_name,
original_prompt=full_prompt,
)
logger.error(f"🔤 Prompt response: {repaired_json_str}")

try:
evaluation_dict = json.loads(repaired_json_str)
except Exception as e:
logger.error(f"Failed to parse evaluation JSON after repair attempts: {e}")
raise

evaluation_dict = json.loads(response_text)
evaluation_data = EvaluationData(**evaluation_dict)

# Attach prompt/version metadata
template_sources = self.template_manager.get_all_template_sources()
template_hashes = {
name: hashlib.sha256(src.encode("utf-8")).hexdigest()
for name, src in template_sources.items()
}
evaluation_data.meta = {
"model": self.model_name,
"provider": MODEL_PROVIDER_MAPPING.get(self.model_name, None).value
if MODEL_PROVIDER_MAPPING.get(self.model_name, None)
else None,
"template_hashes": template_hashes,
"temperature": self.model_params.get("temperature"),
"top_p": self.model_params.get("top_p"),
}

return evaluation_data

except Exception as e:
Expand Down
91 changes: 55 additions & 36 deletions github.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,23 +57,13 @@ def _fetch_github_api(api_url, params=None):
# Log rate limit information and handle proactively
if remaining < 10 and rate_limit_reset:
reset_timestamp = int(rate_limit_reset)
current_timestamp = int(time.time())
wait_seconds = max(0, reset_timestamp - current_timestamp) + 5 # Add 5 second buffer
reset_time = datetime.datetime.fromtimestamp(reset_timestamp)

# Cap maximum wait time at 1 hour
max_wait = 3600
if wait_seconds > max_wait:
print(f"⚠️ Rate limit reset time is too far in the future ({wait_seconds}s). Capping wait to {max_wait}s")
wait_seconds = max_wait

logger.error(f"⚠️ GitHub API rate limit low: {remaining}/{limit} requests remaining. Resets at {reset_time}")
print(f"💡 Tip: Set GITHUB_TOKEN environment variable to increase rate limits (60/hour → 5000/hour)")

if wait_seconds > 0:
logger.info(f"⏳ Proactively sleeping for {wait_seconds} seconds until rate limit resets...")
time.sleep(wait_seconds)
print(f"✅ Rate limit should be reset now. Continuing...")
logger.error(
f"⚠️ GitHub API rate limit low: {remaining}/{limit} requests remaining. Resets at {reset_time}"
)
print(
"💡 Tip: Set GITHUB_TOKEN environment variable to increase rate limits (60/hour → 5000/hour). Continuing without delay."
)
elif remaining < 100:
logger.info(f"ℹ️ GitHub API rate limit: {remaining}/{limit} requests remaining")

Expand Down Expand Up @@ -210,24 +200,66 @@ def fetch_all_github_repos(github_url: str, max_repos: int = 100) -> List[Dict]:
if status_code == 200:
projects = []
for repo in repos_data:
if repo.get("fork") and repo.get("forks_count", 0) < 5:
continue

repo_name = repo.get("name")
if not repo_name:
continue

# Fetch contributors for user's fork (or original if not fork)
contributors_data = fetch_repo_contributors(username, repo_name)
contributor_count = len(contributors_data)

user_contributions, total_contributions = fetch_contributions_count(
username, contributors_data
)

# Determine project type (consider upstream if fork)
project_type = (
"open_source" if contributor_count > 1 else "self_project"
)

github_details = {
"stars": repo.get("stargazers_count", 0),
"forks": repo.get("forks_count", 0),
"language": repo.get("language"),
"description": repo.get("description"),
"created_at": repo.get("created_at"),
"updated_at": repo.get("updated_at"),
"topics": repo.get("topics", []),
"open_issues": repo.get("open_issues_count", 0),
"size": repo.get("size", 0),
"fork": repo.get("fork", False),
"archived": repo.get("archived", False),
"default_branch": repo.get("default_branch"),
"contributors": contributor_count,
}

upstream_details = None
if repo.get("fork"):
# Fetch upstream parent for accurate stats (#155) and avoid skipping low-fork repos (#162)
upstream_api = f"https://api.github.com/repos/{username}/{repo_name}"
status_code, upstream_data = _fetch_github_api(upstream_api)
if status_code == 200 and isinstance(upstream_data, dict):
parent = upstream_data.get("parent")
if parent:
upstream_details = {
"name": parent.get("name"),
"owner": parent.get("owner", {}).get("login"),
"html_url": parent.get("html_url"),
"stars": parent.get("stargazers_count", 0),
"forks": parent.get("forks_count", 0),
"language": parent.get("language"),
"topics": parent.get("topics", []),
"description": parent.get("description"),
}
# Prefer upstream popularity metrics for evaluation
github_details["stars"] = upstream_details["stars"]
github_details["forks"] = upstream_details["forks"]
github_details["topics"] = upstream_details["topics"]
github_details["upstream_owner"] = upstream_details["owner"]
github_details["upstream_name"] = upstream_details["name"]
github_details["upstream_html_url"] = upstream_details["html_url"]

project = {
"name": repo.get("name"),
"name": repo_name,
"description": repo.get("description"),
"github_url": repo.get("html_url"),
"live_url": repo.get("homepage") if repo.get("homepage") else None,
Expand All @@ -238,21 +270,8 @@ def fetch_all_github_repos(github_url: str, max_repos: int = 100) -> List[Dict]:
"contributor_count": contributor_count,
"author_commit_count": user_contributions,
"total_commit_count": total_contributions,
"github_details": {
"stars": repo.get("stargazers_count", 0),
"forks": repo.get("forks_count", 0),
"language": repo.get("language"),
"description": repo.get("description"),
"created_at": repo.get("created_at"),
"updated_at": repo.get("updated_at"),
"topics": repo.get("topics", []),
"open_issues": repo.get("open_issues_count", 0),
"size": repo.get("size", 0),
"fork": repo.get("fork", False),
"archived": repo.get("archived", False),
"default_branch": repo.get("default_branch"),
"contributors": contributor_count,
},
"github_details": github_details,
"upstream_details": upstream_details,
}
projects.append(project)

Expand Down
82 changes: 82 additions & 0 deletions llm_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
"""

import logging
import json
import hashlib
from typing import Any, Dict, Optional
from models import ModelProvider, OllamaProvider, GeminiProvider
from prompt import MODEL_PROVIDER_MAPPING, GEMINI_API_KEY
Expand Down Expand Up @@ -37,6 +39,86 @@ def extract_json_from_response(response_text: str) -> str:
return response_text


def _try_parse_json(text: str) -> Optional[str]:
    """Round-trip *text* through the JSON codec.

    Returns the re-serialized document (non-ASCII characters are kept
    verbatim rather than escaped) when *text* decodes as JSON, and ``None``
    otherwise. Any exception raised while decoding or re-encoding is treated
    as "not valid JSON".
    """
    try:
        decoded = json.loads(text)
        canonical = json.dumps(decoded, ensure_ascii=False)
    except Exception:
        # Deliberately broad: callers only need a yes/no answer.
        canonical = None
    return canonical


def _parse_json_with_brace_trim(text: str) -> Optional[str]:
    """Parse *text* as JSON, retrying on its outermost ``{...}`` span on failure."""
    parsed = _try_parse_json(text)
    if parsed is not None:
        return parsed

    # Drop any chatter before the first "{" and after the last "}".
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end > start:
        return _try_parse_json(text[start : end + 1])
    return None


def ensure_valid_json(
    response_text: str,
    provider: Any = None,
    model: Optional[str] = None,
    original_prompt: Optional[str] = None,
    max_repair_attempts: int = 2,
) -> str:
    """Validate JSON; attempt lightweight repairs or LLM self-repair if needed.

    Strategy:
    1. Strip surrounding whitespace (markdown fences / think tags are expected
       to have been removed by the caller already).
    2. Try a direct parse.
    3. If that fails, trim to the first ``{`` / last ``}`` and parse again.
    4. If still failing and both ``provider`` and ``model`` are given, send a
       repair prompt asking ONLY for valid JSON, up to
       ``max_repair_attempts`` times. Each repair reply goes through the same
       parse-then-trim steps as the original response.
    5. Return the stripped input text if irreparable, so upstream code can
       decide how to handle it.

    Args:
        response_text: Raw model output that should contain JSON.
        provider: Object exposing ``chat(model=..., messages=..., options=...)``
            whose result is indexed as ``["message"]["content"]``. When falsy,
            the LLM repair step is skipped.
        model: Model name forwarded to ``provider.chat``. When falsy, the LLM
            repair step is skipped.
        original_prompt: Prompt that produced ``response_text``; included in
            the repair request (``"<none>"`` is sent when missing).
        max_repair_attempts: Maximum number of repair requests to send.

    Returns:
        The canonical re-serialized JSON string on success, otherwise the
        stripped ``response_text`` unchanged (which is NOT guaranteed to be
        valid JSON).
    """
    cleaned = response_text.strip()

    # Fast path plus brace trimming on the original response.
    parsed = _parse_json_with_brace_trim(cleaned)
    if parsed is not None:
        return parsed

    # Attempt LLM repair
    if provider and model:
        repair_instruction = (
            "You previously returned malformed JSON. Return ONLY valid JSON for the same task. "
            "No explanations, code fences, or commentary. If fields are missing, infer minimal plausible empty values."
        )
        # The request text does not change between attempts, so build it once.
        repair_request = (
            "Original prompt:\n" + (original_prompt or "<none>") +
            "\nMalformed JSON response:\n" + cleaned +
            "\nReturn ONLY repaired JSON now."
        )
        for attempt in range(max_repair_attempts):
            try:
                repair_messages = [
                    {"role": "system", "content": repair_instruction},
                    {"role": "user", "content": repair_request},
                ]
                # Low creativity for repair
                repair_options = {"temperature": 0.0, "top_p": 0.9}
                repair_resp = provider.chat(
                    model=model,
                    messages=repair_messages,
                    options=repair_options,
                )
                candidate = extract_json_from_response(
                    repair_resp["message"]["content"]
                )
                # Repair replies can carry stray prose too, so give them the
                # same brace trimming as the original response.
                parsed = _parse_json_with_brace_trim(candidate)
                if parsed is not None:
                    return parsed
            except Exception as e:
                # Best effort: a failed attempt must not abort the remaining ones.
                logger.warning("JSON repair attempt %d failed: %s", attempt + 1, e)

    # Return original cleaned text (upstream may log and skip)
    return cleaned


def initialize_llm_provider(model_name: str) -> Any:
"""
Initialize the appropriate LLM provider based on the model name.
Expand Down
9 changes: 9 additions & 0 deletions models.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,8 @@ class Project(BaseModel):
description: Optional[str] = None
highlights: Optional[List[str]] = None
url: Optional[str] = None
repo_url: Optional[str] = None
live_url: Optional[str] = None
technologies: Optional[List[str]] = None
skills: Optional[List[str]] = None

Expand Down Expand Up @@ -198,6 +200,12 @@ class AwardsSection(BaseModel):
awards: Optional[List[Award]] = None


class LanguagesSection(BaseModel):
    """Language section containing a list of languages.

    Single-field wrapper model around a list of ``Language`` entries,
    following the same shape as ``AwardsSection``
    (``awards: Optional[List[Award]]``).

    NOTE(review): presumably used as the schema for extracting the languages
    section on its own -- confirm against the caller.
    """

    # Optional list of ``Language`` entries; defaults to ``None`` when not provided.
    languages: Optional[List[Language]] = None


class JSONResume(BaseModel):
"""Complete JSON Resume format model."""

Expand Down Expand Up @@ -247,6 +255,7 @@ class EvaluationData(BaseModel):
deductions: Deductions
key_strengths: List[str] = Field(min_items=1, max_items=5)
areas_for_improvement: List[str] = Field(min_items=1, max_items=5)
meta: Optional[Dict[str, Any]] = None # metadata: template hashes, model, provider, timestamps


class GitHubProfile(BaseModel):
Expand Down
Loading