diff --git a/.env.example b/.env.example index cae00ff..57f6f9b 100644 --- a/.env.example +++ b/.env.example @@ -1,11 +1,16 @@ # LLM Provider Configuration # Options: "ollama" or "gemini" -LLM_PROVIDER=ollama +LLM_PROVIDER=gemini # Default model to use # For Ollama: "gemma3:4b", "qwen3:4b", "mistral:7b", etc. # For Gemini: "gemini-2.5-pro", "gemini-2.5-flash", etc. -DEFAULT_MODEL=gemma3:4b +DEFAULT_MODEL=gemini-2.5-flash # Google Gemini API Key (required if using Gemini provider) -GEMINI_API_KEY=your_gemini_api_key_here +# Example: GEMINI_API_KEY=your_gemini_api_key_here +GEMINI_API_KEY= + +# Optional: Personal access token to increase GitHub API rate limits +# Example: GITHUB_TOKEN=ghp_xxx +GITHUB_TOKEN= diff --git a/.gitignore b/.gitignore index a2e75f9..010cae0 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,9 @@ test_*.py cache/ resume_evaluations.csv greenhouse_resumes/* +*.pdf +.venv_win/ +python-*.exe # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/README.md b/README.md index 0396ef4..5fb21cd 100644 --- a/README.md +++ b/README.md @@ -215,6 +215,12 @@ What happens: 2. If a GitHub profile is found in the resume, repositories are fetched and cached to `cache/githubcache_.json`. 3. The evaluator prints a report and, in development mode, appends a CSV row to `resume_evaluations.csv`. +### Flags + +- `--force`: bypass caches and fully re-extract from the PDF. +- `--no-github`: skip GitHub enrichment (useful when rate-limited or offline). +- `--max-workers N`: control parallel section extraction (default: 3). Lower it if you hit LLM 429s. + --- ## Directory layout diff --git a/evaluator.py b/evaluator.py index 1f9e91f..abad532 100644 --- a/evaluator.py +++ b/evaluator.py @@ -1,7 +1,12 @@ from typing import Dict, List, Optional, Tuple, Any +import hashlib from pydantic import BaseModel, Field, field_validator from models import JSONResume, EvaluationData -from llm_utils import initialize_llm_provider, extract_json_from_response +from llm_utils import ( + initialize_llm_provider, + extract_json_from_response, + ensure_valid_json, +) import logging import json import re @@ -78,12 +83,39 @@ def evaluate_resume(self, resume_text: str) -> EvaluationData: response = self.provider.chat(**chat_params, **kwargs) response_text = response["message"]["content"] - response_text = extract_json_from_response(response_text) - logger.error(f"🔤 Prompt response: {response_text}") + cleaned_text = extract_json_from_response(response_text) + repaired_json_str = ensure_valid_json( + cleaned_text, + provider=self.provider, + model=self.model_name, + original_prompt=full_prompt, + ) + logger.error(f"🔤 Prompt response: {repaired_json_str}") + + try: + evaluation_dict = json.loads(repaired_json_str) + except Exception as e: + logger.error(f"Failed to parse evaluation JSON after repair attempts: {e}") + raise - evaluation_dict = json.loads(response_text) evaluation_data = EvaluationData(**evaluation_dict) + # Attach prompt/version metadata + template_sources = self.template_manager.get_all_template_sources() + template_hashes = { + name: hashlib.sha256(src.encode("utf-8")).hexdigest() + for name, src in template_sources.items() + } + evaluation_data.meta = { + "model": self.model_name, + "provider": MODEL_PROVIDER_MAPPING.get(self.model_name, None).value + if MODEL_PROVIDER_MAPPING.get(self.model_name, None) + else None, + "template_hashes": template_hashes, + "temperature": self.model_params.get("temperature"), + "top_p": self.model_params.get("top_p"), + } + return evaluation_data except Exception as e: diff --git a/github.py b/github.py index 1c52bc6..210e966 100644 --- a/github.py +++ b/github.py @@ -57,23 +57,13 @@ def _fetch_github_api(api_url, params=None): # Log rate limit information and handle proactively if remaining < 10 and rate_limit_reset: reset_timestamp = int(rate_limit_reset) - current_timestamp = int(time.time()) - wait_seconds = max(0, reset_timestamp - current_timestamp) + 5 # Add 5 second buffer reset_time = datetime.datetime.fromtimestamp(reset_timestamp) - - # Cap maximum wait time at 1 hour - max_wait = 3600 - if wait_seconds > max_wait: - print(f"âš ī¸ Rate limit reset time is too far in the future ({wait_seconds}s). Capping wait to {max_wait}s") - wait_seconds = max_wait - - logger.error(f"âš ī¸ GitHub API rate limit low: {remaining}/{limit} requests remaining. Resets at {reset_time}") - print(f"💡 Tip: Set GITHUB_TOKEN environment variable to increase rate limits (60/hour → 5000/hour)") - - if wait_seconds > 0: - logger.info(f"âŗ Proactively sleeping for {wait_seconds} seconds until rate limit resets...") - time.sleep(wait_seconds) - print(f"✅ Rate limit should be reset now. Continuing...") + logger.error( + f"âš ī¸ GitHub API rate limit low: {remaining}/{limit} requests remaining. Resets at {reset_time}" + ) + print( + "💡 Tip: Set GITHUB_TOKEN environment variable to increase rate limits (60/hour → 5000/hour). Continuing without delay." + ) elif remaining < 100: logger.info(f"â„šī¸ GitHub API rate limit: {remaining}/{limit} requests remaining") @@ -210,24 +200,66 @@ def fetch_all_github_repos(github_url: str, max_repos: int = 100) -> List[Dict]: if status_code == 200: projects = [] for repo in repos_data: - if repo.get("fork") and repo.get("forks_count", 0) < 5: - continue - repo_name = repo.get("name") + if not repo_name: + continue + # Fetch contributors for user's fork (or original if not fork) contributors_data = fetch_repo_contributors(username, repo_name) contributor_count = len(contributors_data) - user_contributions, total_contributions = fetch_contributions_count( username, contributors_data ) + # Determine project type (consider upstream if fork) project_type = ( "open_source" if contributor_count > 1 else "self_project" ) + github_details = { + "stars": repo.get("stargazers_count", 0), + "forks": repo.get("forks_count", 0), + "language": repo.get("language"), + "description": repo.get("description"), + "created_at": repo.get("created_at"), + "updated_at": repo.get("updated_at"), + "topics": repo.get("topics", []), + "open_issues": repo.get("open_issues_count", 0), + "size": repo.get("size", 0), + "fork": repo.get("fork", False), + "archived": repo.get("archived", False), + "default_branch": repo.get("default_branch"), + "contributors": contributor_count, + } + + upstream_details = None + if repo.get("fork"): + # Fetch upstream parent for accurate stats (#155) and avoid skipping low-fork repos (#162) + upstream_api = f"https://api.github.com/repos/{username}/{repo_name}" + status_code, upstream_data = _fetch_github_api(upstream_api) + if status_code == 200 and isinstance(upstream_data, dict): + parent = upstream_data.get("parent") + if parent: + upstream_details = { + "name": parent.get("name"), + "owner": parent.get("owner", {}).get("login"), + "html_url": parent.get("html_url"), + "stars": parent.get("stargazers_count", 0), + "forks": parent.get("forks_count", 0), + "language": parent.get("language"), + "topics": parent.get("topics", []), + "description": parent.get("description"), + } + # Prefer upstream popularity metrics for evaluation + github_details["stars"] = upstream_details["stars"] + github_details["forks"] = upstream_details["forks"] + github_details["topics"] = upstream_details["topics"] + github_details["upstream_owner"] = upstream_details["owner"] + github_details["upstream_name"] = upstream_details["name"] + github_details["upstream_html_url"] = upstream_details["html_url"] + project = { - "name": repo.get("name"), + "name": repo_name, "description": repo.get("description"), "github_url": repo.get("html_url"), "live_url": repo.get("homepage") if repo.get("homepage") else None, @@ -238,21 +270,8 @@ def fetch_all_github_repos(github_url: str, max_repos: int = 100) -> List[Dict]: "contributor_count": contributor_count, "author_commit_count": user_contributions, "total_commit_count": total_contributions, - "github_details": { - "stars": repo.get("stargazers_count", 0), - "forks": repo.get("forks_count", 0), - "language": repo.get("language"), - "description": repo.get("description"), - "created_at": repo.get("created_at"), - "updated_at": repo.get("updated_at"), - "topics": repo.get("topics", []), - "open_issues": repo.get("open_issues_count", 0), - "size": repo.get("size", 0), - "fork": repo.get("fork", False), - "archived": repo.get("archived", False), - "default_branch": repo.get("default_branch"), - "contributors": contributor_count, - }, + "github_details": github_details, + "upstream_details": upstream_details, } projects.append(project) diff --git a/llm_utils.py b/llm_utils.py index 7e1d96d..a143b99 100644 --- a/llm_utils.py +++ b/llm_utils.py @@ -3,6 +3,8 @@ """ import logging +import json +import hashlib from typing import Any, Dict, Optional from models import ModelProvider, OllamaProvider, GeminiProvider from prompt import MODEL_PROVIDER_MAPPING, GEMINI_API_KEY @@ -37,6 +39,86 @@ def extract_json_from_response(response_text: str) -> str: return response_text +def _try_parse_json(text: str) -> Optional[str]: + """Attempt to parse JSON and return the canonical string if successful.""" + try: + obj = json.loads(text) + return json.dumps(obj, ensure_ascii=False) + except Exception: + return None + + +def ensure_valid_json( + response_text: str, + provider: Any = None, + model: str = None, + original_prompt: str = None, + max_repair_attempts: int = 2, +) -> str: + """Validate JSON; attempt lightweight repairs or LLM self-repair if needed. + + Strategy: + 1. Strip markdown fences / think tags (already handled outside). + 2. Trim to first/last brace. + 3. Try direct parse. + 4. If still failing and provider available, send a repair prompt asking ONLY for valid JSON. + 5. Return raw text if irreparable to allow upstream fallback handling. + """ + cleaned = response_text.strip() + + # Fast path + parsed = _try_parse_json(cleaned) + if parsed is not None: + return parsed + + # Attempt brace slicing + start = cleaned.find("{") + end = cleaned.rfind("}") + if start != -1 and end != -1 and end > start: + sliced = cleaned[start : end + 1] + parsed = _try_parse_json(sliced) + if parsed is not None: + return parsed + + # Attempt LLM repair + if provider and model: + repair_instruction = ( + "You previously returned malformed JSON. Return ONLY valid JSON for the same task. " + "No explanations, code fences, or commentary. If fields are missing, infer minimal plausible empty values." + ) + for attempt in range(max_repair_attempts): + try: + repair_messages = [ + {"role": "system", "content": repair_instruction}, + { + "role": "user", + "content": ( + "Original prompt:\n" + (original_prompt or "") + + "\nMalformed JSON response:\n" + cleaned + + "\nReturn ONLY repaired JSON now." + ), + }, + ] + # Low creativity for repair + repair_options = {"temperature": 0.0, "top_p": 0.9} + repair_resp = provider.chat( + model=model, + messages=repair_messages, + options=repair_options, + ) + candidate = extract_json_from_response( + repair_resp["message"]["content"] + ) + parsed = _try_parse_json(candidate) + if parsed is not None: + return parsed + except Exception as e: + logger.warning(f"JSON repair attempt {attempt+1} failed: {e}") + + # Return original cleaned text (upstream may log and skip) + return cleaned + + def initialize_llm_provider(model_name: str) -> Any: """ Initialize the appropriate LLM provider based on the model name. diff --git a/models.py b/models.py index e83779e..c9e9e9e 100644 --- a/models.py +++ b/models.py @@ -158,6 +158,8 @@ class Project(BaseModel): description: Optional[str] = None highlights: Optional[List[str]] = None url: Optional[str] = None + repo_url: Optional[str] = None + live_url: Optional[str] = None technologies: Optional[List[str]] = None skills: Optional[List[str]] = None @@ -198,6 +200,12 @@ class AwardsSection(BaseModel): awards: Optional[List[Award]] = None +class LanguagesSection(BaseModel): + """Language section containing a list of languages.""" + + languages: Optional[List[Language]] = None + + class JSONResume(BaseModel): """Complete JSON Resume format model.""" @@ -247,6 +255,7 @@ class EvaluationData(BaseModel): deductions: Deductions key_strengths: List[str] = Field(min_items=1, max_items=5) areas_for_improvement: List[str] = Field(min_items=1, max_items=5) + meta: Optional[Dict[str, Any]] = None # metadata: template hashes, model, provider, timestamps class GitHubProfile(BaseModel): diff --git a/pdf.py b/pdf.py index 296db47..666d3dc 100644 --- a/pdf.py +++ b/pdf.py @@ -3,6 +3,7 @@ import json import time import logging +import re import pymupdf from models import ( @@ -13,14 +14,20 @@ Skill, Project, Award, + Language, BasicsSection, WorkSection, EducationSection, SkillsSection, ProjectsSection, AwardsSection, + LanguagesSection, +) +from llm_utils import ( + initialize_llm_provider, + extract_json_from_response, + ensure_valid_json, ) -from llm_utils import initialize_llm_provider, extract_json_from_response from pymupdf_rag import to_markdown from typing import List, Optional, Dict, Any from prompt import ( @@ -37,9 +44,14 @@ class PDFHandler: - def __init__(self): + def __init__(self, max_workers: int = 3): self.template_manager = TemplateManager() self._initialize_llm_provider() + # Limit the concurrency for section extraction to reduce rate limits + try: + self.max_workers = int(max_workers) if max_workers and max_workers > 0 else 3 + except Exception: + self.max_workers = 3 def _initialize_llm_provider(self): """Initialize the appropriate LLM provider based on the model.""" @@ -103,33 +115,70 @@ def _call_llm_for_section( if return_model: kwargs["format"] = return_model.model_json_schema() - # Use the appropriate provider to make the API call - response = self.provider.chat(**chat_params, **kwargs) + # Retry logic for rate limits / transient failures + max_attempts = 3 + attempt = 0 + response_text = None + while attempt < max_attempts and response_text is None: + attempt += 1 + try: + response = self.provider.chat(**chat_params, **kwargs) + response_text = response["message"]["content"] + except Exception as e: + err_msg = str(e) + if "429" in err_msg or "quota" in err_msg.lower(): + # Parse suggested retry delay if present + retry_delay = 8 + m = re.search(r"retry in ([0-9]+(?:\.[0-9]+)?)s", err_msg) + if m: + try: + retry_delay = min(float(m.group(1)) + 1, 30) + except Exception: + pass + logger.warning( + f"âš ī¸ Rate limit for {section_name} (attempt {attempt}/{max_attempts}). Sleeping {retry_delay:.1f}s before retry." + ) + time.sleep(retry_delay) + continue + else: + logger.error( + f"❌ Non-retryable error extracting {section_name}: {e}" + ) + return None + + if response_text is None: + logger.error( + f"❌ Exhausted retries for {section_name} due to rate limits." + ) + return None - response_text = response["message"]["content"] + cleaned = extract_json_from_response(response_text) + repaired = ensure_valid_json( + cleaned, + provider=self.provider, + model=DEFAULT_MODEL, + original_prompt=prompt, + ) try: - response_text = extract_json_from_response(response_text) - json_start = response_text.find("{") - json_end = response_text.rfind("}") - if json_start != -1 and json_end != -1: - response_text = response_text[json_start : json_end + 1] - parsed_data = json.loads(response_text) + parsed_data = json.loads(repaired) logger.debug(f"✅ Successfully extracted {section_name} section") - - transformed_data = transform_parsed_data(parsed_data) - end_time = time.time() - total_time = end_time - start_time - logger.debug( - f"âąī¸ Total time for separate section extraction: {total_time:.2f} seconds" - ) - - return transformed_data except json.JSONDecodeError as e: - logger.error(f"❌ Error parsing JSON for {section_name} section: {e}") - logger.error(f"Raw response: {response_text}") + logger.error( + f"❌ Error parsing JSON for {section_name} section after repair attempts: {e}" + ) + logger.error(f"Raw repaired text: {repaired}") return None + transformed_data = transform_parsed_data(parsed_data) + end_time = time.time() + total_time = end_time - start_time + logger.debug( + f"âąī¸ Total time for separate section extraction: {total_time:.2f} seconds" + ) + + return transformed_data + except Exception as e: logger.error(f"❌ Error calling LLM for {section_name} section: {e}") return None @@ -190,6 +239,15 @@ def extract_awards_section(self, resume_text: str) -> Optional[Dict]: return None return self._call_llm_for_section("awards", resume_text, prompt, AwardsSection) + def extract_languages_section(self, resume_text: str) -> Optional[Dict]: + prompt = self.template_manager.render_template( + "languages", text_content=resume_text + ) + if not prompt: + logger.error("❌ Failed to render languages template") + return None + return self._call_llm_for_section("languages", resume_text, prompt, LanguagesSection) + def extract_json_from_text(self, resume_text: str) -> Optional[JSONResume]: try: return self._extract_all_sections_separately(resume_text) @@ -227,6 +285,7 @@ def _extract_section_data( "skills": self.extract_skills_section, "projects": self.extract_projects_section, "awards": self.extract_awards_section, + "languages": self.extract_languages_section, } if section_name not in section_extractors: @@ -269,7 +328,7 @@ def _extract_all_sections_separately( ) -> Optional[JSONResume]: start_time = time.time() - sections = ["basics", "work", "education", "skills", "projects", "awards"] + sections = ["basics", "work", "education", "skills", "projects", "awards", "languages"] complete_resume = { "basics": None, @@ -287,14 +346,156 @@ def _extract_all_sections_separately( "meta": None, } - for section_name in sections: - section_data = self._extract_section_data(text_content, section_name) + # Parallel extraction using threads (I/O bound network calls) + from concurrent.futures import ThreadPoolExecutor, as_completed - if section_data: - complete_resume.update(section_data) - logger.debug(f"✅ Successfully extracted {section_name} section") - else: - logger.error(f"âš ī¸ Failed to extract {section_name} section") + results = {} + # Constrain parallelism to reduce rate limit pressure + parallel_workers = self.max_workers + with ThreadPoolExecutor(max_workers=min(len(sections), parallel_workers)) as executor: + future_map = { + executor.submit(self._extract_section_data, text_content, section_name): section_name + for section_name in sections + } + for future in as_completed(future_map): + sec = future_map[future] + try: + section_data = future.result() + if section_data: + results[sec] = section_data + complete_resume.update(section_data) + logger.debug(f"✅ Successfully extracted {sec} section (parallel)") + else: + logger.error(f"âš ī¸ Failed to extract {sec} section") + except Exception as e: + logger.error(f"❌ Exception extracting {sec} section: {e}") + + # Fallback: if all sections failed in parallel, retry sequentially with small delay + if not results: + logger.warning("âš ī¸ Parallel extraction returned no sections. Retrying sequentially to mitigate rate limits.") + for sec in sections: + try: + section_data = self._extract_section_data(text_content, sec, None) + if section_data: + complete_resume.update(section_data) + results[sec] = section_data + logger.debug(f"✅ Sequentially extracted {sec} section") + else: + logger.error(f"âš ī¸ Sequential retry failed for {sec} section") + time.sleep(2) # gentle pacing to avoid hitting per-minute limits + except Exception as e: + logger.error(f"❌ Exception in sequential retry for {sec}: {e}") + else: + # Targeted retries for only the missing sections (avoid flooding API) + missing = [s for s in sections if complete_resume.get(s) is None] + if missing: + logger.warning(f"âš ī¸ Missing sections after parallel run: {missing}. Retrying them sequentially with pacing.") + for sec in missing: + try: + section_data = self._extract_section_data(text_content, sec, None) + if section_data: + complete_resume.update(section_data) + results[sec] = section_data + logger.debug(f"✅ Filled missing {sec} section via sequential retry") + else: + logger.error(f"âš ī¸ Sequential retry could not extract {sec} section") + time.sleep(2) + except Exception as e: + logger.error(f"❌ Exception retrying missing {sec} section: {e}") + + # Fallback heuristics for skills & projects if still None + def _simple_skill_extraction(text: str): + tech_keywords = [ + "python", + "java", + "javascript", + "typescript", + "react", + "node", + "django", + "flask", + "aws", + "docker", + "kubernetes", + "postgres", + "mysql", + "mongodb", + "git", + "linux", + "tensorflow", + "pytorch", + "llm", + ] + found = set() + lower = text.lower() + for kw in tech_keywords: + if kw in lower: + found.add(kw) + if not found: + return None + return { + "skills": [ + {"name": "Technologies", "level": None, "keywords": sorted(list(found))} + ] + } + + def _simple_projects_extraction(text: str): + # Use work highlights as proxy if present + projects = [] + if complete_resume.get("work") and isinstance(complete_resume["work"], list): + for w in complete_resume["work"]: + highlights = w.get("highlights") if isinstance(w, dict) else None + if highlights: + for h in highlights: + if any(word in h.lower() for word in ["developed", "built", "engineered", "implemented", "created"]): + projects.append({ + "name": h[:60] + ("..." if len(h) > 60 else ""), + "description": h, + "highlights": [h], + }) + if not projects: + return None + return {"projects": projects} + + if complete_resume.get("skills") is None: + fallback_skills = _simple_skill_extraction(text_content) + if fallback_skills: + complete_resume.update(fallback_skills) + logger.warning("âš ī¸ Applied heuristic fallback for skills section.") + if complete_resume.get("projects") is None: + fallback_projects = _simple_projects_extraction(text_content) + if fallback_projects: + complete_resume.update(fallback_projects) + logger.warning("âš ī¸ Applied heuristic fallback for projects section from work highlights.") + + # Heuristic fallback for spoken languages if missing + def _simple_languages_extraction(text: str): + try: + lines = [l.strip() for l in text.splitlines() if l.strip()] + langs = [] + for line in lines: + if line.lower().startswith("languages:") or line.lower().startswith("language:"): + content = line.split(":", 1)[1].strip() + parts = [p.strip() for p in re.split(r",|;", content) if p.strip()] + for p in parts: + # Match formats like "English (Professional)" or just "English" + m = re.match(r"^(.*?)\s*\((.*?)\)$", p) + if m: + langs.append({"language": m.group(1).strip(), "fluency": m.group(2).strip()}) + else: + langs.append({"language": p, "fluency": None}) + break + if langs: + return {"languages": langs} + except Exception: + return None + return None + + if complete_resume.get("languages") in (None, []): + fallback_langs = _simple_languages_extraction(text_content) + if fallback_langs: + complete_resume.update(fallback_langs) + logger.warning("âš ī¸ Applied heuristic fallback for languages section.") try: if complete_resume.get("basics") and isinstance( @@ -308,6 +509,14 @@ def _extract_all_sections_separately( json_resume = JSONResume(**complete_resume) + # If still completely empty, log explicit warning + all_empty = all( + getattr(json_resume, s, None) in (None, [], {}) + for s in ["basics", "work", "education", "skills", "projects", "awards"] + ) + if all_empty: + logger.warning("âš ī¸ Extraction produced an empty resume (all key sections None).") + end_time = time.time() total_time = end_time - start_time logger.info( diff --git a/prompts/template_manager.py b/prompts/template_manager.py index b68f680..3b7b8d0 100644 --- a/prompts/template_manager.py +++ b/prompts/template_manager.py @@ -41,6 +41,7 @@ def _load_templates(self): "skills": "skills.jinja", "projects": "projects.jinja", "awards": "awards.jinja", + "languages": "languages.jinja", "system_message": "system_message.jinja", "github_project_selection": "github_project_selection.jinja", "resume_evaluation_criteria": "resume_evaluation_criteria.jinja", @@ -57,6 +58,29 @@ def _load_templates(self): except Exception as e: print(f"❌ Error loading template {filename}: {e}") + def get_template_source(self, section_name: str) -> Optional[str]: + """Return raw template source for hashing/versioning.""" + if section_name not in self._templates: + return None + try: + # FileSystemLoader get_source returns (source, filename, uptodate) + loader = self.env.loader + if hasattr(loader, "get_source"): + source_tuple = loader.get_source(self.env, self._templates[section_name].name) + return source_tuple[0] + except Exception: + return None + return None + + def get_all_template_sources(self) -> dict: + """Return mapping of template name to source for all loaded templates.""" + sources = {} + for name in self._templates: + src = self.get_template_source(name) + if src is not None: + sources[name] = src + return sources + def get_available_sections(self) -> list: """ Get list of available section names. diff --git a/prompts/templates/basics.jinja b/prompts/templates/basics.jinja index d02320c..4adda27 100644 --- a/prompts/templates/basics.jinja +++ b/prompts/templates/basics.jinja @@ -6,7 +6,7 @@ Extract ONLY the basic information (name, email, phone, location, profiles) from --- The input resume markdown ends here --- -Return ONLY a JSON object with this structure: +Return ONLY a JSON object with this structure (fill fields ONLY if present in the resume; otherwise use null): { "basics": { "name": "Full name", @@ -15,8 +15,11 @@ Return ONLY a JSON object with this structure: "url": null, "summary": null, "location": { + "address": null, + "postalCode": null, "city": "City", - "countryCode": "Country code" + "countryCode": "Country code", + "region": null }, "profiles": [ { diff --git a/prompts/templates/languages.jinja b/prompts/templates/languages.jinja new file mode 100644 index 0000000..342bab0 --- /dev/null +++ b/prompts/templates/languages.jinja @@ -0,0 +1,23 @@ +Extract ONLY the spoken languages from this resume. + +--- The input resume markdown starts here --- + +{{ text_content }} + +--- The input resume markdown ends here --- + +Return ONLY a JSON object with this structure: +{ + "languages": [ + { + "language": "Language name", + "fluency": "Fluency level (e.g., Native, Professional, Intermediate)" + } + ] +} + +Rules: +- Only include languages explicitly present in the resume text. +- If fluency/level is not stated, set "fluency" to null. +- Do not invent or infer languages. +- Return valid JSON only; no extra commentary. diff --git a/prompts/templates/projects.jinja b/prompts/templates/projects.jinja index feea98a..bf1be56 100644 --- a/prompts/templates/projects.jinja +++ b/prompts/templates/projects.jinja @@ -11,11 +11,16 @@ Return ONLY a JSON object with this structure: "projects": [ { "name": "Project name", - "description": "Project description", - "url": "Project URL", + "description": "Short description in one sentence", + "repo_url": "GitHub/GitLab repository URL if present, else null", + "live_url": "Public live demo/hosted URL if present, else null", "technologies": ["Tech 1", "Tech 2"] } ] } +Guidelines: +- Prefer putting source code link in "repo_url" and hosted demo link (e.g. vercel.app, netlify.app, render.com, fly.dev, onrender.com, herokuapp.com) in "live_url". +- If both links exist, fill both; if only one exists, set the other to null. + **IMPORTANT**: Return ONLY valid JSON. Do not include any explanatory text. \ No newline at end of file diff --git a/score.py b/score.py index b0944dd..7a25b52 100644 --- a/score.py +++ b/score.py @@ -3,6 +3,7 @@ import json import logging import csv +import argparse from pdf import PDFHandler from github import fetch_and_display_github_info from models import JSONResume, EvaluationData @@ -17,6 +18,8 @@ convert_blog_data_to_text, ) from config import DEVELOPMENT_MODE +from prompts.template_manager import TemplateManager +import hashlib logger = logging.getLogger(__name__) @@ -197,7 +200,38 @@ def find_profile(profiles, network): ) -def main(pdf_path): +def _is_empty_resume(resume_data: JSONResume) -> bool: + if not resume_data: + return True + key_sections = [ + "basics", + "work", + "education", + "skills", + "projects", + "awards", + ] + for sec in key_sections: + val = getattr(resume_data, sec, None) + if val: + # If any section has content (dict/list/object), treat as non-empty + try: + if isinstance(val, (list, dict)) and len(val) > 0: + return False + # Basics is a pydantic model. If it has at least one non-null attribute -> non-empty + fields = getattr(val.__class__, "model_fields", None) + if fields: + for field_name in fields.keys(): + if getattr(val, field_name, None): + return False + except Exception: + pass + # Non-container truthy object + return False + return True + + +def main(pdf_path, force: bool = False, no_github: bool = False, max_workers: int = 3): # Create cache filename based on PDF path cache_filename = ( f"cache/resumecache_{os.path.basename(pdf_path).replace('.pdf', '')}.json" @@ -207,16 +241,40 @@ def main(pdf_path): ) # Check if cache exists and we're in development mode - if DEVELOPMENT_MODE and os.path.exists(cache_filename): + if not force and DEVELOPMENT_MODE and os.path.exists(cache_filename): print(f"Loading cached data from {cache_filename}") - cached_data = json.loads(Path(cache_filename).read_text()) - resume_data = JSONResume(**cached_data) + cached_raw = json.loads(Path(cache_filename).read_text()) + + # Validate cache metadata if present + use_cache = True + cache_meta = cached_raw.get("_cache_meta") + if cache_meta: + # Verify file hash + try: + with open(pdf_path, "rb") as f: + data = f.read() + file_hash = hashlib.md5(data).hexdigest() + if cache_meta.get("file_hash") != file_hash: + print("âš ī¸ Resume file changed since cache was written. Ignoring cached resume.") + use_cache = False + # Verify model/template + if cache_meta.get("model") != DEFAULT_MODEL: + print("âš ī¸ Model changed since cache was written. Ignoring cached resume.") + use_cache = False + except Exception: + use_cache = False + + if use_cache: + cached_data = cached_raw.get("data", cached_raw) + resume_data = JSONResume(**cached_data) + else: + resume_data = None else: logger.debug( f"Extracting data from PDF" + (" and caching to " + cache_filename if DEVELOPMENT_MODE else "") ) - pdf_handler = PDFHandler() + pdf_handler = PDFHandler(max_workers=max_workers) resume_data = pdf_handler.extract_json_from_pdf(pdf_path) if resume_data == None: @@ -224,17 +282,55 @@ def main(pdf_path): if DEVELOPMENT_MODE: os.makedirs(os.path.dirname(cache_filename), exist_ok=True) - Path(cache_filename).write_text( - json.dumps(resume_data.model_dump(), indent=2, ensure_ascii=False), - encoding='utf-8' - ) + # Write cache with metadata to allow validation later + tm = TemplateManager() + template_sources = tm.get_all_template_sources() + template_hashes = {name: hashlib.sha256(src.encode("utf-8")).hexdigest() for name, src in template_sources.items()} + with open(cache_filename, "w", encoding="utf-8") as fh: + wrapper = { + "_cache_meta": { + "file_hash": hashlib.md5(open(pdf_path, "rb").read()).hexdigest(), + "model": DEFAULT_MODEL, + "template_hashes": template_hashes, + }, + "data": resume_data.model_dump(), + } + fh.write(json.dumps(wrapper, indent=2, ensure_ascii=False)) # Check if cache exists and we're in development mode github_data = {} - if DEVELOPMENT_MODE and os.path.exists(github_cache_filename): + gh_cache_exists = os.path.exists(github_cache_filename) + use_gh_cache = (not force) and DEVELOPMENT_MODE and gh_cache_exists + if no_github: + print("Skipping GitHub fetch due to --no-github flag") + github_data = {} + elif use_gh_cache: print(f"Loading cached data from {github_cache_filename}") - github_data = json.loads(Path(github_cache_filename).read_text()) - else: + try: + cached_raw = json.loads(Path(github_cache_filename).read_text()) + except Exception as e: + print(f"âš ī¸ Failed to read GitHub cache: {e}. Will refetch.") + cached_raw = None + + cache_valid = False + if cached_raw: + cache_meta = cached_raw.get("_cache_meta") + if cache_meta and cache_meta.get("model") != DEFAULT_MODEL: + print("âš ī¸ GitHub cache model mismatch. Ignoring cached GitHub data.") + else: + candidate = cached_raw.get("data", cached_raw) + # Consider cache invalid if empty or missing profile/projects + if candidate and isinstance(candidate, dict): + total_projects = candidate.get("total_projects") + profile = candidate.get("profile") + has_username = bool(profile and profile.get("username")) + has_projects = isinstance(candidate.get("projects"), list) and len(candidate.get("projects")) > 0 + if has_username and (has_projects or (isinstance(total_projects, int) and total_projects > 0)): + github_data = candidate + cache_valid = True + if not cache_valid: + print("âš ī¸ GitHub cache is empty or invalid. Fetching fresh data...") + if (not no_github) and (not use_gh_cache or (use_gh_cache and not cache_valid)): print( f"Fetching GitHub data" + (" and caching to " + github_cache_filename if DEVELOPMENT_MODE else "") @@ -250,10 +346,43 @@ def main(pdf_path): github_data = fetch_and_display_github_info(github_profile.url) if DEVELOPMENT_MODE: os.makedirs(os.path.dirname(github_cache_filename), exist_ok=True) - Path(github_cache_filename).write_text( - json.dumps(github_data, indent=2, ensure_ascii=False), - encoding='utf-8' - ) + with open(github_cache_filename, "w", encoding="utf-8") as fh: + wrapper = { + "_cache_meta": { + "model": DEFAULT_MODEL, + }, + "data": github_data, + } + fh.write(json.dumps(wrapper, indent=2, ensure_ascii=False)) + + # If cached resume is empty, attempt a fresh extraction + if (force or _is_empty_resume(resume_data)): + if _is_empty_resume(resume_data): + print("âš ī¸ Cached resume appears empty. Attempting re-extraction...") + pdf_handler = PDFHandler(max_workers=max_workers) + fresh_resume = pdf_handler.extract_json_from_pdf(pdf_path) + if fresh_resume and not _is_empty_resume(fresh_resume): + resume_data = fresh_resume + if DEVELOPMENT_MODE: + try: + tm = TemplateManager() + template_sources = tm.get_all_template_sources() + template_hashes = {name: hashlib.sha256(src.encode("utf-8")).hexdigest() for name, src in template_sources.items()} + with open(cache_filename, "w", encoding="utf-8") as fh: + wrapper = { + "_cache_meta": { + "file_hash": hashlib.md5(open(pdf_path, "rb").read()).hexdigest(), + "model": DEFAULT_MODEL, + "template_hashes": template_hashes, + }, + "data": resume_data.model_dump(), + } + fh.write(json.dumps(wrapper, indent=2, ensure_ascii=False)) + print("✅ Re-extracted resume and updated cache.") + except Exception as e: + print(f"âš ī¸ Failed to update cache after re-extraction: {e}") + else: + print("❌ Re-extraction failed or still empty; proceeding with existing data.") score = _evaluate_resume(resume_data, github_data) @@ -297,13 +426,15 @@ def main(pdf_path): if __name__ == "__main__": - if len(sys.argv) < 2: - print("Usage: python score.py ") - exit(1) - pdf_path = sys.argv[1] - - if not os.path.exists(pdf_path): - print(f"Error: File '{pdf_path}' does not exist.") + parser = argparse.ArgumentParser(description="Evaluate a resume PDF and output scores.") + parser.add_argument("pdf_path", help="Path to the resume PDF file") + parser.add_argument("--force", action="store_true", help="Bypass caches and re-extract") + parser.add_argument("--no-github", action="store_true", help="Skip GitHub fetch and enrichment") + parser.add_argument("--max-workers", type=int, default=3, help="Max parallel section extractions (default: 3)") + args = parser.parse_args() + + if not os.path.exists(args.pdf_path): + print(f"Error: File '{args.pdf_path}' does not exist.") exit(1) - main(pdf_path) + main(args.pdf_path, force=args.force, no_github=args.no_github, max_workers=args.max_workers) diff --git a/transform.py b/transform.py index 25eab1d..b49af68 100644 --- a/transform.py +++ b/transform.py @@ -330,6 +330,8 @@ def transform_projects(projects_list: List) -> List[Dict]: if not skills and technologies: skills = technologies + repo_url = item.get("repo_url") or item.get("url") + live_url = item.get("live_url") transformed.append( { "name": item.get("name", ""), @@ -337,7 +339,9 @@ def transform_projects(projects_list: List) -> List[Dict]: "endDate": None, "description": item.get("description", ""), "highlights": [item.get("type", "")] if item.get("type") else [], - "url": item.get("url", None), + "url": repo_url or live_url, + "repo_url": repo_url, + "live_url": live_url, "technologies": technologies, "skills": skills, } @@ -393,6 +397,8 @@ def transform_projects_comprehensive(parsed_data: Dict) -> List[Dict]: skills = [skill.strip() for skill in skills_part.split(",")] item["name"] = name_parts[0].strip() + repo_url = item.get("repo_url") or item.get("url") + live_url = item.get("live_url") projects.append( { "name": item.get("name", ""), @@ -400,7 +406,9 @@ def transform_projects_comprehensive(parsed_data: Dict) -> List[Dict]: "endDate": None, "description": item.get("summary", ""), "highlights": [], - "url": item.get("url", None), + "url": repo_url or live_url, + "repo_url": repo_url, + "live_url": live_url, "technologies": item.get("technologies", []), "skills": skills, } @@ -823,8 +831,17 @@ def convert_json_resume_to_text(resume_data: JSONResume) -> str: text_parts.append(f" Period: {project.startDate} - {project.endDate}") if project.description: text_parts.append(f" Description: {project.description}") - if project.url: - text_parts.append(f" URL: {project.url}") + # Prefer separating Repo vs Live Demo for clarity in evaluation + repo_url = getattr(project, "repo_url", None) or ( + project.url if (project.url and "github.com" in project.url) else None + ) + live_url = getattr(project, "live_url", None) + if not live_url and project.url and not ("github.com" in project.url): + live_url = project.url + if repo_url: + text_parts.append(f" Repo URL: {repo_url}") + if live_url: + text_parts.append(f" Live Demo: {live_url}") if project.highlights: text_parts.append(" Highlights:") for highlight in project.highlights: @@ -910,7 +927,11 @@ def convert_github_data_to_text(github_data: dict) -> str: for i, project in enumerate(projects[:10], 1): github_text += f"{i}. {project.get('name', 'N/A')}\n" github_text += f" Description: {project.get('description', 'N/A')}\n" - github_text += f" URL: {project.get('github_url', 'N/A')}\n" + github_url = project.get('github_url') or 'N/A' + github_text += f" Repo URL: {github_url}\n" + live_url = project.get('live_url') + if live_url: + github_text += f" Live Demo: {live_url}\n" if "github_details" in project: details = project["github_details"] github_text += f" Stars: {details.get('stars', 'N/A')}\n"