interviewstreet · abhiram050904 · Nov 16, 2025
diff --git a/.env.example b/.env.example
@@ -1,11 +1,16 @@
 # LLM Provider Configuration
 # Options: "ollama" or "gemini"
-LLM_PROVIDER=ollama
+LLM_PROVIDER=gemini
 
 # Default model to use
 # For Ollama: "gemma3:4b", "qwen3:4b", "mistral:7b", etc.
 # For Gemini: "gemini-2.5-pro", "gemini-2.5-flash", etc.
-DEFAULT_MODEL=gemma3:4b
+DEFAULT_MODEL=gemini-2.5-flash
 
 # Google Gemini API Key (required if using Gemini provider)
-GEMINI_API_KEY=your_gemini_api_key_here
+# Example: GEMINI_API_KEY=your_gemini_api_key_here
+GEMINI_API_KEY=
+
+# Optional: Personal access token to increase GitHub API rate limits
+# Example: GITHUB_TOKEN=ghp_xxx
+GITHUB_TOKEN=
diff --git a/.gitignore b/.gitignore
@@ -7,6 +7,9 @@ test_*.py
 cache/
 resume_evaluations.csv
 greenhouse_resumes/*
+*.pdf
+.venv_win/
+python-*.exe
 
 # Byte-compiled / optimized / DLL files
 __pycache__/

diff --git a/README.md b/README.md
@@ -215,6 +215,12 @@ What happens:
 2. If a GitHub profile is found in the resume, repositories are fetched and cached to `cache/githubcache_<basename>.json`.
 3. The evaluator prints a report and, in development mode, appends a CSV row to `resume_evaluations.csv`.
 
+### Flags
+
+- `--force`: bypass caches and fully re-extract from the PDF.
+- `--no-github`: skip GitHub enrichment (useful when rate-limited or offline).
+- `--max-workers N`: control parallel section extraction (default: 3). Lower it if the LLM provider returns HTTP 429 (rate-limit) errors.
+
 ---
 
 ## Directory layout

diff --git a/evaluator.py b/evaluator.py
@@ -1,7 +1,12 @@
 from typing import Dict, List, Optional, Tuple, Any
+import hashlib
 from pydantic import BaseModel, Field, field_validator
 from models import JSONResume, EvaluationData
-from llm_utils import initialize_llm_provider, extract_json_from_response
+from llm_utils import (
+    initialize_llm_provider,
+    extract_json_from_response,
+    ensure_valid_json,
+)
 import logging
 import json
 import re
@@ -78,12 +83,39 @@ def evaluate_resume(self, resume_text: str) -> EvaluationData:
             response = self.provider.chat(**chat_params, **kwargs)
 
             response_text = response["message"]["content"]
-            response_text = extract_json_from_response(response_text)
-            logger.error(f"🔤 Prompt response: {response_text}")
+            cleaned_text = extract_json_from_response(response_text)
+            repaired_json_str = ensure_valid_json(
+                cleaned_text,
+                provider=self.provider,
+                model=self.model_name,
+                original_prompt=full_prompt,
+            )
+            logger.error(f"🔤 Prompt response: {repaired_json_str}")
+
+            try:
+                evaluation_dict = json.loads(repaired_json_str)
+            except Exception as e:
+                logger.error(f"Failed to parse evaluation JSON after repair attempts: {e}")
+                raise
 
-            evaluation_dict = json.loads(response_text)
             evaluation_data = EvaluationData(**evaluation_dict)
 
+            # Attach prompt/version metadata
+            template_sources = self.template_manager.get_all_template_sources()
+            template_hashes = {
+                name: hashlib.sha256(src.encode("utf-8")).hexdigest()
+                for name, src in template_sources.items()
+            }
+            evaluation_data.meta = {
+                "model": self.model_name,
+                "provider": MODEL_PROVIDER_MAPPING.get(self.model_name, None).value
+                if MODEL_PROVIDER_MAPPING.get(self.model_name, None)
+                else None,
+                "template_hashes": template_hashes,
+                "temperature": self.model_params.get("temperature"),
+                "top_p": self.model_params.get("top_p"),
+            }
+
             return evaluation_data
 
         except Exception as e:

diff --git a/github.py b/github.py
@@ -57,23 +57,13 @@ def _fetch_github_api(api_url, params=None):
         # Log rate limit information and handle proactively
         if remaining < 10 and rate_limit_reset:
             reset_timestamp = int(rate_limit_reset)
-            current_timestamp = int(time.time())
-            wait_seconds = max(0, reset_timestamp - current_timestamp) + 5  # Add 5 second buffer
             reset_time = datetime.datetime.fromtimestamp(reset_timestamp)
-
-            # Cap maximum wait time at 1 hour
-            max_wait = 3600
-            if wait_seconds > max_wait:
-                print(f"⚠️  Rate limit reset time is too far in the future ({wait_seconds}s). Capping wait to {max_wait}s")
-                wait_seconds = max_wait
-
-            logger.error(f"⚠️  GitHub API rate limit low: {remaining}/{limit} requests remaining. Resets at {reset_time}")
-            print(f"💡 Tip: Set GITHUB_TOKEN environment variable to increase rate limits (60/hour → 5000/hour)")
-
-            if wait_seconds > 0:
-                logger.info(f"⏳ Proactively sleeping for {wait_seconds} seconds until rate limit resets...")
-                time.sleep(wait_seconds)
-                print(f"✅ Rate limit should be reset now. Continuing...")
+            logger.error(
+                f"⚠️  GitHub API rate limit low: {remaining}/{limit} requests remaining. Resets at {reset_time}"
+            )
+            print(
+                "💡 Tip: Set GITHUB_TOKEN environment variable to increase rate limits (60/hour → 5000/hour). Continuing without delay."
+            )
         elif remaining < 100:
             logger.info(f"ℹ️  GitHub API rate limit: {remaining}/{limit} requests remaining")
 
@@ -210,24 +200,66 @@ def fetch_all_github_repos(github_url: str, max_repos: int = 100) -> List[Dict]:
         if status_code == 200:
             projects = []
             for repo in repos_data:
-                if repo.get("fork") and repo.get("forks_count", 0) < 5:
-                    continue
-
                 repo_name = repo.get("name")
+                if not repo_name:
+                    continue
 
+                # Fetch contributors for user's fork (or original if not fork)
                 contributors_data = fetch_repo_contributors(username, repo_name)
                 contributor_count = len(contributors_data)
-
                 user_contributions, total_contributions = fetch_contributions_count(
                     username, contributors_data
                 )
 
+                # Determine project type (consider upstream if fork)
                 project_type = (
                     "open_source" if contributor_count > 1 else "self_project"
                 )
 
+                github_details = {
+                    "stars": repo.get("stargazers_count", 0),
+                    "forks": repo.get("forks_count", 0),
+                    "language": repo.get("language"),
+                    "description": repo.get("description"),
+                    "created_at": repo.get("created_at"),
+                    "updated_at": repo.get("updated_at"),
+                    "topics": repo.get("topics", []),
+                    "open_issues": repo.get("open_issues_count", 0),
+                    "size": repo.get("size", 0),
+                    "fork": repo.get("fork", False),
+                    "archived": repo.get("archived", False),
+                    "default_branch": repo.get("default_branch"),
+                    "contributors": contributor_count,
+                }
+
+                upstream_details = None
+                if repo.get("fork"):
+                    # Fetch upstream parent for accurate stats (#155) and avoid skipping low-fork repos (#162)
+                    upstream_api = f"https://api.github.com/repos/{username}/{repo_name}"
+                    status_code, upstream_data = _fetch_github_api(upstream_api)
+                    if status_code == 200 and isinstance(upstream_data, dict):
+                        parent = upstream_data.get("parent")
+                        if parent:
+                            upstream_details = {
+                                "name": parent.get("name"),
+                                "owner": parent.get("owner", {}).get("login"),
+                                "html_url": parent.get("html_url"),
+                                "stars": parent.get("stargazers_count", 0),
+                                "forks": parent.get("forks_count", 0),
+                                "language": parent.get("language"),
+                                "topics": parent.get("topics", []),
+                                "description": parent.get("description"),
+                            }
+                            # Prefer upstream popularity metrics for evaluation
+                            github_details["stars"] = upstream_details["stars"]
+                            github_details["forks"] = upstream_details["forks"]
+                            github_details["topics"] = upstream_details["topics"]
+                            github_details["upstream_owner"] = upstream_details["owner"]
+                            github_details["upstream_name"] = upstream_details["name"]
+                            github_details["upstream_html_url"] = upstream_details["html_url"]
+
                 project = {
-                    "name": repo.get("name"),
+                    "name": repo_name,
                     "description": repo.get("description"),
                     "github_url": repo.get("html_url"),
                     "live_url": repo.get("homepage") if repo.get("homepage") else None,
@@ -238,21 +270,8 @@ def fetch_all_github_repos(github_url: str, max_repos: int = 100) -> List[Dict]:
                     "contributor_count": contributor_count,
                     "author_commit_count": user_contributions,
                     "total_commit_count": total_contributions,
-                    "github_details": {
-                        "stars": repo.get("stargazers_count", 0),
-                        "forks": repo.get("forks_count", 0),
-                        "language": repo.get("language"),
-                        "description": repo.get("description"),
-                        "created_at": repo.get("created_at"),
-                        "updated_at": repo.get("updated_at"),
-                        "topics": repo.get("topics", []),
-                        "open_issues": repo.get("open_issues_count", 0),
-                        "size": repo.get("size", 0),
-                        "fork": repo.get("fork", False),
-                        "archived": repo.get("archived", False),
-                        "default_branch": repo.get("default_branch"),
-                        "contributors": contributor_count,
-                    },
+                    "github_details": github_details,
+                    "upstream_details": upstream_details,
                 }
                 projects.append(project)
 

diff --git a/llm_utils.py b/llm_utils.py
@@ -3,6 +3,8 @@
 """
 
 import logging
+import json
+import hashlib
 from typing import Any, Dict, Optional
 from models import ModelProvider, OllamaProvider, GeminiProvider
 from prompt import MODEL_PROVIDER_MAPPING, GEMINI_API_KEY
@@ -37,6 +39,86 @@ def extract_json_from_response(response_text: str) -> str:
     return response_text
 
 
+def _try_parse_json(text: str) -> Optional[str]:
+    """Return text re-serialized with json.dumps (ensure_ascii=False) if it parses as JSON, else None."""
+    try:
+        obj = json.loads(text)
+        return json.dumps(obj, ensure_ascii=False)
+    except Exception:
+        return None
+
+
+def ensure_valid_json(
+    response_text: str,
+    provider: Any = None,
+    model: str = None,
+    original_prompt: str = None,
+    max_repair_attempts: int = 2,
+) -> str:
+    """Validate JSON; attempt lightweight repairs or LLM self-repair if needed.
+
+    Strategy, in the order the code applies it (fence / think-tag stripping is handled outside):
+    1. Strip whitespace and try a direct parse; on success return the re-serialized JSON.
+    2. Trim to the first opening brace / last closing brace and parse that slice.
+    3. If both provider and model are given, send a repair prompt asking ONLY for valid JSON,
+       up to max_repair_attempts times; exceptions during an attempt are logged as warnings, not raised.
+    4. Return the stripped input text if irreparable, so upstream can log or handle the failure.
+    """
+    cleaned = response_text.strip()
+
+    # Fast path: the stripped input already parses as JSON
+    parsed = _try_parse_json(cleaned)
+    if parsed is not None:
+        return parsed
+
+    # Attempt brace slicing: drop any text before the first "{" and after the last "}"
+    start = cleaned.find("{")
+    end = cleaned.rfind("}")
+    if start != -1 and end != -1 and end > start:
+        sliced = cleaned[start : end + 1]
+        parsed = _try_parse_json(sliced)
+        if parsed is not None:
+            return parsed
+
+    # Attempt LLM repair (skipped entirely when provider or model is falsy)
+    if provider and model:
+        repair_instruction = (
+            "You previously returned malformed JSON. Return ONLY valid JSON for the same task. "
+            "No explanations, code fences, or commentary. If fields are missing, infer minimal plausible empty values." 
+        )
+        for attempt in range(max_repair_attempts):
+            try:
+                repair_messages = [
+                    {"role": "system", "content": repair_instruction},
+                    {
+                        "role": "user",
+                        "content": (
+                            "Original prompt:\n" + (original_prompt or "<none>") +
+                            "\nMalformed JSON response:\n" + cleaned +
+                            "\nReturn ONLY repaired JSON now."
+                        ),
+                    },
+                ]
+                # Low creativity for repair: temperature 0.0 with top_p 0.9
+                repair_options = {"temperature": 0.0, "top_p": 0.9}
+                repair_resp = provider.chat(
+                    model=model,
+                    messages=repair_messages,
+                    options=repair_options,
+                )
+                candidate = extract_json_from_response(
+                    repair_resp["message"]["content"]
+                )
+                parsed = _try_parse_json(candidate)
+                if parsed is not None:
+                    return parsed
+            except Exception as e:
+                logger.warning(f"JSON repair attempt {attempt+1} failed: {e}")
+
+    # Return the stripped, still-unparseable text unchanged (upstream may log and skip)
+    return cleaned
+
+
 def initialize_llm_provider(model_name: str) -> Any:
     """
     Initialize the appropriate LLM provider based on the model name.

diff --git a/models.py b/models.py
@@ -158,6 +158,8 @@ class Project(BaseModel):
     description: Optional[str] = None
     highlights: Optional[List[str]] = None
     url: Optional[str] = None
+    repo_url: Optional[str] = None
+    live_url: Optional[str] = None
     technologies: Optional[List[str]] = None
     skills: Optional[List[str]] = None
 
@@ -198,6 +200,12 @@ class AwardsSection(BaseModel):
     awards: Optional[List[Award]] = None
 
 
+class LanguagesSection(BaseModel):
+    """Languages section wrapping an optional list of Language entries (defaults to None)."""
+
+    languages: Optional[List[Language]] = None
+
+
 class JSONResume(BaseModel):
     """Complete JSON Resume format model."""
 
@@ -247,6 +255,7 @@ class EvaluationData(BaseModel):
     deductions: Deductions
     key_strengths: List[str] = Field(min_items=1, max_items=5)
     areas_for_improvement: List[str] = Field(min_items=1, max_items=5)
+    meta: Optional[Dict[str, Any]] = None  # metadata: template hashes, model, provider, timestamps
 
 
 class GitHubProfile(BaseModel):