Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
185 changes: 88 additions & 97 deletions pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def extract_text_from_pdf(self, pdf_path: str) -> Optional[str]:

doc = pymupdf.open(pdf_path)
pages = range(doc.page_count)
# We are calling the enhanced pymupdf_rag script here
resume_text = to_markdown(
doc,
pages=pages,
Expand All @@ -67,6 +68,8 @@ def extract_text_from_pdf(self, pdf_path: str) -> Optional[str]:
def _call_llm_for_section(
self, section_name: str, text_content: str, prompt: str, return_model=None
) -> Optional[Dict]:
# This function remains unchanged, as it correctly calls the LLM for a given piece of text.
# The change is that we will now pass it SMALLER, pre-separated chunks of text.
try:
start_time = time.time()
logger.debug(
Expand Down Expand Up @@ -103,9 +106,7 @@ def _call_llm_for_section(
if return_model:
kwargs["format"] = return_model.model_json_schema()

# Use the appropriate provider to make the API call
response = self.provider.chat(**chat_params, **kwargs)

response_text = response["message"]["content"]

try:
Expand All @@ -123,40 +124,30 @@ def _call_llm_for_section(
logger.debug(
f"⏱️ Total time for separate section extraction: {total_time:.2f} seconds"
)

return transformed_data
except json.JSONDecodeError as e:
logger.error(f"❌ Error parsing JSON for {section_name} section: {e}")
logger.error(f"Raw response: {response_text}")
return None

except Exception as e:
logger.error(f"❌ Error calling LLM for {section_name} section: {e}")
return None

# --- All the extract_*_section methods below remain unchanged ---
def extract_basics_section(self, resume_text: str) -> Optional[Dict]:
prompt = self.template_manager.render_template(
"basics", text_content=resume_text
)
if not prompt:
logger.error("❌ Failed to render basics template")
return None
return self._call_llm_for_section("basics", resume_text, prompt, BasicsSection)

def extract_work_section(self, resume_text: str) -> Optional[Dict]:
prompt = self.template_manager.render_template("work", text_content=resume_text)
if not prompt:
logger.error("❌ Failed to render work template")
return None
return self._call_llm_for_section("work", resume_text, prompt, WorkSection)

def extract_education_section(self, resume_text: str) -> Optional[Dict]:
prompt = self.template_manager.render_template(
"education", text_content=resume_text
)
if not prompt:
logger.error("❌ Failed to render education template")
return None
return self._call_llm_for_section(
"education", resume_text, prompt, EducationSection
)
Expand All @@ -165,18 +156,12 @@ def extract_skills_section(self, resume_text: str) -> Optional[Dict]:
prompt = self.template_manager.render_template(
"skills", text_content=resume_text
)
if not prompt:
logger.error("❌ Failed to render skills template")
return None
return self._call_llm_for_section("skills", resume_text, prompt, SkillsSection)

def extract_projects_section(self, resume_text: str) -> Optional[Dict]:
prompt = self.template_manager.render_template(
"projects", text_content=resume_text
)
if not prompt:
logger.error("❌ Failed to render projects template")
return None
return self._call_llm_for_section(
"projects", resume_text, prompt, ProjectsSection
)
Expand All @@ -185,37 +170,17 @@ def extract_awards_section(self, resume_text: str) -> Optional[Dict]:
prompt = self.template_manager.render_template(
"awards", text_content=resume_text
)
if not prompt:
logger.error("❌ Failed to render awards template")
return None
return self._call_llm_for_section("awards", resume_text, prompt, AwardsSection)

# --- All other top-level methods remain unchanged ---
def extract_json_from_text(self, resume_text: str) -> Optional[JSONResume]:
try:
return self._extract_all_sections_separately(resume_text)
except Exception as e:
logger.error(f"Error calling Ollama: {e}")
return None
return self._extract_all_sections_separately(resume_text)

def extract_json_from_pdf(self, pdf_path: str) -> Optional[JSONResume]:
try:
logger.debug(f"📄 Extracting text from PDF: {pdf_path}")
text_content = self.extract_text_from_pdf(pdf_path)

if not text_content:
logger.error("❌ Failed to extract text from PDF")
return None

logger.debug(
f"✅ Successfully extracted {len(text_content)} characters from PDF"
)

logger.debug("🔄 Extracting all sections separately...")
return self._extract_all_sections_separately(text_content)

except Exception as e:
logger.error(f"❌ Error during PDF to JSON extraction: {e}")
text_content = self.extract_text_from_pdf(pdf_path)
if not text_content:
return None
return self._extract_all_sections_separately(text_content)

def _extract_section_data(
self, text_content: str, section_name: str, return_model=None
Expand All @@ -228,85 +193,111 @@ def _extract_section_data(
"projects": self.extract_projects_section,
"awards": self.extract_awards_section,
}
if section_name in section_extractors:
return section_extractors[section_name](text_content)
return None

if section_name not in section_extractors:
logger.error(f"❌ Invalid section name: {section_name}")
logger.error(f"Valid sections: {list(section_extractors.keys())}")
return None

return section_extractors[section_name](text_content)
# --- UPGRADE #1: ADD THE NEW HELPER FUNCTION ---
# This is now properly indented to be a method of the PDFHandler class.
def _split_markdown_by_headers(self, markdown_text: str) -> dict:
"""
Splits a markdown string into a dictionary based on H2 headers.
This is a robust, deterministic way to parse the resume structure.
"""
sections = {}
lines = markdown_text.strip().split("\n")

current_header_key = "basics"
current_content = []

known_headers = {
"academic details": "education",
"work experience": "work",
"projects": "projects",
"technical skills": "skills",
"relevant courses": "skills", # Group courses with skills
"achievements": "awards",
}

def _extract_single_section(
self, text_content: str, section_name: str, return_model=None
) -> Optional[Dict]:
section_data = self._extract_section_data(
text_content, section_name, return_model
)
if section_data:
complete_resume = {
"basics": None,
"work": None,
"volunteer": None,
"education": None,
"awards": None,
"certificates": None,
"publications": None,
"skills": None,
"languages": None,
"interests": None,
"references": None,
"projects": None,
"meta": None,
}
for line in lines:
if line.strip().startswith("##"):
cleaned_line = line.strip().replace("##", "").strip().lower()

if cleaned_line in known_headers:
# Save the content of the previous section
if current_header_key and current_content:
sections[current_header_key] = "\n".join(
current_content
).strip()

# Start the new section
current_header_key = known_headers[cleaned_line]
current_content = []
else:
# If it's a header we don't recognize, treat it as content
current_content.append(line)
else:
current_content.append(line)

complete_resume.update(section_data)
return complete_resume
# Save the very last section
if current_header_key and current_content:
sections[current_header_key] = "\n".join(current_content).strip()

return None
return sections

# --- UPGRADE #2: REPLACE THE OLD, INEFFICIENT FUNCTION ---
def _extract_all_sections_separately(
self, text_content: str
) -> Optional[JSONResume]:
start_time = time.time()

sections = ["basics", "work", "education", "skills", "projects", "awards"]
# Step 1: Reliably split the document into sections using our new helper function.
sectioned_text = self._split_markdown_by_headers(text_content)

complete_resume = {
"basics": None,
"work": None,
"volunteer": None,
"education": None,
"awards": None,
"certificates": None,
"publications": None,
"skills": None,
"languages": None,
"interests": None,
"references": None,
"projects": None,
"meta": None,
}

for section_name in sections:
section_data = self._extract_section_data(text_content, section_name)

if section_data:
complete_resume.update(section_data)
logger.debug(f"✅ Successfully extracted {section_name} section")
else:
logger.error(f"⚠️ Failed to extract {section_name} section")
# Step 2: Loop through the pre-separated sections and send them to the LLM for analysis.
for section_name, section_content in sectioned_text.items():
if section_name in complete_resume and section_content:
# Pass the specific section_content, not the whole resume text.
section_data = self._extract_section_data(section_content, section_name)

if section_data:
# If a section already has data (e.g. skills + courses), merge them
if complete_resume.get(section_name):
# This is a simplified merge, more complex logic can be added if needed
if isinstance(complete_resume[section_name], list):
complete_resume[section_name].extend(
section_data.get(section_name, [])
)
else:
complete_resume.update(section_data)
logger.debug(f"✅ Successfully extracted {section_name} section")
else:
logger.error(
f"⚠️ Failed to extract {section_name} section using LLM"
)

try:
if complete_resume.get("basics") and isinstance(
complete_resume["basics"], dict
):
try:
complete_resume["basics"] = Basics(**complete_resume["basics"])
except Exception as e:
logger.error(f"❌ Error creating Basics object: {e}")
complete_resume["basics"] = None
complete_resume["basics"] = Basics(**complete_resume["basics"])

# Filter out keys not expected by JSONResume before creating the object
valid_keys = JSONResume.model_fields.keys()
filtered_resume_data = {
k: v for k, v in complete_resume.items() if k in valid_keys
}

json_resume = JSONResume(**complete_resume)
json_resume = JSONResume(**filtered_resume_data)

end_time = time.time()
total_time = end_time - start_time
Expand Down
Loading