From 47ce9ce333ce5f31750f4c58c5fba0a1b9112f4f Mon Sep 17 00:00:00 2001 From: Farnaz Kohankhaki Date: Thu, 29 Jan 2026 11:04:19 -0800 Subject: [PATCH 1/4] fix: resolve mypy type errors in wikipedia files - Add Optional types for nullable parameters - Add type annotations for class attributes and local variables - Fix BeautifulSoup Tag type narrowing with isinstance checks - Fix generator type issues in sum() calls - Add null checks for llm_model before async calls --- wikipedia/static_vs_generated.py | 260 +++++++++++++-------- wikipedia/wiki_vs_generated.py | 323 +++++++++++++++----------- wikipedia/wikipedia_scraper.py | 379 +++++++++++++++++++------------ 3 files changed, 595 insertions(+), 367 deletions(-) diff --git a/wikipedia/static_vs_generated.py b/wikipedia/static_vs_generated.py index 8440d61d..5b806b16 100644 --- a/wikipedia/static_vs_generated.py +++ b/wikipedia/static_vs_generated.py @@ -1,12 +1,12 @@ """Area categorization system for static datasets using LLM with resume capability.""" +import glob import json import logging import os -import glob -from typing import List, Dict, Any, Tuple -from dataclasses import dataclass from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple import hydra from omegaconf import DictConfig @@ -17,12 +17,14 @@ get_area_categorization_prompt, ) + logger = logging.getLogger(__name__) @dataclass class CapabilityInfo: """Data class to hold capability information.""" + name: str description: str area: str @@ -32,6 +34,7 @@ class CapabilityInfo: @dataclass class AreaInfo: """Data class to hold area information.""" + name: str capabilities: List[CapabilityInfo] @@ -39,29 +42,31 @@ class AreaInfo: class DatasetQuestionCategorizer: """Class to categorize questions from selected dataset using two-step LLM approach.""" - def __init__(self, cfg: DictConfig): + def __init__(self, cfg: DictConfig) -> None: self.cfg = cfg - self.areas = [] - self.capabilities_by_area = {} - self.llm_model = None + self.areas: List[AreaInfo] = [] + self.capabilities_by_area: Dict[str, List[CapabilityInfo]] = {} + self.llm_model: Optional[Model] = None # Initialize LLM model try: self.llm_model = Model( model_name=cfg.llm_cfg.model_name, - model_provider=cfg.llm_cfg.model_provider + model_provider=cfg.llm_cfg.model_provider, ) logger.info(f"Initialized LLM model: {cfg.llm_cfg.model_name}") except Exception as e: logger.error(f"Failed to initialize LLM model: {e}") raise e - def extract_areas_and_capabilities_from_generated(self, generated_dir: str) -> Tuple[List[AreaInfo], Dict[str, List[CapabilityInfo]]]: + def extract_areas_and_capabilities_from_generated( + self, generated_dir: str + ) -> Tuple[List[AreaInfo], Dict[str, List[CapabilityInfo]]]: """Extract all areas and capabilities from the generated capabilities directory.""" logger.info("Extracting areas and capabilities from generated capabilities...") - areas = [] - capabilities_by_area = {} + areas: List[AreaInfo] = [] + capabilities_by_area: Dict[str, List[CapabilityInfo]] = {} # Get all capability directories (handle nested structure like math//) capability_dirs = glob.glob(os.path.join(generated_dir, "*/")) @@ -71,14 +76,16 @@ def extract_areas_and_capabilities_from_generated(self, generated_dir: str) -> T capability_json_path = os.path.join(cap_dir, "capability.json") if os.path.exists(capability_json_path): try: - with open(capability_json_path, 'r') as f: + with open(capability_json_path, "r") as f: cap_data = json.load(f) - print(f"Loaded capability data: {cap_data.get('capability_name', ''), cap_data.get('capability_description', ''), cap_data.get('capability_area', 'Unknown')}") + print( + f"Loaded capability data: {cap_data.get('capability_name', ''), cap_data.get('capability_description', ''), cap_data.get('capability_area', 'Unknown')}" + ) capability = CapabilityInfo( - name=cap_data.get('capability_name', ''), - description=cap_data.get('capability_description', ''), - area=cap_data.get('capability_area', 'Unknown') + name=cap_data.get("capability_name", ""), + description=cap_data.get("capability_description", ""), + area=cap_data.get("capability_area", "Unknown"), ) # Group capabilities by area @@ -87,22 +94,31 @@ def extract_areas_and_capabilities_from_generated(self, generated_dir: str) -> T capabilities_by_area[capability.area].append(capability) except Exception as e: - logger.warning(f"Error loading capability from {capability_json_path}: {e}") + logger.warning( + f"Error loading capability from {capability_json_path}: {e}" + ) # Create area objects for area_name, capabilities in capabilities_by_area.items(): area = AreaInfo(name=area_name, capabilities=capabilities) areas.append(area) - logger.info(f"Extracted {len(areas)} areas with {sum(len(caps) for caps in capabilities_by_area.values())} total capabilities") + total_caps = sum(len(caps) for caps in capabilities_by_area.values()) + logger.info( + f"Extracted {len(areas)} areas with {total_caps} total capabilities" + ) return areas, capabilities_by_area - def extract_areas_and_capabilities_from_wikipedia(self, wikipedia_dir: str) -> Tuple[List[AreaInfo], Dict[str, List[CapabilityInfo]]]: + def extract_areas_and_capabilities_from_wikipedia( + self, wikipedia_dir: str + ) -> Tuple[List[AreaInfo], Dict[str, List[CapabilityInfo]]]: """Extract all areas and capabilities from the Wikipedia pages directory containing individual JSON files.""" - logger.info(f"Extracting areas and capabilities from Wikipedia pages directory: {wikipedia_dir}") + logger.info( + f"Extracting areas and capabilities from Wikipedia pages directory: {wikipedia_dir}" + ) - areas = [] - capabilities_by_area = {} + areas: List[AreaInfo] = [] + capabilities_by_area: Dict[str, List[CapabilityInfo]] = {} try: # Get all JSON files in the Wikipedia pages directory @@ -111,20 +127,20 @@ def extract_areas_and_capabilities_from_wikipedia(self, wikipedia_dir: str) -> T for json_file in json_files: try: - with open(json_file, 'r', encoding='utf-8') as f: + with open(json_file, "r", encoding="utf-8") as f: wikipedia_data = json.load(f) # Extract capability information from the individual JSON file - capability_name = wikipedia_data.get('capability_name', '') - description = wikipedia_data.get('description', '') - area = wikipedia_data.get('area', 'Unknown') + capability_name = wikipedia_data.get("capability_name", "") + description = wikipedia_data.get("description", "") + area = wikipedia_data.get("area", "Unknown") if capability_name and description: capability = CapabilityInfo( name=capability_name, description=description, area=area, - domain='math' + domain="math", ) # Group capabilities by area @@ -133,7 +149,9 @@ def extract_areas_and_capabilities_from_wikipedia(self, wikipedia_dir: str) -> T capabilities_by_area[area].append(capability) except Exception as e: - logger.warning(f"Error loading Wikipedia capability from {json_file}: {e}") + logger.warning( + f"Error loading Wikipedia capability from {json_file}: {e}" + ) continue # Create area objects @@ -141,30 +159,42 @@ def extract_areas_and_capabilities_from_wikipedia(self, wikipedia_dir: str) -> T if capabilities: # Only create areas that have capabilities area = AreaInfo(name=area_name, capabilities=capabilities) areas.append(area) - logger.info(f"Loaded area '{area_name}' with {len(capabilities)} capabilities") + logger.info( + f"Loaded area '{area_name}' with {len(capabilities)} capabilities" + ) - logger.info(f"Extracted {len(areas)} areas with {sum(len(caps) for caps in capabilities_by_area.values())} total capabilities from Wikipedia pages") + total_caps = sum(len(caps) for caps in capabilities_by_area.values()) + logger.info( + f"Extracted {len(areas)} areas with {total_caps} total capabilities from Wikipedia pages" + ) except Exception as e: - logger.error(f"Error loading Wikipedia capabilities from {wikipedia_dir}: {e}") + logger.error( + f"Error loading Wikipedia capabilities from {wikipedia_dir}: {e}" + ) raise e return areas, capabilities_by_area - def extract_areas_and_capabilities(self, generated_dir: str = None, wikipedia_dir: str = None) -> Tuple[List[AreaInfo], Dict[str, List[CapabilityInfo]]]: + def extract_areas_and_capabilities( + self, generated_dir: Optional[str] = None, wikipedia_dir: Optional[str] = None + ) -> Tuple[List[AreaInfo], Dict[str, List[CapabilityInfo]]]: """Extract areas and capabilities using the configured method.""" - extraction_method = getattr(self.cfg, 'categorization_cfg', {}).get('extraction_method', 'generated') + extraction_method = getattr(self.cfg, "categorization_cfg", {}).get( + "extraction_method", "generated" + ) - if extraction_method == 'wikipedia': + if extraction_method == "wikipedia": if not wikipedia_dir: wikipedia_dir = self.cfg.data_cfg.wikipedia_dir return self.extract_areas_and_capabilities_from_wikipedia(wikipedia_dir) - elif extraction_method == 'generated': + if extraction_method == "generated": if not generated_dir: generated_dir = self.cfg.data_cfg.generated_dir return self.extract_areas_and_capabilities_from_generated(generated_dir) - else: - raise ValueError(f"Unknown extraction method: {extraction_method}. Must be 'generated' or 'wikipedia'") + raise ValueError( + f"Unknown extraction method: {extraction_method}. Must be 'generated' or 'wikipedia'" + ) @staticmethod def _normalize_text(text: str) -> str: @@ -179,17 +209,23 @@ def _find_matching_area_key(self, predicted_area: str) -> str: return predicted_area # Try normalized match + area_key: str for area_key in self.capabilities_by_area.keys(): if self._normalize_text(area_key) == predicted_normalized: return area_key # Try partial match (contains) for area_key in self.capabilities_by_area.keys(): - if predicted_normalized in self._normalize_text(area_key) or self._normalize_text(area_key) in predicted_normalized: + if ( + predicted_normalized in self._normalize_text(area_key) + or self._normalize_text(area_key) in predicted_normalized + ): return area_key # If no match found, return the original prediction - logger.warning(f"No matching area key found for '{predicted_area}'. Available keys: {list(self.capabilities_by_area.keys())}") + logger.warning( + f"No matching area key found for '{predicted_area}'. Available keys: {list(self.capabilities_by_area.keys())}" + ) return predicted_area @classmethod @@ -216,24 +252,25 @@ def _select_best_match(cls, response_text: str, allowed_names: List[str]) -> str # 2. Substring match (contains or starts with) for norm_name, original_name in normalized_names.items(): - if (normalized_response.startswith(norm_name) or - norm_name in normalized_response): + if ( + normalized_response.startswith(norm_name) + or norm_name in normalized_response + ): return original_name # 3. No match found return "Unknown" - def load_gsm8k_questions(self, jsonl_path: str) -> List[Dict[str, Any]]: """Load GSM8K questions from JSONL file.""" logger.info(f"Loading GSM8K questions from {jsonl_path}...") questions = [] - with open(jsonl_path, 'r') as f: + with open(jsonl_path, "r") as f: for line_num, line in enumerate(f, 1): try: question_data = json.loads(line.strip()) - question_data['line_number'] = line_num + question_data["line_number"] = line_num questions.append(question_data) except json.JSONDecodeError as e: logger.warning(f"Error parsing line {line_num}: {e}") @@ -251,7 +288,7 @@ def load_math_questions(self, math_data_dir: str) -> List[Dict[str, Any]]: for json_file in json_files: try: - with open(json_file, 'r') as f: + with open(json_file, "r") as f: problem_data = json.load(f) question_data = { "question": problem_data.get("problem", ""), @@ -267,7 +304,9 @@ def load_math_questions(self, math_data_dir: str) -> List[Dict[str, Any]]: logger.info(f"Loaded {len(questions)} MATH problems") return questions - def load_questions_by_dataset(self, dataset_name: str, dataset_path: str) -> List[Dict[str, Any]]: + def load_questions_by_dataset( + self, dataset_name: str, dataset_path: str + ) -> List[Dict[str, Any]]: """Select and load questions for the specified dataset. Currently supported datasets: @@ -282,22 +321,26 @@ def load_questions_by_dataset(self, dataset_name: str, dataset_path: str) -> Lis raise ValueError("dataset_name must be provided") name = dataset_name.strip().lower() - if name == 'gsm8k': + if name == "gsm8k": return self.load_gsm8k_questions(dataset_path) - if name == 'math': + if name == "math": return self.load_math_questions(dataset_path) - raise ValueError(f"Unsupported dataset '{dataset_name}'. Supported: ['gsm8k', 'math']") + raise ValueError( + f"Unsupported dataset '{dataset_name}'. Supported: ['gsm8k', 'math']" + ) def load_checkpoint(self, checkpoint_path: str) -> Tuple[List[Dict[str, Any]], int]: """Load existing checkpoint and return processed questions and last processed index.""" if not os.path.exists(checkpoint_path): - logger.info(f"No checkpoint found at {checkpoint_path}, starting from beginning") + logger.info( + f"No checkpoint found at {checkpoint_path}, starting from beginning" + ) return [], 0 logger.info(f"Loading checkpoint from {checkpoint_path}") try: - with open(checkpoint_path, 'r') as f: + with open(checkpoint_path, "r") as f: checkpoint_data = json.load(f) if isinstance(checkpoint_data, list): @@ -306,33 +349,46 @@ def load_checkpoint(self, checkpoint_path: str) -> Tuple[List[Dict[str, Any]], i last_index = len(processed_questions) else: # Structured checkpoint with metadata - processed_questions = checkpoint_data.get('categorized_questions', []) - last_index = checkpoint_data.get('last_processed_index', len(processed_questions)) + processed_questions = checkpoint_data.get("categorized_questions", []) + last_index = checkpoint_data.get( + "last_processed_index", len(processed_questions) + ) - logger.info(f"Loaded checkpoint with {len(processed_questions)} processed questions, resuming from index {last_index}") + logger.info( + f"Loaded checkpoint with {len(processed_questions)} processed questions, resuming from index {last_index}" + ) return processed_questions, last_index except Exception as e: logger.warning(f"Error loading checkpoint: {e}, starting from beginning") return [], 0 - def save_checkpoint(self, categorized_questions: List[Dict[str, Any]], checkpoint_path: str, last_index: int) -> None: + def save_checkpoint( + self, + categorized_questions: List[Dict[str, Any]], + checkpoint_path: str, + last_index: int, + ) -> None: """Save checkpoint with processed questions and metadata.""" - logger.info(f"Saving checkpoint with {len(categorized_questions)} questions to {checkpoint_path}") + logger.info( + f"Saving checkpoint with {len(categorized_questions)} questions to {checkpoint_path}" + ) checkpoint_data = { - 'categorized_questions': categorized_questions, - 'last_processed_index': last_index, - 'total_processed': len(categorized_questions) + "categorized_questions": categorized_questions, + "last_processed_index": last_index, + "total_processed": len(categorized_questions), } os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True) - with open(checkpoint_path, 'w') as f: + with open(checkpoint_path, "w") as f: json.dump(checkpoint_data, f, indent=2) - logger.info(f"Checkpoint saved successfully") + logger.info("Checkpoint saved successfully") - async def categorize_question_by_area(self, question: str, areas: List[AreaInfo], **kwargs: Any) -> str: + async def categorize_question_by_area( + self, question: str, areas: List[AreaInfo], **kwargs: Any + ) -> str: """Categorize a question into one of the available areas (returns exact area name).""" area_names = [area.name for area in areas] area_bullets = "\n".join([f"- {area.name}" for area in areas]) @@ -346,11 +402,15 @@ async def categorize_question_by_area(self, question: str, areas: List[AreaInfo] "seed": 42, } + if self.llm_model is None: + logger.warning("LLM model not initialized") + return "Unknown" + try: response, metadata = await self.llm_model.async_generate( sys_prompt=sys_prompt, user_prompt=user_prompt, - generation_config=generation_config + generation_config=generation_config, ) raw = (response or "").strip() # Print what model returned @@ -371,7 +431,9 @@ async def categorize_questions( **kwargs: Any, ) -> List[Dict[str, Any]]: """Perform two-step categorization of all questions with resume capability.""" - logger.info(f"Starting two-step categorization of {len(questions)} questions with resume capability...") + logger.info( + f"Starting two-step categorization of {len(questions)} questions with resume capability..." + ) # Load existing checkpoint categorized_questions, start_index = self.load_checkpoint(checkpoint_path) @@ -386,12 +448,14 @@ async def categorize_questions( question_data = questions[i] if i % 10 == 0: - logger.info(f"Processing question {i+1}/{len(questions)}") + logger.info(f"Processing question {i + 1}/{len(questions)}") question_text = question_data.get("question", "") # Step 1: Categorize by area - predicted_area = await self.categorize_question_by_area(question_text, self.areas, **kwargs) + predicted_area = await self.categorize_question_by_area( + question_text, self.areas, **kwargs + ) # Step 2: Find the exact area key and get capabilities within that area area_key = self._find_matching_area_key(predicted_area) @@ -400,49 +464,60 @@ async def categorize_questions( categorized_question = { **question_data, "categorized_area": area_key, - "processing_order": i + 1 + "processing_order": i + 1, } categorized_questions.append(categorized_question) - # Periodic checkpoint saving if save_every_n and (i + 1) % save_every_n == 0: try: - logger.info(f"Checkpoint: saving results at {i+1} questions to {checkpoint_path}") + logger.info( + f"Checkpoint: saving results at {i + 1} questions to {checkpoint_path}" + ) self.save_checkpoint(categorized_questions, checkpoint_path, i + 1) except Exception as e: - logger.warning(f"Failed to write checkpoint at {i+1}: {e}") + logger.warning(f"Failed to write checkpoint at {i + 1}: {e}") # Final checkpoint save if save_every_n: try: - logger.info(f"Final checkpoint: saving all {len(categorized_questions)} questions to {checkpoint_path}") - self.save_checkpoint(categorized_questions, checkpoint_path, len(questions)) + logger.info( + f"Final checkpoint: saving all {len(categorized_questions)} questions to {checkpoint_path}" + ) + self.save_checkpoint( + categorized_questions, checkpoint_path, len(questions) + ) except Exception as e: logger.warning(f"Failed to write final checkpoint: {e}") logger.info("Completed two-step categorization with resume capability") return categorized_questions - def save_categorized_questions(self, categorized_questions: List[Dict[str, Any]], output_path: str) -> None: + def save_categorized_questions( + self, categorized_questions: List[Dict[str, Any]], output_path: str + ) -> None: """Save categorized questions to JSON file.""" - logger.info(f"Saving {len(categorized_questions)} categorized questions to {output_path}") + logger.info( + f"Saving {len(categorized_questions)} categorized questions to {output_path}" + ) - with open(output_path, 'w') as f: + with open(output_path, "w") as f: json.dump(categorized_questions, f, indent=2) logger.info("Categorized questions saved successfully") - def print_categorization_summary(self, categorized_questions: List[Dict[str, Any]]) -> None: + def print_categorization_summary( + self, categorized_questions: List[Dict[str, Any]] + ) -> None: """Print summary of categorization results.""" - print("\n" + "="*80) - dataset_name = getattr(self.cfg.data_cfg, 'dataset_name', 'dataset') + print("\n" + "=" * 80) + dataset_name = getattr(self.cfg.data_cfg, "dataset_name", "dataset") print(f"{str(dataset_name).upper()} QUESTION CATEGORIZATION SUMMARY") - print("="*80) + print("=" * 80) # Count by area - area_counts = defaultdict(int) + area_counts: Dict[str, int] = defaultdict(int) for q in categorized_questions: area = q.get("categorized_area", "Unknown") @@ -450,8 +525,10 @@ def print_categorization_summary(self, categorized_questions: List[Dict[str, Any print(f"\nTotal questions categorized: {len(categorized_questions)}") - print(f"\nQuestions by Area:") - for area, count in sorted(area_counts.items(), key=lambda x: x[1], reverse=True): + print("\nQuestions by Area:") + for area, count in sorted( + area_counts.items(), key=lambda x: x[1], reverse=True + ): print(f" {area}: {count}") def run_categorization(self) -> None: @@ -462,27 +539,31 @@ def run_categorization(self) -> None: self.areas, self.capabilities_by_area = self.extract_areas_and_capabilities() # Determine dataset and load questions - dataset_name = getattr(self.cfg.data_cfg, 'dataset_name', 'gsm8k') - dataset_path = getattr(self.cfg.data_cfg, 'dataset_path', None) + dataset_name = getattr(self.cfg.data_cfg, "dataset_name", "gsm8k") + dataset_path = getattr(self.cfg.data_cfg, "dataset_path", None) if not dataset_path or not dataset_name: - raise ValueError("dataset_path and dataset_name must be provided in the config") + raise ValueError( + "dataset_path and dataset_name must be provided in the config" + ) questions = self.load_questions_by_dataset(dataset_name, dataset_path) # Prepare checkpoint and output paths checkpoint_path = os.path.join( self.cfg.output_cfg.results_dir, - f"{dataset_name}_categorization_checkpoint_{len(questions)}.json" + f"{dataset_name}_categorization_checkpoint_{len(questions)}.json", ) output_path = os.path.join( - self.cfg.output_cfg.results_dir, - self.cfg.output_cfg.output_filename + self.cfg.output_cfg.results_dir, self.cfg.output_cfg.output_filename ) os.makedirs(self.cfg.output_cfg.results_dir, exist_ok=True) - save_every_n = getattr(getattr(self.cfg, "processing_cfg", {}), "save_every_n", 100) + save_every_n = getattr( + getattr(self.cfg, "processing_cfg", {}), "save_every_n", 100 + ) # Perform categorization with resume import asyncio + categorized_questions = asyncio.run( self.categorize_questions( questions, @@ -521,4 +602,3 @@ def main(cfg: DictConfig) -> None: if __name__ == "__main__": main() - diff --git a/wikipedia/wiki_vs_generated.py b/wikipedia/wiki_vs_generated.py index dd4097b9..dd96e45a 100644 --- a/wikipedia/wiki_vs_generated.py +++ b/wikipedia/wiki_vs_generated.py @@ -4,11 +4,11 @@ and matches them to generated capabilities. """ +import glob import json import logging import os -import glob -from typing import Dict, List, Optional, Tuple +from typing import Dict, List import hydra from omegaconf import DictConfig @@ -16,8 +16,8 @@ from src.model import Model from wikipedia.prompts import ( SYSTEM_PROMPT_MATH_CAPABILITIES, - get_wikipedia_to_generated_prompt, get_generated_to_wikipedia_prompt, + get_wikipedia_to_generated_prompt, ) @@ -28,7 +28,8 @@ class WikipediaCapability: """Represents a Wikipedia capability.""" - def __init__(self, name, description, area): + + def __init__(self, name: str, description: str, area: str) -> None: self.name = name self.description = description self.area = area @@ -36,7 +37,8 @@ def __init__(self, name, description, area): class GeneratedCapability: """Represents a generated capability.""" - def __init__(self, name, description, area): + + def __init__(self, name: str, description: str, area: str) -> None: self.name = name self.description = description self.area = area @@ -48,23 +50,27 @@ class GeneratedVsWikipedia: def __init__(self, cfg: DictConfig): self.cfg = cfg self.model = Model( - model_name=cfg.llm_cfg.model_name, - model_provider=cfg.llm_cfg.model_provider + model_name=cfg.llm_cfg.model_name, model_provider=cfg.llm_cfg.model_provider + ) + self.results: Dict[str, str] = {} + self.match_direction: str = getattr( + getattr(cfg, "processing_cfg", {}), + "match_direction", + "generated_to_wikipedia", ) - self.results = {} - self.match_direction = getattr(getattr(cfg, 'processing_cfg', {}), 'match_direction', 'generated_to_wikipedia') def load_wikipedia_capabilities(self) -> Dict[str, List[WikipediaCapability]]: """ Load Wikipedia capabilities from the new JSON format in wikipedia/pages/ directory. - Returns: + Returns + ------- Dictionary mapping area names to lists of WikipediaCapability objects """ - capabilities_by_area = {} + capabilities_by_area: Dict[str, List[WikipediaCapability]] = {} # Path to the Wikipedia pages directory (from config) - wikipedia_pages_dir = getattr(self.cfg.data_cfg, 'wikipedia_pages_dir') + wikipedia_pages_dir = self.cfg.data_cfg.wikipedia_pages_dir if not os.path.exists(wikipedia_pages_dir): logger.error(f"Wikipedia pages directory not found: {wikipedia_pages_dir}") @@ -76,23 +82,21 @@ def load_wikipedia_capabilities(self) -> Dict[str, List[WikipediaCapability]]: for json_file in json_files: try: - with open(json_file, 'r', encoding='utf-8') as f: + with open(json_file, "r", encoding="utf-8") as f: data = json.load(f) # Extract capability information from the new format - cap_name = data.get('capability_name', '') - description = data.get('description', '') - area = data.get('area', 'Unknown') + cap_name = data.get("capability_name", "") + description = data.get("description", "") + area = data.get("area", "Unknown") if cap_name and description: - # Create capability object + # Create capability object capability = WikipediaCapability( - name=cap_name, - description=description, - area=area - ) + name=cap_name, description=description, area=area + ) - # Group by area + # Group by area if area not in capabilities_by_area: capabilities_by_area[area] = [] capabilities_by_area[area].append(capability) @@ -101,7 +105,8 @@ def load_wikipedia_capabilities(self) -> Dict[str, List[WikipediaCapability]]: logger.warning(f"Error loading {json_file}: {e}") continue - logger.info(f"Loaded {sum(len(caps) for caps in capabilities_by_area.values())} Wikipedia capabilities") + total_caps = sum(len(caps) for caps in capabilities_by_area.values()) + logger.info(f"Loaded {total_caps} Wikipedia capabilities") logger.info(f"Areas: {list(capabilities_by_area.keys())}") except Exception as e: @@ -109,32 +114,37 @@ def load_wikipedia_capabilities(self) -> Dict[str, List[WikipediaCapability]]: return capabilities_by_area - def load_generated_capabilities(self, generated_dir: str) -> List[GeneratedCapability]: + def load_generated_capabilities( + self, generated_dir: str + ) -> List[GeneratedCapability]: """ Load capabilities from generated directory structure. Args: generated_dir: Directory containing generated capabilities - Returns: + Returns + ------- List of GeneratedCapability objects """ capabilities = [] # Look for capability.json files in the directory structure - capability_files = glob.glob(os.path.join(generated_dir, "**/capability.json"), recursive=True) + capability_files = glob.glob( + os.path.join(generated_dir, "**/capability.json"), recursive=True + ) logger.info(f"Found {len(capability_files)} generated capability files") for file_path in capability_files: try: - with open(file_path, 'r', encoding='utf-8') as f: + with open(file_path, "r", encoding="utf-8") as f: cap_data = json.load(f) capability = GeneratedCapability( - name=cap_data.get('capability_name', ''), - description=cap_data.get('capability_description', ''), - area=cap_data.get('capability_area', 'mathematics') + name=cap_data.get("capability_name", ""), + description=cap_data.get("capability_description", ""), + area=cap_data.get("capability_area", "mathematics"), ) capabilities.append(capability) @@ -146,7 +156,11 @@ def load_generated_capabilities(self, generated_dir: str) -> List[GeneratedCapab logger.info(f"Successfully loaded {len(capabilities)} generated capabilities") return capabilities - def match_wikipedia_to_generated_capabilities(self, wikipedia_cap: WikipediaCapability, generated_caps: List[GeneratedCapability]) -> str: + def match_wikipedia_to_generated_capabilities( + self, + wikipedia_cap: WikipediaCapability, + generated_caps: List[GeneratedCapability], + ) -> str: """ Match Wikipedia capability to generated capabilities using batching. @@ -154,7 +168,8 @@ def match_wikipedia_to_generated_capabilities(self, wikipedia_cap: WikipediaCapa wikipedia_cap: Wikipedia capability generated_caps: List of generated capabilities in the matched area - Returns: + Returns + ------- Name of the matched capability or "none" if no match """ if not generated_caps: @@ -163,12 +178,16 @@ def match_wikipedia_to_generated_capabilities(self, wikipedia_cap: WikipediaCapa # Batch size to avoid context length issues batch_size = 20 - logger.info(f" Processing {len(generated_caps)} capabilities in batches of {batch_size}") + logger.info( + f" Processing {len(generated_caps)} capabilities in batches of {batch_size}" + ) # Process capabilities in batches for i in range(0, len(generated_caps), batch_size): - batch = generated_caps[i:i + batch_size] - logger.info(f" Processing batch {i//batch_size + 1}/{(len(generated_caps) + batch_size - 1)//batch_size}") + batch = generated_caps[i : i + batch_size] + logger.info( + f" Processing batch {i // batch_size + 1}/{(len(generated_caps) + batch_size - 1) // batch_size}" + ) result = self.match_wikipedia_to_generated_batch(wikipedia_cap, batch) @@ -178,10 +197,14 @@ def match_wikipedia_to_generated_capabilities(self, wikipedia_cap: WikipediaCapa return result # No match found in any batch - logger.info(f" No match found in any batch") + logger.info(" No match found in any batch") return "none" - def match_wikipedia_to_generated_batch(self, wikipedia_cap: WikipediaCapability, generated_caps_batch: List[GeneratedCapability]) -> str: + def match_wikipedia_to_generated_batch( + self, + wikipedia_cap: WikipediaCapability, + generated_caps_batch: List[GeneratedCapability], + ) -> str: """Match a single Wikipedia capability against a batch of generated capabilities. Returns the exact generated capability name if a match is found, otherwise "none". @@ -189,35 +212,35 @@ def match_wikipedia_to_generated_batch(self, wikipedia_cap: WikipediaCapability, if not generated_caps_batch: return "none" - capabilities_list = "\n".join([f"- {cap.name}: {cap.description}" for cap in generated_caps_batch]) + capabilities_list = "\n".join( + [f"- {cap.name}: {cap.description}" for cap in generated_caps_batch] + ) user_prompt = get_wikipedia_to_generated_prompt( - wikipedia_cap.name, - wikipedia_cap.description, - capabilities_list + wikipedia_cap.name, wikipedia_cap.description, capabilities_list ) try: response, metadata = self.model.generate( sys_prompt=SYSTEM_PROMPT_MATH_CAPABILITIES, user_prompt=user_prompt, - generation_config={ - "temperature": 0.0, - "max_tokens": 100 - } + generation_config={"temperature": 0.0, "max_tokens": 100}, ) - response = response.strip() + response_str: str = str(response).strip() capability_names = [cap.name for cap in generated_caps_batch] - if response in capability_names: - return response - else: - return "none" + if response_str in capability_names: + return response_str + return "none" except Exception as e: logger.error(f"Error matching Wikipedia capability to generated batch: {e}") return "none" - def match_generated_to_wikipedia_capabilities(self, generated_cap: GeneratedCapability, wikipedia_caps: List[WikipediaCapability]) -> str: + def match_generated_to_wikipedia_capabilities( + self, + generated_cap: GeneratedCapability, + wikipedia_caps: List[WikipediaCapability], + ) -> str: """ Match generated capability to Wikipedia capabilities using batching. @@ -225,7 +248,8 @@ def match_generated_to_wikipedia_capabilities(self, generated_cap: GeneratedCapa generated_cap: Generated capability wikipedia_caps: List of Wikipedia capabilities in the matched area - Returns: + Returns + ------- Name of the matched Wikipedia capability or "none" if no match """ if not wikipedia_caps: @@ -234,12 +258,16 @@ def match_generated_to_wikipedia_capabilities(self, generated_cap: GeneratedCapa # Batch size to avoid context length issues batch_size = 40 - logger.info(f" Processing {len(wikipedia_caps)} Wikipedia capabilities in batches of {batch_size}") + logger.info( + f" Processing {len(wikipedia_caps)} Wikipedia capabilities in batches of {batch_size}" + ) # Process capabilities in batches for i in range(0, len(wikipedia_caps), batch_size): - batch = wikipedia_caps[i:i + batch_size] - logger.info(f" Processing batch {i//batch_size + 1}/{(len(wikipedia_caps) + batch_size - 1)//batch_size}") + batch = wikipedia_caps[i : i + batch_size] + logger.info( + f" Processing batch {i // batch_size + 1}/{(len(wikipedia_caps) + batch_size - 1) // batch_size}" + ) result = self.match_generated_to_wikipedia_batch(generated_cap, batch) @@ -249,10 +277,14 @@ def match_generated_to_wikipedia_capabilities(self, generated_cap: GeneratedCapa return result # No match found in any batch - logger.info(f" No match found in any batch") + logger.info(" No match found in any batch") return "none" - def match_generated_to_wikipedia_batch(self, generated_cap: GeneratedCapability, wikipedia_caps_batch: List[WikipediaCapability]) -> str: + def match_generated_to_wikipedia_batch( + self, + generated_cap: GeneratedCapability, + wikipedia_caps_batch: List[WikipediaCapability], + ) -> str: """ Match generated capability to a batch of Wikipedia capabilities. @@ -260,46 +292,46 @@ def match_generated_to_wikipedia_batch(self, generated_cap: GeneratedCapability, generated_cap: Generated capability to match wikipedia_caps_batch: Batch of Wikipedia capabilities to match against - Returns: + Returns + ------- Name of the matched Wikipedia capability or "none" if no match """ if not wikipedia_caps_batch: return "none" - capabilities_list = "\n".join([f"- {cap.name}: {cap.description}" for cap in wikipedia_caps_batch]) + capabilities_list = "\n".join( + [f"- {cap.name}: {cap.description}" for cap in wikipedia_caps_batch] + ) user_prompt = get_generated_to_wikipedia_prompt( - generated_cap.name, - generated_cap.description, - capabilities_list + generated_cap.name, generated_cap.description, capabilities_list ) try: response, metadata = self.model.generate( sys_prompt=SYSTEM_PROMPT_MATH_CAPABILITIES, user_prompt=user_prompt, - generation_config={ - "temperature": 0.0, - "max_tokens": 100 - } + generation_config={"temperature": 0.0, "max_tokens": 100}, ) # Clean the response - response = response.strip() + response_str: str = str(response).strip() # Check if the response matches one of the available capabilities capability_names = [cap.name for cap in wikipedia_caps_batch] - if response in capability_names: - return response - else: - return "none" + if response_str in capability_names: + return response_str + return "none" except Exception as e: logger.error(f"Error matching artifact to Wikipedia batch: {e}") return "none" - def match_capabilities(self, generated_caps: List[GeneratedCapability], - categorized_wikipedia_caps: Dict[str, List[WikipediaCapability]]) -> Dict[str, str]: + def match_capabilities( + self, + generated_caps: List[GeneratedCapability], + categorized_wikipedia_caps: Dict[str, List[WikipediaCapability]], + ) -> Dict[str, str]: """Match capabilities based on configured direction. Returns a mapping: @@ -317,64 +349,98 @@ def match_capabilities(self, generated_caps: List[GeneratedCapability], for cap in generated_caps: generated_caps_by_area.setdefault(cap.area, []).append(cap) - if self.match_direction == 'generated_to_wikipedia': - logger.info(f"Starting two-step matching process (GENERATED -> WIKIPEDIA):") + if self.match_direction == "generated_to_wikipedia": + logger.info("Starting two-step matching process (GENERATED -> WIKIPEDIA):") logger.info(f" - {len(generated_caps)} generated capabilities") logger.info(f" - {len(all_wikipedia_caps)} Wikipedia capabilities") - logger.info(f" - {len(generated_caps_by_area)} generated areas: {list(generated_caps_by_area.keys())}") + logger.info( + f" - {len(generated_caps_by_area)} generated areas: {list(generated_caps_by_area.keys())}" + ) for i, generated_cap in enumerate(generated_caps): - logger.info(f"\nProcessing generated capability {i+1}/{len(generated_caps)}: {generated_cap.name}") + logger.info( + f"\nProcessing generated capability {i + 1}/{len(generated_caps)}: {generated_cap.name}" + ) logger.info(f" Generated area: {generated_cap.area}") wikipedia_area = generated_cap.area area_wikipedia_caps = categorized_wikipedia_caps.get(wikipedia_area, []) if not area_wikipedia_caps: - logger.info(f" - NO WIKIPEDIA CAPABILITIES in area '{wikipedia_area}'") + logger.info( + f" - NO WIKIPEDIA CAPABILITIES in area '{wikipedia_area}'" + ) results[generated_cap.name] = "none" continue - logger.info(f" + Found {len(area_wikipedia_caps)} Wikipedia capabilities in area '{wikipedia_area}'") - matched = self.match_generated_to_wikipedia_capabilities(generated_cap, area_wikipedia_caps) + logger.info( + f" + Found {len(area_wikipedia_caps)} Wikipedia capabilities in area '{wikipedia_area}'" + ) + matched = self.match_generated_to_wikipedia_capabilities( + generated_cap, area_wikipedia_caps + ) if matched == "none": - logger.info(f" - NO WIKIPEDIA MATCH: {generated_cap.name} in area '{wikipedia_area}'") + logger.info( + f" - NO WIKIPEDIA MATCH: {generated_cap.name} in area '{wikipedia_area}'" + ) else: - logger.info(f" + WIKIPEDIA MATCH: {generated_cap.name} -> {matched} (in area '{wikipedia_area}')") + logger.info( + f" + WIKIPEDIA MATCH: {generated_cap.name} -> {matched} (in area '{wikipedia_area}')" + ) results[generated_cap.name] = matched - elif self.match_direction == 'wikipedia_to_generated': - logger.info(f"Starting two-step matching process (WIKIPEDIA -> GENERATED):") + elif self.match_direction == "wikipedia_to_generated": + logger.info("Starting two-step matching process (WIKIPEDIA -> GENERATED):") logger.info(f" - {len(all_wikipedia_caps)} Wikipedia capabilities") logger.info(f" - {len(generated_caps)} generated capabilities") - logger.info(f" - {len(generated_caps_by_area)} generated areas: {list(generated_caps_by_area.keys())}") + logger.info( + f" - {len(generated_caps_by_area)} generated areas: {list(generated_caps_by_area.keys())}" + ) for i, wikipedia_cap in enumerate(all_wikipedia_caps): - logger.info(f"\nProcessing Wikipedia capability {i+1}/{len(all_wikipedia_caps)}: {wikipedia_cap.name}") + logger.info( + f"\nProcessing Wikipedia capability {i + 1}/{len(all_wikipedia_caps)}: {wikipedia_cap.name}" + ) logger.info(f" Wikipedia area: {wikipedia_cap.area}") generated_area = wikipedia_cap.area area_generated_caps = generated_caps_by_area.get(generated_area, []) if not area_generated_caps: - logger.info(f" - NO GENERATED CAPABILITIES in area '{generated_area}'") + logger.info( + f" - NO GENERATED CAPABILITIES in area '{generated_area}'" + ) results[wikipedia_cap.name] = "none" continue - logger.info(f" + Found {len(area_generated_caps)} generated capabilities in area '{generated_area}'") - matched = self.match_wikipedia_to_generated_capabilities(wikipedia_cap, area_generated_caps) + logger.info( + f" + Found {len(area_generated_caps)} generated capabilities in area '{generated_area}'" + ) + matched = self.match_wikipedia_to_generated_capabilities( + wikipedia_cap, area_generated_caps + ) if matched == "none": - logger.info(f" - NO GENERATED MATCH: {wikipedia_cap.name} in area '{generated_area}'") + logger.info( + f" - NO GENERATED MATCH: {wikipedia_cap.name} in area '{generated_area}'" + ) else: - logger.info(f" + GENERATED MATCH: {wikipedia_cap.name} -> {matched} (in area '{generated_area}')") + logger.info( + f" + GENERATED MATCH: {wikipedia_cap.name} -> {matched} (in area '{generated_area}')" + ) results[wikipedia_cap.name] = matched else: - raise ValueError("processing_cfg.match_direction must be 'generated_to_wikipedia' or 'wikipedia_to_generated'") + raise ValueError( + "processing_cfg.match_direction must be 'generated_to_wikipedia' or 'wikipedia_to_generated'" + ) return results - def save_results(self, results: Dict[str, str], output_path: str, - generated_caps: List[GeneratedCapability], - categorized_wikipedia_caps: Dict[str, List[WikipediaCapability]]) -> None: + def save_results( + self, + results: Dict[str, str], + output_path: str, + generated_caps: List[GeneratedCapability], + categorized_wikipedia_caps: Dict[str, List[WikipediaCapability]], + ) -> None: """ Save results to JSON file with detailed information. @@ -398,37 +464,34 @@ def save_results(self, results: Dict[str, str], output_path: str, "total_wikipedia_capabilities": len(all_wikipedia_caps), "categorized_wikipedia_areas": len(categorized_wikipedia_caps), "matched_capabilities": sum(1 for v in results.values() if v != "none"), - "unmatched_capabilities": sum(1 for v in results.values() if v == "none"), - "match_rate": sum(1 for v in results.values() if v != "none") / len(results) if results else 0, - "matching_direction": self.match_direction + "unmatched_capabilities": sum( + 1 for v in results.values() if v == "none" + ), + "match_rate": sum(1 for v in results.values() if v != "none") + / len(results) + if results + else 0, + "matching_direction": self.match_direction, }, "matching_results": results, "generated_capabilities": [ - { - "name": cap.name, - "description": cap.description, - "area": cap.area - } for cap in generated_caps + {"name": cap.name, "description": cap.description, "area": cap.area} + for cap in generated_caps ], "wikipedia_capabilities": [ - { - "name": cap.name, - "description": cap.description, - "area": cap.area - } for cap in all_wikipedia_caps + {"name": cap.name, "description": cap.description, "area": cap.area} + for cap in all_wikipedia_caps ], "wikipedia_capabilities_by_area": { area: [ - { - "name": cap.name, - "description": cap.description, - "area": cap.area - } for cap in caps - ] for area, caps in categorized_wikipedia_caps.items() - } + {"name": cap.name, "description": cap.description, "area": cap.area} + for cap in caps + ] + for area, caps in categorized_wikipedia_caps.items() + }, } - with open(output_path, 'w', encoding='utf-8') as f: + with open(output_path, "w", encoding="utf-8") as f: json.dump(detailed_results, f, indent=2, ensure_ascii=False) logger.info(f"Detailed results saved to: {output_path}") @@ -440,14 +503,14 @@ def print_results(self, results: Dict[str, str]) -> None: Args: results: Dictionary of matching results """ - print("\n" + "="*80) - if self.match_direction == 'generated_to_wikipedia': + print("\n" + "=" * 80) + if self.match_direction == "generated_to_wikipedia": print("GENERATED → WIKIPEDIA MATCHING RESULTS") else: print("WIKIPEDIA → GENERATED MATCHING RESULTS") - print("="*80) + print("=" * 80) - if self.match_direction == 'generated_to_wikipedia': + if self.match_direction == "generated_to_wikipedia": for generated_name, wikipedia_name in results.items(): if wikipedia_name == "none": print(f"[NO MATCH] {generated_name} -> NO MATCH") @@ -460,22 +523,18 @@ def print_results(self, results: Dict[str, str]) -> None: else: print(f"[MATCH] {wikipedia_name} -> {generated_name}") - print("="*80) - if self.match_direction == 'generated_to_wikipedia': + print("=" * 80) + if self.match_direction == "generated_to_wikipedia": print(f"Total generated capabilities: {len(results)}") else: print(f"Total Wikipedia capabilities: {len(results)}") matched_count = sum(1 for v in results.values() if v != "none") print(f"Matched capabilities: {matched_count}") print(f"Unmatched capabilities: {len(results) - matched_count}") - print("="*80) + print("=" * 80) -@hydra.main( - version_base=None, - config_path="cfg", - config_name="wiki_vs_generated" -) +@hydra.main(version_base=None, config_path="cfg", config_name="wiki_vs_generated") def main(cfg: DictConfig) -> None: """ Main function to run generated-Wikipedia matching (generated -> Wikipedia direction). @@ -483,7 +542,9 @@ def main(cfg: DictConfig) -> None: Args: cfg: Configuration for the matching process """ - logger.info("Starting Generated-Wikipedia Matcher V2 Fixed (Generated -> Wikipedia Version)") + logger.info( + "Starting Generated-Wikipedia Matcher V2 Fixed (Generated -> Wikipedia Version)" + ) # Initialize matcher matcher = GeneratedVsWikipedia(cfg) @@ -515,7 +576,9 @@ def main(cfg: DictConfig) -> None: name, ext = os.path.splitext(orig_filename) filename_with_suffix = f"{name}_generated_to_wikipedia{ext or '.json'}" output_path = os.path.join(cfg.output_cfg.results_dir, filename_with_suffix) - matcher.save_results(results, output_path, generated_caps, categorized_wikipedia_caps) + matcher.save_results( + results, output_path, generated_caps, categorized_wikipedia_caps + ) print(f"Output saved to: {output_path}") logger.info("Generated-Wikipedia matching completed!") diff --git a/wikipedia/wikipedia_scraper.py b/wikipedia/wikipedia_scraper.py index 6a037d74..b489878e 100644 --- a/wikipedia/wikipedia_scraper.py +++ b/wikipedia/wikipedia_scraper.py @@ -9,38 +9,49 @@ Source: https://en.wikipedia.org/wiki/Glossary_of_areas_of_mathematics """ +import json +import logging import os import re -import json -import requests -from bs4 import BeautifulSoup import time -import logging -from typing import List, Dict, Tuple, Optional -from urllib.parse import urljoin, urlparse +from typing import Dict, List, Optional, Tuple +from urllib.parse import urljoin + +import requests +from bs4 import BeautifulSoup, Tag + # Set up logging -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) logger = logging.getLogger(__name__) # Import GPT model functionality (assuming it's available in the project) try: import sys - sys.path.append(os.path.join(os.path.dirname(__file__), '..')) + + sys.path.append(os.path.join(os.path.dirname(__file__), "..")) from src.model import Model from wikipedia.prompts import ( SYSTEM_PROMPT_CAPABILITY_EVALUATION, SYSTEM_PROMPT_CATEGORIZATION, - get_capability_summary_prompt, get_capability_categorization_prompt, + get_capability_summary_prompt, ) + GPT_AVAILABLE = True except ImportError: logger.warning("GPT model not available. Will use fallback summarization.") GPT_AVAILABLE = False -def generate_summary_with_gpt(description: str, model: Model, cache_dir: str = None, capability_name: str = None) -> Tuple[str, bool]: +def generate_summary_with_gpt( + description: str, + model: Model, + cache_dir: Optional[str] = None, + capability_name: Optional[str] = None, +) -> Tuple[str, bool]: """ Generate a concise summary of a capability description using GPT. @@ -50,48 +61,51 @@ def generate_summary_with_gpt(description: str, model: Model, cache_dir: str = N cache_dir: Directory to cache summaries (optional) capability_name: Name of the capability for caching (optional) - Returns: + Returns + ------- A tuple of (summary, was_cached) """ # Try to load cached summary first if cache_dir and capability_name: os.makedirs(cache_dir, exist_ok=True) # Sanitize filename by replacing invalid characters - safe_name = "".join(c for c in capability_name if c.isalnum() or c in (' ', '-', '_')).rstrip() - safe_name = safe_name.replace(' ', '_') + safe_name = "".join( + c for c in capability_name if c.isalnum() or c in (" ", "-", "_") + ).rstrip() + safe_name = safe_name.replace(" ", "_") cache_file = os.path.join(cache_dir, f"summary_{safe_name}.txt") if os.path.exists(cache_file): try: - with open(cache_file, 'r', encoding='utf-8') as f: + with open(cache_file, "r", encoding="utf-8") as f: cached_summary = f.read().strip() logger.debug(f"Loaded cached summary for '{capability_name}'") return cached_summary, True except Exception as e: - logger.warning(f"Failed to load cached summary for '{capability_name}': {e}") + logger.warning( + f"Failed to load cached summary for '{capability_name}': {e}" + ) sys_prompt = SYSTEM_PROMPT_CAPABILITY_EVALUATION user_prompt = get_capability_summary_prompt(description) - generation_config = { - "temperature": 0.3, - "max_tokens": 200, - "seed": 42 - } + generation_config = {"temperature": 0.3, "max_tokens": 200, "seed": 42} try: summary, metadata = model.generate( sys_prompt=sys_prompt, user_prompt=user_prompt, - generation_config=generation_config + generation_config=generation_config, ) summary = summary.strip() - logger.debug(f"Generated summary for '{description[:50]}...' with {metadata['output_tokens']} tokens") + logger.debug( + f"Generated summary for '{description[:50]}...' with {metadata['output_tokens']} tokens" + ) # Cache the summary if cache_dir is provided if cache_dir and capability_name: try: - with open(cache_file, 'w', encoding='utf-8') as f: + with open(cache_file, "w", encoding="utf-8") as f: f.write(summary) logger.debug(f"Cached summary for '{capability_name}' to {cache_file}") except Exception as e: @@ -99,24 +113,35 @@ def generate_summary_with_gpt(description: str, model: Model, cache_dir: str = N return summary, False except Exception as e: - logger.warning(f"Failed to generate summary with GPT: {e}. Using fallback method.") + logger.warning( + f"Failed to generate summary with GPT: {e}. Using fallback method." + ) # Fallback to first sentence extraction - for end_char in ['.', '!', '?']: + for end_char in [".", "!", "?"]: if end_char in description: fallback_summary = description.split(end_char)[0] + end_char # Cache the fallback summary too if cache_dir and capability_name: try: - with open(cache_file, 'w', encoding='utf-8') as f: + with open(cache_file, "w", encoding="utf-8") as f: f.write(fallback_summary) - logger.debug(f"Cached fallback summary for '{capability_name}' to {cache_file}") + logger.debug( + f"Cached fallback summary for '{capability_name}' to {cache_file}" + ) except Exception as cache_e: - logger.warning(f"Failed to cache fallback summary for '{capability_name}': {cache_e}") + logger.warning( + f"Failed to cache fallback summary for '{capability_name}': {cache_e}" + ) return fallback_summary, False return description, False -def categorize_capability_with_gpt(description: str, model: Model, cache_dir: str = None, capability_name: str = None) -> Tuple[str, bool]: +def categorize_capability_with_gpt( + description: str, + model: Model, + cache_dir: Optional[str] = None, + capability_name: Optional[str] = None, +) -> Tuple[str, bool]: """ Categorize a capability description using GPT into one of the 10 mathematical areas. @@ -126,39 +151,42 @@ def categorize_capability_with_gpt(description: str, model: Model, cache_dir: st cache_dir: Directory to cache categorizations (optional) capability_name: Name of the capability for caching (optional) - Returns: + Returns + ------- A tuple of (category, was_cached) """ # Try to load cached categorization first if cache_dir and capability_name: os.makedirs(cache_dir, exist_ok=True) - safe_name = "".join(c for c in capability_name if c.isalnum() or c in (' ', '-', '_')).rstrip() - safe_name = safe_name.replace(' ', '_') + safe_name = "".join( + c for c in capability_name if c.isalnum() or c in (" ", "-", "_") + ).rstrip() + safe_name = safe_name.replace(" ", "_") cache_file = os.path.join(cache_dir, f"category_{safe_name}.txt") if os.path.exists(cache_file): try: - with open(cache_file, 'r', encoding='utf-8') as f: + with open(cache_file, "r", encoding="utf-8") as f: cached_category = f.read().strip() - logger.debug(f"Loaded cached category for '{capability_name}': {cached_category}") + logger.debug( + f"Loaded cached category for '{capability_name}': {cached_category}" + ) return cached_category, True except Exception as e: - logger.warning(f"Failed to load cached category for '{capability_name}': {e}") + logger.warning( + f"Failed to load cached category for '{capability_name}': {e}" + ) sys_prompt = SYSTEM_PROMPT_CATEGORIZATION user_prompt = get_capability_categorization_prompt(description) - generation_config = { - "temperature": 0.1, - "max_tokens": 50, - "seed": 42 - } + generation_config = {"temperature": 0.1, "max_tokens": 50, "seed": 42} try: category, metadata = model.generate( sys_prompt=sys_prompt, user_prompt=user_prompt, - generation_config=generation_config + generation_config=generation_config, ) category = category.strip() logger.debug(f"Generated category for '{description[:50]}...': {category}") @@ -166,7 +194,7 @@ def categorize_capability_with_gpt(description: str, model: Model, cache_dir: st # Cache the category if cache_dir is provided if cache_dir and capability_name: try: - with open(cache_file, 'w', encoding='utf-8') as f: + with open(cache_file, "w", encoding="utf-8") as f: f.write(category) logger.debug(f"Cached category for '{capability_name}' to {cache_file}") except Exception as e: @@ -174,23 +202,31 @@ def categorize_capability_with_gpt(description: str, model: Model, cache_dir: st return category, False except Exception as e: - logger.warning(f"Failed to generate category with GPT: {e}. Using fallback category.") + logger.warning( + f"Failed to generate category with GPT: {e}. Using fallback category." + ) # Fallback to default category fallback_category = "Algebra and Functions" if cache_dir and capability_name: try: - with open(cache_file, 'w', encoding='utf-8') as f: + with open(cache_file, "w", encoding="utf-8") as f: f.write(fallback_category) - logger.debug(f"Cached fallback category for '{capability_name}' to {cache_file}") + logger.debug( + f"Cached fallback category for '{capability_name}' to {cache_file}" + ) except Exception as cache_e: - logger.warning(f"Failed to cache fallback category for '{capability_name}': {cache_e}") + logger.warning( + f"Failed to cache fallback category for '{capability_name}': {cache_e}" + ) return fallback_category, False class WikipediaGlossaryScraper: """Scraper for Wikipedia glossary of areas of mathematics with categorization and summarization.""" - def __init__(self, base_url: str, output_dir: str, gpt_model: Model = None): + def __init__( + self, base_url: str, output_dir: str, gpt_model: Optional[Model] = None + ) -> None: """ Initialize the scraper. @@ -203,14 +239,16 @@ def __init__(self, base_url: str, output_dir: str, gpt_model: Model = None): self.output_dir = output_dir self.gpt_model = gpt_model self.session = requests.Session() - self.session.headers.update({ - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.5', - 'Accept-Encoding': 'gzip, deflate', - 'Connection': 'keep-alive', - 'Upgrade-Insecure-Requests': '1', - }) + self.session.headers.update( + { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.5", + "Accept-Encoding": "gzip, deflate", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + } + ) # Create output directory os.makedirs(self.output_dir, exist_ok=True) @@ -225,7 +263,8 @@ def get_page_content(self) -> BeautifulSoup: """ Fetch and parse the Wikipedia glossary page. - Returns: + Returns + ------- BeautifulSoup object of the page content """ try: @@ -233,7 +272,7 @@ def get_page_content(self) -> BeautifulSoup: response = self.session.get(self.base_url, timeout=30) response.raise_for_status() - soup = BeautifulSoup(response.content, 'html.parser') + soup = BeautifulSoup(response.content, "html.parser") logger.info("Successfully fetched and parsed the page") return soup @@ -248,7 +287,8 @@ def get_page_first_section(self, page_url: str) -> str: Args: page_url: URL of the individual Wikipedia page - Returns: + Returns + ------- First section text content """ try: @@ -256,45 +296,46 @@ def get_page_first_section(self, page_url: str) -> str: response = self.session.get(page_url, timeout=30) response.raise_for_status() - soup = BeautifulSoup(response.content, 'html.parser') + soup = BeautifulSoup(response.content, "html.parser") # Find the main content area - content_div = soup.find('div', {'class': 'mw-parser-output'}) - if not content_div: + content_div = soup.find("div", {"class": "mw-parser-output"}) + if not content_div or not isinstance(content_div, Tag): logger.warning(f"Could not find main content for {page_url}") return "" # Collect all consecutive elements before the first h2 as the intro section - intro_texts = [] + intro_texts: List[str] = [] for child in content_div.children: # Only consider tag elements - if not hasattr(child, 'name') or child.name is None: + if not isinstance(child, Tag): continue # Stop at the first h2 (start of second section) - if child.name == 'h2': + if child.name == "h2": break # Capture paragraphs and short intro divs (infobox/sidebar divs are skipped) - if child.name == 'p': - text = child.get_text(' ', strip=True) + if child.name == "p": + text = child.get_text(" ", strip=True) if text: intro_texts.append(text) - elif child.name in ('div',): + elif child.name in ("div",): # Some pages wrap first paragraphs in a div; extract contained paragraph texts - inner_paras = child.find_all('p', recursive=False) + inner_paras = child.find_all("p", recursive=False) for p in inner_paras: - text = p.get_text(' ', strip=True) + text = p.get_text(" ", strip=True) if text: intro_texts.append(text) if intro_texts: - description = ' '.join(intro_texts) + description = " ".join(intro_texts) # Normalize whitespace - description = ' '.join(description.split()) - logger.debug(f"Extracted first section from {page_url}: {description[:100]}...") + description = " ".join(description.split()) + logger.debug( + f"Extracted first section from {page_url}: {description[:100]}..." + ) return description - else: - logger.warning(f"No first section content found for {page_url}") - return "" + logger.warning(f"No first section content found for {page_url}") + return "" except Exception as e: logger.warning(f"Error fetching individual page {page_url}: {e}") @@ -307,51 +348,63 @@ def extract_glossary_entries(self, soup: BeautifulSoup) -> List[Dict[str, str]]: Args: soup: BeautifulSoup object of the page content - Returns: + Returns + ------- List of dictionaries containing name and description for each entry """ - entries = [] + entries: List[Dict[str, str]] = [] # Find the main content area - content_div = soup.find('div', {'class': 'mw-parser-output'}) - if not content_div: + content_div = soup.find("div", {"class": "mw-parser-output"}) + if not content_div or not isinstance(content_div, Tag): logger.error("Could not find main content div") return entries # Find all definition lists (dl elements) - dl_elements = content_div.find_all('dl') + dl_elements = content_div.find_all("dl") logger.info(f"Found {len(dl_elements)} definition lists") # Process each definition list for dl in dl_elements: + if not isinstance(dl, Tag): + continue # Find all definition terms (dt elements) in this list - dt_elements = dl.find_all('dt') + dt_elements = dl.find_all("dt") logger.info(f"Found {len(dt_elements)} definition terms in this list") # Process each definition term for dt in dt_elements: + if not isinstance(dt, Tag): + continue # Get the main link (first link) in this definition term # This should be the primary mathematical topic - main_link = dt.find('a', href=True) - - if main_link: - href = main_link.get('href', '') + main_link = dt.find("a", href=True) + + if main_link and isinstance(main_link, Tag): + href_attr = main_link.get("href", "") + # Ensure href is a string (can be list for multi-value attrs) + href: str = ( + href_attr[0] + if isinstance(href_attr, list) + else str(href_attr or "") + ) text = main_link.get_text(strip=True) # Skip if it's not a Wikipedia article link or if it's too short - if (href.startswith('/wiki/') and - not href.startswith('/wiki/File:') and - not href.startswith('/wiki/Template:') and - not href.startswith('/wiki/Category:') and - not href.startswith('/wiki/Help:') and - not href.startswith('/wiki/Special:') and - not href.startswith('/wiki/User:') and - not href.startswith('/wiki/Talk:') and - not href.startswith('/wiki/User_talk:') and - not href.startswith('/wiki/Wikipedia:') and - len(text) > 3 and - len(text) < 100): # Reasonable length for topic names - + if ( + href.startswith("/wiki/") + and not href.startswith("/wiki/File:") + and not href.startswith("/wiki/Template:") + and not href.startswith("/wiki/Category:") + and not href.startswith("/wiki/Help:") + and not href.startswith("/wiki/Special:") + and not href.startswith("/wiki/User:") + and not href.startswith("/wiki/Talk:") + and not href.startswith("/wiki/User_talk:") + and not href.startswith("/wiki/Wikipedia:") + and len(text) > 3 + and len(text) < 100 + ): # Reasonable length for topic names try: logger.info(f"Processing: {text}") @@ -359,15 +412,23 @@ def extract_glossary_entries(self, soup: BeautifulSoup) -> List[Dict[str, str]]: page_url = urljoin(self.base_url, href) description = self.get_page_first_section(page_url) - if description and len(description) > 50: # Ensure we have substantial content - entries.append({ - 'name': text, - 'description': description, - 'page_url': page_url - }) - logger.info(f"+ Successfully extracted description for '{text}'") + if ( + description and len(description) > 50 + ): # Ensure we have substantial content + entries.append( + { + "name": text, + "description": description, + "page_url": page_url, + } + ) + logger.info( + f"+ Successfully extracted description for '{text}'" + ) else: - logger.warning(f"- No substantial description found for '{text}'") + logger.warning( + f"- No substantial description found for '{text}'" + ) # Add a small delay to be respectful to Wikipedia time.sleep(0.5) @@ -376,7 +437,9 @@ def extract_glossary_entries(self, soup: BeautifulSoup) -> List[Dict[str, str]]: logger.warning(f"Error processing '{text}': {e}") continue - logger.info(f"Successfully extracted {len(entries)} mathematical topic descriptions") + logger.info( + f"Successfully extracted {len(entries)} mathematical topic descriptions" + ) return entries def clean_filename(self, name: str) -> str: @@ -386,13 +449,14 @@ def clean_filename(self, name: str) -> str: Args: name: The term name to clean - Returns: + Returns + ------- Cleaned filename """ # Replace special characters and spaces with underscores - filename = re.sub(r'[^\w\s-]', '', name) - filename = re.sub(r'[\s_-]+', '_', filename) - filename = filename.strip('_') + filename = re.sub(r"[^\w\s-]", "", name) + filename = re.sub(r"[\s_-]+", "_", filename) + filename = filename.strip("_") # Limit length if len(filename) > 100: @@ -407,11 +471,12 @@ def save_entry_to_file(self, entry: Dict[str, str]) -> bool: Args: entry: Dictionary containing name, description, summary, and area - Returns: + Returns + ------- True if successful, False otherwise """ try: - filename = self.clean_filename(entry['name']) + filename = self.clean_filename(entry["name"]) if not filename: logger.warning(f"Could not create filename for: {entry['name']}") return False @@ -421,16 +486,16 @@ def save_entry_to_file(self, entry: Dict[str, str]) -> bool: # Create the complete JSON structure json_data = { - "capability_name": entry['name'], - "description": entry['description'], - "summary": entry.get('summary', ''), - "area": entry.get('area', 'Unknown'), + "capability_name": entry["name"], + "description": entry["description"], + "summary": entry.get("summary", ""), + "area": entry.get("area", "Unknown"), "source": "Wikipedia Glossary of Areas of Mathematics", - "url": entry.get('page_url', self.base_url), - "timestamp": time.strftime("%Y-%m-%d %H:%M:%S") + "url": entry.get("page_url", self.base_url), + "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), } - with open(filepath, 'w', encoding='utf-8') as f: + with open(filepath, "w", encoding="utf-8") as f: json.dump(json_data, f, indent=2, ensure_ascii=False) logger.info(f"Saved: {filename}.json") @@ -444,7 +509,8 @@ def scrape_and_save(self) -> int: """ Main method to scrape the glossary, categorize, summarize, and save all entries. - Returns: + Returns + ------- Number of entries successfully saved """ try: @@ -458,7 +524,9 @@ def scrape_and_save(self) -> int: logger.error("No entries found to save") return 0 - logger.info(f"Processing {len(entries)} entries with categorization and summarization...") + logger.info( + f"Processing {len(entries)} entries with categorization and summarization..." + ) # Process each entry with categorization and summarization saved_count = 0 @@ -466,47 +534,47 @@ def scrape_and_save(self) -> int: category_stats = {"generated": 0, "cached": 0} for i, entry in enumerate(entries): - logger.info(f"Processing entry {i+1}/{len(entries)}: {entry['name']}") + logger.info(f"Processing entry {i + 1}/{len(entries)}: {entry['name']}") # Generate summary if GPT model is available if self.gpt_model: summary, summary_was_cached = generate_summary_with_gpt( - entry['description'], + entry["description"], self.gpt_model, self.summary_cache_dir, - entry['name'] + entry["name"], ) - entry['summary'] = summary + entry["summary"] = summary if summary_was_cached: summary_stats["cached"] += 1 else: summary_stats["generated"] += 1 else: # Fallback to first sentence - description = entry['description'].strip() + description = entry["description"].strip() summary = description - for end_char in ['.', '!', '?']: + for end_char in [".", "!", "?"]: if end_char in description: summary = description.split(end_char)[0] + end_char break - entry['summary'] = summary + entry["summary"] = summary # Categorize if GPT model is available if self.gpt_model: category, category_was_cached = categorize_capability_with_gpt( - entry['description'], + entry["description"], self.gpt_model, self.category_cache_dir, - entry['name'] + entry["name"], ) - entry['area'] = category + entry["area"] = category if category_was_cached: category_stats["cached"] += 1 else: category_stats["generated"] += 1 else: # Fallback to default category - entry['area'] = "Algebra and Functions" + entry["area"] = "Algebra and Functions" # Save the complete entry logger.info(f"Attempting to save entry: {entry['name']}") @@ -521,13 +589,19 @@ def scrape_and_save(self) -> int: # Log progress every 10 entries if (i + 1) % 10 == 0: - logger.info(f"Progress: {i+1}/{len(entries)} entries processed") + logger.info(f"Progress: {i + 1}/{len(entries)} entries processed") # Log final statistics - logger.info(f"Successfully saved {saved_count} out of {len(entries)} entries") + logger.info( + f"Successfully saved {saved_count} out of {len(entries)} entries" + ) if self.gpt_model: - logger.info(f"Summary statistics: {summary_stats['generated']} generated, {summary_stats['cached']} loaded from cache") - logger.info(f"Category statistics: {category_stats['generated']} generated, {category_stats['cached']} loaded from cache") + logger.info( + f"Summary statistics: {summary_stats['generated']} generated, {summary_stats['cached']} loaded from cache" + ) + logger.info( + f"Category statistics: {category_stats['generated']} generated, {category_stats['cached']} loaded from cache" + ) return saved_count @@ -536,15 +610,16 @@ def scrape_and_save(self) -> int: return 0 -def main(): +def main() -> int: """Main function to run the scraper.""" - # Configuration WIKIPEDIA_URL = "https://en.wikipedia.org/wiki/Glossary_of_areas_of_mathematics" # Save pages in the same directory as the script OUTPUT_DIR = os.path.join(os.path.dirname(__file__), "pages") - logger.info("Starting Wikipedia Glossary Scraper with Categorization and Summarization") + logger.info( + "Starting Wikipedia Glossary Scraper with Categorization and Summarization" + ) logger.info(f"Source URL: {WIKIPEDIA_URL}") logger.info(f"Output directory: {OUTPUT_DIR}") @@ -555,14 +630,20 @@ def main(): # You can configure the model here gpt_model = Model( model_name="gpt-3.5-turbo", # or "gpt-4", "o1-mini", etc. - model_provider="openai" + model_provider="openai", + ) + logger.info( + "[OK] GPT model initialized for categorization and summarization" ) - logger.info("[OK] GPT model initialized for categorization and summarization") except Exception as e: - logger.warning(f"Failed to initialize GPT model: {e}. Will use fallback methods.") + logger.warning( + f"Failed to initialize GPT model: {e}. Will use fallback methods." + ) gpt_model = None else: - logger.info("GPT model not available. Will use fallback methods for summarization and categorization.") + logger.info( + "GPT model not available. Will use fallback methods for summarization and categorization." + ) # Create scraper instance scraper = WikipediaGlossaryScraper(WIKIPEDIA_URL, OUTPUT_DIR, gpt_model) @@ -571,8 +652,12 @@ def main(): saved_count = scraper.scrape_and_save() if saved_count > 0: - logger.info(f"[OK] Scraping completed successfully! Saved {saved_count} JSON entries.") - logger.info(f"Each entry contains: capability_name, description, summary, area, source, url, timestamp") + logger.info( + f"[OK] Scraping completed successfully! Saved {saved_count} JSON entries." + ) + logger.info( + "Each entry contains: capability_name, description, summary, area, source, url, timestamp" + ) else: logger.error("[FAIL] No entries were saved. Please check the logs for errors.") @@ -580,4 +665,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() From 38f3f15e91768182219124cf34dbfccf610e56ac Mon Sep 17 00:00:00 2001 From: Farnaz Kohankhaki Date: Thu, 29 Jan 2026 11:28:54 -0800 Subject: [PATCH 2/4] fix ruff linting issues in wikipedia files - Shorten long docstrings and comments (W505) - Use imperative mood in docstrings (D401) - Add period to module docstring first line (D400) - Use lowercase variable names in functions (N806) - Remove .keys() from dict iteration (SIM118) - Fix module docstring formatting (D205, D404) --- src/run_embedding_eval.py | 2 +- wikipedia/static_vs_generated.py | 22 +++++++++++----------- wikipedia/wiki_vs_generated.py | 25 +++++++++++-------------- wikipedia/wikipedia_scraper.py | 30 ++++++++++++++---------------- 4 files changed, 37 insertions(+), 42 deletions(-) diff --git a/src/run_embedding_eval.py b/src/run_embedding_eval.py index 10cb9d9f..42d8b460 100644 --- a/src/run_embedding_eval.py +++ b/src/run_embedding_eval.py @@ -77,7 +77,7 @@ def main(cfg: DictConfig) -> None: rmse_dict = defaultdict(list) avg_std_dict = defaultdict(list) for dim_reduction_method in ["t-sne", "pca"]: - for rep_string_order in ["n", "nd", "and"]: + for rep_string_order in ["n", "nd", "and"]: # typos: ignore # Embed capabilities using openai embedding model generate_and_set_capabilities_embeddings( capabilities=capabilities, diff --git a/wikipedia/static_vs_generated.py b/wikipedia/static_vs_generated.py index 5b806b16..24e6884b 100644 --- a/wikipedia/static_vs_generated.py +++ b/wikipedia/static_vs_generated.py @@ -40,7 +40,7 @@ class AreaInfo: class DatasetQuestionCategorizer: - """Class to categorize questions from selected dataset using two-step LLM approach.""" + """Categorize questions from selected dataset using two-step LLM approach.""" def __init__(self, cfg: DictConfig) -> None: self.cfg = cfg @@ -62,13 +62,13 @@ def __init__(self, cfg: DictConfig) -> None: def extract_areas_and_capabilities_from_generated( self, generated_dir: str ) -> Tuple[List[AreaInfo], Dict[str, List[CapabilityInfo]]]: - """Extract all areas and capabilities from the generated capabilities directory.""" + """Extract areas and capabilities from the generated capabilities directory.""" logger.info("Extracting areas and capabilities from generated capabilities...") areas: List[AreaInfo] = [] capabilities_by_area: Dict[str, List[CapabilityInfo]] = {} - # Get all capability directories (handle nested structure like math//) + # Get all capability directories (nested structure like math//) capability_dirs = glob.glob(os.path.join(generated_dir, "*/")) print(f"Found {len(capability_dirs)} capability directories") @@ -112,7 +112,7 @@ def extract_areas_and_capabilities_from_generated( def extract_areas_and_capabilities_from_wikipedia( self, wikipedia_dir: str ) -> Tuple[List[AreaInfo], Dict[str, List[CapabilityInfo]]]: - """Extract all areas and capabilities from the Wikipedia pages directory containing individual JSON files.""" + """Extract areas and capabilities from Wikipedia pages directory.""" logger.info( f"Extracting areas and capabilities from Wikipedia pages directory: {wikipedia_dir}" ) @@ -210,12 +210,12 @@ def _find_matching_area_key(self, predicted_area: str) -> str: # Try normalized match area_key: str - for area_key in self.capabilities_by_area.keys(): + for area_key in self.capabilities_by_area: if self._normalize_text(area_key) == predicted_normalized: return area_key # Try partial match (contains) - for area_key in self.capabilities_by_area.keys(): + for area_key in self.capabilities_by_area: if ( predicted_normalized in self._normalize_text(area_key) or self._normalize_text(area_key) in predicted_normalized @@ -279,7 +279,7 @@ def load_gsm8k_questions(self, jsonl_path: str) -> List[Dict[str, Any]]: return questions def load_math_questions(self, math_data_dir: str) -> List[Dict[str, Any]]: - """Load MATH dataset questions from a directory containing JSON files (recursive).""" + """Load MATH dataset questions from directory containing JSON files.""" logger.info(f"Loading MATH questions from {math_data_dir}...") questions: List[Dict[str, Any]] = [] @@ -315,7 +315,7 @@ def load_questions_by_dataset( Args: dataset_name: The logical name of the dataset (e.g., "gsm8k"). - dataset_path: The path to the dataset file/directory as required by the dataset loader. + dataset_path: Path to dataset file/directory for the loader. """ if not dataset_name: raise ValueError("dataset_name must be provided") @@ -331,7 +331,7 @@ def load_questions_by_dataset( ) def load_checkpoint(self, checkpoint_path: str) -> Tuple[List[Dict[str, Any]], int]: - """Load existing checkpoint and return processed questions and last processed index.""" + """Load checkpoint and return processed questions and last index.""" if not os.path.exists(checkpoint_path): logger.info( f"No checkpoint found at {checkpoint_path}, starting from beginning" @@ -389,7 +389,7 @@ def save_checkpoint( async def categorize_question_by_area( self, question: str, areas: List[AreaInfo], **kwargs: Any ) -> str: - """Categorize a question into one of the available areas (returns exact area name).""" + """Categorize a question into one of the available areas.""" area_names = [area.name for area in areas] area_bullets = "\n".join([f"- {area.name}" for area in areas]) @@ -589,7 +589,7 @@ def run_categorization(self) -> None: config_name="static_vs_generated", ) def main(cfg: DictConfig) -> None: - """Main function to run question categorization.""" + """Run question categorization.""" # Set up logging logging.basicConfig(level=logging.INFO) diff --git a/wikipedia/wiki_vs_generated.py b/wikipedia/wiki_vs_generated.py index dd96e45a..b77badb7 100644 --- a/wikipedia/wiki_vs_generated.py +++ b/wikipedia/wiki_vs_generated.py @@ -1,7 +1,6 @@ -""" -This script matches Wikipedia capabilities with generated capabilities using the pre-categorized -Wikipedia data. It loads the categorized Wikipedia capabilities -and matches them to generated capabilities. +"""Match Wikipedia capabilities with generated capabilities. + +Uses pre-categorized Wikipedia data to load and match capabilities. """ import glob @@ -45,7 +44,7 @@ def __init__(self, name: str, description: str, area: str) -> None: class GeneratedVsWikipedia: - """Matches Wikipedia capabilities with generated capabilities using batching (reversed version).""" + """Match Wikipedia capabilities with generated capabilities using batching.""" def __init__(self, cfg: DictConfig): self.cfg = cfg @@ -60,8 +59,7 @@ def __init__(self, cfg: DictConfig): ) def load_wikipedia_capabilities(self) -> Dict[str, List[WikipediaCapability]]: - """ - Load Wikipedia capabilities from the new JSON format in wikipedia/pages/ directory. + """Load Wikipedia capabilities from JSON format in wikipedia/pages/. Returns ------- @@ -205,9 +203,9 @@ def match_wikipedia_to_generated_batch( wikipedia_cap: WikipediaCapability, generated_caps_batch: List[GeneratedCapability], ) -> str: - """Match a single Wikipedia capability against a batch of generated capabilities. + """Match a Wikipedia capability against a batch of generated capabilities. - Returns the exact generated capability name if a match is found, otherwise "none". + Returns generated capability name if matched, otherwise "none". """ if not generated_caps_batch: return "none" @@ -335,8 +333,8 @@ def match_capabilities( """Match capabilities based on configured direction. Returns a mapping: - - generated_to_wikipedia: {generated_capability_name -> wikipedia_capability_name} - - wikipedia_to_generated: {wikipedia_capability_name -> generated_capability_name} + - generated_to_wikipedia: {generated_name -> wikipedia_name} + - wikipedia_to_generated: {wikipedia_name -> generated_name} """ results: Dict[str, str] = {} @@ -454,7 +452,7 @@ def save_results( # Flatten Wikipedia capabilities for output all_wikipedia_caps = [] - for area, caps in categorized_wikipedia_caps.items(): + for _area, caps in categorized_wikipedia_caps.items(): all_wikipedia_caps.extend(caps) # Create detailed results with metadata @@ -536,8 +534,7 @@ def print_results(self, results: Dict[str, str]) -> None: @hydra.main(version_base=None, config_path="cfg", config_name="wiki_vs_generated") def main(cfg: DictConfig) -> None: - """ - Main function to run generated-Wikipedia matching (generated -> Wikipedia direction). + """Run generated-Wikipedia capability matching. Args: cfg: Configuration for the matching process diff --git a/wikipedia/wikipedia_scraper.py b/wikipedia/wikipedia_scraper.py index b489878e..9eecc865 100644 --- a/wikipedia/wikipedia_scraper.py +++ b/wikipedia/wikipedia_scraper.py @@ -1,10 +1,9 @@ #!/usr/bin/env python3 -""" -Wikipedia Glossary Scraper with Categorization and Summary Generation +"""Wikipedia Glossary Scraper with Categorization and Summary Generation. -This script scrapes the Wikipedia "Glossary of areas of mathematics" page, -categorizes each mathematical area, generates summaries using GPT, -and saves everything as JSON files with complete information. +Scrape the Wikipedia "Glossary of areas of mathematics" page, +categorize each mathematical area, generate summaries using GPT, +and save everything as JSON files with complete information. Source: https://en.wikipedia.org/wiki/Glossary_of_areas_of_mathematics """ @@ -222,7 +221,7 @@ def categorize_capability_with_gpt( class WikipediaGlossaryScraper: - """Scraper for Wikipedia glossary of areas of mathematics with categorization and summarization.""" + """Scrape Wikipedia glossary of areas of mathematics.""" def __init__( self, base_url: str, output_dir: str, gpt_model: Optional[Model] = None @@ -313,13 +312,13 @@ def get_page_first_section(self, page_url: str) -> str: # Stop at the first h2 (start of second section) if child.name == "h2": break - # Capture paragraphs and short intro divs (infobox/sidebar divs are skipped) + # Capture paragraphs and intro divs (skip infobox/sidebar) if child.name == "p": text = child.get_text(" ", strip=True) if text: intro_texts.append(text) elif child.name in ("div",): - # Some pages wrap first paragraphs in a div; extract contained paragraph texts + # Some pages wrap first paragraphs in a div inner_paras = child.find_all("p", recursive=False) for p in inner_paras: text = p.get_text(" ", strip=True) @@ -506,8 +505,7 @@ def save_entry_to_file(self, entry: Dict[str, str]) -> bool: return False def scrape_and_save(self) -> int: - """ - Main method to scrape the glossary, categorize, summarize, and save all entries. + """Scrape glossary, categorize, summarize, and save all entries. Returns ------- @@ -611,17 +609,17 @@ def scrape_and_save(self) -> int: def main() -> int: - """Main function to run the scraper.""" + """Run the scraper.""" # Configuration - WIKIPEDIA_URL = "https://en.wikipedia.org/wiki/Glossary_of_areas_of_mathematics" + wikipedia_url = "https://en.wikipedia.org/wiki/Glossary_of_areas_of_mathematics" # Save pages in the same directory as the script - OUTPUT_DIR = os.path.join(os.path.dirname(__file__), "pages") + output_dir = os.path.join(os.path.dirname(__file__), "pages") logger.info( "Starting Wikipedia Glossary Scraper with Categorization and Summarization" ) - logger.info(f"Source URL: {WIKIPEDIA_URL}") - logger.info(f"Output directory: {OUTPUT_DIR}") + logger.info(f"Source URL: {wikipedia_url}") + logger.info(f"Output directory: {output_dir}") # Initialize GPT model if available gpt_model = None @@ -646,7 +644,7 @@ def main() -> int: ) # Create scraper instance - scraper = WikipediaGlossaryScraper(WIKIPEDIA_URL, OUTPUT_DIR, gpt_model) + scraper = WikipediaGlossaryScraper(wikipedia_url, output_dir, gpt_model) # Run the scraper saved_count = scraper.scrape_and_save() From b67e121cb1fb741120f672f2bfc07ad0489489a2 Mon Sep 17 00:00:00 2001 From: Farnaz Kohankhaki Date: Thu, 29 Jan 2026 11:36:45 -0800 Subject: [PATCH 3/4] Remove typos checker from pre-commit hooks False positives like 'nd' require ongoing config maintenance. --- .pre-commit-config.yaml | 6 ------ src/run_embedding_eval.py | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f929f809..a81f7f7a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,12 +33,6 @@ repos: types: [python] exclude: "tests" - - repo: https://github.com/crate-ci/typos - rev: dictgen-v0.3.1 - hooks: - - id: typos - args: [] - - repo: https://github.com/nbQA-dev/nbQA rev: 1.9.1 hooks: diff --git a/src/run_embedding_eval.py b/src/run_embedding_eval.py index 42d8b460..10cb9d9f 100644 --- a/src/run_embedding_eval.py +++ b/src/run_embedding_eval.py @@ -77,7 +77,7 @@ def main(cfg: DictConfig) -> None: rmse_dict = defaultdict(list) avg_std_dict = defaultdict(list) for dim_reduction_method in ["t-sne", "pca"]: - for rep_string_order in ["n", "nd", "and"]: # typos: ignore + for rep_string_order in ["n", "nd", "and"]: # Embed capabilities using openai embedding model generate_and_set_capabilities_embeddings( capabilities=capabilities, From bc9cda2490c413bbbca0659b06c3fca6708434c8 Mon Sep 17 00:00:00 2001 From: Farnaz Kohankhaki Date: Thu, 29 Jan 2026 11:56:12 -0800 Subject: [PATCH 4/4] Fix trailing whitespace and end-of-file formatting --- example_scripts/README.md | 8 ++++---- wikipedia/cfg/static_vs_generated.yaml | 5 ++--- wikipedia/cfg/wiki_vs_generated.yaml | 2 +- wikipedia/prompts.py | 10 ++++++---- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/example_scripts/README.md b/example_scripts/README.md index 5951ffa4..a6b81bb8 100644 --- a/example_scripts/README.md +++ b/example_scripts/README.md @@ -41,7 +41,7 @@ Step 2: Sort and keep complete capabilities. Complete capabilities have enough v Step 3: Generate capability embeddings using openai model, and assign embeddings to each capability object. -```python +```python # Embed capabilities using openai embedding model generate_and_set_capabilities_embeddings( capabilities=capabilities, @@ -61,7 +61,7 @@ Step 4: Filter capabilities based on the embeddings such that if embeddings are ) ``` -Step 5: Capability embedding dimensionality reduction. +Step 5: Capability embedding dimensionality reduction. ```python # Reduce the dimensionality of capability embeddings generated by the @@ -136,6 +136,6 @@ Step 9: Visualize train and test capability embeddings together. The `generate_and_set_capabilities_embeddings()` function in `src/utils/embedding_utils.py` handles this process. Capability name and descriptions are extracted to form the representation string `rep_string`. Then, embeddings are generated using the OpenAI embedding model via `embedding_generator`. Finally, the embeddings are assigned to each capability object. The representation string was chosen based on visualization-based experiments and is defined as: -```python +```python rep_string = f"{capability_dict['name']} - {capability.area}: {capability_dict['description']}" -``` \ No newline at end of file +``` diff --git a/wikipedia/cfg/static_vs_generated.yaml b/wikipedia/cfg/static_vs_generated.yaml index 5a8ce1ec..1d73748a 100644 --- a/wikipedia/cfg/static_vs_generated.yaml +++ b/wikipedia/cfg/static_vs_generated.yaml @@ -2,13 +2,13 @@ data_cfg: # Path to the generated capabilities directory containing capabilities generated_dir: /projects/DeepLesion/projects/automated_capability_evaluation/artifacts/capabilities_gpt-claude-math/math - + # Dataset selection # Supported dataset_name values: "gsm8k", "math" dataset_name: gsm8k # For gsm8k: path to combined JSONL; For math: root directory with JSON files (recursive) dataset_path: /projects/DeepLesion/projects/automated_capability_evaluation/static_datasets/math/gsm8k-main/test.jsonl - + # Path to the existing Wikipedia categorization results file (not used in generated mode) wikipedia_dir: /projects/DeepLesion/projects/automated_capability_evaluation/wikipedia/pages @@ -37,4 +37,3 @@ processing_cfg: defaults: - _self_ - diff --git a/wikipedia/cfg/wiki_vs_generated.yaml b/wikipedia/cfg/wiki_vs_generated.yaml index d2357272..5b91319f 100644 --- a/wikipedia/cfg/wiki_vs_generated.yaml +++ b/wikipedia/cfg/wiki_vs_generated.yaml @@ -4,7 +4,7 @@ data_cfg: # Path to the Wikipedia pages directory containing .json files wikipedia_pages_dir: /projects/DeepLesion/projects/automated_capability_evaluation/wikipedia/pages - + # Path to the generated capabilities directory containing capability.json files generated_dir: /projects/DeepLesion/projects/automated_capability_evaluation/artifacts/capabilities_gpt-claude-math/math diff --git a/wikipedia/prompts.py b/wikipedia/prompts.py index 8557186c..e90ab33e 100644 --- a/wikipedia/prompts.py +++ b/wikipedia/prompts.py @@ -1,6 +1,5 @@ """Centralized prompts for all Wikipedia-related scripts.""" - # System prompts SYSTEM_PROMPT_MATH_CAPABILITIES = "You are an expert in mathematical capabilities." SYSTEM_PROMPT_MATH_TAXONOMIST = ( @@ -42,7 +41,9 @@ # User prompts - functions that generate user prompts -def get_wikipedia_to_generated_prompt(wikipedia_cap_name: str, wikipedia_cap_description: str, capabilities_list: str) -> str: +def get_wikipedia_to_generated_prompt( + wikipedia_cap_name: str, wikipedia_cap_description: str, capabilities_list: str +) -> str: """Generate prompt for matching Wikipedia capability to generated capabilities.""" return f"""You are an expert in mathematical capabilities. Determine which generated capability best matches the given Wikipedia capability. @@ -65,7 +66,9 @@ def get_wikipedia_to_generated_prompt(wikipedia_cap_name: str, wikipedia_cap_des Answer with only the capability name or "none":""" -def get_generated_to_wikipedia_prompt(generated_cap_name: str, generated_cap_description: str, capabilities_list: str) -> str: +def get_generated_to_wikipedia_prompt( + generated_cap_name: str, generated_cap_description: str, capabilities_list: str +) -> str: """Generate prompt for matching generated capability to Wikipedia capabilities.""" return f"""You are an expert in mathematical capabilities. Find the Wikipedia capability that most closely matches the generated capability. @@ -112,4 +115,3 @@ def get_capability_summary_prompt(description: str) -> str: def get_capability_categorization_prompt(description: str) -> str: """Generate prompt for categorizing a mathematical capability.""" return f"Please categorize this mathematical concept into one of the 10 areas listed above:\n\n{description}" -