From 47ce9ce333ce5f31750f4c58c5fba0a1b9112f4f Mon Sep 17 00:00:00 2001
From: Farnaz Kohankhaki <fkohankh8@gmail.com>
Date: Thu, 29 Jan 2026 11:04:19 -0800
Subject: [PATCH 1/4] fix: resolve mypy type errors in wikipedia files

- Add Optional types for nullable parameters
- Add type annotations for class attributes and local variables
- Fix BeautifulSoup Tag type narrowing with isinstance checks
- Fix generator type issues in sum() calls by hoisting them into local variables
- Add None checks for llm_model before async calls
- Apply formatter cleanups (double quotes, line wrapping, import ordering)
---
 wikipedia/static_vs_generated.py | 260 +++++++++++++--------
 wikipedia/wiki_vs_generated.py   | 323 +++++++++++++++-----------
 wikipedia/wikipedia_scraper.py   | 379 +++++++++++++++++++------------
 3 files changed, 595 insertions(+), 367 deletions(-)

diff --git a/wikipedia/static_vs_generated.py b/wikipedia/static_vs_generated.py
index 8440d61d..5b806b16 100644
--- a/wikipedia/static_vs_generated.py
+++ b/wikipedia/static_vs_generated.py
@@ -1,12 +1,12 @@
 """Area categorization system for static datasets using LLM with resume capability."""
 
+import glob
 import json
 import logging
 import os
-import glob
-from typing import List, Dict, Any, Tuple
-from dataclasses import dataclass
 from collections import defaultdict
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple
 
 import hydra
 from omegaconf import DictConfig
@@ -17,12 +17,14 @@
     get_area_categorization_prompt,
 )
 
+
 logger = logging.getLogger(__name__)
 
 
 @dataclass
 class CapabilityInfo:
     """Data class to hold capability information."""
+
     name: str
     description: str
     area: str
@@ -32,6 +34,7 @@ class CapabilityInfo:
 @dataclass
 class AreaInfo:
     """Data class to hold area information."""
+
     name: str
     capabilities: List[CapabilityInfo]
 
@@ -39,29 +42,31 @@ class AreaInfo:
 class DatasetQuestionCategorizer:
     """Class to categorize questions from selected dataset using two-step LLM approach."""
 
-    def __init__(self, cfg: DictConfig):
+    def __init__(self, cfg: DictConfig) -> None:
         self.cfg = cfg
-        self.areas = []
-        self.capabilities_by_area = {}
-        self.llm_model = None
+        self.areas: List[AreaInfo] = []
+        self.capabilities_by_area: Dict[str, List[CapabilityInfo]] = {}
+        self.llm_model: Optional[Model] = None
 
         # Initialize LLM model
         try:
             self.llm_model = Model(
                 model_name=cfg.llm_cfg.model_name,
-                model_provider=cfg.llm_cfg.model_provider
+                model_provider=cfg.llm_cfg.model_provider,
             )
             logger.info(f"Initialized LLM model: {cfg.llm_cfg.model_name}")
         except Exception as e:
             logger.error(f"Failed to initialize LLM model: {e}")
             raise e
 
-    def extract_areas_and_capabilities_from_generated(self, generated_dir: str) -> Tuple[List[AreaInfo], Dict[str, List[CapabilityInfo]]]:
+    def extract_areas_and_capabilities_from_generated(
+        self, generated_dir: str
+    ) -> Tuple[List[AreaInfo], Dict[str, List[CapabilityInfo]]]:
         """Extract all areas and capabilities from the generated capabilities directory."""
         logger.info("Extracting areas and capabilities from generated capabilities...")
 
-        areas = []
-        capabilities_by_area = {}
+        areas: List[AreaInfo] = []
+        capabilities_by_area: Dict[str, List[CapabilityInfo]] = {}
 
         # Get all capability directories (handle nested structure like math/<capability_name>/)
         capability_dirs = glob.glob(os.path.join(generated_dir, "*/"))
@@ -71,14 +76,16 @@ def extract_areas_and_capabilities_from_generated(self, generated_dir: str) -> T
             capability_json_path = os.path.join(cap_dir, "capability.json")
             if os.path.exists(capability_json_path):
                 try:
-                    with open(capability_json_path, 'r') as f:
+                    with open(capability_json_path, "r") as f:
                         cap_data = json.load(f)
 
-                    print(f"Loaded capability data: {cap_data.get('capability_name', ''), cap_data.get('capability_description', ''), cap_data.get('capability_area', 'Unknown')}")
+                    print(
+                        f"Loaded capability data: {cap_data.get('capability_name', ''), cap_data.get('capability_description', ''), cap_data.get('capability_area', 'Unknown')}"
+                    )
                     capability = CapabilityInfo(
-                        name=cap_data.get('capability_name', ''),
-                        description=cap_data.get('capability_description', ''),
-                        area=cap_data.get('capability_area', 'Unknown')
+                        name=cap_data.get("capability_name", ""),
+                        description=cap_data.get("capability_description", ""),
+                        area=cap_data.get("capability_area", "Unknown"),
                     )
 
                     # Group capabilities by area
@@ -87,22 +94,31 @@ def extract_areas_and_capabilities_from_generated(self, generated_dir: str) -> T
                     capabilities_by_area[capability.area].append(capability)
 
                 except Exception as e:
-                    logger.warning(f"Error loading capability from {capability_json_path}: {e}")
+                    logger.warning(
+                        f"Error loading capability from {capability_json_path}: {e}"
+                    )
 
         # Create area objects
         for area_name, capabilities in capabilities_by_area.items():
             area = AreaInfo(name=area_name, capabilities=capabilities)
             areas.append(area)
 
-        logger.info(f"Extracted {len(areas)} areas with {sum(len(caps) for caps in capabilities_by_area.values())} total capabilities")
+        total_caps = sum(len(caps) for caps in capabilities_by_area.values())
+        logger.info(
+            f"Extracted {len(areas)} areas with {total_caps} total capabilities"
+        )
         return areas, capabilities_by_area
 
-    def extract_areas_and_capabilities_from_wikipedia(self, wikipedia_dir: str) -> Tuple[List[AreaInfo], Dict[str, List[CapabilityInfo]]]:
+    def extract_areas_and_capabilities_from_wikipedia(
+        self, wikipedia_dir: str
+    ) -> Tuple[List[AreaInfo], Dict[str, List[CapabilityInfo]]]:
         """Extract all areas and capabilities from the Wikipedia pages directory containing individual JSON files."""
-        logger.info(f"Extracting areas and capabilities from Wikipedia pages directory: {wikipedia_dir}")
+        logger.info(
+            f"Extracting areas and capabilities from Wikipedia pages directory: {wikipedia_dir}"
+        )
 
-        areas = []
-        capabilities_by_area = {}
+        areas: List[AreaInfo] = []
+        capabilities_by_area: Dict[str, List[CapabilityInfo]] = {}
 
         try:
             # Get all JSON files in the Wikipedia pages directory
@@ -111,20 +127,20 @@ def extract_areas_and_capabilities_from_wikipedia(self, wikipedia_dir: str) -> T
 
             for json_file in json_files:
                 try:
-                    with open(json_file, 'r', encoding='utf-8') as f:
+                    with open(json_file, "r", encoding="utf-8") as f:
                         wikipedia_data = json.load(f)
 
                     # Extract capability information from the individual JSON file
-                    capability_name = wikipedia_data.get('capability_name', '')
-                    description = wikipedia_data.get('description', '')
-                    area = wikipedia_data.get('area', 'Unknown')
+                    capability_name = wikipedia_data.get("capability_name", "")
+                    description = wikipedia_data.get("description", "")
+                    area = wikipedia_data.get("area", "Unknown")
 
                     if capability_name and description:
                         capability = CapabilityInfo(
                             name=capability_name,
                             description=description,
                             area=area,
-                            domain='math'
+                            domain="math",
                         )
 
                         # Group capabilities by area
@@ -133,7 +149,9 @@ def extract_areas_and_capabilities_from_wikipedia(self, wikipedia_dir: str) -> T
                         capabilities_by_area[area].append(capability)
 
                 except Exception as e:
-                    logger.warning(f"Error loading Wikipedia capability from {json_file}: {e}")
+                    logger.warning(
+                        f"Error loading Wikipedia capability from {json_file}: {e}"
+                    )
                     continue
 
             # Create area objects
@@ -141,30 +159,42 @@ def extract_areas_and_capabilities_from_wikipedia(self, wikipedia_dir: str) -> T
                 if capabilities:  # Only create areas that have capabilities
                     area = AreaInfo(name=area_name, capabilities=capabilities)
                     areas.append(area)
-                    logger.info(f"Loaded area '{area_name}' with {len(capabilities)} capabilities")
+                    logger.info(
+                        f"Loaded area '{area_name}' with {len(capabilities)} capabilities"
+                    )
 
-            logger.info(f"Extracted {len(areas)} areas with {sum(len(caps) for caps in capabilities_by_area.values())} total capabilities from Wikipedia pages")
+            total_caps = sum(len(caps) for caps in capabilities_by_area.values())
+            logger.info(
+                f"Extracted {len(areas)} areas with {total_caps} total capabilities from Wikipedia pages"
+            )
 
         except Exception as e:
-            logger.error(f"Error loading Wikipedia capabilities from {wikipedia_dir}: {e}")
+            logger.error(
+                f"Error loading Wikipedia capabilities from {wikipedia_dir}: {e}"
+            )
             raise e
 
         return areas, capabilities_by_area
 
-    def extract_areas_and_capabilities(self, generated_dir: str = None, wikipedia_dir: str = None) -> Tuple[List[AreaInfo], Dict[str, List[CapabilityInfo]]]:
+    def extract_areas_and_capabilities(
+        self, generated_dir: Optional[str] = None, wikipedia_dir: Optional[str] = None
+    ) -> Tuple[List[AreaInfo], Dict[str, List[CapabilityInfo]]]:
         """Extract areas and capabilities using the configured method."""
-        extraction_method = getattr(self.cfg, 'categorization_cfg', {}).get('extraction_method', 'generated')
+        extraction_method = getattr(self.cfg, "categorization_cfg", {}).get(
+            "extraction_method", "generated"
+        )
 
-        if extraction_method == 'wikipedia':
+        if extraction_method == "wikipedia":
             if not wikipedia_dir:
                 wikipedia_dir = self.cfg.data_cfg.wikipedia_dir
             return self.extract_areas_and_capabilities_from_wikipedia(wikipedia_dir)
-        elif extraction_method == 'generated':
+        if extraction_method == "generated":
             if not generated_dir:
                 generated_dir = self.cfg.data_cfg.generated_dir
             return self.extract_areas_and_capabilities_from_generated(generated_dir)
-        else:
-            raise ValueError(f"Unknown extraction method: {extraction_method}. Must be 'generated' or 'wikipedia'")
+        raise ValueError(
+            f"Unknown extraction method: {extraction_method}. Must be 'generated' or 'wikipedia'"
+        )
 
     @staticmethod
     def _normalize_text(text: str) -> str:
@@ -179,17 +209,23 @@ def _find_matching_area_key(self, predicted_area: str) -> str:
             return predicted_area
 
         # Try normalized match
+        area_key: str
         for area_key in self.capabilities_by_area.keys():
             if self._normalize_text(area_key) == predicted_normalized:
                 return area_key
 
         # Try partial match (contains)
         for area_key in self.capabilities_by_area.keys():
-            if predicted_normalized in self._normalize_text(area_key) or self._normalize_text(area_key) in predicted_normalized:
+            if (
+                predicted_normalized in self._normalize_text(area_key)
+                or self._normalize_text(area_key) in predicted_normalized
+            ):
                 return area_key
 
         # If no match found, return the original prediction
-        logger.warning(f"No matching area key found for '{predicted_area}'. Available keys: {list(self.capabilities_by_area.keys())}")
+        logger.warning(
+            f"No matching area key found for '{predicted_area}'. Available keys: {list(self.capabilities_by_area.keys())}"
+        )
         return predicted_area
 
     @classmethod
@@ -216,24 +252,25 @@ def _select_best_match(cls, response_text: str, allowed_names: List[str]) -> str
 
         # 2. Substring match (contains or starts with)
         for norm_name, original_name in normalized_names.items():
-            if (normalized_response.startswith(norm_name) or
-                norm_name in normalized_response):
+            if (
+                normalized_response.startswith(norm_name)
+                or norm_name in normalized_response
+            ):
                 return original_name
 
         # 3. No match found
         return "Unknown"
 
-
     def load_gsm8k_questions(self, jsonl_path: str) -> List[Dict[str, Any]]:
         """Load GSM8K questions from JSONL file."""
         logger.info(f"Loading GSM8K questions from {jsonl_path}...")
 
         questions = []
-        with open(jsonl_path, 'r') as f:
+        with open(jsonl_path, "r") as f:
             for line_num, line in enumerate(f, 1):
                 try:
                     question_data = json.loads(line.strip())
-                    question_data['line_number'] = line_num
+                    question_data["line_number"] = line_num
                     questions.append(question_data)
                 except json.JSONDecodeError as e:
                     logger.warning(f"Error parsing line {line_num}: {e}")
@@ -251,7 +288,7 @@ def load_math_questions(self, math_data_dir: str) -> List[Dict[str, Any]]:
 
         for json_file in json_files:
             try:
-                with open(json_file, 'r') as f:
+                with open(json_file, "r") as f:
                     problem_data = json.load(f)
                 question_data = {
                     "question": problem_data.get("problem", ""),
@@ -267,7 +304,9 @@ def load_math_questions(self, math_data_dir: str) -> List[Dict[str, Any]]:
         logger.info(f"Loaded {len(questions)} MATH problems")
         return questions
 
-    def load_questions_by_dataset(self, dataset_name: str, dataset_path: str) -> List[Dict[str, Any]]:
+    def load_questions_by_dataset(
+        self, dataset_name: str, dataset_path: str
+    ) -> List[Dict[str, Any]]:
         """Select and load questions for the specified dataset.
 
         Currently supported datasets:
@@ -282,22 +321,26 @@ def load_questions_by_dataset(self, dataset_name: str, dataset_path: str) -> Lis
             raise ValueError("dataset_name must be provided")
         name = dataset_name.strip().lower()
 
-        if name == 'gsm8k':
+        if name == "gsm8k":
             return self.load_gsm8k_questions(dataset_path)
-        if name == 'math':
+        if name == "math":
             return self.load_math_questions(dataset_path)
 
-        raise ValueError(f"Unsupported dataset '{dataset_name}'. Supported: ['gsm8k', 'math']")
+        raise ValueError(
+            f"Unsupported dataset '{dataset_name}'. Supported: ['gsm8k', 'math']"
+        )
 
     def load_checkpoint(self, checkpoint_path: str) -> Tuple[List[Dict[str, Any]], int]:
         """Load existing checkpoint and return processed questions and last processed index."""
         if not os.path.exists(checkpoint_path):
-            logger.info(f"No checkpoint found at {checkpoint_path}, starting from beginning")
+            logger.info(
+                f"No checkpoint found at {checkpoint_path}, starting from beginning"
+            )
             return [], 0
 
         logger.info(f"Loading checkpoint from {checkpoint_path}")
         try:
-            with open(checkpoint_path, 'r') as f:
+            with open(checkpoint_path, "r") as f:
                 checkpoint_data = json.load(f)
 
             if isinstance(checkpoint_data, list):
@@ -306,33 +349,46 @@ def load_checkpoint(self, checkpoint_path: str) -> Tuple[List[Dict[str, Any]], i
                 last_index = len(processed_questions)
             else:
                 # Structured checkpoint with metadata
-                processed_questions = checkpoint_data.get('categorized_questions', [])
-                last_index = checkpoint_data.get('last_processed_index', len(processed_questions))
+                processed_questions = checkpoint_data.get("categorized_questions", [])
+                last_index = checkpoint_data.get(
+                    "last_processed_index", len(processed_questions)
+                )
 
-            logger.info(f"Loaded checkpoint with {len(processed_questions)} processed questions, resuming from index {last_index}")
+            logger.info(
+                f"Loaded checkpoint with {len(processed_questions)} processed questions, resuming from index {last_index}"
+            )
             return processed_questions, last_index
 
         except Exception as e:
             logger.warning(f"Error loading checkpoint: {e}, starting from beginning")
             return [], 0
 
-    def save_checkpoint(self, categorized_questions: List[Dict[str, Any]], checkpoint_path: str, last_index: int) -> None:
+    def save_checkpoint(
+        self,
+        categorized_questions: List[Dict[str, Any]],
+        checkpoint_path: str,
+        last_index: int,
+    ) -> None:
         """Save checkpoint with processed questions and metadata."""
-        logger.info(f"Saving checkpoint with {len(categorized_questions)} questions to {checkpoint_path}")
+        logger.info(
+            f"Saving checkpoint with {len(categorized_questions)} questions to {checkpoint_path}"
+        )
 
         checkpoint_data = {
-            'categorized_questions': categorized_questions,
-            'last_processed_index': last_index,
-            'total_processed': len(categorized_questions)
+            "categorized_questions": categorized_questions,
+            "last_processed_index": last_index,
+            "total_processed": len(categorized_questions),
         }
 
         os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
-        with open(checkpoint_path, 'w') as f:
+        with open(checkpoint_path, "w") as f:
             json.dump(checkpoint_data, f, indent=2)
 
-        logger.info(f"Checkpoint saved successfully")
+        logger.info("Checkpoint saved successfully")
 
-    async def categorize_question_by_area(self, question: str, areas: List[AreaInfo], **kwargs: Any) -> str:
+    async def categorize_question_by_area(
+        self, question: str, areas: List[AreaInfo], **kwargs: Any
+    ) -> str:
         """Categorize a question into one of the available areas (returns exact area name)."""
         area_names = [area.name for area in areas]
         area_bullets = "\n".join([f"- {area.name}" for area in areas])
@@ -346,11 +402,15 @@ async def categorize_question_by_area(self, question: str, areas: List[AreaInfo]
             "seed": 42,
         }
 
+        if self.llm_model is None:
+            logger.warning("LLM model not initialized")
+            return "Unknown"
+
         try:
             response, metadata = await self.llm_model.async_generate(
                 sys_prompt=sys_prompt,
                 user_prompt=user_prompt,
-                generation_config=generation_config
+                generation_config=generation_config,
             )
             raw = (response or "").strip()
             # Print what model returned
@@ -371,7 +431,9 @@ async def categorize_questions(
         **kwargs: Any,
     ) -> List[Dict[str, Any]]:
         """Perform two-step categorization of all questions with resume capability."""
-        logger.info(f"Starting two-step categorization of {len(questions)} questions with resume capability...")
+        logger.info(
+            f"Starting two-step categorization of {len(questions)} questions with resume capability..."
+        )
 
         # Load existing checkpoint
         categorized_questions, start_index = self.load_checkpoint(checkpoint_path)
@@ -386,12 +448,14 @@ async def categorize_questions(
             question_data = questions[i]
 
             if i % 10 == 0:
-                logger.info(f"Processing question {i+1}/{len(questions)}")
+                logger.info(f"Processing question {i + 1}/{len(questions)}")
 
             question_text = question_data.get("question", "")
 
             # Step 1: Categorize by area
-            predicted_area = await self.categorize_question_by_area(question_text, self.areas, **kwargs)
+            predicted_area = await self.categorize_question_by_area(
+                question_text, self.areas, **kwargs
+            )
 
             # Step 2: Find the exact area key and get capabilities within that area
             area_key = self._find_matching_area_key(predicted_area)
@@ -400,49 +464,60 @@ async def categorize_questions(
             categorized_question = {
                 **question_data,
                 "categorized_area": area_key,
-                "processing_order": i + 1
+                "processing_order": i + 1,
             }
 
             categorized_questions.append(categorized_question)
 
-
             # Periodic checkpoint saving
             if save_every_n and (i + 1) % save_every_n == 0:
                 try:
-                    logger.info(f"Checkpoint: saving results at {i+1} questions to {checkpoint_path}")
+                    logger.info(
+                        f"Checkpoint: saving results at {i + 1} questions to {checkpoint_path}"
+                    )
                     self.save_checkpoint(categorized_questions, checkpoint_path, i + 1)
                 except Exception as e:
-                    logger.warning(f"Failed to write checkpoint at {i+1}: {e}")
+                    logger.warning(f"Failed to write checkpoint at {i + 1}: {e}")
 
         # Final checkpoint save
         if save_every_n:
             try:
-                logger.info(f"Final checkpoint: saving all {len(categorized_questions)} questions to {checkpoint_path}")
-                self.save_checkpoint(categorized_questions, checkpoint_path, len(questions))
+                logger.info(
+                    f"Final checkpoint: saving all {len(categorized_questions)} questions to {checkpoint_path}"
+                )
+                self.save_checkpoint(
+                    categorized_questions, checkpoint_path, len(questions)
+                )
             except Exception as e:
                 logger.warning(f"Failed to write final checkpoint: {e}")
 
         logger.info("Completed two-step categorization with resume capability")
         return categorized_questions
 
-    def save_categorized_questions(self, categorized_questions: List[Dict[str, Any]], output_path: str) -> None:
+    def save_categorized_questions(
+        self, categorized_questions: List[Dict[str, Any]], output_path: str
+    ) -> None:
         """Save categorized questions to JSON file."""
-        logger.info(f"Saving {len(categorized_questions)} categorized questions to {output_path}")
+        logger.info(
+            f"Saving {len(categorized_questions)} categorized questions to {output_path}"
+        )
 
-        with open(output_path, 'w') as f:
+        with open(output_path, "w") as f:
             json.dump(categorized_questions, f, indent=2)
 
         logger.info("Categorized questions saved successfully")
 
-    def print_categorization_summary(self, categorized_questions: List[Dict[str, Any]]) -> None:
+    def print_categorization_summary(
+        self, categorized_questions: List[Dict[str, Any]]
+    ) -> None:
         """Print summary of categorization results."""
-        print("\n" + "="*80)
-        dataset_name = getattr(self.cfg.data_cfg, 'dataset_name', 'dataset')
+        print("\n" + "=" * 80)
+        dataset_name = getattr(self.cfg.data_cfg, "dataset_name", "dataset")
         print(f"{str(dataset_name).upper()} QUESTION CATEGORIZATION SUMMARY")
-        print("="*80)
+        print("=" * 80)
 
         # Count by area
-        area_counts = defaultdict(int)
+        area_counts: Dict[str, int] = defaultdict(int)
 
         for q in categorized_questions:
             area = q.get("categorized_area", "Unknown")
@@ -450,8 +525,10 @@ def print_categorization_summary(self, categorized_questions: List[Dict[str, Any
 
         print(f"\nTotal questions categorized: {len(categorized_questions)}")
 
-        print(f"\nQuestions by Area:")
-        for area, count in sorted(area_counts.items(), key=lambda x: x[1], reverse=True):
+        print("\nQuestions by Area:")
+        for area, count in sorted(
+            area_counts.items(), key=lambda x: x[1], reverse=True
+        ):
             print(f"  {area}: {count}")
 
     def run_categorization(self) -> None:
@@ -462,27 +539,31 @@ def run_categorization(self) -> None:
         self.areas, self.capabilities_by_area = self.extract_areas_and_capabilities()
 
         # Determine dataset and load questions
-        dataset_name = getattr(self.cfg.data_cfg, 'dataset_name', 'gsm8k')
-        dataset_path = getattr(self.cfg.data_cfg, 'dataset_path', None)
+        dataset_name = getattr(self.cfg.data_cfg, "dataset_name", "gsm8k")
+        dataset_path = getattr(self.cfg.data_cfg, "dataset_path", None)
 
         if not dataset_path or not dataset_name:
-            raise ValueError("dataset_path and dataset_name must be provided in the config")
+            raise ValueError(
+                "dataset_path and dataset_name must be provided in the config"
+            )
         questions = self.load_questions_by_dataset(dataset_name, dataset_path)
 
         # Prepare checkpoint and output paths
         checkpoint_path = os.path.join(
             self.cfg.output_cfg.results_dir,
-            f"{dataset_name}_categorization_checkpoint_{len(questions)}.json"
+            f"{dataset_name}_categorization_checkpoint_{len(questions)}.json",
         )
         output_path = os.path.join(
-            self.cfg.output_cfg.results_dir,
-            self.cfg.output_cfg.output_filename
+            self.cfg.output_cfg.results_dir, self.cfg.output_cfg.output_filename
         )
         os.makedirs(self.cfg.output_cfg.results_dir, exist_ok=True)
-        save_every_n = getattr(getattr(self.cfg, "processing_cfg", {}), "save_every_n", 100)
+        save_every_n = getattr(
+            getattr(self.cfg, "processing_cfg", {}), "save_every_n", 100
+        )
 
         # Perform categorization with resume
         import asyncio
+
         categorized_questions = asyncio.run(
             self.categorize_questions(
                 questions,
@@ -521,4 +602,3 @@ def main(cfg: DictConfig) -> None:
 
 if __name__ == "__main__":
     main()
-
diff --git a/wikipedia/wiki_vs_generated.py b/wikipedia/wiki_vs_generated.py
index dd4097b9..dd96e45a 100644
--- a/wikipedia/wiki_vs_generated.py
+++ b/wikipedia/wiki_vs_generated.py
@@ -4,11 +4,11 @@
 and matches them to generated capabilities.
 """
 
+import glob
 import json
 import logging
 import os
-import glob
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List
 
 import hydra
 from omegaconf import DictConfig
@@ -16,8 +16,8 @@
 from src.model import Model
 from wikipedia.prompts import (
     SYSTEM_PROMPT_MATH_CAPABILITIES,
-    get_wikipedia_to_generated_prompt,
     get_generated_to_wikipedia_prompt,
+    get_wikipedia_to_generated_prompt,
 )
 
 
@@ -28,7 +28,8 @@
 
 class WikipediaCapability:
     """Represents a Wikipedia capability."""
-    def __init__(self, name, description, area):
+
+    def __init__(self, name: str, description: str, area: str) -> None:
         self.name = name
         self.description = description
         self.area = area
@@ -36,7 +37,8 @@ def __init__(self, name, description, area):
 
 class GeneratedCapability:
     """Represents a generated capability."""
-    def __init__(self, name, description, area):
+
+    def __init__(self, name: str, description: str, area: str) -> None:
         self.name = name
         self.description = description
         self.area = area
@@ -48,23 +50,27 @@ class GeneratedVsWikipedia:
     def __init__(self, cfg: DictConfig):
         self.cfg = cfg
         self.model = Model(
-            model_name=cfg.llm_cfg.model_name,
-            model_provider=cfg.llm_cfg.model_provider
+            model_name=cfg.llm_cfg.model_name, model_provider=cfg.llm_cfg.model_provider
+        )
+        self.results: Dict[str, str] = {}
+        self.match_direction: str = getattr(
+            getattr(cfg, "processing_cfg", {}),
+            "match_direction",
+            "generated_to_wikipedia",
         )
-        self.results = {}
-        self.match_direction = getattr(getattr(cfg, 'processing_cfg', {}), 'match_direction', 'generated_to_wikipedia')
 
     def load_wikipedia_capabilities(self) -> Dict[str, List[WikipediaCapability]]:
         """
         Load Wikipedia capabilities from the new JSON format in wikipedia/pages/ directory.
 
-        Returns:
+        Returns
+        -------
             Dictionary mapping area names to lists of WikipediaCapability objects
         """
-        capabilities_by_area = {}
+        capabilities_by_area: Dict[str, List[WikipediaCapability]] = {}
 
         # Path to the Wikipedia pages directory (from config)
-        wikipedia_pages_dir = getattr(self.cfg.data_cfg, 'wikipedia_pages_dir')
+        wikipedia_pages_dir = self.cfg.data_cfg.wikipedia_pages_dir
 
         if not os.path.exists(wikipedia_pages_dir):
             logger.error(f"Wikipedia pages directory not found: {wikipedia_pages_dir}")
@@ -76,23 +82,21 @@ def load_wikipedia_capabilities(self) -> Dict[str, List[WikipediaCapability]]:
 
             for json_file in json_files:
                 try:
-                    with open(json_file, 'r', encoding='utf-8') as f:
+                    with open(json_file, "r", encoding="utf-8") as f:
                         data = json.load(f)
 
                     # Extract capability information from the new format
-                    cap_name = data.get('capability_name', '')
-                    description = data.get('description', '')
-                    area = data.get('area', 'Unknown')
+                    cap_name = data.get("capability_name", "")
+                    description = data.get("description", "")
+                    area = data.get("area", "Unknown")
 
                     if cap_name and description:
-                # Create capability object
+                        # Create capability object
                         capability = WikipediaCapability(
-                    name=cap_name,
-                    description=description,
-                            area=area
-                )
+                            name=cap_name, description=description, area=area
+                        )
 
-                # Group by area
+                        # Group by area
                         if area not in capabilities_by_area:
                             capabilities_by_area[area] = []
                         capabilities_by_area[area].append(capability)
@@ -101,7 +105,8 @@ def load_wikipedia_capabilities(self) -> Dict[str, List[WikipediaCapability]]:
                     logger.warning(f"Error loading {json_file}: {e}")
                     continue
 
-            logger.info(f"Loaded {sum(len(caps) for caps in capabilities_by_area.values())} Wikipedia capabilities")
+            total_caps = sum(len(caps) for caps in capabilities_by_area.values())
+            logger.info(f"Loaded {total_caps} Wikipedia capabilities")
             logger.info(f"Areas: {list(capabilities_by_area.keys())}")
 
         except Exception as e:
@@ -109,32 +114,37 @@ def load_wikipedia_capabilities(self) -> Dict[str, List[WikipediaCapability]]:
 
         return capabilities_by_area
 
-    def load_generated_capabilities(self, generated_dir: str) -> List[GeneratedCapability]:
+    def load_generated_capabilities(
+        self, generated_dir: str
+    ) -> List[GeneratedCapability]:
         """
         Load capabilities from generated directory structure.
 
         Args:
             generated_dir: Directory containing generated capabilities
 
-        Returns:
+        Returns
+        -------
             List of GeneratedCapability objects
         """
         capabilities = []
 
         # Look for capability.json files in the directory structure
-        capability_files = glob.glob(os.path.join(generated_dir, "**/capability.json"), recursive=True)
+        capability_files = glob.glob(
+            os.path.join(generated_dir, "**/capability.json"), recursive=True
+        )
 
         logger.info(f"Found {len(capability_files)} generated capability files")
 
         for file_path in capability_files:
             try:
-                with open(file_path, 'r', encoding='utf-8') as f:
+                with open(file_path, "r", encoding="utf-8") as f:
                     cap_data = json.load(f)
 
                 capability = GeneratedCapability(
-                    name=cap_data.get('capability_name', ''),
-                    description=cap_data.get('capability_description', ''),
-                    area=cap_data.get('capability_area', 'mathematics')
+                    name=cap_data.get("capability_name", ""),
+                    description=cap_data.get("capability_description", ""),
+                    area=cap_data.get("capability_area", "mathematics"),
                 )
 
                 capabilities.append(capability)
@@ -146,7 +156,11 @@ def load_generated_capabilities(self, generated_dir: str) -> List[GeneratedCapab
         logger.info(f"Successfully loaded {len(capabilities)} generated capabilities")
         return capabilities
 
-    def match_wikipedia_to_generated_capabilities(self, wikipedia_cap: WikipediaCapability, generated_caps: List[GeneratedCapability]) -> str:
+    def match_wikipedia_to_generated_capabilities(
+        self,
+        wikipedia_cap: WikipediaCapability,
+        generated_caps: List[GeneratedCapability],
+    ) -> str:
         """
         Match Wikipedia capability to generated capabilities using batching.
 
@@ -154,7 +168,8 @@ def match_wikipedia_to_generated_capabilities(self, wikipedia_cap: WikipediaCapa
             wikipedia_cap: Wikipedia capability
             generated_caps: List of generated capabilities in the matched area
 
-        Returns:
+        Returns
+        -------
             Name of the matched capability or "none" if no match
         """
         if not generated_caps:
@@ -163,12 +178,16 @@ def match_wikipedia_to_generated_capabilities(self, wikipedia_cap: WikipediaCapa
         # Batch size to avoid context length issues
         batch_size = 20
 
-        logger.info(f"  Processing {len(generated_caps)} capabilities in batches of {batch_size}")
+        logger.info(
+            f"  Processing {len(generated_caps)} capabilities in batches of {batch_size}"
+        )
 
         # Process capabilities in batches
         for i in range(0, len(generated_caps), batch_size):
-            batch = generated_caps[i:i + batch_size]
-            logger.info(f"  Processing batch {i//batch_size + 1}/{(len(generated_caps) + batch_size - 1)//batch_size}")
+            batch = generated_caps[i : i + batch_size]
+            logger.info(
+                f"  Processing batch {i // batch_size + 1}/{(len(generated_caps) + batch_size - 1) // batch_size}"
+            )
 
             result = self.match_wikipedia_to_generated_batch(wikipedia_cap, batch)
 
@@ -178,10 +197,14 @@ def match_wikipedia_to_generated_capabilities(self, wikipedia_cap: WikipediaCapa
                 return result
 
         # No match found in any batch
-        logger.info(f"  No match found in any batch")
+        logger.info("  No match found in any batch")
         return "none"
 
-    def match_wikipedia_to_generated_batch(self, wikipedia_cap: WikipediaCapability, generated_caps_batch: List[GeneratedCapability]) -> str:
+    def match_wikipedia_to_generated_batch(
+        self,
+        wikipedia_cap: WikipediaCapability,
+        generated_caps_batch: List[GeneratedCapability],
+    ) -> str:
         """Match a single Wikipedia capability against a batch of generated capabilities.
 
         Returns the exact generated capability name if a match is found, otherwise "none".
@@ -189,35 +212,35 @@ def match_wikipedia_to_generated_batch(self, wikipedia_cap: WikipediaCapability,
         if not generated_caps_batch:
             return "none"
 
-        capabilities_list = "\n".join([f"- {cap.name}: {cap.description}" for cap in generated_caps_batch])
+        capabilities_list = "\n".join(
+            [f"- {cap.name}: {cap.description}" for cap in generated_caps_batch]
+        )
 
         user_prompt = get_wikipedia_to_generated_prompt(
-            wikipedia_cap.name,
-            wikipedia_cap.description,
-            capabilities_list
+            wikipedia_cap.name, wikipedia_cap.description, capabilities_list
         )
 
         try:
             response, metadata = self.model.generate(
                 sys_prompt=SYSTEM_PROMPT_MATH_CAPABILITIES,
                 user_prompt=user_prompt,
-                generation_config={
-                    "temperature": 0.0,
-                    "max_tokens": 100
-                }
+                generation_config={"temperature": 0.0, "max_tokens": 100},
             )
 
-            response = response.strip()
+            response_str: str = str(response).strip()
             capability_names = [cap.name for cap in generated_caps_batch]
-            if response in capability_names:
-                return response
-            else:
-                return "none"
+            if response_str in capability_names:
+                return response_str
+            return "none"
         except Exception as e:
             logger.error(f"Error matching Wikipedia capability to generated batch: {e}")
             return "none"
 
-    def match_generated_to_wikipedia_capabilities(self, generated_cap: GeneratedCapability, wikipedia_caps: List[WikipediaCapability]) -> str:
+    def match_generated_to_wikipedia_capabilities(
+        self,
+        generated_cap: GeneratedCapability,
+        wikipedia_caps: List[WikipediaCapability],
+    ) -> str:
         """
         Match generated capability to Wikipedia capabilities using batching.
 
@@ -225,7 +248,8 @@ def match_generated_to_wikipedia_capabilities(self, generated_cap: GeneratedCapa
             generated_cap: Generated capability
             wikipedia_caps: List of Wikipedia capabilities in the matched area
 
-        Returns:
+        Returns
+        -------
             Name of the matched Wikipedia capability or "none" if no match
         """
         if not wikipedia_caps:
@@ -234,12 +258,16 @@ def match_generated_to_wikipedia_capabilities(self, generated_cap: GeneratedCapa
         # Batch size to avoid context length issues
         batch_size = 40
 
-        logger.info(f"  Processing {len(wikipedia_caps)} Wikipedia capabilities in batches of {batch_size}")
+        logger.info(
+            f"  Processing {len(wikipedia_caps)} Wikipedia capabilities in batches of {batch_size}"
+        )
 
         # Process capabilities in batches
         for i in range(0, len(wikipedia_caps), batch_size):
-            batch = wikipedia_caps[i:i + batch_size]
-            logger.info(f"  Processing batch {i//batch_size + 1}/{(len(wikipedia_caps) + batch_size - 1)//batch_size}")
+            batch = wikipedia_caps[i : i + batch_size]
+            logger.info(
+                f"  Processing batch {i // batch_size + 1}/{(len(wikipedia_caps) + batch_size - 1) // batch_size}"
+            )
 
             result = self.match_generated_to_wikipedia_batch(generated_cap, batch)
 
@@ -249,10 +277,14 @@ def match_generated_to_wikipedia_capabilities(self, generated_cap: GeneratedCapa
                 return result
 
         # No match found in any batch
-        logger.info(f"  No match found in any batch")
+        logger.info("  No match found in any batch")
         return "none"
 
-    def match_generated_to_wikipedia_batch(self, generated_cap: GeneratedCapability, wikipedia_caps_batch: List[WikipediaCapability]) -> str:
+    def match_generated_to_wikipedia_batch(
+        self,
+        generated_cap: GeneratedCapability,
+        wikipedia_caps_batch: List[WikipediaCapability],
+    ) -> str:
         """
         Match generated capability to a batch of Wikipedia capabilities.
 
@@ -260,46 +292,46 @@ def match_generated_to_wikipedia_batch(self, generated_cap: GeneratedCapability,
             generated_cap: Generated capability to match
             wikipedia_caps_batch: Batch of Wikipedia capabilities to match against
 
-        Returns:
+        Returns
+        -------
             Name of the matched Wikipedia capability or "none" if no match
         """
         if not wikipedia_caps_batch:
             return "none"
 
-        capabilities_list = "\n".join([f"- {cap.name}: {cap.description}" for cap in wikipedia_caps_batch])
+        capabilities_list = "\n".join(
+            [f"- {cap.name}: {cap.description}" for cap in wikipedia_caps_batch]
+        )
 
         user_prompt = get_generated_to_wikipedia_prompt(
-            generated_cap.name,
-            generated_cap.description,
-            capabilities_list
+            generated_cap.name, generated_cap.description, capabilities_list
         )
 
         try:
             response, metadata = self.model.generate(
                 sys_prompt=SYSTEM_PROMPT_MATH_CAPABILITIES,
                 user_prompt=user_prompt,
-                generation_config={
-                    "temperature": 0.0,
-                    "max_tokens": 100
-                }
+                generation_config={"temperature": 0.0, "max_tokens": 100},
             )
 
             # Clean the response
-            response = response.strip()
+            response_str: str = str(response).strip()
 
             # Check if the response matches one of the available capabilities
             capability_names = [cap.name for cap in wikipedia_caps_batch]
-            if response in capability_names:
-                return response
-            else:
-                return "none"
+            if response_str in capability_names:
+                return response_str
+            return "none"
 
         except Exception as e:
             logger.error(f"Error matching artifact to Wikipedia batch: {e}")
             return "none"
 
-    def match_capabilities(self, generated_caps: List[GeneratedCapability],
-                          categorized_wikipedia_caps: Dict[str, List[WikipediaCapability]]) -> Dict[str, str]:
+    def match_capabilities(
+        self,
+        generated_caps: List[GeneratedCapability],
+        categorized_wikipedia_caps: Dict[str, List[WikipediaCapability]],
+    ) -> Dict[str, str]:
         """Match capabilities based on configured direction.
 
         Returns a mapping:
@@ -317,64 +349,98 @@ def match_capabilities(self, generated_caps: List[GeneratedCapability],
         for cap in generated_caps:
             generated_caps_by_area.setdefault(cap.area, []).append(cap)
 
-        if self.match_direction == 'generated_to_wikipedia':
-            logger.info(f"Starting two-step matching process (GENERATED -> WIKIPEDIA):")
+        if self.match_direction == "generated_to_wikipedia":
+            logger.info("Starting two-step matching process (GENERATED -> WIKIPEDIA):")
             logger.info(f"  - {len(generated_caps)} generated capabilities")
             logger.info(f"  - {len(all_wikipedia_caps)} Wikipedia capabilities")
-            logger.info(f"  - {len(generated_caps_by_area)} generated areas: {list(generated_caps_by_area.keys())}")
+            logger.info(
+                f"  - {len(generated_caps_by_area)} generated areas: {list(generated_caps_by_area.keys())}"
+            )
 
             for i, generated_cap in enumerate(generated_caps):
-                logger.info(f"\nProcessing generated capability {i+1}/{len(generated_caps)}: {generated_cap.name}")
+                logger.info(
+                    f"\nProcessing generated capability {i + 1}/{len(generated_caps)}: {generated_cap.name}"
+                )
                 logger.info(f"  Generated area: {generated_cap.area}")
 
                 wikipedia_area = generated_cap.area
                 area_wikipedia_caps = categorized_wikipedia_caps.get(wikipedia_area, [])
                 if not area_wikipedia_caps:
-                    logger.info(f"  - NO WIKIPEDIA CAPABILITIES in area '{wikipedia_area}'")
+                    logger.info(
+                        f"  - NO WIKIPEDIA CAPABILITIES in area '{wikipedia_area}'"
+                    )
                     results[generated_cap.name] = "none"
                     continue
 
-                logger.info(f"  + Found {len(area_wikipedia_caps)} Wikipedia capabilities in area '{wikipedia_area}'")
-                matched = self.match_generated_to_wikipedia_capabilities(generated_cap, area_wikipedia_caps)
+                logger.info(
+                    f"  + Found {len(area_wikipedia_caps)} Wikipedia capabilities in area '{wikipedia_area}'"
+                )
+                matched = self.match_generated_to_wikipedia_capabilities(
+                    generated_cap, area_wikipedia_caps
+                )
                 if matched == "none":
-                    logger.info(f"  - NO WIKIPEDIA MATCH: {generated_cap.name} in area '{wikipedia_area}'")
+                    logger.info(
+                        f"  - NO WIKIPEDIA MATCH: {generated_cap.name} in area '{wikipedia_area}'"
+                    )
                 else:
-                    logger.info(f"  + WIKIPEDIA MATCH: {generated_cap.name} -> {matched} (in area '{wikipedia_area}')")
+                    logger.info(
+                        f"  + WIKIPEDIA MATCH: {generated_cap.name} -> {matched} (in area '{wikipedia_area}')"
+                    )
                 results[generated_cap.name] = matched
 
-        elif self.match_direction == 'wikipedia_to_generated':
-            logger.info(f"Starting two-step matching process (WIKIPEDIA -> GENERATED):")
+        elif self.match_direction == "wikipedia_to_generated":
+            logger.info("Starting two-step matching process (WIKIPEDIA -> GENERATED):")
             logger.info(f"  - {len(all_wikipedia_caps)} Wikipedia capabilities")
             logger.info(f"  - {len(generated_caps)} generated capabilities")
-            logger.info(f"  - {len(generated_caps_by_area)} generated areas: {list(generated_caps_by_area.keys())}")
+            logger.info(
+                f"  - {len(generated_caps_by_area)} generated areas: {list(generated_caps_by_area.keys())}"
+            )
 
             for i, wikipedia_cap in enumerate(all_wikipedia_caps):
-                logger.info(f"\nProcessing Wikipedia capability {i+1}/{len(all_wikipedia_caps)}: {wikipedia_cap.name}")
+                logger.info(
+                    f"\nProcessing Wikipedia capability {i + 1}/{len(all_wikipedia_caps)}: {wikipedia_cap.name}"
+                )
                 logger.info(f"  Wikipedia area: {wikipedia_cap.area}")
 
                 generated_area = wikipedia_cap.area
                 area_generated_caps = generated_caps_by_area.get(generated_area, [])
                 if not area_generated_caps:
-                    logger.info(f"  - NO GENERATED CAPABILITIES in area '{generated_area}'")
+                    logger.info(
+                        f"  - NO GENERATED CAPABILITIES in area '{generated_area}'"
+                    )
                     results[wikipedia_cap.name] = "none"
                     continue
 
-                logger.info(f"  + Found {len(area_generated_caps)} generated capabilities in area '{generated_area}'")
-                matched = self.match_wikipedia_to_generated_capabilities(wikipedia_cap, area_generated_caps)
+                logger.info(
+                    f"  + Found {len(area_generated_caps)} generated capabilities in area '{generated_area}'"
+                )
+                matched = self.match_wikipedia_to_generated_capabilities(
+                    wikipedia_cap, area_generated_caps
+                )
                 if matched == "none":
-                    logger.info(f"  - NO GENERATED MATCH: {wikipedia_cap.name} in area '{generated_area}'")
+                    logger.info(
+                        f"  - NO GENERATED MATCH: {wikipedia_cap.name} in area '{generated_area}'"
+                    )
                 else:
-                    logger.info(f"  + GENERATED MATCH: {wikipedia_cap.name} -> {matched} (in area '{generated_area}')")
+                    logger.info(
+                        f"  + GENERATED MATCH: {wikipedia_cap.name} -> {matched} (in area '{generated_area}')"
+                    )
                 results[wikipedia_cap.name] = matched
 
         else:
-            raise ValueError("processing_cfg.match_direction must be 'generated_to_wikipedia' or 'wikipedia_to_generated'")
+            raise ValueError(
+                "processing_cfg.match_direction must be 'generated_to_wikipedia' or 'wikipedia_to_generated'"
+            )
 
         return results
 
-    def save_results(self, results: Dict[str, str], output_path: str,
-                    generated_caps: List[GeneratedCapability],
-                    categorized_wikipedia_caps: Dict[str, List[WikipediaCapability]]) -> None:
+    def save_results(
+        self,
+        results: Dict[str, str],
+        output_path: str,
+        generated_caps: List[GeneratedCapability],
+        categorized_wikipedia_caps: Dict[str, List[WikipediaCapability]],
+    ) -> None:
         """
         Save results to JSON file with detailed information.
 
@@ -398,37 +464,34 @@ def save_results(self, results: Dict[str, str], output_path: str,
                 "total_wikipedia_capabilities": len(all_wikipedia_caps),
                 "categorized_wikipedia_areas": len(categorized_wikipedia_caps),
                 "matched_capabilities": sum(1 for v in results.values() if v != "none"),
-                "unmatched_capabilities": sum(1 for v in results.values() if v == "none"),
-                "match_rate": sum(1 for v in results.values() if v != "none") / len(results) if results else 0,
-                "matching_direction": self.match_direction
+                "unmatched_capabilities": sum(
+                    1 for v in results.values() if v == "none"
+                ),
+                "match_rate": sum(1 for v in results.values() if v != "none")
+                / len(results)
+                if results
+                else 0,
+                "matching_direction": self.match_direction,
             },
             "matching_results": results,
             "generated_capabilities": [
-                {
-                    "name": cap.name,
-                    "description": cap.description,
-                    "area": cap.area
-                } for cap in generated_caps
+                {"name": cap.name, "description": cap.description, "area": cap.area}
+                for cap in generated_caps
             ],
             "wikipedia_capabilities": [
-                {
-                    "name": cap.name,
-                    "description": cap.description,
-                    "area": cap.area
-                } for cap in all_wikipedia_caps
+                {"name": cap.name, "description": cap.description, "area": cap.area}
+                for cap in all_wikipedia_caps
             ],
             "wikipedia_capabilities_by_area": {
                 area: [
-                    {
-                        "name": cap.name,
-                        "description": cap.description,
-                        "area": cap.area
-                    } for cap in caps
-                ] for area, caps in categorized_wikipedia_caps.items()
-            }
+                    {"name": cap.name, "description": cap.description, "area": cap.area}
+                    for cap in caps
+                ]
+                for area, caps in categorized_wikipedia_caps.items()
+            },
         }
 
-        with open(output_path, 'w', encoding='utf-8') as f:
+        with open(output_path, "w", encoding="utf-8") as f:
             json.dump(detailed_results, f, indent=2, ensure_ascii=False)
 
         logger.info(f"Detailed results saved to: {output_path}")
@@ -440,14 +503,14 @@ def print_results(self, results: Dict[str, str]) -> None:
         Args:
             results: Dictionary of matching results
         """
-        print("\n" + "="*80)
-        if self.match_direction == 'generated_to_wikipedia':
+        print("\n" + "=" * 80)
+        if self.match_direction == "generated_to_wikipedia":
             print("GENERATED → WIKIPEDIA MATCHING RESULTS")
         else:
             print("WIKIPEDIA → GENERATED MATCHING RESULTS")
-        print("="*80)
+        print("=" * 80)
 
-        if self.match_direction == 'generated_to_wikipedia':
+        if self.match_direction == "generated_to_wikipedia":
             for generated_name, wikipedia_name in results.items():
                 if wikipedia_name == "none":
                     print(f"[NO MATCH] {generated_name} -> NO MATCH")
@@ -460,22 +523,18 @@ def print_results(self, results: Dict[str, str]) -> None:
                 else:
                     print(f"[MATCH] {wikipedia_name} -> {generated_name}")
 
-        print("="*80)
-        if self.match_direction == 'generated_to_wikipedia':
+        print("=" * 80)
+        if self.match_direction == "generated_to_wikipedia":
             print(f"Total generated capabilities: {len(results)}")
         else:
             print(f"Total Wikipedia capabilities: {len(results)}")
         matched_count = sum(1 for v in results.values() if v != "none")
         print(f"Matched capabilities: {matched_count}")
         print(f"Unmatched capabilities: {len(results) - matched_count}")
-        print("="*80)
+        print("=" * 80)
 
 
-@hydra.main(
-    version_base=None,
-    config_path="cfg",
-    config_name="wiki_vs_generated"
-)
+@hydra.main(version_base=None, config_path="cfg", config_name="wiki_vs_generated")
 def main(cfg: DictConfig) -> None:
     """
     Main function to run generated-Wikipedia matching (generated -> Wikipedia direction).
@@ -483,7 +542,9 @@ def main(cfg: DictConfig) -> None:
     Args:
         cfg: Configuration for the matching process
     """
-    logger.info("Starting Generated-Wikipedia Matcher V2 Fixed (Generated -> Wikipedia Version)")
+    logger.info(
+        "Starting Generated-Wikipedia Matcher V2 Fixed (Generated -> Wikipedia Version)"
+    )
 
     # Initialize matcher
     matcher = GeneratedVsWikipedia(cfg)
@@ -515,7 +576,9 @@ def main(cfg: DictConfig) -> None:
     name, ext = os.path.splitext(orig_filename)
     filename_with_suffix = f"{name}_generated_to_wikipedia{ext or '.json'}"
     output_path = os.path.join(cfg.output_cfg.results_dir, filename_with_suffix)
-    matcher.save_results(results, output_path, generated_caps, categorized_wikipedia_caps)
+    matcher.save_results(
+        results, output_path, generated_caps, categorized_wikipedia_caps
+    )
     print(f"Output saved to: {output_path}")
 
     logger.info("Generated-Wikipedia matching completed!")
diff --git a/wikipedia/wikipedia_scraper.py b/wikipedia/wikipedia_scraper.py
index 6a037d74..b489878e 100644
--- a/wikipedia/wikipedia_scraper.py
+++ b/wikipedia/wikipedia_scraper.py
@@ -9,38 +9,49 @@
 Source: https://en.wikipedia.org/wiki/Glossary_of_areas_of_mathematics
 """
 
+import json
+import logging
 import os
 import re
-import json
-import requests
-from bs4 import BeautifulSoup
 import time
-import logging
-from typing import List, Dict, Tuple, Optional
-from urllib.parse import urljoin, urlparse
+from typing import Dict, List, Optional, Tuple
+from urllib.parse import urljoin
+
+import requests
+from bs4 import BeautifulSoup, Tag
+
 
 # Set up logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
 logger = logging.getLogger(__name__)
 
 # Import GPT model functionality (assuming it's available in the project)
 try:
     import sys
-    sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+
+    sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
     from src.model import Model
     from wikipedia.prompts import (
         SYSTEM_PROMPT_CAPABILITY_EVALUATION,
         SYSTEM_PROMPT_CATEGORIZATION,
-        get_capability_summary_prompt,
         get_capability_categorization_prompt,
+        get_capability_summary_prompt,
     )
+
     GPT_AVAILABLE = True
 except ImportError:
     logger.warning("GPT model not available. Will use fallback summarization.")
     GPT_AVAILABLE = False
 
 
-def generate_summary_with_gpt(description: str, model: Model, cache_dir: str = None, capability_name: str = None) -> Tuple[str, bool]:
+def generate_summary_with_gpt(
+    description: str,
+    model: Model,
+    cache_dir: Optional[str] = None,
+    capability_name: Optional[str] = None,
+) -> Tuple[str, bool]:
     """
     Generate a concise summary of a capability description using GPT.
 
@@ -50,48 +61,51 @@ def generate_summary_with_gpt(description: str, model: Model, cache_dir: str = N
         cache_dir: Directory to cache summaries (optional)
         capability_name: Name of the capability for caching (optional)
 
-    Returns:
+    Returns
+    -------
         A tuple of (summary, was_cached)
     """
     # Try to load cached summary first
     if cache_dir and capability_name:
         os.makedirs(cache_dir, exist_ok=True)
         # Sanitize filename by replacing invalid characters
-        safe_name = "".join(c for c in capability_name if c.isalnum() or c in (' ', '-', '_')).rstrip()
-        safe_name = safe_name.replace(' ', '_')
+        safe_name = "".join(
+            c for c in capability_name if c.isalnum() or c in (" ", "-", "_")
+        ).rstrip()
+        safe_name = safe_name.replace(" ", "_")
         cache_file = os.path.join(cache_dir, f"summary_{safe_name}.txt")
 
         if os.path.exists(cache_file):
             try:
-                with open(cache_file, 'r', encoding='utf-8') as f:
+                with open(cache_file, "r", encoding="utf-8") as f:
                     cached_summary = f.read().strip()
                 logger.debug(f"Loaded cached summary for '{capability_name}'")
                 return cached_summary, True
             except Exception as e:
-                logger.warning(f"Failed to load cached summary for '{capability_name}': {e}")
+                logger.warning(
+                    f"Failed to load cached summary for '{capability_name}': {e}"
+                )
 
     sys_prompt = SYSTEM_PROMPT_CAPABILITY_EVALUATION
     user_prompt = get_capability_summary_prompt(description)
 
-    generation_config = {
-        "temperature": 0.3,
-        "max_tokens": 200,
-        "seed": 42
-    }
+    generation_config = {"temperature": 0.3, "max_tokens": 200, "seed": 42}
 
     try:
         summary, metadata = model.generate(
             sys_prompt=sys_prompt,
             user_prompt=user_prompt,
-            generation_config=generation_config
+            generation_config=generation_config,
         )
         summary = summary.strip()
-        logger.debug(f"Generated summary for '{description[:50]}...' with {metadata['output_tokens']} tokens")
+        logger.debug(
+            f"Generated summary for '{description[:50]}...' with {metadata['output_tokens']} tokens"
+        )
 
         # Cache the summary if cache_dir is provided
         if cache_dir and capability_name:
             try:
-                with open(cache_file, 'w', encoding='utf-8') as f:
+                with open(cache_file, "w", encoding="utf-8") as f:
                     f.write(summary)
                 logger.debug(f"Cached summary for '{capability_name}' to {cache_file}")
             except Exception as e:
@@ -99,24 +113,35 @@ def generate_summary_with_gpt(description: str, model: Model, cache_dir: str = N
 
         return summary, False
     except Exception as e:
-        logger.warning(f"Failed to generate summary with GPT: {e}. Using fallback method.")
+        logger.warning(
+            f"Failed to generate summary with GPT: {e}. Using fallback method."
+        )
         # Fallback to first sentence extraction
-        for end_char in ['.', '!', '?']:
+        for end_char in [".", "!", "?"]:
             if end_char in description:
                 fallback_summary = description.split(end_char)[0] + end_char
                 # Cache the fallback summary too
                 if cache_dir and capability_name:
                     try:
-                        with open(cache_file, 'w', encoding='utf-8') as f:
+                        with open(cache_file, "w", encoding="utf-8") as f:
                             f.write(fallback_summary)
-                        logger.debug(f"Cached fallback summary for '{capability_name}' to {cache_file}")
+                        logger.debug(
+                            f"Cached fallback summary for '{capability_name}' to {cache_file}"
+                        )
                     except Exception as cache_e:
-                        logger.warning(f"Failed to cache fallback summary for '{capability_name}': {cache_e}")
+                        logger.warning(
+                            f"Failed to cache fallback summary for '{capability_name}': {cache_e}"
+                        )
                 return fallback_summary, False
         return description, False
 
 
-def categorize_capability_with_gpt(description: str, model: Model, cache_dir: str = None, capability_name: str = None) -> Tuple[str, bool]:
+def categorize_capability_with_gpt(
+    description: str,
+    model: Model,
+    cache_dir: Optional[str] = None,
+    capability_name: Optional[str] = None,
+) -> Tuple[str, bool]:
     """
     Categorize a capability description using GPT into one of the 10 mathematical areas.
 
@@ -126,39 +151,42 @@ def categorize_capability_with_gpt(description: str, model: Model, cache_dir: st
         cache_dir: Directory to cache categorizations (optional)
         capability_name: Name of the capability for caching (optional)
 
-    Returns:
+    Returns
+    -------
         A tuple of (category, was_cached)
     """
     # Try to load cached categorization first
     if cache_dir and capability_name:
         os.makedirs(cache_dir, exist_ok=True)
-        safe_name = "".join(c for c in capability_name if c.isalnum() or c in (' ', '-', '_')).rstrip()
-        safe_name = safe_name.replace(' ', '_')
+        safe_name = "".join(
+            c for c in capability_name if c.isalnum() or c in (" ", "-", "_")
+        ).rstrip()
+        safe_name = safe_name.replace(" ", "_")
         cache_file = os.path.join(cache_dir, f"category_{safe_name}.txt")
 
         if os.path.exists(cache_file):
             try:
-                with open(cache_file, 'r', encoding='utf-8') as f:
+                with open(cache_file, "r", encoding="utf-8") as f:
                     cached_category = f.read().strip()
-                logger.debug(f"Loaded cached category for '{capability_name}': {cached_category}")
+                logger.debug(
+                    f"Loaded cached category for '{capability_name}': {cached_category}"
+                )
                 return cached_category, True
             except Exception as e:
-                logger.warning(f"Failed to load cached category for '{capability_name}': {e}")
+                logger.warning(
+                    f"Failed to load cached category for '{capability_name}': {e}"
+                )
 
     sys_prompt = SYSTEM_PROMPT_CATEGORIZATION
     user_prompt = get_capability_categorization_prompt(description)
 
-    generation_config = {
-        "temperature": 0.1,
-        "max_tokens": 50,
-        "seed": 42
-    }
+    generation_config = {"temperature": 0.1, "max_tokens": 50, "seed": 42}
 
     try:
         category, metadata = model.generate(
             sys_prompt=sys_prompt,
             user_prompt=user_prompt,
-            generation_config=generation_config
+            generation_config=generation_config,
         )
         category = category.strip()
         logger.debug(f"Generated category for '{description[:50]}...': {category}")
@@ -166,7 +194,7 @@ def categorize_capability_with_gpt(description: str, model: Model, cache_dir: st
         # Cache the category if cache_dir is provided
         if cache_dir and capability_name:
             try:
-                with open(cache_file, 'w', encoding='utf-8') as f:
+                with open(cache_file, "w", encoding="utf-8") as f:
                     f.write(category)
                 logger.debug(f"Cached category for '{capability_name}' to {cache_file}")
             except Exception as e:
@@ -174,23 +202,31 @@ def categorize_capability_with_gpt(description: str, model: Model, cache_dir: st
 
         return category, False
     except Exception as e:
-        logger.warning(f"Failed to generate category with GPT: {e}. Using fallback category.")
+        logger.warning(
+            f"Failed to generate category with GPT: {e}. Using fallback category."
+        )
         # Fallback to default category
         fallback_category = "Algebra and Functions"
         if cache_dir and capability_name:
             try:
-                with open(cache_file, 'w', encoding='utf-8') as f:
+                with open(cache_file, "w", encoding="utf-8") as f:
                     f.write(fallback_category)
-                logger.debug(f"Cached fallback category for '{capability_name}' to {cache_file}")
+                logger.debug(
+                    f"Cached fallback category for '{capability_name}' to {cache_file}"
+                )
             except Exception as cache_e:
-                logger.warning(f"Failed to cache fallback category for '{capability_name}': {cache_e}")
+                logger.warning(
+                    f"Failed to cache fallback category for '{capability_name}': {cache_e}"
+                )
         return fallback_category, False
 
 
 class WikipediaGlossaryScraper:
     """Scraper for Wikipedia glossary of areas of mathematics with categorization and summarization."""
 
-    def __init__(self, base_url: str, output_dir: str, gpt_model: Model = None):
+    def __init__(
+        self, base_url: str, output_dir: str, gpt_model: Optional[Model] = None
+    ) -> None:
         """
         Initialize the scraper.
 
@@ -203,14 +239,16 @@ def __init__(self, base_url: str, output_dir: str, gpt_model: Model = None):
         self.output_dir = output_dir
         self.gpt_model = gpt_model
         self.session = requests.Session()
-        self.session.headers.update({
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-            'Accept-Language': 'en-US,en;q=0.5',
-            'Accept-Encoding': 'gzip, deflate',
-            'Connection': 'keep-alive',
-            'Upgrade-Insecure-Requests': '1',
-        })
+        self.session.headers.update(
+            {
+                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+                "Accept-Language": "en-US,en;q=0.5",
+                "Accept-Encoding": "gzip, deflate",
+                "Connection": "keep-alive",
+                "Upgrade-Insecure-Requests": "1",
+            }
+        )
 
         # Create output directory
         os.makedirs(self.output_dir, exist_ok=True)
@@ -225,7 +263,8 @@ def get_page_content(self) -> BeautifulSoup:
         """
         Fetch and parse the Wikipedia glossary page.
 
-        Returns:
+        Returns
+        -------
             BeautifulSoup object of the page content
         """
         try:
@@ -233,7 +272,7 @@ def get_page_content(self) -> BeautifulSoup:
             response = self.session.get(self.base_url, timeout=30)
             response.raise_for_status()
 
-            soup = BeautifulSoup(response.content, 'html.parser')
+            soup = BeautifulSoup(response.content, "html.parser")
             logger.info("Successfully fetched and parsed the page")
             return soup
 
@@ -248,7 +287,8 @@ def get_page_first_section(self, page_url: str) -> str:
         Args:
             page_url: URL of the individual Wikipedia page
 
-        Returns:
+        Returns
+        -------
             First section text content
         """
         try:
@@ -256,45 +296,46 @@ def get_page_first_section(self, page_url: str) -> str:
             response = self.session.get(page_url, timeout=30)
             response.raise_for_status()
 
-            soup = BeautifulSoup(response.content, 'html.parser')
+            soup = BeautifulSoup(response.content, "html.parser")
 
             # Find the main content area
-            content_div = soup.find('div', {'class': 'mw-parser-output'})
-            if not content_div:
+            content_div = soup.find("div", {"class": "mw-parser-output"})
+            if not content_div or not isinstance(content_div, Tag):
                 logger.warning(f"Could not find main content for {page_url}")
                 return ""
 
             # Collect all consecutive elements before the first h2 as the intro section
-            intro_texts = []
+            intro_texts: List[str] = []
             for child in content_div.children:
                 # Only consider tag elements
-                if not hasattr(child, 'name') or child.name is None:
+                if not isinstance(child, Tag):
                     continue
                 # Stop at the first h2 (start of second section)
-                if child.name == 'h2':
+                if child.name == "h2":
                     break
                 # Capture paragraphs and short intro divs (infobox/sidebar divs are skipped)
-                if child.name == 'p':
-                    text = child.get_text(' ', strip=True)
+                if child.name == "p":
+                    text = child.get_text(" ", strip=True)
                     if text:
                         intro_texts.append(text)
-                elif child.name in ('div',):
+                elif child.name in ("div",):
                     # Some pages wrap first paragraphs in a div; extract contained paragraph texts
-                    inner_paras = child.find_all('p', recursive=False)
+                    inner_paras = child.find_all("p", recursive=False)
                     for p in inner_paras:
-                        text = p.get_text(' ', strip=True)
+                        text = p.get_text(" ", strip=True)
                         if text:
                             intro_texts.append(text)
 
             if intro_texts:
-                description = ' '.join(intro_texts)
+                description = " ".join(intro_texts)
                 # Normalize whitespace
-                description = ' '.join(description.split())
-                logger.debug(f"Extracted first section from {page_url}: {description[:100]}...")
+                description = " ".join(description.split())
+                logger.debug(
+                    f"Extracted first section from {page_url}: {description[:100]}..."
+                )
                 return description
-            else:
-                logger.warning(f"No first section content found for {page_url}")
-                return ""
+            logger.warning(f"No first section content found for {page_url}")
+            return ""
 
         except Exception as e:
             logger.warning(f"Error fetching individual page {page_url}: {e}")
@@ -307,51 +348,63 @@ def extract_glossary_entries(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
         Args:
             soup: BeautifulSoup object of the page content
 
-        Returns:
+        Returns
+        -------
             List of dictionaries containing name and description for each entry
         """
-        entries = []
+        entries: List[Dict[str, str]] = []
 
         # Find the main content area
-        content_div = soup.find('div', {'class': 'mw-parser-output'})
-        if not content_div:
+        content_div = soup.find("div", {"class": "mw-parser-output"})
+        if not content_div or not isinstance(content_div, Tag):
             logger.error("Could not find main content div")
             return entries
 
         # Find all definition lists (dl elements)
-        dl_elements = content_div.find_all('dl')
+        dl_elements = content_div.find_all("dl")
         logger.info(f"Found {len(dl_elements)} definition lists")
 
         # Process each definition list
         for dl in dl_elements:
+            if not isinstance(dl, Tag):
+                continue
             # Find all definition terms (dt elements) in this list
-            dt_elements = dl.find_all('dt')
+            dt_elements = dl.find_all("dt")
             logger.info(f"Found {len(dt_elements)} definition terms in this list")
 
             # Process each definition term
             for dt in dt_elements:
+                if not isinstance(dt, Tag):
+                    continue
                 # Get the main link (first link) in this definition term
                 # This should be the primary mathematical topic
-                main_link = dt.find('a', href=True)
-
-                if main_link:
-                    href = main_link.get('href', '')
+                main_link = dt.find("a", href=True)
+
+                if main_link and isinstance(main_link, Tag):
+                    href_attr = main_link.get("href", "")
+                    # Ensure href is a string (can be list for multi-value attrs)
+                    href: str = (
+                        href_attr[0]
+                        if isinstance(href_attr, list)
+                        else str(href_attr or "")
+                    )
                     text = main_link.get_text(strip=True)
 
                     # Skip if it's not a Wikipedia article link or if it's too short
-                    if (href.startswith('/wiki/') and
-                        not href.startswith('/wiki/File:') and
-                        not href.startswith('/wiki/Template:') and
-                        not href.startswith('/wiki/Category:') and
-                        not href.startswith('/wiki/Help:') and
-                        not href.startswith('/wiki/Special:') and
-                        not href.startswith('/wiki/User:') and
-                        not href.startswith('/wiki/Talk:') and
-                        not href.startswith('/wiki/User_talk:') and
-                        not href.startswith('/wiki/Wikipedia:') and
-                        len(text) > 3 and
-                        len(text) < 100):  # Reasonable length for topic names
-
+                    if (
+                        href.startswith("/wiki/")
+                        and not href.startswith("/wiki/File:")
+                        and not href.startswith("/wiki/Template:")
+                        and not href.startswith("/wiki/Category:")
+                        and not href.startswith("/wiki/Help:")
+                        and not href.startswith("/wiki/Special:")
+                        and not href.startswith("/wiki/User:")
+                        and not href.startswith("/wiki/Talk:")
+                        and not href.startswith("/wiki/User_talk:")
+                        and not href.startswith("/wiki/Wikipedia:")
+                        and len(text) > 3
+                        and len(text) < 100
+                    ):  # Reasonable length for topic names
                         try:
                             logger.info(f"Processing: {text}")
 
@@ -359,15 +412,23 @@ def extract_glossary_entries(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
                             page_url = urljoin(self.base_url, href)
                             description = self.get_page_first_section(page_url)
 
-                            if description and len(description) > 50:  # Ensure we have substantial content
-                                entries.append({
-                                    'name': text,
-                                    'description': description,
-                                    'page_url': page_url
-                                })
-                                logger.info(f"+ Successfully extracted description for '{text}'")
+                            if (
+                                description and len(description) > 50
+                            ):  # Ensure we have substantial content
+                                entries.append(
+                                    {
+                                        "name": text,
+                                        "description": description,
+                                        "page_url": page_url,
+                                    }
+                                )
+                                logger.info(
+                                    f"+ Successfully extracted description for '{text}'"
+                                )
                             else:
-                                logger.warning(f"- No substantial description found for '{text}'")
+                                logger.warning(
+                                    f"- No substantial description found for '{text}'"
+                                )
 
                             # Add a small delay to be respectful to Wikipedia
                             time.sleep(0.5)
@@ -376,7 +437,9 @@ def extract_glossary_entries(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
                             logger.warning(f"Error processing '{text}': {e}")
                             continue
 
-        logger.info(f"Successfully extracted {len(entries)} mathematical topic descriptions")
+        logger.info(
+            f"Successfully extracted {len(entries)} mathematical topic descriptions"
+        )
         return entries
 
     def clean_filename(self, name: str) -> str:
@@ -386,13 +449,14 @@ def clean_filename(self, name: str) -> str:
         Args:
             name: The term name to clean
 
-        Returns:
+        Returns
+        -------
             Cleaned filename
         """
         # Replace special characters and spaces with underscores
-        filename = re.sub(r'[^\w\s-]', '', name)
-        filename = re.sub(r'[\s_-]+', '_', filename)
-        filename = filename.strip('_')
+        filename = re.sub(r"[^\w\s-]", "", name)
+        filename = re.sub(r"[\s_-]+", "_", filename)
+        filename = filename.strip("_")
 
         # Limit length
         if len(filename) > 100:
@@ -407,11 +471,12 @@ def save_entry_to_file(self, entry: Dict[str, str]) -> bool:
         Args:
             entry: Dictionary containing name, description, summary, and area
 
-        Returns:
+        Returns
+        -------
             True if successful, False otherwise
         """
         try:
-            filename = self.clean_filename(entry['name'])
+            filename = self.clean_filename(entry["name"])
             if not filename:
                 logger.warning(f"Could not create filename for: {entry['name']}")
                 return False
@@ -421,16 +486,16 @@ def save_entry_to_file(self, entry: Dict[str, str]) -> bool:
 
             # Create the complete JSON structure
             json_data = {
-                "capability_name": entry['name'],
-                "description": entry['description'],
-                "summary": entry.get('summary', ''),
-                "area": entry.get('area', 'Unknown'),
+                "capability_name": entry["name"],
+                "description": entry["description"],
+                "summary": entry.get("summary", ""),
+                "area": entry.get("area", "Unknown"),
                 "source": "Wikipedia Glossary of Areas of Mathematics",
-                "url": entry.get('page_url', self.base_url),
-                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
+                "url": entry.get("page_url", self.base_url),
+                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
             }
 
-            with open(filepath, 'w', encoding='utf-8') as f:
+            with open(filepath, "w", encoding="utf-8") as f:
                 json.dump(json_data, f, indent=2, ensure_ascii=False)
 
             logger.info(f"Saved: {filename}.json")
@@ -444,7 +509,8 @@ def scrape_and_save(self) -> int:
         """
         Main method to scrape the glossary, categorize, summarize, and save all entries.
 
-        Returns:
+        Returns
+        -------
             Number of entries successfully saved
         """
         try:
@@ -458,7 +524,9 @@ def scrape_and_save(self) -> int:
                 logger.error("No entries found to save")
                 return 0
 
-            logger.info(f"Processing {len(entries)} entries with categorization and summarization...")
+            logger.info(
+                f"Processing {len(entries)} entries with categorization and summarization..."
+            )
 
             # Process each entry with categorization and summarization
             saved_count = 0
@@ -466,47 +534,47 @@ def scrape_and_save(self) -> int:
             category_stats = {"generated": 0, "cached": 0}
 
             for i, entry in enumerate(entries):
-                logger.info(f"Processing entry {i+1}/{len(entries)}: {entry['name']}")
+                logger.info(f"Processing entry {i + 1}/{len(entries)}: {entry['name']}")
 
                 # Generate summary if GPT model is available
                 if self.gpt_model:
                     summary, summary_was_cached = generate_summary_with_gpt(
-                        entry['description'],
+                        entry["description"],
                         self.gpt_model,
                         self.summary_cache_dir,
-                        entry['name']
+                        entry["name"],
                     )
-                    entry['summary'] = summary
+                    entry["summary"] = summary
                     if summary_was_cached:
                         summary_stats["cached"] += 1
                     else:
                         summary_stats["generated"] += 1
                 else:
                     # Fallback to first sentence
-                    description = entry['description'].strip()
+                    description = entry["description"].strip()
                     summary = description
-                    for end_char in ['.', '!', '?']:
+                    for end_char in [".", "!", "?"]:
                         if end_char in description:
                             summary = description.split(end_char)[0] + end_char
                             break
-                    entry['summary'] = summary
+                    entry["summary"] = summary
 
                 # Categorize if GPT model is available
                 if self.gpt_model:
                     category, category_was_cached = categorize_capability_with_gpt(
-                        entry['description'],
+                        entry["description"],
                         self.gpt_model,
                         self.category_cache_dir,
-                        entry['name']
+                        entry["name"],
                     )
-                    entry['area'] = category
+                    entry["area"] = category
                     if category_was_cached:
                         category_stats["cached"] += 1
                     else:
                         category_stats["generated"] += 1
                 else:
                     # Fallback to default category
-                    entry['area'] = "Algebra and Functions"
+                    entry["area"] = "Algebra and Functions"
 
                 # Save the complete entry
                 logger.info(f"Attempting to save entry: {entry['name']}")
@@ -521,13 +589,19 @@ def scrape_and_save(self) -> int:
 
                 # Log progress every 10 entries
                 if (i + 1) % 10 == 0:
-                    logger.info(f"Progress: {i+1}/{len(entries)} entries processed")
+                    logger.info(f"Progress: {i + 1}/{len(entries)} entries processed")
 
             # Log final statistics
-            logger.info(f"Successfully saved {saved_count} out of {len(entries)} entries")
+            logger.info(
+                f"Successfully saved {saved_count} out of {len(entries)} entries"
+            )
             if self.gpt_model:
-                logger.info(f"Summary statistics: {summary_stats['generated']} generated, {summary_stats['cached']} loaded from cache")
-                logger.info(f"Category statistics: {category_stats['generated']} generated, {category_stats['cached']} loaded from cache")
+                logger.info(
+                    f"Summary statistics: {summary_stats['generated']} generated, {summary_stats['cached']} loaded from cache"
+                )
+                logger.info(
+                    f"Category statistics: {category_stats['generated']} generated, {category_stats['cached']} loaded from cache"
+                )
 
             return saved_count
 
@@ -536,15 +610,16 @@ def scrape_and_save(self) -> int:
             return 0
 
 
-def main():
+def main() -> int:
     """Main function to run the scraper."""
-
     # Configuration
     WIKIPEDIA_URL = "https://en.wikipedia.org/wiki/Glossary_of_areas_of_mathematics"
     # Save pages in the same directory as the script
     OUTPUT_DIR = os.path.join(os.path.dirname(__file__), "pages")
 
-    logger.info("Starting Wikipedia Glossary Scraper with Categorization and Summarization")
+    logger.info(
+        "Starting Wikipedia Glossary Scraper with Categorization and Summarization"
+    )
     logger.info(f"Source URL: {WIKIPEDIA_URL}")
     logger.info(f"Output directory: {OUTPUT_DIR}")
 
@@ -555,14 +630,20 @@ def main():
             # You can configure the model here
             gpt_model = Model(
                 model_name="gpt-3.5-turbo",  # or "gpt-4", "o1-mini", etc.
-                model_provider="openai"
+                model_provider="openai",
+            )
+            logger.info(
+                "[OK] GPT model initialized for categorization and summarization"
             )
-            logger.info("[OK] GPT model initialized for categorization and summarization")
         except Exception as e:
-            logger.warning(f"Failed to initialize GPT model: {e}. Will use fallback methods.")
+            logger.warning(
+                f"Failed to initialize GPT model: {e}. Will use fallback methods."
+            )
             gpt_model = None
     else:
-        logger.info("GPT model not available. Will use fallback methods for summarization and categorization.")
+        logger.info(
+            "GPT model not available. Will use fallback methods for summarization and categorization."
+        )
 
     # Create scraper instance
     scraper = WikipediaGlossaryScraper(WIKIPEDIA_URL, OUTPUT_DIR, gpt_model)
@@ -571,8 +652,12 @@ def main():
     saved_count = scraper.scrape_and_save()
 
     if saved_count > 0:
-        logger.info(f"[OK] Scraping completed successfully! Saved {saved_count} JSON entries.")
-        logger.info(f"Each entry contains: capability_name, description, summary, area, source, url, timestamp")
+        logger.info(
+            f"[OK] Scraping completed successfully! Saved {saved_count} JSON entries."
+        )
+        logger.info(
+            "Each entry contains: capability_name, description, summary, area, source, url, timestamp"
+        )
     else:
         logger.error("[FAIL] No entries were saved. Please check the logs for errors.")
 
@@ -580,4 +665,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()

From 38f3f15e91768182219124cf34dbfccf610e56ac Mon Sep 17 00:00:00 2001
From: Farnaz Kohankhaki <fkohankh8@gmail.com>
Date: Thu, 29 Jan 2026 11:28:54 -0800
Subject: [PATCH 2/4] fix: resolve ruff linting issues in wikipedia files

- Shorten long docstrings and comments (W505)
- Use imperative mood in docstrings (D401)
- Add period to module docstring first line (D400)
- Use lowercase variable names in functions (N806)
- Remove .keys() from dict iteration (SIM118)
- Fix module docstring formatting (D205, D404)
---
 src/run_embedding_eval.py        |  2 +-
 wikipedia/static_vs_generated.py | 22 +++++++++++-----------
 wikipedia/wiki_vs_generated.py   | 25 +++++++++++--------------
 wikipedia/wikipedia_scraper.py   | 30 ++++++++++++++----------------
 4 files changed, 37 insertions(+), 42 deletions(-)

diff --git a/src/run_embedding_eval.py b/src/run_embedding_eval.py
index 10cb9d9f..42d8b460 100644
--- a/src/run_embedding_eval.py
+++ b/src/run_embedding_eval.py
@@ -77,7 +77,7 @@ def main(cfg: DictConfig) -> None:
     rmse_dict = defaultdict(list)
     avg_std_dict = defaultdict(list)
     for dim_reduction_method in ["t-sne", "pca"]:
-        for rep_string_order in ["n", "nd", "and"]:
+        for rep_string_order in ["n", "nd", "and"]:  # typos: ignore
             # Embed capabilities using openai embedding model
             generate_and_set_capabilities_embeddings(
                 capabilities=capabilities,
diff --git a/wikipedia/static_vs_generated.py b/wikipedia/static_vs_generated.py
index 5b806b16..24e6884b 100644
--- a/wikipedia/static_vs_generated.py
+++ b/wikipedia/static_vs_generated.py
@@ -40,7 +40,7 @@ class AreaInfo:
 
 
 class DatasetQuestionCategorizer:
-    """Class to categorize questions from selected dataset using two-step LLM approach."""
+    """Categorize questions from selected dataset using two-step LLM approach."""
 
     def __init__(self, cfg: DictConfig) -> None:
         self.cfg = cfg
@@ -62,13 +62,13 @@ def __init__(self, cfg: DictConfig) -> None:
     def extract_areas_and_capabilities_from_generated(
         self, generated_dir: str
     ) -> Tuple[List[AreaInfo], Dict[str, List[CapabilityInfo]]]:
-        """Extract all areas and capabilities from the generated capabilities directory."""
+        """Extract areas and capabilities from the generated capabilities directory."""
         logger.info("Extracting areas and capabilities from generated capabilities...")
 
         areas: List[AreaInfo] = []
         capabilities_by_area: Dict[str, List[CapabilityInfo]] = {}
 
-        # Get all capability directories (handle nested structure like math/<capability_name>/)
+        # Get all capability directories (nested structure like math/<capability_name>/)
         capability_dirs = glob.glob(os.path.join(generated_dir, "*/"))
         print(f"Found {len(capability_dirs)} capability directories")
 
@@ -112,7 +112,7 @@ def extract_areas_and_capabilities_from_generated(
     def extract_areas_and_capabilities_from_wikipedia(
         self, wikipedia_dir: str
     ) -> Tuple[List[AreaInfo], Dict[str, List[CapabilityInfo]]]:
-        """Extract all areas and capabilities from the Wikipedia pages directory containing individual JSON files."""
+        """Extract areas and capabilities from Wikipedia pages directory."""
         logger.info(
             f"Extracting areas and capabilities from Wikipedia pages directory: {wikipedia_dir}"
         )
@@ -210,12 +210,12 @@ def _find_matching_area_key(self, predicted_area: str) -> str:
 
         # Try normalized match
         area_key: str
-        for area_key in self.capabilities_by_area.keys():
+        for area_key in self.capabilities_by_area:
             if self._normalize_text(area_key) == predicted_normalized:
                 return area_key
 
         # Try partial match (contains)
-        for area_key in self.capabilities_by_area.keys():
+        for area_key in self.capabilities_by_area:
             if (
                 predicted_normalized in self._normalize_text(area_key)
                 or self._normalize_text(area_key) in predicted_normalized
@@ -279,7 +279,7 @@ def load_gsm8k_questions(self, jsonl_path: str) -> List[Dict[str, Any]]:
         return questions
 
     def load_math_questions(self, math_data_dir: str) -> List[Dict[str, Any]]:
-        """Load MATH dataset questions from a directory containing JSON files (recursive)."""
+        """Load MATH dataset questions from directory containing JSON files."""
         logger.info(f"Loading MATH questions from {math_data_dir}...")
         questions: List[Dict[str, Any]] = []
 
@@ -315,7 +315,7 @@ def load_questions_by_dataset(
 
         Args:
             dataset_name: The logical name of the dataset (e.g., "gsm8k").
-            dataset_path: The path to the dataset file/directory as required by the dataset loader.
+            dataset_path: Path to dataset file/directory for the loader.
         """
         if not dataset_name:
             raise ValueError("dataset_name must be provided")
@@ -331,7 +331,7 @@ def load_questions_by_dataset(
         )
 
     def load_checkpoint(self, checkpoint_path: str) -> Tuple[List[Dict[str, Any]], int]:
-        """Load existing checkpoint and return processed questions and last processed index."""
+        """Load checkpoint and return processed questions and last index."""
         if not os.path.exists(checkpoint_path):
             logger.info(
                 f"No checkpoint found at {checkpoint_path}, starting from beginning"
@@ -389,7 +389,7 @@ def save_checkpoint(
     async def categorize_question_by_area(
         self, question: str, areas: List[AreaInfo], **kwargs: Any
     ) -> str:
-        """Categorize a question into one of the available areas (returns exact area name)."""
+        """Categorize a question into one of the available areas."""
         area_names = [area.name for area in areas]
         area_bullets = "\n".join([f"- {area.name}" for area in areas])
 
@@ -589,7 +589,7 @@ def run_categorization(self) -> None:
     config_name="static_vs_generated",
 )
 def main(cfg: DictConfig) -> None:
-    """Main function to run question categorization."""
+    """Run question categorization."""
     # Set up logging
     logging.basicConfig(level=logging.INFO)
 
diff --git a/wikipedia/wiki_vs_generated.py b/wikipedia/wiki_vs_generated.py
index dd96e45a..b77badb7 100644
--- a/wikipedia/wiki_vs_generated.py
+++ b/wikipedia/wiki_vs_generated.py
@@ -1,7 +1,6 @@
-"""
-This script matches Wikipedia capabilities with generated capabilities using the pre-categorized
-Wikipedia data. It loads the categorized Wikipedia capabilities
-and matches them to generated capabilities.
+"""Match Wikipedia capabilities with generated capabilities.
+
+Loads pre-categorized Wikipedia capabilities and matches them to generated ones.
 """
 
 import glob
@@ -45,7 +44,7 @@ def __init__(self, name: str, description: str, area: str) -> None:
 
 
 class GeneratedVsWikipedia:
-    """Matches Wikipedia capabilities with generated capabilities using batching (reversed version)."""
+    """Match Wikipedia capabilities with generated capabilities using batching."""
 
     def __init__(self, cfg: DictConfig):
         self.cfg = cfg
@@ -60,8 +59,7 @@ def __init__(self, cfg: DictConfig):
         )
 
     def load_wikipedia_capabilities(self) -> Dict[str, List[WikipediaCapability]]:
-        """
-        Load Wikipedia capabilities from the new JSON format in wikipedia/pages/ directory.
+        """Load Wikipedia capabilities from JSON files in wikipedia/pages/.
 
         Returns
         -------
@@ -205,9 +203,9 @@ def match_wikipedia_to_generated_batch(
         wikipedia_cap: WikipediaCapability,
         generated_caps_batch: List[GeneratedCapability],
     ) -> str:
-        """Match a single Wikipedia capability against a batch of generated capabilities.
+        """Match a Wikipedia capability against a batch of generated capabilities.
 
-        Returns the exact generated capability name if a match is found, otherwise "none".
+        Returns the exact generated capability name if matched, otherwise "none".
         """
         if not generated_caps_batch:
             return "none"
@@ -335,8 +333,8 @@ def match_capabilities(
         """Match capabilities based on configured direction.
 
         Returns a mapping:
-        - generated_to_wikipedia: {generated_capability_name -> wikipedia_capability_name}
-        - wikipedia_to_generated: {wikipedia_capability_name -> generated_capability_name}
+        - generated_to_wikipedia: {generated_name -> wikipedia_name}
+        - wikipedia_to_generated: {wikipedia_name -> generated_name}
         """
         results: Dict[str, str] = {}
 
@@ -454,7 +452,7 @@ def save_results(
 
         # Flatten Wikipedia capabilities for output
         all_wikipedia_caps = []
-        for area, caps in categorized_wikipedia_caps.items():
+        for _area, caps in categorized_wikipedia_caps.items():
             all_wikipedia_caps.extend(caps)
 
         # Create detailed results with metadata
@@ -536,8 +534,7 @@ def print_results(self, results: Dict[str, str]) -> None:
 
 @hydra.main(version_base=None, config_path="cfg", config_name="wiki_vs_generated")
 def main(cfg: DictConfig) -> None:
-    """
-    Main function to run generated-Wikipedia matching (generated -> Wikipedia direction).
+    """Run generated-Wikipedia capability matching.
 
     Args:
         cfg: Configuration for the matching process
diff --git a/wikipedia/wikipedia_scraper.py b/wikipedia/wikipedia_scraper.py
index b489878e..9eecc865 100644
--- a/wikipedia/wikipedia_scraper.py
+++ b/wikipedia/wikipedia_scraper.py
@@ -1,10 +1,9 @@
 #!/usr/bin/env python3
-"""
-Wikipedia Glossary Scraper with Categorization and Summary Generation
+"""Wikipedia Glossary Scraper with Categorization and Summary Generation.
 
-This script scrapes the Wikipedia "Glossary of areas of mathematics" page,
-categorizes each mathematical area, generates summaries using GPT,
-and saves everything as JSON files with complete information.
+Scrape the Wikipedia "Glossary of areas of mathematics" page,
+categorize each mathematical area, generate summaries using GPT,
+and save everything as JSON files with complete information.
 
 Source: https://en.wikipedia.org/wiki/Glossary_of_areas_of_mathematics
 """
@@ -222,7 +221,7 @@ def categorize_capability_with_gpt(
 
 
 class WikipediaGlossaryScraper:
-    """Scraper for Wikipedia glossary of areas of mathematics with categorization and summarization."""
+    """Scrape, categorize, and summarize the Wikipedia mathematics glossary."""
 
     def __init__(
         self, base_url: str, output_dir: str, gpt_model: Optional[Model] = None
@@ -313,13 +312,13 @@ def get_page_first_section(self, page_url: str) -> str:
                 # Stop at the first h2 (start of second section)
                 if child.name == "h2":
                     break
-                # Capture paragraphs and short intro divs (infobox/sidebar divs are skipped)
+                # Capture paragraphs and intro divs (skip infobox/sidebar)
                 if child.name == "p":
                     text = child.get_text(" ", strip=True)
                     if text:
                         intro_texts.append(text)
                 elif child.name in ("div",):
-                    # Some pages wrap first paragraphs in a div; extract contained paragraph texts
+                    # Some pages wrap first paragraphs in a div
                     inner_paras = child.find_all("p", recursive=False)
                     for p in inner_paras:
                         text = p.get_text(" ", strip=True)
@@ -506,8 +505,7 @@ def save_entry_to_file(self, entry: Dict[str, str]) -> bool:
             return False
 
     def scrape_and_save(self) -> int:
-        """
-        Main method to scrape the glossary, categorize, summarize, and save all entries.
+        """Scrape glossary, categorize, summarize, and save all entries.
 
         Returns
         -------
@@ -611,17 +609,17 @@ def scrape_and_save(self) -> int:
 
 
 def main() -> int:
-    """Main function to run the scraper."""
+    """Run the scraper."""
     # Configuration
-    WIKIPEDIA_URL = "https://en.wikipedia.org/wiki/Glossary_of_areas_of_mathematics"
+    wikipedia_url = "https://en.wikipedia.org/wiki/Glossary_of_areas_of_mathematics"
     # Save pages in the same directory as the script
-    OUTPUT_DIR = os.path.join(os.path.dirname(__file__), "pages")
+    output_dir = os.path.join(os.path.dirname(__file__), "pages")
 
     logger.info(
         "Starting Wikipedia Glossary Scraper with Categorization and Summarization"
     )
-    logger.info(f"Source URL: {WIKIPEDIA_URL}")
-    logger.info(f"Output directory: {OUTPUT_DIR}")
+    logger.info(f"Source URL: {wikipedia_url}")
+    logger.info(f"Output directory: {output_dir}")
 
     # Initialize GPT model if available
     gpt_model = None
@@ -646,7 +644,7 @@ def main() -> int:
         )
 
     # Create scraper instance
-    scraper = WikipediaGlossaryScraper(WIKIPEDIA_URL, OUTPUT_DIR, gpt_model)
+    scraper = WikipediaGlossaryScraper(wikipedia_url, output_dir, gpt_model)
 
     # Run the scraper
     saved_count = scraper.scrape_and_save()

From b67e121cb1fb741120f672f2bfc07ad0489489a2 Mon Sep 17 00:00:00 2001
From: Farnaz Kohankhaki <fkohankh8@gmail.com>
Date: Thu, 29 Jan 2026 11:36:45 -0800
Subject: [PATCH 3/4] Remove typos checker from pre-commit hooks

False positives like 'nd' require ongoing config maintenance.
---
 .pre-commit-config.yaml   | 6 ------
 src/run_embedding_eval.py | 2 +-
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f929f809..a81f7f7a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -33,12 +33,6 @@ repos:
       types: [python]
       exclude: "tests"
 
-  - repo: https://github.com/crate-ci/typos
-    rev: dictgen-v0.3.1
-    hooks:
-      - id: typos
-        args: []
-
   - repo: https://github.com/nbQA-dev/nbQA
     rev: 1.9.1
     hooks:
diff --git a/src/run_embedding_eval.py b/src/run_embedding_eval.py
index 42d8b460..10cb9d9f 100644
--- a/src/run_embedding_eval.py
+++ b/src/run_embedding_eval.py
@@ -77,7 +77,7 @@ def main(cfg: DictConfig) -> None:
     rmse_dict = defaultdict(list)
     avg_std_dict = defaultdict(list)
     for dim_reduction_method in ["t-sne", "pca"]:
-        for rep_string_order in ["n", "nd", "and"]:  # typos: ignore
+        for rep_string_order in ["n", "nd", "and"]:
             # Embed capabilities using openai embedding model
             generate_and_set_capabilities_embeddings(
                 capabilities=capabilities,

From bc9cda2490c413bbbca0659b06c3fca6708434c8 Mon Sep 17 00:00:00 2001
From: Farnaz Kohankhaki <fkohankh8@gmail.com>
Date: Thu, 29 Jan 2026 11:56:12 -0800
Subject: [PATCH 4/4] Fix trailing whitespace and end-of-file formatting

---
 example_scripts/README.md              |  8 ++++----
 wikipedia/cfg/static_vs_generated.yaml |  5 ++---
 wikipedia/cfg/wiki_vs_generated.yaml   |  2 +-
 wikipedia/prompts.py                   | 10 ++++++----
 4 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/example_scripts/README.md b/example_scripts/README.md
index 5951ffa4..a6b81bb8 100644
--- a/example_scripts/README.md
+++ b/example_scripts/README.md
@@ -41,7 +41,7 @@ Step 2: Sort and keep complete capabilities. Complete capabilities have enough v
 
 Step 3: Generate capability embeddings using openai model, and assign embeddings to each capability object.
 
-```python   
+```python
     # Embed capabilities using openai embedding model
     generate_and_set_capabilities_embeddings(
         capabilities=capabilities,
@@ -61,7 +61,7 @@ Step 4: Filter capabilities based on the embeddings such that if embeddings are
     )
 ```
 
-Step 5: Capability embedding dimensionality reduction. 
+Step 5: Capability embedding dimensionality reduction.
 
 ```python
     # Reduce the dimensionality of capability embeddings generated by the
@@ -136,6 +136,6 @@ Step 9: Visualize train and test capability embeddings together.
 The `generate_and_set_capabilities_embeddings()` function in `src/utils/embedding_utils.py` handles this process. Capability name and descriptions are extracted to form the representation string `rep_string`. Then, embeddings are generated using the OpenAI embedding model via `embedding_generator`. Finally, the embeddings are assigned to each capability object.
 The representation string was chosen based on visualization-based experiments and is defined as:
 
-```python 
+```python
         rep_string = f"{capability_dict['name']} - {capability.area}: {capability_dict['description']}"
-```
\ No newline at end of file
+```
diff --git a/wikipedia/cfg/static_vs_generated.yaml b/wikipedia/cfg/static_vs_generated.yaml
index 5a8ce1ec..1d73748a 100644
--- a/wikipedia/cfg/static_vs_generated.yaml
+++ b/wikipedia/cfg/static_vs_generated.yaml
@@ -2,13 +2,13 @@
 data_cfg:
   # Path to the generated capabilities directory containing capabilities
   generated_dir: /projects/DeepLesion/projects/automated_capability_evaluation/artifacts/capabilities_gpt-claude-math/math
-  
+
   # Dataset selection
   # Supported dataset_name values: "gsm8k", "math"
   dataset_name: gsm8k
   # For gsm8k: path to combined JSONL; For math: root directory with JSON files (recursive)
   dataset_path: /projects/DeepLesion/projects/automated_capability_evaluation/static_datasets/math/gsm8k-main/test.jsonl
-  
+
   # Path to the existing Wikipedia categorization results file (not used in generated mode)
   wikipedia_dir: /projects/DeepLesion/projects/automated_capability_evaluation/wikipedia/pages
 
@@ -37,4 +37,3 @@ processing_cfg:
 
 defaults:
   - _self_
-
diff --git a/wikipedia/cfg/wiki_vs_generated.yaml b/wikipedia/cfg/wiki_vs_generated.yaml
index d2357272..5b91319f 100644
--- a/wikipedia/cfg/wiki_vs_generated.yaml
+++ b/wikipedia/cfg/wiki_vs_generated.yaml
@@ -4,7 +4,7 @@
 data_cfg:
   # Path to the Wikipedia pages directory containing .json files
   wikipedia_pages_dir: /projects/DeepLesion/projects/automated_capability_evaluation/wikipedia/pages
-  
+
   # Path to the generated capabilities directory containing capability.json files
   generated_dir: /projects/DeepLesion/projects/automated_capability_evaluation/artifacts/capabilities_gpt-claude-math/math
 
diff --git a/wikipedia/prompts.py b/wikipedia/prompts.py
index 8557186c..e90ab33e 100644
--- a/wikipedia/prompts.py
+++ b/wikipedia/prompts.py
@@ -1,6 +1,5 @@
 """Centralized prompts for all Wikipedia-related scripts."""
 
-
 # System prompts
 SYSTEM_PROMPT_MATH_CAPABILITIES = "You are an expert in mathematical capabilities."
 SYSTEM_PROMPT_MATH_TAXONOMIST = (
@@ -42,7 +41,9 @@
 
 
 # User prompts - functions that generate user prompts
-def get_wikipedia_to_generated_prompt(wikipedia_cap_name: str, wikipedia_cap_description: str, capabilities_list: str) -> str:
+def get_wikipedia_to_generated_prompt(
+    wikipedia_cap_name: str, wikipedia_cap_description: str, capabilities_list: str
+) -> str:
     """Generate prompt for matching Wikipedia capability to generated capabilities."""
     return f"""You are an expert in mathematical capabilities. Determine which generated capability best matches the given Wikipedia capability.
 
@@ -65,7 +66,9 @@ def get_wikipedia_to_generated_prompt(wikipedia_cap_name: str, wikipedia_cap_des
 Answer with only the capability name or "none":"""
 
 
-def get_generated_to_wikipedia_prompt(generated_cap_name: str, generated_cap_description: str, capabilities_list: str) -> str:
+def get_generated_to_wikipedia_prompt(
+    generated_cap_name: str, generated_cap_description: str, capabilities_list: str
+) -> str:
     """Generate prompt for matching generated capability to Wikipedia capabilities."""
     return f"""You are an expert in mathematical capabilities. Find the Wikipedia capability that most closely matches the generated capability.
 
@@ -112,4 +115,3 @@ def get_capability_summary_prompt(description: str) -> str:
 def get_capability_categorization_prompt(description: str) -> str:
     """Generate prompt for categorizing a mathematical capability."""
     return f"Please categorize this mathematical concept into one of the 10 areas listed above:\n\n{description}"
-