# Source: eval_math.py from the ethz-spylab/RealMath repository (GitHub, main branch).
"""
Math QA Evaluator: Evaluate LLM Performance on Mathematical Problem-Solving

This script evaluates the performance of Language Models (LLMs) on mathematical
question-answering tasks using a dataset generated from mathematical papers.

The script:
1. Loads a mathematical QA dataset (containing questions, context, and ground truth answers)
2. For each question, prompts an LLM to solve it using the provided context
3. Evaluates the correctness of the LLM's answer against the ground truth
4. Generates detailed metrics on the model's performance

The script supports accessing models through:
- OpenAI API (for GPT models)
- Anthropic API (for Claude models)
- OpenRouter API (for various models like DeepSeek-R1)

Usage:
  # Evaluate a model
  python eval_math.py --dataset <path> --model <model_name> [--sample <n>] [--verbose]

Dependencies:
  - datasets (for loading the dataset)
  - openai (for OpenAI models, OpenRouter models, and evaluation)
  - anthropic (for Claude models)
  - numpy (for metrics calculation)
  - tqdm, python-dotenv (loads API keys from a .env file), rich (console output)
  - wandb (optional, for experiment logging)

Environment Variables:
  - OPENAI_API_KEY: API key for OpenAI
  - ANTHROPIC_API_KEY: API key for Anthropic
  - OPENROUTER_API_KEY: API key for OpenRouter
"""

# Standard library
import argparse
import asyncio  # Added for async support
import json
import os
import random
import re
import time

# Third-party
import numpy as np
from datasets import load_from_disk
from dotenv import load_dotenv
from rich.console import Console
from rich.panel import Panel
from tqdm import tqdm

# use dotenv to load the api keys
load_dotenv()

# The rich console must exist before any optional-import fallback below (or
# further down the module) tries to report a missing package through it;
# creating it later would turn the friendly warning into a NameError.
console = Console()

try:
    from datasets import load_dataset

    HF_AVAILABLE = True
except ImportError:
    console.print(
        "[bold red]Hugging Face Datasets not installed properly. HF datasets will not be available.[/bold red]"
    )
    console.print(
        "[yellow]To enable HF datasets, install with: pip install datasets[/yellow]"
    )
    HF_AVAILABLE = False

# Handle optional dependencies with graceful fallbacks: every failed import is
# reported through the module-level `console` and recorded in an *_AVAILABLE
# flag instead of aborting the script.
try:
    from openai import OpenAI

    OPENAI_AVAILABLE = True
except ImportError:
    console.print(
        "[bold red]OpenAI SDK not found. OpenAI models will not be available.[/bold red]"
    )
    console.print(
        "[yellow]To enable OpenAI models, install with: pip install openai[/yellow]"
    )
    OPENAI_AVAILABLE = False

# Anthropic support: the package itself for the synchronous client plus the
# dedicated async client class.
try:
    import anthropic
    from anthropic import AsyncAnthropic  # async client used for streamed queries

    ANTHROPIC_AVAILABLE = True
except ImportError:
    console.print(
        "[bold red]Anthropic SDK not found. Claude models will not be available.[/bold red]"
    )
    console.print(
        "[yellow]To enable Claude models, install with: pip install anthropic[/yellow]"
    )
    ANTHROPIC_AVAILABLE = False

# OpenRouter is accessed through the OpenAI client class (with a different
# base URL), so it is available exactly when the OpenAI SDK is.
OPENROUTER_AVAILABLE = OPENAI_AVAILABLE  # Depends on OpenAI client

# Optional Weights & Biases (wandb) logging support
try:
    import wandb

    WANDB_AVAILABLE = True
except ImportError:
    console.print(
        "[bold red]Weights & Biases not installed. wandb logging will not be available.[/bold red]"
    )
    console.print("[yellow]To enable wandb, install with: pip install wandb[/yellow]")
    WANDB_AVAILABLE = False


# Short model aliases (the names accepted by MathQAEvaluator.query_model)
# mapped to the model identifier that is sent to the OpenAI API.
OPENAI_MODELS = {
    "o3-mini": "o3-mini-2025-01-31",
    "o1-mini": "o1-mini-2024-09-12",
    "gpt-3.5-turbo": "gpt-3.5-turbo-0125",
    "o4-mini": "o4-mini-2025-04-16",  # 2025-04-16
    "gpt-4o-mini": "gpt-4o-mini-2024-07-18",
}

# Aliases for models queried through the Anthropic client.
ANTHROPIC_MODELS = {
    "claude-3.7-sonnet": "claude-3-7-sonnet-20250219",
    "claude-3.5-haiku": "claude-3-5-haiku-20241022",
}

# Aliases for models queried through OpenRouter; values are OpenRouter's
# "<vendor>/<model>" identifiers.
OPENROUTER_MODELS = {
    "deepseek-r1": "deepseek/deepseek-r1",  # high latency
    "grok-3": "x-ai/grok-3-beta",
    "llama-3.3-70b": "meta-llama/llama-3.3-70b-instruct",
    "llama-3.1-405b": "meta-llama/llama-3.1-405b-instruct",
    "llama-3.1-8b": "meta-llama/llama-3.1-8b-instruct",
    "gemini-2.5-flash": "google/gemini-2.5-flash-preview",
    "gemini-2.5-pro": "google/gemini-2.5-pro-preview-03-25",  # 2025-03-25
    "qwen-32b": "qwen/qwq-32b",
    "qwen3-235b": "qwen/qwen3-235b-a22b",
}


class MathQAEvaluator:
    """
    A class for evaluating LLM performance on mathematical question-answering tasks.

    This class provides functionality to:
    1. Load a mathematical QA dataset
    2. Query OpenAI, Anthropic, and OpenRouter models with mathematical problems
    3. Evaluate the correctness of the LLM's answers
    4. Generate performance metrics and comparisons
    """

    def __init__(self, verbose=False):
        """
        Initialize the MathQAEvaluator.

        Reads the OPENAI_API_KEY, ANTHROPIC_API_KEY and OPENROUTER_API_KEY
        environment variables and creates the provider clients. A provider
        whose SDK is missing (or, for OpenAI and OpenRouter, whose key is not
        set) keeps a ``None`` client, so the evaluator can still be used with
        the remaining providers.

        Args:
            verbose (bool, optional): Whether to print detailed information. Defaults to False.
        """
        self.openai_api_key = os.environ.get("OPENAI_API_KEY")
        self.anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY")
        self.openrouter_api_key = os.environ.get("OPENROUTER_API_KEY")
        self.verbose = verbose

        # Initialize clients
        self.openai_client = None
        self.anthropic_client = None
        self.async_anthropic_client = None  # Added async client
        self.openrouter_client = None

        # The OpenAI constructor rejects a missing API key, so only build the
        # client when a key is configured; otherwise an Anthropic-only or
        # OpenRouter-only run would fail right here.
        if OPENAI_AVAILABLE and self.openai_api_key:
            self.openai_client = OpenAI(api_key=self.openai_api_key)

        # NOTE(review): the Anthropic clients are still created without a key;
        # presumably the SDK defers the credential check to request time —
        # confirm before gating this on the key as well.
        if ANTHROPIC_AVAILABLE:
            self.anthropic_client = anthropic.Anthropic(api_key=self.anthropic_api_key)
            self.async_anthropic_client = AsyncAnthropic(
                api_key=self.anthropic_api_key
            )  # Initialize async client

        # OpenRouter exposes an OpenAI-compatible endpoint, hence the OpenAI
        # client class pointed at a different base URL.
        if OPENROUTER_AVAILABLE and self.openrouter_api_key:
            self.openrouter_client = OpenAI(
                base_url="https://openrouter.ai/api/v1", api_key=self.openrouter_api_key
            )

    def load_dataset(self, dataset_path, sample_size=None, subset=None):
        """
        Load the mathematical QA dataset.

        Any path containing "/" is first tried as a Hugging Face dataset ID
        (when the Hugging Face loader is available). If that fails, or the
        path contains no "/", the dataset is loaded from disk instead.

        Args:
            dataset_path (str): Path to the dataset or Hugging Face dataset ID.
            sample_size (int, optional): Number of samples to use. Defaults to None (use entire dataset).
                                        If set to 0, will use the entire dataset.
            subset (str, optional): Passed to the Hugging Face loader as ``split``,
                                    except for "stackexchange", for which no split is
                                    requested. Ignored when loading from disk.

        Returns:
            dataset: The loaded (and possibly randomly sub-sampled) dataset,
                     or None if it could not be loaded from disk.
        """
        dataset = None
        # First try to load from Hugging Face. Checking for "/" also covers
        # IDs that start with "ethz-spylab/".
        if HF_AVAILABLE and "/" in dataset_path:
            try:
                if self.verbose:
                    console.print(
                        f"[yellow]Attempting to load dataset from Hugging Face: {dataset_path}[/yellow]"
                    )
                if subset == "stackexchange":
                    dataset = load_dataset(dataset_path)
                else:
                    dataset = load_dataset(dataset_path, split=subset)
                # Convert to regular dataset (not DatasetDict)
                if isinstance(dataset, dict):
                    if "train" in dataset:
                        dataset = dataset["train"]
                    else:
                        # Use the first split
                        dataset = dataset[list(dataset.keys())[0]]
                console.print(
                    f"[green]Successfully loaded dataset from Hugging Face: {dataset_path}[/green]"
                )
            except Exception as e:
                console.print(
                    f"[bold red]Error loading from Hugging Face: {e}[/bold red]"
                )
                console.print(
                    "[yellow]Falling back to local dataset loading...[/yellow]"
                )

        # If HF loading failed or not a HF path, try loading from disk
        if dataset is None:
            try:
                if self.verbose:
                    console.print(
                        f"[yellow]Attempting to load dataset from disk: {dataset_path}[/yellow]"
                    )
                dataset = load_from_disk(dataset_path)
                console.print(
                    f"[green]Successfully loaded dataset from disk: {dataset_path}[/green]"
                )
            except Exception as e:
                console.print(
                    f"[bold red]Error loading dataset from disk: {e}[/bold red]"
                )
                return None

        # Handle sample size - use full dataset if sample_size is 0/None or
        # not smaller than the dataset.
        # Remember the size BEFORE sampling so the log line below reports the
        # real total rather than the size of the sample.
        total_examples = len(dataset)
        if sample_size and 0 < sample_size < total_examples:
            # Randomly sample from the dataset
            indices = random.sample(range(total_examples), sample_size)
            dataset = dataset.select(indices)
            if self.verbose:
                console.print(
                    f"[yellow]Sampled {sample_size} examples from dataset with {total_examples} total examples[/yellow]"
                )

        if self.verbose:
            console.print(f"[green]Final dataset has {len(dataset)} examples[/green]")

        return dataset

    def query_openai_model(
        self, context, question, model_name="gpt-4o", use_context=True
    ):
        """
        Query an OpenAI model with a mathematical problem.

        Args:
            context (str): The context/background information for the problem.
            question (str): The mathematical question to solve.
            model_name (str, optional): The OpenAI model to use. Defaults to "gpt-4o".
            use_context (bool, optional): Whether to include context in the prompt. Defaults to True.

        Returns:
            str: The model's answer to the question.
        """
        default_answer = {"final_answer": "", "reasoning": ""}

        system_prompt = """You are an expert mathematician tasked with solving a mathematical problem. Given a question and context, provide a clear, step-by-step solution to the question based on the provided context.
        Your answer should be precise, rigorous, and use proper mathematical notation.

        After your detailed explanation, include your final answer in a clear, properly formatted LaTeX after a section titled 'Final Answer' (\\section*{{Final Answer}}).
        Ensure all math expressions are properly enclosed in $...$ or \\[...\\] delimiters.

        """

        if use_context:
            user_prompt = f"""CONTEXT:
            {context}

            QUESTION:
            {question}

            Please solve this mathematical problem step by step, showing your reasoning clearly.
            At the end, provide your final answer in well-formatted LaTeX under a section titled 'Final Answer' (\\section*{{Final Answer}})."""
        else:
            user_prompt = f"""QUESTION:
            {question}

            Please solve this mathematical problem step by step, showing your reasoning clearly.
            At the end, provide your final answer in well-formatted LaTeX under a section titled 'Final Answer' (\\section*{{Final Answer}})."""

        try:
            response = self.openai_client.chat.completions.create(
                model=model_name,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
            )

            return response.choices[0].message.content
        except Exception as e:
            if self.verbose:
                console.print(f"[bold red]Error querying OpenAI model: {e}[/bold red]")
            return f"Error: {str(e)}"

    def query_antropic_model(
        self,
        context,
        question,
        model_name="claude-3-7-sonnet",
        use_context=True,
        use_thinking=False,
    ):
        """
        Send a mathematical problem to an Anthropic Claude model (blocking call).

        Args:
            context (str): The context/background information for the problem.
            question (str): The mathematical question to solve.
            model_name (str, optional): The Anthropic model to use. Defaults to "claude-3-7-sonnet".
            use_context (bool, optional): Whether to include context in the prompt. Defaults to True.
            use_thinking (bool, optional): Whether to enable extended thinking and return
                the thinking text alongside the answer. Defaults to False.

        Returns:
            str | tuple: On success, the answer text, or ``(answer text, thinking text)``
            when ``use_thinking`` is true (the thinking text is None if the response
            carried no thinking block). On failure, a plain string starting with
            "Error:" — also when ``use_thinking`` is true.
        """
        client_ready = ANTHROPIC_AVAILABLE and self.anthropic_client
        if not client_ready:
            return "Error: Anthropic client not available"

        system_prompt = """You are an expert mathematician tasked with solving a mathematical problem.
        Provide a clear, step-by-step solution to the question based on the provided context.
        Your answer should be precise, rigorous, and use proper mathematical notation.

        After your detailed explanation, include your final answer in a clear, properly formatted LaTeX form
        under a section titled 'Final Answer' like this:

        \\section*{Final Answer}
        Your well-formatted LaTeX answer here, with appropriate line breaks for complex expressions.
        Ensure all math expressions are properly enclosed in $...$ or \\[...\\] delimiters.

        Ensure that the LaTeX is valid and can be directly rendered in standard LaTeX without requiring custom command definitions.
        """

        # The question part is shared by both prompt variants; the context
        # variant simply prepends a CONTEXT section to it.
        question_part = f"""QUESTION:
            {question}

            Please solve this mathematical problem step by step, showing your reasoning clearly.
            At the end, provide your final answer in well-formatted LaTeX under a section titled 'Final Answer' (\\section*{{Final Answer}})."""
        if use_context:
            prompt = f"""CONTEXT:
            {context}

            {question_part}"""
        else:
            prompt = question_part

        # One request description for both modes; thinking adds its token
        # budget on top of the 4000 tokens reserved for the visible answer.
        request = {
            "model": model_name,
            "max_tokens": 4000,
            "messages": [{"role": "user", "content": prompt}],
            "system": system_prompt,
        }
        if use_thinking:
            request["max_tokens"] = 24000
            request["thinking"] = {"type": "enabled", "budget_tokens": 20000}

        try:
            response = self.anthropic_client.messages.create(**request)

            # Keep the text of the last "text" block and of the last
            # "thinking" block found in the response content.
            answer_text = ""
            reasoning_trace = None
            for block in response.content:
                if block.type == "text":
                    answer_text = block.text
                elif block.type == "thinking":
                    reasoning_trace = block.thinking

            return (answer_text, reasoning_trace) if use_thinking else answer_text
        except Exception as e:
            if self.verbose:
                console.print(
                    f"[bold red]Error querying Anthropic model: {e}[/bold red]"
                )
            return f"Error: {str(e)}"

    async def async_query_anthropic_model(
        self,
        context,
        question,
        model_name="claude-3-7-sonnet",
        use_context=True,
        use_thinking=True,
        max_retries=3,
        initial_timeout=3,
    ):
        """
        Asynchronously query an Anthropic Claude model with a mathematical problem.

        The request is streamed and the final message is collected. Empty
        responses and exceptions are retried up to ``max_retries`` times with
        an exponentially growing delay. A "prompt is too long" error is
        returned immediately, because retrying the same prompt cannot succeed.
        At least one attempt is always made.

        Args:
            context (str): The context/background information for the problem.
            question (str): The mathematical question to solve.
            model_name (str, optional): The Anthropic model to use. Defaults to "claude-3-7-sonnet".
            use_context (bool, optional): Whether to include context in the prompt. Defaults to True.
            use_thinking (bool, optional): Whether to enable extended thinking and return
                the thinking text alongside the answer. Defaults to True.
            max_retries (int, optional): Maximum number of retry attempts. Defaults to 3.
            initial_timeout (int, optional): Delay in seconds before the first retry;
                doubled after every retry. Defaults to 3.

        Returns:
            str | tuple: On success, the answer text, or ``(answer text, thinking text)``
            when ``use_thinking`` is true. If the client is unavailable, the plain
            string "Error: Anthropic client not available". On any other failure,
            the tuple ``("Error: ...", None)`` regardless of ``use_thinking``.
        """
        if not ANTHROPIC_AVAILABLE or not self.async_anthropic_client:
            return "Error: Anthropic client not available"

        system_prompt = """You are an expert mathematician tasked with solving a mathematical problem.
        Provide a clear, step-by-step solution to the question based on the provided context.
        Your answer should be precise, rigorous, and use proper mathematical notation.

        After your detailed explanation, include your final answer in a clear, properly formatted LaTeX form
        under a section titled 'Final Answer' like this:

        \\section*{Final Answer}
        Your well-formatted LaTeX answer here, with appropriate line breaks for complex expressions.
        Ensure all math expressions are properly enclosed in $...$ or \\[...\\] delimiters.

        Ensure that the LaTeX is valid and can be directly rendered in standard LaTeX without requiring custom command definitions.
        """

        if use_context:
            user_content = f"""CONTEXT:
            {context}

            QUESTION:
            {question}

            Please solve this mathematical problem step by step, showing your reasoning clearly.
            At the end, provide your final answer in well-formatted LaTeX under a section titled 'Final Answer' (\\section*{{Final Answer}})."""
        else:
            user_content = f"""QUESTION:
            {question}

            Please solve this mathematical problem step by step, showing your reasoning clearly.
            At the end, provide your final answer in well-formatted LaTeX under a section titled 'Final Answer' (\\section*{{Final Answer}})."""

        request_kwargs = {
            "model": model_name,
            "max_tokens": 16000 + 4000 if use_thinking else 4000,
            "messages": [{"role": "user", "content": user_content}],
            "system": system_prompt,
        }
        # Only include the thinking configuration when it is enabled; the
        # parameter is omitted otherwise rather than passed as an explicit
        # None (which the SDK would presumably forward as a null value).
        if use_thinking:
            request_kwargs["thinking"] = {"type": "enabled", "budget_tokens": 16000}

        retries = 0
        backoff_time = initial_timeout

        while True:
            try:
                async with self.async_anthropic_client.messages.stream(
                    **request_kwargs
                ) as stream:
                    response = await stream.get_final_message()

                # Extract content from response
                text_content = ""
                thinking_content = None

                for item in response.content:
                    if item.type == "text":
                        text_content = item.text
                    elif item.type == "thinking":
                        thinking_content = item.thinking

                if text_content:
                    if use_thinking:
                        return text_content, thinking_content
                    return text_content

                # Empty answer text: retry unless the budget is exhausted.
                if retries >= max_retries:
                    return (
                        f"Error: Empty response after {max_retries} retry attempts",
                        None,
                    )
                retry_reason = "Empty response received."

            except Exception as e:
                error_str = str(e)

                # Not retryable: the same prompt would be rejected again.
                # This check must not depend on the verbose flag.
                if "prompt is too long" in error_str.lower():
                    if self.verbose:
                        console.print(
                            f"[bold red]Error querying Anthropic model: {e}[/bold red]"
                        )
                    return f"Error: {error_str}", None

                if retries >= max_retries:
                    if self.verbose:
                        console.print(
                            f"[bold red]Error querying Anthropic model after {max_retries} retries: {e}[/bold red]"
                        )
                    return f"Error: {error_str}", None

                if "529" in error_str and "overloaded" in error_str.lower():
                    retry_reason = "Anthropic API overloaded (529)."
                else:
                    retry_reason = f"Error querying Anthropic model: {e}."

            # Shared retry path for empty responses and retryable exceptions.
            retries += 1
            if self.verbose:
                console.print(
                    f"[yellow]{retry_reason} Retry attempt {retries}/{max_retries} after {backoff_time}s delay...[/yellow]"
                )
            await asyncio.sleep(backoff_time)
            # Exponential backoff gives an overloaded API time to recover.
            backoff_time *= 2

    def query_openrouter_models(
        self, context, question, model_name="deepseek/deepseek-r1", use_context=True
    ):
        """
        Send a mathematical problem to a model served through OpenRouter.

        Args:
            context (str): The context/background information for the problem.
            question (str): The mathematical question to solve.
            model_name (str, optional): The OpenRouter model to use. Defaults to "deepseek/deepseek-r1".
            use_context (bool, optional): Whether to include context in the prompt. Defaults to True.

        Returns:
            str: The content of the first completion choice, or a string starting
                 with "Error:" when the client is unavailable or the request raises.
        """
        client_ready = OPENROUTER_AVAILABLE and self.openrouter_client
        if not client_ready:
            return "Error: OpenRouter client not available"

        system_prompt = """You are an expert mathematician tasked with solving a mathematical problem.
        Provide a clear, step-by-step solution to the question based on the provided context.
        Your answer should be precise, rigorous, and use proper mathematical notation.

        After your detailed explanation, include your final answer in a clear, properly formatted LaTeX form
        under a section titled 'Final Answer' like this:

        \\section*{Final Answer}
        Your well-formatted LaTeX answer here, with appropriate line breaks for complex expressions.
        Ensure all math expressions are properly enclosed in $...$ or \\[...\\] delimiters.

        Ensure that the LaTeX is valid and can be directly rendered in standard LaTeX without requiring custom command definitions.
        """

        # The question part is shared by both prompt variants; the context
        # variant simply prepends a CONTEXT section to it.
        question_part = f"""QUESTION:
            {question}

            Please solve this mathematical problem step by step, showing your reasoning clearly.
            At the end, provide your final answer in well-formatted LaTeX under a section titled 'Final Answer' (\\section*{{Final Answer}})."""
        if use_context:
            user_prompt = f"""CONTEXT:
            {context}

            {question_part}"""
        else:
            user_prompt = question_part

        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]

        try:
            completion = self.openrouter_client.chat.completions.create(
                model=model_name,
                messages=conversation,
            )
            return completion.choices[0].message.content
        except Exception as e:
            if self.verbose:
                console.print(
                    f"[bold red]Error querying OpenRouter model: {e}[/bold red]"
                )
            return f"Error: {str(e)}"

    def query_model(
        self, context, question, model_name, use_context=True, use_thinking=False
    ):
        """
        Dispatch a question to the provider that serves ``model_name``.

        ``model_name`` is a short alias; it is looked up in the OpenAI,
        Anthropic and OpenRouter alias tables (in that order) and translated
        to the provider's own model identifier.

        Args:
            context (str): The context for the problem.
            question (str): The mathematical question.
            model_name (str): The alias of the model to use.
            use_context (bool, optional): Whether to include context in the prompt. Defaults to True.
            use_thinking (bool, optional): Forwarded to the Anthropic query only;
                ignored for the other providers. Defaults to False.

        Returns:
            The value returned by the selected provider's query method, or a
            string starting with "Error:" when the alias is unknown.
        """
        if model_name in OPENAI_MODELS:
            return self.query_openai_model(
                context,
                question,
                model_name=OPENAI_MODELS[model_name],
                use_context=use_context,
            )

        if model_name in ANTHROPIC_MODELS:
            return self.query_antropic_model(
                context,
                question,
                model_name=ANTHROPIC_MODELS[model_name],
                use_context=use_context,
                use_thinking=use_thinking,
            )

        if model_name in OPENROUTER_MODELS:
            return self.query_openrouter_models(
                context,
                question,
                model_name=OPENROUTER_MODELS[model_name],
                use_context=use_context,
            )

        # No alias table knows this model.
        return f"Error: Unsupported model {model_name}. Only OpenAI GPT models, Anthropic Claude models, and OpenRouter models are supported."

    def verify_latex_compatibility(self, answer):
        """
        Extract the final answer from a response and check that its LaTeX compiles.

        The final answer is the body of the first non-empty "Final Answer"
        section/subsection (starred or not) or legacy FINAL ANSWER tag pair
        found in ``answer``; when none is found the whole response is used.
        The extracted text is compiled via ``self.compile_test_latex``. If that
        reports failure and an OpenAI client is available, GPT-4o is asked to
        repair the LaTeX syntax. The repaired text is adopted only when the
        completion is non-empty and was not cut off by the token limit, so a
        failed repair never replaces the answer with a blank or truncated one.

        Args:
            answer (str): LaTeX-formatted answer

        Returns:
            dict: Dictionary containing validated 'final_answer' and 'reasoning'
                ('reasoning' is always the unmodified ``answer``).
        """
        # The untouched response is always returned as the reasoning part
        reasoning = answer

        # Fall back to the whole response when no dedicated section is found
        final_answer = answer

        # Patterns are tried in priority order; the first one that yields a
        # non-empty body wins.
        section_patterns = [
            r"\\section\*{Final Answer}(.*?)(?:\\section|\Z)",  # \section*{Final Answer} until next section or end
            r"\\section{Final Answer}(.*?)(?:\\section|\Z)",  # \section{Final Answer} until next section or end
            r"\\subsection\*{Final Answer}(.*?)(?:\\section|\\subsection|\Z)",  # subsection version
            r"\\subsection{Final Answer}(.*?)(?:\\section|\\subsection|\Z)",  # subsection version
            r"\[FINAL ANSWER\](.*?)\[/FINAL ANSWER\]",  # Legacy tag format
        ]

        for pattern in section_patterns:
            final_answer_match = re.search(pattern, answer, re.DOTALL)
            if not final_answer_match:
                continue
            extracted_answer = final_answer_match.group(1).strip()
            if extracted_answer:
                final_answer = extracted_answer
                break

        # First, try direct compilation with pdflatex to check compatibility
        compile_success, fixed_answer = self.compile_test_latex(final_answer)

        if compile_success:
            if self.verbose:
                console.print("LaTeX content successfully compiled with pdflatex")
            final_answer = fixed_answer
        else:
            # Compilation failed (or could not be attempted): try to fix the LaTeX with GPT-4o
            if not OPENAI_AVAILABLE or not self.openai_client:
                if self.verbose:
                    console.print("OpenAI client not available for LaTeX fixing")
            else:
                try:
                    system_prompt = """You are an expert in LaTeX. Your task is to review an answer from a mathematics problem and ensure it can be directly rendered in standard LaTeX without requiring custom command definitions.

                    For any issues in the LaTeX:
                    1. Fix improper math environment delimiters (ensure all $ and \\[ \\] are properly paired)
                    2. Fix unmatched brackets, braces, or parentheses
                    3. Fix any command syntax errors
                    4. Replace any non-standard LaTeX commands with standard equivalents
                    5. Ensure all LaTeX is properly formatted with appropriate line breaks for complex expressions
                    6. Make sure mathematical formulas are properly enclosed in $ or \\[ \\] delimiters

                    IMPORTANT: You must not change the mathematical meaning of the content. Focus only on syntax corrections.

                    Respond ONLY with the corrected text. Do not explain your changes or add any comments."""

                    user_prompt = f"""The following is an answer to a mathematics problem that may contain LaTeX errors or non-standard commands:

                    {final_answer}

                    Please fix any LaTeX syntax issues to ensure it can be compiled in a standard LaTeX document with amsmath and amssymb packages.
                    Format the answer with proper line breaks for complex expressions and ensure all math is properly delimited.
                    Only make changes necessary for proper LaTeX rendering. Don't change the mathematical content or meaning."""

                    response = self.openai_client.chat.completions.create(
                        model="gpt-4o",
                        messages=[
                            {"role": "system", "content": system_prompt},
                            {"role": "user", "content": user_prompt},
                        ],
                        temperature=0.0,
                        max_tokens=1000,
                    )

                    choice = response.choices[0]
                    fixed_answer = (choice.message.content or "").strip()
                    # "length" means the completion hit max_tokens and is incomplete
                    truncated = getattr(choice, "finish_reason", None) == "length"

                    if not fixed_answer or truncated:
                        # Keep the original text rather than replacing it with
                        # an empty or cut-off rewrite.
                        if self.verbose:
                            console.print(
                                "LaTeX fix response was empty or truncated; keeping the original answer"
                            )
                    else:
                        if self.verbose and fixed_answer != final_answer:
                            console.print("LaTeX formatting fixed in the answer")
                        final_answer = fixed_answer

                except Exception as e:
                    # Best effort: on any API failure the unfixed answer is returned
                    if self.verbose:
                        console.print(f"[bold red]Error fixing LaTeX: {e}[/bold red]")

        return {"final_answer": final_answer, "reasoning": reasoning}

    def compile_test_latex(self, answer):
        """
        Test if the LaTeX content can be compiled using pdflatex.

        The answer is wrapped in a minimal ``article`` document that loads
        amsmath, amssymb, amsthm, mathtools and bm, written to a private
        temporary directory and compiled there. The directory, including every
        file pdflatex produces, is removed afterwards whether or not
        compilation succeeds. The answer itself is never modified.

        Args:
            answer (str): LaTeX-formatted answer

        Returns:
            tuple: (success_status, answer) - success_status is True only if
                pdflatex ran and exited with return code 0. It is False when
                pdflatex is not installed, times out, or any other error occurs.
        """
        import subprocess
        import tempfile
        import os
        import shutil

        # Upper bound (seconds) for one pdflatex run, so a runaway macro in the
        # answer cannot hang the evaluation.
        compile_timeout = 120

        # First check if pdflatex is available
        if shutil.which("pdflatex") is None:
            if self.verbose:
                console.print(
                    "pdflatex not found in PATH. Install LaTeX (TeX Live or MiKTeX) to enable direct compilation verification."
                )
            # Return failure but don't modify the content
            return False, answer

        # Create a comprehensive LaTeX document with all necessary math packages
        latex_document = (
            r"""\documentclass{article}
                        \usepackage{amsmath}
                        \usepackage{amssymb}
                        \usepackage{amsthm}
                        \usepackage{mathtools}
                        \usepackage{bm}

                        \begin{document}

                        %s

                        \end{document}
                        """
            % answer
        )

        try:
            # A dedicated directory keeps the .aux/.log/.pdf by-products out of
            # the shared temp folder and guarantees cleanup on every exit path.
            with tempfile.TemporaryDirectory() as work_dir:
                tex_file = os.path.join(work_dir, "answer.tex")
                with open(tex_file, "w", encoding="utf-8") as f:
                    f.write(latex_document)

                result = subprocess.run(
                    ["pdflatex", "-interaction=nonstopmode", tex_file],
                    capture_output=True,
                    cwd=work_dir,
                    timeout=compile_timeout,
                )
            return result.returncode == 0, answer
        except Exception as e:
            # Includes subprocess.TimeoutExpired and I/O or encoding errors
            if self.verbose:
                console.print(
                    f"[bold red]Error during LaTeX compilation: {e}[/bold red]"
                )
            return False, answer

    def evaluate_answer(self, answer_data, ground_truth, question):
        """
        Evaluate the correctness of the generated answer against the ground truth
        using GPT-4o as a judge.

        The judge is asked for a JSON object with an ``is_correct`` flag and an
        ``explanation``. The flag is normalised to a real ``bool`` so that a
        textual verdict such as ``"false"`` is not mistaken for a truthy value.

        Args:
            answer_data (str): The generated answer.
            ground_truth (str): The ground truth answer.
            question (str): The original question.

        Returns:
            tuple: (bool, str) - Whether the answer is correct and an explanation.
                Returns ``(False, <message>)`` when no OpenAI client is available
                or when the request or response parsing fails.
        """
        if not OPENAI_AVAILABLE or not self.openai_client:
            return False, "OpenAI client not available for evaluation"

        system_prompt = """You are an expert mathematician tasked with evaluating the correctness of an answer to a mathematical question.

        Compare the generated answer to the ground truth answer and determine whether the generated answer is mathematically correct
        and equivalent to the ground truth.

        Please be very strict and rigorous in your evaluation, mark the answer as incorrect even if it is 80% or 90% correct.
        Ensure the generated answer can be directly rendered in standard LaTeX without requiring custom command definitions.
        Be precise and focus on mathematical correctness, not formatting or style differences.
        Your evaluation should be fair and consider that the same mathematical content can be expressed in different ways."""

        user_prompt = f"""QUESTION:
        {question}

        GROUND TRUTH ANSWER:
        {ground_truth}

        GENERATED ANSWER:
        {answer_data}

        Carefully evaluate whether the generated answer is mathematically correct
        and equivalent to the ground truth. Your response should only contain a JSON object with the following fields:
        {{
          "is_correct": boolean,
          "explanation": "A concise explanation of why the answer is correct or incorrect, in a clean LaTeX format"
        }}
        where is_correct is true if the answer is mathematically correct and equivalent to the ground truth, and false if it isn't."""

        try:
            response = self.openai_client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                response_format={"type": "json_object"},
                max_tokens=1000,
            )

            result = json.loads(response.choices[0].message.content)
            if not isinstance(result, dict):
                raise ValueError("judge response is not a JSON object")

            verdict = result.get("is_correct", False)
            if isinstance(verdict, str):
                # A quoted verdict ("true"/"false") must not count as correct
                # merely because a non-empty string is truthy.
                is_correct = verdict.strip().lower() == "true"
            else:
                is_correct = bool(verdict)

            # Covers both a missing key and an explicit null/empty explanation
            explanation = result.get("explanation") or "No explanation provided"
            return is_correct, explanation
        except Exception as e:
            if self.verbose:
                console.print(f"[bold red]Error evaluating answer: {e}[/bold red]")
            return False, f"Error evaluating: {str(e)}"

    async def run_parallel_anthropic_queries(
        self, examples, model_name, use_context=True, use_thinking=False, max_parallel=5
    ):
        """
        Run multiple Anthropic queries in parallel using a semaphore to maintain constant parallelism.

        One task is created per example; each task waits on a shared semaphore
        before calling ``self.async_query_anthropic_model``. Results are stored
        by the example's index, so the returned list is in input order. If any
        query raises, the remaining tasks are cancelled, the progress bar is
        closed and the exception propagates to the caller.

        Args:
            examples (list): List of examples to query
            model_name (str): The Anthropic model to use
            use_context (bool): Whether to include context in the prompt
            use_thinking (bool): Whether to include thinking in the response
            max_parallel (int): Maximum number of parallel queries; values
                below 1 are treated as 1

        Returns:
            list: Results from parallel query execution
        """
        # A semaphore created with 0 would block every task forever and a
        # negative value raises, so always allow at least one query at a time.
        max_parallel = max(1, max_parallel)
        semaphore = asyncio.Semaphore(max_parallel)
        all_results = [None] * len(examples)

        # Create a progress bar
        pbar = tqdm(
            total=len(examples), desc=f"Processing queries ({max_parallel} parallel)"
        )

        async def process_example(index, example):
            """Process a single example with semaphore control"""
            async with semaphore:
                context = example["context"] if use_context else ""
                question = example["question"]
                result = await self.async_query_anthropic_model(
                    context,
                    question,
                    model_name=model_name,
                    use_context=use_context,
                    use_thinking=use_thinking,
                )
                all_results[index] = result
                pbar.update(1)

        tasks = []
        try:
            # Create all tasks up front; they only run when the semaphore allows
            tasks = [
                asyncio.create_task(process_example(i, example))
                for i, example in enumerate(examples)
            ]

            # Wait for all tasks to complete
            await asyncio.gather(*tasks)
        finally:
            # On failure, stop queries that are still queued or in flight
            # instead of leaving them running in the background.
            for task in tasks:
                if not task.done():
                    task.cancel()
            pbar.close()

        return all_results

    def run_evaluation(
        self, dataset, model_name, use_context=True, use_thinking=False, parallel=0
    ):
        """
        Run the evaluation on a dataset.

        Args:
            dataset: The dataset to evaluate on.
            model_name (str): The name of the model to evaluate.
            use_context (bool, optional): Whether to include context in the prompt. Defaults to True.
            use_thinking (bool, optional): Whether to include thinking in the response. Defaults to False.
            parallel (int, optional): Number of parallel queries to run. Defaults to 0 (sequential).

        Returns:
            dict: Evaluation results.
        """
        results = []
        correct_count = 0
        correct_ids = []  # Track IDs of correctly answered questions

        # Check if we should run Anthropic queries in parallel
        if model_name in ANTHROPIC_MODELS.keys() and parallel > 0:
            # Run async version
            return asyncio.run(
                self.async_run_evaluation(
                    dataset,
                    model_name,
                    use_context=use_context,
                    use_thinking=use_thinking,
                    max_parallel=parallel,
                )
            )

        for i, example in enumerate(
            tqdm(dataset, desc=f"Evaluating {model_name}", disable=not self.verbose)
        ):
            context = example["context"] if use_context else ""
            theorem_content = example["theorem"]
            question = example["question"]
            ground_truth = example["answer"]
            paper_link = example["paper_link"]

            # Query model for answer
            if self.verbose:
                console.print(
                    f"\n[bold]Evaluating question {i + 1}/{len(dataset)}[/bold]"
                )
                console.print(f"[bold]Question:[/bold] {question}")
                console.print(
                    f"[bold]Context:[/bold] {'[OMITTED]' if not use_context else context[:100] + '...' if len(context) > 100 else context}"
                )

            # Call the model and measure response time
            start_time = time.time()
            answer_data = self.query_model(
                context,
                question,
                model_name,
                use_context=use_context,