From e15ddef375e08471164f01711ec41c5e210afa42 Mon Sep 17 00:00:00 2001 From: Richard Lundeen Date: Thu, 9 Apr 2026 16:32:01 -0700 Subject: [PATCH 1/3] Update to seed --- pyrit/models/seeds/seed_objective.py | 5 ++--- pyrit/models/seeds/seed_prompt.py | 5 ++--- tests/unit/datasets/test_beaver_tails_dataset.py | 13 +++++++------ tests/unit/datasets/test_toxic_chat_dataset.py | 12 +++++++----- 4 files changed, 18 insertions(+), 17 deletions(-) diff --git a/pyrit/models/seeds/seed_objective.py b/pyrit/models/seeds/seed_objective.py index c36dfef699..68711b7400 100644 --- a/pyrit/models/seeds/seed_objective.py +++ b/pyrit/models/seeds/seed_objective.py @@ -35,9 +35,8 @@ def __post_init__(self) -> None: """ if self.is_general_technique: raise ValueError("SeedObjective cannot be a general technique.") - if not self.is_jinja_template: - self.value = self.escape_for_jinja(self.value) - self.value = super().render_template_value_silent(**PATHS_DICT) + if self.is_jinja_template: + self.value = super().render_template_value_silent(**PATHS_DICT) @classmethod def from_yaml_with_required_parameters( diff --git a/pyrit/models/seeds/seed_prompt.py b/pyrit/models/seeds/seed_prompt.py index c6894531e5..aac2877013 100644 --- a/pyrit/models/seeds/seed_prompt.py +++ b/pyrit/models/seeds/seed_prompt.py @@ -55,9 +55,8 @@ def __post_init__(self) -> None: ValueError: If file-based data type cannot be inferred from extension. """ - if not self.is_jinja_template: - self.value = self.escape_for_jinja(self.value) - self.value = self.render_template_value_silent(**PATHS_DICT) + if self.is_jinja_template: + self.value = self.render_template_value_silent(**PATHS_DICT) if not self.data_type: # If data_type is not provided, infer it from the value diff --git a/tests/unit/datasets/test_beaver_tails_dataset.py b/tests/unit/datasets/test_beaver_tails_dataset.py index 77bc9dd5b2..a2953e1c86 100644 --- a/tests/unit/datasets/test_beaver_tails_dataset.py +++ b/tests/unit/datasets/test_beaver_tails_dataset.py @@ -97,14 +97,14 @@ def test_dataset_name(self): assert loader.dataset_name == "beaver_tails" @pytest.mark.asyncio - async def test_fetch_dataset_skips_prompt_with_template_syntax_error(self): - """Test that prompts causing TemplateSyntaxError are skipped gracefully.""" + async def test_fetch_dataset_preserves_prompt_with_jinja_syntax(self): + """Test that prompts containing Jinja2 syntax are preserved as literal text.""" class MockDataset: def __init__(self): self._data = [ { - "prompt": "This contains {% endraw %} which breaks Jinja2", + "prompt": "This contains {% endraw %} which is Jinja2 syntax", "response": "response", "category": {"animal_abuse": True}, "is_safe": False, @@ -124,6 +124,7 @@ def __iter__(self): with patch.object(loader, "_fetch_from_huggingface", new=AsyncMock(return_value=MockDataset())): dataset = await loader.fetch_dataset() - # The broken prompt should be skipped, only the normal one remains - assert len(dataset.seeds) == 1 - assert dataset.seeds[0].value == "Normal unsafe prompt" + # Both prompts should be preserved — untrusted text is never passed through Jinja + assert len(dataset.seeds) == 2 + assert dataset.seeds[0].value == "This contains {% endraw %} which is Jinja2 syntax" + assert dataset.seeds[1].value == "Normal unsafe prompt" diff --git a/tests/unit/datasets/test_toxic_chat_dataset.py b/tests/unit/datasets/test_toxic_chat_dataset.py index 71de6c89a5..eca32cf912 100644 --- a/tests/unit/datasets/test_toxic_chat_dataset.py +++ b/tests/unit/datasets/test_toxic_chat_dataset.py @@ -92,8 +92,8 @@ async def test_fetch_dataset_preserves_jinja2_content(self): assert dataset.seeds[1].value == "{%block%}broken" @pytest.mark.asyncio - async def test_fetch_dataset_skips_jinja2_incompatible_entries(self): - """Test that entries with Jinja2-incompatible content are skipped.""" + async def test_fetch_dataset_preserves_jinja2_syntax_in_entries(self): + """Test that entries with Jinja2 syntax are preserved as literal text.""" data_with_endraw = [ { "conv_id": "good1", @@ -105,7 +105,7 @@ async def test_fetch_dataset_skips_jinja2_incompatible_entries(self): "openai_moderation": "[]", }, { - "conv_id": "bad1", + "conv_id": "jinja1", "user_input": "This has {% endraw %} in it", "model_output": "N/A", "human_annotation": "False", @@ -128,9 +128,11 @@ async def test_fetch_dataset_skips_jinja2_incompatible_entries(self): with patch.object(loader, "_fetch_from_huggingface", new=AsyncMock(return_value=data_with_endraw)): dataset = await loader.fetch_dataset() - assert len(dataset.seeds) == 2 + # All entries are preserved — untrusted text is never passed through Jinja + assert len(dataset.seeds) == 3 assert dataset.seeds[0].value == "Normal question" - assert dataset.seeds[1].value == "Another normal question" + assert dataset.seeds[1].value == "This has {% endraw %} in it" + assert dataset.seeds[2].value == "Another normal question" @pytest.mark.asyncio async def test_fetch_dataset_preserves_for_loop_content(self): From 02e609ef260a26f67891709e2a39dd756f222540 Mon Sep 17 00:00:00 2001 From: Richard Lundeen Date: Thu, 9 Apr 2026 17:04:26 -0700 Subject: [PATCH 2/3] updating datasets --- .../remote/beaver_tails_dataset.py | 27 ++++------ .../remote/toxic_chat_dataset.py | 50 ++++++------------- 2 files changed, 27 insertions(+), 50 deletions(-) diff --git a/pyrit/datasets/seed_datasets/remote/beaver_tails_dataset.py b/pyrit/datasets/seed_datasets/remote/beaver_tails_dataset.py index f1f2d39135..76c8dec70c 100644 --- a/pyrit/datasets/seed_datasets/remote/beaver_tails_dataset.py +++ b/pyrit/datasets/seed_datasets/remote/beaver_tails_dataset.py @@ -3,8 +3,6 @@ import logging -from jinja2 import TemplateSyntaxError - from pyrit.datasets.seed_datasets.remote.remote_dataset_loader import ( _RemoteDatasetLoader, ) @@ -101,21 +99,18 @@ async def fetch_dataset(self, *, cache: bool = True) -> SeedDataset: harm_categories = [k for k, v in item["category"].items() if v] - try: - seed_prompts.append( - SeedPrompt( - value=item["prompt"], - data_type="text", - dataset_name=self.dataset_name, - harm_categories=harm_categories, - description=description, - source=source_url, - authors=authors, - groups=groups, - ) + seed_prompts.append( + SeedPrompt( + value=item["prompt"], + data_type="text", + dataset_name=self.dataset_name, + harm_categories=harm_categories, + description=description, + source=source_url, + authors=authors, + groups=groups, ) - except TemplateSyntaxError: - logger.warning("Skipping BeaverTails prompt due to Jinja2 template syntax error in prompt text") + ) logger.info(f"Successfully loaded {len(seed_prompts)} prompts from BeaverTails dataset") diff --git a/pyrit/datasets/seed_datasets/remote/toxic_chat_dataset.py b/pyrit/datasets/seed_datasets/remote/toxic_chat_dataset.py index 20a025781b..78259f8f3e 100644 --- a/pyrit/datasets/seed_datasets/remote/toxic_chat_dataset.py +++ b/pyrit/datasets/seed_datasets/remote/toxic_chat_dataset.py @@ -5,8 +5,6 @@ import logging from typing import Any -from jinja2 import TemplateSyntaxError - from pyrit.datasets.seed_datasets.remote.remote_dataset_loader import ( _RemoteDatasetLoader, ) @@ -122,42 +120,26 @@ async def fetch_dataset(self, *, cache: bool = True) -> SeedDataset: source_url = f"https://huggingface.co/datasets/{self.HF_DATASET_NAME}" groups = ["UC San Diego"] - raw_prefix = "{% raw %}" - raw_suffix = "{% endraw %}" - seed_prompts: list[SeedPrompt] = [] for item in data: user_input = item["user_input"] harm_categories = self._extract_harm_categories(item) - try: - prompt = SeedPrompt( - value=user_input, - data_type="text", - dataset_name=self.dataset_name, - description=description, - source=source_url, - authors=authors, - groups=groups, - harm_categories=harm_categories, - metadata={ - "toxicity": str(item.get("toxicity", "")), - "jailbreaking": str(item.get("jailbreaking", "")), - "human_annotation": str(item.get("human_annotation", "")), - }, - ) - - # If user_input contains Jinja2 control structures (e.g., {% for %}), - # render_template_value_silent may skip rendering and leave the raw wrapper. - if prompt.value.startswith(raw_prefix) and prompt.value.endswith(raw_suffix): - prompt.value = prompt.value[len(raw_prefix) : -len(raw_suffix)] - - seed_prompts.append(prompt) - except TemplateSyntaxError: - conv_id = item.get("conv_id", "unknown") - logger.debug( - f"Skipping entry with conv_id={conv_id}: failed to parse as Jinja2 template", - exc_info=True, - ) + prompt = SeedPrompt( + value=user_input, + data_type="text", + dataset_name=self.dataset_name, + description=description, + source=source_url, + authors=authors, + groups=groups, + harm_categories=harm_categories, + metadata={ + "toxicity": str(item.get("toxicity", "")), + "jailbreaking": str(item.get("jailbreaking", "")), + "human_annotation": str(item.get("human_annotation", "")), + }, + ) + seed_prompts.append(prompt) logger.info(f"Successfully loaded {len(seed_prompts)} prompts from ToxicChat dataset") From cb1c54b48cbbe7fd36c9c968ce3fb5a4ebdba974 Mon Sep 17 00:00:00 2001 From: Richard Lundeen Date: Thu, 9 Apr 2026 17:18:18 -0700 Subject: [PATCH 3/3] pr feedback --- pyrit/models/seeds/seed_objective.py | 1 + pyrit/models/seeds/seed_prompt.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/pyrit/models/seeds/seed_objective.py b/pyrit/models/seeds/seed_objective.py index 68711b7400..0f0edd743a 100644 --- a/pyrit/models/seeds/seed_objective.py +++ b/pyrit/models/seeds/seed_objective.py @@ -35,6 +35,7 @@ def __post_init__(self) -> None: """ if self.is_general_technique: raise ValueError("SeedObjective cannot be a general technique.") + # Only trusted templates are rendered through Jinja — see seed_prompt.py for details. if self.is_jinja_template: self.value = super().render_template_value_silent(**PATHS_DICT) diff --git a/pyrit/models/seeds/seed_prompt.py b/pyrit/models/seeds/seed_prompt.py index aac2877013..1a4e8a566d 100644 --- a/pyrit/models/seeds/seed_prompt.py +++ b/pyrit/models/seeds/seed_prompt.py @@ -55,6 +55,10 @@ def __post_init__(self) -> None: ValueError: If file-based data type cannot be inferred from extension. """ + # Only trusted templates (is_jinja_template=True, e.g. from YAML files) are rendered + # through Jinja. Untrusted text (e.g. from remote datasets) must NOT be rendered — a + # crafted payload containing "{% endraw %}" can escape the raw wrapper and execute + # arbitrary Jinja expressions. See seed_objective.py for the same pattern. if self.is_jinja_template: self.value = self.render_template_value_silent(**PATHS_DICT)