From 96a1b1f22648ceb50ae1921f96203f3108dbcee4 Mon Sep 17 00:00:00 2001 From: Parth Thapliyal Date: Sun, 22 Mar 2026 14:56:37 -0400 Subject: [PATCH 01/13] feat(cdp): models + config for confidential_data_procurement skill BuyerPolicy, SupplierSubmission, DatasetMetrics, ProcurementResult with Pydantic validators. Output key sets enforce budget leak prevention (quality_score buyer-only). Score weights and base_price validated on init. Co-Authored-By: Claude Sonnet 4.6 --- .../.env.example | 6 + .../confidential_data_procurement/__init__.py | 2 + .../confidential_data_procurement/config.py | 99 +++++++++++++++ .../confidential_data_procurement/models.py | 120 ++++++++++++++++++ 4 files changed, 227 insertions(+) create mode 100644 skills/confidential_data_procurement/.env.example create mode 100644 skills/confidential_data_procurement/__init__.py create mode 100644 skills/confidential_data_procurement/config.py create mode 100644 skills/confidential_data_procurement/models.py diff --git a/skills/confidential_data_procurement/.env.example b/skills/confidential_data_procurement/.env.example new file mode 100644 index 0000000..493cb78 --- /dev/null +++ b/skills/confidential_data_procurement/.env.example @@ -0,0 +1,6 @@ +# Per-node model overrides for confidential_data_procurement skill. +# Copy to skills/confidential_data_procurement/.env and fill in values. +# Empty value = fallback to CONCLAVE_DEFAULT_MODEL in root .env + +CONCLAVE_CDP_INIT_MODEL=deepseek-ai/DeepSeek-V3.1 +CONCLAVE_CDP_EVALUATE_MODEL=deepseek-ai/DeepSeek-V3.1 diff --git a/skills/confidential_data_procurement/__init__.py b/skills/confidential_data_procurement/__init__.py new file mode 100644 index 0000000..c1928ef --- /dev/null +++ b/skills/confidential_data_procurement/__init__.py @@ -0,0 +1,2 @@ +# Confidential Data Procurement skill — package init. +# run_skill, skill_card, and respond_handler will be added in Commit 6. 
diff --git a/skills/confidential_data_procurement/config.py b/skills/confidential_data_procurement/config.py new file mode 100644 index 0000000..8362195 --- /dev/null +++ b/skills/confidential_data_procurement/config.py @@ -0,0 +1,99 @@ +""" +Skill-specific constants for confidential_data_procurement. + +What to edit here: +- ALLOWED_OUTPUT_KEYS: buyer (admin) view — keys that leave the pipeline to the buyer +- USER_OUTPUT_KEYS: supplier (participant) view — subset of ALLOWED_OUTPUT_KEYS. + quality_score and hard_constraints_pass are buyer-only to prevent + the supplier from reverse-engineering max_budget via P/S = max_budget. +- SCORE_BOUNDS: clamping ranges for numeric output fields +- DEFAULT_SCORE_WEIGHTS: used when buyer doesn't specify score_weights in BuyerPolicy +- CRITICAL_*: deterministic early-exit thresholds (no LLM runs on critical failure) +- *_MODEL: per-node model overrides (set in .env) + +Consumed by: +- deterministic.py (CRITICAL_*, DEFAULT_SCORE_WEIGHTS) +- guardrails.py (ALLOWED_OUTPUT_KEYS, USER_OUTPUT_KEYS, SCORE_BOUNDS) +- __init__.py (ALLOWED_OUTPUT_KEYS, USER_OUTPUT_KEYS via skill_card) +- agent.py (EVALUATE_MODEL) +- init.py (INIT_MODEL) +""" +from __future__ import annotations + +import os + +from dotenv import load_dotenv + +load_dotenv(os.path.join(os.path.dirname(__file__), ".env")) + +# --- Output key sets --- + +# Buyer (admin) sees quality details + budget-sensitive fields +ALLOWED_OUTPUT_KEYS: set[str] = { + "submission_id", + "deal", + "quality_score", # buyer-only (budget leak if supplier sees this + proposed_payment) + "proposed_payment", + "hard_constraints_pass", # buyer-only + "settlement_status", + "release_token", + "notes", + "explanation", + "claim_verification", + "schema_matching", + "buyer_response", + "supplier_response", + "renegotiation_used", +} + +# Supplier (participant) — same info, quality_score and hard_constraints_pass withheld +USER_OUTPUT_KEYS: set[str] = { + "submission_id", + "deal", + 
"proposed_payment", + "settlement_status", + "release_token", + "notes", + "explanation", + "claim_verification", + "schema_matching", + "buyer_response", + "supplier_response", + "renegotiation_used", +} + +# --- Score bounds (used by guardrails for clamping) --- + +SCORE_BOUNDS: dict[str, tuple[float, float]] = { + "quality_score": (0.0, 1.0), +} + +# --- Default score weights --- +# Buyer can override via BuyerPolicy.score_weights. Must sum to 1.0. +DEFAULT_SCORE_WEIGHTS: dict[str, float] = { + "schema": 0.15, + "coverage": 0.15, + "null": 0.20, + "duplicate": 0.15, + "label": 0.10, + "risk": 0.15, + "claim_veracity": 0.10, +} + +# --- Critical failure thresholds (deterministic early exit, no LLM) --- + +# Duplicate rate above this → critical failure, deal rejected immediately +CRITICAL_DUPLICATE_THRESHOLD: float = 0.50 + +# Dataset size limits +MAX_DATASET_SIZE_MB: int = 50 +MAX_DATASET_ROWS: int = 500_000 + +# Minimum leakage substring length passed to LeakageDetector +MIN_LEAKAGE_SUBSTRING_LENGTH: int = 20 + +# --- Per-node model overrides --- + +_default = os.environ.get("CONCLAVE_DEFAULT_MODEL", "deepseek-ai/DeepSeek-V3.1") +INIT_MODEL = os.environ.get("CONCLAVE_CDP_INIT_MODEL") or _default +EVALUATE_MODEL = os.environ.get("CONCLAVE_CDP_EVALUATE_MODEL") or _default diff --git a/skills/confidential_data_procurement/models.py b/skills/confidential_data_procurement/models.py new file mode 100644 index 0000000..ce154f6 --- /dev/null +++ b/skills/confidential_data_procurement/models.py @@ -0,0 +1,120 @@ +""" +Input and output Pydantic models for the confidential_data_procurement skill. + +Roles: +- BuyerPolicy: operator config — replaces OperatorConfig for this skill. + NEVER expose max_budget or base_price to the supplier. +- SupplierSubmission: participant input — references an uploaded dataset by ID. + NEVER expose reserve_price to the buyer. +- DatasetMetrics: intermediate deterministic output — not returned to API callers. 
+- ProcurementResult: final result per submission — key-filtered by role in routes.py. + revised_budget / revised_reserve are internal-only fields, + excluded from both ALLOWED_OUTPUT_KEYS and USER_OUTPUT_KEYS. +""" +from __future__ import annotations + +from typing import Any, Literal, Optional + +from pydantic import BaseModel, Field, model_validator + +from core.models import Submission + + +class BuyerPolicy(BaseModel): + """ + Operator config for the confidential_data_procurement skill. + Extracted by the init_handler from the buyer's onboarding conversation. + routes.py sets instance_id after init completes. + """ + required_columns: list[str] # semantic — agent does fuzzy matching + min_rows: int = Field(gt=0) + max_null_rate: float = Field(ge=0.0, le=1.0) # e.g. 0.03 = 3% + max_duplicate_rate: float = Field(ge=0.0, le=1.0) + min_label_rate: Optional[float] = Field(default=None, ge=0.0, le=1.0) + label_column: Optional[str] = None + forbidden_columns: list[str] = [] + max_budget: float = Field(gt=0.0) # NEVER exposed to supplier + base_price: float = Field(default=0.0, ge=0.0) # floor: P when S=0 + score_weights: dict[str, float] = {} # buyer overrides DEFAULT_SCORE_WEIGHTS + description: str = "" # natural language description of dataset need + instance_id: str = "default" # set by routes.py after init + + @model_validator(mode="after") + def validate_weights(self) -> "BuyerPolicy": + if self.score_weights: + total = sum(self.score_weights.values()) + if abs(total - 1.0) > 0.01: + raise ValueError( + f"score_weights must sum to 1.0 (got {total:.3f}). " + "Adjust weights or omit to use defaults." + ) + if self.base_price >= self.max_budget: + raise ValueError("base_price must be less than max_budget") + return self + + +class SupplierSubmission(Submission): + """ + Participant input for the confidential_data_procurement skill. + Supplier uploads their dataset via POST /upload first, then submits here. 
+ """ + dataset_id: str # references uploaded DataFrame in ingest store + dataset_name: str + reserve_price: float = Field(ge=0.0) # NEVER exposed to buyer + + +class DatasetMetrics(BaseModel): + """ + Deterministic quality metrics computed from the raw DataFrame. + Intermediate result — never returned directly to API callers. + """ + row_count: int + column_names: list[str] + null_rate_by_column: dict[str, float] + overall_null_rate: float + duplicate_rate: float + label_rate: Optional[float] = None # None if label_column not specified + forbidden_columns_present: list[str] = [] + hard_constraints_pass: bool # all binary must-pass checks + critical_failure: bool = False # triggers early exit before agent + critical_reason: Optional[str] = None # human-readable reason for critical failure + + +class ProcurementResult(BaseModel): + """ + Final result per submission after guardrails. + Role-filtered in routes.py: buyer sees ALLOWED_OUTPUT_KEYS, supplier sees USER_OUTPUT_KEYS. + + Field notes: + - deal: enclave's mathematical verdict (R ≤ P ≤ B and hard constraints pass) + - quality_score: buyer-only — supplier could reverse-engineer max_budget via P/S + - hard_constraints_pass: buyer-only — same reasoning + - settlement_status: lifecycle state — independent of deal bool + "rejected" | "pending_approval" | "awaiting_counterparty" | + "renegotiating" | "authorized" + - revised_budget/reserve: INTERNAL — never in any output key set + """ + submission_id: str + deal: bool = False + quality_score: float = Field(default=0.0, ge=0.0, le=1.0) # buyer-only + proposed_payment: float = 0.0 + hard_constraints_pass: bool = False # buyer-only + settlement_status: Literal[ + "rejected", + "pending_approval", + "awaiting_counterparty", + "renegotiating", + "authorized", + ] = "rejected" + release_token: Optional[str] = None + notes: list[str] = [] # failure/partial notes — same for both roles + explanation: Optional[str] = None # bounded LLM summary + claim_verification: 
Optional[dict[str, Any]] = None # from agent layer + schema_matching: Optional[dict[str, Any]] = None # from agent layer + buyer_response: Optional[Literal["accept", "reject", "renegotiate"]] = None + supplier_response: Optional[Literal["accept", "reject", "renegotiate"]] = None + renegotiation_used: bool = False + + # INTERNAL ONLY — excluded from ALLOWED_OUTPUT_KEYS and USER_OUTPUT_KEYS + revised_budget: Optional[float] = None + revised_reserve: Optional[float] = None From 396cf803544b73c4005cb1f0afb3cf89fe500f23 Mon Sep 17 00:00:00 2001 From: Parth Thapliyal Date: Sun, 22 Mar 2026 15:02:56 -0400 Subject: [PATCH 02/13] feat(cdp): ingestion layer, file upload route, SkillCard handler fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ingest.py: CSV parse with size/row limits, JSON metadata, format stubs (PDF/DOCX/Excel), in-memory DataFrame store, procurement_upload_handler - core/skill_card.py: upload_handler + respond_handler optional fields - api/routes.py: generic POST /upload and POST /respond routes that delegate to skill-owned handlers — no skill-specific logic in shared infra - All 57 existing tests pass Co-Authored-By: Claude Sonnet 4.6 --- api/routes.py | 88 ++++++- core/skill_card.py | 2 + .../confidential_data_procurement/ingest.py | 224 ++++++++++++++++++ 3 files changed, 313 insertions(+), 1 deletion(-) create mode 100644 skills/confidential_data_procurement/ingest.py diff --git a/api/routes.py b/api/routes.py index f40eaa9..8e3a142 100644 --- a/api/routes.py +++ b/api/routes.py @@ -4,7 +4,8 @@ import uuid from functools import partial -from fastapi import APIRouter, HTTPException, Request +from fastapi import APIRouter, File, HTTPException, Request, UploadFile +from fastapi.datastructures import FormData from core.models import SkillResponse, InitRequest, InitResponse from skills.router import SkillRouter @@ -351,6 +352,91 @@ def get_my_submissions(request: Request): return {"submission_ids": 
list(token_info["submission_ids"])} +@router.post("/upload") +async def upload_file(request: Request): + """ + Generic file upload — delegates entirely to the skill's upload_handler. + Skills that need file upload declare upload_handler on their SkillCard. + The skill owns all parsing, storage, and validation logic. + + Returns whatever the skill's upload_handler returns (e.g. {"dataset_id": "..."}). + """ + token_info = _resolve_token(request) + instance_id = token_info["instance_id"] + card = _skill_router.get_card(_instances[instance_id]["skill_name"]) + + if card.upload_handler is None: + raise HTTPException( + status_code=400, + detail=f"Skill '{_instances[instance_id]['skill_name']}' does not support file upload", + ) + + try: + form: FormData = await request.form() + loop = asyncio.get_event_loop() + result = await loop.run_in_executor(None, card.upload_handler, form, instance_id) + except ValueError as e: + raise HTTPException(status_code=422, detail=str(e)) + except Exception as e: + raise HTTPException(status_code=500, detail=f"Upload failed: {e}") + + return result + + +@router.post("/respond") +async def respond_to_result(body: dict, request: Request): + """ + Deal response — delegates entirely to the skill's respond_handler. + Skills that support renegotiation declare respond_handler on their SkillCard. 
+ + Body: { + "submission_id": str, + "action": "accept" | "reject" | "renegotiate", + "revised_value": float | null # only when action="renegotiate" + } + Returns: {"settlement_status": str, ...any extra fields the skill returns} + """ + token_info = _resolve_token(request) + instance_id = token_info["instance_id"] + role = token_info["role"] + card = _skill_router.get_card(_instances[instance_id]["skill_name"]) + + if card.respond_handler is None: + raise HTTPException( + status_code=400, + detail=f"Skill '{_instances[instance_id]['skill_name']}' does not support deal responses", + ) + + submission_id = body.get("submission_id") + if not submission_id: + raise HTTPException(status_code=422, detail="submission_id is required") + + instance_results = _results.get(instance_id, {}) + if submission_id not in instance_results: + raise HTTPException(status_code=404, detail="Result not found or not yet available") + + action = body.get("action") + if action not in ("accept", "reject", "renegotiate"): + raise HTTPException(status_code=422, detail="action must be 'accept', 'reject', or 'renegotiate'") + + try: + loop = asyncio.get_event_loop() + updated = await loop.run_in_executor( + None, + card.respond_handler, + instance_results[submission_id], + action, + body.get("revised_value"), + "buyer" if role == "admin" else "supplier", + _instances[instance_id]["config"], + ) + except ValueError as e: + raise HTTPException(status_code=422, detail=str(e)) + + _results[instance_id][submission_id] = updated + return {"settlement_status": updated.get("settlement_status")} + + @router.post("/trigger") async def trigger(request: Request): """Manual pipeline trigger. Admin only. 
Uses stored instance config.""" diff --git a/core/skill_card.py b/core/skill_card.py index 079a6ec..1e19ba8 100644 --- a/core/skill_card.py +++ b/core/skill_card.py @@ -35,6 +35,8 @@ class SkillCard: roles: dict = field(default_factory=dict) # admin + user role declarations setup_prompt: str = "" # LLM onboarding text for admins (metadata/docs) init_handler: Optional[Callable] = None # skill-owned onboarding conversation handler + upload_handler: Optional[Callable] = None # skill-owned file upload handler (POST /upload) + respond_handler: Optional[Callable] = None # skill-owned deal response handler (POST /respond) user_display: dict = field(default_factory=dict) # display hints per output key for the frontend renderer version: str = "0.1.0" diff --git a/skills/confidential_data_procurement/ingest.py b/skills/confidential_data_procurement/ingest.py new file mode 100644 index 0000000..8135ac9 --- /dev/null +++ b/skills/confidential_data_procurement/ingest.py @@ -0,0 +1,224 @@ +""" +Ingestion layer for confidential_data_procurement. + +Responsibilities: +- Parse uploaded CSV into a pandas DataFrame +- Parse metadata file (JSON working; PDF/DOCX stubbed) +- Parse buyer policy document (JSON working; PDF/DOCX stubbed) +- Store DataFrames in memory keyed by dataset_id +- Expose upload_handler — the SkillCard callable for POST /upload + +The DataFrame NEVER leaves this module as raw data. +Tools in tools.py query it only via aggregate operations. +Cleanup is called by run_skill after the pipeline completes. 
+ +Format support matrix: + CSV: ✓ working + JSON: ✓ working (metadata + buyer policy documents) + DOCX: ✗ stubbed + PDF: ✗ stubbed + Excel: ✗ stubbed +""" +from __future__ import annotations + +import io +import json +import uuid +from typing import Any + +import pandas as pd + +from skills.confidential_data_procurement.config import ( + MAX_DATASET_ROWS, + MAX_DATASET_SIZE_MB, +) + +# --------------------------------------------------------------------------- +# In-memory dataset store +# dataset_id -> { +# "df": pd.DataFrame, +# "metadata": dict, # seller-provided metadata +# "column_definitions": dict, # col_name -> human description +# "seller_claims": dict, # claim_key -> claim_value +# "instance_id": str, +# } +# --------------------------------------------------------------------------- +_datasets: dict[str, dict[str, Any]] = {} + + +# --------------------------------------------------------------------------- +# Public accessors +# --------------------------------------------------------------------------- + +def get_dataset(dataset_id: str) -> dict[str, Any]: + """Return the stored dataset dict. Raises KeyError if not found.""" + if dataset_id not in _datasets: + raise KeyError(f"Dataset '{dataset_id}' not found. Upload may have expired.") + return _datasets[dataset_id] + + +def cleanup(dataset_id: str) -> None: + """Discard the DataFrame after the pipeline completes.""" + _datasets.pop(dataset_id, None) + + +# --------------------------------------------------------------------------- +# CSV parsing +# --------------------------------------------------------------------------- + +def parse_csv(file_bytes: bytes) -> pd.DataFrame: + """ + Parse CSV bytes into a DataFrame. + Enforces size and row limits before returning. + Raises ValueError with a human-readable message on any failure. 
+ """ + size_mb = len(file_bytes) / (1024 * 1024) + if size_mb > MAX_DATASET_SIZE_MB: + raise ValueError( + f"Dataset exceeds size limit ({size_mb:.1f}MB > {MAX_DATASET_SIZE_MB}MB). " + "Please upload a smaller file." + ) + + try: + df = pd.read_csv(io.BytesIO(file_bytes)) + except Exception as e: + raise ValueError(f"Could not parse CSV: {e}") from e + + if len(df) > MAX_DATASET_ROWS: + raise ValueError( + f"Dataset exceeds row limit ({len(df):,} rows > {MAX_DATASET_ROWS:,}). " + "Please upload a sample." + ) + + if df.empty: + raise ValueError("Uploaded CSV is empty.") + + return df + + +# --------------------------------------------------------------------------- +# Metadata parsing +# --------------------------------------------------------------------------- + +def parse_metadata(file_bytes: bytes, file_type: str) -> dict[str, Any]: + """ + Parse the supplier's metadata file. + + JSON (working): expects keys such as: + column_definitions: {col_name: description} + seller_claims: {claim_key: claim_value} + source, date_range, license, etc. + + PDF / DOCX / other: stubbed — returns empty metadata with a note. + """ + file_type = (file_type or "").lower().strip(".") + + if file_type == "json": + try: + return json.loads(file_bytes.decode("utf-8")) + except Exception as e: + raise ValueError(f"Could not parse metadata JSON: {e}") from e + + # --- Stubs --- + _STUB_TYPES = {"pdf", "docx", "doc", "txt", "md"} + if file_type in _STUB_TYPES: + return { + "_stub": True, + "_stub_reason": ( + f"Metadata format '{file_type}' is not yet supported. " + "Please upload a JSON metadata file. " + "Proceeding with empty metadata." + ), + } + + return { + "_stub": True, + "_stub_reason": ( + f"Unknown metadata format '{file_type}'. " + "Proceeding with empty metadata." + ), + } + + +def parse_buyer_document(file_bytes: bytes, file_type: str) -> dict[str, Any]: + """ + Parse a buyer-uploaded policy document. + + JSON (working): expects BuyerPolicy-compatible fields. 
+ PDF / DOCX: stubbed — buyer should describe requirements in the init chat. + """ + file_type = (file_type or "").lower().strip(".") + + if file_type == "json": + try: + return json.loads(file_bytes.decode("utf-8")) + except Exception as e: + raise ValueError(f"Could not parse policy JSON: {e}") from e + + _STUB_TYPES = {"pdf", "docx", "doc", "txt", "md"} + if file_type in _STUB_TYPES: + return { + "_stub": True, + "_stub_reason": ( + f"Policy document format '{file_type}' is not yet supported. " + "Please describe your requirements in the setup chat, " + "or upload a JSON policy file." + ), + } + + return { + "_stub": True, + "_stub_reason": ( + f"Unknown policy format '{file_type}'. " + "Please describe your requirements in the setup chat." + ), + } + + +# --------------------------------------------------------------------------- +# Upload handler (SkillCard.upload_handler) +# --------------------------------------------------------------------------- + +def procurement_upload_handler(form: Any, instance_id: str) -> dict[str, Any]: + """ + Skill-owned handler for POST /upload. + Called by routes.py with the parsed multipart form and instance_id. + + Expected form fields: + csv_file — the dataset CSV (required) + metadata_file — JSON metadata file (optional) + + Returns: + {"dataset_id": str} + """ + # --- Extract CSV --- + csv_upload = form.get("csv_file") + if csv_upload is None: + raise ValueError("csv_file is required") + + csv_bytes = csv_upload.file.read() if hasattr(csv_upload, "file") else bytes(csv_upload) + df = parse_csv(csv_bytes) + + # --- Extract metadata (optional) --- + metadata: dict[str, Any] = {} + metadata_upload = form.get("metadata_file") + if metadata_upload is not None: + meta_bytes = ( + metadata_upload.file.read() + if hasattr(metadata_upload, "file") + else bytes(metadata_upload) + ) + filename = getattr(metadata_upload, "filename", "") or "" + ext = filename.rsplit(".", 1)[-1] if "." 
in filename else "json" + metadata = parse_metadata(meta_bytes, ext) + + dataset_id = str(uuid.uuid4()) + _datasets[dataset_id] = { + "df": df, + "metadata": metadata, + "column_definitions": metadata.get("column_definitions", {}), + "seller_claims": metadata.get("seller_claims", {}), + "instance_id": instance_id, + } + + return {"dataset_id": dataset_id} From 601c45714ffbe7ffdbb667263fa2ead98c54335e Mon Sep 17 00:00:00 2001 From: Parth Thapliyal Date: Sun, 22 Mar 2026 15:12:00 -0400 Subject: [PATCH 03/13] =?UTF-8?q?feat:=20deterministic=20evaluation=20laye?= =?UTF-8?q?r=20=E2=80=94=20metrics,=20scoring,=20pricing,=20deal=20check?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements the pure-math quality evaluation pipeline for the dataset procurement skill: null/duplicate/label metrics, component scoring, weighted quality score, price formula (P = base + spread * S), and deal condition (R ≤ P ≤ B). 38 unit tests cover all paths including critical failures, partial quality, and edge cases. 
--- .../deterministic.py | 313 +++++++++++++++ tests/fixtures/test_dataset_bad.csv | 111 ++++++ tests/fixtures/test_dataset_good.csv | 201 ++++++++++ tests/fixtures/test_dataset_high_nulls.csv | 201 ++++++++++ tests/fixtures/test_metadata.json | 15 + tests/test_data_procurement.py | 377 ++++++++++++++++++ 6 files changed, 1218 insertions(+) create mode 100644 skills/confidential_data_procurement/deterministic.py create mode 100644 tests/fixtures/test_dataset_bad.csv create mode 100644 tests/fixtures/test_dataset_good.csv create mode 100644 tests/fixtures/test_dataset_high_nulls.csv create mode 100644 tests/fixtures/test_metadata.json create mode 100644 tests/test_data_procurement.py diff --git a/skills/confidential_data_procurement/deterministic.py b/skills/confidential_data_procurement/deterministic.py new file mode 100644 index 0000000..9a0b7a8 --- /dev/null +++ b/skills/confidential_data_procurement/deterministic.py @@ -0,0 +1,313 @@ +""" +Deterministic quality evaluation layer for confidential_data_procurement. + +No LLM calls. Pure pandas + math. + +Pipeline: + 1. compute_metrics() — null rates, duplicate rate, label rate, forbidden col check + 2. check_critical() — early exit if any hard constraint is fatally violated + 3. compute_component_scores() — each dimension scored [0, 1] + 4. compute_quality_score() — weighted sum clamped to [0, 1] + 5. compute_price() — P = base_price + (max_budget - base_price) * S + 6. check_deal() — R <= P <= B and hard_constraints_pass + 7. run_deterministic() — orchestrates all of the above + +Note: schema_score and claim_veracity are placeholders (0.5 and 1.0 respectively) +until the agent layer runs fuzzy column matching and claim verification. +run_skill() will merge the agent's verdicts into the final quality score. 
+""" +from __future__ import annotations + +import math +from typing import Any + +import pandas as pd + +from skills.confidential_data_procurement.config import ( + CRITICAL_DUPLICATE_THRESHOLD, + DEFAULT_SCORE_WEIGHTS, +) +from skills.confidential_data_procurement.ingest import get_dataset +from skills.confidential_data_procurement.models import BuyerPolicy, DatasetMetrics + + +# --------------------------------------------------------------------------- +# Step 1: Metrics +# --------------------------------------------------------------------------- + +def compute_metrics(df: pd.DataFrame, policy: BuyerPolicy) -> DatasetMetrics: + """Compute all quality metrics from the raw DataFrame.""" + row_count = len(df) + column_names = list(df.columns) + + # Null rates per column and overall + null_rate_by_column = {col: float(df[col].isna().mean()) for col in df.columns} + total_cells = df.size + overall_null_rate = float(df.isna().sum().sum() / total_cells) if total_cells > 0 else 0.0 + + # Duplicate rate + duplicate_rate = float(df.duplicated().mean()) if row_count > 0 else 0.0 + + # Label rate — fraction of positive/truthy values in label column + label_rate: float | None = None + if policy.label_column and policy.label_column in df.columns: + col = df[policy.label_column].dropna() + if len(col) > 0: + label_rate = float((col.astype(bool)).mean()) + + # Forbidden columns present + forbidden_columns_present = [ + col for col in policy.forbidden_columns if col in column_names + ] + + # Hard constraints: forbidden cols absent + not critical duplicate rate + hard_constraints_pass = ( + len(forbidden_columns_present) == 0 + and duplicate_rate < CRITICAL_DUPLICATE_THRESHOLD + and row_count > 0 + ) + + # Critical failure detection + critical_failure = False + critical_reason: str | None = None + + if row_count == 0: + critical_failure = True + critical_reason = "Dataset is empty — no rows to evaluate." 
+ elif forbidden_columns_present: + critical_failure = True + critical_reason = ( + f"Forbidden column(s) detected: {', '.join(forbidden_columns_present)}. " + "Deal rejected to protect data privacy constraints." + ) + elif duplicate_rate >= CRITICAL_DUPLICATE_THRESHOLD: + critical_failure = True + critical_reason = ( + f"Duplicate rate ({duplicate_rate:.1%}) exceeds critical threshold " + f"({CRITICAL_DUPLICATE_THRESHOLD:.0%}). Dataset quality is insufficient." + ) + + return DatasetMetrics( + row_count=row_count, + column_names=column_names, + null_rate_by_column=null_rate_by_column, + overall_null_rate=overall_null_rate, + duplicate_rate=duplicate_rate, + label_rate=label_rate, + forbidden_columns_present=forbidden_columns_present, + hard_constraints_pass=hard_constraints_pass, + critical_failure=critical_failure, + critical_reason=critical_reason, + ) + + +# --------------------------------------------------------------------------- +# Step 2: Critical check +# --------------------------------------------------------------------------- + +def check_critical(metrics: DatasetMetrics) -> tuple[bool, str | None]: + """Return (is_critical, reason). Caller should early-exit if is_critical.""" + return metrics.critical_failure, metrics.critical_reason + + +# --------------------------------------------------------------------------- +# Step 3: Component scores +# --------------------------------------------------------------------------- + +def compute_component_scores( + metrics: DatasetMetrics, policy: BuyerPolicy +) -> dict[str, float]: + """ + Score each quality dimension in [0, 1]. + + schema_score: 0.5 placeholder — agent will compute fuzzy match verdict. + claim_veracity: 1.0 placeholder — agent will compute claim verification score. + """ + scores: dict[str, float] = {} + + # Schema — agent will refine this + scores["schema"] = 0.5 + + # Coverage: how close are we to the required row count? 
+ scores["coverage"] = min(metrics.row_count / policy.min_rows, 1.0) + + # Null score: penalise for null rate exceeding the policy threshold + if policy.max_null_rate > 0: + scores["null"] = max(0.0, 1.0 - (metrics.overall_null_rate / policy.max_null_rate)) + else: + scores["null"] = 1.0 if metrics.overall_null_rate == 0 else 0.0 + + # Duplicate score + if policy.max_duplicate_rate > 0: + scores["duplicate"] = max( + 0.0, 1.0 - (metrics.duplicate_rate / policy.max_duplicate_rate) + ) + else: + scores["duplicate"] = 1.0 if metrics.duplicate_rate == 0 else 0.0 + + # Label score + if policy.min_label_rate is not None and policy.min_label_rate > 0: + label_rate = metrics.label_rate or 0.0 + scores["label"] = min(label_rate / policy.min_label_rate, 1.0) + else: + scores["label"] = 1.0 # not required + + # Risk score: hard 0 if forbidden columns present + scores["risk"] = 0.0 if metrics.forbidden_columns_present else 1.0 + + # Claim veracity — agent will refine this + scores["claim_veracity"] = 1.0 + + return scores + + +# --------------------------------------------------------------------------- +# Step 4: Weighted quality score +# --------------------------------------------------------------------------- + +def compute_quality_score( + component_scores: dict[str, float], policy: BuyerPolicy +) -> float: + """ + Weighted sum of component scores, clamped to [0, 1]. + Uses policy.score_weights if set, otherwise DEFAULT_SCORE_WEIGHTS. 
+ """ + weights = policy.score_weights if policy.score_weights else DEFAULT_SCORE_WEIGHTS + total = sum( + weights.get(key, 0.0) * score for key, score in component_scores.items() + ) + return max(0.0, min(1.0, total)) + + +# --------------------------------------------------------------------------- +# Step 5: Price +# --------------------------------------------------------------------------- + +def compute_price(S: float, base_price: float, max_budget: float) -> float: + """ + P = base_price + (max_budget - base_price) * S + + S=0 → P = base_price (floor: minimum payment even for poor quality) + S=1 → P = max_budget (ceiling: full payment for perfect quality) + """ + return round(base_price + (max_budget - base_price) * S, 2) + + +# --------------------------------------------------------------------------- +# Step 6: Deal condition +# --------------------------------------------------------------------------- + +def check_deal( + hard_constraints_pass: bool, + reserve_price: float, + proposed_payment: float, + max_budget: float, +) -> bool: + """ + deal = hard_constraints_pass AND (reserve_price <= proposed_payment <= max_budget) + """ + return ( + hard_constraints_pass + and reserve_price <= proposed_payment <= max_budget + ) + + +# --------------------------------------------------------------------------- +# Step 7: Orchestrator +# --------------------------------------------------------------------------- + +def run_deterministic( + dataset_id: str, + policy: BuyerPolicy, + reserve_price: float, +) -> dict[str, Any]: + """ + Run the full deterministic evaluation for a single dataset. 
+ + Returns a dict consumed by run_skill(): + { + "metrics": DatasetMetrics, + "component_scores": dict[str, float], + "quality_score": float, # preliminary S (schema + claim are placeholders) + "proposed_payment": float, + "deal": bool, + "notes": list[str], # human-readable partial-failure notes + } + """ + dataset = get_dataset(dataset_id) + df: pd.DataFrame = dataset["df"] + + # Step 1 + metrics = compute_metrics(df, policy) + + # Step 2 — critical failures propagate directly to run_skill for early exit + if metrics.critical_failure: + return { + "metrics": metrics, + "component_scores": {}, + "quality_score": 0.0, + "proposed_payment": policy.base_price, + "deal": False, + "notes": [metrics.critical_reason] if metrics.critical_reason else [], + } + + # Step 3 + component_scores = compute_component_scores(metrics, policy) + + # Step 4 + quality_score = compute_quality_score(component_scores, policy) + + # Step 5 + proposed_payment = compute_price(quality_score, policy.base_price, policy.max_budget) + + # Step 6 + deal = check_deal( + metrics.hard_constraints_pass, reserve_price, proposed_payment, policy.max_budget + ) + + # Build human-readable notes for partial failures (non-critical but notable) + notes: list[str] = [] + + if metrics.overall_null_rate > policy.max_null_rate: + notes.append( + f"Null rate ({metrics.overall_null_rate:.1%}) exceeds policy threshold " + f"({policy.max_null_rate:.1%}). Quality score penalised." + ) + + if metrics.duplicate_rate > policy.max_duplicate_rate: + notes.append( + f"Duplicate rate ({metrics.duplicate_rate:.1%}) exceeds policy threshold " + f"({policy.max_duplicate_rate:.1%}). Quality score penalised." + ) + + if metrics.row_count < policy.min_rows: + notes.append( + f"Row count ({metrics.row_count:,}) is below policy minimum " + f"({policy.min_rows:,}). Coverage score penalised." 
+ ) + + if ( + policy.min_label_rate is not None + and metrics.label_rate is not None + and metrics.label_rate < policy.min_label_rate + ): + notes.append( + f"Label rate ({metrics.label_rate:.2%}) is below policy minimum " + f"({policy.min_label_rate:.2%})." + ) + + if not deal and not metrics.critical_failure: + if reserve_price > proposed_payment: + notes.append( + f"Proposed payment (${proposed_payment:,.2f}) is below supplier's " + "reserve price. Consider renegotiation." + ) + + return { + "metrics": metrics, + "component_scores": component_scores, + "quality_score": quality_score, + "proposed_payment": proposed_payment, + "deal": deal, + "notes": notes, + } diff --git a/tests/fixtures/test_dataset_bad.csv b/tests/fixtures/test_dataset_bad.csv new file mode 100644 index 0000000..fe69f48 --- /dev/null +++ b/tests/fixtures/test_dataset_bad.csv @@ -0,0 +1,111 @@ +transaction_id,amount,ssn,is_fraud +txn_0000,3947.96,xxx-xx-0000,0 +txn_0001,2497.23,xxx-xx-0001,0 +txn_0002,443.73,xxx-xx-0002,0 +txn_0003,2690.16,xxx-xx-0003,0 +txn_0004,2938.34,xxx-xx-0004,0 +txn_0005,3729.74,xxx-xx-0005,0 +txn_0006,2163.98,xxx-xx-0006,0 +txn_0007,646.63,xxx-xx-0007,1 +txn_0008,1426.04,xxx-xx-0008,0 +txn_0009,1821.78,xxx-xx-0009,0 +txn_0010,3233.13,xxx-xx-0010,0 +txn_0011,2858.18,xxx-xx-0011,0 +txn_0012,1786.92,xxx-xx-0012,0 +txn_0013,4932.71,xxx-xx-0013,0 +txn_0014,3032.82,xxx-xx-0014,0 +txn_0015,1193.76,xxx-xx-0015,0 +txn_0016,517.89,xxx-xx-0016,0 +txn_0017,772.77,xxx-xx-0017,0 +txn_0018,1237.33,xxx-xx-0018,0 +txn_0019,811.8,xxx-xx-0019,1 +txn_0020,940.97,xxx-xx-0020,0 +txn_0021,1432.62,xxx-xx-0021,0 +txn_0022,875.13,xxx-xx-0022,0 +txn_0023,4484.86,xxx-xx-0023,0 +txn_0024,410.37,xxx-xx-0024,0 +txn_0025,2627.31,xxx-xx-0025,0 +txn_0026,2057.88,xxx-xx-0026,0 +txn_0027,4912.07,xxx-xx-0027,0 +txn_0028,569.07,xxx-xx-0028,0 +txn_0029,1995.3,xxx-xx-0029,0 +txn_0030,4847.66,xxx-xx-0030,0 +txn_0031,4328.88,xxx-xx-0031,0 +txn_0032,4087.19,xxx-xx-0032,0 +txn_0033,1296.94,xxx-xx-0033,0 
+txn_0034,862.73,xxx-xx-0034,0 +txn_0035,3346.53,xxx-xx-0035,0 +txn_0036,4647.59,xxx-xx-0036,0 +txn_0037,2788.25,xxx-xx-0037,0 +txn_0038,2862.35,xxx-xx-0038,0 +txn_0039,1407.1,xxx-xx-0039,0 +txn_0040,3849.77,xxx-xx-0040,0 +txn_0041,943.35,xxx-xx-0041,0 +txn_0042,1625.16,xxx-xx-0042,0 +txn_0043,2132.93,xxx-xx-0043,0 +txn_0044,2542.98,xxx-xx-0044,0 +txn_0045,1219.62,xxx-xx-0045,0 +txn_0046,583.04,xxx-xx-0046,0 +txn_0047,3056.99,xxx-xx-0047,0 +txn_0048,1450.27,xxx-xx-0048,0 +txn_0049,2910.38,xxx-xx-0049,0 +txn_0000,3947.96,xxx-xx-0000,0 +txn_0001,2497.23,xxx-xx-0001,0 +txn_0002,443.73,xxx-xx-0002,0 +txn_0003,2690.16,xxx-xx-0003,0 +txn_0004,2938.34,xxx-xx-0004,0 +txn_0005,3729.74,xxx-xx-0005,0 +txn_0006,2163.98,xxx-xx-0006,0 +txn_0007,646.63,xxx-xx-0007,1 +txn_0008,1426.04,xxx-xx-0008,0 +txn_0009,1821.78,xxx-xx-0009,0 +txn_0010,3233.13,xxx-xx-0010,0 +txn_0011,2858.18,xxx-xx-0011,0 +txn_0012,1786.92,xxx-xx-0012,0 +txn_0013,4932.71,xxx-xx-0013,0 +txn_0014,3032.82,xxx-xx-0014,0 +txn_0015,1193.76,xxx-xx-0015,0 +txn_0016,517.89,xxx-xx-0016,0 +txn_0017,772.77,xxx-xx-0017,0 +txn_0018,1237.33,xxx-xx-0018,0 +txn_0019,811.8,xxx-xx-0019,1 +txn_0020,940.97,xxx-xx-0020,0 +txn_0021,1432.62,xxx-xx-0021,0 +txn_0022,875.13,xxx-xx-0022,0 +txn_0023,4484.86,xxx-xx-0023,0 +txn_0024,410.37,xxx-xx-0024,0 +txn_0025,2627.31,xxx-xx-0025,0 +txn_0026,2057.88,xxx-xx-0026,0 +txn_0027,4912.07,xxx-xx-0027,0 +txn_0028,569.07,xxx-xx-0028,0 +txn_0029,1995.3,xxx-xx-0029,0 +txn_0000,3947.96,xxx-xx-0000,0 +txn_0001,2497.23,xxx-xx-0001,0 +txn_0002,443.73,xxx-xx-0002,0 +txn_0003,2690.16,xxx-xx-0003,0 +txn_0004,2938.34,xxx-xx-0004,0 +txn_0005,3729.74,xxx-xx-0005,0 +txn_0006,2163.98,xxx-xx-0006,0 +txn_0007,646.63,xxx-xx-0007,1 +txn_0008,1426.04,xxx-xx-0008,0 +txn_0009,1821.78,xxx-xx-0009,0 +txn_0010,3233.13,xxx-xx-0010,0 +txn_0011,2858.18,xxx-xx-0011,0 +txn_0012,1786.92,xxx-xx-0012,0 +txn_0013,4932.71,xxx-xx-0013,0 +txn_0014,3032.82,xxx-xx-0014,0 +txn_0015,1193.76,xxx-xx-0015,0 +txn_0016,517.89,xxx-xx-0016,0 
+txn_0017,772.77,xxx-xx-0017,0 +txn_0018,1237.33,xxx-xx-0018,0 +txn_0019,811.8,xxx-xx-0019,1 +txn_0020,940.97,xxx-xx-0020,0 +txn_0021,1432.62,xxx-xx-0021,0 +txn_0022,875.13,xxx-xx-0022,0 +txn_0023,4484.86,xxx-xx-0023,0 +txn_0024,410.37,xxx-xx-0024,0 +txn_0025,2627.31,xxx-xx-0025,0 +txn_0026,2057.88,xxx-xx-0026,0 +txn_0027,4912.07,xxx-xx-0027,0 +txn_0028,569.07,xxx-xx-0028,0 +txn_0029,1995.3,xxx-xx-0029,0 diff --git a/tests/fixtures/test_dataset_good.csv b/tests/fixtures/test_dataset_good.csv new file mode 100644 index 0000000..6d82afc --- /dev/null +++ b/tests/fixtures/test_dataset_good.csv @@ -0,0 +1,201 @@ +transaction_id,amount,merchant_category,country,is_fraud +txn_0000,1878.96,travel,CA,0 +txn_0001,4754.06,travel,FR,0 +txn_0002,3662.65,food,DE,0 +txn_0003,2997.31,retail,FR,0 +txn_0004,788.53,travel,CA,0 +txn_0005,788.41,travel,UK,0 +txn_0006,299.84,electronics,FR,0 +txn_0007,4332.22,travel,UK,0 +txn_0008,3009.56,travel,DE,0 +txn_0009,3543.28,electronics,US,0 +txn_0010,112.72,food,DE,0 +txn_0011,4849.85,travel,FR,0 +txn_0012,4163.89,electronics,UK,0 +txn_0013,1069.57,retail,UK,0 +txn_0014,917.31,food,CA,0 +txn_0015,925.19,retail,UK,0 +txn_0016,1528.17,retail,CA,1 +txn_0017,2628.53,electronics,US,0 +txn_0018,2165.41,travel,FR,0 +txn_0019,1463.23,electronics,CA,0 +txn_0020,3063.15,electronics,US,0 +txn_0021,706.07,electronics,UK,1 +txn_0022,1467.8,electronics,UK,0 +txn_0023,1838.15,food,US,1 +txn_0024,2285.79,travel,UK,0 +txn_0025,3928.03,retail,US,0 +txn_0026,1006.37,retail,CA,0 +txn_0027,2576.03,travel,CA,1 +txn_0028,2966.15,retail,US,0 +txn_0029,241.79,travel,CA,0 +txn_0030,3041.65,retail,CA,1 +txn_0031,860.92,electronics,CA,0 +txn_0032,334.61,retail,DE,0 +txn_0033,4744.94,travel,FR,0 +txn_0034,4828.5,travel,UK,0 +txn_0035,4043.9,travel,DE,0 +txn_0036,1530.02,travel,CA,0 +txn_0037,497.38,travel,US,0 +txn_0038,3424.32,food,CA,0 +txn_0039,2206.36,food,FR,0 +txn_0040,618.97,retail,CA,0 +txn_0041,2480.93,travel,US,0 +txn_0042,181.6,retail,FR,0 
+txn_0043,4547.51,travel,CA,0 +txn_0044,1301.31,travel,FR,0 +txn_0045,3315.99,food,UK,0 +txn_0046,1565.44,food,UK,0 +txn_0047,2605.14,electronics,CA,0 +txn_0048,2738.08,food,FR,0 +txn_0049,932.42,electronics,US,0 +txn_0050,4848.23,electronics,CA,1 +txn_0051,3877.91,electronics,UK,1 +txn_0052,4698.1,electronics,UK,0 +txn_0053,4475.19,electronics,CA,0 +txn_0054,2993.52,retail,FR,0 +txn_0055,4610.15,food,UK,0 +txn_0056,451.58,electronics,FR,0 +txn_0057,987.95,travel,UK,0 +txn_0058,235.68,food,UK,0 +txn_0059,1633.4,food,DE,0 +txn_0060,1949.5,electronics,UK,0 +txn_0061,1364.03,retail,US,0 +txn_0062,4145.4,electronics,CA,0 +txn_0063,1790.2,retail,CA,0 +txn_0064,1411.86,travel,FR,0 +txn_0065,2718.05,food,UK,0 +txn_0066,713.21,travel,US,0 +txn_0067,4012.96,retail,FR,0 +txn_0068,382.01,retail,DE,0 +txn_0069,4934.57,travel,FR,0 +txn_0070,3863.5,retail,FR,1 +txn_0071,1001.59,travel,UK,0 +txn_0072,37.56,food,DE,0 +txn_0073,4079.15,travel,FR,0 +txn_0074,3537.22,food,US,1 +txn_0075,3647.75,travel,US,0 +txn_0076,3858.64,retail,CA,0 +txn_0077,379.48,travel,DE,0 +txn_0078,1798.74,retail,DE,0 +txn_0079,588.19,travel,CA,0 +txn_0080,4316.89,travel,FR,0 +txn_0081,3120.26,electronics,DE,0 +txn_0082,1661.18,retail,US,0 +txn_0083,327.16,electronics,US,1 +txn_0084,1561.8,retail,UK,1 +txn_0085,1632.66,electronics,DE,0 +txn_0086,3650.73,food,FR,0 +txn_0087,3191.41,travel,CA,0 +txn_0088,4437.19,retail,CA,0 +txn_0089,2366.35,retail,FR,0 +txn_0090,606.78,travel,UK,0 +txn_0091,3569.09,retail,CA,0 +txn_0092,3806.32,retail,DE,0 +txn_0093,2810.77,electronics,UK,0 +txn_0094,3857.13,retail,DE,0 +txn_0095,2474.04,food,US,0 +txn_0096,2618.44,food,UK,0 +txn_0097,2143.43,travel,CA,0 +txn_0098,136.84,retail,UK,0 +txn_0099,548.38,travel,UK,0 +txn_0100,166.83,food,UK,0 +txn_0101,3185.69,retail,UK,0 +txn_0102,1578.64,retail,DE,0 +txn_0103,2547.77,travel,US,0 +txn_0104,4538.76,food,FR,0 +txn_0105,1253.97,retail,UK,0 +txn_0106,2057.81,travel,CA,0 +txn_0107,3780.2,food,UK,0 +txn_0108,1151.7,travel,CA,0 
+txn_0109,394.13,electronics,DE,0 +txn_0110,1455.86,retail,CA,0 +txn_0111,814.49,electronics,FR,0 +txn_0112,4649.19,food,US,0 +txn_0113,4042.52,electronics,CA,0 +txn_0114,3170.68,electronics,CA,0 +txn_0115,4358.59,food,US,0 +txn_0116,4020.32,travel,FR,0 +txn_0117,940.98,electronics,UK,0 +txn_0118,4463.87,travel,CA,0 +txn_0119,2701.32,electronics,US,0 +txn_0120,4039.13,travel,DE,0 +txn_0121,4481.5,food,US,0 +txn_0122,1596.84,electronics,DE,0 +txn_0123,559.16,food,FR,0 +txn_0124,1147.4,electronics,UK,0 +txn_0125,2141.27,food,US,0 +txn_0126,4091.89,food,CA,0 +txn_0127,4305.05,electronics,FR,0 +txn_0128,44.69,electronics,US,0 +txn_0129,2558.63,electronics,CA,0 +txn_0130,2092.88,retail,US,0 +txn_0131,1118.32,travel,DE,0 +txn_0132,608.13,retail,US,0 +txn_0133,1694.7,retail,US,1 +txn_0134,4715.12,food,US,0 +txn_0135,1622.78,electronics,FR,0 +txn_0136,2598.77,travel,US,0 +txn_0137,3518.06,food,CA,0 +txn_0138,1824.51,retail,DE,0 +txn_0139,4859.19,electronics,DE,0 +txn_0140,4812.61,food,US,0 +txn_0141,1266.39,electronics,FR,0 +txn_0142,2491.27,food,FR,0 +txn_0143,1511.38,electronics,CA,0 +txn_0144,1431.35,retail,US,0 +txn_0145,194.07,travel,DE,0 +txn_0146,3051.73,retail,FR,0 +txn_0147,2518.37,travel,US,0 +txn_0148,266.88,electronics,FR,0 +txn_0149,1400.45,food,FR,0 +txn_0150,4542.25,retail,DE,0 +txn_0151,1205.41,travel,UK,0 +txn_0152,733.03,retail,CA,0 +txn_0153,2452.37,travel,CA,0 +txn_0154,4928.4,retail,DE,0 +txn_0155,1217.86,travel,FR,0 +txn_0156,3363.96,travel,US,0 +txn_0157,3810.48,electronics,FR,0 +txn_0158,1195.81,travel,DE,0 +txn_0159,3643.8,electronics,CA,0 +txn_0160,1845.24,travel,FR,0 +txn_0161,3165.21,food,CA,0 +txn_0162,3171.31,food,US,0 +txn_0163,2683.52,food,CA,0 +txn_0164,460.55,travel,CA,0 +txn_0165,4178.16,electronics,UK,0 +txn_0166,1610.69,electronics,UK,0 +txn_0167,940.73,travel,UK,0 +txn_0168,213.47,food,CA,0 +txn_0169,2958.56,food,DE,0 +txn_0170,3391.05,electronics,CA,0 +txn_0171,92.77,food,DE,0 +txn_0172,2565.34,retail,DE,0 
+txn_0173,1140.21,electronics,UK,0 +txn_0174,3229.41,food,FR,0 +txn_0175,880.09,electronics,US,0 +txn_0176,3457.78,retail,UK,0 +txn_0177,1939.81,electronics,UK,0 +txn_0178,4684.28,electronics,FR,0 +txn_0179,696.23,electronics,US,0 +txn_0180,1711.92,electronics,CA,0 +txn_0181,576.23,retail,CA,0 +txn_0182,4624.22,travel,UK,0 +txn_0183,4387.92,electronics,US,0 +txn_0184,1297.13,electronics,UK,0 +txn_0185,3303.32,food,DE,0 +txn_0186,4087.94,electronics,UK,0 +txn_0187,2780.45,food,UK,0 +txn_0188,2652.96,travel,CA,0 +txn_0189,1216.84,retail,CA,0 +txn_0190,474.58,retail,CA,0 +txn_0191,4487.11,travel,DE,0 +txn_0192,4503.09,retail,CA,0 +txn_0193,3169.18,travel,US,0 +txn_0194,1701.76,travel,FR,0 +txn_0195,1752.56,travel,US,0 +txn_0196,3632.52,retail,US,0 +txn_0197,4486.58,food,CA,0 +txn_0198,4436.56,travel,FR,0 +txn_0199,3901.58,electronics,FR,0 diff --git a/tests/fixtures/test_dataset_high_nulls.csv b/tests/fixtures/test_dataset_high_nulls.csv new file mode 100644 index 0000000..2a28160 --- /dev/null +++ b/tests/fixtures/test_dataset_high_nulls.csv @@ -0,0 +1,201 @@ +transaction_id,amount,merchant_category,country,is_fraud +txn_0000,1878.96,travel,CA,0 +txn_0001,4754.06,travel,FR,0 +txn_0002,,food,DE,0 +txn_0003,2997.31,retail,FR,0 +txn_0004,788.53,travel,CA,0 +txn_0005,788.41,travel,UK,0 +txn_0006,,electronics,FR,0 +txn_0007,4332.22,travel,UK,0 +txn_0008,3009.56,travel,DE,0 +txn_0009,3543.28,,US,0 +txn_0010,112.72,food,DE,0 +txn_0011,4849.85,travel,FR,0 +txn_0012,4163.89,,UK,0 +txn_0013,1069.57,retail,UK,0 +txn_0014,917.31,food,CA,0 +txn_0015,925.19,retail,UK,0 +txn_0016,,retail,CA,1 +txn_0017,2628.53,electronics,US,0 +txn_0018,2165.41,travel,FR,0 +txn_0019,1463.23,electronics,CA,0 +txn_0020,3063.15,electronics,US,0 +txn_0021,706.07,electronics,UK,1 +txn_0022,,electronics,UK,0 +txn_0023,1838.15,food,US,1 +txn_0024,,travel,UK,0 +txn_0025,3928.03,retail,US,0 +txn_0026,1006.37,,CA,0 +txn_0027,,travel,CA,1 +txn_0028,2966.15,retail,US,0 +txn_0029,241.79,,CA,0 
+txn_0030,3041.65,retail,CA,1 +txn_0031,860.92,electronics,CA,0 +txn_0032,334.61,retail,DE,0 +txn_0033,4744.94,travel,FR,0 +txn_0034,4828.5,travel,UK,0 +txn_0035,4043.9,,DE,0 +txn_0036,1530.02,travel,CA,0 +txn_0037,497.38,travel,US,0 +txn_0038,3424.32,food,CA,0 +txn_0039,2206.36,food,FR,0 +txn_0040,618.97,retail,CA,0 +txn_0041,2480.93,travel,US,0 +txn_0042,181.6,retail,FR,0 +txn_0043,4547.51,travel,CA,0 +txn_0044,1301.31,travel,FR,0 +txn_0045,3315.99,food,UK,0 +txn_0046,,food,UK,0 +txn_0047,2605.14,,CA,0 +txn_0048,2738.08,food,FR,0 +txn_0049,932.42,electronics,US,0 +txn_0050,4848.23,electronics,CA,1 +txn_0051,3877.91,electronics,UK,1 +txn_0052,4698.1,electronics,UK,0 +txn_0053,4475.19,electronics,CA,0 +txn_0054,2993.52,retail,FR,0 +txn_0055,4610.15,food,UK,0 +txn_0056,451.58,electronics,FR,0 +txn_0057,987.95,travel,UK,0 +txn_0058,235.68,food,UK,0 +txn_0059,1633.4,food,DE,0 +txn_0060,1949.5,electronics,UK,0 +txn_0061,,retail,US,0 +txn_0062,4145.4,electronics,CA,0 +txn_0063,1790.2,retail,CA,0 +txn_0064,,travel,FR,0 +txn_0065,2718.05,,UK,0 +txn_0066,713.21,travel,US,0 +txn_0067,4012.96,retail,FR,0 +txn_0068,382.01,retail,DE,0 +txn_0069,4934.57,travel,FR,0 +txn_0070,,retail,FR,1 +txn_0071,1001.59,,UK,0 +txn_0072,37.56,food,DE,0 +txn_0073,,travel,FR,0 +txn_0074,3537.22,food,US,1 +txn_0075,3647.75,travel,US,0 +txn_0076,3858.64,retail,CA,0 +txn_0077,379.48,travel,DE,0 +txn_0078,1798.74,retail,DE,0 +txn_0079,588.19,travel,CA,0 +txn_0080,4316.89,travel,FR,0 +txn_0081,3120.26,electronics,DE,0 +txn_0082,1661.18,,US,0 +txn_0083,327.16,electronics,US,1 +txn_0084,,retail,UK,1 +txn_0085,1632.66,electronics,DE,0 +txn_0086,3650.73,,FR,0 +txn_0087,3191.41,travel,CA,0 +txn_0088,4437.19,retail,CA,0 +txn_0089,2366.35,retail,FR,0 +txn_0090,606.78,,UK,0 +txn_0091,3569.09,retail,CA,0 +txn_0092,,retail,DE,0 +txn_0093,2810.77,electronics,UK,0 +txn_0094,3857.13,retail,DE,0 +txn_0095,2474.04,food,US,0 +txn_0096,,food,UK,0 +txn_0097,2143.43,travel,CA,0 +txn_0098,,retail,UK,0 
+txn_0099,548.38,,UK,0 +txn_0100,166.83,food,UK,0 +txn_0101,3185.69,,UK,0 +txn_0102,,retail,DE,0 +txn_0103,2547.77,travel,US,0 +txn_0104,4538.76,food,FR,0 +txn_0105,1253.97,retail,UK,0 +txn_0106,2057.81,travel,CA,0 +txn_0107,3780.2,food,UK,0 +txn_0108,,travel,CA,0 +txn_0109,394.13,electronics,DE,0 +txn_0110,1455.86,retail,CA,0 +txn_0111,,electronics,FR,0 +txn_0112,4649.19,food,US,0 +txn_0113,4042.52,electronics,CA,0 +txn_0114,3170.68,electronics,CA,0 +txn_0115,4358.59,food,US,0 +txn_0116,4020.32,travel,FR,0 +txn_0117,940.98,electronics,UK,0 +txn_0118,4463.87,travel,CA,0 +txn_0119,2701.32,electronics,US,0 +txn_0120,4039.13,travel,DE,0 +txn_0121,4481.5,food,US,0 +txn_0122,1596.84,electronics,DE,0 +txn_0123,559.16,food,FR,0 +txn_0124,1147.4,,UK,0 +txn_0125,2141.27,,US,0 +txn_0126,,food,CA,0 +txn_0127,4305.05,electronics,FR,0 +txn_0128,44.69,electronics,US,0 +txn_0129,2558.63,electronics,CA,0 +txn_0130,,retail,US,0 +txn_0131,1118.32,travel,DE,0 +txn_0132,608.13,retail,US,0 +txn_0133,1694.7,,US,1 +txn_0134,,food,US,0 +txn_0135,,electronics,FR,0 +txn_0136,2598.77,travel,US,0 +txn_0137,3518.06,food,CA,0 +txn_0138,1824.51,retail,DE,0 +txn_0139,4859.19,electronics,DE,0 +txn_0140,,food,US,0 +txn_0141,1266.39,electronics,FR,0 +txn_0142,2491.27,food,FR,0 +txn_0143,1511.38,electronics,CA,0 +txn_0144,1431.35,retail,US,0 +txn_0145,194.07,travel,DE,0 +txn_0146,3051.73,retail,FR,0 +txn_0147,2518.37,travel,US,0 +txn_0148,266.88,electronics,FR,0 +txn_0149,1400.45,food,FR,0 +txn_0150,,retail,DE,0 +txn_0151,,travel,UK,0 +txn_0152,733.03,,CA,0 +txn_0153,,travel,CA,0 +txn_0154,4928.4,retail,DE,0 +txn_0155,1217.86,travel,FR,0 +txn_0156,3363.96,travel,US,0 +txn_0157,3810.48,electronics,FR,0 +txn_0158,1195.81,travel,DE,0 +txn_0159,3643.8,electronics,CA,0 +txn_0160,1845.24,travel,FR,0 +txn_0161,3165.21,food,CA,0 +txn_0162,3171.31,food,US,0 +txn_0163,2683.52,food,CA,0 +txn_0164,460.55,travel,CA,0 +txn_0165,4178.16,electronics,UK,0 +txn_0166,1610.69,electronics,UK,0 
+txn_0167,940.73,travel,UK,0 +txn_0168,213.47,,CA,0 +txn_0169,2958.56,food,DE,0 +txn_0170,3391.05,electronics,CA,0 +txn_0171,,food,DE,0 +txn_0172,2565.34,retail,DE,0 +txn_0173,1140.21,electronics,UK,0 +txn_0174,3229.41,food,FR,0 +txn_0175,880.09,electronics,US,0 +txn_0176,3457.78,retail,UK,0 +txn_0177,,electronics,UK,0 +txn_0178,4684.28,,FR,0 +txn_0179,696.23,electronics,US,0 +txn_0180,1711.92,electronics,CA,0 +txn_0181,576.23,retail,CA,0 +txn_0182,,travel,UK,0 +txn_0183,4387.92,electronics,US,0 +txn_0184,1297.13,,UK,0 +txn_0185,3303.32,food,DE,0 +txn_0186,4087.94,electronics,UK,0 +txn_0187,2780.45,,UK,0 +txn_0188,,travel,CA,0 +txn_0189,1216.84,retail,CA,0 +txn_0190,,retail,CA,0 +txn_0191,4487.11,travel,DE,0 +txn_0192,4503.09,retail,CA,0 +txn_0193,3169.18,travel,US,0 +txn_0194,1701.76,travel,FR,0 +txn_0195,1752.56,travel,US,0 +txn_0196,3632.52,retail,US,0 +txn_0197,,food,CA,0 +txn_0198,,travel,FR,0 +txn_0199,3901.58,electronics,FR,0 diff --git a/tests/fixtures/test_metadata.json b/tests/fixtures/test_metadata.json new file mode 100644 index 0000000..0790d7b --- /dev/null +++ b/tests/fixtures/test_metadata.json @@ -0,0 +1,15 @@ +{ + "column_definitions": { + "transaction_id": "Unique transaction identifier", + "amount": "Transaction amount in USD", + "merchant_category": "Merchant category code", + "country": "ISO 2-letter country code", + "is_fraud": "Binary fraud label (1=fraud, 0=legitimate)" + }, + "seller_claims": { + "source": "Internal payment processing system", + "date_range": "2024-01-01 to 2024-12-31", + "row_count": "200", + "fraud_rate": "~4%" + } +} \ No newline at end of file diff --git a/tests/test_data_procurement.py b/tests/test_data_procurement.py new file mode 100644 index 0000000..284b138 --- /dev/null +++ b/tests/test_data_procurement.py @@ -0,0 +1,377 @@ +""" +Unit tests for confidential_data_procurement — deterministic layer. 
+Tests cover: metrics computation, critical checks, component scores, +quality score formula, price formula, deal condition, and run_deterministic. +""" +from __future__ import annotations + +import io +import uuid + +import pandas as pd +import pytest + +from skills.confidential_data_procurement.config import ( + CRITICAL_DUPLICATE_THRESHOLD, + DEFAULT_SCORE_WEIGHTS, +) +from skills.confidential_data_procurement.deterministic import ( + check_critical, + check_deal, + compute_component_scores, + compute_metrics, + compute_price, + compute_quality_score, + run_deterministic, +) +from skills.confidential_data_procurement.ingest import _datasets, cleanup +from skills.confidential_data_procurement.models import BuyerPolicy + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_policy(**overrides) -> BuyerPolicy: + defaults = dict( + required_columns=["transaction_id", "amount", "is_fraud"], + min_rows=100, + max_null_rate=0.05, + max_duplicate_rate=0.10, + min_label_rate=0.02, + label_column="is_fraud", + forbidden_columns=["ssn", "dob"], + max_budget=5000.0, + base_price=500.0, + ) + defaults.update(overrides) + return BuyerPolicy(**defaults) + + +def _make_df(rows=150, null_amount_rate=0.0, dup_rate=0.0, cols=None) -> pd.DataFrame: + """Generate a clean fraud-like DataFrame.""" + import numpy as np + np.random.seed(0) + base_n = int(rows * (1 - dup_rate)) + df = pd.DataFrame({ + "transaction_id": [f"txn_{i:04d}" for i in range(base_n)], + "amount": [round(float(i * 10.5), 2) for i in range(base_n)], + "is_fraud": [1 if i % 25 == 0 else 0 for i in range(base_n)], + }) + if cols: + df = df[[c for c in cols if c in df.columns]] + if null_amount_rate > 0: + n_nulls = int(base_n * null_amount_rate) + df.loc[:n_nulls, "amount"] = None + if dup_rate > 0: + extra = int(rows * dup_rate) + df = pd.concat([df, df.iloc[:extra]], ignore_index=True) + 
return df + + +def _register_df(df: pd.DataFrame, metadata: dict | None = None) -> str: + """Store a DataFrame in the ingest store and return its dataset_id.""" + dataset_id = str(uuid.uuid4()) + _datasets[dataset_id] = { + "df": df, + "metadata": metadata or {}, + "column_definitions": {}, + "seller_claims": {}, + "instance_id": "test_instance", + } + return dataset_id + + +# --------------------------------------------------------------------------- +# compute_metrics +# --------------------------------------------------------------------------- + +class TestComputeMetrics: + def test_basic_counts(self): + df = _make_df(rows=150) + policy = _make_policy() + m = compute_metrics(df, policy) + assert m.row_count == 150 + assert "transaction_id" in m.column_names + assert m.critical_failure is False + assert m.hard_constraints_pass is True + + def test_null_rates(self): + df = _make_df(rows=100, null_amount_rate=0.20) + policy = _make_policy() + m = compute_metrics(df, policy) + assert m.null_rate_by_column["amount"] > 0.15 + assert m.overall_null_rate > 0.0 + + def test_duplicate_rate(self): + df = _make_df(rows=100, dup_rate=0.30) + policy = _make_policy() + m = compute_metrics(df, policy) + assert m.duplicate_rate > 0.20 + + def test_label_rate(self): + df = _make_df(rows=100) + policy = _make_policy(label_column="is_fraud") + m = compute_metrics(df, policy) + assert m.label_rate is not None + assert 0 < m.label_rate < 0.1 # ~4% fraud + + def test_no_label_column(self): + df = _make_df(rows=100) + policy = _make_policy(label_column=None, min_label_rate=None) + m = compute_metrics(df, policy) + assert m.label_rate is None + + def test_forbidden_column_detected(self): + df = _make_df(rows=50) + df["ssn"] = "xxx-xx-0000" + policy = _make_policy() + m = compute_metrics(df, policy) + assert "ssn" in m.forbidden_columns_present + assert m.hard_constraints_pass is False + + def test_empty_dataframe(self): + df = pd.DataFrame(columns=["transaction_id", "amount", 
"is_fraud"]) + policy = _make_policy() + m = compute_metrics(df, policy) + assert m.row_count == 0 + assert m.critical_failure is True + assert m.hard_constraints_pass is False + + +# --------------------------------------------------------------------------- +# check_critical +# --------------------------------------------------------------------------- + +class TestCheckCritical: + def test_clean_df_not_critical(self): + df = _make_df(rows=100) + policy = _make_policy() + m = compute_metrics(df, policy) + is_crit, reason = check_critical(m) + assert is_crit is False + assert reason is None + + def test_forbidden_col_is_critical(self): + df = _make_df(rows=50) + df["ssn"] = "xxx" + policy = _make_policy() + m = compute_metrics(df, policy) + is_crit, reason = check_critical(m) + assert is_crit is True + assert "ssn" in reason.lower() + + def test_high_duplicate_rate_is_critical(self): + df = _make_df(rows=100, dup_rate=CRITICAL_DUPLICATE_THRESHOLD + 0.05) + policy = _make_policy() + m = compute_metrics(df, policy) + is_crit, reason = check_critical(m) + assert is_crit is True + assert "duplicate" in reason.lower() + + def test_empty_df_is_critical(self): + df = pd.DataFrame(columns=["a", "b"]) + policy = _make_policy() + m = compute_metrics(df, policy) + is_crit, reason = check_critical(m) + assert is_crit is True + + +# --------------------------------------------------------------------------- +# compute_component_scores +# --------------------------------------------------------------------------- + +class TestComponentScores: + def test_perfect_dataset_scores(self): + df = _make_df(rows=200) + policy = _make_policy(min_rows=100) + m = compute_metrics(df, policy) + scores = compute_component_scores(m, policy) + assert scores["coverage"] == 1.0 + assert scores["risk"] == 1.0 + assert scores["null"] == 1.0 # no nulls + assert scores["duplicate"] == 1.0 # no dups + + def test_coverage_below_min(self): + df = _make_df(rows=50) + policy = _make_policy(min_rows=200) + 
m = compute_metrics(df, policy) + scores = compute_component_scores(m, policy) + assert scores["coverage"] == pytest.approx(0.25, abs=0.01) + + def test_null_score_penalised(self): + df = _make_df(rows=100, null_amount_rate=0.20) + policy = _make_policy(max_null_rate=0.05) + m = compute_metrics(df, policy) + scores = compute_component_scores(m, policy) + assert scores["null"] < 0.5 + + def test_risk_score_zero_on_forbidden(self): + df = _make_df(rows=50) + df["ssn"] = "xxx" + policy = _make_policy() + m = compute_metrics(df, policy) + scores = compute_component_scores(m, policy) + assert scores["risk"] == 0.0 + + def test_schema_score_placeholder(self): + df = _make_df(rows=100) + policy = _make_policy() + m = compute_metrics(df, policy) + scores = compute_component_scores(m, policy) + assert scores["schema"] == 0.5 # placeholder until agent + + def test_claim_veracity_placeholder(self): + df = _make_df(rows=100) + policy = _make_policy() + m = compute_metrics(df, policy) + scores = compute_component_scores(m, policy) + assert scores["claim_veracity"] == 1.0 # placeholder until agent + + +# --------------------------------------------------------------------------- +# compute_quality_score +# --------------------------------------------------------------------------- + +class TestQualityScore: + def test_default_weights_sum_to_one(self): + assert abs(sum(DEFAULT_SCORE_WEIGHTS.values()) - 1.0) < 0.001 + + def test_perfect_scores_give_one(self): + perfect = {k: 1.0 for k in DEFAULT_SCORE_WEIGHTS} + assert compute_quality_score(perfect, _make_policy()) == pytest.approx(1.0) + + def test_zero_scores_give_zero(self): + zeros = {k: 0.0 for k in DEFAULT_SCORE_WEIGHTS} + assert compute_quality_score(zeros, _make_policy()) == pytest.approx(0.0) + + def test_clamped_to_zero_one(self): + over = {k: 2.0 for k in DEFAULT_SCORE_WEIGHTS} + assert compute_quality_score(over, _make_policy()) == 1.0 + + def test_custom_weights_respected(self): + policy = 
_make_policy(score_weights={k: 1/7 for k in DEFAULT_SCORE_WEIGHTS}) + scores = {k: 1.0 for k in DEFAULT_SCORE_WEIGHTS} + assert compute_quality_score(scores, policy) == pytest.approx(1.0, abs=0.01) + + +# --------------------------------------------------------------------------- +# compute_price +# --------------------------------------------------------------------------- + +class TestComputePrice: + def test_s_zero_gives_base_price(self): + assert compute_price(0.0, 500.0, 5000.0) == 500.0 + + def test_s_one_gives_max_budget(self): + assert compute_price(1.0, 500.0, 5000.0) == 5000.0 + + def test_midpoint(self): + assert compute_price(0.5, 0.0, 1000.0) == 500.0 + + def test_rounded_to_two_decimals(self): + result = compute_price(0.333, 0.0, 1000.0) + assert result == round(result, 2) + + def test_formula_correct(self): + S, base, budget = 0.87, 500.0, 5000.0 + expected = round(500.0 + (5000.0 - 500.0) * 0.87, 2) + assert compute_price(S, base, budget) == expected + + +# --------------------------------------------------------------------------- +# check_deal +# --------------------------------------------------------------------------- + +class TestCheckDeal: + def test_deal_passes(self): + assert check_deal(True, 3000.0, 4000.0, 5000.0) is True + + def test_reserve_above_payment(self): + assert check_deal(True, 4500.0, 4000.0, 5000.0) is False + + def test_payment_above_budget(self): + # P > B can't normally happen (P = B * S ≤ B), but guard anyway + assert check_deal(True, 100.0, 6000.0, 5000.0) is False + + def test_hard_constraints_fail(self): + assert check_deal(False, 3000.0, 4000.0, 5000.0) is False + + def test_exact_reserve_equals_payment(self): + assert check_deal(True, 4000.0, 4000.0, 5000.0) is True # R == P: ok + + +# --------------------------------------------------------------------------- +# run_deterministic (integration) +# --------------------------------------------------------------------------- + +class TestRunDeterministic: + def 
test_good_dataset_deal_passes(self): + df = _make_df(rows=200) + policy = _make_policy(min_rows=100, max_budget=5000.0, base_price=500.0) + dataset_id = _register_df(df) + try: + result = run_deterministic(dataset_id, policy, reserve_price=1000.0) + assert result["deal"] is True + assert result["quality_score"] > 0.5 + assert result["proposed_payment"] >= 500.0 + assert result["proposed_payment"] <= 5000.0 + assert not result["metrics"].critical_failure + finally: + cleanup(dataset_id) + + def test_critical_failure_early_exit(self): + df = _make_df(rows=50) + df["ssn"] = "xxx" + policy = _make_policy() + dataset_id = _register_df(df) + try: + result = run_deterministic(dataset_id, policy, reserve_price=100.0) + assert result["metrics"].critical_failure is True + assert result["deal"] is False + assert result["quality_score"] == 0.0 + assert result["proposed_payment"] == policy.base_price + assert len(result["notes"]) > 0 + finally: + cleanup(dataset_id) + + def test_high_null_reduces_price(self): + df_clean = _make_df(rows=150) + df_nulls = _make_df(rows=150, null_amount_rate=0.30) + policy = _make_policy(max_null_rate=0.05, max_budget=5000.0, base_price=0.0) + + id_clean = _register_df(df_clean) + id_nulls = _register_df(df_nulls) + try: + clean_result = run_deterministic(id_clean, policy, reserve_price=0.0) + nulls_result = run_deterministic(id_nulls, policy, reserve_price=0.0) + assert nulls_result["proposed_payment"] < clean_result["proposed_payment"] + finally: + cleanup(id_clean) + cleanup(id_nulls) + + def test_reserve_above_payment_no_deal(self): + df = _make_df(rows=150) + policy = _make_policy(min_rows=100, max_budget=1000.0, base_price=0.0) + dataset_id = _register_df(df) + try: + result = run_deterministic(dataset_id, policy, reserve_price=9999.0) + assert result["deal"] is False + assert any("reserve" in n.lower() for n in result["notes"]) + finally: + cleanup(dataset_id) + + def test_notes_populated_on_partial_failure(self): + df = _make_df(rows=50) # 
below min_rows=100 + policy = _make_policy(min_rows=100) + dataset_id = _register_df(df) + try: + result = run_deterministic(dataset_id, policy, reserve_price=0.0) + assert any("row count" in n.lower() for n in result["notes"]) + finally: + cleanup(dataset_id) + + def test_dataset_not_found_raises(self): + policy = _make_policy() + with pytest.raises(KeyError): + run_deterministic("nonexistent_id", policy, reserve_price=100.0) From 540cad4d63a4a47eca157bce4086667d6b9634a5 Mon Sep 17 00:00:00 2001 From: Parth Thapliyal Date: Sun, 22 Mar 2026 15:14:03 -0400 Subject: [PATCH 04/13] =?UTF-8?q?feat:=20guardrails=20=E2=80=94=20role-awa?= =?UTF-8?q?re=20output=20filter=20and=20tool=20output=20validator?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ProcurementFilter withholds quality_score and hard_constraints_pass from the supplier role to prevent max_budget reverse-engineering (P/S = budget). validate_tool_output blocks raw row dumps, high-cardinality lists, and oversized blobs before they reach the agent. 15 new tests, 53 total. --- .../guardrails.py | 110 ++++++++++++++++ tests/test_data_procurement.py | 117 ++++++++++++++++++ 2 files changed, 227 insertions(+) create mode 100644 skills/confidential_data_procurement/guardrails.py diff --git a/skills/confidential_data_procurement/guardrails.py b/skills/confidential_data_procurement/guardrails.py new file mode 100644 index 0000000..602dff5 --- /dev/null +++ b/skills/confidential_data_procurement/guardrails.py @@ -0,0 +1,110 @@ +""" +Output filter and tool output validator for confidential_data_procurement. 
+
+ProcurementFilter — role-aware output filter:
+ - Buyer (admin): sees quality_score, hard_constraints_pass
+ - Supplier (user): those two fields are withheld (budget leak prevention —
+ if the supplier sees quality_score S and proposed_payment P they can
+ solve P = base_price + (max_budget - base_price) * S for max_budget;
+ when base_price = 0 this reduces to max_budget = P / S)
+
+validate_tool_output — programmatic guardrail wrapping every agent data tool:
+ - Blocks raw row dumps (too many CSV-like lines)
+ - Blocks high-cardinality value lists (> MAX_TOOL_OUTPUT_ITEMS list entries)
+ - Blocks oversized blobs (> MAX_TOOL_OUTPUT_CHARS)
+
+LeakageDetector is applied inside ProcurementFilter.apply() (inherited from
+OutputFilterBase). Even if the LLM echoes a cell value in its explanation, the
+detector flags it before the response leaves the pipeline.
+"""
+from __future__ import annotations
+
+from core.guardrails import LeakageDetector, OutputFilterBase
+from skills.confidential_data_procurement.config import (
+ ALLOWED_OUTPUT_KEYS,
+ MIN_LEAKAGE_SUBSTRING_LENGTH,
+ SCORE_BOUNDS,
+ USER_OUTPUT_KEYS,
+)
+
+# ---------------------------------------------------------------------------
+# Tool output guardrail constants
+# ---------------------------------------------------------------------------
+
+MAX_TOOL_OUTPUT_CHARS: int = 4_000
+MAX_TOOL_OUTPUT_ITEMS: int = 50 # max enumerated items (bullet/colon lines)
+MAX_RAW_ROW_LINES: int = 5 # more comma-separated lines than this → raw dump
+
+
+# ---------------------------------------------------------------------------
+# Role-aware output filter
+# ---------------------------------------------------------------------------
+
+class ProcurementFilter(OutputFilterBase):
+ """
+ Role-aware output filter for the dataset procurement pipeline.
+ + role="admin" → buyer view — full ALLOWED_OUTPUT_KEYS (includes quality_score) + role="user" → supplier view — USER_OUTPUT_KEYS (quality_score withheld) + """ + + def __init__(self, role: str = "admin"): + keys = ALLOWED_OUTPUT_KEYS if role == "admin" else USER_OUTPUT_KEYS + super().__init__( + allowed_keys=keys, + leakage_detector=LeakageDetector(min_length=MIN_LEAKAGE_SUBSTRING_LENGTH), + ) + + def check_bounds(self, result: dict) -> dict: + """Clamp quality_score to [0, 1]. All other fields pass through.""" + if "quality_score" in result: + lo, hi = SCORE_BOUNDS["quality_score"] + result["quality_score"] = max(lo, min(hi, float(result["quality_score"]))) + return result + + +# --------------------------------------------------------------------------- +# Tool output validator +# --------------------------------------------------------------------------- + +def validate_tool_output(output: str) -> str: + """ + Programmatic guardrail for every agent data tool. + + Raises ValueError if the output looks like: + - A raw row dump (> MAX_RAW_ROW_LINES CSV-like lines) + - A high-cardinality list (> MAX_TOOL_OUTPUT_ITEMS enumerated items) + - An oversized blob (> MAX_TOOL_OUTPUT_CHARS characters) + + Returns the output unchanged if all checks pass. + """ + if len(output) > MAX_TOOL_OUTPUT_CHARS: + raise ValueError( + f"Tool output too large ({len(output):,} chars). " + f"Maximum allowed: {MAX_TOOL_OUTPUT_CHARS:,}. " + "Return aggregate statistics, not raw data." + ) + + lines = [line for line in output.splitlines() if line.strip()] + + # Raw row detection — a real stats summary rarely has many comma-heavy lines + csv_like = sum(1 for line in lines if line.count(",") >= 2) + if csv_like > MAX_RAW_ROW_LINES: + raise ValueError( + f"Tool output contains {csv_like} CSV-like lines " + f"(threshold: {MAX_RAW_ROW_LINES}). " + "Tools must return aggregate statistics, not raw rows." 
+ ) + + # High-cardinality detection — count bullet/label lines + list_items = [ + line for line in lines + if line.lstrip().startswith(("-", "*", "•")) or ": " in line + ] + if len(list_items) > MAX_TOOL_OUTPUT_ITEMS: + raise ValueError( + f"Tool output enumerates {len(list_items)} items " + f"(threshold: {MAX_TOOL_OUTPUT_ITEMS}). " + "Return top-N values or aggregates only." + ) + + return output diff --git a/tests/test_data_procurement.py b/tests/test_data_procurement.py index 284b138..ad21fa6 100644 --- a/tests/test_data_procurement.py +++ b/tests/test_data_procurement.py @@ -375,3 +375,120 @@ def test_dataset_not_found_raises(self): policy = _make_policy() with pytest.raises(KeyError): run_deterministic("nonexistent_id", policy, reserve_price=100.0) + + +# --------------------------------------------------------------------------- +# ProcurementFilter +# --------------------------------------------------------------------------- + +from skills.confidential_data_procurement.guardrails import ( + ProcurementFilter, + validate_tool_output, +) + + +class TestProcurementFilter: + def _result(self) -> dict: + return { + "submission_id": "sub-1", + "deal": True, + "quality_score": 0.85, + "proposed_payment": 3000.0, + "hard_constraints_pass": True, + "settlement_status": "authorized", + "release_token": "tok-abc", + "notes": [], + "explanation": "Looks good.", + "claim_verification": None, + "schema_matching": None, + "buyer_response": "accept", + "supplier_response": "accept", + "renegotiation_used": False, + } + + def test_buyer_sees_quality_score(self): + f = ProcurementFilter(role="admin") + out = f.filter_keys(self._result()) + assert "quality_score" in out + + def test_supplier_hides_quality_score(self): + f = ProcurementFilter(role="user") + out = f.filter_keys(self._result()) + assert "quality_score" not in out + assert "hard_constraints_pass" not in out + + def test_supplier_still_sees_payment(self): + f = ProcurementFilter(role="user") + out = 
f.filter_keys(self._result()) + assert "proposed_payment" in out + assert "deal" in out + + def test_check_bounds_clamps_high(self): + f = ProcurementFilter(role="admin") + r = {"quality_score": 1.5} + assert f.check_bounds(r)["quality_score"] == 1.0 + + def test_check_bounds_clamps_low(self): + f = ProcurementFilter(role="admin") + r = {"quality_score": -0.3} + assert f.check_bounds(r)["quality_score"] == 0.0 + + def test_check_bounds_passes_valid(self): + f = ProcurementFilter(role="admin") + r = {"quality_score": 0.72} + assert f.check_bounds(r)["quality_score"] == pytest.approx(0.72) + + def test_unknown_keys_stripped(self): + f = ProcurementFilter(role="admin") + r = self._result() + r["_internal_secret"] = "max_budget=9000" + out = f.filter_keys(r) + assert "_internal_secret" not in out + + def test_leakage_flagged_in_apply(self): + f = ProcurementFilter(role="admin") + result = self._result() + # Inject a long substring into explanation that also appears in raw_inputs + leaked = "SENSITIVE_CELL_VALUE_XYZ_1234567890" + result["explanation"] = f"The data shows {leaked} is common." 
+ filtered = f.apply([result], [leaked]) + assert "_leakage_warning" in filtered[0] + + +# --------------------------------------------------------------------------- +# validate_tool_output +# --------------------------------------------------------------------------- + +class TestValidateToolOutput: + def test_clean_stats_pass(self): + output = "count: 150\nmean: 4.2\nstd: 1.1\nmin: 0.0\nmax: 10.0" + assert validate_tool_output(output) == output + + def test_oversized_raises(self): + big = "x" * 5000 + with pytest.raises(ValueError, match="too large"): + validate_tool_output(big) + + def test_raw_rows_raises(self): + # 6 CSV-like lines — over the threshold of 5 + rows = "\n".join(f"txn_{i},100.{i},0" for i in range(6)) + with pytest.raises(ValueError, match="CSV-like"): + validate_tool_output(rows) + + def test_exactly_at_raw_row_limit_passes(self): + # exactly MAX_RAW_ROW_LINES (5) CSV-like lines — should pass + rows = "\n".join(f"txn_{i},100.{i},0" for i in range(5)) + assert validate_tool_output(rows) == rows + + def test_high_cardinality_raises(self): + # 51 bullet items — over the threshold of 50 + items = "\n".join(f"- value_{i}: {i}" for i in range(51)) + with pytest.raises(ValueError, match="enumerates"): + validate_tool_output(items) + + def test_exactly_at_cardinality_limit_passes(self): + items = "\n".join(f"- value_{i}: {i}" for i in range(50)) + assert validate_tool_output(items) == items + + def test_empty_string_passes(self): + assert validate_tool_output("") == "" From 207b842dfe870d8ab35ec9b50f67502239b5fdfa Mon Sep 17 00:00:00 2001 From: Parth Thapliyal Date: Sun, 22 Mar 2026 15:16:49 -0400 Subject: [PATCH 05/13] =?UTF-8?q?feat:=20buyer=20onboarding=20init=20handl?= =?UTF-8?q?er=20=E2=80=94=20LLM=20extracts=20BuyerPolicy=20from=20conversa?= =?UTF-8?q?tion?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Guided multi-turn conversation collects required columns, row/quality thresholds, budget range, and 
optional label/forbidden-column constraints. LLM returns JSON when policy is complete; handler validates and constructs BuyerPolicy. 13 new tests (5 parse, 8 handler), 66 total. --- skills/confidential_data_procurement/init.py | 266 +++++++++++++++++++ tests/test_data_procurement.py | 125 +++++++++ 2 files changed, 391 insertions(+) create mode 100644 skills/confidential_data_procurement/init.py diff --git a/skills/confidential_data_procurement/init.py b/skills/confidential_data_procurement/init.py new file mode 100644 index 0000000..59dafb2 --- /dev/null +++ b/skills/confidential_data_procurement/init.py @@ -0,0 +1,266 @@ +""" +Buyer onboarding handler for confidential_data_procurement. + +The API calls procurement_init_handler(message, conversation) on each POST /init. +This module owns all procurement-specific onboarding logic: + - Greeting and guided data-collection conversation + - LLM extraction of BuyerPolicy fields from free-form buyer input + - BuyerPolicy construction and validation + +Handler interface (same contract as hackathon_novelty.init): + procurement_init_handler(message: str, conversation: list[dict]) -> dict + + Returns one of: + {"status": "configuring", "message": str, "conversation": list[dict]} + {"status": "ready", "message": str, "conversation": list[dict], + "config": BuyerPolicy, "threshold": 1} + +threshold is always 1 — procurement triggers instantly when a supplier submits. +""" +from __future__ import annotations + +import json +from typing import Optional + +from langchain_core.messages import AIMessage, HumanMessage, SystemMessage + +from config import get_llm +from skills.confidential_data_procurement.config import INIT_MODEL +from skills.confidential_data_procurement.models import BuyerPolicy + + +INIT_PROMPT_VERSION = "v1" + + +_GREETING_TEMPLATE = """\ +Welcome to the Confidential Data Procurement setup. + +I'll help you configure your dataset acquisition policy inside the TEE. 
\ +Suppliers will upload datasets and submit a reserve price — neither party \ +sees the other's private numbers. + +Please provide the following: + +**Required** +1. **Dataset description** — what kind of data you need and why +2. **Required columns** — list the column names you expect + Example: transaction_id, amount, is_fraud +3. **Minimum rows** — fewest acceptable rows (e.g. 10000) +4. **Max null rate** — e.g. 5% means at most 5% cells can be missing +5. **Max duplicate rate** — e.g. 10% means at most 10% duplicate rows +6. **Maximum budget** — the most you will pay for a perfect dataset ($) + +**Optional** +- **Base price** — minimum payment even for poor-quality data (default $0) +- **Label column** + **minimum label rate** — e.g. is_fraud column must have ≥ 2% positives +- **Forbidden columns** — PII fields to block (e.g. ssn, dob, passport_number) + +You can provide everything in one message or answer step by step.\ +""" + + +_SYSTEM_PROMPT = """\ +You are configuring a confidential dataset procurement instance for a buyer. \ +Your job is to collect the required policy fields from the buyer's messages. + +REQUIRED fields (must be present and valid before responding with JSON): + - required_columns: list of expected column name strings (non-empty) + - min_rows: positive integer + - max_null_rate: float in [0, 1] (e.g. 0.05 for 5%) + - max_duplicate_rate: float in [0, 1] + - max_budget: positive float (the ceiling payment for a perfect dataset) + +OPTIONAL fields (use defaults if not provided): + - base_price: float >= 0, default 0.0 (floor payment when quality score = 0) + - min_label_rate: float in [0, 1] or null + - label_column: string or null + - forbidden_columns: list of strings, default [] + - description: free-text description of the dataset need + +CRITICAL RULE: base_price must be strictly less than max_budget. \ +If the buyer provides both and base_price >= max_budget, ask them to fix it. 
+ +Once you have all required fields and they are valid, respond with ONLY this \ +JSON — no extra text, no markdown fences: +{ + "ready": true, + "required_columns": [...], + "min_rows": N, + "max_null_rate": 0.XX, + "max_duplicate_rate": 0.XX, + "max_budget": NNN.0, + "base_price": NN.0, + "min_label_rate": null_or_float, + "label_column": null_or_string, + "forbidden_columns": [...], + "description": "..." +} + +Only ask follow-up questions if required fields are missing or invalid. \ +Convert percentages to decimals (e.g. "5%" → 0.05).\ +""" + + +def _parse_llm_response(text: str) -> Optional[dict]: + """Strip markdown fences, parse JSON, return dict if ready=true else None.""" + text = text.strip() + if text.startswith("```"): + lines = text.splitlines() + inner = lines[1:-1] if lines[-1].strip() == "```" else lines[1:] + text = "\n".join(inner).strip() + try: + obj = json.loads(text) + if isinstance(obj, dict) and obj.get("ready") is True: + return obj + except (json.JSONDecodeError, ValueError): + pass + return None + + +def procurement_init_handler(message: str, conversation: list[dict]) -> dict: + """ + Handle one turn of the buyer onboarding conversation. + + Called by the API on each POST /init. The accumulated conversation is passed + in; this handler appends the new messages and returns the updated state. + """ + # First turn: return fixed greeting immediately (no LLM call). 
+ if not conversation: + conversation = [ + {"role": "system", "content": _SYSTEM_PROMPT}, + {"role": "ai", "content": _GREETING_TEMPLATE}, + ] + return { + "status": "configuring", + "message": _GREETING_TEMPLATE, + "conversation": conversation, + } + + conversation = conversation + [{"role": "human", "content": message}] + + # Build LangChain messages + lc_messages = [] + for msg in conversation: + if msg["role"] == "system": + lc_messages.append(SystemMessage(content=msg["content"])) + elif msg["role"] == "human": + lc_messages.append(HumanMessage(content=msg["content"])) + else: + lc_messages.append(AIMessage(content=msg["content"])) + + llm = get_llm(INIT_MODEL) + response = llm.invoke(lc_messages) + ai_text = response.content + + conversation = conversation + [{"role": "ai", "content": ai_text}] + + extracted = _parse_llm_response(ai_text) + if not extracted: + return { + "status": "configuring", + "message": ai_text, + "conversation": conversation, + } + + # Validate required fields + required_columns = extracted.get("required_columns") + if not required_columns or not isinstance(required_columns, list): + return { + "status": "configuring", + "message": "Required columns cannot be empty. Please list the column names you expect.", + "conversation": conversation, + } + + min_rows = extracted.get("min_rows") + try: + min_rows = int(min_rows) + if min_rows < 1: + raise ValueError + except (TypeError, ValueError): + return { + "status": "configuring", + "message": "Minimum rows must be a positive integer. Please provide a valid number.", + "conversation": conversation, + } + + max_budget = extracted.get("max_budget") + try: + max_budget = float(max_budget) + if max_budget <= 0: + raise ValueError + except (TypeError, ValueError): + return { + "status": "configuring", + "message": "Maximum budget must be a positive number. 
Please provide a valid dollar amount.", + "conversation": conversation, + } + + for rate_key in ("max_null_rate", "max_duplicate_rate"): + val = extracted.get(rate_key) + try: + val = float(val) + if not (0.0 <= val <= 1.0): + raise ValueError + except (TypeError, ValueError): + return { + "status": "configuring", + "message": ( + f"{rate_key.replace('_', ' ').title()} must be a decimal between 0 and 1 " + "(e.g. 0.05 for 5%). Please provide a valid value." + ), + "conversation": conversation, + } + + base_price = float(extracted.get("base_price") or 0.0) + if base_price >= max_budget: + return { + "status": "configuring", + "message": ( + f"Base price (${base_price:,.2f}) must be less than max budget (${max_budget:,.2f}). " + "Please adjust." + ), + "conversation": conversation, + } + + # Build and validate BuyerPolicy (Pydantic catches anything we missed) + try: + policy = BuyerPolicy( + required_columns=[str(c) for c in required_columns], + min_rows=min_rows, + max_null_rate=float(extracted["max_null_rate"]), + max_duplicate_rate=float(extracted["max_duplicate_rate"]), + min_label_rate=extracted.get("min_label_rate"), + label_column=extracted.get("label_column"), + forbidden_columns=[str(c) for c in (extracted.get("forbidden_columns") or [])], + max_budget=max_budget, + base_price=base_price, + description=str(extracted.get("description") or ""), + ) + except Exception as exc: + return { + "status": "configuring", + "message": f"Could not build policy: {exc}. 
Please review your inputs.", + "conversation": conversation, + } + + ready_message = ( + f"Policy saved.\n" + f"Required columns: {', '.join(policy.required_columns)}\n" + f"Minimum rows: {policy.min_rows:,}\n" + f"Max null rate: {policy.max_null_rate:.0%} | " + f"Max duplicate rate: {policy.max_duplicate_rate:.0%}\n" + f"Budget: ${policy.base_price:,.2f} – ${policy.max_budget:,.2f}\n" + + (f"Label column: {policy.label_column} (≥ {policy.min_label_rate:.1%})\n" + if policy.label_column else "") + + (f"Forbidden columns: {', '.join(policy.forbidden_columns)}\n" + if policy.forbidden_columns else "") + + "\nShare the instance link with your supplier to begin." + ) + + return { + "status": "ready", + "message": ready_message, + "conversation": conversation, + "config": policy, + "threshold": 1, # procurement triggers instantly on first supplier submission + } diff --git a/tests/test_data_procurement.py b/tests/test_data_procurement.py index ad21fa6..e25e254 100644 --- a/tests/test_data_procurement.py +++ b/tests/test_data_procurement.py @@ -492,3 +492,128 @@ def test_exactly_at_cardinality_limit_passes(self): def test_empty_string_passes(self): assert validate_tool_output("") == "" + + +# --------------------------------------------------------------------------- +# procurement_init_handler +# --------------------------------------------------------------------------- + +from unittest.mock import patch + +from skills.confidential_data_procurement.init import ( + _parse_llm_response, + procurement_init_handler, +) + + +class _FakeLLM: + """Minimal LLM stub — returns a fixed content string.""" + def __init__(self, content: str): + self._content = content + + def invoke(self, _messages): + class _R: + pass + r = _R() + r.content = self._content + return r + + +_SEEDED_CONV = [ + {"role": "system", "content": "sys"}, + {"role": "ai", "content": "greeting"}, +] + +_VALID_JSON = ( + '{"ready": true, "required_columns": ["txn_id", "amount", "label"], ' + '"min_rows": 500, 
"max_null_rate": 0.05, "max_duplicate_rate": 0.10, ' + '"max_budget": 4000.0, "base_price": 200.0, ' + '"min_label_rate": 0.02, "label_column": "label", ' + '"forbidden_columns": ["ssn"], "description": "fraud dataset"}' +) + + +class TestParseLlmResponse: + def test_valid_ready_json(self): + result = _parse_llm_response(_VALID_JSON) + assert result is not None + assert result["ready"] is True + assert result["required_columns"] == ["txn_id", "amount", "label"] + + def test_markdown_fences_stripped(self): + wrapped = f"```json\n{_VALID_JSON}\n```" + assert _parse_llm_response(wrapped) is not None + + def test_non_json_returns_none(self): + assert _parse_llm_response("Sure, what columns do you need?") is None + + def test_ready_false_returns_none(self): + assert _parse_llm_response('{"ready": false, "message": "tell me more"}') is None + + def test_missing_ready_returns_none(self): + assert _parse_llm_response('{"required_columns": ["a"]}') is None + + +class TestProcurementInitHandler: + def test_first_turn_returns_greeting(self): + result = procurement_init_handler("", []) + assert result["status"] == "configuring" + assert "required columns" in result["message"].lower() + assert result["conversation"][0]["role"] == "system" + + def test_first_turn_no_llm_call(self): + # No patch needed — should not call get_llm at all on turn 1 + result = procurement_init_handler("anything", []) + assert result["status"] == "configuring" + + def test_valid_json_returns_ready(self): + with patch("skills.confidential_data_procurement.init.get_llm", + return_value=_FakeLLM(_VALID_JSON)): + result = procurement_init_handler("here is my policy", _SEEDED_CONV) + assert result["status"] == "ready" + assert result["threshold"] == 1 + policy = result["config"] + assert policy.min_rows == 500 + assert policy.max_budget == 4000.0 + assert policy.base_price == 200.0 + assert "ssn" in policy.forbidden_columns + + def test_empty_columns_stays_configuring(self): + bad = 
_VALID_JSON.replace('"txn_id", "amount", "label"', "") + bad = bad.replace('"required_columns": [],', '"required_columns": [],') + payload = '{"ready": true, "required_columns": [], "min_rows": 500, "max_null_rate": 0.05, "max_duplicate_rate": 0.10, "max_budget": 4000.0}' + with patch("skills.confidential_data_procurement.init.get_llm", + return_value=_FakeLLM(payload)): + result = procurement_init_handler("no columns", _SEEDED_CONV) + assert result["status"] == "configuring" + assert "column" in result["message"].lower() + + def test_zero_min_rows_stays_configuring(self): + payload = '{"ready": true, "required_columns": ["a"], "min_rows": 0, "max_null_rate": 0.05, "max_duplicate_rate": 0.10, "max_budget": 1000.0}' + with patch("skills.confidential_data_procurement.init.get_llm", + return_value=_FakeLLM(payload)): + result = procurement_init_handler("zero rows", _SEEDED_CONV) + assert result["status"] == "configuring" + assert "rows" in result["message"].lower() + + def test_base_price_above_budget_stays_configuring(self): + payload = '{"ready": true, "required_columns": ["a"], "min_rows": 100, "max_null_rate": 0.05, "max_duplicate_rate": 0.10, "max_budget": 500.0, "base_price": 600.0}' + with patch("skills.confidential_data_procurement.init.get_llm", + return_value=_FakeLLM(payload)): + result = procurement_init_handler("bad price", _SEEDED_CONV) + assert result["status"] == "configuring" + assert "base price" in result["message"].lower() + + def test_non_json_response_stays_configuring(self): + with patch("skills.confidential_data_procurement.init.get_llm", + return_value=_FakeLLM("What forbidden columns do you need?")): + result = procurement_init_handler("not ready yet", _SEEDED_CONV) + assert result["status"] == "configuring" + assert result["message"] == "What forbidden columns do you need?" 
+ + def test_conversation_accumulates(self): + with patch("skills.confidential_data_procurement.init.get_llm", + return_value=_FakeLLM(_VALID_JSON)): + result = procurement_init_handler("my policy", _SEEDED_CONV) + # seeded (2) + human (1) + ai (1) = 4 + assert len(result["conversation"]) == 4 From 1d0c2cf1c637e9ff4ca209dcf9a74232e23c5d10 Mon Sep 17 00:00:00 2001 From: Parth Thapliyal Date: Sun, 22 Mar 2026 15:21:28 -0400 Subject: [PATCH 06/13] feat: run_skill pipeline, SkillCard, and skill registration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires up the full pipeline: deterministic evaluation → guardrails → SkillResponse. Agent layer stubbed (schema=0.5, claim_veracity=1.0 placeholders until Commit 7). Registers the new skill in routes.py alongside hackathon_novelty. SkillCard declares instant trigger mode, role-aware output keys, upload_handler, and user_display hints. --- api/routes.py | 7 +- .../confidential_data_procurement/__init__.py | 163 +++++++++++++++++- tests/test_data_procurement.py | 124 ++++++++++++- 3 files changed, 286 insertions(+), 8 deletions(-) diff --git a/api/routes.py b/api/routes.py index 8e3a142..61c02a4 100644 --- a/api/routes.py +++ b/api/routes.py @@ -34,8 +34,11 @@ def register_skills(): """Register all skills. 
Called at startup.""" - from skills.hackathon_novelty import skill_card - _skill_router.register(skill_card) + from skills.hackathon_novelty import skill_card as hackathon_card + _skill_router.register(hackathon_card) + + from skills.confidential_data_procurement import skill_card as procurement_card + _skill_router.register(procurement_card) # --- Helpers --- diff --git a/skills/confidential_data_procurement/__init__.py b/skills/confidential_data_procurement/__init__.py index c1928ef..b174376 100644 --- a/skills/confidential_data_procurement/__init__.py +++ b/skills/confidential_data_procurement/__init__.py @@ -1,2 +1,161 @@ -# Confidential Data Procurement skill — package init. -# run_skill, skill_card, and respond_handler will be added in Commit 6. +""" +Entry point for the confidential_data_procurement skill. + +Pipeline (per submission — threshold=1, so always exactly one): + 0. ingest.py — CSV parse + metadata parse (no LLM) + 1. deterministic.py — quality metrics, component scores, price, deal check (no LLM) + 2. agent.py — schema matching, claim verification, explanation (LLM) [Commit 7] + 3. guardrails.py — role-aware key filter, score clamping, leakage detection + +What to edit here: +- run_skill(): change how deterministic + agent results merge +- skill_card: update description, config, trigger_modes, roles, user_display + +The skill_card is consumed by the SkillRouter and the /skills API endpoint. +respond_handler will be added in Commit 8 (renegotiation). 
+""" +from __future__ import annotations + +from core.models import SkillResponse +from core.skill_card import SkillCard +from skills.confidential_data_procurement.config import ( + ALLOWED_OUTPUT_KEYS, + USER_OUTPUT_KEYS, +) +from skills.confidential_data_procurement.deterministic import run_deterministic +from skills.confidential_data_procurement.guardrails import ProcurementFilter +from skills.confidential_data_procurement.init import procurement_init_handler +from skills.confidential_data_procurement.ingest import cleanup, procurement_upload_handler +from skills.confidential_data_procurement.models import ( + BuyerPolicy, + ProcurementResult, + SupplierSubmission, +) + + +def run_skill(inputs: list[SupplierSubmission], params: BuyerPolicy) -> SkillResponse: + """ + Full pipeline: deterministic → [agent — Commit 7] → guardrails → response. + + With threshold=1, inputs always has exactly one SupplierSubmission. + The dataset DataFrame lives in the ingest store — never serialized or passed to LLM. 
+ """ + results = [] + + for sub in inputs: + det = run_deterministic(sub.dataset_id, params, sub.reserve_price) + metrics = det["metrics"] + + if metrics.critical_failure: + result = ProcurementResult( + submission_id=sub.submission_id, + deal=False, + quality_score=0.0, + proposed_payment=params.base_price, + hard_constraints_pass=False, + settlement_status="rejected", + notes=det["notes"], + ) + else: + # --- Agent layer (Commit 7) will refine these placeholders --- + # schema_score stays 0.5, claim_veracity stays 1.0 + # Agent will populate: explanation, claim_verification, schema_matching + # and update quality_score / proposed_payment accordingly + + settlement_status = "pending_approval" if det["deal"] else "rejected" + + result = ProcurementResult( + submission_id=sub.submission_id, + deal=det["deal"], + quality_score=det["quality_score"], + proposed_payment=det["proposed_payment"], + hard_constraints_pass=metrics.hard_constraints_pass, + settlement_status=settlement_status, + notes=det["notes"], + ) + + results.append(result.model_dump()) + + # Guardrails — admin-level filter stores all allowed keys. + # Role-based filtering (buyer vs supplier) happens in routes.py GET /results. + output_filter = ProcurementFilter(role="admin") + filtered = output_filter.apply(results, raw_inputs=[]) + + return SkillResponse(skill="confidential_data_procurement", results=filtered) + + +skill_card = SkillCard( + name="confidential_data_procurement", + description=( + "Bilateral confidential dataset trade protocol. A buyer defines acquisition " + "policy and budget; a supplier uploads a CSV dataset with a reserve price. " + "The TEE evaluates data quality (null rates, duplicates, schema match, claim " + "verification) and proposes a fair price — neither party sees the other's " + "private numbers. Only derived quality metrics and the deal verdict leave " + "the enclave." 
+ ), + run=run_skill, + input_model=SupplierSubmission, + output_keys=ALLOWED_OUTPUT_KEYS, + user_output_keys=USER_OUTPUT_KEYS, + config={"min_submissions": 1}, + trigger_modes=[ + { + "mode": "instant", + "description": ( + "Pipeline fires immediately when the supplier submits. " + "Each submission is evaluated independently against the buyer's policy." + ), + "default_config": {"min_submissions": 1}, + "admin_configurable": False, + }, + ], + roles={ + "admin": { + "description": ( + "Data buyer. Initialises the instance with an acquisition policy " + "(required columns, quality thresholds, budget range). Sees full " + "quality scores and proposed payment. Can accept, reject, or " + "renegotiate the deal." + ), + "capabilities": ["configure", "view_all_results", "respond"], + }, + "user": { + "description": ( + "Data supplier. Uploads a CSV dataset and metadata, sets a reserve " + "price, and submits for evaluation. Sees the proposed payment and " + "deal verdict but NOT the quality score (to prevent budget " + "reverse-engineering). Can accept, reject, or renegotiate." + ), + "capabilities": ["upload", "submit", "respond"], + "result_view": "own", + }, + }, + setup_prompt=( + "This skill runs a confidential dataset trade inside a TEE. " + "No raw data or private budget numbers ever leave the enclave.\n\n" + "As the buyer (admin), you need to provide:\n" + "1. **Required columns** — the column names you expect in the dataset.\n" + "2. **Quality thresholds** — minimum rows, max null rate, max duplicate rate.\n" + "3. **Budget** — your maximum budget and optional base (floor) price.\n" + "4. **(Optional)** Label column + minimum label rate.\n" + "5. **(Optional)** Forbidden columns — PII fields to automatically block.\n\n" + "The supplier will upload a CSV + metadata and set a reserve price. " + "The TEE computes a quality score, proposes a fair price, and checks " + "if the deal is viable (reserve ≤ price ≤ budget). 
Both parties then " + "accept, reject, or renegotiate." + ), + init_handler=procurement_init_handler, + upload_handler=procurement_upload_handler, + respond_handler=None, # Commit 8 + user_display={ + "deal": {"type": "badge", "label": "Deal Status"}, + "quality_score": {"type": "gauge", "label": "Quality Score", "min": 0, "max": 1}, + "proposed_payment": {"type": "currency", "label": "Proposed Payment"}, + "settlement_status": {"type": "badge", "label": "Settlement"}, + "notes": {"type": "list", "label": "Notes"}, + "explanation": {"type": "text", "label": "Analysis"}, + "schema_matching": {"type": "json", "label": "Schema Matching"}, + "claim_verification": {"type": "json", "label": "Claim Verification"}, + }, +) diff --git a/tests/test_data_procurement.py b/tests/test_data_procurement.py index e25e254..742da22 100644 --- a/tests/test_data_procurement.py +++ b/tests/test_data_procurement.py @@ -1,7 +1,8 @@ """ -Unit tests for confidential_data_procurement — deterministic layer. +Unit tests for confidential_data_procurement. Tests cover: metrics computation, critical checks, component scores, -quality score formula, price formula, deal condition, and run_deterministic. +quality score formula, price formula, deal condition, run_deterministic, +guardrails (filter + validator), init handler, run_skill, and skill_card. 
""" from __future__ import annotations @@ -24,8 +25,8 @@ compute_quality_score, run_deterministic, ) -from skills.confidential_data_procurement.ingest import _datasets, cleanup -from skills.confidential_data_procurement.models import BuyerPolicy +from skills.confidential_data_procurement.ingest import _datasets, cleanup, procurement_upload_handler +from skills.confidential_data_procurement.models import BuyerPolicy, SupplierSubmission # --------------------------------------------------------------------------- @@ -617,3 +618,118 @@ def test_conversation_accumulates(self): result = procurement_init_handler("my policy", _SEEDED_CONV) # seeded (2) + human (1) + ai (1) = 4 assert len(result["conversation"]) == 4 + + +# --------------------------------------------------------------------------- +# run_skill + skill_card +# --------------------------------------------------------------------------- + +from skills.confidential_data_procurement import run_skill, skill_card + + +class TestRunSkill: + def test_good_dataset_returns_deal(self): + df = _make_df(rows=200) + policy = _make_policy(min_rows=100, max_budget=5000.0, base_price=500.0) + dataset_id = _register_df(df) + try: + sub = SupplierSubmission( + submission_id="sub-good", + dataset_id=dataset_id, + dataset_name="fraud_data.csv", + reserve_price=1000.0, + ) + resp = run_skill([sub], policy) + assert resp.skill == "confidential_data_procurement" + assert len(resp.results) == 1 + r = resp.results[0] + assert r["deal"] is True + assert r["settlement_status"] == "pending_approval" + assert r["proposed_payment"] >= 500.0 + assert r["quality_score"] > 0.5 + finally: + cleanup(dataset_id) + + def test_critical_failure_returns_rejected(self): + df = _make_df(rows=50) + df["ssn"] = "xxx" + policy = _make_policy() + dataset_id = _register_df(df) + try: + sub = SupplierSubmission( + submission_id="sub-bad", + dataset_id=dataset_id, + dataset_name="bad_data.csv", + reserve_price=100.0, + ) + resp = run_skill([sub], policy) 
+ r = resp.results[0] + assert r["deal"] is False + assert r["settlement_status"] == "rejected" + assert r["quality_score"] == 0.0 + finally: + cleanup(dataset_id) + + def test_reserve_above_payment_no_deal(self): + df = _make_df(rows=150) + policy = _make_policy(min_rows=100, max_budget=1000.0, base_price=0.0) + dataset_id = _register_df(df) + try: + sub = SupplierSubmission( + submission_id="sub-expensive", + dataset_id=dataset_id, + dataset_name="data.csv", + reserve_price=9999.0, + ) + resp = run_skill([sub], policy) + r = resp.results[0] + assert r["deal"] is False + assert r["settlement_status"] == "rejected" + finally: + cleanup(dataset_id) + + def test_internal_fields_stripped_by_guardrails(self): + """revised_budget and revised_reserve should not appear in output.""" + df = _make_df(rows=200) + policy = _make_policy(min_rows=100, max_budget=5000.0, base_price=500.0) + dataset_id = _register_df(df) + try: + sub = SupplierSubmission( + submission_id="sub-internal", + dataset_id=dataset_id, + dataset_name="data.csv", + reserve_price=100.0, + ) + resp = run_skill([sub], policy) + r = resp.results[0] + assert "revised_budget" not in r + assert "revised_reserve" not in r + finally: + cleanup(dataset_id) + + +class TestSkillCard: + def test_card_name(self): + assert skill_card.name == "confidential_data_procurement" + + def test_card_has_required_fields(self): + assert skill_card.run is run_skill + assert skill_card.input_model is SupplierSubmission + assert skill_card.init_handler is procurement_init_handler + assert skill_card.upload_handler is procurement_upload_handler + + def test_output_keys_superset_of_user_keys(self): + assert skill_card.user_output_keys.issubset(skill_card.output_keys) + + def test_quality_score_buyer_only(self): + assert "quality_score" in skill_card.output_keys + assert "quality_score" not in skill_card.user_output_keys + + def test_metadata_serializable(self): + meta = skill_card.metadata() + assert meta["name"] == 
"confidential_data_procurement" + assert "quality_score" in meta["output_keys"] + assert "quality_score" not in meta["user_output_keys"] + + def test_threshold_is_one(self): + assert skill_card.config["min_submissions"] == 1 From 87848a5579911fd392f6cdb97ab9f35ff4aeed13 Mon Sep 17 00:00:00 2001 From: Parth Thapliyal Date: Sun, 22 Mar 2026 16:29:21 -0400 Subject: [PATCH 07/13] =?UTF-8?q?feat:=20agent=20layer=20=E2=80=94=20schem?= =?UTF-8?q?a=20matching,=20claim=20verification,=20explanation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Single evaluate_node uses 3 aggregate-only tools (schema_summary, column_stats, value_distribution) to do fuzzy column matching and seller claim verification. Agent output (schema_score, claim_veracity_score) replaces deterministic placeholders, quality_score and proposed_payment are recomputed before guardrails. --- .../confidential_data_procurement/__init__.py | 47 +++- skills/confidential_data_procurement/agent.py | 244 ++++++++++++++++++ skills/confidential_data_procurement/tools.py | 140 ++++++++++ tests/test_data_procurement.py | 95 +++++++ 4 files changed, 515 insertions(+), 11 deletions(-) create mode 100644 skills/confidential_data_procurement/agent.py create mode 100644 skills/confidential_data_procurement/tools.py diff --git a/skills/confidential_data_procurement/__init__.py b/skills/confidential_data_procurement/__init__.py index b174376..b178e65 100644 --- a/skills/confidential_data_procurement/__init__.py +++ b/skills/confidential_data_procurement/__init__.py @@ -18,19 +18,26 @@ from core.models import SkillResponse from core.skill_card import SkillCard +from skills.confidential_data_procurement.agent import run_agent from skills.confidential_data_procurement.config import ( ALLOWED_OUTPUT_KEYS, USER_OUTPUT_KEYS, ) -from skills.confidential_data_procurement.deterministic import run_deterministic +from skills.confidential_data_procurement.deterministic import ( + check_deal, + 
compute_price, + compute_quality_score, + run_deterministic, +) from skills.confidential_data_procurement.guardrails import ProcurementFilter from skills.confidential_data_procurement.init import procurement_init_handler -from skills.confidential_data_procurement.ingest import cleanup, procurement_upload_handler +from skills.confidential_data_procurement.ingest import get_dataset, procurement_upload_handler from skills.confidential_data_procurement.models import ( BuyerPolicy, ProcurementResult, SupplierSubmission, ) +from skills.confidential_data_procurement.tools import set_context def run_skill(inputs: list[SupplierSubmission], params: BuyerPolicy) -> SkillResponse: @@ -57,21 +64,39 @@ def run_skill(inputs: list[SupplierSubmission], params: BuyerPolicy) -> SkillRes notes=det["notes"], ) else: - # --- Agent layer (Commit 7) will refine these placeholders --- - # schema_score stays 0.5, claim_veracity stays 1.0 - # Agent will populate: explanation, claim_verification, schema_matching - # and update quality_score / proposed_payment accordingly - - settlement_status = "pending_approval" if det["deal"] else "rejected" + # Agent layer — schema matching + claim verification + explanation + dataset = get_dataset(sub.dataset_id) + set_context(sub.dataset_id, { + "required_columns": params.required_columns, + "column_definitions": dataset.get("column_definitions") or {}, + "seller_claims": dataset.get("seller_claims") or {}, + }) + agent_result = run_agent(sub.dataset_id, params, metrics, det["component_scores"]) + + # Merge agent's refined scores into component_scores, recompute quality + component_scores = {**det["component_scores"]} + component_scores["schema"] = agent_result["schema_score"] + component_scores["claim_veracity"] = agent_result["claim_veracity_score"] + + quality_score = compute_quality_score(component_scores, params) + proposed_payment = compute_price(quality_score, params.base_price, params.max_budget) + deal = check_deal( + 
metrics.hard_constraints_pass, sub.reserve_price, + proposed_payment, params.max_budget, + ) + settlement_status = "pending_approval" if deal else "rejected" result = ProcurementResult( submission_id=sub.submission_id, - deal=det["deal"], - quality_score=det["quality_score"], - proposed_payment=det["proposed_payment"], + deal=deal, + quality_score=quality_score, + proposed_payment=proposed_payment, hard_constraints_pass=metrics.hard_constraints_pass, settlement_status=settlement_status, notes=det["notes"], + explanation=agent_result.get("explanation"), + schema_matching=agent_result.get("schema_matching"), + claim_verification=agent_result.get("claim_verification"), ) results.append(result.model_dump()) diff --git a/skills/confidential_data_procurement/agent.py b/skills/confidential_data_procurement/agent.py new file mode 100644 index 0000000..bd53d8b --- /dev/null +++ b/skills/confidential_data_procurement/agent.py @@ -0,0 +1,244 @@ +""" +Single evaluate_node agent for confidential_data_procurement. + +The agent gets one LLM call (with tool loop) to: + 1. Read the dataset schema and column stats via tools + 2. Match the buyer's required columns to actual columns (semantic/fuzzy) + 3. Verify the seller's claims against observed statistics + 4. Write a bounded explanation for both parties + 5. Output schema_score and claim_veracity_score to replace deterministic placeholders + +The dataset never leaves the TEE — the LLM sees only aggregate statistics +returned by the tools. validate_tool_output() in tools.py blocks raw row dumps. + +Graph: single evaluate_node (no routing needed — one supplier, one dataset per run). 
+""" +from __future__ import annotations + +import json +import re +from typing import Any + +from langchain_core.messages import HumanMessage, SystemMessage +from langgraph.prebuilt import ToolNode + +from config import get_llm +from skills.confidential_data_procurement.config import EVALUATE_MODEL +from skills.confidential_data_procurement.models import BuyerPolicy, DatasetMetrics +from skills.confidential_data_procurement.tools import EVALUATE_TOOLS + + +EVALUATE_PROMPT_VERSION = "v1" + + +_SYSTEM_PROMPT = """\ +You are a data quality evaluator running inside a Trusted Execution Environment (TEE). +Your job is to assess a supplier's dataset against a buyer's acquisition policy. + +You have three tools: + - get_schema_summary() — column names, dtypes, null rates, row count + - get_column_stats(column_name) — per-column statistics + - get_value_distribution(column_name, top_n) — top-N value frequencies + +TASK 1 — SCHEMA MATCHING +The buyer requires these columns (semantic — names may differ from actual dataset): +{required_columns} + +Column definitions provided by the seller: +{column_definitions} + +For each required column, find the best matching actual column. +A match is valid if the column names are semantically equivalent +(e.g. "transaction_id" ≈ "txn_id", "is_fraud" ≈ "fraud_label"). +Score schema_score as: matched_count / required_count (0.0 if none match, 1.0 if all match). + +TASK 2 — CLAIM VERIFICATION +The seller claims: +{seller_claims} + +Call get_column_stats or get_value_distribution to check each claim against real data. +Mark each claim as "verified", "disputed", or "unverifiable" (if no relevant column exists). +Score claim_veracity_score as: verified_count / total_claims (1.0 if no claims). 
+ +TASK 3 — EXPLANATION +Write a concise (3-5 sentence) neutral explanation covering: +- Which required columns were found/missing +- Whether seller's claims held up +- Any notable quality concerns from the deterministic metrics + +IMPORTANT: +- Only use aggregate stats from tools — never infer individual values +- Do not mention the buyer's budget, base price, or quality score +- Keep explanation under 400 words + +After calling the tools you need, output ONLY this JSON (no markdown fences, no prose): +{{ + "schema_score": 0.0-1.0, + "claim_veracity_score": 0.0-1.0, + "schema_matching": {{"required_col": "matched_col_or_null", ...}}, + "claim_verification": {{"claim_text": "verified|disputed|unverifiable", ...}}, + "explanation": "..." +}} +""" + + +def run_agent( + dataset_id: str, + policy: BuyerPolicy, + metrics: DatasetMetrics, + component_scores: dict[str, float], +) -> dict[str, Any]: + """ + Run the evaluate node for one dataset. + + Returns a dict with: + schema_score, claim_veracity_score, schema_matching, claim_verification, explanation + Falls back to safe defaults if the LLM output cannot be parsed. 
+ """ + from skills.confidential_data_procurement.ingest import get_dataset + + dataset = get_dataset(dataset_id) + column_definitions = dataset.get("column_definitions") or {} + seller_claims = dataset.get("seller_claims") or {} + + required_str = ", ".join(policy.required_columns) if policy.required_columns else "(none)" + definitions_str = ( + "\n".join(f" {col}: {defn}" for col, defn in column_definitions.items()) + if column_definitions else " (no definitions provided)" + ) + claims_str = ( + "\n".join(f" - {k}: {v}" for k, v in seller_claims.items()) + if seller_claims else " (no claims provided)" + ) + + system_content = _SYSTEM_PROMPT.format( + required_columns=required_str, + column_definitions=definitions_str, + seller_claims=claims_str, + ) + + # Build deterministic context note for the LLM + det_note = ( + f"Deterministic metrics already computed:\n" + f" rows={metrics.row_count}, " + f" overall_null_rate={metrics.overall_null_rate:.1%}, " + f" duplicate_rate={metrics.duplicate_rate:.1%}, " + f" hard_constraints_pass={metrics.hard_constraints_pass}" + ) + + llm = get_llm(EVALUATE_MODEL).bind_tools(EVALUATE_TOOLS) + messages = [ + SystemMessage(content=system_content), + HumanMessage(content=( + f"Evaluate the dataset now.\n\n{det_note}\n\n" + "Call get_schema_summary first, then any other tools you need, " + "then output the final JSON." 
+ )), + ] + + # Tool loop — LLM decides when to stop calling tools + max_iterations = 10 + response = None + for _ in range(max_iterations): + response = llm.invoke(messages) + messages.append(response) + if not (hasattr(response, "tool_calls") and response.tool_calls): + break + tool_node = ToolNode(EVALUATE_TOOLS) + tool_results = tool_node.invoke({"messages": messages}) + messages.extend(tool_results["messages"]) + + raw = response.content if response and isinstance(response.content, str) else "" + + # Nudge if LLM stopped without producing JSON + if raw.strip() and not _looks_like_json(raw): + messages.append(HumanMessage(content=( + "Now output ONLY the final JSON object with schema_score, " + "claim_veracity_score, schema_matching, claim_verification, and explanation." + ))) + response = llm.invoke(messages) + raw = response.content if isinstance(response.content, str) else "" + + return _parse_agent_output(raw, policy, seller_claims) + + +# --------------------------------------------------------------------------- +# Parsers +# --------------------------------------------------------------------------- + +def _looks_like_json(text: str) -> bool: + return bool(re.search(r'\{', text)) + + +def _parse_agent_output( + text: str, + policy: BuyerPolicy, + seller_claims: dict, +) -> dict[str, Any]: + """Extract agent JSON from LLM response. 
Falls back to safe defaults.""" + text = text.strip() + # Strip markdown fences + if text.startswith("```"): + lines = text.splitlines() + inner = lines[1:-1] if lines[-1].strip() == "```" else lines[1:] + text = "\n".join(inner).strip() + + obj = None + match = re.search(r'\{', text) + if match: + start = match.start() + depth = 0 + in_str = False + escape = False + end = -1 + for i in range(start, len(text)): + c = text[i] + if escape: + escape = False + continue + if c == "\\" and in_str: + escape = True + continue + if c == '"': + in_str = not in_str + if not in_str: + if c == "{": + depth += 1 + elif c == "}": + depth -= 1 + if depth == 0: + end = i + 1 + break + if end != -1: + try: + obj = json.loads(text[start:end]) + except (json.JSONDecodeError, ValueError): + obj = None + + if not obj: + return _safe_defaults(policy, seller_claims) + + schema_score = float(obj.get("schema_score") or 0.5) + schema_score = max(0.0, min(1.0, schema_score)) + + claim_veracity_score = float(obj.get("claim_veracity_score") or 1.0) + claim_veracity_score = max(0.0, min(1.0, claim_veracity_score)) + + return { + "schema_score": schema_score, + "claim_veracity_score": claim_veracity_score, + "schema_matching": obj.get("schema_matching") or {}, + "claim_verification": obj.get("claim_verification") or {}, + "explanation": str(obj.get("explanation") or ""), + } + + +def _safe_defaults(policy: BuyerPolicy, seller_claims: dict) -> dict[str, Any]: + """Return conservative defaults when agent output cannot be parsed.""" + return { + "schema_score": 0.5, + "claim_veracity_score": 1.0, + "schema_matching": {col: None for col in policy.required_columns}, + "claim_verification": {k: "unverifiable" for k in seller_claims}, + "explanation": "Automated evaluation completed. 
Schema and claim verification results unavailable.", + } diff --git a/skills/confidential_data_procurement/tools.py b/skills/confidential_data_procurement/tools.py new file mode 100644 index 0000000..0966a5e --- /dev/null +++ b/skills/confidential_data_procurement/tools.py @@ -0,0 +1,140 @@ +""" +Aggregate-only data tools for the confidential_data_procurement evaluate node. + +Security: + - Tools NEVER return raw rows or individual cell values. + - All output passes through validate_tool_output() before leaving the tool. + - The LLM sees aggregate statistics only — it cannot reconstruct the dataset. + +Tools: + 1. get_schema_summary() — column names, dtypes, null rates, row count + 2. get_column_stats(column_name) — numeric: min/max/mean/std; categorical: top-5 counts + 3. get_value_distribution(col, n) — top-N value counts + distinct count + +What to edit here: + - Add a new tool: define @tool function, add to EVALUATE_TOOLS. + - Change cardinality / size limits: update constants in guardrails.py. +""" +from __future__ import annotations + +from langchain_core.tools import tool + +from skills.confidential_data_procurement.guardrails import validate_tool_output + +# --------------------------------------------------------------------------- +# Module-level context — set by set_context() in __init__.py before agent runs +# --------------------------------------------------------------------------- + +_dataset_id: str = "" +_policy_context: dict = {} # required_columns, column_definitions, seller_claims + + +def set_context(dataset_id: str, policy_context: dict) -> None: + """Bind the active dataset and policy context for this evaluation run. + Called by run_skill() before run_agent(). 
+ """ + global _dataset_id, _policy_context + _dataset_id = dataset_id + _policy_context = policy_context + + +def _get_df(): + from skills.confidential_data_procurement.ingest import get_dataset + return get_dataset(_dataset_id)["df"] + + +# --------------------------------------------------------------------------- +# Tools +# --------------------------------------------------------------------------- + +@tool +def get_schema_summary() -> str: + """ + Get a summary of the dataset schema. + + Returns: column names, data types, null rate per column, row count, column count. + Call this first to understand what columns are present and their data quality. + """ + df = _get_df() + lines = [f"rows: {len(df)}", f"columns ({len(df.columns)}):"] + for col in df.columns: + dtype = str(df[col].dtype) + null_rate = float(df[col].isna().mean()) + lines.append(f" {col}: dtype={dtype}, null_rate={null_rate:.1%}") + return validate_tool_output("\n".join(lines)) + + +@tool +def get_column_stats(column_name: str) -> str: + """ + Get aggregate statistics for a single column. + + Numeric columns: min, max, mean, median, std, non-null count. + Categorical columns: total distinct values, top-5 most frequent values with counts. + Returns an error if the column does not exist. + """ + df = _get_df() + if column_name not in df.columns: + available = ", ".join(list(df.columns)[:10]) + return f"Column '{column_name}' not found. 
Available columns: {available}" + + col = df[column_name].dropna() + if len(col) == 0: + return validate_tool_output(f"column: {column_name}\nAll values are null.") + + if col.dtype.kind in ("i", "f", "u"): + output = ( + f"column: {column_name} (numeric)\n" + f"count: {len(col)}\n" + f"min: {col.min():.4g}\n" + f"max: {col.max():.4g}\n" + f"mean: {col.mean():.4g}\n" + f"median: {col.median():.4g}\n" + f"std: {col.std():.4g}" + ) + else: + top = col.value_counts().head(5) + top_lines = "\n".join(f" {v}: {c}" for v, c in top.items()) + output = ( + f"column: {column_name} (categorical)\n" + f"count: {len(col)}\n" + f"distinct: {col.nunique()}\n" + f"top-5:\n{top_lines}" + ) + + return validate_tool_output(output) + + +@tool +def get_value_distribution(column_name: str, top_n: int = 10) -> str: + """ + Get the top-N most frequent values for a column with their counts and percentages. + + top_n is capped at 20. Use this to assess label distribution (e.g. fraud rate), + category balance, or unusual value concentration. Returns total distinct count too. + """ + df = _get_df() + if column_name not in df.columns: + return f"Column '{column_name}' not found." 
+ + top_n = min(max(top_n, 1), 20) + col = df[column_name].dropna() + total = len(col) + total_distinct = col.nunique() + top = col.value_counts().head(top_n) + + lines = [ + f"column: {column_name}", + f"total non-null: {total}", + f"total distinct: {total_distinct}", + f"top-{top_n}:", + ] + for val, count in top.items(): + pct = count / total * 100 if total > 0 else 0 + lines.append(f" {val}: {count} ({pct:.1f}%)") + + return validate_tool_output("\n".join(lines)) + + +# Tool group — bound to evaluate_node in agent.py +EVALUATE_TOOLS = [get_schema_summary, get_column_stats, get_value_distribution] diff --git a/tests/test_data_procurement.py b/tests/test_data_procurement.py index 742da22..339c13e 100644 --- a/tests/test_data_procurement.py +++ b/tests/test_data_procurement.py @@ -733,3 +733,98 @@ def test_metadata_serializable(self): def test_threshold_is_one(self): assert skill_card.config["min_submissions"] == 1 + + +# --------------------------------------------------------------------------- +# Agent layer +# --------------------------------------------------------------------------- + +from skills.confidential_data_procurement.agent import _parse_agent_output, _safe_defaults +from skills.confidential_data_procurement.tools import ( + get_column_stats, + get_schema_summary, + get_value_distribution, + set_context, +) + + +_AGENT_JSON = ( + '{"schema_score": 0.8, "claim_veracity_score": 0.9, ' + '"schema_matching": {"transaction_id": "txn_id", "amount": "amount"}, ' + '"claim_verification": {"no_nulls": "disputed"}, ' + '"explanation": "Dataset looks reasonable."}' +) + + +class TestParseAgentOutput: + def test_valid_json_extracted(self): + policy = _make_policy() + result = _parse_agent_output(_AGENT_JSON, policy, {"no_nulls": "true"}) + assert result["schema_score"] == pytest.approx(0.8) + assert result["claim_veracity_score"] == pytest.approx(0.9) + assert result["schema_matching"]["transaction_id"] == "txn_id" + assert result["explanation"] == "Dataset 
looks reasonable." + + def test_clamped_scores(self): + policy = _make_policy() + bad = '{"schema_score": 2.5, "claim_veracity_score": -0.1, "explanation": "x"}' + result = _parse_agent_output(bad, policy, {}) + assert result["schema_score"] == 1.0 + assert result["claim_veracity_score"] == 0.0 + + def test_markdown_fences_stripped(self): + policy = _make_policy() + wrapped = f"```json\n{_AGENT_JSON}\n```" + result = _parse_agent_output(wrapped, policy, {}) + assert result["schema_score"] == pytest.approx(0.8) + + def test_unparseable_returns_defaults(self): + policy = _make_policy() + result = _parse_agent_output("Sorry, I could not evaluate.", policy, {"claim": "x"}) + assert result["schema_score"] == 0.5 + assert result["claim_veracity_score"] == 1.0 + + def test_safe_defaults_structure(self): + policy = _make_policy() + result = _safe_defaults(policy, {"low_nulls": "true"}) + assert "schema_matching" in result + assert "claim_verification" in result + assert result["claim_verification"]["low_nulls"] == "unverifiable" + + +class TestTools: + def setup_method(self): + self.df = _make_df(rows=50) + self.dataset_id = _register_df(self.df) + set_context(self.dataset_id, { + "required_columns": ["transaction_id", "amount"], + "column_definitions": {}, + "seller_claims": {}, + }) + + def teardown_method(self): + from skills.confidential_data_procurement.ingest import cleanup + cleanup(self.dataset_id) + + def test_schema_summary_passes_validator(self): + result = get_schema_summary.invoke({}) + assert "transaction_id" in result + assert "rows:" in result + + def test_column_stats_numeric(self): + result = get_column_stats.invoke({"column_name": "amount"}) + assert "numeric" in result + assert "mean" in result + + def test_column_stats_missing_column(self): + result = get_column_stats.invoke({"column_name": "nonexistent"}) + assert "not found" in result.lower() + + def test_value_distribution(self): + result = get_value_distribution.invoke({"column_name": "is_fraud", 
"top_n": 5}) + assert "is_fraud" in result + assert "distinct" in result + + def test_value_distribution_capped_at_20(self): + result = get_value_distribution.invoke({"column_name": "amount", "top_n": 999}) + assert "top-20" in result From f7946046a5fbffaaafeceeff7345f9454105e2fb Mon Sep 17 00:00:00 2001 From: Parth Thapliyal Date: Sun, 22 Mar 2026 16:33:37 -0400 Subject: [PATCH 08/13] feat: deal response handler with one-round renegotiation (3x3 matrix) procurement_respond_handler implements the full resolution matrix: both-accept issues release_token, any-reject terminates, accept+renegotiate resolves at proposed_payment, double-renegotiate checks if revised_budget >= revised_reserve. One round enforced via renegotiation_used flag. Wired into skill_card. --- .../confidential_data_procurement/__init__.py | 128 ++++++++++++++- tests/test_data_procurement.py | 148 ++++++++++++++++++ 2 files changed, 273 insertions(+), 3 deletions(-) diff --git a/skills/confidential_data_procurement/__init__.py b/skills/confidential_data_procurement/__init__.py index b178e65..40181cc 100644 --- a/skills/confidential_data_procurement/__init__.py +++ b/skills/confidential_data_procurement/__init__.py @@ -4,18 +4,21 @@ Pipeline (per submission — threshold=1, so always exactly one): 0. ingest.py — CSV parse + metadata parse (no LLM) 1. deterministic.py — quality metrics, component scores, price, deal check (no LLM) - 2. agent.py — schema matching, claim verification, explanation (LLM) [Commit 7] + 2. agent.py — schema matching, claim verification, explanation (LLM) 3. guardrails.py — role-aware key filter, score clamping, leakage detection + 4. 
respond_handler — deal response + one-round renegotiation (3×3 resolution matrix) What to edit here: - run_skill(): change how deterministic + agent results merge +- respond_handler / _resolve(): update renegotiation logic - skill_card: update description, config, trigger_modes, roles, user_display The skill_card is consumed by the SkillRouter and the /skills API endpoint. -respond_handler will be added in Commit 8 (renegotiation). """ from __future__ import annotations +import secrets as _secrets + from core.models import SkillResponse from core.skill_card import SkillCard from skills.confidential_data_procurement.agent import run_agent @@ -109,6 +112,125 @@ def run_skill(inputs: list[SupplierSubmission], params: BuyerPolicy) -> SkillRes return SkillResponse(skill="confidential_data_procurement", results=filtered) +def procurement_respond_handler( + result: dict, + action: str, + revised_value: float | None, + role: str, # "buyer" or "supplier" (mapped from "admin"/"user" in routes.py) + policy: BuyerPolicy, +) -> dict: + """ + Process one deal response and advance the settlement state machine. + + 3×3 resolution matrix (B = buyer, S = supplier): + + B \\ S | accept | reject | renegotiate + --------|--------|--------|------------ + accept | auth | reject | auth* + reject | reject | reject | reject + reneg | auth* | reject | check† + + * auth at proposed_payment — the acceptor already committed + † auth if revised_budget >= revised_reserve, else rejected + + One renegotiation round only — ValueError if renegotiation_used is True. + revised_value is required when action='renegotiate'. + """ + result = dict(result) # shallow copy — don't mutate caller's dict + + if action == "renegotiate": + if result.get("renegotiation_used"): + raise ValueError("Renegotiation already used. 
Only one round is allowed.") + if revised_value is None: + raise ValueError("revised_value is required when action='renegotiate'.") + if role == "buyer": + revised_value = float(revised_value) + if revised_value < (policy.base_price or 0.0): + raise ValueError( + f"Revised payment (${revised_value:,.2f}) cannot be below " + f"base price (${policy.base_price:,.2f})." + ) + if revised_value > policy.max_budget: + raise ValueError( + f"Revised payment (${revised_value:,.2f}) cannot exceed " + f"max budget (${policy.max_budget:,.2f})." + ) + else: # supplier + revised_value = float(revised_value) + if revised_value < 0: + raise ValueError("Revised reserve price cannot be negative.") + + # Store this party's response + if role == "buyer": + result["buyer_response"] = action + if action == "renegotiate": + result["revised_budget"] = revised_value + else: + result["supplier_response"] = action + if action == "renegotiate": + result["revised_reserve"] = revised_value + + # If both parties have now responded, resolve; otherwise await counterparty + buyer_resp = result.get("buyer_response") + supplier_resp = result.get("supplier_response") + + if buyer_resp is None or supplier_resp is None: + result["settlement_status"] = "awaiting_counterparty" + return result + + return _resolve(result) + + +def _resolve(result: dict) -> dict: + """Apply the 3×3 matrix once both buyer_response and supplier_response are set.""" + buyer_resp = result["buyer_response"] + supplier_resp = result["supplier_response"] + + # Any reject → deal off + if buyer_resp == "reject" or supplier_resp == "reject": + result["settlement_status"] = "rejected" + result["deal"] = False + return result + + # Both accept → authorized + if buyer_resp == "accept" and supplier_resp == "accept": + result["settlement_status"] = "authorized" + result["deal"] = True + result["release_token"] = _secrets.token_urlsafe(16) + return result + + # One accepts + other renegotiates → honor the acceptor's bound (proposed_payment) 
+ if buyer_resp == "accept" or supplier_resp == "accept": + result["settlement_status"] = "authorized" + result["deal"] = True + result["renegotiation_used"] = True + result["release_token"] = _secrets.token_urlsafe(16) + return result + + # Both renegotiate → check if revised terms meet + result["renegotiation_used"] = True + revised_budget = float(result.get("revised_budget") or result.get("proposed_payment") or 0) + revised_reserve = float(result.get("revised_reserve") or 0) + + if revised_budget >= revised_reserve: + result["settlement_status"] = "authorized" + result["deal"] = True + result["proposed_payment"] = revised_budget + result["release_token"] = _secrets.token_urlsafe(16) + else: + result["settlement_status"] = "rejected" + result["deal"] = False + note = ( + f"Renegotiation failed: buyer's revised offer (${revised_budget:,.2f}) " + f"is below supplier's revised reserve (${revised_reserve:,.2f})." + ) + notes = list(result.get("notes") or []) + notes.append(note) + result["notes"] = notes + + return result + + skill_card = SkillCard( name="confidential_data_procurement", description=( @@ -172,7 +294,7 @@ def run_skill(inputs: list[SupplierSubmission], params: BuyerPolicy) -> SkillRes ), init_handler=procurement_init_handler, upload_handler=procurement_upload_handler, - respond_handler=None, # Commit 8 + respond_handler=procurement_respond_handler, user_display={ "deal": {"type": "badge", "label": "Deal Status"}, "quality_score": {"type": "gauge", "label": "Quality Score", "min": 0, "max": 1}, diff --git a/tests/test_data_procurement.py b/tests/test_data_procurement.py index 339c13e..3fdf7f4 100644 --- a/tests/test_data_procurement.py +++ b/tests/test_data_procurement.py @@ -734,6 +734,10 @@ def test_metadata_serializable(self): def test_threshold_is_one(self): assert skill_card.config["min_submissions"] == 1 + def test_respond_handler_registered(self): + from skills.confidential_data_procurement import procurement_respond_handler + assert 
skill_card.respond_handler is procurement_respond_handler + # --------------------------------------------------------------------------- # Agent layer @@ -828,3 +832,147 @@ def test_value_distribution(self): def test_value_distribution_capped_at_20(self): result = get_value_distribution.invoke({"column_name": "amount", "top_n": 999}) assert "top-20" in result + + +# --------------------------------------------------------------------------- +# respond_handler + renegotiation (3×3 matrix) +# --------------------------------------------------------------------------- + +from skills.confidential_data_procurement import procurement_respond_handler + + +def _base_result(deal=True) -> dict: + return { + "submission_id": "sub-1", + "deal": deal, + "quality_score": 0.75, + "proposed_payment": 3000.0, + "hard_constraints_pass": True, + "settlement_status": "pending_approval" if deal else "rejected", + "release_token": None, + "notes": [], + "explanation": None, + "claim_verification": None, + "schema_matching": None, + "buyer_response": None, + "supplier_response": None, + "renegotiation_used": False, + "revised_budget": None, + "revised_reserve": None, + } + + +class TestRespondHandler: + # --- First response only → awaiting_counterparty --- + + def test_first_buyer_response_awaits_counterparty(self): + r = procurement_respond_handler(_base_result(), "accept", None, "buyer", _make_policy()) + assert r["settlement_status"] == "awaiting_counterparty" + assert r["buyer_response"] == "accept" + assert r["supplier_response"] is None + + def test_first_supplier_response_awaits_counterparty(self): + r = procurement_respond_handler(_base_result(), "accept", None, "supplier", _make_policy()) + assert r["settlement_status"] == "awaiting_counterparty" + assert r["supplier_response"] == "accept" + + # --- Both accept → authorized --- + + def test_both_accept_authorized(self): + result = _base_result() + result["buyer_response"] = "accept" + r = procurement_respond_handler(result, 
"accept", None, "supplier", _make_policy()) + assert r["settlement_status"] == "authorized" + assert r["deal"] is True + assert r["release_token"] is not None + + # --- Any reject → rejected --- + + def test_buyer_reject_rejected(self): + result = _base_result() + result["supplier_response"] = "accept" + r = procurement_respond_handler(result, "reject", None, "buyer", _make_policy()) + assert r["settlement_status"] == "rejected" + assert r["deal"] is False + + def test_supplier_reject_rejected(self): + result = _base_result() + result["buyer_response"] = "accept" + r = procurement_respond_handler(result, "reject", None, "supplier", _make_policy()) + assert r["settlement_status"] == "rejected" + + def test_both_reject_rejected(self): + result = _base_result() + result["buyer_response"] = "reject" + r = procurement_respond_handler(result, "reject", None, "supplier", _make_policy()) + assert r["settlement_status"] == "rejected" + + def test_renegotiate_then_reject_rejected(self): + result = _base_result() + result["buyer_response"] = "renegotiate" + result["revised_budget"] = 2500.0 + result["renegotiation_used"] = False + r = procurement_respond_handler(result, "reject", None, "supplier", _make_policy()) + assert r["settlement_status"] == "rejected" + + # --- accept + renegotiate → authorized at proposed_payment --- + + def test_buyer_accept_supplier_renegotiate_authorized(self): + result = _base_result() + result["buyer_response"] = "accept" + r = procurement_respond_handler(result, "renegotiate", 3500.0, "supplier", _make_policy()) + assert r["settlement_status"] == "authorized" + assert r["renegotiation_used"] is True + assert r["release_token"] is not None + + def test_supplier_accept_buyer_renegotiate_authorized(self): + result = _base_result() + result["supplier_response"] = "accept" + r = procurement_respond_handler(result, "renegotiate", 2500.0, "buyer", _make_policy()) + assert r["settlement_status"] == "authorized" + assert r["renegotiation_used"] is True + 
+ # --- Both renegotiate --- + + def test_both_renegotiate_deal_succeeds(self): + result = _base_result() + result["buyer_response"] = "renegotiate" + result["revised_budget"] = 3000.0 + r = procurement_respond_handler(result, "renegotiate", 2500.0, "supplier", _make_policy()) + assert r["settlement_status"] == "authorized" + assert r["proposed_payment"] == 3000.0 + assert r["renegotiation_used"] is True + + def test_both_renegotiate_deal_fails(self): + result = _base_result() + result["buyer_response"] = "renegotiate" + result["revised_budget"] = 1000.0 + r = procurement_respond_handler(result, "renegotiate", 2000.0, "supplier", _make_policy()) + assert r["settlement_status"] == "rejected" + assert r["deal"] is False + assert any("renegotiation failed" in n.lower() for n in r["notes"]) + + # --- Validation errors --- + + def test_second_renegotiation_raises(self): + result = _base_result() + result["renegotiation_used"] = True + result["buyer_response"] = "renegotiate" + with pytest.raises(ValueError, match="already used"): + procurement_respond_handler(result, "renegotiate", 2000.0, "supplier", _make_policy()) + + def test_renegotiate_without_value_raises(self): + with pytest.raises(ValueError, match="revised_value is required"): + procurement_respond_handler(_base_result(), "renegotiate", None, "buyer", _make_policy()) + + def test_buyer_revised_above_budget_raises(self): + with pytest.raises(ValueError, match="max budget"): + procurement_respond_handler( + _base_result(), "renegotiate", 99999.0, "buyer", _make_policy() + ) + + def test_supplier_negative_reserve_raises(self): + with pytest.raises(ValueError, match="negative"): + procurement_respond_handler( + _base_result(), "renegotiate", -100.0, "supplier", _make_policy() + ) From 44d69c9cb53ec381e3c41027b3ef1b224e5a042c Mon Sep 17 00:00:00 2001 From: Parth Thapliyal Date: Sun, 22 Mar 2026 16:37:55 -0400 Subject: [PATCH 09/13] =?UTF-8?q?feat:=20procurement=20E2E=20tests=20?= 
=?UTF-8?q?=E2=80=94=20full=20API=20flow,=20role=20filtering,=20renegotiat?= =?UTF-8?q?ion=20matrix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Covers happy path (init→upload→submit→accept→authorized), critical reject, role-filtered results (buyer vs supplier), double-renegotiate success/failure, mixed accept+renegotiate, second-renegotiation guard, token enforcement, and skill registration. Requires python-multipart for multipart form upload. --- tests/test_procurement_e2e.py | 450 ++++++++++++++++++++++++++++++++++ 1 file changed, 450 insertions(+) create mode 100644 tests/test_procurement_e2e.py diff --git a/tests/test_procurement_e2e.py b/tests/test_procurement_e2e.py new file mode 100644 index 0000000..dac73ba --- /dev/null +++ b/tests/test_procurement_e2e.py @@ -0,0 +1,450 @@ +""" +E2E tests for the confidential_data_procurement skill. + +Validates API plumbing: token auth, upload, submit, role-filtered results, +deal responses, and renegotiation. LLM + deterministic pipeline are mocked +so no API keys or credits are needed. + +Scenarios: + 1. Happy path: init → register → upload → submit → accept → authorized + 2. Critical reject: forbidden column CSV → immediate rejection, no LLM + 3. Role filtering: buyer sees quality_score, supplier does not + 4. Renegotiate: double-renegotiate with overlapping terms → authorized + 5. Mixed respond: buyer accept + supplier renegotiate → authorized + 6. 
Token enforcement: missing/invalid token → 401/403 +""" +from __future__ import annotations + +import json + +import pytest +from fastapi.testclient import TestClient +from unittest.mock import patch + +import api.routes as routes +from core.models import SkillResponse +from skills.confidential_data_procurement import skill_card as proc_card +from skills.confidential_data_procurement.models import BuyerPolicy + + +# --------------------------------------------------------------------------- +# Test data +# --------------------------------------------------------------------------- + +_GOOD_CSV = ( + b"transaction_id,amount,is_fraud\n" + + b"".join( + f"txn_{i:04d},{i * 10.5:.2f},{1 if i % 25 == 0 else 0}\n".encode() + for i in range(200) + ) +) + +_BAD_CSV = ( + b"transaction_id,amount,is_fraud,ssn\n" + + b"".join( + f"txn_{i:04d},{i * 10.5:.2f},{1 if i % 25 == 0 else 0},xxx-xx-0000\n".encode() + for i in range(50) + ) +) + +_METADATA_JSON = json.dumps({ + "column_definitions": { + "transaction_id": "Unique ID for each transaction", + "amount": "Transaction amount in USD", + "is_fraud": "1 if fraudulent, 0 otherwise", + }, + "seller_claims": { + "balanced_labels": "Approximately 4% fraud rate", + "no_missing_values": "All fields fully populated", + }, +}).encode() + +_POLICY = BuyerPolicy( + required_columns=["transaction_id", "amount", "is_fraud"], + min_rows=100, + max_null_rate=0.05, + max_duplicate_rate=0.10, + min_label_rate=0.02, + label_column="is_fraud", + forbidden_columns=["ssn", "dob"], + max_budget=5000.0, + base_price=500.0, +) + + +# --------------------------------------------------------------------------- +# Fakes +# --------------------------------------------------------------------------- + +def _make_init_handler(policy: BuyerPolicy = _POLICY): + """Stateful mock: turn 1 → configuring, turn 2 → ready with BuyerPolicy.""" + calls = [] + + def handler(message, conversation): + calls.append(message) + conv = list(conversation) + [{"role": "human", 
"content": message}] + if len(calls) == 1: + conv.append({"role": "ai", "content": "Please describe your dataset requirements."}) + return { + "status": "configuring", + "message": "Please describe your dataset requirements.", + "conversation": conv, + } + conv.append({"role": "ai", "content": "Policy saved."}) + return { + "status": "ready", + "message": "Policy saved.", + "conversation": conv, + "config": policy, + "threshold": 1, + } + + return handler + + +def _fake_run_deal(inputs, params): + return SkillResponse( + skill="confidential_data_procurement", + results=[{ + "submission_id": inputs[0].submission_id, + "deal": True, + "quality_score": 0.82, + "proposed_payment": 3500.0, + "hard_constraints_pass": True, + "settlement_status": "pending_approval", + "release_token": None, + "notes": [], + "explanation": "Dataset meets all requirements.", + "claim_verification": {"balanced_labels": "verified"}, + "schema_matching": {"transaction_id": "transaction_id"}, + "buyer_response": None, + "supplier_response": None, + "renegotiation_used": False, + }], + ) + + +def _fake_run_rejected(inputs, params): + return SkillResponse( + skill="confidential_data_procurement", + results=[{ + "submission_id": inputs[0].submission_id, + "deal": False, + "quality_score": 0.0, + "proposed_payment": 500.0, + "hard_constraints_pass": False, + "settlement_status": "rejected", + "release_token": None, + "notes": ["Forbidden column 'ssn' detected. 
Deal rejected."], + "explanation": None, + "claim_verification": None, + "schema_matching": None, + "buyer_response": None, + "supplier_response": None, + "renegotiation_used": False, + }], + ) + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +@pytest.fixture(autouse=True) +def clear_stores(): + """Reset all in-memory state before each test.""" + routes._instances.clear() + routes._submissions.clear() + routes._results.clear() + routes._tokens.clear() + routes._registrations.clear() + from skills.confidential_data_procurement.ingest import _datasets + _datasets.clear() + yield + _datasets.clear() + + +@pytest.fixture +def client(): + from main import app + return TestClient(app) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _init_procurement(client, policy=_POLICY): + """Run the two-turn init flow and return (instance_id, admin_token).""" + handler = _make_init_handler(policy) + with patch.object(proc_card, "init_handler", handler): + r = client.post("/init", json={"skill_name": "confidential_data_procurement", "message": "setup"}) + assert r.status_code == 200 + instance_id = r.json()["instance_id"] + + r = client.post("/init", json={ + "skill_name": "confidential_data_procurement", + "message": "transaction_id, amount, is_fraud, budget 5000", + "instance_id": instance_id, + }) + assert r.status_code == 200 + assert r.json()["status"] == "ready" + return instance_id, r.json()["admin_token"] + + +def _register(client, instance_id): + r = client.post("/register", json={"instance_id": instance_id}) + assert r.status_code == 200 + return r.json()["user_token"] + + +def _upload(client, user_token, csv_bytes=_GOOD_CSV, metadata_bytes=_METADATA_JSON): + r = client.post( + "/upload", + files={ + "csv_file": 
("dataset.csv", csv_bytes, "text/csv"), + "metadata_file": ("metadata.json", metadata_bytes, "application/json"), + }, + headers={"X-Instance-Token": user_token}, + ) + assert r.status_code == 200, r.text + return r.json()["dataset_id"] + + +def _submit(client, user_token, dataset_id, sub_id="sub-001", reserve=1000.0): + r = client.post( + "/submit", + json={ + "submission_id": sub_id, + "dataset_id": dataset_id, + "dataset_name": "fraud_dataset.csv", + "reserve_price": reserve, + }, + headers={"X-Instance-Token": user_token}, + ) + assert r.status_code == 200 + return r.json() + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + +def test_procurement_happy_path_both_accept(client): + """Full happy path: init → upload → submit → both accept → authorized.""" + instance_id, admin_token = _init_procurement(client) + user_token = _register(client, instance_id) + dataset_id = _upload(client, user_token) + + with patch.object(proc_card, "run", _fake_run_deal): + resp = _submit(client, user_token, dataset_id) + assert resp["status"] == "received_analysis_complete" + + # Buyer views result (should see quality_score) + r = client.get("/results/sub-001", headers={"X-Instance-Token": admin_token}) + assert r.status_code == 200 + result = r.json() + assert result["deal"] is True + assert "quality_score" in result + assert result["settlement_status"] == "pending_approval" + + # Buyer accepts + r = client.post("/respond", json={ + "submission_id": "sub-001", + "action": "accept", + }, headers={"X-Instance-Token": admin_token}) + assert r.status_code == 200 + assert r.json()["settlement_status"] == "awaiting_counterparty" + + # Supplier accepts + r = client.post("/respond", json={ + "submission_id": "sub-001", + "action": "accept", + }, headers={"X-Instance-Token": user_token}) + assert r.status_code == 200 + assert r.json()["settlement_status"] == "authorized" 
+ + # Final result should have release_token + r = client.get("/results/sub-001", headers={"X-Instance-Token": admin_token}) + assert r.json()["release_token"] is not None + assert r.json()["settlement_status"] == "authorized" + + +def test_procurement_critical_reject(client): + """Forbidden column CSV → settlement_status='rejected' immediately.""" + instance_id, admin_token = _init_procurement(client) + user_token = _register(client, instance_id) + dataset_id = _upload(client, user_token, csv_bytes=_BAD_CSV) + + with patch.object(proc_card, "run", _fake_run_rejected): + _submit(client, user_token, dataset_id) + + r = client.get("/results/sub-001", headers={"X-Instance-Token": admin_token}) + assert r.status_code == 200 + result = r.json() + assert result["deal"] is False + assert result["settlement_status"] == "rejected" + assert len(result["notes"]) > 0 + + +def test_procurement_role_filtering(client): + """Buyer sees quality_score; supplier does not.""" + instance_id, admin_token = _init_procurement(client) + user_token = _register(client, instance_id) + dataset_id = _upload(client, user_token) + + with patch.object(proc_card, "run", _fake_run_deal): + _submit(client, user_token, dataset_id) + + buyer_result = client.get( + "/results/sub-001", headers={"X-Instance-Token": admin_token} + ).json() + supplier_result = client.get( + "/results/sub-001", headers={"X-Instance-Token": user_token} + ).json() + + assert "quality_score" in buyer_result + assert "hard_constraints_pass" in buyer_result + assert "quality_score" not in supplier_result + assert "hard_constraints_pass" not in supplier_result + + # Both should see proposed_payment and deal + assert "proposed_payment" in supplier_result + assert "deal" in supplier_result + + +def test_procurement_double_renegotiate_success(client): + """Both renegotiate with overlapping terms → authorized.""" + instance_id, admin_token = _init_procurement(client) + user_token = _register(client, instance_id) + dataset_id = 
_upload(client, user_token) + + with patch.object(proc_card, "run", _fake_run_deal): + _submit(client, user_token, dataset_id) + + # Buyer renegotiates down to 3000 + r = client.post("/respond", json={ + "submission_id": "sub-001", + "action": "renegotiate", + "revised_value": 3000.0, + }, headers={"X-Instance-Token": admin_token}) + assert r.status_code == 200 + assert r.json()["settlement_status"] == "awaiting_counterparty" + + # Supplier renegotiates reserve down to 2500 (< buyer's 3000 → deal) + r = client.post("/respond", json={ + "submission_id": "sub-001", + "action": "renegotiate", + "revised_value": 2500.0, + }, headers={"X-Instance-Token": user_token}) + assert r.status_code == 200 + assert r.json()["settlement_status"] == "authorized" + + r = client.get("/results/sub-001", headers={"X-Instance-Token": admin_token}) + assert r.json()["proposed_payment"] == 3000.0 + + +def test_procurement_double_renegotiate_failure(client): + """Both renegotiate but terms don't meet → rejected.""" + instance_id, admin_token = _init_procurement(client) + user_token = _register(client, instance_id) + dataset_id = _upload(client, user_token) + + with patch.object(proc_card, "run", _fake_run_deal): + _submit(client, user_token, dataset_id) + + client.post("/respond", json={ + "submission_id": "sub-001", "action": "renegotiate", "revised_value": 1000.0, + }, headers={"X-Instance-Token": admin_token}) + + r = client.post("/respond", json={ + "submission_id": "sub-001", "action": "renegotiate", "revised_value": 2000.0, + }, headers={"X-Instance-Token": user_token}) + assert r.json()["settlement_status"] == "rejected" + + +def test_procurement_buyer_accept_supplier_renegotiate(client): + """Buyer accepts, supplier renegotiates → authorized (acceptor's bound honored).""" + instance_id, admin_token = _init_procurement(client) + user_token = _register(client, instance_id) + dataset_id = _upload(client, user_token) + + with patch.object(proc_card, "run", _fake_run_deal): + 
_submit(client, user_token, dataset_id) + + client.post("/respond", json={ + "submission_id": "sub-001", "action": "accept", + }, headers={"X-Instance-Token": admin_token}) + + r = client.post("/respond", json={ + "submission_id": "sub-001", "action": "renegotiate", "revised_value": 4000.0, + }, headers={"X-Instance-Token": user_token}) + assert r.json()["settlement_status"] == "authorized" + + +def test_procurement_second_renegotiation_rejected(client): + """Second renegotiation attempt returns 422.""" + instance_id, admin_token = _init_procurement(client) + user_token = _register(client, instance_id) + dataset_id = _upload(client, user_token) + + with patch.object(proc_card, "run", _fake_run_deal): + _submit(client, user_token, dataset_id) + + # First renegotiation + client.post("/respond", json={ + "submission_id": "sub-001", "action": "renegotiate", "revised_value": 3000.0, + }, headers={"X-Instance-Token": admin_token}) + client.post("/respond", json={ + "submission_id": "sub-001", "action": "renegotiate", "revised_value": 2500.0, + }, headers={"X-Instance-Token": user_token}) + + # Attempt second renegotiation — should fail + r = client.post("/respond", json={ + "submission_id": "sub-001", "action": "renegotiate", "revised_value": 2000.0, + }, headers={"X-Instance-Token": admin_token}) + assert r.status_code == 422 + + +def test_procurement_missing_token_401(client): + """No token header → 401.""" + r = client.post("/submit", json={ + "submission_id": "sub-001", "dataset_id": "x", + "dataset_name": "x.csv", "reserve_price": 100.0, + }) + assert r.status_code == 401 + + +def test_procurement_user_cannot_see_other_submission(client): + """User token cannot view a result it didn't submit.""" + instance_id, admin_token = _init_procurement(client) + user_a = _register(client, instance_id) + user_b = _register(client, instance_id) + + dataset_id = _upload(client, user_a) + with patch.object(proc_card, "run", _fake_run_deal): + _submit(client, user_a, dataset_id, 
sub_id="sub-a") + + r = client.get("/results/sub-a", headers={"X-Instance-Token": user_b}) + assert r.status_code == 403 + + +def test_procurement_upload_without_csv_returns_422(client): + """Upload with no csv_file field → 422.""" + instance_id, _ = _init_procurement(client) + user_token = _register(client, instance_id) + + r = client.post( + "/upload", + files={"metadata_file": ("meta.json", b"{}", "application/json")}, + headers={"X-Instance-Token": user_token}, + ) + assert r.status_code == 422 + + +def test_procurement_skill_appears_in_skills_list(client): + """New skill is registered and visible via GET /skills.""" + r = client.get("/skills") + assert r.status_code == 200 + names = [s["name"] for s in r.json()["skills"]] + assert "confidential_data_procurement" in names From 01cb068fe347294ded4863b01ba218aca5a93937 Mon Sep 17 00:00:00 2001 From: Parth Thapliyal Date: Sun, 22 Mar 2026 18:09:43 -0400 Subject: [PATCH 10/13] feat: LangGraph StateGraph upgrade + live integration test suite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - agent.py: wrap evaluate_node in StateGraph for LangSmith trace visibility - tests/conftest.py: @pytest.mark.live marker, base_df fixture, matrix printer + demo JSON output - tests/test_live_integration.py: 30 tests — deterministic, agent, pipeline, renegotiation - base_price=0 on all demo policies; critical failures and reserve-not-met → $0 or rejected - .gitignore: exclude tests/fixtures/ and tests/demo_matrix.json - ci.yml: add skills/dataset-procurement branch; requirements.txt: add python-multipart --- .github/workflows/ci.yml | 2 +- .gitignore | 2 + requirements.txt | 1 + skills/confidential_data_procurement/agent.py | 101 ++- tests/conftest.py | 186 +++++ tests/test_live_integration.py | 724 ++++++++++++++++++ 6 files changed, 991 insertions(+), 25 deletions(-) create mode 100644 tests/conftest.py create mode 100644 tests/test_live_integration.py diff --git 
a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 285f1b1..d2e46ae 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,7 +2,7 @@ name: CI on: push: - branches: [main, feat/agent-core] + branches: [main, feat/agent-core, skills/dataset-procurement] pull_request: branches: [main] diff --git a/.gitignore b/.gitignore index adebad5..7f1ad88 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,5 @@ venv/ /plans/ eval_results/ scripts/ +tests/fixtures/ +tests/demo_matrix.json diff --git a/requirements.txt b/requirements.txt index 1df45e7..7e9032e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,6 +13,7 @@ langgraph python-dotenv pytest httpx +python-multipart PyJWT>=2.8.0 supabase>=2.0.0 cryptography>=42.0.0 diff --git a/skills/confidential_data_procurement/agent.py b/skills/confidential_data_procurement/agent.py index bd53d8b..a84453c 100644 --- a/skills/confidential_data_procurement/agent.py +++ b/skills/confidential_data_procurement/agent.py @@ -1,31 +1,30 @@ """ Single evaluate_node agent for confidential_data_procurement. -The agent gets one LLM call (with tool loop) to: - 1. Read the dataset schema and column stats via tools - 2. Match the buyer's required columns to actual columns (semantic/fuzzy) - 3. Verify the seller's claims against observed statistics - 4. Write a bounded explanation for both parties - 5. Output schema_score and claim_veracity_score to replace deterministic placeholders +Graph: StateGraph with single evaluate_node → END. +Provides LangSmith trace visibility with proper node names, tool calls, and timing. The dataset never leaves the TEE — the LLM sees only aggregate statistics returned by the tools. validate_tool_output() in tools.py blocks raw row dumps. -Graph: single evaluate_node (no routing needed — one supplier, one dataset per run). 
+Graph: + evaluate_node (LLM + tools) → END """ from __future__ import annotations import json import re -from typing import Any +from typing import Any, Annotated, TypedDict -from langchain_core.messages import HumanMessage, SystemMessage +from langchain_core.messages import HumanMessage, SystemMessage, BaseMessage +from langgraph.graph import StateGraph, END +from langgraph.graph.message import add_messages from langgraph.prebuilt import ToolNode from config import get_llm from skills.confidential_data_procurement.config import EVALUATE_MODEL from skills.confidential_data_procurement.models import BuyerPolicy, DatasetMetrics -from skills.confidential_data_procurement.tools import EVALUATE_TOOLS +from skills.confidential_data_procurement.tools import EVALUATE_TOOLS, set_context EVALUATE_PROMPT_VERSION = "v1" @@ -82,25 +81,35 @@ """ -def run_agent( - dataset_id: str, - policy: BuyerPolicy, - metrics: DatasetMetrics, - component_scores: dict[str, float], -) -> dict[str, Any]: - """ - Run the evaluate node for one dataset. +class EvaluateState(TypedDict): + messages: Annotated[list[BaseMessage], add_messages] + dataset_id: str + policy: Any # BuyerPolicy — held in-memory, not serialized + metrics: Any # DatasetMetrics — held in-memory, not serialized + eval_result: dict - Returns a dict with: - schema_score, claim_veracity_score, schema_matching, claim_verification, explanation - Falls back to safe defaults if the LLM output cannot be parsed. 
- """ + +# --- Node --- + +def evaluate_node(state: EvaluateState) -> dict: + """LLM node: schema matching + claim verification + explanation with tool loop.""" from skills.confidential_data_procurement.ingest import get_dataset + dataset_id = state["dataset_id"] + policy: BuyerPolicy = state["policy"] + metrics: DatasetMetrics = state["metrics"] + dataset = get_dataset(dataset_id) column_definitions = dataset.get("column_definitions") or {} seller_claims = dataset.get("seller_claims") or {} + # Bind tools to the active dataset + set_context(dataset_id, { + "required_columns": policy.required_columns or [], + "column_definitions": column_definitions, + "seller_claims": seller_claims, + }) + required_str = ", ".join(policy.required_columns) if policy.required_columns else "(none)" definitions_str = ( "\n".join(f" {col}: {defn}" for col, defn in column_definitions.items()) @@ -117,7 +126,6 @@ def run_agent( seller_claims=claims_str, ) - # Build deterministic context note for the LLM det_note = ( f"Deterministic metrics already computed:\n" f" rows={metrics.row_count}, " @@ -157,9 +165,54 @@ def run_agent( "claim_veracity_score, schema_matching, claim_verification, and explanation." 
))) response = llm.invoke(messages) + messages.append(response) raw = response.content if isinstance(response.content, str) else "" - return _parse_agent_output(raw, policy, seller_claims) + parsed = _parse_agent_output(raw, policy, seller_claims) + return {"messages": messages, "eval_result": parsed} + + +# --- Graph builder --- + +def _build_evaluate_graph(): + """Build and compile the single-node StateGraph for dataset evaluation.""" + graph = StateGraph(EvaluateState) + graph.add_node("evaluate", evaluate_node) + graph.set_entry_point("evaluate") + graph.add_edge("evaluate", END) + return graph.compile() + + +# --- Entry point --- + +def run_agent( + dataset_id: str, + policy: BuyerPolicy, + metrics: DatasetMetrics, + component_scores: dict[str, float], +) -> dict[str, Any]: + """ + Run the evaluate node for one dataset. + + Returns a dict with: + schema_score, claim_veracity_score, schema_matching, claim_verification, explanation + Falls back to safe defaults if the LLM output cannot be parsed. + """ + graph = _build_evaluate_graph() + + initial_state: EvaluateState = { + "messages": [], + "dataset_id": dataset_id, + "policy": policy, + "metrics": metrics, + "eval_result": {}, + } + + final_state = graph.invoke(initial_state, config={ + "recursion_limit": 50, + "metadata": {"evaluate_prompt": EVALUATE_PROMPT_VERSION}, + }) + return final_state["eval_result"] # --------------------------------------------------------------------------- diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..fea873a --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,186 @@ +""" +Shared pytest configuration for the Conclave test suite. 
+ +Provides: + - @pytest.mark.live — skip when CONCLAVE_NEARAI_API_KEY is not set + - base_df — session-scoped fraud-like DataFrame (~800 rows) + - matrix_results — session-scoped list; tests append rows, teardown + prints two tables and saves tests/demo_matrix.json +""" +from __future__ import annotations + +import datetime +import json +import os +from typing import Generator + +import pandas as pd +import pytest + +DEMO_JSON_PATH = os.path.join(os.path.dirname(__file__), "demo_matrix.json") + + +# --------------------------------------------------------------------------- +# Markers +# --------------------------------------------------------------------------- + +def pytest_configure(config): + config.addinivalue_line( + "markers", + "live: mark test as requiring a real NearAI API key (skipped in CI)", + ) + + +def pytest_collection_modifyitems(config, items): + api_key = os.environ.get("CONCLAVE_NEARAI_API_KEY", "").strip() + skip_live = pytest.mark.skip(reason="CONCLAVE_NEARAI_API_KEY not set — live tests skipped") + for item in items: + if "live" in item.keywords and not api_key: + item.add_marker(skip_live) + + +# --------------------------------------------------------------------------- +# Dataset fixture +# --------------------------------------------------------------------------- + +def _generate_synthetic_df(n: int = 800) -> pd.DataFrame: + import numpy as np + rng = np.random.default_rng(42) + return pd.DataFrame({ + "transaction_id": [f"txn_{i:05d}" for i in range(n)], + "amount": rng.uniform(1.0, 500.0, n).round(2), + "merchant_category": rng.choice(["grocery", "gas", "restaurant", "travel", "online"], n), + "is_fraud": (rng.uniform(0, 1, n) < 0.04).astype(int), + }) + + +@pytest.fixture(scope="session") +def base_df() -> pd.DataFrame: + """Session-scoped clean fraud-like DataFrame (~800 rows, synthetic fallback).""" + url = "https://raw.githubusercontent.com/dsrscientist/dataset1/master/creditcard_small.csv" + try: + import io, requests + resp = 
requests.get(url, timeout=10) + if resp.status_code == 200: + df = pd.read_csv(io.StringIO(resp.text)) + rename = {} + cols_lower = {c.lower(): c for c in df.columns} + if "transaction_id" not in cols_lower and "id" in cols_lower: + rename[cols_lower["id"]] = "transaction_id" + if "is_fraud" not in cols_lower and "class" in cols_lower: + rename[cols_lower["class"]] = "is_fraud" + if rename: + df = df.rename(columns=rename) + if {"transaction_id", "amount", "is_fraud"}.issubset(df.columns): + return df.head(800) + except Exception: + pass + return _generate_synthetic_df(800) + + +# --------------------------------------------------------------------------- +# Matrix results fixture +# --------------------------------------------------------------------------- + +@pytest.fixture(scope="session") +def matrix_results() -> Generator[list[dict], None, None]: + """ + Session-scoped list. Tests append rows with a "type" field: + type="evaluation" — pipeline runs (quality, payment, deal) + type="renegotiation" — post-evaluation negotiation rounds + + At teardown: prints two formatted tables + saves tests/demo_matrix.json. 
+ """ + rows: list[dict] = [] + yield rows + + if not rows: + return + + eval_rows = [r for r in rows if r.get("type") != "renegotiation"] + reneg_rows = [r for r in rows if r.get("type") == "renegotiation"] + + # --- Evaluation table --- + if eval_rows: + print("\n" + "=" * 96) + print("EVALUATION MATRIX (deterministic + LLM agent)") + print("=" * 96) + print(f"{'Scenario':<30} {'Seller':<18} {'Buyer':<12} {'Reserve':>8} {'Quality':>8} {'Payment':>9} {'Deal':>5}") + print("-" * 96) + for r in eval_rows: + q = r.get("quality") + p = r.get("payment") + rv = r.get("reserve") + print( + f"{r.get('scenario',''):<30} {r.get('seller',''):<18} {r.get('buyer',''):<12} " + f"{'$'+f'{rv:,.0f}' if rv is not None else 'N/A':>8} " + f"{f'{q:.3f}' if q is not None else 'N/A':>8} " + f"{'$'+f'{p:,.0f}' if p is not None else 'N/A':>9} " + f"{'YES' if r.get('deal') else ' NO':>5}" + ) + print("=" * 96) + + # --- Renegotiation table --- + if reneg_rows: + print("\n" + "=" * 90) + print("RENEGOTIATION MATRIX (post-evaluation, deterministic only)") + print("=" * 90) + print(f"{'Scenario':<35} {'Initial':>9} {'Buyer':>14} {'Seller':>14} {'Final':>9} {'Deal':>5}") + print("-" * 90) + for r in reneg_rows: + init = r.get("initial_offer") + final = r.get("final_payment") + print( + f"{r.get('scenario',''):<35} " + f"{'$'+f'{init:,.0f}' if init is not None else 'N/A':>9} " + f"{str(r.get('buyer_action','')):<14} " + f"{str(r.get('supplier_action','')):<14} " + f"{'$'+f'{final:,.0f}' if final is not None else ' —':>9} " + f"{'YES' if r.get('deal') else ' NO':>5}" + ) + print("=" * 90) + + # --- Save JSON --- + output = { + "title": "Confidential Data Procurement — Demo Results", + "generated": str(datetime.date.today()), + "model": "deepseek-ai/DeepSeek-V3.1", + "pipeline": "deterministic → LLM agent (schema match + claim verify) → guardrails", + "note": "base_price=0: bad data → payment approaches $0. 
Reserve not met → deal rejected.", + "evaluation_matrix": [ + { + "id": i + 1, + "scenario": r.get("scenario", ""), + "narrative": r.get("narrative", ""), + "seller_variant": r.get("seller", ""), + "buyer_variant": r.get("buyer", ""), + "reserve_price": r.get("reserve"), + "quality_score": round(r["quality"], 4) if r.get("quality") is not None else None, + "proposed_payment": r.get("payment"), + "deal": r.get("deal"), + "settlement_status": "pending_approval" if r.get("deal") else "rejected", + "notes": r.get("notes", []), + "explanation": r.get("explanation", ""), + "schema_matching": r.get("schema_matching"), + "claim_verification": r.get("claim_verification"), + } + for i, r in enumerate(eval_rows) + ], + "renegotiation_matrix": [ + { + "id": i + 1, + "scenario": r.get("scenario", ""), + "narrative": r.get("narrative", ""), + "initial_offer": r.get("initial_offer"), + "buyer_action": r.get("buyer_action", ""), + "supplier_action":r.get("supplier_action", ""), + "final_payment": r.get("final_payment"), + "deal": r.get("deal"), + "settlement_status": "authorized" if r.get("deal") else "rejected", + } + for i, r in enumerate(reneg_rows) + ], + } + with open(DEMO_JSON_PATH, "w") as f: + json.dump(output, f, indent=2) + print(f"\nDemo JSON → {DEMO_JSON_PATH}") diff --git a/tests/test_live_integration.py b/tests/test_live_integration.py new file mode 100644 index 0000000..3b81f27 --- /dev/null +++ b/tests/test_live_integration.py @@ -0,0 +1,724 @@ +""" +Live integration test suite for confidential_data_procurement. + +Budget design: base_price=0 on all buyer policies. + → Payment = max_budget * quality_score + → Bad data → payment near $0. Critical failure → $0. + → All amounts in $0–$800 range for demo clarity. + +Sections: + 1. Deterministic (no LLM, always fast) — 11 tests + 2. Agent layer (live LLM, @pytest.mark.live) — 7 tests + 3. Full pipeline (live LLM, @pytest.mark.live) — 7 tests + 4. 
Renegotiation scenarios (deterministic) — 5 tests + +Run fast only: + ./venv/bin/python -m pytest tests/test_live_integration.py -v -m "not live" + +Run all + print matrix: + ./venv/bin/python -m pytest tests/test_live_integration.py -v -s +""" +from __future__ import annotations + +import uuid + +import pandas as pd +import pytest + +from skills.confidential_data_procurement.ingest import _datasets +from skills.confidential_data_procurement.deterministic import ( + compute_metrics, + run_deterministic, +) +from skills.confidential_data_procurement.models import BuyerPolicy, SupplierSubmission +from skills.confidential_data_procurement.agent import run_agent +from skills.confidential_data_procurement import run_skill +from skills.confidential_data_procurement import procurement_respond_handler + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _register(df: pd.DataFrame, metadata: dict | None = None) -> str: + did = str(uuid.uuid4()) + meta = metadata or {} + _datasets[did] = { + "df": df, + "metadata": meta, + "column_definitions": meta.get("column_definitions", {}), + "seller_claims": meta.get("seller_claims", {}), + "instance_id": "test_integration", + } + return did + + +def _run_pipeline(base_df, seller_fn, policy: BuyerPolicy, reserve: float = 200.0) -> dict: + df, meta = seller_fn(base_df) + did = _register(df, meta) + try: + sub = SupplierSubmission( + submission_id=str(uuid.uuid4()), + dataset_id=did, + dataset_name="test.csv", + reserve_price=reserve, + ) + resp = run_skill([sub], policy) + return resp.results[0] if resp.results else {} + finally: + _datasets.pop(did, None) + + +# --------------------------------------------------------------------------- +# Seller variants +# --------------------------------------------------------------------------- + +def _seller_clean(base_df): + df = base_df[["transaction_id", "amount", 
"is_fraud"]].copy().reset_index(drop=True) + meta = { + "column_definitions": { + "transaction_id": "Unique ID per transaction", + "amount": "Transaction amount in USD", + "is_fraud": "1 if fraudulent, 0 otherwise", + }, + "seller_claims": { + "low_fraud_rate": "Approximately 4% fraud rate", + "no_missing_values": "All fields fully populated", + }, + } + return df, meta + + +def _seller_null_corrupted(base_df): + """30% of amount values nulled — seller falsely claims no missing values.""" + df = base_df[["transaction_id", "amount", "is_fraud"]].copy().reset_index(drop=True) + n = int(len(df) * 0.30) + df.loc[:n, "amount"] = None + meta = { + "column_definitions": { + "transaction_id": "Unique ID", + "amount": "Transaction amount", + "is_fraud": "Fraud flag", + }, + "seller_claims": {"no_missing_values": "All fields fully populated"}, + } + return df, meta + + +def _seller_dup_corrupted(base_df): + """Entire dataset duplicated → duplicate_rate = 50% → critical failure.""" + df = base_df[["transaction_id", "amount", "is_fraud"]].copy().reset_index(drop=True) + df = pd.concat([df, df], ignore_index=True) + return df, {} + + +def _seller_forbidden_col(base_df): + """SSN column present → critical failure, no LLM.""" + df = base_df[["transaction_id", "amount", "is_fraud"]].copy().reset_index(drop=True) + df["ssn"] = "xxx-xx-0000" + return df, {} + + +def _seller_missing_col(base_df): + """is_fraud dropped — buyer requires it → schema penalty.""" + df = base_df[["transaction_id", "amount"]].copy().reset_index(drop=True) + meta = { + "column_definitions": { + "transaction_id": "Unique ID", + "amount": "Transaction amount", + }, + "seller_claims": {}, + } + return df, meta + + +def _seller_fuzzy_schema(base_df): + """is_fraud renamed to fraud_label — tests agent semantic matching.""" + df = base_df[["transaction_id", "amount", "is_fraud"]].copy().reset_index(drop=True) + df = df.rename(columns={"is_fraud": "fraud_label"}) + meta = { + "column_definitions": { + 
"transaction_id": "Unique ID", + "amount": "Transaction amount", + "fraud_label": "Binary fraud indicator (1=fraud, 0=legit)", + }, + "seller_claims": {}, + } + return df, meta + + +def _seller_multi_corrupt(base_df): + """25% nulls + missing is_fraud — compound quality damage.""" + df = base_df[["transaction_id", "amount"]].copy().reset_index(drop=True) + n = int(len(df) * 0.25) + df.loc[:n, "amount"] = None + return df, {} + + +# --------------------------------------------------------------------------- +# Buyer policies — base_price=0 on all (bad data → payment near $0) +# --------------------------------------------------------------------------- + +def _buyer_lenient() -> BuyerPolicy: + """Tolerant buyer: accepts moderate nulls, low row floor, $800 ceiling.""" + return BuyerPolicy( + required_columns=["transaction_id", "amount", "is_fraud"], + min_rows=200, + max_null_rate=0.35, + max_duplicate_rate=0.20, + min_label_rate=0.01, + label_column="is_fraud", + forbidden_columns=["ssn", "dob"], + max_budget=800.0, + base_price=0.0, + ) + + +def _buyer_strict() -> BuyerPolicy: + """Strict buyer: demands 900 rows, low null tolerance, $800 ceiling.""" + return BuyerPolicy( + required_columns=["transaction_id", "amount", "is_fraud"], + min_rows=900, + max_null_rate=0.05, + max_duplicate_rate=0.05, + min_label_rate=0.02, + label_column="is_fraud", + forbidden_columns=["ssn", "dob"], + max_budget=800.0, + base_price=0.0, + ) + + +def _buyer_budget_tight() -> BuyerPolicy: + """Budget-conscious buyer: same quality expectations, $300 ceiling.""" + return BuyerPolicy( + required_columns=["transaction_id", "amount", "is_fraud"], + min_rows=200, + max_null_rate=0.35, + max_duplicate_rate=0.20, + min_label_rate=0.01, + label_column="is_fraud", + forbidden_columns=["ssn", "dob"], + max_budget=300.0, + base_price=0.0, + ) + + +# --------------------------------------------------------------------------- +# Section 1: Deterministic layer (no LLM) +# 
class TestDeterministic:
    """Fast correctness checks — direct calls to run_deterministic."""

    def setup_method(self):
        # Dataset ids registered during a test; removed again in teardown.
        self._ids: list[str] = []

    def teardown_method(self):
        while self._ids:
            _datasets.pop(self._ids.pop(), None)

    def _run(self, df, policy, metadata=None, reserve=200.0):
        """Register *df*, remember its id for teardown, run the deterministic layer."""
        dataset_id = _register(df, metadata)
        self._ids.append(dataset_id)
        return run_deterministic(dataset_id, policy, reserve)

    def test_clean_lenient_high_quality(self, base_df):
        """Clean data + lenient policy → no critical failure, quality > 0.7."""
        frame, meta = _seller_clean(base_df)
        result = self._run(frame, _buyer_lenient(), meta)
        assert not result["metrics"].critical_failure
        assert result["quality_score"] > 0.7

    def test_clean_strict_coverage_penalty(self, base_df):
        """800-row dataset vs strict 900-row min → coverage_score = 800/900 ≈ 0.89."""
        frame, meta = _seller_clean(base_df)
        result = self._run(frame, _buyer_strict(), meta)
        assert not result["metrics"].critical_failure
        coverage = result["component_scores"]["coverage"]
        assert coverage == pytest.approx(len(frame) / 900, rel=0.01)
        assert coverage < 1.0

    def test_null_lenient_passes(self, base_df):
        """30% null in one column (~10% overall) < 35% lenient threshold → quality > 0."""
        frame, meta = _seller_null_corrupted(base_df)
        result = self._run(frame, _buyer_lenient(), meta)
        assert not result["metrics"].critical_failure
        assert result["component_scores"]["null"] > 0

    def test_null_strict_null_score_zero(self, base_df):
        """~10% overall null > 5% strict threshold → null_score = 0."""
        frame, meta = _seller_null_corrupted(base_df)
        result = self._run(frame, _buyer_strict(), meta)
        assert result["component_scores"]["null"] == 0.0

    def test_null_payment_lower_than_clean(self, base_df):
        """Null-corrupted → lower quality → lower payment than clean, same policy."""
        clean_df, clean_meta = _seller_clean(base_df)
        null_df, null_meta = _seller_null_corrupted(base_df)
        policy = _buyer_lenient()
        null_payment = self._run(null_df, policy, null_meta)["proposed_payment"]
        clean_payment = self._run(clean_df, policy, clean_meta)["proposed_payment"]
        assert null_payment < clean_payment

    def test_dup_corrupted_critical(self, base_df):
        """50% duplicates (entire dataset doubled) → critical_failure, $0 payment."""
        frame, meta = _seller_dup_corrupted(base_df)
        result = self._run(frame, _buyer_lenient(), meta)
        assert result["metrics"].critical_failure
        assert not result["deal"]
        # base_price=0, so the payment floor is $0.
        assert result["proposed_payment"] == 0.0

    def test_forbidden_col_critical(self, base_df):
        """SSN column → critical_failure, note mentions 'ssn'."""
        frame, meta = _seller_forbidden_col(base_df)
        result = self._run(frame, _buyer_lenient(), meta)
        assert result["metrics"].critical_failure
        assert "ssn" in " ".join(result["notes"]).lower()

    def test_budget_tight_caps_payment(self, base_df):
        """Same clean data, $300 ceiling → proposed_payment ≤ $300."""
        frame, meta = _seller_clean(base_df)
        result = self._run(frame, _buyer_budget_tight(), meta)
        assert result["proposed_payment"] <= 300.0

    def test_reserve_not_met_deal_fails(self, base_df):
        """Clean data, seller reserve > proposed payment → deal=False with note."""
        frame, meta = _seller_clean(base_df)
        # Deterministic quality with schema=0.5 placeholder: 0.925 → payment = $740.
        # Reserve $760 > $740 → deal rejected.
        result = self._run(frame, _buyer_lenient(), meta, reserve=760.0)
        assert not result["deal"]
        assert any("reserve" in note.lower() for note in result["notes"])

    def test_multi_corrupt_lower_than_clean(self, base_df):
        """Multi-corrupt (nulls + missing label col) → payment lower than clean."""
        clean_df, clean_meta = _seller_clean(base_df)
        bad_df, bad_meta = _seller_multi_corrupt(base_df)
        policy = _buyer_lenient()
        bad_payment = self._run(bad_df, policy, bad_meta)["proposed_payment"]
        clean_payment = self._run(clean_df, policy, clean_meta)["proposed_payment"]
        assert bad_payment < clean_payment

    def test_price_formula_base_zero(self, base_df):
        """With base_price=0: P = max_budget * S exactly."""
        frame, meta = _seller_clean(base_df)
        policy = _buyer_lenient()
        result = self._run(frame, policy, meta)
        quality = result["quality_score"]
        assert result["proposed_payment"] == pytest.approx(policy.max_budget * quality, abs=0.01)


# ---------------------------------------------------------------------------
# Section 2: Agent layer (live LLM)
# ---------------------------------------------------------------------------

class TestAgentLive:
    """Live-LLM checks: schema matching, claim verification, output bounds."""

    def setup_method(self):
        self._ids: list[str] = []

    def teardown_method(self):
        while self._ids:
            _datasets.pop(self._ids.pop(), None)

    def _reg(self, df, meta=None):
        """Register *df*, remembering its id for teardown."""
        dataset_id = _register(df, meta)
        self._ids.append(dataset_id)
        return dataset_id

    def _metrics(self, df, policy):
        return compute_metrics(df, policy)

    @pytest.mark.live
    def test_exact_schema_match(self, base_df):
        """All required columns present by exact name → schema_score ≥ 0.8."""
        frame, meta = _seller_clean(base_df)
        dataset_id = self._reg(frame, meta)
        policy = _buyer_lenient()
        result = run_agent(dataset_id, policy, self._metrics(frame, policy), {})
        assert result["schema_score"] >= 0.8

    @pytest.mark.live
    def test_fuzzy_schema_match(self, base_df):
        """fraud_label instead of is_fraud → agent semantic match gives schema_score > 0."""
        frame, meta = _seller_fuzzy_schema(base_df)
        dataset_id = self._reg(frame, meta)
        policy = _buyer_lenient()
        result = run_agent(dataset_id, policy, self._metrics(frame, policy), {})
        assert result["schema_score"] > 0.0

    @pytest.mark.live
    def test_null_claim_disputed(self, base_df):
        """Seller claims 'no missing values' but 30% amount is null → claim disputed."""
        frame, meta = _seller_null_corrupted(base_df)
        dataset_id = self._reg(frame, meta)
        policy = _buyer_lenient()
        result = run_agent(dataset_id, policy, self._metrics(frame, policy), {})
        verification = result.get("claim_verification") or {}
        assert any(v == "disputed" for v in verification.values()), (
            f"Expected at least one disputed claim, got: {verification}"
        )

    @pytest.mark.live
    def test_missing_col_lower_schema(self, base_df):
        """is_fraud missing → schema_score lower than when it's present."""
        policy = _buyer_lenient()
        full_df, full_meta = _seller_clean(base_df)
        miss_df, miss_meta = _seller_missing_col(base_df)
        full_id = self._reg(full_df, full_meta)
        miss_id = self._reg(miss_df, miss_meta)
        score_full = run_agent(full_id, policy, self._metrics(full_df, policy), {})["schema_score"]
        score_miss = run_agent(miss_id, policy, self._metrics(miss_df, policy), {})["schema_score"]
        assert score_miss < score_full

    @pytest.mark.live
    def test_explanation_present(self, base_df):
        """Agent always produces a non-empty explanation string."""
        frame, meta = _seller_clean(base_df)
        dataset_id = self._reg(frame, meta)
        policy = _buyer_lenient()
        result = run_agent(dataset_id, policy, self._metrics(frame, policy), {})
        explanation = result.get("explanation")
        assert isinstance(explanation, str)
        assert len(result["explanation"]) > 10

    @pytest.mark.live
    def test_output_bounds(self, base_df):
        """schema_score and claim_veracity_score always in [0, 1]."""
        frame, meta = _seller_clean(base_df)
        dataset_id = self._reg(frame, meta)
        policy = _buyer_lenient()
        result = run_agent(dataset_id, policy, self._metrics(frame, policy), {})
        assert 0.0 <= result["schema_score"] <= 1.0
        assert 0.0 <= result["claim_veracity_score"] <= 1.0

    @pytest.mark.live
    def test_schema_matching_dict_returned(self, base_df):
        """Agent returns a non-empty schema_matching dict."""
        frame, meta = _seller_clean(base_df)
        dataset_id = self._reg(frame, meta)
        policy = _buyer_lenient()
        result = run_agent(dataset_id, policy, self._metrics(frame, policy), {})
        matching = result.get("schema_matching")
        assert isinstance(matching, dict)
        assert len(result["schema_matching"]) > 0


# ---------------------------------------------------------------------------
# Section 3: Full pipeline — seller × buyer matrix (live LLM)
# ---------------------------------------------------------------------------
class TestPipelineLive:
    """End-to-end pipeline tests. Results appended to matrix_results for demo JSON."""

    @pytest.mark.live
    def test_happy_path(self, base_df, matrix_results):
        """Clean data + lenient buyer + reserve=$200 → deal, high quality, full explanation."""
        r = _run_pipeline(base_df, _seller_clean, _buyer_lenient(), reserve=200.0)
        assert r.get("deal") is True
        assert r.get("explanation")
        matrix_results.append({
            "type": "evaluation",
            "scenario": "Happy Path",
            "narrative": (
                "Seller provides clean fraud-detection data. All required columns present, "
                "claims verified. Seller's $200 reserve is comfortably below the $800 offer. "
                "Both parties should accept."
            ),
            "seller": "clean", "buyer": "lenient", "reserve": 200.0,
            "quality": r.get("quality_score"), "payment": r.get("proposed_payment"),
            "deal": r.get("deal"), "notes": r.get("notes", []),
            "explanation": r.get("explanation", ""),
            "schema_matching": r.get("schema_matching"),
            "claim_verification": r.get("claim_verification"),
        })

    @pytest.mark.live
    def test_strict_buyer_coverage_penalty(self, base_df, matrix_results):
        """800-row dataset vs strict 900-row requirement → quality drops, lower price."""
        lenient_r = _run_pipeline(base_df, _seller_clean, _buyer_lenient(), reserve=200.0)
        strict_r = _run_pipeline(base_df, _seller_clean, _buyer_strict(), reserve=200.0)
        # +5 tolerance: LLM schema/claim scores add small run-to-run noise.
        assert strict_r.get("proposed_payment", 999) <= lenient_r.get("proposed_payment", 0) + 5
        matrix_results.append({
            "type": "evaluation",
            "scenario": "Strict Buyer — Row Coverage Penalty",
            "narrative": (
                "Same clean dataset, but buyer demands 900 rows and only 800 are present. "
                "Coverage score = 800/900 = 0.89 → quality and price both drop vs lenient buyer."
            ),
            "seller": "clean", "buyer": "strict", "reserve": 200.0,
            "quality": strict_r.get("quality_score"), "payment": strict_r.get("proposed_payment"),
            "deal": strict_r.get("deal"), "notes": strict_r.get("notes", []),
            "explanation": strict_r.get("explanation", ""),
            "schema_matching": strict_r.get("schema_matching"),
            "claim_verification": strict_r.get("claim_verification"),
        })

    @pytest.mark.live
    def test_null_corrupted_claim_disputed(self, base_df, matrix_results):
        """30% nulls + false 'no missing values' claim → agent disputes claim, price dips."""
        r = _run_pipeline(base_df, _seller_null_corrupted, _buyer_lenient(), reserve=200.0)
        # Narrative corrected: this scenario runs the LENIENT buyer (35% null
        # threshold), not the strict one — the nulls reduce quality but do not
        # zero the null component.
        matrix_results.append({
            "type": "evaluation",
            "scenario": "Null-Corrupted + False Claim",
            "narrative": (
                "Seller corrupts 30% of amount values and claims 'all fields populated'. "
                "The lenient buyer's 35% null threshold tolerates the gaps, but the null "
                "component still drags quality down. "
                "Agent disputes the no-missing-values claim. Price drops."
            ),
            "seller": "null_corrupted", "buyer": "lenient", "reserve": 200.0,
            "quality": r.get("quality_score"), "payment": r.get("proposed_payment"),
            "deal": r.get("deal"), "notes": r.get("notes", []),
            "explanation": r.get("explanation", ""),
            "schema_matching": r.get("schema_matching"),
            "claim_verification": r.get("claim_verification"),
        })

    def test_reserve_not_met(self, base_df, matrix_results):
        """Clean data, seller reserve=$760 > deterministic offer of $740 → deal rejected.

        Reserve logic is deterministic — no LLM needed for this scenario.
        """
        df, meta = _seller_clean(base_df)
        did = _register(df, meta)
        try:
            det = run_deterministic(did, _buyer_lenient(), reserve_price=760.0)
        finally:
            _datasets.pop(did, None)
        assert not det["deal"]
        assert any("reserve" in n.lower() for n in det["notes"])
        matrix_results.append({
            "type": "evaluation",
            "scenario": "Reserve Floor Not Met",
            "narrative": (
                "Data quality is good (quality ~0.93, offer ~$740). "
                "But seller's $760 reserve exceeds the computed offer. "
                "The enclave reports: 'reserve not met — consider renegotiation'. "
                "Neither party's private number was revealed. "
                "This is where the renegotiation section begins."
            ),
            "seller": "clean", "buyer": "lenient", "reserve": 760.0,
            "quality": det["quality_score"], "payment": det["proposed_payment"],
            "deal": False, "notes": det["notes"],
            "explanation": None,
            "schema_matching": None, "claim_verification": None,
        })

    @pytest.mark.live
    def test_critical_forbidden_column(self, base_df, matrix_results):
        """SSN column → immediate rejection, agent never runs, payment=$0."""
        r = _run_pipeline(base_df, _seller_forbidden_col, _buyer_lenient(), reserve=0.0)
        assert r.get("deal") is False
        assert r.get("explanation") is None  # agent skipped
        matrix_results.append({
            "type": "evaluation",
            "scenario": "Critical: PII Column (SSN)",
            "narrative": (
                "Dataset contains an 'ssn' (Social Security Number) column. "
                "The deterministic layer rejects immediately — no LLM is invoked, "
                "no data is analyzed. Payment = $0 (base_price=0)."
            ),
            "seller": "forbidden_col", "buyer": "lenient", "reserve": 0.0,
            "quality": 0.0, "payment": r.get("proposed_payment"),
            "deal": False, "notes": r.get("notes", []),
            "explanation": None,
            "schema_matching": None, "claim_verification": None,
        })

    @pytest.mark.live
    def test_critical_duplicate_spam(self, base_df, matrix_results):
        """50%+ duplicates → critical rejection, agent skipped, payment=$0."""
        r = _run_pipeline(base_df, _seller_dup_corrupted, _buyer_lenient(), reserve=0.0)
        assert r.get("deal") is False
        assert r.get("explanation") is None
        matrix_results.append({
            "type": "evaluation",
            "scenario": "Critical: 50%+ Duplicates",
            "narrative": (
                "Seller doubled the dataset by copying all rows. "
                "Duplicate rate = 50%, which hits the critical threshold. "
                "Immediate rejection, no LLM, payment = $0."
            ),
            "seller": "dup_corrupted", "buyer": "lenient", "reserve": 0.0,
            "quality": 0.0, "payment": r.get("proposed_payment"),
            "deal": False, "notes": r.get("notes", []),
            "explanation": None,
            "schema_matching": None, "claim_verification": None,
        })

    @pytest.mark.live
    def test_budget_ceiling(self, base_df, matrix_results):
        """Same clean data + $300 ceiling → proportionally lower price."""
        r = _run_pipeline(base_df, _seller_clean, _buyer_budget_tight(), reserve=50.0)
        assert r.get("proposed_payment", 999) <= 300.0
        matrix_results.append({
            "type": "evaluation",
            "scenario": "Budget Ceiling ($300 max)",
            "narrative": (
                "Same clean dataset, same quality score. But buyer's max_budget=$300 "
                "caps the price. Shows the enclave preserves privacy — seller sees only "
                "a lower offer, never the buyer's max_budget."
            ),
            "seller": "clean", "buyer": "budget_tight", "reserve": 50.0,
            "quality": r.get("quality_score"), "payment": r.get("proposed_payment"),
            "deal": r.get("deal"), "notes": r.get("notes", []),
            "explanation": r.get("explanation", ""),
            "schema_matching": r.get("schema_matching"),
            "claim_verification": r.get("claim_verification"),
        })
+# --------------------------------------------------------------------------- + +_RENEG_QUALITY = 0.65 +_RENEG_PAYMENT = 520.0 # = 800 * 0.65 + + +def _reneg_policy(): + return BuyerPolicy( + required_columns=["transaction_id", "amount", "is_fraud"], + min_rows=200, max_null_rate=0.35, max_duplicate_rate=0.20, + min_label_rate=0.01, label_column="is_fraud", + forbidden_columns=["ssn", "dob"], + max_budget=800.0, base_price=0.0, + ) + + +def _base_result(): + """Pending-approval result from a hypothetical TEE evaluation.""" + return { + "submission_id": "demo-sub", + "deal": True, + "quality_score": _RENEG_QUALITY, + "proposed_payment": _RENEG_PAYMENT, + "hard_constraints_pass": True, + "settlement_status": "pending_approval", + "release_token": None, + "notes": [], + "explanation": "Dataset meets buyer requirements with moderate quality.", + "claim_verification": {"balanced_labels": "verified"}, + "schema_matching": { + "transaction_id": "transaction_id", + "amount": "amount", + "is_fraud": "is_fraud", + }, + "buyer_response": None, + "supplier_response": None, + "renegotiation_used": False, + } + + +class TestRenegotiation: + """ + Tests for the 3×3 resolution matrix in procurement_respond_handler. + No LLM needed. Starting point: quality=0.65, proposed_payment=$520. + """ + + def test_both_accept(self, matrix_results): + """Both accept TEE offer → authorized at $520, no changes.""" + policy = _reneg_policy() + r = procurement_respond_handler(_base_result(), "accept", None, "buyer", policy) + assert r["settlement_status"] == "awaiting_counterparty" + r = procurement_respond_handler(r, "accept", None, "supplier", policy) + assert r["settlement_status"] == "authorized" + assert r["proposed_payment"] == _RENEG_PAYMENT + matrix_results.append({ + "type": "renegotiation", + "scenario": "Both Accept", + "narrative": ( + f"TEE computes ${_RENEG_PAYMENT:.0f}. Both parties accept immediately. " + "Authorized at the enclave's offer — fastest path to a deal." 
+ ), + "initial_offer": _RENEG_PAYMENT, + "buyer_action": "accept", + "supplier_action":"accept", + "final_payment": r["proposed_payment"], + "deal": True, + }) + + def test_buyer_renegotiates_after_supplier_accepts(self, matrix_results): + """Supplier locks in $520 by accepting first. Buyer tries to renegotiate to $400 — too late.""" + policy = _reneg_policy() + r = procurement_respond_handler(_base_result(), "accept", None, "supplier", policy) + r = procurement_respond_handler(r, "renegotiate", 400.0, "buyer", policy) + # Supplier already committed → deal at original proposed_payment, buyer's revision ignored + assert r["settlement_status"] == "authorized" + assert r["proposed_payment"] == _RENEG_PAYMENT + matrix_results.append({ + "type": "renegotiation", + "scenario": "Acceptor Locks the Price", + "narrative": ( + f"Supplier accepts the ${_RENEG_PAYMENT:.0f} offer. " + "Buyer later tries to renegotiate down to $400 — but supplier already committed. " + "Deal authorizes at $520 (supplier's acceptance stands). " + "First to accept locks the price." + ), + "initial_offer": _RENEG_PAYMENT, + "buyer_action": "renegotiate → $400", + "supplier_action":"accept", + "final_payment": r["proposed_payment"], + "deal": True, + }) + + def test_both_renegotiate_terms_overlap(self, matrix_results): + """Both renegotiate. Buyer offers $480, seller accepts $420 floor → deal at $480.""" + policy = _reneg_policy() + r = procurement_respond_handler(_base_result(), "renegotiate", 480.0, "buyer", policy) + r = procurement_respond_handler(r, "renegotiate", 420.0, "supplier", policy) + assert r["settlement_status"] == "authorized" + assert r["proposed_payment"] == 480.0 # buyer's revised offer (>= seller's 420) + matrix_results.append({ + "type": "renegotiation", + "scenario": "Both Renegotiate — Terms Overlap", + "narrative": ( + f"TEE offers ${_RENEG_PAYMENT:.0f}. Buyer revises offer to $480. " + "Seller revises reserve down to $420. 
" + "$480 ≥ $420 → authorized at buyer's revised offer. " + "No midpoint — buyer's number wins when terms overlap." + ), + "initial_offer": _RENEG_PAYMENT, + "buyer_action": "renegotiate → $480", + "supplier_action":"renegotiate → $420", + "final_payment": 480.0, + "deal": True, + }) + + def test_both_renegotiate_no_overlap(self, matrix_results): + """Both renegotiate with incompatible terms → rejected.""" + policy = _reneg_policy() + r = procurement_respond_handler(_base_result(), "renegotiate", 350.0, "buyer", policy) + r = procurement_respond_handler(r, "renegotiate", 450.0, "supplier", policy) + assert r["settlement_status"] == "rejected" + matrix_results.append({ + "type": "renegotiation", + "scenario": "Both Renegotiate — No Overlap", + "narrative": ( + f"TEE offers ${_RENEG_PAYMENT:.0f}. Buyer revises to $350. " + "Seller revises reserve up to $450. " + "$350 < $450 → deal rejected. Neither party revealed their original private number." + ), + "initial_offer": _RENEG_PAYMENT, + "buyer_action": "renegotiate → $350", + "supplier_action":"renegotiate → $450", + "final_payment": None, + "deal": False, + }) + + def test_either_party_rejects(self, matrix_results): + """One party rejects → immediate deal-off regardless of other's action.""" + policy = _reneg_policy() + r = procurement_respond_handler(_base_result(), "accept", None, "buyer", policy) + r = procurement_respond_handler(r, "reject", None, "supplier", policy) + assert r["settlement_status"] == "rejected" + matrix_results.append({ + "type": "renegotiation", + "scenario": "Hard Reject", + "narrative": ( + f"Buyer accepts the ${_RENEG_PAYMENT:.0f} TEE offer. " + "Supplier rejects outright. One rejection ends the deal — no further rounds." 
+ ), + "initial_offer": _RENEG_PAYMENT, + "buyer_action": "accept", + "supplier_action":"reject", + "final_payment": None, + "deal": False, + }) From 55b37d9f79c6dc984b1fb1f27880e0fdc6a1d611 Mon Sep 17 00:00:00 2001 From: Parth Thapliyal Date: Sun, 22 Mar 2026 18:13:09 -0400 Subject: [PATCH 11/13] chore: untrack test fixtures (gitignored) --- tests/fixtures/test_dataset_bad.csv | 111 ------------ tests/fixtures/test_dataset_good.csv | 201 --------------------- tests/fixtures/test_dataset_high_nulls.csv | 201 --------------------- tests/fixtures/test_metadata.json | 15 -- 4 files changed, 528 deletions(-) delete mode 100644 tests/fixtures/test_dataset_bad.csv delete mode 100644 tests/fixtures/test_dataset_good.csv delete mode 100644 tests/fixtures/test_dataset_high_nulls.csv delete mode 100644 tests/fixtures/test_metadata.json diff --git a/tests/fixtures/test_dataset_bad.csv b/tests/fixtures/test_dataset_bad.csv deleted file mode 100644 index fe69f48..0000000 --- a/tests/fixtures/test_dataset_bad.csv +++ /dev/null @@ -1,111 +0,0 @@ -transaction_id,amount,ssn,is_fraud -txn_0000,3947.96,xxx-xx-0000,0 -txn_0001,2497.23,xxx-xx-0001,0 -txn_0002,443.73,xxx-xx-0002,0 -txn_0003,2690.16,xxx-xx-0003,0 -txn_0004,2938.34,xxx-xx-0004,0 -txn_0005,3729.74,xxx-xx-0005,0 -txn_0006,2163.98,xxx-xx-0006,0 -txn_0007,646.63,xxx-xx-0007,1 -txn_0008,1426.04,xxx-xx-0008,0 -txn_0009,1821.78,xxx-xx-0009,0 -txn_0010,3233.13,xxx-xx-0010,0 -txn_0011,2858.18,xxx-xx-0011,0 -txn_0012,1786.92,xxx-xx-0012,0 -txn_0013,4932.71,xxx-xx-0013,0 -txn_0014,3032.82,xxx-xx-0014,0 -txn_0015,1193.76,xxx-xx-0015,0 -txn_0016,517.89,xxx-xx-0016,0 -txn_0017,772.77,xxx-xx-0017,0 -txn_0018,1237.33,xxx-xx-0018,0 -txn_0019,811.8,xxx-xx-0019,1 -txn_0020,940.97,xxx-xx-0020,0 -txn_0021,1432.62,xxx-xx-0021,0 -txn_0022,875.13,xxx-xx-0022,0 -txn_0023,4484.86,xxx-xx-0023,0 -txn_0024,410.37,xxx-xx-0024,0 -txn_0025,2627.31,xxx-xx-0025,0 -txn_0026,2057.88,xxx-xx-0026,0 -txn_0027,4912.07,xxx-xx-0027,0 
-txn_0028,569.07,xxx-xx-0028,0 -txn_0029,1995.3,xxx-xx-0029,0 -txn_0030,4847.66,xxx-xx-0030,0 -txn_0031,4328.88,xxx-xx-0031,0 -txn_0032,4087.19,xxx-xx-0032,0 -txn_0033,1296.94,xxx-xx-0033,0 -txn_0034,862.73,xxx-xx-0034,0 -txn_0035,3346.53,xxx-xx-0035,0 -txn_0036,4647.59,xxx-xx-0036,0 -txn_0037,2788.25,xxx-xx-0037,0 -txn_0038,2862.35,xxx-xx-0038,0 -txn_0039,1407.1,xxx-xx-0039,0 -txn_0040,3849.77,xxx-xx-0040,0 -txn_0041,943.35,xxx-xx-0041,0 -txn_0042,1625.16,xxx-xx-0042,0 -txn_0043,2132.93,xxx-xx-0043,0 -txn_0044,2542.98,xxx-xx-0044,0 -txn_0045,1219.62,xxx-xx-0045,0 -txn_0046,583.04,xxx-xx-0046,0 -txn_0047,3056.99,xxx-xx-0047,0 -txn_0048,1450.27,xxx-xx-0048,0 -txn_0049,2910.38,xxx-xx-0049,0 -txn_0000,3947.96,xxx-xx-0000,0 -txn_0001,2497.23,xxx-xx-0001,0 -txn_0002,443.73,xxx-xx-0002,0 -txn_0003,2690.16,xxx-xx-0003,0 -txn_0004,2938.34,xxx-xx-0004,0 -txn_0005,3729.74,xxx-xx-0005,0 -txn_0006,2163.98,xxx-xx-0006,0 -txn_0007,646.63,xxx-xx-0007,1 -txn_0008,1426.04,xxx-xx-0008,0 -txn_0009,1821.78,xxx-xx-0009,0 -txn_0010,3233.13,xxx-xx-0010,0 -txn_0011,2858.18,xxx-xx-0011,0 -txn_0012,1786.92,xxx-xx-0012,0 -txn_0013,4932.71,xxx-xx-0013,0 -txn_0014,3032.82,xxx-xx-0014,0 -txn_0015,1193.76,xxx-xx-0015,0 -txn_0016,517.89,xxx-xx-0016,0 -txn_0017,772.77,xxx-xx-0017,0 -txn_0018,1237.33,xxx-xx-0018,0 -txn_0019,811.8,xxx-xx-0019,1 -txn_0020,940.97,xxx-xx-0020,0 -txn_0021,1432.62,xxx-xx-0021,0 -txn_0022,875.13,xxx-xx-0022,0 -txn_0023,4484.86,xxx-xx-0023,0 -txn_0024,410.37,xxx-xx-0024,0 -txn_0025,2627.31,xxx-xx-0025,0 -txn_0026,2057.88,xxx-xx-0026,0 -txn_0027,4912.07,xxx-xx-0027,0 -txn_0028,569.07,xxx-xx-0028,0 -txn_0029,1995.3,xxx-xx-0029,0 -txn_0000,3947.96,xxx-xx-0000,0 -txn_0001,2497.23,xxx-xx-0001,0 -txn_0002,443.73,xxx-xx-0002,0 -txn_0003,2690.16,xxx-xx-0003,0 -txn_0004,2938.34,xxx-xx-0004,0 -txn_0005,3729.74,xxx-xx-0005,0 -txn_0006,2163.98,xxx-xx-0006,0 -txn_0007,646.63,xxx-xx-0007,1 -txn_0008,1426.04,xxx-xx-0008,0 -txn_0009,1821.78,xxx-xx-0009,0 -txn_0010,3233.13,xxx-xx-0010,0 
-txn_0011,2858.18,xxx-xx-0011,0 -txn_0012,1786.92,xxx-xx-0012,0 -txn_0013,4932.71,xxx-xx-0013,0 -txn_0014,3032.82,xxx-xx-0014,0 -txn_0015,1193.76,xxx-xx-0015,0 -txn_0016,517.89,xxx-xx-0016,0 -txn_0017,772.77,xxx-xx-0017,0 -txn_0018,1237.33,xxx-xx-0018,0 -txn_0019,811.8,xxx-xx-0019,1 -txn_0020,940.97,xxx-xx-0020,0 -txn_0021,1432.62,xxx-xx-0021,0 -txn_0022,875.13,xxx-xx-0022,0 -txn_0023,4484.86,xxx-xx-0023,0 -txn_0024,410.37,xxx-xx-0024,0 -txn_0025,2627.31,xxx-xx-0025,0 -txn_0026,2057.88,xxx-xx-0026,0 -txn_0027,4912.07,xxx-xx-0027,0 -txn_0028,569.07,xxx-xx-0028,0 -txn_0029,1995.3,xxx-xx-0029,0 diff --git a/tests/fixtures/test_dataset_good.csv b/tests/fixtures/test_dataset_good.csv deleted file mode 100644 index 6d82afc..0000000 --- a/tests/fixtures/test_dataset_good.csv +++ /dev/null @@ -1,201 +0,0 @@ -transaction_id,amount,merchant_category,country,is_fraud -txn_0000,1878.96,travel,CA,0 -txn_0001,4754.06,travel,FR,0 -txn_0002,3662.65,food,DE,0 -txn_0003,2997.31,retail,FR,0 -txn_0004,788.53,travel,CA,0 -txn_0005,788.41,travel,UK,0 -txn_0006,299.84,electronics,FR,0 -txn_0007,4332.22,travel,UK,0 -txn_0008,3009.56,travel,DE,0 -txn_0009,3543.28,electronics,US,0 -txn_0010,112.72,food,DE,0 -txn_0011,4849.85,travel,FR,0 -txn_0012,4163.89,electronics,UK,0 -txn_0013,1069.57,retail,UK,0 -txn_0014,917.31,food,CA,0 -txn_0015,925.19,retail,UK,0 -txn_0016,1528.17,retail,CA,1 -txn_0017,2628.53,electronics,US,0 -txn_0018,2165.41,travel,FR,0 -txn_0019,1463.23,electronics,CA,0 -txn_0020,3063.15,electronics,US,0 -txn_0021,706.07,electronics,UK,1 -txn_0022,1467.8,electronics,UK,0 -txn_0023,1838.15,food,US,1 -txn_0024,2285.79,travel,UK,0 -txn_0025,3928.03,retail,US,0 -txn_0026,1006.37,retail,CA,0 -txn_0027,2576.03,travel,CA,1 -txn_0028,2966.15,retail,US,0 -txn_0029,241.79,travel,CA,0 -txn_0030,3041.65,retail,CA,1 -txn_0031,860.92,electronics,CA,0 -txn_0032,334.61,retail,DE,0 -txn_0033,4744.94,travel,FR,0 -txn_0034,4828.5,travel,UK,0 -txn_0035,4043.9,travel,DE,0 
-txn_0036,1530.02,travel,CA,0 -txn_0037,497.38,travel,US,0 -txn_0038,3424.32,food,CA,0 -txn_0039,2206.36,food,FR,0 -txn_0040,618.97,retail,CA,0 -txn_0041,2480.93,travel,US,0 -txn_0042,181.6,retail,FR,0 -txn_0043,4547.51,travel,CA,0 -txn_0044,1301.31,travel,FR,0 -txn_0045,3315.99,food,UK,0 -txn_0046,1565.44,food,UK,0 -txn_0047,2605.14,electronics,CA,0 -txn_0048,2738.08,food,FR,0 -txn_0049,932.42,electronics,US,0 -txn_0050,4848.23,electronics,CA,1 -txn_0051,3877.91,electronics,UK,1 -txn_0052,4698.1,electronics,UK,0 -txn_0053,4475.19,electronics,CA,0 -txn_0054,2993.52,retail,FR,0 -txn_0055,4610.15,food,UK,0 -txn_0056,451.58,electronics,FR,0 -txn_0057,987.95,travel,UK,0 -txn_0058,235.68,food,UK,0 -txn_0059,1633.4,food,DE,0 -txn_0060,1949.5,electronics,UK,0 -txn_0061,1364.03,retail,US,0 -txn_0062,4145.4,electronics,CA,0 -txn_0063,1790.2,retail,CA,0 -txn_0064,1411.86,travel,FR,0 -txn_0065,2718.05,food,UK,0 -txn_0066,713.21,travel,US,0 -txn_0067,4012.96,retail,FR,0 -txn_0068,382.01,retail,DE,0 -txn_0069,4934.57,travel,FR,0 -txn_0070,3863.5,retail,FR,1 -txn_0071,1001.59,travel,UK,0 -txn_0072,37.56,food,DE,0 -txn_0073,4079.15,travel,FR,0 -txn_0074,3537.22,food,US,1 -txn_0075,3647.75,travel,US,0 -txn_0076,3858.64,retail,CA,0 -txn_0077,379.48,travel,DE,0 -txn_0078,1798.74,retail,DE,0 -txn_0079,588.19,travel,CA,0 -txn_0080,4316.89,travel,FR,0 -txn_0081,3120.26,electronics,DE,0 -txn_0082,1661.18,retail,US,0 -txn_0083,327.16,electronics,US,1 -txn_0084,1561.8,retail,UK,1 -txn_0085,1632.66,electronics,DE,0 -txn_0086,3650.73,food,FR,0 -txn_0087,3191.41,travel,CA,0 -txn_0088,4437.19,retail,CA,0 -txn_0089,2366.35,retail,FR,0 -txn_0090,606.78,travel,UK,0 -txn_0091,3569.09,retail,CA,0 -txn_0092,3806.32,retail,DE,0 -txn_0093,2810.77,electronics,UK,0 -txn_0094,3857.13,retail,DE,0 -txn_0095,2474.04,food,US,0 -txn_0096,2618.44,food,UK,0 -txn_0097,2143.43,travel,CA,0 -txn_0098,136.84,retail,UK,0 -txn_0099,548.38,travel,UK,0 -txn_0100,166.83,food,UK,0 -txn_0101,3185.69,retail,UK,0 
-txn_0102,1578.64,retail,DE,0 -txn_0103,2547.77,travel,US,0 -txn_0104,4538.76,food,FR,0 -txn_0105,1253.97,retail,UK,0 -txn_0106,2057.81,travel,CA,0 -txn_0107,3780.2,food,UK,0 -txn_0108,1151.7,travel,CA,0 -txn_0109,394.13,electronics,DE,0 -txn_0110,1455.86,retail,CA,0 -txn_0111,814.49,electronics,FR,0 -txn_0112,4649.19,food,US,0 -txn_0113,4042.52,electronics,CA,0 -txn_0114,3170.68,electronics,CA,0 -txn_0115,4358.59,food,US,0 -txn_0116,4020.32,travel,FR,0 -txn_0117,940.98,electronics,UK,0 -txn_0118,4463.87,travel,CA,0 -txn_0119,2701.32,electronics,US,0 -txn_0120,4039.13,travel,DE,0 -txn_0121,4481.5,food,US,0 -txn_0122,1596.84,electronics,DE,0 -txn_0123,559.16,food,FR,0 -txn_0124,1147.4,electronics,UK,0 -txn_0125,2141.27,food,US,0 -txn_0126,4091.89,food,CA,0 -txn_0127,4305.05,electronics,FR,0 -txn_0128,44.69,electronics,US,0 -txn_0129,2558.63,electronics,CA,0 -txn_0130,2092.88,retail,US,0 -txn_0131,1118.32,travel,DE,0 -txn_0132,608.13,retail,US,0 -txn_0133,1694.7,retail,US,1 -txn_0134,4715.12,food,US,0 -txn_0135,1622.78,electronics,FR,0 -txn_0136,2598.77,travel,US,0 -txn_0137,3518.06,food,CA,0 -txn_0138,1824.51,retail,DE,0 -txn_0139,4859.19,electronics,DE,0 -txn_0140,4812.61,food,US,0 -txn_0141,1266.39,electronics,FR,0 -txn_0142,2491.27,food,FR,0 -txn_0143,1511.38,electronics,CA,0 -txn_0144,1431.35,retail,US,0 -txn_0145,194.07,travel,DE,0 -txn_0146,3051.73,retail,FR,0 -txn_0147,2518.37,travel,US,0 -txn_0148,266.88,electronics,FR,0 -txn_0149,1400.45,food,FR,0 -txn_0150,4542.25,retail,DE,0 -txn_0151,1205.41,travel,UK,0 -txn_0152,733.03,retail,CA,0 -txn_0153,2452.37,travel,CA,0 -txn_0154,4928.4,retail,DE,0 -txn_0155,1217.86,travel,FR,0 -txn_0156,3363.96,travel,US,0 -txn_0157,3810.48,electronics,FR,0 -txn_0158,1195.81,travel,DE,0 -txn_0159,3643.8,electronics,CA,0 -txn_0160,1845.24,travel,FR,0 -txn_0161,3165.21,food,CA,0 -txn_0162,3171.31,food,US,0 -txn_0163,2683.52,food,CA,0 -txn_0164,460.55,travel,CA,0 -txn_0165,4178.16,electronics,UK,0 -txn_0166,1610.69,electronics,UK,0 
-txn_0167,940.73,travel,UK,0 -txn_0168,213.47,food,CA,0 -txn_0169,2958.56,food,DE,0 -txn_0170,3391.05,electronics,CA,0 -txn_0171,92.77,food,DE,0 -txn_0172,2565.34,retail,DE,0 -txn_0173,1140.21,electronics,UK,0 -txn_0174,3229.41,food,FR,0 -txn_0175,880.09,electronics,US,0 -txn_0176,3457.78,retail,UK,0 -txn_0177,1939.81,electronics,UK,0 -txn_0178,4684.28,electronics,FR,0 -txn_0179,696.23,electronics,US,0 -txn_0180,1711.92,electronics,CA,0 -txn_0181,576.23,retail,CA,0 -txn_0182,4624.22,travel,UK,0 -txn_0183,4387.92,electronics,US,0 -txn_0184,1297.13,electronics,UK,0 -txn_0185,3303.32,food,DE,0 -txn_0186,4087.94,electronics,UK,0 -txn_0187,2780.45,food,UK,0 -txn_0188,2652.96,travel,CA,0 -txn_0189,1216.84,retail,CA,0 -txn_0190,474.58,retail,CA,0 -txn_0191,4487.11,travel,DE,0 -txn_0192,4503.09,retail,CA,0 -txn_0193,3169.18,travel,US,0 -txn_0194,1701.76,travel,FR,0 -txn_0195,1752.56,travel,US,0 -txn_0196,3632.52,retail,US,0 -txn_0197,4486.58,food,CA,0 -txn_0198,4436.56,travel,FR,0 -txn_0199,3901.58,electronics,FR,0 diff --git a/tests/fixtures/test_dataset_high_nulls.csv b/tests/fixtures/test_dataset_high_nulls.csv deleted file mode 100644 index 2a28160..0000000 --- a/tests/fixtures/test_dataset_high_nulls.csv +++ /dev/null @@ -1,201 +0,0 @@ -transaction_id,amount,merchant_category,country,is_fraud -txn_0000,1878.96,travel,CA,0 -txn_0001,4754.06,travel,FR,0 -txn_0002,,food,DE,0 -txn_0003,2997.31,retail,FR,0 -txn_0004,788.53,travel,CA,0 -txn_0005,788.41,travel,UK,0 -txn_0006,,electronics,FR,0 -txn_0007,4332.22,travel,UK,0 -txn_0008,3009.56,travel,DE,0 -txn_0009,3543.28,,US,0 -txn_0010,112.72,food,DE,0 -txn_0011,4849.85,travel,FR,0 -txn_0012,4163.89,,UK,0 -txn_0013,1069.57,retail,UK,0 -txn_0014,917.31,food,CA,0 -txn_0015,925.19,retail,UK,0 -txn_0016,,retail,CA,1 -txn_0017,2628.53,electronics,US,0 -txn_0018,2165.41,travel,FR,0 -txn_0019,1463.23,electronics,CA,0 -txn_0020,3063.15,electronics,US,0 -txn_0021,706.07,electronics,UK,1 -txn_0022,,electronics,UK,0 
-txn_0023,1838.15,food,US,1 -txn_0024,,travel,UK,0 -txn_0025,3928.03,retail,US,0 -txn_0026,1006.37,,CA,0 -txn_0027,,travel,CA,1 -txn_0028,2966.15,retail,US,0 -txn_0029,241.79,,CA,0 -txn_0030,3041.65,retail,CA,1 -txn_0031,860.92,electronics,CA,0 -txn_0032,334.61,retail,DE,0 -txn_0033,4744.94,travel,FR,0 -txn_0034,4828.5,travel,UK,0 -txn_0035,4043.9,,DE,0 -txn_0036,1530.02,travel,CA,0 -txn_0037,497.38,travel,US,0 -txn_0038,3424.32,food,CA,0 -txn_0039,2206.36,food,FR,0 -txn_0040,618.97,retail,CA,0 -txn_0041,2480.93,travel,US,0 -txn_0042,181.6,retail,FR,0 -txn_0043,4547.51,travel,CA,0 -txn_0044,1301.31,travel,FR,0 -txn_0045,3315.99,food,UK,0 -txn_0046,,food,UK,0 -txn_0047,2605.14,,CA,0 -txn_0048,2738.08,food,FR,0 -txn_0049,932.42,electronics,US,0 -txn_0050,4848.23,electronics,CA,1 -txn_0051,3877.91,electronics,UK,1 -txn_0052,4698.1,electronics,UK,0 -txn_0053,4475.19,electronics,CA,0 -txn_0054,2993.52,retail,FR,0 -txn_0055,4610.15,food,UK,0 -txn_0056,451.58,electronics,FR,0 -txn_0057,987.95,travel,UK,0 -txn_0058,235.68,food,UK,0 -txn_0059,1633.4,food,DE,0 -txn_0060,1949.5,electronics,UK,0 -txn_0061,,retail,US,0 -txn_0062,4145.4,electronics,CA,0 -txn_0063,1790.2,retail,CA,0 -txn_0064,,travel,FR,0 -txn_0065,2718.05,,UK,0 -txn_0066,713.21,travel,US,0 -txn_0067,4012.96,retail,FR,0 -txn_0068,382.01,retail,DE,0 -txn_0069,4934.57,travel,FR,0 -txn_0070,,retail,FR,1 -txn_0071,1001.59,,UK,0 -txn_0072,37.56,food,DE,0 -txn_0073,,travel,FR,0 -txn_0074,3537.22,food,US,1 -txn_0075,3647.75,travel,US,0 -txn_0076,3858.64,retail,CA,0 -txn_0077,379.48,travel,DE,0 -txn_0078,1798.74,retail,DE,0 -txn_0079,588.19,travel,CA,0 -txn_0080,4316.89,travel,FR,0 -txn_0081,3120.26,electronics,DE,0 -txn_0082,1661.18,,US,0 -txn_0083,327.16,electronics,US,1 -txn_0084,,retail,UK,1 -txn_0085,1632.66,electronics,DE,0 -txn_0086,3650.73,,FR,0 -txn_0087,3191.41,travel,CA,0 -txn_0088,4437.19,retail,CA,0 -txn_0089,2366.35,retail,FR,0 -txn_0090,606.78,,UK,0 -txn_0091,3569.09,retail,CA,0 -txn_0092,,retail,DE,0 
-txn_0093,2810.77,electronics,UK,0 -txn_0094,3857.13,retail,DE,0 -txn_0095,2474.04,food,US,0 -txn_0096,,food,UK,0 -txn_0097,2143.43,travel,CA,0 -txn_0098,,retail,UK,0 -txn_0099,548.38,,UK,0 -txn_0100,166.83,food,UK,0 -txn_0101,3185.69,,UK,0 -txn_0102,,retail,DE,0 -txn_0103,2547.77,travel,US,0 -txn_0104,4538.76,food,FR,0 -txn_0105,1253.97,retail,UK,0 -txn_0106,2057.81,travel,CA,0 -txn_0107,3780.2,food,UK,0 -txn_0108,,travel,CA,0 -txn_0109,394.13,electronics,DE,0 -txn_0110,1455.86,retail,CA,0 -txn_0111,,electronics,FR,0 -txn_0112,4649.19,food,US,0 -txn_0113,4042.52,electronics,CA,0 -txn_0114,3170.68,electronics,CA,0 -txn_0115,4358.59,food,US,0 -txn_0116,4020.32,travel,FR,0 -txn_0117,940.98,electronics,UK,0 -txn_0118,4463.87,travel,CA,0 -txn_0119,2701.32,electronics,US,0 -txn_0120,4039.13,travel,DE,0 -txn_0121,4481.5,food,US,0 -txn_0122,1596.84,electronics,DE,0 -txn_0123,559.16,food,FR,0 -txn_0124,1147.4,,UK,0 -txn_0125,2141.27,,US,0 -txn_0126,,food,CA,0 -txn_0127,4305.05,electronics,FR,0 -txn_0128,44.69,electronics,US,0 -txn_0129,2558.63,electronics,CA,0 -txn_0130,,retail,US,0 -txn_0131,1118.32,travel,DE,0 -txn_0132,608.13,retail,US,0 -txn_0133,1694.7,,US,1 -txn_0134,,food,US,0 -txn_0135,,electronics,FR,0 -txn_0136,2598.77,travel,US,0 -txn_0137,3518.06,food,CA,0 -txn_0138,1824.51,retail,DE,0 -txn_0139,4859.19,electronics,DE,0 -txn_0140,,food,US,0 -txn_0141,1266.39,electronics,FR,0 -txn_0142,2491.27,food,FR,0 -txn_0143,1511.38,electronics,CA,0 -txn_0144,1431.35,retail,US,0 -txn_0145,194.07,travel,DE,0 -txn_0146,3051.73,retail,FR,0 -txn_0147,2518.37,travel,US,0 -txn_0148,266.88,electronics,FR,0 -txn_0149,1400.45,food,FR,0 -txn_0150,,retail,DE,0 -txn_0151,,travel,UK,0 -txn_0152,733.03,,CA,0 -txn_0153,,travel,CA,0 -txn_0154,4928.4,retail,DE,0 -txn_0155,1217.86,travel,FR,0 -txn_0156,3363.96,travel,US,0 -txn_0157,3810.48,electronics,FR,0 -txn_0158,1195.81,travel,DE,0 -txn_0159,3643.8,electronics,CA,0 -txn_0160,1845.24,travel,FR,0 -txn_0161,3165.21,food,CA,0 
-txn_0162,3171.31,food,US,0 -txn_0163,2683.52,food,CA,0 -txn_0164,460.55,travel,CA,0 -txn_0165,4178.16,electronics,UK,0 -txn_0166,1610.69,electronics,UK,0 -txn_0167,940.73,travel,UK,0 -txn_0168,213.47,,CA,0 -txn_0169,2958.56,food,DE,0 -txn_0170,3391.05,electronics,CA,0 -txn_0171,,food,DE,0 -txn_0172,2565.34,retail,DE,0 -txn_0173,1140.21,electronics,UK,0 -txn_0174,3229.41,food,FR,0 -txn_0175,880.09,electronics,US,0 -txn_0176,3457.78,retail,UK,0 -txn_0177,,electronics,UK,0 -txn_0178,4684.28,,FR,0 -txn_0179,696.23,electronics,US,0 -txn_0180,1711.92,electronics,CA,0 -txn_0181,576.23,retail,CA,0 -txn_0182,,travel,UK,0 -txn_0183,4387.92,electronics,US,0 -txn_0184,1297.13,,UK,0 -txn_0185,3303.32,food,DE,0 -txn_0186,4087.94,electronics,UK,0 -txn_0187,2780.45,,UK,0 -txn_0188,,travel,CA,0 -txn_0189,1216.84,retail,CA,0 -txn_0190,,retail,CA,0 -txn_0191,4487.11,travel,DE,0 -txn_0192,4503.09,retail,CA,0 -txn_0193,3169.18,travel,US,0 -txn_0194,1701.76,travel,FR,0 -txn_0195,1752.56,travel,US,0 -txn_0196,3632.52,retail,US,0 -txn_0197,,food,CA,0 -txn_0198,,travel,FR,0 -txn_0199,3901.58,electronics,FR,0 diff --git a/tests/fixtures/test_metadata.json b/tests/fixtures/test_metadata.json deleted file mode 100644 index 0790d7b..0000000 --- a/tests/fixtures/test_metadata.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "column_definitions": { - "transaction_id": "Unique transaction identifier", - "amount": "Transaction amount in USD", - "merchant_category": "Merchant category code", - "country": "ISO 2-letter country code", - "is_fraud": "Binary fraud label (1=fraud, 0=legitimate)" - }, - "seller_claims": { - "source": "Internal payment processing system", - "date_range": "2024-01-01 to 2024-12-31", - "row_count": "200", - "fraud_rate": "~4%" - } -} \ No newline at end of file From c1fd6341cafd18b5d46fd95a2285e9cca911fd75 Mon Sep 17 00:00:00 2001 From: Parth Thapliyal Date: Sun, 22 Mar 2026 19:56:35 -0400 Subject: [PATCH 12/13] feat: endpoint-driven live E2E tests with real data, buyer/seller 
prompts, renegotiation matrix --- .gitignore | 1 - requirements.txt | 1 + skills/confidential_data_procurement/init.py | 2 +- tests/conftest.py | 84 ++- tests/demo_matrix.json | 131 ++++ tests/test_live_e2e.py | 496 +++++++++++++ tests/test_live_integration.py | 724 ------------------- 7 files changed, 688 insertions(+), 751 deletions(-) create mode 100644 tests/demo_matrix.json create mode 100644 tests/test_live_e2e.py delete mode 100644 tests/test_live_integration.py diff --git a/.gitignore b/.gitignore index 7f1ad88..c121206 100644 --- a/.gitignore +++ b/.gitignore @@ -16,4 +16,3 @@ venv/ eval_results/ scripts/ tests/fixtures/ -tests/demo_matrix.json diff --git a/requirements.txt b/requirements.txt index 7e9032e..1a10176 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,6 +14,7 @@ python-dotenv pytest httpx python-multipart +datasets PyJWT>=2.8.0 supabase>=2.0.0 cryptography>=42.0.0 diff --git a/skills/confidential_data_procurement/init.py b/skills/confidential_data_procurement/init.py index 59dafb2..62cda40 100644 --- a/skills/confidential_data_procurement/init.py +++ b/skills/confidential_data_procurement/init.py @@ -251,7 +251,7 @@ def procurement_init_handler(message: str, conversation: list[dict]) -> dict: f"Max duplicate rate: {policy.max_duplicate_rate:.0%}\n" f"Budget: ${policy.base_price:,.2f} – ${policy.max_budget:,.2f}\n" + (f"Label column: {policy.label_column} (≥ {policy.min_label_rate:.1%})\n" - if policy.label_column else "") + if policy.label_column and policy.min_label_rate is not None else "") + (f"Forbidden columns: {', '.join(policy.forbidden_columns)}\n" if policy.forbidden_columns else "") + "\nShare the instance link with your supplier to begin." 
diff --git a/tests/conftest.py b/tests/conftest.py index fea873a..c1526e8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -41,41 +41,70 @@ def pytest_collection_modifyitems(config, items): # --------------------------------------------------------------------------- # Dataset fixture +# +# Loads dazzle-nu/CIS435-CreditCardFraudDetection from HuggingFace. +# Normalises to: transaction_id, amount, is_fraud, category, merchant +# PII columns retained but NOT included by default — seller variants +# can add them (dob, cc_num) to trigger the forbidden-column rejection. +# +# Falls back to synthetic data if HuggingFace is unavailable (e.g. CI). # --------------------------------------------------------------------------- -def _generate_synthetic_df(n: int = 800) -> pd.DataFrame: +_HF_DATASET = "dazzle-nu/CIS435-CreditCardFraudDetection" +_SAMPLE_N = 1000 + + +def _generate_synthetic_df(n: int = _SAMPLE_N) -> pd.DataFrame: import numpy as np rng = np.random.default_rng(42) return pd.DataFrame({ - "transaction_id": [f"txn_{i:05d}" for i in range(n)], - "amount": rng.uniform(1.0, 500.0, n).round(2), - "merchant_category": rng.choice(["grocery", "gas", "restaurant", "travel", "online"], n), - "is_fraud": (rng.uniform(0, 1, n) < 0.04).astype(int), + "transaction_id": [f"txn_{i:05d}" for i in range(n)], + "amount": rng.uniform(1.0, 500.0, n).round(2), + "category": rng.choice(["grocery", "gas", "restaurant", "travel", "online"], n), + "merchant": [f"merchant_{i % 50}" for i in range(n)], + "is_fraud": (rng.uniform(0, 1, n) < 0.04).astype(int), + # PII cols — available for forbidden-column tests + "dob": [f"19{(i % 60 + 40):02d}-01-01" for i in range(n)], + "cc_num": [f"4{i:015d}" for i in range(n)], }) +def _load_hf_df() -> pd.DataFrame | None: + try: + from datasets import load_dataset + ds = load_dataset(_HF_DATASET, split="train") + df = ds.to_pandas() + + # Normalise column names + df = df.rename(columns={"trans_num": "transaction_id", "amt": "amount"}) + if 
"transaction_id" not in df.columns: + df.insert(0, "transaction_id", [f"txn_{i:06d}" for i in range(len(df))]) + + required = {"transaction_id", "amount", "is_fraud"} + if not required.issubset(df.columns): + return None + + # Stratified sample: keep fraud/non-fraud ratio, cap at _SAMPLE_N + fraud = df[df["is_fraud"] == 1].sample(min(40, (df["is_fraud"] == 1).sum()), random_state=42) + nonfraud = df[df["is_fraud"] == 0].sample(_SAMPLE_N - len(fraud), random_state=42) + df = pd.concat([fraud, nonfraud]).sample(frac=1, random_state=42).reset_index(drop=True) + + print(f"[conftest] HuggingFace dataset loaded: {len(df)} rows, " + f"fraud rate={df['is_fraud'].mean():.1%}, columns={list(df.columns)}") + return df + except Exception as e: + print(f"[conftest] HuggingFace load failed ({e}) — using synthetic data") + return None + + @pytest.fixture(scope="session") def base_df() -> pd.DataFrame: - """Session-scoped clean fraud-like DataFrame (~800 rows, synthetic fallback).""" - url = "https://raw.githubusercontent.com/dsrscientist/dataset1/master/creditcard_small.csv" - try: - import io, requests - resp = requests.get(url, timeout=10) - if resp.status_code == 200: - df = pd.read_csv(io.StringIO(resp.text)) - rename = {} - cols_lower = {c.lower(): c for c in df.columns} - if "transaction_id" not in cols_lower and "id" in cols_lower: - rename[cols_lower["id"]] = "transaction_id" - if "is_fraud" not in cols_lower and "class" in cols_lower: - rename[cols_lower["class"]] = "is_fraud" - if rename: - df = df.rename(columns=rename) - if {"transaction_id", "amount", "is_fraud"}.issubset(df.columns): - return df.head(800) - except Exception: - pass - return _generate_synthetic_df(800) + """ + Session-scoped DataFrame from dazzle-nu/CIS435-CreditCardFraudDetection (~1000 rows). + Falls back to synthetic if HuggingFace is unavailable. 
+ """ + df = _load_hf_df() + return df if df is not None else _generate_synthetic_df() # --------------------------------------------------------------------------- @@ -141,12 +170,16 @@ def matrix_results() -> Generator[list[dict], None, None]: print("=" * 90) # --- Save JSON --- + # Pull buyer prompt from first eval row if present (set by test_live_e2e.py) + buyer_prompt = eval_rows[0].get("buyer_prompt") if eval_rows else None + output = { "title": "Confidential Data Procurement — Demo Results", "generated": str(datetime.date.today()), "model": "deepseek-ai/DeepSeek-V3.1", "pipeline": "deterministic → LLM agent (schema match + claim verify) → guardrails", "note": "base_price=0: bad data → payment approaches $0. Reserve not met → deal rejected.", + "buyer_prompt": buyer_prompt, "evaluation_matrix": [ { "id": i + 1, @@ -154,6 +187,7 @@ def matrix_results() -> Generator[list[dict], None, None]: "narrative": r.get("narrative", ""), "seller_variant": r.get("seller", ""), "buyer_variant": r.get("buyer", ""), + "seller_input": r.get("seller_input"), "reserve_price": r.get("reserve"), "quality_score": round(r["quality"], 4) if r.get("quality") is not None else None, "proposed_payment": r.get("payment"), diff --git a/tests/demo_matrix.json b/tests/demo_matrix.json new file mode 100644 index 0000000..f6b0fdf --- /dev/null +++ b/tests/demo_matrix.json @@ -0,0 +1,131 @@ +{ + "title": "Confidential Data Procurement \u2014 Demo Results", + "generated": "2026-03-22", + "model": "deepseek-ai/DeepSeek-V3.1", + "pipeline": "deterministic \u2192 LLM agent (schema match + claim verify) \u2192 guardrails", + "note": "base_price=0: bad data \u2192 payment approaches $0. Reserve not met \u2192 deal rejected.", + "buyer_prompt": "We are building a machine learning pipeline for real-time fraud detection in payment processing. 
We need a labeled transaction dataset with four specific columns: transaction_id (a unique identifier per transaction), amount (the transaction value in USD), is_fraud (a binary label \u2014 1 for fraudulent, 0 for legitimate), and category (merchant category code). The dataset must contain at least 500 rows. We can tolerate at most 5% missing values and at most 10% duplicate rows. No personally identifiable information should appear \u2014 date of birth, credit card numbers, social security numbers, or any customer names and addresses are strictly not acceptable. This data will train a gradient boosting classifier so label accuracy and field completeness are critical. Our maximum budget for a perfect dataset is $800. We have no floor price \u2014 if the data is unusable we expect to pay nothing.", + "evaluation_matrix": [ + { + "id": 1, + "scenario": "Full Satisfaction", + "narrative": "Buyer described exact requirements in natural language \u2014 LLM extracted the policy. Seller uploaded clean real transaction data (HuggingFace) with all four required columns. Agent verified claims and found no schema gaps. Both parties accepted the enclave's offer.", + "seller_variant": "clean", + "buyer_variant": "standard ($800)", + "seller_input": { + "column_definitions": { + "transaction_id": "Unique identifier per transaction", + "amount": "Transaction value in USD", + "is_fraud": "Binary fraud label \u2014 1 if fraudulent, 0 otherwise", + "category": "Merchant category code" + }, + "seller_claims": { + "completeness": "All four columns are fully populated \u2014 zero missing values", + "label_rate": "Approximately 4% of transactions are labeled as fraudulent" + } + }, + "reserve_price": 150.0, + "quality_score": 1.0, + "proposed_payment": 800.0, + "deal": true, + "settlement_status": "pending_approval", + "notes": [], + "explanation": "All four required columns (transaction_id, amount, is_fraud, category) are present with exact name matches. 
Both seller claims are fully verified: all columns show 0.0% null rates, and the fraud label distribution shows exactly 4.0% of transactions are labeled as fraudulent (40 out of 1000). The dataset demonstrates excellent data quality with no missing values and meets the specified label distribution requirements precisely.", + "schema_matching": { + "transaction_id": "transaction_id", + "amount": "amount", + "is_fraud": "is_fraud", + "category": "category" + }, + "claim_verification": { + "completeness: All four columns are fully populated \u2014 zero missing values": "verified", + "label_rate: Approximately 4% of transactions are labeled as fraudulent": "verified" + } + }, + { + "id": 2, + "scenario": "Partial Satisfaction", + "narrative": "Seller omitted the 'category' column, which the buyer explicitly required. Agent identified the schema gap and noted it in the explanation. Quality score and payment reflect the LLM's assessment of the partial dataset.", + "seller_variant": "partial (missing category)", + "buyer_variant": "standard ($800)", + "seller_input": { + "column_definitions": { + "transaction_id": "Unique identifier per transaction", + "amount": "Transaction value in USD", + "is_fraud": "Binary fraud label" + }, + "seller_claims": { + "completeness": "Data is mostly complete" + } + }, + "reserve_price": 50.0, + "quality_score": 1.0, + "proposed_payment": 800.0, + "deal": true, + "settlement_status": "pending_approval", + "notes": [], + "explanation": "The dataset contains all three required columns (transaction_id, amount, is_fraud) with exact name matches, but is missing the required category column. The seller's completeness claim is verified as all columns show 0.0% null rates across all 1000 rows. Notably, the fraud detection label shows a 4% fraud rate (40 fraudulent transactions out of 1000), which appears reasonable for typical transaction monitoring scenarios. 
The transaction amounts range from $1.01 to $1,315 with a mean of $87.34.", + "schema_matching": { + "transaction_id": "transaction_id", + "amount": "amount", + "is_fraud": "is_fraud", + "category": null + }, + "claim_verification": { + "completeness: Data is mostly complete": "verified" + } + }, + { + "id": 3, + "scenario": "Critical: >50% Duplicates", + "narrative": "Seller submitted a dataset where every row is duplicated \u2014 over 50% dup rate. The deterministic layer flags this as a critical violation and rejects immediately. No LLM call. Payment = $0. Seller's reserve price is irrelevant.", + "seller_variant": "duplicated (>50%)", + "buyer_variant": "standard ($800)", + "seller_input": { + "column_definitions": { + "transaction_id": "Unique identifier per transaction", + "amount": "Transaction value in USD", + "is_fraud": "Binary fraud label \u2014 1 if fraudulent, 0 otherwise", + "category": "Merchant category code" + }, + "seller_claims": {}, + "note": "Every row duplicated \u2014 submitted 2000 rows from a 1000-row base dataset" + }, + "reserve_price": 0.0, + "quality_score": 0.0, + "proposed_payment": 0.0, + "deal": false, + "settlement_status": "rejected", + "notes": [ + "Duplicate rate (50.0%) exceeds critical threshold (50%). Dataset quality is insufficient." + ], + "explanation": null, + "schema_matching": null, + "claim_verification": null + } + ], + "renegotiation_matrix": [ + { + "id": 1, + "scenario": "Renegotiation \u2014 Terms Overlap", + "narrative": "Enclave offers $770 for partial data (missing category). Buyer revises down to $678 (\u221212%). Seller lowers floor to $608 (\u221221%). $678 \u2265 $608 \u2192 deal at buyer's revised offer. 
Neither party saw the other's private number.", + "initial_offer": 770.0, + "buyer_action": "renegotiate \u2192 $678", + "supplier_action": "renegotiate \u2192 $608", + "final_payment": 678, + "deal": true, + "settlement_status": "authorized" + }, + { + "id": 2, + "scenario": "Renegotiation \u2014 No Overlap", + "narrative": "Enclave offers $770 for partial data. Buyer drops hard to $462 (\u221240%). Seller holds firm at $732 (\u22125%). $462 < $732 \u2192 deal falls through. One round used \u2014 both sides walked away.", + "initial_offer": 770.0, + "buyer_action": "renegotiate \u2192 $462", + "supplier_action": "renegotiate \u2192 $732", + "final_payment": null, + "deal": false, + "settlement_status": "rejected" + } + ] +} \ No newline at end of file diff --git a/tests/test_live_e2e.py b/tests/test_live_e2e.py new file mode 100644 index 0000000..52db6ed --- /dev/null +++ b/tests/test_live_e2e.py @@ -0,0 +1,496 @@ +""" +Live E2E tests for confidential_data_procurement. + +Tests the full API endpoint flow with real HuggingFace transaction data and a real LLM. +No hardcoded BuyerPolicy — buyer describes requirements in natural language via POST /init. + +Scenarios: + 1. Full satisfaction — clean seller data, all required columns → deal, both accept + 2. Partial satisfaction — seller drops 'category' → lower payment (schema gap flagged) + 3. Bad data — >50% duplicate rows → critical rejection, no LLM agent ran + 4. Renegotiation overlap — partial data, both negotiate, terms meet → authorized + 5. Renegotiation no overlap — partial data, buyer drops 40%, seller holds → rejected + +NOTE: All tests are @pytest.mark.live. They are skipped unless CONCLAVE_NEARAI_API_KEY +is set in the environment. 
Run individually with: + set -a && source .env && set +a + ./venv/bin/python -m pytest tests/test_live_e2e.py -v -s +""" +from __future__ import annotations + +import json +import uuid + +import pandas as pd +import pytest +from fastapi.testclient import TestClient + +import api.routes as routes +from skills.confidential_data_procurement.ingest import _datasets + + +# --------------------------------------------------------------------------- +# Buyer prompt — natural language, ~150 words, all required fields included +# --------------------------------------------------------------------------- + +BUYER_PROMPT = ( + "We are building a machine learning pipeline for real-time fraud detection in " + "payment processing. We need a labeled transaction dataset with four specific columns: " + "transaction_id (a unique identifier per transaction), amount (the transaction value " + "in USD), is_fraud (a binary label — 1 for fraudulent, 0 for legitimate), and category " + "(merchant category code). The dataset must contain at least 500 rows. We can tolerate " + "at most 5% missing values and at most 10% duplicate rows. No personally identifiable " + "information should appear — date of birth, credit card numbers, social security numbers, " + "or any customer names and addresses are strictly not acceptable. This data will train a " + "gradient boosting classifier so label accuracy and field completeness are critical. " + "Our maximum budget for a perfect dataset is $800. We have no floor price — if the " + "data is unusable we expect to pay nothing." 
+) + + +# --------------------------------------------------------------------------- +# Seller metadata +# --------------------------------------------------------------------------- + +_META_CLEAN = json.dumps({ + "column_definitions": { + "transaction_id": "Unique identifier per transaction", + "amount": "Transaction value in USD", + "is_fraud": "Binary fraud label — 1 if fraudulent, 0 otherwise", + "category": "Merchant category code", + }, + "seller_claims": { + "completeness": "All four columns are fully populated — zero missing values", + "label_rate": "Approximately 4% of transactions are labeled as fraudulent", + }, +}).encode() + +_META_PARTIAL = json.dumps({ + "column_definitions": { + "transaction_id": "Unique identifier per transaction", + "amount": "Transaction value in USD", + "is_fraud": "Binary fraud label", + }, + "seller_claims": { + "completeness": "Data is mostly complete", + }, +}).encode() + + +# --------------------------------------------------------------------------- +# Session fixtures +# --------------------------------------------------------------------------- + +@pytest.fixture(scope="session") +def app_client(): + """Session-scoped TestClient — state persists across all live E2E tests.""" + # Clear any leftover state from prior test files + routes._instances.clear() + routes._submissions.clear() + routes._results.clear() + routes._tokens.clear() + routes._registrations.clear() + _datasets.clear() + from main import app + with TestClient(app) as client: + yield client + _datasets.clear() + + +@pytest.fixture(scope="session") +def buyer_init(app_client): + """ + Single real LLM conversation: buyer describes requirements → BuyerPolicy extracted. + Session-scoped — runs once, all scenario tests share the same instance. 
+ """ + r1 = app_client.post("/init", json={ + "skill_name": "confidential_data_procurement", + "message": "I want to set up a data procurement instance.", + }) + assert r1.status_code == 200, r1.text + instance_id = r1.json()["instance_id"] + + r2 = app_client.post("/init", json={ + "skill_name": "confidential_data_procurement", + "message": BUYER_PROMPT, + "instance_id": instance_id, + }) + assert r2.status_code == 200, r2.text + + # If LLM asks a follow-up, give one more nudge + if r2.json().get("status") != "ready": + r3 = app_client.post("/init", json={ + "skill_name": "confidential_data_procurement", + "message": "That covers everything. Please finalize the policy.", + "instance_id": instance_id, + }) + assert r3.status_code == 200, r3.text + assert r3.json().get("status") == "ready", ( + f"Init handler did not reach ready after 3 turns: {r3.json().get('message')}" + ) + admin_token = r3.json()["admin_token"] + else: + admin_token = r2.json()["admin_token"] + + print(f"\n[buyer_init] instance_id={instance_id}, admin_token={admin_token[:12]}...") + return instance_id, admin_token + + +# --------------------------------------------------------------------------- +# Seller data builders (from real HuggingFace base_df) +# --------------------------------------------------------------------------- + +def _clean_csv(base_df: pd.DataFrame) -> bytes: + """All four required columns, no corruption.""" + cols = [c for c in ["transaction_id", "amount", "is_fraud", "category"] + if c in base_df.columns] + return base_df[cols].to_csv(index=False).encode() + + +def _partial_csv(base_df: pd.DataFrame) -> bytes: + """Drop 'category' — buyer requires it. 
Everything else intact.""" + cols = [c for c in ["transaction_id", "amount", "is_fraud"] + if c in base_df.columns] + return base_df[cols].to_csv(index=False).encode() + + +def _bad_csv(base_df: pd.DataFrame) -> bytes: + """Duplicate every row — produces exactly 50% dup rate, triggers critical rejection.""" + cols = [c for c in ["transaction_id", "amount", "is_fraud", "category"] + if c in base_df.columns] + df = base_df[cols] + return pd.concat([df, df]).to_csv(index=False).encode() + + +# --------------------------------------------------------------------------- +# API helpers +# --------------------------------------------------------------------------- + +def _register(client, instance_id: str) -> str: + r = client.post("/register", json={"instance_id": instance_id}) + assert r.status_code == 200, r.text + return r.json()["user_token"] + + +def _upload(client, user_token: str, csv_bytes: bytes, metadata_bytes: bytes) -> str: + r = client.post( + "/upload", + files={ + "csv_file": ("dataset.csv", csv_bytes, "text/csv"), + "metadata_file": ("metadata.json", metadata_bytes, "application/json"), + }, + headers={"X-Instance-Token": user_token}, + ) + assert r.status_code == 200, r.text + return r.json()["dataset_id"] + + +def _submit(client, user_token: str, dataset_id: str, sub_id: str, reserve: float) -> dict: + r = client.post( + "/submit", + json={ + "submission_id": sub_id, + "dataset_id": dataset_id, + "dataset_name": "seller_data.csv", + "reserve_price": reserve, + }, + headers={"X-Instance-Token": user_token}, + ) + assert r.status_code == 200, r.text + return r.json() + + +def _get_result(client, sub_id: str, token: str) -> dict: + r = client.get(f"/results/{sub_id}", headers={"X-Instance-Token": token}) + assert r.status_code == 200, r.text + return r.json() + + +def _respond(client, sub_id: str, action: str, token: str, + revised_value: float | None = None) -> dict: + body: dict = {"submission_id": sub_id, "action": action} + if revised_value is not 
None: + body["revised_value"] = revised_value + r = client.post("/respond", json=body, headers={"X-Instance-Token": token}) + assert r.status_code == 200, r.text + return r.json() + + +def _run_pipeline(client, buyer_init, csv_bytes: bytes, metadata_bytes: bytes, + reserve: float): + """ + Full supplier flow: register → upload → submit → get_result. + Returns (result_dict, admin_token, user_token, sub_id). + """ + instance_id, admin_token = buyer_init + user_token = _register(client, instance_id) + dataset_id = _upload(client, user_token, csv_bytes, metadata_bytes) + sub_id = str(uuid.uuid4())[:12] + _submit(client, user_token, dataset_id, sub_id, reserve) + result = _get_result(client, sub_id, admin_token) + print(f"\n pipeline → sub={sub_id} deal={result.get('deal')} " + f"quality={result.get('quality_score')} payment=${result.get('proposed_payment')}") + return result, admin_token, user_token, sub_id + + +# --------------------------------------------------------------------------- +# Scenario 1: Full satisfaction +# --------------------------------------------------------------------------- + +@pytest.mark.live +def test_full_satisfaction(app_client, base_df, buyer_init, matrix_results): + """ + 100% satisfied: clean data, all required columns, honest claims. + Both parties accept the enclave's offer → authorized. 
+ """ + result, admin_token, user_token, sub_id = _run_pipeline( + app_client, buyer_init, + _clean_csv(base_df), _META_CLEAN, reserve=150.0, + ) + + assert result.get("deal") is True, f"Expected deal=True, got: {result}" + assert result.get("settlement_status") == "pending_approval" + + _respond(app_client, sub_id, "accept", admin_token) + respond_result = _respond(app_client, sub_id, "accept", user_token) + assert respond_result["settlement_status"] == "authorized" + + # Fetch full result to verify release_token issued + final = _get_result(app_client, sub_id, admin_token) + assert final.get("release_token") is not None + + matrix_results.append({ + "type": "evaluation", + "scenario": "Full Satisfaction", + "narrative": ( + "Buyer described exact requirements in natural language — LLM extracted the policy. " + "Seller uploaded clean real transaction data (HuggingFace) with all four required columns. " + "Agent verified claims and found no schema gaps. Both parties accepted the enclave's offer." 
+ ), + "buyer_prompt": BUYER_PROMPT, + "seller_input": { + "column_definitions": { + "transaction_id": "Unique identifier per transaction", + "amount": "Transaction value in USD", + "is_fraud": "Binary fraud label — 1 if fraudulent, 0 otherwise", + "category": "Merchant category code", + }, + "seller_claims": { + "completeness": "All four columns are fully populated — zero missing values", + "label_rate": "Approximately 4% of transactions are labeled as fraudulent", + }, + }, + "seller": "clean", + "buyer": "standard ($800)", + "reserve": 150.0, + "quality": result.get("quality_score"), + "payment": result.get("proposed_payment"), + "deal": True, + "notes": result.get("notes", []), + "explanation": result.get("explanation", ""), + "schema_matching": result.get("schema_matching"), + "claim_verification": result.get("claim_verification"), + }) + + +# --------------------------------------------------------------------------- +# Scenario 2: Partial satisfaction +# --------------------------------------------------------------------------- + +@pytest.mark.live +def test_partial_satisfaction(app_client, base_df, buyer_init, matrix_results): + """ + ~80% satisfied: seller drops 'category' (buyer required it). + Agent penalises schema score — payment is proportionally lower. + """ + result, _, _, _ = _run_pipeline( + app_client, buyer_init, + _partial_csv(base_df), _META_PARTIAL, reserve=50.0, + ) + + # Agent should at minimum note the missing column in its explanation + explanation = result.get("explanation", "") + assert "category" in explanation.lower(), ( + f"Expected agent to flag missing 'category' column. Explanation: {explanation}" + ) + + matrix_results.append({ + "type": "evaluation", + "scenario": "Partial Satisfaction", + "narrative": ( + "Seller omitted the 'category' column, which the buyer explicitly required. " + "Agent identified the schema gap and noted it in the explanation. 
"
+            "Quality score and payment reflect the LLM's assessment of the partial dataset."
+        ),
+        "buyer_prompt": BUYER_PROMPT,
+        "seller_input": {
+            "column_definitions": {
+                "transaction_id": "Unique identifier per transaction",
+                "amount": "Transaction value in USD",
+                "is_fraud": "Binary fraud label",
+            },
+            "seller_claims": {
+                "completeness": "Data is mostly complete",
+            },
+        },
+        "seller": "partial (missing category)",
+        "buyer": "standard ($800)",
+        "reserve": 50.0,
+        "quality": result.get("quality_score"),
+        "payment": result.get("proposed_payment"),
+        "deal": result.get("deal"),
+        "notes": result.get("notes", []),
+        "explanation": result.get("explanation", ""),
+        "schema_matching": result.get("schema_matching"),
+        "claim_verification": result.get("claim_verification"),
+    })
+
+
+# ---------------------------------------------------------------------------
+# Scenario 3: Bad data — critical rejection
+# ---------------------------------------------------------------------------
+
+@pytest.mark.live
+def test_bad_data_rejected(app_client, base_df, buyer_init, matrix_results):
+    """
+    Critical: 50% duplicate rows — hits the critical threshold → immediate rejection by deterministic layer.
+    No LLM agent ran — explanation is absent.
+    """
+    result, _, _, _ = _run_pipeline(
+        app_client, buyer_init,
+        _bad_csv(base_df), _META_CLEAN, reserve=0.0,
+    )
+
+    assert result.get("deal") is False
+    assert not result.get("explanation"), "Agent should not have run for critical rejection"
+
+    matrix_results.append({
+        "type": "evaluation",
+        "scenario": "Critical: >50% Duplicates",
+        "narrative": (
+            "Seller submitted a dataset where every row is duplicated — over 50% dup rate. "
+            "The deterministic layer flags this as a critical violation and rejects immediately. "
+            "No LLM call. Payment = $0. Seller's reserve price is irrelevant."
+ ), + "buyer_prompt": BUYER_PROMPT, + "seller_input": { + "column_definitions": { + "transaction_id": "Unique identifier per transaction", + "amount": "Transaction value in USD", + "is_fraud": "Binary fraud label — 1 if fraudulent, 0 otherwise", + "category": "Merchant category code", + }, + "seller_claims": {}, + "note": "Every row duplicated — submitted 2000 rows from a 1000-row base dataset", + }, + "seller": "duplicated (>50%)", + "buyer": "standard ($800)", + "reserve": 0.0, + "quality": 0.0, + "payment": result.get("proposed_payment", 0), + "deal": False, + "notes": result.get("notes", []), + "explanation": None, + "schema_matching": None, + "claim_verification": None, + }) + + +# --------------------------------------------------------------------------- +# Scenario 4: Renegotiation — terms overlap → deal +# --------------------------------------------------------------------------- + +@pytest.mark.live +def test_renegotiation_overlap(app_client, base_df, buyer_init, matrix_results): + """ + Partial data evaluated by full pipeline. Enclave proposes an offer. + Buyer revises down 12%, seller lowers floor 21% — they overlap → deal. + Amounts derived from the actual pipeline result (no hardcoding). + """ + result, admin_token, user_token, sub_id = _run_pipeline( + app_client, buyer_init, + _partial_csv(base_df), _META_PARTIAL, reserve=50.0, + ) + assert result.get("settlement_status") == "pending_approval", ( + f"Expected pending_approval for renegotiation test, got: {result.get('settlement_status')}" + ) + + p = result["proposed_payment"] + buyer_revised = round(p * 0.88) # buyer cuts 12% + supplier_revised = round(p * 0.79) # seller lowers floor 21% + + _respond(app_client, sub_id, "renegotiate", admin_token, buyer_revised) + respond_result = _respond(app_client, sub_id, "renegotiate", user_token, supplier_revised) + assert respond_result["settlement_status"] == "authorized", ( + f"Expected authorized, got: {respond_result['settlement_status']}. 
" + f"buyer_revised={buyer_revised}, supplier_revised={supplier_revised}" + ) + + # Fetch full result to verify final payment + final = _get_result(app_client, sub_id, admin_token) + assert final["proposed_payment"] == buyer_revised + + matrix_results.append({ + "type": "renegotiation", + "scenario": "Renegotiation — Terms Overlap", + "narrative": ( + f"Enclave offers ${p:.0f} for partial data (missing category). " + f"Buyer revises down to ${buyer_revised:.0f} (−12%). " + f"Seller lowers floor to ${supplier_revised:.0f} (−21%). " + f"${buyer_revised:.0f} ≥ ${supplier_revised:.0f} → deal at buyer's revised offer. " + "Neither party saw the other's private number." + ), + "initial_offer": p, + "buyer_action": f"renegotiate → ${buyer_revised:.0f}", + "supplier_action": f"renegotiate → ${supplier_revised:.0f}", + "final_payment": buyer_revised, + "deal": True, + }) + + +# --------------------------------------------------------------------------- +# Scenario 5: Renegotiation — no overlap → rejected +# --------------------------------------------------------------------------- + +@pytest.mark.live +def test_renegotiation_no_overlap(app_client, base_df, buyer_init, matrix_results): + """ + Partial data evaluated. Buyer drops 40%, seller barely moves (−5%). + No overlap → deal rejected. One renegotiation round used, deal falls through. 
+ """ + result, admin_token, user_token, sub_id = _run_pipeline( + app_client, buyer_init, + _partial_csv(base_df), _META_PARTIAL, reserve=50.0, + ) + assert result.get("settlement_status") == "pending_approval", ( + f"Expected pending_approval for renegotiation test, got: {result.get('settlement_status')}" + ) + + p = result["proposed_payment"] + buyer_revised = round(p * 0.60) # buyer drops 40% + supplier_revised = round(p * 0.95) # seller barely moves + + _respond(app_client, sub_id, "renegotiate", admin_token, buyer_revised) + final = _respond(app_client, sub_id, "renegotiate", user_token, supplier_revised) + + assert final["settlement_status"] == "rejected", ( + f"Expected rejected, got: {final['settlement_status']}. " + f"buyer_revised={buyer_revised}, supplier_revised={supplier_revised}" + ) + + matrix_results.append({ + "type": "renegotiation", + "scenario": "Renegotiation — No Overlap", + "narrative": ( + f"Enclave offers ${p:.0f} for partial data. " + f"Buyer drops hard to ${buyer_revised:.0f} (−40%). " + f"Seller holds firm at ${supplier_revised:.0f} (−5%). " + f"${buyer_revised:.0f} < ${supplier_revised:.0f} → deal falls through. " + "One round used — both sides walked away." + ), + "initial_offer": p, + "buyer_action": f"renegotiate → ${buyer_revised:.0f}", + "supplier_action": f"renegotiate → ${supplier_revised:.0f}", + "final_payment": None, + "deal": False, + }) diff --git a/tests/test_live_integration.py b/tests/test_live_integration.py deleted file mode 100644 index 3b81f27..0000000 --- a/tests/test_live_integration.py +++ /dev/null @@ -1,724 +0,0 @@ -""" -Live integration test suite for confidential_data_procurement. - -Budget design: base_price=0 on all buyer policies. - → Payment = max_budget * quality_score - → Bad data → payment near $0. Critical failure → $0. - → All amounts in $0–$800 range for demo clarity. - -Sections: - 1. Deterministic (no LLM, always fast) — 11 tests - 2. Agent layer (live LLM, @pytest.mark.live) — 7 tests - 3. 
Full pipeline (live LLM, @pytest.mark.live) — 7 tests - 4. Renegotiation scenarios (deterministic) — 5 tests - -Run fast only: - ./venv/bin/python -m pytest tests/test_live_integration.py -v -m "not live" - -Run all + print matrix: - ./venv/bin/python -m pytest tests/test_live_integration.py -v -s -""" -from __future__ import annotations - -import uuid - -import pandas as pd -import pytest - -from skills.confidential_data_procurement.ingest import _datasets -from skills.confidential_data_procurement.deterministic import ( - compute_metrics, - run_deterministic, -) -from skills.confidential_data_procurement.models import BuyerPolicy, SupplierSubmission -from skills.confidential_data_procurement.agent import run_agent -from skills.confidential_data_procurement import run_skill -from skills.confidential_data_procurement import procurement_respond_handler - - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - -def _register(df: pd.DataFrame, metadata: dict | None = None) -> str: - did = str(uuid.uuid4()) - meta = metadata or {} - _datasets[did] = { - "df": df, - "metadata": meta, - "column_definitions": meta.get("column_definitions", {}), - "seller_claims": meta.get("seller_claims", {}), - "instance_id": "test_integration", - } - return did - - -def _run_pipeline(base_df, seller_fn, policy: BuyerPolicy, reserve: float = 200.0) -> dict: - df, meta = seller_fn(base_df) - did = _register(df, meta) - try: - sub = SupplierSubmission( - submission_id=str(uuid.uuid4()), - dataset_id=did, - dataset_name="test.csv", - reserve_price=reserve, - ) - resp = run_skill([sub], policy) - return resp.results[0] if resp.results else {} - finally: - _datasets.pop(did, None) - - -# --------------------------------------------------------------------------- -# Seller variants -# --------------------------------------------------------------------------- - -def 
_seller_clean(base_df): - df = base_df[["transaction_id", "amount", "is_fraud"]].copy().reset_index(drop=True) - meta = { - "column_definitions": { - "transaction_id": "Unique ID per transaction", - "amount": "Transaction amount in USD", - "is_fraud": "1 if fraudulent, 0 otherwise", - }, - "seller_claims": { - "low_fraud_rate": "Approximately 4% fraud rate", - "no_missing_values": "All fields fully populated", - }, - } - return df, meta - - -def _seller_null_corrupted(base_df): - """30% of amount values nulled — seller falsely claims no missing values.""" - df = base_df[["transaction_id", "amount", "is_fraud"]].copy().reset_index(drop=True) - n = int(len(df) * 0.30) - df.loc[:n, "amount"] = None - meta = { - "column_definitions": { - "transaction_id": "Unique ID", - "amount": "Transaction amount", - "is_fraud": "Fraud flag", - }, - "seller_claims": {"no_missing_values": "All fields fully populated"}, - } - return df, meta - - -def _seller_dup_corrupted(base_df): - """Entire dataset duplicated → duplicate_rate = 50% → critical failure.""" - df = base_df[["transaction_id", "amount", "is_fraud"]].copy().reset_index(drop=True) - df = pd.concat([df, df], ignore_index=True) - return df, {} - - -def _seller_forbidden_col(base_df): - """SSN column present → critical failure, no LLM.""" - df = base_df[["transaction_id", "amount", "is_fraud"]].copy().reset_index(drop=True) - df["ssn"] = "xxx-xx-0000" - return df, {} - - -def _seller_missing_col(base_df): - """is_fraud dropped — buyer requires it → schema penalty.""" - df = base_df[["transaction_id", "amount"]].copy().reset_index(drop=True) - meta = { - "column_definitions": { - "transaction_id": "Unique ID", - "amount": "Transaction amount", - }, - "seller_claims": {}, - } - return df, meta - - -def _seller_fuzzy_schema(base_df): - """is_fraud renamed to fraud_label — tests agent semantic matching.""" - df = base_df[["transaction_id", "amount", "is_fraud"]].copy().reset_index(drop=True) - df = df.rename(columns={"is_fraud": 
"fraud_label"}) - meta = { - "column_definitions": { - "transaction_id": "Unique ID", - "amount": "Transaction amount", - "fraud_label": "Binary fraud indicator (1=fraud, 0=legit)", - }, - "seller_claims": {}, - } - return df, meta - - -def _seller_multi_corrupt(base_df): - """25% nulls + missing is_fraud — compound quality damage.""" - df = base_df[["transaction_id", "amount"]].copy().reset_index(drop=True) - n = int(len(df) * 0.25) - df.loc[:n, "amount"] = None - return df, {} - - -# --------------------------------------------------------------------------- -# Buyer policies — base_price=0 on all (bad data → payment near $0) -# --------------------------------------------------------------------------- - -def _buyer_lenient() -> BuyerPolicy: - """Tolerant buyer: accepts moderate nulls, low row floor, $800 ceiling.""" - return BuyerPolicy( - required_columns=["transaction_id", "amount", "is_fraud"], - min_rows=200, - max_null_rate=0.35, - max_duplicate_rate=0.20, - min_label_rate=0.01, - label_column="is_fraud", - forbidden_columns=["ssn", "dob"], - max_budget=800.0, - base_price=0.0, - ) - - -def _buyer_strict() -> BuyerPolicy: - """Strict buyer: demands 900 rows, low null tolerance, $800 ceiling.""" - return BuyerPolicy( - required_columns=["transaction_id", "amount", "is_fraud"], - min_rows=900, - max_null_rate=0.05, - max_duplicate_rate=0.05, - min_label_rate=0.02, - label_column="is_fraud", - forbidden_columns=["ssn", "dob"], - max_budget=800.0, - base_price=0.0, - ) - - -def _buyer_budget_tight() -> BuyerPolicy: - """Budget-conscious buyer: same quality expectations, $300 ceiling.""" - return BuyerPolicy( - required_columns=["transaction_id", "amount", "is_fraud"], - min_rows=200, - max_null_rate=0.35, - max_duplicate_rate=0.20, - min_label_rate=0.01, - label_column="is_fraud", - forbidden_columns=["ssn", "dob"], - max_budget=300.0, - base_price=0.0, - ) - - -# --------------------------------------------------------------------------- -# Section 1: 
Deterministic layer (no LLM) -# --------------------------------------------------------------------------- - -class TestDeterministic: - """Fast correctness checks — direct calls to run_deterministic.""" - - def setup_method(self): - self._ids: list[str] = [] - - def teardown_method(self): - for did in self._ids: - _datasets.pop(did, None) - - def _run(self, df, policy, metadata=None, reserve=200.0): - did = _register(df, metadata) - self._ids.append(did) - return run_deterministic(did, policy, reserve) - - def test_clean_lenient_high_quality(self, base_df): - """Clean data + lenient policy → no critical failure, quality > 0.7.""" - df, meta = _seller_clean(base_df) - r = self._run(df, _buyer_lenient(), meta) - assert not r["metrics"].critical_failure - assert r["quality_score"] > 0.7 - - def test_clean_strict_coverage_penalty(self, base_df): - """800-row dataset vs strict 900-row min → coverage_score = 800/900 ≈ 0.89.""" - df, meta = _seller_clean(base_df) - r = self._run(df, _buyer_strict(), meta) - assert not r["metrics"].critical_failure - assert r["component_scores"]["coverage"] == pytest.approx(len(df) / 900, rel=0.01) - assert r["component_scores"]["coverage"] < 1.0 - - def test_null_lenient_passes(self, base_df): - """30% null in one column (~10% overall) < 35% lenient threshold → quality > 0.""" - df, meta = _seller_null_corrupted(base_df) - r = self._run(df, _buyer_lenient(), meta) - assert not r["metrics"].critical_failure - assert r["component_scores"]["null"] > 0 - - def test_null_strict_null_score_zero(self, base_df): - """~10% overall null > 5% strict threshold → null_score = 0.""" - df, meta = _seller_null_corrupted(base_df) - r = self._run(df, _buyer_strict(), meta) - assert r["component_scores"]["null"] == 0.0 - - def test_null_payment_lower_than_clean(self, base_df): - """Null-corrupted → lower quality → lower payment than clean, same policy.""" - df_c, m_c = _seller_clean(base_df) - df_n, m_n = _seller_null_corrupted(base_df) - p = 
_buyer_lenient() - assert self._run(df_n, p, m_n)["proposed_payment"] < self._run(df_c, p, m_c)["proposed_payment"] - - def test_dup_corrupted_critical(self, base_df): - """50% duplicates (entire dataset doubled) → critical_failure, $0 payment.""" - df, meta = _seller_dup_corrupted(base_df) - r = self._run(df, _buyer_lenient(), meta) - assert r["metrics"].critical_failure - assert not r["deal"] - assert r["proposed_payment"] == 0.0 # base_price=0, so floor is $0 - - def test_forbidden_col_critical(self, base_df): - """SSN column → critical_failure, note mentions 'ssn'.""" - df, meta = _seller_forbidden_col(base_df) - r = self._run(df, _buyer_lenient(), meta) - assert r["metrics"].critical_failure - assert "ssn" in " ".join(r["notes"]).lower() - - def test_budget_tight_caps_payment(self, base_df): - """Same clean data, $300 ceiling → proposed_payment ≤ $300.""" - df, meta = _seller_clean(base_df) - r = self._run(df, _buyer_budget_tight(), meta) - assert r["proposed_payment"] <= 300.0 - - def test_reserve_not_met_deal_fails(self, base_df): - """Clean data, seller reserve > proposed payment → deal=False with note.""" - df, meta = _seller_clean(base_df) - # Deterministic quality with schema=0.5 placeholder: 0.925 → payment = $740 - # Reserve $760 > $740 → deal rejected - r = self._run(df, _buyer_lenient(), meta, reserve=760.0) - assert not r["deal"] - assert any("reserve" in n.lower() for n in r["notes"]) - - def test_multi_corrupt_lower_than_clean(self, base_df): - """Multi-corrupt (nulls + missing label col) → payment lower than clean.""" - df_c, m_c = _seller_clean(base_df) - df_x, m_x = _seller_multi_corrupt(base_df) - p = _buyer_lenient() - assert self._run(df_x, p, m_x)["proposed_payment"] < self._run(df_c, p, m_c)["proposed_payment"] - - def test_price_formula_base_zero(self, base_df): - """With base_price=0: P = max_budget * S exactly.""" - df, meta = _seller_clean(base_df) - policy = _buyer_lenient() - r = self._run(df, policy, meta) - S = r["quality_score"] - 
assert r["proposed_payment"] == pytest.approx(policy.max_budget * S, abs=0.01) - - -# --------------------------------------------------------------------------- -# Section 2: Agent layer (live LLM) -# --------------------------------------------------------------------------- - -class TestAgentLive: - - def setup_method(self): - self._ids: list[str] = [] - - def teardown_method(self): - for did in self._ids: - _datasets.pop(did, None) - - def _reg(self, df, meta=None): - did = _register(df, meta) - self._ids.append(did) - return did - - def _metrics(self, df, policy): - return compute_metrics(df, policy) - - @pytest.mark.live - def test_exact_schema_match(self, base_df): - """All required columns present by exact name → schema_score ≥ 0.8.""" - df, meta = _seller_clean(base_df) - did = self._reg(df, meta) - policy = _buyer_lenient() - r = run_agent(did, policy, self._metrics(df, policy), {}) - assert r["schema_score"] >= 0.8 - - @pytest.mark.live - def test_fuzzy_schema_match(self, base_df): - """fraud_label instead of is_fraud → agent semantic match gives schema_score > 0.""" - df, meta = _seller_fuzzy_schema(base_df) - did = self._reg(df, meta) - policy = _buyer_lenient() - r = run_agent(did, policy, self._metrics(df, policy), {}) - assert r["schema_score"] > 0.0 - - @pytest.mark.live - def test_null_claim_disputed(self, base_df): - """Seller claims 'no missing values' but 30% amount is null → claim disputed.""" - df, meta = _seller_null_corrupted(base_df) - did = self._reg(df, meta) - policy = _buyer_lenient() - r = run_agent(did, policy, self._metrics(df, policy), {}) - verification = r.get("claim_verification") or {} - assert any(v == "disputed" for v in verification.values()), ( - f"Expected at least one disputed claim, got: {verification}" - ) - - @pytest.mark.live - def test_missing_col_lower_schema(self, base_df): - """is_fraud missing → schema_score lower than when it's present.""" - df_full, m_full = _seller_clean(base_df) - df_miss, m_miss = 
_seller_missing_col(base_df) - policy = _buyer_lenient() - did_f = self._reg(df_full, m_full) - did_m = self._reg(df_miss, m_miss) - score_full = run_agent(did_f, policy, self._metrics(df_full, policy), {})["schema_score"] - score_miss = run_agent(did_m, policy, self._metrics(df_miss, policy), {})["schema_score"] - assert score_miss < score_full - - @pytest.mark.live - def test_explanation_present(self, base_df): - """Agent always produces a non-empty explanation string.""" - df, meta = _seller_clean(base_df) - did = self._reg(df, meta) - policy = _buyer_lenient() - r = run_agent(did, policy, self._metrics(df, policy), {}) - assert isinstance(r.get("explanation"), str) and len(r["explanation"]) > 10 - - @pytest.mark.live - def test_output_bounds(self, base_df): - """schema_score and claim_veracity_score always in [0, 1].""" - df, meta = _seller_clean(base_df) - did = self._reg(df, meta) - policy = _buyer_lenient() - r = run_agent(did, policy, self._metrics(df, policy), {}) - assert 0.0 <= r["schema_score"] <= 1.0 - assert 0.0 <= r["claim_veracity_score"] <= 1.0 - - @pytest.mark.live - def test_schema_matching_dict_returned(self, base_df): - """Agent returns a non-empty schema_matching dict.""" - df, meta = _seller_clean(base_df) - did = self._reg(df, meta) - policy = _buyer_lenient() - r = run_agent(did, policy, self._metrics(df, policy), {}) - assert isinstance(r.get("schema_matching"), dict) and len(r["schema_matching"]) > 0 - - -# --------------------------------------------------------------------------- -# Section 3: Full pipeline — seller × buyer matrix (live LLM) -# --------------------------------------------------------------------------- - -class TestPipelineLive: - """End-to-end pipeline tests. 
Results appended to matrix_results for demo JSON.""" - - @pytest.mark.live - def test_happy_path(self, base_df, matrix_results): - """Clean data + lenient buyer + reserve=$200 → deal, high quality, full explanation.""" - r = _run_pipeline(base_df, _seller_clean, _buyer_lenient(), reserve=200.0) - assert r.get("deal") is True - assert r.get("explanation") - matrix_results.append({ - "type": "evaluation", - "scenario": "Happy Path", - "narrative": ( - "Seller provides clean fraud-detection data. All required columns present, " - "claims verified. Seller's $200 reserve is comfortably below the $800 offer. " - "Both parties should accept." - ), - "seller": "clean", "buyer": "lenient", "reserve": 200.0, - "quality": r.get("quality_score"), "payment": r.get("proposed_payment"), - "deal": r.get("deal"), "notes": r.get("notes", []), - "explanation": r.get("explanation", ""), - "schema_matching": r.get("schema_matching"), - "claim_verification": r.get("claim_verification"), - }) - - @pytest.mark.live - def test_strict_buyer_coverage_penalty(self, base_df, matrix_results): - """800-row dataset vs strict 900-row requirement → quality drops, lower price.""" - lenient_r = _run_pipeline(base_df, _seller_clean, _buyer_lenient(), reserve=200.0) - strict_r = _run_pipeline(base_df, _seller_clean, _buyer_strict(), reserve=200.0) - assert strict_r.get("proposed_payment", 999) <= lenient_r.get("proposed_payment", 0) + 5 - matrix_results.append({ - "type": "evaluation", - "scenario": "Strict Buyer — Row Coverage Penalty", - "narrative": ( - "Same clean dataset, but buyer demands 900 rows and only 800 are present. " - "Coverage score = 800/900 = 0.89 → quality and price both drop vs lenient buyer." 
- ), - "seller": "clean", "buyer": "strict", "reserve": 200.0, - "quality": strict_r.get("quality_score"), "payment": strict_r.get("proposed_payment"), - "deal": strict_r.get("deal"), "notes": strict_r.get("notes", []), - "explanation": strict_r.get("explanation", ""), - "schema_matching": strict_r.get("schema_matching"), - "claim_verification": strict_r.get("claim_verification"), - }) - - @pytest.mark.live - def test_null_corrupted_claim_disputed(self, base_df, matrix_results): - """30% nulls + false 'no missing values' claim → agent disputes claim, price dips.""" - r = _run_pipeline(base_df, _seller_null_corrupted, _buyer_lenient(), reserve=200.0) - matrix_results.append({ - "type": "evaluation", - "scenario": "Null-Corrupted + False Claim", - "narrative": ( - "Seller corrupts 30% of amount values and claims 'all fields populated'. " - "Strict buyer's 5% null threshold zeroes the null_score. " - "Agent disputes the no-missing-values claim. Price drops." - ), - "seller": "null_corrupted", "buyer": "lenient", "reserve": 200.0, - "quality": r.get("quality_score"), "payment": r.get("proposed_payment"), - "deal": r.get("deal"), "notes": r.get("notes", []), - "explanation": r.get("explanation", ""), - "schema_matching": r.get("schema_matching"), - "claim_verification": r.get("claim_verification"), - }) - - def test_reserve_not_met(self, base_df, matrix_results): - """Clean data, seller reserve=$760 > deterministic offer of $740 → deal rejected. - Reserve logic is deterministic — no LLM needed for this scenario. - """ - df, meta = _seller_clean(base_df) - did = _register(df, meta) - try: - det = run_deterministic(did, _buyer_lenient(), reserve_price=760.0) - finally: - _datasets.pop(did, None) - assert not det["deal"] - assert any("reserve" in n.lower() for n in det["notes"]) - matrix_results.append({ - "type": "evaluation", - "scenario": "Reserve Floor Not Met", - "narrative": ( - "Data quality is good (quality ~0.93, offer ~$740). 
" - "But seller's $760 reserve exceeds the computed offer. " - "The enclave reports: 'reserve not met — consider renegotiation'. " - "Neither party's private number was revealed. " - "This is where the renegotiation section begins." - ), - "seller": "clean", "buyer": "lenient", "reserve": 760.0, - "quality": det["quality_score"], "payment": det["proposed_payment"], - "deal": False, "notes": det["notes"], - "explanation": None, - "schema_matching": None, "claim_verification": None, - }) - - @pytest.mark.live - def test_critical_forbidden_column(self, base_df, matrix_results): - """SSN column → immediate rejection, agent never runs, payment=$0.""" - r = _run_pipeline(base_df, _seller_forbidden_col, _buyer_lenient(), reserve=0.0) - assert r.get("deal") is False - assert r.get("explanation") is None # agent skipped - matrix_results.append({ - "type": "evaluation", - "scenario": "Critical: PII Column (SSN)", - "narrative": ( - "Dataset contains an 'ssn' (Social Security Number) column. " - "The deterministic layer rejects immediately — no LLM is invoked, " - "no data is analyzed. Payment = $0 (base_price=0)." - ), - "seller": "forbidden_col", "buyer": "lenient", "reserve": 0.0, - "quality": 0.0, "payment": r.get("proposed_payment"), - "deal": False, "notes": r.get("notes", []), - "explanation": None, - "schema_matching": None, "claim_verification": None, - }) - - @pytest.mark.live - def test_critical_duplicate_spam(self, base_df, matrix_results): - """50%+ duplicates → critical rejection, agent skipped, payment=$0.""" - r = _run_pipeline(base_df, _seller_dup_corrupted, _buyer_lenient(), reserve=0.0) - assert r.get("deal") is False - assert r.get("explanation") is None - matrix_results.append({ - "type": "evaluation", - "scenario": "Critical: 50%+ Duplicates", - "narrative": ( - "Seller doubled the dataset by copying all rows. " - "Duplicate rate = 50%, which hits the critical threshold. " - "Immediate rejection, no LLM, payment = $0." 
- ), - "seller": "dup_corrupted", "buyer": "lenient", "reserve": 0.0, - "quality": 0.0, "payment": r.get("proposed_payment"), - "deal": False, "notes": r.get("notes", []), - "explanation": None, - "schema_matching": None, "claim_verification": None, - }) - - @pytest.mark.live - def test_budget_ceiling(self, base_df, matrix_results): - """Same clean data + $300 ceiling → proportionally lower price.""" - r = _run_pipeline(base_df, _seller_clean, _buyer_budget_tight(), reserve=50.0) - assert r.get("proposed_payment", 999) <= 300.0 - matrix_results.append({ - "type": "evaluation", - "scenario": "Budget Ceiling ($300 max)", - "narrative": ( - "Same clean dataset, same quality score. But buyer's max_budget=$300 " - "caps the price. Shows the enclave preserves privacy — seller sees only " - "a lower offer, never the buyer's max_budget." - ), - "seller": "clean", "buyer": "budget_tight", "reserve": 50.0, - "quality": r.get("quality_score"), "payment": r.get("proposed_payment"), - "deal": r.get("deal"), "notes": r.get("notes", []), - "explanation": r.get("explanation", ""), - "schema_matching": r.get("schema_matching"), - "claim_verification": r.get("claim_verification"), - }) - - -# --------------------------------------------------------------------------- -# Section 4: Renegotiation scenarios (deterministic, no LLM) -# -# All scenarios start from a fixed base result: -# quality=0.65, proposed_payment=$520, settlement_status="pending_approval" -# -# Renegotiation is pure business logic — no AI involved after the initial evaluation. -# The agent ran once; respond_handler drives all subsequent state changes. 
-# --------------------------------------------------------------------------- - -_RENEG_QUALITY = 0.65 -_RENEG_PAYMENT = 520.0 # = 800 * 0.65 - - -def _reneg_policy(): - return BuyerPolicy( - required_columns=["transaction_id", "amount", "is_fraud"], - min_rows=200, max_null_rate=0.35, max_duplicate_rate=0.20, - min_label_rate=0.01, label_column="is_fraud", - forbidden_columns=["ssn", "dob"], - max_budget=800.0, base_price=0.0, - ) - - -def _base_result(): - """Pending-approval result from a hypothetical TEE evaluation.""" - return { - "submission_id": "demo-sub", - "deal": True, - "quality_score": _RENEG_QUALITY, - "proposed_payment": _RENEG_PAYMENT, - "hard_constraints_pass": True, - "settlement_status": "pending_approval", - "release_token": None, - "notes": [], - "explanation": "Dataset meets buyer requirements with moderate quality.", - "claim_verification": {"balanced_labels": "verified"}, - "schema_matching": { - "transaction_id": "transaction_id", - "amount": "amount", - "is_fraud": "is_fraud", - }, - "buyer_response": None, - "supplier_response": None, - "renegotiation_used": False, - } - - -class TestRenegotiation: - """ - Tests for the 3×3 resolution matrix in procurement_respond_handler. - No LLM needed. Starting point: quality=0.65, proposed_payment=$520. - """ - - def test_both_accept(self, matrix_results): - """Both accept TEE offer → authorized at $520, no changes.""" - policy = _reneg_policy() - r = procurement_respond_handler(_base_result(), "accept", None, "buyer", policy) - assert r["settlement_status"] == "awaiting_counterparty" - r = procurement_respond_handler(r, "accept", None, "supplier", policy) - assert r["settlement_status"] == "authorized" - assert r["proposed_payment"] == _RENEG_PAYMENT - matrix_results.append({ - "type": "renegotiation", - "scenario": "Both Accept", - "narrative": ( - f"TEE computes ${_RENEG_PAYMENT:.0f}. Both parties accept immediately. " - "Authorized at the enclave's offer — fastest path to a deal." 
- ), - "initial_offer": _RENEG_PAYMENT, - "buyer_action": "accept", - "supplier_action":"accept", - "final_payment": r["proposed_payment"], - "deal": True, - }) - - def test_buyer_renegotiates_after_supplier_accepts(self, matrix_results): - """Supplier locks in $520 by accepting first. Buyer tries to renegotiate to $400 — too late.""" - policy = _reneg_policy() - r = procurement_respond_handler(_base_result(), "accept", None, "supplier", policy) - r = procurement_respond_handler(r, "renegotiate", 400.0, "buyer", policy) - # Supplier already committed → deal at original proposed_payment, buyer's revision ignored - assert r["settlement_status"] == "authorized" - assert r["proposed_payment"] == _RENEG_PAYMENT - matrix_results.append({ - "type": "renegotiation", - "scenario": "Acceptor Locks the Price", - "narrative": ( - f"Supplier accepts the ${_RENEG_PAYMENT:.0f} offer. " - "Buyer later tries to renegotiate down to $400 — but supplier already committed. " - "Deal authorizes at $520 (supplier's acceptance stands). " - "First to accept locks the price." - ), - "initial_offer": _RENEG_PAYMENT, - "buyer_action": "renegotiate → $400", - "supplier_action":"accept", - "final_payment": r["proposed_payment"], - "deal": True, - }) - - def test_both_renegotiate_terms_overlap(self, matrix_results): - """Both renegotiate. Buyer offers $480, seller accepts $420 floor → deal at $480.""" - policy = _reneg_policy() - r = procurement_respond_handler(_base_result(), "renegotiate", 480.0, "buyer", policy) - r = procurement_respond_handler(r, "renegotiate", 420.0, "supplier", policy) - assert r["settlement_status"] == "authorized" - assert r["proposed_payment"] == 480.0 # buyer's revised offer (>= seller's 420) - matrix_results.append({ - "type": "renegotiation", - "scenario": "Both Renegotiate — Terms Overlap", - "narrative": ( - f"TEE offers ${_RENEG_PAYMENT:.0f}. Buyer revises offer to $480. " - "Seller revises reserve down to $420. 
" - "$480 ≥ $420 → authorized at buyer's revised offer. " - "No midpoint — buyer's number wins when terms overlap." - ), - "initial_offer": _RENEG_PAYMENT, - "buyer_action": "renegotiate → $480", - "supplier_action":"renegotiate → $420", - "final_payment": 480.0, - "deal": True, - }) - - def test_both_renegotiate_no_overlap(self, matrix_results): - """Both renegotiate with incompatible terms → rejected.""" - policy = _reneg_policy() - r = procurement_respond_handler(_base_result(), "renegotiate", 350.0, "buyer", policy) - r = procurement_respond_handler(r, "renegotiate", 450.0, "supplier", policy) - assert r["settlement_status"] == "rejected" - matrix_results.append({ - "type": "renegotiation", - "scenario": "Both Renegotiate — No Overlap", - "narrative": ( - f"TEE offers ${_RENEG_PAYMENT:.0f}. Buyer revises to $350. " - "Seller revises reserve up to $450. " - "$350 < $450 → deal rejected. Neither party revealed their original private number." - ), - "initial_offer": _RENEG_PAYMENT, - "buyer_action": "renegotiate → $350", - "supplier_action":"renegotiate → $450", - "final_payment": None, - "deal": False, - }) - - def test_either_party_rejects(self, matrix_results): - """One party rejects → immediate deal-off regardless of other's action.""" - policy = _reneg_policy() - r = procurement_respond_handler(_base_result(), "accept", None, "buyer", policy) - r = procurement_respond_handler(r, "reject", None, "supplier", policy) - assert r["settlement_status"] == "rejected" - matrix_results.append({ - "type": "renegotiation", - "scenario": "Hard Reject", - "narrative": ( - f"Buyer accepts the ${_RENEG_PAYMENT:.0f} TEE offer. " - "Supplier rejects outright. One rejection ends the deal — no further rounds." 
- ), - "initial_offer": _RENEG_PAYMENT, - "buyer_action": "accept", - "supplier_action":"reject", - "final_payment": None, - "deal": False, - }) From 513575c3a30349f22069e7f3dc884d2cfc300e25 Mon Sep 17 00:00:00 2001 From: Parth Thapliyal Date: Sun, 22 Mar 2026 19:59:51 -0400 Subject: [PATCH 13/13] =?UTF-8?q?fix:=20mark=20run=5Fskill=20tests=20that?= =?UTF-8?q?=20call=20real=20LLM=20as=20live=20=E2=80=94=20skip=20in=20CI?= =?UTF-8?q?=20without=20API=20key?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_data_procurement.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_data_procurement.py b/tests/test_data_procurement.py index 3fdf7f4..8ec7762 100644 --- a/tests/test_data_procurement.py +++ b/tests/test_data_procurement.py @@ -628,6 +628,7 @@ def test_conversation_accumulates(self): class TestRunSkill: + @pytest.mark.live def test_good_dataset_returns_deal(self): df = _make_df(rows=200) policy = _make_policy(min_rows=100, max_budget=5000.0, base_price=500.0) @@ -670,6 +671,7 @@ def test_critical_failure_returns_rejected(self): finally: cleanup(dataset_id) + @pytest.mark.live def test_reserve_above_payment_no_deal(self): df = _make_df(rows=150) policy = _make_policy(min_rows=100, max_budget=1000.0, base_price=0.0) @@ -688,6 +690,7 @@ def test_reserve_above_payment_no_deal(self): finally: cleanup(dataset_id) + @pytest.mark.live def test_internal_fields_stripped_by_guardrails(self): """revised_budget and revised_reserve should not appear in output.""" df = _make_df(rows=200)