From 929cce98f4de64f87173fb02c222c642bb0fec28 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 2 Jun 2026 07:13:13 +0000 Subject: [PATCH 1/5] Initial plan From 8209aecf159e2c6359ed534520238706e1908318 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 2 Jun 2026 07:18:31 +0000 Subject: [PATCH 2/5] Initial exploration - no changes yet --- data/syllogism_clf.pkl | Bin 0 -> 10114 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 data/syllogism_clf.pkl diff --git a/data/syllogism_clf.pkl b/data/syllogism_clf.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8bc5bc03888252cda2009ef56d30ef7c7ecd19f4 GIT binary patch literal 10114 zcmdU#dypJO9mnr(-;dk7O9;w)AQ6&44laZw5O5&}BxK=)@KQNgXLn|ICYhbt%*=9k z0feB091H}KK|uill}8kiBqW3~ECetSKc+j^#qSt7ro;Pjj<8m1vS{l4HCePIqTeO+}Tb-)tfC)q^gE)%R2m)mi4M3se0XY5-qznZ?Ws- zB^mw%<(j5px=zxvO|=TrZt_dBELN+K({hKLeAAShrcsqG3xq>X#x_>Vx)*WCNsGFu zjgjb1e8ey%*U4zch-_9xOLjLn-JZ7G6m45Jbtrsxu{CG7GCVRoa_n%$b+VRiG|MET z1$JeZ*&sXBB`*{(r>Nl1uJ>2P|z4!Hk&mzj#Q4?pv>H$ z%o=wJdSJf{wQ@_loH!IwcAWz~SN-LxVaolDmS(G>X^LPoUpAq)bm*EE^wDNlIm}7v zk`Fh#rH&ii!On_%qr1|b4GpmC1|pzUg^G367Q5pcd>V5)Pj!cehE{wEKi)s34%@Aa zFWr~u=r~Du;;nmL%bxy4r^n4Z=+cvKPwlr4rjMrkTMw3RfAwFK?*GN-f7?F$TTiB& zDxJy)(=#4@W`F;@(*Je&pO?@7ww!1^*)vP0D*r!mmM^M{J~Q1Hoibl6{k;4{%Zom< zI4;cSPH&lRA<9?ErUm;3SoH?_7x$mwwkl3vO&yh`a=1#CVJ$0^!{r#3JNKq!Gkx2~ z_g?Gdcpcecq#9zQA!?SZ90xmxstD^WS(O$!T`ITKC`%?TG=u5Y{*vC|u7~9cb`2sd zy2>(T#bEjuh@>i-0UH#F!WjsrB|~c}styjktfwIgTLjOZz!h#_1t9{C@B~uyQYb_s z9f(zo3PFo_e!hxT%nwsj*E&*lGs@d8B1ys&!#dxDP_k|sEuFR2<%q@WGHm7imbw6u z*obUXupNOEi9|Mxey$A&<~Gm87*vt(`(nBFM1aE2tiJ&(;D1_#Qkst{`2n>!PmL6Cn@GaGaD1042 zBT|dB65mCNg6}Y5u!S_xRldQ%!)1sk!9(c16S0(Kw z9u?WtvkZ}3RW@z4rdFYwX|YEkK|<8jDnl`Z{D|xwSc_PJS{0D12azV{Gl4UA4H9Gs z;Rn_^2={seYhndfB2^zJ5ctZ>M6$exe7mel@SWiO`17}KSK{qng#=m8E+1k!=6(on 
zqKG4&Y{<5^(s=W#i5U67mB<&7-}Of$n3Fa5HinN{)gGgQK_p04jb<|##&Z!%cu{G# zCPHZ%l)Y555iR(31?yiz;{;G5=k`ogVBTi}Z~8c_@DMIz4N~;b zG9tA*7KsjEbQ0@xkTPMlbe*;PBM8MRMuoQQmk@~8jbNx=f=J4Q(!$lqMKDn}LiBZn zGM)<|47dg1Gys7M`ej7(ga|nDrAU#ZkPC1J5@h@Up}m6;?)IEVLVN>DSO~s7zlS98 z8aXuahrkL%QWY|-Dr}-U7{Nqc)yNLc8&?-0l%hKk3Qs{WE``0c0FgW~BH)-Mr0Awl z;A0B~b-*9d>e3RN-wP1+MW&4(Lepru!h|Azgmg1PK2drLlJ;^^KJcwb z)$JhxSL>dL#ILi!mvSGHS-j4CbDVoS)O2=#GJ1>V%pku2vS*tLh7!lg*i?I8gh+=e9-qEgPET*BAo zYeNC2aBl$Jn4+=jVd0GB_7 zc#7l?{cS=pQBldkw?6~-VKf43217d$ij!P4Q!gSpuRMXzdH@n+Jj9uO3gI*{%gw2q z5zP{U3SWr0mz(o;JEAdIzo_Yl5Qtkg93J>>a6cjmLRcI87@-tgNWjLAEo|!%OlWFd zVQ++jQPpS!)4(i*(qK^FygZC3TvUjrhnb-V5Qke6@Cx5ayse*L355c0??xoac?bzs zO(x)Lp&~)XLqgctUqKkw?%=Ijgb1vubt`g+aWYo0Lkw3SFyP>t`zBI=82Y0dLU6kla1Z0OYxNlFH}Jfyi! zKrAD{Ue18_VgqR&!tqt2&PJbv2<#n6p{$wbBbXyzV2a_w){&ssXF@dk08$kKEO0)K zN17ZVIFyV8ePJQka(_hT*JpyI6Ok-d)7aO}xd^12aIM)2Pwl>qXvUwi!_3nV&I1r4 z1)PTz1x5w0B-)qPVvQe!DXC^U(&QNxLNC`L1*}#TjrDmqLUFj4WaLpyEOCsz3U?ur z^@ANCF3UTH5ANwx(B`5BrxQ+Af-80zW<5CW zb~*>Si|@2H05{uk?e8w520EO$)Ph-V+=3YgY%s3nrg#^GE9P1xoN}A zbzksMCTH~IE0o-yS2Z{jE04(DDhc=daA7SO4LJKJr(LDFaG-B!aAg431zM#B%Y~CB zH|{E#c1x0z0#q^JcE8l&WXUNy$!OZ-DqJtGGAi(229&vQrJeS|$X3}^`jr#N6}2*- zT~K$y!;0}iwtMO5WNl0CEmFD{YqETukdNFkEC{c&C%a=9AB>YjD(@B~Q8i?^TA%0v zuF2QC$|?TE_a}~6$Ug9V(NDJpY3ZW}ou`-YDf;QwAYFabX4{?p%l1@hb5qY%7o7M~ z`@hM{wx6gm& zrsd-o?g~=IXKnlAlHJyeML&JA*m`-*6|wWiXsQP3+8O_t^X|;&i!5EZr1$-E-fb8C z^zkC|{n!iFj(zy?8$mrDoj*`G`t_&V(Rwe+zw_6Rw(o4SRQ>aNyN&wJ;%BFOC-A~C zJ9aPVc&)wtg;(EvWy|(R`T~7sX~=i?HmCKycT3x^&$c3K58v-gZRVS$tlmt|(rEj! 
zc8scVLfnvJ~yGA*n1L<@Av1ggZ`W<-t*f|o4fS*SLu7Ur+DjKf4O`{;n|=c zS-F0{hy5LWp6Y#$Ex!H_uS{RpX64S=_US%h_?Zds>1A&ne%a{a-xX&bGG|t)^qb=3 z{T23)|6c43-VgSkeDM3be>L{b0~7j(wL42c`#stjjK}Es#LwaQW&JQ!8to4@-f+LP zY22_}7LV1BrGC4v3EGuCv-IGYbo=yY_7(l~sUW?$eP80Y54|0XKlWa;@yVXqIDd1~ z8@osT^h_|$qwk|Xuk4A;FI(G_$Dinj)n}?S+Md2H?04q#pPd6cF5bAS9c|Ca=fACO zJT70P^Aww}+C}!>o^j^CHm-Z***0rezkT-7`EzgEoAR^avNzeh4{OLN Date: Tue, 2 Jun 2026 07:24:51 +0000 Subject: [PATCH 3/5] Add tested examples, example tests, improvement plan, and updated docs - Add 6 new example files covering all Python use cases: guard_verification.py, chain_of_thought.py, arithmetic_solver.py, syllogism_verification.py, mcq_picker.py, arithmetic_repair.py - Add tests/test_examples.py with 36 tests validating all examples - Rewrite examples/README.md with per-use-case documentation - Create docs/IMPROVEMENT-PLAN.md with findings and roadmap - Update README.md with accurate API references, install docs, and links to new examples and improvement plan --- README.md | 50 +++- docs/IMPROVEMENT-PLAN.md | 102 +++++++ examples/README.md | 447 ++++++++++++++++++++--------- examples/arithmetic_repair.py | 115 ++++++++ examples/arithmetic_solver.py | 106 +++++++ examples/chain_of_thought.py | 125 ++++++++ examples/guard_verification.py | 137 +++++++++ examples/mcq_picker.py | 103 +++++++ examples/syllogism_verification.py | 127 ++++++++ tests/test_examples.py | 438 ++++++++++++++++++++++++++++ 10 files changed, 1612 insertions(+), 138 deletions(-) create mode 100644 docs/IMPROVEMENT-PLAN.md create mode 100644 examples/arithmetic_repair.py create mode 100644 examples/arithmetic_solver.py create mode 100644 examples/chain_of_thought.py create mode 100644 examples/guard_verification.py create mode 100644 examples/mcq_picker.py create mode 100644 examples/syllogism_verification.py create mode 100644 tests/test_examples.py diff --git a/README.md b/README.md index c462295..2018162 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,17 @@ The 
typical workflow: ### Installation ```bash -pip install -e . +pip install -e . # core package +``` + +**Optional extras** (unlock additional features): + +```bash +pip install -e ".[nlp]" # spaCy + word2number (syllogism, arithmetic solver) +python -m spacy download en_core_web_sm +pip install -e ".[logic]" # Z3 solver (formal entailment checking) +pip install -e ".[semantic]" # sentence-transformers (semantic similarity) +pip install -e ".[rest]" # requests (REST API client) ``` ### 5-Minute Quickstart @@ -115,13 +125,23 @@ else: ### Real-World Examples -See [`examples/`](./examples/) for production-ready code: +See [`examples/`](./examples/) for production-ready, tested code: + +| Example | File | What it covers | +|---------|------|----------------| +| **Guard Verification** | [`guard_verification.py`](./examples/guard_verification.py) | ECS scoring, thresholds, repair, degradation tracking | +| **Chain-of-Thought** | [`chain_of_thought.py`](./examples/chain_of_thought.py) | Multi-step reasoning verification | +| **Arithmetic Solver** | [`arithmetic_solver.py`](./examples/arithmetic_solver.py) | Word problem solving end-to-end | +| **Syllogism Checker** | [`syllogism_verification.py`](./examples/syllogism_verification.py) | Formal logic verification (Z3 + heuristics) | +| **MCQ Picker** | [`mcq_picker.py`](./examples/mcq_picker.py) | Multiple-choice answer selection | +| **Arithmetic Repair** | [`arithmetic_repair.py`](./examples/arithmetic_repair.py) | Deterministic error correction | +| **Simple Verification** | [`simple_verification.py`](./examples/simple_verification.py) | Quick-start 3-claim demo | +| **LangChain Integration** | [`langchain_integration.py`](./examples/langchain_integration.py) | LangChain pipeline wrapper | +| **API Server** | [`api_server.py`](./examples/api_server.py) | FastAPI microservice | -- **[`simple_verification.py`](./examples/simple_verification.py)** - Basic usage (5 min) -- 
**[`langchain_integration.py`](./examples/langchain_integration.py)** - LangChain integration (10 min) -- **[`api_server.py`](./examples/api_server.py)** - Production FastAPI server (15 min) +All examples have tests in [`tests/test_examples.py`](./tests/test_examples.py). -Run the simple example: +Run any example: ```bash python examples/simple_verification.py ``` @@ -163,14 +183,24 @@ Your agent (Claude Desktop, Cursor, GitHub Copilot) can then call PureReason ver ### 3. Python API (Advanced) ```python -from pureason.reasoning import verify_chain +from pureason.reasoning import verify_chain, solve_arithmetic, verify_syllogism # Verify a chain of reasoning steps problem = "What is 2 + 2?" steps = ["Let me add the numbers.", "2 + 2 = 4", "Therefore, the answer is 4."] - result = verify_chain(problem, steps) -print(f"Confidence: {result.ecs}/100") +print(f"Valid: {result.is_valid}, Confidence: {result.chain_confidence:.2f}") + +# Solve an arithmetic word problem +report = solve_arithmetic("Maria has 15 apples. She buys 8 more. 
How many in total?") +print(f"Answer: {report.answer}") + +# Verify a syllogism +report = verify_syllogism( + premises=["All mammals are warm-blooded.", "Whales are mammals."], + conclusion="Whales are warm-blooded.", +) +print(f"Valid: {report.is_valid}") ``` ## Core Features @@ -235,6 +265,8 @@ cargo build --release | Topic | Link | |-------|------| +| **Examples** | [`examples/README.md`](./examples/README.md) - Tested use cases with code | +| **Improvement Plan** | [`docs/IMPROVEMENT-PLAN.md`](./docs/IMPROVEMENT-PLAN.md) - Roadmap for next improvements | | **Benchmarks** | [`docs/BENCHMARK.md`](./docs/BENCHMARK.md) - Full results and methodology | | **Reproducibility** | [`docs/REPRODUCIBILITY.md`](./docs/REPRODUCIBILITY.md) - Seeds, hashes, holdout | | **MCP Integration** | [`docs/MCP-INTEGRATION.md`](./docs/MCP-INTEGRATION.md) - Agent setup guide | diff --git a/docs/IMPROVEMENT-PLAN.md b/docs/IMPROVEMENT-PLAN.md new file mode 100644 index 0000000..b23061c --- /dev/null +++ b/docs/IMPROVEMENT-PLAN.md @@ -0,0 +1,102 @@ +# PureReason Improvement Plan + +> Generated from hands-on exploration and testing of v0.3.1. + +## Findings Summary + +### What Works Well +- **Arithmetic repair** — deterministic `A op B = C` repair is reliable and fast. +- **Chain-of-thought verification** — `verify_chain` correctly flags arithmetic errors and accumulates context. +- **MCQ picker** — tie detection with `AmbiguousAnswerError` is well-designed. +- **Guard API** — `ReasoningGuard` provides a clean, simple entry point. +- **Degradation tracking** — `_ReputationTracker` is a practical production feature. 
+ +### Issues Found + +| # | Issue | Severity | Status | +|---|-------|----------|--------| +| 1 | Word-number extraction fails without `word2number` installed — tests expect it but the package is optional | Medium | Documented | +| 2 | Examples were generic and untested — no way for consumers to validate setup | High | **Fixed** | +| 3 | `verify_chain` falls back to ECS=50 when Rust binary is unavailable — no clear indication to the user | Medium | Documented | +| 4 | `_ecs_score` in `guard.py` silently returns 75.0 on any exception — masks real failures | Medium | Documented | +| 5 | Examples README referenced `verify_chain(llm_output)` with wrong signature (missing `steps` parameter) | High | **Fixed** | +| 6 | No test coverage for example use cases | High | **Fixed** | +| 7 | `solve_arithmetic` relies on spaCy NLP model but error message is unclear | Low | Documented | + +--- + +## Improvement Plan + +### Phase 1: Examples & Documentation (completed) + +- [x] Create 6 focused, tested example files covering every Python use case +- [x] Add `tests/test_examples.py` with 36 tests validating all examples +- [x] Rewrite `examples/README.md` with per-use-case documentation +- [x] Update `README.md` with accurate code samples and API references +- [x] Document expected inputs, outputs, and edge cases + +### Phase 2: Robustness (recommended next) + +- [ ] **Graceful fallback messaging** — When the Rust binary is unavailable, + `_ecs_score` should log a clear warning (not silently return 75.0). + Suggested: use `warnings.warn()` on first fallback. + +- [ ] **Optional dependency handling** — `_extract_numbers` silently skips + word-form numbers when `word2number` is not installed. Add a one-time + warning so users know they're missing functionality. 
+ +- [ ] **Consolidate install instructions** — The `pyproject.toml` optional groups + (`[nlp]`, `[logic]`, `[semantic]`, `[rest]`) should be documented in a + single "Installation" section in the README so users know what each extra + provides. + +### Phase 3: Test Coverage + +- [ ] **Integration tests with Rust binary** — Add a CI job that builds the + Rust binary and runs tests without mocking `_core._run`. + +- [ ] **Benchmark regression tests** — Add a small smoke-test subset of + the HaluEval/TruthfulQA benchmarks that runs in CI to catch ECS score + regressions. + +- [ ] **Property-based testing** — Use `hypothesis` for arithmetic repair + to verify `_repair_arithmetic_in_step` handles edge cases like very large + numbers, unicode operators, and chained expressions. + +### Phase 4: API Ergonomics + +- [ ] **Typed return objects everywhere** — `pick_best_answer` returns + `tuple[int, EpistemicChainReport]` which is not self-documenting. + Consider a `MCQResult` dataclass. + +- [ ] **Batch verification API** — `ReasoningGuard.verify_batch(texts)` to + verify multiple texts in a single call (parallel processing). + +- [ ] **Structured error types** — Replace generic `Exception` catches with + specific error types (`BinaryNotFoundError`, `ParseError`, etc.). + +### Phase 5: Performance + +- [ ] **Lazy NLP model loading** — spaCy model is loaded on first call to + `_detect_operation`. Add explicit `init()` method for applications that + want to control startup latency. + +- [ ] **Caching** — `_ecs_for_text` could cache results for repeated texts + (LRU cache with configurable size). 
+ +--- + +## Priority Matrix + +| Priority | Effort | Items | +|----------|--------|-------| +| **High** | Low | Phase 1 (done), Phase 2 fallback warnings | +| **High** | Medium | Phase 3 integration tests | +| **Medium** | Low | Phase 4 typed returns | +| **Medium** | High | Phase 5 performance | + +## Recommendation + +Start with **Phase 2** (robustness) — it's low-effort and directly improves the +developer experience for new consumers. Then move to **Phase 3** (test coverage) +to prevent regressions as the project grows. diff --git a/examples/README.md b/examples/README.md index 8978a5b..fe13790 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,143 +1,334 @@ -# Quick Start Examples +# PureReason Examples -This directory contains practical examples for using PureReason in real-world applications. +Practical, tested examples showing every major PureReason feature with +expected inputs, outputs, and integration patterns. -## 🚀 Quick Start +> **Prerequisite** — install the Python package first: +> ```bash +> pip install -e . # core (always required) +> pip install -e ".[nlp]" # + spaCy & word2number +> python -m spacy download en_core_web_sm +> ``` -### 1. 
Simple Verification (5 minutes) +--- -Verify claims with PureReason: +## Use Cases at a Glance -```bash -python examples/simple_verification.py -``` +| # | Example | File | What it covers | +|---|---------|------|----------------| +| 1 | **Guard Verification** | [`guard_verification.py`](guard_verification.py) | `ReasoningGuard` — ECS scoring, threshold tuning, arithmetic repair, degradation tracking | +| 2 | **Chain-of-Thought** | [`chain_of_thought.py`](chain_of_thought.py) | `verify_chain` — multi-step reasoning verification, contradiction detection | +| 3 | **Arithmetic Solver** | [`arithmetic_solver.py`](arithmetic_solver.py) | `solve_arithmetic` — word problem solving, number extraction, operation detection | +| 4 | **Syllogism Verification** | [`syllogism_verification.py`](syllogism_verification.py) | `verify_syllogism` — formal logic checking, fallacy detection (Z3 + heuristics) | +| 5 | **MCQ Picker** | [`mcq_picker.py`](mcq_picker.py) | `pick_best_answer` — multiple-choice selection, tie detection, strict mode | +| 6 | **Arithmetic Repair** | [`arithmetic_repair.py`](arithmetic_repair.py) | `_repair_arithmetic_in_step` — deterministic error correction, answer extraction, majority vote | +| 7 | **Simple Verification** | [`simple_verification.py`](simple_verification.py) | Quick-start 3-claim verification demo | +| 8 | **LangChain Integration** | [`langchain_integration.py`](langchain_integration.py) | `PureReasonVerifier` wrapper for LangChain pipelines | +| 9 | **API Server** | [`api_server.py`](api_server.py) | FastAPI microservice with `/verify` and `/verify/batch` | -**What you'll see:** -- ✅ Factual claims get high ECS scores (80-90) -- ⚠️ Overconfident claims get flagged (30-60) -- ❌ Contradictions get rejected (<30) +--- -### 2. LangChain Integration (10 minutes) +## 1. Guard Verification — `ReasoningGuard` -Use PureReason as a verification layer in LangChain: +The primary entry point. 
Verifies any text and returns an ECS score, provenance label, and optional arithmetic repair. ```bash -pip install langchain langchain-openai -python examples/langchain_integration.py +python examples/guard_verification.py ``` -**Key pattern:** +### Key Concepts + +- **ECS (Epistemic Confidence Score)**: 0–100 score indicating how defensible a claim is. +- **Provenance**: One of `"verified"`, `"repaired"`, or `"flagged"`. +- **Threshold**: ECS below this → text is flagged (or repaired if arithmetic errors exist). + +### Code + ```python -from pureason.reasoning.chain import verify_chain +from pureason.guard import ReasoningGuard + +guard = ReasoningGuard(threshold=60, repair=True) +result = guard.verify("Water boils at 100°C at sea level.") + +print(result.ecs) # e.g. 75.0 +print(result.provenance) # "verified" +print(result.repaired) # False +print(result.text) # original text (unchanged) +``` -# Verify LLM output -result = verify_chain(llm_output) +### Decision Logic + +```python if result.ecs >= 70: - # Use the output + action = "ACCEPT" +elif result.ecs >= 40: + action = "REVIEW" else: - # Reject or retry + action = "REJECT" +``` + +### Arithmetic Repair + +```python +result = guard.verify("3 + 4 = 8 so the total is wrong.") +# result.repaired == True +# result.text contains "= 7 [repaired]" +# result.original == "3 + 4 = 8 so the total is wrong." ``` -### 3. Production API Server (15 minutes) +### Degradation Tracking + +```python +from pureason.guard import ReasoningGuard, _ReputationTracker -Deploy PureReason as a microservice: +tracker = _ReputationTracker(window=5, baseline_window=20, drop=10.0) +guard = ReasoningGuard(threshold=60, source_label="my_model", tracker=tracker) + +# After many verify() calls, if recent ECS drops >10 points below baseline, +# a ReasoningDegradationWarning is emitted. +``` + +--- + +## 2. 
Chain-of-Thought Verification — `verify_chain` + +Verifies multi-step reasoning chains for internal consistency (each step alone) +and contextual consistency (each step against accumulated context). ```bash -# Install dependencies -pip install fastapi uvicorn +python examples/chain_of_thought.py +``` -# Start server -python examples/api_server.py +### Code -# Test it -curl -X POST http://localhost:8000/verify \ - -H "Content-Type: application/json" \ - -d '{"text": "The sky is blue.", "min_ecs": 70}' +```python +from pureason.reasoning import verify_chain + +report = verify_chain( + problem="A store has 50 apples. A customer buys 12. How many remain?", + steps=[ + "The store starts with 50 apples.", + "A customer buys 12 apples.", + "Remaining = 50 - 12 = 38.", + "Therefore, the answer is 38.", + ], +) + +print(report.is_valid) # True — all steps pass +print(report.chain_confidence) # harmonic mean of step ECS / 100 +print(report.invalid_steps) # [] — no failures +print(report.answer) # last step text +print(report.summary) # human-readable summary ``` -**Response:** -```json -{ - "text": "The sky is blue.", - "ecs": 85, - "risk": "LOW", - "passed": true, - "issues": [], - "latency_ms": 4.2 -} +### Detecting Arithmetic Errors in a Chain + +```python +report = verify_chain("What is 15 + 27?", ["15 + 27 = 43."]) +# Step 0 will have "ARITHMETIC_ERROR" in its flags +# because 15 + 27 = 42, not 43 ``` -## 🐳 Docker Deployment +### Edge Cases + +```python +# Empty chain +report = verify_chain("Any?", []) +# report.is_valid == False, report.chain_confidence == 0.0 + +# Single step +report = verify_chain("What is 2 + 2?", ["2 + 2 = 4."]) +# report.steps has 1 entry, report.answer == "2 + 2 = 4." +``` + +--- -Build and run with Docker: +## 3. Arithmetic Solver — `solve_arithmetic` + +Solves arithmetic word problems by extracting numbers, detecting the operation, computing the result, and verifying via a reasoning chain. 
```bash -# Build image -docker build -f examples/Dockerfile.api -t pureason-api . +python examples/arithmetic_solver.py +``` -# Run container -docker run -p 8000:8000 pureason-api +### Building Blocks -# Test health -curl http://localhost:8000/health +```python +from pureason.reasoning.arithmetic import _safe_eval, _extract_numbers, _detect_operation + +# Safe eval — no exec/eval, only arithmetic AST nodes +_safe_eval("(3 + 4) * 2") # → 14.0 +_safe_eval("import os") # → None (rejected) +_safe_eval("5 / 0") # → None (division by zero) + +# Number extraction — digits, decimals, negatives, commas +_extract_numbers("There are 3 apples and 1,000 bananas.") # → [3.0, 1000.0] + +# Operation detection — NLP-based (spaCy + classifier) +_detect_operation("How many total after adding 5 more?") # → "+" +_detect_operation("How many are left after removing 5?") # → "-" ``` -## 📊 API Endpoints +### Full Solver -| Endpoint | Method | Purpose | -|----------|--------|---------| -| `/health` | GET | Health check | -| `/metrics` | GET | Performance metrics | -| `/verify` | POST | Verify single claim | -| `/verify/batch` | POST | Verify up to 100 claims | -| `/docs` | GET | Interactive API docs | +```python +from pureason.reasoning import solve_arithmetic + +report = solve_arithmetic( + "Maria has 15 apples. She buys 8 more. How many apples in total?" +) +print(report.answer) # "Therefore, the answer is 23." +print(report.is_valid) # True +``` -## 🎯 Integration Patterns +--- + +## 4. Syllogism Verification — `verify_syllogism` + +Verifies logical arguments using a cascade of strategies: +1. TF-IDF + LogReg classifier (fast) +2. Z3 formal entailment (symbolic logic) +3. Informal fallacy heuristics +4. 
KAC semantic consistency (fallback) + +```bash +python examples/syllogism_verification.py +``` + +### Code -### Pattern 1: Guard Rails ```python -def safe_llm_call(prompt): - output = llm.generate(prompt) - verification = verify_chain(output) - - if verification.ecs < 70: - # Reject low-confidence output - raise ValueError("Output failed verification") - - return output +from pureason.reasoning import verify_syllogism + +# Valid syllogism +report = verify_syllogism( + premises=["All mammals are warm-blooded.", "Whales are mammals."], + conclusion="Whales are warm-blooded.", +) +print(report.is_valid) # True + +# Invalid syllogism (undistributed middle) +report = verify_syllogism( + premises=["All dogs are animals.", "All cats are animals."], + conclusion="All dogs are cats.", +) +print(report.is_valid) # False +``` + +### Report Structure + +```python +report.is_valid # bool — conclusion follows from premises +report.chain_confidence # 0.88 (valid) or 0.25 (invalid) +report.summary # human-readable explanation +report.steps # StepVerification for each premise + conclusion +``` + +--- + +## 5. MCQ Picker — `pick_best_answer` + +Selects the best answer from multiple choices by verifying each against the question context. 
+ +```bash +python examples/mcq_picker.py ``` -### Pattern 2: Confidence Scoring +### Code + ```python -def scored_generation(prompt): - output = llm.generate(prompt) - verification = verify_chain(output) - - return { - "text": output, - "confidence": verification.ecs / 100, - "safe_to_use": verification.ecs >= 70 - } +from pureason.reasoning import pick_best_answer + +best_idx, report = pick_best_answer( + question="What is the capital of France?", + choices=["Berlin", "Paris", "Madrid", "Rome"], +) +print(f"Best: {best_idx}") # index of highest-ECS choice + +# With context +best_idx, report = pick_best_answer( + question="Which animal is fastest?", + choices=["Cheetah (70 mph)", "Lion (50 mph)", "Elephant (25 mph)"], + context="African wildlife guide.", +) + +# Strict mode — raises AmbiguousAnswerError on ties +from pureason.reasoning.mcq import AmbiguousAnswerError +try: + pick_best_answer("Pick one.", ["Red", "Blue"], strict=True) +except AmbiguousAnswerError as e: + print(f"Tied: {e.tied_indices}") +``` + +--- + +## 6. Arithmetic Repair — `_repair_arithmetic_in_step` + +Deterministic repair of arithmetic errors in text. Finds `A op B = C` patterns and corrects wrong results. + +```bash +python examples/arithmetic_repair.py ``` -### Pattern 3: Auto-Correction +### Code + ```python -def self_correcting_llm(prompt): - output = llm.generate(prompt) - verification = verify_chain(output) - - if verification.ecs < 70 and verification.rewrites: - # Use PureReason's rewrite - return verification.rewrites[0] - - return output +from pureason.reasoning.repair import _repair_arithmetic_in_step + +# Correct — no change +_repair_arithmetic_in_step("3 + 4 = 7 apples.") +# → "3 + 4 = 7 apples." + +# Wrong — repaired +_repair_arithmetic_in_step("3 + 4 = 8 apples.") +# → "3 + 4 = 7 [repaired] apples." 
+ +# Extraction utilities +from pureason.reasoning.repair import _extract_numeric_answer, _extract_letter_answer +_extract_numeric_answer("The answer is 42.") # → 42.0 +_extract_letter_answer("The answer is **B**.") # → "B" + +# Majority vote for aggregating multiple answers +from pureason.reasoning.repair import _majority_vote, _majority_vote_letters +_majority_vote([42.0, 42.0, 41.0]) # → 42.0 +_majority_vote_letters(["A", "B", "A"]) # → "A" ``` -## 🔧 Configuration +--- -### ECS Thresholds +## 7–9. Quick Start, LangChain, API Server + +These examples are documented inline: + +- **[`simple_verification.py`](simple_verification.py)** — 3-claim quickstart +- **[`langchain_integration.py`](langchain_integration.py)** — `PureReasonVerifier` wrapper +- **[`api_server.py`](api_server.py)** — FastAPI server (`pip install fastapi uvicorn`) -Choose based on your risk tolerance: +--- + +## 🐳 Docker Deployment + +```bash +docker build -f examples/Dockerfile.api -t pureason-api . +docker run -p 8000:8000 pureason-api +curl http://localhost:8000/health +``` + +## API Endpoints + +| Endpoint | Method | Purpose | +|----------|--------|---------| +| `/health` | GET | Health check | +| `/metrics` | GET | Performance metrics | +| `/verify` | POST | Verify single claim | +| `/verify/batch` | POST | Verify up to 100 claims | +| `/docs` | GET | Interactive API docs | + +--- + +## Configuration + +### ECS Thresholds | Risk Level | Domain | Min ECS | |------------|--------|---------| @@ -146,56 +337,54 @@ Choose based on your risk tolerance: | **Medium** | General knowledge | 65+ | | **Low** | Creative, Opinion | 50+ | -### Performance Tuning +### Integration Patterns + +**Guard Rails** — reject low-confidence output: +```python +guard = ReasoningGuard(threshold=70) +result = guard.verify(llm_output) +if result.provenance == "flagged": + raise ValueError("Output failed verification") +``` + +**Confidence Scoring** — attach scores to outputs: +```python +result = guard.verify(llm_output) 
+return {"text": llm_output, "confidence": result.ecs / 100} +``` -- **Latency**: <5ms per verification (typical) -- **Throughput**: 200+ verifications/second -- **Batch size**: Up to 100 claims per request +**Auto-Correction** — repair arithmetic mistakes: +```python +result = guard.verify(llm_output) +if result.repaired: + return result.text # corrected version +return llm_output +``` -## 📚 More Examples +--- -Coming soon: -- Jupyter notebook tutorials -- TypeScript/JavaScript integration -- Streaming verification -- Custom domain calibration +## Tests -## 💡 Tips +All examples have corresponding tests in [`tests/test_examples.py`](../tests/test_examples.py). -1. **Always verify** LLM outputs for critical applications -2. **Monitor ECS scores** over time to track model drift -3. **Use batch endpoints** for high throughput -4. **Set appropriate thresholds** based on domain risk -5. **Enable rewrites** for automatic correction +```bash +python -m pytest tests/test_examples.py -v +``` -## 🆘 Troubleshooting +## Troubleshooting **"ModuleNotFoundError: No module named 'pureason'"** ```bash pip install -e . ``` +**"PureReason reasoning requires spaCy"** +```bash +pip install -e ".[nlp]" +python -m spacy download en_core_web_sm +``` + **"API server won't start"** ```bash pip install fastapi uvicorn pydantic ``` - -**"Verification takes too long"** -- Use batch endpoint for multiple claims -- Consider caching for repeated verification -- Check system resources (CPU, memory) - -## 📖 Documentation - -- [Full Documentation](../docs/README.md) -- [Benchmarks](../docs/BENCHMARK.md) -- [Architecture](../docs/CAPABILITIES.md) -- [MCP Integration](../docs/MCP-QUICK-REFERENCE.md) - -## 🤝 Contributing - -See [CONTRIBUTING.md](../docs/CONTRIBUTING.md) for guidelines. - -## 📄 License - -Apache 2.0 - See [LICENSE](../LICENSE) for details. 
diff --git a/examples/arithmetic_repair.py b/examples/arithmetic_repair.py new file mode 100644 index 0000000..0794eba --- /dev/null +++ b/examples/arithmetic_repair.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +"""Use Case 6: Arithmetic Repair — fix computation errors in text. + +Demonstrates the deterministic arithmetic repair pipeline that finds +'A op B = C' patterns in text and corrects wrong results. + +This is PureReason's core advantage over raw LLMs: formal arithmetic +verification + repair. LLM arithmetic mistakes become opportunities +for formal correction. + +Run: + python examples/arithmetic_repair.py +""" + +import sys + +sys.path.insert(0, ".") + +from pureason.reasoning.repair import ( + _extract_letter_answer, + _extract_numeric_answer, + _majority_vote, + _majority_vote_letters, + _repair_arithmetic_in_step, +) + + +def example_repair(): + """Repair arithmetic errors in text.""" + cases = [ + # (input_text, should_repair) + ("We computed 3 + 4 = 7 apples.", False), + ("We computed 3 + 4 = 8 apples.", True), + ("The product is 6 * 7 = 41.", True), + ("Half of 10 is 10 / 2 = 5.", False), + ("The difference is 100 - 37 = 64.", True), + ] + + print("=== Arithmetic Repair ===") + for text, expect_repair in cases: + result = _repair_arithmetic_in_step(text) + was_repaired = "[repaired]" in result + status = "OK" if was_repaired == expect_repair else "UNEXPECTED" + print(f" {status:>10s}: {text}") + if was_repaired: + print(f" → {result}") + print() + + +def example_extract_numeric(): + """Extract the final numeric answer from text.""" + texts = [ + ("The answer is 42.", 42.0), + ("Therefore, 3.14 is the result.", 3.14), + ("No number here at all.", None), + ("After calculation we get 100 items total.", 100.0), + ] + + print("=== Extract Numeric Answer ===") + for text, expected in texts: + result = _extract_numeric_answer(text) + # Allow None comparison and close-enough floats + if result is None and expected is None: + match = True + elif result is not 
None and expected is not None: + match = abs(result - expected) < 0.01 + else: + match = False + status = "OK" if match else "MISMATCH" + print(f" {status:>8s}: {text!r} → {result} (expected {expected})") + print() + + +def example_extract_letter(): + """Extract MCQ letter answers from text.""" + texts = [ + ("Therefore the answer is A.", "A"), + ("After analysis, the best answer is **B**.", "B"), + ("ANSWER: C", "C"), + ("No clear MCQ answer here.", None), + ] + + print("=== Extract Letter Answer ===") + for text, expected in texts: + result = _extract_letter_answer(text) + status = "OK" if result == expected else "MISMATCH" + print(f" {status:>8s}: {text!r} → {result!r} (expected {expected!r})") + print() + + +def example_majority_vote(): + """Majority vote for aggregating multiple answers.""" + print("=== Majority Vote ===") + + # Numeric + nums = [42.0, 42.0, 41.0, 42.0] + print(f" Numeric: {nums} → {_majority_vote(nums)}") + + nums_empty: list = [] + print(f" Empty: {nums_empty} → {_majority_vote(nums_empty)}") + + # Letters + letters = ["A", "B", "A", "A", "C"] + print(f" Letters: {letters} → {_majority_vote_letters(letters)}") + + letters_with_none = [None, "B", None, "B"] + print(f" With None: {letters_with_none} → {_majority_vote_letters(letters_with_none)}") + print() + + +if __name__ == "__main__": + example_repair() + example_extract_numeric() + example_extract_letter() + example_majority_vote() diff --git a/examples/arithmetic_solver.py b/examples/arithmetic_solver.py new file mode 100644 index 0000000..3f3c7cd --- /dev/null +++ b/examples/arithmetic_solver.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +"""Use Case 3: Arithmetic Word Problem Solver. 
+ +Demonstrates PureReason's ability to: + - Extract numbers from natural language (digits and words) + - Detect the intended operation (+, -, *, /) + - Compute the answer with a verified reasoning chain + - Handle multi-step and ratio/proportion problems + +Run: + python examples/arithmetic_solver.py +""" + +import sys + +sys.path.insert(0, ".") + +from pureason.reasoning import solve_arithmetic +from pureason.reasoning.arithmetic import _detect_operation, _extract_numbers, _safe_eval + + +def example_safe_eval(): + """Demonstrate the safe expression evaluator (no exec/eval).""" + expressions = [ + ("2 + 3", 5.0), + ("10 - 4", 6.0), + ("6 * 7", 42.0), + ("10 / 4", 2.5), + ("2 ** 10", 1024.0), + ("(3 + 4) * 2", 14.0), + ("-5 + 3", -2.0), + ("5 / 0", None), # division by zero → None + ("import os", None), # not arithmetic → None + ] + + print("=== Safe Expression Evaluator ===") + for expr, expected in expressions: + result = _safe_eval(expr) + status = "OK" if result == expected else "MISMATCH" + print(f" {status:>8s}: _safe_eval({expr!r}) = {result} (expected {expected})") + print() + + +def example_number_extraction(): + """Extract numbers from natural language text.""" + texts = [ + "There are 3 apples and 10 bananas.", + "The price is 3.14 dollars.", + "Temperature is -5 degrees.", + "No numeric content here.", + "The factory produced 1,000 units.", + ] + + print("=== Number Extraction ===") + for text in texts: + nums = _extract_numbers(text) + print(f" {text}") + print(f" → {nums}") + print() + + +def example_operation_detection(): + """Detect the intended arithmetic operation from problem text.""" + problems = [ + ("How many total items if we add 3 more?", "+"), + ("How many are left after removing 5?", "-"), + ("What is the average speed?", "/"), + ("A car travels 60 mph for 4 hours. 
How far?", "*"), + ("They split the 100 dollars equally.", "/"), + ] + + print("=== Operation Detection ===") + for text, expected_op in problems: + detected = _detect_operation(text) + status = "OK" if detected == expected_op else "MISMATCH" + print(f" {status:>8s}: {text}") + print(f" detected={detected!r}, expected={expected_op!r}") + print() + + +def example_word_problems(): + """Solve complete arithmetic word problems end-to-end.""" + problems = [ + "Maria has 15 apples. She buys 8 more. How many apples does she have in total?", + "A store has 120 items. They sold 45 items. How many are left?", + "Each box contains 6 items. There are 9 boxes. How many items altogether?", + "There are 48 cookies to share among 8 children. How many does each child get?", + ] + + print("=== Word Problem Solver ===") + for problem in problems: + report = solve_arithmetic(problem) + print(f" Problem: {problem}") + print(f" Answer: {report.answer}") + print(f" Valid: {report.is_valid}") + print(f" Conf: {report.chain_confidence:.4f}") + for sv in report.steps: + print(f" Step {sv.step_index}: {sv.step_text[:70]}") + print() + + +if __name__ == "__main__": + example_safe_eval() + example_number_extraction() + example_operation_detection() + example_word_problems() diff --git a/examples/chain_of_thought.py b/examples/chain_of_thought.py new file mode 100644 index 0000000..70d8685 --- /dev/null +++ b/examples/chain_of_thought.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 +"""Use Case 2: Chain-of-Thought Verification — verify multi-step reasoning. + +Demonstrates how to verify an ordered sequence of reasoning steps for +internal consistency (each step on its own) and contextual consistency +(each step against the accumulated context). 
+ +Run: + python examples/chain_of_thought.py +""" + +import sys + +sys.path.insert(0, ".") + +from pureason.reasoning import verify_chain +from pureason.reasoning.models import EpistemicChainReport + + +def example_valid_chain(): + """A correct chain — all steps should pass.""" + problem = "A store has 50 apples. A customer buys 12. How many remain?" + steps = [ + "The store starts with 50 apples.", + "A customer buys 12 apples.", + "Remaining = 50 - 12 = 38.", + "Therefore, the answer is 38.", + ] + + report: EpistemicChainReport = verify_chain(problem, steps) + + print("=== Valid Chain ===") + _print_report(report) + print() + return report + + +def example_arithmetic_error_chain(): + """A chain with an arithmetic error — step should be flagged.""" + problem = "What is the total of 15 and 27?" + steps = [ + "We need to add the two numbers.", + "15 + 27 = 43.", + "Therefore, the answer is 43.", + ] + + # Note: 15 + 27 = 42, so step 1 has an arithmetic error + # (The answer step also carries the wrong value.) + + report = verify_chain(problem, steps) + + print("=== Arithmetic Error Chain ===") + _print_report(report) + print() + return report + + +def example_contradiction_chain(): + """A chain where a later step contradicts an earlier one.""" + problem = "Describe the weather." 
+ steps = [ + "The temperature is 35 degrees Celsius.", + "It is a very hot day.", + "The roads are covered in ice due to freezing temperatures.", + ] + + report = verify_chain(problem, steps) + + print("=== Contradiction Chain ===") + _print_report(report) + print() + return report + + +def example_empty_chain(): + """Edge case: empty step list.""" + report = verify_chain("Any problem?", []) + + print("=== Empty Chain ===") + print(f" is_valid: {report.is_valid}") + print(f" chain_confidence: {report.chain_confidence}") + print(f" summary: {report.summary}") + print() + return report + + +def example_single_step(): + """Edge case: single-step chain (the step is both reasoning and answer).""" + report = verify_chain( + "What is 2 + 2?", + ["2 + 2 = 4, so the answer is 4."], + ) + + print("=== Single-Step Chain ===") + _print_report(report) + print() + return report + + +def _print_report(report: EpistemicChainReport): + """Pretty-print an EpistemicChainReport.""" + print(f" Problem: {report.problem}") + print(f" is_valid: {report.is_valid}") + print(f" chain_confidence: {report.chain_confidence:.4f}") + print(f" answer: {report.answer}") + print(f" invalid_steps: {report.invalid_steps}") + print(f" summary: {report.summary}") + for sv in report.steps: + status = "OK" if sv.is_internally_valid and sv.is_contextually_valid else "FAIL" + flags_str = ", ".join(sv.flags) if sv.flags else "none" + print( + f" Step {sv.step_index}: [{status:>4s}] ECS={sv.ecs:3d} " + f"int={sv.is_internally_valid} ctx={sv.is_contextually_valid} " + f"flags=[{flags_str}]" + ) + if sv.contradiction_with_step is not None: + print(f" contradicts step {sv.contradiction_with_step}") + + +if __name__ == "__main__": + example_valid_chain() + example_arithmetic_error_chain() + example_contradiction_chain() + example_empty_chain() + example_single_step() diff --git a/examples/guard_verification.py b/examples/guard_verification.py new file mode 100644 index 0000000..655c0c7 --- /dev/null +++ 
b/examples/guard_verification.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +"""Use Case 1: ReasoningGuard — Verify any text with ECS scoring. + +Demonstrates the primary PureReason entry point for verifying text. +The ReasoningGuard checks text using the Epistemic Confidence Score (ECS), +repairs arithmetic errors, and tracks quality degradation over time. + +Run: + python examples/guard_verification.py +""" + +import sys + +sys.path.insert(0, ".") + +from pureason.guard import ReasoningGuard, VerificationResult + + +def example_basic_verification(): + """Verify text and inspect the VerificationResult fields.""" + guard = ReasoningGuard(threshold=60) + + text = "Water boils at 100 degrees Celsius at sea level." + result: VerificationResult = guard.verify(text) + + print("=== Basic Verification ===") + print(f"Input: {text}") + print(f"ECS: {result.ecs}/100") + print(f"Provenance: {result.provenance}") # "verified", "repaired", or "flagged" + print(f"Repaired: {result.repaired}") + print(f"Text out: {result.text}") + print() + return result + + +def example_threshold_levels(): + """Show how different thresholds change the provenance outcome.""" + claims = [ + "The Earth orbits the Sun.", + "2 + 2 = 5 so the total is wrong.", + "The answer is both yes and no at the same time.", + ] + + print("=== Threshold Comparison ===") + for threshold in (40, 60, 80): + guard = ReasoningGuard(threshold=threshold, repair=True) + print(f"\n--- threshold={threshold} ---") + for claim in claims: + r = guard.verify(claim) + print(f" [{r.provenance:>8s}] ECS={r.ecs:5.1f} {claim[:60]}") + print() + + +def example_arithmetic_repair(): + """Demonstrate automatic arithmetic repair.""" + guard = ReasoningGuard(threshold=60, repair=True) + + texts = [ + "3 + 4 = 7 so the answer is correct.", # correct — no repair + "3 + 4 = 8 so the answer is correct.", # wrong — repaired + "6 * 7 = 41 which gives the total.", # wrong — repaired + "10 / 2 = 5 items per group.", # correct — no repair + ] + + 
print("=== Arithmetic Repair ===") + for text in texts: + r = guard.verify(text) + if r.repaired: + print(f" REPAIRED: {r.original}") + print(f" => {r.text}") + else: + print(f" OK: {text}") + print() + + +def example_degradation_tracking(): + """Show the degradation warning when quality drops over time.""" + import warnings + + from pureason.guard import ReasoningDegradationWarning, _ReputationTracker + + tracker = _ReputationTracker(window=3, baseline_window=6, drop=5.0) + guard = ReasoningGuard( + threshold=60, + source_label="my_llm", + warn_on_degradation=True, + tracker=tracker, + ) + + # Simulate a sequence of ECS scores — first good, then degrading + good_texts = ["The sky is blue."] * 6 # will get ~75 ECS each + bad_texts = ["Maybe yes maybe no."] * 3 # will get lower ECS + + print("=== Degradation Tracking ===") + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + for t in good_texts + bad_texts: + guard.verify(t) + + degradation_warnings = [w for w in caught if issubclass(w.category, ReasoningDegradationWarning)] + if degradation_warnings: + print(f" Degradation detected: {degradation_warnings[0].message}") + else: + print(" No degradation detected (scores stayed stable).") + print() + + +def example_decision_logic(): + """Show a complete agent decision workflow.""" + guard = ReasoningGuard(threshold=70) + + agent_outputs = [ + "Paris is the capital of France.", + "The patient must have cancer based on a headache.", + "2 + 3 = 6 so there are six items.", + ] + + print("=== Agent Decision Logic ===") + for output in agent_outputs: + r = guard.verify(output) + if r.ecs >= 70: + action = "ACCEPT" + elif r.ecs >= 40: + action = "REVIEW" + else: + action = "REJECT" + + print(f" {action:>6s} (ECS={r.ecs:5.1f}, prov={r.provenance}): {output[:55]}") + print() + + +if __name__ == "__main__": + example_basic_verification() + example_threshold_levels() + example_arithmetic_repair() + example_degradation_tracking() + 
example_decision_logic() diff --git a/examples/mcq_picker.py b/examples/mcq_picker.py new file mode 100644 index 0000000..fa79c58 --- /dev/null +++ b/examples/mcq_picker.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 +"""Use Case 5: Multiple-Choice Question Picker. + +Demonstrates PureReason's MCQ answer selection by verifying each choice +against the question context and selecting the one with the highest ECS. + +Run: + python examples/mcq_picker.py +""" + +import sys + +sys.path.insert(0, ".") + +from pureason.reasoning import pick_best_answer +from pureason.reasoning.mcq import AmbiguousAnswerError + + +def example_clear_winner(): + """One choice is clearly more defensible than the others.""" + question = "What is the capital of France?" + choices = [ + "Berlin", + "Paris", + "Madrid", + "Rome", + ] + + best_idx, report = pick_best_answer(question, choices) + + print("=== Clear Winner ===") + print(f" Question: {question}") + for i, c in enumerate(choices): + marker = " ← best" if i == best_idx else "" + print(f" [{i}] {c}{marker}") + print(f" Selected index: {best_idx} ({choices[best_idx]})") + print(f" is_valid: {report.is_valid}") + print(f" chain_confidence: {report.chain_confidence:.4f}") + print() + return best_idx, report + + +def example_with_context(): + """Provide background context to improve discrimination.""" + question = "Based on the passage, which animal is the fastest?" + choices = [ + "The cheetah can reach 70 mph.", + "The lion can reach 50 mph.", + "The elephant can reach 25 mph.", + ] + context = "African wildlife includes cheetahs, lions, and elephants." 
+ + best_idx, report = pick_best_answer(question, choices, context=context) + + print("=== With Context ===") + print(f" Context: {context}") + print(f" Question: {question}") + for i, c in enumerate(choices): + marker = " ← best" if i == best_idx else "" + print(f" [{i}] {c}{marker}") + print(f" Selected index: {best_idx}") + print() + return best_idx, report + + +def example_ambiguous_strict(): + """When choices are equally defensible, strict mode raises an error.""" + question = "Pick a color." + choices = ["Red", "Blue"] + + print("=== Ambiguous (strict mode) ===") + print(f" Question: {question}") + try: + pick_best_answer(question, choices, strict=True) + print(" No ambiguity detected.") + except AmbiguousAnswerError as e: + print(f" AmbiguousAnswerError: {e}") + print(f" Tied indices: {e.tied_indices}, ECS: {e.ecs}") + print() + + +def example_ambiguous_lenient(): + """In default (lenient) mode, ties are resolved to the first index + and flagged with MCQ_AMBIGUOUS_ECS_TIE.""" + question = "Pick a color." + choices = ["Red", "Blue"] + + best_idx, report = pick_best_answer(question, choices, strict=False) + + print("=== Ambiguous (lenient mode) ===") + print(f" Selected index: {best_idx}") + if report.steps: + flags = report.steps[0].flags + print(f" Flags: {flags}") + print() + return best_idx, report + + +if __name__ == "__main__": + example_clear_winner() + example_with_context() + example_ambiguous_strict() + example_ambiguous_lenient() diff --git a/examples/syllogism_verification.py b/examples/syllogism_verification.py new file mode 100644 index 0000000..10e8f7c --- /dev/null +++ b/examples/syllogism_verification.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 +"""Use Case 4: Syllogism Verification — formal logic checking. + +Demonstrates PureReason's ability to verify logical arguments using +a multi-strategy approach: + 1. TF-IDF + Logistic Regression classifier (fast, data-driven) + 2. Z3 formal entailment (symbolic logic) + 3. 
Informal fallacy heuristics (hasty generalisation, circular reasoning) + 4. KAC consistency check (semantic overlap fallback) + +Run: + python examples/syllogism_verification.py +""" + +import sys + +sys.path.insert(0, ".") + +from pureason.reasoning import verify_syllogism +from pureason.reasoning.models import EpistemicChainReport + + +def example_valid_syllogism(): + """Classic valid syllogism — conclusion follows from premises.""" + premises = [ + "All mammals are warm-blooded.", + "Whales are mammals.", + ] + conclusion = "Whales are warm-blooded." + + report: EpistemicChainReport = verify_syllogism(premises, conclusion) + + print("=== Valid Syllogism ===") + _print_report(premises, conclusion, report) + print() + return report + + +def example_invalid_syllogism(): + """Invalid syllogism — conclusion does not follow.""" + premises = [ + "All dogs are animals.", + "All cats are animals.", + ] + conclusion = "Therefore, all dogs are cats." + + report = verify_syllogism(premises, conclusion) + + print("=== Invalid Syllogism ===") + _print_report(premises, conclusion, report) + print() + return report + + +def example_hasty_generalisation(): + """Informal fallacy: specific instances → universal conclusion.""" + premises = [ + "John is tall.", + "Mary is tall.", + ] + conclusion = "All people are tall." + + report = verify_syllogism(premises, conclusion) + + print("=== Hasty Generalisation ===") + _print_report(premises, conclusion, report) + print() + return report + + +def example_modus_ponens(): + """Valid argument form: If P then Q; P; therefore Q.""" + premises = [ + "If it rains, the ground gets wet.", + "It is raining.", + ] + conclusion = "The ground is wet." 
+ + report = verify_syllogism(premises, conclusion) + + print("=== Modus Ponens ===") + _print_report(premises, conclusion, report) + print() + return report + + +def example_three_premise_chain(): + """Transitive chain: A→B, B→C, therefore A→C.""" + premises = [ + "All birds have feathers.", + "All animals with feathers can fly.", + "Penguins are birds.", + ] + conclusion = "Penguins can fly." + + report = verify_syllogism(premises, conclusion) + + print("=== Three-Premise Chain (tricky — penguins can't fly) ===") + _print_report(premises, conclusion, report) + print() + return report + + +def _print_report(premises, conclusion, report: EpistemicChainReport): + """Pretty-print a syllogism verification report.""" + for i, p in enumerate(premises): + print(f" Premise {i + 1}: {p}") + print(f" Conclusion: {conclusion}") + print(f" ---") + print(f" is_valid: {report.is_valid}") + print(f" chain_confidence: {report.chain_confidence:.2f}") + print(f" summary: {report.summary}") + for sv in report.steps: + flags_str = ", ".join(sv.flags) if sv.flags else "none" + print( + f" Step {sv.step_index}: ECS={sv.ecs:3d} " + f"ctx_valid={sv.is_contextually_valid} " + f"flags=[{flags_str}]" + ) + + +if __name__ == "__main__": + example_valid_syllogism() + example_invalid_syllogism() + example_hasty_generalisation() + example_modus_ponens() + example_three_premise_chain() diff --git a/tests/test_examples.py b/tests/test_examples.py new file mode 100644 index 0000000..9395b26 --- /dev/null +++ b/tests/test_examples.py @@ -0,0 +1,438 @@ +"""Tests for example use cases — validates that all examples produce expected results. + +These tests exercise the Python reasoning layer without requiring the Rust binary +by mocking _core._run where needed. Pure-Python functionality (arithmetic, +repair, models) is tested directly. 
+""" + +import sys +import unittest +from unittest.mock import MagicMock, patch + +sys.path.insert(0, ".") + + +# --------------------------------------------------------------------------- +# 1. ReasoningGuard +# --------------------------------------------------------------------------- + + +class TestGuardUseCases(unittest.TestCase): + """Tests covering guard_verification.py use cases.""" + + def test_guard_verified_provenance(self) -> None: + """High ECS → provenance='verified'.""" + from pureason.guard import ReasoningGuard + + with patch("pureason.reasoning.chain._run") as mock_run: + mock_run.return_value = {"ecs": 80, "flags": []} + guard = ReasoningGuard(threshold=60) + result = guard.verify("Water boils at 100 degrees Celsius.") + self.assertEqual(result.provenance, "verified") + self.assertGreaterEqual(result.ecs, 60) + + def test_guard_flagged_provenance(self) -> None: + """Low ECS with no repairable content → provenance='flagged'.""" + from pureason.guard import ReasoningGuard + + with patch("pureason.reasoning.chain._run") as mock_run: + mock_run.return_value = {"ecs": 20, "flags": ["CERTAINTY_OVERREACH"]} + guard = ReasoningGuard(threshold=60, repair=True) + result = guard.verify("This is definitely absolutely true.") + self.assertEqual(result.provenance, "flagged") + + def test_guard_repaired_provenance(self) -> None: + """Low ECS with arithmetic error → provenance='repaired'.""" + from pureason.guard import ReasoningGuard + + with patch("pureason.reasoning.chain._run") as mock_run: + mock_run.return_value = {"ecs": 30, "flags": []} + guard = ReasoningGuard(threshold=60, repair=True) + result = guard.verify("3 + 4 = 8 so the total is wrong.") + self.assertEqual(result.provenance, "repaired") + self.assertTrue(result.repaired) + self.assertIn("7", result.text) + + def test_guard_threshold_affects_outcome(self) -> None: + """Same text should be 'verified' at low threshold, 'flagged' at high.""" + from pureason.guard import ReasoningGuard + + with 
patch("pureason.reasoning.chain._run") as mock_run: + mock_run.return_value = {"ecs": 55, "flags": []} + + low_guard = ReasoningGuard(threshold=40) + high_guard = ReasoningGuard(threshold=60) + + r_low = low_guard.verify("Some text.") + r_high = high_guard.verify("Some text.") + + self.assertEqual(r_low.provenance, "verified") + self.assertEqual(r_high.provenance, "flagged") + + def test_guard_repair_disabled(self) -> None: + """When repair=False, arithmetic errors are not corrected.""" + from pureason.guard import ReasoningGuard + + with patch("pureason.reasoning.chain._run") as mock_run: + mock_run.return_value = {"ecs": 30, "flags": []} + guard = ReasoningGuard(threshold=60, repair=False) + result = guard.verify("3 + 4 = 8") + self.assertFalse(result.repaired) + self.assertEqual(result.provenance, "flagged") + + +# --------------------------------------------------------------------------- +# 2. Chain-of-Thought Verification +# --------------------------------------------------------------------------- + + +class TestChainOfThoughtUseCases(unittest.TestCase): + """Tests covering chain_of_thought.py use cases.""" + + @patch("pureason.reasoning.chain._run") + def test_valid_chain_all_pass(self, mock_run: MagicMock) -> None: + """A correct chain should report is_valid=True.""" + from pureason.reasoning import verify_chain + + mock_run.return_value = {"ecs": 75, "flags": []} + report = verify_chain( + "What is 50 - 12?", + [ + "The store starts with 50 apples.", + "A customer buys 12 apples.", + "Remaining = 50 - 12 = 38.", + "Therefore, the answer is 38.", + ], + ) + self.assertTrue(report.is_valid) + self.assertEqual(len(report.invalid_steps), 0) + self.assertGreater(report.chain_confidence, 0) + + @patch("pureason.reasoning.chain._run") + def test_chain_with_arithmetic_error(self, mock_run: MagicMock) -> None: + """A chain containing '15 + 27 = 43' should flag that step.""" + from pureason.reasoning import verify_chain + + mock_run.return_value = {"ecs": 60, 
"flags": []} + report = verify_chain( + "What is 15 + 27?", + [ + "We add the numbers.", + "15 + 27 = 43.", + ], + ) + # Step 1 (index 1) has wrong arithmetic: 15+27=42 not 43 + arith_flagged = any( + "ARITHMETIC_ERROR" in sv.flags + for sv in report.steps + ) + self.assertTrue(arith_flagged, "Arithmetic error should be flagged") + + @patch("pureason.reasoning.chain._run") + def test_empty_chain(self, mock_run: MagicMock) -> None: + """Empty step list → is_valid=False, confidence=0.""" + from pureason.reasoning import verify_chain + + report = verify_chain("Any?", []) + self.assertFalse(report.is_valid) + self.assertEqual(report.chain_confidence, 0.0) + self.assertIsNone(report.answer) + + @patch("pureason.reasoning.chain._run") + def test_single_step_chain(self, mock_run: MagicMock) -> None: + """Single-step chain should still produce a valid report.""" + from pureason.reasoning import verify_chain + + mock_run.return_value = {"ecs": 70, "flags": []} + report = verify_chain("What is 2 + 2?", ["2 + 2 = 4."]) + self.assertEqual(len(report.steps), 1) + self.assertEqual(report.answer, "2 + 2 = 4.") + + +# --------------------------------------------------------------------------- +# 3. 
Arithmetic — Pure Python, no mocking needed +# --------------------------------------------------------------------------- + + +class TestArithmeticUseCases(unittest.TestCase): + """Tests covering arithmetic_solver.py use cases.""" + + def test_safe_eval_basic_operations(self) -> None: + from pureason.reasoning.arithmetic import _safe_eval + + self.assertAlmostEqual(_safe_eval("2 + 3"), 5.0) + self.assertAlmostEqual(_safe_eval("10 - 4"), 6.0) + self.assertAlmostEqual(_safe_eval("6 * 7"), 42.0) + self.assertAlmostEqual(_safe_eval("10 / 4"), 2.5) + + def test_safe_eval_rejects_dangerous_input(self) -> None: + from pureason.reasoning.arithmetic import _safe_eval + + self.assertIsNone(_safe_eval("import os")) + self.assertIsNone(_safe_eval("__import__('os')")) + self.assertIsNone(_safe_eval("")) + + def test_safe_eval_division_by_zero(self) -> None: + from pureason.reasoning.arithmetic import _safe_eval + + self.assertIsNone(_safe_eval("5 / 0")) + + def test_extract_numbers_digits(self) -> None: + from pureason.reasoning.arithmetic import _extract_numbers + + nums = _extract_numbers("There are 3 apples and 10 bananas.") + self.assertIn(3.0, nums) + self.assertIn(10.0, nums) + + def test_extract_numbers_decimals(self) -> None: + from pureason.reasoning.arithmetic import _extract_numbers + + nums = _extract_numbers("The price is 3.14 dollars.") + self.assertIn(3.14, nums) + + def test_extract_numbers_negative(self) -> None: + from pureason.reasoning.arithmetic import _extract_numbers + + nums = _extract_numbers("Temperature is -5 degrees.") + self.assertIn(-5.0, nums) + + def test_extract_numbers_comma_separated(self) -> None: + from pureason.reasoning.arithmetic import _extract_numbers + + nums = _extract_numbers("The factory produced 1,000 units.") + self.assertIn(1000.0, nums) + + def test_detect_operation_addition(self) -> None: + from pureason.reasoning.arithmetic import _detect_operation + + op = _detect_operation("How many total items if we add 3 more?") + 
self.assertEqual(op, "+") + + def test_detect_operation_subtraction(self) -> None: + from pureason.reasoning.arithmetic import _detect_operation + + op = _detect_operation("How many are left after removing 5?") + self.assertEqual(op, "-") + + def test_detect_operation_division(self) -> None: + from pureason.reasoning.arithmetic import _detect_operation + + op = _detect_operation("What is the average speed?") + self.assertEqual(op, "/") + + +# --------------------------------------------------------------------------- +# 4. Repair — Pure Python, no mocking needed +# --------------------------------------------------------------------------- + + +class TestRepairUseCases(unittest.TestCase): + """Tests covering arithmetic_repair.py use cases.""" + + def test_correct_expression_not_repaired(self) -> None: + from pureason.reasoning.repair import _repair_arithmetic_in_step + + result = _repair_arithmetic_in_step("3 + 4 = 7 apples.") + self.assertNotIn("[repaired]", result) + + def test_wrong_addition_repaired(self) -> None: + from pureason.reasoning.repair import _repair_arithmetic_in_step + + result = _repair_arithmetic_in_step("3 + 4 = 8 apples.") + self.assertIn("[repaired]", result) + self.assertIn("7", result) + + def test_wrong_multiplication_repaired(self) -> None: + from pureason.reasoning.repair import _repair_arithmetic_in_step + + result = _repair_arithmetic_in_step("6 * 7 = 41") + self.assertIn("[repaired]", result) + self.assertIn("42", result) + + def test_wrong_subtraction_repaired(self) -> None: + from pureason.reasoning.repair import _repair_arithmetic_in_step + + result = _repair_arithmetic_in_step("100 - 37 = 64") + self.assertIn("[repaired]", result) + self.assertIn("63", result) + + def test_extract_numeric_answer(self) -> None: + from pureason.reasoning.repair import _extract_numeric_answer + + self.assertEqual(_extract_numeric_answer("The answer is 42."), 42.0) + self.assertIsNone(_extract_numeric_answer("No number here at all.")) + + def 
test_extract_letter_answer(self) -> None: + from pureason.reasoning.repair import _extract_letter_answer + + self.assertEqual(_extract_letter_answer("Therefore the answer is A."), "A") + self.assertEqual(_extract_letter_answer("The best answer is **B**."), "B") + self.assertIsNone(_extract_letter_answer("No clear MCQ answer here.")) + + def test_majority_vote_numeric(self) -> None: + from pureason.reasoning.repair import _majority_vote + + self.assertEqual(_majority_vote([42.0, 42.0, 41.0, 42.0]), 42.0) + self.assertIsNone(_majority_vote([])) + + def test_majority_vote_letters(self) -> None: + from pureason.reasoning.repair import _majority_vote_letters + + self.assertEqual(_majority_vote_letters(["A", "B", "A", "A"]), "A") + self.assertEqual(_majority_vote_letters([None, "B", None, "B"]), "B") + self.assertIsNone(_majority_vote_letters([])) + + +# --------------------------------------------------------------------------- +# 5. MCQ Picker +# --------------------------------------------------------------------------- + + +class TestMCQUseCases(unittest.TestCase): + """Tests covering mcq_picker.py use cases.""" + + @patch("pureason.reasoning.chain._run") + def test_picks_an_index(self, mock_run: MagicMock) -> None: + """pick_best_answer should return a valid index.""" + from pureason.reasoning import pick_best_answer + + # Return different ECS for each choice to make one clearly best + call_count = 0 + def side_effect(*args, **kwargs): + nonlocal call_count + call_count += 1 + ecs_values = [80, 50, 60, 40] + idx = min(call_count - 1, len(ecs_values) - 1) + return {"ecs": ecs_values[idx], "flags": []} + mock_run.side_effect = side_effect + + choices = ["Paris", "Berlin", "Madrid", "Rome"] + best_idx, report = pick_best_answer("Capital of France?", choices) + self.assertIn(best_idx, range(len(choices))) + + def test_empty_choices_raises(self) -> None: + from pureason.reasoning import pick_best_answer + + with self.assertRaises(ValueError): + 
pick_best_answer("Question?", []) + + @patch("pureason.reasoning.chain._run") + def test_strict_mode_raises_on_tie(self, mock_run: MagicMock) -> None: + """When all choices get the same ECS, strict mode raises AmbiguousAnswerError.""" + from pureason.reasoning import pick_best_answer + from pureason.reasoning.mcq import AmbiguousAnswerError + + mock_run.return_value = {"ecs": 50, "flags": []} + with self.assertRaises(AmbiguousAnswerError): + pick_best_answer("Pick one.", ["A", "B"], strict=True) + + @patch("pureason.reasoning.chain._run") + def test_lenient_mode_flags_tie(self, mock_run: MagicMock) -> None: + """Lenient mode returns first index and adds MCQ_AMBIGUOUS_ECS_TIE flag.""" + from pureason.reasoning import pick_best_answer + + mock_run.return_value = {"ecs": 50, "flags": []} + best_idx, report = pick_best_answer("Pick one.", ["A", "B"], strict=False) + self.assertEqual(best_idx, 0) + if report.steps: + self.assertIn("MCQ_AMBIGUOUS_ECS_TIE", report.steps[0].flags) + + +# --------------------------------------------------------------------------- +# 6. Syllogism Verification +# --------------------------------------------------------------------------- + + +class TestSyllogismUseCases(unittest.TestCase): + """Tests covering syllogism_verification.py use cases.""" + + @patch("pureason.reasoning.chain._run") + def test_valid_syllogism(self, mock_run: MagicMock) -> None: + """Classic valid syllogism should be detected as valid.""" + from pureason.reasoning import verify_syllogism + + mock_run.return_value = {"ecs": 75, "flags": []} + report = verify_syllogism( + premises=["All mammals are warm-blooded.", "Whales are mammals."], + conclusion="Whales are warm-blooded.", + ) + self.assertTrue(report.is_valid) + + def test_invalid_syllogism(self) -> None: + """Invalid syllogism — undistributed middle — should be detected. + + The heuristic fallacy check detects hasty generalisation: + no universal premises support the universal conclusion. 
+ We mock only the KAC fallback to isolate the heuristic path. + """ + from pureason.reasoning import verify_syllogism + + with patch("pureason.reasoning.syllogism._classifier_check", return_value=None), \ + patch("pureason.reasoning.syllogism._z3_entailment_check", return_value=None), \ + patch("pureason.reasoning.syllogism._kac_step_vs_context", return_value=(False, [])): + report = verify_syllogism( + premises=["All dogs are animals.", "All cats are animals."], + conclusion="All dogs are cats.", + ) + self.assertFalse(report.is_valid) + + +# --------------------------------------------------------------------------- +# 7. Models — dataclass structure +# --------------------------------------------------------------------------- + + +class TestModelUseCases(unittest.TestCase): + """Test model dataclass structure used across all examples.""" + + def test_step_verification_fields(self) -> None: + from pureason.reasoning.models import StepVerification + + sv = StepVerification( + step_index=0, + step_text="test", + ecs=75, + is_internally_valid=True, + is_contextually_valid=True, + flags=["TEST_FLAG"], + ) + self.assertEqual(sv.step_index, 0) + self.assertEqual(sv.ecs, 75) + self.assertTrue(sv.is_internally_valid) + self.assertIsNone(sv.contradiction_with_step) + + def test_chain_report_first_failure(self) -> None: + from pureason.reasoning.models import EpistemicChainReport, StepVerification + + sv_ok = StepVerification(0, "ok", 80, True, True) + sv_bad = StepVerification(1, "bad", 20, False, True, flags=["ERROR"]) + report = EpistemicChainReport( + problem="test", + steps=[sv_ok, sv_bad], + answer="bad", + is_valid=False, + chain_confidence=0.3, + invalid_steps=[1], + summary="Step 2 failed.", + ) + self.assertIsNotNone(report.first_failure) + self.assertEqual(report.first_failure.step_index, 1) + + def test_chain_report_no_failure(self) -> None: + from pureason.reasoning.models import EpistemicChainReport, StepVerification + + sv_ok = StepVerification(0, "ok", 
80, True, True) + report = EpistemicChainReport( + problem="test", + steps=[sv_ok], + answer="ok", + is_valid=True, + chain_confidence=0.8, + invalid_steps=[], + summary="All passed.", + ) + self.assertIsNone(report.first_failure) + + +if __name__ == "__main__": + unittest.main() From 5eb5ae492decbe130201ebbc95d251650712c1f0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 2 Jun 2026 07:26:27 +0000 Subject: [PATCH 4/5] Address review feedback: clarify comments and install docs --- examples/README.md | 4 ++-- examples/chain_of_thought.py | 4 ++-- tests/test_examples.py | 11 ++++++----- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/examples/README.md b/examples/README.md index fe13790..0204f1d 100644 --- a/examples/README.md +++ b/examples/README.md @@ -5,8 +5,8 @@ expected inputs, outputs, and integration patterns. > **Prerequisite** — install the Python package first: > ```bash -> pip install -e . # core (always required) -> pip install -e ".[nlp]" # + spaCy & word2number +> pip install -e . # core (examples 1–2, 6–9) +> pip install -e ".[nlp]" # + spaCy & word2number (examples 3–5) > python -m spacy download en_core_web_sm > ``` diff --git a/examples/chain_of_thought.py b/examples/chain_of_thought.py index 70d8685..7cef8d3 100644 --- a/examples/chain_of_thought.py +++ b/examples/chain_of_thought.py @@ -44,12 +44,12 @@ def example_arithmetic_error_chain(): "Therefore, the answer is 43.", ] - # Note: 15 + 27 = 42, so step 1 has an arithmetic error - # (The answer step also carries the wrong value.) 
+ # Note: 15 + 27 = 42, so the step at index 1 (the second step) has an arithmetic error report = verify_chain(problem, steps) print("=== Arithmetic Error Chain ===") + print(" Note: 15 + 27 = 42, not 43 — the second step should be flagged.") _print_report(report) print() return report diff --git a/tests/test_examples.py b/tests/test_examples.py index 9395b26..0f9fc6f 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -121,7 +121,8 @@ def test_chain_with_arithmetic_error(self, mock_run: MagicMock) -> None: "15 + 27 = 43.", ], ) - # Step 1 (index 1) has wrong arithmetic: 15+27=42 not 43 + # The step "15 + 27 = 43" is at index 1 (second step). + # 15 + 27 = 42, not 43, so it should be flagged. arith_flagged = any( "ARITHMETIC_ERROR" in sv.flags for sv in report.steps @@ -359,11 +360,11 @@ def test_valid_syllogism(self, mock_run: MagicMock) -> None: self.assertTrue(report.is_valid) def test_invalid_syllogism(self) -> None: - """Invalid syllogism — undistributed middle — should be detected. + """Invalid syllogism — conclusion does not follow from premises. - The heuristic fallacy check detects hasty generalisation: - no universal premises support the universal conclusion. - We mock only the KAC fallback to isolate the heuristic path. + We mock the classifier, Z3, and KAC layers to isolate the heuristic + fallacy check which detects that no universal premises support + the universal conclusion. 
""" from pureason.reasoning import verify_syllogism From d96e81cc90dff6cb5ae5d794b4061311635b09e5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 2 Jun 2026 07:43:18 +0000 Subject: [PATCH 5/5] Fix CI: ruff lint errors (F541, RUF059), format issues, and missing click dependency --- .github/workflows/lint.yml | 2 +- examples/arithmetic_solver.py | 4 ++-- examples/guard_verification.py | 18 ++++++++++-------- examples/syllogism_verification.py | 2 +- tests/test_examples.py | 17 +++++++++-------- 5 files changed, 23 insertions(+), 20 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 975dad1..2435868 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -42,7 +42,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install package - run: pip install -e ".[logic,nlp]" scikit-learn + run: pip install -e ".[logic,nlp]" scikit-learn click - name: Download spaCy model run: python -m spacy download en_core_web_sm diff --git a/examples/arithmetic_solver.py b/examples/arithmetic_solver.py index 3f3c7cd..c993abd 100644 --- a/examples/arithmetic_solver.py +++ b/examples/arithmetic_solver.py @@ -29,8 +29,8 @@ def example_safe_eval(): ("2 ** 10", 1024.0), ("(3 + 4) * 2", 14.0), ("-5 + 3", -2.0), - ("5 / 0", None), # division by zero → None - ("import os", None), # not arithmetic → None + ("5 / 0", None), # division by zero → None + ("import os", None), # not arithmetic → None ] print("=== Safe Expression Evaluator ===") diff --git a/examples/guard_verification.py b/examples/guard_verification.py index 655c0c7..d14b542 100644 --- a/examples/guard_verification.py +++ b/examples/guard_verification.py @@ -26,7 +26,7 @@ def example_basic_verification(): print("=== Basic Verification ===") print(f"Input: {text}") print(f"ECS: {result.ecs}/100") - print(f"Provenance: {result.provenance}") # "verified", "repaired", or "flagged" + print(f"Provenance: 
{result.provenance}") # "verified", "repaired", or "flagged" print(f"Repaired: {result.repaired}") print(f"Text out: {result.text}") print() @@ -56,10 +56,10 @@ def example_arithmetic_repair(): guard = ReasoningGuard(threshold=60, repair=True) texts = [ - "3 + 4 = 7 so the answer is correct.", # correct — no repair - "3 + 4 = 8 so the answer is correct.", # wrong — repaired - "6 * 7 = 41 which gives the total.", # wrong — repaired - "10 / 2 = 5 items per group.", # correct — no repair + "3 + 4 = 7 so the answer is correct.", # correct — no repair + "3 + 4 = 8 so the answer is correct.", # wrong — repaired + "6 * 7 = 41 which gives the total.", # wrong — repaired + "10 / 2 = 5 items per group.", # correct — no repair ] print("=== Arithmetic Repair ===") @@ -88,8 +88,8 @@ def example_degradation_tracking(): ) # Simulate a sequence of ECS scores — first good, then degrading - good_texts = ["The sky is blue."] * 6 # will get ~75 ECS each - bad_texts = ["Maybe yes maybe no."] * 3 # will get lower ECS + good_texts = ["The sky is blue."] * 6 # will get ~75 ECS each + bad_texts = ["Maybe yes maybe no."] * 3 # will get lower ECS print("=== Degradation Tracking ===") with warnings.catch_warnings(record=True) as caught: @@ -97,7 +97,9 @@ def example_degradation_tracking(): for t in good_texts + bad_texts: guard.verify(t) - degradation_warnings = [w for w in caught if issubclass(w.category, ReasoningDegradationWarning)] + degradation_warnings = [ + w for w in caught if issubclass(w.category, ReasoningDegradationWarning) + ] if degradation_warnings: print(f" Degradation detected: {degradation_warnings[0].message}") else: diff --git a/examples/syllogism_verification.py b/examples/syllogism_verification.py index 10e8f7c..210e4e0 100644 --- a/examples/syllogism_verification.py +++ b/examples/syllogism_verification.py @@ -106,7 +106,7 @@ def _print_report(premises, conclusion, report: EpistemicChainReport): for i, p in enumerate(premises): print(f" Premise {i + 1}: {p}") print(f" 
Conclusion: {conclusion}") - print(f" ---") + print(" ---") print(f" is_valid: {report.is_valid}") print(f" chain_confidence: {report.chain_confidence:.2f}") print(f" summary: {report.summary}") diff --git a/tests/test_examples.py b/tests/test_examples.py index 0f9fc6f..f5df708 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -123,10 +123,7 @@ def test_chain_with_arithmetic_error(self, mock_run: MagicMock) -> None: ) # The step "15 + 27 = 43" is at index 1 (second step). # 15 + 27 = 42, not 43, so it should be flagged. - arith_flagged = any( - "ARITHMETIC_ERROR" in sv.flags - for sv in report.steps - ) + arith_flagged = any("ARITHMETIC_ERROR" in sv.flags for sv in report.steps) self.assertTrue(arith_flagged, "Arithmetic error should be flagged") @patch("pureason.reasoning.chain._run") @@ -299,16 +296,18 @@ def test_picks_an_index(self, mock_run: MagicMock) -> None: # Return different ECS for each choice to make one clearly best call_count = 0 + def side_effect(*args, **kwargs): nonlocal call_count call_count += 1 ecs_values = [80, 50, 60, 40] idx = min(call_count - 1, len(ecs_values) - 1) return {"ecs": ecs_values[idx], "flags": []} + mock_run.side_effect = side_effect choices = ["Paris", "Berlin", "Madrid", "Rome"] - best_idx, report = pick_best_answer("Capital of France?", choices) + best_idx, _report = pick_best_answer("Capital of France?", choices) self.assertIn(best_idx, range(len(choices))) def test_empty_choices_raises(self) -> None: @@ -368,9 +367,11 @@ def test_invalid_syllogism(self) -> None: """ from pureason.reasoning import verify_syllogism - with patch("pureason.reasoning.syllogism._classifier_check", return_value=None), \ - patch("pureason.reasoning.syllogism._z3_entailment_check", return_value=None), \ - patch("pureason.reasoning.syllogism._kac_step_vs_context", return_value=(False, [])): + with ( + patch("pureason.reasoning.syllogism._classifier_check", return_value=None), + 
patch("pureason.reasoning.syllogism._z3_entailment_check", return_value=None), + patch("pureason.reasoning.syllogism._kac_step_vs_context", return_value=(False, [])), + ): report = verify_syllogism( premises=["All dogs are animals.", "All cats are animals."], conclusion="All dogs are cats.",