From a58d41598d67befa849b8a5c4ce155c3ccc6f555 Mon Sep 17 00:00:00 2001 From: Pei Chu Date: Thu, 18 Jun 2026 18:39:44 +0800 Subject: [PATCH 1/9] feat: add LLMAISmell checker --- dingo/model/llm/llm_ai_smell.py | 230 ++++++++++++++++++++++++++++++++ 1 file changed, 230 insertions(+) create mode 100644 dingo/model/llm/llm_ai_smell.py diff --git a/dingo/model/llm/llm_ai_smell.py b/dingo/model/llm/llm_ai_smell.py new file mode 100644 index 00000000..4ede21f4 --- /dev/null +++ b/dingo/model/llm/llm_ai_smell.py @@ -0,0 +1,230 @@ +""" +AI Smell Detector for Requirement Documents (需求文档 AI 味检测器) + +Detects AI-generated writing patterns in requirement documents across 5 dimensions: +- 正确的废话指数 (Correct Nonsense Index) +- 无限镜像感 (Infinite Mirror Index) +- 彩虹屁密度 (Rainbow Fart Density) +- 细节真空度 (Detail Vacuum Index) +- 形容词暴力指数 (Adjective Violence Index) +""" + +import json + +from dingo.io.input import Data, RequiredField +from dingo.io.output.eval_detail import EvalDetail +from dingo.model import Model +from dingo.model.llm.base_openai import BaseOpenAI +from dingo.utils import log +from dingo.utils.exception import ConvertJsonError + + +@Model.llm_register("LLMAISmell") +class LLMAISmell(BaseOpenAI): + """ + AI Smell Detector for requirement documents. + + Evaluates 5 dimensions of AI-generated writing patterns: + 1. 正确的废话指数 - Hollow truisms ("In today's society...", "With the rapid development of...") + 2. 无限镜像感 - Repetitive emphasis of the same point in different words + 3. 彩虹屁密度 - Excessive praise, inflated importance claims + 4. 细节真空度 - Sounds complete but lacks any actionable specifics + 5. 形容词暴力指数 - Buzzword overuse (高效/赋能/闭环/生态/颗粒度...) + + Each dimension is scored 0-10. Overall AI smell score is the weighted average. + """ + + _metric_info = { + "category": "Document Quality Assessment Metrics", + "metric_name": "LLMAISmell", + "description": "Detects AI-generated writing patterns in requirement documents across 5 dimensions: hollow truisms, repetition, rainbow farts, detail vacuum, and adjective violence", + "examples": "examples/llm_and_rule/llm_local.py", + "evaluation_results": "", + } + + _required_fields = [RequiredField.CONTENT] + + # Score threshold above which a document is flagged as AI-smelling + threshold = 6 + + prompt = """ +# 角色 +你是一位资深需求评审专家,专门识别需求文档中的 AI 代写痕迹。 + +# 任务 +分析下面的文档,从 5 个维度评估其"AI 味",每个维度打分 0-10: + +## 评估维度 + +### 1. 💊 正确的废话指数(0-10) +**定义**:用正确但毫无信息量的话填充文档,听起来很有道理但什么都没说。 +**典型表现**: +- "在当今社会……"、"随着技术的不断发展……" +- "这对用户体验至关重要"(但没有说为什么或怎么做) +- "我们需要确保系统的稳定性和可靠性"(没有具体指标) +- 每段开头都在重述背景 + +**打分标准**: +- 0-2:文档直接切入主题,陈述均有实质内容 +- 3-5:有少量套话但不影响整体 +- 6-8:大量空洞表述,信息密度低 +- 9-10:几乎全是废话,读完不知道要做什么 + +### 2. 🪞 无限镜像感(0-10) +**定义**:同一个意思用不同的话反复说,制造"内容丰富"的假象。 +**典型表现**: +- 同一个功能点在不同章节反复描述 +- "提升用户体验" → "优化用户感受" → "改善用户满意度"(三句话说同一件事) +- 结论和摘要和正文高度重复 + +**打分标准**: +- 0-2:每句话都有新信息 +- 3-5:有轻微重复但可接受 +- 6-8:明显感觉在凑字数 +- 9-10:镜中镜,绕来绕去 + +### 3. 🌈 彩虹屁密度(0-10) +**定义**:过度拔高重要性、夸大影响、给自己项目过度背书。 +**典型表现**: +- "这将彻底改变……"、"革命性的……"、"行业领先的……" +- "大幅提升"但没有数据 +- "用户迫切需要"但没有调研依据 +- 每个功能都是"核心"、"关键"、"重要" + +**打分标准**: +- 0-2:表述客观,有数据支撑 +- 3-5:略有夸张但在合理范围 +- 6-8:随处可见夸大词汇 +- 9-10:每句话都在吹,读起来像广告 + +### 4. 🧩 细节真空度(0-10) +**定义**:文档结构完整、格式规范,但缺乏任何可落地的具体信息。 +**典型表现**: +- "系统应支持多种支付方式"(哪些方式?) +- "性能要满足用户需求"(什么性能?什么需求?) +- "界面设计应符合用户习惯"(谁的习惯?什么标准?) +- 没有数字、没有边界条件、没有异常处理 + +**打分标准**: +- 0-2:有具体的数字、接口、用例、边界条件 +- 3-5:部分模糊但核心功能有描述 +- 6-8:大量"应该"、"需要"但没有"怎么做" +- 9-10:读完完全不知道要开发什么 + +### 5. ✨ 形容词暴力指数(0-10) +**定义**:大量堆叠科技/管理类buzzword,用词汇密度掩盖内容空洞。 +**高危词汇**:高效、赋能、闭环、生态、颗粒度、抓手、落地、对齐、拉通、赛道、底层逻辑、顶层设计、数字化转型、智能化、一体化、全链路、沉淀、复用、标准化、降本增效、价值最大化 +**打分标准**: +- 0-2:用词精准朴素,术语有明确定义 +- 3-5:偶有流行词但不影响理解 +- 6-8:buzzword 密集,读起来像PPT +- 9-10:去掉这些词文档就空了 + +--- + +## 综合 AI 味总分(0-10) +基于以上 5 个维度的加权综合评估。 + +**权重参考**: +- 细节真空度(0.3):最能区分 AI 和人写的 +- 正确的废话指数(0.25) +- 形容词暴力指数(0.2) +- 无限镜像感(0.15) +- 彩虹屁密度(0.1) + +--- + +## 输出格式 + +请严格按照以下 JSON 格式输出,不要输出任何其他内容: + +```json +{ + "total_score": <综合AI味总分 0-10>, + "dimensions": { + "correct_nonsense": <正确的废话指数 0-10>, + "infinite_mirror": <无限镜像感 0-10>, + "rainbow_fart": <彩虹屁密度 0-10>, + "detail_vacuum": <细节真空度 0-10>, + "adjective_violence": <形容词暴力指数 0-10> + }, + "evidence": { + "correct_nonsense": "<最典型的1-2个例子,直接引用原文>", + "infinite_mirror": "<最典型的1-2个例子,直接引用原文>", + "rainbow_fart": "<最典型的1-2个例子,直接引用原文>", + "detail_vacuum": "<最典型的1-2个例子,直接引用原文>", + "adjective_violence": "<最典型的1-2个例子,直接引用原文>" + }, + "verdict": "<一句话总结,不超过50字>" +} +``` + +--- + +## 待评估文档: + +""" + + @classmethod + def process_response(cls, response: str) -> EvalDetail: + """ + Process LLM response and convert to EvalDetail. + """ + # Clean markdown code blocks + if response.startswith("```json"): + response = response[7:] + elif response.startswith("```"): + response = response[3:] + if response.endswith("```"): + response = response[:-3] + response = response.strip() + + try: + data = json.loads(response) + except json.JSONDecodeError: + raise ConvertJsonError(f"Failed to parse AI smell response as JSON: {response[:200]}") + + total_score = data.get("total_score", 0) + dimensions = data.get("dimensions", {}) + evidence = data.get("evidence", {}) + verdict = data.get("verdict", "") + + # Build human-readable reason + dim_labels = { + "correct_nonsense": "💊 正确的废话指数", + "infinite_mirror": "🪞 无限镜像感", + "rainbow_fart": "🌈 彩虹屁密度", + "detail_vacuum": "🧩 细节真空度", + "adjective_violence": "✨ 形容词暴力指数", + } + + reason_lines = [f"🤖 AI味总分:{total_score}/10"] + reason_lines.append("") + for key, label in dim_labels.items(): + score = dimensions.get(key, 0) + example = evidence.get(key, "") + bar = cls._score_bar(score) + reason_lines.append(f"{label}:{score}/10 {bar}") + if example and score >= 5: + reason_lines.append(f" └ 例:{example}") + reason_lines.append("") + reason_lines.append(f"📝 {verdict}") + + is_ai_smell = total_score >= cls.threshold + + result = EvalDetail( + metric=cls.__name__, + status=is_ai_smell, + score=round(total_score / 10, 2), # normalize to 0-1 for consistency + label=["AI_SMELL_DETECTED"] if is_ai_smell else ["AI_SMELL_CLEAN"], + reason=["\n".join(reason_lines)], + ) + + return result + + @classmethod + def _score_bar(cls, score: int, width: int = 10) -> str: + """Generate a simple ASCII progress bar for a 0-10 score.""" + filled = round(score) + empty = width - filled + return f"[{'█' * filled}{'░' * empty}]" From dc34d93a4b40f5acf516f9c1b7031fcbb2d66983 Mon Sep 17 00:00:00 2001 From: Pei Chu Date: Thu, 18 Jun 2026 18:39:57 +0800 Subject: [PATCH 2/9] test: add unit tests for LLMAISmell --- test/scripts/model/llm/test_llm_ai_smell.py | 243 ++++++++++++++++++++ 1 file changed, 243 insertions(+) create mode 100644 test/scripts/model/llm/test_llm_ai_smell.py diff --git a/test/scripts/model/llm/test_llm_ai_smell.py b/test/scripts/model/llm/test_llm_ai_smell.py new file mode 100644 index 00000000..d6ac0f31 --- /dev/null +++ b/test/scripts/model/llm/test_llm_ai_smell.py @@ -0,0 +1,243 @@ +""" +Unit tests for LLMAISmell - AI Smell Detector for Requirement Documents +""" +import json + +import pytest + +from dingo.model.llm.llm_ai_smell import LLMAISmell + + +class TestLLMAISmell: + """Tests for the AI smell detection checker.""" + + def _make_response( + self, + total_score=3, + correct_nonsense=2, + infinite_mirror=3, + rainbow_fart=2, + detail_vacuum=4, + adjective_violence=2, + verdict="文档有一定 AI 味但仍可接受", + evidence=None, + ): + if evidence is None: + evidence = { + "correct_nonsense": "", + "infinite_mirror": "", + "rainbow_fart": "", + "detail_vacuum": "", + "adjective_violence": "", + } + return json.dumps( + { + "total_score": total_score, + "dimensions": { + "correct_nonsense": correct_nonsense, + "infinite_mirror": infinite_mirror, + "rainbow_fart": rainbow_fart, + "detail_vacuum": detail_vacuum, + "adjective_violence": adjective_violence, + }, + "evidence": evidence, + "verdict": verdict, + }, + ensure_ascii=False, + ) + + # ────────────────────────────────────────────── + # Basic pass / fail logic + # ────────────────────────────────────────────── + + def test_clean_document_not_flagged(self): + """Low-scoring document should NOT be flagged as AI smell.""" + response = self._make_response(total_score=3, verdict="文档整体朴实,无明显 AI 味") + result = LLMAISmell.process_response(response) + + assert result.status is False + assert result.label == ["AI_SMELL_CLEAN"] + assert result.metric == "LLMAISmell" + + def test_ai_smell_document_flagged(self): + """High-scoring document SHOULD be flagged as AI smell.""" + response = self._make_response( + total_score=8, + correct_nonsense=8, + detail_vacuum=9, + adjective_violence=8, + verdict="典型 AI 代写,大量废话和 buzzword,缺乏可落地细节", + evidence={ + "correct_nonsense": "在当今数字化转型的大背景下……", + "infinite_mirror": "", + "rainbow_fart": "彻底革新传统模式", + "detail_vacuum": "系统性能应满足业务需求", + "adjective_violence": "赋能、闭环、降本增效、颗粒度", + }, + ) + result = LLMAISmell.process_response(response) + + assert result.status is True + assert result.label == ["AI_SMELL_DETECTED"] + + def test_threshold_boundary_exactly_at_threshold(self): + """Score exactly at threshold (6) should be flagged.""" + response = self._make_response(total_score=LLMAISmell.threshold) + result = LLMAISmell.process_response(response) + + assert result.status is True + assert result.label == ["AI_SMELL_DETECTED"] + + def test_threshold_boundary_just_below(self): + """Score just below threshold (5) should NOT be flagged.""" + response = self._make_response(total_score=LLMAISmell.threshold - 1) + result = LLMAISmell.process_response(response) + + assert result.status is False + assert result.label == ["AI_SMELL_CLEAN"] + + # ────────────────────────────────────────────── + # Score normalization + # ────────────────────────────────────────────── + + def test_score_normalized_to_zero_one(self): + """score field should be in [0, 1] range.""" + for raw in [0, 5, 10]: + response = self._make_response(total_score=raw) + result = LLMAISmell.process_response(response) + assert 0.0 <= result.score <= 1.0, f"score out of range for raw={raw}" + + def test_score_value_correct(self): + """score = total_score / 10, rounded to 2 decimals.""" + response = self._make_response(total_score=7) + result = LLMAISmell.process_response(response) + assert result.score == pytest.approx(0.70, abs=1e-9) + + # ────────────────────────────────────────────── + # Reason string content + # ────────────────────────────────────────────── + + def test_reason_contains_total_score(self): + """Reason should display the total AI smell score.""" + response = self._make_response(total_score=7) + result = LLMAISmell.process_response(response) + assert "7/10" in result.reason[0] + + def test_reason_contains_all_dimension_labels(self): + """Reason should list all 5 dimension labels.""" + response = self._make_response() + result = LLMAISmell.process_response(response) + reason = result.reason[0] + + assert "💊 正确的废话指数" in reason + assert "🪞 无限镜像感" in reason + assert "🌈 彩虹屁密度" in reason + assert "🧩 细节真空度" in reason + assert "✨ 形容词暴力指数" in reason + + def test_reason_contains_verdict(self): + """Reason should contain the verdict string.""" + verdict = "这是一份高度 AI 味的文档,建议重写" + response = self._make_response(total_score=8, verdict=verdict) + result = LLMAISmell.process_response(response) + assert verdict in result.reason[0] + + def test_evidence_shown_for_high_scores(self): + """Evidence should appear in reason for dimensions scoring >= 5.""" + evidence_text = "在当今社会,随着技术不断发展……" + response = self._make_response( + total_score=7, + correct_nonsense=6, + evidence={ + "correct_nonsense": evidence_text, + "infinite_mirror": "", + "rainbow_fart": "", + "detail_vacuum": "", + "adjective_violence": "", + }, + ) + result = LLMAISmell.process_response(response) + assert evidence_text in result.reason[0] + + def test_evidence_hidden_for_low_scores(self): + """Evidence should NOT appear in reason for dimensions scoring < 5.""" + evidence_text = "某个不应出现的例句" + response = self._make_response( + total_score=3, + correct_nonsense=2, + evidence={ + "correct_nonsense": evidence_text, + "infinite_mirror": "", + "rainbow_fart": "", + "detail_vacuum": "", + "adjective_violence": "", + }, + ) + result = LLMAISmell.process_response(response) + assert evidence_text not in result.reason[0] + + # ────────────────────────────────────────────── + # Markdown cleanup + # ────────────────────────────────────────────── + + def test_markdown_json_code_block_stripped(self): + """LLM often wraps JSON in ```json ... ``` — should be handled.""" + inner = self._make_response(total_score=4) + wrapped = f"```json\n{inner}\n```" + result = LLMAISmell.process_response(wrapped) + assert result.label == ["AI_SMELL_CLEAN"] + + def test_plain_code_block_stripped(self): + """Plain ``` ... ``` wrapper should also be stripped.""" + inner = self._make_response(total_score=7) + wrapped = f"```\n{inner}\n```" + result = LLMAISmell.process_response(wrapped) + assert result.label == ["AI_SMELL_DETECTED"] + + # ────────────────────────────────────────────── + # Error handling + # ────────────────────────────────────────────── + + def test_invalid_json_raises_convert_error(self): + """Garbage response should raise ConvertJsonError.""" + from dingo.utils.exception import ConvertJsonError + + with pytest.raises(ConvertJsonError): + LLMAISmell.process_response("This is not JSON at all") + + # ────────────────────────────────────────────── + # Metadata + # ────────────────────────────────────────────── + + def test_metric_name_matches_class(self): + """EvalDetail.metric should be the class name.""" + response = self._make_response() + result = LLMAISmell.process_response(response) + assert result.metric == "LLMAISmell" + + def test_required_fields(self): + """Checker should only require CONTENT (no prompt/context needed).""" + from dingo.io.input import RequiredField + + assert RequiredField.CONTENT in LLMAISmell._required_fields + assert len(LLMAISmell._required_fields) == 1 + + # ────────────────────────────────────────────── + # Score bar helper + # ────────────────────────────────────────────── + + def test_score_bar_full(self): + bar = LLMAISmell._score_bar(10) + assert bar == "[██████████]" + + def test_score_bar_empty(self): + bar = LLMAISmell._score_bar(0) + assert bar == "[░░░░░░░░░░]" + + def test_score_bar_half(self): + bar = LLMAISmell._score_bar(5) + assert bar == "[█████░░░░░]" + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "--tb=short"]) From cd56a039c94a27050a5f19893bb0137956cbb8e3 Mon Sep 17 00:00:00 2001 From: Pei Chu Date: Thu, 18 Jun 2026 18:40:43 +0800 Subject: [PATCH 3/9] docs: add usage example for LLMAISmell --- examples/llm_ai_smell_example.py | 137 +++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 examples/llm_ai_smell_example.py diff --git a/examples/llm_ai_smell_example.py b/examples/llm_ai_smell_example.py new file mode 100644 index 00000000..5e882fe6 --- /dev/null +++ b/examples/llm_ai_smell_example.py @@ -0,0 +1,137 @@ +""" +Example: AI Smell Detection for Requirement Documents + +Usage: + python examples/llm_ai_smell_example.py + +This example demonstrates how to use LLMAISmell to detect AI-generated +writing patterns in requirement documents. +""" + +from dingo.exec.local import LocalExecutor +from dingo.config.input_args import InputArgs + +SAMPLE_DOC_HIGH_AI_SMELL = """ +## 智能客服系统需求文档 + +### 一、背景 + +在当今数字化转型的大背景下,随着人工智能技术的不断发展和进步,越来越多的企业 +开始重视智能化客服系统的建设。为了更好地赋能业务发展,提升用户体验,打造闭环的 +客户服务生态,我们提出构建一套高效、智能的客服解决方案,以实现降本增效、价值最大化的战略目标。 + +### 二、核心目标 + +本系统旨在通过底层逻辑的重塑和顶层设计的优化,实现以下战略目标: +- 大幅提升客户满意度,打造行业领先的服务体验 +- 通过全链路智能化改造,彻底革新传统客服模式 +- 赋能一线客服人员,提升整体服务效能,实现降本增效 +- 构建可持续发展的智能客服生态,沉淀核心服务能力 + +### 三、功能需求 + +#### 3.1 智能问答 + +系统应支持智能问答功能,能够准确理解用户意图,提供精准的回答。系统需要确保 +回答的准确性和及时性,以满足用户的迫切需求。界面设计应符合用户使用习惯,提供 +良好的交互体验。系统还应支持多轮对话,能够理解上下文,提供连贯的对话体验。 + +#### 3.2 工单管理 + +系统应具备完善的工单管理功能,支持工单的创建、分配、跟踪和关闭全生命周期管理。 +工单系统需要满足业务需求,支持多种工单类型,确保处理效率和质量。通过对工单数据 +的深度挖掘和分析,为管理决策提供有力支撑,实现数据驱动的精细化运营。 + +### 四、技术要求 + +系统性能应满足业务需求,确保在高并发场景下的稳定运行。系统需要具备良好的扩展性 +和可维护性,以支撑未来业务的快速发展。安全性方面,系统应符合相关法规要求,保护 +用户数据安全。系统架构应采用先进的微服务架构,实现各模块的解耦,提升系统的灵活性 +和可靠性。 + +### 五、总结 + +综上所述,本智能客服系统将通过技术创新和模式变革,为企业创造巨大的商业价值, +提升核心竞争力,助力企业在激烈的市场竞争中脱颖而出,实现可持续发展。 +""" + +SAMPLE_DOC_LOW_AI_SMELL = """ +## 客服工单系统 v2.1 需求文档 + +**作者**: 张三 **日期**: 2024-01-15 **评审状态**: 待评审 + +--- + +### 1. 背景 + +当前客服团队每天处理约 2000 张工单,其中 65% 为重复性问题(退款、发货查询、 +账号问题)。工单平均处理时间 8 分钟,其中 3 分钟用于查历史记录。本项目目标是 +将平均处理时间降至 5 分钟以内。 + +### 2. 功能需求 + +#### 2.1 快速回复模板 + +**需求描述**:客服输入关键词时,系统自动推荐匹配的回复模板。 + +**详细说明**: +- 输入框输入字符后 300ms 内展示建议列表,最多显示 5 条 +- 按相关度排序:完全匹配 > 关键词匹配 > 语义相似 +- 客服选择模板后可编辑再发送,不能直接强制发送 +- 模板库由运营通过后台维护,支持按一级分类(退款/物流/账号/其他)管理 + +**不在范围内**:自动发送、客户端展示建议 + +#### 2.2 历史工单查询 + +**需求描述**:在工单页面可快速查看同一用户的历史工单。 + +**详细说明**: +- 侧边栏展示最近 10 张工单的摘要(时间、分类、处理结果) +- 点击展开查看完整内容 +- 数据来源:工单系统数据库,实时查询,无需缓存 +- 异常情况:用户无历史工单时展示"暂无历史记录",查询超时(>3s)展示错误提示 + +### 3. 非功能需求 + +- 快速回复建议 P95 响应时间 < 500ms(基于当前 500 并发用户) +- 历史工单查询 P99 < 2s +- 暂不考虑国际化 +""" + + +def run_example(): + print("=" * 60) + print("Example 1: High AI Smell Document") + print("=" * 60) + + executor = LocalExecutor( + input_args=InputArgs( + eval_group="llm", + llm_config={ + "model": "gpt-4o", + "key": "YOUR_API_KEY", + "api_base": "https://api.openai.com/v1", + }, + custom_config={ + "llm": ["LLMAISmell"] + } + ) + ) + + # Use executor.eval_text for quick single-text evaluation + # This is a simplified example showing the checker's usage + print("\nDocument snippet (high AI smell):") + print(SAMPLE_DOC_HIGH_AI_SMELL[:200] + "...") + print("\nExpected: AI_SMELL_DETECTED with high scores on adjective_violence and detail_vacuum") + + print("\n" + "=" * 60) + print("Example 2: Low AI Smell Document") + print("=" * 60) + print("\nDocument snippet (low AI smell):") + print(SAMPLE_DOC_LOW_AI_SMELL[:200] + "...") + print("\nExpected: AI_SMELL_CLEAN with low scores across all dimensions") + + +if __name__ == "__main__": + run_example() From a6615736b280c10a0caf4b4f2225c8e00121e79e Mon Sep 17 00:00:00 2001 From: Pei Chu Date: Thu, 18 Jun 2026 18:47:08 +0800 Subject: [PATCH 4/9] fix: resolve flake8 F841 and isort in example file --- examples/llm_ai_smell_example.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/examples/llm_ai_smell_example.py b/examples/llm_ai_smell_example.py index 5e882fe6..dc97365d 100644 --- a/examples/llm_ai_smell_example.py +++ b/examples/llm_ai_smell_example.py @@ -8,8 +8,8 @@ writing patterns in requirement documents. """ -from dingo.exec.local import LocalExecutor from dingo.config.input_args import InputArgs +from dingo.exec.local import LocalExecutor SAMPLE_DOC_HIGH_AI_SMELL = """ ## 智能客服系统需求文档 @@ -105,25 +105,25 @@ def run_example(): print("Example 1: High AI Smell Document") print("=" * 60) - executor = LocalExecutor( - input_args=InputArgs( - eval_group="llm", - llm_config={ - "model": "gpt-4o", - "key": "YOUR_API_KEY", - "api_base": "https://api.openai.com/v1", - }, - custom_config={ - "llm": ["LLMAISmell"] - } - ) + # Configure the executor with LLMAISmell checker. + # Replace YOUR_API_KEY and api_base with your actual LLM credentials. + input_args = InputArgs( + eval_group="llm", + llm_config={ + "model": "gpt-4o", + "key": "YOUR_API_KEY", + "api_base": "https://api.openai.com/v1", + }, + custom_config={"llm": ["LLMAISmell"]}, ) + executor = LocalExecutor(input_args=input_args) - # Use executor.eval_text for quick single-text evaluation - # This is a simplified example showing the checker's usage + # Call executor.eval_text to evaluate a single document string. + # Example: result = executor.eval_text(SAMPLE_DOC_HIGH_AI_SMELL) print("\nDocument snippet (high AI smell):") print(SAMPLE_DOC_HIGH_AI_SMELL[:200] + "...") print("\nExpected: AI_SMELL_DETECTED with high scores on adjective_violence and detail_vacuum") + print(f"\nExecutor ready: {executor.__class__.__name__}") print("\n" + "=" * 60) print("Example 2: Low AI Smell Document") From 2873e40a5e0422e96a32c69c0dfdbd0eb710dcb0 Mon Sep 17 00:00:00 2001 From: Pei Chu Date: Thu, 18 Jun 2026 19:33:40 +0800 Subject: [PATCH 5/9] fix: harden process_response against LLM returning strings/nulls Adopt Gemini code-assist suggestions: - Cast total_score to float() with ValueError/TypeError fallback - Use 'or {}' for dimensions/evidence to handle null values - Use str() for verdict to handle null - Cast per-dimension scores to float() before comparisons - Clamp _score_bar input with int(round()) + max/min guard --- dingo/model/llm/llm_ai_smell.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/dingo/model/llm/llm_ai_smell.py b/dingo/model/llm/llm_ai_smell.py index 4ede21f4..36c84ce1 100644 --- a/dingo/model/llm/llm_ai_smell.py +++ b/dingo/model/llm/llm_ai_smell.py @@ -184,10 +184,13 @@ def process_response(cls, response: str) -> EvalDetail: except json.JSONDecodeError: raise ConvertJsonError(f"Failed to parse AI smell response as JSON: {response[:200]}") - total_score = data.get("total_score", 0) - dimensions = data.get("dimensions", {}) - evidence = data.get("evidence", {}) - verdict = data.get("verdict", "") + try: + total_score = float(data.get("total_score", 0)) + except (ValueError, TypeError): + total_score = 0.0 + dimensions = data.get("dimensions") or {} + evidence = data.get("evidence") or {} + verdict = str(data.get("verdict") or "") # Build human-readable reason dim_labels = { @@ -201,9 +204,13 @@ def process_response(cls, response: str) -> EvalDetail: reason_lines = [f"🤖 AI味总分:{total_score}/10"] reason_lines.append("") for key, label in dim_labels.items(): - score = dimensions.get(key, 0) + raw_score = dimensions.get(key, 0) + try: + score = float(raw_score) + except (ValueError, TypeError): + score = 0.0 example = evidence.get(key, "") - bar = cls._score_bar(score) + bar = cls._score_bar(round(score)) reason_lines.append(f"{label}:{score}/10 {bar}") if example and score >= 5: reason_lines.append(f" └ 例:{example}") @@ -225,6 +232,6 @@ def process_response(cls, response: str) -> EvalDetail: @classmethod def _score_bar(cls, score: int, width: int = 10) -> str: """Generate a simple ASCII progress bar for a 0-10 score.""" - filled = round(score) + filled = max(0, min(width, int(round(score)))) empty = width - filled return f"[{'█' * filled}{'░' * empty}]" From afef054526c20c9667c8e7bb5e97ccffcdd60c69 Mon Sep 17 00:00:00 2001 From: Pei Chu Date: Thu, 18 Jun 2026 19:38:30 +0800 Subject: [PATCH 6/9] fix: address Gemini medium-priority suggestions --- dingo/model/llm/llm_ai_smell.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dingo/model/llm/llm_ai_smell.py b/dingo/model/llm/llm_ai_smell.py index 36c84ce1..e86c9acc 100644 --- a/dingo/model/llm/llm_ai_smell.py +++ b/dingo/model/llm/llm_ai_smell.py @@ -170,7 +170,8 @@ def process_response(cls, response: str) -> EvalDetail: """ Process LLM response and convert to EvalDetail. """ - # Clean markdown code blocks + # Strip leading/trailing whitespace first, then remove markdown code blocks + response = response.strip() if response.startswith("```json"): response = response[7:] elif response.startswith("```"): @@ -181,6 +182,10 @@ def process_response(cls, response: str) -> EvalDetail: try: data = json.loads(response) + if not isinstance(data, dict): + raise ConvertJsonError( + f"Parsed JSON is not a dictionary: {type(data)}" + ) except json.JSONDecodeError: raise ConvertJsonError(f"Failed to parse AI smell response as JSON: {response[:200]}") From 3edf8bad9a7e9826c342a020ada8db88938e3cb4 Mon Sep 17 00:00:00 2001 From: Pei Chu Date: Thu, 18 Jun 2026 19:38:32 +0800 Subject: [PATCH 7/9] fix: address Gemini medium-priority suggestions --- examples/llm_ai_smell_example.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/examples/llm_ai_smell_example.py b/examples/llm_ai_smell_example.py index dc97365d..92f5a85f 100644 --- a/examples/llm_ai_smell_example.py +++ b/examples/llm_ai_smell_example.py @@ -118,13 +118,19 @@ def run_example(): ) executor = LocalExecutor(input_args=input_args) - # Call executor.eval_text to evaluate a single document string. - # Example: result = executor.eval_text(SAMPLE_DOC_HIGH_AI_SMELL) print("\nDocument snippet (high AI smell):") print(SAMPLE_DOC_HIGH_AI_SMELL[:200] + "...") print("\nExpected: AI_SMELL_DETECTED with high scores on adjective_violence and detail_vacuum") print(f"\nExecutor ready: {executor.__class__.__name__}") + # To run the actual evaluation (requires a valid API key configured above): + # try: + # result = executor.eval_text(SAMPLE_DOC_HIGH_AI_SMELL) + # print("\nEvaluation Result:") + # print(result.reason[0]) + # except Exception as e: + # print(f"\nCould not run evaluation: {e}") + print("\n" + "=" * 60) print("Example 2: Low AI Smell Document") print("=" * 60) From 4d0bf9fd938d035cbb57dbaf3ef656ab4f63c601 Mon Sep 17 00:00:00 2001 From: Pei Chu Date: Thu, 18 Jun 2026 19:45:40 +0800 Subject: [PATCH 8/9] fix: format scores as int in reason output, fix test assertion --- dingo/model/llm/llm_ai_smell.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dingo/model/llm/llm_ai_smell.py b/dingo/model/llm/llm_ai_smell.py index e86c9acc..b5bc3bd2 100644 --- a/dingo/model/llm/llm_ai_smell.py +++ b/dingo/model/llm/llm_ai_smell.py @@ -206,7 +206,7 @@ def process_response(cls, response: str) -> EvalDetail: "adjective_violence": "✨ 形容词暴力指数", } - reason_lines = [f"🤖 AI味总分:{total_score}/10"] + reason_lines = [f"🤖 AI味总分:{int(total_score)}/10"] reason_lines.append("") for key, label in dim_labels.items(): raw_score = dimensions.get(key, 0) @@ -216,7 +216,7 @@ def process_response(cls, response: str) -> EvalDetail: score = 0.0 example = evidence.get(key, "") bar = cls._score_bar(round(score)) - reason_lines.append(f"{label}:{score}/10 {bar}") + reason_lines.append(f"{label}:{int(score)}/10 {bar}") if example and score >= 5: reason_lines.append(f" └ 例:{example}") reason_lines.append("") From 1d62008cf90867bc79d93030c99f776ff1134676 Mon Sep 17 00:00:00 2001 From: Pei Chu Date: Thu, 18 Jun 2026 19:45:43 +0800 Subject: [PATCH 9/9] fix: format scores as int in reason output, fix test assertion