diff --git a/dingo/model/llm/llm_ai_smell.py b/dingo/model/llm/llm_ai_smell.py
new file mode 100644
index 00000000..b5bc3bd2
--- /dev/null
+++ b/dingo/model/llm/llm_ai_smell.py
@@ -0,0 +1,318 @@
+"""
+AI Smell Detector for Requirement Documents (需求文档 AI 味检测器)
+
+Detects AI-generated writing patterns in requirement documents across 5 dimensions:
+- 正确的废话指数 (Correct Nonsense Index)
+- 无限镜像感 (Infinite Mirror Index)
+- 彩虹屁密度 (Rainbow Fart Density)
+- 细节真空度 (Detail Vacuum Index)
+- 形容词暴力指数 (Adjective Violence Index)
+"""
+
+import json
+import math
+from typing import Optional
+
+from dingo.io.input import Data, RequiredField
+from dingo.io.output.eval_detail import EvalDetail
+from dingo.model import Model
+from dingo.model.llm.base_openai import BaseOpenAI
+from dingo.utils import log
+from dingo.utils.exception import ConvertJsonError
+
+
+@Model.llm_register("LLMAISmell")
+class LLMAISmell(BaseOpenAI):
+    """
+    AI Smell Detector for requirement documents.
+
+    Evaluates 5 dimensions of AI-generated writing patterns:
+    1. 正确的废话指数 - Hollow truisms ("In today's society...", "With the rapid development of...")
+    2. 无限镜像感 - Repetitive emphasis of the same point in different words
+    3. 彩虹屁密度 - Excessive praise, inflated importance claims
+    4. 细节真空度 - Sounds complete but lacks any actionable specifics
+    5. 形容词暴力指数 - Buzzword overuse (高效/赋能/闭环/生态/颗粒度...)
+
+    Each dimension is scored 0-10. Overall AI smell score is the weighted average.
+    A document whose overall score reaches ``threshold`` is flagged as AI-smelling.
+    """
+
+    _metric_info = {
+        "category": "Document Quality Assessment Metrics",
+        "metric_name": "LLMAISmell",
+        "description": "Detects AI-generated writing patterns in requirement documents across 5 dimensions: hollow truisms, repetition, rainbow farts, detail vacuum, and adjective violence",
+        "examples": "examples/llm_ai_smell_example.py",
+        "evaluation_results": "",
+    }
+
+    _required_fields = [RequiredField.CONTENT]
+
+    # Overall score (0-10) at or above which a document is flagged as AI-smelling
+    threshold = 6
+
+    # Dimension key -> label shown in the report, in display order
+    _dimension_labels = {
+        "correct_nonsense": "💊 正确的废话指数",
+        "infinite_mirror": "🪞 无限镜像感",
+        "rainbow_fart": "🌈 彩虹屁密度",
+        "detail_vacuum": "🧩 细节真空度",
+        "adjective_violence": "✨ 形容词暴力指数",
+    }
+
+    # Fallback weights for deriving the overall score when the model omits a
+    # usable total_score; mirrors the "权重参考" section of the prompt (sums to 1)
+    _dimension_weights = {
+        "detail_vacuum": 0.3,
+        "correct_nonsense": 0.25,
+        "adjective_violence": 0.2,
+        "infinite_mirror": 0.15,
+        "rainbow_fart": 0.1,
+    }
+
+    prompt = """
+# 角色
+你是一位资深需求评审专家,专门识别需求文档中的 AI 代写痕迹。
+
+# 任务
+分析下面的文档,从 5 个维度评估其"AI 味",每个维度打分 0-10:
+
+## 评估维度
+
+### 1. 💊 正确的废话指数(0-10)
+**定义**:用正确但毫无信息量的话填充文档,听起来很有道理但什么都没说。
+**典型表现**:
+- "在当今社会……"、"随着技术的不断发展……"
+- "这对用户体验至关重要"(但没有说为什么或怎么做)
+- "我们需要确保系统的稳定性和可靠性"(没有具体指标)
+- 每段开头都在重述背景
+
+**打分标准**:
+- 0-2:文档直接切入主题,陈述均有实质内容
+- 3-5:有少量套话但不影响整体
+- 6-8:大量空洞表述,信息密度低
+- 9-10:几乎全是废话,读完不知道要做什么
+
+### 2. 🪞 无限镜像感(0-10)
+**定义**:同一个意思用不同的话反复说,制造"内容丰富"的假象。
+**典型表现**:
+- 同一个功能点在不同章节反复描述
+- "提升用户体验" → "优化用户感受" → "改善用户满意度"(三句话说同一件事)
+- 结论和摘要和正文高度重复
+
+**打分标准**:
+- 0-2:每句话都有新信息
+- 3-5:有轻微重复但可接受
+- 6-8:明显感觉在凑字数
+- 9-10:镜中镜,绕来绕去
+
+### 3. 🌈 彩虹屁密度(0-10)
+**定义**:过度拔高重要性、夸大影响、给自己项目过度背书。
+**典型表现**:
+- "这将彻底改变……"、"革命性的……"、"行业领先的……"
+- "大幅提升"但没有数据
+- "用户迫切需要"但没有调研依据
+- 每个功能都是"核心"、"关键"、"重要"
+
+**打分标准**:
+- 0-2:表述客观,有数据支撑
+- 3-5:略有夸张但在合理范围
+- 6-8:随处可见夸大词汇
+- 9-10:每句话都在吹,读起来像广告
+
+### 4. 🧩 细节真空度(0-10)
+**定义**:文档结构完整、格式规范,但缺乏任何可落地的具体信息。
+**典型表现**:
+- "系统应支持多种支付方式"(哪些方式?)
+- "性能要满足用户需求"(什么性能?什么需求?)
+- "界面设计应符合用户习惯"(谁的习惯?什么标准?)
+- 没有数字、没有边界条件、没有异常处理
+
+**打分标准**:
+- 0-2:有具体的数字、接口、用例、边界条件
+- 3-5:部分模糊但核心功能有描述
+- 6-8:大量"应该"、"需要"但没有"怎么做"
+- 9-10:读完完全不知道要开发什么
+
+### 5. ✨ 形容词暴力指数(0-10)
+**定义**:大量堆叠科技/管理类buzzword,用词汇密度掩盖内容空洞。
+**高危词汇**:高效、赋能、闭环、生态、颗粒度、抓手、落地、对齐、拉通、赛道、底层逻辑、顶层设计、数字化转型、智能化、一体化、全链路、沉淀、复用、标准化、降本增效、价值最大化
+**打分标准**:
+- 0-2:用词精准朴素,术语有明确定义
+- 3-5:偶有流行词但不影响理解
+- 6-8:buzzword 密集,读起来像PPT
+- 9-10:去掉这些词文档就空了
+
+---
+
+## 综合 AI 味总分(0-10)
+基于以上 5 个维度的加权综合评估。
+
+**权重参考**:
+- 细节真空度(0.3):最能区分 AI 和人写的
+- 正确的废话指数(0.25)
+- 形容词暴力指数(0.2)
+- 无限镜像感(0.15)
+- 彩虹屁密度(0.1)
+
+---
+
+## 输出格式
+
+请严格按照以下 JSON 格式输出,不要输出任何其他内容:
+
+```json
+{
+  "total_score": <综合AI味总分 0-10>,
+  "dimensions": {
+    "correct_nonsense": <正确的废话指数 0-10>,
+    "infinite_mirror": <无限镜像感 0-10>,
+    "rainbow_fart": <彩虹屁密度 0-10>,
+    "detail_vacuum": <细节真空度 0-10>,
+    "adjective_violence": <形容词暴力指数 0-10>
+  },
+  "evidence": {
+    "correct_nonsense": "<最典型的1-2个例子,直接引用原文>",
+    "infinite_mirror": "<最典型的1-2个例子,直接引用原文>",
+    "rainbow_fart": "<最典型的1-2个例子,直接引用原文>",
+    "detail_vacuum": "<最典型的1-2个例子,直接引用原文>",
+    "adjective_violence": "<最典型的1-2个例子,直接引用原文>"
+  },
+  "verdict": "<一句话总结,不超过50字>"
+}
+```
+
+---
+
+## 待评估文档:
+
+"""
+
+    @classmethod
+    def process_response(cls, response: str) -> EvalDetail:
+        """
+        Convert the raw LLM reply into an EvalDetail.
+
+        Args:
+            response: Raw model output; expected to be the JSON object described
+                in the prompt, optionally wrapped in a Markdown code fence.
+
+        Returns:
+            EvalDetail whose ``status`` is True when the overall score reaches
+            ``cls.threshold``, with ``score`` normalized to 0-1 and a
+            human-readable report in ``reason``.
+
+        Raises:
+            ConvertJsonError: If the reply is not valid JSON or not a JSON object.
+        """
+        data = cls._parse_json_object(response)
+
+        # The model may return null or a wrong type for these sections; treat
+        # them as empty instead of failing on ``.get`` below.
+        dimensions = data.get("dimensions")
+        if not isinstance(dimensions, dict):
+            dimensions = {}
+        evidence = data.get("evidence")
+        if not isinstance(evidence, dict):
+            evidence = {}
+        verdict = str(data.get("verdict") or "")
+
+        # Unusable per-dimension scores count as 0
+        scores = {
+            key: cls._to_score(dimensions.get(key)) or 0.0
+            for key in cls._dimension_labels
+        }
+
+        total_score = cls._to_score(data.get("total_score"))
+        if total_score is None:
+            # Derive the total from the dimensions rather than defaulting to 0,
+            # which would silently mark every such document as clean.
+            log.warning("LLMAISmell: total_score missing or invalid, using weighted average")
+            weighted = sum(
+                scores[key] * weight
+                for key, weight in cls._dimension_weights.items()
+            )
+            total_score = min(10.0, weighted)
+
+        # Build human-readable reason
+        reason_lines = [f"🤖 AI味总分:{cls._format_score(total_score)}/10", ""]
+        for key, label in cls._dimension_labels.items():
+            score = scores[key]
+            bar = cls._score_bar(score)
+            reason_lines.append(f"{label}:{cls._format_score(score)}/10 {bar}")
+            example = evidence.get(key, "")
+            # Only quote evidence for dimensions that actually look problematic
+            if example and score >= 5:
+                reason_lines.append(f" └ 例:{example}")
+        reason_lines.append("")
+        reason_lines.append(f"📝 {verdict}")
+
+        is_ai_smell = total_score >= cls.threshold
+
+        return EvalDetail(
+            metric=cls.__name__,
+            status=is_ai_smell,
+            score=round(total_score / 10, 2),  # normalize to 0-1 for consistency
+            label=["AI_SMELL_DETECTED"] if is_ai_smell else ["AI_SMELL_CLEAN"],
+            reason=["\n".join(reason_lines)],
+        )
+
+    @classmethod
+    def _parse_json_object(cls, response: str) -> dict:
+        """Strip an optional Markdown code fence and parse the reply as a JSON object."""
+        # Strip leading/trailing whitespace first, then remove markdown code blocks
+        text = response.strip()
+        if text.startswith("```json"):
+            text = text[7:]
+        elif text.startswith("```"):
+            text = text[3:]
+        if text.endswith("```"):
+            text = text[:-3]
+        text = text.strip()
+
+        try:
+            data = json.loads(text)
+        except json.JSONDecodeError as exc:
+            # Chain the cause so the original parse position is not lost
+            raise ConvertJsonError(
+                f"Failed to parse AI smell response as JSON: {text[:200]}"
+            ) from exc
+        if not isinstance(data, dict):
+            raise ConvertJsonError(f"Parsed JSON is not a dictionary: {type(data)}")
+        return data
+
+    @staticmethod
+    def _to_score(value) -> Optional[float]:
+        """Coerce a model-supplied score to a float clamped to 0-10, or None if unusable."""
+        # bool is an int subclass; JSON true/false is not a meaningful score
+        if isinstance(value, bool):
+            return None
+        try:
+            score = float(value)
+        except (ValueError, TypeError, OverflowError):
+            return None
+        # json.loads accepts NaN, which would break rounding and comparisons
+        if math.isnan(score):
+            return None
+        return max(0.0, min(10.0, score))
+
+    @staticmethod
+    def _format_score(score: float) -> str:
+        """Render a score with at most one decimal place, dropping a trailing ``.0``."""
+        return f"{round(score, 1):g}"
+
+    @classmethod
+    def _score_bar(cls, score: float, width: int = 10) -> str:
+        """
+        Render a 0-10 score as a fixed-width text progress bar.
+
+        Args:
+            score: Score on the 0-10 scale; out-of-range values are clamped.
+            width: Number of cells in the bar.
+
+        Returns:
+            A string such as ``[█████░░░░░]`` containing ``width`` cells.
+        """
+        # Scale to the bar width so that a score of 10 always fills the bar
+        filled = max(0, min(width, int(round(score * width / 10))))
+        empty = width - filled
+        return f"[{'█' * filled}{'░' * empty}]"
diff --git a/examples/llm_ai_smell_example.py b/examples/llm_ai_smell_example.py
new file mode 100644
index 00000000..92f5a85f
--- /dev/null
+++ b/examples/llm_ai_smell_example.py
@@ -0,0 +1,144 @@
+"""
+Example: AI Smell Detection for Requirement Documents
+
+Usage:
+    python examples/llm_ai_smell_example.py
+
+This example demonstrates how to use LLMAISmell to detect AI-generated
+writing patterns in requirement documents.
+"""
+
+from dingo.config.input_args import InputArgs
+from dingo.exec.local import LocalExecutor
+
+SAMPLE_DOC_HIGH_AI_SMELL = """
+## 智能客服系统需求文档
+
+### 一、背景
+
+在当今数字化转型的大背景下,随着人工智能技术的不断发展和进步,越来越多的企业
+开始重视智能化客服系统的建设。为了更好地赋能业务发展,提升用户体验,打造闭环的
+客户服务生态,我们提出构建一套高效、智能的客服解决方案,以实现降本增效、价值最大化的战略目标。
+
+### 二、核心目标
+
+本系统旨在通过底层逻辑的重塑和顶层设计的优化,实现以下战略目标:
+- 大幅提升客户满意度,打造行业领先的服务体验
+- 通过全链路智能化改造,彻底革新传统客服模式
+- 赋能一线客服人员,提升整体服务效能,实现降本增效
+- 构建可持续发展的智能客服生态,沉淀核心服务能力
+
+### 三、功能需求
+
+#### 3.1 智能问答
+
+系统应支持智能问答功能,能够准确理解用户意图,提供精准的回答。系统需要确保
+回答的准确性和及时性,以满足用户的迫切需求。界面设计应符合用户使用习惯,提供
+良好的交互体验。系统还应支持多轮对话,能够理解上下文,提供连贯的对话体验。
+
+#### 3.2 工单管理
+
+系统应具备完善的工单管理功能,支持工单的创建、分配、跟踪和关闭全生命周期管理。
+工单系统需要满足业务需求,支持多种工单类型,确保处理效率和质量。通过对工单数据
+的深度挖掘和分析,为管理决策提供有力支撑,实现数据驱动的精细化运营。
+
+### 四、技术要求
+
+系统性能应满足业务需求,确保在高并发场景下的稳定运行。系统需要具备良好的扩展性
+和可维护性,以支撑未来业务的快速发展。安全性方面,系统应符合相关法规要求,保护
+用户数据安全。系统架构应采用先进的微服务架构,实现各模块的解耦,提升系统的灵活性
+和可靠性。
+
+### 五、总结
+
+综上所述,本智能客服系统将通过技术创新和模式变革,为企业创造巨大的商业价值,
+提升核心竞争力,助力企业在激烈的市场竞争中脱颖而出,实现可持续发展。
+"""
+
+SAMPLE_DOC_LOW_AI_SMELL = """
+## 客服工单系统 v2.1 需求文档
+
+**作者**: 张三 **日期**: 2024-01-15 **评审状态**: 待评审
+
+---
+
+### 1. 背景
+
+当前客服团队每天处理约 2000 张工单,其中 65% 为重复性问题(退款、发货查询、
+账号问题)。工单平均处理时间 8 分钟,其中 3 分钟用于查历史记录。本项目目标是
+将平均处理时间降至 5 分钟以内。
+
+### 2. 功能需求
+
+#### 2.1 快速回复模板
+
+**需求描述**:客服输入关键词时,系统自动推荐匹配的回复模板。
+
+**详细说明**:
+- 输入框输入字符后 300ms 内展示建议列表,最多显示 5 条
+- 按相关度排序:完全匹配 > 关键词匹配 > 语义相似
+- 客服选择模板后可编辑再发送,不能直接强制发送
+- 模板库由运营通过后台维护,支持按一级分类(退款/物流/账号/其他)管理
+
+**不在范围内**:自动发送、客户端展示建议
+
+#### 2.2 历史工单查询
+
+**需求描述**:在工单页面可快速查看同一用户的历史工单。
+
+**详细说明**:
+- 侧边栏展示最近 10 张工单的摘要(时间、分类、处理结果)
+- 点击展开查看完整内容
+- 数据来源:工单系统数据库,实时查询,无需缓存
+- 异常情况:用户无历史工单时展示"暂无历史记录",查询超时(>3s)展示错误提示
+
+### 3. 非功能需求
+
+- 快速回复建议 P95 响应时间 < 500ms(基于当前 500 并发用户)
+- 历史工单查询 P99 < 2s
+- 暂不考虑国际化
+"""
+
+
+def run_example():
+    """Print a snippet of each sample document and build (but do not run) the executor."""
+    print("=" * 60)
+    print("Example 1: High AI Smell Document")
+    print("=" * 60)
+
+    # Configure the executor with LLMAISmell checker.
+    # Replace YOUR_API_KEY and api_base with your actual LLM credentials.
+    input_args = InputArgs(
+        eval_group="llm",
+        llm_config={
+            "model": "gpt-4o",
+            "key": "YOUR_API_KEY",
+            "api_base": "https://api.openai.com/v1",
+        },
+        custom_config={"llm": ["LLMAISmell"]},
+    )
+    executor = LocalExecutor(input_args=input_args)
+
+    print("\nDocument snippet (high AI smell):")
+    print(SAMPLE_DOC_HIGH_AI_SMELL[:200] + "...")
+    print("\nExpected: AI_SMELL_DETECTED with high scores on adjective_violence and detail_vacuum")
+    print(f"\nExecutor ready: {executor.__class__.__name__}")
+
+    # To run the actual evaluation (requires a valid API key configured above):
+    # try:
+    #     result = executor.eval_text(SAMPLE_DOC_HIGH_AI_SMELL)
+    #     print("\nEvaluation Result:")
+    #     print(result.reason[0])
+    # except Exception as e:
+    #     print(f"\nCould not run evaluation: {e}")
+
+    print("\n" + "=" * 60)
+    print("Example 2: Low AI Smell Document")
+    print("=" * 60)
+    print("\nDocument snippet (low AI smell):")
+    print(SAMPLE_DOC_LOW_AI_SMELL[:200] + "...")
+    print("\nExpected: AI_SMELL_CLEAN with low scores across all dimensions")
+
+
+if __name__ == "__main__":
+    run_example()
diff --git a/test/scripts/model/llm/test_llm_ai_smell.py b/test/scripts/model/llm/test_llm_ai_smell.py
new file mode 100644
index 00000000..d6ac0f31
--- /dev/null
+++ b/test/scripts/model/llm/test_llm_ai_smell.py
@@ -0,0 +1,292 @@
+"""
+Unit tests for LLMAISmell - AI Smell Detector for Requirement Documents
+"""
+import json
+
+import pytest
+
+from dingo.model.llm.llm_ai_smell import LLMAISmell
+
+
+class TestLLMAISmell:
+    """Tests for the AI smell detection checker."""
+
+    def _make_response(
+        self,
+        total_score=3,
+        correct_nonsense=2,
+        infinite_mirror=3,
+        rainbow_fart=2,
+        detail_vacuum=4,
+        adjective_violence=2,
+        verdict="文档有一定 AI 味但仍可接受",
+        evidence=None,
+    ):
+        if evidence is None:
+            evidence = {
+                "correct_nonsense": "",
+                "infinite_mirror": "",
+                "rainbow_fart": "",
+                "detail_vacuum": "",
+                "adjective_violence": "",
+            }
+        return json.dumps(
+            {
+                "total_score": total_score,
+                "dimensions": {
+                    "correct_nonsense": correct_nonsense,
+                    "infinite_mirror": infinite_mirror,
+                    "rainbow_fart": rainbow_fart,
+                    "detail_vacuum": detail_vacuum,
+                    "adjective_violence": adjective_violence,
+                },
+                "evidence": evidence,
+                "verdict": verdict,
+            },
+            ensure_ascii=False,
+        )
+
+    # ──────────────────────────────────────────────
+    # Basic pass / fail logic
+    # ──────────────────────────────────────────────
+
+    def test_clean_document_not_flagged(self):
+        """Low-scoring document should NOT be flagged as AI smell."""
+        response = self._make_response(total_score=3, verdict="文档整体朴实,无明显 AI 味")
+        result = LLMAISmell.process_response(response)
+
+        assert result.status is False
+        assert result.label == ["AI_SMELL_CLEAN"]
+        assert result.metric == "LLMAISmell"
+
+    def test_ai_smell_document_flagged(self):
+        """High-scoring document SHOULD be flagged as AI smell."""
+        response = self._make_response(
+            total_score=8,
+            correct_nonsense=8,
+            detail_vacuum=9,
+            adjective_violence=8,
+            verdict="典型 AI 代写,大量废话和 buzzword,缺乏可落地细节",
+            evidence={
+                "correct_nonsense": "在当今数字化转型的大背景下……",
+                "infinite_mirror": "",
+                "rainbow_fart": "彻底革新传统模式",
+                "detail_vacuum": "系统性能应满足业务需求",
+                "adjective_violence": "赋能、闭环、降本增效、颗粒度",
+            },
+        )
+        result = LLMAISmell.process_response(response)
+
+        assert result.status is True
+        assert result.label == ["AI_SMELL_DETECTED"]
+
+    def test_threshold_boundary_exactly_at_threshold(self):
+        """Score exactly at threshold (6) should be flagged."""
+        response = self._make_response(total_score=LLMAISmell.threshold)
+        result = LLMAISmell.process_response(response)
+
+        assert result.status is True
+        assert result.label == ["AI_SMELL_DETECTED"]
+
+    def test_threshold_boundary_just_below(self):
+        """Score just below threshold (5) should NOT be flagged."""
+        response = self._make_response(total_score=LLMAISmell.threshold - 1)
+        result = LLMAISmell.process_response(response)
+
+        assert result.status is False
+        assert result.label == ["AI_SMELL_CLEAN"]
+
+    # ──────────────────────────────────────────────
+    # Score normalization
+    # ──────────────────────────────────────────────
+
+    def test_score_normalized_to_zero_one(self):
+        """score field should be in [0, 1] range."""
+        for raw in [0, 5, 10]:
+            response = self._make_response(total_score=raw)
+            result = LLMAISmell.process_response(response)
+            assert 0.0 <= result.score <= 1.0, f"score out of range for raw={raw}"
+
+    def test_score_value_correct(self):
+        """score = total_score / 10, rounded to 2 decimals."""
+        response = self._make_response(total_score=7)
+        result = LLMAISmell.process_response(response)
+        assert result.score == pytest.approx(0.70, abs=1e-9)
+
+    def test_out_of_range_total_score_clamped(self):
+        """A total above 10 must not push the normalized score past 1."""
+        response = self._make_response(total_score=15)
+        result = LLMAISmell.process_response(response)
+        assert result.score == pytest.approx(1.0)
+        assert "10/10" in result.reason[0]
+
+    def test_missing_total_score_falls_back_to_weighted_average(self):
+        """Without total_score the weighted dimension average is used."""
+        payload = json.loads(
+            self._make_response(
+                correct_nonsense=8,
+                infinite_mirror=8,
+                rainbow_fart=8,
+                detail_vacuum=8,
+                adjective_violence=8,
+            )
+        )
+        del payload["total_score"]
+        result = LLMAISmell.process_response(json.dumps(payload, ensure_ascii=False))
+
+        assert result.status is True
+        assert result.score == pytest.approx(0.8)
+
+    # ──────────────────────────────────────────────
+    # Reason string content
+    # ──────────────────────────────────────────────
+
+    def test_reason_contains_total_score(self):
+        """Reason should display the total AI smell score."""
+        response = self._make_response(total_score=7)
+        result = LLMAISmell.process_response(response)
+        assert "7/10" in result.reason[0]
+
+    def test_fractional_total_score_not_truncated(self):
+        """A fractional total should be shown as-is, not truncated to an int."""
+        response = self._make_response(total_score=7.5)
+        result = LLMAISmell.process_response(response)
+        assert "7.5/10" in result.reason[0]
+
+    def test_reason_contains_all_dimension_labels(self):
+        """Reason should list all 5 dimension labels."""
+        response = self._make_response()
+        result = LLMAISmell.process_response(response)
+        reason = result.reason[0]
+
+        assert "💊 正确的废话指数" in reason
+        assert "🪞 无限镜像感" in reason
+        assert "🌈 彩虹屁密度" in reason
+        assert "🧩 细节真空度" in reason
+        assert "✨ 形容词暴力指数" in reason
+
+    def test_reason_contains_verdict(self):
+        """Reason should contain the verdict string."""
+        verdict = "这是一份高度 AI 味的文档,建议重写"
+        response = self._make_response(total_score=8, verdict=verdict)
+        result = LLMAISmell.process_response(response)
+        assert verdict in result.reason[0]
+
+    def test_evidence_shown_for_high_scores(self):
+        """Evidence should appear in reason for dimensions scoring >= 5."""
+        evidence_text = "在当今社会,随着技术不断发展……"
+        response = self._make_response(
+            total_score=7,
+            correct_nonsense=6,
+            evidence={
+                "correct_nonsense": evidence_text,
+                "infinite_mirror": "",
+                "rainbow_fart": "",
+                "detail_vacuum": "",
+                "adjective_violence": "",
+            },
+        )
+        result = LLMAISmell.process_response(response)
+        assert evidence_text in result.reason[0]
+
+    def test_evidence_hidden_for_low_scores(self):
+        """Evidence should NOT appear in reason for dimensions scoring < 5."""
+        evidence_text = "某个不应出现的例句"
+        response = self._make_response(
+            total_score=3,
+            correct_nonsense=2,
+            evidence={
+                "correct_nonsense": evidence_text,
+                "infinite_mirror": "",
+                "rainbow_fart": "",
+                "detail_vacuum": "",
+                "adjective_violence": "",
+            },
+        )
+        result = LLMAISmell.process_response(response)
+        assert evidence_text not in result.reason[0]
+
+    # ──────────────────────────────────────────────
+    # Markdown cleanup
+    # ──────────────────────────────────────────────
+
+    def test_markdown_json_code_block_stripped(self):
+        """LLM often wraps JSON in ```json ... ``` — should be handled."""
+        inner = self._make_response(total_score=4)
+        wrapped = f"```json\n{inner}\n```"
+        result = LLMAISmell.process_response(wrapped)
+        assert result.label == ["AI_SMELL_CLEAN"]
+
+    def test_plain_code_block_stripped(self):
+        """Plain ``` ... ``` wrapper should also be stripped."""
+        inner = self._make_response(total_score=7)
+        wrapped = f"```\n{inner}\n```"
+        result = LLMAISmell.process_response(wrapped)
+        assert result.label == ["AI_SMELL_DETECTED"]
+
+    # ──────────────────────────────────────────────
+    # Error handling
+    # ──────────────────────────────────────────────
+
+    def test_invalid_json_raises_convert_error(self):
+        """Garbage response should raise ConvertJsonError."""
+        from dingo.utils.exception import ConvertJsonError
+
+        with pytest.raises(ConvertJsonError):
+            LLMAISmell.process_response("This is not JSON at all")
+
+    def test_non_object_json_raises_convert_error(self):
+        """Valid JSON that is not an object should raise ConvertJsonError."""
+        from dingo.utils.exception import ConvertJsonError
+
+        with pytest.raises(ConvertJsonError):
+            LLMAISmell.process_response("[1, 2, 3]")
+
+    def test_malformed_sections_do_not_crash(self):
+        """Non-dict dimensions/evidence are treated as missing."""
+        response = json.dumps(
+            {"total_score": 7, "dimensions": [1, 2], "evidence": "n/a", "verdict": None}
+        )
+        result = LLMAISmell.process_response(response)
+        assert result.label == ["AI_SMELL_DETECTED"]
+
+    # ──────────────────────────────────────────────
+    # Metadata
+    # ──────────────────────────────────────────────
+
+    def test_metric_name_matches_class(self):
+        """EvalDetail.metric should be the class name."""
+        response = self._make_response()
+        result = LLMAISmell.process_response(response)
+        assert result.metric == "LLMAISmell"
+
+    def test_required_fields(self):
+        """Checker should only require CONTENT (no prompt/context needed)."""
+        from dingo.io.input import RequiredField
+
+        assert RequiredField.CONTENT in LLMAISmell._required_fields
+        assert len(LLMAISmell._required_fields) == 1
+
+    # ──────────────────────────────────────────────
+    # Score bar helper
+    # ──────────────────────────────────────────────
+
+    def test_score_bar_full(self):
+        bar = LLMAISmell._score_bar(10)
+        assert bar == "[██████████]"
+
+    def test_score_bar_empty(self):
+        bar = LLMAISmell._score_bar(0)
+        assert bar == "[░░░░░░░░░░]"
+
+    def test_score_bar_half(self):
+        bar = LLMAISmell._score_bar(5)
+        assert bar == "[█████░░░░░]"
+
+    def test_score_bar_scales_to_width(self):
+        bar = LLMAISmell._score_bar(10, width=5)
+        assert bar == "[█████]"
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v", "--tb=short"])