Skip to content
242 changes: 242 additions & 0 deletions dingo/model/llm/llm_ai_smell.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
"""
AI Smell Detector for Requirement Documents (需求文档 AI 味检测器)

Detects AI-generated writing patterns in requirement documents across 5 dimensions:
- 正确的废话指数 (Correct Nonsense Index)
- 无限镜像感 (Infinite Mirror Index)
- 彩虹屁密度 (Rainbow Fart Density)
- 细节真空度 (Detail Vacuum Index)
- 形容词暴力指数 (Adjective Violence Index)
"""

import json

from dingo.io.input import Data, RequiredField
from dingo.io.output.eval_detail import EvalDetail
from dingo.model import Model
from dingo.model.llm.base_openai import BaseOpenAI
from dingo.utils import log
from dingo.utils.exception import ConvertJsonError


@Model.llm_register("LLMAISmell")
class LLMAISmell(BaseOpenAI):
"""
AI Smell Detector for requirement documents.

Evaluates 5 dimensions of AI-generated writing patterns:
1. 正确的废话指数 - Hollow truisms ("In today's society...", "With the rapid development of...")
2. 无限镜像感 - Repetitive emphasis of the same point in different words
3. 彩虹屁密度 - Excessive praise, inflated importance claims
4. 细节真空度 - Sounds complete but lacks any actionable specifics
5. 形容词暴力指数 - Buzzword overuse (高效/赋能/闭环/生态/颗粒度...)

Each dimension is scored 0-10. Overall AI smell score is the weighted average.
"""

# Descriptive metadata for this metric: category, registered name, a one-line
# description, an example path and an (empty) evaluation-results field.
# NOTE(review): "examples" points at examples/llm_and_rule/llm_local.py rather
# than a dedicated AI-smell example script - confirm this is the intended path.
_metric_info = {
"category": "Document Quality Assessment Metrics",
"metric_name": "LLMAISmell",
"description": "Detects AI-generated writing patterns in requirement documents across 5 dimensions: hollow truisms, repetition, rainbow farts, detail vacuum, and adjective violence",
"examples": "examples/llm_and_rule/llm_local.py",
"evaluation_results": "",
}

# Input fields this metric declares as required: only the document content.
_required_fields = [RequiredField.CONTENT]

# Cut-off on the 0-10 total score. process_response flags a document as
# AI-smelling when total_score >= threshold, i.e. a total of exactly 6 is
# already flagged.
threshold = 6

prompt = """
# 角色
你是一位资深需求评审专家,专门识别需求文档中的 AI 代写痕迹。

# 任务
分析下面的文档,从 5 个维度评估其"AI 味",每个维度打分 0-10:

## 评估维度

### 1. 💊 正确的废话指数(0-10)
**定义**:用正确但毫无信息量的话填充文档,听起来很有道理但什么都没说。
**典型表现**:
- "在当今社会……"、"随着技术的不断发展……"
- "这对用户体验至关重要"(但没有说为什么或怎么做)
- "我们需要确保系统的稳定性和可靠性"(没有具体指标)
- 每段开头都在重述背景

**打分标准**:
- 0-2:文档直接切入主题,陈述均有实质内容
- 3-5:有少量套话但不影响整体
- 6-8:大量空洞表述,信息密度低
- 9-10:几乎全是废话,读完不知道要做什么

### 2. 🪞 无限镜像感(0-10)
**定义**:同一个意思用不同的话反复说,制造"内容丰富"的假象。
**典型表现**:
- 同一个功能点在不同章节反复描述
- "提升用户体验" → "优化用户感受" → "改善用户满意度"(三句话说同一件事)
- 结论和摘要和正文高度重复

**打分标准**:
- 0-2:每句话都有新信息
- 3-5:有轻微重复但可接受
- 6-8:明显感觉在凑字数
- 9-10:镜中镜,绕来绕去

### 3. 🌈 彩虹屁密度(0-10)
**定义**:过度拔高重要性、夸大影响、给自己项目过度背书。
**典型表现**:
- "这将彻底改变……"、"革命性的……"、"行业领先的……"
- "大幅提升"但没有数据
- "用户迫切需要"但没有调研依据
- 每个功能都是"核心"、"关键"、"重要"

**打分标准**:
- 0-2:表述客观,有数据支撑
- 3-5:略有夸张但在合理范围
- 6-8:随处可见夸大词汇
- 9-10:每句话都在吹,读起来像广告

### 4. 🧩 细节真空度(0-10)
**定义**:文档结构完整、格式规范,但缺乏任何可落地的具体信息。
**典型表现**:
- "系统应支持多种支付方式"(哪些方式?)
- "性能要满足用户需求"(什么性能?什么需求?)
- "界面设计应符合用户习惯"(谁的习惯?什么标准?)
- 没有数字、没有边界条件、没有异常处理

**打分标准**:
- 0-2:有具体的数字、接口、用例、边界条件
- 3-5:部分模糊但核心功能有描述
- 6-8:大量"应该"、"需要"但没有"怎么做"
- 9-10:读完完全不知道要开发什么

### 5. ✨ 形容词暴力指数(0-10)
**定义**:大量堆叠科技/管理类buzzword,用词汇密度掩盖内容空洞。
**高危词汇**:高效、赋能、闭环、生态、颗粒度、抓手、落地、对齐、拉通、赛道、底层逻辑、顶层设计、数字化转型、智能化、一体化、全链路、沉淀、复用、标准化、降本增效、价值最大化
**打分标准**:
- 0-2:用词精准朴素,术语有明确定义
- 3-5:偶有流行词但不影响理解
- 6-8:buzzword 密集,读起来像PPT
- 9-10:去掉这些词文档就空了

---

## 综合 AI 味总分(0-10)
基于以上 5 个维度的加权综合评估。

**权重参考**:
- 细节真空度(0.3):最能区分 AI 和人写的
- 正确的废话指数(0.25)
- 形容词暴力指数(0.2)
- 无限镜像感(0.15)
- 彩虹屁密度(0.1)

---

## 输出格式

请严格按照以下 JSON 格式输出,不要输出任何其他内容:

```json
{
"total_score": <综合AI味总分 0-10>,
"dimensions": {
"correct_nonsense": <正确的废话指数 0-10>,
"infinite_mirror": <无限镜像感 0-10>,
"rainbow_fart": <彩虹屁密度 0-10>,
"detail_vacuum": <细节真空度 0-10>,
"adjective_violence": <形容词暴力指数 0-10>
},
"evidence": {
"correct_nonsense": "<最典型的1-2个例子,直接引用原文>",
"infinite_mirror": "<最典型的1-2个例子,直接引用原文>",
"rainbow_fart": "<最典型的1-2个例子,直接引用原文>",
"detail_vacuum": "<最典型的1-2个例子,直接引用原文>",
"adjective_violence": "<最典型的1-2个例子,直接引用原文>"
},
"verdict": "<一句话总结,不超过50字>"
}
```

---

## 待评估文档:

"""

@classmethod
def process_response(cls, response: str) -> EvalDetail:
    """
    Convert the raw LLM reply into an ``EvalDetail``.

    The reply is expected to be the JSON object requested by ``prompt``,
    optionally wrapped in a Markdown code fence. Scores that are missing,
    non-numeric or non-finite are treated as 0, and every score is clamped
    to the 0-10 range before it is reported.

    Args:
        response: Raw text returned by the model.

    Returns:
        An ``EvalDetail`` whose ``status`` is True when the total score is
        at or above ``cls.threshold``, whose ``score`` is the total score
        divided by 10 (rounded to 2 decimals), whose ``label`` is
        ``AI_SMELL_DETECTED`` or ``AI_SMELL_CLEAN``, and whose ``reason``
        holds a single multi-line, human-readable report.

    Raises:
        ConvertJsonError: If the reply is not valid JSON, or is valid JSON
            that is not an object.
    """
    import math

    def _coerce_score(raw) -> float:
        """Return ``raw`` as a float within 0-10; unusable values become 0.0."""
        try:
            value = float(raw)
        except (ValueError, TypeError, OverflowError):
            return 0.0
        if not math.isfinite(value):
            # json.loads accepts NaN / Infinity literals; left alone they
            # would crash the int()/round() based formatting below.
            return 0.0
        return min(10.0, max(0.0, value))

    # Strip surrounding whitespace *before* looking for the fence: a reply
    # that starts with a newline would otherwise hide the leading ```json.
    text = response.strip()
    if text.startswith("```json"):
        text = text[7:]
    elif text.startswith("```"):
        text = text[3:]
    if text.endswith("```"):
        text = text[:-3]
    text = text.strip()

    try:
        data = json.loads(text)
    except json.JSONDecodeError as err:
        raise ConvertJsonError(
            f"Failed to parse AI smell response as JSON: {text[:200]}"
        ) from err
    # Valid JSON that is a list/string/number would make the .get() calls
    # below fail with AttributeError, so reject it explicitly.
    if not isinstance(data, dict):
        raise ConvertJsonError(
            f"Parsed JSON is not a dictionary: {type(data)}"
        )

    total_score = _coerce_score(data.get("total_score", 0))

    # The nested sections must be mappings too; anything else (null, list,
    # string) is treated as "no data" instead of crashing on .get().
    dimensions = data.get("dimensions")
    if not isinstance(dimensions, dict):
        dimensions = {}
    evidence = data.get("evidence")
    if not isinstance(evidence, dict):
        evidence = {}
    verdict = str(data.get("verdict") or "")

    # Build human-readable reason
    dim_labels = {
        "correct_nonsense": "💊 正确的废话指数",
        "infinite_mirror": "🪞 无限镜像感",
        "rainbow_fart": "🌈 彩虹屁密度",
        "detail_vacuum": "🧩 细节真空度",
        "adjective_violence": "✨ 形容词暴力指数",
    }

    # ":g" prints whole numbers without a trailing ".0" but keeps fractional
    # scores, so the printed number never disagrees with the rounded bar or
    # with the threshold decision (int() used to turn 5.9 into "5").
    reason_lines = [f"🤖 AI味总分:{total_score:g}/10", ""]
    for key, label in dim_labels.items():
        score = _coerce_score(dimensions.get(key, 0))
        example = evidence.get(key, "")
        bar = cls._score_bar(round(score))
        reason_lines.append(f"{label}:{score:g}/10 {bar}")
        # Only quote evidence for dimensions that actually scored high.
        if example and score >= 5:
            reason_lines.append(f" └ 例:{example}")
    reason_lines.append("")
    reason_lines.append(f"📝 {verdict}")

    is_ai_smell = total_score >= cls.threshold

    result = EvalDetail(
        metric=cls.__name__,
        status=is_ai_smell,
        score=round(total_score / 10, 2),  # normalize to 0-1 for consistency
        label=["AI_SMELL_DETECTED"] if is_ai_smell else ["AI_SMELL_CLEAN"],
        reason=["\n".join(reason_lines)],
    )

    return result

@classmethod
def _score_bar(cls, score: float, width: int = 10) -> str:
    """Render a fixed-width text progress bar for a score.

    Args:
        score: Value to display; it is clamped to ``[0, width]`` and then
            rounded to the nearest whole cell. NaN renders as an empty bar,
            positive/negative infinity as a full/empty bar.
        width: Total number of cells in the bar (default 10).

    Returns:
        A string such as ``[███░░░░░░░]`` with ``width`` cells between the
        brackets.
    """
    import math

    if math.isnan(score):
        # round(nan) raises ValueError; show "no score" as an empty bar.
        filled = 0
    else:
        # Clamp before rounding so +/-inf never reaches round(), which
        # would raise OverflowError.
        filled = max(0, int(round(min(width, max(0, score)))))
    empty = width - filled
    return f"[{'█' * filled}{'░' * empty}]"
Comment on lines +238 to +242

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

If the LLM returns a score outside the expected 0-10 range (e.g., negative or greater than 10), _score_bar can produce malformed progress bars or raise errors. Clamping the score to [0, width] ensures the progress bar is always rendered correctly.

Suggested change
def _score_bar(cls, score: int, width: int = 10) -> str:
"""Generate a simple ASCII progress bar for a 0-10 score."""
filled = round(score)
empty = width - filled
return f"[{'█' * filled}{'░' * empty}]"
@classmethod
def _score_bar(cls, score: float, width: int = 10) -> str:
"""Generate a simple ASCII progress bar for a 0-10 score."""
filled = max(0, min(width, round(score)))
empty = width - filled
return f"[{'█' * filled}{'░' * empty}]"

143 changes: 143 additions & 0 deletions examples/llm_ai_smell_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
"""
Example: AI Smell Detection for Requirement Documents

Usage:
python examples/llm_ai_smell_example.py

This example demonstrates how to use LLMAISmell to detect AI-generated
writing patterns in requirement documents.
"""

from dingo.config.input_args import InputArgs
from dingo.exec.local import LocalExecutor

SAMPLE_DOC_HIGH_AI_SMELL = """
## 智能客服系统需求文档

### 一、背景

在当今数字化转型的大背景下,随着人工智能技术的不断发展和进步,越来越多的企业
开始重视智能化客服系统的建设。为了更好地赋能业务发展,提升用户体验,打造闭环的
客户服务生态,我们提出构建一套高效、智能的客服解决方案,以实现降本增效、价值最大化的战略目标。

### 二、核心目标

本系统旨在通过底层逻辑的重塑和顶层设计的优化,实现以下战略目标:
- 大幅提升客户满意度,打造行业领先的服务体验
- 通过全链路智能化改造,彻底革新传统客服模式
- 赋能一线客服人员,提升整体服务效能,实现降本增效
- 构建可持续发展的智能客服生态,沉淀核心服务能力

### 三、功能需求

#### 3.1 智能问答

系统应支持智能问答功能,能够准确理解用户意图,提供精准的回答。系统需要确保
回答的准确性和及时性,以满足用户的迫切需求。界面设计应符合用户使用习惯,提供
良好的交互体验。系统还应支持多轮对话,能够理解上下文,提供连贯的对话体验。

#### 3.2 工单管理

系统应具备完善的工单管理功能,支持工单的创建、分配、跟踪和关闭全生命周期管理。
工单系统需要满足业务需求,支持多种工单类型,确保处理效率和质量。通过对工单数据
的深度挖掘和分析,为管理决策提供有力支撑,实现数据驱动的精细化运营。

### 四、技术要求

系统性能应满足业务需求,确保在高并发场景下的稳定运行。系统需要具备良好的扩展性
和可维护性,以支撑未来业务的快速发展。安全性方面,系统应符合相关法规要求,保护
用户数据安全。系统架构应采用先进的微服务架构,实现各模块的解耦,提升系统的灵活性
和可靠性。

### 五、总结

综上所述,本智能客服系统将通过技术创新和模式变革,为企业创造巨大的商业价值,
提升核心竞争力,助力企业在激烈的市场竞争中脱颖而出,实现可持续发展。
"""

SAMPLE_DOC_LOW_AI_SMELL = """
## 客服工单系统 v2.1 需求文档

**作者**: 张三 **日期**: 2024-01-15 **评审状态**: 待评审

---

### 1. 背景

当前客服团队每天处理约 2000 张工单,其中 65% 为重复性问题(退款、发货查询、
账号问题)。工单平均处理时间 8 分钟,其中 3 分钟用于查历史记录。本项目目标是
将平均处理时间降至 5 分钟以内。

### 2. 功能需求

#### 2.1 快速回复模板

**需求描述**:客服输入关键词时,系统自动推荐匹配的回复模板。

**详细说明**:
- 输入框输入字符后 300ms 内展示建议列表,最多显示 5 条
- 按相关度排序:完全匹配 > 关键词匹配 > 语义相似
- 客服选择模板后可编辑再发送,不能直接强制发送
- 模板库由运营通过后台维护,支持按一级分类(退款/物流/账号/其他)管理

**不在范围内**:自动发送、客户端展示建议

#### 2.2 历史工单查询

**需求描述**:在工单页面可快速查看同一用户的历史工单。

**详细说明**:
- 侧边栏展示最近 10 张工单的摘要(时间、分类、处理结果)
- 点击展开查看完整内容
- 数据来源:工单系统数据库,实时查询,无需缓存
- 异常情况:用户无历史工单时展示"暂无历史记录",查询超时(>3s)展示错误提示

### 3. 非功能需求

- 快速回复建议 P95 响应时间 < 500ms(基于当前 500 并发用户)
- 历史工单查询 P99 < 2s
- 暂不考虑国际化
"""


def run_example():
    """Print a dry-run walkthrough of the two sample documents.

    Builds a ``LocalExecutor`` configured for the ``LLMAISmell`` checker and,
    for each sample, prints a 200-character preview together with the label
    the checker is expected to produce. No evaluation is actually executed
    here; the snippet that would do so is kept as a comment because it needs
    real credentials.
    """
    rule = "=" * 60

    print(rule)
    print("Example 1: High AI Smell Document")
    print(rule)

    # Placeholder credentials: substitute a real key and endpoint before
    # attempting an actual evaluation.
    llm_settings = {
        "model": "gpt-4o",
        "key": "YOUR_API_KEY",
        "api_base": "https://api.openai.com/v1",
    }
    input_args = InputArgs(
        eval_group="llm",
        llm_config=llm_settings,
        custom_config={"llm": ["LLMAISmell"]},
    )
    executor = LocalExecutor(input_args=input_args)

    high_preview = SAMPLE_DOC_HIGH_AI_SMELL[:200]
    print("\nDocument snippet (high AI smell):")
    print(f"{high_preview}...")
    print("\nExpected: AI_SMELL_DETECTED with high scores on adjective_violence and detail_vacuum")
    print(f"\nExecutor ready: {executor.__class__.__name__}")

    # Sketch of the real evaluation call, left disabled by the example's
    # author (requires a valid API key configured above):
    # try:
    #     result = executor.eval_text(SAMPLE_DOC_HIGH_AI_SMELL)
    #     print("\nEvaluation Result:")
    #     print(result.reason[0])
    # except Exception as e:
    #     print(f"\nCould not run evaluation: {e}")

    print(f"\n{rule}")
    print("Example 2: Low AI Smell Document")
    print(rule)

    low_preview = SAMPLE_DOC_LOW_AI_SMELL[:200]
    print("\nDocument snippet (low AI smell):")
    print(f"{low_preview}...")
    print("\nExpected: AI_SMELL_CLEAN with low scores across all dimensions")

# Run the walkthrough only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    run_example()
Loading
Loading