From 15a6666d5f618146cf21d155d6d8c90d5afe8830 Mon Sep 17 00:00:00 2001 From: Piar <2741277534@qq.com> Date: Wed, 8 Apr 2026 23:37:56 +0800 Subject: [PATCH] docs: add DataFlow Skills section and generating dataflow pipeline references --- docs/.vuepress/notes/en/guide.ts | 10 ++ docs/.vuepress/notes/zh/guide.ts | 10 ++ docs/en/notes/guide/skills/core_text.md | 63 ++++++++ .../skills/generating_dataflow_pipeline.md | 153 ++++++++++++++++++ docs/zh/notes/guide/skills/core_text.md | 63 ++++++++ .../skills/generating_dataflow_pipeline.md | 153 ++++++++++++++++++ 6 files changed, 452 insertions(+) create mode 100644 docs/en/notes/guide/skills/core_text.md create mode 100644 docs/en/notes/guide/skills/generating_dataflow_pipeline.md create mode 100644 docs/zh/notes/guide/skills/core_text.md create mode 100644 docs/zh/notes/guide/skills/generating_dataflow_pipeline.md diff --git a/docs/.vuepress/notes/en/guide.ts b/docs/.vuepress/notes/en/guide.ts index 66aa93beb..95a870ab8 100644 --- a/docs/.vuepress/notes/en/guide.ts +++ b/docs/.vuepress/notes/en/guide.ts @@ -128,5 +128,15 @@ export const Guide: ThemeNote = defineNoteConfig({ "web_collection" ] }, + { + text: "DataFlow Skills", + collapsed: false, + icon: 'material-symbols:auto-awesome', + prefix: 'skills', + items: [ + "generating_dataflow_pipeline", + "core_text" + ] + }, ], }) diff --git a/docs/.vuepress/notes/zh/guide.ts b/docs/.vuepress/notes/zh/guide.ts index ab6dd9042..30acb18b4 100644 --- a/docs/.vuepress/notes/zh/guide.ts +++ b/docs/.vuepress/notes/zh/guide.ts @@ -127,6 +127,16 @@ export const Guide: ThemeNote = defineNoteConfig({ "web_collection" ] }, + { + text: "DataFlow Skills", + collapsed: false, + icon: 'material-symbols:auto-awesome', + prefix: 'skills', + items: [ + "generating_dataflow_pipeline", + "core_text" + ] + }, // { // text: '写作', // icon: 'fluent-mdl2:edit-create', diff --git a/docs/en/notes/guide/skills/core_text.md b/docs/en/notes/guide/skills/core_text.md new file mode 100644 index 
000000000..2850978a8 --- /dev/null +++ b/docs/en/notes/guide/skills/core_text.md @@ -0,0 +1,63 @@ +--- +title: Core Text Operators +icon: material-symbols:extension +createTime: 2026/04/08 22:45:39 +permalink: /en/guide/skills/core_text/ +--- + +# Core Text Operator Reference + +Extended operator reference for the [Generating DataFlow Pipeline](./generating_dataflow_pipeline.md) skill. When the 6 core primitives don't cover your task, consult the detailed per-operator documentation here. + +## Available Operators + +### Generate (`core_text/generate/`) + +| Operator | Description | +|----------|-------------| +| `prompted-generator` | Basic single-field LLM generation | +| `format-str-prompted-generator` | Multi-field template-based generation | +| `chunked-prompted-generator` | Long document chunk-by-chunk processing | +| `embedding-generator` | Text vectorization using embedding APIs | +| `retrieval-generator` | Async RAG generation using LightRAG | +| `bench-answer-generator` | Benchmark answer generation with evaluation type variants | +| `text2multihopqa-generator` | Multi-hop QA pair construction from text | +| `random-domain-knowledge-row-generator` | Domain-specific row generation from seed data | + +### Filter (`core_text/filter/`) + +| Operator | Description | +|----------|-------------| +| `prompted-filter` | LLM-based quality scoring and filtering | +| `general-filter` | Rule-based deterministic filtering | +| `kcentergreedy-filter` | Diversity-based filtering using k-Center Greedy | + +### Refine (`core_text/refine/`) + +| Operator | Description | +|----------|-------------| +| `prompted-refiner` | LLM-based text rewriting and refinement | +| `pandas-operator` | Custom pandas DataFrame operations | + +### Eval (`core_text/eval/`) + +| Operator | Description | +|----------|-------------| +| `prompted-evaluator` | LLM-based scoring and evaluation | +| `bench-dataset-evaluator` | Benchmark dataset evaluation | +| `bench-dataset-evaluator-question` | 
Benchmark question-level evaluation | | `text2qa-sample-evaluator` | QA sample quality evaluation | | `unified-bench-dataset-evaluator` | Unified benchmark evaluation across formats | + +## Directory Structure + +Each operator folder follows the same layout: + +``` +<operator-name>/ +├── SKILL.md # English documentation: use cases, imports, parameters, run() examples +├── SKILL_zh.md # Chinese documentation +└── examples/ + ├── good.md # Correct usage with a simple single-operator pipeline, sample input and output + └── bad.md # Common mistakes and anti-patterns +``` diff --git a/docs/en/notes/guide/skills/generating_dataflow_pipeline.md b/docs/en/notes/guide/skills/generating_dataflow_pipeline.md new file mode 100644 index 000000000..15dcd9e57 --- /dev/null +++ b/docs/en/notes/guide/skills/generating_dataflow_pipeline.md @@ -0,0 +1,153 @@ +--- +title: Generating DataFlow Pipeline +icon: carbon:flow +createTime: 2026/04/08 22:45:39 +permalink: /en/guide/skills/generating_dataflow_pipeline/ +--- + +# Generating DataFlow Pipeline + + + +## What It Does + +A reasoning-guided pipeline planner for [Claude Code](https://docs.anthropic.com/en/docs/claude-code). Given a **target** (what the pipeline should achieve) and a **sample JSONL file** (1–5 representative rows), it analyzes the data, selects operators, validates field dependencies, and generates a complete, runnable DataFlow pipeline in Python. + +## Quick Start + +### 1. 
Add the Skill + +Clone the repository and copy the skill directories into your Claude Code skills folder: + +```bash +git clone https://github.com/haolpku/DataFlow-Skills.git + +# Project-level (this project only) +cp -r DataFlow-Skills/generating-dataflow-pipeline .claude/skills/generating-dataflow-pipeline +cp -r DataFlow-Skills/core_text .claude/skills/core_text + +# Or personal-level (all your projects) +cp -r DataFlow-Skills/generating-dataflow-pipeline ~/.claude/skills/generating-dataflow-pipeline +cp -r DataFlow-Skills/core_text ~/.claude/skills/core_text +``` + +Claude Code discovers skills from `.claude/skills/<skill-name>/SKILL.md`. The `name` field in `SKILL.md` frontmatter becomes the `/slash-command`. For more details, see the [official skills documentation](https://docs.anthropic.com/en/docs/claude-code/skills). + +### 2. Prepare Your Data + +Create a JSONL file (one JSON object per line) with 1–5 representative rows: + +```jsonl +{"product_name": "Laptop", "category": "Electronics"} +{"product_name": "Coffee Maker", "category": "Appliances"} +``` + +### 3. Run the Skill + +In Claude Code, invoke `/generating-dataflow-pipeline` and describe your target: + +``` +/generating-dataflow-pipeline +Target: Generate product descriptions and filter high-quality ones +Sample file: ./data/products.jsonl +Expected outputs: generated_description, quality_score +``` + +### 4. Review the Output + +The skill returns a two-stage result: + +1. **Intermediate Operator Decision** — JSON with operator chain, field flow, and reasoning +2. **Complete 5-Section Response**: + - Field Mapping — which fields exist vs. need to be generated + - Ordered Operator List — operators in execution order with justification + - Reasoning Summary — why this design satisfies the target + - Complete Pipeline Code — full executable Python following standard structure + - Adjustable Parameters / Caveats — tunable knobs and debugging tips + +## Six Core Operators + +| Operator | Purpose | LLM? 
| +|----------|---------|------| +| `PromptedGenerator` | Single-field LLM generation | Yes | +| `FormatStrPromptedGenerator` | Multi-field template-based generation | Yes | +| `Text2MultiHopQAGenerator` | Multi-hop QA pair construction from text | Yes | +| `PromptedFilter` | LLM-based quality scoring & filtering | Yes | +| `GeneralFilter` | Rule-based deterministic filtering | No | +| **KBC Trio** (3 operators, always together in order) | File/URL → Markdown → chunks → clean text | Partial | + +## Generated Pipeline Structure + +All generated pipelines follow the same standard structure: + +```python +from dataflow.operators.core_text import PromptedGenerator, PromptedFilter +from dataflow.serving import APILLMServing_request +from dataflow.utils.storage import FileStorage + +class MyPipeline: + def __init__(self): + self.storage = FileStorage( + first_entry_file_name="./data/input.jsonl", # User-provided path + cache_path="./cache", + file_name_prefix="step", + cache_type="jsonl" + ) + self.llm_serving = APILLMServing_request( + api_url="https://api.openai.com/v1/chat/completions", + model_name="gpt-4o", + max_workers=10 + ) + # Operator instances ... + + def forward(self): + # Sequential operator.run() calls, each with storage.step() + ... + +if __name__ == "__main__": + pipeline = MyPipeline() + pipeline.forward() +``` + +Key rules: + +- `first_entry_file_name` is set to the exact user-provided JSONL path +- Each `operator.run()` call uses `storage=self.storage.step()` for checkpointing +- Fields propagate forward: a field must exist in the sample or be output by a prior step before it can be consumed + +## Extended Operators + +Beyond the 6 core primitives, DataFlow provides additional operators for generation, filtering, refinement, and evaluation. See the [Core Text Operator Reference](./core_text.md) for the full list. 
+ +## Adding a New Operator + +Prerequisite: the new operator's skill definition already exists (with `SKILL.md`, `examples/good.md`, `examples/bad.md`, etc.). + +### As an Extended Operator + +Two steps are required: + +**Step 1.** Create an operator directory with its skill definition under any appropriate location (e.g., `core_text/<category>/`, or a separate skill package): + +``` +<category>/<operator-name>/ +├── SKILL.md # API reference (constructor, run() signature, execution logic, constraints) +├── SKILL_zh.md # Chinese translation (optional) +└── examples/ + ├── good.md # Best-practice example + └── bad.md # Common mistakes +``` + +**Step 2.** Register the operator in `SKILL.md`'s **Extended Operator Reference** section. Add a row to the corresponding category table (Generate / Filter / Refine / Eval) with the operator name, subdirectory path, and description. Without this entry, the pipeline generator will not know the operator exists. + +### Promoting to a Core Primitive (Optional) + +If the operator is used frequently enough to warrant priority selection, promote it by modifying `SKILL.md`: + +1. Add to the **Preferred Operator Strategy** core primitives list +2. Add a decision table row in **Operator Selection Priority Rule** (when to use / when not to use) +3. Add full constructor and `run()` signatures in **Operator Parameter Signature Rule** +4. Add the import path in **Correct Import Paths** +5. Add input pattern matching in **Input File Content Analysis Rule** if it handles a new data type +6. Update or remove the entry from the **Extended Operator Reference** table to avoid duplication +7. 
Add a complete example in `examples/` (recommended) diff --git a/docs/zh/notes/guide/skills/core_text.md b/docs/zh/notes/guide/skills/core_text.md new file mode 100644 index 000000000..9349f31ec --- /dev/null +++ b/docs/zh/notes/guide/skills/core_text.md @@ -0,0 +1,63 @@ +--- +title: core_text +icon: material-symbols:extension +createTime: 2026/04/08 22:45:39 +permalink: /zh/guide/de8oculw/ +--- + +# Core Text 扩展算子参考 + +[DataFlow Pipeline生成](./generating_dataflow_pipeline.md) 的扩展算子参考库。当 6 个核心算子不能满足需求时,可查阅这里的逐算子详细文档。 + +## 可用算子 + +### Generate (`core_text/generate/`) + +| 算子 | 说明 | +|------|------| +| `prompted-generator` | 最基础的单字段 LLM 生成 | +| `format-str-prompted-generator` | 多字段模板式生成 | +| `chunked-prompted-generator` | 长文本分块逐段处理 | +| `embedding-generator` | 调用 Embedding API 生成文本向量 | +| `retrieval-generator` | 基于 LightRAG 的异步 RAG 生成 | +| `bench-answer-generator` | Benchmark 答案生成,支持多种评估类型 | +| `text2multihopqa-generator` | 从文本构建多跳问答对 | +| `random-domain-knowledge-row-generator` | 基于种子数据的领域知识行生成 | + +### Filter (`core_text/filter/`) + +| 算子 | 说明 | +|------|------| +| `prompted-filter` | 基于 LLM 的质量评分与过滤 | +| `general-filter` | 基于规则的确定性过滤 | +| `kcentergreedy-filter` | 基于 k-Center Greedy 的多样性过滤 | + +### Refine (`core_text/refine/`) + +| 算子 | 说明 | +|------|------| +| `prompted-refiner` | 基于 LLM 的文本改写与精炼 | +| `pandas-operator` | 自定义 pandas DataFrame 操作 | + +### Eval (`core_text/eval/`) + +| 算子 | 说明 | +|------|------| +| `prompted-evaluator` | 基于 LLM 的打分评估 | +| `bench-dataset-evaluator` | Benchmark 数据集评估 | +| `bench-dataset-evaluator-question` | Benchmark 问题级评估 | +| `text2qa-sample-evaluator` | 问答样本质量评估 | +| `unified-bench-dataset-evaluator` | 跨格式统一 Benchmark 评估 | + +## 目录结构 + +每个算子文件夹遵循统一布局: + +``` +<算子名称>/ +├── SKILL.md # 英文文档:使用场景、导入方式、参数说明、run() 示例 +├── SKILL_zh.md # 中文文档 +└── examples/ + ├── good.md # 正确用法示例,含单一算子组成的简单 Pipeline、样例输入及输出 + └── bad.md # 常见错误与反模式 +``` diff --git a/docs/zh/notes/guide/skills/generating_dataflow_pipeline.md 
b/docs/zh/notes/guide/skills/generating_dataflow_pipeline.md new file mode 100644 index 000000000..36e1cabae --- /dev/null +++ b/docs/zh/notes/guide/skills/generating_dataflow_pipeline.md @@ -0,0 +1,153 @@ +--- +title: DataFlow Pipeline生成 +icon: carbon:flow +createTime: 2026/04/08 22:45:39 +permalink: /zh/guide/skills/generating_dataflow_pipeline/ +--- + +# DataFlow Pipeline生成(Generating DataFlow Pipeline) + + + +## 功能说明 + +一款面向 [Claude Code](https://docs.anthropic.com/en/docs/claude-code) 的推理引导式 Pipeline 规划工具。给定**目标任务**(Pipeline 需实现的功能)和**样本 JSONL 文件**(1–5 条代表性数据),它会自动分析数据、选取算子、校验字段依赖,并生成一套完整可运行的 DataFlow Python Pipeline。 + +## 快速上手 + +### 1. 添加 Skill + +克隆仓库,并将 Skill 目录复制到 Claude Code 的 skills 文件夹中: + +```bash +git clone https://github.com/haolpku/DataFlow-Skills.git + +# 项目级(仅当前项目可用) +cp -r DataFlow-Skills/generating-dataflow-pipeline .claude/skills/generating-dataflow-pipeline +cp -r DataFlow-Skills/core_text .claude/skills/core_text + +# 或个人级(所有项目可用) +cp -r DataFlow-Skills/generating-dataflow-pipeline ~/.claude/skills/generating-dataflow-pipeline +cp -r DataFlow-Skills/core_text ~/.claude/skills/core_text +``` + +Claude Code 从 `.claude/skills/<skill-name>/SKILL.md` 自动发现 Skills。`SKILL.md` frontmatter 中的 `name` 字段即为 `/斜杠命令` 名称。更多详情请参阅[官方 Skills 文档](https://docs.anthropic.com/en/docs/claude-code/skills)。 + +### 2. 准备数据 + +创建 JSONL 文件(每行一个 JSON 对象),包含 1–5 条代表性数据: + +```jsonl +{"product_name": "笔记本电脑", "category": "电子产品"} +{"product_name": "咖啡机", "category": "家用电器"} +``` + +### 3. 运行 Skill + +在 Claude Code 中调用 `/generating-dataflow-pipeline` 并描述你的目标: + +``` +/generating-dataflow-pipeline +目标:生成商品描述并筛选优质内容 +样本文件:./data/products.jsonl +预期输出字段:generated_description, quality_score +``` + +### 4. 查看输出 + +Skill 输出两阶段结果: + +1. **中间算子决策** — 包含算子链路、字段流转逻辑与推理依据的 JSON 数据 +2. 
**完整的 5 部分响应**: + - 字段映射 — 区分已有字段与需生成字段 + - 有序算子列表 — 按执行顺序排列的算子及选用理由 + - 推理总结 — 说明该设计为何能满足目标任务 + - 完整 Pipeline 代码 — 遵循标准结构的可执行 Python 全量代码 + - 可调参数 / 注意事项 — 可配置参数与调试技巧 + +## 六大核心算子 + +| 算子名称 | 用途 | 是否依赖 LLM | +|----------|------|-------------| +| `PromptedGenerator` | 单字段大模型生成 | 是 | +| `FormatStrPromptedGenerator` | 多字段模板式生成 | 是 | +| `Text2MultiHopQAGenerator` | 从文本构建多跳问答对 | 是 | +| `PromptedFilter` | 基于大模型的质量评分与筛选 | 是 | +| `GeneralFilter` | 基于规则的确定性过滤 | 否 | +| **KBC 三算子组**(3 个算子,需按固定顺序组合使用) | 文件/链接 → Markdown → 分块 → 清洗文本 | 部分依赖 | + +## 生成的 Pipeline 结构 + +所有生成的 Pipeline 均遵循统一标准结构: + +```python +from dataflow.operators.core_text import PromptedGenerator, PromptedFilter +from dataflow.serving import APILLMServing_request +from dataflow.utils.storage import FileStorage + +class MyPipeline: + def __init__(self): + self.storage = FileStorage( + first_entry_file_name="./data/input.jsonl", # 用户提供的路径 + cache_path="./cache", + file_name_prefix="step", + cache_type="jsonl" + ) + self.llm_serving = APILLMServing_request( + api_url="https://api.openai.com/v1/chat/completions", + model_name="gpt-4o", + max_workers=10 + ) + # 算子实例化... + + def forward(self): + # 按顺序执行 operator.run(),每步通过 storage.step() 做断点持久化 + ... 
+ +if __name__ == "__main__": + pipeline = MyPipeline() + pipeline.forward() +``` + +核心规则: + +- `first_entry_file_name` 严格设置为用户提供的 JSONL 文件路径 +- 每个 `operator.run()` 调用均使用 `storage=self.storage.step()` 实现断点持久化 +- 字段向前传递:字段必须存在于样本数据中,或由前置步骤生成,方可被后续算子使用 + +## 扩展算子 + +除六大核心算子外,DataFlow 还提供更多 Generate、Filter、Refine 和 Eval 类扩展算子。详见 [Core Text 扩展算子参考](./core_text.md)。 + +## 新增算子 + +前置条件:新算子的 Skill 定义文件已完成(包含 `SKILL.md`、`examples/good.md`、`examples/bad.md` 等)。 + +### 作为扩展算子添加 + +需要两步: + +**第 1 步.** 在合适目录下创建算子文件夹并编写 Skill 定义(如 `core_text/<分类>/`,或独立 Skill 包): + +``` +<分类>/<自定义算子名称>/ +├── SKILL.md # 接口文档(构造函数、run() 方法签名、执行逻辑、约束条件) +├── SKILL_zh.md # 中文翻译(可选) +└── examples/ + ├── good.md # 最佳实践示例 + └── bad.md # 常见错误示例 +``` + +**第 2 步.** 在 `SKILL.md` 的 **Extended Operator Reference** 部分注册该算子。在对应类别表格(Generate / Filter / Refine / Eval)中添加一行,填写算子名、子目录路径和功能描述。不添加此条目,Pipeline 生成器无法感知该算子的存在。 + +### 升级为核心算子(可选) + +若某算子使用频率极高,需纳入优先选用范围,可通过修改 `SKILL.md` 完成升级: + +1. 添加至 **Preferred Operator Strategy** 核心算子列表 +2. 在 **Operator Selection Priority Rule** 中新增决策表条目(明确适用 / 不适用场景) +3. 在 **Operator Parameter Signature Rule** 中补充完整构造函数与 `run()` 方法签名 +4. 在 **Correct Import Paths** 中添加算子导入路径 +5. 若支持新数据类型,在 **Input File Content Analysis Rule** 中补充输入模式匹配规则 +6. 更新或移除 **Extended Operator Reference** 表中的对应条目,避免与核心算子重复 +7. 在 `examples/` 目录添加完整示例(推荐)