From 15a6666d5f618146cf21d155d6d8c90d5afe8830 Mon Sep 17 00:00:00 2001 From: Piar <2741277534@qq.com> Date: Wed, 8 Apr 2026 23:37:56 +0800 Subject: [PATCH] docs: add DataFlow Skills section and generating dataflow pipeline references --- docs/.vuepress/notes/en/guide.ts | 10 ++ docs/.vuepress/notes/zh/guide.ts | 10 ++ docs/en/notes/guide/skills/core_text.md | 63 ++++++++ .../skills/generating_dataflow_pipeline.md | 153 ++++++++++++++++++ docs/zh/notes/guide/skills/core_text.md | 63 ++++++++ .../skills/generating_dataflow_pipeline.md | 153 ++++++++++++++++++ 6 files changed, 452 insertions(+) create mode 100644 docs/en/notes/guide/skills/core_text.md create mode 100644 docs/en/notes/guide/skills/generating_dataflow_pipeline.md create mode 100644 docs/zh/notes/guide/skills/core_text.md create mode 100644 docs/zh/notes/guide/skills/generating_dataflow_pipeline.md diff --git a/docs/.vuepress/notes/en/guide.ts b/docs/.vuepress/notes/en/guide.ts index 66aa93beb..95a870ab8 100644 --- a/docs/.vuepress/notes/en/guide.ts +++ b/docs/.vuepress/notes/en/guide.ts @@ -128,5 +128,15 @@ export const Guide: ThemeNote = defineNoteConfig({ "web_collection" ] }, + { + text: "DataFlow Skills", + collapsed: false, + icon: 'material-symbols:auto-awesome', + prefix: 'skills', + items: [ + "generating_dataflow_pipeline", + "core_text" + ] + }, ], }) diff --git a/docs/.vuepress/notes/zh/guide.ts b/docs/.vuepress/notes/zh/guide.ts index ab6dd9042..30acb18b4 100644 --- a/docs/.vuepress/notes/zh/guide.ts +++ b/docs/.vuepress/notes/zh/guide.ts @@ -127,6 +127,16 @@ export const Guide: ThemeNote = defineNoteConfig({ "web_collection" ] }, + { + text: "DataFlow Skills", + collapsed: false, + icon: 'material-symbols:auto-awesome', + prefix: 'skills', + items: [ + "generating_dataflow_pipeline", + "core_text" + ] + }, // { // text: '写作', // icon: 'fluent-mdl2:edit-create', diff --git a/docs/en/notes/guide/skills/core_text.md b/docs/en/notes/guide/skills/core_text.md new file mode 100644 index 
000000000..2850978a8 --- /dev/null +++ b/docs/en/notes/guide/skills/core_text.md @@ -0,0 +1,63 @@ +--- +title: Core Text Operators +icon: material-symbols:extension +createTime: 2026/04/08 22:45:39 +permalink: /en/guide/skills/core_text/ +--- + +# Core Text Operator Reference + +Extended operator reference for the [Generating DataFlow Pipeline](./generating_dataflow_pipeline.md) skill. When the 6 core primitives don't cover your task, consult the detailed per-operator documentation here. + +## Available Operators + +### Generate (`core_text/generate/`) + +| Operator | Description | +|----------|-------------| +| `prompted-generator` | Basic single-field LLM generation | +| `format-str-prompted-generator` | Multi-field template-based generation | +| `chunked-prompted-generator` | Long document chunk-by-chunk processing | +| `embedding-generator` | Text vectorization using embedding APIs | +| `retrieval-generator` | Async RAG generation using LightRAG | +| `bench-answer-generator` | Benchmark answer generation with evaluation type variants | +| `text2multihopqa-generator` | Multi-hop QA pair construction from text | +| `random-domain-knowledge-row-generator` | Domain-specific row generation from seed data | + +### Filter (`core_text/filter/`) + +| Operator | Description | +|----------|-------------| +| `prompted-filter` | LLM-based quality scoring and filtering | +| `general-filter` | Rule-based deterministic filtering | +| `kcentergreedy-filter` | Diversity-based filtering using k-Center Greedy | + +### Refine (`core_text/refine/`) + +| Operator | Description | +|----------|-------------| +| `prompted-refiner` | LLM-based text rewriting and refinement | +| `pandas-operator` | Custom pandas DataFrame operations | + +### Eval (`core_text/eval/`) + +| Operator | Description | +|----------|-------------| +| `prompted-evaluator` | LLM-based scoring and evaluation | +| `bench-dataset-evaluator` | Benchmark dataset evaluation | +| `bench-dataset-evaluator-question` | 
Benchmark question-level evaluation | | `text2qa-sample-evaluator` | QA sample quality evaluation | | `unified-bench-dataset-evaluator` | Unified benchmark evaluation across formats | + +## Directory Structure + +Each operator folder follows the same layout: + +``` +<operator-name>/ +├── SKILL.md # English documentation: use cases, imports, parameters, run() examples +├── SKILL_zh.md # Chinese documentation +└── examples/ + ├── good.md # Correct usage with a simple single-operator pipeline, sample input and output + └── bad.md # Common mistakes and anti-patterns +``` diff --git a/docs/en/notes/guide/skills/generating_dataflow_pipeline.md b/docs/en/notes/guide/skills/generating_dataflow_pipeline.md new file mode 100644 index 000000000..15dcd9e57 --- /dev/null +++ b/docs/en/notes/guide/skills/generating_dataflow_pipeline.md @@ -0,0 +1,153 @@ +--- +title: Generating DataFlow Pipeline +icon: carbon:flow +createTime: 2026/04/08 22:45:39 +permalink: /en/guide/skills/generating_dataflow_pipeline/ +--- + +# Generating DataFlow Pipeline + + + +## What It Does + +A reasoning-guided pipeline planner for [Claude Code](https://docs.anthropic.com/en/docs/claude-code). Given a **target** (what the pipeline should achieve) and a **sample JSONL file** (1–5 representative rows), it analyzes the data, selects operators, validates field dependencies, and generates a complete, runnable DataFlow pipeline in Python. + +## Quick Start + +### 1. 
Add the Skill + +Clone the repository and copy the skill directories into your Claude Code skills folder: + +```bash +git clone https://github.com/haolpku/DataFlow-Skills.git + +# Project-level (this project only) +cp -r DataFlow-Skills/generating-dataflow-pipeline .claude/skills/generating-dataflow-pipeline +cp -r DataFlow-Skills/core_text .claude/skills/core_text + +# Or personal-level (all your projects) +cp -r DataFlow-Skills/generating-dataflow-pipeline ~/.claude/skills/generating-dataflow-pipeline +cp -r DataFlow-Skills/core_text ~/.claude/skills/core_text +``` + +Claude Code discovers skills from `.claude/skills/<skill-name>/SKILL.md`. The `name` field in `SKILL.md` frontmatter becomes the `/slash-command`. For more details, see the [official skills documentation](https://docs.anthropic.com/en/docs/claude-code/skills). + +### 2. Prepare Your Data + +Create a JSONL file (one JSON object per line) with 1–5 representative rows: + +```jsonl +{"product_name": "Laptop", "category": "Electronics"} +{"product_name": "Coffee Maker", "category": "Appliances"} +``` + +### 3. Run the Skill + +In Claude Code, invoke `/generating-dataflow-pipeline` and describe your target: + +``` +/generating-dataflow-pipeline +Target: Generate product descriptions and filter high-quality ones +Sample file: ./data/products.jsonl +Expected outputs: generated_description, quality_score +``` + +### 4. Review the Output + +The skill returns a two-stage result: + +1. **Intermediate Operator Decision** — JSON with operator chain, field flow, and reasoning +2. **Complete 5-Section Response**: + - Field Mapping — which fields exist vs. need to be generated + - Ordered Operator List — operators in execution order with justification + - Reasoning Summary — why this design satisfies the target + - Complete Pipeline Code — full executable Python following standard structure + - Adjustable Parameters / Caveats — tunable knobs and debugging tips + +## Six Core Operators + +| Operator | Purpose | LLM? 
| +|----------|---------|------| +| `PromptedGenerator` | Single-field LLM generation | Yes | +| `FormatStrPromptedGenerator` | Multi-field template-based generation | Yes | +| `Text2MultiHopQAGenerator` | Multi-hop QA pair construction from text | Yes | +| `PromptedFilter` | LLM-based quality scoring & filtering | Yes | +| `GeneralFilter` | Rule-based deterministic filtering | No | +| **KBC Trio** (3 operators, always together in order) | File/URL → Markdown → chunks → clean text | Partial | + +## Generated Pipeline Structure + +All generated pipelines follow the same standard structure: + +```python +from dataflow.operators.core_text import PromptedGenerator, PromptedFilter +from dataflow.serving import APILLMServing_request +from dataflow.utils.storage import FileStorage + +class MyPipeline: + def __init__(self): + self.storage = FileStorage( + first_entry_file_name="./data/input.jsonl", # User-provided path + cache_path="./cache", + file_name_prefix="step", + cache_type="jsonl" + ) + self.llm_serving = APILLMServing_request( + api_url="https://api.openai.com/v1/chat/completions", + model_name="gpt-4o", + max_workers=10 + ) + # Operator instances ... + + def forward(self): + # Sequential operator.run() calls, each with storage.step() + ... + +if __name__ == "__main__": + pipeline = MyPipeline() + pipeline.forward() +``` + +Key rules: + +- `first_entry_file_name` is set to the exact user-provided JSONL path +- Each `operator.run()` call uses `storage=self.storage.step()` for checkpointing +- Fields propagate forward: a field must exist in the sample or be output by a prior step before it can be consumed + +## Extended Operators + +Beyond the 6 core primitives, DataFlow provides additional operators for generation, filtering, refinement, and evaluation. See the [Core Text Operator Reference](./core_text.md) for the full list. 
+ +## Adding a New Operator + +Prerequisite: the new operator's skill definition already exists (with `SKILL.md`, `examples/good.md`, `examples/bad.md`, etc.). + +### As an Extended Operator + +Two steps are required: + +**Step 1.** Create an operator directory with its skill definition under any appropriate location (e.g., `core_text/<category>/`, or a separate skill package): + +``` +<category>/<operator-name>/ +├── SKILL.md # API reference (constructor, run() signature, execution logic, constraints) +├── SKILL_zh.md # Chinese translation (optional) +└── examples/ + ├── good.md # Best-practice example + └── bad.md # Common mistakes +``` + +**Step 2.** Register the operator in `SKILL.md`'s **Extended Operator Reference** section. Add a row to the corresponding category table (Generate / Filter / Refine / Eval) with the operator name, subdirectory path, and description. Without this entry, the pipeline generator will not know the operator exists. + +### Promoting to a Core Primitive (Optional) + +If the operator is used frequently enough to warrant priority selection, promote it by modifying `SKILL.md`: + +1. Add to the **Preferred Operator Strategy** core primitives list +2. Add a decision table row in **Operator Selection Priority Rule** (when to use / when not to use) +3. Add full constructor and `run()` signatures in **Operator Parameter Signature Rule** +4. Add the import path in **Correct Import Paths** +5. Add input pattern matching in **Input File Content Analysis Rule** if it handles a new data type +6. Update or remove the entry from the **Extended Operator Reference** table to avoid duplication +7. 
Add a complete example in `examples/` (recommended) diff --git a/docs/zh/notes/guide/skills/core_text.md b/docs/zh/notes/guide/skills/core_text.md new file mode 100644 index 000000000..9349f31ec --- /dev/null +++ b/docs/zh/notes/guide/skills/core_text.md @@ -0,0 +1,63 @@ +--- +title: core_text +icon: material-symbols:extension +createTime: 2026/04/08 22:45:39 +permalink: /zh/guide/de8oculw/ +--- + +# Core Text 扩展算子参考 + +[DataFlow Pipeline生成](./generating_dataflow_pipeline.md) 的扩展算子参考库。当 6 个核心算子不能满足需求时,可查阅这里的逐算子详细文档。 + +## 可用算子 + +### Generate (`core_text/generate/`) + +| 算子 | 说明 | +|------|------| +| `prompted-generator` | 最基础的单字段 LLM 生成 | +| `format-str-prompted-generator` | 多字段模板式生成 | +| `chunked-prompted-generator` | 长文本分块逐段处理 | +| `embedding-generator` | 调用 Embedding API 生成文本向量 | +| `retrieval-generator` | 基于 LightRAG 的异步 RAG 生成 | +| `bench-answer-generator` | Benchmark 答案生成,支持多种评估类型 | +| `text2multihopqa-generator` | 从文本构建多跳问答对 | +| `random-domain-knowledge-row-generator` | 基于种子数据的领域知识行生成 | + +### Filter (`core_text/filter/`) + +| 算子 | 说明 | +|------|------| +| `prompted-filter` | 基于 LLM 的质量评分与过滤 | +| `general-filter` | 基于规则的确定性过滤 | +| `kcentergreedy-filter` | 基于 k-Center Greedy 的多样性过滤 | + +### Refine (`core_text/refine/`) + +| 算子 | 说明 | +|------|------| +| `prompted-refiner` | 基于 LLM 的文本改写与精炼 | +| `pandas-operator` | 自定义 pandas DataFrame 操作 | + +### Eval (`core_text/eval/`) + +| 算子 | 说明 | +|------|------| +| `prompted-evaluator` | 基于 LLM 的打分评估 | +| `bench-dataset-evaluator` | Benchmark 数据集评估 | +| `bench-dataset-evaluator-question` | Benchmark 问题级评估 | +| `text2qa-sample-evaluator` | 问答样本质量评估 | +| `unified-bench-dataset-evaluator` | 跨格式统一 Benchmark 评估 | + +## 目录结构 + +每个算子文件夹遵循统一布局: + +``` +<算子名称>/ +├── SKILL.md # 英文文档:使用场景、导入方式、参数说明、run() 示例 +├── SKILL_zh.md # 中文文档 +└── examples/ + ├── good.md # 正确用法示例,含单一算子组成的简单 Pipeline、样例输入及输出 + └── bad.md # 常见错误与反模式 +``` diff --git a/docs/zh/notes/guide/skills/generating_dataflow_pipeline.md 
b/docs/zh/notes/guide/skills/generating_dataflow_pipeline.md new file mode 100644 index 000000000..36e1cabae --- /dev/null +++ b/docs/zh/notes/guide/skills/generating_dataflow_pipeline.md @@ -0,0 +1,153 @@ +--- +title: DataFlow Pipeline生成 +icon: carbon:flow +createTime: 2026/04/08 22:45:39 +permalink: /zh/guide/skills/generating_dataflow_pipeline/ +--- + +# DataFlow Pipeline生成(Generating DataFlow Pipeline) + + + +## 功能说明 + +一款面向 [Claude Code](https://docs.anthropic.com/en/docs/claude-code) 的推理引导式 Pipeline 规划工具。给定**目标任务**(Pipeline 需实现的功能)和**样本 JSONL 文件**(1–5 条代表性数据),它会自动分析数据、选取算子、校验字段依赖,并生成一套完整可运行的 DataFlow Python Pipeline。 + +## 快速上手 + +### 1. 添加 Skill + +克隆仓库,并将 Skill 目录复制到 Claude Code 的 skills 文件夹中: + +```bash +git clone https://github.com/haolpku/DataFlow-Skills.git + +# 项目级(仅当前项目可用) +cp -r DataFlow-Skills/generating-dataflow-pipeline .claude/skills/generating-dataflow-pipeline +cp -r DataFlow-Skills/core_text .claude/skills/core_text + +# 或个人级(所有项目可用) +cp -r DataFlow-Skills/generating-dataflow-pipeline ~/.claude/skills/generating-dataflow-pipeline +cp -r DataFlow-Skills/core_text ~/.claude/skills/core_text +``` + +Claude Code 从 `.claude/skills/<skill-name>/SKILL.md` 自动发现 Skills。`SKILL.md` frontmatter 中的 `name` 字段即为 `/斜杠命令` 名称。更多详情请参阅[官方 Skills 文档](https://docs.anthropic.com/en/docs/claude-code/skills)。 + +### 2. 准备数据 + +创建 JSONL 文件(每行一个 JSON 对象),包含 1–5 条代表性数据: + +```jsonl +{"product_name": "笔记本电脑", "category": "电子产品"} +{"product_name": "咖啡机", "category": "家用电器"} +``` + +### 3. 运行 Skill + +在 Claude Code 中调用 `/generating-dataflow-pipeline` 并描述你的目标: + +``` +/generating-dataflow-pipeline +目标:生成商品描述并筛选优质内容 +样本文件:./data/products.jsonl +预期输出字段:generated_description, quality_score +``` + +### 4. 查看输出 + +Skill 输出两阶段结果: + +1. **中间算子决策** — 包含算子链路、字段流转逻辑与推理依据的 JSON 数据 +2. 
**完整的 5 部分响应**: + - 字段映射 — 区分已有字段与需生成字段 + - 有序算子列表 — 按执行顺序排列的算子及选用理由 + - 推理总结 — 说明该设计为何能满足目标任务 + - 完整 Pipeline 代码 — 遵循标准结构的可执行 Python 全量代码 + - 可调参数 / 注意事项 — 可配置参数与调试技巧 + +## 六大核心算子 + +| 算子名称 | 用途 | 是否依赖 LLM | +|----------|------|-------------| +| `PromptedGenerator` | 单字段大模型生成 | 是 | +| `FormatStrPromptedGenerator` | 多字段模板式生成 | 是 | +| `Text2MultiHopQAGenerator` | 从文本构建多跳问答对 | 是 | +| `PromptedFilter` | 基于大模型的质量评分与筛选 | 是 | +| `GeneralFilter` | 基于规则的确定性过滤 | 否 | +| **KBC 三算子组**(3 个算子,需按固定顺序组合使用) | 文件/链接 → Markdown → 分块 → 清洗文本 | 部分依赖 | + +## 生成的 Pipeline 结构 + +所有生成的 Pipeline 均遵循统一标准结构: + +```python +from dataflow.operators.core_text import PromptedGenerator, PromptedFilter +from dataflow.serving import APILLMServing_request +from dataflow.utils.storage import FileStorage + +class MyPipeline: + def __init__(self): + self.storage = FileStorage( + first_entry_file_name="./data/input.jsonl", # 用户提供的路径 + cache_path="./cache", + file_name_prefix="step", + cache_type="jsonl" + ) + self.llm_serving = APILLMServing_request( + api_url="https://api.openai.com/v1/chat/completions", + model_name="gpt-4o", + max_workers=10 + ) + # 算子实例化... + + def forward(self): + # 按顺序执行 operator.run(),每步通过 storage.step() 做断点持久化 + ... 
+ +if __name__ == "__main__": + pipeline = MyPipeline() + pipeline.forward() +``` + +核心规则: + +- `first_entry_file_name` 严格设置为用户提供的 JSONL 文件路径 +- 每个 `operator.run()` 调用均使用 `storage=self.storage.step()` 实现断点持久化 +- 字段向前传递:字段必须存在于样本数据中,或由前置步骤生成,方可被后续算子使用 + +## 扩展算子 + +除六大核心算子外,DataFlow 还提供更多 Generate、Filter、Refine 和 Eval 类扩展算子。详见 [Core Text 扩展算子参考](./core_text.md)。 + +## 新增算子 + +前置条件:新算子的 Skill 定义文件已完成(包含 `SKILL.md`、`examples/good.md`、`examples/bad.md` 等)。 + +### 作为扩展算子添加 + +需要两步: + +**第 1 步.** 在合适目录下创建算子文件夹并编写 Skill 定义(如 `core_text/<分类>/`,或独立 Skill 包): + +``` +<分类>/<自定义算子名称>/ +├── SKILL.md # 接口文档(构造函数、run() 方法签名、执行逻辑、约束条件) +├── SKILL_zh.md # 中文翻译(可选) +└── examples/ + ├── good.md # 最佳实践示例 + └── bad.md # 常见错误示例 +``` + +**第 2 步.** 在 `SKILL.md` 的 **Extended Operator Reference** 部分注册该算子。在对应类别表格(Generate / Filter / Refine / Eval)中添加一行,填写算子名、子目录路径和功能描述。不添加此条目,Pipeline 生成器无法感知该算子的存在。 + +### 升级为核心算子(可选) + +若某算子使用频率极高,需纳入优先选用范围,可通过修改 `SKILL.md` 完成升级: + +1. 添加至 **Preferred Operator Strategy** 核心算子列表 +2. 在 **Operator Selection Priority Rule** 中新增决策表条目(明确适用 / 不适用场景) +3. 在 **Operator Parameter Signature Rule** 中补充完整构造函数与 `run()` 方法签名 +4. 在 **Correct Import Paths** 中添加算子导入路径 +5. 若支持新数据类型,在 **Input File Content Analysis Rule** 中补充输入模式匹配规则 +6. 更新或移除 **Extended Operator Reference** 表中的对应条目,避免与核心算子重复 +7. 在 `examples/` 目录添加完整示例(推荐)