diff --git a/CN/modules/ROOT/nav.adoc b/CN/modules/ROOT/nav.adoc index 2062603..0b052ac 100644 --- a/CN/modules/ROOT/nav.adoc +++ b/CN/modules/ROOT/nav.adoc @@ -31,6 +31,7 @@ **** xref:master/ecosystem_components/pg_cron.adoc[pg_cron] **** xref:master/ecosystem_components/pgsql_http.adoc[pgsql-http] **** xref:master/ecosystem_components/plpgsql_check.adoc[plpgsql_check] +**** xref:master/ecosystem_components/pg_ai_query.adoc[pg_ai_query] **** xref:master/ecosystem_components/pgroonga.adoc[pgroonga] **** xref:master/ecosystem_components/pgaudit.adoc[pgaudit] **** xref:master/ecosystem_components/pgrouting.adoc[pgrouting] diff --git a/CN/modules/ROOT/pages/master/ecosystem_components/pg_ai_query.adoc b/CN/modules/ROOT/pages/master/ecosystem_components/pg_ai_query.adoc new file mode 100644 index 0000000..4061446 --- /dev/null +++ b/CN/modules/ROOT/pages/master/ecosystem_components/pg_ai_query.adoc @@ -0,0 +1,319 @@ +:sectnums: +:sectnumlevels: 5 + += pg_ai_query + +== 概述 + +pg_ai_query 是一个用于 IvorySQL/PostgreSQL 的 AI 驱动自然语言转 SQL 扩展。它利用大语言模型(LLM)将用户的自然语言描述直接转换为可执行的 SQL 查询语句,支持 OpenAI、Anthropic Claude 和 Google Gemini 等多种 AI 模型。 + +项目地址: + +开源协议:Apache-2.0 + +主要特性: + +* **自然语言转 SQL**:将语言描述转换为有效的 PostgreSQL 查询 +* **多模型支持**:支持 gpt-4o-mini、gpt-4o、gpt-5、claude-3-haiku-20240307、claude-sonnet-4-5-20250929、claude-4.5-opus 等大模型 +* **安全保护**:阻止访问 `information_schema` 和 `pg_catalog` 系统表 +* **作用域限制**:仅对用户表进行操作 +* **可配置限制**:内置行数限制强制执行 +* **API 密钥安全**:安全处理 API 凭证 + +== 快速开始 + +=== 安装 + +*依赖要求* + +* PostgreSQL 14+ with development headers +* CMake 3.16+ +* C++20 compatible compiler +* API key from OpenAI, Anthropic, or Google (Gemini) + +*安装依赖* + +[source,bash] +---- +sudo apt-get install libcurl4-openssl-dev +---- + +*编译安装 IvorySQL* + +如需从源码编译 IvorySQL,可参考以下配置: + +[source,bash] +---- +./configure \ +--prefix=$PWD/inst \ +--enable-cassert \ +--enable-debug \ +--enable-tap-tests \ +--enable-rpath \ +--enable-nls \ +--enable-injection-points \ +--with-tcl \ +--with-python \ +--with-gssapi \ +--with-pam \ +--with-ldap \ +--with-openssl \ +--with-libedit-preferred \ +--with-uuid=e2fs \ +--with-ossp-uuid \ +--with-libxml \ +--with-libxslt \ +--with-perl \ +--with-icu \ +--with-libnuma +---- + +*编译安装 pg_ai_query* + +[source,bash] +---- +git clone --recurse-submodules https://github.com/benodiwal/pg_ai_query.git +cd pg_ai_query +mkdir build && cd build +export PATH="$HOME/works/repo/ivorysql/IvorySQL/inst/bin:$PATH" +cmake .. -DCMAKE_INSTALL_PREFIX=$HOME/works/repo/ivorysql/IvorySQL/inst +make && sudo make install +---- + +*创建扩展* + +[source,sql] +---- +CREATE EXTENSION pg_ai_query; +---- + +=== 配置 + +在 home 目录下创建 `~/.pg_ai.config` 配置文件: + +[source,ini] +---- +[general] +log_level = "INFO" +enable_logging = false + +[query] +enforce_limit = true +default_limit = 1000 + +[response] +show_explanation = true +show_warnings = true +show_suggested_visualization = false +use_formatted_response = false + +[anthropic] +# Your Anthropic API key (if using Claude) +api_key = "******" + +# Default model to use (options: claude-sonnet-4-5-20250929) +default_model = "claude-sonnet-4-5-20250929" + +# Custom API endpoint (optional) - for Anthropic-compatible APIs +api_endpoint = "https://open.bigmodel.cn/api/anthropic" + +[prompts] +# Use file paths to read custom prompts +system_prompt = /home/highgo/.pg_ai.prompts +explain_system_prompt = /home/highgo/.pg_ai.explain.prompts +---- + +更多示例请参考: + +== 使用示例 + +=== 基本用法 + +[source,sql] +---- +SELECT generate_query('找出所有的用户'); +---- + +输出示例: + +---- +[INFO] Text generation successful - model: claude-sonnet-4-5-20250929, response_id: msg_20260209135507cc16362d5d324ccd + + generate_query +-------------------------------------------------------- + SELECT * FROM public.users LIMIT 1000; ++ + -- Explanation: + -- Retrieves all columns and rows from the users table. ++ + -- Warning: INFO: Applied LIMIT 1000 to prevent large result sets. Remove LIMIT if you need all data. ++ + -- Note: Row limit was automatically applied to this query for safety +(1 row) +---- + +执行查询: + +[source,sql] +---- +SELECT * FROM public.users LIMIT 1000; +---- + +输出: + +---- + id | name | email | age | created_at | city +----+---------------+-------------------+-----+----------------------------+--------------- + 1 | Alice Johnson | alice@example.com | 28 | 2026-02-04 15:47:55.208111 | New York + 2 | Bob Smith | bob@example.com | 35 | 2026-02-04 15:47:55.208111 | San Francisco + 3 | Carol Davis | carol@example.com | 31 | 2026-02-04 15:47:55.208111 | Chicago + 4 | David Wilson | david@example.com | 27 | 2026-02-04 15:47:55.208111 | Seattle + 5 | Eva Brown | eva@example.com | 33 | 2026-02-04 15:47:55.208111 | Boston +(5 rows) +---- + +=== generate_query 示例 + +*生成测试数据* + +[source,sql] +---- +SELECT generate_query('生成100条user数据,插入到users'); +---- + +输出: + +---- +[INFO] Text generation successful - model: claude-sonnet-4-5-20250929, response_id: msg_2026021114092101601c5650864a2d + + generate_query +-------------------------------------------------------------------------------------------------------- + INSERT INTO public.users (name, email, age, city, status) + SELECT 'User_' || generate_series AS name, + 'user' || generate_series || '@example.com' AS email, + (18 + (generate_series % 50)) AS age, + (ARRAY['Beijing','Shanghai','Guangzhou','Shenzhen','Hangzhou'])[1 + (generate_series % 5)] AS city, + 'active' AS status + FROM generate_series(1, 100); ++ + -- Explanation: + -- 生成100条模拟用户数据并插入到users表中。数据包括自动生成的姓名、唯一邮箱、随机年龄(18-67岁)、随机城市和默认状态。 ++ + -- Warnings: + -- 1. INFO: 依赖users表的id列有DEFAULT自增设置,未手动插入id。 + -- 2. INFO: 使用generate_series函数生成序列数据,这是PostgreSQL/IvorySQL的特性。 + -- 3. WARN: 确保在运行前users表为空或id序列不冲突,否则可能重复插入。 + -- 4. WARN: 邮箱格式为简单模拟,实际环境中可能需要更复杂的逻辑或去重检查。 +(1 row) +---- + +*不区分大小写查询* + +[source,sql] +---- +SELECT generate_query('show users from beijing, beijing is non-Case insensitive'); +---- + +输出: + +---- +[INFO] Text generation successful - model: claude-sonnet-4-5-20250929, response_id: msg_20260211142845878f5f1a5a2f44a7 + + generate_query +----------------------------------------- + SELECT id, name, email, age, created_at, city, status + FROM public.users + WHERE LOWER(city) = LOWER('beijing') LIMIT 100; ++ + -- Explanation: + -- Selects all user details for users located in Beijing, performing a case-insensitive match on the city column. ++ + -- Warnings: + -- 1. INFO: Using LOWER() on both sides ensures case-insensitive matching but may prevent the database from using a standard index on the city column if one exists. + -- 2. INFO: Row limit of 100 applied to prevent large result sets. ++ + -- Note: Row limit was automatically applied to this query for safety +(1 row) +---- + +=== explain_query 示例 + +[source,sql] +---- +SELECT explain_query('SELECT * FROM orders WHERE user_id = 12'); +---- + +输出: + +---- +[INFO] Text generation successful - model: claude-sonnet-4-5-20250929, response_id: msg_20260211175909d47a6871bcca4897 + + explain_query +-------------------------------------------------------------------------------------------------------------- + 1. 查询概述 ++ + - 该查询旨在从 orders 表中检索 user_id 等于 12 的所有记录(SELECT *)。 + - 这是一个典型的根据特定字段(user_id)筛选数据的查询。 ++ + 2. 性能摘要 ++ + - 总执行时间: 0.021 毫秒 + - 规划时间: 0.430 毫秒 + - 总成本: 18.12 + - 返回行数: 0 行 (Actual Rows: 0) + - 扫描行数: 0 行 (Rows Removed by Filter: 0) ++ + 3. 执行计划分析 ++ + - 关键步骤: 顺序扫描 + - 数据库对 orders 表执行了全表扫描操作。 + - 计划器预计会找到 3 行数据,但实际执行返回了 0 行。 + - 过滤条件: orders.user_id = 12,这意味着数据库必须读取表中的每一行来检查这个条件。 ++ + 4. 性能问题 ++ + - 全表扫描风险: 虽然目前表的数据量很小(执行时间仅为 0.021ms),但使用了 Seq Scan(顺序扫描)意味着数据库没有使用索引。如果 orders 表随着时间推移增长到包含数百万行数据,这种查询方式将变得极其缓慢(高 I/O 消耗)。 + - 缺失索引: 计划显示没有使用任何索引来定位 user_id = 12 的行,这表明在 user_id 列上可能缺少必要的 B-Tree 索引。 ++ + 5. 优化建议 ++ + - 主要建议: 在 user_id 列上创建索引以避免全表扫描。这将把查询从 O(N)(扫描所有行)转变为 O(log N)(索引查找)。 + - SQL 优化示例: ++ + CREATE INDEX idx_orders_user_id ON orders(user_id); ++ + 6. 索引建议 ++ + - 推荐索引: 在 orders 表的 user_id 列上创建 B-Tree 索引。 + - 理由: 查询条件基于 user_id 的等值比较 (=)。创建索引后,IvorySQL (PostgreSQL) 将能够利用索引快速定位数据,显著减少查询时间和资源消耗,特别是在数据量大的情况下。 +(1 row) +---- + +== 最佳实践 + +=== 提示词(Prompt)编写建议 + +* **使用英语**:虽然 AI 支持多种语言,但英语效果最佳 +* **了解数据库结构**:对数据库结构理解越深入,生成的查询越准确 +* **迭代优化**:从宽泛的开始,然后逐步添加细节以改进结果 +* **明确指定**:如果知道特定的表或列,请在提示中提及,这有助于 AI 生成精确的查询 + +=== 错误处理示例 + +当查询中引用的表不存在时,系统会返回错误信息: + +[source,sql] +---- +SELECT generate_query('列出所有的商品和价格'); +---- + +错误输出: + +---- +[INFO] Text generation successful - model: claude-sonnet-4-5-20250929, response_id: msg_20260209135642777cbc5c82ca4a85 + +ERROR: Query generation failed: Cannot generate query. Referenced table(s) for 'products' or 'goods' do not exist in the database. Available tables: public.orders, public.student_scores, public.users, sys.dual +---- + +在这种情况下,AI 会告知可用的表列表,帮助用户调整查询。 + diff --git a/EN/modules/ROOT/nav.adoc b/EN/modules/ROOT/nav.adoc index 97ababa..e30e242 100644 --- a/EN/modules/ROOT/nav.adoc +++ b/EN/modules/ROOT/nav.adoc @@ -30,6 +30,7 @@ *** xref:master/ecosystem_components/pg_cron.adoc[pg_cron] *** xref:master/ecosystem_components/pgsql_http.adoc[pgsql-http] *** xref:master/ecosystem_components/plpgsql_check.adoc[plpgsql_check] +*** xref:master/ecosystem_components/pg_ai_query.adoc[pg_ai_query] *** xref:master/ecosystem_components/pgroonga.adoc[pgroonga] *** xref:master/ecosystem_components/pgaudit.adoc[pgaudit] *** xref:master/ecosystem_components/pgrouting.adoc[pgrouting] diff --git a/EN/modules/ROOT/pages/master/ecosystem_components/pg_ai_query.adoc b/EN/modules/ROOT/pages/master/ecosystem_components/pg_ai_query.adoc new file mode 100644 index 0000000..04b8a94 --- /dev/null +++ b/EN/modules/ROOT/pages/master/ecosystem_components/pg_ai_query.adoc @@ -0,0 +1,319 @@ +:sectnums: +:sectnumlevels: 5 + += pg_ai_query + +== Overview + +pg_ai_query is an AI-powered natural language to SQL extension for IvorySQL/PostgreSQL. It leverages Large Language Models (LLMs) to convert user's natural language descriptions directly into executable SQL query statements. It supports multiple AI models including OpenAI, Anthropic Claude, and Google Gemini. + +Project URL: + +License: Apache-2.0 + +Key features: + +* **Natural Language to SQL**: Convert plain English descriptions into valid PostgreSQL queries +* **Multi-model Support**: Supports gpt-4o-mini, gpt-4o, gpt-5, claude-3-haiku-20240307, claude-sonnet-4-5-20250929, claude-4.5-opus and other large models +* **System Table Protection**: Blocks access to `information_schema` and `pg_catalog` +* **Limited Scope**: Only operates on user tables +* **Configurable Limits**: Built-in row limit enforcement +* **API Key Security**: Secure handling of API credentials + +== Quick Start + +=== Installation + +*Prerequisites* + +* PostgreSQL 14+ with development headers +* CMake 3.16+ +* C++20 compatible compiler +* API key from OpenAI, Anthropic, or Google (Gemini) + +*Install dependencies* + +[source,bash] +---- +sudo apt-get install libcurl4-openssl-dev +---- + +*Compile and install IvorySQL* + +If you need to compile IvorySQL from source, you can refer to the following configuration: + +[source,bash] +---- +./configure \ +--prefix=$PWD/inst \ +--enable-cassert \ +--enable-debug \ +--enable-tap-tests \ +--enable-rpath \ +--enable-nls \ +--enable-injection-points \ +--with-tcl \ +--with-python \ +--with-gssapi \ +--with-pam \ +--with-ldap \ +--with-openssl \ +--with-libedit-preferred \ +--with-uuid=e2fs \ +--with-ossp-uuid \ +--with-libxml \ +--with-libxslt \ +--with-perl \ +--with-icu \ +--with-libnuma +---- + +*Compile and install pg_ai_query* + +[source,bash] +---- +git clone --recurse-submodules https://github.com/benodiwal/pg_ai_query.git +cd pg_ai_query +mkdir build && cd build +export PATH="$HOME/works/repo/ivorysql/IvorySQL/inst/bin:$PATH" +cmake .. -DCMAKE_INSTALL_PREFIX=$HOME/works/repo/ivorysql/IvorySQL/inst +make && sudo make install +---- + +*Create extension* + +[source,sql] +---- +CREATE EXTENSION pg_ai_query; +---- + +=== Configuration + +Create a `~/.pg_ai.config` configuration file in your home directory: + +[source,ini] +---- +[general] +log_level = "INFO" +enable_logging = false + +[query] +enforce_limit = true +default_limit = 1000 + +[response] +show_explanation = true +show_warnings = true +show_suggested_visualization = false +use_formatted_response = false + +[anthropic] +# Your Anthropic API key (if using Claude) +api_key = "******" + +# Default model to use (options: claude-sonnet-4-5-20250929) +default_model = "claude-sonnet-4-5-20250929" + +# Custom API endpoint (optional) - for Anthropic-compatible APIs +api_endpoint = "https://open.bigmodel.cn/api/anthropic" + +[prompts] +# Use file paths to read custom prompts +system_prompt = /home/highgo/.pg_ai.prompts +explain_system_prompt = /home/highgo/.pg_ai.explain.prompts +---- + +For more examples, please refer to: + +== Usage Examples + +=== Basic Usage + +[source,sql] +---- +SELECT generate_query('Find all users'); +---- + +Example output: + +---- +[INFO] Text generation successful - model: claude-sonnet-4-5-20250929, response_id: msg_20260209135507cc16362d5d324ccd + + generate_query +-------------------------------------------------------- + SELECT * FROM public.users LIMIT 1000; ++ + -- Explanation: + -- Retrieves all columns and rows from the users table. ++ + -- Warning: INFO: Applied LIMIT 1000 to prevent large result sets. Remove LIMIT if you need all data. ++ + -- Note: Row limit was automatically applied to this query for safety +(1 row) +---- + +Execute the query: + +[source,sql] +---- +SELECT * FROM public.users LIMIT 1000; +---- + +Output: + +---- + id | name | email | age | created_at | city +----+---------------+-------------------+-----+----------------------------+--------------- + 1 | Alice Johnson | alice@example.com | 28 | 2026-02-04 15:47:55.208111 | New York + 2 | Bob Smith | bob@example.com | 35 | 2026-02-04 15:47:55.208111 | San Francisco + 3 | Carol Davis | carol@example.com | 31 | 2026-02-04 15:47:55.208111 | Chicago + 4 | David Wilson | david@example.com | 27 | 2026-02-04 15:47:55.208111 | Seattle + 5 | Eva Brown | eva@example.com | 33 | 2026-02-04 15:47:55.208111 | Boston +(5 rows) +---- + +=== generate_query Examples + +*Generate test data* + +[source,sql] +---- +SELECT generate_query('Generate 100 user records and insert into users table'); +---- + +Output: + +---- +[INFO] Text generation successful - model: claude-sonnet-4-5-20250929, response_id: msg_2026021114092101601c5650864a2d + + generate_query +-------------------------------------------------------------------------------------------------------- + INSERT INTO public.users (name, email, age, city, status) + SELECT 'User_' || generate_series AS name, + 'user' || generate_series || '@example.com' AS email, + (18 + (generate_series % 50)) AS age, + (ARRAY['Beijing','Shanghai','Guangzhou','Shenzhen','Hangzhou'])[1 + (generate_series % 5)] AS city, + 'active' AS status + FROM generate_series(1, 100); ++ + -- Explanation: + -- Generates 100 simulated user records and inserts them into the users table. Data includes auto-generated names, unique emails, random ages (18-67), random cities, and default status. ++ + -- Warnings: + -- 1. INFO: Relies on DEFAULT auto-increment setting for id column in users table, id is not manually inserted. + -- 2. INFO: Uses generate_series function to generate sequence data, which is a PostgreSQL/IvorySQL feature. + -- 3. WARN: Ensure users table is empty or id sequence does not conflict before running, otherwise duplicate inserts may occur. + -- 4. WARN: Email format is simple simulation, actual environment may require more complex logic or duplicate checking. +(1 row) +---- + +*Case-insensitive query* + +[source,sql] +---- +SELECT generate_query('Show users from beijing, beijing is non-Case insensitive'); +---- + +Output: + +---- +[INFO] Text generation successful - model: claude-sonnet-4-5-20250929, response_id: msg_20260211142845878f5f1a5a2f44a7 + + generate_query +----------------------------------------- + SELECT id, name, email, age, created_at, city, status + FROM public.users + WHERE LOWER(city) = LOWER('beijing') LIMIT 100; ++ + -- Explanation: + -- Selects all user details for users located in Beijing, performing a case-insensitive match on the city column. ++ + -- Warnings: + -- 1. INFO: Using LOWER() on both sides ensures case-insensitive matching but may prevent the database from using a standard index on the city column if one exists. + -- 2. INFO: Row limit of 100 applied to prevent large result sets. ++ + -- Note: Row limit was automatically applied to this query for safety +(1 row) +---- + +=== explain_query Examples + +[source,sql] +---- +SELECT explain_query('SELECT * FROM orders WHERE user_id = 12'); +---- + +Output: + +---- +[INFO] Text generation successful - model: claude-sonnet-4-5-20250929, response_id: msg_20260211175909d47a6871bcca4897 + + explain_query +-------------------------------------------------------------------------------------------------------------- + 1. Query Overview ++ + - This query aims to retrieve all records from the orders table where user_id equals 12 (SELECT *). + - This is a typical query that filters data based on a specific field (user_id). ++ + 2. Performance Summary ++ + - Total execution time: 0.021 ms + - Planning time: 0.430 ms + - Total cost: 18.12 + - Rows returned: 0 rows (Actual Rows: 0) + - Rows scanned: 0 rows (Rows Removed by Filter: 0) ++ + 3. Execution Plan Analysis ++ + - Key step: Sequential Scan + - The database performed a full table scan on the orders table. + - The planner estimated finding 3 rows, but actual execution returned 0 rows. + - Filter condition: orders.user_id = 12, which means the database must read every row in the table to check this condition. ++ + 4. Performance Issues ++ + - Full table scan risk: Although the current table data volume is small (execution time is only 0.021ms), using Seq Scan (sequential scan) means the database is not using an index. If the orders table grows to millions of rows over time, this query approach will become extremely slow (high I/O consumption). + - Missing index: The plan shows no index was used to locate rows where user_id = 12, indicating that a necessary B-Tree index may be missing on the user_id column. ++ + 5. Optimization Recommendations ++ + - Primary recommendation: Create an index on the user_id column to avoid full table scans. This will transform the query from O(N) (scanning all rows) to O(log N) (index lookup). + - SQL optimization example: ++ + CREATE INDEX idx_orders_user_id ON orders(user_id); ++ + 6. Index Recommendations ++ + - Recommended index: Create a B-Tree index on the user_id column of the orders table. + - Reason: The query condition is based on equality comparison (=) of user_id. After creating the index, IvorySQL (PostgreSQL) will be able to use the index to quickly locate data, significantly reducing query time and resource consumption, especially when data volume is large. +(1 row) +---- + +== Best Practices + +=== Prompt Writing Guidelines + +* **Use English**: While AI supports multiple languages, English works best +* **Understand your database structure**: The more you understand your database structure, the more accurate the generated queries will be +* **Refine iteratively**: Start broad, then add details step by step to improve results +* **Be explicit**: If you know specific tables or columns, mention them in your prompts - this helps AI generate more precise queries + +=== Error Handling Examples + +When tables referenced in the query do not exist, the system returns an error message: + +[source,sql] +---- +SELECT generate_query('List all products and prices'); +---- + +Error output: + +---- +[INFO] Text generation successful - model: claude-sonnet-4-5-20250929, response_id: msg_20260209135642777cbc5c82ca4a85 + +ERROR: Query generation failed: Cannot generate query. Referenced table(s) for 'products' or 'goods' do not exist in the database. Available tables: public.orders, public.student_scores, public.users, sys.dual +---- + +In this case, the AI informs you of the available table list to help adjust your query. +