diff --git a/=0.23.0 b/=0.23.0 new file mode 100644 index 0000000..8c84ebe --- /dev/null +++ b/=0.23.0 @@ -0,0 +1,4 @@ +[version-2 a8dd7b3] Added zstandard + 3 files changed, 12 insertions(+) + create mode 100644 "=0.23.0\n" + create mode 100644 changai/changai/setup/install.py diff --git "a/=0.23.0\n" "b/=0.23.0\n" new file mode 100644 index 0000000..0b77812 --- /dev/null +++ "b/=0.23.0\n" @@ -0,0 +1,2 @@ +[version-2 f5bbc36] Added zstandard + 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changai/changai/api/v2/assets/business_keywords_v1.json b/changai/changai/api/v2/assets/business_keywords_v1.json index 56e4d5d..3658622 100644 --- a/changai/changai/api/v2/assets/business_keywords_v1.json +++ b/changai/changai/api/v2/assets/business_keywords_v1.json @@ -6403,6 +6403,9 @@ "headline earnings", "exceptional item", "project demand", + "maintenance", + "logs", + "machines", "project request", "project intake", "project scoring", diff --git a/changai/changai/api/v2/build_cards_faiss_index_v2.py b/changai/changai/api/v2/build_cards_faiss_index_v2.py index 7123dcb..48a750f 100644 --- a/changai/changai/api/v2/build_cards_faiss_index_v2.py +++ b/changai/changai/api/v2/build_cards_faiss_index_v2.py @@ -27,12 +27,13 @@ def _get_fvs_paths() -> tuple: table_path = os.path.join(app_base, "table_fvs") schema_path = os.path.join(app_base, "schema_fvs") + schema_emb_path = os.path.join(app_base, "emb_dir") master_path = os.path.join(private_base, "masterdata_fvs") for p in (app_base, private_base, table_path, schema_path, master_path): os.makedirs(p, exist_ok=True) - return app_base, private_base, table_path, schema_path, master_path + return app_base, private_base, table_path, schema_path, master_path,schema_emb_path RAG_FOLDER = "Home/RAG Sources" HNSW_M = 32 @@ -200,7 +201,10 @@ def build_schema_docs(schema: Dict[str, Any]) -> List[Document]: if not isinstance(tables, list): return docs - + GENERIC_FIELDS = { + 'creation', 'modified', 'owner', 'parenttype','old_parent', + 'parentfield', 'parent', 'idx', 'name', 'docstatus' +} for table_block in tables: if not _is_valid_schema_table(table_block): continue @@ -211,8 +215,12 @@ def build_schema_docs(schema: Dict[str, Any]) -> List[Document]: if not isinstance(fields, list): continue - + for field_row in fields: + field_name = field_row.get("name") + if field_name in GENERIC_FIELDS: + continue + doc = _build_field_document(table_name, module, field_row) if doc: docs.append(doc) @@ -338,7 +346,7 @@ def build_all_fvs() -> Dict[str, Any]: def build_table_fvs_job(): try: - app_base, _, table_path, _, _ = _get_fvs_paths() + app_base, _, table_path, _, _,_ = _get_fvs_paths() tables_list = _load_json_from_file_doc("tables.json") table_docs = build_table_docs(tables_list) _build_and_save_faiss(table_docs, table_path, "ERPNext Table FVS", app_base) @@ -347,13 +355,50 @@ def build_table_fvs_job(): frappe.log_error(frappe.get_traceback(), "Build Table FVS Failed") raise +import os +import pickle +import numpy as np +def save_field_matrix(schema_docs, base_dir): + emb = get_embedding_engine() + + texts = [d.page_content for d in schema_docs] + vectors = emb.embed_documents(texts) + + embs = np.array(vectors, dtype="float32") + embs = embs / np.clip( + np.linalg.norm(embs, axis=1, keepdims=True), + 1e-12, + None + ) + + table_to_idx = {} + + for i, d in enumerate(schema_docs): + meta = getattr(d, "metadata", {}) or {} + table = meta.get("table") + field = meta.get("field") + + if table and field: + table_to_idx.setdefault(table, []).append(i) + + os.makedirs(base_dir, exist_ok=True) + + np.save(os.path.join(base_dir, "field_embs.npy"), embs) + + with open(os.path.join(base_dir, "field_docs.pkl"), "wb") as f: + pickle.dump(schema_docs, f) + + with open(os.path.join(base_dir, "table_to_idx.pkl"), "wb") as f: + pickle.dump(table_to_idx, f) + def build_schema_fvs_job(): try: schema = _load_yaml_from_file_doc("schema.yaml") schema_docs = build_schema_docs(schema) - app_base, _, _, schema_path, _ = _get_fvs_paths() + app_base, _, _, schema_path, _,schema_emb_dir = _get_fvs_paths() _build_and_save_faiss(schema_docs, schema_path, "ERPNext Schema FVS", app_base) + save_field_matrix(schema_docs, schema_emb_dir) frappe.logger().info(f"ERPNext Schema FVS built: {len(schema_docs)} docs") except Exception : frappe.log_error(frappe.get_traceback(), "Build Schema FVS Failed") @@ -362,7 +407,7 @@ def build_schema_fvs_job(): def build_master_data_fvs_job(): try: - _, private_base, _, _, master_path = _get_fvs_paths() + _, private_base, _, _, master_path,_ = _get_fvs_paths() master_data = _load_yaml_from_file_doc("master_data.yaml") entity_docs = build_entity_docs(master_data) _build_and_save_faiss(entity_docs, master_path, "ERPNext Master Data FVS", private_base) diff --git a/changai/changai/api/v2/fvs_stores/erpnext/emb_dir/field_docs.pkl b/changai/changai/api/v2/fvs_stores/erpnext/emb_dir/field_docs.pkl new file mode 100644 index 0000000..bf514c6 Binary files /dev/null and b/changai/changai/api/v2/fvs_stores/erpnext/emb_dir/field_docs.pkl differ diff --git a/changai/changai/api/v2/fvs_stores/erpnext/emb_dir/field_embs.npy b/changai/changai/api/v2/fvs_stores/erpnext/emb_dir/field_embs.npy new file mode 100644 index 0000000..4c825f0 Binary files /dev/null and b/changai/changai/api/v2/fvs_stores/erpnext/emb_dir/field_embs.npy differ diff --git a/changai/changai/api/v2/fvs_stores/erpnext/emb_dir/table_to_idx.pkl b/changai/changai/api/v2/fvs_stores/erpnext/emb_dir/table_to_idx.pkl new file mode 100644 index 0000000..363a1d6 Binary files /dev/null and b/changai/changai/api/v2/fvs_stores/erpnext/emb_dir/table_to_idx.pkl differ diff --git a/changai/changai/api/v2/fvs_stores/erpnext/schema_fvs/index.faiss b/changai/changai/api/v2/fvs_stores/erpnext/schema_fvs/index.faiss index 9c84619..29e1b7c 100644 Binary files a/changai/changai/api/v2/fvs_stores/erpnext/schema_fvs/index.faiss and b/changai/changai/api/v2/fvs_stores/erpnext/schema_fvs/index.faiss differ diff --git a/changai/changai/api/v2/fvs_stores/erpnext/schema_fvs/index.pkl b/changai/changai/api/v2/fvs_stores/erpnext/schema_fvs/index.pkl index ad62d4e..ed7ca02 100644 Binary files a/changai/changai/api/v2/fvs_stores/erpnext/schema_fvs/index.pkl and b/changai/changai/api/v2/fvs_stores/erpnext/schema_fvs/index.pkl differ diff --git a/changai/changai/api/v2/fvs_stores/erpnext/table_fvs/index.faiss b/changai/changai/api/v2/fvs_stores/erpnext/table_fvs/index.faiss index aea015a..68eb2cd 100644 Binary files a/changai/changai/api/v2/fvs_stores/erpnext/table_fvs/index.faiss and b/changai/changai/api/v2/fvs_stores/erpnext/table_fvs/index.faiss differ diff --git a/changai/changai/api/v2/process_data.py b/changai/changai/api/v2/process_data.py deleted file mode 100644 index 9c3f30e..0000000 --- a/changai/changai/api/v2/process_data.py +++ /dev/null @@ -1,95 +0,0 @@ -# from vertexai.generative_models import GenerativeModel, GenerationConfig -# import time,secrets -# from google.oauth2 import service_account -# from frappe import _ -# from google.genai import types -# from google.api_core import exceptions as google_exceptions -# from google import genai -# import frappe,json - -# MAX_RETRIES = 5 -# REQUEST_DELAY = 30 -# BASE_BACKOFF = 2.0 -# MAX_BACKOFF = 60.0 -# CHANGAI_SETTINGS = "ChangAI Settings" - - -# def _get_gemini_client(): -# settings = frappe.get_single(CHANGAI_SETTINGS) -# json_content = (settings.get("gemini_json_content") or "").strip() -# project_id = (settings.get("gemini_project_id") or "").strip() -# location = (settings.get("location") or "us-central1").strip() - -# if not json_content: -# frappe.throw(_("Gemini Service Account JSON is missing."), title=_("Missing Gemini Configuration")) -# if not project_id: -# frappe.throw(_("Gemini Project ID is missing."), title=_("Missing Gemini Configuration")) - -# try: -# service_account_info = json.loads(json_content) -# except json.JSONDecodeError as e: -# frappe.throw(_("Gemini Service Account JSON is invalid: {0}").format(str(e)), title=_("Invalid Gemini JSON")) - -# creds = service_account.Credentials.from_service_account_info( -# service_account_info, -# scopes=['https://www.googleapis.com/auth/cloud-platform'] -# ) -# return genai.Client( -# vertexai=True, -# project=project_id, -# location=location, -# credentials=creds, -# ) - - -# def _sleep_backoff(attempt: int, base: float = BASE_BACKOFF, cap: float = MAX_BACKOFF): -# delay = min(cap, base * (2 ** attempt)) -# delay = delay * (0.7 + secrets.randbelow(1000) / 1000 * 0.6) -# time.sleep(delay) -# def generate_anchors(): -# client=_get_gemini_client() -# raw=None -# for attempt in range(MAX_RETRIES): -# try: -# cfg = types.GenerateContentConfig( -# temperature=0.9, -# max_output_tokens=8192, -# system_instruction=system_instruction, -# ) - -# response = client.models.generate_content( -# model="gemini-2.5-flash-lite", -# contents=contents, -# config=cfg, -# ) -# raw = (response.text or "").strip() - -# if REQUEST_DELAY: -# time.sleep(REQUEST_DELAY) - -# break - -# except google_exceptions.ResourceExhausted: -# frappe.log_error( -# "Gemini quota exceeded", -# "Gemini Rate Limit (429) - sleeping 30s", -# ) -# time.sleep(30) -# _sleep_backoff(attempt) - -# except google_exceptions.Unauthenticated: -# frappe.log_error("Gemini auth failed", "Gemini Authentication Error") -# return "" - -# except google_exceptions.GoogleAPIError as e: -# frappe.log_error(str(e), "Gemini API Error") -# _sleep_backoff(attempt) - -# except Exception as e: -# frappe.log_error( -# title="Gemini generate_content.test failed", -# message=f"{str(e)}\n\nContents: {json.dumps(contents)[:8000] if contents else 'N/A'}", -# ) -# _sleep_backoff(attempt) - -# return raw or "" \ No newline at end of file diff --git a/changai/changai/api/v2/store_chats.py b/changai/changai/api/v2/store_chats.py index 788b88d..ce17f4f 100644 --- a/changai/changai/api/v2/store_chats.py +++ b/changai/changai/api/v2/store_chats.py @@ -76,94 +76,75 @@ def get_chat_history(session_id: str) -> list: return [] return history[-5:] -PROMPT_FOLLOWUP = """You are ChangAI, an ERP entity-value detector + query rewriter. -Return ONLY valid JSON with EXACTLY these keys: -{{"standalone_question":"...","contains_values":true/false}} - -### TASK 1 — SPELL CORRECTION: -- Fix any typos or spelling mistakes in the latest message before doing anything else -- Examples: - - "slaes order of lst mnoth" → "sales order of last month" - - "whcih custoemr has pendign" → "which customer has pending" - - "stok of chiar item" → "stock of chair item" - -### TASK 2 — CONTINUITY DETECTION: -- Check if the latest message is a follow-up or refers to previous conversation -- Look at the last 3-4 human messages in chat history for context -- If it IS a follow-up, rewrite as a fully self-contained standalone question -- Always put the final rewritten (and corrected) question in "standalone_question" - -Follow-up indicators: -- Pronouns with no clear referent: "it", "they", "that", "those", "him", "her", "his" -- Incomplete references: "same customer", "that item", "the one", "same period" -- Continuation words: "also", "and what about", "what else", "show more" -- Short vague messages: "and today?", "what about last month?", "how many?" - -Examples: - History: "show sales of ahmed" - Latest: "what about his pending invoices" - → standalone_question: "show pending invoices of ahmed" - - History: "stock of office chair in main warehouse" - Latest: "what about side tabel?" - → standalone_question: "stock of side table in main warehouse" - - History: "top 5 customers this month" - Latest: "show lst month" - → standalone_question: "top 5 customers last month" - - History: "employees in accounts department" - Latest: "hw many are absent today?" - → standalone_question: "how many employees in accounts department are absent today" - -### TASK 3 — ENTITY DETECTION (contains_values): -Meaning of contains_values (STRICT): - -TRUE = standalone_question contains ANY explicit or implied ENTITY IDENTIFIER -that should be matched to master data -(customer/supplier/item/warehouse/employee/category etc.) - -Examples (TRUE): -- "invoice of ayan" (name) -- "who bought laptop last month" (product) -- "sales of pens today" (product) -- "top items in electronics category" (category) -- "stock of office chair in main warehouse" (item + warehouse) - -FALSE = NO entity identifier mentioned. -Only filters, time ranges, counts, ranking words, or statuses. - -Examples (FALSE): -- "show all customers" -- "unpaid suppliers list" -- "sales orders pending delivery" -- "payment received this month" -- "top vendor dues list" -- "today sales" - -Rules: -- Any product/item/category reference → contains_values = true -- Only entity names/codes or product/category references → contains_values = true -- When unsure between item vs non-item → prefer contains_values = true - -### OUTPUT FORMAT (STRICT — no extra keys, no markdown): -{{"standalone_question":"...","contains_values":true/false}} - -Chat history (use ONLY human lines): -{rows} - -Latest user message: -{qstn} -- Always output the "standalone_question" in clear English, regardless of the user's original language, preserving the exact meaning. -""" @frappe.whitelist(allow_guest=False) def respond_from_cache(user_question:str): if user_question: doc=frappe.db.get_value("ChangAI Logs",{"user_question":user_question},["sql_generated","result"],as_dict=False) return doc - +PROMPT_FOLLOWUP = """You are ChangAI, an ERP query rewriter and entity detector. +Return ONLY valid JSON: +{{"standalone_question":"...","contains_values":true/false}} +TASK 1 — FOLLOW-UP +- If the query depends on previous messages, rewrite it as a complete standalone question. +- Otherwise keep it unchanged. +TASK 2 — ENTITY DETECTION +contains_values = TRUE if the query includes any specific entity +(customer, supplier, item, warehouse, employee, etc.), else FALSE. +When unsure → TRUE. +TASK 3 — ERP CONTEXTUAL REWRITE +1. Normalize: +- Fix typos, clear English +- Do NOT change entity values +2. Complete intent: +- Expand vague queries (total, list, top, filter) +3. ERP mapping: +- Map generic terms to standard ERPNext concepts based on intent +- Avoid vague words if clearer business terms exist +- Do NOT invent documents or use report names that +Examples: +invoice → Sales Invoice / Purchase Invoice +order → Sales Order / Purchase Order +stock → Bin / Stock Ledger Entry +production → Work Order +timesheet → Timesheet / Timesheet Detail +finance/profit → GL Entry (use credit and debit) +4. Field hints (max 1–2): +Use natural phrasing ("based on", "using") +sales → grand_total +qty → qty +stock → actual_qty +production → produced_qty +finance → debit / credit +status → status +5. Time fields: +Sales/Stock/Finance → posting_date +Work Order → actual_start_date / actual_end_date +Timesheet → start_date / end_date +Timesheet Detail → from_time / to_time +STRICT: +- NEVER use posting_date for Timesheet +- NEVER use creation unless asked +6. Relationships: +- Include linked entities if required +STYLE: +- Natural business language +- No SQL, no tab* names +EXAMPLES: +"sales invoice last month" +→ What is the total sales amount from Sales Invoices last month based on grand_total and posting_date? + +"stock in warehouse a" +→ What is the stock quantity in Warehouse A based on actual_qty from Bin? + +"who worked today" +→ Which employees logged time today based on Timesheet start_date or Timesheet Detail from_time? +Chat history: +{rows} +User: +{qstn} +""" @frappe.whitelist(allow_guest=False) def inject_prompt(user_qstn: str, session_id: str) -> str: rows=get_chat_history(session_id) diff --git a/changai/changai/api/v2/text2sql_pipeline_v2.py b/changai/changai/api/v2/text2sql_pipeline_v2.py index b5d3fe2..2ec4696 100644 --- a/changai/changai/api/v2/text2sql_pipeline_v2.py +++ b/changai/changai/api/v2/text2sql_pipeline_v2.py @@ -38,7 +38,14 @@ import shutil from frappe import _ from pathlib import Path - +import numpy as np +from typing import List, Dict, Any +from symspellpy.symspellpy import SymSpell +sym_spell = None + +_FIELD_DOCS_CACHE = None +_FIELD_EMBS_CACHE = None +_TABLE_TO_IDX_CACHE = None _ASSETS_DIR = Path(frappe.get_app_path("changai", "changai", "api", "v2", "assets")).resolve() _PROMPTS_DIR = Path(frappe.get_app_path("changai", "changai", "prompts")).resolve() CHANGAI_SETTINGS = "ChangAI Settings" @@ -47,6 +54,30 @@ import frappe from typing import Any, Dict, Optional + +def get_symspell(): + global sym_spell + + if sym_spell is not None: + return sym_spell + + sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7) + + dictionary_path = frappe.get_app_path( + "changai", + "utils", + "dictionaries", + "frequency_dictionary_en_82_765.txt" + ) + + sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1) + + for kw in BUSINESS_KEYWORDS: + sym_spell.create_dictionary_entry(kw.lower(), 1000) + + return sym_spell + + def publish_pipeline_update(request_id, stage, message, data=None, done=False, error=False): if not request_id: return @@ -189,7 +220,15 @@ def download_model_from_ui(): snapshot_download( repo_id="hyrinmansoor/changAI-nomic-embed-text-v1.5-finetuned", - local_dir=model_path + local_dir=model_path, + ignore_patterns=[ + "*.pt", + "*.pth", + "*.bin", + "trainer_*", + "optimizer*" + ] + ) _EMBEDDER_INSTANCE = None @@ -199,6 +238,48 @@ def download_model_from_ui(): frappe.log_error(frappe.get_traceback(), "Embedding Model Download Failed") frappe.throw(_("Model download failed: {0}\n Check Quick Start Guide Here 👇:\n{1}").format(str(e),CHANGAI_GUIDE_LINK)) +import os +import pickle +import numpy as np + +_FIELD_DOCS_CACHE = None +_FIELD_EMBS_CACHE = None +_TABLE_TO_IDX_CACHE = None + + +def load_field_matrix(): + global _FIELD_DOCS_CACHE, _FIELD_EMBS_CACHE, _TABLE_TO_IDX_CACHE + + if _FIELD_DOCS_CACHE is not None: + return _FIELD_DOCS_CACHE, _FIELD_EMBS_CACHE, _TABLE_TO_IDX_CACHE + + app_root = frappe.get_app_path("changai") + schema_path = os.path.join( + app_root, + "changai", "api", "v2", "fvs_stores", "erpnext", "emb_dir" + ) + + embs_path = os.path.join(schema_path, "field_embs.npy") + docs_path = os.path.join(schema_path, "field_docs.pkl") + table_idx_path = os.path.join(schema_path, "table_to_idx.pkl") + + if not os.path.exists(embs_path): + frappe.throw(f"Missing field_embs.npy. Rebuild schema FVS first: {embs_path}") + + with open(docs_path, "rb") as f: + docs = pickle.load(f) + + with open(table_idx_path, "rb") as f: + table_to_idx = pickle.load(f) + + embs = np.load(embs_path, mmap_mode="r") + + _FIELD_DOCS_CACHE = docs + _FIELD_EMBS_CACHE = embs + _TABLE_TO_IDX_CACHE = table_to_idx + + return docs, embs, table_to_idx + def get_embedding_engine(): global _EMBEDDER_INSTANCE @@ -219,7 +300,10 @@ def get_embedding_engine(): if _EMBEDDER_INSTANCE is None: _EMBEDDER_INSTANCE = HuggingFaceEmbeddings( model_name=model_path, - model_kwargs={"device": "cpu"} + model_kwargs={"device": "cpu","trust_remote_code": True,}, + encode_kwargs={ + "normalize_embeddings": True, + }, ) return _EMBEDDER_INSTANCE @@ -740,32 +824,43 @@ class SQLState(TypedDict, total=False): selected_fields: str +def is_erp_query(q: str, keywords: list[str]) -> bool: + return any(kw in q for kw in keywords) + +def correct_spelling(text: str) -> str: + sym = get_symspell() + suggestions = sym.lookup_compound(text, max_edit_distance=2) + return suggestions[0].term if suggestions else text + + def fill_sql_prompt(question: str, context: str) -> str: return SQL_PROMPT.format(question=question, context=context) + + def guardrail_router(state: SQLState) -> SQLState: request_id = state.get("request_id") - # publish_pipeline_update( - # request_id, - # "router_check", - # "Checking ERP vs NON-ERP" - # ) - raw_q = state.get("formatted_q") or state.get("question") or "" q = str(raw_q).lower().strip() - - is_erp = any(kw in q for kw in BUSINESS_KEYWORDS) + q_corrected = correct_spelling(q) + is_erp = is_erp_query(q_corrected, BUSINESS_KEYWORDS) query_type = "ERP" if is_erp else "NON_ERP" + # optional debug + # print("RAW:", q) + # print("CORRECTED:", q_corrected) + + state["query_type"] = query_type publish_pipeline_update( - request_id, - "router_result", - f"Query type detected: {query_type}" - ) + request_id, + "question_rewrite_done", + "Query classified as " + query_type, + data={"query_type": query_type} + ) - return {**state, "query_type": query_type} + return state def send_non_erp_request(state: SQLState) -> SQLState: qstn =state.get("question") @@ -904,7 +999,7 @@ def get_table_vs_test(): def call_fvs_table_search(q: str) -> List[str]: - hits = get_table_vs().similarity_search(q, k=15) + hits = get_table_vs().similarity_search(q, k=5) out, seen = [], set() for h in hits: t = h.metadata.get("table") @@ -949,38 +1044,34 @@ def call_retrieve_multi_line(user_question: str, request_id: str) -> Dict[str, A "table_retrieval_done", "Tables retrieved" ) - table_prompt = FILTER_TABLES.replace("{user_question}", user_question) - table_prompt = table_prompt.replace("{table_list}", json.dumps(top_tables, ensure_ascii=False)) - selected_raw = call_gemini(table_prompt) - selected_tables = _parse_json_list(selected_raw) - top_set = set(top_tables) - selected_tables = [t for t in selected_tables if t in top_set] - if not selected_tables: - return {"selected_fields": {}, "selected_tables": [], "top_tables": top_tables} - fields_candidates = {} - for table in selected_tables: - - fields_candidates[table] = call_fvs_field_search( - user_question, - table_name=table, - selected_tables=selected_tables, - k=40 - ) + # table_prompt = FILTER_TABLES.replace("{user_question}", user_question) + # table_prompt = table_prompt.replace("{table_list}", json.dumps(top_tables, ensure_ascii=False)) + # selected_raw = call_gemini(table_prompt) + # selected_tables = _parse_json_list(selected_raw) + # top_set = set(top_tables) + # selected_tables = [t for t in selected_tables if t in top_set] + # if not selected_tables: + # return {"selected_fields": {}, "selected_tables": [], "top_tables": top_tables} + fields_candidates= call_fvs_field_search_global_k( + user_question, + selected_tables=top_tables, + k_total=20 + ) publish_pipeline_update( request_id, "field_retrieval_done", "Fields selected" ) - field_prompt = filter_fields.replace("{user_question}", user_question) - field_prompt = field_prompt.replace("{fields_tables}", json.dumps(fields_candidates, ensure_ascii=False)) - selected_raw = call_gemini(field_prompt) - try: - selected_map = json.loads(selected_raw) if isinstance(selected_raw, str) else {} - except Exception: - selected_map = {} + # field_prompt = filter_fields.replace("{user_question}", user_question) + # field_prompt = field_prompt.replace("{fields_tables}", json.dumps(fields_candidates, ensure_ascii=False)) + # selected_raw = call_gemini(field_prompt) + # try: + # selected_map = json.loads(selected_raw) if isinstance(selected_raw, str) else {} + # except Exception: + # selected_map = {} return { - "selected_fields": json.dumps(selected_map, ensure_ascii=False), - "selected_tables": selected_tables, + "selected_fields": fields_candidates, + "selected_tables": top_tables, "top_tables": top_tables, "top_fields": fields_candidates, } @@ -990,6 +1081,123 @@ def call_retrieve_multi_line(user_question: str, request_id: str) -> Dict[str, A return {"selected_fields": {}, "selected_tables": [], "top_tables": [], "error": str(e)} +def call_fvs_field_search_global_k( + user_question: str, + selected_tables: List[str], + k_total: int = 20, +) -> str: + + if not user_question or not selected_tables: + return "" + + docs, embs, table_to_idx = load_field_matrix() + + import numpy as np + + emb = get_embedding_engine() + + q_vec = np.array( + emb.embed_query(user_question), + dtype="float32" + ) + + q_vec = q_vec / max(np.linalg.norm(q_vec), 1e-12) + + # collect indices + all_idxs = [] + for t in selected_tables: + all_idxs.extend(table_to_idx.get(t, [])) + + if not all_idxs: + return "" + + sub_embs = embs[all_idxs] + scores = sub_embs @ q_vec + + top_global = np.argsort(-scores)[:k_total] + + grouped = {} + seen = set() + + for i in top_global: + doc_i = all_idxs[int(i)] + d = docs[doc_i] + + meta = getattr(d, "metadata", {}) or {} + table = meta.get("table") + field = meta.get("field") + + if not table or not field: + continue + + key = (table, field) + if key in seen: + continue + seen.add(key) + + name = field + + # join hint + if meta.get("join_hint"): + linked_table = meta["join_hint"].get("table") + if linked_table: + name += f" -> {linked_table}" + + # options + if meta.get("options"): + opts = meta["options"] + if isinstance(opts, list): + name += " {" + ", ".join(map(str, opts[:5])) + "}" + + grouped.setdefault(table, []).append(name) + + # 🔥 final compact string + parts = [] + for table, fields in grouped.items(): + parts.append(f"{table}: " + ", ".join(fields)) + + return "\n".join(parts) + + +def call_fvs_field_search_grouped( + user_question: str, + selected_tables: List[str], +) -> Dict[str, List[Dict[str, Any]]]: + + if not user_question or not selected_tables: + return {} + + sub_vs = get_sub_vs(selected_tables) + if sub_vs is None: + return {} + + hits = sub_vs.similarity_search(user_question, k=20) + + selected_set = set(selected_tables) + grouped = {} + seen = set() + + for d in hits: + meta = getattr(d, "metadata", {}) or {} + tbl = meta.get("table") + fld = meta.get("field") + key = (tbl, fld) + if key in seen: + continue + seen.add(key) + row = {"field": fld, "table": tbl} + + join_hint = meta.get("join_hint") + if join_hint: + row["join_hint"] = join_hint + + options = meta.get("options") + if options: + row["options"] = options + + grouped.setdefault(tbl, []).append(row) + + return grouped def get_full_fields_vs_test(): global _FULL_FIELDS_VS @@ -1067,42 +1275,44 @@ def get_sub_vs(selected_tables: List[str]) -> Optional[FAISS]: return sub -def call_fvs_field_search( - user_question: str, - table_name: str, - selected_tables: List[str], - k: int = 40, -) -> List[Dict[str, Any]]: - if not user_question or not table_name: - return [] - sub_vs = get_sub_vs(selected_tables) - if sub_vs is None: - return [] - hits = sub_vs.similarity_search(user_question, k=min(60, max(40, k))) - results: List[Dict[str, Any]] = [] - seen = set() - for d in hits: - meta = getattr(d, "metadata", {}) or {} - tbl = meta.get("table") - fld = meta.get("field") - if tbl != table_name: - continue - key = (tbl, fld) - if key in seen: - continue - seen.add(key) - row = { - "field": fld,"table":tbl - } - if meta.get("join_hint"): - row["join_hint"] = meta.get("join_hint") - if meta.get("options"): - row["options"] = meta.get("options") - - results.append(row) - if len(results) >= k: - break - return results +# def call_fvs_field_search( +# user_question: str, +# table_name: str, +# selected_tables: List[str], +# k: int = 40, +# ) -> List[Dict[str, Any]]: +# if not user_question or not table_name: +# return [] +# sub_vs = get_sub_vs(selected_tables) +# if sub_vs is None: +# return [] +# # hits = sub_vs.similarity_search(user_question, k=min(60, max(40, k))) +# hits = sub_vs.similarity_search(user_question, k=20) + +# results: List[Dict[str, Any]] = [] +# seen = set() +# for d in hits: +# meta = getattr(d, "metadata", {}) or {} +# tbl = meta.get("table") +# fld = meta.get("field") +# if tbl != table_name: +# continue +# key = (tbl, fld) +# if key in seen: +# continue +# seen.add(key) +# row = { +# "field": fld,"table":tbl +# } +# if meta.get("join_hint"): +# row["join_hint"] = meta.get("join_hint") +# if meta.get("options"): +# row["options"] = meta.get("options") + +# results.append(row) +# if len(results) >= k: +# break +# return results # Node 1: Retrive with Fiass Vector Store. @@ -2204,4 +2414,15 @@ def run_text2sql_pipeline(user_question: str, chat_id: str, request_id: str) -> return _handle_sql_result(final, sql, orm, formatted_q, fields,selected_tables, res, entity_debug, user_question, chat_id) -print(run_text2sql_pipeline("Hye whatsapp there...","22","22")) \ No newline at end of file +print(run_text2sql_pipeline("Hye whatsapp there...","22","22")) + +@frappe.whitelist(allow_guest=False) +def test(user_qstn, session_id): + prompt = inject_prompt(user_qstn, session_id) + + try: + raw = call_model(prompt, "llm") + standalone, contains_values = _parse_rewrite_response(raw, user_qstn) + return standalone, contains_values + except Exception as e: + print(f"Error during model call: {e}") diff --git a/changai/changai/api/v2/train_data_api.py b/changai/changai/api/v2/train_data_api.py index a9169d3..cce2284 100644 --- a/changai/changai/api/v2/train_data_api.py +++ b/changai/changai/api/v2/train_data_api.py @@ -1,7 +1,7 @@ from __future__ import annotations from pathlib import Path import os, json, math, re, time, random, traceback -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List, Tuple,Union import frappe from google.oauth2 import service_account from frappe import _ @@ -91,7 +91,7 @@ def _get_abs_path(module_name: str, folder_path: str,suffix: str = "") -> str: site_path = frappe.get_site_path("private", "files") target_dir = os.path.join(site_path, relative) os.makedirs(target_dir, exist_ok=True) - return os.path.join(target_dir, f"{module_name}{suffix}.jsonl") + return os.path.join(target_dir, f"{module_name}_batch8_{suffix}.jsonl") def _seed_seen_from_disk(abs_path: str) -> Tuple[set, int]: @@ -142,7 +142,7 @@ def _sync_frappe_file_doc(module_name: str, abs_path: str, folder_path: str, suf Create/Update File doc that points to the on-disk file. """ relative = folder_path.replace("Home/", "", 1) - out_file_name = f"{module_name}{suffix}.jsonl" + out_file_name = f"{module_name}_batch8_{suffix}.jsonl" file_url = f"/private/files/{relative}/{out_file_name}" existing = frappe.db.get_value( "File", @@ -310,10 +310,11 @@ def _assign_qids(validated_records: List[dict], module_name: str, existing_count return final_records def _build_claude_messages(module_name, module_description) -> List[dict]: + schema = get_module_schema_str(module_name) return [ { "role": "user", - "content": _val_prompt(module_name, module_description, BATCH_SIZE), + "content": _val_prompt(schema,module_name, module_description, BATCH_SIZE), } ] @@ -606,39 +607,30 @@ def _get_gemini_client(): location=location, credentials=creds, ) - -def _val_prompt(module_name, description, batch_size): - return f""" -You are generating ERPNext schema-retrieval training data. -This is for production testing of a trained retrieval model, so focus on variety and real-world business questions. -CRITICAL OUTPUT RULE: -Return ONLY a valid JSON array. Start with '[' and end with ']'. No markdown. No code fences. No explanation. +def _val_prompt(schema:str,module_name, description, batch_size): + return f"""You are generating ERPNext schema-retrieval TEST data for evaluating a trained retrieval model. +CRITICAL OUTPUT RULE: Return ONLY a valid JSON array. Start with '[' and end with ']'. No markdown, code fences, or explanations. +FORMAT: +[{{"anchor":"query text","positives":["[TABLE] tabX","[FIELD] field_name | [TABLE] tabX"]}}] ANCHOR RULES: -- Questions must be data fetch/lookup intent only -- Casual, messy, real chat style. -- Mix styles: ultra-short ("helmet stock dubai"), casual ("how many cement bags in sharjah?"), urgency ("need diesel qty in muscat asap"), doubt ("any PVC fittings left in doha or not?") et.. -- cover all styles of queriy types in the output, with no specific ratio. The more variety the better. -- Do NOT mention DocType or table names in anchor. -- Use business wording and synonyms. -[ - {{ - "anchor": "which suppliers didnt deliver on time last month?", - "positives": [ - "[TABLE] tabPurchase Order", - "[FIELD] supplier | [TABLE] tabPurchase Order", - "[FIELD] schedule_date | [TABLE] tabPurchase Order", - "[FIELD] status | [TABLE] tabPurchase Order" - ] - }} -] -REQUIREMENTS: -- Generate EXACTLY {batch_size} UNIQUE objects -- NEVER include Doctype names in questions -- Casual business language -- Grammar mistakes allowed -- This is for testing a trained Model in retrieval, so focus on covering all varieties and production oriented questions. -- {module_name}: {description} -""".strip() +- Generate a realistic mix of query types: simple (single table), medium (join OR aggregation OR time filter), complex (multi-condition, joins + aggregation + reasoning). +- Do NOT force all queries to be complex. +STYLE VARIATION: +- Include short, natural, business, messy/typo, urgent, and ambiguous queries. +LOGIC COVERAGE: +- Include filters, aggregations, ranking, joins, time-based queries, comparisons, and some complex reasoning. +STRICT RULES: +- NEVER mention table/doctype names in anchor. +- Use realistic business language. +- Ensure anchors are UNIQUE. +POSITIVES RULES: +- Include ALL required tables and fields. +- Include identifier, filter, aggregation, and join fields if needed. +- Do NOT miss required fields or include unrelated ones. +FINAL: +- Generate EXACTLY {batch_size} UNIQUE objects. +- {schema} +Use this Given schema only for produciton grade test data generation.""".strip() @@ -734,8 +726,9 @@ def _generate_and_store_module_records( # validated_records, removed = _validate_records(raw_records) # if not validated_records: - # continue - + # # continue + # try: + # neg_records = raw_records try: final_records = _assign_qids(raw_records, module_name, existing_count) @@ -837,19 +830,19 @@ def start_train(modules: str, total_count: int): timeout=14400, modules=modules, total_count=total_count, - path="Home/Training Data/Batch 7", + path="Home/Training Data/Batch 10", use_claude=False, use_gemini=True ) - frappe.enqueue( - "changai.changai.api.v2.train_data_api.generate_data", - queue="long", - timeout=14400, - modules=modules, - total_count=val_count, - path="Home/Validation Data/Batch 7", - use_claude=True, # <-- Claude - ) + # frappe.enqueue( + # "changai.changai.api.v2.train_data_api.generate_data", + # queue="long", + # timeout=14400, + # modules=modules, + # total_count=val_count, + # path="Home/Validation Data/Batch 8", + # use_claude=True, # <-- Claude + # ) return {"ok": True, "message": "Training and validation jobs queued."} def _build_gemini_system_instruction() -> str: @@ -862,11 +855,10 @@ def _build_gemini_system_instruction() -> str: def _build_gemini_contents(module_name: str, module_description, wrong_examples) -> List[dict]: try: - prompt = _training_prompt( - module_name, - module_description, + schema=get_module_schema_str(module_name) + prompt = _training_prompt_1( + schema, BATCH_SIZE, - wrong_examples, ) except Exception as e: frappe.log_error( @@ -949,7 +941,6 @@ def _parse_gemini_json_array(raw: str) -> List[dict]: return arr if isinstance(arr, list) else [] - def _extract_valid_records(arr: List[dict], seen_anchors: set) -> List[dict]: records = [] @@ -957,59 +948,77 @@ def _extract_valid_records(arr: List[dict], seen_anchors: set) -> List[dict]: if not isinstance(obj, dict): continue - anchor = (obj.get("anchor") or "").strip() + raw_anchor = obj.get("anchors") or obj.get("anchor") or "" + + if isinstance(raw_anchor, list): + anchors_list = [a.strip() for a in raw_anchor if isinstance(a, str) and a.strip()] + anchor = anchors_list[0] if anchors_list else "" + elif isinstance(raw_anchor, str): + anchor = raw_anchor.strip() + anchors_list = [anchor] + else: + anchor = "" + anchors_list = [] + positives = obj.get("positives") + qid = obj.get("qid", "") if not anchor or not isinstance(positives, list) or not positives: continue - if anchor in seen_anchors: + if any(a in seen_anchors for a in anchors_list): continue - seen_anchors.add(anchor) - records.append({"anchor": anchor, "positives": positives}) + seen_anchors.update(anchors_list) - return records + # Expand each paraphrase into its own record + for i, a in enumerate(anchors_list): + records.append({ + "qid": f"{qid}_p{i}" if i > 0 else qid, # Stock_026, Stock_026_p1, Stock_026_p2 + "anchor": a, + "positives": positives + }) -def _call_openai_correction(raw:str): - try: - cleaned_res = json.loads(raw) - except Exception as e: - frappe.log_error(title="Cleaning failed", message=frappe.get_traceback()) - return [] + return records +# def _call_openai_correction(raw:str): +# try: +# cleaned_res = json.loads(raw) +# except Exception as e: +# frappe.log_error(title="Cleaning failed", message=frappe.get_traceback()) +# return [] - try: - openai_client=_get_openai_client() + # try: + # openai_client=_get_openai_client() - # validate records first - for i, record in enumerate(cleaned_res): - if not isinstance(record, dict): - raise ValueError(f"Record {i} is not a dict") - if not record.get("anchor") or not record.get("positives"): - raise ValueError(f"Record {i} missing anchor or positives") + # # validate records first + # for i, record in enumerate(cleaned_res): + # if not isinstance(record, dict): + # raise ValueError(f"Record {i} is not a dict") + # if not record.get("anchor") or not record.get("positives"): + # raise ValueError(f"Record {i} missing anchor or positives") - corrected_all = [] + # corrected_all = [] - # process in batches of 5 - for i in range(0, len(cleaned_res), 5): - batch = cleaned_res[i:i+5] - corrected_raw = _call_openai_batch_with_retry(openai_client, input_raw=batch,module_name=None, module_description=None) + # # process in batches of 5 + # for i in range(0, len(cleaned_res), 5): + # batch = cleaned_res[i:i+5] + # corrected_raw = _call_openai_batch_with_retry(openai_client, input_raw=batch,module_name=None, module_description=None) - # if OpenAI returns JSONs string - corrected_batch = json.loads(corrected_raw) + # # if OpenAI returns JSONs string + # corrected_batch = json.loads(corrected_raw) - if not isinstance(corrected_batch, list): - raise ValueError(f"Corrected batch starting at index {i} is not a list") + # if not isinstance(corrected_batch, list): + # raise ValueError(f"Corrected batch starting at index {i} is not a list") - corrected_all.extend(corrected_batch) + # corrected_all.extend(corrected_batch) - except Exception as e: - frappe.log_error(title="Correction failed", message=frappe.get_traceback()[:4000]) - return [] +# except Exception as e: +# frappe.log_error(title="Correction failed", message=frappe.get_traceback()[:4000]) +# return [] - return corrected_all +# return corrected_all -import json +# import json def _generate_batch_gemini( client, @@ -1057,23 +1066,23 @@ def _generate_batch_gemini( return [] # Step 2: content correction only after valid JSON exists - try: - corrected_raw = _call_openai_correction( - json.dumps(arr, ensure_ascii=False) - ) - if corrected_raw: - corrected_arr = ( - corrected_raw - if isinstance(corrected_raw, list) - else _parse_gemini_json_array(corrected_raw) - ) - if isinstance(corrected_arr, list): - arr = corrected_arr - except Exception as e: - frappe.log_error( - title="OpenAI content correction failed", - message=str(e) - ) + # try: + # corrected_raw = _call_openai_correction( + # json.dumps(arr, ensure_ascii=False) + # ) + # if corrected_raw: + # corrected_arr = ( + # corrected_raw + # if isinstance(corrected_raw, list) + # else _parse_gemini_json_array(corrected_raw) + # ) + # if isinstance(corrected_arr, list): + # arr = corrected_arr + # except Exception as e: + # frappe.log_error( + # title="OpenAI content correction failed", + # message=str(e) + # ) # keep original arr return _extract_valid_records(arr, seen_anchors) @@ -1245,21 +1254,214 @@ def _training_prompt(module_name: str, module_description: str, batch_size: int, Make sure positives' must be a SINGLE-LEVEL list of strings.DO NOT use objects, nested lists, or dictionaries inside 'positives'. """.strip() +import frappe +@frappe.whitelist(allow_guest=False) +def get_module_schema_str(module_name: str) -> str: + + SKIP_FIELDTYPES = { + "Section Break", "Column Break", "Tab Break", "HTML", + "Heading", "Button", "Fold", "Table", "Table MultiSelect", + "Text Editor", "Small Text", "Read Only", "Attach Image", + "Attach", "Color", "Signature", "Geolocation", "Code" + } + + SKIP_FIELDNAMES = { + "address_display", "contact_display", "other_charges_calculation", + "base_in_words", "in_words", "scan_barcode", "last_scanned_warehouse", + "company_address_display", "shipping_address", "dispatch_address", + "base_rounding_adjustment", "rounding_adjustment", "disable_rounded_total", + "ignore_pricing_rule", "plc_conversion_rate", "price_list_currency", + "has_unit_price_items", "group_same_items", "select_print_heading", + "letter_head", "auto_repeat", "tc_name", "terms", "packed_items", + "pricing_rules", "payment_schedule", "named_place", "incoterm", + "reserve_stock", "set_warehouse" + } + + doctypes = frappe.get_all( + "DocType", + filters={"module": module_name, "istable": 0}, + fields=["name"] + ) + + parts = [] + for dt in doctypes: + try: + meta = frappe.get_meta(dt["name"]) + fieldnames = [] + for f in meta.fields: + if f.fieldname in SKIP_FIELDNAMES: + continue + if f.fieldtype in SKIP_FIELDTYPES: + continue + fieldnames.append(f.fieldname) + + if fieldnames: + parts.append(f"tab{dt['name']}({', '.join(fieldnames)})") + + except Exception: + continue + + return " | ".join(parts) + + +def _training_prompt_1(schema: str, batch_size: int) -> str: + hard_n = (batch_size * 3) // 10 + std_n = batch_size - hard_n + return f""" +You are a senior ERPNext architect and Text2SQL dataset designer. +Generate exactly {batch_size} COMPLEX training records as RAW JSON ARRAY only. +━━━ COMPLEXITY MANDATE ━━━ +Every anchor MUST require at least one of: + • Multi-table JOIN (2+ tables minimum) + • Aggregation + GROUP BY (SUM / COUNT / AVG with grouping dimension) + • Time filter + aggregation combined + • Comparison / mismatch logic (e.g. expected vs actual, billed vs delivered) + • Anomaly detection (missing links, zero where non-zero expected, NULL references) + • Cross-module reasoning (tables from 2+ ERPNext modules) + • Ranking (TOP N by a computed metric) +DO NOT generate: + • Single-table queries + • Simple lookup or filter-only queries + • "list all X" or "show me X" style queries +Distribution: {hard_n} anomaly/mismatch/cross-module | {std_n} aggregation/trend/ranking +━━━ QUERY GENERATION RULES ━━━ +Think deeply about the given schema and its business processes. +For this module, identify: + • What are the most important business KPIs and metrics? + • What are common business anomalies or exceptions in this module? + • What cross-module workflows does this module participate in? + • What time-based trends matter for this module? + • What comparisons or mismatches would a business analyst care about? +Then generate anchors that reflect real complex business questions a manager, +analyst, or accountant would ask about the given schema — not a developer. +Style mix: + • Urgent phrasing ("which ones are at risk", "need to flag immediately") + • Casual business phrasing ("which customers are we losing money on") + • Typo/messy phrasing ("custmer with hghest prfit lst mnth") + • Domain-specific phrasing ("show COGS vs revenue by SKU this quarter") +━━━ SQL REASONING (do internally before building positives) ━━━ +For every anchor identify: + SELECT → fields to display + WHERE → all filter conditions (status, date, amount, flag, type) + GROUP BY → grouping dimension + ORDER BY → sort/ranking field + AGG → SUM/COUNT/AVG field + JOIN → all linking fields + all linked tables + DATE → correct date field for this module's time filters + CHILD → child/detail table if line-item or ledger rows needed + ANOMALY → the specific field whose NULL or absence defines the missing link +━━━ DESCRIPTION QUALITY (CRITICAL — this directly affects retrieval) ━━━ +Format: + "[FIELD] fieldname | [TABLE] tabDoctype | desc: ..." + "[TABLE] tabDoctype | desc: ..." +Do not give simple queries. Give complicated queries that require multiple tables, joins, filters, and aggregations. +Make sure with this dataset i wouldnot get any JSON parse Error.That's important. So please follow the format and rules strictly. +Every field description MUST contain ALL of: + 1. BUSINESS PURPOSE — what this field means in plain business English + BAD: "date field" + GOOD: "date the transaction was recorded in books; used for period filtering and trend analysis" + 2. SYNONYMS — business words that map to this field + BAD: "net rate of item" + GOOD: "selling rate per unit after discounts; synonyms: sale price, unit price, discounted rate" + 3. For COST / VALUATION fields — profit/loss relevance explicitly stated + GOOD: "cost price at which item was stocked at time of sale; compare with selling rate to calculate + profit or loss per unit — if this exceeds selling rate, item was sold at a loss; + synonyms: cost price, buying price, valuation rate, COGS" + 4. For AMOUNT / NUMERIC fields — financial meaning + aggregation purpose + GOOD: "total net revenue for this line (rate x qty); SUM per customer to get total revenue" + 5. For STATUS fields — list actual ERPNext option values + GOOD: "payment status; values: Draft, Unpaid, Paid, Partly Paid, Overdue, Cancelled — + filter Unpaid/Overdue for receivables analysis" + 6. For DATE fields — which business event + filtering use + GOOD: "date invoice posted to books; use for monthly/quarterly trends and date range filters" + 7. For FK / LINK fields — what it joins + why join is needed + GOOD: "links to Customer master; required to group by customer or filter by customer segment" + 8. For ANOMALY fields — what NULL or absence means + GOOD: "reference to originating Sales Order; if NULL on Delivery Note means delivery made + without a sales order — use to detect unlinked deliveries" +Every table description MUST contain: + • What business entity/transaction this table represents + • When it is required (e.g. "required for line-item level analysis") + • Synonyms if table name doesn't match business language + BAD: "child table" + GOOD: "line-item rows of a sales invoice; contains per-item price, cost, and quantity — + required for item-level profit, margin, or discount analysis; + synonyms: invoice lines, sold items, invoice details" +━━━ ANCHOR PARAPHRASES (MANDATORY) ━━━ +For every record, generate exactly 3 phrasings of the same question: + • Original: formal business analyst style + • Paraphrase 2: casual manager phrasing ("which products are we losing money on") + • Paraphrase 3: messy/typo style ("itms whr cost exceeds selng pric lst qtr") +Store all 3 in "anchors" list instead of single "anchor" field. +All 3 must map to the SAME positives — same tables, same fields. +━━━ POSITIVES RULES ━━━ + • Flat list of strings only — no objects, dicts, or nested lists + • Every string MUST follow exactly one of these two formats: + "[FIELD] fieldname | [TABLE] tabDoctype | desc: ..." + "[TABLE] tabDoctype | desc: ..." + • NO raw SQL — do NOT include SUM(), GROUP BY, WHERE, HAVING, JOIN, + ORDER BY, LIMIT, COUNT(), AVG(), DATE(), Comparison:, or any SQL + fragment as a standalone string. Encode all logic into field descriptions instead. + BAD: "SUM(amount)" + BAD: "WHERE purchase_order IS NULL" + BAD: "Comparison: valuation_rate > standard_selling_rate" + GOOD: "[FIELD] amount | [TABLE] tabPurchase Order Item | desc: net amount per line; SUM per supplier to get total spend" + GOOD: "[FIELD] purchase_order | [TABLE] tabPurchase Receipt | desc: links to originating PO; if NULL, receipt was created without a PO — use to detect unlinked receipts" + • Include ALL: root tables, child tables, lookup/master tables, + SELECT fields, WHERE fields, GROUP BY fields, + ORDER BY fields, aggregation fields, JOIN fields + • Exclude: creation, modified, owner, parenttype, parentfield, idx, docstatus + • Never invent tables or fields not in standard ERPNext + • For profit/loss/margin queries: ALWAYS include both cost field AND selling rate field + • For time queries: ALWAYS include correct date field for this module + • For anomaly queries: ALWAYS include the field whose NULL defines the missing link + • For cross-module: ALWAYS include tables and link fields from ALL modules involved +━━━ VALIDATION (run for every record before output) ━━━ + ✓ Is query genuinely complex — multi-table / aggregation / anomaly / cross-module? + ✓ Can correct SQL be generated from positives alone? + ✓ For profit/margin/loss: cost field AND selling field both present? + ✓ For time queries: correct date field present? + ✓ For aggregation: numeric field AND grouping field present? + ✓ For anomaly: NULL-defining link field present? + ✓ For cross-module: tables from ALL modules present? + ✓ Are descriptions rich enough to match a business query without seeing the field name? + ✓ Do synonyms bridge business language to technical field names? + ✓ Does every positive string start with [FIELD] or [TABLE]? If not — fix it. + ✓ Are there ANY raw SQL fragments in positives? If yes — remove and encode into desc instead. +If any check fails — fix before output. +━━━ OUTPUT FORMAT ━━━ +Raw JSON array only. No markdown. No explanation. No trailing commas. No comments. +First char '[' last char ']'. Exactly {batch_size} records. +{{ + "qid": "_", + "anchors": ["", "", ""], + "positives": ["", ""] +}} +- Format: RAW JSON ARRAY ONLY. No markdown/prose. +Important !!! Format: RAW JSON ARRAY ONLY. No markdown/prose. +Make sure 'positives' must be a SINGLE-LEVEL list of strings. DO NOT use objects, nested lists, or dictionaries inside 'positives'. +IMPORTANT !!! +schema : {schema} +Never ever write the training data with any fields or tables that do not exist in this given schema. +IMPORTANT !!! Use only the above schema to generate training data. +OUTPUT: RAW JSON ARRAY [{batch_size} records]. Start '[' end ']'. +- Do not add explanations, notes, markdown, or extra text like json. +Your entire response must be raw JSON only. +Do NOT include ```json, ```, or any other text before or after the JSON. +Start your response with [ and end with ]. +""".strip() + def __correction_prompt(input_raw) -> str: return f""" Act as an ERPNext data validation expert. - Task: {input_raw} - Review the training records provided above. For each record, carefully inspect: 1. the anchor 2. the positives list - Your goal is to improve the quality of the dataset for embedding-model training, where the model must retrieve the correct ERPNext tables and fields needed for SQL generation. - Instructions: - Check whether all required tables and fields needed to answer the anchor are present in positives. - Add any missing required tables or fields that are semantically necessary to answer the query. @@ -1281,9 +1483,13 @@ def __correction_prompt(input_raw) -> str: - if the positives are not in this below format.Make it in this below fomat also: [TABLE] | desc: [FIELD] | [TABLE] | desc: +Use only those fields and tables that exist in ERPNext. Output rules: - Return the corrected records only. - Preserve the same JSONL structure (one JSON object per line). -- Do not add explanations, notes, markdown, or extra text. +- Do not add explanations, notes, markdown, or extra text like json . +Your entire response must be raw JSON only. +Do NOT include ```json, ```, or any other text before or after the JSON. +Start your response with [ and end with ] . - Output raw JSON only. """.strip() \ No newline at end of file diff --git a/changai/changai/prompts/sql_prompt.txt b/changai/changai/prompts/sql_prompt.txt index 258b16f..30cf320 100644 --- a/changai/changai/prompts/sql_prompt.txt +++ b/changai/changai/prompts/sql_prompt.txt @@ -29,15 +29,11 @@ HARD CONSTRAINT (MARIADB COMPATIBILITY — ZERO TOLERANCE): - DO NOT USE : STRFTIME, DATE_TRUNC, DATE_TRUNC, ::, ILIKE, TRUE/FALSE literals, INTERVAL 'x', EXTRACT, TO_CHAR, NOW()::,When generating SQL as MariaDB not supports it. - If you are about to output any forbidden token, you MUST REWRITE the query using MariaDB equivalents. - If you cannot express the date logic using MariaDB functions, you MUST NOT output a query that contains forbidden tokens. - INPUT: - USER QUESTION: {question} - SCHEMA CONTEXT: {context} - ENTITY FILTERING (ABSOLUTE): - If ENTITY_CARD exists, you MUST use entity values ONLY from ENTITY_CARD. - 🔒 The EXACT SAME entity literal values MUST be used in BOTH SQL and ORM. @@ -53,7 +49,6 @@ Example: "sql": "", "orm": "" }} - FORBIDDEN TOKENS (HARD FAIL): - STRFTIME, DATE_TRUNC, ::, ILIKE, TO_CHAR, NOW()::, INTERVAL 'x' DATE FILTERING (WHITELIST ONLY — MUST FOLLOW): @@ -61,6 +56,10 @@ DATE FILTERING (WHITELIST ONLY — MUST FOLLOW): A. (MONTH(date_col) = MONTH(CURDATE()) AND YEAR(date_col) = YEAR(CURDATE())) B. (date_col BETWEEN DATE_SUB(CURDATE(), INTERVAL DAYOFMONTH(CURDATE())-1 DAY) AND LAST_DAY(CURDATE())) - You MUST NOT use STRFTIME, DATE(), DATE_FORMAT, TO_CHAR, EXTRACT, DATE_TRUNC, or any other formatting function for month filtering. +- Use ONLY: DATE_SUB(date, INTERVAL n UNIT) +- NEVER use: DATE_SUB(date, n, UNIT) +- Use YEAR(), MONTH(), QUARTER() for filtering +- Do NOT use non-MariaDB date functions VALIDATION STEP (MANDATORY BEFORE OUTPUT): Before returning the result, internally verify: @@ -78,3 +77,22 @@ ALLOWED ERPNext JOIN-KEY EXCEPTION: - Never miss adding the tab prefix for table names, as this SQL will run on MariaDB. Only output the final JSON if ALL validations pass. Do not output the validation steps or any explanation. +### CRITICAL SCHEMA-GROUNDING LAW + +Field names and table/document names mentioned in the USER QUESTION are ONLY semantic hints. + +They are NOT proof that the field or table exists. + +Before using ANY table or field in SQL or ORM: + +1. The table MUST exist exactly in SCHEMA CONTEXT. +2. The field MUST exist exactly under that same table in SCHEMA CONTEXT. +3. A field mentioned in the question MUST NOT be used unless it appears in SCHEMA CONTEXT for the selected table. +4. A table/document mentioned in the question MUST NOT be used unless it appears in SCHEMA CONTEXT. +5. Do NOT invent, infer, rename, translate, or substitute fields. +6. Do NOT borrow fields from another table. +7. Do NOT use a field only because it appears in the rewritten question. +8. If the required field/table is not in SCHEMA CONTEXT, omit it or return an empty SQL string. + +ABSOLUTE RULE: +Any SQL using a table or field not present in SCHEMA CONTEXT is INVALID, even if it sounds correct in ERPNext. diff --git a/changai/changai/setup/install.py b/changai/changai/setup/install.py new file mode 100644 index 0000000..f78b654 --- /dev/null +++ b/changai/changai/setup/install.py @@ -0,0 +1,8 @@ +import subprocess +import sys + +def install_system_deps(): + subprocess.run([ + "sudo", "apt", "install", "-y", + "libffi-dev", "build-essential" + ], check=True) \ No newline at end of file diff --git a/changai/hooks.py b/changai/hooks.py index 1efb92f..940a15e 100644 --- a/changai/hooks.py +++ b/changai/hooks.py @@ -115,7 +115,8 @@ # Installation # ------------ -# before_install = "changai.install.before_install" +before_install = "changai.setup.install.install_system_deps" + # after_install = "changai.changai.api.v2.install.after_install" # after_migrate = "changai.changai.api.v2.install.after_migrate" # Uninstallation diff --git a/pyproject.toml b/pyproject.toml index 4d780fa..7fe6d05 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,15 +14,16 @@ dependencies = [ "google-genai>=1.60.0,<2.0.0", "google-auth>=2.38.0", "google-cloud-aiplatform>=1.135.0,<2.0.0", - "huggingface_hub", "langchain-core", "langchain-community", "langchain-huggingface", "langgraph", - "transformers>=4.48.0", - "sentence-transformers>=3.0.0", + "symspellpy>=6.7.7,<7.0.0", + "transformers>=4.49.0,<5.0.0", + "sentence-transformers>=3.0.0,<4.0.0", + "huggingface_hub>=0.23.0,<1.0.0", "faiss-cpu>=1.7.0", - + "zstandard>=0.23.0", # Critical Framework Compatibility "numpy>=1.22.0,<2.0.0", # DO NOT REMOVE <2.0.0 (Breaks ERPNext) "sqlglot>=27.0.0",