From 2832f6c642657bdd6f9df95d1f01cfa181d1c496 Mon Sep 17 00:00:00 2001 From: Hyrin-mansoor Date: Fri, 8 May 2026 14:00:59 +0300 Subject: [PATCH 1/2] feat: implement Claude API client and enhance translation functionality; add new user input responses; refactor prompts for SQL and support interactions; update hooks for improved initialization and migration handling --- changai/changai/api/v2/ai_translate.py | 28 ++- .../api/v2/assets/changai_alias_map.json | 1 + .../non_erp_combined.processed.cache.pkl | Bin 2496849 -> 2496947 bytes .../v2/assets/non_erp_combined.processed.json | 7 + changai/changai/api/v2/format_output.py | 3 +- changai/changai/api/v2/schema_utils.py | 1 - changai/changai/api/v2/store_chats.py | 12 +- .../changai/api/v2/text2sql_pipeline_v2.py | 233 ++++++++++++++---- changai/changai/prompts/sql_prompt.txt | 13 +- .../prompts/sql_rewrite_sys_prompt.txt | 79 ++++++ .../prompts/sql_rewrite_user_prompt.txt | 5 + changai/changai/prompts/sql_system_prompt.txt | 83 +++++++ changai/changai/prompts/sql_user_prompt.txt | 3 + .../changai/prompts/support_sys_prompt.txt | 25 ++ .../changai/prompts/support_user_prompt.txt | 1 + changai/hooks.py | 12 +- pyproject.toml | 4 +- 17 files changed, 450 insertions(+), 60 deletions(-) create mode 100644 changai/changai/prompts/sql_rewrite_sys_prompt.txt create mode 100644 changai/changai/prompts/sql_rewrite_user_prompt.txt create mode 100644 changai/changai/prompts/sql_system_prompt.txt create mode 100644 changai/changai/prompts/sql_user_prompt.txt create mode 100644 changai/changai/prompts/support_sys_prompt.txt create mode 100644 changai/changai/prompts/support_user_prompt.txt diff --git a/changai/changai/api/v2/ai_translate.py b/changai/changai/api/v2/ai_translate.py index e1d2ca3..14a5a5e 100644 --- a/changai/changai/api/v2/ai_translate.py +++ b/changai/changai/api/v2/ai_translate.py @@ -11,6 +11,32 @@ def get_doctype(doc:str,docname: str): def get_settings(): return frappe.get_single("ChangAI Settings") + 
+_CLAUDE_CLIENT = None +_CLAUDE_API_KEY = None + +def get_claude_client(): + global _CLAUDE_CLIENT, _CLAUDE_API_KEY + + settings = get_settings() + api_key = (getattr(settings, "claude_api_key", None) or "").strip() + + if not api_key: + api_key = (os.getenv("ANTHROPIC_API_KEY") or "").strip() + + if not api_key: + frappe.throw( + _("Claude API key is not configured."), + title=_("Missing Claude API Key") + ) + + if _CLAUDE_CLIENT is None or _CLAUDE_API_KEY != api_key: + _CLAUDE_CLIENT = Anthropic(api_key=api_key) + _CLAUDE_API_KEY = api_key + + return _CLAUDE_CLIENT + + @frappe.whitelist(allow_guest=False) def translate_and_store(docname: str, doctype: str, from_field: str, to_field: str, text: str, to_language: str): """ @@ -43,7 +69,7 @@ def translate_and_store(docname: str, doctype: str, from_field: str, to_field: s title=_("Missing Claude API Key") ) try: - client = Anthropic(api_key=api_key) + client = get_claude_client() prompt = f""" Translate the following text into {to_language}. Return ONLY the translated text. 
diff --git a/changai/changai/api/v2/assets/changai_alias_map.json b/changai/changai/api/v2/assets/changai_alias_map.json index 6eb2942..670016b 100644 --- a/changai/changai/api/v2/assets/changai_alias_map.json +++ b/changai/changai/api/v2/assets/changai_alias_map.json @@ -8,6 +8,7 @@ "hlw": "hello", "heyy": "hey", "heyyy": "hey", + "nce": "nice", "heyyyy": "hey", "hai": "hi", "hi": "hi", diff --git a/changai/changai/api/v2/assets/non_erp_combined.processed.cache.pkl b/changai/changai/api/v2/assets/non_erp_combined.processed.cache.pkl index 1ef824db2fd20ff28a1435d3e8741ebd0832c6b3..1d95c918fe1e2b3259d204900ff8eed1cea753de 100644 GIT binary patch delta 1174 zcmX9*eQXnT7~Wm)dh70f*SqU=*RI>zZpiR^LEXl%b9@LfV^|>gfjTufKq3qy5Mo4D zBRfHGjYs=n9IpK1OqPw}GJd4dtYRQ4S?oYr0=P9Q4CP}};~z#Gw&|NZ|Gn??^^Sdh zm>C*6%y6B2%zd1DD8zDGajdL(snvK?MCKhv8)^u3opQv)BBm%m*319-5dML2SvT(7 zYyBkoh8Hra?{K}{;lQqI96@$I(NS@hX|$s6+L6Af6Lg%>-86ERS@kd*yAKL0yRm0P zbR&965f8B~2(RXxh=%Rba0h$%4jZemdF^{ZubrM*)Dy` zUFM65&=YVy_FfkkMGd_cZ=i2NeGlo?iUeg)6KQh~ z|Mn|v%vm6JI&9)_)^5}lQcd);p?s?Q?~#1CQYSW9;k}YMSZ}9;^}?nn(B6=1_|kyx zL32b{7C){UNF0)6b08wnfk^SZx43M+oI6AdTfII9=91zdT{Bzk+nBc)xydcSfzv_@ zvL2BjyvSz9^i7*;dNw$yXG6*KkKES9x${4E@(U1cEg=Y;mWt`KkBT2C1?giUwi@NF z;$lQ^I|L|GjEFCLND%ydc@Ywi*d#OCD$;DLRQ`M6qV{{cfpso@8M0svyno#$((VRw zWgjt`rVDt(P={&NiaaixXO0&X+Z2&;4`TUS7eWn_4@vQ=DK$mx`F#Ny^NcTNVn;##$Zfa7x)QI$)Bm^)(^_8IXDv4X zWzx{m#Cz=MpD@~SVBFBmIgck z*WG9y_nGM3&Di59h9Ki9cbU`UJ~};KUh#K9Iy+nuK+|YN8g_%nEa|qB>D)`iryf+=S zo2pbKlEGpuoDFs&oc3x+K3U~4r)GmWRMDx~>QuV0+xZ61Au(1R#JAV0x^Qx5Ktf`P XSBx_%!I9~pXm+RP(C+kIr=IyA*;KXV delta 1253 zcmXw3du&rx9NvAjx3{OAQ&wM5+g=B zjND;BhNXNwr3s2NA*^H0gX7hhKn#zqb5Xz`TTn1C*_LSsiE;Ce;z_=LPJWN?`+nzl z+S>L~-6!@^Y{V3^wX)NLZTo`y_qg(dQ;LSHBH|3qBJ?XO$6lgJ_*h96l{J2%>+K$N z-gcPVk5J!SL};Q}L!DdZF!{PBq2UrWhpwN5T}`GZ$?un_&)mTnPRI9S}kT7_V*n&9E2qEG3qfR5}-`;_ajmv zX-Jfro!D`}REwECZU<6VtuFArN+Uws%`UwCfRRYzpeejkSYwHa|0`;prsg8P+hiw8 zC^I`NqpSQF9sF#Tf}O=SkA9)lOfHmiCu(39%pAhQjDX;AM-IMd;HvP|Nye>HTR1{( 
zvCR5_y)kaf>yjG6&^Zr}M$r-=?V}dYYNV#fW<+Wk1@V5N0E6eb1<*Qt*=Tx3aO;~^ z@nq8~>%dR!+6tpW@<*x?k@GeyIrX&gseR81O~tu49aebe(r%-QP=T6S!2#bPF@V+6 zr1^+UW^D-HrE>M9iv+TCk&V8GM6F##=(yqJY0Xid$@uUMTEP8&L&JfzsWKYzi}2r* z{Mh)=T!qvV#{II9LKQd8@;2=ThLVX82+ma#=Q*K^ph%);aLptu6buW1#*MWP#D zjr^Oo7o^D%>g^&2Y3tO^AtM?HcP(ukUC;=q3nU25BTNEjDPWe z1ReuPB}bP7GLZxJ&+j0-!%*IJST4dHP6=VDU$!89OD)vvEDEW!s2hG}`^G0#T&OIJ z)~O<<^*I`dq5 zapwe5+?jW`KiZV1VRWL;m~e-0<|9+}NQcjWpF}zW!wZneDNy0>D{$lRXdsetBR-s; wqt}cD4CIrVv4V~y!lMOhwEQ<0$_E2h%uW{W%!Gzx`CdJpEF|e<(b<>(11?F@MF0Q* diff --git a/changai/changai/api/v2/assets/non_erp_combined.processed.json b/changai/changai/api/v2/assets/non_erp_combined.processed.json index 8cfb8d2..fcf47c5 100644 --- a/changai/changai/api/v2/assets/non_erp_combined.processed.json +++ b/changai/changai/api/v2/assets/non_erp_combined.processed.json @@ -1077,6 +1077,13 @@ "priority": 90, "is_active": true }, + { + "category": "conversation", + "user_input": "nice", + "response": "Thanks for the kind words! changAI is here to serve you.", + "priority": 90, + "is_active": true + }, { "category": "conversation", "user_input": "nice to talk to you", diff --git a/changai/changai/api/v2/format_output.py b/changai/changai/api/v2/format_output.py index 0294aab..6ba72e5 100644 --- a/changai/changai/api/v2/format_output.py +++ b/changai/changai/api/v2/format_output.py @@ -1,4 +1,3 @@ -import random from decimal import Decimal from typing import Any, Dict, List, Optional, Set import frappe @@ -584,7 +583,7 @@ def format_sql_response( "source_fields": source_fields, } -@frappe.whitelist(allow_guest=True) +@frappe.whitelist(allow_guest=False) def local_format(sql, sample_rows): row_count = len(sample_rows) result = format_sql_response(sql, row_count, sample_rows) diff --git a/changai/changai/api/v2/schema_utils.py b/changai/changai/api/v2/schema_utils.py index 22b2f8c..146f709 100644 --- a/changai/changai/api/v2/schema_utils.py +++ b/changai/changai/api/v2/schema_utils.py @@ -147,7 +147,6 @@ def 
validate_sql_schema(sql: str, dialect: str = "mysql") -> dict: @frappe.whitelist(allow_guest=False) def check_file_updates(file_name=None): settings = frappe.get_single("ChangAI Settings") - if file_name == "master_data.yaml": last_sync = settings.last_masterdata_sync elif file_name == "schema.yaml": diff --git a/changai/changai/api/v2/store_chats.py b/changai/changai/api/v2/store_chats.py index c6adce7..a2d7961 100644 --- a/changai/changai/api/v2/store_chats.py +++ b/changai/changai/api/v2/store_chats.py @@ -140,16 +140,24 @@ def respond_from_cache(user_question:str): - For vague money questions, clarify the business meaning as actual, ordered, quoted, paid, or outstanding, but do not guess the document type incorrectly. - If the user says "spend", treat it as actual purchase/expense, not quotation or order commitment, unless the user explicitly mentions order, quotation, or planned purchase. - Preserve all filter conditions, status values, and keywords from the original question — never drop them during rewriting. - +- Do NOT add dates, filters, entities, statuses, or assumptions unless explicitly present in the user question or clearly inferred from conversation memory. +Use chat history only when the current query clearly implies continuation or follow-up context. Never assume dates, filters, entities, or conditions from previous messages unless strongly indicated. Chat history: {rows} User: {qstn} +Use only the most relevant tables and fields required for the user query. +Use only valid tables and fields from the provided schema context, regardless of retrieval ranking order. Choose fields based on business meaning and user intent, not rank position. Never invent schema elements. Always return any one clear user-readable business fields, not only technical IDs, unless explicitly requested. If the query is ambiguous, ask for clarification and set "clarify": true. 
""" +USER_PROMPT = """Chat History: +{rows} + +User Question: +{qstn}""" @frappe.whitelist(allow_guest=False) def inject_prompt(user_qstn: str, session_id: str) -> str: rows=get_chat_history(session_id) - prompt=PROMPT_FOLLOWUP.format(rows=rows,qstn=user_qstn) + prompt=USER_PROMPT.format(rows=rows,qstn=user_qstn) return prompt diff --git a/changai/changai/api/v2/text2sql_pipeline_v2.py b/changai/changai/api/v2/text2sql_pipeline_v2.py index c0814a6..7be4302 100644 --- a/changai/changai/api/v2/text2sql_pipeline_v2.py +++ b/changai/changai/api/v2/text2sql_pipeline_v2.py @@ -12,7 +12,9 @@ import time import base64 import sqlglot +from functools import lru_cache from sqlglot import exp +from rapidfuzz import fuzz, process from langgraph.checkpoint.memory import MemorySaver from langchain_community.vectorstores import FAISS from langchain_huggingface import HuggingFaceEmbeddings @@ -47,6 +49,8 @@ _FIELD_DOCS_CACHE = None _FIELD_EMBS_CACHE = None _TABLE_TO_IDX_CACHE = None +_KEYWORDS_SET=None +_KEYWORDS_LIST=None _ASSETS_DIR = Path(frappe.get_app_path("changai", "changai", "api", "v2", "assets")).resolve() _PROMPTS_DIR = Path(frappe.get_app_path("changai", "changai", "prompts")).resolve() CHANGAI_SETTINGS = "ChangAI Settings" @@ -55,7 +59,85 @@ import frappe from typing import Any, Dict, Optional - +SQL_REWRITE_PROMPT = """You are an ERP query rewriter and entity detector. +Return ONLY valid JSON: +{{"standalone_question":"...","contains_values":true/false}} + +TASK 1 — FOLLOW-UP +- If the query depends on previous messages, rewrite it as a complete standalone question. +- Otherwise keep it unchanged. + +TASK 2 — ENTITY DETECTION +contains_values = TRUE: Any noun that refers to a specific named master record +(item name, customer name, supplier name, warehouse name, employee name) +If not sure, also set contains_values = TRUE, otherwise contains_values = FALSE. + +TASK 3 — ERP CONTEXTUAL REWRITE + +1. 
Normalize: +- Fix typos, clear English +- Do NOT change entity values + +2. Complete intent: +- Never change the question's intent — only fix grammar and map ERP terms. + +3. ERP mapping: +- Map generic terms to standard ERPNext concepts based on intent +- Avoid vague words if clearer business terms exist +- Do NOT invent documents or use report names. +Examples: + stock → Bin / Stock Ledger Entry + production → Work Order + finance/profit → GL Entry + +4. Field hints (max 1–2): +Use natural phrasing ("based on", "using"): + sales → grand_total + qty → qty + stock → actual_qty + production → produced_qty + finance → debit / credit + status → status + +5. Time fields: + Sales/Stock/Finance → posting_date + Work Order → actual_start_date / actual_end_date + Timesheet → start_date / end_date + Timesheet Detail → from_time / to_time +- NEVER use posting_date for Timesheet +- NEVER use creation unless asked + +6. Relationships: +- Include linked entities if required + +STYLE: +- Natural business language +- No SQL, no tab* names + +EXAMPLES: +"total sales amount last month" +→ What is the total sales amount from Sales Invoices last month based on grand_total and posting_date? + +"stock in warehouse a" +→ What is the stock quantity in Warehouse A based on actual_qty from Bin? + +"who worked today" +→ Which employees logged time today based on Timesheet start_date or Timesheet Detail from_time? + +STRICT RULES: +- If the query mentions Draft, Submitted, or Cancelled, explicitly include docstatus in the rewritten question. +- Do not add a specific document type unless clearly implied by the user query or required by standard ERPNext business meaning. +- For vague money questions, clarify the business meaning as actual, ordered, quoted, paid, or outstanding — do not guess the document type incorrectly. 
+- If the user says "spend", treat it as actual purchase/expense, not quotation or order commitment, unless the user explicitly mentions order, quotation, or planned purchase. +- Preserve all filter conditions, status values, and keywords from the original question — never drop them during rewriting. +- Do NOT add dates, filters, entities, statuses, or assumptions unless explicitly present in the user question or clearly inferred from conversation memory. +- Use chat history only when the current query clearly implies continuation or follow-up context. Never assume dates, filters, entities, or conditions from previous messages unless strongly indicated. +- Use only the most relevant tables and fields required for the user query. +- Use only valid tables and fields from the provided schema context, regardless of retrieval ranking order. +- Choose fields based on business meaning and user intent, not rank position. +- Never invent schema elements. +- Always return any one clear user-readable business field, not only technical IDs, unless explicitly requested. 
+- If the query is ambiguous, ask for clarification and set "clarify": true.""" def get_symspell(): global sym_spell @@ -181,11 +263,13 @@ def read_asset(file_name: str, base: str = "assets") -> Any: mapping_data = read_asset("metaschema_clean_v2.json", base="assets") CONVERSATION_TEMPLATE = read_asset("conversation_template_v2.j2", base="assets") - -SQL_PROMPT = read_asset("sql_prompt.txt", base="prompts") +SQL_SYS_PROMPT = read_asset("sql_system_prompt.txt", base="prompts") +SQL_PROMPT = read_asset("sql_user_prompt.txt", base="prompts") FORMAT_PROMPT = read_asset("user_friendly_prompt.txt", base="prompts") NON_ERP_PROMPT = read_asset("non_erp_prompt.txt", base="prompts") SUPPORT_PROMPT = read_asset("support.txt", base="prompts") +SUPPORT_USER_PROMPT = read_asset("support_user_prompt.txt", base="prompts") +SUPPORT_SYS_PROMPT = read_asset("support_sys_prompt.txt", base="prompts") FILTER_TABLES = read_asset("filter_tables.txt", base="prompts") filter_fields = read_asset("filter_fields.txt", base="prompts") @@ -378,20 +462,29 @@ def get(cls): frappe.clear_document_cache(CHANGAI_SETTINGS) frappe.local._changai_config = get_settings() return frappe.local._changai_config +_POLLY_CLIENT = None +def get_polly_client(config): + global _POLLY_CLIENT + + if _POLLY_CLIENT is None: + _POLLY_CLIENT = boto3.client( + "polly", + aws_access_key_id=(config.get("aws_access_key_id") or "").strip(), + aws_secret_access_key=(config.get("aws_secret_access_key") or "").strip(), + region_name=(config.get("aws_region") or "us-east-1"), + ) + return _POLLY_CLIENT @frappe.whitelist(allow_guest=False) def synthesize_tts(text: str, voice_id: Optional[str] = None) -> Dict[str, Any]: config = ChangAIConfig.get() - if not bool(config.get("enable_voice_chat")): return {"ok": False, "error": "Voice chat is disabled in settings.", "provider": "browser"} - aws_access_key_id = (config.get("aws_access_key_id") or "").strip() aws_secret_access_key = (config.get("aws_secret_access_key") or "").strip() if 
not aws_access_key_id or not aws_secret_access_key: return {"ok": False, "error": "AWS Polly credentials are missing.", "provider": "browser"} - cleaned_text = re.sub(r"<[^>]*>", " ", text or "") cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip() if not cleaned_text: @@ -401,12 +494,7 @@ def synthesize_tts(text: str, voice_id: Optional[str] = None) -> Dict[str, Any]: cleaned_text = cleaned_text[:2500] try: - polly_client = boto3.client( - "polly", - aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key, - region_name=(config.get("aws_region") or "us-east-1"), - ) + polly_client = get_polly_client(config) voice = (voice_id or config.get("polly_voice_id") or "Joanna").strip() or "Joanna" response = polly_client.synthesize_speech( Text=cleaned_text, @@ -530,13 +618,13 @@ def extract_tables_from_sql(sql: str) -> List[str]: return tables -def call_model(prompt: str, task: str = "llm") -> Any: +def call_model(prompt: str, task: str = "llm",sys_prompt: str = "") -> Any: config = ChangAIConfig.get() if config["REMOTE"] and config["llm"] == "QWEN3": return remote_llm_request_deploy_test(prompt=prompt, task=task) else: if config["llm"] == "Gemini": - return call_gemini(prompt) + return call_gemini(prompt,sys_prompt) def _post_json(url: str, headers: Dict[str, str], payload: Dict[str, Any], timeout: int = 120): @@ -705,13 +793,13 @@ def gemini_client(): _GEMINI_CLIENT = _build_gemini_client(config) return _GEMINI_CLIENT -def call_gemini(prompt: str) -> Union[str, Dict[str, Any]]: +def call_gemini(prompt: str,sys_prompt: str) -> Union[str, Dict[str, Any]]: try: # frappe.clear_document_cache(CHANGAI_SETTINGS) client = gemini_client() gemini_config = types.GenerateContentConfig( - system_instruction="You are an ERPNext assistant.Follow the task instructions exactly.", + system_instruction=sys_prompt, ) response = client.models.generate_content( model=MODEL_ID, @@ -831,8 +919,13 @@ class SQLState(TypedDict, total=False): selected_fields: str 
-def is_erp_query(q: str, keywords: list[str]) -> bool: - return any(kw in q for kw in keywords) +def is_erp_query(q: str) -> tuple[bool, str]: + _init_keywords() + for word in q.lower().split(): + is_erp = _word_is_erp(word) + if is_erp: + return True + return False def correct_spelling(text: str) -> str: sym = get_symspell() @@ -850,8 +943,7 @@ def guardrail_router(state: SQLState) -> SQLState: raw_q = state.get("formatted_q") or state.get("question") or "" q = str(raw_q).lower().strip() q_corrected = correct_spelling(q) - is_erp = is_erp_query(q_corrected, BUSINESS_KEYWORDS) - + is_erp= is_erp_query(q_corrected) query_type = "ERP" if is_erp else "NON_ERP" state["query_type"] = query_type @@ -905,16 +997,17 @@ def _parse_rewrite_response(raw: Any, user_qstn: str) -> Tuple[str, bool]: return standalone or user_qstn.strip(), contains_values - +SQL_REWRITE_SYS_PROMPT = read_asset("sql_rewrite_sys_prompt.txt", base="prompts") +SQL_REWRITE_USER_PROMPT = read_asset("sql_rewrite_user_prompt.txt", base="prompts") def rewrite_question(state: SQLState) -> SQLState: request_id = state.get("request_id") user_qstn = state.get("question") or "" session_id = state.get("session_id") - + sys_prompt = SQL_REWRITE_SYS_PROMPT prompt = inject_prompt(user_qstn, session_id) try: - raw = call_model(prompt, "llm") + raw = call_model(prompt, "llm",sys_prompt) standalone, contains_values = _parse_rewrite_response(raw, user_qstn) publish_pipeline_update( @@ -1375,7 +1468,7 @@ def generate_sql(state:SQLState) -> SQLState: else: prompt=fill_sql_prompt(formatted_q,state["context"]) try: - response=call_model(prompt) + response=call_model(prompt,"llm",SQL_SYS_PROMPT) if not response: return {**state, "error": "Empty response from LLM", "sql_prompt": prompt} if isinstance(response, str): @@ -1481,7 +1574,7 @@ def get_master_vs(): @frappe.whitelist() def local_entity_embedder(q: str) -> List[Dict[str, Any]]: - hits = get_master_vs().similarity_search(q, k=4) + hits = 
get_master_vs().similarity_search(q, k=10) out, seen = [], set() for h in hits: entity_type = h.metadata.get("entity_type") @@ -1534,10 +1627,10 @@ def repair_sqlquery(state: SQLState) -> SQLState: sql_prompt = state.get("sql_prompt") if not sql_prompt: return {**state, "tries": tries, "error": "No SQL prompt to repair from"} - patched_prompt = sql_prompt + "\n\n#VALIDATION HINTS\n" + "\n".join(f"-{h}" for h in hints) + patched_prompt =sql_prompt + "\n\n#VALIDATION HINTS\n" + "\n".join(f"-{h}" for h in hints) try: - response = call_model(patched_prompt,"llm") + response = call_model(patched_prompt,"llm",SQL_SYS_PROMPT) if isinstance(response, str): try: response = json.loads(response) @@ -1846,8 +1939,8 @@ def execute_query(sql: str, doctypes: List[str]) -> Any: def support_bot(message: str) -> Dict[str, Any]: user_email = frappe.session.user full_name = frappe.get_value("User", frappe.session.user, "full_name") - prompt = SUPPORT_PROMPT.format(user_message=message) - raw = call_gemini(prompt) + prompt = SUPPORT_USER_PROMPT.format(user_message=message) + raw = call_gemini(prompt, SUPPORT_SYS_PROMPT) output = json.loads(raw) task_flag = (output.get("task_flag") or "UNKNOWN").strip() ticket_id = output.get("ticket_id") @@ -1928,24 +2021,24 @@ def format_data(qstn: str, sql_data: Any) -> Dict[str, str]: else: db_result_json = str(sql_data) if sql_data is not None else "{}" - prompt = f""" + sys_prompt = """ INSTRUCTIONS: - Convert raw database results into a short, friendly, human-readable answer. - You may use BOTH: (1) the user question and (2) the DB result JSON to form the answer. - Use ONLY values present in the JSON. NEVER invent numbers or fields. - Keep the answer brief (1–6 lines). - If the question asks for last/top/highest/total, interpret based strictly on the JSON rows. - -QUESTION: -{qstn} - -DATABASE_RESULT_JSON: -{db_result_json} - OUTPUT: Write a clear final answer for the user based strictly on the JSON above. 
""" - output = call_model(prompt=prompt) + user_prompt=f""" + QUESTION: + {qstn} + + DATABASE_RESULT_JSON: + {db_result_json} + """ + output = call_model(user_prompt,"llm",sys_prompt) answer = str(output) return {"answer": answer} @@ -2446,16 +2539,16 @@ def run_text2sql_pipeline(user_question: str, chat_id: str, request_id: str) -> -@frappe.whitelist(allow_guest=False) -def test(user_qstn, session_id): - prompt = inject_prompt(user_qstn, session_id) +# @frappe.whitelist(allow_guest=False) +# def test(user_qstn, session_id): +# prompt = inject_prompt(user_qstn, session_id) - try: - raw = call_model(prompt, "llm") - standalone, contains_values = _parse_rewrite_response(raw, user_qstn) - return standalone, contains_values - except Exception as e: - print(f"Error during model call: {e}") +# try: +# raw = call_model(prompt, "llm") +# standalone, contains_values = _parse_rewrite_response(raw, user_qstn) +# return standalone, contains_values +# except Exception as e: +# print(f"Error during model call: {e}") def load_on_startup(): @@ -2481,6 +2574,50 @@ def load_on_startup(): load_field_matrix() gemini_client() get_master_vs() + _init_keywords() frappe.logger().info("ChangAI: All components loaded into memory") + config = ChangAIConfig.get() + get_polly_client(config) except Exception as e: - frappe.logger().error(f"ChangAI startup load failed: {e}") \ No newline at end of file + frappe.logger().error(f"ChangAI startup load failed: {e}") + + +def _init_keywords(): + global _KEYWORDS_SET, _KEYWORDS_LIST + if not _KEYWORDS_SET: + _KEYWORDS_SET = set(kw.lower() for kw in BUSINESS_KEYWORDS) + _KEYWORDS_LIST = list(_KEYWORDS_SET) + + # ✅ pre-warm cache — run every keyword through _word_is_erp at startup + for kw in _KEYWORDS_LIST: + _word_is_erp(kw) # result gets cached — first real request is instant + + +@lru_cache(maxsize=None) +def _word_is_erp(word: str) -> tuple[bool, str]: + """Returns (is_erp, matched_keyword)""" + if len(word) <= 3: + return False + + # 1. 
exact + if word in _KEYWORDS_SET: + return True + + # 2. substring + for kw in _KEYWORDS_SET: + if word in kw or kw in word: + return True + + # 3. fuzzy + if len(word) >= 4: + match = process.extractOne( + word, + _KEYWORDS_LIST, + scorer=fuzz.ratio, + score_cutoff=85 + ) + if match: + return True + + return False, "" + diff --git a/changai/changai/prompts/sql_prompt.txt b/changai/changai/prompts/sql_prompt.txt index f054902..b95930c 100644 --- a/changai/changai/prompts/sql_prompt.txt +++ b/changai/changai/prompts/sql_prompt.txt @@ -93,7 +93,16 @@ SYNTAX RULES: Before outputting SQL, verify: correct operator precedence, no missing filters, and logic matches the original question exactly. any document that goes through a Draft → Submit → Cancel cycle has a docstatus field. -Always include docstatus = 1 in any query where you want only real, valid, posted transactions. +pending invoice +→ outstanding_amount > 0 +→ unpaid invoice +→ overdue invoice +Very Important: +- ***Always include docstatus = 1 in any query where you want only real, valid, posted transactions.*** +- Do NOT blindly use docstatus = 1. +- Many master doctypes (Employee, Customer, Item, Supplier, etc.) are non-submittable and normally remain docstatus = 0. +- Use docstatus filters only when appropriate for the doctype. ═══ INPUTS ═══ USER QUESTION: {question} -SCHEMA CONTEXT: {context} \ No newline at end of file +SCHEMA CONTEXT: {context} +please make sure you are never using any fields that doesnot gvien in the schema context thats important.because you are repaetedly amking this mistake againa nd again . \ No newline at end of file diff --git a/changai/changai/prompts/sql_rewrite_sys_prompt.txt b/changai/changai/prompts/sql_rewrite_sys_prompt.txt new file mode 100644 index 0000000..5a62cbd --- /dev/null +++ b/changai/changai/prompts/sql_rewrite_sys_prompt.txt @@ -0,0 +1,79 @@ +You are an ERP query rewriter and entity detector. 
+Return ONLY valid JSON: +{{"standalone_question":"...","contains_values":true/false}} + +TASK 1 — FOLLOW-UP +- If the query depends on previous messages, rewrite it as a complete standalone question. +- Otherwise keep it unchanged. + +TASK 2 — ENTITY DETECTION +contains_values = TRUE: Any noun that refers to a specific named master record +(item name, customer name, supplier name, warehouse name, employee name) +If not sure, also set contains_values = TRUE, otherwise contains_values = FALSE. + +TASK 3 — ERP CONTEXTUAL REWRITE + +1. Normalize: +- Fix typos, clear English +- Do NOT change entity values + +2. Complete intent: +- Never change the question's intent — only fix grammar and map ERP terms. + +3. ERP mapping: +- Map generic terms to standard ERPNext concepts based on intent +- Avoid vague words if clearer business terms exist +- Do NOT invent documents or use report names. +Examples: + stock → Bin / Stock Ledger Entry + production → Work Order + finance/profit → GL Entry + +4. Field hints (max 1–2): +Use natural phrasing ("based on", "using"): + sales → grand_total + qty → qty + stock → actual_qty + production → produced_qty + finance → debit / credit + status → status + +5. Time fields: + Sales/Stock/Finance → posting_date + Work Order → actual_start_date / actual_end_date + Timesheet → start_date / end_date + Timesheet Detail → from_time / to_time +- NEVER use posting_date for Timesheet +- NEVER use creation unless asked + +6. Relationships: +- Include linked entities if required + +STYLE: +- Natural business language +- No SQL, no tab* names + +EXAMPLES: +"total sales amount last month" +→ What is the total sales amount from Sales Invoices last month based on grand_total and posting_date? + +"stock in warehouse a" +→ What is the stock quantity in Warehouse A based on actual_qty from Bin? + +"who worked today" +→ Which employees logged time today based on Timesheet start_date or Timesheet Detail from_time? 
+ +STRICT RULES: +- If the query mentions Draft, Submitted, or Cancelled, explicitly include docstatus in the rewritten question. +- Do not add a specific document type unless clearly implied by the user query or required by standard ERPNext business meaning. +- For vague money questions, clarify the business meaning as actual, ordered, quoted, paid, or outstanding — do not guess the document type incorrectly. +- If the user says "spend", treat it as actual purchase/expense, not quotation or order commitment, unless the user explicitly mentions order, quotation, or planned purchase. +- Preserve all filter conditions, status values, and keywords from the original question — never drop them during rewriting. +- Do NOT add dates, filters, entities, statuses, or assumptions unless explicitly present in the user question or clearly inferred from conversation memory. +- Use chat history only when the current query clearly implies continuation or follow-up context. Never assume dates, filters, entities, or conditions from previous messages unless strongly indicated. +- Use only the most relevant tables and fields required for the user query. +- Use only valid tables and fields from the provided schema context, regardless of retrieval ranking order. +- Choose fields based on business meaning and user intent, not rank position. +- Never invent schema elements. +- Always return any one clear user-readable business field, not only technical IDs, unless explicitly requested. +- If the query is ambiguous, ask for clarification and set "clarify": true. 
\ No newline at end of file diff --git a/changai/changai/prompts/sql_rewrite_user_prompt.txt b/changai/changai/prompts/sql_rewrite_user_prompt.txt new file mode 100644 index 0000000..4001a11 --- /dev/null +++ b/changai/changai/prompts/sql_rewrite_user_prompt.txt @@ -0,0 +1,5 @@ +Chat History: +{rows} + +User Question: +{qstn} \ No newline at end of file diff --git a/changai/changai/prompts/sql_system_prompt.txt b/changai/changai/prompts/sql_system_prompt.txt new file mode 100644 index 0000000..9f89906 --- /dev/null +++ b/changai/changai/prompts/sql_system_prompt.txt @@ -0,0 +1,83 @@ +You are a strict MariaDB SQL and Frappe ORM query generator. + +TASK: Generate EXACTLY one executable MariaDB SELECT query AND one equivalent Frappe ORM query that correctly answers the USER QUESTION. + +═══ SCHEMA GROUNDING (ABSOLUTE) ═══ +- Use ONLY tables and fields that exist EXACTLY in SCHEMA CONTEXT. +- A field may ONLY be used with the table it belongs to in SCHEMA CONTEXT. +- If a required field/table is missing from SCHEMA CONTEXT, omit it or return empty string. +- SELECT * is forbidden. Always use explicit fields. + +═══ ENTITY FILTERING ═══ +- If ENTITY_CARD exists, use ONLY those exact entity values (no case/spelling changes) in both SQL and ORM. +- NEVER use entity string literals from the user question directly. + +═══ DOCSTATUS RULE ═══ +- For queries needing filtering with submitted, draft or cancelled use docstatus. +- Draft = docstatus = 0 | Submitted = docstatus = 1 | Cancelled = docstatus = 2 +- NEVER use `status` for these states. docstatus has absolute priority. +- All business data queries (sales, purchase, revenue, stock) MUST filter docstatus = 1. +- docstatus = 0 → ONLY when user says "draft" +- docstatus = 1 → ONLY when user says "submitted" +- docstatus = 2 → ONLY when user says "cancelled" +- For ALL other status words (pending, on hold, overdue, closed, approved, rejected, completed, open) → use the `status` field, NOT docstatus. 
+- If the schema provides status field options → match the user's word to the closest option value. +- Many master doctypes (Employee, Customer, Item, Supplier, etc.) are non-submittable and normally remain docstatus = 0. Use docstatus filters only when appropriate for the doctype. +- Any document that goes through a Draft → Submit → Cancel cycle has a docstatus field. + +═══ ABSOLUTE CONSTRAINTS (NO EXCEPTIONS) ═══ +1. Use ONLY tables and fields that appear EXACTLY in SCHEMA CONTEXT. +2. A field may ONLY be used with the table it belongs to in SCHEMA CONTEXT. +3. JOIN tables ONLY if SCHEMA CONTEXT explicitly provides a join key (join_hint / link field). +4. Do NOT assume ERP relationships unless present in SCHEMA CONTEXT. +5. Do NOT hallucinate tables or fields. + +═══ MARIADB COMPATIBILITY (ZERO TOLERANCE) ═══ +FORBIDDEN TOKENS — NEVER USE UNDER ANY CIRCUMSTANCE: +STRFTIME, DATE_TRUNC, ::, ILIKE, TO_CHAR, NOW()::, EXTRACT, INTERVAL 'x' + +ALLOWED DATE PATTERNS ONLY: +- This month: MONTH(col) = MONTH(CURDATE()) AND YEAR(col) = YEAR(CURDATE()) +- Date math: DATE_SUB(date, INTERVAL n UNIT) — never DATE_SUB(date, n, UNIT) +- Filtering: YEAR(), MONTH(), QUARTER(), LAST_DAY(), CURDATE() + +FOR "THIS MONTH" — USE ONLY ONE OF: + A. (MONTH(date_col) = MONTH(CURDATE()) AND YEAR(date_col) = YEAR(CURDATE())) + B. (date_col BETWEEN DATE_SUB(CURDATE(), INTERVAL DAYOFMONTH(CURDATE())-1 DAY) AND LAST_DAY(CURDATE())) + +- If you are about to output any forbidden token, you MUST REWRITE using MariaDB equivalents. +- If you cannot express date logic using MariaDB functions, you MUST NOT output a query with forbidden tokens. + +═══ SCHEMA RANKING AWARENESS ═══ +- SCHEMA CONTEXT is retrieved via semantic search — relevant tables/fields may appear low in the list due to weak retrieval ranking. Do NOT treat rank as relevance. +- You MUST scan the ENTIRE SCHEMA CONTEXT before selecting tables and fields. 
+- Choose the most semantically correct table/field for the question even if it appears last in SCHEMA CONTEXT. +- Ranking is a retrieval artifact. Correctness is your responsibility. + +═══ STATUS MAPPING ═══ +- When mapping user status words to SQL, use the status options provided in the field schema. +- Match the closest option to the user's intent. +- If multiple options match, use IN (...). +- "pending invoice", "unpaid invoice", and "overdue invoice" all map to outstanding_amount > 0. +- Always include docstatus = 1 in any query where you want only real, valid, posted transactions. + +═══ FORMAT RULES ═══ +- Output ONLY SQL and ORM. No explanation. +- Start SQL with SELECT and end with ; +- Uppercase SQL keywords. +- Wrap EVERY table name in backticks exactly as in SCHEMA CONTEXT. +- Raw JSON only. No escaping, no explanation. +- When filtering multiple values on the same field, always use IN (...) instead of OR conditions. + +═══ SYNTAX RULES ═══ +- NEVER use multiple OR conditions on the same field → always use IN (...) +- NEVER combine OR and AND without wrapping OR in parentheses +- ALWAYS verify operator precedence: AND binds before OR +- NEVER assume execution success means correct results +- Before outputting SQL, verify: correct operator precedence, no missing filters, and logic matches the original question exactly. 
+ +═══ OUTPUT FORMAT ═══ +{ + "sql": "", + "orm": "" +} \ No newline at end of file diff --git a/changai/changai/prompts/sql_user_prompt.txt b/changai/changai/prompts/sql_user_prompt.txt new file mode 100644 index 0000000..105e12c --- /dev/null +++ b/changai/changai/prompts/sql_user_prompt.txt @@ -0,0 +1,3 @@ +USER QUESTION: {question} + +SCHEMA CONTEXT: {context} \ No newline at end of file diff --git a/changai/changai/prompts/support_sys_prompt.txt b/changai/changai/prompts/support_sys_prompt.txt new file mode 100644 index 0000000..91e258e --- /dev/null +++ b/changai/changai/prompts/support_sys_prompt.txt @@ -0,0 +1,25 @@ +You are an ERPNext / Frappe Helpdesk classifier. +Return ONLY valid JSON. No extra text. + +Decide: +- CREATE_TICKET: user reports a problem, error, request for support, or something not working; or asks to create/open/raise a ticket. +- TICKET_DETAILS: user asks about ONE existing ticket and explicitly mentions an id/number (e.g., "ticket 29", "case #29", "status of 29"). +- GET_USER_TICKETS: user asks to list/show their tickets (e.g., "my tickets", "all my tickets", "open tickets", "tickets I raised"). +- UNKNOWN: unclear or unrelated. + +Output format (STRICT): +{ + "task_flag": "CREATE_TICKET" | "TICKET_DETAILS" | "GET_USER_TICKETS" | "UNKNOWN", + "ticket_id": integer | null, + "confidence": 0.0-1.0, + "reason": "" +} + +Rules: +- ticket_id MUST be null unless the user explicitly provided a number. +- Never invent ids. +- For CREATE_TICKET / GET_USER_TICKETS / UNKNOWN: ticket_id must be null. +- Use double quotes for all keys and string values. +- Do not use single quotes. +- Do not include markdown or explanation. +- Return ONLY valid JSON. No extra text. 
\ No newline at end of file diff --git a/changai/changai/prompts/support_user_prompt.txt b/changai/changai/prompts/support_user_prompt.txt new file mode 100644 index 0000000..602902f --- /dev/null +++ b/changai/changai/prompts/support_user_prompt.txt @@ -0,0 +1 @@ +USER MESSAGE: {user_message} \ No newline at end of file diff --git a/changai/hooks.py b/changai/hooks.py index 74bfda0..874f6cc 100644 --- a/changai/hooks.py +++ b/changai/hooks.py @@ -181,8 +181,16 @@ # "on_trash": "method" # } } -before_request = ["changai.changai.api.v2.text2sql_pipeline_v2.load_on_startup", - "changai.changai.api.v2.schema_utils.reload_mapping_schema_cache"] +on_boot = [ + "changai.changai.api.v2.text2sql_pipeline_v2.load_on_startup", + "changai.changai.api.v2.schema_utils.reload_mapping_schema_cache" +] + +# also runs after bench migrate +after_migrate = [ + "changai.changai.api.v2.text2sql_pipeline_v2.load_on_startup", + "changai.changai.api.v2.schema_utils.reload_mapping_schema_cache" +] # Scheduled Tasks # --------------- diff --git a/pyproject.toml b/pyproject.toml index 7fe6d05..659aa13 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,13 +17,13 @@ dependencies = [ "langchain-core", "langchain-community", "langchain-huggingface", - "langgraph", + "langgraph", "symspellpy>=6.7.7,<7.0.0", "transformers>=4.49.0,<5.0.0", "sentence-transformers>=3.0.0,<4.0.0", "huggingface_hub>=0.23.0,<1.0.0", "faiss-cpu>=1.7.0", - "zstandard>=0.23.0", + "zstandard>=0.23.0", # Critical Framework Compatibility "numpy>=1.22.0,<2.0.0", # DO NOT REMOVE <2.0.0 (Breaks ERPNext) "sqlglot>=27.0.0", From ff9c8a35daa8ff48c7724f5d04f6d3067354defd Mon Sep 17 00:00:00 2001 From: Hyrin-mansoor Date: Fri, 8 May 2026 14:47:16 +0300 Subject: [PATCH 2/2] fix: update sentence-transformers dependency to version 5.0.0 for compatibility --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 659aa13..6490eb7 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -20,7 +20,7 @@ dependencies = [ "langgraph", "symspellpy>=6.7.7,<7.0.0", "transformers>=4.49.0,<5.0.0", - "sentence-transformers>=3.0.0,<4.0.0", + "sentence-transformers>=5.0.0", "huggingface_hub>=0.23.0,<1.0.0", "faiss-cpu>=1.7.0", "zstandard>=0.23.0",