diff --git a/docs/docs/integrations/retrievers/self_query/myscale_self_query.ipynb b/docs/docs/integrations/retrievers/self_query/myscale_self_query.ipynb index b38e2901ab2f8..9d40275e3961e 100644 --- a/docs/docs/integrations/retrievers/self_query/myscale_self_query.ipynb +++ b/docs/docs/integrations/retrievers/self_query/myscale_self_query.ipynb @@ -39,7 +39,7 @@ }, "outputs": [], "source": [ - "! pip install lark clickhouse-connect" + "! pip install lark clickhouse-connect langchain-core" ] }, { @@ -52,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "dd01b61b-7d32-4a55-85d6-b2d2d4f18840", "metadata": { "tags": [] @@ -71,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "cb4a5787", "metadata": { "tags": [] @@ -124,7 +124,7 @@ " ),\n", " Document(\n", " page_content=\"Toys come alive and have a blast doing so\",\n", - " metadata={\"date\": \"1995-02-11\", \"genre\": [\"animated\"]},\n", + " metadata={\"date\": \"1995-02-11\", \"genre\": [\"animation\"]},\n", " ),\n", " Document(\n", " page_content=\"Three men walk into the Zone, three men walk out of the Zone\",\n", @@ -148,48 +148,90 @@ "metadata": {}, "source": [ "## Creating our self-querying retriever\n", - "Just like other retrievers... simple and nice." + "Just like other retrievers... simple and nice.\n", + "\n", + "We also introduce `VirtualColumnName`, which lets you expose columns in the prompt that map to SQL functions or columns.\n", + "\n", + "Self-query retrievers are powerful, and they can be made even stronger. Taking MyScale as an example, users sometimes apply a complex SQL function to a column name. So we add `VirtualColumnName` to the self-query retriever to reduce token usage and provide extra functionality:\n", + "\n", + "1. 
a function that dynamically creates function-mapped column names, which is handy when the user wants to compare a column against the current time, a location, or other attributes.\n", + "2. complex function calls written as plain strings can now be replaced with a shorter nickname in the prompt, which saves tokens for very long function names.\n", + "3. refined logic in the MyScale translator to support this standard.\n", + "\n", + "This change does not affect other self-query retrievers, as it preserves the original plain-string interface for comparisons. The actual difference happens in `QueryTransformer` in `query_constructor.parser` for the `Lark` parser.\n", + "\n", + "`Comparison.attribute` can now be either a string or a `VirtualColumnName`. The default behaviour prevents users from using `VirtualColumnName` with vector stores other than MyScale.\n", + "\n", + "This functionality strengthens self-query retrievers as a bridge between text and simple SQL queries, and we believe it will help users expand their use of this retriever. 
Other SQL vector databases are theoretically compatible with this new feature as well.\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "86e34dbf", "metadata": { "tags": [] }, "outputs": [], "source": [ - "from langchain.chains.query_constructor.base import AttributeInfo\n", + "from langchain.chains.query_constructor.base import AttributeInfo, VirtualColumnName\n", "from langchain.llms import OpenAI\n", "from langchain.retrievers.self_query.base import SelfQueryRetriever\n", "\n", "metadata_field_info = [\n", + " # You can use a plain string to specify a column\n", + " # If no virtual column is specified, the MyScale self-query retriever will prepend `metadata.` to your column.\n", + " # This is the default behavior if you used LangChain to insert data into MyScale\n", + " # the plain-string `genre` attribute defined below is equivalent to\n", + " # |||\n", + " # vvv\n", + " # AttributeInfo(\n", + " # name=VirtualColumnName(\n", + " # name=\"genre\", column=f\"{vectorstore.metadata_column}.genre\"\n", + " # ),\n", + " # description=\"The genres of the movie\",\n", + " # type=\"list[string]\",\n", + " # )\n", " AttributeInfo(\n", " name=\"genre\",\n", " description=\"The genres of the movie\",\n", " type=\"list[string]\",\n", " ),\n", + " # Or if you want to use a customized column name, you should use a virtual column name\n", + " # This will help you to expand how you can use this self-query retriever\n", " # If you want to include length of a list, just define it as a new column\n", " # This will teach the LLM to use it as a column when constructing filter.\n", " AttributeInfo(\n", - " name=\"length(genre)\",\n", + " name=VirtualColumnName(\n", + " name=\"length(genre)\", column=f\"length({vectorstore.metadata_column}.genre)\"\n", + " ),\n", " description=\"The length of genres of the movie\",\n", " type=\"integer\",\n", " ),\n", + " # Virtual columns can also help you with SQL functions.\n", " # Now you can define a column as timestamp. 
By simply set the type to timestamp.\n", " AttributeInfo(\n", - " name=\"date\",\n", + " # Virtual column names are used for translating long name for columns\n", + " name=VirtualColumnName(\n", + " name=\"date\",\n", + " column=f\"parseDateTime32BestEffort({vectorstore.metadata_column}.date)\",\n", + " ),\n", " description=\"The date the movie was released\",\n", " type=\"timestamp\",\n", " ),\n", " AttributeInfo(\n", - " name=\"director\",\n", + " name=VirtualColumnName(\n", + " name=\"director\", column=f\"{vectorstore.metadata_column}.director\"\n", + " ),\n", " description=\"The name of the movie director\",\n", " type=\"string\",\n", " ),\n", " AttributeInfo(\n", - " name=\"rating\", description=\"A 1-10 rating for the movie\", type=\"float\"\n", + " name=VirtualColumnName(\n", + " name=\"rating\", column=f\"{vectorstore.metadata_column}.rating\"\n", + " ),\n", + " description=\"A 1-10 rating for the movie\",\n", + " type=\"float\",\n", " ),\n", "]\n", "document_content_description = \"Brief summary of a movie\"\n", @@ -210,10 +252,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "38a126e9", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='dinosaur' filter=None limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'date': '1993-07-02', 'director': '', 'genre': ['science fiction'], 'rating': 7.7}),\n", + " Document(page_content='Toys come alive and have a blast doing so', metadata={'date': '1995-02-11', 'director': '', 'genre': ['animation'], 'rating': 0.0}),\n", + " Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'date': '1979-09-10', 'director': 'Andrei Tarkovsky', 'genre': ['science fiction', 'adventure'], 'rating': 9.9}),\n", + " Document(page_content='A psychologist / detective gets lost in 
a series of dreams within dreams within dreams and Inception reused the idea', metadata={'date': '2006-04-23', 'director': 'Satoshi Kon', 'genre': [], 'rating': 8.6})]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# This example only specifies a relevant query\n", "retriever.get_relevant_documents(\"What are some movies about dinosaurs\")" @@ -221,10 +284,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "fc3f1e6e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query=' ' filter=Comparison(comparator=, attribute='metadata.rating', value=8.5) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'date': '1979-09-10', 'director': 'Andrei Tarkovsky', 'genre': ['science fiction', 'adventure'], 'rating': 9.9}),\n", + " Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'date': '2006-04-23', 'director': 'Satoshi Kon', 'genre': [], 'rating': 8.6})]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# This example only specifies a filter\n", "retriever.get_relevant_documents(\"I want to watch a movie rated higher than 8.5\")" @@ -232,10 +314,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "b19d4da0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='women' filter=Comparison(comparator=, attribute='metadata.director', value='Greta Gerwig') limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='A bunch of normal-sized women are supremely wholesome and some men pine after them', metadata={'date': '2019-08-22', 
'director': 'Greta Gerwig', 'genre': [], 'rating': 8.3})]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# This example specifies a query and a filter\n", "retriever.get_relevant_documents(\"Has Greta Gerwig directed any movies about women\")" @@ -243,10 +343,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "f900e40e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query=' ' filter=Operation(operator=, arguments=[Comparison(comparator=, attribute='metadata.rating', value=8.5), Comparison(comparator=, attribute='genre', value='science fiction')]) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'date': '1979-09-10', 'director': 'Andrei Tarkovsky', 'genre': ['science fiction', 'adventure'], 'rating': 9.9})]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# This example specifies a composite filter\n", "retriever.get_relevant_documents(\n", @@ -256,14 +374,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "12a51522", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='toys' filter=Operation(operator=, arguments=[Comparison(comparator=, attribute='parseDateTime32BestEffort(metadata.date)', value={'date': '1990-01-01', 'type': 'date'}), Comparison(comparator=, attribute='parseDateTime32BestEffort(metadata.date)', value={'date': '2005-12-31', 'type': 'date'}), Comparison(comparator=, attribute='genre', value='animation')]) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Toys come alive and have a blast doing so', metadata={'date': '1995-02-11', 'director': '', 'genre': ['animation'], 'rating': 
0.0})]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# This example specifies a query and composite filter\n", "retriever.get_relevant_documents(\n", - " \"What's a movie after 1990 but before 2005 that's all about toys, and preferably is animated\"\n", + " \"What's a movie after 1990 but before 2005 that's all about toys, and preferably is animation\"\n", ")" ] }, @@ -279,10 +415,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "1d043096", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query=' ' filter=Comparison(comparator=, attribute='length(metadata.genre)', value=1) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'date': '1979-09-10', 'director': 'Andrei Tarkovsky', 'genre': ['science fiction', 'adventure'], 'rating': 9.9})]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# You can use length(genres) to do anything you want\n", "retriever.get_relevant_documents(\"What's a movie that have more than 1 genres?\")" @@ -290,10 +444,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "d570d33c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query=' ' filter=Comparison(comparator=, attribute='parseDateTime32BestEffort(metadata.date)', value={'date': '1995-02-01', 'type': 'date'}) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Toys come alive and have a blast doing so', metadata={'date': '1995-02-11', 'director': '', 'genre': ['animation'], 'rating': 0.0}),\n", + " Document(page_content='A bunch of normal-sized women are supremely wholesome and some men pine after them', metadata={'date': 
'2019-08-22', 'director': 'Greta Gerwig', 'genre': [], 'rating': 8.3}),\n", + " Document(page_content='Leo DiCaprio gets lost in a dream within a dream within a dream within a ...', metadata={'date': '2010-12-30', 'director': 'Christopher Nolan', 'genre': [], 'rating': 8.2}),\n", + " Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'date': '2006-04-23', 'director': 'Satoshi Kon', 'genre': [], 'rating': 8.6})]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Fine-grained datetime? You got it already.\n", "retriever.get_relevant_documents(\"What's a movie that release after feb 1995?\")" @@ -301,10 +476,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "fbe0b21b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='Andrei' filter=Comparison(comparator=, attribute='metadata.director', value='Andrei') limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'date': '1979-09-10', 'director': 'Andrei Tarkovsky', 'genre': ['science fiction', 'adventure'], 'rating': 9.9})]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Don't know what your exact filter should be? 
Use string pattern match!\n", "retriever.get_relevant_documents(\"What's a movie whose name is like Andrei?\")" @@ -312,10 +505,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "6a514104", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query=' ' filter=Operation(operator=, arguments=[Comparison(comparator=, attribute='genre', value='science fiction'), Comparison(comparator=, attribute='genre', value='adventure')]) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'date': '1979-09-10', 'director': 'Andrei Tarkovsky', 'genre': ['science fiction', 'adventure'], 'rating': 9.9})]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Contain works for lists: so you can match a list with contain comparator!\n", "retriever.get_relevant_documents(\n", @@ -337,7 +548,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "bff36b88-b506-4877-9c63-e5a1a8d78e64", "metadata": { "tags": [] @@ -356,12 +567,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "2758d229-4f97-499c-819f-888acaf8ee10", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='dinosaur' filter=None limit=2\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'date': '1993-07-02', 'director': '', 'genre': ['science fiction'], 'rating': 7.7}),\n", + " Document(page_content='Toys come alive and have a blast doing so', metadata={'date': '1995-02-11', 'director': '', 'genre': ['animation'], 'rating': 0.0})]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], 
"source": [ "# This example only specifies a relevant query\n", "retriever.get_relevant_documents(\"what are two movies about dinosaurs\")" @@ -384,7 +614,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.9.17" } }, "nbformat": 4, diff --git a/libs/langchain/langchain/chains/query_constructor/base.py b/libs/langchain/langchain/chains/query_constructor/base.py index d6c38d5319d0e..fbaf7113db318 100644 --- a/libs/langchain/langchain/chains/query_constructor/base.py +++ b/libs/langchain/langchain/chains/query_constructor/base.py @@ -2,14 +2,25 @@ from __future__ import annotations import json -from typing import Any, Callable, List, Optional, Sequence, Tuple, Union, cast +from typing import ( + Any, + Callable, + Dict, + List, + Optional, + Sequence, + Tuple, + Union, + cast, +) from langchain_core.exceptions import OutputParserException from langchain_core.language_models import BaseLanguageModel from langchain_core.output_parsers import BaseOutputParser from langchain_core.prompts import BasePromptTemplate from langchain_core.prompts.few_shot import FewShotPromptTemplate -from langchain_core.runnables import Runnable +from langchain_core.pydantic_v1 import validator +from langchain_core.runnables import Runnable, RunnableConfig, RunnableSerializable from langchain.chains.llm import LLMChain from langchain.chains.query_constructor.ir import ( @@ -33,7 +44,7 @@ SUFFIX_WITHOUT_DATA_SOURCE, USER_SPECIFIED_EXAMPLE_PROMPT, ) -from langchain.chains.query_constructor.schema import AttributeInfo +from langchain.chains.query_constructor.schema import AttributeInfo, VirtualColumnName from langchain.output_parsers.json import parse_and_check_json_markdown @@ -104,6 +115,62 @@ def ast_parse(raw_filter: str) -> Optional[FilterDirective]: return cls(ast_parse=ast_parse) +class VirtualColumnParser(RunnableSerializable[StructuredQuery, StructuredQuery]): + """Virtual Column Parser which is compatible with 
LCEL""" + + attributes: Sequence[Union[AttributeInfo, dict]] + virtual_attribute_map: Dict[str, Callable] = {} + + @validator("virtual_attribute_map", always=True) + def _compute_map(cls, v: Any, values: Any, **kwargs: Any) -> Dict[str, Callable]: + _map = {} + for a in values["attributes"]: + a = dict(a) + if isinstance(a["name"], VirtualColumnName): + _map[str(a["name"])] = a["name"].to_query + return _map + + def _map_virtual_column(self, filter: Comparison) -> Comparison: + if filter.attribute in self.virtual_attribute_map: + return Comparison( + comparator=filter.comparator, + attribute=self.virtual_attribute_map[filter.attribute](), + value=filter.value, + ) + return filter + + def _traverse_query( + self, filter: FilterDirective, config: Optional[RunnableConfig] = None + ) -> Optional[FilterDirective]: + if not filter: + return filter + elif isinstance(filter, Comparison): + return self._map_virtual_column(filter) + elif isinstance(filter, Operation): + args = [ + self._traverse_query(arg, config=config) for arg in filter.arguments + ] + args = [arg for arg in args if arg is not None] + if not args: + return None + elif len(args) == 1 and filter.operator in (Operator.AND, Operator.OR): + return args[0] + else: + return Operation( + operator=filter.operator, + arguments=args, + ) + else: + return filter + + def invoke( + self, inputs: StructuredQuery, config: Optional[RunnableConfig] = None + ) -> StructuredQuery: + if inputs.filter: + inputs.filter = self._traverse_query(inputs.filter, config=config) + return inputs + + def fix_filter_directive( filter: Optional[FilterDirective], *, @@ -163,6 +230,12 @@ def _format_attribute_info(info: Sequence[Union[AttributeInfo, dict]]) -> str: info_dicts = {} for i in info: i_dict = dict(i) + if type(i) is AttributeInfo and type(i.name) is VirtualColumnName: + i_dict = { + "name": str(i.name), + "description": i.description, + "type": i.type, + } info_dicts[i_dict.pop("name")] = i_dict return json.dumps(info_dicts, 
indent=4).replace("{", "{{").replace("}", "}}") @@ -282,6 +355,16 @@ def load_query_constructor_chain( Returns: A LLMChain that can be used to construct queries. """ + + class _StructuredQueryVirtualColParser(BaseOutputParser[StructuredQuery]): + struct_query_parser: StructuredQueryOutputParser + virt_col_parser: VirtualColumnParser + + def parse(self, text: str) -> StructuredQuery: + query = self.struct_query_parser.parse(text) + query = self.virt_col_parser.invoke(query) + return query + prompt = get_query_constructor_prompt( document_contents, attribute_info, @@ -294,13 +377,17 @@ def load_query_constructor_chain( allowed_attributes = [] for ainfo in attribute_info: allowed_attributes.append( - ainfo.name if isinstance(ainfo, AttributeInfo) else ainfo["name"] + str(ainfo.name) if isinstance(ainfo, AttributeInfo) else str(ainfo["name"]) ) - output_parser = StructuredQueryOutputParser.from_components( + struct_query_parser = StructuredQueryOutputParser.from_components( allowed_comparators=allowed_comparators, allowed_operators=allowed_operators, allowed_attributes=allowed_attributes, ) + virt_col_parser = VirtualColumnParser(attributes=attribute_info) + output_parser = _StructuredQueryVirtualColParser( + struct_query_parser=struct_query_parser, virt_col_parser=virt_col_parser + ) # For backwards compatibility. 
prompt.output_parser = output_parser return LLMChain(llm=llm, prompt=prompt, output_parser=output_parser, **kwargs) @@ -352,7 +439,7 @@ def load_query_constructor_runnable( allowed_attributes = [] for ainfo in attribute_info: allowed_attributes.append( - ainfo.name if isinstance(ainfo, AttributeInfo) else ainfo["name"] + str(ainfo.name) if isinstance(ainfo, AttributeInfo) else str(ainfo["name"]) ) output_parser = StructuredQueryOutputParser.from_components( allowed_comparators=allowed_comparators, @@ -360,4 +447,5 @@ def load_query_constructor_runnable( allowed_attributes=allowed_attributes, fix_invalid=fix_invalid, ) - return prompt | llm | output_parser + virt_col_parser = VirtualColumnParser(attributes=attribute_info) + return prompt | llm | output_parser | virt_col_parser diff --git a/libs/langchain/langchain/chains/query_constructor/schema.py b/libs/langchain/langchain/chains/query_constructor/schema.py index 6171b3742f2ac..b5bd4111c0f0b 100644 --- a/libs/langchain/langchain/chains/query_constructor/schema.py +++ b/libs/langchain/langchain/chains/query_constructor/schema.py @@ -1,10 +1,32 @@ +from typing import Callable, Optional, Union + from langchain_core.pydantic_v1 import BaseModel +class VirtualColumnName(BaseModel): + """Virtual column name""" + + name: str + """name of this virtual column name""" + column: Optional[str] = None + """real column name to perform function on""" + func: Callable[[Optional[str]], str] = lambda x: x if x else "" # noqa: E731 + """virtual column name only accepts function that operates on column name""" + + def __str__(self) -> str: + return self.name + + def to_query(self) -> str: + if self.column: + return self.func(self.column) + else: + return self.func(self.name) + + class AttributeInfo(BaseModel): """Information about a data source attribute.""" - name: str + name: Union[str, VirtualColumnName] description: str type: str diff --git a/libs/langchain/langchain/retrievers/self_query/myscale.py 
b/libs/langchain/langchain/retrievers/self_query/myscale.py index a951272471e55..8b4ea856e48c5 100644 --- a/libs/langchain/langchain/retrievers/self_query/myscale.py +++ b/libs/langchain/langchain/retrievers/self_query/myscale.py @@ -1,5 +1,5 @@ import re -from typing import Any, Callable, Dict, Tuple +from typing import Any, Callable, Dict, Optional, Tuple from langchain.chains.query_constructor.ir import ( Comparator, @@ -76,7 +76,7 @@ class MyScaleTranslator(Visitor): Comparator.LIKE: _DEFAULT_COMPOSER("ILIKE"), } - def __init__(self, metadata_key: str = "metadata") -> None: + def __init__(self, metadata_key: Optional[str] = "metadata") -> None: super().__init__() self.metadata_key = metadata_key @@ -87,18 +87,23 @@ def visit_operation(self, operation: Operation) -> Dict: return self.map_dict[func](*args) def visit_comparison(self, comparison: Comparison) -> Dict: - regex = r"\((.*?)\)" - matched = re.search(r"\(\w+\)", comparison.attribute) - - # If arbitrary function is applied to an attribute - if matched: - attr = re.sub( - regex, - f"({self.metadata_key}.{matched.group(0)[1:-1]})", - comparison.attribute, - ) + virtual_mapped = True + if self.metadata_key and self.metadata_key not in comparison.attribute: + regex = r"\((.*?)\)" + matched = re.search(r"\(\w+\)", comparison.attribute) + + # If arbitrary function is applied to an attribute + if matched: + attr = re.sub( + regex, + f"({self.metadata_key}.{matched.group(0)[1:-1]})", + comparison.attribute, + ) + else: + attr = f"{self.metadata_key}.{comparison.attribute}" + virtual_mapped = False else: - attr = f"{self.metadata_key}.{comparison.attribute}" + attr = comparison.attribute value = comparison.value comp = comparison.comparator @@ -106,7 +111,8 @@ def visit_comparison(self, comparison: Comparison) -> Dict: # convert timestamp for datetime objects if isinstance(value, dict) and value.get("type") == "date": - attr = f"parseDateTime32BestEffort({attr})" + if not virtual_mapped: + attr = 
f"parseDateTime32BestEffort({attr})" value = f"parseDateTime32BestEffort('{value['date']}')" # string pattern match