From 126364a60fd6dae9afe991d8555600f3431d6f1c Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Sun, 26 Jan 2025 23:26:22 +0100 Subject: [PATCH 01/27] reorg code --- dags/jurisprudencia.py | 56 +-- src/{ => semantic}/__init__.py | 0 src/{ => semantic}/cli.py | 0 src/{ => semantic}/config.py | 0 src/{ => semantic}/embedding.py | 0 src/{ => semantic}/etl.py | 0 src/{ => semantic}/frontend/__init__.py | 0 src/{ => semantic}/frontend/custom_logger.py | 0 src/{ => semantic}/frontend/paths.py | 0 .../frontend/server/__init__.py | 0 src/{ => semantic}/frontend/server/app.py | 0 .../frontend/server/dto}/__init__.py | 0 .../frontend/server/dto/websocket.py | 0 src/{ => semantic}/frontend/server/server.py | 0 .../frontend/server/websocket.py | 0 .../frontend/static/css/document_tree.css | 0 .../frontend/static/css/style.css | 0 src/{ => semantic}/frontend/static/js/main.js | 0 .../frontend/templates/index.html | 0 src/{ => semantic}/ingestion/README.md | 0 .../ingestion}/__init__.py | 0 src/{ => semantic}/ingestion/documentspec.py | 0 src/{ => semantic}/ingestion/downloader.py | 0 .../ingestion/parsers}/__init__.py | 0 .../ingestion/parsers/html_parser.py | 0 .../ingestion/parsers/pdf_parser.py | 0 src/{ => semantic}/ingestion/paths.py | 0 .../ingestion/resources/codigo_civil.json | 0 .../ingestion/resources/codigo_penal.json | 0 src/{ => semantic}/models/node.py | 0 src/{ => semantic}/query.py | 0 src/{ => semantic}/ragagent.py | 0 src/semantic/render/__init__.py | 0 src/{ => semantic}/render/html.py | 0 src/{ => semantic}/render/node_renderer.py | 0 src/{ => semantic}/render/plain_text.py | 0 src/storage/adapters.py | 145 -------- src/storage/chroma_storage.py | 90 ----- src/storage/graph_storage.py | 328 ------------------ src/storage/hybrid_storage.py | 78 ----- src/storage/transaction_manager.py | 93 ----- tests/conftest.py | 2 +- 42 files changed, 12 insertions(+), 780 deletions(-) rename src/{ => semantic}/__init__.py (100%) rename src/{ => semantic}/cli.py (100%) 
rename src/{ => semantic}/config.py (100%) rename src/{ => semantic}/embedding.py (100%) rename src/{ => semantic}/etl.py (100%) rename src/{ => semantic}/frontend/__init__.py (100%) rename src/{ => semantic}/frontend/custom_logger.py (100%) rename src/{ => semantic}/frontend/paths.py (100%) rename src/{ => semantic}/frontend/server/__init__.py (100%) rename src/{ => semantic}/frontend/server/app.py (100%) rename src/{ingestion => semantic/frontend/server/dto}/__init__.py (100%) rename src/{ => semantic}/frontend/server/dto/websocket.py (100%) rename src/{ => semantic}/frontend/server/server.py (100%) rename src/{ => semantic}/frontend/server/websocket.py (100%) rename src/{ => semantic}/frontend/static/css/document_tree.css (100%) rename src/{ => semantic}/frontend/static/css/style.css (100%) rename src/{ => semantic}/frontend/static/js/main.js (100%) rename src/{ => semantic}/frontend/templates/index.html (100%) rename src/{ => semantic}/ingestion/README.md (100%) rename src/{ingestion/parsers => semantic/ingestion}/__init__.py (100%) rename src/{ => semantic}/ingestion/documentspec.py (100%) rename src/{ => semantic}/ingestion/downloader.py (100%) rename src/{render => semantic/ingestion/parsers}/__init__.py (100%) rename src/{ => semantic}/ingestion/parsers/html_parser.py (100%) rename src/{ => semantic}/ingestion/parsers/pdf_parser.py (100%) rename src/{ => semantic}/ingestion/paths.py (100%) rename src/{ => semantic}/ingestion/resources/codigo_civil.json (100%) rename src/{ => semantic}/ingestion/resources/codigo_penal.json (100%) rename src/{ => semantic}/models/node.py (100%) rename src/{ => semantic}/query.py (100%) rename src/{ => semantic}/ragagent.py (100%) create mode 100644 src/semantic/render/__init__.py rename src/{ => semantic}/render/html.py (100%) rename src/{ => semantic}/render/node_renderer.py (100%) rename src/{ => semantic}/render/plain_text.py (100%) delete mode 100644 src/storage/adapters.py delete mode 100644 src/storage/chroma_storage.py 
delete mode 100644 src/storage/graph_storage.py delete mode 100644 src/storage/hybrid_storage.py delete mode 100644 src/storage/transaction_manager.py diff --git a/dags/jurisprudencia.py b/dags/jurisprudencia.py index de06499..86d41c2 100644 --- a/dags/jurisprudencia.py +++ b/dags/jurisprudencia.py @@ -1,11 +1,13 @@ from airflow import DAG -from airflow.operators.python_operator import PythonOperator +from airflow.operators.python import PythonOperator from airflow.utils.dates import days_ago from datetime import datetime, timedelta import requests import json import os +from semantic.ingestion.downloader import get_item_pagination + # Define the default arguments default_args = { 'owner': 'airflow', @@ -18,51 +20,15 @@ } # Define the DAG -dag = DAG( +with DAG( 'query_poderjudicial', default_args=default_args, description='Query www.poderjudicial.es and store results in JSON', - schedule_interval='@weekly', + schedule_interval='@daily', catchup=True, -) - -# Define the Python function to query the API and save results -def query_poderjudicial(ds, **kwargs): - date_from = (datetime.strptime(ds, '%Y-%m-%d') - timedelta(days=7)).strftime('%Y-%m-%d') - date_to = ds - # - # url = 'https://www.poderjudicial.es/search/search.action' - # payload = { - # "action": "query", - # "sort": "IN_FECHARESOLUCION:decreasing", - # "recordsPerPage": "10", - # "databasematch": "AN", - # "start": "1", - # "FECHARESOLUCIONDESDE": date_from, - # "FECHARESOLUCIONHASTA": date_to, - # "TIPOINTERES_ACTUAL": "Actualidad", - # "TIPOORGANOPUB": "|11|12|13|14|15|16|" - # } - # headers = { - # 'Content-Type': 'application/json' - # } - # - # response = requests.post(url, json=payload, headers=headers) - # response.raise_for_status() - # - # results = response.json() - # output_path = f'/path/to/output/results_{date_from}_to_{date_to}.json' - # - # with open(output_path, 'w') as f: - # json.dump(results, f) - -# Define the task -query_task = PythonOperator( - task_id='query_poderjudicial_task', - 
provide_context=True, - python_callable=query_poderjudicial, - dag=dag, -) - -# Set the task in the DAG -query_task +): + item_pagination = PythonOperator( + task_id='get_item_pagination', + provide_context=True, + python_callable=get_item_pagination, + ) diff --git a/src/__init__.py b/src/semantic/__init__.py similarity index 100% rename from src/__init__.py rename to src/semantic/__init__.py diff --git a/src/cli.py b/src/semantic/cli.py similarity index 100% rename from src/cli.py rename to src/semantic/cli.py diff --git a/src/config.py b/src/semantic/config.py similarity index 100% rename from src/config.py rename to src/semantic/config.py diff --git a/src/embedding.py b/src/semantic/embedding.py similarity index 100% rename from src/embedding.py rename to src/semantic/embedding.py diff --git a/src/etl.py b/src/semantic/etl.py similarity index 100% rename from src/etl.py rename to src/semantic/etl.py diff --git a/src/frontend/__init__.py b/src/semantic/frontend/__init__.py similarity index 100% rename from src/frontend/__init__.py rename to src/semantic/frontend/__init__.py diff --git a/src/frontend/custom_logger.py b/src/semantic/frontend/custom_logger.py similarity index 100% rename from src/frontend/custom_logger.py rename to src/semantic/frontend/custom_logger.py diff --git a/src/frontend/paths.py b/src/semantic/frontend/paths.py similarity index 100% rename from src/frontend/paths.py rename to src/semantic/frontend/paths.py diff --git a/src/frontend/server/__init__.py b/src/semantic/frontend/server/__init__.py similarity index 100% rename from src/frontend/server/__init__.py rename to src/semantic/frontend/server/__init__.py diff --git a/src/frontend/server/app.py b/src/semantic/frontend/server/app.py similarity index 100% rename from src/frontend/server/app.py rename to src/semantic/frontend/server/app.py diff --git a/src/ingestion/__init__.py b/src/semantic/frontend/server/dto/__init__.py similarity index 100% rename from src/ingestion/__init__.py rename 
to src/semantic/frontend/server/dto/__init__.py diff --git a/src/frontend/server/dto/websocket.py b/src/semantic/frontend/server/dto/websocket.py similarity index 100% rename from src/frontend/server/dto/websocket.py rename to src/semantic/frontend/server/dto/websocket.py diff --git a/src/frontend/server/server.py b/src/semantic/frontend/server/server.py similarity index 100% rename from src/frontend/server/server.py rename to src/semantic/frontend/server/server.py diff --git a/src/frontend/server/websocket.py b/src/semantic/frontend/server/websocket.py similarity index 100% rename from src/frontend/server/websocket.py rename to src/semantic/frontend/server/websocket.py diff --git a/src/frontend/static/css/document_tree.css b/src/semantic/frontend/static/css/document_tree.css similarity index 100% rename from src/frontend/static/css/document_tree.css rename to src/semantic/frontend/static/css/document_tree.css diff --git a/src/frontend/static/css/style.css b/src/semantic/frontend/static/css/style.css similarity index 100% rename from src/frontend/static/css/style.css rename to src/semantic/frontend/static/css/style.css diff --git a/src/frontend/static/js/main.js b/src/semantic/frontend/static/js/main.js similarity index 100% rename from src/frontend/static/js/main.js rename to src/semantic/frontend/static/js/main.js diff --git a/src/frontend/templates/index.html b/src/semantic/frontend/templates/index.html similarity index 100% rename from src/frontend/templates/index.html rename to src/semantic/frontend/templates/index.html diff --git a/src/ingestion/README.md b/src/semantic/ingestion/README.md similarity index 100% rename from src/ingestion/README.md rename to src/semantic/ingestion/README.md diff --git a/src/ingestion/parsers/__init__.py b/src/semantic/ingestion/__init__.py similarity index 100% rename from src/ingestion/parsers/__init__.py rename to src/semantic/ingestion/__init__.py diff --git a/src/ingestion/documentspec.py 
b/src/semantic/ingestion/documentspec.py similarity index 100% rename from src/ingestion/documentspec.py rename to src/semantic/ingestion/documentspec.py diff --git a/src/ingestion/downloader.py b/src/semantic/ingestion/downloader.py similarity index 100% rename from src/ingestion/downloader.py rename to src/semantic/ingestion/downloader.py diff --git a/src/render/__init__.py b/src/semantic/ingestion/parsers/__init__.py similarity index 100% rename from src/render/__init__.py rename to src/semantic/ingestion/parsers/__init__.py diff --git a/src/ingestion/parsers/html_parser.py b/src/semantic/ingestion/parsers/html_parser.py similarity index 100% rename from src/ingestion/parsers/html_parser.py rename to src/semantic/ingestion/parsers/html_parser.py diff --git a/src/ingestion/parsers/pdf_parser.py b/src/semantic/ingestion/parsers/pdf_parser.py similarity index 100% rename from src/ingestion/parsers/pdf_parser.py rename to src/semantic/ingestion/parsers/pdf_parser.py diff --git a/src/ingestion/paths.py b/src/semantic/ingestion/paths.py similarity index 100% rename from src/ingestion/paths.py rename to src/semantic/ingestion/paths.py diff --git a/src/ingestion/resources/codigo_civil.json b/src/semantic/ingestion/resources/codigo_civil.json similarity index 100% rename from src/ingestion/resources/codigo_civil.json rename to src/semantic/ingestion/resources/codigo_civil.json diff --git a/src/ingestion/resources/codigo_penal.json b/src/semantic/ingestion/resources/codigo_penal.json similarity index 100% rename from src/ingestion/resources/codigo_penal.json rename to src/semantic/ingestion/resources/codigo_penal.json diff --git a/src/models/node.py b/src/semantic/models/node.py similarity index 100% rename from src/models/node.py rename to src/semantic/models/node.py diff --git a/src/query.py b/src/semantic/query.py similarity index 100% rename from src/query.py rename to src/semantic/query.py diff --git a/src/ragagent.py b/src/semantic/ragagent.py similarity index 100% 
rename from src/ragagent.py rename to src/semantic/ragagent.py diff --git a/src/semantic/render/__init__.py b/src/semantic/render/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/render/html.py b/src/semantic/render/html.py similarity index 100% rename from src/render/html.py rename to src/semantic/render/html.py diff --git a/src/render/node_renderer.py b/src/semantic/render/node_renderer.py similarity index 100% rename from src/render/node_renderer.py rename to src/semantic/render/node_renderer.py diff --git a/src/render/plain_text.py b/src/semantic/render/plain_text.py similarity index 100% rename from src/render/plain_text.py rename to src/semantic/render/plain_text.py diff --git a/src/storage/adapters.py b/src/storage/adapters.py deleted file mode 100644 index 3b5686b..0000000 --- a/src/storage/adapters.py +++ /dev/null @@ -1,145 +0,0 @@ -from typing import Optional - -import numpy as np - -from models.node import Node - - -class NodeAdapter: - @staticmethod - def to_neo4j(node: Node, ordinal: Optional[int] = None): - out = { - 'uuid': node.uuid, - 'ordinal': ordinal, - 'level': node.level, - 'content': node.content - } - - return out - - @staticmethod - def to_neo4j_with_relationships(node: Node, ordinal: Optional[int] = None): - """ - Recursively extract all nodes and relationships from a hierarchy. - - Args: - node (Node): The root node of the hierarchy. - ordinal (Optional[int]): The ordinal value of the node in its parent's children list. - - Returns: - - nodes: List of dictionaries representing nodes. - - relationships: List of tuples representing (parent_uuid, child_uuid) relationships. 
- """ - nodes = [] - relationships = [] - - # Convert the root node - nodes.append(NodeAdapter.to_neo4j(node, ordinal)) - - # Recursively process children - for child_ordinals, child in enumerate(node.children): - # Add the relationship to child - relationships.append((node.uuid, child.uuid)) - - # Add grand child's nodes and relationships - child_nodes, child_relationships = NodeAdapter.to_neo4j_with_relationships(child, child_ordinals) - nodes.extend(child_nodes) # Append all child nodes - relationships.extend(child_relationships) # Append all child relationships - - return nodes, relationships - - @classmethod - def from_neo4j(cls, record: dict) -> Node: - if record.get('children', []): - children, order = zip(*[[cls.from_neo4j(ch), ch['ordinal']] for ch in record.get('children', [])]) - - order = np.array([o if o is not None else np.inf for o in order]) - - np.argsort(order) - - sorted_children = list(np.array(children)[np.argsort(order)]) - else: - sorted_children = [] - return Node( - uuid=record['uuid'], - level=record['level'], - content=record['content'], - children=sorted_children - ) - - @staticmethod - def build_hierarchy(root_uuid: str, nodes: dict) -> Node: - """ - Build a Node hierarchy from a flat structure. - - Args: - root_uuid (str): The UUID of the root node. - nodes (dict): Dictionary of all nodes keyed by UUID. - - Returns: - Node: Root Node with children populated. 
- """ - node_data = nodes[root_uuid] - root_node = Node( - uuid=node_data['uuid'], - level=node_data['level'], - content=node_data['content'], - children=[] - ) - - stack = [(root_node, root_uuid)] - temp_nodes = { - uuid: data - for uuid, data in nodes.items() - } - - for uuid, data in nodes.items(): - parent_uuid = data.get('parent_uuid') - if parent_uuid is None: - continue - parent_data = temp_nodes.get(parent_uuid, {}) - if not parent_data: - continue - if 'children' not in parent_data: - parent_data['children'] = [] - parent_data['children'].append(data) - - return NodeAdapter.from_neo4j(nodes[root_uuid]) - - # node_data = nodes[root_uuid] - # root_node = Node( - # uuid=node_data['uuid'], - # level=node_data['level'], - # content=node_data['content'], - # children=[] - # ) - # - # temp_nodes = { - # uuid: Node(uuid=uuid, level=data['level'], content=data['content'], children=[]) - # for uuid, data in nodes.items() - # } - # - # for uuid, node in temp_nodes.items(): - # if node.uuid == root_uuid: - # continue - # - # # add the node to its parent - # temp_nodes[node_data['parent_uuid']].children.append(node) - # - # # Find and sort child nodes by their ordinal value - # child_nodes = [ - # NodeAdapter.build_hierarchy(child_uuid, nodes) - # for child_uuid, child_data in nodes.items() - # if child_data.get('parent_uuid') == root_uuid - # ] - # root_node.children = sorted(child_nodes, key=lambda x: nodes[x.uuid]['ordinal']) - # - # return root_node - - @staticmethod - def to_chromadb(node: Node): - return { - 'id': node.uuid, - 'document': node.content, - 'metadata': {'level': node.level} - } diff --git a/src/storage/chroma_storage.py b/src/storage/chroma_storage.py deleted file mode 100644 index 159668a..0000000 --- a/src/storage/chroma_storage.py +++ /dev/null @@ -1,90 +0,0 @@ -import configparser -from typing import Optional, List - -import chromadb -import neo4j - -import config -from embedding import Embedding -from models.node import Node - -from config 
import logging - -logger = logging.getLogger(__name__) - - -class ChromaStorage: - def __init__( - self, - embedding: Embedding, - chroma_client: chromadb.Client, - collection_name: str = 'default', - conf: Optional[configparser.ConfigParser] = None - ): - self.config = conf or config.get_config() - self.n_results = int(self.config['rag']['n_results']) - self.client: chromadb.Client = chroma_client - self.collection_name = collection_name - self.collection = self.client.get_or_create_collection(name=collection_name) - self.embedding = embedding - - def delete_collection(self, collection): - try: - self.client.get_collection(collection) - print(f'Deleting all documents in collection {collection}...') - self.client.delete_collection(name=collection) - except chromadb.errors.InvalidCollectionException as e: - pass - - def store_batch(self, nodes: List[Node]): - logger.info(f'Adding/updating documents to collection %s...', self.collection_name) - - # Embed nodes - ids = [str(ch.uuid) for ch in nodes] - embeddings, documents, metadatas = self.embedding.embed_nodes(nodes) - - self.collection.upsert( - ids=ids, - embeddings=embeddings, - documents=documents, - metadatas=metadatas - ) - - def query(self, q_string: str, n_results: Optional[int] = None) -> chromadb.QueryResult: - query_embedding = self.embedding.embed_string(q_string) - - retrieved = self.collection.query( - query_embeddings=[query_embedding], - n_results=n_results or self.n_results, - ) - - return retrieved - - def delete_by_ids(self, ids: List[str]): - """ - Delete documents from ChromaDB by their IDs. 
- """ - self.collection.delete(ids=ids) - - @classmethod - def get_chroma_storage(cls, conf: Optional[configparser.ConfigParser] = None) -> 'ChromaStorage': - conf = conf or config.get_config() - - if conf['chroma']['type'] == 'http': - client = chromadb.HttpClient( - host=conf['chroma']['host'], - port=int(conf['chroma']['port']), - ) - elif conf['chroma']['type'] == 'local': - client = chromadb.PersistentClient( - path=str(config.root_path() / conf.get('storage', 'path')), - ) - else: - # return in-memory client - print("WARNING: Using in-memory client. This is ephemeral") - client = chromadb.EphemeralClient() - - collection = conf.get('storage', 'collection') - embedding = Embedding(conf=conf) - - return ChromaStorage(embedding=embedding, chroma_client=client, collection_name=collection) diff --git a/src/storage/graph_storage.py b/src/storage/graph_storage.py deleted file mode 100644 index 74fa688..0000000 --- a/src/storage/graph_storage.py +++ /dev/null @@ -1,328 +0,0 @@ -import configparser -import logging -from textwrap import dedent -from typing import Optional, List, Dict - -import numpy as np -from neo4j import Driver, GraphDatabase - -import config -from models.node import Node -from storage.adapters import NodeAdapter - -logger = config.logging.getLogger(__name__) -logger.setLevel(logging.DEBUG) - - -class GraphStorage: - def __init__(self, graph_driver: Driver): - """ - Initialize the GraphStorage with a Neo4j driver. - """ - self.driver = graph_driver - self._ensure_constraints() - - def _ensure_constraints(self): - with self.driver.session() as session: - session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (n:Node) REQUIRE n.uuid IS UNIQUE") - - def create_node(self, node: Node): - """ - Create a single node in the Neo4j database. It has no ordinal value. 
- """ - node_data = NodeAdapter.to_neo4j(node) - with self.driver.session() as session: - query = """ - MERGE (n:Node {uuid: $uuid}) - SET n.level = $level, n.content = $content, n.ordinal = $ordinal - RETURN n - """ - session.run(query, **node_data) - - def create_relationship(self, parent_uuid: str, child_uuid: str): - with self.driver.session() as session: - query = """ - MATCH (p:Node {uuid: $parent_uuid}) - MATCH (c:Node {uuid: $child_uuid}) - MERGE (p)-[:HAS_CHILD]->(c) - """ - session.run(query, parent_uuid=parent_uuid, child_uuid=child_uuid) - - def store(self, root_node: Node): - """ - Recursively store a Node and all its children in the Neo4j database. - """ - # Create the root node - self.create_node(root_node) - - # Recursively create child nodes and relationships - for child in root_node.children: - self.store(child) # Store the child node recursively - self.create_relationship(root_node.uuid, child.uuid) - - def batch_store(self, node_list: List[Node], parent_uuid: Optional[str] = None, ordered = False): - """ - Batch insert nodes and relationships into Neo4j. - If parent_uuid is provided, all the nodes will be inserted as children of the parent node. - If ordered is True, the nodes will be inserted in as and ordered set. If children already exist for the - parent_uuid, the new nodes will be appended to the end. 
- """ - nodes = [] - relationships = [] - - insert_query = dedent( - """ - MERGE (n:Node {uuid: $uuid}) - SET n.level = $level, n.content = $content, n.ordinal = $ordinal - """ - ) - for node in node_list: - nd, rel = NodeAdapter.to_neo4j_with_relationships(node) - nodes.extend(nd) - relationships.extend(rel) - - if parent_uuid: - relationships.append((parent_uuid, node.uuid)) - - with self.driver.session() as session: - # Insert all nodes - for node in nodes: - result = session.run(insert_query, **node) - result.to_eager_result() - - # Insert all relationships - for parent_uuid, child_uuid in relationships: - session.run(""" - MATCH (p:Node {uuid: $parent_uuid}) - MATCH (c:Node {uuid: $child_uuid}) - MERGE (p)-[:HAS_CHILD]->(c) - """, parent_uuid=parent_uuid, child_uuid=child_uuid) - - def delete_hierarchy(self, root_uuid: str): - """ - Delete a Node hierarchy from Neo4j by root UUID. - """ - with self.driver.session() as session: - query = """ - MATCH (n:Node {uuid: $root_uuid})-[:HAS_CHILD*0..]->(child) - DETACH DELETE n, child - """ - session.run(query, root_uuid=root_uuid) - - def delete_all(self): - """ - Clear all nodes and relationships from the Neo4j database. - """ - with self.driver.session() as session: - session.run("MATCH (n:Node) DETACH DELETE n") - - def retrieve_parent(self, uuid: str, depth: int = np.inf) -> Node: - """ - Retrieve the parent of a node and all its children. - - Args: - uuid (str): The UUID of the node to retrieve the parent of. - - Returns: - Node: The parent node. 
- """ - - if depth < np.inf: - raise NotImplementedError("Depth limit not implemented") - - with self.driver.session() as session: - # Query to fetch all nodes and their relationships in the hierarchy - query = dedent(""" - MATCH (n:Node {uuid: $uuid})<-[:HAS_CHILD]-(parent) - RETURN parent.uuid AS parent_uuid; - """) - - logger.debug(f"Querying parent for {uuid}") - logger.debug(f"Query: \n{query}") - - result = session.run(query, uuid=uuid).single() - - if not result: - return None - - parent_uuid = result['parent_uuid'] - - return self.retrieve_hierarchy(parent_uuid) - - def retrieve_hierarchy(self, root_uuid: str, depth: int = np.inf) -> Node: - """ - Retrieve a node and all its children as a hierarchy. - - Args: - root_uuid (str): The UUID of the root node to retrieve. - depth (int): The maximum depth to retrieve. - - Returns: - Node: The root node with all its children populated. - """ - - if depth < np.inf: - raise NotImplementedError("Depth limit not implemented yet") - - with self.driver.session() as session: - # Query to fetch all nodes and their relationships in the hierarchy - query = dedent(""" - MATCH (n:Node {uuid: $root_uuid})-[:HAS_CHILD*0..]->(child) - OPTIONAL MATCH (child)<-[:HAS_CHILD]-(parent) - RETURN n.uuid AS root_uuid, - n.level AS root_level, - n.content AS root_content, - collect({parent: parent.uuid, self: child.uuid, contents: child.content, level: child.level, order:child.ordinal}) AS relationships; - """) - logger.debug(f"Querying hierarchy for {root_uuid}") - logger.debug(f"Query: \n{query}") - - result = session.run(query, root_uuid=root_uuid).single() - - if not result: - return None - - # Parse the result to reconstruct the hierarchy - nodes = {} - root_node_data = { - 'uuid': result['root_uuid'], - 'level': result['root_level'], - 'content': result['root_content'], - 'parent_uuid': None, - } - nodes[root_uuid] = root_node_data - - # Process children - for relationship in result['relationships']: - nodes[relationship['self']] = { 
- 'uuid': relationship['self'], - 'level': relationship['level'], - 'content': relationship['contents'], - 'ordinal': relationship['order'], - 'parent_uuid': relationship['parent'], - } - - # Build the hierarchy - return NodeAdapter.build_hierarchy(root_uuid, nodes) - - # def retrieve_hierarchies(self, root_uuids: List[str]) -> Dict[str, Node]: - # """ - # Retrieve multiple node hierarchies from Neo4j by root UUIDs. - # - # Args: - # root_uuids (List[str]): The UUIDs of the root nodes to retrieve. - # - # Returns: - # Dict[str, Node]: A dictionary of root nodes with all their children populated. - # """ - # with self.driver.session() as session: - - # TODO: There seems to be a bug with this query. Fix it. - - # query = """ - # UNWIND $root_uuids AS root_uuid - # MATCH (n:Node {uuid: root_uuid})-[:HAS_CHILD*0..]->(child) - # OPTIONAL MATCH (child)<-[:HAS_CHILD]-(parent) - # WHERE parent.uuid <> child.uuid - # RETURN root_uuid, - # n.uuid AS node_uuid, - # n.level AS node_level, - # n.content AS node_content, - # collect(child.uuid) AS child_uuids, - # collect(child.level) AS child_levels, - # collect(child.content) AS child_contents, - # collect(child.ordinal) AS child_order, - # collect(parent.uuid) AS parent_uuids - # """ - # result = session.run(query, root_uuids=root_uuids) - # - # nodes = {} - # for record in result: - # root_uuid = record['root_uuid'] - # if root_uuid not in nodes: - # nodes[root_uuid] = {} - # - # node_data = { - # 'uuid': record['node_uuid'], - # 'level': record['node_level'], - # 'content': record['node_content'], - # 'parent_uuid': None, - # } - # nodes[root_uuid][record['node_uuid']] = node_data - # - # child_uuids = record['child_uuids'] - # child_levels = record['child_levels'] - # child_contents = record['child_contents'] - # child_order = record['child_order'] - # parent_uuids = record['parent_uuids'] - # - # assert len(child_uuids) == len(child_levels) - # assert len(child_uuids) == len(child_contents) - # assert len(child_uuids) == 
len(child_order) - # # assert len(child_uuids) == len(parent_uuids) - # - # for child_uuid, level, content, ordinal, parent_uuid in zip( - # child_uuids, child_levels, child_contents, child_order, parent_uuids - # ): - # nodes[root_uuid][child_uuid] = { - # 'uuid': child_uuid, - # 'level': level, - # 'content': content, - # 'ordinal': ordinal, - # 'parent_uuid': parent_uuid, - # } - # - # hierarchies = {} - # for root_uuid in root_uuids: - # if root_uuid in nodes: - # hierarchies[root_uuid] = NodeAdapter.build_hierarchy(root_uuid, nodes[root_uuid]) - # - # return hierarchies - - def retrieve_by(self, level: Optional[str] = None, content: Optional[str] = None) -> List[Node]: - """ - Retrieve nodes by their level. - - Args: - level (str): The level of the nodes to retrieve. - content (str): The content of the nodes to retrieve - - Returns: - List[Node]: A list of nodes at the specified level. - """ - with self.driver.session() as session: - clauses = [] - kwargs = {} - if level is not None: - clauses.append("level: $level") - kwargs['level'] = level - if content is not None: - clauses.append("content: $content") - kwargs['content'] = content - - query = dedent(f""" - MATCH - (n:Node {{ {','.join(clauses)} }}) - RETURN - n.uuid AS uuid, - n.level AS level, - n.content AS content - """) - - result = session.run(query, **kwargs) - - uuids = [record['uuid'] for record in result] - - nodes = [self.retrieve_hierarchy(uuid) for uuid in uuids] - - return nodes - - @classmethod - def get_graph_storage(cls, conf: Optional[configparser.ConfigParser] = None) -> 'GraphStorage': - conf = conf or config.get_config() - - neo4j_driver = GraphDatabase.driver(conf['neo4j']['url'], - auth=(conf['neo4j']['user'], conf['neo4j']['password'])) - graph_storage = GraphStorage(neo4j_driver) - - return graph_storage diff --git a/src/storage/hybrid_storage.py b/src/storage/hybrid_storage.py deleted file mode 100644 index 0e26417..0000000 --- a/src/storage/hybrid_storage.py +++ /dev/null @@ -1,78 
+0,0 @@ -import configparser -from typing import Optional, List - -import config -from models.node import Node -from storage.chroma_storage import ChromaStorage -from storage.graph_storage import GraphStorage - - -class HybridStorage: - def __init__(self, chroma_storage: ChromaStorage, graph_storage: GraphStorage): - """ - Initialize HybridStorage with ChromaStorage and GraphStorage instances. - """ - self.chroma_storage: ChromaStorage = chroma_storage - self.graph_storage: GraphStorage = graph_storage - - def store(self, root_node: Node): - """ - Store a Node hierarchy in both ChromaDB and Neo4j. - """ - # Store in Neo4j - self.graph_storage.store(root_node) - - # Flatten the hierarchy for ChromaDB - all_nodes = self.flatten_hierarchy(root_node) - - # Store in ChromaDB - self.chroma_storage.store_batch(all_nodes) - - def flatten_hierarchy(self, root_node: Node) -> List[Node]: - """ - Flatten a Node hierarchy into a list of all nodes. - """ - flat_list = [root_node] - for child in root_node.children: - flat_list.extend(self.flatten_hierarchy(child)) - return flat_list - - def query(self, query_string: str, n_results: Optional[int] = None): - """ - Perform a semantic search in ChromaDB and return the results. - """ - return self.chroma_storage.query(query_string, n_results) - - def delete_all(self): - """ - Delete all data from both ChromaDB and Neo4j. - """ - self.chroma_storage.delete_collection(self.chroma_storage.collection_name) - self.graph_storage.delete_all() - - def retrieve_parent(self, uuid: str) -> Node: - """ - Retrieve the parent of a node. - """ - return self.graph_storage.retrieve_parent(uuid) - - def retrieve_hierarchy(self, root_uuid: str) -> Node: - """ - Retrieve a node and all its children as a hierarchy. - - Args: - root_uuid (str): The UUID of the root node to retrieve. - - Returns: - Node: The root node with all its children populated. 
- """ - return self.graph_storage.retrieve_hierarchy(root_uuid) - - @classmethod - def get_hybrid_storage(cls, conf: Optional[configparser.ConfigParser] = None) -> 'HybridStorage': - conf = conf or config.get_config() - - chroma_storage = ChromaStorage.get_chroma_storage(conf) - graph_storage = GraphStorage.get_graph_storage(conf) - - return cls(chroma_storage, graph_storage) \ No newline at end of file diff --git a/src/storage/transaction_manager.py b/src/storage/transaction_manager.py deleted file mode 100644 index 3fdd7b3..0000000 --- a/src/storage/transaction_manager.py +++ /dev/null @@ -1,93 +0,0 @@ -import configparser -from typing import Optional, List - -from models.node import Node -from storage.hybrid_storage import HybridStorage -from config import logging, get_config - -logger = logging.getLogger(__name__) - -# TODO: implement unit tests for this class - - -class TransactionManager: - def __init__(self, hybrid_storage: HybridStorage): - """ - Initialize TransactionManager with a HybridStorage instance. - """ - self.hybrid_storage = hybrid_storage - - def init_dataset(self, name): - """ - Check if a parent node for the Dataset already exists in the database. If it doesn't, create it. In any case, return - the uuid so that all further documents of this dataset will be linked to it. - """ - - # Check if the dataset already exists - query = { - "level": "dataset", - "content": name - } - dataset_node = self.hybrid_storage.graph_storage.retrieve_by(level='dataset', content=name) - - if dataset_node: - logger.info("Dataset already exists in the database") - return dataset_node[0].uuid - - # If it doesn't, create it - dataset_node = Node(level="dataset", content=name) - self.store_with_transaction(dataset_node) - return dataset_node.uuid - - def store_with_transaction(self, root_nodes: Node | List[Node], parent_uuid: Optional[str] = None): - """ - Store a Node hierarchy in both ChromaDB and Neo4j, ensuring consistency. 
- """ - if not isinstance(root_nodes, list): - root_nodes = [root_nodes] - - try: - # Store in Neo4j - logger.info("Storing in Neo4j...") - self.hybrid_storage.graph_storage.batch_store(root_nodes, parent_uuid) - - # Store in ChromaDB - logger.info("Storing in ChromaDB...") - for node in root_nodes: - # Flatten the hierarchy for ChromaDB - all_nodes = self.hybrid_storage.flatten_hierarchy(node) - - # Store in ChromaDB - self.hybrid_storage.chroma_storage.store_batch(all_nodes) - - except Exception as e: - # Rollback strategy: remove nodes from both systems if one fails - for node in root_nodes: - self.rollback(node) - raise RuntimeError(f"Transaction failed: {e}") - - - def rollback(self, root_node: Node): - """ - Rollback changes made during a failed transaction. - """ - try: - # Delete from Neo4j - self.hybrid_storage.graph_storage.delete_hierarchy(root_node.uuid) - - # Delete from ChromaDB - all_node_uuids = [node.uuid for node in self.hybrid_storage.flatten_hierarchy(root_node)] - self.hybrid_storage.chroma_storage.delete_by_ids(all_node_uuids) - - except Exception as rollback_error: - # Log rollback failure - logger.error(f"Rollback failed: {rollback_error}") - - @classmethod - def get_transaction_manager(cls, conf: Optional[configparser.ConfigParser] = None) -> 'TransactionManager': - conf = conf or get_config() - - # Initialize HybridStorage and TransactionManager - hybrid_storage = HybridStorage.get_hybrid_storage(conf) - transaction_manager = TransactionManager(hybrid_storage) - return transaction_manager diff --git a/tests/conftest.py b/tests/conftest.py index 7b527d6..5d072b9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,7 +12,7 @@ @fixture def static_files(): - return root_path() / "src/frontend/static/css" + return root_path() / "semantic/frontend/static/css" @fixture From 620eb06d44e60b3f9310ea128d1eca1dfccab7c3 Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Mon, 27 Jan 2025 00:06:03 +0100 Subject: [PATCH 02/27] fix imports --- 
.gitignore | 2 +- Dockerfile | 14 + Makefile | 7 +- README.md | 16 +- dags/jurisprudencia.py | 8 +- docker-compose.yml | 23 +- requirements.txt | 393 ++++++++++++++++-- setup.cfg => setup.cfg.bak | 5 +- setup.py | 7 +- src/{semantic => verdictnet}/__init__.py | 0 src/{semantic => verdictnet}/cli.py | 0 src/{semantic => verdictnet}/config.py | 6 +- src/{semantic => verdictnet}/embedding.py | 4 +- src/{semantic => verdictnet}/etl.py | 0 .../frontend/__init__.py | 0 .../frontend/custom_logger.py | 0 .../frontend/paths.py | 0 .../frontend/server/__init__.py | 0 .../frontend/server/app.py | 0 .../frontend/server/dto/__init__.py | 0 .../frontend/server/dto/websocket.py | 0 .../frontend/server/server.py | 0 .../frontend/server/websocket.py | 0 .../frontend/static/css/document_tree.css | 0 .../frontend/static/css/style.css | 0 .../frontend/static/js/main.js | 0 .../frontend/templates/index.html | 0 .../ingestion/README.md | 2 +- .../ingestion/__init__.py | 0 .../ingestion/documentspec.py | 0 .../ingestion/downloader.py | 10 +- .../ingestion/parsers/__init__.py | 0 .../ingestion/parsers/html_parser.py | 0 .../ingestion/parsers/pdf_parser.py | 2 +- .../ingestion/paths.py | 2 +- .../ingestion/resources/codigo_civil.json | 0 .../ingestion/resources/codigo_penal.json | 0 .../render => verdictnet/models}/__init__.py | 0 src/{semantic => verdictnet}/models/node.py | 2 +- src/{semantic => verdictnet}/query.py | 0 src/{semantic => verdictnet}/ragagent.py | 0 src/verdictnet/render/__init__.py | 0 src/{semantic => verdictnet}/render/html.py | 0 .../render/node_renderer.py | 0 .../render/plain_text.py | 0 src/verdictnet/storage/__init__.py | 0 src/verdictnet/storage/adapters.py | 145 +++++++ src/verdictnet/storage/chroma_storage.py | 90 ++++ src/verdictnet/storage/graph_storage.py | 328 +++++++++++++++ src/verdictnet/storage/hybrid_storage.py | 78 ++++ src/verdictnet/storage/transaction_manager.py | 93 +++++ tests/conftest.py | 2 +- 52 files changed, 1172 insertions(+), 67 
deletions(-) create mode 100644 Dockerfile rename setup.cfg => setup.cfg.bak (92%) rename src/{semantic => verdictnet}/__init__.py (100%) rename src/{semantic => verdictnet}/cli.py (100%) rename src/{semantic => verdictnet}/config.py (86%) rename src/{semantic => verdictnet}/embedding.py (93%) rename src/{semantic => verdictnet}/etl.py (100%) rename src/{semantic => verdictnet}/frontend/__init__.py (100%) rename src/{semantic => verdictnet}/frontend/custom_logger.py (100%) rename src/{semantic => verdictnet}/frontend/paths.py (100%) rename src/{semantic => verdictnet}/frontend/server/__init__.py (100%) rename src/{semantic => verdictnet}/frontend/server/app.py (100%) rename src/{semantic => verdictnet}/frontend/server/dto/__init__.py (100%) rename src/{semantic => verdictnet}/frontend/server/dto/websocket.py (100%) rename src/{semantic => verdictnet}/frontend/server/server.py (100%) rename src/{semantic => verdictnet}/frontend/server/websocket.py (100%) rename src/{semantic => verdictnet}/frontend/static/css/document_tree.css (100%) rename src/{semantic => verdictnet}/frontend/static/css/style.css (100%) rename src/{semantic => verdictnet}/frontend/static/js/main.js (100%) rename src/{semantic => verdictnet}/frontend/templates/index.html (100%) rename src/{semantic => verdictnet}/ingestion/README.md (98%) rename src/{semantic => verdictnet}/ingestion/__init__.py (100%) rename src/{semantic => verdictnet}/ingestion/documentspec.py (100%) rename src/{semantic => verdictnet}/ingestion/downloader.py (97%) rename src/{semantic => verdictnet}/ingestion/parsers/__init__.py (100%) rename src/{semantic => verdictnet}/ingestion/parsers/html_parser.py (100%) rename src/{semantic => verdictnet}/ingestion/parsers/pdf_parser.py (96%) rename src/{semantic => verdictnet}/ingestion/paths.py (95%) rename src/{semantic => verdictnet}/ingestion/resources/codigo_civil.json (100%) rename src/{semantic => verdictnet}/ingestion/resources/codigo_penal.json (100%) rename 
src/{semantic/render => verdictnet/models}/__init__.py (100%) rename src/{semantic => verdictnet}/models/node.py (98%) rename src/{semantic => verdictnet}/query.py (100%) rename src/{semantic => verdictnet}/ragagent.py (100%) create mode 100644 src/verdictnet/render/__init__.py rename src/{semantic => verdictnet}/render/html.py (100%) rename src/{semantic => verdictnet}/render/node_renderer.py (100%) rename src/{semantic => verdictnet}/render/plain_text.py (100%) create mode 100644 src/verdictnet/storage/__init__.py create mode 100644 src/verdictnet/storage/adapters.py create mode 100644 src/verdictnet/storage/chroma_storage.py create mode 100644 src/verdictnet/storage/graph_storage.py create mode 100644 src/verdictnet/storage/hybrid_storage.py create mode 100644 src/verdictnet/storage/transaction_manager.py diff --git a/.gitignore b/.gitignore index fee070a..f4ce3e0 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ .ipynb_checkpoints/ */*.egg-info/* .idea/ +build/ __pycache__/ docs/ @@ -11,5 +12,4 @@ data/ datalake/ neo4j_data/ postgress_storage/ -storage/ diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..50bf3a6 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,14 @@ +FROM apache/airflow:2.10.0 + +USER root +COPY requirements.txt . +COPY src/ src/ +COPY setup.py . +COPY config.ini . +RUN chown -R airflow src/ +RUN apt-get update && apt-get install -y build-essential + +# Switch to airflow user to run the application +USER airflow +RUN pip install -r requirements.txt +RUN pip install . \ No newline at end of file diff --git a/Makefile b/Makefile index 417c95d..1568fd3 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: server etl test +.PHONY: server etl test build clean # Default port for the server PORT ?= 8000 @@ -7,6 +7,11 @@ PORT ?= 8000 ETL_PATH ?= /path/to/docspecs FORCE ?= true +# Docker build +.PHONY: build +build: + @docker build -t semantic_airflow . + # Run the server server: @echo "Running the server on port $(PORT)..." 
diff --git a/README.md b/README.md index 4459095..50af8f8 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ This will launch the following services: Finally, run ```sh - $ semantic server + $ verdictnet server ```` to launch the frontend interface, accessible through - [Frontend: `http://localhost:8000`](http://localhost:8000) @@ -33,7 +33,7 @@ Run data pipelines to ingest and process documents. #### Usage: ```sh -$ semantic etl [--path PATH] [--force FORCE] {clean,run} +$ verdictnet etl [--path PATH] [--force FORCE] {clean,run} --path PATH: Path where to look for document specs. --force FORCE: Force download of documents. @@ -46,13 +46,13 @@ run: Ingest data into the vector database. Query the data stored in the system. Usage: ```sh - $ semantic query [--query QUERY] [--n_results N_RESULTS] [--interactive] + $ verdictnet query [--query QUERY] [--n_results N_RESULTS] [--interactive] ``` ### Server Run the server to provide a frontend interface. Usage: ```sh - $ semantic server [-p PORT] + $ verdictnet server [-p PORT] -p, --port PORT: Port to run the frontend on (default: 8000). 
``` @@ -60,16 +60,16 @@ Usage: ## Example usage ```sh # Clean the vector database -semantic etl clean +verdictnet etl clean # Run the ETL pipeline -semantic etl run --path /path/to/docspecs --force true +verdictnet etl run --path /path/to/docspecs --force true # Query the data -semantic query --query "example query" --n_results 5 +verdictnet query --query "example query" --n_results 5 # Run the server -semantic server --port 8080 +verdictnet server --port 8080 ``` ## Configuration diff --git a/dags/jurisprudencia.py b/dags/jurisprudencia.py index 86d41c2..89c180a 100644 --- a/dags/jurisprudencia.py +++ b/dags/jurisprudencia.py @@ -1,3 +1,4 @@ +import pendulum from airflow import DAG from airflow.operators.python import PythonOperator from airflow.utils.dates import days_ago @@ -6,13 +7,13 @@ import json import os -from semantic.ingestion.downloader import get_item_pagination +from verdictnet.ingestion.downloader import get_item_pagination # Define the default arguments default_args = { 'owner': 'airflow', 'depends_on_past': False, - 'start_date': days_ago(8 * 7), # Start date 8 weeks ago + 'start_date': pendulum.today('UTC').add(days=-8 * 7), # Start date 8 weeks ago 'email_on_failure': False, 'email_on_retry': False, 'retries': 1, @@ -24,11 +25,10 @@ 'query_poderjudicial', default_args=default_args, description='Query www.poderjudicial.es and store results in JSON', - schedule_interval='@daily', + schedule='@daily', catchup=True, ): item_pagination = PythonOperator( task_id='get_item_pagination', - provide_context=True, python_callable=get_item_pagination, ) diff --git a/docker-compose.yml b/docker-compose.yml index 4f7dffe..b886363 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,3 @@ -version: '3.8' - services: chromadb: image: chromadb/chroma:latest @@ -29,7 +27,7 @@ services: - ${PWD}/postgress_storage:/var/lib/postgresql/data airflow-webserver: - image: apache/airflow:2.6.1 + build: . 
container_name: airflow-webserver environment: - AIRFLOW__CORE__EXECUTOR=LocalExecutor @@ -47,6 +45,25 @@ services: volumes: - ${PWD}/dags:/opt/airflow/dags # Mount the dags folder + + airflow-scheduler: + build: . + container_name: airflow-scheduler + environment: + - AIRFLOW__CORE__EXECUTOR=LocalExecutor + - AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow + - AIRFLOW__CORE__FERNET_KEY=${FERNET_KEY} + depends_on: + - postgres + - airflow-webserver + command: > + bash -c "airflow scheduler" + env_file: + - .env + volumes: + - ${PWD}/dags:/opt/airflow/dags # Mount the dags folder + + minio: image: minio/minio:latest container_name: minio diff --git a/requirements.txt b/requirements.txt index d11ad23..59e57ad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,29 +1,364 @@ -pathlib~=1.0.1 -fastapi~=0.115.5 -starlette~=0.40.0 -Werkzeug~=3.0.3 -requests~=2.32.0 -beautifulsoup4~=4.12.3 -tika~=2.6.0 -tqdm~=4.66.4 -setuptools~=70.0.0 -pytest~=8.2.0 -websocket-client~=1.8.0 -uvicorn~=0.29.0 -websockets~=12.0 -playwright~=1.43.0 -PyPDF2~=3.0.1 -openparse~=0.7.0 -chromadb~=0.5.20 -numpy~=2.1.3 -torch~=2.5.1 -slugify~=0.0.1 -configparser~=7.1.0 -python-slugify~=8.0.4 -fsspec~=2024.10.0 -semantic_pdf~=0.0.1 -pdfplumber~=0.11.4 -pandas~=2.2.3 -neomodel~=5.4.1 -neo4j~=5.26.0 -pytest-mock~=3.14.0 +acres==0.1.0 +aiobotocore==2.15.2 +aiofiles==23.2.1 +aiohappyeyeballs==2.4.4 +aiohttp==3.11.10 +aioitertools==0.12.0 +aiosignal==1.3.1 +aiosqlite==0.20.0 +alembic==1.14.1 +amqp==5.3.1 +annotated-types==0.7.0 +anyio==4.6.2.post1 +apache-airflow==2.10.4 +apache-airflow-providers-celery==3.8.5 +apache-airflow-providers-common-compat==1.3.0 +apache-airflow-providers-common-io==1.5.0 +apache-airflow-providers-common-sql==1.21.0 +apache-airflow-providers-fab==1.5.2 +apache-airflow-providers-ftp==3.12.0 +apache-airflow-providers-google==12.0.0 +apache-airflow-providers-http==5.0.0 +apache-airflow-providers-imap==3.8.0 
+apache-airflow-providers-smtp==1.9.0 +apache-airflow-providers-sqlite==4.0.0 +apispec==6.8.1 +argcomplete==3.5.3 +asgiref==3.8.1 +attrs==24.2.0 +babel==2.16.0 +backoff==2.2.1 +bcrypt==4.2.1 +beautifulsoup4==4.12.3 +billiard==4.2.1 +blinker==1.9.0 +botocore==1.35.36 +build==1.2.2.post1 +cachelib==0.9.0 +cachetools==5.5.0 +captcha-solver==0.1.5 +cattrs==24.1.2 +celery==5.4.0 +certifi==2024.8.30 +cffi==1.17.1 +chardet==5.2.0 +charset-normalizer==3.4.0 +chroma-hnswlib==0.7.6 +chromadb==0.5.20 +ci-info==0.3.0 +click==8.1.7 +click-didyoumean==0.3.1 +click-plugins==1.1.1 +click-repl==0.3.0 +clickclick==20.10.2 +colorama==0.4.6 +coloredlogs==15.0.1 +colorlog==6.9.0 +configobj==5.0.9 +configparser==7.1.0 +ConfigUpdater==3.2 +connexion==2.14.2 +contourpy==1.3.1 +cron-descriptor==1.4.5 +croniter==6.0.0 +cryptography==44.0.0 +cycler==0.12.1 +dataclasses-json==0.6.7 +db-dtypes==1.4.0 +decorator==5.1.1 +Deprecated==1.2.15 +dill==0.3.9 +dirtyjson==1.0.8 +distro==1.9.0 +dnspython==2.7.0 +docstring_parser==0.16 +durationpy==0.9 +email_validator==2.2.0 +etelemetry==0.3.1 +fastapi==0.115.5 +fastapi-cli==0.0.5 +filelock==3.16.1 +filetype==1.2.0 +fitz==0.0.1.dev2 +Flask==2.2.5 +Flask-AppBuilder==4.5.2 +Flask-Babel==2.0.0 +Flask-Caching==2.3.0 +Flask-JWT-Extended==4.7.1 +Flask-Limiter==3.10.1 +Flask-Login==0.6.3 +Flask-Session==0.5.0 +Flask-SQLAlchemy==2.5.1 +Flask-WTF==1.2.2 +flatbuffers==24.3.25 +flower==2.0.1 +fonttools==4.55.0 +frozenlist==1.5.0 +fsspec +gcloud-aio-auth==5.3.2 +gcloud-aio-bigquery==7.1.0 +gcloud-aio-storage==9.3.0 +gcsfs +google-ads==25.1.0 +google-analytics-admin==0.23.3 +google-api-core==2.24.0 +google-api-python-client==2.159.0 +google-auth==2.36.0 +google-auth-httplib2==0.2.0 +google-auth-oauthlib==1.2.1 +google-cloud-aiplatform==1.78.0 +google-cloud-alloydb==0.4.1 +google-cloud-appengine-logging==1.5.0 +google-cloud-audit-log==0.3.0 +google-cloud-automl==2.15.0 +google-cloud-batch==0.17.33 +google-cloud-bigquery==3.29.0 
+google-cloud-bigquery-datatransfer==3.18.0 +google-cloud-bigtable==2.28.1 +google-cloud-build==3.29.0 +google-cloud-compute==1.23.0 +google-cloud-container==2.55.0 +google-cloud-core==2.4.1 +google-cloud-datacatalog==3.24.1 +google-cloud-dataflow-client==0.8.15 +google-cloud-dataform==0.5.14 +google-cloud-dataplex==2.6.0 +google-cloud-dataproc==5.16.0 +google-cloud-dataproc-metastore==1.17.0 +google-cloud-dlp==3.26.0 +google-cloud-kms==3.2.2 +google-cloud-language==2.16.0 +google-cloud-logging==3.11.3 +google-cloud-memcache==1.11.0 +google-cloud-monitoring==2.26.0 +google-cloud-orchestration-airflow==1.16.1 +google-cloud-os-login==2.16.0 +google-cloud-pubsub==2.27.3 +google-cloud-redis==2.17.0 +google-cloud-resource-manager==1.14.0 +google-cloud-run==0.10.14 +google-cloud-secret-manager==2.22.1 +google-cloud-spanner==3.51.0 +google-cloud-speech==2.30.0 +google-cloud-storage==2.19.0 +google-cloud-storage-transfer==1.15.0 +google-cloud-tasks==2.18.0 +google-cloud-texttospeech==2.24.0 +google-cloud-translate==3.19.0 +google-cloud-videointelligence==2.15.0 +google-cloud-vision==3.9.0 +google-cloud-workflows==1.16.0 +google-crc32c==1.6.0 +google-re2==1.1.20240702 +google-resumable-media==2.7.2 +googleapis-common-protos==1.66.0 +greenlet==3.0.3 +grpc-google-iam-v1==0.14.0 +grpc-interceptor==0.15.4 +grpcio==1.70.0 +grpcio-gcp==0.2.2 +grpcio-status==1.70.0 +gunicorn==23.0.0 +h11==0.14.0 +httpcore==1.0.7 +httplib2==0.22.0 +httptools==0.6.4 +httpx==0.27.2 +huggingface-hub==0.26.2 +humanfriendly==10.0 +humanize==4.11.0 +idna==3.10 +immutabledict==4.2.1 +importlib_metadata==8.5.0 +importlib_resources==6.4.5 +inflection==0.5.1 +iniconfig==2.0.0 +isodate==0.6.1 +itsdangerous==2.2.0 +Jinja2==3.1.4 +jiter==0.8.0 +jmespath==1.0.1 +joblib==1.4.2 +json-merge-patch==0.2 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +kiwisolver==1.4.7 +kombu==5.4.2 +kubernetes==31.0.0 +lazy-object-proxy==1.10.0 +limits==4.0.1 +linkify-it-py==2.0.3 +llama-cloud==0.1.6 +llama-index==0.12.5 
+llama-index-agent-openai==0.4.0 +llama-index-cli==0.4.0 +llama-index-core==0.12.5 +llama-index-embeddings-openai==0.3.1 +llama-index-indices-managed-llama-cloud==0.6.3 +llama-index-legacy==0.9.48.post4 +llama-index-llms-openai==0.3.10 +llama-index-multi-modal-llms-openai==0.4.0 +llama-index-program-openai==0.3.1 +llama-index-question-gen-openai==0.3.0 +llama-index-readers-file==0.4.1 +llama-index-readers-llama-parse==0.4.0 +llama-parse==0.5.17 +lockfile==0.12.2 +looker-sdk==25.0.0 +looseversion==1.3.0 +lxml==5.3.0 +Mako==1.3.8 +markdown-it-py==3.0.0 +MarkupSafe==3.0.2 +marshmallow==3.23.1 +marshmallow-oneofschema==3.1.1 +marshmallow-sqlalchemy==0.28.2 +matplotlib==3.9.2 +mdit-py-plugins==0.4.2 +mdurl==0.1.2 +methodtools==0.4.7 +mmh3==5.0.1 +monotonic==1.6 +more-itertools==10.6.0 +mpmath==1.3.0 +multidict==6.1.0 +mypy-extensions==1.0.0 +neo4j==5.26.0 +neomodel==5.4.1 +nest-asyncio==1.6.0 +networkx==3.4.2 +nibabel==5.3.2 +nipype==1.9.1 +nltk==3.9.1 +numpy==1.26.4 +oauthlib==3.2.2 +onnxruntime==1.20.1 +openai==1.57.2 +openparse==0.7.0 +ordered-set==4.1.0 +orjson==3.10.12 +overrides==7.7.0 +packaging==24.2 +pandas==2.1.4 +pandas-gbq==0.26.1 +pathlib==1.0.1 +pathspec==0.12.1 +pdf2image==1.17.0 +pdfminer.six==20231228 +pdfplumber==0.11.4 +pendulum==3.0.0 +pillow==11.0.0 +playwright==1.43.0 +pluggy==1.5.0 +posthog==3.7.3 +prison==0.2.1 +prometheus_client==0.21.1 +prompt_toolkit==3.0.50 +propcache==0.2.1 +proto-plus==1.25.0 +protobuf==5.29.0 +prov==2.0.1 +psutil==6.1.1 +puremagic==1.28 +pyarrow==18.1.0 +pyasn1==0.6.1 +pyasn1_modules==0.4.0 +pycparser==2.22 +pydantic==2.10.2 +pydantic_core==2.27.1 +pydata-google-auth==1.9.1 +pydot==3.0.2 +pyee==11.1.0 +Pygments==2.18.0 +PyJWT==2.10.1 +PyMuPDF==1.24.14 +pyOpenSSL==25.0.0 +pyparsing==3.2.0 +pypdf==5.1.0 +PyPDF2==3.0.1 +pypdfium2==4.30.0 +PyPika==0.48.9 +pyproject_hooks==1.2.0 +pytesseract==0.3.13 +pytest==8.2.2 +pytest-mock==3.14.0 +python-daemon==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 
+python-multipart==0.0.17 +python-nvd3==0.16.0 +python-slugify==8.0.4 +pytz==2024.2 +pyxnat==1.6.2 +PyYAML==6.0.2 +rdflib==6.3.2 +redis==5.2.1 +referencing==0.36.2 +regex==2024.11.6 +reportlab==4.2.5 +requests==2.32.3 +requests-oauthlib==2.0.0 +requests-toolbelt==1.0.0 +rfc3339-validator==0.1.4 +rich==13.9.4 +rich-argparse==1.6.0 +rpds-py==0.22.3 +rsa==4.9 +s3fs==2024.10.0 +safetensors==0.4.5 +scikit-learn==1.5.2 +scipy==1.14.1 +sentence-transformers==3.3.1 +setproctitle==1.3.4 +setuptools==70.0.0 +shapely==2.0.6 +shellingham==1.5.4 +simplejson==3.19.3 +six==1.16.0 +slugify==0.0.1 +sniffio==1.3.1 +soupsieve==2.6 +SQLAlchemy==1.4.54 +sqlalchemy-bigquery==1.12.1 +SQLAlchemy-JSONField==1.0.2 +sqlalchemy-spanner==1.8.0 +SQLAlchemy-Utils==0.41.2 +sqlparse==0.5.3 +starlette==0.40.0 +striprtf==0.0.26 +sympy==1.13.1 +tabulate==0.9.0 +tenacity==8.5.0 +termcolor==2.5.0 +text-unidecode==1.3 +threadpoolctl==3.5.0 +tika==2.6.0 +tiktoken==0.8.0 +time-machine==2.16.0 +tokenizers==0.20.3 +torch==2.5.1 +tornado==6.4.2 +tqdm==4.66.6 +traits==6.4.3 +transformers==4.46.3 +typer==0.13.1 +typing-inspect==0.9.0 +typing_extensions==4.12.2 +tzdata==2024.2 +uc-micro-py==1.0.3 +universal_pathlib==0.2.6 +uritemplate==4.1.1 +urllib3==2.2.3 +uvicorn==0.29.0 +uvloop==0.21.0 +vine==5.1.0 +watchfiles==1.0.0 +wcwidth==0.2.13 +websocket-client==1.8.0 +websockets==12.0 +wirerope==1.0.0 +wordcloud==1.9.4 +wrapt==1.17.0 +WTForms==3.2.1 +yarl==1.18.3 +zipp==3.21.0 diff --git a/setup.cfg b/setup.cfg.bak similarity index 92% rename from setup.cfg rename to setup.cfg.bak index 09a51e2..ed112a2 100644 --- a/setup.cfg +++ b/setup.cfg.bak @@ -7,7 +7,7 @@ license = MIT [options] package_dir= - =src + =semantic packages = find: install_requires = numpy @@ -19,10 +19,9 @@ install_requires = matplotlib pyarrow PyMuPDF - wordcloud [options.packages.find] -where=src +where=semantic [options.entry_points] console_scripts = diff --git a/setup.py b/setup.py index 726ea4c..8339b5a 100644 --- a/setup.py +++ b/setup.py 
@@ -1,10 +1,11 @@ from setuptools import setup, find_packages setup( - name='semantic_pdf', + name='verdictnet', version='0.0.1', - description='A tool to do semantic_pdf queries on large documents', + description='A tool to do verdictnet queries on large documents', author='Alex Monras', license='MIT', - packages=find_packages('src') + packages=find_packages(where='src'), + package_dir={'': 'src'} ) diff --git a/src/semantic/__init__.py b/src/verdictnet/__init__.py similarity index 100% rename from src/semantic/__init__.py rename to src/verdictnet/__init__.py diff --git a/src/semantic/cli.py b/src/verdictnet/cli.py similarity index 100% rename from src/semantic/cli.py rename to src/verdictnet/cli.py diff --git a/src/semantic/config.py b/src/verdictnet/config.py similarity index 86% rename from src/semantic/config.py rename to src/verdictnet/config.py index 39a6583..8c6ab36 100644 --- a/src/semantic/config.py +++ b/src/verdictnet/config.py @@ -14,15 +14,15 @@ def root_path(): - return Path(__file__).parent.parent + return Path(__file__).parent.parent.parent def get_config(): # Create a ConfigParser instance config = configparser.ConfigParser() - # Load the configuration file - config.read(root_path() / 'config.ini') + # Load the configuration file from the current folder + config.read(filenames=['config.ini', root_path() / 'config.ini']) return config diff --git a/src/semantic/embedding.py b/src/verdictnet/embedding.py similarity index 93% rename from src/semantic/embedding.py rename to src/verdictnet/embedding.py index 281c23f..0dadaee 100644 --- a/src/semantic/embedding.py +++ b/src/verdictnet/embedding.py @@ -6,8 +6,8 @@ from sentence_transformers import SentenceTransformer from torch import Tensor -from config import root_path, get_config -from models.node import Node +from verdictnet.config import root_path, get_config +from verdictnet.models.node import Node class Embedding: diff --git a/src/semantic/etl.py b/src/verdictnet/etl.py similarity index 100% rename 
from src/semantic/etl.py rename to src/verdictnet/etl.py diff --git a/src/semantic/frontend/__init__.py b/src/verdictnet/frontend/__init__.py similarity index 100% rename from src/semantic/frontend/__init__.py rename to src/verdictnet/frontend/__init__.py diff --git a/src/semantic/frontend/custom_logger.py b/src/verdictnet/frontend/custom_logger.py similarity index 100% rename from src/semantic/frontend/custom_logger.py rename to src/verdictnet/frontend/custom_logger.py diff --git a/src/semantic/frontend/paths.py b/src/verdictnet/frontend/paths.py similarity index 100% rename from src/semantic/frontend/paths.py rename to src/verdictnet/frontend/paths.py diff --git a/src/semantic/frontend/server/__init__.py b/src/verdictnet/frontend/server/__init__.py similarity index 100% rename from src/semantic/frontend/server/__init__.py rename to src/verdictnet/frontend/server/__init__.py diff --git a/src/semantic/frontend/server/app.py b/src/verdictnet/frontend/server/app.py similarity index 100% rename from src/semantic/frontend/server/app.py rename to src/verdictnet/frontend/server/app.py diff --git a/src/semantic/frontend/server/dto/__init__.py b/src/verdictnet/frontend/server/dto/__init__.py similarity index 100% rename from src/semantic/frontend/server/dto/__init__.py rename to src/verdictnet/frontend/server/dto/__init__.py diff --git a/src/semantic/frontend/server/dto/websocket.py b/src/verdictnet/frontend/server/dto/websocket.py similarity index 100% rename from src/semantic/frontend/server/dto/websocket.py rename to src/verdictnet/frontend/server/dto/websocket.py diff --git a/src/semantic/frontend/server/server.py b/src/verdictnet/frontend/server/server.py similarity index 100% rename from src/semantic/frontend/server/server.py rename to src/verdictnet/frontend/server/server.py diff --git a/src/semantic/frontend/server/websocket.py b/src/verdictnet/frontend/server/websocket.py similarity index 100% rename from src/semantic/frontend/server/websocket.py rename to 
src/verdictnet/frontend/server/websocket.py diff --git a/src/semantic/frontend/static/css/document_tree.css b/src/verdictnet/frontend/static/css/document_tree.css similarity index 100% rename from src/semantic/frontend/static/css/document_tree.css rename to src/verdictnet/frontend/static/css/document_tree.css diff --git a/src/semantic/frontend/static/css/style.css b/src/verdictnet/frontend/static/css/style.css similarity index 100% rename from src/semantic/frontend/static/css/style.css rename to src/verdictnet/frontend/static/css/style.css diff --git a/src/semantic/frontend/static/js/main.js b/src/verdictnet/frontend/static/js/main.js similarity index 100% rename from src/semantic/frontend/static/js/main.js rename to src/verdictnet/frontend/static/js/main.js diff --git a/src/semantic/frontend/templates/index.html b/src/verdictnet/frontend/templates/index.html similarity index 100% rename from src/semantic/frontend/templates/index.html rename to src/verdictnet/frontend/templates/index.html diff --git a/src/semantic/ingestion/README.md b/src/verdictnet/ingestion/README.md similarity index 98% rename from src/semantic/ingestion/README.md rename to src/verdictnet/ingestion/README.md index 239f454..fef01c9 100644 --- a/src/semantic/ingestion/README.md +++ b/src/verdictnet/ingestion/README.md @@ -2,7 +2,7 @@ There are currently two ingestion processes in place: ```sh -$ semantic erl run +$ verdictnet erl run ``` ingests the documents specified in `src/ingestion/resources/`, namely: - Código Civil diff --git a/src/semantic/ingestion/__init__.py b/src/verdictnet/ingestion/__init__.py similarity index 100% rename from src/semantic/ingestion/__init__.py rename to src/verdictnet/ingestion/__init__.py diff --git a/src/semantic/ingestion/documentspec.py b/src/verdictnet/ingestion/documentspec.py similarity index 100% rename from src/semantic/ingestion/documentspec.py rename to src/verdictnet/ingestion/documentspec.py diff --git a/src/semantic/ingestion/downloader.py 
b/src/verdictnet/ingestion/downloader.py similarity index 97% rename from src/semantic/ingestion/downloader.py rename to src/verdictnet/ingestion/downloader.py index 529de2a..8f76502 100644 --- a/src/semantic/ingestion/downloader.py +++ b/src/verdictnet/ingestion/downloader.py @@ -9,11 +9,11 @@ from bs4 import BeautifulSoup from tqdm import tqdm -from ingestion.parsers.pdf_parser import extract_paragraphs -from ingestion.paths import raw_path, refined_path, fsspec_walk -from config import get_config, logging, get_fs -from models.node import Node -from storage.transaction_manager import TransactionManager +from verdictnet.ingestion.parsers.pdf_parser import extract_paragraphs +from verdictnet.ingestion.paths import raw_path, refined_path, fsspec_walk +from verdictnet.config import get_config, logging, get_fs +from verdictnet.models.node import Node +from verdictnet.storage.transaction_manager import TransactionManager logger = logging.getLogger(__name__) diff --git a/src/semantic/ingestion/parsers/__init__.py b/src/verdictnet/ingestion/parsers/__init__.py similarity index 100% rename from src/semantic/ingestion/parsers/__init__.py rename to src/verdictnet/ingestion/parsers/__init__.py diff --git a/src/semantic/ingestion/parsers/html_parser.py b/src/verdictnet/ingestion/parsers/html_parser.py similarity index 100% rename from src/semantic/ingestion/parsers/html_parser.py rename to src/verdictnet/ingestion/parsers/html_parser.py diff --git a/src/semantic/ingestion/parsers/pdf_parser.py b/src/verdictnet/ingestion/parsers/pdf_parser.py similarity index 96% rename from src/semantic/ingestion/parsers/pdf_parser.py rename to src/verdictnet/ingestion/parsers/pdf_parser.py index 5c69312..0876e1e 100644 --- a/src/semantic/ingestion/parsers/pdf_parser.py +++ b/src/verdictnet/ingestion/parsers/pdf_parser.py @@ -1,6 +1,6 @@ import pdfplumber -from models.node import Node +from verdictnet.models.node import Node def extract_paragraphs(pdf_path) -> Node: diff --git 
a/src/semantic/ingestion/paths.py b/src/verdictnet/ingestion/paths.py similarity index 95% rename from src/semantic/ingestion/paths.py rename to src/verdictnet/ingestion/paths.py index b0a4224..00be14f 100644 --- a/src/semantic/ingestion/paths.py +++ b/src/verdictnet/ingestion/paths.py @@ -2,7 +2,7 @@ import fsspec -from config import root_path, get_config, logging +from verdictnet.config import root_path, get_config, logging logger = logging.getLogger(__name__) diff --git a/src/semantic/ingestion/resources/codigo_civil.json b/src/verdictnet/ingestion/resources/codigo_civil.json similarity index 100% rename from src/semantic/ingestion/resources/codigo_civil.json rename to src/verdictnet/ingestion/resources/codigo_civil.json diff --git a/src/semantic/ingestion/resources/codigo_penal.json b/src/verdictnet/ingestion/resources/codigo_penal.json similarity index 100% rename from src/semantic/ingestion/resources/codigo_penal.json rename to src/verdictnet/ingestion/resources/codigo_penal.json diff --git a/src/semantic/render/__init__.py b/src/verdictnet/models/__init__.py similarity index 100% rename from src/semantic/render/__init__.py rename to src/verdictnet/models/__init__.py diff --git a/src/semantic/models/node.py b/src/verdictnet/models/node.py similarity index 98% rename from src/semantic/models/node.py rename to src/verdictnet/models/node.py index c1ee23a..07497fc 100644 --- a/src/semantic/models/node.py +++ b/src/verdictnet/models/node.py @@ -3,7 +3,7 @@ from dataclasses import dataclass, field from typing import List -from config import get_fs +from verdictnet.config import get_fs class AutoIncrement: # pylint: disable=too-few-public-methods diff --git a/src/semantic/query.py b/src/verdictnet/query.py similarity index 100% rename from src/semantic/query.py rename to src/verdictnet/query.py diff --git a/src/semantic/ragagent.py b/src/verdictnet/ragagent.py similarity index 100% rename from src/semantic/ragagent.py rename to src/verdictnet/ragagent.py diff --git 
a/src/verdictnet/render/__init__.py b/src/verdictnet/render/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/semantic/render/html.py b/src/verdictnet/render/html.py similarity index 100% rename from src/semantic/render/html.py rename to src/verdictnet/render/html.py diff --git a/src/semantic/render/node_renderer.py b/src/verdictnet/render/node_renderer.py similarity index 100% rename from src/semantic/render/node_renderer.py rename to src/verdictnet/render/node_renderer.py diff --git a/src/semantic/render/plain_text.py b/src/verdictnet/render/plain_text.py similarity index 100% rename from src/semantic/render/plain_text.py rename to src/verdictnet/render/plain_text.py diff --git a/src/verdictnet/storage/__init__.py b/src/verdictnet/storage/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/verdictnet/storage/adapters.py b/src/verdictnet/storage/adapters.py new file mode 100644 index 0000000..1d8ecd2 --- /dev/null +++ b/src/verdictnet/storage/adapters.py @@ -0,0 +1,145 @@ +from typing import Optional + +import numpy as np + +from verdictnet.models.node import Node + + +class NodeAdapter: + @staticmethod + def to_neo4j(node: Node, ordinal: Optional[int] = None): + out = { + 'uuid': node.uuid, + 'ordinal': ordinal, + 'level': node.level, + 'content': node.content + } + + return out + + @staticmethod + def to_neo4j_with_relationships(node: Node, ordinal: Optional[int] = None): + """ + Recursively extract all nodes and relationships from a hierarchy. + + Args: + node (Node): The root node of the hierarchy. + ordinal (Optional[int]): The ordinal value of the node in its parent's children list. + + Returns: + - nodes: List of dictionaries representing nodes. + - relationships: List of tuples representing (parent_uuid, child_uuid) relationships. 
+ """ + nodes = [] + relationships = [] + + # Convert the root node + nodes.append(NodeAdapter.to_neo4j(node, ordinal)) + + # Recursively process children + for child_ordinals, child in enumerate(node.children): + # Add the relationship to child + relationships.append((node.uuid, child.uuid)) + + # Add grand child's nodes and relationships + child_nodes, child_relationships = NodeAdapter.to_neo4j_with_relationships(child, child_ordinals) + nodes.extend(child_nodes) # Append all child nodes + relationships.extend(child_relationships) # Append all child relationships + + return nodes, relationships + + @classmethod + def from_neo4j(cls, record: dict) -> Node: + if record.get('children', []): + children, order = zip(*[[cls.from_neo4j(ch), ch['ordinal']] for ch in record.get('children', [])]) + + order = np.array([o if o is not None else np.inf for o in order]) + + np.argsort(order) + + sorted_children = list(np.array(children)[np.argsort(order)]) + else: + sorted_children = [] + return Node( + uuid=record['uuid'], + level=record['level'], + content=record['content'], + children=sorted_children + ) + + @staticmethod + def build_hierarchy(root_uuid: str, nodes: dict) -> Node: + """ + Build a Node hierarchy from a flat structure. + + Args: + root_uuid (str): The UUID of the root node. + nodes (dict): Dictionary of all nodes keyed by UUID. + + Returns: + Node: Root Node with children populated. 
+ """ + node_data = nodes[root_uuid] + root_node = Node( + uuid=node_data['uuid'], + level=node_data['level'], + content=node_data['content'], + children=[] + ) + + stack = [(root_node, root_uuid)] + temp_nodes = { + uuid: data + for uuid, data in nodes.items() + } + + for uuid, data in nodes.items(): + parent_uuid = data.get('parent_uuid') + if parent_uuid is None: + continue + parent_data = temp_nodes.get(parent_uuid, {}) + if not parent_data: + continue + if 'children' not in parent_data: + parent_data['children'] = [] + parent_data['children'].append(data) + + return NodeAdapter.from_neo4j(nodes[root_uuid]) + + # node_data = nodes[root_uuid] + # root_node = Node( + # uuid=node_data['uuid'], + # level=node_data['level'], + # content=node_data['content'], + # children=[] + # ) + # + # temp_nodes = { + # uuid: Node(uuid=uuid, level=data['level'], content=data['content'], children=[]) + # for uuid, data in nodes.items() + # } + # + # for uuid, node in temp_nodes.items(): + # if node.uuid == root_uuid: + # continue + # + # # add the node to its parent + # temp_nodes[node_data['parent_uuid']].children.append(node) + # + # # Find and sort child nodes by their ordinal value + # child_nodes = [ + # NodeAdapter.build_hierarchy(child_uuid, nodes) + # for child_uuid, child_data in nodes.items() + # if child_data.get('parent_uuid') == root_uuid + # ] + # root_node.children = sorted(child_nodes, key=lambda x: nodes[x.uuid]['ordinal']) + # + # return root_node + + @staticmethod + def to_chromadb(node: Node): + return { + 'id': node.uuid, + 'document': node.content, + 'metadata': {'level': node.level} + } diff --git a/src/verdictnet/storage/chroma_storage.py b/src/verdictnet/storage/chroma_storage.py new file mode 100644 index 0000000..7638b99 --- /dev/null +++ b/src/verdictnet/storage/chroma_storage.py @@ -0,0 +1,90 @@ +import configparser +from typing import Optional, List + +import chromadb +import neo4j + +from verdictnet import config +from verdictnet.embedding import 
Embedding +from verdictnet.models.node import Node + +from verdictnet.config import logging + +logger = logging.getLogger(__name__) + + +class ChromaStorage: + def __init__( + self, + embedding: Embedding, + chroma_client: chromadb.Client, + collection_name: str = 'default', + conf: Optional[configparser.ConfigParser] = None + ): + self.config = conf or config.get_config() + self.n_results = int(self.config['rag']['n_results']) + self.client: chromadb.Client = chroma_client + self.collection_name = collection_name + self.collection = self.client.get_or_create_collection(name=collection_name) + self.embedding = embedding + + def delete_collection(self, collection): + try: + self.client.get_collection(collection) + print(f'Deleting all documents in collection {collection}...') + self.client.delete_collection(name=collection) + except chromadb.errors.InvalidCollectionException as e: + pass + + def store_batch(self, nodes: List[Node]): + logger.info(f'Adding/updating documents to collection %s...', self.collection_name) + + # Embed nodes + ids = [str(ch.uuid) for ch in nodes] + embeddings, documents, metadatas = self.embedding.embed_nodes(nodes) + + self.collection.upsert( + ids=ids, + embeddings=embeddings, + documents=documents, + metadatas=metadatas + ) + + def query(self, q_string: str, n_results: Optional[int] = None) -> chromadb.QueryResult: + query_embedding = self.embedding.embed_string(q_string) + + retrieved = self.collection.query( + query_embeddings=[query_embedding], + n_results=n_results or self.n_results, + ) + + return retrieved + + def delete_by_ids(self, ids: List[str]): + """ + Delete documents from ChromaDB by their IDs. 
+ """ + self.collection.delete(ids=ids) + + @classmethod + def get_chroma_storage(cls, conf: Optional[configparser.ConfigParser] = None) -> 'ChromaStorage': + conf = conf or config.get_config() + + if conf['chroma']['type'] == 'http': + client = chromadb.HttpClient( + host=conf['chroma']['host'], + port=int(conf['chroma']['port']), + ) + elif conf['chroma']['type'] == 'local': + client = chromadb.PersistentClient( + path=str(config.root_path() / conf.get('storage', 'path')), + ) + else: + # return in-memory client + print("WARNING: Using in-memory client. This is ephemeral") + client = chromadb.EphemeralClient() + + collection = conf.get('storage', 'collection') + embedding = Embedding(conf=conf) + + return ChromaStorage(embedding=embedding, chroma_client=client, collection_name=collection) diff --git a/src/verdictnet/storage/graph_storage.py b/src/verdictnet/storage/graph_storage.py new file mode 100644 index 0000000..9876423 --- /dev/null +++ b/src/verdictnet/storage/graph_storage.py @@ -0,0 +1,328 @@ +import configparser +import logging +from textwrap import dedent +from typing import Optional, List, Dict + +import numpy as np +from neo4j import Driver, GraphDatabase + +from verdictnet import config +from verdictnet.models.node import Node +from verdictnet.storage.adapters import NodeAdapter + +logger = config.logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + + +class GraphStorage: + def __init__(self, graph_driver: Driver): + """ + Initialize the GraphStorage with a Neo4j driver. + """ + self.driver = graph_driver + self._ensure_constraints() + + def _ensure_constraints(self): + with self.driver.session() as session: + session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (n:Node) REQUIRE n.uuid IS UNIQUE") + + def create_node(self, node: Node): + """ + Create a single node in the Neo4j database. It has no ordinal value. 
+ """ + node_data = NodeAdapter.to_neo4j(node) + with self.driver.session() as session: + query = """ + MERGE (n:Node {uuid: $uuid}) + SET n.level = $level, n.content = $content, n.ordinal = $ordinal + RETURN n + """ + session.run(query, **node_data) + + def create_relationship(self, parent_uuid: str, child_uuid: str): + with self.driver.session() as session: + query = """ + MATCH (p:Node {uuid: $parent_uuid}) + MATCH (c:Node {uuid: $child_uuid}) + MERGE (p)-[:HAS_CHILD]->(c) + """ + session.run(query, parent_uuid=parent_uuid, child_uuid=child_uuid) + + def store(self, root_node: Node): + """ + Recursively store a Node and all its children in the Neo4j database. + """ + # Create the root node + self.create_node(root_node) + + # Recursively create child nodes and relationships + for child in root_node.children: + self.store(child) # Store the child node recursively + self.create_relationship(root_node.uuid, child.uuid) + + def batch_store(self, node_list: List[Node], parent_uuid: Optional[str] = None, ordered = False): + """ + Batch insert nodes and relationships into Neo4j. + If parent_uuid is provided, all the nodes will be inserted as children of the parent node. + If ordered is True, the nodes will be inserted in as and ordered set. If children already exist for the + parent_uuid, the new nodes will be appended to the end. 
+ """ + nodes = [] + relationships = [] + + insert_query = dedent( + """ + MERGE (n:Node {uuid: $uuid}) + SET n.level = $level, n.content = $content, n.ordinal = $ordinal + """ + ) + for node in node_list: + nd, rel = NodeAdapter.to_neo4j_with_relationships(node) + nodes.extend(nd) + relationships.extend(rel) + + if parent_uuid: + relationships.append((parent_uuid, node.uuid)) + + with self.driver.session() as session: + # Insert all nodes + for node in nodes: + result = session.run(insert_query, **node) + result.to_eager_result() + + # Insert all relationships + for parent_uuid, child_uuid in relationships: + session.run(""" + MATCH (p:Node {uuid: $parent_uuid}) + MATCH (c:Node {uuid: $child_uuid}) + MERGE (p)-[:HAS_CHILD]->(c) + """, parent_uuid=parent_uuid, child_uuid=child_uuid) + + def delete_hierarchy(self, root_uuid: str): + """ + Delete a Node hierarchy from Neo4j by root UUID. + """ + with self.driver.session() as session: + query = """ + MATCH (n:Node {uuid: $root_uuid})-[:HAS_CHILD*0..]->(child) + DETACH DELETE n, child + """ + session.run(query, root_uuid=root_uuid) + + def delete_all(self): + """ + Clear all nodes and relationships from the Neo4j database. + """ + with self.driver.session() as session: + session.run("MATCH (n:Node) DETACH DELETE n") + + def retrieve_parent(self, uuid: str, depth: int = np.inf) -> Node: + """ + Retrieve the parent of a node and all its children. + + Args: + uuid (str): The UUID of the node to retrieve the parent of. + + Returns: + Node: The parent node. 
+ """ + + if depth < np.inf: + raise NotImplementedError("Depth limit not implemented") + + with self.driver.session() as session: + # Query to fetch all nodes and their relationships in the hierarchy + query = dedent(""" + MATCH (n:Node {uuid: $uuid})<-[:HAS_CHILD]-(parent) + RETURN parent.uuid AS parent_uuid; + """) + + logger.debug(f"Querying parent for {uuid}") + logger.debug(f"Query: \n{query}") + + result = session.run(query, uuid=uuid).single() + + if not result: + return None + + parent_uuid = result['parent_uuid'] + + return self.retrieve_hierarchy(parent_uuid) + + def retrieve_hierarchy(self, root_uuid: str, depth: int = np.inf) -> Node: + """ + Retrieve a node and all its children as a hierarchy. + + Args: + root_uuid (str): The UUID of the root node to retrieve. + depth (int): The maximum depth to retrieve. + + Returns: + Node: The root node with all its children populated. + """ + + if depth < np.inf: + raise NotImplementedError("Depth limit not implemented yet") + + with self.driver.session() as session: + # Query to fetch all nodes and their relationships in the hierarchy + query = dedent(""" + MATCH (n:Node {uuid: $root_uuid})-[:HAS_CHILD*0..]->(child) + OPTIONAL MATCH (child)<-[:HAS_CHILD]-(parent) + RETURN n.uuid AS root_uuid, + n.level AS root_level, + n.content AS root_content, + collect({parent: parent.uuid, self: child.uuid, contents: child.content, level: child.level, order:child.ordinal}) AS relationships; + """) + logger.debug(f"Querying hierarchy for {root_uuid}") + logger.debug(f"Query: \n{query}") + + result = session.run(query, root_uuid=root_uuid).single() + + if not result: + return None + + # Parse the result to reconstruct the hierarchy + nodes = {} + root_node_data = { + 'uuid': result['root_uuid'], + 'level': result['root_level'], + 'content': result['root_content'], + 'parent_uuid': None, + } + nodes[root_uuid] = root_node_data + + # Process children + for relationship in result['relationships']: + nodes[relationship['self']] = { 
+ 'uuid': relationship['self'], + 'level': relationship['level'], + 'content': relationship['contents'], + 'ordinal': relationship['order'], + 'parent_uuid': relationship['parent'], + } + + # Build the hierarchy + return NodeAdapter.build_hierarchy(root_uuid, nodes) + + # def retrieve_hierarchies(self, root_uuids: List[str]) -> Dict[str, Node]: + # """ + # Retrieve multiple node hierarchies from Neo4j by root UUIDs. + # + # Args: + # root_uuids (List[str]): The UUIDs of the root nodes to retrieve. + # + # Returns: + # Dict[str, Node]: A dictionary of root nodes with all their children populated. + # """ + # with self.driver.session() as session: + + # TODO: There seems to be a bug with this query. Fix it. + + # query = """ + # UNWIND $root_uuids AS root_uuid + # MATCH (n:Node {uuid: root_uuid})-[:HAS_CHILD*0..]->(child) + # OPTIONAL MATCH (child)<-[:HAS_CHILD]-(parent) + # WHERE parent.uuid <> child.uuid + # RETURN root_uuid, + # n.uuid AS node_uuid, + # n.level AS node_level, + # n.content AS node_content, + # collect(child.uuid) AS child_uuids, + # collect(child.level) AS child_levels, + # collect(child.content) AS child_contents, + # collect(child.ordinal) AS child_order, + # collect(parent.uuid) AS parent_uuids + # """ + # result = session.run(query, root_uuids=root_uuids) + # + # nodes = {} + # for record in result: + # root_uuid = record['root_uuid'] + # if root_uuid not in nodes: + # nodes[root_uuid] = {} + # + # node_data = { + # 'uuid': record['node_uuid'], + # 'level': record['node_level'], + # 'content': record['node_content'], + # 'parent_uuid': None, + # } + # nodes[root_uuid][record['node_uuid']] = node_data + # + # child_uuids = record['child_uuids'] + # child_levels = record['child_levels'] + # child_contents = record['child_contents'] + # child_order = record['child_order'] + # parent_uuids = record['parent_uuids'] + # + # assert len(child_uuids) == len(child_levels) + # assert len(child_uuids) == len(child_contents) + # assert len(child_uuids) == 
len(child_order) + # # assert len(child_uuids) == len(parent_uuids) + # + # for child_uuid, level, content, ordinal, parent_uuid in zip( + # child_uuids, child_levels, child_contents, child_order, parent_uuids + # ): + # nodes[root_uuid][child_uuid] = { + # 'uuid': child_uuid, + # 'level': level, + # 'content': content, + # 'ordinal': ordinal, + # 'parent_uuid': parent_uuid, + # } + # + # hierarchies = {} + # for root_uuid in root_uuids: + # if root_uuid in nodes: + # hierarchies[root_uuid] = NodeAdapter.build_hierarchy(root_uuid, nodes[root_uuid]) + # + # return hierarchies + + def retrieve_by(self, level: Optional[str] = None, content: Optional[str] = None) -> List[Node]: + """ + Retrieve nodes by their level. + + Args: + level (str): The level of the nodes to retrieve. + content (str): The content of the nodes to retrieve + + Returns: + List[Node]: A list of nodes at the specified level. + """ + with self.driver.session() as session: + clauses = [] + kwargs = {} + if level is not None: + clauses.append("level: $level") + kwargs['level'] = level + if content is not None: + clauses.append("content: $content") + kwargs['content'] = content + + query = dedent(f""" + MATCH + (n:Node {{ {','.join(clauses)} }}) + RETURN + n.uuid AS uuid, + n.level AS level, + n.content AS content + """) + + result = session.run(query, **kwargs) + + uuids = [record['uuid'] for record in result] + + nodes = [self.retrieve_hierarchy(uuid) for uuid in uuids] + + return nodes + + @classmethod + def get_graph_storage(cls, conf: Optional[configparser.ConfigParser] = None) -> 'GraphStorage': + conf = conf or config.get_config() + + neo4j_driver = GraphDatabase.driver(conf['neo4j']['url'], + auth=(conf['neo4j']['user'], conf['neo4j']['password'])) + graph_storage = GraphStorage(neo4j_driver) + + return graph_storage diff --git a/src/verdictnet/storage/hybrid_storage.py b/src/verdictnet/storage/hybrid_storage.py new file mode 100644 index 0000000..0196ca2 --- /dev/null +++ 
b/src/verdictnet/storage/hybrid_storage.py @@ -0,0 +1,78 @@ +import configparser +from typing import Optional, List + +from verdictnet import config +from verdictnet.models.node import Node +from verdictnet.storage.chroma_storage import ChromaStorage +from verdictnet.storage.graph_storage import GraphStorage + + +class HybridStorage: + def __init__(self, chroma_storage: ChromaStorage, graph_storage: GraphStorage): + """ + Initialize HybridStorage with ChromaStorage and GraphStorage instances. + """ + self.chroma_storage: ChromaStorage = chroma_storage + self.graph_storage: GraphStorage = graph_storage + + def store(self, root_node: Node): + """ + Store a Node hierarchy in both ChromaDB and Neo4j. + """ + # Store in Neo4j + self.graph_storage.store(root_node) + + # Flatten the hierarchy for ChromaDB + all_nodes = self.flatten_hierarchy(root_node) + + # Store in ChromaDB + self.chroma_storage.store_batch(all_nodes) + + def flatten_hierarchy(self, root_node: Node) -> List[Node]: + """ + Flatten a Node hierarchy into a list of all nodes. + """ + flat_list = [root_node] + for child in root_node.children: + flat_list.extend(self.flatten_hierarchy(child)) + return flat_list + + def query(self, query_string: str, n_results: Optional[int] = None): + """ + Perform a verdictnet search in ChromaDB and return the results. + """ + return self.chroma_storage.query(query_string, n_results) + + def delete_all(self): + """ + Delete all data from both ChromaDB and Neo4j. + """ + self.chroma_storage.delete_collection(self.chroma_storage.collection_name) + self.graph_storage.delete_all() + + def retrieve_parent(self, uuid: str) -> Node: + """ + Retrieve the parent of a node. + """ + return self.graph_storage.retrieve_parent(uuid) + + def retrieve_hierarchy(self, root_uuid: str) -> Node: + """ + Retrieve a node and all its children as a hierarchy. + + Args: + root_uuid (str): The UUID of the root node to retrieve. + + Returns: + Node: The root node with all its children populated. 
+ """ + return self.graph_storage.retrieve_hierarchy(root_uuid) + + @classmethod + def get_hybrid_storage(cls, conf: Optional[configparser.ConfigParser] = None) -> 'HybridStorage': + conf = conf or config.get_config() + + chroma_storage = ChromaStorage.get_chroma_storage(conf) + graph_storage = GraphStorage.get_graph_storage(conf) + + return cls(chroma_storage, graph_storage) \ No newline at end of file diff --git a/src/verdictnet/storage/transaction_manager.py b/src/verdictnet/storage/transaction_manager.py new file mode 100644 index 0000000..6bdd9a1 --- /dev/null +++ b/src/verdictnet/storage/transaction_manager.py @@ -0,0 +1,93 @@ +import configparser +from typing import Optional, List + +from verdictnet.models.node import Node +from verdictnet.storage.hybrid_storage import HybridStorage +from verdictnet.config import logging, get_config + +logger = logging.getLogger(__name__) + +# TODO: implement unit tests for this class + + +class TransactionManager: + def __init__(self, hybrid_storage: HybridStorage): + """ + Initialize TransactionManager with a HybridStorage instance. + """ + self.hybrid_storage = hybrid_storage + + def init_dataset(self, name): + """ + Check if a parent node for the Dataset already exists in the database. If it doesn't, create it. In any case, return + the uuid so that all further documents of this dataset will be linked to it. 
+ """ + + # Check if the dataset already exists + query = { + "level": "dataset", + "content": name + } + dataset_node = self.hybrid_storage.graph_storage.retrieve_by(level='dataset', content=name) + + if dataset_node: + logger.info("Dataset already exists in the database") + return dataset_node[0].uuid + + # If it doesn't, create it + dataset_node = Node(level="dataset", content=name) + self.store_with_transaction(dataset_node) + return dataset_node.uuid + + def store_with_transaction(self, root_nodes: Node | List[Node], parent_uuid: Optional[str] = None): + """ + Store a Node hierarchy in both ChromaDB and Neo4j, ensuring consistency. + """ + if not isinstance(root_nodes, list): + root_nodes = [root_nodes] + + try: + # Store in Neo4j + logger.info("Storing in Neo4j...") + self.hybrid_storage.graph_storage.batch_store(root_nodes, parent_uuid) + + # Store in ChromaDB + logger.info("Storing in ChromaDB...") + for node in root_nodes: + # Flatten the hierarchy for ChromaDB + all_nodes = self.hybrid_storage.flatten_hierarchy(node) + + # Store in ChromaDB + self.hybrid_storage.chroma_storage.store_batch(all_nodes) + + except Exception as e: + # Rollback strategy: remove nodes from both systems if one fails + for node in root_nodes: + self.rollback(node) + raise RuntimeError(f"Transaction failed: {e}") + + + def rollback(self, root_node: Node): + """ + Rollback changes made during a failed transaction. 
+ """ + try: + # Delete from Neo4j + self.hybrid_storage.graph_storage.delete_hierarchy(root_node.uuid) + + # Delete from ChromaDB + all_node_uuids = [node.uuid for node in self.hybrid_storage.flatten_hierarchy(root_node)] + self.hybrid_storage.chroma_storage.delete_by_ids(all_node_uuids) + + except Exception as rollback_error: + # Log rollback failure + logger.error(f"Rollback failed: {rollback_error}") + + @classmethod + def get_transaction_manager(cls, conf: Optional[configparser.ConfigParser] = None) -> 'TransactionManager': + conf = conf or get_config() + + # Initialize HybridStorage and TransactionManager + hybrid_storage = HybridStorage.get_hybrid_storage(conf) + transaction_manager = TransactionManager(hybrid_storage) + return transaction_manager diff --git a/tests/conftest.py b/tests/conftest.py index 5d072b9..312fe11 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,7 +12,7 @@ @fixture def static_files(): - return root_path() / "semantic/frontend/static/css" + return root_path() / "verdictnet/frontend/static/css" @fixture From fee24ea6a964f4551fad2cbb096f04a4e50440ad Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Mon, 27 Jan 2025 13:26:35 +0100 Subject: [PATCH 03/27] add setup to makefile --- Makefile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 1568fd3..5f7f7b5 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: server etl test build clean +.PHONY: setup server etl test build clean # Default port for the server PORT ?= 8000 @@ -7,6 +7,12 @@ PORT ?= 8000 ETL_PATH ?= /path/to/docspecs FORCE ?= true +.PHONY: setup +setup: + @echo "Generating .env file with FERNET_KEY..." + @python3 -c "from cryptography.fernet import Fernet; print(f'FERNET_KEY={Fernet.generate_key().decode()}')" > .env + @echo ".env file generated." 
+ # Docker build .PHONY: build build: From 67f62bdce52e0915b24f99e95b9f65d93ab00510 Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Mon, 27 Jan 2025 13:26:35 +0100 Subject: [PATCH 04/27] add setup to makefile --- Makefile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 1568fd3..5f7f7b5 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: server etl test build clean +.PHONY: setup server etl test build clean # Default port for the server PORT ?= 8000 @@ -7,6 +7,12 @@ PORT ?= 8000 ETL_PATH ?= /path/to/docspecs FORCE ?= true +.PHONY: setup +setup: + @echo "Generating .env file with FERNET_KEY..." + @python3 -c "from cryptography.fernet import Fernet; print(f'FERNET_KEY={Fernet.generate_key().decode()}')" > .env + @echo ".env file generated." + # Docker build .PHONY: build build: From b1adf256c44e116951592a55ad12629a25cb6506 Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Mon, 27 Jan 2025 13:28:23 +0100 Subject: [PATCH 05/27] add setup to makefile --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index 5f7f7b5..ee09100 100644 --- a/Makefile +++ b/Makefile @@ -9,6 +9,8 @@ FORCE ?= true .PHONY: setup setup: + @echo "Installing requirements..." + @pip install -r requirements.txt @echo "Generating .env file with FERNET_KEY..." @python3 -c "from cryptography.fernet import Fernet; print(f'FERNET_KEY={Fernet.generate_key().decode()}')" > .env @echo ".env file generated." 
From 615898f28093ac91ae17145cea3c62ecc9c11c94 Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Mon, 27 Jan 2025 13:54:57 +0100 Subject: [PATCH 06/27] set airflow login creds --- docker-compose.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker-compose.yml b/docker-compose.yml index b886363..3e362f6 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -34,6 +34,8 @@ services: - AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow - AIRFLOW__CORE__FERNET_KEY=${FERNET_KEY} - AIRFLOW__WEBSERVER__RBAC=True + - _AIRFLOW_WWW_USER_USERNAME=airflow + - _AIRFLOW_WWW_USER_PASSWORD=airflow ports: - "8080:8080" depends_on: From 552ac47743f461c1c4446a104559fc3c5f917fcf Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Mon, 27 Jan 2025 14:09:15 +0100 Subject: [PATCH 07/27] set airflow login creds --- docker-compose.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/docker-compose.yml b/docker-compose.yml index 3e362f6..60e49e4 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -34,6 +34,7 @@ services: - AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow - AIRFLOW__CORE__FERNET_KEY=${FERNET_KEY} - AIRFLOW__WEBSERVER__RBAC=True + - _AIRFLOW_WWW_USER_CREATE=True - _AIRFLOW_WWW_USER_USERNAME=airflow - _AIRFLOW_WWW_USER_PASSWORD=airflow ports: From 166e3e6ee98c4d9734cdc4f0bdde28c1ddb83b6a Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Mon, 27 Jan 2025 14:10:18 +0100 Subject: [PATCH 08/27] set airflow login creds --- docker-compose.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 60e49e4..e2256fe 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -33,8 +33,8 @@ services: - AIRFLOW__CORE__EXECUTOR=LocalExecutor - AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow - AIRFLOW__CORE__FERNET_KEY=${FERNET_KEY} - - AIRFLOW__WEBSERVER__RBAC=True - - _AIRFLOW_WWW_USER_CREATE=True + 
- AIRFLOW__WEBSERVER__RBAC=true + - _AIRFLOW_WWW_USER_CREATE=true - _AIRFLOW_WWW_USER_USERNAME=airflow - _AIRFLOW_WWW_USER_PASSWORD=airflow ports: From 62d4a87933de2d3a1ace141518ab0cacdb61ed90 Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Mon, 27 Jan 2025 14:47:11 +0100 Subject: [PATCH 09/27] improve dag startup time --- Makefile | 6 +++++- dags/jurisprudencia.py | 6 +----- dags/profiler.py | 10 ++++++++++ requirements.txt | 1 + 4 files changed, 17 insertions(+), 6 deletions(-) create mode 100644 dags/profiler.py diff --git a/Makefile b/Makefile index ee09100..571f4e6 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: setup server etl test build clean +.PHONY: setup profile server etl test build clean # Default port for the server PORT ?= 8000 @@ -15,6 +15,10 @@ setup: @python3 -c "from cryptography.fernet import Fernet; print(f'FERNET_KEY={Fernet.generate_key().decode()}')" > .env @echo ".env file generated." +.PHONY: profile +profile: + @py-spy record -o profile.svg -- python dags/jurisprudencia.py + # Docker build .PHONY: build build: diff --git a/dags/jurisprudencia.py b/dags/jurisprudencia.py index 89c180a..6afc96d 100644 --- a/dags/jurisprudencia.py +++ b/dags/jurisprudencia.py @@ -1,11 +1,7 @@ import pendulum from airflow import DAG from airflow.operators.python import PythonOperator -from airflow.utils.dates import days_ago -from datetime import datetime, timedelta -import requests -import json -import os +from datetime import timedelta from verdictnet.ingestion.downloader import get_item_pagination diff --git a/dags/profiler.py b/dags/profiler.py new file mode 100644 index 0000000..3e00ffe --- /dev/null +++ b/dags/profiler.py @@ -0,0 +1,10 @@ +import line_profiler + + +@line_profiler.profile +def execute(): + import jurisprudencia + + +if __name__ == "__main__": + execute() diff --git a/requirements.txt b/requirements.txt index 59e57ad..5516ad4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -187,6 +187,7 @@ kombu==5.4.2 
kubernetes==31.0.0 lazy-object-proxy==1.10.0 limits==4.0.1 +line_profiler linkify-it-py==2.0.3 llama-cloud==0.1.6 llama-index==0.12.5 From 9ac0a61a8a4fc975a5048b94e1b0f7dd3f00936b Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Mon, 27 Jan 2025 14:47:22 +0100 Subject: [PATCH 10/27] improve dag startup time --- src/verdictnet/embedding.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/verdictnet/embedding.py b/src/verdictnet/embedding.py index 0dadaee..8af84d0 100644 --- a/src/verdictnet/embedding.py +++ b/src/verdictnet/embedding.py @@ -1,10 +1,7 @@ -import argparse import configparser -from typing import List, Union, Tuple, Optional +from typing import List, Union, Optional from numpy import ndarray -from sentence_transformers import SentenceTransformer -from torch import Tensor from verdictnet.config import root_path, get_config from verdictnet.models.node import Node @@ -12,6 +9,8 @@ class Embedding: def __init__(self, conf: Optional[configparser.ConfigParser] = None): + from sentence_transformers import SentenceTransformer + self.conf = conf or get_config() # Load a pre-trained model self.model = SentenceTransformer( # Lightweight, fast model @@ -20,7 +19,7 @@ def __init__(self, conf: Optional[configparser.ConfigParser] = None): ) def embed_nodes(self, nodes: List[Node]) -> tuple[ - List[Union[List[Tensor], ndarray, Tensor]], + List[Union[List, ndarray]], List[str], List[dict] ]: @@ -43,5 +42,5 @@ def embed_nodes(self, nodes: List[Node]) -> tuple[ return embeddings, documents, metadata - def embed_string(self, text: str) -> Tensor: + def embed_string(self, text: str): return self.model.encode(text).tolist() From 9913671a02b32106f2e809844164a02a461e7375 Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Tue, 28 Jan 2025 00:05:32 +0100 Subject: [PATCH 11/27] fix imports, airflow working --- docker-compose.yml | 263 ++++++++++++++++++++++--- setup.cfg.bak => setup.cfg | 8 +- src/verdictnet/cli.py | 4 +- src/verdictnet/etl.py | 16 
+- src/verdictnet/query.py | 6 +- src/verdictnet/render/node_renderer.py | 2 +- src/verdictnet/render/plain_text.py | 4 +- 7 files changed, 252 insertions(+), 51 deletions(-) rename setup.cfg.bak => setup.cfg (81%) diff --git a/docker-compose.yml b/docker-compose.yml index e2256fe..3ec9cc6 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,3 +1,91 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. +# +# WARNING: This configuration is for local development. Do not use it in a production deployment. +# +# This configuration supports basic configuration using environment variables or an .env file +# The following variables are supported: +# +# AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. +# Default: apache/airflow:2.10.4 +# AIRFLOW_UID - User ID in Airflow containers +# Default: 50000 +# AIRFLOW_PROJ_DIR - Base path to which all the files will be volumed. +# Default: . +# Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode +# +# _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested). 
+# Default: airflow +# _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested). +# Default: airflow +# _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers. +# Use this option ONLY for quick checks. Installing requirements at container +# startup is done EVERY TIME the service is started. +# A better way is to build a custom image or extend the official image +# as described in https://airflow.apache.org/docs/docker-stack/build.html. +# Default: '' +# +# Feel free to modify this file to suit your needs. +--- +x-airflow-common: + &airflow-common + # In order to add custom dependencies or upgrade provider packages you can use your extended image. + # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml + # and uncomment the "build" line below, Then run `docker-compose build` to build the images. + # image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.10.4} + build: . + environment: + &airflow-common-env + AIRFLOW__CORE__EXECUTOR: CeleryExecutor + AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow + AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow + AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 + AIRFLOW__CORE__FERNET_KEY: '' + AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' + AIRFLOW__CORE__LOAD_EXAMPLES: 'true' + AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session' + # yamllint disable rule:line-length + # Use simple http server on scheduler for health checks + # See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server + # yamllint enable rule:line-length + AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true' + # WARNING: Use _PIP_ADDITIONAL_REQUIREMENTS option ONLY for a quick checks + # for other purpose (development, test and 
especially production usage) build/extend Airflow image. + _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} + # The following line can be used to set a custom config file, stored in the local config folder + # If you want to use it, outcomment it and replace airflow.cfg with the name of your config file + # AIRFLOW_CONFIG: '/opt/airflow/config/airflow.cfg' + volumes: + - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags + - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs + - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config + - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins + user: "${AIRFLOW_UID:-50000}:0" + depends_on: + &airflow-common-depends-on + redis: + condition: service_healthy + postgres: + condition: service_healthy + + services: chromadb: image: chromadb/chroma:latest @@ -23,49 +111,162 @@ services: POSTGRES_USER: airflow POSTGRES_PASSWORD: airflow POSTGRES_DB: airflow + healthcheck: + test: [ "CMD", "pg_isready", "-U", "airflow" ] + interval: 10s + retries: 5 + start_period: 5s + restart: always volumes: - ${PWD}/postgress_storage:/var/lib/postgresql/data + redis: + # Redis is limited to 7.2-bookworm due to licencing change + # https://redis.io/blog/redis-adopts-dual-source-available-licensing/ + image: redis:7.2-bookworm + expose: + - 6379 + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 10s + timeout: 30s + retries: 50 + start_period: 30s + restart: always + airflow-webserver: - build: . 
- container_name: airflow-webserver - environment: - - AIRFLOW__CORE__EXECUTOR=LocalExecutor - - AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow - - AIRFLOW__CORE__FERNET_KEY=${FERNET_KEY} - - AIRFLOW__WEBSERVER__RBAC=true - - _AIRFLOW_WWW_USER_CREATE=true - - _AIRFLOW_WWW_USER_USERNAME=airflow - - _AIRFLOW_WWW_USER_PASSWORD=airflow + <<: *airflow-common + command: webserver ports: - "8080:8080" + healthcheck: + test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 30s + restart: always depends_on: - - postgres - command: > - bash -c "airflow db init && airflow webserver" - env_file: - - .env - volumes: - - ${PWD}/dags:/opt/airflow/dags # Mount the dags folder - + <<: *airflow-common-depends-on + airflow-init: + condition: service_completed_successfully airflow-scheduler: - build: . - container_name: airflow-scheduler + <<: *airflow-common + command: scheduler + healthcheck: + test: ["CMD", "curl", "--fail", "http://localhost:8974/health"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 30s + restart: always + depends_on: + <<: *airflow-common-depends-on + airflow-init: + condition: service_completed_successfully + + airflow-worker: + <<: *airflow-common + command: celery worker + healthcheck: + # yamllint disable rule:line-length + test: + - "CMD-SHELL" + - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}" || celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' + interval: 30s + timeout: 10s + retries: 5 + start_period: 30s environment: - - AIRFLOW__CORE__EXECUTOR=LocalExecutor - - AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow - - AIRFLOW__CORE__FERNET_KEY=${FERNET_KEY} + <<: *airflow-common-env + # Required to handle warm shutdown of the celery workers properly + # See 
https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation + DUMB_INIT_SETSID: "0" + restart: always depends_on: - - postgres - - airflow-webserver - command: > - bash -c "airflow scheduler" - env_file: - - .env - volumes: - - ${PWD}/dags:/opt/airflow/dags # Mount the dags folder + <<: *airflow-common-depends-on + airflow-init: + condition: service_completed_successfully + airflow-triggerer: + <<: *airflow-common + command: triggerer + healthcheck: + test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"'] + interval: 30s + timeout: 10s + retries: 5 + start_period: 30s + restart: always + depends_on: + <<: *airflow-common-depends-on + airflow-init: + condition: service_completed_successfully + + airflow-init: + <<: *airflow-common + entrypoint: /bin/bash + # yamllint disable rule:line-length + command: + - -c + - | + if [[ -z "${AIRFLOW_UID}" ]]; then + echo + echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" + echo "If you are on Linux, you SHOULD follow the instructions below to set " + echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." + echo "For other operating systems you can get rid of the warning with manually created .env file:" + echo " See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user" + echo + fi + one_meg=1048576 + mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) + cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) + disk_available=$$(df / | tail -1 | awk '{print $$4}') + warning_resources="false" + if (( mem_available < 4000 )) ; then + echo + echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" + echo "At least 4GB of memory required. 
You have $$(numfmt --to iec $$((mem_available * one_meg)))" + echo + warning_resources="true" + fi + if (( cpus_available < 2 )); then + echo + echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" + echo "At least 2 CPUs recommended. You have $${cpus_available}" + echo + warning_resources="true" + fi + if (( disk_available < one_meg * 10 )); then + echo + echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" + echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))" + echo + warning_resources="true" + fi + if [[ $${warning_resources} == "true" ]]; then + echo + echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" + echo "Please follow the instructions to increase amount of resources available:" + echo " https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin" + echo + fi + mkdir -p /sources/logs /sources/dags /sources/plugins + chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} + exec /entrypoint airflow version + # yamllint enable rule:line-length + environment: + <<: *airflow-common-env + _AIRFLOW_DB_MIGRATE: 'true' + _AIRFLOW_WWW_USER_CREATE: 'true' + _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} + _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} + _PIP_ADDITIONAL_REQUIREMENTS: '' + user: "0:0" + volumes: + - ${AIRFLOW_PROJ_DIR:-.}:/sources minio: image: minio/minio:latest diff --git a/setup.cfg.bak b/setup.cfg similarity index 81% rename from setup.cfg.bak rename to setup.cfg index ed112a2..27e2ef5 100644 --- a/setup.cfg.bak +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -name = semantic +name = verdictnet version = 0.0.1 description = A tool for running semantic queries on Spanish Law author = Alex Monras @@ -7,7 +7,7 @@ license = MIT [options] package_dir= - =semantic + =verdictnet packages = find: install_requires = numpy @@ -21,8 +21,8 @@ 
install_requires = PyMuPDF [options.packages.find] -where=semantic +where=src [options.entry_points] console_scripts = - semantic = cli:main + verdictnet = verdictnet.cli:main diff --git a/src/verdictnet/cli.py b/src/verdictnet/cli.py index 2ef9e44..d115788 100644 --- a/src/verdictnet/cli.py +++ b/src/verdictnet/cli.py @@ -1,6 +1,6 @@ import argparse -import query +from verdictnet import query def main(): @@ -40,7 +40,7 @@ def main(): def handle_etl(args): - import etl + from verdictnet import etl if args.subcommand == "clean": etl.clean() elif args.subcommand == "run": diff --git a/src/verdictnet/etl.py b/src/verdictnet/etl.py index c5b755b..7e65f74 100644 --- a/src/verdictnet/etl.py +++ b/src/verdictnet/etl.py @@ -6,14 +6,14 @@ from slugify import slugify from bs4 import BeautifulSoup -from config import get_config, root_path, get_fs -from ingestion.documentspec import DocumentSpec -from ingestion.paths import refined_path -from models.node import Node -from ingestion.parsers.html_parser import parse -from storage.chroma_storage import ChromaStorage -from storage.hybrid_storage import HybridStorage -from storage.transaction_manager import TransactionManager +from verdictnet.config import get_config, root_path, get_fs +from verdictnet.ingestion.documentspec import DocumentSpec +from verdictnet.ingestion.paths import refined_path +from verdictnet.models.node import Node +from verdictnet.ingestion.parsers.html_parser import parse +from verdictnet.storage.chroma_storage import ChromaStorage +from verdictnet.storage.hybrid_storage import HybridStorage +from verdictnet.storage.transaction_manager import TransactionManager def get_docspecs(path: Path = None) -> List[Path]: diff --git a/src/verdictnet/query.py b/src/verdictnet/query.py index 142c794..3763911 100644 --- a/src/verdictnet/query.py +++ b/src/verdictnet/query.py @@ -1,9 +1,9 @@ import os import textwrap -from render.plain_text import PlainTextRenderer -from storage.chroma_storage import ChromaStorage -from 
storage.hybrid_storage import HybridStorage +from verdictnet.render.plain_text import PlainTextRenderer +from verdictnet.storage.chroma_storage import ChromaStorage +from verdictnet.storage.hybrid_storage import HybridStorage def query(q_string: str, n_results: int = 3): diff --git a/src/verdictnet/render/node_renderer.py b/src/verdictnet/render/node_renderer.py index 27166a4..4be823f 100644 --- a/src/verdictnet/render/node_renderer.py +++ b/src/verdictnet/render/node_renderer.py @@ -1,4 +1,4 @@ -from models.node import Node +from verdictnet.models.node import Node class NodeRenderer: diff --git a/src/verdictnet/render/plain_text.py b/src/verdictnet/render/plain_text.py index e7498e4..55c2cbc 100644 --- a/src/verdictnet/render/plain_text.py +++ b/src/verdictnet/render/plain_text.py @@ -1,5 +1,5 @@ -from models.node import Node -from render.node_renderer import NodeRenderer +from verdictnet.models.node import Node +from verdictnet.render.node_renderer import NodeRenderer class PlainTextRenderer(NodeRenderer): From 22106e2f76a288f95dac617cd6d2cc6f49ca7ded Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Tue, 28 Jan 2025 13:45:05 +0100 Subject: [PATCH 12/27] jurisprudencia dag working --- Makefile | 29 +++++++- dags/jurisprudencia.py | 69 +++++++++++++++++-- docker-compose.yml | 23 +++++-- src/verdictnet/config.py | 25 +++++-- .../ingestion/parsers/html_parser.py | 4 +- 5 files changed, 128 insertions(+), 22 deletions(-) diff --git a/Makefile b/Makefile index 571f4e6..0986c2e 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: setup profile server etl test build clean +.PHONY: setup install start profile server etl test build clean # Default port for the server PORT ?= 8000 @@ -7,13 +7,36 @@ PORT ?= 8000 ETL_PATH ?= /path/to/docspecs FORCE ?= true -.PHONY: setup -setup: +.PHONY: install +install: @echo "Installing requirements..." @pip install -r requirements.txt + +.PHONY: setup +setup: install @echo "Generating .env file with FERNET_KEY..." 
@python3 -c "from cryptography.fernet import Fernet; print(f'FERNET_KEY={Fernet.generate_key().decode()}')" > .env @echo ".env file generated." + @echo "Launching minio..." + @docker-compose up -d minio + @echo "Waiting for minio to start..." + @until docker-compose exec minio mc ready local; do \ + echo "Minio is not healthy yet. Retrying in 5 seconds..."; \ + sleep 5; \ + done + @echo "Setting up local alias..." + @docker-compose exec minio mc alias set minio http://localhost:9000 minioadmin minioadmin + @echo "Creating buckets..." + @docker-compose exec minio sh -c "mc ls minio/legal || mc mb minio/legal" + @docker-compose exec minio mc mb minio/airflow-logs + @echo "Minio setup complete. Stopping minio..." + @docker-compose stop minio + @echo "Setup complete." + +.PHONY: start +start: + @echo "Starting up the microservices..." + @docker-compose up -d .PHONY: profile profile: diff --git a/dags/jurisprudencia.py b/dags/jurisprudencia.py index 6afc96d..febb97a 100644 --- a/dags/jurisprudencia.py +++ b/dags/jurisprudencia.py @@ -1,9 +1,7 @@ import pendulum from airflow import DAG from airflow.operators.python import PythonOperator -from datetime import timedelta - -from verdictnet.ingestion.downloader import get_item_pagination +from datetime import timedelta, datetime # Define the default arguments default_args = { @@ -16,6 +14,39 @@ 'retry_delay': timedelta(minutes=5), } + +def get_item_pagination_task(date: str): + from verdictnet.ingestion.downloader import get_item_pagination + return get_item_pagination(datetime.strptime(date, "%Y-%m-%d")) + + +def refine_item_pagination_task(date: str): + from verdictnet.ingestion.downloader import refine_item_pagination + return refine_item_pagination(datetime.strptime(date, "%Y-%m-%d")) + + +def download_pdfs_task(date: str): + from verdictnet.ingestion.downloader import download_pdfs + return download_pdfs(datetime.strptime(date, "%Y-%m-%d")) + + +def parse_pdfs_task(date: str): + from verdictnet.ingestion.downloader 
import parse_pdfs + return parse_pdfs(datetime.strptime(date, "%Y-%m-%d")) + + +def ingest_pdfs_task(date: str): + from verdictnet.ingestion.downloader import ingest_pdfs + from verdictnet.storage.transaction_manager import TransactionManager + from verdictnet.config import get_config + + transaction_manager = TransactionManager.get_transaction_manager(get_config()) + dataset_uuid = transaction_manager.init_dataset("Jurisprudencia") + return ingest_pdfs(date=datetime.strptime(date, "%Y-%m-%d"), + transaction_manager=transaction_manager, + dataset_uuid=dataset_uuid) + + # Define the DAG with DAG( 'query_poderjudicial', @@ -24,7 +55,35 @@ schedule='@daily', catchup=True, ): - item_pagination = PythonOperator( + item_pagination_task = PythonOperator( task_id='get_item_pagination', - python_callable=get_item_pagination, + python_callable=get_item_pagination_task, + op_kwargs={'date': "{{ ds }}"}, + ) + + refine_pagination_task = PythonOperator( + task_id='refine_item_pagination', + python_callable=refine_item_pagination_task, + op_kwargs={'date': "{{ ds }}"}, ) + + download_pdfs = PythonOperator( + task_id='download_pdfs', + python_callable=download_pdfs_task, + op_kwargs={'date': "{{ ds }}"}, + ) + + parse_pdfs = PythonOperator( + task_id='parse_pdfs', + python_callable=parse_pdfs_task, + op_kwargs={'date': "{{ ds }}"}, + ) + + ingest_pdfs = PythonOperator( + task_id='ingest_pdfs', + python_callable=ingest_pdfs_task, + op_kwargs={'date': "{{ ds }}"}, + ) + + item_pagination_task >> refine_pagination_task >> download_pdfs >> parse_pdfs >> ingest_pdfs + diff --git a/docker-compose.yml b/docker-compose.yml index 3ec9cc6..b804f34 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -51,16 +51,30 @@ x-airflow-common: # and uncomment the "build" line below, Then run `docker-compose build` to build the images. # image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.10.4} build: . 
+ env_file: + - .env environment: &airflow-common-env AIRFLOW__CORE__EXECUTOR: CeleryExecutor AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 - AIRFLOW__CORE__FERNET_KEY: '' + AIRFLOW__CORE__FERNET_KEY: ${FERNET_KEY} AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' - AIRFLOW__CORE__LOAD_EXAMPLES: 'true' + AIRFLOW__CORE__LOAD_EXAMPLES: 'false' AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session' + _AIRFLOW_WWW_USER_CREATE: 'true' + _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} + _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} + AIRFLOW_CONN_MINIO: '{ + "conn_type": "s3", + "login": "minioadmin", + "password": "minioadmin", + "host": "minio", + "port": 9000, + "schema": "http", + "extra": {"endpoint_url": "http://minio:9000\"} + }' # yamllint disable rule:line-length # Use simple http server on scheduler for health checks # See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server @@ -255,14 +269,11 @@ services: fi mkdir -p /sources/logs /sources/dags /sources/plugins chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} - exec /entrypoint airflow version + exec airflow db init # yamllint enable rule:line-length environment: <<: *airflow-common-env _AIRFLOW_DB_MIGRATE: 'true' - _AIRFLOW_WWW_USER_CREATE: 'true' - _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} - _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} _PIP_ADDITIONAL_REQUIREMENTS: '' user: "0:0" volumes: diff --git a/src/verdictnet/config.py b/src/verdictnet/config.py index 8c6ab36..f48663b 100644 --- a/src/verdictnet/config.py +++ b/src/verdictnet/config.py @@ -5,6 +5,7 @@ from typing import Optional import 
fsspec +from airflow.hooks.base import BaseHook logging.basicConfig( level=logging.INFO, @@ -30,13 +31,25 @@ def get_config(): def configure_fsspec(): config = get_config() - s3_config = { - "key": os.getenv("AWS_ACCESS_KEY_ID", config.get('s3', 'key')), - "secret": os.getenv("AWS_SECRET_ACCESS_KEY", config.get('s3', 'secret')), - "client_kwargs": { - "endpoint_url": os.getenv("S3_ENDPOINT", config.get('s3', 'endpoint_url')) + # Check if running within Airflow + if 'AIRFLOW_HOME' in os.environ: + # Retrieve the connection details from Airflow + connection = BaseHook.get_connection('minio') + s3_config = { + "key": connection.login, + "secret": connection.password, + "client_kwargs": { + "endpoint_url": connection.extra_dejson.get('endpoint_url') + } + } + else: + s3_config = { + "key": os.getenv("AWS_ACCESS_KEY_ID", config.get('s3', 'key')), + "secret": os.getenv("AWS_SECRET_ACCESS_KEY", config.get('s3', 'secret')), + "client_kwargs": { + "endpoint_url": os.getenv("S3_ENDPOINT", config.get('s3', 'endpoint_url')) + } } - } fsspec.config.conf = { "s3": s3_config diff --git a/src/verdictnet/ingestion/parsers/html_parser.py b/src/verdictnet/ingestion/parsers/html_parser.py index 30db5f0..f152bc8 100644 --- a/src/verdictnet/ingestion/parsers/html_parser.py +++ b/src/verdictnet/ingestion/parsers/html_parser.py @@ -1,5 +1,5 @@ -from ingestion.documentspec import DocumentSpec -from models.node import Node +from verdictnet.ingestion.documentspec import DocumentSpec +from verdictnet.models.node import Node def next_class(tags): From 9d2499f6b48a9865790121b7ccc7e6a0da4b49f8 Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Tue, 28 Jan 2025 17:47:36 +0100 Subject: [PATCH 13/27] fix imports --- src/verdictnet/cli.py | 2 +- src/verdictnet/frontend/server/server.py | 16 ++++++++-------- src/verdictnet/frontend/server/websocket.py | 6 +++--- src/verdictnet/ingestion/downloader.py | 1 - src/verdictnet/ragagent.py | 8 +++----- src/verdictnet/render/html.py | 4 ++-- 6 files changed, 
17 insertions(+), 20 deletions(-) diff --git a/src/verdictnet/cli.py b/src/verdictnet/cli.py index d115788..e4a0792 100644 --- a/src/verdictnet/cli.py +++ b/src/verdictnet/cli.py @@ -54,7 +54,7 @@ def handle_query(args): def handle_server(args): - from frontend.server import server + from verdictnet.frontend.server import server server.main() diff --git a/src/verdictnet/frontend/server/server.py b/src/verdictnet/frontend/server/server.py index 0fa4907..3c667ad 100644 --- a/src/verdictnet/frontend/server/server.py +++ b/src/verdictnet/frontend/server/server.py @@ -9,14 +9,14 @@ from starlette.staticfiles import StaticFiles from starlette.websockets import WebSocketDisconnect -from config import get_config -from etl import get_files -from frontend import paths -from ragagent import RAGAgent -from frontend.server.websocket import Connection -from frontend.server.dto.websocket import ConnectionId, DisplayDocuments -from render.html import HTMLRenderer -from storage.hybrid_storage import HybridStorage +from verdictnet.config import get_config +from verdictnet.etl import get_files +from verdictnet.frontend import paths +from verdictnet.ragagent import RAGAgent +from verdictnet.frontend.server.websocket import Connection +from verdictnet.frontend.server.dto.websocket import ConnectionId, DisplayDocuments +from verdictnet.render.html import HTMLRenderer +from verdictnet.storage.hybrid_storage import HybridStorage conf = get_config() diff --git a/src/verdictnet/frontend/server/websocket.py b/src/verdictnet/frontend/server/websocket.py index 3d3cc3b..7823c9d 100644 --- a/src/verdictnet/frontend/server/websocket.py +++ b/src/verdictnet/frontend/server/websocket.py @@ -5,9 +5,9 @@ from PyPDF2 import PdfFileReader from starlette.websockets import WebSocket -from frontend.paths import uploads -from ragagent import RAGAgent -from frontend.server.dto.websocket import WebSocketMessage, ChatQueryMessage, FileUploaded, ChatResponseMessage, \ +from verdictnet.frontend.paths import 
uploads +from verdictnet.ragagent import RAGAgent +from verdictnet.frontend.server.dto.websocket import WebSocketMessage, ChatQueryMessage, FileUploaded, ChatResponseMessage, \ UnfoldNodes diff --git a/src/verdictnet/ingestion/downloader.py b/src/verdictnet/ingestion/downloader.py index 8f76502..c309b19 100644 --- a/src/verdictnet/ingestion/downloader.py +++ b/src/verdictnet/ingestion/downloader.py @@ -307,7 +307,6 @@ def ingest_pdfs(date: datetime, transaction_manager: TransactionManager, force=F transaction_manager.store_with_transaction(nodes, parent_uuid=dataset_uuid) - if __name__ == "__main__": start_date = datetime.today() - timedelta(days=20) end_date = datetime.today() - timedelta(days=1) diff --git a/src/verdictnet/ragagent.py b/src/verdictnet/ragagent.py index 6afdb49..7bb6424 100644 --- a/src/verdictnet/ragagent.py +++ b/src/verdictnet/ragagent.py @@ -2,11 +2,9 @@ import logging from typing import Optional, List -import chromadb - -from models.node import Node -from query import print_results -from storage.hybrid_storage import HybridStorage +from verdictnet.models.node import Node +from verdictnet.query import print_results +from verdictnet.storage.hybrid_storage import HybridStorage logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) diff --git a/src/verdictnet/render/html.py b/src/verdictnet/render/html.py index 8e377e2..f6efb5f 100644 --- a/src/verdictnet/render/html.py +++ b/src/verdictnet/render/html.py @@ -1,5 +1,5 @@ -from models.node import Node -from render.node_renderer import NodeRenderer +from verdictnet.models.node import Node +from verdictnet.render.node_renderer import NodeRenderer class HTMLRenderer(NodeRenderer): From 7b82aac111e035312db985e4064d2b5447f2d66c Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Tue, 28 Jan 2025 17:47:56 +0100 Subject: [PATCH 14/27] define local config files --- Dockerfile | 1 - config/local/airflow.cfg | 6 ++++++ config/local/config.ini | 29 +++++++++++++++++++++++++++++ 
docker-compose.yml | 5 +++-- 4 files changed, 38 insertions(+), 3 deletions(-) create mode 100644 config/local/airflow.cfg create mode 100644 config/local/config.ini diff --git a/Dockerfile b/Dockerfile index 50bf3a6..6199b5a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,6 @@ USER root COPY requirements.txt . COPY src/ src/ COPY setup.py . -COPY config.ini . RUN chown -R airflow src/ RUN apt-get update && apt-get install -y build-essential diff --git a/config/local/airflow.cfg b/config/local/airflow.cfg new file mode 100644 index 0000000..7b7fc12 --- /dev/null +++ b/config/local/airflow.cfg @@ -0,0 +1,6 @@ +[logging] +remote_logging = True +remote_base_log_folder = s3://airflow-logs +remote_log_conn_id = minio +encrypt_s3_logs = False + diff --git a/config/local/config.ini b/config/local/config.ini new file mode 100644 index 0000000..18426dd --- /dev/null +++ b/config/local/config.ini @@ -0,0 +1,29 @@ +[storage] +type: s3 +bucket: legal +collection: legal-database +raw: datalake/raw/ +refined: datalake/refined/ +html: datalake/html/ + +[chroma] +type: http +host: chromadb +port: 8000 + +[neo4j] +url: bolt://neo4j:7687 +user: neo4j +password: neo4jtest + +[embedding] +model_name_or_path: paraphrase-mpnet-base-v2 +cache: cache/ + +[rag] +n_results: 5 + +[s3] +key = minioadmin +secret = minioadmin +endpoint_url = http://minio:9000 \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index b804f34..dec68ba 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -52,7 +52,7 @@ x-airflow-common: # image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.10.4} build: . 
env_file: - - .env + - config/local/.env environment: &airflow-common-env AIRFLOW__CORE__EXECUTOR: CeleryExecutor @@ -73,7 +73,7 @@ x-airflow-common: "host": "minio", "port": 9000, "schema": "http", - "extra": {"endpoint_url": "http://minio:9000\"} + "extra": {"endpoint_url": "http://minio:9000"} }' # yamllint disable rule:line-length # Use simple http server on scheduler for health checks @@ -91,6 +91,7 @@ x-airflow-common: - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins + - ${PWD}/config/local/config.ini:/opt/airflow/config.ini user: "${AIRFLOW_UID:-50000}:0" depends_on: &airflow-common-depends-on From 201a65a25a181e10b137ba3624dfee53cdf38c99 Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Wed, 29 Jan 2025 18:43:54 +0100 Subject: [PATCH 15/27] improve imports --- src/verdictnet/cli.py | 2 +- src/verdictnet/config.py | 19 +++-- src/verdictnet/etl.py | 111 ++++++++++++++++--------- src/verdictnet/ingestion/downloader.py | 1 + src/verdictnet/models/node.py | 3 +- 5 files changed, 90 insertions(+), 46 deletions(-) diff --git a/src/verdictnet/cli.py b/src/verdictnet/cli.py index e4a0792..d1db693 100644 --- a/src/verdictnet/cli.py +++ b/src/verdictnet/cli.py @@ -46,7 +46,7 @@ def handle_etl(args): elif args.subcommand == "run": etl.run(force_download=args.force, path=args.path) else: - print("No {args.subcommand} subcommand found.") + print(f"No {args.subcommand} subcommand found.") def handle_query(args): diff --git a/src/verdictnet/config.py b/src/verdictnet/config.py index f48663b..20dda55 100644 --- a/src/verdictnet/config.py +++ b/src/verdictnet/config.py @@ -1,4 +1,5 @@ import configparser +import json import logging import os from pathlib import Path @@ -13,22 +14,30 @@ format="%(asctime)s - %(name)s.%(funcName)s [%(levelname)s]: %(message)s", ) +logger = logging.getLogger(__name__) + +_fsspec_configured = False + def root_path(): - return 
Path(__file__).parent.parent.parent + return Path(__file__).parent.parent def get_config(): # Create a ConfigParser instance config = configparser.ConfigParser() - # Load the configuration file from the current folder + # Load the configuration file from the current folder, or from the package root folder config.read(filenames=['config.ini', root_path() / 'config.ini']) return config def configure_fsspec(): + global _fsspec_configured + if _fsspec_configured: + return + config = get_config() # Check if running within Airflow @@ -55,9 +64,9 @@ def configure_fsspec(): "s3": s3_config } - -# Call the function to configure fsspec -configure_fsspec() + # TODO: Warning. This is potentially logging sensitive passwords. Password obfuscation should be implemented. + logger.info("fsspec configured with: %s", json.dumps(fsspec.config.conf)) + _fsspec_configured = True def get_fs(conf: Optional[configparser.ConfigParser] = None): diff --git a/src/verdictnet/etl.py b/src/verdictnet/etl.py index 7e65f74..b48e04d 100644 --- a/src/verdictnet/etl.py +++ b/src/verdictnet/etl.py @@ -1,4 +1,5 @@ import os +from configparser import ConfigParser from pathlib import Path from typing import List @@ -6,15 +7,16 @@ from slugify import slugify from bs4 import BeautifulSoup -from verdictnet.config import get_config, root_path, get_fs +from verdictnet.config import get_config, root_path, logging from verdictnet.ingestion.documentspec import DocumentSpec from verdictnet.ingestion.paths import refined_path from verdictnet.models.node import Node from verdictnet.ingestion.parsers.html_parser import parse -from verdictnet.storage.chroma_storage import ChromaStorage from verdictnet.storage.hybrid_storage import HybridStorage from verdictnet.storage.transaction_manager import TransactionManager +logger = logging.getLogger(__name__) + def get_docspecs(path: Path = None) -> List[Path]: """ @@ -70,8 +72,68 @@ def get_document_structure(text, docspec: DocumentSpec) -> List[Node]: return parsed -def 
ingest(main_node, docspec: DocumentSpec, storage: TransactionManager): - all_nodes = main_node.get_all(level=docspec.embed_level) +def download_doc(docspec: DocumentSpec, conf, force_download=False): + """ + Download the document and save it to the raw folder in html format + """ + slug_name = slugify(docspec.name) + + target_filename = f'{slug_name}.html' + raw_path = root_path() / conf['storage']['raw'] / target_filename + + # Download documents + if force_download or not os.path.exists(raw_path): + print(f"Downloading document `{docspec.name}`...") + text = download(docspec) + + os.makedirs(os.path.dirname(raw_path), exist_ok=True) + with open(raw_path, 'w') as file: + file.write(text) + + +def refine(docspec, conf): + """ + Take the document in html format and refine it to a json format + """ + slug_name = slugify(docspec.name) + + target_filename = f'{slug_name}.html' + raw_path = root_path() / conf['storage']['refined'] / target_filename + + with open(raw_path, 'r') as file: + text = file.read() + + target = refined_path() + f'{slug_name}.json' + + main_node = get_document_structure(text, docspec=docspec) + + main_node[0].save(target) + print(f"Saved refined in '{target_filename}'.") + + +def render_html(docspec: DocumentSpec, conf: ConfigParser): + slug_name = slugify(docspec.name) + main_node_path = refined_path() + f'{slug_name}.json' + main_node = Node.load(main_node_path) + + html_path = root_path() / conf['storage']['html'] / f'{slug_name}.html' + os.makedirs(os.path.dirname(html_path), exist_ok=True) + with open(html_path, 'w', encoding='utf-8') as file: + file.write(main_node[0].html( + preamble=""" + + """ + )) + logger.info(f"HTML saved to '{slug_name}.html'.") + + +def ingest(docspec: DocumentSpec, conf: ConfigParser): + slug_name = slugify(docspec.name) + main_node_path = refined_path() + f'{slug_name}.json' + main_node = Node.load(main_node_path) + + storage = TransactionManager.get_transaction_manager(conf) + # all_nodes = 
main_node.get_all(level=docspec.embed_level) storage.store_with_transaction(main_node) @@ -93,52 +155,23 @@ def clean(): def run(force_download=False, path=None): conf = get_config() - transaction_manager = TransactionManager.get_transaction_manager(conf) - # Load Docspecs filenames = get_docspecs(path) docspecs = [DocumentSpec.load(filename) for filename in filenames] for docspec in docspecs: - slug_name = slugify(docspec.name) - - # Download documents - target_filename = f'{slug_name}.html' - raw_path = root_path() / conf['storage']['raw'] / target_filename - - if force_download or not os.path.exists(raw_path): - print(f"Downloading document `{docspec.name}`...") - text = download(docspec) - - os.makedirs(os.path.dirname(raw_path), exist_ok=True) - with open(raw_path, 'w') as file: - file.write(text) - else: - with open(raw_path, 'r') as file: - text = file.read() + # Download document + download_doc(docspec, conf, force_download) # Refining documents - target = refined_path() + f'{slug_name}.json' - - main_node = get_document_structure(text, docspec=docspec) - - main_node[0].save(target) - print(f"Saved refined in '{target_filename}'.") + refine(docspec, conf) - # Saving json - html_path = root_path() / conf['storage']['html'] / f'{slug_name}.html' - os.makedirs(os.path.dirname(html_path), exist_ok=True) - with open(html_path, 'w', encoding='utf-8') as file: - file.write(main_node[0].html( - preamble=""" - - """ - )) - print(f"HTML saved to '{slug_name}.html'.") + # Render html + render_html(docspec, conf) # Ingesting into vector database - ingest(main_node[0], docspec, storage=transaction_manager) + ingest(docspec, conf) if __name__ == "__main__": diff --git a/src/verdictnet/ingestion/downloader.py b/src/verdictnet/ingestion/downloader.py index c309b19..82c7df3 100644 --- a/src/verdictnet/ingestion/downloader.py +++ b/src/verdictnet/ingestion/downloader.py @@ -88,6 +88,7 @@ def create_session(): "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; 
rv:132.0) Gecko/20100101 Firefox/132.0" } + logger.info("Creating session: %s", f"{BASE_URL}/search/") response = session.get(f"{BASE_URL}/search/", headers=headers_dict) if response.status_code != 200: diff --git a/src/verdictnet/models/node.py b/src/verdictnet/models/node.py index 07497fc..04bf3bd 100644 --- a/src/verdictnet/models/node.py +++ b/src/verdictnet/models/node.py @@ -83,7 +83,8 @@ def save(self, path): with fs.open(path, 'w', encoding='utf8') as file: json.dump(self.json(), file, indent=4, ensure_ascii=False) - def load(self, path): + @classmethod + def load(cls, path): """ Load the node from a file """ From eaf782562da50f6b4d0cb0e07abbf37356e36eca Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Wed, 29 Jan 2025 18:44:00 +0100 Subject: [PATCH 16/27] improve testing imports --- tests/conftest.py | 23 +++++++++++++++--- tests/ingestion/test_documentspec.py | 2 +- tests/ingestion/test_ingest.py | 2 +- tests/ingestion/test_parser.py | 2 +- tests/ingestion/test_pdf_parser.py | 4 ++-- tests/resources/config.ini | 29 +++++++++++++++++++++++ tests/storage/conftest.py | 3 +-- tests/storage/test_adapter.py | 4 ++-- tests/storage/test_chroma_storage.py | 6 ++--- tests/storage/test_graph_storage.py | 4 ++-- tests/storage/test_hybrid_storage.py | 4 ++-- tests/storage/test_transaction_manager.py | 6 ++--- 12 files changed, 67 insertions(+), 22 deletions(-) create mode 100644 tests/resources/config.ini diff --git a/tests/conftest.py b/tests/conftest.py index 312fe11..34c4f96 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,17 +1,34 @@ +import configparser from pathlib import Path +from unittest.mock import patch from bs4 import BeautifulSoup from pytest import fixture -from config import root_path -from ingestion.documentspec import DocumentSpec -from ingestion.parsers.html_parser import parse +from verdictnet.config import get_config + + +@fixture(autouse=True) +def mock_config(): + def mock_get_config(): + config = configparser.ConfigParser() + 
config.read('resources/config.ini') + return config + + with patch('verdictnet.config.get_config', mock_get_config): + yield + + +from verdictnet.ingestion.documentspec import DocumentSpec +from verdictnet.ingestion.parsers.html_parser import parse resources = Path(__file__).parent / "resources" + @fixture def static_files(): + from verdictnet.config import root_path return root_path() / "verdictnet/frontend/static/css" diff --git a/tests/ingestion/test_documentspec.py b/tests/ingestion/test_documentspec.py index f3fc49a..b954bb3 100644 --- a/tests/ingestion/test_documentspec.py +++ b/tests/ingestion/test_documentspec.py @@ -3,7 +3,7 @@ from pytest import fixture from conftest import resources -from ingestion.documentspec import DocumentSpec +from verdictnet.ingestion.documentspec import DocumentSpec @fixture diff --git a/tests/ingestion/test_ingest.py b/tests/ingestion/test_ingest.py index d8610a1..3b4e71c 100644 --- a/tests/ingestion/test_ingest.py +++ b/tests/ingestion/test_ingest.py @@ -2,7 +2,7 @@ from unittest.mock import patch import pytest -from etl import get_docspecs +from verdictnet.etl import get_docspecs @pytest.fixture diff --git a/tests/ingestion/test_parser.py b/tests/ingestion/test_parser.py index eb28839..260db44 100644 --- a/tests/ingestion/test_parser.py +++ b/tests/ingestion/test_parser.py @@ -2,7 +2,7 @@ from bs4 import BeautifulSoup from conftest import codigo_civil_spec -from ingestion.parsers.html_parser import parse +from verdictnet.ingestion.parsers.html_parser import parse @fixture(scope='class') diff --git a/tests/ingestion/test_pdf_parser.py b/tests/ingestion/test_pdf_parser.py index b6feae1..fbc1a0e 100644 --- a/tests/ingestion/test_pdf_parser.py +++ b/tests/ingestion/test_pdf_parser.py @@ -2,8 +2,8 @@ import os from conftest import resources -from models.node import Node -from ingestion.parsers.pdf_parser import extract_paragraphs +from verdictnet.models.node import Node +from verdictnet.ingestion.parsers.pdf_parser import 
extract_paragraphs @pytest.fixture diff --git a/tests/resources/config.ini b/tests/resources/config.ini new file mode 100644 index 0000000..cd2617e --- /dev/null +++ b/tests/resources/config.ini @@ -0,0 +1,29 @@ +[storage] +type: s3 +bucket: legal +collection: legal-database +raw: datalake/raw/ +refined: datalake/refined/ +html: datalake/html/ + +[chroma] +type: http +host: localhost +port: 8000 + +[neo4j] +url: bolt://localhost:7687 +user: neo4j +password: neo4jtest + +[embedding] +model_name_or_path: paraphrase-mpnet-base-v2 +cache: cache/ + +[rag] +n_results: 5 + +[s3] +key = minioadmin +secret = minioadmin +endpoint_url = http://localhost:9000 \ No newline at end of file diff --git a/tests/storage/conftest.py b/tests/storage/conftest.py index d984c8f..97e59dc 100644 --- a/tests/storage/conftest.py +++ b/tests/storage/conftest.py @@ -1,7 +1,6 @@ import pytest -from storage.graph_storage import GraphStorage -from storage.hybrid_storage import HybridStorage +from verdictnet.storage.hybrid_storage import HybridStorage @pytest.fixture diff --git a/tests/storage/test_adapter.py b/tests/storage/test_adapter.py index 5e45abe..099286a 100644 --- a/tests/storage/test_adapter.py +++ b/tests/storage/test_adapter.py @@ -1,5 +1,5 @@ -from models.node import Node -from storage.adapters import NodeAdapter +from verdictnet.models.node import Node +from verdictnet.storage.adapters import NodeAdapter def test_to_neo4j_with_relationships_single_node(): diff --git a/tests/storage/test_chroma_storage.py b/tests/storage/test_chroma_storage.py index cc9685e..46c805d 100644 --- a/tests/storage/test_chroma_storage.py +++ b/tests/storage/test_chroma_storage.py @@ -1,9 +1,9 @@ import chromadb import pytest from unittest.mock import MagicMock -from models.node import Node -from storage.chroma_storage import ChromaStorage -from embedding import Embedding +from verdictnet.models.node import Node +from verdictnet.storage.chroma_storage import ChromaStorage +from verdictnet.embedding import 
Embedding @pytest.fixture diff --git a/tests/storage/test_graph_storage.py b/tests/storage/test_graph_storage.py index c2d275f..673c0bd 100644 --- a/tests/storage/test_graph_storage.py +++ b/tests/storage/test_graph_storage.py @@ -5,8 +5,8 @@ from neo4j import Result -from models.node import Node -from storage.graph_storage import GraphStorage +from verdictnet.models.node import Node +from verdictnet.storage.graph_storage import GraphStorage @pytest.fixture diff --git a/tests/storage/test_hybrid_storage.py b/tests/storage/test_hybrid_storage.py index 095351f..c7e3100 100644 --- a/tests/storage/test_hybrid_storage.py +++ b/tests/storage/test_hybrid_storage.py @@ -1,7 +1,7 @@ import pytest from unittest.mock import MagicMock -from models.node import Node -from storage.hybrid_storage import HybridStorage +from verdictnet.models.node import Node +from verdictnet.storage.hybrid_storage import HybridStorage @pytest.fixture diff --git a/tests/storage/test_transaction_manager.py b/tests/storage/test_transaction_manager.py index 51adc15..78d19cc 100644 --- a/tests/storage/test_transaction_manager.py +++ b/tests/storage/test_transaction_manager.py @@ -1,8 +1,8 @@ import pytest from unittest.mock import MagicMock, call -from models.node import Node -from storage.transaction_manager import TransactionManager -from storage.hybrid_storage import HybridStorage +from verdictnet.models.node import Node +from verdictnet.storage.transaction_manager import TransactionManager +from verdictnet.storage.hybrid_storage import HybridStorage @pytest.fixture From d73b155919ff7f7171fd9f4e5325e8e5994eae30 Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Wed, 29 Jan 2025 18:44:15 +0100 Subject: [PATCH 17/27] add local config --- config/local/airflow.cfg | 6 ++++++ docker-compose.yml | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/config/local/airflow.cfg b/config/local/airflow.cfg index 7b7fc12..b396ebe 100644 --- a/config/local/airflow.cfg +++ b/config/local/airflow.cfg @@ 
-4,3 +4,9 @@ remote_base_log_folder = s3://airflow-logs remote_log_conn_id = minio encrypt_s3_logs = False +[webserver] +default_dag_run_display_number = 250 +expose_config = True + +[celery] +worker_concurrency = 2 diff --git a/docker-compose.yml b/docker-compose.yml index dec68ba..ca88d3d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -85,13 +85,14 @@ x-airflow-common: _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} # The following line can be used to set a custom config file, stored in the local config folder # If you want to use it, outcomment it and replace airflow.cfg with the name of your config file - # AIRFLOW_CONFIG: '/opt/airflow/config/airflow.cfg' + AIRFLOW_CONFIG: '/opt/airflow/config/airflow.cfg' volumes: - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins - ${PWD}/config/local/config.ini:/opt/airflow/config.ini + - ${PWD}/config/local/airflow.cfg:/opt/airflow/config/airflow.cfg user: "${AIRFLOW_UID:-50000}:0" depends_on: &airflow-common-depends-on From a309e25c8861d27858363e08dde7c9bae6eee05e Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Wed, 29 Jan 2025 19:25:16 +0100 Subject: [PATCH 18/27] Add codigo civil DAG --- dags/codigo_civil_penal.py | 91 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 dags/codigo_civil_penal.py diff --git a/dags/codigo_civil_penal.py b/dags/codigo_civil_penal.py new file mode 100644 index 0000000..25e34b0 --- /dev/null +++ b/dags/codigo_civil_penal.py @@ -0,0 +1,91 @@ +import pendulum +from airflow import DAG +from airflow.operators.empty import EmptyOperator +from airflow.operators.python import PythonOperator +from datetime import timedelta + +from slugify import slugify + +from verdictnet.config import get_config +from verdictnet.etl import get_docspecs, download_doc, refine, render_html, ingest 
+from verdictnet.ingestion.documentspec import DocumentSpec + +# Define the default arguments +default_args = { + 'owner': 'airflow', + 'depends_on_past': False, + 'start_date': pendulum.today('UTC'), # Start date 8 weeks ago + 'email_on_failure': False, + 'email_on_retry': False, + 'retries': 1, + 'retry_delay': timedelta(minutes=5), +} + + +def lazy_download_doc(docspec): + conf = get_config() + download_doc(docspec, conf, force_download=False) + + +def lazy_refine(docspec): + conf = get_config() + refine(docspec, conf) + + +def lazy_render_html(docspec): + conf = get_config() + render_html(docspec, conf) + + +def lazy_ingest(docspec): + conf = get_config() + ingest(docspec, conf) + + +# Define the DAG +with DAG( + 'download_codigos', + default_args=default_args, + description='Download Codigo Civil y Penal', + schedule='@manual', + catchup=False, +): + filenames = get_docspecs() + docspecs = [DocumentSpec.load(filename) for filename in filenames] + + download_task = {} + refine_task = {} + render_task = {} + ingest_task = {} + + start = EmptyOperator(task_id='start_task') + + for docspec in docspecs: + name = slugify(docspec.name) + + download_task[name] = PythonOperator( + task_id=f'download_{name}', + python_callable=lazy_download_doc, + op_args=[docspec], + ) + + refine_task[name] = PythonOperator( + task_id=f'refine_{name}', + python_callable=lazy_refine, + op_args=[docspec], + ) + + render_task[name] = PythonOperator( + task_id=f'render_{name}', + python_callable=lazy_render_html, + op_args=[docspec], + ) + + ingest_task[name] = PythonOperator( + task_id=f'ingest_{name}', + python_callable=lazy_ingest, + op_args=[docspec], + ) + + start >> download_task[name] >> refine_task[name] + download_task[name] >> render_task[name] >> ingest_task[name] From cd6676286f87ac3199e2b3abad66fe0dca2fdfed Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Wed, 29 Jan 2025 19:25:50 +0100 Subject: [PATCH 19/27] add local config file --- config/airflow.cfg | 0 1 file changed, 0 
insertions(+), 0 deletions(-) create mode 100755 config/airflow.cfg diff --git a/config/airflow.cfg b/config/airflow.cfg new file mode 100755 index 0000000..e69de29 From cb317a9dae3d69ce9e0ea8eded4497a616eed1c5 Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Tue, 4 Feb 2025 08:34:59 +0100 Subject: [PATCH 20/27] add Dockerfile and dev requirements --- Dockerfile | 18 +++-- Makefile | 14 +++- dags/codigo_civil_penal.py | 67 ++++++++-------- docker-compose.yml | 4 +- requirements-dev.txt | 1 + requirements.txt | 2 - setup.cfg | 4 + src/verdictnet/config.py | 3 +- src/verdictnet/etl.py | 43 +++++----- src/verdictnet/ingestion/paths.py | 11 +++ src/verdictnet/models/node.py | 9 ++- tests/conftest.py | 15 ---- tests/models/test_node.py | 128 ++++++++++++++++++++++++++++++ tests/resources/config.ini | 3 +- tests/test_node.py | 29 ------- 15 files changed, 239 insertions(+), 112 deletions(-) create mode 100644 requirements-dev.txt create mode 100644 tests/models/test_node.py delete mode 100644 tests/test_node.py diff --git a/Dockerfile b/Dockerfile index 6199b5a..b053566 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,13 +1,15 @@ FROM apache/airflow:2.10.0 -USER root -COPY requirements.txt . -COPY src/ src/ -COPY setup.py . -RUN chown -R airflow src/ -RUN apt-get update && apt-get install -y build-essential +# Set the working directory +WORKDIR /app # Switch to airflow user to run the application USER airflow -RUN pip install -r requirements.txt -RUN pip install . \ No newline at end of file + +# Copy the requirements file and install dependencies +COPY requirements.txt . 
+ +RUN pip install --no-cache-dir -r requirements.txt + +# Set the entrypoint to Airflow +ENTRYPOINT ["airflow"] diff --git a/Makefile b/Makefile index 0986c2e..c30eae3 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: setup install start profile server etl test build clean +.PHONY: setup install start stop profile server etl test build clean # Default port for the server PORT ?= 8000 @@ -35,9 +35,21 @@ setup: install .PHONY: start start: + @echo "Detecting virtual environment..." + @if [ -n "$$VIRTUAL_ENV" ]; then \ + echo "Virtual environment detected at $$VIRTUAL_ENV"; \ + export VIRTUAL_ENV_PATH=$$VIRTUAL_ENV; \ + else \ + echo "No virtual environment detected."; \ + fi @echo "Starting up the microservices..." @docker-compose up -d +.PHONY: stop +stop: + @echo "Stopping the microservices..." + @docker-compose down + .PHONY: profile profile: @py-spy record -o profile.svg -- python dags/jurisprudencia.py diff --git a/dags/codigo_civil_penal.py b/dags/codigo_civil_penal.py index 25e34b0..2500960 100644 --- a/dags/codigo_civil_penal.py +++ b/dags/codigo_civil_penal.py @@ -1,5 +1,6 @@ import pendulum from airflow import DAG +from airflow.decorators import task_group from airflow.operators.empty import EmptyOperator from airflow.operators.python import PythonOperator from datetime import timedelta @@ -42,50 +43,52 @@ def lazy_ingest(docspec): ingest(docspec, conf) +def group(name, docspec): + """ + Process a specific document + """ + download_task = PythonOperator( + task_id=f'download_{name}', + python_callable=lazy_download_doc, + op_args=[docspec], + ) + + refine_task = PythonOperator( + task_id=f'refine_{name}', + python_callable=lazy_refine, + op_args=[docspec], + ) + + render_task = PythonOperator( + task_id=f'render_{name}', + python_callable=lazy_render_html, + op_args=[docspec], + ) + + ingest_task = PythonOperator( + task_id=f'ingest_{name}', + python_callable=lazy_ingest, + op_args=[docspec], + ) + + download_task >> refine_task >> render_task 
>> ingest_task + + return download_task + # Define the DAG with DAG( 'download_codigos', default_args=default_args, description='Download Codigo Civil y Penal', - schedule='@manual', + schedule="@once", catchup=False, ): filenames = get_docspecs() docspecs = [DocumentSpec.load(filename) for filename in filenames] - download_task = {} - refine_task = {} - render_task = {} - ingest_task = {} - start = EmptyOperator(task_id='start_task') for docspec in docspecs: name = slugify(docspec.name) - download_task[name] = PythonOperator( - task_id=f'download_{name}', - python_callable=lazy_download_doc, - op_args=[docspec], - ) - - refine_task[name] = PythonOperator( - task_id=f'refine_{name}', - python_callable=lazy_refine, - op_args=[docspec], - ) - - render_task[name] = PythonOperator( - task_id=f'render_{name}', - python_callable=lazy_render_html, - op_args=[docspec], - ) - - ingest_task[name] = PythonOperator( - task_id=f'ingest_{name}', - python_callable=lazy_ingest, - op_args=[docspec], - ) - - start >> download_task[name] >> refine_task[name] - download_task[name] >> render_task[name] >> ingest_task[name] + start >> group(name, docspec) diff --git a/docker-compose.yml b/docker-compose.yml index ca88d3d..8321f3b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -86,13 +86,15 @@ x-airflow-common: # The following line can be used to set a custom config file, stored in the local config folder # If you want to use it, outcomment it and replace airflow.cfg with the name of your config file AIRFLOW_CONFIG: '/opt/airflow/config/airflow.cfg' + PYTHONPATH: /app:${PYTHONPATH:-} # Append custom path to existing PYTHONPATH volumes: - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins - - ${PWD}/config/local/config.ini:/opt/airflow/config.ini - ${PWD}/config/local/airflow.cfg:/opt/airflow/config/airflow.cfg + - 
./src/verdictnet:/app/verdictnet # Mount local package directory to container + - ${PWD}/config/local/config.ini:/app/verdictnet/config.ini # Mount local config file to container user: "${AIRFLOW_UID:-50000}:0" depends_on: &airflow-common-depends-on diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..ce3e9fc --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1 @@ +line_profiler diff --git a/requirements.txt b/requirements.txt index 5516ad4..3ecc33b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -187,7 +187,6 @@ kombu==5.4.2 kubernetes==31.0.0 lazy-object-proxy==1.10.0 limits==4.0.1 -line_profiler linkify-it-py==2.0.3 llama-cloud==0.1.6 llama-index==0.12.5 @@ -358,7 +357,6 @@ wcwidth==0.2.13 websocket-client==1.8.0 websockets==12.0 wirerope==1.0.0 -wordcloud==1.9.4 wrapt==1.17.0 WTForms==3.2.1 yarl==1.18.3 diff --git a/setup.cfg b/setup.cfg index 27e2ef5..f38b784 100644 --- a/setup.cfg +++ b/setup.cfg @@ -20,6 +20,10 @@ install_requires = pyarrow PyMuPDF +[options.package_data] +verdictnet.ingestion = + resources/*.json + [options.packages.find] where=src diff --git a/src/verdictnet/config.py b/src/verdictnet/config.py index 20dda55..32a617e 100644 --- a/src/verdictnet/config.py +++ b/src/verdictnet/config.py @@ -20,7 +20,7 @@ def root_path(): - return Path(__file__).parent.parent + return Path(__file__).parent def get_config(): @@ -70,6 +70,7 @@ def configure_fsspec(): def get_fs(conf: Optional[configparser.ConfigParser] = None): + configure_fsspec() conf = conf or get_config() if conf['storage']['type'] == 's3': fs = fsspec.filesystem("s3") diff --git a/src/verdictnet/etl.py b/src/verdictnet/etl.py index b48e04d..92660f1 100644 --- a/src/verdictnet/etl.py +++ b/src/verdictnet/etl.py @@ -7,9 +7,9 @@ from slugify import slugify from bs4 import BeautifulSoup -from verdictnet.config import get_config, root_path, logging +from verdictnet.config import get_config, root_path, logging, get_fs from 
verdictnet.ingestion.documentspec import DocumentSpec -from verdictnet.ingestion.paths import refined_path +from verdictnet.ingestion.paths import refined_path, raw_path, html_path from verdictnet.models.node import Node from verdictnet.ingestion.parsers.html_parser import parse from verdictnet.storage.hybrid_storage import HybridStorage @@ -31,7 +31,9 @@ def get_docspecs(path: Path = None) -> List[Path]: filenames.append(Path(root + '/' + file)) if not filenames: - print(f"No document Spec filed found in provided folder {path}") + logger.warning(f"No document Spec files found in provided folder {path}") + else: + logger.info(f"Found {len(filenames)} document specs: %s", ",".join([str(f) for f in filenames])) return filenames @@ -76,18 +78,19 @@ def download_doc(docspec: DocumentSpec, conf, force_download=False): """ Download the document and save it to the raw folder in html format """ + fs = get_fs(conf) + slug_name = slugify(docspec.name) - target_filename = f'{slug_name}.html' - raw_path = root_path() / conf['storage']['raw'] / target_filename + target_path = raw_path() + f'{slug_name}.html' # Download documents - if force_download or not os.path.exists(raw_path): - print(f"Downloading document `{docspec.name}`...") + if force_download or not os.path.exists(target_path): + logger.info(f"Downloading document `{docspec.name}`...") text = download(docspec) - os.makedirs(os.path.dirname(raw_path), exist_ok=True) - with open(raw_path, 'w') as file: + os.makedirs(os.path.dirname(target_path), exist_ok=True) + with fs.open(target_path, 'w') as file: file.write(text) @@ -95,12 +98,14 @@ def refine(docspec, conf): """ Take the document in html format and refine it to a json format """ + fs = get_fs(conf) + slug_name = slugify(docspec.name) - target_filename = f'{slug_name}.html' - raw_path = root_path() / conf['storage']['refined'] / target_filename + source_filename = f'{slug_name}.html' + soure_path = raw_path() + source_filename - with open(raw_path, 'r') as file: + 
with fs.open(soure_path, 'r') as file: text = file.read() target = refined_path() + f'{slug_name}.json' @@ -108,23 +113,24 @@ def refine(docspec, conf): main_node = get_document_structure(text, docspec=docspec) main_node[0].save(target) - print(f"Saved refined in '{target_filename}'.") + logger.info(f"Saved refined in '{target}'.") def render_html(docspec: DocumentSpec, conf: ConfigParser): + fs = get_fs(conf) + slug_name = slugify(docspec.name) main_node_path = refined_path() + f'{slug_name}.json' main_node = Node.load(main_node_path) - html_path = root_path() / conf['storage']['html'] / f'{slug_name}.html' - os.makedirs(os.path.dirname(html_path), exist_ok=True) - with open(html_path, 'w', encoding='utf-8') as file: - file.write(main_node[0].html( + html_file = html_path() + f'{slug_name}.html' + with fs.open(html_file, 'w', encoding='utf-8') as file: + file.write(main_node.html( preamble=""" """ )) - logger.info(f"HTML saved to '{slug_name}.html'.") + logger.info(f"HTML saved to '{html_file}'.") def ingest(docspec: DocumentSpec, conf: ConfigParser): @@ -160,7 +166,6 @@ def run(force_download=False, path=None): docspecs = [DocumentSpec.load(filename) for filename in filenames] for docspec in docspecs: - # Download document download_doc(docspec, conf, force_download) diff --git a/src/verdictnet/ingestion/paths.py b/src/verdictnet/ingestion/paths.py index 00be14f..7d2e06f 100644 --- a/src/verdictnet/ingestion/paths.py +++ b/src/verdictnet/ingestion/paths.py @@ -43,3 +43,14 @@ def refined_path(): return root_path() / conf['storage']['bucket'] / conf['storage']['refined'] elif conf['storage']['type'] == 's3': return f"s3://{conf['storage']['bucket']}/{conf['storage']['refined']}" + + +def html_path(): + """ + Return the path where we store refined objects as JSON files ready to be ingested + into the database + """ + if conf['storage']['type'] == 'local': + return root_path() / conf['storage']['bucket'] / conf['storage']['html'] + elif conf['storage']['type'] == 
's3': + return f"s3://{conf['storage']['bucket']}/{conf['storage']['html']}" diff --git a/src/verdictnet/models/node.py b/src/verdictnet/models/node.py index 04bf3bd..dd2b24e 100644 --- a/src/verdictnet/models/node.py +++ b/src/verdictnet/models/node.py @@ -88,9 +88,11 @@ def load(cls, path): """ Load the node from a file """ - with open(path, 'r', encoding='utf8') as file: - data = json.load(file, ensure_ascii=False) - return Node(**data) + fs = get_fs() + + with fs.open(path, 'r', encoding='utf8') as file: + data = json.load(file) + return Node.from_dict(data) @classmethod def from_dict(cls, data): @@ -100,6 +102,7 @@ def from_dict(cls, data): return Node( id=data['id'], level=data['level'], + uuid=data['uuid'], content=data['content'], children=[cls.from_dict(child) for child in data['children']] ) diff --git a/tests/conftest.py b/tests/conftest.py index 34c4f96..753d5bc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,8 +5,6 @@ from bs4 import BeautifulSoup from pytest import fixture -from verdictnet.config import get_config - @fixture(autouse=True) def mock_config(): @@ -25,19 +23,6 @@ def mock_get_config(): resources = Path(__file__).parent / "resources" - -@fixture -def static_files(): - from verdictnet.config import root_path - return root_path() / "verdictnet/frontend/static/css" - - -@fixture -def css_code(static_files): - with open(static_files / 'document_tree.css', 'r') as css_file: - yield css_file.read() - - @fixture def codigo_civil_spec(): docspec = DocumentSpec.load(resources / "codigo-civil-spec.json") diff --git a/tests/models/test_node.py b/tests/models/test_node.py new file mode 100644 index 0000000..1246596 --- /dev/null +++ b/tests/models/test_node.py @@ -0,0 +1,128 @@ +import json +import tempfile +import webbrowser + +from _pytest.fixtures import fixture +from bs4 import BeautifulSoup + + +from verdictnet.models.node import Node + + +@fixture +def static_files(): + from verdictnet.config import root_path + return root_path() / 
"frontend/static/css" + + +@fixture +def css_code(static_files): + with open(static_files / 'document_tree.css', 'r') as css_file: + yield css_file.read() + + +class TestNode: + def test_render(self, node_titulo): + text = node_titulo.render() + assert " 2. Carecerán de validez las disposiciones que contradigan otra de rango superior." in text + + def test_html(self, node_titulo, css_code): + html = node_titulo.html() + + # Insert the CSS link into the HTML content + html_with_css = f'{html}' + + # open html_text in a browser to see the result + with tempfile.NamedTemporaryFile('w', delete=False, suffix='.html') as f: + url = 'file://' + f.name + f.write(html_with_css) + + # ensure html is correctly formatted + try: + assert BeautifulSoup(html_with_css, 'html.parser') + except Exception as e: + webbrowser.open(url) + raise e + + def test_save(self): + node = Node(level="1", content="Test Node") + with tempfile.NamedTemporaryFile('w', delete=False, suffix='.json') as f: + path = f.name + node.save(path) + + with open(path, 'r', encoding='utf8') as file: + data = json.load(file) + + assert data['level'] == "1" + assert data['content'] == "Test Node" + assert 'uuid' in data + + def test_load(self): + node_data = { + "id": 1, + "uuid": "1234", + "level": "1", + "content": "Test Node", + "children": [] + } + with tempfile.NamedTemporaryFile('w', delete=False, suffix='.json') as f: + path = f.name + json.dump(node_data, f) + + loaded_node = Node.load(path) + + assert loaded_node.level == "1" + assert loaded_node.content == "Test Node" + assert loaded_node.uuid == "1234" + assert loaded_node.children == [] + + def test_save_with_children(self): + child_node = Node(level="2", content="Child Node") + parent_node = Node(level="1", content="Parent Node", children=[child_node]) + + with tempfile.NamedTemporaryFile('w', delete=False, suffix='.json') as f: + path = f.name + parent_node.save(path) + + with open(path, 'r', encoding='utf8') as file: + data = json.load(file) + + 
assert data['level'] == "1" + assert data['content'] == "Parent Node" + assert 'uuid' in data + assert len(data['children']) == 1 + assert data['children'][0]['level'] == "2" + assert data['children'][0]['content'] == "Child Node" + assert 'uuid' in data['children'][0] + + def test_load_with_children(self): + node_data = { + "id": 1, + "uuid": "1234", + "level": "1", + "content": "Parent Node", + "children": [ + { + "id": 2, + "uuid": "5678", + "level": "2", + "content": "Child Node", + "children": [] + } + ] + } + + with tempfile.NamedTemporaryFile('w', delete=False, suffix='.json') as f: + path = f.name + json.dump(node_data, f) + + loaded_node = Node.load(path) + + assert loaded_node.level == "1" + assert loaded_node.content == "Parent Node" + assert loaded_node.uuid == "1234" + assert len(loaded_node.children) == 1 + assert loaded_node.children[0].level == "2" + assert loaded_node.children[0].content == "Child Node" + assert loaded_node.children[0].uuid == "5678" + assert loaded_node.children[0].children == [] diff --git a/tests/resources/config.ini b/tests/resources/config.ini index cd2617e..9e84700 100644 --- a/tests/resources/config.ini +++ b/tests/resources/config.ini @@ -1,5 +1,6 @@ [storage] -type: s3 +# s3 or file +type: file bucket: legal collection: legal-database raw: datalake/raw/ diff --git a/tests/test_node.py b/tests/test_node.py deleted file mode 100644 index da6614a..0000000 --- a/tests/test_node.py +++ /dev/null @@ -1,29 +0,0 @@ -import tempfile -import webbrowser - -from bs4 import BeautifulSoup - - -class TestNode: - def test_render(self, node_titulo): - text = node_titulo.render() - assert " 2. Carecerán de validez las disposiciones que contradigan otra de rango superior." 
in text - - def test_html(self, node_titulo, css_code): - html = node_titulo.html() - - # Insert the CSS link into the HTML content - html_with_css = f'{html}' - - # open html_text in a browser to see the result - with tempfile.NamedTemporaryFile('w', delete=False, suffix='.html') as f: - url = 'file://' + f.name - f.write(html_with_css) - - # ensure html is correctly formatted - try: - assert BeautifulSoup(html_with_css, 'html.parser') - except Exception as e: - webbrowser.open(url) - raise e - From 2c21d207a9e61a687acee68f362d7a5a6c1a18e4 Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Tue, 4 Feb 2025 08:36:52 +0100 Subject: [PATCH 21/27] add logs to gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index f4ce3e0..853c3b4 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,7 @@ */*.egg-info/* .idea/ build/ - +logs/ __pycache__/ docs/ cache/ From 23271fe2f7720be396d85d24df88b201c48a62f7 Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Tue, 4 Feb 2025 09:22:45 +0100 Subject: [PATCH 22/27] fix mock config in tests --- pytest.ini | 3 +++ requirements.txt | 1 - setup.cfg | 4 ++++ src/verdictnet/config.py | 3 ++- tests/conftest.py | 3 ++- tests/resources/config.ini | 8 ++++---- 6 files changed, 15 insertions(+), 7 deletions(-) create mode 100644 pytest.ini diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..050e475 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +log_level=INFO +log_cli=true \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 3ecc33b..b32ce8b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -285,7 +285,6 @@ pytest-mock==3.14.0 python-daemon==3.1.2 python-dateutil==2.9.0.post0 python-dotenv==1.0.1 -python-multipart==0.0.17 python-nvd3==0.16.0 python-slugify==8.0.4 pytz==2024.2 diff --git a/setup.cfg b/setup.cfg index f38b784..8a6db70 100644 --- a/setup.cfg +++ b/setup.cfg @@ -23,6 +23,10 @@ install_requires = 
[options.package_data] verdictnet.ingestion = resources/*.json +verdictnet.frontend = + static/css/* + static/js/* + templates/* [options.packages.find] where=src diff --git a/src/verdictnet/config.py b/src/verdictnet/config.py index 32a617e..cbe587c 100644 --- a/src/verdictnet/config.py +++ b/src/verdictnet/config.py @@ -28,7 +28,8 @@ def get_config(): config = configparser.ConfigParser() # Load the configuration file from the current folder, or from the package root folder - config.read(filenames=['config.ini', root_path() / 'config.ini']) + files = config.read(filenames=['config.ini', root_path() / 'config.ini']) + logger.info("Successfully loaded config files: %s", files) return config diff --git a/tests/conftest.py b/tests/conftest.py index 753d5bc..50b9128 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,7 +10,8 @@ def mock_config(): def mock_get_config(): config = configparser.ConfigParser() - config.read('resources/config.ini') + success = config.read(['resources/config.ini', 'tests/resources/config.ini']) + assert success is not [], "Could not read mock config file" return config with patch('verdictnet.config.get_config', mock_get_config): diff --git a/tests/resources/config.ini b/tests/resources/config.ini index 9e84700..5782e55 100644 --- a/tests/resources/config.ini +++ b/tests/resources/config.ini @@ -14,8 +14,8 @@ port: 8000 [neo4j] url: bolt://localhost:7687 -user: neo4j -password: neo4jtest +user: none +password: nopwd [embedding] model_name_or_path: paraphrase-mpnet-base-v2 @@ -25,6 +25,6 @@ cache: cache/ n_results: 5 [s3] -key = minioadmin -secret = minioadmin +key = none +secret = nopwd endpoint_url = http://localhost:9000 \ No newline at end of file From f071d5ac053d6537854ae44bc6dfcbd54338573c Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Tue, 4 Feb 2025 14:34:36 +0100 Subject: [PATCH 23/27] improve setup & readme --- Makefile | 47 ++++++++++++++++++++++++++++++++++++++++------ README.md | 33 +++++++++++++++++++++++++++----- 
config/airflow.cfg | 0 docker-compose.yml | 21 ++++++++++----------- 4 files changed, 79 insertions(+), 22 deletions(-) delete mode 100755 config/airflow.cfg diff --git a/Makefile b/Makefile index c30eae3..421c88f 100644 --- a/Makefile +++ b/Makefile @@ -14,9 +14,26 @@ install: .PHONY: setup setup: install - @echo "Generating .env file with FERNET_KEY..." - @python3 -c "from cryptography.fernet import Fernet; print(f'FERNET_KEY={Fernet.generate_key().decode()}')" > .env - @echo ".env file generated." + @if [ ! -f .env ]; then \ + echo "Generating .env file with FERNET_KEY..."; \ + echo "Generating .env file with FERNET_KEY..."; \ + python3 -c "from cryptography.fernet import Fernet; \ + fernet_key = Fernet.generate_key().decode(); \ + template = 'FERNET_KEY={fernet_key}\\nAIRFLOW_UID=50000'; \ + print(template.format(fernet_key=fernet_key)); \ + print('_AIRFLOW_WWW_USER_USERNAME=airflow'); \ + print('_AIRFLOW_WWW_USER_PASSWORD=airflow'); " > .env; \ + echo ".env file generated."; \ + else \ + echo ".env file already exists. Skipping FERNET_KEY generation."; \ + fi + @if [ ! -f config/local/.env ]; then \ + echo "Copying .env file to config/local..."; \ + cp .env config/local/; \ + echo ".env file copied."; \ + else \ + echo "config/local/.env file already exists. Skipping config/local copy."; \ + fi @echo "Launching minio..." @docker-compose up -d minio @echo "Waiting for minio to start..." @@ -26,9 +43,20 @@ setup: install done @echo "Setting up local alias..." @docker-compose exec minio mc alias set minio http://localhost:9000 minioadmin minioadmin - @echo "Creating buckets..." - @docker-compose exec minio sh -c "mc ls minio/legal || mc mb minio/legal" - @docker-compose exec minio mc mb minio/airflow-logs + @echo "Checking if bucket 'legal' exists..." + @if ! docker-compose exec minio mc ls minio/legal; then \ + echo "Creating bucket 'legal'..."; \ + docker-compose exec minio mc mb minio/legal; \ + else \ + echo "Bucket 'legal' already exists. 
Skipping creation."; \ + fi + @echo "Checking if bucket 'airflow-logs' exists..." + @if ! docker-compose exec minio mc ls minio/airflow-logs; then \ + echo "Creating bucket 'airflow-logs'..."; \ + docker-compose exec minio mc mb minio/airflow-logs; \ + else \ + echo "Bucket 'airflow-logs' already exists. Skipping creation."; \ + fi @echo "Minio setup complete. Stopping minio..." @docker-compose stop minio @echo "Setup complete." @@ -44,12 +72,19 @@ start: fi @echo "Starting up the microservices..." @docker-compose up -d + @echo "Done." + @echo "\nFrontends are available at the following links:" + @echo "ChromaDB: http://localhost:3000/collections/legal-database" + @echo "Neo4j: http://localhost:7474" + @echo "Minio: http://localhost:9000" + @echo "Airflow: http://localhost:8080" .PHONY: stop stop: @echo "Stopping the microservices..." @docker-compose down + .PHONY: profile profile: @py-spy record -o profile.svg -- python dags/jurisprudencia.py diff --git a/README.md b/README.md index 50af8f8..5ae8202 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,44 @@ -# Semantic Graph Search Project +# VerdictNet: Legal Semantic Search Engine This project is a semantic graph search system designed to manage and query data. Currently, the interface is a simple command-line interface (CLI) tool and a web server. The system supports data ingestion, cleaning, querying, and running a server for frontend interactions. -## Quick Start -You should be able to kickstart the project by launching the docker compose setup and then starting the server: +## Development Quick Start +It is strongly recommended to use a virtual environment to run the project. 
You should be able to kickstart the project by running the following commands: ```sh - $ docker-compose up + $ make setup ``` -This will launch the following services: + +This will +- Install the package requirements in the current python environment (python 3.12 recommended) +- Create an `.env` file with the necessary environment variables if it does not exist. Copy this `.env` file to the `config/local` directory if it does not already exist. +- Create the `datalake` and `airflow-logs` buckets in the Minio object storage. +- Install development dependencies. + +After this, you can start the development environment by running: +```sh + $ make start +``` +The first launch will take some time because it will build the docker images. + +When done, the following services will be up and running: - [ChromaDB Browser: `http://localhost:3000/collections/legal-database`](http://localhost:3000/collections/legal-database). This is the vector Database used to run semantic queries. - [Neo4J Browser: `http://localhost:7474`](http://localhost:7474). This is a GUI to the graph database that will hold the relationships between the different documents indexed in the ChromaDB. - [Airflow: `http://localhost:8080`](http://localhost:8080). This is the scheduler used to run daily data mining tasks. - [Minio Console: `http://localhost:9001`](http://localhost:9001). This is the object storage used to store the documents in local develpment envs. +The Postgres database is used by Airflow and is persisted to a `postgress_service`. This is useful if you want to do a clean start and not lose the data in the database. 
+ + + +### Running the ETL pipeline +To run the ETL pipeline, you can run the following command: +```sh + $ make etl +``` + Finally, run ```sh $ verdictnet server diff --git a/config/airflow.cfg b/config/airflow.cfg deleted file mode 100755 index e69de29..0000000 diff --git a/docker-compose.yml b/docker-compose.yml index 8321f3b..2d7ceea 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -89,9 +89,11 @@ x-airflow-common: PYTHONPATH: /app:${PYTHONPATH:-} # Append custom path to existing PYTHONPATH volumes: - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags - - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs - - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config - - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins + # TODO: need to figure out what happens with dag_processor_manager and scheduler logs, that don't go to minio + #- ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs + - ${AIRFLOW_PROJ_DIR:-.}/config/local/:/opt/airflow/config + # TODO: Not needed ATM, but might be useful in the future + #- ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins - ${PWD}/config/local/airflow.cfg:/opt/airflow/config/airflow.cfg - ./src/verdictnet:/app/verdictnet # Mount local package directory to container - ${PWD}/config/local/config.ini:/app/verdictnet/config.ini # Mount local config file to container @@ -111,7 +113,7 @@ services: ports: - "8000:8000" volumes: - - ${PWD}/chromadb_storage:/data + - ${PWD}/chromadb:/data chromadb-admin: image: fengzhichao/chromadb-admin:latest @@ -136,7 +138,7 @@ services: start_period: 5s restart: always volumes: - - ${PWD}/postgress_storage:/var/lib/postgresql/data + - ${PWD}/postgress:/var/lib/postgresql/data redis: # Redis is limited to 7.2-bookworm due to licencing change # https://redis.io/blog/redis-adopts-dual-source-available-licensing/ @@ -273,7 +275,8 @@ services: fi mkdir -p /sources/logs /sources/dags /sources/plugins chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} - exec airflow db init + exec airflow db migrate + exec 
airflow users create -u ${_AIRFLOW_WWW_USER_USERNAME} -p ${_AIRFLOW_WWW_USER_PASSWORD} -r Admin --verbose -f air -l flow -e airflow@airflow.air # yamllint enable rule:line-length environment: <<: *airflow-common-env @@ -294,7 +297,7 @@ services: - "9000:9000" # API - "9001:9001" # Console volumes: - - ${PWD}/data/:/data + - ${PWD}/minio/:/data command: server /data --console-address ":9001" neo4j: @@ -307,7 +310,3 @@ services: - "7687:7687" # Bolt protocol volumes: - ${PWD}/neo4j_data:/data - -volumes: - chromadb_data: - postgres_data: From 3a64b3e7ec906756c0094f02f50d287d7a8e4ae2 Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Tue, 4 Feb 2025 22:31:12 +0100 Subject: [PATCH 24/27] improve setup & readme --- .gitignore | 8 ++++++-- Makefile | 17 +++++++++++++++++ README.md | 2 ++ docker-compose.yml | 1 - 4 files changed, 25 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 853c3b4..015ed78 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,11 @@ __pycache__/ docs/ cache/ data/ -datalake/ +plugins/ + +minio/ +neo4j/ neo4j_data/ -postgress_storage/ +chromadb/ +postgress/ diff --git a/Makefile b/Makefile index 421c88f..bf4b4b2 100644 --- a/Makefile +++ b/Makefile @@ -59,6 +59,23 @@ setup: install fi @echo "Minio setup complete. Stopping minio..." @docker-compose stop minio + @echo "Initializing Airflow..." + @docker-compose up -d airflow-webserver + @echo "Waiting for Airflow to start..." + @until docker-compose exec airflow-webserver airflow db check; do \ + echo "Airflow is not healthy yet. Retrying in 5 seconds..."; \ + sleep 5; \ + done + @echo "Creating Airflow user..." + @docker-compose exec airflow-webserver airflow users create -u airflow -p airflow -r Admin --verbose -f air -l flow -e airflow@airflow.air + @if docker-compose exec airflow-webserver airflow connections get minio; then \ + echo "Connection 'minio' already exists. 
Skipping creation."; \ + else \ + echo "Creating connection 'minio'..."; \ + docker-compose exec airflow-webserver airflow connections add --conn-login minioadmin --conn-password minioadmin --conn-host minio --conn-port 9000 --conn-schema http --conn-extra '{"endpoint_url": "http://minio:9000"}' --conn-type aws minio; \ + fi + @echo "Stopping Airflow..." + @docker-compose stop airflow-webserver @echo "Setup complete." .PHONY: start diff --git a/README.md b/README.md index 5ae8202..6043221 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,8 @@ This will - Install the package requirements in the current python environment (python 3.12 recommended) - Create an `.env` file with the necessary environment variables if it does not exist. Copy this `.env` file to the `config/local` directory if it does not already exist. - Create the `datalake` and `airflow-logs` buckets in the Minio object storage. +- Create the `airflow` user in Airflow. +- Create the `minio` Airflow connection. - Install development dependencies. 
After this, you can start the development environment by running: diff --git a/docker-compose.yml b/docker-compose.yml index 2d7ceea..856fc95 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -276,7 +276,6 @@ services: mkdir -p /sources/logs /sources/dags /sources/plugins chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} exec airflow db migrate - exec airflow users create -u ${_AIRFLOW_WWW_USER_USERNAME} -p ${_AIRFLOW_WWW_USER_PASSWORD} -r Admin --verbose -f air -l flow -e airflow@airflow.air # yamllint enable rule:line-length environment: <<: *airflow-common-env From fe4e39efba33fe0dfb58fa8b68fab7e0a604398a Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Tue, 4 Feb 2025 22:44:45 +0100 Subject: [PATCH 25/27] fix typo --- .gitignore | 2 +- docker-compose.yml | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 015ed78..648a984 100644 --- a/.gitignore +++ b/.gitignore @@ -15,5 +15,5 @@ minio/ neo4j/ neo4j_data/ chromadb/ -postgress/ +postgres/ diff --git a/docker-compose.yml b/docker-compose.yml index 856fc95..ae8252c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -138,7 +138,8 @@ services: start_period: 5s restart: always volumes: - - ${PWD}/postgress:/var/lib/postgresql/data + - ${PWD}/postgres:/var/lib/postgresql/data + redis: # Redis is limited to 7.2-bookworm due to licencing change # https://redis.io/blog/redis-adopts-dual-source-available-licensing/ From c684233c86169d7a9463d5d5ecb3d7d3889eaeae Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Tue, 4 Feb 2025 23:42:52 +0100 Subject: [PATCH 26/27] make sentence-transformers cache available to airflow workers --- Makefile | 13 +++---------- config/local/config.ini | 3 ++- docker-compose.yml | 2 ++ src/verdictnet/embedding.py | 4 ++-- 4 files changed, 9 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index bf4b4b2..72b4428 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,5 @@ .PHONY: setup install start stop profile 
server etl test build clean -# Default port for the server -PORT ?= 8000 - -# Path for ETL documents -ETL_PATH ?= /path/to/docspecs -FORCE ?= true - .PHONY: install install: @echo "Installing requirements..." @@ -114,12 +107,12 @@ build: # Run the server server: @echo "Running the server on port $(PORT)..." - @semantic server --port $(PORT) + @verdictnet server --port $(PORT) # Run the ETL pipeline etl: @echo "Running the ETL pipeline with path $(ETL_PATH) and force $(FORCE)..." - @semantic etl run --path $(ETL_PATH) --force $(FORCE) + @verdictnet etl run --path $(ETL_PATH) --force $(FORCE) # Run tests test: @@ -129,4 +122,4 @@ test: # Clean the vector database clean: @echo "Cleaning the vector database..." - @semantic etl clean \ No newline at end of file + @verdictnet etl clean \ No newline at end of file diff --git a/config/local/config.ini b/config/local/config.ini index 18426dd..ac523d4 100644 --- a/config/local/config.ini +++ b/config/local/config.ini @@ -18,7 +18,8 @@ password: neo4jtest [embedding] model_name_or_path: paraphrase-mpnet-base-v2 -cache: cache/ +# use this because this is the mounting point in the docker compose +cache: /cache [rag] n_results: 5 diff --git a/docker-compose.yml b/docker-compose.yml index ae8252c..b679f6a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -205,6 +205,8 @@ services: # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation DUMB_INIT_SETSID: "0" restart: always + volumes: + - ${PWD}/cache:/cache depends_on: <<: *airflow-common-depends-on airflow-init: diff --git a/src/verdictnet/embedding.py b/src/verdictnet/embedding.py index 8af84d0..2fd3a5f 100644 --- a/src/verdictnet/embedding.py +++ b/src/verdictnet/embedding.py @@ -13,9 +13,9 @@ def __init__(self, conf: Optional[configparser.ConfigParser] = None): self.conf = conf or get_config() # Load a pre-trained model - self.model = SentenceTransformer( # Lightweight, fast model + self.model = SentenceTransformer( 
self.conf.get('embedding', 'model_name_or_path'), - cache_folder=root_path() / self.conf.get('embedding', 'cache') + cache_folder=self.conf.get('embedding', 'cache') ) def embed_nodes(self, nodes: List[Node]) -> tuple[ From 76dac84a7445992c8fe95b2b5ae3ccfd17f8aeb0 Mon Sep 17 00:00:00 2001 From: Alex Monras Date: Tue, 4 Feb 2025 23:43:09 +0100 Subject: [PATCH 27/27] comment config --- config.ini | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/config.ini b/config.ini index cd2617e..05f70ad 100644 --- a/config.ini +++ b/config.ini @@ -1,3 +1,6 @@ +# This is the config file meant for running the application in the host machine +# It uses the minio storage + [storage] type: s3 bucket: legal @@ -17,6 +20,7 @@ user: neo4j password: neo4jtest [embedding] +# Lightweight, fast model model_name_or_path: paraphrase-mpnet-base-v2 cache: cache/