From 126364a60fd6dae9afe991d8555600f3431d6f1c Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Sun, 26 Jan 2025 23:26:22 +0100
Subject: [PATCH 01/27] reorg code

---
 dags/jurisprudencia.py                        |  56 +--
 src/{ => semantic}/__init__.py                |   0
 src/{ => semantic}/cli.py                     |   0
 src/{ => semantic}/config.py                  |   0
 src/{ => semantic}/embedding.py               |   0
 src/{ => semantic}/etl.py                     |   0
 src/{ => semantic}/frontend/__init__.py       |   0
 src/{ => semantic}/frontend/custom_logger.py  |   0
 src/{ => semantic}/frontend/paths.py          |   0
 .../frontend/server/__init__.py               |   0
 src/{ => semantic}/frontend/server/app.py     |   0
 .../frontend/server/dto}/__init__.py          |   0
 .../frontend/server/dto/websocket.py          |   0
 src/{ => semantic}/frontend/server/server.py  |   0
 .../frontend/server/websocket.py              |   0
 .../frontend/static/css/document_tree.css     |   0
 .../frontend/static/css/style.css             |   0
 src/{ => semantic}/frontend/static/js/main.js |   0
 .../frontend/templates/index.html             |   0
 src/{ => semantic}/ingestion/README.md        |   0
 .../ingestion}/__init__.py                    |   0
 src/{ => semantic}/ingestion/documentspec.py  |   0
 src/{ => semantic}/ingestion/downloader.py    |   0
 .../ingestion/parsers}/__init__.py            |   0
 .../ingestion/parsers/html_parser.py          |   0
 .../ingestion/parsers/pdf_parser.py           |   0
 src/{ => semantic}/ingestion/paths.py         |   0
 .../ingestion/resources/codigo_civil.json     |   0
 .../ingestion/resources/codigo_penal.json     |   0
 src/{ => semantic}/models/node.py             |   0
 src/{ => semantic}/query.py                   |   0
 src/{ => semantic}/ragagent.py                |   0
 src/semantic/render/__init__.py               |   0
 src/{ => semantic}/render/html.py             |   0
 src/{ => semantic}/render/node_renderer.py    |   0
 src/{ => semantic}/render/plain_text.py       |   0
 src/storage/adapters.py                       | 145 --------
 src/storage/chroma_storage.py                 |  90 -----
 src/storage/graph_storage.py                  | 328 ------------------
 src/storage/hybrid_storage.py                 |  78 -----
 src/storage/transaction_manager.py            |  93 -----
 tests/conftest.py                             |   2 +-
 42 files changed, 12 insertions(+), 780 deletions(-)
 rename src/{ => semantic}/__init__.py (100%)
 rename src/{ => semantic}/cli.py (100%)
 rename src/{ => semantic}/config.py (100%)
 rename src/{ => semantic}/embedding.py (100%)
 rename src/{ => semantic}/etl.py (100%)
 rename src/{ => semantic}/frontend/__init__.py (100%)
 rename src/{ => semantic}/frontend/custom_logger.py (100%)
 rename src/{ => semantic}/frontend/paths.py (100%)
 rename src/{ => semantic}/frontend/server/__init__.py (100%)
 rename src/{ => semantic}/frontend/server/app.py (100%)
 rename src/{ingestion => semantic/frontend/server/dto}/__init__.py (100%)
 rename src/{ => semantic}/frontend/server/dto/websocket.py (100%)
 rename src/{ => semantic}/frontend/server/server.py (100%)
 rename src/{ => semantic}/frontend/server/websocket.py (100%)
 rename src/{ => semantic}/frontend/static/css/document_tree.css (100%)
 rename src/{ => semantic}/frontend/static/css/style.css (100%)
 rename src/{ => semantic}/frontend/static/js/main.js (100%)
 rename src/{ => semantic}/frontend/templates/index.html (100%)
 rename src/{ => semantic}/ingestion/README.md (100%)
 rename src/{ingestion/parsers => semantic/ingestion}/__init__.py (100%)
 rename src/{ => semantic}/ingestion/documentspec.py (100%)
 rename src/{ => semantic}/ingestion/downloader.py (100%)
 rename src/{render => semantic/ingestion/parsers}/__init__.py (100%)
 rename src/{ => semantic}/ingestion/parsers/html_parser.py (100%)
 rename src/{ => semantic}/ingestion/parsers/pdf_parser.py (100%)
 rename src/{ => semantic}/ingestion/paths.py (100%)
 rename src/{ => semantic}/ingestion/resources/codigo_civil.json (100%)
 rename src/{ => semantic}/ingestion/resources/codigo_penal.json (100%)
 rename src/{ => semantic}/models/node.py (100%)
 rename src/{ => semantic}/query.py (100%)
 rename src/{ => semantic}/ragagent.py (100%)
 create mode 100644 src/semantic/render/__init__.py
 rename src/{ => semantic}/render/html.py (100%)
 rename src/{ => semantic}/render/node_renderer.py (100%)
 rename src/{ => semantic}/render/plain_text.py (100%)
 delete mode 100644 src/storage/adapters.py
 delete mode 100644 src/storage/chroma_storage.py
 delete mode 100644 src/storage/graph_storage.py
 delete mode 100644 src/storage/hybrid_storage.py
 delete mode 100644 src/storage/transaction_manager.py

diff --git a/dags/jurisprudencia.py b/dags/jurisprudencia.py
index de06499..86d41c2 100644
--- a/dags/jurisprudencia.py
+++ b/dags/jurisprudencia.py
@@ -1,11 +1,13 @@
 from airflow import DAG
-from airflow.operators.python_operator import PythonOperator
+from airflow.operators.python import PythonOperator
 from airflow.utils.dates import days_ago
 from datetime import datetime, timedelta
 import requests
 import json
 import os
 
+from semantic.ingestion.downloader import get_item_pagination
+
 # Define the default arguments
 default_args = {
     'owner': 'airflow',
@@ -18,51 +20,15 @@
 }
 
 # Define the DAG
-dag = DAG(
+with DAG(
     'query_poderjudicial',
     default_args=default_args,
     description='Query www.poderjudicial.es and store results in JSON',
-    schedule_interval='@weekly',
+    schedule_interval='@daily',
     catchup=True,
-)
-
-# Define the Python function to query the API and save results
-def query_poderjudicial(ds, **kwargs):
-    date_from = (datetime.strptime(ds, '%Y-%m-%d') - timedelta(days=7)).strftime('%Y-%m-%d')
-    date_to = ds
-    #
-    # url = 'https://www.poderjudicial.es/search/search.action'
-    # payload = {
-    #     "action": "query",
-    #     "sort": "IN_FECHARESOLUCION:decreasing",
-    #     "recordsPerPage": "10",
-    #     "databasematch": "AN",
-    #     "start": "1",
-    #     "FECHARESOLUCIONDESDE": date_from,
-    #     "FECHARESOLUCIONHASTA": date_to,
-    #     "TIPOINTERES_ACTUAL": "Actualidad",
-    #     "TIPOORGANOPUB": "|11|12|13|14|15|16|"
-    # }
-    # headers = {
-    #     'Content-Type': 'application/json'
-    # }
-    #
-    # response = requests.post(url, json=payload, headers=headers)
-    # response.raise_for_status()
-    #
-    # results = response.json()
-    # output_path = f'/path/to/output/results_{date_from}_to_{date_to}.json'
-    #
-    # with open(output_path, 'w') as f:
-    #     json.dump(results, f)
-
-# Define the task
-query_task = PythonOperator(
-    task_id='query_poderjudicial_task',
-    provide_context=True,
-    python_callable=query_poderjudicial,
-    dag=dag,
-)
-
-# Set the task in the DAG
-query_task
+):
+    item_pagination = PythonOperator(
+        task_id='get_item_pagination',
+        provide_context=True,
+        python_callable=get_item_pagination,
+    )
diff --git a/src/__init__.py b/src/semantic/__init__.py
similarity index 100%
rename from src/__init__.py
rename to src/semantic/__init__.py
diff --git a/src/cli.py b/src/semantic/cli.py
similarity index 100%
rename from src/cli.py
rename to src/semantic/cli.py
diff --git a/src/config.py b/src/semantic/config.py
similarity index 100%
rename from src/config.py
rename to src/semantic/config.py
diff --git a/src/embedding.py b/src/semantic/embedding.py
similarity index 100%
rename from src/embedding.py
rename to src/semantic/embedding.py
diff --git a/src/etl.py b/src/semantic/etl.py
similarity index 100%
rename from src/etl.py
rename to src/semantic/etl.py
diff --git a/src/frontend/__init__.py b/src/semantic/frontend/__init__.py
similarity index 100%
rename from src/frontend/__init__.py
rename to src/semantic/frontend/__init__.py
diff --git a/src/frontend/custom_logger.py b/src/semantic/frontend/custom_logger.py
similarity index 100%
rename from src/frontend/custom_logger.py
rename to src/semantic/frontend/custom_logger.py
diff --git a/src/frontend/paths.py b/src/semantic/frontend/paths.py
similarity index 100%
rename from src/frontend/paths.py
rename to src/semantic/frontend/paths.py
diff --git a/src/frontend/server/__init__.py b/src/semantic/frontend/server/__init__.py
similarity index 100%
rename from src/frontend/server/__init__.py
rename to src/semantic/frontend/server/__init__.py
diff --git a/src/frontend/server/app.py b/src/semantic/frontend/server/app.py
similarity index 100%
rename from src/frontend/server/app.py
rename to src/semantic/frontend/server/app.py
diff --git a/src/ingestion/__init__.py b/src/semantic/frontend/server/dto/__init__.py
similarity index 100%
rename from src/ingestion/__init__.py
rename to src/semantic/frontend/server/dto/__init__.py
diff --git a/src/frontend/server/dto/websocket.py b/src/semantic/frontend/server/dto/websocket.py
similarity index 100%
rename from src/frontend/server/dto/websocket.py
rename to src/semantic/frontend/server/dto/websocket.py
diff --git a/src/frontend/server/server.py b/src/semantic/frontend/server/server.py
similarity index 100%
rename from src/frontend/server/server.py
rename to src/semantic/frontend/server/server.py
diff --git a/src/frontend/server/websocket.py b/src/semantic/frontend/server/websocket.py
similarity index 100%
rename from src/frontend/server/websocket.py
rename to src/semantic/frontend/server/websocket.py
diff --git a/src/frontend/static/css/document_tree.css b/src/semantic/frontend/static/css/document_tree.css
similarity index 100%
rename from src/frontend/static/css/document_tree.css
rename to src/semantic/frontend/static/css/document_tree.css
diff --git a/src/frontend/static/css/style.css b/src/semantic/frontend/static/css/style.css
similarity index 100%
rename from src/frontend/static/css/style.css
rename to src/semantic/frontend/static/css/style.css
diff --git a/src/frontend/static/js/main.js b/src/semantic/frontend/static/js/main.js
similarity index 100%
rename from src/frontend/static/js/main.js
rename to src/semantic/frontend/static/js/main.js
diff --git a/src/frontend/templates/index.html b/src/semantic/frontend/templates/index.html
similarity index 100%
rename from src/frontend/templates/index.html
rename to src/semantic/frontend/templates/index.html
diff --git a/src/ingestion/README.md b/src/semantic/ingestion/README.md
similarity index 100%
rename from src/ingestion/README.md
rename to src/semantic/ingestion/README.md
diff --git a/src/ingestion/parsers/__init__.py b/src/semantic/ingestion/__init__.py
similarity index 100%
rename from src/ingestion/parsers/__init__.py
rename to src/semantic/ingestion/__init__.py
diff --git a/src/ingestion/documentspec.py b/src/semantic/ingestion/documentspec.py
similarity index 100%
rename from src/ingestion/documentspec.py
rename to src/semantic/ingestion/documentspec.py
diff --git a/src/ingestion/downloader.py b/src/semantic/ingestion/downloader.py
similarity index 100%
rename from src/ingestion/downloader.py
rename to src/semantic/ingestion/downloader.py
diff --git a/src/render/__init__.py b/src/semantic/ingestion/parsers/__init__.py
similarity index 100%
rename from src/render/__init__.py
rename to src/semantic/ingestion/parsers/__init__.py
diff --git a/src/ingestion/parsers/html_parser.py b/src/semantic/ingestion/parsers/html_parser.py
similarity index 100%
rename from src/ingestion/parsers/html_parser.py
rename to src/semantic/ingestion/parsers/html_parser.py
diff --git a/src/ingestion/parsers/pdf_parser.py b/src/semantic/ingestion/parsers/pdf_parser.py
similarity index 100%
rename from src/ingestion/parsers/pdf_parser.py
rename to src/semantic/ingestion/parsers/pdf_parser.py
diff --git a/src/ingestion/paths.py b/src/semantic/ingestion/paths.py
similarity index 100%
rename from src/ingestion/paths.py
rename to src/semantic/ingestion/paths.py
diff --git a/src/ingestion/resources/codigo_civil.json b/src/semantic/ingestion/resources/codigo_civil.json
similarity index 100%
rename from src/ingestion/resources/codigo_civil.json
rename to src/semantic/ingestion/resources/codigo_civil.json
diff --git a/src/ingestion/resources/codigo_penal.json b/src/semantic/ingestion/resources/codigo_penal.json
similarity index 100%
rename from src/ingestion/resources/codigo_penal.json
rename to src/semantic/ingestion/resources/codigo_penal.json
diff --git a/src/models/node.py b/src/semantic/models/node.py
similarity index 100%
rename from src/models/node.py
rename to src/semantic/models/node.py
diff --git a/src/query.py b/src/semantic/query.py
similarity index 100%
rename from src/query.py
rename to src/semantic/query.py
diff --git a/src/ragagent.py b/src/semantic/ragagent.py
similarity index 100%
rename from src/ragagent.py
rename to src/semantic/ragagent.py
diff --git a/src/semantic/render/__init__.py b/src/semantic/render/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/render/html.py b/src/semantic/render/html.py
similarity index 100%
rename from src/render/html.py
rename to src/semantic/render/html.py
diff --git a/src/render/node_renderer.py b/src/semantic/render/node_renderer.py
similarity index 100%
rename from src/render/node_renderer.py
rename to src/semantic/render/node_renderer.py
diff --git a/src/render/plain_text.py b/src/semantic/render/plain_text.py
similarity index 100%
rename from src/render/plain_text.py
rename to src/semantic/render/plain_text.py
diff --git a/src/storage/adapters.py b/src/storage/adapters.py
deleted file mode 100644
index 3b5686b..0000000
--- a/src/storage/adapters.py
+++ /dev/null
@@ -1,145 +0,0 @@
-from typing import Optional
-
-import numpy as np
-
-from models.node import Node
-
-
-class NodeAdapter:
-    @staticmethod
-    def to_neo4j(node: Node, ordinal: Optional[int] = None):
-        out = {
-            'uuid': node.uuid,
-            'ordinal': ordinal,
-            'level': node.level,
-            'content': node.content
-        }
-
-        return out
-
-    @staticmethod
-    def to_neo4j_with_relationships(node: Node, ordinal: Optional[int] = None):
-        """
-        Recursively extract all nodes and relationships from a hierarchy.
-
-        Args:
-            node (Node): The root node of the hierarchy.
-            ordinal (Optional[int]): The ordinal value of the node in its parent's children list.
-
-        Returns:
-            - nodes: List of dictionaries representing nodes.
-            - relationships: List of tuples representing (parent_uuid, child_uuid) relationships.
-        """
-        nodes = []
-        relationships = []
-
-        # Convert the root node
-        nodes.append(NodeAdapter.to_neo4j(node, ordinal))
-
-        # Recursively process children
-        for child_ordinals, child in enumerate(node.children):
-            # Add the relationship to child
-            relationships.append((node.uuid, child.uuid))
-
-            # Add grand child's nodes and relationships
-            child_nodes, child_relationships = NodeAdapter.to_neo4j_with_relationships(child, child_ordinals)
-            nodes.extend(child_nodes)  # Append all child nodes
-            relationships.extend(child_relationships)  # Append all child relationships
-
-        return nodes, relationships
-
-    @classmethod
-    def from_neo4j(cls, record: dict) -> Node:
-        if record.get('children', []):
-            children, order = zip(*[[cls.from_neo4j(ch), ch['ordinal']] for ch in record.get('children', [])])
-
-            order = np.array([o if o is not None else np.inf for o in order])
-
-            np.argsort(order)
-
-            sorted_children = list(np.array(children)[np.argsort(order)])
-        else:
-            sorted_children = []
-        return Node(
-            uuid=record['uuid'],
-            level=record['level'],
-            content=record['content'],
-            children=sorted_children
-        )
-
-    @staticmethod
-    def build_hierarchy(root_uuid: str, nodes: dict) -> Node:
-        """
-        Build a Node hierarchy from a flat structure.
-
-        Args:
-            root_uuid (str): The UUID of the root node.
-            nodes (dict): Dictionary of all nodes keyed by UUID.
-
-        Returns:
-            Node: Root Node with children populated.
-        """
-        node_data = nodes[root_uuid]
-        root_node = Node(
-            uuid=node_data['uuid'],
-            level=node_data['level'],
-            content=node_data['content'],
-            children=[]
-        )
-
-        stack = [(root_node, root_uuid)]
-        temp_nodes = {
-            uuid: data
-            for uuid, data in nodes.items()
-        }
-
-        for uuid, data in nodes.items():
-            parent_uuid = data.get('parent_uuid')
-            if parent_uuid is None:
-                continue
-            parent_data = temp_nodes.get(parent_uuid, {})
-            if not parent_data:
-                continue
-            if 'children' not in parent_data:
-                parent_data['children'] = []
-            parent_data['children'].append(data)
-
-        return NodeAdapter.from_neo4j(nodes[root_uuid])
-
-        # node_data = nodes[root_uuid]
-        # root_node = Node(
-        #     uuid=node_data['uuid'],
-        #     level=node_data['level'],
-        #     content=node_data['content'],
-        #     children=[]
-        # )
-        #
-        # temp_nodes = {
-        #     uuid: Node(uuid=uuid, level=data['level'], content=data['content'], children=[])
-        #     for uuid, data in nodes.items()
-        # }
-        #
-        # for uuid, node in temp_nodes.items():
-        #     if node.uuid == root_uuid:
-        #         continue
-        #
-        #     # add the node to its parent
-        #     temp_nodes[node_data['parent_uuid']].children.append(node)
-        #
-        # # Find and sort child nodes by their ordinal value
-        # child_nodes = [
-        #     NodeAdapter.build_hierarchy(child_uuid, nodes)
-        #     for child_uuid, child_data in nodes.items()
-        #     if child_data.get('parent_uuid') == root_uuid
-        # ]
-        # root_node.children = sorted(child_nodes, key=lambda x: nodes[x.uuid]['ordinal'])
-        #
-        # return root_node
-
-    @staticmethod
-    def to_chromadb(node: Node):
-        return {
-            'id': node.uuid,
-            'document': node.content,
-            'metadata': {'level': node.level}
-        }
diff --git a/src/storage/chroma_storage.py b/src/storage/chroma_storage.py
deleted file mode 100644
index 159668a..0000000
--- a/src/storage/chroma_storage.py
+++ /dev/null
@@ -1,90 +0,0 @@
-import configparser
-from typing import Optional, List
-
-import chromadb
-import neo4j
-
-import config
-from embedding import Embedding
-from models.node import Node
-
-from config import logging
-
-logger = logging.getLogger(__name__)
-
-
-class ChromaStorage:
-    def __init__(
-            self,
-            embedding: Embedding,
-            chroma_client: chromadb.Client,
-            collection_name: str = 'default',
-            conf: Optional[configparser.ConfigParser] = None
-    ):
-        self.config = conf or config.get_config()
-        self.n_results = int(self.config['rag']['n_results'])
-        self.client: chromadb.Client = chroma_client
-        self.collection_name = collection_name
-        self.collection = self.client.get_or_create_collection(name=collection_name)
-        self.embedding = embedding
-
-    def delete_collection(self, collection):
-        try:
-            self.client.get_collection(collection)
-            print(f'Deleting all documents in collection {collection}...')
-            self.client.delete_collection(name=collection)
-        except chromadb.errors.InvalidCollectionException as e:
-            pass
-
-    def store_batch(self, nodes: List[Node]):
-        logger.info(f'Adding/updating documents to collection %s...', self.collection_name)
-
-        # Embed nodes
-        ids = [str(ch.uuid) for ch in nodes]
-        embeddings, documents, metadatas = self.embedding.embed_nodes(nodes)
-
-        self.collection.upsert(
-            ids=ids,
-            embeddings=embeddings,
-            documents=documents,
-            metadatas=metadatas
-        )
-
-    def query(self, q_string: str, n_results: Optional[int] = None) -> chromadb.QueryResult:
-        query_embedding = self.embedding.embed_string(q_string)
-
-        retrieved = self.collection.query(
-            query_embeddings=[query_embedding],
-            n_results=n_results or self.n_results,
-        )
-
-        return retrieved
-
-    def delete_by_ids(self, ids: List[str]):
-        """
-        Delete documents from ChromaDB by their IDs.
-        """
-        self.collection.delete(ids=ids)
-
-    @classmethod
-    def get_chroma_storage(cls, conf: Optional[configparser.ConfigParser] = None) -> 'ChromaStorage':
-        conf = conf or config.get_config()
-
-        if conf['chroma']['type'] == 'http':
-            client = chromadb.HttpClient(
-                host=conf['chroma']['host'],
-                port=int(conf['chroma']['port']),
-            )
-        elif conf['chroma']['type'] == 'local':
-            client = chromadb.PersistentClient(
-                path=str(config.root_path() / conf.get('storage', 'path')),
-            )
-        else:
-            # return in-memory client
-            print("WARNING: Using in-memory client. This is ephemeral")
-            client = chromadb.EphemeralClient()
-
-        collection = conf.get('storage', 'collection')
-        embedding = Embedding(conf=conf)
-
-        return ChromaStorage(embedding=embedding, chroma_client=client, collection_name=collection)
diff --git a/src/storage/graph_storage.py b/src/storage/graph_storage.py
deleted file mode 100644
index 74fa688..0000000
--- a/src/storage/graph_storage.py
+++ /dev/null
@@ -1,328 +0,0 @@
-import configparser
-import logging
-from textwrap import dedent
-from typing import Optional, List, Dict
-
-import numpy as np
-from neo4j import Driver, GraphDatabase
-
-import config
-from models.node import Node
-from storage.adapters import NodeAdapter
-
-logger = config.logging.getLogger(__name__)
-logger.setLevel(logging.DEBUG)
-
-
-class GraphStorage:
-    def __init__(self, graph_driver: Driver):
-        """
-        Initialize the GraphStorage with a Neo4j driver.
-        """
-        self.driver = graph_driver
-        self._ensure_constraints()
-
-    def _ensure_constraints(self):
-        with self.driver.session() as session:
-            session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (n:Node) REQUIRE n.uuid IS UNIQUE")
-
-    def create_node(self, node: Node):
-        """
-        Create a single node in the Neo4j database. It has no ordinal value.
-        """
-        node_data = NodeAdapter.to_neo4j(node)
-        with self.driver.session() as session:
-            query = """
-            MERGE (n:Node {uuid: $uuid})
-            SET n.level = $level, n.content = $content, n.ordinal = $ordinal
-            RETURN n
-            """
-            session.run(query, **node_data)
-
-    def create_relationship(self, parent_uuid: str, child_uuid: str):
-        with self.driver.session() as session:
-            query = """
-            MATCH (p:Node {uuid: $parent_uuid})
-            MATCH (c:Node {uuid: $child_uuid})
-            MERGE (p)-[:HAS_CHILD]->(c)
-            """
-            session.run(query, parent_uuid=parent_uuid, child_uuid=child_uuid)
-
-    def store(self, root_node: Node):
-        """
-        Recursively store a Node and all its children in the Neo4j database.
-        """
-        # Create the root node
-        self.create_node(root_node)
-
-        # Recursively create child nodes and relationships
-        for child in root_node.children:
-            self.store(child)  # Store the child node recursively
-            self.create_relationship(root_node.uuid, child.uuid)
-
-    def batch_store(self, node_list: List[Node], parent_uuid: Optional[str] = None, ordered = False):
-        """
-        Batch insert nodes and relationships into Neo4j.
-        If parent_uuid is provided, all the nodes will be inserted as children of the parent node.
-        If ordered is True, the nodes will be inserted in as and ordered set. If children already exist for the
-        parent_uuid, the new nodes will be appended to the end.
-        """
-        nodes = []
-        relationships = []
-
-        insert_query = dedent(
-            """
-                MERGE (n:Node {uuid: $uuid})
-                SET n.level = $level, n.content = $content, n.ordinal = $ordinal
-            """
-        )
-        for node in node_list:
-            nd, rel = NodeAdapter.to_neo4j_with_relationships(node)
-            nodes.extend(nd)
-            relationships.extend(rel)
-
-            if parent_uuid:
-                relationships.append((parent_uuid, node.uuid))
-
-        with self.driver.session() as session:
-            # Insert all nodes
-            for node in nodes:
-                result = session.run(insert_query, **node)
-                result.to_eager_result()
-
-            # Insert all relationships
-            for parent_uuid, child_uuid in relationships:
-                session.run("""
-                    MATCH (p:Node {uuid: $parent_uuid})
-                    MATCH (c:Node {uuid: $child_uuid})
-                    MERGE (p)-[:HAS_CHILD]->(c)
-                """, parent_uuid=parent_uuid, child_uuid=child_uuid)
-
-    def delete_hierarchy(self, root_uuid: str):
-        """
-        Delete a Node hierarchy from Neo4j by root UUID.
-        """
-        with self.driver.session() as session:
-            query = """
-            MATCH (n:Node {uuid: $root_uuid})-[:HAS_CHILD*0..]->(child)
-            DETACH DELETE n, child
-            """
-            session.run(query, root_uuid=root_uuid)
-
-    def delete_all(self):
-        """
-        Clear all nodes and relationships from the Neo4j database.
-        """
-        with self.driver.session() as session:
-            session.run("MATCH (n:Node) DETACH DELETE n")
-
-    def retrieve_parent(self, uuid: str, depth: int = np.inf) -> Node:
-        """
-        Retrieve the parent of a node and all its children.
-
-        Args:
-            uuid (str): The UUID of the node to retrieve the parent of.
-
-        Returns:
-            Node: The parent node.
-        """
-
-        if depth < np.inf:
-            raise NotImplementedError("Depth limit not implemented")
-
-        with self.driver.session() as session:
-            # Query to fetch all nodes and their relationships in the hierarchy
-            query = dedent("""
-            MATCH (n:Node {uuid: $uuid})<-[:HAS_CHILD]-(parent)
-            RETURN parent.uuid AS parent_uuid;
-            """)
-
-            logger.debug(f"Querying parent for {uuid}")
-            logger.debug(f"Query: \n{query}")
-
-            result = session.run(query, uuid=uuid).single()
-
-            if not result:
-                return None
-
-            parent_uuid = result['parent_uuid']
-
-            return self.retrieve_hierarchy(parent_uuid)
-
-    def retrieve_hierarchy(self, root_uuid: str, depth: int = np.inf) -> Node:
-        """
-        Retrieve a node and all its children as a hierarchy.
-
-        Args:
-            root_uuid (str): The UUID of the root node to retrieve.
-            depth (int): The maximum depth to retrieve.
-
-        Returns:
-            Node: The root node with all its children populated.
-        """
-
-        if depth < np.inf:
-            raise NotImplementedError("Depth limit not implemented yet")
-
-        with self.driver.session() as session:
-            # Query to fetch all nodes and their relationships in the hierarchy
-            query = dedent("""
-            MATCH (n:Node {uuid: $root_uuid})-[:HAS_CHILD*0..]->(child)
-            OPTIONAL MATCH (child)<-[:HAS_CHILD]-(parent)
-            RETURN n.uuid AS root_uuid, 
-                   n.level AS root_level, 
-                   n.content AS root_content, 
-                   collect({parent: parent.uuid, self: child.uuid, contents: child.content, level: child.level, order:child.ordinal}) AS relationships;
-            """)
-            logger.debug(f"Querying hierarchy for {root_uuid}")
-            logger.debug(f"Query: \n{query}")
-
-            result = session.run(query, root_uuid=root_uuid).single()
-
-            if not result:
-                return None
-
-            # Parse the result to reconstruct the hierarchy
-            nodes = {}
-            root_node_data = {
-                'uuid': result['root_uuid'],
-                'level': result['root_level'],
-                'content': result['root_content'],
-                'parent_uuid': None,
-            }
-            nodes[root_uuid] = root_node_data
-
-            # Process children
-            for relationship in result['relationships']:
-                nodes[relationship['self']] = {
-                    'uuid': relationship['self'],
-                    'level': relationship['level'],
-                    'content': relationship['contents'],
-                    'ordinal': relationship['order'],
-                    'parent_uuid': relationship['parent'],
-                }
-
-            # Build the hierarchy
-            return NodeAdapter.build_hierarchy(root_uuid, nodes)
-
-    # def retrieve_hierarchies(self, root_uuids: List[str]) -> Dict[str, Node]:
-    #     """
-    #     Retrieve multiple node hierarchies from Neo4j by root UUIDs.
-    #
-    #     Args:
-    #         root_uuids (List[str]): The UUIDs of the root nodes to retrieve.
-    #
-    #     Returns:
-    #         Dict[str, Node]: A dictionary of root nodes with all their children populated.
-    #     """
-    #     with self.driver.session() as session:
-
-    # TODO: There seems to be a bug with this query. Fix it.
-
-    #         query = """
-    #         UNWIND $root_uuids AS root_uuid
-    #         MATCH (n:Node {uuid: root_uuid})-[:HAS_CHILD*0..]->(child)
-    #         OPTIONAL MATCH (child)<-[:HAS_CHILD]-(parent)
-    #         WHERE parent.uuid <> child.uuid
-    #         RETURN root_uuid,
-    #                n.uuid AS node_uuid,
-    #                n.level AS node_level,
-    #                n.content AS node_content,
-    #                collect(child.uuid) AS child_uuids,
-    #                collect(child.level) AS child_levels,
-    #                collect(child.content) AS child_contents,
-    #                collect(child.ordinal) AS child_order,
-    #                collect(parent.uuid) AS parent_uuids
-    #         """
-    #         result = session.run(query, root_uuids=root_uuids)
-    #
-    #         nodes = {}
-    #         for record in result:
-    #             root_uuid = record['root_uuid']
-    #             if root_uuid not in nodes:
-    #                 nodes[root_uuid] = {}
-    #
-    #             node_data = {
-    #                 'uuid': record['node_uuid'],
-    #                 'level': record['node_level'],
-    #                 'content': record['node_content'],
-    #                 'parent_uuid': None,
-    #             }
-    #             nodes[root_uuid][record['node_uuid']] = node_data
-    #
-    #             child_uuids = record['child_uuids']
-    #             child_levels = record['child_levels']
-    #             child_contents = record['child_contents']
-    #             child_order = record['child_order']
-    #             parent_uuids = record['parent_uuids']
-    #
-    #             assert len(child_uuids) == len(child_levels)
-    #             assert len(child_uuids) == len(child_contents)
-    #             assert len(child_uuids) == len(child_order)
-    #             # assert len(child_uuids) == len(parent_uuids)
-    #
-    #             for child_uuid, level, content, ordinal, parent_uuid in zip(
-    #                     child_uuids, child_levels, child_contents, child_order, parent_uuids
-    #             ):
-    #                 nodes[root_uuid][child_uuid] = {
-    #                     'uuid': child_uuid,
-    #                     'level': level,
-    #                     'content': content,
-    #                     'ordinal': ordinal,
-    #                     'parent_uuid': parent_uuid,
-    #                 }
-    #
-    #         hierarchies = {}
-    #         for root_uuid in root_uuids:
-    #             if root_uuid in nodes:
-    #                 hierarchies[root_uuid] = NodeAdapter.build_hierarchy(root_uuid, nodes[root_uuid])
-    #
-    #         return hierarchies
-
-    def retrieve_by(self, level: Optional[str] = None, content: Optional[str] = None) -> List[Node]:
-        """
-        Retrieve nodes by their level.
-
-        Args:
-            level (str): The level of the nodes to retrieve.
-            content (str): The content of the nodes to retrieve
-
-        Returns:
-            List[Node]: A list of nodes at the specified level.
-        """
-        with self.driver.session() as session:
-            clauses = []
-            kwargs = {}
-            if level is not None:
-                clauses.append("level: $level")
-                kwargs['level'] = level
-            if content is not None:
-                clauses.append("content: $content")
-                kwargs['content'] = content
-
-            query = dedent(f"""
-                MATCH 
-                    (n:Node {{ {','.join(clauses)} }}) 
-                RETURN 
-                    n.uuid AS uuid, 
-                    n.level AS level, 
-                    n.content AS content
-            """)
-
-            result = session.run(query, **kwargs)
-
-            uuids = [record['uuid'] for record in result]
-
-            nodes = [self.retrieve_hierarchy(uuid) for uuid in uuids]
-
-            return nodes
-
-    @classmethod
-    def get_graph_storage(cls, conf: Optional[configparser.ConfigParser] = None) -> 'GraphStorage':
-        conf = conf or config.get_config()
-
-        neo4j_driver = GraphDatabase.driver(conf['neo4j']['url'],
-                                            auth=(conf['neo4j']['user'], conf['neo4j']['password']))
-        graph_storage = GraphStorage(neo4j_driver)
-
-        return graph_storage
diff --git a/src/storage/hybrid_storage.py b/src/storage/hybrid_storage.py
deleted file mode 100644
index 0e26417..0000000
--- a/src/storage/hybrid_storage.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import configparser
-from typing import Optional, List
-
-import config
-from models.node import Node
-from storage.chroma_storage import ChromaStorage
-from storage.graph_storage import GraphStorage
-
-
-class HybridStorage:
-    def __init__(self, chroma_storage: ChromaStorage, graph_storage: GraphStorage):
-        """
-        Initialize HybridStorage with ChromaStorage and GraphStorage instances.
-        """
-        self.chroma_storage: ChromaStorage = chroma_storage
-        self.graph_storage: GraphStorage = graph_storage
-
-    def store(self, root_node: Node):
-        """
-        Store a Node hierarchy in both ChromaDB and Neo4j.
-        """
-        # Store in Neo4j
-        self.graph_storage.store(root_node)
-
-        # Flatten the hierarchy for ChromaDB
-        all_nodes = self.flatten_hierarchy(root_node)
-
-        # Store in ChromaDB
-        self.chroma_storage.store_batch(all_nodes)
-
-    def flatten_hierarchy(self, root_node: Node) -> List[Node]:
-        """
-        Flatten a Node hierarchy into a list of all nodes.
-        """
-        flat_list = [root_node]
-        for child in root_node.children:
-            flat_list.extend(self.flatten_hierarchy(child))
-        return flat_list
-
-    def query(self, query_string: str, n_results: Optional[int] = None):
-        """
-        Perform a semantic search in ChromaDB and return the results.
-        """
-        return self.chroma_storage.query(query_string, n_results)
-
-    def delete_all(self):
-        """
-        Delete all data from both ChromaDB and Neo4j.
-        """
-        self.chroma_storage.delete_collection(self.chroma_storage.collection_name)
-        self.graph_storage.delete_all()
-
-    def retrieve_parent(self, uuid: str) -> Node:
-        """
-        Retrieve the parent of a node.
-        """
-        return self.graph_storage.retrieve_parent(uuid)
-
-    def retrieve_hierarchy(self, root_uuid: str) -> Node:
-        """
-        Retrieve a node and all its children as a hierarchy.
-
-        Args:
-            root_uuid (str): The UUID of the root node to retrieve.
-
-        Returns:
-            Node: The root node with all its children populated.
-        """
-        return self.graph_storage.retrieve_hierarchy(root_uuid)
-
-    @classmethod
-    def get_hybrid_storage(cls, conf: Optional[configparser.ConfigParser] = None) -> 'HybridStorage':
-        conf = conf or config.get_config()
-
-        chroma_storage = ChromaStorage.get_chroma_storage(conf)
-        graph_storage = GraphStorage.get_graph_storage(conf)
-
-        return cls(chroma_storage, graph_storage)
\ No newline at end of file
diff --git a/src/storage/transaction_manager.py b/src/storage/transaction_manager.py
deleted file mode 100644
index 3fdd7b3..0000000
--- a/src/storage/transaction_manager.py
+++ /dev/null
@@ -1,93 +0,0 @@
-import configparser
-from typing import Optional, List
-
-from models.node import Node
-from storage.hybrid_storage import HybridStorage
-from config import logging, get_config
-
-logger = logging.getLogger(__name__)
-
-# TODO: implement unit tests for this class
-
-
-class TransactionManager:
-    def __init__(self, hybrid_storage: HybridStorage):
-        """
-        Initialize TransactionManager with a HybridStorage instance.
-        """
-        self.hybrid_storage = hybrid_storage
-
-    def init_dataset(self, name):
-        """
-        Check if a parent node for the Dataset already exists in the database. If it doesn't, create it. In any case, return
-        the uuid so that all further documents of this dataset will be linked to it.
-        """
-
-        # Check if the dataset already exists
-        query = {
-            "level": "dataset",
-            "content": name
-        }
-        dataset_node = self.hybrid_storage.graph_storage.retrieve_by(level='dataset', content=name)
-
-        if dataset_node:
-            logger.info("Dataset already exists in the database")
-            return dataset_node[0].uuid
-
-        # If it doesn't, create it
-        dataset_node = Node(level="dataset", content=name)
-        self.store_with_transaction(dataset_node)
-        return dataset_node.uuid
-
-    def store_with_transaction(self, root_nodes: Node | List[Node], parent_uuid: Optional[str] = None):
-        """
-        Store a Node hierarchy in both ChromaDB and Neo4j, ensuring consistency.
-        """
-        if not isinstance(root_nodes, list):
-            root_nodes = [root_nodes]
-
-        try:
-            # Store in Neo4j
-            logger.info("Storing in Neo4j...")
-            self.hybrid_storage.graph_storage.batch_store(root_nodes, parent_uuid)
-
-            # Store in ChromaDB
-            logger.info("Storing in ChromaDB...")
-            for node in root_nodes:
-                # Flatten the hierarchy for ChromaDB
-                all_nodes = self.hybrid_storage.flatten_hierarchy(node)
-
-                # Store in ChromaDB
-                self.hybrid_storage.chroma_storage.store_batch(all_nodes)
-
-        except Exception as e:
-            # Rollback strategy: remove nodes from both systems if one fails
-            for node in root_nodes:
-                self.rollback(node)
-            raise RuntimeError(f"Transaction failed: {e}")
-
-
-    def rollback(self, root_node: Node):
-        """
-        Rollback changes made during a failed transaction.
-        """
-        try:
-            # Delete from Neo4j
-            self.hybrid_storage.graph_storage.delete_hierarchy(root_node.uuid)
-
-            # Delete from ChromaDB
-            all_node_uuids = [node.uuid for node in self.hybrid_storage.flatten_hierarchy(root_node)]
-            self.hybrid_storage.chroma_storage.delete_by_ids(all_node_uuids)
-
-        except Exception as rollback_error:
-            # Log rollback failure
-            logger.error(f"Rollback failed: {rollback_error}")
-
-    @classmethod
-    def get_transaction_manager(cls, conf: Optional[configparser.ConfigParser] = None) -> 'TransactionManager':
-        conf = conf or get_config()
-
-        # Initialize HybridStorage and TransactionManager
-        hybrid_storage = HybridStorage.get_hybrid_storage(conf)
-        transaction_manager = TransactionManager(hybrid_storage)
-        return transaction_manager
diff --git a/tests/conftest.py b/tests/conftest.py
index 7b527d6..5d072b9 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -12,7 +12,7 @@
 
 @fixture
 def static_files():
-    return root_path() / "src/frontend/static/css"
+    return root_path() / "semantic/frontend/static/css"
 
 
 @fixture

From 620eb06d44e60b3f9310ea128d1eca1dfccab7c3 Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Mon, 27 Jan 2025 00:06:03 +0100
Subject: [PATCH 02/27] fix imports

---
 .gitignore                                    |   2 +-
 Dockerfile                                    |  14 +
 Makefile                                      |   7 +-
 README.md                                     |  16 +-
 dags/jurisprudencia.py                        |   8 +-
 docker-compose.yml                            |  23 +-
 requirements.txt                              | 393 ++++++++++++++++--
 setup.cfg => setup.cfg.bak                    |   5 +-
 setup.py                                      |   7 +-
 src/{semantic => verdictnet}/__init__.py      |   0
 src/{semantic => verdictnet}/cli.py           |   0
 src/{semantic => verdictnet}/config.py        |   6 +-
 src/{semantic => verdictnet}/embedding.py     |   4 +-
 src/{semantic => verdictnet}/etl.py           |   0
 .../frontend/__init__.py                      |   0
 .../frontend/custom_logger.py                 |   0
 .../frontend/paths.py                         |   0
 .../frontend/server/__init__.py               |   0
 .../frontend/server/app.py                    |   0
 .../frontend/server/dto/__init__.py           |   0
 .../frontend/server/dto/websocket.py          |   0
 .../frontend/server/server.py                 |   0
 .../frontend/server/websocket.py              |   0
 .../frontend/static/css/document_tree.css     |   0
 .../frontend/static/css/style.css             |   0
 .../frontend/static/js/main.js                |   0
 .../frontend/templates/index.html             |   0
 .../ingestion/README.md                       |   2 +-
 .../ingestion/__init__.py                     |   0
 .../ingestion/documentspec.py                 |   0
 .../ingestion/downloader.py                   |  10 +-
 .../ingestion/parsers/__init__.py             |   0
 .../ingestion/parsers/html_parser.py          |   0
 .../ingestion/parsers/pdf_parser.py           |   2 +-
 .../ingestion/paths.py                        |   2 +-
 .../ingestion/resources/codigo_civil.json     |   0
 .../ingestion/resources/codigo_penal.json     |   0
 .../render => verdictnet/models}/__init__.py  |   0
 src/{semantic => verdictnet}/models/node.py   |   2 +-
 src/{semantic => verdictnet}/query.py         |   0
 src/{semantic => verdictnet}/ragagent.py      |   0
 src/verdictnet/render/__init__.py             |   0
 src/{semantic => verdictnet}/render/html.py   |   0
 .../render/node_renderer.py                   |   0
 .../render/plain_text.py                      |   0
 src/verdictnet/storage/__init__.py            |   0
 src/verdictnet/storage/adapters.py            | 145 +++++++
 src/verdictnet/storage/chroma_storage.py      |  90 ++++
 src/verdictnet/storage/graph_storage.py       | 328 +++++++++++++++
 src/verdictnet/storage/hybrid_storage.py      |  78 ++++
 src/verdictnet/storage/transaction_manager.py |  93 +++++
 tests/conftest.py                             |   2 +-
 52 files changed, 1172 insertions(+), 67 deletions(-)
 create mode 100644 Dockerfile
 rename setup.cfg => setup.cfg.bak (92%)
 rename src/{semantic => verdictnet}/__init__.py (100%)
 rename src/{semantic => verdictnet}/cli.py (100%)
 rename src/{semantic => verdictnet}/config.py (86%)
 rename src/{semantic => verdictnet}/embedding.py (93%)
 rename src/{semantic => verdictnet}/etl.py (100%)
 rename src/{semantic => verdictnet}/frontend/__init__.py (100%)
 rename src/{semantic => verdictnet}/frontend/custom_logger.py (100%)
 rename src/{semantic => verdictnet}/frontend/paths.py (100%)
 rename src/{semantic => verdictnet}/frontend/server/__init__.py (100%)
 rename src/{semantic => verdictnet}/frontend/server/app.py (100%)
 rename src/{semantic => verdictnet}/frontend/server/dto/__init__.py (100%)
 rename src/{semantic => verdictnet}/frontend/server/dto/websocket.py (100%)
 rename src/{semantic => verdictnet}/frontend/server/server.py (100%)
 rename src/{semantic => verdictnet}/frontend/server/websocket.py (100%)
 rename src/{semantic => verdictnet}/frontend/static/css/document_tree.css (100%)
 rename src/{semantic => verdictnet}/frontend/static/css/style.css (100%)
 rename src/{semantic => verdictnet}/frontend/static/js/main.js (100%)
 rename src/{semantic => verdictnet}/frontend/templates/index.html (100%)
 rename src/{semantic => verdictnet}/ingestion/README.md (98%)
 rename src/{semantic => verdictnet}/ingestion/__init__.py (100%)
 rename src/{semantic => verdictnet}/ingestion/documentspec.py (100%)
 rename src/{semantic => verdictnet}/ingestion/downloader.py (97%)
 rename src/{semantic => verdictnet}/ingestion/parsers/__init__.py (100%)
 rename src/{semantic => verdictnet}/ingestion/parsers/html_parser.py (100%)
 rename src/{semantic => verdictnet}/ingestion/parsers/pdf_parser.py (96%)
 rename src/{semantic => verdictnet}/ingestion/paths.py (95%)
 rename src/{semantic => verdictnet}/ingestion/resources/codigo_civil.json (100%)
 rename src/{semantic => verdictnet}/ingestion/resources/codigo_penal.json (100%)
 rename src/{semantic/render => verdictnet/models}/__init__.py (100%)
 rename src/{semantic => verdictnet}/models/node.py (98%)
 rename src/{semantic => verdictnet}/query.py (100%)
 rename src/{semantic => verdictnet}/ragagent.py (100%)
 create mode 100644 src/verdictnet/render/__init__.py
 rename src/{semantic => verdictnet}/render/html.py (100%)
 rename src/{semantic => verdictnet}/render/node_renderer.py (100%)
 rename src/{semantic => verdictnet}/render/plain_text.py (100%)
 create mode 100644 src/verdictnet/storage/__init__.py
 create mode 100644 src/verdictnet/storage/adapters.py
 create mode 100644 src/verdictnet/storage/chroma_storage.py
 create mode 100644 src/verdictnet/storage/graph_storage.py
 create mode 100644 src/verdictnet/storage/hybrid_storage.py
 create mode 100644 src/verdictnet/storage/transaction_manager.py

diff --git a/.gitignore b/.gitignore
index fee070a..f4ce3e0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
 .ipynb_checkpoints/
 */*.egg-info/*
 .idea/
+build/
 
 __pycache__/
 docs/
@@ -11,5 +12,4 @@ data/
 datalake/
 neo4j_data/
 postgress_storage/
-storage/
 
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..50bf3a6
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,14 @@
+FROM apache/airflow:2.10.0
+
+USER root
+COPY requirements.txt .
+COPY src/ src/
+COPY setup.py .
+COPY config.ini .
+RUN chown -R airflow src/
+RUN apt-get update && apt-get install -y build-essential
+
+# Switch to airflow user to run the application
+USER airflow
+RUN pip install -r requirements.txt
+RUN pip install .
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 417c95d..1568fd3 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: server etl test
+.PHONY: server etl test build clean
 
 # Default port for the server
 PORT ?= 8000
@@ -7,6 +7,11 @@ PORT ?= 8000
 ETL_PATH ?= /path/to/docspecs
 FORCE ?= true
 
+# Docker build
+.PHONY: build
+build:
+	@docker build -t semantic_airflow .
+
 # Run the server
 server:
 	@echo "Running the server on port $(PORT)..."
diff --git a/README.md b/README.md
index 4459095..50af8f8 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ This will launch the following services:
 
 Finally, run
 ```sh
-  $ semantic server
+  $ verdictnet server
 ````
 to launch the frontend interface, accessible through
 - [Frontend: `http://localhost:8000`](http://localhost:8000)
@@ -33,7 +33,7 @@ Run data pipelines to ingest and process documents.
 
 #### Usage:
 ```sh
-$ semantic etl [--path PATH] [--force FORCE] {clean,run}
+$ verdictnet etl [--path PATH] [--force FORCE] {clean,run}
 
 --path PATH: Path where to look for document specs.
 --force FORCE: Force download of documents.
@@ -46,13 +46,13 @@ run: Ingest data into the vector database.
 Query the data stored in the system.
 Usage:
 ```sh
-    $ semantic query [--query QUERY] [--n_results N_RESULTS] [--interactive]
+    $ verdictnet query [--query QUERY] [--n_results N_RESULTS] [--interactive]
 ```
 ### Server
 Run the server to provide a frontend interface.
 Usage:
 ```sh
-    $ semantic server [-p PORT]
+    $ verdictnet server [-p PORT]
     
     -p, --port PORT: Port to run the frontend on (default: 8000).
 ```
@@ -60,16 +60,16 @@ Usage:
 ## Example usage
 ```sh
 # Clean the vector database
-semantic etl clean
+verdictnet etl clean
 
 # Run the ETL pipeline
-semantic etl run --path /path/to/docspecs --force true
+verdictnet etl run --path /path/to/docspecs --force true
 
 # Query the data
-semantic query --query "example query" --n_results 5
+verdictnet query --query "example query" --n_results 5
 
 # Run the server
-semantic server --port 8080
+verdictnet server --port 8080
 ```
 
 ## Configuration
diff --git a/dags/jurisprudencia.py b/dags/jurisprudencia.py
index 86d41c2..89c180a 100644
--- a/dags/jurisprudencia.py
+++ b/dags/jurisprudencia.py
@@ -1,3 +1,4 @@
+import pendulum
 from airflow import DAG
 from airflow.operators.python import PythonOperator
 from airflow.utils.dates import days_ago
@@ -6,13 +7,13 @@
 import json
 import os
 
-from semantic.ingestion.downloader import get_item_pagination
+from verdictnet.ingestion.downloader import get_item_pagination
 
 # Define the default arguments
 default_args = {
     'owner': 'airflow',
     'depends_on_past': False,
-    'start_date': days_ago(8 * 7),  # Start date 8 weeks ago
+    'start_date': pendulum.today('UTC').add(days=-8 * 7),  # Start date 8 weeks ago
     'email_on_failure': False,
     'email_on_retry': False,
     'retries': 1,
@@ -24,11 +25,10 @@
     'query_poderjudicial',
     default_args=default_args,
     description='Query www.poderjudicial.es and store results in JSON',
-    schedule_interval='@daily',
+    schedule='@daily',
     catchup=True,
 ):
     item_pagination = PythonOperator(
         task_id='get_item_pagination',
-        provide_context=True,
         python_callable=get_item_pagination,
     )
diff --git a/docker-compose.yml b/docker-compose.yml
index 4f7dffe..b886363 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,5 +1,3 @@
-version: '3.8'
-
 services:
   chromadb:
     image: chromadb/chroma:latest
@@ -29,7 +27,7 @@ services:
       - ${PWD}/postgress_storage:/var/lib/postgresql/data
 
   airflow-webserver:
-    image: apache/airflow:2.6.1
+    build: .
     container_name: airflow-webserver
     environment:
       - AIRFLOW__CORE__EXECUTOR=LocalExecutor
@@ -47,6 +45,25 @@ services:
     volumes:
       - ${PWD}/dags:/opt/airflow/dags  # Mount the dags folder
 
+
+  airflow-scheduler:
+    build: .
+    container_name: airflow-scheduler
+    environment:
+      - AIRFLOW__CORE__EXECUTOR=LocalExecutor
+      - AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow
+      - AIRFLOW__CORE__FERNET_KEY=${FERNET_KEY}
+    depends_on:
+      - postgres
+      - airflow-webserver
+    command: >
+      bash -c "airflow scheduler"
+    env_file:
+      - .env
+    volumes:
+      - ${PWD}/dags:/opt/airflow/dags  # Mount the dags folder
+
+
   minio:
     image: minio/minio:latest
     container_name: minio
diff --git a/requirements.txt b/requirements.txt
index d11ad23..59e57ad 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,29 +1,364 @@
-pathlib~=1.0.1
-fastapi~=0.115.5
-starlette~=0.40.0
-Werkzeug~=3.0.3
-requests~=2.32.0
-beautifulsoup4~=4.12.3
-tika~=2.6.0
-tqdm~=4.66.4
-setuptools~=70.0.0
-pytest~=8.2.0
-websocket-client~=1.8.0
-uvicorn~=0.29.0
-websockets~=12.0
-playwright~=1.43.0
-PyPDF2~=3.0.1
-openparse~=0.7.0
-chromadb~=0.5.20
-numpy~=2.1.3
-torch~=2.5.1
-slugify~=0.0.1
-configparser~=7.1.0
-python-slugify~=8.0.4
-fsspec~=2024.10.0
-semantic_pdf~=0.0.1
-pdfplumber~=0.11.4
-pandas~=2.2.3
-neomodel~=5.4.1
-neo4j~=5.26.0
-pytest-mock~=3.14.0
+acres==0.1.0
+aiobotocore==2.15.2
+aiofiles==23.2.1
+aiohappyeyeballs==2.4.4
+aiohttp==3.11.10
+aioitertools==0.12.0
+aiosignal==1.3.1
+aiosqlite==0.20.0
+alembic==1.14.1
+amqp==5.3.1
+annotated-types==0.7.0
+anyio==4.6.2.post1
+apache-airflow==2.10.4
+apache-airflow-providers-celery==3.8.5
+apache-airflow-providers-common-compat==1.3.0
+apache-airflow-providers-common-io==1.5.0
+apache-airflow-providers-common-sql==1.21.0
+apache-airflow-providers-fab==1.5.2
+apache-airflow-providers-ftp==3.12.0
+apache-airflow-providers-google==12.0.0
+apache-airflow-providers-http==5.0.0
+apache-airflow-providers-imap==3.8.0
+apache-airflow-providers-smtp==1.9.0
+apache-airflow-providers-sqlite==4.0.0
+apispec==6.8.1
+argcomplete==3.5.3
+asgiref==3.8.1
+attrs==24.2.0
+babel==2.16.0
+backoff==2.2.1
+bcrypt==4.2.1
+beautifulsoup4==4.12.3
+billiard==4.2.1
+blinker==1.9.0
+botocore==1.35.36
+build==1.2.2.post1
+cachelib==0.9.0
+cachetools==5.5.0
+captcha-solver==0.1.5
+cattrs==24.1.2
+celery==5.4.0
+certifi==2024.8.30
+cffi==1.17.1
+chardet==5.2.0
+charset-normalizer==3.4.0
+chroma-hnswlib==0.7.6
+chromadb==0.5.20
+ci-info==0.3.0
+click==8.1.7
+click-didyoumean==0.3.1
+click-plugins==1.1.1
+click-repl==0.3.0
+clickclick==20.10.2
+colorama==0.4.6
+coloredlogs==15.0.1
+colorlog==6.9.0
+configobj==5.0.9
+configparser==7.1.0
+ConfigUpdater==3.2
+connexion==2.14.2
+contourpy==1.3.1
+cron-descriptor==1.4.5
+croniter==6.0.0
+cryptography==44.0.0
+cycler==0.12.1
+dataclasses-json==0.6.7
+db-dtypes==1.4.0
+decorator==5.1.1
+Deprecated==1.2.15
+dill==0.3.9
+dirtyjson==1.0.8
+distro==1.9.0
+dnspython==2.7.0
+docstring_parser==0.16
+durationpy==0.9
+email_validator==2.2.0
+etelemetry==0.3.1
+fastapi==0.115.5
+fastapi-cli==0.0.5
+filelock==3.16.1
+filetype==1.2.0
+fitz==0.0.1.dev2
+Flask==2.2.5
+Flask-AppBuilder==4.5.2
+Flask-Babel==2.0.0
+Flask-Caching==2.3.0
+Flask-JWT-Extended==4.7.1
+Flask-Limiter==3.10.1
+Flask-Login==0.6.3
+Flask-Session==0.5.0
+Flask-SQLAlchemy==2.5.1
+Flask-WTF==1.2.2
+flatbuffers==24.3.25
+flower==2.0.1
+fonttools==4.55.0
+frozenlist==1.5.0
+fsspec
+gcloud-aio-auth==5.3.2
+gcloud-aio-bigquery==7.1.0
+gcloud-aio-storage==9.3.0
+gcsfs
+google-ads==25.1.0
+google-analytics-admin==0.23.3
+google-api-core==2.24.0
+google-api-python-client==2.159.0
+google-auth==2.36.0
+google-auth-httplib2==0.2.0
+google-auth-oauthlib==1.2.1
+google-cloud-aiplatform==1.78.0
+google-cloud-alloydb==0.4.1
+google-cloud-appengine-logging==1.5.0
+google-cloud-audit-log==0.3.0
+google-cloud-automl==2.15.0
+google-cloud-batch==0.17.33
+google-cloud-bigquery==3.29.0
+google-cloud-bigquery-datatransfer==3.18.0
+google-cloud-bigtable==2.28.1
+google-cloud-build==3.29.0
+google-cloud-compute==1.23.0
+google-cloud-container==2.55.0
+google-cloud-core==2.4.1
+google-cloud-datacatalog==3.24.1
+google-cloud-dataflow-client==0.8.15
+google-cloud-dataform==0.5.14
+google-cloud-dataplex==2.6.0
+google-cloud-dataproc==5.16.0
+google-cloud-dataproc-metastore==1.17.0
+google-cloud-dlp==3.26.0
+google-cloud-kms==3.2.2
+google-cloud-language==2.16.0
+google-cloud-logging==3.11.3
+google-cloud-memcache==1.11.0
+google-cloud-monitoring==2.26.0
+google-cloud-orchestration-airflow==1.16.1
+google-cloud-os-login==2.16.0
+google-cloud-pubsub==2.27.3
+google-cloud-redis==2.17.0
+google-cloud-resource-manager==1.14.0
+google-cloud-run==0.10.14
+google-cloud-secret-manager==2.22.1
+google-cloud-spanner==3.51.0
+google-cloud-speech==2.30.0
+google-cloud-storage==2.19.0
+google-cloud-storage-transfer==1.15.0
+google-cloud-tasks==2.18.0
+google-cloud-texttospeech==2.24.0
+google-cloud-translate==3.19.0
+google-cloud-videointelligence==2.15.0
+google-cloud-vision==3.9.0
+google-cloud-workflows==1.16.0
+google-crc32c==1.6.0
+google-re2==1.1.20240702
+google-resumable-media==2.7.2
+googleapis-common-protos==1.66.0
+greenlet==3.0.3
+grpc-google-iam-v1==0.14.0
+grpc-interceptor==0.15.4
+grpcio==1.70.0
+grpcio-gcp==0.2.2
+grpcio-status==1.70.0
+gunicorn==23.0.0
+h11==0.14.0
+httpcore==1.0.7
+httplib2==0.22.0
+httptools==0.6.4
+httpx==0.27.2
+huggingface-hub==0.26.2
+humanfriendly==10.0
+humanize==4.11.0
+idna==3.10
+immutabledict==4.2.1
+importlib_metadata==8.5.0
+importlib_resources==6.4.5
+inflection==0.5.1
+iniconfig==2.0.0
+isodate==0.6.1
+itsdangerous==2.2.0
+Jinja2==3.1.4
+jiter==0.8.0
+jmespath==1.0.1
+joblib==1.4.2
+json-merge-patch==0.2
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+kiwisolver==1.4.7
+kombu==5.4.2
+kubernetes==31.0.0
+lazy-object-proxy==1.10.0
+limits==4.0.1
+linkify-it-py==2.0.3
+llama-cloud==0.1.6
+llama-index==0.12.5
+llama-index-agent-openai==0.4.0
+llama-index-cli==0.4.0
+llama-index-core==0.12.5
+llama-index-embeddings-openai==0.3.1
+llama-index-indices-managed-llama-cloud==0.6.3
+llama-index-legacy==0.9.48.post4
+llama-index-llms-openai==0.3.10
+llama-index-multi-modal-llms-openai==0.4.0
+llama-index-program-openai==0.3.1
+llama-index-question-gen-openai==0.3.0
+llama-index-readers-file==0.4.1
+llama-index-readers-llama-parse==0.4.0
+llama-parse==0.5.17
+lockfile==0.12.2
+looker-sdk==25.0.0
+looseversion==1.3.0
+lxml==5.3.0
+Mako==1.3.8
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+marshmallow==3.23.1
+marshmallow-oneofschema==3.1.1
+marshmallow-sqlalchemy==0.28.2
+matplotlib==3.9.2
+mdit-py-plugins==0.4.2
+mdurl==0.1.2
+methodtools==0.4.7
+mmh3==5.0.1
+monotonic==1.6
+more-itertools==10.6.0
+mpmath==1.3.0
+multidict==6.1.0
+mypy-extensions==1.0.0
+neo4j==5.26.0
+neomodel==5.4.1
+nest-asyncio==1.6.0
+networkx==3.4.2
+nibabel==5.3.2
+nipype==1.9.1
+nltk==3.9.1
+numpy==1.26.4
+oauthlib==3.2.2
+onnxruntime==1.20.1
+openai==1.57.2
+openparse==0.7.0
+ordered-set==4.1.0
+orjson==3.10.12
+overrides==7.7.0
+packaging==24.2
+pandas==2.1.4
+pandas-gbq==0.26.1
+pathlib==1.0.1
+pathspec==0.12.1
+pdf2image==1.17.0
+pdfminer.six==20231228
+pdfplumber==0.11.4
+pendulum==3.0.0
+pillow==11.0.0
+playwright==1.43.0
+pluggy==1.5.0
+posthog==3.7.3
+prison==0.2.1
+prometheus_client==0.21.1
+prompt_toolkit==3.0.50
+propcache==0.2.1
+proto-plus==1.25.0
+protobuf==5.29.0
+prov==2.0.1
+psutil==6.1.1
+puremagic==1.28
+pyarrow==18.1.0
+pyasn1==0.6.1
+pyasn1_modules==0.4.0
+pycparser==2.22
+pydantic==2.10.2
+pydantic_core==2.27.1
+pydata-google-auth==1.9.1
+pydot==3.0.2
+pyee==11.1.0
+Pygments==2.18.0
+PyJWT==2.10.1
+PyMuPDF==1.24.14
+pyOpenSSL==25.0.0
+pyparsing==3.2.0
+pypdf==5.1.0
+PyPDF2==3.0.1
+pypdfium2==4.30.0
+PyPika==0.48.9
+pyproject_hooks==1.2.0
+pytesseract==0.3.13
+pytest==8.2.2
+pytest-mock==3.14.0
+python-daemon==3.1.2
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-multipart==0.0.17
+python-nvd3==0.16.0
+python-slugify==8.0.4
+pytz==2024.2
+pyxnat==1.6.2
+PyYAML==6.0.2
+rdflib==6.3.2
+redis==5.2.1
+referencing==0.36.2
+regex==2024.11.6
+reportlab==4.2.5
+requests==2.32.3
+requests-oauthlib==2.0.0
+requests-toolbelt==1.0.0
+rfc3339-validator==0.1.4
+rich==13.9.4
+rich-argparse==1.6.0
+rpds-py==0.22.3
+rsa==4.9
+s3fs==2024.10.0
+safetensors==0.4.5
+scikit-learn==1.5.2
+scipy==1.14.1
+sentence-transformers==3.3.1
+setproctitle==1.3.4
+setuptools==70.0.0
+shapely==2.0.6
+shellingham==1.5.4
+simplejson==3.19.3
+six==1.16.0
+slugify==0.0.1
+sniffio==1.3.1
+soupsieve==2.6
+SQLAlchemy==1.4.54
+sqlalchemy-bigquery==1.12.1
+SQLAlchemy-JSONField==1.0.2
+sqlalchemy-spanner==1.8.0
+SQLAlchemy-Utils==0.41.2
+sqlparse==0.5.3
+starlette==0.40.0
+striprtf==0.0.26
+sympy==1.13.1
+tabulate==0.9.0
+tenacity==8.5.0
+termcolor==2.5.0
+text-unidecode==1.3
+threadpoolctl==3.5.0
+tika==2.6.0
+tiktoken==0.8.0
+time-machine==2.16.0
+tokenizers==0.20.3
+torch==2.5.1
+tornado==6.4.2
+tqdm==4.66.6
+traits==6.4.3
+transformers==4.46.3
+typer==0.13.1
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+tzdata==2024.2
+uc-micro-py==1.0.3
+universal_pathlib==0.2.6
+uritemplate==4.1.1
+urllib3==2.2.3
+uvicorn==0.29.0
+uvloop==0.21.0
+vine==5.1.0
+watchfiles==1.0.0
+wcwidth==0.2.13
+websocket-client==1.8.0
+websockets==12.0
+wirerope==1.0.0
+wordcloud==1.9.4
+wrapt==1.17.0
+WTForms==3.2.1
+yarl==1.18.3
+zipp==3.21.0
diff --git a/setup.cfg b/setup.cfg.bak
similarity index 92%
rename from setup.cfg
rename to setup.cfg.bak
index 09a51e2..ed112a2 100644
--- a/setup.cfg
+++ b/setup.cfg.bak
@@ -7,7 +7,7 @@ license = MIT
 
 [options]
 package_dir=
-    =src
+    =semantic
 packages = find:
 install_requires =
     numpy
@@ -19,10 +19,9 @@ install_requires =
     matplotlib
     pyarrow
     PyMuPDF
-    wordcloud
 
 [options.packages.find]
-where=src
+where=semantic
 
 [options.entry_points]
 console_scripts =
diff --git a/setup.py b/setup.py
index 726ea4c..8339b5a 100644
--- a/setup.py
+++ b/setup.py
@@ -1,10 +1,11 @@
 from setuptools import setup, find_packages
 
 setup(
-    name='semantic_pdf',
+    name='verdictnet',
     version='0.0.1',
-    description='A tool to do semantic_pdf queries on large documents',
+    description='A tool to do verdictnet queries on large documents',
     author='Alex Monras',
     license='MIT',
-    packages=find_packages('src')
+    packages=find_packages(where='src'),
+    package_dir={'': 'src'}
 )
diff --git a/src/semantic/__init__.py b/src/verdictnet/__init__.py
similarity index 100%
rename from src/semantic/__init__.py
rename to src/verdictnet/__init__.py
diff --git a/src/semantic/cli.py b/src/verdictnet/cli.py
similarity index 100%
rename from src/semantic/cli.py
rename to src/verdictnet/cli.py
diff --git a/src/semantic/config.py b/src/verdictnet/config.py
similarity index 86%
rename from src/semantic/config.py
rename to src/verdictnet/config.py
index 39a6583..8c6ab36 100644
--- a/src/semantic/config.py
+++ b/src/verdictnet/config.py
@@ -14,15 +14,15 @@
 
 
 def root_path():
-    return Path(__file__).parent.parent
+    return Path(__file__).parent.parent.parent
 
 
 def get_config():
     # Create a ConfigParser instance
     config = configparser.ConfigParser()
 
-    # Load the configuration file
-    config.read(root_path() / 'config.ini')
+    # Load config.ini from the current folder, then from the project root (values in later files override earlier ones)
+    config.read(filenames=['config.ini', root_path() / 'config.ini'])
 
     return config
 
diff --git a/src/semantic/embedding.py b/src/verdictnet/embedding.py
similarity index 93%
rename from src/semantic/embedding.py
rename to src/verdictnet/embedding.py
index 281c23f..0dadaee 100644
--- a/src/semantic/embedding.py
+++ b/src/verdictnet/embedding.py
@@ -6,8 +6,8 @@
 from sentence_transformers import SentenceTransformer
 from torch import Tensor
 
-from config import root_path, get_config
-from models.node import Node
+from verdictnet.config import root_path, get_config
+from verdictnet.models.node import Node
 
 
 class Embedding:
diff --git a/src/semantic/etl.py b/src/verdictnet/etl.py
similarity index 100%
rename from src/semantic/etl.py
rename to src/verdictnet/etl.py
diff --git a/src/semantic/frontend/__init__.py b/src/verdictnet/frontend/__init__.py
similarity index 100%
rename from src/semantic/frontend/__init__.py
rename to src/verdictnet/frontend/__init__.py
diff --git a/src/semantic/frontend/custom_logger.py b/src/verdictnet/frontend/custom_logger.py
similarity index 100%
rename from src/semantic/frontend/custom_logger.py
rename to src/verdictnet/frontend/custom_logger.py
diff --git a/src/semantic/frontend/paths.py b/src/verdictnet/frontend/paths.py
similarity index 100%
rename from src/semantic/frontend/paths.py
rename to src/verdictnet/frontend/paths.py
diff --git a/src/semantic/frontend/server/__init__.py b/src/verdictnet/frontend/server/__init__.py
similarity index 100%
rename from src/semantic/frontend/server/__init__.py
rename to src/verdictnet/frontend/server/__init__.py
diff --git a/src/semantic/frontend/server/app.py b/src/verdictnet/frontend/server/app.py
similarity index 100%
rename from src/semantic/frontend/server/app.py
rename to src/verdictnet/frontend/server/app.py
diff --git a/src/semantic/frontend/server/dto/__init__.py b/src/verdictnet/frontend/server/dto/__init__.py
similarity index 100%
rename from src/semantic/frontend/server/dto/__init__.py
rename to src/verdictnet/frontend/server/dto/__init__.py
diff --git a/src/semantic/frontend/server/dto/websocket.py b/src/verdictnet/frontend/server/dto/websocket.py
similarity index 100%
rename from src/semantic/frontend/server/dto/websocket.py
rename to src/verdictnet/frontend/server/dto/websocket.py
diff --git a/src/semantic/frontend/server/server.py b/src/verdictnet/frontend/server/server.py
similarity index 100%
rename from src/semantic/frontend/server/server.py
rename to src/verdictnet/frontend/server/server.py
diff --git a/src/semantic/frontend/server/websocket.py b/src/verdictnet/frontend/server/websocket.py
similarity index 100%
rename from src/semantic/frontend/server/websocket.py
rename to src/verdictnet/frontend/server/websocket.py
diff --git a/src/semantic/frontend/static/css/document_tree.css b/src/verdictnet/frontend/static/css/document_tree.css
similarity index 100%
rename from src/semantic/frontend/static/css/document_tree.css
rename to src/verdictnet/frontend/static/css/document_tree.css
diff --git a/src/semantic/frontend/static/css/style.css b/src/verdictnet/frontend/static/css/style.css
similarity index 100%
rename from src/semantic/frontend/static/css/style.css
rename to src/verdictnet/frontend/static/css/style.css
diff --git a/src/semantic/frontend/static/js/main.js b/src/verdictnet/frontend/static/js/main.js
similarity index 100%
rename from src/semantic/frontend/static/js/main.js
rename to src/verdictnet/frontend/static/js/main.js
diff --git a/src/semantic/frontend/templates/index.html b/src/verdictnet/frontend/templates/index.html
similarity index 100%
rename from src/semantic/frontend/templates/index.html
rename to src/verdictnet/frontend/templates/index.html
diff --git a/src/semantic/ingestion/README.md b/src/verdictnet/ingestion/README.md
similarity index 98%
rename from src/semantic/ingestion/README.md
rename to src/verdictnet/ingestion/README.md
index 239f454..fef01c9 100644
--- a/src/semantic/ingestion/README.md
+++ b/src/verdictnet/ingestion/README.md
@@ -2,7 +2,7 @@
 
 There are currently two ingestion processes in place:
 ```sh
-$ semantic erl run
+$ verdictnet etl run
 ```
 ingests the documents specified in `src/ingestion/resources/`, namely:
 - Código Civil
diff --git a/src/semantic/ingestion/__init__.py b/src/verdictnet/ingestion/__init__.py
similarity index 100%
rename from src/semantic/ingestion/__init__.py
rename to src/verdictnet/ingestion/__init__.py
diff --git a/src/semantic/ingestion/documentspec.py b/src/verdictnet/ingestion/documentspec.py
similarity index 100%
rename from src/semantic/ingestion/documentspec.py
rename to src/verdictnet/ingestion/documentspec.py
diff --git a/src/semantic/ingestion/downloader.py b/src/verdictnet/ingestion/downloader.py
similarity index 97%
rename from src/semantic/ingestion/downloader.py
rename to src/verdictnet/ingestion/downloader.py
index 529de2a..8f76502 100644
--- a/src/semantic/ingestion/downloader.py
+++ b/src/verdictnet/ingestion/downloader.py
@@ -9,11 +9,11 @@
 from bs4 import BeautifulSoup
 from tqdm import tqdm
 
-from ingestion.parsers.pdf_parser import extract_paragraphs
-from ingestion.paths import raw_path, refined_path, fsspec_walk
-from config import get_config, logging, get_fs
-from models.node import Node
-from storage.transaction_manager import TransactionManager
+from verdictnet.ingestion.parsers.pdf_parser import extract_paragraphs
+from verdictnet.ingestion.paths import raw_path, refined_path, fsspec_walk
+from verdictnet.config import get_config, logging, get_fs
+from verdictnet.models.node import Node
+from verdictnet.storage.transaction_manager import TransactionManager
 
 logger = logging.getLogger(__name__)
 
diff --git a/src/semantic/ingestion/parsers/__init__.py b/src/verdictnet/ingestion/parsers/__init__.py
similarity index 100%
rename from src/semantic/ingestion/parsers/__init__.py
rename to src/verdictnet/ingestion/parsers/__init__.py
diff --git a/src/semantic/ingestion/parsers/html_parser.py b/src/verdictnet/ingestion/parsers/html_parser.py
similarity index 100%
rename from src/semantic/ingestion/parsers/html_parser.py
rename to src/verdictnet/ingestion/parsers/html_parser.py
diff --git a/src/semantic/ingestion/parsers/pdf_parser.py b/src/verdictnet/ingestion/parsers/pdf_parser.py
similarity index 96%
rename from src/semantic/ingestion/parsers/pdf_parser.py
rename to src/verdictnet/ingestion/parsers/pdf_parser.py
index 5c69312..0876e1e 100644
--- a/src/semantic/ingestion/parsers/pdf_parser.py
+++ b/src/verdictnet/ingestion/parsers/pdf_parser.py
@@ -1,6 +1,6 @@
 import pdfplumber
 
-from models.node import Node
+from verdictnet.models.node import Node
 
 
 def extract_paragraphs(pdf_path) -> Node:
diff --git a/src/semantic/ingestion/paths.py b/src/verdictnet/ingestion/paths.py
similarity index 95%
rename from src/semantic/ingestion/paths.py
rename to src/verdictnet/ingestion/paths.py
index b0a4224..00be14f 100644
--- a/src/semantic/ingestion/paths.py
+++ b/src/verdictnet/ingestion/paths.py
@@ -2,7 +2,7 @@
 
 import fsspec
 
-from config import root_path, get_config, logging
+from verdictnet.config import root_path, get_config, logging
 
 logger = logging.getLogger(__name__)
 
diff --git a/src/semantic/ingestion/resources/codigo_civil.json b/src/verdictnet/ingestion/resources/codigo_civil.json
similarity index 100%
rename from src/semantic/ingestion/resources/codigo_civil.json
rename to src/verdictnet/ingestion/resources/codigo_civil.json
diff --git a/src/semantic/ingestion/resources/codigo_penal.json b/src/verdictnet/ingestion/resources/codigo_penal.json
similarity index 100%
rename from src/semantic/ingestion/resources/codigo_penal.json
rename to src/verdictnet/ingestion/resources/codigo_penal.json
diff --git a/src/semantic/render/__init__.py b/src/verdictnet/models/__init__.py
similarity index 100%
rename from src/semantic/render/__init__.py
rename to src/verdictnet/models/__init__.py
diff --git a/src/semantic/models/node.py b/src/verdictnet/models/node.py
similarity index 98%
rename from src/semantic/models/node.py
rename to src/verdictnet/models/node.py
index c1ee23a..07497fc 100644
--- a/src/semantic/models/node.py
+++ b/src/verdictnet/models/node.py
@@ -3,7 +3,7 @@
 from dataclasses import dataclass, field
 from typing import List
 
-from config import get_fs
+from verdictnet.config import get_fs
 
 
 class AutoIncrement:  # pylint: disable=too-few-public-methods
diff --git a/src/semantic/query.py b/src/verdictnet/query.py
similarity index 100%
rename from src/semantic/query.py
rename to src/verdictnet/query.py
diff --git a/src/semantic/ragagent.py b/src/verdictnet/ragagent.py
similarity index 100%
rename from src/semantic/ragagent.py
rename to src/verdictnet/ragagent.py
diff --git a/src/verdictnet/render/__init__.py b/src/verdictnet/render/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/semantic/render/html.py b/src/verdictnet/render/html.py
similarity index 100%
rename from src/semantic/render/html.py
rename to src/verdictnet/render/html.py
diff --git a/src/semantic/render/node_renderer.py b/src/verdictnet/render/node_renderer.py
similarity index 100%
rename from src/semantic/render/node_renderer.py
rename to src/verdictnet/render/node_renderer.py
diff --git a/src/semantic/render/plain_text.py b/src/verdictnet/render/plain_text.py
similarity index 100%
rename from src/semantic/render/plain_text.py
rename to src/verdictnet/render/plain_text.py
diff --git a/src/verdictnet/storage/__init__.py b/src/verdictnet/storage/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/verdictnet/storage/adapters.py b/src/verdictnet/storage/adapters.py
new file mode 100644
index 0000000..1d8ecd2
--- /dev/null
+++ b/src/verdictnet/storage/adapters.py
@@ -0,0 +1,145 @@
+from typing import Optional
+
+import numpy as np
+
+from verdictnet.models.node import Node
+
+
+class NodeAdapter:
+    @staticmethod
+    def to_neo4j(node: Node, ordinal: Optional[int] = None):
+        """Return the property dict written to Neo4j for *node*.
+
+        ``ordinal`` is passed through unchanged (None when no position is known).
+        """
+        return {
+            'uuid': node.uuid, 'ordinal': ordinal,
+            'level': node.level, 'content': node.content,
+        }
+
+    @staticmethod
+    def to_neo4j_with_relationships(node: Node, ordinal: Optional[int] = None):
+        """
+        Flatten a hierarchy into Neo4j node dicts plus parent/child edges.
+
+        Args:
+            node (Node): The root node of the hierarchy.
+            ordinal (Optional[int]): Position of ``node`` in its parent's children list.
+
+        Returns:
+            A ``(nodes, relationships)`` tuple:
+            - nodes: List of dictionaries representing nodes, root first (pre-order).
+            - relationships: List of (parent_uuid, child_uuid) tuples.
+        """
+        # The root of this (sub)tree always comes first in the node list.
+        nodes = [NodeAdapter.to_neo4j(node, ordinal)]
+        relationships = []
+
+        for position, child in enumerate(node.children):
+            # Edge from this node to the child is emitted before the child's own edges.
+            relationships.append((node.uuid, child.uuid))
+
+            # Recurse: the child's index among its siblings becomes its ordinal.
+            sub_nodes, sub_relationships = NodeAdapter.to_neo4j_with_relationships(
+                child, position
+            )
+            nodes += sub_nodes
+            relationships += sub_relationships
+
+        return nodes, relationships
+
+    @classmethod
+    def from_neo4j(cls, record: dict) -> Node:
+        """
+        Rebuild a Node, and recursively its subtree, from a nested record dict.
+
+        Children are ordered by their 'ordinal'; a missing or None ordinal sorts last.
+        The sort is stable, so children with equal ordinals keep their input order.
+        """
+        ranked = [(ch.get('ordinal'), cls.from_neo4j(ch)) for ch in record.get('children') or []]
+        # Sort on the ordinal only: Node objects themselves are never compared.
+        ranked.sort(key=lambda pair: float('inf') if pair[0] is None else pair[0])
+
+        return Node(
+            uuid=record['uuid'],
+            level=record['level'],
+            content=record['content'],
+            children=[child for _, child in ranked],
+        )
+
+    @staticmethod
+    def build_hierarchy(root_uuid: str, nodes: dict) -> Node:
+        """
+        Build a Node hierarchy from a flat structure.
+
+        Every value of ``nodes`` is a dict carrying at least 'uuid', 'level' and
+        'content'. Its 'parent_uuid', when present, names the entry it hangs from;
+        entries whose parent is None or is not a key of ``nodes`` stay unattached.
+
+        The caller's dicts are not modified: linking happens on shallow copies, so
+        building twice from the same ``nodes`` does not duplicate children.
+
+        Args:
+            root_uuid (str): The UUID of the root node.
+            nodes (dict): Dictionary of all nodes keyed by UUID.
+
+        Returns:
+            Node: Root Node with children populated.
+
+        Raises:
+            KeyError: If ``root_uuid`` is not a key of ``nodes``.
+        """
+        # Copy each entry and give it its own 'children' list to append to.
+        linked = {
+            uuid: {**data, 'children': list(data.get('children') or [])}
+            for uuid, data in nodes.items()
+        }
+
+        for data in linked.values():
+            parent_uuid = data.get('parent_uuid')
+            if parent_uuid is None:
+                continue
+            parent = linked.get(parent_uuid)
+            if parent is None:
+                continue
+            parent['children'].append(data)
+
+        return NodeAdapter.from_neo4j(linked[root_uuid])
+
+        # node_data = nodes[root_uuid]
+        # root_node = Node(
+        #     uuid=node_data['uuid'],
+        #     level=node_data['level'],
+        #     content=node_data['content'],
+        #     children=[]
+        # )
+        #
+        # temp_nodes = {
+        #     uuid: Node(uuid=uuid, level=data['level'], content=data['content'], children=[])
+        #     for uuid, data in nodes.items()
+        # }
+        #
+        # for uuid, node in temp_nodes.items():
+        #     if node.uuid == root_uuid:
+        #         continue
+        #
+        #     # add the node to its parent
+        #     temp_nodes[node_data['parent_uuid']].children.append(node)
+        #
+        # # Find and sort child nodes by their ordinal value
+        # child_nodes = [
+        #     NodeAdapter.build_hierarchy(child_uuid, nodes)
+        #     for child_uuid, child_data in nodes.items()
+        #     if child_data.get('parent_uuid') == root_uuid
+        # ]
+        # root_node.children = sorted(child_nodes, key=lambda x: nodes[x.uuid]['ordinal'])
+        #
+        # return root_node
+
+    @staticmethod
+    def to_chromadb(node: Node):
+        """Map *node* to an id/document/metadata dict for ChromaDB."""
+        return {
+            'id': node.uuid, 'document': node.content,
+            'metadata': {'level': node.level},
+        }
diff --git a/src/verdictnet/storage/chroma_storage.py b/src/verdictnet/storage/chroma_storage.py
new file mode 100644
index 0000000..7638b99
--- /dev/null
+++ b/src/verdictnet/storage/chroma_storage.py
@@ -0,0 +1,90 @@
+import configparser
+from typing import Optional, List
+
+import chromadb
+import neo4j
+
+from verdictnet import config
+from verdictnet.embedding import Embedding
+from verdictnet.models.node import Node
+
+from verdictnet.config import logging
+
+logger = logging.getLogger(__name__)
+
+
+class ChromaStorage:
+    def __init__(self, embedding: Embedding, chroma_client: chromadb.Client,
+                 collection_name: str = 'default',
+                 conf: Optional[configparser.ConfigParser] = None):
+        """Bind an embedding and a Chroma client to one get-or-created collection.
+
+        ``conf`` falls back to the global config; ``rag.n_results`` is read from it.
+        """
+        self.config = conf or config.get_config()
+        self.n_results = int(self.config['rag']['n_results'])
+        self.embedding = embedding
+        self.collection_name = collection_name
+        self.client: chromadb.Client = chroma_client
+        self.collection = self.client.get_or_create_collection(name=collection_name)
+
+    def delete_collection(self, collection):
+        try:  # best effort: a collection that does not exist is simply skipped
+            self.client.get_collection(collection)  # existence check before deleting
+            logger.info('Deleting all documents in collection %s...', collection)
+            self.client.delete_collection(name=collection)
+        except chromadb.errors.InvalidCollectionException:
+            logger.debug('Collection %s not found; nothing to delete', collection)
+
+    def store_batch(self, nodes: List[Node]):
+        """Embed ``nodes`` and upsert them into the collection under ``str(node.uuid)``."""
+        logger.info('Adding/updating documents to collection %s...', self.collection_name)
+
+        node_ids = [str(node.uuid) for node in nodes]
+        vectors, texts, metas = self.embedding.embed_nodes(nodes)
+
+        self.collection.upsert(
+            ids=node_ids,
+            embeddings=vectors,
+            documents=texts,
+            metadatas=metas,
+        )
+
+    def query(self, q_string: str, n_results: Optional[int] = None) -> chromadb.QueryResult:
+        """Embed ``q_string`` and run it as a query against the collection.
+
+        A falsy ``n_results`` (None or 0) selects the configured ``self.n_results``.
+        """
+        query_vector = self.embedding.embed_string(q_string)
+        return self.collection.query(
+            query_embeddings=[query_vector], n_results=n_results or self.n_results,
+        )
+
+    def delete_by_ids(self, ids: List[str]) -> None:
+        """
+        Delete the documents with the given IDs from this storage's collection.
+        """
+        self.collection.delete(ids=ids)
+
+    @classmethod
+    def get_chroma_storage(cls, conf: Optional[configparser.ConfigParser] = None) -> 'ChromaStorage':
+        """Build a ChromaStorage from ``conf`` (the global config when omitted).
+
+        ``chroma.type`` selects the client: 'http', 'local' (persisted under
+        ``storage.path``) or, for any other value, an ephemeral in-memory client.
+        """
+        conf = conf or config.get_config()
+        client_type = conf['chroma']['type']
+
+        if client_type == 'http':
+            client = chromadb.HttpClient(host=conf['chroma']['host'], port=int(conf['chroma']['port']))
+        elif client_type == 'local':
+            client = chromadb.PersistentClient(path=str(config.root_path() / conf.get('storage', 'path')))
+        else:
+            logger.warning("Using in-memory client. This is ephemeral")
+            client = chromadb.EphemeralClient()
+
+        collection = conf.get('storage', 'collection')
+        embedding = Embedding(conf=conf)
+        # ``conf`` is forwarded so the instance reads ``rag.n_results`` from this same config.
+        return cls(embedding=embedding, chroma_client=client, collection_name=collection, conf=conf)
diff --git a/src/verdictnet/storage/graph_storage.py b/src/verdictnet/storage/graph_storage.py
new file mode 100644
index 0000000..9876423
--- /dev/null
+++ b/src/verdictnet/storage/graph_storage.py
@@ -0,0 +1,328 @@
+import configparser
+import logging
+from textwrap import dedent
+from typing import Optional, List, Dict
+
+import numpy as np
+from neo4j import Driver, GraphDatabase
+
+from verdictnet import config
+from verdictnet.models.node import Node
+from verdictnet.storage.adapters import NodeAdapter
+
+logger = config.logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+
+
+class GraphStorage:
+    def __init__(self, graph_driver: Driver):
+        """
+        Keep the given Neo4j driver and ensure the uuid uniqueness constraint exists.
+        """
+        self.driver = graph_driver
+        self._ensure_constraints()
+
+    def _ensure_constraints(self):  # Node.uuid must be unique; IF NOT EXISTS makes re-runs harmless
+        with self.driver.session() as session:
+            session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (n:Node) REQUIRE n.uuid IS UNIQUE")
+
+    def create_node(self, node: Node):
+        """
+        Upsert a single node (MERGE on uuid); its ordinal is always written as None.
+        """
+        node_data = NodeAdapter.to_neo4j(node)
+        with self.driver.session() as session:
+            query = """
+            MERGE (n:Node {uuid: $uuid})
+            SET n.level = $level, n.content = $content, n.ordinal = $ordinal
+            RETURN n
+            """
+            session.run(query, **node_data)
+
+    def create_relationship(self, parent_uuid: str, child_uuid: str):  # MERGE: an existing edge is reused
+        with self.driver.session() as session:
+            query = """
+            MATCH (p:Node {uuid: $parent_uuid})
+            MATCH (c:Node {uuid: $child_uuid})
+            MERGE (p)-[:HAS_CHILD]->(c)
+            """
+            session.run(query, parent_uuid=parent_uuid, child_uuid=child_uuid)
+
+    def store(self, root_node: Node):
+        """
+        Store a Node and all its descendants in the Neo4j database.
+
+        Delegates to ``batch_store`` so that every child is written together with
+        its ordinal (its index among its siblings). Writing nodes one by one through
+        ``create_node`` would store a null ordinal and lose the sibling order when
+        the hierarchy is read back.
+        """
+        # The root itself keeps a null ordinal, exactly as ``create_node`` writes it.
+        self.batch_store([root_node])
+
+    def batch_store(self, node_list: List[Node], parent_uuid: Optional[str] = None, ordered=False):
+        """
+        Batch insert nodes and relationships into Neo4j.
+
+        Every hierarchy in ``node_list`` is flattened; its nodes are upserted (MERGE on
+        uuid) and its parent/child pairs are merged as HAS_CHILD relationships.
+        If parent_uuid is provided, the nodes of ``node_list`` are also linked as
+        children of that parent node.
+
+        NOTE: ``ordered`` is accepted but currently ignored: the nodes of ``node_list``
+        get a null ordinal, their descendants get their index among siblings.
+        """
+        insert_query = dedent(
+            """
+                MERGE (n:Node {uuid: $uuid})
+                SET n.level = $level, n.content = $content, n.ordinal = $ordinal
+            """
+        )
+        nodes = []
+        relationships = []
+        for node in node_list:
+            nd, rel = NodeAdapter.to_neo4j_with_relationships(node)
+            nodes.extend(nd)
+            relationships.extend(rel)
+            if parent_uuid:
+                relationships.append((parent_uuid, node.uuid))
+
+        with self.driver.session() as session:
+            for node_data in nodes:
+                session.run(insert_query, **node_data).to_eager_result()
+            # Distinct loop names: the ``parent_uuid`` argument must not be rebound here.
+            for source_uuid, target_uuid in relationships:
+                session.run("""
+                    MATCH (p:Node {uuid: $parent_uuid})
+                    MATCH (c:Node {uuid: $child_uuid})
+                    MERGE (p)-[:HAS_CHILD]->(c)
+                """, parent_uuid=source_uuid, child_uuid=target_uuid)
+
+    def delete_hierarchy(self, root_uuid: str):
+        """
+        Delete the node with ``root_uuid`` and everything reachable from it via HAS_CHILD.
+        """
+        with self.driver.session() as session:
+            query = """
+            MATCH (n:Node {uuid: $root_uuid})-[:HAS_CHILD*0..]->(child)
+            DETACH DELETE n, child
+            """
+            session.run(query, root_uuid=root_uuid)
+
+    def delete_all(self):
+        """
+        Delete every node labelled ``Node`` together with all of its relationships.
+        """
+        with self.driver.session() as session:
+            session.run("MATCH (n:Node) DETACH DELETE n")
+
+    def retrieve_parent(self, uuid: str, depth: int = np.inf) -> Optional[Node]:
+        """
+        Retrieve the parent of a node, together with the parent's whole hierarchy.
+
+        Args:
+            uuid (str): The UUID of the node to retrieve the parent of.
+            depth (int): Must be left at its default; depth limits are not implemented.
+
+        Returns:
+            Optional[Node]: The parent node with its children populated, or None when
+            the query yields no parent record.
+
+        Raises:
+            NotImplementedError: If a finite ``depth`` is requested.
+        """
+        if depth < np.inf:
+            raise NotImplementedError("Depth limit not implemented")
+
+        # Only the parent's uuid is fetched here; the subtree is loaded afterwards.
+        query = dedent("""
+        MATCH (n:Node {uuid: $uuid})<-[:HAS_CHILD]-(parent)
+        RETURN parent.uuid AS parent_uuid;
+        """)
+        logger.debug("Querying parent for %s", uuid)
+        logger.debug("Query: \n%s", query)
+        with self.driver.session() as session:
+            record = session.run(query, uuid=uuid).single()
+
+        # The lookup session is closed before retrieve_hierarchy opens its own.
+        if not record:
+            return None
+        return self.retrieve_hierarchy(record['parent_uuid'])
+
+    def retrieve_hierarchy(self, root_uuid: str, depth: int = np.inf) -> Optional[Node]:
+        """
+        Retrieve a node and all its descendants as a hierarchy.
+
+        Args:
+            root_uuid (str): The UUID of the root node to retrieve.
+            depth (int): Only the default is supported; a finite value raises NotImplementedError.
+
+        Returns:
+            Optional[Node]: The root node with all its children populated, or None if no record is returned.
+        """
+
+        if depth < np.inf:
+            raise NotImplementedError("Depth limit not implemented yet")
+
+        with self.driver.session() as session:
+            # One row: the root's properties plus a collected entry per (node, parent) pair below it
+            query = dedent("""
+            MATCH (n:Node {uuid: $root_uuid})-[:HAS_CHILD*0..]->(child)
+            OPTIONAL MATCH (child)<-[:HAS_CHILD]-(parent)
+            RETURN n.uuid AS root_uuid, 
+                   n.level AS root_level, 
+                   n.content AS root_content, 
+                   collect({parent: parent.uuid, self: child.uuid, contents: child.content, level: child.level, order:child.ordinal}) AS relationships;
+            """)
+            logger.debug(f"Querying hierarchy for {root_uuid}")
+            logger.debug(f"Query: \n{query}")
+
+            result = session.run(query, root_uuid=root_uuid).single()
+
+            if not result:
+                return None
+
+            # Flatten the record into {uuid: node-dict}, the input of NodeAdapter.build_hierarchy
+            nodes = {}
+            root_node_data = {
+                'uuid': result['root_uuid'],
+                'level': result['root_level'],
+                'content': result['root_content'],
+                'parent_uuid': None,
+            }
+            nodes[root_uuid] = root_node_data
+
+            # The *0.. match includes the root itself, so its entry above is overwritten here too
+            for relationship in result['relationships']:
+                nodes[relationship['self']] = {
+                    'uuid': relationship['self'],
+                    'level': relationship['level'],
+                    'content': relationship['contents'],
+                    'ordinal': relationship['order'],
+                    'parent_uuid': relationship['parent'],
+                }
+
+            # Build the hierarchy
+            return NodeAdapter.build_hierarchy(root_uuid, nodes)
+
+    # def retrieve_hierarchies(self, root_uuids: List[str]) -> Dict[str, Node]:
+    #     """
+    #     Retrieve multiple node hierarchies from Neo4j by root UUIDs.
+    #
+    #     Args:
+    #         root_uuids (List[str]): The UUIDs of the root nodes to retrieve.
+    #
+    #     Returns:
+    #         Dict[str, Node]: A dictionary of root nodes with all their children populated.
+    #     """
+    #     with self.driver.session() as session:
+
+    # TODO: There seems to be a bug with this query. Fix it.
+
+    #         query = """
+    #         UNWIND $root_uuids AS root_uuid
+    #         MATCH (n:Node {uuid: root_uuid})-[:HAS_CHILD*0..]->(child)
+    #         OPTIONAL MATCH (child)<-[:HAS_CHILD]-(parent)
+    #         WHERE parent.uuid <> child.uuid
+    #         RETURN root_uuid,
+    #                n.uuid AS node_uuid,
+    #                n.level AS node_level,
+    #                n.content AS node_content,
+    #                collect(child.uuid) AS child_uuids,
+    #                collect(child.level) AS child_levels,
+    #                collect(child.content) AS child_contents,
+    #                collect(child.ordinal) AS child_order,
+    #                collect(parent.uuid) AS parent_uuids
+    #         """
+    #         result = session.run(query, root_uuids=root_uuids)
+    #
+    #         nodes = {}
+    #         for record in result:
+    #             root_uuid = record['root_uuid']
+    #             if root_uuid not in nodes:
+    #                 nodes[root_uuid] = {}
+    #
+    #             node_data = {
+    #                 'uuid': record['node_uuid'],
+    #                 'level': record['node_level'],
+    #                 'content': record['node_content'],
+    #                 'parent_uuid': None,
+    #             }
+    #             nodes[root_uuid][record['node_uuid']] = node_data
+    #
+    #             child_uuids = record['child_uuids']
+    #             child_levels = record['child_levels']
+    #             child_contents = record['child_contents']
+    #             child_order = record['child_order']
+    #             parent_uuids = record['parent_uuids']
+    #
+    #             assert len(child_uuids) == len(child_levels)
+    #             assert len(child_uuids) == len(child_contents)
+    #             assert len(child_uuids) == len(child_order)
+    #             # assert len(child_uuids) == len(parent_uuids)
+    #
+    #             for child_uuid, level, content, ordinal, parent_uuid in zip(
+    #                     child_uuids, child_levels, child_contents, child_order, parent_uuids
+    #             ):
+    #                 nodes[root_uuid][child_uuid] = {
+    #                     'uuid': child_uuid,
+    #                     'level': level,
+    #                     'content': content,
+    #                     'ordinal': ordinal,
+    #                     'parent_uuid': parent_uuid,
+    #                 }
+    #
+    #         hierarchies = {}
+    #         for root_uuid in root_uuids:
+    #             if root_uuid in nodes:
+    #                 hierarchies[root_uuid] = NodeAdapter.build_hierarchy(root_uuid, nodes[root_uuid])
+    #
+    #         return hierarchies
+
+    def retrieve_by(self, level: Optional[str] = None, content: Optional[str] = None) -> List[Node]:
+        """
+        Retrieve the hierarchies rooted at the nodes matching the given properties.
+
+        Filters are exact matches combined with AND. A filter left as None is not
+        applied, so calling without any filter matches every ``Node``.
+
+        Args:
+            level (str): The level of the nodes to retrieve.
+            content (str): The content of the nodes to retrieve.
+
+        Returns:
+            List[Node]: For each matching node, the result of ``retrieve_hierarchy``
+            on its uuid (the node with all its children populated).
+        """
+        # Keep only the filters that were actually given.
+        kwargs = {
+            name: value
+            for name, value in (('level', level), ('content', content))
+            if value is not None
+        }
+        # Only these fixed property names are interpolated into the query text;
+        # the values themselves are passed as query parameters.
+        clauses = ','.join(f"{name}: ${name}" for name in kwargs)
+
+        query = dedent(f"""
+            MATCH (n:Node {{ {clauses} }})
+            RETURN n.uuid AS uuid
+        """)
+
+        with self.driver.session() as session:
+            # Materialise the uuids before the session (and its result) is closed.
+            uuids = [record['uuid'] for record in session.run(query, **kwargs)]
+
+        # Each hit is expanded by its own query, once the lookup session is closed.
+        nodes = [self.retrieve_hierarchy(uuid) for uuid in uuids]
+        return nodes
+
+    @classmethod
+    def get_graph_storage(cls, conf: Optional[configparser.ConfigParser] = None) -> 'GraphStorage':
+        """Build a storage from the ``neo4j`` config section (``url``, ``user``, ``password``).
+
+        Uses ``cls`` rather than a hard-coded class so subclasses get their own type.
+        """
+        neo4j_conf = (conf or config.get_config())['neo4j']
+        driver = GraphDatabase.driver(neo4j_conf['url'], auth=(neo4j_conf['user'], neo4j_conf['password']))
+        return cls(driver)
diff --git a/src/verdictnet/storage/hybrid_storage.py b/src/verdictnet/storage/hybrid_storage.py
new file mode 100644
index 0000000..0196ca2
--- /dev/null
+++ b/src/verdictnet/storage/hybrid_storage.py
@@ -0,0 +1,78 @@
+import configparser
+from typing import Optional, List
+
+from verdictnet import config
+from verdictnet.models.node import Node
+from verdictnet.storage.chroma_storage import ChromaStorage
+from verdictnet.storage.graph_storage import GraphStorage
+
+
+class HybridStorage:
+    def __init__(self, chroma_storage: ChromaStorage, graph_storage: GraphStorage):
+        """
+        Hold the two backends: ChromaStorage for embeddings/search, GraphStorage for the tree.
+        """
+        self.chroma_storage: ChromaStorage = chroma_storage
+        self.graph_storage: GraphStorage = graph_storage
+
+    def store(self, root_node: Node):
+        """
+        Persist a Node hierarchy: the tree goes to Neo4j, every node to ChromaDB.
+
+        Neo4j is written first. There is no rollback here: if the ChromaDB write
+        fails, the Neo4j data stays in place.
+        """
+        self.graph_storage.store(root_node)
+
+        # ChromaDB receives the flat list of all nodes of the hierarchy.
+        flattened = self.flatten_hierarchy(root_node)
+        self.chroma_storage.store_batch(flattened)
+
+    def flatten_hierarchy(self, root_node: Node) -> List[Node]:
+        """Return every node of the hierarchy in pre-order (a parent before its children)."""
+        ordered, pending = [], [root_node]
+        while pending:
+            current = pending.pop()
+            ordered.append(current)
+            pending.extend(reversed(list(current.children)))  # first child is popped first
+        return ordered
+
+    def query(self, query_string: str, n_results: Optional[int] = None):
+        """
+        Perform a semantic search in ChromaDB and return the results.
+        """
+        return self.chroma_storage.query(query_string, n_results)
+
+    def delete_all(self):
+        """Delete the Chroma collection in use and every ``Node`` in Neo4j.
+        NOTE(review): chroma_storage.collection is not re-created here - confirm before reuse.
+        """
+        self.chroma_storage.delete_collection(self.chroma_storage.collection_name)
+        self.graph_storage.delete_all()
+
+    def retrieve_parent(self, uuid: str) -> Optional[Node]:
+        """
+        Retrieve the parent of a node (with the parent's hierarchy), or None if no parent is found.
+        """
+        return self.graph_storage.retrieve_parent(uuid)
+
+    def retrieve_hierarchy(self, root_uuid: str) -> Optional[Node]:
+        """
+        Retrieve a node and all its descendants from the graph storage.
+
+        Args:
+            root_uuid (str): The UUID of the root node to retrieve.
+
+        Returns:
+            Optional[Node]: The root node with all its children populated, or None if it is not found.
+        """
+        return self.graph_storage.retrieve_hierarchy(root_uuid)
+
+    @classmethod
+    def get_hybrid_storage(cls, conf: Optional[configparser.ConfigParser] = None) -> 'HybridStorage':
+        """Assemble a HybridStorage whose two backends are built from the same config."""
+        conf = conf or config.get_config()
+        return cls(
+            ChromaStorage.get_chroma_storage(conf),
+            GraphStorage.get_graph_storage(conf),
+        )
\ No newline at end of file
diff --git a/src/verdictnet/storage/transaction_manager.py b/src/verdictnet/storage/transaction_manager.py
new file mode 100644
index 0000000..6bdd9a1
--- /dev/null
+++ b/src/verdictnet/storage/transaction_manager.py
@@ -0,0 +1,93 @@
+import configparser
+from typing import Optional, List
+
+from verdictnet.models.node import Node
+from verdictnet.storage.hybrid_storage import HybridStorage
+from verdictnet.config import logging, get_config
+
+logger = logging.getLogger(__name__)
+
+# TODO: implement unit tests for this class
+
+
+class TransactionManager:
+    def __init__(self, hybrid_storage: HybridStorage):
+        """
+        Keep the HybridStorage whose graph and Chroma backends this manager writes to.
+        """
+        self.hybrid_storage = hybrid_storage
+
+    def init_dataset(self, name):
+        """
+        Return the uuid of the dataset node called ``name``, creating it if needed.
+
+        A dataset node is a Node with level "dataset" whose content is the dataset
+        name. All further documents of this dataset are meant to be linked to it.
+
+        Returns:
+            The uuid of the existing (first match) or newly stored dataset node.
+        """
+        # Look the dataset up in the graph first.
+        existing = self.hybrid_storage.graph_storage.retrieve_by(level='dataset', content=name)
+        if existing:
+            logger.info("Dataset already exists in the database")
+            return existing[0].uuid
+
+        # Not found: create it through the transactional path so that Neo4j and
+        # ChromaDB both receive the new node.
+        dataset_node = Node(level="dataset", content=name)
+        self.store_with_transaction(dataset_node)
+        return dataset_node.uuid
+
+    def store_with_transaction(self, root_nodes: Node | List[Node], parent_uuid: Optional[str] = None):
+        """
+        Store one or more Node hierarchies in both Neo4j and ChromaDB.
+
+        Neo4j is written first, then ChromaDB. If either write raises, every given
+        hierarchy is rolled back (best effort) and a RuntimeError chained to the
+        original exception is raised.
+
+        Raises:
+            RuntimeError: If storing failed; ``__cause__`` holds the original error.
+        """
+        if not isinstance(root_nodes, list):
+            root_nodes = [root_nodes]
+
+        try:
+            logger.info("Storing in Neo4j...")
+            self.hybrid_storage.graph_storage.batch_store(root_nodes, parent_uuid)
+            logger.info("Storing in ChromaDB...")
+            for node in root_nodes:
+                all_nodes = self.hybrid_storage.flatten_hierarchy(node)
+                self.hybrid_storage.chroma_storage.store_batch(all_nodes)
+        except Exception as e:
+            # Rollback strategy: remove nodes from both systems if one fails
+            for node in root_nodes:
+                self.rollback(node)
+            raise RuntimeError(f"Transaction failed: {e}") from e
+
+
+    def rollback(self, root_node: Node):
+        """
+        Best-effort removal of ``root_node``'s hierarchy from Neo4j and ChromaDB.
+        The two stores are cleaned independently; failures are logged, never raised.
+        """
+        try:
+            self.hybrid_storage.graph_storage.delete_hierarchy(root_node.uuid)
+        except Exception as rollback_error:
+            logger.error("Rollback failed in Neo4j: %s", rollback_error)
+        try:
+            # Same ``str(uuid)`` ids that ChromaStorage.store_batch upserts.
+            all_node_uuids = [str(node.uuid) for node in self.hybrid_storage.flatten_hierarchy(root_node)]
+            self.hybrid_storage.chroma_storage.delete_by_ids(all_node_uuids)
+        except Exception as rollback_error:
+            logger.error("Rollback failed in ChromaDB: %s", rollback_error)
+
+    @classmethod
+    def get_transaction_manager(cls, conf: Optional[configparser.ConfigParser] = None) -> 'TransactionManager':
+        """Build a TransactionManager on a HybridStorage created from ``conf``.
+
+        When ``conf`` is omitted the global configuration is used. ``cls`` is
+        instantiated so that subclasses are built as themselves.
+        """
+        return cls(HybridStorage.get_hybrid_storage(conf or get_config()))
diff --git a/tests/conftest.py b/tests/conftest.py
index 5d072b9..312fe11 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -12,7 +12,7 @@
 
 @fixture
 def static_files():
-    return root_path() / "semantic/frontend/static/css"
+    return root_path() / "verdictnet/frontend/static/css"
 
 
 @fixture

From fee24ea6a964f4551fad2cbb096f04a4e50440ad Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Mon, 27 Jan 2025 13:26:35 +0100
Subject: [PATCH 03/27] add setup to makefile

---
 Makefile | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 1568fd3..5f7f7b5 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: server etl test build clean
+.PHONY: setup server etl test build clean
 
 # Default port for the server
 PORT ?= 8000
@@ -7,6 +7,12 @@ PORT ?= 8000
 ETL_PATH ?= /path/to/docspecs
 FORCE ?= true
 
+.PHONY: setup
+setup:
+	@echo "Generating .env file with FERNET_KEY..."
+	@python3 -c "from cryptography.fernet import Fernet; print(f'FERNET_KEY={Fernet.generate_key().decode()}')" > .env
+	@echo ".env file generated."
+
 # Docker build
 .PHONY: build
 build:

From 67f62bdce52e0915b24f99e95b9f65d93ab00510 Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Mon, 27 Jan 2025 13:26:35 +0100
Subject: [PATCH 04/27] add setup to makefile

---
 Makefile | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 1568fd3..5f7f7b5 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: server etl test build clean
+.PHONY: setup server etl test build clean
 
 # Default port for the server
 PORT ?= 8000
@@ -7,6 +7,12 @@ PORT ?= 8000
 ETL_PATH ?= /path/to/docspecs
 FORCE ?= true
 
+.PHONY: setup
+setup:
+	@echo "Generating .env file with FERNET_KEY..."
+	@python3 -c "from cryptography.fernet import Fernet; print(f'FERNET_KEY={Fernet.generate_key().decode()}')" > .env
+	@echo ".env file generated."
+
 # Docker build
 .PHONY: build
 build:

From b1adf256c44e116951592a55ad12629a25cb6506 Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Mon, 27 Jan 2025 13:28:23 +0100
Subject: [PATCH 05/27] add setup to makefile

---
 Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Makefile b/Makefile
index 5f7f7b5..ee09100 100644
--- a/Makefile
+++ b/Makefile
@@ -9,6 +9,8 @@ FORCE ?= true
 
 .PHONY: setup
 setup:
+	@echo "Installing requirements..."
+	@pip install -r requirements.txt
 	@echo "Generating .env file with FERNET_KEY..."
 	@python3 -c "from cryptography.fernet import Fernet; print(f'FERNET_KEY={Fernet.generate_key().decode()}')" > .env
 	@echo ".env file generated."

From 615898f28093ac91ae17145cea3c62ecc9c11c94 Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Mon, 27 Jan 2025 13:54:57 +0100
Subject: [PATCH 06/27] set airflow login creds

---
 docker-compose.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docker-compose.yml b/docker-compose.yml
index b886363..3e362f6 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -34,6 +34,8 @@ services:
       - AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow
       - AIRFLOW__CORE__FERNET_KEY=${FERNET_KEY}
       - AIRFLOW__WEBSERVER__RBAC=True
+      - _AIRFLOW_WWW_USER_USERNAME=airflow
+      - _AIRFLOW_WWW_USER_PASSWORD=airflow
     ports:
       - "8080:8080"
     depends_on:

From 552ac47743f461c1c4446a104559fc3c5f917fcf Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Mon, 27 Jan 2025 14:09:15 +0100
Subject: [PATCH 07/27] set airflow login creds

---
 docker-compose.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docker-compose.yml b/docker-compose.yml
index 3e362f6..60e49e4 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -34,6 +34,7 @@ services:
       - AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow
       - AIRFLOW__CORE__FERNET_KEY=${FERNET_KEY}
       - AIRFLOW__WEBSERVER__RBAC=True
+      - _AIRFLOW_WWW_USER_CREATE=True
       - _AIRFLOW_WWW_USER_USERNAME=airflow
       - _AIRFLOW_WWW_USER_PASSWORD=airflow
     ports:

From 166e3e6ee98c4d9734cdc4f0bdde28c1ddb83b6a Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Mon, 27 Jan 2025 14:10:18 +0100
Subject: [PATCH 08/27] set airflow login creds

---
 docker-compose.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index 60e49e4..e2256fe 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -33,8 +33,8 @@ services:
       - AIRFLOW__CORE__EXECUTOR=LocalExecutor
       - AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow
       - AIRFLOW__CORE__FERNET_KEY=${FERNET_KEY}
-      - AIRFLOW__WEBSERVER__RBAC=True
-      - _AIRFLOW_WWW_USER_CREATE=True
+      - AIRFLOW__WEBSERVER__RBAC=true
+      - _AIRFLOW_WWW_USER_CREATE=true
       - _AIRFLOW_WWW_USER_USERNAME=airflow
       - _AIRFLOW_WWW_USER_PASSWORD=airflow
     ports:

From 62d4a87933de2d3a1ace141518ab0cacdb61ed90 Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Mon, 27 Jan 2025 14:47:11 +0100
Subject: [PATCH 09/27] improve dag startup time

---
 Makefile               |  6 +++++-
 dags/jurisprudencia.py |  6 +-----
 dags/profiler.py       | 10 ++++++++++
 requirements.txt       |  1 +
 4 files changed, 17 insertions(+), 6 deletions(-)
 create mode 100644 dags/profiler.py

diff --git a/Makefile b/Makefile
index ee09100..571f4e6 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: setup server etl test build clean
+.PHONY: setup profile server etl test build clean
 
 # Default port for the server
 PORT ?= 8000
@@ -15,6 +15,10 @@ setup:
 	@python3 -c "from cryptography.fernet import Fernet; print(f'FERNET_KEY={Fernet.generate_key().decode()}')" > .env
 	@echo ".env file generated."
 
+.PHONY: profile
+profile:
+	@py-spy record -o profile.svg -- python dags/jurisprudencia.py
+
 # Docker build
 .PHONY: build
 build:
diff --git a/dags/jurisprudencia.py b/dags/jurisprudencia.py
index 89c180a..6afc96d 100644
--- a/dags/jurisprudencia.py
+++ b/dags/jurisprudencia.py
@@ -1,11 +1,7 @@
 import pendulum
 from airflow import DAG
 from airflow.operators.python import PythonOperator
-from airflow.utils.dates import days_ago
-from datetime import datetime, timedelta
-import requests
-import json
-import os
+from datetime import timedelta
 
 from verdictnet.ingestion.downloader import get_item_pagination
 
diff --git a/dags/profiler.py b/dags/profiler.py
new file mode 100644
index 0000000..3e00ffe
--- /dev/null
+++ b/dags/profiler.py
@@ -0,0 +1,10 @@
+import line_profiler
+
+
+@line_profiler.profile
+def execute():  # profiled entry point: its only work is importing the jurisprudencia module
+    import jurisprudencia
+
+
+if __name__ == "__main__":
+    execute()
diff --git a/requirements.txt b/requirements.txt
index 59e57ad..5516ad4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -187,6 +187,7 @@ kombu==5.4.2
 kubernetes==31.0.0
 lazy-object-proxy==1.10.0
 limits==4.0.1
+line_profiler
 linkify-it-py==2.0.3
 llama-cloud==0.1.6
 llama-index==0.12.5

From 9ac0a61a8a4fc975a5048b94e1b0f7dd3f00936b Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Mon, 27 Jan 2025 14:47:22 +0100
Subject: [PATCH 10/27] improve dag startup time

---
 src/verdictnet/embedding.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/verdictnet/embedding.py b/src/verdictnet/embedding.py
index 0dadaee..8af84d0 100644
--- a/src/verdictnet/embedding.py
+++ b/src/verdictnet/embedding.py
@@ -1,10 +1,7 @@
-import argparse
 import configparser
-from typing import List, Union, Tuple, Optional
+from typing import List, Union, Optional
 
 from numpy import ndarray
-from sentence_transformers import SentenceTransformer
-from torch import Tensor
 
 from verdictnet.config import root_path, get_config
 from verdictnet.models.node import Node
@@ -12,6 +9,8 @@
 
 class Embedding:
     def __init__(self, conf: Optional[configparser.ConfigParser] = None):
+        from sentence_transformers import SentenceTransformer
+
         self.conf = conf or get_config()
         # Load a pre-trained model
         self.model = SentenceTransformer(  # Lightweight, fast model
@@ -20,7 +19,7 @@ def __init__(self, conf: Optional[configparser.ConfigParser] = None):
         )
 
     def embed_nodes(self, nodes: List[Node]) -> tuple[
-        List[Union[List[Tensor], ndarray, Tensor]],
+        List[Union[List, ndarray]],
         List[str],
         List[dict]
     ]:
@@ -43,5 +42,5 @@ def embed_nodes(self, nodes: List[Node]) -> tuple[
 
         return embeddings, documents, metadata
 
-    def embed_string(self, text: str) -> Tensor:
+    def embed_string(self, text: str):
         return self.model.encode(text).tolist()

From 9913671a02b32106f2e809844164a02a461e7375 Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Tue, 28 Jan 2025 00:05:32 +0100
Subject: [PATCH 11/27] fix imports, airflow working

---
 docker-compose.yml                     | 263 ++++++++++++++++++++++---
 setup.cfg.bak => setup.cfg             |   8 +-
 src/verdictnet/cli.py                  |   4 +-
 src/verdictnet/etl.py                  |  16 +-
 src/verdictnet/query.py                |   6 +-
 src/verdictnet/render/node_renderer.py |   2 +-
 src/verdictnet/render/plain_text.py    |   4 +-
 7 files changed, 252 insertions(+), 51 deletions(-)
 rename setup.cfg.bak => setup.cfg (81%)

diff --git a/docker-compose.yml b/docker-compose.yml
index e2256fe..3ec9cc6 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,3 +1,91 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL.
+#
+# WARNING: This configuration is for local development. Do not use it in a production deployment.
+#
+# This configuration supports basic configuration using environment variables or an .env file
+# The following variables are supported:
+#
+# AIRFLOW_IMAGE_NAME           - Docker image name used to run Airflow.
+#                                Default: apache/airflow:2.10.4
+# AIRFLOW_UID                  - User ID in Airflow containers
+#                                Default: 50000
+# AIRFLOW_PROJ_DIR             - Base path to which all the files will be volumed.
+#                                Default: .
+# Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode
+#
+# _AIRFLOW_WWW_USER_USERNAME   - Username for the administrator account (if requested).
+#                                Default: airflow
+# _AIRFLOW_WWW_USER_PASSWORD   - Password for the administrator account (if requested).
+#                                Default: airflow
+# _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers.
+#                                Use this option ONLY for quick checks. Installing requirements at container
+#                                startup is done EVERY TIME the service is started.
+#                                A better way is to build a custom image or extend the official image
+#                                as described in https://airflow.apache.org/docs/docker-stack/build.html.
+#                                Default: ''
+#
+# Feel free to modify this file to suit your needs.
+---
+x-airflow-common:
+  &airflow-common
+  # In order to add custom dependencies or upgrade provider packages you can use your extended image.
+  # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml
+  # and uncomment the "build" line below, Then run `docker-compose build` to build the images.
+  # image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.10.4}
+  build: .
+  environment:
+    &airflow-common-env
+    AIRFLOW__CORE__EXECUTOR: CeleryExecutor
+    AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
+    AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow
+    AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
+    AIRFLOW__CORE__FERNET_KEY: ''
+    AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
+    AIRFLOW__CORE__LOAD_EXAMPLES: 'true'
+    AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session'
+    # yamllint disable rule:line-length
+    # Use simple http server on scheduler for health checks
+    # See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server
+    # yamllint enable rule:line-length
+    AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true'
+    # WARNING: Use the _PIP_ADDITIONAL_REQUIREMENTS option ONLY for quick checks;
+    # for other purpose (development, test and especially production usage) build/extend Airflow image.
+    _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-}
+    # The following line can be used to set a custom config file, stored in the local config folder
+    # If you want to use it, uncomment it and replace airflow.cfg with the name of your config file
+    # AIRFLOW_CONFIG: '/opt/airflow/config/airflow.cfg'
+  volumes:
+    - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags
+    - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
+    - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config
+    - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins
+  user: "${AIRFLOW_UID:-50000}:0"
+  depends_on:
+    &airflow-common-depends-on
+    redis:
+      condition: service_healthy
+    postgres:
+      condition: service_healthy
+
+
 services:
   chromadb:
     image: chromadb/chroma:latest
@@ -23,49 +111,162 @@ services:
       POSTGRES_USER: airflow
       POSTGRES_PASSWORD: airflow
       POSTGRES_DB: airflow
+    healthcheck:
+      test: [ "CMD", "pg_isready", "-U", "airflow" ]
+      interval: 10s
+      retries: 5
+      start_period: 5s
+    restart: always
     volumes:
       - ${PWD}/postgress_storage:/var/lib/postgresql/data
+  redis:
+    # Redis is limited to 7.2-bookworm due to licencing change
+    # https://redis.io/blog/redis-adopts-dual-source-available-licensing/
+    image: redis:7.2-bookworm
+    expose:
+      - 6379
+    healthcheck:
+      test: ["CMD", "redis-cli", "ping"]
+      interval: 10s
+      timeout: 30s
+      retries: 50
+      start_period: 30s
+    restart: always
+
 
   airflow-webserver:
-    build: .
-    container_name: airflow-webserver
-    environment:
-      - AIRFLOW__CORE__EXECUTOR=LocalExecutor
-      - AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow
-      - AIRFLOW__CORE__FERNET_KEY=${FERNET_KEY}
-      - AIRFLOW__WEBSERVER__RBAC=true
-      - _AIRFLOW_WWW_USER_CREATE=true
-      - _AIRFLOW_WWW_USER_USERNAME=airflow
-      - _AIRFLOW_WWW_USER_PASSWORD=airflow
+    <<: *airflow-common
+    command: webserver
     ports:
       - "8080:8080"
+    healthcheck:
+      test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 30s
+    restart: always
     depends_on:
-      - postgres
-    command: >
-      bash -c "airflow db init && airflow webserver"
-    env_file:
-      - .env
-    volumes:
-      - ${PWD}/dags:/opt/airflow/dags  # Mount the dags folder
-
+      <<: *airflow-common-depends-on
+      airflow-init:
+        condition: service_completed_successfully
 
   airflow-scheduler:
-    build: .
-    container_name: airflow-scheduler
+    <<: *airflow-common
+    command: scheduler
+    healthcheck:
+      test: ["CMD", "curl", "--fail", "http://localhost:8974/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 30s
+    restart: always
+    depends_on:
+      <<: *airflow-common-depends-on
+      airflow-init:
+        condition: service_completed_successfully
+
+  airflow-worker:
+    <<: *airflow-common
+    command: celery worker
+    healthcheck:
+      # yamllint disable rule:line-length
+      test:
+        - "CMD-SHELL"
+        - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}" || celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"'
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 30s
     environment:
-      - AIRFLOW__CORE__EXECUTOR=LocalExecutor
-      - AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow
-      - AIRFLOW__CORE__FERNET_KEY=${FERNET_KEY}
+      <<: *airflow-common-env
+      # Required to handle warm shutdown of the celery workers properly
+      # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation
+      DUMB_INIT_SETSID: "0"
+    restart: always
     depends_on:
-      - postgres
-      - airflow-webserver
-    command: >
-      bash -c "airflow scheduler"
-    env_file:
-      - .env
-    volumes:
-      - ${PWD}/dags:/opt/airflow/dags  # Mount the dags folder
+      <<: *airflow-common-depends-on
+      airflow-init:
+        condition: service_completed_successfully
 
+  airflow-triggerer:
+    <<: *airflow-common
+    command: triggerer
+    healthcheck:
+      test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"']
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 30s
+    restart: always
+    depends_on:
+      <<: *airflow-common-depends-on
+      airflow-init:
+        condition: service_completed_successfully
+
+  airflow-init:
+    <<: *airflow-common
+    entrypoint: /bin/bash
+    # yamllint disable rule:line-length
+    command:
+      - -c
+      - |
+        if [[ -z "${AIRFLOW_UID}" ]]; then
+          echo
+          echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m"
+          echo "If you are on Linux, you SHOULD follow the instructions below to set "
+          echo "AIRFLOW_UID environment variable, otherwise files will be owned by root."
+          echo "For other operating systems you can get rid of the warning with manually created .env file:"
+          echo "    See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user"
+          echo
+        fi
+        one_meg=1048576
+        mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg))
+        cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat)
+        disk_available=$$(df / | tail -1 | awk '{print $$4}')
+        warning_resources="false"
+        if (( mem_available < 4000 )) ; then
+          echo
+          echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m"
+          echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))"
+          echo
+          warning_resources="true"
+        fi
+        if (( cpus_available < 2 )); then
+          echo
+          echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m"
+          echo "At least 2 CPUs recommended. You have $${cpus_available}"
+          echo
+          warning_resources="true"
+        fi
+        if (( disk_available < one_meg * 10 )); then
+          echo
+          echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m"
+          echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))"
+          echo
+          warning_resources="true"
+        fi
+        if [[ $${warning_resources} == "true" ]]; then
+          echo
+          echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m"
+          echo "Please follow the instructions to increase amount of resources available:"
+          echo "   https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin"
+          echo
+        fi
+        mkdir -p /sources/logs /sources/dags /sources/plugins
+        chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins}
+        exec /entrypoint airflow version
+    # yamllint enable rule:line-length
+    environment:
+      <<: *airflow-common-env
+      _AIRFLOW_DB_MIGRATE: 'true'
+      _AIRFLOW_WWW_USER_CREATE: 'true'
+      _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
+      _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
+      _PIP_ADDITIONAL_REQUIREMENTS: ''
+    user: "0:0"
+    volumes:
+      - ${AIRFLOW_PROJ_DIR:-.}:/sources
 
   minio:
     image: minio/minio:latest
diff --git a/setup.cfg.bak b/setup.cfg
similarity index 81%
rename from setup.cfg.bak
rename to setup.cfg
index ed112a2..27e2ef5 100644
--- a/setup.cfg.bak
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [metadata]
-name = semantic
+name = verdictnet
 version = 0.0.1
 description = A tool for running semantic queries on Spanish Law
 author = Alex Monras
@@ -7,7 +7,7 @@ license = MIT
 
 [options]
 package_dir=
-    =semantic
+    =verdictnet
 packages = find:
 install_requires =
     numpy
@@ -21,8 +21,8 @@ install_requires =
     PyMuPDF
 
 [options.packages.find]
-where=semantic
+where=src
 
 [options.entry_points]
 console_scripts =
-    semantic = cli:main
+    verdictnet = verdictnet.cli:main
diff --git a/src/verdictnet/cli.py b/src/verdictnet/cli.py
index 2ef9e44..d115788 100644
--- a/src/verdictnet/cli.py
+++ b/src/verdictnet/cli.py
@@ -1,6 +1,6 @@
 import argparse
 
-import query
+from verdictnet import query
 
 
 def main():
@@ -40,7 +40,7 @@ def main():
 
 
 def handle_etl(args):
-    import etl
+    from verdictnet import etl
     if args.subcommand == "clean":
         etl.clean()
     elif args.subcommand == "run":
diff --git a/src/verdictnet/etl.py b/src/verdictnet/etl.py
index c5b755b..7e65f74 100644
--- a/src/verdictnet/etl.py
+++ b/src/verdictnet/etl.py
@@ -6,14 +6,14 @@
 from slugify import slugify
 from bs4 import BeautifulSoup
 
-from config import get_config, root_path, get_fs
-from ingestion.documentspec import DocumentSpec
-from ingestion.paths import refined_path
-from models.node import Node
-from ingestion.parsers.html_parser import parse
-from storage.chroma_storage import ChromaStorage
-from storage.hybrid_storage import HybridStorage
-from storage.transaction_manager import TransactionManager
+from verdictnet.config import get_config, root_path, get_fs
+from verdictnet.ingestion.documentspec import DocumentSpec
+from verdictnet.ingestion.paths import refined_path
+from verdictnet.models.node import Node
+from verdictnet.ingestion.parsers.html_parser import parse
+from verdictnet.storage.chroma_storage import ChromaStorage
+from verdictnet.storage.hybrid_storage import HybridStorage
+from verdictnet.storage.transaction_manager import TransactionManager
 
 
 def get_docspecs(path: Path = None) -> List[Path]:
diff --git a/src/verdictnet/query.py b/src/verdictnet/query.py
index 142c794..3763911 100644
--- a/src/verdictnet/query.py
+++ b/src/verdictnet/query.py
@@ -1,9 +1,9 @@
 import os
 import textwrap
 
-from render.plain_text import PlainTextRenderer
-from storage.chroma_storage import ChromaStorage
-from storage.hybrid_storage import HybridStorage
+from verdictnet.render.plain_text import PlainTextRenderer
+from verdictnet.storage.chroma_storage import ChromaStorage
+from verdictnet.storage.hybrid_storage import HybridStorage
 
 
 def query(q_string: str, n_results: int = 3):
diff --git a/src/verdictnet/render/node_renderer.py b/src/verdictnet/render/node_renderer.py
index 27166a4..4be823f 100644
--- a/src/verdictnet/render/node_renderer.py
+++ b/src/verdictnet/render/node_renderer.py
@@ -1,4 +1,4 @@
-from models.node import Node
+from verdictnet.models.node import Node
 
 
 class NodeRenderer:
diff --git a/src/verdictnet/render/plain_text.py b/src/verdictnet/render/plain_text.py
index e7498e4..55c2cbc 100644
--- a/src/verdictnet/render/plain_text.py
+++ b/src/verdictnet/render/plain_text.py
@@ -1,5 +1,5 @@
-from models.node import Node
-from render.node_renderer import NodeRenderer
+from verdictnet.models.node import Node
+from verdictnet.render.node_renderer import NodeRenderer
 
 
 class PlainTextRenderer(NodeRenderer):

From 22106e2f76a288f95dac617cd6d2cc6f49ca7ded Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Tue, 28 Jan 2025 13:45:05 +0100
Subject: [PATCH 12/27] jurisprudencia dag working

---
 Makefile                                      | 29 +++++++-
 dags/jurisprudencia.py                        | 69 +++++++++++++++++--
 docker-compose.yml                            | 23 +++++--
 src/verdictnet/config.py                      | 25 +++++--
 .../ingestion/parsers/html_parser.py          |  4 +-
 5 files changed, 128 insertions(+), 22 deletions(-)

diff --git a/Makefile b/Makefile
index 571f4e6..0986c2e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: setup profile server etl test build clean
+.PHONY: setup install start profile server etl test build clean
 
 # Default port for the server
 PORT ?= 8000
@@ -7,13 +7,36 @@ PORT ?= 8000
 ETL_PATH ?= /path/to/docspecs
 FORCE ?= true
 
-.PHONY: setup
-setup:
+.PHONY: install
+install:
 	@echo "Installing requirements..."
 	@pip install -r requirements.txt
+
+.PHONY: setup
+setup: install
 	@echo "Generating .env file with FERNET_KEY..."
 	@python3 -c "from cryptography.fernet import Fernet; print(f'FERNET_KEY={Fernet.generate_key().decode()}')" > .env
 	@echo ".env file generated."
+	@echo "Launching minio..."
+	@docker-compose up -d minio
+	@echo "Waiting for minio to start..."
+	@until docker-compose exec minio mc ready local; do \
+		echo "Minio is not healthy yet. Retrying in 5 seconds..."; \
+		sleep 5; \
+	done
+	@echo "Setting up local alias..."
+	@docker-compose exec minio mc alias set minio http://localhost:9000 minioadmin minioadmin
+	@echo "Creating buckets..."
+	@docker-compose exec minio sh -c "mc ls minio/legal || mc mb minio/legal"
+	@docker-compose exec minio sh -c "mc ls minio/airflow-logs || mc mb minio/airflow-logs"
+	@echo "Minio setup complete. Stopping minio..."
+	@docker-compose stop minio
+	@echo "Setup complete."
+
+.PHONY: start
+start:
+	@echo "Starting up the microservices..."
+	@docker-compose up -d
 
 .PHONY: profile
 profile:
diff --git a/dags/jurisprudencia.py b/dags/jurisprudencia.py
index 6afc96d..febb97a 100644
--- a/dags/jurisprudencia.py
+++ b/dags/jurisprudencia.py
@@ -1,9 +1,7 @@
 import pendulum
 from airflow import DAG
 from airflow.operators.python import PythonOperator
-from datetime import timedelta
-
-from verdictnet.ingestion.downloader import get_item_pagination
+from datetime import timedelta, datetime
 
 # Define the default arguments
 default_args = {
@@ -16,6 +14,39 @@
     'retry_delay': timedelta(minutes=5),
 }
 
+
+def get_item_pagination_task(date: str):
+    from verdictnet.ingestion.downloader import get_item_pagination
+    return get_item_pagination(datetime.strptime(date, "%Y-%m-%d"))
+
+
+def refine_item_pagination_task(date: str):
+    from verdictnet.ingestion.downloader import refine_item_pagination
+    return refine_item_pagination(datetime.strptime(date, "%Y-%m-%d"))
+
+
+def download_pdfs_task(date: str):
+    from verdictnet.ingestion.downloader import download_pdfs
+    return download_pdfs(datetime.strptime(date, "%Y-%m-%d"))
+
+
+def parse_pdfs_task(date: str):
+    from verdictnet.ingestion.downloader import parse_pdfs
+    return parse_pdfs(datetime.strptime(date, "%Y-%m-%d"))
+
+
+def ingest_pdfs_task(date: str):
+    from verdictnet.ingestion.downloader import ingest_pdfs
+    from verdictnet.storage.transaction_manager import TransactionManager
+    from verdictnet.config import get_config
+
+    transaction_manager = TransactionManager.get_transaction_manager(get_config())
+    dataset_uuid = transaction_manager.init_dataset("Jurisprudencia")
+    return ingest_pdfs(date=datetime.strptime(date, "%Y-%m-%d"),
+                transaction_manager=transaction_manager,
+                dataset_uuid=dataset_uuid)
+
+
 # Define the DAG
 with DAG(
     'query_poderjudicial',
@@ -24,7 +55,35 @@
     schedule='@daily',
     catchup=True,
 ):
-    item_pagination = PythonOperator(
+    item_pagination_task = PythonOperator(
         task_id='get_item_pagination',
-        python_callable=get_item_pagination,
+        python_callable=get_item_pagination_task,
+        op_kwargs={'date': "{{ ds }}"},
+    )
+
+    refine_pagination_task = PythonOperator(
+        task_id='refine_item_pagination',
+        python_callable=refine_item_pagination_task,
+        op_kwargs={'date': "{{ ds }}"},
     )
+
+    download_pdfs = PythonOperator(
+        task_id='download_pdfs',
+        python_callable=download_pdfs_task,
+        op_kwargs={'date': "{{ ds }}"},
+    )
+
+    parse_pdfs = PythonOperator(
+        task_id='parse_pdfs',
+        python_callable=parse_pdfs_task,
+        op_kwargs={'date': "{{ ds }}"},
+    )
+
+    ingest_pdfs = PythonOperator(
+        task_id='ingest_pdfs',
+        python_callable=ingest_pdfs_task,
+        op_kwargs={'date': "{{ ds }}"},
+    )
+
+    item_pagination_task >> refine_pagination_task >> download_pdfs >> parse_pdfs >> ingest_pdfs
+
diff --git a/docker-compose.yml b/docker-compose.yml
index 3ec9cc6..b804f34 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -51,16 +51,30 @@ x-airflow-common:
   # and uncomment the "build" line below, Then run `docker-compose build` to build the images.
   # image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.10.4}
   build: .
+  env_file:
+    - .env
   environment:
     &airflow-common-env
     AIRFLOW__CORE__EXECUTOR: CeleryExecutor
     AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
     AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow
     AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
-    AIRFLOW__CORE__FERNET_KEY: ''
+    AIRFLOW__CORE__FERNET_KEY: ${FERNET_KEY}
     AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
-    AIRFLOW__CORE__LOAD_EXAMPLES: 'true'
+    AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
     AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session'
+    _AIRFLOW_WWW_USER_CREATE: 'true'
+    _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
+    _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
+    # Airflow reads AIRFLOW_CONN_MINIO as the JSON definition of connection "minio"; the value must be valid JSON.
+    AIRFLOW_CONN_MINIO: '{
+      "conn_type": "s3",
+      "login": "minioadmin",
+      "password": "minioadmin",
+      "host": "minio", "port": 9000,
+      "schema": "http",
+      "extra": {"endpoint_url": "http://minio:9000"}
+    }'
     # yamllint disable rule:line-length
     # Use simple http server on scheduler for health checks
     # See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server
@@ -255,14 +269,11 @@ services:
         fi
         mkdir -p /sources/logs /sources/dags /sources/plugins
         chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins}
-        exec /entrypoint airflow version
+        exec airflow db init
     # yamllint enable rule:line-length
     environment:
       <<: *airflow-common-env
       _AIRFLOW_DB_MIGRATE: 'true'
-      _AIRFLOW_WWW_USER_CREATE: 'true'
-      _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
-      _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
       _PIP_ADDITIONAL_REQUIREMENTS: ''
     user: "0:0"
     volumes:
diff --git a/src/verdictnet/config.py b/src/verdictnet/config.py
index 8c6ab36..f48663b 100644
--- a/src/verdictnet/config.py
+++ b/src/verdictnet/config.py
@@ -5,6 +5,7 @@
 from typing import Optional
 
 import fsspec
+from airflow.hooks.base import BaseHook
 
 logging.basicConfig(
     level=logging.INFO,
@@ -30,13 +31,25 @@ def get_config():
 def configure_fsspec():
     config = get_config()
 
-    s3_config = {
-        "key": os.getenv("AWS_ACCESS_KEY_ID", config.get('s3', 'key')),
-        "secret": os.getenv("AWS_SECRET_ACCESS_KEY", config.get('s3', 'secret')),
-        "client_kwargs": {
-            "endpoint_url": os.getenv("S3_ENDPOINT", config.get('s3', 'endpoint_url'))
+    # Check if running within Airflow
+    if 'AIRFLOW_HOME' in os.environ:
+        # Retrieve the connection details from Airflow
+        connection = BaseHook.get_connection('minio')
+        s3_config = {
+            "key": connection.login,
+            "secret": connection.password,
+            "client_kwargs": {
+                "endpoint_url": connection.extra_dejson.get('endpoint_url')
+            }
+        }
+    else:
+        s3_config = {
+            "key": os.getenv("AWS_ACCESS_KEY_ID", config.get('s3', 'key')),
+            "secret": os.getenv("AWS_SECRET_ACCESS_KEY", config.get('s3', 'secret')),
+            "client_kwargs": {
+                "endpoint_url": os.getenv("S3_ENDPOINT", config.get('s3', 'endpoint_url'))
+            }
         }
-    }
 
     fsspec.config.conf = {
         "s3": s3_config
diff --git a/src/verdictnet/ingestion/parsers/html_parser.py b/src/verdictnet/ingestion/parsers/html_parser.py
index 30db5f0..f152bc8 100644
--- a/src/verdictnet/ingestion/parsers/html_parser.py
+++ b/src/verdictnet/ingestion/parsers/html_parser.py
@@ -1,5 +1,5 @@
-from ingestion.documentspec import DocumentSpec
-from models.node import Node
+from verdictnet.ingestion.documentspec import DocumentSpec
+from verdictnet.models.node import Node
 
 
 def next_class(tags):

From 9d2499f6b48a9865790121b7ccc7e6a0da4b49f8 Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Tue, 28 Jan 2025 17:47:36 +0100
Subject: [PATCH 13/27] fix imports

---
 src/verdictnet/cli.py                       |  2 +-
 src/verdictnet/frontend/server/server.py    | 16 ++++++++--------
 src/verdictnet/frontend/server/websocket.py |  6 +++---
 src/verdictnet/ingestion/downloader.py      |  1 -
 src/verdictnet/ragagent.py                  |  8 +++-----
 src/verdictnet/render/html.py               |  4 ++--
 6 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/src/verdictnet/cli.py b/src/verdictnet/cli.py
index d115788..e4a0792 100644
--- a/src/verdictnet/cli.py
+++ b/src/verdictnet/cli.py
@@ -54,7 +54,7 @@ def handle_query(args):
 
 
 def handle_server(args):
-    from frontend.server import server
+    from verdictnet.frontend.server import server
     server.main()
 
 
diff --git a/src/verdictnet/frontend/server/server.py b/src/verdictnet/frontend/server/server.py
index 0fa4907..3c667ad 100644
--- a/src/verdictnet/frontend/server/server.py
+++ b/src/verdictnet/frontend/server/server.py
@@ -9,14 +9,14 @@
 from starlette.staticfiles import StaticFiles
 from starlette.websockets import WebSocketDisconnect
 
-from config import get_config
-from etl import get_files
-from frontend import paths
-from ragagent import RAGAgent
-from frontend.server.websocket import Connection
-from frontend.server.dto.websocket import ConnectionId, DisplayDocuments
-from render.html import HTMLRenderer
-from storage.hybrid_storage import HybridStorage
+from verdictnet.config import get_config
+from verdictnet.etl import get_files
+from verdictnet.frontend import paths
+from verdictnet.ragagent import RAGAgent
+from verdictnet.frontend.server.websocket import Connection
+from verdictnet.frontend.server.dto.websocket import ConnectionId, DisplayDocuments
+from verdictnet.render.html import HTMLRenderer
+from verdictnet.storage.hybrid_storage import HybridStorage
 
 conf = get_config()
 
diff --git a/src/verdictnet/frontend/server/websocket.py b/src/verdictnet/frontend/server/websocket.py
index 3d3cc3b..7823c9d 100644
--- a/src/verdictnet/frontend/server/websocket.py
+++ b/src/verdictnet/frontend/server/websocket.py
@@ -5,9 +5,9 @@
 from PyPDF2 import PdfFileReader
 from starlette.websockets import WebSocket
 
-from frontend.paths import uploads
-from ragagent import RAGAgent
-from frontend.server.dto.websocket import WebSocketMessage, ChatQueryMessage, FileUploaded, ChatResponseMessage, \
+from verdictnet.frontend.paths import uploads
+from verdictnet.ragagent import RAGAgent
+from verdictnet.frontend.server.dto.websocket import WebSocketMessage, ChatQueryMessage, FileUploaded, ChatResponseMessage, \
     UnfoldNodes
 
 
diff --git a/src/verdictnet/ingestion/downloader.py b/src/verdictnet/ingestion/downloader.py
index 8f76502..c309b19 100644
--- a/src/verdictnet/ingestion/downloader.py
+++ b/src/verdictnet/ingestion/downloader.py
@@ -307,7 +307,6 @@ def ingest_pdfs(date: datetime, transaction_manager: TransactionManager, force=F
     transaction_manager.store_with_transaction(nodes, parent_uuid=dataset_uuid)
 
 
-
 if __name__ == "__main__":
     start_date = datetime.today() - timedelta(days=20)
     end_date = datetime.today() - timedelta(days=1)
diff --git a/src/verdictnet/ragagent.py b/src/verdictnet/ragagent.py
index 6afdb49..7bb6424 100644
--- a/src/verdictnet/ragagent.py
+++ b/src/verdictnet/ragagent.py
@@ -2,11 +2,9 @@
 import logging
 from typing import Optional, List
 
-import chromadb
-
-from models.node import Node
-from query import print_results
-from storage.hybrid_storage import HybridStorage
+from verdictnet.models.node import Node
+from verdictnet.query import print_results
+from verdictnet.storage.hybrid_storage import HybridStorage
 
 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
diff --git a/src/verdictnet/render/html.py b/src/verdictnet/render/html.py
index 8e377e2..f6efb5f 100644
--- a/src/verdictnet/render/html.py
+++ b/src/verdictnet/render/html.py
@@ -1,5 +1,5 @@
-from models.node import Node
-from render.node_renderer import NodeRenderer
+from verdictnet.models.node import Node
+from verdictnet.render.node_renderer import NodeRenderer
 
 
 class HTMLRenderer(NodeRenderer):

From 7b82aac111e035312db985e4064d2b5447f2d66c Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Tue, 28 Jan 2025 17:47:56 +0100
Subject: [PATCH 14/27] define local config files

---
 Dockerfile               |  1 -
 config/local/airflow.cfg |  6 ++++++
 config/local/config.ini  | 29 +++++++++++++++++++++++++++++
 docker-compose.yml       |  5 +++--
 4 files changed, 38 insertions(+), 3 deletions(-)
 create mode 100644 config/local/airflow.cfg
 create mode 100644 config/local/config.ini

diff --git a/Dockerfile b/Dockerfile
index 50bf3a6..6199b5a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,7 +4,6 @@ USER root
 COPY requirements.txt .
 COPY src/ src/
 COPY setup.py .
-COPY config.ini .
 RUN chown -R airflow src/
 RUN apt-get update && apt-get install -y build-essential
 
diff --git a/config/local/airflow.cfg b/config/local/airflow.cfg
new file mode 100644
index 0000000..7b7fc12
--- /dev/null
+++ b/config/local/airflow.cfg
@@ -0,0 +1,6 @@
+[logging]
+remote_logging = True
+remote_base_log_folder = s3://airflow-logs
+remote_log_conn_id = minio
+encrypt_s3_logs = False
+
diff --git a/config/local/config.ini b/config/local/config.ini
new file mode 100644
index 0000000..18426dd
--- /dev/null
+++ b/config/local/config.ini
@@ -0,0 +1,29 @@
+[storage]
+type: s3
+bucket: legal
+collection: legal-database
+raw: datalake/raw/
+refined: datalake/refined/
+html: datalake/html/
+
+[chroma]
+type: http
+host: chromadb
+port: 8000
+
+[neo4j]
+url: bolt://neo4j:7687
+user: neo4j
+password: neo4jtest
+
+[embedding]
+model_name_or_path: paraphrase-mpnet-base-v2
+cache: cache/
+
+[rag]
+n_results: 5
+
+[s3]
+key = minioadmin
+secret = minioadmin
+endpoint_url = http://minio:9000
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index b804f34..dec68ba 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -52,7 +52,7 @@ x-airflow-common:
   # image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.10.4}
   build: .
   env_file:
-    - .env
+    - config/local/.env
   environment:
     &airflow-common-env
     AIRFLOW__CORE__EXECUTOR: CeleryExecutor
@@ -73,7 +73,7 @@ x-airflow-common:
       "host": "minio",
       "port": 9000,
       "schema": "http",
-      "extra": {"endpoint_url": "http://minio:9000\"}
+      "extra": {"endpoint_url": "http://minio:9000"}
     }'
     # yamllint disable rule:line-length
     # Use simple http server on scheduler for health checks
@@ -91,6 +91,7 @@ x-airflow-common:
     - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
     - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config
     - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins
+    - ${PWD}/config/local/config.ini:/opt/airflow/config.ini
   user: "${AIRFLOW_UID:-50000}:0"
   depends_on:
     &airflow-common-depends-on

From 201a65a25a181e10b137ba3624dfee53cdf38c99 Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Wed, 29 Jan 2025 18:43:54 +0100
Subject: [PATCH 15/27] improve imports

---
 src/verdictnet/cli.py                  |   2 +-
 src/verdictnet/config.py               |  19 +++--
 src/verdictnet/etl.py                  | 111 ++++++++++++++++---------
 src/verdictnet/ingestion/downloader.py |   1 +
 src/verdictnet/models/node.py          |   3 +-
 5 files changed, 90 insertions(+), 46 deletions(-)

diff --git a/src/verdictnet/cli.py b/src/verdictnet/cli.py
index e4a0792..d1db693 100644
--- a/src/verdictnet/cli.py
+++ b/src/verdictnet/cli.py
@@ -46,7 +46,7 @@ def handle_etl(args):
     elif args.subcommand == "run":
         etl.run(force_download=args.force, path=args.path)
     else:
-        print("No {args.subcommand} subcommand found.")
+        print(f"No {args.subcommand} subcommand found.")
 
 
 def handle_query(args):
diff --git a/src/verdictnet/config.py b/src/verdictnet/config.py
index f48663b..20dda55 100644
--- a/src/verdictnet/config.py
+++ b/src/verdictnet/config.py
@@ -1,4 +1,5 @@
 import configparser
+import json
 import logging
 import os
 from pathlib import Path
@@ -13,22 +14,30 @@
     format="%(asctime)s - %(name)s.%(funcName)s [%(levelname)s]: %(message)s",
 )
 
+logger = logging.getLogger(__name__)
+
+_fsspec_configured = False
+
 
 def root_path():
-    return Path(__file__).parent.parent.parent
+    return Path(__file__).parent.parent
 
 
 def get_config():
     # Create a ConfigParser instance
     config = configparser.ConfigParser()
 
-    # Load the configuration file from the current folder
+    # Load the configuration file from the current folder, or from the package root folder
     config.read(filenames=['config.ini', root_path() / 'config.ini'])
 
     return config
 
 
 def configure_fsspec():
+    global _fsspec_configured
+    if _fsspec_configured:
+        return
+
     config = get_config()
 
     # Check if running within Airflow
@@ -55,9 +64,9 @@ def configure_fsspec():
         "s3": s3_config
     }
 
-
-# Call the function to configure fsspec
-configure_fsspec()
+    # TODO: Warning. This is potentially logging sensitive passwords. Password obfuscation should be implemented.
+    logger.info("fsspec configured with: %s", json.dumps(fsspec.config.conf))
+    _fsspec_configured = True
 
 
 def get_fs(conf: Optional[configparser.ConfigParser] = None):
diff --git a/src/verdictnet/etl.py b/src/verdictnet/etl.py
index 7e65f74..b48e04d 100644
--- a/src/verdictnet/etl.py
+++ b/src/verdictnet/etl.py
@@ -1,4 +1,5 @@
 import os
+from configparser import ConfigParser
 from pathlib import Path
 from typing import List
 
@@ -6,15 +7,16 @@
 from slugify import slugify
 from bs4 import BeautifulSoup
 
-from verdictnet.config import get_config, root_path, get_fs
+from verdictnet.config import get_config, root_path, logging
 from verdictnet.ingestion.documentspec import DocumentSpec
 from verdictnet.ingestion.paths import refined_path
 from verdictnet.models.node import Node
 from verdictnet.ingestion.parsers.html_parser import parse
-from verdictnet.storage.chroma_storage import ChromaStorage
 from verdictnet.storage.hybrid_storage import HybridStorage
 from verdictnet.storage.transaction_manager import TransactionManager
 
+logger = logging.getLogger(__name__)
+
 
 def get_docspecs(path: Path = None) -> List[Path]:
     """
@@ -70,8 +72,68 @@ def get_document_structure(text, docspec: DocumentSpec) -> List[Node]:
     return parsed
 
 
-def ingest(main_node, docspec: DocumentSpec, storage: TransactionManager):
-    all_nodes = main_node.get_all(level=docspec.embed_level)
+def download_doc(docspec: DocumentSpec, conf, force_download=False):
+    """
+    Download the document and save it to the raw folder in html format
+    """
+    slug_name = slugify(docspec.name)
+
+    target_filename = f'{slug_name}.html'
+    raw_path = root_path() / conf['storage']['raw'] / target_filename
+
+    # Download documents
+    if force_download or not os.path.exists(raw_path):
+        print(f"Downloading document `{docspec.name}`...")
+        text = download(docspec)
+
+        os.makedirs(os.path.dirname(raw_path), exist_ok=True)
+        with open(raw_path, 'w') as file:
+            file.write(text)
+
+
+def refine(docspec, conf):
+    """
+    Take the raw html document stored by download_doc and refine it to a json format
+    """
+    slug_name = slugify(docspec.name)
+    # Read from the raw folder: that is where download_doc() stores the html file
+    target_filename = f'{slug_name}.html'
+    raw_path = root_path() / conf['storage']['raw'] / target_filename
+
+    with open(raw_path, 'r') as file:
+        text = file.read()
+
+    target = refined_path() + f'{slug_name}.json'
+
+    main_node = get_document_structure(text, docspec=docspec)
+
+    main_node[0].save(target)
+    print(f"Saved refined in '{target_filename}'.")
+
+
+def render_html(docspec: DocumentSpec, conf: ConfigParser):
+    slug_name = slugify(docspec.name)
+    main_node_path = refined_path() + f'{slug_name}.json'
+    main_node = Node.load(main_node_path)
+
+    html_path = root_path() / conf['storage']['html'] / f'{slug_name}.html'
+    os.makedirs(os.path.dirname(html_path), exist_ok=True)
+    with open(html_path, 'w', encoding='utf-8') as file:
+        file.write(main_node[0].html(
+            preamble="""
+            <html lang="es"><head><meta charset="utf-8" /></head>
+            """
+        ))
+    logger.info(f"HTML saved to '{slug_name}.html'.")
+
+
+def ingest(docspec: DocumentSpec, conf: ConfigParser):
+    slug_name = slugify(docspec.name)
+    main_node_path = refined_path() + f'{slug_name}.json'
+    main_node = Node.load(main_node_path)
+
+    storage = TransactionManager.get_transaction_manager(conf)
+    # all_nodes = main_node.get_all(level=docspec.embed_level)
 
     storage.store_with_transaction(main_node)
 
@@ -93,52 +155,23 @@ def clean():
 def run(force_download=False, path=None):
     conf = get_config()
 
-    transaction_manager = TransactionManager.get_transaction_manager(conf)
-
     # Load Docspecs
     filenames = get_docspecs(path)
     docspecs = [DocumentSpec.load(filename) for filename in filenames]
 
     for docspec in docspecs:
 
-        slug_name = slugify(docspec.name)
-
-        # Download documents
-        target_filename = f'{slug_name}.html'
-        raw_path = root_path() / conf['storage']['raw'] / target_filename
-
-        if force_download or not os.path.exists(raw_path):
-            print(f"Downloading document `{docspec.name}`...")
-            text = download(docspec)
-
-            os.makedirs(os.path.dirname(raw_path), exist_ok=True)
-            with open(raw_path, 'w') as file:
-                file.write(text)
-        else:
-            with open(raw_path, 'r') as file:
-                text = file.read()
+        # Download document
+        download_doc(docspec, conf, force_download)
 
         # Refining documents
-        target = refined_path() + f'{slug_name}.json'
-
-        main_node = get_document_structure(text, docspec=docspec)
-
-        main_node[0].save(target)
-        print(f"Saved refined in '{target_filename}'.")
+        refine(docspec, conf)
 
-        # Saving json
-        html_path = root_path() / conf['storage']['html'] / f'{slug_name}.html'
-        os.makedirs(os.path.dirname(html_path), exist_ok=True)
-        with open(html_path, 'w', encoding='utf-8') as file:
-            file.write(main_node[0].html(
-                preamble="""
-                <html lang="es"><head><meta charset="utf-8" /></head>
-                """
-            ))
-        print(f"HTML saved to '{slug_name}.html'.")
+        # Render html
+        render_html(docspec, conf)
 
         # Ingesting into vector database
-        ingest(main_node[0], docspec, storage=transaction_manager)
+        ingest(docspec, conf)
 
 
 if __name__ == "__main__":
diff --git a/src/verdictnet/ingestion/downloader.py b/src/verdictnet/ingestion/downloader.py
index c309b19..82c7df3 100644
--- a/src/verdictnet/ingestion/downloader.py
+++ b/src/verdictnet/ingestion/downloader.py
@@ -88,6 +88,7 @@ def create_session():
         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:132.0) Gecko/20100101 Firefox/132.0"
     }
 
+    logger.info("Creating session: %s", f"{BASE_URL}/search/")
     response = session.get(f"{BASE_URL}/search/", headers=headers_dict)
 
     if response.status_code != 200:
diff --git a/src/verdictnet/models/node.py b/src/verdictnet/models/node.py
index 07497fc..04bf3bd 100644
--- a/src/verdictnet/models/node.py
+++ b/src/verdictnet/models/node.py
@@ -83,7 +83,8 @@ def save(self, path):
         with fs.open(path, 'w', encoding='utf8') as file:
             json.dump(self.json(), file, indent=4, ensure_ascii=False)
 
-    def load(self, path):
+    @classmethod
+    def load(cls, path):
         """
         Load the node from a file
         """

From eaf782562da50f6b4d0cb0e07abbf37356e36eca Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Wed, 29 Jan 2025 18:44:00 +0100
Subject: [PATCH 16/27] improve testing imports

---
 tests/conftest.py                         | 23 +++++++++++++++---
 tests/ingestion/test_documentspec.py      |  2 +-
 tests/ingestion/test_ingest.py            |  2 +-
 tests/ingestion/test_parser.py            |  2 +-
 tests/ingestion/test_pdf_parser.py        |  4 ++--
 tests/resources/config.ini                | 29 +++++++++++++++++++++++
 tests/storage/conftest.py                 |  3 +--
 tests/storage/test_adapter.py             |  4 ++--
 tests/storage/test_chroma_storage.py      |  6 ++---
 tests/storage/test_graph_storage.py       |  4 ++--
 tests/storage/test_hybrid_storage.py      |  4 ++--
 tests/storage/test_transaction_manager.py |  6 ++---
 12 files changed, 67 insertions(+), 22 deletions(-)
 create mode 100644 tests/resources/config.ini

diff --git a/tests/conftest.py b/tests/conftest.py
index 312fe11..34c4f96 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,17 +1,34 @@
+import configparser
 from pathlib import Path
+from unittest.mock import patch
 
 from bs4 import BeautifulSoup
 from pytest import fixture
 
-from config import root_path
-from ingestion.documentspec import DocumentSpec
-from ingestion.parsers.html_parser import parse
+from verdictnet.config import get_config
+
+
+@fixture(autouse=True)
+def mock_config():
+    def mock_get_config():
+        config = configparser.ConfigParser()
+        config.read('resources/config.ini')
+        return config
+
+    with patch('verdictnet.config.get_config', mock_get_config):
+        yield
+
+
+from verdictnet.ingestion.documentspec import DocumentSpec
+from verdictnet.ingestion.parsers.html_parser import parse
 
 resources = Path(__file__).parent / "resources"
 
 
+
 @fixture
 def static_files():
+    from verdictnet.config import root_path
     return root_path() / "verdictnet/frontend/static/css"
 
 
diff --git a/tests/ingestion/test_documentspec.py b/tests/ingestion/test_documentspec.py
index f3fc49a..b954bb3 100644
--- a/tests/ingestion/test_documentspec.py
+++ b/tests/ingestion/test_documentspec.py
@@ -3,7 +3,7 @@
 from pytest import fixture
 
 from conftest import resources
-from ingestion.documentspec import DocumentSpec
+from verdictnet.ingestion.documentspec import DocumentSpec
 
 
 @fixture
diff --git a/tests/ingestion/test_ingest.py b/tests/ingestion/test_ingest.py
index d8610a1..3b4e71c 100644
--- a/tests/ingestion/test_ingest.py
+++ b/tests/ingestion/test_ingest.py
@@ -2,7 +2,7 @@
 from unittest.mock import patch
 import pytest
 
-from etl import get_docspecs
+from verdictnet.etl import get_docspecs
 
 
 @pytest.fixture
diff --git a/tests/ingestion/test_parser.py b/tests/ingestion/test_parser.py
index eb28839..260db44 100644
--- a/tests/ingestion/test_parser.py
+++ b/tests/ingestion/test_parser.py
@@ -2,7 +2,7 @@
 from bs4 import BeautifulSoup
 
 from conftest import codigo_civil_spec
-from ingestion.parsers.html_parser import parse
+from verdictnet.ingestion.parsers.html_parser import parse
 
 
 @fixture(scope='class')
diff --git a/tests/ingestion/test_pdf_parser.py b/tests/ingestion/test_pdf_parser.py
index b6feae1..fbc1a0e 100644
--- a/tests/ingestion/test_pdf_parser.py
+++ b/tests/ingestion/test_pdf_parser.py
@@ -2,8 +2,8 @@
 import os
 
 from conftest import resources
-from models.node import Node
-from ingestion.parsers.pdf_parser import extract_paragraphs
+from verdictnet.models.node import Node
+from verdictnet.ingestion.parsers.pdf_parser import extract_paragraphs
 
 
 @pytest.fixture
diff --git a/tests/resources/config.ini b/tests/resources/config.ini
new file mode 100644
index 0000000..cd2617e
--- /dev/null
+++ b/tests/resources/config.ini
@@ -0,0 +1,29 @@
+[storage]
+type: s3
+bucket: legal
+collection: legal-database
+raw: datalake/raw/
+refined: datalake/refined/
+html: datalake/html/
+
+[chroma]
+type: http
+host: localhost
+port: 8000
+
+[neo4j]
+url: bolt://localhost:7687
+user: neo4j
+password: neo4jtest
+
+[embedding]
+model_name_or_path: paraphrase-mpnet-base-v2
+cache: cache/
+
+[rag]
+n_results: 5
+
+[s3]
+key = minioadmin
+secret = minioadmin
+endpoint_url = http://localhost:9000
\ No newline at end of file
diff --git a/tests/storage/conftest.py b/tests/storage/conftest.py
index d984c8f..97e59dc 100644
--- a/tests/storage/conftest.py
+++ b/tests/storage/conftest.py
@@ -1,7 +1,6 @@
 import pytest
 
-from storage.graph_storage import GraphStorage
-from storage.hybrid_storage import HybridStorage
+from verdictnet.storage.hybrid_storage import HybridStorage
 
 
 @pytest.fixture
diff --git a/tests/storage/test_adapter.py b/tests/storage/test_adapter.py
index 5e45abe..099286a 100644
--- a/tests/storage/test_adapter.py
+++ b/tests/storage/test_adapter.py
@@ -1,5 +1,5 @@
-from models.node import Node
-from storage.adapters import NodeAdapter
+from verdictnet.models.node import Node
+from verdictnet.storage.adapters import NodeAdapter
 
 
 def test_to_neo4j_with_relationships_single_node():
diff --git a/tests/storage/test_chroma_storage.py b/tests/storage/test_chroma_storage.py
index cc9685e..46c805d 100644
--- a/tests/storage/test_chroma_storage.py
+++ b/tests/storage/test_chroma_storage.py
@@ -1,9 +1,9 @@
 import chromadb
 import pytest
 from unittest.mock import MagicMock
-from models.node import Node
-from storage.chroma_storage import ChromaStorage
-from embedding import Embedding
+from verdictnet.models.node import Node
+from verdictnet.storage.chroma_storage import ChromaStorage
+from verdictnet.embedding import Embedding
 
 
 @pytest.fixture
diff --git a/tests/storage/test_graph_storage.py b/tests/storage/test_graph_storage.py
index c2d275f..673c0bd 100644
--- a/tests/storage/test_graph_storage.py
+++ b/tests/storage/test_graph_storage.py
@@ -5,8 +5,8 @@
 
 from neo4j import Result
 
-from models.node import Node
-from storage.graph_storage import GraphStorage
+from verdictnet.models.node import Node
+from verdictnet.storage.graph_storage import GraphStorage
 
 
 @pytest.fixture
diff --git a/tests/storage/test_hybrid_storage.py b/tests/storage/test_hybrid_storage.py
index 095351f..c7e3100 100644
--- a/tests/storage/test_hybrid_storage.py
+++ b/tests/storage/test_hybrid_storage.py
@@ -1,7 +1,7 @@
 import pytest
 from unittest.mock import MagicMock
-from models.node import Node
-from storage.hybrid_storage import HybridStorage
+from verdictnet.models.node import Node
+from verdictnet.storage.hybrid_storage import HybridStorage
 
 
 @pytest.fixture
diff --git a/tests/storage/test_transaction_manager.py b/tests/storage/test_transaction_manager.py
index 51adc15..78d19cc 100644
--- a/tests/storage/test_transaction_manager.py
+++ b/tests/storage/test_transaction_manager.py
@@ -1,8 +1,8 @@
 import pytest
 from unittest.mock import MagicMock, call
-from models.node import Node
-from storage.transaction_manager import TransactionManager
-from storage.hybrid_storage import HybridStorage
+from verdictnet.models.node import Node
+from verdictnet.storage.transaction_manager import TransactionManager
+from verdictnet.storage.hybrid_storage import HybridStorage
 
 
 @pytest.fixture

From d73b155919ff7f7171fd9f4e5325e8e5994eae30 Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Wed, 29 Jan 2025 18:44:15 +0100
Subject: [PATCH 17/27] add local config

---
 config/local/airflow.cfg | 6 ++++++
 docker-compose.yml       | 3 ++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/config/local/airflow.cfg b/config/local/airflow.cfg
index 7b7fc12..b396ebe 100644
--- a/config/local/airflow.cfg
+++ b/config/local/airflow.cfg
@@ -4,3 +4,9 @@ remote_base_log_folder = s3://airflow-logs
 remote_log_conn_id = minio
 encrypt_s3_logs = False
 
+[webserver]
+default_dag_run_display_number = 250
+expose_config = True
+
+[celery]
+worker_concurrency = 2
diff --git a/docker-compose.yml b/docker-compose.yml
index dec68ba..ca88d3d 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -85,13 +85,14 @@ x-airflow-common:
     _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-}
     # The following line can be used to set a custom config file, stored in the local config folder
     # If you want to use it, outcomment it and replace airflow.cfg with the name of your config file
-    # AIRFLOW_CONFIG: '/opt/airflow/config/airflow.cfg'
+    AIRFLOW_CONFIG: '/opt/airflow/config/airflow.cfg'
   volumes:
     - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags
     - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
     - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config
     - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins
     - ${PWD}/config/local/config.ini:/opt/airflow/config.ini
+    - ${PWD}/config/local/airflow.cfg:/opt/airflow/config/airflow.cfg
   user: "${AIRFLOW_UID:-50000}:0"
   depends_on:
     &airflow-common-depends-on

From a309e25c8861d27858363e08dde7c9bae6eee05e Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Wed, 29 Jan 2025 19:25:16 +0100
Subject: [PATCH 18/27] Add codigo civil DAG

---
 dags/codigo_civil_penal.py | 91 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)
 create mode 100644 dags/codigo_civil_penal.py

diff --git a/dags/codigo_civil_penal.py b/dags/codigo_civil_penal.py
new file mode 100644
index 0000000..25e34b0
--- /dev/null
+++ b/dags/codigo_civil_penal.py
@@ -0,0 +1,91 @@
+import pendulum
+from airflow import DAG
+from airflow.operators.empty import EmptyOperator
+from airflow.operators.python import PythonOperator
+from datetime import timedelta
+
+from slugify import slugify
+
+from verdictnet.config import get_config
+from verdictnet.etl import get_docspecs, download_doc, refine, render_html, ingest
+from verdictnet.ingestion.documentspec import DocumentSpec
+
+# Define the default arguments
+default_args = {
+    'owner': 'airflow',
+    'depends_on_past': False,
+    'start_date': pendulum.today('UTC'),  # Start date: today at 00:00 UTC
+    'email_on_failure': False,
+    'email_on_retry': False,
+    'retries': 1,
+    'retry_delay': timedelta(minutes=5),
+}
+
+
+def lazy_download_doc(docspec):
+    conf = get_config()
+    download_doc(docspec, conf, force_download=False)
+
+
+def lazy_refine(docspec):
+    conf = get_config()
+    refine(docspec, conf)
+
+
+def lazy_render_html(docspec):
+    conf = get_config()
+    render_html(docspec, conf)
+
+
+def lazy_ingest(docspec):
+    conf = get_config()
+    ingest(docspec, conf)
+
+
+# Define the DAG
+with DAG(
+        'download_codigos',
+        default_args=default_args,
+        description='Download Codigo Civil y Penal',
+        schedule='@manual',
+        catchup=False,
+):
+    filenames = get_docspecs()
+    docspecs = [DocumentSpec.load(filename) for filename in filenames]
+
+    download_task = {}
+    refine_task = {}
+    render_task = {}
+    ingest_task = {}
+
+    start = EmptyOperator(task_id='start_task')
+
+    for docspec in docspecs:
+        name = slugify(docspec.name)
+
+        download_task[name] = PythonOperator(
+            task_id=f'download_{name}',
+            python_callable=lazy_download_doc,
+            op_args=[docspec],
+        )
+
+        refine_task[name] = PythonOperator(
+            task_id=f'refine_{name}',
+            python_callable=lazy_refine,
+            op_args=[docspec],
+        )
+
+        render_task[name] = PythonOperator(
+            task_id=f'render_{name}',
+            python_callable=lazy_render_html,
+            op_args=[docspec],
+        )
+
+        ingest_task[name] = PythonOperator(
+            task_id=f'ingest_{name}',
+            python_callable=lazy_ingest,
+            op_args=[docspec],
+        )
+
+        start >> download_task[name] >> refine_task[name]
+        download_task[name] >> render_task[name] >> ingest_task[name]

From cd6676286f87ac3199e2b3abad66fe0dca2fdfed Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Wed, 29 Jan 2025 19:25:50 +0100
Subject: [PATCH 19/27] add local config file

---
 config/airflow.cfg | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100755 config/airflow.cfg

diff --git a/config/airflow.cfg b/config/airflow.cfg
new file mode 100755
index 0000000..e69de29

From cb317a9dae3d69ce9e0ea8eded4497a616eed1c5 Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Tue, 4 Feb 2025 08:34:59 +0100
Subject: [PATCH 20/27] add Dockerfile and dev requirements

---
 Dockerfile                        |  18 +++--
 Makefile                          |  14 +++-
 dags/codigo_civil_penal.py        |  67 ++++++++--------
 docker-compose.yml                |   4 +-
 requirements-dev.txt              |   1 +
 requirements.txt                  |   2 -
 setup.cfg                         |   4 +
 src/verdictnet/config.py          |   3 +-
 src/verdictnet/etl.py             |  43 +++++-----
 src/verdictnet/ingestion/paths.py |  11 +++
 src/verdictnet/models/node.py     |   9 ++-
 tests/conftest.py                 |  15 ----
 tests/models/test_node.py         | 128 ++++++++++++++++++++++++++++++
 tests/resources/config.ini        |   3 +-
 tests/test_node.py                |  29 -------
 15 files changed, 239 insertions(+), 112 deletions(-)
 create mode 100644 requirements-dev.txt
 create mode 100644 tests/models/test_node.py
 delete mode 100644 tests/test_node.py

diff --git a/Dockerfile b/Dockerfile
index 6199b5a..b053566 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,13 +1,15 @@
 FROM apache/airflow:2.10.0
 
-USER root
-COPY requirements.txt .
-COPY src/ src/
-COPY setup.py .
-RUN chown -R airflow src/
-RUN apt-get update && apt-get install -y build-essential
+# Set the working directory
+WORKDIR /app
 
 # Switch to airflow user to run the application
 USER airflow
-RUN pip install -r requirements.txt
-RUN pip install .
\ No newline at end of file
+
+# Copy the requirements file and install dependencies
+COPY requirements.txt .
+
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Set the entrypoint to Airflow
+ENTRYPOINT ["airflow"]
diff --git a/Makefile b/Makefile
index 0986c2e..c30eae3 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: setup install start profile server etl test build clean
+.PHONY: setup install start stop profile server etl test build clean
 
 # Default port for the server
 PORT ?= 8000
@@ -35,9 +35,21 @@ setup: install
 
 .PHONY: start
 start:
+	@echo "Detecting virtual environment..."
+	@if [ -n "$$VIRTUAL_ENV" ]; then \
+		echo "Virtual environment detected at $$VIRTUAL_ENV"; \
+		export VIRTUAL_ENV_PATH=$$VIRTUAL_ENV; \
+	else \
+		echo "No virtual environment detected."; \
+	fi
 	@echo "Starting up the microservices..."
 	@docker-compose up -d
 
+.PHONY: stop
+stop:
+	@echo "Stopping the microservices..."
+	@docker-compose down
+
 .PHONY: profile
 profile:
 	@py-spy record -o profile.svg -- python dags/jurisprudencia.py
diff --git a/dags/codigo_civil_penal.py b/dags/codigo_civil_penal.py
index 25e34b0..2500960 100644
--- a/dags/codigo_civil_penal.py
+++ b/dags/codigo_civil_penal.py
@@ -1,5 +1,6 @@
 import pendulum
 from airflow import DAG
+from airflow.decorators import task_group
 from airflow.operators.empty import EmptyOperator
 from airflow.operators.python import PythonOperator
 from datetime import timedelta
@@ -42,50 +43,52 @@ def lazy_ingest(docspec):
     ingest(docspec, conf)
 
 
+def group(name, docspec):
+    """
+    Process a specific document
+    """
+    download_task = PythonOperator(
+        task_id=f'download_{name}',
+        python_callable=lazy_download_doc,
+        op_args=[docspec],
+    )
+
+    refine_task = PythonOperator(
+        task_id=f'refine_{name}',
+        python_callable=lazy_refine,
+        op_args=[docspec],
+    )
+
+    render_task = PythonOperator(
+        task_id=f'render_{name}',
+        python_callable=lazy_render_html,
+        op_args=[docspec],
+    )
+
+    ingest_task = PythonOperator(
+        task_id=f'ingest_{name}',
+        python_callable=lazy_ingest,
+        op_args=[docspec],
+    )
+
+    download_task >> refine_task >> render_task >> ingest_task
+
+    return download_task
+
 # Define the DAG
 with DAG(
         'download_codigos',
         default_args=default_args,
         description='Download Codigo Civil y Penal',
-        schedule='@manual',
+        schedule="@once",
         catchup=False,
 ):
     filenames = get_docspecs()
     docspecs = [DocumentSpec.load(filename) for filename in filenames]
 
-    download_task = {}
-    refine_task = {}
-    render_task = {}
-    ingest_task = {}
-
     start = EmptyOperator(task_id='start_task')
 
     for docspec in docspecs:
         name = slugify(docspec.name)
 
-        download_task[name] = PythonOperator(
-            task_id=f'download_{name}',
-            python_callable=lazy_download_doc,
-            op_args=[docspec],
-        )
-
-        refine_task[name] = PythonOperator(
-            task_id=f'refine_{name}',
-            python_callable=lazy_refine,
-            op_args=[docspec],
-        )
-
-        render_task[name] = PythonOperator(
-            task_id=f'render_{name}',
-            python_callable=lazy_render_html,
-            op_args=[docspec],
-        )
-
-        ingest_task[name] = PythonOperator(
-            task_id=f'ingest_{name}',
-            python_callable=lazy_ingest,
-            op_args=[docspec],
-        )
-
-        start >> download_task[name] >> refine_task[name]
-        download_task[name] >> render_task[name] >> ingest_task[name]
+        start >> group(name, docspec)
diff --git a/docker-compose.yml b/docker-compose.yml
index ca88d3d..8321f3b 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -86,13 +86,15 @@ x-airflow-common:
     # The following line can be used to set a custom config file, stored in the local config folder
     # If you want to use it, outcomment it and replace airflow.cfg with the name of your config file
     AIRFLOW_CONFIG: '/opt/airflow/config/airflow.cfg'
+    PYTHONPATH: /app:${PYTHONPATH:-}  # Append custom path to existing PYTHONPATH
   volumes:
     - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags
     - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
     - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config
     - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins
-    - ${PWD}/config/local/config.ini:/opt/airflow/config.ini
     - ${PWD}/config/local/airflow.cfg:/opt/airflow/config/airflow.cfg
+    - ./src/verdictnet:/app/verdictnet  # Mount local package directory to container
+    - ${PWD}/config/local/config.ini:/app/verdictnet/config.ini  # Mount local config file to container
   user: "${AIRFLOW_UID:-50000}:0"
   depends_on:
     &airflow-common-depends-on
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..ce3e9fc
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1 @@
+line_profiler
diff --git a/requirements.txt b/requirements.txt
index 5516ad4..3ecc33b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -187,7 +187,6 @@ kombu==5.4.2
 kubernetes==31.0.0
 lazy-object-proxy==1.10.0
 limits==4.0.1
-line_profiler
 linkify-it-py==2.0.3
 llama-cloud==0.1.6
 llama-index==0.12.5
@@ -358,7 +357,6 @@ wcwidth==0.2.13
 websocket-client==1.8.0
 websockets==12.0
 wirerope==1.0.0
-wordcloud==1.9.4
 wrapt==1.17.0
 WTForms==3.2.1
 yarl==1.18.3
diff --git a/setup.cfg b/setup.cfg
index 27e2ef5..f38b784 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -20,6 +20,10 @@ install_requires =
     pyarrow
     PyMuPDF
 
+[options.package_data]
+verdictnet.ingestion =
+    resources/*.json
+
 [options.packages.find]
 where=src
 
diff --git a/src/verdictnet/config.py b/src/verdictnet/config.py
index 20dda55..32a617e 100644
--- a/src/verdictnet/config.py
+++ b/src/verdictnet/config.py
@@ -20,7 +20,7 @@
 
 
 def root_path():
-    return Path(__file__).parent.parent
+    return Path(__file__).parent
 
 
 def get_config():
@@ -70,6 +70,7 @@ def configure_fsspec():
 
 
 def get_fs(conf: Optional[configparser.ConfigParser] = None):
+    configure_fsspec()
     conf = conf or get_config()
     if conf['storage']['type'] == 's3':
         fs = fsspec.filesystem("s3")
diff --git a/src/verdictnet/etl.py b/src/verdictnet/etl.py
index b48e04d..92660f1 100644
--- a/src/verdictnet/etl.py
+++ b/src/verdictnet/etl.py
@@ -7,9 +7,9 @@
 from slugify import slugify
 from bs4 import BeautifulSoup
 
-from verdictnet.config import get_config, root_path, logging
+from verdictnet.config import get_config, root_path, logging, get_fs
 from verdictnet.ingestion.documentspec import DocumentSpec
-from verdictnet.ingestion.paths import refined_path
+from verdictnet.ingestion.paths import refined_path, raw_path, html_path
 from verdictnet.models.node import Node
 from verdictnet.ingestion.parsers.html_parser import parse
 from verdictnet.storage.hybrid_storage import HybridStorage
@@ -31,7 +31,9 @@ def get_docspecs(path: Path = None) -> List[Path]:
                 filenames.append(Path(root + '/' + file))
 
     if not filenames:
-        print(f"No document Spec filed found in provided folder {path}")
+        logger.warning(f"No document Spec files found in provided folder {path}")
+    else:
+        logger.info(f"Found {len(filenames)} document specs: %s", ",".join([str(f) for f in filenames]))
 
     return filenames
 
@@ -76,18 +78,19 @@ def download_doc(docspec: DocumentSpec, conf, force_download=False):
     """
     Download the document and save it to the raw folder in html format
     """
+    fs = get_fs(conf)
+
     slug_name = slugify(docspec.name)
 
-    target_filename = f'{slug_name}.html'
-    raw_path = root_path() / conf['storage']['raw'] / target_filename
+    target_path = raw_path() + f'{slug_name}.html'
 
     # Download documents
-    if force_download or not os.path.exists(raw_path):
-        print(f"Downloading document `{docspec.name}`...")
+    if force_download or not os.path.exists(target_path):
+        logger.info(f"Downloading document `{docspec.name}`...")
         text = download(docspec)
 
-        os.makedirs(os.path.dirname(raw_path), exist_ok=True)
-        with open(raw_path, 'w') as file:
+        os.makedirs(os.path.dirname(target_path), exist_ok=True)
+        with fs.open(target_path, 'w') as file:
             file.write(text)
 
 
@@ -95,12 +98,14 @@ def refine(docspec, conf):
     """
     Take the document in html format and refine it to a json format
     """
+    fs = get_fs(conf)
+
     slug_name = slugify(docspec.name)
 
-    target_filename = f'{slug_name}.html'
-    raw_path = root_path() / conf['storage']['refined'] / target_filename
+    source_filename = f'{slug_name}.html'
+    soure_path = raw_path() + source_filename
 
-    with open(raw_path, 'r') as file:
+    with fs.open(soure_path, 'r') as file:
         text = file.read()
 
     target = refined_path() + f'{slug_name}.json'
@@ -108,23 +113,24 @@ def refine(docspec, conf):
     main_node = get_document_structure(text, docspec=docspec)
 
     main_node[0].save(target)
-    print(f"Saved refined in '{target_filename}'.")
+    logger.info(f"Saved refined in '{target}'.")
 
 
 def render_html(docspec: DocumentSpec, conf: ConfigParser):
+    fs = get_fs(conf)
+
     slug_name = slugify(docspec.name)
     main_node_path = refined_path() + f'{slug_name}.json'
     main_node = Node.load(main_node_path)
 
-    html_path = root_path() / conf['storage']['html'] / f'{slug_name}.html'
-    os.makedirs(os.path.dirname(html_path), exist_ok=True)
-    with open(html_path, 'w', encoding='utf-8') as file:
-        file.write(main_node[0].html(
+    html_file = html_path() + f'{slug_name}.html'
+    with fs.open(html_file, 'w', encoding='utf-8') as file:
+        file.write(main_node.html(
             preamble="""
             <html lang="es"><head><meta charset="utf-8" /></head>
             """
         ))
-    logger.info(f"HTML saved to '{slug_name}.html'.")
+    logger.info(f"HTML saved to '{html_file}'.")
 
 
 def ingest(docspec: DocumentSpec, conf: ConfigParser):
@@ -160,7 +166,6 @@ def run(force_download=False, path=None):
     docspecs = [DocumentSpec.load(filename) for filename in filenames]
 
     for docspec in docspecs:
-
         # Download document
         download_doc(docspec, conf, force_download)
 
diff --git a/src/verdictnet/ingestion/paths.py b/src/verdictnet/ingestion/paths.py
index 00be14f..7d2e06f 100644
--- a/src/verdictnet/ingestion/paths.py
+++ b/src/verdictnet/ingestion/paths.py
@@ -43,3 +43,14 @@ def refined_path():
         return root_path() / conf['storage']['bucket'] / conf['storage']['refined']
     elif conf['storage']['type'] == 's3':
         return f"s3://{conf['storage']['bucket']}/{conf['storage']['refined']}"
+
+
+def html_path():
+    """
+    Return the path where we store rendered HTML files: a local path or an
+    s3:// URL, depending on the configured storage type
+    """
+    if conf['storage']['type'] == 'local':
+        return root_path() / conf['storage']['bucket'] / conf['storage']['html']
+    elif conf['storage']['type'] == 's3':
+        return f"s3://{conf['storage']['bucket']}/{conf['storage']['html']}"
diff --git a/src/verdictnet/models/node.py b/src/verdictnet/models/node.py
index 04bf3bd..dd2b24e 100644
--- a/src/verdictnet/models/node.py
+++ b/src/verdictnet/models/node.py
@@ -88,9 +88,11 @@ def load(cls, path):
         """
         Load the node from a file
         """
-        with open(path, 'r', encoding='utf8') as file:
-            data = json.load(file, ensure_ascii=False)
-        return Node(**data)
+        fs = get_fs()
+
+        with fs.open(path, 'r', encoding='utf8') as file:
+            data = json.load(file)
+        return Node.from_dict(data)
 
     @classmethod
     def from_dict(cls, data):
@@ -100,6 +102,7 @@ def from_dict(cls, data):
         return Node(
             id=data['id'],
             level=data['level'],
+            uuid=data['uuid'],
             content=data['content'],
             children=[cls.from_dict(child) for child in data['children']]
         )
diff --git a/tests/conftest.py b/tests/conftest.py
index 34c4f96..753d5bc 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -5,8 +5,6 @@
 from bs4 import BeautifulSoup
 from pytest import fixture
 
-from verdictnet.config import get_config
-
 
 @fixture(autouse=True)
 def mock_config():
@@ -25,19 +23,6 @@ def mock_get_config():
 resources = Path(__file__).parent / "resources"
 
 
-
-@fixture
-def static_files():
-    from verdictnet.config import root_path
-    return root_path() / "verdictnet/frontend/static/css"
-
-
-@fixture
-def css_code(static_files):
-    with open(static_files / 'document_tree.css', 'r') as css_file:
-        yield css_file.read()
-
-
 @fixture
 def codigo_civil_spec():
     docspec = DocumentSpec.load(resources / "codigo-civil-spec.json")
diff --git a/tests/models/test_node.py b/tests/models/test_node.py
new file mode 100644
index 0000000..1246596
--- /dev/null
+++ b/tests/models/test_node.py
@@ -0,0 +1,128 @@
+import json
+import tempfile
+import webbrowser
+
+from _pytest.fixtures import fixture
+from bs4 import BeautifulSoup
+
+
+from verdictnet.models.node import Node
+
+
+@fixture
+def static_files():
+    from verdictnet.config import root_path
+    return root_path() / "frontend/static/css"
+
+
+@fixture
+def css_code(static_files):
+    with open(static_files / 'document_tree.css', 'r') as css_file:
+        yield css_file.read()
+
+
+class TestNode:
+    def test_render(self, node_titulo):
+        text = node_titulo.render()
+        assert "      2. Carecerán de validez las disposiciones que  contradigan otra de rango superior." in text
+
+    def test_html(self, node_titulo, css_code):
+        html = node_titulo.html()
+
+        # Inline the CSS into the HTML content via a <style> element
+        html_with_css = f'<html><head><style>{css_code}</style></head><body>{html}</body></html>'
+
+        # Write the HTML to a temporary file so it can be opened in a browser if the check below fails
+        with tempfile.NamedTemporaryFile('w', delete=False, suffix='.html') as f:
+            url = 'file://' + f.name
+            f.write(html_with_css)
+
+        # ensure html is correctly formatted
+        try:
+            assert BeautifulSoup(html_with_css, 'html.parser')
+        except Exception as e:
+            webbrowser.open(url)
+            raise e
+
+    def test_save(self):
+        node = Node(level="1", content="Test Node")
+        with tempfile.NamedTemporaryFile('w', delete=False, suffix='.json') as f:
+            path = f.name
+        node.save(path)
+
+        with open(path, 'r', encoding='utf8') as file:
+            data = json.load(file)
+
+        assert data['level'] == "1"
+        assert data['content'] == "Test Node"
+        assert 'uuid' in data
+
+    def test_load(self):
+        node_data = {
+            "id": 1,
+            "uuid": "1234",
+            "level": "1",
+            "content": "Test Node",
+            "children": []
+        }
+        with tempfile.NamedTemporaryFile('w', delete=False, suffix='.json') as f:
+            path = f.name
+            json.dump(node_data, f)
+
+        loaded_node = Node.load(path)
+
+        assert loaded_node.level == "1"
+        assert loaded_node.content == "Test Node"
+        assert loaded_node.uuid == "1234"
+        assert loaded_node.children == []
+
+    def test_save_with_children(self):
+        child_node = Node(level="2", content="Child Node")
+        parent_node = Node(level="1", content="Parent Node", children=[child_node])
+
+        with tempfile.NamedTemporaryFile('w', delete=False, suffix='.json') as f:
+            path = f.name
+        parent_node.save(path)
+
+        with open(path, 'r', encoding='utf8') as file:
+            data = json.load(file)
+
+        assert data['level'] == "1"
+        assert data['content'] == "Parent Node"
+        assert 'uuid' in data
+        assert len(data['children']) == 1
+        assert data['children'][0]['level'] == "2"
+        assert data['children'][0]['content'] == "Child Node"
+        assert 'uuid' in data['children'][0]
+
+    def test_load_with_children(self):
+        node_data = {
+            "id": 1,
+            "uuid": "1234",
+            "level": "1",
+            "content": "Parent Node",
+            "children": [
+                {
+                    "id": 2,
+                    "uuid": "5678",
+                    "level": "2",
+                    "content": "Child Node",
+                    "children": []
+                }
+            ]
+        }
+
+        with tempfile.NamedTemporaryFile('w', delete=False, suffix='.json') as f:
+            path = f.name
+            json.dump(node_data, f)
+
+        loaded_node = Node.load(path)
+
+        assert loaded_node.level == "1"
+        assert loaded_node.content == "Parent Node"
+        assert loaded_node.uuid == "1234"
+        assert len(loaded_node.children) == 1
+        assert loaded_node.children[0].level == "2"
+        assert loaded_node.children[0].content == "Child Node"
+        assert loaded_node.children[0].uuid == "5678"
+        assert loaded_node.children[0].children == []
diff --git a/tests/resources/config.ini b/tests/resources/config.ini
index cd2617e..9e84700 100644
--- a/tests/resources/config.ini
+++ b/tests/resources/config.ini
@@ -1,5 +1,6 @@
 [storage]
-type: s3
+# s3 or file
+type: file
 bucket: legal
 collection: legal-database
 raw: datalake/raw/
diff --git a/tests/test_node.py b/tests/test_node.py
deleted file mode 100644
index da6614a..0000000
--- a/tests/test_node.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import tempfile
-import webbrowser
-
-from bs4 import BeautifulSoup
-
-
-class TestNode:
-    def test_render(self, node_titulo):
-        text = node_titulo.render()
-        assert "      2. Carecerán de validez las disposiciones que  contradigan otra de rango superior." in text
-
-    def test_html(self, node_titulo, css_code):
-        html = node_titulo.html()
-
-        # Insert the CSS link into the HTML content
-        html_with_css = f'<html><head><style>{css_code}</style></head><body>{html}</body></html>'
-
-        # open html_text in a browser to see the result
-        with tempfile.NamedTemporaryFile('w', delete=False, suffix='.html') as f:
-            url = 'file://' + f.name
-            f.write(html_with_css)
-
-        # ensure html is correctly formatted
-        try:
-            assert BeautifulSoup(html_with_css, 'html.parser')
-        except Exception as e:
-            webbrowser.open(url)
-            raise e
-

From 2c21d207a9e61a687acee68f362d7a5a6c1a18e4 Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Tue, 4 Feb 2025 08:36:52 +0100
Subject: [PATCH 21/27] add logs to gitignore

---
 .gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index f4ce3e0..853c3b4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,7 @@
 */*.egg-info/*
 .idea/
 build/
-
+logs/
 __pycache__/
 docs/
 cache/

From 23271fe2f7720be396d85d24df88b201c48a62f7 Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Tue, 4 Feb 2025 09:22:45 +0100
Subject: [PATCH 22/27] fix mock config in tests

---
 pytest.ini                 | 3 +++
 requirements.txt           | 1 -
 setup.cfg                  | 4 ++++
 src/verdictnet/config.py   | 3 ++-
 tests/conftest.py          | 3 ++-
 tests/resources/config.ini | 8 ++++----
 6 files changed, 15 insertions(+), 7 deletions(-)
 create mode 100644 pytest.ini

diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..050e475
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,3 @@
+[pytest]
+log_level=INFO
+log_cli=true
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 3ecc33b..b32ce8b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -285,7 +285,6 @@ pytest-mock==3.14.0
 python-daemon==3.1.2
 python-dateutil==2.9.0.post0
 python-dotenv==1.0.1
-python-multipart==0.0.17
 python-nvd3==0.16.0
 python-slugify==8.0.4
 pytz==2024.2
diff --git a/setup.cfg b/setup.cfg
index f38b784..8a6db70 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -23,6 +23,10 @@ install_requires =
 [options.package_data]
 verdictnet.ingestion =
     resources/*.json
+verdictnet.frontend =
+    static/css/*
+    static/js/*
+    templates/*
 
 [options.packages.find]
 where=src
diff --git a/src/verdictnet/config.py b/src/verdictnet/config.py
index 32a617e..cbe587c 100644
--- a/src/verdictnet/config.py
+++ b/src/verdictnet/config.py
@@ -28,7 +28,8 @@ def get_config():
     config = configparser.ConfigParser()
 
     # Load the configuration file from the current folder, or from the package root folder
-    config.read(filenames=['config.ini', root_path() / 'config.ini'])
+    files = config.read(filenames=['config.ini', root_path() / 'config.ini'])
+    logger.info("Successfully loaded config files: %s", files)
 
     return config
 
diff --git a/tests/conftest.py b/tests/conftest.py
index 753d5bc..50b9128 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -10,7 +10,8 @@
 def mock_config():
     def mock_get_config():
         config = configparser.ConfigParser()
-        config.read('resources/config.ini')
+        success = config.read(['resources/config.ini', 'tests/resources/config.ini'])
+        assert success is not [], "Could not read mock config file"
         return config
 
     with patch('verdictnet.config.get_config', mock_get_config):
diff --git a/tests/resources/config.ini b/tests/resources/config.ini
index 9e84700..5782e55 100644
--- a/tests/resources/config.ini
+++ b/tests/resources/config.ini
@@ -14,8 +14,8 @@ port: 8000
 
 [neo4j]
 url: bolt://localhost:7687
-user: neo4j
-password: neo4jtest
+user: none
+password: nopwd
 
 [embedding]
 model_name_or_path: paraphrase-mpnet-base-v2
@@ -25,6 +25,6 @@ cache: cache/
 n_results: 5
 
 [s3]
-key = minioadmin
-secret = minioadmin
+key = none
+secret = nopwd
 endpoint_url = http://localhost:9000
\ No newline at end of file

From f071d5ac053d6537854ae44bc6dfcbd54338573c Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Tue, 4 Feb 2025 14:34:36 +0100
Subject: [PATCH 23/27] improve setup & readme

---
 Makefile           | 47 ++++++++++++++++++++++++++++++++++++++++------
 README.md          | 33 +++++++++++++++++++++++++++-----
 config/airflow.cfg |  0
 docker-compose.yml | 21 ++++++++++-----------
 4 files changed, 79 insertions(+), 22 deletions(-)
 delete mode 100755 config/airflow.cfg

diff --git a/Makefile b/Makefile
index c30eae3..421c88f 100644
--- a/Makefile
+++ b/Makefile
@@ -14,9 +14,26 @@ install:
 
 .PHONY: setup
 setup: install
-	@echo "Generating .env file with FERNET_KEY..."
-	@python3 -c "from cryptography.fernet import Fernet; print(f'FERNET_KEY={Fernet.generate_key().decode()}')" > .env
-	@echo ".env file generated."
+	@if [ ! -f .env ]; then \
+		echo "Generating .env file with FERNET_KEY..."; \
+		echo "Generating .env file with FERNET_KEY..."; \
+		python3 -c "from cryptography.fernet import Fernet; \
+		fernet_key = Fernet.generate_key().decode(); \
+		template = 'FERNET_KEY={fernet_key}\\nAIRFLOW_UID=50000'; \
+		print(template.format(fernet_key=fernet_key)); \
+		print('_AIRFLOW_WWW_USER_USERNAME=airflow'); \
+		print('_AIRFLOW_WWW_USER_PASSWORD=airflow'); " > .env; \
+  		echo ".env file generated."; \
+	else \
+		echo ".env file already exists. Skipping FERNET_KEY generation."; \
+	fi
+	@if [ ! -f config/local/.env ]; then \
+		echo "Copying .env file to config/local..."; \
+		cp .env config/local/; \
+		echo ".env file copied."; \
+	else \
+		echo "config/local/.env file already exists. Skipping config/local copy."; \
+	fi
 	@echo "Launching minio..."
 	@docker-compose up -d minio
 	@echo "Waiting for minio to start..."
@@ -26,9 +43,20 @@ setup: install
 	done
 	@echo "Setting up local alias..."
 	@docker-compose exec minio mc alias set minio http://localhost:9000 minioadmin minioadmin
-	@echo "Creating buckets..."
-	@docker-compose exec minio sh -c "mc ls minio/legal || mc mb minio/legal"
-	@docker-compose exec minio mc mb minio/airflow-logs
+	@echo "Checking if bucket 'legal' exists..."
+	@if ! docker-compose exec minio mc ls minio/legal; then \
+  		echo "Creating bucket 'legal'..."; \
+  		docker-compose exec minio mc mb minio/legal; \
+ 	else \
+  		echo "Bucket 'legal' already exists. Skipping creation."; \
+	fi
+	@echo "Checking if bucket 'airflow-logs' exists..."
+	@if ! docker-compose exec minio mc ls minio/airflow-logs; then \
+  		echo "Creating bucket 'airflow-logs'..."; \
+  		docker-compose exec minio mc mb minio/airflow-logs; \
+ 	else \
+  		echo "Bucket 'airflow-logs' already exists. Skipping creation."; \
+	fi
 	@echo "Minio setup complete. Stopping minio..."
 	@docker-compose stop minio
 	@echo "Setup complete."
@@ -44,12 +72,19 @@ start:
 	fi
 	@echo "Starting up the microservices..."
 	@docker-compose up -d
+	@echo "Done."
+	@echo "\nFrontends are available at the following links:"
+	@echo "ChromaDB: http://localhost:3000/collections/legal-database"
+	@echo "Neo4j: http://localhost:7474"
+	@echo "Minio: http://localhost:9000"
+	@echo "Airflow: http://localhost:8080"
 
 .PHONY: stop
 stop:
 	@echo "Stopping the microservices..."
 	@docker-compose down
 
+
 .PHONY: profile
 profile:
 	@py-spy record -o profile.svg -- python dags/jurisprudencia.py
diff --git a/README.md b/README.md
index 50af8f8..5ae8202 100644
--- a/README.md
+++ b/README.md
@@ -1,21 +1,44 @@
-# Semantic Graph Search Project
+# VerdictNet: Legal Semantic Search Engine
 
 This project is a semantic graph search system designed to manage and query data.
 Currently, the interface is a simple command-line interface (CLI) tool and a web server.
 The system supports data ingestion, cleaning, querying, and running a server for frontend interactions.
 
 
-## Quick Start
-You should be able to kickstart the project by launching the docker compose setup and then starting the server:
+## Development Quick Start
+It is strongly recommended to use a virtual environment to run the project. You should be able to kickstart the project by running the following commands:
 ```sh
-  $ docker-compose up
+  $ make setup
 ```
-This will launch the following services:
+
+This will
+- Install the package requirements in the current python environment (python 3.12 recommended)
+- Create an `.env` file with the necessary environment variables if it does not exist. Copy this `.env` file to the `config/local` directory if it does not already exist. 
+- Create the `datalake` and `airflow-logs` buckets in the Minio object storage.
+- Install development dependencies.
+
+After this, you can start the development environment by running:
+```sh
+  $ make start
+```
+The first launch will take some time because it will build the docker images.
+
+When done, the following services will be up and running:
 - [ChromaDB Browser: `http://localhost:3000/collections/legal-database`](http://localhost:3000/collections/legal-database). This is the vector Database used to run semantic queries.
 - [Neo4J Browser: `http://localhost:7474`](http://localhost:7474). This is a GUI to the graph database that will hold the relationships between the different documents indexed in the ChromaDB.
 - [Airflow: `http://localhost:8080`](http://localhost:8080). This is the scheduler used to run daily data mining tasks.
 - [Minio Console: `http://localhost:9001`](http://localhost:9001). This is the object storage used to store the documents in local develpment envs.
 
+The Postgres database is used by Airflow and is persisted to a `postgress_service`. This is useful if you want to do a clean start without losing the data in the database.
+
+
+
+### Running the ETL pipeline
+To run the ETL pipeline, you can run the following command:
+```sh
+  $ make etl
+```
+
 Finally, run
 ```sh
   $ verdictnet server
diff --git a/config/airflow.cfg b/config/airflow.cfg
deleted file mode 100755
index e69de29..0000000
diff --git a/docker-compose.yml b/docker-compose.yml
index 8321f3b..2d7ceea 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -89,9 +89,11 @@ x-airflow-common:
     PYTHONPATH: /app:${PYTHONPATH:-}  # Append custom path to existing PYTHONPATH
   volumes:
     - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags
-    - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
-    - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config
-    - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins
+    # TODO: need to figure out what happens with dag_processor_manager and scheduler logs, that don't go to minio
+    #- ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
+    - ${AIRFLOW_PROJ_DIR:-.}/config/local/:/opt/airflow/config
+    # TODO: Not needed ATM, but might be useful in the future
+    #- ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins
     - ${PWD}/config/local/airflow.cfg:/opt/airflow/config/airflow.cfg
     - ./src/verdictnet:/app/verdictnet  # Mount local package directory to container
     - ${PWD}/config/local/config.ini:/app/verdictnet/config.ini  # Mount local config file to container
@@ -111,7 +113,7 @@ services:
     ports:
       - "8000:8000"
     volumes:
-      - ${PWD}/chromadb_storage:/data
+      - ${PWD}/chromadb:/data
 
   chromadb-admin:
     image: fengzhichao/chromadb-admin:latest
@@ -136,7 +138,7 @@ services:
       start_period: 5s
     restart: always
     volumes:
-      - ${PWD}/postgress_storage:/var/lib/postgresql/data
+      - ${PWD}/postgress:/var/lib/postgresql/data
   redis:
     # Redis is limited to 7.2-bookworm due to licencing change
     # https://redis.io/blog/redis-adopts-dual-source-available-licensing/
@@ -273,7 +275,8 @@ services:
         fi
         mkdir -p /sources/logs /sources/dags /sources/plugins
         chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins}
-        exec airflow db init
+        exec airflow db migrate
+        exec airflow users create -u ${_AIRFLOW_WWW_USER_USERNAME} -p ${_AIRFLOW_WWW_USER_PASSWORD} -r Admin --verbose -f air -l flow -e airflow@airflow.air
     # yamllint enable rule:line-length
     environment:
       <<: *airflow-common-env
@@ -294,7 +297,7 @@ services:
       - "9000:9000"  # API
       - "9001:9001"  # Console
     volumes:
-      - ${PWD}/data/:/data
+      - ${PWD}/minio/:/data
     command: server /data --console-address ":9001"
 
   neo4j:
@@ -307,7 +310,3 @@ services:
       - "7687:7687"  # Bolt protocol
     volumes:
       - ${PWD}/neo4j_data:/data
-
-volumes:
-  chromadb_data:
-  postgres_data:

From 3a64b3e7ec906756c0094f02f50d287d7a8e4ae2 Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Tue, 4 Feb 2025 22:31:12 +0100
Subject: [PATCH 24/27] improve setup & readme

---
 .gitignore         |  8 ++++++--
 Makefile           | 17 +++++++++++++++++
 README.md          |  2 ++
 docker-compose.yml |  1 -
 4 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 853c3b4..015ed78 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,7 +9,11 @@ __pycache__/
 docs/
 cache/
 data/
-datalake/
+plugins/
+
+minio/
+neo4j/
 neo4j_data/
-postgress_storage/
+chromadb/
+postgress/
 
diff --git a/Makefile b/Makefile
index 421c88f..bf4b4b2 100644
--- a/Makefile
+++ b/Makefile
@@ -59,6 +59,23 @@ setup: install
 	fi
 	@echo "Minio setup complete. Stopping minio..."
 	@docker-compose stop minio
+	@echo "Initializing Airflow..."
+	@docker-compose up -d airflow-webserver
+	@echo "Waiting for Airflow to start..."
+	@until docker-compose exec airflow-webserver airflow db check; do \
+		echo "Airflow is not healthy yet. Retrying in 5 seconds..."; \
+		sleep 5; \
+	done
+	@echo "Creating Airflow user..."
+	@docker-compose exec airflow-webserver airflow users create -u airflow -p airflow -r Admin --verbose -f air -l flow -e airflow@airflow.air
+	@if  docker-compose exec airflow-webserver airflow connections get minio; then \
+		echo "Connection 'minio' already exists. Skipping creation."; \
+	else \
+		echo "Creating connection 'minio'..."; \
+		docker-compose exec airflow-webserver airflow connections add --conn-login minioadmin --conn-password minioadmin --conn-host minio --conn-port 9000 --conn-schema http --conn-extra '{"endpoint_url": "http://minio:9000"}' --conn-type aws minio; \
+	fi
+	@echo "Stopping Airflow..."
+	@docker-compose stop airflow-webserver
 	@echo "Setup complete."
 
 .PHONY: start
diff --git a/README.md b/README.md
index 5ae8202..6043221 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,8 @@ This will
 - Install the package requirements in the current python environment (python 3.12 recommended)
 - Create an `.env` file with the necessary environment variables if it does not exist. Copy this `.env` file to the `config/local` directory if it does not already exist. 
 - Create the `datalake` and `airflow-logs` buckets in the Minio object storage.
+- Create the `airflow` user in Airflow.
+- Create the `minio` Airflow connection.
 - Install development dependencies.
 
 After this, you can start the development environment by running:
diff --git a/docker-compose.yml b/docker-compose.yml
index 2d7ceea..856fc95 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -276,7 +276,6 @@ services:
         mkdir -p /sources/logs /sources/dags /sources/plugins
         chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins}
         exec airflow db migrate
-        exec airflow users create -u ${_AIRFLOW_WWW_USER_USERNAME} -p ${_AIRFLOW_WWW_USER_PASSWORD} -r Admin --verbose -f air -l flow -e airflow@airflow.air
     # yamllint enable rule:line-length
     environment:
       <<: *airflow-common-env

From fe4e39efba33fe0dfb58fa8b68fab7e0a604398a Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Tue, 4 Feb 2025 22:44:45 +0100
Subject: [PATCH 25/27] fix typo

---
 .gitignore         | 2 +-
 docker-compose.yml | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 015ed78..648a984 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,5 +15,5 @@ minio/
 neo4j/
 neo4j_data/
 chromadb/
-postgress/
+postgres/
 
diff --git a/docker-compose.yml b/docker-compose.yml
index 856fc95..ae8252c 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -138,7 +138,8 @@ services:
       start_period: 5s
     restart: always
     volumes:
-      - ${PWD}/postgress:/var/lib/postgresql/data
+      - ${PWD}/postgres:/var/lib/postgresql/data
+
   redis:
     # Redis is limited to 7.2-bookworm due to licencing change
     # https://redis.io/blog/redis-adopts-dual-source-available-licensing/

From c684233c86169d7a9463d5d5ecb3d7d3889eaeae Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Tue, 4 Feb 2025 23:42:52 +0100
Subject: [PATCH 26/27] make sentence-transformers cache available to airflow
 workers

---
 Makefile                    | 13 +++----------
 config/local/config.ini     |  3 ++-
 docker-compose.yml          |  2 ++
 src/verdictnet/embedding.py |  4 ++--
 4 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/Makefile b/Makefile
index bf4b4b2..72b4428 100644
--- a/Makefile
+++ b/Makefile
@@ -1,12 +1,5 @@
 .PHONY: setup install start stop profile server etl test build clean
 
-# Default port for the server
-PORT ?= 8000
-
-# Path for ETL documents
-ETL_PATH ?= /path/to/docspecs
-FORCE ?= true
-
 .PHONY: install
 install:
 	@echo "Installing requirements..."
@@ -114,12 +107,12 @@ build:
 # Run the server
 server:
 	@echo "Running the server on port $(PORT)..."
-	@semantic server --port $(PORT)
+	@verdictnet server --port $(PORT)
 
 # Run the ETL pipeline
 etl:
 	@echo "Running the ETL pipeline with path $(ETL_PATH) and force $(FORCE)..."
-	@semantic etl run --path $(ETL_PATH) --force $(FORCE)
+	@verdictnet etl run --path $(ETL_PATH) --force $(FORCE)
 
 # Run tests
 test:
@@ -129,4 +122,4 @@ test:
 # Clean the vector database
 clean:
 	@echo "Cleaning the vector database..."
-	@semantic etl clean
\ No newline at end of file
+	@verdictnet etl clean
\ No newline at end of file
diff --git a/config/local/config.ini b/config/local/config.ini
index 18426dd..ac523d4 100644
--- a/config/local/config.ini
+++ b/config/local/config.ini
@@ -18,7 +18,8 @@ password: neo4jtest
 
 [embedding]
 model_name_or_path: paraphrase-mpnet-base-v2
-cache: cache/
+# Absolute path: docker-compose.yml mounts the host's cache directory at /cache
+cache: /cache
 
 [rag]
 n_results: 5
diff --git a/docker-compose.yml b/docker-compose.yml
index ae8252c..b679f6a 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -205,6 +205,8 @@ services:
       # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation
       DUMB_INIT_SETSID: "0"
     restart: always
+    volumes:
+      - ${PWD}/cache:/cache
     depends_on:
       <<: *airflow-common-depends-on
       airflow-init:
diff --git a/src/verdictnet/embedding.py b/src/verdictnet/embedding.py
index 8af84d0..2fd3a5f 100644
--- a/src/verdictnet/embedding.py
+++ b/src/verdictnet/embedding.py
@@ -13,9 +13,9 @@ def __init__(self, conf: Optional[configparser.ConfigParser] = None):
 
         self.conf = conf or get_config()
         # Load a pre-trained model
-        self.model = SentenceTransformer(  # Lightweight, fast model
+        self.model = SentenceTransformer(
             self.conf.get('embedding', 'model_name_or_path'),
-            cache_folder=root_path() / self.conf.get('embedding', 'cache')
+            cache_folder=self.conf.get('embedding', 'cache')
         )
 
     def embed_nodes(self, nodes: List[Node]) -> tuple[

From 76dac84a7445992c8fe95b2b5ae3ccfd17f8aeb0 Mon Sep 17 00:00:00 2001
From: Alex Monras <amb.physis@gmail.com>
Date: Tue, 4 Feb 2025 23:43:09 +0100
Subject: [PATCH 27/27] comment config

---
 config.ini | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/config.ini b/config.ini
index cd2617e..05f70ad 100644
--- a/config.ini
+++ b/config.ini
@@ -1,3 +1,6 @@
+# This is the config file for running the application on the host machine.
+# It uses the Minio object storage.
+
 [storage]
 type: s3
 bucket: legal
@@ -17,6 +20,7 @@ user: neo4j
 password: neo4jtest
 
 [embedding]
+# Lightweight, fast model
 model_name_or_path: paraphrase-mpnet-base-v2
 cache: cache/