diff --git a/.gitignore b/.gitignore index fee070a..648a984 100644 --- a/.gitignore +++ b/.gitignore @@ -3,13 +3,17 @@ .ipynb_checkpoints/ */*.egg-info/* .idea/ - +build/ +logs/ __pycache__/ docs/ cache/ data/ -datalake/ +plugins/ + +minio/ +neo4j/ neo4j_data/ -postgress_storage/ -storage/ +chromadb/ +postgres/ diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..b053566 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,15 @@ +FROM apache/airflow:2.10.0 + +# Set the working directory +WORKDIR /app + +# Switch to airflow user to run the application +USER airflow + +# Copy the requirements file and install dependencies +COPY requirements.txt . + +RUN pip install --no-cache-dir -r requirements.txt + +# Set the entrypoint to Airflow +ENTRYPOINT ["airflow"] diff --git a/Makefile b/Makefile index 417c95d..72b4428 100644 --- a/Makefile +++ b/Makefile @@ -1,21 +1,118 @@ -.PHONY: server etl test +.PHONY: setup install start stop profile server etl test build clean -# Default port for the server -PORT ?= 8000 +.PHONY: install +install: + @echo "Installing requirements..." + @pip install -r requirements.txt -# Path for ETL documents -ETL_PATH ?= /path/to/docspecs -FORCE ?= true +.PHONY: setup +setup: install + @if [ ! -f .env ]; then \ + echo "Generating .env file with FERNET_KEY..."; \ + echo "Generating .env file with FERNET_KEY..."; \ + python3 -c "from cryptography.fernet import Fernet; \ + fernet_key = Fernet.generate_key().decode(); \ + template = 'FERNET_KEY={fernet_key}\\nAIRFLOW_UID=50000'; \ + print(template.format(fernet_key=fernet_key)); \ + print('_AIRFLOW_WWW_USER_USERNAME=airflow'); \ + print('_AIRFLOW_WWW_USER_PASSWORD=airflow'); " > .env; \ + echo ".env file generated."; \ + else \ + echo ".env file already exists. Skipping FERNET_KEY generation."; \ + fi + @if [ ! -f config/local/.env ]; then \ + echo "Copying .env file to config/local..."; \ + cp .env config/local/; \ + echo ".env file copied."; \ + else \ + echo "config/local/.env file already exists. Skipping config/local copy."; \ + fi + @echo "Launching minio..." + @docker-compose up -d minio + @echo "Waiting for minio to start..." + @until docker-compose exec minio mc ready local; do \ + echo "Minio is not healthy yet. Retrying in 5 seconds..."; \ + sleep 5; \ + done + @echo "Setting up local alias..." + @docker-compose exec minio mc alias set minio http://localhost:9000 minioadmin minioadmin + @echo "Checking if bucket 'legal' exists..." + @if ! docker-compose exec minio mc ls minio/legal; then \ + echo "Creating bucket 'legal'..."; \ + docker-compose exec minio mc mb minio/legal; \ + else \ + echo "Bucket 'legal' already exists. Skipping creation."; \ + fi + @echo "Checking if bucket 'airflow-logs' exists..." + @if ! docker-compose exec minio mc ls minio/airflow-logs; then \ + echo "Creating bucket 'airflow-logs'..."; \ + docker-compose exec minio mc mb minio/airflow-logs; \ + else \ + echo "Bucket 'airflow-logs' already exists. Skipping creation."; \ + fi + @echo "Minio setup complete. Stopping minio..." + @docker-compose stop minio + @echo "Initializing Airflow..." + @docker-compose up -d airflow-webserver + @echo "Waiting for Airflow to start..." + @until docker-compose exec airflow-webserver airflow db check; do \ + echo "Airflow is not healthy yet. Retrying in 5 seconds..."; \ + sleep 5; \ + done + @echo "Creating Airflow user..." + @docker-compose exec airflow-webserver airflow users create -u airflow -p airflow -r Admin --verbose -f air -l flow -e airflow@airflow.air + @if docker-compose exec airflow-webserver airflow connections get minio; then \ + echo "Connection 'minio' already exists. Skipping creation."; \ + else \ + echo "Creating connection 'minio'..."; \ + docker-compose exec airflow-webserver airflow connections add --conn-login minioadmin --conn-password minioadmin --conn-host minio --conn-port 9000 --conn-schema http --conn-extra '{"endpoint_url": "http://minio:9000"}' --conn-type aws minio; \ + fi + @echo "Stopping Airflow..." + @docker-compose stop airflow-webserver + @echo "Setup complete." + +.PHONY: start +start: + @echo "Detecting virtual environment..." + @if [ -n "$$VIRTUAL_ENV" ]; then \ + echo "Virtual environment detected at $$VIRTUAL_ENV"; \ + export VIRTUAL_ENV_PATH=$$VIRTUAL_ENV; \ + else \ + echo "No virtual environment detected."; \ + fi + @echo "Starting up the microservices..." + @docker-compose up -d + @echo "Done." + @echo "\nFrontends are available at the following links:" + @echo "ChromaDB: http://localhost:3000/collections/legal-database" + @echo "Neo4j: http://localhost:7474" + @echo "Minio: http://localhost:9000" + @echo "Airflow: http://localhost:8080" + +.PHONY: stop +stop: + @echo "Stopping the microservices..." + @docker-compose down + + +.PHONY: profile +profile: + @py-spy record -o profile.svg -- python dags/jurisprudencia.py + +# Docker build +.PHONY: build +build: + @docker build -t semantic_airflow . # Run the server server: @echo "Running the server on port $(PORT)..." - @semantic server --port $(PORT) + @verdictnet server --port $(PORT) # Run the ETL pipeline etl: @echo "Running the ETL pipeline with path $(ETL_PATH) and force $(FORCE)..." - @semantic etl run --path $(ETL_PATH) --force $(FORCE) + @verdictnet etl run --path $(ETL_PATH) --force $(FORCE) # Run tests test: @@ -25,4 +122,4 @@ test: # Clean the vector database clean: @echo "Cleaning the vector database..." - @semantic etl clean \ No newline at end of file + @verdictnet etl clean \ No newline at end of file diff --git a/README.md b/README.md index 4459095..6043221 100644 --- a/README.md +++ b/README.md @@ -1,24 +1,49 @@ -# Semantic Graph Search Project +# VerdictNet: Legal Semantic Search Engine This project is a semantic graph search system designed to manage and query data. Currently, the interface is a simple command-line interface (CLI) tool and a web server. The system supports data ingestion, cleaning, querying, and running a server for frontend interactions. -## Quick Start -You should be able to kickstart the project by launching the docker compose setup and then starting the server: +## Development Quick Start +It is strongly recommended to use a virtual environment to run the project. You should be able to kickstart the project by running the following commands: ```sh - $ docker-compose up + $ make setup ``` -This will launch the following services: + +This will +- Install the package requirements in the current python environment (python 3.12 recommended) +- Create an `.env` file with the necessary environment variables if it does not exist. Copy this `.env` file to the `config/local` directory if it does not already exist. +- Create the `datalake` and `airflow-logs` buckets in the Minio object storage. +- Create the `airflow` user in Airflow. +- Create the `minio` Airflow connection. +- Install development dependencies. + +After this, you can start the development environment by running: +```sh + $ make start +``` +The first launch will take some time because it will build the docker images. + +When done, the following services will be up and running: - [ChromaDB Browser: `http://localhost:3000/collections/legal-database`](http://localhost:3000/collections/legal-database). This is the vector Database used to run semantic queries. - [Neo4J Browser: `http://localhost:7474`](http://localhost:7474). This is a GUI to the graph database that will hold the relationships between the different documents indexed in the ChromaDB. - [Airflow: `http://localhost:8080`](http://localhost:8080). This is the scheduler used to run daily data mining tasks. - [Minio Console: `http://localhost:9001`](http://localhost:9001). This is the object storage used to store the documents in local develpment envs. +The Postgress database is used by Airflow and is persisted to a `postgress_service`. This is useful if you want to do a clean start and not lose the data in the database. + + + +### Running the ETL pipeline +To run the ETL pipeline, you can run the following command: +```sh + $ make etl +``` + Finally, run ```sh - $ semantic server + $ verdictnet server ```` to launch the frontend interface, accessible through - [Frontend: `http://localhost:8000`](http://localhost:8000) @@ -33,7 +58,7 @@ Run data pipelines to ingest and process documents. #### Usage: ```sh -$ semantic etl [--path PATH] [--force FORCE] {clean,run} +$ verdictnet etl [--path PATH] [--force FORCE] {clean,run} --path PATH: Path where to look for document specs. --force FORCE: Force download of documents. @@ -46,13 +71,13 @@ run: Ingest data into the vector database. Query the data stored in the system. Usage: ```sh - $ semantic query [--query QUERY] [--n_results N_RESULTS] [--interactive] + $ verdictnet query [--query QUERY] [--n_results N_RESULTS] [--interactive] ``` ### Server Run the server to provide a frontend interface. Usage: ```sh - $ semantic server [-p PORT] + $ verdictnet server [-p PORT] -p, --port PORT: Port to run the frontend on (default: 8000). ``` @@ -60,16 +85,16 @@ Usage: ## Example usage ```sh # Clean the vector database -semantic etl clean +verdictnet etl clean # Run the ETL pipeline -semantic etl run --path /path/to/docspecs --force true +verdictnet etl run --path /path/to/docspecs --force true # Query the data -semantic query --query "example query" --n_results 5 +verdictnet query --query "example query" --n_results 5 # Run the server -semantic server --port 8080 +verdictnet server --port 8080 ``` ## Configuration diff --git a/config.ini b/config.ini index cd2617e..05f70ad 100644 --- a/config.ini +++ b/config.ini @@ -1,3 +1,6 @@ +# This is the config file meant for running the application in the host machine +# It uses the minio storage + [storage] type: s3 bucket: legal @@ -17,6 +20,7 @@ user: neo4j password: neo4jtest [embedding] +# Lightweight, fast model model_name_or_path: paraphrase-mpnet-base-v2 cache: cache/ diff --git a/config/local/airflow.cfg b/config/local/airflow.cfg new file mode 100644 index 0000000..b396ebe --- /dev/null +++ b/config/local/airflow.cfg @@ -0,0 +1,12 @@ +[logging] +remote_logging = True +remote_base_log_folder = s3://airflow-logs +remote_log_conn_id = minio +encrypt_s3_logs = False + +[webserver] +default_dag_run_display_number = 250 +expose_config = True + +[celery] +worker_concurrency = 2 diff --git a/config/local/config.ini b/config/local/config.ini new file mode 100644 index 0000000..ac523d4 --- /dev/null +++ b/config/local/config.ini @@ -0,0 +1,30 @@ +[storage] +type: s3 +bucket: legal +collection: legal-database +raw: datalake/raw/ +refined: datalake/refined/ +html: datalake/html/ + +[chroma] +type: http +host: chromadb +port: 8000 + +[neo4j] +url: bolt://neo4j:7687 +user: neo4j +password: neo4jtest + +[embedding] +model_name_or_path: paraphrase-mpnet-base-v2 +# use this because this is the mounting point in the docker compose +cache: /cache + +[rag] +n_results: 5 + +[s3] +key = minioadmin +secret = minioadmin +endpoint_url = http://minio:9000 \ No newline at end of file diff --git a/dags/codigo_civil_penal.py b/dags/codigo_civil_penal.py new file mode 100644 index 0000000..2500960 --- /dev/null +++ b/dags/codigo_civil_penal.py @@ -0,0 +1,94 @@ +import pendulum +from airflow import DAG +from airflow.decorators import task_group +from airflow.operators.empty import EmptyOperator +from airflow.operators.python import PythonOperator +from datetime import timedelta + +from slugify import slugify + +from verdictnet.config import get_config +from verdictnet.etl import get_docspecs, download_doc, refine, render_html, ingest +from verdictnet.ingestion.documentspec import DocumentSpec + +# Define the default arguments +default_args = { + 'owner': 'airflow', + 'depends_on_past': False, + 'start_date': pendulum.today('UTC'), # Start date 8 weeks ago + 'email_on_failure': False, + 'email_on_retry': False, + 'retries': 1, + 'retry_delay': timedelta(minutes=5), +} + + +def lazy_download_doc(docspec): + conf = get_config() + download_doc(docspec, conf, force_download=False) + + +def lazy_refine(docspec): + conf = get_config() + refine(docspec, conf) + + +def lazy_render_html(docspec): + conf = get_config() + render_html(docspec, conf) + + +def lazy_ingest(docspec): + conf = get_config() + ingest(docspec, conf) + + +def group(name, docspec): + """ + Process a specific document + """ + download_task = PythonOperator( + task_id=f'download_{name}', + python_callable=lazy_download_doc, + op_args=[docspec], + ) + + refine_task = PythonOperator( + task_id=f'refine_{name}', + python_callable=lazy_refine, + op_args=[docspec], + ) + + render_task = PythonOperator( + task_id=f'render_{name}', + python_callable=lazy_render_html, + op_args=[docspec], + ) + + ingest_task = PythonOperator( + task_id=f'ingest_{name}', + python_callable=lazy_ingest, + op_args=[docspec], + ) + + download_task >> refine_task >> render_task >> ingest_task + + return download_task + +# Define the DAG +with DAG( + 'download_codigos', + default_args=default_args, + description='Download Codigo Civil y Penal', + schedule="@once", + catchup=False, +): + filenames = get_docspecs() + docspecs = [DocumentSpec.load(filename) for filename in filenames] + + start = EmptyOperator(task_id='start_task') + + for docspec in docspecs: + name = slugify(docspec.name) + + start >> group(name, docspec) diff --git a/dags/jurisprudencia.py b/dags/jurisprudencia.py index de06499..febb97a 100644 --- a/dags/jurisprudencia.py +++ b/dags/jurisprudencia.py @@ -1,68 +1,89 @@ +import pendulum from airflow import DAG -from airflow.operators.python_operator import PythonOperator -from airflow.utils.dates import days_ago -from datetime import datetime, timedelta -import requests -import json -import os +from airflow.operators.python import PythonOperator +from datetime import timedelta, datetime # Define the default arguments default_args = { 'owner': 'airflow', 'depends_on_past': False, - 'start_date': days_ago(8 * 7), # Start date 8 weeks ago + 'start_date': pendulum.today('UTC').add(days=-8 * 7), # Start date 8 weeks ago 'email_on_failure': False, 'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=5), } + +def get_item_pagination_task(date: str): + from verdictnet.ingestion.downloader import get_item_pagination + return get_item_pagination(datetime.strptime(date, "%Y-%m-%d")) + + +def refine_item_pagination_task(date: str): + from verdictnet.ingestion.downloader import refine_item_pagination + return refine_item_pagination(datetime.strptime(date, "%Y-%m-%d")) + + +def download_pdfs_task(date: str): + from verdictnet.ingestion.downloader import download_pdfs + return download_pdfs(datetime.strptime(date, "%Y-%m-%d")) + + +def parse_pdfs_task(date: str): + from verdictnet.ingestion.downloader import parse_pdfs + return parse_pdfs(datetime.strptime(date, "%Y-%m-%d")) + + +def ingest_pdfs_task(date: str): + from verdictnet.ingestion.downloader import ingest_pdfs + from verdictnet.storage.transaction_manager import TransactionManager + from verdictnet.config import get_config + + transaction_manager = TransactionManager.get_transaction_manager(get_config()) + dataset_uuid = transaction_manager.init_dataset("Jurisprudencia") + return ingest_pdfs(date=datetime.strptime(date, "%Y-%m-%d"), + transaction_manager=transaction_manager, + dataset_uuid=dataset_uuid) + + # Define the DAG -dag = DAG( +with DAG( 'query_poderjudicial', default_args=default_args, description='Query www.poderjudicial.es and store results in JSON', - schedule_interval='@weekly', + schedule='@daily', catchup=True, -) - -# Define the Python function to query the API and save results -def query_poderjudicial(ds, **kwargs): - date_from = (datetime.strptime(ds, '%Y-%m-%d') - timedelta(days=7)).strftime('%Y-%m-%d') - date_to = ds - # - # url = 'https://www.poderjudicial.es/search/search.action' - # payload = { - # "action": "query", - # "sort": "IN_FECHARESOLUCION:decreasing", - # "recordsPerPage": "10", - # "databasematch": "AN", - # "start": "1", - # "FECHARESOLUCIONDESDE": date_from, - # "FECHARESOLUCIONHASTA": date_to, - # "TIPOINTERES_ACTUAL": "Actualidad", - # "TIPOORGANOPUB": "|11|12|13|14|15|16|" - # } - # headers = { - # 'Content-Type': 'application/json' - # } - # - # response = requests.post(url, json=payload, headers=headers) - # response.raise_for_status() - # - # results = response.json() - # output_path = f'/path/to/output/results_{date_from}_to_{date_to}.json' - # - # with open(output_path, 'w') as f: - # json.dump(results, f) - -# Define the task -query_task = PythonOperator( - task_id='query_poderjudicial_task', - provide_context=True, - python_callable=query_poderjudicial, - dag=dag, -) - -# Set the task in the DAG -query_task +): + item_pagination_task = PythonOperator( + task_id='get_item_pagination', + python_callable=get_item_pagination_task, + op_kwargs={'date': "{{ ds }}"}, + ) + + refine_pagination_task = PythonOperator( + task_id='refine_item_pagination', + python_callable=refine_item_pagination_task, + op_kwargs={'date': "{{ ds }}"}, + ) + + download_pdfs = PythonOperator( + task_id='download_pdfs', + python_callable=download_pdfs_task, + op_kwargs={'date': "{{ ds }}"}, + ) + + parse_pdfs = PythonOperator( + task_id='parse_pdfs', + python_callable=parse_pdfs_task, + op_kwargs={'date': "{{ ds }}"}, + ) + + ingest_pdfs = PythonOperator( + task_id='ingest_pdfs', + python_callable=ingest_pdfs_task, + op_kwargs={'date': "{{ ds }}"}, + ) + + item_pagination_task >> refine_pagination_task >> download_pdfs >> parse_pdfs >> ingest_pdfs + diff --git a/dags/profiler.py b/dags/profiler.py new file mode 100644 index 0000000..3e00ffe --- /dev/null +++ b/dags/profiler.py @@ -0,0 +1,10 @@ +import line_profiler + + +@line_profiler.profile +def execute(): + import jurisprudencia + + +if __name__ == "__main__": + execute() diff --git a/docker-compose.yml b/docker-compose.yml index 4f7dffe..b679f6a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,4 +1,110 @@ -version: '3.8' +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. +# +# WARNING: This configuration is for local development. Do not use it in a production deployment. +# +# This configuration supports basic configuration using environment variables or an .env file +# The following variables are supported: +# +# AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. +# Default: apache/airflow:2.10.4 +# AIRFLOW_UID - User ID in Airflow containers +# Default: 50000 +# AIRFLOW_PROJ_DIR - Base path to which all the files will be volumed. +# Default: . +# Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode +# +# _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested). +# Default: airflow +# _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested). +# Default: airflow +# _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers. +# Use this option ONLY for quick checks. Installing requirements at container +# startup is done EVERY TIME the service is started. +# A better way is to build a custom image or extend the official image +# as described in https://airflow.apache.org/docs/docker-stack/build.html. +# Default: '' +# +# Feel free to modify this file to suit your needs. +--- +x-airflow-common: + &airflow-common + # In order to add custom dependencies or upgrade provider packages you can use your extended image. + # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml + # and uncomment the "build" line below, Then run `docker-compose build` to build the images. + # image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.10.4} + build: . + env_file: + - config/local/.env + environment: + &airflow-common-env + AIRFLOW__CORE__EXECUTOR: CeleryExecutor + AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow + AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow + AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 + AIRFLOW__CORE__FERNET_KEY: ${FERNET_KEY} + AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' + AIRFLOW__CORE__LOAD_EXAMPLES: 'false' + AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session' + _AIRFLOW_WWW_USER_CREATE: 'true' + _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} + _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} + AIRFLOW_CONN_MINIO: '{ + "conn_type": "s3", + "login": "minioadmin", + "password": "minioadmin", + "host": "minio", + "port": 9000, + "schema": "http", + "extra": {"endpoint_url": "http://minio:9000"} + }' + # yamllint disable rule:line-length + # Use simple http server on scheduler for health checks + # See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server + # yamllint enable rule:line-length + AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true' + # WARNING: Use _PIP_ADDITIONAL_REQUIREMENTS option ONLY for a quick checks + # for other purpose (development, test and especially production usage) build/extend Airflow image. + _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} + # The following line can be used to set a custom config file, stored in the local config folder + # If you want to use it, outcomment it and replace airflow.cfg with the name of your config file + AIRFLOW_CONFIG: '/opt/airflow/config/airflow.cfg' + PYTHONPATH: /app:${PYTHONPATH:-} # Append custom path to existing PYTHONPATH + volumes: + - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags + # TODO: need to figure out what happens with dag_processor_manager and scheduler logs, that don't go to minio + #- ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs + - ${AIRFLOW_PROJ_DIR:-.}/config/local/:/opt/airflow/config + # TODO: Not needed ATM, but might be useful in the future + #- ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins + - ${PWD}/config/local/airflow.cfg:/opt/airflow/config/airflow.cfg + - ./src/verdictnet:/app/verdictnet # Mount local package directory to container + - ${PWD}/config/local/config.ini:/app/verdictnet/config.ini # Mount local config file to container + user: "${AIRFLOW_UID:-50000}:0" + depends_on: + &airflow-common-depends-on + redis: + condition: service_healthy + postgres: + condition: service_healthy + services: chromadb: @@ -7,7 +113,7 @@ services: ports: - "8000:8000" volumes: - - ${PWD}/chromadb_storage:/data + - ${PWD}/chromadb:/data chromadb-admin: image: fengzhichao/chromadb-admin:latest @@ -25,27 +131,162 @@ services: POSTGRES_USER: airflow POSTGRES_PASSWORD: airflow POSTGRES_DB: airflow + healthcheck: + test: [ "CMD", "pg_isready", "-U", "airflow" ] + interval: 10s + retries: 5 + start_period: 5s + restart: always volumes: - - ${PWD}/postgress_storage:/var/lib/postgresql/data + - ${PWD}/postgres:/var/lib/postgresql/data + + redis: + # Redis is limited to 7.2-bookworm due to licencing change + # https://redis.io/blog/redis-adopts-dual-source-available-licensing/ + image: redis:7.2-bookworm + expose: + - 6379 + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 10s + timeout: 30s + retries: 50 + start_period: 30s + restart: always + airflow-webserver: - image: apache/airflow:2.6.1 - container_name: airflow-webserver - environment: - - AIRFLOW__CORE__EXECUTOR=LocalExecutor - - AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow - - AIRFLOW__CORE__FERNET_KEY=${FERNET_KEY} - - AIRFLOW__WEBSERVER__RBAC=True + <<: *airflow-common + command: webserver ports: - "8080:8080" + healthcheck: + test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 30s + restart: always + depends_on: + <<: *airflow-common-depends-on + airflow-init: + condition: service_completed_successfully + + airflow-scheduler: + <<: *airflow-common + command: scheduler + healthcheck: + test: ["CMD", "curl", "--fail", "http://localhost:8974/health"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 30s + restart: always + depends_on: + <<: *airflow-common-depends-on + airflow-init: + condition: service_completed_successfully + + airflow-worker: + <<: *airflow-common + command: celery worker + healthcheck: + # yamllint disable rule:line-length + test: + - "CMD-SHELL" + - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}" || celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' + interval: 30s + timeout: 10s + retries: 5 + start_period: 30s + environment: + <<: *airflow-common-env + # Required to handle warm shutdown of the celery workers properly + # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation + DUMB_INIT_SETSID: "0" + restart: always + volumes: + - ${PWD}/cache:/cache + depends_on: + <<: *airflow-common-depends-on + airflow-init: + condition: service_completed_successfully + + airflow-triggerer: + <<: *airflow-common + command: triggerer + healthcheck: + test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"'] + interval: 30s + timeout: 10s + retries: 5 + start_period: 30s + restart: always depends_on: - - postgres - command: > - bash -c "airflow db init && airflow webserver" - env_file: - - .env + <<: *airflow-common-depends-on + airflow-init: + condition: service_completed_successfully + + airflow-init: + <<: *airflow-common + entrypoint: /bin/bash + # yamllint disable rule:line-length + command: + - -c + - | + if [[ -z "${AIRFLOW_UID}" ]]; then + echo + echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" + echo "If you are on Linux, you SHOULD follow the instructions below to set " + echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." + echo "For other operating systems you can get rid of the warning with manually created .env file:" + echo " See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user" + echo + fi + one_meg=1048576 + mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) + cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) + disk_available=$$(df / | tail -1 | awk '{print $$4}') + warning_resources="false" + if (( mem_available < 4000 )) ; then + echo + echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" + echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))" + echo + warning_resources="true" + fi + if (( cpus_available < 2 )); then + echo + echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" + echo "At least 2 CPUs recommended. You have $${cpus_available}" + echo + warning_resources="true" + fi + if (( disk_available < one_meg * 10 )); then + echo + echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" + echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))" + echo + warning_resources="true" + fi + if [[ $${warning_resources} == "true" ]]; then + echo + echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" + echo "Please follow the instructions to increase amount of resources available:" + echo " https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin" + echo + fi + mkdir -p /sources/logs /sources/dags /sources/plugins + chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} + exec airflow db migrate + # yamllint enable rule:line-length + environment: + <<: *airflow-common-env + _AIRFLOW_DB_MIGRATE: 'true' + _PIP_ADDITIONAL_REQUIREMENTS: '' + user: "0:0" volumes: - - ${PWD}/dags:/opt/airflow/dags # Mount the dags folder + - ${AIRFLOW_PROJ_DIR:-.}:/sources minio: image: minio/minio:latest @@ -58,7 +299,7 @@ services: - "9000:9000" # API - "9001:9001" # Console volumes: - - ${PWD}/data/:/data + - ${PWD}/minio/:/data command: server /data --console-address ":9001" neo4j: @@ -71,7 +312,3 @@ services: - "7687:7687" # Bolt protocol volumes: - ${PWD}/neo4j_data:/data - -volumes: - chromadb_data: - postgres_data: diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..050e475 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +log_level=INFO +log_cli=true \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..ce3e9fc --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1 @@ +line_profiler diff --git a/requirements.txt b/requirements.txt index d11ad23..b32ce8b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,29 +1,362 @@ -pathlib~=1.0.1 -fastapi~=0.115.5 -starlette~=0.40.0 -Werkzeug~=3.0.3 -requests~=2.32.0 -beautifulsoup4~=4.12.3 -tika~=2.6.0 -tqdm~=4.66.4 -setuptools~=70.0.0 -pytest~=8.2.0 -websocket-client~=1.8.0 -uvicorn~=0.29.0 -websockets~=12.0 -playwright~=1.43.0 -PyPDF2~=3.0.1 -openparse~=0.7.0 -chromadb~=0.5.20 -numpy~=2.1.3 -torch~=2.5.1 -slugify~=0.0.1 -configparser~=7.1.0 -python-slugify~=8.0.4 -fsspec~=2024.10.0 -semantic_pdf~=0.0.1 -pdfplumber~=0.11.4 -pandas~=2.2.3 -neomodel~=5.4.1 -neo4j~=5.26.0 -pytest-mock~=3.14.0 +acres==0.1.0 +aiobotocore==2.15.2 +aiofiles==23.2.1 +aiohappyeyeballs==2.4.4 +aiohttp==3.11.10 +aioitertools==0.12.0 +aiosignal==1.3.1 +aiosqlite==0.20.0 +alembic==1.14.1 +amqp==5.3.1 +annotated-types==0.7.0 +anyio==4.6.2.post1 +apache-airflow==2.10.4 +apache-airflow-providers-celery==3.8.5 +apache-airflow-providers-common-compat==1.3.0 +apache-airflow-providers-common-io==1.5.0 +apache-airflow-providers-common-sql==1.21.0 +apache-airflow-providers-fab==1.5.2 +apache-airflow-providers-ftp==3.12.0 +apache-airflow-providers-google==12.0.0 +apache-airflow-providers-http==5.0.0 +apache-airflow-providers-imap==3.8.0 +apache-airflow-providers-smtp==1.9.0 +apache-airflow-providers-sqlite==4.0.0 +apispec==6.8.1 +argcomplete==3.5.3 +asgiref==3.8.1 +attrs==24.2.0 +babel==2.16.0 +backoff==2.2.1 +bcrypt==4.2.1 +beautifulsoup4==4.12.3 +billiard==4.2.1 +blinker==1.9.0 +botocore==1.35.36 +build==1.2.2.post1 +cachelib==0.9.0 +cachetools==5.5.0 +captcha-solver==0.1.5 +cattrs==24.1.2 +celery==5.4.0 +certifi==2024.8.30 +cffi==1.17.1 +chardet==5.2.0 +charset-normalizer==3.4.0 +chroma-hnswlib==0.7.6 +chromadb==0.5.20 +ci-info==0.3.0 +click==8.1.7 +click-didyoumean==0.3.1 +click-plugins==1.1.1 +click-repl==0.3.0 +clickclick==20.10.2 +colorama==0.4.6 +coloredlogs==15.0.1 +colorlog==6.9.0 +configobj==5.0.9 +configparser==7.1.0 +ConfigUpdater==3.2 +connexion==2.14.2 +contourpy==1.3.1 +cron-descriptor==1.4.5 +croniter==6.0.0 +cryptography==44.0.0 +cycler==0.12.1 +dataclasses-json==0.6.7 +db-dtypes==1.4.0 +decorator==5.1.1 +Deprecated==1.2.15 +dill==0.3.9 +dirtyjson==1.0.8 +distro==1.9.0 +dnspython==2.7.0 +docstring_parser==0.16 +durationpy==0.9 +email_validator==2.2.0 +etelemetry==0.3.1 +fastapi==0.115.5 +fastapi-cli==0.0.5 +filelock==3.16.1 +filetype==1.2.0 +fitz==0.0.1.dev2 +Flask==2.2.5 +Flask-AppBuilder==4.5.2 +Flask-Babel==2.0.0 +Flask-Caching==2.3.0 +Flask-JWT-Extended==4.7.1 +Flask-Limiter==3.10.1 +Flask-Login==0.6.3 +Flask-Session==0.5.0 +Flask-SQLAlchemy==2.5.1 +Flask-WTF==1.2.2 +flatbuffers==24.3.25 +flower==2.0.1 +fonttools==4.55.0 +frozenlist==1.5.0 +fsspec +gcloud-aio-auth==5.3.2 +gcloud-aio-bigquery==7.1.0 +gcloud-aio-storage==9.3.0 +gcsfs +google-ads==25.1.0 +google-analytics-admin==0.23.3 +google-api-core==2.24.0 +google-api-python-client==2.159.0 +google-auth==2.36.0 +google-auth-httplib2==0.2.0 +google-auth-oauthlib==1.2.1 +google-cloud-aiplatform==1.78.0 +google-cloud-alloydb==0.4.1 +google-cloud-appengine-logging==1.5.0 +google-cloud-audit-log==0.3.0 +google-cloud-automl==2.15.0 +google-cloud-batch==0.17.33 +google-cloud-bigquery==3.29.0 +google-cloud-bigquery-datatransfer==3.18.0 +google-cloud-bigtable==2.28.1 +google-cloud-build==3.29.0 +google-cloud-compute==1.23.0 +google-cloud-container==2.55.0 +google-cloud-core==2.4.1 +google-cloud-datacatalog==3.24.1 +google-cloud-dataflow-client==0.8.15 +google-cloud-dataform==0.5.14 +google-cloud-dataplex==2.6.0 +google-cloud-dataproc==5.16.0 +google-cloud-dataproc-metastore==1.17.0 +google-cloud-dlp==3.26.0 +google-cloud-kms==3.2.2 +google-cloud-language==2.16.0 +google-cloud-logging==3.11.3 +google-cloud-memcache==1.11.0 +google-cloud-monitoring==2.26.0 +google-cloud-orchestration-airflow==1.16.1 +google-cloud-os-login==2.16.0 +google-cloud-pubsub==2.27.3 +google-cloud-redis==2.17.0 +google-cloud-resource-manager==1.14.0 +google-cloud-run==0.10.14 +google-cloud-secret-manager==2.22.1 +google-cloud-spanner==3.51.0 +google-cloud-speech==2.30.0 +google-cloud-storage==2.19.0 +google-cloud-storage-transfer==1.15.0 +google-cloud-tasks==2.18.0 +google-cloud-texttospeech==2.24.0 +google-cloud-translate==3.19.0 +google-cloud-videointelligence==2.15.0 +google-cloud-vision==3.9.0 +google-cloud-workflows==1.16.0 +google-crc32c==1.6.0 +google-re2==1.1.20240702 +google-resumable-media==2.7.2 +googleapis-common-protos==1.66.0 +greenlet==3.0.3 +grpc-google-iam-v1==0.14.0 +grpc-interceptor==0.15.4 +grpcio==1.70.0 +grpcio-gcp==0.2.2 +grpcio-status==1.70.0 +gunicorn==23.0.0 +h11==0.14.0 +httpcore==1.0.7 +httplib2==0.22.0 +httptools==0.6.4 +httpx==0.27.2 +huggingface-hub==0.26.2 +humanfriendly==10.0 +humanize==4.11.0 +idna==3.10 +immutabledict==4.2.1 +importlib_metadata==8.5.0 +importlib_resources==6.4.5 +inflection==0.5.1 +iniconfig==2.0.0 +isodate==0.6.1 +itsdangerous==2.2.0 +Jinja2==3.1.4 +jiter==0.8.0 +jmespath==1.0.1 +joblib==1.4.2 +json-merge-patch==0.2 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +kiwisolver==1.4.7 +kombu==5.4.2 +kubernetes==31.0.0 +lazy-object-proxy==1.10.0 +limits==4.0.1 +linkify-it-py==2.0.3 +llama-cloud==0.1.6 +llama-index==0.12.5 +llama-index-agent-openai==0.4.0 +llama-index-cli==0.4.0 +llama-index-core==0.12.5 +llama-index-embeddings-openai==0.3.1 +llama-index-indices-managed-llama-cloud==0.6.3 +llama-index-legacy==0.9.48.post4 +llama-index-llms-openai==0.3.10 +llama-index-multi-modal-llms-openai==0.4.0 +llama-index-program-openai==0.3.1 +llama-index-question-gen-openai==0.3.0 +llama-index-readers-file==0.4.1 +llama-index-readers-llama-parse==0.4.0 +llama-parse==0.5.17 +lockfile==0.12.2 +looker-sdk==25.0.0 +looseversion==1.3.0 +lxml==5.3.0 +Mako==1.3.8 +markdown-it-py==3.0.0 +MarkupSafe==3.0.2 +marshmallow==3.23.1 +marshmallow-oneofschema==3.1.1 +marshmallow-sqlalchemy==0.28.2 +matplotlib==3.9.2 +mdit-py-plugins==0.4.2 +mdurl==0.1.2 +methodtools==0.4.7 +mmh3==5.0.1 +monotonic==1.6 +more-itertools==10.6.0 +mpmath==1.3.0 +multidict==6.1.0 +mypy-extensions==1.0.0 +neo4j==5.26.0 +neomodel==5.4.1 +nest-asyncio==1.6.0 +networkx==3.4.2 +nibabel==5.3.2 +nipype==1.9.1 +nltk==3.9.1 +numpy==1.26.4 +oauthlib==3.2.2 +onnxruntime==1.20.1 +openai==1.57.2 +openparse==0.7.0 +ordered-set==4.1.0 +orjson==3.10.12 +overrides==7.7.0 +packaging==24.2 +pandas==2.1.4 +pandas-gbq==0.26.1 +pathlib==1.0.1 +pathspec==0.12.1 +pdf2image==1.17.0 +pdfminer.six==20231228 +pdfplumber==0.11.4 +pendulum==3.0.0 +pillow==11.0.0 +playwright==1.43.0 +pluggy==1.5.0 +posthog==3.7.3 +prison==0.2.1 +prometheus_client==0.21.1 +prompt_toolkit==3.0.50 +propcache==0.2.1 +proto-plus==1.25.0 +protobuf==5.29.0 +prov==2.0.1 +psutil==6.1.1 +puremagic==1.28 +pyarrow==18.1.0 +pyasn1==0.6.1 +pyasn1_modules==0.4.0 +pycparser==2.22 +pydantic==2.10.2 +pydantic_core==2.27.1 +pydata-google-auth==1.9.1 +pydot==3.0.2 +pyee==11.1.0 +Pygments==2.18.0 +PyJWT==2.10.1 +PyMuPDF==1.24.14 +pyOpenSSL==25.0.0 +pyparsing==3.2.0 +pypdf==5.1.0 +PyPDF2==3.0.1 +pypdfium2==4.30.0 +PyPika==0.48.9 +pyproject_hooks==1.2.0 +pytesseract==0.3.13 +pytest==8.2.2 +pytest-mock==3.14.0 +python-daemon==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-nvd3==0.16.0 +python-slugify==8.0.4 +pytz==2024.2 +pyxnat==1.6.2 +PyYAML==6.0.2 +rdflib==6.3.2 +redis==5.2.1 +referencing==0.36.2 +regex==2024.11.6 +reportlab==4.2.5 +requests==2.32.3 +requests-oauthlib==2.0.0 +requests-toolbelt==1.0.0 +rfc3339-validator==0.1.4 +rich==13.9.4 +rich-argparse==1.6.0 +rpds-py==0.22.3 +rsa==4.9 +s3fs==2024.10.0 +safetensors==0.4.5 +scikit-learn==1.5.2 +scipy==1.14.1 +sentence-transformers==3.3.1 +setproctitle==1.3.4 +setuptools==70.0.0 +shapely==2.0.6 +shellingham==1.5.4 +simplejson==3.19.3 +six==1.16.0 +slugify==0.0.1 +sniffio==1.3.1 +soupsieve==2.6 +SQLAlchemy==1.4.54 +sqlalchemy-bigquery==1.12.1 +SQLAlchemy-JSONField==1.0.2 +sqlalchemy-spanner==1.8.0 +SQLAlchemy-Utils==0.41.2 +sqlparse==0.5.3 +starlette==0.40.0 +striprtf==0.0.26 +sympy==1.13.1 +tabulate==0.9.0 +tenacity==8.5.0 +termcolor==2.5.0 +text-unidecode==1.3 +threadpoolctl==3.5.0 +tika==2.6.0 +tiktoken==0.8.0 +time-machine==2.16.0 +tokenizers==0.20.3 +torch==2.5.1 +tornado==6.4.2 +tqdm==4.66.6 +traits==6.4.3 +transformers==4.46.3 +typer==0.13.1 +typing-inspect==0.9.0 +typing_extensions==4.12.2 +tzdata==2024.2 +uc-micro-py==1.0.3 +universal_pathlib==0.2.6 +uritemplate==4.1.1 +urllib3==2.2.3 +uvicorn==0.29.0 +uvloop==0.21.0 +vine==5.1.0 +watchfiles==1.0.0 +wcwidth==0.2.13 +websocket-client==1.8.0 +websockets==12.0 +wirerope==1.0.0 +wrapt==1.17.0 +WTForms==3.2.1 +yarl==1.18.3 +zipp==3.21.0 diff --git a/setup.cfg b/setup.cfg index 09a51e2..8a6db70 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -name = semantic +name = verdictnet version = 0.0.1 description = A tool for running semantic queries on Spanish Law author = Alex Monras @@ -7,7 +7,7 @@ license = MIT [options] package_dir= - =src + =verdictnet packages = find: install_requires = numpy @@ -19,11 +19,18 @@ install_requires = matplotlib pyarrow PyMuPDF - wordcloud + +[options.package_data] +verdictnet.ingestion = + resources/*.json +verdictnet.frontend = + static/css/* + static/js/* + templates/* [options.packages.find] where=src [options.entry_points] console_scripts = - semantic = cli:main + verdictnet = verdictnet.cli:main diff --git a/setup.py b/setup.py index 726ea4c..8339b5a 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,11 @@ from setuptools import setup, find_packages setup( - name='semantic_pdf', + name='verdictnet', version='0.0.1', - description='A tool to do semantic_pdf queries on large documents', + description='A tool to do verdictnet queries on large documents', author='Alex Monras', license='MIT', - packages=find_packages('src') + packages=find_packages(where='src'), + package_dir={'': 'src'} ) diff --git a/src/config.py b/src/config.py deleted file mode 100644 index 39a6583..0000000 --- a/src/config.py +++ /dev/null @@ -1,56 +0,0 @@ -import configparser -import logging -import os -from pathlib import Path -from typing import Optional - -import fsspec - -logging.basicConfig( - level=logging.INFO, - # Define the format of the logs, include the function name and line number - format="%(asctime)s - %(name)s.%(funcName)s [%(levelname)s]: %(message)s", -) - - -def root_path(): - return Path(__file__).parent.parent - - -def get_config(): - # Create a ConfigParser instance - config = configparser.ConfigParser() - - # Load the configuration file - config.read(root_path() / 'config.ini') - - return config - - -def configure_fsspec(): - config = get_config() - - s3_config = { - "key": os.getenv("AWS_ACCESS_KEY_ID", config.get('s3', 'key')), - "secret": os.getenv("AWS_SECRET_ACCESS_KEY", config.get('s3', 'secret')), - "client_kwargs": { - "endpoint_url": os.getenv("S3_ENDPOINT", config.get('s3', 'endpoint_url')) - } - } - - fsspec.config.conf = { - "s3": s3_config - } - - -# Call the function to configure fsspec -configure_fsspec() - - -def get_fs(conf: Optional[configparser.ConfigParser] = None): - conf = conf or get_config() - if conf['storage']['type'] == 's3': - fs = fsspec.filesystem("s3") - else: - fs = fsspec.filesystem("file") - return fs diff --git a/src/etl.py b/src/etl.py deleted file mode 100644 index c5b755b..0000000 --- a/src/etl.py +++ /dev/null @@ -1,145 +0,0 @@ -import os -from pathlib import Path -from typing import List - -import requests -from slugify import slugify -from bs4 import BeautifulSoup - -from config import get_config, root_path, get_fs -from ingestion.documentspec import DocumentSpec -from ingestion.paths import refined_path -from models.node import Node -from ingestion.parsers.html_parser import parse -from storage.chroma_storage import ChromaStorage -from storage.hybrid_storage import HybridStorage -from storage.transaction_manager import TransactionManager - - -def get_docspecs(path: Path = None) -> List[Path]: - """ - Return all the docspecs in the resources folder - """ - if path is None: - path = Path(__file__).parent / 'ingestion' / 'resources/' - filenames = [] - for root, subdirs, files in os.walk(path): - for file in files: - if file.endswith('.json'): - filenames.append(Path(root + '/' + file)) - - if not filenames: - print(f"No document Spec filed found in provided folder {path}") - - return filenames - - -def get_files(path: Path, extension: str = None, subfolders=False) -> List[Path]: - """ - Return all the files in the resources folder - """ - filenames = [] - for root, subdirs, files in os.walk(path): - for file in files: - if file.endswith(extension): - filenames.append(Path(root + '/' + file)) - if not subfolders: - break - - if not filenames: - print(f"No files found in provided folder {path}") - - return filenames - - -def download(docspec: DocumentSpec) -> str: - # Hacer la solicitud HTTP - response = requests.get(docspec.url) - if response.status_code != 200: - raise Exception(f"No se pudo acceder al texto consolidado. Código de estado: {response.status_code}") - - return response.text - - -def get_document_structure(text, docspec: DocumentSpec) -> List[Node]: - soup = BeautifulSoup(text, "html.parser") - tags = soup.findAll(docspec.tags) - parsed = parse(tags, docspec=docspec, levels=docspec.wraps or [docspec.head]) - if len(parsed) > 1: - parsed = [Node(level=docspec.head, content=docspec.name, children=parsed)] - return parsed - - -def ingest(main_node, docspec: DocumentSpec, storage: TransactionManager): - all_nodes = main_node.get_all(level=docspec.embed_level) - - storage.store_with_transaction(main_node) - - -def clean(): - print("This will empty both databases. Are you sure you want to continue? (y/n)") - response = input() - if response.lower() != 'y': - print("Aborted.") - return - - conf = get_config() - - store = HybridStorage.get_hybrid_storage(conf) - store.delete_all() - print(f"Cleared all databases.") - - -def run(force_download=False, path=None): - conf = get_config() - - transaction_manager = TransactionManager.get_transaction_manager(conf) - - # Load Docspecs - filenames = get_docspecs(path) - docspecs = [DocumentSpec.load(filename) for filename in filenames] - - for docspec in docspecs: - - slug_name = slugify(docspec.name) - - # Download documents - target_filename = f'{slug_name}.html' - raw_path = root_path() / conf['storage']['raw'] / target_filename - - if force_download or not os.path.exists(raw_path): - print(f"Downloading document `{docspec.name}`...") - text = download(docspec) - - os.makedirs(os.path.dirname(raw_path), exist_ok=True) - with open(raw_path, 'w') as file: - file.write(text) - else: - with open(raw_path, 'r') as file: - text = file.read() - - # Refining documents - target = refined_path() + f'{slug_name}.json' - - main_node = get_document_structure(text, docspec=docspec) - - main_node[0].save(target) - print(f"Saved refined in '{target_filename}'.") - - # Saving json - html_path = root_path() / conf['storage']['html'] / f'{slug_name}.html' - os.makedirs(os.path.dirname(html_path), exist_ok=True) - with open(html_path, 'w', encoding='utf-8') as file: - file.write(main_node[0].html( - preamble=""" - - """ - )) - print(f"HTML saved to '{slug_name}.html'.") - - # Ingesting into vector database - ingest(main_node[0], docspec, storage=transaction_manager) - - -if __name__ == "__main__": - run() diff --git a/src/__init__.py b/src/verdictnet/__init__.py similarity index 100% rename from src/__init__.py rename to src/verdictnet/__init__.py diff --git a/src/cli.py b/src/verdictnet/cli.py similarity index 92% rename from src/cli.py rename to src/verdictnet/cli.py index 2ef9e44..d1db693 100644 --- a/src/cli.py +++ b/src/verdictnet/cli.py @@ -1,6 +1,6 @@ import argparse -import query +from verdictnet import query def main(): @@ -40,13 +40,13 @@ def main(): def handle_etl(args): - import etl + from verdictnet import etl if args.subcommand == "clean": etl.clean() elif args.subcommand == "run": etl.run(force_download=args.force, path=args.path) else: - print("No {args.subcommand} subcommand found.") + print(f"No {args.subcommand} subcommand found.") def handle_query(args): @@ -54,7 +54,7 @@ def handle_query(args): def handle_server(args): - from frontend.server import server + from verdictnet.frontend.server import server server.main() diff --git a/src/verdictnet/config.py b/src/verdictnet/config.py new file mode 100644 index 0000000..cbe587c --- /dev/null +++ b/src/verdictnet/config.py @@ -0,0 +1,80 @@ +import configparser +import json +import logging +import os +from pathlib import Path +from typing import Optional + +import fsspec +from airflow.hooks.base import BaseHook + +logging.basicConfig( + level=logging.INFO, + # Define the format of the logs, include the function name and line number + format="%(asctime)s - %(name)s.%(funcName)s [%(levelname)s]: %(message)s", +) + +logger = logging.getLogger(__name__) + +_fsspec_configured = False + + +def root_path(): + return Path(__file__).parent + + +def get_config(): + # Create a ConfigParser instance + config = configparser.ConfigParser() + + # Load the configuration file from the current folder, or from the package root folder + files = config.read(filenames=['config.ini', root_path() / 'config.ini']) + logger.info("Successfully loaded config files: %s", files) + + return config + + +def configure_fsspec(): + global _fsspec_configured + if _fsspec_configured: + return + + config = get_config() + + # Check if running within Airflow + if 'AIRFLOW_HOME' in os.environ: + # Retrieve the connection details from Airflow + connection = BaseHook.get_connection('minio') + s3_config = { + "key": connection.login, + "secret": connection.password, + "client_kwargs": { + "endpoint_url": connection.extra_dejson.get('endpoint_url') + } + } + else: + s3_config = { + "key": os.getenv("AWS_ACCESS_KEY_ID", config.get('s3', 'key')), + "secret": os.getenv("AWS_SECRET_ACCESS_KEY", config.get('s3', 'secret')), + "client_kwargs": { + "endpoint_url": os.getenv("S3_ENDPOINT", config.get('s3', 'endpoint_url')) + } + } + + fsspec.config.conf = { + "s3": s3_config + } + + # TODO: Warning. This is potentially logging sensitive passwords. Password obfuscation should be implemented. + logger.info("fsspec configured with: %s", json.dumps(fsspec.config.conf)) + _fsspec_configured = True + + +def get_fs(conf: Optional[configparser.ConfigParser] = None): + configure_fsspec() + conf = conf or get_config() + if conf['storage']['type'] == 's3': + fs = fsspec.filesystem("s3") + else: + fs = fsspec.filesystem("file") + return fs diff --git a/src/embedding.py b/src/verdictnet/embedding.py similarity index 69% rename from src/embedding.py rename to src/verdictnet/embedding.py index 281c23f..2fd3a5f 100644 --- a/src/embedding.py +++ b/src/verdictnet/embedding.py @@ -1,26 +1,25 @@ -import argparse import configparser -from typing import List, Union, Tuple, Optional +from typing import List, Union, Optional from numpy import ndarray -from sentence_transformers import SentenceTransformer -from torch import Tensor -from config import root_path, get_config -from models.node import Node +from verdictnet.config import root_path, get_config +from verdictnet.models.node import Node class Embedding: def __init__(self, conf: Optional[configparser.ConfigParser] = None): + from sentence_transformers import SentenceTransformer + self.conf = conf or get_config() # Load a pre-trained model - self.model = SentenceTransformer( # Lightweight, fast model + self.model = SentenceTransformer( self.conf.get('embedding', 'model_name_or_path'), - cache_folder=root_path() / self.conf.get('embedding', 'cache') + cache_folder=self.conf.get('embedding', 'cache') ) def embed_nodes(self, nodes: List[Node]) -> tuple[ - List[Union[List[Tensor], ndarray, Tensor]], + List[Union[List, ndarray]], List[str], List[dict] ]: @@ -43,5 +42,5 @@ def embed_nodes(self, nodes: List[Node]) -> tuple[ return embeddings, documents, metadata - def embed_string(self, text: str) -> Tensor: + def embed_string(self, text: str): return self.model.encode(text).tolist() diff --git a/src/verdictnet/etl.py b/src/verdictnet/etl.py new file mode 100644 index 0000000..92660f1 --- /dev/null +++ b/src/verdictnet/etl.py @@ -0,0 +1,183 @@ +import os +from configparser import ConfigParser +from pathlib import Path +from typing import List + +import requests +from slugify import slugify +from bs4 import BeautifulSoup + +from verdictnet.config import get_config, root_path, logging, get_fs +from verdictnet.ingestion.documentspec import DocumentSpec +from verdictnet.ingestion.paths import refined_path, raw_path, html_path +from verdictnet.models.node import Node +from verdictnet.ingestion.parsers.html_parser import parse +from verdictnet.storage.hybrid_storage import HybridStorage +from verdictnet.storage.transaction_manager import TransactionManager + +logger = logging.getLogger(__name__) + + +def get_docspecs(path: Path = None) -> List[Path]: + """ + Return all the docspecs in the resources folder + """ + if path is None: + path = Path(__file__).parent / 'ingestion' / 'resources/' + filenames = [] + for root, subdirs, files in os.walk(path): + for file in files: + if file.endswith('.json'): + filenames.append(Path(root + '/' + file)) + + if not filenames: + logger.warning(f"No document Spec files found in provided folder {path}") + else: + logger.info(f"Found {len(filenames)} document specs: %s", ",".join([str(f) for f in filenames])) + + return filenames + + +def get_files(path: Path, extension: str = None, subfolders=False) -> List[Path]: + """ + Return all the files in the resources folder + """ + filenames = [] + for root, subdirs, files in os.walk(path): + for file in files: + if file.endswith(extension): + filenames.append(Path(root + '/' + file)) + if not subfolders: + break + + if not filenames: + print(f"No files found in provided folder {path}") + + return filenames + + +def download(docspec: DocumentSpec) -> str: + # Hacer la solicitud HTTP + response = requests.get(docspec.url) + if response.status_code != 200: + raise Exception(f"No se pudo acceder al texto consolidado. Código de estado: {response.status_code}") + + return response.text + + +def get_document_structure(text, docspec: DocumentSpec) -> List[Node]: + soup = BeautifulSoup(text, "html.parser") + tags = soup.findAll(docspec.tags) + parsed = parse(tags, docspec=docspec, levels=docspec.wraps or [docspec.head]) + if len(parsed) > 1: + parsed = [Node(level=docspec.head, content=docspec.name, children=parsed)] + return parsed + + +def download_doc(docspec: DocumentSpec, conf, force_download=False): + """ + Download the document and save it to the raw folder in html format + """ + fs = get_fs(conf) + + slug_name = slugify(docspec.name) + + target_path = raw_path() + f'{slug_name}.html' + + # Download documents + if force_download or not os.path.exists(target_path): + logger.info(f"Downloading document `{docspec.name}`...") + text = download(docspec) + + os.makedirs(os.path.dirname(target_path), exist_ok=True) + with fs.open(target_path, 'w') as file: + file.write(text) + + +def refine(docspec, conf): + """ + Take the document in html format and refine it to a json format + """ + fs = get_fs(conf) + + slug_name = slugify(docspec.name) + + source_filename = f'{slug_name}.html' + soure_path = raw_path() + source_filename + + with fs.open(soure_path, 'r') as file: + text = file.read() + + target = refined_path() + f'{slug_name}.json' + + main_node = get_document_structure(text, docspec=docspec) + + main_node[0].save(target) + logger.info(f"Saved refined in '{target}'.") + + +def render_html(docspec: DocumentSpec, conf: ConfigParser): + fs = get_fs(conf) + + slug_name = slugify(docspec.name) + main_node_path = refined_path() + f'{slug_name}.json' + main_node = Node.load(main_node_path) + + html_file = html_path() + f'{slug_name}.html' + with fs.open(html_file, 'w', encoding='utf-8') as file: + file.write(main_node.html( + preamble=""" + + """ + )) + logger.info(f"HTML saved to '{html_file}'.") + + +def ingest(docspec: DocumentSpec, conf: ConfigParser): + slug_name = slugify(docspec.name) + main_node_path = refined_path() + f'{slug_name}.json' + main_node = Node.load(main_node_path) + + storage = TransactionManager.get_transaction_manager(conf) + # all_nodes = main_node.get_all(level=docspec.embed_level) + + storage.store_with_transaction(main_node) + + +def clean(): + print("This will empty both databases. Are you sure you want to continue? (y/n)") + response = input() + if response.lower() != 'y': + print("Aborted.") + return + + conf = get_config() + + store = HybridStorage.get_hybrid_storage(conf) + store.delete_all() + print(f"Cleared all databases.") + + +def run(force_download=False, path=None): + conf = get_config() + + # Load Docspecs + filenames = get_docspecs(path) + docspecs = [DocumentSpec.load(filename) for filename in filenames] + + for docspec in docspecs: + # Download document + download_doc(docspec, conf, force_download) + + # Refining documents + refine(docspec, conf) + + # Render html + render_html(docspec, conf) + + # Ingesting into vector database + ingest(docspec, conf) + + +if __name__ == "__main__": + run() diff --git a/src/frontend/__init__.py b/src/verdictnet/frontend/__init__.py similarity index 100% rename from src/frontend/__init__.py rename to src/verdictnet/frontend/__init__.py diff --git a/src/frontend/custom_logger.py b/src/verdictnet/frontend/custom_logger.py similarity index 100% rename from src/frontend/custom_logger.py rename to src/verdictnet/frontend/custom_logger.py diff --git a/src/frontend/paths.py b/src/verdictnet/frontend/paths.py similarity index 100% rename from src/frontend/paths.py rename to src/verdictnet/frontend/paths.py diff --git a/src/frontend/server/__init__.py b/src/verdictnet/frontend/server/__init__.py similarity index 100% rename from src/frontend/server/__init__.py rename to src/verdictnet/frontend/server/__init__.py diff --git a/src/frontend/server/app.py b/src/verdictnet/frontend/server/app.py similarity index 100% rename from src/frontend/server/app.py rename to src/verdictnet/frontend/server/app.py diff --git a/src/ingestion/__init__.py b/src/verdictnet/frontend/server/dto/__init__.py similarity index 100% rename from src/ingestion/__init__.py rename to src/verdictnet/frontend/server/dto/__init__.py diff --git a/src/frontend/server/dto/websocket.py b/src/verdictnet/frontend/server/dto/websocket.py similarity index 100% rename from src/frontend/server/dto/websocket.py rename to src/verdictnet/frontend/server/dto/websocket.py diff --git a/src/frontend/server/server.py b/src/verdictnet/frontend/server/server.py similarity index 88% rename from src/frontend/server/server.py rename to src/verdictnet/frontend/server/server.py index 0fa4907..3c667ad 100644 --- a/src/frontend/server/server.py +++ b/src/verdictnet/frontend/server/server.py @@ -9,14 +9,14 @@ from starlette.staticfiles import StaticFiles from starlette.websockets import WebSocketDisconnect -from config import get_config -from etl import get_files -from frontend import paths -from ragagent import RAGAgent -from frontend.server.websocket import Connection -from frontend.server.dto.websocket import ConnectionId, DisplayDocuments -from render.html import HTMLRenderer -from storage.hybrid_storage import HybridStorage +from verdictnet.config import get_config +from verdictnet.etl import get_files +from verdictnet.frontend import paths +from verdictnet.ragagent import RAGAgent +from verdictnet.frontend.server.websocket import Connection +from verdictnet.frontend.server.dto.websocket import ConnectionId, DisplayDocuments +from verdictnet.render.html import HTMLRenderer +from verdictnet.storage.hybrid_storage import HybridStorage conf = get_config() diff --git a/src/frontend/server/websocket.py b/src/verdictnet/frontend/server/websocket.py similarity index 94% rename from src/frontend/server/websocket.py rename to src/verdictnet/frontend/server/websocket.py index 3d3cc3b..7823c9d 100644 --- a/src/frontend/server/websocket.py +++ b/src/verdictnet/frontend/server/websocket.py @@ -5,9 +5,9 @@ from PyPDF2 import PdfFileReader from starlette.websockets import WebSocket -from frontend.paths import uploads -from ragagent import RAGAgent -from frontend.server.dto.websocket import WebSocketMessage, ChatQueryMessage, FileUploaded, ChatResponseMessage, \ +from verdictnet.frontend.paths import uploads +from verdictnet.ragagent import RAGAgent +from verdictnet.frontend.server.dto.websocket import WebSocketMessage, ChatQueryMessage, FileUploaded, ChatResponseMessage, \ UnfoldNodes diff --git a/src/frontend/static/css/document_tree.css b/src/verdictnet/frontend/static/css/document_tree.css similarity index 100% rename from src/frontend/static/css/document_tree.css rename to src/verdictnet/frontend/static/css/document_tree.css diff --git a/src/frontend/static/css/style.css b/src/verdictnet/frontend/static/css/style.css similarity index 100% rename from src/frontend/static/css/style.css rename to src/verdictnet/frontend/static/css/style.css diff --git a/src/frontend/static/js/main.js b/src/verdictnet/frontend/static/js/main.js similarity index 100% rename from src/frontend/static/js/main.js rename to src/verdictnet/frontend/static/js/main.js diff --git a/src/frontend/templates/index.html b/src/verdictnet/frontend/templates/index.html similarity index 100% rename from src/frontend/templates/index.html rename to src/verdictnet/frontend/templates/index.html diff --git a/src/ingestion/README.md b/src/verdictnet/ingestion/README.md similarity index 98% rename from src/ingestion/README.md rename to src/verdictnet/ingestion/README.md index 239f454..fef01c9 100644 --- a/src/ingestion/README.md +++ b/src/verdictnet/ingestion/README.md @@ -2,7 +2,7 @@ There are currently two ingestion processes in place: ```sh -$ semantic erl run +$ verdictnet erl run ``` ingests the documents specified in `src/ingestion/resources/`, namely: - Código Civil diff --git a/src/ingestion/parsers/__init__.py b/src/verdictnet/ingestion/__init__.py similarity index 100% rename from src/ingestion/parsers/__init__.py rename to src/verdictnet/ingestion/__init__.py diff --git a/src/ingestion/documentspec.py b/src/verdictnet/ingestion/documentspec.py similarity index 100% rename from src/ingestion/documentspec.py rename to src/verdictnet/ingestion/documentspec.py diff --git a/src/ingestion/downloader.py b/src/verdictnet/ingestion/downloader.py similarity index 96% rename from src/ingestion/downloader.py rename to src/verdictnet/ingestion/downloader.py index 529de2a..82c7df3 100644 --- a/src/ingestion/downloader.py +++ b/src/verdictnet/ingestion/downloader.py @@ -9,11 +9,11 @@ from bs4 import BeautifulSoup from tqdm import tqdm -from ingestion.parsers.pdf_parser import extract_paragraphs -from ingestion.paths import raw_path, refined_path, fsspec_walk -from config import get_config, logging, get_fs -from models.node import Node -from storage.transaction_manager import TransactionManager +from verdictnet.ingestion.parsers.pdf_parser import extract_paragraphs +from verdictnet.ingestion.paths import raw_path, refined_path, fsspec_walk +from verdictnet.config import get_config, logging, get_fs +from verdictnet.models.node import Node +from verdictnet.storage.transaction_manager import TransactionManager logger = logging.getLogger(__name__) @@ -88,6 +88,7 @@ def create_session(): "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:132.0) Gecko/20100101 Firefox/132.0" } + logger.info("Creating session: %s", f"{BASE_URL}/search/") response = session.get(f"{BASE_URL}/search/", headers=headers_dict) if response.status_code != 200: @@ -307,7 +308,6 @@ def ingest_pdfs(date: datetime, transaction_manager: TransactionManager, force=F transaction_manager.store_with_transaction(nodes, parent_uuid=dataset_uuid) - if __name__ == "__main__": start_date = datetime.today() - timedelta(days=20) end_date = datetime.today() - timedelta(days=1) diff --git a/src/render/__init__.py b/src/verdictnet/ingestion/parsers/__init__.py similarity index 100% rename from src/render/__init__.py rename to src/verdictnet/ingestion/parsers/__init__.py diff --git a/src/ingestion/parsers/html_parser.py b/src/verdictnet/ingestion/parsers/html_parser.py similarity index 96% rename from src/ingestion/parsers/html_parser.py rename to src/verdictnet/ingestion/parsers/html_parser.py index 30db5f0..f152bc8 100644 --- a/src/ingestion/parsers/html_parser.py +++ b/src/verdictnet/ingestion/parsers/html_parser.py @@ -1,5 +1,5 @@ -from ingestion.documentspec import DocumentSpec -from models.node import Node +from verdictnet.ingestion.documentspec import DocumentSpec +from verdictnet.models.node import Node def next_class(tags): diff --git a/src/ingestion/parsers/pdf_parser.py b/src/verdictnet/ingestion/parsers/pdf_parser.py similarity index 96% rename from src/ingestion/parsers/pdf_parser.py rename to src/verdictnet/ingestion/parsers/pdf_parser.py index 5c69312..0876e1e 100644 --- a/src/ingestion/parsers/pdf_parser.py +++ b/src/verdictnet/ingestion/parsers/pdf_parser.py @@ -1,6 +1,6 @@ import pdfplumber -from models.node import Node +from verdictnet.models.node import Node def extract_paragraphs(pdf_path) -> Node: diff --git a/src/ingestion/paths.py b/src/verdictnet/ingestion/paths.py similarity index 74% rename from src/ingestion/paths.py rename to src/verdictnet/ingestion/paths.py index b0a4224..7d2e06f 100644 --- a/src/ingestion/paths.py +++ b/src/verdictnet/ingestion/paths.py @@ -2,7 +2,7 @@ import fsspec -from config import root_path, get_config, logging +from verdictnet.config import root_path, get_config, logging logger = logging.getLogger(__name__) @@ -43,3 +43,14 @@ def refined_path(): return root_path() / conf['storage']['bucket'] / conf['storage']['refined'] elif conf['storage']['type'] == 's3': return f"s3://{conf['storage']['bucket']}/{conf['storage']['refined']}" + + +def html_path(): + """ + Return the path where we store refined objects as JSON files ready to be ingested + into the database + """ + if conf['storage']['type'] == 'local': + return root_path() / conf['storage']['bucket'] / conf['storage']['html'] + elif conf['storage']['type'] == 's3': + return f"s3://{conf['storage']['bucket']}/{conf['storage']['html']}" diff --git a/src/ingestion/resources/codigo_civil.json b/src/verdictnet/ingestion/resources/codigo_civil.json similarity index 100% rename from src/ingestion/resources/codigo_civil.json rename to src/verdictnet/ingestion/resources/codigo_civil.json diff --git a/src/ingestion/resources/codigo_penal.json b/src/verdictnet/ingestion/resources/codigo_penal.json similarity index 100% rename from src/ingestion/resources/codigo_penal.json rename to src/verdictnet/ingestion/resources/codigo_penal.json diff --git a/src/verdictnet/models/__init__.py b/src/verdictnet/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/models/node.py b/src/verdictnet/models/node.py similarity index 91% rename from src/models/node.py rename to src/verdictnet/models/node.py index c1ee23a..dd2b24e 100644 --- a/src/models/node.py +++ b/src/verdictnet/models/node.py @@ -3,7 +3,7 @@ from dataclasses import dataclass, field from typing import List -from config import get_fs +from verdictnet.config import get_fs class AutoIncrement: # pylint: disable=too-few-public-methods @@ -83,13 +83,16 @@ def save(self, path): with fs.open(path, 'w', encoding='utf8') as file: json.dump(self.json(), file, indent=4, ensure_ascii=False) - def load(self, path): + @classmethod + def load(cls, path): """ Load the node from a file """ - with open(path, 'r', encoding='utf8') as file: - data = json.load(file, ensure_ascii=False) - return Node(**data) + fs = get_fs() + + with fs.open(path, 'r', encoding='utf8') as file: + data = json.load(file) + return Node.from_dict(data) @classmethod def from_dict(cls, data): @@ -99,6 +102,7 @@ def from_dict(cls, data): return Node( id=data['id'], level=data['level'], + uuid=data['uuid'], content=data['content'], children=[cls.from_dict(child) for child in data['children']] ) diff --git a/src/query.py b/src/verdictnet/query.py similarity index 90% rename from src/query.py rename to src/verdictnet/query.py index 142c794..3763911 100644 --- a/src/query.py +++ b/src/verdictnet/query.py @@ -1,9 +1,9 @@ import os import textwrap -from render.plain_text import PlainTextRenderer -from storage.chroma_storage import ChromaStorage -from storage.hybrid_storage import HybridStorage +from verdictnet.render.plain_text import PlainTextRenderer +from verdictnet.storage.chroma_storage import ChromaStorage +from verdictnet.storage.hybrid_storage import HybridStorage def query(q_string: str, n_results: int = 3): diff --git a/src/ragagent.py b/src/verdictnet/ragagent.py similarity index 84% rename from src/ragagent.py rename to src/verdictnet/ragagent.py index 6afdb49..7bb6424 100644 --- a/src/ragagent.py +++ b/src/verdictnet/ragagent.py @@ -2,11 +2,9 @@ import logging from typing import Optional, List -import chromadb - -from models.node import Node -from query import print_results -from storage.hybrid_storage import HybridStorage +from verdictnet.models.node import Node +from verdictnet.query import print_results +from verdictnet.storage.hybrid_storage import HybridStorage logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) diff --git a/src/verdictnet/render/__init__.py b/src/verdictnet/render/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/render/html.py b/src/verdictnet/render/html.py similarity index 90% rename from src/render/html.py rename to src/verdictnet/render/html.py index 8e377e2..f6efb5f 100644 --- a/src/render/html.py +++ b/src/verdictnet/render/html.py @@ -1,5 +1,5 @@ -from models.node import Node -from render.node_renderer import NodeRenderer +from verdictnet.models.node import Node +from verdictnet.render.node_renderer import NodeRenderer class HTMLRenderer(NodeRenderer): diff --git a/src/render/node_renderer.py b/src/verdictnet/render/node_renderer.py similarity index 85% rename from src/render/node_renderer.py rename to src/verdictnet/render/node_renderer.py index 27166a4..4be823f 100644 --- a/src/render/node_renderer.py +++ b/src/verdictnet/render/node_renderer.py @@ -1,4 +1,4 @@ -from models.node import Node +from verdictnet.models.node import Node class NodeRenderer: diff --git a/src/render/plain_text.py b/src/verdictnet/render/plain_text.py similarity index 86% rename from src/render/plain_text.py rename to src/verdictnet/render/plain_text.py index e7498e4..55c2cbc 100644 --- a/src/render/plain_text.py +++ b/src/verdictnet/render/plain_text.py @@ -1,5 +1,5 @@ -from models.node import Node -from render.node_renderer import NodeRenderer +from verdictnet.models.node import Node +from verdictnet.render.node_renderer import NodeRenderer class PlainTextRenderer(NodeRenderer): diff --git a/src/verdictnet/storage/__init__.py b/src/verdictnet/storage/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/storage/adapters.py b/src/verdictnet/storage/adapters.py similarity index 99% rename from src/storage/adapters.py rename to src/verdictnet/storage/adapters.py index 3b5686b..1d8ecd2 100644 --- a/src/storage/adapters.py +++ b/src/verdictnet/storage/adapters.py @@ -2,7 +2,7 @@ import numpy as np -from models.node import Node +from verdictnet.models.node import Node class NodeAdapter: diff --git a/src/storage/chroma_storage.py b/src/verdictnet/storage/chroma_storage.py similarity index 95% rename from src/storage/chroma_storage.py rename to src/verdictnet/storage/chroma_storage.py index 159668a..7638b99 100644 --- a/src/storage/chroma_storage.py +++ b/src/verdictnet/storage/chroma_storage.py @@ -4,11 +4,11 @@ import chromadb import neo4j -import config -from embedding import Embedding -from models.node import Node +from verdictnet import config +from verdictnet.embedding import Embedding +from verdictnet.models.node import Node -from config import logging +from verdictnet.config import logging logger = logging.getLogger(__name__) diff --git a/src/storage/graph_storage.py b/src/verdictnet/storage/graph_storage.py similarity index 98% rename from src/storage/graph_storage.py rename to src/verdictnet/storage/graph_storage.py index 74fa688..9876423 100644 --- a/src/storage/graph_storage.py +++ b/src/verdictnet/storage/graph_storage.py @@ -6,9 +6,9 @@ import numpy as np from neo4j import Driver, GraphDatabase -import config -from models.node import Node -from storage.adapters import NodeAdapter +from verdictnet import config +from verdictnet.models.node import Node +from verdictnet.storage.adapters import NodeAdapter logger = config.logging.getLogger(__name__) logger.setLevel(logging.DEBUG) diff --git a/src/storage/hybrid_storage.py b/src/verdictnet/storage/hybrid_storage.py similarity index 89% rename from src/storage/hybrid_storage.py rename to src/verdictnet/storage/hybrid_storage.py index 0e26417..0196ca2 100644 --- a/src/storage/hybrid_storage.py +++ b/src/verdictnet/storage/hybrid_storage.py @@ -1,10 +1,10 @@ import configparser from typing import Optional, List -import config -from models.node import Node -from storage.chroma_storage import ChromaStorage -from storage.graph_storage import GraphStorage +from verdictnet import config +from verdictnet.models.node import Node +from verdictnet.storage.chroma_storage import ChromaStorage +from verdictnet.storage.graph_storage import GraphStorage class HybridStorage: @@ -39,7 +39,7 @@ def flatten_hierarchy(self, root_node: Node) -> List[Node]: def query(self, query_string: str, n_results: Optional[int] = None): """ - Perform a semantic search in ChromaDB and return the results. + Perform a verdictnet search in ChromaDB and return the results. """ return self.chroma_storage.query(query_string, n_results) diff --git a/src/storage/transaction_manager.py b/src/verdictnet/storage/transaction_manager.py similarity index 95% rename from src/storage/transaction_manager.py rename to src/verdictnet/storage/transaction_manager.py index 3fdd7b3..6bdd9a1 100644 --- a/src/storage/transaction_manager.py +++ b/src/verdictnet/storage/transaction_manager.py @@ -1,9 +1,9 @@ import configparser from typing import Optional, List -from models.node import Node -from storage.hybrid_storage import HybridStorage -from config import logging, get_config +from verdictnet.models.node import Node +from verdictnet.storage.hybrid_storage import HybridStorage +from verdictnet.config import logging, get_config logger = logging.getLogger(__name__) diff --git a/tests/conftest.py b/tests/conftest.py index 7b527d6..50b9128 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,24 +1,27 @@ +import configparser from pathlib import Path +from unittest.mock import patch from bs4 import BeautifulSoup from pytest import fixture -from config import root_path -from ingestion.documentspec import DocumentSpec -from ingestion.parsers.html_parser import parse -resources = Path(__file__).parent / "resources" +@fixture(autouse=True) +def mock_config(): + def mock_get_config(): + config = configparser.ConfigParser() + success = config.read(['resources/config.ini', 'tests/resources/config.ini']) + assert success is not [], "Could not read mock config file" + return config + with patch('verdictnet.config.get_config', mock_get_config): + yield -@fixture -def static_files(): - return root_path() / "src/frontend/static/css" +from verdictnet.ingestion.documentspec import DocumentSpec +from verdictnet.ingestion.parsers.html_parser import parse -@fixture -def css_code(static_files): - with open(static_files / 'document_tree.css', 'r') as css_file: - yield css_file.read() +resources = Path(__file__).parent / "resources" @fixture diff --git a/tests/ingestion/test_documentspec.py b/tests/ingestion/test_documentspec.py index f3fc49a..b954bb3 100644 --- a/tests/ingestion/test_documentspec.py +++ b/tests/ingestion/test_documentspec.py @@ -3,7 +3,7 @@ from pytest import fixture from conftest import resources -from ingestion.documentspec import DocumentSpec +from verdictnet.ingestion.documentspec import DocumentSpec @fixture diff --git a/tests/ingestion/test_ingest.py b/tests/ingestion/test_ingest.py index d8610a1..3b4e71c 100644 --- a/tests/ingestion/test_ingest.py +++ b/tests/ingestion/test_ingest.py @@ -2,7 +2,7 @@ from unittest.mock import patch import pytest -from etl import get_docspecs +from verdictnet.etl import get_docspecs @pytest.fixture diff --git a/tests/ingestion/test_parser.py b/tests/ingestion/test_parser.py index eb28839..260db44 100644 --- a/tests/ingestion/test_parser.py +++ b/tests/ingestion/test_parser.py @@ -2,7 +2,7 @@ from bs4 import BeautifulSoup from conftest import codigo_civil_spec -from ingestion.parsers.html_parser import parse +from verdictnet.ingestion.parsers.html_parser import parse @fixture(scope='class') diff --git a/tests/ingestion/test_pdf_parser.py b/tests/ingestion/test_pdf_parser.py index b6feae1..fbc1a0e 100644 --- a/tests/ingestion/test_pdf_parser.py +++ b/tests/ingestion/test_pdf_parser.py @@ -2,8 +2,8 @@ import os from conftest import resources -from models.node import Node -from ingestion.parsers.pdf_parser import extract_paragraphs +from verdictnet.models.node import Node +from verdictnet.ingestion.parsers.pdf_parser import extract_paragraphs @pytest.fixture diff --git a/tests/models/test_node.py b/tests/models/test_node.py new file mode 100644 index 0000000..1246596 --- /dev/null +++ b/tests/models/test_node.py @@ -0,0 +1,128 @@ +import json +import tempfile +import webbrowser + +from _pytest.fixtures import fixture +from bs4 import BeautifulSoup + + +from verdictnet.models.node import Node + + +@fixture +def static_files(): + from verdictnet.config import root_path + return root_path() / "frontend/static/css" + + +@fixture +def css_code(static_files): + with open(static_files / 'document_tree.css', 'r') as css_file: + yield css_file.read() + + +class TestNode: + def test_render(self, node_titulo): + text = node_titulo.render() + assert " 2. Carecerán de validez las disposiciones que contradigan otra de rango superior." in text + + def test_html(self, node_titulo, css_code): + html = node_titulo.html() + + # Insert the CSS link into the HTML content + html_with_css = f'{html}' + + # open html_text in a browser to see the result + with tempfile.NamedTemporaryFile('w', delete=False, suffix='.html') as f: + url = 'file://' + f.name + f.write(html_with_css) + + # ensure html is correctly formatted + try: + assert BeautifulSoup(html_with_css, 'html.parser') + except Exception as e: + webbrowser.open(url) + raise e + + def test_save(self): + node = Node(level="1", content="Test Node") + with tempfile.NamedTemporaryFile('w', delete=False, suffix='.json') as f: + path = f.name + node.save(path) + + with open(path, 'r', encoding='utf8') as file: + data = json.load(file) + + assert data['level'] == "1" + assert data['content'] == "Test Node" + assert 'uuid' in data + + def test_load(self): + node_data = { + "id": 1, + "uuid": "1234", + "level": "1", + "content": "Test Node", + "children": [] + } + with tempfile.NamedTemporaryFile('w', delete=False, suffix='.json') as f: + path = f.name + json.dump(node_data, f) + + loaded_node = Node.load(path) + + assert loaded_node.level == "1" + assert loaded_node.content == "Test Node" + assert loaded_node.uuid == "1234" + assert loaded_node.children == [] + + def test_save_with_children(self): + child_node = Node(level="2", content="Child Node") + parent_node = Node(level="1", content="Parent Node", children=[child_node]) + + with tempfile.NamedTemporaryFile('w', delete=False, suffix='.json') as f: + path = f.name + parent_node.save(path) + + with open(path, 'r', encoding='utf8') as file: + data = json.load(file) + + assert data['level'] == "1" + assert data['content'] == "Parent Node" + assert 'uuid' in data + assert len(data['children']) == 1 + assert data['children'][0]['level'] == "2" + assert data['children'][0]['content'] == "Child Node" + assert 'uuid' in data['children'][0] + + def test_load_with_children(self): + node_data = { + "id": 1, + "uuid": "1234", + "level": "1", + "content": "Parent Node", + "children": [ + { + "id": 2, + "uuid": "5678", + "level": "2", + "content": "Child Node", + "children": [] + } + ] + } + + with tempfile.NamedTemporaryFile('w', delete=False, suffix='.json') as f: + path = f.name + json.dump(node_data, f) + + loaded_node = Node.load(path) + + assert loaded_node.level == "1" + assert loaded_node.content == "Parent Node" + assert loaded_node.uuid == "1234" + assert len(loaded_node.children) == 1 + assert loaded_node.children[0].level == "2" + assert loaded_node.children[0].content == "Child Node" + assert loaded_node.children[0].uuid == "5678" + assert loaded_node.children[0].children == [] diff --git a/tests/resources/config.ini b/tests/resources/config.ini new file mode 100644 index 0000000..5782e55 --- /dev/null +++ b/tests/resources/config.ini @@ -0,0 +1,30 @@ +[storage] +# s3 or file +type: file +bucket: legal +collection: legal-database +raw: datalake/raw/ +refined: datalake/refined/ +html: datalake/html/ + +[chroma] +type: http +host: localhost +port: 8000 + +[neo4j] +url: bolt://localhost:7687 +user: none +password: nopwd + +[embedding] +model_name_or_path: paraphrase-mpnet-base-v2 +cache: cache/ + +[rag] +n_results: 5 + +[s3] +key = none +secret = nopwd +endpoint_url = http://localhost:9000 \ No newline at end of file diff --git a/tests/storage/conftest.py b/tests/storage/conftest.py index d984c8f..97e59dc 100644 --- a/tests/storage/conftest.py +++ b/tests/storage/conftest.py @@ -1,7 +1,6 @@ import pytest -from storage.graph_storage import GraphStorage -from storage.hybrid_storage import HybridStorage +from verdictnet.storage.hybrid_storage import HybridStorage @pytest.fixture diff --git a/tests/storage/test_adapter.py b/tests/storage/test_adapter.py index 5e45abe..099286a 100644 --- a/tests/storage/test_adapter.py +++ b/tests/storage/test_adapter.py @@ -1,5 +1,5 @@ -from models.node import Node -from storage.adapters import NodeAdapter +from verdictnet.models.node import Node +from verdictnet.storage.adapters import NodeAdapter def test_to_neo4j_with_relationships_single_node(): diff --git a/tests/storage/test_chroma_storage.py b/tests/storage/test_chroma_storage.py index cc9685e..46c805d 100644 --- a/tests/storage/test_chroma_storage.py +++ b/tests/storage/test_chroma_storage.py @@ -1,9 +1,9 @@ import chromadb import pytest from unittest.mock import MagicMock -from models.node import Node -from storage.chroma_storage import ChromaStorage -from embedding import Embedding +from verdictnet.models.node import Node +from verdictnet.storage.chroma_storage import ChromaStorage +from verdictnet.embedding import Embedding @pytest.fixture diff --git a/tests/storage/test_graph_storage.py b/tests/storage/test_graph_storage.py index c2d275f..673c0bd 100644 --- a/tests/storage/test_graph_storage.py +++ b/tests/storage/test_graph_storage.py @@ -5,8 +5,8 @@ from neo4j import Result -from models.node import Node -from storage.graph_storage import GraphStorage +from verdictnet.models.node import Node +from verdictnet.storage.graph_storage import GraphStorage @pytest.fixture diff --git a/tests/storage/test_hybrid_storage.py b/tests/storage/test_hybrid_storage.py index 095351f..c7e3100 100644 --- a/tests/storage/test_hybrid_storage.py +++ b/tests/storage/test_hybrid_storage.py @@ -1,7 +1,7 @@ import pytest from unittest.mock import MagicMock -from models.node import Node -from storage.hybrid_storage import HybridStorage +from verdictnet.models.node import Node +from verdictnet.storage.hybrid_storage import HybridStorage @pytest.fixture diff --git a/tests/storage/test_transaction_manager.py b/tests/storage/test_transaction_manager.py index 51adc15..78d19cc 100644 --- a/tests/storage/test_transaction_manager.py +++ b/tests/storage/test_transaction_manager.py @@ -1,8 +1,8 @@ import pytest from unittest.mock import MagicMock, call -from models.node import Node -from storage.transaction_manager import TransactionManager -from storage.hybrid_storage import HybridStorage +from verdictnet.models.node import Node +from verdictnet.storage.transaction_manager import TransactionManager +from verdictnet.storage.hybrid_storage import HybridStorage @pytest.fixture diff --git a/tests/test_node.py b/tests/test_node.py deleted file mode 100644 index da6614a..0000000 --- a/tests/test_node.py +++ /dev/null @@ -1,29 +0,0 @@ -import tempfile -import webbrowser - -from bs4 import BeautifulSoup - - -class TestNode: - def test_render(self, node_titulo): - text = node_titulo.render() - assert " 2. Carecerán de validez las disposiciones que contradigan otra de rango superior." in text - - def test_html(self, node_titulo, css_code): - html = node_titulo.html() - - # Insert the CSS link into the HTML content - html_with_css = f'{html}' - - # open html_text in a browser to see the result - with tempfile.NamedTemporaryFile('w', delete=False, suffix='.html') as f: - url = 'file://' + f.name - f.write(html_with_css) - - # ensure html is correctly formatted - try: - assert BeautifulSoup(html_with_css, 'html.parser') - except Exception as e: - webbrowser.open(url) - raise e -