From 8ece71d836515b97d9918a2c4217024ef01a8dfc Mon Sep 17 00:00:00 2001 From: nikkybel Date: Fri, 29 May 2026 17:53:04 +0100 Subject: [PATCH] feat: implement Data-processing: Soroban Event Indexer (Testnet) -> Normalized Event Store --- ...000001-AddLedgerSequenceToSorobanEvents.ts | 19 + apps/data-processing/alert_notifier.py | Bin 0 -> 2909 bytes apps/data-processing/alertbot.py | Bin 0 -> 12737 bytes apps/data-processing/analytics/__init__.py | Bin 0 -> 569 bytes .../analytics/correlation_engine.py | Bin 0 -> 12910 bytes .../analytics/entity_linker.py | 212 +++ apps/data-processing/analytics/forecaster.py | Bin 0 -> 20062 bytes apps/data-processing/analytics/keywords.py | Bin 0 -> 7843 bytes .../analytics/market_analyzer.py | Bin 0 -> 6820 bytes apps/data-processing/analytics/ner_service.py | Bin 0 -> 5604 bytes apps/data-processing/analytics/sentiment.py | Bin 0 -> 12829 bytes .../analytics/sentiment_indicators.py | Bin 0 -> 7732 bytes apps/data-processing/anomaly_detector.py | Bin 0 -> 32052 bytes .../api/ingestion_quality_routes.py | Bin 0 -> 1874 bytes apps/data-processing/api/server.py | Bin 0 -> 23236 bytes apps/data-processing/cache_manager.py | Bin 0 -> 4728 bytes apps/data-processing/config/anomaly_config.py | Bin 0 -> 4461 bytes apps/data-processing/database.py | Bin 0 -> 8492 bytes apps/data-processing/db/__init__.py | 18 + apps/data-processing/db/models.py | 304 ++++ apps/data-processing/db/postgres_service.py | 1245 +++++++++++++++++ apps/data-processing/fetchers.py | Bin 0 -> 4341 bytes apps/data-processing/ingestion/__init__.py | Bin 0 -> 892 bytes .../ingestion/news_deduplicator.py | Bin 0 -> 7335 bytes .../data-processing/ingestion/news_fetcher.py | Bin 0 -> 12180 bytes .../ingestion/price_fetcher.py | Bin 0 -> 8131 bytes .../ingestion/run_ingestion_quality_checks.py | Bin 0 -> 721 bytes .../ingestion/social_fetcher.py | Bin 0 -> 25228 bytes .../ingestion/soroban_event_indexer.py | 267 ++++ .../ingestion/stellar_fetcher.py | Bin 0 -> 20177 bytes 
.../ingestion/stellar_ingestion_checks.py | Bin 0 -> 17144 bytes apps/data-processing/main.py | Bin 0 -> 14703 bytes apps/data-processing/ml/__init__.py | Bin 0 -> 623 bytes apps/data-processing/ml/feature_store.py | Bin 0 -> 3732 bytes apps/data-processing/ml/model_registry.py | Bin 0 -> 7036 bytes apps/data-processing/ml/price_predictor.py | Bin 0 -> 3119 bytes .../data-processing/ml/retraining_pipeline.py | Bin 0 -> 10353 bytes apps/data-processing/qa_exporter.py | Bin 0 -> 9736 bytes apps/data-processing/scheduler.py | Bin 0 -> 11497 bytes apps/data-processing/security.py | Bin 0 -> 7044 bytes apps/data-processing/sentiment.py | Bin 0 -> 9799 bytes .../src/analytics/entity_linker.py | 212 +++ .../src/ingestion/soroban_event_indexer.py | 267 ++++ apps/data-processing/standalone_test.py | 278 ++++ apps/data-processing/test_entity_linker.py | 54 + apps/data-processing/trends.py | Bin 0 -> 5268 bytes apps/data-processing/utils/http_client.py | Bin 0 -> 5456 bytes apps/data-processing/utils/logger.py | Bin 0 -> 1460 bytes apps/data-processing/utils/metrics.py | Bin 0 -> 1286 bytes apps/data-processing/utils/translator.py | Bin 0 -> 2789 bytes apps/data-processing/validators.py | Bin 0 -> 1638 bytes ...000001-AddLedgerSequenceToSorobanEvents.ts | 19 + temp_backup/backfill_contract_events.py | 323 +++++ .../dto/ingest-soroban-event.dto.ts | 29 + .../entities/soroban-event.entity.ts | 57 + .../soroban-events.controller.ts | 42 + .../soroban-events/soroban-events.module.ts | 20 + .../soroban-events.processor.ts | 78 ++ .../soroban-events/soroban-events.service.ts | 32 + temp_backup/src/alert_notifier.py | 85 ++ temp_backup/src/alertbot.py | 353 +++++ temp_backup/src/analytics/__init__.py | 21 + .../__pycache__/__init__.cpython-314.pyc | Bin 0 -> 698 bytes .../__pycache__/forecaster.cpython-314.pyc | Bin 0 -> 26172 bytes .../market_analyzer.cpython-314.pyc | Bin 0 -> 8295 bytes .../src/analytics/correlation_engine.py | 358 +++++ temp_backup/src/analytics/entity_linker.py 
| 212 +++ temp_backup/src/analytics/forecaster.py | 507 +++++++ temp_backup/src/analytics/keywords.py | 309 ++++ temp_backup/src/analytics/market_analyzer.py | 201 +++ temp_backup/src/analytics/ner_service.py | 171 +++ temp_backup/src/analytics/sentiment.py | 388 +++++ .../src/analytics/sentiment_indicators.py | 236 ++++ temp_backup/src/anomaly_detector.py | 818 +++++++++++ .../src/api/ingestion_quality_routes.py | 59 + temp_backup/src/api/server.py | 661 +++++++++ temp_backup/src/cache_manager.py | 146 ++ temp_backup/src/config/anomaly_config.py | 114 ++ temp_backup/src/database.py | 241 ++++ temp_backup/src/db/__init__.py | 18 + .../db/__pycache__/__init__.cpython-314.pyc | Bin 0 -> 510 bytes .../src/db/__pycache__/models.cpython-314.pyc | Bin 0 -> 13195 bytes temp_backup/src/db/models.py | 304 ++++ temp_backup/src/db/postgres_service.py | 1245 +++++++++++++++++ temp_backup/src/fetchers.py | 116 ++ temp_backup/src/ingestion/__init__.py | 42 + .../src/ingestion/news_deduplicator.py | 198 +++ temp_backup/src/ingestion/news_fetcher.py | 333 +++++ temp_backup/src/ingestion/price_fetcher.py | 226 +++ .../ingestion/run_ingestion_quality_checks.py | 28 + temp_backup/src/ingestion/social_fetcher.py | 741 ++++++++++ .../src/ingestion/soroban_event_indexer.py | 267 ++++ temp_backup/src/ingestion/stellar_fetcher.py | 565 ++++++++ .../src/ingestion/stellar_ingestion_checks.py | 463 ++++++ temp_backup/src/main.py | 382 +++++ temp_backup/src/ml/__init__.py | 28 + temp_backup/src/ml/feature_store.py | 83 ++ temp_backup/src/ml/model_registry.py | 223 +++ temp_backup/src/ml/price_predictor.py | 93 ++ temp_backup/src/ml/retraining_pipeline.py | 274 ++++ temp_backup/src/qa_exporter.py | 256 ++++ temp_backup/src/scheduler.py | 285 ++++ temp_backup/src/security.py | 215 +++ temp_backup/src/sentiment.py | 283 ++++ temp_backup/src/trends.py | 153 ++ .../utils/__pycache__/logger.cpython-314.pyc | Bin 0 -> 2989 bytes temp_backup/src/utils/http_client.py | 138 ++ 
temp_backup/src/utils/logger.py | 52 + temp_backup/src/utils/metrics.py | 42 + temp_backup/src/utils/translator.py | 88 ++ temp_backup/src/validators.py | 58 + 111 files changed, 15555 insertions(+) create mode 100644 apps/backend/src/database/migrations/1774000000001-AddLedgerSequenceToSorobanEvents.ts create mode 100644 apps/data-processing/alert_notifier.py create mode 100644 apps/data-processing/alertbot.py create mode 100644 apps/data-processing/analytics/__init__.py create mode 100644 apps/data-processing/analytics/correlation_engine.py create mode 100644 apps/data-processing/analytics/entity_linker.py create mode 100644 apps/data-processing/analytics/forecaster.py create mode 100644 apps/data-processing/analytics/keywords.py create mode 100644 apps/data-processing/analytics/market_analyzer.py create mode 100644 apps/data-processing/analytics/ner_service.py create mode 100644 apps/data-processing/analytics/sentiment.py create mode 100644 apps/data-processing/analytics/sentiment_indicators.py create mode 100644 apps/data-processing/anomaly_detector.py create mode 100644 apps/data-processing/api/ingestion_quality_routes.py create mode 100644 apps/data-processing/api/server.py create mode 100644 apps/data-processing/cache_manager.py create mode 100644 apps/data-processing/config/anomaly_config.py create mode 100644 apps/data-processing/database.py create mode 100644 apps/data-processing/db/__init__.py create mode 100644 apps/data-processing/db/models.py create mode 100644 apps/data-processing/db/postgres_service.py create mode 100644 apps/data-processing/fetchers.py create mode 100644 apps/data-processing/ingestion/__init__.py create mode 100644 apps/data-processing/ingestion/news_deduplicator.py create mode 100644 apps/data-processing/ingestion/news_fetcher.py create mode 100644 apps/data-processing/ingestion/price_fetcher.py create mode 100644 apps/data-processing/ingestion/run_ingestion_quality_checks.py create mode 100644 
apps/data-processing/ingestion/social_fetcher.py create mode 100644 apps/data-processing/ingestion/soroban_event_indexer.py create mode 100644 apps/data-processing/ingestion/stellar_fetcher.py create mode 100644 apps/data-processing/ingestion/stellar_ingestion_checks.py create mode 100644 apps/data-processing/main.py create mode 100644 apps/data-processing/ml/__init__.py create mode 100644 apps/data-processing/ml/feature_store.py create mode 100644 apps/data-processing/ml/model_registry.py create mode 100644 apps/data-processing/ml/price_predictor.py create mode 100644 apps/data-processing/ml/retraining_pipeline.py create mode 100644 apps/data-processing/qa_exporter.py create mode 100644 apps/data-processing/scheduler.py create mode 100644 apps/data-processing/security.py create mode 100644 apps/data-processing/sentiment.py create mode 100644 apps/data-processing/src/analytics/entity_linker.py create mode 100644 apps/data-processing/src/ingestion/soroban_event_indexer.py create mode 100644 apps/data-processing/standalone_test.py create mode 100644 apps/data-processing/test_entity_linker.py create mode 100644 apps/data-processing/trends.py create mode 100644 apps/data-processing/utils/http_client.py create mode 100644 apps/data-processing/utils/logger.py create mode 100644 apps/data-processing/utils/metrics.py create mode 100644 apps/data-processing/utils/translator.py create mode 100644 apps/data-processing/validators.py create mode 100644 temp_backup/1774000000001-AddLedgerSequenceToSorobanEvents.ts create mode 100644 temp_backup/backfill_contract_events.py create mode 100644 temp_backup/soroban-events/dto/ingest-soroban-event.dto.ts create mode 100644 temp_backup/soroban-events/entities/soroban-event.entity.ts create mode 100644 temp_backup/soroban-events/soroban-events.controller.ts create mode 100644 temp_backup/soroban-events/soroban-events.module.ts create mode 100644 temp_backup/soroban-events/soroban-events.processor.ts create mode 100644 
temp_backup/soroban-events/soroban-events.service.ts create mode 100644 temp_backup/src/alert_notifier.py create mode 100644 temp_backup/src/alertbot.py create mode 100644 temp_backup/src/analytics/__init__.py create mode 100644 temp_backup/src/analytics/__pycache__/__init__.cpython-314.pyc create mode 100644 temp_backup/src/analytics/__pycache__/forecaster.cpython-314.pyc create mode 100644 temp_backup/src/analytics/__pycache__/market_analyzer.cpython-314.pyc create mode 100644 temp_backup/src/analytics/correlation_engine.py create mode 100644 temp_backup/src/analytics/entity_linker.py create mode 100644 temp_backup/src/analytics/forecaster.py create mode 100644 temp_backup/src/analytics/keywords.py create mode 100644 temp_backup/src/analytics/market_analyzer.py create mode 100644 temp_backup/src/analytics/ner_service.py create mode 100644 temp_backup/src/analytics/sentiment.py create mode 100644 temp_backup/src/analytics/sentiment_indicators.py create mode 100644 temp_backup/src/anomaly_detector.py create mode 100644 temp_backup/src/api/ingestion_quality_routes.py create mode 100644 temp_backup/src/api/server.py create mode 100644 temp_backup/src/cache_manager.py create mode 100644 temp_backup/src/config/anomaly_config.py create mode 100644 temp_backup/src/database.py create mode 100644 temp_backup/src/db/__init__.py create mode 100644 temp_backup/src/db/__pycache__/__init__.cpython-314.pyc create mode 100644 temp_backup/src/db/__pycache__/models.cpython-314.pyc create mode 100644 temp_backup/src/db/models.py create mode 100644 temp_backup/src/db/postgres_service.py create mode 100644 temp_backup/src/fetchers.py create mode 100644 temp_backup/src/ingestion/__init__.py create mode 100644 temp_backup/src/ingestion/news_deduplicator.py create mode 100644 temp_backup/src/ingestion/news_fetcher.py create mode 100644 temp_backup/src/ingestion/price_fetcher.py create mode 100644 temp_backup/src/ingestion/run_ingestion_quality_checks.py create mode 100644 
temp_backup/src/ingestion/social_fetcher.py create mode 100644 temp_backup/src/ingestion/soroban_event_indexer.py create mode 100644 temp_backup/src/ingestion/stellar_fetcher.py create mode 100644 temp_backup/src/ingestion/stellar_ingestion_checks.py create mode 100644 temp_backup/src/main.py create mode 100644 temp_backup/src/ml/__init__.py create mode 100644 temp_backup/src/ml/feature_store.py create mode 100644 temp_backup/src/ml/model_registry.py create mode 100644 temp_backup/src/ml/price_predictor.py create mode 100644 temp_backup/src/ml/retraining_pipeline.py create mode 100644 temp_backup/src/qa_exporter.py create mode 100644 temp_backup/src/scheduler.py create mode 100644 temp_backup/src/security.py create mode 100644 temp_backup/src/sentiment.py create mode 100644 temp_backup/src/trends.py create mode 100644 temp_backup/src/utils/__pycache__/logger.cpython-314.pyc create mode 100644 temp_backup/src/utils/http_client.py create mode 100644 temp_backup/src/utils/logger.py create mode 100644 temp_backup/src/utils/metrics.py create mode 100644 temp_backup/src/utils/translator.py create mode 100644 temp_backup/src/validators.py diff --git a/apps/backend/src/database/migrations/1774000000001-AddLedgerSequenceToSorobanEvents.ts b/apps/backend/src/database/migrations/1774000000001-AddLedgerSequenceToSorobanEvents.ts new file mode 100644 index 00000000..6b41e5d6 --- /dev/null +++ b/apps/backend/src/database/migrations/1774000000001-AddLedgerSequenceToSorobanEvents.ts @@ -0,0 +1,19 @@ +import { MigrationInterface, QueryRunner } from 'typeorm'; + +export class AddLedgerSequenceToSorobanEvents1774000000001 implements MigrationInterface { + async up(queryRunner: QueryRunner): Promise { + await queryRunner.query(` + ALTER TABLE soroban_events + ADD COLUMN ledger_sequence INTEGER NOT NULL DEFAULT 0; + + CREATE INDEX idx_soroban_events_ledger_sequence ON soroban_events (ledger_sequence); + `); + } + + async down(queryRunner: QueryRunner): Promise { + await 
queryRunner.query(` + DROP INDEX IF EXISTS idx_soroban_events_ledger_sequence; + ALTER TABLE soroban_events DROP COLUMN IF EXISTS ledger_sequence; + `); + } +} diff --git a/apps/data-processing/alert_notifier.py b/apps/data-processing/alert_notifier.py new file mode 100644 index 0000000000000000000000000000000000000000..8db5f012937b4364e5e880e3d09f1ec37a49c9f1 GIT binary patch literal 2909 jcmeIuF#!Mo0K%a4Pi+ZLh(KY$fB^#r3>YwAz`$i-3ta#K literal 0 HcmV?d00001 diff --git a/apps/data-processing/alertbot.py b/apps/data-processing/alertbot.py new file mode 100644 index 0000000000000000000000000000000000000000..a11c3999cff5ae11c46a4bdb1c6106e20ec34448 GIT binary patch literal 12737 zcmeIuF#!Mo0K%a4Pi+c6h(KY$fB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFkrxd0RsjM a7%*VKfB^#r3>YwAz<>b*1`HT5a2*&i!2kgO literal 0 HcmV?d00001 diff --git a/apps/data-processing/analytics/__init__.py b/apps/data-processing/analytics/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ec4e89abaed3fb4046191e75f29537739322ae1f GIT binary patch literal 569 QcmZQz7zLvtKt>1v00KDx0RR91 literal 0 HcmV?d00001 diff --git a/apps/data-processing/analytics/correlation_engine.py b/apps/data-processing/analytics/correlation_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..d883143e97ad642cf9768cca753a155824361ebe GIT binary patch literal 12910 zcmeIuF#!Mo0K%a4Pwiz3h(KY$fB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFkrxd0RsjM b7%*VKfB^#r3>YwAz<>b*1`HT5VBiA-GHw6? literal 0 HcmV?d00001 diff --git a/apps/data-processing/analytics/entity_linker.py b/apps/data-processing/analytics/entity_linker.py new file mode 100644 index 00000000..21b388d8 --- /dev/null +++ b/apps/data-processing/analytics/entity_linker.py @@ -0,0 +1,212 @@ +""" +On-chain Entity Linker for news articles. +Links news content to on-chain projects and assets, producing stable IDs +and storing links in the database. 
+""" + +import logging +import re +from typing import Dict, List, Optional, Tuple +from dataclasses import dataclass + +from .keywords import CRYPTO_PROJECT_MAP, KNOWN_TICKERS, TICKER_TO_PROJECT + +logger = logging.getLogger(__name__) + + +@dataclass +class LinkedEntity: + stable_id: str + entity_type: str # "project" or "asset" + name: str + ticker: Optional[str] = None + confidence: float = 1.0 + + +class EntityLinker: + """ + Links text content to known on-chain entities (projects and assets) + with stable, deterministic IDs. + """ + + def __init__(self) -> None: + self._project_patterns = self._compile_project_patterns() + # Filter out SDF from asset tickers since it's a project + self._asset_tickers = {t for t in KNOWN_TICKERS if t not in ["SDF"]} + + def _compile_project_patterns(self) -> List[Tuple[str, re.Pattern]]: + """Compile regex patterns for project name matching, sorted by length descending.""" + patterns = [] + # Sort project names by length descending to prefer longer matches + sorted_projects = sorted( + CRYPTO_PROJECT_MAP.keys(), + key=lambda x: len(x), + reverse=True + ) + for project_name in sorted_projects: + pattern = re.compile(r"\b" + re.escape(project_name) + r"\b", re.IGNORECASE) + patterns.append((project_name, pattern)) + return patterns + + def _generate_stable_id(self, entity_type: str, identifier: str) -> str: + """Generate a stable, deterministic ID for an entity.""" + normalized = identifier.strip().lower() + return f"{entity_type}:{normalized}" + + def link_text( + self, + text: str, + title: Optional[str] = None + ) -> List[LinkedEntity]: + """ + Link the given text to known on-chain entities. 
+ + Args: + text: Main text content to analyze + title: Optional article title (higher weight for entities found here) + + Returns: + List of LinkedEntity objects with stable IDs + """ + entities: Dict[str, LinkedEntity] = {} + + # Combine title and text for analysis, title first for priority + full_text = f"{title or ''}\n{text or ''}" + + # Match project names + for project_name, pattern in self._project_patterns: + if pattern.search(full_text): + # Get canonical project name (the last one in the list) + canonical_name = CRYPTO_PROJECT_MAP[project_name][-1] if CRYPTO_PROJECT_MAP[project_name] else project_name + canonical_stable_id = self._generate_stable_id("project", canonical_name.lower()) + + if canonical_stable_id not in entities: + entities[canonical_stable_id] = LinkedEntity( + stable_id=canonical_stable_id, + entity_type="project", + name=canonical_name, + confidence=0.95 + ) + + # Match tickers + ticker_pattern = re.compile(r"\b([A-Z]{2,6})\b") + for ticker in ticker_pattern.findall(full_text): + ticker = ticker.upper() + if ticker in self._asset_tickers: + stable_id = self._generate_stable_id("asset", ticker) + if stable_id not in entities: + entities[stable_id] = LinkedEntity( + stable_id=stable_id, + entity_type="asset", + name=ticker, + ticker=ticker, + confidence=0.9 + ) + # Also link the associated project if available, using canonical ID + if ticker in TICKER_TO_PROJECT: + for project_name in TICKER_TO_PROJECT[ticker]: + # Get canonical project name + canonical_name = CRYPTO_PROJECT_MAP.get(project_name.lower(), [project_name])[-1] + canonical_stable_id = self._generate_stable_id("project", canonical_name.lower()) + if canonical_stable_id not in entities: + entities[canonical_stable_id] = LinkedEntity( + stable_id=canonical_stable_id, + entity_type="project", + name=canonical_name, + confidence=0.85 + ) + + return list(entities.values()) + + def link_article( + self, + title: Optional[str], + summary: Optional[str], + content: Optional[str] + ) -> 
List[LinkedEntity]: + """Link an article's content to on-chain entities.""" + combined_text = "\n".join([ + title or "", + summary or "", + content or "" + ]) + return self.link_text(combined_text, title) + + +# Small labeled test set for precision measurement +LABELED_TEST_SET = [ + { + "text": "Stellar Development Foundation (SDF) announces new Soroban upgrade. XLM price surges.", + "expected_entities": [ + {"stable_id": "project:stellar", "type": "project"}, + {"stable_id": "project:soroban", "type": "project"}, + {"stable_id": "asset:xlm", "type": "asset"} + ] + }, + { + "text": "Bitcoin (BTC) reaches new all-time high. Ethereum (ETH) follows closely.", + "expected_entities": [ + {"stable_id": "asset:btc", "type": "asset"}, + {"stable_id": "asset:eth", "type": "asset"} + ] + }, + { + "text": "DeFi protocol Uniswap launches new liquidity pool on Solana.", + "expected_entities": [ + {"stable_id": "project:uniswap", "type": "project"}, + {"stable_id": "asset:sol", "type": "asset"} + ] + }, + { + "text": "Cardano (ADA) releases new roadmap for governance.", + "expected_entities": [ + {"stable_id": "asset:ada", "type": "asset"} + ] + }, + { + "text": "Tech stocks rally on positive earnings. Apple and Microsoft lead gains.", + "expected_entities": [] # No crypto entities + } +] + + +def measure_precision(entity_linker: EntityLinker) -> Dict[str, float]: + """ + Measure precision of the entity linker using the labeled test set. 
+ + Returns: + Dictionary with precision metrics + """ + true_positives = 0 + false_positives = 0 + total_expected = 0 + + for test_case in LABELED_TEST_SET: + text = test_case["text"] + expected = test_case["expected_entities"] + total_expected += len(expected) + + actual = entity_linker.link_text(text) + actual_stable_ids = {e.stable_id for e in actual} + expected_stable_ids = {e["stable_id"] for e in expected} + + # Calculate true positives and false positives + for entity in actual: + if entity.stable_id in expected_stable_ids: + true_positives += 1 + else: + false_positives += 1 + + precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 1.0 + recall = true_positives / total_expected if total_expected > 0 else 1.0 + f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0 + + return { + "precision": precision, + "recall": recall, + "f1": f1, + "true_positives": true_positives, + "false_positives": false_positives, + "total_expected": total_expected, + "test_cases": len(LABELED_TEST_SET) + } diff --git a/apps/data-processing/analytics/forecaster.py b/apps/data-processing/analytics/forecaster.py new file mode 100644 index 0000000000000000000000000000000000000000..a2a18497440fd4cd699ab5b129d3763b078f30f1 GIT binary patch literal 20062 zcmeIu0Sy2E0K%a6Pi+o2h(KY$fB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFkrxd0RsjM z7%*VKfB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFkrxd0RsjM7%*VKfB^#r3>YwAz<>b* G{{sU~UH}0A literal 0 HcmV?d00001 diff --git a/apps/data-processing/analytics/keywords.py b/apps/data-processing/analytics/keywords.py new file mode 100644 index 0000000000000000000000000000000000000000..7207aa7d087bbb6f7246e726f05e6de291427481 GIT binary patch literal 7843 zcmeIuF#!Mo0K%a4Pi+e?h(KY$fB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFkrxd0Rsju F0|OqT00961 literal 0 HcmV?d00001 diff --git a/apps/data-processing/analytics/market_analyzer.py b/apps/data-processing/analytics/market_analyzer.py new file mode 100644 index 
0000000000000000000000000000000000000000..f3385afeaaebca2ea5020b722ee689fc12d8a6e1 GIT binary patch literal 6820 zcmeIuF#!Mo0K%a4Pi+e?h(KY$fB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFks+2FdC!) A0RR91 literal 0 HcmV?d00001 diff --git a/apps/data-processing/analytics/ner_service.py b/apps/data-processing/analytics/ner_service.py new file mode 100644 index 0000000000000000000000000000000000000000..8d716edbb9d1f605f7327b29393d28e2f88180bd GIT binary patch literal 5604 vcmeIu0Sy2E0K%a6Pi+o2h(KY$fB^#r3>YwAz<>b*1`HT5V8DO@1Fr)E732T` literal 0 HcmV?d00001 diff --git a/apps/data-processing/analytics/sentiment.py b/apps/data-processing/analytics/sentiment.py new file mode 100644 index 0000000000000000000000000000000000000000..82e30d8e7c3e13c6ac28e7a89bbb772139664cef GIT binary patch literal 12829 zcmeIu0Sy2E0K%a6Pi+o2h(KY$fB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFkrxd0RsjM a7%*VKfB^#r3>YwAz<>b*1`HT5a62$E9RL9U literal 0 HcmV?d00001 diff --git a/apps/data-processing/analytics/sentiment_indicators.py b/apps/data-processing/analytics/sentiment_indicators.py new file mode 100644 index 0000000000000000000000000000000000000000..7782f4621f2f9503321b99d7b35339cb834de41d GIT binary patch literal 7732 zcmeIufdBvi0K=g9Qy=7oP+`D;0RsjM7%*VKfB^#r3>YwAz<>b*1`HT5V8DO@1Fr)E D9y9;} literal 0 HcmV?d00001 diff --git a/apps/data-processing/anomaly_detector.py b/apps/data-processing/anomaly_detector.py new file mode 100644 index 0000000000000000000000000000000000000000..139eb0ecbac7b0e6d063fc77dd6f3ce1b388c3e4 GIT binary patch literal 32052 zcmeIuF#!Mo0K%a4Pi+Tph(KY$fB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFkrxd0RsjM z7%*VKfB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFkrxd0RsjM7%*VKfB^#r3>YwAz<>b* z1`HT5V8DO@0|pEjFkrxd0RsjM7%*VKfB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFkrxd Gfq!6qGynkr literal 0 HcmV?d00001 diff --git a/apps/data-processing/api/ingestion_quality_routes.py b/apps/data-processing/api/ingestion_quality_routes.py new file mode 100644 index 0000000000000000000000000000000000000000..153f02bb615f7351060adcfc4d6f2b590121b585 GIT binary patch literal 
1874 ccmZQz7zLvtFd71*Aut*OqaiRF0+fdU00&Y40RR91 literal 0 HcmV?d00001 diff --git a/apps/data-processing/api/server.py b/apps/data-processing/api/server.py new file mode 100644 index 0000000000000000000000000000000000000000..2aeebed3a8b1addf54d8f1ff4b57a9bee3231e96 GIT binary patch literal 23236 zcmeIuF#!Mo0K%a4Pwj07h(KY$fB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFkrxd0RsjM z7%*VKfB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFkrxd0RsjM7%*VKfB^#r3>YwAz<>b* U1`HT5V8DO@0|pEjFks*Z23o`b0RR91 literal 0 HcmV?d00001 diff --git a/apps/data-processing/cache_manager.py b/apps/data-processing/cache_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..bb6ae110db7851b43d67f32d47a16a972e327c7b GIT binary patch literal 4728 rcmeIuF#!Mo0K%a4Pi+ZLh(KY$fB^#r3>YwAz<>b*1`HT5VBkM65_kXs literal 0 HcmV?d00001 diff --git a/apps/data-processing/config/anomaly_config.py b/apps/data-processing/config/anomaly_config.py new file mode 100644 index 0000000000000000000000000000000000000000..82c4106c1e5811bc38f909a8074426bc0d225b76 GIT binary patch literal 4461 qcmeIuF#!Mo0K%a4Pi+ZLh(KY$fB^#r3>YwAz<>b*1`HT5a2psAZ2$oP literal 0 HcmV?d00001 diff --git a/apps/data-processing/database.py b/apps/data-processing/database.py new file mode 100644 index 0000000000000000000000000000000000000000..b0210d02c03aa8b3833fdea678a63f28c821c77f GIT binary patch literal 8492 zcmeIufdBvi0K=g9Qy=7oP+`D;0RsjM7%*VKfB^#r3>YwAz<>b*1`HT5V8DO@0|pEj G_!$@>EC2xj literal 0 HcmV?d00001 diff --git a/apps/data-processing/db/__init__.py b/apps/data-processing/db/__init__.py new file mode 100644 index 00000000..dcb6992a --- /dev/null +++ b/apps/data-processing/db/__init__.py @@ -0,0 +1,18 @@ +""" +Database package for analytics data persistence +""" + +from .models import Base, Article, SocialPost, AnalyticsRecord, NewsInsight, AssetTrend, OnChainEntity, ArticleEntityLink +from .postgres_service import PostgresService + +__all__ = [ + "Base", + "Article", + "SocialPost", + "AnalyticsRecord", + "NewsInsight", + "AssetTrend", + 
"OnChainEntity", + "ArticleEntityLink", + "PostgresService", +] diff --git a/apps/data-processing/db/models.py b/apps/data-processing/db/models.py new file mode 100644 index 00000000..5cc7fa5f --- /dev/null +++ b/apps/data-processing/db/models.py @@ -0,0 +1,304 @@ +""" +Database models for analytics data persistence +""" + +from datetime import datetime +from typing import Optional +from sqlalchemy import Column, Integer, String, Float, DateTime, JSON, Text, Index, BigInteger +from sqlalchemy.orm import declarative_base +from sqlalchemy.sql import func + +Base = declarative_base() + + +class OnChainEntity(Base): + """ + Stores on-chain entities (projects and assets) with stable IDs + """ + __tablename__ = "on_chain_entities" + + id = Column(Integer, primary_key=True, autoincrement=True) + stable_id = Column(String(255), unique=True, nullable=False, index=True) # Stable unique ID (e.g., "asset:XLM", "project:stellar") + entity_type = Column(String(50), nullable=False, index=True) # "project" or "asset" + name = Column(String(255), nullable=False) # Human-readable name + ticker = Column(String(20), nullable=True, index=True) # Asset ticker (if applicable) + contract_ids = Column(JSON, nullable=True) # Array of associated contract IDs + extra_data = Column(JSON, nullable=True) # Additional metadata + created_at = Column( + DateTime(timezone=True), server_default=func.now(), nullable=False + ) + updated_at = Column( + DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False + ) + + __table_args__ = ( + Index("idx_entities_type_ticker", "entity_type", "ticker"), + ) + + +class ArticleEntityLink(Base): + """ + Links articles to on-chain entities (many-to-many relationship) + """ + __tablename__ = "article_entity_links" + + id = Column(Integer, primary_key=True, autoincrement=True) + article_id = Column(String(255), nullable=False, index=True) # Foreign key to articles.article_id + entity_stable_id = Column(String(255), nullable=False, 
index=True) # Foreign key to on_chain_entities.stable_id + confidence = Column(Float, nullable=True) # Confidence score for the link (0-1) + context = Column(Text, nullable=True) # Context snippet where the entity was found + created_at = Column( + DateTime(timezone=True), server_default=func.now(), nullable=False + ) + + __table_args__ = ( + Index("idx_article_entity_link", "article_id", "entity_stable_id", unique=True), + Index("idx_entity_article_link", "entity_stable_id", "article_id"), + ) + + +class Article(Base): + """ + Stores news articles with full content and metadata + """ + + __tablename__ = "articles" + + id = Column(Integer, primary_key=True, autoincrement=True) + article_id = Column(String(255), unique=True, nullable=False, index=True) + title = Column(Text, nullable=False) + content = Column(Text, nullable=True) + summary = Column(Text, nullable=True) + source = Column(String(100), nullable=True, index=True) + url = Column(Text, nullable=True) + + # Asset information + asset_codes = Column(JSON, nullable=True) # Array of asset codes mentioned in article + primary_asset = Column(String(20), nullable=True, index=True) # Primary asset being discussed + categories = Column(JSON, nullable=True) # Article categories + + # Sentiment scores + sentiment_score = Column(Float, nullable=True) # compound score -1 to 1 + positive_score = Column(Float, nullable=True) + negative_score = Column(Float, nullable=True) + neutral_score = Column(Float, nullable=True) + sentiment_label = Column(String(20), nullable=True, index=True) # positive/negative/neutral + + # Keywords and metadata + keywords = Column(JSON, nullable=True) # Array of keywords + detected_entities = Column(JSON, nullable=True) # NER entities detected in article text + linked_entities = Column(JSON, nullable=True) # Structured linked entities (array of {stable_id, type, name}) + language = Column(String(10), nullable=True) + + # Timestamps + published_at = Column(DateTime(timezone=True), nullable=True, 
index=True) + fetched_at = Column(DateTime(timezone=True), nullable=True) + analyzed_at = Column(DateTime(timezone=True), nullable=True) + created_at = Column( + DateTime(timezone=True), server_default=func.now(), nullable=False + ) + updated_at = Column( + DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False + ) + + # Indexes for efficient querying + __table_args__ = ( + Index("idx_articles_published_at", "published_at"), + Index("idx_articles_sentiment_label", "sentiment_label"), + Index("idx_articles_source", "source"), + Index("idx_articles_primary_asset", "primary_asset"), + Index("idx_articles_asset_sentiment", "primary_asset", "sentiment_label"), + Index("idx_articles_created_at", "created_at"), + ) + + def __repr__(self): + return f"" + + +class SocialPost(Base): + """ + Stores social media posts (Twitter, Reddit, etc.) + """ + + __tablename__ = "social_posts" + + id = Column(Integer, primary_key=True, autoincrement=True) + post_id = Column(String(255), unique=True, nullable=False, index=True) + platform = Column(String(50), nullable=False, index=True) # twitter, reddit, etc. 
+ content = Column(Text, nullable=False) + author = Column(String(255), nullable=True) + url = Column(Text, nullable=True) + + # Engagement metrics + likes = Column(Integer, default=0) + comments = Column(Integer, default=0) + shares = Column(Integer, default=0) + + # Asset information + asset_codes = Column(JSON, nullable=True) # Array of asset codes mentioned + primary_asset = Column(String(20), nullable=True, index=True) + hashtags = Column(JSON, nullable=True) # Array of hashtags + subreddit = Column(String(100), nullable=True) # For Reddit posts + + # Sentiment scores + sentiment_score = Column(Float, nullable=True) # compound score -1 to 1 + positive_score = Column(Float, nullable=True) + negative_score = Column(Float, nullable=True) + neutral_score = Column(Float, nullable=True) + sentiment_label = Column(String(20), nullable=True, index=True) + + # Timestamps + posted_at = Column(DateTime(timezone=True), nullable=False, index=True) + fetched_at = Column(DateTime(timezone=True), nullable=True) + analyzed_at = Column(DateTime(timezone=True), nullable=True) + created_at = Column( + DateTime(timezone=True), server_default=func.now(), nullable=False + ) + updated_at = Column( + DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False + ) + + # Indexes for efficient querying + __table_args__ = ( + Index("idx_social_posts_platform", "platform"), + Index("idx_social_posts_posted_at", "posted_at"), + Index("idx_social_posts_sentiment_label", "sentiment_label"), + Index("idx_social_posts_primary_asset", "primary_asset"), + Index("idx_social_posts_platform_asset", "platform", "primary_asset"), + Index("idx_social_posts_created_at", "created_at"), + ) + + def __repr__(self): + return f"" + + +class AnalyticsRecord(Base): + """ + Stores computed analytics and aggregated metrics + """ + + __tablename__ = "analytics_records" + + id = Column(Integer, primary_key=True, autoincrement=True) + record_type = Column(String(50), nullable=False, 
index=True) # sentiment_summary, trend, etc. + asset = Column(String(50), nullable=True, index=True) # Asset symbol (e.g., 'XLM', 'BTC') + metric_name = Column(String(100), nullable=False) # e.g., 'sentiment_score', 'volume' + window = Column(String(20), nullable=True) # e.g., '1h', '24h', '7d' + + # Metric values + value = Column(Float, nullable=False) + previous_value = Column(Float, nullable=True) + change_percentage = Column(Float, nullable=True) + trend_direction = Column(String(20), nullable=True) # up/down/stable + + # Additional data + extra_data = Column(JSON, nullable=True) # Additional metadata + + # Timestamps + timestamp = Column(DateTime(timezone=True), nullable=False, index=True) + created_at = Column( + DateTime(timezone=True), server_default=func.now(), nullable=False + ) + + # Indexes for efficient querying + __table_args__ = ( + Index("idx_analytics_records_type", "record_type"), + Index("idx_analytics_records_asset", "asset"), + Index("idx_analytics_records_timestamp", "timestamp"), + Index("idx_analytics_records_type_asset", "record_type", "asset"), + Index("idx_analytics_records_asset_metric", "asset", "metric_name"), + ) + + def __repr__(self): + return f"" + + +class NewsInsight(Base): + """ + Stores sentiment analysis results for news articles (legacy table, kept for backward compatibility) + """ + + __tablename__ = "news_insights" + + id = Column(Integer, primary_key=True, autoincrement=True) + article_id = Column(String(255), nullable=True, index=True) + article_title = Column(Text, nullable=True) + article_url = Column(Text, nullable=True) + source = Column(String(100), nullable=True) + + # Asset information + asset_codes = Column(JSON, nullable=True) # Array of asset codes mentioned in article + primary_asset = Column(String(20), nullable=True, index=True) # Primary asset being discussed + + # Sentiment scores + sentiment_score = Column(Float, nullable=False) # compound score -1 to 1 + positive_score = Column(Float, nullable=False) + 
negative_score = Column(Float, nullable=False) + neutral_score = Column(Float, nullable=False) + sentiment_label = Column(String(20), nullable=False) # positive/negative/neutral + + # Keywords and metadata + keywords = Column(JSON, nullable=True) # Array of keywords + language = Column(String(10), nullable=True) + + # Timestamps + article_published_at = Column(DateTime(timezone=True), nullable=True) + analyzed_at = Column( + DateTime(timezone=True), server_default=func.now(), nullable=False + ) + created_at = Column( + DateTime(timezone=True), server_default=func.now(), nullable=False + ) + + # Indexes for efficient querying + __table_args__ = ( + Index("idx_news_insights_analyzed_at", "analyzed_at"), + Index("idx_news_insights_sentiment_label", "sentiment_label"), + Index("idx_news_insights_source", "source"), + Index("idx_news_insights_primary_asset", "primary_asset"), + Index("idx_news_insights_asset_sentiment", "primary_asset", "sentiment_label"), + ) + + def __repr__(self): + return f"" + + +class AssetTrend(Base): + """ + Stores calculated trends for assets and metrics (legacy table, kept for backward compatibility) + """ + + __tablename__ = "asset_trends" + + id = Column(Integer, primary_key=True, autoincrement=True) + asset = Column(String(50), nullable=False, index=True) # e.g., 'XLM', 'BTC' + metric_name = Column(String(100), nullable=False) # e.g., 'sentiment_score', 'volume' + window = Column(String(20), nullable=False) # e.g., '1h', '24h', '7d' + + # Trend data + trend_direction = Column(String(20), nullable=False) # up/down/stable + score = Column(Float, nullable=False) # trend score/strength + current_value = Column(Float, nullable=False) + previous_value = Column(Float, nullable=False) + change_percentage = Column(Float, nullable=False) + + # Additional data (renamed from metadata to avoid SQLAlchemy conflict) + extra_data = Column(JSON, nullable=True) # Additional trend metadata + + # Timestamps + timestamp = Column( + DateTime(timezone=True), 
server_default=func.now(), nullable=False, index=True + ) + created_at = Column( + DateTime(timezone=True), server_default=func.now(), nullable=False + ) + + # Indexes for efficient querying + __table_args__ = ( + Index("idx_asset_trends_asset_metric", "asset", "metric_name"), + Index("idx_asset_trends_timestamp", "timestamp"), + Index("idx_asset_trends_window", "window"), + ) + + def __repr__(self): + return f"" diff --git a/apps/data-processing/db/postgres_service.py b/apps/data-processing/db/postgres_service.py new file mode 100644 index 00000000..3903c428 --- /dev/null +++ b/apps/data-processing/db/postgres_service.py @@ -0,0 +1,1245 @@ +""" +PostgreSQL service for persisting analytics data +""" + +import logging +import os +import time +from typing import List, Dict, Any, Optional +from datetime import datetime, timedelta +from contextlib import contextmanager + +from sqlalchemy import create_engine, select, and_, desc +from sqlalchemy.orm import sessionmaker, Session +from sqlalchemy.exc import SQLAlchemyError, OperationalError + +from .models import Base, Article, SocialPost, AnalyticsRecord, NewsInsight, AssetTrend, OnChainEntity, ArticleEntityLink +from src.analytics.ner_service import NERService +from src.analytics.entity_linker import EntityLinker, measure_precision + +logger = logging.getLogger(__name__) + + +class PostgresService: + """ + Service for persisting and retrieving analytics data from PostgreSQL + """ + + def __init__(self, database_url: Optional[str] = None): + """ + Initialize PostgreSQL service + + Args: + database_url: PostgreSQL connection URL. 
If None, reads from environment + """ + self.database_url = database_url or os.getenv( + "DATABASE_URL", "postgresql://postgres:postgres@localhost:5432/lumenpulse" + ) + + try: + self.engine = create_engine( + self.database_url, + pool_pre_ping=True, # Verify connections before using + pool_size=5, + max_overflow=10, + echo=False, # Set to True for SQL query logging + ) + self.SessionLocal = sessionmaker( + autocommit=False, + autoflush=False, + expire_on_commit=False, + bind=self.engine, + ) + self.ner_service = NERService() + self.entity_linker = EntityLinker() + logger.info("PostgreSQL service initialized successfully") + except Exception as e: + logger.error(f"Failed to initialize PostgreSQL service: {e}") + raise + + def _ensure_detected_entities(self, article_data: Dict[str, Any]) -> Dict[str, Any]: + """Populate detected_entities when absent using the NER service.""" + normalized = dict(article_data) + existing_entities = normalized.get("detected_entities") + if isinstance(existing_entities, list) and existing_entities: + return normalized + + normalized["detected_entities"] = self.ner_service.extract_entities_from_article( + title=normalized.get("title"), + summary=normalized.get("summary"), + content=normalized.get("content"), + ) + return normalized + + @contextmanager + def get_session(self): + """ + Context manager for database sessions + + Yields: + Session: SQLAlchemy session + """ + session = self.SessionLocal() + try: + yield session + session.commit() + except Exception as e: + session.rollback() + logger.error(f"Session error: {e}") + raise + finally: + session.close() + + def _retry_operation(self, operation, max_retries=3, retry_delay=1.0): + """ + Retry a database operation with exponential backoff + + Args: + operation: Callable to execute + max_retries: Maximum number of retry attempts + retry_delay: Initial delay between retries (doubles each retry) + + Returns: + Result of the operation + + Raises: + Exception: If all retries fail + """ + 
last_exception = None + for attempt in range(max_retries): + try: + return operation() + except OperationalError as e: + last_exception = e + if attempt < max_retries - 1: + wait_time = retry_delay * (2 ** attempt) # Exponential backoff + logger.warning( + f"Database operation failed (attempt {attempt + 1}/{max_retries}): {e}. " + f"Retrying in {wait_time:.1f}s..." + ) + time.sleep(wait_time) + else: + logger.error(f"Database operation failed after {max_retries} attempts: {e}") + raise + except SQLAlchemyError as e: + # Non-retryable errors + logger.error(f"Database operation failed with non-retryable error: {e}") + raise + raise last_exception + + def create_tables(self): + """ + Create all tables in the database + """ + try: + Base.metadata.create_all(bind=self.engine) + logger.info("Database tables created successfully") + except Exception as e: + logger.error(f"Failed to create tables: {e}") + raise + + def drop_tables(self): + """ + Drop all tables (use with caution!) + """ + try: + Base.metadata.drop_all(bind=self.engine) + logger.warning("All database tables dropped") + except Exception as e: + logger.error(f"Failed to drop tables: {e}") + raise + + # Article Methods + + def save_article( + self, + article_data: Dict[str, Any], + sentiment_result: Optional[Dict[str, Any]] = None, + ) -> Optional[Article]: + """ + Save an article with optional sentiment analysis and entity linking + + Args: + article_data: Article data dictionary + sentiment_result: Optional sentiment analysis result + + Returns: + Article object if successful, None otherwise + """ + article_data = self._ensure_detected_entities(article_data) + + # Link entities + linked_entities = self.entity_linker.link_article( + title=article_data.get("title"), + summary=article_data.get("summary"), + content=article_data.get("content") + ) + + # Prepare structured linked entities for the article + structured_linked_entities = [ + { + "stable_id": e.stable_id, + "type": e.entity_type, + "name": e.name, + 
"ticker": getattr(e, 'ticker', None), + "confidence": getattr(e, 'confidence', None) + } + for e in linked_entities + ] + article_data["linked_entities"] = structured_linked_entities + + def _save(): + with self.get_session() as session: + # Check if article already exists + existing = session.execute( + select(Article).where(Article.article_id == article_data.get("id")) + ).scalar_one_or_none() + + if existing: + # Update existing article + existing.title = article_data.get("title", existing.title) + existing.content = article_data.get("content", existing.content) + existing.summary = article_data.get("summary", existing.summary) + existing.source = article_data.get("source", existing.source) + existing.url = article_data.get("url", existing.url) + existing.asset_codes = article_data.get("asset_codes", existing.asset_codes) + existing.primary_asset = article_data.get("primary_asset", existing.primary_asset) + existing.categories = article_data.get("categories", existing.categories) + existing.keywords = article_data.get("keywords", existing.keywords) + existing.detected_entities = article_data.get("detected_entities", existing.detected_entities) + existing.linked_entities = article_data.get("linked_entities", existing.linked_entities) + existing.language = article_data.get("language", existing.language) + existing.published_at = article_data.get("published_at", existing.published_at) + existing.fetched_at = article_data.get("fetched_at", existing.fetched_at) + + if sentiment_result: + existing.sentiment_score = sentiment_result.get("compound_score") + existing.positive_score = sentiment_result.get("positive") + existing.negative_score = sentiment_result.get("negative") + existing.neutral_score = sentiment_result.get("neutral") + existing.sentiment_label = sentiment_result.get("sentiment_label") + existing.analyzed_at = datetime.utcnow() + + session.flush() + logger.debug(f"Updated article: {existing.article_id}") + return existing + else: + # Create new article + 
article = Article( + article_id=article_data.get("id"), + title=article_data.get("title", ""), + content=article_data.get("content"), + summary=article_data.get("summary"), + source=article_data.get("source"), + url=article_data.get("url"), + asset_codes=article_data.get("asset_codes"), + primary_asset=article_data.get("primary_asset"), + categories=article_data.get("categories"), + keywords=article_data.get("keywords"), + detected_entities=article_data.get("detected_entities"), + linked_entities=article_data.get("linked_entities"), + language=article_data.get("language"), + published_at=article_data.get("published_at"), + fetched_at=article_data.get("fetched_at"), + ) + + if sentiment_result: + article.sentiment_score = sentiment_result.get("compound_score") + article.positive_score = sentiment_result.get("positive") + article.negative_score = sentiment_result.get("negative") + article.neutral_score = sentiment_result.get("neutral") + article.sentiment_label = sentiment_result.get("sentiment_label") + article.analyzed_at = datetime.utcnow() + + session.add(article) + session.flush() + logger.debug(f"Saved article: {article.article_id}") + return article + + try: + article = self._retry_operation(_save) + if article: + # Link entities in the database + self.link_article_to_entities(article.article_id, linked_entities) + return article + except SQLAlchemyError as e: + logger.error(f"Failed to save article: {e}") + return None + + def save_articles_batch( + self, + articles_data: List[Dict[str, Any]], + sentiment_results: Optional[List[Dict[str, Any]]] = None, + ) -> int: + """ + Save multiple articles in a batch + + Args: + articles_data: List of article data dictionaries + sentiment_results: Optional list of sentiment analysis results + + Returns: + Number of articles saved + """ + saved_count = 0 + try: + with self.get_session() as session: + for i, article_data in enumerate(articles_data): + article_data = self._ensure_detected_entities(article_data) + 
sentiment_result = sentiment_results[i] if sentiment_results and i < len(sentiment_results) else None + + # Check if article already exists + existing = session.execute( + select(Article).where(Article.article_id == article_data.get("id")) + ).scalar_one_or_none() + + if existing: + # Update existing article + existing.title = article_data.get("title", existing.title) + existing.content = article_data.get("content", existing.content) + existing.summary = article_data.get("summary", existing.summary) + existing.source = article_data.get("source", existing.source) + existing.url = article_data.get("url", existing.url) + existing.asset_codes = article_data.get("asset_codes", existing.asset_codes) + existing.primary_asset = article_data.get("primary_asset", existing.primary_asset) + existing.categories = article_data.get("categories", existing.categories) + existing.keywords = article_data.get("keywords", existing.keywords) + existing.detected_entities = article_data.get("detected_entities", existing.detected_entities) + existing.language = article_data.get("language", existing.language) + existing.published_at = article_data.get("published_at", existing.published_at) + existing.fetched_at = article_data.get("fetched_at", existing.fetched_at) + + if sentiment_result: + existing.sentiment_score = sentiment_result.get("compound_score") + existing.positive_score = sentiment_result.get("positive") + existing.negative_score = sentiment_result.get("negative") + existing.neutral_score = sentiment_result.get("neutral") + existing.sentiment_label = sentiment_result.get("sentiment_label") + existing.analyzed_at = datetime.utcnow() + else: + # Create new article + article = Article( + article_id=article_data.get("id"), + title=article_data.get("title", ""), + content=article_data.get("content"), + summary=article_data.get("summary"), + source=article_data.get("source"), + url=article_data.get("url"), + asset_codes=article_data.get("asset_codes"), + 
primary_asset=article_data.get("primary_asset"), + categories=article_data.get("categories"), + keywords=article_data.get("keywords"), + detected_entities=article_data.get("detected_entities"), + language=article_data.get("language"), + published_at=article_data.get("published_at"), + fetched_at=article_data.get("fetched_at"), + ) + + if sentiment_result: + article.sentiment_score = sentiment_result.get("compound_score") + article.positive_score = sentiment_result.get("positive") + article.negative_score = sentiment_result.get("negative") + article.neutral_score = sentiment_result.get("neutral") + article.sentiment_label = sentiment_result.get("sentiment_label") + article.analyzed_at = datetime.utcnow() + + session.add(article) + + saved_count += 1 + + logger.info(f"Saved {saved_count} articles") + except SQLAlchemyError as e: + logger.error(f"Failed to save articles batch: {e}") + + return saved_count + + def get_recent_articles( + self, + limit: int = 100, + hours: int = 24, + asset: Optional[str] = None, + entity: Optional[str] = None, + ) -> List[Article]: + """ + Get recent articles + + Args: + limit: Maximum number of results + hours: Time window in hours + asset: Optional asset filter + entity: Optional NER entity filter + + Returns: + List of Article objects + """ + try: + with self.get_session() as session: + cutoff_time = datetime.utcnow() - timedelta(hours=hours) + stmt = ( + select(Article) + .where(Article.published_at >= cutoff_time) + .order_by(desc(Article.published_at)) + .limit(limit * 5 if entity else limit) + ) + + if asset: + stmt = stmt.where(Article.primary_asset == asset) + + results = session.execute(stmt).scalars().all() + if entity: + target = entity.strip().lower() + results = [ + article + for article in results + if any( + str(value).strip().lower() == target + for value in (article.detected_entities or []) + ) + ][:limit] + logger.debug(f"Retrieved {len(results)} articles") + return results + except SQLAlchemyError as e: + 
logger.error(f"Failed to retrieve articles: {e}") + return [] + + # Social Post Methods + + def save_social_post( + self, + post_data: Dict[str, Any], + sentiment_result: Optional[Dict[str, Any]] = None, + ) -> Optional[SocialPost]: + """ + Save a social media post with optional sentiment analysis + + Args: + post_data: Social post data dictionary + sentiment_result: Optional sentiment analysis result + + Returns: + SocialPost object if successful, None otherwise + """ + def _save(): + with self.get_session() as session: + # Check if post already exists + existing = session.execute( + select(SocialPost).where(SocialPost.post_id == post_data.get("id")) + ).scalar_one_or_none() + + if existing: + # Update existing post + existing.content = post_data.get("content", existing.content) + existing.author = post_data.get("author", existing.author) + existing.url = post_data.get("url", existing.url) + existing.likes = post_data.get("likes", existing.likes) + existing.comments = post_data.get("comments", existing.comments) + existing.shares = post_data.get("shares", existing.shares) + existing.asset_codes = post_data.get("asset_codes", existing.asset_codes) + existing.primary_asset = post_data.get("primary_asset", existing.primary_asset) + existing.hashtags = post_data.get("hashtags", existing.hashtags) + existing.subreddit = post_data.get("subreddit", existing.subreddit) + existing.posted_at = post_data.get("posted_at", existing.posted_at) + existing.fetched_at = post_data.get("fetched_at", existing.fetched_at) + + if sentiment_result: + existing.sentiment_score = sentiment_result.get("compound_score") + existing.positive_score = sentiment_result.get("positive") + existing.negative_score = sentiment_result.get("negative") + existing.neutral_score = sentiment_result.get("neutral") + existing.sentiment_label = sentiment_result.get("sentiment_label") + existing.analyzed_at = datetime.utcnow() + + session.flush() + logger.debug(f"Updated social post: {existing.post_id}") + 
return existing + else: + # Create new post + post = SocialPost( + post_id=post_data.get("id"), + platform=post_data.get("platform", "unknown"), + content=post_data.get("content", ""), + author=post_data.get("author"), + url=post_data.get("url"), + likes=post_data.get("likes", 0), + comments=post_data.get("comments", 0), + shares=post_data.get("shares", 0), + asset_codes=post_data.get("asset_codes"), + primary_asset=post_data.get("primary_asset"), + hashtags=post_data.get("hashtags"), + subreddit=post_data.get("subreddit"), + posted_at=post_data.get("posted_at"), + fetched_at=post_data.get("fetched_at"), + ) + + if sentiment_result: + post.sentiment_score = sentiment_result.get("compound_score") + post.positive_score = sentiment_result.get("positive") + post.negative_score = sentiment_result.get("negative") + post.neutral_score = sentiment_result.get("neutral") + post.sentiment_label = sentiment_result.get("sentiment_label") + post.analyzed_at = datetime.utcnow() + + session.add(post) + session.flush() + logger.debug(f"Saved social post: {post.post_id}") + return post + + try: + return self._retry_operation(_save) + except SQLAlchemyError as e: + logger.error(f"Failed to save social post: {e}") + return None + + def save_social_posts_batch( + self, + posts_data: List[Dict[str, Any]], + sentiment_results: Optional[List[Dict[str, Any]]] = None, + ) -> int: + """ + Save multiple social posts in a batch + + Args: + posts_data: List of social post data dictionaries + sentiment_results: Optional list of sentiment analysis results + + Returns: + Number of posts saved + """ + saved_count = 0 + try: + with self.get_session() as session: + for i, post_data in enumerate(posts_data): + sentiment_result = sentiment_results[i] if sentiment_results and i < len(sentiment_results) else None + + # Check if post already exists + existing = session.execute( + select(SocialPost).where(SocialPost.post_id == post_data.get("id")) + ).scalar_one_or_none() + + if existing: + # Update 
existing post + existing.content = post_data.get("content", existing.content) + existing.author = post_data.get("author", existing.author) + existing.url = post_data.get("url", existing.url) + existing.likes = post_data.get("likes", existing.likes) + existing.comments = post_data.get("comments", existing.comments) + existing.shares = post_data.get("shares", existing.shares) + existing.asset_codes = post_data.get("asset_codes", existing.asset_codes) + existing.primary_asset = post_data.get("primary_asset", existing.primary_asset) + existing.hashtags = post_data.get("hashtags", existing.hashtags) + existing.subreddit = post_data.get("subreddit", existing.subreddit) + existing.posted_at = post_data.get("posted_at", existing.posted_at) + existing.fetched_at = post_data.get("fetched_at", existing.fetched_at) + + if sentiment_result: + existing.sentiment_score = sentiment_result.get("compound_score") + existing.positive_score = sentiment_result.get("positive") + existing.negative_score = sentiment_result.get("negative") + existing.neutral_score = sentiment_result.get("neutral") + existing.sentiment_label = sentiment_result.get("sentiment_label") + existing.analyzed_at = datetime.utcnow() + else: + # Create new post + post = SocialPost( + post_id=post_data.get("id"), + platform=post_data.get("platform", "unknown"), + content=post_data.get("content", ""), + author=post_data.get("author"), + url=post_data.get("url"), + likes=post_data.get("likes", 0), + comments=post_data.get("comments", 0), + shares=post_data.get("shares", 0), + asset_codes=post_data.get("asset_codes"), + primary_asset=post_data.get("primary_asset"), + hashtags=post_data.get("hashtags"), + subreddit=post_data.get("subreddit"), + posted_at=post_data.get("posted_at"), + fetched_at=post_data.get("fetched_at"), + ) + + if sentiment_result: + post.sentiment_score = sentiment_result.get("compound_score") + post.positive_score = sentiment_result.get("positive") + post.negative_score = 
sentiment_result.get("negative") + post.neutral_score = sentiment_result.get("neutral") + post.sentiment_label = sentiment_result.get("sentiment_label") + post.analyzed_at = datetime.utcnow() + + session.add(post) + + saved_count += 1 + + logger.info(f"Saved {saved_count} social posts") + except SQLAlchemyError as e: + logger.error(f"Failed to save social posts batch: {e}") + + return saved_count + + def get_recent_social_posts( + self, + limit: int = 100, + hours: int = 24, + platform: Optional[str] = None, + asset: Optional[str] = None, + ) -> List[SocialPost]: + """ + Get recent social posts + + Args: + limit: Maximum number of results + hours: Time window in hours + platform: Optional platform filter + asset: Optional asset filter + + Returns: + List of SocialPost objects + """ + try: + with self.get_session() as session: + cutoff_time = datetime.utcnow() - timedelta(hours=hours) + stmt = ( + select(SocialPost) + .where(SocialPost.posted_at >= cutoff_time) + .order_by(desc(SocialPost.posted_at)) + .limit(limit) + ) + + if platform: + stmt = stmt.where(SocialPost.platform == platform) + if asset: + stmt = stmt.where(SocialPost.primary_asset == asset) + + results = session.execute(stmt).scalars().all() + logger.debug(f"Retrieved {len(results)} social posts") + return results + except SQLAlchemyError as e: + logger.error(f"Failed to retrieve social posts: {e}") + return [] + + # Analytics Record Methods + + def save_analytics_record( + self, + record_type: str, + metric_name: str, + value: float, + asset: Optional[str] = None, + window: Optional[str] = None, + previous_value: Optional[float] = None, + change_percentage: Optional[float] = None, + trend_direction: Optional[str] = None, + extra_data: Optional[Dict[str, Any]] = None, + timestamp: Optional[datetime] = None, + ) -> Optional[AnalyticsRecord]: + """ + Save an analytics record + + Args: + record_type: Type of record (e.g., 'sentiment_summary', 'trend') + metric_name: Metric name (e.g., 'sentiment_score', 
'volume') + value: Metric value + asset: Optional asset symbol + window: Optional time window + previous_value: Optional previous value + change_percentage: Optional change percentage + trend_direction: Optional trend direction + extra_data: Optional additional metadata + timestamp: Optional timestamp (defaults to now) + + Returns: + AnalyticsRecord object if successful, None otherwise + """ + def _save(): + with self.get_session() as session: + record = AnalyticsRecord( + record_type=record_type, + metric_name=metric_name, + value=value, + asset=asset, + window=window, + previous_value=previous_value, + change_percentage=change_percentage, + trend_direction=trend_direction, + extra_data=extra_data, + timestamp=timestamp or datetime.utcnow(), + ) + session.add(record) + session.flush() + logger.debug(f"Saved analytics record: {record_type}/{metric_name}") + return record + + try: + return self._retry_operation(_save) + except SQLAlchemyError as e: + logger.error(f"Failed to save analytics record: {e}") + return None + + def save_analytics_records_batch( + self, + records_data: List[Dict[str, Any]], + ) -> int: + """ + Save multiple analytics records in a batch + + Args: + records_data: List of analytics record data dictionaries + + Returns: + Number of records saved + """ + saved_count = 0 + try: + with self.get_session() as session: + for record_data in records_data: + record = AnalyticsRecord( + record_type=record_data.get("record_type"), + metric_name=record_data.get("metric_name"), + value=record_data.get("value"), + asset=record_data.get("asset"), + window=record_data.get("window"), + previous_value=record_data.get("previous_value"), + change_percentage=record_data.get("change_percentage"), + trend_direction=record_data.get("trend_direction"), + extra_data=record_data.get("extra_data"), + timestamp=record_data.get("timestamp", datetime.utcnow()), + ) + session.add(record) + saved_count += 1 + + logger.info(f"Saved {saved_count} analytics records") + except 
def get_analytics_records(
    self,
    record_type: Optional[str] = None,
    asset: Optional[str] = None,
    metric_name: Optional[str] = None,
    hours: int = 24,
    limit: int = 100,
) -> List[AnalyticsRecord]:
    """
    Get analytics records from the last ``hours`` hours, newest first.

    Args:
        record_type: Optional record type filter
        asset: Optional asset filter
        metric_name: Optional metric name filter
        hours: Time window in hours
        limit: Maximum number of results

    Returns:
        List of AnalyticsRecord objects; an empty list if the query fails
    """
    try:
        with self.get_session() as session:
            cutoff_time = datetime.utcnow() - timedelta(hours=hours)
            stmt = select(AnalyticsRecord).where(
                AnalyticsRecord.timestamp >= cutoff_time
            )

            # Apply the optional equality filters before ordering/limiting so
            # the statement reads in the order the SQL is evaluated.
            if record_type:
                stmt = stmt.where(AnalyticsRecord.record_type == record_type)
            if asset:
                stmt = stmt.where(AnalyticsRecord.asset == asset)
            if metric_name:
                stmt = stmt.where(AnalyticsRecord.metric_name == metric_name)

            stmt = stmt.order_by(desc(AnalyticsRecord.timestamp)).limit(limit)

            results = session.execute(stmt).scalars().all()
            logger.debug(f"Retrieved {len(results)} analytics records")
            return results
    except SQLAlchemyError as e:
        logger.error(f"Failed to retrieve analytics records: {e}")
        return []


# Legacy News Insights Methods (kept for backward compatibility)

@staticmethod
def _build_news_insight(
    sentiment_result: Dict[str, Any],
    article_data: Optional[Dict[str, Any]] = None,
) -> NewsInsight:
    """Build (but do not persist) a NewsInsight from a sentiment result.

    Article fields are left as None when no article metadata is supplied.
    Missing sentiment keys raise KeyError, exactly as the inline
    constructors did.
    """
    article = article_data or {}
    return NewsInsight(
        article_id=article.get("id"),
        article_title=article.get("title"),
        article_url=article.get("url"),
        source=article.get("source"),
        sentiment_score=sentiment_result["compound_score"],
        positive_score=sentiment_result["positive"],
        negative_score=sentiment_result["negative"],
        neutral_score=sentiment_result["neutral"],
        sentiment_label=sentiment_result["sentiment_label"],
        keywords=article.get("keywords"),
        language=article.get("language"),
        article_published_at=article.get("published_at"),
    )


def save_news_insight(
    self,
    sentiment_result: Dict[str, Any],
    article_data: Optional[Dict[str, Any]] = None,
) -> Optional[NewsInsight]:
    """
    Save a news sentiment analysis result

    Args:
        sentiment_result: Sentiment analysis result dictionary
        article_data: Optional article metadata

    Returns:
        NewsInsight object if successful, None otherwise
    """
    try:
        with self.get_session() as session:
            insight = self._build_news_insight(sentiment_result, article_data)
            session.add(insight)
            # Flush so the primary key is populated before it is logged.
            session.flush()
            logger.debug(f"Saved news insight: {insight.id}")
            return insight
    except SQLAlchemyError as e:
        logger.error(f"Failed to save news insight: {e}")
        return None


def save_news_insights_batch(
    self,
    sentiment_results: List[Dict[str, Any]],
    articles_data: Optional[List[Dict[str, Any]]] = None,
) -> int:
    """
    Save multiple news insights in a batch

    Args:
        sentiment_results: List of sentiment analysis results
        articles_data: Optional list of article metadata, matched to
            ``sentiment_results`` by position (may be shorter)

    Returns:
        Number of insights saved; 0 if the batch failed
    """
    saved_count = 0
    try:
        with self.get_session() as session:
            for i, result in enumerate(sentiment_results):
                article_data = (
                    articles_data[i]
                    if articles_data and i < len(articles_data)
                    else None
                )
                session.add(self._build_news_insight(result, article_data))
                saved_count += 1

        logger.info(f"Saved {saved_count} news insights")
    except SQLAlchemyError as e:
        logger.error(f"Failed to save news insights batch: {e}")
        # The batch shares one transaction, so a database error means none
        # of the rows can be reported as saved.
        # NOTE(review): assumes get_session() rolls back on error - confirm.
        saved_count = 0

    return saved_count


def get_recent_news_insights(
    self, limit: int = 100, hours: int = 24
) -> List[NewsInsight]:
    """
    Get recent news insights, newest first

    Args:
        limit: Maximum number of results
        hours: Time window in hours

    Returns:
        List of NewsInsight objects; an empty list if the query fails
    """
    try:
        with self.get_session() as session:
            cutoff_time = datetime.utcnow() - timedelta(hours=hours)
            stmt = (
                select(NewsInsight)
                .where(NewsInsight.analyzed_at >= cutoff_time)
                .order_by(desc(NewsInsight.analyzed_at))
                .limit(limit)
            )
            results = session.execute(stmt).scalars().all()
            logger.debug(f"Retrieved {len(results)} news insights")
            return results
    except SQLAlchemyError as e:
        logger.error(f"Failed to retrieve news insights: {e}")
        return []


# Legacy Asset Trends Methods (kept for backward compatibility)

@staticmethod
def _build_asset_trend(
    asset: str,
    metric_name: str,
    window: str,
    trend_data: Dict[str, Any],
) -> AssetTrend:
    """Build (but do not persist) an AssetTrend from a trend dictionary.

    ``extra_data`` falls back to the legacy ``metadata`` key when absent.
    """
    return AssetTrend(
        asset=asset,
        metric_name=metric_name,
        window=window,
        trend_direction=trend_data["trend_direction"],
        score=trend_data.get("score", 0.0),
        current_value=trend_data["current_value"],
        previous_value=trend_data["previous_value"],
        change_percentage=trend_data["change_percentage"],
        extra_data=trend_data.get("extra_data") or trend_data.get("metadata"),
    )


def save_asset_trend(
    self,
    asset: str,
    metric_name: str,
    window: str,
    trend_data: Dict[str, Any],
) -> Optional[AssetTrend]:
    """
    Save an asset trend

    Args:
        asset: Asset symbol (e.g., 'XLM')
        metric_name: Metric name (e.g., 'sentiment_score')
        window: Time window (e.g., '24h')
        trend_data: Trend data dictionary

    Returns:
        AssetTrend object if successful, None otherwise
    """
    try:
        with self.get_session() as session:
            trend = self._build_asset_trend(asset, metric_name, window, trend_data)
            session.add(trend)
            session.flush()
            logger.debug(f"Saved asset trend: {asset}/{metric_name}")
            return trend
    except SQLAlchemyError as e:
        logger.error(f"Failed to save asset trend: {e}")
        return None


def save_asset_trends_batch(
    self, asset: str, window: str, trends: List[Dict[str, Any]]
) -> int:
    """
    Save multiple asset trends in a batch

    Args:
        asset: Asset symbol
        window: Time window
        trends: List of trend dictionaries (each must carry ``metric_name``)

    Returns:
        Number of trends saved; 0 if the batch failed
    """
    saved_count = 0
    try:
        with self.get_session() as session:
            for trend_data in trends:
                session.add(
                    self._build_asset_trend(
                        asset, trend_data["metric_name"], window, trend_data
                    )
                )
                saved_count += 1

        logger.info(f"Saved {saved_count} asset trends for {asset}")
    except SQLAlchemyError as e:
        logger.error(f"Failed to save asset trends batch: {e}")
        # Same single-transaction reasoning as save_news_insights_batch.
        # NOTE(review): assumes get_session() rolls back on error - confirm.
        saved_count = 0

    return saved_count


def get_recent_asset_trends(
    self, asset: str, metric_name: Optional[str] = None, limit: int = 100
) -> List[AssetTrend]:
    """
    Get recent asset trends, newest first

    Args:
        asset: Asset symbol
        metric_name: Optional metric name filter
        limit: Maximum number of results

    Returns:
        List of AssetTrend objects; an empty list if the query fails
    """
    try:
        with self.get_session() as session:
            stmt = select(AssetTrend).where(AssetTrend.asset == asset)

            if metric_name:
                stmt = stmt.where(AssetTrend.metric_name == metric_name)

            stmt = stmt.order_by(desc(AssetTrend.timestamp)).limit(limit)

            results = session.execute(stmt).scalars().all()
            logger.debug(f"Retrieved {len(results)} asset trends for {asset}")
            return results
    except SQLAlchemyError as e:
        logger.error(f"Failed to retrieve asset trends: {e}")
        return []


def upsert_on_chain_entity(
    self,
    stable_id: str,
    entity_type: str,
    name: str,
    ticker: Optional[str] = None,
    extra_data: Optional[Dict] = None,
) -> OnChainEntity:
    """
    Upsert an on-chain entity (create if not exists, update if exists).

    On update only ``name`` is always overwritten; ``ticker`` and
    ``extra_data`` keep their stored value when the new value is falsy, and
    ``entity_type`` is left untouched.

    Args:
        stable_id: Stable unique ID for the entity
        entity_type: "project" or "asset"
        name: Human-readable name
        ticker: Optional asset ticker
        extra_data: Optional additional metadata

    Returns:
        The OnChainEntity object
    """
    def _upsert():
        with self.get_session() as session:
            existing = session.execute(
                select(OnChainEntity).where(OnChainEntity.stable_id == stable_id)
            ).scalar_one_or_none()

            if existing:
                existing.name = name
                existing.ticker = ticker or existing.ticker
                existing.extra_data = extra_data or existing.extra_data
                session.flush()
                return existing

            entity = OnChainEntity(
                stable_id=stable_id,
                entity_type=entity_type,
                name=name,
                ticker=ticker,
                extra_data=extra_data,
            )
            session.add(entity)
            session.flush()
            return entity

    return self._retry_operation(_upsert)


def link_article_to_entities(self, article_id: str, linked_entities: List) -> None:
    """
    Link an article to on-chain entities.

    Each entity is upserted first, then a link row is created unless one
    already exists for the (article, entity) pair.

    Args:
        article_id: The article's unique ID
        linked_entities: List of LinkedEntity objects
    """
    def _link():
        # Upsert before opening the link session: upsert_on_chain_entity
        # manages its own session, so calling it inside the block below
        # would hold two sessions open at once for every entity.
        for entity in linked_entities:
            self.upsert_on_chain_entity(
                stable_id=entity.stable_id,
                entity_type=entity.entity_type,
                name=entity.name,
                ticker=getattr(entity, 'ticker', None),
            )

        with self.get_session() as session:
            for entity in linked_entities:
                # Check if link already exists
                existing_link = session.execute(
                    select(ArticleEntityLink).where(
                        and_(
                            ArticleEntityLink.article_id == article_id,
                            ArticleEntityLink.entity_stable_id == entity.stable_id,
                        )
                    )
                ).scalar_one_or_none()

                if not existing_link:
                    session.add(
                        ArticleEntityLink(
                            article_id=article_id,
                            entity_stable_id=entity.stable_id,
                            confidence=getattr(entity, 'confidence', None),
                        )
                    )

    self._retry_operation(_link)


def get_article_linked_entities(self, article_id: str) -> List[Dict]:
    """
    Get all entities linked to an article.

    Args:
        article_id: The article's unique ID

    Returns:
        List of entity data dictionaries (stable_id, type, name, ticker,
        confidence), in link order; links whose entity row is missing are
        skipped. An empty list if the query fails.
    """
    try:
        with self.get_session() as session:
            links = session.execute(
                select(ArticleEntityLink).where(
                    ArticleEntityLink.article_id == article_id
                )
            ).scalars().all()
            if not links:
                return []

            # Fetch every referenced entity in one query instead of one
            # query per link.
            wanted_ids = {link.entity_stable_id for link in links}
            found = session.execute(
                select(OnChainEntity).where(OnChainEntity.stable_id.in_(wanted_ids))
            ).scalars().all()
            by_stable_id = {entity.stable_id: entity for entity in found}

            entities = []
            for link in links:
                entity = by_stable_id.get(link.entity_stable_id)
                if entity:
                    entities.append({
                        "stable_id": entity.stable_id,
                        "type": entity.entity_type,
                        "name": entity.name,
                        "ticker": entity.ticker,
                        "confidence": link.confidence,
                    })
            return entities
    except SQLAlchemyError as e:
        logger.error(f"Failed to get linked entities for article {article_id}: {e}")
        return []


def get_articles_for_entity(self, stable_id: str, limit: int = 100) -> List[Article]:
    """
    Get all articles linked to a specific entity.

    Args:
        stable_id: The entity's stable ID
        limit: Maximum number of links (and therefore articles) to follow

    Returns:
        List of Article objects, newest published first; an empty list if
        there are no links or the query fails
    """
    try:
        with self.get_session() as session:
            links = session.execute(
                select(ArticleEntityLink)
                .where(ArticleEntityLink.entity_stable_id == stable_id)
                .limit(limit)
            ).scalars().all()

            article_ids = [link.article_id for link in links]
            if not article_ids:
                # Nothing linked: skip the pointless "IN ()" round trip.
                return []

            articles = session.execute(
                select(Article)
                .where(Article.article_id.in_(article_ids))
                .order_by(desc(Article.published_at))
            ).scalars().all()
            return articles
    except SQLAlchemyError as e:
        logger.error(f"Failed to get articles for entity {stable_id}: {e}")
        return []


def measure_entity_linker_precision(self) -> Dict[str, float]:
    """
    Measure and log the precision of the entity linker.

    Returns:
        Precision metrics dictionary as produced by ``measure_precision``
    """
    metrics = measure_precision(self.entity_linker)
    logger.info("Entity Linker Precision Metrics:")
    logger.info(f" Precision: {metrics['precision']:.4f}")
    logger.info(f" Recall: {metrics['recall']:.4f}")
    logger.info(f" F1 Score: {metrics['f1']:.4f}")
    logger.info(f" Test Cases: {metrics['test_cases']}")
    return metrics


def get_sentiment_summary(self, hours: int = 24) -> Dict[str, Any]:
    """
    Get sentiment summary statistics

    Args:
        hours: Time window in hours

    Returns:
        Summary statistics dictionary (totals, average, per-label counts
        and percentages); an empty dict if the query fails
    """
    try:
        with self.get_session() as session:
            cutoff_time = datetime.utcnow() - timedelta(hours=hours)

            insights = session.execute(
                select(NewsInsight).where(NewsInsight.analyzed_at >= cutoff_time)
            ).scalars().all()

            if not insights:
                # Same key set as the populated case so consumers do not
                # need a special path for "no data".
                return {
                    "total_articles": 0,
                    "average_sentiment": 0.0,
                    "positive_count": 0,
                    "negative_count": 0,
                    "neutral_count": 0,
                    "positive_percentage": 0.0,
                    "negative_percentage": 0.0,
                    "neutral_percentage": 0.0,
                }

            total = len(insights)
            avg_sentiment = sum(i.sentiment_score for i in insights) / total
            positive = sum(1 for i in insights if i.sentiment_label == "positive")
            negative = sum(1 for i in insights if i.sentiment_label == "negative")
            neutral = sum(1 for i in insights if i.sentiment_label == "neutral")

            return {
                "total_articles": total,
                "average_sentiment": round(avg_sentiment, 4),
                "positive_count": positive,
                "negative_count": negative,
                "neutral_count": neutral,
                "positive_percentage": round(positive / total * 100, 2),
                "negative_percentage": round(negative / total * 100, 2),
                "neutral_percentage": round(neutral / total * 100, 2),
            }
    except SQLAlchemyError as e:
        logger.error(f"Failed to get sentiment summary: {e}")
        return {}


def cleanup_old_data(self, days: int = 30) -> Dict[str, int]:
    """
    Clean up old analytics data

    Deletes rows whose ``created_at`` is older than ``days`` days from the
    article, social post, analytics record, news insight and asset trend
    tables, all within one session.

    Args:
        days: Number of days to keep

    Returns:
        Dictionary with counts of deleted records (all zeros on failure)
    """
    # (result key, model) pairs in the order the original deletes ran;
    # news_insights and asset_trends are the legacy tables.
    targets = (
        ("articles", Article),
        ("social_posts", SocialPost),
        ("analytics_records", AnalyticsRecord),
        ("news_insights", NewsInsight),
        ("asset_trends", AssetTrend),
    )
    try:
        cutoff_date = datetime.utcnow() - timedelta(days=days)
        deleted_counts = {key: 0 for key, _ in targets}

        with self.get_session() as session:
            for key, model in targets:
                deleted_counts[key] = session.query(model).filter(
                    model.created_at < cutoff_date
                ).delete()

        logger.info(f"Cleaned up old data: {deleted_counts}")
        return deleted_counts
    except SQLAlchemyError as e:
        logger.error(f"Failed to cleanup old data: {e}")
        return {key: 0 for key, _ in targets}
class SorobanEventIndexer:
    """Poll a Soroban RPC endpoint for contract events and forward them to the backend.

    Progress is tracked as the last fully processed ledger sequence, persisted
    to a small JSON state file so that a restart resumes where it left off.
    """

    # Page size requested from getEvents; a shorter page ends pagination.
    PAGE_LIMIT = 100

    def __init__(
        self,
        rpc_url: str,
        backend_url: str,
        ingest_secret: str,
        contract_ids: Optional[List[str]] = None,
        state_file: str = "./data/soroban_indexer_state.json",
        poll_interval: int = 30
    ):
        """
        Args:
            rpc_url: Soroban JSON-RPC endpoint
            backend_url: Base URL of the backend exposing /soroban-events/ingest
            ingest_secret: Value sent in the x-ingest-secret header
            contract_ids: Optional contract IDs to filter on (no filter if empty)
            state_file: Path of the JSON checkpoint file
            poll_interval: Seconds to sleep between runs in continuous mode
        """
        self.rpc_url = rpc_url
        self.backend_url = backend_url
        self.ingest_secret = ingest_secret
        self.contract_ids = contract_ids or []
        self.state_file = Path(state_file)
        self.poll_interval = poll_interval
        self.last_ledger: int = self._load_last_ledger()

    def _load_last_ledger(self) -> int:
        """Load last processed ledger from state file.

        Returns 0 when the file is missing, unreadable, not a JSON object, or
        holds a value that is not an integer.
        """
        if not self.state_file.exists():
            return 0
        try:
            with open(self.state_file, 'r') as f:
                state = json.load(f)
            return max(int(state.get("last_ledger", 0)), 0)
        except (OSError, ValueError, TypeError, AttributeError):
            # ValueError covers json.JSONDecodeError and int("abc");
            # AttributeError covers valid JSON that is not an object.
            logger.warning("Failed to load state file, starting from ledger 0")
            return 0

    def _save_last_ledger(self, ledger: int):
        """Save last processed ledger to state file and update ``last_ledger``."""
        self.state_file.parent.mkdir(parents=True, exist_ok=True)
        state = {"last_ledger": ledger, "timestamp": datetime.now(timezone.utc).isoformat()}
        # Write to a sibling temp file and rename so an interrupted write
        # cannot leave a truncated checkpoint behind.
        tmp_path = self.state_file.with_name(self.state_file.name + ".tmp")
        with open(tmp_path, 'w') as f:
            json.dump(state, f)
        os.replace(tmp_path, self.state_file)
        self.last_ledger = ledger

    @staticmethod
    def _per_transaction_indexes(events: List[Dict]) -> List[int]:
        """Return, for each event, its 0-based position among events of the same transaction.

        Unlike the position in the fetched batch, this stays the same when a
        ledger is fetched again, so (txHash, eventIndex) identifies an event.
        """
        seen: Dict[str, int] = {}
        indexes = []
        for event in events:
            tx_hash = event.get("transactionHash", "")
            position = seen.get(tx_hash, 0)
            indexes.append(position)
            seen[tx_hash] = position + 1
        return indexes

    @staticmethod
    def _resolve_checkpoint(previous: int, highest_seen: int, first_failed: Optional[int]) -> int:
        """Pick the ledger to persist after a run.

        With no failures this is the highest ledger seen. Otherwise it stops
        just before the earliest ledger with a failed event (never moving
        backwards), so that ledger is fetched again on the next run.
        """
        if first_failed is None:
            return highest_seen
        return max(previous, min(highest_seen, first_failed - 1))

    def fetch_latest_ledger(self) -> int:
        """Get the latest ledger sequence from Soroban RPC.

        Raises:
            RuntimeError: if the RPC answers with a JSON-RPC error object
            Exception: any transport/HTTP/parse error, re-raised after logging
        """
        payload = {
            "jsonrpc": "2.0",
            "id": 1,
            "method": "getLatestLedger"
        }

        try:
            response = requests.post(self.rpc_url, json=payload, timeout=30)
            response.raise_for_status()
            data = response.json()
            if "error" in data:
                # Without this check an error reply would read as sequence 0
                # and be reported as "no new ledgers".
                raise RuntimeError(f"RPC Error: {data['error']}")
            return int(data.get("result", {}).get("sequence", 0))
        except Exception as e:
            logger.error(f"Failed to fetch latest ledger: {e}")
            raise

    def fetch_events_since(self, start_ledger: int) -> List[Dict]:
        """Fetch events from Soroban RPC starting at the given ledger.

        Pages through getEvents until a page shorter than PAGE_LIMIT comes back
        or no continuation cursor is available.

        Raises:
            RuntimeError: if the RPC answers with a JSON-RPC error object
            Exception: any transport/HTTP/parse error, re-raised after logging
        """
        filters = []
        if self.contract_ids:
            filters.append({
                "type": "contract",
                "contractIds": self.contract_ids
            })

        all_events = []
        cursor = None

        while True:
            pagination = {"limit": self.PAGE_LIMIT}
            params = {"filters": filters, "pagination": pagination}
            if cursor:
                # startLedger is left out on follow-up pages because the RPC
                # does not accept it together with a cursor.
                pagination["cursor"] = cursor
            else:
                params["startLedger"] = start_ledger

            payload = {
                "jsonrpc": "2.0",
                "id": 1,
                "method": "getEvents",
                "params": params
            }

            try:
                response = requests.post(self.rpc_url, json=payload, timeout=30)
                response.raise_for_status()
                data = response.json()
            except Exception as e:
                logger.error(f"RPC Request failed: {e}")
                raise

            if "error" in data:
                logger.error(f"RPC Error: {data['error']}")
                raise RuntimeError(f"RPC Error: {data['error']}")

            result = data.get("result", {})
            events = result.get("events", [])
            all_events.extend(events)

            # Check if we need to paginate
            if len(events) < self.PAGE_LIMIT:
                break

            # Prefer the result-level cursor, then fall back to the last
            # event's paging token / id.
            last_event = events[-1]
            cursor = (
                result.get("cursor")
                or last_event.get("pagingToken")
                or last_event.get("id")
            )
            if not cursor:
                break

            time.sleep(0.5)  # Rate limiting

        return all_events

    def send_event_to_backend(self, event: Dict, event_index: int) -> bool:
        """Send a single event to the backend ingest endpoint.

        Returns:
            True on a successful (non-error) HTTP response, False otherwise.
        """
        tx_hash = event.get("transactionHash", "")

        ingest_payload = {
            "txHash": tx_hash,
            "eventIndex": event_index,
            "ledgerSequence": int(event.get("ledger", 0)),
            "contractId": event.get("contractId"),
            "eventType": event.get("type"),
            "rawPayload": event
        }

        headers = {
            "Content-Type": "application/json",
            "x-ingest-secret": self.ingest_secret
        }

        try:
            response = requests.post(
                f"{self.backend_url}/soroban-events/ingest",
                json=ingest_payload,
                headers=headers,
                timeout=30
            )
            response.raise_for_status()
            logger.debug(f"Successfully sent event {tx_hash}:{event_index} to backend")
            return True
        except Exception as e:
            logger.error(f"Failed to send event {tx_hash}:{event_index} to backend: {e}")
            return False

    def run_once(self) -> Dict:
        """Run one iteration of the indexer.

        Returns:
            A status dictionary: ``no_new_ledgers``, ``error`` (with the
            message), or ``success`` with event counts and the new checkpoint.
        """
        logger.info("=" * 60)
        logger.info("SOROBAN EVENT INDEXER - INCREMENTAL SYNC")
        logger.info("=" * 60)

        try:
            latest_ledger = self.fetch_latest_ledger()
            logger.info(f"Latest ledger: {latest_ledger}")
            logger.info(f"Last processed ledger: {self.last_ledger}")

            if latest_ledger <= self.last_ledger:
                logger.info("No new ledgers to process")
                return {"status": "no_new_ledgers", "events_processed": 0}

            # NOTE(review): with no saved state this asks for ledger 1, and the
            # checkpoint only moves when events are found - confirm the RPC's
            # retention window makes that acceptable.
            start_ledger = self.last_ledger + 1
            logger.info(f"Fetching events from ledger {start_ledger} to {latest_ledger}")

            events = self.fetch_events_since(start_ledger)
            logger.info(f"Found {len(events)} new events")

            # Send events to backend
            sent_count = 0
            failed_count = 0
            highest_ledger = self.last_ledger
            first_failed_ledger = None

            event_indexes = self._per_transaction_indexes(events)
            for event, event_index in zip(events, event_indexes):
                event_ledger = int(event.get("ledger", 0))

                if self.send_event_to_backend(event, event_index):
                    sent_count += 1
                else:
                    failed_count += 1
                    if first_failed_ledger is None or event_ledger < first_failed_ledger:
                        first_failed_ledger = event_ledger

                # Update highest ledger seen
                if event_ledger > highest_ledger:
                    highest_ledger = event_ledger

            # Never checkpoint past a ledger that still has undelivered events,
            # otherwise those events are lost for good.
            checkpoint = self._resolve_checkpoint(
                self.last_ledger, highest_ledger, first_failed_ledger
            )
            self._save_last_ledger(checkpoint)

            logger.info(f"Sent {sent_count} events to backend, {failed_count} failed")
            logger.info(f"Updated last processed ledger to {checkpoint}")
            logger.info("=" * 60)

            return {
                "status": "success",
                "events_found": len(events),
                "events_sent": sent_count,
                "events_failed": failed_count,
                "last_ledger": checkpoint
            }

        except Exception as e:
            logger.error(f"Error in indexer run: {e}", exc_info=True)
            return {"status": "error", "error": str(e)}

    def run_forever(self):
        """Run the indexer continuously, polling for new events"""
        logger.info("Starting Soroban event indexer (continuous mode)")
        logger.info(f"Poll interval: {self.poll_interval} seconds")

        while True:
            self.run_once()
            time.sleep(self.poll_interval)


def main():
    """Parse command-line options (with environment defaults) and run the indexer."""
    import argparse

    parser = argparse.ArgumentParser(description="Soroban Event Indexer")
    parser.add_argument("--rpc-url", type=str, default=os.getenv("SOROBAN_RPC_URL", "https://soroban-testnet.stellar.org"), help="Soroban RPC URL")
    parser.add_argument("--backend-url", type=str, default=os.getenv("BACKEND_URL", "http://localhost:3000"), help="Backend API URL")
    parser.add_argument("--ingest-secret", type=str, default=os.getenv("SOROBAN_INGEST_SECRET", ""), help="Secret for backend ingest endpoint")
    parser.add_argument("--contract-ids", nargs="*", default=os.getenv("SOROBAN_CONTRACT_IDS", "").split(","), help="List of contract IDs to index (comma-separated)")
    parser.add_argument("--state-file", type=str, default="./data/soroban_indexer_state.json", help="Path to state file")
    parser.add_argument("--poll-interval", type=int, default=30, help="Poll interval in seconds")
    parser.add_argument("--once", action="store_true", help="Run once and exit")

    args = parser.parse_args()

    # Clean up contract ids (an unset env var splits to [""], which is dropped here)
    contract_ids = [cid.strip() for cid in args.contract_ids if cid.strip()]

    indexer = SorobanEventIndexer(
        rpc_url=args.rpc_url,
        backend_url=args.backend_url,
        ingest_secret=args.ingest_secret,
        contract_ids=contract_ids,
        state_file=args.state_file,
        poll_interval=args.poll_interval
    )

    if args.once:
        indexer.run_once()
    else:
        indexer.run_forever()


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s"
    )
    main()
0000000000000000000000000000000000000000..9be79a5b6e4d0e40fb1bcab3c0e3f261e26712a6 GIT binary patch literal 623 QcmZQz7zLvtK(`P800M6S0RR91 literal 0 HcmV?d00001 diff --git a/apps/data-processing/ml/feature_store.py b/apps/data-processing/ml/feature_store.py new file mode 100644 index 0000000000000000000000000000000000000000..51b0c3d05e46a277dca1853917d9b55753e0c668 GIT binary patch literal 3732 ncmeIuF#!Mo0K%a4Pi+hzh(KY$fB^#r3>YwAz<>b*20jA=4wL`^ literal 0 HcmV?d00001 diff --git a/apps/data-processing/ml/model_registry.py b/apps/data-processing/ml/model_registry.py new file mode 100644 index 0000000000000000000000000000000000000000..465a1a02c285ec2423c03def3082615b7953ff4a GIT binary patch literal 7036 zcmeIuF#!Mo0K%a4Pi+ZLh(KY$fB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFkryIV_+M6 B00961 literal 0 HcmV?d00001 diff --git a/apps/data-processing/ml/price_predictor.py b/apps/data-processing/ml/price_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..845c4d2797d2184e5b9e6a1672dbd20097bebcdb GIT binary patch literal 3119 kcmeIufdBvi0K=g9Q(xc+g-~I@fB^#r3>YwAz<_}}FbppM0RR91 literal 0 HcmV?d00001 diff --git a/apps/data-processing/ml/retraining_pipeline.py b/apps/data-processing/ml/retraining_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..803aa2cd90bdc9dc638ceb77f72437c25c9cde17 GIT binary patch literal 10353 zcmeIufdBvi0K=g9Qy<|1g-~I@fB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFkrxd0RsjM Q7%*VKfB^#r3>Y{A11NC-0RR91 literal 0 HcmV?d00001 diff --git a/apps/data-processing/qa_exporter.py b/apps/data-processing/qa_exporter.py new file mode 100644 index 0000000000000000000000000000000000000000..e1d14d3c813f8c8f8ae45e2fa786e518ed09068b GIT binary patch literal 9736 zcmeIu0Sy2E0K%a6Pi+o2h(KY$fB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFkrxd0RsjM N7%*VKfB^%~0|O=q00961 literal 0 HcmV?d00001 diff --git a/apps/data-processing/scheduler.py b/apps/data-processing/scheduler.py new file mode 100644 index 
@dataclass
class LinkedEntity:
    """An on-chain entity detected in a piece of text."""

    stable_id: str  # deterministic ID of the form "<entity_type>:<identifier>"
    entity_type: str  # "project" or "asset"
    name: str
    ticker: Optional[str] = None
    confidence: float = 1.0


class EntityLinker:
    """
    Links text content to known on-chain entities (projects and assets)
    with stable, deterministic IDs.
    """

    # Candidate tickers: standalone runs of 2-6 uppercase letters. Compiled
    # once here instead of on every link_text() call.
    _TICKER_PATTERN = re.compile(r"\b([A-Z]{2,6})\b")

    def __init__(self) -> None:
        self._project_patterns = self._compile_project_patterns()
        # Filter out SDF from asset tickers since it's a project
        self._asset_tickers = {t for t in KNOWN_TICKERS if t != "SDF"}

    def _compile_project_patterns(self) -> List[Tuple[str, re.Pattern]]:
        """Compile regex patterns for project name matching, sorted by length descending."""
        # Longer names first so the more specific name is tried before any
        # shorter name it contains.
        sorted_projects = sorted(CRYPTO_PROJECT_MAP.keys(), key=len, reverse=True)
        return [
            (
                project_name,
                re.compile(r"\b" + re.escape(project_name) + r"\b", re.IGNORECASE),
            )
            for project_name in sorted_projects
        ]

    def _generate_stable_id(self, entity_type: str, identifier: str) -> str:
        """Generate a stable, deterministic ID for an entity."""
        normalized = identifier.strip().lower()
        return f"{entity_type}:{normalized}"

    @staticmethod
    def _canonical_name(aliases, fallback: str) -> str:
        """Return the canonical project name: the last alias, or ``fallback`` if there are none."""
        return aliases[-1] if aliases else fallback

    def _add_project(
        self, entities: Dict[str, LinkedEntity], canonical_name: str, confidence: float
    ) -> None:
        """Register a project entity unless one with the same stable ID is already present."""
        stable_id = self._generate_stable_id("project", canonical_name)
        if stable_id not in entities:
            entities[stable_id] = LinkedEntity(
                stable_id=stable_id,
                entity_type="project",
                name=canonical_name,
                confidence=confidence,
            )

    def link_text(
        self,
        text: str,
        title: Optional[str] = None
    ) -> List[LinkedEntity]:
        """
        Link the given text to known on-chain entities.

        Args:
            text: Main text content to analyze
            title: Optional article title, searched together with ``text``

        Returns:
            List of LinkedEntity objects with stable IDs, one per distinct entity
        """
        entities: Dict[str, LinkedEntity] = {}

        # Title and text are searched as one string; the first match for a
        # stable ID wins, so name matches (0.95) outrank ticker-derived
        # projects (0.85).
        full_text = f"{title or ''}\n{text or ''}"

        # Match project names
        for project_name, pattern in self._project_patterns:
            if pattern.search(full_text):
                canonical_name = self._canonical_name(
                    CRYPTO_PROJECT_MAP[project_name], project_name
                )
                self._add_project(entities, canonical_name, 0.95)

        # Match tickers
        for ticker in self._TICKER_PATTERN.findall(full_text):
            if ticker not in self._asset_tickers:
                continue

            stable_id = self._generate_stable_id("asset", ticker)
            if stable_id not in entities:
                entities[stable_id] = LinkedEntity(
                    stable_id=stable_id,
                    entity_type="asset",
                    name=ticker,
                    ticker=ticker,
                    confidence=0.9,
                )

            # Also link the associated project if available, using canonical ID
            if ticker in TICKER_TO_PROJECT:
                for project_name in TICKER_TO_PROJECT[ticker]:
                    # An unknown project or an empty alias list falls back to
                    # the project name itself instead of raising IndexError.
                    canonical_name = self._canonical_name(
                        CRYPTO_PROJECT_MAP.get(project_name.lower()), project_name
                    )
                    self._add_project(entities, canonical_name, 0.85)

        return list(entities.values())

    def link_article(
        self,
        title: Optional[str],
        summary: Optional[str],
        content: Optional[str]
    ) -> List[LinkedEntity]:
        """Link an article's title, summary and content to on-chain entities."""
        # The title goes in through link_text's own parameter, so only
        # summary and content are joined here.
        body_text = "\n".join([summary or "", content or ""])
        return self.link_text(body_text, title)


# Small labeled test set for precision measurement
LABELED_TEST_SET = [
    {
        "text": "Stellar Development Foundation (SDF) announces new Soroban upgrade. XLM price surges.",
        "expected_entities": [
            {"stable_id": "project:stellar", "type": "project"},
            {"stable_id": "project:soroban", "type": "project"},
            {"stable_id": "asset:xlm", "type": "asset"}
        ]
    },
    {
        "text": "Bitcoin (BTC) reaches new all-time high. Ethereum (ETH) follows closely.",
        "expected_entities": [
            {"stable_id": "asset:btc", "type": "asset"},
            {"stable_id": "asset:eth", "type": "asset"}
        ]
    },
    {
        "text": "DeFi protocol Uniswap launches new liquidity pool on Solana.",
        "expected_entities": [
            {"stable_id": "project:uniswap", "type": "project"},
            {"stable_id": "asset:sol", "type": "asset"}
        ]
    },
    {
        "text": "Cardano (ADA) releases new roadmap for governance.",
        "expected_entities": [
            {"stable_id": "asset:ada", "type": "asset"}
        ]
    },
    {
        "text": "Tech stocks rally on positive earnings. Apple and Microsoft lead gains.",
        "expected_entities": []  # No crypto entities
    }
]


def measure_precision(entity_linker: EntityLinker) -> Dict[str, float]:
    """
    Measure precision of the entity linker using the labeled test set.

    Each test text is passed to ``entity_linker.link_text``; a returned entity
    is a true positive when its stable ID is expected for that text and a false
    positive otherwise. Precision is 1.0 when the linker returns nothing at all.

    Returns:
        Dictionary with precision, recall, f1, the raw counts and the number
        of test cases
    """
    true_positives = 0
    false_positives = 0
    total_expected = 0

    for test_case in LABELED_TEST_SET:
        expected = test_case["expected_entities"]
        total_expected += len(expected)
        expected_stable_ids = {e["stable_id"] for e in expected}

        # Calculate true positives and false positives
        for entity in entity_linker.link_text(test_case["text"]):
            if entity.stable_id in expected_stable_ids:
                true_positives += 1
            else:
                false_positives += 1

    predicted = true_positives + false_positives
    precision = true_positives / predicted if predicted > 0 else 1.0
    recall = true_positives / total_expected if total_expected > 0 else 1.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "true_positives": true_positives,
        "false_positives": false_positives,
        "total_expected": total_expected,
        "test_cases": len(LABELED_TEST_SET)
    }
self.backend_url = backend_url + self.ingest_secret = ingest_secret + self.contract_ids = contract_ids or [] + self.state_file = Path(state_file) + self.poll_interval = poll_interval + self.last_ledger: int = self._load_last_ledger() + + def _load_last_ledger(self) -> int: + """Load last processed ledger from state file""" + if self.state_file.exists(): + try: + with open(self.state_file, 'r') as f: + state = json.load(f) + return state.get("last_ledger", 0) + except (json.JSONDecodeError, KeyError): + logger.warning("Failed to load state file, starting from ledger 0") + return 0 + + def _save_last_ledger(self, ledger: int): + """Save last processed ledger to state file""" + self.state_file.parent.mkdir(parents=True, exist_ok=True) + with open(self.state_file, 'w') as f: + json.dump({"last_ledger": ledger, "timestamp": datetime.now(timezone.utc).isoformat()}, f) + self.last_ledger = ledger + + def fetch_latest_ledger(self) -> int: + """Get the latest ledger sequence from Soroban RPC""" + payload = { + "jsonrpc": "2.0", + "id": 1, + "method": "getLatestLedger" + } + + try: + response = requests.post(self.rpc_url, json=payload, timeout=30) + response.raise_for_status() + data = response.json() + return int(data.get("result", {}).get("sequence", 0)) + except Exception as e: + logger.error(f"Failed to fetch latest ledger: {e}") + raise + + def fetch_events_since(self, start_ledger: int) -> List[Dict]: + """Fetch events from Soroban RPC starting at the given ledger""" + all_events = [] + cursor = None + + while True: + filters = [] + if self.contract_ids: + filters.append({ + "type": "contract", + "contractIds": self.contract_ids + }) + + payload = { + "jsonrpc": "2.0", + "id": 1, + "method": "getEvents", + "params": { + "startLedger": start_ledger, + "filters": filters, + "pagination": { + "limit": 100 + } + } + } + + if cursor: + payload["params"]["pagination"]["cursor"] = cursor + + try: + response = requests.post(self.rpc_url, json=payload, timeout=30) + 
response.raise_for_status() + data = response.json() + except Exception as e: + logger.error(f"RPC Request failed: {e}") + raise + + if "error" in data: + logger.error(f"RPC Error: {data['error']}") + raise RuntimeError(f"RPC Error: {data['error']}") + + events = data.get("result", {}).get("events", []) + all_events.extend(events) + + # Check if we need to paginate + if len(events) < 100: + break + + # Get cursor from last event + if events: + cursor = events[-1].get("pagingToken") + + if not cursor: + break + + time.sleep(0.5) # Rate limiting + + return all_events + + def send_event_to_backend(self, event: Dict, event_index: int) -> bool: + """Send a single event to the backend ingest endpoint""" + tx_hash = event.get("transactionHash", "") + ledger_sequence = int(event.get("ledger", 0)) + contract_id = event.get("contractId") + event_type = event.get("type") + raw_payload = event + + ingest_payload = { + "txHash": tx_hash, + "eventIndex": event_index, + "ledgerSequence": ledger_sequence, + "contractId": contract_id, + "eventType": event_type, + "rawPayload": raw_payload + } + + headers = { + "Content-Type": "application/json", + "x-ingest-secret": self.ingest_secret + } + + try: + response = requests.post( + f"{self.backend_url}/soroban-events/ingest", + json=ingest_payload, + headers=headers, + timeout=30 + ) + response.raise_for_status() + logger.debug(f"Successfully sent event {tx_hash}:{event_index} to backend") + return True + except Exception as e: + logger.error(f"Failed to send event {tx_hash}:{event_index} to backend: {e}") + return False + + def run_once(self) -> Dict: + """Run one iteration of the indexer""" + logger.info("=" * 60) + logger.info("SOROBAN EVENT INDEXER - INCREMENTAL SYNC") + logger.info("=" * 60) + + try: + latest_ledger = self.fetch_latest_ledger() + logger.info(f"Latest ledger: {latest_ledger}") + logger.info(f"Last processed ledger: {self.last_ledger}") + + if latest_ledger <= self.last_ledger: + logger.info("No new ledgers to 
process") + return {"status": "no_new_ledgers", "events_processed": 0} + + start_ledger = self.last_ledger + 1 + logger.info(f"Fetching events from ledger {start_ledger} to {latest_ledger}") + + events = self.fetch_events_since(start_ledger) + logger.info(f"Found {len(events)} new events") + + # Send events to backend + sent_count = 0 + failed_count = 0 + highest_ledger = self.last_ledger + + for idx, event in enumerate(events): + success = self.send_event_to_backend(event, idx) + if success: + sent_count += 1 + else: + failed_count += 1 + + # Update highest ledger seen + event_ledger = int(event.get("ledger", 0)) + if event_ledger > highest_ledger: + highest_ledger = event_ledger + + # Update state to the highest ledger processed + self._save_last_ledger(highest_ledger) + + logger.info(f"Sent {sent_count} events to backend, {failed_count} failed") + logger.info(f"Updated last processed ledger to {highest_ledger}") + logger.info("=" * 60) + + return { + "status": "success", + "events_found": len(events), + "events_sent": sent_count, + "events_failed": failed_count, + "last_ledger": highest_ledger + } + + except Exception as e: + logger.error(f"Error in indexer run: {e}", exc_info=True) + return {"status": "error", "error": str(e)} + + def run_forever(self): + """Run the indexer continuously, polling for new events""" + logger.info("Starting Soroban event indexer (continuous mode)") + logger.info(f"Poll interval: {self.poll_interval} seconds") + + while True: + self.run_once() + time.sleep(self.poll_interval) + +def main(): + import argparse + + parser = argparse.ArgumentParser(description="Soroban Event Indexer") + parser.add_argument("--rpc-url", type=str, default=os.getenv("SOROBAN_RPC_URL", "https://soroban-testnet.stellar.org"), help="Soroban RPC URL") + parser.add_argument("--backend-url", type=str, default=os.getenv("BACKEND_URL", "http://localhost:3000"), help="Backend API URL") + parser.add_argument("--ingest-secret", type=str, 
default=os.getenv("SOROBAN_INGEST_SECRET", ""), help="Secret for backend ingest endpoint") + parser.add_argument("--contract-ids", nargs="*", default=os.getenv("SOROBAN_CONTRACT_IDS", "").split(","), help="List of contract IDs to index (comma-separated)") + parser.add_argument("--state-file", type=str, default="./data/soroban_indexer_state.json", help="Path to state file") + parser.add_argument("--poll-interval", type=int, default=30, help="Poll interval in seconds") + parser.add_argument("--once", action="store_true", help="Run once and exit") + + args = parser.parse_args() + + # Clean up contract ids + contract_ids = [cid.strip() for cid in args.contract_ids if cid.strip()] + + indexer = SorobanEventIndexer( + rpc_url=args.rpc_url, + backend_url=args.backend_url, + ingest_secret=args.ingest_secret, + contract_ids=contract_ids, + state_file=args.state_file, + poll_interval=args.poll_interval + ) + + if args.once: + indexer.run_once() + else: + indexer.run_forever() + +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s" + ) + main() diff --git a/apps/data-processing/standalone_test.py b/apps/data-processing/standalone_test.py new file mode 100644 index 00000000..30361134 --- /dev/null +++ b/apps/data-processing/standalone_test.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python3 +""" +Standalone test for Entity Linker core logic +""" + +import logging +import re +from typing import Dict, List, Optional, Tuple +from dataclasses import dataclass + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + + +# Copy of the relevant constants from keywords.py +CRYPTO_PROJECT_MAP: dict[str, List[str]] = { + "stellar": ["XLM", "Stellar"], + "xlm": ["XLM", "Stellar"], + "soroban": ["XLM", "Soroban"], + "stellar development foundation": ["SDF", "Stellar"], + "bitcoin": ["BTC", "Bitcoin"], + "btc": ["BTC", 
"Bitcoin"], + "ethereum": ["ETH", "Ethereum"], + "eth": ["ETH", "Ethereum"], + "solana": ["SOL", "Solana"], + "sol": ["SOL", "Solana"], + "usdc": ["USDC", "USDC"], + "usd coin": ["USDC", "USDC"], + "ripple": ["XRP", "Ripple"], + "xrp": ["XRP", "XRP"], + "cardano": ["ADA", "Cardano"], + "ada": ["ADA", "ADA"], + "polkadot": ["DOT", "Polkadot"], + "dot": ["DOT", "DOT"], + "dogecoin": ["DOGE", "Dogecoin"], + "doge": ["DOGE", "DOGE"], + "litecoin": ["LTC", "Litecoin"], + "ltc": ["LTC", "LTC"], + "chainlink": ["LINK", "Chainlink"], + "link": ["LINK", "LINK"], + "avalanche": ["AVAX", "Avalanche"], + "avax": ["AVAX", "AVAX"], + "polygon": ["MATIC", "Polygon"], + "matic": ["MATIC", "MATIC"], + "algorand": ["ALGO", "Algorand"], + "algo": ["ALGO", "ALGO"], + "cosmos": ["ATOM", "Cosmos"], + "atom": ["ATOM", "ATOM"], + "uniswap": ["UNI", "Uniswap"], + "defi": ["DeFi", "DeFi"], + "nft": ["NFT", "NFT"], + "nfts": ["NFT", "NFT"], +} + +KNOWN_TICKERS = { + "XLM", "BTC", "ETH", "SOL", "USDC", "XRP", "ADA", "DOT", "DOGE", "LTC", + "LINK", "AVAX", "MATIC", "ALGO", "ATOM", "UNI", "USDT", "BUSD", "BNB", "SDF" +} + +TICKER_TO_PROJECT: dict[str, List[str]] = { + "XLM": ["Stellar"], + "BTC": ["Bitcoin"], + "ETH": ["Ethereum"], + "SOL": ["Solana"], + "XRP": ["Ripple"], + "ADA": ["Cardano"], + "DOT": ["Polkadot"], + "DOGE": ["Dogecoin"], + "LTC": ["Litecoin"], + "LINK": ["Chainlink"], + "AVAX": ["Avalanche"], + "MATIC": ["Polygon"], + "ALGO": ["Algorand"], + "ATOM": ["Cosmos"], + "UNI": ["Uniswap"], + "USDC": ["USDC"], + "USDT": ["Tether"], +} + + +@dataclass +class LinkedEntity: + stable_id: str + entity_type: str # "project" or "asset" + name: str + ticker: Optional[str] = None + confidence: float = 1.0 + + +class EntityLinker: + def __init__(self) -> None: + self._project_patterns = self._compile_project_patterns() + self._asset_tickers = {t for t in KNOWN_TICKERS if t not in ["SDF"]} + + def _compile_project_patterns(self) -> List[Tuple[str, re.Pattern]]: + patterns = [] + 
sorted_projects = sorted( + CRYPTO_PROJECT_MAP.keys(), + key=lambda x: len(x), + reverse=True + ) + for project_name in sorted_projects: + pattern = re.compile(r"\b" + re.escape(project_name) + r"\b", re.IGNORECASE) + patterns.append((project_name, pattern)) + return patterns + + def _generate_stable_id(self, entity_type: str, identifier: str) -> str: + normalized = identifier.strip().lower() + return f"{entity_type}:{normalized}" + + def link_text( + self, + text: str, + title: Optional[str] = None + ) -> List[LinkedEntity]: + entities: Dict[str, LinkedEntity] = {} + + full_text = f"{title or ''}\n{text or ''}" + + for project_name, pattern in self._project_patterns: + if pattern.search(full_text): + canonical_name = CRYPTO_PROJECT_MAP[project_name][-1] if CRYPTO_PROJECT_MAP[project_name] else project_name + canonical_stable_id = self._generate_stable_id("project", canonical_name.lower()) + + if canonical_stable_id not in entities: + entities[canonical_stable_id] = LinkedEntity( + stable_id=canonical_stable_id, + entity_type="project", + name=canonical_name, + confidence=0.95 + ) + + ticker_pattern = re.compile(r"\b([A-Z]{2,6})\b") + for ticker in ticker_pattern.findall(full_text): + ticker = ticker.upper() + if ticker in self._asset_tickers: + stable_id = self._generate_stable_id("asset", ticker) + if stable_id not in entities: + entities[stable_id] = LinkedEntity( + stable_id=stable_id, + entity_type="asset", + name=ticker, + ticker=ticker, + confidence=0.9 + ) + if ticker in TICKER_TO_PROJECT: + for project_name in TICKER_TO_PROJECT[ticker]: + canonical_name = CRYPTO_PROJECT_MAP.get(project_name.lower(), [project_name])[-1] + canonical_stable_id = self._generate_stable_id("project", canonical_name.lower()) + if canonical_stable_id not in entities: + entities[canonical_stable_id] = LinkedEntity( + stable_id=canonical_stable_id, + entity_type="project", + name=canonical_name, + confidence=0.85 + ) + + return list(entities.values()) + + def link_article( + self, + 
title: Optional[str], + summary: Optional[str], + content: Optional[str] + ) -> List[LinkedEntity]: + combined_text = "\n".join([ + title or "", + summary or "", + content or "" + ]) + return self.link_text(combined_text, title) + + +LABELED_TEST_SET = [ + { + "text": "Stellar Development Foundation (SDF) announces new Soroban upgrade. XLM price surges.", + "expected_entities": [ + {"stable_id": "project:stellar", "type": "project"}, + {"stable_id": "project:soroban", "type": "project"}, + {"stable_id": "asset:xlm", "type": "asset"} + ] + }, + { + "text": "Bitcoin (BTC) reaches new all-time high. Ethereum (ETH) follows closely.", + "expected_entities": [ + {"stable_id": "asset:btc", "type": "asset"}, + {"stable_id": "asset:eth", "type": "asset"} + ] + }, + { + "text": "DeFi protocol Uniswap launches new liquidity pool on Solana.", + "expected_entities": [ + {"stable_id": "project:uniswap", "type": "project"}, + {"stable_id": "asset:sol", "type": "asset"} + ] + }, + { + "text": "Cardano (ADA) releases new roadmap for governance.", + "expected_entities": [ + {"stable_id": "asset:ada", "type": "asset"} + ] + }, + { + "text": "Tech stocks rally on positive earnings. 
Apple and Microsoft lead gains.", + "expected_entities": [] # No crypto entities + } +] + + +def measure_precision(entity_linker: EntityLinker) -> Dict[str, float]: + true_positives = 0 + false_positives = 0 + total_expected = 0 + + for test_case in LABELED_TEST_SET: + text = test_case["text"] + expected = test_case["expected_entities"] + total_expected += len(expected) + + actual = entity_linker.link_text(text) + actual_stable_ids = {e.stable_id for e in actual} + expected_stable_ids = {e["stable_id"] for e in expected} + + for entity in actual: + if entity.stable_id in expected_stable_ids: + true_positives += 1 + else: + false_positives += 1 + + precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 1.0 + recall = true_positives / total_expected if total_expected > 0 else 1.0 + f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0 + + return { + "precision": precision, + "recall": recall, + "f1": f1, + "true_positives": true_positives, + "false_positives": false_positives, + "total_expected": total_expected, + "test_cases": len(LABELED_TEST_SET) + } + + +def test_entity_linker(): + logger.info("=" * 60) + logger.info("Testing Entity Linker") + logger.info("=" * 60) + + entity_linker = EntityLinker() + + test_text = "Stellar Development Foundation (SDF) announces new Soroban upgrade. XLM price surges." 
+ linked_entities = entity_linker.link_text(test_text) + + logger.info(f"\nTest text: {test_text}") + logger.info(f"Linked entities:") + for entity in linked_entities: + logger.info(f" - {entity.name} ({entity.entity_type}), stable ID: {entity.stable_id}") + + logger.info("\n" + "=" * 60) + logger.info("Measuring Entity Linker Precision") + logger.info("=" * 60) + + metrics = measure_precision(entity_linker) + logger.info(f"Precision: {metrics['precision']:.4f}") + logger.info(f"Recall: {metrics['recall']:.4f}") + logger.info(f"F1 Score: {metrics['f1']:.4f}") + logger.info(f"True Positives: {metrics['true_positives']}") + logger.info(f"False Positives: {metrics['false_positives']}") + logger.info(f"Total Expected: {metrics['total_expected']}") + + +if __name__ == "__main__": + test_entity_linker() diff --git a/apps/data-processing/test_entity_linker.py b/apps/data-processing/test_entity_linker.py new file mode 100644 index 00000000..69d4cfb5 --- /dev/null +++ b/apps/data-processing/test_entity_linker.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +""" +Test script for the Entity Linker functionality +""" + +import logging +import sys + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + +# Add src to path +sys.path.insert(0, 'src') + +from src.analytics.entity_linker import EntityLinker, measure_precision + + +def test_entity_linker(): + """Test the entity linker functionality""" + logger.info("=" * 60) + logger.info("Testing Entity Linker") + logger.info("=" * 60) + + # Test entity linking directly + entity_linker = EntityLinker() + + test_text = "Stellar Development Foundation (SDF) announces new Soroban upgrade. XLM price surges." 
+ linked_entities = entity_linker.link_text(test_text) + + logger.info(f"\nTest text: {test_text}") + logger.info(f"Linked entities:") + for entity in linked_entities: + logger.info(f" - {entity.name} ({entity.entity_type}), stable ID: {entity.stable_id}") + + # Test precision measurement + logger.info("\n" + "=" * 60) + logger.info("Measuring Entity Linker Precision") + logger.info("=" * 60) + + metrics = measure_precision(entity_linker) + logger.info(f"Precision: {metrics['precision']:.4f}") + logger.info(f"Recall: {metrics['recall']:.4f}") + logger.info(f"F1 Score: {metrics['f1']:.4f}") + logger.info(f"True Positives: {metrics['true_positives']}") + logger.info(f"False Positives: {metrics['false_positives']}") + logger.info(f"Total Expected: {metrics['total_expected']}") + + +if __name__ == "__main__": + test_entity_linker() diff --git a/apps/data-processing/trends.py b/apps/data-processing/trends.py new file mode 100644 index 0000000000000000000000000000000000000000..597a38e3fac81e44a843d9256716da143451d61c GIT binary patch literal 5268 ucmeIuF#!Mo0K%a4Pi+e?h(KY$fB^#r3>YwAz<>b*1`HT5V8DQZ-@pJAlmGz$ literal 0 HcmV?d00001 diff --git a/apps/data-processing/utils/http_client.py b/apps/data-processing/utils/http_client.py new file mode 100644 index 0000000000000000000000000000000000000000..50ffb9f3648ae8cbe234aba1690ceed063b47e32 GIT binary patch literal 5456 vcmeIufdBvi0K=g9Q(xf#g-~I@fB^#r3>YwAz<>b*1`HT5V8DO@1JA$!6;J>H literal 0 HcmV?d00001 diff --git a/apps/data-processing/utils/logger.py b/apps/data-processing/utils/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..40c4ab4d3a8829e3957bbe4e2c2480f046b474ce GIT binary patch literal 1460 XcmZQz7zLvtFd71*Aut*OLnQL { + await queryRunner.query(` + ALTER TABLE soroban_events + ADD COLUMN ledger_sequence INTEGER NOT NULL DEFAULT 0; + + CREATE INDEX idx_soroban_events_ledger_sequence ON soroban_events (ledger_sequence); + `); + } + + async down(queryRunner: QueryRunner): Promise { + await 
queryRunner.query(` + DROP INDEX IF EXISTS idx_soroban_events_ledger_sequence; + ALTER TABLE soroban_events DROP COLUMN IF EXISTS ledger_sequence; + `); + } +} diff --git a/temp_backup/backfill_contract_events.py b/temp_backup/backfill_contract_events.py new file mode 100644 index 00000000..9be016a0 --- /dev/null +++ b/temp_backup/backfill_contract_events.py @@ -0,0 +1,323 @@ +#!/usr/bin/env python3 +""" +Soroban Contract Event Backfill Script + +Fetches events for specific Soroban contract IDs within a given ledger range. +Saves results idempotently to allow safe re-runs. +Sends events to backend ingest endpoint for processing. + +Usage: + python scripts/backfill_contract_events.py --contract-ids CABC... --start-ledger 1000 --end-ledger 2000 +""" + +import os +import sys +import json +import time +import argparse +import logging +from datetime import datetime, timezone +from pathlib import Path +import requests + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + handlers=[logging.StreamHandler(sys.stdout)], +) +logger = logging.getLogger(__name__) + +class BackfillContractEvents: + def __init__( + self, + contract_ids, + start_ledger, + end_ledger, + output_dir, + rpc_url, + backend_url, + ingest_secret, + batch_size, + dry_run=False + ): + self.contract_ids = contract_ids + self.start_ledger = start_ledger + self.end_ledger = end_ledger + self.output_dir = Path(output_dir) + self.rpc_url = rpc_url + self.backend_url = backend_url + self.ingest_secret = ingest_secret + self.batch_size = batch_size + self.dry_run = dry_run + + if not self.dry_run: + self.output_dir.mkdir(parents=True, exist_ok=True) + + def _get_output_filepath(self, contract_id, batch_start, batch_end): + return self.output_dir / f"{contract_id}_{batch_start}_{batch_end}.json" + + def _is_already_processed(self, filepath): + if filepath.exists(): + try: + with open(filepath, 'r') as f: + data = json.load(f) + if data.get("status") == "completed": + 
return True + except json.JSONDecodeError: + pass + return False + + def fetch_events_batch(self, contract_id, batch_start, batch_end): + """Fetch a batch of events from Soroban RPC""" + all_events = [] + cursor = None + + while True: + payload = { + "jsonrpc": "2.0", + "id": 1, + "method": "getEvents", + "params": { + "startLedger": batch_start, + "filters": [ + { + "type": "contract", + "contractIds": [contract_id] + } + ], + "pagination": { + "limit": 100 + } + } + } + if cursor: + payload["params"]["pagination"]["cursor"] = cursor + + try: + response = requests.post(self.rpc_url, json=payload, timeout=30) + response.raise_for_status() + data = response.json() + except Exception as e: + logger.error(f"RPC Request failed: {e}") + raise + + if "error" in data: + logger.error(f"RPC Error: {data['error']}") + raise RuntimeError(f"RPC Error: {data['error']}") + + events = data.get("result", {}).get("events", []) + + # Filter events by ledger <= batch_end + valid_events = [] + for event in events: + ledger = int(event.get("ledger", 0)) + if ledger <= batch_end: + valid_events.append(event) + + all_events.extend(valid_events) + + # Check if we need to paginate + # We break if we received fewer events than the limit, or if the latest event exceeds batch_end + if len(events) < 100: + break + + last_ledger = int(events[-1].get("ledger", 0)) + if last_ledger > batch_end: + break + + cursor = data.get("result", {}).get("latestLedger") # fallback + # Usually getEvents cursor is based on the paging token of the last event + if events: + cursor = events[-1].get("pagingToken") + + if not cursor: + break + + time.sleep(0.5) # Rate limiting + + return all_events + + def send_event_to_backend(self, event, event_index): + """Send a single event to the backend ingest endpoint""" + tx_hash = event.get("transactionHash", "") + ledger_sequence = int(event.get("ledger", 0)) + contract_id = event.get("contractId") + event_type = event.get("type") + raw_payload = event + + ingest_payload 
= { + "txHash": tx_hash, + "eventIndex": event_index, + "ledgerSequence": ledger_sequence, + "contractId": contract_id, + "eventType": event_type, + "rawPayload": raw_payload + } + + headers = { + "Content-Type": "application/json", + "x-ingest-secret": self.ingest_secret + } + + try: + response = requests.post( + f"{self.backend_url}/soroban-events/ingest", + json=ingest_payload, + headers=headers, + timeout=30 + ) + response.raise_for_status() + logger.debug(f"Successfully sent event {tx_hash}:{event_index} to backend") + return True + except Exception as e: + logger.error(f"Failed to send event {tx_hash}:{event_index} to backend: {e}") + return False + + def run(self): + logger.info("=" * 60) + logger.info("SOROBAN CONTRACT EVENT BACKFILL") + logger.info("=" * 60) + logger.info(f"Target RPC: {self.rpc_url}") + logger.info(f"Backend URL: {self.backend_url}") + logger.info(f"Ledger Range: {self.start_ledger} to {self.end_ledger}") + logger.info(f"Contracts: {len(self.contract_ids)}") + logger.info(f"Batch Size: {self.batch_size}") + + stats = { + "total_events": 0, + "sent_to_backend": 0, + "failed_to_send": 0, + "contracts": {}, + "batches_processed": 0, + "batches_skipped": 0, + "batches_failed": 0 + } + + for contract_id in self.contract_ids: + stats["contracts"][contract_id] = {"events": 0, "failures": 0} + logger.info(f"\nProcessing contract: {contract_id}") + + current_start = self.start_ledger + while current_start <= self.end_ledger: + current_end = min(current_start + self.batch_size - 1, self.end_ledger) + + filepath = self._get_output_filepath(contract_id, current_start, current_end) + + if self._is_already_processed(filepath) and not self.dry_run: + logger.info(f" [SKIPPED] Ledgers {current_start}-{current_end} already processed") + stats["batches_skipped"] += 1 + + # Read count to update stats + try: + with open(filepath, 'r') as f: + data = json.load(f) + count = data.get("event_count", 0) + stats["contracts"][contract_id]["events"] += count + 
stats["total_events"] += count + except: + pass + else: + logger.info(f" [FETCHING] Ledgers {current_start}-{current_end}") + + if self.dry_run: + stats["batches_processed"] += 1 + else: + try: + events = self.fetch_events_batch(contract_id, current_start, current_end) + + # Send each event to backend + for idx, event in enumerate(events): + success = self.send_event_to_backend(event, idx) + if success: + stats["sent_to_backend"] += 1 + else: + stats["failed_to_send"] += 1 + + # Save results + output_data = { + "contract_id": contract_id, + "start_ledger": current_start, + "end_ledger": current_end, + "event_count": len(events), + "events": events, + "status": "completed", + "timestamp": datetime.now(timezone.utc).isoformat() + } + + with open(filepath, 'w') as f: + json.dump(output_data, f, indent=2) + + stats["contracts"][contract_id]["events"] += len(events) + stats["total_events"] += len(events) + stats["batches_processed"] += 1 + + logger.info(f" Found {len(events)} events, sent {stats['sent_to_backend']} to backend") + except Exception as e: + logger.error(f" Failed to process batch: {e}") + stats["batches_failed"] += 1 + stats["contracts"][contract_id]["failures"] += 1 + + current_start = current_end + 1 + + logger.info("\n" + "=" * 60) + logger.info("BACKFILL SUMMARY") + logger.info("=" * 60) + logger.info(f"Total Events Found: {stats['total_events']}") + logger.info(f"Events Sent to Backend: {stats['sent_to_backend']}") + logger.info(f"Events Failed to Send: {stats['failed_to_send']}") + logger.info(f"Batches Processed: {stats['batches_processed']}") + logger.info(f"Batches Skipped: {stats['batches_skipped']} (Idempotent)") + logger.info(f"Batches Failed: {stats['batches_failed']}") + + for cid, c_stats in stats["contracts"].items(): + logger.info(f"Contract {cid[:8]}...: {c_stats['events']} events, {c_stats['failures']} failures") + + return stats + +def parse_args(): + parser = argparse.ArgumentParser(description="Backfill Soroban contract events") + 
parser.add_argument("--contract-ids", nargs="+", required=True, help="List of contract IDs to backfill") + parser.add_argument("--start-ledger", type=int, required=True, help="Starting ledger sequence") + parser.add_argument("--end-ledger", type=int, required=True, help="Ending ledger sequence") + parser.add_argument("--output-dir", type=str, default="./data/contract_events", help="Directory to save output files") + parser.add_argument("--rpc-url", type=str, default=os.getenv("SOROBAN_RPC_URL", "https://soroban-testnet.stellar.org"), help="Soroban RPC URL") + parser.add_argument("--backend-url", type=str, default=os.getenv("BACKEND_URL", "http://localhost:3000"), help="Backend API URL") + parser.add_argument("--ingest-secret", type=str, default=os.getenv("SOROBAN_INGEST_SECRET", ""), help="Secret for backend ingest endpoint") + parser.add_argument("--batch-size", type=int, default=1000, help="Number of ledgers per batch") + parser.add_argument("--dry-run", action="store_true", help="Print operations without executing") + + return parser.parse_args() + +def main(): + args = parse_args() + + if args.start_ledger > args.end_ledger: + logger.error("start-ledger must be <= end-ledger") + sys.exit(1) + + backfill = BackfillContractEvents( + contract_ids=args.contract_ids, + start_ledger=args.start_ledger, + end_ledger=args.end_ledger, + output_dir=args.output_dir, + rpc_url=args.rpc_url, + backend_url=args.backend_url, + ingest_secret=args.ingest_secret, + batch_size=args.batch_size, + dry_run=args.dry_run + ) + + try: + stats = backfill.run() + if stats["batches_failed"] > 0 or stats["failed_to_send"] > 0: + sys.exit(1) + sys.exit(0) + except KeyboardInterrupt: + logger.info("Backfill interrupted by user") + sys.exit(130) + except Exception as e: + logger.error(f"Unexpected error: {e}") + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/temp_backup/soroban-events/dto/ingest-soroban-event.dto.ts b/temp_backup/soroban-events/dto/ingest-soroban-event.dto.ts 
new file mode 100644 index 00000000..84885811 --- /dev/null +++ b/temp_backup/soroban-events/dto/ingest-soroban-event.dto.ts @@ -0,0 +1,29 @@ +import { + IsInt, + IsNotEmpty, + IsObject, + IsOptional, + IsString, + Min, +} from 'class-validator'; + +export class IngestSorobanEventDto { + @IsString() + @IsNotEmpty() + txHash: string; + + @IsInt() + @Min(0) + eventIndex: number; + + @IsString() + @IsOptional() + contractId?: string; + + @IsString() + @IsOptional() + eventType?: string; + + @IsObject() + rawPayload: Record; +} diff --git a/temp_backup/soroban-events/entities/soroban-event.entity.ts b/temp_backup/soroban-events/entities/soroban-event.entity.ts new file mode 100644 index 00000000..2f26b297 --- /dev/null +++ b/temp_backup/soroban-events/entities/soroban-event.entity.ts @@ -0,0 +1,57 @@ +import { + Column, + CreateDateColumn, + Entity, + Index, + PrimaryGeneratedColumn, +} from 'typeorm'; + +export enum SorobanEventStatus { + PENDING = 'pending', + PROCESSED = 'processed', + FAILED = 'failed', +} + +@Entity('soroban_events') +@Index(['txHash', 'eventIndex'], { unique: true }) +@Index(['status']) +export class SorobanEvent { + @PrimaryGeneratedColumn('uuid') + id: string; + + /** Idempotency key: transaction hash */ + @Column({ type: 'varchar', length: 128 }) + txHash: string; + + /** Idempotency key: position of the event within the transaction */ + @Column({ type: 'integer' }) + eventIndex: number; + + /** Soroban contract address that emitted the event */ + @Column({ type: 'varchar', length: 128, nullable: true }) + contractId: string | null; + + /** Event type / topic, e.g. 
"transfer", "mint" */ + @Column({ type: 'varchar', length: 128, nullable: true }) + eventType: string | null; + + /** Full raw payload stored for audit/debug */ + @Column({ type: 'jsonb' }) + rawPayload: Record; + + @Column({ + type: 'enum', + enum: SorobanEventStatus, + default: SorobanEventStatus.PENDING, + }) + status: SorobanEventStatus; + + @Column({ type: 'text', nullable: true }) + errorMessage: string | null; + + @CreateDateColumn({ type: 'timestamptz' }) + createdAt: Date; + + @Column({ type: 'timestamptz', nullable: true }) + processedAt: Date | null; +} diff --git a/temp_backup/soroban-events/soroban-events.controller.ts b/temp_backup/soroban-events/soroban-events.controller.ts new file mode 100644 index 00000000..60a2f1ce --- /dev/null +++ b/temp_backup/soroban-events/soroban-events.controller.ts @@ -0,0 +1,42 @@ +import { + Body, + Controller, + HttpCode, + HttpStatus, + Post, + UnauthorizedException, + Headers, +} from '@nestjs/common'; +import { ConfigService } from '@nestjs/config'; +import { ApiTags, ApiOperation, ApiResponse } from '@nestjs/swagger'; +import { IngestSorobanEventDto } from './dto/ingest-soroban-event.dto'; +import { SorobanEventsService } from './soroban-events.service'; + +@ApiTags('soroban-events') +@Controller('soroban-events') +export class SorobanEventsController { + private readonly ingestSecret: string; + + constructor( + private readonly service: SorobanEventsService, + private readonly config: ConfigService, + ) { + this.ingestSecret = this.config.get('SOROBAN_INGEST_SECRET', ''); + } + + @Post('ingest') + @HttpCode(HttpStatus.ACCEPTED) + @ApiOperation({ summary: 'Ingest a Soroban event from the indexer/cron' }) + @ApiResponse({ status: 202, description: 'Event accepted for processing' }) + @ApiResponse({ status: 401, description: 'Missing or invalid ingest secret' }) + async ingest( + @Headers('x-ingest-secret') secret: string, + @Body() dto: IngestSorobanEventDto, + ) { + if (!this.ingestSecret || secret !== 
this.ingestSecret) { + throw new UnauthorizedException('Invalid ingest secret'); + } + + return this.service.ingest(dto); + } +} diff --git a/temp_backup/soroban-events/soroban-events.module.ts b/temp_backup/soroban-events/soroban-events.module.ts new file mode 100644 index 00000000..e5c85734 --- /dev/null +++ b/temp_backup/soroban-events/soroban-events.module.ts @@ -0,0 +1,20 @@ +import { Module } from '@nestjs/common'; +import { TypeOrmModule } from '@nestjs/typeorm'; +import { BullModule } from '@nestjs/bullmq'; +import { SorobanEvent } from './entities/soroban-event.entity'; +import { + SorobanEventsService, + SOROBAN_EVENTS_QUEUE, +} from './soroban-events.service'; +import { SorobanEventsProcessor } from './soroban-events.processor'; +import { SorobanEventsController } from './soroban-events.controller'; + +@Module({ + imports: [ + TypeOrmModule.forFeature([SorobanEvent]), + BullModule.registerQueue({ name: SOROBAN_EVENTS_QUEUE }), + ], + controllers: [SorobanEventsController], + providers: [SorobanEventsService, SorobanEventsProcessor], +}) +export class SorobanEventsModule {} diff --git a/temp_backup/soroban-events/soroban-events.processor.ts b/temp_backup/soroban-events/soroban-events.processor.ts new file mode 100644 index 00000000..f41c87f4 --- /dev/null +++ b/temp_backup/soroban-events/soroban-events.processor.ts @@ -0,0 +1,78 @@ +import { Processor, WorkerHost } from '@nestjs/bullmq'; +import { Injectable, Logger } from '@nestjs/common'; +import { InjectRepository } from '@nestjs/typeorm'; +import { Repository } from 'typeorm'; +import { Job } from 'bullmq'; +import { + SorobanEvent, + SorobanEventStatus, +} from './entities/soroban-event.entity'; +import { IngestSorobanEventDto } from './dto/ingest-soroban-event.dto'; +import { + SOROBAN_EVENTS_QUEUE, + PROCESS_EVENT_JOB, +} from './soroban-events.service'; + +@Processor(SOROBAN_EVENTS_QUEUE) +@Injectable() +export class SorobanEventsProcessor extends WorkerHost { + private readonly logger = new 
Logger(SorobanEventsProcessor.name); + + constructor( + @InjectRepository(SorobanEvent) + private readonly eventRepo: Repository, + ) { + super(); + } + + async process(job: Job): Promise { + if (job.name !== PROCESS_EVENT_JOB) { + this.logger.warn(`Unknown job name: ${job.name}`); + return; + } + + const { txHash, eventIndex, contractId, eventType, rawPayload } = job.data; + + // Idempotency: skip if already stored (unique index on txHash + eventIndex) + const existing = await this.eventRepo.findOne({ + where: { txHash, eventIndex }, + select: ['id', 'status'], + }); + + if (existing) { + this.logger.debug( + `Soroban event ${txHash}:${eventIndex} already processed (${existing.status}), skipping`, + ); + return; + } + + const event = this.eventRepo.create({ + txHash, + eventIndex, + contractId: contractId ?? null, + eventType: eventType ?? null, + rawPayload, + status: SorobanEventStatus.PENDING, + processedAt: null, + errorMessage: null, + }); + + await this.eventRepo.save(event); + + try { + // placeholder for downstream processing (e.g. trigger notifications, update state) + event.status = SorobanEventStatus.PROCESSED; + event.processedAt = new Date(); + } catch (err) { + event.status = SorobanEventStatus.FAILED; + event.errorMessage = err instanceof Error ? 
err.message : String(err); + await this.eventRepo.save(event); + throw err; // let BullMQ retry + } + + await this.eventRepo.save(event); + this.logger.log( + `Processed soroban event ${txHash}:${eventIndex} (${eventType})`, + ); + } +} diff --git a/temp_backup/soroban-events/soroban-events.service.ts b/temp_backup/soroban-events/soroban-events.service.ts new file mode 100644 index 00000000..b90fb323 --- /dev/null +++ b/temp_backup/soroban-events/soroban-events.service.ts @@ -0,0 +1,32 @@ +import { Injectable, Logger } from '@nestjs/common'; +import { InjectQueue } from '@nestjs/bullmq'; +import { Queue } from 'bullmq'; +import { IngestSorobanEventDto } from './dto/ingest-soroban-event.dto'; + +export const SOROBAN_EVENTS_QUEUE = 'soroban-events'; +export const PROCESS_EVENT_JOB = 'process-event'; + +@Injectable() +export class SorobanEventsService { + private readonly logger = new Logger(SorobanEventsService.name); + + constructor( + @InjectQueue(SOROBAN_EVENTS_QUEUE) private readonly queue: Queue, + ) {} + + async ingest(dto: IngestSorobanEventDto): Promise<{ queued: boolean }> { + const jobId = `${dto.txHash}:${dto.eventIndex}`; + + // BullMQ deduplicates by jobId — duplicate submissions are silently dropped + await this.queue.add(PROCESS_EVENT_JOB, dto, { + jobId, + attempts: 3, + backoff: { type: 'exponential', delay: 1000 }, + removeOnComplete: { count: 500 }, + removeOnFail: { count: 200 }, + }); + + this.logger.debug(`Queued soroban event ${jobId}`); + return { queued: true }; + } +} diff --git a/temp_backup/src/alert_notifier.py b/temp_backup/src/alert_notifier.py new file mode 100644 index 00000000..ac1d8ecc --- /dev/null +++ b/temp_backup/src/alert_notifier.py @@ -0,0 +1,85 @@ +import os +import time +import requests +from src.utils.http_client import RobustHTTPClient + + +class AlertNotifier: + def __init__(self): + self.telegram_bot_token = os.getenv("TELEGRAM_BOT_TOKEN") + self.telegram_channel_id = os.getenv("TELEGRAM_CHANNEL_ID") + self.webhook_urls 
= self._load_webhook_urls() + self.max_retries = int(os.getenv("WEBHOOK_MAX_RETRIES", "3")) + self.base_backoff_seconds = float(os.getenv("WEBHOOK_BACKOFF_SECONDS", "1")) + self.session = RobustHTTPClient() + + def _load_webhook_urls(self): + urls = [] + + single_url = os.getenv("ALERT_WEBHOOK_URL") + if single_url: + urls.append(single_url) + + registry = os.getenv("ALERT_WEBHOOK_URLS", "") + if registry: + urls.extend([url.strip() for url in registry.split(",") if url.strip()]) + + return list(dict.fromkeys(urls)) + + def notify_anomaly(self, result): + if not getattr(result, "is_anomaly", False): + return + + payload = { + "event": "high_priority_insight", + "type": "anomaly", + "metric_name": result.metric_name, + "severity_score": result.severity_score, + "current_value": result.current_value, + "baseline_mean": result.baseline_mean, + "baseline_std": result.baseline_std, + "z_score": result.z_score, + "timestamp": result.timestamp.isoformat() if result.timestamp else None, + } + + self._send_telegram(payload) + self._send_webhooks(payload) + + def _send_telegram(self, payload): + if not self.telegram_bot_token or not self.telegram_channel_id: + return + + text = ( + "🚨 High-Priority Insight\n" + f"Metric: {payload['metric_name']}\n" + f"Severity: {payload['severity_score']}\n" + f"Current: {payload['current_value']}\n" + f"Z-Score: {payload['z_score']}" + ) + + self.session.post( + f"https://api.telegram.org/bot{self.telegram_bot_token}/sendMessage", + json={ + "chat_id": self.telegram_channel_id, + "text": text, + }, + timeout=10, + ) + + def _send_webhooks(self, payload): + for url in self.webhook_urls: + self._post_with_retry(url, payload) + + def _post_with_retry(self, url, payload): + for attempt in range(self.max_retries): + try: + response = self.session.post(url, json=payload, timeout=10) + if response.status_code < 400: + return True + except requests.RequestException: + pass + + if attempt < self.max_retries - 1: + 
time.sleep(self.base_backoff_seconds * (2**attempt)) + + return False diff --git a/temp_backup/src/alertbot.py b/temp_backup/src/alertbot.py new file mode 100644 index 00000000..1d51233c --- /dev/null +++ b/temp_backup/src/alertbot.py @@ -0,0 +1,353 @@ +""" +Telegram Alert Bot module - Sends notifications when market sentiment exceeds threshold. + +This module provides the AlertBot class that integrates with Telegram's Bot API +to send alerts when the MarketAnalyzer detects high sentiment scores (>0.8). +""" + +import os +import time +import logging +import threading +from datetime import datetime, timezone +from typing import Optional, Dict, Any + +import requests + +# Load environment variables from .env file if present +try: + from dotenv import load_dotenv + + # Try loading from multiple possible locations + load_dotenv() # Current directory + load_dotenv( + dotenv_path=os.path.join(os.path.dirname(__file__), "..", ".env") + ) # data-processing root + load_dotenv( + dotenv_path=os.path.join(os.path.dirname(__file__), "..", "..", "..", ".env") + ) # project root +except ImportError: + pass # python-dotenv not installed, rely on system env vars + +logger = logging.getLogger(__name__) + + +class AlertBot: + """ + Telegram bot for sending market sentiment alerts. + + Sends notifications via Telegram Bot API when sentiment score exceeds + the configured threshold (default: 0.8). 
+ + Features: + - Thread-safe send operations + - Exponential backoff for rate limiting (429 responses) + - Graceful error handling for auth and network failures + - Configurable dry-run mode for testing + - Message truncation for Telegram's 4096 char limit + + Environment Variables: + TELEGRAM_BOT_TOKEN: Bot token from @BotFather + TELEGRAM_CHANNEL_ID: Target channel/chat ID (numeric or @channel_name) + """ + + # Telegram API configuration + API_BASE_URL = "https://api.telegram.org/bot{token}/sendMessage" + MAX_MESSAGE_LENGTH = 4096 + + # Retry configuration + MAX_RETRIES = 3 + INITIAL_RETRY_DELAY = 1.0 # seconds + MAX_RETRY_DELAY = 10.0 # seconds + REQUEST_TIMEOUT = 10 # seconds + + # Alert threshold + ALERT_THRESHOLD = 0.8 + + def __init__( + self, + telegram_bot_token: Optional[str] = None, + telegram_channel_id: Optional[str] = None, + dry_run: bool = False, + ): + """ + Initialize the AlertBot. + + Args: + telegram_bot_token: Telegram bot token (falls back to TELEGRAM_BOT_TOKEN env var) + telegram_channel_id: Target channel/chat ID (falls back to TELEGRAM_CHANNEL_ID env var) + dry_run: If True, log messages instead of sending them (useful for testing) + """ + self.bot_token = telegram_bot_token or os.getenv("TELEGRAM_BOT_TOKEN") + self.channel_id = telegram_channel_id or os.getenv("TELEGRAM_CHANNEL_ID") + self.dry_run = dry_run + self._lock = threading.Lock() + + # Validate configuration + self._configured = bool(self.bot_token and self.channel_id) + + if not self._configured: + logger.warning( + "AlertBot not configured: missing TELEGRAM_BOT_TOKEN or TELEGRAM_CHANNEL_ID. " + "Alerts will be logged but not sent." 
+ ) + elif dry_run: + logger.info( + "AlertBot initialized in dry-run mode (messages will be logged, not sent)" + ) + else: + logger.info( + f"AlertBot initialized for channel: {self._mask_channel_id(self.channel_id)}" + ) + + @staticmethod + def _mask_channel_id(channel_id: str) -> str: + """Mask channel ID for logging (show first 4 chars only).""" + if not channel_id: + return "" + if len(channel_id) <= 4: + return channel_id + return f"{channel_id[:4]}..." + + def _truncate_message(self, message: str) -> str: + """ + Truncate message to fit Telegram's character limit. + + Args: + message: Original message text + + Returns: + Truncated message with ellipsis if needed + """ + if len(message) <= self.MAX_MESSAGE_LENGTH: + return message + + # Leave room for ellipsis indicator + truncation_marker = "\n\n... (message truncated)" + max_content_length = self.MAX_MESSAGE_LENGTH - len(truncation_marker) + + logger.warning( + f"Message truncated from {len(message)} to {self.MAX_MESSAGE_LENGTH} characters" + ) + return message[:max_content_length] + truncation_marker + + def _send_request(self, message: str) -> bool: + """ + Send message to Telegram with retry logic. 
+ + Args: + message: Message text to send + + Returns: + True if message was sent successfully, False otherwise + """ + url = self.API_BASE_URL.format(token=self.bot_token) + payload = {"chat_id": self.channel_id, "text": message, "parse_mode": "HTML"} + + retry_delay = self.INITIAL_RETRY_DELAY + + for attempt in range(self.MAX_RETRIES + 1): + try: + response = requests.post( + url, json=payload, timeout=self.REQUEST_TIMEOUT + ) + + if response.status_code == 200: + logger.info("Alert sent successfully to Telegram") + return True + + elif response.status_code == 429: + # Rate limited - extract retry_after if provided + retry_after = ( + response.json() + .get("parameters", {}) + .get("retry_after", retry_delay) + ) + retry_delay = min(float(retry_after), self.MAX_RETRY_DELAY) + + if attempt < self.MAX_RETRIES: + logger.warning( + f"Rate limited by Telegram (429). Retrying in {retry_delay:.1f}s " + f"(attempt {attempt + 1}/{self.MAX_RETRIES})" + ) + time.sleep(retry_delay) + retry_delay = min(retry_delay * 2, self.MAX_RETRY_DELAY) + continue + else: + logger.error("Rate limit exceeded, max retries reached") + return False + + elif response.status_code in (401, 403): + logger.error( + f"Telegram authentication failed ({response.status_code}). " + "Check TELEGRAM_BOT_TOKEN and ensure bot has channel permissions." + ) + return False + + else: + error_desc = response.json().get("description", "Unknown error") + logger.error( + f"Telegram API error ({response.status_code}): {error_desc}" + ) + return False + + except requests.exceptions.Timeout: + if attempt < self.MAX_RETRIES: + logger.warning( + f"Request timeout. 
Retrying in {retry_delay:.1f}s " + f"(attempt {attempt + 1}/{self.MAX_RETRIES})" + ) + time.sleep(retry_delay) + retry_delay = min(retry_delay * 2, self.MAX_RETRY_DELAY) + continue + else: + logger.error("Request timeout, max retries reached") + return False + + except requests.exceptions.ConnectionError as e: + logger.error(f"Connection error sending Telegram alert: {e}") + return False + + except requests.exceptions.RequestException as e: + logger.error(f"Request error sending Telegram alert: {e}") + return False + + except Exception as e: + logger.error( + f"Unexpected error sending Telegram alert: {e}", exc_info=True + ) + return False + + return False + + def send_alert(self, message: str) -> bool: + """ + Send an alert message to Telegram. + + Thread-safe method that sends a message to the configured Telegram channel. + Handles rate limiting with exponential backoff and logs all operations. + + Args: + message: The alert message to send (supports HTML formatting) + + Returns: + True if message was sent successfully, False otherwise + """ + with self._lock: + # Truncate if necessary + message = self._truncate_message(message) + + # Handle unconfigured state + if not self._configured: + logger.info(f"[DRY-RUN/UNCONFIGURED] Alert message:\n{message}") + return False + + # Handle dry-run mode + if self.dry_run: + logger.info(f"[DRY-RUN] Would send alert:\n{message}") + return True + + return self._send_request(message) + + def _format_alert_message( + self, + score: float, + sentiment_data: Dict[str, Any], + timestamp: Optional[datetime] = None, + ) -> str: + """ + Format a sentiment alert message. 
+ + Args: + score: The sentiment score that triggered the alert + sentiment_data: Dictionary containing sentiment analysis details + timestamp: Alert timestamp (defaults to current UTC time) + + Returns: + Formatted alert message with HTML markup + """ + if timestamp is None: + timestamp = datetime.now(timezone.utc) + + # Determine trend direction + trend_direction = sentiment_data.get("trend_direction", "Unknown") + if isinstance(trend_direction, str): + trend_display = trend_direction.capitalize() + else: + trend_display = str(trend_direction) + + # Add trend emoji + trend_emoji = ( + "📈" + if "bull" in trend_display.lower() + else ("📉" if "bear" in trend_display.lower() else "➡️") + ) + + # Extract metrics + avg_sentiment = sentiment_data.get("average_compound_score", 0) + sentiment_dist = sentiment_data.get("sentiment_distribution", {}) + positive_ratio = sentiment_dist.get("positive", 0) + negative_ratio = sentiment_dist.get("negative", 0) + news_count = sentiment_data.get("total_analyzed", 0) + + # Calculate confidence (based on sample size and score strength) + confidence = min(100, max(0, int(abs(score) * 100 * min(news_count / 20, 1)))) + + # Format timestamp + time_str = timestamp.strftime("%Y-%m-%d %H:%M:%S UTC") + + # Build message + message = f"""🚨 High Sentiment Alert + +Score: {score:.2f} +Trend: {trend_display} {trend_emoji} +Confidence: {confidence}% +Timestamp: {time_str} + +Details: +• Average sentiment: {avg_sentiment:.2f} +• Positive ratio: {positive_ratio:.1%} +• Negative ratio: {negative_ratio:.1%} +• News analyzed: {news_count}""" + + # Add anomaly info if present + anomalies_count = sentiment_data.get("anomalies_detected", 0) + if anomalies_count > 0: + message += f"\n• ⚠️ Anomalies detected: {anomalies_count}" + + return message + + def check_and_alert( + self, + analyzer_score: float, + sentiment_data: Dict[str, Any], + timestamp: Optional[datetime] = None, + ) -> bool: + """ + Check if sentiment score exceeds threshold and send alert if 
so. + + Args: + analyzer_score: The sentiment/health score from MarketAnalyzer + sentiment_data: Dictionary containing sentiment analysis details + timestamp: Optional timestamp for the alert + + Returns: + True if alert was triggered and sent successfully, False otherwise + """ + if analyzer_score <= self.ALERT_THRESHOLD: + logger.debug( + f"Score {analyzer_score:.2f} below threshold {self.ALERT_THRESHOLD}, no alert" + ) + return False + + logger.info( + f"Score {analyzer_score:.2f} exceeds threshold {self.ALERT_THRESHOLD}, triggering alert" + ) + + message = self._format_alert_message(analyzer_score, sentiment_data, timestamp) + return self.send_alert(message) + + @property + def is_configured(self) -> bool: + """Check if the bot is properly configured.""" + return self._configured diff --git a/temp_backup/src/analytics/__init__.py b/temp_backup/src/analytics/__init__.py new file mode 100644 index 00000000..1a4f5060 --- /dev/null +++ b/temp_backup/src/analytics/__init__.py @@ -0,0 +1,21 @@ +""" +Analytics module for market analysis and trend detection. 
+""" + +from .market_analyzer import MarketAnalyzer, Trend, MarketData, get_explanation +from .forecaster import SentimentForecaster, ForecastResult +from .correlation_engine import CorrelationEngine, CorrelationResult, DataPoint +from .ner_service import NERService + +__all__ = [ + "MarketAnalyzer", + "Trend", + "MarketData", + "get_explanation", + "SentimentForecaster", + "ForecastResult", + "CorrelationEngine", + "CorrelationResult", + "DataPoint", + "NERService", +] diff --git a/temp_backup/src/analytics/__pycache__/__init__.cpython-314.pyc b/temp_backup/src/analytics/__pycache__/__init__.cpython-314.pyc new file mode 100644 index 0000000000000000000000000000000000000000..200545436019a2096abdf32eeb53ce4262f662e9 GIT binary patch literal 698 zcmZuu&ui2`6rRbCO*Uz>YVqKyfr1dkr3XQ@B1m^-7S?V_>qUY?NXBJsHVHFt#r7oP zKjYcIM7#}15B>vc4_=)~EJg7_zRY{`y&v-7T*p>M{yH0^hG@k0OJ zp%ejniBmko!~%=hU=s%%;)1(v;?`_HJn)DQe&^e>AqgNLA%tWDHoBNI+awW0+h~l^ z!2-i5b$3wWo$Uor>byDyUdVXWl+kj)hH83#X8So7Z z4VX=cP(yM}d6bq-L8%(`BG66*{kZ>CQ-N0N&kL?9s~g4DXVZ+ADFFnIz^o6Xl+y>?wmmgQ|Lvr=ntw-rsn zq~!H7lcLDFxy@oTN_owJ(s6sL{73zoAB&s13D%#yt>L?kG5%p$*!krne2BlG#W~tL VM~lyB@fGd-bi&Yj?};r<{SD7p#*_d6 literal 0 HcmV?d00001 diff --git a/temp_backup/src/analytics/__pycache__/forecaster.cpython-314.pyc b/temp_backup/src/analytics/__pycache__/forecaster.cpython-314.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bbf0797c4ce572940189707e18504c9cc90979a9 GIT binary patch literal 26172 zcmc(I3sjufedjkX=D`dz!0-|XJ^_*#1Oq~{g+T9@K@WsS2-yzSc*G1a@n~l9&0rbq z^yoBcieh(-V&@pePK~zCTH50_xJ^&+K2DUzZJl;c&yZmX{z{JJRBf8G+b#;qD%kS-0z@G_^*h&z7w66(`r9H6*K_HkEKFmtV&D>}pB5 zkz|d}f%JkhF5*h^J}1I17T&<>6e3=v)JfL*bakA^J^BwAn5|a`1;c~M@N*$P5Di4e zlHtKb2R{@OLW6-sl0P31o((1Wq!5Y*6MVyg(MTkm7-=~W3JCc1P(eO965^ww=aYQf z_U7&F{77t6NZ3Xb;pi|w5>6yzLU=F`;S-@~GJGB$`U&#S#Ui8ULwqoh4Ddrj?0mL! 
z@8=V-XvAx?ol-0Gha$1TaB_^_%?p8Kh>s0vS{ob*M2ACsJS4E5`G(-BmSB>~oR7tV z9yD|yF!(Hnk2;YMzBCr)8v>)rSaUFxL_8Ga14&*OWuxPmDujlGP$EIYpk@P+3xTl&A9yYhjs(s{LKs^_2nB*;Xc1+YTAB+SN z2_#w&3DLrc7V^hq(U3l!chwovaa>RywU`={Ty zLhgrq2eqd{J`UW<0i*_SsGsJNIvi~M*~X-yl#AhbCyo8wHl0t8&2I4-*p@f36eFc1 z^L(bHxt~i~e0kXNX0hOiys>?u#AqZrzV(D=e`k07nqAG0usutA+dD|7Sw7L65n%)V zRvv}7OLvj$=g#Wezszw#q*if{>qc}w-6`GYIQ-GCew)5&9g51X1?TX({%7==VxEA> z6QUl2V4=-zm2nmUSLx+za=XY#x zZ5=oiN<5p4#Rrb#j78(4kwj=95RWGYXzH7>LkDRCMu!Iy!r*|q5(DUl9r}>qjgN_T zzdsO-#*)|`e!p0`dg8p=%$*oWq6*$GFLHM|ovG?Jc~{(52Cr_L8u?~5i>@G_W0OY# zax`)Hv#gXOaFH8OCKZ#Y06G|;sg&RsN5!@0={TY#{6tcm#!I}oX$pO zXU#7;OD2Oay_m983N+N*%lI|qaXuCdMf?P6aO_1F?J2(mW&ERwP*AiFhoT{Z z(?NeAiTzwkV>b;&V%Wk403(lHC_|D!%PwzXwweN+451QUBjx-yykGt{_nX{p3%73l zoU!V*mDA~N+vG?-MeJP3x;bOTZ3jgP<#4f_28k8ctSHD_mNG>UYw1jiW%Ej)-BOA6t0qsdSL z$tbBF_`y&l;`d+INz+DWV4pN=g$N4dv00;IfmyrAePZmqyAH>2na<+f8hpiEp+{Pj&hGx_pOEb)7tZNT6;+vz#uPo(n`qLnwE6zi(AJ6p@ld zbI;+^eZKw2u{@oqtA(jfY0rg+#v~{s97KVAROk@Ai`-4Kecs8>oB1WD`;{+V`r?AK zcFtLwveYi#FJmd+^~&x`yBG4S=klvl#_H^%Dj=U035&|De99e&qLQN>D0Co<51~Ex zpzYR;{6sZ8+HQ7TkM1_}-`=>)e&0PT#j~SBQRZk{V06ZnzeXT8qKfPKb=hTa$B2S} z-PnjGsR<6Ginc^kzLJg{eUn;>N%qO8~)_IR&Se!|H)Ur#z$fov=tkEqt*Ke==D=cAr>8ujPd7WX5*a1 zY*5{;-d3sZNO*W;ZRMR*dw3gj>AivzA2d9~+&}FdmS*gx68ML`9;0XooJ$D1kt3Rf z*eH;h;e0p>=tW0Dv?5bNg$zk;;w6%!)SvxUKXEQn8R01EupM>=fs5Ro0$a-n{Xa(>>WY_2j&#BeT9^Hkhe8kSRHEV}IItBxN~rE8q26F|xN$@0@Sh zm#N!#<4~ryD^uB(F6{nrYdZf#%6LNBnrL5jba2S6JcFaOibDQemO}RFlP2GWELH3? 
z5uF^-fjX8_^BDDPmQ&47TYR;ww1uTxrF4aBOjHS3R&N`M_&QdEV zHKJ2nEo6D?SY9QoQS@{%ORHjO>nTkw?Uv`Fn&s6{9%iV-$0tj{Ka}|lpu_6LVwEXX z2u@Hsn=$t`C7#H#rp@O9pobI|S_{_`V3jpY1s_0W5ND=uEY5i4(L`W4)X`xB%g3u+ z2^iene2)Gbh%@B*;1IGE8V)2p=&hh%;=jk_BIiROE(Q~;%i(N>qYc3!sTHuM;EPt5 zkk36UV>>DE0kEq)_{3%+a0F$TYR#NO^n$_0C`5Ob^30C4CpS4lYm=KA&oWh7cFG4T zoDhP?D3K&KCQ0ze!OAka1U#@n5K+tL@hh zeQVD$1!wzMigaXEiU}16&*9IqGL8TU7~N+4k_!Th17j#S5jhGQbi$OyE&W_nZOsRw zB}(lv$6^XVAbi6`RlMTUBc_r)K4K~iJ)(6)ktQ7RnAxm|1^z>akL*8vyia1&-w<$9E%$^O&I^$Ql3TJ$`sOM(DbYjaQ@YH z5;5z!C^fdfWRlmY?kwy$bz_P8cu=$+0#|fI0H^T4eJ1o`V9((%Q3sE0rER97+Z?>B z<=5-KRlQ8X_uIa=>-}9U_1^77+!emjzVd(YwV!f30Xp}pn+Ww~7uWaed(|zYq;i}B zBbALhjK#r5Rku^GHCt8*Vf*RrAZ^+V$|kPi2563$>&-zIZ&3TGm`0>jRV}_6tXIA?lSeC!pc=c1Wx~LCx`%7!}S}~-LU0T?C9cbo0?U6$WDG($$s^G z^eTgqhR1xLy7PKHnq)cXbNfnm$gnPM)0{_q&06Ox9U%UQxqh4SB{-pFbWYohN(yGh zR>$$&052Ej25Qw%_MFr!Y1!XxhCD8~k>)XO)r=(2f1v z>5P*m33qci8Vo%Tx=aPpAiNHeIwG1N=|9g?DTrgv$D6XWoJ8d}LCOgu!ZAlJZkkosY~vJ=975&)CdBy>IwWrsRG zu^>w{k+e-L)x;r`X96P#2}+{(PLemA(kmEjiJa{WQ_8 z{VdU<#6-LH14WAx6Z4}X^;1Q&9Fdz|-aB4*>e+BSPG5FD5CPeDo^&pXvWX`uvV-p# zU%P6=-X$4Dfw>~KG92Wg2ob`eL>~!*gV3~$n~#IcMRVgtlFCC8L`o_pPZ^_WBpN0! z^gO02A?C^I3G)T0WH2OwR)^6THm+**ZmW=o6d>_ zCqL)pA^Z7}vv$c|^h(dAp2eb;8QZlNQ=R=O*O@y8y`$hx9_KE7eb?7^y}Bo5-*B^` zal*XhD4c9uaIBwqte-AgDyx_@ES6Th9=;rY_1Q_o$7Sn2a#t89iNN%Xw?$c+2D?*>gEUTtW5SR?bx>&r+)Lv3YyfQgMkoFPJ5V^OfUYJw8>I zwr}{@QGV%jQR0k-d$(? 
zVU-yvKdduRN`ryITk^X*t*iJrq7MnzuIz*NAnH38E$Y(~1@1GzH?$D%E%9{1SJ0N2 zcL66Rc0m$08?z@^po1vM0UxMiAgcQh@RuN@%Bb!d)0W#DypQY!uN=R0eAz(Z+Kue@ z6Klb36XN&M-k`W8+IyCrlzuJ*N@EoZr)sjG79^Q6qdH0b4hmeRL(qomvMgU*uVDmL zOnFZ$dI%3SbYd=Pw=YjyqW@FWILm2kG^tcOC~O1j8??Efs>g@cxPOnmW_5DZfI`pv zj9M`&aRNTm>RBF8sgE<;wPnxdDHJ1imU;Alp=po+=i|`3z=+@ibm=gCP%$VUPVmgi zfG}9YU%i&iXyO0rHD>NWW(K`7DZpf#WP3pM&H-*is0hploth%i^g=i)5eloyD(GJ- zDhWN&E|j|oNgNx)sDKCn`T0ia2sBch5Q9QKB=}Zut7HNI39b;b1<>tK=mLaCNf=7- zZCW(xwFyCdx^RxX06Yn0^Y8Ba>F@mGA5Y)?vza|V`&T%RN%#%qibkO02=ElFF+@ia zs^N$~f$m`$B0PqC6rfwZF!5+gHW10` zB)zAUG)24y!PfoD|2|qy{8xAvx#e}7z2K`IZx&Da-YA>i`ex-v#ich3i(fChTsBqu zYRyF6Qhve2b1!vH6~4R|S-=I3;#6^c#=ddE-Z*D(yw;YnduK{h?cJHJM;Er9oZEUb z)%$p6t1n~sEgLwiYuUmTxF?Tg9Q=Y~ZJ5}V;Muv$cNF?d(>Z`rTd zmnnGNeSPq~ZP!MAA7%lF-V0RakF~EXp%oHn+yhiqxtI(N>ln~g1sb_T7(TU54TKV_ zq-M~e_)KRx66^pds{jL?%PtF)>AT1UUDPAAFJQ(sU|Q`%ZcC%%k?L_8nO+v~90|j| z3fdM$9x6c#92>}3wFXo~`U4#J7@I&MSVf#mN*R~~B?iF=P~{6F;3zIYN-GQGnDGjM zwF}T8du`oA5)1%zO2%S*BF=aw?FaxY5@FDxNho2+tO#I?42!@*p_EM(8pq3~cB{YV4C1OQCZ zhqHm>Q#yIsj7*VJs#Z=%M6=wY;KSsC7m;lcM6+yx<;i0}ONb#`h?7T}8L3{rZ} zO=q#BcAaiZJ2#~)o9=}fMvQ_ej1}l2KO@YbZ7fWoI*d3pT++E%Eb?%vCZUq7fV|#R zahrp8)%VW8)q!OSz87RE_W}SjYGUN}m3|!Qi;&MTGfS>Z7YElrpdP9|^|%eFG8l&a zA^lh-omAzBXjgVVW3F}}4KZcgS}heJ{m<%=^yzT?&ED1Y7wEsW4y7zUv(MV1_t{iw zjiFD!A?LT8EpLC9%bjzLa(wwdyU($MRu?XA>@)TjOJS|wr%IJ-xF`2TRSFJab^d$> zeOy$XIiC|Thth^dev@nVTv=(6&&i}k@_0B7G9b{SRglbR+QSbBxIdXBIg4uR$qq1J zj+a*;Y@aX+&3BYfj1H2)ToZpJ082VP27o492q!{bNFmI8%86;6DHcp+K3Wh zOeg{nxi|x;5GEWMcl7}VgFurFHTj0|GRdk|F4+m865!|_GJQoCj8+4ipAdCp)cv3& zxEhx1C$I0713x46c3}|Y+8F4n07MxOpCre`q6?wWvuo1IBnw(5IRQuv#{`Im0+End z0I69>fT+TMAsmDb?VISoZ@e`J^iwg-g)mH4)_mi5g_595F`zim3ycBA*LgY_NGG(V z;20SS_jp{aN8wxOR?P2))JjM)843gGqA3_UH##g4rf&admi)mp6u@F zJ0*OPDuNCrvnnD#P1nc4F2Sd!Lm0<~>4JLbGlq~Do7##A)McsMf zw@@cRy<2Cwc}Y-n1nsk>t}6&6hFU$%3Of>#b-Iy_nb^0CG0nwL*XUKMcG%O_Bdp)Ps2 zsq&-q_G2Fvm;LfiC+DiX!|AORlm$*YRdHh8-Xj*5FPo6~FL!ovt_n3DLZj6BM`sc< 
zp;YJTRAv9X{mhSx%m0NH{pUm@X7)Gp4i<6WHy^Mb+{yi*h(GwK;RlcC$lsZNsKM}o zvEopj;e$F8!ns8cM3Vkn6U+=H3Rlqb!?B)xVc` z|KQB?f9z#B_hLhW*9;qOf`4%pE(BiGck8$$-0absNQ!Avl2O`|;Gx;q2CZE-3 z15UHG7-8Z>7NN}bYGfKp8tJ|iyaNrx!^)b~vbKkowWwwD`wh5aVedC)wUOKOK1V;W zwXeYERPgsUy;sFwarwOs%n9JF74X;U~iM%7n+@*a81gv@arq;FYpyRIG?-U z$$m@vfxp|-^=13RCGC#^k5aL};H**kdSAXK-I-0NedH+}-_u6~K14WRZpfNVx_W_o z8X?j)B#75ye93AvoDp1A!3Br}A$Ak1f`Fx0u7z;HsU(cp0z&W=tz?kqJwLpqXW|ODE--O#Gp_5C4uW5{IM67F37Zoi-p#bD+A?R9? z0Rd{91T$<`BQ({zCq%QkGDO(#_IBYvQV;d7Qa%6D?y0Tw`BjTG^%B2WwV80R%ul8&PtV)?RUUG-{YKx1?WxmGri!0R*`K-v zd3M)VyC$DV+p8yZO9if1M!q^S`9iv&Zo;rS_DNyIRM*vLdeft+${n+tzyHXGiTQ)4 zQak%n>rST&`zP`~dRT#*mV#Gomu!<;=Pm0NHQ`%@rIXLS{^I2qr}wAJwxkQ2Abh(w z>E=3~?0d*7-@wdzDr+W;^Oo{kXbL)yWxdW>wpg})YH%_#-FU5k=HRvCv&PxSXF=ZR z9Q$>ki>$U~8|NsU99y)PPU#ly?$<(7N6=~MCT)1xYI50NqGh$+%QVZDzTC=dSkYgL zZeYSTt)N^RR)ld6OWU?eaGqtywE{Y#4+KKMpHlC!izt#kDE~_`gYGc3-6l`fdms#b z$ED=uLsv?^ek|kUuhw53{nnQ2$@dQb{tGE*N5l~`2jfiNi0miRO^?yowE=cE4WUG-^td}xjP_xPZlwt6>#Xw1*ff4@Cz_@vz_ zaR>IAH4gLWTx-y8(au%80LE`QOw$IkXG|q?HE9K_*F@Rnni7s2X$3x;w(p!J@*i5l zo~wkVM0 z444AK7?105=gF`L`d3Lj_A+1zBbJ9Hpwhl?_yzYLJod@>mzA3zUHc?`=kHL-AbCUN zjgS|DM?xC7C-({e4nH1;q(vYalniZ#hQvHFJc}>U!WU; z0?d38qvwSGK&kp5Oo4EH6ZSxuKtelNqaO`%`T>>Ql9M>iO_O&Lh0w`l?{`W$hf5+) zp|7wzCUq}Ax>Q!R=qj4*{JL|&RX^vdPrEiPxSHo&&1qN5f~#%L)wbZ;H|N@yb{$x7 zbr5B# zny@TADD&o~h6&4J-NuQ0<`H@Mh1bg`_TMV4nA$M)_;lenPrrJ8;<3dd_v@Q4Z=TwG zP4}(CsiMY-gGF%ck-(?)n9{XU^@pR-17*&lqNoWm@+wv>u*oJ$xgOY3<6m zyFMM>n~v5~jDmu{%@RaM^IP6b4T><$-PZI>rRb^p%sB zPTq23?5}n$;ioQD-I7_?ns#qxu~(0>pQ&hOU31#)%@K)Y)@@0cfu+|n%zrLA+Nt=}o0dHjzDZfw2rg+F}+R!f=EqgW$F zEA^|@x=VImw|8(w>z4O$uJY+;Q_ig^%hoJ;s1OB4Y(qjt|JM2_cAfR%g1B+o)Fjwt zWN!@#;^1AWeXHq8(=r9G?_nwT3g8N)f!kO9fE@|oV&#*Np|~u41tWT0$i9ZURaI3d z@D>S-Xt-QvGm4x6U8`kHS&DanOc|I%oAQcj z(p1Ch0hMObl*^{6<@)tm+M(Z=6*y^9P1)4ktJiNBPz9U4Rm%9XzfcL{LW1Oac_ozn zRlcLQE}N!=6pqGc+#%C3!2NmS&&l=zYc8B=pLJ0U4;cQ>s{4n9z+h+ycQD6Pdm>pq z4?IlP%S^TbbPSS#Y!e7QOU491#^1!agd5~V;CXBU4OyrsZzFk|$fF%EG>}I!KB1Ak 
zE#wg)G>kvE`iK*W3H}awk=)P#S&qtT2ek`gzahF^jNxjYuep5VO;`pT$6Om zxqgXnnX#t%4)wQk>x?H|xob)OU98!>R8v3w7NlBIvT5D5+ho4gF z@&=(F+TRTQ6p^(lypKQ$G+l05HXuw=m(k0ii5D~0s@o43i&$Fz2N5MpBaIFgg zb^$l&Lg4-BHDr2mFNsd2P1fAK)J6h8pmM6CNW?Zp%2T8`wny|M)(H`mz)ng8}dusBVEM^4AD(X z;eR48%dIi)OAD}t?@--5sTvdm@@q%npIaryOJ8=*KdIo5lcm7 zQGZ2NQ6Y~BNo(v#K!X()} z{S=800gy5k&Q8w9C=O~?3m}8<#IDAEv}yM$=-8gJYgD4pR}Nnqo;>$E&q7@(sWNXt zm=So@LNblP)VeL1(x$YlnPMZaI$7eBH&g0KyBc#unlh!E)2@bVk7Qh}kdzb@EOTau za|PlO+l20pn?oyoulq0i7fPDuN}8@co-S#b(Pc_tV@vRL`}g*~zjvY2H`htRp)@R( z`p;xKpGZIQWXkzu%JSsBG7lBV(IPzv$N`Aya)|`ABnZ0^Wne(hC@T=xQmFuX!oQ*` zT*RPbn>E^%*7?%`x)0U>x_W|h^&i>Ymwm4fTpq~SYYD{FfAmNP`&|p^>hGmnX)FMA z_7#$x1Wd5$f6zrKj>#NQ9;QYn0526}-lHQVUTuwA4Zf5^7{{fjnCZ-hY+D+TXu|W6 z^l+plds*nL`ev(3eP}>wOZmOu>q>FM1Nvc)Qfm5XRa*ND{TR<|mjJ;i9ar1o=fRH0 zW9;pw!BMi7IFBzL!d@$3sD&0D`H)jgZU>orgUNUxF(1qSi#D?J$X96FG+26!M= z9w0q^%HDhp;;OVTOtzo&W~oi>1A>GI8EZ%v7jU^X22npr7hs^1gmB>;K3IO|>A&w~ zhJVU7pv^)iF)NS4+oh|89r8u4k7KVx=uSa2*Fa9z}V?5Y`DK zf)QHaT{nn&Z>y+FlC3EoTTM{woA`^;y(Q}|dgt)@xUfgu`9L~R6+L)&;o=l%%{^YV z>O4-w3OF|Imp{WmQ*Dobf9%GFnMY>#f4AcgJAN?szgK9Ff2p)W_y6uI1rr^3?WYM= zqzE>684Bc+XNLzODTv`n9>X!xy)X&75N09m#ChQ(gn&=JLS<=h!b(}%kitJw(m#=B zL3>%ETYk!r(A0fwY44Ha{D2zxGAe)`TQ1k$wb=h=)nakw#K~JAQ}wCJrc8PBLV4R< zIRq$~@<-D}J0{GFF8729SBBp$c{efN_V@>*ANoEROSe6qZh1WI@nx!g8Mkl3`tSnI z@|3eKWvROt^b@M5IbGr5JqYyE8?XZD>G5=3Dg8Y3=Bh|v?nY5YiJDDiw>fy12iZ?{ zBq+cf?I4WAzVbB$GzO%)EC=rYIEv3Fu2g6hKaOKj3{>Jmu3v1Zq3M4&IXoV>V&Xdp_co-aJzJ6N)kIf*YP_J8=?E?fUzk z>^&^qE+S(zTu>(Uf~J@mz>S`HPS6UdgcBaIMYm#kjsbaD4z2Dkwp2g09Q>rHYO3!o z{}unWV7h4Ag!v;2Ua@^Wb~!eEVcycRB$-B~O83lOm<^}gUGtXiMQ7?N@ z@BWPFvUh@VSy1~_^#UCxOrL3dFFi`ZcC!rlAVQb-W>A&3V`DQUNUC$`)|;}`$o#Yb zri%UuJte~O#y9(veJ?!|f{i;$*hji$mmG_;`-A-Eal_^gemvK`Sv zd#6**-_m}eCoV2>zsTdPJ3lF?nmYB?*(+z$1x+CLolu1?IO~4otXnFoTrAo-TlYQh z``*;f?hlLqTVKj|`Y)gS@L0ZuoUUo(CJki4W7aCml2@hKU5!?^RX}Ws60T3kbQUyTbs*ecb6@JY0-^- z2}81-#LIvnbMWFnk!ypa$qu&lD7f(}&vA6~aLbGKf z;zZi5%EO)pyih2X4T&JLNB1+5p?dT^jLPzq3$0XW 
zD|v0?5pgSQCy&ml0A(=8z7>z}vj?Q5%ea!lY??BEm%O*g`+z)}2;uL^`zd)uaS7#g z;`7-{({gqCCp3gjabO<14Gl@Pl7q7KS?roq2^0QI*}{VjJJNC-zk_zkQ-u&j)cm8&9pw+Unu zBmnS=xV*xS1#n(4v7Ncf!e)M6fhDK;;=SkE5)m&|w>=T5A;Xo`-WkB6pH$Ql2=mZ;1`o#AcsyNi(;4xs zzs~+Qob_YQ^D(#i$6Wcx+_nt2?QglFzu}7ij%)cb=lR46u}#`qb1@GW+KNgi&%GZ0 zS~$f&nriP&mp+~@^j$1~?M;d+Ty&SihxDktBE`9(6V>P6)jM^#LIn?NriXbrLceIl X%i+H$qPLyz)<2V{<5xKH*{J_7ZoY%* literal 0 HcmV?d00001 diff --git a/temp_backup/src/analytics/__pycache__/market_analyzer.cpython-314.pyc b/temp_backup/src/analytics/__pycache__/market_analyzer.cpython-314.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9d1bd6c0ceb8262cb5b8dcbf17a5e134fb1ca282 GIT binary patch literal 8295 zcmai3TWniLdY(fbUc`$eUu4PF?MSXI)4Ex{%38;^EGw1+Nmhi_|o0K14?q^PmT!v@=jEZT=GP;|5C%cf5x6hYjBF6y9*7JZ{uPQ8!) z|Cu?|kxZ`#;GDT}X8xJ~|Nd`gx@)5$0^$9CI{NNs#|Zf^%s9a(5<5SH#5z%lMeY$* zx-E@)EKfV8J-5AMvL%oCEFa5zZ~MmrR-m0UkPf2CjYRdm<2mo;{aC?w0<7E*<$?1a zUV2^(V%4y^Ak@`3byi3XK|b6_()CtYjT|Sj=vp>3q0#%grOZ%0yP%Bed73hum}JYN*?cDEv3)c7Tw1q-cXC$B$ZBb*s#(x1Et%F#Gb!X?A4z0( zPC{axyhAJrR?l-UYz{gC`tKJm1+as5_F zMDO3C4NkK?yCF1<>VCP+ku1M=7PVRXI-zQ`N zN?XXA(z4V~rlq&Z_dKeEN+Z%9k(Myu0|x2@Xy^SCbu!lBr2iL%x9ci4kYH+UN4fSV zz!b7ex?P*hQ_wam!Hey>9MvDBjJzodQ27_rhGtO|1af{5I-^;3G?8Emx{yj*pddI{ z=Q=%tV}j0U*{lINgBsY#l^WNs&s}i{nwrB9{qaZS2V^@)jx-nLW7{=ElD0$S@X?~& zv>nDwgfz51zWb9qMfvo06bov}k+#SBPZCAB`ww+cz&9WDBNn0&_|PauwHVc5RF6?B zMr{z;2VB!w%x6LUDe4uLS42wc>2xCTxkNjl#>CCM&V-{)kYo{?e>3D@{_Esh`E)r* zy1LgVOAy_8j`R8(!e@qm*`t=I4;XengJ_Rhw(e6)2P*0SLTi|U`cOet0Mc}Q1wcch zF2L#lNL_W|cOvY6HLMOm)m0Z2bqKRTHL|C!R*hP9U;_1OEof7{UCXy|7;P0yGye$I zBqVaCV+G3`K=WwY%8H&^TDJ5BMSGxAZAn**#eIk@fS9YI*%u1E4!V_3YeOLg0v`}y zo`Lc-g0k|Oa%#E)>i_nMa;EQ6OzBomO`=k!Q)~J{BF@k~F~am)VQtRELZR7Zo$9>f zUZv8sh6DfVrW8P-q0*Mg`d`lVTEH1)0tcDFTkg$XxgdJu*B5e&T)v52-nWc#UZ+)(&r}IV{=mhNM5|W0O z?qFYNW;ta7d(a<1J5$l$1CI@iLg=%ZOWg+wvPg}LvS4IVS#X|U53o;eYJoZfcq#*! 
zF4I1kCafQ>DoZmwJ*jX!ZAxSv^%kh?pm*0^qwITG!;8?bc@Uy?@?C_~w0&E5=%a~^ zy3Rtdb9Y_EY7sxQ!?oS%f#}s*p?aBacwv=grW{esQVx1$SGN$O$>`T#b`fJ#*--SWu<7*I~WR7tL&KBQ&C=lYoYn zz7JkGLRAM@0MTK(WO7WY>;pi8ijEl0Z6*{_ndOUd1X2t1 z=Y8*@9^fd_ZvfKJ5|_Dr%}M+!=BKBike-3auAd&AoVh&#QDS!V_N}oQI~u<;es^Nj z$$B%|d$t!W3RLYJ{tXSrK2R}_1&78&_*^*WF>y;zo>YW6H71hI!Y16!?34Sgsd~HX z(3?38(K>n7L|VJIT1JX3BVSEyw5WyP(XyXJk9=CY)pV)YbZIkksZ`tkS>IOY)nez> z&DyKqd8O#JvX|5}Jd2R}Lm!?0^w6W}Cp{ZS`#0*&6oO}7v__%SWCw?Rr8Od2BUYnL z%ICap-snPaJDlM}8(Aww^Njg8u|Y%Olp7Jo^1lr=b-CvJ>eZKJ8oy|*AtmZ}2Jpg}Zru*-JLdcP)95jLoC&Z;Gx4DaAml;b_Rg4$< zdWD(hytB$w&>3a->o(<_JbQkRIs0a(rq}^`w>_o?e+mP7Y5oOY&10I@${CG2G$(?L#bfo<~A@ip`~mJLVoGqr(X|X~vWlH(g|H zhQzgl4p#!oDZ$VB#c8=;P%nonG7KD2RBc6ZqoSZ}ViY8wNp_(?SCtjGzW^aJIRlR7 zz(!VGHpAF*G00V5;` z_5mSOVrEPoogTY0ev?B18Yfh8*6FH5rKETaiUnHUwl0oa02rnLoi>w(NuAJ+*Q8xA{AW)=9aIv?^OIgTLa+(Ix~Y4!@wJB$NJW~q zj`bIh^?wuY|Bu5*KbiP=;)Bsr^ypU0K(S@uo9IBL@b|p-;V7K=qRo%SOOe(`f3?*< zRBRtAMLG({FP0+BkKW#Dy;y98WP73GY$?)KP|lPh4UfW4rngR9E}ppj<=|J-Tch#f zXuQxkTdoTp3Ox*z50m?6q0|NBU>kz%#6)KznllBIXZTeSO;mqTjY1FMIKO_9FX!TT}Z%{$u^| zOrfr0v!-*~2X$O%9Lfp8|3!BG1xmm(c&d2@Rk}wU@|5$wq>abbMvlw(I97vlw-CT~ z7naB6o(er9B)xC`oHxY_001a&F*^;L5pfKJ0zqoSG4 z)VbLTkwSJr^kfH}P6XittdNXc%g@}zmAuD1~EWtz?|Q74_l`LALJUVeR5foVDllcWd%c7)%{ zaz)~}{yi)Yvui9-jH^jijF2MG6BwO@$bG5BC^$yA$fgAmyVTnD)cS(uPSVs|kPnv* zH?B_rOdf41$PJ|?jGy`rJoms6eaB1u2g{f$AB5C%4;})RK0V;J_+gH1v}fvoGTrJ ze{k~aiuRQ&$) zIbQxE)x}Xj9C@b`n9-h7YGvZkLWDkeFS+9quNskqHo-k@aKZKkXj5}X>i6aV1mPV= z29Welxj@coKHK3wGC1Is1Dd$I#8XxJAAAc)YX?FpTyJw=1we(I9D%o@I8nV(cojM; zALsQ*ad+!|d+PBY0H^qmvy4Kn{e9?`4%h8?5MIt_v}`ZcwFPZHts{h`!QzoaA(_J) zmJEL)Vl-kebRwg2nB$AAOmMuLx}{S%U?MEyE6hIF8+8o`iUMpiG`V(M8P~NeW1eZe z@=UKPIm1j@DZtcIf(fp9`_3+|d18IlT4(hokip&hNVVI~AqB1lW~MD_0LiV4P@#p$ z9`W&Ofpf`JHVNy*ZeqDVP|W<&l5WC%m3T=~G6p`5X7s$YODZlO=aPn2-{;S`P_;|#nSbyUAw`btegf06}MvZ2K=ka6CJ!}E#Nxx z3HO1xZ%jICvpb`tfv<5wOd(qi-llY#g*U&7ko=VlZ|O`k4-a~4cc<{tDZ}3kg`7s- 
zN*l&~h3cs+ylf=7n4(iWQ&5djbVme`alu&0x~05%LyTmmmf&_=h`2jl=8{afFRbv&rf#wO$chHxyecXv_7b)Vx|-w33oYi zF6RHa`>z8I;jX9mpq?T2&mqC?Yier$Hu^#I4@7PXJqwYS=E{xrhyCm0JAX!e2cI^! ze)9c~zyD-z^Tgmr<7M!LPXm!3`#y<&9Q`KHS_;>HH1Of&(vg#ehOR=_8^5_wcys1A z-G#feg+p%@B5#$V2S0lA!#{s%{nOgELbScme&GxK%aJeME`)~*@-TB7ld%ZJ3j$ll zGbcs&VEe7r99Rfv)@O6{odhnHIOp5t+TmwclIOSkZbiph9h4?C+hgxt% z#6*%+_Db-+=J87McOfDj`A^dMJJRvY?~|Gy-h4*jQ?4hT@NM0Y5z;}DXWz0T5 Tc*7$B&&T-r8#1~;Hr)RM9Qy#P literal 0 HcmV?d00001 diff --git a/temp_backup/src/analytics/correlation_engine.py b/temp_backup/src/analytics/correlation_engine.py new file mode 100644 index 00000000..a73b2820 --- /dev/null +++ b/temp_backup/src/analytics/correlation_engine.py @@ -0,0 +1,358 @@ +""" +Correlation Analysis Engine +Calculates statistical correlation between social sentiment and Stellar on-chain metrics. +""" + +from dataclasses import dataclass, field +from typing import List, Dict, Any, Optional, Tuple +from datetime import datetime +import pandas as pd +import numpy as np + + +@dataclass +class DataPoint: + """Single data point for scatter plot visualization.""" + + timestamp: datetime + sentiment: float + metric_value: float + metric_type: str # 'price' or 'volume' + + def to_dict(self) -> Dict[str, Any]: + return { + "timestamp": self.timestamp.isoformat(), + "sentiment": self.sentiment, + "metric_value": self.metric_value, + "metric_type": self.metric_type, + } + + +@dataclass +class CorrelationResult: + """Result of correlation analysis between sentiment and a market metric.""" + + metric_type: str # 'price' or 'volume' + correlation_score: float # Pearson correlation coefficient (-1 to 1) + p_value: Optional[float] # Statistical significance + sample_size: int + confidence_level: str # 'high', 'medium', 'low', 'insufficient_data' + data_points: List[DataPoint] = field(default_factory=list) + lag_hours: int = 0 # Time lag between sentiment and metric + + def to_dict(self) -> Dict[str, Any]: + return { + "metric_type": self.metric_type, + 
"correlation_score": round(self.correlation_score, 4), + "p_value": round(self.p_value, 6) if self.p_value is not None else None, + "sample_size": self.sample_size, + "confidence_level": self.confidence_level, + "lag_hours": self.lag_hours, + "interpretation": self._interpret_correlation(), + "scatter_data": [dp.to_dict() for dp in self.data_points], + } + + def _interpret_correlation(self) -> str: + """Provide human-readable interpretation of the correlation score.""" + score = abs(self.correlation_score) + direction = "positive" if self.correlation_score > 0 else "negative" + + if self.sample_size < 10: + return "Insufficient data for reliable interpretation." + if score >= 0.7: + return f"Strong {direction} correlation: sentiment is a strong leading indicator." + if score >= 0.4: + return f"Moderate {direction} correlation: sentiment shows predictive value." + if score >= 0.2: + return f"Weak {direction} correlation: limited predictive relationship." + return "No significant correlation: sentiment does not predict this metric." + + +class CorrelationEngine: + """ + Calculates statistical correlation between social sentiment and on-chain metrics. + + Supports: + - Sentiment vs Price correlation + - Sentiment vs Volume correlation + - Time-lagged correlation analysis + - Scatter plot data generation + """ + + MIN_SAMPLES = 5 + RECOMMENDED_SAMPLES = 30 + + @staticmethod + def _calculate_pearson( + x: pd.Series, y: pd.Series + ) -> Tuple[float, Optional[float]]: + """ + Calculate Pearson correlation coefficient and p-value. 
+ + Returns: + Tuple of (correlation_score, p_value) + """ + if len(x) < 2 or len(y) < 2: + return 0.0, None + + # Remove any NaN values + mask = ~(x.isna() | y.isna()) + x_clean = x[mask] + y_clean = y[mask] + + if len(x_clean) < 2: + return 0.0, None + + # Calculate correlation using pandas + correlation = x_clean.corr(y_clean) + + if pd.isna(correlation): + return 0.0, None + + # Calculate p-value using t-distribution approximation + n = len(x_clean) + if abs(correlation) >= 1.0: + p_value = 0.0 + else: + t_stat = correlation * np.sqrt((n - 2) / (1 - correlation**2)) + # Two-tailed p-value approximation + from math import erfc, sqrt + + p_value = erfc(abs(t_stat) / sqrt(2)) + + return float(correlation), float(p_value) + + @staticmethod + def _determine_confidence(sample_size: int, p_value: Optional[float]) -> str: + """Determine confidence level based on sample size and p-value.""" + if sample_size < CorrelationEngine.MIN_SAMPLES: + return "insufficient_data" + if p_value is None: + return "low" + if sample_size >= CorrelationEngine.RECOMMENDED_SAMPLES and p_value < 0.05: + return "high" + if sample_size >= 15 and p_value < 0.10: + return "medium" + return "low" + + @classmethod + def calculate_correlation( + cls, + sentiment_data: List[Dict[str, Any]], + metric_data: List[Dict[str, Any]], + metric_type: str = "volume", + lag_hours: int = 0, + ) -> CorrelationResult: + """ + Calculate correlation between sentiment and a market metric. 
+ + Args: + sentiment_data: List of dicts with 'timestamp' and 'score' keys + metric_data: List of dicts with 'timestamp' and 'value' keys + metric_type: Type of metric ('price' or 'volume') + lag_hours: Hours to shift sentiment data (positive = sentiment leads) + + Returns: + CorrelationResult with score, confidence, and scatter data + """ + if not sentiment_data or not metric_data: + return CorrelationResult( + metric_type=metric_type, + correlation_score=0.0, + p_value=None, + sample_size=0, + confidence_level="insufficient_data", + data_points=[], + lag_hours=lag_hours, + ) + + # Convert to DataFrames + sentiment_df = pd.DataFrame(sentiment_data) + metric_df = pd.DataFrame(metric_data) + + # Parse timestamps + sentiment_df["timestamp"] = pd.to_datetime(sentiment_df["timestamp"]) + metric_df["timestamp"] = pd.to_datetime(metric_df["timestamp"]) + + # Apply lag to sentiment data + if lag_hours > 0: + sentiment_df["timestamp"] = sentiment_df["timestamp"] + pd.Timedelta( + hours=lag_hours + ) + + # Round to hourly for alignment + sentiment_df["hour"] = sentiment_df["timestamp"].dt.floor("h") + metric_df["hour"] = metric_df["timestamp"].dt.floor("h") + + # Aggregate sentiment by hour (average) + sentiment_hourly = ( + sentiment_df.groupby("hour")["score"].mean().reset_index() + ) + sentiment_hourly.columns = ["hour", "sentiment"] + + # Aggregate metric by hour (average for price, sum for volume) + if metric_type == "volume": + metric_hourly = metric_df.groupby("hour")["value"].sum().reset_index() + else: + metric_hourly = metric_df.groupby("hour")["value"].mean().reset_index() + metric_hourly.columns = ["hour", "metric_value"] + + # Merge on hour + merged = pd.merge(sentiment_hourly, metric_hourly, on="hour", how="inner") + + if len(merged) < cls.MIN_SAMPLES: + return CorrelationResult( + metric_type=metric_type, + correlation_score=0.0, + p_value=None, + sample_size=len(merged), + confidence_level="insufficient_data", + data_points=[], + lag_hours=lag_hours, + ) + 
+ # Calculate correlation + correlation, p_value = cls._calculate_pearson( + merged["sentiment"], merged["metric_value"] + ) + + # Build scatter data points + data_points = [ + DataPoint( + timestamp=row["hour"].to_pydatetime(), + sentiment=float(row["sentiment"]), + metric_value=float(row["metric_value"]), + metric_type=metric_type, + ) + for _, row in merged.iterrows() + ] + + confidence = cls._determine_confidence(len(merged), p_value) + + return CorrelationResult( + metric_type=metric_type, + correlation_score=correlation, + p_value=p_value, + sample_size=len(merged), + confidence_level=confidence, + data_points=data_points, + lag_hours=lag_hours, + ) + + @classmethod + def analyze_with_lags( + cls, + sentiment_data: List[Dict[str, Any]], + metric_data: List[Dict[str, Any]], + metric_type: str = "volume", + max_lag_hours: int = 24, + ) -> Dict[str, Any]: + """ + Analyze correlation across multiple time lags to find optimal lead time. + + Args: + sentiment_data: List of dicts with 'timestamp' and 'score' keys + metric_data: List of dicts with 'timestamp' and 'value' keys + metric_type: Type of metric ('price' or 'volume') + max_lag_hours: Maximum lag to test + + Returns: + Dict with best lag, all correlations, and recommendation + """ + lag_results = [] + + for lag in range(0, max_lag_hours + 1, 1): + result = cls.calculate_correlation( + sentiment_data, metric_data, metric_type, lag_hours=lag + ) + lag_results.append( + { + "lag_hours": lag, + "correlation": result.correlation_score, + "p_value": result.p_value, + "confidence": result.confidence_level, + } + ) + + # Find best correlation (highest absolute value with sufficient confidence) + valid_results = [ + r for r in lag_results if r["confidence"] != "insufficient_data" + ] + + if not valid_results: + return { + "best_lag_hours": 0, + "best_correlation": 0.0, + "lag_analysis": lag_results, + "recommendation": "Insufficient data to determine optimal lag.", + } + + best = max(valid_results, key=lambda x: 
abs(x["correlation"])) + + if abs(best["correlation"]) >= 0.4: + recommendation = ( + f"Sentiment appears to lead {metric_type} changes by approximately " + f"{best['lag_hours']} hours with {best['confidence']} confidence." + ) + else: + recommendation = ( + f"No strong leading relationship found. Best correlation of " + f"{best['correlation']:.3f} at {best['lag_hours']}h lag." + ) + + return { + "best_lag_hours": best["lag_hours"], + "best_correlation": best["correlation"], + "lag_analysis": lag_results, + "recommendation": recommendation, + } + + @classmethod + def full_analysis( + cls, + sentiment_data: List[Dict[str, Any]], + price_data: List[Dict[str, Any]], + volume_data: List[Dict[str, Any]], + lag_hours: int = 0, + ) -> Dict[str, Any]: + """ + Perform full correlation analysis for both price and volume. + + Args: + sentiment_data: List of dicts with 'timestamp' and 'score' keys + price_data: List of dicts with 'timestamp' and 'value' keys + volume_data: List of dicts with 'timestamp' and 'value' keys + lag_hours: Time lag to apply + + Returns: + Complete analysis results with both correlations + """ + price_result = cls.calculate_correlation( + sentiment_data, price_data, metric_type="price", lag_hours=lag_hours + ) + + volume_result = cls.calculate_correlation( + sentiment_data, volume_data, metric_type="volume", lag_hours=lag_hours + ) + + return { + "price_correlation": price_result.to_dict(), + "volume_correlation": volume_result.to_dict(), + "summary": { + "sentiment_is_leading_indicator": ( + abs(price_result.correlation_score) >= 0.4 + or abs(volume_result.correlation_score) >= 0.4 + ) + and ( + price_result.confidence_level in ("high", "medium") + or volume_result.confidence_level in ("high", "medium") + ), + "strongest_relationship": ( + "price" + if abs(price_result.correlation_score) + > abs(volume_result.correlation_score) + else "volume" + ), + "analysis_timestamp": datetime.utcnow().isoformat(), + }, + } diff --git 
a/temp_backup/src/analytics/entity_linker.py b/temp_backup/src/analytics/entity_linker.py new file mode 100644 index 00000000..21b388d8 --- /dev/null +++ b/temp_backup/src/analytics/entity_linker.py @@ -0,0 +1,212 @@ +""" +On-chain Entity Linker for news articles. +Links news content to on-chain projects and assets, producing stable IDs +and storing links in the database. +""" + +import logging +import re +from typing import Dict, List, Optional, Tuple +from dataclasses import dataclass + +from .keywords import CRYPTO_PROJECT_MAP, KNOWN_TICKERS, TICKER_TO_PROJECT + +logger = logging.getLogger(__name__) + + +@dataclass +class LinkedEntity: + stable_id: str + entity_type: str # "project" or "asset" + name: str + ticker: Optional[str] = None + confidence: float = 1.0 + + +class EntityLinker: + """ + Links text content to known on-chain entities (projects and assets) + with stable, deterministic IDs. + """ + + def __init__(self) -> None: + self._project_patterns = self._compile_project_patterns() + # Filter out SDF from asset tickers since it's a project + self._asset_tickers = {t for t in KNOWN_TICKERS if t not in ["SDF"]} + + def _compile_project_patterns(self) -> List[Tuple[str, re.Pattern]]: + """Compile regex patterns for project name matching, sorted by length descending.""" + patterns = [] + # Sort project names by length descending to prefer longer matches + sorted_projects = sorted( + CRYPTO_PROJECT_MAP.keys(), + key=lambda x: len(x), + reverse=True + ) + for project_name in sorted_projects: + pattern = re.compile(r"\b" + re.escape(project_name) + r"\b", re.IGNORECASE) + patterns.append((project_name, pattern)) + return patterns + + def _generate_stable_id(self, entity_type: str, identifier: str) -> str: + """Generate a stable, deterministic ID for an entity.""" + normalized = identifier.strip().lower() + return f"{entity_type}:{normalized}" + + def link_text( + self, + text: str, + title: Optional[str] = None + ) -> List[LinkedEntity]: + """ + Link the given 
text to known on-chain entities. + + Args: + text: Main text content to analyze + title: Optional article title (higher weight for entities found here) + + Returns: + List of LinkedEntity objects with stable IDs + """ + entities: Dict[str, LinkedEntity] = {} + + # Combine title and text for analysis, title first for priority + full_text = f"{title or ''}\n{text or ''}" + + # Match project names + for project_name, pattern in self._project_patterns: + if pattern.search(full_text): + # Get canonical project name (the last one in the list) + canonical_name = CRYPTO_PROJECT_MAP[project_name][-1] if CRYPTO_PROJECT_MAP[project_name] else project_name + canonical_stable_id = self._generate_stable_id("project", canonical_name.lower()) + + if canonical_stable_id not in entities: + entities[canonical_stable_id] = LinkedEntity( + stable_id=canonical_stable_id, + entity_type="project", + name=canonical_name, + confidence=0.95 + ) + + # Match tickers + ticker_pattern = re.compile(r"\b([A-Z]{2,6})\b") + for ticker in ticker_pattern.findall(full_text): + ticker = ticker.upper() + if ticker in self._asset_tickers: + stable_id = self._generate_stable_id("asset", ticker) + if stable_id not in entities: + entities[stable_id] = LinkedEntity( + stable_id=stable_id, + entity_type="asset", + name=ticker, + ticker=ticker, + confidence=0.9 + ) + # Also link the associated project if available, using canonical ID + if ticker in TICKER_TO_PROJECT: + for project_name in TICKER_TO_PROJECT[ticker]: + # Get canonical project name + canonical_name = CRYPTO_PROJECT_MAP.get(project_name.lower(), [project_name])[-1] + canonical_stable_id = self._generate_stable_id("project", canonical_name.lower()) + if canonical_stable_id not in entities: + entities[canonical_stable_id] = LinkedEntity( + stable_id=canonical_stable_id, + entity_type="project", + name=canonical_name, + confidence=0.85 + ) + + return list(entities.values()) + + def link_article( + self, + title: Optional[str], + summary: 
Optional[str], + content: Optional[str] + ) -> List[LinkedEntity]: + """Link an article's content to on-chain entities.""" + combined_text = "\n".join([ + title or "", + summary or "", + content or "" + ]) + return self.link_text(combined_text, title) + + +# Small labeled test set for precision measurement +LABELED_TEST_SET = [ + { + "text": "Stellar Development Foundation (SDF) announces new Soroban upgrade. XLM price surges.", + "expected_entities": [ + {"stable_id": "project:stellar", "type": "project"}, + {"stable_id": "project:soroban", "type": "project"}, + {"stable_id": "asset:xlm", "type": "asset"} + ] + }, + { + "text": "Bitcoin (BTC) reaches new all-time high. Ethereum (ETH) follows closely.", + "expected_entities": [ + {"stable_id": "asset:btc", "type": "asset"}, + {"stable_id": "asset:eth", "type": "asset"} + ] + }, + { + "text": "DeFi protocol Uniswap launches new liquidity pool on Solana.", + "expected_entities": [ + {"stable_id": "project:uniswap", "type": "project"}, + {"stable_id": "asset:sol", "type": "asset"} + ] + }, + { + "text": "Cardano (ADA) releases new roadmap for governance.", + "expected_entities": [ + {"stable_id": "asset:ada", "type": "asset"} + ] + }, + { + "text": "Tech stocks rally on positive earnings. Apple and Microsoft lead gains.", + "expected_entities": [] # No crypto entities + } +] + + +def measure_precision(entity_linker: EntityLinker) -> Dict[str, float]: + """ + Measure precision of the entity linker using the labeled test set. 
+ + Returns: + Dictionary with precision metrics + """ + true_positives = 0 + false_positives = 0 + total_expected = 0 + + for test_case in LABELED_TEST_SET: + text = test_case["text"] + expected = test_case["expected_entities"] + total_expected += len(expected) + + actual = entity_linker.link_text(text) + actual_stable_ids = {e.stable_id for e in actual} + expected_stable_ids = {e["stable_id"] for e in expected} + + # Calculate true positives and false positives + for entity in actual: + if entity.stable_id in expected_stable_ids: + true_positives += 1 + else: + false_positives += 1 + + precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 1.0 + recall = true_positives / total_expected if total_expected > 0 else 1.0 + f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0 + + return { + "precision": precision, + "recall": recall, + "f1": f1, + "true_positives": true_positives, + "false_positives": false_positives, + "total_expected": total_expected, + "test_cases": len(LABELED_TEST_SET) + } diff --git a/temp_backup/src/analytics/forecaster.py b/temp_backup/src/analytics/forecaster.py new file mode 100644 index 00000000..3fa2225e --- /dev/null +++ b/temp_backup/src/analytics/forecaster.py @@ -0,0 +1,507 @@ +""" +Predictive analytics: forecast market trends (Bullish/Bearish) for the next 24-48 hours +using historical sentiment and volume data from analytics.jsonl. + +Sentiment Velocity = rate of sentiment change per hour (dS/dt of mood). 
+ +Backend selection (auto-detected at runtime): + - Prophet (Meta) — preferred; installed via ``pip install prophet`` + - scikit-learn Ridge regression — always available (already in requirements) + - Heuristic decay — final fallback when data < 3 points +""" + +import json +import os +from dataclasses import asdict, dataclass +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +import numpy as np +import pandas as pd + +from src.analytics.market_analyzer import Trend +from src.utils.logger import setup_logger + +logger = setup_logger(__name__) + +# ── Constants ────────────────────────────────────────────────────────────── + +_DEFAULT_JSONL = Path(os.getenv("ANALYTICS_JSONL_PATH", "./data/analytics.jsonl")) + +BULLISH_THRESHOLD = 0.2 +BEARISH_THRESHOLD = -0.2 + +# Minimum rows required to fit a statistical model +_MIN_TRAINING_POINTS = 3 + + +# ── Output type ─────────────────────────────────────────────────────────── + + +@dataclass +class ForecastResult: + """Market trend forecast for the next 24 h and 48 h.""" + + predicted_trend_24h: str # "bullish" | "bearish" | "neutral" + predicted_trend_48h: str + confidence_24h: float # 0.0 – 1.0 + confidence_48h: float + sentiment_velocity: float # Δsentiment per hour (positive → accelerating bullish) + forecast_score_24h: float # predicted market health score at T+24 h + forecast_score_48h: float # predicted market health score at T+48 h + model_backend: str # "prophet" | "sklearn" | "heuristic" + data_points_used: int + generated_at: str + + def to_dict(self) -> Dict[str, Any]: + return asdict(self) + + +# ── Helpers ─────────────────────────────────────────────────────────────── + + +def _classify_trend(score: float) -> str: + """Map a health score to a Trend label.""" + if score > BULLISH_THRESHOLD: + return Trend.BULLISH.value + if score < BEARISH_THRESHOLD: + return Trend.BEARISH.value + return Trend.NEUTRAL.value + + +def 
_confidence_from_score(score: float) -> float: + """ + Translate the magnitude of a predicted score to a 0–1 confidence value. + + Near the neutral band (±0.2) → low confidence (~0.5). + Strongly bullish/bearish (±1.0) → high confidence (~0.95). + """ + abs_score = abs(score) + if abs_score <= BULLISH_THRESHOLD: + # Linear scale within neutral band: 0.30 … 0.50 + return round(0.30 + (abs_score / BULLISH_THRESHOLD) * 0.20, 3) + # Sigmoid-like growth beyond neutral band + return round(min(0.95, 0.50 + (abs_score - BULLISH_THRESHOLD) * 0.75), 3) + + +# ── Main class ──────────────────────────────────────────────────────────── + + +class SentimentForecaster: + """ + Forecasts sentiment-based market health scores 24 h and 48 h ahead. + + Typical usage:: + + forecaster = SentimentForecaster() + df = forecaster.load_history() + metrics = forecaster.train(df) + result = forecaster.predict(df) + + Or as a one-liner:: + + result = SentimentForecaster().run() + """ + + MODEL_TYPE = "sentiment_forecaster" + + def __init__(self, jsonl_path: Optional[Path] = None) -> None: + self.jsonl_path: Path = Path(jsonl_path) if jsonl_path else _DEFAULT_JSONL + self._model_24h = None # fitted model / Prophet instance + self._model_48h = None # separate ridge for 48 h (sklearn path) + self._backend: str = "heuristic" + self._is_trained: bool = False + + # ── Data loading ────────────────────────────────────────────────────── + + def load_history(self, path: Optional[Path] = None) -> pd.DataFrame: + """ + Parse *analytics.jsonl* into a time-indexed DataFrame. 
+ + Columns: + timestamp, sentiment_score, news_count, + positive_pct, negative_pct, neutral_pct + """ + jsonl_path = Path(path) if path else self.jsonl_path + + if not jsonl_path.exists(): + logger.warning( + f"analytics.jsonl not found at {jsonl_path}; returning empty DataFrame" + ) + return pd.DataFrame() + + records: List[Dict[str, Any]] = [] + with open(jsonl_path) as fh: + for raw in fh: + raw = raw.strip() + if not raw: + continue + try: + entry = json.loads(raw) + sd = entry.get("sentiment_data", {}) + dist = sd.get("sentiment_distribution", {}) + records.append( + { + "timestamp": pd.to_datetime(entry["timestamp"]), + "sentiment_score": float( + sd.get("average_compound_score", 0.0) + ), + "news_count": int(entry.get("news_count", 0)), + "positive_pct": float(dist.get("positive", 0.0)), + "negative_pct": float(dist.get("negative", 0.0)), + "neutral_pct": float(dist.get("neutral", 1.0)), + } + ) + except (KeyError, ValueError, json.JSONDecodeError) as exc: + logger.warning(f"Skipping malformed analytics line: {exc}") + + if not records: + logger.warning("analytics.jsonl contained no valid entries") + return pd.DataFrame() + + df = ( + pd.DataFrame(records) + .sort_values("timestamp") + .reset_index(drop=True) + ) + logger.info(f"Loaded {len(df)} data points from {jsonl_path}") + return df + + # ── Sentiment velocity ──────────────────────────────────────────────── + + @staticmethod + def compute_sentiment_velocity( + df: pd.DataFrame, window: int = 5 + ) -> float: + """ + Compute how fast sentiment is changing (Δsentiment / Δhours). + + Positive → mood is becoming more bullish. + Negative → mood is turning more bearish. + + Uses the most recent *window* records; returns 0.0 when there + are fewer than 2 data points. 
+ """ + if df is None or len(df) < 2: + return 0.0 + + recent = df.tail(window) + if len(recent) < 2: + return 0.0 + + delta_s = ( + recent["sentiment_score"].iloc[-1] - recent["sentiment_score"].iloc[0] + ) + delta_h = ( + (recent["timestamp"].iloc[-1] - recent["timestamp"].iloc[0]) + .total_seconds() + / 3600.0 + ) + + if delta_h < 1e-6: + return 0.0 + + return round(delta_s / delta_h, 6) + + # ── Training ────────────────────────────────────────────────────────── + + def train(self, df: pd.DataFrame) -> Dict[str, Any]: + """ + Fit the forecasting model on historical data. + + Returns a metrics dict describing what was trained and how well. + If data is too sparse the forecaster silently falls back to the + heuristic decay method — this is reflected in the ``backend`` key. + """ + if df is None or len(df) < 2: + logger.warning( + "Insufficient data for model training; using heuristic fallback" + ) + self._is_trained = False + return {"backend": "heuristic", "n_points": 0} + + if self._try_train_prophet(df): + return {"backend": "prophet", "n_points": len(df)} + + return self._train_sklearn(df) + + # ── Prophet backend ─────────────────────────────────────────────────── + + def _try_train_prophet(self, df: pd.DataFrame) -> bool: + """Attempt Prophet training. 
Returns True on success, False otherwise.""" + try: + from prophet import Prophet # type: ignore # noqa: F401 + except ImportError: + logger.debug("prophet not installed — skipping Prophet backend") + return False + + if len(df) < _MIN_TRAINING_POINTS: + logger.info( + f"Too few points ({len(df)}) for Prophet; " + f"need >= {_MIN_TRAINING_POINTS}" + ) + return False + + try: + from prophet import Prophet # type: ignore + + df_p = df[["timestamp", "sentiment_score"]].rename( + columns={"timestamp": "ds", "sentiment_score": "y"} + ) + m = Prophet( + daily_seasonality=len(df) >= 24, + weekly_seasonality=len(df) >= 168, + changepoint_prior_scale=0.05, + interval_width=0.80, + ) + m.fit(df_p) + self._model_24h = m + self._model_48h = m # single Prophet model, different horizons + self._backend = "prophet" + self._is_trained = True + logger.info("SentimentForecaster trained with Prophet backend") + return True + except Exception as exc: + logger.warning(f"Prophet training failed ({exc}); falling back to sklearn") + return False + + # ── sklearn backend ─────────────────────────────────────────────────── + + def _train_sklearn(self, df: pd.DataFrame) -> Dict[str, Any]: + """Train separate Ridge pipelines for the 24 h and 48 h horizons.""" + from sklearn.linear_model import Ridge + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import StandardScaler + + n = len(df) + features, targets_24h, targets_48h = self._build_training_samples(df) + + if len(features) < 2: + self._is_trained = False + self._backend = "heuristic" + logger.warning("Not enough training samples for sklearn; using heuristic") + return {"backend": "heuristic", "n_points": n, "r2_24h": None, "r2_48h": None} + + X = np.array(features) + y24 = np.array(targets_24h) + y48 = np.array(targets_48h) + + pipe24 = Pipeline([("scaler", StandardScaler()), ("ridge", Ridge(alpha=1.0))]) + pipe48 = Pipeline([("scaler", StandardScaler()), ("ridge", Ridge(alpha=1.0))]) + + pipe24.fit(X, y24) + 
pipe48.fit(X, y48) + + r2_24h = float(pipe24.score(X, y24)) + r2_48h = float(pipe48.score(X, y48)) + + self._model_24h = pipe24 + self._model_48h = pipe48 + self._backend = "sklearn" + self._is_trained = True + + logger.info( + f"SentimentForecaster trained with sklearn | " + f"R²_24h={r2_24h:.3f} R²_48h={r2_48h:.3f} n={n}" + ) + return { + "backend": "sklearn", + "n_points": n, + "r2_24h": round(r2_24h, 4), + "r2_48h": round(r2_48h, 4), + } + + @staticmethod + def _build_training_samples( + df: pd.DataFrame, + ) -> Tuple[List[List[float]], List[float], List[float]]: + """ + Build (X, y_24h, y_48h) training arrays. + + For each row *i*, the target is the sentiment_score at the row + closest to T+24 h (resp. T+48 h). When those future rows do not + exist the last available row is used (boundary clamping). + """ + n = len(df) + + # Estimate typical interval between records + if n >= 2: + median_h = float( + df["timestamp"].diff().dropna().dt.total_seconds().median() / 3600.0 + ) + else: + median_h = 1.0 + + step_24h = max(1, round(24.0 / max(median_h, 0.01))) + step_48h = max(1, round(48.0 / max(median_h, 0.01))) + + features: List[List[float]] = [] + targets_24h: List[float] = [] + targets_48h: List[float] = [] + + for i in range(n): + # Rolling 3-row velocity window + w_start = max(0, i - 2) + sub = df.iloc[w_start : i + 1] + if len(sub) >= 2: + ds = sub["sentiment_score"].iloc[-1] - sub["sentiment_score"].iloc[0] + dh = ( + sub["timestamp"].iloc[-1] - sub["timestamp"].iloc[0] + ).total_seconds() / 3600.0 + vel = ds / max(dh, 1e-6) + else: + vel = 0.0 + + row = df.iloc[i] + features.append( + [ + float(i), # time index (captures trend) + float(row["sentiment_score"]), + float(vel), + float(row["positive_pct"]), + float(row["negative_pct"]), + float(row["news_count"]) / 100.0, # rough normalisation + ] + ) + targets_24h.append( + float(df["sentiment_score"].iloc[min(i + step_24h, n - 1)]) + ) + targets_48h.append( + float(df["sentiment_score"].iloc[min(i + step_48h, 
n - 1)]) + ) + + return features, targets_24h, targets_48h + + # ── Prediction ──────────────────────────────────────────────────────── + + def predict(self, df: pd.DataFrame) -> ForecastResult: + """ + Return 24 h and 48 h market trend forecasts. + + Falls back gracefully when the model is not trained or data is sparse. + """ + velocity = self.compute_sentiment_velocity(df) + n = len(df) if df is not None else 0 + + if self._is_trained and self._backend == "prophet": + score_24h, score_48h = self._predict_prophet(df) + elif self._is_trained and self._backend == "sklearn": + score_24h, score_48h = self._predict_sklearn(df, velocity) + else: + score_24h, score_48h = self._predict_heuristic(df, velocity) + + # Keep scores within valid bounds + score_24h = max(-1.0, min(1.0, score_24h)) + score_48h = max(-1.0, min(1.0, score_48h)) + + return ForecastResult( + predicted_trend_24h=_classify_trend(score_24h), + predicted_trend_48h=_classify_trend(score_48h), + confidence_24h=_confidence_from_score(score_24h), + confidence_48h=_confidence_from_score(score_48h), + sentiment_velocity=velocity, + forecast_score_24h=round(score_24h, 4), + forecast_score_48h=round(score_48h, 4), + model_backend=self._backend, + data_points_used=n, + generated_at=datetime.now(timezone.utc).isoformat(), + ) + + def _predict_prophet( + self, df: pd.DataFrame + ) -> Tuple[float, float]: + if self._model_24h is None or df is None or df.empty: + return self._predict_heuristic(df) + + m = self._model_24h + future = m.make_future_dataframe(periods=48, freq="h", include_history=False) + forecast = m.predict(future) + + if len(forecast) >= 48: + return float(forecast["yhat"].iloc[23]), float(forecast["yhat"].iloc[47]) + if len(forecast) >= 24: + return float(forecast["yhat"].iloc[23]), float(forecast["yhat"].iloc[-1]) + if len(forecast) > 0: + val = float(forecast["yhat"].iloc[-1]) + return val, val + + return self._predict_heuristic(df) + + def _predict_sklearn( + self, df: pd.DataFrame, velocity: 
@staticmethod
def _predict_heuristic(
    df: Optional[pd.DataFrame] = None, velocity: float = 0.0
) -> Tuple[float, float]:
    """
    Project the latest sentiment forward using a damped velocity term.

        score(T+h) ≈ current + velocity × Σ_{t=0}^{h-1} decay^t

    Damping keeps the projection bounded when velocity is large and the
    history is short.  With no history at all both horizons are 0.0.

    Returns:
        ``(score_24h, score_48h)``; the values are not clamped here.
    """
    has_history = df is not None and len(df) != 0
    if not has_history:
        return 0.0, 0.0

    latest = float(df["sentiment_score"].iloc[-1])
    decay = 0.85  # 0.85 ** 4 ≈ 0.52, i.e. the velocity's pull halves about every 4 h

    def damped_hours(horizon: int) -> float:
        # Cumulative weight of the velocity term over `horizon` hourly steps.
        return float(sum(decay ** t for t in range(horizon)))

    projected_24h = latest + velocity * damped_hours(24)
    projected_48h = latest + velocity * damped_hours(48)
    return projected_24h, projected_48h
def run(self, jsonl_path: Optional[Path] = None) -> ForecastResult:
    """
    Load history, train once if necessary, and return a forecast.

    The history is loaded on every call via ``load_history``.  Training is
    skipped when the instance is already flagged as trained, so repeated
    calls reuse the existing model.
    """
    history = self.load_history(jsonl_path)
    if self._is_trained:
        return self.predict(history)

    self.train(history)
    return self.predict(history)
# Symbols accepted as tickers once the ticker regex has produced a candidate.
# NOTE(review): "Tether" is a project name rather than a symbol; it is kept
# so the set's contents stay exactly as before.
KNOWN_TICKERS: Set[str] = {
    # Stellar ecosystem
    "XLM", "SDF",
    # Large caps
    "BTC", "ETH", "SOL", "XRP", "ADA", "DOT", "DOGE", "LTC", "BNB",
    # Infrastructure / DeFi
    "LINK", "AVAX", "MATIC", "ALGO", "ATOM", "UNI",
    # Stablecoins
    "USDC", "USDT", "BUSD", "Tether",
}
def __init__(self):
    """Compile the ticker and project-name patterns used by the extract methods."""
    self.ticker_regex = re.compile(TICKER_PATTERN)

    # Longest names first so multi-word names win over their shorter prefixes
    # inside the alternation.
    self.project_names = sorted(CRYPTO_PROJECT_MAP, key=len, reverse=True)

    alternation = "|".join(re.escape(name) for name in self.project_names)
    # Whole-word, case-insensitive match on any known project name.
    self._project_pattern = re.compile(r"\b(" + alternation + r")\b", re.IGNORECASE)
def extract_projects_only(self, text: str) -> List[str]:
    """
    Extract only project names from the given text.

    Every alias found in the text (full name or ticker-like key of
    ``CRYPTO_PROJECT_MAP``) is resolved to the project name stored as the
    last element of its map entry, so "btc", "BTC" and "Bitcoin" all
    yield ``"Bitcoin"``.  Previously the matched text itself was returned
    after ``str.capitalize()``, which produced values such as ``"Btc"`` or
    ``"Stellar development foundation"`` instead of project names.

    Args:
        text: The text to extract project names from.

    Returns:
        A sorted list of unique project names; empty for empty or
        non-string input.
    """
    if not text or not isinstance(text, str):
        return []

    projects: Set[str] = set()

    for match in self._project_pattern.findall(text):
        names = CRYPTO_PROJECT_MAP.get(match.lower())
        if names:
            # Map entries are laid out as [ticker, project name].
            projects.add(names[-1])

    return sorted(projects)
@staticmethod
def _normalize_volume_change(volume_change: float) -> float:
    """
    Squash a fractional volume change into the open interval (-1, 1).

    The hyperbolic tangent is close to linear near zero and saturates for
    large magnitudes, so an extreme volume spike cannot dominate the
    weighted health score.
    """
    import math

    squashed = math.tanh(volume_change)
    return squashed
@classmethod
def analyze_from_sources(
    cls, sentiment_score: float, volume_data: dict
) -> Tuple[Trend, float, dict]:
    """
    Build a ``MarketData`` record from raw inputs and analyse it.

    Args:
        sentiment_score: Sentiment value passed straight through to
            ``MarketData``.
        volume_data: Mapping read via the ``"current"`` and ``"previous"``
            keys; a missing key is treated as 0.

    Returns:
        Whatever ``analyze_trend`` returns for the assembled record.
    """
    latest = volume_data.get("current", 0)
    baseline = volume_data.get("previous", 0)

    # Without a positive baseline a relative change is undefined; report the
    # volume as flat rather than dividing by zero.
    relative_change = (latest - baseline) / baseline if baseline > 0 else 0.0

    return cls.analyze_trend(
        MarketData(
            sentiment_score=sentiment_score,
            volume_change=relative_change,
            current_volume=latest,
            previous_volume=baseline,
        )
    )
def _build_canonical_name_map(self) -> Dict[str, str]:
    """
    Build the lowercase-alias → display-form lookup used for normalisation.

    For every ``CRYPTO_PROJECT_MAP`` entry the key and the entry's last
    value are first pointed at that last value; afterwards each value is
    mapped to itself under its lowercase form.  Later writes overwrite
    earlier ones, so the final mapping depends on this order.
    """
    lookup: Dict[str, str] = {}

    for alias, forms in CRYPTO_PROJECT_MAP.items():
        if not forms:
            continue

        display = forms[-1]
        lookup[alias.lower()] = display
        lookup[display.lower()] = display

        # Each listed form resolves to its own original casing.
        for form in forms:
            lookup[form.lower()] = form

    return lookup
def extract_entities(self, text: str) -> List[str]:
    """
    Extract entities from text.

    Returns a deduplicated list (first-seen order, case-insensitive)
    containing projects, assets, and people.  Input longer than 20000
    characters is truncated before processing.

    Results are memoised per instance.  The previous ``lru_cache``
    decorator on this method stored ``self`` in a class-level cache, which
    kept every instance alive, and returned the *same* list object to all
    callers, so one caller mutating its result corrupted later lookups.
    The memo now lives on the instance, stores immutable tuples, and a
    fresh list is returned on every call.
    """
    if not text or not text.strip():
        return []

    if len(text) > 20000:
        text = text[:20000]

    memo = self.__dict__.setdefault("_entity_cache", {})
    remembered = memo.get(text)
    if remembered is not None:
        return list(remembered)

    wanted_labels = {
        "PERSON",
        "ORG",
        "PRODUCT",
        "NORP",
        "GPE",
        "EVENT",
        "PROJECT",
        "ASSET",
    }

    candidates: List[str] = []
    doc = self._nlp(text)
    candidates.extend(ent.text for ent in doc.ents if ent.label_ in wanted_labels)

    # Heuristic for names when running without a pretrained NER model.
    candidates.extend(self._PERSON_PATTERN.findall(text))

    # Explicit ticker extraction catches tokens that may not be tagged as entities.
    candidates.extend(
        ticker
        for ticker in self._TICKER_PATTERN.findall(text)
        if ticker in self._known_tickers
    )

    deduped: List[str] = []
    seen = set()
    for candidate in candidates:
        normalized = self._normalize_entity(candidate)
        if not normalized:
            continue
        key = normalized.lower()
        if key not in seen:
            seen.add(key)
            deduped.append(normalized)

    # Keep the memo bounded (same size as the former lru_cache) by dropping
    # the oldest entry; dicts preserve insertion order.
    if len(memo) >= 4096:
        try:
            memo.pop(next(iter(memo)), None)
        except (StopIteration, RuntimeError):
            # Another thread changed the dict mid-eviction; skip this round.
            pass
    memo[text] = tuple(deduped)

    return deduped
class SentimentScore(float):
    """
    Float sentiment score enriched with language metadata.

    The object behaves as a plain ``float`` in arithmetic and comparisons
    while also exposing ``language``, ``language_supported`` and
    ``language_unsupported``.  Dict-style access (``score["language"]``,
    ``score.get(...)``) reads from :meth:`to_dict`.
    """

    language: str
    language_supported: bool
    language_unsupported: bool

    def __new__(
        cls,
        value: float,
        language: str,
        language_supported: bool,
        language_unsupported: bool,
    ) -> "SentimentScore":
        instance = float.__new__(cls, value)
        instance.language = language
        instance.language_supported = language_supported
        instance.language_unsupported = language_unsupported
        return instance

    def __getnewargs__(self) -> tuple:
        # copy/deepcopy/pickle rebuild the object through cls.__new__(cls, *args).
        # float's default supplies only the numeric value, which made
        # reconstruction fail with a TypeError for the three missing
        # metadata arguments.
        return (
            float(self),
            self.language,
            self.language_supported,
            self.language_unsupported,
        )

    def to_dict(self) -> dict:
        """Return the score and its language metadata as a plain dict."""
        return {
            "score": float(self),
            "language": self.language,
            "language_supported": self.language_supported,
            "language_unsupported": self.language_unsupported,
        }

    @property
    def score(self) -> float:
        """The numeric score as a built-in ``float``."""
        return float(self)

    def __getitem__(self, key: str):
        # Raises KeyError for names that are not part of to_dict().
        return self.to_dict()[key]

    def get(self, key: str, default=None):
        """Dict-style lookup with a fallback for unknown keys."""
        return self.to_dict().get(key, default)
+ """ + + def __init__( + self, + *, + enable_transformer: Optional[bool] = None, + transformer_model: Optional[str] = None, + ) -> None: + self.analyzer = SentimentIntensityAnalyzer() + self.supported_languages: Set[str] = {"en", "es", "pt"} + + env_off = _env_flag("SENTIMENT_DISABLE_TRANSFORMER") + if enable_transformer is None: + self._transformer_enabled = not env_off + else: + self._transformer_enabled = bool(enable_transformer) and not env_off + + self._transformer_model_name = ( + transformer_model + or os.environ.get("SENTIMENT_TRANSFORMER_MODEL", _DEFAULT_FINBERT_MODEL).strip() + or _DEFAULT_FINBERT_MODEL + ) + + self._transformer_model: Any = None + self._transformer_tokenizer: Any = None + self._transformer_load_failed = False + + self.negative_keywords_en = { + "crash", + "crashing", + "dump", + "bear", + "plunge", + "collapse", + } + self.positive_keywords_en = { + "moon", + "bull", + "surge", + "rally", + "all time high", + "ath", + } + + # Lightweight keyword mapping for non-English sentiment support. 
def _load_transformer(self) -> bool:
    """
    Lazily load the transformer model and tokenizer.

    Returns True when a model is available for inference.  Returns False
    when the transformer path is disabled, when an earlier load attempt
    failed, or when this attempt fails; a failure is remembered so the load
    is not retried.
    """
    usable = self._transformer_enabled and not self._transformer_load_failed
    if not usable:
        return False

    already_loaded = self._transformer_model is not None
    if already_loaded:
        return True

    try:
        # Imported here so the dependency is only needed when actually used.
        from transformers import AutoModelForSequenceClassification, AutoTokenizer

        name = self._transformer_model_name
        self._transformer_tokenizer = AutoTokenizer.from_pretrained(name)
        self._transformer_model = AutoModelForSequenceClassification.from_pretrained(
            name
        )
        self._transformer_model.eval()
        logger.info("Loaded transformer sentiment model: %s", name)
    except Exception as exc:
        logger.warning(
            "Transformer sentiment unavailable, using VADER fallback: %s", exc
        )
        self._transformer_load_failed = True
        return False
    else:
        return True
def _vader_english_compound(self, text: str) -> float:
    """
    Score English text with VADER, using crypto keywords as a tie-breaker.

    The VADER compound score is returned unchanged when it is non-zero.
    When it is exactly 0.0, the lower-cased text is checked against the
    negative and then the positive keyword sets, yielding -0.4 or 0.4 on a
    hit.

    Keywords are matched as whole words, optionally followed by a common
    inflection ("bulls", "bullish", "dumped", "mooning").  The earlier
    plain substring test also fired inside unrelated words, for example
    "ath" in "path" or "death" and "bull" in "bulletin", producing
    spurious ±0.4 scores.
    """
    cleaned = text.lower()
    scores = self.analyzer.polarity_scores(cleaned)
    compound = float(scores.get("compound", 0.0))

    if compound != 0.0:
        return compound

    def mentions(keywords) -> bool:
        # An empty alternation would match at every word boundary.
        if not keywords:
            return False
        ordered = sorted(keywords, key=lambda word: (-len(word), word))
        alternatives = "|".join(re.escape(word) for word in ordered)
        pattern = rf"\b(?:{alternatives})(?:s|es|d|ed|ing|ish)?\b"
        return re.search(pattern, cleaned) is not None

    # Negative keywords take precedence, as before.
    if mentions(self.negative_keywords_en):
        return -0.4
    if mentions(self.positive_keywords_en):
        return 0.4

    return compound
def _detect_script_language(self, text: str) -> Optional[str]:
    """
    Guess the language from the writing system used in *text*.

    Returns "ja", "ko", "zh", "ru" or "ar" when characters from the
    corresponding Unicode block are present, otherwise None.

    Kana and Hangul are tested before Han ideographs: Japanese (and some
    Korean) text routinely mixes Han characters with its own script, so
    testing Han first labelled such text as Chinese.  Text written purely
    in Han characters is still reported as "zh".
    """
    script_ranges = (
        ("ja", r"[\u3040-\u30ff]"),  # Hiragana + Katakana
        ("ko", r"[\uac00-\ud7af]"),  # Hangul syllables
        ("zh", r"[\u4e00-\u9fff]"),  # CJK unified ideographs
        ("ru", r"[\u0400-\u04ff]"),  # Cyrillic
        ("ar", r"[\u0600-\u06ff]"),  # Arabic
    )
    for language, pattern in script_ranges:
        if re.search(pattern, text):
            return language
    return None
+ +Color scheme +------------ +* Bullish (score >= 0.05) → Green #00C853 +* Bearish (score <= -0.05) → Red #D50000 +* Neutral (-0.05 < score < 0.05) → Gray #9E9E9E + +Thresholds match the VADER compound-score cut-offs already used across the +project (see src/sentiment.py). +""" + +from __future__ import annotations + +from dataclasses import dataclass +from enum import Enum +from typing import List, Dict, Any + + +# --------------------------------------------------------------------------- +# Thresholds (matching VADER cut-offs used in src/sentiment.py) +# --------------------------------------------------------------------------- +BULLISH_THRESHOLD: float = 0.05 +BEARISH_THRESHOLD: float = -0.05 + + +# --------------------------------------------------------------------------- +# Enums +# --------------------------------------------------------------------------- + +class SentimentColor(str, Enum): + """Canonical color names for sentiment categories.""" + + GREEN = "green" + RED = "red" + GRAY = "gray" + + +class SentimentLabel(str, Enum): + """Human-readable trading labels for sentiment categories.""" + + BULLISH = "Bullish" + BEARISH = "Bearish" + NEUTRAL = "Neutral" + + +# --------------------------------------------------------------------------- +# Dataclass +# --------------------------------------------------------------------------- + +@dataclass(frozen=True) +class SentimentIndicator: + """ + Full visual indicator for a single sentiment score. + + Attributes + ---------- + score: Original compound sentiment score (-1 to 1). + color: Semantic color name ("green" | "red" | "gray"). + hex_color: CSS hex colour value ("#00C853" | "#D50000" | "#9E9E9E"). + label: Human-readable label ("Bullish" | "Bearish" | "Neutral"). + display_text: Formatted string for UI badges, e.g. "0.85 Bullish". 
class SentimentIndicatorMapper:
    """
    Converts a raw sentiment score into a :class:`SentimentIndicator` ready
    for serialisation to the API response.

    Usage
    -----
    >>> mapper = SentimentIndicatorMapper()
    >>> indicator = mapper.score_to_indicator(0.82)
    >>> indicator.color
    <SentimentColor.GREEN: 'green'>
    >>> indicator.label
    <SentimentLabel.BULLISH: 'Bullish'>
    >>> indicator.display_text
    '0.82 Bullish'
    """

    # Hex values chosen for accessibility contrast on both dark and light UIs
    _HEX: Dict[SentimentColor, str] = {
        SentimentColor.GREEN: "#00C853",
        SentimentColor.RED: "#D50000",
        SentimentColor.GRAY: "#9E9E9E",
    }

    # Single source of truth for the label -> colour pairing.
    _COLOR_BY_LABEL: Dict[SentimentLabel, SentimentColor] = {
        SentimentLabel.BULLISH: SentimentColor.GREEN,
        SentimentLabel.BEARISH: SentimentColor.RED,
        SentimentLabel.NEUTRAL: SentimentColor.GRAY,
    }

    # Legend copy consumed by GET /sentiment/legend
    _LEGEND: List[Dict[str, str]] = [
        {
            "color": SentimentColor.GREEN.value,
            "hex_color": _HEX[SentimentColor.GREEN],
            "label": SentimentLabel.BULLISH.value,
            "description": (
                f"Positive sentiment (score ≥ {BULLISH_THRESHOLD:+.2f}). "
                "The market or news is generally optimistic about this asset."
            ),
            "score_range": f"≥ {BULLISH_THRESHOLD}",
        },
        {
            "color": SentimentColor.RED.value,
            "hex_color": _HEX[SentimentColor.RED],
            "label": SentimentLabel.BEARISH.value,
            "description": (
                f"Negative sentiment (score ≤ {BEARISH_THRESHOLD:+.2f}). "
                "The market or news is generally pessimistic about this asset."
            ),
            "score_range": f"≤ {BEARISH_THRESHOLD}",
        },
        {
            "color": SentimentColor.GRAY.value,
            "hex_color": _HEX[SentimentColor.GRAY],
            "label": SentimentLabel.NEUTRAL.value,
            "description": (
                f"Neutral sentiment ({BEARISH_THRESHOLD:+.2f} < score < "
                f"{BULLISH_THRESHOLD:+.2f}). Insufficient signal to determine"
                " market direction."
            ),
            "score_range": f"{BEARISH_THRESHOLD} to {BULLISH_THRESHOLD}",
        },
    ]

    @staticmethod
    def _classify(score: float) -> SentimentLabel:
        """Return the label whose threshold band contains *score*.

        Any value that satisfies neither threshold comparison (including
        NaN, for which both comparisons are false) is labelled Neutral.
        """
        if score >= BULLISH_THRESHOLD:
            return SentimentLabel.BULLISH
        if score <= BEARISH_THRESHOLD:
            return SentimentLabel.BEARISH
        return SentimentLabel.NEUTRAL

    def score_to_indicator(self, score: float) -> SentimentIndicator:
        """
        Map a compound sentiment score to a :class:`SentimentIndicator`.

        Parameters
        ----------
        score:
            Compound sentiment score, nominally in the range [-1, 1].  The
            value is converted with ``float()`` and stored unchanged: scores
            outside the nominal range are NOT clamped, they simply receive
            the Bullish/Bearish label of the side they fall on.

        Returns
        -------
        SentimentIndicator

        Raises
        ------
        TypeError, ValueError
            If *score* cannot be converted to ``float``.
        """
        score = float(score)

        label = self._classify(score)
        color = self._COLOR_BY_LABEL[label]

        return SentimentIndicator(
            score=score,
            color=color,
            hex_color=self._HEX[color],
            label=label,
            display_text=self.format_display(score, label),
        )

    @staticmethod
    def format_display(score: float, label: SentimentLabel | None = None) -> str:
        """
        Return a formatted display string such as ``"0.85 Bullish"``.

        The score is rendered with two decimals.  If *label* is not supplied
        it is derived from *score* with the same thresholds used by
        :meth:`score_to_indicator`.
        """
        if label is None:
            label = SentimentIndicatorMapper._classify(score)
        return f"{score:.2f} {label.value}"

    def get_legend(self) -> List[Dict[str, str]]:
        """
        Return the legend definition that the frontend uses to render
        colour-key tooltips.

        Every entry is copied, so callers may freely mutate the returned
        list and its dictionaries without corrupting the shared class-level
        legend.

        Returns
        -------
        list of dict with keys: color, hex_color, label, description, score_range
        """
        return [dict(entry) for entry in self._LEGEND]
@dataclass
class AnomalyResult:
    """Outcome of a single univariate (Z-score) anomaly check."""

    is_anomaly: bool
    severity_score: float  # 0.0 - 1.0
    metric_name: str
    current_value: float
    baseline_mean: float
    baseline_std: float
    z_score: float
    timestamp: datetime
    ml_anomaly_score: Optional[float] = None  # Isolation Forest anomaly score
    ml_is_anomaly: Optional[bool] = None  # Isolation Forest prediction

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view with the timestamp as an ISO-8601 string.

        The two ``ml_*`` keys are emitted together, and only when
        ``ml_anomaly_score`` is set.
        """
        plain_fields = (
            "is_anomaly",
            "severity_score",
            "metric_name",
            "current_value",
            "baseline_mean",
            "baseline_std",
            "z_score",
        )
        payload: Dict[str, Any] = {name: getattr(self, name) for name in plain_fields}
        payload["timestamp"] = self.timestamp.isoformat()

        if self.ml_anomaly_score is None:
            return payload

        payload.update(
            ml_anomaly_score=self.ml_anomaly_score,
            ml_is_anomaly=self.ml_is_anomaly,
        )
        return payload
class IsolationForestDetector:
    """
    ML-based anomaly detector using the Isolation Forest algorithm.

    Scores (volume, sentiment) observations jointly, together with their
    rates of change, so it can flag combinations that univariate checks miss.
    The model must be trained (see :meth:`train`) before
    :meth:`detect_anomaly` returns results.
    """

    DEFAULT_CONTAMINATION = 0.1  # Expected proportion of anomalies (10%)
    DEFAULT_N_ESTIMATORS = 100
    DEFAULT_MAX_SAMPLES = 'auto'
    DEFAULT_FEATURES = ['volume', 'sentiment', 'volume_change_rate', 'sentiment_change_rate']

    # Automatic retraining policy used by add_training_point().
    AUTO_RETRAIN_MIN_POINTS = 200   # buffer size required before retraining
    AUTO_RETRAIN_INTERVAL = 50      # new points between two retrains

    def __init__(
        self,
        contamination: Optional[float] = None,
        n_estimators: Optional[int] = None,
        max_samples: Union[str, int] = 'auto',
        random_state: int = 42,
        feature_columns: Optional[List[str]] = None
    ):
        """
        Initialize Isolation Forest detector.

        Args:
            contamination: Expected proportion of anomalies (0.0 to 0.5)
            n_estimators: Number of base estimators in the ensemble
            max_samples: Number of samples to draw for training
            random_state: Random seed for reproducibility
            feature_columns: List of feature names to use
        """
        # Falsy values (None, 0) fall back to the defaults; 0 is not a valid
        # setting for either parameter.
        self.contamination = contamination or self.DEFAULT_CONTAMINATION
        self.n_estimators = n_estimators or self.DEFAULT_N_ESTIMATORS
        self.max_samples = max_samples
        self.random_state = random_state
        # Copy so that later mutation cannot leak into the class-level default
        # or into the caller's list.
        self.feature_columns = list(feature_columns or self.DEFAULT_FEATURES)

        self.model = IsolationForest(
            contamination=self.contamination,
            n_estimators=self.n_estimators,
            max_samples=self.max_samples,
            random_state=self.random_state,
            verbose=0
        )

        self.is_trained = False
        self.training_data = deque(maxlen=1000)  # Store recent data for retraining
        self.min_training_samples = 50  # Minimum samples needed for training
        self._points_since_training = 0  # drives the auto-retrain cadence

        logger.info(
            f"IsolationForestDetector initialized with contamination={self.contamination}, "
            f"n_estimators={self.n_estimators}, features={self.feature_columns}"
        )

    def _compute_features(
        self,
        volume: float,
        sentiment: float,
        volume_history: Optional[List[float]] = None,
        sentiment_history: Optional[List[float]] = None
    ) -> Dict[str, float]:
        """Compute every known feature by name for one observation."""
        features: Dict[str, float] = {'volume': volume, 'sentiment': sentiment}

        # Rate of change relative to the most recent historical value; the
        # epsilon avoids division by zero and the clip caps extreme ratios.
        if volume_history and len(volume_history) >= 2:
            volume_change_rate = (volume - volume_history[-1]) / (volume_history[-1] + 1e-10)
            features['volume_change_rate'] = float(np.clip(volume_change_rate, -10, 10))
        else:
            features['volume_change_rate'] = 0.0

        if sentiment_history and len(sentiment_history) >= 2:
            sentiment_change_rate = (sentiment - sentiment_history[-1]) / (abs(sentiment_history[-1]) + 1e-10)
            features['sentiment_change_rate'] = float(np.clip(sentiment_change_rate, -5, 5))
        else:
            features['sentiment_change_rate'] = 0.0

        # Interaction feature (volume * sentiment) - captures pump-and-dump patterns.
        # Sentiment is shifted by +1 to move it into the positive range.
        features['volume_sentiment_product'] = volume * (sentiment + 1)
        return features

    def _vectorize(self, features: Dict[str, float]) -> np.ndarray:
        """Lay out *features* in ``feature_columns`` order as a (1, n) array.

        A configured name that is not a known feature becomes 0.0 in its own
        column, so the remaining columns keep their configured positions.
        """
        row = [features.get(name, 0.0) for name in self.feature_columns]
        return np.array(row, dtype=float).reshape(1, -1)

    def _extract_features(
        self,
        volume: float,
        sentiment: float,
        volume_history: Optional[List[float]] = None,
        sentiment_history: Optional[List[float]] = None
    ) -> np.ndarray:
        """
        Extract feature vector for anomaly detection.

        Args:
            volume: Current volume value
            sentiment: Current sentiment value
            volume_history: Historical volume values for rate calculation
            sentiment_history: Historical sentiment values for rate calculation

        Returns:
            Feature vector as numpy array of shape (1, len(feature_columns))
        """
        return self._vectorize(
            self._compute_features(volume, sentiment, volume_history, sentiment_history)
        )

    def train(self, historical_data: List[Dict[str, float]]) -> bool:
        """
        Train the Isolation Forest model on historical data.

        Args:
            historical_data: List of dictionaries containing historical data points
                           each with 'volume' and 'sentiment' keys at minimum

        Returns:
            bool: True if training successful, False otherwise
        """
        if len(historical_data) < self.min_training_samples:
            logger.warning(
                f"Insufficient data for training: {len(historical_data)}/{self.min_training_samples}"
            )
            return False

        points = list(historical_data)
        rows = []
        for i, point in enumerate(points):
            # Up to five preceding points give the context for rate features.
            context = points[max(0, i - 5):i]
            rows.append(
                self._extract_features(
                    point['volume'],
                    point['sentiment'],
                    [p['volume'] for p in context],
                    [p['sentiment'] for p in context],
                ).flatten()
            )

        X_train = np.array(rows)

        try:
            self.model.fit(X_train)
        except Exception as e:
            logger.error(f"Failed to train Isolation Forest: {e}")
            return False

        self.is_trained = True
        self._points_since_training = 0
        logger.info(f"Isolation Forest trained successfully on {len(historical_data)} samples")
        return True

    def detect_anomaly(
        self,
        volume: float,
        sentiment: float,
        volume_history: Optional[List[float]] = None,
        sentiment_history: Optional[List[float]] = None
    ) -> Optional[MultiDimensionalAnomalyResult]:
        """
        Detect anomaly in the current data point.

        Args:
            volume: Current volume
            sentiment: Current sentiment
            volume_history: Historical volumes for context (should not include
                the current point, otherwise the change rate is always zero)
            sentiment_history: Historical sentiments for context

        Returns:
            MultiDimensionalAnomalyResult if model is trained, None otherwise
        """
        if not self.is_trained:
            logger.debug("Isolation Forest not trained yet, skipping detection")
            return None

        computed = self._compute_features(volume, sentiment, volume_history, sentiment_history)
        features = self._vectorize(computed)

        # predict() yields -1 for anomaly, 1 for normal; score_samples() is
        # lower for more anomalous points.
        prediction = self.model.predict(features)[0]
        anomaly_score = float(self.model.score_samples(features)[0])

        # Native bool/float so the result stays JSON-serialisable.
        is_anomaly = bool(prediction == -1)

        # Map the typical score range (-0.5 to 0) onto a 0-1 severity.
        severity_score = float(np.clip(-anomaly_score * 2, 0, 1)) if is_anomaly else 0.0

        if is_anomaly:
            ANOMALIES_DETECTED_TOTAL.labels(metric_name="ml_multi_dimensional").inc()
            logger.info(f"ML anomaly detected! Score: {anomaly_score:.3f}, Severity: {severity_score:.3f}")

        # Report by feature name rather than by column position, so the values
        # stay correct whatever feature_columns is configured to.
        feature_values = {
            name: float(computed[name])
            for name in ('volume', 'sentiment', 'volume_change_rate', 'sentiment_change_rate')
        }

        return MultiDimensionalAnomalyResult(
            is_anomaly=is_anomaly,
            anomaly_score=anomaly_score,
            severity_score=severity_score,
            features_used=list(self.feature_columns),
            feature_values=feature_values,
            timestamp=datetime.utcnow()
        )

    def add_training_point(self, volume: float, sentiment: float):
        """
        Add a data point to the training buffer for future retraining.

        Once the buffer holds at least ``AUTO_RETRAIN_MIN_POINTS`` points the
        model is retrained after every ``AUTO_RETRAIN_INTERVAL`` new points.
        The cadence is counted explicitly because the buffer length stops
        growing once the deque reaches its ``maxlen``.

        Args:
            volume: Volume value
            sentiment: Sentiment value
        """
        self.training_data.append({
            'volume': volume,
            'sentiment': sentiment,
            'timestamp': datetime.utcnow()
        })
        self._points_since_training += 1

        if (
            len(self.training_data) >= self.AUTO_RETRAIN_MIN_POINTS
            and self._points_since_training >= self.AUTO_RETRAIN_INTERVAL
        ):
            # Reset first so a failed fit is not retried on every new point.
            self._points_since_training = 0
            self.train(list(self.training_data))

    def save_model(self, filepath: str):
        """Save the trained model and its configuration to disk.

        Writes the model to *filepath* and a JSON sidecar to
        ``<filepath>.config.json``.  Does nothing if the model is untrained.
        """
        if not self.is_trained:
            logger.debug("Model not trained yet, nothing to save")
            return

        joblib.dump(self.model, filepath)
        config = {
            'contamination': self.contamination,
            'n_estimators': self.n_estimators,
            'max_samples': self.max_samples,
            'feature_columns': self.feature_columns
        }
        with open(f"{filepath}.config.json", 'w') as f:
            json.dump(config, f)
        logger.info(f"Model saved to {filepath}")

    def load_model(self, filepath: str) -> bool:
        """Load a trained model from disk.

        If the ``<filepath>.config.json`` sidecar written by
        :meth:`save_model` exists, the detector's configuration (notably
        ``feature_columns``) is restored from it so feature vectors match the
        loaded model.

        Returns:
            bool: True if a model file was found and loaded, False otherwise
        """
        if not os.path.exists(filepath):
            return False

        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code - only load model files from trusted locations.
        self.model = joblib.load(filepath)
        self.is_trained = True
        self._points_since_training = 0

        config_path = f"{filepath}.config.json"
        if os.path.exists(config_path):
            try:
                with open(config_path) as f:
                    config = json.load(f)
            except (OSError, ValueError) as e:
                logger.warning(f"Could not read model config {config_path}: {e}")
            else:
                self.contamination = config.get('contamination', self.contamination)
                self.n_estimators = config.get('n_estimators', self.n_estimators)
                self.max_samples = config.get('max_samples', self.max_samples)
                self.feature_columns = list(config.get('feature_columns') or self.feature_columns)

        logger.info(f"Model loaded from {filepath}")
        return True
class AnomalyDetector:
    """
    Statistical anomaly detector using Z-Score methodology to identify outliers
    in time-series data for trade volume and social sentiment metrics.

    Optionally combined with an Isolation Forest for multi-dimensional
    anomaly detection.
    """

    # Default configuration
    DEFAULT_WINDOW_SIZE_HOURS = 24
    DEFAULT_Z_THRESHOLD = 2.5  # Standard deviations from mean
    MIN_DATA_POINTS = 10  # Minimum data points required for reliable statistics
    DEFAULT_USE_ML = True  # Enable ML-based detection by default
    DEFAULT_ML_CONTAMINATION = 0.1  # 10% expected anomalies

    def __init__(
        self,
        window_size_hours: Optional[int] = None,
        z_threshold: Optional[float] = None,
        use_ml: Optional[bool] = None,
        ml_contamination: Optional[float] = None,
        enable_comparison_mode: bool = False
    ):
        """
        Initialize the anomaly detector.

        Args:
            window_size_hours: Size of rolling window in hours (default: 24)
            z_threshold: Z-score threshold for anomaly detection (default: 2.5)
            use_ml: Enable Isolation Forest for multi-dimensional detection
            ml_contamination: Expected proportion of anomalies for ML model
            enable_comparison_mode: Run both Z-score and ML and compare results
        """
        # Falsy values (None, 0) fall back to the defaults.
        self.window_size_hours = window_size_hours or self.DEFAULT_WINDOW_SIZE_HOURS
        self.z_threshold = z_threshold or self.DEFAULT_Z_THRESHOLD
        self.use_ml = use_ml if use_ml is not None else self.DEFAULT_USE_ML
        self.enable_comparison_mode = enable_comparison_mode

        # Rolling windows; capacity is four points per window hour.
        capacity = self.window_size_hours * 4
        self.volume_data = deque(maxlen=capacity)
        self.sentiment_data = deque(maxlen=capacity)
        self.timestamp_data = deque(maxlen=capacity)

        # Initialize ML detector if enabled
        self.ml_detector = None
        if self.use_ml:
            self.ml_detector = IsolationForestDetector(
                contamination=ml_contamination or self.DEFAULT_ML_CONTAMINATION
            )

        # Historical storage for ML training
        self.historical_points = []

        logger.info(
            f"AnomalyDetector initialized with {self.window_size_hours}h window, "
            f"Z-threshold: {self.z_threshold}, ML-enabled: {self.use_ml}, "
            f"Comparison mode: {self.enable_comparison_mode}"
        )

    def _calculate_statistics(self, data_points: List[float]) -> Tuple[float, float]:
        """
        Calculate mean and sample standard deviation for a list of data points.

        Args:
            data_points: List of numerical values

        Returns:
            Tuple of (mean, standard_deviation); a zero deviation is replaced
            by 1e-10 so it can be used as a divisor.

        Raises:
            ValueError: If fewer than MIN_DATA_POINTS values are supplied.
        """
        if len(data_points) < self.MIN_DATA_POINTS:
            raise ValueError(
                f"Need at least {self.MIN_DATA_POINTS} data points for reliable statistics"
            )

        mean = np.mean(data_points)
        std = np.std(data_points, ddof=1)

        if std == 0:
            std = 1e-10

        return float(mean), float(std)

    def _calculate_z_score(self, value: float, mean: float, std: float) -> float:
        """Calculate Z-score for a value given mean and standard deviation."""
        return (value - mean) / std

    def _calculate_severity_score(self, z_score: float) -> float:
        """
        Convert Z-score to severity score (0.0-1.0).

        0.0 up to the threshold, linear between one and two thresholds,
        1.0 beyond twice the threshold.
        """
        abs_z = abs(z_score)

        if abs_z <= self.z_threshold:
            return 0.0
        if abs_z <= self.z_threshold * 2:
            return (abs_z - self.z_threshold) / self.z_threshold
        return 1.0

    def _clean_old_data(self, current_timestamp: datetime):
        """Remove data points older than the window size."""
        cutoff_time = current_timestamp - timedelta(hours=self.window_size_hours)

        while self.timestamp_data and self.timestamp_data[0] < cutoff_time:
            self.timestamp_data.popleft()
            if self.volume_data:
                self.volume_data.popleft()
            if self.sentiment_data:
                self.sentiment_data.popleft()

    def add_data_point(
        self, volume: float, sentiment_score: float, timestamp: Optional[datetime] = None
    ):
        """Add a new data point to the rolling window and the ML buffers.

        Also trains the ML detector the first time enough history is
        available.
        """
        if timestamp is None:
            timestamp = datetime.utcnow()

        self._clean_old_data(timestamp)

        self.timestamp_data.append(timestamp)
        self.volume_data.append(float(volume))
        self.sentiment_data.append(float(sentiment_score))

        # Store for ML training
        self.historical_points.append({
            'volume': float(volume),
            'sentiment': float(sentiment_score),
            'timestamp': timestamp
        })

        # Keep only last 1000 points
        if len(self.historical_points) > 1000:
            self.historical_points = self.historical_points[-1000:]

        if self.ml_detector:
            # First training happens as soon as enough points have accumulated.
            if (
                not self.ml_detector.is_trained
                and len(self.historical_points) >= self.ml_detector.min_training_samples
            ):
                self.ml_detector.train(self.historical_points)

            # Feed the detector's own buffer (drives its periodic retraining).
            self.ml_detector.add_training_point(float(volume), float(sentiment_score))

        logger.debug(f"Added data point: volume={volume}, sentiment={sentiment_score}")

    def _detect_univariate(
        self,
        metric_name: str,
        current_value: float,
        window: deque,
        timestamp: Optional[datetime],
    ) -> AnomalyResult:
        """Z-score check of *current_value* against the values in *window*.

        Returns a non-anomalous result with zeroed statistics when the window
        holds fewer than MIN_DATA_POINTS values or when the computation fails.
        """
        if timestamp is None:
            timestamp = datetime.utcnow()

        neutral = AnomalyResult(
            is_anomaly=False,
            severity_score=0.0,
            metric_name=metric_name,
            current_value=current_value,
            baseline_mean=0.0,
            baseline_std=0.0,
            z_score=0.0,
            timestamp=timestamp,
        )

        try:
            baseline_values = list(window)
            if len(baseline_values) < self.MIN_DATA_POINTS:
                return neutral

            mean, std = self._calculate_statistics(baseline_values)
            z_score = self._calculate_z_score(current_value, mean, std)
            severity = self._calculate_severity_score(z_score)
            is_anomaly = abs(z_score) > self.z_threshold

            if is_anomaly:
                ANOMALIES_DETECTED_TOTAL.labels(metric_name=metric_name).inc()

            return AnomalyResult(
                is_anomaly=is_anomaly,
                severity_score=severity,
                metric_name=metric_name,
                current_value=current_value,
                baseline_mean=mean,
                baseline_std=std,
                z_score=z_score,
                timestamp=timestamp,
            )
        except Exception as e:
            # Best-effort by design: detection errors must not break callers.
            logger.error(f"Error detecting {metric_name} anomaly: {e}")
            return neutral

    def detect_volume_anomaly(
        self, current_volume: float, timestamp: Optional[datetime] = None
    ) -> AnomalyResult:
        """Detect anomalies in trade volume data."""
        return self._detect_univariate("volume", current_volume, self.volume_data, timestamp)

    def detect_sentiment_anomaly(
        self, current_sentiment: float, timestamp: Optional[datetime] = None
    ) -> AnomalyResult:
        """Detect anomalies in social sentiment data."""
        return self._detect_univariate(
            "sentiment", current_sentiment, self.sentiment_data, timestamp
        )

    def detect_multi_dimensional_anomaly(
        self,
        volume: float,
        sentiment: float,
        timestamp: Optional[datetime] = None,
        volume_history: Optional[List[float]] = None,
        sentiment_history: Optional[List[float]] = None,
    ) -> Optional[MultiDimensionalAnomalyResult]:
        """
        Detect anomalies using Isolation Forest (multi-dimensional).

        Args:
            volume: Current trade volume
            sentiment: Current sentiment score
            timestamp: Accepted for signature symmetry; not used here
            volume_history: Volumes preceding the current point.  Defaults to
                the last 10 values of the rolling window - pass it explicitly
                when the current point is already in the window, otherwise
                the change-rate features collapse to zero.
            sentiment_history: Same as above, for sentiment.

        Returns:
            MultiDimensionalAnomalyResult or None if ML not enabled/trained
        """
        if not self.ml_detector or not self.ml_detector.is_trained:
            return None

        if volume_history is None:
            volume_history = list(self.volume_data)[-10:]
        if sentiment_history is None:
            sentiment_history = list(self.sentiment_data)[-10:]

        return self.ml_detector.detect_anomaly(
            volume, sentiment, volume_history, sentiment_history
        )

    def detect_anomalies(
        self, volume: float, sentiment_score: float, timestamp: Optional[datetime] = None
    ) -> Dict[str, Any]:
        """
        Detect anomalies for both volume and sentiment simultaneously,
        plus the ML-based multi-dimensional check when enabled.

        The point is appended to the rolling window before the Z-score
        checks run, so the univariate baseline includes the current value.

        Args:
            volume: Current trade volume
            sentiment_score: Current sentiment score
            timestamp: Timestamp of current data point

        Returns:
            Dictionary containing all anomaly detection results
        """
        if timestamp is None:
            timestamp = datetime.utcnow()

        # Snapshot the preceding values BEFORE the current point is appended:
        # the ML change-rate features compare against the previous point, the
        # same way they are built during training.
        prior_volumes = list(self.volume_data)[-10:]
        prior_sentiments = list(self.sentiment_data)[-10:]

        self.add_data_point(volume, sentiment_score, timestamp)

        # Detect univariate anomalies
        volume_result = self.detect_volume_anomaly(volume, timestamp)
        sentiment_result = self.detect_sentiment_anomaly(sentiment_score, timestamp)

        results = {
            'volume_anomaly': volume_result,
            'sentiment_anomaly': sentiment_result,
            'timestamp': timestamp,
            'ml_anomaly': None
        }

        univariate_hit = volume_result.is_anomaly or sentiment_result.is_anomaly
        ml_result = None  # stays defined even when ML is switched off

        if self.use_ml:
            ml_result = self.detect_multi_dimensional_anomaly(
                volume,
                sentiment_score,
                timestamp,
                volume_history=prior_volumes,
                sentiment_history=prior_sentiments,
            )
            results['ml_anomaly'] = ml_result

            if ml_result and ml_result.is_anomaly:
                if not univariate_hit:
                    # Log when ML detects something Z-score might miss
                    logger.warning(
                        f"ML detected multi-dimensional anomaly missed by univariate methods! "
                        f"Volume: {volume:.2f}, Sentiment: {sentiment_score:.3f}, "
                        f"ML Score: {ml_result.anomaly_score:.3f}"
                    )
                else:
                    # Multiple methods agree: report the strongest severity.
                    results['combined_severity'] = max(
                        volume_result.severity_score,
                        sentiment_result.severity_score,
                        ml_result.severity_score
                    )
                    results['is_anomaly_consensus'] = True

        # Comparison mode: report how the two approaches relate
        if self.enable_comparison_mode and self.ml_detector and self.ml_detector.is_trained:
            results['comparison'] = self._compare_detection_methods(
                volume_result, sentiment_result, ml_result
            )

        return results

    def _compare_detection_methods(
        self,
        volume_result: AnomalyResult,
        sentiment_result: AnomalyResult,
        ml_result: Optional[MultiDimensionalAnomalyResult]
    ) -> Dict[str, Any]:
        """
        Compare the Z-score verdict with the Isolation Forest verdict.
        """
        z_score_anomaly = volume_result.is_anomaly or sentiment_result.is_anomaly
        ml_anomaly = ml_result.is_anomaly if ml_result else False

        comparison = {
            'z_score_detected': z_score_anomaly,
            'ml_detected': ml_anomaly,
            'agreement': z_score_anomaly == ml_anomaly,
            'z_score_severity': max(volume_result.severity_score, sentiment_result.severity_score),
            'ml_severity': ml_result.severity_score if ml_result else 0.0,
        }

        # Analysis of detection differences
        if z_score_anomaly and not ml_anomaly:
            comparison['analysis'] = "Z-score detected anomaly but ML didn't - possible false positive from simple outlier"
        elif not z_score_anomaly and ml_anomaly:
            comparison['analysis'] = "ML detected complex multi-dimensional anomaly missed by univariate Z-score"
        elif z_score_anomaly and ml_anomaly:
            comparison['analysis'] = "Both methods agree - high confidence anomaly detected"
        else:
            comparison['analysis'] = "No anomaly detected by either method"

        return comparison

    @staticmethod
    def _summarize(values: List[float]) -> Dict[str, Any]:
        """Descriptive statistics for a non-empty list of values.

        The sample standard deviation is undefined for a single value, so it
        is reported as 0.0 instead of NaN in that case.
        """
        return {
            "count": len(values),
            "mean": float(np.mean(values)),
            "std": float(np.std(values, ddof=1)) if len(values) > 1 else 0.0,
            "min": float(np.min(values)),
            "max": float(np.max(values)),
        }

    def get_window_stats(self) -> Dict[str, Any]:
        """Get current window statistics for monitoring/debugging."""
        volume_list = list(self.volume_data)
        sentiment_list = list(self.sentiment_data)

        stats = {
            "window_size_hours": self.window_size_hours,
            "z_threshold": self.z_threshold,
            "data_points_count": len(self.timestamp_data),
            "use_ml": self.use_ml,
            "volume_stats": self._summarize(volume_list) if volume_list else {},
            "sentiment_stats": self._summarize(sentiment_list) if sentiment_list else {},
        }

        # Add ML stats if available
        if self.ml_detector:
            stats["ml"] = {
                "is_trained": self.ml_detector.is_trained,
                "contamination": self.ml_detector.contamination,
                "training_samples": len(self.ml_detector.training_data),
                "features": self.ml_detector.feature_columns
            }

        return stats

    def reset(self):
        """Reset the detector by clearing all stored data.

        The ML detector, if any, is replaced by a fresh untrained one with
        the same contamination setting.
        """
        self.volume_data.clear()
        self.sentiment_data.clear()
        self.timestamp_data.clear()
        self.historical_points.clear()
        if self.ml_detector:
            self.ml_detector = IsolationForestDetector(
                contamination=self.ml_detector.contamination
            )
        logger.info("AnomalyDetector reset completed")

    def save_ml_model(self, filepath: str):
        """Save the ML model to disk (no-op when ML is disabled)."""
        if self.ml_detector:
            self.ml_detector.save_model(filepath)

    def load_ml_model(self, filepath: str) -> bool:
        """Load a pre-trained ML model; returns False when ML is disabled."""
        if self.ml_detector:
            return self.ml_detector.load_model(filepath)
        return False
float = 2.5, + use_ml: bool = True, + ml_contamination: float = 0.1, + enable_comparison_mode: bool = False +) -> AnomalyDetector: + """ + Factory function to create an AnomalyDetector instance. + + Args: + window_size_hours: Size of rolling window in hours + z_threshold: Z-score threshold for anomaly detection + use_ml: Enable ML-based multi-dimensional detection + ml_contamination: Expected proportion of anomalies (0.0-0.5) + enable_comparison_mode: Compare Z-score vs ML performance + + Returns: + Configured AnomalyDetector instance + """ + return AnomalyDetector( + window_size_hours=window_size_hours, + z_threshold=z_threshold, + use_ml=use_ml, + ml_contamination=ml_contamination, + enable_comparison_mode=enable_comparison_mode + ) + + +def detect_spike( + current_value: float, baseline_values: List[float], z_threshold: float = 2.5 +) -> Tuple[bool, float]: + """ + Simple spike detection for a single value against baseline. + + Args: + current_value: Value to test + baseline_values: Historical baseline values + z_threshold: Z-score threshold for anomaly detection + + Returns: + Tuple of (is_anomaly, severity_score) + """ + if len(baseline_values) < 10: + return False, 0.0 + + detector = AnomalyDetector(z_threshold=z_threshold, use_ml=False) + + dummy_timestamp = datetime.utcnow() + for value in baseline_values: + detector.add_data_point(value, 0.0, dummy_timestamp) + + result = detector.detect_volume_anomaly(current_value, dummy_timestamp) + return result.is_anomaly, result.severity_score \ No newline at end of file diff --git a/temp_backup/src/api/ingestion_quality_routes.py b/temp_backup/src/api/ingestion_quality_routes.py new file mode 100644 index 00000000..d67b6cac --- /dev/null +++ b/temp_backup/src/api/ingestion_quality_routes.py @@ -0,0 +1,59 @@ +"""FastAPI routes for triggering ingestion quality checks.""" + +from __future__ import annotations + +from typing import Any, Dict, Optional +from datetime import datetime + +from fastapi import APIRouter, 
Depends +from pydantic import BaseModel + +from src.ingestion.stellar_ingestion_checks import run_all_checks + + +router = APIRouter() + + +class IngestionQualityRunRequest(BaseModel): + network: str = "testnet" # "testnet" only in MVP + asset: str = "XLM" + ingestion_lag_seconds: int = 300 + duplicate_window_hours: int = 24 + drift_compare_window_hours: int = 24 + drift_ratio_threshold: float = 0.05 + drift_hours: Optional[str] = "24,48" # comma-separated + manual_run_id: Optional[str] = None + + +class IngestionQualityRunResponse(BaseModel): + schema_version: int + generated_at: str + network: str + asset: str + manual_run_id: Optional[str] = None + thresholds: Dict[str, Any] + summary: Dict[str, Any] + findings: list[Dict[str, Any]] + exit_code: int + + +@router.post("/ingestion/quality/run", response_model=IngestionQualityRunResponse) +async def run_ingestion_quality(req: IngestionQualityRunRequest) -> IngestionQualityRunResponse: + hours_list = [int(x.strip()) for x in (req.drift_hours or "").split(",") if x.strip()] + if not hours_list: + hours_list = [24, 48] + + result = run_all_checks( + network=req.network, + asset=req.asset.upper(), + ingestion_lag_seconds=req.ingestion_lag_seconds, + dup_window_hours=req.duplicate_window_hours, + drift_compare_window_hours=req.drift_compare_window_hours, + drift_ratio_threshold=req.drift_ratio_threshold, + hours_list=hours_list, + report_dir="./data/ingestion_reports", + manual_run_id=req.manual_run_id, + ) + + return IngestionQualityRunResponse(**result) + diff --git a/temp_backup/src/api/server.py b/temp_backup/src/api/server.py new file mode 100644 index 00000000..e2bcbb07 --- /dev/null +++ b/temp_backup/src/api/server.py @@ -0,0 +1,661 @@ +""" +FastAPI server to expose sentiment analysis as an HTTP API +for the Node.js backend to consume. 
+""" + +from fastapi import FastAPI, HTTPException, Request, Response, Query +from fastapi.middleware.cors import CORSMiddleware +from pydantic import BaseModel, ConfigDict +from typing import Dict, Any, Optional, List +from datetime import datetime + +# Import your existing SentimentAnalyzer +import sys +import os + +# Add parent directory to path to import from src +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) + +from sentiment import SentimentAnalyzer +from src.utils.logger import setup_logger, correlation_id_ctx, generate_correlation_id +from src.utils.metrics import API_FAILURES_TOTAL, generate_latest, CONTENT_TYPE_LATEST +from src.security import ( + security_config, + setup_security_middleware, + setup_rate_limiter, + get_rate_limit_decorator, +) +from src.ml.retraining_pipeline import run_retraining, get_last_run_status +from src.ml.model_registry import get_registry_status +from src.analytics.correlation_engine import CorrelationEngine +from src.db import PostgresService +from src.ingestion.stellar_ingestion_checks import run_all_checks + +from src.analytics.sentiment_indicators import SentimentIndicatorMapper, get_legend as sentiment_legend + +_indicator_mapper = SentimentIndicatorMapper() + +# Initialize structured logger +logger = setup_logger(__name__) + +# Initialize FastAPI app +app = FastAPI( + title="Sentiment Analysis API", + description="Exposes sentiment analysis for Node.js backend integration", + version="1.0.0", +) + +# Setup security middleware (API key authentication) +setup_security_middleware(app) + +# Setup rate limiting +limiter = security_config.limiter +if limiter: + setup_rate_limiter(app, limiter) + logger.info(f"Rate limiting enabled: {security_config.rate_limit_default}") +else: + logger.warning("Rate limiting is disabled") + +# Add CORS middleware to allow requests from Node.js backend +app.add_middleware( + CORSMiddleware, + allow_origins=[ + "http://localhost:3000", + "http://localhost:3001", + ], # Adjust for 
your NestJS ports + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +@app.middleware("http") +async def metrics_and_logging_middleware(request: Request, call_next): + corr_id = request.headers.get("X-Correlation-ID", generate_correlation_id()) + correlation_id_ctx.set(corr_id) + try: + response = await call_next(request) + if response.status_code >= 500: + API_FAILURES_TOTAL.labels(method=request.method, endpoint=request.url.path).inc() + response.headers["X-Correlation-ID"] = corr_id + return response + except Exception as e: + API_FAILURES_TOTAL.labels(method=request.method, endpoint=request.url.path).inc() + logger.error("Unhandled exception during request processing", exc_info=True) + raise + +# Initialize your existing SentimentAnalyzer +sentiment_analyzer = SentimentAnalyzer() + +# Ingestion quality check routes +from src.api.ingestion_quality_routes import router as ingestion_quality_router +app.include_router(ingestion_quality_router) + + +try: + postgres_service = PostgresService() +except Exception as exc: + postgres_service = None + logger.warning("PostgreSQL service unavailable for /news endpoint: %s", exc) + + +# --------------------------------------------------------------------------- +# Request/Response models +# --------------------------------------------------------------------------- + +class SentimentIndicatorResponse(BaseModel): + """Visual indicator fields attached to every sentiment-bearing response.""" + + score: float + color: str # "green" | "red" | "gray" + hex_color: str # CSS hex, e.g. "#00C853" + label: str # "Bullish" | "Bearish" | "Neutral" + display_text: str # e.g. 
"0.85 Bullish" + + +class AnalyzeRequest(BaseModel): + text: str + asset: Optional[str] = None # Optional asset filter + + +class AnalyzeResponse(BaseModel): + sentiment: float # compound_score from SentimentResult + asset_codes: List[str] = [] # Asset codes found in text + sentiment_label: str = "" # positive/negative/neutral + indicator: Optional[SentimentIndicatorResponse] = None # Visual colour indicator + + +class AssetAnalysisResponse(BaseModel): + asset: str + sentiment: float + sentiment_label: str + analysis_count: int + asset_distribution: Dict[str, int] = {} + sentiment_distribution: Dict[str, float] = {} + indicator: Optional[SentimentIndicatorResponse] = None # Visual colour indicator + + +class HealthResponse(BaseModel): + status: str + timestamp: str + service: str + + +class NewsArticleResponse(BaseModel): + article_id: str + title: str + content: Optional[str] = None + summary: Optional[str] = None + source: Optional[str] = None + url: Optional[str] = None + published_at: Optional[str] = None + primary_asset: Optional[str] = None + asset_codes: List[str] = [] + categories: List[str] = [] + keywords: List[str] = [] + detected_entities: List[str] = [] + sentiment_score: Optional[float] = None # Raw compound score stored in DB + sentiment_label: Optional[str] = None # positive / negative / neutral + indicator: Optional[SentimentIndicatorResponse] = None # Visual colour indicator + +@app.get("/metrics") +async def metrics(): + """Expose Prometheus metrics""" + return Response(content=generate_latest(), media_type=CONTENT_TYPE_LATEST) + +@app.get("/") +@limiter.limit("20/minute") if limiter else lambda x: x +async def root(request: Request) -> Dict[str, Any]: + """Root endpoint with API information""" + return { + "service": "Sentiment Analysis API", + "version": "1.0.0", + "endpoints": { + "GET /health": "Health check (no auth required)", + "GET /metrics": "Prometheus metrics (no auth required)", + "GET /news": "Get recent news with optional 
?entity=... filter (requires X-API-Key header)", + "POST /analyze": "Analyze text sentiment (requires X-API-Key header)", + "GET /analyze": "Get asset-specific sentiment analysis (requires X-API-Key header)", + "POST /analyze-batch": "Batch analyze multiple texts (requires X-API-Key header)", + "GET /sentiment/legend": "Get colour legend for sentiment indicators (no auth required)", + }, + "note": "Returns sentiment score between -1 (negative) and 1 (positive)", + "security": "All endpoints except /health and /metrics require X-API-Key header", + } + + +@app.get("/health", response_model=HealthResponse) +@limiter.limit("30/minute") if limiter else lambda x: x +async def health_check(request: Request) -> HealthResponse: + + """Health check endpoint for monitoring""" + return HealthResponse( + status="healthy", + timestamp=datetime.now().isoformat(), + service="sentiment-analysis", + ) + + +@app.get("/news", response_model=List[NewsArticleResponse]) +@limiter.limit("30/minute") if limiter else lambda x: x +async def get_news( + request: Request, + limit: int = Query(50, ge=1, le=500), + hours: int = Query(24, ge=1, le=168), + asset: Optional[str] = Query(None, description="Optional primary asset code filter"), + entity: Optional[str] = Query( + None, + description="Optional detected entity filter (example: Soroban)", + ), +) -> List[NewsArticleResponse]: + """Return recent articles with optional asset and entity filters.""" + if postgres_service is None: + raise HTTPException(status_code=503, detail="Database service unavailable") + + try: + articles = postgres_service.get_recent_articles( + limit=limit, + hours=hours, + asset=asset, + entity=entity, + ) + + logger.info( + "Retrieved %d news articles | hours=%d | asset=%s | entity=%s | client_ip=%s", + len(articles), + hours, + asset, + entity, + request.client.host, + ) + + def _build_indicator( + score: Optional[float], + ) -> Optional[SentimentIndicatorResponse]: + if score is None: + return None + ind = 
_indicator_mapper.score_to_indicator(score) + return SentimentIndicatorResponse(**ind.to_dict()) + + return [ + NewsArticleResponse( + article_id=article.article_id, + title=article.title, + content=article.content, + summary=article.summary, + source=article.source, + url=article.url, + published_at=( + article.published_at.isoformat() if article.published_at else None + ), + primary_asset=article.primary_asset, + asset_codes=article.asset_codes or [], + categories=article.categories or [], + keywords=article.keywords or [], + detected_entities=article.detected_entities or [], + sentiment_score=article.sentiment_score, + sentiment_label=article.sentiment_label, + indicator=_build_indicator(article.sentiment_score), + ) + for article in articles + ] + except Exception as exc: + logger.error("Error retrieving news: %s", str(exc), exc_info=True) + raise HTTPException(status_code=500, detail="Failed to fetch news articles") + + +@app.post("/analyze", response_model=AnalyzeResponse) +@limiter.limit("50/minute") if limiter else lambda x: x +async def analyze_text(body: AnalyzeRequest, request: Request) -> AnalyzeResponse: + """ + Analyze the sentiment of provided text. + + This endpoint connects to your existing SentimentAnalyzer class + and returns the compound_score as the sentiment value. + + Args: + request: Contains the text to analyze and optional asset filter + + Returns: + sentiment: float between -1 and 1 + asset_codes: List of asset codes found in text + sentiment_label: positive/negative/neutral + """ + try: + # Validate input + if not body.text or not body.text.strip(): + raise HTTPException(status_code=400, detail="Text cannot be empty") + + # Use your existing SentimentAnalyzer with asset filter + result = sentiment_analyzer.analyze(body.text, body.asset) + + logger.info( + f"Analyzed text: '{body.text[:50]}...' 
-> sentiment: {result.compound_score} | " + f"asset: {body.asset} | client_ip: {request.client.host}" + ) + + # Build visual indicator + ind = _indicator_mapper.score_to_indicator(result.compound_score) + + # Return enhanced response with asset information + return AnalyzeResponse( + sentiment=result.compound_score, + asset_codes=result.asset_codes, + sentiment_label=result.sentiment_label, + indicator=SentimentIndicatorResponse(**ind.to_dict()), + ) + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error in sentiment analysis: {str(e)}", exc_info=True) + raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}") + + +@app.get("/analyze", response_model=AssetAnalysisResponse) +@limiter.limit("30/minute") if limiter else lambda x: x +async def get_asset_analysis( + request: Request, + asset: str = Query(..., description="Asset code (e.g., XLM, USDC, BTC)") +) -> AssetAnalysisResponse: + """ + Get sentiment analysis for a specific asset. + + This endpoint provides asset-specific sentiment analysis by filtering + news and social media content that mentions the specified asset. 
+ + Args: + asset: Asset code to analyze (e.g., XLM, USDC, BTC) + + Returns: + Asset-specific sentiment analysis with distribution statistics + """ + try: + if not asset or not asset.strip(): + raise HTTPException(status_code=400, detail="Asset code cannot be empty") + + asset = asset.upper().strip() + + # For now, return a mock response since we need to integrate with actual data sources + # In a real implementation, this would query the database for recent sentiment data + # related to the specific asset + + logger.info(f"Requested asset analysis for: {asset} | client_ip: {request.client.host}") + + # Mock response - replace with actual database query + mock_score = 0.0 + ind = _indicator_mapper.score_to_indicator(mock_score) + return AssetAnalysisResponse( + asset=asset, + sentiment=mock_score, + sentiment_label="neutral", + analysis_count=0, + asset_distribution={}, + sentiment_distribution={"positive": 0.0, "negative": 0.0, "neutral": 1.0}, + indicator=SentimentIndicatorResponse(**ind.to_dict()), + ) + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error in asset analysis: {str(e)}", exc_info=True) + raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}") + + +# Optional: Batch analysis endpoint if needed +@app.post("/analyze-batch") +@limiter.limit("10/minute") if limiter else lambda x: x +async def analyze_batch(request: Request, texts: list[str], asset: Optional[str] = None) -> Dict[str, Any]: + """Batch analyze multiple texts with optional asset filter""" + try: + if not texts: + raise HTTPException(status_code=400, detail="Texts list cannot be empty") + + results = sentiment_analyzer.analyze_batch(texts, asset) + summary = sentiment_analyzer.get_sentiment_summary(results) + + return { + "results": [r.to_dict() for r in results], + "summary": summary, + "count": len(results), + "asset_filter": asset, + } + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + 
+@app.get("/sentiment/legend") +async def get_sentiment_legend() -> Dict[str, Any]: + """ + Return the colour legend that frontend clients use to render + sentiment badge tooltips. + + No authentication required — purely informational. + + Returns a list of objects with keys: + - color : semantic name ("green" | "red" | "gray") + - hex_color : CSS hex value + - label : human-readable label ("Bullish" | "Bearish" | "Neutral") + - description : tooltip copy + - score_range : score boundary description + """ + return { + "legend": sentiment_legend(), + "thresholds": { + "bullish": "score >= 0.05", + "bearish": "score <= -0.05", + "neutral": "-0.05 < score < 0.05", + }, + } + + +if __name__ == "__main__": + import uvicorn + + # Run the server + uvicorn.run( + "server:app", + host="0.0.0.0", # Listen on all interfaces + port=8000, # Default FastAPI port + reload=True, # Auto-reload during development + ) + + +# --------------------------------------------------------------------------- +# Model retraining endpoints (Issue #454) +# --------------------------------------------------------------------------- + +class RetrainRequest(BaseModel): + force: bool = False # Skip quality gates when True + + +class RetrainResponse(BaseModel): + status: str + started_at: Optional[str] = None + finished_at: Optional[str] = None + duration_seconds: Optional[float] = None + models: Dict[str, Any] = {} + registry: Dict[str, Any] = {} + error: Optional[str] = None + + +class ModelStatusResponse(BaseModel): + last_run: Dict[str, Any] + registry: Dict[str, Any] + + +@app.post("/retrain", response_model=RetrainResponse) +@limiter.limit("5/minute") if limiter else lambda x: x +async def trigger_retraining( + body: RetrainRequest, + request: Request, +) -> RetrainResponse: + """ + Trigger an immediate model retraining run. + + Runs synchronously in a thread pool so the HTTP response is returned + only after retraining completes (or fails). 
For long-running production + retrains, consider making this async with a task queue. + + Requires X-API-Key header. + """ + import asyncio + + logger.info( + f"Retraining triggered via API | force={body.force} | " + f"client_ip={request.client.host}" + ) + + loop = asyncio.get_event_loop() + result = await loop.run_in_executor( + None, lambda: run_retraining(force=body.force) + ) + + return RetrainResponse(**{k: result.get(k) for k in RetrainResponse.model_fields if k in result}) + + +@app.get("/model/status", response_model=ModelStatusResponse) +@limiter.limit("30/minute") if limiter else lambda x: x +async def model_status(request: Request) -> ModelStatusResponse: + """ + Return the current model registry state and last retraining run metadata. + + Requires X-API-Key header. + """ + return ModelStatusResponse( + last_run=get_last_run_status(), + registry=get_registry_status(), + ) + + +# --------------------------------------------------------------------------- +# Predictive analytics endpoint (forecast market trends) +# --------------------------------------------------------------------------- + + +class ForecastResponse(BaseModel): + model_config = ConfigDict(protected_namespaces=()) + + predicted_trend_24h: str + predicted_trend_48h: str + confidence_24h: float + confidence_48h: float + sentiment_velocity: float + forecast_score_24h: float + forecast_score_48h: float + model_backend: str + data_points_used: int + generated_at: str + + +@app.get("/analytics/forecast", response_model=ForecastResponse) +@limiter.limit("20/minute") if limiter else lambda x: x +async def get_forecast(request: Request) -> ForecastResponse: + """ + Predict market trends (Bullish / Bearish / Neutral) for the next 24-48 hours. 
+ + Uses historical sentiment data from *analytics.jsonl* to train a + SentimentForecaster (Prophet when installed, sklearn Ridge otherwise) + and returns predicted health scores together with a Sentiment Velocity + value that measures how fast the market mood is changing. + + Requires X-API-Key header. + """ + import asyncio + + logger.info(f"Forecast requested | client_ip={request.client.host}") + + def _run_forecast(): + from src.analytics.forecaster import SentimentForecaster + + forecaster = SentimentForecaster() + return forecaster.run() + + loop = asyncio.get_event_loop() + try: + result = await loop.run_in_executor(None, _run_forecast) + except Exception as exc: + logger.error(f"Forecast failed: {exc}", exc_info=True) + raise HTTPException(status_code=500, detail=f"Forecast error: {exc}") + + return ForecastResponse(**result.to_dict()) + + +# --------------------------------------------------------------------------- +# Correlation Analysis endpoints (Issue #452) +# --------------------------------------------------------------------------- + + +class CorrelationDataPoint(BaseModel): + timestamp: str + score: float + + +class MetricDataPoint(BaseModel): + timestamp: str + value: float + + +class CorrelationRequest(BaseModel): + sentiment_data: List[CorrelationDataPoint] + price_data: Optional[List[MetricDataPoint]] = None + volume_data: Optional[List[MetricDataPoint]] = None + lag_hours: int = 0 + + +class CorrelationResponse(BaseModel): + price_correlation: Optional[Dict[str, Any]] = None + volume_correlation: Optional[Dict[str, Any]] = None + summary: Dict[str, Any] + + +class LagAnalysisRequest(BaseModel): + sentiment_data: List[CorrelationDataPoint] + metric_data: List[MetricDataPoint] + metric_type: str = "volume" + max_lag_hours: int = 24 + + +class LagAnalysisResponse(BaseModel): + best_lag_hours: int + best_correlation: float + lag_analysis: List[Dict[str, Any]] + recommendation: str + + +@app.post("/correlation/analyze", 
response_model=CorrelationResponse) +@limiter.limit("20/minute") if limiter else lambda x: x +async def analyze_correlation( + body: CorrelationRequest, + request: Request, +) -> CorrelationResponse: + """ + Analyze correlation between sentiment and price/volume data. + + Returns correlation scores (-1 to 1) and scatter plot data points. + Requires X-API-Key header. + """ + sentiment_list = [{"timestamp": dp.timestamp, "score": dp.score} for dp in body.sentiment_data] + price_list = ( + [{"timestamp": dp.timestamp, "value": dp.value} for dp in body.price_data] + if body.price_data + else [] + ) + volume_list = ( + [{"timestamp": dp.timestamp, "value": dp.value} for dp in body.volume_data] + if body.volume_data + else [] + ) + + logger.info( + f"Correlation analysis requested | sentiment_points={len(sentiment_list)} | " + f"price_points={len(price_list)} | volume_points={len(volume_list)} | " + f"lag_hours={body.lag_hours} | client_ip={request.client.host}" + ) + + result = CorrelationEngine.full_analysis( + sentiment_data=sentiment_list, + price_data=price_list, + volume_data=volume_list, + lag_hours=body.lag_hours, + ) + + return CorrelationResponse( + price_correlation=result.get("price_correlation"), + volume_correlation=result.get("volume_correlation"), + summary=result.get("summary", {}), + ) + + +@app.post("/correlation/lag-analysis", response_model=LagAnalysisResponse) +@limiter.limit("10/minute") if limiter else lambda x: x +async def analyze_lag_correlation( + body: LagAnalysisRequest, + request: Request, +) -> LagAnalysisResponse: + """ + Analyze correlation across multiple time lags to find optimal lead time. + + Returns the best lag hours and correlation strength for predicting market changes. + Requires X-API-Key header. 
+ """ + sentiment_list = [{"timestamp": dp.timestamp, "score": dp.score} for dp in body.sentiment_data] + metric_list = [{"timestamp": dp.timestamp, "value": dp.value} for dp in body.metric_data] + + logger.info( + f"Lag correlation analysis | metric_type={body.metric_type} | " + f"max_lag={body.max_lag_hours}h | client_ip={request.client.host}" + ) + + result = CorrelationEngine.analyze_with_lags( + sentiment_data=sentiment_list, + metric_data=metric_list, + metric_type=body.metric_type, + max_lag_hours=body.max_lag_hours, + ) + + return LagAnalysisResponse( + best_lag_hours=result["best_lag_hours"], + best_correlation=result["best_correlation"], + lag_analysis=result["lag_analysis"], + recommendation=result["recommendation"], + ) diff --git a/temp_backup/src/cache_manager.py b/temp_backup/src/cache_manager.py new file mode 100644 index 00000000..9f79529d --- /dev/null +++ b/temp_backup/src/cache_manager.py @@ -0,0 +1,146 @@ +""" +Cache Manager module - Implements caching layer for expensive operations using Redis +""" + +import hashlib +import json +import logging +import os +from typing import Any, Optional + +import redis + +logger = logging.getLogger(__name__) + + +class CacheManager: + """ + Manages caching using Redis for expensive operations like sentiment analysis. + Uses a 24-hour TTL for cached results. 
+ """ + + DEFAULT_TTL_SECONDS = 24 * 60 * 60 # 24 hours + + def __init__( + self, + host: Optional[str] = None, + port: Optional[int] = None, + db: Optional[int] = None, + ttl_seconds: Optional[int] = None, + namespace: str = "cache", + ): + self.host = host if host is not None else os.getenv("REDIS_HOST", "localhost") + self.port = port if port is not None else int(os.getenv("REDIS_PORT", "6379")) + self.db = db if db is not None else int(os.getenv("REDIS_DB", "0")) + self.ttl_seconds = ( + ttl_seconds + if ttl_seconds is not None + else int(os.getenv("CACHE_TTL_SECONDS", str(self.DEFAULT_TTL_SECONDS))) + ) + self.namespace = namespace + + self.redis_client = redis.Redis( + host=self.host, + port=self.port, + db=self.db, + decode_responses=True, + socket_connect_timeout=5, + socket_timeout=5, + ) + self.redis_client.ping() + logger.info( + "Connected to Redis at %s:%s/%s (namespace=%s, ttl=%ss)", + self.host, + self.port, + self.db, + self.namespace, + self.ttl_seconds, + ) + + def _generate_key(self, raw_key: str) -> str: + """Return ``namespace:sha256(raw_key)``.""" + digest = hashlib.sha256(raw_key.encode("utf-8")).hexdigest() + return f"{self.namespace}:{digest}" + + @staticmethod + def make_key(*parts: Any) -> str: + """Build a deterministic cache key from arbitrary ordered parts.""" + return "|".join(str(p) for p in parts) + + def get(self, raw_key: str) -> Optional[Any]: + """ + Return deserialised value for raw_key, or None on miss. 
+ + Args: + raw_key: Key to retrieve the result from + + Returns: + Cached result if found, None otherwise + """ + try: + key = self._generate_key(raw_key) + cached = self.redis_client.get(key) + if cached is not None: + logger.info("CACHE HIT [%s] %s", self.namespace, raw_key[:80]) + return json.loads(cached) + logger.debug("CACHE MISS [%s] %s", self.namespace, raw_key[:80]) + return None + except Exception as e: + logger.error("Cache get error: %s", e) + return None + + def set(self, raw_key: str, value: Any) -> bool: + """ + Store result in cache with TTL. + + Args: + raw_key: Key to store the result under + value: Result to store in cache + + Returns: + True if successful, False otherwise + """ + try: + key = self._generate_key(raw_key) + serialised = json.dumps(value, default=str) + ok = self.redis_client.setex(key, self.ttl_seconds, serialised) + if ok: + logger.debug( + "CACHE SET [%s] ttl=%ss", self.namespace, self.ttl_seconds + ) + return bool(ok) + except Exception as e: + logger.error("Cache set error: %s", e) + return False + + def delete(self, raw_key: str) -> bool: + """Remove a single entry.""" + try: + return self.redis_client.delete(self._generate_key(raw_key)) > 0 + except Exception as e: + logger.error("Cache delete error: %s", e) + return False + + def clear_namespace(self) -> int: + """Delete every key that belongs to this namespace.""" + try: + keys = list(self.redis_client.scan_iter(match=f"{self.namespace}:*")) + count = self.redis_client.delete(*keys) if keys else 0 + if count: + logger.info("Cleared %d entries from [%s]", count, self.namespace) + return count + except Exception as e: + logger.error("Cache clear error: %s", e) + return 0 + + def ping(self) -> bool: + """ + Test Redis connection. 
+ + Returns: + True if connected, False otherwise + """ + try: + return self.redis_client.ping() + except Exception: + return False diff --git a/temp_backup/src/config/anomaly_config.py b/temp_backup/src/config/anomaly_config.py new file mode 100644 index 00000000..82e4e419 --- /dev/null +++ b/temp_backup/src/config/anomaly_config.py @@ -0,0 +1,114 @@ +#! /usr/bin/env python3 +""" +Configuration module for anomaly detection settings. +Supports both Z-score and Isolation Forest configurations. +""" + +from dataclasses import dataclass +from typing import Optional +import os +import json + + +@dataclass +class ZScoreConfig: + """Configuration for Z-score based anomaly detection.""" + + window_size_hours: int = 24 + z_threshold: float = 2.5 + min_data_points: int = 10 + + @classmethod + def from_dict(cls, data: dict) -> 'ZScoreConfig': + return cls( + window_size_hours=data.get('window_size_hours', 24), + z_threshold=data.get('z_threshold', 2.5), + min_data_points=data.get('min_data_points', 10) + ) + + +@dataclass +class IsolationForestConfig: + """Configuration for Isolation Forest based anomaly detection.""" + + enabled: bool = True + contamination: float = 0.1 # Expected proportion of anomalies (0.0 to 0.5) + n_estimators: int = 100 + max_samples: str = 'auto' + random_state: int = 42 + min_training_samples: int = 50 + auto_retrain_interval: int = 200 # Retrain every N new samples + features: list = None # Features to use for detection + + def __post_init__(self): + if self.features is None: + self.features = ['volume', 'sentiment', 'volume_change_rate', 'sentiment_change_rate'] + + @classmethod + def from_dict(cls, data: dict) -> 'IsolationForestConfig': + return cls( + enabled=data.get('enabled', True), + contamination=data.get('contamination', 0.1), + n_estimators=data.get('n_estimators', 100), + max_samples=data.get('max_samples', 'auto'), + random_state=data.get('random_state', 42), + min_training_samples=data.get('min_training_samples', 50), + 
auto_retrain_interval=data.get('auto_retrain_interval', 200), + features=data.get('features', ['volume', 'sentiment', 'volume_change_rate', 'sentiment_change_rate']) + ) + + +@dataclass +class AnomalyDetectionConfig: + """Main configuration for anomaly detection system.""" + + zscore: ZScoreConfig + isolation_forest: IsolationForestConfig + enable_comparison_mode: bool = False + model_save_path: str = "models/anomaly_detector" + + @classmethod + def from_dict(cls, data: dict) -> 'AnomalyDetectionConfig': + return cls( + zscore=ZScoreConfig.from_dict(data.get('zscore', {})), + isolation_forest=IsolationForestConfig.from_dict(data.get('isolation_forest', {})), + enable_comparison_mode=data.get('enable_comparison_mode', False), + model_save_path=data.get('model_save_path', "models/anomaly_detector") + ) + + @classmethod + def from_env(cls) -> 'AnomalyDetectionConfig': + """Load configuration from environment variables.""" + config = { + 'zscore': { + 'window_size_hours': int(os.getenv('ANOMALY_WINDOW_HOURS', '24')), + 'z_threshold': float(os.getenv('ANOMALY_Z_THRESHOLD', '2.5')), + }, + 'isolation_forest': { + 'enabled': os.getenv('ANOMALY_ML_ENABLED', 'true').lower() == 'true', + 'contamination': float(os.getenv('ANOMALY_ML_CONTAMINATION', '0.1')), + 'n_estimators': int(os.getenv('ANOMALY_ML_ESTIMATORS', '100')), + }, + 'enable_comparison_mode': os.getenv('ANOMALY_COMPARISON_MODE', 'false').lower() == 'true', + 'model_save_path': os.getenv('ANOMALY_MODEL_PATH', 'models/anomaly_detector') + } + return cls.from_dict(config) + + def save_to_file(self, filepath: str): + """Save configuration to JSON file.""" + with open(filepath, 'w') as f: + json.dump({ + 'zscore': self.zscore.__dict__, + 'isolation_forest': self.isolation_forest.__dict__, + 'enable_comparison_mode': self.enable_comparison_mode, + 'model_save_path': self.model_save_path + }, f, indent=2) + + @classmethod + def load_from_file(cls, filepath: str) -> 'AnomalyDetectionConfig': + """Load configuration from 
class DatabaseService:
    """
    Stores and retrieves analytics data.

    Every record is written to file-based storage (an append-only JSONL
    history plus a ``latest.json`` snapshot). When ``use_postgres`` is true
    and a ``postgres_service`` is supplied, records are additionally pushed to
    PostgreSQL on a best-effort basis.
    """

    def __init__(
        self,
        storage_dir: str = "./data",
        use_postgres: bool = True,
        postgres_service: Optional[Any] = None,
    ):
        """
        Args:
            storage_dir: Directory for the JSONL/JSON files (created if missing)
            use_postgres: Whether PostgreSQL persistence is wanted
            postgres_service: Service object used for PostgreSQL calls; when
                it is ``None`` only file storage is used
        """
        # File-based storage (legacy/fallback)
        self.storage_dir = Path(storage_dir)
        self.storage_dir.mkdir(parents=True, exist_ok=True)
        self.analytics_file = self.storage_dir / "analytics.jsonl"
        self.latest_file = self.storage_dir / "latest.json"

        # PostgreSQL storage
        self.use_postgres = use_postgres
        self.postgres_service = postgres_service

        if self.use_postgres and self.postgres_service:
            logger.info("DatabaseService initialized with PostgreSQL support")
        else:
            logger.info("DatabaseService initialized with file-based storage only")

    def save_analytics(self, record: "AnalyticsRecord") -> bool:
        """
        Save analytics record to storage

        Args:
            record: AnalyticsRecord to save

        Returns:
            True if the file-based save succeeded, False otherwise. A failed
            PostgreSQL save is logged but does not affect the result.
        """
        success = True

        # Save to file-based storage (always for backward compatibility)
        try:
            payload = record.to_dict()

            # Append to JSONL file for historical data
            with open(self.analytics_file, "a", encoding="utf-8") as f:
                f.write(json.dumps(payload) + "\n")

            # Update latest.json for quick access
            with open(self.latest_file, "w", encoding="utf-8") as f:
                json.dump(payload, f, indent=2)

            logger.info(f"Analytics saved to file: {record.news_count} news items analyzed")
        except Exception as e:
            logger.error(f"Error saving analytics to file: {e}")
            success = False

        # Save to PostgreSQL if enabled
        if self.use_postgres and self.postgres_service:
            try:
                # Save sentiment data as news insights
                if record.sentiment_data and "results" in record.sentiment_data:
                    sentiment_results = record.sentiment_data["results"]
                    if sentiment_results:
                        saved_count = self.postgres_service.save_news_insights_batch(
                            [r.to_dict() if hasattr(r, "to_dict") else r for r in sentiment_results]
                        )
                        logger.info(f"Saved {saved_count} news insights to PostgreSQL")

                # Save trends as asset trends
                if record.trends:
                    for trend in record.trends:
                        trend_data = trend.to_dict() if hasattr(trend, "to_dict") else trend
                        self.postgres_service.save_asset_trend(
                            asset="XLM",  # Default asset
                            metric_name=trend_data.get("metric_name", "unknown"),
                            window="24h",  # Default window
                            trend_data=trend_data,
                        )
                    logger.info(f"Saved {len(record.trends)} trends to PostgreSQL")

            except Exception as e:
                logger.error(f"Error saving analytics to PostgreSQL: {e}")
                # Don't fail if PostgreSQL save fails

        return success

    def get_latest_analytics(self) -> Dict[str, Any]:
        """
        Get the latest analytics record

        Returns:
            Latest analytics data or empty dict if not available
        """
        try:
            if self.latest_file.exists():
                with open(self.latest_file, "r", encoding="utf-8") as f:
                    return json.load(f)
        except Exception as e:
            logger.error(f"Error reading latest analytics: {e}")

        return {}

    def get_analytics_history(self, limit: int = 24) -> List[Dict[str, Any]]:
        """
        Get historical analytics data

        Args:
            limit: Maximum number of records to return; values <= 0 yield an
                empty list

        Returns:
            List of analytics records (most recent first). Blank or malformed
            lines in the history file are skipped.
        """
        # ``lines[-0:]`` is the whole list, so a non-positive limit must be
        # handled explicitly instead of being passed to the slice.
        if limit <= 0:
            return []

        records: List[Dict[str, Any]] = []
        try:
            if self.analytics_file.exists():
                with open(self.analytics_file, "r", encoding="utf-8") as f:
                    lines = [line for line in f if line.strip()]
                # Get last 'limit' records in reverse order
                for line in reversed(lines[-limit:]):
                    try:
                        records.append(json.loads(line))
                    except json.JSONDecodeError as e:
                        # One corrupt line should not hide the rest.
                        logger.warning(f"Skipping malformed analytics history line: {e}")
        except Exception as e:
            logger.error(f"Error reading analytics history: {e}")

        return records

    def expose_metrics(self) -> Dict[str, Any]:
        """
        Expose all metrics for monitoring/API purposes

        Returns:
            Dictionary with the latest record, the last 24 history records,
            their count, the latest timestamp and, when PostgreSQL is
            available, its 24h sentiment summary
        """
        latest = self.get_latest_analytics()
        history = self.get_analytics_history(limit=24)

        metrics = {
            "latest": latest,
            "history": history,
            "history_count": len(history),
            "last_updated": latest.get("timestamp") if latest else None,
        }

        # Add PostgreSQL metrics if available
        if self.use_postgres and self.postgres_service:
            try:
                pg_summary = self.postgres_service.get_sentiment_summary(hours=24)
                metrics["postgres_summary"] = pg_summary
            except Exception as e:
                logger.error(f"Error getting PostgreSQL metrics: {e}")

        return metrics

    @staticmethod
    def _is_expired(line: str, cutoff: datetime) -> bool:
        """Return True when the JSONL ``line`` has a timestamp at or before ``cutoff``.

        Lines that cannot be parsed (bad JSON, non-object JSON, missing or
        invalid timestamp) are never considered expired, so they are kept.
        """
        try:
            record = json.loads(line)
            record_date = datetime.fromisoformat(record.get("timestamp", ""))
        except (ValueError, TypeError, AttributeError):
            return False

        # Naive timestamps are interpreted in the cutoff's timezone (UTC) so
        # that naive and aware records can both be compared without raising.
        if record_date.tzinfo is None:
            record_date = record_date.replace(tzinfo=cutoff.tzinfo)
        return record_date <= cutoff

    def clear_old_data(self, days: int = 30) -> int:
        """
        Clear analytics data older than specified days

        Args:
            days: Number of days to keep

        Returns:
            Number of records deleted from the file-based history (PostgreSQL
            deletions are logged but not included in the count)
        """
        from datetime import timedelta, timezone

        deleted_count = 0
        cutoff_date = datetime.now(timezone.utc) - timedelta(days=days)

        # Clear file-based data. A missing history file simply means there is
        # nothing to prune here; PostgreSQL cleanup below must still run.
        try:
            if self.analytics_file.exists():
                with open(self.analytics_file, "r", encoding="utf-8") as f:
                    lines = f.readlines()

                # Filter out old records
                new_lines = []
                for line in lines:
                    if self._is_expired(line, cutoff_date):
                        deleted_count += 1
                    else:
                        new_lines.append(line)

                # Only rewrite the file when something was actually removed.
                if deleted_count:
                    with open(self.analytics_file, "w", encoding="utf-8") as f:
                        f.writelines(new_lines)

                logger.info(f"Deleted {deleted_count} old analytics records from files")
        except Exception as e:
            logger.error(f"Error clearing old file data: {e}")

        # Clear PostgreSQL data
        if self.use_postgres and self.postgres_service:
            try:
                pg_deleted = self.postgres_service.cleanup_old_data(days=days)
                logger.info(f"Deleted old PostgreSQL data: {pg_deleted}")
            except Exception as e:
                logger.error(f"Error clearing old PostgreSQL data: {e}")

        return deleted_count
g_z`o?e?}2ce&>w8hJhI#wy_NuQy7??wW&1Sh27(JcT-gj zW6x%%wHj>_B}$C=V}z1;eodoDStn7l&aXJhkDZ^5Tq}l>M$sx#qWoiovm4EyzHm*fr#q-9sMPGvt-ML#)gW)ywrmKG`?qm;FNlIWQEI zgGARg92yGCVRG&sjtoWRXgf0*-of-~my{ctm>mq~X=XU@WJ6ya70Zo8%7V1sB8?HL z52SvJw24RqAPriiaUu(^VI2CYu92&kM^)Fj@Ug7Ahw~ZXW7T&y zJEir5VIvtKolA)+IeS;&adsec-7e+RY5le0(W_Nn{26(*lZ@V~;m>o-MeDN9(CIOBP{V7C;x{zRQQRDR z%&CnKFxq3r2sjUOgdouffz0d`t|5yMSrk5{I3P!kijRmE=M;pB9B#&>4JOr+P~H?46RJviVXjhhwSkEPjBGonkXQ zNU;Tq#}0D-7(Q?O%;yVv0R|39f_PUD`HXNoRm#a!rB2)q9aOfEFBNfF5UJXd&3tT3 zikIh#0uQTc29|`MOm&XPR1XzCW=4~(D$Dcv)U3ess*mSkEtYb49^m;qrBqI93G@8z ztSHI3Y+lG0pxMjwnL-*)f;>;23{@2ID;Cl6sm^Rh^x#Q7eRMq3cVgXzs^TtGtjBsF zUC7H~DlLPCr21fKiYXqKzsRCa2zBc6g?p+Go+jLBhKD8Xqj!zxq8~Lv+C2D_I3>Zn zns^?r3-<{cU<_`RWC6rt2k5?lKSsT9?!?43c(T$&|G>qIiDP|-4o&n6(u`avPFyU( zTPv1wk}#1f7NrTC%>no}(t;#`U73)?^hD;?1T|dU#W^toBa#AVEBz9RpEBP#oo@EI zhiU36U;0&jc)`0gz7kXF-z%Se7Kz`#yO@6PNu_UP_o}Q!E|!O%HMBg8FF7BzRfLs3 zrQvk>!ZUyL{+`9g2m2~+#ot}-e->=KKe*WQV5AaT9#?`V%7f1uTNgVjF{QD)eDPVd zdC|2bE7ASs5jcOwMix61wtd<4bM~*Q`xzJ)oY#!xqyrZKddIy9HM z1~rBOV%~HEXqq>#VHBx(ix5-B8m25dh9AU+pUsrj8&_(aj6rW7=Lcg`#|3o*M5G~* z`3>{ql$*f(#5MC1JtM+rm=gww`MH?Q&&;3k;Q_1+ZIKdkp_Vn8P37kf$b|!VOo&1b zKIcMSn$8xJJ~gKGrK^38hm|=ajf)L%FN0z$lwuneTd>#$h1!g!p++awrwg+gHKJJw zW635RV%+GhHR}3!NwT0ulg;98yb$7o*oj3K7D+7jVqusCaUWLEE{N}9aS{uJQ&`=F z{OxQ8v0g+{rFzjWpfxbefOrVC4`YFAUhKsJIS-iy{Wy5-o-z36FJK+W48-d?);Xl8j}HKsHVR$Vb%U+#)ai5`IFP;6m*sb2{t%jaLY z*61SYtZc0w8?JKK752t6HdqaHR+I0pOsugZ&)s$AvcgH*mK9n@{JqG)$DU%!jIppz z%5`z30A%ue;TnEGY;&Pd_TaCvpzN{`L3ZRa85_%BjRixOoV}OEC~TuQ7$e#FiaAmv zWf8Vslvy;{WO1Trmo?XrwbYQc5*gml%VyC=G@`aEZmA*LT0^$Y7~N-iTIN&VPTJe; zObf1r3{5w6iI?b@qwM%jjh(eLCU=bu(NE1<+RUKI=yt!&2(NJklhN&&?4*pYPc!JG z9^%5bdW4JG>J40@tsdhVp@;VD$OuhbT-WIJ$X#4>Gb1Ot7J2V=M&8G@0#|HPy=SON zo6Zl9*gbJptf3! 
zvcwllw{lr&nlhY#t+v&sD4``rV#x!O6mn{KqoC2QUMPuaLG?)V*k)_gSu0_?YS3P_ zUzVuw#f_?F0I6+_dIx|KZ>_q@f8nkGy-P*=E_amNZ zKhUcKJw%laKWK453j)lBAGEl8>ft4O<^w0Sx7?M@oSJ9%CxHIvgi|L*{{wt@X7pQ- zkXt?VZD4=GxZP9VI+O1uT?DsOH|ZjdfW9~g1yFXEB;;;Oc-MPGL?I;BqEHlpj7GHU zb?Xul+NB>rQD(l1Zd=U!dEsH9x~;o1y)6HH{;%d&cm3_B)uZQEKdz3AKOMXEWb9V; za=Mx>REzR&80L;+!1)b5JZG4qW6%kQLFW)2BEwkL4?8IucEBY@ll$Kmoe_sc4hjv7 z^RnudO0($Yh-fj?x{{cqEZ$E)jx_l45_v2D=s?z|3L-dG6yYsjsgwo}Obrza5;$Jq zX>0YoFhxCYS`X%h5;y}n+Dx9a%7S-wuONb_5X}fO_-e3P2rq%ls9~feRsow!<)=z1 zhzSr9(T{WX-4^7uw#fV>;4n{C9Pupu9K^1I7=-;NhnG zw->nwAHhK3L**gp22<;T*0#ed{c!hZ*{r(zznw;-i`e2RiCZ~HVm)F>nRHIU;zCX6cULiFgdTZ>_ z=Rt4WQy%?IgTsxB5lgxl@f%3PpYbyY-UQf28^OE#y%IbmHLngP1%LvYp&;r%Z46VH z?UeeV7i7vP+4DkFv)=HzDA}{iqJ%!!Wt0Fm+Qy^wZZjP-gIG;uBzrGP0I`>ql$x`o zWU$3{B})A4GD`LAGD`LAGD`JcHA_nTwi9CK)BR2o@-y$Go!rSr9qZGj)bCXql=^MZ znA|yr;I!dqBX)j+UUt-)#dnSHUF~qE0o_xUcTett=YWAao!DVL%tdVVDA!=CH*zss zy@?BfL??DQLY!;XHGtT4axFmYy0}(33B+zM*9MrsMfF`Rq_e5qSV5BKdv$In5m{nZ z$Yded3{4OM90!UA;k5k;PRL}k^8N&%MK?Iy0jkUsHQmuB?9TAEAy(|wgLJiso^=Gi z2<|+Dyk}L`I3vW&t^{+Ubg&+;OHUKX-)w3IN1=R>BIWi?YBp|1LArgjk~S}%0J^$m zlfrB9ydr#aZ^QF|r8!^9?2z--W_B*Z?A-I=lUI$`Ovs&oe?ot}r^L%JDRB&oD^Mg| zsuv$B1}5_;G{~HI6{;2{CnEN%jpon>kNXQ$L~Z18iKW;VTR-2r6spXubo|4Czdx|j z{v}(za_#Ar^ph*;>R6_lDOT^4fWe8t*?_^}0od9d$9X)Qhjb7eA|qJVkJuR8zPDg- zcVSxe#t4nOf%o8nu=I366p?zW?p$^TNE4eb%%U$LfrBueg1|T-bmE8D9UqCxPN$@4 zIW;9gD7SP=B+G&@I&lJZCb77Q1&;+99Llzec$UFJzyd>~uR*WG+o(4Mg_T5oghDLv zuB3n8M51zfFfuMTOfOM2ZfhHJ<>t46K zg=+**-^#`0Z2;=qxh4ShxEemA=ZiT3Jj%>`kB;nM(~6Mtfh3-;K~s8DQzH79&;T(T z5faUi%`;TMUB^+3*o^_~mM}UEp6G+>J!o3ZPy5pf0nSZBYpCj`;7ZVUvkt~~AOb&I zRO`)Cg8DUdF~>(o+lf%US%Wn4^mbK0J(7%H?FbE8?f(Phj-NcEKUDfIyD)o6G5g#4 z{o3my(VDxdTnSnzhKoOe`HG)m@dr?-ZqiyC!CN+5!hmrFdP=9EKp5|=xSn<$f6{fl z+WF2(%WB6j_WtAE)vaItpnBtnPj5^;xiM9}K3$zgB!7D6lP7mRsTO}!{n3xB;ZLjV zr`D)T`&(f4hMV!+gA{S$9tGP!!rKcKMd5C?P?C6ZpK28H(D^BWhn*5>z;K9ks9{+Y z@)1=AF|30HO}M-0(@`&vu{G_1~Z|xpnLW)wPxuT=YUHL~~PR zn&0_tMrfZ*Y(Q45Tagv(R%FGxHA{l^21HgJX#rPv5M12}xVj5)wL`;I|0pCqhVxQ( 
zYFeHbbWAn8ObV*7GYgW^66{E!d$;U~lCB)!Yli@PIlF5b{ z`&cwfLgU*6pzX~yoah=Cqo8YC^8XdoMox~JuNS>!(=oLPy<*!u$rXn&>Mku$5O`^ou79uwN|>8`=0g=Jn0>%_6)A(s#kA3y*l;e>Qt4} zfc0%$}NW0ONhkD){Oz7K-xv4ar42cR@k0~`F{@A7JO z&-I!T@70q914>hmo-EMLNwVOG5<6^77VK8y$2X-43@_ct2yET?c^7bno$svnD_e%E zt`?YYP26tulrO-1UeZeTKT%XaQ2AXQzYkY?``6e30^ePgt~It-L-_VJ_6#{ay!?K( z|JoXR-Gb_N{}kVZ4b^c=3;tgG8@w7Ts>7ZQx_83~dpGKgeWEVTWx;H7pQ!15dO*Vr z8Jqs8iPj!VINtP5O`Np0*V-d$ZP&e1t4#M!tuoy^waO^whd8TD_efoqIhr1+%d)#P zKGsbiowc-}Dn19F5mMH6asaZu*%)Tnn|iFa{iDQZwT7jsM{0znd+T1}kNVEwmeFzK z-&r5L`1zEMo#|9AU4qCeIG%)_Yula44pV�lK4!H^$8Xg&I1l9+$Rzu1UIClxz_t z`%LMMFoL7j-qerOTt$mdMSWhJD^D*2R#Nj&Mf4`LjQ~~!&2jl-XtiM7tuxT@Raz6aAlI~$G4-wTb+56;(chZ?KU==gBief5P=O% zHUitOrT#BQJ|9^+S(#gjJ$+~R$veZ<<0D^o*@5k&YIvr~&iuA6qT%X__;Bg8@u{kA zEoq)EiExWDbr8irL8S#OFxx1aZlLJ_5{K~3;AaWP!UDf?sTB%)PJ44W)^PgVuz59b z;#E#x6A8Z(Ske4B3KN(EH=IB#oDhdlgBjuwmXwww;1CiduseX8Q3Lkn!JgNshlF5X z2RQDn1lQO>0t|aAMv$|!0{dipo_k%y?F%^EIR^$Xa}N5x@mY-~+vt`^%znWh?71TB zB^J>`P<_&!99W2HVRp{4SJDGblxdLN3*-eNTcI=|fE`0-$8Ja_iNTFRXeUfVb)AL( z%|JIQlkJgba3GAU5_g_(3Gzla?4?xMlXOq`d*WY$GQ4RmZW DT}t?7 literal 0 HcmV?d00001 diff --git a/temp_backup/src/db/models.py b/temp_backup/src/db/models.py new file mode 100644 index 00000000..5cc7fa5f --- /dev/null +++ b/temp_backup/src/db/models.py @@ -0,0 +1,304 @@ +""" +Database models for analytics data persistence +""" + +from datetime import datetime +from typing import Optional +from sqlalchemy import Column, Integer, String, Float, DateTime, JSON, Text, Index, BigInteger +from sqlalchemy.orm import declarative_base +from sqlalchemy.sql import func + +Base = declarative_base() + + +class OnChainEntity(Base): + """ + Stores on-chain entities (projects and assets) with stable IDs + """ + __tablename__ = "on_chain_entities" + + id = Column(Integer, primary_key=True, autoincrement=True) + stable_id = Column(String(255), unique=True, nullable=False, index=True) # Stable unique ID (e.g., "asset:XLM", "project:stellar") + entity_type = Column(String(50), 
class Article(Base):
    """
    Stores news articles with full content and metadata
    """

    __tablename__ = "articles"

    id = Column(Integer, primary_key=True, autoincrement=True)
    article_id = Column(String(255), unique=True, nullable=False, index=True)
    title = Column(Text, nullable=False)
    content = Column(Text, nullable=True)
    summary = Column(Text, nullable=True)
    source = Column(String(100), nullable=True, index=True)
    url = Column(Text, nullable=True)

    # Asset information
    asset_codes = Column(JSON, nullable=True)  # Array of asset codes mentioned in article
    primary_asset = Column(String(20), nullable=True, index=True)  # Primary asset being discussed
    categories = Column(JSON, nullable=True)  # Article categories

    # Sentiment scores
    sentiment_score = Column(Float, nullable=True)  # compound score -1 to 1
    positive_score = Column(Float, nullable=True)
    negative_score = Column(Float, nullable=True)
    neutral_score = Column(Float, nullable=True)
    sentiment_label = Column(String(20), nullable=True, index=True)  # positive/negative/neutral

    # Keywords and metadata
    keywords = Column(JSON, nullable=True)  # Array of keywords
    detected_entities = Column(JSON, nullable=True)  # NER entities detected in article text
    linked_entities = Column(JSON, nullable=True)  # Structured linked entities (array of {stable_id, type, name})
    language = Column(String(10), nullable=True)

    # Timestamps
    published_at = Column(DateTime(timezone=True), nullable=True, index=True)
    fetched_at = Column(DateTime(timezone=True), nullable=True)
    analyzed_at = Column(DateTime(timezone=True), nullable=True)
    created_at = Column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
    updated_at = Column(
        DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False
    )

    # Indexes for efficient querying
    # NOTE(review): published_at, sentiment_label, source and primary_asset
    # are also declared with index=True above, so each of them ends up with
    # two single-column indexes. Left unchanged here because dropping either
    # one alters the schema and needs a migration.
    __table_args__ = (
        Index("idx_articles_published_at", "published_at"),
        Index("idx_articles_sentiment_label", "sentiment_label"),
        Index("idx_articles_source", "source"),
        Index("idx_articles_primary_asset", "primary_asset"),
        Index("idx_articles_asset_sentiment", "primary_asset", "sentiment_label"),
        Index("idx_articles_created_at", "created_at"),
    )

    def __repr__(self):
        # Identify the row by its external id plus the fields most useful
        # when reading logs; an empty repr makes debugging output useless.
        return (
            f"<Article(article_id={self.article_id!r}, "
            f"source={self.source!r}, "
            f"sentiment_label={self.sentiment_label!r})>"
        )
+ """ + + __tablename__ = "social_posts" + + id = Column(Integer, primary_key=True, autoincrement=True) + post_id = Column(String(255), unique=True, nullable=False, index=True) + platform = Column(String(50), nullable=False, index=True) # twitter, reddit, etc. + content = Column(Text, nullable=False) + author = Column(String(255), nullable=True) + url = Column(Text, nullable=True) + + # Engagement metrics + likes = Column(Integer, default=0) + comments = Column(Integer, default=0) + shares = Column(Integer, default=0) + + # Asset information + asset_codes = Column(JSON, nullable=True) # Array of asset codes mentioned + primary_asset = Column(String(20), nullable=True, index=True) + hashtags = Column(JSON, nullable=True) # Array of hashtags + subreddit = Column(String(100), nullable=True) # For Reddit posts + + # Sentiment scores + sentiment_score = Column(Float, nullable=True) # compound score -1 to 1 + positive_score = Column(Float, nullable=True) + negative_score = Column(Float, nullable=True) + neutral_score = Column(Float, nullable=True) + sentiment_label = Column(String(20), nullable=True, index=True) + + # Timestamps + posted_at = Column(DateTime(timezone=True), nullable=False, index=True) + fetched_at = Column(DateTime(timezone=True), nullable=True) + analyzed_at = Column(DateTime(timezone=True), nullable=True) + created_at = Column( + DateTime(timezone=True), server_default=func.now(), nullable=False + ) + updated_at = Column( + DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False + ) + + # Indexes for efficient querying + __table_args__ = ( + Index("idx_social_posts_platform", "platform"), + Index("idx_social_posts_posted_at", "posted_at"), + Index("idx_social_posts_sentiment_label", "sentiment_label"), + Index("idx_social_posts_primary_asset", "primary_asset"), + Index("idx_social_posts_platform_asset", "platform", "primary_asset"), + Index("idx_social_posts_created_at", "created_at"), + ) + + def __repr__(self): + 
class AnalyticsRecord(Base):
    """
    Stores computed analytics and aggregated metrics
    """

    __tablename__ = "analytics_records"

    id = Column(Integer, primary_key=True, autoincrement=True)
    record_type = Column(String(50), nullable=False, index=True)  # sentiment_summary, trend, etc.
    asset = Column(String(50), nullable=True, index=True)  # Asset symbol (e.g., 'XLM', 'BTC')
    metric_name = Column(String(100), nullable=False)  # e.g., 'sentiment_score', 'volume'
    window = Column(String(20), nullable=True)  # e.g., '1h', '24h', '7d'

    # Metric values
    value = Column(Float, nullable=False)
    previous_value = Column(Float, nullable=True)
    change_percentage = Column(Float, nullable=True)
    trend_direction = Column(String(20), nullable=True)  # up/down/stable

    # Additional data
    extra_data = Column(JSON, nullable=True)  # Additional metadata

    # Timestamps
    timestamp = Column(DateTime(timezone=True), nullable=False, index=True)
    created_at = Column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )

    # Indexes for efficient querying
    # NOTE(review): record_type, asset and timestamp also carry index=True
    # above, so each gets a second single-column index. Left as-is because
    # removing one changes the schema.
    __table_args__ = (
        Index("idx_analytics_records_type", "record_type"),
        Index("idx_analytics_records_asset", "asset"),
        Index("idx_analytics_records_timestamp", "timestamp"),
        Index("idx_analytics_records_type_asset", "record_type", "asset"),
        Index("idx_analytics_records_asset_metric", "asset", "metric_name"),
    )

    def __repr__(self):
        # Show what the metric is and its value instead of an empty string.
        return (
            f"<AnalyticsRecord(record_type={self.record_type!r}, "
            f"asset={self.asset!r}, "
            f"metric_name={self.metric_name!r}, "
            f"value={self.value!r})>"
        )
class AssetTrend(Base):
    """
    Stores calculated trends for assets and metrics (legacy table, kept for backward compatibility)
    """

    __tablename__ = "asset_trends"

    id = Column(Integer, primary_key=True, autoincrement=True)
    asset = Column(String(50), nullable=False, index=True)  # e.g., 'XLM', 'BTC'
    metric_name = Column(String(100), nullable=False)  # e.g., 'sentiment_score', 'volume'
    window = Column(String(20), nullable=False)  # e.g., '1h', '24h', '7d'

    # Trend data
    trend_direction = Column(String(20), nullable=False)  # up/down/stable
    score = Column(Float, nullable=False)  # trend score/strength
    current_value = Column(Float, nullable=False)
    previous_value = Column(Float, nullable=False)
    change_percentage = Column(Float, nullable=False)

    # Additional data (renamed from metadata to avoid SQLAlchemy conflict)
    extra_data = Column(JSON, nullable=True)  # Additional trend metadata

    # Timestamps
    timestamp = Column(
        DateTime(timezone=True), server_default=func.now(), nullable=False, index=True
    )
    created_at = Column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )

    # Indexes for efficient querying
    # NOTE(review): timestamp also has index=True above, so it is indexed
    # twice. Left unchanged because removing either index alters the schema.
    __table_args__ = (
        Index("idx_asset_trends_asset_metric", "asset", "metric_name"),
        Index("idx_asset_trends_timestamp", "timestamp"),
        Index("idx_asset_trends_window", "window"),
    )

    def __repr__(self):
        # Show which trend this row describes instead of an empty string.
        return (
            f"<AssetTrend(asset={self.asset!r}, "
            f"metric_name={self.metric_name!r}, "
            f"window={self.window!r}, "
            f"trend_direction={self.trend_direction!r})>"
        )
If None, reads from environment + """ + self.database_url = database_url or os.getenv( + "DATABASE_URL", "postgresql://postgres:postgres@localhost:5432/lumenpulse" + ) + + try: + self.engine = create_engine( + self.database_url, + pool_pre_ping=True, # Verify connections before using + pool_size=5, + max_overflow=10, + echo=False, # Set to True for SQL query logging + ) + self.SessionLocal = sessionmaker( + autocommit=False, + autoflush=False, + expire_on_commit=False, + bind=self.engine, + ) + self.ner_service = NERService() + self.entity_linker = EntityLinker() + logger.info("PostgreSQL service initialized successfully") + except Exception as e: + logger.error(f"Failed to initialize PostgreSQL service: {e}") + raise + + def _ensure_detected_entities(self, article_data: Dict[str, Any]) -> Dict[str, Any]: + """Populate detected_entities when absent using the NER service.""" + normalized = dict(article_data) + existing_entities = normalized.get("detected_entities") + if isinstance(existing_entities, list) and existing_entities: + return normalized + + normalized["detected_entities"] = self.ner_service.extract_entities_from_article( + title=normalized.get("title"), + summary=normalized.get("summary"), + content=normalized.get("content"), + ) + return normalized + + @contextmanager + def get_session(self): + """ + Context manager for database sessions + + Yields: + Session: SQLAlchemy session + """ + session = self.SessionLocal() + try: + yield session + session.commit() + except Exception as e: + session.rollback() + logger.error(f"Session error: {e}") + raise + finally: + session.close() + + def _retry_operation(self, operation, max_retries=3, retry_delay=1.0): + """ + Retry a database operation with exponential backoff + + Args: + operation: Callable to execute + max_retries: Maximum number of retry attempts + retry_delay: Initial delay between retries (doubles each retry) + + Returns: + Result of the operation + + Raises: + Exception: If all retries fail + """ + 
last_exception = None + for attempt in range(max_retries): + try: + return operation() + except OperationalError as e: + last_exception = e + if attempt < max_retries - 1: + wait_time = retry_delay * (2 ** attempt) # Exponential backoff + logger.warning( + f"Database operation failed (attempt {attempt + 1}/{max_retries}): {e}. " + f"Retrying in {wait_time:.1f}s..." + ) + time.sleep(wait_time) + else: + logger.error(f"Database operation failed after {max_retries} attempts: {e}") + raise + except SQLAlchemyError as e: + # Non-retryable errors + logger.error(f"Database operation failed with non-retryable error: {e}") + raise + raise last_exception + + def create_tables(self): + """ + Create all tables in the database + """ + try: + Base.metadata.create_all(bind=self.engine) + logger.info("Database tables created successfully") + except Exception as e: + logger.error(f"Failed to create tables: {e}") + raise + + def drop_tables(self): + """ + Drop all tables (use with caution!) + """ + try: + Base.metadata.drop_all(bind=self.engine) + logger.warning("All database tables dropped") + except Exception as e: + logger.error(f"Failed to drop tables: {e}") + raise + + # Article Methods + + def save_article( + self, + article_data: Dict[str, Any], + sentiment_result: Optional[Dict[str, Any]] = None, + ) -> Optional[Article]: + """ + Save an article with optional sentiment analysis and entity linking + + Args: + article_data: Article data dictionary + sentiment_result: Optional sentiment analysis result + + Returns: + Article object if successful, None otherwise + """ + article_data = self._ensure_detected_entities(article_data) + + # Link entities + linked_entities = self.entity_linker.link_article( + title=article_data.get("title"), + summary=article_data.get("summary"), + content=article_data.get("content") + ) + + # Prepare structured linked entities for the article + structured_linked_entities = [ + { + "stable_id": e.stable_id, + "type": e.entity_type, + "name": e.name, + 
"ticker": getattr(e, 'ticker', None), + "confidence": getattr(e, 'confidence', None) + } + for e in linked_entities + ] + article_data["linked_entities"] = structured_linked_entities + + def _save(): + with self.get_session() as session: + # Check if article already exists + existing = session.execute( + select(Article).where(Article.article_id == article_data.get("id")) + ).scalar_one_or_none() + + if existing: + # Update existing article + existing.title = article_data.get("title", existing.title) + existing.content = article_data.get("content", existing.content) + existing.summary = article_data.get("summary", existing.summary) + existing.source = article_data.get("source", existing.source) + existing.url = article_data.get("url", existing.url) + existing.asset_codes = article_data.get("asset_codes", existing.asset_codes) + existing.primary_asset = article_data.get("primary_asset", existing.primary_asset) + existing.categories = article_data.get("categories", existing.categories) + existing.keywords = article_data.get("keywords", existing.keywords) + existing.detected_entities = article_data.get("detected_entities", existing.detected_entities) + existing.linked_entities = article_data.get("linked_entities", existing.linked_entities) + existing.language = article_data.get("language", existing.language) + existing.published_at = article_data.get("published_at", existing.published_at) + existing.fetched_at = article_data.get("fetched_at", existing.fetched_at) + + if sentiment_result: + existing.sentiment_score = sentiment_result.get("compound_score") + existing.positive_score = sentiment_result.get("positive") + existing.negative_score = sentiment_result.get("negative") + existing.neutral_score = sentiment_result.get("neutral") + existing.sentiment_label = sentiment_result.get("sentiment_label") + existing.analyzed_at = datetime.utcnow() + + session.flush() + logger.debug(f"Updated article: {existing.article_id}") + return existing + else: + # Create new article + 
article = Article( + article_id=article_data.get("id"), + title=article_data.get("title", ""), + content=article_data.get("content"), + summary=article_data.get("summary"), + source=article_data.get("source"), + url=article_data.get("url"), + asset_codes=article_data.get("asset_codes"), + primary_asset=article_data.get("primary_asset"), + categories=article_data.get("categories"), + keywords=article_data.get("keywords"), + detected_entities=article_data.get("detected_entities"), + linked_entities=article_data.get("linked_entities"), + language=article_data.get("language"), + published_at=article_data.get("published_at"), + fetched_at=article_data.get("fetched_at"), + ) + + if sentiment_result: + article.sentiment_score = sentiment_result.get("compound_score") + article.positive_score = sentiment_result.get("positive") + article.negative_score = sentiment_result.get("negative") + article.neutral_score = sentiment_result.get("neutral") + article.sentiment_label = sentiment_result.get("sentiment_label") + article.analyzed_at = datetime.utcnow() + + session.add(article) + session.flush() + logger.debug(f"Saved article: {article.article_id}") + return article + + try: + article = self._retry_operation(_save) + if article: + # Link entities in the database + self.link_article_to_entities(article.article_id, linked_entities) + return article + except SQLAlchemyError as e: + logger.error(f"Failed to save article: {e}") + return None + + def save_articles_batch( + self, + articles_data: List[Dict[str, Any]], + sentiment_results: Optional[List[Dict[str, Any]]] = None, + ) -> int: + """ + Save multiple articles in a batch + + Args: + articles_data: List of article data dictionaries + sentiment_results: Optional list of sentiment analysis results + + Returns: + Number of articles saved + """ + saved_count = 0 + try: + with self.get_session() as session: + for i, article_data in enumerate(articles_data): + article_data = self._ensure_detected_entities(article_data) + 
sentiment_result = sentiment_results[i] if sentiment_results and i < len(sentiment_results) else None + + # Check if article already exists + existing = session.execute( + select(Article).where(Article.article_id == article_data.get("id")) + ).scalar_one_or_none() + + if existing: + # Update existing article + existing.title = article_data.get("title", existing.title) + existing.content = article_data.get("content", existing.content) + existing.summary = article_data.get("summary", existing.summary) + existing.source = article_data.get("source", existing.source) + existing.url = article_data.get("url", existing.url) + existing.asset_codes = article_data.get("asset_codes", existing.asset_codes) + existing.primary_asset = article_data.get("primary_asset", existing.primary_asset) + existing.categories = article_data.get("categories", existing.categories) + existing.keywords = article_data.get("keywords", existing.keywords) + existing.detected_entities = article_data.get("detected_entities", existing.detected_entities) + existing.language = article_data.get("language", existing.language) + existing.published_at = article_data.get("published_at", existing.published_at) + existing.fetched_at = article_data.get("fetched_at", existing.fetched_at) + + if sentiment_result: + existing.sentiment_score = sentiment_result.get("compound_score") + existing.positive_score = sentiment_result.get("positive") + existing.negative_score = sentiment_result.get("negative") + existing.neutral_score = sentiment_result.get("neutral") + existing.sentiment_label = sentiment_result.get("sentiment_label") + existing.analyzed_at = datetime.utcnow() + else: + # Create new article + article = Article( + article_id=article_data.get("id"), + title=article_data.get("title", ""), + content=article_data.get("content"), + summary=article_data.get("summary"), + source=article_data.get("source"), + url=article_data.get("url"), + asset_codes=article_data.get("asset_codes"), + 
primary_asset=article_data.get("primary_asset"), + categories=article_data.get("categories"), + keywords=article_data.get("keywords"), + detected_entities=article_data.get("detected_entities"), + language=article_data.get("language"), + published_at=article_data.get("published_at"), + fetched_at=article_data.get("fetched_at"), + ) + + if sentiment_result: + article.sentiment_score = sentiment_result.get("compound_score") + article.positive_score = sentiment_result.get("positive") + article.negative_score = sentiment_result.get("negative") + article.neutral_score = sentiment_result.get("neutral") + article.sentiment_label = sentiment_result.get("sentiment_label") + article.analyzed_at = datetime.utcnow() + + session.add(article) + + saved_count += 1 + + logger.info(f"Saved {saved_count} articles") + except SQLAlchemyError as e: + logger.error(f"Failed to save articles batch: {e}") + + return saved_count + + def get_recent_articles( + self, + limit: int = 100, + hours: int = 24, + asset: Optional[str] = None, + entity: Optional[str] = None, + ) -> List[Article]: + """ + Get recent articles + + Args: + limit: Maximum number of results + hours: Time window in hours + asset: Optional asset filter + entity: Optional NER entity filter + + Returns: + List of Article objects + """ + try: + with self.get_session() as session: + cutoff_time = datetime.utcnow() - timedelta(hours=hours) + stmt = ( + select(Article) + .where(Article.published_at >= cutoff_time) + .order_by(desc(Article.published_at)) + .limit(limit * 5 if entity else limit) + ) + + if asset: + stmt = stmt.where(Article.primary_asset == asset) + + results = session.execute(stmt).scalars().all() + if entity: + target = entity.strip().lower() + results = [ + article + for article in results + if any( + str(value).strip().lower() == target + for value in (article.detected_entities or []) + ) + ][:limit] + logger.debug(f"Retrieved {len(results)} articles") + return results + except SQLAlchemyError as e: + 
logger.error(f"Failed to retrieve articles: {e}") + return [] + + # Social Post Methods + + def save_social_post( + self, + post_data: Dict[str, Any], + sentiment_result: Optional[Dict[str, Any]] = None, + ) -> Optional[SocialPost]: + """ + Save a social media post with optional sentiment analysis + + Args: + post_data: Social post data dictionary + sentiment_result: Optional sentiment analysis result + + Returns: + SocialPost object if successful, None otherwise + """ + def _save(): + with self.get_session() as session: + # Check if post already exists + existing = session.execute( + select(SocialPost).where(SocialPost.post_id == post_data.get("id")) + ).scalar_one_or_none() + + if existing: + # Update existing post + existing.content = post_data.get("content", existing.content) + existing.author = post_data.get("author", existing.author) + existing.url = post_data.get("url", existing.url) + existing.likes = post_data.get("likes", existing.likes) + existing.comments = post_data.get("comments", existing.comments) + existing.shares = post_data.get("shares", existing.shares) + existing.asset_codes = post_data.get("asset_codes", existing.asset_codes) + existing.primary_asset = post_data.get("primary_asset", existing.primary_asset) + existing.hashtags = post_data.get("hashtags", existing.hashtags) + existing.subreddit = post_data.get("subreddit", existing.subreddit) + existing.posted_at = post_data.get("posted_at", existing.posted_at) + existing.fetched_at = post_data.get("fetched_at", existing.fetched_at) + + if sentiment_result: + existing.sentiment_score = sentiment_result.get("compound_score") + existing.positive_score = sentiment_result.get("positive") + existing.negative_score = sentiment_result.get("negative") + existing.neutral_score = sentiment_result.get("neutral") + existing.sentiment_label = sentiment_result.get("sentiment_label") + existing.analyzed_at = datetime.utcnow() + + session.flush() + logger.debug(f"Updated social post: {existing.post_id}") + 
return existing + else: + # Create new post + post = SocialPost( + post_id=post_data.get("id"), + platform=post_data.get("platform", "unknown"), + content=post_data.get("content", ""), + author=post_data.get("author"), + url=post_data.get("url"), + likes=post_data.get("likes", 0), + comments=post_data.get("comments", 0), + shares=post_data.get("shares", 0), + asset_codes=post_data.get("asset_codes"), + primary_asset=post_data.get("primary_asset"), + hashtags=post_data.get("hashtags"), + subreddit=post_data.get("subreddit"), + posted_at=post_data.get("posted_at"), + fetched_at=post_data.get("fetched_at"), + ) + + if sentiment_result: + post.sentiment_score = sentiment_result.get("compound_score") + post.positive_score = sentiment_result.get("positive") + post.negative_score = sentiment_result.get("negative") + post.neutral_score = sentiment_result.get("neutral") + post.sentiment_label = sentiment_result.get("sentiment_label") + post.analyzed_at = datetime.utcnow() + + session.add(post) + session.flush() + logger.debug(f"Saved social post: {post.post_id}") + return post + + try: + return self._retry_operation(_save) + except SQLAlchemyError as e: + logger.error(f"Failed to save social post: {e}") + return None + + def save_social_posts_batch( + self, + posts_data: List[Dict[str, Any]], + sentiment_results: Optional[List[Dict[str, Any]]] = None, + ) -> int: + """ + Save multiple social posts in a batch + + Args: + posts_data: List of social post data dictionaries + sentiment_results: Optional list of sentiment analysis results + + Returns: + Number of posts saved + """ + saved_count = 0 + try: + with self.get_session() as session: + for i, post_data in enumerate(posts_data): + sentiment_result = sentiment_results[i] if sentiment_results and i < len(sentiment_results) else None + + # Check if post already exists + existing = session.execute( + select(SocialPost).where(SocialPost.post_id == post_data.get("id")) + ).scalar_one_or_none() + + if existing: + # Update 
existing post + existing.content = post_data.get("content", existing.content) + existing.author = post_data.get("author", existing.author) + existing.url = post_data.get("url", existing.url) + existing.likes = post_data.get("likes", existing.likes) + existing.comments = post_data.get("comments", existing.comments) + existing.shares = post_data.get("shares", existing.shares) + existing.asset_codes = post_data.get("asset_codes", existing.asset_codes) + existing.primary_asset = post_data.get("primary_asset", existing.primary_asset) + existing.hashtags = post_data.get("hashtags", existing.hashtags) + existing.subreddit = post_data.get("subreddit", existing.subreddit) + existing.posted_at = post_data.get("posted_at", existing.posted_at) + existing.fetched_at = post_data.get("fetched_at", existing.fetched_at) + + if sentiment_result: + existing.sentiment_score = sentiment_result.get("compound_score") + existing.positive_score = sentiment_result.get("positive") + existing.negative_score = sentiment_result.get("negative") + existing.neutral_score = sentiment_result.get("neutral") + existing.sentiment_label = sentiment_result.get("sentiment_label") + existing.analyzed_at = datetime.utcnow() + else: + # Create new post + post = SocialPost( + post_id=post_data.get("id"), + platform=post_data.get("platform", "unknown"), + content=post_data.get("content", ""), + author=post_data.get("author"), + url=post_data.get("url"), + likes=post_data.get("likes", 0), + comments=post_data.get("comments", 0), + shares=post_data.get("shares", 0), + asset_codes=post_data.get("asset_codes"), + primary_asset=post_data.get("primary_asset"), + hashtags=post_data.get("hashtags"), + subreddit=post_data.get("subreddit"), + posted_at=post_data.get("posted_at"), + fetched_at=post_data.get("fetched_at"), + ) + + if sentiment_result: + post.sentiment_score = sentiment_result.get("compound_score") + post.positive_score = sentiment_result.get("positive") + post.negative_score = 
sentiment_result.get("negative") + post.neutral_score = sentiment_result.get("neutral") + post.sentiment_label = sentiment_result.get("sentiment_label") + post.analyzed_at = datetime.utcnow() + + session.add(post) + + saved_count += 1 + + logger.info(f"Saved {saved_count} social posts") + except SQLAlchemyError as e: + logger.error(f"Failed to save social posts batch: {e}") + + return saved_count + + def get_recent_social_posts( + self, + limit: int = 100, + hours: int = 24, + platform: Optional[str] = None, + asset: Optional[str] = None, + ) -> List[SocialPost]: + """ + Get recent social posts + + Args: + limit: Maximum number of results + hours: Time window in hours + platform: Optional platform filter + asset: Optional asset filter + + Returns: + List of SocialPost objects + """ + try: + with self.get_session() as session: + cutoff_time = datetime.utcnow() - timedelta(hours=hours) + stmt = ( + select(SocialPost) + .where(SocialPost.posted_at >= cutoff_time) + .order_by(desc(SocialPost.posted_at)) + .limit(limit) + ) + + if platform: + stmt = stmt.where(SocialPost.platform == platform) + if asset: + stmt = stmt.where(SocialPost.primary_asset == asset) + + results = session.execute(stmt).scalars().all() + logger.debug(f"Retrieved {len(results)} social posts") + return results + except SQLAlchemyError as e: + logger.error(f"Failed to retrieve social posts: {e}") + return [] + + # Analytics Record Methods + + def save_analytics_record( + self, + record_type: str, + metric_name: str, + value: float, + asset: Optional[str] = None, + window: Optional[str] = None, + previous_value: Optional[float] = None, + change_percentage: Optional[float] = None, + trend_direction: Optional[str] = None, + extra_data: Optional[Dict[str, Any]] = None, + timestamp: Optional[datetime] = None, + ) -> Optional[AnalyticsRecord]: + """ + Save an analytics record + + Args: + record_type: Type of record (e.g., 'sentiment_summary', 'trend') + metric_name: Metric name (e.g., 'sentiment_score', 
'volume') + value: Metric value + asset: Optional asset symbol + window: Optional time window + previous_value: Optional previous value + change_percentage: Optional change percentage + trend_direction: Optional trend direction + extra_data: Optional additional metadata + timestamp: Optional timestamp (defaults to now) + + Returns: + AnalyticsRecord object if successful, None otherwise + """ + def _save(): + with self.get_session() as session: + record = AnalyticsRecord( + record_type=record_type, + metric_name=metric_name, + value=value, + asset=asset, + window=window, + previous_value=previous_value, + change_percentage=change_percentage, + trend_direction=trend_direction, + extra_data=extra_data, + timestamp=timestamp or datetime.utcnow(), + ) + session.add(record) + session.flush() + logger.debug(f"Saved analytics record: {record_type}/{metric_name}") + return record + + try: + return self._retry_operation(_save) + except SQLAlchemyError as e: + logger.error(f"Failed to save analytics record: {e}") + return None + + def save_analytics_records_batch( + self, + records_data: List[Dict[str, Any]], + ) -> int: + """ + Save multiple analytics records in a batch + + Args: + records_data: List of analytics record data dictionaries + + Returns: + Number of records saved + """ + saved_count = 0 + try: + with self.get_session() as session: + for record_data in records_data: + record = AnalyticsRecord( + record_type=record_data.get("record_type"), + metric_name=record_data.get("metric_name"), + value=record_data.get("value"), + asset=record_data.get("asset"), + window=record_data.get("window"), + previous_value=record_data.get("previous_value"), + change_percentage=record_data.get("change_percentage"), + trend_direction=record_data.get("trend_direction"), + extra_data=record_data.get("extra_data"), + timestamp=record_data.get("timestamp", datetime.utcnow()), + ) + session.add(record) + saved_count += 1 + + logger.info(f"Saved {saved_count} analytics records") + except 
SQLAlchemyError as e: + logger.error(f"Failed to save analytics records batch: {e}") + + return saved_count + + def get_analytics_records( + self, + record_type: Optional[str] = None, + asset: Optional[str] = None, + metric_name: Optional[str] = None, + hours: int = 24, + limit: int = 100, + ) -> List[AnalyticsRecord]: + """ + Get analytics records + + Args: + record_type: Optional record type filter + asset: Optional asset filter + metric_name: Optional metric name filter + hours: Time window in hours + limit: Maximum number of results + + Returns: + List of AnalyticsRecord objects + """ + try: + with self.get_session() as session: + cutoff_time = datetime.utcnow() - timedelta(hours=hours) + stmt = ( + select(AnalyticsRecord) + .where(AnalyticsRecord.timestamp >= cutoff_time) + .order_by(desc(AnalyticsRecord.timestamp)) + .limit(limit) + ) + + if record_type: + stmt = stmt.where(AnalyticsRecord.record_type == record_type) + if asset: + stmt = stmt.where(AnalyticsRecord.asset == asset) + if metric_name: + stmt = stmt.where(AnalyticsRecord.metric_name == metric_name) + + results = session.execute(stmt).scalars().all() + logger.debug(f"Retrieved {len(results)} analytics records") + return results + except SQLAlchemyError as e: + logger.error(f"Failed to retrieve analytics records: {e}") + return [] + + # Legacy News Insights Methods (kept for backward compatibility) + + def save_news_insight( + self, + sentiment_result: Dict[str, Any], + article_data: Optional[Dict[str, Any]] = None, + ) -> Optional[NewsInsight]: + """ + Save a news sentiment analysis result + + Args: + sentiment_result: Sentiment analysis result dictionary + article_data: Optional article metadata + + Returns: + NewsInsight object if successful, None otherwise + """ + try: + with self.get_session() as session: + insight = NewsInsight( + article_id=article_data.get("id") if article_data else None, + article_title=article_data.get("title") if article_data else None, + 
article_url=article_data.get("url") if article_data else None, + source=article_data.get("source") if article_data else None, + sentiment_score=sentiment_result["compound_score"], + positive_score=sentiment_result["positive"], + negative_score=sentiment_result["negative"], + neutral_score=sentiment_result["neutral"], + sentiment_label=sentiment_result["sentiment_label"], + keywords=article_data.get("keywords") if article_data else None, + language=article_data.get("language") if article_data else None, + article_published_at=( + article_data.get("published_at") if article_data else None + ), + ) + session.add(insight) + session.flush() + logger.debug(f"Saved news insight: {insight.id}") + return insight + except SQLAlchemyError as e: + logger.error(f"Failed to save news insight: {e}") + return None + + def save_news_insights_batch( + self, sentiment_results: List[Dict[str, Any]], articles_data: List[Dict[str, Any]] = None + ) -> int: + """ + Save multiple news insights in a batch + + Args: + sentiment_results: List of sentiment analysis results + articles_data: Optional list of article metadata + + Returns: + Number of insights saved + """ + saved_count = 0 + try: + with self.get_session() as session: + for i, result in enumerate(sentiment_results): + article_data = articles_data[i] if articles_data and i < len(articles_data) else None + + insight = NewsInsight( + article_id=article_data.get("id") if article_data else None, + article_title=article_data.get("title") if article_data else None, + article_url=article_data.get("url") if article_data else None, + source=article_data.get("source") if article_data else None, + sentiment_score=result["compound_score"], + positive_score=result["positive"], + negative_score=result["negative"], + neutral_score=result["neutral"], + sentiment_label=result["sentiment_label"], + keywords=article_data.get("keywords") if article_data else None, + language=article_data.get("language") if article_data else None, + 
article_published_at=( + article_data.get("published_at") if article_data else None + ), + ) + session.add(insight) + saved_count += 1 + + logger.info(f"Saved {saved_count} news insights") + except SQLAlchemyError as e: + logger.error(f"Failed to save news insights batch: {e}") + + return saved_count + + def get_recent_news_insights( + self, limit: int = 100, hours: int = 24 + ) -> List[NewsInsight]: + """ + Get recent news insights + + Args: + limit: Maximum number of results + hours: Time window in hours + + Returns: + List of NewsInsight objects + """ + try: + with self.get_session() as session: + cutoff_time = datetime.utcnow() - timedelta(hours=hours) + stmt = ( + select(NewsInsight) + .where(NewsInsight.analyzed_at >= cutoff_time) + .order_by(desc(NewsInsight.analyzed_at)) + .limit(limit) + ) + results = session.execute(stmt).scalars().all() + logger.debug(f"Retrieved {len(results)} news insights") + return results + except SQLAlchemyError as e: + logger.error(f"Failed to retrieve news insights: {e}") + return [] + + # Legacy Asset Trends Methods (kept for backward compatibility) + + def save_asset_trend( + self, + asset: str, + metric_name: str, + window: str, + trend_data: Dict[str, Any], + ) -> Optional[AssetTrend]: + """ + Save an asset trend + + Args: + asset: Asset symbol (e.g., 'XLM') + metric_name: Metric name (e.g., 'sentiment_score') + window: Time window (e.g., '24h') + trend_data: Trend data dictionary + + Returns: + AssetTrend object if successful, None otherwise + """ + try: + with self.get_session() as session: + trend = AssetTrend( + asset=asset, + metric_name=metric_name, + window=window, + trend_direction=trend_data["trend_direction"], + score=trend_data.get("score", 0.0), + current_value=trend_data["current_value"], + previous_value=trend_data["previous_value"], + change_percentage=trend_data["change_percentage"], + extra_data=trend_data.get("extra_data") or trend_data.get("metadata"), + ) + session.add(trend) + session.flush() + 
logger.debug(f"Saved asset trend: {asset}/{metric_name}") + return trend + except SQLAlchemyError as e: + logger.error(f"Failed to save asset trend: {e}") + return None + + def save_asset_trends_batch( + self, asset: str, window: str, trends: List[Dict[str, Any]] + ) -> int: + """ + Save multiple asset trends in a batch + + Args: + asset: Asset symbol + window: Time window + trends: List of trend dictionaries + + Returns: + Number of trends saved + """ + saved_count = 0 + try: + with self.get_session() as session: + for trend_data in trends: + trend = AssetTrend( + asset=asset, + metric_name=trend_data["metric_name"], + window=window, + trend_direction=trend_data["trend_direction"], + score=trend_data.get("score", 0.0), + current_value=trend_data["current_value"], + previous_value=trend_data["previous_value"], + change_percentage=trend_data["change_percentage"], + extra_data=trend_data.get("extra_data") or trend_data.get("metadata"), + ) + session.add(trend) + saved_count += 1 + + logger.info(f"Saved {saved_count} asset trends for {asset}") + except SQLAlchemyError as e: + logger.error(f"Failed to save asset trends batch: {e}") + + return saved_count + + def get_recent_asset_trends( + self, asset: str, metric_name: Optional[str] = None, limit: int = 100 + ) -> List[AssetTrend]: + """ + Get recent asset trends + + Args: + asset: Asset symbol + metric_name: Optional metric name filter + limit: Maximum number of results + + Returns: + List of AssetTrend objects + """ + try: + with self.get_session() as session: + stmt = select(AssetTrend).where(AssetTrend.asset == asset) + + if metric_name: + stmt = stmt.where(AssetTrend.metric_name == metric_name) + + stmt = stmt.order_by(desc(AssetTrend.timestamp)).limit(limit) + + results = session.execute(stmt).scalars().all() + logger.debug(f"Retrieved {len(results)} asset trends for {asset}") + return results + except SQLAlchemyError as e: + logger.error(f"Failed to retrieve asset trends: {e}") + return [] + + def 
upsert_on_chain_entity(self, stable_id: str, entity_type: str, name: str, ticker: Optional[str] = None, extra_data: Optional[Dict] = None) -> OnChainEntity: + """ + Upsert an on-chain entity (create if not exists, update if exists). + + Args: + stable_id: Stable unique ID for the entity + entity_type: "project" or "asset" + name: Human-readable name + ticker: Optional asset ticker + extra_data: Optional additional metadata + + Returns: + The OnChainEntity object + """ + def _upsert(): + with self.get_session() as session: + existing = session.execute( + select(OnChainEntity).where(OnChainEntity.stable_id == stable_id) + ).scalar_one_or_none() + + if existing: + existing.name = name + existing.ticker = ticker or existing.ticker + existing.extra_data = extra_data or existing.extra_data + session.flush() + return existing + else: + entity = OnChainEntity( + stable_id=stable_id, + entity_type=entity_type, + name=name, + ticker=ticker, + extra_data=extra_data + ) + session.add(entity) + session.flush() + return entity + return self._retry_operation(_upsert) + + def link_article_to_entities(self, article_id: str, linked_entities: List) -> None: + """ + Link an article to on-chain entities. 
+ + Args: + article_id: The article's unique ID + linked_entities: List of LinkedEntity objects + """ + def _link(): + with self.get_session() as session: + for entity in linked_entities: + # Upsert the entity first + self.upsert_on_chain_entity( + stable_id=entity.stable_id, + entity_type=entity.entity_type, + name=entity.name, + ticker=getattr(entity, 'ticker', None) + ) + + # Check if link already exists + existing_link = session.execute( + select(ArticleEntityLink).where( + and_( + ArticleEntityLink.article_id == article_id, + ArticleEntityLink.entity_stable_id == entity.stable_id + ) + ) + ).scalar_one_or_none() + + if not existing_link: + link = ArticleEntityLink( + article_id=article_id, + entity_stable_id=entity.stable_id, + confidence=getattr(entity, 'confidence', None) + ) + session.add(link) + self._retry_operation(_link) + + def get_article_linked_entities(self, article_id: str) -> List[Dict]: + """ + Get all entities linked to an article. + + Args: + article_id: The article's unique ID + + Returns: + List of entity data dictionaries + """ + try: + with self.get_session() as session: + links = session.execute( + select(ArticleEntityLink).where(ArticleEntityLink.article_id == article_id) + ).scalars().all() + + entities = [] + for link in links: + entity = session.execute( + select(OnChainEntity).where(OnChainEntity.stable_id == link.entity_stable_id) + ).scalar_one_or_none() + if entity: + entities.append({ + "stable_id": entity.stable_id, + "type": entity.entity_type, + "name": entity.name, + "ticker": entity.ticker, + "confidence": link.confidence + }) + return entities + except SQLAlchemyError as e: + logger.error(f"Failed to get linked entities for article {article_id}: {e}") + return [] + + def get_articles_for_entity(self, stable_id: str, limit: int = 100) -> List[Article]: + """ + Get all articles linked to a specific entity. 
+ + Args: + stable_id: The entity's stable ID + limit: Maximum number of articles to return + + Returns: + List of Article objects + """ + try: + with self.get_session() as session: + links = session.execute( + select(ArticleEntityLink).where(ArticleEntityLink.entity_stable_id == stable_id).limit(limit) + ).scalars().all() + + article_ids = [link.article_id for link in links] + articles = session.execute( + select(Article).where(Article.article_id.in_(article_ids)).order_by(desc(Article.published_at)) + ).scalars().all() + return articles + except SQLAlchemyError as e: + logger.error(f"Failed to get articles for entity {stable_id}: {e}") + return [] + + def measure_entity_linker_precision(self) -> Dict[str, float]: + """ + Measure and log the precision of the entity linker. + + Returns: + Precision metrics dictionary + """ + metrics = measure_precision(self.entity_linker) + logger.info("Entity Linker Precision Metrics:") + logger.info(f" Precision: {metrics['precision']:.4f}") + logger.info(f" Recall: {metrics['recall']:.4f}") + logger.info(f" F1 Score: {metrics['f1']:.4f}") + logger.info(f" Test Cases: {metrics['test_cases']}") + return metrics + + def get_sentiment_summary(self, hours: int = 24) -> Dict[str, Any]: + """ + Get sentiment summary statistics + Args: + hours: Time window in hours + + Returns: + Summary statistics dictionary + """ + try: + with self.get_session() as session: + cutoff_time = datetime.utcnow() - timedelta(hours=hours) + + insights = session.execute( + select(NewsInsight).where(NewsInsight.analyzed_at >= cutoff_time) + ).scalars().all() + + if not insights: + return { + "total_articles": 0, + "average_sentiment": 0.0, + "positive_count": 0, + "negative_count": 0, + "neutral_count": 0, + } + + total = len(insights) + avg_sentiment = sum(i.sentiment_score for i in insights) / total + positive = sum(1 for i in insights if i.sentiment_label == "positive") + negative = sum(1 for i in insights if i.sentiment_label == "negative") + neutral = 
sum(1 for i in insights if i.sentiment_label == "neutral") + + return { + "total_articles": total, + "average_sentiment": round(avg_sentiment, 4), + "positive_count": positive, + "negative_count": negative, + "neutral_count": neutral, + "positive_percentage": round(positive / total * 100, 2), + "negative_percentage": round(negative / total * 100, 2), + "neutral_percentage": round(neutral / total * 100, 2), + } + except SQLAlchemyError as e: + logger.error(f"Failed to get sentiment summary: {e}") + return {} + + def cleanup_old_data(self, days: int = 30) -> Dict[str, int]: + """ + Clean up old analytics data + + Args: + days: Number of days to keep + + Returns: + Dictionary with counts of deleted records + """ + try: + cutoff_date = datetime.utcnow() - timedelta(days=days) + deleted_counts = { + "articles": 0, + "social_posts": 0, + "analytics_records": 0, + "news_insights": 0, + "asset_trends": 0, + } + + with self.get_session() as session: + # Delete old articles + articles_deleted = session.query(Article).filter( + Article.created_at < cutoff_date + ).delete() + deleted_counts["articles"] = articles_deleted + + # Delete old social posts + posts_deleted = session.query(SocialPost).filter( + SocialPost.created_at < cutoff_date + ).delete() + deleted_counts["social_posts"] = posts_deleted + + # Delete old analytics records + records_deleted = session.query(AnalyticsRecord).filter( + AnalyticsRecord.created_at < cutoff_date + ).delete() + deleted_counts["analytics_records"] = records_deleted + + # Delete old news insights (legacy) + news_deleted = session.query(NewsInsight).filter( + NewsInsight.created_at < cutoff_date + ).delete() + deleted_counts["news_insights"] = news_deleted + + # Delete old asset trends (legacy) + trends_deleted = session.query(AssetTrend).filter( + AssetTrend.created_at < cutoff_date + ).delete() + deleted_counts["asset_trends"] = trends_deleted + + logger.info(f"Cleaned up old data: {deleted_counts}") + return deleted_counts + except 
SQLAlchemyError as e: + logger.error(f"Failed to cleanup old data: {e}") + return { + "articles": 0, + "social_posts": 0, + "analytics_records": 0, + "news_insights": 0, + "asset_trends": 0, + } diff --git a/temp_backup/src/fetchers.py b/temp_backup/src/fetchers.py new file mode 100644 index 00000000..237a1336 --- /dev/null +++ b/temp_backup/src/fetchers.py @@ -0,0 +1,116 @@ +""" +News fetcher module - fetches crypto/market news from various sources +""" + +import requests +import logging +from datetime import datetime, timedelta +from typing import List, Dict, Any + +logger = logging.getLogger(__name__) + + +class NewsItem: + """Data class representing a news item""" + + def __init__( + self, title: str, content: str, source: str, url: str, published_at: datetime + ): + self.title = title + self.content = content + self.source = source + self.url = url + self.published_at = published_at + self.fetched_at = datetime.utcnow() + + def to_dict(self) -> Dict[str, Any]: + return { + "title": self.title, + "content": self.content, + "source": self.source, + "url": self.url, + "published_at": self.published_at.isoformat(), + "fetched_at": self.fetched_at.isoformat(), + } + + +class NewsFetcher: + """Fetches news from cryptocurrency and market sources""" + + def __init__(self): + self.sources = { + "crypto_news": "https://api.coingecko.com/api/v3/news", + "mock_market": "https://jsonplaceholder.typicode.com/posts", + } + + def fetch_crypto_news(self) -> List[NewsItem]: + """Fetch crypto news from CoinGecko API""" + try: + response = requests.get(self.sources["crypto_news"], timeout=10) + response.raise_for_status() + data = response.json() + + news_items = [] + # CoinGecko news endpoint returns a 'data' array + for article in data.get("data", [])[:10]: # Limit to 10 articles + try: + news_item = NewsItem( + title=article.get("title", ""), + content=article.get("description", article.get("title", "")), + source="CoinGecko", + url=article.get("url", ""), + published_at=( + 
datetime.fromisoformat( + article.get( + "published_at", datetime.utcnow().isoformat() + ).replace("Z", "+00:00") + ) + if article.get("published_at") + else datetime.utcnow() + ), + ) + news_items.append(news_item) + except Exception as e: + logger.warning(f"Error processing article: {e}") + continue + + logger.info(f"Fetched {len(news_items)} crypto news items") + return news_items + except requests.exceptions.RequestException as e: + logger.error(f"Error fetching crypto news: {e}") + return [] + + def fetch_market_news(self) -> List[NewsItem]: + """Fetch market news from mock source""" + try: + response = requests.get(self.sources["mock_market"], timeout=10) + response.raise_for_status() + data = response.json() + + news_items = [] + for article in data[:10]: # Limit to 10 articles + news_item = NewsItem( + title=article.get("title", f"Post {article.get('id', 'N/A')}"), + content=article.get("body", ""), + source="Mock Market Feed", + url=f"https://example.com/news/{article.get('id')}", + published_at=datetime.utcnow() + - timedelta(hours=article.get("id", 1) % 24), + ) + news_items.append(news_item) + + logger.info(f"Fetched {len(news_items)} market news items") + return news_items + except requests.exceptions.RequestException as e: + logger.error(f"Error fetching market news: {e}") + return [] + + def fetch_all_news(self) -> List[NewsItem]: + """Fetch news from all sources""" + crypto_news = self.fetch_crypto_news() + market_news = self.fetch_market_news() + + all_news = crypto_news + market_news + logger.info(f"Total news items fetched: {len(all_news)}") + + return all_news diff --git a/temp_backup/src/ingestion/__init__.py b/temp_backup/src/ingestion/__init__.py new file mode 100644 index 00000000..2bff5394 --- /dev/null +++ b/temp_backup/src/ingestion/__init__.py @@ -0,0 +1,42 @@ +""" +Data ingestion module for fetching external data. 
class NewsDeduplicator:
    """
    Handles deduplication of news articles to prevent re-processing of the same content.
    Uses SHA-256 hashing of normalized content to identify duplicates.

    Seen hashes are kept in ``self.seen_hashes`` (hash -> timezone-aware UTC
    time first seen) and persisted as JSON at ``self.storage_path``.
    """

    def __init__(self, deduplication_window_days: int = 7, storage_path: str = "./data/deduplication.json"):
        """
        Initialize the deduplicator

        Args:
            deduplication_window_days: How many days back to check for duplicates
            storage_path: Path to store seen hashes
        """
        self.deduplication_window_days = deduplication_window_days
        self.storage_path = Path(storage_path)
        self.storage_path.parent.mkdir(parents=True, exist_ok=True)

        # Load existing hashes
        self.seen_hashes: Dict[str, datetime] = {}
        self._load_seen_hashes()

        # Sets self.cutoff_time and drops hashes older than the window.
        self._cleanup_old_hashes()

        logger.info(f"Initialized NewsDeduplicator with window of {deduplication_window_days} days")

    def _normalize_article(self, article: Dict) -> str:
        """
        Normalize article content for consistent hashing

        Args:
            article: Article dictionary to normalize

        Returns:
            Compact JSON string of the stripped, lower-cased title, content,
            url and source (missing or ``None`` fields count as empty).
        """
        canonical_data = {
            field: (article.get(field) or '').strip().lower()
            for field in ('title', 'content', 'url', 'source')
        }

        # sort_keys + fixed separators make the serialization deterministic.
        return json.dumps(canonical_data, sort_keys=True, separators=(',', ':'))

    def _compute_hash(self, article: Dict) -> str:
        """
        Compute SHA-256 hash for an article

        Args:
            article: Article dictionary to hash

        Returns:
            SHA-256 hash as hex string
        """
        normalized_content = self._normalize_article(article)
        return hashlib.sha256(normalized_content.encode('utf-8')).hexdigest()

    def _load_seen_hashes(self):
        """Load previously seen hashes from storage.

        Unreadable or malformed files reset ``seen_hashes`` to empty; single
        entries with an invalid timestamp are skipped with a warning.
        """
        if not self.storage_path.exists():
            return

        try:
            with open(self.storage_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (ValueError, OSError) as e:
            # ValueError covers json.JSONDecodeError and undecodable bytes.
            logger.error(f"Error loading seen hashes from {self.storage_path}: {e}")
            self.seen_hashes = {}
            return

        if not isinstance(data, dict):
            logger.error(
                f"Error loading seen hashes from {self.storage_path}: expected a JSON object"
            )
            self.seen_hashes = {}
            return

        for hash_str, timestamp_str in data.items():
            try:
                timestamp = datetime.fromisoformat(timestamp_str.replace('Z', '+00:00'))
            except (ValueError, AttributeError, TypeError):
                logger.warning(f"Invalid timestamp format for hash {hash_str}: {timestamp_str}")
                continue
            # Naive timestamps are assumed to be UTC; leaving them naive would
            # make the comparison against the aware cutoff raise TypeError.
            if timestamp.tzinfo is None:
                timestamp = timestamp.replace(tzinfo=timezone.utc)
            self.seen_hashes[hash_str] = timestamp

        logger.info(f"Loaded {len(self.seen_hashes)} previously seen hashes")

    def _save_seen_hashes(self):
        """Save seen hashes to storage as ``{hash: ISO-8601 timestamp}``."""
        try:
            data = {
                hash_str: timestamp.isoformat()
                for hash_str, timestamp in self.seen_hashes.items()
            }

            with open(self.storage_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2)

        except OSError as e:
            logger.error(f"Error saving seen hashes to {self.storage_path}: {e}")

    def _cleanup_old_hashes(self):
        """Remove hashes older than the deduplication window.

        The cutoff is recomputed on every call so that a long-lived instance
        keeps honouring the window instead of the time it was constructed.
        """
        self.cutoff_time = datetime.now(timezone.utc) - timedelta(days=self.deduplication_window_days)

        old_count = len(self.seen_hashes)
        self.seen_hashes = {
            hash_str: timestamp
            for hash_str, timestamp in self.seen_hashes.items()
            if timestamp > self.cutoff_time
        }
        removed_count = old_count - len(self.seen_hashes)

        if removed_count > 0:
            logger.info(f"Removed {removed_count} old hashes outside the {self.deduplication_window_days}-day window")

    def is_duplicate(self, article: Dict) -> bool:
        """
        Check if an article is a duplicate

        Args:
            article: Article to check

        Returns:
            True if the article is a duplicate, False otherwise
        """
        return self._compute_hash(article) in self.seen_hashes

    def mark_seen(self, article: Dict):
        """
        Mark an article as seen (add its hash to the seen set)

        Args:
            article: Article to mark as seen
        """
        self.seen_hashes[self._compute_hash(article)] = datetime.now(timezone.utc)

    def filter_duplicates(self, articles: List[Dict]) -> List[Dict]:
        """
        Filter out duplicate articles from a list

        Articles are checked in order, so a repeat inside ``articles`` itself
        is dropped as well. The updated hash set is persisted before returning.

        Args:
            articles: List of articles to filter

        Returns:
            List of articles with duplicates removed
        """
        # Expire entries that left the window since the last call.
        self._cleanup_old_hashes()

        filtered_articles = []
        duplicates_found = 0

        for article in articles:
            if self.is_duplicate(article):
                duplicates_found += 1
                continue
            self.mark_seen(article)
            filtered_articles.append(article)

        if duplicates_found > 0:
            logger.info(f"Filtered out {duplicates_found} duplicate articles")

        # Save updated hashes to storage
        self._save_seen_hashes()

        return filtered_articles

    def get_statistics(self) -> Dict:
        """
        Get statistics about the deduplication process

        Returns:
            Dictionary with deduplication statistics
        """
        return {
            'seen_hashes_count': len(self.seen_hashes),
            'deduplication_window_days': self.deduplication_window_days,
            'cutoff_time': self.cutoff_time.isoformat(),
            'storage_path': str(self.storage_path),
        }
class NewsFetcher:
    """
    Fetches cryptocurrency news from multiple APIs.

    All ``published_at`` values are timezone-aware UTC so that articles from
    different providers can be sorted together.

    Environment Variables Required:
    - CRYPTOCOMPARE_API_KEY: API key for CryptoCompare
    - NEWSAPI_API_KEY: API key for NewsAPI
    """

    def __init__(self, use_cryptocompare: bool = True, use_newsapi: bool = True):
        """
        Initialize NewsFetcher with API keys from environment.

        Args:
            use_cryptocompare: Whether to use CryptoCompare API
            use_newsapi: Whether to use NewsAPI

        Raises:
            ValueError: If an enabled service has no API key in the environment
        """
        self.use_cryptocompare = use_cryptocompare
        self.use_newsapi = use_newsapi

        # Load API keys from environment
        self.cryptocompare_key = os.getenv("CRYPTOCOMPARE_API_KEY")
        self.newsapi_key = os.getenv("NEWSAPI_API_KEY")

        # Validate API keys are available if services are enabled
        if use_cryptocompare and not self.cryptocompare_key:
            raise ValueError("CRYPTOCOMPARE_API_KEY environment variable not set")
        if use_newsapi and not self.newsapi_key:
            raise ValueError("NEWSAPI_API_KEY environment variable not set")

        # Session for connection pooling
        self.session = RobustHTTPClient()
        self.last_request_time = 0

        # Ids already returned by this instance (in-memory only)
        self.seen_articles = set()

        # Content-hash deduplication that persists across runs
        self.deduplicator = NewsDeduplicator(deduplication_window_days=7)

    @staticmethod
    def _as_utc(moment: datetime) -> datetime:
        """Return ``moment`` as aware UTC; naive values are assumed to be UTC."""
        from datetime import timezone

        if moment.tzinfo is None:
            return moment.replace(tzinfo=timezone.utc)
        return moment.astimezone(timezone.utc)

    @staticmethod
    def _stable_url_id(url: str) -> int:
        """Return a 32-bit id for ``url`` that is identical in every process.

        The built-in ``hash()`` is salted per interpreter run, so ids derived
        from it change between executions.
        """
        import zlib

        return zlib.crc32(url.encode("utf-8")) & 0xFFFFFFFF

    def _respect_rate_limit(self):
        """Ensure we respect rate limits by delaying if needed"""
        elapsed = time.time() - self.last_request_time
        if elapsed < APIConfig.RATE_LIMIT_DELAY:
            time.sleep(APIConfig.RATE_LIMIT_DELAY - elapsed)

        self.last_request_time = time.time()

    def _handle_api_error(self, response: requests.Response, api_name: str) -> None:
        """Handle API errors and raise appropriate exceptions

        Raises:
            PermissionError: On HTTP 401
            ConnectionError: On HTTP 429 or any 5xx status
            requests.HTTPError: For other 4xx statuses (via raise_for_status)
        """
        if response.status_code == 401:
            raise PermissionError(f"{api_name} API: Invalid API key")
        elif response.status_code == 429:
            raise ConnectionError(f"{api_name} API: Rate limit exceeded")
        elif response.status_code >= 500:
            raise ConnectionError(f"{api_name} API: Server error")
        else:
            response.raise_for_status()

    def _fetch_cryptocompare(self, limit: int) -> List[NewsArticle]:
        """Fetch news from CryptoCompare API

        Queries once per language (EN, ES, PT) and keeps up to ``limit``
        articles from each response.

        Raises:
            ValueError: If the API answers with a non-success ``Type``
        """
        from datetime import timezone

        articles = []

        try:
            headers = {"Authorization": f"Apikey {self.cryptocompare_key}"}

            for lang in ["EN", "ES", "PT"]:
                self._respect_rate_limit()

                params = {
                    "lang": lang,
                    "categories": "BTC,ETH,BLOCKCHAIN",
                    "excludeCategories": "Sponsored",
                }

                response = self.session.get(
                    APIConfig.CRYPTOCOMPARE_URL,
                    params=params,
                    headers=headers,
                    timeout=APIConfig.TIMEOUT,
                )

                if response.status_code != 200:
                    self._handle_api_error(response, "CryptoCompare")

                data = response.json()

                if data.get("Type") != 100:
                    raise ValueError(
                        f"CryptoCompare API returned error: {data.get('Message', 'Unknown error')}"
                    )

                # Parse articles
                for item in data.get("Data", [])[:limit]:
                    try:
                        raw_categories = item.get("categories")
                        raw_tags = item.get("tags")
                        article = NewsArticle(
                            id=f"cc_{item['id']}",
                            title=translate_and_normalize(item.get("title", "")),
                            content=translate_and_normalize(item.get("body", "")),
                            summary=translate_and_normalize(
                                item.get("short_description", "")
                            ),
                            source=item.get("source", "Unknown"),
                            url=item.get("url", ""),
                            # Epoch seconds -> aware UTC. A naive local-time
                            # value cannot be sorted against NewsAPI's aware
                            # timestamps.
                            published_at=datetime.fromtimestamp(
                                item.get("published_on") or 0, tz=timezone.utc
                            ),
                            categories=raw_categories.split("|") if raw_categories else [],
                            tags=raw_tags.split("|") if raw_tags else [],
                        )

                        # Avoid duplicates
                        if article.id not in self.seen_articles:
                            articles.append(article)
                            self.seen_articles.add(article.id)

                    except KeyError as e:
                        print(f"Warning: Missing key in CryptoCompare data: {e}")
                        continue

        except RequestException as e:
            print(f"Error fetching from CryptoCompare: {e}")
        except json.JSONDecodeError as e:
            print(f"Error parsing CryptoCompare JSON: {e}")

        return articles

    def _fetch_newsapi(self, limit: int) -> List[NewsArticle]:
        """Fetch news from NewsAPI

        Queries the last seven days once per language (en, es, pt) and keeps
        up to ``limit`` articles from each response.
        """
        articles = []

        try:
            # Calculate date range (last 7 days for recent news)
            to_date = datetime.now()
            from_date = datetime.fromtimestamp(to_date.timestamp() - (7 * 24 * 3600))

            for lang in ["en", "es", "pt"]:
                self._respect_rate_limit()

                params = {
                    "q": "cryptocurrency OR blockchain OR bitcoin OR ethereum",
                    "language": lang,
                    "sortBy": "publishedAt",
                    "pageSize": min(limit, 100),  # NewsAPI max is 100
                    "from": from_date.strftime("%Y-%m-%d"),
                    "to": to_date.strftime("%Y-%m-%d"),
                    "apiKey": self.newsapi_key,
                }

                response = self.session.get(
                    APIConfig.NEWSAPI_URL, params=params, timeout=APIConfig.TIMEOUT
                )

                if response.status_code != 200:
                    self._handle_api_error(response, "NewsAPI")

                data = response.json()

                # Parse articles
                for item in data.get("articles", [])[:limit]:
                    try:
                        published_at = self._as_utc(
                            datetime.fromisoformat(
                                item["publishedAt"].replace("Z", "+00:00")
                            )
                        )

                        article = NewsArticle(
                            # Deterministic across runs, unlike built-in hash().
                            id=f"na_{self._stable_url_id(item['url'])}",
                            title=translate_and_normalize(item.get("title", "")),
                            content=translate_and_normalize(item.get("content", "")),
                            summary=translate_and_normalize(
                                item.get("description", "")
                            ),
                            source=item.get("source", {}).get("name", "Unknown"),
                            url=item.get("url", ""),
                            published_at=published_at,
                            categories=[
                                "crypto",
                                "blockchain",
                            ],  # NewsAPI doesn't provide categories
                        )

                        # Avoid duplicates
                        if article.id not in self.seen_articles:
                            articles.append(article)
                            self.seen_articles.add(article.id)

                    except (KeyError, ValueError) as e:
                        print(f"Warning: Error parsing NewsAPI article: {e}")
                        continue

        except RequestException as e:
            print(f"Error fetching from NewsAPI: {e}")
        except json.JSONDecodeError as e:
            print(f"Error parsing NewsAPI JSON: {e}")

        return articles

    def fetch_latest(self, limit: int = 10) -> List[Dict]:
        """
        Fetch latest news articles from configured APIs.

        Args:
            limit: Maximum number of articles to take from each API response
                and to return overall

        Returns:
            List of standardized article dictionaries, newest first

        Raises:
            ConnectionError: If an API reports rate limiting or a server error
            ValueError: If invalid parameters provided
        """
        if limit <= 0:
            raise ValueError("Limit must be positive")

        all_articles = []

        # Fetch from CryptoCompare
        if self.use_cryptocompare:
            articles = self._fetch_cryptocompare(limit)
            all_articles.extend(articles)
            print(f"Fetched {len(articles)} articles from CryptoCompare")

        # Fetch from NewsAPI
        if self.use_newsapi:
            articles = self._fetch_newsapi(limit)
            all_articles.extend(articles)
            print(f"Fetched {len(articles)} articles from NewsAPI")

        # Sort by publication date (newest first). Normalizing in the key
        # keeps the sort safe even if a naive timestamp slips through.
        all_articles.sort(
            key=lambda article: self._as_utc(article.published_at), reverse=True
        )

        # Convert to dictionaries
        articles_as_dicts = [article.to_dict() for article in all_articles]

        # Apply deduplication filter
        deduplicated_articles = self.deduplicator.filter_duplicates(articles_as_dicts)

        result = deduplicated_articles[:limit]

        if not result:
            print("Warning: No articles fetched from any API")

        return result

    def clear_cache(self):
        """Clear the cache of seen articles"""
        self.seen_articles.clear()

    def close(self):
        """Close the session"""
        self.session.close()
logger = logging.getLogger(__name__)

# Number of decimals the pricing adapter expects for scaled USD prices.
BASE_DECIMALS = 7
DEFAULT_CACHE_TTL_SECONDS = 300
# Maximum age at which a cached price may still be served as a stale fallback.
DEFAULT_STALE_TTL_SECONDS = 600
DEFAULT_REQUEST_TIMEOUT = 10

COINGECKO_URL = "https://api.coingecko.com/api/v3/simple/price"
COINCAP_URL = "https://api.coincap.io/v2/assets"

# asset code -> provider-specific ids plus on-chain metadata
SUPPORTED_ASSETS: Dict[str, Dict[str, Any]] = {
    "XLM": {
        "coingecko_id": "stellar",
        "coincap_id": "stellar",
        "asset_decimals": 7,
        "asset_issuer": None,
    },
    "USDC": {
        "coingecko_id": "usd-coin",
        "coincap_id": "usd-coin",
        "asset_decimals": 6,
        "asset_issuer": None,
    },
}


class PriceFetcher:
    """Fetch current asset prices and prepare adapter-ready payloads."""

    def __init__(
        self,
        cache_ttl_seconds: int = DEFAULT_CACHE_TTL_SECONDS,
        stale_ttl_seconds: int = DEFAULT_STALE_TTL_SECONDS,
        request_timeout: int = DEFAULT_REQUEST_TIMEOUT,
    ):
        self.cache_ttl_seconds = cache_ttl_seconds
        self.stale_ttl_seconds = stale_ttl_seconds
        self.request_timeout = request_timeout
        # asset code -> {"payload": last successful payload, "cached_at": datetime}
        self.cache: Dict[str, Dict[str, Any]] = {}
        self.session = RobustHTTPClient()

    def fetch_all_prices(
        self, asset_codes: Optional[List[str]] = None
    ) -> List[Dict[str, Any]]:
        """Fetch prices for supported assets and return adapter-ready values.

        Args:
            asset_codes: Codes to price; defaults to every supported asset.
                Unsupported codes are logged and left out of the result.

        Returns:
            One payload per supported code, in input order: a fresh price, a
            stale cached price (``is_stale=True``), or a failure entry with
            ``success=False`` and ``error="price_unavailable"``.
        """
        asset_codes = asset_codes or list(SUPPORTED_ASSETS.keys())
        now = datetime.now(timezone.utc)

        # Filter before calling any provider: an unsupported code would raise
        # KeyError inside both fetchers and discard prices for every asset.
        supported: List[str] = []
        for asset_code in asset_codes:
            if asset_code in SUPPORTED_ASSETS:
                supported.append(asset_code)
            else:
                logger.warning("Skipping unsupported asset code: %s", asset_code)

        price_map, source, id_key = self._fetch_price_map(supported)

        results: List[Dict[str, Any]] = []
        for asset_code in supported:
            asset_config = SUPPORTED_ASSETS[asset_code]
            # Look up with the id of the provider that produced price_map.
            price_usd = price_map.get(asset_config[id_key])

            if price_usd is not None:
                payload = self._build_price_payload(
                    asset_code=asset_code,
                    asset_issuer=asset_config.get("asset_issuer"),
                    price_usd=price_usd,
                    scaled_price=self._scale_price(price_usd),
                    asset_decimals=asset_config["asset_decimals"],
                    source=source,
                    timestamp=now,
                    is_stale=False,
                )
                self.cache[asset_code] = {
                    "payload": payload,
                    "cached_at": now,
                }
                results.append(payload)
                continue

            stale_payload = self._get_stale_payload(asset_code, now)
            if stale_payload is not None:
                results.append(stale_payload)
                continue

            results.append(
                {
                    "asset_code": asset_code,
                    "asset_issuer": asset_config.get("asset_issuer"),
                    "success": False,
                    "error": "price_unavailable",
                    "source": source,
                    "is_stale": False,
                    "timestamp": now.isoformat(),
                }
            )

        return results

    def _fetch_price_map(self, asset_codes: List[str]) -> tuple:
        """Try CoinGecko, then CoinCap; return ``(prices, source, id_key)``.

        ``prices`` is keyed by the provider's own asset id and ``id_key`` names
        the SUPPORTED_ASSETS field holding that id. When both providers fail
        (or there is nothing to fetch) the map is empty and source is "cache".
        """
        if not asset_codes:
            return {}, "cache", "coingecko_id"

        try:
            return self._fetch_coingecko(asset_codes), "coingecko", "coingecko_id"
        except Exception as primary_error:
            logger.warning(
                "Primary price source failed: %s; trying fallback endpoint.",
                primary_error,
            )

        try:
            return self._fetch_coincap(asset_codes), "coincap", "coincap_id"
        except Exception as fallback_error:
            logger.warning(
                "Fallback price source failed: %s; using cached stale values if available.",
                fallback_error,
            )

        return {}, "cache", "coingecko_id"

    def fetch_price(self, asset_code: str) -> Dict[str, Any]:
        """Fetch the current price for a single asset.

        Returns:
            The payload from ``fetch_all_prices``; for an unsupported code a
            failure entry with ``error="unsupported_asset"`` (previously this
            raised IndexError on the empty result list).
        """
        results = self.fetch_all_prices([asset_code])
        if results:
            return results[0]
        return {
            "asset_code": asset_code,
            "asset_issuer": None,
            "success": False,
            "error": "unsupported_asset",
            "source": None,
            "is_stale": False,
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }

    def _fetch_coingecko(self, asset_codes: List[str]) -> Dict[str, float]:
        """Fetch usd prices from CoinGecko, keyed by CoinGecko asset id.

        Raises:
            RequestException: If the response contains no usable price.
        """
        asset_ids = self._asset_ids(asset_codes, key="coingecko_id")
        response = self.session.get(
            COINGECKO_URL,
            params={"ids": ",".join(asset_ids), "vs_currencies": "usd"},
            timeout=self.request_timeout,
        )
        response.raise_for_status()
        data = response.json()
        prices: Dict[str, float] = {}
        for asset_id in asset_ids:
            usd_value = data.get(asset_id, {}).get("usd")
            if usd_value is not None:
                prices[asset_id] = float(usd_value)
        if not prices:
            raise RequestException("CoinGecko returned no valid prices")
        return prices

    def _fetch_coincap(self, asset_codes: List[str]) -> Dict[str, float]:
        """Fetch usd prices from CoinCap as a fallback, keyed by CoinCap id.

        Raises:
            RequestException: If the response contains no usable price.
        """
        asset_ids = self._asset_ids(asset_codes, key="coincap_id")
        response = self.session.get(
            COINCAP_URL,
            params={"ids": ",".join(asset_ids)},
            timeout=self.request_timeout,
        )
        response.raise_for_status()
        data = response.json()
        prices: Dict[str, float] = {}
        for item in data.get("data", []):
            asset_id = item.get("id")
            price_usd = item.get("priceUsd")
            if asset_id and price_usd:
                prices[asset_id] = float(price_usd)
        if not prices:
            raise RequestException("CoinCap returned no valid prices")
        return prices

    def _scale_price(self, price_usd: float) -> int:
        """Convert a USD float to an integer with BASE_DECIMALS decimals."""
        return int(round(price_usd * (10**BASE_DECIMALS)))

    def _build_price_payload(
        self,
        asset_code: str,
        asset_issuer: Optional[str],
        price_usd: float,
        scaled_price: int,
        asset_decimals: int,
        source: str,
        timestamp: datetime,
        is_stale: bool,
    ) -> Dict[str, Any]:
        """Assemble the success payload handed to the pricing adapter."""
        return {
            "asset_code": asset_code,
            "asset_issuer": asset_issuer,
            "price_usd": price_usd,
            "price": scaled_price,
            "asset_decimals": asset_decimals,
            "base_decimals": BASE_DECIMALS,
            "source": source,
            "timestamp": timestamp.isoformat(),
            "is_stale": is_stale,
            "success": True,
        }

    def _asset_ids(self, asset_codes: List[str], key: str) -> List[str]:
        """Map asset codes to provider ids; raises KeyError for unknown codes."""
        return [SUPPORTED_ASSETS[asset_code][key] for asset_code in asset_codes]

    def _get_stale_payload(
        self, asset_code: str, now: datetime
    ) -> Optional[Dict[str, Any]]:
        """Return a copy of the cached payload flagged stale, or None.

        None is returned when nothing is cached for ``asset_code`` or the
        cached entry is older than ``stale_ttl_seconds``.
        """
        cached = self.cache.get(asset_code)
        if not cached:
            return None
        age = (now - cached["cached_at"]).total_seconds()
        if age > self.stale_ttl_seconds:
            logger.warning(
                "Cached price for %s is stale (%.0fs old), discarding.",
                asset_code,
                age,
            )
            return None
        # Copy so the cached original keeps is_stale=False / its real source.
        payload = cached["payload"].copy()
        payload["is_stale"] = True
        payload["source"] = "cache"
        payload["timestamp"] = now.isoformat()
        return payload
+SRC_ROOT = os.path.abspath(os.path.join(HERE, "..")) +sys.path.insert(0, SRC_ROOT) + +from ingestion.stellar_ingestion_checks import main + + + +if __name__ == "__main__": + raise SystemExit( + main( + argv=None + ) + ) + diff --git a/temp_backup/src/ingestion/social_fetcher.py b/temp_backup/src/ingestion/social_fetcher.py new file mode 100644 index 00000000..f300f3de --- /dev/null +++ b/temp_backup/src/ingestion/social_fetcher.py @@ -0,0 +1,741 @@ +""" +Social Media Fetcher Service for cryptocurrency sentiment analysis. +Fetches data from Twitter/X and Reddit APIs with proper rate limiting. +""" + +import json +import logging +import math +import os +import re +import time +from dataclasses import asdict, dataclass +from datetime import datetime, timezone +from enum import Enum +from typing import Dict, List, Optional + +from requests.exceptions import RequestException +from src.utils.http_client import RobustHTTPClient +from src.utils.translator import translate_and_normalize + +logger = logging.getLogger(__name__) + + +class SocialPlatform(Enum): + """Supported social media platforms""" + + TWITTER = "twitter" + REDDIT = "reddit" + + +@dataclass +class SocialPost: + """ + Standardized social media post format. + Normalizes data from different platforms (Twitter/X, Reddit). 
+ """ + + id: str + platform: str + content: str + author: str + posted_at: datetime + url: str + # Engagement metrics + likes: int = 0 + comments: int = 0 + shares: int = 0 + # Sentiment-related + sentiment_score: Optional[float] = None + # Platform-specific metadata + hashtags: Optional[List[str]] = None + subreddit: Optional[str] = None + # Tracking + fetched_at: datetime = None + + def __post_init__(self): + if self.fetched_at is None: + self.fetched_at = datetime.now(timezone.utc) + if self.hashtags is None: + self.hashtags = [] + + def to_dict(self) -> Dict: + """Convert to dictionary with serialized datetimes""" + data = asdict(self) + data["posted_at"] = self.posted_at.isoformat() + data["fetched_at"] = self.fetched_at.isoformat() if self.fetched_at else None + data["platform"] = self.platform + return data + + def to_news_article_format(self) -> Dict: + """ + Convert to NewsArticle-compatible format for sentiment pipeline. + Allows social posts to flow through the existing sentiment analysis. + """ + return { + "id": f"social_{self.platform}_{self.id}", + "title": ( + self.content[:100] + "..." 
class RateLimiter:
    """
    Sliding-window rate limiter for API requests.

    Keeps the timestamps of recent requests and blocks until a new request
    fits both the per-window quota and the minimum spacing between requests.
    Not synchronized: intended for use from a single thread.
    """

    def __init__(
        self, requests_per_window: int, window_seconds: int, min_delay: float = 0
    ):
        """
        Initialize rate limiter.

        Args:
            requests_per_window: Maximum requests allowed in the time window
            window_seconds: Time window in seconds
            min_delay: Minimum delay between requests (additional throttle)
        """
        self.requests_per_window = requests_per_window
        self.window_seconds = window_seconds
        self.min_delay = min_delay
        # time.time() stamps of requests inside the current window, oldest first
        self.request_times: List[float] = []
        self.last_request_time = 0

    def _evict_expired(self, now: float) -> None:
        """Forget requests that fell out of the window ending at ``now``."""
        cutoff_time = now - self.window_seconds
        self.request_times = [t for t in self.request_times if t > cutoff_time]

    def wait_if_needed(self) -> float:
        """
        Wait if necessary to respect rate limits.

        Sleeps for the minimum spacing first, then for the window quota, and
        finally records the request.

        Returns:
            Time waited in seconds
        """
        waited = 0.0
        now = time.time()

        # Ensure minimum delay between requests. The pause is capped at
        # min_delay so a backwards clock jump cannot cause a huge sleep.
        time_since_last = now - self.last_request_time
        if time_since_last < self.min_delay:
            pause = min(self.min_delay - time_since_last, self.min_delay)
            time.sleep(pause)
            waited += pause
            # Re-read the clock: the window math below must account for the
            # time just slept, otherwise the second wait is overestimated.
            now = time.time()

        self._evict_expired(now)

        # Check if we're at the rate limit
        if self.request_times and len(self.request_times) >= self.requests_per_window:
            # Wait until oldest request exits the window
            pause = self.request_times[0] + self.window_seconds - now
            if pause > 0:
                time.sleep(pause)
                waited += pause
                now = time.time()
            self._evict_expired(now)

        # Record this request
        self.last_request_time = time.time()
        self.request_times.append(self.last_request_time)

        return waited
def fetch_hashtag(
    self, hashtag: str, limit: int = 50, since_id: Optional[str] = None
) -> List[SocialPost]:
    """
    Fetch recent tweets containing a hashtag.

    A 429 response is retried after the advertised reset time, at most
    ``SocialAPIConfig.MAX_RETRIES`` times; other failures are logged and
    yield an empty list.

    Args:
        hashtag: Hashtag to search (with or without #)
        limit: Maximum tweets to return
        since_id: Fetch tweets newer than this ID

    Returns:
        List of SocialPost objects
    """
    if not self.enabled:
        logger.warning("Twitter API not configured. Skipping Twitter fetch.")
        return []
    if limit <= 0:
        return []

    posts = []

    # Normalize hashtag
    query = hashtag if hashtag.startswith("#") else f"#{hashtag}"
    query = f"{query} -is:retweet (lang:en OR lang:es OR lang:pt)"  # Exclude retweets, English/Spanish/Portuguese

    params = {
        "query": query,
        # The recent-search endpoint only accepts 10..100; smaller requests
        # are rounded up here and trimmed to ``limit`` after parsing.
        "max_results": max(10, min(limit, 100)),
        "tweet.fields": "created_at,public_metrics,entities,author_id",
        "expansions": "author_id",
        "user.fields": "username,name",
    }

    if since_id:
        params["since_id"] = since_id

    url = f"{SocialAPIConfig.TWITTER_BASE_URL}{SocialAPIConfig.TWITTER_SEARCH_ENDPOINT}"

    try:
        # Bounded retry loop instead of unbounded recursion on 429.
        for attempt in range(SocialAPIConfig.MAX_RETRIES + 1):
            self.rate_limiter.wait_if_needed()

            response = self.session.get(
                url,
                params=params,
                timeout=SocialAPIConfig.TIMEOUT,
            )

            if response.status_code != 429:
                break

            if attempt == SocialAPIConfig.MAX_RETRIES:
                logger.error(
                    f"Error fetching Twitter data for {hashtag}: rate limit still exceeded after {attempt} retries"
                )
                return posts

            logger.warning("Twitter rate limit exceeded. Waiting...")
            # Get reset time from header (epoch seconds); default to 15 min.
            reset_time = int(
                response.headers.get("x-rate-limit-reset", time.time() + 900)
            )
            wait_seconds = reset_time - time.time()
            if wait_seconds > 0:
                time.sleep(wait_seconds)

        response.raise_for_status()
        data = response.json()

        # Parse tweets
        includes = data.get("includes", {})
        users_map = {u["id"]: u for u in includes.get("users", [])}

        for tweet in data.get("data", [])[:limit]:
            user = users_map.get(tweet.get("author_id", ""), {})
            metrics = tweet.get("public_metrics", {})

            # Extract hashtags
            entities = tweet.get("entities", {})
            hashtags = [f"#{tag['tag']}" for tag in entities.get("hashtags", [])]

            posts.append(
                SocialPost(
                    id=tweet["id"],
                    platform=SocialPlatform.TWITTER.value,
                    content=translate_and_normalize(tweet.get("text", "")),
                    author=user.get("username", "unknown"),
                    posted_at=datetime.fromisoformat(
                        tweet["created_at"].replace("Z", "+00:00")
                    ),
                    url=f"https://twitter.com/user/status/{tweet['id']}",
                    likes=metrics.get("like_count", 0),
                    comments=metrics.get("reply_count", 0),
                    shares=metrics.get("retweet_count", 0),
                    hashtags=hashtags,
                )
            )

        logger.info(f"Fetched {len(posts)} tweets for {hashtag}")

    except RequestException as e:
        logger.error(f"Error fetching Twitter data for {hashtag}: {e}")
    except (KeyError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and malformed created_at.
        logger.error(f"Error parsing Twitter response: {e}")

    return posts
+ + Args: + hashtags: List of hashtags to search + limit_per_hashtag: Max tweets per hashtag + + Returns: + Combined list of SocialPosts + """ + hashtags = hashtags or SocialAPIConfig.DEFAULT_HASHTAGS + all_posts = [] + + for hashtag in hashtags: + posts = self.fetch_hashtag(hashtag, limit=limit_per_hashtag) + all_posts.extend(posts) + # Small delay between different hashtag searches + time.sleep(0.5) + + return all_posts + + def close(self): + """Close the session""" + self.session.close() + + +class RedditFetcher: + """ + Fetches posts from Reddit. + Uses public JSON API (no auth required for public subreddits). + """ + + def __init__(self): + """Initialize Reddit fetcher""" + self.session = RobustHTTPClient() + self.session.headers.update( + { + "User-Agent": "LumenPulseSentimentBot/1.0 (cryptocurrency sentiment analysis)" + } + ) + + self.rate_limiter = RateLimiter( + SocialAPIConfig.REDDIT_REQUESTS_PER_MINUTE, + 60, + SocialAPIConfig.REDDIT_RATE_LIMIT_DELAY, + ) + + def fetch_subreddit( + self, subreddit: str, limit: int = 50, after: Optional[str] = None + ) -> List[SocialPost]: + """ + Fetch recent posts from a subreddit. + + Args: + subreddit: Subreddit name (without r/) + limit: Maximum posts to return + after: Reddit fullname to fetch posts after + + Returns: + List of SocialPost objects + """ + posts = [] + + url = f"{SocialAPIConfig.REDDIT_BASE_URL}{SocialAPIConfig.REDDIT_SUBREDDIT_ENDPOINT.format(subreddit=subreddit)}" + + params = {"limit": min(limit, 100)} + if after: + params["after"] = after + + try: + self.rate_limiter.wait_if_needed() + + response = self.session.get( + url, params=params, timeout=SocialAPIConfig.TIMEOUT + ) + + if response.status_code == 429: + logger.warning("Reddit rate limit exceeded. 
Waiting...") + time.sleep(60) + return self.fetch_subreddit(subreddit, limit, after) + + response.raise_for_status() + data = response.json() + + # Parse posts + for child in data.get("data", {}).get("children", [])[:limit]: + post_data = child.get("data", {}) + + post = SocialPost( + id=post_data.get("id", ""), + platform=SocialPlatform.REDDIT.value, + content=translate_and_normalize( + post_data.get("selftext", "") or post_data.get("title", "") + ), + author=post_data.get("author", "[deleted]"), + posted_at=datetime.fromtimestamp( + post_data.get("created_utc", time.time()), tz=timezone.utc + ), + url=f"https://reddit.com{post_data.get('permalink', '')}", + likes=post_data.get("ups", 0), + comments=post_data.get("num_comments", 0), + shares=post_data.get("num_crossposts", 0), + subreddit=post_data.get("subreddit", subreddit), + hashtags=self._extract_hashtags(post_data), + ) + posts.append(post) + + logger.info(f"Fetched {len(posts)} posts from r/{subreddit}") + + except RequestException as e: + logger.error(f"Error fetching Reddit data from r/{subreddit}: {e}") + except (KeyError, json.JSONDecodeError) as e: + logger.error(f"Error parsing Reddit response: {e}") + + return posts + + def fetch_search( + self, query: str, subreddits: List[str] = None, limit: int = 50 + ) -> List[SocialPost]: + """ + Search Reddit for specific terms. 
+ + Args: + query: Search query + subreddits: Restrict to these subreddits + limit: Maximum results + + Returns: + List of SocialPost objects + """ + posts = [] + + params = {"q": query, "limit": min(limit, 100), "sort": "new", "type": "link"} + + if subreddits: + params["restrict_sr"] = True + params["sr"] = ",".join(subreddits) + + try: + self.rate_limiter.wait_if_needed() + + response = self.session.get( + f"{SocialAPIConfig.REDDIT_BASE_URL}{SocialAPIConfig.REDDIT_SEARCH_ENDPOINT}", + params=params, + timeout=SocialAPIConfig.TIMEOUT, + ) + + response.raise_for_status() + data = response.json() + + for child in data.get("data", {}).get("children", [])[:limit]: + post_data = child.get("data", {}) + + post = SocialPost( + id=post_data.get("id", ""), + platform=SocialPlatform.REDDIT.value, + content=translate_and_normalize( + post_data.get("selftext", "") or post_data.get("title", "") + ), + author=post_data.get("author", "[deleted]"), + posted_at=datetime.fromtimestamp( + post_data.get("created_utc", time.time()), tz=timezone.utc + ), + url=f"https://reddit.com{post_data.get('permalink', '')}", + likes=post_data.get("ups", 0), + comments=post_data.get("num_comments", 0), + shares=post_data.get("num_crossposts", 0), + subreddit=post_data.get("subreddit", ""), + ) + posts.append(post) + + logger.info(f"Fetched {len(posts)} Reddit posts for query: {query}") + + except RequestException as e: + logger.error(f"Error searching Reddit: {e}") + except (KeyError, json.JSONDecodeError) as e: + logger.error(f"Error parsing Reddit search response: {e}") + + return posts + + def fetch_multiple_subreddits( + self, subreddits: List[str] = None, limit_per_subreddit: int = 25 + ) -> List[SocialPost]: + """ + Fetch posts from multiple subreddits. 
+ + Args: + subreddits: List of subreddit names + limit_per_subreddit: Max posts per subreddit + + Returns: + Combined list of SocialPosts + """ + subreddits = subreddits or SocialAPIConfig.DEFAULT_SUBREDDITS + all_posts = [] + + for subreddit in subreddits: + posts = self.fetch_subreddit(subreddit, limit=limit_per_subreddit) + all_posts.extend(posts) + # Small delay between subreddit fetches + time.sleep(0.5) + + return all_posts + + def _extract_hashtags(self, post_data: Dict) -> List[str]: + """Extract hashtags from Reddit post title and body""" + hashtags = [] + text = f"{post_data.get('title', '')} {post_data.get('selftext', '')}" + + # Simple hashtag extraction + hashtags = re.findall(r"#\w+", text) + + # Also add link flair as hashtag + if post_data.get("link_flair_text"): + hashtags.append(f"#{post_data['link_flair_text'].replace(' ', '')}") + + return list(set(hashtags)) + + def close(self): + """Close the session""" + self.session.close() + + +class SocialFetcher: + """ + Main social media fetcher that coordinates Twitter and Reddit fetching. + Provides a unified interface for collecting social sentiment data. + """ + + def __init__( + self, + use_twitter: bool = True, + use_reddit: bool = True, + twitter_token: Optional[str] = None, + ): + """ + Initialize SocialFetcher. + + Args: + use_twitter: Enable Twitter/X fetching + use_reddit: Enable Reddit fetching + twitter_token: Twitter Bearer Token (optional, uses env) + """ + self.use_twitter = use_twitter + self.use_reddit = use_reddit + + # Initialize fetchers + self.twitter = ( + TwitterFetcher(bearer_token=twitter_token) if use_twitter else None + ) + self.reddit = RedditFetcher() if use_reddit else None + + # Deduplication tracking + self.seen_post_ids: set = set() + + def fetch_all( + self, + hashtags: List[str] = None, + subreddits: List[str] = None, + limit_per_source: int = 25, + ) -> List[Dict]: + """ + Fetch social posts from all configured sources. 
+ + Args: + hashtags: Twitter hashtags to search + subreddits: Reddit subreddits to fetch + limit_per_source: Max posts per source/hashtag/subreddit + + Returns: + List of normalized post dictionaries + """ + all_posts = [] + + # Fetch from Twitter + if self.twitter and self.use_twitter: + twitter_posts = self.twitter.fetch_multiple_hashtags( + hashtags=hashtags, limit_per_hashtag=limit_per_source + ) + all_posts.extend(twitter_posts) + + # Fetch from Reddit + if self.reddit and self.use_reddit: + reddit_posts = self.reddit.fetch_multiple_subreddits( + subreddits=subreddits, limit_per_subreddit=limit_per_source + ) + all_posts.extend(reddit_posts) + + # Deduplicate and sort by date + unique_posts = [] + for post in all_posts: + post_id = f"{post.platform}_{post.id}" + if post_id not in self.seen_post_ids: + self.seen_post_ids.add(post_id) + unique_posts.append(post) + + # Sort by posted_at (newest first) + unique_posts.sort(key=lambda p: p.posted_at, reverse=True) + + logger.info(f"Total unique social posts: {len(unique_posts)}") + + return [post.to_dict() for post in unique_posts] + + def fetch_as_articles( + self, + hashtags: List[str] = None, + subreddits: List[str] = None, + limit_per_source: int = 25, + ) -> List[Dict]: + """ + Fetch posts in NewsArticle-compatible format. + Useful for feeding into existing sentiment analysis pipeline. 
+ + Args: + hashtags: Twitter hashtags to search + subreddits: Reddit subreddits to fetch + limit_per_source: Max posts per source + + Returns: + List of posts in article-compatible format + """ + posts = self.fetch_all( + hashtags=hashtags, subreddits=subreddits, limit_per_source=limit_per_source + ) + + return [ + SocialPost( + id=p["id"], + platform=p["platform"], + content=p["content"], + author=p["author"], + posted_at=datetime.fromisoformat(p["posted_at"].replace("Z", "+00:00")), + url=p["url"], + likes=p.get("likes", 0), + comments=p.get("comments", 0), + shares=p.get("shares", 0), + hashtags=p.get("hashtags", []), + subreddit=p.get("subreddit"), + ).to_news_article_format() + for p in posts + ] + + def get_sentiment_weight(self, post: SocialPost) -> float: + """ + Calculate sentiment weight based on engagement. + Higher engagement = more weight for sentiment scoring. + + Args: + post: SocialPost to weight + + Returns: + Weight multiplier for sentiment scoring + """ + # Base weight + weight = 1.0 + + # Engagement bonus (logarithmic scaling) + total_engagement = post.likes + (post.comments * 2) + (post.shares * 3) + if total_engagement > 0: + weight += math.log10(total_engagement + 1) / 2 # Max ~0.5 bonus + + # Platform-specific weights + if post.platform == SocialPlatform.REDDIT.value: + # Reddit tends to have more detailed analysis + weight *= 1.2 + + return min(weight, 3.0) # Cap at 3x + + def clear_cache(self): + """Clear the seen post cache""" + self.seen_post_ids.clear() + + def close(self): + """Close all fetcher sessions""" + if self.twitter: + self.twitter.close() + if self.reddit: + self.reddit.close() + + +# Convenience function for easy usage +def fetch_social( + hashtags: List[str] = None, + subreddits: List[str] = None, + limit_per_source: int = 25, + use_twitter: bool = True, + use_reddit: bool = True, +) -> List[Dict]: + """ + Convenience function to fetch social posts. 
logger = logging.getLogger(__name__)


class SorobanEventIndexer:
    """
    Incrementally index Soroban contract events.

    Polls a Soroban RPC endpoint for events newer than a persisted
    checkpoint ledger and forwards each event to the backend ingest
    endpoint. The checkpoint is stored as JSON in ``state_file``.
    """

    # Page size requested from getEvents; a shorter page ends pagination.
    _PAGE_LIMIT = 100

    def __init__(
        self,
        rpc_url: str,
        backend_url: str,
        ingest_secret: str,
        contract_ids: Optional[List[str]] = None,
        state_file: str = "./data/soroban_indexer_state.json",
        poll_interval: int = 30
    ):
        """
        Args:
            rpc_url: Soroban RPC endpoint URL
            backend_url: Base URL of the backend API
            ingest_secret: Value sent in the ``x-ingest-secret`` header
            contract_ids: Contract IDs to filter on (no filter when empty)
            state_file: Path of the JSON checkpoint file
            poll_interval: Seconds between polls in ``run_forever``
        """
        self.rpc_url = rpc_url
        self.backend_url = backend_url
        self.ingest_secret = ingest_secret
        self.contract_ids = contract_ids or []
        self.state_file = Path(state_file)
        self.poll_interval = poll_interval
        self.last_ledger: int = self._load_last_ledger()

    def _load_last_ledger(self) -> int:
        """Load last processed ledger from state file (0 when absent or unreadable)"""
        if not self.state_file.exists():
            return 0
        try:
            with open(self.state_file, 'r') as f:
                state = json.load(f)
            return max(int(state.get("last_ledger", 0)), 0)
        except (OSError, ValueError, TypeError, AttributeError):
            # Unreadable file, invalid JSON, non-dict JSON or a non-numeric
            # value: fall back to the initial state instead of crashing.
            logger.warning("Failed to load state file, starting from ledger 0")
            return 0

    def _save_last_ledger(self, ledger: int):
        """Save last processed ledger to state file"""
        self.state_file.parent.mkdir(parents=True, exist_ok=True)
        # Write to a sibling temp file and rename it into place so that an
        # interrupted write cannot leave a truncated state file behind.
        tmp_file = self.state_file.with_name(self.state_file.name + ".tmp")
        with open(tmp_file, 'w') as f:
            json.dump({"last_ledger": ledger, "timestamp": datetime.now(timezone.utc).isoformat()}, f)
        tmp_file.replace(self.state_file)
        self.last_ledger = ledger

    def fetch_latest_ledger(self) -> int:
        """
        Get the latest ledger sequence from Soroban RPC.

        Raises:
            RuntimeError: if the RPC replies with a JSON-RPC error object
            Exception: any transport or decoding error is logged and re-raised
        """
        payload = {
            "jsonrpc": "2.0",
            "id": 1,
            "method": "getLatestLedger"
        }

        try:
            response = requests.post(self.rpc_url, json=payload, timeout=30)
            response.raise_for_status()
            data = response.json()
        except Exception as e:
            logger.error(f"Failed to fetch latest ledger: {e}")
            raise

        # Without this check an error reply would be read as sequence 0 and
        # silently reported as "no new ledgers".
        if "error" in data:
            logger.error(f"RPC Error: {data['error']}")
            raise RuntimeError(f"RPC Error: {data['error']}")

        return int(data.get("result", {}).get("sequence", 0))

    def fetch_events_since(self, start_ledger: int) -> List[Dict]:
        """
        Fetch events from Soroban RPC starting at the given ledger.

        Args:
            start_ledger: First ledger (inclusive) to fetch events from

        Returns:
            All events returned by the RPC, in the order received

        Raises:
            RuntimeError: if the RPC replies with a JSON-RPC error object
            Exception: any transport or decoding error is logged and re-raised
        """
        filters = []
        if self.contract_ids:
            filters.append({
                "type": "contract",
                "contractIds": self.contract_ids
            })

        all_events = []
        cursor = None

        while True:
            params = {
                "filters": filters,
                "pagination": {
                    "limit": self._PAGE_LIMIT
                }
            }
            # startLedger and cursor are mutually exclusive in getEvents:
            # follow-up pages are addressed by cursor only.
            if cursor:
                params["pagination"]["cursor"] = cursor
            else:
                params["startLedger"] = start_ledger

            payload = {
                "jsonrpc": "2.0",
                "id": 1,
                "method": "getEvents",
                "params": params
            }

            try:
                response = requests.post(self.rpc_url, json=payload, timeout=30)
                response.raise_for_status()
                data = response.json()
            except Exception as e:
                logger.error(f"RPC Request failed: {e}")
                raise

            if "error" in data:
                logger.error(f"RPC Error: {data['error']}")
                raise RuntimeError(f"RPC Error: {data['error']}")

            result = data.get("result", {})
            events = result.get("events", [])
            all_events.extend(events)

            # Check if we need to paginate
            if len(events) < self._PAGE_LIMIT:
                break

            # NOTE(review): newer RPC versions return a result-level cursor
            # and drop the per-event pagingToken (the event id takes its
            # place) - confirm against the deployed RPC version.
            last_event = events[-1]
            cursor = (
                result.get("cursor")
                or last_event.get("pagingToken")
                or last_event.get("id")
            )
            if not cursor:
                break

            time.sleep(0.5)  # Rate limiting

        return all_events

    def send_event_to_backend(self, event: Dict, event_index: int) -> bool:
        """
        Send a single event to the backend ingest endpoint.

        Args:
            event: Raw event dictionary as returned by the RPC
            event_index: Index of the event within its transaction

        Returns:
            True when the backend accepted the event, False otherwise
        """
        tx_hash = event.get("transactionHash", "")

        ingest_payload = {
            "txHash": tx_hash,
            "eventIndex": event_index,
            "ledgerSequence": int(event.get("ledger", 0)),
            "contractId": event.get("contractId"),
            "eventType": event.get("type"),
            "rawPayload": event
        }

        headers = {
            "Content-Type": "application/json",
            "x-ingest-secret": self.ingest_secret
        }

        try:
            response = requests.post(
                f"{self.backend_url}/soroban-events/ingest",
                json=ingest_payload,
                headers=headers,
                timeout=30
            )
            response.raise_for_status()
            logger.debug(f"Successfully sent event {tx_hash}:{event_index} to backend")
            return True
        except Exception as e:
            logger.error(f"Failed to send event {tx_hash}:{event_index} to backend: {e}")
            return False

    def run_once(self) -> Dict:
        """
        Run one iteration of the indexer.

        Every fetched event is attempted. The checkpoint only advances up
        to the ledger before the first event that failed to send, so failed
        events are fetched and sent again on the next run.

        Returns:
            Status dictionary: ``status`` plus, on success, ``events_found``,
            ``events_sent``, ``events_failed`` and ``last_ledger``
        """
        logger.info("=" * 60)
        logger.info("SOROBAN EVENT INDEXER - INCREMENTAL SYNC")
        logger.info("=" * 60)

        try:
            latest_ledger = self.fetch_latest_ledger()
            logger.info(f"Latest ledger: {latest_ledger}")
            logger.info(f"Last processed ledger: {self.last_ledger}")

            if latest_ledger <= self.last_ledger:
                logger.info("No new ledgers to process")
                return {"status": "no_new_ledgers", "events_processed": 0}

            start_ledger = self.last_ledger + 1
            logger.info(f"Fetching events from ledger {start_ledger} to {latest_ledger}")

            events = self.fetch_events_since(start_ledger)
            logger.info(f"Found {len(events)} new events")

            # Send events to backend
            sent_count = 0
            failed_count = 0
            highest_ledger = self.last_ledger
            first_failed_ledger = None
            # Number the events per transaction so a re-sent event keeps the
            # same (txHash, eventIndex) key regardless of where the batch
            # it arrives in happens to start.
            events_seen_per_tx = {}

            for event in events:
                tx_hash = event.get("transactionHash", "")
                event_index = events_seen_per_tx.get(tx_hash, 0)
                events_seen_per_tx[tx_hash] = event_index + 1
                event_ledger = int(event.get("ledger", 0))

                if self.send_event_to_backend(event, event_index):
                    sent_count += 1
                else:
                    failed_count += 1
                    if first_failed_ledger is None or event_ledger < first_failed_ledger:
                        first_failed_ledger = event_ledger

                # Update highest ledger seen
                if event_ledger > highest_ledger:
                    highest_ledger = event_ledger

            if first_failed_ledger is None:
                checkpoint = highest_ledger
            else:
                # Never move past a ledger that still has undelivered
                # events, and never move backwards.
                checkpoint = max(self.last_ledger, min(highest_ledger, first_failed_ledger - 1))

            # NOTE(review): when no events are returned the checkpoint stays
            # put; whether it can safely jump to latest_ledger depends on the
            # RPC having scanned up to its tip - confirm before changing.
            self._save_last_ledger(checkpoint)

            logger.info(f"Sent {sent_count} events to backend, {failed_count} failed")
            logger.info(f"Updated last processed ledger to {checkpoint}")
            logger.info("=" * 60)

            return {
                "status": "success",
                "events_found": len(events),
                "events_sent": sent_count,
                "events_failed": failed_count,
                "last_ledger": checkpoint
            }

        except Exception as e:
            logger.error(f"Error in indexer run: {e}", exc_info=True)
            return {"status": "error", "error": str(e)}

    def run_forever(self):
        """Run the indexer continuously, polling for new events"""
        logger.info("Starting Soroban event indexer (continuous mode)")
        logger.info(f"Poll interval: {self.poll_interval} seconds")

        while True:
            self.run_once()
            time.sleep(self.poll_interval)
parser.add_argument("--once", action="store_true", help="Run once and exit") + + args = parser.parse_args() + + # Clean up contract ids + contract_ids = [cid.strip() for cid in args.contract_ids if cid.strip()] + + indexer = SorobanEventIndexer( + rpc_url=args.rpc_url, + backend_url=args.backend_url, + ingest_secret=args.ingest_secret, + contract_ids=contract_ids, + state_file=args.state_file, + poll_interval=args.poll_interval + ) + + if args.once: + indexer.run_once() + else: + indexer.run_forever() + +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s" + ) + main() diff --git a/temp_backup/src/ingestion/stellar_fetcher.py b/temp_backup/src/ingestion/stellar_fetcher.py new file mode 100644 index 00000000..16460a30 --- /dev/null +++ b/temp_backup/src/ingestion/stellar_fetcher.py @@ -0,0 +1,565 @@ +""" +Stellar Blockchain Data Fetcher +Fetches historical transaction and volume data from Stellar Horizon API. +""" + +import time +from typing import Dict, List, Optional, Tuple, Any +from datetime import datetime, timedelta +from dataclasses import dataclass +import json +from stellar_sdk import Server, Asset +from stellar_sdk.exceptions import NotFoundError, BadRequestError, ConnectionError +from stellar_sdk.call_builder.call_builder_async import PaymentsCallBuilder + + +@dataclass +class VolumeData: + """Volume data for a specific asset over a time period""" + + asset_code: str + asset_issuer: Optional[str] + time_period_hours: int + total_volume: float + transaction_count: int + start_time: datetime + end_time: datetime + volume_by_hour: Dict[str, float] # hour -> volume + + def to_dict(self) -> Dict: + """Convert to dictionary with serialized datetime""" + return { + "asset_code": self.asset_code, + "asset_issuer": self.asset_issuer, + "time_period_hours": self.time_period_hours, + "total_volume": self.total_volume, + "transaction_count": self.transaction_count, + "start_time": 
self.start_time.isoformat(), + "end_time": self.end_time.isoformat(), + "volume_by_hour": self.volume_by_hour, + "average_hourly_volume": ( + self.total_volume / self.time_period_hours + if self.time_period_hours > 0 + else 0 + ), + } + + +@dataclass +class TransactionRecord: + """Individual transaction record""" + + id: str + hash: str + created_at: datetime + source_account: str + operation_count: int + total_amount: float + fee_charged: float + memo: Optional[str] + successful: bool + + def to_dict(self) -> Dict: + """Convert to dictionary""" + return { + "id": self.id, + "hash": self.hash, + "created_at": self.created_at.isoformat(), + "source_account": self.source_account, + "operation_count": self.operation_count, + "total_amount": self.total_amount, + "fee_charged": self.fee_charged, + "memo": self.memo, + "successful": self.successful, + } + + +class StellarDataFetcher: + """ + Fetches on-chain data from Stellar blockchain via Horizon API. + + Features: + - Fetch volume data for specific assets + - Handle pagination for large datasets + - Aggregate data by time periods + - Error handling and retry logic + """ + + # Default Horizon servers (public instances) + HORIZON_SERVERS = [ + "https://horizon.stellar.org", # Mainnet - Stellar Development Foundation + "https://horizon-testnet.stellar.org", # Testnet + ] + + # Rate limiting + MAX_RETRIES = 3 + RETRY_DELAY = 1 # seconds + REQUEST_TIMEOUT = 30 # seconds + + def __init__( + self, + horizon_url: Optional[str] = None, + network: str = "public", + timeout: Optional[float] = None, + ): + """ + Initialize Stellar data fetcher. 
def _handle_pagination(self, callable_func, *args, **kwargs) -> List[Dict]:
    """
    Handle pagination for Horizon API responses.

    Each page's ``next`` link supplies the cursor for the following
    request. Collection stops on an empty page, when the cursor stops
    advancing, or after a fixed number of pages. Errors are printed and
    the records gathered so far are returned (best effort).

    Args:
        callable_func: Call builder exposing ``call()`` and ``cursor()``,
            or a plain function returning a pageable response
        *args, **kwargs: Arguments for the plain-function form

    Returns:
        List of all records across all pages
    """
    from urllib.parse import parse_qs, urlparse

    records = []
    cursor = None
    max_pages = 100  # Safety limit

    try:
        for _ in range(max_pages):
            if hasattr(callable_func, "call"):
                # Call builder: the cursor has to be applied to the builder
                # itself, otherwise every call() returns the first page.
                if cursor:
                    callable_func.cursor(cursor)
                response = callable_func.call()
            else:
                # Regular function: pass the cursor as a query parameter.
                query_params = kwargs.copy()
                if cursor:
                    query_params["cursor"] = cursor
                response = callable_func(*args, **query_params)

            # Get records from this page
            page_records = response["_embedded"]["records"]
            if not page_records:
                # An empty page still carries a "next" link, so the link
                # alone cannot signal the end of the data.
                break
            records.extend(page_records)

            # Extract cursor from next URL
            next_url = response["_links"].get("next", {}).get("href", "")
            next_cursor = parse_qs(urlparse(next_url).query).get("cursor", [None])[0]
            if not next_cursor or next_cursor == cursor:
                break  # No more pages
            cursor = next_cursor

            # Small delay to be nice to the API
            time.sleep(0.1)

    except (ConnectionError, BadRequestError) as e:
        print(f"Error during pagination: {e}")
    except Exception as e:
        print(f"Unexpected error during pagination: {e}")

    return records
bucket + created_at = datetime.fromisoformat( + payment["created_at"].replace("Z", "+00:00") + ) + hours_ago = int( + (end_time - created_at).total_seconds() / 3600 + ) + if 0 <= hours_ago < hours: + volume_by_hour[f"hour_{hours_ago}"] += amount + + except (KeyError, ValueError) as e: + print(f"Error processing payment: {e}") + continue + + else: + # For other assets, we need to look at trades and path payments + # This is a simplified approach - in production you'd want more sophisticated logic + trades = self._get_trades_for_asset(asset_code, start_time, end_time) + + for trade in trades: + try: + # Check if this is buying or selling our target asset + base_asset = trade.get("base_asset_code") + counter_asset = trade.get("counter_asset_code") + + if base_asset == asset_code: + amount = float(trade.get("base_amount", "0")) + elif counter_asset == asset_code: + amount = float(trade.get("counter_amount", "0")) + else: + continue + + if amount > 0: + total_volume += amount + transaction_count += 1 + + # Add to hourly bucket + ledger_close_time = datetime.fromisoformat( + trade["ledger_close_time"].replace("Z", "+00:00") + ) + hours_ago = int( + (end_time - ledger_close_time).total_seconds() / 3600 + ) + if 0 <= hours_ago < hours: + volume_by_hour[f"hour_{hours_ago}"] += amount + + except (KeyError, ValueError) as e: + print(f"Error processing trade: {e}") + continue + + # Create VolumeData object + volume_data = VolumeData( + asset_code=asset_code, + asset_issuer=None, # Native XLM has no issuer, for others we'd need issuer info + time_period_hours=hours, + total_volume=total_volume, + transaction_count=transaction_count, + start_time=start_time, + end_time=end_time, + volume_by_hour=volume_by_hour, + ) + + # Cache the result + self.cache[cache_key] = (time.time(), volume_data) + + return volume_data + + except Exception as e: + print(f"Error fetching volume for {asset_code}: {e}") + import traceback + + traceback.print_exc() + + # Return empty volume data on error + 
return VolumeData( + asset_code=asset_code, + asset_issuer=None, + time_period_hours=hours, + total_volume=0.0, + transaction_count=0, + start_time=start_time, + end_time=end_time, + volume_by_hour={f"hour_{i}": 0.0 for i in range(hours)}, + ) + + def _get_payments_for_period( + self, start_time: datetime, end_time: datetime, asset_code: str = "native" + ) -> List[Dict]: + """ + Get payments for a specific asset within a time period. + + Args: + start_time: Start of time period + end_time: End of time period + asset_code: Asset code or 'native' for XLM + + Returns: + List of payment records + """ + payments = [] + + try: + # Build query + payments_call = self.server.payments().order(desc=False).limit(200) + + # For XLM (native asset) + if asset_code == "native": + payments_call = payments_call.for_asset(Asset.native()) + # Note: For other assets, we'd need the issuer as well + + # Get payments with pagination + records = self._retry_request(self._handle_pagination, payments_call) + + # Filter by time + for payment in records: + try: + created_at = datetime.fromisoformat( + payment["created_at"].replace("Z", "+00:00") + ) + if start_time <= created_at <= end_time: + payments.append(payment) + elif created_at > end_time: + # Since we're ordering ascending, we can break early + pass + + except (KeyError, ValueError) as e: + print(f"Error parsing payment timestamp: {e}") + continue + + except Exception as e: + print(f"Error getting payments: {e}") + + return payments + + def _get_trades_for_asset( + self, asset_code: str, start_time: datetime, end_time: datetime + ) -> List[Dict]: + """ + Get trades involving a specific asset. 
+ + Args: + asset_code: Asset code to filter by + start_time: Start of time period + end_time: End of time period + + Returns: + List of trade records + """ + trades = [] + + try: + # Get trades with pagination + trades_call = self.server.trades().order(desc=False).limit(200) + records = self._retry_request(self._handle_pagination, trades_call) + + # Filter by asset and time + for trade in records: + try: + base_asset = trade.get("base_asset_code") + counter_asset = trade.get("counter_asset_code") + ledger_close_time = datetime.fromisoformat( + trade["ledger_close_time"].replace("Z", "+00:00") + ) + + # Check if trade involves our asset and is within time period + if ( + base_asset == asset_code or counter_asset == asset_code + ) and start_time <= ledger_close_time <= end_time: + trades.append(trade) + + except (KeyError, ValueError) as e: + print(f"Error parsing trade: {e}") + continue + + except Exception as e: + print(f"Error getting trades: {e}") + + return trades + + def get_network_stats(self) -> Dict[str, Any]: + """ + Get general Stellar network statistics. 
+ + Returns: + Dictionary with network metrics + """ + try: + # Get ledger stats + ledgers_call = self.server.ledgers().order("desc").limit(1) + ledgers = self._retry_request(ledgers_call.call) + latest_ledger = ( + ledgers["_embedded"]["records"][0] + if ledgers["_embedded"]["records"] + else {} + ) + + # Get fee stats + fee_stats = self._retry_request(self.server.fee_stats) + + return { + "latest_ledger": latest_ledger.get("sequence", 0), + "ledger_close_time": latest_ledger.get("closed_at", ""), + "transaction_count": latest_ledger.get("transaction_count", 0), + "operation_count": latest_ledger.get("operation_count", 0), + "base_fee": fee_stats.get("last_ledger_base_fee", 0), + "fee_pool": fee_stats.get("fee_charged", {}).get("max", 0), + "protocol_version": latest_ledger.get("protocol_version", ""), + "total_coins": latest_ledger.get("total_coins", "0"), + } + + except Exception as e: + print(f"Error getting network stats: {e}") + return {} + + def get_account_transactions( + self, account_id: str, limit: int = 100 + ) -> List[TransactionRecord]: + """ + Get recent transactions for a specific account. 
+ + Args: + account_id: Stellar account ID + limit: Maximum number of transactions to return + + Returns: + List of TransactionRecord objects + """ + transactions = [] + + try: + # Get transactions for account + transactions_call = ( + self.server.transactions() + .for_account(account_id) + .order("desc") + .limit(min(limit, 200)) + ) + records = self._retry_request(self._handle_pagination, transactions_call) + + for tx in records[:limit]: + try: + transaction = TransactionRecord( + id=tx.get("id", ""), + hash=tx.get("hash", ""), + created_at=datetime.fromisoformat( + tx["created_at"].replace("Z", "+00:00") + ), + source_account=tx.get("source_account", ""), + operation_count=int(tx.get("operation_count", 0)), + total_amount=float(tx.get("fee_charged", 0)) + * 0.0000001, # Convert stroops to XLM + fee_charged=float(tx.get("fee_charged", 0)) * 0.0000001, + memo=tx.get("memo", ""), + successful=tx.get("successful", False), + ) + transactions.append(transaction) + + except (KeyError, ValueError) as e: + print(f"Error parsing transaction: {e}") + continue + + except Exception as e: + print(f"Error getting account transactions: {e}") + + return transactions + + def clear_cache(self): + """Clear the request cache.""" + self.cache.clear() + + def test_connection(self) -> bool: + """Test connection to Horizon server.""" + try: + root = self._retry_request(self.server.root) + return "horizon_version" in root + except Exception as e: + print(f"Connection test failed: {e}") + return False + + +# Convenience functions +def get_asset_volume(asset_code: str = "XLM", hours: int = 24) -> Dict: + """ + Convenience function to get asset volume. 
def get_network_overview() -> Dict:
    """Return the network statistics reported by a throwaway fetcher.

    A fresh ``StellarDataFetcher`` is created for the call and its request
    cache is cleared afterwards, whether or not the lookup succeeded.
    """
    stellar = StellarDataFetcher()
    try:
        overview = stellar.get_network_stats()
    finally:
        # Always drop cached responses held by the temporary fetcher.
        stellar.clear_cache()
    return overview
@dataclass
class CheckFinding:
    """Outcome of a single ingestion quality check."""

    check_id: str
    severity: str  # "warning" | "error"
    passed: bool
    metric: Optional[str] = None
    details: Optional[Dict[str, Any]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serialisable dict; ``details`` defaults to ``{}``."""
        return {
            "check_id": self.check_id,
            "severity": self.severity,
            "passed": self.passed,
            "metric": self.metric,
            "details": self.details or {},
        }


def _parse_iso_datetime(s: str) -> Optional[datetime]:
    """Parse an ISO-8601 / RFC3339 timestamp into an aware datetime.

    Args:
        s: Timestamp text, optionally ending in ``Z``.

    Returns:
        A timezone-aware datetime, or ``None`` when ``s`` is empty, not a
        string, or not parseable. Timestamps without an offset are taken to
        be UTC so callers can safely subtract them from an aware "now".
    """
    if not s or not isinstance(s, str):
        return None
    text = s.strip()
    # Stellar Horizon uses RFC3339, often ending with Z, which
    # datetime.fromisoformat only accepts from Python 3.11 onwards.
    if text.endswith("Z"):
        text = text.replace("Z", "+00:00")
    try:
        parsed = datetime.fromisoformat(text)
    except ValueError:
        return None
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


def _horizon_latest_ledger(fetcher: "StellarDataFetcher") -> Dict[str, Any]:
    """Return latest ledger sequence + close time via Horizon.

    Best-effort: uses ``fetcher.get_network_stats()`` and tolerates both the
    ``latest_ledger``/``ledger_close_time`` and the
    ``latest_ledger_sequence``/``closed_at`` key spellings.
    """
    stats = fetcher.get_network_stats() or {}
    seq = stats.get("latest_ledger") or stats.get("latest_ledger_sequence")
    closed_at = stats.get("ledger_close_time") or stats.get("closed_at")
    dt = _parse_iso_datetime(closed_at) if isinstance(closed_at, str) else None
    return {
        "latest_ledger_sequence": seq,
        "ledger_close_time": closed_at,
        "ledger_close_time_dt": dt.isoformat() if dt else None,
    }


def check_ingestion_lag(
    *,
    fetcher: "StellarDataFetcher",
    allowed_lag_seconds: int,
) -> CheckFinding:
    """Detect ingestion lag.

    Only *network freshness* is measured: the finding fails when Horizon's
    latest ledger ``closed_at`` is older than ``allowed_lag_seconds``.

    Args:
        fetcher: Object exposing ``get_network_stats()``.
        allowed_lag_seconds: Maximum tolerated age of the latest ledger.

    Returns:
        A CheckFinding with check_id
        ``missing_ledger_ranges_or_ingestion_lag``. A failing finding has
        severity ``"error"`` (both for an unparseable close time and for
        excessive lag); a passing one has severity ``"warning"``.
    """
    check_id = "missing_ledger_ranges_or_ingestion_lag"
    latest = _horizon_latest_ledger(fetcher)
    dt = _parse_iso_datetime(latest.get("ledger_close_time") or "")
    if dt is None:
        return CheckFinding(
            check_id=check_id,
            severity="error",
            passed=False,
            metric="horizon_latest_ledger_close_time",
            details={
                "reason": "Could not parse ledger_close_time from Horizon",
                "latest": latest,
            },
        )

    now = datetime.now(timezone.utc)
    lag = (now - dt).total_seconds()
    passed = lag <= allowed_lag_seconds

    return CheckFinding(
        check_id=check_id,
        # The original expression yielded "warning" on both branches; a
        # failed lag check is the one finding that fails the run, so it is
        # reported at the same level as the unparseable-timestamp failure.
        severity="warning" if passed else "error",
        passed=passed,
        metric="ingestion_lag_seconds",
        details={
            "now_utc": now.isoformat(),
            "latest_ledger_close_time": dt.isoformat(),
            "lag_seconds": lag,
            "allowed_lag_seconds": allowed_lag_seconds,
            "latest_ledger_sequence": latest.get("latest_ledger_sequence"),
        },
    )
def _compute_expected_volume_windows(asset: str, hours_list: List[int], network: str) -> Dict[str, float]:
    """Fetch current on-chain volume for multiple horizons.

    Returns:
        Mapping of ``"<hours>h"`` to the fetched ``total_volume`` as float.
        The temporary fetcher's cache is cleared even if a fetch fails.
    """
    fetcher = StellarDataFetcher(network=network)
    try:
        return {
            f"{h}h": float(fetcher.get_asset_volume(asset, hours=h).total_volume)
            for h in hours_list
        }
    finally:
        fetcher.clear_cache()


def check_drift_between_raw_and_materialized(
    *,
    postgres: Optional["PostgresService"],
    asset: str,
    network: str,
    hours_list: List[int],
    compare_window_hours: int,
    drift_ratio_threshold: float,
) -> "CheckFinding":
    """Detect drift between raw fetch results and materialized views.

    "Materialized views" are approximated by analytics records returned from
    ``postgres.get_analytics_records``; records are matched best-effort on
    asset, metric name and record type, and the newest record per window is
    compared with a freshly fetched volume.

    Args:
        postgres: Database service, or ``None`` to skip the check.
        asset: Asset code to compare.
        network: Network name passed to the fetcher.
        hours_list: Horizons (in hours) to compare.
        compare_window_hours: Lookback used when loading stored records.
        drift_ratio_threshold: Max allowed ``abs(diff) / abs(expected)``.

    Returns:
        A warning-level CheckFinding. It passes (with a note) when the
        database is unavailable or no matching records exist; otherwise it
        passes only if every requested window has a usable record within
        the threshold. Every entry of ``drift_reports`` carries a
        ``passed`` flag.
    """
    check_id = "drift_between_raw_and_materialized_views"
    if postgres is None:
        return CheckFinding(
            check_id=check_id,
            severity="warning",
            passed=True,
            details={"note": "PostgreSQL unavailable; skipping drift checks"},
        )

    # Fetch raw volume windows (fresh).
    raw = _compute_expected_volume_windows(asset, hours_list, network)

    # The service is queried by time window only; filtering happens here.
    # A None result is treated like "no records" instead of crashing.
    recent = postgres.get_analytics_records(hours=compare_window_hours, limit=8000) or []

    metric_names = {"volume", "onchain_volume", "xlm_volume"}
    record_types = {"onchain_volume", "ingestion_onchain_volume", "stellar_volume"}
    matches = [
        r
        for r in recent
        if r.asset == asset
        and str(r.metric_name).lower() in metric_names
        and r.window is not None
        # Records without a timestamp cannot be ordered, so they are ignored.
        and r.timestamp is not None
        and str(r.record_type).lower() in record_types
    ]

    if not matches:
        return CheckFinding(
            check_id=check_id,
            severity="warning",
            passed=True,
            details={
                "note": "No matching analytics_records for on-chain volume found; skipping drift check to avoid noise.",
                "raw": raw,
                "compare_window_hours": compare_window_hours,
            },
        )

    # Keep the newest record per window (first one wins on equal timestamps).
    latest_by_window: Dict[str, Any] = {}
    for r in matches:
        current = latest_by_window.get(r.window)
        if current is None or r.timestamp > current.timestamp:
            latest_by_window[r.window] = r

    drift_reports: List[Dict[str, Any]] = []
    passed_all = True
    for h in hours_list:
        window = f"{h}h"
        candidates = (window, f"{window}_window", window.upper())
        found = next(
            (latest_by_window[key] for key in candidates if key in latest_by_window),
            None,
        )
        if found is None:
            passed_all = False
            drift_reports.append({
                "window": window,
                "status": "missing_materialization",
                "passed": False,
            })
            continue

        try:
            materialized = float(found.value)
        except (TypeError, ValueError):
            # A null / non-numeric stored value is a failed comparison, not
            # a reason to abort the whole report.
            passed_all = False
            drift_reports.append({
                "window": window,
                "status": "invalid_materialized_value",
                "passed": False,
            })
            continue

        expected = float(raw[window])
        abs_diff = abs(materialized - expected)
        if expected == 0:
            # Relative drift is undefined; require an exact match instead.
            ratio = None
            passed = abs_diff == 0
        else:
            # abs(expected) keeps the ratio non-negative for any sign.
            ratio = abs_diff / abs(expected)
            passed = ratio <= drift_ratio_threshold
        passed_all = passed_all and passed

        drift_reports.append({
            "window": window,
            "expected_raw_volume": expected,
            "materialized_volume": materialized,
            "abs_diff": abs_diff,
            "drift_ratio": ratio,
            "threshold": drift_ratio_threshold,
            "passed": passed,
        })

    return CheckFinding(
        check_id=check_id,
        # Drift is a warning-level check regardless of outcome.
        severity="warning",
        passed=passed_all,
        metric="drift_ratio",
        details={
            "asset": asset,
            "network": network,
            "raw": raw,
            "compare_window_hours": compare_window_hours,
            "drift_ratio_threshold": drift_ratio_threshold,
            "drift_reports": drift_reports,
        },
    )
+ fetcher = StellarDataFetcher(network=network) + + postgres: Optional[PostgresService] = None + try: + postgres = PostgresService() + except Exception: + postgres = None + + findings: List[CheckFinding] = [] + + findings.append( + check_ingestion_lag( + fetcher=fetcher, + allowed_lag_seconds=ingestion_lag_seconds, + ) + ) + + findings.append( + check_duplicate_events_best_effort( + postgres=postgres, + window_hours=dup_window_hours, + ) + ) + + findings.append( + check_drift_between_raw_and_materialized( + postgres=postgres, + asset=asset, + network=network, + hours_list=hours_list, + compare_window_hours=drift_compare_window_hours, + drift_ratio_threshold=drift_ratio_threshold, + ) + ) + + passed = all(f.passed for f in findings) + + report: Dict[str, Any] = { + "schema_version": 1, + "generated_at": report_ts, + "network": network, + "asset": asset, + "manual_run_id": manual_run_id, + "thresholds": { + "ingestion_lag_seconds": ingestion_lag_seconds, + "duplicate_check_window_hours": dup_window_hours, + "drift_compare_window_hours": drift_compare_window_hours, + "drift_ratio_threshold": drift_ratio_threshold, + "drift_hours_list": hours_list, + }, + "summary": { + "passed": passed, + "findings_total": len(findings), + "findings_failed": sum(1 for f in findings if not f.passed), + }, + "findings": [f.to_dict() for f in findings], + } + + # Persist + with open(out_file, "w", encoding="utf-8") as f: + json.dump(report, f, indent=2, ensure_ascii=False) + + # Print MVP clear report + print("\n=== Stellar Ingestion Quality Report ===") + print(f"generated_at: {report_ts}") + print(f"network: {network} | asset: {asset}") + print(f"passed: {passed}") + print(f"report_file: {str(out_file)}") + for fi in findings: + status = "PASS" if fi.passed else "FAIL" + print(f"- [{status}] {fi.check_id} severity={fi.severity} metric={fi.metric}") + + # If we want low-noise: exit non-zero only when ingestion lag fails. + # Drift/duplicates are warning-level (but can still be useful). 
def _parse_drift_hours(raw: str) -> List[int]:
    """Parse a comma-separated list of positive hour counts for argparse.

    Blank items are ignored; an empty list falls back to ``[24, 48]``.
    Raises ``argparse.ArgumentTypeError`` so argparse prints a usage error
    instead of a traceback.
    """
    hours: List[int] = []
    for token in str(raw).split(","):
        token = token.strip()
        if not token:
            continue
        try:
            value = int(token)
        except ValueError:
            raise argparse.ArgumentTypeError(
                f"invalid hour value {token!r}; expected comma-separated integers"
            ) from None
        if value <= 0:
            raise argparse.ArgumentTypeError(
                f"hour values must be positive, got {value}"
            )
        hours.append(value)
    return hours or [24, 48]


def main(argv: Optional[List[str]] = None) -> int:
    """Command-line entry point for the ingestion quality checks.

    Defaults come from environment variables. They are passed to argparse
    as strings so that argparse applies the same ``type`` conversion (and
    the same error reporting) to an environment value as to a flag.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        The ``exit_code`` reported by ``run_all_checks`` (0 when absent).
    """
    parser = argparse.ArgumentParser(description="Run Stellar ingestion quality checks (testnet-focused).")
    parser.add_argument("--network", default=os.getenv("STELLAR_NETWORK", "testnet"), choices=["testnet", "public"], help="Horizon network selector")
    parser.add_argument("--asset", default=os.getenv("ONCHAIN_ASSET", "XLM"), help="Asset code")

    parser.add_argument("--ingestion-lag-seconds", type=int, default=os.getenv("INGESTION_LAG_SECONDS", "300"), help="Max allowed lag between Horizon latest ledger close time and now")
    parser.add_argument("--duplicate-window-hours", type=int, default=os.getenv("DUPLICATE_WINDOW_HOURS", "24"), help="Lookback window for duplicate analytics record grouping")

    parser.add_argument("--drift-compare-window-hours", type=int, default=os.getenv("DRIFT_COMPARE_WINDOW_HOURS", "24"), help="Lookback for materialized view records")
    parser.add_argument("--drift-ratio-threshold", type=float, default=os.getenv("DRIFT_RATIO_THRESHOLD", "0.05"), help="Max allowed relative drift (abs(diff)/expected)")
    parser.add_argument("--drift-hours", type=_parse_drift_hours, default=os.getenv("DRIFT_HOURS_LIST", "24,48"), help="Comma-separated list of horizons to compare, e.g. 24,48")

    parser.add_argument("--report-dir", default=os.getenv("INGESTION_REPORT_DIR", REPORT_DIR_DEFAULT), help="Directory to persist reports")
    parser.add_argument("--manual-run-id", default=os.getenv("MANUAL_RUN_ID"), help="Optional run identifier")

    args = parser.parse_args(argv)

    result = run_all_checks(
        network=args.network,
        asset=str(args.asset).upper(),
        ingestion_lag_seconds=args.ingestion_lag_seconds,
        dup_window_hours=args.duplicate_window_hours,
        drift_compare_window_hours=args.drift_compare_window_hours,
        drift_ratio_threshold=args.drift_ratio_threshold,
        hours_list=args.drift_hours,
        report_dir=args.report_dir,
        manual_run_id=args.manual_run_id,
    )

    return int(result.get("exit_code", 0))
+""" + +import os +import sys +import logging +import signal +import time +from concurrent.futures import ThreadPoolExecutor +from datetime import datetime +from dotenv import load_dotenv + +# Add the src directory to the Python path +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) + +# Import both pipeline and scheduler +from src.ingestion.news_fetcher import fetch_news +from src.ingestion.price_fetcher import PriceFetcher +from src.ingestion.stellar_fetcher import get_asset_volume, get_network_overview +from src.validators import validate_news_article, validate_onchain_metric +from src.analytics.market_analyzer import MarketAnalyzer, MarketData +from src.analytics.market_analyzer import get_explanation +from src.sentiment import SentimentAnalyzer +from src.anomaly_detector import AnomalyDetector +from src.alert_notifier import notifier +from scheduler import AnalyticsScheduler + +from src.utils.logger import setup_logger, CorrelationIdFilter +from src.utils.metrics import API_FAILURES_TOTAL, start_metrics_server +from pythonjsonlogger import jsonlogger + +# Configure logging +logger = setup_logger(__name__) +os.makedirs("./logs", exist_ok=True) +file_handler = logging.FileHandler("./logs/data_processor.log") +formatter = jsonlogger.JsonFormatter( + "%(asctime)s %(levelname)s %(name)s %(correlation_id)s %(message)s", + rename_fields={"levelname": "level"} +) +file_handler.addFilter(CorrelationIdFilter()) +file_handler.setFormatter(formatter) +logger.addHandler(file_handler) + +# Module-level detector so it accumulates rolling window data across +# scheduled pipeline runs (meaningful baselines build up over time). 
def setup_signal_handlers():
    """Register SIGINT and SIGTERM handlers for graceful shutdown."""

    def _shutdown(sig, frame):
        # Stop the module-level scheduler (when one is running), then exit
        # with a success status.
        logger.info("Received shutdown signal, cleaning up...")
        if scheduler:
            scheduler.stop()
        sys.exit(0)

    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, _shutdown)
{len(news_articles)} validated articles") + + print("\n2. PRICE FEED") + print("-" * 40) + if raw_price_feed: + for price_point in raw_price_feed: + status = "stale" if price_point.get("is_stale") else "fresh" + print( + f"{price_point['asset_code']}: ${price_point['price_usd']:.7f} " + f"({price_point['price']} scaled, decimals={price_point['asset_decimals']}, {status})" + ) + else: + print("Price feed unavailable") + + # ── Sentiment analysis (parallel for large batches) ────────── + print("\n3. SENTIMENT ANALYSIS") + print("-" * 40) + + sentiment_analyzer = SentimentAnalyzer() + if news_articles: + article_texts = [ + (a.get("title", "") + " " + a.get("summary", "")).strip() + for a in news_articles + ] + sentiment_results = sentiment_analyzer.analyze_batch_parallel(article_texts) + summary = sentiment_analyzer.get_sentiment_summary(sentiment_results) + avg_sentiment = summary["average_compound_score"] + print(f"Avg sentiment: {avg_sentiment:.4f} " + f"(+{summary['positive_count']} / " + f"-{summary['negative_count']} / " + f"~{summary['neutral_count']})") + else: + avg_sentiment = 0.0 + sentiment_results = [] + print("No valid articles, using neutral sentiment") + + # ── Validate on-chain metrics ──────────────────────────────── + print("\n4. 
STELLAR ON-CHAIN DATA") + print("-" * 40) + + validated_volume_24h = validate_onchain_metric({ + "metric_id": "xlm_volume_24h", + "value": raw_volume_24h.get("total_volume", 0.0), + "timestamp": raw_volume_24h.get("end_time", ""), + "chain": "stellar", + "extra": raw_volume_24h, + }) + if validated_volume_24h: + volume_24h = validated_volume_24h.dict() + else: + logger.warning("Invalid on-chain metric for 24h volume, using defaults.") + volume_24h = {"total_volume": 0.0, "transaction_count": 0} + + print(f"XLM Volume (24h): {volume_24h.get('total_volume', 0.0):,.2f}") + print(f"Transactions: {volume_24h.get('transaction_count', 0)}") + + validated_volume_48h = validate_onchain_metric({ + "metric_id": "xlm_volume_48h", + "value": raw_volume_48h.get("total_volume", 0.0), + "timestamp": raw_volume_48h.get("end_time", ""), + "chain": "stellar", + "extra": raw_volume_48h, + }) + if validated_volume_48h: + volume_48h = validated_volume_48h.dict() + else: + logger.warning("Invalid on-chain metric for 48h volume, using defaults.") + volume_48h = {"total_volume": 0.0} + + # Calculate volume change percentage + if volume_48h["total_volume"] > 0: + volume_change = ( + volume_24h["total_volume"] - volume_48h["total_volume"] + ) / volume_48h["total_volume"] + print(f"Volume Change (24h vs 48h): {volume_change:.2%}") + else: + volume_change = 0.0 + print("Insufficient data for volume change calculation") + + if network_stats: + print(f"Latest Ledger: {network_stats.get('latest_ledger', 'N/A')}") + print(f"Transaction Count: {network_stats.get('transaction_count', 0)}") + + # Step 5: Market Analysis + print("\n5. 
MARKET ANALYSIS") + print("-" * 40) + + # Create market data + market_data = MarketData( + sentiment_score=avg_sentiment, volume_change=volume_change + ) + + # Analyze market trend + trend, score, metrics = MarketAnalyzer.analyze_trend(market_data) + + print(f"Market Health Score: {score:.2f}") + print(f"Trend: {trend.value.upper()}") + print(f"Sentiment Component: {metrics['sentiment_component']:.2f}") + print(f"Volume Component: {metrics['volume_component']:.2f}") + + # Generate explanation + explanation = get_explanation(score, trend) + print(f"\nAnalysis: {explanation}") + + # Step 6: Anomaly Detection + print("\n6. ANOMALY DETECTION") + print("-" * 40) + + current_volume = float(volume_24h["total_volume"]) + now = datetime.utcnow() + + # Feed current data point into the rolling window detector + anomaly_detector.add_data_point( + volume=current_volume, + sentiment_score=avg_sentiment, + timestamp=now, + ) + + # Run detection on both metrics + volume_anomaly = anomaly_detector.detect_volume_anomaly(current_volume, now) + sentiment_anomaly = anomaly_detector.detect_sentiment_anomaly(avg_sentiment, now) + + anomalies_found = [] + + for result in [volume_anomaly, sentiment_anomaly]: + status = "⚠️ ANOMALY" if result.is_anomaly else "✓ Normal" + print( + f"{status} | {result.metric_name.capitalize():<10} | " + f"value={result.current_value:.4f} | " + f"z={result.z_score:.2f} | " + f"severity={result.severity_score:.2f}" + ) + if result.is_anomaly: + anomalies_found.append(result.to_dict()) + logger.warning( + f"Anomaly detected — metric={result.metric_name}, " + f"value={result.current_value:.4f}, " + f"z_score={result.z_score:.2f}, " + f"severity={result.severity_score:.2f}" + ) + + # Trigger alerts for detected anomalies + if anomalies_found: + notifier.notify_batch([volume_anomaly, sentiment_anomaly]) + + window_stats = anomaly_detector.get_window_stats() + print(f"Detector window: {window_stats['data_points_count']} data points") + + if not anomalies_found: + 
def start_scheduler():
    """Start the scheduled data processing service.

    Starts the metrics server on port 9091, creates the scheduler around
    ``run_data_pipeline``, installs the signal handlers and then blocks in
    a sleep loop. Setting ``RUN_IMMEDIATELY=true`` triggers one run before
    the schedule starts. Any exception stops the scheduler and exits with
    status 1.
    """
    global scheduler

    # Start metrics server on port 9091 for background worker
    start_metrics_server(port=9091)

    logger.info("=" * 70)
    logger.info("LumenPulse Data Processing Service Starting")
    logger.info("=" * 70)

    try:
        # Initialize and start the scheduler
        scheduler = AnalyticsScheduler(run_data_pipeline)
        setup_signal_handlers()

        # Option to run immediately on startup (useful for testing)
        run_on_startup = os.getenv("RUN_IMMEDIATELY", "false").lower() == "true"

        if run_on_startup:
            logger.info("Running analyzer immediately on startup...")
            scheduler.run_immediately()

        # Start the scheduler
        scheduler.start()

        logger.info("Data processing service is running. Press Ctrl+C to stop.")
        logger.info("The Market Analyzer will run automatically every hour.")

        # Keep the application running; shutdown happens in the signal
        # handlers. ``time`` is the module-level import.
        while True:
            time.sleep(1)

    except Exception as e:
        logger.error(f"Fatal error in data processing service: {e}", exc_info=True)
        if scheduler:
            scheduler.stop()
        sys.exit(1)


def main():
    """Main entry point - handles both CLI modes.

    Commands (first CLI argument, case-insensitive):
        run   - run the pipeline once and return its result dict
        serve - start the scheduled service (blocks)
        help  - print usage and return ``{"help": True}``

    With no argument the pipeline runs once. An unknown command returns
    ``{"success": False, "error": ...}`` so that the caller's
    ``result.get("success", True)`` check reports a failure exit status
    instead of silently exiting 0.
    """
    load_dotenv()

    # Create logs directory if it doesn't exist
    os.makedirs("./logs", exist_ok=True)

    if len(sys.argv) <= 1:
        # Default: run once (original behavior)
        result = run_data_pipeline()
        print("\n" + "=" * 60)
        print("PIPELINE COMPLETE")
        print("=" * 60)
        return result

    command = sys.argv[1].lower()

    if command == "run":
        # Run pipeline once and exit
        return run_data_pipeline()
    if command == "serve":
        # Start scheduled service
        start_scheduler()
        return None
    if command == "help":
        print("Usage:")
        print("  python pipeline.py run     - Run pipeline once")
        print("  python pipeline.py serve   - Start scheduled service")
        print("  python pipeline.py help    - Show this help")
        return {"help": True}

    print(f"Unknown command: {command}")
    print("Use 'python pipeline.py help' for usage instructions")
    return {"success": False, "error": f"Unknown command: {command}"}
+""" + +from .price_predictor import PricePredictor +from .model_registry import ( + save_model, + load_model, + promote_model, + get_live_model, + list_versions, + get_current_version, + get_registry_status, +) +from .retraining_pipeline import run_retraining, get_last_run_status + +__all__ = [ + "PricePredictor", + "save_model", + "load_model", + "promote_model", + "get_live_model", + "list_versions", + "get_current_version", + "get_registry_status", + "run_retraining", + "get_last_run_status", +] diff --git a/temp_backup/src/ml/feature_store.py b/temp_backup/src/ml/feature_store.py new file mode 100644 index 00000000..a7d29cb0 --- /dev/null +++ b/temp_backup/src/ml/feature_store.py @@ -0,0 +1,83 @@ +import pandas as pd +from sqlalchemy.orm import Session +from sqlalchemy import text +from datetime import datetime, timedelta, timezone + +class FeatureStore: + def __init__(self, db_session: Session): + """ + Initialize the FeatureStore with a SQLAlchemy database session. + """ + self.db = db_session + + def _parse_window_to_datetime(self, window: str) -> datetime: + """Helper to parse window strings like '24h' or '7d' into a past timestamp.""" + # Fix deprecation warning by using timezone-aware UTC datetime + now = datetime.now(timezone.utc) + if window.endswith('h'): + return now - timedelta(hours=int(window[:-1])) + elif window.endswith('d'): + return now - timedelta(days=int(window[:-1])) + else: + raise ValueError("Unsupported window format. 
Use 'h' (hours) or 'd' (days).") + + def _ensure_columns(self, df: pd.DataFrame, expected_col: str) -> pd.DataFrame: + """Ensures the DataFrame has the correct base columns, even if it's completely empty.""" + if 'timestamp' not in df.columns: + df['timestamp'] = pd.Series(dtype='datetime64[ns]') + if expected_col not in df.columns: + df[expected_col] = pd.Series(dtype='float64') + return df + + def get_features_for_asset(self, asset: str, window: str) -> pd.DataFrame: + """ + Retrieves and combines features for a specific asset over a given time window. + Combines: Sentiment stats, Volume metrics, and Volatility indicators. + """ + start_time = self._parse_window_to_datetime(window) + + sentiment_query = text(""" + SELECT timestamp, sentiment_score FROM asset_sentiment_view + WHERE asset = :asset AND timestamp >= :start_time + """) + + volume_query = text(""" + SELECT timestamp, volume FROM asset_volume_view + WHERE asset = :asset AND timestamp >= :start_time + """) + + volatility_query = text(""" + SELECT timestamp, volatility FROM asset_volatility_view + WHERE asset = :asset AND timestamp >= :start_time + """) + + conn = self.db.connection() + try: + params = {"asset": asset, "start_time": start_time} + sentiment_df = pd.read_sql(sentiment_query, conn, params=params) + volume_df = pd.read_sql(volume_query, conn, params=params) + volatility_df = pd.read_sql(volatility_query, conn, params=params) + except Exception: + sentiment_df = pd.DataFrame() + volume_df = pd.DataFrame() + volatility_df = pd.DataFrame() + + # Ensure all dataframes have the right columns before merging + sentiment_df = self._ensure_columns(sentiment_df, 'sentiment_score') + volume_df = self._ensure_columns(volume_df, 'volume') + volatility_df = self._ensure_columns(volatility_df, 'volatility') + + # Always merge using outer joins to align the time series and preserve column names + features_df = pd.merge(sentiment_df, volume_df, on='timestamp', how='outer') + features_df = 
pd.merge(features_df, volatility_df, on='timestamp', how='outer') + + # If no actual data exists, return the empty DataFrame (now with the correct headers) + if features_df.empty: + return features_df + + # Clean up the merged dataset (sort by time, forward fill missing values) + features_df.sort_values('timestamp', inplace=True) + features_df.ffill(inplace=True) + features_df.fillna(0, inplace=True) # Fill remaining NaNs with 0 + + return features_df \ No newline at end of file diff --git a/temp_backup/src/ml/model_registry.py b/temp_backup/src/ml/model_registry.py new file mode 100644 index 00000000..077c662b --- /dev/null +++ b/temp_backup/src/ml/model_registry.py @@ -0,0 +1,223 @@ +""" +Model Registry - versioned model storage with atomic zero-downtime swap. + +Versions follow semver-lite: v. (e.g. v1.0, v1.1, v2.0) +Each model type (sentiment, price_predictor) is stored independently. + +Directory layout: + models/ + sentiment/ + v1.0.pkl + v1.1.pkl + current -> v1.1.pkl (symlink, updated atomically) + price_predictor/ + v1.0.pkl + current -> v1.0.pkl +""" + +import os +import pickle +import shutil +import threading +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, Optional, Tuple + +from src.utils.logger import setup_logger + +logger = setup_logger(__name__) + +_MODELS_ROOT = Path(os.getenv("MODEL_REGISTRY_PATH", "./models")) + +# In-memory hot-swap: the live model is held here so the API never reads disk +# during inference. A reentrant read-write lock guards concurrent access. 
+_live_models: Dict[str, Any] = {} +_live_versions: Dict[str, str] = {} +_lock = threading.RLock() + + +# --------------------------------------------------------------------------- +# Internal helpers +# --------------------------------------------------------------------------- + +def _model_dir(model_type: str) -> Path: + d = _MODELS_ROOT / model_type + d.mkdir(parents=True, exist_ok=True) + return d + + +def _symlink_path(model_type: str) -> Path: + return _model_dir(model_type) / "current" + + +def _version_path(model_type: str, version: str) -> Path: + return _model_dir(model_type) / f"{version}.pkl" + + +def _next_version(model_type: str) -> str: + """Increment the minor version of the latest saved model.""" + existing = list_versions(model_type) + if not existing: + return "v1.0" + # Parse the highest version + def _parse(v: str) -> Tuple[int, int]: + parts = v.lstrip("v").split(".") + return int(parts[0]), int(parts[1]) + + major, minor = max(_parse(v) for v in existing) + return f"v{major}.{minor + 1}" + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + +def save_model(model_type: str, model_obj: Any, version: Optional[str] = None) -> str: + """ + Persist a trained model to disk and return the version string. + + Args: + model_type: e.g. "sentiment" or "price_predictor" + model_obj: The object to pickle (sklearn pipeline, VADER lexicon dict, …) + version: Explicit version string; auto-incremented if omitted. + + Returns: + The version string that was saved (e.g. "v1.2"). 
+ """ + if version is None: + version = _next_version(model_type) + + path = _version_path(model_type, version) + with open(path, "wb") as fh: + pickle.dump(model_obj, fh, protocol=pickle.HIGHEST_PROTOCOL) + + logger.info(f"Model saved: type={model_type} version={version} path={path}") + return version + + +def load_model(model_type: str, version: str = "current") -> Any: + """ + Load a model from disk. + + Args: + model_type: e.g. "sentiment" or "price_predictor" + version: Specific version string or "current" (follows symlink). + + Returns: + The unpickled model object. + """ + if version == "current": + sym = _symlink_path(model_type) + if not sym.exists(): + raise FileNotFoundError( + f"No current model for '{model_type}'. Run retraining first." + ) + path = sym.resolve() + else: + path = _version_path(model_type, version) + + if not path.exists(): + raise FileNotFoundError(f"Model not found: {path}") + + with open(path, "rb") as fh: + obj = pickle.load(fh) + + logger.info(f"Model loaded from disk: type={model_type} version={version}") + return obj + + +def promote_model(model_type: str, version: str) -> None: + """ + Atomically promote a saved version to 'current' (zero-downtime swap). + + The on-disk symlink is updated atomically via a rename, and the + in-memory hot model is swapped under the RLock so in-flight requests + finish with the old model while new requests immediately use the new one. + + Args: + model_type: e.g. "sentiment" or "price_predictor" + version: The version to promote (must already be saved). 
+ """ + target = _version_path(model_type, version) + if not target.exists(): + raise FileNotFoundError( + f"Cannot promote {model_type}@{version}: file not found at {target}" + ) + + sym = _symlink_path(model_type) + tmp_sym = sym.with_suffix(".tmp") + + # Atomic symlink swap (POSIX rename is atomic) + if tmp_sym.exists() or tmp_sym.is_symlink(): + tmp_sym.unlink() + tmp_sym.symlink_to(target.name) + tmp_sym.rename(sym) + + # Hot-swap in memory + new_model = load_model(model_type, version) + with _lock: + _live_models[model_type] = new_model + _live_versions[model_type] = version + + logger.info(f"Model promoted: type={model_type} version={version} (zero-downtime swap complete)") + + +def get_live_model(model_type: str) -> Any: + """ + Return the currently active in-memory model. + Falls back to loading from disk if not yet warm. + + Args: + model_type: e.g. "sentiment" or "price_predictor" + + Returns: + The live model object. + """ + with _lock: + if model_type in _live_models: + return _live_models[model_type] + + # Cold start: load from disk and cache + model = load_model(model_type, "current") + with _lock: + _live_models[model_type] = model + sym = _symlink_path(model_type) + if sym.exists(): + _live_versions[model_type] = sym.resolve().stem # filename without .pkl + return model + + +def list_versions(model_type: str) -> list: + """Return sorted list of saved version strings for a model type.""" + d = _model_dir(model_type) + versions = [ + p.stem for p in d.glob("v*.pkl") + ] + return sorted(versions) + + +def get_current_version(model_type: str) -> Optional[str]: + """Return the currently promoted version string, or None.""" + with _lock: + if model_type in _live_versions: + return _live_versions[model_type] + + sym = _symlink_path(model_type) + if sym.exists(): + return sym.resolve().stem + return None + + +def get_registry_status() -> Dict[str, Any]: + """Return a status snapshot of all registered model types.""" + status = {} + if 
_MODELS_ROOT.exists(): + for model_dir in _MODELS_ROOT.iterdir(): + if model_dir.is_dir(): + mtype = model_dir.name + status[mtype] = { + "current_version": get_current_version(mtype), + "available_versions": list_versions(mtype), + "live_in_memory": mtype in _live_models, + } + return status diff --git a/temp_backup/src/ml/price_predictor.py b/temp_backup/src/ml/price_predictor.py new file mode 100644 index 00000000..be9e1e14 --- /dev/null +++ b/temp_backup/src/ml/price_predictor.py @@ -0,0 +1,93 @@ +import logging +import pandas as pd +import numpy as np +from typing import Dict, Any, List, Optional +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.linear_model import LinearRegression +from sklearn.model_selection import train_test_split +from sklearn.metrics import mean_squared_error, r2_score + +logger = logging.getLogger(__name__) + +class PricePredictor: + """ + A structured ML predictor for asset prices using scikit-learn pipelines. + """ + + def __init__(self, model_name: str = "linear_regression"): + self.model_name = model_name + self.pipeline = self._build_pipeline() + self.is_trained = False + self.metrics: Dict[str, float] = {} + + def _build_pipeline(self) -> Pipeline: + """ + Builds the scikit-learn pipeline with scaling and a regressor. + """ + return Pipeline([ + ('scaler', StandardScaler()), + ('regressor', LinearRegression()) + ]) + + def fit(self, data: pd.DataFrame, target_column: str = 'target') -> Dict[str, float]: + """ + Trains the model using the provided training data. + + Args: + data: DataFrame containing features and the target column. + target_column: The name of the column to predict. + + Returns: + A dictionary containing training metrics. 
+ """ + if data.empty: + raise ValueError("Training data is empty.") + + if target_column not in data.columns: + raise ValueError(f"Target column '{target_column}' not found in data.") + + logger.info(f"Training PricePredictor model: {self.model_name}") + + X = data.drop(columns=[target_column]) + y = data[target_column] + + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) + + self.pipeline.fit(X_train, y_train) + + y_pred = self.pipeline.predict(X_test) + self.metrics = { + "mse": float(mean_squared_error(y_test, y_pred)), + "r2": float(r2_score(y_test, y_pred)) + } + + self.is_trained = True + logger.info(f"Model trained successfully. Metrics: {self.metrics}") + + return self.metrics + + def predict(self, features: pd.DataFrame) -> np.ndarray: + """ + Predicts the price based on input features. + + Args: + features: DataFrame containing the features for prediction. + + Returns: + Array of predicted values. + """ + if not self.is_trained: + raise RuntimeError("Model must be trained before calling predict.") + + if features.empty: + return np.array([]) + + logger.info(f"Predicting with model: {self.model_name}") + return self.pipeline.predict(features) + + def get_metrics(self) -> Dict[str, float]: + """ + Returns the metrics calculated during the last training session. + """ + return self.metrics diff --git a/temp_backup/src/ml/retraining_pipeline.py b/temp_backup/src/ml/retraining_pipeline.py new file mode 100644 index 00000000..6b16e61d --- /dev/null +++ b/temp_backup/src/ml/retraining_pipeline.py @@ -0,0 +1,274 @@ +""" +Automated Model Retraining Pipeline (Issue #454) + +Retrains both models on fresh data, evaluates quality gates, +versions the artifacts, and promotes them with zero downtime. 
+ +Models: + - sentiment : VADER lexicon + custom crypto slang dictionary + - price_predictor : scikit-learn LinearRegression pipeline +""" + +import os +import json +import threading +from datetime import datetime, timedelta +from pathlib import Path +from typing import Any, Dict, Optional, Tuple + +import pandas as pd +from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer + +from src.ml.model_registry import ( + save_model, + promote_model, + get_current_version, + get_registry_status, +) +from src.ml.price_predictor import PricePredictor +from src.utils.logger import setup_logger +from src.utils.metrics import JOBS_RUN_TOTAL, MODEL_RETRAINING_TOTAL, MODEL_RETRAINING_DURATION + +logger = setup_logger(__name__) + +# Path to the custom crypto-slang lexicon file (JSON: {"word": score, ...}) +_SLANG_LEXICON_PATH = Path( + os.getenv("CRYPTO_SLANG_LEXICON", "./data/crypto_slang_lexicon.json") +) + +# Quality gates: minimum acceptable metrics before promotion +_MIN_SENTIMENT_COVERAGE = float(os.getenv("MIN_SENTIMENT_COVERAGE", "0.0")) +_MIN_PRICE_R2 = float(os.getenv("MIN_PRICE_R2", "-1.0")) # permissive default + +# Thread-safety: only one retraining run at a time +_retrain_lock = threading.Lock() + +# Last run metadata (in-memory, also written to disk) +_last_run: Dict[str, Any] = {} + + +# --------------------------------------------------------------------------- +# Sentiment model retraining +# --------------------------------------------------------------------------- + +def _load_crypto_slang() -> Dict[str, float]: + """ + Load the custom crypto-slang lexicon from disk. + Returns an empty dict if the file doesn't exist yet. + """ + if not _SLANG_LEXICON_PATH.exists(): + logger.warning( + f"Crypto slang lexicon not found at {_SLANG_LEXICON_PATH}. " + "Using base VADER lexicon only." 
+ ) + return {} + + with open(_SLANG_LEXICON_PATH) as fh: + lexicon = json.load(fh) + + logger.info(f"Loaded {len(lexicon)} custom crypto-slang entries") + return lexicon + + +def _build_sentiment_model() -> Tuple[SentimentIntensityAnalyzer, Dict[str, Any]]: + """ + Build a VADER analyzer enriched with the latest crypto-slang lexicon. + + Returns: + (analyzer, metrics_dict) + """ + analyzer = SentimentIntensityAnalyzer() + slang = _load_crypto_slang() + + if slang: + analyzer.lexicon.update(slang) + logger.info(f"Enriched VADER lexicon with {len(slang)} crypto-slang terms") + + metrics = { + "base_lexicon_size": len(SentimentIntensityAnalyzer().lexicon), + "custom_terms_added": len(slang), + "total_lexicon_size": len(analyzer.lexicon), + "coverage_ratio": len(slang) / max(len(analyzer.lexicon), 1), + } + return analyzer, metrics + + +# --------------------------------------------------------------------------- +# Price predictor retraining +# --------------------------------------------------------------------------- + +def _fetch_training_data(db_session=None) -> pd.DataFrame: + """ + Fetch recent feature data for the price predictor. + + In production this queries the feature store; falls back to a + synthetic dataset so the pipeline never hard-fails in CI/dev. 
+ """ + if db_session is not None: + try: + from src.ml.feature_store import FeatureStore + store = FeatureStore(db_session) + df = store.get_features_for_asset("XLM", "30d") + if not df.empty and len(df) >= 20: + # Create a simple target: next-period sentiment shift + df["target"] = df["sentiment_score"].shift(-1) + df.dropna(inplace=True) + logger.info(f"Fetched {len(df)} rows from feature store for retraining") + return df + except Exception as exc: + logger.warning(f"Feature store unavailable, using synthetic data: {exc}") + + # Synthetic fallback — keeps the pipeline runnable without a live DB + import numpy as np + rng = np.random.default_rng(seed=int(datetime.utcnow().timestamp()) % 10_000) + n = 200 + df = pd.DataFrame({ + "sentiment_score": rng.uniform(-1, 1, n), + "volume": rng.uniform(1_000, 100_000, n), + "volatility": rng.uniform(0, 0.5, n), + "target": rng.uniform(-1, 1, n), + }) + logger.info("Using synthetic training data (no live DB session provided)") + return df + + +def _build_price_predictor(db_session=None) -> Tuple[PricePredictor, Dict[str, Any]]: + """ + Retrain the PricePredictor on fresh data. + + Returns: + (predictor, metrics_dict) + """ + df = _fetch_training_data(db_session) + predictor = PricePredictor(model_name="linear_regression") + metrics = predictor.fit(df, target_column="target") + logger.info(f"PricePredictor retrained: {metrics}") + return predictor, metrics + + +# --------------------------------------------------------------------------- +# Orchestrator +# --------------------------------------------------------------------------- + +def run_retraining( + db_session=None, + force: bool = False, +) -> Dict[str, Any]: + """ + Full retraining run: train → evaluate → version → promote. + + Args: + db_session: Optional SQLAlchemy session for the feature store. + force: Skip quality gates and always promote. + + Returns: + A result dict with versions, metrics, and status. 
+ """ + global _last_run + + if not _retrain_lock.acquire(blocking=False): + logger.warning("Retraining already in progress, skipping this trigger") + return {"status": "skipped", "reason": "already_running"} + + started_at = datetime.utcnow() + result: Dict[str, Any] = { + "status": "started", + "started_at": started_at.isoformat(), + "models": {}, + } + + try: + logger.info("=" * 60) + logger.info("Automated Model Retraining Pipeline — START") + logger.info(f"Timestamp: {started_at.isoformat()}") + + # ── 1. Sentiment model ────────────────────────────────────────────── + logger.info("Step 1: Retraining sentiment model …") + with MODEL_RETRAINING_DURATION.labels(model_type="sentiment").time(): + sentiment_model, sentiment_metrics = _build_sentiment_model() + + passes_sentiment_gate = ( + force + or sentiment_metrics["coverage_ratio"] >= _MIN_SENTIMENT_COVERAGE + ) + + if passes_sentiment_gate: + s_version = save_model("sentiment", sentiment_model) + promote_model("sentiment", s_version) + MODEL_RETRAINING_TOTAL.labels(model_type="sentiment", status="success").inc() + result["models"]["sentiment"] = { + "version": s_version, + "metrics": sentiment_metrics, + "promoted": True, + } + logger.info(f"Sentiment model promoted: {s_version}") + else: + MODEL_RETRAINING_TOTAL.labels(model_type="sentiment", status="failed").inc() + result["models"]["sentiment"] = { + "metrics": sentiment_metrics, + "promoted": False, + "reason": "quality_gate_failed", + } + logger.warning("Sentiment model did NOT pass quality gate — skipping promotion") + + # ── 2. 
Price predictor ────────────────────────────────────────────── + logger.info("Step 2: Retraining price predictor …") + with MODEL_RETRAINING_DURATION.labels(model_type="price_predictor").time(): + price_model, price_metrics = _build_price_predictor(db_session) + + passes_price_gate = force or price_metrics.get("r2", -999) >= _MIN_PRICE_R2 + + if passes_price_gate: + p_version = save_model("price_predictor", price_model) + promote_model("price_predictor", p_version) + MODEL_RETRAINING_TOTAL.labels(model_type="price_predictor", status="success").inc() + result["models"]["price_predictor"] = { + "version": p_version, + "metrics": price_metrics, + "promoted": True, + } + logger.info(f"PricePredictor promoted: {p_version}") + else: + MODEL_RETRAINING_TOTAL.labels(model_type="price_predictor", status="failed").inc() + result["models"]["price_predictor"] = { + "metrics": price_metrics, + "promoted": False, + "reason": "quality_gate_failed", + } + logger.warning("PricePredictor did NOT pass quality gate — skipping promotion") + + # ── 3. 
Finalise ───────────────────────────────────────────────────── + finished_at = datetime.utcnow() + result.update( + { + "status": "completed", + "finished_at": finished_at.isoformat(), + "duration_seconds": (finished_at - started_at).total_seconds(), + "registry": get_registry_status(), + } + ) + + JOBS_RUN_TOTAL.inc() + logger.info("Automated Model Retraining Pipeline — DONE") + logger.info("=" * 60) + + except Exception as exc: + result.update( + { + "status": "failed", + "error": str(exc), + "finished_at": datetime.utcnow().isoformat(), + } + ) + logger.error(f"Retraining pipeline failed: {exc}", exc_info=True) + + finally: + _last_run = result + _retrain_lock.release() + + return result + + +def get_last_run_status() -> Dict[str, Any]: + """Return metadata from the most recent retraining run.""" + return _last_run or {"status": "never_run"} diff --git a/temp_backup/src/qa_exporter.py b/temp_backup/src/qa_exporter.py new file mode 100644 index 00000000..017be4a3 --- /dev/null +++ b/temp_backup/src/qa_exporter.py @@ -0,0 +1,256 @@ +""" +QA Dataset Exporter + +Exports raw events, materialized views, and KPIs for a given Stellar ledger range. +Intended for QA engineers and contributor debugging. + +Output format: JSON files written to output_dir/ + - events__.json : raw contract events (from AnalyticsRecord where record_type='event') + - views__.json : materialized views (aggregated Article + SocialPost sentiment) + - kpis__.json : computed KPIs (from AssetTrend) + +Each file has the envelope: + { + "status": "completed", + "exported_at": "", + "start_ledger": , + "end_ledger": , + "count": , + "records": [ ... 
] + } +""" + +import json +import logging +import sys +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Dict, List, Optional + +from sqlalchemy import create_engine, select, and_ +from sqlalchemy.orm import sessionmaker + +from src.db.models import AnalyticsRecord, Article, AssetTrend, SocialPost + +logger = logging.getLogger(__name__) + + +@dataclass +class ExportResult: + """Result of a single export operation.""" + + dataset: str + path: str + count: int + status: str + + def to_dict(self) -> Dict[str, Any]: + return { + "dataset": self.dataset, + "path": self.path, + "count": self.count, + "status": self.status, + } + + +class QAExporter: + """ + Exports QA datasets (events, views, KPIs) for a Stellar ledger range. + + Ledger numbers are mapped to AnalyticsRecord / AssetTrend rows via the + ``extra_data->>'ledger'`` JSON field written by the ingestion pipeline. + Articles and SocialPosts are included in the views export regardless of + ledger (they carry no ledger field) when no ledger filter can be applied. 
+ """ + + def __init__( + self, + start_ledger: int, + end_ledger: int, + output_dir: str, + database_url: Optional[str] = None, + ): + import os + + self.start_ledger = start_ledger + self.end_ledger = end_ledger + self.output_dir = Path(output_dir) + self.output_dir.mkdir(parents=True, exist_ok=True) + + db_url = database_url or os.getenv( + "DATABASE_URL", + "postgresql://postgres:postgres@localhost:5432/lumenpulse", + ) + engine = create_engine(db_url, pool_pre_ping=True, echo=False) + self.Session = sessionmaker(bind=engine, expire_on_commit=False) + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _envelope(self, records: List[Dict], dataset: str) -> Dict[str, Any]: + return { + "status": "completed", + "exported_at": datetime.now(timezone.utc).isoformat(), + "start_ledger": self.start_ledger, + "end_ledger": self.end_ledger, + "dataset": dataset, + "count": len(records), + "records": records, + } + + def _write(self, data: Dict, name: str) -> Path: + path = self.output_dir / f"{name}_{self.start_ledger}_{self.end_ledger}.json" + with open(path, "w") as f: + json.dump(data, f, indent=2, default=str) + return path + + def _ledger_filter(self, model): + """ + Return a SQLAlchemy filter that restricts rows whose extra_data JSON + contains a 'ledger' key within [start_ledger, end_ledger]. + Falls back to no filter if the column cast is unavailable. 
+ """ + from sqlalchemy import cast, Integer + from sqlalchemy.dialects.postgresql import JSONB + + try: + ledger_col = model.extra_data["ledger"].astext.cast(Integer) + return and_( + ledger_col >= self.start_ledger, + ledger_col <= self.end_ledger, + ) + except Exception: + return None # no ledger field on this model; caller handles it + + # ------------------------------------------------------------------ + # Export methods + # ------------------------------------------------------------------ + + def export_events(self) -> ExportResult: + """Export raw events (AnalyticsRecord rows with record_type='event').""" + with self.Session() as session: + q = select(AnalyticsRecord).where( + AnalyticsRecord.record_type == "event" + ) + ledger_f = self._ledger_filter(AnalyticsRecord) + if ledger_f is not None: + q = q.where(ledger_f) + + rows = session.execute(q).scalars().all() + records = [ + { + "id": r.id, + "record_type": r.record_type, + "asset": r.asset, + "metric_name": r.metric_name, + "window": r.window, + "value": r.value, + "previous_value": r.previous_value, + "change_percentage": r.change_percentage, + "trend_direction": r.trend_direction, + "extra_data": r.extra_data, + "timestamp": r.timestamp.isoformat() if r.timestamp else None, + } + for r in rows + ] + + data = self._envelope(records, "events") + path = self._write(data, "events") + logger.info("Exported %d events → %s", len(records), path) + return ExportResult("events", str(path), len(records), "completed") + + def export_views(self) -> ExportResult: + """ + Export materialized views: aggregated sentiment from Articles and + SocialPosts, plus all non-event AnalyticsRecord rows. 
+ """ + with self.Session() as session: + articles = session.execute(select(Article)).scalars().all() + posts = session.execute(select(SocialPost)).scalars().all() + + analytics_q = select(AnalyticsRecord).where( + AnalyticsRecord.record_type != "event" + ) + analytics = session.execute(analytics_q).scalars().all() + + records = { + "articles": [ + { + "article_id": a.article_id, + "title": a.title, + "source": a.source, + "primary_asset": a.primary_asset, + "sentiment_score": a.sentiment_score, + "sentiment_label": a.sentiment_label, + "published_at": a.published_at.isoformat() if a.published_at else None, + } + for a in articles + ], + "social_posts": [ + { + "post_id": p.post_id, + "platform": p.platform, + "primary_asset": p.primary_asset, + "sentiment_score": p.sentiment_score, + "sentiment_label": p.sentiment_label, + "posted_at": p.posted_at.isoformat() if p.posted_at else None, + } + for p in posts + ], + "analytics_records": [ + { + "id": r.id, + "record_type": r.record_type, + "asset": r.asset, + "metric_name": r.metric_name, + "window": r.window, + "value": r.value, + "timestamp": r.timestamp.isoformat() if r.timestamp else None, + } + for r in analytics + ], + } + + total = len(records["articles"]) + len(records["social_posts"]) + len(records["analytics_records"]) + data = self._envelope(records, "views") # type: ignore[arg-type] + data["count"] = total + path = self._write(data, "views") + logger.info("Exported views (%d total rows) → %s", total, path) + return ExportResult("views", str(path), total, "completed") + + def export_kpis(self) -> ExportResult: + """Export KPIs from AssetTrend rows within the ledger range.""" + with self.Session() as session: + rows = session.execute(select(AssetTrend)).scalars().all() + records = [ + { + "id": r.id, + "asset": r.asset, + "metric_name": r.metric_name, + "window": r.window, + "trend_direction": r.trend_direction, + "score": r.score, + "current_value": r.current_value, + "previous_value": r.previous_value, + 
"change_percentage": r.change_percentage, + "extra_data": r.extra_data, + "timestamp": r.timestamp.isoformat() if r.timestamp else None, + } + for r in rows + ] + + data = self._envelope(records, "kpis") + path = self._write(data, "kpis") + logger.info("Exported %d KPIs → %s", len(records), path) + return ExportResult("kpis", str(path), len(records), "completed") + + def run(self) -> List[ExportResult]: + """Run all three exports and return results.""" + results = [ + self.export_events(), + self.export_views(), + self.export_kpis(), + ] + return results diff --git a/temp_backup/src/scheduler.py b/temp_backup/src/scheduler.py new file mode 100644 index 00000000..1bb768d6 --- /dev/null +++ b/temp_backup/src/scheduler.py @@ -0,0 +1,285 @@ +""" +Job scheduler module - schedules and manages background jobs +""" + +from src.utils.logger import setup_logger +from src.utils.metrics import JOBS_RUN_TOTAL +from datetime import datetime +from apscheduler.schedulers.background import BackgroundScheduler +from apscheduler.triggers.interval import IntervalTrigger +from apscheduler.triggers.cron import CronTrigger +from apscheduler.job import Job + +from fetchers import NewsFetcher +from sentiment import SentimentAnalyzer +from trends import TrendCalculator +from database import DatabaseService, AnalyticsRecord +from anomaly_detector import AnomalyDetector, AnomalyResult +from alertbot import AlertBot +from src.ml.retraining_pipeline import run_retraining, get_last_run_status +from src.ingestion.run_ingestion_quality_checks import main as run_ingestion_quality_checks + + +logger = setup_logger(__name__) + + +class MarketAnalyzer: + """Main job that orchestrates the entire analysis pipeline""" + + def __init__(self): + self.fetcher = NewsFetcher() + self.sentiment_analyzer = SentimentAnalyzer() + self.trend_calculator = TrendCalculator() + self.db_service = DatabaseService() + self.anomaly_detector = AnomalyDetector(window_size_hours=24, z_threshold=2.5) + self.alert_bot = 
AlertBot() + + def run(self): + """ + Execute the full analysis pipeline: + 1. Fetch News + 2. Analyze Sentiment + 3. Calculate Trend + 4. Save to DB + """ + try: + logger.info("=" * 60) + logger.info("Starting MarketAnalyzer job") + logger.info(f"Timestamp: {datetime.utcnow().isoformat()}") + + # Step 1: Fetch News + logger.info("Step 1: Fetching news...") + news_items = self.fetcher.fetch_all_news() + + if not news_items: + logger.warning("No news items fetched") + return + + # Step 2: Analyze Sentiment + logger.info( + f"Step 2: Analyzing sentiment for {len(news_items)} articles..." + ) + news_texts = [f"{item.title} {item.content}" for item in news_items] + sentiment_results = self.sentiment_analyzer.analyze_batch(news_texts) + sentiment_summary = self.sentiment_analyzer.get_sentiment_summary( + sentiment_results + ) + + # Step 3: Calculate Trends + logger.info("Step 3: Calculating trends...") + trends = self.trend_calculator.calculate_all_trends(sentiment_summary) + trends_dict = [trend.to_dict() for trend in trends] + + # Step 4: Detect Anomalies + logger.info("Step 4: Detecting market anomalies...") + + # Get volume data (mock for demo - in real implementation, fetch actual volume) + current_volume = 1000.0 # This would come from Stellar fetcher + current_sentiment = sentiment_summary.get("average_compound_score", 0) + + # Detect anomalies + anomalies = self.anomaly_detector.detect_anomalies( + volume=current_volume, sentiment_score=current_sentiment + ) + + # Log anomaly results + anomaly_alerts = [] + for anomaly in anomalies: + if anomaly.is_anomaly: + logger.warning( + f"🚨 ANOMALY DETECTED: {anomaly.metric_name} " + f"(Severity: {anomaly.severity_score:.2f}, " + f"Z-Score: {anomaly.z_score:.2f})" + ) + anomaly_alerts.append(anomaly.to_dict()) + else: + logger.debug( + f"Normal {anomaly.metric_name} behavior " + f"(Z-Score: {anomaly.z_score:.2f})" + ) + + # Step 5: Save to Database + logger.info("Step 5: Saving analytics to database...") + + # Enhance 
record with anomaly data + enhanced_sentiment_data = sentiment_summary.copy() + enhanced_sentiment_data["anomalies_detected"] = len( + [a for a in anomalies if a.is_anomaly] + ) + enhanced_sentiment_data["anomaly_details"] = [ + a.to_dict() for a in anomalies + ] + + # Step 5.5: Check for high sentiment alerts + # Determine trend direction from calculated trends + trend_direction = "Unknown" + if trends: + primary_trend = trends[0] + trend_direction = getattr(primary_trend, "trend_direction", "Unknown") + + alert_sentiment_data = enhanced_sentiment_data.copy() + alert_sentiment_data["trend_direction"] = trend_direction + alert_sentiment_data["total_analyzed"] = len(news_items) + + self.alert_bot.check_and_alert( + analyzer_score=current_sentiment, + sentiment_data=alert_sentiment_data, + timestamp=datetime.utcnow(), + ) + + record = AnalyticsRecord( + timestamp=datetime.utcnow(), + news_count=len(news_items), + sentiment_data=enhanced_sentiment_data, + trends=trends_dict, + ) + + success = self.db_service.save_analytics(record) + + if success: + logger.info("✓ Analytics job completed successfully") + logger.info(f" - News items: {len(news_items)}") + logger.info( + f" - Average sentiment: {sentiment_summary.get('average_compound_score', 0):.4f}" + ) + logger.info( + f" - Positive: {sentiment_summary.get('sentiment_distribution', {}).get('positive', 0):.1%}" + ) + logger.info( + f" - Negative: {sentiment_summary.get('sentiment_distribution', {}).get('negative', 0):.1%}" + ) + logger.info(f" - Anomalies detected: {len(anomaly_alerts)}") + JOBS_RUN_TOTAL.inc() + else: + logger.error("✗ Failed to save analytics to database") + + logger.info("=" * 60) + except Exception as e: + logger.error(f"Error in MarketAnalyzer job: {e}", exc_info=True) + + +def _retraining_job() -> None: + """ + Scheduled retraining job wrapper. + Runs the full retraining pipeline and logs the outcome. + Errors are caught so a failed retrain never crashes the scheduler. 
class AnalyticsScheduler:
    """Manage the APScheduler ``BackgroundScheduler`` that runs analytics jobs.

    ``start`` registers three jobs: the hourly market analysis (or an
    injected pipeline function), the hourly Stellar ingestion quality checks
    and the daily model retraining at 02:00 UTC.
    """

    def __init__(self, pipeline_fn=None):
        """Create the scheduler and the default ``MarketAnalyzer``.

        Args:
            pipeline_fn: Optional zero-argument callable that replaces
                ``MarketAnalyzer.run`` as the hourly job (used by main.py).
        """
        self.scheduler = BackgroundScheduler()
        self.analyzer = MarketAnalyzer()
        # Allow injecting a custom pipeline function (used by main.py)
        self._pipeline_fn = pipeline_fn

    def start(self):
        """Register all jobs and start the scheduler.

        Raises:
            Exception: Any error raised while registering jobs or starting
                the scheduler is logged and then re-raised.
        """
        try:
            # ── Market Analyzer: every hour ──────────────────────────────
            run_fn = self._pipeline_fn if self._pipeline_fn else self.analyzer.run
            market_job = self.scheduler.add_job(
                func=run_fn,
                trigger=IntervalTrigger(hours=1),
                id="market_analyzer_hourly",
                name="Market Analyzer - Hourly Analytics",
                replace_existing=True,
            )

            # ── Stellar ingestion quality checks: every hour ──────────
            # Low-noise: only fails CI/process when ingestion lag is critical.
            # The wrapper is a module-level function, not a method of this
            # class, so it must not be looked up on ``self``.
            quality_job = self.scheduler.add_job(
                func=_ingestion_quality_checks_job,
                trigger=IntervalTrigger(hours=1),
                id="stellar_ingestion_quality_checks_hourly",
                name="Stellar Ingestion Quality Checks - Hourly",
                replace_existing=True,
            )

            # ── Model Retraining: daily at 02:00 UTC ─────────────────────
            retrain_job = self.scheduler.add_job(
                func=_retraining_job,
                trigger=CronTrigger(hour=2, minute=0, timezone="UTC"),
                id="model_retraining_daily",
                name="Automated Model Retraining - Daily",
                replace_existing=True,
            )

            self.scheduler.start()
            logger.info("✓ Analytics scheduler started")
            for job in (market_job, quality_job, retrain_job):
                logger.info(f"  - Job: {job.name} | Next: {job.next_run_time}")
        except Exception as e:
            logger.error(f"Error starting scheduler: {e}")
            raise

    def run_immediately(self):
        """Run the hourly analysis job synchronously (useful for testing)."""
        logger.info("Running MarketAnalyzer immediately...")
        if self._pipeline_fn:
            self._pipeline_fn()
        else:
            self.analyzer.run()

    def trigger_retraining(self, force: bool = False) -> dict:
        """Manually trigger a retraining run (e.g. from the API).

        Args:
            force: Forwarded unchanged to ``run_retraining``.

        Returns:
            Whatever ``run_retraining`` returns.
        """
        logger.info(f"Manual retraining triggered (force={force})")
        return run_retraining(force=force)

    def stop(self):
        """Shut the scheduler down, waiting for running jobs to finish.

        Errors during shutdown are logged and not re-raised.
        """
        try:
            self.scheduler.shutdown(wait=True)
            logger.info("✓ Analytics scheduler stopped")
        except Exception as e:
            logger.error(f"Error stopping scheduler: {e}")

    def get_jobs(self) -> list:
        """Return the jobs currently known to the scheduler."""
        return self.scheduler.get_jobs()

    def get_job_status(self, job_id: str) -> "dict | None":
        """Describe one scheduled job.

        Args:
            job_id: Identifier the job was registered under.

        Returns:
            A dict with ``id``, ``name``, ``next_run_time`` and ``trigger``
            (the last two as strings), or ``None`` when no job has that id.
        """
        job = self.scheduler.get_job(job_id)
        if job is None:
            return None
        return {
            "id": job.id,
            "name": job.name,
            "next_run_time": str(job.next_run_time),
            "trigger": str(job.trigger),
        }

    def get_retraining_status(self) -> dict:
        """Return the last retraining run metadata."""
        return get_last_run_status()
class SecurityConfig:
    """Security configuration manager.

    Reads the API key and the rate-limit settings from environment variables
    and exposes helpers to validate requests and to build slowapi limiters.
    """

    def __init__(self):
        """Load settings from the environment and validate the limit strings.

        Raises:
            ValueError: If ``RATE_LIMIT_DEFAULT`` or ``RATE_LIMIT_STRICT`` is
                not of the form ``N/second|minute|hour|day``.
        """
        self.api_key = os.getenv("API_KEY", "")
        self.rate_limit_enabled = os.getenv("RATE_LIMIT_ENABLED", "true").lower() == "true"
        self.rate_limit_default = os.getenv("RATE_LIMIT_DEFAULT", "100/minute")
        self.rate_limit_strict = os.getenv("RATE_LIMIT_STRICT", "10/minute")

        # Fail fast on malformed limits instead of at first request.
        self._validate_rate_limit(self.rate_limit_default)
        self._validate_rate_limit(self.rate_limit_strict)

    def _validate_rate_limit(self, limit_string: str) -> None:
        """Validate rate limit string format (e.g., '100/minute').

        Raises:
            ValueError: If ``limit_string`` does not match the format.
        """
        # fullmatch (rather than match with ``$``) also rejects a value that
        # ends with a newline.
        pattern = r'\d+/(second|minute|hour|day)'
        if not re.fullmatch(pattern, limit_string):
            raise ValueError(
                f"Invalid rate limit format: {limit_string}. "
                "Expected format: 'N/second', 'N/minute', 'N/hour', or 'N/day'"
            )

    def _build_limiter(self, limit_string: str) -> Optional[Limiter]:
        """Build a limiter with ``limit_string`` as its only default limit.

        Returns ``None`` when rate limiting is disabled.
        """
        if not self.rate_limit_enabled:
            return None
        return Limiter(
            key_func=get_remote_address,
            default_limits=[limit_string],
            storage_uri="memory://",  # In-memory storage (use redis:// for production)
        )

    @property
    def limiter(self) -> Optional[Limiter]:
        """Create and configure the rate limiter.

        A new ``Limiter`` is constructed on every access, so callers that
        need one shared limiter should keep the returned instance.
        """
        return self._build_limiter(self.rate_limit_default)

    def validate_api_key(self, request: Request) -> bool:
        """
        Validate API key from request headers.

        Args:
            request: FastAPI request object

        Returns:
            True if API key is valid

        Raises:
            HTTPException: 503 if no API key is configured, 401 if the
                ``X-API-Key`` header is missing, 403 if it does not match.
        """
        import hmac  # local import: only needed for the comparison below

        if not self.api_key:
            raise HTTPException(
                status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
                detail="API is not configured: API_KEY environment variable is missing.",
            )

        api_key_header = request.headers.get("X-API-Key")

        if not api_key_header:
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="Missing API key. Please provide X-API-Key header.",
                headers={"WWW-Authenticate": "ApiKey"},
            )

        # Constant-time comparison so response timing does not reveal how
        # many leading characters of the key were correct. Bytes are used
        # because compare_digest rejects non-ASCII str arguments.
        supplied = api_key_header.encode("utf-8")
        expected = self.api_key.encode("utf-8")
        if not hmac.compare_digest(supplied, expected):
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN,
                detail="Invalid API key",
                headers={"WWW-Authenticate": "ApiKey"},
            )

        return True

    def get_limiter_for_endpoint(self, endpoint_type: str = "default") -> Optional[Limiter]:
        """
        Get a limiter configured for a specific endpoint type.

        Args:
            endpoint_type: Type of endpoint ('default' or 'strict'); any
                value other than 'strict' selects the default limit.

        Returns:
            Configured Limiter instance or None if rate limiting is disabled
        """
        limit_string = (
            self.rate_limit_strict
            if endpoint_type == "strict"
            else self.rate_limit_default
        )
        return self._build_limiter(limit_string)
def setup_rate_limiter(app, limiter: Limiter) -> None:
    """
    Setup rate limiting for a FastAPI application.

    Stores ``limiter`` on ``app.state`` and registers a JSON handler for
    ``RateLimitExceeded``. slowapi's stock handler is deliberately not
    registered as well: both would target the same exception class, and the
    custom handler registered here is the one that is meant to respond.

    Args:
        app: FastAPI application instance
        limiter: Slowapi Limiter instance
    """
    app.state.limiter = limiter

    @app.exception_handler(RateLimitExceeded)
    async def rate_limit_handler(request: Request, exc: RateLimitExceeded) -> JSONResponse:
        """Custom rate limit exceeded handler."""
        return JSONResponse(
            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
            content={
                "detail": "Rate limit exceeded",
                "message": "Too many requests. Please try again later.",
                # NOTE(review): exc.detail is forwarded verbatim; confirm it
                # really carries a retry interval and not the limit text.
                "retry_after": str(exc.detail),
            },
        )
# Per-process cache of the extractor/analyzer pair. Each worker process gets
# its own copy of this module-level dict, so nothing is shared or pickled
# across process boundaries.
_WORKER_RESOURCES: dict = {}


def _get_worker_resources():
    """Return this process's (KeywordExtractor, VADER analyzer), built once."""
    if not _WORKER_RESOURCES:
        _WORKER_RESOURCES["extractor"] = KeywordExtractor()
        _WORKER_RESOURCES["analyzer"] = SentimentIntensityAnalyzer()
    return _WORKER_RESOURCES["extractor"], _WORKER_RESOURCES["analyzer"]


def _analyze_in_worker(args: Tuple[str, Optional[str]]) -> dict:
    """Process-safe sentiment analysis for a single text.

    Each worker process initialises its own VADER analyzer and
    KeywordExtractor once and reuses them for every text it handles, instead
    of rebuilding both per call. Redis cache is intentionally skipped in
    workers to avoid per-process connections.

    Args:
        args: ``(text, asset_filter)``; ``asset_filter`` may be ``None``.

    Returns:
        A dict with the same keys as ``SentimentResult``. When
        ``asset_filter`` is given and the text does not mention it, a neutral
        result with an empty ``asset_codes`` list is returned.
    """
    text, asset_filter = args

    extractor, analyzer = _get_worker_resources()
    asset_codes = extractor.extract_tickers_only(text)

    if asset_filter:
        asset_filter = asset_filter.upper()
        if asset_filter not in asset_codes:
            return {
                "text": text[:100],
                "compound_score": 0.0,
                "positive": 0.0,
                "negative": 0.0,
                "neutral": 1.0,
                "sentiment_label": "neutral",
                "asset_codes": [],
            }

    scores = analyzer.polarity_scores(text)
    compound = scores["compound"]

    # Same +/-0.05 cut-offs as SentimentAnalyzer.analyze.
    if compound >= 0.05:
        label = "positive"
    elif compound <= -0.05:
        label = "negative"
    else:
        label = "neutral"

    return {
        "text": text[:100],
        "compound_score": compound,
        "positive": scores["pos"],
        "negative": scores["neg"],
        "neutral": scores["neu"],
        "sentiment_label": label,
        "asset_codes": asset_codes,
    }
except Exception as e: + logger.warning("Redis unavailable - sentiment caching disabled: %s", e) + else: + logger.info("Sentiment cache ready") + + def analyze(self, text: str, asset_filter: Optional[str] = None) -> SentimentResult: + """ + Analyze sentiment of a single text + + Args: + text: Text to analyze + asset_filter: Optional asset code to filter results (e.g., 'XLM', 'USDC') + + Returns: + SentimentResult object + """ + # Extract asset codes from text + asset_codes = self.keyword_extractor.extract_tickers_only(text) + + # If asset_filter is specified, check if text mentions that asset + if asset_filter: + asset_filter = asset_filter.upper() + if asset_filter not in asset_codes: + # Return neutral result if asset not mentioned + return SentimentResult( + text=text[:100], + compound_score=0.0, + positive=0.0, + negative=0.0, + neutral=1.0, + sentiment_label="neutral", + asset_codes=[], + ) + + cache_key = f"{text}:{asset_filter}" if asset_filter else text + if self.cache: + cached = self.cache.get(cache_key) + if cached: + return SentimentResult(**cached) + + scores = self.analyzer.polarity_scores(text) + compound = scores["compound"] + if compound >= 0.05: + label = "positive" + elif compound <= -0.05: + label = "negative" + else: + label = "neutral" + + result = SentimentResult( + text=text[:100], + compound_score=compound, + positive=scores["pos"], + negative=scores["neg"], + neutral=scores["neu"], + sentiment_label=label, + asset_codes=asset_codes, + ) + + if self.cache: + self.cache.set(cache_key, result.to_dict()) + + return result + + def analyze_batch(self, texts: List[str], asset_filter: Optional[str] = None) -> List[SentimentResult]: + """ + Analyze sentiment of multiple texts + + Args: + texts: List of texts to analyze + asset_filter: Optional asset code to filter results (e.g., 'XLM', 'USDC') + + Returns: + List of SentimentResult objects + """ + results = [self.analyze(t, asset_filter) for t in texts] + logger.info("Analyzed %d texts for 
sentiment", len(results)) + if asset_filter: + logger.info("Filtered for asset: %s", asset_filter) + return results + + def analyze_batch_parallel( + self, + texts: List[str], + asset_filter: Optional[str] = None, + max_workers: Optional[int] = None, + ) -> List[SentimentResult]: + """Analyze sentiment using ProcessPoolExecutor for large batches. + + Falls back to sequential processing when the batch is smaller than + ``_PARALLEL_THRESHOLD`` or when running inside a child process. + + Args: + texts: List of texts to analyze. + asset_filter: Optional asset code to filter results. + max_workers: Max worker processes (defaults to CPU count). + + Returns: + List of SentimentResult objects. + """ + if not texts: + return [] + + # Fall back to sequential for small batches (overhead > benefit). + if len(texts) < _PARALLEL_THRESHOLD: + return self.analyze_batch(texts, asset_filter) + + if max_workers is None: + max_workers = min(os.cpu_count() or 2, 8) + + args = [(text, asset_filter) for text in texts] + + results: List[SentimentResult] = [] + try: + with ProcessPoolExecutor(max_workers=max_workers) as pool: + for result_dict in pool.map(_analyze_in_worker, args): + results.append(SentimentResult(**result_dict)) + except Exception: + logger.warning( + "ProcessPoolExecutor failed, falling back to sequential", + exc_info=True, + ) + return self.analyze_batch(texts, asset_filter) + + logger.info( + "Analyzed %d texts in parallel (%d workers)", len(results), max_workers + ) + return results + + def get_sentiment_summary(self, results: List[SentimentResult]) -> Dict[str, Any]: + """ + Get summary statistics from sentiment analysis results + + Args: + results: List of SentimentResult objects + + Returns: + Summary statistics + """ + if not results: + return { + "total_items": 0, + "average_compound_score": 0, + "positive_count": 0, + "negative_count": 0, + "neutral_count": 0, + "sentiment_distribution": {"positive": 0, "negative": 0, "neutral": 0}, + "asset_distribution": {}, + 
@dataclass
class Trend:
    """One computed trend for a single metric."""

    metric_name: str
    current_value: float
    previous_value: float
    change_percentage: float
    trend_direction: str  # 'up', 'down', 'stable'
    timestamp: datetime

    def to_dict(self) -> Dict[str, Any]:
        """Return the trend as a plain dict with an ISO-8601 timestamp."""
        plain_fields = (
            "metric_name",
            "current_value",
            "previous_value",
            "change_percentage",
            "trend_direction",
        )
        payload: Dict[str, Any] = {name: getattr(self, name) for name in plain_fields}
        # datetime is stored as its ISO string so the dict stays serialisable.
        payload["timestamp"] = self.timestamp.isoformat()
        return payload
market data""" + + def __init__(self): + self.trend_history: Dict[str, Any] = {} + self.cache: object | None = None + try: + from cache_manager import CacheManager + except ImportError: + logger.info("CacheManager unavailable - trends caching disabled") + else: + try: + self.cache = CacheManager(namespace="trends") + except Exception as e: + logger.warning("Redis unavailable - trends caching disabled: %s", e) + else: + logger.info("Trends cache ready") + + @staticmethod + def _summary_cache_key(sentiment_summary: Dict[str, Any]) -> str: + """Deterministic key from a sentiment summary dict.""" + return json.dumps(sentiment_summary, sort_keys=True, default=str) + + def _compute_trend( + self, + metric_name: str, + current_value: float, + ) -> Trend: + previous_value = self.trend_history.get(metric_name, {}).get( + "value", current_value + ) + + # Calculate change + if previous_value != 0: + change_pct = ((current_value - previous_value) / abs(previous_value)) * 100 + else: + change_pct = 0.0 + + # Determine trend direction + if change_pct > 2: + direction = "up" + elif change_pct < -2: + direction = "down" + else: + direction = "stable" + + # Update trend history + self.trend_history[metric_name] = { + "value": current_value, + "timestamp": datetime.now(timezone.utc), + } + + trend = Trend( + metric_name=metric_name, + current_value=round(current_value, 4), + previous_value=round(previous_value, 4), + change_percentage=round(change_pct, 2), + trend_direction=direction, + timestamp=datetime.now(timezone.utc), + ) + logger.info("%s trend: %s (%.2f%%)", metric_name, direction, change_pct) + return trend + + def calculate_sentiment_trend(self, sentiment_summary: Dict[str, Any]) -> Trend: + current = sentiment_summary.get("average_compound_score", 0) + return self._compute_trend("sentiment_score", current) + + def calculate_positive_sentiment_trend( + self, sentiment_summary: Dict[str, Any] + ) -> Trend: + current = sentiment_summary.get("sentiment_distribution", 
{}).get("positive", 0) + return self._compute_trend("positive_sentiment_percentage", current) + + def calculate_negative_sentiment_trend( + self, sentiment_summary: Dict[str, Any] + ) -> Trend: + current = sentiment_summary.get("sentiment_distribution", {}).get("negative", 0) + return self._compute_trend("negative_sentiment_percentage", current) + + def calculate_all_trends(self, sentiment_summary: Dict[str, Any]) -> List[Trend]: + """ + Calculate all trends + + Args: + sentiment_summary: Summary from SentimentAnalyzer + + Returns: + List of Trend objects + """ + cache_key = self._summary_cache_key(sentiment_summary) + + # Check cache for cached results + if self.cache: + cached = self.cache.get(cache_key) + if cached: + return [ + Trend( + metric_name=t["metric_name"], + current_value=t["current_value"], + previous_value=t["previous_value"], + change_percentage=t["change_percentage"], + trend_direction=t["trend_direction"], + timestamp=datetime.fromisoformat(t["timestamp"]), + ) + for t in cached + ] + + trends = [ + self.calculate_sentiment_trend(sentiment_summary), + self.calculate_positive_sentiment_trend(sentiment_summary), + self.calculate_negative_sentiment_trend(sentiment_summary), + ] + + if self.cache: + self.cache.set(cache_key, [t.to_dict() for t in trends]) + + logger.info("Calculated %d trends", len(trends)) + return trends diff --git a/temp_backup/src/utils/__pycache__/logger.cpython-314.pyc b/temp_backup/src/utils/__pycache__/logger.cpython-314.pyc new file mode 100644 index 0000000000000000000000000000000000000000..89e9cc4259b28de511215182cb9a5155e0813062 GIT binary patch literal 2989 zcma)8Uu+ab7@ysp+g|Tq|Ip(A+p^`LaG))b29O|XD3xMu%cAGY(an0dZ4bA5=gb^| z#)LB=DNz%7@S!9oy!b-mn>=V@qOT?<0&UVVU_v4F5CK#FjUbLGL=!p{83?woU2TQQ61GzV}^I4 z^U9>O7n}@+K4fYpHbX~Le}U;q1Z#Q4b( z@hvAC7mXH1iY{}={Qgnzf|F;|X!61s9X5&{D;aFoF~CcMIIvsD2e?TGgbe6}*N=iw zMLD@By2${u`T__;!6d7AbsHuEr=M^z97Zr4%cvYv$DQW7?*tVF z`uq9#$z!?qsY7UP_|({#ab)P=!Q8MzKVqeFZmcrrc;$*qot#}R(_F!3_A6yl$~%-6 
class RobustHTTPClient(requests.Session):
    """
    A robust HTTP client extending requests.Session.
    Features:
    - Exponential backoff with jitter for transient errors (500, 502, 503, 504, 429) and connection issues.
    - Circuit Breaker pattern to protect downstream services and fail fast.
    """

    def __init__(
        self,
        max_retries: int = 4,
        backoff_factor: float = 1.5,
        status_forcelist: Optional[set[int]] = None,
        failure_threshold: int = 5,
        recovery_timeout: float = 30.0,
    ):
        """Configure retry and circuit-breaker behaviour.

        Args:
            max_retries: Retries allowed after the first attempt.
            backoff_factor: Base delay in seconds; doubled on each retry.
            status_forcelist: Status codes treated as transient. ``None``
                selects the default set; an explicitly empty set disables
                status-based retries.
            failure_threshold: Failed requests (after retries) that open the
                circuit.
            recovery_timeout: Seconds the circuit stays OPEN before one trial
                request is allowed.
        """
        super().__init__()
        self.max_retries = max_retries
        self.backoff_factor = backoff_factor
        # Compare against None so a caller-supplied empty set is respected.
        self.status_forcelist = (
            {429, 500, 502, 503, 504} if status_forcelist is None else status_forcelist
        )

        # Circuit Breaker state
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
        self.failure_count = 0
        self.state = "CLOSED"  # CLOSED, OPEN, HALF-OPEN
        self.last_state_change = time.time()
        # Monotonic clock reading taken when the circuit last opened; only
        # read while the state is OPEN.
        self._circuit_opened_at = 0.0

    def _check_circuit(self) -> None:
        """Check and update circuit breaker state.

        Raises:
            CircuitBreakerOpenException: If the circuit is OPEN and the
                recovery timeout has not elapsed yet.
        """
        if self.state != "OPEN":
            return

        # Elapsed time uses the monotonic clock so wall-clock adjustments
        # cannot shorten or extend the open period.
        elapsed = time.monotonic() - self._circuit_opened_at
        if elapsed > self.recovery_timeout:
            self.state = "HALF-OPEN"
            self.last_state_change = time.time()
            logger.warning(
                "Circuit breaker transitioning to HALF-OPEN. Allowing trial request."
            )
        else:
            raise CircuitBreakerOpenException(
                f"Circuit breaker is OPEN. Fast-failing request. Time remaining: {self.recovery_timeout - elapsed:.1f}s"
            )

    def _record_success(self) -> None:
        """Record a successful request and reset breaker state if needed."""
        if self.state == "HALF-OPEN":
            logger.info(
                "Trial request succeeded. Circuit breaker transitioning to CLOSED."
            )
        self.failure_count = 0
        self.state = "CLOSED"
        self.last_state_change = time.time()

    def _record_failure(self) -> None:
        """Record a failed request and trip breaker if threshold exceeded."""
        self.failure_count += 1
        if self.state == "HALF-OPEN" or self.failure_count >= self.failure_threshold:
            self.state = "OPEN"
            self._circuit_opened_at = time.monotonic()
            self.last_state_change = time.time()
            logger.error(
                f"Circuit breaker tripped to OPEN. Failure count: {self.failure_count}. "
                f"Will reject requests for next {self.recovery_timeout} seconds."
            )

    def request(self, method: str, url: str, **kwargs) -> requests.Response:
        """
        Sends an HTTP request with retry logic and circuit breaker protection.

        Args:
            method: HTTP method.
            url: Target URL.
            **kwargs: Passed to ``requests.Session.request``; ``timeout``
                defaults to 10 seconds when not supplied.

        Returns:
            The response, for any status code outside ``status_forcelist``.

        Raises:
            CircuitBreakerOpenException: If the circuit is OPEN.
            requests.exceptions.ConnectionError, requests.exceptions.Timeout,
            requests.exceptions.HTTPError: The last error, once
                ``max_retries`` retries have been used up.
        """
        self._check_circuit()

        # Respect any custom timeout or set a default of 10s
        kwargs.setdefault("timeout", 10.0)

        retries = 0
        while True:
            try:
                response = super().request(method, url, **kwargs)

                # Check if the status code is a transient error that warrants a retry
                if response.status_code in self.status_forcelist:
                    raise requests.exceptions.HTTPError(
                        f"Transient status {response.status_code}", response=response
                    )

                # If we get here, it's a successful response (or non-retryable error like 400/404)
                self._record_success()
                return response

            except (
                requests.exceptions.ConnectionError,
                requests.exceptions.Timeout,
                requests.exceptions.HTTPError,
            ) as e:

                # Check if the error response is non-retryable (e.g. 400 Bad Request)
                if isinstance(e, requests.exceptions.HTTPError):
                    # An HTTPError may carry no response; treat that case as
                    # retryable instead of failing on the attribute access.
                    failed_response = e.response
                    if (
                        failed_response is not None
                        and failed_response.status_code not in self.status_forcelist
                    ):
                        # Non-retryable HTTP error; record success (meaning server responded normally) and raise
                        self._record_success()
                        raise

                retries += 1
                if retries > self.max_retries:
                    logger.error(
                        f"Max retries ({self.max_retries}) exceeded for {url}. Failure details: {str(e)}"
                    )
                    self._record_failure()
                    raise

                # Calculate exponential backoff with jitter
                sleep_time = self.backoff_factor * (2 ** (retries - 1))
                sleep_time += random.uniform(0, 0.5)  # Add jitter

                logger.warning(
                    f"Request to {url} failed: {str(e)}. "
                    f"Retrying in {sleep_time:.2f}s... (Attempt {retries}/{self.max_retries})"
                )
                time.sleep(sleep_time)
def start_metrics_server(port: int = 9090):
    """Start the standalone Prometheus HTTP endpoint (for background workers).

    Best-effort: any failure to start is logged as a warning and swallowed,
    e.g. when a server is already running.

    Args:
        port: TCP port passed to ``start_http_server``.
    """
    import logging

    try:
        start_http_server(port)
    except Exception as exc:
        # Ignore if server is already running
        logging.getLogger(__name__).warning("Metrics server could not start: %s", exc)
def translate_to_english(text: str, source_lang: str = "auto") -> str:
    """
    Translate ``text`` to English via Google's public translation endpoint.

    Best-effort: empty or whitespace-only input is returned unchanged without
    a request, and any request or parsing failure is logged and the original
    text is returned.

    Args:
        text: Text to translate.
        source_lang: Source language code sent as ``sl`` ("auto" by default).

    Returns:
        The concatenated translated chunks, or ``text`` itself as fallback.
    """
    if not text or not text.strip():
        return text

    endpoint = "https://translate.googleapis.com/translate_a/single"
    query = {"client": "gtx", "sl": source_lang, "tl": "en", "dt": "t", "q": text}

    try:
        reply = _client.get(endpoint, params=query, timeout=5)
        reply.raise_for_status()
        payload = reply.json()

        # The first element of the payload is read as a list of chunks whose
        # first item is the translated text.
        if payload and len(payload) > 0 and payload[0]:
            pieces = [
                chunk[0]
                for chunk in payload[0]
                if chunk and len(chunk) > 0 and chunk[0]
            ]
            if pieces:
                return "".join(pieces)

    except Exception as e:
        logger.warning(f"Translation failed, falling back to original text. Error: {e}")

    return text
class NewsArticle(BaseModel):
    """Schema for one ingested news article."""

    id: str
    title: str
    content: str
    published_at: str  # ISO8601 string
    # Explicit None defaults, matching OnChainMetric.extra, so a record
    # without these keys is accepted rather than depending on how the
    # installed pydantic version treats a bare Optional annotation.
    source: Optional[str] = None
    url: Optional[str] = None

    @validator("published_at")
    def validate_published_at(cls, v):
        """Reject an empty or non-string ``published_at``.

        The ISO8601 format itself is not checked here.
        """
        # Optionally, add stricter ISO8601 validation here
        if not v or not isinstance(v, str):
            raise ValueError("published_at must be a non-empty string")
        return v