From efe153c28036997e0bcfce09394a804e0df74092 Mon Sep 17 00:00:00 2001 From: Yvette-0508 Date: Thu, 4 Dec 2025 21:36:34 -0800 Subject: [PATCH] feat: Add LlamaParse + Supabase integration for document extraction - Integrated LlamaParse for multi-modal document parsing (PDF, DOCX, XLSX, HTML, images) - Added Supabase cloud storage for extraction results - Created universal test script (test_extractor.py) with --store flag - Support for audio transcription via OpenAI Whisper - Chunking with table and formula extraction from markdown --- .claude/settings.local.json | 17 + .env.example | 88 +- .gitignore | 1 + requirements.txt | 4 +- .../__pycache__/parser.cpython-312.pyc | Bin 26291 -> 21215 bytes .../__pycache__/parser.cpython-39.pyc | Bin 16662 -> 17145 bytes src/extraction/parser.py | 937 +++++++----------- .../__pycache__/document.cpython-312.pyc | Bin 16144 -> 16144 bytes .../__pycache__/document.cpython-39.pyc | Bin 11405 -> 11405 bytes src/models/__pycache__/query.cpython-312.pyc | Bin 13313 -> 13313 bytes tests/__init__.py | 2 + tests/conftest.py | 12 + tests/test_extractor.py | 377 +++++++ tests/test_models.py | 583 +++++++++++ tests/test_summarizer.py | 640 ++++++++++++ tests/test_tagger.py | 896 +++++++++++++++++ 16 files changed, 2881 insertions(+), 676 deletions(-) create mode 100644 .claude/settings.local.json create mode 100644 tests/__init__.py create mode 100644 tests/conftest.py create mode 100644 tests/test_extractor.py create mode 100644 tests/test_models.py create mode 100644 tests/test_summarizer.py create mode 100644 tests/test_tagger.py diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 0000000..e80c482 --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,17 @@ +{ + "permissions": { + "allow": [ + "Bash(pytest:*)", + "Bash(pip install:*)", + "Bash(python -m pytest tests/test_parser.py -v)", + "Bash(ls:*)", + "Bash(source:*)", + "Bash(.venv/bin/python -m pip install:*)", + 
"Bash(.venv/bin/python -m pytest tests/test_parser.py -v)", + "Bash(grep:*)", + "Bash(python3:*)", + "Bash(python:*)" + ], + "deny": [] + } +} \ No newline at end of file diff --git a/.env.example b/.env.example index 3c467a4..c771547 100644 --- a/.env.example +++ b/.env.example @@ -1,82 +1,12 @@ -# QuantMind RAG System - Environment Configuration -# Copy this file to .env and update with your actual values +# QuantMind RAG Environment Variables +# Copy this file to .env and fill in your values -# ============================================================================= -# API SERVER SETTINGS -# ============================================================================= -QUANTMIND_HOST=127.0.0.1 -QUANTMIND_PORT=8000 -QUANTMIND_DEBUG=false +# LlamaParse API Key (required for document parsing) +# Get your key at: https://cloud.llamaindex.ai +LLAMA_CLOUD_API_KEY=your-api-key-here -# ============================================================================= -# LLM API KEYS (Required for production use) -# ============================================================================= -# OpenAI API Key (for GPT models) -OPENAI_API_KEY=your_openai_api_key_here +# Optional: OpenAI API Key (for embeddings/LLM) +OPENAI_API_KEY= -# Anthropic API Key (for Claude models) -# ANTHROPIC_API_KEY=your_anthropic_api_key_here - -# ============================================================================= -# LLM PROVIDER SETTINGS -# ============================================================================= -# Options: "openai", "anthropic", "mock" -QUANTMIND_LLM_PROVIDER=openai - -# OpenAI models: gpt-4o, gpt-4o-mini, gpt-4-turbo -# Anthropic models: claude-3-5-sonnet-20241022, claude-3-opus-20240229 -QUANTMIND_LLM_MODEL=gpt-4o-mini - -# ============================================================================= -# VECTOR STORE SETTINGS -# ============================================================================= -# Options: "memory", "chroma", "qdrant" 
-QUANTMIND_VECTOR_STORE=memory - -# Directory for persistent storage (optional, only for chroma/qdrant) -# QUANTMIND_PERSIST_DIR=./data/vectorstore - -# ============================================================================= -# EMBEDDING MODEL SETTINGS -# ============================================================================= -# Sentence Transformer model for embeddings -QUANTMIND_EMBEDDING_MODEL=all-MiniLM-L6-v2 - -# ============================================================================= -# RETRIEVAL SETTINGS -# ============================================================================= -# Maximum number of chunks to retrieve -QUANTMIND_MAX_CHUNKS=10 - -# Maximum tokens for context -QUANTMIND_MAX_TOKENS=4000 - -# Enable reranking for better results -QUANTMIND_RERANKING=true - -# ============================================================================= -# VERIFICATION SETTINGS -# ============================================================================= -# Enable hallucination verification -QUANTMIND_VERIFICATION=true - -# Minimum source tier (1-5, lower is more authoritative) -QUANTMIND_MIN_TIER=3 - -# ============================================================================= -# SUMMARIZATION SETTINGS -# ============================================================================= -# Enable adaptive summarization -QUANTMIND_SUMMARIZATION=true - -# ============================================================================= -# QDRANT SETTINGS (Optional - if using Qdrant) -# ============================================================================= -# Qdrant URL (for remote Qdrant server) -# QDRANT_URL=http://localhost:6333 - -# Qdrant API Key (for Qdrant Cloud) -# QDRANT_API_KEY=your_qdrant_api_key_here - -# Qdrant Collection Name -# QDRANT_COLLECTION=quantmind +# Optional: Anthropic API Key +ANTHROPIC_API_KEY= diff --git a/.gitignore b/.gitignore index 19c4bb5..f03e142 100644 --- a/.gitignore +++ b/.gitignore @@ -69,3 +69,4 @@ 
htmlcov/ temp/ tmp/ + diff --git a/requirements.txt b/requirements.txt index 52975ec..80868f5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,8 @@ uvicorn>=0.24.0 pydantic>=2.5.0 # Document Parsing -PyMuPDF>=1.23.0 # PDF parsing (fitz) +llama-parse>=0.5.0 # LlamaParse - multi-modal document parsing (recommended) +PyMuPDF>=1.23.0 # PDF parsing fallback (fitz) beautifulsoup4>=4.12.0 # HTML parsing trafilatura>=1.6.0 # Web content extraction pandas>=2.0.0 # CSV/Excel handling @@ -15,6 +16,7 @@ openpyxl>=3.1.0 # Excel support # Embeddings & Vector Store sentence-transformers>=2.2.0 chromadb>=0.4.0 +supabase>=2.0.0 # Supabase for cloud storage # LLM APIs (optional - install based on provider) openai>=1.3.0 # OpenAI API diff --git a/src/extraction/__pycache__/parser.cpython-312.pyc b/src/extraction/__pycache__/parser.cpython-312.pyc index 2c614398207e9f9f94c67e74c909d9f5d343bc65..0cc10d606a5eec680d80aeea5d0d463fff3ba4e4 100644 GIT binary patch literal 21215 zcmc(HYjhjenb-{806`KYKoERtNKqm|k)YlWijqw6AxfgC2PsRYWg`$~M1cYbY6g@@ z1WaAm?Lx6OBIA=KRXay??X!krwV~3qR(f_D*-m1m?dbso+JGlYH|13KA;^l_XXTsz|IJR70%vYJA#3txq?o^XUimKEt4a#`Y?2o-c1O zkEGRJqc49jpQJV30$<@^Ax$YLuJG;D&`_dfbzYONXt2mvJXq{A51M_JL5r_su*7E_ zw36DYVTU~BOw8Hdrlq&RG6i*jV)ZXktl@1%DzE%J#pb<14_1&eBb4Q*%PJ=n_5#;N zXl^(%<_&n7kN8=ampSI*1&(LV`FZB}m}?|(#52M&C)y7%{S!if^D$$BXJm-!^}2j6 zQq^HF>~{$q%lJn!4eABv@QDAimt%)GrfWRFyW9Z}ROk=5hPYP4v4&1NvxoWmKois5 z#2jg2`X`xYW`G+HG%-(mgfW-5Wx#dT%L!2E3Y>@X06*pqjPabo&_6ah>gNMOD|4(9 z8h0M;INih?J2n8}>E3<_JNln)VtNLS^foaEeJ~t!;(|}``FYM^IKiFccy7eawKC@e zfl;BgrG@1#abEu@#|sWGeieK#H|}tGT1F=V=lvrsZm)lgZ6QUYxHc_@PjPko>?%>! 
z>k$H??&v6foXaaJ+eaouRj0=tu+yTBbp^P9$HzfhbqqcO;ykb4#X4DkfE&5=DQ=Qo zBkDT+?lB)X5)joL=f_4ah-&ifqPE-5`^LO3(I{6Km>A_m!!h1}i5qdjH@Kx;mO@?X zW$6cyn4vfdz?VYM9aQQm{KpovN>0V8%b=W9rBiB7eNMw_UQ!QgI4!GviIRbGP{-;x zJ*(#o=TxlWCDmXaYi9E}Bjn_B2J8j$jnJ-u&F9pRt4ik&OW1^PJDcH1K01OyImnVET{uG9G}%)M(IBIDgjf;#qrZrftWGuJ(a0W}tn4 zZ&%jPPI>4CQWdA?VVRRB4|X!gP8>Yae&QMCaMv?TXIFRo$=(5Gh#PV80DXR+b8O7R zHrSie&DhkgJE8qR*Xe;wNx&2Eax3x#{}}J)oMXIqMM1io0075|G9lm!j0r0;dDms9 z+dl#j7+6t~%5{3Vk)gnO=3rmeoZJYR0yG?hW;sfjL;Xkl_Gf(qIlj5-17bm_&e>OL zi#gWmg2^2`($zoEe&pD|V0u0Kj-JY1LmwA#!6|b|qm!kz@6PODb2HO5f)im*_%AcP zc+Q!A&OOF^0uxMwKXq#Dq(OV{K-Y=KZ+BN80fZArPcgk+Pj~e){aqa=PaGV0MjEb9 zRKnJX$_w0ts0KLna&DQd;IUMu{+4DTF++vuaat&$sXz*KQ`B$@VPj?aVd=M1?gLD`jkqOt$vvG)D#{awzJCwjBq0Hy&`Dg1({ z8R8Im@n&eqTQDerK-4dEo`BOCw8{sV;0K4CzYj(d+8}s= zS~OCY();EuuhrhLU$@Wo#ml#ZRabRMvo%L%Nja1mlBE@4)t?)epgrYIZx6iS2jD_3 z-GczOkQ#<~<@3T}!Wqy$#UxE*9nH!buwF*UmyvZ=v5*(YlV4e-ynQdy3ILbfJ>~94 zrth%1a(9oJZ9ipK_3?xX(PF+EP;P)rZDX*pp9?VAz(8ndIsQsg?xkc;56;z3X;r=TnckU(w@5aA1R=d((r#v{P4W9H#!^AGY^ld7zCnm?kQ8rD2}M^b}B zZm~R%Oboe*jaEF6Yh95Aue>#P{;+LzU078u)Gl9L0=WpwD<0URocXZZx^mnjeeQC; zBx_G{_8_k(^r2ijsznZ0;p{QCHfJ-1uFSVH9d;+h)zPyLNNLIaS%jSz`Cd42iK@S6RN z{5Ht=1HbMpw#|H9W%q6We9>$Db472Jzg~XBdE5U5za2-@0{adE`H1J(`|w7IhS9VX z3CnKCh!>(!76ZjDa=z}LM}xtf`xH&0xCJ2(iidj zu_9=VD1HwHh-xIJSK}E0UN#_Vxp9yx0-UHqVMJ7q@*a@pmB1<6brQdWNQ>VJftZ(C z0aDW#@CRI8rv&d}f%KM!iz8ScSQ8@aB0IpO9*JdAA;1TVS093Ys46@H<`p<#D`l;o zaemvmWHyu<9u`y9byM_WAysOdVW-)ot?Y*8x@LB4PBojKT*KU$ygr%Swmq?}=l-^y z*-eQGN36mTxp==~(>?Y9vv#iit)AC=-pGGYyKc_)*6{1YZ?r7sYx4^hP0;P4nbKPp zDH!FEjj~n5%Jw$th=%M z`tD>&)y(tL&nL@k6XngZ^5*28uEd`H`+NFlPbaFj#HzL=s&?J4+7*4~Y|>htux^Z5 zHzs#?Bz7OYzx(Ly&O}8^tfFOJGhY&`*bzN>DqZxNd#)i-yFFIBeW4l(o_I*%8f~DO|r5fS>5!BsS@S^-Ig@mFKJj*Q(g2CI?;LRe&;D{y)9O|Em8Z_{o1FZ z&$*I4#}Yly-tT#Kt|w8qBUZN~QP+9Dt~2W5K2s~#6hl8PTNd@|vVw=nj-crfJ%CKI}&%~j${5zZ$j54KZ8jU=64o(-OWT2KqKsD{^;2PA^0fBU* z(jy~QrbwpX2%h$^O$@FKZP#X_5fGSx1dD1v&vHC}52RlfPk(^1PvJm3GT{NgJor}jov;gj+FX6bF)%V&O-Z@F^n z`>RH2uFo 
z8bFvbSa|^CWfvoia&FH#k6UKO&Q34~I~^k3DAB8|UFV;HB@q?eMIf9&Yk;V(+VV3{ zgI~4f{87jb7A;5nKx)E}Enpu{+~}!}m!JJre&Ll9Q>rOXJf9)E|MKaaTO8GuOS`MS zIL7f4?rf}pSO(vh{tkkF*qvpSA^KH{Reo3bq#X{WIqfIzu`;Cm3T&bR&YMH7LdG5p zY_g=X1(Qh%7dV2O|1gY_jY$YcBGT)!k%I&#YLC}*fm?|gR1#)D23IN~u!0yK`Gk~S zMIdussaKJ5r=hCw90X4UnW&{Fo_|on20Lb5bL!d4bFSImc)tCQz!167!e7GJe_G&4 zAM9Kt2n5$BfsEXirocR*44OItO$RtQm zf{c~BA~pW!ShW=b*cF}eTHcksgsv>6E1Rvm(RjTvu3MMTHN|vIah)Tf+Z5Anny6eSLb3>cYFr8?sBaZaaba5{Z{cFc=u zqtgj0S#PRD>jbNn+v(&HynufW@H~q4q7K+)Fr5V^q(#VsJRWfaKS1gMAIu|~;Ge|+ zF&EEaf$FT^@8xj~Zc=2j1X_92>yV%h62f~ByhJ@z*wnhkom6#gvZ^LoVt=UDt1Car zFPU;*d2Ue+Y2dmFtmI9LZ#v3kUcx%S`)UdvDm3cKB_*U5F|}xFR2R>lUZfz3)GcB3 z@HDMe*UY*XDTpFl=T$dbmoT|ltW{Uc_G5O%-02AW`tz7w@es0WW;>+XN)pWv+^M@g zm};a{Tjn4eqKN7dM$+fum+eJW)R#(!kNr~RIO`jQ5Y`mJA&0ezf?Q8(Xq1-4Ucj|5CuHv&OvmUJBxj}g-Z~g5BMM)WwGVBSAb#|V;mwV zz#c|N1Na6-<8GKv`oRg}vs4L!LoV~7WJDx}#L8ucL^Z4Cz<>y*J}np!btzMzWM(wT zX2x?SHV+JcXizkQK~cfxzoe4S7Z?@`UZMt#Y!xvWnxKaw=+VgKpVP6$#K2h1nzM2> ztOarlNo^x%0wZDx7!iwDE7ZZ(My{AMqfiWn%~B|_kP@3bmsA}y^j-#~C8QMFSV0DbXw*qp@028bcO~hVK%;l1`+=W^X17h8bUA!4BnP7G%rgb<~U|N^CNk%ef zFqb`GW|IjyFqeL?e1MG_`ys(efmqA&9iLRIJy)fFI*Beaao7xgCqMMfN9jj8@HtTG<@$ za5!pL_AYk}Mk!*4z|KjBeWKPi>T#mkEic7_5@kfen7}#7(vy`JO_?-eIL<2?2<3GO zo**acysnX)W39}9iw7=U5*--?JM=*ydf4Stl3}rx=}dVDFme`fm#hT$ zxCHoQgKKLBl>kv528oiv^CIt1+IEMTDcjxh*`_;#jm_$8n^H*4Rrziy8515rLPEnhVPyMi|6t9i*PJJu>E@rNl{i(d~T ztyeb&B`{rmgyEJe#Sz%zCeS8@~#k7twN`qc-Fkr5f2Gn490EzXh}U>djtD1=0Xh z2VM){u9BfPS^f!^sd42q)MeUk2-Z9XI5JF;>}_DkWCfcJj(}dx>t%?GT{AiTt<0!r z6u@j%L9hiK6LO#!8Koh;77 zfDY^i{x2W`Uq18}Oku0#Nsy-l*lpyasBdkPN_Ii?#`hynEo>Zt<;CFw^NCni-PRZb0d zTAw_r4ylErKw4)_e5ycO3`c~%rI1wu{h(9Iskv5x4}-ch?I$T8_DM~y9$~I&Ml3y5 z7)fpw(q!B6<=kB8;g&=O6lw{1jsoI^2P<_hM06~7KC3~ppUVeh7z;VFQJm^`_>3__Y*{snSddHi9!+(#ys>!9m*Idx0_Ho`bOuw&^x z0M8Ar6V(Rf2`Tw%0{sp-Cle2E%B1B~NGp$z>)B8K0IhG9%K@#goiqf{E=iu@9darY zvw0!IazBMZ7nqSEiGRWU_ zt3X~cw?dxN>KWM!U1fkQ49M@{a|~r-#>JfTfLrJ&cx3~X0XJfq*OsJjte9@GS|F0J zRV&k-?gM%tbSvEmd&n4)K+;J$n-Z2BWz;5h~*Ye@_iQZiFcX?odA 
zPY^G*71A5|U^e}MjQ=c(2I)aXHRl@*Ox#pR$oCK|@|OyOVHuxGx&K;8D15Ir7z#3v-q7pO=%QT|fAU9}VMd`@8=J4y#!LAB;P$a*O z>OHr2MLK4M8{^*{zrE`V0a4|PRvQi6C-Oi1QyA>WU>^qfNqwIph5=HH6+j^XjtgEV z*yTB5=NI*o=osMGU}d)pwwkygtki3?GQk1~rp}Q%t&FIVQmKx4Tv9T!y_$amN774| z0Kip%&%<}su-`KR`c@XS%kF>^6&O6yJn*E23m0O6Q+Caiyd}D@sL+{G?8?pqMd^&R z{{UhHQ3LgW)ggWlJbo{aT%4!`XAn_~E*@UbS)jk?U7NRUhsq-;Vv73n+&Jrjs}ljq z?}2>6tJoNcv#9SHcXI@wMUB@#G{o_u7Fd`Ouwwx@p`R!3hkEurNj7^E8cibhTXn+L5VJKzta01M@WEf3D<71uiL~C|u?e|S zJ%%t9uig_k?+JH4C@xP{th*2Fjx||bo2YJyRfCN_UcCc2rw7*Z+3l}*6W02ewf>P_ zZ7v9REEZ7Z)rqpESXonKV7~SKmIHUK@v@$9$JIl?eOb#BCH7c}JyI7hX@O~#RwPQB zW2MazHeR|V+>;7Wuqa_dckqyYc+>=OdlpKYa7>+ehM6t%<76SXF1ds{5{R zPZ6y;{79>+hKXP81MY9##&E}ivF4!$_&{JKiw@8rTg@DrJ~VT5`siH8f~5iZ%F^(m zsV(6{zk8&mY&F2`LA3{#@|nKrzUaF4yES*s(Y0L(OLx@L{fXHcKJ@!VEmrycBj7H} zKBE+{I}eP-*Ltt?&hCxWM9k5$<^`kUfw^KXc)x!4{L}Zw7r66L&v2Z(uweFnX!QPm zQH_nh6#8M8U)*w_kovb>1_*yfS9CMf&njt5SCf#@A22F^X15*CDnFprkpF>J4LKhe zm6$Hn_mn6n6+{or`6ZYv(dwW(d7d8-hzb#IabNi;2uNTUN*FWv2HM zCD2zpL)u9V4bG}63Rvl!j3#Til_$5^Mgilf8!m!PS@DE4toi~l72xx(&Ly3WltI2_ zMI(3}E?u1`fB)Ml)*o0JDksyY%k8WiQrt3T7zS`Zfq(KB(BLMrmf8g0ei_7oXDAx} zO(-O}7Fe1}c{ac>wM=RQs4*hXaE+YG#OM&ZoMFl^kN_72w+iJ^3EQHW)P4gQDH3NG zDZ43X?dAetWkzK>L2baafa=bXB3(EK@FvmX1vCFxfd4amSMe`F#Q!r4!WdkE;8R?% zpzPMyOvnqG7rVjnPa}+-JLefEmN?MJAbt)Oqk`}p8Cxc4{J(+%{w)aX#iEYqz#l*e z2%-+i(p3vdF0S50>X3u6WR|D<%y zOlUeZ*D+r_6^fT`1r%`%{ zUNARCT=Vq2BErp&Cw3f&?KqIwaX7Z)@V&n1ndjp>ocA5hWO>C@$H$hkMJ;9B1?aiB zJWXTQie|La+DOOzj_>u&>VVAMw>8|=eUSfde&VTt*i!@X zr=DIgodQxb6Pymt)x5R-_4Sdp^PN+{c+M$MUnV&#fKlejeaI~^KQS67Sjv)o$ zFBMonAm&X0Wb(+9OCgHleX71{RsFQ)OYVUJ48f{X^+C*D(SHsdABrNY>KQWq$*F!S zkT0j<3v*ywHZ2yjdX$=F446w0f8q*(a|>%&G4mWcLl6{|Y~G4`Ij}U$JhQ@SAyxhs zTrs;bxVDpUwQ2hcgZPIz$NPOm1j94!2&y)1Y`!2WH*LH?k{2;~F{s=LnTnkk0Dnl# zr6AJ#2>RQrgi-hxG5Dt#Ae!5YbK(C91_T!x`TvAzTu)I8xAmNxHXfD}$3+e1>^v;y zceS7BJJ@%i-`UaL+bgl%YGKst3Gnx@r4nuvi@H-?`<=(y4|Iu2xN{6x76}56Y^_KO z`~t*8+(0%}GAZ1KtYFnDERxOPcOfEq59)!|PGW*y?hDh&yrQY&S9Z^?o7;4~@z$9{ 
zb9=10J-WXA@%g5y_A9Nkmbs$qWw+WA8}`OF?2Xp#P3D!%7G2r*W?N$Ip4i$w(aJr) zDJY)WKGQbcHm8i6>*kIF?ui#PBnnz$1ucn!ZLxxF@q!&;WiqcITD&2ax8aAqiLITn zt)0>4PLj7ambdm+Z=$I^*3=H%-Gjo4MB$oP;hLy!jpX5g8jfYWlO8f~pO)|%dI%np~6vKG5HIDFVj!z zt{#Wpv>_ch=4nHE;D?kWG|7JscuF1cn7{r1G13%$wfK+fN1c1d(9d?5ml80RE?C<^ zZbGJo|j)XG&nZwv)Az-z?y%mf`CV6 zi;qBHH;4+J<9`iF5sY2DXH-;!;oE&))E|M{;s<*V_H~KsA>KbWD&1I^!kQY_C|H>Y5~9oL2wWrI!Uzd!m7H|%?E+sKw3)N7XP>SqJ?|U@XH4QQ3W645|9$5 z`oK*Z&M0G$TSVa)?`$;{5_ozcqXNFZ+B8rJ=$bWQj`(poj+ zoA$lp4;G!=kK2CK7Ps_-yFM3-p+WO?P3Hd(nAEf$gWA60Ie z>U>aIKD*W^ZT_r)vQ#8NtUWS)WUekz zyE#_7d49*8?z@NLwMRd)9Q(v%oynihpJfx(j##xLa`}$tt|4B1_#;#AVhd$s7B_>( zbU0r^l8dmy%WNCC4FpMk4-zw3H^PAx!2A`Dj9fwr9+;OQdW0ULvNWcUmod`iGCa!d z!S0*t9qb*c6x`FylZVULCs<_=(053c<#?ikW1hh5pNz~huz%t?!A(y+%x*udUEZV?=BLOd z9AxfRhYX=SRu=;C2gyCEATl;388T>Ksjk)osZ*qHlWH%^_e%AvQp!pFB$R_`-_|Al z`U)d^Xfe{IHAiqu!tXZSNC&tamuHN=pFl+&=UHM>TmeS@74N0B!0cvuKW zFVdmDpk{#lSO;K?{|xz}iuYd@fNr^ns@bj=_1p+3jCs`1{|#2qor9l?0F?pH{~h*2 zaLKPRMKH+!!W2RrI3H!u2r8IGEf{{F*@#GcB;u$9-M>il{O_>MRt#_k{1FHQ#8U*% zqyh=u{19@3t2{9sX)V08JRw9mbr7V8WwN*@to@Cx20Vy1yuKkaG#`l9wk2$PuJnZ4 z!(++f(wPm@8)jXvIKmyt`o^~=UY~g5>xufEvHG2NEDQB}f1rI=`&0eY#hJ^~m%lR+ZEOo4h*}RNdyhqrpNJms|9Q`X`Q*=g?p}=0-`C#M z&MR*jj`(p%qLNdIdbL5YwIG~dG_wP zd&lox{CWSquDg8;<^3NS2jGHW*EIXJ*QBg#O|)9aVam{zA0u{~~TaDQt%| zeYdKlt3dPamSPCsYbxn7Yu;BBV_0qLs?xo`y`;-l@DpP(hSh4w`Kb-VWf&NRainQW zS7G6a;GxJ83jo`!e6j@(V6A~jO#&s&#oTA8E3ln8uwn)W=fYw#&dgR0XHMqTJ*q7F zpHo1BUqp}vB$(#_ndjeQ9zgM7g`_8!vKX9;(l_(~L+|>^H39sYxzm^RWlu{O2bk+p z8ISb32DsD2^Ampz$Pp|&Q+KAW;o0Zwp4(u@q=X{WXW>5g1yEF~y>RNs;W9E@o&&xa z-OhsL4g1<^&eTD5Y=_7Pllw3$Sn&L_XV@mdLXLfB*v2{$uIhR{BOI_Dh&lM*V?YoD zN@H2*<8R@08QXy#XCm$6K`%@RD_D+y17*S5)%-GQ8sP`eP!sCIulp9ZeHAJL7BH8D zb->mZTktAgw5ny^@=?+Da670AuN}Q|^tA(#n(x=&te>xWyHR3PK7_cd}!=n zR8qw|KshJ3ZN1a^QBgazwUi(lm~DUMP{L9dv(&{c>%(1HeEHai=52rvW(ubZ-_%C9 zyS4@9(1)hOpiL~^PouAoEnG+#8N03zu3)Epa0h~xTrRJkNANXpac!gzF67^pBnTOiM4a1jI4T=SzCpzevsbs=_9qRS+HFTpqhw@)y>Ga$wH&M%SohoiJr-9zK$6pSM(DZXP{BJYbE^VH1TmPHDpNZS%d 
zi{;g7D`r7754}Tlr}J*fj}9+ka`7ozr!K*3D-cCa&%19vN0RrXoanKWOIWgKF+nHL z8lp(&e928{4avp20<~?N+5S6K4YpO< z_)`AoDkZJ>oPq$r%1{w5P3X&G`tsSGaedtjn$L|BG;MiEVffi4rOKv=QkKsPE-7i- KXB38HM*kN@4r*8c literal 26291 zcmdsgdvsgJdFRFZNdP23f&^ceA}JCgMM~7uGA)VXLy|2~wn)hqZ3lq}kOU1rd;v-( z0@`lswp8LCQ}wo@lD1+VZKzn=Fx_s$bhoE+9<8?B2O!8Hbj@s)6L(vm{f9EyP2%n! z`}^kNUVs$r?DL%69SJjY=XGc1&iBnX-(&bU1qEgf?$YnKgfuU3+)wF2cDhVty;;R^ z*SG*Tzy(wRbyzi^Vxf9KjZhQTjA;k7W4ZyIit@E#{g`3Ez~Z{Fam+Me8Z!@=$1DSu zG3$VJtYDyk<>|w=G24KR#SLNmm}9`f;_7hWKw-cbFolc8oCD4=*MMuRc%XQ!WT0fs zJ>VWI9Vi_u8z>v|40y)M2g+5PnhO?=9+W$s4TF{Mpx@*Ms?^+ZE?|Cx3s{2McQl#Q zYL;q6YC*7CN#$9p4XO5=)EbuRKx$!5>Nb{IgjDA{>P)M((`v8F@4>6hr>DZv(9Y8n z0e_f(+AlU&2M?K_ZtU{%Pw=1XZ{~ZN`P0pO-;8-D-+zej555we!Ye2J zkqAmgklJ&IKOc%vq93n~`Y%p}{Q@5h2gicrQ7Uly5P#Yq9SM&4qoF~6xE1{zLm7T> zVmyi>DA9L_??d)D^7#p2*gqbMp)9)77>No~gV8A==rx=BrY0vRglOat|8y66)!BEx znSVBT5yMAEn)#DstV18tT=ja@qNd|mr)cwEL=za?STH&=5%8)+Q@|hf4~EfyuTIpS z3Pqx#wktFk6^&;m>7D*CYK~8fy8fxja1a%Ys1S^X#wf0R8Uy)|K8M#N8oMS2r)ZF( zu5)B+{3TTJUknFDLyt5-(R5;ZGAM*Y$g;|1`;jD?`zEG@!C-$VD2V2#g^5eSasT*W zP;_)kQ_>>@BO~L%NMum@@RF~LRFolLy} zgOX!JjxHz1G_Cg_|SNCf|uu`_a=8!Eovjd@Q_f38WC!X=RZE&dX8Sz zI_)2y^pCf8ONM;`RLH{^Ef<+$=A~~2DUawGr z7KKW9nXw4fh>Iqa_DP*`<#b9(ZK7H=oto;pKshh7WDZA}rL=GwTyI%sL~KQjEM>{36HUy}{krwmr}} z*VJlV!F=Z$hhX8vM-;4UIK7LduNmdof`*k?)MiUOq?)YrtmREhmpVpn{ZHX-Ub+GG z8{B{vOIH`rvgM0KI$#JCA;lQbNBG@5San6UB}NJa?z zCvU2RDwK&(WnNmDSZAT}P}Jv(6)6L3k<%Wh!iDf&=hB|aS?!zlbY=Cd_C2FV`qVU? 
zf(PCv7^2n4I)4$7Yg|<6YFrhifiWjJxR zkF_n6fuWw8TG1TBr-(!uLe*e~MO}E}G7zmXI39>x4n;>EflUS?F=ux03Q!)wYa@g* zUguUV#dAZyJ9y8sE$uA3)_%49?ZYe19ZBbo#r-K~Tf*9ww!7Grh-L}T`KE;MAWeeo zjEJyFIR*b3XObu$ZblQe$VKAnS5y(Tii_iGj~2+O*)XmQ`<7Ooz8P&?GiuM4kz?7i zGrG8L)FG#5!%=58F2@48Xt5m6wlG>Er)5L={c)|&DyI%}GkW|C(Q-K@t_$epl3m=3 zPjcJ~SO7D|8B^ReN=R~x3mD`NJ7bRPM+tRqadER;0`tU;)@;efW84%s%I}=9#0{gh zdCFL_t-N_k%VCDK~Rsz(S}F!<9fLj%z{hKN3RH*Z*g&b+z?mGv>?hWa~f@x zi^kQ_-Ew@CV0nv^`xD(KXN>NbW7#mS9c|1$$+6MLo{SDii;9bU@k8}Yvew#uPSdCautMt3AOxH2h|t2 zKJGI2isk}$S>?6HDi~g72ze-k{dg0NArQ*xjqQ`M^Y}y*1%M*Lpnscr33U(Q>7pN- zJfUemOHoc zBHu?N5cU2^R1Jt_zRd7_(q`ow5+=s*=HZ~IM-COy4TdKoK~Xz85gHdQ7^p8ZLeb_M z6oP2i$F?2Nic%;LRuL@pZVe6|qJd^29J&Zva>U=ZZ$DZdADjpTMbk*|l|Tp-eN;40 zW=Z2h#0NnUV>*2Ja?z=X*UEM>Mm4e-7kXIhm<>^by+AbLi%J7Cvl!cWMI(*FhtZ2# ziU~)ll3p5UgrRCZui1UD8#z@a(O? z(*AeCD=kNoEk{zWqqCYfO%I&Kv&M9>d)E9xRl}lY@tGf0wO;9+Juw$};B;Sm;_4F% zB`Ig)Y**Uqdh5)UGxJ^dtTiBPOFS#Zdy>U_mb#W3Q^lRLJ!xyninTInt(+fv&sx7` z=c;$D6>%o(Tc#_fH!Y}9UUQ@Adeg#4%F~MX+B0o5CW{-<^xWvej&+T?1Veb!jA44} zFw9kV^|kob_}j0pmhm^Lu2(H;zGc5@Uuis?Y&@K5Jen%&SSfodS@u+_?DU<%J3^xD z?1n+>p(bo=R<5RbR&&o-nYI_rnt#4=PQ^LOK}(w2SFJ^F^sl%Kt4?Mpk|E6#?b zvtd!cWKKCBziWMbO@~4sGm_-<+U{N44|nbAsnz`DHWmC$GzbAb+ipKb(A^HiKhSTz!d);2=ez;K-t{Q3Y|ywKyQ3fF@I3`$&0xwwxS8d0nQw{*m%T7;JGU zkIiCBKNve@bHTnGG&`zn!`U)@oYxrJ*2VAyFXIWGFc?oj%@eYuqeM)*d)LmFM9uD9 zFR|#$6n#0SIfz8{!IwZhNvJ5|H3-BH5r}jU4Sn66zTLZ?k|WQEdP+I?lvvQ+aklql z@9{ogXUC~iq6WYNyh!^6gICd-p^+GA7NO4skdvV8DLllgv(T0o%SBNr@+!R7xd-k_ zX&KI1)0V=y7m}8mg>#GHWc{I4XSK9?64r)vfpadNET~->;!afOm{|eQm;~_iy7P)5i5A4OW(wPk#8mdKD`-8a)o)#4w%VGFz zEZSUh9)x&?pzq@5R4;-?V)UJEMi1(Z>TPiWwY<;I7y_CZBWR^Oq<@yK0-b08-S`#I zjOMsGALXEcF!Cd+H%p=l?Eoq;1M)d(3iTL6&+zs)hIcZvJY&h}ozi3U0rMKS_!Q&- zsxA+`9?zj;@F*A*DT!j$*N6yzkmjT#J&&QPL8(@Zz@r{7X=obb@rUf6hViTUGg1QJ0O%#-6 zhK9_PMtW2TO^O<93j!TW1X{t2coQwriI;-oAoQljqrzEAeuX?*WkN4`gqyqt62>9~ z%@QQi>;gS91a=fLP}u@o_?-5>yFBfx*wAw>cf##W zId|OZTXMZ~A?e(|?D~u9?^fU0oqX)nUFWI$Ztp5zpRU=nQuA1{=CM@G^hce?!5hSviZck 
z?p~=kU0QXc;CjJA@Lp+i+PgF9+Wx?`J^A?Q#qCRbzqvEf_iWOI|J&Dga%Ht2wQ-JZ ztEH9cvZj@?-N~}usj@xks=9@dL}ly!$DdqyExG+b;_0)i)!uaN&U9UKns3GX`4YUp ztZ0tjU*DK$Jd~M-O)MSJx5L2QF5)|YQcQ)M~=F+3o5R!WKx=Cb&QX%tsn|l_MCs86!nP<{bhJeUR9{uisDBB?2l@@@ z0*6+%VQFV{QDwWzNAbnMW|6l#AcV@eZV-%`K2FQ{Pq7_aX{^l3l|Y=GDJT8WXiyl3 z+yN}{nv?zAr(cLP?S8&v=fI0cUTAqCvg7Q@zNe7X)6v;~=4@ZgdNve!iSL2%VnT?Z zN^_>lft)I*J9;~gcc1R=?QiKr)vl9$o#*=cPM+zFX-{`;??A0$`96cAo4wh4%XTJJ3zE-WPNAgh)scf}96JA{-=1x`WF5 zde8KC)4Ti6z-#P0)7y9E)X6T?>%z15$R{*c**g&pf+sq`=WDmus}Xe&_W;AG1)Mq~ zAU_93ym|?@9K~lA9Ws8Cr&To5Ec>KdhD<65O$k9VR(gaea|vBYxMBv+v9cUou^Aiw zHL@dwj9=$I(&{VB31cZ%Vu|P4z}11bUzjz&U%xMDt(iSO_rj{Z@U7`9)AR0y9q-w9 zrrou79kpvZZu@?)%pDS6a1({7EZH8kzKKDqZ~#C301{B$Qg}%j>QWQi(IYc4*b!CW zFOLMr6{a+?J^Yvap)iwD>4y*lohflEgeTAs;j{3Nz$6GnYX~n>LWTxqoZdr}>W8<5 z27MAqnR#}wFQJaN zM5sp~Y$uNz5wIt70tl&PZ5ByLyAD>2~h)vkU(EwaF9nNv$V6(ZXpnj zCD2+BXdGLGB~6q@9x>e_cj3LkJydIS#*fTu9lvJO=!!`kf?!d*PQgQSi_S?h6aC{gsw_Hw5m^Y9POVe0=F#a&meBx$+uGab-nFb#bj@~Dr7L-0cg}TQnO@V; z1J2}y#dGJbTGkBg(a05-u9;ZW%$aSo!7JtK7WUYsw(DFAcnyNZ6H9$JPp?ySZJR;o zTo_#A5G+Mlu>8sfJxK={8ss2;ijxW7^>*x-|6OvMY9?thC~_MkliQfsvn6N@T8GrM zkTSBG0!VO30<$^C0*Q+)lVcA$GUcsEEo7+$td2cc#B%fj8*-cwsT%_hmScpxs1Rwc zKoNT`#@n2DF3CKTDAa{#cc3^}8Z4t$&<77vOORR~aHCdfF!Sa>8Di{B0UDJjSd6mv zUYPt%Hz5>XqpT;TgX(mLC{gIgvku1O7EJlJn69;oJ zxdeafv7mn{8XB4k_f1SqZjmSrhJz466587=R~nJ!&Jn2sdld;CA5j8;ARcN25H2#A zzQBZ7q(^WWa`g-~`f0MMB23*@*%YKv>Rw?da=a>ugd~=Bj=+gQZmeX}bSUzshg6nM za2a{iZ`!4$Nz?r)a;8VnXg)bpVSYK&BeJBM>X>098l61U+8Q%bB!U?6FNoL9a13NT zQ9o1>K1H15-aI}FiTd`G^NED@2}PP@9f^*GW$r2?rZ;UQ z%JKZ0I4*A9NIrc7xuqQ3v3+p)jl(H;}TKxmfdF!>H`%y4K& z;ilyXvmkcOC$xfK0{Bs?m#@_*s7%R?@^UO2zLSZ5!n|mewz<55to*a7)RJpQX%A&? 
zls3{W4$KAGMYlM)XL1|)#9DHh00eyy4;kLcd^yaZBH#)b@uYkU#tJjVB$UUW=s$f* zMm1Z+HtpyR6IU=YK-7{brB^tHRB$1Q5)lhBY{bCCn4bg@na3!Oyh0}xPy{bb3}YIS z6>EepYP*}7D$UY6qVd`8W4@C*BW_Um>6ACFGv}EQ`Y>@VTNS0I5Vic4{0-k^v2r0WxP-;Bs|dhN_}C zK3KHOU=Lb(C`?`nqBObo2xQHX(EOt`Bs zMYDCXd&m~it|7LgJHs-kRV!nPW?l%6OBGQVLK=waF)miQt6PBoBCy&N4JFK;n@j-~<_E zfYt@%4H+<_b1icKGiXT1RUvLjy#<)8u?B@hkXg;7Wh*H_Y&vK#NV=&K3pS;GY$1Fl zDN+hvJ@_(%U=~CFH4>eM;;Uxde@WC2O@OllC5DlpKY&o@kAx7?L0;6dUD^Q3IfSE& zs518hR85f}F?Ye_5?)0i2363m)m)7175*jFG6jO9Fado7mWQ{3MGdV8Xih*A0ZNX6 zY2g{FNdV~0YF_L3-TVO9QK<^>oTS9RHzCu&Q zkFDPOPJY3)Fq3lbo9%k2=SnN?R`mVI-9M*EJDkKL{cZc2hATa%0!wH|%eOq=^vrc# zd-Cd&E3W#xuKMMo*>3E0uF`9dU43l+e9BpO*LiYL`z_N=(~>vU*p8>$+P^ZrYntu4 zf3$0EFNyva_bx@}p19|1zq4(&3jjIau&^&t=1n-g39I+#t9H*voXS)ISf4MNJNSh& zYZ^RdAoLi2td{#;ZNafUn(wu#;BSIZLh1}cKVXLn$S>9MMg%C;Feue*f>LcXLpXzU z*5*4*1RfzsnY^$rD=#FyG5m~xfINqctel24$An$b9?KA#=&RxaP!t_ji19A&E%Tsy z7^;acg4Ad7LyJ;Wi3E(dOfnQ-RlixHML(GCS-xH<(!-3No}S5?^7TZK%K=-R!rMuD zm9Ly3px@Nvf}9=~#0&Cbokx3Yn<*KeakKrJK1qSQrMscu@nc%L2UU!^^Ezl94oj?+Gw;fF$espRdE+sSw(}) z5i6C+42eFF`1W)-VW?2}6Vw&{l)PVu2WdH>H&GoO!7tDcP9@Zi0~g!&1)9krIra@NzyqZp*A0wxxx=9jbXR5h(WPw zVp3vZGGyn-p(=)zCjOF2szRcP4FyVE!$OiW?^3#1dcO~E7yg=3{~dY%p1i-IeBEF$ z9G3bOp*=-D2gspBqxh2!Yyd>1Kzt>_pO~b zcP<-JJC3gK9apV$s=3F~o{Bj`+FdrM1xl=Hxbd0mpIJPfs@ywgzVEC|cRaPy(Vy(- zPn;h}yztq?Isd(mi>a!?WXHwjwt3;k=dOQl$umJ{J=ThSQvx(;}B+p++oEy0F88FCeOa2R|Ep|WL$<-WF zt+g|e*e^b6<*Ea!UwmZdDhE_Q->6~)n#xp|wmO)0$C?2TA4j_I!QSYo=-$S?%a?VR z>fe2=gTur39xLmvG=9HGi}?55I`S*s!O#U`gcdO=yEh_k;@gn$$kLWfN zu4FsLpW%Z`8ZS9J2H=fZbE(HPGrBSDjGpZvTI?WMEoKAgJg7#=Fr7~?1ABuZA66mO z0~ik(>InGuH-K-kO9Yq^0K@zF@R@@3BPP5X>RHe%&a{C#rhFRHgzbf5=mO-?#6TaE z-63EJSUp6i6#(yZqja?2;-I5!hJFgo862>;CLgmZN^DQ&P}=m%-Ws>Y3vStDy%KpQ zf#t6-+z1UM;6abvqJjsJuHZp?tXjr{WS}7PE%+>RAd3z`yBVP4&Qw5+h)6Og!jYoN z-%r+3BSLUU_>V{s=E?iF@OnQCP?X3)uU6EcnrMMo5eF|x^6!j58vq&x0aEm29B3$} zn#hKVX}3asK|z(G6}6#4nxRKY#w4u56ZPZ4Xn0~!NKwjlmV&-PUda$CX{eI<7qMfg zT}hRJwg-uTwB_K``;>Z?%h 
zl|l)PejS;ynomHZ**TveRJx9`K&7M#x~8sRsPu2G`+t%{bl&%DTR52VG_QE}Bt3g( zpL$SScJ0NhFD~p^?4Nt_N5yS(sgqg*Iv8&TB70bJ@=9O<*2#XpDJ&g z>jJj)l;0@5UOK10f24bU`;FG?txMYZ)|6-8QeY{P^gMp2ZO(vFl{G6Bt;vekCDqd2 zWT_nhg0!?WV3Njhr24#k9uG+*-#^HIgunBRMAuWTs=*zs(sqZY{zF%PK^k}|crh+d`^*v7*k*vgc{pGbPZ>F^_=_l*pj7OlHt_Ily z?QZ~(Sl|!9s3;nWFl7L*Hf9J+gj!(0`LW3_yb`*^qNJhJGCXl9W{UvIVH%2uiQ#Z? zld%(hiaq$PAA|SXs;4*=Hjy~@VhK>ADz3sr;&X0w*@^re)io8+Tq$%hxPPv>th%Y{ zh3x4dDnO4w-LgwmO|tcpQ8ay>0iS4U9}bQOUzrpR$6B|5;3fpo(hg+e4@VBSC>6d+ z07*>!pZ^2*^VhkAtv$Kp@r7q*UzUz^Z=bqn=aKRYCMEyXifUMq4u+sLL5f*Mh@zne z0y|R^&|?sIX)uUW>j!9!eFx$1D8`V8mQ-^kecN6niW>j;G+5tN6xaC~VaG5Ifq{hj zEQE4l5Wo)Fh4MS0yDzDpMCb&<^PeRgBw?VxM?$RP(}w?BG}d25F%0zYjNYG=RLw`< z9$zVGN|rRulIFOz@U8YQw9j|nICcHhO2y7(#m>a;Q^|@`X=gR%7SC&Mn6I0^Y)x9~ z7PU*d@0h-AT5h=AmfF>mZ0t!pw|xXua#jU^U$xj$Y$oB~v$Ut3l~mt7-*nefkB0~L zjLq2tbY|0em6i{K`n?48x?iU3{RN$@B9vA6nbiTB)PN>+9%ypp*skdDt}U*QRE{)& zPR*9~v}jNBOWMoBaboT1(4KZndwR479Y=ZLE6|?XKRUZ_aZnOhaNQ>;i5p^73U-iW z6uiU^4MnGU5~qNH`4O6pnL^{2V4E=^w8S(acv=SAMMFXGc>H0ASHS8CQraf?rjNtm zdpjaI@QuMs@O_EPz_3|{VSxmtOcmp%M)*gH{}*_o346XTS8H4XX5nYZjP3uVb(|x1 z$pg*b#;REds!^bM|5sjJr~xpaeDl?>y!wlEn!I1N6{@_3m_-Oq0eT}X;7G*^E`}#2 z#-N5d%pMJwBcm*$ABwWjGCoaq<5FQDQ?g^H4B;Uf{NFb}0*MMXp|<~}+4v4xqZ2YG z;L-&k&{)Cc&`Y7oU?AitgdPKHovi>Vh2{aOSizS-uld81BYqm-|FvWL*PF+dv1RzL z_#6TWgGE>W$?mhh-M)@<{U^?xJqhi%ljpldJ4pGRq6><6$gI(hZv+fU7!fM-3=s%C8r&0>kCZ2Rw7_OCiCSDbB0X2SZBUghaT zi4O=7Z!VEBexNq`^0z_4fP}RLQa})a_NJu0DPe44Vy;cNnot45)oqC6&}RzoU&d)h1@V?zWuleH4ioG^zuT2O)@^a0SX=W* z?Y!gtgm-HnsjbL2vK{!uck3RhO^1jr4s)%fEP2Fo%UsS8a$r6f zTkk_6t3IR^78=8s_F4qmO+@|K?oQ~==E(31q7$$>^?@XwfHAsgJloxIs{iEaZqjX- zj)=ORv)z3sL><$f6?M?QJ=HB5JI;2V02=C&7cng}kb*Fgq_w3`a^Jv1Y}?iyQ0{(K zW#F%9CDV$3o%@NseAQVinXxCVJD{5RR`^Qzi{tb0q^)UI^T1+Tu~Z~26)8*Af(t|# zE8LuPHYcpj=>o@F&s}+LUiF0+Sh6?C#131N$wD4Y%gO35yqK@F<_n+6P5yv7m3fZ@ zT1U}e$((?l#(y7=gSnKL1qlq&KENtt{3kkDVWeAKR@n>oN9q}zt1=cwIQB435w)_# z$?!z^;H#HpVKm3WNP*j|eL8Wh!TrhYXQt(VMzJu8fa5-%YXOu`%rEx<%rC5x6xj$Q0fZH|xJRs#mNHgJWQ~Nf 
zIaW!34hh8n?=yfi#f!CAU)H!D>j!e)fSwq8d2N*P&88w(!6COE^Q8xFB8vS{TcXQZ zy-j97#29VDWxNd7;}8OYK!@o@A>k;+89@0cMhF^K)p3fyO5SVawUS5tQ|LM|u>^4s zBDgYyG3>H--AqeKc%4e0C2ts>Yz=f&PLSS~?Z^en_!aU#4-a>hWNd-hJA^kVdxM#JY^&9c8 z#J~0`*!p#z4TG_=WUhOoi1RilnqfSBWT8w^m>`pZ4=tJ6C6IWdQa2IE-T6LABiyKyoNsB5~+?s}pRQaA{`JTkyo;&7L zdEaVzeHsQneA8-m!;Sd$_?M{*6-DSmv2B*RP*azv+n?eOEbmV8?X%tYtd(h)WS_or zdcOXim0!oi!MI1X7sK*r?&Yhd6$hVm@GFjnqyu(T?>Smv_5gz&m_5h_JEDoq@7WjK zq`FENli}>oA_D3NQ1TY!&`e&7y_BpPMv=4C36A{faTPda>bM3nTwY5OK=8FQvOvvc zW_t@ulK@O2LW3hY{A`{sJ@5-qJ(TWtBBm{vM?b-9Wy^=7hzro}fC2*>g%;1W2&Ls$ ziHt{hxQ_x#DWEHI7&Tw#pd3)uu&&WK%=as|&-JJ6P8by5wO226t<>#J*6m%^ChOWi z(r^WZ^A{dcF&Gz^%)j`ELzHopeyz4i+bjGQ8uM2F&)P;1pFkkGMffUte@q_jRFZV@ zYZN1*N%%T>e@`Brkc2-V?+?i%#)U-M5V0c=`H^E%L9?>ekm9$g2zejk7kL>a$dEz@ zLkgQpSGA_kV@TnJy+Z98hhS+B3vL_N>1pk}szA4I5m^Y9>X$Xk{mZ6g+p%?eUaNBI zDi%>5!P35E=W<}VEV-w1ou1cr)ay#fG6KP3*HZD#r`9RD)~l+~c@{Nm90D?pK(HLW zee%1nty6TZN9ER))9Vl{qc(!uXb{0#Td}Tr(YeMUSn{k>u-2l{)h*!t2p0E{Nl4wA zxk$HjsdTw9xf`_+Pn>veox-)o$6@7RD_qkd1Xhv_Jm?Ab9cA-nSI?}O=*djh9u|sO zxe|V@fMPb;?88n^F_lhdrZFH`)-GT8uAN08)O=>(-<;W?WQm%hZ!n0T;%FU^Ti*|F ztDOg~kkbTpu#V8emO&3yBz4db(BX$0x?pLchlPh}+5nXu6RzZYTC#4CZhVvOBt&X! z_bU)9v)cwD>;iyKQRTnLx)8VE2F#`TpBePxfIj2yb_MQ0#+7+9$_C|`$<x_5vm-XK3cTl@-tGv)c0kQi$PAZpyCJfEGuI3gtB_JIq6=eKduQGL=?PdfadrwdEcu9A!fBvMnZ25O<~ zYS~I*O|q~iRai^cBY}Ad0=3XQtJ|%z0rQX{mX~Z=~CN-EL(9wiBIb|3&az1K>xoM6`r*J-K z!ZRaj*;f)_cFu0d6$t%4V)9tFb($2bN8%bs2o1wN+FJa@r&g_HUp%u)u{DFX)I6*G zO{?@76^NGzD5XcJs!$MZ_|zml&v$qNBVr(Pw92iE>g2M!HrN!!IYL93K9q_vANOTx z@~J=MV=-gLV@mx2tp_Tc0p0H@djnI3S9bku^SKs-@;wVV{mG}!rKsX#$10RtVi#&! 
zkm6KQ6gAd>0XsYJ;kE3Uf{_)Ngmsj4aN|eawhv*wkx(o*)#WyeXE%( z19yGBZ?WWS&!?-JuwP}4{1VJRK(9B6l?WOcMmvT+=bRNj#YQaWx>#bXDEJL6Wt@!Z z7)0h_S?6O*lj5ZeK{Hz1Sf>RGCdgE-aGwh3aEsh}Hr#B=vX7a)HBuv)`E`I+u^`Gj{^KJguzd@SAUJ4gX@H}-|g z5O!X`7*x1BM!LcjikZrZ2#T3)xI$N<%JW$q2dr@wFlj;D0F`*fv;ugpnoSQpDA(Gc zDzA+00&wt*;Z*}}7G&XNm7?_Nh!yt)qu`2>$|nN`-cM*oBFTj!%sI3<4W!F}T{|Y* z)b1oWXAI_;9h-O;H{6PQ^07sCHt7O>KBXs%8 z5PL7-!I=6G&**&NAid8)l=aIXk8vkmFDi)+!4E|9|uBHjmin_}}2v?j+%9zmqWC(^wbHYE7cNCszBHjd3)n^pH&`3y#mYz&y zKR{eKid5G0Id8iK^hE6t`$%GfGH2=E;wGDDU%xen{+3B)HpTd*51~ zmhRVF=v*qAi>KUsVYX*=tyt@mRxn~4z$>b#TPbf!mN%`GwEg{PAJ5%nX3CGc&S*T>9`1C)4RCknp#OX{nnSMt9G>cP9$Ch4R8cdWOx@UbH#B{p6 zD!pw7@SV%{kDR8!_D^d%RQWj4i5Z6xVP`G(F5jW;bZfp>XTrnxS{mU)O}&#he7{0N zai00LrmiZ@_nRBK$}~T4su2G{nGXJ^(&0o*OV`KH4kGO(JjI@(@4tl(Cu=W(ycg*+ zJ%(b+6)<3`J+zhR%yi!i0;0VU(0wlm)^?TYZ1YG$u<+al1rkoDw!C#hBfanoKHO9+>!&?w!lE8Y5Q zpxcB|Nwnjdk6<{`BJae~n2D5qOl$C#@{bEEc9XE_%Cg>ut-y>9;z)W?Yf6kncwGZf6LYWn5+3)ZqIw% zo*#3SKjuo2_%p3WrT!TQ5359_#yT{Y&$?Gko}|e$e=udLdtJX_v#1K@oEsd1jk+ z&Mrt1wFqcKZ=1I0Q7L}NHfY+8B}jNIrP^$*88XAAtJhmUGSQrEW{Ooi-(sb%vFzI0 z6*FQ+?^n#IrO#>EG|@4$is&jUO>`^KakHA}YDu>doiJ;Nu95UQqHE1nM6a4tv+Ea9 z>=a{$sHkm0FK}16SSc@DRnM_#6B*9eNDx8w+USgt^60hmwx5m+2OW^6ll}d@H zLHi?bwo+&{5Uiu04X=cluBxn7R~c_u_*mcpmS};@(zukU4{l|R;@O@7@nCRs-O9Nk zofdx`+*}J&y@*|z;sq-=#Y;T75C~mitVf7&H{oA~KWe~8uTC7OsuTMo^|2U8-FB=A z-YfEv3q~E(Ma3JDlTFeNx%?h5_d`@OD}}5$6zy!;1JZEv!zIgJZY@ODE_^b&kr~}+ zuM#iDdOB|E%ElJ$D^)I9eC5DHqOR%)ONw08*-m&MqjNd(9%%Y7;9kHo=YTve{;}$i zA&pEfbjH8VnsD4=C;n_?*Z-r1B03prS13CdAKsz7}*b2CrB3w>;hBnJo3n*baK$Zfb2 ztsnsF;h1@uHBo7p%%=_FcwM_z%QuLR1y)BfVt{^j8x!+^jpC)c>wQ+z3XP~uN|R!( zFeX8K9PY&F|RCMB-cH?g(ioAv+JzLC^s z7?+WJ(V^4zg67M(Hvfk{k4|0#-%9c!d( zH7_>iOq3`B%Z^1ZT8!^!8g-W138lRNhW|a*J8(6e(#xj=^$wWT4lH zFDL7aD40XyrDU>r8qK=^X8>mbNHst1LVn83f+5$0Fw2G~^2Kt_vF>yDVe}xoyvW$4 zNsE_?<#X>pVyORZam$)BbXq*V=ATm8c(Gh4O_^41@ksK0=sW4Mc)WR6LaysFlwSc1 z2(LM{MJCZFL4OJ$lTsS6BJLlc-u8v|mOB|`-^ElX+b*uBGOSnpU8;E)nmxmIc(FX{ 
z=_AFmi4tI5;zyx#fFQHDYB|$qAg+@~Cq2!{kNP%#4x)1U6*E6Z9X$)j*8Yxdg1U_A zsa@TkNhggj}Ukv zcp;ZBm1g;uz{CmaX_J*nKg*XK6ZgYGItH8W>zIc)D`vxTH)&5t`E%&}6YwLx0XdqU zKMn3T!3`9wQpq=G`Rc+Dc#zR&W0~^byMppRYs7< zCi#o#m3mVU3HOa!W$V64-hW+2k@%~PJ*F2b6rDT*$qUgTZ}D=zUUoExU_I1wW&{KlRtqz3G1a$3%(~ojW_; zJDUIerm~x(UhqfNbx!R0--Tb1!lZ>8V4XO(iZt~f!5YU`z=Ou>JQHI%!RA*EY! z!t;@9q_I(%(OhjldQCC4b5zYqCN`wF+IW?z6Fp6IoalXE1c>pdZ-5jqi2gj$)r$jy zL_bY*f(%r~YlajvG^Aw0;=i`E%yRN4)o0J3e6uZHOvGt@IJDjj0^q&k1j(Gt)be^F zJY(>-gSVYL&AW(@H98wYp`3DxCH^{i@-`nilP^tKw{c$KUidUsB-!Lq2n<0@l~Zx@ zrJT5Bz8M&|_%A_!ap%(LXMaj`ZB4Pu;>6a>?1OHIUv_jiG|#RnPQcaO zfMd)DX*cvMni;sF&O~XiLvzg3l8PJ6g_)FKrnr%r(D|erA`Oq5%#AcCWz~)J zDn6J0{QshL_5;+ahm?C8XJW3wJ=ZXeCS^QAQ^qjmdEz|A_@k}{bSOw4QCx%MPt8O} zm6MXNNTMEp-ROAL1w~R-q$dtN4=eL=r`nCrouF7{)Oo(1#1iv0Zq*_k zc}N*yX3z}XAA-_3HdF0Z%c*?>e8VuJ8Y5go2HO7a*6zLJVp$b9>H}rm(zf8*l~Mgd z{vn7VCdHQRNwb+6ql&BCyLyH>lofOAVdVnjy34Mr=k+1~Sfc1vizX>RrS^1G2s)(V zxP*4ZHj(U2&c-162Y_Ax_6Ya3($qxRrY|cMB(G;&K=HAC|I5T( zk*$qj;&VW} z*1IiE8~U#Pju8&2aW$cd#O|hH>=9l4TQRr|unw>u5CXIV(f~Ywm%aX%W?BlL-;l^Y zOosr*SFuEZ{|0fh@QvLew)>~VrB4(apa_x1A@2qK>S}XZ&HEP6NxaMAmwPrZ2Gc@v z?{kcGh#&8}b=HecmF=m?$qIKYlkT;0uMq{bp5H@P8Xz}PFP29tUO4Ap+;cg9z3>jd z6+pKPe}V7+g7P}LmKJ-{tq+MSeLL7|3$OO|GWLde>z0-V_;R{BKRH<{7UZ3I>m+T? 
zX0fHe;RNn5)2k2Xi#TeWN_xa{3S;TJ29Kww)xBws80PN~WD;J8TjwcW99v$Vvk9Ir z(o<)IUM%$W!I`r8bK+|M_CC2pxrD#RU>s0h__q5BmVEcgb@Tr~4;FXh@%l0t8WaEB zzouQMqtRm{9$R^-@xpRT@cr@2V*mc5{*m^&ev=-Q&U>3iOe07T zUlb&PP;hl42RGC12i^tfJWW8QYPi5v5gHiSB2O~}i!)X#4s&Fl7c5mqN2&5;t0yX^ zRkFM2)8^C!z0<_hK*u5ff*2mPoZ}=$sVsvW@gP6nu82cv!TSsrzZgg-(^PR{ETnD@ TM1qOh#H_N5ezVF@V*mdDF%B52 delta 5490 zcma)AX>eS{5q|Ub&>lKgr?om)vSev3S(1@#OCaCa*am#bv7H6ivfB3~t-RWm=RJX~ ztPx-vh=Ws@Nq{Pdk%5pHaulTy2w-d|s(@UOR0vg%q(TT#aSo0ORlpBI`kNd?}0OKkDC zRClB4He>xs-N-f&qBT=@`J0%#9dwa+KDVp+ysk*@v{}jYbvmCraIFaCZDc_)oVTk5 zPKTXbj=UP0t^(`$ywy=meQ}no7 zhM-$yihKN7n#9@S66O)F7T>W+PPzp8b`y}3GW&|a=m%puinD9^Rdi+PLURql>*CRp z&I!aUT*gD-)&c4PNK(EK-~luMWLCF+PmLox-bD_5Ya5+E*cq^zJz%T$a zXIW!ADK|%>NR>9$7xG4uK#VW=K*1{d8Wl`D;%X9qD1D$SsOS}1^}VWAqtzaEMk;lW zUZ4kvTc8&adh}AFs)C9Zy4S7MY4s#iEuJb1vKsMf*$0gaNL`q5nF?W|v#gT#!2J)z z-{;MjTd-QRmUp)=f*fK%JI;)8)3`2S4)HJCoFw$25arF4GQtikcgP$`Qi`Qaogy`z zz`rkdvYIX^T?&Ax<i2rS>wfl-|&6REwQKR&4c6ERr1 zlTLzXD&MTJ@)N0ea!k{s)5nb+K07|wi)X5qUL{?+9?FjaR$_*-XulpCGZO=2Njh^z zXS@o#Y$Lw`!EXZOPL!6-*%F7Tw_$2a^)^O3@nlU4TP%K66Q00io5Nnq(QkC|JE#%y zYI;09GMYAYX_VzQOrA&$TF!w)O2ZAH9pD==6%3n+k~9Fr=>R7CEQc8zw0*i8qSC54 z2Hj2_ol|4AAG3u}7gjB`t;13&-!2rie7(tpVZx>rJH=gfz3jWkR$~ zal<}-5B-yY;TwpuJcvRxmP}6Y@1SQBn2yo(sC_QX1|~miWsoGcj{6|a5)rRleaeAB z{zG*Bh#=y#4T#e8{1mtkL)sPBlS$j0Z121fI6|;NG|ayyfCaET{V`+alKj^EwXEz3 z2z?Czhc}aO*`}F}dEeMm{EPZEh%$v~HGh`i4so`5wfjjlgCZw< zmc1xm4EJ}u1cnsxr#7k=y%_=^mf?rbKqqfzNg@@S-;!g{^n=c&7e{){b`5}+n zLsqto^!#t(r=>6{A>X94FGFFmc)Ml0eZxN~mbZ4jw z1c@Q51L?PntMpZZ9Dgf~dQR+K)aJbr(*+5{`~HdvPDF+oK2sTHeTu10I!;o%Tsf#_ z)Jf+_s=)UuyqS7keM&|hc55oNyQ!Ts-F8rWgFRaIJE^^!+Melt7qy3|?IrEXu&+m%g=O=re4$tZftq$}geI8LgjJLAl_%$!LN#nE}p zp}CGZ4*E?s<2l4MM}?B{N4**Mq%Y$?$(WMy9rWz0pphPaLSq@yX59SUj9YUDm0>?ko{P!%5eLg6jxMg7=pp}Py}@zjQEJV`1jPn1 zWXxo^c0rbcs)TcS0`y)$5+JK5T1Ukyj>?l?Uh>hI5&$*EY)QdK(A)%Au!nG_qW;p= z*(P#2TPo(RF-n-%vBsN4y0dz@kE!y{{i*X)w>O{pm|HD#IEm*bdQP8%UR|t69k1G@ z_iv^A5jPZ;ZL)o}ymV{HL)x&@!a?)Qd_d29jUsB?q6{#4xvGaf84CF!+E-!6cv!S| 
zg~jWOE3^=a3{slVTCfZoR=5**t8%TRj;s5eefCjB73fnW8=gLDorvElmQ@hQosDa{ zLM4}xs%~;B@Y(nhai**8%C=s*d}2vs1rak8>XGMNrE_5Uc~#1Lgx30z7~QKrNsRfL$YBz#Myr!%BV_fFdUAS%CYP%iHf} zP|pI20XX_N&M)4J!@$lN90D9WI_o9z2AZ?y-t6HwP95jOgG)k1C1qH8M2Y@?lDnLb zi`!h);$KVpWhK8#+_`jKO)KARro%@>~hEm;vjED zV)YWC&~F^M^e0aDEb!5K{2QCp!^8(Y!F~joD##H1o11C>J;`P#oL&F7NK3;^I^(yM zfQ!F1mAtr-b-#ppobqAGktQ6nR*qfC89WTutV-zvGCeL{x}?>gl?CGXqVlQo-s6ll ziHDc3nTYtuQpVWmXqub4M#Yj`Oc^;Je-BFK23bXvPWH;weJhqR_L_KYWwqSC;{~zN(PSbni?Q}mTC5gQ zy`G*BZewgQb$R`cm#mK^lDZZ$)1d*~j1Psrvi*wCxO#Dj?}u&g!>oYi z;rhNY-7pRLlSm}MV+ksz2hyApCuWWrvQW2#v8t_CIxn63H57CcSl)H} Document: - """Parse source into Document""" - pass + Table schema (run this SQL in Supabase Dashboard): - @abstractmethod - def can_parse(self, source: Any) -> bool: - """Check if this parser can handle the source""" - pass - - -# ============================================================================= -# PDF PARSER -# ============================================================================= - -class PDFParser(BaseParser): - """ - PDF Parser using PyMuPDF (fitz). - Extracts text, tables, and structure. 
+ CREATE TABLE extraction_results ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + document_id TEXT, + title TEXT, + source_url TEXT, + document_type TEXT, + status TEXT, + raw_content TEXT, + content_length INT, + chunks_count INT, + chunks JSONB, + tables_count INT, + formulas_count INT, + extracted_at TIMESTAMPTZ DEFAULT NOW(), + metadata JSONB + ); + + -- Enable Row Level Security (optional) + ALTER TABLE extraction_results ENABLE ROW LEVEL SECURITY; """ - def __init__(self): - self.chunk_size = 500 # tokens approximately - self.chunk_overlap = 50 + def __init__( + self, + url: Optional[str] = None, + key: Optional[str] = None, + table: str = "extraction_results" + ): + self.url = url or os.getenv("SUPABASE_URL") + self.key = key or os.getenv("SUPABASE_KEY") + self.table = table + self._client = None - def can_parse(self, source: Any) -> bool: - if isinstance(source, str): - return source.lower().endswith('.pdf') - return False + @property + def client(self): + if self._client is None: + from supabase import create_client + if not self.url or not self.key: + raise ValueError("Set SUPABASE_URL and SUPABASE_KEY env vars") + self._client = create_client(self.url, self.key) + return self._client - def parse(self, source: str, source_url: str = "") -> Document: - """Parse PDF file into Document""" - try: - import fitz # PyMuPDF - except ImportError: - logger.warning("PyMuPDF not installed, using fallback text extraction") - return self._fallback_parse(source, source_url) - - doc = Document( - source_url=source_url or source, - document_type=self._detect_document_type(source), - ) - - pdf = fitz.open(source) - doc.title = pdf.metadata.get('title', Path(source).stem) - - all_text = [] - all_tables = [] - - for page_num, page in enumerate(pdf, 1): - # Extract text - text = page.get_text("text") - all_text.append(text) - - # Extract tables (simplified - use camelot for production) - tables = self._extract_tables_from_page(page, page_num) - all_tables.extend(tables) 
- - pdf.close() - - # Store raw content - doc.raw_content = "\n\n".join(all_text) + def save(self, document: Document) -> Dict: + """Save extraction result to Supabase.""" + chunks_data = [ + { + "id": c.id, + "text": c.text[:5000], # Limit text size + "token_count": c.token_count, + "section": c.section_hierarchy[0] if c.section_hierarchy else None, + "tables_count": len(c.tables), + "formulas_count": len(c.formulas), + } + for c in document.chunks + ] - # Create chunks - doc.chunks = self._create_chunks( - all_text, all_tables, source_url - ) + total_tables = sum(len(c.tables) for c in document.chunks) + total_formulas = sum(len(c.formulas) for c in document.chunks) + + data = { + "document_id": document.id, + "title": document.title, + "source_url": document.source_url, + "document_type": document.document_type.value if document.document_type else "unknown", + "status": document.processing_status, + "raw_content": document.raw_content[:50000] if document.raw_content else "", # Limit size + "content_length": len(document.raw_content) if document.raw_content else 0, + "chunks_count": len(document.chunks), + "chunks": chunks_data, + "tables_count": total_tables, + "formulas_count": total_formulas, + "metadata": { + "provenance": { + "source_url": document.provenance.source_url if document.provenance else None, + "content_hash": document.provenance.content_hash if document.provenance else None, + } if document.provenance else None + } + } - # Set provenance - doc.provenance = Provenance( - source_url=source_url, - crawl_date=datetime.now(), - content_hash=hashlib.sha256(doc.raw_content.encode()).hexdigest()[:16] - ) + result = self.client.table(self.table).insert(data).execute() - doc.processing_status = "completed" - return doc + if result.data: + print(f"☁️ Saved to Supabase: {result.data[0].get('id')}") + return result.data[0] + return {} - def _detect_document_type(self, source: str) -> DocumentType: - """Detect document type from filename""" - source_lower = 
source.lower() - if '10-k' in source_lower or '10k' in source_lower: - return DocumentType.SEC_10K - elif '10-q' in source_lower or '10q' in source_lower: - return DocumentType.SEC_10Q - elif '8-k' in source_lower or '8k' in source_lower: - return DocumentType.SEC_8K - elif 'earnings' in source_lower: - return DocumentType.EARNINGS_CALL - return DocumentType.PDF + def list_all(self, limit: int = 50) -> List[Dict]: + """List recent extraction results.""" + result = self.client.table(self.table)\ + .select("id, title, source_url, document_type, status, chunks_count, content_length, extracted_at")\ + .order("extracted_at", desc=True)\ + .limit(limit)\ + .execute() + return result.data or [] - def _extract_tables_from_page(self, page, page_num: int) -> List[Table]: - """Extract tables from PDF page (simplified)""" - # In production, use camelot or tabula - tables = [] - # Placeholder for table extraction - return tables - - def _create_chunks( - self, - texts: List[str], - tables: List[Table], - source_url: str - ) -> List[Chunk]: - """Create chunks from extracted content""" - chunks = [] - - for page_num, text in enumerate(texts, 1): - # Split into paragraphs - paragraphs = text.split('\n\n') - - current_chunk_text = "" - current_section = self._detect_section(text) - - for para in paragraphs: - para = para.strip() - if not para: - continue - - # Check if adding this paragraph exceeds chunk size - if len(current_chunk_text) + len(para) > self.chunk_size * 4: # ~4 chars per token - if current_chunk_text: - chunk = Chunk( - text=current_chunk_text.strip(), - position={"page": page_num}, - section_hierarchy=[current_section] if current_section else [], - provenance=Provenance( - source_url=source_url, - page_number=page_num, - section=current_section - ) - ) - chunk.token_count = len(current_chunk_text) // 4 - chunks.append(chunk) - current_chunk_text = para - else: - current_chunk_text += "\n\n" + para if current_chunk_text else para - - # Don't forget the last chunk - 
if current_chunk_text: - chunk = Chunk( - text=current_chunk_text.strip(), - position={"page": page_num}, - section_hierarchy=[current_section] if current_section else [], - provenance=Provenance( - source_url=source_url, - page_number=page_num, - section=current_section - ) - ) - chunk.token_count = len(current_chunk_text) // 4 - chunks.append(chunk) - - # Attach tables to relevant chunks - for table in tables: - if table.page_number and chunks: - # Find chunk on same page - for chunk in chunks: - if chunk.position.get("page") == table.page_number: - chunk.tables.append(table) - break - - return chunks + def get(self, result_id: str) -> Optional[Dict]: + """Get a specific result by ID.""" + result = self.client.table(self.table)\ + .select("*")\ + .eq("id", result_id)\ + .single()\ + .execute() + return result.data - def _detect_section(self, text: str) -> Optional[str]: - """Detect document section from text patterns""" - section_patterns = [ - (r"ITEM\s+1[A-Z]?\.\s*RISK\s+FACTORS", "Risk Factors"), - (r"ITEM\s+7[A-Z]?\.\s*MANAGEMENT.S\s+DISCUSSION", "MD&A"), - (r"ITEM\s+1\.\s*BUSINESS", "Business"), - (r"ITEM\s+8\.\s*FINANCIAL\s+STATEMENTS", "Financial Statements"), - (r"NOTES\s+TO\s+(CONSOLIDATED\s+)?FINANCIAL\s+STATEMENTS", "Notes to Financial Statements"), - ] - - text_upper = text.upper() - for pattern, section_name in section_patterns: - if re.search(pattern, text_upper): - return section_name - - return None + def search(self, query: str, limit: int = 10) -> List[Dict]: + """Search results by title or content.""" + result = self.client.table(self.table)\ + .select("id, title, source_url, document_type, chunks_count")\ + .ilike("title", f"%{query}%")\ + .limit(limit)\ + .execute() + return result.data or [] - def _fallback_parse(self, source: str, source_url: str) -> Document: - """Fallback parser when PyMuPDF not available""" - doc = Document( - source_url=source_url or source, - document_type=DocumentType.PDF, - processing_status="failed", - ) - return 
doc + def delete(self, result_id: str) -> bool: + """Delete a result.""" + self.client.table(self.table).delete().eq("id", result_id).execute() + return True +SUPPORTED_EXTENSIONS = { + '.pdf', '.docx', '.doc', '.pptx', '.ppt', '.xlsx', '.xls', '.csv', + '.html', '.htm', '.xml', '.epub', '.jpg', '.jpeg', '.png', + '.gif', '.bmp', '.tiff', '.webp', '.txt', '.rtf', '.md' +} -# ============================================================================= -# WEB PAGE PARSER -# ============================================================================= -class WebPageParser(BaseParser): +class MultiModalParser: """ - Web Page Parser using trafilatura/BeautifulSoup. - Extracts clean text, tables, and hyperlinks. + LlamaParse-based multi-modal document parser with Supabase storage. - Based on QuantMind's multi-modal parsing for web content. + Usage: + parser = MultiModalParser() + doc = parser.parse("report.pdf") # Parse only + doc = parser.parse("report.pdf", save=True) # Parse and save to Supabase + doc = parser.parse_url("https://...", save=True) # Parse URL and save """ - def __init__(self): - self.chunk_size = 500 - - def can_parse(self, source: Any) -> bool: - if isinstance(source, str): - return source.startswith(('http://', 'https://')) - return False + def __init__( + self, + api_key: Optional[str] = None, + result_type: str = "markdown", + use_multimodal: bool = True, + multimodal_model: str = "anthropic-sonnet-3.5", + chunk_size: int = 500, + language: str = "en", + supabase_url: Optional[str] = None, + supabase_key: Optional[str] = None, + ): + """ + Args: + api_key: LlamaCloud API key (or set LLAMA_CLOUD_API_KEY env var) + result_type: Output format - "markdown", "text", or "json" + use_multimodal: Enable vision models for complex documents + multimodal_model: Vision model name (e.g., "anthropic-sonnet-3.5", "openai-gpt4o") + chunk_size: Target chunk size in tokens + language: Document language code + supabase_url: Supabase project URL (or set SUPABASE_URL 
env var) + supabase_key: Supabase API key (or set SUPABASE_KEY env var) + """ + self.api_key = api_key or os.getenv("LLAMA_CLOUD_API_KEY") + self.result_type = result_type + self.use_multimodal = use_multimodal + self.multimodal_model = multimodal_model + self.chunk_size = chunk_size + self.language = language + self._parser = None + + # Initialize Supabase store + self._store = None + self._supabase_url = supabase_url + self._supabase_key = supabase_key - def parse(self, html_content: str, source_url: str = "") -> Document: - """Parse HTML content into Document""" - - doc = Document( - source_url=source_url, - document_type=DocumentType.WEB_PAGE, - ) - - # Extract clean text (using trafilatura if available) - main_text, metadata = self._extract_main_content(html_content) - - # Extract tables - tables = self._extract_tables(html_content) - - # Extract hyperlinks - hyperlinks = self._extract_hyperlinks(html_content, source_url) - - # Set metadata - doc.title = metadata.get('title', '') - doc.raw_content = main_text - - # Determine source tier - doc.source_tier = self._classify_source_tier(source_url) - - # Create chunks - doc.chunks = self._create_chunks( - main_text, tables, hyperlinks, source_url, doc.source_tier - ) - - # Set provenance - doc.provenance = Provenance( - source_url=source_url, - source_domain=self._extract_domain(source_url), - source_tier=doc.source_tier, - crawl_date=datetime.now(), - content_date=metadata.get('date'), - freshness=self._calculate_freshness(metadata.get('date')), - content_hash=hashlib.sha256(main_text.encode()).hexdigest()[:16] - ) - - doc.processing_status = "completed" - return doc + @property + def store(self) -> SupabaseStore: + """Lazy initialization of Supabase store.""" + if self._store is None: + self._store = SupabaseStore( + url=self._supabase_url, + key=self._supabase_key + ) + return self._store - def _extract_main_content(self, html: str) -> Tuple[str, Dict]: - """Extract main content, removing boilerplate""" - try: 
- import trafilatura - text = trafilatura.extract(html, include_comments=False, include_tables=False) - metadata = trafilatura.extract_metadata(html) - return text or "", { - 'title': metadata.title if metadata else '', - 'date': metadata.date if metadata else None, - 'author': metadata.author if metadata else None, + def _get_parser(self): + """Lazy initialization of LlamaParse""" + if self._parser is None: + try: + from llama_parse import LlamaParse + except ImportError: + raise ImportError("Install llama-parse: pip install llama-parse") + + if not self.api_key: + raise ValueError("Set LLAMA_CLOUD_API_KEY env var or pass api_key") + + kwargs = { + "api_key": self.api_key, + "result_type": self.result_type, + "language": self.language, } - except ImportError: - # Fallback to BeautifulSoup - return self._beautifulsoup_extract(html) + if self.use_multimodal: + kwargs["use_vendor_multimodal_model"] = True + kwargs["vendor_multimodal_model_name"] = self.multimodal_model + + self._parser = LlamaParse(**kwargs) + return self._parser - def _beautifulsoup_extract(self, html: str) -> Tuple[str, Dict]: - """Fallback extraction using BeautifulSoup""" + def parse(self, source: str, source_url: str = "", save: bool = False) -> Document: + """ + Parse a file path into Document. 
+ + Args: + source: File path to parse + source_url: Optional URL for provenance + save: If True, saves result to Supabase + + Returns: + Parsed Document object + """ try: - from bs4 import BeautifulSoup - soup = BeautifulSoup(html, 'html.parser') + parser = self._get_parser() + documents = parser.load_data(source) - # Remove script, style, nav, footer - for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']): - tag.decompose() + if not documents: + return Document(source_url=source_url or source, processing_status="empty") - # Get title - title = soup.title.string if soup.title else '' + text = "\n\n".join(doc.text for doc in documents) + doc = Document( + source_url=source_url or source, + document_type=self._detect_type(source), + raw_content=text, + title=Path(source).stem, + ) - # Get main text - main = soup.find('main') or soup.find('article') or soup.body - text = main.get_text(separator='\n', strip=True) if main else '' + doc.chunks = self._create_chunks(text, source_url or source) - return text, {'title': title} - except ImportError: - return html, {} - - def _extract_tables(self, html: str) -> List[Table]: - """Extract tables from HTML""" - tables = [] - try: - from bs4 import BeautifulSoup - soup = BeautifulSoup(html, 'html.parser') + # Extract tables and formulas from markdown + tables = self._extract_tables(text) + formulas = self._extract_formulas(text) + if doc.chunks: + doc.chunks[0].tables.extend(tables) + doc.chunks[0].formulas.extend(formulas) - for i, table_tag in enumerate(soup.find_all('table')): - headers = [] - rows = [] - - # Get headers - header_row = table_tag.find('thead') - if header_row: - headers = [th.get_text(strip=True) for th in header_row.find_all(['th', 'td'])] - - # Get rows - tbody = table_tag.find('tbody') or table_tag - for tr in tbody.find_all('tr'): - cells = [td.get_text(strip=True) for td in tr.find_all(['td', 'th'])] - if cells and cells != headers: - rows.append(cells) - - if headers or rows: - # Use 
first row as headers if no header found - if not headers and rows: - headers = rows.pop(0) - - caption = table_tag.find('caption') - tables.append(Table( - headers=headers, - rows=rows, - caption=caption.get_text(strip=True) if caption else f"Table {i+1}" - )) - except ImportError: - pass - - return tables + doc.provenance = Provenance( + source_url=source_url or source, + crawl_date=datetime.now(), + content_hash=hashlib.sha256(text.encode()).hexdigest()[:16] + ) + doc.processing_status = "completed" + + # Save to Supabase if requested + if save: + try: + self.store.save(doc) + except Exception as e: + logger.warning(f"Failed to save to Supabase: {e}") + + return doc + + except Exception as e: + logger.error(f"Parse failed: {e}") + return Document(source_url=source_url or source, processing_status="failed", raw_content=str(e)) - def _extract_hyperlinks(self, html: str, base_url: str) -> List[Hyperlink]: - """Extract and classify hyperlinks""" - hyperlinks = [] + def parse_url(self, url: str, save: bool = False) -> Document: + """Fetch and parse a URL.""" try: - from bs4 import BeautifulSoup - from urllib.parse import urljoin, urlparse + import requests + import tempfile - soup = BeautifulSoup(html, 'html.parser') - base_domain = urlparse(base_url).netloc + response = requests.get(url, timeout=30) + response.raise_for_status() - for a in soup.find_all('a', href=True): - href = a['href'] - - # Make absolute URL - if not href.startswith(('http://', 'https://')): - href = urljoin(base_url, href) - - # Get surrounding context - parent = a.parent - context = parent.get_text(strip=True)[:100] if parent else "" - - # Classify link type - link_type = self._classify_link(a, href, base_domain) + # Determine file extension + ext = '.html' + for e in ['.pdf', '.docx', '.pptx', '.xlsx']: + if url.lower().endswith(e) or e[1:] in response.headers.get('content-type', ''): + ext = e + break + + with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as f: + 
f.write(response.content if ext != '.html' else response.text.encode()) + return self.parse(f.name, url, save=save) - hyperlinks.append(Hyperlink( - url=href, - anchor_text=a.get_text(strip=True), - surrounding_context=context, - link_type=link_type, - )) - except ImportError: - pass - - return hyperlinks - - def _classify_link(self, a_tag, href: str, base_domain: str) -> str: - """Classify link type""" - from urllib.parse import urlparse - - href_domain = urlparse(href).netloc - - # Check for navigation links - if a_tag.parent and a_tag.parent.name in ['nav', 'header', 'footer']: - return "navigation" - - # Check for advertisement - if 'sponsor' in href.lower() or 'ad' in a_tag.get('class', []): - return "advertisement" - - # Check if citation (links to research/sources) - citation_patterns = ['doi.org', 'arxiv.org', 'sec.gov', 'scholar.google'] - if any(p in href.lower() for p in citation_patterns): - return "citation" - - # Check if same domain (internal) - if href_domain == base_domain: - return "internal" - - return "reference" - - def _classify_source_tier(self, url: str) -> SourceTier: - """Classify source authority tier""" - url_lower = url.lower() - - # Tier 1: Authoritative - tier1_patterns = ['sec.gov', 'investor.', 'ir.', '.gov', 'federalreserve.gov'] - if any(p in url_lower for p in tier1_patterns): - return SourceTier.TIER_1_AUTHORITATIVE - - # Tier 2: Verified reputable - tier2_patterns = ['reuters.com', 'bloomberg.com', 'wsj.com', 'ft.com', - 'nytimes.com', 'nature.com', 'arxiv.org'] - if any(p in url_lower for p in tier2_patterns): - return SourceTier.TIER_2_VERIFIED - - # Tier 3: Secondary - tier3_patterns = ['wikipedia.org', 'investopedia.com', 'seekingalpha.com'] - if any(p in url_lower for p in tier3_patterns): - return SourceTier.TIER_3_SECONDARY - - # Default to unverified - return SourceTier.TIER_4_UNVERIFIED - - def _extract_domain(self, url: str) -> str: - """Extract domain from URL""" - from urllib.parse import urlparse - return 
urlparse(url).netloc + except Exception as e: + logger.error(f"URL fetch failed: {e}") + return Document(source_url=url, processing_status="failed") - def _calculate_freshness(self, content_date: Optional[datetime]) -> ContentFreshness: - """Calculate content freshness""" - if not content_date: - return ContentFreshness.RECENT - - age = datetime.now() - content_date - - if age.total_seconds() < 3600: # < 1 hour - return ContentFreshness.REALTIME - elif age.days < 1: - return ContentFreshness.FRESH - elif age.days < 7: - return ContentFreshness.RECENT - elif age.days < 30: - return ContentFreshness.DATED - elif age.days < 365: - return ContentFreshness.STALE - else: - return ContentFreshness.ARCHIVED + def _detect_type(self, source: str) -> DocumentType: + """Detect document type from filename.""" + s = source.lower() + if '10-k' in s or '10k' in s: return DocumentType.SEC_10K + if '10-q' in s or '10q' in s: return DocumentType.SEC_10Q + if '8-k' in s or '8k' in s: return DocumentType.SEC_8K + if 'earnings' in s: return DocumentType.EARNINGS_CALL + + ext = Path(source.split('?')[0]).suffix.lower() + if ext in {'.xlsx', '.xls', '.csv'}: return DocumentType.CSV + if ext in {'.html', '.htm'}: return DocumentType.WEB_PAGE + return DocumentType.PDF - def _create_chunks( - self, - text: str, - tables: List[Table], - hyperlinks: List[Hyperlink], - source_url: str, - source_tier: SourceTier - ) -> List[Chunk]: - """Create chunks from web content""" + def _create_chunks(self, text: str, source_url: str) -> List[Chunk]: + """Create chunks from parsed content.""" chunks = [] + sections = re.split(r'\n(?=#{1,6}\s)', text) - # Split by paragraphs - paragraphs = text.split('\n\n') - - current_chunk_text = "" - current_links = [] - - for para in paragraphs: - para = para.strip() - if not para: + for section in sections: + section = section.strip() + if not section: continue - # Check if adding this paragraph exceeds chunk size - if len(current_chunk_text) + len(para) > 
self.chunk_size * 4: - if current_chunk_text: - chunk = Chunk( - text=current_chunk_text.strip(), - hyperlinks=self._find_links_in_text(current_chunk_text, hyperlinks), - provenance=Provenance( - source_url=source_url, - source_tier=source_tier, - crawl_date=datetime.now() - ) - ) - chunk.token_count = len(current_chunk_text) // 4 - chunks.append(chunk) - current_chunk_text = para + header = None + match = re.match(r'^#{1,6}\s+(.+?)$', section, re.MULTILINE) + if match: + header = match.group(1).strip() + + # Split large sections + if len(section) > self.chunk_size * 4: + current = "" + for para in section.split('\n\n'): + if len(current) + len(para) > self.chunk_size * 4: + if current: + chunks.append(self._make_chunk(current, header, source_url)) + current = para + else: + current = f"{current}\n\n{para}" if current else para + if current: + chunks.append(self._make_chunk(current, header, source_url)) else: - current_chunk_text += "\n\n" + para if current_chunk_text else para - - # Last chunk - if current_chunk_text: - chunk = Chunk( - text=current_chunk_text.strip(), - hyperlinks=self._find_links_in_text(current_chunk_text, hyperlinks), - provenance=Provenance( - source_url=source_url, - source_tier=source_tier, - crawl_date=datetime.now() - ) - ) - chunk.token_count = len(current_chunk_text) // 4 - chunks.append(chunk) - - # Distribute tables across chunks - for table in tables: - if chunks: - chunks[0].tables.append(table) + chunks.append(self._make_chunk(section, header, source_url)) return chunks - def _find_links_in_text(self, text: str, all_links: List[Hyperlink]) -> List[Hyperlink]: - """Find which hyperlinks appear in this text""" - return [ - link for link in all_links - if link.anchor_text and link.anchor_text in text - ] - - -# ============================================================================= -# CSV PARSER -# ============================================================================= - -class CSVParser(BaseParser): - """Parser for 
CSV/Excel files""" - - def can_parse(self, source: Any) -> bool: - if isinstance(source, str): - return source.lower().endswith(('.csv', '.xlsx', '.xls')) - return False - - def parse(self, source: str, source_url: str = "") -> Document: - """Parse CSV/Excel file""" - try: - import pandas as pd - except ImportError: - logger.error("pandas not installed") - return Document(processing_status="failed") - - doc = Document( - source_url=source_url or source, - document_type=DocumentType.CSV, - ) - - # Read file - if source.lower().endswith('.csv'): - df = pd.read_csv(source) - else: - df = pd.read_excel(source) - - doc.title = Path(source).stem - - # Create table - table = Table( - headers=df.columns.tolist(), - rows=df.values.tolist(), - caption=doc.title - ) - - # Create single chunk with table + def _make_chunk(self, text: str, header: Optional[str], source_url: str) -> Chunk: + """Create a single chunk.""" chunk = Chunk( - text=f"Data table with {len(df)} rows and {len(df.columns)} columns.", - tables=[table], - provenance=Provenance( - source_url=source_url, - crawl_date=datetime.now() - ) + text=text.strip(), + section_hierarchy=[header] if header else [], + provenance=Provenance(source_url=source_url, section=header, crawl_date=datetime.now()) ) - - doc.chunks = [chunk] - doc.raw_content = df.to_string() - doc.processing_status = "completed" - - return doc - - -# ============================================================================= -# MULTI-MODAL PARSER (Main Interface) -# ============================================================================= - -class MultiModalParser: - """ - Main parser interface that routes to appropriate parser. - Implements QuantMind's P(D) = {T, F, M, S} extraction. 
- """ + chunk.token_count = len(text) // 4 + return chunk - def __init__(self): - self.parsers = [ - PDFParser(), - WebPageParser(), - CSVParser(), - ] - - def parse(self, source: Any, source_url: str = "") -> Document: - """ - Parse any supported source into a Document. + def _extract_tables(self, text: str) -> List[Table]: + """Extract markdown tables.""" + tables = [] + pattern = r'(\|.+\|[\r\n]+\|[-:| ]+\|[\r\n]+(?:\|.+\|[\r\n]*)+)' - Args: - source: File path or URL or HTML content - source_url: Original URL if source is content + for i, match in enumerate(re.finditer(pattern, text)): + raw = match.group(1) + rows = [r.strip() for r in raw.strip().split('\n') if r.strip()] + if len(rows) < 2: + continue - Returns: - Document with extracted content - """ - # Find appropriate parser - for parser in self.parsers: - if parser.can_parse(source): - logger.info(f"Using {parser.__class__.__name__} for {source[:50]}...") - return parser.parse(source, source_url) + headers = [c.strip() for c in rows[0].split('|') if c.strip()] + data = [[c.strip() for c in r.split('|') if c.strip()] for r in rows[2:]] + + tables.append(Table(headers=headers, rows=data, caption=f"Table {i+1}")) - # Default handling - logger.warning(f"No parser found for source type, creating empty document") - return Document( - source_url=source_url or str(source), - processing_status="unsupported" - ) + return tables - def parse_url(self, url: str) -> Document: - """Fetch and parse a URL""" - try: - import requests - response = requests.get(url, timeout=30) - response.raise_for_status() - - # Check content type - content_type = response.headers.get('content-type', '') - - if 'application/pdf' in content_type: - # Save PDF temporarily and parse - import tempfile - with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as f: - f.write(response.content) - return self.parsers[0].parse(f.name, url) - else: - # Parse as HTML - return self.parsers[1].parse(response.text, url) - - except Exception as e: 
- logger.error(f"Failed to fetch URL {url}: {e}") - return Document( - source_url=url, - processing_status="failed" - ) + def _extract_formulas(self, text: str) -> List[Formula]: + """Extract LaTeX formulas.""" + formulas = [] + + # Block formulas: $$...$$ + for m in re.finditer(r'\$\$([^$]+)\$\$', text): + formulas.append(Formula(latex=m.group(1).strip(), description="block")) + + # Inline formulas: $...$ (skip prices like $100) + for m in re.finditer(r'(?2P>^Sl|L$D6ofo$$g6p>WGLhTiZwPO#Zv96%ByzA*(<=D)wyO2Q+2yYf zAN1|_3){iQCShynI4p8I1FeDfuE0$TTPZJOD`(c&1Ik7>AU zW{)|8eqURt<$zx}nu7k0W|0&0H~0e~;b`=Q{EfoV5%RV9h0_;oXqIL)&RUNrjmImw znp>t+@?E)i+}V%t8fRaznZ;gy(^;W-xp%QT^Q*H{ac3*>_Pieo9+oe8?5qI~`uG;t z3YVd#@($OoiR=fVX#@5FH4ID8B>;ce&bicg%Y#82 z?vWTP4#u>M$;0u{Eqk=cNdG`&FRO(DIakMjEZFHH>zc5+pO+O@E+8N*0PPMS1atxi z0TWNzAsieAx_}(O%-5hiIBK!*uL?_*AUx)O7XDH{zr!h9NDdcXbe)+YZG4lSODrG$+oPuydw%8wQ8 zTX1;V_ zWc@w@i5j=pF%v4|kQWAI!U=qUKpG<0uYQoG2 zjh`tlQ#5L*&x@BU#dY{n!6=K5MA)gYo*a!wSuOv0S+8k09>+U|0W>El(I;>~i9QMG z6hMI)gY*PIAvz6-LPTw0W@1{ZydnGG%aRf$O@{tnvQv>!*;Kk)spZ3^l}aDKSo)Ci zDF3GP+De4fW2L%fFW`3LPL-covDeg(C!uKgF@YsbqV20^ ztOTlnXMjnd4;TcV1z;F^-XiB1dkLap7fCeV#(8hq24$SQyjJG2w#{GB-{G!35@Ur8 z!|B9v(|U%^B>zLbp&a2;tET;7x#7uS7qQQQqX4xd^7Hvgy|{a&@oKzv0CMm z5Bf}_No7D2)d(p?gxoSt4IZ-e$s~y%)jmD_b0>Z^2C&JNWLYKFQoaiwBYfZ6>oFT8L zxLQ$TKO(=*f2#1ThHbp5y)+!rYiwz9LIur3Z6G>ezoKbXLI6jbx2Svf?D=KGHa;S-2ZXaP|;q$T?!S|-sa^-Q%=YEaODOyQzjn;7HAD!0q|cCB)HqE7C3+?fe; zOUnQKEc+NL)N;2W8C67Ga}|4d!?P&!ClKXyE+5d1zS-L-cXQrm6Sk<9Vl%k@Q@}`% z%9V{h|8Hy*#aR~3>=2{M>c%o_iev>LZIJ{1o={N98B>IULPeW3HW;qfflnla zE2>3OsgamIoE%_O(9<~k3^4p9iOTUKd9YE|SZb4ncUO6xSbJ2qgrBc!JNhENwF@90 isgLNFiWhriVGt6zNAuB`bK^KL48Xnj8NhdM3jPgzZ=7-f delta 3460 zcmb7`TTIhe9LIZlfsO(eq#$AkiXat`qHbWkQJ@F|i zHnzyweN&t+CO$0-PkUIFY}xZJZ~JG@%d!{uuq@fblAZ6bxWB_c$|QXJopXLYzxyfk zqw}K$k)omkll*t)#^(hI3sZz@>8w)nhx}dPR&W1(;i!^_o%Vi3{9tLUSL`GH&hQc6 zA-`fd(%qw2x`vKv`NM(!z~D&Wu8Eh)3whbn7HfcQUkBL8Dt@1ZpC}C$-tCi(`c)7T0bqjBvP>hV( 
zgMQyYsPC{}vGoM~L%mvF(BI_`gcMu1FXZo5Y(pX6fM2ouf?d7RjqYXdVNy5l7Vi8i z<`#SM?>Sc*5$*OdmyyTS;;y}xRg0EV_tJ0nVdh+^BnAt9EUK55JXXF723y4r$3};) z<`P4WJ+mtfLem1Y01Xn_x^Os>PG{7Jn#iQH;V^H-Q8j>^@tpunXx3yRtMc7gTWLL> zh^hQ3tZhI((1sJ+L^2amdHa*t*$?QBkwj&w6Q07&td4@5EUaY##KX6(rrszICUDp! zG2;rxlR1|s6H`0(B_pZmc;oI2XT9M|)HfD$kuCC!pZt9%%HM)~NK2UGGlTP#svwz6{hvoW7JnbXXBWrFVY zJw7>CaXTf)*^}X^>11SOdZ6GHC)e*>b7aTykx@VaD4q^T&jQrvM)8cu)64Rgo|s%b zg{)A9!*LQbiQT0egtu&gdBrzn6C*}*`5X*jBy%13JftuX0iqI`HJZsJc?@eKC&t8_ ztBQ@$Kz-rbz+7kyt!RoTQW1VWtY)SXG2SeGDOU{BF@bw^2k4!kLZ8F|6?zg<0-(g4 zf^-__0FnTuh{nRm#kf@YNY24m6%{N_j{Z^6#$;BSD)+KxFT`VBJtz6u6 zMyh+v)VKT`ZrAU0iMfpj4C6>cQI8{oCB6dqCIDT>UjQfse1*}Is9Dl-AUKw8fZi?| zKNU;#G(QGR18|I=HAzkWB1GLUl3dtJ38iWqOHq`IRSxq&J$4`YzI_i3>geLG$?5cI z!+wU2D*kleVTZ-_CvW)&<%TDZzl8l)fFl5nBhBAYoZlb*sR&oInoY%H@_^Saak1KG zm`!Q}dQtV1Qbx$$%P>WWq)s98)G56S~3MnM|lMlju3EXna{ate&5WQ>O(q2alW$$1-VE-lFBjCdJ=dTn-9`qMjG-+IH(< z`El`ct!FcwlMNq4B<})y0X-&{xLq6lj~3nXkXHRxsr{t zQnYTsyoMTj|E*zpzZ~pjeEej5IGtFvkX)69HG2$G7fy`l@NJmYjDS{rGMR}+lDR^kQ*+%)r9ndna*d02?cq^zs$sW$-xeEg&3fgI$4R#N zqLd$PmfwU5joc@Y^e&>Qq2YiH&$`^7KvdJ!V!$-|mT#llt$Le9v&52Fej6q~1*j+l zmtfmR|KGErhqwnMQ*|cJqdA>Y^VoPK`p1hB9s(rS|2{A_hJ;wap0DO=7=RD z*=#DVPG-jW9JUvLJAfW max_chars: + print(f"\n... 
[{len(result) - max_chars} more characters]") + print("\n💡 Use --full to see complete output") + print("💡 Use --save to save to file") + + print("-" * 60) + return result + return None + + +def parse_audio(file_path: str) -> str: + """Transcribe audio using OpenAI Whisper API.""" + try: + from openai import OpenAI + client = OpenAI() # Uses OPENAI_API_KEY from .env + + with open(file_path, "rb") as audio_file: + print("🎤 Transcribing audio with Whisper...") + transcript = client.audio.transcriptions.create( + model="whisper-1", + file=audio_file + ) + return transcript.text + except ImportError: + return "Error: pip install openai" + except Exception as e: + return f"Error: {e}" + + +def parse_file(file_path: str, full: bool = False, save: str = None, store: bool = False): + """Parse any supported file format.""" + path = Path(file_path) + ext = path.suffix.lower() + + print(f"📁 File: {file_path}") + print(f"📎 Type: {ext}\n") + + # Handle audio files separately + if ext in AUDIO_EXTENSIONS: + print("🎵 Audio file detected - using Whisper transcription\n") + text = parse_audio(file_path) + print("=" * 60) + print("TRANSCRIPTION:") + print("=" * 60) + if full: + print(text) + else: + print(text[:2000] if len(text) > 2000 else text) + print(f"\n✅ Total characters: {len(text)}") + if save: + with open(save, 'w', encoding='utf-8') as f: + f.write(text) + print(f"💾 Saved to: {save}") + return + + # Use LlamaParse for documents (save=True saves to Supabase) + parser = MultiModalParser(use_multimodal=False) + print("📄 Using LlamaParse...\n") + + doc = parser.parse(file_path, save=store) + + print("=" * 60) + print(f"Title: {doc.title}") + print(f"Status: {doc.processing_status}") + print(f"Type: {doc.document_type}") + print(f"Chunks: {len(doc.chunks)}") + print("=" * 60) + + if doc.processing_status == "completed": + # Save to file if requested + if save: + with open(save, 'w', encoding='utf-8') as f: + f.write(doc.raw_content) + print(f"💾 Saved to: {save}") + + 
print(f"\n📄 CONTENT{'' if full else ' PREVIEW'}:") + print("-" * 40) + if full: + print(doc.raw_content) + else: + preview = doc.raw_content[:2000] if doc.raw_content else "No content" + print(preview) + if len(doc.raw_content) > 2000: + print(f"\n... [{len(doc.raw_content) - 2000} more characters]") + print("\n💡 Use --full to see complete output") + print("💡 Use --save to save to file") + print("💡 Use --store to save to Supabase") + print("-" * 40) + + # Tables + total_tables = sum(len(c.tables) for c in doc.chunks) + if total_tables: + print(f"\n📊 TABLES: {total_tables}") + for chunk in doc.chunks: + for t in chunk.tables: + print(f" - {t.caption}: {len(t.headers)} cols, {len(t.rows)} rows") + + # Sections + sections = [c.section_hierarchy[0] for c in doc.chunks if c.section_hierarchy] + if sections: + print(f"\n📑 SECTIONS:") + for s in sections[:10]: + print(f" - {s}") + + print(f"\n✅ Total content: {len(doc.raw_content)} characters") + else: + print(f"\n❌ Failed: {doc.raw_content}") + + +def get_supabase_store(): + """Get Supabase store instance.""" + try: + return SupabaseStore() + except Exception as e: + print(f"❌ Supabase connection failed: {e}") + print(" Make sure SUPABASE_URL and SUPABASE_KEY are set in .env") + return None + + +def list_stored_results(): + """List all stored extraction results from Supabase.""" + store = get_supabase_store() + if not store: + return + + try: + results = store.list_all() + except Exception as e: + print(f"❌ Failed to fetch results: {e}") + return + + if not results: + print("📭 No stored results found in Supabase") + return + + print("=" * 60) + print(f"☁️ SUPABASE EXTRACTION RESULTS ({len(results)})") + print("=" * 60) + + for r in results: + status = "✅" if r.get('status') == 'completed' else "❌" + result_id = str(r.get('id', ''))[:8] + print(f"\n{status} {result_id}...") + print(f" Title: {r.get('title')}") + print(f" Type: {r.get('document_type')}") + print(f" Chunks: {r.get('chunks_count')} | 
{r.get('content_length')} chars") + print(f" Extracted: {r.get('extracted_at')}") + + print("\n" + "=" * 60) + print("💡 View in Supabase Dashboard: Table Editor > extraction_results") + + +def view_stored_result(result_id: str, full: bool = False): + """View a stored extraction result from Supabase.""" + store = get_supabase_store() + if not store: + return + + try: + data = store.get(result_id) + except Exception as e: + print(f"❌ Failed to fetch result: {e}") + return + + if not data: + print(f"❌ Result not found: {result_id}") + return + + print("=" * 60) + print(f"☁️ SUPABASE RESULT: {result_id[:8]}...") + print("=" * 60) + print(f"Title: {data.get('title')}") + print(f"Source: {data.get('source_url')}") + print(f"Type: {data.get('document_type')}") + print(f"Status: {data.get('status')}") + print(f"Chunks: {data.get('chunks_count')}") + print(f"Tables: {data.get('tables_count')}") + print(f"Formulas: {data.get('formulas_count')}") + print(f"Content Length: {data.get('content_length')} chars") + print(f"Extracted: {data.get('extracted_at')}") + print("-" * 60) + + # Show chunks summary + chunks = data.get('chunks', []) + if chunks: + print(f"\n📑 CHUNKS ({len(chunks)}):") + for i, chunk in enumerate(chunks[:5]): + section = chunk.get('section') or 'No section' + print(f" [{i+1}] {section} - {chunk.get('token_count', 0)} tokens") + if len(chunks) > 5: + print(f" ... and {len(chunks) - 5} more chunks") + + # Show content + if full: + print(f"\n📄 CONTENT:") + print("-" * 40) + content = data.get('raw_content', '') + print(content[:5000] if len(content) > 5000 else content) + if len(content) > 5000: + print(f"\n... 
[{len(content) - 5000} more characters]") + + print("=" * 60) + + +if __name__ == "__main__": + # Parse arguments + args = sys.argv[1:] + full_output = "--full" in args + store_result = "--store" in args + save_file = None + + if "--save" in args: + save_idx = args.index("--save") + if save_idx + 1 < len(args): + save_file = args[save_idx + 1] + args = [a for i, a in enumerate(args) if i != save_idx and i != save_idx + 1] + + args = [a for a in args if a not in ["--full", "--store"]] + + if args: + arg = args[0] + + # Commands for Supabase stored results + if arg == "--list": + list_stored_results() + elif arg == "--view" and len(args) > 1: + view_stored_result(args[1], full=full_output) + # Check if it's a job ID (UUID format) or --job flag + elif arg == "--job" and len(args) > 1: + job_id = args[1] + display_job_result(job_id, full=full_output, save=save_file) + elif len(arg) == 36 and arg.count('-') == 4: + # Looks like a UUID - could be job ID or Supabase result ID + # Try Supabase first, then LlamaCloud + store = get_supabase_store() + if store: + try: + result = store.get(arg) + if result: + view_stored_result(arg, full=full_output) + else: + display_job_result(arg, full=full_output, save=save_file) + except: + display_job_result(arg, full=full_output, save=save_file) + else: + display_job_result(arg, full=full_output, save=save_file) + else: + # Treat as file path + parse_file(arg, full=full_output, save=save_file, store=store_result) + else: + # Default: look for common test files + test_dir = Path(__file__).parent + candidates = list(test_dir.glob("*.pdf")) + list(test_dir.glob("*.mp3")) + if candidates: + file_path = str(candidates[0]) + parse_file(file_path, full=full_output, save=save_file, store=store_result) + else: + print("Usage:") + print(" python test_extractor.py # Parse a file") + print(" python test_extractor.py --store # Parse and save to Supabase") + print(" python test_extractor.py --full # Show full output") + print(" python 
test_extractor.py --save out.md # Save to markdown file") + print(" python test_extractor.py # Fetch job from LlamaCloud") + print("\nSupabase Cloud Storage:") + print(" python test_extractor.py --list # List all results in Supabase") + print(" python test_extractor.py --view # View result by UUID") + print(" python test_extractor.py --view --full # View full content") + print("\nSupported file formats:") + print(" Documents: .pdf, .docx, .pptx, .xlsx, .html, .txt, .md") + print(" Images: .jpg, .png, .gif, .webp") + print(" Audio: .mp3, .wav, .m4a, .flac") + print("\nExamples:") + print(" python test_extractor.py report.pdf --store") + print(" python test_extractor.py --list") + print(" python test_extractor.py --view a1b2c3d4-... --full") + sys.exit(1) diff --git a/tests/test_models.py b/tests/test_models.py new file mode 100644 index 0000000..e4bfe45 --- /dev/null +++ b/tests/test_models.py @@ -0,0 +1,583 @@ +""" +Tests for Data Models (document.py & query.py) + +测试内容: +1. 数据类能正确实例化 +2. 默认值是否正确 +3. 嵌套关系是否正常 +4. 
方法是否正常工作 +""" + +import pytest +from datetime import datetime + +# 导入 document.py 中的模型 +from src.models.document import ( + # Enums + DocumentType, SourceTier, ContentFreshness, EntityType, + # Dataclasses + Entity, Relationship, Table, Formula, Hyperlink, + DomainTags, Provenance, Chunk, Document, KnowledgeUnit +) + +# 导入 query.py 中的模型 +from src.models.query import ( + # Enums + QueryType, RetrievalStrategy, VerificationLevel, + # Dataclasses + QueryFilters, QuerySettings, QueryRequest, ParsedQuery, + RetrievalHop, RetrievalResult, Citation, VerificationResult, + QueryResponse, ContextPack +) + + +# ============================================================================= +# DOCUMENT.PY 测试 +# ============================================================================= + +class TestEnums: + """测试枚举类型""" + + def test_document_type_values(self): + """测试 DocumentType 枚举值""" + assert DocumentType.SEC_10K.value == "10-K" + assert DocumentType.SEC_10Q.value == "10-Q" + assert DocumentType.EARNINGS_CALL.value == "earnings_call" + assert DocumentType.PDF.value == "pdf" + + def test_source_tier_ordering(self): + """测试 SourceTier 可信度排序""" + # Tier 值越小越可信 + assert SourceTier.TIER_1_AUTHORITATIVE.value < SourceTier.TIER_2_VERIFIED.value + assert SourceTier.TIER_2_VERIFIED.value < SourceTier.TIER_3_SECONDARY.value + assert SourceTier.TIER_4_UNVERIFIED.value < SourceTier.TIER_5_BLACKLISTED.value + + def test_entity_type(self): + """测试 EntityType 枚举""" + assert EntityType.COMPANY.value == "company" + assert EntityType.METRIC.value == "metric" + assert EntityType.MONEY.value == "money" + + +class TestEntity: + """测试 Entity 数据类""" + + def test_entity_creation(self): + """测试创建 Entity""" + entity = Entity( + name="Apple Inc.", + type=EntityType.COMPANY, + confidence=0.95 + ) + assert entity.name == "Apple Inc." 
+ assert entity.type == EntityType.COMPANY + assert entity.confidence == 0.95 + + def test_entity_default_values(self): + """测试 Entity 默认值""" + entity = Entity() + assert entity.name == "" + assert entity.type == EntityType.COMPANY # 默认是公司 + assert entity.confidence == 1.0 + assert entity.aliases == [] + + def test_entity_normalized_name(self): + """测试 __post_init__ 自动设置 normalized_name""" + entity = Entity(name="Apple Inc.") + assert entity.normalized_name == "apple inc." # 自动小写化 + + +class TestTable: + """测试 Table 数据类""" + + def test_table_creation(self): + """测试创建 Table""" + table = Table( + headers=["Metric", "FY2024", "FY2023"], + rows=[ + ["Revenue", "$383B", "$350B"], + ["Net Income", "$97B", "$90B"] + ], + caption="Financial Summary" + ) + assert len(table.headers) == 3 + assert len(table.rows) == 2 + + def test_table_to_markdown(self): + """测试 to_markdown() 方法""" + table = Table( + headers=["Name", "Value"], + rows=[["Revenue", "$100"]] + ) + md = table.to_markdown() + assert "| Name | Value |" in md + assert "| --- | --- |" in md + assert "| Revenue | $100 |" in md + + def test_empty_table_to_markdown(self): + """测试空表格的 markdown 输出""" + table = Table() + assert table.to_markdown() == "" + + +class TestProvenance: + """测试 Provenance 数据类""" + + def test_provenance_creation(self): + """测试创建 Provenance""" + prov = Provenance( + source_url="https://sec.gov/10k.html", + source_tier=SourceTier.TIER_1_AUTHORITATIVE, + page_number=42, + section="Risk Factors" + ) + assert prov.source_url == "https://sec.gov/10k.html" + assert prov.source_tier == SourceTier.TIER_1_AUTHORITATIVE + assert prov.page_number == 42 + + def test_calculate_hash(self): + """测试 calculate_hash() 方法""" + prov = Provenance() + hash1 = prov.calculate_hash("Hello World") + hash2 = prov.calculate_hash("Hello World") + hash3 = prov.calculate_hash("Different Content") + + assert hash1 == hash2 # 相同内容,相同 hash + assert hash1 != hash3 # 不同内容,不同 hash + assert len(hash1) == 16 # hash 长度为 16 + + +class 
TestChunk: + """测试 Chunk 数据类""" + + def test_chunk_creation(self): + """测试创建 Chunk""" + chunk = Chunk( + text="Apple reported revenue of $383 billion.", + document_id="doc_001" + ) + assert chunk.text == "Apple reported revenue of $383 billion." + assert chunk.document_id == "doc_001" + assert chunk.id != "" # 自动生成 UUID + + def test_chunk_with_entities(self): + """测试 Chunk 包含实体""" + entity1 = Entity(name="Apple", type=EntityType.COMPANY) + entity2 = Entity(name="$383 billion", type=EntityType.MONEY) + + chunk = Chunk( + text="Apple reported revenue of $383 billion.", + entities=[entity1, entity2] + ) + assert len(chunk.entities) == 2 + assert chunk.entities[0].name == "Apple" + + def test_chunk_with_tables(self): + """测试 Chunk 包含表格""" + table = Table(headers=["Q1", "Q2"], rows=[["$100", "$120"]]) + chunk = Chunk(text="Revenue by quarter:", tables=[table]) + + assert len(chunk.tables) == 1 + + def test_get_full_content(self): + """测试 get_full_content() 方法""" + table = Table( + headers=["Metric", "Value"], + rows=[["Revenue", "$100"]], + caption="Summary" + ) + chunk = Chunk(text="Financial data:", tables=[table]) + + content = chunk.get_full_content() + assert "Financial data:" in content + assert "[Table: Summary]" in content + assert "| Metric | Value |" in content + + +class TestDocument: + """测试 Document 数据类""" + + def test_document_creation(self): + """测试创建 Document""" + doc = Document( + title="Apple 10-K 2024", + source_url="https://sec.gov/apple-10k.html", + document_type=DocumentType.SEC_10K + ) + assert doc.title == "Apple 10-K 2024" + assert doc.document_type == DocumentType.SEC_10K + + def test_document_with_chunks(self): + """测试 Document 包含多个 Chunks""" + chunk1 = Chunk(text="Section 1 content") + chunk2 = Chunk(text="Section 2 content") + + doc = Document( + title="Test Document", + chunks=[chunk1, chunk2] + ) + assert doc.get_chunk_count() == 2 + + def test_document_with_entities(self): + """测试 Document 包含实体""" + entities = [ + Entity(name="Apple", 
type=EntityType.COMPANY), + Entity(name="Tim Cook", type=EntityType.PERSON) + ] + doc = Document(title="Test", entities=entities) + + assert doc.get_entity_count() == 2 + + +class TestKnowledgeUnit: + """测试 KnowledgeUnit 数据类""" + + def test_knowledge_unit_from_chunk(self): + """测试从 Chunk 和 Document 创建 KnowledgeUnit""" + # 创建 Document + doc = Document( + title="Apple 10-K", + source_url="https://sec.gov/apple", + document_type=DocumentType.SEC_10K, + global_summary="Apple's annual report..." + ) + + # 创建 Chunk + chunk = Chunk( + text="Revenue was $383 billion.", + local_summary="Revenue summary", + document_id=doc.id, + entities=[Entity(name="Apple", type=EntityType.COMPANY)] + ) + + # 转换为 KnowledgeUnit + ku = KnowledgeUnit.from_chunk(chunk, doc) + + assert ku.document_id == doc.id + assert ku.chunk_id == chunk.id + assert ku.text == chunk.text + assert ku.local_summary == "Revenue summary" + assert ku.global_context == "Apple's annual report..." + assert ku.document_type == DocumentType.SEC_10K + + +# ============================================================================= +# QUERY.PY 测试 +# ============================================================================= + +class TestQueryEnums: + """测试查询相关枚举""" + + def test_query_type(self): + """测试 QueryType 枚举""" + assert QueryType.SIMPLE.value == "simple" + assert QueryType.COMPARATIVE.value == "comparative" + assert QueryType.MULTI_HOP.value == "multi_hop" + + def test_retrieval_strategy(self): + """测试 RetrievalStrategy 枚举""" + assert RetrievalStrategy.SINGLE_SHOT.value == "single_shot" + assert RetrievalStrategy.ITERATIVE.value == "iterative" + + def test_verification_level(self): + """测试 VerificationLevel 枚举""" + assert VerificationLevel.NONE.value == "none" + assert VerificationLevel.STRICT.value == "strict" + + +class TestQueryFilters: + """测试 QueryFilters 数据类""" + + def test_default_filters(self): + """测试默认过滤器""" + filters = QueryFilters() + assert filters.date_range is None + assert filters.companies == 
[] + assert filters.document_types == [] + + def test_custom_filters(self): + """测试自定义过滤器""" + filters = QueryFilters( + companies=["AAPL", "MSFT"], + source_tiers=[1, 2], + topics=["revenue", "growth"] + ) + assert "AAPL" in filters.companies + assert 1 in filters.source_tiers + + +class TestQueryRequest: + """测试 QueryRequest 数据类""" + + def test_simple_request(self): + """测试简单请求""" + request = QueryRequest(query="What is Apple's revenue?") + assert request.query == "What is Apple's revenue?" + assert request.id != "" # 自动生成 + + def test_request_with_filters(self): + """测试带过滤器的请求""" + filters = QueryFilters(companies=["AAPL"]) + settings = QuerySettings(max_chunks=5, rerank=True) + + request = QueryRequest( + query="Apple revenue", + filters=filters, + settings=settings + ) + assert request.filters.companies == ["AAPL"] + assert request.settings.max_chunks == 5 + + +class TestParsedQuery: + """测试 ParsedQuery 数据类""" + + def test_parsed_query(self): + """测试解析后的查询""" + entity = Entity(name="Apple", type=EntityType.COMPANY) + + parsed = ParsedQuery( + original_query="What is Apple's revenue in Q3 2024?", + query_type=QueryType.TEMPORAL, + recommended_strategy=RetrievalStrategy.ITERATIVE, + entities=[entity], + temporal_references=["Q3 2024"], + metrics_requested=["revenue"], + enhanced_query="Apple Q3 2024 revenue financial results" + ) + + assert parsed.query_type == QueryType.TEMPORAL + assert "Q3 2024" in parsed.temporal_references + assert parsed.entities[0].name == "Apple" + + +class TestRetrievalResult: + """测试 RetrievalResult 数据类""" + + def test_retrieval_result(self): + """测试检索结果""" + result = RetrievalResult( + strategy_used=RetrievalStrategy.HYBRID, + total_tokens=3500, + retrieval_time_ms=150 + ) + assert result.strategy_used == RetrievalStrategy.HYBRID + assert result.total_tokens == 3500 + + +class TestCitation: + """测试 Citation 数据类""" + + def test_citation(self): + """测试引用""" + citation = Citation( + claim_text="Revenue was $383 billion", + 
source_url="https://sec.gov/apple", + source_title="Apple 10-K 2024", + page_number=42, + supporting_quote="Total revenue: $383 billion", + source_tier=SourceTier.TIER_1_AUTHORITATIVE, + is_verified=True + ) + assert citation.claim_text == "Revenue was $383 billion" + assert citation.is_verified == True + + +class TestQueryResponse: + """测试 QueryResponse 数据类""" + + def test_query_response(self): + """测试完整响应""" + response = QueryResponse( + answer="Apple's revenue was $383 billion in FY2024.", + confidence_score=0.92, + follow_up_questions=["How does this compare to last year?"] + ) + assert response.answer != "" + assert response.confidence_score == 0.92 + assert len(response.follow_up_questions) == 1 + + def test_response_to_dict(self): + """测试 to_dict() 方法""" + citation = Citation( + claim_text="Revenue $383B", + source_title="Apple 10-K", + source_url="https://sec.gov" + ) + response = QueryResponse( + answer="Test answer", + citations=[citation], + confidence_score=0.9 + ) + + result = response.to_dict() + assert result["answer"] == "Test answer" + assert result["confidence"] == 0.9 + assert len(result["citations"]) == 1 + + +class TestContextPack: + """测试 ContextPack 数据类""" + + def test_add_unit_within_budget(self): + """测试在预算内添加单元""" + pack = ContextPack(budget_tokens=1000) + + ku = KnowledgeUnit(text="Test content", chunk_id="c1") + result = pack.add_unit(ku, tokens=200) + + assert result == True + assert len(pack.knowledge_units) == 1 + assert pack.total_tokens == 200 + assert "c1" in pack.included_chunk_ids + + def test_add_unit_exceeds_budget(self): + """测试超出预算时拒绝添加""" + pack = ContextPack(budget_tokens=100) + + ku = KnowledgeUnit(text="Long content", chunk_id="c1") + result = pack.add_unit(ku, tokens=200) + + assert result == False + assert len(pack.knowledge_units) == 0 + assert "c1" in pack.excluded_chunk_ids + assert pack.exclusion_reasons["c1"] == "token_budget_exceeded" + + def test_format_for_llm(self): + """测试格式化 LLM 上下文""" + pack = ContextPack() 
+ + ku1 = KnowledgeUnit( + text="Apple revenue was $383B.", + chunk_id="c1", + provenance=Provenance(source_url="https://sec.gov", page_number=42) + ) + ku2 = KnowledgeUnit( + text="Microsoft revenue was $200B.", + chunk_id="c2", + provenance=Provenance(source_url="https://sec.gov") + ) + + pack.add_unit(ku1, 100) + pack.add_unit(ku2, 100) + + formatted = pack.format_for_llm() + + assert "[Source 1:" in formatted + assert "[Source 2:" in formatted + assert "Apple revenue" in formatted + assert "(Page 42)" in formatted + + +# ============================================================================= +# 集成测试:完整数据流 +# ============================================================================= + +class TestIntegration: + """测试完整的数据流""" + + def test_full_document_pipeline(self): + """测试完整的文档处理流程""" + # 1. 创建实体 + entity = Entity(name="Apple Inc.", type=EntityType.COMPANY) + + # 2. 创建表格 + table = Table( + headers=["Year", "Revenue"], + rows=[["2024", "$383B"], ["2023", "$350B"]] + ) + + # 3. 创建 Chunk + chunk = Chunk( + text="Apple reported strong financial results.", + tables=[table], + entities=[entity], + tags=DomainTags( + primary_topic="financial_performance", + companies=["Apple Inc."] + ), + provenance=Provenance( + source_url="https://sec.gov/apple", + source_tier=SourceTier.TIER_1_AUTHORITATIVE, + page_number=5 + ) + ) + + # 4. 创建 Document + doc = Document( + title="Apple 10-K 2024", + document_type=DocumentType.SEC_10K, + chunks=[chunk], + entities=[entity], + global_summary="Apple's annual financial report." + ) + + # 5. 转换为 KnowledgeUnit + ku = KnowledgeUnit.from_chunk(chunk, doc) + + # 验证完整链路 + assert doc.get_chunk_count() == 1 + assert ku.global_context == "Apple's annual financial report." + assert ku.provenance.source_tier == SourceTier.TIER_1_AUTHORITATIVE + + def test_full_query_pipeline(self): + """测试完整的查询处理流程""" + # 1. 
创建请求 + request = QueryRequest( + query="What is Apple's revenue?", + filters=QueryFilters(companies=["AAPL"]), + settings=QuerySettings(max_chunks=10, verification_level=VerificationLevel.STANDARD) + ) + + # 2. 模拟解析 + parsed = ParsedQuery( + original_query=request.query, + query_type=QueryType.SIMPLE, + recommended_strategy=RetrievalStrategy.SINGLE_SHOT, + entities=[Entity(name="Apple", type=EntityType.COMPANY)] + ) + + # 3. 模拟检索结果 + retrieval = RetrievalResult( + strategy_used=parsed.recommended_strategy, + retrieval_time_ms=100 + ) + + # 4. 创建引用 + citation = Citation( + claim_text="Revenue was $383B", + source_title="Apple 10-K", + is_verified=True + ) + + # 5. 创建验证结果 + verification = VerificationResult( + is_verified=True, + claims_total=1, + claims_verified=1 + ) + + # 6. 创建最终响应 + response = QueryResponse( + query_id=request.id, + answer="Apple's revenue was $383 billion.", + citations=[citation], + verification=verification, + parsed_query=parsed, + retrieval_result=retrieval + ) + + # 验证完整链路 + assert response.verification.is_verified == True + assert len(response.citations) == 1 + assert response.parsed_query.query_type == QueryType.SIMPLE + + +# ============================================================================= +# 运行测试 +# ============================================================================= + +if __name__ == "__main__": + # 运行所有测试 + pytest.main([__file__, "-v"]) + diff --git a/tests/test_summarizer.py b/tests/test_summarizer.py new file mode 100644 index 0000000..76daab8 --- /dev/null +++ b/tests/test_summarizer.py @@ -0,0 +1,640 @@ +""" +Tests for Adaptive Summarizer (summarizer.py) + +Tests: +1. LLMInterface implementations (MockLLM, OpenAILLM, AnthropicLLM) +2. AdaptiveSummarizer - two-stage summarization +3. Cost estimation +4. 
Integration with Document/Chunk models +""" + +import pytest +from unittest.mock import Mock, patch, MagicMock + +from src.extraction.summarizer import ( + LLMInterface, + OpenAILLM, + AnthropicLLM, + MockLLM, + AdaptiveSummarizer +) +from src.models.document import ( + Document, Chunk, Table, DocumentType, Provenance, DomainTags +) + + +# ============================================================================= +# TEST FIXTURES +# ============================================================================= + +@pytest.fixture +def sample_chunk(): + """Create a sample chunk for testing""" + return Chunk( + id="chunk_001", + document_id="doc_001", + text="""Apple Inc. reported quarterly revenue of $85.78 billion for Q1 2024, + representing a 2% increase year-over-year. iPhone sales remained the + largest contributor at $69.7 billion, while Services revenue reached + a record $23.12 billion, driven by strong App Store performance and + growing subscription services. The company announced a dividend increase + and expanded its share buyback program.""", + token_count=100 + ) + + +@pytest.fixture +def sample_chunk_with_table(): + """Create a sample chunk with table""" + table = Table( + headers=["Metric", "Q1 2024", "Q1 2023"], + rows=[ + ["Revenue", "$85.78B", "$83.5B"], + ["Net Income", "$23.6B", "$22.3B"], + ["EPS", "$1.53", "$1.45"] + ], + caption="Financial Highlights" + ) + + return Chunk( + id="chunk_002", + document_id="doc_001", + text="Financial results summary for the quarter:", + tables=[table], + token_count=150 + ) + + +@pytest.fixture +def sample_document(sample_chunk, sample_chunk_with_table): + """Create a sample document with multiple chunks""" + risk_chunk = Chunk( + id="chunk_003", + document_id="doc_001", + text="""RISK FACTORS: The company faces various risks including intense + competition, supply chain disruptions, regulatory changes, and currency + fluctuations. 
Changes in consumer preferences and economic conditions + could adversely impact demand for products and services.""", + token_count=80 + ) + + return Document( + id="doc_001", + title="Apple Inc. 10-K Annual Report 2024", + document_type=DocumentType.SEC_10K, + chunks=[sample_chunk, sample_chunk_with_table, risk_chunk] + ) + + +@pytest.fixture +def short_chunk(): + """Create a very short chunk""" + return Chunk( + id="chunk_short", + text="Short text.", + token_count=5 + ) + + +@pytest.fixture +def mock_summarizer(): + """Create summarizer with mock LLMs""" + return AdaptiveSummarizer(use_mock=True) + + +# ============================================================================= +# MOCK LLM TESTS +# ============================================================================= + +class TestMockLLM: + """Tests for MockLLM implementation""" + + def test_mock_llm_creation(self): + """Test MockLLM can be created""" + llm = MockLLM() + assert llm is not None + + def test_mock_llm_complete_summarize(self): + """Test MockLLM returns summary for summarize prompts""" + llm = MockLLM() + prompt = "Please summarize the following:\nContent: Apple reported revenue of $85 billion." + + response = llm.complete(prompt) + + assert response is not None + assert len(response) > 0 + assert "Summary:" in response + + def test_mock_llm_complete_other(self): + """Test MockLLM returns generic response for non-summarize prompts""" + llm = MockLLM() + prompt = "What is the capital of France?" 
+ + response = llm.complete(prompt) + + assert response == "Mock response" + + def test_mock_llm_cost(self): + """Test MockLLM has zero cost""" + llm = MockLLM() + assert llm.get_cost_per_1k_tokens() == 0.0 + + def test_mock_llm_max_tokens(self): + """Test MockLLM respects max_tokens parameter""" + llm = MockLLM() + response = llm.complete("summarize: " + "x" * 1000, max_tokens=50) + + # Should return something (mock doesn't strictly enforce token limits) + assert response is not None + + +# ============================================================================= +# OPENAI LLM TESTS +# ============================================================================= + +class TestOpenAILLM: + """Tests for OpenAILLM implementation""" + + def test_openai_llm_creation(self): + """Test OpenAILLM can be created with default model""" + llm = OpenAILLM() + assert llm.model == "gpt-4o-mini" + assert llm.api_key is None + + def test_openai_llm_custom_model(self): + """Test OpenAILLM with custom model""" + llm = OpenAILLM(model="gpt-4o", api_key="test-key") + assert llm.model == "gpt-4o" + assert llm.api_key == "test-key" + + def test_openai_llm_cost_mini(self): + """Test cost for gpt-4o-mini""" + llm = OpenAILLM(model="gpt-4o-mini") + assert llm.get_cost_per_1k_tokens() == 0.00015 + + def test_openai_llm_cost_4o(self): + """Test cost for gpt-4o""" + llm = OpenAILLM(model="gpt-4o") + assert llm.get_cost_per_1k_tokens() == 0.005 + + def test_openai_llm_cost_turbo(self): + """Test cost for gpt-4-turbo""" + llm = OpenAILLM(model="gpt-4-turbo") + assert llm.get_cost_per_1k_tokens() == 0.01 + + def test_openai_llm_cost_unknown_model(self): + """Test cost for unknown model""" + llm = OpenAILLM(model="unknown-model") + assert llm.get_cost_per_1k_tokens() == 0.001 # Default fallback + + def test_openai_llm_complete_without_library(self): + """Test OpenAILLM gracefully handles missing openai library""" + llm = OpenAILLM() + + # Mock the import to fail + with patch.dict('sys.modules', 
{'openai': None}): + # Should not crash, return placeholder or empty + response = llm.complete("test prompt") + # The actual behavior depends on try/except in complete() + assert response is not None or response == "" + + +# ============================================================================= +# ANTHROPIC LLM TESTS +# ============================================================================= + +class TestAnthropicLLM: + """Tests for AnthropicLLM implementation""" + + def test_anthropic_llm_creation(self): + """Test AnthropicLLM can be created with default model""" + llm = AnthropicLLM() + assert llm.model == "claude-3-haiku-20240307" + assert llm.api_key is None + + def test_anthropic_llm_custom_model(self): + """Test AnthropicLLM with custom model""" + llm = AnthropicLLM(model="claude-3-5-sonnet-20241022", api_key="test-key") + assert llm.model == "claude-3-5-sonnet-20241022" + assert llm.api_key == "test-key" + + def test_anthropic_llm_cost_haiku(self): + """Test cost for Claude Haiku""" + llm = AnthropicLLM(model="claude-3-haiku-20240307") + assert llm.get_cost_per_1k_tokens() == 0.00025 + + def test_anthropic_llm_cost_sonnet(self): + """Test cost for Claude Sonnet""" + llm = AnthropicLLM(model="claude-3-5-sonnet-20241022") + assert llm.get_cost_per_1k_tokens() == 0.003 + + def test_anthropic_llm_cost_opus(self): + """Test cost for Claude Opus""" + llm = AnthropicLLM(model="claude-3-opus-20240229") + assert llm.get_cost_per_1k_tokens() == 0.015 + + def test_anthropic_llm_cost_unknown_model(self): + """Test cost for unknown model""" + llm = AnthropicLLM(model="unknown-model") + assert llm.get_cost_per_1k_tokens() == 0.001 # Default fallback + + +# ============================================================================= +# ADAPTIVE SUMMARIZER TESTS +# ============================================================================= + +class TestAdaptiveSummarizer: + """Tests for AdaptiveSummarizer""" + + def test_summarizer_creation_with_mock(self): + 
"""Test creating summarizer with mock LLMs""" + summarizer = AdaptiveSummarizer(use_mock=True) + + assert summarizer.cheap_model is not None + assert summarizer.powerful_model is not None + assert isinstance(summarizer.cheap_model, MockLLM) + assert isinstance(summarizer.powerful_model, MockLLM) + + def test_summarizer_creation_with_defaults(self): + """Test creating summarizer with default models""" + summarizer = AdaptiveSummarizer() + + assert summarizer.cheap_model is not None + assert summarizer.powerful_model is not None + assert isinstance(summarizer.cheap_model, OpenAILLM) + assert isinstance(summarizer.powerful_model, OpenAILLM) + + def test_summarizer_creation_with_custom_models(self): + """Test creating summarizer with custom models""" + cheap = MockLLM() + powerful = MockLLM() + + summarizer = AdaptiveSummarizer(cheap_model=cheap, powerful_model=powerful) + + assert summarizer.cheap_model is cheap + assert summarizer.powerful_model is powerful + + def test_summarizer_has_prompts(self): + """Test summarizer has prompt templates""" + summarizer = AdaptiveSummarizer(use_mock=True) + + assert summarizer.local_summary_prompt is not None + assert "{content}" in summarizer.local_summary_prompt + + assert summarizer.global_summary_prompt is not None + assert "{summaries}" in summarizer.global_summary_prompt + + +class TestSummarizeChunk: + """Tests for chunk summarization""" + + def test_summarize_chunk_basic(self, sample_chunk, mock_summarizer): + """Test basic chunk summarization""" + summary = mock_summarizer.summarize_chunk(sample_chunk) + + assert summary is not None + assert len(summary) > 0 + + def test_summarize_chunk_with_table(self, sample_chunk_with_table, mock_summarizer): + """Test summarizing chunk with table""" + summary = mock_summarizer.summarize_chunk(sample_chunk_with_table) + + assert summary is not None + assert len(summary) > 0 + + def test_summarize_short_chunk_returns_content(self, short_chunk, mock_summarizer): + """Test that very short 
class TestSummarizeDocument:
    """Document-level summarization behaviour of AdaptiveSummarizer."""

    @staticmethod
    def _stub_llm(reply):
        """Build an LLMInterface-shaped mock whose complete() returns *reply*."""
        llm = Mock(spec=LLMInterface)
        llm.complete.return_value = reply
        return llm

    def test_summarize_document_basic(self, sample_document, mock_summarizer):
        """A populated document yields a non-empty summary."""
        result = mock_summarizer.summarize_document(sample_document)

        assert result is not None
        assert len(result) > 0

    def test_summarize_document_empty(self, mock_summarizer):
        """A document without chunks summarizes to the empty string."""
        blank = Document(title="Empty", chunks=[])

        assert mock_summarizer.summarize_document(blank) == ""

    def test_summarize_document_sets_local_summaries(self, sample_document, mock_summarizer):
        """Summarizing a document fills in local_summary on every chunk."""
        # Start from a clean slate so the assertion below is meaningful.
        for piece in sample_document.chunks:
            piece.local_summary = None

        mock_summarizer.summarize_document(sample_document)

        for piece in sample_document.chunks:
            assert piece.local_summary is not None

    def test_summarize_document_uses_both_models(self, sample_document):
        """Cheap model handles each chunk; powerful model runs once at the end."""
        local_llm = self._stub_llm("Local summary")
        global_llm = self._stub_llm("Global summary")

        summarizer = AdaptiveSummarizer(cheap_model=local_llm, powerful_model=global_llm)
        result = summarizer.summarize_document(sample_document)

        # One cheap call per chunk (the fixture document has 3 chunks).
        assert local_llm.complete.call_count == 3

        # Exactly one powerful call for the global synthesis.
        assert global_llm.complete.call_count == 1

        assert result == "Global summary"

    def test_summarize_document_skips_existing_summaries(self, sample_document, mock_summarizer):
        """Chunks that already carry a local summary are not re-summarized."""
        first_chunk = sample_document.chunks[0]
        first_chunk.local_summary = "Pre-existing summary"

        local_llm = self._stub_llm("New summary")
        summarizer = AdaptiveSummarizer(cheap_model=local_llm, powerful_model=MockLLM())
        summarizer.summarize_document(sample_document)

        # Only the 2 chunks without a summary should reach the cheap model.
        assert local_llm.complete.call_count == 2

        # The pre-set summary must survive untouched.
        assert sample_document.chunks[0].local_summary == "Pre-existing summary"
class TestCostEstimation:
    """Tests for AdaptiveSummarizer.estimate_cost."""

    def test_estimate_cost_basic(self, sample_document):
        """The estimate exposes every expected key."""
        summarizer = AdaptiveSummarizer(use_mock=True)
        cost = summarizer.estimate_cost(sample_document)

        assert "local_summaries_cost" in cost
        assert "global_summary_cost" in cost
        assert "total_cost" in cost
        assert "tokens_processed" in cost

    def test_estimate_cost_values(self, sample_document):
        """Token count, sign, and additivity of the estimated costs."""
        # Default-constructed summarizer, i.e. whatever models it picks by default.
        summarizer = AdaptiveSummarizer()
        cost = summarizer.estimate_cost(sample_document)

        # Tokens processed must equal the sum of per-chunk token counts.
        expected_tokens = sum(c.token_count for c in sample_document.chunks)
        assert cost["tokens_processed"] == expected_tokens

        # No component may be negative.
        assert cost["local_summaries_cost"] >= 0
        assert cost["global_summary_cost"] >= 0
        assert cost["total_cost"] >= 0

        # Total is the sum of its parts. Compared with a tolerance because the
        # values are floats: exact equality would fail spuriously if the
        # implementation rounds or accumulates in a different order.
        assert cost["total_cost"] == pytest.approx(
            cost["local_summaries_cost"] + cost["global_summary_cost"]
        )

    def test_estimate_cost_cheap_vs_expensive(self, sample_document):
        """An all-cheap configuration must be estimated below an all-expensive one."""
        cheap_llm = OpenAILLM(model="gpt-4o-mini")
        expensive_llm = OpenAILLM(model="gpt-4o")

        summarizer_cheap = AdaptiveSummarizer(cheap_model=cheap_llm, powerful_model=cheap_llm)
        summarizer_expensive = AdaptiveSummarizer(
            cheap_model=expensive_llm, powerful_model=expensive_llm
        )

        cost_cheap = summarizer_cheap.estimate_cost(sample_document)
        cost_expensive = summarizer_expensive.estimate_cost(sample_document)

        assert cost_cheap["total_cost"] < cost_expensive["total_cost"]

    def test_estimate_cost_empty_document(self):
        """A document with no chunks costs nothing and processes no tokens."""
        summarizer = AdaptiveSummarizer(use_mock=True)
        empty_doc = Document(title="Empty", chunks=[])

        cost = summarizer.estimate_cost(empty_doc)

        assert cost["tokens_processed"] == 0
        assert cost["total_cost"] == 0
def test_summarization_with_various_content_types(self, mock_summarizer):
    """Plain, table-bearing, and long chunks all receive a local summary."""
    mixed_chunks = [
        # Short plain-text chunk.
        Chunk(text="Plain text content about financial results.", token_count=20),
        # Chunk that carries a table alongside its text.
        Chunk(
            text="Table data:",
            tables=[Table(headers=["A", "B"], rows=[["1", "2"]])],
            token_count=30,
        ),
        # Long chunk built from a repeated sentence.
        Chunk(text=" ".join(["Financial data."] * 100), token_count=200),
    ]
    mixed_doc = Document(title="Mixed Content", chunks=mixed_chunks)

    processed = mock_summarizer.process_document(mixed_doc)

    # Every chunk, whatever its content type, must have been summarized.
    for piece in processed.chunks:
        assert piece.local_summary is not None
class TestEdgeCases:
    """Boundary and unusual-input cases for summarize_chunk."""

    def test_very_long_content_truncation(self, mock_summarizer):
        """An extremely long chunk is summarized without raising."""
        # 10,000 repetitions of a 16-character sentence.
        oversized = Chunk(text="Financial data. " * 10000, token_count=30000)

        result = mock_summarizer.summarize_chunk(oversized)

        assert result is not None

    def test_special_characters_in_content(self, mock_summarizer):
        """Symbols and brace characters in the text do not break summarization."""
        symbols = Chunk(
            text="Revenue: $100M (±5%) — see note™ © 2024 {details}",
            token_count=20,
        )

        result = mock_summarizer.summarize_chunk(symbols)

        assert result is not None

    def test_unicode_content(self, mock_summarizer):
        """Non-ASCII currency signs, CJK, and accented letters are accepted."""
        multilingual = Chunk(
            text="Revenue: ¥10,000万 (€85M equivalent). Résumé of Q3 results.",
            token_count=20,
        )

        result = mock_summarizer.summarize_chunk(multilingual)

        assert result is not None

    def test_empty_chunk_text(self, mock_summarizer):
        """An empty chunk comes back as an empty summary."""
        blank = Chunk(text="", token_count=0)

        # Empty text is expected to be returned as-is.
        assert mock_summarizer.summarize_chunk(blank) == ""
EntityExtractor - Named entity extraction (companies, metrics, money, etc.) +2. TopicClassifier - Topic classification with confidence scores +3. DomainTagger - Combined tagging and relationship extraction +""" + +import pytest +from unittest.mock import Mock, patch, MagicMock + +from src.extraction.tagger import ( + EntityExtractor, + TopicClassifier, + DomainTagger +) +from src.models.document import ( + Document, Chunk, Entity, EntityType, DomainTags, + DocumentType, Relationship, Table, Provenance +) + + +# ============================================================================= +# TEST FIXTURES +# ============================================================================= + +@pytest.fixture +def sample_financial_text(): + """Sample financial text for testing""" + return """ + Apple Inc. reported quarterly revenue of $85.78 billion for Q3 2024, + representing a 12% increase year-over-year. iPhone sales remained the + largest contributor at $69.7 billion. The company's Services segment + grew by 11% to reach $23.12 billion. Microsoft Corp. reported similar + strong results with revenue of $65 billion. Net income for Apple was + $23.6 billion, with EPS of $1.53. + """ + + +@pytest.fixture +def sample_risk_text(): + """Sample risk factors text""" + return """ + RISK FACTORS: Our business faces various risks including economic + downturns, supply chain disruptions, and intense competition in the + technology sector. Changes in consumer preferences could impact demand. + Currency volatility and regulatory changes pose additional threats. + We may experience uncertainty in market conditions. + """ + + +@pytest.fixture +def sample_strategy_text(): + """Sample strategy text""" + return """ + Our strategy focuses on digital innovation and AI initiatives. + We plan to expand into new markets with a clear vision for growth. + Key objectives include increasing R&D investment and launching + new technology products. Our goal is to maintain market leadership. 
+ """ + + +@pytest.fixture +def sample_chunk(): + """Create a sample chunk""" + return Chunk( + id="chunk_001", + document_id="doc_001", + text="""Apple Inc. reported revenue of $383 billion in FY2024, + a 5% increase from the previous year. Net income reached $97 billion.""", + token_count=50 + ) + + +@pytest.fixture +def sample_document(sample_chunk): + """Create a sample document with chunks""" + chunk2 = Chunk( + id="chunk_002", + document_id="doc_001", + text="""Risk factors include supply chain issues and competition. + The company faces uncertainty in the technology market.""", + token_count=30 + ) + + chunk3 = Chunk( + id="chunk_003", + document_id="doc_001", + text="""Microsoft Corp. also reported strong earnings with + revenue of $200 billion and EPS of $9.50.""", + token_count=30 + ) + + return Document( + id="doc_001", + title="Tech Company Financial Report", + document_type=DocumentType.SEC_10K, + chunks=[sample_chunk, chunk2, chunk3] + ) + + +@pytest.fixture +def entity_extractor(): + """Create EntityExtractor without spaCy""" + return EntityExtractor(use_spacy=False) + + +@pytest.fixture +def topic_classifier(): + """Create TopicClassifier""" + return TopicClassifier() + + +@pytest.fixture +def domain_tagger(): + """Create DomainTagger without spaCy""" + return DomainTagger(use_spacy=False) + + +# ============================================================================= +# ENTITY EXTRACTOR TESTS +# ============================================================================= + +class TestEntityExtractor: + """Tests for EntityExtractor""" + + def test_extractor_creation(self): + """Test creating EntityExtractor""" + extractor = EntityExtractor() + assert extractor is not None + assert extractor.use_spacy == False # Default + + def test_extractor_with_spacy_flag(self): + """Test creating extractor with spaCy flag""" + extractor = EntityExtractor(use_spacy=True) + assert extractor.use_spacy == True + + +class TestCompanyExtraction: + """Tests for 
def test_extract_multiple_companies(self, entity_extractor, sample_financial_text):
    """Both companies named in the sample financial text are detected."""
    found = entity_extractor._extract_companies(sample_financial_text, "chunk1")
    found_names = [entity.name for entity in found]

    # The sample text mentions Apple Inc. and Microsoft Corp.
    for expected in ("Apple", "Microsoft"):
        assert any(expected in name for name in found_names)
class TestMetricExtraction:
    """Financial-metric detection via EntityExtractor._extract_metrics."""

    def test_extract_revenue(self, entity_extractor):
        """'revenue' is recognised in a plain sentence."""
        found = entity_extractor._extract_metrics(
            "Total revenue increased significantly.", "chunk1"
        )

        assert any(item.name == "revenue" for item in found)

    def test_extract_net_income(self, entity_extractor):
        """'Net income' maps to the net_income metric."""
        found = entity_extractor._extract_metrics("Net income was $10 billion.", "chunk1")

        assert any(item.name == "net_income" for item in found)

    def test_extract_eps(self, entity_extractor):
        """'Earnings per share' maps to the eps metric."""
        found = entity_extractor._extract_metrics(
            "Earnings per share reached $1.50.", "chunk1"
        )

        assert any(item.name == "eps" for item in found)

    def test_extract_gross_margin(self, entity_extractor):
        """'Gross margin' maps to the gross_margin metric."""
        found = entity_extractor._extract_metrics("Gross margin improved to 45%.", "chunk1")

        assert any(item.name == "gross_margin" for item in found)

    def test_extract_ebitda(self, entity_extractor):
        """'EBITDA' maps to the ebitda metric."""
        found = entity_extractor._extract_metrics("EBITDA was $50 billion.", "chunk1")

        assert any(item.name == "ebitda" for item in found)

    def test_extract_free_cash_flow(self, entity_extractor):
        """'Free cash flow' maps to the free_cash_flow metric."""
        found = entity_extractor._extract_metrics(
            "Free cash flow totaled $30 billion.", "chunk1"
        )

        assert any(item.name == "free_cash_flow" for item in found)

    def test_metric_confidence(self, entity_extractor):
        """Extracted metrics carry a confidence of 0.9."""
        found = entity_extractor._extract_metrics("Revenue increased.", "chunk1")

        assert len(found) >= 1
        assert found[0].confidence == 0.9

    def test_extract_multiple_metrics(self, entity_extractor, sample_financial_text):
        """Several distinct metrics are pulled from one passage."""
        found = entity_extractor._extract_metrics(sample_financial_text, "chunk1")
        found_names = [item.name for item in found]

        for expected in ("revenue", "net_income", "eps"):
            assert expected in found_names
class TestPercentageExtraction:
    """Percentage detection via EntityExtractor._extract_percentages."""

    def test_extract_percentage(self, entity_extractor):
        """A whole-number percentage is found and typed as PERCENTAGE."""
        found = entity_extractor._extract_percentages("Growth was 12%.", "chunk1")

        assert len(found) >= 1
        first = found[0]
        assert first.type == EntityType.PERCENTAGE
        assert "12%" in first.name

    def test_extract_decimal_percentage(self, entity_extractor):
        """A percentage with a fractional part is captured intact."""
        found = entity_extractor._extract_percentages("Margin was 45.5%.", "chunk1")

        assert any("45.5%" in item.name for item in found)

    def test_percentage_confidence(self, entity_extractor):
        """Extracted percentages carry a confidence of 0.95."""
        found = entity_extractor._extract_percentages("Up 10%.", "chunk1")

        assert found[0].confidence == 0.95
class TestFullEntityExtraction:
    """End-to-end entity extraction and de-duplication."""

    def test_extract_all_entities(self, entity_extractor, sample_financial_text):
        """The sample financial text yields every supported entity type."""
        found = entity_extractor.extract_entities(sample_financial_text, "chunk1")
        found_types = {item.type for item in found}

        for expected in (
            EntityType.COMPANY,
            EntityType.MONEY,
            EntityType.PERCENTAGE,
            EntityType.METRIC,
            EntityType.DATE,
        ):
            assert expected in found_types

    def test_deduplication(self, entity_extractor):
        """A company mentioned twice in one text appears only once."""
        found = entity_extractor.extract_entities(
            "Apple Inc. reported results. Apple Inc. also announced plans.", "chunk1"
        )

        apple_companies = [
            item
            for item in found
            if "Apple" in item.name and item.type == EntityType.COMPANY
        ]
        assert len(apple_companies) == 1

    def test_deduplication_merges_chunk_ids(self, entity_extractor):
        """Merging duplicates keeps the source chunk IDs of both originals."""
        duplicates = [
            Entity(name="Apple", type=EntityType.COMPANY, source_chunk_ids=[chunk_id])
            for chunk_id in ("c1", "c2")
        ]

        merged = entity_extractor._deduplicate_entities(duplicates)

        assert len(merged) == 1
        survivor_ids = merged[0].source_chunk_ids
        assert "c1" in survivor_ids
        assert "c2" in survivor_ids
def test_classify_strategy(self, topic_classifier, sample_strategy_text):
    """Strategy text is classified as either strategy or technology."""
    top_topic, _secondary, topic_scores = topic_classifier.classify(sample_strategy_text)

    # The sample mixes strategy and technology vocabulary, so either wins.
    assert top_topic in ("strategy", "technology")
    assert any(label in topic_scores for label in ("strategy", "technology"))
"""Test that scores are returned""" + primary, secondary, scores = topic_classifier.classify(sample_financial_text) + + assert isinstance(scores, dict) + assert all(0 <= score <= 1 for score in scores.values()) + + def test_classify_empty_text(self, topic_classifier): + """Test classifying empty text""" + primary, secondary, scores = topic_classifier.classify("") + + assert primary == "general" + assert secondary == [] + assert scores == {} + + def test_classify_no_matching_topic(self, topic_classifier): + """Test classifying text with no matching topic""" + text = "The quick brown fox jumps over the lazy dog." + primary, secondary, scores = topic_classifier.classify(text) + + assert primary == "general" + + def test_secondary_topics_threshold(self, topic_classifier): + """Test that secondary topics have score > 0.2""" + text = """ + Our strategy focuses on revenue growth and market expansion. + Risk factors include competition and regulatory changes. + We invest heavily in technology and digital innovation. 
class TestDomainTagger:
    """Construction tests for DomainTagger."""

    def test_tagger_creation(self, domain_tagger):
        """A tagger is created with both of its collaborators in place."""
        assert domain_tagger is not None
        assert domain_tagger.entity_extractor is not None
        assert domain_tagger.topic_classifier is not None

    def test_tagger_with_spacy(self):
        """The use_spacy flag is passed through to the entity extractor."""
        tagger = DomainTagger(use_spacy=True)
        # Truthiness check rather than `== True` (PEP 8 / E712).
        assert tagger.entity_extractor.use_spacy
def test_tag_chunk_sets_time_period(self, domain_tagger, sample_chunk):
    """Tagging records the chunk's time period (FY2024 in the fixture)."""
    period = domain_tagger.tag_chunk(sample_chunk).tags.time_period

    assert period is not None
    assert "2024" in period
def test_tag_document_deduplicates_entities(self, domain_tagger, sample_document):
    """No two document-level entities share a (lower-cased name, type) pair."""
    tagged_doc = domain_tagger.tag_document(sample_document)

    observed = set()
    for item in tagged_doc.entities:
        identity = (item.name.lower(), item.type)
        assert identity not in observed, f"Duplicate entity: {item.name}"
        observed.add(identity)
class TestExtractTimePeriod:
    """Selection of a time period from DATE entities."""

    @staticmethod
    def _dates(*names):
        """Build one DATE entity per given name, in order."""
        return [Entity(name=label, type=EntityType.DATE) for label in names]

    def test_extract_time_period_fiscal_quarter(self, domain_tagger):
        """A fiscal quarter wins over a bare year."""
        chosen = domain_tagger._extract_time_period(self._dates("Q3 2024", "2024"))

        assert chosen == "Q3 2024"

    def test_extract_time_period_fiscal_year(self, domain_tagger):
        """A fiscal-year label wins over a bare year."""
        chosen = domain_tagger._extract_time_period(self._dates("FY2024", "2024"))

        assert chosen == "FY2024"

    def test_extract_time_period_no_dates(self, domain_tagger):
        """Without any DATE entity the result is None."""
        only_company = [Entity(name="Apple", type=EntityType.COMPANY)]

        assert domain_tagger._extract_time_period(only_company) is None

    def test_extract_time_period_year_only(self, domain_tagger):
        """A lone year is returned unchanged."""
        chosen = domain_tagger._extract_time_period(self._dates("2024"))

        assert chosen == "2024"
+ ) + + # First tag the chunk + domain_tagger.tag_chunk(chunk) + + # Then extract relationships + relationships = domain_tagger.extract_relationships(chunk) + + assert len(relationships) > 0 + + def test_extract_reported_relationship(self, domain_tagger): + """Test extracting REPORTED relationship""" + chunk = Chunk(text="Apple Inc. reported revenue growth.") + domain_tagger.tag_chunk(chunk) + + relationships = domain_tagger.extract_relationships(chunk) + + # Should have Company -> REPORTED -> Metric + reported_rels = [r for r in relationships if r.relation_type == "REPORTED"] + assert len(reported_rels) > 0 + + def test_extract_has_value_relationship(self, domain_tagger): + """Test extracting HAS_VALUE relationship""" + chunk = Chunk(text="Revenue was $383 billion.") + domain_tagger.tag_chunk(chunk) + + relationships = domain_tagger.extract_relationships(chunk) + + # Should have Metric -> HAS_VALUE -> Money + value_rels = [r for r in relationships if r.relation_type == "HAS_VALUE"] + assert len(value_rels) > 0 + + def test_relationship_temporal_scope(self, domain_tagger): + """Test that relationships include temporal scope""" + chunk = Chunk(text="Apple Inc. reported revenue of $383 billion in Q3 2024.") + domain_tagger.tag_chunk(chunk) + + relationships = domain_tagger.extract_relationships(chunk) + + # At least one relationship should have temporal scope + has_temporal = any(r.temporal_scope is not None for r in relationships) + assert has_temporal + + def test_relationship_source_chunk_id(self, domain_tagger): + """Test that relationships include source chunk ID""" + chunk = Chunk(id="test_chunk", text="Apple Inc. 
revenue was high.") + domain_tagger.tag_chunk(chunk) + + relationships = domain_tagger.extract_relationships(chunk) + + for rel in relationships: + assert rel.source_chunk_id == "test_chunk" + + +# ============================================================================= +# INTEGRATION TESTS +# ============================================================================= + +class TestIntegration: + """Integration tests for complete tagging pipeline""" + + def test_full_document_tagging_pipeline(self, domain_tagger, sample_document): + """Test complete document tagging pipeline""" + # Tag document + tagged_doc = domain_tagger.tag_document(sample_document) + + # Verify document-level tags + assert tagged_doc.tags is not None + assert tagged_doc.tags.primary_topic is not None + assert len(tagged_doc.entities) > 0 + + # Verify all chunks are tagged + for chunk in tagged_doc.chunks: + assert chunk.tags is not None + assert chunk.tags.primary_topic is not None + + # Extract relationships from all chunks + all_relationships = [] + for chunk in tagged_doc.chunks: + rels = domain_tagger.extract_relationships(chunk) + all_relationships.extend(rels) + + # Should have relationships + assert len(all_relationships) > 0 + + def test_tagging_preserves_document_structure(self, domain_tagger, sample_document): + """Test that tagging doesn't modify document structure""" + original_chunk_count = len(sample_document.chunks) + original_chunk_ids = [c.id for c in sample_document.chunks] + original_title = sample_document.title + + domain_tagger.tag_document(sample_document) + + assert len(sample_document.chunks) == original_chunk_count + assert [c.id for c in sample_document.chunks] == original_chunk_ids + assert sample_document.title == original_title + + def test_tagging_real_financial_report(self, domain_tagger): + """Test tagging realistic financial report content""" + chunks = [ + Chunk(text=""" + Apple Inc. 
reported quarterly revenue of $94.8 billion for the fiscal + 2024 first quarter. iPhone revenue was $69.7 billion, up 6% year over year. + Services revenue reached a new all-time record of $23.1 billion. + """), + Chunk(text=""" + RISK FACTORS: The Company's business, reputation, results of operations, + financial condition and stock price can be affected by a number of factors, + whether currently known or unknown, including intense competition and rapid + technological change. + """), + Chunk(text=""" + Our strategy focuses on innovation in products and services. We continue + to invest in research and development, with R&D expenses of $7.7 billion. + We plan to expand our digital initiatives and AI capabilities. + """) + ] + + doc = Document( + title="Apple 10-K FY2024", + document_type=DocumentType.SEC_10K, + chunks=chunks + ) + + tagged = domain_tagger.tag_document(doc) + + # Verify comprehensive tagging + assert "Apple" in str(tagged.tags.companies) + assert "revenue" in tagged.tags.metrics + + # Verify multiple topics detected + chunk_topics = [c.tags.primary_topic for c in tagged.chunks] + assert "financial_performance" in chunk_topics + assert "operations" in chunk_topics + + +class TestEdgeCases: + """Tests for edge cases""" + + def test_empty_chunk(self, domain_tagger): + """Test tagging empty chunk""" + chunk = Chunk(text="") + tagged = domain_tagger.tag_chunk(chunk) + + assert tagged.tags is not None + assert tagged.tags.primary_topic == "general" + + def test_chunk_with_only_numbers(self, domain_tagger): + """Test chunk with only numbers""" + chunk = Chunk(text="$100 $200 $300 15% 20% 25%") + tagged = domain_tagger.tag_chunk(chunk) + + # Should find money and percentage entities + assert any(e.type == EntityType.MONEY for e in tagged.entities) + assert any(e.type == EntityType.PERCENTAGE for e in tagged.entities) + + def test_document_with_no_chunks(self, domain_tagger): + """Test tagging document with no chunks""" + doc = Document(title="Empty", 
chunks=[]) + tagged = domain_tagger.tag_document(doc) + + assert tagged.tags is not None + assert tagged.tags.primary_topic == "general" + + def test_special_characters_in_text(self, domain_tagger): + """Test handling special characters""" + chunk = Chunk(text="Apple Inc.™ revenue: $100B (±5%) — see note® © 2024") + + # Should not crash + tagged = domain_tagger.tag_chunk(chunk) + assert tagged.tags is not None + + def test_unicode_content(self, domain_tagger): + """Test handling unicode content""" + chunk = Chunk(text="Revenue: ¥10,000万 (€85M equivalent). Apple Inc. Q3 2024") + + # Should not crash + tagged = domain_tagger.tag_chunk(chunk) + assert tagged.entities is not None + + +# ============================================================================= +# RUN TESTS +# ============================================================================= + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) +