-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path_extract_check.py
More file actions
27 lines (20 loc) · 940 Bytes
/
_extract_check.py
File metadata and controls
27 lines (20 loc) · 940 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
"""Confirm v3.2.4 semantic-container extraction yields real article text."""
import tempfile
from pathlib import Path
from curl_cffi import requests
# Page fetched by this manual check; its HTML is fed to the upload ingester.
URL = "https://tularecounty.ca.gov/sheriff/media/news-releases/32-year-old-man-shoots-kills-father-leads-deputies-on-a-chase-then-shoots-himself"

print(f"GET {URL}")
r = requests.get(URL, impersonate="chrome131", timeout=30)
# Fail fast on an HTTP error status: otherwise an error/block page would be
# ingested below and its text reported as if it were the article.
r.raise_for_status()
print(f" raw bytes: {len(r.content)}")

# Imported after the fetch, matching the original statement order.
from oraculus_di_auditor.interface.routes.upload import ingest_uploaded_file

# TemporaryDirectory removes the directory (and page.html) on exit, so
# repeated runs do not leave stray temp dirs behind. The summary is printed
# inside the block so the file exists for as long as `normalized` is in use.
with tempfile.TemporaryDirectory() as tmp_dir:
    tmp = Path(tmp_dir) / "page.html"
    tmp.write_bytes(r.content)

    normalized = ingest_uploaded_file(tmp)

    print(f"\nreturned keys: {sorted(normalized.keys())}")
    # One summary line per returned entry: strings show their length and a
    # 120-character head, containers show their size, anything else its repr.
    for k, v in normalized.items():
        if isinstance(v, str):
            print(f" {k}: str(len={len(v)}) head={v[:120]!r}")
        elif isinstance(v, (dict, list)):
            print(f" {k}: {type(v).__name__}(len={len(v)})")
        else:
            print(f" {k}: {type(v).__name__}={v!r}")