diff --git a/agentic_eda/jupyterlab_extension_backend/.gitignore b/agentic_eda/jupyterlab_extension_backend/.gitignore new file mode 100644 index 000000000..b41013075 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/.gitignore @@ -0,0 +1,20 @@ +# OS files +.DS_Store + +# Python cache/build artifacts +__pycache__/ +*.py[cod] +*.pyo +*.pyd + +# Secrets and local environment files +.env +*.env +config/.env +*.secret +*secret* +*.key +*.pem +langchain-reference +AGENTS.md +traces/ diff --git a/agentic_eda/jupyterlab_extension_backend/README.md b/agentic_eda/jupyterlab_extension_backend/README.md new file mode 100644 index 000000000..d3a9b1185 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/README.md @@ -0,0 +1,19 @@ +# JupyterLab Extension Backend + +Run the backend entrypoint from this directory: + +```bash +cd /Users/indro/src/tutorials1/agentic_eda/jupyterlab_extension_backend +python -m src.main \ + --mode integrity \ + --path /Users/indro/src/tutorials1/agentic_eda/jupyterlab_extension_backend/datasets/T1_slice.csv +``` + +If you run from a different directory, set `PYTHONPATH`: + +```bash +PYTHONPATH=/Users/indro/src/tutorials1/agentic_eda/jupyterlab_extension_backend \ +python -m src.main \ + --mode integrity \ + --path /Users/indro/src/tutorials1/agentic_eda/jupyterlab_extension_backend/datasets/T1_slice.csv +``` diff --git a/agentic_eda/jupyterlab_extension_backend/datasets/T1_slice.csv b/agentic_eda/jupyterlab_extension_backend/datasets/T1_slice.csv new file mode 100644 index 000000000..fd8bb93b2 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/datasets/T1_slice.csv @@ -0,0 +1,101 @@ +Date/Time,LV ActivePower (kW),Wind Speed (m/s),Theoretical_Power_Curve (KWh),Wind Direction (°) +01 01 2018 00:00,380.047790527343,5.31133604049682,416.328907824861,259.994903564453 +01 01 2018 00:10,453.76919555664,5.67216682434082,519.917511061494,268.64111328125 +01 01 2018 00:20,306.376586914062,5.21603679656982,390.900015810951,272.564788818359 +01 01 2018 00:30,419.645904541015,5.65967416763305,516.127568975674,271.258087158203 +01 01 2018 00:40,380.650695800781,5.57794094085693,491.702971953588,265.674285888671 +01 01 2018 00:50,402.391998291015,5.60405206680297,499.436385024805,264.57861328125 +01 01 2018 01:00,447.605712890625,5.79300785064697,557.372363290225,266.163604736328 +01 01 2018 01:10,387.2421875,5.30604982376098,414.898178826186,257.949493408203 +01 01 2018 01:20,463.651214599609,5.58462905883789,493.677652137077,253.480697631835 +01 01 2018 01:30,439.725708007812,5.52322816848754,475.706782818068,258.72378540039 +01 01 2018 01:40,498.181701660156,5.72411584854125,535.841397042263,251.850997924804 +01 01 2018 01:50,526.816223144531,5.93419885635375,603.014076510633,265.504699707031 +01 01 2018 02:00,710.587280273437,6.54741382598876,824.662513585882,274.23291015625 +01 01 2018 02:10,655.194274902343,6.19974613189697,693.472641075637,266.733184814453 +01 01 2018 02:20,754.762512207031,6.50538301467895,808.098138482693,266.76040649414 +01 01 2018 02:30,790.173278808593,6.63411617279052,859.459020788565,270.493194580078 +01 01 2018 02:40,742.985290527343,6.37891292572021,759.434536596592,266.593292236328 +01 01 2018 02:50,748.229614257812,6.4466528892517,785.28100987646,265.571807861328 +01 01 2018 03:00,736.647827148437,6.41508293151855,773.172863451736,261.15869140625 +01 01 2018 03:10,787.246215820312,6.43753099441528,781.7712157188,257.56021118164 +01 01 2018 03:20,722.864074707031,6.22002410888671,700.764699868076,255.926498413085 +01 01 2018 
03:30,935.033386230468,6.89802598953247,970.736626881787,250.012893676757 +01 01 2018 03:40,1220.60900878906,7.60971117019653,1315.04892785216,255.985702514648 +01 01 2018 03:50,1053.77197265625,7.28835582733154,1151.26574355584,255.444595336914 +01 01 2018 04:00,1493.80798339843,7.94310188293457,1497.58372354361,256.407409667968 +01 01 2018 04:10,1724.48803710937,8.37616157531738,1752.19966204818,252.41259765625 +01 01 2018 04:20,1636.93505859375,8.23695755004882,1668.47070685152,247.979400634765 +01 01 2018 04:30,1385.48803710937,7.87959098815917,1461.81579081391,238.609603881835 +01 01 2018 04:40,1098.93200683593,7.10137605667114,1062.28503444311,245.095596313476 +01 01 2018 04:50,1021.4580078125,6.95530700683593,995.995854606612,245.410202026367 +01 01 2018 05:00,1164.89294433593,7.09829807281494,1060.85971215544,235.227905273437 +01 01 2018 05:10,1073.33203125,6.95363092422485,995.250960801046,242.872695922851 +01 01 2018 05:20,1165.30798339843,7.24957799911499,1132.4168612641,244.835693359375 +01 01 2018 05:30,1177.98999023437,7.29469108581542,1154.36530469206,242.48159790039 +01 01 2018 05:40,1170.53601074218,7.37636995315551,1194.8430985043,247.97720336914 +01 01 2018 05:50,1145.53601074218,7.44855403900146,1231.43070603717,249.682998657226 +01 01 2018 06:00,1114.02697753906,7.2392520904541,1127.43320551345,248.401000976562 +01 01 2018 06:10,1153.18505859375,7.32921123504638,1171.35504358957,244.621704101562 +01 01 2018 06:20,1125.3310546875,7.13970518112182,1080.13908466205,244.631805419921 +01 01 2018 06:30,1228.73205566406,7.47422885894775,1244.63353439737,245.785995483398 +01 01 2018 06:40,1021.79302978515,7.03317403793334,1030.99268581181,248.652206420898 +01 01 2018 06:50,957.378173828125,6.88645505905151,965.683334443832,244.611694335937 +01 01 2018 07:00,909.887817382812,6.88782119750976,966.279104864065,235.84829711914 +01 01 2018 07:10,1000.95397949218,7.21643209457397,1116.4718990154,232.842697143554 +01 01 2018 07:20,1024.47802734375,7.0685977935791,1047.17023059277,229.933197021484 +01 01 2018 07:30,1009.53399658203,6.93829584121704,988.451940715539,230.13670349121 +01 01 2018 07:40,899.492980957031,6.53668785095214,820.416658585943,234.933807373046 +01 01 2018 07:50,725.110107421875,6.18062496185302,686.636942163399,232.837905883789 +01 01 2018 08:00,585.259399414062,5.81682586669921,564.927659543473,240.328796386718 +01 01 2018 08:10,443.913909912109,5.45015096664428,454.773587146918,238.12629699707 +01 01 2018 08:20,565.253784179687,5.81814908981323,565.349093224668,235.80029296875 +01 01 2018 08:30,644.037780761718,6.13027286529541,668.823569309414,224.958694458007 +01 01 2018 08:40,712.058898925781,6.34707784652709,747.460673422601,216.803894042968 +01 01 2018 08:50,737.394775390625,6.34743690490722,747.595109122642,205.785293579101 +01 01 2018 09:00,725.868103027343,6.19436883926391,691.546334303948,199.848495483398 +01 01 2018 09:10,408.997406005859,4.97719812393188,330.417630427964,207.997802734375 +01 01 2018 09:20,628.436828613281,5.95911121368408,611.283836510667,210.954895019531 +01 01 2018 09:30,716.1005859375,6.21137619018554,697.649474372052,215.69400024414 +01 01 2018 09:40,711.49560546875,6.11145305633544,662.235163012206,220.84260559082 +01 01 2018 09:50,838.151916503906,6.45632219314575,789.011422412419,237.065307617187 +01 01 2018 10:00,881.062072753906,6.66665792465209,872.739625855708,235.667495727539 +01 01 2018 10:10,663.703125,6.16287899017333,680.327891653483,229.329696655273 +01 01 2018 
10:20,578.261596679687,6.01316785812377,628.442560754699,234.900604248046 +01 01 2018 10:30,465.620086669921,5.56120300292968,486.779567601972,230.422805786132 +01 01 2018 10:40,311.050903320312,4.96073198318481,326.411025380213,229.537506103515 +01 01 2018 10:50,230.05549621582,4.60387516021728,244.31624421611,231.79849243164 +01 01 2018 11:00,233.990600585937,4.55453395843505,233.632780531927,234.105606079101 +01 01 2018 11:10,175.592193603515,4.26362895965576,173.573663122312,228.776702880859 +01 01 2018 11:20,118.133102416992,3.89413905143737,108.571221110423,227.938995361328 +01 01 2018 11:30,142.202499389648,4.03876113891601,130.229989593698,224.46499633789 +01 01 2018 11:40,212.566192626953,4.50565099716186,223.196784083793,224.950500488281 +01 01 2018 11:50,222.610000610351,4.54339790344238,231.242507343633,229.12759399414 +01 01 2018 12:00,194.181198120117,4.32376098632812,185.598479588255,227.039993286132 +01 01 2018 12:10,82.6407470703125,3.63443708419799,68.5028197987886,230.31460571289 +01 01 2018 12:20,75.8952178955078,3.70551204681396,78.3961653540173,233.953292846679 +01 01 2018 12:30,41.9472389221191,3.25396800041198,29.2869556318446,233.06590270996 +01 01 2018 12:40,118.534599304199,3.77513694763183,88.8713653309387,227.753494262695 +01 01 2018 12:50,250.755905151367,4.69350099563598,264.119257409418,229.896606445312 +01 01 2018 13:00,346.86441040039,5.00293922424316,336.721998240131,235.279495239257 +01 01 2018 13:10,416.417907714843,5.36474990844726,430.92108895689,235.585296630859 +01 01 2018 13:20,331.941497802734,5.01618194580078,339.984940156412,229.942901611328 +01 01 2018 13:30,583.479919433593,5.97040796279907,615.05563084927,235.69529724121 +01 01 2018 13:40,776.552673339843,6.6555209159851,868.180844867276,241.457397460937 +01 01 2018 13:50,752.726379394531,6.60090398788452,846.029409522117,242.782104492187 +01 01 2018 14:00,589.073120117187,5.98137807846069,618.731442665699,234.984405517578 +01 01 2018 14:10,1109.12805175781,7.42459392547607,1219.19978672882,235.14729309082 +01 01 2018 14:20,1482.4599609375,8.18645191192626,1638.50890923271,238.479095458984 +01 01 2018 14:30,1523.43005371093,8.27493000030517,1691.1470390233,237.033203125 +01 01 2018 14:40,1572.17004394531,8.44920253753662,1796.76309010091,238.332397460937 +01 01 2018 14:50,1698.93994140625,8.5759744644165,1875.04719734159,235.641403198242 +01 01 2018 15:00,1616.84594726562,8.28225994110107,1695.53877696245,236.461395263671 +01 01 2018 15:10,1796.82397460937,8.73455238342285,1974.47580025242,234.354797363281 +01 01 2018 15:20,1885.86096191406,8.76410388946533,1993.17071186444,231.001602172851 +01 01 2018 15:30,2327.51196289062,9.66943168640136,2568.82712862015,227.60009765625 +01 01 2018 15:40,2499.162109375,10.1410903930664,2876.75361614448,227.73159790039 +01 01 2018 15:50,2820.51293945312,10.7724199295043,3186.02988321436,225.276397705078 +01 01 2018 16:00,2812.27905273437,10.6475200653076,3133.25922420184,224.680603027343 +01 01 2018 16:10,2530.44702148437,9.98266124725341,2781.27404078649,225.519500732421 +01 01 2018 16:20,2399.12109375,9.87438583374023,2711.49245838958,227.273803710937 +01 01 2018 16:30,2335.587890625,9.78547954559326,2651.34100928894,229.255493164062 diff --git a/agentic_eda/jupyterlab_extension_backend/src/config/__init__.py b/agentic_eda/jupyterlab_extension_backend/src/config/__init__.py new file mode 100644 index 000000000..2a18c45cd --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/config/__init__.py @@ -0,0 +1,3 @@ +""" +Backend configuration 
package. +""" diff --git a/agentic_eda/jupyterlab_extension_backend/src/config/config.py b/agentic_eda/jupyterlab_extension_backend/src/config/config.py new file mode 100644 index 000000000..f64d5fa9a --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/config/config.py @@ -0,0 +1,128 @@ +""" +Import as: + +import src.config.config as cconf +""" + +import dataclasses +import functools +import os + +import dotenv +import langchain_anthropic +import langchain_google_genai +import langchain_openai +import pydantic + +dataclass = dataclasses.dataclass +lru_cache = functools.lru_cache +ChatOpenAI = langchain_openai.ChatOpenAI +ChatAnthropic = langchain_anthropic.ChatAnthropic +ChatGoogleGenerativeAI = langchain_google_genai.ChatGoogleGenerativeAI +SecretStr = pydantic.SecretStr + +dotenv.load_dotenv() + + +@dataclass(frozen=True) +class Settings: + """ + Store model provider settings. + """ + + provider: str + model: str + temperature: float + timeout: float + max_retries: int + + +def _need(name: str) -> str: + """ + Read a required environment variable. + + :param name: environment variable name + :return: environment variable value + """ + value = os.getenv(name) + if value is None or value == "": + raise RuntimeError(f"Missing required environment variable: {name}") + return value + + +@lru_cache(maxsize=1) +def get_settings() -> Settings: + """ + Build settings from environment variables. + + :return: configured settings + """ + settings = Settings( + provider=os.getenv("LLM_PROVIDER", "openai"), + model=os.getenv("LLM_MODEL", "gpt-5-nano"), + temperature=float(os.getenv("LLM_TEMP", 0.2)), + timeout=float(os.getenv("LLM_TIMEOUT", 60)), + max_retries=int(os.getenv("LLM_MAX_RETRIES", 2)), + ) + return settings + + +@lru_cache(maxsize=1) +def get_chat_model(*, model: str | None = None) -> object: + """ + Build the configured chat model client. 
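+
+    Illustrative usage (assumes OPENAI_API_KEY is exported and the default
+    OpenAI provider is used; other providers need their own keys):
+
+        import src.config.config as cconf
+
+        chat = cconf.get_chat_model()
+        reply = chat.invoke("Reply with the single word OK.")
+        print(reply.content)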
+ + :param model: optional model override + :return: langchain chat model client + """ + settings = get_settings() + model_name = settings.model if model is None else model + provider = settings.provider + if provider == "openai": + _need("OPENAI_API_KEY") + chat_model = ChatOpenAI( + model=model_name, + temperature=settings.temperature, + timeout=settings.timeout, + max_retries=settings.max_retries, + ) + elif provider == "openai_compatible": + base_url = _need("OPENAI_COMPAT_BASE_URL") + api_key = _need("OPENAI_COMPAT_API_KEY") + chat_model = ChatOpenAI( + model=model_name, + base_url=base_url, + api_key=SecretStr(api_key), + temperature=settings.temperature, + timeout=settings.timeout, + max_retries=settings.max_retries, + ) + elif provider == "azure_openai_v1": + azure_base = _need("AZURE_OPENAI_BASE_URL") + azure_key = SecretStr(_need("AZURE_OPENAI_API_KEY")) + chat_model = ChatOpenAI( + model=model_name, + base_url=azure_base, + api_key=azure_key, + temperature=settings.temperature, + timeout=settings.timeout, + max_retries=settings.max_retries, + ) + elif provider == "anthropic": + _need("ANTHROPIC_API_KEY") + chat_model = ChatAnthropic( + model_name=model_name, + temperature=settings.temperature, + timeout=settings.timeout, + max_retries=settings.max_retries, + stop=None, + ) + elif provider in ("google", "gemini", "google_genai"): + _need("GOOGLE_API_KEY") + chat_model = ChatGoogleGenerativeAI( + model=model_name, + temperature=settings.temperature, + ) + else: + raise ValueError(f"Unsupported provider='{provider}'") + return chat_model diff --git a/agentic_eda/jupyterlab_extension_backend/src/ingest/__init__.py b/agentic_eda/jupyterlab_extension_backend/src/ingest/__init__.py new file mode 100644 index 000000000..176a9790e --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/ingest/__init__.py @@ -0,0 +1,3 @@ +""" +Ingestion stages for the Jupyter backend. +""" diff --git a/agentic_eda/jupyterlab_extension_backend/src/ingest/compute_temporal_stats.py b/agentic_eda/jupyterlab_extension_backend/src/ingest/compute_temporal_stats.py new file mode 100644 index 000000000..1b323d8c8 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/ingest/compute_temporal_stats.py @@ -0,0 +1,223 @@ +""" +Import as: + +import src.ingest.compute_temporal_stats as sctstats +""" + +from __future__ import annotations + +import argparse +import logging +from typing import TypedDict + +import langgraph.graph as lgraph + +import src.ingest.infer_structure as sinferstruct +import src.tools.input_tools as tinptool + +_LOG = logging.getLogger(__name__) + + +class TemporalStatsState(TypedDict): + """ + Store deterministic temporal statistics. + """ + + n_nat_time: int + min_time: str | None + max_time: str | None + typical_delta_mode: str | None + typical_delta_median: str | None + expected_frequency: str | None + dominant_frequency_fraction: float + is_irregular_sampling: bool + resampling_decision: str + coverage_summary: dict + coverage_per_entity: list[dict] + + +class CompositeState(TypedDict): + """ + Store graph state for temporal statistics. 
+ """ + + path: str + done: list[str] + has_header: bool + has_missing_values: bool + error: str + info: str + cols: list[str] + temporal_cols: list[str] + numeric_val_cols: list[str] + categorical_val_cols: list[str] + bad_rows: list[dict] + metadata: dict + time_col: str + candidates: list[dict] + winner_formatter: dict + entity_col: str | None + numeric_cols: list[str] + nonnegative_cols: list[str] + jump_mult: float + report: dict + summary: str + flag: str + type: str + primary_key: str + secondary_keys: list[str] + numeric_continuous_cols: list[str] + numeric_count_cols: list[str] + binary_flag_cols: list[str] + categorical_feature_cols: list[str] + known_exogenous_cols: list[str] + target_cols: list[str] + covariate_cols: list[str] + n_nat_time: int + min_time: str | None + max_time: str | None + typical_delta_mode: str | None + typical_delta_median: str | None + expected_frequency: str | None + dominant_frequency_fraction: float + is_irregular_sampling: bool + resampling_decision: str + coverage_summary: dict + coverage_per_entity: list[dict] + + +def call_infer_structure(state: CompositeState) -> dict: + """ + Run the sequential pipeline up to feature-structure inference. + + :param state: graph state + :return: composite payload from infer_structure + """ + payload = sinferstruct.run_infer_structure(state["path"]) + return payload + + +def compute_temporal_stats(state: CompositeState) -> dict: + """ + Compute deterministic temporal range, coverage, and frequency statistics. + + :param state: graph state + :return: temporal statistics payload + """ + temporal_report = tinptool.compute_temporal_stats.invoke( + { + "path": state["path"], + "time_col": state["primary_key"], + "secondary_keys": state["secondary_keys"], + "winner_formatter": state["winner_formatter"], + } + ) + trace_payload = { + "primary_key": state["primary_key"], + "secondary_keys": state["secondary_keys"], + "temporal_report": temporal_report, + } + tinptool.write_stage_trace(state["path"], "compute_temporal_stats", trace_payload) + payload = { + "n_nat_time": temporal_report["n_nat_time"], + "min_time": temporal_report["min_time"], + "max_time": temporal_report["max_time"], + "typical_delta_mode": temporal_report["typical_delta_mode"], + "typical_delta_median": temporal_report["typical_delta_median"], + "expected_frequency": temporal_report["expected_frequency"], + "dominant_frequency_fraction": temporal_report["dominant_frequency_fraction"], + "is_irregular_sampling": temporal_report["is_irregular_sampling"], + "resampling_decision": temporal_report["resampling_decision"], + "coverage_summary": temporal_report["coverage_summary"], + "coverage_per_entity": temporal_report["coverage_per_entity"], + } + return payload + + +temporal_stats = lgraph.StateGraph(CompositeState) +temporal_stats.add_node("infer_structure_pipeline", call_infer_structure) +temporal_stats.add_node("compute_temporal_stats", compute_temporal_stats) +temporal_stats.add_edge(lgraph.START, "infer_structure_pipeline") +temporal_stats.add_edge("infer_structure_pipeline", "compute_temporal_stats") +temporal_stats.add_edge("compute_temporal_stats", lgraph.END) +graph = temporal_stats.compile() + + +def run_compute_temporal_stats(path: str) -> dict: + """ + Execute temporal statistics end to end. 
+ + :param path: dataset path + :return: full composite graph payload + """ + init_state: CompositeState = { + "path": path, + "done": [], + "has_header": True, + "has_missing_values": False, + "error": "", + "info": "", + "cols": [], + "temporal_cols": [], + "numeric_val_cols": [], + "categorical_val_cols": [], + "bad_rows": [], + "metadata": {}, + "time_col": "", + "candidates": [], + "winner_formatter": {}, + "entity_col": None, + "numeric_cols": [], + "nonnegative_cols": [], + "jump_mult": 20.0, + "report": {}, + "summary": "", + "flag": "", + "type": "", + "primary_key": "", + "secondary_keys": [], + "numeric_continuous_cols": [], + "numeric_count_cols": [], + "binary_flag_cols": [], + "categorical_feature_cols": [], + "known_exogenous_cols": [], + "target_cols": [], + "covariate_cols": [], + "n_nat_time": 0, + "min_time": None, + "max_time": None, + "typical_delta_mode": None, + "typical_delta_median": None, + "expected_frequency": None, + "dominant_frequency_fraction": 0.0, + "is_irregular_sampling": False, + "resampling_decision": "", + "coverage_summary": {}, + "coverage_per_entity": [], + } + out = graph.invoke(init_state) + payload: CompositeState = out + _LOG.info("Temporal stats output: %s", payload) + return payload + + +def _parse_args() -> argparse.Namespace: + """ + Parse command-line arguments. + + :return: parsed arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--path", + required=True, + help="Path to dataset file.", + ) + args = parser.parse_args() + return args + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + args = _parse_args() + run_compute_temporal_stats(args.path) diff --git a/agentic_eda/jupyterlab_extension_backend/src/ingest/format_datetime.py b/agentic_eda/jupyterlab_extension_backend/src/ingest/format_datetime.py new file mode 100644 index 000000000..6af3065d7 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/ingest/format_datetime.py @@ -0,0 +1,251 @@ +""" +Import as: + +import src.ingest.format_datetime as sfordat +""" + +import logging +import pathlib +from typing import TypedDict + +import langchain.agents as lagents +import langchain.tools as ltools +import langchain_core.messages as lmessages +import langgraph.graph as lgraph +import numpy as np +import pandas as pd +import pydantic + +import src.config.config as cconf +import src.ingest.handle_inputs as shainp +import src.tools.input_tools as tinptool + +_LOG = logging.getLogger(__name__) + + +def _score_parse(dt: pd.Series) -> float: + """ + Score datetime parse quality. 
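+
+    The score weights parse coverage most heavily:
+    0.65 * parsed_fraction + 0.15 * range_score + 0.20 * monotonic_score.
+    For example, a candidate that parses 98% of values, stays inside the
+    1990-2035 bounds, and has 5% of consecutive deltas out of order scores
+    0.98 * 0.65 + 1.0 * 0.15 + 0.95 * 0.20 = 0.977.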
+ + :param dt: candidate datetime series + :return: score where larger means better + """ + datetime_series = pd.to_datetime(dt, errors="coerce", utc=True) + if datetime_series.isna().all(): + score = -1.0 + return score + parsed_fraction = float(datetime_series.notna().mean()) + min_timestamp = datetime_series.min() + max_timestamp = datetime_series.max() + range_score = 1.0 + min_bound = pd.Timestamp("1990-01-01", tz="UTC") + max_bound = pd.Timestamp("2035-01-01", tz="UTC") + if min_timestamp < min_bound or max_timestamp > max_bound: + range_score = 0.7 + datetime_no_na = datetime_series.dropna() + monotonic_score = 0.0 + if len(datetime_no_na) >= 3: + deltas = datetime_no_na.diff() + inversions = float((deltas < pd.Timedelta(0)).mean()) + monotonic_score = 1.0 - inversions + score = ( + parsed_fraction * 0.65 + range_score * 0.15 + monotonic_score * 0.20 + ) + return float(score) + + +class _Candidate(pydantic.BaseModel): + """ + Store one datetime parse candidate. + """ + + model_config = pydantic.ConfigDict(extra="forbid") + format: str | None + dayfirst: bool | None + yearfirst: bool | None + utc: bool + + +class _ParseWithCandidatesArgs(pydantic.BaseModel): + """ + Store tool arguments for candidate parsing. + """ + + model_config = pydantic.ConfigDict(extra="forbid") + path: str + col_name: str + candidates: list[_Candidate] + + +@ltools.tool(args_schema=_ParseWithCandidatesArgs) +def _parse_with_candidates( + path: str, + col_name: str, + candidates: list[_Candidate], +) -> dict: + """ + Parse one column with multiple datetime candidates and pick the best. + + :param path: dataset path + :param col_name: target column name + :param candidates: parse candidates + :return: best candidate summary + """ + dataset_path = pathlib.Path(path) + dataset = tinptool.load_dataset(dataset_path) + col = dataset[col_name] + best_score = -1.0 + best_candidate = None + best_parsed_fraction = 0.0 + series = col.astype(str).str.strip().replace( + { + "": np.nan, + "nan": np.nan, + "NaT": np.nan, + } + ) + for candidate in candidates: + candidate_dict = candidate.model_dump() + format_val = candidate_dict["format"] + dayfirst_val = candidate_dict["dayfirst"] + yearfirst_val = candidate_dict["yearfirst"] + utc_val = candidate_dict["utc"] + kwargs = { + key: val + for key, val in { + "format": format_val, + "dayfirst": dayfirst_val, + "yearfirst": yearfirst_val, + "utc": utc_val, + }.items() + if val is not None + } + try: + datetime_series = pd.to_datetime( + series, + errors="coerce", + **kwargs, + ) + except Exception: + continue + score = _score_parse(datetime_series) + if score > best_score: + best_score = score + best_candidate = candidate_dict + best_parsed_fraction = float(datetime_series.notna().mean()) + payload = { + "best_candidate": best_candidate, + "best_score": float(best_score), + "parsed_fraction": float(best_parsed_fraction), + } + return payload + + +class DateFormatterState(TypedDict): + """ + Store graph state for datetime formatting. + """ + + path: str + time_col: str + candidates: list[dict] + winner_formatter: dict + + +class DateFormatterOutput(pydantic.BaseModel): + """ + Store structured formatter output. + """ + + model_config = pydantic.ConfigDict(extra="forbid") + candidates: list[_Candidate] + winner_formatter: _Candidate + + +def run_formatting_agent(state: DateFormatterState) -> dict: + """ + Run LLM tool-calling to find the best datetime parser. 
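+
+    Illustrative winner shape for timestamps like "01 01 2018 00:00" in the
+    bundled T1_slice.csv (treating the first field as the day is an
+    assumption the candidates have to test):
+
+        {"format": "%d %m %Y %H:%M", "dayfirst": True,
+         "yearfirst": None, "utc": False}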
+ + :param state: formatter graph state + :return: candidate list and winner formatter + """ + system_prompt = ( + "Use tools to convert the provided time column into a correct datetime " + "format.\n" + "1. Use extract_head to inspect the temporal column and propose parse " + "candidates.\n" + "2. Call _parse_with_candidates with those candidates.\n" + "3. Return all candidates and the winning formatter." + ) + llm = cconf.get_chat_model(model="gpt-4.1") + agent = lagents.create_agent( + model=llm, + tools=[_parse_with_candidates, tinptool.extract_head], + system_prompt=system_prompt, + response_format=DateFormatterOutput, + ) + out = agent.invoke( + { + "messages": [ + lmessages.HumanMessage( + content=( + f"The dataset path is {state['path']} and the time " + f"column name is {state['time_col']}" + ) + ) + ] + } + ) + structured_response = out["structured_response"].model_dump() + payload = { + "candidates": structured_response["candidates"], + "winner_formatter": structured_response["winner_formatter"], + } + return payload + + +def call_input_handler(state: DateFormatterState) -> dict: + """ + Run input handler and pick the first temporal column. + + :param state: formatter graph state + :return: selected temporal column + """ + out = shainp.run_input_handler(state["path"]) + temporal_cols = out.get("temporal_cols") or [] + if not temporal_cols: + raise ValueError("No temporal columns found by input handler.") + payload = {"time_col": temporal_cols[0]} + return payload + + +date_formatter = lgraph.StateGraph(DateFormatterState) +date_formatter.add_node("input_handler", call_input_handler) +date_formatter.add_node("run_formatting_agent", run_formatting_agent) +date_formatter.add_edge(lgraph.START, "input_handler") +date_formatter.add_edge("input_handler", "run_formatting_agent") +date_formatter.add_edge("run_formatting_agent", lgraph.END) +graph = date_formatter.compile() + + +def run_date_formatter(path: str) -> dict: + """ + Execute datetime formatter graph and parse the selected time column. 
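+
+    Illustrative usage (path is relative to the backend root; the parsed
+    dtype depends on whether the winning candidate requests UTC):
+
+        import src.ingest.format_datetime as sfordat
+
+        out = sfordat.run_date_formatter("datasets/T1_slice.csv")
+        print(out["time_col"], out["parsed_dtype"])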
+ + :param path: dataset path + :return: output including selected formatter and parsed dtype + """ + graph_in = {"path": path} + out: DateFormatterState = graph.invoke(graph_in) # type: ignore[assignment] + dataset_path = pathlib.Path(path) + dataset = tinptool.load_dataset(dataset_path) + raw_args = out["winner_formatter"] + format_args = {key: val for key, val in raw_args.items() if val is not None} + parsed_time = pd.to_datetime(dataset[out["time_col"]], **format_args) + payload = { + "time_col": out["time_col"], + "winner_formatter": out["winner_formatter"], + "parsed_dtype": str(parsed_time.dtype), + } + _LOG.info("Date formatter output: %s", payload) + return payload diff --git a/agentic_eda/jupyterlab_extension_backend/src/ingest/handle_inputs.py b/agentic_eda/jupyterlab_extension_backend/src/ingest/handle_inputs.py new file mode 100644 index 000000000..84a3474c1 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/ingest/handle_inputs.py @@ -0,0 +1,646 @@ +""" +Import as: + +import src.ingest.handle_inputs as shainp +""" + +from __future__ import annotations + +import argparse +import logging +import pathlib +from typing import Any +from typing import Literal +from typing import TypedDict + +import langchain.agents as lagents +import langchain_core.messages as lmessages +import langgraph.graph as lgraph +import pandas as pd +import pydantic + +import src.config.config as cconf +import src.tools.input_tools as tinptool + +_LOG = logging.getLogger(__name__) + + +class InputState(TypedDict): + """ + Store graph state for input checks. + """ + + path: str | pathlib.Path + done: list[str] + has_header: bool + has_missing_values: bool + error: str + info: str + cols: list[str] + temporal_cols: list[str] + numeric_val_cols: list[str] + categorical_val_cols: list[str] + bad_rows: list[dict] + + +class LLMOutput(pydantic.BaseModel): + """ + Store structured output from the header classifier. + """ + + temporal_cols: list[str] + numeric_val_cols: list[str] + categorical_val_cols: list[str] + + +class SeriesStructureFallbackOutput(pydantic.BaseModel): + """ + Store structured fallback output for ambiguous series-structure cases. + """ + + model_config = pydantic.ConfigDict(extra="forbid") + secondary_keys: list[str] + + +class BadRowDescriptor(pydantic.BaseModel): + """ + Store one fuzzy descriptor for a bad row. + """ + + model_config = pydantic.ConfigDict(extra="forbid") + row_index: int + fuzzy_descriptor: str + + +class BadRowDescriptorOutput(pydantic.BaseModel): + """ + Store structured fuzzy descriptors for detected bad rows. + """ + + model_config = pydantic.ConfigDict(extra="forbid") + descriptors: list[BadRowDescriptor] + + +class SeriesStructureAssessment(TypedDict): + """ + Store deterministic and fallback evidence for series-structure inference. + """ + + duplicate_timestamps: int + duplicate_timestamp_fraction: float + timestamps_mostly_unique: bool + candidate_entity_cols: list[str] + entity_candidate_report: dict + secondary_keys: list[str] + confidence: Literal["high", "medium", "low"] + method: Literal["deterministic", "deterministic_no_panel", "fuzzy"] + + +def _json_safe_value(value: Any) -> Any: + """ + Convert dataframe cell values into JSON-safe Python values. 
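+
+    For example, numpy.int64(3) becomes 3, numpy.float64(1.5) becomes 1.5,
+    and NaN/NaT become None; plain Python values without an .item() method
+    pass through unchanged.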
+ + :param value: raw cell value + :return: JSON-safe value + """ + if pd.isna(value): + return None + if hasattr(value, "item"): + try: + return value.item() + except Exception: + return str(value) + return value + + +def _row_to_record(row: pd.Series) -> dict[str, Any]: + """ + Convert one dataframe row into a JSON-safe mapping. + + :param row: dataframe row + :return: serialized row mapping + """ + return { + str(col): _json_safe_value(value) + for col, value in row.to_dict().items() + } + + +def detect_bad_rows(state: InputState) -> dict: + """ + Detect rows that do not behave like observations because their temporal + fields are missing or unparseable. + + Theory: + In time-series ingestion, observation rows should participate in the time + axis. Rows whose temporal fields cannot be parsed are often metadata, + annotation, footer, or malformed rows. Capturing them explicitly preserves + evidence for downstream handling without silently dropping information at + ingestion time. + + :param state: input graph state + :return: detected bad-row payload + """ + temporal_cols = state.get("temporal_cols") or [] + if not temporal_cols: + return {"bad_rows": []} + + dataset_path = pathlib.Path(str(state["path"])) + dataset = tinptool.load_dataset(dataset_path) + valid_temporal_cols = [col for col in temporal_cols if col in dataset.columns] + if not valid_temporal_cols: + return {"bad_rows": []} + + parse_matrix: dict[str, pd.Series] = {} + normalized_matrix: dict[str, pd.Series] = {} + for col in valid_temporal_cols: + raw_series = dataset[col] + normalized = raw_series.astype(str).str.strip().replace( + {"": pd.NA, "nan": pd.NA, "NaT": pd.NA} + ) + normalized_matrix[col] = normalized + parse_matrix[col] = pd.to_datetime(normalized, errors="coerce") + + bad_rows: list[dict[str, Any]] = [] + for row_idx in range(int(dataset.shape[0])): + reasons: list[str] = [] + temporal_values: dict[str, Any] = {} + has_temporal_signal = False + has_parseable_temporal = False + for col in valid_temporal_cols: + raw_value = normalized_matrix[col].iloc[row_idx] + parsed_value = parse_matrix[col].iloc[row_idx] + temporal_values[col] = _json_safe_value(raw_value) + if not pd.isna(raw_value): + has_temporal_signal = True + if not pd.isna(parsed_value): + has_parseable_temporal = True + continue + if pd.isna(raw_value): + reasons.append(f"missing_temporal_value:{col}") + else: + raw_text = str(raw_value).strip() + reasons.append(f"unparseable_temporal_value:{col}") + if raw_text.endswith(":"): + reasons.append(f"annotation_like_temporal_value:{col}") + if has_parseable_temporal: + continue + if not has_temporal_signal and not reasons: + continue + row = dataset.iloc[row_idx] + bad_rows.append( + { + "row_index": int(row_idx), + "csv_row_number": int(row_idx) + 2, + "temporal_values": temporal_values, + "reasons": sorted(dict.fromkeys(reasons)), + "raw_row": _row_to_record(row), + "fuzzy_descriptor": "", + } + ) + return {"bad_rows": bad_rows} + + +def describe_bad_rows(state: InputState) -> dict: + """ + Attach short fuzzy descriptors to already-detected bad rows. + + Theory: + Deterministic rules can reliably tell us that a row does not behave like a + data observation, but they are less expressive about the row's likely role. + A constrained model can add a short human-readable descriptor such as + metadata row, blank footer row, or malformed timestamp row without being + allowed to invent new row IDs or alter the deterministic evidence. 
+ + :param state: input graph state + :return: bad rows with fuzzy descriptors + """ + bad_rows = [dict(row) for row in (state.get("bad_rows") or [])] + if not bad_rows: + return {"bad_rows": []} + + llm = cconf.get_chat_model(model="gpt-4.1") + agent = lagents.create_agent( + model=llm, + tools=[], + system_prompt=( + "You are labeling already-detected bad rows in a dataset. " + "For each row_index, return a short fuzzy descriptor such as " + "'metadata/control row', 'blank/incomplete row', " + "'annotation row', or 'malformed timestamp row'. " + "Do not change row_index values and do not add rows." + ), + response_format=BadRowDescriptorOutput, + ) + out = agent.invoke( + { + "messages": [ + lmessages.HumanMessage( + content=f"Detected bad rows: {bad_rows}" + ) + ] + } + ) + descriptors = out["structured_response"].model_dump().get("descriptors") or [] + descriptor_map = { + int(item["row_index"]): str(item["fuzzy_descriptor"]).strip() + for item in descriptors + } + for row in bad_rows: + row["fuzzy_descriptor"] = descriptor_map.get( + int(row["row_index"]), + "bad/non-data row", + ) + return {"bad_rows": bad_rows} + + +def _parse_time_series( + path: str | pathlib.Path, + time_col: str, + winner_formatter: dict | None = None, +) -> pd.Series: + """ + Parse a proposed time column to measure whether it behaves like a real time + axis. + + Theory: + Handle-input classification identifies candidate temporal columns, but it + does not establish whether the observed values actually parse into a stable + datetime axis. Parseability is the empirical question: can the values be + converted into usable timestamps with only a small failure rate? That check + is important because schema inference should rely on observed value + behavior, not just column labels or LLM guesses. + + :param path: dataset path + :param time_col: selected time column + :param winner_formatter: optional datetime parsing kwargs + :return: parsed timestamp series + """ + dataset = tinptool.load_dataset(pathlib.Path(str(path))) + format_args = winner_formatter or {} + format_args = {key: val for key, val in format_args.items() if val is not None} + try: + return pd.to_datetime(dataset[time_col], errors="coerce", **format_args) + except Exception: + return pd.to_datetime(dataset[time_col], errors="coerce") + + +def _select_entity_candidate_cols( + *, + cols: list[str], + time_col: str, + numeric_val_cols: list[str], + categorical_val_cols: list[str], + column_profiles: dict, +) -> list[str]: + """ + Select plausible entity-key candidates using value-level heuristics. + + Theory: + Entity keys should behave like identifiers that partition repeated + timestamps into coherent per-entity series. Measurement columns usually do + not do that, even if they repeat. The candidate filter therefore keeps + likely identifier-like categoricals and only a narrow class of integer-like + numeric columns, while excluding continuous measurements, binary flags, and + near-row-unique columns. 
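+
+    For example (column names are hypothetical), a "station_id" categorical
+    with a dozen distinct values is kept, an integer-like "sensor_id" column
+    whose values repeat heavily is kept, while a continuous wind-speed
+    measurement, a 0/1 flag, and any column that is unique on nearly every
+    row (unique_ratio >= 0.95) are all rejected.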
+ + :param cols: all dataset columns + :param time_col: selected time column + :param numeric_val_cols: numeric value columns + :param categorical_val_cols: categorical value columns + :param column_profiles: per-column deterministic profiles + :return: filtered candidate entity columns + """ + candidates: list[str] = [] + numeric_set = set(numeric_val_cols) + categorical_set = set(categorical_val_cols) + for col in cols: + if col == time_col: + continue + profile = column_profiles.get(col) or {} + n_unique = int(profile.get("n_unique", 0)) + unique_ratio = float(profile.get("unique_ratio", 1.0)) + if n_unique <= 1 or unique_ratio >= 0.95: + continue + if col in categorical_set: + candidates.append(col) + continue + if col in numeric_set: + if bool(profile.get("is_binary_like")): + continue + if not bool(profile.get("is_integer_like")): + continue + if not bool(profile.get("is_nonnegative_like")): + continue + if n_unique > 200: + continue + if unique_ratio > 0.50: + continue + candidates.append(col) + return candidates + + +def _fuzzy_secondary_key_agent( + *, + path: str, + time_col: str, + candidate_entity_cols: list[str], + entity_candidate_report: dict, + column_profiles: dict, +) -> list[str]: + """ + Resolve ambiguous panel-vs-multivariate cases with a constrained LLM tie + breaker. + + Theory: + Deterministic heuristics are strongest when the data exhibits clean + identifier behavior. Ambiguous cases remain, especially when columns are + poorly named or identifier-like columns are partially numeric. In those + cases, a model can act as a constrained judge over a narrow candidate set, + using deterministic evidence rather than inventing columns freely. This + keeps fuzzy reasoning explainable and bounded. + + :param path: dataset path + :param time_col: selected time column + :param candidate_entity_cols: filtered entity-key candidates + :param entity_candidate_report: deterministic scoring report + :param column_profiles: per-column profiles + :return: chosen secondary keys, possibly empty + """ + if not candidate_entity_cols: + return [] + llm = cconf.get_chat_model(model="gpt-4.1") + agent = lagents.create_agent( + model=llm, + tools=[tinptool.extract_head, tinptool.extract_metadata], + system_prompt=( + "You are resolving an ambiguous series-structure classification. " + "Choose secondary keys only from the provided candidate_entity_cols. " + "Return [] if the dataset still looks like a single or wide " + "multivariate time series rather than panel data. Prefer the " + "deterministic evidence report over column names." 
+ ), + response_format=SeriesStructureFallbackOutput, + ) + profile_subset = { + col: column_profiles.get(col, {}) + for col in candidate_entity_cols + } + out = agent.invoke( + { + "messages": [ + lmessages.HumanMessage( + content=( + f"Dataset path: {path}\n" + f"time_col: {time_col}\n" + f"candidate_entity_cols: {candidate_entity_cols}\n" + f"entity_candidate_report: {entity_candidate_report}\n" + f"column_profiles: {profile_subset}" + ) + ) + ] + } + ) + structured = out["structured_response"].model_dump() + secondary_keys: list[str] = [] + seen: set[str] = set() + allowed = set(candidate_entity_cols) + for col in structured.get("secondary_keys") or []: + col_name = str(col) + if col_name not in allowed or col_name in seen: + continue + seen.add(col_name) + secondary_keys.append(col_name) + return secondary_keys + + +def assess_series_structure( + *, + path: str | pathlib.Path, + cols: list[str], + time_col: str, + numeric_val_cols: list[str], + categorical_val_cols: list[str], + winner_formatter: dict | None = None, +) -> SeriesStructureAssessment: + """ + Assess whether the dataset behaves like a single series, panel, or wide + multivariate time series. + + Theory: + The decisive signal for panel structure is not the column name but the time + axis itself. If timestamps are already mostly unique, there is no need to + search for entity keys: the data is behaving like one wide time-indexed + table. Only when timestamps repeat meaningfully should we look for + identifier columns that make `(entity, time)` close to unique. This staging + avoids promoting ordinary measurement columns into fake entity IDs. + + :param path: dataset path + :param cols: all dataset columns + :param time_col: selected time column + :param numeric_val_cols: numeric value columns + :param categorical_val_cols: categorical value columns + :param winner_formatter: optional datetime parsing kwargs + :return: series-structure assessment + """ + string_path = str(path) + timestamp = _parse_time_series(string_path, time_col, winner_formatter) + valid_ts = timestamp.dropna() + duplicate_timestamps = int(valid_ts.duplicated().sum()) + duplicate_fraction = ( + 0.0 if valid_ts.empty else float(duplicate_timestamps / max(1, int(valid_ts.shape[0]))) + ) + timestamps_mostly_unique = duplicate_timestamps == 0 or duplicate_fraction < 0.01 + profiles_out = tinptool.extract_column_profiles.invoke({"path": string_path}) + column_profiles = profiles_out.get("column_profiles") or {} + candidate_entity_cols = _select_entity_candidate_cols( + cols=cols, + time_col=time_col, + numeric_val_cols=numeric_val_cols, + categorical_val_cols=categorical_val_cols, + column_profiles=column_profiles, + ) + if timestamps_mostly_unique: + return { + "duplicate_timestamps": duplicate_timestamps, + "duplicate_timestamp_fraction": duplicate_fraction, + "timestamps_mostly_unique": True, + "candidate_entity_cols": [], + "entity_candidate_report": { + "time_col": time_col, + "candidate_cols": [], + "candidates": [], + "recommended_secondary_keys": [], + }, + "secondary_keys": [], + "confidence": "high", + "method": "deterministic_no_panel", + } + entity_candidate_report = tinptool.score_entity_candidates.invoke( + { + "path": string_path, + "time_col": time_col, + "candidate_cols": candidate_entity_cols, + "max_combo_size": 2, + } + ) + recommended_secondary_keys = ( + entity_candidate_report.get("recommended_secondary_keys") or [] + ) + candidates = entity_candidate_report.get("candidates") or [] + top_score = 0.0 if not candidates else 
float(candidates[0].get("score", 0.0)) + if recommended_secondary_keys: + confidence: Literal["high", "medium", "low"] = ( + "high" if top_score >= 0.75 else "medium" + ) + return { + "duplicate_timestamps": duplicate_timestamps, + "duplicate_timestamp_fraction": duplicate_fraction, + "timestamps_mostly_unique": False, + "candidate_entity_cols": candidate_entity_cols, + "entity_candidate_report": entity_candidate_report, + "secondary_keys": recommended_secondary_keys, + "confidence": confidence, + "method": "deterministic", + } + fuzzy_secondary_keys = _fuzzy_secondary_key_agent( + path=string_path, + time_col=time_col, + candidate_entity_cols=candidate_entity_cols, + entity_candidate_report=entity_candidate_report, + column_profiles=column_profiles, + ) + return { + "duplicate_timestamps": duplicate_timestamps, + "duplicate_timestamp_fraction": duplicate_fraction, + "timestamps_mostly_unique": False, + "candidate_entity_cols": candidate_entity_cols, + "entity_candidate_report": entity_candidate_report, + "secondary_keys": fuzzy_secondary_keys, + "confidence": "low" if fuzzy_secondary_keys else "medium", + "method": "fuzzy", + } + + +def header_classification_agent(state: InputState) -> dict: + """ + Classify temporal, numeric, and categorical columns. + + :param state: input graph state + :return: column classification payload + """ + llm = cconf.get_chat_model(model="gpt-4.1") + agent = lagents.create_agent( + model=llm, + tools=[tinptool.extract_head, tinptool.extract_metadata], + system_prompt=( + "You are a header classifier agent. Use tools to identify temporal " + "columns and classify the remaining value columns as numeric or " + "categorical. Output JSON with keys temporal_cols, " + "numeric_val_cols, and categorical_val_cols." + ), + response_format=LLMOutput, + ) + out = agent.invoke( + { + "messages": [ + lmessages.HumanMessage( + content=f"The dataset is in {state['path']}" + ) + ] + } + ) + result = out["structured_response"].model_dump() + return result + + +def error_node(state: InputState) -> dict: + """ + Log an error node transition. + + :param state: input graph state + :return: empty update + """ + _LOG.error("Input handler failed: %s", state["error"]) + return {} + + +def has_header(state: InputState) -> bool: + """ + Check if header validation passed. + + :param state: input graph state + :return: true when headers are valid + """ + has_header_flag = state["has_header"] + return has_header_flag + + +def run_input_handler(path: str | pathlib.Path) -> dict: + """ + Run dataset header and column classification checks. 
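+
+    Illustrative usage (path is relative to the backend root):
+
+        import src.ingest.handle_inputs as shainp
+
+        out = shainp.run_input_handler("datasets/T1_slice.csv")
+        print(out["temporal_cols"], len(out["bad_rows"]))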
+ + :param path: path to dataset + :return: final graph output + """ + graph_builder = lgraph.StateGraph(InputState) + graph_builder.add_node("header_analysis", tinptool.analyze_header) + graph_builder.add_node( + "header_classification_agent", + header_classification_agent, + ) + graph_builder.add_node("detect_bad_rows", detect_bad_rows) + graph_builder.add_node("describe_bad_rows", describe_bad_rows) + graph_builder.add_node("error", error_node) + graph_builder.add_edge(lgraph.START, "header_analysis") + graph_builder.add_conditional_edges( + "header_analysis", + has_header, + { + True: "header_classification_agent", + False: "error", + }, + ) + graph_builder.add_edge("error", lgraph.END) + graph_builder.add_edge("header_classification_agent", "detect_bad_rows") + graph_builder.add_edge("detect_bad_rows", "describe_bad_rows") + graph_builder.add_edge("describe_bad_rows", lgraph.END) + graph = graph_builder.compile() + init_state: InputState = { + "path": str(path), + "done": [], + "has_header": True, + "has_missing_values": False, + "error": "", + "info": "", + "cols": [], + "temporal_cols": [], + "numeric_val_cols": [], + "categorical_val_cols": [], + "bad_rows": [], + } + out = graph.invoke(init_state) + _LOG.info("Input handler output: %s", out) + return out + + +def _parse_args() -> argparse.Namespace: + """ + Parse command-line arguments. + + :return: parsed arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--path", + required=True, + help="Path to dataset file.", + ) + args = parser.parse_args() + return args + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + args = _parse_args() + run_input_handler(args.path) diff --git a/agentic_eda/jupyterlab_extension_backend/src/ingest/infer_structure.py b/agentic_eda/jupyterlab_extension_backend/src/ingest/infer_structure.py new file mode 100644 index 000000000..a57f094f6 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/ingest/infer_structure.py @@ -0,0 +1,194 @@ +""" +Import as: + +import src.ingest.infer_structure as sinferstruct +""" + +from __future__ import annotations + +import argparse +import logging +from typing import TypedDict + +import langgraph.graph as lgraph + +import src.ingest.infer_type as sinfert +import src.tools.input_tools as tinptool + +_LOG = logging.getLogger(__name__) + + +class FeatureStructureState(TypedDict): + """ + Store inferred semantic feature groupings. + """ + + numeric_continuous_cols: list[str] + numeric_count_cols: list[str] + binary_flag_cols: list[str] + categorical_feature_cols: list[str] + known_exogenous_cols: list[str] + target_cols: list[str] + covariate_cols: list[str] + + +class CompositeState(TypedDict): + """ + Store graph state for feature-structure inference. 
+ """ + + path: str + done: list[str] + has_header: bool + has_missing_values: bool + error: str + info: str + cols: list[str] + temporal_cols: list[str] + numeric_val_cols: list[str] + categorical_val_cols: list[str] + bad_rows: list[dict] + metadata: dict + time_col: str + candidates: list[dict] + winner_formatter: dict + entity_col: str | None + numeric_cols: list[str] + nonnegative_cols: list[str] + jump_mult: float + report: dict + summary: str + flag: str + type: str + primary_key: str + secondary_keys: list[str] + numeric_continuous_cols: list[str] + numeric_count_cols: list[str] + binary_flag_cols: list[str] + categorical_feature_cols: list[str] + known_exogenous_cols: list[str] + target_cols: list[str] + covariate_cols: list[str] + + +def call_infer_type(state: CompositeState) -> dict: + """ + Run the sequential pipeline up to series-type inference. + + :param state: graph state + :return: composite payload from infer_type + """ + payload = sinfert.run_infer_type(state["path"]) + return payload + + +def infer_structure(state: CompositeState) -> dict: + """ + Infer semantic feature roles for EDA deterministically from observed column + behavior. + + :param state: graph state + :return: inferred feature groupings + """ + feature_bucket_report = tinptool.infer_feature_buckets.invoke( + { + "path": state["path"], + "time_col": state["primary_key"], + "secondary_keys": state["secondary_keys"], + } + ) + trace_payload = { + "primary_key": state["primary_key"], + "secondary_keys": state["secondary_keys"], + "series_type": state["type"], + "feature_bucket_report": feature_bucket_report, + } + tinptool.write_stage_trace(state["path"], "infer_structure", trace_payload) + payload = { + "numeric_continuous_cols": feature_bucket_report["numeric_continuous_cols"], + "numeric_count_cols": feature_bucket_report["numeric_count_cols"], + "binary_flag_cols": feature_bucket_report["binary_flag_cols"], + "categorical_feature_cols": feature_bucket_report["categorical_feature_cols"], + "known_exogenous_cols": feature_bucket_report["known_exogenous_cols"], + "target_cols": feature_bucket_report["target_cols"], + "covariate_cols": feature_bucket_report["covariate_cols"], + } + return payload + + +feature_structure = lgraph.StateGraph(CompositeState) +feature_structure.add_node("infer_type_pipeline", call_infer_type) +feature_structure.add_node("infer_structure", infer_structure) +feature_structure.add_edge(lgraph.START, "infer_type_pipeline") +feature_structure.add_edge("infer_type_pipeline", "infer_structure") +feature_structure.add_edge("infer_structure", lgraph.END) +graph = feature_structure.compile() + + +def run_infer_structure(path: str) -> dict: + """ + Execute feature-structure inference end to end. 
+ + :param path: dataset path + :return: full composite graph payload + """ + init_state: CompositeState = { + "path": path, + "done": [], + "has_header": True, + "has_missing_values": False, + "error": "", + "info": "", + "cols": [], + "temporal_cols": [], + "numeric_val_cols": [], + "categorical_val_cols": [], + "bad_rows": [], + "metadata": {}, + "time_col": "", + "candidates": [], + "winner_formatter": {}, + "entity_col": None, + "numeric_cols": [], + "nonnegative_cols": [], + "jump_mult": 20.0, + "report": {}, + "summary": "", + "flag": "", + "type": "", + "primary_key": "", + "secondary_keys": [], + "numeric_continuous_cols": [], + "numeric_count_cols": [], + "binary_flag_cols": [], + "categorical_feature_cols": [], + "known_exogenous_cols": [], + "target_cols": [], + "covariate_cols": [], + } + out = graph.invoke(init_state) + payload: CompositeState = out + _LOG.info("Feature structure output: %s", payload) + return payload + + +def _parse_args() -> argparse.Namespace: + """ + Parse command-line arguments. + + :return: parsed arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--path", + required=True, + help="Path to dataset file.", + ) + args = parser.parse_args() + return args + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + args = _parse_args() + run_infer_structure(args.path) diff --git a/agentic_eda/jupyterlab_extension_backend/src/ingest/infer_type.py b/agentic_eda/jupyterlab_extension_backend/src/ingest/infer_type.py new file mode 100644 index 000000000..e3fe05786 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/ingest/infer_type.py @@ -0,0 +1,222 @@ +""" +Import as: + +import src.ingest.infer_type as sinfert +""" + +from __future__ import annotations + +import argparse +import logging +import pathlib +from typing import Literal +from typing import TypedDict + +import langgraph.graph as lgraph + +import src.ingest.format_datetime as sfordat +import src.ingest.handle_inputs as shainp +import src.tools.input_tools as tinptool + +_LOG = logging.getLogger(__name__) + + +class SeriesTypeState(TypedDict): + """ + Store the inferred series structure. + """ + + type: Literal["single", "multiple", "multivariate"] + primary_key: str + secondary_keys: list[str] + + +class CompositeState(TypedDict): + """ + Store graph state for series-structure inference. + """ + + path: str + done: list[str] + has_header: bool + has_missing_values: bool + error: str + info: str + cols: list[str] + temporal_cols: list[str] + numeric_val_cols: list[str] + categorical_val_cols: list[str] + bad_rows: list[dict] + metadata: dict + time_col: str + candidates: list[dict] + winner_formatter: dict + entity_col: str | None + numeric_cols: list[str] + nonnegative_cols: list[str] + jump_mult: float + report: dict + summary: str + flag: str + type: Literal["single", "multiple", "multivariate"] + primary_key: str + secondary_keys: list[str] + + +def call_input_handler(state: CompositeState) -> dict: + """ + Run input handler and collect column metadata. 
+ + :param state: graph state + :return: column classification payload + """ + dataset_path = pathlib.Path(state["path"]) + dataset = tinptool.load_dataset(dataset_path) + out = shainp.run_input_handler(state["path"]) + metadata = tinptool.extract_metadata.invoke({"path": state["path"]}) + payload = { + "done": out.get("done") or [], + "has_header": bool(out.get("has_header", True)), + "has_missing_values": bool(out.get("has_missing_values", False)), + "error": str(out.get("error") or ""), + "info": str(out.get("info") or ""), + "cols": [str(col) for col in dataset.columns.tolist()], + "temporal_cols": out.get("temporal_cols") or [], + "numeric_val_cols": out.get("numeric_val_cols") or [], + "categorical_val_cols": out.get("categorical_val_cols") or [], + "bad_rows": out.get("bad_rows") or [], + "numeric_cols": out.get("numeric_val_cols") or [], + "metadata": metadata, + } + return payload + + +def call_date_formatter(state: CompositeState) -> dict: + """ + Run the datetime formatter graph. + + :param state: graph state + :return: selected time column + """ + out: sfordat.DateFormatterState = sfordat.graph.invoke( # type: ignore + {"path": state["path"]} + ) + payload = { + "time_col": out["time_col"], + "candidates": out.get("candidates") or [], + "winner_formatter": out.get("winner_formatter") or {}, + } + return payload + + +def infer_type(state: CompositeState) -> dict: + """ + Infer whether the dataset is single-series, panel, or multivariate using + deterministic value-level evidence. + + :param state: graph state + :return: inferred series structure + """ + structure_assessment = shainp.assess_series_structure( + path=state["path"], + cols=state["cols"], + time_col=state["time_col"], + numeric_val_cols=state["numeric_val_cols"], + categorical_val_cols=state["categorical_val_cols"], + winner_formatter=state["winner_formatter"], + ) + primary_key = state["time_col"] + secondary_keys = structure_assessment.get("secondary_keys") or [] + if secondary_keys: + inferred_type: Literal["single", "multiple", "multivariate"] = "multiple" + elif len(state["numeric_val_cols"]) > 1: + inferred_type = "multivariate" + else: + inferred_type = "single" + trace_payload = { + "time_col": primary_key, + "structure_assessment": structure_assessment, + "inferred_type": inferred_type, + "secondary_keys": secondary_keys, + } + tinptool.write_stage_trace(state["path"], "infer_type", trace_payload) + payload = { + "type": inferred_type, + "primary_key": primary_key, + "secondary_keys": secondary_keys, + "entity_col": secondary_keys[0] if secondary_keys else None, + } + return payload + + +series_type = lgraph.StateGraph(CompositeState) +series_type.add_node("input_handler", call_input_handler) +series_type.add_node("date_formatter", call_date_formatter) +series_type.add_node("infer_type", infer_type) +series_type.add_edge(lgraph.START, "input_handler") +series_type.add_edge("input_handler", "date_formatter") +series_type.add_edge("date_formatter", "infer_type") +series_type.add_edge("infer_type", lgraph.END) +graph = series_type.compile() + + +def run_infer_type(path: str) -> dict: + """ + Execute series-structure inference end to end. 
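+
+    Illustrative usage (for the bundled T1_slice.csv the expected outcome is
+    a wide multivariate series with no secondary keys, but the result always
+    depends on the observed data):
+
+        import src.ingest.infer_type as sinfert
+
+        out = sinfert.run_infer_type("datasets/T1_slice.csv")
+        print(out["type"], out["primary_key"], out["secondary_keys"])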
+ + :param path: dataset path + :return: full composite graph payload + """ + init_state: CompositeState = { + "path": path, + "done": [], + "has_header": True, + "has_missing_values": False, + "error": "", + "info": "", + "cols": [], + "temporal_cols": [], + "numeric_val_cols": [], + "categorical_val_cols": [], + "bad_rows": [], + "metadata": {}, + "time_col": "", + "candidates": [], + "winner_formatter": {}, + "entity_col": None, + "numeric_cols": [], + "nonnegative_cols": [], + "jump_mult": 20.0, + "report": {}, + "summary": "", + "flag": "", + "type": "single", + "primary_key": "", + "secondary_keys": [], + } + out = graph.invoke(init_state) + payload: CompositeState = out + _LOG.info("Series type output: %s", payload) + return payload + + +def _parse_args() -> argparse.Namespace: + """ + Parse command-line arguments. + + :return: parsed arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--path", + required=True, + help="Path to dataset file.", + ) + args = parser.parse_args() + return args + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + args = _parse_args() + run_infer_type(args.path) diff --git a/agentic_eda/jupyterlab_extension_backend/src/ingest/integrity.py b/agentic_eda/jupyterlab_extension_backend/src/ingest/integrity.py new file mode 100644 index 000000000..71ee4670c --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/ingest/integrity.py @@ -0,0 +1,402 @@ +""" +Import as: + +import src.ingest.integrity as sinteg +""" + +import logging +import pathlib +from typing import Literal +from typing import TypedDict + +import langchain.agents as lagents +import langchain_core.messages as lmessages +import langgraph.graph as lgraph +import pandas as pd +import pydantic + +import src.config.config as cconf +import src.ingest.format_datetime as sfordat +import src.ingest.handle_inputs as shainp +import src.ingest.infer_type as sinfert +import src.tools.input_tools as tinptool + +_LOG = logging.getLogger(__name__) + + +class IntegrityState(TypedDict): + """ + Store graph state for integrity checks. + """ + + path: str + time_col: str | None + winner_formatter: dict + cols: list[str] + temporal_cols: list[str] + bad_rows: list[dict] + entity_col: str | None + numeric_cols: list[str] + categorical_val_cols: list[str] + metadata: dict + secondary_keys: list[str] + nonnegative_cols: list[str] + jump_mult: float + report: dict + summary: str + flag: str + + +class IntegrityJudgeOutput(pydantic.BaseModel): + """ + Store structured LLM judgment. + """ + + summary: str + flag: Literal["yes", "no"] + + +def call_date_formatter(state: IntegrityState) -> dict: + """ + Run the datetime formatter graph. + + :param state: integrity graph state + :return: selected time column and formatter + """ + out: sfordat.DateFormatterState = sfordat.graph.invoke( # type: ignore + {"path": state["path"]} + ) + payload = { + "time_col": out["time_col"], + "winner_formatter": out["winner_formatter"], + } + return payload + + +def _maybe_infer_columns(state: IntegrityState) -> dict: + """ + Collect schema context needed by downstream integrity checks. 
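+
+    Columns are re-derived only when the incoming state lacks them, so the
+    integrity graph works both standalone and inside a larger pipeline.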
+ + :param state: integrity graph state + :return: schema-related state updates + """ + if ( + state.get("cols") + and state.get("temporal_cols") + and state.get("numeric_cols") + and state.get("metadata") + ): + payload = {} + else: + dataset_path = pathlib.Path(state["path"]) + dataset = tinptool.load_dataset(dataset_path) + out = shainp.run_input_handler(state["path"]) + metadata = tinptool.extract_metadata.invoke({"path": state["path"]}) + payload = { + "cols": [str(col) for col in dataset.columns.tolist()], + "temporal_cols": out.get("temporal_cols") or [], + "bad_rows": out.get("bad_rows") or [], + "numeric_cols": out.get("numeric_val_cols") or [], + "categorical_val_cols": out.get("categorical_val_cols") or [], + "metadata": metadata, + } + return payload + + +def call_infer_type(state: IntegrityState) -> dict: + """ + Infer the series structure and derive the temporary entity key. + + :param state: integrity graph state + :return: inferred secondary keys and first entity key + """ + infer_state: sinfert.CompositeState = { + "path": state["path"], + "cols": state.get("cols") or [], + "temporal_cols": state.get("temporal_cols") or [], + "numeric_val_cols": state.get("numeric_cols") or [], + "categorical_val_cols": state.get("categorical_val_cols") or [], + "bad_rows": state.get("bad_rows") or [], + "metadata": state.get("metadata") or {}, + "time_col": state["time_col"] or "", + "done": [], + "has_header": True, + "has_missing_values": False, + "error": "", + "info": "", + "candidates": [], + "winner_formatter": state.get("winner_formatter") or {}, + "entity_col": None, + "numeric_cols": state.get("numeric_cols") or [], + "nonnegative_cols": [], + "jump_mult": 20.0, + "report": {}, + "summary": "", + "flag": "", + "type": "single", + "primary_key": "", + "secondary_keys": [], + } + out = sinfert.infer_type(infer_state) + secondary_keys = out.get("secondary_keys") or [] + entity_col = secondary_keys[0] if secondary_keys else None + payload = { + "secondary_keys": secondary_keys, + "entity_col": entity_col, + } + return payload + + +def run_integrity_checks(state: IntegrityState) -> dict: + """ + Run deterministic integrity checks on a dataset. 
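+
+    Checks run in order: empty dataset, missing or unknown time column,
+    duplicate timestamps, duplicate (entity, timestamp) pairs, negative
+    values in declared nonnegative columns, and implausible jumps whose
+    absolute first difference exceeds `jump_mult` times the median (or
+    mean, as a fallback) absolute step.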
+ + :param state: integrity graph state + :return: report payload + """ + dataset_path = pathlib.Path(state["path"]) + dataset = tinptool.load_dataset(dataset_path) + issues: list[dict] = [] + summary: dict = { + "n_rows": int(dataset.shape[0]), + "n_cols": int(dataset.shape[1]), + } + if dataset.shape[0] == 0: + issues.append({"type": "empty_dataset", "msg": "Dataset has 0 rows."}) + report = {"summary": summary, "issues": issues} + payload = {"report": report} + return payload + time_col = state.get("time_col") + if time_col is None or time_col not in dataset.columns: + issues.append( + { + "type": "missing_time_col", + "msg": f"time_col missing: {time_col!r}", + } + ) + report = {"summary": summary, "issues": issues} + payload = {"report": report} + return payload + format_args = state.get("winner_formatter") or {} + format_args = { + key: val + for key, val in format_args.items() + if val is not None + } + try: + timestamp = pd.to_datetime( + dataset[time_col], + errors="coerce", + **format_args, + ) + except Exception: + timestamp = pd.to_datetime(dataset[time_col], errors="coerce") + summary["n_nat_time"] = int(timestamp.isna().sum()) + summary["min_time"] = ( + None if timestamp.dropna().empty else str(timestamp.dropna().min()) + ) + summary["max_time"] = ( + None if timestamp.dropna().empty else str(timestamp.dropna().max()) + ) + duplicate_timestamps = int(timestamp.dropna().duplicated().sum()) + summary["duplicate_timestamps"] = duplicate_timestamps + if duplicate_timestamps > 0: + issues.append( + {"type": "duplicate_timestamps", "count": duplicate_timestamps} + ) + entity_col = state.get("entity_col") + # TODO: Use all inferred secondary_keys as a composite entity key for + # integrity checks; for now we temporarily use only the first key. 
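+    # With an entity key, duplicates are judged on (entity, timestamp)
+    # pairs; plain repeated timestamps were already flagged above.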
+ if entity_col is not None and entity_col in dataset.columns: + summary["n_entities"] = int(dataset[entity_col].nunique(dropna=True)) + tmp = dataset[[entity_col]].copy() + tmp["_ts"] = timestamp + duplicate_pairs = int( + tmp.dropna(subset=[entity_col, "_ts"]) + .duplicated(subset=[entity_col, "_ts"]) + .sum() + ) + summary["duplicate_entity_timestamp_pairs"] = duplicate_pairs + if duplicate_pairs > 0: + issues.append( + { + "type": "duplicate_entity_timestamp_pairs", + "count": duplicate_pairs, + } + ) + else: + summary["duplicate_entity_timestamp_pairs"] = None + numeric_cols = [col for col in state.get("numeric_cols") or []] + numeric_cols = [col for col in numeric_cols if col in dataset.columns] + nonnegative_cols = [col for col in state.get("nonnegative_cols") or []] + negative_report: dict = {} + for col in nonnegative_cols: + if col not in dataset.columns: + continue + series = pd.to_numeric(dataset[col], errors="coerce") + n_negative = int((series < 0).sum(skipna=True)) + if n_negative > 0: + negative_report[col] = n_negative + summary["negatives_in_nonnegative_cols"] = negative_report + if negative_report: + issues.append({"type": "negative_values", "details": negative_report}) + jump_mult = float(state.get("jump_mult") or 20.0) + jumps: dict = {} + if numeric_cols: + selected_cols = [time_col] + if entity_col is not None and entity_col in dataset.columns: + selected_cols.append(entity_col) + selected_cols.extend(numeric_cols) + tmp = dataset[selected_cols].copy() + tmp["_ts"] = timestamp + if entity_col is None or entity_col not in tmp.columns: + sort_cols = ["_ts"] + else: + sort_cols = [entity_col, "_ts"] + tmp = tmp.sort_values(sort_cols) + for col in numeric_cols: + tmp[col] = pd.to_numeric(tmp[col], errors="coerce") + if entity_col is None or entity_col not in tmp.columns: + diff = tmp[col].diff() + else: + diff = tmp.groupby(entity_col)[col].diff() + diff_abs = diff.abs() + scale = diff_abs.median() + if pd.isna(scale) or float(scale) <= 0.0: + scale = diff_abs.mean() + if pd.isna(scale) or float(scale) <= 0.0: + continue + threshold = float(scale) * jump_mult + flagged = diff_abs > threshold + n_flagged = int(flagged.sum(skipna=True)) + if n_flagged <= 0: + continue + examples: list[dict] = [] + flagged_idx = tmp.index[flagged.fillna(False)][:5] + for idx in flagged_idx: + diff_val = diff.loc[idx] + curr_val = tmp.loc[idx, col] + if pd.isna(diff_val) or pd.isna(curr_val): + prev_val = None + else: + prev_val = float(curr_val - diff_val) + example = { + "col": col, + "entity": ( + None + if entity_col is None or entity_col not in tmp.columns + else tmp.loc[idx, entity_col] + ), + "time": ( + None + if pd.isna(tmp.loc[idx, "_ts"]) + else str(tmp.loc[idx, "_ts"]) + ), + "prev": prev_val, + "curr": None if pd.isna(curr_val) else float(curr_val), + "diff": None if pd.isna(diff_val) else float(diff_val), + "threshold": float(threshold), + } + examples.append(example) + jumps[col] = { + "count": n_flagged, + "threshold": threshold, + "examples": examples, + } + issues.append( + { + "type": "impossible_jumps", + "col": col, + "count": n_flagged, + } + ) + summary["jump_mult"] = jump_mult + summary["jumps"] = jumps + report = {"summary": summary, "issues": issues} + payload = {"report": report} + return payload + + +def integrity_llm_summary(state: IntegrityState) -> dict: + """ + Summarize integrity report and provide go/no-go flag. 
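+
+    The judge sees only the deterministic report; a `yes` flag means the
+    dataset may proceed to downstream stages.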
+ + :param state: integrity graph state + :return: summary and decision flag + """ + llm = cconf.get_chat_model(model="gpt-4.1") + agent = lagents.create_agent( + model=llm, + tools=[], + system_prompt=( + "You are an integrity judge. Decide if the dataset can proceed. " + "Return JSON with keys summary and flag. Set flag to yes only when " + "there are no meaningful integrity issues." + ), + response_format=IntegrityJudgeOutput, + ) + out = agent.invoke( + { + "messages": [ + lmessages.HumanMessage( + content=f"Here is the integrity report: {state['report']}" + ) + ] + } + ) + structured_response = out["structured_response"].model_dump() + payload = { + "summary": structured_response["summary"], + "flag": structured_response["flag"], + } + return payload + + +integrity = lgraph.StateGraph(IntegrityState) +integrity.add_node("date_formatter", call_date_formatter) +integrity.add_node("maybe_infer_columns", _maybe_infer_columns) +integrity.add_node("infer_type", call_infer_type) +integrity.add_node("run_integrity_checks", run_integrity_checks) +integrity.add_node("integrity_llm_summary", integrity_llm_summary) +integrity.add_edge(lgraph.START, "date_formatter") +integrity.add_edge("date_formatter", "maybe_infer_columns") +integrity.add_edge("maybe_infer_columns", "infer_type") +integrity.add_edge("infer_type", "run_integrity_checks") +integrity.add_edge("run_integrity_checks", "integrity_llm_summary") +integrity.add_edge("integrity_llm_summary", lgraph.END) +graph = integrity.compile() + + +def run_integrity(path: str) -> dict: + """ + Execute integrity graph end to end. + + :param path: dataset path + :return: integrity report with summary and flag + """ + init_state: IntegrityState = { + "path": path, + "time_col": None, + "winner_formatter": {}, + "cols": [], + "temporal_cols": [], + "bad_rows": [], + "entity_col": None, + "numeric_cols": [], + "categorical_val_cols": [], + "metadata": {}, + "secondary_keys": [], + "nonnegative_cols": [], + "jump_mult": 20.0, + "report": {}, + "summary": "", + "flag": "", + } + out = graph.invoke(init_state) + payload = { + "report": out["report"], + "summary": out["summary"], + "flag": out["flag"], + } + _LOG.info("Integrity output: %s", payload) + return payload diff --git a/agentic_eda/jupyterlab_extension_backend/src/main.py b/agentic_eda/jupyterlab_extension_backend/src/main.py new file mode 100644 index 000000000..f5fd3e70f --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/main.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python +""" +Import as: + +import src.main as smain +""" + +import argparse +import json +import logging + +import src.ingest.compute_temporal_stats as sctstats +import src.ingest.format_datetime as sfordat +import src.ingest.handle_inputs as shainp +import src.ingest.infer_structure as sinferstruct +import src.ingest.infer_type as sinfert +import src.ingest.integrity as sinteg +import src.quality_handling.audit_missingness as sauditmiss +import src.quality_handling.handle_missingness as shandlemiss +import src.quality_handling.standardize as sstandard +import src.univariate_analysis.test_transforms as stransforms +import src.univariate_analysis.univariate_metrics_plotting as sunivar + +_LOG = logging.getLogger(__name__) + + +def _parse_args() -> argparse.Namespace: + """ + Parse CLI arguments. 
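+
+    Example invocation (hypothetical dataset path):
+
+        python -m src.main --mode standardize --path data/my_series.csv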
+ + :return: parsed arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--mode", + required=True, + choices=[ + "input", + "format", + "infer_type", + "infer_structure", + "compute_temporal_stats", + "integrity", + "audit_missingness", + "handle_missingness", + "standardize", + "univariate_metrics_plotting", + "test_transforms", + ], + help="Pipeline stage to execute.", + ) + parser.add_argument( + "--path", + required=True, + help="Path to dataset file.", + ) + args = parser.parse_args() + return args + + +def _run_cli(args: argparse.Namespace) -> dict: + """ + Execute selected backend stage. + + :param args: parsed CLI args + :return: stage output payload + """ + mode = args.mode + if mode == "input": + payload = shainp.run_input_handler(args.path) + elif mode == "format": + payload = sfordat.run_date_formatter(args.path) + elif mode == "integrity": + payload = sinteg.run_integrity(args.path) + elif mode == "infer_type": + payload = sinfert.run_infer_type(args.path) + elif mode == "infer_structure": + payload = sinferstruct.run_infer_structure(args.path) + elif mode == "compute_temporal_stats": + payload = sctstats.run_compute_temporal_stats(args.path) + elif mode == "audit_missingness": + payload = sauditmiss.run_audit_missingness(args.path) + elif mode == "handle_missingness": + payload = shandlemiss.run_handle_missingness(args.path) + elif mode == "standardize": + payload = sstandard.run_standardize(args.path) + elif mode == "univariate_metrics_plotting": + payload = sunivar.run_univariate_metrics_plotting(args.path) + elif mode == "test_transforms": + payload = stransforms.run_test_transforms(args.path) + else: + raise ValueError(f"Unsupported mode='{mode}'") + return payload + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + cli_args = _parse_args() + output = _run_cli(cli_args) + _LOG.info("Pipeline output: %s", json.dumps(output, default=str, indent=2)) diff --git a/agentic_eda/jupyterlab_extension_backend/src/quality_handling/__init__.py b/agentic_eda/jupyterlab_extension_backend/src/quality_handling/__init__.py new file mode 100644 index 000000000..b6cf94fe8 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/quality_handling/__init__.py @@ -0,0 +1,3 @@ +""" +Quality-handling stages and helpers for the Jupyter backend. +""" diff --git a/agentic_eda/jupyterlab_extension_backend/src/quality_handling/audit_missingness.py b/agentic_eda/jupyterlab_extension_backend/src/quality_handling/audit_missingness.py new file mode 100644 index 000000000..a037ca02a --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/quality_handling/audit_missingness.py @@ -0,0 +1,209 @@ +""" +Import as: + +import src.quality_handling.audit_missingness as sauditmiss +""" + +from __future__ import annotations + +import argparse +import logging +from typing import TypedDict + +import langgraph.graph as lgraph + +import src.ingest.compute_temporal_stats as sctstats +import src.tools.input_tools as tinptool + +_LOG = logging.getLogger(__name__) + + +class MissingnessAuditState(TypedDict): + """ + Store deterministic missingness audit output. + """ + + missingness_report: dict + + +class CompositeState(TypedDict): + """ + Store graph state for missingness auditing. 
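+
+    Extends the upstream composite state with temporal-statistics and
+    missingness-report fields.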
+ """ + + path: str + done: list[str] + has_header: bool + has_missing_values: bool + error: str + info: str + cols: list[str] + temporal_cols: list[str] + numeric_val_cols: list[str] + categorical_val_cols: list[str] + bad_rows: list[dict] + metadata: dict + time_col: str + candidates: list[dict] + winner_formatter: dict + entity_col: str | None + numeric_cols: list[str] + nonnegative_cols: list[str] + jump_mult: float + report: dict + summary: str + flag: str + type: str + primary_key: str + secondary_keys: list[str] + numeric_continuous_cols: list[str] + numeric_count_cols: list[str] + binary_flag_cols: list[str] + categorical_feature_cols: list[str] + known_exogenous_cols: list[str] + target_cols: list[str] + covariate_cols: list[str] + n_nat_time: int + min_time: str | None + max_time: str | None + typical_delta_mode: str | None + typical_delta_median: str | None + expected_frequency: str | None + dominant_frequency_fraction: float + is_irregular_sampling: bool + resampling_decision: str + coverage_summary: dict + coverage_per_entity: list[dict] + missingness_report: dict + + +def call_compute_temporal_stats(state: CompositeState) -> dict: + """ + Run the sequential pipeline up to temporal statistics. + + :param state: graph state + :return: composite payload from compute_temporal_stats + """ + payload = sctstats.run_compute_temporal_stats(state["path"]) + return payload + + +def audit_missingness(state: CompositeState) -> dict: + """ + Audit value missingness and timestamp missingness deterministically. + + :param state: graph state + :return: missingness report payload + """ + missingness_report = tinptool.audit_missingness.invoke( + { + "path": state["path"], + "time_col": state["primary_key"], + "secondary_keys": state["secondary_keys"], + "winner_formatter": state["winner_formatter"], + } + ) + trace_payload = { + "primary_key": state["primary_key"], + "secondary_keys": state["secondary_keys"], + "missingness_report": missingness_report, + } + tinptool.write_stage_trace(state["path"], "audit_missingness", trace_payload) + payload = { + "missingness_report": missingness_report, + "has_missing_values": bool( + missingness_report["value_missingness_summary"]["total_missing_cells"] > 0 + or missingness_report["timestamp_missingness_summary"]["total_missing_timestamps"] > 0 + ), + } + return payload + + +missingness_audit = lgraph.StateGraph(CompositeState) +missingness_audit.add_node("compute_temporal_stats_pipeline", call_compute_temporal_stats) +missingness_audit.add_node("audit_missingness", audit_missingness) +missingness_audit.add_edge(lgraph.START, "compute_temporal_stats_pipeline") +missingness_audit.add_edge("compute_temporal_stats_pipeline", "audit_missingness") +missingness_audit.add_edge("audit_missingness", lgraph.END) +graph = missingness_audit.compile() + + +def run_audit_missingness(path: str) -> dict: + """ + Execute missingness auditing end to end. 
+ + :param path: dataset path + :return: full composite graph payload + """ + init_state: CompositeState = { + "path": path, + "done": [], + "has_header": True, + "has_missing_values": False, + "error": "", + "info": "", + "cols": [], + "temporal_cols": [], + "numeric_val_cols": [], + "categorical_val_cols": [], + "bad_rows": [], + "metadata": {}, + "time_col": "", + "candidates": [], + "winner_formatter": {}, + "entity_col": None, + "numeric_cols": [], + "nonnegative_cols": [], + "jump_mult": 20.0, + "report": {}, + "summary": "", + "flag": "", + "type": "", + "primary_key": "", + "secondary_keys": [], + "numeric_continuous_cols": [], + "numeric_count_cols": [], + "binary_flag_cols": [], + "categorical_feature_cols": [], + "known_exogenous_cols": [], + "target_cols": [], + "covariate_cols": [], + "n_nat_time": 0, + "min_time": None, + "max_time": None, + "typical_delta_mode": None, + "typical_delta_median": None, + "expected_frequency": None, + "dominant_frequency_fraction": 0.0, + "is_irregular_sampling": False, + "resampling_decision": "", + "coverage_summary": {}, + "coverage_per_entity": [], + "missingness_report": {}, + } + out = graph.invoke(init_state) + payload: CompositeState = out + _LOG.info("Missingness audit output: %s", payload) + return payload + + +def _parse_args() -> argparse.Namespace: + """ + Parse command-line arguments. + + :return: parsed arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--path", + required=True, + help="Path to dataset file.", + ) + args = parser.parse_args() + return args + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + args = _parse_args() + run_audit_missingness(args.path) diff --git a/agentic_eda/jupyterlab_extension_backend/src/quality_handling/handle_missingness.py b/agentic_eda/jupyterlab_extension_backend/src/quality_handling/handle_missingness.py new file mode 100644 index 000000000..325f1cdd4 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/quality_handling/handle_missingness.py @@ -0,0 +1,386 @@ +""" +Import as: + +import src.quality_handling.handle_missingness as shandlemiss +""" + +from __future__ import annotations + +import argparse +import logging +from typing import Literal +from typing import TypedDict + +import langchain.agents as lagents +import langchain_core.messages as lmessages +import langgraph.graph as lgraph +import pydantic + +import src.config.config as cconf +import src.quality_handling.audit_missingness as sauditmiss +import src.tools.input_tools as tinptool + +_LOG = logging.getLogger(__name__) + + +def _build_missingness_plan_summary(actions: list[dict], *, defaulted_cols: int) -> str: + """ + Build a summary from the normalized missingness actions. + + :param actions: normalized action list + :param defaulted_cols: number of columns defaulted during normalization + :return: summary text aligned with the final plan + """ + if not actions: + return "No non-time columns required missingness handling." + counts: dict[str, int] = {} + for action in actions: + strategy = str(action["strategy"]) + counts[strategy] = counts.get(strategy, 0) + 1 + ordered_counts = ", ".join( + f"{strategy}={counts[strategy]}" + for strategy in sorted(counts) + ) + summary = ( + f"Normalized missingness plan for {len(actions)} columns: {ordered_counts}. " + "Actions reflect the final bounded plan after validation against eligible strategies." + ) + if defaulted_cols > 0: + summary += f" {defaulted_cols} columns were defaulted conservatively during normalization." 
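+    # Worked example: three actions with strategies {forward_fill: 2,
+    # leave_as_nan: 1} yield "forward_fill=2, leave_as_nan=1" (strategy
+    # names sorted alphabetically).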
+ return summary + + +class MissingnessDecision(pydantic.BaseModel): + """ + Store one bounded missingness decision. + """ + + col: str + strategy: Literal[ + "leave_as_nan", + "forward_fill", + "interpolate", + "zero_fill", + "drop_rows", + ] + create_missingness_flag: bool = True + reason: str + + +class MissingnessPlanOutput(pydantic.BaseModel): + """ + Store LLM-produced missingness plan. + """ + + summary: str + actions: list[MissingnessDecision] + + +class CompositeState(TypedDict): + """ + Store graph state for missingness handling. + """ + + path: str + done: list[str] + has_header: bool + has_missing_values: bool + error: str + info: str + cols: list[str] + temporal_cols: list[str] + numeric_val_cols: list[str] + categorical_val_cols: list[str] + bad_rows: list[dict] + metadata: dict + time_col: str + candidates: list[dict] + winner_formatter: dict + entity_col: str | None + numeric_cols: list[str] + nonnegative_cols: list[str] + jump_mult: float + report: dict + summary: str + flag: str + type: str + primary_key: str + secondary_keys: list[str] + numeric_continuous_cols: list[str] + numeric_count_cols: list[str] + binary_flag_cols: list[str] + categorical_feature_cols: list[str] + known_exogenous_cols: list[str] + target_cols: list[str] + covariate_cols: list[str] + n_nat_time: int + min_time: str | None + max_time: str | None + typical_delta_mode: str | None + typical_delta_median: str | None + expected_frequency: str | None + dominant_frequency_fraction: float + is_irregular_sampling: bool + resampling_decision: str + coverage_summary: dict + coverage_per_entity: list[dict] + missingness_report: dict + missingness_plan: dict + missingness_handling_report: dict + quality_dataset_path: str + + +def call_audit_missingness(state: CompositeState) -> dict: + """ + Run the sequential pipeline up to missingness auditing. + + :param state: graph state + :return: composite payload from audit_missingness + """ + payload = sauditmiss.run_audit_missingness(state["path"]) + return payload + + +def _normalize_missingness_plan(state: CompositeState, raw_plan: dict) -> dict: + """ + Ensure every missing column has one supported action. 
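+
+    LLM actions for unknown columns are dropped, ineligible strategies are
+    downgraded to `leave_as_nan`, and any column the plan misses receives
+    a conservative default with a missingness flag.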
+ + :param state: graph state + :param raw_plan: LLM-produced plan + :return: normalized deterministic plan + """ + audit_report = state["missingness_report"] + missing_cols = [ + item + for item in audit_report["value_missingness_by_column"] + if item["n_missing"] > 0 and item["col"] != state["primary_key"] + ] + eligible_by_col = { + item["col"]: set(item["eligible_strategies"]) + for item in missing_cols + } + plan_by_col = {} + defaulted_cols = 0 + for item in raw_plan.get("actions") or []: + col = str(item.get("col") or "") + if col not in eligible_by_col: + continue + strategy = str(item.get("strategy") or "leave_as_nan") + if strategy not in eligible_by_col[col]: + strategy = "leave_as_nan" + plan_by_col[col] = { + "col": col, + "strategy": strategy, + "create_missingness_flag": bool(item.get("create_missingness_flag", True)), + "reason": str(item.get("reason") or ""), + } + normalized_actions = [] + for item in missing_cols: + col = item["col"] + action = plan_by_col.get( + col, + { + "col": col, + "strategy": "leave_as_nan", + "create_missingness_flag": True, + "reason": "Defaulted conservatively because no valid explicit plan was provided.", + }, + ) + normalized_actions.append(action) + if col not in plan_by_col: + defaulted_cols += 1 + return { + "summary": _build_missingness_plan_summary( + normalized_actions, + defaulted_cols=defaulted_cols, + ), + "actions": normalized_actions, + } + + +def choose_missingness_plan(state: CompositeState) -> dict: + """ + Choose bounded missingness actions using deterministic evidence. + + :param state: graph state + :return: normalized missingness plan + """ + missing_cols = [ + item + for item in state["missingness_report"]["value_missingness_by_column"] + if item["n_missing"] > 0 and item["col"] != state["primary_key"] + ] + if not missing_cols: + payload = { + "missingness_plan": { + "summary": "No non-time columns contain missing values requiring handling.", + "actions": [], + } + } + return payload + llm = cconf.get_chat_model(model="gpt-4.1") + agent = lagents.create_agent( + model=llm, + tools=[], + system_prompt=( + "You are a missingness planner for a time-series EDA backend. " + "Choose exactly one bounded strategy per column with missing values. " + "Allowed strategies are leave_as_nan, forward_fill, interpolate, " + "zero_fill, and drop_rows. Prefer conservative choices when the " + "evidence is weak. Use zero_fill only for true count-like variables " + "where structural zeros are plausible. Use interpolate only for " + "numeric columns. Use forward_fill for stateful or slowly varying " + "features when continuity is plausible. Missing timestamps are a " + "separate issue from missing cell values; do not pretend that a cell " + "imputation solves timestamp holes." 
+ ), + response_format=MissingnessPlanOutput, + ) + evidence = { + "series_type": state["type"], + "expected_frequency": state["expected_frequency"], + "is_irregular_sampling": state["is_irregular_sampling"], + "timestamp_missingness_summary": state["missingness_report"]["timestamp_missingness_summary"], + "columns_with_missing_values": missing_cols, + "numeric_continuous_cols": state["numeric_continuous_cols"], + "numeric_count_cols": state["numeric_count_cols"], + "binary_flag_cols": state["binary_flag_cols"], + "categorical_feature_cols": state["categorical_feature_cols"], + } + out = agent.invoke( + { + "messages": [ + lmessages.HumanMessage( + content=f"Plan missingness handling from this evidence: {evidence}" + ) + ] + } + ) + raw_plan = out["structured_response"].model_dump() + normalized_plan = _normalize_missingness_plan(state, raw_plan) + payload = {"missingness_plan": normalized_plan} + return payload + + +def apply_missingness_plan(state: CompositeState) -> dict: + """ + Apply the chosen missingness plan deterministically. + + :param state: graph state + :return: handling report and output dataset path + """ + handling_report = tinptool.apply_missingness_actions.invoke( + { + "source_path": state["path"], + "input_path": state["path"], + "time_col": state["primary_key"], + "secondary_keys": state["secondary_keys"], + "winner_formatter": state["winner_formatter"], + "actions": state["missingness_plan"]["actions"], + } + ) + trace_payload = { + "missingness_plan": state["missingness_plan"], + "missingness_handling_report": handling_report, + } + tinptool.write_stage_trace(state["path"], "handle_missingness", trace_payload) + payload = { + "missingness_handling_report": handling_report, + "quality_dataset_path": handling_report["output_path"], + } + return payload + + +missingness_handling = lgraph.StateGraph(CompositeState) +missingness_handling.add_node("audit_missingness_pipeline", call_audit_missingness) +missingness_handling.add_node("choose_missingness_plan", choose_missingness_plan) +missingness_handling.add_node("apply_missingness_plan", apply_missingness_plan) +missingness_handling.add_edge(lgraph.START, "audit_missingness_pipeline") +missingness_handling.add_edge("audit_missingness_pipeline", "choose_missingness_plan") +missingness_handling.add_edge("choose_missingness_plan", "apply_missingness_plan") +missingness_handling.add_edge("apply_missingness_plan", lgraph.END) +graph = missingness_handling.compile() + + +def run_handle_missingness(path: str) -> dict: + """ + Execute missingness handling end to end. 
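+
+    The path of the handled dataset is returned under
+    `quality_dataset_path` in the final payload.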
+ + :param path: dataset path + :return: full composite graph payload + """ + init_state: CompositeState = { + "path": path, + "done": [], + "has_header": True, + "has_missing_values": False, + "error": "", + "info": "", + "cols": [], + "temporal_cols": [], + "numeric_val_cols": [], + "categorical_val_cols": [], + "bad_rows": [], + "metadata": {}, + "time_col": "", + "candidates": [], + "winner_formatter": {}, + "entity_col": None, + "numeric_cols": [], + "nonnegative_cols": [], + "jump_mult": 20.0, + "report": {}, + "summary": "", + "flag": "", + "type": "", + "primary_key": "", + "secondary_keys": [], + "numeric_continuous_cols": [], + "numeric_count_cols": [], + "binary_flag_cols": [], + "categorical_feature_cols": [], + "known_exogenous_cols": [], + "target_cols": [], + "covariate_cols": [], + "n_nat_time": 0, + "min_time": None, + "max_time": None, + "typical_delta_mode": None, + "typical_delta_median": None, + "expected_frequency": None, + "dominant_frequency_fraction": 0.0, + "is_irregular_sampling": False, + "resampling_decision": "", + "coverage_summary": {}, + "coverage_per_entity": [], + "missingness_report": {}, + "missingness_plan": {}, + "missingness_handling_report": {}, + "quality_dataset_path": "", + } + out = graph.invoke(init_state) + payload: CompositeState = out + _LOG.info("Missingness handling output: %s", payload) + return payload + + +def _parse_args() -> argparse.Namespace: + """ + Parse command-line arguments. + + :return: parsed arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--path", + required=True, + help="Path to dataset file.", + ) + args = parser.parse_args() + return args + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + args = _parse_args() + run_handle_missingness(args.path) diff --git a/agentic_eda/jupyterlab_extension_backend/src/quality_handling/standardize.py b/agentic_eda/jupyterlab_extension_backend/src/quality_handling/standardize.py new file mode 100644 index 000000000..0dab99163 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/quality_handling/standardize.py @@ -0,0 +1,488 @@ +""" +Import as: + +import src.quality_handling.standardize as sstandard +""" + +from __future__ import annotations + +import argparse +import logging +from typing import Literal +from typing import TypedDict + +import langchain.agents as lagents +import langchain_core.messages as lmessages +import langgraph.graph as lgraph +import pydantic + +import src.config.config as cconf +import src.quality_handling.handle_missingness as shandlemiss +import src.tools.input_tools as tinptool + +_LOG = logging.getLogger(__name__) + + +def _build_standardization_plan_summary(actions: list[dict], *, defaulted_cols: int) -> str: + """ + Build a summary from the normalized standardization actions. + + :param actions: normalized action list + :param defaulted_cols: number of columns defaulted during normalization + :return: summary text aligned with the final plan + """ + if not actions: + return "No numeric candidate columns were selected for optional standardization." + counts: dict[str, int] = {} + for action in actions: + transform = str(action["action"]) + counts[transform] = counts.get(transform, 0) + 1 + ordered_counts = ", ".join( + f"{transform}={counts[transform]}" + for transform in sorted(counts) + ) + summary = ( + f"Normalized standardization plan for {len(actions)} columns: {ordered_counts}. " + "This summary reflects the final validated transform choices, not the raw LLM prose." 
+ ) + if defaulted_cols > 0: + summary += f" {defaulted_cols} columns defaulted conservatively to `none`." + return summary + + +class StandardizationDecision(pydantic.BaseModel): + """ + Store one bounded standardization decision. + """ + + col: str + action: Literal["none", "robust_scale", "log1p", "log1p_then_robust_scale"] + reason: str + + +class StandardizationPlanOutput(pydantic.BaseModel): + """ + Store LLM-produced standardization plan. + """ + + summary: str + actions: list[StandardizationDecision] + + +class StandardizationGateOutput(pydantic.BaseModel): + """ + Store the dataset-level standardization gate decision. + """ + + should_standardize: bool + reason: str + + +class CompositeState(TypedDict): + """ + Store graph state for optional standardization. + """ + + path: str + done: list[str] + has_header: bool + has_missing_values: bool + error: str + info: str + cols: list[str] + temporal_cols: list[str] + numeric_val_cols: list[str] + categorical_val_cols: list[str] + bad_rows: list[dict] + metadata: dict + time_col: str + candidates: list[dict] + winner_formatter: dict + entity_col: str | None + numeric_cols: list[str] + nonnegative_cols: list[str] + jump_mult: float + report: dict + summary: str + flag: str + type: str + primary_key: str + secondary_keys: list[str] + numeric_continuous_cols: list[str] + numeric_count_cols: list[str] + binary_flag_cols: list[str] + categorical_feature_cols: list[str] + known_exogenous_cols: list[str] + target_cols: list[str] + covariate_cols: list[str] + n_nat_time: int + min_time: str | None + max_time: str | None + typical_delta_mode: str | None + typical_delta_median: str | None + expected_frequency: str | None + dominant_frequency_fraction: float + is_irregular_sampling: bool + resampling_decision: str + coverage_summary: dict + coverage_per_entity: list[dict] + missingness_report: dict + missingness_plan: dict + missingness_handling_report: dict + quality_dataset_path: str + standardization_profile: dict + standardization_gate: dict + standardization_plan: dict + standardization_report: dict + standardized_dataset_path: str + + +def call_handle_missingness(state: CompositeState) -> dict: + """ + Run the sequential pipeline up to missingness handling. + + :param state: graph state + :return: composite payload from handle_missingness + """ + payload = shandlemiss.run_handle_missingness(state["path"]) + return payload + + +def profile_standardization(state: CompositeState) -> dict: + """ + Profile numeric feature scale and tail behavior deterministically. + + :param state: graph state + :return: scale profile report + """ + input_path = state["quality_dataset_path"] or state["path"] + profile = tinptool.profile_standardization_candidates.invoke( + { + "path": input_path, + "numeric_continuous_cols": state["numeric_continuous_cols"], + "numeric_count_cols": state["numeric_count_cols"], + "binary_flag_cols": state["binary_flag_cols"], + } + ) + payload = {"standardization_profile": profile} + return payload + + +def choose_standardization_gate(state: CompositeState) -> dict: + """ + Decide whether optional standardization should run at all. 
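+
+    The gate answers `no` deterministically when no numeric candidates
+    exist; otherwise an LLM judge, prompted to favor leaving raw units
+    intact, decides from the scale profile.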
+ + :param state: graph state + :return: dataset-level gate decision + """ + per_column = state["standardization_profile"].get("per_column") or [] + if not per_column: + return { + "standardization_gate": { + "should_standardize": False, + "reason": "No numeric candidate columns were available for optional standardization.", + } + } + llm = cconf.get_chat_model(model="gpt-4.1") + agent = lagents.create_agent( + model=llm, + tools=[], + system_prompt=( + "You are the gatekeeper for point 9 in a time-series EDA backend. " + "Decide whether optional standardization should run at all for this dataset. " + "Favor should_standardize=false unless there is strong evidence that rescaling " + "or log-scaling is genuinely useful. Favor false for raw exploratory analysis, " + "for SCADA or sensor-style datasets where physical units matter, and for cases " + "where leaving values untouched preserves interpretability. Favor true only when " + "scale disparities or heavy tails are severe enough that not transforming would " + "materially hinder comparison or downstream modeling." + ), + response_format=StandardizationGateOutput, + ) + evidence = { + "series_type": state["type"], + "numeric_continuous_cols": state["numeric_continuous_cols"], + "numeric_count_cols": state["numeric_count_cols"], + "binary_flag_cols": state["binary_flag_cols"], + "scale_summary": state["standardization_profile"].get("scale_summary"), + "sample_profiles": per_column[:20], + } + out = agent.invoke( + { + "messages": [ + lmessages.HumanMessage( + content=f"Decide whether optional standardization should run from this evidence: {evidence}" + ) + ] + } + ) + gate = out["structured_response"].model_dump() + return {"standardization_gate": gate} + + +def _normalize_standardization_plan(state: CompositeState, raw_plan: dict) -> dict: + """ + Ensure every candidate column gets a supported transform decision. + + :param state: graph state + :param raw_plan: LLM-produced plan + :return: normalized plan + """ + per_column = state["standardization_profile"].get("per_column") or [] + eligible_by_col = { + item["col"]: set(item["eligible_actions"]) + for item in per_column + } + plan_by_col = {} + defaulted_cols = 0 + for item in raw_plan.get("actions") or []: + col = str(item.get("col") or "") + if col not in eligible_by_col: + continue + action = str(item.get("action") or "none") + if action not in eligible_by_col[col]: + action = "none" + plan_by_col[col] = { + "col": col, + "action": action, + "reason": str(item.get("reason") or ""), + } + normalized_actions = [] + for item in per_column: + col = item["col"] + if col not in plan_by_col: + defaulted_cols += 1 + normalized_actions.append( + plan_by_col.get( + col, + { + "col": col, + "action": "none", + "reason": "Defaulted conservatively because no valid transform was selected.", + }, + ) + ) + return { + "summary": _build_standardization_plan_summary( + normalized_actions, + defaulted_cols=defaulted_cols, + ), + "actions": normalized_actions, + } + + +def choose_standardization_plan(state: CompositeState) -> dict: + """ + Choose whether optional standardization is justified. + + :param state: graph state + :return: normalized standardization plan + """ + gate = state.get("standardization_gate") or {} + if not bool(gate.get("should_standardize")): + payload = { + "standardization_plan": { + "summary": ( + "Dataset-level standardization gate returned `no`. 
" + f"Reason: {str(gate.get('reason') or 'No reason provided.')}" + ), + "actions": [], + } + } + return payload + per_column = state["standardization_profile"].get("per_column") or [] + if not per_column: + payload = { + "standardization_plan": { + "summary": "No numeric candidate columns were available for optional standardization.", + "actions": [], + } + } + return payload + llm = cconf.get_chat_model(model="gpt-4.1") + agent = lagents.create_agent( + model=llm, + tools=[], + system_prompt=( + "You are an optional standardization planner for a time-series EDA backend. " + "This stage is optional. Use action none unless there is a concrete reason " + "to transform a feature. Allowed actions are none, robust_scale, log1p, " + "and log1p_then_robust_scale. Favor none when evidence is weak. Favor " + "robust_scale for large cross-feature scale disparities. Favor log1p for " + "strongly right-skewed nonnegative features. Never invent new actions." + ), + response_format=StandardizationPlanOutput, + ) + evidence = { + "series_type": state["type"], + "scale_summary": state["standardization_profile"].get("scale_summary"), + "per_column": per_column, + } + out = agent.invoke( + { + "messages": [ + lmessages.HumanMessage( + content=f"Choose optional standardization actions from this evidence: {evidence}" + ) + ] + } + ) + raw_plan = out["structured_response"].model_dump() + normalized_plan = _normalize_standardization_plan(state, raw_plan) + payload = {"standardization_plan": normalized_plan} + return payload + + +def apply_standardization_plan(state: CompositeState) -> dict: + """ + Apply the chosen standardization plan deterministically. + + :param state: graph state + :return: transformation report and output path + """ + input_path = state["quality_dataset_path"] or state["path"] + if not state["standardization_plan"]["actions"]: + report = { + "input_path": input_path, + "output_path": input_path, + "skipped": True, + "reason": state["standardization_plan"]["summary"], + "actions_applied": [], + } + trace_payload = { + "input_path": input_path, + "standardization_profile": state["standardization_profile"], + "standardization_gate": state.get("standardization_gate") or {}, + "standardization_plan": state["standardization_plan"], + "standardization_report": report, + } + tinptool.write_stage_trace(state["path"], "standardize", trace_payload) + payload = { + "standardization_report": report, + "standardized_dataset_path": input_path, + } + return payload + report = tinptool.apply_standardization_actions.invoke( + { + "source_path": state["path"], + "input_path": input_path, + "actions": state["standardization_plan"]["actions"], + } + ) + trace_payload = { + "input_path": input_path, + "standardization_profile": state["standardization_profile"], + "standardization_gate": state.get("standardization_gate") or {}, + "standardization_plan": state["standardization_plan"], + "standardization_report": report, + } + tinptool.write_stage_trace(state["path"], "standardize", trace_payload) + payload = { + "standardization_report": report, + "standardized_dataset_path": report["output_path"], + } + return payload + + +standardization = lgraph.StateGraph(CompositeState) +standardization.add_node("handle_missingness_pipeline", call_handle_missingness) +standardization.add_node("profile_standardization", profile_standardization) +standardization.add_node("choose_standardization_gate", choose_standardization_gate) +standardization.add_node("choose_standardization_plan", choose_standardization_plan) 
+standardization.add_node("apply_standardization_plan", apply_standardization_plan) +standardization.add_edge(lgraph.START, "handle_missingness_pipeline") +standardization.add_edge("handle_missingness_pipeline", "profile_standardization") +standardization.add_edge("profile_standardization", "choose_standardization_gate") +standardization.add_edge("choose_standardization_gate", "choose_standardization_plan") +standardization.add_edge("choose_standardization_plan", "apply_standardization_plan") +standardization.add_edge("apply_standardization_plan", lgraph.END) +graph = standardization.compile() + + +def run_standardize(path: str) -> dict: + """ + Execute optional standardization end to end. + + :param path: dataset path + :return: full composite graph payload + """ + init_state: CompositeState = { + "path": path, + "done": [], + "has_header": True, + "has_missing_values": False, + "error": "", + "info": "", + "cols": [], + "temporal_cols": [], + "numeric_val_cols": [], + "categorical_val_cols": [], + "bad_rows": [], + "metadata": {}, + "time_col": "", + "candidates": [], + "winner_formatter": {}, + "entity_col": None, + "numeric_cols": [], + "nonnegative_cols": [], + "jump_mult": 20.0, + "report": {}, + "summary": "", + "flag": "", + "type": "", + "primary_key": "", + "secondary_keys": [], + "numeric_continuous_cols": [], + "numeric_count_cols": [], + "binary_flag_cols": [], + "categorical_feature_cols": [], + "known_exogenous_cols": [], + "target_cols": [], + "covariate_cols": [], + "n_nat_time": 0, + "min_time": None, + "max_time": None, + "typical_delta_mode": None, + "typical_delta_median": None, + "expected_frequency": None, + "dominant_frequency_fraction": 0.0, + "is_irregular_sampling": False, + "resampling_decision": "", + "coverage_summary": {}, + "coverage_per_entity": [], + "missingness_report": {}, + "missingness_plan": {}, + "missingness_handling_report": {}, + "quality_dataset_path": "", + "standardization_profile": {}, + "standardization_gate": {}, + "standardization_plan": {}, + "standardization_report": {}, + "standardized_dataset_path": "", + } + out = graph.invoke(init_state) + payload: CompositeState = out + _LOG.info("Standardization output: %s", payload) + return payload + + +def _parse_args() -> argparse.Namespace: + """ + Parse command-line arguments. + + :return: parsed arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--path", + required=True, + help="Path to dataset file.", + ) + args = parser.parse_args() + return args + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + args = _parse_args() + run_standardize(args.path) diff --git a/agentic_eda/jupyterlab_extension_backend/src/tools/__init__.py b/agentic_eda/jupyterlab_extension_backend/src/tools/__init__.py new file mode 100644 index 000000000..46d455292 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/tools/__init__.py @@ -0,0 +1,3 @@ +""" +Backend tool package. 
+""" diff --git a/agentic_eda/jupyterlab_extension_backend/src/tools/input_tools.py b/agentic_eda/jupyterlab_extension_backend/src/tools/input_tools.py new file mode 100644 index 000000000..3490b4811 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/tools/input_tools.py @@ -0,0 +1,1970 @@ +""" +Import as: + +import src.tools.input_tools as tinptool +""" + +import json +import itertools +import pathlib +import re +from typing import Any + +import langchain.tools as ltools +import numpy as np +import pandas as pd +import pydantic + +_VALID_HEADER_START_RE = re.compile(r"^[A-Za-z_]") + + +def _trace_root() -> pathlib.Path: + """ + Return the backend-level trace directory. + + :return: absolute trace root + """ + trace_root = pathlib.Path(__file__).resolve().parents[2] / "traces" + trace_root.mkdir(parents=True, exist_ok=True) + return trace_root + + +def load_dataset(path: pathlib.Path) -> pd.DataFrame: + """ + Load a supported dataset from disk. + + :param path: path to dataset file + :return: dataset as dataframe + """ + ext = path.suffix.lower() + if ext == ".csv": + dataset = pd.read_csv(path) + else: + raise ValueError(f"Unsupported file extension='{ext}'") + return dataset + + +def _sample_values(series: pd.Series, *, limit: int = 5) -> list[str]: + """ + Return a small deterministic sample of distinct non-null values. + + Theory: + A short value sample gives downstream logic human-interpretable evidence + about whether a column behaves like a flag, identifier, category, or + free-form measurement, without depending on the column name alone. + + :param series: input series + :param limit: max number of sample values + :return: stringified sample values + """ + values: list[str] = [] + seen: set[str] = set() + for value in series.dropna().tolist(): + key = str(value) + if key in seen: + continue + seen.add(key) + values.append(key) + if len(values) >= limit: + break + return values + + +def _normalized_non_null_fraction(series: pd.Series) -> float: + """ + Compute the non-null fraction for a series. + + Theory: + Missingness changes how much confidence we should place in any inferred + semantic role. Columns with very little observed data provide weak evidence + for type inference, so completeness is a foundational statistic. + + :param series: input series + :return: non-null fraction + """ + if len(series) == 0: + return 0.0 + return float(series.notna().mean()) + + +def _coerce_numeric(series: pd.Series) -> pd.Series: + """ + Convert a series to numeric values where possible. + + Theory: + Many semantic distinctions begin with whether values actually behave like + numbers in the data, not whether the declared dtype says so. Numeric + coercion exposes columns that are numerically meaningful even when loaded + as strings. + + :param series: input series + :return: numeric series with NaN for non-numeric values + """ + return pd.to_numeric(series, errors="coerce") + + +def _is_integer_like(series: pd.Series) -> bool: + """ + Check whether numeric values are effectively integers. + + Theory: + Count variables and encoded flags often live on the integers, whereas + continuous measurements usually do not. Integer support is therefore a + useful deterministic signal for separating counts from continuous values. 
+ + :param series: numeric-like series + :return: true when all observed values are close to integers + """ + numeric = _coerce_numeric(series).dropna() + if numeric.empty: + return False + rounded = numeric.round() + return bool((numeric - rounded).abs().le(1e-9).all()) + + +def _is_binary_like(series: pd.Series) -> bool: + """ + Check whether a column behaves like a binary flag. + + Theory: + Binary indicators are characterized by two logical states regardless of + whether they are stored as booleans, strings, or numeric codes. Recognizing + this two-state support helps prevent flags from being misclassified as + general categoricals or counts. + + :param series: input series + :return: true when the column has exactly two logical states + """ + non_null = series.dropna() + if non_null.empty: + return False + unique_raw = {str(value).strip().lower() for value in non_null.unique()} + binary_vocab = { + "0", + "1", + "true", + "false", + "t", + "f", + "yes", + "no", + "y", + "n", + } + if unique_raw and unique_raw.issubset(binary_vocab) and len(unique_raw) <= 2: + return True + return len(unique_raw) == 2 + + +def _build_column_profiles(dataset: pd.DataFrame) -> dict[str, dict[str, Any]]: + """ + Build deterministic per-column profiles used by downstream schema tools. + + Theory: + Robust schema inference should summarize how each column behaves in the + observed data: completeness, cardinality, numeric support, integer support, + binary support, and value examples. Those empirical signals are what later + stages use to infer keys and semantic feature types in a reproducible way. + + :param dataset: input dataframe + :return: map of column name to summary statistics + """ + profiles: dict[str, dict[str, Any]] = {} + n_rows = int(dataset.shape[0]) + for col in dataset.columns: + series = dataset[col] + non_null = series.dropna() + n_non_null = int(non_null.shape[0]) + n_unique = int(non_null.nunique(dropna=True)) + unique_ratio = 0.0 if n_non_null == 0 else float(n_unique / n_non_null) + numeric = _coerce_numeric(series) + numeric_non_null = numeric.dropna() + numeric_fraction = ( + 0.0 if n_non_null == 0 else float(numeric_non_null.shape[0] / n_non_null) + ) + integer_like = _is_integer_like(series) + nonnegative_like = ( + False + if numeric_non_null.empty + else bool((numeric_non_null >= 0).all()) + ) + profile = { + "dtype": str(series.dtype), + "n_rows": n_rows, + "n_non_null": n_non_null, + "non_null_fraction": _normalized_non_null_fraction(series), + "n_unique": n_unique, + "unique_ratio": unique_ratio, + "is_numeric_like": bool(numeric_fraction >= 0.95 and not numeric_non_null.empty), + "numeric_fraction": numeric_fraction, + "is_integer_like": integer_like, + "is_binary_like": _is_binary_like(series), + "is_nonnegative_like": nonnegative_like, + "sample_values": _sample_values(series), + } + if not numeric_non_null.empty: + profile["min_numeric"] = float(numeric_non_null.min()) + profile["max_numeric"] = float(numeric_non_null.max()) + else: + profile["min_numeric"] = None + profile["max_numeric"] = None + profiles[str(col)] = profile + return profiles + + +def write_stage_trace(path: str, stage: str, payload: dict[str, Any]) -> str: + """ + Persist diagnostic findings for one pipeline stage to a backend-local trace + file. 
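+
+    Traces land in the backend `traces/` directory as
+    `<dataset stem>.<stage>.json`, e.g. `sales.infer_type.json` for a
+    hypothetical `sales.csv`.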
+ + :param path: dataset path + :param stage: pipeline stage name + :param payload: JSON-serializable diagnostic payload + :return: absolute trace file path + """ + dataset_path = pathlib.Path(path) + filename = f"{dataset_path.stem}.{stage}.json" + trace_path = _trace_root() / filename + trace_payload = { + "dataset_path": str(dataset_path), + "stage": stage, + "payload": payload, + } + trace_path.write_text( + json.dumps(trace_payload, default=str, indent=2), + encoding="utf-8", + ) + return str(trace_path) + + +def write_stage_dataset(path: str, stage: str, dataset: pd.DataFrame) -> str: + """ + Persist a stage-produced dataset artifact alongside trace files. + + :param path: source dataset path + :param stage: pipeline stage name + :param dataset: dataframe to serialize + :return: absolute output dataset path + """ + dataset_path = pathlib.Path(path) + filename = f"{dataset_path.stem}.{stage}.csv" + output_path = _trace_root() / filename + dataset.to_csv(output_path, index=False) + return str(output_path) + + +def write_stage_plot(path: str, stage: str, plot_name: str, fig: Any) -> str: + """ + Persist a stage-produced plot under the backend trace directory. + + :param path: source dataset path + :param stage: pipeline stage name + :param plot_name: plot-specific filename stem + :param fig: matplotlib figure + :return: absolute output plot path + """ + dataset_path = pathlib.Path(path) + plot_dir = _trace_root() / f"{dataset_path.stem}.{stage}" + plot_dir.mkdir(parents=True, exist_ok=True) + safe_name = re.sub(r"[^A-Za-z0-9_.-]+", "_", plot_name).strip("_") + if not safe_name: + safe_name = "plot" + output_path = plot_dir / f"{safe_name}.png" + fig.savefig(output_path, dpi=140, bbox_inches="tight") + return str(output_path) + + +def _parse_time_series( + dataset: pd.DataFrame, + time_col: str, + winner_formatter: dict[str, Any] | None = None, +) -> pd.Series: + """ + Parse the selected time column with the best-known formatter settings. + + Theory: + Temporal statistics are only meaningful once the time axis has been mapped + into a consistent datetime representation. Reusing the formatter selected + earlier in the pipeline avoids accidental drift between schema inference and + downstream coverage/frequency calculations. + + :param dataset: input dataframe + :param time_col: selected time column + :param winner_formatter: optional datetime parsing kwargs + :return: parsed timestamp series + """ + format_args = winner_formatter or {} + format_args = {key: val for key, val in format_args.items() if val is not None} + try: + return pd.to_datetime(dataset[time_col], errors="coerce", **format_args) + except Exception: + return pd.to_datetime(dataset[time_col], errors="coerce") + + +def _format_timedelta(delta: pd.Timedelta | None) -> str | None: + """ + Convert a timedelta into a stable string representation. + + Theory: + Frequency and gap summaries are easier to compare across stages when they + are rendered into a canonical textual duration rather than leaking pandas- + specific objects into the public payload. + + :param delta: input timedelta + :return: normalized string or None + """ + if delta is None or pd.isna(delta): + return None + return str(delta) + + +def _series_identifier(keys: list[str], values: tuple[Any, ...]) -> dict[str, Any] | None: + """ + Package one composite entity identifier as a JSON-friendly mapping. + + Theory: + Coverage and frequency statistics are naturally computed per series. 
When a + panel uses composite entity keys, the identifier must preserve every key + component so the reported findings still point back to the original series. + + :param keys: entity key column names + :param values: grouped key values + :return: key-value mapping or None for single-series data + """ + if not keys: + return None + return {key: value for key, value in zip(keys, values, strict=True)} + + +def _ordered_dataset( + dataset: pd.DataFrame, + time_col: str, + secondary_keys: list[str] | None = None, + winner_formatter: dict[str, Any] | None = None, +) -> pd.DataFrame: + """ + Return a stable, time-aware ordering for sequential quality operations. + + :param dataset: input dataframe + :param time_col: selected time column + :param secondary_keys: optional entity key columns + :param winner_formatter: optional datetime parsing kwargs + :return: ordered dataframe with helper columns + """ + ordered = dataset.copy() + ordered["_row_order"] = range(int(ordered.shape[0])) + if time_col in ordered.columns: + ordered["_ts"] = _parse_time_series(ordered, time_col, winner_formatter) + else: + ordered["_ts"] = pd.NaT + valid_secondary_keys = [ + key + for key in (secondary_keys or []) + if key in ordered.columns and key != time_col + ] + sort_cols = list(valid_secondary_keys) + if ordered["_ts"].notna().any(): + sort_cols.append("_ts") + sort_cols.append("_row_order") + ordered = ordered.sort_values(sort_cols, na_position="last").reset_index(drop=True) + return ordered + + +def _iter_series_frames( + dataset: pd.DataFrame, + secondary_keys: list[str] | None = None, +) -> list[tuple[dict[str, Any] | None, pd.DataFrame]]: + """ + Yield one frame per inferred series. + + :param dataset: ordered dataframe + :param secondary_keys: optional entity keys + :return: list of entity/frame pairs + """ + valid_secondary_keys = [ + key for key in (secondary_keys or []) if key in dataset.columns + ] + if not valid_secondary_keys: + return [(None, dataset)] + items: list[tuple[dict[str, Any] | None, pd.DataFrame]] = [] + grouped = dataset.groupby(valid_secondary_keys, dropna=False, sort=False) + for raw_key, frame in grouped: + key_tuple = raw_key if isinstance(raw_key, tuple) else (raw_key,) + items.append((_series_identifier(valid_secondary_keys, key_tuple), frame)) + return items + + +def _mask_run_lengths(mask: pd.Series) -> list[int]: + """ + Return lengths of consecutive true runs in a boolean mask. + + :param mask: boolean mask + :return: run lengths + """ + run_lengths: list[int] = [] + current = 0 + for is_true in mask.fillna(False).astype(bool).tolist(): + if is_true: + current += 1 + elif current > 0: + run_lengths.append(current) + current = 0 + if current > 0: + run_lengths.append(current) + return run_lengths + + +def _safe_float(value: Any) -> float | None: + """ + Convert a numeric-like value into a JSON-friendly float. + + :param value: input value + :return: float or None + """ + if value is None or pd.isna(value): + return None + return float(value) + + +def _candidate_univariate_numeric_cols( + dataset: pd.DataFrame, + *, + time_col: str, + secondary_keys: list[str] | None = None, + numeric_continuous_cols: list[str] | None = None, + numeric_count_cols: list[str] | None = None, + binary_flag_cols: list[str] | None = None, +) -> list[str]: + """ + Return deterministic numeric columns suitable for univariate analysis. 
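+
+    Example:
+        With hypothetical buckets ``numeric_continuous_cols=["power"]``,
+        ``numeric_count_cols=["fault_count"]``, and
+        ``binary_flag_cols=["is_online"]``, the result is
+        ``["power", "fault_count", "is_online"]`` in that fixed order; when no
+        buckets are supplied, any non-excluded column with at least one
+        parseable numeric value is used as a fallback.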
+ + :param dataset: input dataframe + :param time_col: selected time column + :param secondary_keys: optional entity key columns + :param numeric_continuous_cols: inferred continuous numeric columns + :param numeric_count_cols: inferred count columns + :param binary_flag_cols: inferred binary columns + :return: ordered numeric analysis columns + """ + excluded = {time_col, *(secondary_keys or [])} + candidates = list( + dict.fromkeys( + [ + *[col for col in (numeric_continuous_cols or []) if col in dataset.columns], + *[col for col in (numeric_count_cols or []) if col in dataset.columns], + *[col for col in (binary_flag_cols or []) if col in dataset.columns], + ] + ) + ) + if not candidates: + candidates = [ + str(col) + for col in dataset.columns + if str(col) not in excluded and pd.to_numeric(dataset[col], errors="coerce").notna().any() + ] + return [col for col in candidates if col not in excluded] + + +def _tail_ratio(series: pd.Series) -> float | None: + """ + Compute a simple deterministic tail ratio. + + :param series: numeric series + :return: tail ratio or None + """ + valid = pd.to_numeric(series, errors="coerce").dropna() + if valid.empty: + return None + p50 = valid.quantile(0.50) + p99 = valid.quantile(0.99) + if pd.isna(p50) or pd.isna(p99): + return None + if float(abs(p50)) <= 1e-12: + return None if float(abs(p99)) <= 1e-12 else float(abs(p99)) + return float(abs(p99) / abs(p50)) + + +def _univariate_summary(series: pd.Series) -> dict[str, Any]: + """ + Compute deterministic univariate summary statistics. + + :param series: numeric-like series + :return: summary stats + """ + numeric = pd.to_numeric(series, errors="coerce") + valid = numeric.dropna() + n_total = int(series.shape[0]) + n_non_null = int(valid.shape[0]) + n_missing = max(0, n_total - n_non_null) + if valid.empty: + return { + "n_total": n_total, + "n_non_null": 0, + "n_missing": n_missing, + "missing_pct": None if n_total == 0 else float(100.0 * n_missing / n_total), + "n_unique": 0, + "mean": None, + "std": None, + "min": None, + "p01": None, + "p05": None, + "p25": None, + "p50": None, + "p75": None, + "p95": None, + "p99": None, + "max": None, + "iqr": None, + "zero_fraction": None, + "skew": None, + "kurtosis": None, + "tail_ratio_p99_p50": None, + } + q = valid.quantile([0.01, 0.05, 0.25, 0.50, 0.75, 0.95, 0.99]) + return { + "n_total": n_total, + "n_non_null": n_non_null, + "n_missing": n_missing, + "missing_pct": None if n_total == 0 else float(100.0 * n_missing / n_total), + "n_unique": int(valid.nunique(dropna=True)), + "mean": _safe_float(valid.mean()), + "std": _safe_float(valid.std()), + "min": _safe_float(valid.min()), + "p01": _safe_float(q.loc[0.01]), + "p05": _safe_float(q.loc[0.05]), + "p25": _safe_float(q.loc[0.25]), + "p50": _safe_float(q.loc[0.50]), + "p75": _safe_float(q.loc[0.75]), + "p95": _safe_float(q.loc[0.95]), + "p99": _safe_float(q.loc[0.99]), + "max": _safe_float(valid.max()), + "iqr": _safe_float(q.loc[0.75] - q.loc[0.25]), + "zero_fraction": float((valid == 0).mean()), + "skew": _safe_float(valid.skew()), + "kurtosis": _safe_float(valid.kurt()), + "tail_ratio_p99_p50": _tail_ratio(valid), + } + + +def _gaussian_kde_curve(series: pd.Series, *, n_points: int = 256) -> tuple[np.ndarray, np.ndarray] | None: + """ + Compute a simple Gaussian KDE curve without scipy. 
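+
+    Theory:
+        The bandwidth follows Silverman's rule of thumb,
+        0.9 * min(std, IQR / 1.34) * n**(-1/5), and the curve is the average of
+        Gaussian kernels centered at the observations. The sample-size and
+        uniqueness guards skip small or near-discrete columns where a smooth
+        density would be misleading.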
+ + :param series: numeric series + :param n_points: number of evaluation points + :return: x/y arrays or None when KDE is not appropriate + """ + valid = pd.to_numeric(series, errors="coerce").dropna().to_numpy(dtype=float) + if valid.size < 30: + return None + unique = np.unique(valid) + if unique.size < 10: + return None + std = float(np.std(valid, ddof=1)) + iqr = float(np.subtract(*np.percentile(valid, [75, 25]))) + scale = min(std, iqr / 1.34) if iqr > 0.0 else std + if not np.isfinite(scale) or scale <= 0.0: + return None + bandwidth = 0.9 * scale * (valid.size ** (-1.0 / 5.0)) + if not np.isfinite(bandwidth) or bandwidth <= 0.0: + return None + x_grid = np.linspace(float(valid.min()), float(valid.max()), n_points) + diffs = (x_grid[:, None] - valid[None, :]) / bandwidth + density = np.exp(-0.5 * diffs**2).sum(axis=1) + density /= float(valid.size * bandwidth * np.sqrt(2.0 * np.pi)) + return x_grid, density + + +def _transform_candidates(series: pd.Series) -> dict[str, pd.Series]: + """ + Build deterministic transform candidates for one numeric series. + + :param series: numeric series + :return: map of candidate name to transformed series + """ + numeric = pd.to_numeric(series, errors="coerce") + candidates: dict[str, pd.Series] = {"none": numeric} + valid = numeric.dropna() + if valid.empty: + return candidates + candidates["cuberoot"] = numeric.apply( + lambda value: np.cbrt(value) if pd.notna(value) else value + ) + if float(valid.min()) >= 0.0: + candidates["sqrt"] = numeric.apply( + lambda value: np.sqrt(value) if pd.notna(value) else value + ) + candidates["log1p"] = pd.Series(np.log1p(numeric), index=numeric.index) + return candidates + + +def _transform_score(series: pd.Series) -> dict[str, Any]: + """ + Score one transformed series using deterministic shape criteria. + + :param series: transformed numeric series + :return: score details + """ + summary = _univariate_summary(series) + valid = pd.to_numeric(series, errors="coerce").dropna() + if valid.empty: + return { + "summary": summary, + "score": None, + } + abs_skew = abs(float(summary["skew"])) if summary["skew"] is not None else 99.0 + abs_kurtosis = abs(float(summary["kurtosis"])) if summary["kurtosis"] is not None else 99.0 + tail_ratio = float(summary["tail_ratio_p99_p50"]) if summary["tail_ratio_p99_p50"] is not None else 99.0 + score = float(abs_skew + 0.25 * abs_kurtosis + 0.10 * tail_ratio) + return { + "summary": summary, + "score": score, + } + + +class _TemporalStatsArgs(pydantic.BaseModel): + """ + Store arguments for deterministic temporal statistics. + """ + + model_config = pydantic.ConfigDict(extra="forbid") + path: str + time_col: str + secondary_keys: list[str] | None = None + winner_formatter: dict[str, Any] | None = None + + +@ltools.tool(args_schema=_TemporalStatsArgs) +def compute_temporal_stats( + path: str, + time_col: str, + secondary_keys: list[str] | None = None, + winner_formatter: dict[str, Any] | None = None, +) -> dict: + """ + Compute deterministic temporal range, coverage, and sampling-frequency + statistics. + + Theory: + Time-series coverage is defined relative to an expected sampling interval. + Once the timestamps are parsed, the empirical deltas between consecutive + observations reveal the dominant cadence of the data. That cadence becomes + the expected frequency against which we can measure irregular sampling, + missing timestamps, longest gaps, and per-entity coverage. 
For panel data, + these statistics must be computed per entity (or per composite entity key), + because a dataset can be well covered overall while still containing weak or + sparse individual series. + + :param path: dataset path + :param time_col: selected time column + :param secondary_keys: optional entity key columns + :param winner_formatter: optional datetime parsing kwargs + :return: temporal statistics payload + """ + dataset_path = pathlib.Path(path) + dataset = load_dataset(dataset_path) + if time_col not in dataset.columns: + raise KeyError(f"time_col '{time_col}' not found in dataset") + secondary_keys = [ + key for key in (secondary_keys or []) if key in dataset.columns and key != time_col + ] + timestamp = _parse_time_series(dataset, time_col, winner_formatter) + valid_rows = dataset.copy() + valid_rows["_ts"] = timestamp + valid_rows = valid_rows.dropna(subset=["_ts"]) + if secondary_keys: + grouped_iter = valid_rows.groupby(secondary_keys, dropna=True) + group_items = list(grouped_iter) + else: + group_items = [(tuple(), valid_rows)] + + all_deltas: list[pd.Timedelta] = [] + per_entity: list[dict[str, Any]] = [] + global_min = None if valid_rows.empty else valid_rows["_ts"].min() + global_max = None if valid_rows.empty else valid_rows["_ts"].max() + + for raw_key, frame in group_items: + key_tuple = raw_key if isinstance(raw_key, tuple) else (raw_key,) + unique_ts = ( + frame["_ts"].dropna().drop_duplicates().sort_values().reset_index(drop=True) + ) + n_observed = int(unique_ts.shape[0]) + if n_observed >= 2: + deltas = unique_ts.diff().dropna() + positive_deltas = deltas[deltas > pd.Timedelta(0)] + else: + positive_deltas = pd.Series(dtype="timedelta64[ns]") + all_deltas.extend(list(positive_deltas.tolist())) + per_entity.append( + { + "entity": _series_identifier(secondary_keys, key_tuple), + "n_observed_timestamps": n_observed, + "min_time": None if unique_ts.empty else str(unique_ts.min()), + "max_time": None if unique_ts.empty else str(unique_ts.max()), + "_positive_deltas": positive_deltas, + } + ) + + if all_deltas: + delta_series = pd.Series(all_deltas, dtype="timedelta64[ns]") + mode_candidates = delta_series.mode() + mode_delta = None if mode_candidates.empty else mode_candidates.iloc[0] + median_delta = delta_series.median() + dominant_fraction = ( + 0.0 + if mode_delta is None + else float((delta_series == mode_delta).mean()) + ) + expected_delta = mode_delta if dominant_fraction >= 0.5 else median_delta + is_irregular_sampling = bool( + expected_delta is not None + and float((delta_series == expected_delta).mean()) < 0.8 + ) + else: + delta_series = pd.Series(dtype="timedelta64[ns]") + mode_delta = None + median_delta = None + dominant_fraction = 0.0 + expected_delta = None + is_irregular_sampling = False + + coverage_values: list[float] = [] + total_gaps = 0 + for item in per_entity: + positive_deltas = item.pop("_positive_deltas") + n_observed = item["n_observed_timestamps"] + if n_observed == 0 or expected_delta is None or pd.isna(expected_delta): + coverage_pct = None + n_expected = n_observed + gap_mask = pd.Series(dtype=bool) + longest_gap = None + else: + span = pd.Timestamp(item["max_time"]) - pd.Timestamp(item["min_time"]) + if expected_delta <= pd.Timedelta(0): + n_expected = n_observed + else: + n_expected = int(span / expected_delta) + 1 + n_expected = max(n_expected, n_observed, 1) + coverage_pct = float(100.0 * n_observed / n_expected) + gap_mask = positive_deltas > expected_delta + longest_gap = ( + None if positive_deltas.empty else 
positive_deltas.max()
+            )
+        n_gaps = int(gap_mask.sum()) if not gap_mask.empty else 0
+        total_gaps += n_gaps
+        if coverage_pct is not None:
+            coverage_values.append(coverage_pct)
+        item["n_expected_timestamps"] = int(n_expected)
+        item["coverage_pct"] = coverage_pct
+        item["n_gaps"] = n_gaps
+        item["longest_gap"] = _format_timedelta(longest_gap)
+
+    if expected_delta is None:
+        resampling_decision = "insufficient_data"
+    elif is_irregular_sampling:
+        resampling_decision = "keep_irregular_gap_aware"
+    elif coverage_values and min(coverage_values) < 99.0:
+        resampling_decision = "resample_to_regular_grid"
+    else:
+        resampling_decision = "already_regular"
+
+    coverage_summary = {
+        "n_series": len(per_entity),
+        "mean_coverage_pct": (
+            None if not coverage_values else float(pd.Series(coverage_values).mean())
+        ),
+        "min_coverage_pct": (
+            None if not coverage_values else float(pd.Series(coverage_values).min())
+        ),
+        "max_coverage_pct": (
+            None if not coverage_values else float(pd.Series(coverage_values).max())
+        ),
+        "total_gaps": int(total_gaps),
+    }
+
+    return {
+        "time_col": time_col,
+        "secondary_keys": secondary_keys,
+        "n_nat_time": int(timestamp.isna().sum()),
+        "min_time": None if global_min is None else str(global_min),
+        "max_time": None if global_max is None else str(global_max),
+        "typical_delta_mode": _format_timedelta(mode_delta),
+        "typical_delta_median": _format_timedelta(median_delta),
+        "expected_frequency": _format_timedelta(expected_delta),
+        "dominant_frequency_fraction": dominant_fraction,
+        "is_irregular_sampling": is_irregular_sampling,
+        "resampling_decision": resampling_decision,
+        "coverage_summary": coverage_summary,
+        "coverage_per_entity": per_entity,
+    }
+
+
+class _MissingnessAuditArgs(pydantic.BaseModel):
+    """
+    Store arguments for deterministic missingness auditing.
+    """
+
+    model_config = pydantic.ConfigDict(extra="forbid")
+    path: str
+    time_col: str
+    secondary_keys: list[str] | None = None
+    winner_formatter: dict[str, Any] | None = None
+
+
+@ltools.tool(args_schema=_MissingnessAuditArgs)
+def audit_missingness(
+    path: str,
+    time_col: str,
+    secondary_keys: list[str] | None = None,
+    winner_formatter: dict[str, Any] | None = None,
+) -> dict:
+    """
+    Audit missingness as two distinct problems: missing values and missing
+    timestamps.
+
+    Theory:
+        Missing cells inside observed rows and missing timestamps in the implied
+        time grid are different failure modes. Value missingness tells us which
+        variables are incomplete at the timestamps we did observe. Timestamp
+        missingness tells us whether observations are absent from the expected
+        sampling cadence. The former guides imputation choices per feature; the
+        latter guides reindexing, coverage assessment, and gap-aware modeling.
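+
+    Example:
+        In an hourly series with rows at 00:00, 01:00, and 03:00 where the
+        01:00 value of one sensor is NaN, the NaN is value missingness (an
+        incomplete cell in an observed row), while the absent 02:00 row is
+        timestamp missingness (a hole in the expected hourly grid).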
+ + :param path: dataset path + :param time_col: selected time column + :param secondary_keys: optional entity key columns + :param winner_formatter: optional datetime parsing kwargs + :return: missingness audit payload + """ + dataset_path = pathlib.Path(path) + dataset = load_dataset(dataset_path) + ordered = _ordered_dataset( + dataset, + time_col=time_col, + secondary_keys=secondary_keys, + winner_formatter=winner_formatter, + ) + temporal_report = compute_temporal_stats.invoke( + { + "path": path, + "time_col": time_col, + "secondary_keys": secondary_keys or [], + "winner_formatter": winner_formatter or {}, + } + ) + profiles = _build_column_profiles(dataset) + value_missingness_by_column: list[dict[str, Any]] = [] + total_missing_cells = int(dataset.isna().sum().sum()) + all_series_frames = _iter_series_frames(ordered, secondary_keys) + for col in [str(value) for value in dataset.columns]: + missing_mask = dataset[col].isna() + n_missing = int(missing_mask.sum()) + missing_pct = 0.0 if dataset.empty else float(100.0 * n_missing / len(dataset)) + run_lengths: list[int] = [] + for _, frame in all_series_frames: + frame_run_lengths = _mask_run_lengths(frame[col].isna()) + run_lengths.extend(frame_run_lengths) + eligible_strategies = ["leave_as_nan", "drop_rows"] + profile = profiles[col] + if n_missing > 0 and col != time_col: + eligible_strategies.append("forward_fill") + if profile["is_numeric_like"]: + eligible_strategies.append("interpolate") + if ( + profile["is_numeric_like"] + and profile["is_integer_like"] + and profile["is_nonnegative_like"] + ): + eligible_strategies.append("zero_fill") + value_missingness_by_column.append( + { + "col": col, + "dtype": profile["dtype"], + "n_missing": n_missing, + "missing_pct": missing_pct, + "n_missing_runs": int(len(run_lengths)), + "longest_missing_run": int(max(run_lengths, default=0)), + "eligible_strategies": eligible_strategies, + "sample_values": profile["sample_values"], + } + ) + value_missingness_by_column.sort( + key=lambda item: (item["n_missing"], item["longest_missing_run"]), + reverse=True, + ) + worst_value_col = next( + (item for item in value_missingness_by_column if item["n_missing"] > 0), + None, + ) + timestamp_missingness_by_entity: list[dict[str, Any]] = [] + total_expected_timestamps = 0 + total_observed_timestamps = 0 + total_missing_timestamps = 0 + n_series_with_timestamp_gaps = 0 + for item in temporal_report["coverage_per_entity"]: + n_observed = int(item.get("n_observed_timestamps") or 0) + n_expected = int(item.get("n_expected_timestamps") or n_observed) + n_missing_timestamps = max(0, n_expected - n_observed) + total_expected_timestamps += n_expected + total_observed_timestamps += n_observed + total_missing_timestamps += n_missing_timestamps + if n_missing_timestamps > 0: + n_series_with_timestamp_gaps += 1 + timestamp_missingness_by_entity.append( + { + "entity": item.get("entity"), + "n_observed_timestamps": n_observed, + "n_expected_timestamps": n_expected, + "n_missing_timestamps": n_missing_timestamps, + "coverage_pct": item.get("coverage_pct"), + "n_gaps": int(item.get("n_gaps") or 0), + "longest_gap": item.get("longest_gap"), + } + ) + timestamp_missingness_by_entity.sort( + key=lambda item: ( + item["n_missing_timestamps"], + item["n_gaps"], + item["coverage_pct"] if item["coverage_pct"] is not None else -1.0, + ), + reverse=True, + ) + return { + "time_col": time_col, + "secondary_keys": [ + key for key in (secondary_keys or []) if key in dataset.columns and key != time_col + ], + "n_rows": 
int(dataset.shape[0]),
+        "n_cols": int(dataset.shape[1]),
+        "value_missingness_summary": {
+            "total_missing_cells": total_missing_cells,
+            "total_missing_pct": (
+                0.0
+                if dataset.empty
+                else float(100.0 * total_missing_cells / max(1, int(dataset.size)))
+            ),
+            "columns_with_missing_values": int(sum(item["n_missing"] > 0 for item in value_missingness_by_column)),
+            "worst_column": None if worst_value_col is None else worst_value_col["col"],
+            "worst_column_missing_pct": (
+                None if worst_value_col is None else worst_value_col["missing_pct"]
+            ),
+        },
+        "value_missingness_by_column": value_missingness_by_column,
+        "timestamp_missingness_summary": {
+            "expected_frequency": temporal_report["expected_frequency"],
+            "is_irregular_sampling": temporal_report["is_irregular_sampling"],
+            "resampling_decision": temporal_report["resampling_decision"],
+            "n_nat_time": temporal_report["n_nat_time"],
+            "total_expected_timestamps": total_expected_timestamps,
+            "total_observed_timestamps": total_observed_timestamps,
+            "total_missing_timestamps": total_missing_timestamps,
+            "n_series_with_timestamp_gaps": n_series_with_timestamp_gaps,
+        },
+        "timestamp_missingness_by_entity": timestamp_missingness_by_entity,
+        "column_profiles": profiles,
+    }
+
+
+class MissingnessActionSpec(pydantic.BaseModel):
+    """
+    Store one bounded missingness action.
+    """
+
+    model_config = pydantic.ConfigDict(extra="forbid")
+    col: str
+    strategy: str
+    create_missingness_flag: bool = True
+    reason: str = ""
+
+
+class _ApplyMissingnessActionsArgs(pydantic.BaseModel):
+    """
+    Store arguments for deterministic missingness handling.
+    """
+
+    model_config = pydantic.ConfigDict(extra="forbid")
+    source_path: str
+    input_path: str
+    time_col: str
+    secondary_keys: list[str] | None = None
+    winner_formatter: dict[str, Any] | None = None
+    actions: list[MissingnessActionSpec]
+
+
+@ltools.tool(args_schema=_ApplyMissingnessActionsArgs)
+def apply_missingness_actions(
+    source_path: str,
+    input_path: str,
+    time_col: str,
+    secondary_keys: list[str] | None = None,
+    winner_formatter: dict[str, Any] | None = None,
+    actions: list[MissingnessActionSpec] | None = None,
+) -> dict:
+    """
+    Apply one bounded missingness strategy per selected column.
+
+    Theory:
+        The policy choice for each column may be ambiguous, but the mechanics of
+        applying a chosen action should be deterministic and reproducible. By
+        sorting within entity/time order, optionally adding missingness flags, and
+        then applying simple bounded transforms, the stage can record exactly what
+        changed without allowing the LLM to mutate data directly.
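+
+    Example:
+        A minimal plan (hypothetical column name) that forward-fills
+        ``temperature`` within each entity and records which cells were
+        imputed:
+
+            actions=[
+                MissingnessActionSpec(
+                    col="temperature",
+                    strategy="forward_fill",
+                    create_missingness_flag=True,
+                    reason="short gaps in a slowly varying signal",
+                )
+            ]
+
+        The resulting flag column is named ``temperature__was_missing``.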
+ + :param source_path: original dataset path used for artifact naming + :param input_path: dataset path to transform + :param time_col: selected time column + :param secondary_keys: optional entity key columns + :param winner_formatter: optional datetime parsing kwargs + :param actions: bounded per-column action plan + :return: transformation report with output dataset path + """ + dataset = load_dataset(pathlib.Path(input_path)) + working = _ordered_dataset( + dataset, + time_col=time_col, + secondary_keys=secondary_keys, + winner_formatter=winner_formatter, + ) + valid_secondary_keys = [ + key + for key in (secondary_keys or []) + if key in working.columns and key != time_col + ] + action_items = [item.model_dump() if isinstance(item, pydantic.BaseModel) else item for item in (actions or [])] + drop_mask = pd.Series(False, index=working.index) + applied_actions: list[dict[str, Any]] = [] + for action in action_items: + col = str(action["col"]) + strategy = str(action["strategy"]) + create_missingness_flag = bool(action.get("create_missingness_flag", True)) + reason = str(action.get("reason") or "") + if col not in working.columns: + applied_actions.append( + { + "col": col, + "strategy": strategy, + "status": "skipped_missing_column", + "reason": reason, + } + ) + continue + before_mask = working[col].isna() + n_missing_before = int(before_mask.sum()) + if create_missingness_flag and n_missing_before > 0: + flag_col = f"{col}__was_missing" + if flag_col not in working.columns: + working[flag_col] = before_mask.astype(int) + status = "applied" + if strategy == "leave_as_nan": + pass + elif strategy == "drop_rows": + drop_mask = drop_mask | before_mask + elif strategy == "forward_fill": + if valid_secondary_keys: + working[col] = working.groupby(valid_secondary_keys, dropna=False)[col].ffill() + else: + working[col] = working[col].ffill() + elif strategy == "interpolate": + numeric = pd.to_numeric(working[col], errors="coerce") + if valid_secondary_keys: + working[col] = working.groupby(valid_secondary_keys, dropna=False)[numeric.name].transform( + lambda series: pd.to_numeric(series, errors="coerce").interpolate( + limit_area="inside" + ) + ) + else: + working[col] = numeric.interpolate(limit_area="inside") + elif strategy == "zero_fill": + numeric = pd.to_numeric(working[col], errors="coerce") + working[col] = numeric.fillna(0.0) + else: + status = "skipped_unsupported_strategy" + n_missing_after = int(working[col].isna().sum()) if col in working.columns else None + applied_actions.append( + { + "col": col, + "strategy": strategy, + "status": status, + "reason": reason, + "create_missingness_flag": create_missingness_flag, + "n_missing_before": n_missing_before, + "n_missing_after": n_missing_after, + "n_values_filled": None if n_missing_after is None else max(0, n_missing_before - n_missing_after), + "n_rows_marked_for_drop": int(before_mask.sum()) if strategy == "drop_rows" else 0, + } + ) + n_rows_before = int(working.shape[0]) + if bool(drop_mask.any()): + working = working.loc[~drop_mask].copy() + n_rows_after = int(working.shape[0]) + n_rows_dropped = max(0, n_rows_before - n_rows_after) + remaining_missing_by_column = { + str(col): int(working[col].isna().sum()) + for col in working.columns + if not str(col).startswith("_") + } + output_dataset = working.drop(columns=["_ts", "_row_order"], errors="ignore") + output_path = write_stage_dataset(source_path, "handle_missingness", output_dataset) + return { + "input_path": input_path, + "output_path": output_path, + "n_rows_before": 
n_rows_before, + "n_rows_after": n_rows_after, + "n_rows_dropped": n_rows_dropped, + "actions_applied": applied_actions, + "remaining_missing_by_column": remaining_missing_by_column, + "sorted_by": valid_secondary_keys + (["_ts"] if "_ts" in working.columns else []), + } + + +class _ScaleProfileArgs(pydantic.BaseModel): + """ + Store arguments for deterministic scale profiling. + """ + + model_config = pydantic.ConfigDict(extra="forbid") + path: str + numeric_continuous_cols: list[str] | None = None + numeric_count_cols: list[str] | None = None + binary_flag_cols: list[str] | None = None + + +@ltools.tool(args_schema=_ScaleProfileArgs) +def profile_standardization_candidates( + path: str, + numeric_continuous_cols: list[str] | None = None, + numeric_count_cols: list[str] | None = None, + binary_flag_cols: list[str] | None = None, +) -> dict: + """ + Profile scale and tail behavior for numeric features. + + Theory: + Standardization is only justified when the observed numeric scales or tail + behaviors would otherwise distort comparisons or downstream models. Robust + scaling depends on median/IQR support, while `log1p` depends on nonnegative + support and heavy right tails. These properties can be measured + deterministically before the LLM decides whether the optional transform is + worth applying. + + :param path: dataset path + :param numeric_continuous_cols: inferred continuous numeric columns + :param numeric_count_cols: inferred count-like numeric columns + :param binary_flag_cols: inferred binary columns to exclude + :return: per-column scale profile + """ + dataset = load_dataset(pathlib.Path(path)) + continuous = [col for col in (numeric_continuous_cols or []) if col in dataset.columns] + counts = [col for col in (numeric_count_cols or []) if col in dataset.columns] + excluded = {col for col in (binary_flag_cols or []) if col in dataset.columns} + candidate_cols = [col for col in continuous + counts if col not in excluded] + candidate_cols = list(dict.fromkeys(candidate_cols)) + per_column: list[dict[str, Any]] = [] + iqr_values: list[float] = [] + for col in candidate_cols: + numeric = pd.to_numeric(dataset[col], errors="coerce").dropna() + if numeric.empty: + continue + median = numeric.median() + q1 = numeric.quantile(0.25) + q3 = numeric.quantile(0.75) + iqr = q3 - q1 + p01 = numeric.quantile(0.01) + p50 = numeric.quantile(0.50) + p99 = numeric.quantile(0.99) + positive_fraction = float((numeric >= 0).mean()) + zero_fraction = float((numeric == 0).mean()) + abs_median = abs(float(median)) if not pd.isna(median) else 0.0 + tail_ratio = None + if p50 > 0: + tail_ratio = float(p99 / p50) + if float(iqr) > 0.0: + iqr_values.append(float(iqr)) + feature_bucket = "numeric_continuous" if col in continuous else "numeric_count" + eligible_actions = ["none"] + if float(iqr) > 0.0: + eligible_actions.append("robust_scale") + if float(numeric.min()) >= 0.0: + eligible_actions.append("log1p") + if "robust_scale" in eligible_actions and "log1p" in eligible_actions: + eligible_actions.append("log1p_then_robust_scale") + per_column.append( + { + "col": col, + "feature_bucket": feature_bucket, + "n_non_null": int(numeric.shape[0]), + "min": _safe_float(numeric.min()), + "max": _safe_float(numeric.max()), + "mean": _safe_float(numeric.mean()), + "std": _safe_float(numeric.std()), + "median": _safe_float(median), + "iqr": _safe_float(iqr), + "p01": _safe_float(p01), + "p50": _safe_float(p50), + "p99": _safe_float(p99), + "zero_fraction": zero_fraction, + "positive_fraction": positive_fraction, + 
"skew": _safe_float(numeric.skew()), + "tail_ratio_p99_p50": None if tail_ratio is None else tail_ratio, + "scale_span": _safe_float(numeric.max() - numeric.min()), + "relative_iqr_to_median": None if abs_median <= 0.0 else float(iqr / abs_median), + "eligible_actions": eligible_actions, + } + ) + positive_iqrs = [value for value in iqr_values if value > 0.0] + return { + "path": path, + "candidate_cols": [item["col"] for item in per_column], + "n_candidate_cols": len(per_column), + "scale_summary": { + "max_iqr": None if not positive_iqrs else float(max(positive_iqrs)), + "min_positive_iqr": None if not positive_iqrs else float(min(positive_iqrs)), + "iqr_ratio_max_to_min": ( + None + if len(positive_iqrs) < 2 or min(positive_iqrs) <= 0.0 + else float(max(positive_iqrs) / min(positive_iqrs)) + ), + "n_nontrivial_log_candidates": int( + sum( + ( + item["min"] is not None + and item["min"] >= 0.0 + and item["tail_ratio_p99_p50"] is not None + and item["tail_ratio_p99_p50"] >= 5.0 + ) + for item in per_column + ) + ), + }, + "per_column": per_column, + } + + +class StandardizationActionSpec(pydantic.BaseModel): + """ + Store one bounded standardization action. + """ + + model_config = pydantic.ConfigDict(extra="forbid") + col: str + action: str + reason: str = "" + + +class _ApplyStandardizationArgs(pydantic.BaseModel): + """ + Store arguments for deterministic standardization. + """ + + model_config = pydantic.ConfigDict(extra="forbid") + source_path: str + input_path: str + actions: list[StandardizationActionSpec] + + +@ltools.tool(args_schema=_ApplyStandardizationArgs) +def apply_standardization_actions( + source_path: str, + input_path: str, + actions: list[StandardizationActionSpec] | None = None, +) -> dict: + """ + Apply bounded numeric transforms deterministically. + + Theory: + Whether a transform is desirable is an interpretive decision, but the + transform itself should be a pure function of the observed column values and + recorded parameters. Persisting medians, IQRs, and log usage makes the + optional stage reproducible and auditable. 
+ + :param source_path: original dataset path used for artifact naming + :param input_path: dataset path to transform + :param actions: bounded per-column transformation plan + :return: transformation report with output dataset path + """ + dataset = load_dataset(pathlib.Path(input_path)).copy() + action_items = [item.model_dump() if isinstance(item, pydantic.BaseModel) else item for item in (actions or [])] + applied_actions: list[dict[str, Any]] = [] + for action in action_items: + col = str(action["col"]) + transform = str(action["action"]) + reason = str(action.get("reason") or "") + if col not in dataset.columns: + applied_actions.append( + { + "col": col, + "action": transform, + "status": "skipped_missing_column", + "reason": reason, + } + ) + continue + numeric = pd.to_numeric(dataset[col], errors="coerce") + valid = numeric.dropna() + if valid.empty: + applied_actions.append( + { + "col": col, + "action": transform, + "status": "skipped_no_numeric_values", + "reason": reason, + } + ) + continue + params: dict[str, Any] = {} + transformed = numeric.copy() + status = "applied" + if transform == "none": + pass + elif transform == "robust_scale": + median = valid.median() + q1 = valid.quantile(0.25) + q3 = valid.quantile(0.75) + iqr = q3 - q1 + if float(iqr) <= 0.0: + status = "skipped_zero_iqr" + else: + transformed = (numeric - median) / iqr + params = {"median": float(median), "iqr": float(iqr)} + elif transform == "log1p": + if float(valid.min()) < 0.0: + status = "skipped_negative_values" + else: + transformed = pd.Series(np.log1p(numeric), index=numeric.index) + params = {"log1p": True} + elif transform == "log1p_then_robust_scale": + if float(valid.min()) < 0.0: + status = "skipped_negative_values" + else: + logged = pd.Series(np.log1p(numeric), index=numeric.index) + logged_valid = logged.dropna() + median = logged_valid.median() + q1 = logged_valid.quantile(0.25) + q3 = logged_valid.quantile(0.75) + iqr = q3 - q1 + if float(iqr) <= 0.0: + status = "skipped_zero_iqr_after_log1p" + else: + transformed = (logged - median) / iqr + params = { + "log1p": True, + "median_after_log1p": float(median), + "iqr_after_log1p": float(iqr), + } + else: + status = "skipped_unsupported_action" + if status == "applied": + dataset[col] = transformed + applied_actions.append( + { + "col": col, + "action": transform, + "status": status, + "reason": reason, + "params": params, + } + ) + output_path = write_stage_dataset(source_path, "standardize", dataset) + return { + "input_path": input_path, + "output_path": output_path, + "actions_applied": applied_actions, + } + + +class _UnivariateAnalysisArgs(pydantic.BaseModel): + """ + Store arguments for deterministic univariate analysis. + """ + + model_config = pydantic.ConfigDict(extra="forbid") + source_path: str + input_path: str + time_col: str + secondary_keys: list[str] | None = None + numeric_continuous_cols: list[str] | None = None + numeric_count_cols: list[str] | None = None + binary_flag_cols: list[str] | None = None + + +@ltools.tool(args_schema=_UnivariateAnalysisArgs) +def compute_univariate_metrics_and_plots( + source_path: str, + input_path: str, + time_col: str, + secondary_keys: list[str] | None = None, + numeric_continuous_cols: list[str] | None = None, + numeric_count_cols: list[str] | None = None, + binary_flag_cols: list[str] | None = None, +) -> dict: + """ + Compute deterministic univariate metrics and produce per-column plots. + + Theory: + Univariate EDA starts by measuring one feature at a time. 
Summary metrics + expose support, spread, skew, missingness, and tail behavior, while + histogram/ECDF/KDE plots show what "normal values" look like. For panel + data, per-entity summaries are also useful because a few odd entities can + hide inside an otherwise normal aggregate distribution. + + :param source_path: original dataset path used for artifact naming + :param input_path: dataset path to analyze + :param time_col: selected time column + :param secondary_keys: optional entity key columns + :param numeric_continuous_cols: inferred continuous numeric columns + :param numeric_count_cols: inferred count columns + :param binary_flag_cols: inferred binary columns + :return: summary report and plot manifest + """ + import matplotlib + matplotlib.use("Agg") + import matplotlib.pyplot as plt + + dataset = load_dataset(pathlib.Path(input_path)) + candidate_cols = _candidate_univariate_numeric_cols( + dataset, + time_col=time_col, + secondary_keys=secondary_keys, + numeric_continuous_cols=numeric_continuous_cols, + numeric_count_cols=numeric_count_cols, + binary_flag_cols=binary_flag_cols, + ) + overall_feature_summaries: list[dict[str, Any]] = [] + per_entity_feature_summaries: list[dict[str, Any]] = [] + plot_manifest: list[dict[str, Any]] = [] + valid_secondary_keys = [ + key for key in (secondary_keys or []) if key in dataset.columns and key != time_col + ] + for col in candidate_cols: + summary = _univariate_summary(dataset[col]) + summary["col"] = col + summary["feature_bucket"] = ( + "numeric_continuous" + if col in (numeric_continuous_cols or []) + else "numeric_count" + if col in (numeric_count_cols or []) + else "binary_flag" + if col in (binary_flag_cols or []) + else "numeric" + ) + overall_feature_summaries.append(summary) + + numeric = pd.to_numeric(dataset[col], errors="coerce").dropna() + fig, axes = plt.subplots(1, 2, figsize=(10, 3.8)) + if numeric.empty: + axes[0].text(0.5, 0.5, "No numeric observations", ha="center", va="center") + axes[0].set_axis_off() + axes[1].text(0.5, 0.5, "No numeric observations", ha="center", va="center") + axes[1].set_axis_off() + kde_plotted = False + else: + n_bins = int(min(50, max(10, np.sqrt(numeric.shape[0])))) + axes[0].hist(numeric, bins=n_bins, color="#4472C4", alpha=0.75, density=True) + kde_curve = _gaussian_kde_curve(numeric) + kde_plotted = kde_curve is not None + if kde_curve is not None: + x_grid, density = kde_curve + axes[0].plot(x_grid, density, color="#D62728", linewidth=1.5) + sorted_vals = np.sort(numeric.to_numpy(dtype=float)) + y_ecdf = np.arange(1, sorted_vals.size + 1) / float(sorted_vals.size) + axes[1].step(sorted_vals, y_ecdf, where="post", color="#2CA02C", linewidth=1.5) + axes[1].set_ylim(0.0, 1.0) + axes[0].set_title(f"{col} histogram") + axes[1].set_title(f"{col} ECDF") + fig.suptitle( + f"{col} | skew={summary['skew']} | tail_ratio={summary['tail_ratio_p99_p50']}", + fontsize=10, + ) + plot_path = write_stage_plot(source_path, "univariate_metrics_plotting", f"{col}.distribution", fig) + plt.close(fig) + plot_manifest.append( + { + "col": col, + "plot_path": plot_path, + "kde_plotted": kde_plotted, + } + ) + + if valid_secondary_keys: + grouped = dataset.groupby(valid_secondary_keys, dropna=False, sort=False) + for raw_key, frame in grouped: + key_tuple = raw_key if isinstance(raw_key, tuple) else (raw_key,) + entity = _series_identifier(valid_secondary_keys, key_tuple) + entity_summary = _univariate_summary(frame[col]) + entity_summary["col"] = col + entity_summary["entity"] = entity + 
per_entity_feature_summaries.append(entity_summary) + + overall_feature_summaries.sort( + key=lambda item: ( + item["missing_pct"] if item["missing_pct"] is not None else -1.0, + abs(item["skew"]) if item["skew"] is not None else -1.0, + item["tail_ratio_p99_p50"] if item["tail_ratio_p99_p50"] is not None else -1.0, + ), + reverse=True, + ) + return { + "input_path": input_path, + "analysis_numeric_cols": candidate_cols, + "overall_feature_summaries": overall_feature_summaries, + "per_entity_feature_summaries": per_entity_feature_summaries, + "plot_manifest": plot_manifest, + } + + +class _TransformTestArgs(pydantic.BaseModel): + """ + Store arguments for deterministic transform testing. + """ + + model_config = pydantic.ConfigDict(extra="forbid") + source_path: str + input_path: str + time_col: str + secondary_keys: list[str] | None = None + numeric_continuous_cols: list[str] | None = None + numeric_count_cols: list[str] | None = None + binary_flag_cols: list[str] | None = None + + +@ltools.tool(args_schema=_TransformTestArgs) +def test_univariate_transforms( + source_path: str, + input_path: str, + time_col: str, + secondary_keys: list[str] | None = None, + numeric_continuous_cols: list[str] | None = None, + numeric_count_cols: list[str] | None = None, + binary_flag_cols: list[str] | None = None, +) -> dict: + """ + Deterministically compare candidate transforms for skewed or heavy-tailed + numeric features. + + Theory: + Transform testing should only run when there is enough empirical evidence + that raw values may violate practical modeling assumptions or obscure + univariate structure. The decision can be made deterministically from + summary shape metrics such as skewness and tail ratios. Candidate transforms + are then compared by how much they reduce those distortions. + + :param source_path: original dataset path used for trace naming + :param input_path: dataset path to analyze + :param time_col: selected time column + :param secondary_keys: optional entity key columns + :param numeric_continuous_cols: inferred continuous numeric columns + :param numeric_count_cols: inferred count columns + :param binary_flag_cols: inferred binary columns + :return: transform test report + """ + dataset = load_dataset(pathlib.Path(input_path)) + candidate_cols = _candidate_univariate_numeric_cols( + dataset, + time_col=time_col, + secondary_keys=secondary_keys, + numeric_continuous_cols=numeric_continuous_cols, + numeric_count_cols=numeric_count_cols, + binary_flag_cols=binary_flag_cols, + ) + tested_columns: list[dict[str, Any]] = [] + skipped_columns: list[dict[str, Any]] = [] + for col in candidate_cols: + numeric = pd.to_numeric(dataset[col], errors="coerce") + base_summary = _univariate_summary(numeric) + n_non_null = int(base_summary["n_non_null"]) + abs_skew = abs(float(base_summary["skew"])) if base_summary["skew"] is not None else 0.0 + tail_ratio = float(base_summary["tail_ratio_p99_p50"]) if base_summary["tail_ratio_p99_p50"] is not None else 0.0 + should_test = bool( + n_non_null >= 30 + and ( + abs_skew >= 1.0 + or tail_ratio >= 4.0 + ) + ) + if not should_test: + skipped_columns.append( + { + "col": col, + "reason": ( + "Insufficient deterministic evidence for transform testing. 
" + f"n_non_null={n_non_null}, abs_skew={abs_skew:.3f}, tail_ratio={tail_ratio:.3f}" + ), + "base_summary": base_summary, + } + ) + continue + candidate_scores: list[dict[str, Any]] = [] + for name, transformed in _transform_candidates(numeric).items(): + score_payload = _transform_score(transformed) + candidate_scores.append( + { + "transform": name, + "score": score_payload["score"], + "summary": score_payload["summary"], + } + ) + valid_scores = [item for item in candidate_scores if item["score"] is not None] + valid_scores.sort(key=lambda item: float(item["score"])) + best = valid_scores[0] + baseline = next(item for item in valid_scores if item["transform"] == "none") + improvement = float(baseline["score"] - best["score"]) + if best["transform"] == "none" or improvement < 0.25: + recommendation = "none" + reason = ( + "Candidate transforms did not materially improve deterministic shape metrics " + f"(best_improvement={improvement:.3f})." + ) + else: + recommendation = best["transform"] + reason = ( + f"{best['transform']} best reduced deterministic shape distortion " + f"(baseline_score={baseline['score']:.3f}, best_score={best['score']:.3f})." + ) + tested_columns.append( + { + "col": col, + "base_summary": base_summary, + "candidate_scores": valid_scores, + "recommended_transform": recommendation, + "improvement_over_none": improvement, + "reason": reason, + } + ) + payload = { + "input_path": input_path, + "n_candidate_cols": len(candidate_cols), + "n_tested_cols": len(tested_columns), + "n_skipped_cols": len(skipped_columns), + "tested_columns": tested_columns, + "skipped_columns": skipped_columns, + } + write_stage_trace(source_path, "test_transforms", payload) + return payload + + +def analyze_header(state: dict) -> dict: + """ + Validate dataset headers. + + :param state: graph state containing dataset path + :return: updated state fields with header status + """ + path = pathlib.Path(str(state["path"])) + dataset = load_dataset(path) + cols = list(dataset.columns) + has_header = True + error = "" + if ( + all(isinstance(col, int) for col in cols) + and cols == list(range(len(cols))) + ): + has_header = False + error = "No column names." + else: + for col in cols: + if col is None: + has_header = False + error = "One or more column names missing." + break + col_name = str(col).strip() + if col_name == "": + has_header = False + error = "One or more column names missing." + break + if ( + col_name[0].isdigit() + or not _VALID_HEADER_START_RE.match(col_name) + ): + has_header = False + error = ( + "One or more column names start with invalid characters." + ) + break + if has_header: + result = {"has_header": has_header, "dataset": dataset} + else: + result = {"has_header": has_header, "error": error} + return result + + +@ltools.tool +def extract_metadata(path: str) -> dict: + """ + Return minimal dataset metadata. + + :param path: dataset path + :return: metadata with shape and per-column cardinality + """ + dataset_path = pathlib.Path(path) + dataset = load_dataset(dataset_path) + n_rows, n_cols = dataset.shape + n_unique = dataset.nunique(dropna=True) + n_unique_map = {str(col): int(n_unique[col]) for col in n_unique.index} + metadata = { + "n_rows": int(n_rows), + "n_cols": int(n_cols), + "n_unique": n_unique_map, + } + return metadata + + +@ltools.tool +def extract_column_profiles(path: str) -> dict: + """ + Profile each column using value-level statistics rather than relying on + names alone. 
+ + Theory: + Semantic feature inference becomes more robust when it is grounded in + empirical column behavior. Binary flags tend to have two states, counts + tend to be nonnegative integers, continuous measurements usually have many + distinct real-valued observations, and identifiers often repeat but are not + numeric measurements. These profile statistics give later stages stable + evidence even when column names are unhelpful. + + :param path: dataset path + :return: per-column profile map + """ + dataset_path = pathlib.Path(path) + dataset = load_dataset(dataset_path) + profiles = _build_column_profiles(dataset) + return {"column_profiles": profiles} + + +class _EntityCandidateArgs(pydantic.BaseModel): + """ + Store arguments for deterministic entity-key scoring. + """ + + model_config = pydantic.ConfigDict(extra="forbid") + path: str + time_col: str + candidate_cols: list[str] | None = None + max_combo_size: int = 2 + + +@ltools.tool(args_schema=_EntityCandidateArgs) +def score_entity_candidates( + path: str, + time_col: str, + candidate_cols: list[str] | None = None, + max_combo_size: int = 2, +) -> dict: + """ + Score candidate entity keys by how well they partition repeated time-series + observations into stable per-entity trajectories. + + Theory: + A useful entity key in panel data should do three things. First, entities + should reappear across multiple rows, otherwise the key behaves like a + row-level identifier rather than a series identifier. Second, the pair + `(entity_key, time_col)` should be close to unique, because that pair is + the natural coordinate system of a panel time series. Third, a good entity + key should explain repeated timestamps by reducing collisions once the + entity dimension is included. These criteria are deterministic and more + reliable than name-based guessing. 
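+
+    Example:
+        In a hypothetical two-turbine panel keyed by ``turbine_id``, the
+        candidate ``["turbine_id"]`` makes ``(turbine_id, time_col)`` nearly
+        unique while each turbine repeats across many rows, so pair uniqueness,
+        repeatability, and entity reuse are all high and the combined score
+        clears the 0.60 acceptance threshold.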
+ + :param path: dataset path + :param time_col: selected time column + :param candidate_cols: optional candidate entity columns + :param max_combo_size: max size of composite key combinations to evaluate + :return: scored candidate report with recommended secondary keys + """ + dataset_path = pathlib.Path(path) + dataset = load_dataset(dataset_path) + if time_col not in dataset.columns: + raise KeyError(f"time_col '{time_col}' not found in dataset") + timestamp = pd.to_datetime(dataset[time_col], errors="coerce") + profiles = _build_column_profiles(dataset) + available_cols = [str(col) for col in dataset.columns if str(col) != time_col] + if candidate_cols is None: + selected = [] + for col in available_cols: + profile = profiles[col] + if profile["n_unique"] <= 1: + continue + if profile["unique_ratio"] >= 1.0: + continue + selected.append(col) + candidate_cols = selected + else: + candidate_cols = [ + col for col in candidate_cols if col in dataset.columns and col != time_col + ] + candidate_cols = sorted(dict.fromkeys(candidate_cols)) + max_combo_size = max(1, min(int(max_combo_size), 2)) + duplicate_timestamps = int(timestamp.dropna().duplicated().sum()) + candidates: list[dict[str, Any]] = [] + for combo_size in range(1, max_combo_size + 1): + for combo in itertools.combinations(candidate_cols, combo_size): + subset = dataset[list(combo)].copy() + subset["_ts"] = timestamp + valid = subset.dropna(subset=[*combo, "_ts"]) + if valid.empty: + continue + group_sizes = valid.groupby(list(combo), dropna=True).size() + if group_sizes.empty: + continue + n_entities = int(group_sizes.shape[0]) + mean_obs_per_entity = float(group_sizes.mean()) + entity_reuse_fraction = float((group_sizes > 1).mean()) + duplicate_pairs = int( + valid.duplicated(subset=[*combo, "_ts"]).sum() + ) + pair_uniqueness = float( + 1.0 - (duplicate_pairs / max(1, int(valid.shape[0]))) + ) + if duplicate_timestamps > 0: + collision_reduction = float( + 1.0 - (duplicate_pairs / max(1, duplicate_timestamps)) + ) + else: + collision_reduction = 1.0 if mean_obs_per_entity > 1.0 else 0.0 + repeatability_score = float(min(max((mean_obs_per_entity - 1.0) / 4.0, 0.0), 1.0)) + score = float( + 0.35 * pair_uniqueness + + 0.35 * repeatability_score + + 0.20 * entity_reuse_fraction + + 0.10 * max(0.0, min(collision_reduction, 1.0)) + ) + candidates.append( + { + "secondary_keys": list(combo), + "n_entities": n_entities, + "mean_obs_per_entity": mean_obs_per_entity, + "entity_reuse_fraction": entity_reuse_fraction, + "duplicate_entity_timestamp_pairs": duplicate_pairs, + "pair_uniqueness": pair_uniqueness, + "collision_reduction": collision_reduction, + "score": score, + } + ) + candidates.sort( + key=lambda item: ( + item["score"], + item["entity_reuse_fraction"], + item["mean_obs_per_entity"], + -len(item["secondary_keys"]), + ), + reverse=True, + ) + top_candidate = candidates[0] if candidates else None + if ( + top_candidate is not None + and top_candidate["score"] >= 0.60 + and top_candidate["n_entities"] >= 2 + and top_candidate["mean_obs_per_entity"] >= 2.0 + ): + recommended_secondary_keys = top_candidate["secondary_keys"] + else: + recommended_secondary_keys = [] + return { + "time_col": time_col, + "duplicate_timestamps": duplicate_timestamps, + "candidate_cols": candidate_cols, + "candidates": candidates[:10], + "recommended_secondary_keys": recommended_secondary_keys, + } + + +class _FeatureBucketsArgs(pydantic.BaseModel): + """ + Store arguments for deterministic semantic feature typing. 
+ """ + + model_config = pydantic.ConfigDict(extra="forbid") + path: str + time_col: str + secondary_keys: list[str] | None = None + + +@ltools.tool(args_schema=_FeatureBucketsArgs) +def infer_feature_buckets( + path: str, + time_col: str, + secondary_keys: list[str] | None = None, +) -> dict: + """ + Deterministically type features from their observed value behavior. + + Theory: + The semantic distinction between counts, binary flags, continuous measures, + and categoricals can often be established directly from the support of the + observed values. Binary flags exhibit two states, counts live on the + nonnegative integers, continuous measures take broader real-valued ranges, + and categorical features are residual non-key columns that do not behave + like numeric measurements. Weakly inferred classes such as targets or + exogenous drivers are intentionally left empty because their meaning depends + more on task context than on value support alone. + + :param path: dataset path + :param time_col: selected time column + :param secondary_keys: optional entity key columns to exclude + :return: semantic feature buckets + """ + dataset_path = pathlib.Path(path) + dataset = load_dataset(dataset_path) + profiles = _build_column_profiles(dataset) + excluded = {time_col, *(secondary_keys or [])} + numeric_continuous_cols: list[str] = [] + numeric_count_cols: list[str] = [] + binary_flag_cols: list[str] = [] + categorical_feature_cols: list[str] = [] + for col in [str(value) for value in dataset.columns]: + if col in excluded: + continue + profile = profiles[col] + if profile["is_binary_like"]: + binary_flag_cols.append(col) + elif ( + profile["is_numeric_like"] + and profile["is_integer_like"] + and profile["is_nonnegative_like"] + and profile["n_unique"] > 2 + ): + numeric_count_cols.append(col) + elif profile["is_numeric_like"]: + numeric_continuous_cols.append(col) + else: + categorical_feature_cols.append(col) + covariate_cols = ( + numeric_continuous_cols + + numeric_count_cols + + binary_flag_cols + + categorical_feature_cols + ) + return { + "numeric_continuous_cols": numeric_continuous_cols, + "numeric_count_cols": numeric_count_cols, + "binary_flag_cols": binary_flag_cols, + "categorical_feature_cols": categorical_feature_cols, + "known_exogenous_cols": [], + "target_cols": [], + "covariate_cols": covariate_cols, + "column_profiles": profiles, + } + + +@ltools.tool +def extract_head(path: str, *, n: int = 5) -> dict: + """ + Return the first rows from a dataset. + + :param path: dataset path + :param n: number of rows to return + :return: head rows serialized as JSON-compatible payload + """ + dataset_path = pathlib.Path(path) + dataset = load_dataset(dataset_path) + n_rows = int(n) + if n_rows <= 0: + n_rows = 5 + n_rows = min(n_rows, 50) + head = dataset.head(n_rows) + rows = json.loads(head.to_json(orient="records", date_format="iso")) + payload = { + "n": n_rows, + "columns": [str(col) for col in head.columns.tolist()], + "rows": rows, + } + return payload diff --git a/agentic_eda/jupyterlab_extension_backend/src/univariate_analysis/__init__.py b/agentic_eda/jupyterlab_extension_backend/src/univariate_analysis/__init__.py new file mode 100644 index 000000000..66ee48f67 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/univariate_analysis/__init__.py @@ -0,0 +1,3 @@ +""" +Univariate analysis stages for the Jupyter backend. 
+""" diff --git a/agentic_eda/jupyterlab_extension_backend/src/univariate_analysis/test_transforms.py b/agentic_eda/jupyterlab_extension_backend/src/univariate_analysis/test_transforms.py new file mode 100644 index 000000000..115afd951 --- /dev/null +++ b/agentic_eda/jupyterlab_extension_backend/src/univariate_analysis/test_transforms.py @@ -0,0 +1,212 @@ +""" +Import as: + +import src.univariate_analysis.test_transforms as stransforms +""" + +from __future__ import annotations + +import argparse +import logging +from typing import TypedDict + +import langgraph.graph as lgraph + +import src.univariate_analysis.univariate_metrics_plotting as sunivar +import src.tools.input_tools as tinptool + +_LOG = logging.getLogger(__name__) + + +class CompositeState(TypedDict): + """ + Store graph state for transform testing. + """ + + path: str + done: list[str] + has_header: bool + has_missing_values: bool + error: str + info: str + cols: list[str] + temporal_cols: list[str] + numeric_val_cols: list[str] + categorical_val_cols: list[str] + bad_rows: list[dict] + metadata: dict + time_col: str + candidates: list[dict] + winner_formatter: dict + entity_col: str | None + numeric_cols: list[str] + nonnegative_cols: list[str] + jump_mult: float + report: dict + summary: str + flag: str + type: str + primary_key: str + secondary_keys: list[str] + numeric_continuous_cols: list[str] + numeric_count_cols: list[str] + binary_flag_cols: list[str] + categorical_feature_cols: list[str] + known_exogenous_cols: list[str] + target_cols: list[str] + covariate_cols: list[str] + n_nat_time: int + min_time: str | None + max_time: str | None + typical_delta_mode: str | None + typical_delta_median: str | None + expected_frequency: str | None + dominant_frequency_fraction: float + is_irregular_sampling: bool + resampling_decision: str + coverage_summary: dict + coverage_per_entity: list[dict] + missingness_report: dict + missingness_plan: dict + missingness_handling_report: dict + quality_dataset_path: str + standardization_profile: dict + standardization_gate: dict + standardization_plan: dict + standardization_report: dict + standardized_dataset_path: str + univariate_report: dict + transform_test_report: dict + + +def call_univariate_metrics_plotting(state: CompositeState) -> dict: + """ + Run the sequential pipeline up to univariate metrics/plots. + + :param state: graph state + :return: composite payload from univariate metrics/plots + """ + payload = sunivar.run_univariate_metrics_plotting(state["path"]) + return payload + + +def test_transforms(state: CompositeState) -> dict: + """ + Compare candidate transforms deterministically for columns where it matters. 
+
+
+def test_transforms(state: CompositeState) -> dict:
+    """
+    Deterministically compare candidate transforms for the columns where a
+    transform could matter.
+
+    :param state: graph state
+    :return: transform test report
+    """
+    analysis_path = state.get("quality_dataset_path") or state["path"]
+    report = tinptool.test_univariate_transforms.invoke(
+        {
+            "source_path": state["path"],
+            "input_path": analysis_path,
+            "time_col": state["primary_key"],
+            "secondary_keys": state["secondary_keys"],
+            "numeric_continuous_cols": state["numeric_continuous_cols"],
+            "numeric_count_cols": state["numeric_count_cols"],
+            "binary_flag_cols": state["binary_flag_cols"],
+        }
+    )
+    payload = {"transform_test_report": report}
+    return payload
+
+
+transform_testing = lgraph.StateGraph(CompositeState)
+transform_testing.add_node("univariate_metrics_plotting_pipeline", call_univariate_metrics_plotting)
+transform_testing.add_node("test_transforms", test_transforms)
+transform_testing.add_edge(lgraph.START, "univariate_metrics_plotting_pipeline")
+transform_testing.add_edge("univariate_metrics_plotting_pipeline", "test_transforms")
+transform_testing.add_edge("test_transforms", lgraph.END)
+graph = transform_testing.compile()
+
+
+def run_test_transforms(path: str) -> dict:
+    """
+    Execute transform testing end to end.
+
+    :param path: dataset path
+    :return: full composite graph payload
+    """
+    init_state: CompositeState = {
+        "path": path,
+        "done": [],
+        "has_header": True,
+        "has_missing_values": False,
+        "error": "",
+        "info": "",
+        "cols": [],
+        "temporal_cols": [],
+        "numeric_val_cols": [],
+        "categorical_val_cols": [],
+        "bad_rows": [],
+        "metadata": {},
+        "time_col": "",
+        "candidates": [],
+        "winner_formatter": {},
+        "entity_col": None,
+        "numeric_cols": [],
+        "nonnegative_cols": [],
+        "jump_mult": 20.0,
+        "report": {},
+        "summary": "",
+        "flag": "",
+        "type": "",
+        "primary_key": "",
+        "secondary_keys": [],
+        "numeric_continuous_cols": [],
+        "numeric_count_cols": [],
+        "binary_flag_cols": [],
+        "categorical_feature_cols": [],
+        "known_exogenous_cols": [],
+        "target_cols": [],
+        "covariate_cols": [],
+        "n_nat_time": 0,
+        "min_time": None,
+        "max_time": None,
+        "typical_delta_mode": None,
+        "typical_delta_median": None,
+        "expected_frequency": None,
+        "dominant_frequency_fraction": 0.0,
+        "is_irregular_sampling": False,
+        "resampling_decision": "",
+        "coverage_summary": {},
+        "coverage_per_entity": [],
+        "missingness_report": {},
+        "missingness_plan": {},
+        "missingness_handling_report": {},
+        "quality_dataset_path": "",
+        "standardization_profile": {},
+        "standardization_gate": {},
+        "standardization_plan": {},
+        "standardization_report": {},
+        "standardized_dataset_path": "",
+        "univariate_report": {},
+        "transform_test_report": {},
+    }
+    out = graph.invoke(init_state)
+    payload: CompositeState = out
+    _LOG.info("Transform testing output: %s", payload)
+    return payload
+
+
+def _parse_args() -> argparse.Namespace:
+    """
+    Parse command-line arguments.
+
+    :return: parsed arguments
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--path",
+        required=True,
+        help="Path to dataset file.",
+    )
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    args = _parse_args()
+    run_test_transforms(args.path)
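Both stage modules in this diff follow the same linear-composition pattern: the upstream runner is wrapped as the first node, one new node is appended, and the two are wired START -> upstream -> new node -> END. A minimal self-contained sketch of that pattern, assuming `langgraph` is installed; the state and node names are illustrative:

```python
from typing import TypedDict

import langgraph.graph as lgraph


class State(TypedDict):
    path: str
    report: dict


def upstream(state: State) -> dict:
    # Stand-in for the prior stage's run_* entrypoint.
    return {"report": {"rows": 100}}


def new_stage(state: State) -> dict:
    # Reads the upstream result and adds its own key via a partial update.
    return {"report": {**state["report"], "checked": True}}


g = lgraph.StateGraph(State)
g.add_node("upstream", upstream)
g.add_node("new_stage", new_stage)
g.add_edge(lgraph.START, "upstream")
g.add_edge("upstream", "new_stage")
g.add_edge("new_stage", lgraph.END)
print(g.compile().invoke({"path": "data.csv", "report": {}}))
```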
diff --git a/agentic_eda/jupyterlab_extension_backend/src/univariate_analysis/univariate_metrics_plotting.py b/agentic_eda/jupyterlab_extension_backend/src/univariate_analysis/univariate_metrics_plotting.py
new file mode 100644
index 000000000..1bcd9b6bb
--- /dev/null
+++ b/agentic_eda/jupyterlab_extension_backend/src/univariate_analysis/univariate_metrics_plotting.py
@@ -0,0 +1,214 @@
+"""
+Import as:
+
+import src.univariate_analysis.univariate_metrics_plotting as sunivar
+"""
+
+from __future__ import annotations
+
+import argparse
+import logging
+from typing import TypedDict
+
+import langgraph.graph as lgraph
+
+import src.quality_handling.standardize as sstandard
+import src.tools.input_tools as tinptool
+
+_LOG = logging.getLogger(__name__)
+
+
+class CompositeState(TypedDict):
+    """
+    Store graph state for univariate metrics and plotting.
+    """
+
+    path: str
+    done: list[str]
+    has_header: bool
+    has_missing_values: bool
+    error: str
+    info: str
+    cols: list[str]
+    temporal_cols: list[str]
+    numeric_val_cols: list[str]
+    categorical_val_cols: list[str]
+    bad_rows: list[dict]
+    metadata: dict
+    time_col: str
+    candidates: list[dict]
+    winner_formatter: dict
+    entity_col: str | None
+    numeric_cols: list[str]
+    nonnegative_cols: list[str]
+    jump_mult: float
+    report: dict
+    summary: str
+    flag: str
+    type: str
+    primary_key: str
+    secondary_keys: list[str]
+    numeric_continuous_cols: list[str]
+    numeric_count_cols: list[str]
+    binary_flag_cols: list[str]
+    categorical_feature_cols: list[str]
+    known_exogenous_cols: list[str]
+    target_cols: list[str]
+    covariate_cols: list[str]
+    n_nat_time: int
+    min_time: str | None
+    max_time: str | None
+    typical_delta_mode: str | None
+    typical_delta_median: str | None
+    expected_frequency: str | None
+    dominant_frequency_fraction: float
+    is_irregular_sampling: bool
+    resampling_decision: str
+    coverage_summary: dict
+    coverage_per_entity: list[dict]
+    missingness_report: dict
+    missingness_plan: dict
+    missingness_handling_report: dict
+    quality_dataset_path: str
+    standardization_profile: dict
+    standardization_gate: dict
+    standardization_plan: dict
+    standardization_report: dict
+    standardized_dataset_path: str
+    univariate_report: dict
+
+
+def call_standardize(state: CompositeState) -> dict:
+    """
+    Run the sequential pipeline up to optional standardization.
+
+    :param state: graph state
+    :return: composite payload from standardize
+    """
+    payload = sstandard.run_standardize(state["path"])
+    return payload
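+
+
+# Note: the node below analyzes the cleaned dataset when the quality pipeline
+# produced one (`state.get("quality_dataset_path") or state["path"]`), while
+# the raw path is still threaded through as `source_path` and used to key the
+# stage trace written by `write_stage_trace`.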
+
+
+def compute_univariate_metrics_and_plots(state: CompositeState) -> dict:
+    """
+    Compute univariate summaries and write per-feature distribution plots.
+
+    :param state: graph state
+    :return: univariate report
+    """
+    analysis_path = state.get("quality_dataset_path") or state["path"]
+    report = tinptool.compute_univariate_metrics_and_plots.invoke(
+        {
+            "source_path": state["path"],
+            "input_path": analysis_path,
+            "time_col": state["primary_key"],
+            "secondary_keys": state["secondary_keys"],
+            "numeric_continuous_cols": state["numeric_continuous_cols"],
+            "numeric_count_cols": state["numeric_count_cols"],
+            "binary_flag_cols": state["binary_flag_cols"],
+        }
+    )
+    trace_payload = {
+        "analysis_path": analysis_path,
+        "univariate_report": report,
+    }
+    tinptool.write_stage_trace(state["path"], "univariate_metrics_plotting", trace_payload)
+    return {"univariate_report": report}
+
+
+univariate_analysis = lgraph.StateGraph(CompositeState)
+univariate_analysis.add_node("standardize_pipeline", call_standardize)
+univariate_analysis.add_node("compute_univariate_metrics_and_plots", compute_univariate_metrics_and_plots)
+univariate_analysis.add_edge(lgraph.START, "standardize_pipeline")
+univariate_analysis.add_edge("standardize_pipeline", "compute_univariate_metrics_and_plots")
+univariate_analysis.add_edge("compute_univariate_metrics_and_plots", lgraph.END)
+graph = univariate_analysis.compile()
+
+
+def run_univariate_metrics_plotting(path: str) -> dict:
+    """
+    Execute univariate summaries and plotting end to end.
+
+    :param path: dataset path
+    :return: full composite graph payload
+    """
+    init_state: CompositeState = {
+        "path": path,
+        "done": [],
+        "has_header": True,
+        "has_missing_values": False,
+        "error": "",
+        "info": "",
+        "cols": [],
+        "temporal_cols": [],
+        "numeric_val_cols": [],
+        "categorical_val_cols": [],
+        "bad_rows": [],
+        "metadata": {},
+        "time_col": "",
+        "candidates": [],
+        "winner_formatter": {},
+        "entity_col": None,
+        "numeric_cols": [],
+        "nonnegative_cols": [],
+        "jump_mult": 20.0,
+        "report": {},
+        "summary": "",
+        "flag": "",
+        "type": "",
+        "primary_key": "",
+        "secondary_keys": [],
+        "numeric_continuous_cols": [],
+        "numeric_count_cols": [],
+        "binary_flag_cols": [],
+        "categorical_feature_cols": [],
+        "known_exogenous_cols": [],
+        "target_cols": [],
+        "covariate_cols": [],
+        "n_nat_time": 0,
+        "min_time": None,
+        "max_time": None,
+        "typical_delta_mode": None,
+        "typical_delta_median": None,
+        "expected_frequency": None,
+        "dominant_frequency_fraction": 0.0,
+        "is_irregular_sampling": False,
+        "resampling_decision": "",
+        "coverage_summary": {},
+        "coverage_per_entity": [],
+        "missingness_report": {},
+        "missingness_plan": {},
+        "missingness_handling_report": {},
+        "quality_dataset_path": "",
+        "standardization_profile": {},
+        "standardization_gate": {},
+        "standardization_plan": {},
+        "standardization_report": {},
+        "standardized_dataset_path": "",
+        "univariate_report": {},
+    }
+    out = graph.invoke(init_state)
+    payload: CompositeState = out
+    _LOG.info("Univariate analysis output: %s", payload)
+    return payload
+
+
+def _parse_args() -> argparse.Namespace:
+    """
+    Parse command-line arguments.
+
+    :return: parsed arguments
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--path",
+        required=True,
+        help="Path to dataset file.",
+    )
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    args = _parse_args()
+    run_univariate_metrics_plotting(args.path)
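A quick way to exercise either stage from Python rather than the CLI (a sketch; the dataset path is a placeholder and the backend root must be importable, e.g. via `PYTHONPATH`):

```python
import src.univariate_analysis.test_transforms as stransforms
import src.univariate_analysis.univariate_metrics_plotting as sunivar

# Each run_* entrypoint returns the full composite state once its graph ends.
payload = sunivar.run_univariate_metrics_plotting("path/to/dataset.csv")
print(sorted(payload["univariate_report"].keys()))

payload = stransforms.run_test_transforms("path/to/dataset.csv")
print(payload["transform_test_report"])
```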