---
# DVC Pipeline Definition
# Tracks data lineage from raw fetch → graph cache → training → calibration → release
#
# Usage:
#   dvc repro                    # Reproduce full pipeline
#   dvc repro train              # Reproduce train plus any outdated upstream stages
#   dvc repro --downstream calibrate  # Re-run calibrate and every stage that depends on it
#   dvc dag                      # View pipeline DAG
#   dvc metrics show             # Show tracked metrics
#
# Prerequisites:
#   - MP_API_KEY env var for fetch stage
#   - GPU recommended for train stage (~2h on RTX 2060)

stages:
  fetch:
    # Entry point of the pipeline: downloads the raw dataset from the
    # Materials Project API (requires MP_API_KEY, see the file header).
    cmd: python scripts/00_fetch_mp.py --config configs/dataset_mp.yaml
    deps:
      - scripts/00_fetch_mp.py
    outs:
      - data/raw/mp/raw_mp_cathodes_v1.parquet:
          cache: true
    # The dataset config is tracked through `params` only. Listing the whole
    # file under `deps` as well would invalidate this stage on edits to
    # sections other stages own (e.g. `graph`, `splits`) and force a re-fetch
    # followed by a full pipeline re-run.
    params:
      - configs/dataset_mp.yaml:
          - mp
          - dataset
          - filters
    meta:
      description: "Fetch raw TMO data from Materials Project API"
      owner: data-pipeline

  cache_structures:
    # Consumes the raw parquet written by `fetch` and fills the structures
    # directory that `train` depends on.
    # NOTE(review): the command is handed configs/dataset_mp.yaml, but that
    # file is tracked neither under deps nor params here - confirm intended.
    meta:
      description: 'Cache crystal structures as .npz files'
    cmd: >-
      python scripts/02b_cache_structures.py
      --config configs/dataset_mp.yaml
    deps:
      - data/raw/mp/raw_mp_cathodes_v1.parquet
      - scripts/02b_cache_structures.py
    outs:
      # Short form: `cache: true` is DVC's default for outputs.
      - data/processed/mp/structures/

  build_graphs:
    # Derives the processed parquet and the graph directory from the raw
    # parquet. Only the `graph` section of the dataset config is tracked.
    meta:
      description: 'Build graph representations with RBF features'
    cmd: >-
      python scripts/02_build_graph_cache.py
      --config configs/dataset_mp.yaml
    params:
      - configs/dataset_mp.yaml:
          - graph
    deps:
      - data/raw/mp/raw_mp_cathodes_v1.parquet
      - scripts/02_build_graph_cache.py
    outs:
      # Short form: `cache: true` is DVC's default for outputs.
      - data/interim/mp/graphs/
      - data/processed/mp/processed_mp_cathodes_v1.parquet

  make_splits:
    # Writes the SOAP-LOCO split file from the processed parquet.
    # NOTE(review): the `splits` section of configs/dataset_mp.yaml is tracked
    # as params, yet the command passes no --config flag; presumably the
    # script falls back to a default config path - verify.
    meta:
      description: 'Generate SOAP-LOCO cluster splits'
    cmd: python scripts/03a_make_soap_loco_splits.py
    params:
      - configs/dataset_mp.yaml:
          - splits
    deps:
      - data/processed/mp/processed_mp_cathodes_v1.parquet
      - scripts/03a_make_soap_loco_splits.py
    outs:
      # Short form: `cache: true` is DVC's default for outputs.
      - data/splits/mp/splits_mp_cathodes_v1_soap-loco.json

  train:
    # Trains the ensemble under run id `mace_ensemble_v2` from the cached
    # structures and the split file, and reports training metrics.
    # NOTE(review): the model directory is `persist: true` and the command
    # passes --skip-existing; if that flag skips members already on disk, a
    # params change may re-run this stage without retraining - confirm.
    # NOTE(review): configs/train_mace_ehull.yaml is both a whole-file dep and
    # a params source, so any edit to it re-runs training regardless of key.
    meta:
      description: 'Train 5-member MACE ensemble'
    cmd: >-
      python scripts/04_train_ensemble.py --config configs/train_mace_ehull.yaml
      --run-id mace_ensemble_v2 --skip-existing
    params:
      - configs/train_mace_ehull.yaml:
          - model
          - train
          - loss
    deps:
      - configs/train_mace_ehull.yaml
      - data/processed/mp/structures/
      - data/splits/mp/splits_mp_cathodes_v1_soap-loco.json
      - scripts/04_train.py
      - scripts/04_train_ensemble.py
    outs:
      # `cache: true` is the default and therefore omitted; `persist: true`
      # keeps the directory in place when the stage is re-run.
      - artifacts/models/mace_ensemble_v2/:
          persist: true
    metrics:
      - data/reports/training_metrics.json:
          cache: false

  predict:
    # Runs the trained ensemble on the `test` split and writes the
    # predictions parquet used by the calibration and evaluation stages.
    # NOTE(review): --data-config configs/dataset_mp.yaml is passed to the
    # command but is not tracked under deps/params - confirm intended.
    meta:
      description: 'Generate ensemble predictions on test split'
    cmd: >-
      python scripts/07_predict_ensemble.py --ensemble-dir artifacts/models/mace_ensemble_v2
      --data-config configs/dataset_mp.yaml --split test
      --output data/predictions/ensemble_soap_loco_test.parquet
    deps:
      - artifacts/models/mace_ensemble_v2/
      - data/processed/mp/processed_mp_cathodes_v1.parquet
      - data/splits/mp/splits_mp_cathodes_v1_soap-loco.json
      - scripts/07_predict_ensemble.py
    outs:
      # Short form: `cache: true` is DVC's default for outputs.
      - data/predictions/ensemble_soap_loco_test.parquet

  calibrate:
    # Runs the conformal calibration script on the prediction file and writes
    # conformal_params.json (kept out of the DVC cache).
    # NOTE(review): --val-predictions is given the *test* predictions, the
    # same file `evaluate` scores; calibrating and evaluating on one split
    # would bias the reported coverage - confirm, or calibrate on a held-out
    # split instead.
    meta:
      description: 'Compute conformal calibration deltas (90% coverage)'
    cmd: >-
      python scripts/05b_conformal_calibrate.py --output-dir data/calibration
      --val-predictions data/predictions/ensemble_soap_loco_test.parquet
    deps:
      - data/predictions/ensemble_soap_loco_test.parquet
      - scripts/05b_conformal_calibrate.py
    outs:
      - data/calibration/conformal_params.json:
          cache: false

  evaluate:
    # Runs the evaluation script on the test predictions together with the
    # conformal parameters; the resulting report is tracked as an uncached
    # DVC metric.
    meta:
      description: 'Run governance checks and generate evaluation report'
    cmd: >-
      python scripts/09_evaluate_predictions.py
      --input data/predictions/ensemble_soap_loco_test.parquet
      --conformal-params data/calibration/conformal_params.json
      --output data/reports/model_validation/model_validation_report.json
    metrics:
      - data/reports/model_validation/model_validation_report.json:
          cache: false
    deps:
      - data/calibration/conformal_params.json
      - data/predictions/ensemble_soap_loco_test.parquet
      - scripts/09_evaluate_predictions.py

  generate_manifest:
    # Runs the manifest generator over the trained model directory and writes
    # artifact_manifest.json (kept out of the DVC cache).
    meta:
      description: 'Generate signed artifact manifest'
    cmd: >-
      python scripts/08_generate_artifact_manifest.py --artifact-dir artifacts/models/mace_ensemble_v2
      --output data/reports/artifact_manifest.json
    deps:
      - artifacts/models/mace_ensemble_v2/
      - scripts/08_generate_artifact_manifest.py
    outs:
      - data/reports/artifact_manifest.json:
          cache: false

  validate_release:
    # Last stage of the pipeline: runs the release validator on the test
    # predictions and conformal parameters and records the result as an
    # uncached DVC metric.
    cmd: >-
      python scripts/12_validate_release.py
      --input data/predictions/ensemble_soap_loco_test.parquet
      --conformal-params data/calibration/conformal_params.json
      --output data/reports/model_validation/release_validation.json
    deps:
      - scripts/12_validate_release.py
      # Files the command reads directly. Declared explicitly so that a change
      # to either one invalidates this stage even when the upstream reports
      # below come out byte-identical.
      - data/predictions/ensemble_soap_loco_test.parquet
      - data/calibration/conformal_params.json
      # Not passed on the command line; kept so this stage is ordered after
      # `evaluate` and `generate_manifest`, which produce these files.
      - data/reports/model_validation/model_validation_report.json
      - data/reports/artifact_manifest.json
    metrics:
      - data/reports/model_validation/release_validation.json:
          cache: false
    meta:
      description: "Gate release on governance checks"