Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions bash/run_pipeline_alvis.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#!/bin/bash
#SBATCH -A NAISS2025-1-11 -p alvis
#SBATCH -N 1
###SBATCH --gpus-per-node=A40:1
#SBATCH --gpus-per-node=A100:1
#SBATCH --cpus-per-task=16
#SBATCH -t 08:00:00
#SBATCH -J stride-pipeline
#SBATCH --chdir=/mimer/NOBACKUP/groups/naiss2025-6-138/HCLIMAI/log/log_stride/
#SBATCH --error=%x-%j.error
#SBATCH --output=%x-%j.out

# Run the STRIDE pipeline on Alvis (NAISS): load the foss-2023a toolchain
# modules, activate the project virtualenv, and launch cli/launch_pipeline.py
# with the given pipeline config.
#
# Usage: sbatch bash/run_pipeline_alvis.sh <pipeline_config.yaml> [--dry-run]

set -euo pipefail

if [ "$#" -lt 1 ]; then
    # Fixed: usage line previously referred to bash/run_pipeline.sh, which is
    # not this script's path.
    echo "Usage: sbatch bash/run_pipeline_alvis.sh <pipeline_config.yaml> [--dry-run]" >&2
    exit 1
fi

PIPELINE_CONFIG="$1"
shift || true

# Any remaining CLI arguments are forwarded verbatim to the launcher.
EXTRA_ARGS=("$@")

REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
cd "$REPO_ROOT"

echo
echo "========================="
echo "STRIDE Slurm pipeline run"
echo "========================="
echo "Repository root: $REPO_ROOT"
echo "Pipeline config: $PIPELINE_CONFIG"
echo "Python: $(command -v python || true)"
echo "Working dir: $(pwd)"
echo "Extra args: ${EXTRA_ARGS[*]:-<none>}"
echo

# $(...) instead of deprecated backticks; quote to avoid word splitting.
current_date_time="$(date)"
echo "The run starts from $current_date_time"
# :- default keeps 'set -u' from aborting if run outside sbatch.
echo "Check https://job.c3se.chalmers.se/alvis/${SLURM_JOB_ID:-<no-job-id>} for GPU usage."

#export HDF5_USE_FILE_LOCKING=FALSE
#export TF_GPU_ALLOCATOR=cuda_malloc_async
##export CUDA_VISIBLE_DEVICES=1
#export TF_DETERMINISTIC_OPS=0
#export TF_FORCE_GPU_ALLOW_GROWTH=true
#ecinteractive -g

DOMAIN='norcp'
#DOMAIN='TestDomain'
VARIABLE='tas'  # NOTE(review): currently unused below — confirm it is still needed

echo "domain is ${DOMAIN}"
# -e/-u are already set above (with pipefail); only command tracing is added
# here so the module loads and the launch command appear in the job log.
set -x

module --force purge
#module load virtualenv/20.26.2-GCCcore-13.3.0
#module load Python/3.12.3-GCCcore-13.3.0
#module load netcdf4-python/1.7.1.post2-foss-2024a
module load virtualenv/20.23.1-GCCcore-12.3.0
module load Python/3.11.3-GCCcore-12.3.0
module load CUDA/12.1.1
module load PyTorch/2.1.2-foss-2023a-CUDA-12.1.1
module load netcdf4-python/1.6.4-foss-2023a
module load zarr/2.17.1-foss-2023a
module load xarray/2023.9.0-gfbf-2023a
module load PyYAML/6.0-GCCcore-12.3.0
module load dask/2023.9.2-foss-2023a
source "$HOME/venvs/stride/bin/activate"

# NOTE(review): this overrides the earlier cd "$REPO_ROOT" — presumably the
# canonical checkout on Alvis lives at $HOME/STRIDE; confirm.
cd "$HOME/STRIDE"
# ${arr[@]+...} expansion: safe under 'set -u' when EXTRA_ARGS is empty,
# even on bash < 4.4.
python cli/launch_pipeline.py --config "$PIPELINE_CONFIG" ${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"}

current_date_time="$(date)"
echo "The run ends at $current_date_time"

exit 0

107 changes: 107 additions & 0 deletions configs/datasets/norcp_alvis.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
---
# NorCP dataset configuration for Alvis: ECMWF-ERAINT-driven, 6-hourly data,
# 12 km dynamic predictors conditioning a 3 km precipitation target.
# NOTE(review): section nesting was reconstructed from a whitespace-mangled
# paste — confirm top-level layout against the loader's expected schema.

data:
  root_dir: /mimer/NOBACKUP/groups/naiss2025-6-138/HCLIMAI/NorCP_SSE/cropped  # /Users/au728490/Data/NorCP/cropped
  scenario_name: ECMWF-ERAINT
  temporal_tag: 6hr
  target_spatial_tag: 3km
  dynamic_spatial_tag: 12km

split:
  manifest_path: data_adapters/norcp/saved/splits/temporal__ECMWF-ERAINT__train_auto_auto__val_2010-01-01T00-00-00_2012-12-31T18-00-00__test_2013-01-01T00-00-00_2018-12-31T18-00-00.json
  name: train
  # Tag naming the split whose statistics are used for normalisation.
  stats_tag: temporal__ECMWF-ERAINT__train_auto_auto__val_2010-01-01T00-00-00_2012-12-31T18-00-00__test_2013-01-01T00-00-00_2018-12-31T18-00-00

domain:
  tag: full_domain
  crop: null
  spatial_shuffle:
    enabled: false
    train_only: true
  cutout_domain: null

target:
  variable: prcp
  source: NORCP_HR
  time_offsets:
    prcp: -3.0

conditioning:
  dynamic:
    source: NORCP_LR
    # Block sequence instead of a multi-line flow list: cleaner diffs when
    # adding/removing predictor variables.
    variables:
      - prcp
      - temp
      - hus500
      - ta500
      - ua500
      - va500
      - zg500
      - hus700
      - ta700
      - ua700
      - va700
      - zg700
      - hus850
      - ta850
      - ua850
      - va850
      - zg850
      - hus950
      - ta950
      - ua950
      - va950
      - zg950
      - hus1000
      - ta1000
      - ua1000
      - va1000
      - zg1000
    time_offsets:
      prcp: -3.0
    upsample_to_target: false
    upsample_mode: bilinear

  static:
    source: NORCP_STATIC
    variables: [topo]
    allow_missing: true

transforms:
  apply: true

  target:
    prcp: log_zscore

  conditioning:
    dynamic:
      prcp: log_zscore
      temp: zscore
      hus500: zscore
      ta500: zscore
      ua500: zscore
      va500: zscore
      zg500: zscore
      hus700: zscore
      ta700: zscore
      ua700: zscore
      va700: zscore
      zg700: zscore
      hus850: zscore
      ta850: zscore
      ua850: zscore
      va850: zscore
      zg850: zscore
      hus950: zscore
      ta950: zscore
      ua950: zscore
      va950: zscore
      zg950: zscore
      hus1000: zscore
      ta1000: zscore
      ua1000: zscore
      va1000: zscore
      zg1000: zscore

    static:
      topo: zscore
105 changes: 105 additions & 0 deletions configs/experiments/pipeline_norcp_alvis.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
---
# Pipeline experiment config for NorCP on Alvis: composes the base configs in
# `bases` and applies per-stage overrides for training, generation and
# evaluation.
# NOTE(review): nesting was reconstructed from a whitespace-mangled paste —
# confirm against the schema expected by cli/launch_pipeline.py.

experiment:
  name: pipeline_norcp
  output_root: runs/pipeline_norcp
  seed: 42

# Which pipeline stages to run.
stages:
  training: true
  generation: true
  evaluation: true

# Base config files merged before the overrides below are applied.
bases:
  model: configs/models/edm_model_base.yaml
  training: configs/training/training_base.yaml
  generation: configs/generation/generation_base.yaml
  sampler: configs/generation/sampler_base.yaml
  evaluation: configs/evaluation/evaluation_base.yaml
  data: configs/datasets/norcp_alvis.yaml

data:
  target:
    variable: prcp
    transform: log1p
    output_shape: [92, 68]

  conditioning:
    dynamic_variables: [prcp, temp]
    static_variables: [topo]
    input_shape: [23, 17]

  domain:
    hr_size: [92, 68]
    lr_size: [23, 17]
    large_domain: false

  # null entries fall back to the split manifest from the dataset config.
  split:
    train: null
    val: null
    test: null

  statistics:  # which split to use for computing data statistics (e.g. mean, std) used for normalization; if null, use training split
    train: train
    val: train
    test: train

  overrides:
    domain:
      spatial_shuffle:
        enabled: false
        train_only: true
      cutout_domain: [170, 350, 340, 520]

training:
  run_name: train_norcp
  overrides:
    loop:
      max_epochs: 50

model:
  overrides:
    in_dynamic_channels: 2
    in_static_channels: 1
    out_channels: 1

    # NOTE(review): `spatial` and `rain_gate` are assumed to nest under
    # model.overrides — verify against the model base config.
    spatial:
      target_height: 92
      target_width: 68
      cond_height: 23
      cond_width: 17
      align_cond_to_target: true
      cond_upsample_mode: bilinear

    rain_gate:
      model:
        enabled: true
        hidden_channels: 32
        num_blocks: 3
        input_mode: "cond"
      loss:
        enabled: false
        loss_weight: 0.1
        wet_threshold_mm: 0.1
        target_variable: "prcp"
        use_loss_reweighting: false
        reweight_detach: true
        reweight_power: 1.0

generation:
  run_name: generate_norcp
  overrides:
    generation_run:
      data:
        split: test
        batch_size: 50  # set to 50 to speed up generation for testing purposes; can be increased for final runs

evaluation:
  run_name: evaluate_norcp
  overrides:
    evaluation_run:
      data:
        split: test
      # NOTE(review): assumed to sit at evaluation_run level, as siblings of
      # `data` — verify against the evaluation base config.
      forecast_product_for_spatial: pmm
      forecast_product_for_climatology: pmm
      forecast_product_for_temporal: pmm
19 changes: 19 additions & 0 deletions documentations/setup_quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ python cli/launch_pipeline.py --config <experiment.yaml>
Target runtime:
- LUMI (primary)
- Local (for dry-run + smoke tests)
- [Alvis/NAISS](https://www.naiss.se/resource/alvis/)

---

Expand Down Expand Up @@ -60,6 +61,24 @@ Make sure your container (or overlay) contains:

If something fails with `ModuleNotFoundError`, your container is missing packages.


### Environment Setup on Alvis/NAISS (without using container)
```bash
mkdir $HOME/venvs
cd $HOME/venvs
module load virtualenv/20.23.1-GCCcore-12.3.0
virtualenv --system-site-packages stride
source $HOME/venvs/stride/bin/activate
module load Python/3.11.3-GCCcore-12.3.0
module load PyTorch/2.1.2-foss-2023a-CUDA-12.1.1
module load CUDA/12.1.1
module load netcdf4-python/1.6.4-foss-2023a
module load zarr/2.17.1-foss-2023a
module load xarray/2023.9.0-gfbf-2023a
module load PyYAML/6.0-GCCcore-12.3.0
module load dask/2023.9.2-foss-2023a
```

---

## 4. Data setup
Expand Down