Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions bash/run_pipeline_alvis.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#!/bin/bash
#SBATCH -A NAISS2025-1-11 -p alvis
#SBATCH -N 1
###SBATCH --gpus-per-node=A40:1
#SBATCH --gpus-per-node=A100:1
#SBATCH --cpus-per-task=16
#SBATCH -t 08:00:00
#SBATCH -J stride-pipeline
#SBATCH --chdir=/mimer/NOBACKUP/groups/naiss2025-6-138/HCLIMAI/log/log_stride/
#SBATCH --error=%x-%j.error
#SBATCH --output=%x-%j.out

# Run the STRIDE pipeline on Alvis (NAISS): load the foss-2023a toolchain
# modules, activate the project virtualenv, and launch cli/launch_pipeline.py
# with the given pipeline config.
#
# Usage: sbatch bash/run_pipeline_alvis.sh <pipeline_config.yaml> [--dry-run]

set -euo pipefail

if [ "$#" -lt 1 ]; then
    # Fixed: usage line previously referred to bash/run_pipeline.sh, which is
    # not this script's path.
    echo "Usage: sbatch bash/run_pipeline_alvis.sh <pipeline_config.yaml> [--dry-run]" >&2
    exit 1
fi

PIPELINE_CONFIG="$1"
shift || true

# Any remaining CLI arguments are forwarded verbatim to the launcher.
EXTRA_ARGS=("$@")

REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
cd "$REPO_ROOT"

echo
echo "========================="
echo "STRIDE Slurm pipeline run"
echo "========================="
echo "Repository root: $REPO_ROOT"
echo "Pipeline config: $PIPELINE_CONFIG"
echo "Python: $(command -v python || true)"
echo "Working dir: $(pwd)"
echo "Extra args: ${EXTRA_ARGS[*]:-<none>}"
echo

# $(...) instead of deprecated backticks; quote to avoid word splitting.
current_date_time="$(date)"
echo "The run starts from $current_date_time"
# :- default keeps 'set -u' from aborting if run outside sbatch.
echo "Check https://job.c3se.chalmers.se/alvis/${SLURM_JOB_ID:-<no-job-id>} for GPU usage."

#export HDF5_USE_FILE_LOCKING=FALSE
#export TF_GPU_ALLOCATOR=cuda_malloc_async
##export CUDA_VISIBLE_DEVICES=1
#export TF_DETERMINISTIC_OPS=0
#export TF_FORCE_GPU_ALLOW_GROWTH=true
#ecinteractive -g

DOMAIN='norcp'
#DOMAIN='TestDomain'
VARIABLE='tas'  # NOTE(review): currently unused below — confirm it is still needed

echo "domain is ${DOMAIN}"
# -e/-u are already set above (with pipefail); only command tracing is added
# here so the module loads and the launch command appear in the job log.
set -x

module --force purge
#module load virtualenv/20.26.2-GCCcore-13.3.0
#module load Python/3.12.3-GCCcore-13.3.0
#module load netcdf4-python/1.7.1.post2-foss-2024a
module load virtualenv/20.23.1-GCCcore-12.3.0
module load Python/3.11.3-GCCcore-12.3.0
module load CUDA/12.1.1
module load PyTorch/2.1.2-foss-2023a-CUDA-12.1.1
module load netcdf4-python/1.6.4-foss-2023a
module load zarr/2.17.1-foss-2023a
module load xarray/2023.9.0-gfbf-2023a
module load PyYAML/6.0-GCCcore-12.3.0
module load dask/2023.9.2-foss-2023a
source "$HOME/venvs/stride/bin/activate"

# NOTE(review): this overrides the earlier cd "$REPO_ROOT" — presumably the
# canonical checkout on Alvis lives at $HOME/STRIDE; confirm.
cd "$HOME/STRIDE"
# ${arr[@]+...} expansion: safe under 'set -u' when EXTRA_ARGS is empty,
# even on bash < 4.4.
python cli/launch_pipeline.py --config "$PIPELINE_CONFIG" ${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"}

current_date_time="$(date)"
echo "The run ends at $current_date_time"

exit 0

107 changes: 107 additions & 0 deletions configs/datasets/norcp_alvis.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
---
# NorCP dataset configuration for Alvis: ECMWF-ERAINT-driven, 6-hourly data,
# 12 km dynamic predictors conditioning a 3 km precipitation target.
# NOTE(review): section nesting was reconstructed from a whitespace-mangled
# paste — confirm top-level layout against the loader's expected schema.

data:
  root_dir: /mimer/NOBACKUP/groups/naiss2025-6-138/HCLIMAI/NorCP_SSE/cropped  # /Users/au728490/Data/NorCP/cropped
  scenario_name: ECMWF-ERAINT
  temporal_tag: 6hr
  target_spatial_tag: 3km
  dynamic_spatial_tag: 12km

split:
  manifest_path: data_adapters/norcp/saved/splits/temporal__ECMWF-ERAINT__train_auto_auto__val_2010-01-01T00-00-00_2012-12-31T18-00-00__test_2013-01-01T00-00-00_2018-12-31T18-00-00.json
  name: train
  # Tag naming the split whose statistics are used for normalisation.
  stats_tag: temporal__ECMWF-ERAINT__train_auto_auto__val_2010-01-01T00-00-00_2012-12-31T18-00-00__test_2013-01-01T00-00-00_2018-12-31T18-00-00

domain:
  tag: full_domain
  crop: null
  spatial_shuffle:
    enabled: false
    train_only: true
  cutout_domain: null

target:
  variable: prcp
  source: NORCP_HR
  time_offsets:
    prcp: -3.0

conditioning:
  dynamic:
    source: NORCP_LR
    # Block sequence instead of a multi-line flow list: cleaner diffs when
    # adding/removing predictor variables.
    variables:
      - prcp
      - temp
      - hus500
      - ta500
      - ua500
      - va500
      - zg500
      - hus700
      - ta700
      - ua700
      - va700
      - zg700
      - hus850
      - ta850
      - ua850
      - va850
      - zg850
      - hus950
      - ta950
      - ua950
      - va950
      - zg950
      - hus1000
      - ta1000
      - ua1000
      - va1000
      - zg1000
    time_offsets:
      prcp: -3.0
    upsample_to_target: false
    upsample_mode: bilinear

  static:
    source: NORCP_STATIC
    variables: [topo]
    allow_missing: true

transforms:
  apply: true

  target:
    prcp: log_zscore

  conditioning:
    dynamic:
      prcp: log_zscore
      temp: zscore
      hus500: zscore
      ta500: zscore
      ua500: zscore
      va500: zscore
      zg500: zscore
      hus700: zscore
      ta700: zscore
      ua700: zscore
      va700: zscore
      zg700: zscore
      hus850: zscore
      ta850: zscore
      ua850: zscore
      va850: zscore
      zg850: zscore
      hus950: zscore
      ta950: zscore
      ua950: zscore
      va950: zscore
      zg950: zscore
      hus1000: zscore
      ta1000: zscore
      ua1000: zscore
      va1000: zscore
      zg1000: zscore

    static:
      topo: zscore
105 changes: 105 additions & 0 deletions configs/experiments/pipeline_norcp_alvis.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
---
# Pipeline experiment config for NorCP on Alvis: composes the base configs in
# `bases` and applies per-stage overrides for training, generation and
# evaluation.
# NOTE(review): nesting was reconstructed from a whitespace-mangled paste —
# confirm against the schema expected by cli/launch_pipeline.py.

experiment:
  name: pipeline_norcp
  output_root: runs/pipeline_norcp
  seed: 42

# Which pipeline stages to run.
stages:
  training: true
  generation: true
  evaluation: true

# Base config files merged before the overrides below are applied.
bases:
  model: configs/models/edm_model_base.yaml
  training: configs/training/training_base.yaml
  generation: configs/generation/generation_base.yaml
  sampler: configs/generation/sampler_base.yaml
  evaluation: configs/evaluation/evaluation_base.yaml
  data: configs/datasets/norcp_alvis.yaml

data:
  target:
    variable: prcp
    transform: log1p
    output_shape: [92, 68]

  conditioning:
    dynamic_variables: [prcp, temp]
    static_variables: [topo]
    input_shape: [23, 17]

  domain:
    hr_size: [92, 68]
    lr_size: [23, 17]
    large_domain: false

  # null entries fall back to the split manifest from the dataset config.
  split:
    train: null
    val: null
    test: null

  statistics:  # which split to use for computing data statistics (e.g. mean, std) used for normalization; if null, use training split
    train: train
    val: train
    test: train

  overrides:
    domain:
      spatial_shuffle:
        enabled: false
        train_only: true
      cutout_domain: [170, 350, 340, 520]

training:
  run_name: train_norcp
  overrides:
    loop:
      max_epochs: 50

model:
  overrides:
    in_dynamic_channels: 2
    in_static_channels: 1
    out_channels: 1

    # NOTE(review): `spatial` and `rain_gate` are assumed to nest under
    # model.overrides — verify against the model base config.
    spatial:
      target_height: 92
      target_width: 68
      cond_height: 23
      cond_width: 17
      align_cond_to_target: true
      cond_upsample_mode: bilinear

    rain_gate:
      model:
        enabled: true
        hidden_channels: 32
        num_blocks: 3
        input_mode: "cond"
      loss:
        enabled: false
        loss_weight: 0.1
        wet_threshold_mm: 0.1
        target_variable: "prcp"
        use_loss_reweighting: false
        reweight_detach: true
        reweight_power: 1.0

generation:
  run_name: generate_norcp
  overrides:
    generation_run:
      data:
        split: test
        batch_size: 50  # set to 50 to speed up generation for testing purposes; can be increased for final runs

evaluation:
  run_name: evaluate_norcp
  overrides:
    evaluation_run:
      data:
        split: test
      # NOTE(review): assumed to sit at evaluation_run level, as siblings of
      # `data` — verify against the evaluation base config.
      forecast_product_for_spatial: pmm
      forecast_product_for_climatology: pmm
      forecast_product_for_temporal: pmm
19 changes: 19 additions & 0 deletions documentations/setup_quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ python cli/launch_pipeline.py --config <experiment.yaml>
Target runtime:
- LUMI (primary)
- Local (for dry-run + smoke tests)
- [Alvis/NAISS](https://www.naiss.se/resource/alvis/)

---

Expand Down Expand Up @@ -60,6 +61,24 @@ Make sure your container (or overlay) contains:

If something fails with `ModuleNotFoundError`, your container is missing packages.


### Environment Setup on Alvis/NAISS (without using container)
```bash
mkdir $HOME/venvs
cd $HOME/venvs
module load virtualenv/20.23.1-GCCcore-12.3.0
virtualenv --system-site-packages stride
source $HOME/venvs/stride/bin/activate
module load Python/3.11.3-GCCcore-12.3.0
module load PyTorch/2.1.2-foss-2023a-CUDA-12.1.1
module load CUDA/12.1.1
module load netcdf4-python/1.6.4-foss-2023a
module load zarr/2.17.1-foss-2023a
module load xarray/2023.9.0-gfbf-2023a
module load PyYAML/6.0-GCCcore-12.3.0
module load dask/2023.9.2-foss-2023a
```

---

## 4. Data setup
Expand Down