diff --git a/bash/run_pipeline_alvis.sh b/bash/run_pipeline_alvis.sh
new file mode 100644
index 0000000..a9af1b5
--- /dev/null
+++ b/bash/run_pipeline_alvis.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+#SBATCH -A NAISS2025-1-11 -p alvis
+#SBATCH -N 1
+###SBATCH --gpus-per-node=A40:1
+#SBATCH --gpus-per-node=A100:1
+#SBATCH --cpus-per-task=16
+#SBATCH -t 08:00:00
+#SBATCH -J stride-pipeline
+#SBATCH --chdir=/mimer/NOBACKUP/groups/naiss2025-6-138/HCLIMAI/log/log_stride/
+#SBATCH --error=%x-%j.error
+#SBATCH --output=%x-%j.out
+
+set -euo pipefail
+
+if [ "$#" -lt 1 ]; then
+  echo "Usage: sbatch bash/run_pipeline_alvis.sh <pipeline_config.yaml> [--dry-run]"
+  exit 1
+fi
+
+PIPELINE_CONFIG="$1"
+shift || true
+
+EXTRA_ARGS=("$@")
+
+REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+cd "$REPO_ROOT"
+
+echo
+echo "========================="
+echo "STRIDE Slurm pipeline run"
+echo "========================="
+echo "Repository root: $REPO_ROOT"
+echo "Pipeline config: $PIPELINE_CONFIG"
+echo "Python: $(command -v python || true)"
+echo "Working dir: $(pwd)"
+echo "Extra args: ${EXTRA_ARGS[*]:-}"
+echo
+
+current_date_time="`date`";
+echo The run starts from $current_date_time
+echo Check https://job.c3se.chalmers.se/alvis/$SLURM_JOB_ID for GPU usage.
+
+#export HDF5_USE_FILE_LOCKING=FALSE
+#export TF_GPU_ALLOCATOR=cuda_malloc_async
+##export CUDA_VISIBLE_DEVICES=1
+#export TF_DETERMINISTIC_OPS=0
+#export TF_FORCE_GPU_ALLOW_GROWTH=true
+#ecinteractive -g
+
+DOMAIN='norcp'
+#DOMAIN='TestDomain'
+VARIABLE='tas'
+
+echo 'domain is' ${DOMAIN}
+set -exu
+
+module --force purge
+#module load virtualenv/20.26.2-GCCcore-13.3.0
+#module load Python/3.12.3-GCCcore-13.3.0
+#module load netcdf4-python/1.7.1.post2-foss-2024a
+module load virtualenv/20.23.1-GCCcore-12.3.0
+module load Python/3.11.3-GCCcore-12.3.0
+module load CUDA/12.1.1
+module load PyTorch/2.1.2-foss-2023a-CUDA-12.1.1
+module load netcdf4-python/1.6.4-foss-2023a
+module load zarr/2.17.1-foss-2023a
+module load xarray/2023.9.0-gfbf-2023a
+module load PyYAML/6.0-GCCcore-12.3.0
+module load dask/2023.9.2-foss-2023a
+source $HOME/venvs/stride/bin/activate
+
+cd $HOME/STRIDE
+python cli/launch_pipeline.py --config "$PIPELINE_CONFIG" "${EXTRA_ARGS[@]}"
+
+current_date_time="`date`";
+echo The run ends at $current_date_time
+
+exit 0
+
diff --git a/configs/datasets/norcp_alvis.yaml b/configs/datasets/norcp_alvis.yaml
new file mode 100644
index 0000000..1ad20c1
--- /dev/null
+++ b/configs/datasets/norcp_alvis.yaml
@@ -0,0 +1,107 @@
+data:
+  root_dir: /mimer/NOBACKUP/groups/naiss2025-6-138/HCLIMAI/NorCP_SSE/cropped # /Users/au728490/Data/NorCP/cropped
+  scenario_name: ECMWF-ERAINT
+  temporal_tag: 6hr
+  target_spatial_tag: 3km
+  dynamic_spatial_tag: 12km
+
+  split:
+    manifest_path: data_adapters/norcp/saved/splits/temporal__ECMWF-ERAINT__train_auto_auto__val_2010-01-01T00-00-00_2012-12-31T18-00-00__test_2013-01-01T00-00-00_2018-12-31T18-00-00.json
+    name: train
+    stats_tag: temporal__ECMWF-ERAINT__train_auto_auto__val_2010-01-01T00-00-00_2012-12-31T18-00-00__test_2013-01-01T00-00-00_2018-12-31T18-00-00
+
+  domain:
+    tag: full_domain
+    crop: null
+    spatial_shuffle:
+      enabled: false
+      train_only: true
+      cutout_domain: null
+
+  target:
+    variable: prcp
+    source: NORCP_HR
+    time_offsets:
+      prcp: -3.0
+
+  conditioning:
+    dynamic:
+      source: NORCP_LR
+      variables:
+        [
+          prcp,
+          temp,
+          hus500,
+          ta500,
+          ua500,
+          va500,
+          zg500,
+          hus700,
+          ta700,
+          ua700,
+          va700,
+          zg700,
+          hus850,
+          ta850,
+          ua850,
+          va850,
+          zg850,
+          hus950,
+          ta950,
+          ua950,
+          va950,
+          zg950,
+          hus1000,
+          ta1000,
+          ua1000,
+          va1000,
+          zg1000,
+        ]
+      time_offsets:
+        prcp: -3.0
+      upsample_to_target: false
+      upsample_mode: bilinear
+
+    static:
+      source: NORCP_STATIC
+      variables: [topo]
+      allow_missing: true
+
+  transforms:
+    apply: true
+
+    target:
+      prcp: log_zscore
+
+    conditioning:
+      dynamic:
+        prcp: log_zscore
+        temp: zscore
+        hus500: zscore
+        ta500: zscore
+        ua500: zscore
+        va500: zscore
+        zg500: zscore
+        hus700: zscore
+        ta700: zscore
+        ua700: zscore
+        va700: zscore
+        zg700: zscore
+        hus850: zscore
+        ta850: zscore
+        ua850: zscore
+        va850: zscore
+        zg850: zscore
+        hus950: zscore
+        ta950: zscore
+        ua950: zscore
+        va950: zscore
+        zg950: zscore
+        hus1000: zscore
+        ta1000: zscore
+        ua1000: zscore
+        va1000: zscore
+        zg1000: zscore
+
+      static:
+        topo: zscore
diff --git a/configs/experiments/pipeline_norcp_alvis.yaml b/configs/experiments/pipeline_norcp_alvis.yaml
new file mode 100644
index 0000000..8c0abe2
--- /dev/null
+++ b/configs/experiments/pipeline_norcp_alvis.yaml
@@ -0,0 +1,105 @@
+experiment:
+  name: pipeline_norcp
+  output_root: runs/pipeline_norcp
+  seed: 42
+
+stages:
+  training: true
+  generation: true
+  evaluation: true
+
+bases:
+  model: configs/models/edm_model_base.yaml
+  training: configs/training/training_base.yaml
+  generation: configs/generation/generation_base.yaml
+  sampler: configs/generation/sampler_base.yaml
+  evaluation: configs/evaluation/evaluation_base.yaml
+  data: configs/datasets/norcp_alvis.yaml
+
+data:
+  target:
+    variable: prcp
+    transform: log1p
+    output_shape: [92, 68]
+
+  conditioning:
+    dynamic_variables: [prcp, temp]
+    static_variables: [topo]
+    input_shape: [23, 17]
+
+  domain:
+    hr_size: [92, 68]
+    lr_size: [23, 17]
+    large_domain: false
+
+  split:
+    train: null
+    val: null
+    test: null
+
+  statistics: # which split to use for computing data statistics (e.g. mean, std) used for normalization; if null, use training split
+    train: train
+    val: train
+    test: train
+
+  overrides:
+    domain:
+      spatial_shuffle:
+        enabled: false
+        train_only: true
+        cutout_domain: [170, 350, 340, 520]
+
+training:
+  run_name: train_norcp
+  overrides:
+    loop:
+      max_epochs: 50
+
+model:
+  overrides:
+    in_dynamic_channels: 2
+    in_static_channels: 1
+    out_channels: 1
+
+    spatial:
+      target_height: 92
+      target_width: 68
+      cond_height: 23
+      cond_width: 17
+      align_cond_to_target: true
+      cond_upsample_mode: bilinear
+
+    rain_gate:
+      model:
+        enabled: true
+        hidden_channels: 32
+        num_blocks: 3
+        input_mode: "cond"
+      loss:
+        enabled: false
+        loss_weight: 0.1
+        wet_threshold_mm: 0.1
+        target_variable: "prcp"
+        use_loss_reweighting: false
+        reweight_detach: true
+        reweight_power: 1.0
+
+
+generation:
+  run_name: generate_norcp
+  overrides:
+    generation_run:
+      data:
+        split: test
+      batch_size: 50 # set to 50 to speed up generation for testing purposes; can be increased for final runs
+
+
+evaluation:
+  run_name: evaluate_norcp
+  overrides:
+    evaluation_run:
+      data:
+        split: test
+      forecast_product_for_spatial: pmm
+      forecast_product_for_climatology: pmm
+      forecast_product_for_temporal: pmm
diff --git a/documentations/setup_quickstart.md b/documentations/setup_quickstart.md
index d1c068a..03e8a94 100644
--- a/documentations/setup_quickstart.md
+++ b/documentations/setup_quickstart.md
@@ -25,6 +25,7 @@ python cli/launch_pipeline.py --config
 Target runtime:
 - LUMI (primary)
 - Local (for dry-run + smoke tests)
+- [Alvis/NAISS](https://www.naiss.se/resource/alvis/)
 
 ---
 
@@ -60,6 +61,24 @@ Make sure your container (or overlay) contains:
 
 If something fails with `ModuleNotFoundError`, your container is missing packages.
 
+
+### Environment Setup on Alvis/NAISS (without using container)
+```
+mkdir $HOME/venvs
+cd $HOME/venvs
+module load virtualenv/20.23.1-GCCcore-12.3.0
+virtualenv --system-site-packages stride
+source $HOME/venvs/stride/bin/activate
+module load Python/3.11.3-GCCcore-12.3.0
+module load PyTorch/2.1.2-foss-2023a-CUDA-12.1.1
+module load CUDA/12.1.1
+module load netcdf4-python/1.6.4-foss-2023a
+module load zarr/2.17.1-foss-2023a
+module load xarray/2023.9.0-gfbf-2023a
+module load PyYAML/6.0-GCCcore-12.3.0
+module load dask/2023.9.2-foss-2023a
+```
+
 ---
 
 ## 4. Data setup