25 changes: 25 additions & 0 deletions run_3seed.sh
@@ -0,0 +1,25 @@
#!/bin/bash
# Run 3-seed statistical significance test
# Usage: bash run_3seed.sh <script.sh>
# Example: bash run_3seed.sh run_enhanced.sh
#
# WARNING: This uses ~$12 of Runpod credits (3 full runs).
# Only use after you've validated the config with a single seed first.

SCRIPT=${1:-run_enhanced.sh}
mkdir -p logs  # tee below fails if logs/ does not exist

echo "=== 3-seed run: $SCRIPT ==="
echo "Starting seed 1337..."
SEED=1337 bash "$SCRIPT" 2>&1 | tee logs/seed1337.txt

echo "Starting seed 42..."
SEED=42 bash "$SCRIPT" 2>&1 | tee logs/seed42.txt

echo "Starting seed 2025..."
SEED=2025 bash "$SCRIPT" 2>&1 | tee logs/seed2025.txt

echo "=== All 3 seeds complete ==="
echo "Results:"
grep "val_bpb" logs/seed1337.txt | tail -1
grep "val_bpb" logs/seed42.txt | tail -1
grep "val_bpb" logs/seed2025.txt | tail -1
67 changes: 67 additions & 0 deletions run_ablation.sh
@@ -0,0 +1,67 @@
#!/bin/bash
# Ablation script: run multiple configs to isolate contributions
# Usage: bash run_ablation.sh <config_name> [ngpu]
#
# Configs:
# baseline - SOTA without our changes (control)
# depth_only - SOTA + depth recurrence (no enhanced TTT)
# recovery_only - SOTA + cosine recovery TTT (no depth recurrence)
# combined - SOTA + both improvements
# recovery_30ep - SOTA + 30 epoch recovery (higher budget)

CONFIG=${1:-combined}
NGPU=${2:-8}
SEED=${SEED:-1337}

# Common settings
COMMON="NUM_LAYERS=11 BIGRAM_VOCAB_SIZE=1536 XSA_LAST_N=4 \
EMA_ENABLED=1 EMA_DECAY=0.997 SWA_ENABLED=1 SWA_EVERY=50 \
ROPE_DIMS=16 LN_SCALE=1 LATE_QAT=1 LATE_QAT_THRESHOLD=0.15 \
VE_ENABLED=1 VE_DIM=128 VE_LAYERS=9,10 \
MUON_WD=0.04 ADAM_WD=0.04 \
MATRIX_LR=0.025 SCALAR_LR=0.025 TIED_EMBED_LR=0.035 \
MUON_MOMENTUM=0.99 MUON_MOMENTUM_WARMUP_START=0.92 \
MUON_MOMENTUM_WARMUP_STEPS=1500 WARMDOWN_ITERS=3500 \
ITERATIONS=9000 MAX_WALLCLOCK_SECONDS=600 EVAL_STRIDE=64"

# TTT common
TTT_COMMON="TTT_ENABLED=1 TTT_LR=0.002 TTT_CHUNK_TOKENS=32768 \
TTT_FREEZE_BLOCKS=0 TTT_MOMENTUM=0.9 TTT_BATCH_SEQS=32 TTT_GRAD_CLIP=1.0"

case $CONFIG in
baseline)
echo "Running: baseline (SOTA control)"
eval "$COMMON $TTT_COMMON TTT_EPOCHS=3 TTT_RECOVERY_EPOCHS=0 \
DEPTH_RECURRENCE= SEED=$SEED RUN_ID=ablation_baseline_s${SEED} \
torchrun --standalone --nproc_per_node=$NGPU train_gpt.py"
;;
depth_only)
echo "Running: SOTA + depth recurrence only"
eval "$COMMON $TTT_COMMON TTT_EPOCHS=3 TTT_RECOVERY_EPOCHS=0 \
DEPTH_RECURRENCE=4,5 SEED=$SEED RUN_ID=ablation_depth_s${SEED} \
torchrun --standalone --nproc_per_node=$NGPU train_gpt.py"
;;
recovery_only)
echo "Running: SOTA + cosine recovery TTT only"
eval "$COMMON $TTT_COMMON TTT_EPOCHS=5 TTT_RECOVERY_EPOCHS=20 TTT_RECOVERY_LR=0.001 \
DEPTH_RECURRENCE= SEED=$SEED RUN_ID=ablation_recovery_s${SEED} \
torchrun --standalone --nproc_per_node=$NGPU train_gpt.py"
;;
combined)
echo "Running: SOTA + depth recurrence + cosine recovery TTT"
eval "$COMMON $TTT_COMMON TTT_EPOCHS=5 TTT_RECOVERY_EPOCHS=20 TTT_RECOVERY_LR=0.001 \
DEPTH_RECURRENCE=4,5 SEED=$SEED RUN_ID=ablation_combined_s${SEED} \
torchrun --standalone --nproc_per_node=$NGPU train_gpt.py"
;;
recovery_30ep)
echo "Running: SOTA + 30-epoch cosine recovery TTT"
eval "$COMMON $TTT_COMMON TTT_EPOCHS=5 TTT_RECOVERY_EPOCHS=30 TTT_RECOVERY_LR=0.001 \
DEPTH_RECURRENCE= SEED=$SEED RUN_ID=ablation_recovery30_s${SEED} \
torchrun --standalone --nproc_per_node=$NGPU train_gpt.py"
;;
*)
echo "Unknown config: $CONFIG"
echo "Available: baseline, depth_only, recovery_only, combined, recovery_30ep"
exit 1
;;
esac
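To sweep the whole matrix back to back (roughly 5x the cost of a single run), a minimal driver loop like the sketch below works; each config already writes a distinct RUN_ID, so results stay separable:

```bash
# Sketch: run every ablation config sequentially on 8 GPUs,
# logging each one under its config name.
mkdir -p logs
for cfg in baseline depth_only recovery_only combined recovery_30ep; do
  bash run_ablation.sh "$cfg" 8 2>&1 | tee "logs/ablation_${cfg}.txt"
done
```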
17 changes: 17 additions & 0 deletions run_baseline.sh
@@ -0,0 +1,17 @@
#!/bin/bash
# Baseline run: reproduces the merged SOTA (expected ~1.1194 bpb on 8xH100)
# Use this to verify the pipeline works before testing our improvements.

NUM_LAYERS=11 BIGRAM_VOCAB_SIZE=1536 XSA_LAST_N=4 \
EMA_ENABLED=1 EMA_DECAY=0.997 SWA_ENABLED=1 SWA_EVERY=50 \
ROPE_DIMS=16 LN_SCALE=1 LATE_QAT=1 LATE_QAT_THRESHOLD=0.15 \
VE_ENABLED=1 VE_DIM=128 VE_LAYERS=9,10 \
TTT_ENABLED=1 TTT_LR=0.002 TTT_EPOCHS=3 TTT_CHUNK_TOKENS=32768 \
TTT_FREEZE_BLOCKS=0 TTT_MOMENTUM=0.9 TTT_BATCH_SEQS=32 TTT_GRAD_CLIP=1.0 \
MUON_WD=0.04 ADAM_WD=0.04 \
MATRIX_LR=0.025 SCALAR_LR=0.025 TIED_EMBED_LR=0.035 \
MUON_MOMENTUM=0.99 MUON_MOMENTUM_WARMUP_START=0.92 \
MUON_MOMENTUM_WARMUP_STEPS=1500 WARMDOWN_ITERS=3500 \
ITERATIONS=9000 MAX_WALLCLOCK_SECONDS=600 EVAL_STRIDE=64 \
SEED=${SEED:-1337} \
torchrun --standalone --nproc_per_node=${NGPU:-8} train_gpt.py
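To verify the run against the expected ~1.1194 bpb, capture the output and pull the final eval line, assuming (as run_3seed.sh does) that eval lines contain `val_bpb`:

```bash
# Sketch: capture the baseline run and surface its final val_bpb line.
mkdir -p logs
bash run_baseline.sh 2>&1 | tee logs/baseline.txt
grep "val_bpb" logs/baseline.txt | tail -1   # expect ~1.1194 on 8xH100
```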
26 changes: 26 additions & 0 deletions run_enhanced.sh
@@ -0,0 +1,26 @@
#!/bin/bash
# Enhanced run: SOTA + depth recurrence + cosine recovery TTT
# Expected improvement: ~0.02-0.05 bpb over baseline
#
# Key changes from baseline:
# 1. DEPTH_RECURRENCE=4,5 -> layers 4 & 5 run twice, giving 13 virtual layers from 11 physical
# 2. TTT_RECOVERY_EPOCHS=20 -> cosine recovery phase after standard TTT
# 3. TTT_RECOVERY_LR=0.001 -> learning rate for the recovery phase
# 4. TTT_EPOCHS=5 -> more TTT epochs per chunk (3 -> 5)
# Note: TTT_FREEZE_BLOCKS=0 (all blocks unfrozen during TTT) matches baseline and is not a change.

NUM_LAYERS=11 BIGRAM_VOCAB_SIZE=1536 XSA_LAST_N=4 \
EMA_ENABLED=1 EMA_DECAY=0.997 SWA_ENABLED=1 SWA_EVERY=50 \
ROPE_DIMS=16 LN_SCALE=1 LATE_QAT=1 LATE_QAT_THRESHOLD=0.15 \
VE_ENABLED=1 VE_DIM=128 VE_LAYERS=9,10 \
DEPTH_RECURRENCE=4,5 \
TTT_ENABLED=1 TTT_LR=0.002 TTT_EPOCHS=5 TTT_CHUNK_TOKENS=32768 \
TTT_FREEZE_BLOCKS=0 TTT_MOMENTUM=0.9 TTT_BATCH_SEQS=32 TTT_GRAD_CLIP=1.0 \
TTT_RECOVERY_EPOCHS=20 TTT_RECOVERY_LR=0.001 \
MUON_WD=0.04 ADAM_WD=0.04 \
MATRIX_LR=0.025 SCALAR_LR=0.025 TIED_EMBED_LR=0.035 \
MUON_MOMENTUM=0.99 MUON_MOMENTUM_WARMUP_START=0.92 \
MUON_MOMENTUM_WARMUP_STEPS=1500 WARMDOWN_ITERS=3500 \
ITERATIONS=9000 MAX_WALLCLOCK_SECONDS=600 EVAL_STRIDE=64 \
SEED=${SEED:-1337} \
torchrun --standalone --nproc_per_node=${NGPU:-8} train_gpt.py
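Since SEED and NGPU are read from the environment with defaults (1337 and 8), alternate seeds or smaller smoke tests need no script edits:

```bash
SEED=42 bash run_enhanced.sh            # alternate seed, default 8 GPUs
NGPU=1 bash run_enhanced.sh             # single-GPU smoke test
SEED=2025 NGPU=4 bash run_enhanced.sh   # override both
```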
28 changes: 28 additions & 0 deletions setup_runpod.sh
@@ -0,0 +1,28 @@
#!/bin/bash
# Setup script for Runpod 8xH100 environment
# Run this ONCE when you first connect to the pod
set -e

echo "=== Parameter Golf Setup ==="
echo "GPU: $(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)"
echo "GPUs: $(nvidia-smi --query-gpu=count --format=csv,noheader | head -1)"

# Install dependencies
pip install -q sentencepiece numpy torch --upgrade 2>/dev/null || true

# Install FlashAttention (the PyPI flash-attn package; FlashAttention 3 for Hopper
# is not on PyPI and needs a source build)
pip install flash-attn --no-build-isolation 2>/dev/null || echo "flash-attn install failed, will use SDPA fallback"

# Download full training data (80 shards)
echo "=== Downloading training data (80 shards) ==="
python data/cached_challenge_fineweb.py --variant sp1024 --train-shards 80

echo "=== Setup complete ==="
echo "Data shards: $(ls data/datasets/fineweb10B_sp1024/fineweb_train_*.bin | wc -l)"
echo "Val shards: $(ls data/datasets/fineweb10B_sp1024/fineweb_val_*.bin | wc -l)"
echo ""
echo "To run baseline: bash run_baseline.sh"
echo "To run enhanced: bash run_enhanced.sh"
echo "To run ablation: bash run_ablation.sh <config>"
echo ""
echo "Available ablation configs: baseline, depth_only, recovery_only, combined, recovery_30ep"