# params.yaml
tokenize:
  dataset_path: "data/bigcode-the-stack-dedup"
  glob_pattern: "*.py" # Glob pattern for file matching. Use "*" or "*.*" for all files, "*.py" for Python only, etc.
  seed: 42
  max_unique_words: 0 # 0 = no limit (same convention as max_train_size below)
  vocab_size: 35263 # 256 byte tokens + 35000 BPE merges + 7 special tokens (BOS, EOS, PAD, UNK, CURSOR, EDIT_START, EDIT_END)
  pattern: '''(?i:[sdmt]|ll|ve|re)| ?[A-Za-z_(][A-Za-z_.]*|%(?:\.\d+)?[sdifFeEgGxXoc%]|[0-9]{1,3}| ?[^ %_A-Za-z0-9]+(?: ")?[\r\n]*|%|\s+$|\s+(?=\s)|\s'
  # Multi-language pattern (uncomment for GitHub Code 2025):
  # pattern: '''(?i:[sdmt]|ll|ve|re)|[A-Za-z_@$#][A-Za-z0-9_@$#]*|0[xX][0-9a-fA-F]+|0[bB][01]+|[0-9]+\.?[0-9]*(?:[eE][+-]?[0-9]+)?|[^\sA-Za-z0-9_@$#]+|\s+'
  bos_token: "<|startoftext|>"
  eos_token: "<|endoftext|>"
  pad_token: "<|pad|>"
  cursor_token: "<|cursor|>"
  edit_start_token: "<|edit_start|>"
  edit_end_token: "<|edit_end|>"
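  # Resulting token ID layout -- a sketch inferred from vocab_size above and the
  # *_token_id values in the data section; the UNK id is an assumption, since it
  # is never pinned down explicitly in this file:
  #   0 - 255       raw byte tokens
  #   256 - 35255   BPE merge tokens (35000 merges)
  #   35256         <|startoftext|> (BOS)
  #   35257         <|endoftext|>   (EOS)
  #   35258         <|pad|>
  #   35259         <|cursor|>
  #   35260         <|edit_start|>
  #   35261         <|edit_end|>
  #   35262         UNK (assumed: 35256 + 7 special tokens = vocab_size 35263)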
  max_train_size: "5GB" # Maximum data size to use for training tokenizer (e.g., "100MB", "1GB", "0" = no limit)
  chunk_size: "256M" # Tokens per output chunk (e.g., "256M" = 256M tokens ≈ 1GB file for uint32)
  dataset_dir: "out/tokenize/chunks" # Directory to store dataset chunks
  tok_file: "out/tokenize/tok.bin"
data:
  split_ratio: 0.99 # Fraction of data used for training; the remainder is held out for validation
  seq_length: 256 # Tokens per training sequence
  max_tokens: 0 # 0 = no limit (use all available tokens)
  num_workers: 8 # DataLoader worker processes
  dataset_dir: "out/tokenize/chunks" # Directory containing chunk_*.bin files
  bos_token_id: 35256 # <|startoftext|> token for document boundaries
  eos_token_id: 35257 # <|endoftext|> token for document boundaries
  pad_token_id: 35258 # <|pad|> token for padding batches
  cursor_token_id: 35259 # <|cursor|> token for cursor position
  edit_start_token_id: 35260 # <|edit_start|> token for marking edit region start
  edit_end_token_id: 35261 # <|edit_end|> token for marking edit region end
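  # Note: seq_length (256) is half of model.max_position_embeddings (512), so
  # training sequences stay well inside the model's configured position range.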
model:
  # Qwen3 architecture parameters
  hidden_size: 512
  num_hidden_layers: 10
  num_attention_heads: 32
  num_key_value_heads: 2
  intermediate_size: 2024
  max_position_embeddings: 512
  rope_theta: 10000.0
  attention_dropout: 0.1
  rms_norm_eps: 0.000001
  use_sliding_window: false
  sliding_window: 4096
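  # Derived attention dimensions (simple arithmetic from the values above):
  #   head_dim = hidden_size / num_attention_heads = 512 / 32 = 16
  #   GQA: num_attention_heads / num_key_value_heads = 32 / 2 = 16 query heads per KV head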
training:
  prefix: "qwen3"
  batch_size: 32
  epochs: 5
  lr: 0.0001
  weight_decay: 0.1
  grad_clip: 1.0
  gradient_accumulation_steps: 4
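  # Effective batch per optimizer step (arithmetic from the values above):
  #   batch_size × gradient_accumulation_steps × devices = 32 × 4 × 1 = 128 sequences
  #   ≈ 128 × 256 (data.seq_length) = 32768 tokens per step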
  use_amp: true # Use automatic mixed precision training
  compile_mode: "max-autotune-no-cudagraphs" # Options: null, "default", "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs"
  devices: 1 # Number of devices to use (GPUs/TPUs/etc.)
  strategy: "auto" # Distributed strategy: "auto", "ddp", "fsdp", "deepspeed", etc.
  seed: 42
  warmup_steps: 5000 # Number of warmup steps for learning rate
  scheduler_t_max_steps: null # T_max for CosineAnnealingLR (steps). Set to null to auto-calculate as epochs × batches_per_epoch
  log_every_n_steps: 100 # Log training metrics every N steps (null to disable per-step logging)
  val_every_n_steps: 5000 # Compute validation loss every N steps (null to validate only at epoch end)
  save_dir: "out/train/checkpoints"
  log_dir: "out/train/logs"