Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
Binary file not shown.
324 changes: 324 additions & 0 deletions tutorials/einsum_transformer/einsum_transformer_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,324 @@
# Global experiment settings (interpolated elsewhere via ${settings.*}).
# NOTE(review): the original file lost its leading indentation during
# extraction; the nesting below is reconstructed from the recurring
# modalities schema (referencing_keys / cuda_env / paths / ... under settings).
settings:
  experiment_id: ${modalities_env:experiment_id}
  config_file_path: ${modalities_env:config_file_path}
  referencing_keys:
    sample_key: input_ids
    target_key: target_ids
    prediction_key: logits
  cuda_env:
    local_rank: ${cuda_env:LOCAL_RANK}
    global_rank: ${cuda_env:RANK}
    world_size: ${cuda_env:WORLD_SIZE}
  paths:
    experiments_root_path: ${modalities_env:experiments_root_path}
    experiment_folder_path: ${settings.paths.experiments_root_path}/${settings.experiment_id}
    train_dataset_path: /raid/s3/opengptx/max_lue/repositories/training_datasets/processed/fineweb_sample-100BT.pbin
  intervals:
    training_log_interval_in_steps: 1
    # 1001 > num_target_steps (1000), i.e. no intermediate checkpoints/evals.
    checkpointing_interval_in_steps: 1001
    evaluation_interval_in_steps: 1001
  consistency_enforcement:
    enforce_tokens_per_step_consistency: true
    enforce_last_step_logged: false
    enforce_last_step_evaluated: false
    enforce_last_step_checkpointed: false
  step_profile:
    gradient_accumulation_steps: 10
    local_train_micro_batch_size: 1
    sequence_length: 2048
  # NOTE(review): placement of this dp_degree reference (direct child of
  # settings vs. child of step_profile) is an assumption — confirm against
  # the modalities settings schema.
  dp_degree:
    instance_key: dp_degree
    pass_type: BY_REFERENCE
  training_target:
    num_target_tokens:
      component_key: number_conversion
      variant_key: num_tokens_from_num_steps
      config:
        num_steps: ${settings.training_target.num_target_steps}
        dp_degree:
          instance_key: dp_degree
          pass_type: BY_REFERENCE
        local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
        sequence_length: ${settings.step_profile.sequence_length}
        gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
    num_target_steps: 1000
  training_progress:
    global_num_seen_tokens: 0
    num_seen_steps: 0
    num_seen_samples: 0
    last_step: -1

# Collator producing (input_ids, target_ids) batches from tokenized samples.
# Indentation reconstructed (lost in extraction).
collate_fn:
  component_key: collate_fn
  variant_key: gpt_2_llm_collator
  config:
    sample_key: ${settings.referencing_keys.sample_key}
    target_key: ${settings.referencing_keys.target_key}

# Memory-mapped, pre-tokenized (.pbin) dataset packed into continuous
# fixed-length sequences. Indentation reconstructed (lost in extraction).
train_dataset:
  component_key: dataset
  variant_key: packed_mem_map_dataset_continuous
  config:
    raw_data_path: ${settings.paths.train_dataset_path}
    sequence_length: ${settings.step_profile.sequence_length}
    sample_key: ${settings.referencing_keys.sample_key}

# Training dataloader: distributed, resumable sampling over train_dataset.
# Indentation reconstructed (lost in extraction).
train_dataloader:
  component_key: data_loader
  variant_key: default
  config:
    # We set num_workers to 0 so that the data is loaded in the main process.
    # This is required to track how often the collator has been called in the
    # library tutorials. Otherwise the collator would be copied into each
    # worker process and the number of calls would be out of scope.
    num_workers: 0
    pin_memory: true
    dataloader_tag: train
    dataset:
      instance_key: train_dataset
      pass_type: BY_REFERENCE
    batch_sampler:
      component_key: batch_sampler
      variant_key: default
      config:
        batch_size: ${settings.step_profile.local_train_micro_batch_size}
        drop_last: true
        sampler:
          component_key: sampler
          variant_key: resumable_distributed_sampler
          config:
            dataset:
              instance_key: train_dataset
              pass_type: BY_REFERENCE
            rank: ${settings.cuda_env.global_rank}
            num_replicas: ${settings.cuda_env.world_size}
            shuffle: true
            seed: 42
            drop_last: true
            # Supports warm starts: skip samples already seen in a prior run.
            skip_num_global_samples: ${settings.training_progress.num_seen_samples}
    collate_fn:
      instance_key: collate_fn
      pass_type: BY_REFERENCE

# No evaluation dataloaders are configured for this run.
eval_dataloaders: []

# Checkpointing: strategy decides which checkpoints to keep, execution writes
# them to disk. Indentation reconstructed (lost in extraction).
checkpoint_saving:
  component_key: checkpoint_saving
  variant_key: default
  config:
    checkpoint_saving_strategy:
      component_key: checkpoint_saving_strategy
      variant_key: save_k_most_recent_checkpoints_strategy
      config:
        k: -1  # -1 to save all checkpoints
    checkpoint_saving_execution:
      component_key: checkpoint_saving_execution
      variant_key: dcp
      config:
        checkpoint_path: ${settings.paths.experiment_folder_path}
        global_rank: ${settings.cuda_env.global_rank}
        experiment_id: ${settings.experiment_id}

# Causal-LM cross-entropy loss over logits vs. target_ids.
# Indentation reconstructed (lost in extraction).
loss_fn:
  component_key: loss
  variant_key: clm_cross_entropy_loss
  config:
    target_key: ${settings.referencing_keys.target_key}
    prediction_key: ${settings.referencing_keys.prediction_key}

# Device mesh for distributed training. Indentation reconstructed.
device_mesh:
  component_key: device_mesh
  variant_key: default
  config:
    device_type: cuda
    data_parallel_replicate_degree: 1
    data_parallel_shard_degree: ${settings.cuda_env.world_size}  # i.e., fully sharded
    world_size: ${settings.cuda_env.world_size}

# Total data-parallel degree derived from the device mesh (shard x replicate).
# Indentation reconstructed (lost in extraction).
dp_degree:
  component_key: number_conversion
  variant_key: parallel_degree
  config:  # get the parallel degree from the device mesh
    device_mesh:
      instance_key: device_mesh
      pass_type: BY_REFERENCE
    parallelism_methods: [dp_shard, dp_replicate]

# Bundles model, optimizer and LR scheduler as the checkpointable app state.
# Indentation reconstructed (lost in extraction).
app_state:
  component_key: app_state
  variant_key: raw
  config:
    model:
      instance_key: initialized_model
      pass_type: BY_REFERENCE
    optimizer:
      instance_key: optimizer
      pass_type: BY_REFERENCE
    lr_scheduler:
      instance_key: lr_scheduler
      pass_type: BY_REFERENCE

# Weight initialization applied to the FSDP-wrapped model.
# Indentation reconstructed (lost in extraction).
initialized_model:
  component_key: model
  variant_key: model_initialized
  config:
    model:
      instance_key: fsdp_model
      pass_type: BY_REFERENCE
    model_initializer:
      component_key: model_initialization
      variant_key: composed
      config:
        model_type: gpt2
        weight_init_type: scaled
        mean: 0.0
        std: 0.02
        num_layers: ${model_raw.config.num_layers}

# FSDP2 wrapping of the compiled model with bf16 mixed precision.
# Indentation reconstructed (lost in extraction).
fsdp_model:
  component_key: model
  variant_key: fsdp2_wrapped
  config:
    model:
      instance_key: compiled_model
      pass_type: BY_REFERENCE
    device_mesh:
      instance_key: device_mesh
      pass_type: BY_REFERENCE
    mixed_precision_settings:
      param_dtype: BF_16
      reduce_dtype: BF_16
    block_names: [TransformerBlock]

# Per-block compilation of the raw model (blocks named TransformerBlock).
# Indentation reconstructed (lost in extraction).
compiled_model:
  component_key: model
  variant_key: compiled
  config:
    model:
      instance_key: model_raw
      pass_type: BY_REFERENCE
    block_names: [TransformerBlock]

# The einsum-transformer architecture this tutorial introduces.
# Indentation reconstructed (lost in extraction).
model_raw:
  component_key: model
  variant_key: einsum_transformer
  config:
    vocab_size: 128256
    sequence_length: ${settings.step_profile.sequence_length}
    embed_dim: 4096
    # 32 query heads vs. 8 KV heads, i.e. grouped-query attention (4 queries
    # per KV head) — presumably; confirm against the model implementation.
    num_q_heads: 32
    num_kv_heads: 8
    num_layers: 32
    mlp_expansion_factor: 3.5
    prediction_key: ${settings.referencing_keys.prediction_key}
    sample_key: ${settings.referencing_keys.sample_key}

# OneCycle LR schedule over the full training run.
# Indentation reconstructed (lost in extraction).
lr_scheduler:
  component_key: scheduler
  variant_key: onecycle_lr
  config:
    optimizer:
      instance_key: optimizer
      pass_type: BY_REFERENCE
    max_lr: 6e-4
    div_factor: 10
    final_div_factor: 1
    total_steps: ${settings.training_target.num_target_steps}
    pct_start: 0.01
    anneal_strategy: cos
    # -1 on a fresh run; set from training progress to support warm starts.
    last_epoch: ${settings.training_progress.last_step}

# AdamW optimizer; the lr here is the initial value, overridden by the
# OneCycle scheduler. Indentation reconstructed (lost in extraction).
optimizer:
  component_key: optimizer
  variant_key: adam_w
  config:
    lr: 0.0001
    betas: [0.9, 0.95]
    eps: 1e-8
    weight_decay: 1e-1
    weight_decay_groups_excluded: [embedding, layernorm]
    wrapped_model:
      instance_key: initialized_model
      pass_type: BY_REFERENCE

# Global-norm gradient clipping for the FSDP2-wrapped model.
# Indentation reconstructed (lost in extraction).
gradient_clipper:
  component_key: gradient_clipper
  variant_key: fsdp2
  config:
    wrapped_model:
      instance_key: initialized_model
      pass_type: BY_REFERENCE
    norm_type: P2_NORM
    max_norm: 1.0
    device_mesh:
      instance_key: device_mesh
      pass_type: BY_REFERENCE

# Rich console progress display. Indentation reconstructed (lost in extraction).
progress_subscriber:
  component_key: progress_subscriber
  variant_key: rich
  config:
    global_rank: ${settings.cuda_env.global_rank}
    num_seen_steps: ${settings.training_progress.num_seen_steps}
    num_target_steps: ${settings.training_target.num_target_steps}
    train_dataloader_tag: ${train_dataloader.config.dataloader_tag}
    eval_dataloaders:
      instance_key: eval_dataloaders
      pass_type: BY_REFERENCE

# Writes evaluation results to a JSONL file in the experiment folder.
# Indentation reconstructed (lost in extraction).
evaluation_subscriber:
  component_key: results_subscriber
  variant_key: to_disc
  config:
    output_file_path: ${settings.paths.experiment_folder_path}/evaluation_results.jsonl

# Model-FLOPs-utilization calculator (gpt2 formula applied to this model).
# Indentation reconstructed (lost in extraction).
mfu_calculator:
  component_key: mfu_calculator
  variant_key: gpt2
  config:
    n_layer: ${model_raw.config.num_layers}
    sequence_length: ${settings.step_profile.sequence_length}
    n_embd: ${model_raw.config.embed_dim}
    world_size: ${settings.cuda_env.world_size}
    wrapped_model:
      instance_key: initialized_model
      pass_type: BY_REFERENCE
    device_mesh:
      instance_key: device_mesh
      pass_type: BY_REFERENCE

# Optional combined profiler — uncomment to enable. Indentation inside the
# comments restored so the stanza can be uncommented as-is.
# profiler:
#   component_key: steppable_profiler
#   variant_key: combined
#   config:
#     profilers:
#       - instance_key: kernel_profiler
#         pass_type: BY_REFERENCE
#       # - instance_key: memory_profiler
#       #   pass_type: BY_REFERENCE

# CUDA kernel tracing on rank 0 (3 active steps after 1 wait + 1 warmup).
# Indentation reconstructed (lost in extraction).
kernel_profiler:
  component_key: steppable_profiler
  variant_key: kernel_tracing
  config:
    num_wait_steps: 1
    num_warmup_steps: 1
    num_active_steps: 3
    profiler_activities: [CUDA]
    profile_memory: true
    record_shapes: true
    with_stack: true
    with_flops: true
    with_modules: true
    tracked_ranks: [0]
    output_folder_path: ${settings.paths.experiment_folder_path}/profiling

# Memory-snapshot tracing on rank 0; same step schedule as kernel_profiler.
# Indentation reconstructed (lost in extraction).
memory_profiler:
  component_key: steppable_profiler
  variant_key: memory_tracing
  config:
    memory_snapshot_folder_path: ${settings.paths.experiment_folder_path}/profiling
    num_wait_steps: 1
    num_warmup_steps: 1
    num_active_steps: 3
    tracked_ranks: [0]
Empty file.
Loading