# dvc.lock — auto-generated by DVC (`dvc repro`); do not edit by hand.
schema: '2.0'
stages:
tokenize:
cmd: mkdir -p tokenizer/build && cmake -S tokenizer -B tokenizer/build
-DCMAKE_BUILD_TYPE=Release && cmake --build tokenizer/build -j$(nproc) &&
./tokenizer/build/tokenize
deps:
- path: tokenizer/CMakeLists.txt
hash: md5
md5: 769d2e7797f85b125e6dea339062535a
size: 6405
- path: tokenizer/src/lib/dataloader.cpp
hash: md5
md5: 53ebf2ee5ada158ea331af08bcf08c69
size: 2070
- path: tokenizer/src/lib/dataloader.hpp
hash: md5
md5: 8c74b03b4a1f4f941751083457b5f6ca
size: 600
- path: tokenizer/src/lib/io.cpp
hash: md5
md5: 29106b39a48a20faa8d1973ede05ff93
size: 6004
- path: tokenizer/src/lib/io.hpp
hash: md5
md5: a8177ebb8c302e566c180770cdc09639
size: 965
- path: tokenizer/src/lib/text.cpp
hash: md5
md5: 419b587d7e870e7665231a8376ea9481
size: 2872
- path: tokenizer/src/lib/text.hpp
hash: md5
md5: d3d105e19ac1861ea43655dcf6cbfd6e
size: 134
- path: tokenizer/src/lib/threading.cpp
hash: md5
md5: 5497a70216e75242a6b969862c92fc3b
size: 1367
- path: tokenizer/src/lib/threading.hpp
hash: md5
md5: 36ad00dece0ca60fdd9f73d4d3d2e982
size: 1134
- path: tokenizer/src/lib/tokenizer.cpp
hash: md5
md5: beb1595474211bafbde00a5e8946cd79
size: 43806
- path: tokenizer/src/lib/tokenizer.hpp
hash: md5
md5: 077446b68fa353a79c6af9e80c0926f6
size: 3706
- path: tokenizer/src/tokenize.cpp
hash: md5
md5: e38e0faff33642ec781675be7ce5cbba
size: 12728
params:
params.yaml:
tokenize:
dataset_path: data/bigcode-the-stack-dedup
glob_pattern: '*.py'
seed: 42
max_unique_words: 0
vocab_size: 35263
pattern: "'(?i:[sdmt]|ll|ve|re)| ?[A-Za-z_(][A-Za-z_.]*|%(?:\\.\\d+)?[sdifFeEgGxXoc%]|[0-9]{1,3}|
?[^ %_A-Za-z0-9]+(?: \")?[\\r\\n]*|%|\\s+$|\\s+(?=\\s)|\\s"
bos_token: <|startoftext|>
eos_token: <|endoftext|>
pad_token: <|pad|>
cursor_token: <|cursor|>
edit_start_token: <|edit_start|>
edit_end_token: <|edit_end|>
max_train_size: 5GB
chunk_size: 256M
dataset_dir: out/tokenize/chunks
tok_file: out/tokenize/tok.bin
outs:
- path: out/tokenize
hash: md5
md5: 2e65bce054cb499fc7aa433e08e93711.dir
size: 10751383763
nfiles: 12
train:
cmd: uv run python src/train.py
deps:
- path: out/tokenize
hash: md5
md5: 7cf9c81e19c2fed79e239bb6fcc993a7.dir
size: 10754551901
nfiles: 15
- path: src/dataloaders/token_dataloader.py
hash: md5
md5: eb3d2bfb850c82f33cb0c8192f767e32
size: 13755
- path: src/dataloaders/token_datamodule.py
hash: md5
md5: b3b7202bb562d39ddddb6e55fdecd333
size: 7937
- path: src/models/qwen3.py
hash: md5
md5: 78da8bc156449d3aa63cb52e1a7ee0c3
size: 14115
- path: src/train.py
hash: md5
md5: 7755a6ba951b23fd5df210d60877d40a
size: 8951
- path: src/trainers/trainer.py
hash: md5
md5: 2a28aa60aeb22603b161b343db1170e7
size: 19407
params:
params.yaml:
data.bos_token_id: 35256
data.dataset_dir: out/tokenize/chunks
data.eos_token_id: 35257
data.max_tokens: 0
data.num_workers: 8
data.pad_token_id: 35258
data.seq_length: 256
data.split_ratio: 0.99
model:
hidden_size: 512
num_hidden_layers: 10
num_attention_heads: 32
num_key_value_heads: 2
intermediate_size: 2024
max_position_embeddings: 512
rope_theta: 10000.0
attention_dropout: 0.1
rms_norm_eps: 1e-06
use_sliding_window: false
sliding_window: 4096
tokenize.dataset_dir: out/tokenize/chunks
tokenize.tok_file: out/tokenize/tok.bin
tokenize.vocab_size: 35263
training:
prefix: qwen3
batch_size: 32
epochs: 5
lr: 0.0001
weight_decay: 0.1
grad_clip: 1.0
gradient_accumulation_steps: 4
use_amp: true
compile_mode: max-autotune-no-cudagraphs
devices: 1
strategy: auto
seed: 42
warmup_steps: 5000
scheduler_t_max_steps:
log_every_n_steps: 100
val_every_n_steps: 5000
save_dir: out/train/checkpoints
log_dir: out/train/logs
outs:
- path: out/train/checkpoints/best.ckpt
hash: md5
md5: 9799630d1b9a05d63cceb8fe6f6ea7d0
size: 1341455351
- path: out/train/checkpoints/latest.ckpt
hash: md5
md5: 9799630d1b9a05d63cceb8fe6f6ea7d0
size: 1341455351
- path: out/train/checkpoints/metrics.json
hash: md5
md5: 8a80554c91d9fca8acb82f023de02f11
size: 3
- path: out/train/logs
hash: md5
md5: 4393757035b8cdea1ab037dc777bd0c3.dir
size: 748623
nfiles: 3
export:
cmd: uv run python src/export.py
deps:
- path: out/train/checkpoints/best.ckpt
hash: md5
md5: 9799630d1b9a05d63cceb8fe6f6ea7d0
size: 1341455351
- path: src/export.py
hash: md5
md5: 02c05e45817ebba881397a31f528a379
size: 4194
- path: src/models/qwen3.py
hash: md5
md5: 78da8bc156449d3aa63cb52e1a7ee0c3
size: 14115
params:
params.yaml:
tokenize.bos_token: <|startoftext|>
tokenize.eos_token: <|endoftext|>
tokenize.pad_token: <|pad|>
tokenize.tok_file: out/tokenize/tok.bin
tokenize.vocab_size: 35263
training.save_dir: out/train/checkpoints
outs:
- path: out/export
hash: md5
md5: 724d985afee85aff5b65de502e51cdab.dir
size: 447117099
nfiles: 5