-
Notifications
You must be signed in to change notification settings - Fork 18
Expand file tree
/
Copy pathtrain_buzz_agent.lua
More file actions
361 lines (328 loc) · 13.5 KB
/
Copy pathtrain_buzz_agent.lua
File metadata and controls
361 lines (328 loc) · 13.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
--[[
Copyright (c) 2014 Google Inc.
See LICENSE file for full terms of limited license.
]]
-- Command-line interface: declare every supported option, then parse the
-- script arguments into the global table `opt`.
local cmd = torch.CmdLine()
cmd:text()
cmd:text('Train Buzz Agent in Quizbowl Environment:')
cmd:text()
cmd:text('Options:')

-- data
cmd:option('-data_dir', 'dat/qb',
           'data directory. Should contain the file input.txt with input data')
cmd:option('-input_file', 'input.txt', 'data file name')
cmd:option('-batch_size', 50, 'number of questions to process in parallel')

-- model params
cmd:option('-embedding', '',
           'directory of pretrained word embeddings')
cmd:option('-init_content', '',
           'pretrained content model parameters from checkpoint at this path')
cmd:option('-hist_len', 1,
           'history length of state features')
cmd:option('-best', false,
           'load best model or current model')
cmd:option('-learning_rate_decay', 0.97,
           'learning rate decay')
cmd:option('-learning_rate_decay_after', 20,
           'in number of epochs, when to start decaying the learning rate')

-- dqn
cmd:option('-framework', '',
           'name of training framework')
cmd:option('-env', '',
           'name of environment to use')
cmd:option('-simulate', 0,
           'simulate players or not')
cmd:option('-supervise', 0,
           'using supervised signal as reward during training')
cmd:option('-checkpoint_dir', '/fs/clip-scratch/hhe/opponent/cv',
           'output directory where checkpoints get written')
cmd:option('-savefile', '',
           'filename to autosave the checkpont to. Will be inside checkpoint_dir/')
cmd:option('-network', '',
           'reload pretrained network')
cmd:option('-agent', 'QBNeuralQLearner',
           'name of agent file to use')
cmd:option('-agent_params', '',
           'string of agent parameters')
cmd:option('-seed', 1,
           'fixed input seed for repeatable experiments')
cmd:option('-saveNetworkParams', false,
           'saves the agent network in a separate file')
cmd:option('-prog_freq', 5*10^3,
           'frequency of progress output')
-- NOTE: the training section later overwrites opt.save_freq with
-- 5 * opt.eval_freq, so the value parsed here is not the one used for saving.
cmd:option('-save_freq', 5*10^4,
           'the model is saved every save_freq steps')
cmd:option('-eval_freq', 0,
           'frequency of greedy evaluation')
cmd:option('-max_epochs', 50,
           'number of full passes through the training data')
cmd:option('-steps', 10^5,
           'number of training steps to perform')
cmd:option('-eval_steps', 10^5,
           'number of evaluation steps')
cmd:option('-verbose', 2,
           'the higher the level, the more information is printed to screen')
cmd:option('-threads', 0,
           'number of BLAS threads')
cmd:option('-gpuid', -1,
           'which gpu to use. -1 = use CPU')
cmd:option('-test', 0,
           'evaluate on test set')
cmd:text()

-- `opt` is deliberately a global: env_setup() / qb_dqn_setup() are called
-- without arguments, so they presumably read it from the global scope
-- (NOTE(review): confirm against the `setup` module).
opt = cmd:parse(arg)
-- The dqn package takes its device id from `opt.gpu` (per the original note
-- "dqn package use gpu"), so mirror the -gpuid option under that name.
opt.gpu = opt.gpuid

-- With no explicit -savefile, checkpoints are named after the agent.
if opt.savefile == '' then
  opt.savefile = opt.agent
end

-- Turn the numeric 0/1 command-line flag into a real boolean.
opt.supervise = (opt.supervise == 1)

--- General env setup based on opt
require 'setup'
env_setup()

-- The agent, the game environment, and the agent's action table are used by
-- everything below in this file.
local agent, game_env = qb_dqn_setup()
local actions = agent.actions
--- Smoke-test the environment: start a game on split 1, issue up to two
-- qb.WAIT steps (stopping as soon as the game reports terminal), and, if the
-- game is still running afterwards, issue a single qb.BUZZ step.
function test_framework()
  local _, finished = game_env:new_game(1)

  local waits_left = 2
  while waits_left > 0 and not finished do
    _, finished = game_env:step(qb.WAIT)
    waits_left = waits_left - 1
  end

  if finished then
    return
  end
  game_env:step(qb.BUZZ)
end
-- Shadow `print` for the rest of this file with a version that flushes the
-- default output after every call; the built-in stays reachable as old_print.
local old_print = print
local function print(...)
  old_print(...)
  io.flush()
end
---------------------------------------------------
-- Bookkeeping shared by the training loop, reset_stats() and eval_split().

-- Not referenced again in the visible part of this file.
local learn_start = agent.learn_start
-- Reference point for the "training time" figure; eval_split() shifts it
-- forward by the time spent evaluating.
local start_time = sys.clock()

-- Histories indexed by evaluation number; all of them are written into the
-- checkpoint file.
local reward_counts, episode_counts = {}, {}
local time_history = { 0 }
local v_history, qmax_history, td_history = {}, {}, {}
-- this is just training reward
local reward_history = {}
local eval_reward_history = {}

-- Not referenced again in the visible part of this file.
local step = 0

-- Running totals since the last progress report, plus the accumulators of
-- the episode currently being played.
local total_reward, total_length, nepisodes = 0, 0, 0
local episode_reward, episode_length = 0, 0
--- Zero the running training totals and the per-episode accumulators.
-- Called after each progress report so the next report covers a fresh window.
function reset_stats()
  total_reward, total_length, nepisodes = 0, 0, 0
  episode_reward, episode_length = 0, 0
end
--- Run the agent greedily over every buzz of one data split and report the
-- average reward and episode length.
--
-- In validation mode (`test` falsy) the function also calls
-- agent:compute_validation_statistics(), clones the network into
-- agent.best_network when the average reward beats every earlier entry of
-- reward_history, appends to the v/td/qmax/reward/time histories, and moves
-- start_time forward by the evaluation time so it is not counted as
-- training time.
--
-- In test mode (`test` truthy) it prints an overall report, a per player
-- group report when there is more than one group (including averaged gating
-- weights for gated agents), and appends the average reward to
-- eval_reward_history[split_index].
--
-- When opt.test == 1 a CSV line per episode is written to
-- `opt.savefile .. '.log'`.
--
-- @param split_index  split passed to game_env:reset and game_env:new_game;
--                     also indexes game_env.num_buzzes
-- @param test         truthy for test-style reporting, falsy/nil for
--                     validation-style reporting
function eval_split(split_index, test)
  game_env:reset(split_index)
  print(string.format('============= eval split %d =============', split_index))
  test = test or false

  -- These intentionally shadow the file-level training counters so that an
  -- evaluation never disturbs the running training statistics.
  local total_reward = 0
  local total_length = 0
  local nepisodes = 0
  local episode_reward = 0
  local episode_length = 0

  local group_reward = torch.zeros(game_env.num_player_groups)
  local group_length = torch.zeros(game_env.num_player_groups)
  local group_nepisodes = torch.zeros(game_env.num_player_groups)

  -- Gating weights are tracked only for agents whose name contains 'QBO'
  -- and that report at least one expert.
  local gating = (string.find(opt.agent, 'QBO', 1) and (agent.n_experts > 0)) and true or false
  local group_weights = gating and torch.FloatTensor(game_env.num_player_groups, agent.n_experts):zero() or nil

  -- Per-episode log, only when the script runs in test-only mode. A failure
  -- to open the file is reported instead of being silently ignored; the
  -- evaluation itself still runs.
  local log = nil
  if opt.test == 1 then
    local open_err
    log, open_err = io.open(opt.savefile .. '.log', 'w')
    if log == nil then
      print('warning: could not open log file: ' .. tostring(open_err))
    end
  end

  local eval_time = sys.clock()
  local n = game_env.num_buzzes[split_index]
  for _ = 1, n do
    -- Declared local: without it these three names leaked into the global
    -- table (the file-level locals of the same names are declared only
    -- after this function).
    local state, terminal, reward = game_env:new_game(split_index, true)
    local episode_gating_weights
    if test and gating then
      episode_gating_weights = torch.FloatTensor(agent.n_experts):zero()
    end

    while true do
      local action_index = agent:perceive(reward, state, terminal, true, 0.0)
      if terminal then
        break
      end
      if test and gating then
        episode_gating_weights:add(agent.gating_weights)
      end
      state, terminal, reward = game_env:step(actions[action_index])
      -- record every reward
      episode_reward = episode_reward + reward
      episode_length = episode_length + 1
    end

    local group = game_env.player_group
    if test and gating then
      -- A zero-length episode accumulated nothing; dividing by 0 would turn
      -- the all-zero vector into NaNs and poison the group average.
      if episode_length > 0 then
        episode_gating_weights:div(episode_length)
      end
      group_weights[group]:add(episode_gating_weights)
    end

    -- group stats
    if game_env.num_player_groups > 1 then
      group_reward[group] = group_reward[group] + episode_reward
      group_length[group] = group_length[group] + episode_length
      group_nepisodes[group] = group_nepisodes[group] + 1
    end

    -- write log
    if log ~= nil then
      log:write(string.format('%d,%d,%d,%d,%.2f\n', game_env.qid, game_env.player_id, group, episode_length, episode_reward))
    end

    -- overall stats
    total_reward = total_reward + episode_reward
    total_length = total_length + episode_length
    episode_reward = 0
    episode_length = 0
    nepisodes = nepisodes + 1
  end
  assert(nepisodes == n)

  eval_time = sys.clock() - eval_time
  total_reward = total_reward / nepisodes
  total_length = total_length / nepisodes

  if game_env.num_player_groups > 1 then
    for g = 1, game_env.num_player_groups do
      local count = group_nepisodes[g]
      -- A group with no episodes in this split keeps its zero totals rather
      -- than becoming NaN through a 0/0 division.
      if count > 0 then
        group_reward[g] = group_reward[g] / count
        group_length[g] = group_length[g] / count
        if group_weights ~= nil then
          group_weights[g]:div(count)
        end
      end
    end
  end

  if log ~= nil then
    log:close()
  end

  if not test then
    -- Exclude the time spent evaluating from the training-time figure.
    start_time = start_time + eval_time
    agent:compute_validation_statistics()
    local ind = #reward_history + 1

    if #reward_history == 0 or total_reward > torch.Tensor(reward_history):max() then
      print('new best network on dev set')
      agent.best_network = agent.network:clone()
    end

    if agent.v_avg then
      v_history[ind] = agent.v_avg
      td_history[ind] = agent.tderr_avg
      qmax_history[ind] = agent.q_max
    end
    print("V", v_history[ind], "TD error", td_history[ind], "Qmax", qmax_history[ind])

    reward_history[ind] = total_reward
    -- NOTE(review): `nrewards` is not defined anywhere in the visible code,
    -- so this stores nil and reward_counts stays empty. Kept as-is because
    -- the intended value cannot be determined from this file.
    reward_counts[ind] = nrewards

    time_history[ind + 1] = sys.clock() - start_time
    local time_dif = time_history[ind + 1] - time_history[ind]

    print(string.format(
      'epsilon: %.2f, lr: %G\n' ..
      'reward: %.2f, episode length: %.2f\n' ..
      'training time: %ds, eval time: %ds, ' ..
      'num. ep.: %d',
      agent.ep, agent.lr, total_reward, total_length, time_dif,
      eval_time,
      n))
    game_env:report_error_analysis()
  else
    print(string.format(
      '\nreward: %.2f, episode length: %.2f, epsilon: %.2f, ' ..
      'eval time: %ds, num. ep.: %d',
      total_reward, total_length, agent.ep,
      eval_time, n))
    game_env:report_error_analysis()

    if game_env.num_player_groups > 1 then
      for g = 1, game_env.num_player_groups do
        print('------ player group ------' .. g)
        print(string.format(
          'reward: %.2f, episode length: %.2f, num. ep.: %d',
          group_reward[g], group_length[g], group_nepisodes[g]))
        if group_weights ~= nil then
          for e = 1, agent.n_experts do
            io.write(string.format('%.4f ', group_weights[g][e]))
          end
          print('')
        end
        game_env:report_error_analysis(g)
      end
    end

    -- record eval reward history
    -- TODO: remove split_index
    if eval_reward_history[split_index] == nil then
      eval_reward_history[split_index] = {}
    end
    local ind = #eval_reward_history[split_index] + 1
    eval_reward_history[split_index][ind] = total_reward
  end
  print('=========================================')
end
--------------------------- testing --------------------------------
-- With -test 1 the script only evaluates (test-style report) and exits
-- before any training happens.
if opt.test == 1 then
  -- Split index evaluated in test-only mode.
  local TEST_SPLIT = 3
  eval_split(TEST_SPLIT, true)
  os.exit()
end
--------------------------- training --------------------------------
-- An epoch is counted in questions (num_questions) rather than buzzes
-- because only one buzz is sampled for each question during training.
local ntrain = game_env.num_questions[1]
--local ntrain = game_env.num_buzzes[1]

-- -eval_freq 0 means "evaluate once per epoch".
if opt.eval_freq == 0 then
  opt.eval_freq = ntrain
end
-- NOTE: unconditionally replaces whatever -save_freq was parsed from the
-- command line.
opt.save_freq = 5 * opt.eval_freq

local max_num_games = opt.max_epochs * ntrain
local epoch = 0

-- Agent fields that are detached while a checkpoint is written (so they are
-- not serialized with the agent) and re-attached afterwards.
local DETACHED_FIELDS = {
  'valid_s', 'valid_a', 'valid_r', 'valid_s2', 'valid_term',
  'w', 'dw', 'g', 'g2', 'delta', 'delta2', 'deltas', 'tmp',
}

-- Play one training game on split 1 and fold its reward and length into the
-- running totals.
local function play_training_episode()
  local state, terminal, reward = game_env:new_game(1)
  while true do
    -- Transitions that carry a non-zero reward are flagged as priority.
    local priority = (reward ~= 0)
    local action_index = agent:perceive(reward, state, terminal, false, nil, priority)
    if terminal then
      break
    end
    state, terminal, reward = game_env:step(actions[action_index])
    episode_reward = episode_reward + reward
    episode_length = episode_length + 1
  end

  total_reward = total_reward + episode_reward
  total_length = total_length + episode_length
  episode_reward = 0
  episode_length = 0
  nepisodes = nepisodes + 1
end

-- Write the agent, its networks and all histories to a checkpoint file whose
-- name encodes the learning rate, discount and (fractional) epoch.
local function save_checkpoint(current_epoch)
  local held = {}
  for _, field in ipairs(DETACHED_FIELDS) do
    held[field] = agent[field]
    agent[field] = nil
  end

  local filename = string.format('%s/%s_lr%.6f_disc%.2f_epoch%.2f.t7', opt.checkpoint_dir, opt.savefile, agent.lr, agent.discount, current_epoch)
  print('saving checkpoint to ' .. filename)
  torch.save(filename, {agent = agent,
                        model = agent.network,
                        best_model = agent.best_network,
                        reward_history = reward_history,
                        eval_reward_history = eval_reward_history,
                        reward_counts = reward_counts,
                        episode_counts = episode_counts,
                        time_history = time_history,
                        v_history = v_history,
                        td_history = td_history,
                        qmax_history = qmax_history,
                        arguments=opt})

  if opt.saveNetworkParams then
    -- Optional second file holding only a float copy of the detached `w`.
    local nets = {network = held.w:clone():float()}
    torch.save(filename .. '.params.t7', nets, 'ascii')
  end

  for _, field in ipairs(DETACHED_FIELDS) do
    agent[field] = held[field]
  end

  io.flush()
  collectgarbage()
end

for game = 1, max_num_games do
  epoch = game / ntrain
  play_training_episode()

  local epoch_finished = (game % ntrain == 0)

  -- progress report
  if game % opt.prog_freq == 0 then
    print(string.format("%d/%d (epoch %.3f), lr: %G, epsilon: %.2f\n" ..
                        "reward: %.2f, episode length: %.2f, num. ep.: %d",
                        game, max_num_games, epoch, agent.lr, agent.ep,
                        total_reward/nepisodes, total_length/nepisodes, nepisodes))
    --agent:report()
    reset_stats()
    collectgarbage()
  end

  -- learning rate decay: once per finished epoch, after the warm-up epochs
  if epoch_finished and opt.learning_rate_decay < 1
     and epoch >= opt.learning_rate_decay_after then
    agent.lr_start = agent.lr_start * opt.learning_rate_decay
    agent.lr_end = agent.lr_start
    print('decayed learning rate by a factor ' .. opt.learning_rate_decay .. ' to ' .. agent.lr_start)
  end

  -- evaluation
  if game % opt.eval_freq == 0 then
    --game_env.debug = true
    eval_split(2)
    --eval_split(3, true)
    --game_env.debug = false
  end
  if epoch_finished then
    eval_split(3, true)
  end

  -- checkpoint every save_freq games and after the very last game
  if game % opt.save_freq == 0 or game == max_num_games then
    save_checkpoint(epoch)
  end

  if game % 1000 == 0 then
    collectgarbage()
  end
end