diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..314255a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+# Ignore the virtual environment directory
+.venv/
diff --git a/ChessBoard.py b/ChessBoard.py
index 7ea4535..9d41b29 100755
--- a/ChessBoard.py
+++ b/ChessBoard.py
@@ -73,14 +73,14 @@ def remove(self, x, y):
del self.pieces[x, y]
def select(self, x, y, player_is_red):
- # 选中棋子
+ # 选中棋子 # Select chess pieces
if not self.selected_piece:
if (x, y) in self.pieces and self.pieces[x, y].is_red == player_is_red:
self.pieces[x, y].selected = True
self.selected_piece = self.pieces[x, y]
return False, None
- # 移动棋子
+ # 移动棋子 # Move the pieces
if not (x, y) in self.pieces:
if self.selected_piece:
ox, oy = self.selected_piece.x, self.selected_piece.y
@@ -91,11 +91,11 @@ def select(self, x, y, player_is_red):
return True, (ox, oy, x, y)
return False, None
- # 同一个棋子
+ # 同一个棋子 # The same chess piece
if self.pieces[x, y].selected:
return False, None
- # 吃子
+ # 吃子 # capture piece
if self.pieces[x, y].is_red != player_is_red:
ox, oy = self.selected_piece.x, self.selected_piece.y
if self.can_move(ox, oy, x-ox, y-oy):
@@ -105,10 +105,10 @@ def select(self, x, y, player_is_red):
return True, (ox, oy, x, y)
return False, None
- # 取消选中
+ # 取消选中 # Uncheck
for key in self.pieces.keys():
self.pieces[key].selected = False
- # 选择棋子
+ # 选择棋子 # Choose a piece
self.pieces[x, y].selected = True
self.selected_piece = self.pieces[x,y]
return False, None
\ No newline at end of file
diff --git a/ChessGame.py b/ChessGame.py
index 0e091d6..5ba71af 100755
--- a/ChessGame.py
+++ b/ChessGame.py
@@ -1,7 +1,5 @@
from ChessBoard import *
-from ChessView import ChessView
from main import *
-import tkinter
def real_coord(x):
if x <= 50:
@@ -23,18 +21,22 @@ class ChessGame:
time_green = []
def __init__(self, in_ai_count, in_ai_function, in_play_playout, in_delay, in_end_delay, batch_size, search_threads,
- processor, num_gpus, res_block_nums, human_color = "b"):
+ processor, num_gpus, res_block_nums, human_color = "b", no_gui=False):
self.human_color = human_color
self.current_player = "w"
self.players = {}
self.players[self.human_color] = "human"
ai_color = "w" if self.human_color == "b" else "b"
self.players[ai_color] = "AI"
+ self.no_gui = no_gui
ChessGame.board = ChessBoard(self.human_color == 'b')
- self.view = ChessView(self, board=ChessGame.board)
- self.view.showMsg("Loading Models...") #"Red" player_color
- self.view.draw_board(self.board)
+ if not self.no_gui:
+ from ChessView import ChessView
+ import tkinter
+ self.view = ChessView(self, board=ChessGame.board)
+ self.view.showMsg("Loading Models...") #"Red" player_color
+ self.view.draw_board(self.board)
ChessGame.game_mode = in_ai_count
self.ai_function = in_ai_function
self.play_playout = in_play_playout
@@ -45,7 +47,8 @@ def __init__(self, in_ai_count, in_ai_function, in_play_playout, in_delay, in_en
self.win_rate['w'] = 0.0
self.win_rate['b'] = 0.0
- self.view.root.update()
+ if not self.no_gui:
+ self.view.root.update()
self.cchess_engine = cchess_main(playout=self.play_playout, in_batch_size=batch_size, exploration=False, in_search_threads=search_threads,
processor=processor, num_gpus=num_gpus, res_block_nums=res_block_nums, human_color=human_color)
@@ -53,23 +56,32 @@ def player_is_red(self):
return self.current_player == "w"
def start(self):
- # below added by Fei Li
- self.view.showMsg("Red")
- if self.game_mode == 1:
- print ('-----Round %d-----' % self.cur_round)
- if self.players["w"] == "AI":
+ if not self.no_gui:
+ # below added by Fei Li
+ self.view.showMsg("Red")
+ if self.game_mode == 1:
+ print ('-----Round %d-----' % self.cur_round)
+ if self.players["w"] == "AI":
+ self.win_rate['w'] = self.perform_AI()
+ self.view.draw_board(self.board)
+ self.change_player()
+ elif self.game_mode == 2:
+ print('-----Round %d-----' % self.cur_round)
self.win_rate['w'] = self.perform_AI()
self.view.draw_board(self.board)
- self.change_player()
- elif self.game_mode == 2:
- print('-----Round %d-----' % self.cur_round)
- self.win_rate['w'] = self.perform_AI()
- self.view.draw_board(self.board)
- self.view.start()
+ self.view.start()
+ else:
+ self.start_no_gui()
+
+ def start_no_gui(self):
+ while not self.check_end():
+ self.perform_AI()
+ self.change_player()
def disp_mcts_msg(self):
- self.view.showMsg("MCTS Searching...")
+ if not self.no_gui:
+ self.view.showMsg("MCTS Searching...")
def callback(self, event):
if self.game_mode == 1 and self.players[self.current_player] == "AI":
@@ -109,21 +121,23 @@ def callback(self, event):
# below added by Fei Li
def quit(self):
- time.sleep(self.end_delay)
- self.view.quit()
+ if not self.no_gui:
+ time.sleep(self.end_delay)
+ self.view.quit()
def check_end(self):
ret, winner = self.cchess_engine.check_end()
if ret == True:
- if winner == "b":
- self.view.showMsg('*****Green Wins at Round %d*****' % self.cur_round)
- self.view.root.update()
- elif winner == "w":
- self.view.showMsg('*****Red Wins at Round %d*****' % self.cur_round)
- self.view.root.update()
- elif winner == "t":
- self.view.showMsg('*****Draw at Round %d*****' % self.cur_round)
- self.view.root.update()
+ if not self.no_gui:
+ if winner == "b":
+ self.view.showMsg('*****Green Wins at Round %d*****' % self.cur_round)
+ self.view.root.update()
+ elif winner == "w":
+ self.view.showMsg('*****Red Wins at Round %d*****' % self.cur_round)
+ self.view.root.update()
+ elif winner == "t":
+ self.view.showMsg('*****Draw at Round %d*****' % self.cur_round)
+ self.view.root.update()
return ret
def _check_end(self, board):
@@ -159,12 +173,13 @@ def change_player(self):
green_msg = " ({:.4f})".format(self.win_rate['b'])
sorted_move_probs = self.cchess_engine.get_hint(self.ai_function, True, self.disp_mcts_msg)
# print(sorted_move_probs)
- self.view.print_all_hint(sorted_move_probs)
- # self.move_images.append(tkinter.PhotoImage(file="images/OOS.gif"))
- # self.can.create_image(board_coord(x), board_coord(y), image=self.move_images[-1])
+ if not self.no_gui:
+ self.view.print_all_hint(sorted_move_probs)
+ # self.move_images.append(tkinter.PhotoImage(file="images/OOS.gif"))
+ # self.can.create_image(board_coord(x), board_coord(y), image=self.move_images[-1])
- self.view.showMsg("Red" + red_msg + " Green" + green_msg if self.current_player == "w" else "Green" + green_msg + " Red" + red_msg)
- self.view.root.update()
+ self.view.showMsg("Red" + red_msg + " Green" + green_msg if self.current_player == "w" else "Green" + green_msg + " Red" + red_msg)
+ self.view.root.update()
# if self.game_mode == 0:
# return False
if self.game_mode == 1:
@@ -182,9 +197,9 @@ def change_player(self):
def perform_AI(self):
print ('...AI is calculating...')
- START_TIME = time.clock()
+ START_TIME = time.perf_counter()
move, win_rate = self.cchess_engine.select_move(self.ai_function)
- time_used = time.clock() - START_TIME
+ time_used = time.perf_counter() - START_TIME
print ('...Use %fs...' % time_used)
if self.current_player == "w":
self.time_red.append(time_used)
@@ -197,11 +212,9 @@ def perform_AI(self):
# AI VS AI mode
def game_mode_2(self):
self.change_player()
- self.view.draw_board(self.board)
- self.view.root.update()
+ if not self.no_gui:
+ self.view.draw_board(self.board)
+ self.view.root.update()
if self.check_end():
return True
return False
-
-# game = ChessGame()
-# game.start()
diff --git a/ChessGame_tf2.py b/ChessGame_tf2.py
deleted file mode 100755
index 279a08f..0000000
--- a/ChessGame_tf2.py
+++ /dev/null
@@ -1,207 +0,0 @@
-from ChessBoard import *
-from ChessView import ChessView
-from main_tf2 import *
-import tkinter
-
-def real_coord(x):
- if x <= 50:
- return 0
- else:
- return (x-50)//40 + 1
-
-
-def board_coord(x):
- return 30 + 40*x
-
-
-class ChessGame:
-
- board = None #ChessBoard()
- cur_round = 1
- game_mode = 1 # 0:HUMAN VS HUMAN 1:HUMAN VS AI 2:AI VS AI
- time_red = []
- time_green = []
-
- def __init__(self, in_ai_count, in_ai_function, in_play_playout, in_delay, in_end_delay, batch_size, search_threads,
- processor, num_gpus, res_block_nums, human_color = "b"):
- self.human_color = human_color
- self.current_player = "w"
- self.players = {}
- self.players[self.human_color] = "human"
- ai_color = "w" if self.human_color == "b" else "b"
- self.players[ai_color] = "AI"
-
- ChessGame.board = ChessBoard(self.human_color == 'b')
- self.view = ChessView(self, board=ChessGame.board)
- self.view.showMsg("Loading Models...") #"Red" player_color
- self.view.draw_board(self.board)
- ChessGame.game_mode = in_ai_count
- self.ai_function = in_ai_function
- self.play_playout = in_play_playout
- self.delay = in_delay
- self.end_delay = in_end_delay
-
- self.win_rate = {}
- self.win_rate['w'] = 0.0
- self.win_rate['b'] = 0.0
-
- self.view.root.update()
- self.cchess_engine = cchess_main(playout=self.play_playout, in_batch_size=batch_size, exploration=False, in_search_threads=search_threads,
- processor=processor, num_gpus=num_gpus, res_block_nums=res_block_nums, human_color=human_color)
-
- def player_is_red(self):
- return self.current_player == "w"
-
- def start(self):
- # below added by Fei Li
- self.view.showMsg("Red")
- if self.game_mode == 1:
- print ('-----Round %d-----' % self.cur_round)
- if self.players["w"] == "AI":
- self.win_rate['w'] = self.perform_AI()
- self.view.draw_board(self.board)
- self.change_player()
- elif self.game_mode == 2:
- print('-----Round %d-----' % self.cur_round)
- self.win_rate['w'] = self.perform_AI()
- self.view.draw_board(self.board)
-
- self.view.start()
-
- def disp_mcts_msg(self):
- self.view.showMsg("MCTS Searching...")
-
- def callback(self, event):
- if self.game_mode == 1 and self.players[self.current_player] == "AI":
- return
- if self.game_mode == 2:
- return
- rx, ry = real_coord(event.x), real_coord(event.y)
- # print(rx, ry)
- change, coord = self.board.select(rx, ry, self.player_is_red())
- if self.view.print_text_flag == True:
- self.view.print_text_flag = False
- self.view.can.create_image(0, 0, image=self.view.img, anchor=tkinter.NW)
- self.view.draw_board(self.board)
- if self.check_end():
- self.view.root.update()
- self.quit()
- return
- if change:
- if self.cur_round == 1 and self.human_color == 'w':
- self.view.showMsg("MCTS Searching...")
-
- self.win_rate[self.current_player] = self.cchess_engine.human_move(coord, self.ai_function)
- if self.check_end():
- self.view.root.update()
- self.quit()
- return
- performed = self.change_player()
- if performed:
- self.view.draw_board(self.board)
- if self.check_end():
- self.view.root.update()
- self.quit()
- return
- self.change_player()
-
-
- # below added by Fei Li
-
- def quit(self):
- time.sleep(self.end_delay)
- self.view.quit()
-
- def check_end(self):
- ret, winner = self.cchess_engine.check_end()
- if ret == True:
- if winner == "b":
- self.view.showMsg('*****Green Wins at Round %d*****' % self.cur_round)
- self.view.root.update()
- elif winner == "w":
- self.view.showMsg('*****Red Wins at Round %d*****' % self.cur_round)
- self.view.root.update()
- elif winner == "t":
- self.view.showMsg('*****Draw at Round %d*****' % self.cur_round)
- self.view.root.update()
- return ret
-
- def _check_end(self, board):
- red_king = False
- green_king = False
- pieces = board.pieces
- for (x, y) in pieces.keys():
- if pieces[x, y].is_king:
- if pieces[x, y].is_red:
- red_king = True
- else:
- green_king = True
- if not red_king:
- self.view.showMsg('*****Green Wins at Round %d*****' % self.cur_round)
- self.view.root.update()
- return True
- elif not green_king:
- self.view.showMsg('*****Red Wins at Round %d*****' % self.cur_round)
- self.view.root.update()
- return True
- elif self.cur_round >= 200:
- self.view.showMsg('*****Draw at Round %d*****' % self.cur_round)
- self.view.root.update()
- return True
- return False
-
- def change_player(self):
- self.current_player = "w" if self.current_player == "b" else "b"
- if self.current_player == "w":
- self.cur_round += 1
- print ('-----Round %d-----' % self.cur_round)
- red_msg = " ({:.4f})".format(self.win_rate['w'])
- green_msg = " ({:.4f})".format(self.win_rate['b'])
- sorted_move_probs = self.cchess_engine.get_hint(self.ai_function, True, self.disp_mcts_msg)
- # print(sorted_move_probs)
- self.view.print_all_hint(sorted_move_probs)
- # self.move_images.append(tkinter.PhotoImage(file="images/OOS.gif"))
- # self.can.create_image(board_coord(x), board_coord(y), image=self.move_images[-1])
-
- self.view.showMsg("Red" + red_msg + " Green" + green_msg if self.current_player == "w" else "Green" + green_msg + " Red" + red_msg)
- self.view.root.update()
- # if self.game_mode == 0:
- # return False
- if self.game_mode == 1:
- if self.players[self.current_player] == "AI":
- self.win_rate[self.current_player] = self.perform_AI()
- return True
- return False
- elif self.game_mode == 2:
- # if self.current_player == "w":
- # self.human_win_rate = self.perform_AI()
- # else:
- self.win_rate[self.current_player] = self.perform_AI()
- return True
- return False
-
- def perform_AI(self):
- print ('...AI is calculating...')
- START_TIME = time.clock()
- move, win_rate = self.cchess_engine.select_move(self.ai_function)
- time_used = time.clock() - START_TIME
- print ('...Use %fs...' % time_used)
- if self.current_player == "w":
- self.time_red.append(time_used)
- else:
- self.time_green.append(time_used)
- if move is not None:
- self.board.move(move[0], move[1], move[2], move[3])
- return win_rate
-
- # AI VS AI mode
- def game_mode_2(self):
- self.change_player()
- self.view.draw_board(self.board)
- self.view.root.update()
- if self.check_end():
- return True
- return False
-
-# game = ChessGame()
-# game.start()
diff --git a/Mastering_Chess_and_Shogi_by_Self-Play_with_a_General_Reinforcement_Learning_Algorithm.ipynb b/Mastering_Chess_and_Shogi_by_Self-Play_with_a_General_Reinforcement_Learning_Algorithm.ipynb
index 8d7c777..8d50c0a 100644
--- a/Mastering_Chess_and_Shogi_by_Self-Play_with_a_General_Reinforcement_Learning_Algorithm.ipynb
+++ b/Mastering_Chess_and_Shogi_by_Self-Play_with_a_General_Reinforcement_Learning_Algorithm.ipynb
@@ -13,7 +13,9 @@
"source": [
"[`程世东`](http://zhihu.com/people/cheng-shi-dong-47) 翻译\n",
"\n",
- "[`GitHub`](http://github.com/chengstone) [`Mail`](mailto:69558140@163.com)"
+ "[`GitHub`](http://github.com/chengstone) [`Mail`](mailto:69558140@163.com)\n",
+ "\n",
+ "[English paper](https://arxiv.org/pdf/1712.01815.pdf)"
]
},
{
@@ -104,7 +106,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "其中c是控制L2正则化水平的参数。更新的参数被用于随后的自我对弈中。"
+ "其中c是控制L2正则化水平的参数。更新的参数被用于随后的自我对弈中。\n",
+ "\n",
+ "Where c is the parameter that controls the L2 regularization level. The updated parameters are used in the subsequent self-play."
]
},
{
@@ -167,8 +171,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
- "\n",
+ "\n",
"图1:训练AlphaZero 70万步。国际等级分是在不同的玩家之间的比赛进行评估计算出来的,每一步棋有1秒的思考时间。a国际象棋中AlphaZero的表现,与2016年TCEC世界冠军程序Stockfish比较。b在将棋中AlphaZero的表现,与2017年CSA世界冠军程序Elmo比较。c 在围棋中AlphaZero的表现,与AlphaGo Lee和AlphaGo Zero(20 block / 3天)比较。"
]
},
@@ -184,7 +187,7 @@
"metadata": {},
"source": [
"\n",
- "\n",
+ "\n",
"表1: 在国际象棋,将棋和围棋中评估AlphaZero,以AlphaZero的角度的胜平负,与Stockfish, Elmo,和训练了三天的AlphaGo Zero进行100场比赛。每个程序下一步棋有1分钟的思考时间。"
]
},
@@ -213,22 +216,24 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- ""
+ ""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
- "表2:分析12个最受欢迎的人类开局(在线数据库中出现超过10万次)。每个开局标有其ECO代码和通用名称。该图显示了AlphaZero在自我训练比赛时使用的每次开局的比例。我们还从AlphaZero的角度报告了从每个开局开始与Stockfish 100场比赛的胜负/平局/失败结果,无论是白色(W)还是黑色(B)。最后,从每个开局提供AlphaZero的主要变着(PV)。"
+ "\n",
+ "表2:分析12个最受欢迎的人类开局(在线数据库中出现超过10万次)。每个开局标有其ECO代码和通用名称。该图显示了AlphaZero在自我训练比赛时使用的每次开局的比例。我们还从AlphaZero的角度报告了从每个开局开始与Stockfish 100场比赛的胜负/平局/失败结果,无论是白色(W)还是黑色(B)。最后,从每个开局提供AlphaZero的主要变着(PV)。\n",
+ "\n",
+ "Table 2: Analysis of the 12 most popular human openings (more than 100,000 occurrences in online databases). Each opening is marked with its ECO code and common name. The graph shows the ratio of each opening used by AlphaZero in self-training competitions. We also reported the results of 100 matches with Stockfish from the perspective of AlphaZero, regardless of whether it was white (W) or black (B). Finally, the main change (PV) of AlphaZero is provided from each opening."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
+ "\n",
"图2:关于AlphaZero思考时间的可扩展性,以国际等级分衡量。a在国际象棋中的AlphaZero和Stockfish的表现,描画每一步的思考时间。b在将棋中AlphaZero和Elmo的表现,描画每一步的思考时间。"
]
},
@@ -257,14 +262,17 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "每个局面s由手工特征φ(s)的稀疏向量描述,包括特定中局/残局[`子力`](http://chessprogramming.wikispaces.com/material)(译者注:由每条线上棋子价值的总和确定的一个术语。所有的棋子和兵。[`子力优势`](https://baike.baidu.com/item/%E5%9B%BD%E9%99%85%E8%B1%A1%E6%A3%8B%E6%9C%AF%E8%AF%AD/7549734?fr=aladdin)是棋手在棋盘上有比对手更多的棋子或棋子的价值更大。)点的价值,[`子力不平衡表`](http://chessprogramming.wikispaces.com/Material+Tables)(译者注:例如 车vs两个[`轻子`](https://baike.baidu.com/item/%E8%BD%BB%E5%AD%90)(Minor pieces:象和马),皇后vs两个车或三个轻子,三个兵vs普通棋子),[`Piece-Square表`](http://chessprogramming.wikispaces.com/Piece-Square+Tables)(译者注:给特定位置上的特定棋子分配一个值),[`机动性`](http://chessprogramming.wikispaces.com/Mobility)(译者注:衡量一个玩家在一个给定的位置上合法移动的选择数量,[`棋子的行动自由`](https://baike.baidu.com/item/%E5%9B%BD%E9%99%85%E8%B1%A1%E6%A3%8B%E6%9C%AF%E8%AF%AD/7549734?fr=aladdin)。)和[`被困棋子`](http://chessprogramming.wikispaces.com/Trapped+Pieces)(译者注:被困棋子是移动性差的极端例子),[`兵型`](http://chessprogramming.wikispaces.com/Pawn+Structure)(译者注:用来描述棋盘上所有兵的位置,忽略所有其他棋子。也指兵骨架。所有兵的位置的各个方面。),[`国王安全性`](http://chessprogramming.wikispaces.com/King+Safety),[`前哨`](http://chessprogramming.wikispaces.com/Outposts)(译者注:通常与马在棋盘中心或敌方一侧有关的国际象棋术语,被自己的棋子保护不再受对手棋子的攻击,或者在半开放线削弱对手的棋子,不再徒劳无功),[`双象`](https://en.wikipedia.org/wiki/Glossary_of_chess#Bishop_pair)(译者注:棋手是否有两个象),和其他复杂的评估 模型。通过手动和自动调整的组合,每个特征$φ_i$被分配相应的权重$w_i$,并且通过线性组合$v(s,w)=φ(s)^T w$来评估局面。然而,对于安全的位置,这个原始评估仅被认为是准确的,不包括未解决的[`吃子`](http://chessprogramming.wikispaces.com/Captures)和[`将军`](http://chessprogramming.wikispaces.com/Check)。在应用评估函数之前,使用领域专用的[`静止搜索`](http://chessprogramming.wikispaces.com/Quiescence+Search)来解决正在进行的战术局势。"
+ "每个局面s由手工特征φ(s)的稀疏向量描述,包括特定中局/残局[`子力`](http://chessprogramming.wikispaces.com/material)(译者注:由每条线上棋子价值的总和确定的一个术语。所有的棋子和兵。[`子力优势`](https://baike.baidu.com/item/%E5%9B%BD%E9%99%85%E8%B1%A1%E6%A3%8B%E6%9C%AF%E8%AF%AD/7549734?fr=aladdin)是棋手在棋盘上有比对手更多的棋子或棋子的价值更大。)点的价值,[`子力不平衡表`](http://chessprogramming.wikispaces.com/Material+Tables)(译者注:例如 车vs两个[`轻子`](https://baike.baidu.com/item/%E8%BD%BB%E5%AD%90)(Minor pieces:象和马),皇后vs两个车或三个轻子,三个兵vs普通棋子),[`Piece-Square表`](http://chessprogramming.wikispaces.com/Piece-Square+Tables)(译者注:给特定位置上的特定棋子分配一个值),[`机动性`](http://chessprogramming.wikispaces.com/Mobility)(译者注:衡量一个玩家在一个给定的位置上合法移动的选择数量,[`棋子的行动自由`](https://baike.baidu.com/item/%E5%9B%BD%E9%99%85%E8%B1%A1%E6%A3%8B%E6%9C%AF%E8%AF%AD/7549734?fr=aladdin)。)和[`被困棋子`](http://chessprogramming.wikispaces.com/Trapped+Pieces)\n",
+ "\n",
+ "\n",
+ "(译者注:被困棋子是移动性差的极端例子),[`兵型`](http://chessprogramming.wikispaces.com/Pawn+Structure)(译者注:用来描述棋盘上所有兵的位置,忽略所有其他棋子。也指兵骨架。所有兵的位置的各个方面。),[`国王安全性`](http://chessprogramming.wikispaces.com/King+Safety),[`前哨`](http://chessprogramming.wikispaces.com/Outposts)(译者注:通常与马在棋盘中心或敌方一侧有关的国际象棋术语,被自己的棋子保护不再受对手棋子的攻击,或者在半开放线削弱对手的棋子,不再徒劳无功),[`双象`](https://en.wikipedia.org/wiki/Glossary_of_chess#Bishop_pair)(译者注:棋手是否有两个象),和其他复杂的评估 模型。通过手动和自动调整的组合,每个特征$φ_i$被分配相应的权重$w_i$,并且通过线性组合$v(s,w)=φ(s)^T w$来评估局面。然而,对于安全的位置,这个原始评估仅被认为是准确的,不包括未解决的[`吃子`](http://chessprogramming.wikispaces.com/Captures)和[`将军`](http://chessprogramming.wikispaces.com/Check)。在应用评估函数之前,使用领域专用的[`静止搜索`](http://chessprogramming.wikispaces.com/Quiescence+Search)来解决正在进行的战术局势。"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "局面s的最终评估是通过使用静止搜索评估每个叶子的极小极大搜索来计算的。alpha-beta剪枝被用来安全地剪切任何可能被另一个变着控制的分支。额外的剪切是使用愿望窗口和主要变着搜索实现的。其他剪枝策略包括无效走子修剪(假定走子以后结果比任何变着还要差),徒劳修剪(假设知道评估中可能的最大变着),和其他依赖于领域的修剪规则(假设知道被吃棋子的价值)。"
+    "局面s的最终评估是通过使用静止搜索评估每个叶子的极小极大搜索来计算的。alpha-beta剪枝被用来安全地剪切任何可能被另一个变着控制的分支。额外的剪切是使用愿望窗口和主要变着搜索实现的。其他剪枝策略包括无效走子修剪(假定走子以后结果比任何变着还要差),徒劳修剪(假设知道评估中可能的最大变着),和其他依赖于领域的修剪规则(假设知道被吃棋子的价值)。"
]
},
{
@@ -285,7 +293,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "[`换位表`](http://chessprogramming.wikispaces.com/Transposition+Table)(译者注:是存储先前执行的搜索的结果的数据库)便于重复使用在多个路径达到相同位置时的下棋顺序和值。经过仔细调整的开局库用于在棋局开始时选择走子。通过对残局位置的彻底逆向分析预先设计的残局库,在六个、有时七个或更少的所有位置提供最佳的走子。"
+    "[`换位表`](http://chessprogramming.wikispaces.com/Transposition+Table)(译者注:是存储先前执行的搜索的结果的数据库;该站点现已关闭)便于重复使用在多个路径达到相同位置时的下棋顺序和值。经过仔细调整的开局库用于在棋局开始时选择走子。通过对残局位置的彻底逆向分析预先设计的残局库,在六个、有时七个或更少的所有位置提供最佳的走子。"
]
},
{
@@ -404,7 +412,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### 领域知识"
+ "### 领域知识 Domain knowledge"
]
},
{
@@ -413,9 +421,9 @@
"source": [
"- 1.描述位置的输入特征和描述走子的输出特征被构造为一组平面;即神经网络结构与棋盘的网格结构相匹配。\n",
"- 2.为AlphaZero提供了完善的游戏规则知识。这些在MCTS期间被用来模拟由一系列走子产生的位置,以确定游戏的结束,并对达到结束状态的任何模拟对弈进行评分。\n",
- "- 3.对规则的了解也被用来编码输入平面(即[`王车易位`](http://chessprogramming.wikispaces.com/Castling),[`重复局面`](https://baike.baidu.com/item/%E5%9B%BD%E9%99%85%E8%B1%A1%E6%A3%8B%E6%9C%AF%E8%AF%AD/7549734?fr=aladdin),没有进展)和输出平面(棋子如何走子,升变和将棋中的[`取驹`](https://baike.baidu.com/item/%E5%B0%86%E6%A3%8B/491643)([`piece drops`](https://en.wikipedia.org/wiki/Shogi#Drops)))。\n",
+ "- 3.对规则的了解也被用来编码输入平面(即[`王车易位`](Castling),[`重复局面`](https://baike.baidu.com/item/%E5%9B%BD%E9%99%85%E8%B1%A1%E6%A3%8B%E6%9C%AF%E8%AF%AD/7549734?fr=aladdin),没有进展)和输出平面(棋子如何走子,升变和将棋中的[`取驹`](https://baike.baidu.com/item/%E5%B0%86%E6%A3%8B/491643)([`piece drops`](https://en.wikipedia.org/wiki/Shogi#Drops)))。\n",
"- 4.合法走子的典型数量用于缩放探索噪音(见下文)。\n",
- "- 5.国际象棋和将棋比赛超过最大步数(由典型比赛长度决定)将被终止,并被判为平局;围棋比赛结束,使用Tromp-Taylor规则打分。\n"
+ "- 5.国际象棋和将棋比赛超过最大步数(由典型比赛长度决定)将被终止,并被判为平局;围棋比赛结束,使用Tromp-Taylor规则打分。"
]
},
{
@@ -429,7 +437,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### 表示"
+ "### 表示 Expression"
]
},
{
@@ -443,7 +451,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
+ "\n",
"表S1:分别在围棋,国际象棋和将棋中AlphaZero使用的输入特征。第一组特征是8步历史走子记录的每个局面。计数由实数值表示;其他输入特征通过使用指定数量的二值输入平面的独热编码来表示。当前玩家由P1表示,对手由P2表示。"
]
},
@@ -479,7 +487,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
+ "\n",
"表S2:国际象棋和将棋中AlphaZero使用的动作表示。该策略是由一堆编码合法走子概率分布的平面表示的;平面对应于表中的条目。"
]
},
@@ -515,7 +523,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
+ "\n",
"表S3:国际象棋,将棋和围棋中AlphaZero训练的选择统计。"
]
},
@@ -530,7 +538,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### 评估"
+ "### 评估 Evaluate"
]
},
{
@@ -558,14 +566,21 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
+ "\n",
"表S4:国际象棋、将棋和围棋中AlphaZero,Stockfish和Elmo的评估速度(局面/秒)"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -579,7 +594,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.7"
+ "version": "3.7.3"
}
},
"nbformat": 4,
diff --git a/Mastering_the_Game_of_Go_without_Human_Knowledge.ipynb b/Mastering_the_Game_of_Go_without_Human_Knowledge.ipynb
index 6626403..6227ce8 100644
--- a/Mastering_the_Game_of_Go_without_Human_Knowledge.ipynb
+++ b/Mastering_the_Game_of_Go_without_Human_Knowledge.ipynb
@@ -823,7 +823,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -837,7 +837,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.7"
+ "version": "3.7.3"
}
},
"nbformat": 4,
diff --git a/README.ch.md b/README.ch.md
new file mode 100644
index 0000000..76ac827
--- /dev/null
+++ b/README.ch.md
@@ -0,0 +1,58 @@
+
+# cchess-zero
+AlphaZero implemented Chinese chess. AlphaGo Zero / AlphaZero实践项目,实现中国象棋。
+
+__Author__ chengstone
+
+__e-Mail__ 69558140@163.com
+
+代码详解请参见文内jupyter notebook和↓↓↓
+
+知乎专栏:https://zhuanlan.zhihu.com/p/34433581
+
+博客:http://blog.csdn.net/chengcheng1394/article/details/79526474
+
+欢迎转发扩散 ^_^
+
+这是一个AlphaZero的实践项目,实现了一个中国象棋程序,使用TensorFlow1.0和Python 3.5开发,还要安装uvloop。
+
+因为我的模型训练的不充分,只训练了不到4K次,模型刚刚学会用象和士防守,总之仍然下棋很烂。
+
+如果您有条件可以再多训练试试,我自从收到信用卡扣款400美元通知以后就把aws下线了:D 贫穷限制了我的想象力O(∩_∩)O
+
+我训练的模型文件下载地址:https://pan.baidu.com/s/1dLvxFFpeWZK-aZ2Koewrvg
+
+解压后放到项目根目录下即可,文件夹名叫做gpu_models
+
+现在介绍下命令如何使用:
+
+命令分为两类,一类是训练,一类是下棋。
+
+训练专用:
+
+ - --mode 指定是训练(train)还是下棋(play),默认是训练
+ - --train_playout 指定MCTS的模拟次数,论文中是1600,我做训练时使用1200
+ - --batch_size 指定训练数据达到多少时开始训练,默认512
+ - --search_threads 指定执行MCTS时的线程个数,默认16
+ - --processor 指定是使用cpu还是gpu,默认是cpu
+ - --num_gpus 指定gpu的个数,默认是1
+ - --res_block_nums 指定残差块的层数,论文中是19或39层,我默认是7
+
+下棋专用:
+
+ - --ai_count 指定ai的个数,1是人机对战,2是看两个ai下棋
+ - --ai_function 指定ai的下棋方法,是思考(mcts,会慢),还是直觉(net,下棋快)
+ - --play_playout 指定ai进行MCTS的模拟次数
+ - --delay和--end_delay默认就好,两个ai下棋太快,就不知道俩ai怎么下的了:)
+ - --human_color 指定人类棋手的颜色,w是先手,b是后手
+
+训练命令举例:
+
+python main.py --mode train --train_playout 1200 --batch_size 512 --search_threads 16 --processor gpu --num_gpus 2 --res_block_nums 7
+
+下棋命令举例:
+
+python main.py --mode play --ai_count 1 --ai_function mcts --play_playout 1200 --human_color w
+
+# 许可
+Licensed under the MIT License with the [`996ICU License`](https://github.com/996icu/996.ICU/blob/master/LICENSE).
diff --git a/README.en.md b/README.en.md
new file mode 100644
index 0000000..6a2be57
--- /dev/null
+++ b/README.en.md
@@ -0,0 +1,60 @@
+
+# cchess-zero
+AlphaZero implemented Chinese chess. AlphaGo Zero / AlphaZero Practical project to realize Chinese chess
+
+__Author__ chengstone
+
+__e-Mail__ 69558140@163.com
+
+For a detailed explanation of the code, please refer to the jupyter notebook and ↓↓↓
+
+知乎专栏:https://zhuanlan.zhihu.com/p/34433581
+
+Blog:http://blog.csdn.net/chengcheng1394/article/details/79526474
+
+Welcome to forward and spread ^_^
+
+This is a practical project of AlphaZero, which implements a Chinese chess program, developed with TensorFlow1.0 and Python 3.5, and also installs uvloop.
+
+Because my model is not fully trained, I only trained it less than 4K times. The model has just learned to use the bishop and advisor to defend. In short, it still plays chess badly.
+
+If you have the conditions, you can try more training. I have taken aws offline since I received the 400 USD credit card charge notification: D Poverty limits my imagination O(∩_∩)O
+
+Download address of the model file I trained: https://pan.baidu.com/s/1dLvxFFpeWZK-aZ2Koewrvg
+
+After decompression, put it in the root directory of the project, the folder name is called gpu_models
+
+Here is how to use the commands:
+
+Commands are divided into two categories, one is training and the other is chess.
+
+Training dedicated:
+
+ - --mode specifies whether it is training (train) or chess (play), the default is training
+ - --train_playout specifies the simulation times of MCTS, 1600 in the paper, I use 1200 when I do training
+ - --batch_size specifies how much training data to start training, the default is 512
+ - --search_threads specifies the number of threads to execute MCTS, the default is 16
+ - --processor Specify whether to use cpu or gpu, the default is cpu
+ - --num_gpus specifies the number of gpus, the default is 1
+ - --res_block_nums specifies the number of layers of the residual block, in the paper it is 19 or 39 layers, my default is 7
+
+
+For Chess:
+
+ - --ai_count specifies the number of ai, 1 is human-computer battle, 2 is to watch two ai play chess
+ - --ai_function Specifies the chess method of ai, whether it is thinking (mcts, slow) or intuition (net, fast chess)
+ - --play_playout specifies the number of MCTS simulations performed by ai
+ - --delay and --end_delay are just fine by default. The two ai play chess too fast, so I don't know how the two ai played :)
+ - --human_color specifies the color of the human player: w moves first, b moves second
+
+
+Examples of training commands:
+
+python main.py --mode train --train_playout 1200 --batch_size 512 --search_threads 16 --processor gpu --num_gpus 2 --res_block_nums 7
+
+Examples of chess commands:
+
+python main.py --mode play --ai_count 1 --ai_function mcts --play_playout 1200 --human_color w
+
+# License
+Licensed under the MIT License with the [`996ICU License`](https://github.com/996icu/996.ICU/blob/master/LICENSE).
\ No newline at end of file
diff --git a/README.md b/README.md
index 76ac827..22158d8 100644
--- a/README.md
+++ b/README.md
@@ -1,58 +1,60 @@
# cchess-zero
-AlphaZero implemented Chinese chess. AlphaGo Zero / AlphaZero实践项目,实现中国象棋。
+AlphaZero implemented Chinese chess. AlphaGo Zero / AlphaZero Practical project to realize Chinese chess
__Author__ chengstone
__e-Mail__ 69558140@163.com
-代码详解请参见文内jupyter notebook和↓↓↓
+For a detailed explanation of the code, please refer to the jupyter notebook and ↓↓↓
知乎专栏:https://zhuanlan.zhihu.com/p/34433581
-博客:http://blog.csdn.net/chengcheng1394/article/details/79526474
+Blog:http://blog.csdn.net/chengcheng1394/article/details/79526474
-欢迎转发扩散 ^_^
+Welcome to forward and spread ^_^
-这是一个AlphaZero的实践项目,实现了一个中国象棋程序,使用TensorFlow1.0和Python 3.5开发,还要安装uvloop。
+This is a practical project of AlphaZero, which implements a Chinese chess program, developed with TensorFlow1.0 and Python 3.5, and also installs uvloop.
-因为我的模型训练的不充分,只训练了不到4K次,模型刚刚学会用象和士防守,总之仍然下棋很烂。
+Because my model is not fully trained, I only trained it less than 4K times. The model has just learned to use elephants and soldiers to defend. In short, I still play chess badly.
-如果您有条件可以再多训练试试,我自从收到信用卡扣款400美元通知以后就把aws下线了:D 贫穷限制了我的想象力O(∩_∩)O
+If you have the conditions, you can try more training. I have taken aws offline since I received the 400 USD credit card charge notification: D Poverty limits my imagination O(∩_∩)O
-我训练的模型文件下载地址:https://pan.baidu.com/s/1dLvxFFpeWZK-aZ2Koewrvg
+Download address of the model file I trained: https://pan.baidu.com/s/1dLvxFFpeWZK-aZ2Koewrvg
-解压后放到项目根目录下即可,文件夹名叫做gpu_models
+After decompression, put it in the root directory of the project, the folder name is called gpu_models
-现在介绍下命令如何使用:
+Now introduce how to use the following command:
-命令分为两类,一类是训练,一类是下棋。
+Commands are divided into two categories, one is training and the other is chess.
-训练专用:
+Training dedicated:
- - --mode 指定是训练(train)还是下棋(play),默认是训练
- - --train_playout 指定MCTS的模拟次数,论文中是1600,我做训练时使用1200
- - --batch_size 指定训练数据达到多少时开始训练,默认512
- - --search_threads 指定执行MCTS时的线程个数,默认16
- - --processor 指定是使用cpu还是gpu,默认是cpu
- - --num_gpus 指定gpu的个数,默认是1
- - --res_block_nums 指定残差块的层数,论文中是19或39层,我默认是7
+ - --mode specifies whether it is training (train) or chess (play), the default is training
+ - --train_playout specifies the simulation times of MCTS, 1600 in the paper, I use 1200 when I do training
+ - --batch_size specifies how much training data to start training, the default is 512
+ - --search_threads specifies the number of threads to execute MCTS, the default is 16
+ - --processor specifies whether to use cpu or gpu, the default is cpu
+ - --num_gpus specifies the number of gpus, the default is 1
+ - --res_block_nums specifies the number of layers of the residual block, in the paper it is 19 or 39 layers, my default is 7
-下棋专用:
- - --ai_count 指定ai的个数,1是人机对战,2是看两个ai下棋
- - --ai_function 指定ai的下棋方法,是思考(mcts,会慢),还是直觉(net,下棋快)
- - --play_playout 指定ai进行MCTS的模拟次数
- - --delay和--end_delay默认就好,两个ai下棋太快,就不知道俩ai怎么下的了:)
- - --human_color 指定人类棋手的颜色,w是先手,b是后手
+For Chess:
-训练命令举例:
+ - --ai_count specifies the number of ai, 1 is human-computer battle, 2 is to watch two ai play chess
+ - --ai_function Specifies the chess method of ai, whether it is thinking (mcts, slow) or intuition (net, fast chess)
+ - --play_playout specifies the number of MCTS simulations performed by ai
+ - --delay and --end_delay are just fine by default. The two ai play chess too fast, so I don't know how the two ai played :)
+ - --human_color specifies the color of the human chess player, w is the first move, b is the second move
+
+
+Examples of training commands:
python main.py --mode train --train_playout 1200 --batch_size 512 --search_threads 16 --processor gpu --num_gpus 2 --res_block_nums 7
-下棋命令举例:
+Examples of chess commands:
python main.py --mode play --ai_count 1 --ai_function mcts --play_playout 1200 --human_color w
-# 许可
+# License
Licensed under the MIT License with the [`996ICU License`](https://github.com/996icu/996.ICU/blob/master/LICENSE).
diff --git a/__pycache__/ChessBoard.cpython-312.pyc b/__pycache__/ChessBoard.cpython-312.pyc
new file mode 100644
index 0000000..c16d9cb
Binary files /dev/null and b/__pycache__/ChessBoard.cpython-312.pyc differ
diff --git a/__pycache__/ChessGame.cpython-312.pyc b/__pycache__/ChessGame.cpython-312.pyc
new file mode 100644
index 0000000..dfde8a9
Binary files /dev/null and b/__pycache__/ChessGame.cpython-312.pyc differ
diff --git a/__pycache__/ChessPiece.cpython-312.pyc b/__pycache__/ChessPiece.cpython-312.pyc
new file mode 100644
index 0000000..4fd00a1
Binary files /dev/null and b/__pycache__/ChessPiece.cpython-312.pyc differ
diff --git a/__pycache__/ChessView.cpython-312.pyc b/__pycache__/ChessView.cpython-312.pyc
new file mode 100644
index 0000000..7d60bf2
Binary files /dev/null and b/__pycache__/ChessView.cpython-312.pyc differ
diff --git a/__pycache__/main.cpython-312.pyc b/__pycache__/main.cpython-312.pyc
new file mode 100644
index 0000000..a65f69e
Binary files /dev/null and b/__pycache__/main.cpython-312.pyc differ
diff --git a/__pycache__/policy_value_network.cpython-312.pyc b/__pycache__/policy_value_network.cpython-312.pyc
new file mode 100644
index 0000000..ac31730
Binary files /dev/null and b/__pycache__/policy_value_network.cpython-312.pyc differ
diff --git a/assets/a1.png b/assets/a1.png
index 18fd836..7019a37 100644
Binary files a/assets/a1.png and b/assets/a1.png differ
diff --git a/assets/a10.png b/assets/a10.png
index a878b9d..183098d 100644
Binary files a/assets/a10.png and b/assets/a10.png differ
diff --git a/assets/a11.png b/assets/a11.png
index 9795e0b..a835c7f 100644
Binary files a/assets/a11.png and b/assets/a11.png differ
diff --git a/assets/a2.png b/assets/a2.png
index 50cdac0..b0fea35 100644
Binary files a/assets/a2.png and b/assets/a2.png differ
diff --git a/assets/a4.png b/assets/a4.png
index eb96410..47666ac 100644
Binary files a/assets/a4.png and b/assets/a4.png differ
diff --git a/assets/a5.png b/assets/a5.png
index 1fd1a34..060a276 100644
Binary files a/assets/a5.png and b/assets/a5.png differ
diff --git a/assets/a6.png b/assets/a6.png
index 1f54243..c9b78ff 100644
Binary files a/assets/a6.png and b/assets/a6.png differ
diff --git a/assets/a7.png b/assets/a7.png
index fa83f5a..0d3e606 100644
Binary files a/assets/a7.png and b/assets/a7.png differ
diff --git a/assets/a8.png b/assets/a8.png
index bdaee3d..3e539ea 100644
Binary files a/assets/a8.png and b/assets/a8.png differ
diff --git a/assets/a9.png b/assets/a9.png
index 8369447..04d1312 100644
Binary files a/assets/a9.png and b/assets/a9.png differ
diff --git a/assets/b2.png b/assets/b2.png
index 92127e5..741d0b2 100644
Binary files a/assets/b2.png and b/assets/b2.png differ
diff --git a/assets/b6.png b/assets/b6.png
index d09dfc8..d8a81f7 100644
Binary files a/assets/b6.png and b/assets/b6.png differ
diff --git a/assets/b7.png b/assets/b7.png
index c2bb7e2..9d0c05d 100644
Binary files a/assets/b7.png and b/assets/b7.png differ
diff --git a/assets/b8.png b/assets/b8.png
index d75491b..211e8f5 100644
Binary files a/assets/b8.png and b/assets/b8.png differ
diff --git a/assets/b9.png b/assets/b9.png
index 6857c5f..44f236e 100644
Binary files a/assets/b9.png and b/assets/b9.png differ
diff --git a/assets/c1.png b/assets/c1.png
index deab1ad..b0530ab 100644
Binary files a/assets/c1.png and b/assets/c1.png differ
diff --git a/assets/c2.png b/assets/c2.png
index d9df84e..38a9af6 100644
Binary files a/assets/c2.png and b/assets/c2.png differ
diff --git a/assets/c3.png b/assets/c3.png
index 45d1f5d..05adaa2 100644
Binary files a/assets/c3.png and b/assets/c3.png differ
diff --git a/cchess-zero.ipynb b/cchess-zero.ipynb
index 229469e..be3d1c2 100644
--- a/cchess-zero.ipynb
+++ b/cchess-zero.ipynb
@@ -4,7 +4,8 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# AlphaZero实践——中国象棋(附论文翻译)"
+ "# AlphaZero实践——中国象棋(附论文翻译)\n",
+ "AlphaZero Practice-Chinese Chess (with paper translation)"
]
},
{
@@ -12,43 +13,53 @@
"metadata": {},
"source": [
"作者:[`程世东`](http://zhihu.com/people/cheng-shi-dong-47)\n",
- "[`GitHub`](https://github.com/chengstone) [`Mail`](mailto:69558140@163.com)"
+ "[`GitHub`](https://github.com/chengstone) [`Mail`](mailto:69558140@163.com)\n",
+ "\n",
+ "Translation to English by [`ycechungAI`](https://github.com/ycechungAI)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "关于AlphaGo和后续的版本AlphaGo Zero等新闻大家都耳熟能详了,今天我们从论文的分析,并结合代码来一起讨论下AlphaZero在中国象棋上的实践。"
+ "关于AlphaGo和后续的版本AlphaGo Zero等新闻大家都耳熟能详了,今天我们从论文的分析,并结合代码来一起讨论下AlphaZero在中国象棋上的实践。\n",
+ "\n",
+ "Everyone is familiar with news about AlphaGo and the subsequent version AlphaGo Zero. Today we will discuss the practice of AlphaZero in Chinese chess from the analysis of the paper and combined with the code."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "实际上在GitHub上能够看到有很多关于AlphaGo的实践项目,包括[`国际象棋`](https://github.com/Zeta36/chess-alpha-zero)、[`围棋`](https://github.com/gcp/leela-zero)、[`五子棋`](https://github.com/junxiaosong/AlphaZero_Gomoku)、[`黑白棋`](https://github.com/mokemokechicken/reversi-alpha-zero)等等,我有个好友在实践麻将。"
+ "实际上在GitHub上能够看到有很多关于AlphaGo的实践项目,包括[`国际象棋`](https://github.com/Zeta36/chess-alpha-zero)、[`围棋`](https://github.com/gcp/leela-zero)、[`五子棋`](https://github.com/junxiaosong/AlphaZero_Gomoku)、[`黑白棋`](https://github.com/mokemokechicken/reversi-alpha-zero)等等,我有个好友在实践麻将。\n",
+ "\n",
+ "In fact, you can see many practical projects about AlphaGo on GitHub, including [`Chess`](https://github.com/Zeta36/chess-alpha-zero), [`Go`](https://github.com/gcp/leela-zero), [`Gomoku`](https://github.com/junxiao/AlphaZero_Gomoku), [`Othello`](https://github.com/mokemokechicken/reversi-alpha-zero) Wait, I have a friend who is practicing Mahjong."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "从算法上来说,大家都是基于AlphaGo Zero / AlphaZero的论文来实现的,差别在于不同Game的规则和使用了不同的trick。"
+ "从算法上来说,大家都是基于AlphaGo Zero / AlphaZero的论文来实现的,差别在于不同Game的规则和使用了不同的trick。\n",
+ "\n",
+ "In terms of algorithms, everyone is based on AlphaGo Zero / AlphaZero's papers. The difference lies in the rules of different games and the use of different tricks."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## 论文分析"
+ "## 论文分析 Paper Analysis"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "我们要参考的就是AlphaGo Zero的论文《[`Mastering the Game of Go without Human Knowledge`](https://deepmind.com/documents/119/agz_unformatted_nature.pdf)》和AlphaZero的论文《[`Mastering Chess and Shogi by Self-Play with a General Reinforcement Learning Algorithm`](https://arxiv.org/pdf/1712.01815.pdf\n",
- ")》。"
+ "我们要参考的就是AlphaGo Zero的论文《[`Mastering the Game of Go without Human Knowledge`](https://web.archive.org/web/20171025100035/https://deepmind.com/documents/119/agz_unformatted_nature.pdf)》和AlphaZero的论文《[`Mastering Chess and Shogi by Self-Play with a General Reinforcement Learning Algorithm`](https://arxiv.org/pdf/1712.01815.pdf\n",
+ ")》。\n",
+ "\n",
+ "What we want to refer to is AlphaGo Zero's paper \"[`Mastering the Game of Go without Human Knowledge`](https://web.archive.org/web/20171025100035/https://deepmind.com/documents/119/agz_unformatted_nature.pdf)\" and AlphaZero's paper \"[`Mastering Chess and Shogi by Self-Play with a General Reinforcement Learning Algorithm`](https://arxiv.org/pdf/1712.01815.pdf)\"."
]
},
{
@@ -57,11 +68,18 @@
"source": [
"小弟不才,献丑翻译了这两篇论文,时间仓促,水平有限✧(≖ ◡ ≖✿),您要是看不惯英文,希望这两篇翻译能提供些许帮助。\n",
"\n",
+ "I’m not talented. The time is short and the level is limited ✧(≖ ◡ ≖✿), if you are not familiar with English, I hope these two translations can provide some help.\n",
+ "\n",
"[`《Mastering the Game of Go without Human Knowledge》`](https://github.com/chengstone/cchess-zero/blob/master/Mastering_the_Game_of_Go_without_Human_Knowledge.ipynb)\n",
"\n",
"[`《Mastering Chess and Shogi by Self-Play with a General Reinforcement Learning Algorithm》`](https://github.com/chengstone/cchess-zero/blob/master/Mastering_Chess_and_Shogi_by_Self-Play_with_a_General_Reinforcement_Learning_Algorithm.ipynb)\n",
"\n",
- "建议在本地用jupyter notebook打开看,我发现从GitHub上看的话,有些公式没有显示出来,另外图片也没有显示出来。"
+ "\n",
+ "建议在本地用jupyter notebook打开看,我发现从GitHub上看的话,有些公式没有显示出来,另外图片也没有显示出来。\n",
+ "\n",
+ "It is recommended to use jupyter notebook to open it locally. I found that some formulas are not displayed when I look at it on GitHub, and the pictures are not displayed.\n",
+ "\n",
+ "(Thank you so much for your kind help! ^_^)"
]
},
{
@@ -82,14 +100,23 @@
"我们是要抛弃人类棋谱的,学会如何下棋完全是通过自对弈来完成。\n",
"\n",
"过程是这样,首先生成棋谱,然后将棋谱作为输入训练神经网络,训练好的神经网络用来预测落子和胜率。如下图:\n",
- ""
+ "\n",
+ "Let's start with \"Mastering the Game of Go without Human Knowledge\", the algorithm is implemented according to this paper, and AlphaZero has only a few differences.\n",
+ "\n",
+ "In general, AlphaGo Zero is divided into two parts, one is MCTS (Monte Carlo Tree Search), and the other is neural network.\n",
+ "\n",
+ "We are going to abandon human chess records, and learning how to play chess is done entirely through self-play.\n",
+ "\n",
+ "The process is like this, first generate a chess record, and then use the chess record as an input to train the neural network, and the trained neural network is used to predict the move and the winning rate. As shown below:\n",
+ "\n",
+ ""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "### 蒙特卡洛树搜索算法"
+ "### 蒙特卡洛树搜索算法 Monte Carlo Tree Search Algorithm"
]
},
{
@@ -97,29 +124,40 @@
"metadata": {},
"source": [
"MCTS就是用来自对弈生成棋谱的,结合论文中的图示进行说明:\n",
- "\n",
+ "MCTS is used to generate chess records from the game, combined with the illustrations in the paper to illustrate:\n",
+ "\n",
"论文中的描述:\n",
"\n",
- "AlphaGo Zero中的蒙特卡洛树搜索。\n",
+ "AlphaGo Zero中的蒙特卡洛树搜索。 Monte Carlo tree search in AlphaGo Zero.\n",
"\n",
"- a.每次模拟通过选择具有最大行动价值Q的边加上取决于所存储的先验概率P和该边的访问计数N(每次访问都被增加一次)的上限置信区间U来遍历树。\n",
"- b.展开叶子节点,通过神经网络(P(s, ·), V (s)) = $f_θ(s)$来评估局面s;向量P的值存储在叶子结点扩展的边上。\n",
"- c.更新行动价值Q等于在该行动下的子树中的所有评估值V的均值。\n",
- "- d.一旦MCTS搜索完成,返回局面s下的落子概率π,与$N^{1 /τ}$成正比,其中N是从根状态每次移动的访问计数, τ是控制温度的参数。"
+ "- d.一旦MCTS搜索完成,返回局面s下的落子概率π,与$N^{1 /τ}$成正比,其中N是从根状态每次移动的访问计数, τ是控制温度的参数。\n",
+ "\n",
+ "\n",
+ "- a Each simulation traverses the tree by selecting the edge with maximum action-value Q, plus an upper confidence bound U that depends on a stored prior probability P and visit count N for that edge (which is incremented once traversed).\n",
+ "- b The leaf node is expanded and the associated position s is evaluated by the neural network (P (s, ·), V (s)) = f θ (s); the vector of P values are stored in the outgoing edges from s.\n",
+ "- c Action-values Q are updated to track the mean of all evaluations V in the subtree below that action.\n",
+ "- d Once the search is complete, search probabilities π are returned, proportional to N 1/τ , where N is the visit count of each move from the root state and τ is a parameter controlling temperature."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "按照论文所述,每次MCTS使用1600次模拟。过程是这样的,现在AI从白板一块开始自己跟自己下棋,只知道规则,不知道套路,那只好乱下。每下一步棋,都要通过MCTS模拟1600次上图中的a~c,从而得出我这次要怎么走子。"
+ "按照论文所述,每次MCTS使用1600次模拟。过程是这样的,现在AI从白板一块开始自己跟自己下棋,只知道规则,不知道套路,那只好乱下。每下一步棋,都要通过MCTS模拟1600次上图中的a~c,从而得出我这次要怎么走子。\n",
+ "\n",
+ "According to the paper, each MCTS uses 1600 simulations. The process is like this. Now the AI starts from the whiteboard and plays chess with itself. It only knows the rules and doesn't know the routines, so it has no choice but to mess around. For each next move, I have to simulate 1600 times a~c in the above picture through MCTS, so as to figure out how I am going to move this time."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "来说说a~c,MCTS本质上是我们来维护一棵树,这棵树的每个节点保存了每一个局面(situation)该如何走子(action)的信息。这些信息是,N(s, a)是访问次数,W(s, a)是总行动价值,Q(s, a)是平均行动价值,P(s, a)是被选择的概率。"
+ "来说说a~c,MCTS本质上是我们来维护一棵树,这棵树的每个节点保存了每一个局面(situation)该如何走子(action)的信息。这些信息是,N(s, a)是访问次数,W(s, a)是总行动价值,Q(s, a)是平均行动价值,P(s, a)是被选择的概率。\n",
+ "\n",
+ "Let’s talk about a~c, MCTS essentially means that we maintain a tree. Each node of this tree saves information on how to move each situation. The information is that N(s, a) is the number of visits, W(s, a) is the total action value, Q(s, a) is the average action value, and P(s, a) is the probability of being selected."
]
},
{
@@ -133,7 +171,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "每次模拟的过程都一样,从父节点的局面开始,选择一个走子。比如开局的时候,所有合法的走子都是可能的选择,那么我该选哪个走子呢?这就是select要做的事情。MCTS选择Q(s, a) + U(s, a)最大的那个action。Q的公式一会在Backup中描述。U的公式如下:"
+ "每次模拟的过程都一样,从父节点的局面开始,选择一个走子。比如开局的时候,所有合法的走子都是可能的选择,那么我该选哪个走子呢?这就是select要做的事情。MCTS选择Q(s, a) + U(s, a)最大的那个action。Q的公式一会在Backup中描述。U的公式如下:\n",
+ "\n",
+ "The process of each simulation is the same, starting from the position of the parent node, choose a move. For example, at the beginning of the game, all legal moves are possible choices, so which move should I choose? This is what select does. MCTS selects the action with the largest Q(s, a) + U(s, a). Q's formula will be described in Backup. The formula of U is as follows:"
]
},
{
@@ -149,14 +189,20 @@
"source": [
"这个可以理解成:U(s, a) = $c_{puct}$ × 概率P(s, a) × np.sqrt(父节点访问次数N) / ( 1 + 某子节点action的访问次数N(s, a) )\n",
"\n",
- "用论文中的话说,c_puct是一个决定探索水平的常数;这种搜索控制策略最初倾向于具有高先验概率和低访问次数的行为,但是渐近地倾向于具有高行动价值的行为。"
+ "用论文中的话说,c_puct是一个决定探索水平的常数;这种搜索控制策略最初倾向于具有高先验概率和低访问次数的行为,但是渐近地倾向于具有高行动价值的行为。\n",
+ "\n",
+ "This can be understood as: U(s, a) = $c_{puct}$ × probability P(s, a) × np.sqrt (the number of parent node visits N) / (1 + the number of child node action visits N( s, a))\n",
+ "\n",
+ "In the words of the paper, c_puct is a constant that determines the level of exploration; this search control strategy initially tends to have high prior probability and low number of visits, but asymptotically tends to have high action value."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "计算过后,我就知道当前局面下,哪个action的Q+U值最大,那这个action走子之后的局面就是第二次模拟的当前局面。比如开局,Q+U最大的是当头炮,然后我就Select当头炮这个action,再下一次Select就从当头炮的这个棋局选择下一个走子。"
+ "计算过后,我就知道当前局面下,哪个action的Q+U值最大,那这个action走子之后的局面就是第二次模拟的当前局面。比如开局,Q+U最大的是当头炮,然后我就Select当头炮这个action,再下一次Select就从当头炮的这个棋局选择下一个走子。\n",
+ "\n",
+ "After the calculation, I know which action has the largest Q+U value in the current situation, and the situation after this action moves is the current situation of the second simulation. For example, in the opening game, Q+U is the top shot, and then I select the action of the top shot, and the next time Select will choose the next move from the top shot."
]
},
{
@@ -170,21 +216,28 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "现在开始第二次模拟了,假如之前的action是当头炮,我们要接着这个局面选择action,但是这个局面是个叶子节点。就是说当头炮之后可以选择哪些action不知道,这样就需要expand了,通过expand得到一系列可能的action节点。这样实际上就是在扩展这棵树,从只有根节点开始,一点一点的扩展。"
+ "现在开始第二次模拟了,假如之前的action是当头炮,我们要接着这个局面选择action,但是这个局面是个叶子节点。就是说当头炮之后可以选择哪些action不知道,这样就需要expand了,通过expand得到一系列可能的action节点。这样实际上就是在扩展这棵树,从只有根节点开始,一点一点的扩展。\n",
+ "\n",
+ "Now it’s the second simulation. If the previous action was the Central Cannon, we have to select the action following this situation, but this situation is a leaf node. That is to say, you can choose which actions you can choose after Central Cannon you don’t know, so you need to expand, and you can get a series of possible action nodes through expand. This is actually expanding the tree, starting with only the root node and expanding little by little."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "Expand and evaluate这个部分有个需要关注的地方。论文中说:在队列中的局面由神经网络使用最小批量mini-batch 大小为8进行评估;搜索线程被锁定,直到评估完成。叶子节点被展开,每个边($s_L$, a)被初始化为{N($s_L$, a) = 0,W($s_L$, a) = 0,Q($s_L$, a) = 0,P($s_L$, a) = $p_a$} 然后**值v被回传(backed up)**。"
+ "Expand and evaluate这个部分有个需要关注的地方。论文中说:在队列中的局面由神经网络使用最小批量mini-batch 大小为8进行评估;搜索线程被锁定,直到评估完成。叶子节点被展开,每个边($s_L$, a)被初始化为{N($s_L$, a) = 0,W($s_L$, a) = 0,Q($s_L$, a) = 0,P($s_L$, a) = $p_a$} 然后**值v被回传(backed up)**。\n",
+ "\n",
+ "There is a place to pay attention to in the Expand and evaluate section. The paper said: The situation in the queue is evaluated by the neural network using a minimum batch size of 8; the search thread is locked until the evaluation is completed. The leaf node is expanded, and each edge (𝑠𝐿\n",
+ ", a) is initialized to {N(𝑠𝐿, a) = 0, W(𝑠𝐿, a) = 0, Q(𝑠𝐿, a) = 0, P(𝑠𝐿, a) = 𝑝𝑎} Then the value v is returned ( backed up)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "如果我当前的局面没有被expand过,不知道下一步该怎么下,所以要expand,这个时候要用我们的神经网络出马。把当前的局面作为输入传给神经网络,神经网络会返回给我们一个action向量p和当前胜率v。其中action向量是当前局面每个合法action的走子概率。当然,因为神经网络还没有训练好,输出作为参考添加到我们的蒙特卡洛树上。这样在当前局面下,所有可走的action以及对应的概率p就都有了,每个新增的action节点都按照论文中说的对若干信息赋值,{N($s_L$, a) = 0,W($s_L$, a) = 0,Q($s_L$, a) = 0,P($s_L$, a) = $p_a$} 。这些新增的节点作为当前局面节点的子节点。"
+ "如果我当前的局面没有被expand过,不知道下一步该怎么下,所以要expand,这个时候要用我们的神经网络出马。把当前的局面作为输入传给神经网络,神经网络会返回给我们一个action向量p和当前胜率v。其中action向量是当前局面每个合法action的走子概率。当然,因为神经网络还没有训练好,输出作为参考添加到我们的蒙特卡洛树上。这样在当前局面下,所有可走的action以及对应的概率p就都有了,每个新增的action节点都按照论文中说的对若干信息赋值,{N($s_L$, a) = 0,W($s_L$, a) = 0,Q($s_L$, a) = 0,P($s_L$, a) = $p_a$} 。这些新增的节点作为当前局面节点的子节点。\n",
+ "\n",
+ "If my current situation has not been expanded, I don’t know what to do next, so I need to expand. At this time, we need to use our neural network. Pass the current situation as input to the neural network, and the neural network will return us an action vector p and the current winning percentage v. The action vector is the move probability of each legal action in the current situation. Of course, because the neural network has not been trained yet, the output is added to our Monte Carlo tree as a reference. In this way, in the current situation, all the actions that can be taken and the corresponding probability p are there, and each new action node is assigned a number of information according to the paper, {N($s_L$, a) = 0 , W($s_L$, a) = 0, Q($s_L$, a) = 0, P($s_L$, a) = $p_a$}. These newly added nodes serve as child nodes of the current situation node."
]
},
{
@@ -198,71 +251,89 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "接下来就是重点,evaluate和Backup一起说,先看看Backup做什么事吧:边的统计数据在每一步t≤L中反向更新。访问计数递增,$N(s_t , a_t) = N(s_t , a_t) +1$,并且动作价值更新为平均值, $W(s_t , a_t) = W(s_t , a_t) + v,Q(s_t , a_t) = \\frac{W(s_t ,a_t)}{N(s_t ,a_t)}$。我们使用**虚拟损失**来确保每个线程评估不同的节点。"
+ "接下来就是重点,evaluate和Backup一起说,先看看Backup做什么事吧:边的统计数据在每一步t≤L中反向更新。访问计数递增,$N(s_t , a_t) = N(s_t , a_t) +1$,并且动作价值更新为平均值, $W(s_t , a_t) = W(s_t , a_t) + v,Q(s_t , a_t) = \\frac{W(s_t ,a_t)}{N(s_t ,a_t)}$。我们使用**虚拟损失**来确保每个线程评估不同的节点。\n",
+ "\n",
+ "The next step is the key point. Evaluate and Backup said together, let’s take a look at what Backup does: the statistical data of the edge is updated in reverse at each step t≤L. The visit count is incremented, $N(s_t, a_t) = N(s_t, a_t) +1$, and the action value is updated to the average value, $W(s_t, a_t) = W(s_t, a_t) + v, Q(s_t , a_t) = \\frac{W(s_t ,a_t)}{N(s_t ,a_t)}$. We use **virtual loss** to ensure that each thread evaluates different nodes."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "我们来整理一下思路,任意一个局面(就是节点),要么被展开过(expand),要么没有展开过(就是叶子节点)。展开过的节点可以使用Select选择动作进入下一个局面,下一个局面仍然是这个过程,如果展开过还是可以通过Select进入下下个局面,这个过程一直持续下去直到这盘棋分出胜平负了,或者遇到某个局面没有被展开过为止。"
+ "我们来整理一下思路,任意一个局面(就是节点),要么被展开过(expand),要么没有展开过(就是叶子节点)。展开过的节点可以使用Select选择动作进入下一个局面,下一个局面仍然是这个过程,如果展开过还是可以通过Select进入下下个局面,这个过程一直持续下去直到这盘棋分出胜平负了,或者遇到某个局面没有被展开过为止。\n",
+ "\n",
+ "Let's sort out our thoughts, any situation (that is, a node), either has been expanded (expand) or has not been expanded (that is, a leaf node). The node that has been expanded can use the Select action to enter the next position. The next position is still this process. If it has been expanded, you can still enter the next position through Select. This process will continue until the game is won. , Or encounter a situation that has not been unfolded."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "如果没有展开过,那么执行expand操作,通过神经网络得到每个动作的概率和胜率v,把这些动作添加到树上,最后把胜率**v回传(backed up)**,backed up给谁?"
+ "如果没有展开过,那么执行expand操作,通过神经网络得到每个动作的概率和胜率v,把这些动作添加到树上,最后把胜率**v回传(backed up)**,backed up给谁?\n",
+ "\n",
+ "If it has not been expanded, then perform the expand operation, get the probability and winning rate v of each action through the neural network, add these actions to the tree, and finally return the winning rate **v backed up**, who is backed up to? ?"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "我们知道这其实是一路递归下去的过程,一直在Select,递归必须要有结束条件,不然就是死循环了。所以分出胜负和遇到叶子节点就是递归结束条件,把胜率v或者分出的胜平负value作为返回值,回传给上一层。"
+ "我们知道这其实是一路递归下去的过程,一直在Select,递归必须要有结束条件,不然就是死循环了。所以分出胜负和遇到叶子节点就是递归结束条件,把胜率v或者分出的胜平负value作为返回值,回传给上一层。\n",
+ "\n",
+ "We know that this is actually a process of recursion all the way, always in Select, recursion must have an end condition, otherwise it will be an endless loop. Therefore, dividing the victory and encountering the leaf node is the recursive end condition, and the victory rate v or the divided victory and loss value is used as the return value and passed back to the upper layer."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "这个过程就是evaluate,是为了Backup步骤做准备。因为在Backup步骤,我们要用v来更新W和Q的,但是如果只做了一次Select,棋局还没有结束,此时的v是不明确的,必须要等到一盘棋完整的下完才能知道v到底是多少。就是说我现在下了一步棋,不管这步棋是好棋还是臭棋,只有下完整盘期分出胜负,才能给我下的这步棋评分。不管这步棋的得失,即使我这步棋丢了个车,但最后我赢了,那这个v就是积极的。同样即使我这步棋吃了对方一个子,但最后输棋了,也不能认为我这步棋就是好棋。"
+ "这个过程就是evaluate,是为了Backup步骤做准备。因为在Backup步骤,我们要用v来更新W和Q的,但是如果只做了一次Select,棋局还没有结束,此时的v是不明确的,必须要等到一盘棋完整的下完才能知道v到底是多少。就是说我现在下了一步棋,不管这步棋是好棋还是臭棋,只有下完整盘期分出胜负,才能给我下的这步棋评分。不管这步棋的得失,即使我这步棋丢了个车,但最后我赢了,那这个v就是积极的。同样即使我这步棋吃了对方一个子,但最后输棋了,也不能认为我这步棋就是好棋。\n",
+ "\n",
+ "This process is evaluate, which is to prepare for the Backup step. Because in the Backup step, we need to use v to update W and Q, but if we only do Select once, the game is not over yet. At this time, v is not clear, and we must wait until a game of chess is completed to know the end of v. how many. That is to say, I have made a move now, no matter whether it is a good move or a bad move, only by playing a complete game period can I score the move. Regardless of the pros and cons of this move, even if I lose a rook in this move, but in the end I win, then this v is positive. Similarly, even if I ate the opponent's piece in this move, but lost in the end, I can't think that my move is a good move."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "用一幅图概括一下这个过程:\n",
- ""
+ "用一幅图概括一下这个过程: Use a picture to summarize this process:\n",
+ ""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "当值被回传,就要做Backup了,这里很关键。因为我们是多线程同时在做MCTS,由于Select算法都一样,都是选择Q+U最大节点,所以很有可能所有的线程最终选择的是同一个节点,这就尴尬了。我们的目的是尽可能在树上搜索出各种不同的着法,最终选择一步好棋,怎么办呢?论文中已经给出了办法,“我们使用**虚拟损失**来确保每个线程评估不同的节点。”"
+ "当值被回传,就要做Backup了,这里很关键。因为我们是多线程同时在做MCTS,由于Select算法都一样,都是选择Q+U最大节点,所以很有可能所有的线程最终选择的是同一个节点,这就尴尬了。我们的目的是尽可能在树上搜索出各种不同的着法,最终选择一步好棋,怎么办呢?论文中已经给出了办法,“我们使用**虚拟损失**来确保每个线程评估不同的节点。”\n",
+ "\n",
+ "When the value is returned, backup is required, which is very important here. Because we are doing MCTS with multiple threads at the same time, and because the Select algorithm is the same, we choose the largest node of Q+U, so it is very likely that all threads will eventually choose the same node, which is embarrassing. Our goal is to search for various moves in the tree as much as possible, and finally choose a good move. What should we do? The method has been given in the paper, \"We use **virtual loss** to ensure that each thread evaluates different nodes.\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "就是说,通过Select选出某节点后,人为增大这个节点的访问次数N,并减少节点的总行动价值W,因为平均行动价值Q = W / N,这样分子减少,分母增加,就减少了Q值,这样递归进行的时候,此节点的Q+U不是最大,避免被选中,让其他的线程尝试选择别的节点进行树搜索。这个人为增加和减少的量就是虚拟损失virtual loss。"
+ "就是说,通过Select选出某节点后,人为增大这个节点的访问次数N,并减少节点的总行动价值W,因为平均行动价值Q = W / N,这样分子减少,分母增加,就减少了Q值,这样递归进行的时候,此节点的Q+U不是最大,避免被选中,让其他的线程尝试选择别的节点进行树搜索。这个人为增加和减少的量就是虚拟损失virtual loss。\n",
+ "\n",
+ "That is to say, after selecting a node through Select, the number of visits of this node N is artificially increased, and the total action value W of the node is reduced, because the average action value Q = W / N, so that the numerator decreases and the denominator increases, so it decreases Q value, so when recursively, the Q+U of this node is not the largest, avoid being selected, and let other threads try to select other nodes for tree search. This artificial increase and decrease is the virtual loss."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "现在MCTS的过程越来越清晰了,Select选择节点,选择后,对当前节点使用虚拟损失,通过递归继续Select,直到分出胜负或Expand节点,得到返回值value。现在就可以使用value进行Backup了,但首先要还原W和N,之前N增加了虚拟损失,这次要减回去,之前减少了虚拟损失的W也要加回来。"
+ "现在MCTS的过程越来越清晰了,Select选择节点,选择后,对当前节点使用虚拟损失,通过递归继续Select,直到分出胜负或Expand节点,得到返回值value。现在就可以使用value进行Backup了,但首先要还原W和N,之前N增加了虚拟损失,这次要减回去,之前减少了虚拟损失的W也要加回来。\n",
+ "\n",
+ "Now the process of MCTS is getting clearer and clearer. Select selects a node. After selection, virtual loss is applied to the current node. Select is continued through recursion until the winner or Expand node is determined, and the return value value is obtained. Now you can use value to perform backup, but first you need to restore W and N. Before N increased the virtual loss, this time you need to reduce it, and the W that previously reduced the virtual loss should also be added back."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "然后开始做Backup,“边的统计数据在每一步t≤L中反向更新。访问计数递增,$N(s_t , a_t) = N(s_t , a_t) +1$,并且动作价值更新为平均值, $W(s_t , a_t) = W(s_t , a_t) + v,Q(s_t , a_t) = \\frac{W(s_t ,a_t)}{N(s_t ,a_t)}$。”,这些不用我再解释了吧?同时我们还要更新U,U的公式上面给出过。这个反向更新,其实就是递归的把值返回回去。有一点一定要**注意,就是我们的返回值一定要符号反转**,怎么理解?就是说对于当前节点是胜,那么对于上一个节点一定是负,明白这个意思了吧?所以返回的是-value。"
+ "然后开始做Backup,“边的统计数据在每一步t≤L中反向更新。访问计数递增,$N(s_t , a_t) = N(s_t , a_t) +1$,并且动作价值更新为平均值, $W(s_t , a_t) = W(s_t , a_t) + v,Q(s_t , a_t) = \\frac{W(s_t ,a_t)}{N(s_t ,a_t)}$。”,这些不用我再解释了吧?同时我们还要更新U,U的公式上面给出过。这个反向更新,其实就是递归的把值返回回去。有一点一定要**注意,就是我们的返回值一定要符号反转**,怎么理解?就是说对于当前节点是胜,那么对于上一个节点一定是负,明白这个意思了吧?所以返回的是-value。\n",
+ "\n",
+ "Then start to do Backup, \"The statistical data of the edge is updated in the reverse direction at each step t≤L. The access count increases, $N(s_t, a_t) = N(s_t, a_t) +1$, and the action value is updated to the average value , $W(s_t, a_t) = W(s_t, a_t) + v, Q(s_t, a_t) = \\frac{W(s_t ,a_t)}{N(s_t ,a_t)}$.\", these don’t need me Explain it again? At the same time, we have to update U, the formula of U is given above. This reverse update is actually returning the value back recursively. One thing must be **attention, that is, our return value must be sign inverted**, how to understand? In other words, it is a win for the current node, so it must be a loss for the previous node. Do you understand this? So what is returned is -value."
]
},
{
@@ -276,84 +347,100 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "按照上述过程执行a~c,论文中是每步棋执行1600次模拟,那就是1600次的a~c,这个MCTS的过程就是模拟自我对弈的过程。模拟结束后,基本上能覆盖大多数的棋局和着法,每步棋该怎么下,下完以后胜率是多少,得到什么样的局面都能在树上找到。然后从树上选择当前局面应该下哪一步棋,这就是步骤d.play:\"在搜索结束时,AlphaGo Zero在根节点s0选择一个走子a,与其访问计数幂指数成正比,$π(a|s_0) = \\frac{N(s_0,a) ^{1/τ}}{\\sum_b N(s_0,b)^{1/τ}}$ ,其中τ是控制探索水平的温度参数。在随后的时间步重新使用搜索树:与所走子的动作对应的子节点成为新的根节点;保留这个节点下面的子树所有的统计信息,而树的其余部分被丢弃。如果根节点的价值和最好的子节点价值低于阈值$v_{resign}$,则AlphaGo Zero会认输。\""
+ "按照上述过程执行a~c,论文中是每步棋执行1600次模拟,那就是1600次的a~c,这个MCTS的过程就是模拟自我对弈的过程。模拟结束后,基本上能覆盖大多数的棋局和着法,每步棋该怎么下,下完以后胜率是多少,得到什么样的局面都能在树上找到。然后从树上选择当前局面应该下哪一步棋,这就是步骤d.play:\"在搜索结束时,AlphaGo Zero在根节点s0选择一个走子a,与其访问计数幂指数成正比,$π(a|s_0) = \\frac{N(s_0,a) ^{1/τ}}{\\sum_b N(s_0,b)^{1/τ}}$ ,其中τ是控制探索水平的温度参数。在随后的时间步重新使用搜索树:与所走子的动作对应的子节点成为新的根节点;保留这个节点下面的子树所有的统计信息,而树的其余部分被丢弃。如果根节点的价值和最好的子节点价值低于阈值$v_{resign}$,则AlphaGo Zero会认输。\"\n",
+ "\n",
+ "Follow the above process to execute a~c. In the paper, 1600 simulations are executed for each move, that is, 1600 rounds of a~c. This MCTS process is the process of simulating self-play. After the simulation is over, it basically covers most positions and moves: how to play each move, the winning rate after playing it, and the resulting position can all be found in the tree. Then select from the tree which move should be played in the current position. This is step d.play: \"At the end of the search, AlphaGo Zero selects a move a at the root node s0, proportionally to its exponentiated visit count, $π(a|s_0) = \\frac{N(s_0,a) ^{1/τ}}{\\sum_b N(s_0,b)^{1/τ}}$, where τ is a temperature parameter that controls the level of exploration. At subsequent time steps the search tree is reused: the child node corresponding to the played move becomes the new root node; all statistics of the subtree below this node are retained, while the rest of the tree is discarded. If the value of the root node and the value of the best child node are below a threshold $v_{resign}$, AlphaGo Zero resigns.\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "当模拟结束后,对于当前局面(就是树的根节点)的所有子节点就是每一步对应的action节点,选择哪一个action呢?按照论文所说是通过访问计数N来确定的。这个好理解吧?实现上也容易,当前节点的所有节点是可以获得的,每个子节点的信息N都可以获得,然后从多个action中选一个,这其实是多分类问题。我们使用softmax来得到选择某个action的概率,传给softmax的是每个action的logits($N(s_0,a) ^{1/τ}$),这其实可以改成$1/τ * log(N(s_0,a))$。这样就得到了当前局面所有可选action的概率向量,最终选择概率最大的那个action作为要下的一步棋,并且将这个选择的节点作为树的根节点。"
+ "当模拟结束后,对于当前局面(就是树的根节点)的所有子节点就是每一步对应的action节点,选择哪一个action呢?按照论文所说是通过访问计数N来确定的。这个好理解吧?实现上也容易,当前节点的所有节点是可以获得的,每个子节点的信息N都可以获得,然后从多个action中选一个,这其实是多分类问题。我们使用softmax来得到选择某个action的概率,传给softmax的是每个action的logits($N(s_0,a) ^{1/τ}$),这其实可以改成$1/τ * log(N(s_0,a))$。这样就得到了当前局面所有可选action的概率向量,最终选择概率最大的那个action作为要下的一步棋,并且将这个选择的节点作为树的根节点。\n",
+ "\n",
+ "When the simulation is over, all the child nodes of the current situation (that is, the root node of the tree) are the action nodes corresponding to each step. Which action should be selected? According to the paper, it is determined by the access count N. Is this easy to understand? It is also easy to implement. All nodes of the current node can be obtained, and the information N of each child node can be obtained, and then select one from multiple actions. This is actually a multi-classification problem. We use softmax to get the probability of choosing an action. What is passed to softmax is the logits of each action ($N(s_0,a) ^{1/τ}$), which can actually be changed to $1/τ * log( N(s_0,a))$. In this way, the probability vectors of all optional actions in the current situation are obtained, and finally the action with the highest probability is selected as the move to be played, and the selected node is regarded as the root node of the tree."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "按照图1中a.Self-Play的说法就是,从局面$s_t$进行自我对弈的树搜索(模拟),得到$a_t∼ π_t$,$a_t$就是动作action,$π_t$就是所有动作的概率向量。最终在局面$s_T$的时候得到胜平负的结果z,就是我们上面所说的value。"
+ "按照图1中a.Self-Play的说法就是,从局面$s_t$进行自我对弈的树搜索(模拟),得到$a_t∼ π_t$,$a_t$就是动作action,$π_t$就是所有动作的概率向量。最终在局面$s_T$的时候得到胜平负的结果z,就是我们上面所说的value。\n",
+ "\n",
+ "According to a.Self-Play in Figure 1, a tree search (simulation) of self-play from position $s_t$ yields $a_t∼ π_t$, where $a_t$ is the action and $π_t$ is the probability vector over all actions. Finally, at position $s_T$, the win/draw/loss result z is obtained — this is the value we mentioned above."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "至此MCTS算法就分析完了。"
+ "至此MCTS算法就分析完了。\n",
+ "\n",
+ "So far the MCTS algorithm has been analyzed."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "### 神经网络"
+ "### 神经网络 Neural Networks"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "上面说过,通过MCTS算出该下哪一步棋。然后接着再经过1600次模拟算出下一步棋,如此循环直到分出胜负,这样一整盘棋就下完了,这就是一次完整的自对弈过程,那么MCTS就相当于人在大脑中思考。我们把每步棋的局面$s_t$、算出的action概率向量$π_t$和胜率$z_t$(就是返回值value)保存下来,作为棋谱数据训练神经网络。"
+ "As mentioned above, the MCTS is used to figure out which move to play. Then, after 1600 simulations, the next move is calculated, and the cycle continues until the winner is determined. In this way, the whole game is finished. This is a complete self-playing process. Then MCTS is equivalent to thinking in the brain. We save the position $s_t$ of each chess move, the calculated action probability vector $π_t$ and the winning rate $z_t$ (that is, the return value), and use them as the game record data to train the neural network."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "神经网络的输入是局面s,输出是预测的action概率向量p和胜率v,公式:$(p,v)= f_{θ_i} (s)$。训练目标是最小化预测胜率v和自我对弈的胜率z之间的误差,并使神经网络走子概率p与搜索概率π的相似度最大化。按照论文中所说,“具体而言,参数θ通过梯度下降分别在均方误差和交叉熵损失之和上的损失函数l进行调整,$l = (z - v)^2- π^T log p + c||θ||^2$,其中c是控制L2权重正则化水平的参数(防止过拟合)。”简单点说就是让神经网络的预测跟MCTS的搜索结果尽量接近。"
+ "神经网络的输入是局面s,输出是预测的action概率向量p和胜率v,公式:$(p,v)= f_{θ_i} (s)$。训练目标是最小化预测胜率v和自我对弈的胜率z之间的误差,并使神经网络走子概率p与搜索概率π的相似度最大化。按照论文中所说,“具体而言,参数θ通过梯度下降分别在均方误差和交叉熵损失之和上的损失函数l进行调整,$l = (z - v)^2- π^T log p + c||θ||^2$,其中c是控制L2权重正则化水平的参数(防止过拟合)。”简单点说就是让神经网络的预测跟MCTS的搜索结果尽量接近。\n",
+ "\n",
+ "The input of the neural network is the position s, and the output is the predicted action probability vector p and the winning rate v. The formula: $(p,v) = f_{θ_i} (s)$. The training goal is to minimize the error between the predicted winning rate v and the winning rate z of the self-game, and to maximize the similarity between the neural network walk probability p and the search probability π. According to the paper, \"Specifically, the parameter θ is adjusted by the loss function l of the sum of the mean square error and the cross-entropy loss through gradient descent, $l = (z-v)^2- π^T log p + c||θ||^2$, where c is the parameter that controls the L2 weight regularization level (to prevent overfitting).\" Simply put, the neural network prediction is as close as possible to the MCTS search result."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "胜率是回归问题,优化自然用MSE损失,概率向量的优化要用softmax交叉熵损失,目标就是最小化这个联合损失。"
+ "胜率是回归问题,优化自然用MSE损失,概率向量的优化要用softmax交叉熵损失,目标就是最小化这个联合损失。\n",
+ "\n",
+ "The winning rate is a regression problem. The optimization naturally uses the MSE loss, and the optimization of the probability vector uses the softmax cross-entropy loss. The goal is to minimize this joint loss."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "#### 神经网络结构"
+ "#### 神经网络结构 Neural network structure"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "网络结构没什么好说的,按照论文中的描述实现即可,下面是结构图:"
+ "网络结构没什么好说的,按照论文中的描述实现即可,下面是结构图:\n",
+ "\n",
+ "There is nothing to say about the network structure, just follow the description in the paper to implement it. The following is the structure diagram:"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- ""
+ ""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "到此,这篇论文基本上介绍的差不多了,有些训练和优化方面的细节这里就不介绍了。过程就是神经网络先随机初始化权重,使用MCTS下每一步棋,当树中节点没有被展开时通过神经网络预测出走子概率和胜率添加到树上,然后使用自我对弈的数据训练神经网络,在下一次自我对弈中使用新的训练过的神经网络进行预测,MCTS和神经网络你中有我、我中有你,如此反复迭代,网络预测的更准确,MCTS的结果更强大。实际上神经网络的预测可以理解为人的直觉。"
+ "到此,这篇论文基本上介绍的差不多了,有些训练和优化方面的细节这里就不介绍了。过程就是神经网络先随机初始化权重,使用MCTS下每一步棋,当树中节点没有被展开时通过神经网络预测出走子概率和胜率添加到树上,然后使用自我对弈的数据训练神经网络,在下一次自我对弈中使用新的训练过的神经网络进行预测,MCTS和神经网络你中有我、我中有你,如此反复迭代,网络预测的更准确,MCTS的结果更强大。实际上神经网络的预测可以理解为人的直觉。\n",
+ "\n",
+ "So far, this paper basically introduces almost the same, and some details of training and optimization are not introduced here. The process is that the neural network first randomly initializes the weights, uses MCTS to play each move, when the node in the tree is not expanded, the probability of move and the winning rate are predicted by the neural network and added to the tree, and then the neural network is trained using self-playing data. In the self-play game, a new trained neural network is used to make predictions. MCTS and neural networks have me in you and you in me. Repeated iterations will make the network predictions more accurate and the MCTS results more powerful. In fact, the prediction of neural network can be understood as human intuition."
]
},
{
@@ -367,14 +454,18 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "接下来一起看看AlphaZero的论文。"
+ "接下来一起看看AlphaZero的论文。\n",
+ "\n",
+ "Let's take a look at AlphaZero's paper."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "算法上没有区别,只有几个不同点:"
+ "算法上没有区别,只有几个不同点:\n",
+ "\n",
+ "There is no difference in the algorithm, only a few differences:"
]
},
{
@@ -383,98 +474,123 @@
"source": [
" - 1、在AlphaGo Zero中,自我对弈是由以前所有迭代中最好的玩家生成的。每次训练迭代之后,与最好玩家对弈测量新玩家的能力;如果以55%的优势获胜,那么它将取代最好的玩家,而自我对弈将由这个新玩家产生。相反,AlphaZero只维护一个不断更新的单个神经网络,而不是等待迭代完成。自我对弈是通过使用这个神经网络的最新参数生成的,省略了评估步骤和选择最佳玩家的过程。\n",
" - 2、比赛结果除了胜负以外,还有平局。\n",
- " - 3、围棋是可以进行数据增强的,因为围棋的规则是旋转和反转不变的。但是象棋、将棋等就不行。"
+ " - 3、围棋是可以进行数据增强的,因为围棋的规则是旋转和反转不变的。但是象棋、将棋等就不行。\n",
+ " \n",
+ " \n",
+ " - 1. In AlphaGo Zero, self-play is generated by the best players in all previous iterations. After each training iteration, play against the best player to measure the ability of the new player; if you win with a 55% advantage, it will replace the best player, and the self-play will be generated by this new player. In contrast, AlphaZero only maintains a single neural network that is constantly updated, rather than waiting for the iteration to complete. The self game is generated by using the latest parameters of this neural network, omitting the evaluation step and the process of selecting the best player.\n",
+ " - 2. In addition to the outcome of the game, there is a tie.\n",
+ " - 3. Go permits data augmentation, because the rules of Go are invariant under rotation and reflection. But chess, shogi, etc. are not."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "好像也没啥大变化,我们重点要考虑的是输入特征的表示。"
+ "好像也没啥大变化,我们重点要考虑的是输入特征的表示。\n",
+ "\n",
+ "It doesn't seem to have changed much. The main thing we need to consider is the representation of the input features."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "### 输入特征的表示"
+ "### 输入特征的表示 Representation of input features"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "刚刚介绍神经网络的结构时,没有对输入特征进行说明,先看看论文中的图示。"
+ "刚刚介绍神经网络的结构时,没有对输入特征进行说明,先看看论文中的图示。\n",
+ "\n",
+ "When I just introduced the structure of the neural network, I didn't explain the input features. First look at the diagram in the paper."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- ""
+ ""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "网络结构图上能够看出神经网络的输入是19×19×17维度的图像栈。包含17个二值(只有两个值0和1)特征平面,8个特征平面$X_t$由二进制值组成,表示当前玩家存在的棋子(如果交点i在时间步t包含玩家颜色的棋子,那么$X_t^i = 1$;如果交叉点是空的,包含对手棋子,或者t <0,$X_t^i = 0$)。另外8个特征平面$Y_t$表示对手的棋子的相应特征。为什么每个玩家8个特征平面呢?是因为这是8步历史走子记录,就是说最近走的8步棋作为输入特征。最后的特征面C表示棋子颜色(当前的棋盘状态),是常量,如果是黑色棋子,则为1,如果是白色棋子则为0。这些平面连接在一起,给出输入特征$s_t$ = [$X_t , Y_t , X_{t-1}, Y_{t-1}, ..., X_{t-7}, Y_{t-7}, C$]。"
+ "网络结构图上能够看出神经网络的输入是19×19×17维度的图像栈。包含17个二值(只有两个值0和1)特征平面,8个特征平面$X_t$由二进制值组成,表示当前玩家存在的棋子(如果交点i在时间步t包含玩家颜色的棋子,那么$X_t^i = 1$;如果交叉点是空的,包含对手棋子,或者t <0,$X_t^i = 0$)。另外8个特征平面$Y_t$表示对手的棋子的相应特征。为什么每个玩家8个特征平面呢?是因为这是8步历史走子记录,就是说最近走的8步棋作为输入特征。最后的特征面C表示棋子颜色(当前的棋盘状态),是常量,如果是黑色棋子,则为1,如果是白色棋子则为0。这些平面连接在一起,给出输入特征$s_t$ = [$X_t , Y_t , X_{t-1}, Y_{t-1}, ..., X_{t-7}, Y_{t-7}, C$]。\n",
+ "\n",
+ "It can be seen from the network structure diagram that the input of the neural network is an image stack of 19×19×17 dimensions. Contains 17 binary (only two values 0 and 1) feature planes, 8 feature planes𝑋𝑡\n",
+ "It is composed of binary values and represents the current player's pawn (if the intersection i contains the player's pawn at time step t, then 𝑋𝑖𝑡=1; if the intersection is empty and contains the opponent's pawn, or t <0, 𝑋𝑖=0) . The other 8 feature planes 𝑌𝑡 represent the corresponding features of the opponent's pieces. Why are there 8 feature planes for each player? It is because this is a record of 8-step historical moves, that is, the most recent 8 moves are used as input features. The last characteristic surface C represents the color of the chess piece (the current state of the board), which is a constant. If it is a black piece, it is 1, and if it is a white piece, it is 0. These planes are connected together to give the input feature 𝑠𝑡 = [𝑋𝑡,𝑌𝑡,𝑋𝑡−1,𝑌𝑡−1,...,𝑋𝑡−7,𝑌𝑡−7,𝐶].\n",
+ "\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "国际象棋就不同了,加入了各种特征平面,用来表示不同的情况,王车易位啦,多少回合没有进展啦(没有吃子),重复的局面啦(多次重复会被判平局)等等,这些不是我想说的,这些特征可以根据不同的棋种自己去设计,我们重点关注的是棋子的特征。"
+ "国际象棋就不同了,加入了各种特征平面,用来表示不同的情况,王车易位啦,多少回合没有进展啦(没有吃子),重复的局面啦(多次重复会被判平局)等等,这些不是我想说的,这些特征可以根据不同的棋种自己去设计,我们重点关注的是棋子的特征。\n",
+ "\n",
+ "Chess is different: various feature planes are added to represent different situations — castling rights, how many moves have passed without progress (no capture), repeated positions (too many repetitions is ruled a draw), and so on. These are not my point here; such features can be designed per game. What we focus on is the piece features."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "对于围棋而言,每个棋子都是一样的,都是一类。而国际象棋分为6种棋子:车、马、象、后、王、兵,那在特征平面上怎么表示呢,总不能使用0~5吧。还是用0和1来表示棋盘上有子还是没子,然后既然是6类棋子,想当然的使用one-hot编码了,所以特征平面分成了6个平面,每一个平面用来表示不同种类棋子在棋盘上的位置。"
+ "对于围棋而言,每个棋子都是一样的,都是一类。而国际象棋分为6种棋子:车、马、象、后、王、兵,那在特征平面上怎么表示呢,总不能使用0~5吧。还是用0和1来表示棋盘上有子还是没子,然后既然是6类棋子,想当然的使用one-hot编码了,所以特征平面分成了6个平面,每一个平面用来表示不同种类棋子在棋盘上的位置。\n",
+ "\n",
+ "For Go, every stone is the same — a single type. Chess, however, has 6 kinds of pieces: rook, knight, bishop, queen, king, and pawn. How do we represent that on the feature planes? We can't just use the values 0~5. We still use 0 and 1 to indicate whether a square is occupied, and since there are 6 piece types, one-hot encoding is the natural choice: the feature plane is split into 6 planes, each representing the board positions of one kind of piece."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "以上就是介绍的全部了,更多的细节,比如优化参数设为多少、学习率退火设为多少等等请阅读论文。"
+ "以上就是介绍的全部了,更多的细节,比如优化参数设为多少、学习率退火设为多少等等请阅读论文。\n",
+ "\n",
+ "The above is all that was introduced. For more details, such as how many optimization parameters are set, how much learning rate annealing is set, etc., please read the paper."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## 中国象棋的实现"
+ "## 中国象棋的实现 The realization of Chinese chess"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "原理讲了一大堆,该上代码了,这里根据论文中的算法实现一个中国象棋程序。"
+ "原理讲了一大堆,该上代码了,这里根据论文中的算法实现一个中国象棋程序。\n",
+ "\n",
+ "A lot of principles have been discussed, it is time to code, here is a Chinese chess program based on the algorithm in the paper."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "完整代码请参见[`项目地址`](https://github.com/chengstone/cchess-zero)"
+ "完整代码请参见[`项目地址`](https://github.com/chengstone/cchess-zero)\n",
+ "\n",
+ "For the complete code, please see [`Project Address`](https://github.com/chengstone/cchess-zero)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "### 输入特征的设计"
+ "### 输入特征的设计 Design of input features"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "先实现神经网络的部分,那么就要先设计输入特征。其实跟国际象棋差不多,棋子分为:车、马、炮、象、士、将、兵,共7种棋子,那就是每个玩家7个特征平面,一共14个特征平面。至于论文中其他的特征平面,比如颜色、回合数、重复局面、历史走子记录等等我没有实现,只使用了当前棋盘上每个玩家每个棋子的位置特征作为输入,一共14个平面,当然论文中说的其他特征平面您也可以实现一下试试。棋盘大小是$9 * 10$,所以输入占位符就是:"
+ "先实现神经网络的部分,那么就要先设计输入特征。其实跟国际象棋差不多,棋子分为:车、马、炮、象、士、将、兵,共7种棋子,那就是每个玩家7个特征平面,一共14个特征平面。至于论文中其他的特征平面,比如颜色、回合数、重复局面、历史走子记录等等我没有实现,只使用了当前棋盘上每个玩家每个棋子的位置特征作为输入,一共14个平面,当然论文中说的其他特征平面您也可以实现一下试试。棋盘大小是$9 * 10$,所以输入占位符就是:\n",
+ "\n",
+ "To implement the neural network part first, the input features must be designed first. It is in fact similar to chess: the pieces are divided into rook, horse, cannon, elephant, advisor, general, and soldier — 7 types in total, so each player has 7 feature planes, 14 feature planes in all. As for the other feature planes in the paper — colour, move count, repeated positions, move history, etc. — I did not implement them; only the position features of every piece of each player on the current board are used as input, 14 planes in total. Of course you can also try implementing the other feature planes mentioned in the paper. The board size is $9 * 10$, so the input placeholder is:"
]
},
{
@@ -488,23 +604,26 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- ""
+ ""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "接下来是定义输入的概率向量pi(π),需要确定向量的长度,意味着需要确定所有合法走子的集合长度。函数如下:"
+ "接下来是定义输入的概率向量pi(π),需要确定向量的长度,意味着需要确定所有合法走子的集合长度。函数如下:\n",
+ "\n",
+ "The next step is to define the input probability vector pi (π). The length of the vector needs to be determined, which means the size of the set of all legal moves must be determined. The function is as follows:"
]
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# 创建所有合法走子UCI,size 2086\n",
+ "# Create all legal move UCI labels, size 2086\n",
"def create_uci_labels():\n",
" labels_array = []\n",
" letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']\n",
@@ -541,72 +660,85 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "长度一共是2086。关于UCCI的资料可以参考:[`中国象棋通用引擎协议 版本:3.0`](http://www.xqbase.com/protocol/cchess_ucci.htm)"
+ "长度一共是2086。关于UCCI的资料可以参考:[`中国象棋通用引擎协议 版本:3.0`](http://www.xqbase.com/protocol/cchess_ucci.htm)\n",
+ "\n",
+ "The total length is 2086. For information about UCCI, please refer to: [`Chinese Chess General Engine Protocol Version: 3.0`](http://www.xqbase.com/protocol/cchess_ucci.htm)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "概率向量pi的占位符定义:self.pi_ = tf.placeholder(tf.float32, [None, 2086], name='pi')"
+ "概率向量pi的占位符定义:self.pi_ = tf.placeholder(tf.float32, [None, 2086], name='pi')\n",
+ "\n",
+ "The placeholder definition of the probability vector pi: self.pi_ = tf.placeholder(tf.float32, [None, 2086], name='pi')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "胜率z的占位符定义:self.z_ = tf.placeholder(tf.float32, [None, 1], name='z')"
+ "胜率z的占位符定义:self.z_ = tf.placeholder(tf.float32, [None, 1], name='z')\n",
+ "\n",
+ "The placeholder definition of winning percentage z: self.z_ = tf.placeholder(tf.float32, [None, 1], name='z')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "学习率的定义:self.learning_rate = tf.placeholder(tf.float32, name='learning_rate')"
+ "学习率的定义:self.learning_rate = tf.placeholder(tf.float32, name='learning_rate')\n",
+ "\n",
+ "Definition of learning rate: self.learning_rate = tf.placeholder(tf.float32, name='learning_rate')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "优化器使用Momentum:"
+ "优化器使用Momentum: The optimizer uses Momentum:"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- " self.momentum = 0.9\n",
- " optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=self.momentum, use_nesterov=True) "
+ "self.momentum = 0.9\n",
+ "optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=self.momentum, use_nesterov=True) "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "这里需要特殊说明一下,我实现的是多GPU训练,关于多GPU训练神经网络的实现可以参考TensorFlow官方的[`例子`](https://github.com/tensorflow/models/blob/master/tutorials/image/cifar10/cifar10_multi_gpu_train.py),和[`TensorFlow多GPU并行计算实例---MNIST`](https://gitee.com/liyang619/mnist_multi_gpu_batching_train/blob/master/mnist_multi_gpu_batching_train.py)。"
+ "这里需要特殊说明一下,我实现的是多GPU训练,关于多GPU训练神经网络的实现可以参考TensorFlow官方的[`例子`](https://github.com/tensorflow/models/blob/master/tutorials/image/cifar10/cifar10_multi_gpu_train.py),和[`TensorFlow多GPU并行计算实例---MNIST`](https://gitee.com/liyang619/mnist_multi_gpu_batching_train/blob/master/mnist_multi_gpu_batching_train.py)。\n",
+ "\n",
+ "Special explanation is needed here. I have implemented multi-GPU training. For the implementation of multi-GPU training neural network, please refer to the official [TensorFlow example](https://github.com/tensorflow/models/blob/master/tutorials/image/cifar10/cifar10_multi_gpu_train.py) and the TensorFlow multi-GPU parallel computing example [MNIST](https://gitee.com/liyang619/mnist_multi_gpu_batching_train/blob/master/mnist_multi_gpu_batching_train.py)。"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "实现思想是把输入数据按照使用的gpu数量均分:"
+ "实现思想是把输入数据按照使用的gpu数量均分:\n",
+ "\n",
+ "The realization idea is to divide the input data equally according to the number of GPUs used:"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- " inputs_batches = tf.split(self.inputs_, self.num_gpus, axis=0)\n",
- " pi_batches = tf.split(self.pi_, self.num_gpus, axis=0)\n",
- " z_batches = tf.split(self.z_, self.num_gpus, axis=0)"
+ "inputs_batches = tf.split(self.inputs_, self.num_gpus, axis=0)\n",
+ "pi_batches = tf.split(self.pi_, self.num_gpus, axis=0)\n",
+ "z_batches = tf.split(self.z_, self.num_gpus, axis=0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
+ " \"\"\"COMMENTS ON NEXT SEGMENT OF CODE\"\"\"\n",
" tower_grads = [None] * self.num_gpus\n",
"\n",
" self.loss = 0\n",
@@ -615,22 +747,25 @@
" self.value_head = []\n",
"\n",
" with tf.variable_scope(tf.get_variable_scope()):\n",
- " \"\"\"Build the core model within the graph.\"\"\"\n",
+ " # Build the core model within the graph.\n",
" for i in range(self.num_gpus): \n",
- " with tf.device('/gpu:%d' % i): # 不同的gpu分别使用不同的name scope\n",
- " with tf.name_scope('TOWER_{}'.format(i)) as scope: \n",
+ " # 不同的gpu分别使用不同的name scope # Different GPUs use different name scopes\n",
+ " with tf.device('/gpu:%d' % i): \n",
+ " with tf.name_scope('TOWER_{}'.format(i)) as scope: \n",
" # 将上面均分的输入数据输入给各自的gpu进行运算\n",
- " inputs_batch, pi_batch, z_batch = inputs_batches[i], pi_batches[i], z_batches[i]\n",
- " # **划重点!运算图的构建一定要单独写在新的函数中,这样运行才不会出错,否则TensorFlow会提示不能重复使用变量。**\n",
- " loss = self.tower_loss(inputs_batch, pi_batch, z_batch, i) # 构建神经网络计算图的函数,一会详细说。\n",
+ " # Input the input data equally divided into the respective gpu for calculation\n",
+ " inputs_batch, pi_batch, z_batch = inputs_batches[i], pi_batches[i], z_batches[i]\n",
+ " # 划重点!运算图的构建一定要单独写在新的函数中,这样运行才不会出错,否则TensorFlow会提示不能重复使用变量。**\n",
+ " loss = self.tower_loss(inputs_batch, pi_batch, z_batch, i) # 构建神经网络计算图的函数,一会详细说。\n",
+ " # Key points! The construction of the arithmetic graph must be written separately in the new function, so that the operation will not go wrong, otherwise TensorFlow will prompt that the variable cannot be reused. loss = self.tower_loss(inputs_batch, pi_batch, z_batch, i) # The function of constructing the neural network calculation graph, I will talk about it in detail later.\n",
" # reuse variable happens here\n",
- " tf.get_variable_scope().reuse_variables()\n",
- " grad = optimizer.compute_gradients(loss)\n",
- " tower_grads[i] = grad # 保存每一个gpu的梯度\n",
- "\n",
- " self.loss /= self.num_gpus # loss是多个gpu的loss总和,所以要取平均\n",
- " self.accuracy /= self.num_gpus # acc也是同理\n",
- " grads = self.average_gradients(tower_grads) # 同理,对所有梯度取平均\n",
+ " tf.get_variable_scope().reuse_variables()\n",
+ " grad = optimizer.compute_gradients(loss)\n",
+ " tower_grads[i] = grad # 保存每一个gpu的梯度 # Save the gradient of each gpu\n",
+ " # loss是多个gpu的loss总和,所以要取平均 # Loss is the sum of the loss of multiple GPUs, so take the average\n",
+ " self.loss /= self.num_gpus \n",
+ " self.accuracy /= self.num_gpus # acc也是同理 # acc is the same\n",
+ " grads = self.average_gradients(tower_grads) # 同理,对所有梯度取平均 # Similarly, take the average of all gradients\n",
" self.train_op = optimizer.apply_gradients(grads, global_step=global_step)"
]
},
@@ -638,36 +773,38 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### 实现神经网络计算图"
+ "### 实现神经网络计算图 Implement neural network computational graph convolution block"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "这里完全是按照论文所述的神经网络结构实现的,大家可以对照上面的结构图,是一一对应的。稍有不同的是,filters size我设为128,没有使用256。另外残差块的数量我默认使用了7层,没有使用19或者39,大家电脑给力的话可以尝试修改一下。"
+ "这里完全是按照论文所述的神经网络结构实现的,大家可以对照上面的结构图,是一一对应的。稍有不同的是,filters size我设为128,没有使用256。另外残差块的数量我默认使用了7层,没有使用19或者39,大家电脑给力的话可以尝试修改一下。\n",
+ "\n",
+ "This is completely implemented in accordance with the neural network structure described in the paper. You can compare the structure diagram above, which is a one-to-one correspondence. The slight difference is that I set the filters size to 128 instead of 256. In addition, I used 7 layers by default for the number of residual blocks, and did not use 19 or 39. You can try to modify it if your computer is strong."
]
},
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def tower_loss(self, inputs_batch, pi_batch, z_batch, i):\n",
- " # 卷积块\n",
+ " # 卷积块 # Convolution block\n",
" with tf.variable_scope('init'):\n",
" layer = tf.layers.conv2d(inputs_batch, self.filters_size, 3, padding='SAME') # filters 128(or 256)\n",
"\n",
" layer = tf.contrib.layers.batch_norm(layer, center=False, epsilon=1e-5, fused=True,\n",
" is_training=self.training, activation_fn=tf.nn.relu) # epsilon = 0.25\n",
"\n",
- " # 残差块\n",
+ " # 残差块 # Residual block\n",
" with tf.variable_scope(\"residual_block\"):\n",
" for _ in range(self.res_block_nums):\n",
" layer = self.residual_block(layer)\n",
"\n",
- " # 策略头\n",
+ " # 策略头 # Strategy header\n",
" with tf.variable_scope(\"policy_head\"):\n",
" policy_head = tf.layers.conv2d(layer, 2, 1, padding='SAME')\n",
" policy_head = tf.contrib.layers.batch_norm(policy_head, center=False, epsilon=1e-5, fused=True,\n",
@@ -676,9 +813,9 @@
" # print(self.policy_head.shape) # (?, 9, 10, 2)\n",
" policy_head = tf.reshape(policy_head, [-1, 9 * 10 * 2])\n",
" policy_head = tf.contrib.layers.fully_connected(policy_head, self.prob_size, activation_fn=None)\n",
- " self.policy_head.append(policy_head) # 保存多个gpu的策略头结果(走子概率向量)\n",
+ " self.policy_head.append(policy_head) # 保存多个gpu的策略头结果(走子概率向量)# Save the strategy header results of multiple GPUs \n",
"\n",
- " # 价值头\n",
+ " # 价值头 # Value Head\n",
" with tf.variable_scope(\"value_head\"):\n",
" value_head = tf.layers.conv2d(layer, 1, 1, padding='SAME')\n",
" value_head = tf.contrib.layers.batch_norm(value_head, center=False, epsilon=1e-5, fused=True,\n",
@@ -687,9 +824,9 @@
" value_head = tf.reshape(value_head, [-1, 9 * 10 * 1])\n",
" value_head = tf.contrib.layers.fully_connected(value_head, 256, activation_fn=tf.nn.relu)\n",
" value_head = tf.contrib.layers.fully_connected(value_head, 1, activation_fn=tf.nn.tanh)\n",
- " self.value_head.append(value_head) # 保存多个gpu的价值头结果(胜率)\n",
+ " self.value_head.append(value_head) # 保存多个gpu的价值头结果(胜率) #Save header results\n",
"\n",
- " # 损失\n",
+ " # 损失 # Loss\n",
" with tf.variable_scope(\"loss\"):\n",
" policy_loss = tf.nn.softmax_cross_entropy_with_logits(labels=pi_batch, logits=policy_head) \n",
" policy_loss = tf.reduce_mean(policy_loss)\n",
@@ -704,12 +841,13 @@
" l2_loss = tf.contrib.layers.apply_regularization(regularizer, regular_variables)\n",
"\n",
" # loss = value_loss - policy_loss + l2_loss\n",
- " loss = value_loss + policy_loss + l2_loss # softmax交叉熵损失 + MSE + l2损失\n",
- " self.loss += loss # 多个gpu的loss总和\n",
+ " loss = value_loss + policy_loss + l2_loss # softmax交叉熵损失 + MSE + l2损失 # # softmax cross entropy loss + MSE + l2 loss\n",
+ " self.loss += loss # 多个gpu的loss总和 # The sum of loss of multiple GPUs\n",
" tf.summary.scalar('loss_tower_{}'.format(i), loss)\n",
"\n",
" with tf.variable_scope(\"accuracy\"):\n",
" # Accuracy 这个准确率是预测概率向量和MCTS的概率向量的比较\n",
+ " # This accuracy is the comparison between the predicted probability vector and the MCTS probability vector\n",
" correct_prediction = tf.equal(tf.argmax(policy_head, 1), tf.argmax(pi_batch, 1)) \n",
" correct_prediction = tf.cast(correct_prediction, tf.float32)\n",
" accuracy = tf.reduce_mean(correct_prediction, name='accuracy')\n",
@@ -735,12 +873,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### 训练网络"
+ "### 训练网络 Training Network"
]
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -763,47 +901,57 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### 使用神经网络预测"
+ "### 使用神经网络预测 Use neural network predictions"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "预测的代码稍微麻烦一些,因为我们自对弈训练时是多线程在跑的,传过来的输入数据可能并不能被gpu数量均分,比如我有2个gpu,但是传进来的输入size是3,这样的话就有一个gpu跑2个数据,一个gpu跑1个数据。可实际上这样代码是跑不起来的,会报错,我google了半天也没找到解决办法。"
+ "预测的代码稍微麻烦一些,因为我们自对弈训练时是多线程在跑的,传过来的输入数据可能并不能被gpu数量均分,比如我有2个gpu,但是传进来的输入size是3,这样的话就有一个gpu跑2个数据,一个gpu跑1个数据。可实际上这样代码是跑不起来的,会报错,我google了半天也没找到解决办法。\n",
+ "\n",
+ "The prediction code is a bit more troublesome, because we are running in multiple threads during self-game training, and the input data passed over may not be evenly divided by the number of GPUs. For example, I have 2 GPUs, but the input size passed in is 3. In this case, one gpu runs 2 data, and one gpu runs 1 data. But in fact, this code can't run, and it will report an error. I googled for a long time and couldn't find a solution."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "我的解决方案是,先看看输入数据的长度能否被gpu数量整除,如果能,那就一切正常,直接把输入传给网络就好,神经网络会将数据按照gpu数量均分。"
+ "我的解决方案是,先看看输入数据的长度能否被gpu数量整除,如果能,那就一切正常,直接把输入传给网络就好,神经网络会将数据按照gpu数量均分。\n",
+ "\n",
+ "My solution is to first see if the length of the input data can be divisible by the number of GPUs. If so, everything is normal. Just pass the input directly to the network. The neural network will divide the data equally by the number of GPUs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "一旦不能整除,那就把输入数据分成两部分,一部分是能被gpu数量整除的数据,一部分是余下的数据。比如我有2个gpu,输入数据的长度是5,那么把这5份数据分成4份和1份。4份数据的处理就是正常处理,直接把数据传给网络就好,神经网络会将数据按照gpu数量均分。"
+ "一旦不能整除,那就把输入数据分成两部分,一部分是能被gpu数量整除的数据,一部分是余下的数据。比如我有2个gpu,输入数据的长度是5,那么把这5份数据分成4份和1份。4份数据的处理就是正常处理,直接把数据传给网络就好,神经网络会将数据按照gpu数量均分。\n",
+ "\n",
+ "Once it is not divisible, divide the input data into two parts, one is the data that can be divisible by the number of GPUs, and the other is the remaining data. For example, if I have 2 GPUs and the length of the input data is 5, then the 5 pieces of data will be divided into 4 parts and 1 part. The processing of 4 pieces of data is normal processing, just send the data directly to the network, and the neural network will divide the data equally according to the number of GPUs."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "余下的那部分数据怎么处理呢?把余下的数据不断堆叠起来,直到数据能够被gpu数量均分为止。假如剩下1份数据,那就复制1份,变成2份相同的数据,这样正好被2个gpu数量均分。只不过这2个gpu处理后返回的数据,我们只要一个gpu的结果就行了,抛弃另外一个。"
+ "余下的那部分数据怎么处理呢?把余下的数据不断堆叠起来,直到数据能够被gpu数量均分为止。假如剩下1份数据,那就复制1份,变成2份相同的数据,这样正好被2个gpu数量均分。只不过这2个gpu处理后返回的数据,我们只要一个gpu的结果就行了,抛弃另外一个。\n",
+ "\n",
+ "How to deal with the rest of the data? Stack the remaining data until the data can be divided equally by the number of GPUs. If there is 1 copy of data left, then copy 1 copy and become 2 copies of the same data, so that it is equally divided by the number of 2 GPUs. It's just that the data returned after these 2 GPUs are processed, we only need the result of one GPU, and discard the other one."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "这段代码我只在aws的2个gpu的环境下跑过,更多的gpu就没试过了,也许有bug也不一定,您可以跑跑看:)"
+ "这段代码我只在aws的2个gpu的环境下跑过,更多的gpu就没试过了,也许有bug也不一定,您可以跑跑看:)\n",
+ "\n",
+ "I have only ran this code in the environment of 2 GPUs in aws. I haven’t tried more GPUs. Maybe there are bugs. You can run and see:)"
]
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -814,25 +962,26 @@
" batch_n = positions.shape[0] // self.num_gpus\n",
" alone = positions.shape[0] % self.num_gpus\n",
"\n",
- " if alone != 0: # 判断是否不能被gpu均分\n",
+ " if alone != 0: # 判断是否不能被gpu均分 # Determine whether it cannot be divided equally by gpu\n",
" if(positions.shape[0] != 1): # 如果不止1份数据。因为有可能输入数据的长度是1,这样肯定不能被多gpu均分了。\n",
+ "            # If there is more than one piece of data. The input length could be 1, in which case it certainly cannot be split evenly across multiple GPUs.\n",
" feed_dict = {\n",
- " self.inputs_: positions[:positions.shape[0] - alone], # 先将能均分的这部分数据传入神经网络\n",
+ " self.inputs_: positions[:positions.shape[0] - alone], # 先将能均分的这部分数据传入神经网络 # First pass this part of the data that can be divided into the neural network\n",
" self.training: False\n",
" }\n",
" action_probs, value = self.sess.run([self.policy_head, self.value_head], feed_dict=feed_dict)\n",
" action_probs, value = np.vstack(action_probs), np.vstack(value)\n",
"\n",
- " new_positions = positions[positions.shape[0] - alone:] # 取余下的这部分数据\n",
+ " new_positions = positions[positions.shape[0] - alone:] # 取余下的这部分数据 # Take the remaining part of the data\n",
" pos_lst = []\n",
" while len(pos_lst) == 0 or (np.array(pos_lst).shape[0] * np.array(pos_lst).shape[1]) % self.num_gpus != 0:\n",
- " pos_lst.append(new_positions) # 将余下的这部分数据堆叠起来,直到数量的长度能被gpu均分\n",
+ " pos_lst.append(new_positions) # 将余下的这部分数据堆叠起来,直到数量的长度能被gpu均分 # Stack the remaining part of the data until the length of the quantity can be divided equally by the gpu\n",
"\n",
" if(len(pos_lst) != 0):\n",
" shape = np.array(pos_lst).shape\n",
" pos_lst = np.array(pos_lst).reshape([shape[0] * shape[1], 9, 10, 14])\n",
" \n",
- " # 将数据传入网络,得到不能被gpu均分的数据的计算结果\n",
+ " # 将数据传入网络,得到不能被gpu均分的数据的计算结果 # Pass the data to the network and get the calculation result of the data that cannot be divided equally by the GPU\n",
" feed_dict = {\n",
" self.inputs_: pos_lst,\n",
" self.training: False\n",
@@ -845,17 +994,17 @@
" # print(\"action_probs_2.shape : \", np.array(action_probs_2).shape)\n",
" # print(\"value_2.shape : \", np.array(value_2).shape)\n",
"\n",
- " if(positions.shape[0] != 1): # 多个数据的计算结果\n",
+ " if(positions.shape[0] != 1): # 多个数据的计算结果 # Calculation results of multiple data\n",
" action_probs = np.concatenate((action_probs, action_probs_2),axis=0)\n",
" value = np.concatenate((value, value_2),axis=0)\n",
"\n",
" # print(\"action_probs.shape : \", np.array(action_probs).shape)\n",
" # print(\"value.shape : \", np.array(value).shape)\n",
" return action_probs, value\n",
- " else: # 只有1个数据的计算结果\n",
+ " else: # 只有1个数据的计算结果 # Calculation result with only 1 data\n",
" return action_probs_2, value_2\n",
" else:\n",
- " # 正常情况,能被gpu均分\n",
+ " # 正常情况,能被gpu均分 # Normally, it can be divided equally by gpu\n",
" feed_dict = {\n",
" self.inputs_: positions,\n",
" self.training: False\n",
@@ -863,7 +1012,7 @@
" action_probs, value = self.sess.run([self.policy_head, self.value_head], feed_dict=feed_dict)\n",
" # print(\"np.vstack(action_probs) shape : \", np.vstack(action_probs).shape)\n",
" # print(\"np.vstack(value) shape : \", np.vstack(value).shape)\n",
- " # 将多个gpu的计算结果堆叠起来返回\n",
+ " # 将多个gpu的计算结果堆叠起来返回 # Stack the calculation results of multiple GPUs and return\n",
" return np.vstack(action_probs), np.vstack(value)"
]
},
@@ -871,19 +1020,21 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### 自对弈训练"
+ "### 自对弈训练 Self Play Training"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "自对弈训练的思想在上面分析论文时已经说过了,程序自己跟自己下棋,将每盘棋的数据保存起来,当数据量达到我们设置的大小时就开始训练神经网络。"
+ "自对弈训练的思想在上面分析论文时已经说过了,程序自己跟自己下棋,将每盘棋的数据保存起来,当数据量达到我们设置的大小时就开始训练神经网络。\n",
+ "\n",
+ "The idea of self-play training has already been mentioned in the above analysis of the paper. The program plays chess with itself, saves the data of each game, and starts training the neural network when the amount of data reaches the size we set."
]
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -892,15 +1043,15 @@
" try:\n",
" while(True):\n",
" batch_iter += 1\n",
- " play_data, episode_len = self.selfplay() # 自我对弈,返回下棋数据\n",
+ " play_data, episode_len = self.selfplay() # 自我对弈,返回下棋数据 # Self-play, return the chess data\n",
" print(\"batch i:{}, episode_len:{}\".format(batch_iter, episode_len))\n",
" extend_data = []\n",
" for state, mcts_prob, winner in play_data:\n",
" states_data = self.mcts.state_to_positions(state)\n",
- " extend_data.append((states_data, mcts_prob, winner)) # 将棋盘特征平面、MCTS算出的概率向量、胜率保存起来\n",
+ " extend_data.append((states_data, mcts_prob, winner)) # 将棋盘特征平面、MCTS算出的概率向量、胜率保存起来 # Save the chessboard feature plane, the probability vector calculated by MCTS, and the winning rate\n",
" self.data_buffer.extend(extend_data)\n",
- " if len(self.data_buffer) > self.batch_size: # 保存的数据达到指定数量时\n",
- " self.policy_update() # 开始训练\n",
+ " if len(self.data_buffer) > self.batch_size: # 保存的数据达到指定数量时 # When the saved data reaches the specified amount\n",
+ " self.policy_update() # 开始训练 # Start Training\n",
" except KeyboardInterrupt:\n",
" self.log_file.close()\n",
" self.policy_value_netowrk.save(self.global_step)"
@@ -910,18 +1061,18 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "#### 训练网络"
+ "#### 训练网络 Training Network"
]
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def policy_update(self):\n",
" \"\"\"update the policy-value net\"\"\"\n",
- " # 从数据中随机抽取一部分数据\n",
+ " # 从数据中随机抽取一部分数据 #Randomly extract a part of the data from the data\n",
" mini_batch = random.sample(self.data_buffer, self.batch_size)\n",
" #print(\"training data_buffer len : \", len(self.data_buffer))\n",
" state_batch = [data[0] for data in mini_batch]\n",
@@ -933,15 +1084,18 @@
" # print(winner_batch.shape)\n",
" # print(winner_batch)\n",
" start_time = time.time()\n",
- " old_probs, old_v = self.mcts.forward(state_batch) # 先通过正向传播预测下网络输出结果,用于计算训练后的KL散度\n",
+ " old_probs, old_v = self.mcts.forward(state_batch) # 先通过正向传播预测下网络输出结果,用于计算训练后的KL散度 # First predict the output of the network through forward propagation, which is used to calculate the KL divergence after training\n",
" for i in range(self.epochs): # 一共训练5次\n",
" # 训练网络。敲黑板!这里的学习率需要特别注意。我在aws上用的是g2.2xlarge,24小时只能下差不多200盘棋,很慢。\n",
+ " # Train the network. Knock on the blackboard! The learning rate here requires special attention. I use g2.2xlarge on aws, and I can only play about 200 games in 24 hours, which is very slow.\n",
" # 所以学习率是在这里是动态调整的。当然您也可以使用指数衰减学习率,在上面定义学习率的地方就需要修改成类似下面这句:\n",
+ " # So the learning rate is dynamically adjusted here. Of course, you can also use the exponential decay learning rate. In the above definition of the learning rate, you need to modify it to something like the following sentence:\n",
" # self.learning_rate = tf.maximum(tf.train.exponential_decay(0.001, self.global_step, 1e3, 0.66), 1e-5)\n",
" # 然后这里训练网络的地方学习率就不用作为参数传递了,也可以在训练网络函数里面不使用传递的学习率参数。\n",
+ "            # Then the learning rate here no longer needs to be passed as a parameter when training the network; alternatively, the train function can simply ignore the learning-rate parameter it receives.\n",
" accuracy, loss, self.global_step = self.policy_value_netowrk.train_step(state_batch, mcts_probs_batch, winner_batch,\n",
" self.learning_rate * self.lr_multiplier) # \n",
- " new_probs, new_v = self.mcts.forward(state_batch) #使用训练后的新网络预测结果,跟之前的结果计算KL散度\n",
+ " new_probs, new_v = self.mcts.forward(state_batch) #使用训练后的新网络预测结果,跟之前的结果计算KL散度 #Use the new network prediction result after training, and calculate the KL divergence with the previous result\n",
" kl_tmp = old_probs * (np.log((old_probs + 1e-10) / (new_probs + 1e-10)))\n",
" # print(\"kl_tmp.shape\", kl_tmp.shape)\n",
" kl_lst = []\n",
@@ -958,7 +1112,7 @@
" self.policy_value_netowrk.save(self.global_step)\n",
" print(\"train using time {} s\".format(time.time() - start_time))\n",
"\n",
- " # 通过计算调整学习率乘子\n",
+ " # 通过计算调整学习率乘子 # Adjust the learning rate multiplier by calculation\n",
" # adaptively adjust the learning rate\n",
" if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:\n",
" self.lr_multiplier /= 1.5\n",
@@ -979,40 +1133,45 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "#### 自我对弈"
+ "#### 自我对弈 Self Play"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "自我对弈就是通过MCTS下每一步棋,直到分出胜负,并返回下棋数据。"
+ "自我对弈就是通过MCTS下每一步棋,直到分出胜负,并返回下棋数据。Self-play is to play every move through MCTS until the winner is determined and the chess data is returned."
]
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def selfplay(self):\n",
- " self.game_borad.reload() # 初始化棋盘\n",
+ " self.game_borad.reload() # 初始化棋盘 # Initialize the board\n",
" states, mcts_probs, current_players = [], [], []\n",
" z = None\n",
" game_over = False\n",
" winnner = \"\"\n",
" start_time = time.time()\n",
- " while(not game_over): # 下棋循环,结束条件是分出胜负\n",
- " action, probs, win_rate = self.get_action(self.game_borad.state, self.temperature) # 通过MCTS算出下哪一步棋\n",
+ " while(not game_over): # 下棋循环,结束条件是分出胜负 # Chess loop, the end condition is to decide the winner\n",
+ " action, probs, win_rate = self.get_action(self.game_borad.state, self.temperature) # 通过MCTS算出下哪一步棋 # Calculate which move to play through MCTS\n",
" ################################################\n",
" # 这部分代码是跟我的设计有关的。因为在输入特征平面中我没有使用颜色特征,\n",
" # 所以传给神经网络数据时,要把当前选手转换成红色(先手),转换的其实是棋盘的棋子位置\n",
" # 这样神经网络预测的始终是红色先手方向该如何下棋\n",
+ " # This part of the code is related to my design. Because I did not use color features in the input feature plane,\n",
+ " # So when passing the neural network data, the current player must be converted to red (first move), which is actually the position of the chess pieces on the board\n",
+ " # In this way, the neural network always predicts how to play chess in the red first direction\n",
" state, palyer = self.mcts.try_flip(self.game_borad.state, self.game_borad.current_player, self.mcts.is_black_turn(self.game_borad.current_player))\n",
" states.append(state)\n",
" prob = np.zeros(labels_len)\n",
" # 神经网络返回的概率向量也需要转换,假如当前选手是黑色,转换成红色后,由于棋盘位置的变化,概率向量(走子集合)是基于红色棋盘的\n",
+ " # The probability vector returned by the neural network also needs to be converted. If the current player is black, after converting to red, due to the change of the board position, the probability vector (set of moves) is based on the red board\n",
" # 要把走子action转换成黑色选手的方向才行。明白我的意思吧?\n",
+ " # The move must be converted to the direction of the black player. See what I mean?\n",
" if self.mcts.is_black_turn(self.game_borad.current_player):\n",
" for idx in range(len(probs[0][0])):\n",
" act = \"\".join((str(9 - int(a)) if a.isdigit() else a) for a in probs[0][0][idx])\n",
@@ -1025,16 +1184,16 @@
" current_players.append(self.game_borad.current_player)\n",
"\n",
" last_state = self.game_borad.state\n",
- " self.game_borad.state = GameBoard.sim_do_action(action, self.game_borad.state) # 在棋盘上下算出的这步棋,得到新的棋盘状态\n",
- " self.game_borad.round += 1 # 更新回合数\n",
- " self.game_borad.current_player = \"w\" if self.game_borad.current_player == \"b\" else \"b\" # 切换当前选手\n",
- " if is_kill_move(last_state, self.game_borad.state) == 0: # 刚刚下的棋是否吃子了\n",
- " self.game_borad.restrict_round += 1 # 更新没有进展回合数\n",
+ " self.game_borad.state = GameBoard.sim_do_action(action, self.game_borad.state) # 在棋盘上下算出的这步棋,得到新的棋盘状态 # Calculate this move on the board and get the new board state\n",
+ " self.game_borad.round += 1 # 更新回合数 # Update round number\n",
+ " self.game_borad.current_player = \"w\" if self.game_borad.current_player == \"b\" else \"b\" # 切换当前选手 # Switch current player\n",
+ "            if is_kill_move(last_state, self.game_borad.state) == 0: # 刚刚下的棋是否吃子了 # Did the move just played capture a piece?\n",
+ " self.game_borad.restrict_round += 1 # 更新没有进展回合数 # Update the number of rounds without progress\n",
" else:\n",
" self.game_borad.restrict_round = 0\n",
"\n",
" if (self.game_borad.state.find('K') == -1 or self.game_borad.state.find('k') == -1): \n",
- " # 条件满足说明将/帅被吃了,游戏结束\n",
+ " # 条件满足说明将/帅被吃了,游戏结束 # If the conditions are met, the general/shuai was eaten, and the game is over\n",
" z = np.zeros(len(current_players))\n",
" if (self.game_borad.state.find('K') == -1):\n",
" winnner = \"b\"\n",
@@ -1044,48 +1203,55 @@
" z[np.array(current_players) != winnner] = -1.0\n",
" game_over = True\n",
" print(\"Game end. Winner is player : \", winnner, \" In {} steps\".format(self.game_borad.round - 1))\n",
- " elif self.game_borad.restrict_round >= 60: # 60回合没有进展(吃子),平局\n",
+ "            elif self.game_borad.restrict_round >= 60: # 60回合没有进展(吃子),平局 # 60 rounds with no progress (no capture), draw\n",
" z = np.zeros(len(current_players))\n",
" game_over = True\n",
" print(\"Game end. Tie in {} steps\".format(self.game_borad.round - 1))\n",
- " # 认输的部分没有实现\n",
+ "            # 认输的部分没有实现 # Resignation is not implemented\n",
" # elif(self.mcts.root.v < self.resign_threshold):\n",
" # pass\n",
" # elif(self.mcts.root.Q < self.resign_threshold):\n",
" # pass\n",
" if(game_over):\n",
- " self.mcts.reload() # 游戏结束,重置棋盘\n",
+ " self.mcts.reload() # 游戏结束,重置棋盘 # Game over, reset the board\n",
" print(\"Using time {} s\".format(time.time() - start_time))\n",
- " return zip(states, mcts_probs, z), len(z) # 返回下棋数据"
+ " return zip(states, mcts_probs, z), len(z) # 返回下棋数据 # Return the chess data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "### MCTS实现"
+ "### MCTS实现 \n",
+ "MCTS implementation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "关键的代码来了,函数通过MCTS进行若干次模拟(论文是1600次,我用了1200次),然后根据子节点的访问量决定要下哪步棋。"
+ "关键的代码来了,函数通过MCTS进行若干次模拟(论文是1600次,我用了1200次),然后根据子节点的访问量决定要下哪步棋。\n",
+ "\n",
+ "The key code is here. The function performs several simulations through MCTS (1600 times for the paper, I used 1200 times), and then decides which move to play based on the number of visits to the child nodes."
]
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"#@profile\n",
"def get_action(self, state, temperature = 1e-3):\n",
" # MCTS主函数,模拟下棋\n",
+ " # MCTS main function, simulated chess\n",
" self.mcts.main(state, self.game_borad.current_player, self.game_borad.restrict_round, self.playout_counts)\n",
" # 取得当前局面下所有子节点的合法走子和相应的访问量。\n",
" # 这个所有子节点可能并不会覆盖所有合法的走子,这个是由树搜索的质量决定的,加大模拟次数会搜索更多不同的走法,\n",
" # 就是加大思考的深度,考虑更多的局面,避免出现有些特别重要的棋步却没有考虑到的情况。\n",
+ " # Get the legal moves and corresponding visits of all child nodes in the current situation.\n",
+ " # All child nodes may not cover all legal moves. This is determined by the quality of the tree search. Increasing the number of simulations will search for more different moves.\n",
+ " # Is to increase the depth of thinking, consider more situations, and avoid situations where some particularly important moves are not considered.\n",
" actions_visits = [(act, nod.N) for act, nod in self.mcts.root.child.items()]\n",
" actions, visits = zip(*actions_visits)\n",
"\n",
@@ -1095,24 +1261,25 @@
"\n",
" if(self.exploration):\n",
" # 训练时,可以通过加入噪声来探索更多可能性的走子\n",
+ "        # During training, adding noise lets the search explore more possible moves\n",
" act = np.random.choice(actions, p=0.75 * probs + 0.25*np.random.dirichlet(0.3*np.ones(len(probs))))\n",
" else:\n",
- " act = np.random.choice(actions, p=probs) # 通过节点访问量的softmax选择最大可能性的走子\n",
+ "        act = np.random.choice(actions, p=probs) # 通过节点访问量的softmax选择最大可能性的走子 # Select the most likely move via the softmax of node visit counts\n",
"\n",
- " win_rate = self.mcts.Q(act) # 将节点的Q值当做胜率\n",
- " self.mcts.update_tree(act) # 更新搜索树,将算出的这步棋的局面作为树的根节点"
+ " win_rate = self.mcts.Q(act) # 将节点的Q值当做胜率 # Take the Q value of the node as the winning rate\n",
+ " self.mcts.update_tree(act) # 更新搜索树,将算出的这步棋的局面作为树的根节点 # Update the search tree and use the calculated position as the root node of the tree"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "来看看MCTS的类定义:"
+ "来看看MCTS的类定义: Take a look at the MCTS class definition:"
]
},
{
"cell_type": "code",
- "execution_count": 31,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
@@ -1123,27 +1290,27 @@
"cut_off_depth = 30\n",
"\n",
"class MCTS_tree(object):\n",
- " def __init__(self, in_state, in_forward, search_threads): # 参数search_threads我默认使用16个搜索线程\n",
+ " def __init__(self, in_state, in_forward, search_threads): # 参数search_threads我默认使用16个搜索线程 # Parameter search_threads I use 16 search threads by default\n",
" self.noise_eps = 0.25\n",
" self.dirichlet_alpha = 0.3 #0.03\n",
- " # 根节点的先验概率加入了噪声\n",
+ " # 根节点的先验概率加入了噪声 # The prior probability of the root node adds noise\n",
" self.p_ = (1 - self.noise_eps) * 1 + self.noise_eps * np.random.dirichlet([self.dirichlet_alpha])\n",
- " # 定义根节点,传入概率和局面(棋子位置)\n",
+ " # 定义根节点,传入概率和局面(棋子位置) # Define the root node, incoming probability and position (position of chess pieces)\n",
" self.root = leaf_node(None, self.p_, in_state)\n",
" self.c_puct = 5 #1.5\n",
- " # 保存前向传播(预测)函数\n",
+ " # 保存前向传播(预测)函数 # Save the forward propagation (prediction) function\n",
" self.forward = in_forward\n",
" self.node_lock = defaultdict(Lock)\n",
- " # 虚拟损失\n",
+ " # 虚拟损失 #Virtual Loss\n",
" self.virtual_loss = 3\n",
- " # 用来保存正在扩展的节点\n",
+ " # 用来保存正在扩展的节点 # Used to save the node being expanded\n",
" self.now_expanding = set()\n",
- " # 保存扩展过的节点\n",
+ " # 保存扩展过的节点 # Save the expanded node\n",
" self.expanded = set()\n",
" self.cut_off_depth = 30\n",
" # self.QueueItem = namedtuple(\"QueueItem\", \"feature future\")\n",
" self.sem = asyncio.Semaphore(search_threads)\n",
- " # 保存搜索线程的队列\n",
+ " # 保存搜索线程的队列 # Save the queue of search threads\n",
" self.queue = Queue(search_threads)\n",
" self.loop = asyncio.get_event_loop()\n",
" self.running_simulation_num = 0"
@@ -1153,39 +1320,41 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "叶子节点的类定义:"
+ "叶子节点的类定义: The class definition of the leaf node:"
]
},
{
"cell_type": "code",
- "execution_count": 28,
+ "execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"class leaf_node(object):\n",
- " # 定义节点时,传入父节点,概率和棋盘状态(棋子位置)\n",
+ " # 定义节点时,传入父节点,概率和棋盘状态(棋子位置) # When defining a node, pass in the parent node, probability and board state (position of the chess pieces)\n",
" def __init__(self, in_parent, in_prior_p, in_state):\n",
- " self.P = in_prior_p # 保存概率,其他值默认是0\n",
+ " self.P = in_prior_p # 保存概率,其他值默认是0 # Save the probability, other values are 0 by default\n",
" self.Q = 0\n",
" self.N = 0\n",
" self.v = 0\n",
" self.U = 0\n",
" self.W = 0\n",
- " self.parent = in_parent # 保存父节点\n",
- " self.child = {} # 子节点默认是空\n",
- " self.state = in_state # 保存棋盘状态"
+ " self.parent = in_parent # 保存父节点 # Save the parent node\n",
+ " self.child = {} # 子节点默认是空 # The child node is empty by default\n",
+ " self.state = in_state # 保存棋盘状态 # Save the board state"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "MCTS主函数,模拟下棋:"
+ "MCTS主函数,模拟下棋:\n",
+ "\n",
+ "MCTS main function, simulating chess:"
]
},
{
"cell_type": "code",
- "execution_count": 32,
+ "execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@@ -1196,23 +1365,23 @@
"#@profile\n",
"def main(self, state, current_player, restrict_round, playouts):\n",
" node = self.root\n",
- " # 先通过神经网络扩展根节点\n",
+ " # 先通过神经网络扩展根节点 # First expand the root node through the neural network\n",
" if not self.is_expanded(node): # and node.is_leaf() # node.state\n",
" # print('Expadning Root Node...')\n",
" positions = self.generate_inputs(node.state, current_player)\n",
" positions = np.expand_dims(positions, 0)\n",
- " action_probs, value = self.forward(positions) # 通过神经网络预测走子概率\n",
- " if self.is_black_turn(current_player): # 判断走子概率是否需要根据先手/后手进行转换\n",
+ " action_probs, value = self.forward(positions) # 通过神经网络预测走子概率 # Predict the probability of walking through a neural network\n",
+ "        if self.is_black_turn(current_player): # 判断走子概率是否需要根据先手/后手进行转换 # Determine whether the move probabilities need to be flipped depending on first/second player\n",
" action_probs = cchess_main.flip_policy(action_probs)\n",
- " # 取得当前局面所有合法的走子,有关中国象棋的算法就不在这里讨论了,感兴趣可以查看源代码\n",
+ " # 取得当前局面所有合法的走子,有关中国象棋的算法就不在这里讨论了,感兴趣可以查看源代码 # Get all the legal moves in the current situation. The algorithm of Chinese chess is not discussed here. If you are interested, you can view the source code\n",
" moves = GameBoard.get_legal_moves(node.state, current_player) \n",
" # print(\"current_player : \", current_player)\n",
" # print(moves)\n",
- " node.expand(moves, action_probs) # 扩展节点\n",
- " self.expanded.add(node) # 将当前节点加入到已扩展节点集合中\n",
+ " node.expand(moves, action_probs) # 扩展节点 # Expansion node\n",
+ " self.expanded.add(node) # 将当前节点加入到已扩展节点集合中 # Add the current node to the expanded node set\n",
"\n",
" coroutine_list = []\n",
- " for _ in range(playouts): # 模拟1200次,异步的方式执行,一共使用了16个线程\n",
+ " for _ in range(playouts): # 模拟1200次,异步的方式执行,一共使用了16个线程 # Simulate 1200 times, execute asynchronously, using a total of 16 threads\n",
" coroutine_list.append(self.tree_search(node, current_player, restrict_round))\n",
" coroutine_list.append(self.prediction_worker())\n",
" self.loop.run_until_complete(asyncio.gather(*coroutine_list))"
@@ -1220,7 +1389,7 @@
},
{
"cell_type": "code",
- "execution_count": 35,
+ "execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
@@ -1229,57 +1398,59 @@
" self.running_simulation_num += 1\n",
"\n",
" # reduce parallel search number\n",
- " with await self.sem: # 异步执行树搜索,共16个线程\n",
+ "    with await self.sem: # 异步执行树搜索,共16个线程 # Asynchronous tree search, a total of 16 threads\n",
" value = await self.start_tree_search(node, current_player, restrict_round)\n",
" self.running_simulation_num -= 1\n",
"\n",
" return value\n",
"\n",
- "# ***树搜索函数***\n",
+ "# ***树搜索函数*** # ***Tree search function***\n",
"async def start_tree_search(self, node, current_player, restrict_round)->float:\n",
" \"\"\"Monte Carlo Tree search Select,Expand,Evauate,Backup\"\"\"\n",
" now_expanding = self.now_expanding\n",
"\n",
- " # 如果当前节点正在被扩展,就小睡一会\n",
+ " # 如果当前节点正在被扩展,就小睡一会 # If the current node is being expanded, take a nap\n",
" while node in now_expanding:\n",
" await asyncio.sleep(1e-4)\n",
"\n",
- " if not self.is_expanded(node): # 如果节点没有被扩展过,要扩展这个节点\n",
+ " if not self.is_expanded(node): # 如果节点没有被扩展过,要扩展这个节点 # If the node has not been expanded, expand this node\n",
" \"\"\"is leaf node try evaluate and expand\"\"\"\n",
" # add leaf node to expanding list\n",
- " self.now_expanding.add(node) # 加入到正在扩展集合中\n",
+ " self.now_expanding.add(node) # 加入到正在扩展集合中 # Add to the expanding collection\n",
"\n",
" positions = self.generate_inputs(node.state, current_player)\n",
"\n",
" # 这里有个trick,就是并不是一个节点一个节点的使用神经网络预测结果,这样效率太低\n",
+ "        # There is a trick here: we do not run the neural network prediction one node at a time — that would be far too inefficient\n",
" # 而是放到队列中,通过prediction_worker函数统一管理队列,将队列中的一组(16个)输入传给神经网络,得到预测结果\n",
+ " # Instead, put it in the queue, manage the queue uniformly through the prediction_worker function, and pass a set of (16) inputs in the queue to the neural network to get the prediction result\n",
" # 这一切都是异步的\n",
+ " # All this is asynchronous\n",
" # push extracted dihedral features of leaf node to the evaluation queue\n",
" future = await self.push_queue(positions) # type: Future\n",
" await future\n",
" action_probs, value = future.result()\n",
"\n",
- " if self.is_black_turn(current_player): # 根据当前棋手的颜色决定是否对走子概率翻转\n",
+ " if self.is_black_turn(current_player): # 根据当前棋手的颜色决定是否对走子概率翻转 # Determine whether to flip the move probability according to the current player's color\n",
" action_probs = cchess_main.flip_policy(action_probs)\n",
"\n",
" moves = GameBoard.get_legal_moves(node.state, current_player)\n",
" # print(\"current_player : \", current_player)\n",
" # print(moves)\n",
- " node.expand(moves, action_probs) # Expand操作,使用神经网络预测的结果扩展当前节点\n",
+ " node.expand(moves, action_probs) # Expand操作,使用神经网络预测的结果扩展当前节点 # Expand operation, use the results predicted by the neural network to expand the current node\n",
" self.expanded.add(node) # \n",
"\n",
" # remove leaf node from expanding list\n",
" self.now_expanding.remove(node)\n",
"\n",
" # must invert, because alternative layer has opposite objective\n",
- " return value[0] * -1 # 返回神经网络预测的胜率,一定要取负,理由在论文分析时已经说过了\n",
- "\n",
- " else: # 如果节点被扩展过,执行Select\n",
+ "        return value[0] * -1 # 返回神经网络预测的胜率,一定要取负,理由在论文分析时已经说过了 # Return the win rate predicted by the neural network; it must be negated — the reason was explained in the paper analysis\n",
+ " else: # 如果节点被扩展过,执行Select # If the node has been expanded, execute Select\n",
" \"\"\"node has already expanded. Enter select phase.\"\"\"\n",
" # select child node with maximum action scroe\n",
" last_state = node.state\n",
"\n",
- " action, node = node.select_new(c_PUCT) # Select操作,根据Q+U最大选择节点\n",
+ " action, node = node.select_new(c_PUCT) # Select操作,根据Q+U最大选择节点 # Select operation, select the node according to the maximum Q+U\n",
" current_player = \"w\" if current_player == \"b\" else \"b\"\n",
" if is_kill_move(last_state, node.state) == 0:\n",
" restrict_round += 1\n",
@@ -1288,38 +1459,41 @@
" last_state = node.state\n",
"\n",
" # 为选择的节点添加虚拟损失,防止其他线程继续探索这个节点,增加探索多样性\n",
+ " # Add a virtual loss to the selected node to prevent other threads from continuing to explore this node and increase the diversity of exploration\n",
" # add virtual loss\n",
" node.N += virtual_loss\n",
" node.W += -virtual_loss\n",
"\n",
" # evolve game board status\n",
" # 判断这个节点状态下,是否分出胜负\n",
+ "            # Check whether the game is decided in this node's state\n",
" if (node.state.find('K') == -1 or node.state.find('k') == -1):\n",
- " # 分出胜负了,设置胜率1或者0\n",
+ " # 分出胜负了,设置胜率1或者0 # The winner is divided, set the winning rate to 1 or 0\n",
" if (node.state.find('K') == -1):\n",
" value = 1.0 if current_player == \"b\" else -1.0\n",
" if (node.state.find('k') == -1):\n",
" value = -1.0 if current_player == \"b\" else 1.0\n",
- " # 一定要符号取反\n",
+ " # 一定要符号取反 # The sign must be reversed\n",
" value = value * -1\n",
- " elif restrict_round >= 60: # 60回合无进展(吃子),平局\n",
+ "            elif restrict_round >= 60: # 60回合无进展(吃子),平局 # 60 rounds with no progress (no capture), draw\n",
" value = 0.0\n",
" else:\n",
- " # 没有分出胜负,在当前节点局面下继续树搜索\n",
+ " # 没有分出胜负,在当前节点局面下继续树搜索 # There is no winner or loser, continue the tree search under the current node position\n",
" value = await self.start_tree_search(node, current_player, restrict_round) # next move\n",
"\n",
- " # 当前节点搜索完毕,去掉虚拟损失,恢复节点状态\n",
+ " # 当前节点搜索完毕,去掉虚拟损失,恢复节点状态 # The current node search is completed, remove the virtual loss and restore the node state\n",
" node.N += -virtual_loss\n",
" node.W += virtual_loss\n",
"\n",
" # on returning search path\n",
" # update: N, W, Q, U\n",
- " node.back_up_value(value) # 执行节点的Backup操作,更新节点的各类数值\n",
+ " node.back_up_value(value) # 执行节点的Backup操作,更新节点的各类数值 # Execute the backup operation of the node and update the various values of the node\n",
"\n",
" # must invert\n",
- " return value * -1 # 一定要符号取反\n",
+ " return value * -1 # 一定要符号取反 # The sign must be reversed\n",
"\n",
"# 管理队列数据,一旦队列中有数据,就统一传给神经网络,获得预测结果\n",
+ "# Manage queue data, once there is data in the queue, it will be uniformly transmitted to the neural network to obtain the prediction result\n",
"async def prediction_worker(self):\n",
" \"\"\"For better performance, queueing prediction requests and predict together in this worker.\n",
" speed up about 45sec -> 15sec for example.\n",
@@ -1351,20 +1525,21 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "最后看看叶子节点的Select、Expand和Backup的实现。"
+ "最后看看叶子节点的Select、Expand和Backup的实现。\n",
+ "Finally, look at the implementation of Select, Expand and Backup of leaf nodes."
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
- "# Select,选择Q+U最大的节点\n",
+ "# Select,选择Q+U最大的节点 # Select, select the node with the largest Q+U\n",
"def select_new(self, c_puct):\n",
" return max(self.child.items(), key=lambda node: node[1].get_Q_plus_U_new(c_puct))\n",
"\n",
- "# 返回节点的Q+U\n",
+ "# 返回节点的Q+U # Return the Q+U of the node\n",
"def get_Q_plus_U_new(self, c_puct):\n",
" \"\"\"Calculate and return the value for this node: a combination of leaf evaluations, Q, and\n",
" this node's prior adjusted for its visit count, u\n",
@@ -1375,59 +1550,65 @@
" return self.Q + U\n",
"\n",
"# 参数是所有合法走子moves,和神经网络预测的概率向量action_probs\n",
+ "# The parameter is all legal moves, and the probability vector predicted by the neural network action_probs\n",
"#@profile\n",
"def expand(self, moves, action_probs):\n",
" tot_p = 1e-8\n",
" action_probs = action_probs.flatten() \n",
" \n",
" for action in moves:\n",
- " # 模拟执行每一个合法走子,得到相应的局面(棋子位置)\n",
+ " # 模拟执行每一个合法走子,得到相应的局面(棋子位置) # Simulate each legal move and get the corresponding position (position of the chess piece)\n",
" in_state = GameBoard.sim_do_action(action, self.state)\n",
- " # 从概率向量中得到当前走子对应的概率\n",
+ " # 从概率向量中得到当前走子对应的概率 # Get the probability corresponding to the current move from the probability vector\n",
" mov_p = action_probs[label2i[action]]\n",
- " # 创建新节点,传入父节点(因为是扩展当前节点,所以当前节点是新节点的父节点)、概率、棋盘状态\n",
+ " # 创建新节点,传入父节点(因为是扩展当前节点,所以当前节点是新节点的父节点)、概率、棋盘状态 # Create a new node, pass in the parent node (because the current node is expanded, so the current node is the parent node of the new node), probability, and board state\n",
" new_node = leaf_node(self, mov_p, in_state)\n",
- " self.child[action] = new_node # 将新节点添加到当前节点的子节点集合中\n",
+ " self.child[action] = new_node # 将新节点添加到当前节点的子节点集合中 # Add the new node to the set of child nodes of the current node\n",
" tot_p += mov_p \n",
" \n",
" for a, n in self.child.items():\n",
" n.P /= tot_p\n",
"\n",
- "# 更新节点的各项参数\n",
+ "# 更新节点的各项参数 # Update the parameters of the node\n",
"def back_up_value(self, value):\n",
- " self.N += 1 # 计数加一\n",
- " self.W += value # 更新总行动价值\n",
+ " self.N += 1 # 计数加一 # Count plus one\n",
+ " self.W += value # 更新总行动价值 # Update total action value\n",
" self.v = value \n",
- " self.Q = self.W / self.N # 更新平均行动价值\n",
- " self.U = c_PUCT * self.P * np.sqrt(self.parent.N) / ( 1 + self.N) # 更新U"
+ " self.Q = self.W / self.N # 更新平均行动价值 # Update average action value\n",
+ " self.U = c_PUCT * self.P * np.sqrt(self.parent.N) / ( 1 + self.N) # 更新U # Update U"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "以上,就是自对弈训练神经网络的全部内容了,关于中国象棋的实现部分请看项目代码。"
+ "以上,就是自对弈训练神经网络的全部内容了,关于中国象棋的实现部分请看项目代码。\n",
+ "The above is the whole content of training the neural network by self-play. For the Chinese-chess implementation details, please see the project code."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## 最后"
+ "## 最后 Conclusion"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "我来说说训练情况,因为是从白板一块开始训练,刚开始都是乱下,从乱下的经验当中学会下棋是需要大量对弈才行的。解的空间是很稀疏的,相当于100个数据,有99个是负例,只有1个正例。论文中训练了700K次的mini-batch,国际象棋开源项目[`chess-alpha-zero`](https://github.com/Zeta36/chess-alpha-zero)也训练了10K次。我呢,训练不到4K次,模型刚刚学会用象和士防守,总之仍然下棋很烂。如果您有条件可以再多训练试试,我自从收到信用卡扣款400美元通知以后就把aws下线了:D 贫穷限制了我的想象力O(∩_∩)O"
+ "我来说说训练情况,因为是从白板一块开始训练,刚开始都是乱下,从乱下的经验当中学会下棋是需要大量对弈才行的。解的空间是很稀疏的,相当于100个数据,有99个是负例,只有1个正例。论文中训练了700K次的mini-batch,国际象棋开源项目[`chess-alpha-zero`](https://github.com/Zeta36/chess-alpha-zero)也训练了10K次。我呢,训练不到4K次,模型刚刚学会用象和士防守,总之仍然下棋很烂。如果您有条件可以再多训练试试,我自从收到信用卡扣款400美元通知以后就把aws下线了:D 贫穷限制了我的想象力O(∩_∩)O\n",
+ "\n",
+ "Let me talk about the training situation. Because training starts from a blank slate, at the beginning the model just plays random moves, and learning to play chess from random play requires a great many games. The solution space is very sparse — equivalent to 100 data points where 99 are negative examples and only 1 is positive. In the paper, the mini-batch was trained 700K times, and [`chess-alpha-zero`](https://github.com/Zeta36/chess-alpha-zero), an open source chess project, was also trained 10K times. As for me, I have trained less than 4K times. The model has just learned to defend with elephants and advisors; in short, it still plays chess badly. If you have the resources, you can try more training. I have taken aws offline since I received the 400 USD credit card charge notification: D Poverty limits my imagination O(∩_∩)O"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## 参考资料"
+ "## 参考资料\n",
+ "\n",
+ "Reference material"
]
},
{
@@ -1445,20 +1626,44 @@
" - 五子棋开源项目[`AlphaZero_Gomoku`](https://github.com/junxiaosong/AlphaZero_Gomoku)\n",
" - 黑白棋开源项目[`reversi-alpha-zero`](https://github.com/mokemokechicken/reversi-alpha-zero)\n",
" - 中国象棋开源项目[`IntelliChess`](https://github.com/lifei96/IntelliChess)\n",
- " - 中国象棋UI项目[`ChineseChess`](https://github.com/Linzertorte/ChineseChess)"
+ " - 中国象棋UI项目[`ChineseChess`](https://github.com/Linzertorte/ChineseChess)\n",
+ " \n",
+ " \n",
+ " Translation (English)\n",
+ " \n",
+    " - [`In-depth understanding of AlphaGo Zero`](https://charlesliuyx.github.io/2017/10/18/深入浅出看懂AlphaGo元/)\n",
+ " - [`Understand how AlphaGo plays chess`](https://charlesliuyx.github.io/2017/05/27/AlphaGo运行原理解析/)\n",
+ " - Go open source project [`AlphaGOZero-python-tensorflow`](https://github.com/yhyu13/AlphaGOZero-python-tensorflow)\n",
+ " - [`TensorFlow Multi-GPU Parallel Computing Example---MNIST`](https://gitee.com/liyang619/mnist_multi_gpu_batching_train/blob/master/mnist_multi_gpu_batching_train.py)\n",
+ " - Chess open source project [`chess-alpha-zero`](https://github.com/Zeta36/chess-alpha-zero)\n",
+ " - [`FEN file format`](http://www.xqbase.com/protocol/cchess_fen.htm)\n",
+ " - [`Movement representation`](http://www.xqbase.com/protocol/cchess_move.htm)\n",
+ " - [`Chinese Chess General Engine Protocol Version: 3.0`](http://www.xqbase.com/protocol/cchess_ucci.htm)\n",
+ " - Gobang open source project [`AlphaZero_Gomoku`](https://github.com/junxiaosong/AlphaZero_Gomoku)\n",
+ " - Othello open source project [`reversi-alpha-zero`](https://github.com/mokemokechicken/reversi-alpha-zero)\n",
+ " - Chinese Chess Open Source Project [`IntelliChess`](https://github.com/lifei96/IntelliChess)\n",
+ " - Chinese Chess UI project [`ChineseChess`](https://github.com/Linzertorte/ChineseChess)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## 今天的分享就到这里,请多指教~"
+ "## 今天的分享就到这里,请多指教~\n",
+    "That's all for today's sharing — comments and suggestions are welcome~"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -1472,7 +1677,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.7"
+ "version": "3.7.3"
}
},
"nbformat": 4,
diff --git a/chessman/__pycache__/Bing.cpython-312.pyc b/chessman/__pycache__/Bing.cpython-312.pyc
new file mode 100644
index 0000000..53ef6e8
Binary files /dev/null and b/chessman/__pycache__/Bing.cpython-312.pyc differ
diff --git a/chessman/__pycache__/Che.cpython-312.pyc b/chessman/__pycache__/Che.cpython-312.pyc
new file mode 100644
index 0000000..f6f91a8
Binary files /dev/null and b/chessman/__pycache__/Che.cpython-312.pyc differ
diff --git a/chessman/__pycache__/Ma.cpython-312.pyc b/chessman/__pycache__/Ma.cpython-312.pyc
new file mode 100644
index 0000000..d3c0b06
Binary files /dev/null and b/chessman/__pycache__/Ma.cpython-312.pyc differ
diff --git a/chessman/__pycache__/Pao.cpython-312.pyc b/chessman/__pycache__/Pao.cpython-312.pyc
new file mode 100644
index 0000000..8f8f523
Binary files /dev/null and b/chessman/__pycache__/Pao.cpython-312.pyc differ
diff --git a/chessman/__pycache__/Shi.cpython-312.pyc b/chessman/__pycache__/Shi.cpython-312.pyc
new file mode 100644
index 0000000..b6337ac
Binary files /dev/null and b/chessman/__pycache__/Shi.cpython-312.pyc differ
diff --git a/chessman/__pycache__/Shuai.cpython-312.pyc b/chessman/__pycache__/Shuai.cpython-312.pyc
new file mode 100644
index 0000000..443a6aa
Binary files /dev/null and b/chessman/__pycache__/Shuai.cpython-312.pyc differ
diff --git a/chessman/__pycache__/Xiang.cpython-312.pyc b/chessman/__pycache__/Xiang.cpython-312.pyc
new file mode 100644
index 0000000..205f589
Binary files /dev/null and b/chessman/__pycache__/Xiang.cpython-312.pyc differ
diff --git a/chessman/__pycache__/__init__.cpython-312.pyc b/chessman/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000..8591357
Binary files /dev/null and b/chessman/__pycache__/__init__.cpython-312.pyc differ
diff --git a/main.py b/main.py
index 5f09bbb..8416c59 100755
--- a/main.py
+++ b/main.py
@@ -1,1584 +1,1403 @@
-#coding:utf-8
-from asyncio import Future
-import asyncio
-from asyncio.queues import Queue
-import uvloop
-asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
-
-import tensorflow as tf
-import numpy as np
-import os
-import sys
-import random
-import time
-import argparse
-from collections import deque, defaultdict, namedtuple
-import copy
-from policy_value_network import *
-from policy_value_network_gpus import *
-import scipy.stats
-from threading import Lock
-from concurrent.futures import ThreadPoolExecutor
-
-def flipped_uci_labels(param):
- def repl(x):
- return "".join([(str(9 - int(a)) if a.isdigit() else a) for a in x])
-
- return [repl(x) for x in param]
-
-# 创建所有合法走子UCI,size 2086
-def create_uci_labels():
- labels_array = []
- letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']
- numbers = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
-
- Advisor_labels = ['d7e8', 'e8d7', 'e8f9', 'f9e8', 'd0e1', 'e1d0', 'e1f2', 'f2e1',
- 'd2e1', 'e1d2', 'e1f0', 'f0e1', 'd9e8', 'e8d9', 'e8f7', 'f7e8']
- Bishop_labels = ['a2c4', 'c4a2', 'c0e2', 'e2c0', 'e2g4', 'g4e2', 'g0i2', 'i2g0',
- 'a7c9', 'c9a7', 'c5e7', 'e7c5', 'e7g9', 'g9e7', 'g5i7', 'i7g5',
- 'a2c0', 'c0a2', 'c4e2', 'e2c4', 'e2g0', 'g0e2', 'g4i2', 'i2g4',
- 'a7c5', 'c5a7', 'c9e7', 'e7c9', 'e7g5', 'g5e7', 'g9i7', 'i7g9']
- # King_labels = ['d0d7', 'd0d8', 'd0d9', 'd1d7', 'd1d8', 'd1d9', 'd2d7', 'd2d8', 'd2d9',
- # 'd7d0', 'd7d1', 'd7d2', 'd8d0', 'd8d1', 'd8d2', 'd9d0', 'd9d1', 'd9d2',
- # 'd0d7', 'd0d8', 'd0d9', 'd1d7', 'd1d8', 'd1d9', 'd2d7', 'd2d8', 'd2d9',
- # 'd0d7', 'd0d8', 'd0d9', 'd1d7', 'd1d8', 'd1d9', 'd2d7', 'd2d8', 'd2d9',
- # 'd0d7', 'd0d8', 'd0d9', 'd1d7', 'd1d8', 'd1d9', 'd2d7', 'd2d8', 'd2d9',
- # 'd0d7', 'd0d8', 'd0d9', 'd1d7', 'd1d8', 'd1d9', 'd2d7', 'd2d8', 'd2d9']
-
- for l1 in range(9):
- for n1 in range(10):
- destinations = [(t, n1) for t in range(9)] + \
- [(l1, t) for t in range(10)] + \
- [(l1 + a, n1 + b) for (a, b) in
- [(-2, -1), (-1, -2), (-2, 1), (1, -2), (2, -1), (-1, 2), (2, 1), (1, 2)]] # 马走日
- for (l2, n2) in destinations:
- if (l1, n1) != (l2, n2) and l2 in range(9) and n2 in range(10):
- move = letters[l1] + numbers[n1] + letters[l2] + numbers[n2]
- labels_array.append(move)
-
- for p in Advisor_labels:
- labels_array.append(p)
-
- for p in Bishop_labels:
- labels_array.append(p)
-
- return labels_array
-
-def create_position_labels():
- labels_array = []
- letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']
- letters.reverse()
- numbers = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
-
- for l1 in range(9):
- for n1 in range(10):
- move = letters[8 - l1] + numbers[n1]
- labels_array.append(move)
-# labels_array.reverse()
- return labels_array
-
-def create_position_labels_reverse():
- labels_array = []
- letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']
- letters.reverse()
- numbers = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
-
- for l1 in range(9):
- for n1 in range(10):
- move = letters[l1] + numbers[n1]
- labels_array.append(move)
- labels_array.reverse()
- return labels_array
-
-class leaf_node(object):
- def __init__(self, in_parent, in_prior_p, in_state):
- self.P = in_prior_p
- self.Q = 0
- self.N = 0
- self.v = 0
- self.U = 0
- self.W = 0
- self.parent = in_parent
- self.child = {}
- self.state = in_state
-
- def is_leaf(self):
- return self.child == {}
-
- def get_Q_plus_U_new(self, c_puct):
- """Calculate and return the value for this node: a combination of leaf evaluations, Q, and
- this node's prior adjusted for its visit count, u
- c_puct -- a number in (0, inf) controlling the relative impact of values, Q, and
- prior probability, P, on this node's score.
- """
- # self._u = c_puct * self._P * np.sqrt(self._parent._n_visits) / (1 + self._n_visits)
- U = c_puct * self.P * np.sqrt(self.parent.N) / ( 1 + self.N)
- return self.Q + U
-
- def get_Q_plus_U(self, c_puct):
- """Calculate and return the value for this node: a combination of leaf evaluations, Q, and
- this node's prior adjusted for its visit count, u
- c_puct -- a number in (0, inf) controlling the relative impact of values, Q, and
- prior probability, P, on this node's score.
- """
- # self._u = c_puct * self._P * np.sqrt(self._parent._n_visits) / (1 + self._n_visits)
- self.U = c_puct * self.P * np.sqrt(self.parent.N) / ( 1 + self.N)
- return self.Q + self.U
-
- # def select_move_by_action_score(self, noise=True):
- #
- # # P = params[self.lookup['P']]
- # # N = params[self.lookup['N']]
- # # Q = params[self.lookup['W']] / (N + 1e-8)
- # # U = c_PUCT * P * np.sqrt(np.sum(N)) / (1 + N)
- #
- # ret_a = None
- # ret_n = None
- # action_idx = {}
- # action_score = []
- # i = 0
- # for a, n in self.child.items():
- # U = c_PUCT * n.P * np.sqrt(n.parent.N) / ( 1 + n.N)
- # action_idx[i] = (a, n)
- #
- # if noise:
- # action_score.append(n.Q + U * (0.75 * n.P + 0.25 * dirichlet([.03] * (go.N ** 2 + 1))) / (n.P + 1e-8))
- # else:
- # action_score.append(n.Q + U)
- # i += 1
- # # if(n.Q + n.U > max_Q_plus_U):
- # # max_Q_plus_U = n.Q + n.U
- # # ret_a = a
- # # ret_n = n
- #
- # action_t = int(np.argmax(action_score[:-1]))
- #
- # return ret_a, ret_n
- # # return action_t
- def select_new(self, c_puct):
- return max(self.child.items(), key=lambda node: node[1].get_Q_plus_U_new(c_puct))
-
- def select(self, c_puct):
- # max_Q_plus_U = 1e-10
- # ret_a = None
- # ret_n = None
- # for a, n in self.child.items():
- # n.U = c_puct * n.P * np.sqrt(n.parent.N) / ( 1 + n.N)
- # if(n.Q + n.U > max_Q_plus_U):
- # max_Q_plus_U = n.Q + n.U
- # ret_a = a
- # ret_n = n
- # return ret_a, ret_n
- return max(self.child.items(), key=lambda node: node[1].get_Q_plus_U(c_puct))
-
- #@profile
- def expand(self, moves, action_probs):
- tot_p = 1e-8
- action_probs = action_probs.flatten() #.squeeze()
- # print("expand action_probs shape : ", action_probs.shape)
- for action in moves:
- in_state = GameBoard.sim_do_action(action, self.state)
- mov_p = action_probs[label2i[action]]
- new_node = leaf_node(self, mov_p, in_state)
- self.child[action] = new_node
- tot_p += mov_p
-
- for a, n in self.child.items():
- n.P /= tot_p
-
- def back_up_value(self, value):
- self.N += 1
- self.W += value
- self.v = value
- self.Q = self.W / self.N # node.Q += 1.0*(value - node.Q) / node.N
- self.U = c_PUCT * self.P * np.sqrt(self.parent.N) / ( 1 + self.N)
- # node = node.parent
- # value = -value
-
- def backup(self, value):
- node = self
- while node != None:
- node.N += 1
- node.W += value
- node.v = value
- node.Q = node.W / node.N # node.Q += 1.0*(value - node.Q) / node.N
- node = node.parent
- value = -value
-
-pieces_order = 'KARBNPCkarbnpc' # 9 x 10 x 14
-ind = {pieces_order[i]: i for i in range(14)}
-
-labels_array = create_uci_labels()
-labels_len = len(labels_array)
-flipped_labels = flipped_uci_labels(labels_array)
-unflipped_index = [labels_array.index(x) for x in flipped_labels]
-
-i2label = {i: val for i, val in enumerate(labels_array)}
-label2i = {val: i for i, val in enumerate(labels_array)}
-
-def get_pieces_count(state):
- count = 0
- for s in state:
- if s.isalpha():
- count += 1
- return count
-
-def is_kill_move(state_prev, state_next):
- return get_pieces_count(state_prev) - get_pieces_count(state_next)
-
-QueueItem = namedtuple("QueueItem", "feature future")
-c_PUCT = 5
-virtual_loss = 3
-cut_off_depth = 30
-
-class MCTS_tree(object):
- def __init__(self, in_state, in_forward, search_threads):
- self.noise_eps = 0.25
- self.dirichlet_alpha = 0.3 #0.03
- self.p_ = (1 - self.noise_eps) * 1 + self.noise_eps * np.random.dirichlet([self.dirichlet_alpha])
- self.root = leaf_node(None, self.p_, in_state)
- self.c_puct = 5 #1.5
- # self.policy_network = in_policy_network
- self.forward = in_forward
- self.node_lock = defaultdict(Lock)
-
- self.virtual_loss = 3
- self.now_expanding = set()
- self.expanded = set()
- self.cut_off_depth = 30
- # self.QueueItem = namedtuple("QueueItem", "feature future")
- self.sem = asyncio.Semaphore(search_threads)
- self.queue = Queue(search_threads)
- self.loop = asyncio.get_event_loop()
- self.running_simulation_num = 0
-
- def reload(self):
- self.root = leaf_node(None, self.p_,
- "RNBAKABNR/9/1C5C1/P1P1P1P1P/9/9/p1p1p1p1p/1c5c1/9/rnbakabnr") # "rnbakabnr/9/1c5c1/p1p1p1p1p/9/9/P1P1P1P1P/1C5C1/9/RNBAKABNR"
- self.expanded = set()
-
-
- def Q(self, move) -> float:
- ret = 0.0
- find = False
- for a, n in self.root.child.items():
- if move == a:
- ret = n.Q
- find = True
- if(find == False):
- print("{} not exist in the child".format(move))
- return ret
-
- def update_tree(self, act):
- # if(act in self.root.child):
- self.expanded.discard(self.root)
- self.root = self.root.child[act]
- self.root.parent = None
- # else:
- # self.root = leaf_node(None, self.p_, in_state)
-
-
- # def do_simulation(self, state, current_player, restrict_round):
- # node = self.root
- # last_state = state
- # while(node.is_leaf() == False):
- # # print("do_simulation while current_player : ", current_player)
- # with self.node_lock[node]:
- # action, node = node.select(self.c_puct)
- # current_player = "w" if current_player == "b" else "b"
- # if is_kill_move(last_state, node.state) == 0:
- # restrict_round += 1
- # else:
- # restrict_round = 0
- # last_state = node.state
- #
- # positions = self.generate_inputs(node.state, current_player)
- # positions = np.expand_dims(positions, 0)
- # action_probs, value = self.forward(positions)
- # if self.is_black_turn(current_player):
- # action_probs = cchess_main.flip_policy(action_probs)
- #
- # # print("action_probs shape : ", action_probs.shape) #(1, 2086)
- # with self.node_lock[node]:
- # if(node.state.find('K') == -1 or node.state.find('k') == -1):
- # if (node.state.find('K') == -1):
- # value = 1.0 if current_player == "b" else -1.0
- # if (node.state.find('k') == -1):
- # value = -1.0 if current_player == "b" else 1.0
- # elif restrict_round >= 60:
- # value = 0.0
- # else:
- # moves = GameBoard.get_legal_moves(node.state, current_player)
- # # print("current_player : ", current_player)
- # # print(moves)
- # node.expand(moves, action_probs)
- #
- # # if(node.parent != None):
- # # node.parent.N += self.virtual_loss
- # node.N += self.virtual_loss
- # node.W += -self.virtual_loss
- # node.Q = node.W / node.N
- #
- # # time.sleep(0.1)
- #
- # with self.node_lock[node]:
- # # if(node.parent != None):
- # # node.parent.N += -self.virtual_loss# + 1
- # node.N += -self.virtual_loss# + 1
- # node.W += self.virtual_loss# + leaf_v
- # # node.Q = node.W / node.N
- #
- # node.backup(-value)
-
- def is_expanded(self, key) -> bool:
- """Check expanded status"""
- return key in self.expanded
-
- async def tree_search(self, node, current_player, restrict_round) -> float:
- """Independent MCTS, stands for one simulation"""
- self.running_simulation_num += 1
-
- # reduce parallel search number
- with await self.sem:
- value = await self.start_tree_search(node, current_player, restrict_round)
- # logger.debug(f"value: {value}")
- # logger.debug(f'Current running threads : {RUNNING_SIMULATION_NUM}')
- self.running_simulation_num -= 1
-
- return value
-
- async def start_tree_search(self, node, current_player, restrict_round)->float:
- """Monte Carlo Tree search Select,Expand,Evauate,Backup"""
- now_expanding = self.now_expanding
-
- while node in now_expanding:
- await asyncio.sleep(1e-4)
-
- if not self.is_expanded(node): # and node.is_leaf()
- """is leaf node try evaluate and expand"""
- # add leaf node to expanding list
- self.now_expanding.add(node)
-
- positions = self.generate_inputs(node.state, current_player)
- # positions = np.expand_dims(positions, 0)
-
- # push extracted dihedral features of leaf node to the evaluation queue
- future = await self.push_queue(positions) # type: Future
- await future
- action_probs, value = future.result()
-
- # action_probs, value = self.forward(positions)
- if self.is_black_turn(current_player):
- action_probs = cchess_main.flip_policy(action_probs)
-
- moves = GameBoard.get_legal_moves(node.state, current_player)
- # print("current_player : ", current_player)
- # print(moves)
- node.expand(moves, action_probs)
- self.expanded.add(node) # node.state
-
- # remove leaf node from expanding list
- self.now_expanding.remove(node)
-
- # must invert, because alternative layer has opposite objective
- return value[0] * -1
-
- else:
- """node has already expanded. Enter select phase."""
- # select child node with maximum action scroe
- last_state = node.state
-
- action, node = node.select_new(c_PUCT)
- current_player = "w" if current_player == "b" else "b"
- if is_kill_move(last_state, node.state) == 0:
- restrict_round += 1
- else:
- restrict_round = 0
- last_state = node.state
-
- # action_t = self.select_move_by_action_score(key, noise=True)
-
- # add virtual loss
- # self.virtual_loss_do(key, action_t)
- node.N += virtual_loss
- node.W += -virtual_loss
-
- # evolve game board status
- # child_position = self.env_action(position, action_t)
-
- if (node.state.find('K') == -1 or node.state.find('k') == -1):
- if (node.state.find('K') == -1):
- value = 1.0 if current_player == "b" else -1.0
- if (node.state.find('k') == -1):
- value = -1.0 if current_player == "b" else 1.0
- value = value * -1
- elif restrict_round >= 60:
- value = 0.0
- else:
- value = await self.start_tree_search(node, current_player, restrict_round) # next move
- # if node is not None:
- # value = await self.start_tree_search(node) # next move
- # else:
- # # None position means illegal move
- # value = -1
-
- # self.virtual_loss_undo(key, action_t)
- node.N += -virtual_loss
- node.W += virtual_loss
-
- # on returning search path
- # update: N, W, Q, U
- # self.back_up_value(key, action_t, value)
- node.back_up_value(value) # -value
-
- # must invert
- return value * -1
- # if child_position is not None:
- # return value * -1
- # else:
- # # illegal move doesn't mean much for the opponent
- # return 0
-
- async def prediction_worker(self):
- """For better performance, queueing prediction requests and predict together in this worker.
- speed up about 45sec -> 15sec for example.
- """
- q = self.queue
- margin = 10 # avoid finishing before other searches starting.
- while self.running_simulation_num > 0 or margin > 0:
- if q.empty():
- if margin > 0:
- margin -= 1
- await asyncio.sleep(1e-3)
- continue
- item_list = [q.get_nowait() for _ in range(q.qsize())] # type: list[QueueItem]
- #logger.debug(f"predicting {len(item_list)} items")
- features = np.asarray([item.feature for item in item_list]) # asarray
- # print("prediction_worker [features.shape] before : ", features.shape)
- # shape = features.shape
- # features = features.reshape((shape[0] * shape[1], shape[2], shape[3], shape[4]))
- # print("prediction_worker [features.shape] after : ", features.shape)
- # policy_ary, value_ary = self.run_many(features)
- action_probs, value = self.forward(features)
- for p, v, item in zip(action_probs, value, item_list):
- item.future.set_result((p, v))
-
- async def push_queue(self, features):
- future = self.loop.create_future()
- item = QueueItem(features, future)
- await self.queue.put(item)
- return future
-
- #@profile
- def main(self, state, current_player, restrict_round, playouts):
- node = self.root
- if not self.is_expanded(node): # and node.is_leaf() # node.state
- # print('Expadning Root Node...')
- positions = self.generate_inputs(node.state, current_player)
- positions = np.expand_dims(positions, 0)
- action_probs, value = self.forward(positions)
- if self.is_black_turn(current_player):
- action_probs = cchess_main.flip_policy(action_probs)
-
- moves = GameBoard.get_legal_moves(node.state, current_player)
- # print("current_player : ", current_player)
- # print(moves)
- node.expand(moves, action_probs)
- self.expanded.add(node) # node.state
-
- coroutine_list = []
- for _ in range(playouts):
- coroutine_list.append(self.tree_search(node, current_player, restrict_round))
- coroutine_list.append(self.prediction_worker())
- self.loop.run_until_complete(asyncio.gather(*coroutine_list))
-
- def do_simulation(self, state, current_player, restrict_round):
- node = self.root
- last_state = state
- while(node.is_leaf() == False):
- # print("do_simulation while current_player : ", current_player)
- action, node = node.select(self.c_puct)
- current_player = "w" if current_player == "b" else "b"
- if is_kill_move(last_state, node.state) == 0:
- restrict_round += 1
- else:
- restrict_round = 0
- last_state = node.state
-
- positions = self.generate_inputs(node.state, current_player)
- positions = np.expand_dims(positions, 0)
- action_probs, value = self.forward(positions)
- if self.is_black_turn(current_player):
- action_probs = cchess_main.flip_policy(action_probs)
-
- # print("action_probs shape : ", action_probs.shape) #(1, 2086)
-
- if(node.state.find('K') == -1 or node.state.find('k') == -1):
- if (node.state.find('K') == -1):
- value = 1.0 if current_player == "b" else -1.0
- if (node.state.find('k') == -1):
- value = -1.0 if current_player == "b" else 1.0
- elif restrict_round >= 60:
- value = 0.0
- else:
- moves = GameBoard.get_legal_moves(node.state, current_player)
- # print("current_player : ", current_player)
- # print(moves)
- node.expand(moves, action_probs)
-
- node.backup(-value)
-
- def generate_inputs(self, in_state, current_player):
- state, palyer = self.try_flip(in_state, current_player, self.is_black_turn(current_player))
- return self.state_to_positions(state)
-
- def replace_board_tags(self, board):
- board = board.replace("2", "11")
- board = board.replace("3", "111")
- board = board.replace("4", "1111")
- board = board.replace("5", "11111")
- board = board.replace("6", "111111")
- board = board.replace("7", "1111111")
- board = board.replace("8", "11111111")
- board = board.replace("9", "111111111")
- return board.replace("/", "")
-
- # 感觉位置有点反了,当前角色的棋子在右侧,plane的后面
- def state_to_positions(self, state):
- # TODO C plain x 2
- board_state = self.replace_board_tags(state)
- pieces_plane = np.zeros(shape=(9, 10, 14), dtype=np.float32)
- for rank in range(9): #横线
- for file in range(10): #直线
- v = board_state[rank * 9 + file]
- if v.isalpha():
- pieces_plane[rank][file][ind[v]] = 1
- assert pieces_plane.shape == (9, 10, 14)
- return pieces_plane
-
-
- def try_flip(self, state, current_player, flip=False):
- if not flip:
- return state, current_player
-
- rows = state.split('/')
-
- def swapcase(a):
- if a.isalpha():
- return a.lower() if a.isupper() else a.upper()
- return a
-
- def swapall(aa):
- return "".join([swapcase(a) for a in aa])
-
- return "/".join([swapall(row) for row in reversed(rows)]), ('w' if current_player == 'b' else 'b')
-
- def is_black_turn(self, current_player):
- return current_player == 'b'
-
-class GameBoard(object):
- board_pos_name = np.array(create_position_labels()).reshape(9,10).transpose()
- Ny = 10
- Nx = 9
-
- def __init__(self):
- self.state = "RNBAKABNR/9/1C5C1/P1P1P1P1P/9/9/p1p1p1p1p/1c5c1/9/rnbakabnr"#"rnbakabnr/9/1c5c1/p1p1p1p1p/9/9/P1P1P1P1P/1C5C1/9/RNBAKABNR" #
- self.round = 1
- # self.players = ["w", "b"]
- self.current_player = "w"
- self.restrict_round = 0
-
-# 小写表示黑方,大写表示红方
-# [
-# "rheakaehr",
-# " ",
-# " c c ",
-# "p p p p p",
-# " ",
-# " ",
-# "P P P P P",
-# " C C ",
-# " ",
-# "RHEAKAEHR"
-# ]
- def reload(self):
- self.state = "RNBAKABNR/9/1C5C1/P1P1P1P1P/9/9/p1p1p1p1p/1c5c1/9/rnbakabnr"#"rnbakabnr/9/1c5c1/p1p1p1p1p/9/9/P1P1P1P1P/1C5C1/9/RNBAKABNR" #
- self.round = 1
- self.current_player = "w"
- self.restrict_round = 0
-
- @staticmethod
- def print_borad(board, action = None):
- def string_reverse(string):
- # return ''.join(string[len(string) - i] for i in range(1, len(string)+1))
- return ''.join(string[i] for i in range(len(string) - 1, -1, -1))
-
- x_trans = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8}
-
- if(action != None):
- src = action[0:2]
-
- src_x = int(x_trans[src[0]])
- src_y = int(src[1])
-
- # board = string_reverse(board)
- board = board.replace("1", " ")
- board = board.replace("2", " ")
- board = board.replace("3", " ")
- board = board.replace("4", " ")
- board = board.replace("5", " ")
- board = board.replace("6", " ")
- board = board.replace("7", " ")
- board = board.replace("8", " ")
- board = board.replace("9", " ")
- board = board.split('/')
- # board = board.replace("/", "\n")
- print(" abcdefghi")
- for i,line in enumerate(board):
- if (action != None):
- if(i == src_y):
- s = list(line)
- s[src_x] = 'x'
- line = ''.join(s)
- print(i,line)
- # print(board)
-
- @staticmethod
- def sim_do_action(in_action, in_state):
- x_trans = {'a':0, 'b':1, 'c':2, 'd':3, 'e':4, 'f':5, 'g':6, 'h':7, 'i':8}
-
- src = in_action[0:2]
- dst = in_action[2:4]
-
- src_x = int(x_trans[src[0]])
- src_y = int(src[1])
-
- dst_x = int(x_trans[dst[0]])
- dst_y = int(dst[1])
-
- # GameBoard.print_borad(in_state)
- # print("sim_do_action : ", in_action)
- # print(dst_y, dst_x, src_y, src_x)
- board_positions = GameBoard.board_to_pos_name(in_state)
- line_lst = []
- for line in board_positions:
- line_lst.append(list(line))
- lines = np.array(line_lst)
- # print(lines.shape)
- # print(board_positions[src_y])
- # print("before board_positions[dst_y] = ",board_positions[dst_y])
-
- lines[dst_y][dst_x] = lines[src_y][src_x]
- lines[src_y][src_x] = '1'
-
- board_positions[dst_y] = ''.join(lines[dst_y])
- board_positions[src_y] = ''.join(lines[src_y])
-
- # src_str = list(board_positions[src_y])
- # dst_str = list(board_positions[dst_y])
- # print("src_str[src_x] = ", src_str[src_x])
- # print("dst_str[dst_x] = ", dst_str[dst_x])
- # c = copy.deepcopy(src_str[src_x])
- # dst_str[dst_x] = c
- # src_str[src_x] = '1'
- # board_positions[dst_y] = ''.join(dst_str)
- # board_positions[src_y] = ''.join(src_str)
- # print("after board_positions[dst_y] = ", board_positions[dst_y])
-
- # board_positions[dst_y][dst_x] = board_positions[src_y][src_x]
- # board_positions[src_y][src_x] = '1'
-
- board = "/".join(board_positions)
- board = board.replace("111111111", "9")
- board = board.replace("11111111", "8")
- board = board.replace("1111111", "7")
- board = board.replace("111111", "6")
- board = board.replace("11111", "5")
- board = board.replace("1111", "4")
- board = board.replace("111", "3")
- board = board.replace("11", "2")
-
- # GameBoard.print_borad(board)
- return board
-
- @staticmethod
- def board_to_pos_name(board):
- board = board.replace("2", "11")
- board = board.replace("3", "111")
- board = board.replace("4", "1111")
- board = board.replace("5", "11111")
- board = board.replace("6", "111111")
- board = board.replace("7", "1111111")
- board = board.replace("8", "11111111")
- board = board.replace("9", "111111111")
- return board.split("/")
-
- @staticmethod
- def check_bounds(toY, toX):
- if toY < 0 or toX < 0:
- return False
-
- if toY >= GameBoard.Ny or toX >= GameBoard.Nx:
- return False
-
- return True
-
- @staticmethod
- def validate_move(c, upper=True):
- if (c.isalpha()):
- if (upper == True):
- if (c.islower()):
- return True
- else:
- return False
- else:
- if (c.isupper()):
- return True
- else:
- return False
- else:
- return True
-
- @staticmethod
- def get_legal_moves(state, current_player):
- moves = []
- k_x = None
- k_y = None
-
- K_x = None
- K_y = None
-
- face_to_face = False
-
- board_positions = np.array(GameBoard.board_to_pos_name(state))
- for y in range(board_positions.shape[0]):
- for x in range(len(board_positions[y])):
- if(board_positions[y][x].isalpha()):
- if(board_positions[y][x] == 'r' and current_player == 'b'):
- toY = y
- for toX in range(x - 1, -1, -1):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].isupper()):
- moves.append(m)
- break
-
- moves.append(m)
-
- for toX in range(x + 1, GameBoard.Nx):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].isupper()):
- moves.append(m)
- break
-
- moves.append(m)
-
- toX = x
- for toY in range(y - 1, -1, -1):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].isupper()):
- moves.append(m)
- break
-
- moves.append(m)
-
- for toY in range(y + 1, GameBoard.Ny):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].isupper()):
- moves.append(m)
- break
-
- moves.append(m)
-
- elif(board_positions[y][x] == 'R' and current_player == 'w'):
- toY = y
- for toX in range(x - 1, -1, -1):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].islower()):
- moves.append(m)
- break
-
- moves.append(m)
-
- for toX in range(x + 1, GameBoard.Nx):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].islower()):
- moves.append(m)
- break
-
- moves.append(m)
-
- toX = x
- for toY in range(y - 1, -1, -1):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].islower()):
- moves.append(m)
- break
-
- moves.append(m)
-
- for toY in range(y + 1, GameBoard.Ny):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].islower()):
- moves.append(m)
- break
-
- moves.append(m)
-
- elif ((board_positions[y][x] == 'n' or board_positions[y][x] == 'h') and current_player == 'b'):
- for i in range(-1, 3, 2):
- for j in range(-1, 3, 2):
- toY = y + 2 * i
- toX = x + 1 * j
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=False) and board_positions[toY - i][x].isalpha() == False:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
- toY = y + 1 * i
- toX = x + 2 * j
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=False) and board_positions[y][toX - j].isalpha() == False:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
- elif ((board_positions[y][x] == 'N' or board_positions[y][x] == 'H') and current_player == 'w'):
- for i in range(-1, 3, 2):
- for j in range(-1, 3, 2):
- toY = y + 2 * i
- toX = x + 1 * j
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=True) and board_positions[toY - i][x].isalpha() == False:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
- toY = y + 1 * i
- toX = x + 2 * j
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=True) and board_positions[y][toX - j].isalpha() == False:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
- elif ((board_positions[y][x] == 'b' or board_positions[y][x] == 'e') and current_player == 'b'):
- for i in range(-2, 3, 4):
- toY = y + i
- toX = x + i
-
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
- upper=False) and toY >= 5 and \
- board_positions[y + i // 2][x + i // 2].isalpha() == False:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
- toY = y + i
- toX = x - i
-
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
- upper=False) and toY >= 5 and \
- board_positions[y + i // 2][x - i // 2].isalpha() == False:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
- elif ((board_positions[y][x] == 'B' or board_positions[y][x] == 'E') and current_player == 'w'):
- for i in range(-2, 3, 4):
- toY = y + i
- toX = x + i
-
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
- upper=True) and toY <= 4 and \
- board_positions[y + i // 2][x + i // 2].isalpha() == False:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
- toY = y + i
- toX = x - i
-
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
- upper=True) and toY <= 4 and \
- board_positions[y + i // 2][x - i // 2].isalpha() == False:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
- elif (board_positions[y][x] == 'a' and current_player == 'b'):
- for i in range(-1, 3, 2):
- toY = y + i
- toX = x + i
-
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
- upper=False) and toY >= 7 and toX >= 3 and toX <= 5:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
-
- toY = y + i
- toX = x - i
-
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
- upper=False) and toY >= 7 and toX >= 3 and toX <= 5:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
- elif (board_positions[y][x] == 'A' and current_player == 'w'):
- for i in range(-1, 3, 2):
- toY = y + i
- toX = x + i
-
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
- upper=True) and toY <= 2 and toX >= 3 and toX <= 5:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
-
- toY = y + i
- toX = x - i
-
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
- upper=True) and toY <= 2 and toX >= 3 and toX <= 5:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
- elif (board_positions[y][x] == 'k'):
- k_x = x
- k_y = y
-
- if(current_player == 'b'):
- for i in range(2):
- for sign in range(-1, 2, 2):
- j = 1 - i
- toY = y + i * sign
- toX = x + j * sign
-
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
- upper=False) and toY >= 7 and toX >= 3 and toX <= 5:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
- elif (board_positions[y][x] == 'K'):
- K_x = x
- K_y = y
-
- if(current_player == 'w'):
- for i in range(2):
- for sign in range(-1, 2, 2):
- j = 1 - i
- toY = y + i * sign
- toX = x + j * sign
-
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
- upper=True) and toY <= 2 and toX >= 3 and toX <= 5:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
- elif (board_positions[y][x] == 'c' and current_player == 'b'):
- toY = y
- hits = False
- for toX in range(x - 1, -1, -1):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (hits == False):
- if (board_positions[toY][toX].isalpha()):
- hits = True
- else:
- moves.append(m)
- else:
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].isupper()):
- moves.append(m)
- break
-
- hits = False
- for toX in range(x + 1, GameBoard.Nx):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (hits == False):
- if (board_positions[toY][toX].isalpha()):
- hits = True
- else:
- moves.append(m)
- else:
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].isupper()):
- moves.append(m)
- break
-
- toX = x
- hits = False
- for toY in range(y - 1, -1, -1):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (hits == False):
- if (board_positions[toY][toX].isalpha()):
- hits = True
- else:
- moves.append(m)
- else:
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].isupper()):
- moves.append(m)
- break
-
- hits = False
- for toY in range(y + 1, GameBoard.Ny):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (hits == False):
- if (board_positions[toY][toX].isalpha()):
- hits = True
- else:
- moves.append(m)
- else:
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].isupper()):
- moves.append(m)
- break
- elif (board_positions[y][x] == 'C' and current_player == 'w'):
- toY = y
- hits = False
- for toX in range(x - 1, -1, -1):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (hits == False):
- if (board_positions[toY][toX].isalpha()):
- hits = True
- else:
- moves.append(m)
- else:
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].islower()):
- moves.append(m)
- break
-
- hits = False
- for toX in range(x + 1, GameBoard.Nx):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (hits == False):
- if (board_positions[toY][toX].isalpha()):
- hits = True
- else:
- moves.append(m)
- else:
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].islower()):
- moves.append(m)
- break
-
- toX = x
- hits = False
- for toY in range(y - 1, -1, -1):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (hits == False):
- if (board_positions[toY][toX].isalpha()):
- hits = True
- else:
- moves.append(m)
- else:
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].islower()):
- moves.append(m)
- break
-
- hits = False
- for toY in range(y + 1, GameBoard.Ny):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (hits == False):
- if (board_positions[toY][toX].isalpha()):
- hits = True
- else:
- moves.append(m)
- else:
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].islower()):
- moves.append(m)
- break
- elif (board_positions[y][x] == 'p' and current_player == 'b'):
- toY = y - 1
- toX = x
-
- if (GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=False)):
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
-
- if y < 5:
- toY = y
- toX = x + 1
- if (GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=False)):
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
-
- toX = x - 1
- if (GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=False)):
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
-
- elif (board_positions[y][x] == 'P' and current_player == 'w'):
- toY = y + 1
- toX = x
-
- if (GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=True)):
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
-
- if y > 4:
- toY = y
- toX = x + 1
- if (GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=True)):
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
-
- toX = x - 1
- if (GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=True)):
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
-
- if(K_x != None and k_x != None and K_x == k_x):
- face_to_face = True
- for i in range(K_y + 1, k_y, 1):
- if(board_positions[i][K_x].isalpha()):
- face_to_face = False
-
- if(face_to_face == True):
- if(current_player == 'b'):
- moves.append(GameBoard.board_pos_name[k_y][k_x] + GameBoard.board_pos_name[K_y][K_x])
- else:
- moves.append(GameBoard.board_pos_name[K_y][K_x] + GameBoard.board_pos_name[k_y][k_x])
-
- return moves
-
-def softmax(x):
- # print(x)
- probs = np.exp(x - np.max(x))
- # print(np.sum(probs))
- probs /= np.sum(probs)
- return probs
-
-class cchess_main(object):
-
- def __init__(self, playout=400, in_batch_size=128, exploration = True, in_search_threads = 16, processor = "cpu", num_gpus = 1, res_block_nums = 7, human_color = 'b'):
- self.epochs = 5
- self.playout_counts = playout #400 #800 #1600 200
- self.temperature = 1 #1e-8 1e-3
- # self.c = 1e-4
- self.batch_size = in_batch_size #128 #512
- # self.momentum = 0.9
- self.game_batch = 400 # Evaluation each 400 times
- # self.game_loop = 25000
- self.top_steps = 30
- self.top_temperature = 1 #2
- # self.Dirichlet = 0.3 # P(s,a) = (1 - ϵ)p_a + ϵη_a #self-play chapter in the paper
- self.eta = 0.03
- # self.epsilon = 0.25
- # self.v_resign = 0.05
- # self.c_puct = 5
- self.learning_rate = 0.001 #5e-3 # 0.001
- self.lr_multiplier = 1.0 # adaptively adjust the learning rate based on KL
- self.buffer_size = 10000
- self.data_buffer = deque(maxlen=self.buffer_size)
- self.game_borad = GameBoard()
- # self.current_player = 'w' #“w”表示红方,“b”表示黑方。
- self.policy_value_netowrk = policy_value_network(res_block_nums) if processor == 'cpu' else policy_value_network_gpus(num_gpus, res_block_nums)
- self.search_threads = in_search_threads
- self.mcts = MCTS_tree(self.game_borad.state, self.policy_value_netowrk.forward, self.search_threads)
- self.exploration = exploration
- self.resign_threshold = -0.8 #0.05
- self.global_step = 0
- self.kl_targ = 0.025
- self.log_file = open(os.path.join(os.getcwd(), 'log_file.txt'), 'w')
- self.human_color = human_color
-
- @staticmethod
- def flip_policy(prob):
- prob = prob.flatten()
- return np.asarray([prob[ind] for ind in unflipped_index])
-
- def policy_update(self):
- """update the policy-value net"""
- mini_batch = random.sample(self.data_buffer, self.batch_size)
- #print("training data_buffer len : ", len(self.data_buffer))
- state_batch = [data[0] for data in mini_batch]
- mcts_probs_batch = [data[1] for data in mini_batch]
- winner_batch = [data[2] for data in mini_batch]
- # print(np.array(winner_batch).shape)
- # print(winner_batch)
- winner_batch = np.expand_dims(winner_batch, 1)
- # print(winner_batch.shape)
- # print(winner_batch)
- start_time = time.time()
- old_probs, old_v = self.mcts.forward(state_batch)
- for i in range(self.epochs):
- accuracy, loss, self.global_step = self.policy_value_netowrk.train_step(state_batch, mcts_probs_batch, winner_batch,
- self.learning_rate * self.lr_multiplier) #
- new_probs, new_v = self.mcts.forward(state_batch)
- kl_tmp = old_probs * (np.log((old_probs + 1e-10) / (new_probs + 1e-10)))
- # print("kl_tmp.shape", kl_tmp.shape)
- kl_lst = []
- for line in kl_tmp:
- # print("line.shape", line.shape)
- all_value = [x for x in line if str(x) != 'nan' and str(x)!= 'inf']#除去inf值
- kl_lst.append(np.sum(all_value))
- kl = np.mean(kl_lst)
- # kl = scipy.stats.entropy(old_probs, new_probs)
- # kl = np.mean(np.sum(old_probs * (np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)), axis=1))
-
- if kl > self.kl_targ * 4: # early stopping if D_KL diverges badly
- break
- self.policy_value_netowrk.save(self.global_step)
- print("train using time {} s".format(time.time() - start_time))
-
- # adaptively adjust the learning rate
- if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
- self.lr_multiplier /= 1.5
- elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
- self.lr_multiplier *= 1.5
-
- explained_var_old = 1 - np.var(np.array(winner_batch) - old_v.flatten()) / np.var(np.array(winner_batch))
- explained_var_new = 1 - np.var(np.array(winner_batch) - new_v.flatten()) / np.var(np.array(winner_batch))
- print(
- "kl:{:.5f},lr_multiplier:{:.3f},loss:{},accuracy:{},explained_var_old:{:.3f},explained_var_new:{:.3f}".format(
- kl, self.lr_multiplier, loss, accuracy, explained_var_old, explained_var_new))
- self.log_file.write("kl:{:.5f},lr_multiplier:{:.3f},loss:{},accuracy:{},explained_var_old:{:.3f},explained_var_new:{:.3f}".format(
- kl, self.lr_multiplier, loss, accuracy, explained_var_old, explained_var_new) + '\n')
- self.log_file.flush()
- # return loss, accuracy
-
- # def policy_evaluate(self, n_games=10):
- # """
- # Evaluate the trained policy by playing games against the pure MCTS player
- # Note: this is only for monitoring the progress of training
- # """
- # # current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn, c_puct=self.c_puct,
- # # n_playout=self.n_playout)
- # # pure_mcts_player = MCTS_Pure(c_puct=5, n_playout=self.pure_mcts_playout_num)
- # win_cnt = defaultdict(int)
- # for i in range(n_games):
- # winner = self.game.start_play(start_player=i % 2) #current_mcts_player, pure_mcts_player,
- # win_cnt[winner] += 1
- # win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
- # print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(self.pure_mcts_playout_num, win_cnt[1], win_cnt[2],
- # win_cnt[-1]))
- # return win_ratio
-
- def run(self):
- #self.game_loop
- batch_iter = 0
- try:
- while(True):
- batch_iter += 1
- play_data, episode_len = self.selfplay()
- print("batch i:{}, episode_len:{}".format(batch_iter, episode_len))
- extend_data = []
- # states_data = []
- for state, mcts_prob, winner in play_data:
- states_data = self.mcts.state_to_positions(state)
- # prob = np.zeros(labels_len)
- # for idx in range(len(mcts_prob[0][0])):
- # prob[label2i[mcts_prob[0][0][idx]]] = mcts_prob[0][1][idx]
- extend_data.append((states_data, mcts_prob, winner))
- self.data_buffer.extend(extend_data)
- if len(self.data_buffer) > self.batch_size:
- self.policy_update()
- # if (batch_iter) % self.game_batch == 0:
- # print("current self-play batch: {}".format(batch_iter))
- # win_ratio = self.policy_evaluate()
- except KeyboardInterrupt:
- self.log_file.close()
- self.policy_value_netowrk.save(self.global_step)
-
- # def get_action(self, state, temperature = 1e-3):
- # # for i in range(self.playout_counts):
- # # state_sim = copy.deepcopy(state)
- # # self.mcts.do_simulation(state_sim, self.game_borad.current_player, self.game_borad.restrict_round)
- #
- # futures = []
- # with ThreadPoolExecutor(max_workers=self.search_threads) as executor:
- # for _ in range(self.playout_counts):
- # state_sim = copy.deepcopy(state)
- # futures.append(executor.submit(self.mcts.do_simulation, state_sim, self.game_borad.current_player, self.game_borad.restrict_round))
- #
- # vals = [f.result() for f in futures]
- #
- # actions_visits = [(act, nod.N) for act, nod in self.mcts.root.child.items()]
- # actions, visits = zip(*actions_visits)
- # probs = softmax(1.0 / temperature * np.log(visits)) #+ 1e-10
- # move_probs = []
- # move_probs.append([actions, probs])
- #
- # if(self.exploration):
- # act = np.random.choice(actions, p=0.75 * probs + 0.25*np.random.dirichlet(0.3*np.ones(len(probs))))
- # else:
- # act = np.random.choice(actions, p=probs)
- #
- # self.mcts.update_tree(act)
- #
- # return act, move_probs
-
- def get_hint(self, mcts_or_net, reverse, disp_mcts_msg_handler):
-
- if mcts_or_net == "mcts":
- if self.mcts.root.child == {}:
- disp_mcts_msg_handler()
- self.mcts.main(self.game_borad.state, self.game_borad.current_player, self.game_borad.restrict_round,
- self.playout_counts)
-
- actions_visits = [(act, nod.N) for act, nod in self.mcts.root.child.items()]
- actions, visits = zip(*actions_visits)
- # print("visits : ", visits)
- # print("np.log(visits) : ", np.log(visits))
- probs = softmax(1.0 / self.temperature * np.log(visits)) # + 1e-10
-
- act_prob_dict = defaultdict(float)
- for i in range(len(actions)):
- if self.human_color == 'w':
- action = "".join(flipped_uci_labels(actions[i]))
- else:
- action = actions[i]
- act_prob_dict[action] = probs[i]
-
- elif mcts_or_net == "net":
- positions = self.mcts.generate_inputs(self.game_borad.state, self.game_borad.current_player)
- positions = np.expand_dims(positions, 0)
- action_probs, value = self.mcts.forward(positions)
-
- if self.mcts.is_black_turn(self.game_borad.current_player):
- action_probs = cchess_main.flip_policy(action_probs)
- moves = GameBoard.get_legal_moves(self.game_borad.state, self.game_borad.current_player)
-
- tot_p = 1e-8
- action_probs = action_probs.flatten() # .squeeze()
- act_prob_dict = defaultdict(float)
- # print("expand action_probs shape : ", action_probs.shape)
- for action in moves:
- # in_state = GameBoard.sim_do_action(action, self.state)
- mov_p = action_probs[label2i[action]]
- if self.human_color == 'w':
- action = "".join(flipped_uci_labels(action))
- act_prob_dict[action] = mov_p
- # new_node = leaf_node(self, mov_p, in_state)
- # self.child[action] = new_node
- tot_p += mov_p
-
- for a, _ in act_prob_dict.items():
- act_prob_dict[a] /= tot_p
-
- sorted_move_probs = sorted(act_prob_dict.items(), key=lambda item: item[1], reverse=reverse)
- # print(sorted_move_probs)
-
- return sorted_move_probs
-
- #@profile
- def get_action(self, state, temperature = 1e-3):
- # for i in range(self.playout_counts):
- # state_sim = copy.deepcopy(state)
- # self.mcts.do_simulation(state_sim, self.game_borad.current_player, self.game_borad.restrict_round)
-
- self.mcts.main(state, self.game_borad.current_player, self.game_borad.restrict_round, self.playout_counts)
-
- actions_visits = [(act, nod.N) for act, nod in self.mcts.root.child.items()]
- actions, visits = zip(*actions_visits)
- probs = softmax(1.0 / temperature * np.log(visits)) #+ 1e-10
- move_probs = []
- move_probs.append([actions, probs])
-
- if(self.exploration):
- act = np.random.choice(actions, p=0.75 * probs + 0.25*np.random.dirichlet(0.3*np.ones(len(probs))))
- else:
- act = np.random.choice(actions, p=probs)
-
- win_rate = self.mcts.Q(act) # / 2.0 + 0.5
- self.mcts.update_tree(act)
-
- # if position.n < 30: # self.top_steps
- # move = select_weighted_random(position, on_board_move_prob)
- # else:
- # move = select_most_likely(position, on_board_move_prob)
-
- return act, move_probs, win_rate
-
- def get_action_old(self, state, temperature = 1e-3):
- for i in range(self.playout_counts):
- state_sim = copy.deepcopy(state)
- self.mcts.do_simulation(state_sim, self.game_borad.current_player, self.game_borad.restrict_round)
-
- actions_visits = [(act, nod.N) for act, nod in self.mcts.root.child.items()]
- actions, visits = zip(*actions_visits)
- probs = softmax(1.0 / temperature * np.log(visits)) #+ 1e-10
- move_probs = []
- move_probs.append([actions, probs])
-
- if(self.exploration):
- act = np.random.choice(actions, p=0.75 * probs + 0.25*np.random.dirichlet(0.3*np.ones(len(probs))))
- else:
- act = np.random.choice(actions, p=probs)
-
- self.mcts.update_tree(act)
-
- return act, move_probs
-
- def check_end(self):
- if (self.game_borad.state.find('K') == -1 or self.game_borad.state.find('k') == -1):
- if (self.game_borad.state.find('K') == -1):
- print("Green is Winner")
- return True, "b"
- if (self.game_borad.state.find('k') == -1):
- print("Red is Winner")
- return True, "w"
- elif self.game_borad.restrict_round >= 60:
- print("TIE! No Winners!")
- return True, "t"
- else:
- return False, ""
-
- def human_move(self, coord, mcts_or_net):
- win_rate = 0
- x_trans = {0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e', 5: 'f', 6: 'g', 7: 'h', 8: 'i'}
-
- src = coord[0:2]
- dst = coord[2:4]
-
- src_x = (x_trans[src[0]])
- src_y = str(src[1])
-
- dst_x = (x_trans[dst[0]])
- dst_y = str(dst[1])
-
- action = src_x + src_y + dst_x + dst_y
-
- if self.human_color == 'w':
- action = "".join(flipped_uci_labels(action))
-
- if mcts_or_net == "mcts":
- if self.mcts.root.child == {}:
- # self.get_action(self.game_borad.state, self.temperature)
- self.mcts.main(self.game_borad.state, self.game_borad.current_player, self.game_borad.restrict_round,
- self.playout_counts)
- win_rate = self.mcts.Q(action) # / 2.0 + 0.5
- self.mcts.update_tree(action)
-
- last_state = self.game_borad.state
- # print(self.game_borad.current_player, " now take a action : ", action, "[Step {}]".format(self.game_borad.round))
- self.game_borad.state = GameBoard.sim_do_action(action, self.game_borad.state)
- self.game_borad.round += 1
- self.game_borad.current_player = "w" if self.game_borad.current_player == "b" else "b"
- if is_kill_move(last_state, self.game_borad.state) == 0:
- self.game_borad.restrict_round += 1
- else:
- self.game_borad.restrict_round = 0
-
- return win_rate
-
-
- def select_move(self, mcts_or_net):
- if mcts_or_net == "mcts":
- action, probs, win_rate = self.get_action(self.game_borad.state, self.temperature)
- # win_rate = self.mcts.Q(action) / 2.0 + 0.5
- elif mcts_or_net == "net":
- positions = self.mcts.generate_inputs(self.game_borad.state, self.game_borad.current_player)
- positions = np.expand_dims(positions, 0)
- action_probs, value = self.mcts.forward(positions)
- win_rate = value[0, 0] # / 2 + 0.5
- if self.mcts.is_black_turn(self.game_borad.current_player):
- action_probs = cchess_main.flip_policy(action_probs)
- moves = GameBoard.get_legal_moves(self.game_borad.state, self.game_borad.current_player)
-
- tot_p = 1e-8
- action_probs = action_probs.flatten() # .squeeze()
- act_prob_dict = defaultdict(float)
- # print("expand action_probs shape : ", action_probs.shape)
- for action in moves:
- # in_state = GameBoard.sim_do_action(action, self.state)
- mov_p = action_probs[label2i[action]]
- act_prob_dict[action] = mov_p
- # new_node = leaf_node(self, mov_p, in_state)
- # self.child[action] = new_node
- tot_p += mov_p
-
- for a, _ in act_prob_dict.items():
- act_prob_dict[a] /= tot_p
-
- action = max(act_prob_dict.items(), key=lambda node: node[1])[0]
- # self.mcts.update_tree(action)
-
- print('Win rate for player {} is {:.4f}'.format(self.game_borad.current_player, win_rate))
- last_state = self.game_borad.state
- print(self.game_borad.current_player, " now take a action : ", action, "[Step {}]".format(self.game_borad.round)) # if self.human_color == 'w' else "".join(flipped_uci_labels(action))
- self.game_borad.state = GameBoard.sim_do_action(action, self.game_borad.state)
- self.game_borad.round += 1
- self.game_borad.current_player = "w" if self.game_borad.current_player == "b" else "b"
- if is_kill_move(last_state, self.game_borad.state) == 0:
- self.game_borad.restrict_round += 1
- else:
- self.game_borad.restrict_round = 0
-
- self.game_borad.print_borad(self.game_borad.state)
-
- x_trans = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8}
-
- if self.human_color == 'w':
- action = "".join(flipped_uci_labels(action))
-
- src = action[0:2]
- dst = action[2:4]
-
- src_x = int(x_trans[src[0]])
- src_y = int(src[1])
-
- dst_x = int(x_trans[dst[0]])
- dst_y = int(dst[1])
-
- return (src_x, src_y, dst_x - src_x, dst_y - src_y), win_rate
-
- def selfplay(self):
- self.game_borad.reload()
- # p1, p2 = self.game_borad.players
- states, mcts_probs, current_players = [], [], []
- z = None
- game_over = False
- winnner = ""
- start_time = time.time()
- # self.game_borad.print_borad(self.game_borad.state)
- while(not game_over):
- action, probs, win_rate = self.get_action(self.game_borad.state, self.temperature)
- state, palyer = self.mcts.try_flip(self.game_borad.state, self.game_borad.current_player, self.mcts.is_black_turn(self.game_borad.current_player))
- states.append(state)
- prob = np.zeros(labels_len)
- if self.mcts.is_black_turn(self.game_borad.current_player):
- for idx in range(len(probs[0][0])):
- # probs[0][0][idx] = "".join((str(9 - int(a)) if a.isdigit() else a) for a in probs[0][0][idx])
- act = "".join((str(9 - int(a)) if a.isdigit() else a) for a in probs[0][0][idx])
- # for idx in range(len(mcts_prob[0][0])):
- prob[label2i[act]] = probs[0][1][idx]
- else:
- for idx in range(len(probs[0][0])):
- prob[label2i[probs[0][0][idx]]] = probs[0][1][idx]
- mcts_probs.append(prob)
- # mcts_probs.append(probs)
- current_players.append(self.game_borad.current_player)
-
- last_state = self.game_borad.state
- # print(self.game_borad.current_player, " now take a action : ", action, "[Step {}]".format(self.game_borad.round))
- self.game_borad.state = GameBoard.sim_do_action(action, self.game_borad.state)
- self.game_borad.round += 1
- self.game_borad.current_player = "w" if self.game_borad.current_player == "b" else "b"
- if is_kill_move(last_state, self.game_borad.state) == 0:
- self.game_borad.restrict_round += 1
- else:
- self.game_borad.restrict_round = 0
-
- # self.game_borad.print_borad(self.game_borad.state, action)
-
- if (self.game_borad.state.find('K') == -1 or self.game_borad.state.find('k') == -1):
- z = np.zeros(len(current_players))
- if (self.game_borad.state.find('K') == -1):
- winnner = "b"
- if (self.game_borad.state.find('k') == -1):
- winnner = "w"
- z[np.array(current_players) == winnner] = 1.0
- z[np.array(current_players) != winnner] = -1.0
- game_over = True
- print("Game end. Winner is player : ", winnner, " In {} steps".format(self.game_borad.round - 1))
- elif self.game_borad.restrict_round >= 60:
- z = np.zeros(len(current_players))
- game_over = True
- print("Game end. Tie in {} steps".format(self.game_borad.round - 1))
- # elif(self.mcts.root.v < self.resign_threshold):
- # pass
- # elif(self.mcts.root.Q < self.resign_threshold):
- # pass
- if(game_over):
- # self.mcts.root = leaf_node(None, self.mcts.p_, "RNBAKABNR/9/1C5C1/P1P1P1P1P/9/9/p1p1p1p1p/1c5c1/9/rnbakabnr")#"rnbakabnr/9/1c5c1/p1p1p1p1p/9/9/P1P1P1P1P/1C5C1/9/RNBAKABNR"
- self.mcts.reload()
- print("Using time {} s".format(time.time() - start_time))
- return zip(states, mcts_probs, z), len(z)
-
-if __name__ == '__main__':
- parser = argparse.ArgumentParser()
- parser.add_argument('--mode', default='train', choices=['train', 'play'], type=str, help='train or play')
- parser.add_argument('--ai_count', default=1, choices=[1, 2], type=int, help='choose ai player count')
- parser.add_argument('--ai_function', default='mcts', choices=['mcts', 'net'], type=str, help='mcts or net')
- parser.add_argument('--train_playout', default=400, type=int, help='mcts train playout')
- parser.add_argument('--batch_size', default=512, type=int, help='train batch_size')
- parser.add_argument('--play_playout', default=400, type=int, help='mcts play playout')
- parser.add_argument('--delay', dest='delay', action='store',
- nargs='?', default=3, type=float, required=False,
- help='Set how many seconds you want to delay after each move')
- parser.add_argument('--end_delay', dest='end_delay', action='store',
- nargs='?', default=3, type=float, required=False,
- help='Set how many seconds you want to delay after the end of game')
- parser.add_argument('--search_threads', default=16, type=int, help='search_threads')
- parser.add_argument('--processor', default='cpu', choices=['cpu', 'gpu'], type=str, help='cpu or gpu')
- parser.add_argument('--num_gpus', default=1, type=int, help='gpu counts')
- parser.add_argument('--res_block_nums', default=7, type=int, help='res_block_nums')
- parser.add_argument('--human_color', default='b', choices=['w', 'b'], type=str, help='w or b')
- args = parser.parse_args()
-
- if args.mode == 'train':
- train_main = cchess_main(args.train_playout, args.batch_size, True, args.search_threads, args.processor, args.num_gpus, args.res_block_nums, args.human_color) # * args.num_gpus
- train_main.run()
- elif args.mode == 'play':
- from ChessGame import *
- game = ChessGame(args.ai_count, args.ai_function, args.play_playout, args.delay, args.end_delay, args.batch_size,
- args.search_threads, args.processor, args.num_gpus, args.res_block_nums, args.human_color) # * args.num_gpus
- game.start()
+#coding:utf-8
+from asyncio import Future
+import asyncio
+from asyncio.queues import Queue
+import uvloop
+asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
+
+import tensorflow as tf
+import numpy as np
+import os
+import sys
+import random
+import time
+import argparse
+from collections import deque, defaultdict, namedtuple
+import copy
+from policy_value_network import *
+import scipy.stats
+from threading import Lock
+from concurrent.futures import ThreadPoolExecutor
+
+def flipped_uci_labels(param):
+ def repl(x):
+ return "".join([(str(9 - int(a)) if a.isdigit() else a) for a in x])
+
+ return [repl(x) for x in param]
+
+# 创建所有合法走子UCI,size 2086 # Create all legal move UCI labels, size 2086
+def create_uci_labels():
+ labels_array = []
+ letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']
+ numbers = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
+
+ Advisor_labels = ['d7e8', 'e8d7', 'e8f9', 'f9e8', 'd0e1', 'e1d0', 'e1f2', 'f2e1',
+ 'd2e1', 'e1d2', 'e1f0', 'f0e1', 'd9e8', 'e8d9', 'e8f7', 'f7e8']
+ Bishop_labels = ['a2c4', 'c4a2', 'c0e2', 'e2c0', 'e2g4', 'g4e2', 'g0i2', 'i2g0',
+ 'a7c9', 'c9a7', 'c5e7', 'e7c5', 'e7g9', 'g9e7', 'g5i7', 'i7g5',
+ 'a2c0', 'c0a2', 'c4e2', 'e2c4', 'e2g0', 'g0e2', 'g4i2', 'i2g4',
+ 'a7c5', 'c5a7', 'c9e7', 'e7c9', 'e7g5', 'g5e7', 'g9i7', 'i7g9']
+ # King_labels = ['d0d7', 'd0d8', 'd0d9', 'd1d7', 'd1d8', 'd1d9', 'd2d7', 'd2d8', 'd2d9',
+ # 'd7d0', 'd7d1', 'd7d2', 'd8d0', 'd8d1', 'd8d2', 'd9d0', 'd9d1', 'd9d2',
+ # 'd0d7', 'd0d8', 'd0d9', 'd1d7', 'd1d8', 'd1d9', 'd2d7', 'd2d8', 'd2d9',
+ # 'd0d7', 'd0d8', 'd0d9', 'd1d7', 'd1d8', 'd1d9', 'd2d7', 'd2d8', 'd2d9',
+ # 'd0d7', 'd0d8', 'd0d9', 'd1d7', 'd1d8', 'd1d9', 'd2d7', 'd2d8', 'd2d9',
+ # 'd0d7', 'd0d8', 'd0d9', 'd1d7', 'd1d8', 'd1d9', 'd2d7', 'd2d8', 'd2d9']
+
+ for l1 in range(9):
+ for n1 in range(10):
+ destinations = [(t, n1) for t in range(9)] + \
+ [(l1, t) for t in range(10)] + \
+ [(l1 + a, n1 + b) for (a, b) in
+                            [(-2, -1), (-1, -2), (-2, 1), (1, -2), (2, -1), (-1, 2), (2, 1), (1, 2)]] # 马走日 # knight's L-shaped moves
+ for (l2, n2) in destinations:
+ if (l1, n1) != (l2, n2) and l2 in range(9) and n2 in range(10):
+ move = letters[l1] + numbers[n1] + letters[l2] + numbers[n2]
+ labels_array.append(move)
+
+ for p in Advisor_labels:
+ labels_array.append(p)
+
+ for p in Bishop_labels:
+ labels_array.append(p)
+
+ return labels_array
+
+def create_position_labels():
+ labels_array = []
+ letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']
+ letters.reverse()
+ numbers = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
+
+ for l1 in range(9):
+ for n1 in range(10):
+ move = letters[8 - l1] + numbers[n1]
+ labels_array.append(move)
+# labels_array.reverse()
+ return labels_array
+
+def create_position_labels_reverse():
+ labels_array = []
+ letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']
+ letters.reverse()
+ numbers = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
+
+ for l1 in range(9):
+ for n1 in range(10):
+ move = letters[l1] + numbers[n1]
+ labels_array.append(move)
+ labels_array.reverse()
+ return labels_array
+
class leaf_node(object):
    """A node of the MCTS search tree.

    Attributes:
        P: prior probability of reaching this node (from the policy net).
        Q: mean action value, W / N.
        N: visit count.
        v: value back-propagated through this node most recently.
        U: cached PUCT exploration bonus (set by get_Q_plus_U / back_up_value).
        W: total accumulated value over all visits.
        parent: parent leaf_node, or None for the root.
        child: dict mapping move string -> child leaf_node.
        state: FEN-like board-state string this node represents.
    """

    def __init__(self, in_parent, in_prior_p, in_state):
        self.P = in_prior_p
        self.Q = 0
        self.N = 0
        self.v = 0
        self.U = 0
        self.W = 0
        self.parent = in_parent
        self.child = {}
        self.state = in_state

    def is_leaf(self):
        # A node is a leaf until expand() has populated its children.
        return self.child == {}

    def get_Q_plus_U_new(self, c_puct):
        """Calculate and return the value for this node: a combination of leaf evaluations, Q, and
        this node's prior adjusted for its visit count, u
        c_puct -- a number in (0, inf) controlling the relative impact of values, Q, and
        prior probability, P, on this node's score.

        Unlike get_Q_plus_U, this variant does NOT cache U on the node.
        """
        # self._u = c_puct * self._P * np.sqrt(self._parent._n_visits) / (1 + self._n_visits)
        U = c_puct * self.P * np.sqrt(self.parent.N) / ( 1 + self.N)
        return self.Q + U

    def get_Q_plus_U(self, c_puct):
        """Calculate and return the value for this node: a combination of leaf evaluations, Q, and
        this node's prior adjusted for its visit count, u
        c_puct -- a number in (0, inf) controlling the relative impact of values, Q, and
        prior probability, P, on this node's score.
        """
        # self._u = c_puct * self._P * np.sqrt(self._parent._n_visits) / (1 + self._n_visits)
        self.U = c_puct * self.P * np.sqrt(self.parent.N) / ( 1 + self.N)
        return self.Q + self.U

    def select_new(self, c_puct):
        # Returns the (action, child) pair maximizing the non-caching PUCT score.
        return max(self.child.items(), key=lambda node: node[1].get_Q_plus_U_new(c_puct))

    def select(self, c_puct):
        # Returns the (action, child) pair maximizing the caching PUCT score.
        return max(self.child.items(), key=lambda node: node[1].get_Q_plus_U(c_puct))

    #@profile
    def expand(self, moves, action_probs):
        """Create one child per legal move, with priors normalized over *moves*.

        tot_p starts at a small epsilon to avoid division by zero when every
        legal move has (near-)zero probability mass in *action_probs*.
        """
        tot_p = 1e-8
        # print("action_probs : ", action_probs)
        action_probs = tf.squeeze(action_probs) #.flatten() #.squeeze()
        # print("expand action_probs shape : ", action_probs.shape)
        for action in moves:
            in_state = GameBoard.sim_do_action(action, self.state)
            mov_p = action_probs[label2i[action]]
            new_node = leaf_node(self, mov_p, in_state)
            self.child[action] = new_node
            tot_p += mov_p

        # Renormalize priors so they sum to ~1 over the legal moves only.
        for a, n in self.child.items():
            n.P /= tot_p

    def back_up_value(self, value):
        """Update this single node's statistics with *value* (no recursion).

        NOTE(review): reads self.parent.N, so this must not be called on the
        root node (whose parent is None) — confirm callers respect this.
        """
        self.N += 1
        self.W += value
        self.v = value
        self.Q = self.W / self.N # node.Q += 1.0*(value - node.Q) / node.N
        self.U = c_PUCT * self.P * np.sqrt(self.parent.N) / ( 1 + self.N)
        # node = node.parent
        # value = -value

    def backup(self, value):
        """Propagate *value* up to the root, negating at each ply (alternating sides)."""
        node = self
        while node != None:
            node.N += 1
            node.W += value
            node.v = value
            node.Q = node.W / node.N # node.Q += 1.0*(value - node.Q) / node.N
            node = node.parent
            value = -value
+
# Piece letters in input-plane order (uppercase = red, lowercase = black);
# the board tensor is 9 x 10 x 14, one plane per piece letter.
pieces_order = 'KARBNPCkarbnpc' # 9 x 10 x 14
ind = {pieces_order[i]: i for i in range(14)}  # piece letter -> plane index

labels_array = create_uci_labels()
labels_len = len(labels_array)
flipped_labels = flipped_uci_labels(labels_array)
# unflipped_index[i] = position in labels_array of the flipped form of labels_array[i];
# used by cchess_main.flip_policy to remap a policy vector onto the flipped board.
unflipped_index = [labels_array.index(x) for x in flipped_labels]

# Bidirectional mapping between move labels and policy-vector indices.
i2label = {i: val for i, val in enumerate(labels_array)}
label2i = {val: i for i, val in enumerate(labels_array)}
+
def get_pieces_count(state):
    """Return the number of pieces in a FEN-like board-state string.

    Every alphabetic character in *state* denotes exactly one piece (digits
    encode runs of empty points and '/' separates ranks), so the piece count
    is simply the number of letters. Returns 0 for an empty string.
    """
    # Idiomatic replacement for the original manual counter loop.
    return sum(1 for s in state if s.isalpha())
+
def is_kill_move(state_prev, state_next):
    """Return the drop in piece count between two states (non-zero => a capture happened)."""
    before = get_pieces_count(state_prev)
    after = get_pieces_count(state_next)
    return before - after
+
# A (feature, future) pair queued for batched network evaluation in prediction_worker.
QueueItem = namedtuple("QueueItem", "feature future")
c_PUCT = 5  # exploration constant of the PUCT selection formula
virtual_loss = 3  # virtual loss applied while a simulation traverses a node
cut_off_depth = 30  # NOTE(review): not referenced in this chunk — confirm usage elsewhere
+
class MCTS_tree(object):
    """Monte-Carlo tree search driven by a policy/value network.

    Simulations run as asyncio coroutines; leaf evaluations are batched by
    pushing feature planes onto ``self.queue`` and letting
    ``prediction_worker`` run them through ``self.forward`` together.
    """

    def __init__(self, in_state, in_forward, search_threads):
        self.noise_eps = 0.25
        self.dirichlet_alpha = 0.3 #0.03
        # Root prior mixed with Dirichlet noise (AlphaZero-style exploration).
        self.p_ = (1 - self.noise_eps) * 1 + self.noise_eps * np.random.dirichlet([self.dirichlet_alpha])
        self.root = leaf_node(None, self.p_, in_state)
        self.c_puct = 5 #1.5
        # self.policy_network = in_policy_network
        self.forward = in_forward  # callable: positions -> (action_probs, value)
        self.node_lock = defaultdict(Lock)

        self.virtual_loss = 3
        self.now_expanding = set()  # nodes currently being expanded by some coroutine
        self.expanded = set()       # nodes already expanded
        self.cut_off_depth = 30
        # self.QueueItem = namedtuple("QueueItem", "feature future")
        self.sem = asyncio.Semaphore(search_threads)  # caps concurrent simulations
        self.queue = Queue(search_threads)            # pending network-evaluation requests
        self.loop = asyncio.get_event_loop()
        self.running_simulation_num = 0

    def reload(self):
        """Reset the tree to a fresh root at the initial board position."""
        self.root = leaf_node(None, self.p_,
                              "RNBAKABNR/9/1C5C1/P1P1P1P1P/9/9/p1p1p1p1p/1c5c1/9/rnbakabnr") # "rnbakabnr/9/1c5c1/p1p1p1p1p/9/9/P1P1P1P1P/1C5C1/9/RNBAKABNR"
        self.expanded = set()


    def Q(self, move) -> float:
        """Return the mean value Q of *move* among the root's children (0.0 if absent)."""
        ret = 0.0
        find = False
        for a, n in self.root.child.items():
            if move == a:
                ret = n.Q
                find = True
        if(find == False):
            print("{} not exist in the child".format(move))
        return ret

    def update_tree(self, act):
        """Re-root the tree at the child reached by action *act* (tree reuse between moves)."""
        # if(act in self.root.child):
        self.expanded.discard(self.root)
        self.root = self.root.child[act]
        self.root.parent = None
        # else:
        #     self.root = leaf_node(None, self.p_, in_state)

    def is_expanded(self, key) -> bool:
        """Check expanded status"""
        return key in self.expanded

    async def tree_search(self, node, current_player, restrict_round) -> float:
        """Independent MCTS, stands for one simulation"""
        self.running_simulation_num += 1

        # reduce parallel search number
        async with self.sem:
            value = await self.start_tree_search(node, current_player, restrict_round)
            # logger.debug(f"value: {value}")
            # logger.debug(f'Current running threads : {RUNNING_SIMULATION_NUM}')
            self.running_simulation_num -= 1

            return value

    async def start_tree_search(self, node, current_player, restrict_round)->float:
        """Monte Carlo Tree search: Select, Expand, Evaluate, Backup."""
        now_expanding = self.now_expanding

        # Wait if another coroutine is currently expanding this node.
        while node in now_expanding:
            await asyncio.sleep(1e-4)

        if not self.is_expanded(node): # and node.is_leaf()
            """is leaf node try evaluate and expand"""
            # add leaf node to expanding list
            self.now_expanding.add(node)

            positions = self.generate_inputs(node.state, current_player)
            # positions = np.expand_dims(positions, 0)

            # push extracted dihedral features of leaf node to the evaluation queue
            future = await self.push_queue(positions)  # type: Future
            await future
            action_probs, value = future.result()

            # action_probs, value = self.forward(positions)
            if self.is_black_turn(current_player):
                action_probs = cchess_main.flip_policy(action_probs)

            moves = GameBoard.get_legal_moves(node.state, current_player)
            # print("current_player : ", current_player)
            # print(moves)
            node.expand(moves, action_probs)
            self.expanded.add(node)  # node.state

            # remove leaf node from expanding list
            self.now_expanding.remove(node)

            # must invert, because alternative layer has opposite objective
            return value[0] * -1

        else:
            """node has already expanded. Enter select phase."""
            # select child node with maximum action score
            last_state = node.state

            action, node = node.select_new(c_PUCT)
            current_player = "w" if current_player == "b" else "b"
            # restrict_round counts consecutive non-capture plies (draw rule below).
            if is_kill_move(last_state, node.state) == 0:
                restrict_round += 1
            else:
                restrict_round = 0
            last_state = node.state

            # action_t = self.select_move_by_action_score(key, noise=True)

            # add virtual loss to discourage other coroutines from picking this path
            # self.virtual_loss_do(key, action_t)
            node.N += virtual_loss
            node.W += -virtual_loss

            # evolve game board status
            # child_position = self.env_action(position, action_t)

            # Terminal check: a missing king ('K' red / 'k' black) ends the game.
            if (node.state.find('K') == -1 or node.state.find('k') == -1):
                if (node.state.find('K') == -1):
                    value = 1.0 if current_player == "b" else -1.0
                if (node.state.find('k') == -1):
                    value = -1.0 if current_player == "b" else 1.0
                value = value * -1
            elif restrict_round >= 60:
                # 60 plies without a capture => draw.
                value = 0.0
            else:
                value = await self.start_tree_search(node, current_player, restrict_round) # next move

            # revert the virtual loss
            node.N += -virtual_loss
            node.W += virtual_loss

            node.back_up_value(value)  # -value

            # must invert
            return value * -1

    async def prediction_worker(self):
        """For better performance, queueing prediction requests and predict together in this worker.
        speed up about 45sec -> 15sec for example.
        """
        q = self.queue
        margin = 10  # avoid finishing before other searches starting.
        while self.running_simulation_num > 0 or margin > 0:
            if q.empty():
                if margin > 0:
                    margin -= 1
                await asyncio.sleep(1e-3)
                continue
            item_list = [q.get_nowait() for _ in range(q.qsize())]  # type: list[QueueItem]
            #logger.debug(f"predicting {len(item_list)} items")
            features = np.asarray([item.feature for item in item_list])  # asarray
            action_probs, value = self.forward(features)
            # Hand each result back to the waiting coroutine via its future.
            for p, v, item in zip(action_probs, value, item_list):
                item.future.set_result((p, v))

    async def push_queue(self, features):
        """Enqueue *features* for batched evaluation; returns the future holding the result."""
        future = self.loop.create_future()
        item = QueueItem(features, future)
        await self.queue.put(item)
        return future

    #@profile
    def main(self, state, current_player, restrict_round, playouts):
        """Run *playouts* parallel simulations from the root (expanding it first if needed)."""
        node = self.root
        if not self.is_expanded(node): # and node.is_leaf() # node.state
            # print('Expadning Root Node...')
            positions = self.generate_inputs(node.state, current_player)
            positions = np.expand_dims(positions, 0)
            action_probs, value = self.forward(positions)
            if self.is_black_turn(current_player):
                action_probs = cchess_main.flip_policy(action_probs)

            moves = GameBoard.get_legal_moves(node.state, current_player)
            # print("current_player : ", current_player)
            # print(moves)
            node.expand(moves, action_probs)
            self.expanded.add(node)  # node.state

        coroutine_list = []
        for _ in range(playouts):
            coroutine_list.append(self.tree_search(node, current_player, restrict_round))
        coroutine_list.append(self.prediction_worker())
        self.loop.run_until_complete(asyncio.gather(*coroutine_list))

    def do_simulation(self, state, current_player, restrict_round):
        """Single synchronous simulation: walk to a leaf, evaluate/expand, back up the value."""
        node = self.root
        last_state = state
        while(node.is_leaf() == False):
            # print("do_simulation while current_player : ", current_player)
            action, node = node.select(self.c_puct)
            current_player = "w" if current_player == "b" else "b"
            if is_kill_move(last_state, node.state) == 0:
                restrict_round += 1
            else:
                restrict_round = 0
            last_state = node.state

        positions = self.generate_inputs(node.state, current_player)
        positions = np.expand_dims(positions, 0)
        action_probs, value = self.forward(positions)
        if self.is_black_turn(current_player):
            action_probs = cchess_main.flip_policy(action_probs)

        # print("action_probs shape : ", action_probs.shape) #(1, 2086)

        # Terminal / draw / expand — mirrors the logic in start_tree_search.
        if(node.state.find('K') == -1 or node.state.find('k') == -1):
            if (node.state.find('K') == -1):
                value = 1.0 if current_player == "b" else -1.0
            if (node.state.find('k') == -1):
                value = -1.0 if current_player == "b" else 1.0
        elif restrict_round >= 60:
            value = 0.0
        else:
            moves = GameBoard.get_legal_moves(node.state, current_player)
            # print("current_player : ", current_player)
            # print(moves)
            node.expand(moves, action_probs)

        node.backup(-value)

    def generate_inputs(self, in_state, current_player):
        """Return the network input planes for *in_state*, flipped to the mover's viewpoint."""
        state, palyer = self.try_flip(in_state, current_player, self.is_black_turn(current_player))
        return self.state_to_positions(state)

    def replace_board_tags(self, board):
        """Expand digit runs into '1's and drop '/' separators, yielding one char per point."""
        board = board.replace("2", "11")
        board = board.replace("3", "111")
        board = board.replace("4", "1111")
        board = board.replace("5", "11111")
        board = board.replace("6", "111111")
        board = board.replace("7", "1111111")
        board = board.replace("8", "11111111")
        board = board.replace("9", "111111111")
        return board.replace("/", "")

    # NOTE(review): translated from the original Chinese comment — "the layout feels
    # a bit reversed: the current side's pieces end up on the right / in the later planes".
    def state_to_positions(self, state):
        """Encode *state* into a (9, 10, 14) one-hot piece-plane tensor.

        NOTE(review): the index ``rank * 9 + file`` with rank in [0,9) and
        file in [0,10) collides for distinct (rank, file) pairs and never
        reads positions 82..89 of the 90-char expanded board — verify the
        intended stride.
        """
        # TODO C plain x 2
        board_state = self.replace_board_tags(state)
        pieces_plane = np.zeros(shape=(9, 10, 14), dtype=np.float32)
        for rank in range(9):  # horizontal lines
            for file in range(10):  # vertical lines
                v = board_state[rank * 9 + file]
                if v.isalpha():
                    pieces_plane[rank][file][ind[v]] = 1
        assert pieces_plane.shape == (9, 10, 14)
        return pieces_plane


    def try_flip(self, state, current_player, flip=False):
        """If *flip*, mirror the board vertically, swap piece case, and switch the player."""
        if not flip:
            return state, current_player

        rows = state.split('/')

        def swapcase(a):
            if a.isalpha():
                return a.lower() if a.isupper() else a.upper()
            return a

        def swapall(aa):
            return "".join([swapcase(a) for a in aa])

        return "/".join([swapall(row) for row in reversed(rows)]), ('w' if current_player == 'b' else 'b')

    def is_black_turn(self, current_player):
        # 'b' is black; 'w' is red/white.
        return current_player == 'b'
+
class GameBoard(object):
    """Board representation and move generation for Chinese chess (xiangqi).

    States are FEN-like strings: 10 ranks separated by '/', digits encoding
    runs of empty points, uppercase letters = red ('w'), lowercase = black ('b').
    """
    # Square-name lookup: board_pos_name[y][x] gives the 2-char name (e.g. 'a0').
    board_pos_name = np.array(create_position_labels()).reshape(9,10).transpose()
    Ny = 10  # number of ranks (rows)
    Nx = 9   # number of files (columns)

    def __init__(self):
        self.state = "RNBAKABNR/9/1C5C1/P1P1P1P1P/9/9/p1p1p1p1p/1c5c1/9/rnbakabnr"#"rnbakabnr/9/1c5c1/p1p1p1p1p/9/9/P1P1P1P1P/1C5C1/9/RNBAKABNR" #
        self.round = 1
        # self.players = ["w", "b"]
        self.current_player = "w"
        self.restrict_round = 0  # consecutive plies without a capture

    def reload(self):
        """Reset to the initial position."""
        self.state = "RNBAKABNR/9/1C5C1/P1P1P1P1P/9/9/p1p1p1p1p/1c5c1/9/rnbakabnr"#"rnbakabnr/9/1c5c1/p1p1p1p1p/9/9/P1P1P1P1P/1C5C1/9/RNBAKABNR" #
        self.round = 1
        self.current_player = "w"
        self.restrict_round = 0

    @staticmethod
    def print_borad(board, action = None):
        """Print *board* as text; if *action* is given, mark its source square with 'x'."""
        def string_reverse(string):
            # return ''.join(string[len(string) - i] for i in range(1, len(string)+1))
            return ''.join(string[i] for i in range(len(string) - 1, -1, -1))

        x_trans = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8}

        if(action != None):
            src = action[0:2]

            src_x = int(x_trans[src[0]])
            src_y = int(src[1])

        # board = string_reverse(board)
        # Replace empty-run digits with spaces for display.
        board = board.replace("1", " ")
        board = board.replace("2", "  ")
        board = board.replace("3", "   ")
        board = board.replace("4", "    ")
        board = board.replace("5", "     ")
        board = board.replace("6", "      ")
        board = board.replace("7", "       ")
        board = board.replace("8", "        ")
        board = board.replace("9", "         ")
        board = board.split('/')
        # board = board.replace("/", "\n")
        print("  abcdefghi")
        for i,line in enumerate(board):
            if (action != None):
                if(i == src_y):
                    s = list(line)
                    s[src_x] = 'x'
                    line = ''.join(s)
            print(i,line)
        # print(board)

    @staticmethod
    def sim_do_action(in_action, in_state):
        """Return the state string after applying 4-char move *in_action* to *in_state*."""
        x_trans = {'a':0, 'b':1, 'c':2, 'd':3, 'e':4, 'f':5, 'g':6, 'h':7, 'i':8}

        src = in_action[0:2]
        dst = in_action[2:4]

        src_x = int(x_trans[src[0]])
        src_y = int(src[1])

        dst_x = int(x_trans[dst[0]])
        dst_y = int(dst[1])

        board_positions = GameBoard.board_to_pos_name(in_state)
        line_lst = []
        for line in board_positions:
            line_lst.append(list(line))
        lines = np.array(line_lst)

        # Move the piece; the source square becomes an empty point ('1').
        lines[dst_y][dst_x] = lines[src_y][src_x]
        lines[src_y][src_x] = '1'

        board_positions[dst_y] = ''.join(lines[dst_y])
        board_positions[src_y] = ''.join(lines[src_y])

        # Re-compress runs of '1's back into digits (longest runs first).
        board = "/".join(board_positions)
        board = board.replace("111111111", "9")
        board = board.replace("11111111", "8")
        board = board.replace("1111111", "7")
        board = board.replace("111111", "6")
        board = board.replace("11111", "5")
        board = board.replace("1111", "4")
        board = board.replace("111", "3")
        board = board.replace("11", "2")

        return board

    @staticmethod
    def board_to_pos_name(board):
        """Expand digits to '1's and return the board as a list of 10 rank strings."""
        board = board.replace("2", "11")
        board = board.replace("3", "111")
        board = board.replace("4", "1111")
        board = board.replace("5", "11111")
        board = board.replace("6", "111111")
        board = board.replace("7", "1111111")
        board = board.replace("8", "11111111")
        board = board.replace("9", "111111111")
        return board.split("/")

    @staticmethod
    def check_bounds(toY, toX):
        """Return True iff (toY, toX) lies on the 10x9 board."""
        if toY < 0 or toX < 0:
            return False

        if toY >= GameBoard.Ny or toX >= GameBoard.Nx:
            return False

        return True

    @staticmethod
    def validate_move(c, upper=True):
        """Return True iff a piece may land on a square holding *c*.

        *c* is the destination's character; *upper*=True means the mover is
        uppercase (red), so only lowercase (enemy) pieces or empty points
        ('1') are valid targets, and vice versa.
        """
        if (c.isalpha()):
            if (upper == True):
                if (c.islower()):
                    return True
                else:
                    return False
            else:
                if (c.isupper()):
                    return True
                else:
                    return False
        else:
            return True

    @staticmethod
    def get_legal_moves(state, current_player):
        """Return all legal moves for *current_player* in *state* as 4-char strings.

        Scans every square and generates per-piece moves; also records both
        kings' positions to add the "flying general" capture when they face
        each other on an open file.
        """
        moves = []
        k_x = None
        k_y = None

        K_x = None
        K_y = None

        face_to_face = False

        board_positions = np.array(GameBoard.board_to_pos_name(state))
        for y in range(board_positions.shape[0]):
            for x in range(len(board_positions[y])):
                if(board_positions[y][x].isalpha()):
                    # --- black rook 'r': slide in 4 directions, stop at first piece,
                    #     capturing it only if it is an enemy (uppercase) piece.
                    if(board_positions[y][x] == 'r' and current_player == 'b'):
                        toY = y
                        for toX in range(x - 1, -1, -1):
                            m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
                            if (board_positions[toY][toX].isalpha()):
                                if (board_positions[toY][toX].isupper()):
                                    moves.append(m)
                                break

                            moves.append(m)

                        for toX in range(x + 1, GameBoard.Nx):
                            m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
                            if (board_positions[toY][toX].isalpha()):
                                if (board_positions[toY][toX].isupper()):
                                    moves.append(m)
                                break

                            moves.append(m)

                        toX = x
                        for toY in range(y - 1, -1, -1):
                            m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
                            if (board_positions[toY][toX].isalpha()):
                                if (board_positions[toY][toX].isupper()):
                                    moves.append(m)
                                break

                            moves.append(m)

                        for toY in range(y + 1, GameBoard.Ny):
                            m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
                            if (board_positions[toY][toX].isalpha()):
                                if (board_positions[toY][toX].isupper()):
                                    moves.append(m)
                                break

                            moves.append(m)

                    # --- red rook 'R': mirror of the above with roles swapped.
                    elif(board_positions[y][x] == 'R' and current_player == 'w'):
                        toY = y
                        for toX in range(x - 1, -1, -1):
                            m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
                            if (board_positions[toY][toX].isalpha()):
                                if (board_positions[toY][toX].islower()):
                                    moves.append(m)
                                break

                            moves.append(m)

                        for toX in range(x + 1, GameBoard.Nx):
                            m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
                            if (board_positions[toY][toX].isalpha()):
                                if (board_positions[toY][toX].islower()):
                                    moves.append(m)
                                break

                            moves.append(m)

                        toX = x
                        for toY in range(y - 1, -1, -1):
                            m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
                            if (board_positions[toY][toX].isalpha()):
                                if (board_positions[toY][toX].islower()):
                                    moves.append(m)
                                break

                            moves.append(m)

                        for toY in range(y + 1, GameBoard.Ny):
                            m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
                            if (board_positions[toY][toX].isalpha()):
                                if (board_positions[toY][toX].islower()):
                                    moves.append(m)
                                break

                            moves.append(m)

                    # --- black knight 'n'/'h': L-shaped jumps, blocked by the
                    #     adjacent "leg" square (the isalpha() == False check).
                    elif ((board_positions[y][x] == 'n' or board_positions[y][x] == 'h') and current_player == 'b'):
                        for i in range(-1, 3, 2):
                            for j in range(-1, 3, 2):
                                toY = y + 2 * i
                                toX = x + 1 * j
                                if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=False) and board_positions[toY - i][x].isalpha() == False:
                                    moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
                                toY = y + 1 * i
                                toX = x + 2 * j
                                if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=False) and board_positions[y][toX - j].isalpha() == False:
                                    moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
                    # --- red knight 'N'/'H'.
                    elif ((board_positions[y][x] == 'N' or board_positions[y][x] == 'H') and current_player == 'w'):
                        for i in range(-1, 3, 2):
                            for j in range(-1, 3, 2):
                                toY = y + 2 * i
                                toX = x + 1 * j
                                if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=True) and board_positions[toY - i][x].isalpha() == False:
                                    moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
                                toY = y + 1 * i
                                toX = x + 2 * j
                                if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=True) and board_positions[y][toX - j].isalpha() == False:
                                    moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
                    # --- black elephant 'b'/'e': 2-diagonal steps, blocked by the
                    #     "eye" midpoint, may not cross the river (toY >= 5).
                    elif ((board_positions[y][x] == 'b' or board_positions[y][x] == 'e') and current_player == 'b'):
                        for i in range(-2, 3, 4):
                            toY = y + i
                            toX = x + i

                            if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
                                                                                           upper=False) and toY >= 5 and \
                                    board_positions[y + i // 2][x + i // 2].isalpha() == False:
                                moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
                            toY = y + i
                            toX = x - i

                            if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
                                                                                           upper=False) and toY >= 5 and \
                                    board_positions[y + i // 2][x - i // 2].isalpha() == False:
                                moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
                    # --- red elephant 'B'/'E': confined to toY <= 4.
                    elif ((board_positions[y][x] == 'B' or board_positions[y][x] == 'E') and current_player == 'w'):
                        for i in range(-2, 3, 4):
                            toY = y + i
                            toX = x + i

                            if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
                                                                                           upper=True) and toY <= 4 and \
                                    board_positions[y + i // 2][x + i // 2].isalpha() == False:
                                moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
                            toY = y + i
                            toX = x - i

                            if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
                                                                                           upper=True) and toY <= 4 and \
                                    board_positions[y + i // 2][x - i // 2].isalpha() == False:
                                moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
                    # --- black advisor 'a': one diagonal step, inside the palace
                    #     (toY >= 7, 3 <= toX <= 5).
                    elif (board_positions[y][x] == 'a' and current_player == 'b'):
                        for i in range(-1, 3, 2):
                            toY = y + i
                            toX = x + i

                            if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
                                                                                           upper=False) and toY >= 7 and toX >= 3 and toX <= 5:
                                moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])

                            toY = y + i
                            toX = x - i

                            if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
                                                                                           upper=False) and toY >= 7 and toX >= 3 and toX <= 5:
                                moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
                    # --- red advisor 'A': palace is toY <= 2, 3 <= toX <= 5.
                    elif (board_positions[y][x] == 'A' and current_player == 'w'):
                        for i in range(-1, 3, 2):
                            toY = y + i
                            toX = x + i

                            if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
                                                                                           upper=True) and toY <= 2 and toX >= 3 and toX <= 5:
                                moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])

                            toY = y + i
                            toX = x - i

                            if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
                                                                                           upper=True) and toY <= 2 and toX >= 3 and toX <= 5:
                                moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
                    # --- black king 'k': record position (for flying-general check);
                    #     one orthogonal step within the palace if it is black's turn.
                    elif (board_positions[y][x] == 'k'):
                        k_x = x
                        k_y = y

                        if(current_player == 'b'):
                            for i in range(2):
                                for sign in range(-1, 2, 2):
                                    j = 1 - i
                                    toY = y + i * sign
                                    toX = x + j * sign

                                    if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
                                                                                                   upper=False) and toY >= 7 and toX >= 3 and toX <= 5:
                                        moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
                    # --- red king 'K'.
                    elif (board_positions[y][x] == 'K'):
                        K_x = x
                        K_y = y

                        if(current_player == 'w'):
                            for i in range(2):
                                for sign in range(-1, 2, 2):
                                    j = 1 - i
                                    toY = y + i * sign
                                    toX = x + j * sign

                                    if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
                                                                                                   upper=True) and toY <= 2 and toX >= 3 and toX <= 5:
                                        moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
                    # --- black cannon 'c': moves like a rook on empty squares; captures
                    #     only by jumping exactly one screen piece ('hits' flag).
                    elif (board_positions[y][x] == 'c' and current_player == 'b'):
                        toY = y
                        hits = False
                        for toX in range(x - 1, -1, -1):
                            m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
                            if (hits == False):
                                if (board_positions[toY][toX].isalpha()):
                                    hits = True
                                else:
                                    moves.append(m)
                            else:
                                if (board_positions[toY][toX].isalpha()):
                                    if (board_positions[toY][toX].isupper()):
                                        moves.append(m)
                                    break

                        hits = False
                        for toX in range(x + 1, GameBoard.Nx):
                            m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
                            if (hits == False):
                                if (board_positions[toY][toX].isalpha()):
                                    hits = True
                                else:
                                    moves.append(m)
                            else:
                                if (board_positions[toY][toX].isalpha()):
                                    if (board_positions[toY][toX].isupper()):
                                        moves.append(m)
                                    break

                        toX = x
                        hits = False
                        for toY in range(y - 1, -1, -1):
                            m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
                            if (hits == False):
                                if (board_positions[toY][toX].isalpha()):
                                    hits = True
                                else:
                                    moves.append(m)
                            else:
                                if (board_positions[toY][toX].isalpha()):
                                    if (board_positions[toY][toX].isupper()):
                                        moves.append(m)
                                    break

                        hits = False
                        for toY in range(y + 1, GameBoard.Ny):
                            m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
                            if (hits == False):
                                if (board_positions[toY][toX].isalpha()):
                                    hits = True
                                else:
                                    moves.append(m)
                            else:
                                if (board_positions[toY][toX].isalpha()):
                                    if (board_positions[toY][toX].isupper()):
                                        moves.append(m)
                                    break
                    # --- red cannon 'C'.
                    elif (board_positions[y][x] == 'C' and current_player == 'w'):
                        toY = y
                        hits = False
                        for toX in range(x - 1, -1, -1):
                            m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
                            if (hits == False):
                                if (board_positions[toY][toX].isalpha()):
                                    hits = True
                                else:
                                    moves.append(m)
                            else:
                                if (board_positions[toY][toX].isalpha()):
                                    if (board_positions[toY][toX].islower()):
                                        moves.append(m)
                                    break

                        hits = False
                        for toX in range(x + 1, GameBoard.Nx):
                            m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
                            if (hits == False):
                                if (board_positions[toY][toX].isalpha()):
                                    hits = True
                                else:
                                    moves.append(m)
                            else:
                                if (board_positions[toY][toX].isalpha()):
                                    if (board_positions[toY][toX].islower()):
                                        moves.append(m)
                                    break

                        toX = x
                        hits = False
                        for toY in range(y - 1, -1, -1):
                            m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
                            if (hits == False):
                                if (board_positions[toY][toX].isalpha()):
                                    hits = True
                                else:
                                    moves.append(m)
                            else:
                                if (board_positions[toY][toX].isalpha()):
                                    if (board_positions[toY][toX].islower()):
                                        moves.append(m)
                                    break

                        hits = False
                        for toY in range(y + 1, GameBoard.Ny):
                            m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
                            if (hits == False):
                                if (board_positions[toY][toX].isalpha()):
                                    hits = True
                                else:
                                    moves.append(m)
                            else:
                                if (board_positions[toY][toX].isalpha()):
                                    if (board_positions[toY][toX].islower()):
                                        moves.append(m)
                                    break
                    # --- black pawn 'p': moves toward decreasing y; gains sideways
                    #     moves after crossing the river (y < 5).
                    elif (board_positions[y][x] == 'p' and current_player == 'b'):
                        toY = y - 1
                        toX = x

                        if (GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=False)):
                            moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])

                        if y < 5:
                            toY = y
                            toX = x + 1
                            if (GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=False)):
                                moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])

                            toX = x - 1
                            if (GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=False)):
                                moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])

                    # --- red pawn 'P': moves toward increasing y; sideways after y > 4.
                    elif (board_positions[y][x] == 'P' and current_player == 'w'):
                        toY = y + 1
                        toX = x

                        if (GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=True)):
                            moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])

                        if y > 4:
                            toY = y
                            toX = x + 1
                            if (GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=True)):
                                moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])

                            toX = x - 1
                            if (GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=True)):
                                moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])

        # "Flying general": if the two kings share a file with nothing between,
        # the side to move may capture the opposing king directly.
        if(K_x != None and k_x != None and K_x == k_x):
            face_to_face = True
            for i in range(K_y + 1, k_y, 1):
                if(board_positions[i][K_x].isalpha()):
                    face_to_face = False

        if(face_to_face == True):
            if(current_player == 'b'):
                moves.append(GameBoard.board_pos_name[k_y][k_x] + GameBoard.board_pos_name[K_y][K_x])
            else:
                moves.append(GameBoard.board_pos_name[K_y][K_x] + GameBoard.board_pos_name[k_y][k_x])

        return moves
+
def softmax(x):
    """Return the softmax of *x* as a numpy array (numerically stabilized)."""
    # Subtracting the max before exponentiating avoids overflow without
    # changing the result.
    shifted = np.asarray(x) - np.max(x)
    exps = np.exp(shifted)
    return exps / np.sum(exps)
+
+class cchess_main(object):
+
    def __init__(self, playout=400, in_batch_size=128, exploration = True, in_search_threads = 16, processor = "cpu", num_gpus = 1, res_block_nums = 7, human_color = 'b'):
        """Set up the game driver: hyper-parameters, replay buffer, board, network, and MCTS.

        Args:
            playout: simulations per move.
            in_batch_size: training mini-batch size.
            exploration: whether to add exploration noise during self-play.
            in_search_threads: parallel MCTS search coroutines.
            processor: 'cpu' selects the single-device network, else the multi-GPU one.
            num_gpus, res_block_nums: network construction parameters.
            human_color: side played by the human ('w' red / 'b' black).
        """
        self.epochs = 5
        self.playout_counts = playout #400 #800 #1600 200
        self.temperature = 1 #1e-8 1e-3
        # self.c = 1e-4
        self.batch_size = in_batch_size #128 #512
        # self.momentum = 0.9
        self.game_batch = 400 # Evaluation each 400 times
        # self.game_loop = 25000
        self.top_steps = 30
        self.top_temperature = 1 #2
        # self.Dirichlet = 0.3 # P(s,a) = (1 - ϵ)p_a + ϵη_a #self-play chapter in the paper
        self.eta = 0.03
        # self.epsilon = 0.25
        # self.v_resign = 0.05
        # self.c_puct = 5
        self.learning_rate = 0.001 #5e-3 # 0.001
        self.lr_multiplier = 1.0 # adaptively adjust the learning rate based on KL
        self.buffer_size = 10000
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.game_borad = GameBoard()
        self.processor = processor
        # self.current_player = 'w'  # 'w' is the red side, 'b' is the black side
        self.policy_value_netowrk = policy_value_network(lambda: self.learning_rate * self.lr_multiplier, res_block_nums) if processor == 'cpu' else policy_value_network_gpus(num_gpus, res_block_nums)
        self.search_threads = in_search_threads
        self.mcts = MCTS_tree(self.game_borad.state, self.policy_value_netowrk.forward, self.search_threads)
        self.exploration = exploration
        self.resign_threshold = -0.8 #0.05
        self.global_step = 0
        self.kl_targ = 0.025  # target KL divergence for the adaptive learning rate
        self.human_color = human_color
+
    @staticmethod
    def flip_policy(prob):
        """Remap policy vector *prob* onto the vertically flipped board via unflipped_index.

        NOTE(review): the comprehension variable ``ind`` shadows the module-level
        ``ind`` piece-index dict — harmless here, but worth renaming.
        """
        prob = tf.squeeze(prob) # .flatten()
        return np.asarray([prob[ind] for ind in unflipped_index])
+
    def policy_update(self):
        """update the policy-value net

        Samples a mini-batch from the replay buffer, trains for up to
        self.epochs steps with KL-based early stopping, saves a checkpoint,
        and adapts the learning-rate multiplier from the measured KL.
        """
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        #print("training data_buffer len : ", len(self.data_buffer))
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]

        winner_batch = np.expand_dims(winner_batch, 1)

        start_time = time.time()
        old_probs, old_v = self.mcts.forward(state_batch)
        for i in range(self.epochs):
            # print("tf.executing_eagerly() : ", tf.executing_eagerly())
            state_batch = np.array(state_batch)
            # A single unbatched sample needs an explicit batch dimension.
            if len(state_batch.shape) == 3:
                sp = state_batch.shape
                state_batch = np.reshape(state_batch, [1, sp[0], sp[1], sp[2]])
            if self.processor == 'cpu':
                accuracy, loss, self.global_step = self.policy_value_netowrk.train_step(state_batch, mcts_probs_batch, winner_batch,
                                                       self.learning_rate * self.lr_multiplier) #
            else:
                # import pickle
                # pickle.dump((state_batch, mcts_probs_batch, winner_batch, self.learning_rate * self.lr_multiplier), open('preprocess.p', 'wb'))
                with self.policy_value_netowrk.strategy.scope():
                    train_dataset = tf.data.Dataset.from_tensor_slices((state_batch, mcts_probs_batch, winner_batch)).batch(len(winner_batch))    # , self.learning_rate * self.lr_multiplier
                    # .shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
                    train_iterator = self.policy_value_netowrk.strategy.make_dataset_iterator(train_dataset)
                    train_iterator.initialize()
                    accuracy, loss, self.global_step = self.policy_value_netowrk.distributed_train(train_iterator)

            new_probs, new_v = self.mcts.forward(state_batch)
            kl_tmp = old_probs * (np.log((old_probs + 1e-10) / (new_probs + 1e-10)))

            kl_lst = []
            for line in kl_tmp:
                # print("line.shape", line.shape)
                all_value = [x for x in line if str(x) != 'nan' and str(x)!= 'inf']  # drop nan/inf entries
                kl_lst.append(np.sum(all_value))
            kl = np.mean(kl_lst)
            # kl = scipy.stats.entropy(old_probs, new_probs)
            # kl = np.mean(np.sum(old_probs * (np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)), axis=1))

            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break
        self.policy_value_netowrk.save(self.global_step)
        print("train using time {} s".format(time.time() - start_time))

        # adaptively adjust the learning rate
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5

        explained_var_old = 1 - np.var(np.array(winner_batch) - tf.squeeze(old_v)) / np.var(np.array(winner_batch))    # .flatten()
        explained_var_new = 1 - np.var(np.array(winner_batch) - tf.squeeze(new_v)) / np.var(np.array(winner_batch))    # .flatten()
        print(
            "kl:{:.5f},lr_multiplier:{:.3f},loss:{},accuracy:{},explained_var_old:{:.3f},explained_var_new:{:.3f}".format(
                kl, self.lr_multiplier, loss, accuracy, explained_var_old, explained_var_new))
+
    def run(self):
        """Self-play training loop: generate games, fill the buffer, train.

        Runs until interrupted; on KeyboardInterrupt the current network
        weights are saved before returning.
        """
        #self.game_loop
        batch_iter = 0
        try:
            while(True):
                batch_iter += 1
                play_data, episode_len = self.selfplay()
                print("batch i:{}, episode_len:{}".format(batch_iter, episode_len))
                extend_data = []
                # states_data = []
                for state, mcts_prob, winner in play_data:
                    # Convert each state string to network input planes before buffering.
                    states_data = self.mcts.state_to_positions(state)
                    # prob = np.zeros(labels_len)
                    # for idx in range(len(mcts_prob[0][0])):
                    #     prob[label2i[mcts_prob[0][0][idx]]] = mcts_prob[0][1][idx]
                    extend_data.append((states_data, mcts_prob, winner))
                self.data_buffer.extend(extend_data)
                if len(self.data_buffer) > self.batch_size:
                    self.policy_update()
                # if (batch_iter) % self.game_batch == 0:
                #     print("current self-play batch: {}".format(batch_iter))
                #     win_ratio = self.policy_evaluate()
        except KeyboardInterrupt:
            self.policy_value_netowrk.save(self.global_step)
+
    def get_hint(self, mcts_or_net, reverse, disp_mcts_msg_handler):
        """Return (move, probability) pairs for the current position, sorted by probability.

        Args:
            mcts_or_net: "mcts" derives probabilities from root visit counts
                (running playouts first if the root has no children);
                "net" queries the raw policy network over the legal moves.
            reverse: sort order passed to sorted() (True = highest first).
            disp_mcts_msg_handler: callback invoked before a (slow) MCTS run.

        NOTE(review): if *mcts_or_net* is neither "mcts" nor "net",
        act_prob_dict is never bound and the final sorted() raises — confirm
        callers only pass these two values.
        """
        if mcts_or_net == "mcts":
            if self.mcts.root.child == {}:
                disp_mcts_msg_handler()
                self.mcts.main(self.game_borad.state, self.game_borad.current_player, self.game_borad.restrict_round,
                               self.playout_counts)

            actions_visits = [(act, nod.N) for act, nod in self.mcts.root.child.items()]
            actions, visits = zip(*actions_visits)
            # print("visits : ", visits)
            # print("np.log(visits) : ", np.log(visits))
            probs = softmax(1.0 / self.temperature * np.log(visits))  # + 1e-10

            act_prob_dict = defaultdict(float)
            for i in range(len(actions)):
                # Present moves from the human's perspective when the human plays red.
                if self.human_color == 'w':
                    action = "".join(flipped_uci_labels(actions[i]))
                else:
                    action = actions[i]
                act_prob_dict[action] = probs[i]

        elif mcts_or_net == "net":
            positions = self.mcts.generate_inputs(self.game_borad.state, self.game_borad.current_player)
            positions = np.expand_dims(positions, 0)
            action_probs, value = self.mcts.forward(positions)

            if self.mcts.is_black_turn(self.game_borad.current_player):
                action_probs = cchess_main.flip_policy(action_probs)
            moves = GameBoard.get_legal_moves(self.game_borad.state, self.game_borad.current_player)

            tot_p = 1e-8
            action_probs = tf.squeeze(action_probs)  # .flatten() # .squeeze()
            act_prob_dict = defaultdict(float)
            # print("expand action_probs shape : ", action_probs.shape)
            for action in moves:
                # in_state = GameBoard.sim_do_action(action, self.state)
                mov_p = action_probs[label2i[action]]
                if self.human_color == 'w':
                    action = "".join(flipped_uci_labels(action))
                act_prob_dict[action] = mov_p
                # new_node = leaf_node(self, mov_p, in_state)
                # self.child[action] = new_node
                tot_p += mov_p

            # Renormalize over legal moves only.
            for a, _ in act_prob_dict.items():
                act_prob_dict[a] /= tot_p

        sorted_move_probs = sorted(act_prob_dict.items(), key=lambda item: item[1], reverse=reverse)
        # print(sorted_move_probs)

        return sorted_move_probs
+
+    #@profile
+    def get_action(self, state, temperature = 1e-3):  # run MCTS and sample a move from the visit-count distribution
+        # for i in range(self.playout_counts):
+        #     state_sim = copy.deepcopy(state)
+        #     self.mcts.do_simulation(state_sim, self.game_borad.current_player, self.game_borad.restrict_round)
+
+        self.mcts.main(state, self.game_borad.current_player, self.game_borad.restrict_round, self.playout_counts)
+
+        actions_visits = [(act, nod.N) for act, nod in self.mcts.root.child.items()]
+        actions, visits = zip(*actions_visits)
+        probs = softmax(1.0 / temperature * np.log(visits)) #+ 1e-10
+        move_probs = []
+        move_probs.append([actions, probs])
+
+        if(self.exploration):  # self-play mode: mix in Dirichlet noise, AlphaZero-style
+            act = np.random.choice(actions, p=0.75 * probs + 0.25*np.random.dirichlet(0.3*np.ones(len(probs))))
+        else:
+            act = np.random.choice(actions, p=probs)
+
+        win_rate = self.mcts.Q(act) # / 2.0 + 0.5
+        self.mcts.update_tree(act)  # advance the root so the chosen subtree is reused next turn
+
+        # if position.n < 30: # self.top_steps
+        #     move = select_weighted_random(position, on_board_move_prob)
+        # else:
+        #     move = select_most_likely(position, on_board_move_prob)
+
+        return act, move_probs, win_rate
+
+    def get_action_old(self, state, temperature = 1e-3):  # legacy synchronous variant of get_action; returns no win rate
+        for i in range(self.playout_counts):
+            state_sim = copy.deepcopy(state)  # simulate on a copy so the real board is untouched
+            self.mcts.do_simulation(state_sim, self.game_borad.current_player, self.game_borad.restrict_round)
+
+        actions_visits = [(act, nod.N) for act, nod in self.mcts.root.child.items()]
+        actions, visits = zip(*actions_visits)
+        probs = softmax(1.0 / temperature * np.log(visits)) #+ 1e-10
+        move_probs = []
+        move_probs.append([actions, probs])
+
+        if(self.exploration):  # add Dirichlet noise during self-play
+            act = np.random.choice(actions, p=0.75 * probs + 0.25*np.random.dirichlet(0.3*np.ones(len(probs))))
+        else:
+            act = np.random.choice(actions, p=probs)
+
+        self.mcts.update_tree(act)
+
+        return act, move_probs
+
+    def check_end(self):  # -> (game_over, winner) where winner is "w" (red), "b" (black/green) or "t" (tie)
+        if (self.game_borad.state.find('K') == -1 or self.game_borad.state.find('k') == -1):  # a king has been captured
+            if (self.game_borad.state.find('K') == -1):  # uppercase king gone -> black/green wins
+                print("Green is Winner")
+                return True, "b"
+            if (self.game_borad.state.find('k') == -1):  # lowercase king gone -> red wins
+                print("Red is Winner")
+                return True, "w"
+        elif self.game_borad.restrict_round >= 60:  # 60 consecutive moves without a capture: draw
+            print("TIE! No Winners!")
+            return True, "t"
+        else:
+            return False, ""
+
+    def human_move(self, coord, mcts_or_net):  # apply the human player's move; returns MCTS win-rate (0 in "net" mode)
+        win_rate = 0
+        x_trans = {0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e', 5: 'f', 6: 'g', 7: 'h', 8: 'i'}  # file index -> UCI letter
+
+        src = coord[0:2]  # NOTE(review): assumes coord is a 4-sequence (x0, y0, x1, y1) of ints — confirm with caller
+        dst = coord[2:4]
+
+        src_x = (x_trans[src[0]])
+        src_y = str(src[1])
+
+        dst_x = (x_trans[dst[0]])
+        dst_y = str(dst[1])
+
+        action = src_x + src_y + dst_x + dst_y  # 4-char UCI-style move string, e.g. "a0a1"
+
+        if self.human_color == 'w':  # internal board is flipped for a white human; flip the move to match
+            action = "".join(flipped_uci_labels(action))
+
+        if mcts_or_net == "mcts":  # keep the AI's search tree in sync with the human's move
+            if self.mcts.root.child == {}:
+                # self.get_action(self.game_borad.state, self.temperature)
+                self.mcts.main(self.game_borad.state, self.game_borad.current_player, self.game_borad.restrict_round,
+                               self.playout_counts)
+            win_rate = self.mcts.Q(action) # / 2.0 + 0.5
+            self.mcts.update_tree(action)  # re-root the tree at the position after this move
+
+        last_state = self.game_borad.state
+        # print(self.game_borad.current_player, " now take a action : ", action, "[Step {}]".format(self.game_borad.round))
+        self.game_borad.state = GameBoard.sim_do_action(action, self.game_borad.state)
+        self.game_borad.round += 1
+        self.game_borad.current_player = "w" if self.game_borad.current_player == "b" else "b"
+        if is_kill_move(last_state, self.game_borad.state) == 0:  # no capture: advance the draw counter
+            self.game_borad.restrict_round += 1
+        else:
+            self.game_borad.restrict_round = 0
+
+        return win_rate
+
+
+    def select_move(self, mcts_or_net):  # AI plays one move; returns ((src_x, src_y, dx, dy), win_rate) for the GUI
+        if mcts_or_net == "mcts":
+            action, probs, win_rate = self.get_action(self.game_borad.state, self.temperature)
+            # win_rate = self.mcts.Q(action) / 2.0 + 0.5
+        elif mcts_or_net == "net":  # greedy: pick the policy net's highest-probability legal move, no search
+            positions = self.mcts.generate_inputs(self.game_borad.state, self.game_borad.current_player)
+            positions = np.expand_dims(positions, 0)  # add batch dimension
+            action_probs, value = self.mcts.forward(positions)
+            win_rate = value[0, 0] # / 2 + 0.5
+            if self.mcts.is_black_turn(self.game_borad.current_player):
+                action_probs = cchess_main.flip_policy(action_probs)  # policy is learned from red's perspective
+            moves = GameBoard.get_legal_moves(self.game_borad.state, self.game_borad.current_player)
+
+            tot_p = 1e-8  # guards against division by zero below
+            action_probs = tf.squeeze(action_probs) # .flatten() # .squeeze()
+            act_prob_dict = defaultdict(float)
+            # print("expand action_probs shape : ", action_probs.shape)
+            for action in moves:
+                # in_state = GameBoard.sim_do_action(action, self.state)
+                mov_p = action_probs[label2i[action]]
+                act_prob_dict[action] = mov_p
+                # new_node = leaf_node(self, mov_p, in_state)
+                # self.child[action] = new_node
+                tot_p += mov_p
+
+            for a, _ in act_prob_dict.items():
+                act_prob_dict[a] /= tot_p  # renormalize over legal moves only
+
+            action = max(act_prob_dict.items(), key=lambda node: node[1])[0]  # argmax move
+            # self.mcts.update_tree(action)
+
+        print('Win rate for player {} is {:.4f}'.format(self.game_borad.current_player, win_rate))
+        last_state = self.game_borad.state
+        print(self.game_borad.current_player, " now take a action : ", action, "[Step {}]".format(self.game_borad.round)) # if self.human_color == 'w' else "".join(flipped_uci_labels(action))
+        self.game_borad.state = GameBoard.sim_do_action(action, self.game_borad.state)
+        self.game_borad.round += 1
+        self.game_borad.current_player = "w" if self.game_borad.current_player == "b" else "b"
+        if is_kill_move(last_state, self.game_borad.state) == 0:  # no capture: advance the draw counter
+            self.game_borad.restrict_round += 1
+        else:
+            self.game_borad.restrict_round = 0
+
+        self.game_borad.print_borad(self.game_borad.state)
+
+        x_trans = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8}  # UCI letter -> file index
+
+        if self.human_color == 'w':  # flip back to the human's (GUI) orientation
+            action = "".join(flipped_uci_labels(action))
+
+        src = action[0:2]
+        dst = action[2:4]
+
+        src_x = int(x_trans[src[0]])
+        src_y = int(src[1])
+
+        dst_x = int(x_trans[dst[0]])
+        dst_y = int(dst[1])
+
+        return (src_x, src_y, dst_x - src_x, dst_y - src_y), win_rate  # GUI move is source square plus delta
+
+    def selfplay(self):  # play one full self-play game; returns (zip(states, mcts_probs, z), game_length)
+        self.game_borad.reload()  # fresh board for every game
+        # p1, p2 = self.game_borad.players
+        states, mcts_probs, current_players = [], [], []
+        z = None  # per-move outcome labels, filled in once the game ends
+        game_over = False
+        winnner = ""
+        start_time = time.time()
+        # self.game_borad.print_borad(self.game_borad.state)
+        while(not game_over):
+            action, probs, win_rate = self.get_action(self.game_borad.state, self.temperature)
+            state, palyer = self.mcts.try_flip(self.game_borad.state, self.game_borad.current_player, self.mcts.is_black_turn(self.game_borad.current_player))  # store every position from red's perspective
+            states.append(state)
+            prob = np.zeros(labels_len)
+            if self.mcts.is_black_turn(self.game_borad.current_player):  # black's moves: flip move labels back before indexing
+                for idx in range(len(probs[0][0])):
+                    # probs[0][0][idx] = "".join((str(9 - int(a)) if a.isdigit() else a) for a in probs[0][0][idx])
+                    act = "".join((str(9 - int(a)) if a.isdigit() else a) for a in probs[0][0][idx])
+                    # for idx in range(len(mcts_prob[0][0])):
+                    prob[label2i[act]] = probs[0][1][idx]
+            else:
+                for idx in range(len(probs[0][0])):
+                    prob[label2i[probs[0][0][idx]]] = probs[0][1][idx]
+            mcts_probs.append(prob)
+            # mcts_probs.append(probs)
+            current_players.append(self.game_borad.current_player)
+
+            last_state = self.game_borad.state
+            # print(self.game_borad.current_player, " now take a action : ", action, "[Step {}]".format(self.game_borad.round))
+            self.game_borad.state = GameBoard.sim_do_action(action, self.game_borad.state)
+            self.game_borad.round += 1
+            self.game_borad.current_player = "w" if self.game_borad.current_player == "b" else "b"
+            if is_kill_move(last_state, self.game_borad.state) == 0:  # no capture: advance the draw counter
+                self.game_borad.restrict_round += 1
+            else:
+                self.game_borad.restrict_round = 0
+
+            # self.game_borad.print_borad(self.game_borad.state, action)
+
+            if (self.game_borad.state.find('K') == -1 or self.game_borad.state.find('k') == -1):  # a king was captured
+                z = np.zeros(len(current_players))
+                if (self.game_borad.state.find('K') == -1):
+                    winnner = "b"
+                if (self.game_borad.state.find('k') == -1):
+                    winnner = "w"
+                z[np.array(current_players) == winnner] = 1.0  # +1 for the winner's moves, -1 for the loser's
+                z[np.array(current_players) != winnner] = -1.0
+                game_over = True
+                print("Game end. Winner is player : ", winnner, " In {} steps".format(self.game_borad.round - 1))
+            elif self.game_borad.restrict_round >= 60:  # 60 non-capture moves: draw, all labels stay 0
+                z = np.zeros(len(current_players))
+                game_over = True
+                print("Game end. Tie in {} steps".format(self.game_borad.round - 1))
+            # elif(self.mcts.root.v < self.resign_threshold):
+            #     pass
+            # elif(self.mcts.root.Q < self.resign_threshold):
+            #     pass
+            if(game_over):
+                # self.mcts.root = leaf_node(None, self.mcts.p_, "RNBAKABNR/9/1C5C1/P1P1P1P1P/9/9/p1p1p1p1p/1c5c1/9/rnbakabnr")#"rnbakabnr/9/1c5c1/p1p1p1p1p/9/9/P1P1P1P1P/1C5C1/9/RNBAKABNR"
+                self.mcts.reload()  # reset the search tree for the next game
+        print("Using time {} s".format(time.time() - start_time))
+        return zip(states, mcts_probs, z), len(z)
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--mode', default='train', choices=['train', 'play'], type=str, help='train or play')
+    parser.add_argument('--ai_count', default=1, choices=[1, 2], type=int, help='choose ai player count')
+    parser.add_argument('--ai_function', default='mcts', choices=['mcts', 'net'], type=str, help='mcts or net')
+    parser.add_argument('--train_playout', default=400, type=int, help='mcts train playout')
+    parser.add_argument('--batch_size', default=512, type=int, help='train batch_size')
+    parser.add_argument('--play_playout', default=400, type=int, help='mcts play playout')
+    parser.add_argument('--delay', dest='delay', action='store',
+                        nargs='?', default=3, type=float, required=False,
+                        help='Set how many seconds you want to delay after each move')
+    parser.add_argument('--end_delay', dest='end_delay', action='store',
+                        nargs='?', default=3, type=float, required=False,
+                        help='Set how many seconds you want to delay after the end of game')
+    parser.add_argument('--search_threads', default=16, type=int, help='search_threads')
+    parser.add_argument('--processor', default='cpu', choices=['cpu', 'gpu'], type=str, help='cpu or gpu')
+    parser.add_argument('--num_gpus', default=1, type=int, help='gpu counts')
+    parser.add_argument('--res_block_nums', default=7, type=int, help='res_block_nums')
+    parser.add_argument('--human_color', default='b', choices=['w', 'b'], type=str, help='w or b')
+    parser.add_argument('--no-gui', action='store_true', help='run without gui')  # argparse exposes this as args.no_gui
+    args = parser.parse_args()
+
+    if args.mode == 'train':  # headless self-play training (no_gui is not passed here)
+        train_main = cchess_main(args.train_playout, args.batch_size, True, args.search_threads, args.processor, args.num_gpus, args.res_block_nums, args.human_color) # * args.num_gpus
+        train_main.run()
+    elif args.mode == 'play':  # interactive game, optionally GUI-less via --no-gui
+        from ChessGame import *
+        game = ChessGame(args.ai_count, args.ai_function, args.play_playout, args.delay, args.end_delay, args.batch_size,
+                         args.search_threads, args.processor, args.num_gpus, args.res_block_nums, args.human_color, args.no_gui) # * args.num_gpus
+        game.start()
diff --git a/main_tf2.py b/main_tf2.py
deleted file mode 100755
index a896ba8..0000000
--- a/main_tf2.py
+++ /dev/null
@@ -1,1603 +0,0 @@
-#coding:utf-8
-from asyncio import Future
-import asyncio
-from asyncio.queues import Queue
-import uvloop
-asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
-
-import tensorflow as tf
-import numpy as np
-import os
-import sys
-import random
-import time
-import argparse
-from collections import deque, defaultdict, namedtuple
-import copy
-from policy_value_network_tf2 import *
-from policy_value_network_gpus_tf2 import *
-import scipy.stats
-from threading import Lock
-from concurrent.futures import ThreadPoolExecutor
-
-def flipped_uci_labels(param):
- def repl(x):
- return "".join([(str(9 - int(a)) if a.isdigit() else a) for a in x])
-
- return [repl(x) for x in param]
-
-# 创建所有合法走子UCI,size 2086
-def create_uci_labels():
- labels_array = []
- letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']
- numbers = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
-
- Advisor_labels = ['d7e8', 'e8d7', 'e8f9', 'f9e8', 'd0e1', 'e1d0', 'e1f2', 'f2e1',
- 'd2e1', 'e1d2', 'e1f0', 'f0e1', 'd9e8', 'e8d9', 'e8f7', 'f7e8']
- Bishop_labels = ['a2c4', 'c4a2', 'c0e2', 'e2c0', 'e2g4', 'g4e2', 'g0i2', 'i2g0',
- 'a7c9', 'c9a7', 'c5e7', 'e7c5', 'e7g9', 'g9e7', 'g5i7', 'i7g5',
- 'a2c0', 'c0a2', 'c4e2', 'e2c4', 'e2g0', 'g0e2', 'g4i2', 'i2g4',
- 'a7c5', 'c5a7', 'c9e7', 'e7c9', 'e7g5', 'g5e7', 'g9i7', 'i7g9']
- # King_labels = ['d0d7', 'd0d8', 'd0d9', 'd1d7', 'd1d8', 'd1d9', 'd2d7', 'd2d8', 'd2d9',
- # 'd7d0', 'd7d1', 'd7d2', 'd8d0', 'd8d1', 'd8d2', 'd9d0', 'd9d1', 'd9d2',
- # 'd0d7', 'd0d8', 'd0d9', 'd1d7', 'd1d8', 'd1d9', 'd2d7', 'd2d8', 'd2d9',
- # 'd0d7', 'd0d8', 'd0d9', 'd1d7', 'd1d8', 'd1d9', 'd2d7', 'd2d8', 'd2d9',
- # 'd0d7', 'd0d8', 'd0d9', 'd1d7', 'd1d8', 'd1d9', 'd2d7', 'd2d8', 'd2d9',
- # 'd0d7', 'd0d8', 'd0d9', 'd1d7', 'd1d8', 'd1d9', 'd2d7', 'd2d8', 'd2d9']
-
- for l1 in range(9):
- for n1 in range(10):
- destinations = [(t, n1) for t in range(9)] + \
- [(l1, t) for t in range(10)] + \
- [(l1 + a, n1 + b) for (a, b) in
- [(-2, -1), (-1, -2), (-2, 1), (1, -2), (2, -1), (-1, 2), (2, 1), (1, 2)]] # 马走日
- for (l2, n2) in destinations:
- if (l1, n1) != (l2, n2) and l2 in range(9) and n2 in range(10):
- move = letters[l1] + numbers[n1] + letters[l2] + numbers[n2]
- labels_array.append(move)
-
- for p in Advisor_labels:
- labels_array.append(p)
-
- for p in Bishop_labels:
- labels_array.append(p)
-
- return labels_array
-
-def create_position_labels():
- labels_array = []
- letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']
- letters.reverse()
- numbers = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
-
- for l1 in range(9):
- for n1 in range(10):
- move = letters[8 - l1] + numbers[n1]
- labels_array.append(move)
-# labels_array.reverse()
- return labels_array
-
-def create_position_labels_reverse():
- labels_array = []
- letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']
- letters.reverse()
- numbers = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
-
- for l1 in range(9):
- for n1 in range(10):
- move = letters[l1] + numbers[n1]
- labels_array.append(move)
- labels_array.reverse()
- return labels_array
-
-class leaf_node(object):
- def __init__(self, in_parent, in_prior_p, in_state):
- self.P = in_prior_p
- self.Q = 0
- self.N = 0
- self.v = 0
- self.U = 0
- self.W = 0
- self.parent = in_parent
- self.child = {}
- self.state = in_state
-
- def is_leaf(self):
- return self.child == {}
-
- def get_Q_plus_U_new(self, c_puct):
- """Calculate and return the value for this node: a combination of leaf evaluations, Q, and
- this node's prior adjusted for its visit count, u
- c_puct -- a number in (0, inf) controlling the relative impact of values, Q, and
- prior probability, P, on this node's score.
- """
- # self._u = c_puct * self._P * np.sqrt(self._parent._n_visits) / (1 + self._n_visits)
- U = c_puct * self.P * np.sqrt(self.parent.N) / ( 1 + self.N)
- return self.Q + U
-
- def get_Q_plus_U(self, c_puct):
- """Calculate and return the value for this node: a combination of leaf evaluations, Q, and
- this node's prior adjusted for its visit count, u
- c_puct -- a number in (0, inf) controlling the relative impact of values, Q, and
- prior probability, P, on this node's score.
- """
- # self._u = c_puct * self._P * np.sqrt(self._parent._n_visits) / (1 + self._n_visits)
- self.U = c_puct * self.P * np.sqrt(self.parent.N) / ( 1 + self.N)
- return self.Q + self.U
-
- # def select_move_by_action_score(self, noise=True):
- #
- # # P = params[self.lookup['P']]
- # # N = params[self.lookup['N']]
- # # Q = params[self.lookup['W']] / (N + 1e-8)
- # # U = c_PUCT * P * np.sqrt(np.sum(N)) / (1 + N)
- #
- # ret_a = None
- # ret_n = None
- # action_idx = {}
- # action_score = []
- # i = 0
- # for a, n in self.child.items():
- # U = c_PUCT * n.P * np.sqrt(n.parent.N) / ( 1 + n.N)
- # action_idx[i] = (a, n)
- #
- # if noise:
- # action_score.append(n.Q + U * (0.75 * n.P + 0.25 * dirichlet([.03] * (go.N ** 2 + 1))) / (n.P + 1e-8))
- # else:
- # action_score.append(n.Q + U)
- # i += 1
- # # if(n.Q + n.U > max_Q_plus_U):
- # # max_Q_plus_U = n.Q + n.U
- # # ret_a = a
- # # ret_n = n
- #
- # action_t = int(np.argmax(action_score[:-1]))
- #
- # return ret_a, ret_n
- # # return action_t
- def select_new(self, c_puct):
- return max(self.child.items(), key=lambda node: node[1].get_Q_plus_U_new(c_puct))
-
- def select(self, c_puct):
- # max_Q_plus_U = 1e-10
- # ret_a = None
- # ret_n = None
- # for a, n in self.child.items():
- # n.U = c_puct * n.P * np.sqrt(n.parent.N) / ( 1 + n.N)
- # if(n.Q + n.U > max_Q_plus_U):
- # max_Q_plus_U = n.Q + n.U
- # ret_a = a
- # ret_n = n
- # return ret_a, ret_n
- return max(self.child.items(), key=lambda node: node[1].get_Q_plus_U(c_puct))
-
- #@profile
- def expand(self, moves, action_probs):
- tot_p = 1e-8
- # print("action_probs : ", action_probs)
- action_probs = tf.squeeze(action_probs) #.flatten() #.squeeze()
- # print("expand action_probs shape : ", action_probs.shape)
- for action in moves:
- in_state = GameBoard.sim_do_action(action, self.state)
- mov_p = action_probs[label2i[action]]
- new_node = leaf_node(self, mov_p, in_state)
- self.child[action] = new_node
- tot_p += mov_p
-
- for a, n in self.child.items():
- n.P /= tot_p
-
- def back_up_value(self, value):
- self.N += 1
- self.W += value
- self.v = value
- self.Q = self.W / self.N # node.Q += 1.0*(value - node.Q) / node.N
- self.U = c_PUCT * self.P * np.sqrt(self.parent.N) / ( 1 + self.N)
- # node = node.parent
- # value = -value
-
- def backup(self, value):
- node = self
- while node != None:
- node.N += 1
- node.W += value
- node.v = value
- node.Q = node.W / node.N # node.Q += 1.0*(value - node.Q) / node.N
- node = node.parent
- value = -value
-
-pieces_order = 'KARBNPCkarbnpc' # 9 x 10 x 14
-ind = {pieces_order[i]: i for i in range(14)}
-
-labels_array = create_uci_labels()
-labels_len = len(labels_array)
-flipped_labels = flipped_uci_labels(labels_array)
-unflipped_index = [labels_array.index(x) for x in flipped_labels]
-
-i2label = {i: val for i, val in enumerate(labels_array)}
-label2i = {val: i for i, val in enumerate(labels_array)}
-
-def get_pieces_count(state):
- count = 0
- for s in state:
- if s.isalpha():
- count += 1
- return count
-
-def is_kill_move(state_prev, state_next):
- return get_pieces_count(state_prev) - get_pieces_count(state_next)
-
-QueueItem = namedtuple("QueueItem", "feature future")
-c_PUCT = 5
-virtual_loss = 3
-cut_off_depth = 30
-
-class MCTS_tree(object):
- def __init__(self, in_state, in_forward, search_threads):
- self.noise_eps = 0.25
- self.dirichlet_alpha = 0.3 #0.03
- self.p_ = (1 - self.noise_eps) * 1 + self.noise_eps * np.random.dirichlet([self.dirichlet_alpha])
- self.root = leaf_node(None, self.p_, in_state)
- self.c_puct = 5 #1.5
- # self.policy_network = in_policy_network
- self.forward = in_forward
- self.node_lock = defaultdict(Lock)
-
- self.virtual_loss = 3
- self.now_expanding = set()
- self.expanded = set()
- self.cut_off_depth = 30
- # self.QueueItem = namedtuple("QueueItem", "feature future")
- self.sem = asyncio.Semaphore(search_threads)
- self.queue = Queue(search_threads)
- self.loop = asyncio.get_event_loop()
- self.running_simulation_num = 0
-
- def reload(self):
- self.root = leaf_node(None, self.p_,
- "RNBAKABNR/9/1C5C1/P1P1P1P1P/9/9/p1p1p1p1p/1c5c1/9/rnbakabnr") # "rnbakabnr/9/1c5c1/p1p1p1p1p/9/9/P1P1P1P1P/1C5C1/9/RNBAKABNR"
- self.expanded = set()
-
-
- def Q(self, move) -> float:
- ret = 0.0
- find = False
- for a, n in self.root.child.items():
- if move == a:
- ret = n.Q
- find = True
- if(find == False):
- print("{} not exist in the child".format(move))
- return ret
-
- def update_tree(self, act):
- # if(act in self.root.child):
- self.expanded.discard(self.root)
- self.root = self.root.child[act]
- self.root.parent = None
- # else:
- # self.root = leaf_node(None, self.p_, in_state)
-
-
- # def do_simulation(self, state, current_player, restrict_round):
- # node = self.root
- # last_state = state
- # while(node.is_leaf() == False):
- # # print("do_simulation while current_player : ", current_player)
- # with self.node_lock[node]:
- # action, node = node.select(self.c_puct)
- # current_player = "w" if current_player == "b" else "b"
- # if is_kill_move(last_state, node.state) == 0:
- # restrict_round += 1
- # else:
- # restrict_round = 0
- # last_state = node.state
- #
- # positions = self.generate_inputs(node.state, current_player)
- # positions = np.expand_dims(positions, 0)
- # action_probs, value = self.forward(positions)
- # if self.is_black_turn(current_player):
- # action_probs = cchess_main.flip_policy(action_probs)
- #
- # # print("action_probs shape : ", action_probs.shape) #(1, 2086)
- # with self.node_lock[node]:
- # if(node.state.find('K') == -1 or node.state.find('k') == -1):
- # if (node.state.find('K') == -1):
- # value = 1.0 if current_player == "b" else -1.0
- # if (node.state.find('k') == -1):
- # value = -1.0 if current_player == "b" else 1.0
- # elif restrict_round >= 60:
- # value = 0.0
- # else:
- # moves = GameBoard.get_legal_moves(node.state, current_player)
- # # print("current_player : ", current_player)
- # # print(moves)
- # node.expand(moves, action_probs)
- #
- # # if(node.parent != None):
- # # node.parent.N += self.virtual_loss
- # node.N += self.virtual_loss
- # node.W += -self.virtual_loss
- # node.Q = node.W / node.N
- #
- # # time.sleep(0.1)
- #
- # with self.node_lock[node]:
- # # if(node.parent != None):
- # # node.parent.N += -self.virtual_loss# + 1
- # node.N += -self.virtual_loss# + 1
- # node.W += self.virtual_loss# + leaf_v
- # # node.Q = node.W / node.N
- #
- # node.backup(-value)
-
- def is_expanded(self, key) -> bool:
- """Check expanded status"""
- return key in self.expanded
-
- async def tree_search(self, node, current_player, restrict_round) -> float:
- """Independent MCTS, stands for one simulation"""
- self.running_simulation_num += 1
-
- # reduce parallel search number
- with await self.sem:
- value = await self.start_tree_search(node, current_player, restrict_round)
- # logger.debug(f"value: {value}")
- # logger.debug(f'Current running threads : {RUNNING_SIMULATION_NUM}')
- self.running_simulation_num -= 1
-
- return value
-
- async def start_tree_search(self, node, current_player, restrict_round)->float:
- """Monte Carlo Tree search Select,Expand,Evauate,Backup"""
- now_expanding = self.now_expanding
-
- while node in now_expanding:
- await asyncio.sleep(1e-4)
-
- if not self.is_expanded(node): # and node.is_leaf()
- """is leaf node try evaluate and expand"""
- # add leaf node to expanding list
- self.now_expanding.add(node)
-
- positions = self.generate_inputs(node.state, current_player)
- # positions = np.expand_dims(positions, 0)
-
- # push extracted dihedral features of leaf node to the evaluation queue
- future = await self.push_queue(positions) # type: Future
- await future
- action_probs, value = future.result()
-
- # action_probs, value = self.forward(positions)
- if self.is_black_turn(current_player):
- action_probs = cchess_main.flip_policy(action_probs)
-
- moves = GameBoard.get_legal_moves(node.state, current_player)
- # print("current_player : ", current_player)
- # print(moves)
- node.expand(moves, action_probs)
- self.expanded.add(node) # node.state
-
- # remove leaf node from expanding list
- self.now_expanding.remove(node)
-
- # must invert, because alternative layer has opposite objective
- return value[0] * -1
-
- else:
- """node has already expanded. Enter select phase."""
- # select child node with maximum action scroe
- last_state = node.state
-
- action, node = node.select_new(c_PUCT)
- current_player = "w" if current_player == "b" else "b"
- if is_kill_move(last_state, node.state) == 0:
- restrict_round += 1
- else:
- restrict_round = 0
- last_state = node.state
-
- # action_t = self.select_move_by_action_score(key, noise=True)
-
- # add virtual loss
- # self.virtual_loss_do(key, action_t)
- node.N += virtual_loss
- node.W += -virtual_loss
-
- # evolve game board status
- # child_position = self.env_action(position, action_t)
-
- if (node.state.find('K') == -1 or node.state.find('k') == -1):
- if (node.state.find('K') == -1):
- value = 1.0 if current_player == "b" else -1.0
- if (node.state.find('k') == -1):
- value = -1.0 if current_player == "b" else 1.0
- value = value * -1
- elif restrict_round >= 60:
- value = 0.0
- else:
- value = await self.start_tree_search(node, current_player, restrict_round) # next move
- # if node is not None:
- # value = await self.start_tree_search(node) # next move
- # else:
- # # None position means illegal move
- # value = -1
-
- # self.virtual_loss_undo(key, action_t)
- node.N += -virtual_loss
- node.W += virtual_loss
-
- # on returning search path
- # update: N, W, Q, U
- # self.back_up_value(key, action_t, value)
- node.back_up_value(value) # -value
-
- # must invert
- return value * -1
- # if child_position is not None:
- # return value * -1
- # else:
- # # illegal move doesn't mean much for the opponent
- # return 0
-
- async def prediction_worker(self):
- """For better performance, queueing prediction requests and predict together in this worker.
- speed up about 45sec -> 15sec for example.
- """
- q = self.queue
- margin = 10 # avoid finishing before other searches starting.
- while self.running_simulation_num > 0 or margin > 0:
- if q.empty():
- if margin > 0:
- margin -= 1
- await asyncio.sleep(1e-3)
- continue
- item_list = [q.get_nowait() for _ in range(q.qsize())] # type: list[QueueItem]
- #logger.debug(f"predicting {len(item_list)} items")
- features = np.asarray([item.feature for item in item_list]) # asarray
- # print("prediction_worker [features.shape] before : ", features.shape)
- # shape = features.shape
- # features = features.reshape((shape[0] * shape[1], shape[2], shape[3], shape[4]))
- # print("prediction_worker [features.shape] after : ", features.shape)
- # policy_ary, value_ary = self.run_many(features)
- action_probs, value = self.forward(features)
- for p, v, item in zip(action_probs, value, item_list):
- item.future.set_result((p, v))
-
- async def push_queue(self, features):
- future = self.loop.create_future()
- item = QueueItem(features, future)
- await self.queue.put(item)
- return future
-
- #@profile
- def main(self, state, current_player, restrict_round, playouts):
- node = self.root
- if not self.is_expanded(node): # and node.is_leaf() # node.state
- # print('Expadning Root Node...')
- positions = self.generate_inputs(node.state, current_player)
- positions = np.expand_dims(positions, 0)
- action_probs, value = self.forward(positions)
- if self.is_black_turn(current_player):
- action_probs = cchess_main.flip_policy(action_probs)
-
- moves = GameBoard.get_legal_moves(node.state, current_player)
- # print("current_player : ", current_player)
- # print(moves)
- node.expand(moves, action_probs)
- self.expanded.add(node) # node.state
-
- coroutine_list = []
- for _ in range(playouts):
- coroutine_list.append(self.tree_search(node, current_player, restrict_round))
- coroutine_list.append(self.prediction_worker())
- self.loop.run_until_complete(asyncio.gather(*coroutine_list))
-
- def do_simulation(self, state, current_player, restrict_round):
- node = self.root
- last_state = state
- while(node.is_leaf() == False):
- # print("do_simulation while current_player : ", current_player)
- action, node = node.select(self.c_puct)
- current_player = "w" if current_player == "b" else "b"
- if is_kill_move(last_state, node.state) == 0:
- restrict_round += 1
- else:
- restrict_round = 0
- last_state = node.state
-
- positions = self.generate_inputs(node.state, current_player)
- positions = np.expand_dims(positions, 0)
- action_probs, value = self.forward(positions)
- if self.is_black_turn(current_player):
- action_probs = cchess_main.flip_policy(action_probs)
-
- # print("action_probs shape : ", action_probs.shape) #(1, 2086)
-
- if(node.state.find('K') == -1 or node.state.find('k') == -1):
- if (node.state.find('K') == -1):
- value = 1.0 if current_player == "b" else -1.0
- if (node.state.find('k') == -1):
- value = -1.0 if current_player == "b" else 1.0
- elif restrict_round >= 60:
- value = 0.0
- else:
- moves = GameBoard.get_legal_moves(node.state, current_player)
- # print("current_player : ", current_player)
- # print(moves)
- node.expand(moves, action_probs)
-
- node.backup(-value)
-
- def generate_inputs(self, in_state, current_player):
- state, palyer = self.try_flip(in_state, current_player, self.is_black_turn(current_player))
- return self.state_to_positions(state)
-
- def replace_board_tags(self, board):
- board = board.replace("2", "11")
- board = board.replace("3", "111")
- board = board.replace("4", "1111")
- board = board.replace("5", "11111")
- board = board.replace("6", "111111")
- board = board.replace("7", "1111111")
- board = board.replace("8", "11111111")
- board = board.replace("9", "111111111")
- return board.replace("/", "")
-
- # 感觉位置有点反了,当前角色的棋子在右侧,plane的后面
- def state_to_positions(self, state):
- # TODO C plain x 2
- board_state = self.replace_board_tags(state)
- pieces_plane = np.zeros(shape=(9, 10, 14), dtype=np.float32)
- for rank in range(9): #横线
- for file in range(10): #直线
- v = board_state[rank * 9 + file]
- if v.isalpha():
- pieces_plane[rank][file][ind[v]] = 1
- assert pieces_plane.shape == (9, 10, 14)
- return pieces_plane
-
-
- def try_flip(self, state, current_player, flip=False):
- if not flip:
- return state, current_player
-
- rows = state.split('/')
-
- def swapcase(a):
- if a.isalpha():
- return a.lower() if a.isupper() else a.upper()
- return a
-
- def swapall(aa):
- return "".join([swapcase(a) for a in aa])
-
- return "/".join([swapall(row) for row in reversed(rows)]), ('w' if current_player == 'b' else 'b')
-
- def is_black_turn(self, current_player):
- return current_player == 'b'
-
-class GameBoard(object):
- board_pos_name = np.array(create_position_labels()).reshape(9,10).transpose()
- Ny = 10
- Nx = 9
-
- def __init__(self):
- self.state = "RNBAKABNR/9/1C5C1/P1P1P1P1P/9/9/p1p1p1p1p/1c5c1/9/rnbakabnr"#"rnbakabnr/9/1c5c1/p1p1p1p1p/9/9/P1P1P1P1P/1C5C1/9/RNBAKABNR" #
- self.round = 1
- # self.players = ["w", "b"]
- self.current_player = "w"
- self.restrict_round = 0
-
-# 小写表示黑方,大写表示红方
-# [
-# "rheakaehr",
-# " ",
-# " c c ",
-# "p p p p p",
-# " ",
-# " ",
-# "P P P P P",
-# " C C ",
-# " ",
-# "RHEAKAEHR"
-# ]
- def reload(self):
- self.state = "RNBAKABNR/9/1C5C1/P1P1P1P1P/9/9/p1p1p1p1p/1c5c1/9/rnbakabnr"#"rnbakabnr/9/1c5c1/p1p1p1p1p/9/9/P1P1P1P1P/1C5C1/9/RNBAKABNR" #
- self.round = 1
- self.current_player = "w"
- self.restrict_round = 0
-
- @staticmethod
- def print_borad(board, action = None):
- def string_reverse(string):
- # return ''.join(string[len(string) - i] for i in range(1, len(string)+1))
- return ''.join(string[i] for i in range(len(string) - 1, -1, -1))
-
- x_trans = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8}
-
- if(action != None):
- src = action[0:2]
-
- src_x = int(x_trans[src[0]])
- src_y = int(src[1])
-
- # board = string_reverse(board)
- board = board.replace("1", " ")
- board = board.replace("2", " ")
- board = board.replace("3", " ")
- board = board.replace("4", " ")
- board = board.replace("5", " ")
- board = board.replace("6", " ")
- board = board.replace("7", " ")
- board = board.replace("8", " ")
- board = board.replace("9", " ")
- board = board.split('/')
- # board = board.replace("/", "\n")
- print(" abcdefghi")
- for i,line in enumerate(board):
- if (action != None):
- if(i == src_y):
- s = list(line)
- s[src_x] = 'x'
- line = ''.join(s)
- print(i,line)
- # print(board)
-
- @staticmethod
- def sim_do_action(in_action, in_state):
- x_trans = {'a':0, 'b':1, 'c':2, 'd':3, 'e':4, 'f':5, 'g':6, 'h':7, 'i':8}
-
- src = in_action[0:2]
- dst = in_action[2:4]
-
- src_x = int(x_trans[src[0]])
- src_y = int(src[1])
-
- dst_x = int(x_trans[dst[0]])
- dst_y = int(dst[1])
-
- # GameBoard.print_borad(in_state)
- # print("sim_do_action : ", in_action)
- # print(dst_y, dst_x, src_y, src_x)
- board_positions = GameBoard.board_to_pos_name(in_state)
- line_lst = []
- for line in board_positions:
- line_lst.append(list(line))
- lines = np.array(line_lst)
- # print(lines.shape)
- # print(board_positions[src_y])
- # print("before board_positions[dst_y] = ",board_positions[dst_y])
-
- lines[dst_y][dst_x] = lines[src_y][src_x]
- lines[src_y][src_x] = '1'
-
- board_positions[dst_y] = ''.join(lines[dst_y])
- board_positions[src_y] = ''.join(lines[src_y])
-
- # src_str = list(board_positions[src_y])
- # dst_str = list(board_positions[dst_y])
- # print("src_str[src_x] = ", src_str[src_x])
- # print("dst_str[dst_x] = ", dst_str[dst_x])
- # c = copy.deepcopy(src_str[src_x])
- # dst_str[dst_x] = c
- # src_str[src_x] = '1'
- # board_positions[dst_y] = ''.join(dst_str)
- # board_positions[src_y] = ''.join(src_str)
- # print("after board_positions[dst_y] = ", board_positions[dst_y])
-
- # board_positions[dst_y][dst_x] = board_positions[src_y][src_x]
- # board_positions[src_y][src_x] = '1'
-
- board = "/".join(board_positions)
- board = board.replace("111111111", "9")
- board = board.replace("11111111", "8")
- board = board.replace("1111111", "7")
- board = board.replace("111111", "6")
- board = board.replace("11111", "5")
- board = board.replace("1111", "4")
- board = board.replace("111", "3")
- board = board.replace("11", "2")
-
- # GameBoard.print_borad(board)
- return board
-
- @staticmethod
- def board_to_pos_name(board):
- board = board.replace("2", "11")
- board = board.replace("3", "111")
- board = board.replace("4", "1111")
- board = board.replace("5", "11111")
- board = board.replace("6", "111111")
- board = board.replace("7", "1111111")
- board = board.replace("8", "11111111")
- board = board.replace("9", "111111111")
- return board.split("/")
-
- @staticmethod
- def check_bounds(toY, toX):
- if toY < 0 or toX < 0:
- return False
-
- if toY >= GameBoard.Ny or toX >= GameBoard.Nx:
- return False
-
- return True
-
- @staticmethod
- def validate_move(c, upper=True):
- if (c.isalpha()):
- if (upper == True):
- if (c.islower()):
- return True
- else:
- return False
- else:
- if (c.isupper()):
- return True
- else:
- return False
- else:
- return True
-
- @staticmethod
- def get_legal_moves(state, current_player):
- moves = []
- k_x = None
- k_y = None
-
- K_x = None
- K_y = None
-
- face_to_face = False
-
- board_positions = np.array(GameBoard.board_to_pos_name(state))
- for y in range(board_positions.shape[0]):
- for x in range(len(board_positions[y])):
- if(board_positions[y][x].isalpha()):
- if(board_positions[y][x] == 'r' and current_player == 'b'):
- toY = y
- for toX in range(x - 1, -1, -1):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].isupper()):
- moves.append(m)
- break
-
- moves.append(m)
-
- for toX in range(x + 1, GameBoard.Nx):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].isupper()):
- moves.append(m)
- break
-
- moves.append(m)
-
- toX = x
- for toY in range(y - 1, -1, -1):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].isupper()):
- moves.append(m)
- break
-
- moves.append(m)
-
- for toY in range(y + 1, GameBoard.Ny):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].isupper()):
- moves.append(m)
- break
-
- moves.append(m)
-
- elif(board_positions[y][x] == 'R' and current_player == 'w'):
- toY = y
- for toX in range(x - 1, -1, -1):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].islower()):
- moves.append(m)
- break
-
- moves.append(m)
-
- for toX in range(x + 1, GameBoard.Nx):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].islower()):
- moves.append(m)
- break
-
- moves.append(m)
-
- toX = x
- for toY in range(y - 1, -1, -1):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].islower()):
- moves.append(m)
- break
-
- moves.append(m)
-
- for toY in range(y + 1, GameBoard.Ny):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].islower()):
- moves.append(m)
- break
-
- moves.append(m)
-
- elif ((board_positions[y][x] == 'n' or board_positions[y][x] == 'h') and current_player == 'b'):
- for i in range(-1, 3, 2):
- for j in range(-1, 3, 2):
- toY = y + 2 * i
- toX = x + 1 * j
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=False) and board_positions[toY - i][x].isalpha() == False:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
- toY = y + 1 * i
- toX = x + 2 * j
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=False) and board_positions[y][toX - j].isalpha() == False:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
- elif ((board_positions[y][x] == 'N' or board_positions[y][x] == 'H') and current_player == 'w'):
- for i in range(-1, 3, 2):
- for j in range(-1, 3, 2):
- toY = y + 2 * i
- toX = x + 1 * j
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=True) and board_positions[toY - i][x].isalpha() == False:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
- toY = y + 1 * i
- toX = x + 2 * j
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=True) and board_positions[y][toX - j].isalpha() == False:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
- elif ((board_positions[y][x] == 'b' or board_positions[y][x] == 'e') and current_player == 'b'):
- for i in range(-2, 3, 4):
- toY = y + i
- toX = x + i
-
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
- upper=False) and toY >= 5 and \
- board_positions[y + i // 2][x + i // 2].isalpha() == False:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
- toY = y + i
- toX = x - i
-
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
- upper=False) and toY >= 5 and \
- board_positions[y + i // 2][x - i // 2].isalpha() == False:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
- elif ((board_positions[y][x] == 'B' or board_positions[y][x] == 'E') and current_player == 'w'):
- for i in range(-2, 3, 4):
- toY = y + i
- toX = x + i
-
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
- upper=True) and toY <= 4 and \
- board_positions[y + i // 2][x + i // 2].isalpha() == False:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
- toY = y + i
- toX = x - i
-
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
- upper=True) and toY <= 4 and \
- board_positions[y + i // 2][x - i // 2].isalpha() == False:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
- elif (board_positions[y][x] == 'a' and current_player == 'b'):
- for i in range(-1, 3, 2):
- toY = y + i
- toX = x + i
-
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
- upper=False) and toY >= 7 and toX >= 3 and toX <= 5:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
-
- toY = y + i
- toX = x - i
-
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
- upper=False) and toY >= 7 and toX >= 3 and toX <= 5:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
- elif (board_positions[y][x] == 'A' and current_player == 'w'):
- for i in range(-1, 3, 2):
- toY = y + i
- toX = x + i
-
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
- upper=True) and toY <= 2 and toX >= 3 and toX <= 5:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
-
- toY = y + i
- toX = x - i
-
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
- upper=True) and toY <= 2 and toX >= 3 and toX <= 5:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
- elif (board_positions[y][x] == 'k'):
- k_x = x
- k_y = y
-
- if(current_player == 'b'):
- for i in range(2):
- for sign in range(-1, 2, 2):
- j = 1 - i
- toY = y + i * sign
- toX = x + j * sign
-
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
- upper=False) and toY >= 7 and toX >= 3 and toX <= 5:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
- elif (board_positions[y][x] == 'K'):
- K_x = x
- K_y = y
-
- if(current_player == 'w'):
- for i in range(2):
- for sign in range(-1, 2, 2):
- j = 1 - i
- toY = y + i * sign
- toX = x + j * sign
-
- if GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX],
- upper=True) and toY <= 2 and toX >= 3 and toX <= 5:
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
- elif (board_positions[y][x] == 'c' and current_player == 'b'):
- toY = y
- hits = False
- for toX in range(x - 1, -1, -1):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (hits == False):
- if (board_positions[toY][toX].isalpha()):
- hits = True
- else:
- moves.append(m)
- else:
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].isupper()):
- moves.append(m)
- break
-
- hits = False
- for toX in range(x + 1, GameBoard.Nx):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (hits == False):
- if (board_positions[toY][toX].isalpha()):
- hits = True
- else:
- moves.append(m)
- else:
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].isupper()):
- moves.append(m)
- break
-
- toX = x
- hits = False
- for toY in range(y - 1, -1, -1):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (hits == False):
- if (board_positions[toY][toX].isalpha()):
- hits = True
- else:
- moves.append(m)
- else:
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].isupper()):
- moves.append(m)
- break
-
- hits = False
- for toY in range(y + 1, GameBoard.Ny):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (hits == False):
- if (board_positions[toY][toX].isalpha()):
- hits = True
- else:
- moves.append(m)
- else:
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].isupper()):
- moves.append(m)
- break
- elif (board_positions[y][x] == 'C' and current_player == 'w'):
- toY = y
- hits = False
- for toX in range(x - 1, -1, -1):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (hits == False):
- if (board_positions[toY][toX].isalpha()):
- hits = True
- else:
- moves.append(m)
- else:
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].islower()):
- moves.append(m)
- break
-
- hits = False
- for toX in range(x + 1, GameBoard.Nx):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (hits == False):
- if (board_positions[toY][toX].isalpha()):
- hits = True
- else:
- moves.append(m)
- else:
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].islower()):
- moves.append(m)
- break
-
- toX = x
- hits = False
- for toY in range(y - 1, -1, -1):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (hits == False):
- if (board_positions[toY][toX].isalpha()):
- hits = True
- else:
- moves.append(m)
- else:
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].islower()):
- moves.append(m)
- break
-
- hits = False
- for toY in range(y + 1, GameBoard.Ny):
- m = GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX]
- if (hits == False):
- if (board_positions[toY][toX].isalpha()):
- hits = True
- else:
- moves.append(m)
- else:
- if (board_positions[toY][toX].isalpha()):
- if (board_positions[toY][toX].islower()):
- moves.append(m)
- break
- elif (board_positions[y][x] == 'p' and current_player == 'b'):
- toY = y - 1
- toX = x
-
- if (GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=False)):
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
-
- if y < 5:
- toY = y
- toX = x + 1
- if (GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=False)):
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
-
- toX = x - 1
- if (GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=False)):
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
-
- elif (board_positions[y][x] == 'P' and current_player == 'w'):
- toY = y + 1
- toX = x
-
- if (GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=True)):
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
-
- if y > 4:
- toY = y
- toX = x + 1
- if (GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=True)):
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
-
- toX = x - 1
- if (GameBoard.check_bounds(toY, toX) and GameBoard.validate_move(board_positions[toY][toX], upper=True)):
- moves.append(GameBoard.board_pos_name[y][x] + GameBoard.board_pos_name[toY][toX])
-
- if(K_x != None and k_x != None and K_x == k_x):
- face_to_face = True
- for i in range(K_y + 1, k_y, 1):
- if(board_positions[i][K_x].isalpha()):
- face_to_face = False
-
- if(face_to_face == True):
- if(current_player == 'b'):
- moves.append(GameBoard.board_pos_name[k_y][k_x] + GameBoard.board_pos_name[K_y][K_x])
- else:
- moves.append(GameBoard.board_pos_name[K_y][K_x] + GameBoard.board_pos_name[k_y][k_x])
-
- return moves
-
-def softmax(x):
- # print(x)
- probs = np.exp(x - np.max(x))
- # print(np.sum(probs))
- probs /= np.sum(probs)
- return probs
-
-class cchess_main(object):
-
- def __init__(self, playout=400, in_batch_size=128, exploration = True, in_search_threads = 16, processor = "cpu", num_gpus = 1, res_block_nums = 7, human_color = 'b'):
- self.epochs = 5
- self.playout_counts = playout #400 #800 #1600 200
- self.temperature = 1 #1e-8 1e-3
- # self.c = 1e-4
- self.batch_size = in_batch_size #128 #512
- # self.momentum = 0.9
- self.game_batch = 400 # Evaluation each 400 times
- # self.game_loop = 25000
- self.top_steps = 30
- self.top_temperature = 1 #2
- # self.Dirichlet = 0.3 # P(s,a) = (1 - ϵ)p_a + ϵη_a #self-play chapter in the paper
- self.eta = 0.03
- # self.epsilon = 0.25
- # self.v_resign = 0.05
- # self.c_puct = 5
- self.learning_rate = 0.001 #5e-3 # 0.001
- self.lr_multiplier = 1.0 # adaptively adjust the learning rate based on KL
- self.buffer_size = 10000
- self.data_buffer = deque(maxlen=self.buffer_size)
- self.game_borad = GameBoard()
- self.processor = processor
- # self.current_player = 'w' #“w”表示红方,“b”表示黑方。
- self.policy_value_netowrk = policy_value_network(self.lr_callback, res_block_nums) if processor == 'cpu' else policy_value_network_gpus(num_gpus, res_block_nums)
- self.search_threads = in_search_threads
- self.mcts = MCTS_tree(self.game_borad.state, self.policy_value_netowrk.forward, self.search_threads)
- self.exploration = exploration
- self.resign_threshold = -0.8 #0.05
- self.global_step = 0
- self.kl_targ = 0.025
- self.log_file = open(os.path.join(os.getcwd(), 'log_file.txt'), 'w')
- self.human_color = human_color
-
- @staticmethod
- def flip_policy(prob):
- prob = tf.squeeze(prob) # .flatten()
- return np.asarray([prob[ind] for ind in unflipped_index])
-
- def lr_callback(self):
- return self.learning_rate * self.lr_multiplier
-
- def policy_update(self):
- """update the policy-value net"""
- mini_batch = random.sample(self.data_buffer, self.batch_size)
- #print("training data_buffer len : ", len(self.data_buffer))
- state_batch = [data[0] for data in mini_batch]
- mcts_probs_batch = [data[1] for data in mini_batch]
- winner_batch = [data[2] for data in mini_batch]
-
- winner_batch = np.expand_dims(winner_batch, 1)
-
- start_time = time.time()
- old_probs, old_v = self.mcts.forward(state_batch)
- for i in range(self.epochs):
- # print("tf.executing_eagerly() : ", tf.executing_eagerly())
- state_batch = np.array(state_batch)
- if len(state_batch.shape) == 3:
- sp = state_batch.shape
- state_batch = np.reshape(state_batch, [1, sp[0], sp[1], sp[2]])
- if self.processor == 'cpu':
- accuracy, loss, self.global_step = self.policy_value_netowrk.train_step(state_batch, mcts_probs_batch, winner_batch,
- self.learning_rate * self.lr_multiplier) #
- else:
- # import pickle
- # pickle.dump((state_batch, mcts_probs_batch, winner_batch, self.learning_rate * self.lr_multiplier), open('preprocess.p', 'wb'))
- with self.policy_value_netowrk.strategy.scope():
- train_dataset = tf.data.Dataset.from_tensor_slices((state_batch, mcts_probs_batch, winner_batch)).batch(len(winner_batch)) # , self.learning_rate * self.lr_multiplier
- # .shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
- train_iterator = self.policy_value_netowrk.strategy.make_dataset_iterator(train_dataset)
- train_iterator.initialize()
- accuracy, loss, self.global_step = self.policy_value_netowrk.distributed_train(train_iterator)
-
- new_probs, new_v = self.mcts.forward(state_batch)
- kl_tmp = old_probs * (np.log((old_probs + 1e-10) / (new_probs + 1e-10)))
-
- kl_lst = []
- for line in kl_tmp:
- # print("line.shape", line.shape)
- all_value = [x for x in line if str(x) != 'nan' and str(x)!= 'inf']#除去inf值
- kl_lst.append(np.sum(all_value))
- kl = np.mean(kl_lst)
- # kl = scipy.stats.entropy(old_probs, new_probs)
- # kl = np.mean(np.sum(old_probs * (np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)), axis=1))
-
- if kl > self.kl_targ * 4: # early stopping if D_KL diverges badly
- break
- self.policy_value_netowrk.save(self.global_step)
- print("train using time {} s".format(time.time() - start_time))
-
- # adaptively adjust the learning rate
- if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
- self.lr_multiplier /= 1.5
- elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
- self.lr_multiplier *= 1.5
-
- explained_var_old = 1 - np.var(np.array(winner_batch) - tf.squeeze(old_v)) / np.var(np.array(winner_batch)) # .flatten()
- explained_var_new = 1 - np.var(np.array(winner_batch) - tf.squeeze(new_v)) / np.var(np.array(winner_batch)) # .flatten()
- print(
- "kl:{:.5f},lr_multiplier:{:.3f},loss:{},accuracy:{},explained_var_old:{:.3f},explained_var_new:{:.3f}".format(
- kl, self.lr_multiplier, loss, accuracy, explained_var_old, explained_var_new))
- self.log_file.write("kl:{:.5f},lr_multiplier:{:.3f},loss:{},accuracy:{},explained_var_old:{:.3f},explained_var_new:{:.3f}".format(
- kl, self.lr_multiplier, loss, accuracy, explained_var_old, explained_var_new) + '\n')
- self.log_file.flush()
- # return loss, accuracy
-
- # def policy_evaluate(self, n_games=10):
- # """
- # Evaluate the trained policy by playing games against the pure MCTS player
- # Note: this is only for monitoring the progress of training
- # """
- # # current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn, c_puct=self.c_puct,
- # # n_playout=self.n_playout)
- # # pure_mcts_player = MCTS_Pure(c_puct=5, n_playout=self.pure_mcts_playout_num)
- # win_cnt = defaultdict(int)
- # for i in range(n_games):
- # winner = self.game.start_play(start_player=i % 2) #current_mcts_player, pure_mcts_player,
- # win_cnt[winner] += 1
- # win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
- # print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(self.pure_mcts_playout_num, win_cnt[1], win_cnt[2],
- # win_cnt[-1]))
- # return win_ratio
-
- def run(self):
- #self.game_loop
- batch_iter = 0
- try:
- while(True):
- batch_iter += 1
- play_data, episode_len = self.selfplay()
- print("batch i:{}, episode_len:{}".format(batch_iter, episode_len))
- extend_data = []
- # states_data = []
- for state, mcts_prob, winner in play_data:
- states_data = self.mcts.state_to_positions(state)
- # prob = np.zeros(labels_len)
- # for idx in range(len(mcts_prob[0][0])):
- # prob[label2i[mcts_prob[0][0][idx]]] = mcts_prob[0][1][idx]
- extend_data.append((states_data, mcts_prob, winner))
- self.data_buffer.extend(extend_data)
- if len(self.data_buffer) > self.batch_size:
- self.policy_update()
- # if (batch_iter) % self.game_batch == 0:
- # print("current self-play batch: {}".format(batch_iter))
- # win_ratio = self.policy_evaluate()
- except KeyboardInterrupt:
- self.log_file.close()
- self.policy_value_netowrk.save(self.global_step)
-
- # def get_action(self, state, temperature = 1e-3):
- # # for i in range(self.playout_counts):
- # # state_sim = copy.deepcopy(state)
- # # self.mcts.do_simulation(state_sim, self.game_borad.current_player, self.game_borad.restrict_round)
- #
- # futures = []
- # with ThreadPoolExecutor(max_workers=self.search_threads) as executor:
- # for _ in range(self.playout_counts):
- # state_sim = copy.deepcopy(state)
- # futures.append(executor.submit(self.mcts.do_simulation, state_sim, self.game_borad.current_player, self.game_borad.restrict_round))
- #
- # vals = [f.result() for f in futures]
- #
- # actions_visits = [(act, nod.N) for act, nod in self.mcts.root.child.items()]
- # actions, visits = zip(*actions_visits)
- # probs = softmax(1.0 / temperature * np.log(visits)) #+ 1e-10
- # move_probs = []
- # move_probs.append([actions, probs])
- #
- # if(self.exploration):
- # act = np.random.choice(actions, p=0.75 * probs + 0.25*np.random.dirichlet(0.3*np.ones(len(probs))))
- # else:
- # act = np.random.choice(actions, p=probs)
- #
- # self.mcts.update_tree(act)
- #
- # return act, move_probs
-
- def get_hint(self, mcts_or_net, reverse, disp_mcts_msg_handler):
-
- if mcts_or_net == "mcts":
- if self.mcts.root.child == {}:
- disp_mcts_msg_handler()
- self.mcts.main(self.game_borad.state, self.game_borad.current_player, self.game_borad.restrict_round,
- self.playout_counts)
-
- actions_visits = [(act, nod.N) for act, nod in self.mcts.root.child.items()]
- actions, visits = zip(*actions_visits)
- # print("visits : ", visits)
- # print("np.log(visits) : ", np.log(visits))
- probs = softmax(1.0 / self.temperature * np.log(visits)) # + 1e-10
-
- act_prob_dict = defaultdict(float)
- for i in range(len(actions)):
- if self.human_color == 'w':
- action = "".join(flipped_uci_labels(actions[i]))
- else:
- action = actions[i]
- act_prob_dict[action] = probs[i]
-
- elif mcts_or_net == "net":
- positions = self.mcts.generate_inputs(self.game_borad.state, self.game_borad.current_player)
- positions = np.expand_dims(positions, 0)
- action_probs, value = self.mcts.forward(positions)
-
- if self.mcts.is_black_turn(self.game_borad.current_player):
- action_probs = cchess_main.flip_policy(action_probs)
- moves = GameBoard.get_legal_moves(self.game_borad.state, self.game_borad.current_player)
-
- tot_p = 1e-8
- action_probs = tf.squeeze(action_probs) # .flatten() # .squeeze()
- act_prob_dict = defaultdict(float)
- # print("expand action_probs shape : ", action_probs.shape)
- for action in moves:
- # in_state = GameBoard.sim_do_action(action, self.state)
- mov_p = action_probs[label2i[action]]
- if self.human_color == 'w':
- action = "".join(flipped_uci_labels(action))
- act_prob_dict[action] = mov_p
- # new_node = leaf_node(self, mov_p, in_state)
- # self.child[action] = new_node
- tot_p += mov_p
-
- for a, _ in act_prob_dict.items():
- act_prob_dict[a] /= tot_p
-
- sorted_move_probs = sorted(act_prob_dict.items(), key=lambda item: item[1], reverse=reverse)
- # print(sorted_move_probs)
-
- return sorted_move_probs
-
- #@profile
- def get_action(self, state, temperature = 1e-3):
- # for i in range(self.playout_counts):
- # state_sim = copy.deepcopy(state)
- # self.mcts.do_simulation(state_sim, self.game_borad.current_player, self.game_borad.restrict_round)
-
- self.mcts.main(state, self.game_borad.current_player, self.game_borad.restrict_round, self.playout_counts)
-
- actions_visits = [(act, nod.N) for act, nod in self.mcts.root.child.items()]
- actions, visits = zip(*actions_visits)
- probs = softmax(1.0 / temperature * np.log(visits)) #+ 1e-10
- move_probs = []
- move_probs.append([actions, probs])
-
- if(self.exploration):
- act = np.random.choice(actions, p=0.75 * probs + 0.25*np.random.dirichlet(0.3*np.ones(len(probs))))
- else:
- act = np.random.choice(actions, p=probs)
-
- win_rate = self.mcts.Q(act) # / 2.0 + 0.5
- self.mcts.update_tree(act)
-
- # if position.n < 30: # self.top_steps
- # move = select_weighted_random(position, on_board_move_prob)
- # else:
- # move = select_most_likely(position, on_board_move_prob)
-
- return act, move_probs, win_rate
-
- def get_action_old(self, state, temperature = 1e-3):
- for i in range(self.playout_counts):
- state_sim = copy.deepcopy(state)
- self.mcts.do_simulation(state_sim, self.game_borad.current_player, self.game_borad.restrict_round)
-
- actions_visits = [(act, nod.N) for act, nod in self.mcts.root.child.items()]
- actions, visits = zip(*actions_visits)
- probs = softmax(1.0 / temperature * np.log(visits)) #+ 1e-10
- move_probs = []
- move_probs.append([actions, probs])
-
- if(self.exploration):
- act = np.random.choice(actions, p=0.75 * probs + 0.25*np.random.dirichlet(0.3*np.ones(len(probs))))
- else:
- act = np.random.choice(actions, p=probs)
-
- self.mcts.update_tree(act)
-
- return act, move_probs
-
- def check_end(self):
- if (self.game_borad.state.find('K') == -1 or self.game_borad.state.find('k') == -1):
- if (self.game_borad.state.find('K') == -1):
- print("Green is Winner")
- return True, "b"
- if (self.game_borad.state.find('k') == -1):
- print("Red is Winner")
- return True, "w"
- elif self.game_borad.restrict_round >= 60:
- print("TIE! No Winners!")
- return True, "t"
- else:
- return False, ""
-
- def human_move(self, coord, mcts_or_net):
- win_rate = 0
- x_trans = {0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e', 5: 'f', 6: 'g', 7: 'h', 8: 'i'}
-
- src = coord[0:2]
- dst = coord[2:4]
-
- src_x = (x_trans[src[0]])
- src_y = str(src[1])
-
- dst_x = (x_trans[dst[0]])
- dst_y = str(dst[1])
-
- action = src_x + src_y + dst_x + dst_y
-
- if self.human_color == 'w':
- action = "".join(flipped_uci_labels(action))
-
- if mcts_or_net == "mcts":
- if self.mcts.root.child == {}:
- # self.get_action(self.game_borad.state, self.temperature)
- self.mcts.main(self.game_borad.state, self.game_borad.current_player, self.game_borad.restrict_round,
- self.playout_counts)
- win_rate = self.mcts.Q(action) # / 2.0 + 0.5
- self.mcts.update_tree(action)
-
- last_state = self.game_borad.state
- # print(self.game_borad.current_player, " now take a action : ", action, "[Step {}]".format(self.game_borad.round))
- self.game_borad.state = GameBoard.sim_do_action(action, self.game_borad.state)
- self.game_borad.round += 1
- self.game_borad.current_player = "w" if self.game_borad.current_player == "b" else "b"
- if is_kill_move(last_state, self.game_borad.state) == 0:
- self.game_borad.restrict_round += 1
- else:
- self.game_borad.restrict_round = 0
-
- return win_rate
-
-
- def select_move(self, mcts_or_net):
- if mcts_or_net == "mcts":
- action, probs, win_rate = self.get_action(self.game_borad.state, self.temperature)
- # win_rate = self.mcts.Q(action) / 2.0 + 0.5
- elif mcts_or_net == "net":
- positions = self.mcts.generate_inputs(self.game_borad.state, self.game_borad.current_player)
- positions = np.expand_dims(positions, 0)
- action_probs, value = self.mcts.forward(positions)
- win_rate = value[0, 0] # / 2 + 0.5
- if self.mcts.is_black_turn(self.game_borad.current_player):
- action_probs = cchess_main.flip_policy(action_probs)
- moves = GameBoard.get_legal_moves(self.game_borad.state, self.game_borad.current_player)
-
- tot_p = 1e-8
- action_probs = tf.squeeze(action_probs) # .flatten() # .squeeze()
- act_prob_dict = defaultdict(float)
- # print("expand action_probs shape : ", action_probs.shape)
- for action in moves:
- # in_state = GameBoard.sim_do_action(action, self.state)
- mov_p = action_probs[label2i[action]]
- act_prob_dict[action] = mov_p
- # new_node = leaf_node(self, mov_p, in_state)
- # self.child[action] = new_node
- tot_p += mov_p
-
- for a, _ in act_prob_dict.items():
- act_prob_dict[a] /= tot_p
-
- action = max(act_prob_dict.items(), key=lambda node: node[1])[0]
- # self.mcts.update_tree(action)
-
- print('Win rate for player {} is {:.4f}'.format(self.game_borad.current_player, win_rate))
- last_state = self.game_borad.state
- print(self.game_borad.current_player, " now take a action : ", action, "[Step {}]".format(self.game_borad.round)) # if self.human_color == 'w' else "".join(flipped_uci_labels(action))
- self.game_borad.state = GameBoard.sim_do_action(action, self.game_borad.state)
- self.game_borad.round += 1
- self.game_borad.current_player = "w" if self.game_borad.current_player == "b" else "b"
- if is_kill_move(last_state, self.game_borad.state) == 0:
- self.game_borad.restrict_round += 1
- else:
- self.game_borad.restrict_round = 0
-
- self.game_borad.print_borad(self.game_borad.state)
-
- x_trans = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8}
-
- if self.human_color == 'w':
- action = "".join(flipped_uci_labels(action))
-
- src = action[0:2]
- dst = action[2:4]
-
- src_x = int(x_trans[src[0]])
- src_y = int(src[1])
-
- dst_x = int(x_trans[dst[0]])
- dst_y = int(dst[1])
-
- return (src_x, src_y, dst_x - src_x, dst_y - src_y), win_rate
-
- def selfplay(self):
- self.game_borad.reload()
- # p1, p2 = self.game_borad.players
- states, mcts_probs, current_players = [], [], []
- z = None
- game_over = False
- winnner = ""
- start_time = time.time()
- # self.game_borad.print_borad(self.game_borad.state)
- while(not game_over):
- action, probs, win_rate = self.get_action(self.game_borad.state, self.temperature)
- state, palyer = self.mcts.try_flip(self.game_borad.state, self.game_borad.current_player, self.mcts.is_black_turn(self.game_borad.current_player))
- states.append(state)
- prob = np.zeros(labels_len)
- if self.mcts.is_black_turn(self.game_borad.current_player):
- for idx in range(len(probs[0][0])):
- # probs[0][0][idx] = "".join((str(9 - int(a)) if a.isdigit() else a) for a in probs[0][0][idx])
- act = "".join((str(9 - int(a)) if a.isdigit() else a) for a in probs[0][0][idx])
- # for idx in range(len(mcts_prob[0][0])):
- prob[label2i[act]] = probs[0][1][idx]
- else:
- for idx in range(len(probs[0][0])):
- prob[label2i[probs[0][0][idx]]] = probs[0][1][idx]
- mcts_probs.append(prob)
- # mcts_probs.append(probs)
- current_players.append(self.game_borad.current_player)
-
- last_state = self.game_borad.state
- # print(self.game_borad.current_player, " now take a action : ", action, "[Step {}]".format(self.game_borad.round))
- self.game_borad.state = GameBoard.sim_do_action(action, self.game_borad.state)
- self.game_borad.round += 1
- self.game_borad.current_player = "w" if self.game_borad.current_player == "b" else "b"
- if is_kill_move(last_state, self.game_borad.state) == 0:
- self.game_borad.restrict_round += 1
- else:
- self.game_borad.restrict_round = 0
-
- # self.game_borad.print_borad(self.game_borad.state, action)
-
- if (self.game_borad.state.find('K') == -1 or self.game_borad.state.find('k') == -1):
- z = np.zeros(len(current_players))
- if (self.game_borad.state.find('K') == -1):
- winnner = "b"
- if (self.game_borad.state.find('k') == -1):
- winnner = "w"
- z[np.array(current_players) == winnner] = 1.0
- z[np.array(current_players) != winnner] = -1.0
- game_over = True
- print("Game end. Winner is player : ", winnner, " In {} steps".format(self.game_borad.round - 1))
- elif self.game_borad.restrict_round >= 60:
- z = np.zeros(len(current_players))
- game_over = True
- print("Game end. Tie in {} steps".format(self.game_borad.round - 1))
- # elif(self.mcts.root.v < self.resign_threshold):
- # pass
- # elif(self.mcts.root.Q < self.resign_threshold):
- # pass
- if(game_over):
- # self.mcts.root = leaf_node(None, self.mcts.p_, "RNBAKABNR/9/1C5C1/P1P1P1P1P/9/9/p1p1p1p1p/1c5c1/9/rnbakabnr")#"rnbakabnr/9/1c5c1/p1p1p1p1p/9/9/P1P1P1P1P/1C5C1/9/RNBAKABNR"
- self.mcts.reload()
- print("Using time {} s".format(time.time() - start_time))
- return zip(states, mcts_probs, z), len(z)
-
-if __name__ == '__main__':
- parser = argparse.ArgumentParser()
- parser.add_argument('--mode', default='train', choices=['train', 'play'], type=str, help='train or play')
- parser.add_argument('--ai_count', default=1, choices=[1, 2], type=int, help='choose ai player count')
- parser.add_argument('--ai_function', default='mcts', choices=['mcts', 'net'], type=str, help='mcts or net')
- parser.add_argument('--train_playout', default=400, type=int, help='mcts train playout')
- parser.add_argument('--batch_size', default=512, type=int, help='train batch_size')
- parser.add_argument('--play_playout', default=400, type=int, help='mcts play playout')
- parser.add_argument('--delay', dest='delay', action='store',
- nargs='?', default=3, type=float, required=False,
- help='Set how many seconds you want to delay after each move')
- parser.add_argument('--end_delay', dest='end_delay', action='store',
- nargs='?', default=3, type=float, required=False,
- help='Set how many seconds you want to delay after the end of game')
- parser.add_argument('--search_threads', default=16, type=int, help='search_threads')
- parser.add_argument('--processor', default='cpu', choices=['cpu', 'gpu'], type=str, help='cpu or gpu')
- parser.add_argument('--num_gpus', default=1, type=int, help='gpu counts')
- parser.add_argument('--res_block_nums', default=7, type=int, help='res_block_nums')
- parser.add_argument('--human_color', default='b', choices=['w', 'b'], type=str, help='w or b')
- args = parser.parse_args()
-
- if args.mode == 'train':
- train_main = cchess_main(args.train_playout, args.batch_size, True, args.search_threads, args.processor, args.num_gpus, args.res_block_nums, args.human_color) # * args.num_gpus
- train_main.run()
- elif args.mode == 'play':
- from ChessGame_tf2 import *
- game = ChessGame(args.ai_count, args.ai_function, args.play_playout, args.delay, args.end_delay, args.batch_size,
- args.search_threads, args.processor, args.num_gpus, args.res_block_nums, args.human_color) # * args.num_gpus
- game.start()
diff --git a/models/summaries/eval/events.out.tfevents.1752759798.devbox.7445.1.v2 b/models/summaries/eval/events.out.tfevents.1752759798.devbox.7445.1.v2
new file mode 100644
index 0000000..9e01209
Binary files /dev/null and b/models/summaries/eval/events.out.tfevents.1752759798.devbox.7445.1.v2 differ
diff --git a/models/summaries/eval/events.out.tfevents.1752759842.devbox.7793.1.v2 b/models/summaries/eval/events.out.tfevents.1752759842.devbox.7793.1.v2
new file mode 100644
index 0000000..50943ed
Binary files /dev/null and b/models/summaries/eval/events.out.tfevents.1752759842.devbox.7793.1.v2 differ
diff --git a/models/summaries/eval/events.out.tfevents.1752759874.devbox.8089.1.v2 b/models/summaries/eval/events.out.tfevents.1752759874.devbox.8089.1.v2
new file mode 100644
index 0000000..ef318e9
Binary files /dev/null and b/models/summaries/eval/events.out.tfevents.1752759874.devbox.8089.1.v2 differ
diff --git a/models/summaries/train/events.out.tfevents.1752759798.devbox.7445.0.v2 b/models/summaries/train/events.out.tfevents.1752759798.devbox.7445.0.v2
new file mode 100644
index 0000000..9e01209
Binary files /dev/null and b/models/summaries/train/events.out.tfevents.1752759798.devbox.7445.0.v2 differ
diff --git a/models/summaries/train/events.out.tfevents.1752759842.devbox.7793.0.v2 b/models/summaries/train/events.out.tfevents.1752759842.devbox.7793.0.v2
new file mode 100644
index 0000000..50943ed
Binary files /dev/null and b/models/summaries/train/events.out.tfevents.1752759842.devbox.7793.0.v2 differ
diff --git a/models/summaries/train/events.out.tfevents.1752759874.devbox.8089.0.v2 b/models/summaries/train/events.out.tfevents.1752759874.devbox.8089.0.v2
new file mode 100644
index 0000000..ef318e9
Binary files /dev/null and b/models/summaries/train/events.out.tfevents.1752759874.devbox.8089.0.v2 differ
diff --git a/policy_value_network.py b/policy_value_network.py
index bb6c5f9..f14ea10 100755
--- a/policy_value_network.py
+++ b/policy_value_network.py
@@ -1,215 +1,130 @@
-#coding:utf-8
-import tensorflow as tf
-import numpy as np
-
-import os
-
-
-class policy_value_network(object):
- def __init__(self, res_block_nums = 7):
- # self.ckpt = os.path.join(os.getcwd(), 'models/best_model.ckpt-13999') # TODO
- self.save_dir = "./models"
- self.is_logging = True
-
- """reset TF Graph"""
- tf.reset_default_graph()
- """Creat a new graph for the network"""
- # g = tf.Graph()
-
- self.sess = tf.Session()
- # self.sess = tf.InteractiveSession()
-
- # Variables
- self.filters_size = 128 # or 256
- self.prob_size = 2086
- self.digest = None
- self.training = tf.placeholder(tf.bool, name='training')
- self.inputs_ = tf.placeholder(tf.float32, [None, 9, 10, 14], name='inputs') # + 2 # TODO C plain x 2
- self.c_l2 = 0.0001
- self.momentum = 0.9
- self.global_norm = 100
- self.learning_rate = tf.placeholder(tf.float32, name='learning_rate') #0.001 #5e-3 #0.05 #
- tf.summary.scalar('learning_rate', self.learning_rate)
-
- # First block
- self.pi_ = tf.placeholder(tf.float32, [None, self.prob_size], name='pi')
- self.z_ = tf.placeholder(tf.float32, [None, 1], name='z')
-
- # NWHC format
- # batch, 9 * 10, 14 channels
- # inputs_ = tf.reshape(self.inputs_, [-1, 9, 10, 14])
- # data_format: A string, one of `channels_last` (default) or `channels_first`.
- # The ordering of the dimensions in the inputs.
- # `channels_last` corresponds to inputs with shape `(batch, width, height, channels)`
- # while `channels_first` corresponds to inputs with shape `(batch, channels, width, height)`.
- self.layer = tf.layers.conv2d(self.inputs_, self.filters_size, 3, padding='SAME') # filters 128(or 256)
-
- self.layer = tf.contrib.layers.batch_norm(self.layer, center=False, epsilon=1e-5, fused=True,
- is_training=self.training, activation_fn=tf.nn.relu) # epsilon = 0.25
-
- # residual_block
- with tf.name_scope("residual_block"):
- for _ in range(res_block_nums):
- self.layer = self.residual_block(self.layer)
-
- # policy_head
- with tf.name_scope("policy_head"):
- self.policy_head = tf.layers.conv2d(self.layer, 2, 1, padding='SAME')
- self.policy_head = tf.contrib.layers.batch_norm(self.policy_head, center=False, epsilon=1e-5, fused=True,
- is_training=self.training, activation_fn=tf.nn.relu)
-
- # print(self.policy_head.shape) # (?, 9, 10, 2)
- self.policy_head = tf.reshape(self.policy_head, [-1, 9 * 10 * 2])
- self.policy_head = tf.contrib.layers.fully_connected(self.policy_head, self.prob_size, activation_fn=None)
- # self.prediction = tf.nn.softmax(self.policy_head)
-
- # value_head
- with tf.name_scope("value_head"):
- self.value_head = tf.layers.conv2d(self.layer, 1, 1, padding='SAME')
- self.value_head = tf.contrib.layers.batch_norm(self.value_head, center=False, epsilon=1e-5, fused=True,
- is_training=self.training, activation_fn=tf.nn.relu)
- # print(self.value_head.shape) # (?, 9, 10, 1)
- self.value_head = tf.reshape(self.value_head, [-1, 9 * 10 * 1])
- self.value_head = tf.contrib.layers.fully_connected(self.value_head, 256, activation_fn=tf.nn.relu)
- self.value_head = tf.contrib.layers.fully_connected(self.value_head, 1, activation_fn=tf.nn.tanh)
-
- # loss
- with tf.name_scope("loss"):
- self.policy_loss = tf.nn.softmax_cross_entropy_with_logits(labels=self.pi_, logits=self.policy_head)
- self.policy_loss = tf.reduce_mean(self.policy_loss)
-
- # self.value_loss = tf.squared_difference(self.z_, self.value_head)
- self.value_loss = tf.losses.mean_squared_error(labels=self.z_, predictions=self.value_head)
- self.value_loss = tf.reduce_mean(self.value_loss)
- tf.summary.scalar('mse_loss', self.value_loss)
-
- regularizer = tf.contrib.layers.l2_regularizer(scale=self.c_l2)
- regular_variables = tf.trainable_variables()
- self.l2_loss = tf.contrib.layers.apply_regularization(regularizer, regular_variables)
-
- # self.loss = self.value_loss - self.policy_loss + self.l2_loss
- self.loss = self.value_loss + self.policy_loss + self.l2_loss
- tf.summary.scalar('loss', self.loss)
-
- # train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
- self.global_step = tf.Variable(0, name="global_step", trainable=False)
- # optimizer = tf.train.AdamOptimizer(self.learning_rate)
- # gradients = optimizer.compute_gradients(self.loss)
- # train_op = optimizer.apply_gradients(gradients, global_step=global_step)
-
- # 优化损失
- optimizer = tf.train.MomentumOptimizer(
- learning_rate=self.learning_rate, momentum=self.momentum, use_nesterov=True)
-
- # self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
- # with tf.control_dependencies(self.update_ops):
- # self.train_op = optimizer.minimize(self.loss, global_step=self.global_step)
-
- # Accuracy
- correct_prediction = tf.equal(tf.argmax(self.policy_head, 1), tf.argmax(self.pi_, 1))
- correct_prediction = tf.cast(correct_prediction, tf.float32)
- self.accuracy = tf.reduce_mean(correct_prediction, name='accuracy')
- tf.summary.scalar('move_accuracy', self.accuracy)
-
- # grads = self.average_gradients(tower_grads)
- grads = optimizer.compute_gradients(self.loss)
- # defensive step 2 to clip norm
- clipped_grads, self.norm = tf.clip_by_global_norm(
- [g for g, _ in grads], self.global_norm)
-
- # defensive step 3 check NaN
- # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-gradients-in-tensorflow-when-updating
- grad_check = [tf.check_numerics(g, message='NaN Found!') for g in clipped_grads]
- with tf.control_dependencies(grad_check):
- self.train_op = optimizer.apply_gradients(
- zip(clipped_grads, [v for _, v in grads]),
- global_step=self.global_step, name='train_step')
-
- if self.is_logging:
- for grad, var in grads:
- if grad is not None:
- tf.summary.histogram(var.op.name + '/gradients', grad)
- for var in tf.trainable_variables():
- tf.summary.histogram(var.op.name, var)
-
- self.summaries_op = tf.summary.merge_all()
-
- # Train Summaries
- self.train_writer = tf.summary.FileWriter(
- os.path.join(os.getcwd(), "cchesslogs/train"), self.sess.graph)
-
- # Test summaries
- self.test_writer = tf.summary.FileWriter(
- os.path.join(os.getcwd(), "cchesslogs/test"), self.sess.graph)
-
- self.sess.run(tf.global_variables_initializer())
- # self.sess.run(tf.local_variables_initializer())
- # self.sess.run(tf.initialize_all_variables())
- self.saver = tf.train.Saver()
- self.train_restore()
-
- def residual_block(self, in_layer):
- orig = tf.identity(in_layer)
-
- layer = tf.layers.conv2d(in_layer, self.filters_size, 3, padding='SAME') # filters 128(or 256)
- layer = tf.contrib.layers.batch_norm(layer, center=False, epsilon=1e-5, fused=True,
- is_training=self.training, activation_fn=tf.nn.relu)
-
- layer = tf.layers.conv2d(layer, self.filters_size, 3, padding='SAME') # filters 128(or 256)
- layer = tf.contrib.layers.batch_norm(layer, center=False, epsilon=1e-5, fused=True, is_training=self.training)
- out = tf.nn.relu(tf.add(orig, layer))
-
- return out
-
- def train_restore(self):
- if not os.path.isdir(self.save_dir):
- os.mkdir(self.save_dir)
- checkpoint = tf.train.get_checkpoint_state(self.save_dir)
- if checkpoint and checkpoint.model_checkpoint_path:
- # self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
- self.saver.restore(self.sess, tf.train.latest_checkpoint(self.save_dir))
- print("Successfully loaded:", tf.train.latest_checkpoint(self.save_dir))
- # print("Successfully loaded:", checkpoint.model_checkpoint_path)
- else:
- print("Could not find old network weights")
-
- def restore(self, file):
- print("Restoring from {0}".format(file))
- self.saver.restore(self.sess, file) # self.ckpt
-
- def save(self, in_global_step):
- # save_path = self.saver.save(self.sess, path, global_step=self.global_step)
- save_path = self.saver.save(self.sess, os.path.join(self.save_dir, 'best_model.ckpt'),
- global_step=in_global_step) #self.global_step
- print("Model saved in file: {}".format(save_path))
-
- def train_step(self, positions, probs, winners, learning_rate):
- feed_dict = {
- self.inputs_: positions,
- self.training: True,
- self.learning_rate: learning_rate,
- self.pi_: probs,
- self.z_: winners
- }
-
- _, accuracy, loss, global_step, summary = self.sess.run([self.train_op, self.accuracy, self.loss, self.global_step, self.summaries_op], feed_dict=feed_dict)
- self.train_writer.add_summary(summary, global_step)
- # print(accuracy)
- # print(loss)
- return accuracy, loss, global_step
-
- #@profile
- def forward(self, positions): # , probs, winners
- feed_dict = {
- self.inputs_: positions,
- self.training: False
- }
- # ,
- # self.pi_: probs,
- # self.z_: winners
- action_probs, value = self.sess.run([self.policy_head, self.value_head], feed_dict=feed_dict) # self.prediction
- # print(action_probs.shape)
- # print(value.shape)
-
- return action_probs, value
- # return action_probs, value
\ No newline at end of file
+#coding:utf-8
+import tensorflow as tf
+import numpy as np
+from tensorflow.python.ops import summary_ops_v2
+import os
+
+
+class policy_value_network(object):
+ def __init__(self, learning_rate_fn, res_block_nums = 7):
+ # self.ckpt = os.path.join(os.getcwd(), 'models/best_model.ckpt-13999') # TODO
+ self.save_dir = "./models"
+ self.is_logging = True
+
+ if tf.io.gfile.exists(self.save_dir):
+            # print('Removing existing model dir: {}'.format(self.save_dir))
+            # tf.io.gfile.rmtree(self.save_dir)
+ pass
+ else:
+ tf.io.gfile.makedirs(self.save_dir)
+
+ train_dir = os.path.join(self.save_dir, 'summaries', 'train')
+ test_dir = os.path.join(self.save_dir, 'summaries', 'eval')
+
+ self.train_summary_writer = summary_ops_v2.create_file_writer(train_dir, flush_millis=10000)
+ self.test_summary_writer = summary_ops_v2.create_file_writer(test_dir, flush_millis=10000, name='test')
+
+ # Variables
+ self.filters_size = 128 # or 256
+ self.prob_size = 2086
+ self.digest = None
+
+ self.inputs_ = tf.keras.layers.Input([9, 10, 14], dtype='float32', name='inputs') # TODO C plain x 2
+ self.c_l2 = 0.0001
+ self.momentum = 0.9
+ self.global_norm = 100
+
+ regularizer = tf.keras.regularizers.l2(self.c_l2)
+
+ self.layer = tf.keras.layers.Conv2D(kernel_size=3, filters=self.filters_size, padding='same', kernel_regularizer=regularizer)(self.inputs_)
+ self.layer = tf.keras.layers.BatchNormalization(epsilon=1e-5)(self.layer)
+ self.layer = tf.keras.layers.ReLU()(self.layer)
+
+ # residual_block
+ with tf.name_scope("residual_block"):
+ for _ in range(res_block_nums):
+ self.layer = self.residual_block(self.layer)
+
+ # policy_head
+ with tf.name_scope("policy_head"):
+ self.policy_head = tf.keras.layers.Conv2D(filters=2, kernel_size=1, padding='same', kernel_regularizer=regularizer)(self.layer)
+ self.policy_head = tf.keras.layers.BatchNormalization(epsilon=1e-5)(self.policy_head)
+ self.policy_head = tf.keras.layers.ReLU()(self.policy_head)
+
+ self.policy_head = tf.keras.layers.Reshape([9 * 10 * 2])(self.policy_head)
+ self.policy_head = tf.keras.layers.Dense(self.prob_size, kernel_regularizer=regularizer, name="policy_head_output")(self.policy_head)
+
+ # value_head
+ with tf.name_scope("value_head"):
+ self.value_head = tf.keras.layers.Conv2D(filters=1, kernel_size=1, padding='same', kernel_regularizer=regularizer)(self.layer)
+ self.value_head = tf.keras.layers.BatchNormalization(epsilon=1e-5)(
+ self.value_head)
+ self.value_head = tf.keras.layers.ReLU()(self.value_head)
+
+ self.value_head = tf.keras.layers.Reshape([9 * 10 * 1])(self.value_head)
+ self.value_head = tf.keras.layers.Dense(256, activation='relu', kernel_regularizer=regularizer)(self.value_head)
+ self.value_head = tf.keras.layers.Dense(1, activation='tanh', kernel_regularizer=regularizer, name="value_head_output")(self.value_head)
+
+ self.model = tf.keras.Model(
+ inputs=[self.inputs_],
+ outputs=[self.policy_head, self.value_head])
+
+ self.model.summary()
+
+ self.global_step = tf.Variable(0, name="global_step", trainable=False)
+ # optimizer = tf.train.AdamOptimizer(self.learning_rate)
+
+        # 优化损失 # Optimize the loss
+ self.optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate_fn, momentum=self.momentum)
+
+ self.model.compile(optimizer=self.optimizer,
+ loss={'policy_head_output': tf.keras.losses.CategoricalCrossentropy(from_logits=True),
+ 'value_head_output': tf.keras.losses.MeanSquaredError()},
+ loss_weights={'policy_head_output': 1.0, 'value_head_output': 1.0},
+ metrics={'policy_head_output': 'accuracy'})
+
+
+ self.checkpoint_dir = os.path.join(self.save_dir, 'checkpoints')
+ self.checkpoint_prefix = os.path.join(self.checkpoint_dir, 'ckpt')
+ self.checkpoint = tf.train.Checkpoint(model=self.model, optimizer=self.optimizer)
+
+ # Restore variables on creation if a checkpoint exists.
+ self.checkpoint.restore(tf.train.latest_checkpoint(self.checkpoint_dir))
+
+ def residual_block(self, in_layer):
+ orig = in_layer
+ regularizer = tf.keras.regularizers.l2(self.c_l2)
+ layer = tf.keras.layers.Conv2D(kernel_size=3, filters=self.filters_size, padding='same', kernel_regularizer=regularizer)(in_layer)
+ layer = tf.keras.layers.BatchNormalization(epsilon=1e-5)(layer)
+ layer = tf.keras.layers.ReLU()(layer)
+
+ layer = tf.keras.layers.Conv2D(kernel_size=3, filters=self.filters_size, padding='same', kernel_regularizer=regularizer)(layer)
+ layer = tf.keras.layers.BatchNormalization(epsilon=1e-5)(layer)
+ add_layer = tf.keras.layers.add([orig, layer])
+ out = tf.keras.layers.ReLU()(add_layer)
+
+ return out
+
+    def save(self, in_global_step):  # in_global_step is unused: tf.train.Checkpoint.save keeps its own save counter; kept for interface compatibility
+ self.checkpoint.save(self.checkpoint_prefix)
+
+    def train_step(self, positions, pi, z, learning_rate=0):  # learning_rate is unused: LR comes from the learning_rate_fn schedule passed to the SGD optimizer in __init__
+ with self.train_summary_writer.as_default():
+ history = self.model.fit(positions, {'policy_head_output': pi, 'value_head_output': z}, batch_size=len(positions), verbose=0)
+ loss = history.history['loss'][0]
+ accuracy = history.history['policy_head_output_accuracy'][0]
+ self.global_step.assign_add(1)
+ tf.summary.scalar('loss', loss, step=self.global_step)
+ tf.summary.scalar('accuracy', accuracy, step=self.global_step)
+ return accuracy, loss, self.global_step
+
+ #@profile
+ def forward(self, positions):
+
+ positions=np.array(positions)
+ if len(positions.shape) == 3:
+ sp = positions.shape
+ positions=np.reshape(positions, [1, sp[0], sp[1], sp[2]])
+ action_probs, value = self.model(positions, training=False)
+
+ return action_probs, value
diff --git a/policy_value_network_tf2.py b/policy_value_network_tf2.py
deleted file mode 100755
index ca907ef..0000000
--- a/policy_value_network_tf2.py
+++ /dev/null
@@ -1,241 +0,0 @@
-#coding:utf-8
-import tensorflow as tf
-import numpy as np
-from tensorflow.python.ops import summary_ops_v2
-import os
-
-
-class policy_value_network(object):
- def __init__(self, learning_rate_fn, res_block_nums = 7):
- # self.ckpt = os.path.join(os.getcwd(), 'models/best_model.ckpt-13999') # TODO
- self.save_dir = "./models"
- self.is_logging = True
-
- if tf.io.gfile.exists(self.save_dir):
- # print('Removing existing model dir: {}'.format(MODEL_DIR))
- # tf.io.gfile.rmtree(MODEL_DIR)
- pass
- else:
- tf.io.gfile.makedirs(self.save_dir)
-
- train_dir = os.path.join(self.save_dir, 'summaries', 'train')
- test_dir = os.path.join(self.save_dir, 'summaries', 'eval')
-
- self.train_summary_writer = summary_ops_v2.create_file_writer(train_dir, flush_millis=10000)
- self.test_summary_writer = summary_ops_v2.create_file_writer(test_dir, flush_millis=10000, name='test')
-
- # Variables
- self.filters_size = 128 # or 256
- self.prob_size = 2086
- self.digest = None
-
- self.inputs_ = tf.keras.layers.Input([9, 10, 14], dtype='float32', name='inputs') # TODO C plain x 2
- self.c_l2 = 0.0001
- self.momentum = 0.9
- self.global_norm = 100
-
- self.layer = tf.keras.layers.Conv2D(kernel_size=3, filters=self.filters_size, padding='same')(self.inputs_)
- self.layer = tf.keras.layers.BatchNormalization(epsilon=1e-5, fused=True)(self.layer)
- self.layer = tf.keras.layers.ReLU()(self.layer)
-
- # residual_block
- with tf.name_scope("residual_block"):
- for _ in range(res_block_nums):
- self.layer = self.residual_block(self.layer)
-
- # policy_head
- with tf.name_scope("policy_head"):
- self.policy_head = tf.keras.layers.Conv2D(filters=2, kernel_size=1, padding='same')(self.layer)
- self.policy_head = tf.keras.layers.BatchNormalization(epsilon=1e-5, fused=True)(self.policy_head)
- self.policy_head = tf.keras.layers.ReLU()(self.policy_head)
-
- self.policy_head = tf.keras.layers.Reshape([9 * 10 * 2])(self.policy_head)
- self.policy_head = tf.keras.layers.Dense(self.prob_size)(self.policy_head)
-
- # value_head
- with tf.name_scope("value_head"):
- self.value_head = tf.keras.layers.Conv2D(filters=1, kernel_size=1, padding='same')(self.layer)
- self.value_head = tf.keras.layers.BatchNormalization(epsilon=1e-5, fused=True)(
- self.value_head)
- self.value_head = tf.keras.layers.ReLU()(self.value_head)
-
- self.value_head = tf.keras.layers.Reshape([9 * 10 * 1])(self.value_head)
- self.value_head = tf.keras.layers.Dense(256, activation='relu')(self.value_head)
- self.value_head = tf.keras.layers.Dense(1, activation='tanh')(self.value_head)
-
- self.model = tf.keras.Model(
- inputs=[self.inputs_],
- outputs=[self.policy_head, self.value_head])
-
- self.model.summary()
-
- self.global_step = tf.Variable(0, name="global_step", trainable=False)
- # optimizer = tf.train.AdamOptimizer(self.learning_rate)
-
- # 优化损失
- self.optimizer = tf.compat.v1.train.MomentumOptimizer(
- learning_rate=learning_rate_fn, momentum=self.momentum, use_nesterov=True)
-
- self.CategoricalCrossentropyLoss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
- self.MSE = tf.keras.losses.MeanSquaredError()
- self.ComputeMetrics = tf.keras.metrics.MeanAbsoluteError()
-
-
- # self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
- # with tf.control_dependencies(self.update_ops):
- # self.train_op = optimizer.minimize(self.loss, global_step=self.global_step)
-
- self.checkpoint_dir = os.path.join(self.save_dir, 'checkpoints')
- self.checkpoint_prefix = os.path.join(self.checkpoint_dir, 'ckpt')
- self.checkpoint = tf.train.Checkpoint(model=self.model, optimizer=self.optimizer)
-
- # Restore variables on creation if a checkpoint exists.
- self.checkpoint.restore(tf.train.latest_checkpoint(self.checkpoint_dir))
-
- def residual_block(self, in_layer):
- orig = tf.convert_to_tensor(in_layer) # tf.identity(in_layer)
- layer = tf.keras.layers.Conv2D(kernel_size=3, filters=self.filters_size, padding='same')(in_layer)
- layer = tf.keras.layers.BatchNormalization(epsilon=1e-5, fused=True)(layer)
- layer = tf.keras.layers.ReLU()(layer)
-
- layer = tf.keras.layers.Conv2D(kernel_size=3, filters=self.filters_size, padding='same')(layer)
- layer = tf.keras.layers.BatchNormalization(epsilon=1e-5, fused=True)(layer)
- add_layer = tf.keras.layers.add([orig, layer])
- out = tf.keras.layers.ReLU()(add_layer)
-
- return out
-
- # def train_restore(self):
- # if not os.path.isdir(self.save_dir):
- # os.mkdir(self.save_dir)
- # checkpoint = tf.train.get_checkpoint_state(self.save_dir)
- # if checkpoint and checkpoint.model_checkpoint_path:
- # # self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
- # self.saver.restore(self.sess, tf.train.latest_checkpoint(self.save_dir))
- # print("Successfully loaded:", tf.train.latest_checkpoint(self.save_dir))
- # # print("Successfully loaded:", checkpoint.model_checkpoint_path)
- # else:
- # print("Could not find old network weights")
-
- # def restore(self, file):
- # print("Restoring from {0}".format(file))
- # self.saver.restore(self.sess, file) # self.ckpt
-
- def save(self, in_global_step):
- self.checkpoint.save(self.checkpoint_prefix)
- # print("Model saved in file: {}".format(save_path))
-
- def compute_metrics(self, pi_, policy_head):
- # Accuracy
- correct_prediction = tf.equal(tf.argmax(input=policy_head, axis=1), tf.argmax(input=pi_, axis=1))
- correct_prediction = tf.cast(correct_prediction, tf.float32)
- accuracy = tf.reduce_mean(input_tensor=correct_prediction, name='accuracy')
-
- summary_ops_v2.scalar('move_accuracy', accuracy)
- return accuracy
-
- def apply_regularization(self, regularizer, weights_list=None):
- """Returns the summed penalty by applying `regularizer` to the `weights_list`.
- Adding a regularization penalty over the layer weights and embedding weights
- can help prevent overfitting the training data. Regularization over layer
- biases is less common/useful, but assuming proper data preprocessing/mean
- subtraction, it usually shouldn't hurt much either.
- Args:
- regularizer: A function that takes a single `Tensor` argument and returns
- a scalar `Tensor` output.
- weights_list: List of weights `Tensors` or `Variables` to apply
- `regularizer` over. Defaults to the `GraphKeys.WEIGHTS` collection if
- `None`.
- Returns:
- A scalar representing the overall regularization penalty.
- Raises:
- ValueError: If `regularizer` does not return a scalar output, or if we find
- no weights.
- """
- # if not weights_list:
- # weights_list = ops.get_collection(ops.GraphKeys.WEIGHTS)
- if not weights_list:
- raise ValueError('No weights to regularize.')
- with tf.name_scope('get_regularization_penalty',
- values=weights_list) as scope:
- penalties = [regularizer(w) for w in weights_list]
- penalties = [
- p if p is not None else tf.constant(0.0) for p in penalties
- ]
- for p in penalties:
- if p.get_shape().ndims != 0:
- raise ValueError('regularizer must return a scalar Tensor instead of a '
- 'Tensor with rank %d.' % p.get_shape().ndims)
-
- summed_penalty = tf.add_n(penalties, name=scope)
- # ops.add_to_collection(ops.GraphKeys.REGULARIZATION_LOSSES, summed_penalty)
- return summed_penalty
-
- def compute_loss(self, pi_, z_, policy_head, value_head):
-
- # loss
- with tf.name_scope("loss"):
- policy_loss = tf.keras.losses.categorical_crossentropy(y_true=pi_, y_pred=policy_head, from_logits=True)
- policy_loss = tf.reduce_mean(policy_loss)
-
- value_loss = tf.keras.losses.mean_squared_error(z_, value_head)
- value_loss = tf.reduce_mean(value_loss)
- summary_ops_v2.scalar('mse_loss', value_loss)
-
- regularizer = tf.keras.regularizers.l2(self.c_l2)
- regular_variables = self.model.trainable_variables
- l2_loss = self.apply_regularization(regularizer, regular_variables)
-
- # self.loss = value_loss - policy_loss + l2_loss
- self.loss = value_loss + policy_loss + l2_loss
- summary_ops_v2.scalar('loss', self.loss)
-
- return self.loss
-
- @tf.function
- def train_step(self, positions, pi, z, learning_rate=0):
- # Record the operations used to compute the loss, so that the gradient
- # of the loss with respect to the variables can be computed.
- # metrics = 0
-
- with tf.GradientTape() as tape:
- policy_head, value_head = self.model(positions, training=True)
- loss = self.compute_loss(pi, z, policy_head, value_head)
- # self.ComputeMetrics(y, logits)
- metrics = self.compute_metrics(pi, policy_head)
- grads = tape.gradient(loss, self.model.trainable_variables)
-
- # grads = self.average_gradients(tower_grads)
- # grads = self.optimizer.compute_gradients(self.loss)
- # defensive step 2 to clip norm
- # grads0_lst = tf.map_fn(lambda x: x[0], grads) # [g for g, _ in grads]
- clipped_grads, self.norm = tf.clip_by_global_norm(grads, self.global_norm)
-
- # defensive step 3 check NaN
- # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-gradients-in-tensorflow-when-updating
- grad_check = [tf.debugging.check_numerics(g, message='NaN Found!') for g in clipped_grads]
- with tf.control_dependencies(grad_check):
- self.optimizer.apply_gradients(
- zip(clipped_grads, self.model.trainable_variables), # [v for _, v in grads]
- global_step=self.global_step, name='train_step')
-
- if self.is_logging:
- for grad, var in zip(grads, self.model.trainable_variables):
- if grad is not None:
- summary_ops_v2.histogram(var.name + '/gradients', grad)
- for var in self.model.trainable_variables:
- summary_ops_v2.histogram(var.name, var)
-
- return metrics, loss, self.global_step
-
- #@profile
- def forward(self, positions):
-
- positions=np.array(positions)
- if len(positions.shape) == 3:
- sp = positions.shape
- positions=np.reshape(positions, [1, sp[0], sp[1], sp[2]])
- action_probs, value = self.model(positions, training=False)
-
- return action_probs, value
-