"""Blackjack with on-policy Monte Carlo control (epsilon-soft policy).

Reconstructed from a whitespace-collapsed ``git format-patch`` of commit
aeee0559 ("implementation of black jack game with monte carlo epsilon soft
on policy control", Patrick Zhang, 2022-11-10).  The patch added three
files, which appear here as sections of a single module:

* ``black_jack_env.py``   -> ``CARDS``, ``BlackJackEnv``,
  ``calc_hand_value``, ``calc_hand_value_usable_ace``
* ``black_jack_agent.py`` -> ``BlackJackAgent``, ``get_state_idx``
* ``Chapter5.ipynb``      -> the training cell, now ``train_mc_control``

The notebook's remaining imports (matplotlib, seaborn, tqdm, scipy, random,
itertools) and its ``%autoreload`` magics were not used by the training cell
and are omitted.
"""

import numpy as np

# --------------------------------------------------------------------------
# Environment (black_jack_env.py)
# --------------------------------------------------------------------------

# Ranks drawn uniformly with replacement.  The value 10 appears four times
# (presumably 10/J/Q/K).  ``Generator.choice`` coerces this mixed list to a
# NumPy string array, so drawn cards are strings such as "7" or "ACE"; the
# value helpers below recover numbers with ``int()``.
CARDS = ["ACE", 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10]

# Action codes used by the agent and the training loop.
HIT = 0
STICK = 1
NUM_ACTIONS = 2

# 2 (usable ace?) x 10 (player sum 12..21) x 10 (dealer card ACE..10).
NUM_STATES = 200


class BlackJackEnv:
    """One-player blackjack against a dealer, cards drawn with replacement.

    Results are reported from the player's point of view: ``1`` win,
    ``0`` draw, ``-1`` loss, ``None`` while the game is still running.
    """

    def __init__(self, seed=None):
        """Create the environment.

        Args:
            seed: Anything accepted by ``numpy.random.default_rng``.
        """
        self.rng = np.random.default_rng(seed=seed)
        self.dealer_hand = []
        self.player_hand = []

    def deal_cards(self):
        """Start a new game by dealing two cards to dealer and player.

        A two-card 21 (a natural) ends the game at once.  Otherwise the
        player is dealt further cards until the hand is worth at least 12,
        so the returned hand may hold more than two cards.

        Returns:
            ``(result, player_hand, dealer_card)``.  On a player natural the
            result is ``1`` (or ``0`` if the dealer's two cards are also
            worth 21) and the other two entries are ``None``.  Otherwise
            ``result`` is ``None``, ``player_hand`` is the live hand list
            (later mutated by ``hit``) and ``dealer_card`` is the dealer's
            first card.
        """
        self.dealer_hand = self.rng.choice(CARDS, 2).tolist()
        self.player_hand = self.rng.choice(CARDS, 2).tolist()

        # Only the initial two cards can form a natural.  Checking after
        # the forced hits below would wrongly turn a drawn-to 21 into an
        # automatic win without letting the dealer play.
        if calc_hand_value(self.player_hand) == 21:
            dealer_natural = calc_hand_value(self.dealer_hand) == 21
            return (0 if dealer_natural else 1), None, None

        # Below 12 a hit can never bust, so those hits are taken here and
        # such sums never reach the agent as states.
        while calc_hand_value(self.player_hand) < 12:
            self.player_hand.append(str(self.rng.choice(CARDS)))

        return None, self.player_hand, self.dealer_hand[0]

    def hit(self):
        """Deal one more card to the player.

        Returns:
            ``(-1, None)`` if the player goes over 21, otherwise
            ``(None, player_hand)``.
        """
        self.player_hand.append(str(self.rng.choice(CARDS)))
        if calc_hand_value(self.player_hand) > 21:
            return -1, None
        return None, self.player_hand

    def stick(self):
        """Let the dealer draw until reaching at least 17, then score.

        Returns:
            ``1`` if the dealer busts or ends below the player, ``0`` on
            equal values, ``-1`` otherwise.
        """
        while calc_hand_value(self.dealer_hand) < 17:
            self.dealer_hand.append(str(self.rng.choice(CARDS)))

        player_val = calc_hand_value(self.player_hand)
        dealer_val = calc_hand_value(self.dealer_hand)
        if dealer_val > 21 or dealer_val < player_val:
            return 1
        if dealer_val == player_val:
            return 0
        return -1


def calc_hand_value_usable_ace(hand):
    """Return ``(value, usable_aces)`` for a hand.

    Every ace starts at 11; while the total exceeds 21, one ace at a time
    is downgraded to 1.  ``usable_aces`` is the number of aces still
    counted as 11 when that process stops.

    Args:
        hand: Sequence of cards, each ``"ACE"`` or something ``int()`` can
            convert (e.g. ``"7"`` or ``7``).
    """
    num_aces = sum(1 for card in hand if card == "ACE")
    total = 11 * num_aces + sum(int(card) for card in hand if card != "ACE")
    while total > 21 and num_aces > 0:
        total -= 10
        num_aces -= 1
    return total, num_aces


def calc_hand_value(hand):
    """Return the hand's value, counting aces as 11 where that avoids a bust."""
    value, _ = calc_hand_value_usable_ace(hand)
    return value


# --------------------------------------------------------------------------
# Agent (black_jack_agent.py)
# --------------------------------------------------------------------------

class BlackJackAgent:
    """Samples actions from a tabular stochastic policy.

    The agent itself does no learning; it holds a reference to ``policy``,
    so in-place updates made by the training loop take effect immediately.
    """

    def __init__(self, policy, seed=42):
        """Store the policy table and create the sampling RNG.

        Args:
            policy: Array of shape ``(num_actions, num_states)`` holding
                ``p(a | s)``; each column must sum to 1.
            seed: Anything accepted by ``numpy.random.default_rng``.
        """
        self.rng = np.random.default_rng(seed)
        self.policy = policy

    def take_action(self, state_idx):
        """Sample an action for ``state_idx``: 0 for hit, 1 for stick."""
        return int(self.rng.choice([HIT, STICK], p=self.policy[:, state_idx]))


def get_state_idx(has_usable_ace, player_value, dealer_card):
    """Flatten a blackjack state into an index in ``range(200)``.

    The index is ``usable * 100 + (player_value - 12) * 10 + dealer_idx``
    where ``usable`` is 0 or 1 and ``dealer_idx`` is 0 for an ace and
    ``value - 1`` for the cards 2..10.

    Args:
        has_usable_ace: Whether the player holds an ace counted as 11.
        player_value: Player's hand value, 12..21.
        dealer_card: Dealer's visible card, ``"ACE"`` or 2..10 (int or
            numeric string).

    Raises:
        ValueError: If ``player_value`` or ``dealer_card`` is out of range;
            such inputs would otherwise silently map onto another state.
    """
    if not 12 <= player_value <= 21:
        raise ValueError(f"player_value must be in 12..21, got {player_value}")
    dealer_idx = 0 if dealer_card == "ACE" else int(dealer_card) - 1
    if not 0 <= dealer_idx <= 9:
        raise ValueError(f"dealer_card must be 'ACE' or 2..10, got {dealer_card}")

    usable_idx = 1 if has_usable_ace else 0
    player_idx = player_value - 12
    return usable_idx * 100 + player_idx * 10 + dealer_idx


# --------------------------------------------------------------------------
# Training loop (Chapter5.ipynb)
# --------------------------------------------------------------------------

def train_mc_control(num_episodes=1000, epsilon=0.05, seed=None):
    """Run on-policy Monte Carlo control with an epsilon-soft policy.

    Each episode is one game.  The only reward is the final result, so
    every (action, state) pair visited in the episode receives that result
    as its return.  Action values are running means over the visits of each
    pair, and after every update the policy for that state is made
    epsilon-greedy with respect to ``q``.

    Args:
        num_episodes: Number of games to play.
        epsilon: Exploration mass spread evenly over both actions.
        seed: Seed for reproducible runs; ``None`` uses fresh entropy.

    Returns:
        ``(policy, q, visit_counts)``, each of shape ``(2, 200)``.
    """
    shape = (NUM_ACTIONS, NUM_STATES)
    policy = np.full(shape, 1.0 / NUM_ACTIONS)
    q = np.zeros(shape)
    visit_counts = np.zeros(shape, dtype=int)

    # The environment and agent are created once, with independent random
    # streams.  Re-creating the agent per episode with its default seed
    # would replay the same action-sampling randomness in every episode.
    env_seed, agent_seed = np.random.SeedSequence(seed).spawn(2)
    env = BlackJackEnv(seed=env_seed)
    agent = BlackJackAgent(policy, seed=agent_seed)

    for _ in range(num_episodes):
        result, player_hand, dealer_card = env.deal_cards()
        visited = []
        while result is None:
            player_value, num_aces = calc_hand_value_usable_ace(player_hand)
            state_idx = get_state_idx(num_aces > 0, player_value, dealer_card)
            action = agent.take_action(state_idx)
            visited.append((action, state_idx))
            if action == HIT:
                result, player_hand = env.hit()
            else:
                result = env.stick()

        for action, state_idx in visited:
            # Incremental mean over this pair's own visits.  Dividing by
            # the episode number instead would shrink every return seen
            # for the first time late in training.
            visit_counts[action, state_idx] += 1
            step = 1.0 / visit_counts[action, state_idx]
            q[action, state_idx] += step * (result - q[action, state_idx])

            a_star = np.argmax(q[:, state_idx])
            policy[:, state_idx] = epsilon / NUM_ACTIONS
            policy[a_star, state_idx] = 1 - epsilon + epsilon / NUM_ACTIONS

    return policy, q, visit_counts


if __name__ == "__main__":
    policy, q, _ = train_mc_control(num_episodes=1000, epsilon=0.05)