Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 105 additions & 0 deletions Chapter5.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
}
],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import random\n",
"import seaborn as sns\n",
"\n",
"sns.set_style('ticks')\n",
"\n",
"from tqdm import tqdm\n",
"from itertools import (count, product)\n",
"from scipy.stats import poisson\n",
"from black_jack_env import BlackJackEnv, calc_hand_value_usable_ace\n",
"from black_jack_agent import BlackJackAgent, get_state_idx"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
"## Training Loop\n",
"\n",
"epsilon = 0.05\n",
"#initialization\n",
"policy = np.ones((2, 200)) * 0.5\n",
"q = np.zeros((2, 200))\n",
"\n",
"# loop\n",
"for ep_idx in range(1000):\n",
" agent = BlackJackAgent(policy)\n",
" env = BlackJackEnv(seed=None)\n",
" result, player_hand, dealer_card = env.deal_cards()\n",
" state_idxs = []\n",
" actions = []\n",
" while result is None:\n",
" player_value, num_aces = calc_hand_value_usable_ace(player_hand)\n",
" has_usable_ace = num_aces > 0\n",
" state_idx = get_state_idx(has_usable_ace, player_value, dealer_card)\n",
" action = agent.take_action(state_idx)\n",
" state_idxs.append(state_idx)\n",
" actions.append(action)\n",
" if action == 0:\n",
" # this is hit\n",
" result, player_hand = env.hit()\n",
" else:\n",
" result = env.stick()\n",
" for state_idx, action in zip(state_idxs, actions):\n",
" q_new = (q[action, state_idx] * ep_idx + result) / (ep_idx + 1)\n",
" q[action, state_idx] = q_new\n",
" a_star = np.argmax(q[:, state_idx])\n",
" policy[:, state_idx] = epsilon / 2\n",
" policy[a_star, state_idx] = 1 - epsilon + epsilon / 2\n",
" \n",
" "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.10 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
37 changes: 37 additions & 0 deletions black_jack_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import numpy as np

class BlackJackAgent:
    """
    Agent that samples blackjack actions from a stochastic policy table.

    Intended for on-policy first-visit Monte Carlo control with an
    epsilon-soft policy; the agent itself only draws actions, it does not
    update the policy.
    """

    def __init__(self, policy, seed=42):
        """
        Store the policy table and create the agent's random generator.

        policy: array indexed as policy[action, state_idx], holding
            p(action | state). Each column is used as the probability
            vector over the two actions, so it must sum to 1.
        seed: seed forwarded to np.random.default_rng.

        State indices are the flat indices produced by get_state_idx:
            usable_ace * 100 + (player_value - 12) * 10 + dealer_idx
        where dealer_idx is 0 for an ace and card value - 1 otherwise.
        """
        self.policy = policy
        self.rng = np.random.default_rng(seed)

    def take_action(self, state_idx):
        """
        Sample an action for the given state: 0 for hit, 1 for stick.
        """
        action_probs = self.policy[:, state_idx]
        return self.rng.choice([0, 1], p=action_probs)


def get_state_idx(has_usable_ace, player_value, dealer_card):
    """
    Flatten (has_usable_ace, player_value, dealer_card) into one integer.

    Layout: usable-ace flag * 100 + (player_value - 12) * 10 + dealer index,
    where the dealer index is 0 for "ACE" and int(dealer_card) - 1 otherwise.
    dealer_card may be an int or a numeric string.
    """
    ace_offset = 100 if has_usable_ace else 0
    # Player totals are shifted so that a total of 12 maps to slot 0.
    player_offset = (player_value - 12) * 10
    if dealer_card == "ACE":
        dealer_offset = 0
    else:
        dealer_offset = int(dealer_card) - 1
    return ace_offset + player_offset + dealer_offset
80 changes: 80 additions & 0 deletions black_jack_env.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import numpy as np

CARDS = ["ACE", 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10]

class BlackJackEnv:
    """
    Single-player blackjack environment drawing cards with replacement.

    Results returned by the methods use: 1 = player wins, 0 = draw,
    -1 = player loses, None = the game is still in progress.
    """

    def __init__(self, seed=None):
        """
        Create the environment.

        seed: seed forwarded to np.random.default_rng; None gives a
            non-deterministic generator.
        """
        self.rng = np.random.default_rng(seed=seed)
        self.dealer_hand = []
        self.player_hand = []

    def deal_cards(self):
        """
        Deal 2 cards to both player and dealer and start a new game.

        If the player's two dealt cards total 21 (a natural), the game ends
        immediately: a draw (0) when the dealer's two cards also total 21,
        otherwise a player win (1).

        Otherwise the player is automatically dealt further cards until the
        hand is worth at least 12, since below that hitting can never bust.

        Returns (result, player_hand, dealer_face_up_card); the last two are
        None when the game ended on the deal.
        """
        self.dealer_hand = self.rng.choice(CARDS, 2).tolist()
        self.player_hand = self.rng.choice(CARDS, 2).tolist()

        # Check for a natural on the two dealt cards only. A 21 reached by
        # the automatic draws below is an ordinary 21: the player still has
        # to act and the dealer still gets to play.
        if calc_hand_value(self.player_hand) == 21:
            if calc_hand_value(self.dealer_hand) == 21:
                return 0, None, None
            return 1, None, None

        while calc_hand_value(self.player_hand) < 12:
            self.player_hand.append(self.rng.choice(CARDS))

        return None, self.player_hand, self.dealer_hand[0]

    def hit(self):
        """
        Deal 1 card to the player.

        Returns (-1, None) when the player goes over 21, otherwise
        (None, player_hand).
        """
        self.player_hand.append(self.rng.choice(CARDS))
        if calc_hand_value(self.player_hand) > 21:
            return -1, None
        return None, self.player_hand

    def stick(self):
        """
        End the player's turn and play out the dealer's hand.

        The dealer keeps drawing until the hand is worth at least 17, then
        the hands are compared. Returns 1 if the dealer busts or has the
        lower total, 0 on equal totals, -1 otherwise.
        """
        while calc_hand_value(self.dealer_hand) < 17:
            self.dealer_hand.append(self.rng.choice(CARDS))

        player_val = calc_hand_value(self.player_hand)
        dealer_val = calc_hand_value(self.dealer_hand)
        if dealer_val > 21 or dealer_val < player_val:
            return 1
        if dealer_val == player_val:
            return 0
        return -1


def calc_hand_value(hand):
    """
    Return the best blackjack total for a hand.

    hand: list of cards, each either "ACE" or a value accepted by int().
    Every ace starts at 11; while the total is over 21, aces are counted
    as 1 instead, one at a time.
    """
    card_points = np.array([11 if card == "ACE" else int(card) for card in hand])
    total = card_points.sum()
    soft_aces = hand.count("ACE")
    # Each step turns one 11-valued ace into a 1.
    while soft_aces > 0 and total > 21:
        total -= 10
        soft_aces -= 1
    return total

def calc_hand_value_usable_ace(hand):
    """
    Return (total, usable_aces) for a hand.

    hand: list of cards, each either "ACE" or a value accepted by int().
    total is the hand value after counting aces as 1 where needed to get
    to 21 or below; usable_aces is how many aces are still counted as 11.
    """
    card_points = np.array([11 if card == "ACE" else int(card) for card in hand])
    total = card_points.sum()
    usable_aces = hand.count("ACE")
    # Each step turns one 11-valued ace into a 1, so it stops being usable.
    while usable_aces > 0 and total > 21:
        total -= 10
        usable_aces -= 1
    return total, usable_aces