walkerlab · pqz317 · Nov 10, 2022
diff --git a/Chapter5.ipynb b/Chapter5.ipynb
@@ -0,0 +1,105 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 76,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The autoreload extension is already loaded. To reload it, use:\n",
+      "  %reload_ext autoreload\n"
+     ]
+    }
+   ],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import random\n",
+    "import seaborn as sns\n",
+    "\n",
+    "sns.set_style('ticks')\n",
+    "\n",
+    "from tqdm import tqdm\n",
+    "from itertools import (count, product)\n",
+    "from scipy.stats import poisson\n",
+    "from black_jack_env import BlackJackEnv, calc_hand_value_usable_ace\n",
+    "from black_jack_agent import BlackJackAgent, get_state_idx"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 98,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Training Loop\n",
+    "\n",
+    "epsilon = 0.05\n",
+    "#initialization\n",
+    "policy = np.ones((2, 200)) * 0.5\n",
+    "q = np.zeros((2, 200))\n",
+    "\n",
+    "# loop\n",
+    "for ep_idx in range(1000):\n",
+    "    agent = BlackJackAgent(policy)\n",
+    "    env = BlackJackEnv(seed=None)\n",
+    "    result, player_hand, dealer_card = env.deal_cards()\n",
+    "    state_idxs = []\n",
+    "    actions = []\n",
+    "    while result is None:\n",
+    "        player_value, num_aces = calc_hand_value_usable_ace(player_hand)\n",
+    "        has_usable_ace = num_aces > 0\n",
+    "        state_idx = get_state_idx(has_usable_ace, player_value, dealer_card)\n",
+    "        action = agent.take_action(state_idx)\n",
+    "        state_idxs.append(state_idx)\n",
+    "        actions.append(action)\n",
+    "        if action == 0:\n",
+    "            # this is hit\n",
+    "            result, player_hand = env.hit()\n",
+    "        else:\n",
+    "            result = env.stick()\n",
+    "    for state_idx, action in zip(state_idxs, actions):\n",
+    "        q_new = (q[action, state_idx] * ep_idx + result) / (ep_idx + 1)\n",
+    "        q[action, state_idx] = q_new\n",
+    "        a_star = np.argmax(q[:, state_idx])\n",
+    "        policy[:, state_idx] = epsilon / 2\n",
+    "        policy[a_star, state_idx] = 1 - epsilon + epsilon / 2\n",
+    "    \n",
+    "    "
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.8.10 64-bit",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/black_jack_agent.py b/black_jack_agent.py
@@ -0,0 +1,37 @@
+import numpy as np
+
class BlackJackAgent:
    """
    Agent that samples blackjack actions from a tabular stochastic policy.

    The class only draws actions from the policy it is given; it does not
    update the policy itself.  Any learning rule (for example on-policy
    Monte Carlo control with an epsilon-soft policy) has to be applied by the
    caller to the policy array.
    """

    def __init__(self, policy, seed=42):
        """
        Args:
            policy: array-like of shape (num_actions, num_states) holding
                p(a | s); every column must sum to 1.  A NumPy array is
                stored by reference (not copied), so in-place updates made
                by the caller are seen by later take_action calls.
            seed: seed for the agent's private random generator.  NOTE: the
                default is a fixed seed, so every agent created with the
                default replays the same random sequence.  Pass seed=None
                for fresh entropy, or reuse a single agent across episodes.

        State index layout (see get_state_idx):
            usable_ace * 100 + (player_value - 12) * 10 + dealer_idx
            - usable_ace = 0 or 1
            - player_value - 12 = 0 - 9 (player totals 12 - 21)
            - dealer_idx = 0 - 9 (ace is 0, a card showing n is n - 1)
        """
        self.rng = np.random.default_rng(seed)
        # np.asarray keeps an existing ndarray as-is (no copy) and lets plain
        # nested lists be indexed with [:, state_idx] as well.
        self.policy = np.asarray(policy)

    def take_action(self, state_idx):
        """
        Sample an action for the given state from p(a | state_idx).

        Returns the action as an int in range(num_actions); for the
        two-action blackjack policy this is 0 for hit, 1 for stick.
        """
        # The action set is derived from the policy's first axis instead of
        # being hard-coded to two actions.
        num_actions = self.policy.shape[0]
        action = self.rng.choice(num_actions, p=self.policy[:, state_idx])
        return int(action)
+
+
def get_state_idx(has_usable_ace, player_value, dealer_card):
    """
    Map a blackjack state to its flat index in the 200-entry state table.

    Layout: usable_ace * 100 + (player_value - 12) * 10 + dealer_idx

    Args:
        has_usable_ace: truthy when the player holds an ace counted as 11.
        player_value: player's hand total; must be in 12..21.
        dealer_card: dealer's face-up card, either "ACE" or something
            int() can parse ("2".."10" or 2..10).

    Returns:
        int index in 0..199.

    Raises:
        ValueError: if player_value or dealer_card falls outside the table.
            Without this check a total below 12 would produce a negative
            index (which silently wraps around in NumPy) and a total above
            21 would spill into the usable-ace half of the table.
    """
    player_idx = int(player_value) - 12
    if not 0 <= player_idx <= 9:
        raise ValueError(
            f"player_value must be between 12 and 21, got {player_value!r}"
        )

    # The ace occupies slot 0; a card showing n occupies slot n - 1.
    dealer_idx = 0 if dealer_card == "ACE" else int(dealer_card) - 1
    if not 0 <= dealer_idx <= 9:
        raise ValueError(
            f"dealer_card must be 'ACE' or a value up to 10, got {dealer_card!r}"
        )

    usable_idx = 1 if has_usable_ace else 0
    return usable_idx * 100 + player_idx * 10 + dealer_idx
diff --git a/black_jack_env.py b/black_jack_env.py
@@ -0,0 +1,80 @@
+import numpy as np
+
# One entry per drawable card; every draw samples uniformly from this list
# with replacement.
# NOTE: the list mixes a string with ints, so NumPy coerces it to an array of
# strings when sampling -- drawn cards come back as "ACE", "2", ..., "10".
CARDS = ["ACE", 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10]


class BlackJackEnv:
    """
    One episode of blackjack between a single player and the dealer.

    Results reported by the methods: 1 = player wins, 0 = draw,
    -1 = player loses, None = the episode is still in progress.
    """

    def __init__(self, seed=None):
        """
        Args:
            seed: seed for the environment's random generator; None draws
                fresh entropy.
        """
        self.rng = np.random.default_rng(seed=seed)
        self.dealer_hand = []
        self.player_hand = []

    def _draw_card(self):
        """Draw a single card from CARDS and return it as a plain str."""
        return str(self.rng.choice(CARDS))

    def deal_cards(self):
        """
        Deal two cards to both player and dealer and start the episode.

        A natural (21 on the player's first two cards) ends the episode
        immediately: draw if the dealer's two cards also total 21, win
        otherwise.  The natural check is done BEFORE any automatic draws, so
        a 21 built from three or more cards is not scored as a natural.

        Otherwise the player automatically draws until the hand is worth at
        least 12, so the returned hand may hold more than two cards.

        Returns:
            (result, player_hand, dealer_card): (0 or 1, None, None) on a
            natural, else (None, the player's hand, the dealer's first card).
            The hand is the environment's own list, not a copy.
        """
        self.dealer_hand = self.rng.choice(CARDS, 2).tolist()
        self.player_hand = self.rng.choice(CARDS, 2).tolist()

        if calc_hand_value(self.player_hand) == 21:
            dealer_has_natural = calc_hand_value(self.dealer_hand) == 21
            return (0 if dealer_has_natural else 1), None, None

        # Below 12 another card can never bust the hand (an ace drops back
        # from 11 to 1), so these draws involve no decision.
        while calc_hand_value(self.player_hand) < 12:
            self.player_hand.append(self._draw_card())

        return None, self.player_hand, self.dealer_hand[0]

    def hit(self):
        """
        Deal one card to the player.

        Returns:
            (-1, None) if the player busts (value above 21), otherwise
            (None, player_hand) with the updated hand.
        """
        self.player_hand.append(self._draw_card())
        if calc_hand_value(self.player_hand) > 21:
            return -1, None
        return None, self.player_hand

    def stick(self):
        """
        End the player's turn and play out the dealer's hand.

        The dealer keeps drawing while the hand is worth less than 17, then
        the two hands are compared.

        Returns:
            1 if the dealer busts or ends below the player, 0 on equal
            values, -1 otherwise.
        """
        while calc_hand_value(self.dealer_hand) < 17:
            self.dealer_hand.append(self._draw_card())

        player_val = calc_hand_value(self.player_hand)
        dealer_val = calc_hand_value(self.dealer_hand)
        if dealer_val > 21 or dealer_val < player_val:
            return 1
        if dealer_val == player_val:
            return 0
        return -1
+
+
def calc_hand_value(hand):
    """
    Return the best blackjack value of a hand.

    Thin wrapper around calc_hand_value_usable_ace that drops the usable-ace
    count, so the two functions cannot drift apart.

    Args:
        hand: iterable of cards, each either "ACE" or something int() can
            parse (e.g. 7 or "7").

    Returns:
        The hand total as an int; 0 for an empty hand.
    """
    value, _ = calc_hand_value_usable_ace(hand)
    return value


def calc_hand_value_usable_ace(hand):
    """
    Return the hand value together with the number of aces still counted as 11.

    Every ace starts at 11; while the total exceeds 21, one ace at a time is
    downgraded to 1 (total reduced by 10).

    Args:
        hand: iterable of cards, each either "ACE" or something int() can
            parse (e.g. 7 or "7").

    Returns:
        (value, usable_aces): the hand total and how many aces are still
        worth 11 in that total, both plain ints.
    """
    total = 0
    usable_aces = 0
    for card in hand:
        if card == "ACE":
            total += 11
            usable_aces += 1
        else:
            total += int(card)

    while total > 21 and usable_aces > 0:
        total -= 10
        usable_aces -= 1
    return total, usable_aces