data-bootcamp-v4 · anwenvroberts-max · May 1, 2026
diff --git a/sakila_connect.ipynb b/sakila_connect.ipynb
@@ -0,0 +1,98 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "4317edf6",
+   "metadata": {},
+   "source": [
+    "# Lab 7 - Connecting SQL to Python\n",
+    "https://github.com/data-bootcamp-v4/lab-sql-python-connection "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e8d8cd92",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 1: Create the SQLAlchemy engine\n",
+    "\n",
+    "import os\n",
+    "from getpass import getpass\n",
+    "\n",
+    "import pandas as pd\n",
+    "from sqlalchemy import create_engine\n",
+    "from sqlalchemy.engine import URL\n",
+    "\n",
+    "# IMPORTANT: Never put SQL credentials in files you upload! Connection settings are\n",
+    "#  read from environment variables (e.g. loaded from a git-ignored .env file):\n",
+    "#    MYSQL_USER      (default 'root')\n",
+    "#    MYSQL_PASSWORD  (no default - prompted for with getpass if unset or empty)\n",
+    "#    MYSQL_HOST      (default 'localhost')\n",
+    "#    MYSQL_DATABASE  (default 'sakila')\n",
+    "#  URL.create takes each part as a separate field, so a password containing\n",
+    "#  characters like '@' or ':' does not need manual URL-escaping the way a\n",
+    "#  hand-written 'mysql+pymysql://username:password@localhost/database' string does\n",
+    "\n",
+    "engine = create_engine(\n",
+    "    URL.create(\n",
+    "        drivername='mysql+pymysql',\n",
+    "        username=os.environ.get('MYSQL_USER', 'root'),\n",
+    "        # The password is never stored in a notebook variable or in the file itself\n",
+    "        password=os.environ.get('MYSQL_PASSWORD') or getpass('MySQL password: '),\n",
+    "        host=os.environ.get('MYSQL_HOST', 'localhost'),\n",
+    "        database=os.environ.get('MYSQL_DATABASE', 'sakila'),\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "#  Step 2: Fetching data (rentals_month)\n",
+    "\n",
+    "#  Instead of typing SELECT * in MySQL, pass the query to pd.read_sql_query()\n",
+    "#  month and year are sent as bound parameters (:month, :year) instead of being\n",
+    "#  formatted into the SQL text, so their values can never be executed as SQL\n",
+    "\n",
+    "def rentals_month(engine, month, year):\n",
+    "    \"\"\"Return all rows of the rental table rented in the given month and year.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    engine : SQLAlchemy engine used to run the query.\n",
+    "    month : value compared against MONTH(rental_date).\n",
+    "    year : value compared against YEAR(rental_date).\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    pandas.DataFrame holding every column of the matching rental rows.\n",
+    "    \"\"\"\n",
+    "    # Imported here so the function works even if only create_engine was imported above\n",
+    "    from sqlalchemy import text\n",
+    "\n",
+    "    query = text(\n",
+    "        \"\"\"\n",
+    "        SELECT *\n",
+    "        FROM rental\n",
+    "        WHERE MONTH(rental_date) = :month AND YEAR(rental_date) = :year;\n",
+    "        \"\"\"\n",
+    "    )\n",
+    "    # The driver binds the values separately from the SQL, which prevents SQL injection\n",
+    "    df = pd.read_sql_query(query, engine, params={'month': month, 'year': year})\n",
+    "    return df\n",
+    "\n",
+    "# Step 3: Grouping the Data (rental_count_month)\n",
+    "\n",
+    "# One row per customer_id with the number of rentals, in a column named after the month/year\n",
+    "def rental_count_month(df, month, year):\n",
+    "    \"\"\"Count rentals per customer and label the count column with the period.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    df : DataFrame with 'customer_id' and 'rental_id' columns.\n",
+    "    month : integer month, zero-padded to two digits in the column name (3 -> 03).\n",
+    "    year : year used in the column name.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    DataFrame with columns 'customer_id' and 'rentals_<MM>_<YYYY>'.\n",
+    "    \"\"\"\n",
+    "    per_customer = df.groupby('customer_id')['rental_id'].count()\n",
+    "\n",
+    "    label = f\"rentals_{month:02d}_{year}\"\n",
+    "\n",
+    "    # Naming the Series first means reset_index() yields the final column name directly\n",
+    "    return per_customer.rename(label).reset_index()\n",
+    "\n",
+    "# Step 4: Compare months with compare_rentals\n",
+    "# Merge the two monthly summaries on customer_id with an OUTER merge so customers\n",
+    "# who rented in only one of the two months are not dropped\n",
+    "\n",
+    "def compare_rentals(df1, df2):\n",
+    "    \"\"\"Merge two per-customer rental summaries and add a 'difference' column.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    df1, df2 : DataFrames with a 'customer_id' column; the second column of each\n",
+    "        frame is taken as its rental-count column.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    DataFrame with 'customer_id', both count columns (missing values filled with 0)\n",
+    "    and 'difference' = df2 count - df1 count. If both frames use the same count\n",
+    "    column name, pandas' '_x' (df1) and '_y' (df2) suffixes appear in the result.\n",
+    "    \"\"\"\n",
+    "    col_month1 = df1.columns[1]\n",
+    "    col_month2 = df2.columns[1]\n",
+    "\n",
+    "    # Checked before merging: the NaNs created by the outer merge upcast integer counts to float\n",
+    "    counts_are_int = (\n",
+    "        pd.api.types.is_integer_dtype(df1[col_month1])\n",
+    "        and pd.api.types.is_integer_dtype(df2[col_month2])\n",
+    "    )\n",
+    "\n",
+    "    # Outer merge to keep all customers, fill NaNs with 0\n",
+    "    merged_df = pd.merge(df1, df2, on='customer_id', how='outer').fillna(0)\n",
+    "\n",
+    "    if col_month1 == col_month2:\n",
+    "        # Identical names are disambiguated by merge with _x / _y; follow the renamed columns\n",
+    "        col_month1, col_month2 = f'{col_month1}_x', f'{col_month2}_y'\n",
+    "\n",
+    "    if counts_are_int:\n",
+    "        # Restore whole-number counts (2 instead of 2.0) now that the NaNs are gone\n",
+    "        merged_df = merged_df.astype({col_month1: 'int64', col_month2: 'int64'})\n",
+    "\n",
+    "    merged_df['difference'] = merged_df[col_month2] - merged_df[col_month1]\n",
+    "\n",
+    "    return merged_df\n",
+    "\n",
+    "# Step 5: Execution Block\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    # May 2005: raw rental rows, then rentals per customer\n",
+    "    df_may = rentals_month(engine, 5, 2005)\n",
+    "    count_may = rental_count_month(df_may, 5, 2005)\n",
+    "\n",
+    "    # June 2005: raw rental rows, then rentals per customer\n",
+    "    df_june = rentals_month(engine, 6, 2005)\n",
+    "    count_june = rental_count_month(df_june, 6, 2005)\n",
+    "\n",
+    "    # Side-by-side comparison; 'difference' is June minus May\n",
+    "    final_report = compare_rentals(count_may, count_june)\n",
+    "    print(final_report.head())\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "edfc5840",
+   "metadata": {},
+   "source": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}