# Lab 7 - Connecting SQL to Python
# https://github.com/data-bootcamp-v4/lab-sql-python-connection

# Step 1: Connect the SQLAlchemy engine

import os

import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL

# IMPORTANT: never commit SQL credentials to a file that gets uploaded.
# They are read from environment variables (export them in your shell or load
# them from an untracked .env file). The fallbacks are the original
# placeholders, so nothing changes when the variables are not set.
#
# URL.create() replaces a hand-written
# 'mysql+pymysql://username:password@localhost/database' string so that
# passwords containing characters such as '@', ':' or '/' are escaped properly.
engine = create_engine(
    URL.create(
        drivername="mysql+pymysql",
        username=os.getenv("MYSQL_USER", "root"),
        password=os.getenv("MYSQL_PASSWORD", "my_password"),
        host=os.getenv("MYSQL_HOST", "localhost"),
        database="sakila",
    )
)


# Step 2: Fetching data (rentals_month)

def rentals_month(engine, month, year):
    """Fetch all rows of the ``rental`` table for one calendar month.

    Args:
        engine: SQLAlchemy engine (or connection) passed straight to
            ``pd.read_sql_query``.
        month: Month number to match against ``MONTH(rental_date)``;
            coerced with ``int()``.
        year: Year to match against ``YEAR(rental_date)``; coerced with
            ``int()``.

    Returns:
        pandas.DataFrame holding every column of the matching ``rental`` rows.

    Raises:
        ValueError, TypeError: If ``month`` or ``year`` cannot be converted
            to ``int``.
    """
    # Bound parameters (":month", ":year") instead of f-string interpolation:
    # the values are sent separately from the SQL text, so a crafted argument
    # can never change the statement itself.
    query = text(
        """
        SELECT *
        FROM rental
        WHERE MONTH(rental_date) = :month AND YEAR(rental_date) = :year
        """
    )
    return pd.read_sql_query(
        query,
        engine,
        params={"month": int(month), "year": int(year)},
    )


# Step 3: Grouping the data (rental_count_month)

def rental_count_month(df, month, year):
    """Count rentals per customer and label the count column with the month.

    Args:
        df: DataFrame with at least ``customer_id`` and ``rental_id`` columns.
        month: Month number; only used to build the column label and coerced
            with ``int()`` so that ``"5"`` and ``5`` both give ``05``.
        year: Year; only used to build the column label.

    Returns:
        pandas.DataFrame with one row per ``customer_id`` and a second column
        named ``rentals_<MM>_<YYYY>`` holding the number of non-null
        ``rental_id`` values for that customer.
    """
    # Leading zero for single-digit months, e.g. 3 -> "03".
    col_name = f"rentals_{int(month):02d}_{year}"
    return (
        df.groupby("customer_id")["rental_id"]
        .count()
        .reset_index(name=col_name)
    )
# Step 4: Compare months with compare_rentals
# Merge the two per-month summaries on customer_id. An OUTER merge is used so
# that customers with rentals in only one of the two months are not dropped.

def compare_rentals(df1, df2):
    """Merge two per-customer rental summaries and add a ``difference`` column.

    Both inputs are expected to have ``customer_id`` as a column and the
    rental count as their second column (position 1).

    Args:
        df1: Summary for the first (earlier) period.
        df2: Summary for the second (later) period.

    Returns:
        pandas.DataFrame with every customer found in either input, both count
        columns (missing values filled with 0) and ``difference`` computed as
        second-period count minus first-period count. Count columns that were
        integer-typed in the input stay integer-typed in the result.
    """
    count_col1 = df1.columns[1]
    count_col2 = df2.columns[1]
    count_dtype1 = df1.dtypes.iloc[1]
    count_dtype2 = df2.dtypes.iloc[1]

    merged_df = pd.merge(
        df1,
        df2,
        on="customer_id",
        how="outer",
        suffixes=("_x", "_y"),
    ).fillna(0)

    # When both inputs use the same count column name (e.g. the same month is
    # compared with itself) merge() renames them with the suffixes above, so
    # the original labels no longer exist in the merged frame.
    merged_col1 = count_col1 if count_col1 in merged_df.columns else f"{count_col1}_x"
    merged_col2 = count_col2 if count_col2 in merged_df.columns else f"{count_col2}_y"

    # The outer merge introduces NaN for customers missing from one side, which
    # silently upcasts integer counts to float. Restore the integer dtype now
    # that the gaps have been filled with 0.
    for merged_col, dtype in ((merged_col1, count_dtype1), (merged_col2, count_dtype2)):
        if dtype.kind in "iu":
            merged_df[merged_col] = merged_df[merged_col].astype(dtype)

    merged_df["difference"] = merged_df[merged_col2] - merged_df[merged_col1]

    return merged_df


# Step 5: Execution block

if __name__ == "__main__":
    # Raw rental rows for each month
    df_may = rentals_month(engine, 5, 2005)
    df_june = rentals_month(engine, 6, 2005)

    # Per-customer rental counts for each month
    count_may = rental_count_month(df_may, 5, 2005)
    count_june = rental_count_month(df_june, 6, 2005)

    # Side-by-side comparison of the two months
    final_report = compare_rentals(count_may, count_june)
    print(final_report.head())