Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions sakila_connect.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "4317edf6",
"metadata": {},
"source": [
"# Lab 7 - Connecting SQL to Python\n",
"https://github.com/data-bootcamp-v4/lab-sql-python-connection "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e8d8cd92",
"metadata": {},
"outputs": [],
"source": [
"# Step 1: Create the SQLAlchemy engine\n",
"\n",
"import os\n",
"from urllib.parse import quote_plus\n",
"\n",
"import pandas as pd\n",
"from sqlalchemy import create_engine, text\n",
"\n",
"# IMPORTANT: never commit SQL credentials. Connection settings are read from environment variables\n",
"# (MYSQL_USER, MYSQL_PASSWORD, MYSQL_HOST, MYSQL_DATABASE), e.g. exported from a local .env file.\n",
"# The fallbacks below are the lab placeholders, not real credentials.\n",
"# URL format: 'mysql+pymysql://username:password@host/database'\n",
"\n",
"db_user = os.environ.get('MYSQL_USER', 'root')\n",
"db_password = os.environ.get('MYSQL_PASSWORD', 'my_password')\n",
"db_host = os.environ.get('MYSQL_HOST', 'localhost')\n",
"db_name = os.environ.get('MYSQL_DATABASE', 'sakila')\n",
"\n",
"# quote_plus escapes characters such as '@' or ':' that would otherwise break the URL\n",
"engine = create_engine(\n",
"    f'mysql+pymysql://{quote_plus(db_user)}:{quote_plus(db_password)}@{db_host}/{db_name}'\n",
")\n",
"\n",
"# Step 2: Fetching data (rentals_month)\n",
"\n",
"# Instead of typing SELECT * in MySQL, pass the query to pandas with pd.read_sql_query()\n",
"# Month and year are sent as bound parameters (:month, :year), never formatted into the SQL text\n",
"\n",
"def rentals_month(engine, month, year):\n",
"    \"\"\"Return every row of the `rental` table whose rental_date falls in the given month and year.\n",
"\n",
"    Args:\n",
"        engine: SQLAlchemy engine (or connection) passed to pd.read_sql_query.\n",
"        month: month number; converted with int() before being bound.\n",
"        year: year number; converted with int() before being bound.\n",
"\n",
"    Returns:\n",
"        DataFrame produced by pd.read_sql_query for the matching rentals.\n",
"\n",
"    Raises:\n",
"        ValueError / TypeError: if month or year cannot be converted to int.\n",
"    \"\"\"\n",
"    # Bound parameters keep caller-supplied values out of the SQL string (no SQL injection)\n",
"    query = text(\n",
"        \"\"\"\n",
"        SELECT *\n",
"        FROM rental\n",
"        WHERE MONTH(rental_date) = :month AND YEAR(rental_date) = :year\n",
"        \"\"\"\n",
"    )\n",
"    return pd.read_sql_query(query, engine, params={'month': int(month), 'year': int(year)})\n",
"\n",
"# Step 3: Grouping the Data (rental_count_month)\n",
"\n",
"# Group by customer_id and count how many rentals they made, and rename column \n",
"def rental_count_month(df, month, year):\n",
"    \"\"\"Count rentals per customer for one month.\n",
"\n",
"    Groups `df` by `customer_id`, counts the `rental_id` values in each group and\n",
"    returns a two-column frame: `customer_id` and `rentals_MM_YYYY`.\n",
"    \"\"\"\n",
"    per_customer = df.groupby('customer_id')['rental_id'].count()\n",
"\n",
"    # :02d zero-pads single-digit months, e.g. 3 -> 03\n",
"    count_label = f\"rentals_{month:02d}_{year}\"\n",
"\n",
"    # Name the count series first, then turn the customer_id index back into a column\n",
"    return per_customer.rename(count_label).reset_index()\n",
"\n",
"# Step 4: Compare months with compare_rentals\n",
"# Compare the two DataFrames by merging on customer_id - use an OUTER merge so that customers with no rentals in one of the months are not dropped\n",
"\n",
"def compare_rentals(df1, df2):\n",
"    \"\"\"Merge two per-customer rental-count frames and add a `difference` column.\n",
"\n",
"    Args:\n",
"        df1: frame with `customer_id` and, as its second column, the counts for the first month.\n",
"        df2: same layout for the second month.\n",
"\n",
"    Returns:\n",
"        DataFrame with one row per customer found in either input, both count columns\n",
"        (0 where a customer has no row in that month) and `difference` = second - first.\n",
"    \"\"\"\n",
"    col_month1 = df1.columns[1]\n",
"    col_month2 = df2.columns[1]\n",
"\n",
"    # Outer merge keeps every customer; a customer absent from one month gets NaN there, filled with 0\n",
"    merged_df = pd.merge(df1, df2, on='customer_id', how='outer').fillna(0)\n",
"\n",
"    # NaN forces the count columns to float during the merge; restore the input dtypes (3.0 -> 3)\n",
"    merged_df = merged_df.astype({\n",
"        col_month1: df1[col_month1].dtype,\n",
"        col_month2: df2[col_month2].dtype,\n",
"    })\n",
"\n",
"    merged_df['difference'] = merged_df[col_month2] - merged_df[col_month1]\n",
"    return merged_df\n",
"\n",
"# Step 5: Execution Block\n",
"\n",
"if __name__ == \"__main__\":\n",
"    # Raw rental rows for May and June 2005 (May is queried first, then June)\n",
"    df_may, df_june = [rentals_month(engine, m, 2005) for m in (5, 6)]\n",
"\n",
"    # Rentals per customer for each of the two months\n",
"    count_may = rental_count_month(df_may, 5, 2005)\n",
"    count_june = rental_count_month(df_june, 6, 2005)\n",
"\n",
"    # Side-by-side comparison (June minus May) and a preview of the first five rows\n",
"    final_report = compare_rentals(count_may, count_june)\n",
"    print(final_report.head(5))\n"
]
},
{
"cell_type": "markdown",
"id": "edfc5840",
"metadata": {},
"source": []
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 5
}