From 049fdf71004c8703608147d834581c8b295445d1 Mon Sep 17 00:00:00 2001
From: Edson Antonio
Date: Sun, 3 May 2026 23:21:33 +0200
Subject: [PATCH 1/2] Solved SQL subqueries lab

---
 .../sql_subqueries/lab_sql_subqueries.sql     | 71 +++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 OneDrive/Desktop/sql_subqueries/lab_sql_subqueries.sql

diff --git a/OneDrive/Desktop/sql_subqueries/lab_sql_subqueries.sql b/OneDrive/Desktop/sql_subqueries/lab_sql_subqueries.sql
new file mode 100644
index 0000000..ae066fb
--- /dev/null
+++ b/OneDrive/Desktop/sql_subqueries/lab_sql_subqueries.sql
@@ -0,0 +1,71 @@
+-- LAB SQL SUBQUERIES
+USE sakila;
+
+-- 1. Films whose rental rate is above the average rental rate of all films.
+SELECT
+    title,
+    rental_rate
+FROM film
+WHERE rental_rate > (
+    SELECT AVG(rental_rate)
+    FROM film
+);
+
+-- 2. Customers with more rentals than the average customer; the derived
+--    table yields one rental count per customer before averaging.
+SELECT
+    customer_id,
+    COUNT(rental_id) AS total_rentals
+FROM rental
+GROUP BY customer_id
+HAVING COUNT(rental_id) > (
+    SELECT AVG(rental_count)
+    FROM (
+        SELECT COUNT(rental_id) AS rental_count
+        FROM rental
+        GROUP BY customer_id
+    ) AS sub
+);
+
+-- 3. Films rented more often than the average rented film. Grouped by
+--    film_id so the outer counts use the same key as the averaged counts.
+SELECT
+    f.title,
+    COUNT(r.rental_id) AS times_rented
+FROM film AS f
+INNER JOIN inventory AS i
+    ON f.film_id = i.film_id
+INNER JOIN rental AS r
+    ON i.inventory_id = r.inventory_id
+GROUP BY f.film_id, f.title
+HAVING COUNT(r.rental_id) > (
+    SELECT AVG(rental_count)
+    FROM (
+        SELECT COUNT(*) AS rental_count
+        FROM rental AS ren
+        INNER JOIN inventory AS inv
+            ON ren.inventory_id = inv.inventory_id
+        GROUP BY inv.film_id
+    ) AS sub
+)
+ORDER BY times_rented DESC;
+
+-- 4. Customers whose total payments exceed the average customer total.
+SELECT
+    c.customer_id,
+    CONCAT(c.first_name, ' ', c.last_name) AS full_name,
+    SUM(p.amount) AS total_spent
+FROM customer AS c
+INNER JOIN payment AS p
+    ON c.customer_id = p.customer_id
+GROUP BY c.customer_id, c.first_name, c.last_name
+HAVING SUM(p.amount) > (
+    SELECT AVG(customer_total)
+    FROM (
+        SELECT SUM(amount) AS customer_total
+        FROM payment
+        GROUP BY customer_id
+    ) AS sub
+)
+ORDER BY total_spent DESC;
+

From 7ca02b596abeb51c9dfb6eeff0b36363c3a1db5e Mon Sep 17 00:00:00 2001
From: Edson Antonio
Date: Fri, 15 May 2026 16:06:33 +0200
Subject: [PATCH 2/2] ensemble
lab completed --- .../lab-ensemble-clean/lab-ensemble.ipynb | 804 ++++++++++++++++++ 1 file changed, 804 insertions(+) create mode 100644 OneDrive/Desktop/lab-ensemble-clean/lab-ensemble.ipynb diff --git a/OneDrive/Desktop/lab-ensemble-clean/lab-ensemble.ipynb b/OneDrive/Desktop/lab-ensemble-clean/lab-ensemble.ipynb new file mode 100644 index 0000000..3a5af57 --- /dev/null +++ b/OneDrive/Desktop/lab-ensemble-clean/lab-ensemble.ipynb @@ -0,0 +1,804 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# LAB | Ensemble Methods" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Load the data**\n", + "\n", + "In this challenge, we will be working with the same Spaceship Titanic data as in the previous Lab. The data can be found here:\n", + "\n", + "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv\n", + "\n", + "Metadata\n", + "\n", + "https://github.com/data-bootcamp-v4/data/blob/main/spaceship_titanic.md" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this Lab, you should try different ensemble methods in order to see if you can obtain a better model than before. In order to do a fair comparison, you should perform the same feature scaling and engineering applied in the previous Lab." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#Libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "from sklearn.ensemble import (\n", + " BaggingClassifier,\n", + " RandomForestClassifier,\n", + " GradientBoostingClassifier,\n", + " AdaBoostClassifier\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdHomePlanetCryoSleepCabinDestinationAgeVIPRoomServiceFoodCourtShoppingMallSpaVRDeckNameTransported
00001_01EuropaFalseB/0/PTRAPPIST-1e39.0False0.00.00.00.00.0Maham OfracculyFalse
10002_01EarthFalseF/0/STRAPPIST-1e24.0False109.09.025.0549.044.0Juanna VinesTrue
20003_01EuropaFalseA/0/STRAPPIST-1e58.0True43.03576.00.06715.049.0Altark SusentFalse
30003_02EuropaFalseA/0/STRAPPIST-1e33.0False0.01283.0371.03329.0193.0Solam SusentFalse
40004_01EarthFalseF/1/STRAPPIST-1e16.0False303.070.0151.0565.02.0Willy SantantinesTrue
\n", + "
" + ], + "text/plain": [ + " PassengerId HomePlanet CryoSleep Cabin Destination Age VIP \\\n", + "0 0001_01 Europa False B/0/P TRAPPIST-1e 39.0 False \n", + "1 0002_01 Earth False F/0/S TRAPPIST-1e 24.0 False \n", + "2 0003_01 Europa False A/0/S TRAPPIST-1e 58.0 True \n", + "3 0003_02 Europa False A/0/S TRAPPIST-1e 33.0 False \n", + "4 0004_01 Earth False F/1/S TRAPPIST-1e 16.0 False \n", + "\n", + " RoomService FoodCourt ShoppingMall Spa VRDeck Name \\\n", + "0 0.0 0.0 0.0 0.0 0.0 Maham Ofracculy \n", + "1 109.0 9.0 25.0 549.0 44.0 Juanna Vines \n", + "2 43.0 3576.0 0.0 6715.0 49.0 Altark Susent \n", + "3 0.0 1283.0 371.0 3329.0 193.0 Solam Susent \n", + "4 303.0 70.0 151.0 565.0 2.0 Willy Santantines \n", + "\n", + " Transported \n", + "0 False \n", + "1 True \n", + "2 False \n", + "3 False \n", + "4 True " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spaceship = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv\")\n", + "spaceship.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now perform the same as before:\n", + "- Feature Scaling\n", + "- Feature Selection\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\vicky\\AppData\\Local\\Temp\\ipykernel_14720\\1597633469.py:5: Pandas4Warning: For backward compatibility, 'str' dtypes are included by select_dtypes when 'object' dtype is specified. This behavior is deprecated and will be removed in a future version. 
Explicitly pass 'str' to `include` to select them, or to `exclude` to remove them and silence this warning.\n", + "See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.\n", + " for col in spaceship.select_dtypes(include=\"object\").columns:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
HomePlanetCryoSleepDestinationAgeVIPRoomServiceFoodCourtShoppingMallSpaVRDeckTransported
0EuropaFalseTRAPPIST-1e39.0False0.00.00.00.00.00
1EarthFalseTRAPPIST-1e24.0False109.09.025.0549.044.01
2EuropaFalseTRAPPIST-1e58.0True43.03576.00.06715.049.00
3EuropaFalseTRAPPIST-1e33.0False0.01283.0371.03329.0193.00
4EarthFalseTRAPPIST-1e16.0False303.070.0151.0565.02.01
\n", + "
" + ], + "text/plain": [ + " HomePlanet CryoSleep Destination Age VIP RoomService FoodCourt \\\n", + "0 Europa False TRAPPIST-1e 39.0 False 0.0 0.0 \n", + "1 Earth False TRAPPIST-1e 24.0 False 109.0 9.0 \n", + "2 Europa False TRAPPIST-1e 58.0 True 43.0 3576.0 \n", + "3 Europa False TRAPPIST-1e 33.0 False 0.0 1283.0 \n", + "4 Earth False TRAPPIST-1e 16.0 False 303.0 70.0 \n", + "\n", + " ShoppingMall Spa VRDeck Transported \n", + "0 0.0 0.0 0.0 0 \n", + "1 25.0 549.0 44.0 1 \n", + "2 0.0 6715.0 49.0 0 \n", + "3 371.0 3329.0 193.0 0 \n", + "4 151.0 565.0 2.0 1 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "# Drop unnecessary columns\n", + "spaceship = spaceship.drop(columns=[\"PassengerId\", \"Name\", \"Cabin\"])\n", + "\n", + "# Fill categorical missing values\n", + "for col in spaceship.select_dtypes(include=\"object\").columns:\n", + " spaceship[col] = spaceship[col].fillna(\n", + " spaceship[col].mode()[0]\n", + " )\n", + "\n", + "# Fill numerical missing values\n", + "for col in spaceship.select_dtypes(include=[\"float64\", \"int64\"]).columns:\n", + " spaceship[col] = spaceship[col].fillna(\n", + " spaceship[col].median()\n", + " )\n", + "\n", + "# Convert boolean columns\n", + "bool_cols = spaceship.select_dtypes(include=\"bool\").columns\n", + "\n", + "for col in bool_cols:\n", + " spaceship[col] = spaceship[col].astype(int)\n", + "\n", + "spaceship.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Perform Train Test Split**" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "#your code here\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.pipeline import Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "X = spaceship.drop(\"Transported\", 
axis=1)\n", + "y = spaceship[\"Transported\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\vicky\\AppData\\Local\\Temp\\ipykernel_14720\\1713163199.py:1: Pandas4Warning: For backward compatibility, 'str' dtypes are included by select_dtypes when 'object' dtype is specified. This behavior is deprecated and will be removed in a future version. Explicitly pass 'str' to `include` to select them, or to `exclude` to remove them and silence this warning.\n", + "See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.\n", + " categorical_cols = X.select_dtypes(\n" + ] + } + ], + "source": [ + "categorical_cols = X.select_dtypes(\n", + " include=[\"object\"]\n", + ").columns\n", + "\n", + "numerical_cols = X.select_dtypes(\n", + " exclude=[\"object\"]\n", + ").columns" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "preprocessor = ColumnTransformer(\n", + " transformers=[\n", + " (\n", + " \"cat\",\n", + " OneHotEncoder(handle_unknown=\"ignore\"),\n", + " categorical_cols\n", + " ),\n", + " (\n", + " \"num\",\n", + " StandardScaler(),\n", + " numerical_cols\n", + " )\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X,\n", + " y,\n", + " test_size=0.2,\n", + " random_state=42\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "X_train = preprocessor.fit_transform(X_train)\n", + "X_test = preprocessor.transform(X_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Model Selection** - now you will try to apply different ensemble methods in 
order to get a better model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Bagging and Pasting" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import accuracy_score" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bagging Accuracy: 0.7757331799884991\n" + ] + } + ], + "source": [ + "#your code here\n", + "bagging = BaggingClassifier(\n", + " n_estimators=100,\n", + " random_state=42\n", + ")\n", + "\n", + "bagging.fit(X_train, y_train)\n", + "\n", + "y_pred_bag = bagging.predict(X_test)\n", + "\n", + "bag_acc = accuracy_score(y_test, y_pred_bag)\n", + "\n", + "print(\"Bagging Accuracy:\", bag_acc)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Random Forests" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Random Forest Accuracy: 0.7809085681426107\n" + ] + } + ], + "source": [ + "#your code here\n", + "rf = RandomForestClassifier(\n", + " n_estimators=100,\n", + " random_state=42\n", + ")\n", + "\n", + "rf.fit(X_train, y_train)\n", + "\n", + "y_pred_rf = rf.predict(X_test)\n", + "\n", + "rf_acc = accuracy_score(y_test, y_pred_rf)\n", + "\n", + "print(\"Random Forest Accuracy:\", rf_acc)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Gradient Boosting" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { 
+ "name": "stdout", + "output_type": "stream", + "text": [ + "Gradient Boosting Accuracy: 0.7809085681426107\n" + ] + } + ], + "source": [ + "#your code here\n", + "gb = GradientBoostingClassifier(\n", + " n_estimators=100,\n", + " random_state=42\n", + ")\n", + "\n", + "gb.fit(X_train, y_train)\n", + "\n", + "y_pred_gb = gb.predict(X_test)\n", + "\n", + "gb_acc = accuracy_score(y_test, y_pred_gb)\n", + "\n", + "print(\"Gradient Boosting Accuracy:\", gb_acc)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Adaptive Boosting" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AdaBoost Accuracy: 0.7607820586543991\n" + ] + } + ], + "source": [ + "#your code here\n", + "ada = AdaBoostClassifier(\n", + " n_estimators=100,\n", + " random_state=42\n", + ")\n", + "\n", + "ada.fit(X_train, y_train)\n", + "\n", + "y_pred_ada = ada.predict(X_test)\n", + "\n", + "ada_acc = accuracy_score(y_test, y_pred_ada)\n", + "\n", + "print(\"AdaBoost Accuracy:\", ada_acc)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Which model is the best and why?" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ModelAccuracy
1Random Forest0.780909
2Gradient Boosting0.780909
0Bagging0.775733
3AdaBoost0.760782
\n", + "
" + ], + "text/plain": [ + " Model Accuracy\n", + "1 Random Forest 0.780909\n", + "2 Gradient Boosting 0.780909\n", + "0 Bagging 0.775733\n", + "3 AdaBoost 0.760782" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#comment here\n", + "results = pd.DataFrame({\n", + " \"Model\": [\n", + " \"Bagging\",\n", + " \"Random Forest\",\n", + " \"Gradient Boosting\",\n", + " \"AdaBoost\"\n", + " ],\n", + " \"Accuracy\": [\n", + " bag_acc,\n", + " rf_acc,\n", + " gb_acc,\n", + " ada_acc\n", + " ]\n", + "})\n", + "\n", + "results.sort_values(by=\"Accuracy\", ascending=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Random Forest and Gradient Boosting tied for the best test accuracy (0.7809), ahead of Bagging (0.7757) and AdaBoost (0.7608).\n", + "\n", + "Ensemble methods combine many individual learners, which improves generalization and reduces overfitting compared with a single model,\n", + "but all four models here are ensembles and the top three differ by under one percentage point on a single split, so the ranking should be confirmed with cross-validation." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.14.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}