# Open a SQLAlchemy engine on the local Sakila database and run a small
# query to confirm that the connection works.
from getpass import getpass  # prompt for the password instead of hard-coding it
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

# Connection settings
user = "root"
password = getpass("Introduce tu contraseña de MySQL: ")
host = "localhost"
database = "sakila"

# The credentials are URL-encoded before being placed in the database URL:
# a raw password containing characters such as "@", "/", ":" or "#" would
# otherwise be parsed as part of the host/path and the connection would fail.
engine = create_engine(
    f"mysql+pymysql://{quote_plus(user)}:{quote_plus(password)}@{host}/{database}"
)

# Smoke-test the connection by reading a few rows of the rental table.
# Only the query sits inside the `try`; the success message runs in `else`.
try:
    df_total = pd.read_sql("SELECT * FROM rental LIMIT 10;", engine)
except Exception as e:
    # Best-effort notebook check: report the problem instead of raising.
    print("Error al conectar:", e)
else:
    print("Conexión correcta.")
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rental_idrental_dateinventory_idcustomer_idreturn_datestaff_idlast_update
012005-05-24 22:53:303671302005-05-26 22:04:3012006-02-15 21:30:53
122005-05-24 22:54:3315254592005-05-28 19:40:3312006-02-15 21:30:53
232005-05-24 23:03:3917114082005-06-01 22:12:3912006-02-15 21:30:53
342005-05-24 23:04:4124523332005-06-03 01:43:4122006-02-15 21:30:53
452005-05-24 23:05:2120792222005-06-02 04:33:2112006-02-15 21:30:53
\n", + "
# Exploratory cells: preview the sample, reload the whole rental table and
# inspect which months it covers. "# %%" marks the notebook cell boundaries.

# %%
# Preview of the 10-row sample loaded by the connection check.
df_total.head()

# %%
# Reload the table without the LIMIT so the full date range can be inspected.
df_total = pd.read_sql("SELECT * FROM rental;", engine)

# A notebook cell only displays its last expression, so the date range has to
# be printed explicitly; as a bare expression in the middle of the cell it was
# computed and silently discarded.
print(df_total['rental_date'].min(), df_total['rental_date'].max())
df_total.shape

# %%
# Distinct year-month periods present in the rental dates.
df_total['rental_date'].dt.to_period('M').unique()
def _checked_int(value, label):
    """Return *value* as an ``int``, rejecting anything that is not a whole number."""
    if isinstance(value, float) and not value.is_integer():
        raise ValueError(f"{label} must be a whole number, got {value!r}")
    try:
        return int(value)
    except (TypeError, ValueError) as err:
        raise ValueError(f"{label} must be an integer, got {value!r}") from err


def rentals_month(engine, month, year):
    """Fetch the rows of the ``rental`` table rented in a given month and year.

    Args:
        engine: Connection object handed unchanged to ``pandas.read_sql``
            (the notebook passes its SQLAlchemy engine).
        month: Month number from 1 to 12; an int or a numeric string.
        year: Year number; an int or a numeric string.

    Returns:
        A pandas DataFrame with every column of the matching ``rental`` rows.

    Raises:
        ValueError: If ``month`` or ``year`` is not a whole number, or if
            ``month`` is outside 1-12.
    """
    month_number = _checked_int(month, "month")
    year_number = _checked_int(year, "year")
    if not 1 <= month_number <= 12:
        raise ValueError(f"month must be between 1 and 12, got {month_number}")

    # Only the validated integers are interpolated, so arbitrary text passed as
    # `month`/`year` can never reach the SQL statement. Formatting plain ints
    # (rather than driver-specific bind markers) keeps the query usable with
    # whatever connection type is passed to pandas.
    query = f"""
        SELECT *
        FROM rental
        WHERE MONTH(rental_date) = {month_number}
          AND YEAR(rental_date) = {year_number};
    """

    return pd.read_sql(query, engine)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rental_idrental_dateinventory_idcustomer_idreturn_datestaff_idlast_update
012005-05-24 22:53:303671302005-05-26 22:04:3012006-02-15 21:30:53
122005-05-24 22:54:3315254592005-05-28 19:40:3312006-02-15 21:30:53
232005-05-24 23:03:3917114082005-06-01 22:12:3912006-02-15 21:30:53
342005-05-24 23:04:4124523332005-06-03 01:43:4122006-02-15 21:30:53
452005-05-24 23:05:2120792222005-06-02 04:33:2112006-02-15 21:30:53
\n", + "
" + ], + "text/plain": [ + " rental_id rental_date inventory_id customer_id \\\n", + "0 1 2005-05-24 22:53:30 367 130 \n", + "1 2 2005-05-24 22:54:33 1525 459 \n", + "2 3 2005-05-24 23:03:39 1711 408 \n", + "3 4 2005-05-24 23:04:41 2452 333 \n", + "4 5 2005-05-24 23:05:21 2079 222 \n", + "\n", + " return_date staff_id last_update \n", + "0 2005-05-26 22:04:30 1 2006-02-15 21:30:53 \n", + "1 2005-05-28 19:40:33 1 2006-02-15 21:30:53 \n", + "2 2005-06-01 22:12:39 1 2006-02-15 21:30:53 \n", + "3 2005-06-03 01:43:41 2 2006-02-15 21:30:53 \n", + "4 2005-06-02 04:33:21 1 2006-02-15 21:30:53 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_rentals_5_2005 = rentals_month(engine,5,2005) \n", + "print(df_rentals_5_2005.shape)\n", + "df_rentals_5_2005.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "117f28ba", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(182, 7)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rental_idrental_dateinventory_idcustomer_idreturn_datestaff_idlast_update
0114962006-02-14 15:16:032047155None12006-02-15 21:30:53
1115412006-02-14 15:16:032026335None12006-02-15 21:30:53
2115632006-02-14 15:16:03154583None12006-02-15 21:30:53
3115772006-02-14 15:16:034106219None22006-02-15 21:30:53
4115932006-02-14 15:16:0381799None12006-02-15 21:30:53
\n", + "
" + ], + "text/plain": [ + " rental_id rental_date inventory_id customer_id return_date \\\n", + "0 11496 2006-02-14 15:16:03 2047 155 None \n", + "1 11541 2006-02-14 15:16:03 2026 335 None \n", + "2 11563 2006-02-14 15:16:03 1545 83 None \n", + "3 11577 2006-02-14 15:16:03 4106 219 None \n", + "4 11593 2006-02-14 15:16:03 817 99 None \n", + "\n", + " staff_id last_update \n", + "0 1 2006-02-15 21:30:53 \n", + "1 1 2006-02-15 21:30:53 \n", + "2 1 2006-02-15 21:30:53 \n", + "3 2 2006-02-15 21:30:53 \n", + "4 1 2006-02-15 21:30:53 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_rentals_2_2006 = rentals_month(engine,2,2006) \n", + "print(df_rentals_2_2006.shape)\n", + "df_rentals_2_2006.head()" + ] + }, + { + "cell_type": "markdown", + "id": "fa9fa48f", + "metadata": {}, + "source": [ + "Desarrolle una función de Python llamada Rental_count_month que tome el DataFrame proporcionado por Rentals_month como entrada junto con el mes y el año y devuelva un nuevo DataFrame que contenga la cantidad de alquileres realizados por cada customer_id durante el mes y año seleccionados. \n", + "\n", + "La función también debe incluir el mes y el año como parámetros y usarlos para nombrar la nueva columna según el mes y el año, por ejemplo, si el mes de entrada es 05 y el año es 2005, el nombre de la columna debe ser \"alquileres_05_2005\". 
def rental_count_month(df, month, year):
    """Count the rentals of each customer and label the count with month/year.

    Args:
        df: DataFrame of rental rows containing a ``customer_id`` column.
        month: Month used in the column label; it is zero-padded to two
            characters (5 -> "05").
        year: Year used in the column label.

    Returns:
        A DataFrame with one row per ``customer_id`` present in ``df`` and two
        columns: ``customer_id`` and ``alquileres_<MM>_<YYYY>`` holding the
        number of rows for that customer.
    """
    # `month` and `year` only name the column; `df` itself is not filtered here.
    count_column = f"alquileres_{str(month).zfill(2)}_{year}"

    # size() counts rows per group; reset_index(name=...) turns the resulting
    # Series into a two-column frame with the count column already named.
    return df.groupby("customer_id").size().reset_index(name=count_column)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idalquileres_05_2005
012
121
232
353
463
\n", + "
" + ], + "text/plain": [ + " customer_id alquileres_05_2005\n", + "0 1 2\n", + "1 2 1\n", + "2 3 2\n", + "3 5 3\n", + "4 6 3" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_alquileres_5_2005 = rental_count_month(df_rentals_5_2005,5,2005) \n", + "df_alquileres_5_2005.head()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "3d0b2394", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idalquileres_02_2006
051
191
2111
3141
4152
\n", + "
def compare_rentals(df1, df2):
    """Merge two per-customer rental counts and add their difference.

    Args:
        df1: DataFrame with a ``customer_id`` column and, as its second
            column, the rental count of the first period.
        df2: DataFrame with the same layout for the second period.

    Returns:
        The outer merge of both frames on ``customer_id``, with missing counts
        filled with 0, every column cast to ``int``, and a ``diferencia``
        column equal to the first count minus the second. When both inputs
        use the same count-column name, the merged count columns carry the
        ``_x`` (``df1``) and ``_y`` (``df2``) suffixes.
    """
    first_count = df1.columns[1]
    second_count = df2.columns[1]

    # Outer join keeps customers that rented in only one of the two periods;
    # their missing count becomes 0. fillna() leaves floats behind, hence the
    # cast back to int.
    merged = (
        df1.merge(df2, on="customer_id", how="outer", suffixes=("_x", "_y"))
        .fillna(0)
        .astype(int)
    )

    # Identical count-column names are disambiguated by merge() with the
    # suffixes above, so the original name no longer exists in `merged`.
    if first_count == second_count:
        first_count, second_count = f"{first_count}_x", f"{second_count}_y"

    merged["diferencia"] = merged[first_count] - merged[second_count]

    return merged
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idalquileres_05_2005alquileres_02_2006diferencia
01202
12101
23202
35312
46303
...............
534594404
535595101
536596615
537597211
538599101
\n", + "

539 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " customer_id alquileres_05_2005 alquileres_02_2006 diferencia\n", + "0 1 2 0 2\n", + "1 2 1 0 1\n", + "2 3 2 0 2\n", + "3 5 3 1 2\n", + "4 6 3 0 3\n", + ".. ... ... ... ...\n", + "534 594 4 0 4\n", + "535 595 1 0 1\n", + "536 596 6 1 5\n", + "537 597 2 1 1\n", + "538 599 1 0 1\n", + "\n", + "[539 rows x 4 columns]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "compare_rentals(df_alquileres_5_2005,df_alquileres_2_2006)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.14.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}