diff --git a/Week6/Week6_Data_Cleaning_Pavithra.ipynb b/Week6/Week6_Data_Cleaning_Pavithra.ipynb new file mode 100644 index 00000000..03e882c1 --- /dev/null +++ b/Week6/Week6_Data_Cleaning_Pavithra.ipynb @@ -0,0 +1,1209 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 73 + }, + "id": "IAuZ2MEWn77I", + "outputId": "8ec80c10-e315-4e4a-8be7-ae53c8dbe03e" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " \n", + " Upload widget is only available when the cell has been executed in the\n", + " current browser session. Please rerun this cell to enable.\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saving yellow_tripdata_2023-01.parquet to yellow_tripdata_2023-01.parquet\n" + ] + } + ], + "source": [ + "from google.colab import files\n", + "uploaded = files.upload()\n" + ] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "\n", + "# Replace 'example.parquet' with your uploaded file name\n", + "df = pd.read_parquet('yellow_tripdata_2023-01.parquet')\n", + "\n", + "# Convert to CSV and save\n", + "df.to_csv('converted_file.csv', index=False)\n" + ], + "metadata": { + "id": "P7nAJPPeocm4" + }, + "execution_count": 2, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from google.colab import files\n", + "files.download('converted_file.csv')\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 17 + }, + "id": "dnhsLFEypo1n", + "outputId": "4007381d-1b8a-4e56-b7f7-360383c47bcb" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "application/javascript": [ + "\n", + " async function download(id, filename, size) {\n", + " if (!google.colab.kernel.accessAllowed) {\n", + " return;\n", + " }\n", + " const div = document.createElement('div');\n", + " const label = document.createElement('label');\n", + " label.textContent = `Downloading \"${filename}\": `;\n", + " div.appendChild(label);\n", + " const progress = document.createElement('progress');\n", + " progress.max = size;\n", + " div.appendChild(progress);\n", + " document.body.appendChild(div);\n", + "\n", + " const buffers = [];\n", + " let downloaded = 0;\n", + "\n", + " const channel = await google.colab.kernel.comms.open(id);\n", + " // Send a message to notify the kernel that we're ready.\n", + " channel.send({})\n", + "\n", + " for await (const message of channel.messages) {\n", + " // Send a message to notify the kernel that we're ready.\n", + " channel.send({})\n", + " if (message.buffers) {\n", + " for (const buffer of message.buffers) {\n", + " buffers.push(buffer);\n", + " downloaded += buffer.byteLength;\n", + " progress.value = downloaded;\n", + " }\n", + " }\n", + " }\n", + " const blob = new Blob(buffers, {type: 'application/binary'});\n", + " const a = document.createElement('a');\n", + " a.href = window.URL.createObjectURL(blob);\n", + " a.download = filename;\n", + " div.appendChild(a);\n", + " a.click();\n", + " div.remove();\n", + " }\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "application/javascript": [ + "download(\"download_35a521f0-497b-408a-85ca-70ad2251c346\", \"converted_file.csv\", 321956581)" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "from google.colab import files\n", + "uploaded = files.upload()\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 73 + }, + "id": "3BGFJwN0qNIA", + "outputId": "0f0b83f5-fbb5-429a-ab6c-d0f19ba3b4ae" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " \n", + " Upload widget is only available when the cell has been executed in the\n", + " current browser session. Please rerun this cell to enable.\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saving yellow_tripdata_2023-01.csv to yellow_tripdata_2023-01.csv\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "import time\n", + "\n", + "start = time.time()\n", + "df_pd = pd.read_csv('yellow_tripdata_2023-01.csv') # Replace with exact uploaded filename\n", + "end = time.time()\n", + "\n", + "print(f\"Pandas read time: {end - start:.2f} seconds\")\n", + "df_pd.head()\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 278 + }, + "id": "yMb8NfxK9btk", + "outputId": "c1901e49-fb5b-485c-f764-a3586ceb1fd5" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + ":5: DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " df_pd = pd.read_csv('yellow_tripdata_2023-01.csv') # Replace with exact uploaded filename\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Pandas read time: 14.54 seconds\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count \\\n", + "0 2 2023-01-01 00:32:10 2023-01-01 00:40:36 1.0 \n", + "1 2 2023-01-01 00:55:08 2023-01-01 01:01:27 1.0 \n", + "2 2 2023-01-01 00:25:04 2023-01-01 00:37:49 1.0 \n", + "3 1 2023-01-01 00:03:48 2023-01-01 00:13:25 0.0 \n", + "4 2 2023-01-01 00:10:29 2023-01-01 00:21:19 1.0 \n", + "\n", + " trip_distance RatecodeID store_and_fwd_flag PULocationID DOLocationID \\\n", + "0 0.97 1.0 N 161 141 \n", + "1 1.10 1.0 N 43 237 \n", + "2 2.51 1.0 N 48 238 \n", + "3 1.90 1.0 N 138 7 \n", + "4 1.43 1.0 N 107 79 \n", + "\n", + " payment_type fare_amount extra mta_tax tip_amount tolls_amount \\\n", + "0 2 9.3 1.00 0.5 0.00 0.0 \n", + "1 1 7.9 1.00 0.5 4.00 0.0 \n", + "2 1 14.9 1.00 0.5 15.00 0.0 \n", + "3 1 12.1 7.25 0.5 0.00 0.0 \n", + "4 1 11.4 1.00 0.5 3.28 0.0 \n", + "\n", + " improvement_surcharge total_amount congestion_surcharge airport_fee \n", + "0 1.0 14.30 2.5 0.00 \n", + "1 1.0 16.90 2.5 0.00 \n", + "2 1.0 34.90 2.5 0.00 \n", + "3 1.0 20.85 0.0 1.25 \n", + "4 1.0 19.68 2.5 0.00 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
VendorIDtpep_pickup_datetimetpep_dropoff_datetimepassenger_counttrip_distanceRatecodeIDstore_and_fwd_flagPULocationIDDOLocationIDpayment_typefare_amountextramta_taxtip_amounttolls_amountimprovement_surchargetotal_amountcongestion_surchargeairport_fee
022023-01-01 00:32:102023-01-01 00:40:361.00.971.0N16114129.31.000.50.000.01.014.302.50.00
122023-01-01 00:55:082023-01-01 01:01:271.01.101.0N4323717.91.000.54.000.01.016.902.50.00
222023-01-01 00:25:042023-01-01 00:37:491.02.511.0N48238114.91.000.515.000.01.034.902.50.00
312023-01-01 00:03:482023-01-01 00:13:250.01.901.0N1387112.17.250.50.000.01.020.850.01.25
422023-01-01 00:10:292023-01-01 00:21:191.01.431.0N10779111.41.000.53.280.01.019.682.50.00
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df_pd" + } + }, + "metadata": {}, + "execution_count": 5 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df_pd.columns = (\n", + " df_pd.columns.str.strip()\n", + " .str.replace(' ', '_')\n", + " .str.replace('[^A-Za-z0-9_]+', '', regex=True)\n", + ")\n", + "print(\"✅ Cleaned column names:\")\n", + "print(df_pd.columns.tolist())\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-2RVp--dBltX", + "outputId": "4d5c122f-a333-4ff8-cda2-e70e0f5a7ca0" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "✅ Cleaned column names:\n", + "['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge', 'airport_fee']\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "!pip install pyyaml\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "AJDeo57ABx2B", + "outputId": "c0e9693c-4eff-4b2f-82db-f9478dd5195d" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.11/dist-packages (6.0.2)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import yaml\n", + "\n", + "# Define your column names (already cleaned)\n", + "columns = df_pd.columns.tolist()\n", + "\n", + "# Define schema dictionary\n", + "schema = {\n", + " 'separator': ',',\n", + " 'columns': columns\n", + "}\n", + "\n", + "# Save schema to YAML file\n", + "with open('schema.yaml', 'w') as file:\n", + " yaml.dump(schema, file)\n", + "\n", + "print(\"✅ YAML schema created successfully!\")\n", + "\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "JTTvUFPwC_5j", + "outputId": "d06e5799-a552-4602-a4bc-9827349f34b4" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "✅ YAML schema created successfully!\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import yaml\n", + "\n", + "# Load the YAML schema\n", + "with open('schema.yaml', 'r') as file:\n", + " schema = yaml.safe_load(file)\n", + "\n", + "# Extract expected column names\n", + "expected_columns = schema['columns']\n", + "\n", + "# Compare with actual columns\n", + "actual_columns = df_pd.columns.tolist()\n", + "\n", + "# Validation\n", + "if expected_columns == actual_columns:\n", + " print(\"✅ Validation successful: Column names and order match the YAML schema.\")\n", + "else:\n", + " print(\"❌ Validation failed!\")\n", + " print(\"Expected columns:\")\n", + " print(expected_columns)\n", + " print(\"Actual columns:\")\n", + " print(actual_columns)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "OoDqA33KD7lT", + "outputId": "ddb3ab1a-7ab1-4a28-a7df-b38da7682f03" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "✅ Validation successful: Column names and order match the YAML schema.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Save the DataFrame as a pipe-separated gzip file\n", + "output_file = 'yellow_tripdata_2023_pipe.gz'\n", + "\n", + "df_pd.to_csv(output_file, sep='|', index=False, compression='gzip')\n", + "\n", + "print(f\"✅ File written successfully as: {output_file}\")\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LBE4WjKDEXGm", + "outputId": "0af2361f-4bb8-417a-8959-fdbdc45e3d18" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "✅ File written successfully as: yellow_tripdata_2023_pipe.gz\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "\n", + "# Summary\n", + "num_rows = df_pd.shape[0]\n", + "num_cols = df_pd.shape[1]\n", + "file_size = os.path.getsize('yellow_tripdata_2023_pipe.gz') / (1024 * 1024) # Convert to MB\n", + "\n", + "print(\"✅ File Summary:\")\n", + "print(f\"Total Rows: {num_rows}\")\n", + "print(f\"Total Columns: {num_cols}\")\n", + "print(f\"File Size: {file_size:.2f} MB\")\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "qgLevFTOEx2S", + "outputId": "a3baca75-eebc-4b76-e04d-91bdea02c9a8" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "✅ File Summary:\n", + "Total Rows: 3066766\n", + "Total Columns: 19\n", + "File Size: 53.37 MB\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "YnWV_l2vGN5L" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/Week6/Week6_Data_Processing_Pavithra.pdf b/Week6/Week6_Data_Processing_Pavithra.pdf new file mode 100644 index 00000000..5f67ea73 Binary files /dev/null and b/Week6/Week6_Data_Processing_Pavithra.pdf differ diff --git a/Week6/schema.yaml b/Week6/schema.yaml new file mode 100644 index 00000000..9ab58d69 --- /dev/null +++ b/Week6/schema.yaml @@ -0,0 +1,21 @@ +columns: +- VendorID +- tpep_pickup_datetime +- tpep_dropoff_datetime +- passenger_count +- trip_distance +- RatecodeID +- store_and_fwd_flag +- PULocationID +- DOLocationID +- payment_type +- fare_amount +- extra +- mta_tax +- tip_amount +- tolls_amount +- improvement_surcharge +- total_amount +- congestion_surcharge +- airport_fee +separator: ',' diff --git a/Week6/yellow_tripdata_2023_pipe .gz b/Week6/yellow_tripdata_2023_pipe .gz new file mode 100644 index 00000000..a0c619f7 Binary files /dev/null and b/Week6/yellow_tripdata_2023_pipe .gz differ